diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000000..d0c3eaf8ee --- /dev/null +++ b/.coveragerc @@ -0,0 +1,13 @@ +[run] +source = + dffml + tests +branch = True + +[report] +exclude_lines = + no cov + no qa + noqa + pragma: no cover + if __name__ == .__main__.: diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000..8a9c8b85f6 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,6 @@ +.venv/ +.eggs/ +.mypy_cache/ +*/*.venv/ +*/*.eggs/ +*/*.mypy_cache/ diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..dfe0770424 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..748a6bcb33 --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +*.log +*.pyc +.cache/ +.coverage +.idea/ +.vscode/ +*.egg-info/ +build/ +dist/ +docs/build/ +venv/ +wheelhouse/ +*.png +*.eggs +*.swp +.mypy_cache/ +*.egg-info/ +htmlcov/ +.venv/ diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000000..d92bda802a --- /dev/null +++ b/.pylintrc @@ -0,0 +1,3 @@ +[MESSAGES CONTROL] +disable=too-few-public-methods, + bad-continuation diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000..c31a693270 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,26 @@ +language: python +dist: xenial +python: + - "3.7" +addons: + apt: + packages: + - git + - subversion + - cloc +env: + global: + - LOGGING=debug + matrix: + - PLUGIN=. 
+ - PLUGIN=model/tensorflow + - PLUGIN=feature/git +before_install: + # Update cloc (xenial has bad version) + - | + curl -o /tmp/cloc -sSL https://github.com/AlDanial/cloc/raw/1.80/cloc + sudo cp /tmp/cloc /usr/bin/cloc + sudo chmod 755 /usr/bin/cloc + - | +script: + - python setup.py install && cd $PLUGIN && python setup.py test diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000..144a787d3c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,18 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +- Nothing yet... + +## [0.1.0] - 2019-03-07 +### Added +- Feature class to collect a feature in a dataset +- Git features to collect feature data from Git repos +- Model class to wrap implementations of machine learning models +- Tensorflow DNN model for generic usage of the DNN estimator +- CLI interface and framework +- Source class to manage dataset storage diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..b46d255267 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3.7 + +WORKDIR /usr/src/dffml + +COPY . . +RUN pip install --no-cache-dir . 
&& \ + cp scripts/docker-entrypoint.sh /usr/bin/ && \ + chmod 755 /usr/bin/docker-entrypoint.sh + +ENTRYPOINT ["/usr/bin/docker-entrypoint.sh"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..8ce5aa9e27 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +Copyright (c) 2017-2019 Intel + +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000..9d5d250d09 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include LICENSE +include README.rst diff --git a/README.rst b/README.rst new file mode 100644 index 0000000000..3dfa732252 --- /dev/null +++ b/README.rst @@ -0,0 +1,260 @@ +Data Flow Facilitator for Machine Learning (dffml) +================================================== + +.. image:: https://travis-ci.org/intel/dffml.svg + :target: https://travis-ci.org/intel/dffml +.. 
image:: https://bestpractices.coreinfrastructure.org/projects/2594/badge + :target: https://bestpractices.coreinfrastructure.org/projects/2594 + +Is DFFML Right For Me? +---------------------- + +If you answer yes to any of these questions DFFML can make your life easier. + +- Dataset Generation + + - Need to generate a dataset + - Need to run asynchronous operations in order to gather dataset (http + requests, interaction with command line utilities, etc.) + +- Models + + - Want to quickly prototype how machine learning could be used on a dataset + without writing a model + - Need to write a finely tuned model by interacting with low level APIs of + popular machine learning frameworks. + +- Storage + + - Need a way to use datasets which could be stored in different locations or + formats. + +About +----- + +DFFML facilitates data generation, model creation, and use of models via +services. See `Architecture`_ to learn how it works. + +- Facilitates data collection, model creation, and use of models via services. +- Provides plumbing to facilitate the collection of feature data to create + datasets. +- Allows developers to define their ML models via a standardized API. + + - This let's users try different libraries / models to compare performance. + +- Plugin based + + - Features which gather feature data (Number of Git Authors, etc.) + - Models which expose ML models via the standard API (Tensorflow, Scikit, + etc.) + - Sources which load and store feature data (CSV, JSON, MySQL, etc.) + +The plumbing DFFML provides enables users to swap out models and features, +in order to quickly prototype. + +Installation +------------ + +DFFML currently should work with Python 3.6. However, only Python 3.7 is +officially supported. This is because there are a lot of nice helper methods +Python 3.7 implemented that we intend to use instead of re-implementing. + +.. 
code-block:: bash + + python3.7 -m pip install -U dffml + +You can also install the Features for Git Version Control, and Models for +Tensorflow Library all at once. + +- `DFFML Features for Git Version Control `_ +- `DFFML Models for Tensorflow Library `_ + +If you want a quick how to on the iris dataset head to the +`DFFML Models for Tensorflow Library `_ repo. + +.. code-block:: bash + + python3.7 -m pip install -U dffml[git,tensorflow] + +Docker Build +------------ + +This is a good option if you don't want to deal with installing Python 3.7. + +.. code-block:: bash + + docker build -t dffml . + +You can then alias dffml to run the docker container. + +.. code-block:: bash + + alias dffml="docker run --rm -ti -v $HOME/.local:/home/$USER/.local/ -v $PWD:/workdir -w /workdir -e UID=$(id -u) -e USER=$USER dffml" + +This creates an alias that takes your current working directory and mounts it +into ``/workdir`` as well as your ``$HOME/.local`` to the same in the container. + +With the alias, you can run ``dffml`` commands as you would if installed via +``pip``. + +.. code-block:: bash + + dffml list + +Keep in mind that if you're working on files they can only be ones in your +current working directory, and if you want to access network resources and they +are on your host, you'll have to talk to ``172.17.0.1`` (docker0 inet address) +instead of ``localhost`` or ``127.0.0.1``. + +The purpose of mounting ``$HOME/.local`` is so that if you want to +``pip install`` anything, you can, and it will persist between invocations due +to that being on the host. + +If you wan to run ``pip`` you can put it after ``dffml``. + +.. code-block:: bash + + dffml pip install example + +Hacking +------- + +Then install in development mode to the virtualenv and development dependencies. + +.. 
code-block:: bash + + git clone git@github.com:intel/dffml + cd dffml + pip install --user -e .[git,tensorflow] + +Usage +----- + +See `DFFML Models for Tensorflow Library `_ repo +until documentation here is updated with a generic example. + +Testing +------- + +.. code-block:: bash + + python3.7 setup.py test + +Architecture +------------ + +When applying Machine Learning to a new problem developers must first collect +data for models to train on. DFFML facilitates the collection of feature data +to create datasets for models to learn on. + +.. image:: https://github.com/intel/dffml/raw/master/docs/arch.png + +DFFML's architecture can be thought of similarly to a search engine. Each +**Feature** a developer defines searches for data associated with the unique key +its provided with. Once the data is found it is added to a **Repo** (repository) +associated with that unique key. A **Feature**'s search for data is dubbed +*evaluation*. A **Repo** holds the results of each **Feature**'s evaluation. +Results are stored under their respective **Feature** names. + +To define machine learning a model within DFFML, users create a **Model**. +Models are responsible for training, assessing accuracy, and making +predictions. After evaluation a **Repo** can be used by a **Model** for any of +those tasks. Defining a machine learning model as a **Model** allows users to +quickly compare accuracy of various models on their gathered dataset. + +Once the best most accurate model is known, users can easily integrate use of +the model into existing applications via the Python API, or a **Service**. +Services provide applications with ways to access the DFFML API over various +protocols and deployment scenarios. + +Repo +---- + +A repo is a repository of information. It is instantiated with a source URL +which represents or points to where more information on it can be found. + +Every repo has (or wants) a classification. 
Those which already have +classifications can be used to train Models. The classification of the repo is +what Education will ask it's models to make predictions on. + +Feature +------ + +Features are given a repo, containing at the minimum a source URL for it, +and produce a list of results which represent the evaluation of that feature. + +Not all methods are applicable to all repos. As such, all Features implement the +``applicable`` method. + +Feature is the abstract base class for all features. New features must be +derived from this class and implement the fetch, parse, and calc methods. These +methods are always called in order by the evaluator. However, they are executed +in parallel with the same stages of other features. + +A feature is provided with a repo +and is expected to fetch any data it needs to calculate itself when fetch +is called. All data fetched should be stored in tempdir() if it must reside +on disk. + +Once the appropriate data is fetched the parse method is responsible for +storing the parts of that data which will be used to calculate in the +subclass + +.. code-block:: python + + from dffml.feature import Feature + + class StringByFT(Feature): + + async def fetch(self): + self.__value = '42' + + async def parse(self): + self.__value = int(self.__value) + +The calc method then uses variables set in parse to calculate the feature. + +.. code-block:: python + + async def calc(self): + return self.__value * 42 + +.. code-block:: python + + entry_points={ + 'dffml.feature': [ + 'string_by_42 = mypackage.string_by_42:StringByFT', + ], + }, + +Source +------ + +Repos come from a source. Sources may contain more information on a repo than +just it's source URL. Sources are responsible for providing the repos they +contain and updating those repos upon request. + +Model +------- + +Models are feed classified repos from which they learn from during their +training phase. 
After training they can be used to make a prediction about the +classification of a repo. + +License +------- + +dffml is distributed under the MIT License, see ``LICENSE``. + +Legal +----- + +.. + + This software is subject to the U.S. Export Administration Regulations and + other U.S. law, and may not be exported or re-exported to certain countries + (Cuba, Iran, Crimea Region of Ukraine, North Korea, Sudan, and Syria) or to + persons or entities prohibited from receiving U.S. exports (including + Denied Parties, Specially Designated Nationals, and entities on the Bureau + of Export Administration Entity List or involved with missile technology or + nuclear, chemical or biological weapons). diff --git a/dffml/__init__.py b/dffml/__init__.py new file mode 100644 index 0000000000..fc390d5f20 --- /dev/null +++ b/dffml/__init__.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +''' +Education (dffml) is a package and tool for doing machine learning. + +It uses the setuptools dynamic discovery of services and plugins [1] to +evaluate a package based on the installed features. 
+ +[1]: http://setuptools.readthedocs.io/en/latest/setuptools.html +''' +from .feature import Feature + +# Used to declare our namespace for resource discovery +__import__('pkg_resources').declare_namespace(__name__) diff --git a/dffml/accuracy.py b/dffml/accuracy.py new file mode 100644 index 0000000000..1a164bcfbc --- /dev/null +++ b/dffml/accuracy.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +class Accuracy(float): + + def __str__(self): + return '%.02f' % (float(self) * 100.0) diff --git a/dffml/cli.py b/dffml/cli.py new file mode 100644 index 0000000000..545b1f5b7a --- /dev/null +++ b/dffml/cli.py @@ -0,0 +1,293 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +''' +Command line interface evaluates packages given their source URLs +''' +import os +import sys +import pdb +import json +import asyncio +import logging +import inspect +import argparse +import pkg_resources + +from .log import LOGGER +from .version import VERSION +from .port import Port +from .feature import Feature, Features, Data +from .source import Source, Sources, SubsetSources +from .model import Model +from .util.cli import CMD, Arg, SourcesCMD, FeaturesCMD, ModelCMD, PortCMD, \ + KeysCMD, ListEntrypoint, ParseSourcesAction + +class Version(CMD): + ''' + Print version and exit + ''' + + async def run(self): + LOGGER.debug('Reporting version') + print(VERSION) + +class Edit(SourcesCMD, KeysCMD): + ''' + Edit each specified repo + ''' + + async def run(self): + async with self.sources as sources: + for key in self.keys: + repo = await sources.repo(key) + pdb.set_trace() + await sources.update(repo) + +class ListRepos(SourcesCMD): + ''' + List repos stored in sources + ''' + + async def run(self): + async with self.sources as sources: + async for repo in sources.repos(): + print(repo) + +class ListFeatures(ListEntrypoint): + ''' + List installed features + ''' + + ENTRYPOINT = Feature + + def display(self, cls): + if not 
cls.__doc__ is None: + print('%s(%s):' % (cls.NAME, cls.__qualname__)) + print(cls.__doc__.rstrip()) + else: + print('%s(%s)' % (cls.NAME, cls.__qualname__)) + print() + +class ListServices(ListEntrypoint): + ''' + List installed services + ''' + + async def run(self): + for i in pkg_resources.iter_entry_points('dffml.service.cli'): + loaded = i.load() + if issubclass(loaded, CMD): + self.display(loaded) + +class ListSources(ListEntrypoint): + ''' + List installed sources + ''' + + ENTRYPOINT = Source + +class ListModels(ListEntrypoint): + ''' + List installed models + ''' + + ENTRYPOINT = Model + +class ListPorts(ListEntrypoint): + ''' + List installed ports + ''' + + ENTRYPOINT = Port + +class List(CMD): + ''' + List repos and installed interfaces + ''' + + repos = ListRepos + features = ListFeatures + sources = ListSources + models = ListModels + services = ListServices + ports = ListPorts + +class Applicable(FeaturesCMD): + + arg_key = Arg('-key', help='Check if features is applicable for this key', + required=True) + + async def run(self): + async with self.features as features: + return await features.applicable(Data(self.key)) + +class Merge(CMD): + ''' + Merge repo data between sources + ''' + + arg_dest = Arg(name='dest', help='Sources merge repos into', + action=ParseSourcesAction) + arg_src = Arg('src', help='Sources to pull repos from', + action=ParseSourcesAction) + + async def run(self): + async with self.src, self.dest: + async for repo in self.src.repos(): + repo.merge(await self.dest.repo(repo.src_url)) + await self.dest.update(repo) + +class EvaluateCMD(FeaturesCMD, SourcesCMD): + + arg_sources = SourcesCMD.arg_sources.modify(required=False) + arg_caching = Arg('-caching', help='Re-evaluate or use last', + required=False, default=False, action='store_true') + arg_parallel = Arg('-parallel', help='Evaluate in parallel', + required=False, default=1, type=int) + arg_cacheless = Arg('-cacheless', + help='Do not re-evaluate if these features are 
missing', + required=False, default=[], nargs='+') + +class EvaluateAll(EvaluateCMD): + '''Evaluate all repos in sources''' + + arg_update = Arg('-update', help='Update repo with sources', required=False, + default=False, action='store_true') + + async def evaluate(self, sources, features): + async for repo in features.evaluate_repos(sources.repos(), + features=[name for name in features.names() \ + if not name in self.cacheless], + num_workers=self.parallel, caching=self.caching): + yield repo + if self.update: + await sources.update(repo) + + async def run(self): + async with self.sources as sources, self.features as features: + async for repo in self.evaluate(sources, features): + yield repo + +class EvaluateRepo(EvaluateAll, KeysCMD): + '''Evaluate features on individual repos''' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.sources = SubsetSources(*self.sources, keys=self.keys) + +class Evaluate(CMD): + '''Evaluate features against repos''' + + repo = EvaluateRepo + _all = EvaluateAll + +class MLCMD(ModelCMD, FeaturesCMD, SourcesCMD): + ''' + Commands which use models share many similar arguments. 
+ ''' + + arg_classifications = Arg('-classifications', nargs='+', required=True, + default=[]) + +class Train(MLCMD): + '''Train a model on data from given sources''' + + arg_steps = Arg('-steps', help='Number of steps', required=True, type=int, + default=5000) + arg_num_epochs = Arg('-num_epochs', help='Number of epochs', required=True, + type=int, default=30) + + async def run(self): + if not self.model_dir is None and not os.path.isdir(self.model_dir): + os.makedirs(self.model_dir) + async with self.sources as sources, self.features as features: + return await self.model.train(sources, features, + self.classifications, self.steps, self.num_epochs) + +class Accuracy(MLCMD): + '''Assess model accuracy on data from given sources''' + + async def run(self): + async with self.sources as sources, self.features as features: + return float(await self.model.accuracy(sources, features, + self.classifications)) + +class PredictAll(EvaluateAll, MLCMD): + '''Predicts for all sources''' + + async def predict(self, sources, features, repos): + async for repo, classification, confidence in \ + self.model.predict(repos, features, self.classifications): + repo.predicted(classification, confidence) + yield repo + if self.update: + await sources.update(repo) + + async def run(self): + async with self.sources as sources, self.features as features: + async for repo in self.predict(sources, features, + self.evaluate(sources, features)): + yield repo + +class PredictRepo(PredictAll, EvaluateRepo): + '''Predictions for individual repos''' + pass + +class Predict(CMD): + '''Evaluate features against repos and produce a prediction''' + + repo = PredictRepo + _all = PredictAll + +class ImportExportCMD(PortCMD, SourcesCMD): + '''Shared import export arguments''' + + arg_filename = Arg('filename', type=str) + +class Import(ImportExportCMD): + '''Imports repos''' + + async def run(self): + async with self.sources as sources: + return await self.port.import_from_file(sources, self.filename) + 
+class Export(ImportExportCMD): + '''Exports repos''' + + async def run(self): + async with self.sources as sources: + return await self.port.export_to_file(sources, self.filename) + +def services(): + ''' + Loads dffml.services.cli entrypoint and creates a CMD class incorporating + all of the loaded CLI versions of services as subcommands. + ''' + class Service(CMD): + ''' + Expose various functionalities of dffml + ''' + pass + for i in pkg_resources.iter_entry_points('dffml.service.cli'): + loaded = i.load() + if issubclass(loaded, CMD): + setattr(Service, i.name, loaded) + return Service + +class CLI(CMD): + ''' + CLI interface for dffml + ''' + + version = Version + _list = List + edit = Edit + merge = Merge + _import = Import + export = Export + train = Train + accuracy = Accuracy + predict = Predict + evaluate = Evaluate + service = services() + applicable = Applicable diff --git a/dffml/feature/__init__.py b/dffml/feature/__init__.py new file mode 100644 index 0000000000..36da2e2a2d --- /dev/null +++ b/dffml/feature/__init__.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +''' +All features registered to the dffml.feature entry point using setuptools are +derived from the Feature class. To add a feature, create a module which has a +setup.py which specifies where to find your Feature subclass within your module. + +>>> setup( +>>> name='myfeatures', +... 
+>>> entry_points={ +>>> 'dffml.feature': [ +>>> 'numfiles = myfeatures:NumFilesFeature', +>>> ], +>>> }, +>>> ) +''' +from .feature import Data, Feature, Features, LoggingDict, DefFeature + +# Declares dffml.feature is a namespace package +__import__('pkg_resources').declare_namespace(__name__) diff --git a/dffml/feature/feature.py b/dffml/feature/feature.py new file mode 100644 index 0000000000..54c3fe2309 --- /dev/null +++ b/dffml/feature/feature.py @@ -0,0 +1,412 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +''' +Feature subclasses are responsible for generating an integer value given an open +feature project's feature URL. +''' +import abc +import pydoc +import asyncio +import traceback +import concurrent.futures as futures +import pkg_resources +from functools import singledispatch, partial +from typing import Optional, List, Dict, Type, AsyncIterator, Any, Callable + +from .log import LOGGER +from ..repo import Repo +from ..util.monitor import Monitor, Task +from ..util.entrypoint import Entrypoint +from ..util.asynchelper import AsyncContextManagerList + +class Frequency(object): + ''' + Frequency in months + ''' + MONTHS: int = 0 + +class Quarterly(Frequency): + ''' + Evaluate on a quarterly basis (every 3 months). + ''' + MONTHS = 3 + +class Yearly(Frequency): + ''' + Evaluate on a yearly basis. 
+ ''' + MONTHS = 12 + +class LoggingDict(object): + + def __init__(self, data: 'Data') -> None: + self.__data = data + self.__dict: Dict = {} + self.ignore = (asyncio.Lock,) + + async def get(self, key, default=None): + val = self.__dict.get(key, default) + return val + + async def set(self, key, value): + self.__dict[key] = value + if not isinstance(value, self.ignore): + await self.__data.update({key: value}, event='set') + + async def inc(self, key, default=None, by=1): + value = await self.get(key, default=default) + value += by + await self.set(key, value) + return value + +class Data(Task): + ''' + Passed to each feature during evaluation. Shared between all features a repo + is being evaluated with + ''' + + LOGGER = LOGGER.getChild('Data') + + def __init__(self, src_url: str) -> None: + super().__init__(key=src_url) + self.src_url = src_url + self.lock: asyncio.Lock = asyncio.Lock() + self.temp: Dict[str, Any] = {} + self.data: LoggingDict = LoggingDict(self) + self.results: Dict[str, Any] = {} + self.locks: Dict[str, Any] = {} + + async def mklock(self, name: str) -> asyncio.Lock: + ''' + Return a lock stored in data under the key `name`. Create the lock if it + does not exist. + ''' + async with self.lock: + lock = self.locks.get(name, None) + if lock is None: + lock = asyncio.Lock() + self.locks[name] = lock + return lock + + async def result(self): + results = await self.complete() + self.results = results + self.LOGGER.debug('Data got results: %r', results) + return results + +class Feature(abc.ABC, Entrypoint): + ''' + Abstract base class for all features. New features must be derived from this + class and implement the fetch, parse, and calc methods. These methods are + always expected to be called in order. Anything you add to your feature + subclass in fetch or parse is accessible in calc. 
+ + A feature is provided with the feature URL of the package (in self._src_url) + and is expected to fetch any data it needs to calculate itself when fetch + is called. All data fetched should be stored in tempdir() if it must reside + on disk. + + Once the appropriate data is fetched the parse method is responsible for + storing the parts of that data which will be used to calculate in the + subclass + + >>> self.__example_parsed_value_name = example_value + + The calc method then uses variables set in parse to output an integer value. + + >>> def calc(self): + >>> return self.__example_parsed_value_name + + Full example of a feature implementation: + + >>> import glob + >>> from dffml.feature import Feature + >>> + >>> class NumFilesFeature(Feature): + >>> + >>> @abc.abstractmethod + >>> def fetch(self, data): + >>> self._downloader.vcs(self._src_url, self.tempdir('src')) + >>> + >>> @abc.abstractmethod + >>> def parse(self, data): + >>> self.__num_files = glob.glob(self.tempdir(), recursive=True) + >>> + >>> @abc.abstractmethod + >>> def calc(self, data): + >>> return self.__num_files + ''' + + LOGGER = LOGGER.getChild('Feature') + + NAME: str = '' + # LENGTH: int = 10 + # FREQUENCY: Type[Frequency] = Quarterly + ENTRY_POINT = 'dffml.feature' + + def __str__(self): + return '%s(%s)' % (self.NAME, self.__class__.__qualname__) + + def __repr__(self): + return '%s[%r, %d]' % (self.__str__(), self.dtype(), self.length()) + + def dtype(self) -> Type: + ''' + Models need to know a Feature's datatype. + ''' + self.LOGGER.warning('%s dtype unimplemented', self) + return int + + def length(self) -> int: + ''' + Models need to know a Feature's length, 1 means single value, more than + that is the length of the array calc returns. 
+ ''' + self.LOGGER.warning('%s length unimplemented', self) + return 1 + + async def applicable(self, data) -> bool: + return True + + async def fetch(self, data): + ''' + Fetch retrieves any additional information about the software we are + evaluating. Any data fetched should be stored in tempdir(). + ''' + pass + + async def parse(self, data): + ''' + Parse the data we downloaded in fetch() into a usable form. + ''' + pass + + async def calc(self, data): + ''' + Calculates the score for this feature based on data found by parse(). + ''' + return False + + async def setUp(self, data): + ''' + Preform setup + ''' + pass + + async def tearDown(self, data, error=False): + ''' + Release any post calculation resources + ''' + pass + + async def open(self): + ''' + Opens any resources needed + ''' + pass + + async def close(self): + ''' + Closes any opened resources + ''' + pass + + async def __aenter__(self): + await self.open() + # TODO Context management + return self + + async def __aexit__(self, exc_type, exc_value, traceback): + await self.close() + +def DefFeature(name, dtype, length): + + class DefinedFeature(Feature): + + LOGGER = LOGGER.getChild('DefFeature') + + def __init__(self, name: str = '', dtype: Type = int, length: int = 1) \ + -> None: + super().__init__() + self.NAME = name + self._dtype = dtype + self._length = length + + def dtype(self) -> Type: + ''' + Models need to know a Feature's datatype. + ''' + return self._dtype + + def length(self) -> int: + ''' + Models need to know a Feature's length, 1 means single value, more than + that is the length of the array calc returns. 
+ ''' + return self._length + + return DefinedFeature(name=name, dtype=dtype, length=length) + +class Features(AsyncContextManagerList, Monitor): + + TIMEOUT: int = 60 * 2 + + LOGGER = LOGGER.getChild('Features') + + def __init__(self, *args: Feature, timeout: int = None) -> None: + super().__init__(*args) + Monitor.__init__(self) + self.timeout = timeout if not timeout is None \ + else self.TIMEOUT + + def names(self) -> List[str]: + return list(({feature.NAME: True for feature in self}).keys()) + + async def evaluate(self, src: str, task: Task = None) -> Dict[str, Any]: + return await asyncio.wait_for(self._evaluate(src, task=task), + self.timeout) + + async def _evaluate(self, src: str, task: Task = None) -> Dict[str, Any]: + ''' + Evaluates all repos passed to it. + Args: + src: src of repo to be evaluated + caching: If `True` sources will NOT be re-evaluated if they have + features + Returns: + A `dict` containing source URLs and their repos + ''' + toreDown = False + data: Data = Data(src) + if not task is None: + data = task # type: ignore + features: Dict[str, Feature] = {} + results: Dict[str, Any] = {} + try: + applicable = await self.applicable(data) + self.LOGGER.debug('Applicable[%r]: %r', data.src_url, applicable) + await applicable.on_all('setUp', data) + await applicable.on_all('fetch', data) + await applicable.on_all('parse', data) + await applicable.run_calc(results, data) + await applicable.on_all('tearDown', data) + toreDown = True + except futures._base.CancelledError as err: + if not toreDown: + await applicable.on_all('tearDown', data) + return {} + data.results.update(results) + return results + + async def applicable(self, data: Data) -> 'Features': + return self.__class__(*[feature for feature in self \ + if feature.NAME and await feature.applicable(data)]) + + async def on_all(self, method_name: str, data: Data): + await asyncio.gather(*[self.run_feature_method( + feature, getattr(feature, method_name), data) \ + for feature in self]) + + 
async def run_calc(self, results: Dict[str, Any], data: Data): + await asyncio.gather(*[self._run_calc(feature, results, data) \ + for feature in self]) + + async def _run_calc(self, feature: Feature, results: Dict[str, Any], + data: Data) -> Any: + results[feature.NAME] = await self.run_feature_method(feature, + feature.calc, data) + + async def run_feature_method(self, feature: Feature, + method: Callable[[Data], Any], data: Data) -> Any: + error: Exception = Exception('Not an error') + try: + self.LOGGER.debug('%s %s(%s).%s', data.src_url, feature.NAME, + feature.__class__.__qualname__, method.__name__) + return await method(data) + except futures._base.CancelledError as err: + raise + except Exception as err: + error = err + self.LOGGER.error('Error evaluating %s: %s: %s', data.src_url, err, + traceback.format_exc().strip()) + if str(error) != 'Not an error': + if method.__name__ != 'tearDown': + await feature.tearDown(data) + self.remove(feature) + + def mktask(self, func, key): + data = Data(key) + Task.__init__(data, func, key) + return data + + async def evaluate_repo(self, repo: Repo, *, + features: List[str] = [], caching: bool = False): + results: Dict[str, Any] = repo.features(features) + if caching and results: + return repo + try: + results = await self.evaluate(repo.src_url) + if results: + repo.evaluated(results) + except futures._base.TimeoutError: + self.LOGGER.warning('Evaluation timed out: %s', repo.src_url) + return repo + + async def evaluate_repos(self, repos: AsyncIterator[Repo], *, + features: Optional[List[str]] = None, caching: bool = False, + num_workers: int = 1): + if features is None: + features = self.names() + sem = asyncio.Semaphore(value=num_workers) + async def with_sem(sem, func, *args, **kwargs): + async with sem: + return await func(*args, **kwargs) + evaluate_repo = partial(with_sem, sem, self.evaluate_repo, + features=features, caching=caching) + for repo in await asyncio.gather(*[evaluate_repo(repo) \ + async for repo in 
repos]): + yield repo + + async def submit(self, src: str): + return await super().start(partial(self.evaluate, src), src, + mktask=self.mktask) + + @classmethod + def load(cls, *these: str): + ''' + Loads all installed loading and returns them as a list. Sources to be + loaded should be registered to ENTRY_POINT via setuptools. + ''' + these, loading_classes = cls.load_defs(*these) + for i in pkg_resources.iter_entry_points(Feature.ENTRY_POINT): + loaded = i.load() + if issubclass(loaded, Feature) and loaded.NAME in these: + loading_classes.append(loaded()) + self = cls(*loading_classes) + for name in these: + if not name in self.names(): + raise KeyError('%s was not found in (%s)' % \ + (repr(name), ', '.join(map(str, loading_classes)))) + if not self.names(): + raise KeyError('No features were loaded') + return self + + @classmethod + def load_defs(cls, *args: str): + defs = [] + no_def = [arg for arg in args if not arg.startswith('def:')] + for arg in args: + if arg.startswith('def:'): + defs.append(cls.load_def(*arg.replace('def:', '').split(':'))) + return no_def, defs + + @classmethod + def load_def(cls, name: str, dtype: str, length: str): + return DefFeature(name, cls.convert_dtype(dtype), int(length)) + + @classmethod + def convert_dtype(cls, dtype: str): + found = pydoc.locate(dtype) + if found is None: + raise TypeError('Failed to convert_dtype %r' % (dtype,)) + return found diff --git a/dffml/feature/log.py b/dffml/feature/log.py new file mode 100644 index 0000000000..d153e40552 --- /dev/null +++ b/dffml/feature/log.py @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +'''Logging''' +import logging +LOGGER = logging.getLogger(__package__) diff --git a/dffml/log.py b/dffml/log.py new file mode 100644 index 0000000000..d153e40552 --- /dev/null +++ b/dffml/log.py @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +'''Logging''' +import logging +LOGGER = 
logging.getLogger(__package__) diff --git a/dffml/model/__init__.py b/dffml/model/__init__.py new file mode 100644 index 0000000000..4e2783a7b6 --- /dev/null +++ b/dffml/model/__init__.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +''' +All models registered to the dffml.model entry point using setuptools are +derived from the Model class. To add a model, create a module which has a +setup.py which specifies where to find your Model subclass within your module. + +>>> setup( +>>> name='mymodel', +... +>>> entry_points={ +>>> 'dffml.model': [ +>>> 'mymodel = mymodel:MyModel', +>>> ], +>>> }, +>>> ) +''' +from .model import Model + +# Declares dffml.model as a namespace package +__import__('pkg_resources').declare_namespace(__name__) diff --git a/dffml/model/log.py b/dffml/model/log.py new file mode 100644 index 0000000000..d153e40552 --- /dev/null +++ b/dffml/model/log.py @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +'''Logging''' +import logging +LOGGER = logging.getLogger(__package__) diff --git a/dffml/model/model.py b/dffml/model/model.py new file mode 100644 index 0000000000..c16a5546ac --- /dev/null +++ b/dffml/model/model.py @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +''' +Model subclasses are responsible for training themselves on repos, making +predictions about the classifications of repos, and assessing their prediction +accuracy. +''' +import abc +from typing import AsyncIterator, Tuple, Any, List, Optional + +from ..repo import Repo +from ..source import Sources +from ..feature import Features +from ..accuracy import Accuracy +from ..util.entrypoint import Entrypoint + +class Model(abc.ABC, Entrypoint): + ''' + Abstract base class which should be derived from and implmented using + various machine learning frameworks or concepts. 
+ ''' + + ENTRY_POINT = 'dffml.model' + + def __init__(self, model_dir: Optional[str] = None) -> None: + super().__init__() + self.model_dir = model_dir + + @abc.abstractmethod + async def train(self, sources: Sources, features: Features, + classifications: List[Any], steps: int, num_epochs: int): + ''' + Train using repos as the data to learn from. + ''' + raise NotImplementedError() + + @abc.abstractmethod + async def accuracy(self, sources: Sources, features: Features, + classifications: List[Any]) -> Accuracy: + ''' + Evaluates the accuracy of our model after training using the input repos + as test data. + ''' + raise NotImplementedError() + + @abc.abstractmethod + async def predict(self, repos: AsyncIterator[Repo], features: Features, + classifications: List[Any]) -> \ + AsyncIterator[Tuple[Repo, Any, float]]: + ''' + Uses trained data to make a prediction about the quality of a repo. + ''' + raise NotImplementedError() + yield (Repo(''), '', 0.0) + + @classmethod + def installed(cls): + return {key: model() for key, model in cls.load().items()} diff --git a/dffml/port/__init__.py b/dffml/port/__init__.py new file mode 100644 index 0000000000..28b9141a86 --- /dev/null +++ b/dffml/port/__init__.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +''' +All ports registered to the dffml.port entry point using setuptools are +derived from the Port class. To add a port, create a module which has a +setup.py which specifies where to find your Port subclass within your module. + +>>> setup( +>>> name='myport', +... 
class JSON(Port):
    '''
    Imports and exports repos in JSON format
    '''

    async def export_fd(self, source: Source, fd):
        '''
        Serialize every repo in the source to ``fd`` as a single JSON object
        keyed by each repo's src_url.
        '''
        json.dump({repo.src_url: repo.dict() async for repo in source.repos()},
                  fd)

    async def import_fd(self, source: Source, fd):
        '''
        Load a JSON object of ``{src_url: repo_data}`` from ``fd`` and update
        the source with a Repo built from each entry.
        '''
        # BUG FIX: iterating a dict directly yields only its keys, so the
        # original ``for src_url, data in json.load(fd)`` tried to unpack
        # each key string as a (key, value) pair. Use .items() so both the
        # src_url and its repo data are received.
        for src_url, data in json.load(fd).items():
            await source.update(Repo(src_url, data=data))
+''' +import abc + +from ..source import Source +from ..util.entrypoint import Entrypoint + +class Port(abc.ABC, Entrypoint): + ''' + Port repos into the format the porter understands + ''' + + ENTRY_POINT = 'dffml.port' + + @abc.abstractmethod + async def export_fd(self, source: Source, fd): + ''' + Export repos + ''' + + @abc.abstractmethod + async def import_fd(self, source: Source, fd): + ''' + Import repos + ''' + + async def export_to_file(self, source: Source, filename: str): + with open(filename, 'w') as fd: + await self.export_fd(source, fd) + + async def import_from_file(self, source: Source, filename: str): + with open(filename, 'r') as fd: + await self.import_fd(source, fd) diff --git a/dffml/repo.py b/dffml/repo.py new file mode 100644 index 0000000000..b4a7b1fe43 --- /dev/null +++ b/dffml/repo.py @@ -0,0 +1,210 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +''' +Information on the software to evaluate is stored in a Repo instance. +''' +import os +from datetime import datetime +from typing import Optional, List, Dict, Any, AsyncIterator + +from .log import LOGGER + +LOGGER = LOGGER.getChild('repo') + +class RepoPrediction(dict): + + EXPORTED = ['classification', 'confidence'] + + def __init__(self, *, + confidence: float = 0.0, + classification: Any = '') -> None: + self['confidence'] = confidence + self['classification'] = classification + + @property + def confidence(self): + return self['confidence'] + + @property + def classification(self): + return self['classification'] + + def dict(self): + if not self: + return [] + return self + + def __len__(self): + if self['confidence'] == 0.0 and not self['classification']: + return 0 + return 2 + + def __bool__(self): + return bool(len(self)) + __nonzero__ = __bool__ + +class RepoData(object): + + DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ' + EXPORTED = ['src_url', 'features', 'classification', 'prediction'] + + def __init__(self, *, + src_url: Optional[str] = None, + features: 
Optional[Dict[str, Any]] = None, + classification: Optional[str] = None, + prediction: Optional[RepoPrediction] = None, + last_updated: Optional[datetime] = None) -> None: + # If the repo is not evaluated or predicted then don't report out a new + # value for last_updated + self.last_updated_default = datetime.now() + if src_url is None: + src_url = '' + if features is None: + features = {} + if classification is None: + classification = '' + if prediction is None: + prediction = RepoPrediction() + if last_updated is None: + last_updated = self.last_updated_default + if isinstance(last_updated, str): + last_updated = datetime.strptime(last_updated, self.DATE_FORMAT) + self.src_url = src_url + self.features = features + self.classification = classification + self.prediction = RepoPrediction(**prediction) + self.last_updated = last_updated + + def dict(self): + data = {key: getattr(self, key, []) for key in self.EXPORTED \ + if len(getattr(self, key, []))} + # Do not report if there has been no change since instantiation to + # a default time value + if self.last_updated != self.last_updated_default: + data['last_updated'] = self.last_updated.strftime(self.DATE_FORMAT) + return data + + def __repr__(self): + return str(self.dict()) + +class Repo(object): + ''' + Manages feature independent information and actions for a repo. 
+ ''' + + REPO_DATA = RepoData + + def __init__(self, src_url: str, *, + data: Optional[Dict[str, Any]] = None, + extra: Optional[Dict[str, Any]] = None) -> None: + if data is None: + data = {} + if extra is None: + extra = {} + data['src_url'] = src_url + if 'extra' in data: + # Prefer extra from init arguments to extra stored in data + data['extra'].update(extra) + extra = data['extra'] + del data['extra'] + self.data = self.REPO_DATA(**data) + self.extra = extra + + def dict(self): + data = self.data.dict() + data['extra'] = self.extra + return data + + def __repr__(self): + return str(self.dict()) + + def __str__(self): + if not self.data.prediction: + confidence, classification = (0.0, 'Undetermined') + else: + confidence, classification = (self.data.prediction.confidence, + self.data.prediction.classification) + header = ('%-11s (%2.1f%% confidence) %s' % \ + (classification, 100.0 * confidence, self.src_url)) + if self.classified(): + header += ' classified as: %s' % (self.classification(),) + if len(self.extra.keys()): + header += ' ' + str(self.extra) + return '\n'.join([header] + \ + [('%-30s%s' % (feature, str(results))) \ + for feature, results in self.features().items()]).rstrip() + + def merge(self, repo: 'Repo'): + data = self.data.dict() + data.update(repo.data.dict()) + self.data = self.REPO_DATA(**data) + self.extra.update(repo.extra) # type: ignore + + @property + def src_url(self) -> str: + return self.data.src_url + + def evaluated(self, results: Dict[str, Any], overwrite=False): + ''' + Updates features with the result dict + ''' + if overwrite: + self.data.features = results + else: + self.data.features.update(results) + self.data.last_updated = datetime.now() + LOGGER.info('Evaluated %s %r', self.data.src_url, self.data.features) + + def features(self, subset: List[str] = []) -> Dict[str, Any]: + ''' + Returns all features for the repo or the subset specified. 
+ ''' + if not subset: + return self.data.features + for name in subset: + if not name in self.data.features or self.data.features[name] is None: + return {} + return {name: self.data.features[name] for name in subset} + + def predicted(self, classification: Any, confidence: float): + ''' + Set the prediction for this repo + ''' + self.data.prediction = RepoPrediction( + classification=classification, + confidence=float(confidence)) + self.data.last_updated = datetime.now() + + def prediction(self) -> RepoPrediction: + ''' + Get the prediction for this repo + ''' + return self.data.prediction + + def classify(self, classification): + ''' + Set the classification for the repo + ''' + self.data.classification = classification + + def classified(self): + ''' + Return True if the repo has a classification + ''' + if self.data.classification == '': + return False + return True + + def classification(self): + ''' + Repo classification or value error if unclassified + ''' + if not self.classified(): + raise ValueError('Unclassified') + return self.data.classification + + async def asyncgen(self) -> AsyncIterator['Repo']: + ''' + Async gen for a single repo + ''' + yield self diff --git a/dffml/source/__init__.py b/dffml/source/__init__.py new file mode 100644 index 0000000000..28d6a8f026 --- /dev/null +++ b/dffml/source/__init__.py @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +''' +All sources registered to the dffml.source entry point using setuptools are +derived from the Source class. To add a source, create a module which has a +setup.py which specifies where to find your Source subclass within your module. + +>>> setup( +>>> name='mysource', +... 
class CSVSource(FileSource, MemorySource):
    '''
    Uses a CSV file as the source of repo feature data.

    Each CSV row becomes one Repo keyed by its zero-based row index; a
    'classification' column, when present, is stored as the repo's
    classification rather than a feature.
    '''

    async def load_fd(self, fd):
        '''
        Parses a CSV stream into Repo instances
        '''
        self.mem = {}
        for i, data in enumerate(csv.DictReader(fd, dialect='strip')):
            for key, value in data.items():
                # Interpret each cell as a Python literal where possible
                # (ints, floats, lists, ...), otherwise keep the raw string.
                try:
                    data[key] = ast.literal_eval(value)
                except (SyntaxError, ValueError):
                    data[key] = value
            if data.get('classification') is not None:
                # Classification is metadata, not a feature: remove it from
                # the feature dict before building the repo.
                classification = data.pop('classification')
                repo = Repo(str(i), data={'features': data,
                                          'classification': str(classification)})
            else:
                repo = Repo(str(i), data={'features': data})
            self.mem[repo.src_url] = repo
        LOGGER.debug('%r loaded %d records', self, len(self.mem))

    async def _close(self):
        # Saving back to CSV is not implemented; overriding FileSource._close
        # keeps the default dump path from rewriting the input file.
        LOGGER.debug('%r save to file not implemented', self)

    async def dump_fd(self, fd):
        # Intentionally a no-op until CSV saving is implemented (see _close).
        pass
class FileSource(Source):
    '''
    FileSource reads from and writes to a file on open / close.

    The source string may carry a ':ro' suffix to mark the backing file
    read-only, in which case nothing is written back on close.
    '''

    @property
    def readonly(self) -> bool:
        '''
        True when the source string ends with the ':ro' (read-only) suffix.
        '''
        # Idiom fix: the original reversed both strings and used startswith
        # (``self.src[::-1].startswith((':ro')[::-1])``); endswith expresses
        # exactly the same check directly.
        return self.src.endswith(':ro')

    @property
    def filename(self):
        '''
        Path to the file used for storage on disk, with any ':ro' suffix
        stripped. (Docstring fix: this is not necessarily a JSON file; the
        on-disk format is decided by the load_fd/dump_fd implementation.)
        '''
        if self.readonly:
            return self.src[:-3]
        return self.src

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__qualname__, self.filename)

    async def open(self):
        # Shield so cancellation cannot interrupt a partially-completed load
        await asyncio.shield(self._open())

    async def _open(self):
        # A missing file is not an error: start with an empty dataset.
        # NOTE(review): assumes a MemorySource-style ``mem`` mapping is mixed
        # in by subclasses — confirm against CSVSource/JSONSource.
        if not os.path.isfile(self.filename):
            self.mem = {}
            return
        with open(self.filename, 'r') as fd:
            await self.load_fd(fd)

    async def close(self):
        # Shield so cancellation cannot interrupt a partially-completed save
        await asyncio.shield(self._close())

    async def _close(self):
        if not self.readonly:
            with open(self.filename, 'w') as fd:
                await self.dump_fd(fd)

    @abc.abstractmethod
    async def load_fd(self, fd):
        '''
        Parse the open file object into repos
        '''
        pass # pragma: no cover

    @abc.abstractmethod
    async def dump_fd(self, fd):
        '''
        Serialize repos to the open file object
        '''
        pass # pragma: no cover
class MemorySource(Source):
    '''
    Stores repos in a dict in memory
    '''

    def __init__(self, src: str) -> None:
        super().__init__(src)
        # Mapping of src_url -> Repo; mutations are guarded by self.lock
        self.mem: Dict[str, Repo] = {}
        self.lock = asyncio.Lock()

    async def update(self, repo):
        '''
        Insert or replace the repo keyed by its src_url
        '''
        async with self.lock:
            self.mem[repo.src_url] = repo

    async def repos(self) -> AsyncIterator[Repo]:
        '''
        Yield every repo currently held in memory
        '''
        # NOTE No lock used here because sometimes we iterate and update
        # Feel free to debate this by opening an issue.
        for stored in self.mem.values():
            yield stored

    async def repo(self, src_url: str) -> Repo:
        '''
        Return the stored repo for src_url, or a fresh empty Repo
        '''
        async with self.lock:
            return self.mem.get(src_url, Repo(src_url))
+ ''' + + def __init__(self, *args: Repo, src: str = '') -> None: + super().__init__(src) + self.mem = {repo.src_url: repo for repo in args} diff --git a/dffml/source/source.py b/dffml/source/source.py new file mode 100644 index 0000000000..ad16509a1f --- /dev/null +++ b/dffml/source/source.py @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +''' +Source subclasses are responsible for generating an integer value given an open +source project's source URL. +''' +import abc +import asyncio +from typing import AsyncIterator, Dict, List, Optional, Callable + +from .log import LOGGER +from ..repo import Repo, RepoData +from ..util.asynchelper import AsyncContextManagerList +from ..util.entrypoint import Entrypoint + +class Source(abc.ABC, Entrypoint): + ''' + Abstract base class for all sources. New sources must be derived from this + class and implement the repos method. + ''' + + ENTRY_POINT = 'dffml.source' + + def __init__(self, src: str) -> None: + self.src = src + + @abc.abstractmethod + async def update(self, repo: Repo): + ''' + Updates a repo for a source + ''' + + @abc.abstractmethod + async def repos(self) -> AsyncIterator[Repo]: + ''' + Returns a list of repos retrieved from self.src + ''' + # mypy ignores AsyncIterator[Repo], therefore this is needed + yield Repo('') # pragma: no cover + + @abc.abstractmethod + async def repo(self, src_url: str): + ''' + Get a repo from the source or add it if it doesn't exist + ''' + + @classmethod + def load_from_dict(cls, sources: Dict[str, str]): + ''' + Loads each source requested and instantiates it with its src_url. 
+ ''' + loaded: Dict[str, Source] = {} + for src_url, name in sources.items(): + loaded[src_url] = cls.load(name)(src_url) + return loaded + + def __repr__(self): + return '%s(%r)' % (self.__class__.__qualname__, self.src) + + async def open(self): + return + + async def close(self): + return + + async def __aenter__(self): + await self.open() + # TODO Context management + return self + + async def __aexit__(self, exc_type, exc_value, traceback): + await self.close() + +class Sources(AsyncContextManagerList): + + async def update(self, repo: Repo): + ''' + Updates a repo for a source + ''' + LOGGER.debug('Updating %r: %r', repo.src_url, repo.dict()) + for source in self: + await source.update(repo) + + async def repos(self, validation: Optional[Callable[[Repo], bool]] = None) \ + -> AsyncIterator[Repo]: + ''' + Retrieves repos from all sources + ''' + for source in self: + async for repo in source.repos(): + if validation is None or validation(repo): + yield repo + + async def repo(self, src_url: str): + ''' + Retrieve and or register repo will all sources + ''' + repo = Repo(src_url) + for source in self: + repo.merge(await source.repo(src_url)) + return repo + + async def classified_with_features(self, + features: List[str]) -> AsyncIterator[Repo]: + ''' + Returns all classified repos which have the requested features + ''' + async for repo in self.repos(lambda repo: \ + bool(repo.features(features) and repo.classified())): + yield repo + + async def unclassified_with_features(self, + features: List[str]) -> AsyncIterator[Repo]: + ''' + Returns all unclassified repos which have the requested features + ''' + async for repo in self.repos(lambda repo: \ + bool(repo.features(features) and not repo.classified())): + yield repo + + async def with_features(self, features: List[str]) -> AsyncIterator[Repo]: + ''' + Returns all repos which have the requested features + ''' + async for repo in self.repos(lambda repo: bool(repo.features(features))): + yield repo + +class 
SubsetSources(Sources): + ''' + Restricts access to a subset of repos during iteration based on their keys. + ''' + + def __init__(self, *args: Source, keys: Optional[List[str]] = None) \ + -> None: + super().__init__(*args) + if keys is None: + keys = [] + self.keys = keys + + async def repos(self, validation: Optional[Callable[[Repo], bool]] = None) \ + -> AsyncIterator[Repo]: + for key in self.keys: + repo = await self.repo(key) + if validation is None or validation(repo): + yield repo + +class ValidationSources(Sources): + ''' + Restricts access to a subset of repos during iteration based on a validation + function. + ''' + + def __init__(self, *args: Source, validation: Callable[[Repo], bool]) \ + -> None: + super().__init__(*args) + self.validation = validation + + async def repos(self, validation: Optional[Callable[[Repo], bool]] = None) \ + -> AsyncIterator[Repo]: + async for repo in super().repos(): + if self.validation(repo) \ + and (validation is None or validation(repo)): + yield repo diff --git a/dffml/util/__init__.py b/dffml/util/__init__.py new file mode 100644 index 0000000000..5bbefb030a --- /dev/null +++ b/dffml/util/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation diff --git a/dffml/util/asynchelper.py b/dffml/util/asynchelper.py new file mode 100644 index 0000000000..e684f92e04 --- /dev/null +++ b/dffml/util/asynchelper.py @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +''' +WARNING: concurrent can be much slower for quick tasks. It is best used for long +running concurrent tasks. 
class AsyncContextManagerList(list):
    '''
    A list whose items are themselves async context managers: entering the
    list enters every item in order, and exiting it exits every item in
    order.
    '''

    def __init__(self, *args):
        # list() accepts any iterable, so the args tuple can be passed
        # straight through.
        super().__init__(args)

    async def __aenter__(self):
        for managed in self:
            await managed.__aenter__()
        # TODO Context management
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        for managed in self:
            await managed.__aexit__(exc_type, exc_value, traceback)
+ ''' + methods = inspect.getmembers(self, predicate=inspect.ismethod) + for name, method in methods: + if inspect.iscoroutinefunction(method) \ + and (name.startswith('test_') \ + or name in ['setUp', 'tearDown']): + setattr(self, name, self.async_wrapper(method)) + return super().run(result=result) diff --git a/dffml/util/cli.py b/dffml/util/cli.py new file mode 100644 index 0000000000..659c2782ad --- /dev/null +++ b/dffml/util/cli.py @@ -0,0 +1,241 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +import os +import sys +import copy +import json +import asyncio +import inspect +import logging +import argparse +from typing import Optional + +from .log import LOGGER +from ..repo import Repo +from ..port import Port +from ..feature import Feature, Features +from ..source import Source, Sources, JSONSource +from ..model import Model + +LOGGER = LOGGER.getChild('cli') + +class ParseSourcesAction(argparse.Action): + + def __call__(self, parser, namespace, values, option_string=None): + if not isinstance(values, list): + values = [values] + parse = dict(map(lambda source: source.split('=', maxsplit=2)[::-1], + values)) + values = Sources(*list(Source.load_from_dict(parse).values())) + setattr(namespace, self.dest, values) + +class ParseFeaturesAction(argparse.Action): + + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, Features.load(*values)) + +class ParseModelAction(argparse.Action): + + def __call__(self, parser, namespace, value, option_string=None): + setattr(namespace, self.dest, Model.load(value)()) + +class ParsePortAction(argparse.Action): + + def __call__(self, parser, namespace, value, option_string=None): + setattr(namespace, self.dest, Port.load(value)()) + +class ParseLoggingAction(argparse.Action): + + def __call__(self, parser, namespace, value, option_string=None): + setattr(namespace, self.dest, + getattr(logging, value.upper(), logging.INFO)) + 
class Arg(dict):
    '''
    A command line argument: stores the flag name plus the keyword
    arguments that will be forwarded to argparse's add_argument.
    '''

    def __init__(self, name: str, **kwargs) -> None:
        super().__init__(**kwargs)
        self.name = name

    def modify(self, name: Optional[str] = None, **kwargs):
        '''
        Return a copy of this Arg with updated keyword arguments and,
        optionally, a new flag name. The original is left untouched.
        '''
        updated = copy.copy(self)
        updated.update(kwargs)
        if name is not None:
            updated.name = name
        return updated

class JSONEncoder(json.JSONEncoder):
    '''
    Encodes dffml types to JSON representation.
    '''

    def default(self, obj):
        # Repos serialize to their dict form, Features to their NAME
        if isinstance(obj, Repo):
            return obj.dict()
        elif isinstance(obj, Feature):
            return obj.NAME
        return json.JSONEncoder.default(self, obj)

class CMD(object):
    '''
    Base class for CLI commands. Class attributes named arg_* which are
    Arg instances are collected into argparse arguments; matching keyword
    arguments passed to __init__ are set as instance attributes.
    '''

    JSONEncoder = JSONEncoder

    arg_log = Arg('-log', help='Logging level', action=ParseLoggingAction,
                  required=False, default=logging.INFO)

    def __init__(self, **kwargs) -> None:
        for name, method in [(name.lower().replace('arg_', ''), method)
                             for name, method in inspect.getmembers(self)
                             if isinstance(method, Arg)]:
            # Accept either the attribute-derived name or the raw flag name
            if name not in kwargs and method.name in kwargs:
                name = method.name
            # Fall back to the Arg's declared default when not supplied
            if name not in kwargs and 'default' in method:
                kwargs[name] = method['default']
            if name in kwargs:
                LOGGER.debug('Setting %s.%s = %r', self, name, kwargs[name])
                setattr(self, name, kwargs[name])
            else:
                LOGGER.debug('Ignored %s.%s', self, name)

    async def __aenter__(self):
        pass

    async def __aexit__(self, exc_type, exc_value, traceback):
        pass

    @classmethod
    async def parse_args(cls, *args):
        '''
        Build a Parser for this command tree and parse args with it.
        '''
        parser = Parser()
        parser.add_subs(cls)
        return parser, parser.parse_args(args)

    @classmethod
    async def cli(cls, *args):
        '''
        Parse arguments, instantiate the selected sub command, and run it.
        Returns the command's result, or None when only help was printed.
        '''
        self = cls()
        parser, args = await self.parse_args(*args)
        if getattr(args, 'cmd', None) is None:
            parser.print_help()
            return None
        if getattr(args.cmd, 'run', None) is None:
            args.parser.print_help()
            return None
        cmd = args.cmd(**self.sanitize_args(vars(args)))
        async with cmd:
            # Async generator commands are drained into a list
            if inspect.isasyncgenfunction(cmd.run):
                return [res async for res in cmd.run()]
            else:
                return await cmd.run()

    def sanitize_args(self, args):
        '''
        Remove CMD internals from arguments passed to subclasses of CMD.
        '''
        for rm in ['cmd', 'parser', 'log']:
            if rm in args:
                del args[rm]
        return args

    @classmethod
    def main(cls, loop=None, argv=None):
        '''
        Runs cli commands in asyncio loop and outputs in appropriate format
        '''
        # Resolve defaults at call time: evaluating asyncio.get_event_loop()
        # or sys.argv in the signature would bind them once at import time
        if loop is None:
            loop = asyncio.get_event_loop()
        if argv is None:
            argv = sys.argv
        result = None
        try:
            result = loop.run_until_complete(cls.cli(*argv[1:]))
        except KeyboardInterrupt: # pragma: no cover
            pass # pragma: no cover
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
        if result is not None:
            json.dump(result, sys.stdout, sort_keys=True, indent=4,
                      separators=(',', ': '), cls=cls.JSONEncoder)
            print()

class Parser(argparse.ArgumentParser):

    def add_subs(self, add_from: CMD):
        '''
        Add sub commands and arguments recursively
        '''
        # Only one subparser should be created even if multiple sub commands
        subparsers = None
        for name, method in [(name.lower().replace('_', ''), method)
                             for name, method in inspect.getmembers(add_from)]:
            if inspect.isclass(method) and issubclass(method, CMD):
                if subparsers is None: # pragma: no cover
                    subparsers = self.add_subparsers() # pragma: no cover
                parser = subparsers.add_parser(name, help=None
                        if method.__doc__ is None else method.__doc__.strip())
                parser.set_defaults(cmd=method)
                parser.set_defaults(parser=parser)
                parser.add_subs(method) # type: ignore
            elif isinstance(method, Arg):
                self.add_argument(method.name, **method)
class ListEntrypoint(CMD):
    '''
    Subclass this with an Entrypoint to display all registered classes.
    '''

    def display(self, cls):
        '''
        Print out the loaded but uninstantiated class
        '''
        # Show the docstring alongside the name when one is present
        if not cls.__doc__ is None:
            print('%s:' % (cls.__qualname__))
            print(cls.__doc__.rstrip())
        else:
            print('%s' % (cls.__qualname__))
        print()

    async def run(self):
        '''
        Display all classes registered with the entrypoint
        '''
        for cls in self.ENTRYPOINT.load():
            self.display(cls)

class FeaturesCMD(CMD):
    '''
    Set timeout for features
    '''

    # -features parses into a Features collection; -timeout bounds how long
    # each feature evaluation may run
    arg_features = Arg('-features', nargs='+', required=True,
                       default=Features(), action=ParseFeaturesAction)
    arg_timeout = Arg('-timeout', help='Feature evaluation timeout',
                      required=False, default=Features.TIMEOUT, type=int)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Propagate the parsed timeout onto the features collection
        self.features.timeout = self.timeout

class SourcesCMD(CMD):
    '''
    Commands which load and save repos via one or more sources.
    '''

    # Defaults to a JSON source stored in the user's cache directory
    arg_sources = Arg('-sources', help='Sources for loading and saving',
                      nargs='+', default=Sources(JSONSource(os.path.join(
                          os.path.expanduser('~'), '.cache', 'dffml.json'))),
                      action=ParseSourcesAction)
class ModelCMD(CMD):
    '''
    Set a models model dir.
    '''

    arg_model = Arg('-model', help='Model used for ML',
                    action=ParseModelAction, required=True)
    # Where the model stores trained state; defaults to the user's cache
    arg_model_dir = Arg('-model_dir', help='Model directory for ML',
                        default=os.path.join(os.path.expanduser('~'),
                                             '.cache', 'dffml'))

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Propagate the parsed directory onto the model instance
        self.model.model_dir = self.model_dir

class PortCMD(CMD):
    '''
    Commands which take a positional Port implementation.
    '''

    arg_port = Arg('port', action=ParsePortAction)

class KeysCMD(CMD):
    '''
    Commands which operate on one or more repo keys.
    '''

    arg_keys = Arg('-keys', help='Key used for source lookup and evaluation',
                   nargs='+', required=True)

class Entrypoint(object):
    '''
    Uses the pkg_resources.iter_entry_points on the ENTRY_POINT of the class
    '''

    ENTRY_POINT = 'util.entrypoint'

    @classmethod
    def load(cls, loading=None):
        '''
        Loads all installed loading and returns them as a list. Sources to be
        loaded should be registered to ENTRY_POINT via setuptools.

        When ``loading`` is given, return only the entry point registered
        under that name; raise KeyError when no such entry point exists.
        '''
        loading_classes = []
        for i in pkg_resources.iter_entry_points(cls.ENTRY_POINT):
            loaded = i.load()
            # Only entry points which subclass this class are considered
            if issubclass(loaded, cls):
                loading_classes.append(loaded)
                if loading is not None and i.name == loading:
                    return loaded
        if loading is not None:
            raise KeyError('%s was not found in (%s)' % \
                    (repr(loading), ', '.join(list(map(str, loading_classes)))))
        return loading_classes

    @classmethod
    def load_multiple(cls, to_load: List[str]):
        '''
        Loads each class requested without instantiating it.
        '''
        return {name: cls.load(name) for name in to_load}
class Watchdog(object):
    '''
    Collects (event, msg) pairs on an internal queue and replays them to a
    consumer via events(). The stream is terminated by a 'done' event.
    '''

    LOGGER = LOGGER.getChild('Watchdog')

    def __init__(self) -> None:
        '''
        Create the unbounded queue of (event, msg) tuples.
        '''
        self.queue: asyncio.Queue = asyncio.Queue()

    async def enqueue(self, event, msg):
        '''
        Queue one (event, msg) pair for delivery to the consumer.
        '''
        self.LOGGER.debug('put: %r', (event, msg))
        await self.queue.put((event, msg))

    async def events(self):
        '''
        Yield queued (event, msg) pairs in order. The 'done' event is
        yielded last, after the queue has been fully joined.
        '''
        while True:
            event, msg = await self.queue.get()
            self.LOGGER.debug('got: %r', (event, msg))
            self.queue.task_done()
            if event == 'done':
                # Wait for all producers before emitting the final event
                await self.queue.join()
                yield event, msg
                return
            yield event, msg
class Task(object):
    '''
    A unit of work whose progress can be observed by any number of
    Watchdogs. Every event is recorded so watchdogs added late still
    receive the full history.
    '''

    LOGGER = LOGGER.getChild('Task')

    def __init__(self, func = None, key: Any = '') -> None:
        # func, when given, is called with task=self to build the coroutine
        coro = None
        if not func is None:
            coro = func(task=self)
        if not key:
            key = coro
        self.__key = key
        self.__coro = coro
        self.__lock = asyncio.Lock()
        # Previous updates so added watchdogs get all updates ever
        self.__events: List[Any] = []
        self.__watchdogs: List[Watchdog] = []

    @property
    def key(self):
        # Identifier used by Monitor to deduplicate running tasks
        return self.__key

    @property
    def coro(self):
        # The coroutine Monitor awaits to run this task
        return self.__coro

    async def add_watchdog(self, watchdog: Watchdog):
        '''
        Register a watchdog and replay every event recorded so far to it.
        '''
        async with self.__lock:
            self.__watchdogs.append(watchdog)
            self.LOGGER.debug('[%r] adding watcher', self.__key)
            self.LOGGER.debug('[%r] adding watcher backlog: %r', self.__key,
                              self.__events)
            self.LOGGER.debug('[%r] watchers: %r', self.__key,
                              self.__watchdogs)
            async for event, msg in self.get_events():
                await watchdog.enqueue(event, msg)

    async def completed(self, result):
        '''
        Record the final result, notify all watchdogs, and release them.
        '''
        async with self.__lock:
            self.LOGGER.debug('[%r] completed', self.__key)
            await self.append_event('done', result)
            for watchdog in self.__watchdogs:
                await watchdog.enqueue('done', result)
            self.__watchdogs = []

    async def update(self, msg, event='update'):
        '''
        Record an event and fan it out to all registered watchdogs.
        '''
        async with self.__lock:
            self.LOGGER.debug('[%r] sending %s: %r', self.__key, event, msg)
            await self.append_event(event, msg)
            for watchdog in self.__watchdogs:
                await watchdog.enqueue(event, msg)

    async def log(self, fmt, *args):
        # Convenience wrapper emitting a 'log' event
        await self.update(fmt % args, event='log')

    async def append_event(self, event, msg):
        # Record history so late watchdogs can be brought up to date
        self.__events.append((event, msg))

    async def get_events(self):
        # Yield the recorded history in order
        for event, msg in self.__events:
            yield event, msg

    async def complete(self):
        '''
        Wait until the task finishes and return its result.
        '''
        async for event, msg in self.events():
            if event == 'done':
                self.LOGGER.debug('[%r] complete %r', self.__key, msg)
                return msg

    async def events(self):
        '''
        Yield all events for this task, past and future, ending with 'done'.
        '''
        watchdog = Watchdog()
        await self.add_watchdog(watchdog)
        async for event, msg in watchdog.events():
            self.LOGGER.debug('[%r] got event %r: %r', self.__key, event, msg)
            yield event, msg

    async def status(self):
        # Yield only 'update' payloads until the task is done
        async for event, msg in self.events():
            if event == 'done':
                break
            elif event == 'update':
                yield msg

    async def statuses(self):
        # All status updates as a list (blocks until done)
        return [msg async for msg in self.status()]

    async def logs(self):
        # All 'log' payloads as a list (blocks until done)
        return [msg async for event, msg in self.events() if event == 'log']
class Monitor(object):
    '''
    Tracks running Tasks by key, letting callers await results or observe
    progress without starting duplicate work.
    '''

    LOGGER = LOGGER.getChild('Monitor')

    def __init__(self):
        # Maps task key -> running Task; guarded by self.lock
        self.in_progress = {}
        self.lock = asyncio.Lock()
        self.log_lock = asyncio.Lock()

    async def task(self, key: Any):
        '''
        Look up the running task for key; None when not running.
        '''
        task = None
        async with self.lock:
            task = self.in_progress.get(key, None)
            if task is None:
                return
        return task

    async def complete(self, key: Any):
        '''
        Wait until the task registered under key has finished.
        '''
        task = await self.task(key)
        if task is None:
            return
        await task.complete()

    async def events(self, key: Any):
        # Forward the task's event stream; ends immediately if not running
        task = await self.task(key)
        if task is None:
            return
        async for event, msg in task.events():
            yield event, msg

    async def status(self, key: Any):
        # Yield status updates for the task registered under key
        task = None
        async with self.lock:
            task = self.in_progress.get(key, None)
            if task is None:
                return
        async for msg in task.status():
            yield msg

    async def statuses(self, key: Any):
        # All status updates as a list (blocks until the task is done)
        return [msg async for msg in self.status(key)]

    async def log_status(self, key: Any):
        # Forward status updates, logging each one as it passes through
        async for msg in self.status(key):
            self.LOGGER.debug('status [%r]: %r', key, msg)
            yield msg
        self.LOGGER.debug('log status [%r] is done', key)

    async def run_task(self, task: Task):
        '''
        Await the task's coroutine, then record its result and deregister it.
        '''
        self.LOGGER.debug('Started running %r', task.key)
        result = await task.coro # type: ignore
        self.LOGGER.debug('Done running %r', task.key)
        async with self.lock:
            await task.completed(result)
            del self.in_progress[task.key]
            self.LOGGER.debug('Removed running %r', task.key)

    async def start(self, func, key: Any = '', mktask = Task):
        '''
        Start func as a task under key unless one is already running.
        Returns the new Task, or None when key is already in progress.
        '''
        async with self.lock:
            if key in self.in_progress:
                self.LOGGER.debug('Already running %r', key)
                return
            task = mktask(func, key)
            self.in_progress[task.key] = task
            asyncio.ensure_future(self.run_task(task))
            return task
class TempDir(object):
    '''
    Creates and deletes temporary directories. Removes any created directories
    when the program using this class terminates (see rmtempdirs for details).
    '''

    # No suffix by default; subclasses may override with a str.
    # Was annotated `SUFFIX: str = None`, which contradicted the value.
    SUFFIX = None
    PREFIX: str = 'dffml_'

    def __init__(self):
        self.suffix = self.__class__.SUFFIX
        self.prefix = self.__class__.PREFIX
        # Directories created so far, removed by rmtempdirs()
        self.dirs: List[str] = []

    def mktempdir(self):
        '''
        Creates a temporary directory using TempDir's SUFFIX and PREFIX.
        Adds the directory to the to be deleted queue.
        '''
        dirname = tempfile.mkdtemp(suffix=self.suffix, prefix=self.prefix)
        LOGGER.debug('Created directory %r', dirname)
        self.dirs.append(dirname)
        return dirname

    def rmtempdirs(self):
        '''
        Removes all created temporary directories. Intended to be invoked on
        termination (for example via __aexit__ or atexit.register) so no
        created directories are left behind.
        '''
        for rmdir in self.dirs:
            LOGGER.debug('Removing directory %r', rmdir)
            # OSError 39 sometimes if removal isn't attempted twice
            shutil.rmtree(rmdir, ignore_errors=True)
            shutil.rmtree(rmdir, ignore_errors=True)
        # Forget removed directories so a second call is a no-op
        self.dirs = []

    async def __aenter__(self):
        # Return self so `async with TempDir() as tempdir` binds the
        # instance instead of None
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        self.rmtempdirs()
b/feature/git/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/feature/git/.gitignore b/feature/git/.gitignore new file mode 100644 index 0000000000..070ee81c83 --- /dev/null +++ b/feature/git/.gitignore @@ -0,0 +1,20 @@ +*.log +*.pyc +.cache/ +.coverage +.idea/ +.vscode/ +*.egg-info/ +build/ +dist/ +docs/build/ +venv/ +wheelhouse/ +*.egss +.mypy_cache/ +*.swp +.venv/ +.eggs/ +*.modeldir +*.db +htmlcov/ diff --git a/feature/git/LICENSE b/feature/git/LICENSE new file mode 100644 index 0000000000..8ce5aa9e27 --- /dev/null +++ b/feature/git/LICENSE @@ -0,0 +1,21 @@ +Copyright (c) 2017-2019 Intel + +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
DFFML Features For Git Version Control are distributed under the
`MIT License `_
class GitAuthorsFeature(GitFeature):
    '''
    Counts the number of unique authors within the frequency.
    '''

    NAME: str = 'authors'

    async def git_parse(self, data):
        authors = []
        # Walk backwards through history one FREQUENCY period at a time
        for current in range(0, self.LENGTH * self.FREQUENCY.MONTHS,
                             self.FREQUENCY.MONTHS):
            current_authors = {}
            # %aN prints just the author name, one line per commit
            proc = await data.git.create('log',
                    '--pretty=format:%aN',
                    '--date', 'relative',
                    '--before', '%d months' % (current),
                    '--after', '%d months' % (current + \
                        self.FREQUENCY.MONTHS))
            while not proc.stdout.at_eof():
                line = await proc.stdout.readline()
                line = line.decode(errors='ignore').strip()
                if line != '':
                    # Only the set of keys matters; the 0 value is never read
                    current_authors.setdefault(line, 0)
            await stop(proc)
            # Unique author count for this period
            authors.append(len(current_authors))
        data.temp.setdefault(self.NAME, authors)
        await data.data.set(self.NAME, await self.calc(data))

    async def calc(self, data):
        return data.temp.get(self.NAME)
class GitClocFeature(GitMonthlyFeature):
    '''
    Count Lines Of Code

    Calculates, per month, the percentage of comment lines relative to
    comment plus code lines.
    '''

    NAME: str = 'cloc'
    BINARY: str = 'cloc'
    # Compatible but faster line counters, preferred when installed
    FASTER_THAN_CLOC = ['tokei']

    def __init__(self):
        super().__init__()
        self.binary = self.BINARY
        for binary in self.FASTER_THAN_CLOC:
            if inpath(binary):
                self.binary = binary

    async def applicable(self, data):
        # Needs both the counter binary on PATH and a git repo
        return inpath(self.binary) \
                and await GitMonthlyFeature.applicable(self, data)

    async def git_parse(self, data):
        if not data.temp.get('cloc_data', False):
            # One independent dict per month. The previous
            # [{'sum': 0}] * self.LENGTH aliased a SINGLE dict across every
            # slot, so each month's parse overwrote all of the others.
            data.temp.setdefault('cloc_data',
                                 [{'sum': 0} for _ in range(self.LENGTH)])
        await super().git_parse(data)

    async def month_parse(self, data, i):
        '''
        Run the counter binary on the checked out tree and store
        per-language line counts for month i.
        '''
        parsed = data.temp.get('cloc_data')
        proc = await create(self.binary, data.git.cwd)
        cols = []
        while not proc.stdout.at_eof():
            line = (await proc.stdout.readline()).decode().split()
            # Skip blank lines and horizontal rule rows
            if not line or line[0].startswith('-'):
                continue
            LOGGER.debug('%s line: %r', self.binary, line)
            if line[0].lower().startswith('lang'):
                # Header row names the numeric columns
                cols = [cat.lower() for cat in line[1:]]
                # Tokei -> cloc compatibility
                if 'comments' in cols:
                    cols[cols.index('comments')] = 'comment'
                continue
            if cols:
                # Leading non-numeric words form the row label
                header_cols = [word for word in line if not word.isdigit()]
                header = ''.join([c for c in '_'.join(header_cols).lower() \
                        if c.isalpha() or c == '_'])
                # Tokei -> cloc compatibility
                if header == 'total':
                    header = 'sum'
                parsed[i][header] = dict(zip(cols,
                        map(int, line[len(header_cols):])))
                LOGGER.debug('parsed[%d]: %r', i, parsed[i])
        await stop(proc)

    async def calc(self, data):
        '''
        Percentage of comment lines out of comment plus code lines, per
        month. Zero for every month when there are no lines at all.
        '''
        try:
            return [int(100 * month['sum']['comment'] / \
                    (month['sum']['comment'] + month['sum']['code']))
                    for month in (data.temp.get('cloc_data'))]
        except ZeroDivisionError:
            return [0 for month in (data.temp.get('cloc_data'))]
class GitCommitsFeature(GitFeature):
    '''
    Counts the number of commits within the frequency.
    '''

    NAME: str = 'commits'

    async def git_parse(self, data):
        commits = []
        # Count commit lines for each FREQUENCY period going back in time
        for current in range(0, self.LENGTH * self.FREQUENCY.MONTHS,
                             self.FREQUENCY.MONTHS):
            lines = 0
            proc = await data.git.create('log',
                    '--oneline', '--date', 'relative',
                    '--before', '%d months' % (current),
                    '--after', '%d months' % (current + \
                        self.FREQUENCY.MONTHS))
            # --oneline prints exactly one line per commit
            while not proc.stdout.at_eof():
                if (await proc.stdout.readline()) != b'':
                    lines += 1
            commits.append(lines)
            await stop(proc)
        data.temp.setdefault('commits', commits)
        await data.data.set('commits', await self.calc(data))

    async def calc(self, data):
        return data.temp.get('commits')
check_output(self.binary, *args, **kwargs, cwd=self.cwd) + + async def checkout(self, branch: str = ''): + if not branch: + branch = self.main_branch + return await self.check_output('checkout', '-f', branch) + + async def ls_remote(self, src_url): + return (await self._ls_remote(src_url) or await self._svn_info(src_url)) + + async def _svn_info(self, src_url): + env = os.environ.copy() + env['git_askpass'] = 'echo' + proc = await create('svn', 'info', src_url, env=env) + done, pending = await asyncio.wait( + [proc.stdout.read(), proc.stderr.read()], + timeout=self.TIMEOUT, + return_when=asyncio.FIRST_COMPLETED) + [fut.cancel() for fut in pending] + first = ''.join([fut.result().decode(errors='ignore') \ + for fut in done]) + LOGGER.debug('svn info result: %r', first) + try: + proc.kill() + except: + pass + exit_code = await proc.wait() + if exit_code != 0: + return False + return True + + async def _ls_remote(self, src_url): + with tempfile.TemporaryDirectory(prefix='git_') as tempdir: + env = os.environ.copy() + env['git_askpass'] = 'echo' + proc = await create(self.binary, 'ls-remote', '--exit-code', + src_url, '-h', 'HEAD', env=env, cwd=tempdir) + done, pending = await asyncio.wait( + [proc.stdout.read(8), proc.stderr.read(5)], + timeout=self.TIMEOUT, + return_when=asyncio.FIRST_COMPLETED) + [fut.cancel() for fut in pending] + first = ''.join([fut.result().decode(errors='ignore') \ + for fut in done]) + LOGGER.debug('ls-remote result: %r', first) + if first.startswith('fatal'): + LOGGER.debug('ls-remote result: fatal%s', await + proc.stderr.read()) + await proc.wait() + return False + elif first.startswith('Username'): + LOGGER.debug('ls-remote got auth challenge') + proc.kill() + await proc.wait() + return False + # TODO Configurable ls-remote timeout + done, pending = await asyncio.wait([proc.wait()], + timeout=10) + cancelled = bool(len([fut.cancel() for fut in pending])) + if cancelled: + try: + proc.kill() + except: + pass + await proc.wait() + return 
False + else: + exit_code = [fut.result() for fut in done][0] + if exit_code != 0: + return False + return True + + async def clone(self, src_url: str): + if not (await self._clone(src_url) or await self._svn_clone(src_url)): + return False + self.main_branch = await self.infer_main_branch() + LOGGER.debug('main branch for %r is %r', src_url, self.main_branch) + return await self.check_output('log', '-n', '1') + + async def _svn_clone(self, src_url: str): + if not await self._svn_info(src_url): + return False + env = os.environ.copy() + env['git_askpass'] = 'echo' + if self.cwd is False or not len(self.cwd): + self.cwd = self.tempdir.mktempdir() + proc = await self.create('svn', 'clone', src_url, self.cwd, env=env) + await self._handle_clone_stream(proc, src_url) + return True + + async def _clone(self, src_url: str): + ''' + Downloads a git repo using the git binary. This requires that the git + binary be in the PATH environment variable. + ''' + if not await self._ls_remote(src_url): + return False + env = os.environ.copy() + env['git_askpass'] = 'echo' + if self.cwd is False or not len(self.cwd): + self.cwd = self.tempdir.mktempdir() + proc = await self.create('clone', src_url, self.cwd, env=env) + await self._handle_clone_stream(proc, src_url) + return True + + async def _handle_clone_stream(self, proc, src_url: str): + error = Exception('No errors') + try: + done, pending = await asyncio.wait( + [proc.stdout.read(8), proc.stderr.read(5)], + timeout=self.TIMEOUT, + return_when=asyncio.FIRST_COMPLETED) + [fut.cancel() for fut in pending] + first = ''.join([str(fut.result()) for fut in done]) + LOGGER.debug('clone result: %s', first) + if first == 'fatal': + raise RuntimeError(await proc.stderr.readline()) + elif first == 'Username': + raise RuntimeError('Requires authentication') + stream = '' + while proc.returncode is None: + done, pending = await asyncio.wait( + [proc.stdout.readline(), + proc.stderr.readline()], + timeout=self.TIMEOUT, + 
    async def infer_main_branch(self):
        '''
        Determine the remote's default branch from `git branch -r`; falls
        back to DEFAULT_MAIN_BRANCH on any failure.
        '''
        try:
            branches = (await self.check_output('branch', '-r')).split('\n')
            # The 'origin/HEAD -> origin/<name>' line names the default branch
            main = [branch for branch in branches \
                    if '->' in branch][0].split()[-1]
            # Strip the remote name prefix
            main = '/'.join(main.split('/')[1:])
        except Exception as error:
            LOGGER.error('Infering main branch: %s', error)
            return self.DEFAULT_MAIN_BRANCH
        return main

class GitFeature(Feature):
    '''
    Git repo based features
    '''

    NAME: str = 'git'
    INAPPLICABLE_MESSAGE = 'Not a git repo'
    # Number of FREQUENCY periods of history to collect
    LENGTH: int = 10
    FREQUENCY: int = Quarterly # type: ignore

    def dtype(self) -> Type:
        return int

    def length(self) -> int:
        return self.LENGTH

    async def applicable(self, data):
        async with (await data.mklock('git_lock')):
            # Count number of git features so that only the last feature removes
            # the directory on tearDown
            num_git_features = data.temp.get('num_git_features', 0)
            num_git_features += 1
            data.temp['num_git_features'] = num_git_features
            # If is_git_repo has been set to False then src is not a git repo
            is_git_repo = data.temp.get('is_git_repo', None)
            if not is_git_repo is None:
                return is_git_repo
            # Create an instance of the git helper so we can run git commands
            data.git = Git(TempDir())
            await data.log('Git start ls-remote')
            is_git_repo = await data.git.ls_remote(data.src_url)
            await data.log('Git ls-remote complete')
            data.temp.setdefault('is_git_repo', is_git_repo)
            return is_git_repo

    async def fetch(self, data):
        # Only clone once; subsequent features reuse the working directory
        async with (await data.mklock('git_lock')):
            if not os.path.isdir(data.git.cwd):
                await data.log('Git start clone')
                await data.git.clone(data.src_url)
                await data.log('Git clone complete')
                LOGGER.debug('Cloned to: %s', data.git.cwd)

    async def parse(self, data):
        # Serialize working-tree access: checkout mutates shared state
        async with (await data.mklock('git_lock')):
            LOGGER.debug('%s took git_lock', self.__class__.__qualname__)
            await data.git.checkout()
            return await self.git_parse(data)

    async def tearDown(self, data):
        # Last git feature standing removes the temporary clone
        async with (await data.mklock('git_lock')):
            data.temp['num_git_features'] -= 1
            if data.temp['num_git_features'] == 0:
                data.git.tempdir.rmtempdirs()
class GitLangsFeature(GitClocFeature):
    '''
    Language usage by percentage for a git repo
    '''

    NAME: str = 'langs'

    def dtype(self):
        return Dict[str, float]

    def length(self):
        return 1

    def percentage_of(self, numbers):
        '''
        Normalize a mapping of counts to fractions of their total, dropping
        aggregate 'sum'/'total' entries first. Mutates and returns numbers.
        '''
        for key in ['sum', 'total']:
            if key in numbers:
                del numbers[key]
        whole = sum(numbers.values())
        for key in numbers.keys():
            numbers[key] /= whole
        return numbers

    async def calc(self, data):
        # Fraction of code lines per language for the most recent month
        return self.percentage_of({lang: numbers['code'] for lang, numbers in \
                (data.temp.get('cloc_data'))[0].items()})

class GitLangFeature(GitLangsFeature):
    '''
    Most used language for a git repo
    '''

    NAME: str = 'lang'

    def dtype(self):
        return str

    async def calc(self, data):
        # Pick the language with the highest usage fraction
        langs_percentages = await super().calc(data)
        return sorted(langs_percentages,
                      key=langs_percentages.__getitem__)[::-1][0]
class GitMonthlyFeature(GitFeature):
    '''
    Checkout the repo each month
    '''

    # NOTE(review): shares NAME with GitClocFeature — confirm intended
    NAME: str = 'cloc'

    async def git_parse(self, data):
        # Walk back through history, checking out the last commit before
        # each period boundary and letting the subclass parse the tree
        i = -1
        for current in range(0, self.LENGTH * self.FREQUENCY.MONTHS,
                             self.FREQUENCY.MONTHS):
            last_commit = (await data.git.check_output('log',
                    '--pretty=oneline',
                    '--no-abbrev-commit', '-n', '1', '--date', 'relative',
                    '--before', '%d months' % (current))).strip()
            i += 1
            # No commits exist before this point in time; skip the period
            if len(last_commit) == 0:
                continue
            # First word of the oneline format is the commit hash
            last_commit = last_commit.split()[0]
            await data.git.check_output('reset', '--hard', last_commit)
            await data.git.checkout(last_commit)
            await self.month_parse(data, i)

    @abc.abstractmethod
    async def month_parse(self, data: Data, i: int):
        '''
        Parse the git repo this month
        '''
        pass
class GitReleaseFeature(GitFeature):
    '''
    Was there a release within the last LAST months of each period
    '''

    NAME = 'release'
    # Number of months since last release
    LAST: int = 18

    def valid_version(self, tag):
        '''
        True when tag looks like a version number (e.g. v1.0, 1_2_3).
        '''
        # Remove v from v1 to make isnumeric return True
        tag = tag.replace('v', '')
        # Make the only seperator . instead of - or _
        for replace in ['-', '_']:
            tag = tag.replace(replace, '.')
        # Make sure there is at least one number in the tag when split by .
        return bool(sum([1 for num in tag.split('.') if num.isnumeric()]))

    async def git_parse(self, data):
        release = [0] * self.LENGTH
        releases = []
        # Parse log: each tagged commit prints '<timestamp> <refnames>'
        proc = await data.git.create('log', '--tags',
                '--simplify-by-decoration', '--pretty=format:%at %D')
        while not proc.stdout.at_eof():
            line = await proc.stdout.readline()
            line = line.decode(errors='ignore').strip().split()
            LOGGER.debug('%r %s: %r', self, data.src_url, line)
            # Skip lines whose last ref does not look like a version tag
            if not line or not self.valid_version(line[-1]):
                continue
            releases.append(datetime.fromtimestamp(int(line[0])))
        await stop(proc)
        # Check if there was a release within LAST months of each period
        current = datetime.now()
        for i in range(0, self.LENGTH):
            # Renamed from six_months_from_current: the window is LAST (18)
            # months wide, not six
            window_start = current - relativedelta(months=self.LAST)
            for date in releases:
                if date < current and date > window_start:
                    release[i] = 1
            current -= relativedelta(months=self.FREQUENCY.MONTHS)
        data.temp.setdefault(self.NAME, release)
        await data.data.set(self.NAME, await self.calc(data))

    async def calc(self, data):
        return data.temp.get(self.NAME)
dataset of interest represent the same type. + ''' + if len(args) < 2: + return 0 + def __n_times_n_minus_1(number): + return number * (number - 1) + try: + return int(round((1.0 - (float(sum(map(__n_times_n_minus_1, args))) \ + / float(sum(args) * (sum(args) - 1)))) * 100.0)) + except ZeroDivisionError: + return 0 + +class GitWorkFeature(GitFeature): + ''' + Calculates the spread of authors and returns an integer between 0 and 10 + representing how varying the authorship of code is. For example a repo with + two authors where one commits 90% of the lines of code would calculates to + a 1. Equal work would calculate to a 10. + ''' + + NAME: str = 'work' + + async def git_parse(self, data): + work = [] + for current in range(0, self.LENGTH * self.FREQUENCY.MONTHS, + self.FREQUENCY.MONTHS): + author = '' + current_work = {} + proc = await data.git.create('log', + '--pretty=format:Author:%aN', '--numstat', + '--before', '%d months' % (current), + '--after', '%d months' % (current + \ + self.FREQUENCY.MONTHS)) + while not proc.stdout.at_eof(): + line = await proc.stdout.readline() + line = line.decode(errors='ignore').rstrip() + if line.startswith('Author:'): + author = line.split(':')[1] + if author and author not in current_work: + current_work[author] = 0 + elif line and author in current_work and \ + line.split()[0].isdigit(): + current_work[author] += int(line.split()[0]) + work.append(current_work) + await stop(proc) + data.temp.setdefault(self.NAME, work) + + async def calc(self, data): + return [simpsons_diversity_index(*authorship.values()) \ + for authorship in data.temp.get(self.NAME)] diff --git a/feature/git/dffml_feature_git/log.py b/feature/git/dffml_feature_git/log.py new file mode 100644 index 0000000000..283f375316 --- /dev/null +++ b/feature/git/dffml_feature_git/log.py @@ -0,0 +1,3 @@ +'''Logging''' +import logging +LOGGER = logging.getLogger(__package__) diff --git a/feature/git/dffml_feature_git/util/__init__.py 
b/feature/git/dffml_feature_git/util/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/feature/git/dffml_feature_git/util/log.py b/feature/git/dffml_feature_git/util/log.py new file mode 100644 index 0000000000..283f375316 --- /dev/null +++ b/feature/git/dffml_feature_git/util/log.py @@ -0,0 +1,3 @@ +'''Logging''' +import logging +LOGGER = logging.getLogger(__package__) diff --git a/feature/git/dffml_feature_git/util/proc.py b/feature/git/dffml_feature_git/util/proc.py new file mode 100644 index 0000000000..6562fd0e6e --- /dev/null +++ b/feature/git/dffml_feature_git/util/proc.py @@ -0,0 +1,56 @@ +''' +Asynchronous subprocess interaction. +''' +import os +import asyncio.subprocess + +from .log import LOGGER + +def inpath(binary): + return any(list(map(lambda dirname: os.path.isfile(os.path.join(dirname, + binary)), os.environ.get('PATH', '').split(':')))) + +async def stop(proc): + ''' + Stops a subprocess + ''' + exit_code = await proc.wait() + if exit_code != 0: + raise RuntimeError('\'%s\' exited with code %d: \'%s\'' \ + % (getattr(proc, 'name', 'subprocess'), exit_code, + getattr(proc, 'data', '').rstrip())) + return exit_code, proc + +async def create(*args, **kwargs): + ''' + Runs a subprocess using asyncio.create_subprocess_exec and returns the + process. + ''' + LOGGER.debug('proc.create: %r', args) + proc = await asyncio.create_subprocess_exec(*args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + start_new_session=True, + **kwargs) + proc.name = args[0] + proc.args = args[1:] + return proc + +async def get_output(proc): + ''' + Combines stdout and stderr + ''' + stderr = (await proc.stderr.read()).decode(errors='ignore') + stdout = (await proc.stdout.read()).decode(errors='ignore') + proc.data = stdout + stderr + return stdout, stderr + +async def check_output(*args, **kwargs): + ''' + Runs a subprocess using asyncio.create_subprocess_exec and returns either + its standard error or output. 
+ ''' + proc = await create(*args, **kwargs) + stdout, stderr = await get_output(proc) + await stop(proc) + return stdout or stderr diff --git a/feature/git/dffml_feature_git/version.py b/feature/git/dffml_feature_git/version.py new file mode 100644 index 0000000000..856ce1d12d --- /dev/null +++ b/feature/git/dffml_feature_git/version.py @@ -0,0 +1 @@ +VERSION = '0.1.2' diff --git a/feature/git/pyproject.toml b/feature/git/pyproject.toml new file mode 100644 index 0000000000..22002d6625 --- /dev/null +++ b/feature/git/pyproject.toml @@ -0,0 +1,17 @@ +[metadata] +name = 'wllearn' +version = '0.0.1' +description = '' +author = 'U.N. Owen' +author_email = 'me@un.known' +license = 'MIT/Apache-2.0' +url = 'https://github.com/_/wllearn' + +[requires] +python_version = ['2.7', '3.5', '3.6', 'pypy', 'pypy3'] + +[build-system] +requires = ['setuptools', 'wheel'] + +[tool.hatch.commands] +prerelease = 'hatch build' diff --git a/feature/git/setup.py b/feature/git/setup.py new file mode 100644 index 0000000000..12e5442656 --- /dev/null +++ b/feature/git/setup.py @@ -0,0 +1,65 @@ +import os +import ast +from io import open + +from setuptools import find_packages, setup + +self_path = os.path.dirname(os.path.realpath(__file__)) + +with open(os.path.join(self_path, 'dffml_feature_git', 'version.py'), + 'r') as f: + for line in f: + if line.startswith('VERSION'): + version = ast.literal_eval(line.strip().split('=')[-1].strip()) + break + +with open(os.path.join(self_path, 'README.rst'), 'r', encoding='utf-8') as f: + readme = f.read() + +INSTALL_REQUIRES = [ + "python-dateutil>=2.7.3" + ] + +setup( + name='dffml_feature_git', + version=version, + description='', + long_description=readme, + author='John Andersen', + author_email='john.s.andersen@intel.com', + url='https://github.com/intel/dffml/blob/master/feature/git/README.rst', + license='MIT', + + keywords=[ + '', + ], + + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'License :: OSI 
Approved :: MIT License', + 'License :: OSI Approved :: Apache Software License', + 'Natural Language :: English', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: PyPy', + ], + + install_requires=INSTALL_REQUIRES, + tests_require=[], + + packages=find_packages(), + entry_points={ + 'dffml.feature': [ + 'git = dffml_feature_git.feature.git:GitFeature', + 'work = dffml_feature_git.feature.work:GitWorkFeature', + 'cloc = dffml_feature_git.feature.cloc:GitClocFeature', + 'lang = dffml_feature_git.feature.lang:GitLangFeature', + 'langs = dffml_feature_git.feature.lang:GitLangsFeature', + 'commits = dffml_feature_git.feature.commits:GitCommitsFeature', + 'authors = dffml_feature_git.feature.authors:GitAuthorsFeature', + 'release = dffml_feature_git.feature.release:GitReleaseFeature', + ], + }, +) diff --git a/feature/git/tests/__init__.py b/feature/git/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/feature/git/tests/test_feature.py b/feature/git/tests/test_feature.py new file mode 100644 index 0000000000..a48e3af765 --- /dev/null +++ b/feature/git/tests/test_feature.py @@ -0,0 +1,79 @@ +# pylint: disable=missing-docstring,no-self-use +import unittest + +from dffml.feature import Feature, Features +from dffml.source import MemorySource +from dffml.util.asynctestcase import AsyncTestCase + +# Git Repo based features +from dffml_feature_git.feature.git import GitFeature +from dffml_feature_git.feature.cloc import GitClocFeature +from dffml_feature_git.feature.lang import GitLangsFeature, GitLangFeature +from dffml_feature_git.feature.work import GitWorkFeature +from dffml_feature_git.feature.release import GitReleaseFeature +from dffml_feature_git.feature.commits import GitCommitsFeature +from dffml_feature_git.feature.authors import GitAuthorsFeature + +FEATURES = [ + # Git repo features 
+ GitCommitsFeature, + GitAuthorsFeature, + GitWorkFeature, + GitClocFeature, + GitReleaseFeature, +] +GIT_FEATURES = Features( + *[feature() for feature in FEATURES if issubclass(feature, GitFeature)]) + +class TestFeature(unittest.TestCase): + + def test_load_builtin_features(self): + features = Feature.load() + for mustLoad in FEATURES: + with self.subTest(mustLoad=mustLoad): + self.assertIn(mustLoad, features) + +class TestGitFeatures(AsyncTestCase): + + async def test_git_features(self): + async with GIT_FEATURES: + for src_url in ['https://github.com/tpm2-software/tpm2-tss', + 'https://github.com/github/gitignore']: + with self.subTest(src_url=src_url): + features = await GIT_FEATURES.evaluate(src_url) + self.assertEqual(len(features.values()), len(GIT_FEATURES)) + for results in features.values(): + self.assertEqual(len(results), 10) + + async def test_git_feature_fail(self): + async with GIT_FEATURES: + for src_url in ['https://github.com/github/nope', + 'https://google.com']: + with self.subTest(src_url=src_url): + features = await GIT_FEATURES.evaluate(src_url) + self.assertEqual(len(features.values()), 0) + +class TestLangs(AsyncTestCase): + + def setUp(self): + self.src_url = 'https://github.com/tpm2-software/tpm2-tss' + self.features = Features(GitLangsFeature()) + + async def test_langs(self): + async with self.features: + features = await self.features.evaluate(self.src_url) + self.assertIn('langs', features) + self.assertIn('c', features['langs']) + self.assertGreater(features['langs']['c'], 0.1) + +class TestLang(AsyncTestCase): + + def setUp(self): + self.src_url = 'https://github.com/tpm2-software/tpm2-tss' + self.features = Features(GitLangFeature()) + + async def test_lang(self): + async with self.features: + features = await self.features.evaluate(self.src_url) + self.assertIn('lang', features) + self.assertEqual('c', features['lang']) diff --git a/feature/git/tests/test_git.py b/feature/git/tests/test_git.py new file mode 100644 index 
0000000000..5f6958b77c --- /dev/null +++ b/feature/git/tests/test_git.py @@ -0,0 +1,111 @@ +# pylint: disable=missing-docstring,no-self-use +import shutil +import random +import os.path +import unittest +import subprocess + +from dffml.util.tempdir import TempDir +from dffml.util.asynctestcase import AsyncTestCase + +from dffml_feature_git.feature.git import Git + +def has_git_svn() -> bool: + ''' + Travis installs git from the maintainers ppa the xenial git-svn does not + work with, and therefore does not install. + ''' + try: + subprocess.check_output(['git', 'svn'], stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as error: + if b'clone' in error.output: + return True + return False + +def mkgitrepo(gitdir): + subprocess.check_output(['git', 'init'], cwd=gitdir) + with open(os.path.join(gitdir, 'README.md'), 'w') as handle: + handle.write('# Hello World') + subprocess.check_output(['git', 'add', '-A'], cwd=gitdir) + subprocess.check_output(['git', 'commit', '-m', 'Initial Commit'], + cwd=gitdir) + +def mksvnrepo(gitdir): + return + +class TestGit(AsyncTestCase): + + async def setUp(self): + self.tempdir = TempDir() + await self.tempdir.__aenter__() + self.gcreated = self.tempdir.mktempdir() + self.screated = self.tempdir.mktempdir() + mkgitrepo(self.gcreated) + mksvnrepo(self.screated) + + async def tearDown(self): + await self.tempdir.__aexit__(None, None, None) + + async def test_git_clone(self): + git = Git(self.tempdir) + self.assertTrue(await git.clone(self.gcreated)) + shutil.rmtree(git.cwd, ignore_errors=True) + + @unittest.skipUnless(has_git_svn() and os.getenv('LONG_TESTS', '') != '', + 'Long SVN clone') + async def test_git_clone_svn(self): + git = Git(self.tempdir) + self.assertTrue( + len(await git.clone('https://svn.code.sf.net/p/lame/svn/trunk/lame'))) + shutil.rmtree(git.cwd, ignore_errors=True) + + async def test_no_repo(self): + git = Git(self.tempdir) + self.assertFalse(await git.clone(str(random.random()))) + 
self.assertFalse(os.path.isdir(git.cwd)) + + async def test_not_a_git_repo(self): + git = Git(self.tempdir) + self.assertFalse(await git.clone('https://example.com')) + self.assertFalse(os.path.isdir(git.cwd)) + + async def test_ls_remote_no_repo(self): + git = Git(self.tempdir) + self.assertFalse(await git.ls_remote(str(random.random()))) + + async def test_ls_remote(self): + git = Git(self.tempdir) + self.assertTrue(await git.ls_remote(self.gcreated)) + + @unittest.skipUnless(has_git_svn() and os.getenv('LONG_TESTS', '') != '', + 'Long SVN ls-remote') + async def test_ls_remote_svn(self): + git = Git(self.tempdir) + self.assertTrue(await git.ls_remote('https://svn.code.sf.net/p/lame/svn/trunk/lame')) + + @unittest.skipUnless(os.getenv('LONG_TESTS', '') != '', 'Hanging test') + async def test_ls_remote_forever(self): + ''' + Test case for a repo which hangs for a long time to make sure we git + ls-remote eventually. + ''' + git = Git(self.tempdir) + self.assertFalse(await git.ls_remote('git://java.net/jax-rs-spec~api')) + + async def test_infer_main_branch(self): + gitdir = self.tempdir.mktempdir() + subprocess.check_output(['git', 'init'], cwd=gitdir) + with open(os.path.join(gitdir, 'README.md'), 'w') as handle: + handle.write('# Hello World') + subprocess.check_output(['git', 'add', '-A'], cwd=gitdir) + subprocess.check_output(['git', 'checkout', '-b', 'v2'], cwd=gitdir) + subprocess.check_output(['git', 'commit', '-m', 'Initial Commit'], + cwd=gitdir) + for src_url, branch in [ + (self.gcreated, 'master'), + (gitdir, 'v2')]: + git = Git(self.tempdir) + with self.subTest(src_url=src_url, branch=branch): + self.assertTrue(await git.clone(src_url)) + self.assertEqual(git.main_branch, branch) + shutil.rmtree(git.cwd, ignore_errors=True) diff --git a/feature/git/tests/test_release.py b/feature/git/tests/test_release.py new file mode 100644 index 0000000000..65d04cf3aa --- /dev/null +++ b/feature/git/tests/test_release.py @@ -0,0 +1,31 @@ +# pylint: 
disable=missing-docstring,no-self-use +import unittest + +from dffml_feature_git.feature.release import GitReleaseFeature + +class TestReleaseFeature(unittest.TestCase): + + VALID = [ + '1.0.0', + 'v1.0.0', + 'curl-7_19_7', + 'miniupnpc_2_1', + '2_7_5', + ] + NOT_VALID = [ + 'asdf1', + 'as.df1', + ] + + def setUp(self): + self.feature = GitReleaseFeature() + + def test_valid(self): + for line in self.VALID: + with self.subTest(line=line): + self.assertTrue(self.feature.valid_version(line)) + + def test_not_valid(self): + for line in self.NOT_VALID: + with self.subTest(line=line): + self.assertFalse(self.feature.valid_version(line)) diff --git a/feature/git/tox.ini b/feature/git/tox.ini new file mode 100644 index 0000000000..e25dc9a70c --- /dev/null +++ b/feature/git/tox.ini @@ -0,0 +1,18 @@ +[tox] +envlist = + py27, + py35, + py36, + pypy, + pypy3, + +[testenv] +passenv = * +deps = + coverage + pytest +commands = + python setup.py --quiet clean develop + coverage run --parallel-mode -m pytest + coverage combine --append + coverage report -m diff --git a/model/tensorflow/.coveragerc b/model/tensorflow/.coveragerc new file mode 100644 index 0000000000..5b22b1dcfa --- /dev/null +++ b/model/tensorflow/.coveragerc @@ -0,0 +1,13 @@ +[run] +source = + dffml_model_tensorflow + tests +branch = True + +[report] +exclude_lines = + no cov + no qa + noqa + pragma: no cover + if __name__ == .__main__.: diff --git a/model/tensorflow/.gitattributes b/model/tensorflow/.gitattributes new file mode 100644 index 0000000000..dfe0770424 --- /dev/null +++ b/model/tensorflow/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/model/tensorflow/.gitignore b/model/tensorflow/.gitignore new file mode 100644 index 0000000000..070ee81c83 --- /dev/null +++ b/model/tensorflow/.gitignore @@ -0,0 +1,20 @@ +*.log +*.pyc +.cache/ +.coverage +.idea/ +.vscode/ +*.egg-info/ +build/ +dist/ +docs/build/ +venv/ +wheelhouse/ +*.egss +.mypy_cache/ 
+*.swp +.venv/ +.eggs/ +*.modeldir +*.db +htmlcov/ diff --git a/model/tensorflow/LICENSE b/model/tensorflow/LICENSE new file mode 100644 index 0000000000..8ce5aa9e27 --- /dev/null +++ b/model/tensorflow/LICENSE @@ -0,0 +1,21 @@ +Copyright (c) 2017-2019 Intel + +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/model/tensorflow/MANIFEST.in b/model/tensorflow/MANIFEST.in new file mode 100644 index 0000000000..a5021c60e3 --- /dev/null +++ b/model/tensorflow/MANIFEST.in @@ -0,0 +1,2 @@ +include README.rst +include LICENSE diff --git a/model/tensorflow/README.rst b/model/tensorflow/README.rst new file mode 100644 index 0000000000..d7ca81816c --- /dev/null +++ b/model/tensorflow/README.rst @@ -0,0 +1,53 @@ +DFFML Models for Tensorflow Library +=================================== + +About +----- + +DFFML models backed by Tensorflow. + +Install +------- + +.. code-block:: console + + virtualenv -p python3.7 .venv + . 
.venv/bin/activate + python3.7 -m pip install --user -U dffml[tensorflow] + +Usage +----- + +.. code-block:: console + + wget http://download.tensorflow.org/data/iris_training.csv + wget http://download.tensorflow.org/data/iris_test.csv + head iris_training.csv + sed -i 's/.*setosa,versicolor,virginica/SepalLength,SepalWidth,PetalLength,PetalWidth,classification/g' *.csv + head iris_training.csv + dffml train \ + -model dnn \ + -sources csv=iris_training.csv \ + -classifications 0 1 2 \ + -features \ + def:SepalLength:float:1 \ + def:SepalWidth:float:1 \ + def:PetalLength:float:1 \ + def:PetalWidth:float:1 \ + -num_epochs 3000 \ + -steps 20000 + dffml accuracy \ + -model dnn \ + -sources csv=iris_training.csv \ + -classifications 0 1 2 \ + -features \ + def:SepalLength:float:1 \ + def:SepalWidth:float:1 \ + def:PetalLength:float:1 \ + def:PetalWidth:float:1 + +License +------- + +DFFML Tensorflow Models are distributed under the terms of the `MIT License +`_ diff --git a/model/tensorflow/dffml_model_tensorflow/__init__.py b/model/tensorflow/dffml_model_tensorflow/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/model/tensorflow/dffml_model_tensorflow/model/__init__.py b/model/tensorflow/dffml_model_tensorflow/model/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/model/tensorflow/dffml_model_tensorflow/model/dnn.py b/model/tensorflow/dffml_model_tensorflow/model/dnn.py new file mode 100644 index 0000000000..081634c7ea --- /dev/null +++ b/model/tensorflow/dffml_model_tensorflow/model/dnn.py @@ -0,0 +1,241 @@ +''' +Uses Tensorflow to create a generic DNN which learns on all of the features in a +repo. 
+''' +import os +import asyncio +import hashlib +import numpy as np +import tensorflow +from typing import List, Dict, Any, AsyncIterator, Tuple, Optional + +from dffml.repo import Repo +from dffml.feature import Feature, Features +from dffml.source import Sources +from dffml.model import Model +from dffml.accuracy import Accuracy + +from .log import LOGGER + +LOGGER = LOGGER.getChild('dnn') + +class DNN(Model): + ''' + Model using tensorflow to make predictions. Handels creation of feature + columns for real valued, string, and list of real valued features. + ''' + + def __init__(self): + super().__init__() + self._model = None + # Load packages with lots of dependencies durring instantiation so that + # users can choose to install these or not. + self.__np = np + self._tf = tensorflow + + def mkclassifications(self, classifications): + classifications = {value: key for key, value in \ + self.mkcids(classifications).items()} + LOGGER.debug('classifications(%d): %r', len(classifications), + classifications) + return classifications + + def mkcids(self, classifications): + cids = dict(zip(range(0, len(classifications)), + sorted(classifications))) + LOGGER.debug('cids(%d): %r', len(cids), cids) + return cids + + async def applicable_features(self, features: Features): + usable = await self.features(features) + return [name for name in features.names() if name in usable] + + async def training_input_fn(self, sources: Sources, features: Features, + classifications: List[Any], + batch_size=20, shuffle=False, num_epochs=1, **kwargs): + ''' + Uses the numpy input function with data from repo features. 
+ ''' + classifications = self.mkclassifications(classifications) + features = await self.applicable_features(features) + LOGGER.debug('Training on features: %r', features) + x_cols: Dict[str, Any] = {feature: [] for feature in features} + y_cols = [] + for repo in [repo async for repo in \ + sources.classified_with_features(features) \ + if repo.classification() in classifications]: + for feature, results in repo.features(features).items(): + x_cols[feature].append(self.__np.array(results)) + y_cols.append(classifications[repo.classification()]) + presplit = len(y_cols) + if not presplit: + raise ValueError('No repos to train on') + split = 0.7 + split = int(float(presplit) * split) + y_cols = self.__np.array(y_cols[:split]) + for feature in x_cols: + x_cols[feature] = self.__np.array(x_cols[feature][:split]) + LOGGER.info('------ Repo Data ------') + LOGGER.info('total: %d', presplit) + LOGGER.info('x_cols: %d', len(list(x_cols.values())[0])) + LOGGER.info('y_cols: %d', len(y_cols)) + LOGGER.info('-----------------------') + input_fn = self._tf.estimator.inputs.numpy_input_fn(x_cols, + y_cols, batch_size=batch_size, + shuffle=shuffle, num_epochs=num_epochs, **kwargs) + return input_fn + + async def accuracy_input_fn(self, sources: Sources, features: Features, + classifications: List[Any], + batch_size=20, shuffle=False, num_epochs=1, **kwargs): + ''' + Uses the numpy input function with data from repo features. 
+ ''' + features = await self.applicable_features(features) + classifications = self.mkclassifications(classifications) + x_cols: Dict[str, Any] = {feature: [] for feature in features} + y_cols = [] + for repo in [repo async for repo in \ + sources.classified_with_features(features) \ + if repo.classification() in classifications]: + for feature, results in repo.features(features).items(): + x_cols[feature].append(self.__np.array(results)) + y_cols.append(classifications[repo.classification()]) + presplit = len(y_cols) + split = 0.7 + split = int(float(presplit) * split) + y_cols = self.__np.array(y_cols[split:]) + for feature in x_cols: + x_cols[feature] = self.__np.array(x_cols[feature][split:]) + LOGGER.info('------ Repo Data ------') + LOGGER.info('total: %d', presplit) + LOGGER.info('x_cols: %d', len(list(x_cols.values())[0])) + LOGGER.info('y_cols: %d', len(y_cols)) + LOGGER.info('-----------------------') + input_fn = self._tf.estimator.inputs.numpy_input_fn(x_cols, + y_cols, batch_size=batch_size, + shuffle=shuffle, num_epochs=num_epochs, **kwargs) + return input_fn + + async def predict_input_fn(self, repos: AsyncIterator[Repo], + features: Features, classifications: List[Any], **kwargs): + ''' + Uses the numpy input function with data from repo features. 
+ ''' + features = await self.applicable_features(features) + classifications = self.mkclassifications(classifications) + x_cols: Dict[str, Any] = {feature: [] for feature in features} + ret_repos = [] + async for repo in repos: + if not repo.features(features): + continue + ret_repos.append(repo) + for feature, results in repo.features(features).items(): + x_cols[feature].append(self.__np.array(results)) + for feature in x_cols: + x_cols[feature] = self.__np.array(x_cols[feature]) + LOGGER.info('------ Repo Data ------') + LOGGER.info('x_cols: %d', len(list(x_cols.values())[0])) + LOGGER.info('-----------------------') + input_fn = self._tf.estimator.inputs.numpy_input_fn(x_cols, + shuffle=False, num_epochs=1, **kwargs) + return input_fn, ret_repos + + async def features(self, features: Features): + ''' + Converts repos into training data + ''' + cols: Dict[str, Any] = {} + for feature in features: + col = self.feature_feature_column(feature) + if not col is None: + cols[feature.NAME] = col + return cols + + def feature_feature_column(self, feature: Feature): + ''' + Creates a feature column for a feature + ''' + dtype = feature.dtype() + if dtype is int or issubclass(dtype, int) \ + or dtype is float or issubclass(dtype, float): + return self._tf.feature_column.numeric_column(feature.NAME, + shape=feature.length()) + LOGGER.warning('Unknown dtype %r. Cound not create column' % (dtype)) + return None + + def model_dir_path(self, features: Features): + ''' + Creates the path to the model dir by using the provided model dir and + the sha256 hash of the concatenated feature names. 
+ ''' + if self.model_dir is None: + return None + model = hashlib.sha256(''.join(features.names()).encode('utf-8'))\ + .hexdigest() + if not os.path.isdir(self.model_dir): + raise NotADirectoryError('%s is not a directory' % (self.model_dir)) + return os.path.join(self.model_dir, model) + + async def model(self, features: Features, classifications: List[Any]): + ''' + Generates or loads a model + ''' + if self._model is not None: + return self._model + # Build 3 layer DNN with 10, 20, 10 units respectively. + # 2 classifications whitelist or blacklist + LOGGER.debug('Loading model with classifications(%d): %r', + len(classifications), classifications) + self._model = self._tf.estimator.DNNClassifier( + feature_columns=list((await self.features(features)).values()), + hidden_units=[10, 20, 10], + n_classes=len(classifications), + model_dir=self.model_dir_path(features)) + return self._model + + async def train(self, sources: Sources, features: Features, + classifications: List[Any], steps: int, num_epochs: int): + ''' + Train on data submitted via classify. + ''' + input_fn = await self.training_input_fn(sources, features, + classifications, + batch_size=20, shuffle=True, num_epochs=num_epochs) + (await self.model(features, classifications))\ + .train(input_fn=input_fn, steps=steps) + + async def accuracy(self, sources: Sources, features: Features, + classifications: List[Any]) -> Accuracy: + ''' + Evaluates the accuracy of our model after training using the input repos + as test data. 
+ ''' + if not os.path.isdir(self.model_dir_path(features)): + raise NotADirectoryError('Model not trained') + input_fn = await self.accuracy_input_fn(sources, features, + classifications, + batch_size=20, shuffle=False, num_epochs=1) + accuracy_score = (await self.model(features, classifications))\ + .evaluate(input_fn=input_fn) + return Accuracy(accuracy_score['accuracy']) + + async def predict(self, repos: AsyncIterator[Repo], features: Features, + classifications: List[Any]) -> \ + AsyncIterator[Tuple[Repo, Any, float]]: + ''' + Uses trained data to make a prediction about the quality of a repo. + ''' + if not os.path.isdir(self.model_dir_path(features)): + raise NotADirectoryError('Model not trained') + cids = self.mkcids(classifications) + # Create the input function + input_fn, predict = await self.predict_input_fn(repos, features, + classifications) + # Makes predictions on classifications + predictions = (await self.model(features, classifications))\ + .predict(input_fn=input_fn) + for repo, pred_dict in zip(predict, predictions): + class_id = pred_dict['class_ids'][0] + probability = pred_dict['probabilities'][class_id] + yield repo, cids[class_id], probability diff --git a/model/tensorflow/dffml_model_tensorflow/model/log.py b/model/tensorflow/dffml_model_tensorflow/model/log.py new file mode 100644 index 0000000000..283f375316 --- /dev/null +++ b/model/tensorflow/dffml_model_tensorflow/model/log.py @@ -0,0 +1,3 @@ +'''Logging''' +import logging +LOGGER = logging.getLogger(__package__) diff --git a/model/tensorflow/dffml_model_tensorflow/version.py b/model/tensorflow/dffml_model_tensorflow/version.py new file mode 100644 index 0000000000..856ce1d12d --- /dev/null +++ b/model/tensorflow/dffml_model_tensorflow/version.py @@ -0,0 +1 @@ +VERSION = '0.1.2' diff --git a/model/tensorflow/setup.py b/model/tensorflow/setup.py new file mode 100644 index 0000000000..bc0230be3a --- /dev/null +++ b/model/tensorflow/setup.py @@ -0,0 +1,58 @@ +import os +import ast 
+from io import open + +from setuptools import find_packages, setup + +self_path = os.path.dirname(os.path.realpath(__file__)) + +with open(os.path.join(self_path, 'dffml_model_tensorflow', 'version.py'), + 'r') as f: + for line in f: + if line.startswith('VERSION'): + version = ast.literal_eval(line.strip().split('=')[-1].strip()) + break + +with open(os.path.join(self_path, 'README.rst'), 'r', encoding='utf-8') as f: + readme = f.read() + +INSTALL_REQUIRES = [ + "tensorflow>=1.13.1,<2.0.0" + ] + +setup( + name='dffml-model-tensorflow', + version=version, + description='', + long_description=readme, + author='John Andersen', + author_email='john.s.andersen@intel.com', + url='https://github.com/intel/dffml/blob/master/model/tensorflow/README.rst', + license='MIT', + + keywords=[ + '', + ], + + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'License :: OSI Approved :: Apache Software License', + 'Natural Language :: English', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3 :: Only', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: PyPy', + ], + + install_requires=INSTALL_REQUIRES, + + packages=find_packages(), + entry_points={ + 'dffml.model': [ + 'dnn = dffml_model_tensorflow.model.dnn:DNN', + ], + }, +) diff --git a/model/tensorflow/tests/__init__.py b/model/tensorflow/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/model/tensorflow/tests/test_dnn.py b/model/tensorflow/tests/test_dnn.py new file mode 100644 index 0000000000..ee8567a01f --- /dev/null +++ b/model/tensorflow/tests/test_dnn.py @@ -0,0 +1,68 @@ +import random +import tempfile +from typing import Type + +from dffml.repo import Repo, RepoData +from dffml.source import Sources, RepoSource +from dffml.feature import Data, Feature, Features +from 
dffml.util.asynctestcase import AsyncTestCase + +from dffml_model_tensorflow.model.dnn import DNN + +class StartsWithA(Feature): + + NAME: str = 'starts_with_a' + + def dtype(self) -> Type: + return int + + def length(self) -> int: + return 1 + + async def calc(self, data: Data) -> int: + return 1 if data.src_url.lower().startswith('a') \ + else 0 + +class TestDNN(AsyncTestCase): + + @classmethod + def setUpClass(cls): + cls.model_dir = tempfile.TemporaryDirectory() + cls.model = DNN() + cls.model.model_dir = cls.model_dir.name + cls.feature = StartsWithA() + cls.features = Features(cls.feature) + cls.classifications = ['a', 'not a'] + cls.repos = [Repo('a' + str(random.random()), + data={'features': {cls.feature.NAME: 1}, + 'classification': 'a'}) for _ in range(0, 1000)] + cls.repos += [Repo('b' + str(random.random()), + data={'features': {cls.feature.NAME: 0}, + 'classification': 'not a'}) for _ in range(0, 1000)] + cls.sources = Sources(RepoSource(*cls.repos)) + + @classmethod + def tearDownClass(cls): + cls.model_dir.cleanup() + + async def test_00_train(self): + async with self.sources as sources, self.features as features: + await self.model.train(sources, features, + self.classifications, steps=1000, + num_epochs=30) + + async def test_01_accuracy(self): + async with self.sources as sources, self.features as features: + res = await self.model.accuracy(sources, features, + self.classifications) + self.assertGreater(res, 0.9) + + async def test_02_predict(self): + a = Repo('a', data={'features': {self.feature.NAME: 1}}) + sources = Sources(RepoSource(a)) + async with sources as sources, self.features as features: + res = [repo async for repo in self.model.predict(sources.repos(), + features, self.classifications)] + self.assertEqual(len(res), 1) + self.assertEqual(res[0][0].src_url, a.src_url) + self.assertTrue(res[0][1]) diff --git a/scripts/docker-entrypoint.sh b/scripts/docker-entrypoint.sh new file mode 100755 index 0000000000..a3f02cac0a --- /dev/null 
+++ b/scripts/docker-entrypoint.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +echo "#!/usr/bin/env bash" > /usr/bin/cmd.sh +chmod 755 /usr/bin/cmd.sh +runit () { + exec /usr/bin/cmd.sh +} +if [ "x$USER" != "x" ] && [ "x$UID" != "x" ]; then + export HOME="/home/$USER" + mkdir -p $HOME/.cache + useradd -o -u $UID $USER + chown $UID $HOME + chown $UID $HOME/.cache + runit () { + exec su - $USER -m -s /usr/bin/cmd.sh + } +fi + +if [ "$1" == "pip" ]; then + # Run pip. Used in case the user want to install something + echo "$@" >> /usr/bin/cmd.sh +else + # Run dffml otherwise. + echo "dffml $@" >> /usr/bin/cmd.sh +fi + +runit diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000..c6f193f123 --- /dev/null +++ b/setup.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +import ast +from io import open +from setuptools import find_packages, setup + +with open('dffml/version.py', 'r') as f: + for line in f: + if line.startswith('VERSION'): + version = ast.literal_eval(line.strip().split('=')[-1].strip()) + break + +with open('README.rst', 'r', encoding='utf-8') as f: + readme = f.read() + +setup( + name='dffml', + version=version, + description='Data Flow Facilitator for Machine Learning', + long_description=readme, + author='John Andersen', + author_email='john.s.andersen@intel.com', + maintainer='John Andersen', + maintainer_email='john.s.andersen@intel.com', + url='https://github.com/intel/dffml', + license='MIT', + keywords=[ + '', + ], + classifiers=[ + 'Development Status :: 3 - Alpha', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Natural Language :: English', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3 :: Only', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: PyPy', + ], + packages=find_packages(), + extras_require={ + 
'tensorflow': ['dffml-model-tensorflow'], + 'git': ['dffml-feature-git'], + }, + entry_points={ + 'console_scripts': [ + 'dffml = dffml.cli:CLI.main', + ], + 'dffml.source': [ + 'csv = dffml.source.csvfile:CSVSource', + 'json = dffml.source.json:JSONSource', + 'memory = dffml.source.memory:MemorySource', + ], + 'dffml.port': [ + 'json = dffml.port.json:JSON', + ], + }, +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000..5bbefb030a --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation diff --git a/tests/source/__init__.py b/tests/source/__init__.py new file mode 100644 index 0000000000..5bbefb030a --- /dev/null +++ b/tests/source/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation diff --git a/tests/source/test_file.py b/tests/source/test_file.py new file mode 100644 index 0000000000..26898d7ec8 --- /dev/null +++ b/tests/source/test_file.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +import os +import io +import atexit +import shutil +import random +import inspect +import asyncio +import logging +import tempfile +import unittest +import collections +from unittest.mock import patch, mock_open +from functools import wraps +from contextlib import contextmanager +from typing import List, Dict, Any, Optional, Tuple, AsyncIterator + +from dffml.repo import Repo +from dffml.source import Sources, FileSource +from dffml.util.asynctestcase import AsyncTestCase + +class FakeFileSource(FileSource): + + async def update(self, repo: Repo): + pass # pragma: no cover + + async def repos(self) -> AsyncIterator[Repo]: + yield Repo('') # pragma: no cover + + async def repo(self, src_url: str): + pass # pragma: no cover + + async def load_fd(self, fd): + pass # pragma: no cover + + async def dump_fd(self, fd): + pass # pragma: no cover + +class TestFileSource(AsyncTestCase): + + 
def test_readonly(self) -> bool: + self.assertTrue(FakeFileSource('testfile:ro').readonly) + self.assertFalse(FakeFileSource('testfile').readonly) + + def test_filename(self) -> bool: + self.assertEqual(FakeFileSource('testfile').filename, + 'testfile') + + def test_filename_readonly(self) -> bool: + self.assertEqual(FakeFileSource('testfile:ro').filename, + 'testfile') + + def test_repr(self): + self.assertEqual(repr(FakeFileSource('testfile')), + 'FakeFileSource(\'testfile\')') + + async def test_open(self): + source = FakeFileSource('testfile') + m_open = mock_open() + with patch('os.path.isfile', return_value=True), \ + patch('builtins.open', m_open): + await source.open() + m_open.assert_called_once_with('testfile', 'r') + + async def test_open_no_file(self): + source = FakeFileSource('testfile') + with patch('os.path.isfile', return_value=False): + await source.open() + self.assertTrue(isinstance(source.mem, dict)) + + async def test_close(self): + source = FakeFileSource('testfile') + m_open = mock_open() + with patch('builtins.open', m_open): + await source.close() + m_open.assert_called_once_with('testfile', 'w') + + async def test_close_readonly(self): + source = FakeFileSource('testfile:ro') + m_open = mock_open() + with patch('builtins.open', m_open): + await source.close() + m_open.assert_not_called() diff --git a/tests/test_accuracy.py b/tests/test_accuracy.py new file mode 100644 index 0000000000..7c4a8ac295 --- /dev/null +++ b/tests/test_accuracy.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +import unittest + +from dffml.accuracy import Accuracy + +class TestAccuracry(unittest.TestCase): + + def test_str(self): + self.assertEqual(str(Accuracy(0.04242)), '4.24') diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000000..de27d04728 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +import os 
+import io +import atexit +import shutil +import random +import inspect +import asyncio +import logging +import tempfile +import unittest +import collections +from unittest.mock import patch +from functools import wraps +from contextlib import contextmanager +from typing import List, Dict, Any, Optional, Tuple, AsyncIterator + +from dffml.repo import Repo +from dffml.feature import Feature, Features +from dffml.source import Sources, RepoSource +from dffml.model import Model +from dffml.accuracy import Accuracy as AccuracyType +from dffml.util.asynctestcase import AsyncTestCase + +from dffml.cli import EvaluateAll, EvaluateRepo, \ + Train, Accuracy, PredictAll, PredictRepo + +class ReposTestCase(AsyncTestCase): + + def setUp(self): + self.repos = [Repo(str(random.random())) for _ in range(0, 10)] + self.sources = Sources(RepoSource(*self.repos)) + self.features = Features(FakeFeature()) + +class FakeFeature(Feature): + + NAME: str = 'fake' + + def dtype(self): + return float # pragma: no cov + + def length(self): + return 1 # pragma: no cov + + async def applicable(self, data): + return True + + async def fetch(self, data): + pass + + async def parse(self, data): + pass + + async def calc(self, data): + return float(data.src_url) + +class FakeModel(Model): + + async def train(self, sources: Sources, features: Features, + classifications: List[Any], steps: int, num_epochs: int): + pass + + async def accuracy(self, sources: Sources, features: Features, + classifications: List[Any]) -> AccuracyType: + return AccuracyType(1.00) + + async def predict(self, repos: AsyncIterator[Repo], features: Features, + classifications: List[Any]) -> \ + AsyncIterator[Tuple[Repo, Any, float]]: + async for repo in repos: + yield repo, '', 1.0 + +class TestEvaluateAll(ReposTestCase): + + def setUp(self): + super().setUp() + self.cli = EvaluateAll(sources=self.sources, features=self.features) + + async def test_run(self): + repos = {repo.src_url: repo async for repo in self.cli.run()} + 
self.assertEqual(len(repos), len(self.repos)) + for repo in self.repos: + self.assertIn(repo.src_url, repos) + self.assertIn('fake', repos[repo.src_url].features()) + self.assertEqual(float(repo.src_url), + repos[repo.src_url].features(['fake'])['fake']) + +class TestEvaluateRepo(ReposTestCase): + + def setUp(self): + super().setUp() + self.subset = self.repos[int(len(self.repos) / 2):] + self.cli = EvaluateRepo(sources=self.sources, features=self.features, + keys=[repo.src_url for repo in self.subset]) + + async def test_run(self): + repos = {repo.src_url: repo async for repo in self.cli.run()} + self.assertEqual(len(repos), len(self.subset)) + for repo in self.subset: + self.assertIn(repo.src_url, repos) + self.assertIn('fake', repos[repo.src_url].features()) + self.assertEqual(float(repo.src_url), + repos[repo.src_url].features(['fake'])['fake']) + +class TestTrain(AsyncTestCase): + + def setUp(self): + self.cli = Train(model=FakeModel(), model_dir=None, + sources=Sources(RepoSource()), features=Features()) + + async def test_run(self): + await self.cli.run() + +class TestAccuracy(AsyncTestCase): + + def setUp(self): + self.cli = Accuracy(model=FakeModel(), + sources=Sources(RepoSource()), features=Features()) + + async def test_run(self): + self.assertEqual(1.0, await self.cli.run()) + +class TestPredictAll(ReposTestCase): + + def setUp(self): + super().setUp() + self.cli = PredictAll(model=FakeModel(), sources=self.sources, + features=self.features) + + async def test_run(self): + repos = {repo.src_url: repo async for repo in self.cli.run()} + self.assertEqual(len(repos), len(self.repos)) + for repo in self.repos: + self.assertIn(repo.src_url, repos) + +class TestPredictRepo(ReposTestCase): + + def setUp(self): + super().setUp() + self.subset = self.repos[int(len(self.repos) / 2):] + self.cli = PredictRepo(model=FakeModel(), sources=self.sources, + features=self.features, + keys=[repo.src_url for repo in self.subset]) + + async def test_run(self): + repos = 
{repo.src_url: repo async for repo in self.cli.run()} + self.assertEqual(len(repos), len(self.subset)) + for repo in self.subset: + self.assertIn(repo.src_url, repos) diff --git a/tests/test_feature.py b/tests/test_feature.py new file mode 100644 index 0000000000..9e7cfe588c --- /dev/null +++ b/tests/test_feature.py @@ -0,0 +1,189 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +import asyncio +import unittest +from unittest.mock import patch + +from dffml.feature import Data, Feature, Features, LoggingDict, DefFeature +from dffml.util.asynctestcase import AsyncTestCase + +class SingleFeature(Feature): + + def dtype(self): + return bool # pragma: no cov + + def length(self): + return 1 # pragma: no cov + +class OneFeatureTester(SingleFeature): + NAME: str = 'one' + +class TwoFeatureTester(SingleFeature): + NAME: str = 'two' + + async def calc(self, data: Data) -> bool: + return True + +class TwoBFeatureTester(SingleFeature): + pass + +class ThreeFeatureTester(SingleFeature): + NAME: str = 'three' + + async def applicable(self, data: Data) -> bool: + return False + +class ProgessFeatureTester(SingleFeature): + NAME: str = 'progress' + + async def calc(self, data: Data) -> bool: + await data.log('Hi') + return True + +class TestLoggingDict(AsyncTestCase): + + def setUp(self): + self.data = Data('test') + self.ldict = LoggingDict(self.data) + + def ginternal(self, key): + return getattr(self.ldict, + '_%s__dict' % (self.ldict.__class__.__qualname__,))[key] + + async def test_get(self): + self.assertEqual(await self.ldict.get('feed', default='face'), 'face') + + async def test_set(self): + await self.ldict.set('dead', 'beef') + self.assertEqual(self.ginternal('dead'), 'beef') + + async def test_set_ignored(self): + lock = asyncio.Lock() + await self.ldict.set('dead', lock) + + async def test_inc(self): + await self.ldict.set('babe', 0) + self.assertEqual(self.ginternal('babe'), 0) + await self.ldict.inc('babe') + 
self.assertEqual(self.ginternal('babe'), 1) + +class TestData(AsyncTestCase): + + def setUp(self): + self.data = Data('test') + + async def test_mklock_new(self): + self.assertNotIn('feed', self.data.locks) + await self.data.mklock('feed') + self.assertIn('feed', self.data.locks) + + async def test_mklock_exists(self): + self.data.locks['feed'] = asyncio.Lock() + self.assertIn('feed', self.data.locks) + await self.data.mklock('feed') + self.assertIn('feed', self.data.locks) + + async def test_results(self): + async def complete(*args): + return 'face' + with patch.object(self.data, 'complete', complete): + await self.data.result() + self.assertEqual(self.data.results, 'face') + +class TestFeature(AsyncTestCase): + + def setUp(self): + self.feature = Feature() + + def test_default_dtype(self): + self.assertEqual(self.feature.dtype(), int) + + def test_default_length(self): + self.assertEqual(self.feature.length(), 1) + + async def test_default_applicable(self): + self.assertEqual(await self.feature.applicable(Data('test')), True) + +class TestDefFeature(AsyncTestCase): + + def test_deffeature(self): + feature = DefFeature('test', float, 10) + self.assertEqual(feature.NAME, 'test') + self.assertEqual(feature.dtype(), float) + self.assertEqual(feature.length(), 10) + +class TestFeatures(AsyncTestCase): + + def setUp(self): + self.one = OneFeatureTester() + self.two = TwoFeatureTester() + self.three = ThreeFeatureTester() + self.features = Features(self.one, self.two, self.three) + + async def test_names(self): + async with self.features: + names = self.features.names() + for check in ['one', 'two', 'three']: + self.assertIn(check, names) + + async def test_applicable(self): + async with self.features: + applicable = await self.features.applicable('test') + self.assertIn(self.one, applicable) + self.assertIn(self.two, applicable) + self.assertNotIn(self.three, applicable) + + async def test_evaluate(self): + async with self.features: + results = await 
self.features.evaluate('test') + self.assertIn(self.one.NAME, results) + self.assertIn(self.two.NAME, results) + self.assertNotIn(self.three.NAME, results) + self.assertEqual(results[self.one.NAME], False) + self.assertEqual(results[self.two.NAME], True) + + async def test_one_applicable_other_not(self): + twob = TwoBFeatureTester() + features = Features(self.two, twob) + async with features: + results = await features.evaluate('test') + self.assertIn(self.two.NAME, results) + self.assertEqual(len(results), 1) + self.assertEqual(results[self.two.NAME], True) + + async def test_monitor_progess(self): + progress = ProgessFeatureTester() + features = Features(progress) + async with features: + data = await features.submit('test') + logs = await data.logs() + results = await data.result() + self.assertTrue(logs) + self.assertIn('Hi', logs) + self.assertIn(progress.NAME, results) + self.assertEqual(len(results), 1) + self.assertEqual(results[progress.NAME], True) + + def test_load_def(self): + feature = Features.load_def('test', 'float', 10) + self.assertEqual(feature.NAME, 'test') + self.assertEqual(feature.dtype(), float) + self.assertEqual(feature.length(), 10) + + def test_load_defs(self): + no_def, (one, two) = Features.load_defs('na', 'def:one:float:10', + 'def:two:bool:1') + self.assertEqual(no_def, ['na']) + self.assertEqual(one.NAME, 'one') + self.assertEqual(one.dtype(), float) + self.assertEqual(one.length(), 10) + self.assertEqual(two.NAME, 'two') + self.assertEqual(two.dtype(), bool) + self.assertEqual(two.length(), 1) + + def test_convert_dtype(self): + self.assertEqual(Features.convert_dtype('float'), float) + + def test_convert_dtype_invalid(self): + with self.assertRaisesRegex(TypeError, 'Failed to convert'): + Features.convert_dtype('not a python data type') diff --git a/tests/test_monitor.py b/tests/test_monitor.py new file mode 100644 index 0000000000..c8588b9c7d --- /dev/null +++ b/tests/test_monitor.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: 
MIT +# Copyright (c) 2019 Intel Corporation +import asyncio +import unittest + +from dffml.util.monitor import Monitor, Task +from dffml.util.asynctestcase import AsyncTestCase + +async def test_task(task=Task()): + for i in range(0, 10): + await asyncio.sleep(0.01) + await task.update(i) + +async def log_task(task=Task()): + for i in range(0, 10): + await task.log('i is now %d', i) + +async def recv_statuses(status, sleep): + log = [] + await asyncio.sleep(sleep) + async for msg in status: + log.append(msg) + return log + +class TestMonitor(AsyncTestCase): + + def setUp(self): + self.monitor = Monitor() + + async def test_00_await_complete(self): + await self.monitor.complete((await self.monitor.start(test_task)).key) + + async def test_01_single_watching_status(self): + task = await self.monitor.start(test_task) + statuses = await recv_statuses(self.monitor.status(task.key), 0.05) + self.assertEqual(len(statuses), 10) + for i in range(0, 10): + self.assertEqual(statuses[i], i) + + async def test_02_multiple_watching(self): + task = await self.monitor.start(test_task) + res = await asyncio.gather( + *[recv_statuses(self.monitor.status(task.key), i * 0.01) + for i in range(0, 5)]) + for statuses in res: + self.assertEqual(len(statuses), 10) + for i in range(0, 10): + self.assertEqual(statuses[i], i) + + async def test_03_log(self): + await self.monitor.complete((await self.monitor.start(log_task)).key) + + async def test_04_already_complete(self): + task = await self.monitor.start(log_task) + await self.monitor.complete(task.key) + await self.monitor.complete(task.key) + + async def test_05_already_complete_status(self): + task = await self.monitor.start(log_task) + await self.monitor.complete(task.key) + self.assertFalse([msg async for msg in self.monitor.status(task.key)]) + + async def test_06_log_status(self): + i = 0 + async for msg in self.monitor.log_status( + (await self.monitor.start(test_task)).key): + self.assertEqual(msg, i) + i += 1 + 
self.assertEqual(i, 10) + + async def test_07_already_running(self): + task = await self.monitor.start(test_task) + await self.monitor.start(task, task.key) + await self.monitor.complete(task.key) diff --git a/tests/test_repo.py b/tests/test_repo.py new file mode 100644 index 0000000000..6d828e6831 --- /dev/null +++ b/tests/test_repo.py @@ -0,0 +1,136 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +import unittest + +from dffml.repo import RepoPrediction, RepoData, Repo + +class TestRepoPrediction(unittest.TestCase): + + def setUp(self): + self.confidence = 0.42 + self.classification = 'good' + self.full = RepoPrediction(confidence=self.confidence, + classification=self.classification) + self.null = RepoPrediction() + + def test_full_property_confidence(self): + self.assertEqual(self.confidence, self.full['confidence']) + self.assertEqual(self.full.confidence, + self.full['confidence']) + + def test_full_property_classification(self): + self.assertEqual(self.classification, self.full['classification']) + self.assertEqual(self.full.classification, + self.full['classification']) + + def test_full_dict_returns_self(self): + self.assertEqual(self.full, self.full.dict()) + + def test_full_len_2(self): + self.assertEqual(2, len(self.full)) + + def test_full_bool_true(self): + self.assertTrue(self.full) + + def test_null_dict_empty_array(self): + self.assertEqual([], self.null.dict()) + + def test_null_len_0(self): + self.assertEqual(0, len(self.null)) + + def test_null_bool_false(self): + self.assertFalse(self.null) + +class TestRepoData(unittest.TestCase): + + def setUp(self): + self.full = RepoData( + src_url=None, + features=None, + classification=None, + prediction=None, + last_updated=None) + self.null = RepoData() + + def test_null_dict_no_prediction(self): + self.assertNotIn('prediction', self.null.dict()) + +class TestRepo(unittest.TestCase): + + def setUp(self): + self.null = Repo('null') + self.full = Repo('full', + 
data=dict(features=dict(dead='beef'), + extra=dict(extra='read all about it')), + extra=dict(half=True)) + + def test_dict(self): + data = self.full.dict() + self.assertIn('extra', data) + + def test_repr(self): + repr(self.full) + + def test_str(self): + self.full.prediction = RepoPrediction() + self.assertIn('Undetermined', str(self.full)) + self.full.data.prediction = RepoPrediction(classification='Good') + self.assertIn('Good', str(self.full)) + self.full.data.classification = 'Great' + self.assertIn('Great', str(self.full)) + self.full.extra.update(dict(hi=5)) + self.assertIn('5', str(self.full)) + self.full.extra = dict() + self.assertNotIn('5', str(self.full)) + + def test_merge(self): + null = Repo('null') + null.merge(self.full) + self.assertIn('half', null.extra) + self.assertTrue(null.extra['half']) + + def test_src_url(self): + return self.full.data.src_url + + def test_evaluated(self): + old_last_updated = self.full.data.last_updated + results = {'new': 'feature'} + self.full.evaluated({'feed': 'face'}) + self.assertIn('feed', self.full.data.features) + self.assertEqual('face', self.full.data.features['feed']) + self.full.evaluated(results, overwrite=True) + self.assertEqual(self.full.data.features, results) + self.assertNotEqual(old_last_updated, self.full.data.last_updated) + + def test_features(self): + self.assertIn('dead', self.full.features()) + self.assertIn('dead', self.full.features(['dead'])) + self.assertFalse(self.full.features(['dead', 'beaf'])) + + def test_predicted(self): + old_prediction = self.full.data.prediction + old_last_updated = self.full.data.last_updated + self.full.predicted('feed', 1.00) + self.assertNotEqual(old_prediction, self.full.data.prediction) + self.assertNotEqual(old_last_updated, self.full.data.last_updated) + + def test_prediction(self): + self.full.predicted('feed', 1.00) + self.assertTrue(self.full.prediction()) + + def test_classify(self): + self.full.classify('face') + 
self.assertEqual(self.full.data.classification, 'face') + + def test_classified(self): + self.full.classify('') + self.assertFalse(self.full.classified()) + self.full.classify(True) + self.assertTrue(self.full.classified()) + + def test_classification(self): + self.full.classify(True) + self.assertTrue(self.full.classification()) + self.full.classify('') + with self.assertRaisesRegex(ValueError, 'Unclassified'): + self.full.classification() diff --git a/tests/util/__init__.py b/tests/util/__init__.py new file mode 100644 index 0000000000..5bbefb030a --- /dev/null +++ b/tests/util/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation diff --git a/tests/util/test_asynccontextmanager.py b/tests/util/test_asynccontextmanager.py new file mode 100644 index 0000000000..5fce878a1c --- /dev/null +++ b/tests/util/test_asynccontextmanager.py @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +import unittest + +from dffml.util.asynchelper import AsyncContextManagerList +from dffml.util.asynctestcase import AsyncTestCase + +class OpenCloseTester(object): + + def __init__(self): + self.isopen = False + + async def __aenter__(self): + self.isopen = True + + async def __aexit__(self, exc_type, exc_value, traceback): + self.isopen = False + +class TestAsyncContextManagerList(AsyncTestCase): + + async def test_open_close_all(self): + test_list = AsyncContextManagerList(OpenCloseTester(), + OpenCloseTester()) + for listel in test_list: + self.assertFalse(listel.isopen) + async with test_list: + for listel in test_list: + self.assertTrue(listel.isopen) + for listel in test_list: + self.assertFalse(listel.isopen) diff --git a/tests/util/test_cli.py b/tests/util/test_cli.py new file mode 100644 index 0000000000..73988ff0b9 --- /dev/null +++ b/tests/util/test_cli.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +import sys +import json +import asyncio +import 
logging +import unittest +from unittest.mock import patch + +from dffml.repo import Repo +from dffml.port import Port +from dffml.feature import Feature, Features +from dffml.source import Source, Sources +from dffml.model import Model + +from dffml.util.cli import \ + ParseSourcesAction, \ + ParseFeaturesAction, \ + ParseModelAction, \ + ParsePortAction, \ + ParseLoggingAction, \ + Arg, \ + JSONEncoder, \ + CMD, \ + Parser, \ + ListEntrypoint, \ + FeaturesCMD, \ + ModelCMD + +from dffml.util.asynctestcase import AsyncTestCase + +def Namespace(**kwargs): + class MakeNamespace(object): + pass + for key, value in kwargs.items(): + setattr(MakeNamespace, key, value) + return MakeNamespace + +class TestParseActions(unittest.TestCase): + + def test_sources(self): + def load_from_dict(toload): + return toload + namespace = Namespace(sources=False) + with patch.object(Source, 'load_from_dict', + new=load_from_dict) \ + as mock_method: + action = ParseSourcesAction(dest='sources', option_strings='') + action(None, namespace, ['first=src0', 'second=src1']) + self.assertEqual(len(namespace.sources), 2) + self.assertEqual(namespace.sources[0], 'first') + self.assertEqual(namespace.sources[1], 'second') + action(None, namespace, 'second=src2') + self.assertEqual(len(namespace.sources), 1) + self.assertEqual(namespace.sources[0], 'second') + + def test_features(self): + dest, cls, parser = ('features', Features, ParseFeaturesAction) + namespace = Namespace(**{dest: False}) + with patch.object(cls, 'load') as mock_method: + action = parser(dest=dest, option_strings='') + action(None, namespace, 'fake_%s' % (dest,)) + mock_method.assert_called_once_with(*('fake_%s' % (dest,))) + self.assertTrue(getattr(namespace, dest, False)) + + def test_features_model_port(self): + for dest, cls, parser in [('model', Model, ParseModelAction), + ('port', Port, ParsePortAction)]: + namespace = Namespace(**{dest: False}) + with self.subTest(dest=dest, cls=cls, parser=parser): + with 
patch.object(cls, 'load', + return_value=lambda: True) as mock_method: + action = parser(dest=dest, option_strings='') + action(None, namespace, 'fake_%s' % (dest,)) + mock_method.assert_called_once_with('fake_%s' % (dest,)) + self.assertTrue(getattr(namespace, dest, False)) + + def test_logging(self): + namespace = Namespace(log=False) + action = ParseLoggingAction(dest='log', option_strings='') + with patch.object(logging, 'basicConfig') as mock_method: + action(None, namespace, 'DEBUG') + mock_method.assert_called_once_with(level=logging.DEBUG) + with patch.object(logging, 'basicConfig') as mock_method: + action(None, namespace, 'WARNING') + mock_method.assert_called_once_with(level=logging.WARNING) + +class TestArg(unittest.TestCase): + + def test_init(self): + arg = Arg('-test', key='value') + self.assertEqual(arg.name, '-test') + self.assertIn('key', arg) + self.assertEqual(arg['key'], 'value') + + def test_modify(self): + arg = Arg('-test', key='value') + first = arg.modify(name='-first') + second = arg.modify(key='new_value') + self.assertEqual(arg.name, '-test') + self.assertEqual(first.name, '-first') + self.assertEqual(second.name, '-test') + self.assertEqual(second['key'], 'new_value') + +class TestJSONEncoder(unittest.TestCase): + + def test_default(self): + class UnregisteredObject(object): + pass + with self.assertRaisesRegex(TypeError, 'not JSON serializable'): + json.dumps(UnregisteredObject, cls=JSONEncoder) + + def test_repo(self): + self.assertIn('face', json.dumps(Repo('face'), cls=JSONEncoder)) + + def test_feature(self): + class FaceFeature(Feature): + NAME = 'face' + self.assertIn('face', json.dumps(FaceFeature(), cls=JSONEncoder)) + +class TestCMD(AsyncTestCase): + + def test_init(self): + class CMDTest(CMD): + arg_nope_present = Arg('nope', default=False) + arg_ignored = Arg('ignored') + cmd = CMDTest(nope=True) + self.assertTrue(getattr(cmd, 'log', False)) + self.assertTrue(getattr(cmd, 'nope', False)) + + async def 
test_async_context_management(self): + async with CMD(): + pass + + async def test_parse_args(self): + with patch.object(Parser, 'add_subs') as mock_method: + await CMD.parse_args() + mock_method.assert_called_once_with(CMD) + + async def test_cli_no_sub_command(self): + with patch.object(Parser, 'print_help') as mock_method: + await CMD.cli() + mock_method.assert_called_once() + + async def test_cli_sub_command_without_run(self): + class Secondary(CMD): + pass + class Primary(CMD): + secondary = Secondary + with patch.object(Parser, 'print_help') as mock_method: + await Primary.cli('secondary') + mock_method.assert_called_once() + + async def test_cli_run_sub_command_asyncgen(self): + class Secondary(CMD): + async def run(self): + yield 1 + class Primary(CMD): + secondary = Secondary + self.assertEqual(sum(await Primary.cli('secondary')), 1) + + async def test_cli_run_sub_command(self): + class Secondary(CMD): + async def run(self): + return 2 + class Primary(CMD): + secondary = Secondary + self.assertEqual(await Primary.cli('secondary'), 2) + + def test_sanitize_args(self): + args = {'cmd': True, 'non_internal': True} + args = CMD().sanitize_args(args) + self.assertNotIn('cmd', args) + self.assertIn('non_internal', args) + + def test_main_result_none(self): + class Secondary(CMD): + async def run(self): + return None + class Primary(CMD): + secondary = Secondary + Primary.main(loop=asyncio.new_event_loop(), argv=['t', 'secondary']) + + def test_main_result_not_none(self): + class Secondary(CMD): + async def run(self): + return True + class Primary(CMD): + secondary = Secondary + with patch.object(json, 'dump') as mock_method: + Primary.main(loop=asyncio.new_event_loop(), argv=['t', 'secondary']) + mock_method.assert_called_once() + +class TestParser(unittest.TestCase): + + def test_add_subs(self): + class FakeSubCMD(CMD): + arg_test = Arg('-test') + class FakeCMD(CMD): + sub_cmd = FakeSubCMD + parser = Parser() + with patch.object(parser, 'add_subparsers') as 
mock_method: + parser.add_subs(FakeCMD) + mock_method.assert_called_once() + parser = Parser() + with patch.object(parser, 'add_subparsers') as mock_method: + parser.add_subs(FakeSubCMD) + with self.assertRaisesRegex(AssertionError, 'Called 0 times'): + mock_method.assert_called_once() + +class TestListEntrypoint(AsyncTestCase): + + def test_display_no_docstring(self): + class FakeClass(CMD): + pass + with patch.object(sys.stdout, 'write') as mock_method: + ListEntrypoint().display(FakeClass) + with self.assertRaisesRegex(AssertionError, 'call not found'): + mock_method.assert_any_call('docstring!') + + def test_display_docstring(self): + class FakeClass(CMD): + 'docstring!' + with patch.object(sys.stdout, 'write') as mock_method: + ListEntrypoint().display(FakeClass) + mock_method.assert_any_call('docstring!') + + async def test_run(self): + class FakeClass(CMD): + 'docstring!' + class FakeEntrypoint(object): + @classmethod + def load(cls): + return [FakeClass] + class FakeListEntrypoint(ListEntrypoint): + ENTRYPOINT = FakeEntrypoint + with patch.object(sys.stdout, 'write') as mock_method: + await FakeListEntrypoint().run() + mock_method.assert_any_call('docstring!') + +class TestFeaturesCMD(unittest.TestCase): + + def test_set_timeout(self): + cmd = FeaturesCMD(timeout=5) + self.assertEqual(cmd.features.timeout, 5) + +class TestModelCMD(unittest.TestCase): + + def test_set_model_dir(self): + with patch.multiple(Model, __abstractmethods__=set()): + cmd = ModelCMD(model_dir='feed', model=Model) + self.assertEqual(cmd.model.model_dir, 'feed') diff --git a/tests/util/test_entrypoint.py b/tests/util/test_entrypoint.py new file mode 100644 index 0000000000..de03a8e9f4 --- /dev/null +++ b/tests/util/test_entrypoint.py @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +import os +import unittest +import pkg_resources +from unittest.mock import patch +from typing import Type + +from dffml.util.entrypoint import Entrypoint + +class 
Loadable(object): + + def __init__(self, name: str, parent_class: Type[object]): + self.name = name + self.parent_class = parent_class + + def load(self): + class NewClass(self.parent_class): + name = self.name + return NewClass + +class FakeEntrypoint(Entrypoint): + + ENTRY_POINT = 'fake' + +class TestEntrypoint(unittest.TestCase): + + FAKE_ITER = [ + Loadable('one', FakeEntrypoint), + Loadable('two', object), + Loadable('three', FakeEntrypoint) + ] + + def test_load_only_subclasses(self): + with patch.object(pkg_resources, 'iter_entry_points', + return_value=self.FAKE_ITER) as mock_method: + loaded = FakeEntrypoint.load() + self.assertTrue(loaded) + names = [i.name for i in loaded] + for should_load in ['one', 'three']: + with self.subTest(should_load=should_load): + self.assertIn(should_load, names) + with self.subTest(should_not_load='two'): + self.assertNotIn('two', names) + + def test_load_given_name(self): + with patch.object(pkg_resources, 'iter_entry_points', + return_value=self.FAKE_ITER) as mock_method: + loaded = FakeEntrypoint.load('three') + self.assertEqual('three', loaded.name) + + def test_load_no_found(self): + with patch.object(pkg_resources, 'iter_entry_points', + return_value=self.FAKE_ITER) as mock_method: + with self.assertRaisesRegex(KeyError, 'was not found in'): + FakeEntrypoint.load('four') + + def test_load_multiple(self): + with patch.object(pkg_resources, 'iter_entry_points', + return_value=self.FAKE_ITER) as mock_method: + loaded = FakeEntrypoint.load_multiple(['one', 'three']) + self.assertTrue(loaded) + self.assertIn('one', loaded) + self.assertNotIn('two', loaded) + self.assertIn('three', loaded) diff --git a/tests/util/test_tempdir.py b/tests/util/test_tempdir.py new file mode 100644 index 0000000000..81f221b38c --- /dev/null +++ b/tests/util/test_tempdir.py @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: MIT +# Copyright (c) 2019 Intel Corporation +import os +import unittest +from typing import List + +from dffml.util.tempdir import 
TempDir +from dffml.util.asynctestcase import AsyncTestCase + +class TestTempDir(unittest.TestCase): + + def test_mktempdir(self): + dirname = TempDir().mktempdir() + self.assertEqual(os.path.isdir(dirname), True) + os.rmdir(dirname) + + def test_rmtempdirs(self): + tempdir = TempDir() + dirname = tempdir.mktempdir() + self.assertEqual(os.path.isdir(dirname), True) + tempdir.rmtempdirs() + self.assertEqual(os.path.isdir(dirname), False) + +class TestTempDirAsyncContextManager(AsyncTestCase): + + async def test_removes_on_aexit(self): + length: int = 10 + dirs: List[str] = [] + tempdir: TempDir = TempDir() + async with tempdir: + for _i in range(0, length): + dirs.append(tempdir.mktempdir()) + self.assertTrue(os.path.isdir(dirs[-1])) + self.assertEqual(len(dirs), length) + for dirname in dirs: + self.assertFalse(os.path.exists(dirname))