From ea87f2bb3b71594504fbe76f844bd54e6872d5f7 Mon Sep 17 00:00:00 2001
From: Sean Morris
Date: Thu, 24 Jun 2021 10:39:37 -0700
Subject: [PATCH 01/40] Command-line argument repo_dir is changed

The optional third command-line argument, repo_dir, is now a keyword
argument named target-dir; this avoids having to pass an empty
branch_name as the second argument just to supply a repository
directory as the third.

In the previous configuration, we used positional arguments:

    gitpuller git_url branch_name repo_dir

Now, we use a keyword argument for target-dir:

    gitpuller git_url [branch_name] --target-dir [TARGET_DIR]
---
 nbgitpuller/pull.py     |  7 ++++---
 tests/test_gitpuller.py | 11 ++++++++---
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/nbgitpuller/pull.py b/nbgitpuller/pull.py
index ef2aefb7..3dfa9fdd 100644
--- a/nbgitpuller/pull.py
+++ b/nbgitpuller/pull.py
@@ -304,13 +304,14 @@ def main():
     parser = argparse.ArgumentParser(description='Synchronizes a github repository with a local repository.')
     parser.add_argument('git_url', help='Url of the repo to sync')
     parser.add_argument('branch_name', default=None, help='Branch of repo to sync', nargs='?')
-    parser.add_argument('repo_dir', default='.', help='Path to clone repo under', nargs='?')
+    parser.add_argument('--target-dir', default='.', help='Path to clone repo under')
+
     args = parser.parse_args()
 
     for line in GitPuller(
             args.git_url,
-            args.repo_dir,
-            branch=args.branch_name if args.branch_name else None
+            args.target_dir,
+            branch=args.branch_name
     ).pull():
         print(line)
diff --git a/tests/test_gitpuller.py b/tests/test_gitpuller.py
index 0055b0da..ef1393c1 100644
--- a/tests/test_gitpuller.py
+++ b/tests/test_gitpuller.py
@@ -99,7 +99,11 @@ def test_initialize():
 def command_line_test_helper(remote_path, branch, pusher_path):
     work_dir = "/".join(os.path.dirname(os.path.abspath(__file__)).split("/")[:-1]) + "/nbgitpuller"
     try:
-        cmd = ['python3', 'pull.py', remote_path, branch, pusher_path]
+        cmd = ['python3', 'pull.py', remote_path]
+        if branch is not None:
+            cmd += [branch]
+        if pusher_path is not None:
+            cmd += ['--target-dir', pusher_path]
         sp.check_output(
             cmd,
             cwd=work_dir
@@ -119,8 +123,9 @@ def test_command_line_existing_branch():
     assert subprocess_result
 
 
-def test_command_line_default_branch():
-    branch = ""
+def test_command_line_no_branch_passed():
+    # so it should use the default branch
+    branch = None
     with Remote() as remote, Pusher(remote) as pusher:
         pusher.push_file('README.md', '1')
         remotepath = "file://%s" % os.path.abspath(remote.path)

From 10385bbabd6ab4153cb1a44512a428d4e24723f9 Mon Sep 17 00:00:00 2001
From: Sean Morris
Date: Wed, 23 Jun 2021 18:13:57 -0700
Subject: [PATCH 02/40] Added non-git source puller functionality

Handles compressed non-git source archives from Google Drive, Dropbox,
and any publicly available web address.
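
As an illustration (the hub host, archive URL, and target path below
are placeholders, and the exact URL prefix depends on how the hub is
deployed), a puller link requests a compressed archive by adding
compressed=true to an ordinary git-pull query string:

    https://[HUB_HOST]/hub/user-redirect/git-pull?repo=[ARCHIVE_URL]&compressed=true&targetpath=[TARGET_DIR]

When the handler sees compressed=true, it downloads and unpacks the
archive, commits the contents into a local bare git repository, and
then points the existing GitPuller machinery at that repository via a
file:// URL, so updates behave the same as for a real git remote.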
---
 MANIFEST.in                          |    1 +
 nbgitpuller/handlers.py              |   38 +-
 nbgitpuller/hookspecs.py             |   21 +
 nbgitpuller/plugins/__init__.py      |    0
 nbgitpuller/plugins/plugin_helper.py |  116 +++
 nbgitpuller/plugins/zip_puller.py    |   79 ++
 nbgitpuller/static/index.js          |    7 +-
 nbgitpuller/templates/status.html    |    1 +
 setup.py                             |    2 +-
 tests/test_files/hw/hw01/hw01.ipynb  | 1405 ++++++++++++++++++++++++++
 tests/test_zip_puller.py             |   55 +
 11 files changed, 1719 insertions(+), 6 deletions(-)
 create mode 100644 nbgitpuller/hookspecs.py
 create mode 100644 nbgitpuller/plugins/__init__.py
 create mode 100644 nbgitpuller/plugins/plugin_helper.py
 create mode 100644 nbgitpuller/plugins/zip_puller.py
 create mode 100644 tests/test_files/hw/hw01/hw01.ipynb
 create mode 100644 tests/test_zip_puller.py

diff --git a/MANIFEST.in b/MANIFEST.in
index 607df237..0e8f8cc4 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,6 @@
 include *.md
 include LICENSE
 include setup.cfg
+recursive-include nbgitpuller/plugins *
 recursive-include nbgitpuller/static *
 recursive-include nbgitpuller/templates *
diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py
index f83ad7d5..b8b5ff33 100644
--- a/nbgitpuller/handlers.py
+++ b/nbgitpuller/handlers.py
@@ -11,6 +11,11 @@
 
 from .pull import GitPuller
 from .version import __version__
+from .hookspecs import handle_files
+from .plugins.zip_puller import ZipSourceGoogleDriveDownloader
+from .plugins.zip_puller import ZipSourceDropBoxDownloader
+from .plugins.zip_puller import ZipSourceWebDownloader
+import pluggy
 
 
 class SyncHandler(IPythonHandler):
@@ -38,6 +43,17 @@ def emit(self, data):
         self.write('data: {}\n\n'.format(serialized_data))
         yield self.flush()
 
+    def setup_plugins(self, repo):
+        pm = pluggy.PluginManager("nbgitpuller")
+        pm.add_hookspecs(handle_files)
+        if "drive.google.com" in repo:
+            pm.register(ZipSourceGoogleDriveDownloader())
+        elif "dropbox.com" in repo:
+            pm.register(ZipSourceDropBoxDownloader())
+        else:
+            pm.register(ZipSourceWebDownloader())
+        return pm
+
     @web.authenticated
     @gen.coroutine
     def get(self):
@@ -53,6 +69,7 @@ def get(self):
         try:
             repo = self.get_argument('repo')
             branch = self.get_argument('branch', None)
+            compressed = self.get_argument('compressed', "false")
             depth = self.get_argument('depth', None)
             if depth:
                 depth = int(depth)
@@ -73,6 +90,12 @@ def get(self):
         self.set_header('content-type', 'text/event-stream')
         self.set_header('cache-control', 'no-cache')
 
+        if compressed == 'true':
+            pm = self.setup_plugins(repo)
+            results = pm.hook.handle_files(repo=repo, repo_parent_dir=repo_parent_dir)[0]
+            repo_dir = repo_parent_dir + results["unzip_dir"]
+            repo = "file://" + results["origin_repo_path"]
+
         gp = GitPuller(repo, repo_dir, branch=branch, depth=depth, parent=self.settings['nbapp'])
 
         q = Queue()
@@ -151,14 +174,15 @@ def get(self):
         repo = self.get_argument('repo')
         branch = self.get_argument('branch', None)
         depth = self.get_argument('depth', None)
+        compressed = self.get_argument('compressed', "false")
         urlPath = self.get_argument('urlpath', None) or \
-                  self.get_argument('urlPath', None)
+            self.get_argument('urlPath', None)
         subPath = self.get_argument('subpath', None) or \
-                  self.get_argument('subPath', '.')
+            self.get_argument('subPath', '.')
         app = self.get_argument('app', app_env)
         parent_reldir = os.getenv('NBGITPULLER_PARENTPATH', '')
         targetpath = self.get_argument('targetpath', None) or \
-                     self.get_argument('targetPath', repo.split('/')[-1])
+            self.get_argument('targetPath', repo.split('/')[-1])
 
         if urlPath:
             path = urlPath
@@ -174,7 +198,13 @@ def get(self):
         self.write(
self.render_template( 'status.html', - repo=repo, branch=branch, path=path, depth=depth, targetpath=targetpath, version=__version__ + repo=repo, + branch=branch, + compressed=compressed, + path=path, + depth=depth, + targetpath=targetpath, + version=__version__ )) self.flush() diff --git a/nbgitpuller/hookspecs.py b/nbgitpuller/hookspecs.py new file mode 100644 index 00000000..59dbf10e --- /dev/null +++ b/nbgitpuller/hookspecs.py @@ -0,0 +1,21 @@ +import pluggy + +hookspec = pluggy.HookspecMarker("nbgitpuller") + + +@hookspec +def handle_files(self, repo, repo_parent_dir): + """ + :param str repo: download url to source + :param str repo_parent_dir: where we will store the downloaded repo + :return two parameter json unzip_dir and origin_repo_path + :rtype json object + This handles the downloading of non-git source + files into the user directory. Once downloaded, + the files are merged into a local git repository. + + Once the local git repository is updated(or created + the first time), git puller can then handle this + directory as it would sources coming from a + git repository. + """ diff --git a/nbgitpuller/plugins/__init__.py b/nbgitpuller/plugins/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nbgitpuller/plugins/plugin_helper.py b/nbgitpuller/plugins/plugin_helper.py new file mode 100644 index 00000000..1e2f77c7 --- /dev/null +++ b/nbgitpuller/plugins/plugin_helper.py @@ -0,0 +1,116 @@ +import subprocess +import os +import logging +import requests +from requests_file import FileAdapter +import shutil +import re + + +# for large files from Google Drive +def get_confirm_token(response): + for key, value in response.cookies.items(): + if key.startswith('download_warning'): + return value + return None + + +# sets up the a local repo that acts like a remote +def initialize_local_repo(local_repo_path): + logging.info(f"Creating local_repo_path: {local_repo_path}") + os.makedirs(local_repo_path, exist_ok=True) + + subprocess.check_output(["git", "init", "--bare"], cwd=local_repo_path) + + +# local repo cloned from the "remote" which is in user drive +def clone_local_origin_repo(origin_repo_path, temp_download_repo): + logging.info(f"Creating temp_download_repo: {temp_download_repo}") + os.makedirs(temp_download_repo, exist_ok=True) + + cmd = ["git", "clone", f"file://{origin_repo_path}", temp_download_repo] + subprocess.check_output(cmd, cwd=temp_download_repo) + + +# this is needed to unarchive various formats(eg. zip, tgz, etc) +def determine_file_extension(url, response): + file_type = response.headers.get('content-type') + content_disposition = response.headers.get('content-disposition') + ext = None + if content_disposition: + fname = re.findall("filename\\*?=([^;]+)", content_disposition) + fname = fname[0].strip().strip('"') + ext = fname.split(".")[1] + elif file_type and "/zip" in file_type: + ext = "zip" + else: + url = url.split("/")[-1] + if "?" in url: + url = url[0:url.find('?')] + if "." 
in url: + ext = url.split(".")[1] + + if not ext: + m = f"Could not determine the file extension for unarchiving: {url}" + raise Exception(m) + return ext + + +# the downloaded content is in the response -- unarchive and save to the disk +def save_response_content(url, response, temp_download_repo): + try: + ext = determine_file_extension(url, response) + CHUNK_SIZE = 32768 + temp_download_file = f"{temp_download_repo}/download.{ext}" + with open(temp_download_file, "wb") as f: + for chunk in response.iter_content(CHUNK_SIZE): + # filter out keep-alive new chunks + if chunk: + f.write(chunk) + + shutil.unpack_archive(temp_download_file, temp_download_repo) + + os.remove(temp_download_file) + except Exception as e: + m = f"Problem handling file download: {str(e)}" + raise Exception(m) + + +# grab archive file from url +def fetch_files(url, id=-1): + session = requests.Session() + session.mount('file://', FileAdapter()) # add adapter for pytests + response = session.get(url, params={'id': id}, stream=True) + token = get_confirm_token(response) + if token: + params = {'id': id, 'confirm': token} + response = session.get(url, params=params, stream=True) + + return response + + +# this drive the file handling -- called from zip_puller by all the +# handle_files implementations for GoogleDrive, Dropbox, and standard +# Web url +def handle_files_helper(args): + try: + origin_repo = args["repo_parent_dir"] + args["origin_dir"] + temp_download_repo = args["repo_parent_dir"] + args["download_dir"] + if os.path.exists(temp_download_repo): + shutil.rmtree(temp_download_repo) + + if not os.path.exists(origin_repo): + initialize_local_repo(origin_repo) + + clone_local_origin_repo(origin_repo, temp_download_repo) + save_response_content(args["repo"], args["response"], temp_download_repo) + subprocess.check_output(["git", "add", "."], cwd=temp_download_repo) + subprocess.check_output(["git", "-c", "user.email=nbgitpuller@nbgitpuller.link", "-c", "user.name=nbgitpuller", "commit", "-m", "test", "--allow-empty"], cwd=temp_download_repo) + subprocess.check_output(["git", "push", "origin", "master"], cwd=temp_download_repo) + unzipped_dirs = os.listdir(temp_download_repo) + + dir_names = list(filter(lambda dir: ".git" not in dir, unzipped_dirs)) + return {"unzip_dir": dir_names[0], "origin_repo_path": origin_repo} + except Exception as e: + logging.exception(e) + raise ValueError(e) diff --git a/nbgitpuller/plugins/zip_puller.py b/nbgitpuller/plugins/zip_puller.py new file mode 100644 index 00000000..af18a488 --- /dev/null +++ b/nbgitpuller/plugins/zip_puller.py @@ -0,0 +1,79 @@ +from .plugin_helper import fetch_files +from .plugin_helper import handle_files_helper +import pluggy + +hookimpl = pluggy.HookimplMarker("nbgitpuller") +TEMP_DOWNLOAD_REPO_DIR = ".temp_download_repo" +CACHED_ORIGIN_NON_GIT_REPO = ".origin_non_git_sources" + + +# handles standard web addresses(not google drive or dropbox) +class ZipSourceWebDownloader(object): + @hookimpl + def handle_files(self, repo, repo_parent_dir): + """ + :param str repo: publicly accessible url to compressed source files + :param str repo_parent_dir: where we will store the downloaded repo + :return two parameter json unzip_dir and origin_repo_path + :rtype json object + """ + response = fetch_files(repo) + args = { + "repo": repo, + "repo_parent_dir": repo_parent_dir, + "response": response, + "origin_dir": CACHED_ORIGIN_NON_GIT_REPO, + "download_dir": TEMP_DOWNLOAD_REPO_DIR + } + return handle_files_helper(args) + + +# handles downloads from google drive 
+class ZipSourceGoogleDriveDownloader(object): + def __init__(self): + self.DOWNLOAD_URL = "https://docs.google.com/uc?export=download" + + def get_id(self, repo): + start_id_index = repo.index("d/") + 2 + end_id_index = repo.index("/view") + return repo[start_id_index:end_id_index] + + @hookimpl + def handle_files(self, repo, repo_parent_dir): + """ + :param str repo: google drive share link to compressed source files + :param str repo_parent_dir: where we will store the downloaded repo + :return two parameter json unzip_dir and origin_repo_path + :rtype json object + """ + response = fetch_files(self.DOWNLOAD_URL, self.get_id(repo)) + args = { + "repo": repo, + "repo_parent_dir": repo_parent_dir, + "response": response, + "origin_dir": CACHED_ORIGIN_NON_GIT_REPO, + "download_dir": TEMP_DOWNLOAD_REPO_DIR + } + return handle_files_helper(args) + + +# handles downloads from DropBox +class ZipSourceDropBoxDownloader(object): + @hookimpl + def handle_files(self, repo, repo_parent_dir): + """ + :param str repo: dropbox download link to compressed source files + :param str repo_parent_dir: where we will store the downloaded repo + :return two parameter json unzip_dir and origin_repo_path + :rtype json object + """ + repo = repo.replace("dl=0", "dl=1") # download set to 1 for dropbox + response = fetch_files(repo) + args = { + "repo": repo, + "repo_parent_dir": repo_parent_dir, + "response": response, + "origin_dir": CACHED_ORIGIN_NON_GIT_REPO, + "download_dir": TEMP_DOWNLOAD_REPO_DIR + } + return handle_files_helper(args) diff --git a/nbgitpuller/static/index.js b/nbgitpuller/static/index.js index c85d5897..7be399e9 100644 --- a/nbgitpuller/static/index.js +++ b/nbgitpuller/static/index.js @@ -12,12 +12,13 @@ require([ Terminal.applyAddon(fit); - function GitSync(baseUrl, repo, branch, depth, targetpath, path) { + function GitSync(baseUrl, repo, branch, depth, compressed, targetpath, path) { // Class that talks to the API backend & emits events as appropriate this.baseUrl = baseUrl; this.repo = repo; this.branch = branch; this.depth = depth; + this.compressed = compressed; this.targetpath = targetpath; this.redirectUrl = baseUrl + path; @@ -52,6 +53,9 @@ require([ if (typeof this.branch !== 'undefined' && this.branch != undefined) { syncUrlParams['branch'] = this.branch; } + if (typeof this.compressed !== 'undefined' && this.compressed != undefined) { + syncUrlParams['compressed'] = this.compressed; + } var syncUrl = this.baseUrl + 'git-pull/api?' 
+ $.param(syncUrlParams); this.eventSource = new EventSource(syncUrl); @@ -133,6 +137,7 @@ require([ utils.get_body_data('repo'), utils.get_body_data('branch'), utils.get_body_data('depth'), + utils.get_body_data('compressed'), utils.get_body_data('targetpath'), utils.get_body_data('path') ); diff --git a/nbgitpuller/templates/status.html b/nbgitpuller/templates/status.html index 1fcd00dc..99b9f53e 100644 --- a/nbgitpuller/templates/status.html +++ b/nbgitpuller/templates/status.html @@ -7,6 +7,7 @@ data-path="{{ path | urlencode }}" {% if branch %}data-branch="{{ branch | urlencode }}"{% endif %} {% if depth %}data-depth="{{ depth | urlencode }}"{% endif %} +{% if compressed %}data-compressed="{{ compressed | urlencode }}"{% endif %} data-targetpath="{{ targetpath | urlencode }}" {% endblock %} diff --git a/setup.py b/setup.py index 2afcea0f..3a367291 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ packages=find_packages(), include_package_data=True, platforms='any', - install_requires=['notebook>=5.5.0', 'tornado'], + install_requires=['notebook>=5.5.0', 'tornado', 'requests', 'requests-file'], data_files=[ ('etc/jupyter/jupyter_notebook_config.d', ['nbgitpuller/etc/nbgitpuller.json']) ], diff --git a/tests/test_files/hw/hw01/hw01.ipynb b/tests/test_files/hw/hw01/hw01.ipynb new file mode 100644 index 00000000..960747ce --- /dev/null +++ b/tests/test_files/hw/hw01/hw01.ipynb @@ -0,0 +1,1405 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "# Initialize Otter\n", + "import otter\n", + "grader = otter.Notebook(\"hw01.ipynb\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Homework 1: Causality and Expressions\n", + "\n", + "Please complete this notebook by filling in the cells provided." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Recommended Reading:**\n", + "- [What is Data Science](http://www.inferentialthinking.com/chapters/01/what-is-data-science.html)\n", + "- [Causality and Experiments](http://www.inferentialthinking.com/chapters/02/causality-and-experiments.html) \n", + "- [Programming in Python](http://www.inferentialthinking.com/chapters/03/programming-in-python.html)\n", + "\n", + "For all problems that you must write explanations and sentences for, you **must** provide your answer in the designated space. Moreover, throughout this homework and all future ones, please be sure to not re-assign variables throughout the notebook! For example, if you use `max_temperature` in your answer to one question, do not reassign it later on. Otherwise, you will fail tests that you thought you were passing previously!\n", + "\n", + "\n", + "Directly sharing answers is not okay, but discussing problems with the course staff or with other students is encouraged. Refer to the policies page to learn more about how to learn cooperatively.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Scary Arithmetic\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "\n", + "\n", + "An ad for ADT Security Systems says,\n", + "\n", + "> \"When you go on vacation, burglars go to work [...] 
According to FBI statistics, over 25% of home burglaries occur between Memorial Day and Labor Day.\"\n", + "\n", + "Do the data in the ad support the claim that burglars are more likely to go to work during the time between Memorial Day and Labor Day? Please explain your answer.\n", + "\n", + "**Note:** You can assume that \"over 25%\" means only slightly over. Had it been much over, say closer to 30%, then the marketers would have said so.\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "_Type your answer here, replacing this text._" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## 2. Characters in Little Women\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In lecture, we counted the number of times that the literary characters were named in each chapter of the classic book, [*Little Women*](https://www.inferentialthinking.com/chapters/01/3/1/literary-characters). In computer science, the word \"character\" also refers to a letter, digit, space, or punctuation mark; any single element of a text. The following code generates a scatter plot in which each dot corresponds to a chapter of *Little Women*. The horizontal position of a dot measures the number of periods in the chapter. The vertical position measures the total number of characters." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# This cell contains code that hasn't yet been covered in the course,\n", + "# but you should be able to interpret the scatter plot it generates.\n", + "\n", + "from datascience import *\n", + "from urllib.request import urlopen\n", + "import numpy as np\n", + "%matplotlib inline\n", + "\n", + "little_women_url = 'https://www.inferentialthinking.com/data/little_women.txt'\n", + "chapters = urlopen(little_women_url).read().decode().split('CHAPTER ')[1:]\n", + "text = Table().with_column('Chapters', chapters)\n", + "Table().with_columns(\n", + " 'Periods', np.char.count(chapters, '.'),\n", + " 'Characters', text.apply(len, 0)\n", + " ).scatter(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 1.** Around how many periods are there in the chapter with the most characters? Assign either 1, 2, 3, 4, or 5 to the name `characters_q1` below.\n", + "\n", + "1. 250\n", + "2. 390\n", + "3. 440\n", + "4. 32,000\n", + "5. 40,000\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "characters_q1 = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q2_1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The test above checks that your answers are in the correct format. **This test does not check that you answered correctly**, only that you assigned a number successfully in each multiple-choice answer cell." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 2.** Which of the following chapters has the most characters per period? Assign either 1, 2, or 3 to the name `characters_q2` below.\n", + "1. The chapter with about 60 periods\n", + "2. The chapter with about 350 periods\n", + "3. 
The chapter with about 440 periods\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "characters_q2 = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q2_2\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, the test above checks that your answers are in the correct format, but not that you have answered correctly." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To discover more interesting facts from this plot, read [Section 1.3.2](https://www.inferentialthinking.com/chapters/01/3/2/another-kind-of-character) of the textbook." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Names and Assignment Statements\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 1.** When you run the following cell, Python produces a cryptic error message." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "4 = 2 + 2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "Choose the best explanation of what's wrong with the code, and then assign 1, 2, 3, or 4 to `names_q1` below to indicate your answer.\n", + "\n", + "1. Python is smart and already knows `4 = 2 + 2`.\n", + "\n", + "2. `4` is already a defined number, and it doesn't make sense to make a number be a name for something else. In Python, \"`x = 2 + 2`\" means \"assign `x` as the name for the value of `2 + 2`.\"\n", + "\n", + "3. It should be `2 + 2 = 4`.\n", + "\n", + "4. I don't get an error message. This is a trick question.\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "names_q1 = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q3_1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 2.** When you run the following cell, Python will produce another cryptic error message." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "two = 3\n", + "six = two plus two" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "Choose the best explanation of what's wrong with the code and assign 1, 2, 3, or 4 to `names_q2` below to indicate your answer.\n", + "\n", + "1. The `plus` operation only applies to numbers, not the word \"two\".\n", + "\n", + "2. The name \"two\" cannot be assigned to the number 3.\n", + "\n", + "3. Two plus two is four, not six.\n", + "\n", + "4. Python cannot interpret the name `two` followed directly by a name that has not been defined.\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "names_q2 = ..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q3_2\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Question 3.** When you run the following cell, Python will, yet again, produce another cryptic error message." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "x = print(5)\n", + "y = x + 2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "Choose the best explanation of what's wrong with the code and assign 1, 2, or 3 to `names_q3` below to indicate your answer.\n", + "\n", + "1. Python doesn't want `y` to be assigned.\n", + "\n", + "2. The `print` operation is meant for displaying values to the programmer, not for assigning values!\n", + "\n", + "3. Python can’t do addition between one name and one number. It has to be 2 numbers or 2 predefined names.\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "names_q3 = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q3_3\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Job Opportunities & Education in Rural India\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A [study](http://www.nber.org/papers/w16021.pdf) at UCLA investigated factors that might result in greater attention to the health and education of girls in rural India. One such factor is information about job opportunities for women. The idea is that if people know that educated women can get good jobs, they might take more care of the health and education of girls in their families, as an investment in the girls’ future potential as earners. Without the knowledge of job opportunities, the author hypothesizes that families do not invest in women’s well-being.\n", + "\n", + "The study focused on 160 villages outside the capital of India, all with little access to information about call centers and similar organizations that offer job opportunities to women. In 80 of the villages chosen at random, recruiters visited the village, described the opportunities, recruited women who had some English language proficiency and experience with computers, and provided ongoing support free of charge for three years. In the other 80 villages, no recruiters visited and no other intervention was made.\n", + "\n", + "At the end of the study period, the researchers recorded data about the school attendance and health of the children in the villages." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 1.** Which statement best describes the *treatment* and *control* groups for this study? Assign either 1, 2, or 3 to the name `jobs_q1` below.\n", + "\n", + "1. The treatment group was the 80 villages visited by recruiters, and the control group was the other 80 villages with no intervention.\n", + "\n", + "2. The treatment group was the 160 villages selected, and the control group was the rest of the villages outside the capital of India.\n", + "\n", + "3. 
There is no clear notion of *treatment* and *control* group in this study.\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "jobs_q1 = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q4_1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 2.** Was this an observational study or a randomized controlled experiment? Assign either 1, 2, or 3 to the name `jobs_q2` below.\n", + "\n", + "1. This was an observational study.\n", + "\n", + "2. This was a randomized controlled experiment. \n", + "\n", + "3. This was a randomized observational study.\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "jobs_q2 = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q4_2\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "\n", + "\n", + "**Question 3.** The study reported, \"Girls aged 5-15 in villages that received the recruiting services were 3 to 5 percentage points more likely to be in school and experienced an increase in Body Mass Index, reflecting greater nutrition and/or medical care. However, there was no net gain in height. For boys, there was no change in any of these measures.\" Why do you think the author points out the lack of change in the boys?\n", + "\n", + "*Hint:* Remember the original hypothesis. The author believes that educating women in job opportunities will cause families to invest more in the women’s well-being.\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "_Type your answer here, replacing this text._" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## 5. Differences between Majors\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Berkeley’s Office of Planning and Analysis provides data on numerous aspects of the campus. Adapted from the OPA website, the table below displays the numbers of degree recipients in three majors in the academic years 2008-2009 and 2017-2018.\n", + "\n", + "| Major | 2008-2009 | 2017-2018 |\n", + "|------------------------------------|--------------|-------------|\n", + "| Gender and Women's Studies | 17 | 28 |\n", + "| Linguistics | 49 | 67 |\n", + "| Rhetoric | 113 | 56 |\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "\n", + "\n", + "**Question 1.** Suppose you want to find the **biggest** absolute difference between the numbers of degree recipients in the two years, among the three majors.\n", + "\n", + "In the cell below, compute this value and call it `biggest_change`. Use a single expression (a single line of code) to compute the answer. Let Python perform all the arithmetic (like subtracting 49 from 67) rather than simplifying the expression yourself. The built-in `abs` function takes a numerical input and returns the absolute value. 
The built-in `max` function can take in 3 arguments and returns the maximum of the three numbers\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "export_pdf": true + }, + "outputs": [], + "source": [ + "biggest_change = ...\n", + "biggest_change" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q5_1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "\n", + "\n", + "**Question 2.** Which of the three majors had the **smallest** absolute difference? Assign `smallest_change_major` to 1, 2, or 3 where each number corresponds to the following major:\n", + "\n", + "1: Gender and Women's Studies \n", + "2: Linguistics \n", + "3: Rhetoric\n", + "\n", + "Choose the number that corresponds to the major with the smallest absolute difference.\n", + "\n", + "You should be able to answer by rough mental arithmetic, without having to calculate the exact value for each major. \n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "smallest_change_major = ...\n", + "smallest_change_major" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q5_2\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 3.** For each major, define the \"relative change\" to be the following: $\\large{\\frac{\\text{absolute difference}}{\\text{value in 2008-2009}} * 100}$ \n", + "\n", + "Fill in the code below such that `gws_relative_change`, `linguistics_relative_change` and `rhetoric_relative_change` are assigned to the relative changes for their respective majors.\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "for_assignment_type": "student" + }, + "outputs": [], + "source": [ + "gws_relative_change = (abs(...) / 17) * 100\n", + "linguistics_relative_change = ...\n", + "rhetoric_relative_change = ...\n", + "gws_relative_change, linguistics_relative_change, rhetoric_relative_change" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q5_3\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 4.** Assign `biggest_rel_change_major` to 1, 2, or 3 where each number corresponds to to the following: \n", + "\n", + "1: Gender and Women's Studies \n", + "2: Linguistics \n", + "3: Rhetoric\n", + "\n", + "Choose the number that corresponds to the major with the biggest relative change.\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# Assign biggest_rel_change_major to the number corresponding to the major with the biggest relative change.\n", + "biggest_rel_change_major = ...\n", + "biggest_rel_change_major" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q5_4\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. 
Nearsightedness Study\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Myopia, or nearsightedness, results from a number of genetic and environmental factors. In 1999, Quinn et al studied the relation between myopia and ambient lighting at night (for example, from nightlights or room lights) during childhood." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "\n", + "\n", + "**Question 1.** The data were gathered by the following procedure, reported in the study. \"Between January and June 1998, parents of children aged 2-16 years [...] that were seen as outpatients in a university pediatric ophthalmology clinic completed a questionnaire on the child’s light exposure both at present and before the age of 2 years.\" Was this study observational, or was it a controlled experiment? Explain. \n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "_Type your answer here, replacing this text._" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "\n", + "\n", + "\n", + "\n", + "**Question 2.** The study found that of the children who slept with a room light on before the age of 2, 55% were myopic. Of the children who slept with a night light on before the age of 2, 34% were myopic. Of the children who slept in the dark before the age of 2, 10% were myopic. The study concluded that, \"The prevalence of myopia [...] during childhood was strongly associated with ambient light exposure during sleep at night in the first two years after birth.\"\n", + "\n", + "Do the data support this statement? You may interpret \"strongly\" in any reasonable qualitative way.\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "_Type your answer here, replacing this text._" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "\n", + "\n", + "\n", + "\n", + "**Question 3.** On May 13, 1999, CNN reported the results of this study under the headline, \"Night light may lead to nearsightedness.\" Does the conclusion of the study claim that night light causes nearsightedness?\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "_Type your answer here, replacing this text._" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "\n", + "\n", + "\n", + "\n", + "**Question 4.** The final paragraph of the CNN report said that \"several eye specialists\" had pointed out that the study should have accounted for heredity.\n", + "\n", + "Myopia is passed down from parents to children. Myopic parents are more likely to have myopic children, and may also be more likely to leave lights on habitually (since the parents have poor vision). In what way does the knowledge of this possible genetic link affect how we interpret the data from the study? \n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "_Type your answer here, replacing this text._" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "\n", + "\n", + "## 7. 
Studying the Survivors\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "The Reverend Henry Whitehead was skeptical of John Snow’s conclusion about the Broad Street pump. After the Broad Street cholera epidemic ended, Whitehead set about trying to prove Snow wrong. (The history of the event is detailed [here](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC1034367/pdf/medhist00183-0026.pdf).)\n", + "\n", + "He realized that Snow had focused his analysis almost entirely on those who had died. Whitehead, therefore, investigated the drinking habits of people in the Broad Street area who had not died in the outbreak.\n", + "\n", + "What is the main reason it was important to study this group?\n", + "\n", + "1) If Whitehead had found that many people had drunk water from the Broad Street pump and not caught cholera, that would have been evidence against Snow's hypothesis.\n", + "\n", + "2) Survivors could provide additional information about what else could have caused the cholera, potentially unearthing another cause.\n", + "\n", + "3) Through considering the survivors, Whitehead could have identified a cure for cholera.\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# Assign survivor_answer to 1, 2, or 3\n", + "survivor_answer = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q7_1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note:** Whitehead ended up finding further proof that the Broad Street pump played the central role in spreading the disease to the people who lived near it. Eventually, he became one of Snow’s greatest defenders." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Policies and Administrivia\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This section of the homework is to ensure that you have read over the policies and frequently asked questions for the course. \n", + "\n", + "**It's important that you read through this section of the homework very carefully**. If you can get through all of this section and are sure you have all of the correct resources set up, you will be able to focus on the actual material this semester!\n", + "\n", + "Reading through the [policies](http://data8.org/sp20/policies.html) and the [FAQ](http://data8.org/sp20/faq.html) will help you get through this section very easily. It is recommended you do this before. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 1:** You have a question regarding the grading of your assignments that has not been previously answered on Piazza or the FAQ. Who do you contact? Assign `contact` to the number corresponding to the best choice below. \n", + "\n", + "1. The Instructors\n", + "2. Post on Piazza\n", + "3. Contact your Lab TA\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "contact = ..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q8_1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 2:** Why will the grades on Gradescope and OkPy be different? Assign `grades` to the number corresponding to the best choice below. \n", + "\n", + "1. There was a mistake in the grading. I should contact someone about this\n", + "2. Gradescope grades the written portion, while OkPy grades the coded portion\n", + "3. Trick question; the grades should be the same on both platforms\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "grades = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q8_2\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 3:** Regrade deadline dates will always be posted on the same Piazza post that releases the assignment grades, common mistakes, and solutions. Can you ask for parts of your assignment regraded after the regrade request window has passed? Assign `regrade` to the number corresponding to the best choice below. \n", + "\n", + "1. Yes\n", + "2. No\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "regrade = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q8_3\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 4:** Do you have an Gradescope account? Head to [gradescope.com](http://gradescope.com) and check if you see Data 8. If you do not, please send your Lab TA an email with your email and student ID number. \n", + "\n", + "Once you have been enrolled, go to the Data 8 Gradescope course website. At the end of the url (link), you should see a number. Assign `gradescope` to that number. \n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "gradescope = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q8_4\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 5:** Given the following scenarios, assign `acceptable` to the number of the scenario that is permissible given the guidelines on the [policies](http://data8.org/sp20/policies.html) page. \n", + "\n", + "1. Alice gets stuck on a homework assignment, so she googles a fix. She stumbles across a pdf of the solutions for the homework assignment from a previous semester's offering of Data 8. After inspecting the solution, Alice writes her own solution and submits the assignment.\n", + "\n", + "2. After getting confused by a project, Bob asks his friend for help. His friend helps by walking the student through his own logic, pointing out areas that are important given the context of the question. 
Upon hearing his friends logic, the Bob writes his own code and completes the project.\n", + "\n", + "3. Eve has an extremely busy schedule, so she really wants to leave lab early by finishing it and getting checked off. Her neighbor, Charlie, simply turns his computer so Eve can see how he completed some questions. After looking at his code, Eve finishes the lab and gets checked off.\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "acceptable = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q8_5\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 6:** To make sure you have read through the [policies](http://data8.org/sp20/policies.html) and the [FAQ](http://data8.org/sp20/faq.html) carefully, how many HW/lab drops are there? Assign `drops` to the number corresponding to the best choice below. \n", + "\n", + "1. Two homework drops and one lab drop\n", + "2. One homework drop and one lab drop\n", + "3. Only one homework drop\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "drops = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q8_6\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 7:** Does Data 8 offer any alternate exams? Assign `exams` to the number corresponding to the best choice below. \n", + "\n", + "1. Yes\n", + "2. No\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "exams = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q8_7\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "**Question 8:** Are you actually checking Piazza? Go to this semester's [Data 8 Piazza](https://piazza.com/class/k5fwiw4wql642x), and find an instructor posted thread with a certain secret phrase. Assign `secret` to this secret phrase in quotes (aka as a string).\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "secret = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q8_8\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. 
Welcome Survey\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have submitted, please also complete the welcome survey in order to receive credit for homework 1.\n", + "\n", + "Welcome survey is here: https://docs.google.com/forms/d/e/1FAIpQLSd28-DvELnGk4n6lHcqMOWcsovDulNSbhmlLFXqDMQIsdldaQ/viewform?usp=sf_link" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "Assign `survey` to the secret string given at the end of the welcome survey:\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "survey = ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check(\"q9\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "---\n", + "\n", + "To double-check your work, the cell below will rerun all of the autograder tests." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "grader.check_all()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false + }, + "source": [ + "## Submission\n", + "\n", + "Make sure you have run all cells in your notebook in order before running the cell below, so that all images/graphs appear in the output. The cell below will generate a zip file for you to submit. **Please save before exporting!**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false + }, + "outputs": [], + "source": [ + "# Save your notebook first, then run this cell to export your submission.\n", + "grader.export(pdf=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " " + ] + } + ], + "metadata": { + "celltoolbar": "None", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/tests/test_zip_puller.py b/tests/test_zip_puller.py new file mode 100644 index 00000000..6d7988fe --- /dev/null +++ b/tests/test_zip_puller.py @@ -0,0 +1,55 @@ +import os +import pytest +import shutil +from nbgitpuller.plugins.zip_puller import ZipSourceWebDownloader +from nbgitpuller.plugins.zip_puller import ZipSourceDropBoxDownloader +from nbgitpuller.plugins.zip_puller import ZipSourceGoogleDriveDownloader + +test_files_dir = os.getcwd() + "/tests/test_files" +archive_base = "/tmp/test_files" +repo_parent_dir = "/tmp/fake/" +repo_zip = 'file://' + archive_base + ".zip" +repo_tgz = 'file://' + archive_base + ".tar.gz" + + +@pytest.fixture +def test_configuration(): + shutil.make_archive(archive_base, 'zip', test_files_dir) + shutil.make_archive(archive_base, 'gztar', test_files_dir) + os.makedirs(repo_parent_dir, exist_ok=True) + yield "test finishing" + shutil.rmtree(repo_parent_dir) + os.remove(archive_base + ".zip") + os.remove(archive_base + ".tar.gz") + + +def assert_helper(down, zip, tgz): + resp_zip = down.handle_files(zip, repo_parent_dir) + resp_tgz = 
down.handle_files(tgz, repo_parent_dir) + assert "unzip_dir" in resp_zip + assert "origin_repo_path" in resp_zip + assert f"{repo_parent_dir}.origin_non_git_sources" in resp_zip["origin_repo_path"] + assert "hw" in resp_zip["unzip_dir"] + assert "unzip_dir" in resp_tgz + assert "origin_repo_path" in resp_tgz + assert f"{repo_parent_dir}.origin_non_git_sources" in resp_tgz["origin_repo_path"] + assert "hw" in resp_tgz["unzip_dir"] + + +def test_web_downloader(test_configuration): + down = ZipSourceWebDownloader() + assert_helper(down, repo_zip, repo_tgz) + + +def test_dropbox_downloader(test_configuration): + down = ZipSourceDropBoxDownloader() + drop_repo_zip = repo_zip + "?dl=0" + drop_repo_tgz = repo_tgz + "?dl=0" + assert_helper(down, drop_repo_zip, drop_repo_tgz) + + +def test_google_get_id(): + down = ZipSourceGoogleDriveDownloader() + google_repo = "https://drive.google.com/file/d/1p3m0h5UGWdLkVVP0SSJH6j1HpG2yeDlU/view?usp=sharing" + file_id = down.get_id(google_repo) + assert file_id == "1p3m0h5UGWdLkVVP0SSJH6j1HpG2yeDlU" From ab80daf5424ff9f8d9215c331f503a7e5caec209 Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Tue, 10 Aug 2021 17:26:22 -0700 Subject: [PATCH 03/40] Added async functionality to non-git archives This includes moving most of the functions in plugin_helper.py to async generators. The GitSyncView also opens the console view whenever anything is written to it -- error or just progress output. --- MANIFEST.in | 1 - nbgitpuller/__init__.py | 10 + nbgitpuller/handlers.py | 112 ++++---- nbgitpuller/hookspecs.py | 23 +- nbgitpuller/plugin_helper.py | 244 ++++++++++++++++++ .../plugins/nbgitpuller-dropbox/__init__.py | 0 .../nbgitpuller-dropbox/dropbox_puller.py | 33 +++ .../plugins/nbgitpuller-dropbox/setup.py | 9 + .../nbgitpuller-googledrive/__init__.py | 0 .../googledrive_puller.py | 138 ++++++++++ .../plugins/nbgitpuller-googledrive/setup.py | 9 + .../plugins/nbgitpuller-standard/__init__.py | 0 .../plugins/nbgitpuller-standard/setup.py | 9 + .../standardweb_puller.py | 32 +++ nbgitpuller/plugins/plugin_helper.py | 116 --------- nbgitpuller/plugins/zip_puller.py | 79 ------ nbgitpuller/static/index.js | 14 +- nbgitpuller/templates/status.html | 2 +- setup.py | 2 +- tests/test_download_puller.py | 112 ++++++++ tests/test_zip_puller.py | 55 ---- tox.ini | 3 + 22 files changed, 683 insertions(+), 320 deletions(-) create mode 100644 nbgitpuller/plugin_helper.py create mode 100644 nbgitpuller/plugins/nbgitpuller-dropbox/__init__.py create mode 100644 nbgitpuller/plugins/nbgitpuller-dropbox/dropbox_puller.py create mode 100644 nbgitpuller/plugins/nbgitpuller-dropbox/setup.py create mode 100644 nbgitpuller/plugins/nbgitpuller-googledrive/__init__.py create mode 100644 nbgitpuller/plugins/nbgitpuller-googledrive/googledrive_puller.py create mode 100644 nbgitpuller/plugins/nbgitpuller-googledrive/setup.py create mode 100644 nbgitpuller/plugins/nbgitpuller-standard/__init__.py create mode 100644 nbgitpuller/plugins/nbgitpuller-standard/setup.py create mode 100644 nbgitpuller/plugins/nbgitpuller-standard/standardweb_puller.py delete mode 100644 nbgitpuller/plugins/plugin_helper.py delete mode 100644 nbgitpuller/plugins/zip_puller.py create mode 100644 tests/test_download_puller.py delete mode 100644 tests/test_zip_puller.py diff --git a/MANIFEST.in b/MANIFEST.in index 0e8f8cc4..607df237 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,5 @@ include *.md include LICENSE include setup.cfg -recursive-include nbgitpuller/plugins * recursive-include nbgitpuller/static * 
recursive-include nbgitpuller/templates *
diff --git a/nbgitpuller/__init__.py b/nbgitpuller/__init__.py
index f7bd3773..95b21548 100644
--- a/nbgitpuller/__init__.py
+++ b/nbgitpuller/__init__.py
@@ -4,6 +4,16 @@
 from notebook.utils import url_path_join
 from tornado.web import StaticFileHandler
 import os
+import nest_asyncio
+
+REPO_PARENT_DIR = None
+TEMP_DOWNLOAD_REPO_DIR = "/tmp/temp_download_repo"
+CACHED_ORIGIN_NON_GIT_REPO = ".nbgitpuller/targets/"
+
+# this allows us to nest usage of the event_loop from asyncio
+# being used by tornado in jupyter distro
+# Ref: https://medium.com/@vyshali.enukonda/how-to-get-around-runtimeerror-this-event-loop-is-already-running-3f26f67e762e
+nest_asyncio.apply()
 
 
 def _jupyter_server_extension_paths():
diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py
index b8b5ff33..ca5b7ee9 100644
--- a/nbgitpuller/handlers.py
+++ b/nbgitpuller/handlers.py
@@ -1,7 +1,6 @@
 from tornado import gen, web, locks
 import traceback
 import urllib.parse
-
 from notebook.base.handlers import IPythonHandler
 import threading
 import json
@@ -11,11 +10,9 @@
 
 from .pull import GitPuller
 from .version import __version__
-from .hookspecs import handle_files
-from .plugins.zip_puller import ZipSourceGoogleDriveDownloader
-from .plugins.zip_puller import ZipSourceDropBoxDownloader
-from .plugins.zip_puller import ZipSourceWebDownloader
+from . import hookspecs
 import pluggy
+import nbgitpuller
 
 
 class SyncHandler(IPythonHandler):
@@ -43,17 +40,38 @@ def emit(self, data):
         self.write('data: {}\n\n'.format(serialized_data))
         yield self.flush()
 
-    def setup_plugins(self, repo):
+    def setup_plugins(self, provider):
         pm = pluggy.PluginManager("nbgitpuller")
-        pm.add_hookspecs(handle_files)
-        if "drive.google.com" in repo:
-            pm.register(ZipSourceGoogleDriveDownloader())
-        elif "dropbox.com" in repo:
-            pm.register(ZipSourceDropBoxDownloader())
-        else:
-            pm.register(ZipSourceWebDownloader())
+        pm.add_hookspecs(hookspecs)
+        pm.load_setuptools_entrypoints("nbgitpuller", name=provider)
         return pm
 
+    @gen.coroutine
+    def progress_loop(self, queue):
+        while True:
+            try:
+                progress = queue.get_nowait()
+            except Empty:
+                yield gen.sleep(0.1)
+                continue
+            if progress is None:
+                yield gen.sleep(5)
+                return
+            if isinstance(progress, Exception):
+                self.emit({
+                    'phase': 'error',
+                    'message': str(progress),
+                    'output': '\n'.join([
+                        line.strip()
+                        for line in traceback.format_exception(
+                            type(progress), progress, progress.__traceback__
+                        )
+                    ])
+                })
+                return
+
+            self.emit({'output': progress, 'phase': 'syncing'})
+
     @web.authenticated
     @gen.coroutine
     def get(self):
@@ -69,7 +87,7 @@ def get(self):
         try:
             repo = self.get_argument('repo')
             branch = self.get_argument('branch', None)
-            compressed = self.get_argument('compressed', "false")
+            provider = self.get_argument('provider', None)
             depth = self.get_argument('depth', None)
             if depth:
                 depth = int(depth)
@@ -82,22 +100,31 @@ def get(self):
         # so that all repos are always in scope after cloning. Sometimes
         # server_root_dir will include things like `~` and so the path
         # must be expanded.
- repo_parent_dir = os.path.join(os.path.expanduser(self.settings['server_root_dir']), - os.getenv('NBGITPULLER_PARENTPATH', '')) - repo_dir = os.path.join(repo_parent_dir, self.get_argument('targetpath', repo.split('/')[-1])) + repo_parent_dir = os.path.join(os.path.expanduser(self.settings['server_root_dir']), os.getenv('NBGITPULLER_PARENTPATH', '')) + nbgitpuller.REPO_PARENT_DIR = repo_parent_dir + + repo_dir = os.path.join( + repo_parent_dir, + self.get_argument('targetpath', repo.split('/')[-1])) # We gonna send out event streams! self.set_header('content-type', 'text/event-stream') self.set_header('cache-control', 'no-cache') - if compressed == 'true': - pm = self.setup_plugins(repo) - results = pm.hook.handle_files(repo=repo, repo_parent_dir=repo_parent_dir)[0] + # if provider is specified then we are dealing with compressed + # archive and not a git repo + if provider is not None: + pm = self.setup_plugins(provider) + req_args = {k: v[0].decode() for k, v in self.request.arguments.items()} + download_q = Queue() + req_args["progress_func"] = lambda: self.progress_loop(download_q) + req_args["download_q"] = download_q + hf_args = {"query_line_args": req_args} + results = pm.hook.handle_files(**hf_args) repo_dir = repo_parent_dir + results["unzip_dir"] repo = "file://" + results["origin_repo_path"] gp = GitPuller(repo, repo_dir, branch=branch, depth=depth, parent=self.settings['nbapp']) - q = Queue() def pull(): @@ -110,33 +137,11 @@ def pull(): q.put_nowait(e) raise e self.gp_thread = threading.Thread(target=pull) - self.gp_thread.start() - - while True: - try: - progress = q.get_nowait() - except Empty: - yield gen.sleep(0.5) - continue - if progress is None: - break - if isinstance(progress, Exception): - self.emit({ - 'phase': 'error', - 'message': str(progress), - 'output': '\n'.join([ - line.strip() - for line in traceback.format_exception( - type(progress), progress, progress.__traceback__ - ) - ]) - }) - return - - self.emit({'output': progress, 'phase': 'syncing'}) - + self.progress_loop(q) + yield gen.sleep(3) self.emit({'phase': 'finished'}) + except Exception as e: self.emit({ 'phase': 'error', @@ -170,11 +175,10 @@ def initialize(self): @gen.coroutine def get(self): app_env = os.getenv('NBGITPULLER_APP', default='notebook') - repo = self.get_argument('repo') branch = self.get_argument('branch', None) depth = self.get_argument('depth', None) - compressed = self.get_argument('compressed', "false") + provider = self.get_argument('provider', None) urlPath = self.get_argument('urlpath', None) or \ self.get_argument('urlPath', None) subPath = self.get_argument('subpath', None) or \ @@ -195,14 +199,17 @@ def get(self): else: path = 'tree/' + path + if provider is not None: + path = "tree/" + self.write( self.render_template( 'status.html', repo=repo, branch=branch, - compressed=compressed, path=path, depth=depth, + provider=provider, targetpath=targetpath, version=__version__ )) @@ -239,3 +246,10 @@ def get(self): ) self.redirect(new_url) + + +class ThreadWithResult(threading.Thread): + def __init__(self, group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): + def function(): + self.result = target(*args, **kwargs) + super().__init__(group=group, target=function, name=name, daemon=daemon) diff --git a/nbgitpuller/hookspecs.py b/nbgitpuller/hookspecs.py index 59dbf10e..320637ce 100644 --- a/nbgitpuller/hookspecs.py +++ b/nbgitpuller/hookspecs.py @@ -1,21 +1,22 @@ import pluggy hookspec = pluggy.HookspecMarker("nbgitpuller") +hookimpl = 
pluggy.HookimplMarker("nbgitpuller")
 
 
-@hookspec
-def handle_files(self, repo, repo_parent_dir):
+@hookspec(firstresult=True)
+def handle_files(query_line_args):
     """
-    :param str repo: download url to source
-    :param str repo_parent_dir: where we will store the downloaded repo
+    :param json query_line_args: this includes any argument you put on the url
     :return two parameter json unzip_dir and origin_repo_path
     :rtype json object
 
-    This handles the downloading of non-git source
-    files into the user directory. Once downloaded,
-    the files are merged into a local git repository.
-    Once the local git repository is updated(or created
-    the first time), git puller can then handle this
-    directory as it would sources coming from a
-    git repository.
+    The developer uses this function to download, un-compress and save the
+    source files to the TEMP_DOWNLOAD_REPO_DIR folder.
+
+    The parameter, query_line_args, is any argument you put on the URL
+
+    Once the files are saved to the directly, git puller can handle all the
+    standard functions needed to make sure source files are updated or created
+    as needed.
     """
diff --git a/nbgitpuller/plugin_helper.py b/nbgitpuller/plugin_helper.py
new file mode 100644
index 00000000..77d14cc8
--- /dev/null
+++ b/nbgitpuller/plugin_helper.py
@@ -0,0 +1,244 @@
+import string
+import os
+import logging
+import aiohttp
+import asyncio
+import subprocess
+import shutil
+from urllib.parse import urlparse
+from functools import partial
+from nbgitpuller import \
+    TEMP_DOWNLOAD_REPO_DIR, \
+    CACHED_ORIGIN_NON_GIT_REPO, \
+    REPO_PARENT_DIR
+
+
+async def execute_cmd(cmd, **kwargs):
+    """
+    :param array cmd: the command to be executed
+    :param json kwargs: potential keyword args included with the command
+
+    Calls the given command, yielding output line by line
+    """
+    yield '$ {}\n'.format(' '.join(cmd))
+    kwargs['stdout'] = subprocess.PIPE
+    kwargs['stderr'] = subprocess.STDOUT
+
+    proc = subprocess.Popen(cmd, **kwargs)
+
+    # Capture output for logging.
+    # Each line will be yielded as text.
+    # This should behave the same as .readline(), but splits on `\r` OR `\n`,
+    # not just `\n`.
+    buf = []
+
+    def flush():
+        line = b''.join(buf).decode('utf8', 'replace')
+        buf[:] = []
+        return line
+
+    c_last = ''
+    try:
+        for c in iter(partial(proc.stdout.read, 1), b''):
+            if c_last == b'\r' and buf and c != b'\n':
+                yield flush()
+            buf.append(c)
+            if c == b'\n':
+                yield flush()
+            c_last = c
+    finally:
+        ret = proc.wait()
+        if ret != 0:
+            raise subprocess.CalledProcessError(ret, cmd)
+
+
+async def initialize_local_repo(local_repo_path):
+    """
+    :param str local_repo_path: the local path where the git repo is initialized
+
+    Sets up a local repo that acts like a remote; yields the
+    output from the git init
+    """
+    yield "Initializing repo ...\n"
+    logging.info(f"Creating local_repo_path: {local_repo_path}")
+    os.makedirs(local_repo_path, exist_ok=True)
+    async for e in execute_cmd(["git", "init", "--bare"], cwd=local_repo_path):
+        yield e
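+
+
+# A sketch of how these async generators are consumed (this mirrors
+# handle_files_helper further down, which forwards every yielded line to
+# the download queue feeding the browser's progress stream):
+#
+#     async for line in initialize_local_repo(origin_repo):
+#         download_q.put_nowait(line)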
+
+
+async def clone_local_origin_repo(origin_repo_path, temp_download_repo):
+    """
+    :param str origin_repo_path: the local path we used to git init into
+    :param str temp_download_repo: folder where the compressed archive
+    is downloaded to
+
+    Clones the origin (which is local) into the folder, temp_download_repo.
+    The folder, temp_download_repo, acts like the space where someone makes changes
+    to master notebooks and then pushes the changes to origin. In other words,
+    the folder, temp_download_repo, is where the compressed archive is downloaded,
+    unarchived, and then pushed to the origin.
+    """
+    yield "Cloning repo ...\n"
+    if os.path.exists(temp_download_repo):
+        shutil.rmtree(temp_download_repo)
+    logging.info(f"Creating temp_download_repo: {temp_download_repo}")
+    os.makedirs(temp_download_repo, exist_ok=True)
+
+    cmd = ["git", "clone", f"file://{origin_repo_path}", temp_download_repo]
+    async for e in execute_cmd(cmd, cwd=temp_download_repo):
+        yield e
+
+
+def extract_file_extension(url):
+    """
+    :param str url: the url containing the extension we need to determine
+    what kind of compression is used on the file being downloaded
+
+    this is needed to unarchive various formats (e.g. zip, tgz, etc.)
+    """
+    u = urlparse(url)
+    url_arr = u.path.split(".")
+    if len(url_arr) >= 2:
+        return url_arr[-1]
+    raise Exception(f"Could not determine compression type of: {url}")
+
+
+async def execute_unarchive(ext, temp_download_file, temp_download_repo):
+    """
+    :param str ext: extension used to determine type of compression
+    :param str temp_download_file: the file path to be unarchived
+    :param str temp_download_repo: where the file is unarchived to
+
+    un-archives the file using unzip or tar into the temp_download_repo
+    """
+    if ext == 'zip':
+        cmd_arr = ['unzip', "-qo", temp_download_file, "-d", temp_download_repo]
+    else:
+        cmd_arr = ['tar', 'xzf', temp_download_file, '-C', temp_download_repo]
+    async for e in execute_cmd(cmd_arr, cwd=temp_download_repo):
+        yield e
+
+
+async def download_archive(args, temp_download_file):
+    """
+    :param map args: key-value pairs including the aiohttp session object and repo path
+    :param str temp_download_file: the path to save the requested file to
+
+    This requests the file from the repo (url) given and saves it to the disk
+    """
+    yield "Downloading archive ...\n"
+    try:
+        CHUNK_SIZE = 1024
+        async with args["client"] as session:
+            async with session.get(args["repo"]) as response:
+                with open(temp_download_file, 'ab') as fd:
+                    count_chunks = 1
+                    while True:
+                        count_chunks += 1
+                        if count_chunks % 1000 == 0:
+                            display = count_chunks / 1000
+                            yield f"Downloading Progress ... {display}MB\n"
+                        chunk = await response.content.read(CHUNK_SIZE)
+                        if not chunk:
+                            break
+                        fd.write(chunk)
+    except Exception as e:
+        raise e
+
+    yield "Archive Downloaded....\n"
+
+
+async def push_to_local_origin(temp_download_repo):
+    """
+    :param str temp_download_repo: the current working directory, the folder
+    where the archive had been downloaded and unarchived
+
+    The unarchived files are pushed back to the origin
+    """
+    async for e in execute_cmd(["git", "add", "."], cwd=temp_download_repo):
+        yield e
+    commit_cmd = [
+        "git",
+        "-c", "user.email=nbgitpuller@nbgitpuller.link",
+        "-c", "user.name=nbgitpuller",
+        "commit", "-q", "-m", "test", "--allow-empty"
+    ]
+    async for e in execute_cmd(commit_cmd, cwd=temp_download_repo):
+        yield e
+    async for e in execute_cmd(["git", "push", "origin", "master"], cwd=temp_download_repo):
+        yield e
+
+
+# this is needed because in handle_files_helper we cannot return
+# from the async generator, so a global variable holds the
+# directory name of the files downloaded
+dir_names = None
+
+
+async def handle_files_helper(args):
+    """
+    :param map args: key-value pairs including the repo, provider, extension,
+    download function and download parameters in the case
+    that the source needs to handle the download in a specific way (e.g.
google + requires a confirmation of the download) + :return json object with the directory name of the download and + the origin_repo_path + :rtype json object + + This does all the heavy lifting in order needed to set up your local + repos, origin, download the file, unarchiving and push the files + back to the origin + """ + url = args["repo"].translate(str.maketrans('', '', string.punctuation)) + provider = args["provider"] + origin_repo = f"{REPO_PARENT_DIR}{CACHED_ORIGIN_NON_GIT_REPO}{provider}/{url}/" + temp_download_repo = TEMP_DOWNLOAD_REPO_DIR + temp_download_file = f"{TEMP_DOWNLOAD_REPO_DIR}/download.{args['extension']}" + + async def gener(): + global dir_names + try: + if not os.path.exists(origin_repo): + async for i in initialize_local_repo(origin_repo): + yield i + + async for c in clone_local_origin_repo(origin_repo, temp_download_repo): + yield c + + args["client"] = aiohttp.ClientSession() + download_func = download_archive + download_args = args, temp_download_file + if "dowload_func" in args: + download_func = args["dowload_func"] + download_args = args["dowload_func_params"] + + async for d in download_func(*download_args): + yield d + + async for e in execute_unarchive(args["extension"], temp_download_file, temp_download_repo): + yield e + + os.remove(temp_download_file) + async for p in push_to_local_origin(temp_download_repo): + yield p + + unzipped_dirs = os.listdir(temp_download_repo) + # name of the extracted directory + dir_names = list(filter(lambda dir: ".git" not in dir and "__MACOSX" not in dir, unzipped_dirs)) + yield "\n\n" + yield "Process Complete: Archive is finished importing into hub\n" + yield f"The directory of your download is: {dir_names[0]}\n" + shutil.rmtree(temp_download_repo) # remove temporary download space + except Exception as e: + logging.exception(e) + raise ValueError(e) + + try: + async for line in gener(): + args["download_q"].put_nowait(line) + await asyncio.sleep(0.1) + except Exception as e: + args["download_q"].put_nowait(e) + raise e + args["download_q"].put_nowait(None) + return {"unzip_dir": dir_names[0], "origin_repo_path": origin_repo} diff --git a/nbgitpuller/plugins/nbgitpuller-dropbox/__init__.py b/nbgitpuller/plugins/nbgitpuller-dropbox/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nbgitpuller/plugins/nbgitpuller-dropbox/dropbox_puller.py b/nbgitpuller/plugins/nbgitpuller-dropbox/dropbox_puller.py new file mode 100644 index 00000000..36b0b18c --- /dev/null +++ b/nbgitpuller/plugins/nbgitpuller-dropbox/dropbox_puller.py @@ -0,0 +1,33 @@ +from nbgitpuller.plugin_helper import handle_files_helper +from nbgitpuller.plugin_helper import extract_file_extension +from nbgitpuller.hookspecs import hookimpl +import asyncio + + +def determine_file_extension(url): + """ + :param str url: url to source + :return the extension indicating the file compression(e.g. zip, tgz) + :rtype str + """ + return extract_file_extension(url) + + +@hookimpl +def handle_files(query_line_args): + """ + :param json args: this includes any argument you put on the url + PLUS the function, query_line_args["progress_func"], that writes messages to + the progress stream in the browser window and the download_q, + query_line_args["download_q"] the progress function uses. 
+ :return two parameter json unzip_dir and origin_repo_path + :rtype json object + """ + query_line_args["repo"] = query_line_args["repo"].replace("dl=0", "dl=1") # dropbox: download set to 1 + ext = determine_file_extension(query_line_args["repo"]) + query_line_args["extension"] = ext + + loop = asyncio.get_event_loop() + tasks = handle_files_helper(query_line_args), query_line_args["progress_func"]() + result_handle, _ = loop.run_until_complete(asyncio.gather(*tasks)) + return result_handle diff --git a/nbgitpuller/plugins/nbgitpuller-dropbox/setup.py b/nbgitpuller/plugins/nbgitpuller-dropbox/setup.py new file mode 100644 index 00000000..cc6e6ee0 --- /dev/null +++ b/nbgitpuller/plugins/nbgitpuller-dropbox/setup.py @@ -0,0 +1,9 @@ +from setuptools import setup + +setup( + name="nbgitpuller-dropbox", + entry_points={ + "nbgitpuller": ["dropbox=dropbox_puller"] + }, + py_modules=["dropbox_puller"] +) diff --git a/nbgitpuller/plugins/nbgitpuller-googledrive/__init__.py b/nbgitpuller/plugins/nbgitpuller-googledrive/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nbgitpuller/plugins/nbgitpuller-googledrive/googledrive_puller.py b/nbgitpuller/plugins/nbgitpuller-googledrive/googledrive_puller.py new file mode 100644 index 00000000..dfcca579 --- /dev/null +++ b/nbgitpuller/plugins/nbgitpuller-googledrive/googledrive_puller.py @@ -0,0 +1,138 @@ +from nbgitpuller.hookspecs import hookimpl +import re +import asyncio +import aiohttp +from nbgitpuller.plugin_helper import handle_files_helper +from nbgitpuller import TEMP_DOWNLOAD_REPO_DIR + +DOWNLOAD_URL = "https://docs.google.com/uc?export=download" + + +@hookimpl +def handle_files(query_line_args): + """ + :param json args: this includes any argument you put on the url + PLUS the function, query_line_args["progress_func"], that writes messages to + the progress stream in the browser window and the download_q, + query_line_args["download_q"] the progress function uses. 
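+
+    A minimal sketch of the flow implemented below (the share URL is
+    hypothetical):
+
+        repo = "https://drive.google.com/file/d/<file-id>/view?usp=sharing"
+        response = loop.run_until_complete(
+            get_response_from_drive(DOWNLOAD_URL, get_id(repo)))
+        ext = determine_file_extension_from_response(response)  # e.g. "zip"
+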
+ :return two parameter json unzip_dir and origin_repo_path + :rtype json object + """ + loop = asyncio.get_event_loop() + repo = query_line_args["repo"] + query_line_args["download_q"].put_nowait("Determining type of archive...\n") + response = loop.run_until_complete(get_response_from_drive(DOWNLOAD_URL, get_id(repo))) + ext = determine_file_extension_from_response(response) + query_line_args["download_q"].put_nowait(f"Archive is: {ext}\n") + temp_download_file = f"{TEMP_DOWNLOAD_REPO_DIR}/download.{ext}" + + query_line_args["extension"] = ext + query_line_args["dowload_func"] = download_archive_for_google + query_line_args["dowload_func_params"] = query_line_args, temp_download_file + + tasks = handle_files_helper(query_line_args), query_line_args["progress_func"]() + result_handle, _ = loop.run_until_complete(asyncio.gather(*tasks)) + return result_handle + + +def get_id(repo): + """ + :param str repo: the url to the compressed file contained the google id + :return the google drive id of the file to be downloaded + :rtype str + """ + start_id_index = repo.index("d/") + 2 + end_id_index = repo.index("/view") + return repo[start_id_index:end_id_index] + + +def get_confirm_token(session, url): + """ + :param aiohttp.ClientSession session: used to the get the cookies from the reponse + :param str url : the url is used to filter out the correct cookies from the session + :return the cookie if found or None if not found + :rtype str + + This used to determine whether or not Google needs you to confirm a large download + file is being downloaded + """ + cookies = session.cookie_jar.filter_cookies(url) + for key, cookie in cookies.items(): + if key.startswith('download_warning'): + return cookie + return None + + +async def download_archive_for_google(args, temp_download_file): + """ + :param map args: key-value pairs includes repo path + :param str temp_download_file: the path to save the requested file to + + This requests the file from the repo(url) given and saves it to the disk + """ + yield "Downloading archive ...\n" + try: + repo = args["repo"] + id = get_id(repo) + CHUNK_SIZE = 1024 + async with aiohttp.ClientSession() as session: + async with session.get(DOWNLOAD_URL, params={'id': id}) as response: + token = get_confirm_token(session, repo) + if token: + params = {'id': id, 'confirm': token} + response = await session.get(repo, params=params) + with open(temp_download_file, 'ab') as fd: + count_chunks = 1 + while True: + count_chunks += 1 + if count_chunks % 1000 == 0: + display = count_chunks / 1000 + yield f"Downloading Progress ... {display}MB\n" + chunk = await response.content.read(CHUNK_SIZE) + if not chunk: + break + fd.write(chunk) + yield "Archive Downloaded....\n" + except Exception as e: + raise e + + +async def get_response_from_drive(url, id): + """ + :param str url: the google download URL + :param str id: the google id of the file to download + :return response object + :rtype json object + You need to check to see that Google Drive has not asked the + request to confirm that they disabled the virus scan on files that + are bigger than 100MB(The size is mentioned online but I did not see + confirmation - something larger essentially). For large files, you have + to request again but this time putting the 'confirm=XXX' as a query + parameter. 
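+
+    A sketch of the two-step exchange for a large file (the token value is
+    illustrative):
+
+        GET https://docs.google.com/uc?export=download&id=<file-id>
+            -> sets a "download_warning_..." cookie instead of the archive
+        GET https://docs.google.com/uc?export=download&id=<file-id>&confirm=<token>
+            -> responds with the archive itself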
+ """ + async with aiohttp.ClientSession() as session: + async with session.get(url, params={'id': id}) as response: + token = get_confirm_token(session, url) + if token: + params = {'id': id, 'confirm': token} + response = await session.get(url, params=params) + return response + return response + + +def determine_file_extension_from_response(response): + """ + :param str response: the response object from the download + :return the extension indicating the file compression(e.g. zip, tgz) + :rtype str + """ + content_disposition = response.headers.get('content-disposition') + if content_disposition: + fname = re.findall("filename\\*?=([^;]+)", content_disposition) + fname = fname[0].strip().strip('"') + ext = fname.split(".")[1] + + if ext is None: + m = f"Could not determine compression type of: {content_disposition}" + raise Exception(m) + return ext diff --git a/nbgitpuller/plugins/nbgitpuller-googledrive/setup.py b/nbgitpuller/plugins/nbgitpuller-googledrive/setup.py new file mode 100644 index 00000000..37b0064d --- /dev/null +++ b/nbgitpuller/plugins/nbgitpuller-googledrive/setup.py @@ -0,0 +1,9 @@ +from setuptools import setup + +setup( + name="nbgitpuller-googledrive", + entry_points={ + "nbgitpuller": ["googledrive=googledrive_puller"] + }, + py_modules=["googledrive_puller"] +) diff --git a/nbgitpuller/plugins/nbgitpuller-standard/__init__.py b/nbgitpuller/plugins/nbgitpuller-standard/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nbgitpuller/plugins/nbgitpuller-standard/setup.py b/nbgitpuller/plugins/nbgitpuller-standard/setup.py new file mode 100644 index 00000000..1289ab08 --- /dev/null +++ b/nbgitpuller/plugins/nbgitpuller-standard/setup.py @@ -0,0 +1,9 @@ +from setuptools import setup + +setup( + name="nbgitpuller-standard", + entry_points={ + "nbgitpuller": ["standard=standardweb_puller"] + }, + py_modules=["standardweb_puller"] +) diff --git a/nbgitpuller/plugins/nbgitpuller-standard/standardweb_puller.py b/nbgitpuller/plugins/nbgitpuller-standard/standardweb_puller.py new file mode 100644 index 00000000..39303608 --- /dev/null +++ b/nbgitpuller/plugins/nbgitpuller-standard/standardweb_puller.py @@ -0,0 +1,32 @@ +from nbgitpuller.plugin_helper import handle_files_helper +from nbgitpuller.plugin_helper import extract_file_extension +from nbgitpuller.hookspecs import hookimpl +import asyncio + + +def determine_file_extension(url): + """ + :param str url: url to source + :return the extension indicating the file compression(e.g. zip, tgz) + :rtype str + """ + return extract_file_extension(url) + + +@hookimpl +def handle_files(query_line_args): + """ + :param json args: this includes any argument you put on the url + PLUS the function, query_line_args["progress_func"], that writes messages to + the progress stream in the browser window and the download_q, + query_line_args["download_q"] the progress function uses. 
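+
+    For example (the URL is hypothetical): a request carrying
+    repo=https://example.org/course/materials.tgz resolves to extension
+    "tgz", and handle_files_helper then downloads, unarchives, and pushes
+    the files into the locally cached origin repo.
+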
+ :return two parameter json unzip_dir and origin_repo_path + :rtype json object + """ + ext = determine_file_extension(query_line_args["repo"]) + query_line_args["extension"] = ext + + loop = asyncio.get_event_loop() + tasks = handle_files_helper(query_line_args), query_line_args["progress_func"]() + result_handle, _ = loop.run_until_complete(asyncio.gather(*tasks)) + return result_handle diff --git a/nbgitpuller/plugins/plugin_helper.py b/nbgitpuller/plugins/plugin_helper.py deleted file mode 100644 index 1e2f77c7..00000000 --- a/nbgitpuller/plugins/plugin_helper.py +++ /dev/null @@ -1,116 +0,0 @@ -import subprocess -import os -import logging -import requests -from requests_file import FileAdapter -import shutil -import re - - -# for large files from Google Drive -def get_confirm_token(response): - for key, value in response.cookies.items(): - if key.startswith('download_warning'): - return value - return None - - -# sets up the a local repo that acts like a remote -def initialize_local_repo(local_repo_path): - logging.info(f"Creating local_repo_path: {local_repo_path}") - os.makedirs(local_repo_path, exist_ok=True) - - subprocess.check_output(["git", "init", "--bare"], cwd=local_repo_path) - - -# local repo cloned from the "remote" which is in user drive -def clone_local_origin_repo(origin_repo_path, temp_download_repo): - logging.info(f"Creating temp_download_repo: {temp_download_repo}") - os.makedirs(temp_download_repo, exist_ok=True) - - cmd = ["git", "clone", f"file://{origin_repo_path}", temp_download_repo] - subprocess.check_output(cmd, cwd=temp_download_repo) - - -# this is needed to unarchive various formats(eg. zip, tgz, etc) -def determine_file_extension(url, response): - file_type = response.headers.get('content-type') - content_disposition = response.headers.get('content-disposition') - ext = None - if content_disposition: - fname = re.findall("filename\\*?=([^;]+)", content_disposition) - fname = fname[0].strip().strip('"') - ext = fname.split(".")[1] - elif file_type and "/zip" in file_type: - ext = "zip" - else: - url = url.split("/")[-1] - if "?" in url: - url = url[0:url.find('?')] - if "." 
in url: - ext = url.split(".")[1] - - if not ext: - m = f"Could not determine the file extension for unarchiving: {url}" - raise Exception(m) - return ext - - -# the downloaded content is in the response -- unarchive and save to the disk -def save_response_content(url, response, temp_download_repo): - try: - ext = determine_file_extension(url, response) - CHUNK_SIZE = 32768 - temp_download_file = f"{temp_download_repo}/download.{ext}" - with open(temp_download_file, "wb") as f: - for chunk in response.iter_content(CHUNK_SIZE): - # filter out keep-alive new chunks - if chunk: - f.write(chunk) - - shutil.unpack_archive(temp_download_file, temp_download_repo) - - os.remove(temp_download_file) - except Exception as e: - m = f"Problem handling file download: {str(e)}" - raise Exception(m) - - -# grab archive file from url -def fetch_files(url, id=-1): - session = requests.Session() - session.mount('file://', FileAdapter()) # add adapter for pytests - response = session.get(url, params={'id': id}, stream=True) - token = get_confirm_token(response) - if token: - params = {'id': id, 'confirm': token} - response = session.get(url, params=params, stream=True) - - return response - - -# this drive the file handling -- called from zip_puller by all the -# handle_files implementations for GoogleDrive, Dropbox, and standard -# Web url -def handle_files_helper(args): - try: - origin_repo = args["repo_parent_dir"] + args["origin_dir"] - temp_download_repo = args["repo_parent_dir"] + args["download_dir"] - if os.path.exists(temp_download_repo): - shutil.rmtree(temp_download_repo) - - if not os.path.exists(origin_repo): - initialize_local_repo(origin_repo) - - clone_local_origin_repo(origin_repo, temp_download_repo) - save_response_content(args["repo"], args["response"], temp_download_repo) - subprocess.check_output(["git", "add", "."], cwd=temp_download_repo) - subprocess.check_output(["git", "-c", "user.email=nbgitpuller@nbgitpuller.link", "-c", "user.name=nbgitpuller", "commit", "-m", "test", "--allow-empty"], cwd=temp_download_repo) - subprocess.check_output(["git", "push", "origin", "master"], cwd=temp_download_repo) - unzipped_dirs = os.listdir(temp_download_repo) - - dir_names = list(filter(lambda dir: ".git" not in dir, unzipped_dirs)) - return {"unzip_dir": dir_names[0], "origin_repo_path": origin_repo} - except Exception as e: - logging.exception(e) - raise ValueError(e) diff --git a/nbgitpuller/plugins/zip_puller.py b/nbgitpuller/plugins/zip_puller.py deleted file mode 100644 index af18a488..00000000 --- a/nbgitpuller/plugins/zip_puller.py +++ /dev/null @@ -1,79 +0,0 @@ -from .plugin_helper import fetch_files -from .plugin_helper import handle_files_helper -import pluggy - -hookimpl = pluggy.HookimplMarker("nbgitpuller") -TEMP_DOWNLOAD_REPO_DIR = ".temp_download_repo" -CACHED_ORIGIN_NON_GIT_REPO = ".origin_non_git_sources" - - -# handles standard web addresses(not google drive or dropbox) -class ZipSourceWebDownloader(object): - @hookimpl - def handle_files(self, repo, repo_parent_dir): - """ - :param str repo: publicly accessible url to compressed source files - :param str repo_parent_dir: where we will store the downloaded repo - :return two parameter json unzip_dir and origin_repo_path - :rtype json object - """ - response = fetch_files(repo) - args = { - "repo": repo, - "repo_parent_dir": repo_parent_dir, - "response": response, - "origin_dir": CACHED_ORIGIN_NON_GIT_REPO, - "download_dir": TEMP_DOWNLOAD_REPO_DIR - } - return handle_files_helper(args) - - -# handles downloads from google drive 
-class ZipSourceGoogleDriveDownloader(object): - def __init__(self): - self.DOWNLOAD_URL = "https://docs.google.com/uc?export=download" - - def get_id(self, repo): - start_id_index = repo.index("d/") + 2 - end_id_index = repo.index("/view") - return repo[start_id_index:end_id_index] - - @hookimpl - def handle_files(self, repo, repo_parent_dir): - """ - :param str repo: google drive share link to compressed source files - :param str repo_parent_dir: where we will store the downloaded repo - :return two parameter json unzip_dir and origin_repo_path - :rtype json object - """ - response = fetch_files(self.DOWNLOAD_URL, self.get_id(repo)) - args = { - "repo": repo, - "repo_parent_dir": repo_parent_dir, - "response": response, - "origin_dir": CACHED_ORIGIN_NON_GIT_REPO, - "download_dir": TEMP_DOWNLOAD_REPO_DIR - } - return handle_files_helper(args) - - -# handles downloads from DropBox -class ZipSourceDropBoxDownloader(object): - @hookimpl - def handle_files(self, repo, repo_parent_dir): - """ - :param str repo: dropbox download link to compressed source files - :param str repo_parent_dir: where we will store the downloaded repo - :return two parameter json unzip_dir and origin_repo_path - :rtype json object - """ - repo = repo.replace("dl=0", "dl=1") # download set to 1 for dropbox - response = fetch_files(repo) - args = { - "repo": repo, - "repo_parent_dir": repo_parent_dir, - "response": response, - "origin_dir": CACHED_ORIGIN_NON_GIT_REPO, - "download_dir": TEMP_DOWNLOAD_REPO_DIR - } - return handle_files_helper(args) diff --git a/nbgitpuller/static/index.js b/nbgitpuller/static/index.js index 7be399e9..c4349207 100644 --- a/nbgitpuller/static/index.js +++ b/nbgitpuller/static/index.js @@ -12,16 +12,15 @@ require([ Terminal.applyAddon(fit); - function GitSync(baseUrl, repo, branch, depth, compressed, targetpath, path) { + function GitSync(baseUrl, repo, branch, depth, targetpath, path, provider) { // Class that talks to the API backend & emits events as appropriate this.baseUrl = baseUrl; this.repo = repo; this.branch = branch; this.depth = depth; - this.compressed = compressed; this.targetpath = targetpath; + this.provider = provider; this.redirectUrl = baseUrl + path; - this.callbacks = {}; } @@ -53,8 +52,8 @@ require([ if (typeof this.branch !== 'undefined' && this.branch != undefined) { syncUrlParams['branch'] = this.branch; } - if (typeof this.compressed !== 'undefined' && this.compressed != undefined) { - syncUrlParams['compressed'] = this.compressed; + if (typeof this.provider !== 'undefined' && this.provider != undefined) { + syncUrlParams['provider'] = this.provider; } var syncUrl = this.baseUrl + 'git-pull/api?' 
+ $.param(syncUrlParams); @@ -137,9 +136,9 @@ require([ utils.get_body_data('repo'), utils.get_body_data('branch'), utils.get_body_data('depth'), - utils.get_body_data('compressed'), utils.get_body_data('targetpath'), - utils.get_body_data('path') + utils.get_body_data('path'), + utils.get_body_data('provider') ); var gsv = new GitSyncView( @@ -149,6 +148,7 @@ require([ ); gs.addHandler('syncing', function(data) { + gsv.setTerminalVisibility(true); gsv.term.write(data.output); }); gs.addHandler('finished', function(data) { diff --git a/nbgitpuller/templates/status.html b/nbgitpuller/templates/status.html index 99b9f53e..b1fdc1e4 100644 --- a/nbgitpuller/templates/status.html +++ b/nbgitpuller/templates/status.html @@ -7,7 +7,7 @@ data-path="{{ path | urlencode }}" {% if branch %}data-branch="{{ branch | urlencode }}"{% endif %} {% if depth %}data-depth="{{ depth | urlencode }}"{% endif %} -{% if compressed %}data-compressed="{{ compressed | urlencode }}"{% endif %} +{% if provider %}data-provider="{{ provider | urlencode }}"{% endif %} data-targetpath="{{ targetpath | urlencode }}" {% endblock %} diff --git a/setup.py b/setup.py index 3a367291..a8b69302 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ packages=find_packages(), include_package_data=True, platforms='any', - install_requires=['notebook>=5.5.0', 'tornado', 'requests', 'requests-file'], + install_requires=['notebook>=5.5.0', 'tornado', 'aiohttp', 'pluggy'], data_files=[ ('etc/jupyter/jupyter_notebook_config.d', ['nbgitpuller/etc/nbgitpuller.json']) ], diff --git a/tests/test_download_puller.py b/tests/test_download_puller.py new file mode 100644 index 00000000..74d9dc80 --- /dev/null +++ b/tests/test_download_puller.py @@ -0,0 +1,112 @@ +import os +import pytest +import shutil +import nbgitpuller.plugin_helper as ph +import importlib +import aiohttp +from aioresponses import aioresponses +google_nb = importlib.import_module("nbgitpuller.plugins.nbgitpuller-googledrive.googledrive_puller") + +test_files_dir = os.getcwd() + "/tests/test_files" +archive_base = "/tmp/test_files" +repo_parent_dir = "/tmp/fake/" +temp_download_repo = "/tmp/download/" +temp_archive_download = "/tmp/archive_download/" +provider = "dropbox_test" +url = "http://test/this/repo" +CACHED_ORIGIN_NON_GIT_REPO = ".nbgitpuller/targets/" +origin_repo = f"{repo_parent_dir}{CACHED_ORIGIN_NON_GIT_REPO}{provider}/{url}/" + +repo_zip = 'file://' + archive_base + ".zip" +repo_tgz = 'file://' + archive_base + ".tar.gz" + + +@pytest.fixture +async def test_configuration(): + shutil.make_archive(archive_base, 'zip', test_files_dir) + shutil.make_archive(archive_base, 'gztar', test_files_dir) + os.makedirs(temp_archive_download, exist_ok=True) + os.makedirs(repo_parent_dir, exist_ok=True) + os.makedirs(temp_download_repo, exist_ok=True) + yield "test finishing" + os.remove(archive_base + ".zip") + os.remove(archive_base + ".tar.gz") + if os.path.isfile(temp_archive_download + "downloaded.zip"): + os.remove(temp_archive_download + "downloaded.zip") + shutil.rmtree(repo_parent_dir) + shutil.rmtree(temp_download_repo) + shutil.rmtree(temp_archive_download) + + +def test_extract_file_extension(): + url = "https://github.com/sean-morris/APCS-Source-Code/raw/master/materials-sp20-external.tgz" + ext = ph.extract_file_extension(url) + assert "tgz" in ext + + +@pytest.mark.asyncio +async def test_initialize_local_repo(test_configuration): + yield_str = "" + async for line in ph.initialize_local_repo(origin_repo): + yield_str += line + assert "init --bare" in yield_str + 
assert os.path.isdir(origin_repo) + + +@pytest.mark.asyncio +async def test_clone_local_origin_repo(test_configuration): + async for line in ph.initialize_local_repo(origin_repo): + pass + + yield_str = "" + async for line in ph.clone_local_origin_repo(origin_repo, temp_download_repo): + yield_str += line + + assert "Cloning into" in yield_str + assert os.path.isdir(temp_download_repo + ".git") + + +@pytest.mark.asyncio +async def test_execute_unarchive(test_configuration): + yield_str = "" + async for line in ph.execute_unarchive("zip", archive_base + ".zip", temp_download_repo): + yield_str += line + assert os.path.isfile("/tmp/download/hw/hw01/hw01.ipynb") + + +@pytest.mark.asyncio +async def test_push_to_local_origin(test_configuration): + async for line in ph.initialize_local_repo(origin_repo): + pass + + async for line in ph.clone_local_origin_repo(origin_repo, temp_download_repo): + pass + + async for line in ph.execute_unarchive("zip", archive_base + ".zip", temp_download_repo): + pass + + yield_str = "" + async for line in ph.push_to_local_origin(temp_download_repo): + yield_str += line + assert "[new branch]" in yield_str + + +@pytest.mark.asyncio +async def test_download_archive(test_configuration): + args = {} + args["repo"] = "http://fake.com" + with aioresponses() as mocked: + mocked.get(args["repo"], status=200, body=b'Pretend you are zip file being downloaded') + args["client"] = aiohttp.ClientSession() + yield_str = "" + async for line in ph.download_archive(args, temp_archive_download + "downloaded.zip"): + yield_str += line + assert 'Downloading archive' in yield_str + assert os.path.isfile(temp_archive_download + "downloaded.zip") + + +def test_google_get_id(): + google_repo = "https://drive.google.com/file/d/1p3m0h5UGWdLkVVP0SSJH6j1HpG2yeDlU/view?usp=sharing" + gnb = getattr(google_nb, "get_id") + file_id = gnb(google_repo) + assert file_id == "1p3m0h5UGWdLkVVP0SSJH6j1HpG2yeDlU" diff --git a/tests/test_zip_puller.py b/tests/test_zip_puller.py deleted file mode 100644 index 6d7988fe..00000000 --- a/tests/test_zip_puller.py +++ /dev/null @@ -1,55 +0,0 @@ -import os -import pytest -import shutil -from nbgitpuller.plugins.zip_puller import ZipSourceWebDownloader -from nbgitpuller.plugins.zip_puller import ZipSourceDropBoxDownloader -from nbgitpuller.plugins.zip_puller import ZipSourceGoogleDriveDownloader - -test_files_dir = os.getcwd() + "/tests/test_files" -archive_base = "/tmp/test_files" -repo_parent_dir = "/tmp/fake/" -repo_zip = 'file://' + archive_base + ".zip" -repo_tgz = 'file://' + archive_base + ".tar.gz" - - -@pytest.fixture -def test_configuration(): - shutil.make_archive(archive_base, 'zip', test_files_dir) - shutil.make_archive(archive_base, 'gztar', test_files_dir) - os.makedirs(repo_parent_dir, exist_ok=True) - yield "test finishing" - shutil.rmtree(repo_parent_dir) - os.remove(archive_base + ".zip") - os.remove(archive_base + ".tar.gz") - - -def assert_helper(down, zip, tgz): - resp_zip = down.handle_files(zip, repo_parent_dir) - resp_tgz = down.handle_files(tgz, repo_parent_dir) - assert "unzip_dir" in resp_zip - assert "origin_repo_path" in resp_zip - assert f"{repo_parent_dir}.origin_non_git_sources" in resp_zip["origin_repo_path"] - assert "hw" in resp_zip["unzip_dir"] - assert "unzip_dir" in resp_tgz - assert "origin_repo_path" in resp_tgz - assert f"{repo_parent_dir}.origin_non_git_sources" in resp_tgz["origin_repo_path"] - assert "hw" in resp_tgz["unzip_dir"] - - -def test_web_downloader(test_configuration): - down = ZipSourceWebDownloader() - 
assert_helper(down, repo_zip, repo_tgz) - - -def test_dropbox_downloader(test_configuration): - down = ZipSourceDropBoxDownloader() - drop_repo_zip = repo_zip + "?dl=0" - drop_repo_tgz = repo_tgz + "?dl=0" - assert_helper(down, drop_repo_zip, drop_repo_tgz) - - -def test_google_get_id(): - down = ZipSourceGoogleDriveDownloader() - google_repo = "https://drive.google.com/file/d/1p3m0h5UGWdLkVVP0SSJH6j1HpG2yeDlU/view?usp=sharing" - file_id = down.get_id(google_repo) - assert file_id == "1p3m0h5UGWdLkVVP0SSJH6j1HpG2yeDlU" diff --git a/tox.ini b/tox.ini index d83c017b..2f961185 100644 --- a/tox.ini +++ b/tox.ini @@ -11,6 +11,9 @@ deps= tornado notebook +[flake8] +max-line-length = 150 + [testenv:flake8] basepython = python3.8 deps = From 71ca2f4dc2c0f8a20a5c3e54a96d251852c17dc5 Mon Sep 17 00:00:00 2001 From: sean-morris Date: Wed, 3 Nov 2021 16:24:19 -0700 Subject: [PATCH 04/40] Update nbgitpuller/plugin_helper.py Co-authored-by: Erik Sundell --- nbgitpuller/plugin_helper.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nbgitpuller/plugin_helper.py b/nbgitpuller/plugin_helper.py index 77d14cc8..33838aea 100644 --- a/nbgitpuller/plugin_helper.py +++ b/nbgitpuller/plugin_helper.py @@ -7,10 +7,11 @@ import shutil from urllib.parse import urlparse from functools import partial -from nbgitpuller import \ - TEMP_DOWNLOAD_REPO_DIR, \ - CACHED_ORIGIN_NON_GIT_REPO, \ - REPO_PARENT_DIR +from nbgitpuller import ( + TEMP_DOWNLOAD_REPO_DIR, + CACHED_ORIGIN_NON_GIT_REPO, + REPO_PARENT_DIR, +) async def execute_cmd(cmd, **kwargs): From ae66e53f99745df7ccab39465de7a6c1a7314caa Mon Sep 17 00:00:00 2001 From: sean-morris Date: Wed, 3 Nov 2021 16:24:48 -0700 Subject: [PATCH 05/40] Update nbgitpuller/hookspecs.py Co-authored-by: Erik Sundell --- nbgitpuller/hookspecs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nbgitpuller/hookspecs.py b/nbgitpuller/hookspecs.py index 320637ce..50ac933e 100644 --- a/nbgitpuller/hookspecs.py +++ b/nbgitpuller/hookspecs.py @@ -16,7 +16,7 @@ def handle_files(query_line_args): The parameter, query_line_args, is any argument you put on the URL - Once the files are saved to the directly, git puller can handle all the + Once the files are saved to the directory, git puller can handle all the standard functions needed to make sure source files are updated or created as needed. 
""" From 8934f5f338786998acfd06fea8edf8c100a9e08a Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Wed, 3 Nov 2021 17:36:53 -0700 Subject: [PATCH 06/40] renamed and simplified the test_files --- tests/test_download_puller.py | 8 +- tests/test_files/hw/hw01/hw01.ipynb | 1405 --------------------------- tests/test_files/test.txt | 13 + 3 files changed, 17 insertions(+), 1409 deletions(-) delete mode 100644 tests/test_files/hw/hw01/hw01.ipynb create mode 100644 tests/test_files/test.txt diff --git a/tests/test_download_puller.py b/tests/test_download_puller.py index 74d9dc80..2d9f372e 100644 --- a/tests/test_download_puller.py +++ b/tests/test_download_puller.py @@ -39,7 +39,7 @@ async def test_configuration(): def test_extract_file_extension(): - url = "https://github.com/sean-morris/APCS-Source-Code/raw/master/materials-sp20-external.tgz" + url = "https://fake.com/master/materials-sp20-external.tgz" ext = ph.extract_file_extension(url) assert "tgz" in ext @@ -71,7 +71,7 @@ async def test_execute_unarchive(test_configuration): yield_str = "" async for line in ph.execute_unarchive("zip", archive_base + ".zip", temp_download_repo): yield_str += line - assert os.path.isfile("/tmp/download/hw/hw01/hw01.ipynb") + assert os.path.isfile("/tmp/download/test.txt") @pytest.mark.asyncio @@ -106,7 +106,7 @@ async def test_download_archive(test_configuration): def test_google_get_id(): - google_repo = "https://drive.google.com/file/d/1p3m0h5UGWdLkVVP0SSJH6j1HpG2yeDlU/view?usp=sharing" + google_repo = "https://drive.google.com/fake/d/1111122223333444444/view?usp=sharing" gnb = getattr(google_nb, "get_id") file_id = gnb(google_repo) - assert file_id == "1p3m0h5UGWdLkVVP0SSJH6j1HpG2yeDlU" + assert file_id == "1111122223333444444" diff --git a/tests/test_files/hw/hw01/hw01.ipynb b/tests/test_files/hw/hw01/hw01.ipynb deleted file mode 100644 index 960747ce..00000000 --- a/tests/test_files/hw/hw01/hw01.ipynb +++ /dev/null @@ -1,1405 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "# Initialize Otter\n", - "import otter\n", - "grader = otter.Notebook(\"hw01.ipynb\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Homework 1: Causality and Expressions\n", - "\n", - "Please complete this notebook by filling in the cells provided." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Recommended Reading:**\n", - "- [What is Data Science](http://www.inferentialthinking.com/chapters/01/what-is-data-science.html)\n", - "- [Causality and Experiments](http://www.inferentialthinking.com/chapters/02/causality-and-experiments.html) \n", - "- [Programming in Python](http://www.inferentialthinking.com/chapters/03/programming-in-python.html)\n", - "\n", - "For all problems that you must write explanations and sentences for, you **must** provide your answer in the designated space. Moreover, throughout this homework and all future ones, please be sure to not re-assign variables throughout the notebook! For example, if you use `max_temperature` in your answer to one question, do not reassign it later on. Otherwise, you will fail tests that you thought you were passing previously!\n", - "\n", - "\n", - "Directly sharing answers is not okay, but discussing problems with the course staff or with other students is encouraged. 
Refer to the policies page to learn more about how to learn cooperatively.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Scary Arithmetic\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "An ad for ADT Security Systems says,\n", - "\n", - "> \"When you go on vacation, burglars go to work [...] According to FBI statistics, over 25% of home burglaries occur between Memorial Day and Labor Day.\"\n", - "\n", - "Do the data in the ad support the claim that burglars are more likely to go to work during the time between Memorial Day and Labor Day? Please explain your answer.\n", - "\n", - "**Note:** You can assume that \"over 25%\" means only slightly over. Had it been much over, say closer to 30%, then the marketers would have said so.\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## 2. Characters in Little Women\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In lecture, we counted the number of times that the literary characters were named in each chapter of the classic book, [*Little Women*](https://www.inferentialthinking.com/chapters/01/3/1/literary-characters). In computer science, the word \"character\" also refers to a letter, digit, space, or punctuation mark; any single element of a text. The following code generates a scatter plot in which each dot corresponds to a chapter of *Little Women*. The horizontal position of a dot measures the number of periods in the chapter. The vertical position measures the total number of characters." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# This cell contains code that hasn't yet been covered in the course,\n", - "# but you should be able to interpret the scatter plot it generates.\n", - "\n", - "from datascience import *\n", - "from urllib.request import urlopen\n", - "import numpy as np\n", - "%matplotlib inline\n", - "\n", - "little_women_url = 'https://www.inferentialthinking.com/data/little_women.txt'\n", - "chapters = urlopen(little_women_url).read().decode().split('CHAPTER ')[1:]\n", - "text = Table().with_column('Chapters', chapters)\n", - "Table().with_columns(\n", - " 'Periods', np.char.count(chapters, '.'),\n", - " 'Characters', text.apply(len, 0)\n", - " ).scatter(0)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 1.** Around how many periods are there in the chapter with the most characters? Assign either 1, 2, 3, 4, or 5 to the name `characters_q1` below.\n", - "\n", - "1. 250\n", - "2. 390\n", - "3. 440\n", - "4. 32,000\n", - "5. 40,000\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "characters_q1 = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q2_1\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The test above checks that your answers are in the correct format. 
**This test does not check that you answered correctly**, only that you assigned a number successfully in each multiple-choice answer cell." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 2.** Which of the following chapters has the most characters per period? Assign either 1, 2, or 3 to the name `characters_q2` below.\n", - "1. The chapter with about 60 periods\n", - "2. The chapter with about 350 periods\n", - "3. The chapter with about 440 periods\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "characters_q2 = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q2_2\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Again, the test above checks that your answers are in the correct format, but not that you have answered correctly." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To discover more interesting facts from this plot, read [Section 1.3.2](https://www.inferentialthinking.com/chapters/01/3/2/another-kind-of-character) of the textbook." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Names and Assignment Statements\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 1.** When you run the following cell, Python produces a cryptic error message." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "4 = 2 + 2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "Choose the best explanation of what's wrong with the code, and then assign 1, 2, 3, or 4 to `names_q1` below to indicate your answer.\n", - "\n", - "1. Python is smart and already knows `4 = 2 + 2`.\n", - "\n", - "2. `4` is already a defined number, and it doesn't make sense to make a number be a name for something else. In Python, \"`x = 2 + 2`\" means \"assign `x` as the name for the value of `2 + 2`.\"\n", - "\n", - "3. It should be `2 + 2 = 4`.\n", - "\n", - "4. I don't get an error message. This is a trick question.\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "names_q1 = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q3_1\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 2.** When you run the following cell, Python will produce another cryptic error message." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "two = 3\n", - "six = two plus two" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "Choose the best explanation of what's wrong with the code and assign 1, 2, 3, or 4 to `names_q2` below to indicate your answer.\n", - "\n", - "1. The `plus` operation only applies to numbers, not the word \"two\".\n", - "\n", - "2. The name \"two\" cannot be assigned to the number 3.\n", - "\n", - "3. Two plus two is four, not six.\n", - "\n", - "4. 
Python cannot interpret the name `two` followed directly by a name that has not been defined.\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "names_q2 = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q3_2\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Question 3.** When you run the following cell, Python will, yet again, produce another cryptic error message." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "x = print(5)\n", - "y = x + 2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "Choose the best explanation of what's wrong with the code and assign 1, 2, or 3 to `names_q3` below to indicate your answer.\n", - "\n", - "1. Python doesn't want `y` to be assigned.\n", - "\n", - "2. The `print` operation is meant for displaying values to the programmer, not for assigning values!\n", - "\n", - "3. Python can’t do addition between one name and one number. It has to be 2 numbers or 2 predefined names.\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "names_q3 = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q3_3\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Job Opportunities & Education in Rural India\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A [study](http://www.nber.org/papers/w16021.pdf) at UCLA investigated factors that might result in greater attention to the health and education of girls in rural India. One such factor is information about job opportunities for women. The idea is that if people know that educated women can get good jobs, they might take more care of the health and education of girls in their families, as an investment in the girls’ future potential as earners. Without the knowledge of job opportunities, the author hypothesizes that families do not invest in women’s well-being.\n", - "\n", - "The study focused on 160 villages outside the capital of India, all with little access to information about call centers and similar organizations that offer job opportunities to women. In 80 of the villages chosen at random, recruiters visited the village, described the opportunities, recruited women who had some English language proficiency and experience with computers, and provided ongoing support free of charge for three years. In the other 80 villages, no recruiters visited and no other intervention was made.\n", - "\n", - "At the end of the study period, the researchers recorded data about the school attendance and health of the children in the villages." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 1.** Which statement best describes the *treatment* and *control* groups for this study? Assign either 1, 2, or 3 to the name `jobs_q1` below.\n", - "\n", - "1. The treatment group was the 80 villages visited by recruiters, and the control group was the other 80 villages with no intervention.\n", - "\n", - "2. 
The treatment group was the 160 villages selected, and the control group was the rest of the villages outside the capital of India.\n", - "\n", - "3. There is no clear notion of *treatment* and *control* group in this study.\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "jobs_q1 = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q4_1\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 2.** Was this an observational study or a randomized controlled experiment? Assign either 1, 2, or 3 to the name `jobs_q2` below.\n", - "\n", - "1. This was an observational study.\n", - "\n", - "2. This was a randomized controlled experiment. \n", - "\n", - "3. This was a randomized observational study.\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "jobs_q2 = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q4_2\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "**Question 3.** The study reported, \"Girls aged 5-15 in villages that received the recruiting services were 3 to 5 percentage points more likely to be in school and experienced an increase in Body Mass Index, reflecting greater nutrition and/or medical care. However, there was no net gain in height. For boys, there was no change in any of these measures.\" Why do you think the author points out the lack of change in the boys?\n", - "\n", - "*Hint:* Remember the original hypothesis. The author believes that educating women in job opportunities will cause families to invest more in the women’s well-being.\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## 5. Differences between Majors\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Berkeley’s Office of Planning and Analysis provides data on numerous aspects of the campus. Adapted from the OPA website, the table below displays the numbers of degree recipients in three majors in the academic years 2008-2009 and 2017-2018.\n", - "\n", - "| Major | 2008-2009 | 2017-2018 |\n", - "|------------------------------------|--------------|-------------|\n", - "| Gender and Women's Studies | 17 | 28 |\n", - "| Linguistics | 49 | 67 |\n", - "| Rhetoric | 113 | 56 |\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "**Question 1.** Suppose you want to find the **biggest** absolute difference between the numbers of degree recipients in the two years, among the three majors.\n", - "\n", - "In the cell below, compute this value and call it `biggest_change`. Use a single expression (a single line of code) to compute the answer. Let Python perform all the arithmetic (like subtracting 49 from 67) rather than simplifying the expression yourself. 
The built-in `abs` function takes a numerical input and returns the absolute value. The built-in `max` function can take in 3 arguments and returns the maximum of the three numbers\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "export_pdf": true - }, - "outputs": [], - "source": [ - "biggest_change = ...\n", - "biggest_change" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q5_1\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "**Question 2.** Which of the three majors had the **smallest** absolute difference? Assign `smallest_change_major` to 1, 2, or 3 where each number corresponds to the following major:\n", - "\n", - "1: Gender and Women's Studies \n", - "2: Linguistics \n", - "3: Rhetoric\n", - "\n", - "Choose the number that corresponds to the major with the smallest absolute difference.\n", - "\n", - "You should be able to answer by rough mental arithmetic, without having to calculate the exact value for each major. \n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "smallest_change_major = ...\n", - "smallest_change_major" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q5_2\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 3.** For each major, define the \"relative change\" to be the following: $\\large{\\frac{\\text{absolute difference}}{\\text{value in 2008-2009}} * 100}$ \n", - "\n", - "Fill in the code below such that `gws_relative_change`, `linguistics_relative_change` and `rhetoric_relative_change` are assigned to the relative changes for their respective majors.\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "for_assignment_type": "student" - }, - "outputs": [], - "source": [ - "gws_relative_change = (abs(...) 
/ 17) * 100\n", - "linguistics_relative_change = ...\n", - "rhetoric_relative_change = ...\n", - "gws_relative_change, linguistics_relative_change, rhetoric_relative_change" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q5_3\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 4.** Assign `biggest_rel_change_major` to 1, 2, or 3 where each number corresponds to to the following: \n", - "\n", - "1: Gender and Women's Studies \n", - "2: Linguistics \n", - "3: Rhetoric\n", - "\n", - "Choose the number that corresponds to the major with the biggest relative change.\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "# Assign biggest_rel_change_major to the number corresponding to the major with the biggest relative change.\n", - "biggest_rel_change_major = ...\n", - "biggest_rel_change_major" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q5_4\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Nearsightedness Study\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Myopia, or nearsightedness, results from a number of genetic and environmental factors. In 1999, Quinn et al studied the relation between myopia and ambient lighting at night (for example, from nightlights or room lights) during childhood." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "**Question 1.** The data were gathered by the following procedure, reported in the study. \"Between January and June 1998, parents of children aged 2-16 years [...] that were seen as outpatients in a university pediatric ophthalmology clinic completed a questionnaire on the child’s light exposure both at present and before the age of 2 years.\" Was this study observational, or was it a controlled experiment? Explain. \n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "\n", - "\n", - "**Question 2.** The study found that of the children who slept with a room light on before the age of 2, 55% were myopic. Of the children who slept with a night light on before the age of 2, 34% were myopic. Of the children who slept in the dark before the age of 2, 10% were myopic. The study concluded that, \"The prevalence of myopia [...] during childhood was strongly associated with ambient light exposure during sleep at night in the first two years after birth.\"\n", - "\n", - "Do the data support this statement? 
You may interpret \"strongly\" in any reasonable qualitative way.\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "\n", - "\n", - "**Question 3.** On May 13, 1999, CNN reported the results of this study under the headline, \"Night light may lead to nearsightedness.\" Does the conclusion of the study claim that night light causes nearsightedness?\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "\n", - "\n", - "\n", - "\n", - "**Question 4.** The final paragraph of the CNN report said that \"several eye specialists\" had pointed out that the study should have accounted for heredity.\n", - "\n", - "Myopia is passed down from parents to children. Myopic parents are more likely to have myopic children, and may also be more likely to leave lights on habitually (since the parents have poor vision). In what way does the knowledge of this possible genetic link affect how we interpret the data from the study? \n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_Type your answer here, replacing this text._" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "\n", - "\n", - "## 7. Studying the Survivors\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "The Reverend Henry Whitehead was skeptical of John Snow’s conclusion about the Broad Street pump. After the Broad Street cholera epidemic ended, Whitehead set about trying to prove Snow wrong. (The history of the event is detailed [here](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC1034367/pdf/medhist00183-0026.pdf).)\n", - "\n", - "He realized that Snow had focused his analysis almost entirely on those who had died. Whitehead, therefore, investigated the drinking habits of people in the Broad Street area who had not died in the outbreak.\n", - "\n", - "What is the main reason it was important to study this group?\n", - "\n", - "1) If Whitehead had found that many people had drunk water from the Broad Street pump and not caught cholera, that would have been evidence against Snow's hypothesis.\n", - "\n", - "2) Survivors could provide additional information about what else could have caused the cholera, potentially unearthing another cause.\n", - "\n", - "3) Through considering the survivors, Whitehead could have identified a cure for cholera.\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "# Assign survivor_answer to 1, 2, or 3\n", - "survivor_answer = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q7_1\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note:** Whitehead ended up finding further proof that the Broad Street pump played the central role in spreading the disease to the people who lived near it. Eventually, he became one of Snow’s greatest defenders." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 8. Policies and Administrivia\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This section of the homework is to ensure that you have read over the policies and frequently asked questions for the course. \n", - "\n", - "**It's important that you read through this section of the homework very carefully**. If you can get through all of this section and are sure you have all of the correct resources set up, you will be able to focus on the actual material this semester!\n", - "\n", - "Reading through the [policies](http://data8.org/sp20/policies.html) and the [FAQ](http://data8.org/sp20/faq.html) will help you get through this section very easily. It is recommended you do this before. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 1:** You have a question regarding the grading of your assignments that has not been previously answered on Piazza or the FAQ. Who do you contact? Assign `contact` to the number corresponding to the best choice below. \n", - "\n", - "1. The Instructors\n", - "2. Post on Piazza\n", - "3. Contact your Lab TA\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "contact = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q8_1\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 2:** Why will the grades on Gradescope and OkPy be different? Assign `grades` to the number corresponding to the best choice below. \n", - "\n", - "1. There was a mistake in the grading. I should contact someone about this\n", - "2. Gradescope grades the written portion, while OkPy grades the coded portion\n", - "3. Trick question; the grades should be the same on both platforms\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [], - "source": [ - "grades = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q8_2\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 3:** Regrade deadline dates will always be posted on the same Piazza post that releases the assignment grades, common mistakes, and solutions. Can you ask for parts of your assignment regraded after the regrade request window has passed? Assign `regrade` to the number corresponding to the best choice below. \n", - "\n", - "1. Yes\n", - "2. No\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [], - "source": [ - "regrade = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q8_3\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 4:** Do you have an Gradescope account? Head to [gradescope.com](http://gradescope.com) and check if you see Data 8. If you do not, please send your Lab TA an email with your email and student ID number. 
\n", - "\n", - "Once you have been enrolled, go to the Data 8 Gradescope course website. At the end of the url (link), you should see a number. Assign `gradescope` to that number. \n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [], - "source": [ - "gradescope = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q8_4\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 5:** Given the following scenarios, assign `acceptable` to the number of the scenario that is permissible given the guidelines on the [policies](http://data8.org/sp20/policies.html) page. \n", - "\n", - "1. Alice gets stuck on a homework assignment, so she googles a fix. She stumbles across a pdf of the solutions for the homework assignment from a previous semester's offering of Data 8. After inspecting the solution, Alice writes her own solution and submits the assignment.\n", - "\n", - "2. After getting confused by a project, Bob asks his friend for help. His friend helps by walking the student through his own logic, pointing out areas that are important given the context of the question. Upon hearing his friends logic, the Bob writes his own code and completes the project.\n", - "\n", - "3. Eve has an extremely busy schedule, so she really wants to leave lab early by finishing it and getting checked off. Her neighbor, Charlie, simply turns his computer so Eve can see how he completed some questions. After looking at his code, Eve finishes the lab and gets checked off.\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [], - "source": [ - "acceptable = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q8_5\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 6:** To make sure you have read through the [policies](http://data8.org/sp20/policies.html) and the [FAQ](http://data8.org/sp20/faq.html) carefully, how many HW/lab drops are there? Assign `drops` to the number corresponding to the best choice below. \n", - "\n", - "1. Two homework drops and one lab drop\n", - "2. One homework drop and one lab drop\n", - "3. Only one homework drop\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [], - "source": [ - "drops = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q8_6\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 7:** Does Data 8 offer any alternate exams? Assign `exams` to the number corresponding to the best choice below. \n", - "\n", - "1. Yes\n", - "2. No\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [], - "source": [ - "exams = ..." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q8_7\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "**Question 8:** Are you actually checking Piazza? Go to this semester's [Data 8 Piazza](https://piazza.com/class/k5fwiw4wql642x), and find an instructor posted thread with a certain secret phrase. Assign `secret` to this secret phrase in quotes (aka as a string).\n", - "\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [], - "source": [ - "secret = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q8_8\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 9. Welcome Survey\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once you have submitted, please also complete the welcome survey in order to receive credit for homework 1.\n", - "\n", - "Welcome survey is here: https://docs.google.com/forms/d/e/1FAIpQLSd28-DvELnGk4n6lHcqMOWcsovDulNSbhmlLFXqDMQIsdldaQ/viewform?usp=sf_link" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "Assign `survey` to the secret string given at the end of the welcome survey:\n", - "" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [], - "source": [ - "survey = ..." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check(\"q9\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "---\n", - "\n", - "To double-check your work, the cell below will rerun all of the autograder tests." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "grader.check_all()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": false, - "editable": false - }, - "source": [ - "## Submission\n", - "\n", - "Make sure you have run all cells in your notebook in order before running the cell below, so that all images/graphs appear in the output. The cell below will generate a zip file for you to submit. 
**Please save before exporting!**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "deletable": false, - "editable": false - }, - "outputs": [], - "source": [ - "# Save your notebook first, then run this cell to export your submission.\n", - "grader.export(pdf=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " " - ] - } - ], - "metadata": { - "celltoolbar": "None", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/tests/test_files/test.txt b/tests/test_files/test.txt new file mode 100644 index 00000000..8e435da9 --- /dev/null +++ b/tests/test_files/test.txt @@ -0,0 +1,13 @@ +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 From ac2072c34ac4978fbd9bb800c17d4c96612f162d Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Wed, 3 Nov 2021 21:07:29 -0700 Subject: [PATCH 07/40] added README to plugins --- nbgitpuller/plugins/README.md | 23 +++++++++++++++++++ .../plugins/nbgitpuller-dropbox/README.md | 19 +++++++++++++++ .../plugins/nbgitpuller-googledrive/README.md | 17 ++++++++++++++ .../plugins/nbgitpuller-standard/README.md | 20 ++++++++++++++++ 4 files changed, 79 insertions(+) create mode 100644 nbgitpuller/plugins/README.md create mode 100644 nbgitpuller/plugins/nbgitpuller-dropbox/README.md create mode 100644 nbgitpuller/plugins/nbgitpuller-googledrive/README.md create mode 100644 nbgitpuller/plugins/nbgitpuller-standard/README.md diff --git a/nbgitpuller/plugins/README.md b/nbgitpuller/plugins/README.md new file mode 100644 index 00000000..8d5dc474 --- /dev/null +++ b/nbgitpuller/plugins/README.md @@ -0,0 +1,23 @@ +# [nbgitpuller download plugins](https://github.com/jupyterhub/nbgitpuller) + +`nbgitpuller` download plugins enable users to download compressed +archives(zip or tar-compatible) into jupyter hubs from any publicly accessible URL +including from services such as Google Drive and Dropbox. Each plugin in this directory +includes a README file describing the format of the URL expected from each provider. + +You can install some or all of the plugins into your environment. They are automatically +discovered by the system; we used pluggy(https://pluggy.readthedocs.io/en/stable/) to handle +the loading and implementation of these plugins. + +If you would like to add a provider, you can mimic the plug-in format in one of the provided +examples, install it into your jupyterhub environment and it will be automatically discovered +by nbgitpuller. + + +## Installation + +```shell +python3 -m pip install nbgitpuller-dropbox +python3 -m pip install nbgitpuller-googledrive +python3 -m pip install nbgitpuller-standard +``` diff --git a/nbgitpuller/plugins/nbgitpuller-dropbox/README.md b/nbgitpuller/plugins/nbgitpuller-dropbox/README.md new file mode 100644 index 00000000..e7e84be8 --- /dev/null +++ b/nbgitpuller/plugins/nbgitpuller-dropbox/README.md @@ -0,0 +1,19 @@ +# nbgitpuller - dropbox download plugin + +Dropbox file/folder names add the dl=0 URL query parameter to their URLs. + +This plugin expects the URL to look like this: +- https://www.dropbox.com/s/qou3g7hf41vq6sw/materials-sp20-external.zip?dl=0 + +This plugin replaces dl=0 with dl=1 and then downloads the file. 
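The rewrite is a plain string substitution on the shared link. A minimal sketch of the idea, reusing the example URL above (illustrative only — not a live file, and a real plugin would go on to download `direct_url`):

```python
# Sketch of the dl=0 -> dl=1 rewrite described above; the dropbox plugin
# performs the equivalent replacement on the URL it is handed.
shared_url = "https://www.dropbox.com/s/qou3g7hf41vq6sw/materials-sp20-external.zip?dl=0"

# dl=1 asks Dropbox for the raw file instead of its HTML preview page.
direct_url = shared_url.replace("dl=0", "dl=1")
assert direct_url.endswith("?dl=1")
```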
+ +Please note that the file(compressed archive) must have permissions set so that anyone +with the link can view the file. + +## Installation + +```shell +python3 -m pip install nbgitpuller-dropbox +``` + + diff --git a/nbgitpuller/plugins/nbgitpuller-googledrive/README.md b/nbgitpuller/plugins/nbgitpuller-googledrive/README.md new file mode 100644 index 00000000..372c6b39 --- /dev/null +++ b/nbgitpuller/plugins/nbgitpuller-googledrive/README.md @@ -0,0 +1,17 @@ +# nbgitpuller - google drive download plugin + +Google Drive uses a uniquely formatted URL to identify files and folders. As a result, +programmatically downloading from Google Drive requires special handling. The +implementation of the download plugin for Google Drive handles these requirements. + +The plugin is expecting a URL in this format: +- https://drive.google.com/file/d/1p3m0h5UGWdLkVVP0SSJH6j1HpG2yeDlU/view?usp=sharing + +Please note that the file(compressed archive) must have permissions set so that anyone +with the link can view the file. + +## Installation + +```shell +python3 -m pip install nbgitpuller-googledrive +``` diff --git a/nbgitpuller/plugins/nbgitpuller-standard/README.md b/nbgitpuller/plugins/nbgitpuller-standard/README.md new file mode 100644 index 00000000..a2e56e1c --- /dev/null +++ b/nbgitpuller/plugins/nbgitpuller-standard/README.md @@ -0,0 +1,20 @@ +# nbgitpuller - standard web server download plugin + +The standard web server download plugin handles any publicly accessible URL that points +by name to the compressed archive; this is in contrast to URLs that point to compressed archives stored in +services like Google Drive or Dropbox. + +In these services, the URL uses the services mechanism for determining +the compressed file(eg. an ID that identifies the file rather than the name of the file itself) +and hence the downloading from these services is slightly different. I have provided +examples in the folders nbgitpuller-dropbox and nbgitpuller-googledrive. + +The format of the URL is like any standard web address. 
For example: +- https://github.com/username/folder/raw/master/materials-sp20-external.tgz +- https://myinstituition.edu/courseX/x-materials.zip + +## Installation + +```shell +python3 -m pip install nbgitpuller-standard +``` From a84096d37de82e5f17a768d082659b060827e772 Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Wed, 3 Nov 2021 21:18:50 -0700 Subject: [PATCH 08/40] added docstring to progress_loop function --- nbgitpuller/handlers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py index ca5b7ee9..124814ec 100644 --- a/nbgitpuller/handlers.py +++ b/nbgitpuller/handlers.py @@ -48,6 +48,13 @@ def setup_plugins(self, provider): @gen.coroutine def progress_loop(self, queue): + """ + The loop below constantly checks the queue parameter for messages + that are being sent to the UI so the user is kept aware of progress related to + the downloading of archives and the merging of files into the user's home folder + + :param queue: This is either the download_queue or the original pull queue + """ while True: try: progress = queue.get_nowait() From 86fd7bf43a06960f78a50b9fd1b4062a4e90ae74 Mon Sep 17 00:00:00 2001 From: sean-morris Date: Thu, 4 Nov 2021 12:31:56 -0700 Subject: [PATCH 09/40] Update tests/test_download_puller.py Co-authored-by: Erik Sundell --- tests/test_download_puller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_download_puller.py b/tests/test_download_puller.py index 2d9f372e..d28b7090 100644 --- a/tests/test_download_puller.py +++ b/tests/test_download_puller.py @@ -39,7 +39,7 @@ async def test_configuration(): def test_extract_file_extension(): - url = "https://fake.com/master/materials-sp20-external.tgz" + url = "https://example.org/master/materials-sp20-external.tgz" ext = ph.extract_file_extension(url) assert "tgz" in ext From c686651fedbffbcb3411328bd9c4580198a9ab3a Mon Sep 17 00:00:00 2001 From: sean-morris Date: Thu, 4 Nov 2021 12:32:14 -0700 Subject: [PATCH 10/40] Update tests/test_download_puller.py Co-authored-by: Erik Sundell --- tests/test_download_puller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_download_puller.py b/tests/test_download_puller.py index d28b7090..6bfa65d1 100644 --- a/tests/test_download_puller.py +++ b/tests/test_download_puller.py @@ -94,7 +94,7 @@ async def test_push_to_local_origin(test_configuration): @pytest.mark.asyncio async def test_download_archive(test_configuration): args = {} - args["repo"] = "http://fake.com" + args["repo"] = "http://example.org/mocked-download-url" with aioresponses() as mocked: mocked.get(args["repo"], status=200, body=b'Pretend you are zip file being downloaded') args["client"] = aiohttp.ClientSession() From f8e04f1f5a99ec145b55df22fc9ebd7d774477fc Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Fri, 5 Nov 2021 17:05:23 -0700 Subject: [PATCH 11/40] Removed Downloader Plugins from Repo - removed plugins from this repo and put them into their own repos - renamed the test file that is testing plugin_helper - removed one test related to googledrive downloader plugin --- nbgitpuller/plugins/README.md | 23 --- nbgitpuller/plugins/__init__.py | 0 .../plugins/nbgitpuller-dropbox/README.md | 19 --- .../plugins/nbgitpuller-dropbox/__init__.py | 0 .../nbgitpuller-dropbox/dropbox_puller.py | 33 ----- .../plugins/nbgitpuller-dropbox/setup.py | 9 -- .../plugins/nbgitpuller-googledrive/README.md | 17 --- .../nbgitpuller-googledrive/__init__.py | 0 .../googledrive_puller.py | 138 ------------------
.../plugins/nbgitpuller-googledrive/setup.py | 9 -- .../plugins/nbgitpuller-standard/README.md | 20 --- .../plugins/nbgitpuller-standard/__init__.py | 0 .../plugins/nbgitpuller-standard/setup.py | 9 -- .../standardweb_puller.py | 32 ---- ...wnload_puller.py => test_plugin_helper.py} | 8 - 15 files changed, 317 deletions(-) delete mode 100644 nbgitpuller/plugins/README.md delete mode 100644 nbgitpuller/plugins/__init__.py delete mode 100644 nbgitpuller/plugins/nbgitpuller-dropbox/README.md delete mode 100644 nbgitpuller/plugins/nbgitpuller-dropbox/__init__.py delete mode 100644 nbgitpuller/plugins/nbgitpuller-dropbox/dropbox_puller.py delete mode 100644 nbgitpuller/plugins/nbgitpuller-dropbox/setup.py delete mode 100644 nbgitpuller/plugins/nbgitpuller-googledrive/README.md delete mode 100644 nbgitpuller/plugins/nbgitpuller-googledrive/__init__.py delete mode 100644 nbgitpuller/plugins/nbgitpuller-googledrive/googledrive_puller.py delete mode 100644 nbgitpuller/plugins/nbgitpuller-googledrive/setup.py delete mode 100644 nbgitpuller/plugins/nbgitpuller-standard/README.md delete mode 100644 nbgitpuller/plugins/nbgitpuller-standard/__init__.py delete mode 100644 nbgitpuller/plugins/nbgitpuller-standard/setup.py delete mode 100644 nbgitpuller/plugins/nbgitpuller-standard/standardweb_puller.py rename tests/{test_download_puller.py => test_plugin_helper.py} (91%) diff --git a/nbgitpuller/plugins/README.md b/nbgitpuller/plugins/README.md deleted file mode 100644 index 8d5dc474..00000000 --- a/nbgitpuller/plugins/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# [nbgitpuller download plugins](https://github.com/jupyterhub/nbgitpuller) - -`nbgitpuller` download plugins enable users to download compressed -archives(zip or tar-compatible) into jupyter hubs from any publicly accessible URL -including from services such as Google Drive and Dropbox. Each plugin in this directory -includes a README file describing the format of the URL expected from each provider. - -You can install some or all of the plugins into your environment. They are automatically -discovered by the system; we used pluggy(https://pluggy.readthedocs.io/en/stable/) to handle -the loading and implementation of these plugins. - -If you would like to add a provider, you can mimic the plug-in format in one of the provided -examples, install it into your jupyterhub environment and it will be automatically discovered -by nbgitpuller. - - -## Installation - -```shell -python3 -m pip install nbgitpuller-dropbox -python3 -m pip install nbgitpuller-googledrive -python3 -m pip install nbgitpuller-standard -``` diff --git a/nbgitpuller/plugins/__init__.py b/nbgitpuller/plugins/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/nbgitpuller/plugins/nbgitpuller-dropbox/README.md b/nbgitpuller/plugins/nbgitpuller-dropbox/README.md deleted file mode 100644 index e7e84be8..00000000 --- a/nbgitpuller/plugins/nbgitpuller-dropbox/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# nbgitpuller - dropbox download plugin - -Dropbox file/folder names add the dl=0 URL query parameter to their URLs. - -This plugin expects the URL to look like this: -- https://www.dropbox.com/s/qou3g7hf41vq6sw/materials-sp20-external.zip?dl=0 - -This plugin replaces dl=0 with dl=1 and then downloads the file. - -Please note that the file(compressed archive) must have permissions set so that anyone -with the link can view the file. 
- -## Installation - -```shell -python3 -m pip install nbgitpuller-dropbox -``` - - diff --git a/nbgitpuller/plugins/nbgitpuller-dropbox/__init__.py b/nbgitpuller/plugins/nbgitpuller-dropbox/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/nbgitpuller/plugins/nbgitpuller-dropbox/dropbox_puller.py b/nbgitpuller/plugins/nbgitpuller-dropbox/dropbox_puller.py deleted file mode 100644 index 36b0b18c..00000000 --- a/nbgitpuller/plugins/nbgitpuller-dropbox/dropbox_puller.py +++ /dev/null @@ -1,33 +0,0 @@ -from nbgitpuller.plugin_helper import handle_files_helper -from nbgitpuller.plugin_helper import extract_file_extension -from nbgitpuller.hookspecs import hookimpl -import asyncio - - -def determine_file_extension(url): - """ - :param str url: url to source - :return the extension indicating the file compression(e.g. zip, tgz) - :rtype str - """ - return extract_file_extension(url) - - -@hookimpl -def handle_files(query_line_args): - """ - :param json args: this includes any argument you put on the url - PLUS the function, query_line_args["progress_func"], that writes messages to - the progress stream in the browser window and the download_q, - query_line_args["download_q"] the progress function uses. - :return two parameter json unzip_dir and origin_repo_path - :rtype json object - """ - query_line_args["repo"] = query_line_args["repo"].replace("dl=0", "dl=1") # dropbox: download set to 1 - ext = determine_file_extension(query_line_args["repo"]) - query_line_args["extension"] = ext - - loop = asyncio.get_event_loop() - tasks = handle_files_helper(query_line_args), query_line_args["progress_func"]() - result_handle, _ = loop.run_until_complete(asyncio.gather(*tasks)) - return result_handle diff --git a/nbgitpuller/plugins/nbgitpuller-dropbox/setup.py b/nbgitpuller/plugins/nbgitpuller-dropbox/setup.py deleted file mode 100644 index cc6e6ee0..00000000 --- a/nbgitpuller/plugins/nbgitpuller-dropbox/setup.py +++ /dev/null @@ -1,9 +0,0 @@ -from setuptools import setup - -setup( - name="nbgitpuller-dropbox", - entry_points={ - "nbgitpuller": ["dropbox=dropbox_puller"] - }, - py_modules=["dropbox_puller"] -) diff --git a/nbgitpuller/plugins/nbgitpuller-googledrive/README.md b/nbgitpuller/plugins/nbgitpuller-googledrive/README.md deleted file mode 100644 index 372c6b39..00000000 --- a/nbgitpuller/plugins/nbgitpuller-googledrive/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# nbgitpuller - google drive download plugin - -Google Drive uses a uniquely formatted URL to identify files and folders. As a result, -programmatically downloading from Google Drive requires special handling. The -implementation of the download plugin for Google Drive handles these requirements. - -The plugin is expecting a URL in this format: -- https://drive.google.com/file/d/1p3m0h5UGWdLkVVP0SSJH6j1HpG2yeDlU/view?usp=sharing - -Please note that the file(compressed archive) must have permissions set so that anyone -with the link can view the file. 
- -## Installation - -```shell -python3 -m pip install nbgitpuller-googledrive -``` diff --git a/nbgitpuller/plugins/nbgitpuller-googledrive/__init__.py b/nbgitpuller/plugins/nbgitpuller-googledrive/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/nbgitpuller/plugins/nbgitpuller-googledrive/googledrive_puller.py b/nbgitpuller/plugins/nbgitpuller-googledrive/googledrive_puller.py deleted file mode 100644 index dfcca579..00000000 --- a/nbgitpuller/plugins/nbgitpuller-googledrive/googledrive_puller.py +++ /dev/null @@ -1,138 +0,0 @@ -from nbgitpuller.hookspecs import hookimpl -import re -import asyncio -import aiohttp -from nbgitpuller.plugin_helper import handle_files_helper -from nbgitpuller import TEMP_DOWNLOAD_REPO_DIR - -DOWNLOAD_URL = "https://docs.google.com/uc?export=download" - - -@hookimpl -def handle_files(query_line_args): - """ - :param json args: this includes any argument you put on the url - PLUS the function, query_line_args["progress_func"], that writes messages to - the progress stream in the browser window and the download_q, - query_line_args["download_q"] the progress function uses. - :return two parameter json unzip_dir and origin_repo_path - :rtype json object - """ - loop = asyncio.get_event_loop() - repo = query_line_args["repo"] - query_line_args["download_q"].put_nowait("Determining type of archive...\n") - response = loop.run_until_complete(get_response_from_drive(DOWNLOAD_URL, get_id(repo))) - ext = determine_file_extension_from_response(response) - query_line_args["download_q"].put_nowait(f"Archive is: {ext}\n") - temp_download_file = f"{TEMP_DOWNLOAD_REPO_DIR}/download.{ext}" - - query_line_args["extension"] = ext - query_line_args["dowload_func"] = download_archive_for_google - query_line_args["dowload_func_params"] = query_line_args, temp_download_file - - tasks = handle_files_helper(query_line_args), query_line_args["progress_func"]() - result_handle, _ = loop.run_until_complete(asyncio.gather(*tasks)) - return result_handle - - -def get_id(repo): - """ - :param str repo: the url to the compressed file contained the google id - :return the google drive id of the file to be downloaded - :rtype str - """ - start_id_index = repo.index("d/") + 2 - end_id_index = repo.index("/view") - return repo[start_id_index:end_id_index] - - -def get_confirm_token(session, url): - """ - :param aiohttp.ClientSession session: used to the get the cookies from the reponse - :param str url : the url is used to filter out the correct cookies from the session - :return the cookie if found or None if not found - :rtype str - - This used to determine whether or not Google needs you to confirm a large download - file is being downloaded - """ - cookies = session.cookie_jar.filter_cookies(url) - for key, cookie in cookies.items(): - if key.startswith('download_warning'): - return cookie - return None - - -async def download_archive_for_google(args, temp_download_file): - """ - :param map args: key-value pairs includes repo path - :param str temp_download_file: the path to save the requested file to - - This requests the file from the repo(url) given and saves it to the disk - """ - yield "Downloading archive ...\n" - try: - repo = args["repo"] - id = get_id(repo) - CHUNK_SIZE = 1024 - async with aiohttp.ClientSession() as session: - async with session.get(DOWNLOAD_URL, params={'id': id}) as response: - token = get_confirm_token(session, repo) - if token: - params = {'id': id, 'confirm': token} - response = await session.get(repo, params=params) - with 
open(temp_download_file, 'ab') as fd: - count_chunks = 1 - while True: - count_chunks += 1 - if count_chunks % 1000 == 0: - display = count_chunks / 1000 - yield f"Downloading Progress ... {display}MB\n" - chunk = await response.content.read(CHUNK_SIZE) - if not chunk: - break - fd.write(chunk) - yield "Archive Downloaded....\n" - except Exception as e: - raise e - - -async def get_response_from_drive(url, id): - """ - :param str url: the google download URL - :param str id: the google id of the file to download - :return response object - :rtype json object - You need to check to see that Google Drive has not asked the - request to confirm that they disabled the virus scan on files that - are bigger than 100MB(The size is mentioned online but I did not see - confirmation - something larger essentially). For large files, you have - to request again but this time putting the 'confirm=XXX' as a query - parameter. - """ - async with aiohttp.ClientSession() as session: - async with session.get(url, params={'id': id}) as response: - token = get_confirm_token(session, url) - if token: - params = {'id': id, 'confirm': token} - response = await session.get(url, params=params) - return response - return response - - -def determine_file_extension_from_response(response): - """ - :param str response: the response object from the download - :return the extension indicating the file compression(e.g. zip, tgz) - :rtype str - """ - content_disposition = response.headers.get('content-disposition') - if content_disposition: - fname = re.findall("filename\\*?=([^;]+)", content_disposition) - fname = fname[0].strip().strip('"') - ext = fname.split(".")[1] - - if ext is None: - m = f"Could not determine compression type of: {content_disposition}" - raise Exception(m) - return ext diff --git a/nbgitpuller/plugins/nbgitpuller-googledrive/setup.py b/nbgitpuller/plugins/nbgitpuller-googledrive/setup.py deleted file mode 100644 index 37b0064d..00000000 --- a/nbgitpuller/plugins/nbgitpuller-googledrive/setup.py +++ /dev/null @@ -1,9 +0,0 @@ -from setuptools import setup - -setup( - name="nbgitpuller-googledrive", - entry_points={ - "nbgitpuller": ["googledrive=googledrive_puller"] - }, - py_modules=["googledrive_puller"] -) diff --git a/nbgitpuller/plugins/nbgitpuller-standard/README.md b/nbgitpuller/plugins/nbgitpuller-standard/README.md deleted file mode 100644 index a2e56e1c..00000000 --- a/nbgitpuller/plugins/nbgitpuller-standard/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# nbgitpuller - standard web server download plugin - -The standard web server download plugin handles any publicly accessible URL that points -by name to the compressed archive; this is in contrast to URLs that point to compressed archives stored in -services like Google Drive or Dropbox. - -In these services, the URL uses the services mechanism for determining -the compressed file(eg. an ID that identifies the file rather than the name of the file itself) -and hence the downloading from these services is slightly different. I have provided -examples in the folders nbgitpuller-dropbox and nbgitpuller-googledrive. - -The format of the URL is like any standard web address. 
For example: -- https://github.com/username/folder/raw/master/materials-sp20-external.tgz -- https://myinstituition.edu/courseX/x-materials.zip - -## Installation - -```shell -python3 -m pip install nbgitpuller-standard -``` diff --git a/nbgitpuller/plugins/nbgitpuller-standard/__init__.py b/nbgitpuller/plugins/nbgitpuller-standard/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/nbgitpuller/plugins/nbgitpuller-standard/setup.py b/nbgitpuller/plugins/nbgitpuller-standard/setup.py deleted file mode 100644 index 1289ab08..00000000 --- a/nbgitpuller/plugins/nbgitpuller-standard/setup.py +++ /dev/null @@ -1,9 +0,0 @@ -from setuptools import setup - -setup( - name="nbgitpuller-standard", - entry_points={ - "nbgitpuller": ["standard=standardweb_puller"] - }, - py_modules=["standardweb_puller"] -) diff --git a/nbgitpuller/plugins/nbgitpuller-standard/standardweb_puller.py b/nbgitpuller/plugins/nbgitpuller-standard/standardweb_puller.py deleted file mode 100644 index 39303608..00000000 --- a/nbgitpuller/plugins/nbgitpuller-standard/standardweb_puller.py +++ /dev/null @@ -1,32 +0,0 @@ -from nbgitpuller.plugin_helper import handle_files_helper -from nbgitpuller.plugin_helper import extract_file_extension -from nbgitpuller.hookspecs import hookimpl -import asyncio - - -def determine_file_extension(url): - """ - :param str url: url to source - :return the extension indicating the file compression(e.g. zip, tgz) - :rtype str - """ - return extract_file_extension(url) - - -@hookimpl -def handle_files(query_line_args): - """ - :param json args: this includes any argument you put on the url - PLUS the function, query_line_args["progress_func"], that writes messages to - the progress stream in the browser window and the download_q, - query_line_args["download_q"] the progress function uses. - :return two parameter json unzip_dir and origin_repo_path - :rtype json object - """ - ext = determine_file_extension(query_line_args["repo"]) - query_line_args["extension"] = ext - - loop = asyncio.get_event_loop() - tasks = handle_files_helper(query_line_args), query_line_args["progress_func"]() - result_handle, _ = loop.run_until_complete(asyncio.gather(*tasks)) - return result_handle diff --git a/tests/test_download_puller.py b/tests/test_plugin_helper.py similarity index 91% rename from tests/test_download_puller.py rename to tests/test_plugin_helper.py index 6bfa65d1..b6eb4a90 100644 --- a/tests/test_download_puller.py +++ b/tests/test_plugin_helper.py @@ -5,7 +5,6 @@ import importlib import aiohttp from aioresponses import aioresponses -google_nb = importlib.import_module("nbgitpuller.plugins.nbgitpuller-googledrive.googledrive_puller") test_files_dir = os.getcwd() + "/tests/test_files" archive_base = "/tmp/test_files" @@ -103,10 +102,3 @@ async def test_download_archive(test_configuration): yield_str += line assert 'Downloading archive' in yield_str assert os.path.isfile(temp_archive_download + "downloaded.zip") - - -def test_google_get_id(): - google_repo = "https://drive.google.com/fake/d/1111122223333444444/view?usp=sharing" - gnb = getattr(google_nb, "get_id") - file_id = gnb(google_repo) - assert file_id == "1111122223333444444" From 958b0b15e17550cd5cde23cb8cc1bf43e22f528b Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Fri, 5 Nov 2021 17:09:19 -0700 Subject: [PATCH 12/40] Added Custom Exception for Bad Provider If the automatic plugin discovery can not locate the plugin with the name provided in the URL parameter provider an Exception is thrown. 
This happens when the plugin is not installed or the name provided does not correspond to the plugin name. --- .gitignore | 5 ++++- nbgitpuller/handlers.py | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 70ca3865..f949a534 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,9 @@ data8assets/ summer/ test-repo/ venv/ +.idea/ .ipynb_checkpoints -docs/_build \ No newline at end of file +docs/_build +jupyterhub.sqlite +jupyterhub_cookie_secret \ No newline at end of file diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py index 124814ec..b50f3c02 100644 --- a/nbgitpuller/handlers.py +++ b/nbgitpuller/handlers.py @@ -15,6 +15,15 @@ import pluggy +class ProviderException(Exception): + """ + Custom Exception thrown when the provider key specifying + the downloader plugin is not installed or can not be found by the + name given + """ + def __init__(self, response=None): + self.response = response + class SyncHandler(IPythonHandler): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -43,7 +52,9 @@ def emit(self, data): def setup_plugins(self, provider): pm = pluggy.PluginManager("nbgitpuller") pm.add_hookspecs(hookspecs) - pm.load_setuptools_entrypoints("nbgitpuller", name=provider) + num_loaded =pm.load_setuptools_entrypoints("nbgitpuller", name=provider) + if num_loaded == 0: + raise ProviderException(f"The provider key you supplied in the URL could not be found: {provider}") return pm @gen.coroutine @@ -149,6 +160,17 @@ def pull(): yield gen.sleep(3) self.emit({'phase': 'finished'}) + except ProviderException as pe: + self.emit({ + 'phase': 'error', + 'message': str(pe), + 'output': '\n'.join([ + line.strip() + for line in traceback.format_exception( + type(pe), pe, pe.__traceback__ + ) + ]) + }) except Exception as e: self.emit({ 'phase': 'error', From 9a8fcab6cc5848d67f6af93e6d77ab82ea5817b1 Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Mon, 8 Nov 2021 09:10:55 -0800 Subject: [PATCH 13/40] Removed unused import from test file --- tests/test_plugin_helper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_plugin_helper.py b/tests/test_plugin_helper.py index b6eb4a90..b191db74 100644 --- a/tests/test_plugin_helper.py +++ b/tests/test_plugin_helper.py @@ -2,7 +2,6 @@ import pytest import shutil import nbgitpuller.plugin_helper as ph -import importlib import aiohttp from aioresponses import aioresponses From 78e31c3e2702b97ce459dd7e577642003c0ad22d Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Mon, 8 Nov 2021 09:39:05 -0800 Subject: [PATCH 14/40] Added packages to dev-requirements.txt aioresponses and pytest-asyncio are needed to run the plugin_helper tests --- dev-requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev-requirements.txt b/dev-requirements.txt index f797739a..03f5a7ee 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -3,3 +3,5 @@ pytest pytest-cov flake8 nbclassic +aioresponses +pytest-asyncio \ No newline at end of file From a131b933b00312cc982c43d60ab63021168ee396 Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Tue, 9 Nov 2021 16:07:09 -0800 Subject: [PATCH 15/40] Moved the two constants and REPO_PARENT_DIR out of __init__.py The REPO_PARENT_DIR never belonged there. I now resolve it in SyncHandler as we always did and pass it into the handle_files function as part of the args; it will be used in the plugin_helper.py as the path to the files.
The other two TEMP_DOWNLOAD_REPO_DIR and CACHED_ORIGIN_NON_GIT_REPO are now declared and initialized in plugin_helper.py where they are used. --- nbgitpuller/__init__.py | 4 ---- nbgitpuller/handlers.py | 4 +--- nbgitpuller/plugin_helper.py | 19 +++++++++++-------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/nbgitpuller/__init__.py b/nbgitpuller/__init__.py index 7e579b6d..a2815bd6 100644 --- a/nbgitpuller/__init__.py +++ b/nbgitpuller/__init__.py @@ -6,10 +6,6 @@ import os import nest_asyncio -REPO_PARENT_DIR = None -TEMP_DOWNLOAD_REPO_DIR = "/tmp/temp_download_repo" -CACHED_ORIGIN_NON_GIT_REPO = ".nbgitpuller/targets/" - # this allows us to nest usage of the event_loop from asyncio # being used by tornado in jupyter distro # Ref: https://medium.com/@vyshali.enukonda/how-to-get-around-runtimeerror-this-event-loop-is-already-running-3f26f67e762e diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py index b50f3c02..1b8eb6d2 100644 --- a/nbgitpuller/handlers.py +++ b/nbgitpuller/handlers.py @@ -12,7 +12,6 @@ from .version import __version__ from . import hookspecs import pluggy -import nbgitpuller class ProviderException(Exception): @@ -73,7 +72,6 @@ def progress_loop(self, queue): yield gen.sleep(0.1) continue if progress is None: - yield gen.sleep(5) return if isinstance(progress, Exception): self.emit({ @@ -119,7 +117,6 @@ def get(self): # server_root_dir will include things like `~` and so the path # must be expanded. repo_parent_dir = os.path.join(os.path.expanduser(self.settings['server_root_dir']), os.getenv('NBGITPULLER_PARENTPATH', '')) - nbgitpuller.REPO_PARENT_DIR = repo_parent_dir repo_dir = os.path.join( repo_parent_dir, @@ -137,6 +134,7 @@ def get(self): download_q = Queue() req_args["progress_func"] = lambda: self.progress_loop(download_q) req_args["download_q"] = download_q + req_args["repo_parent_dir"] = repo_parent_dir hf_args = {"query_line_args": req_args} results = pm.hook.handle_files(**hf_args) repo_dir = repo_parent_dir + results["unzip_dir"] diff --git a/nbgitpuller/plugin_helper.py b/nbgitpuller/plugin_helper.py index 33838aea..02c73883 100644 --- a/nbgitpuller/plugin_helper.py +++ b/nbgitpuller/plugin_helper.py @@ -7,12 +7,14 @@ import shutil from urllib.parse import urlparse from functools import partial -from nbgitpuller import ( - TEMP_DOWNLOAD_REPO_DIR, - CACHED_ORIGIN_NON_GIT_REPO, - REPO_PARENT_DIR, -) +# this is a temporary folder used to download the archive into before +# it is decompressed and brought into the user's drive +TEMP_DOWNLOAD_REPO_DIR = "/tmp/temp_download_repo" + +# this is the path to the local origin repository that nbgitpuller uses to mimic +# a remote repo in GitPuller +CACHED_ORIGIN_NON_GIT_REPO = ".nbgitpuller/targets/" async def execute_cmd(cmd, **kwargs): """ @@ -178,8 +180,8 @@ async def push_to_local_origin(temp_download_repo): async def handle_files_helper(args): """ - :param map args: key-value pairs including the repo, provider, extenstion, - download function and download parameters in the case + :param map args: key-value pairs including the repo, provider, extension, repo_parent_dir, + the download function and download parameters in the case that the source needs to handle the download in a specific way (e.g. 
google requires a confirmation of the download) :return json object with the directory name of the download and @@ -192,7 +194,8 @@ async def handle_files_helper(args): """ url = args["repo"].translate(str.maketrans('', '', string.punctuation)) provider = args["provider"] - origin_repo = f"{REPO_PARENT_DIR}{CACHED_ORIGIN_NON_GIT_REPO}{provider}/{url}/" + repo_parent_dir = args["repo_parent_dir"] + origin_repo = f"{repo_parent_dir}{CACHED_ORIGIN_NON_GIT_REPO}{provider}/{url}/" temp_download_repo = TEMP_DOWNLOAD_REPO_DIR temp_download_file = f"{TEMP_DOWNLOAD_REPO_DIR}/download.{args['extension']}" From 55da5e15438b07ecab91837ac3e06c0fb0e46c84 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Wed, 17 Nov 2021 19:20:58 +0100 Subject: [PATCH 16/40] Revert some trivial formatting changes I wanted to make this change to make it even easier to review the changes in a quite large PR. --- nbgitpuller/handlers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py index 1b8eb6d2..eae3e9ee 100644 --- a/nbgitpuller/handlers.py +++ b/nbgitpuller/handlers.py @@ -202,18 +202,19 @@ def initialize(self): @gen.coroutine def get(self): app_env = os.getenv('NBGITPULLER_APP', default='notebook') + repo = self.get_argument('repo') branch = self.get_argument('branch', None) depth = self.get_argument('depth', None) provider = self.get_argument('provider', None) urlPath = self.get_argument('urlpath', None) or \ - self.get_argument('urlPath', None) + self.get_argument('urlPath', None) subPath = self.get_argument('subpath', None) or \ - self.get_argument('subPath', '.') + self.get_argument('subPath', '.') app = self.get_argument('app', app_env) parent_reldir = os.getenv('NBGITPULLER_PARENTPATH', '') targetpath = self.get_argument('targetpath', None) or \ - self.get_argument('targetPath', repo.split('/')[-1]) + self.get_argument('targetPath', repo.split('/')[-1]) if urlPath: path = urlPath From 0ca6cf94ac94160fa624d9718e2a05c64d577464 Mon Sep 17 00:00:00 2001 From: sean-morris Date: Wed, 17 Nov 2021 14:38:55 -0800 Subject: [PATCH 17/40] Apply suggestions from code review - refactored the handling of the queue - reverted line break handling to make it easier to review Co-authored-by: Erik Sundell --- nbgitpuller/handlers.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py index eae3e9ee..a8c3c94d 100644 --- a/nbgitpuller/handlers.py +++ b/nbgitpuller/handlers.py @@ -66,11 +66,10 @@ def progress_loop(self, queue): :param queue: This is either the download_queue or the original pull queue """ while True: - try: - progress = queue.get_nowait() - except Empty: - yield gen.sleep(0.1) + if queue.empty(): + yield gen.sleep(0.5) continue + progress = queue.get_nowait() if progress is None: return if isinstance(progress, Exception): @@ -116,11 +115,9 @@ def get(self): # so that all repos are always in scope after cloning. Sometimes # server_root_dir will include things like `~` and so the path # must be expanded. 
- repo_parent_dir = os.path.join(os.path.expanduser(self.settings['server_root_dir']), os.getenv('NBGITPULLER_PARENTPATH', '')) - - repo_dir = os.path.join( - repo_parent_dir, - self.get_argument('targetpath', repo.split('/')[-1])) + repo_parent_dir = os.path.join(os.path.expanduser(self.settings['server_root_dir']), + os.getenv('NBGITPULLER_PARENTPATH', '')) + repo_dir = os.path.join(repo_parent_dir, self.get_argument('targetpath', repo.split('/')[-1])) # We gonna send out event streams! self.set_header('content-type', 'text/event-stream') From 9e808e57e26f176b161bea9ef3c3539a921df21a Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Wed, 17 Nov 2021 14:41:04 -0800 Subject: [PATCH 18/40] Changes from code review - renamed progress_loop function to _wait_for_sync_progress_queue - added docstring explaining setup_plugins - renamed provider to content_provider - renamed ProviderException to ContentProviderException - await execution of the _wait_for_sync_progress_queue - removed custom ThreadWithResult class -- unused --- nbgitpuller/handlers.py | 53 ++++++++++++++++--------------- nbgitpuller/static/js/index.js | 10 +++--- nbgitpuller/templates/status.html | 2 +- 3 files changed, 33 insertions(+), 32 deletions(-) diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py index a8c3c94d..4fb537e2 100644 --- a/nbgitpuller/handlers.py +++ b/nbgitpuller/handlers.py @@ -5,7 +5,7 @@ import threading import json import os -from queue import Queue, Empty +from queue import Queue import jinja2 from .pull import GitPuller @@ -14,9 +14,9 @@ import pluggy -class ProviderException(Exception): +class ContentProviderException(Exception): """ - Custom Exception thrown when the provider key specifying + Custom Exception thrown when the content_provider key specifying the downloader plugin is not installed or can not be found by the name given """ @@ -48,18 +48,27 @@ def emit(self, data): self.write('data: {}\n\n'.format(serialized_data)) yield self.flush() - def setup_plugins(self, provider): + def setup_plugins(self, content_provider): + """ + This automatically searches for and loads packages whose entrypoint is nbgitpuller. If found, + the plugin manager object is returned and used to execute the hook implemented by + the plugin. + :param content_provider: this is the name of the content_provider; each plugin is named to identify the + content_provider of the archive to be loaded(e.g. 
googledrive, dropbox, etc) + :return: returns the PluginManager object used to call the implemented hooks of the plugin + :raises: ContentProviderException -- this occurs when the content_provider parameter is not found + """ pm = pluggy.PluginManager("nbgitpuller") pm.add_hookspecs(hookspecs) - num_loaded =pm.load_setuptools_entrypoints("nbgitpuller", name=provider) + num_loaded =pm.load_setuptools_entrypoints("nbgitpuller", name=content_provider) if num_loaded == 0: - raise ProviderException(f"The provider key you supplied in the URL could not be found: {provider}") + raise ContentProviderException(f"The content_provider key you supplied in the URL could not be found: {content_provider}") return pm @gen.coroutine - def progress_loop(self, queue): + def _wait_for_sync_progress_queue(self, queue): """ - The loop below constantly checks the queue paremeter for messages + The loop below constantly checks the queue parameter for messages that are being sent to the UI so the user is kept aware of progress related to the downloading of archives and the merging of files into the user's home folder @@ -102,7 +111,7 @@ def get(self): try: repo = self.get_argument('repo') branch = self.get_argument('branch', None) - provider = self.get_argument('provider', None) + content_provider = self.get_argument('provider', None) depth = self.get_argument('depth', None) if depth: depth = int(depth) @@ -123,13 +132,13 @@ def get(self): self.set_header('content-type', 'text/event-stream') self.set_header('cache-control', 'no-cache') - # if provider is specified then we are dealing with compressed + # if content_provider is specified then we are dealing with compressed # archive and not a git repo - if provider is not None: - pm = self.setup_plugins(provider) + if content_provider is not None: + pm = self.setup_plugins(content_provider) req_args = {k: v[0].decode() for k, v in self.request.arguments.items()} download_q = Queue() - req_args["progress_func"] = lambda: self.progress_loop(download_q) + req_args["progress_func"] = lambda: self._wait_for_sync_progress_queue(download_q) req_args["download_q"] = download_q req_args["repo_parent_dir"] = repo_parent_dir hf_args = {"query_line_args": req_args} @@ -151,11 +160,10 @@ def pull(): raise e self.gp_thread = threading.Thread(target=pull) self.gp_thread.start() - self.progress_loop(q) - yield gen.sleep(3) + yield self._wait_for_sync_progress_queue(q) self.emit({'phase': 'finished'}) - except ProviderException as pe: + except ContentProviderException as pe: self.emit({ 'phase': 'error', 'message': str(pe), @@ -203,7 +211,7 @@ def get(self): repo = self.get_argument('repo') branch = self.get_argument('branch', None) depth = self.get_argument('depth', None) - provider = self.get_argument('provider', None) + content_provider = self.get_argument('content_provider', None) urlPath = self.get_argument('urlpath', None) or \ self.get_argument('urlPath', None) subPath = self.get_argument('subpath', None) or \ @@ -224,7 +232,7 @@ def get(self): else: path = 'tree/' + path - if provider is not None: + if content_provider is not None: path = "tree/" self.write( @@ -234,7 +242,7 @@ def get(self): branch=branch, path=path, depth=depth, - provider=provider, + provider=content_provider, targetpath=targetpath, version=__version__ )) @@ -271,10 +279,3 @@ def get(self): ) self.redirect(new_url) - - -class ThreadWithResult(threading.Thread): - def __init__(self, group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None): - def function(): - self.result = target(*args, 
**kwargs) - super().__init__(group=group, target=function, name=name, daemon=daemon) diff --git a/nbgitpuller/static/js/index.js b/nbgitpuller/static/js/index.js index 60e1c624..c7206722 100644 --- a/nbgitpuller/static/js/index.js +++ b/nbgitpuller/static/js/index.js @@ -2,14 +2,14 @@ import { Terminal } from 'xterm'; import { FitAddon } from 'xterm-addon-fit'; import css from '../../../node_modules/xterm/css/xterm.css'; -function GitSync(baseUrl, repo, branch, depth, targetpath, path, provider) { +function GitSync(baseUrl, repo, branch, depth, targetpath, path, content_provider) { // Class that talks to the API backend & emits events as appropriate this.baseUrl = baseUrl; this.repo = repo; this.branch = branch; this.depth = depth; this.targetpath = targetpath; - this.provider = provider; + this.content_provider = content_provider; this.redirectUrl = baseUrl + path; this.callbacks = {}; } @@ -42,8 +42,8 @@ GitSync.prototype.start = function() { if (typeof this.branch !== 'undefined' && this.branch != undefined) { syncUrlParams['branch'] = this.branch; } - if (typeof this.provider !== 'undefined' && this.provider != undefined) { - syncUrlParams['provider'] = this.provider; + if (typeof this.content_provider !== 'undefined' && this.content_provider != undefined) { + syncUrlParams['content_provider'] = this.content_provider; } var syncUrl = this.baseUrl + 'git-pull/api?' + $.param(syncUrlParams); @@ -143,7 +143,7 @@ var gs = new GitSync( get_body_data('depth'), get_body_data('targetpath'), get_body_data('path'), - get_body_data('provider') + get_body_data('content_provider') ); var gsv = new GitSyncView( diff --git a/nbgitpuller/templates/status.html b/nbgitpuller/templates/status.html index 9f660f3c..5c8e1914 100644 --- a/nbgitpuller/templates/status.html +++ b/nbgitpuller/templates/status.html @@ -12,7 +12,7 @@ {% endblock %} {% block site %} -
+
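
A note for readers on the plugin discovery used above: load_setuptools_entrypoints("nbgitpuller", name=content_provider) asks pluggy to import any installed package that advertises an entry point in the "nbgitpuller" group whose entry-point name equals the provider key from the URL, and setup_plugins raises ContentProviderException when nothing matches. A content-provider package would therefore declare itself roughly as follows (a sketch only -- the package and module names here are illustrative, not part of this series):

    # setup.py of a hypothetical nbgitpuller-googledrive plugin package
    from setuptools import setup

    setup(
        name="nbgitpuller-googledrive",            # illustrative name
        py_modules=["nbgitpuller_googledrive"],
        install_requires=["nbgitpuller", "pluggy"],
        entry_points={
            "nbgitpuller": [
                # the entry-point name must match the provider key in the URL
                "googledrive = nbgitpuller_googledrive",
            ]
        },
    )

With that in place, a request carrying provider=googledrive loads exactly one plugin.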

From 8d63ee4b63eee1c4735002394faebf3b7e161faf Mon Sep 17 00:00:00 2001
From: sean-morris
Date: Thu, 18 Nov 2021 16:20:04 -0800
Subject: [PATCH 19/40] Apply suggestions from code review

Co-authored-by: Erik Sundell
---
 nbgitpuller/handlers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py
index 4fb537e2..1fb668c0 100644
--- a/nbgitpuller/handlers.py
+++ b/nbgitpuller/handlers.py
@@ -135,10 +135,10 @@ def get(self):
             # if content_provider is specified then we are dealing with compressed
             # archive and not a git repo
             if content_provider is not None:
-                pm = self.setup_plugins(content_provider)
+                plugin_manager = self.setup_plugins(content_provider)
                 req_args = {k: v[0].decode() for k, v in self.request.arguments.items()}
                 download_q = Queue()
-                req_args["progress_func"] = lambda: self._wait_for_sync_progress_queue(download_q)
+                req_args["progress_func"] = partial(self._wait_for_sync_progress_queue, download_q)
                 req_args["download_q"] = download_q
                 req_args["repo_parent_dir"] = repo_parent_dir
                 hf_args = {"query_line_args": req_args}

From deecc7bd9b8a676d476a9e18e342d80e26ad36d8 Mon Sep 17 00:00:00 2001
From: Sean Morris
Date: Tue, 23 Nov 2021 13:04:51 -0800
Subject: [PATCH 20/40] Removed setTerminalVisibility from automatically
 opening in UI

The console in the UI should either be manually opened by the user or
automatically opened when an error occurs.
---
 nbgitpuller/static/js/index.js | 1 -
 1 file changed, 1 deletion(-)

diff --git a/nbgitpuller/static/js/index.js b/nbgitpuller/static/js/index.js
index c7206722..d9b6af71 100644
--- a/nbgitpuller/static/js/index.js
+++ b/nbgitpuller/static/js/index.js
@@ -153,7 +153,6 @@ var gsv = new GitSyncView(
 );
 
 gs.addHandler('syncing', function(data) {
-    gsv.setTerminalVisibility(true);
     gsv.term.write(data.output);
 });
 gs.addHandler('finished', function(data) {

From a9e08c4ecf5c9003d5bdfbaaf75dd3754686db0e Mon Sep 17 00:00:00 2001
From: Sean Morris
Date: Tue, 23 Nov 2021 13:05:50 -0800
Subject: [PATCH 21/40] Reverted a mistaken change to command-line args

---
 nbgitpuller/pull.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nbgitpuller/pull.py b/nbgitpuller/pull.py
index 63b9798f..5eb45f38 100644
--- a/nbgitpuller/pull.py
+++ b/nbgitpuller/pull.py
@@ -304,7 +304,7 @@ def main():
     parser = argparse.ArgumentParser(description='Synchronizes a github repository with a local repository.')
     parser.add_argument('git_url', help='Url of the repo to sync')
     parser.add_argument('branch_name', default=None, help='Branch of repo to sync', nargs='?')
-    parser.add_argument('--target-dir', default='.', help='Path to clone repo under')
+    parser.add_argument('repo_dir', default='.', help='Path to clone repo under', nargs='?')
 
     args = parser.parse_args()

From 09c9249f1335012e2dc0fad5a4a57b5bc4a8c692 Mon Sep 17 00:00:00 2001
From: Sean Morris
Date: Tue, 23 Nov 2021 13:43:20 -0800
Subject: [PATCH 22/40] Hookspecs renamed and documented

renamed to plugin_hook_specs and added documentation
---
 nbgitpuller/hookspecs.py         | 22 ------------
 nbgitpuller/plugin_hook_specs.py | 61 ++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 22 deletions(-)
 delete mode 100644 nbgitpuller/hookspecs.py
 create mode 100644 nbgitpuller/plugin_hook_specs.py

diff --git a/nbgitpuller/hookspecs.py b/nbgitpuller/hookspecs.py
deleted file mode 100644
index 50ac933e..00000000
--- a/nbgitpuller/hookspecs.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import pluggy
-
-hookspec = pluggy.HookspecMarker("nbgitpuller")
-hookimpl = pluggy.HookimplMarker("nbgitpuller") - - -@hookspec(firstresult=True) -def handle_files(query_line_args): - """ - :param json query_line_args: this includes any argument you put on the url - :return two parameter json unzip_dir and origin_repo_path - :rtype json object - - The developer uses this function to download, un-compress and save the - source files to the TEMP_DOWNLOAD_REPO_DIR folder. - - The parameter, query_line_args, is any argument you put on the URL - - Once the files are saved to the directory, git puller can handle all the - standard functions needed to make sure source files are updated or created - as needed. - """ diff --git a/nbgitpuller/plugin_hook_specs.py b/nbgitpuller/plugin_hook_specs.py new file mode 100644 index 00000000..7e640d0c --- /dev/null +++ b/nbgitpuller/plugin_hook_specs.py @@ -0,0 +1,61 @@ +import pluggy + +# this hookspec is decorating the handle_files function below. The decorator defines +# the interface(hook specifications) for any implementing content-provider plugins. The project name, nbgitpuller, +# is passed to the constructor for HookspecMarker and HookimplMarker as well as to the constructor for the +# PluginManager in handlers.py in order to allow the PluginManager.add_hookspecs method to automatically discover +# all marked functions. +hookspec = pluggy.HookspecMarker("nbgitpuller") + +# As a convenience the hookimpl field can be used by content-provider plugins to decorate the implementations of the +# handle_files function. A content-provider plugin could create the HookImplMarker itself but in order to register +# with the PluginManager the name('nbgitpuller') must be used as we do here. +hookimpl = pluggy.HookimplMarker("nbgitpuller") + + +@hookspec(firstresult=True) +def handle_files(helper_args, query_line_args): + """ + :param json helper_args: these keyword args are passed from the main thread of nbgitpuller and include: + - repo_parent_dir: save your downloaded archive here + - wait_for_sync_progress_queue: + A partial function with an infinite loop continuously checking the download_q for messages to show the + user in the UI. + - download_q: + This is a Queue that accepts messages to be displayed in the UI. You might tell the user what percent of + the download is complete or any other progress that might inform the user. + :param json query_line_args: this includes any argument you put on the nbgitpuller URL + :return two parameter json unzip_dir and origin_repo_path + :rtype json object + + This function must be implemented by content-provider plugins in order to handle the downloading and decompression + of a non-git sourced compressed archive. + + The helper_args contain three keyword arguments that are necessary to successfully save a + compressed archive: + - repo_parent_dir: save your downloaded archive here + - wait_for_sync_progress_queue: + A partial function with an infinite loop continuously checking the download_q for messages to show the + user in the UI. + - download_q: + This is a Queue that accepts messages to be displayed in the UI. You might tell the user what percent of + the download is complete or any other progress that might inform the user.to a user's jupyterhub home drive. + + The parameter, query_line_args, contains all the arguments you put on the nbgitpuller URL link. This allows you + flexibility to pass information your content-provider download plugin may need to successfully download + source files. 
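+
+    A minimal implementation sketch (illustrative only; my_download_and_unpack is a
+    placeholder for whatever download-and-extract logic your plugin provides):
+
+        from nbgitpuller.plugin_hook_specs import hookimpl
+
+        @hookimpl
+        def handle_files(helper_args, query_line_args):
+            unzip_dir, origin_repo_path = my_download_and_unpack(
+                query_line_args["repo"], helper_args["repo_parent_dir"]
+            )
+            return {"unzip_dir": unzip_dir, "origin_repo_path": origin_repo_path}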
+
+    This function needs to return two pieces of information as a json object:
+    - unzip_dir -- the is the name of the folder you unzipped the archive into
+    - origin_repo_path -- this is path to the local git repo that "acts" like the remote origin you would use
+        if the content-provider is git.
+
+    Once the files are saved to the directory, git puller can handle all the standard functions needed to make sure
+    source files are updated or created as needed.
+
+    I suggest you study the function handle_files_helper in the plugin_helper.py file to get a deep sense of how
+    we handle the downloading of compressed archives. There is also more documentation in the docs section of
+    nbgitpuller. Finally, you can always implement the entire download process yourself and not use the
+    handle_files_helper function but please to sure understand what is being passed into and back to the nbgitpuller
+    handlers.
+    """

From 0085fab6b99866001ff902f785eded12356276aa Mon Sep 17 00:00:00 2001
From: Sean Morris
Date: Tue, 23 Nov 2021 13:45:42 -0800
Subject: [PATCH 23/40] Hookspecs name and separate helper_args

- integrated the new name for hookspecs -- plugin_hook_specs
- separated query_line_args from args used to help with the download
---
 nbgitpuller/handlers.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py
index 4fb537e2..add8e685 100644
--- a/nbgitpuller/handlers.py
+++ b/nbgitpuller/handlers.py
@@ -10,7 +10,7 @@
 from .pull import GitPuller
 from .version import __version__
-from . import hookspecs
+from . import plugin_hook_specs
 import pluggy
 
@@ -59,7 +59,7 @@ def setup_plugins(self, content_provider):
         :return: returns the PluginManager object used to call the implemented hooks of the plugin
         :raises: ContentProviderException -- this occurs when the content_provider parameter is not found
         """
         pm = pluggy.PluginManager("nbgitpuller")
-        pm.add_hookspecs(hookspecs)
+        pm.add_hookspecs(plugin_hook_specs)
         num_loaded =pm.load_setuptools_entrypoints("nbgitpuller", name=content_provider)
         if num_loaded == 0:
             raise ContentProviderException(f"The content_provider key you supplied in the URL could not be found: {content_provider}")
@@ -136,13 +136,13 @@ def get(self):
             # archive and not a git repo
             if content_provider is not None:
                 pm = self.setup_plugins(content_provider)
-                req_args = {k: v[0].decode() for k, v in self.request.arguments.items()}
+                query_line_args = {k: v[0].decode() for k, v in self.request.arguments.items()}
                 download_q = Queue()
-                req_args["progress_func"] = lambda: self._wait_for_sync_progress_queue(download_q)
-                req_args["download_q"] = download_q
-                req_args["repo_parent_dir"] = repo_parent_dir
-                hf_args = {"query_line_args": req_args}
-                results = pm.hook.handle_files(**hf_args)
+                helper_args = dict()
+                helper_args["wait_for_sync_progress_queue"] = lambda: self._wait_for_sync_progress_queue(download_q)
+                helper_args["download_q"] = download_q
+                helper_args["repo_parent_dir"] = repo_parent_dir
+                results = pm.hook.handle_files(helper_args=helper_args,query_line_args=query_line_args)
                 repo_dir = repo_parent_dir + results["unzip_dir"]
                 repo = "file://" + results["origin_repo_path"]
 
From 88ec80649f9004ca3b8ceb8bed7e5ac01490b41d Mon Sep 17 00:00:00 2001
From: Sean Morris
Date: Wed, 24 Nov 2021 11:43:21 -0800
Subject: [PATCH 24/40] Renamed for clarity

- in handlers.py and plugin_hook_specs.py unzip_dir to output_dir
- in handlers.py and index.js provider to content_provider
- added ability to extract extension if not passed in args to
  handle_files_helper
---
 nbgitpuller/handlers.py | 4 ++--
nbgitpuller/plugin_hook_specs.py | 5 +++-- nbgitpuller/static/js/index.js | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py index add8e685..e14e73f9 100644 --- a/nbgitpuller/handlers.py +++ b/nbgitpuller/handlers.py @@ -111,7 +111,7 @@ def get(self): try: repo = self.get_argument('repo') branch = self.get_argument('branch', None) - content_provider = self.get_argument('provider', None) + content_provider = self.get_argument('content_provider', None) depth = self.get_argument('depth', None) if depth: depth = int(depth) @@ -143,7 +143,7 @@ def get(self): helper_args["download_q"] = download_q helper_args["repo_parent_dir"] = repo_parent_dir results = pm.hook.handle_files(helper_args=helper_args,query_line_args=query_line_args) - repo_dir = repo_parent_dir + results["unzip_dir"] + repo_dir = repo_parent_dir + results["output_dir"] repo = "file://" + results["origin_repo_path"] gp = GitPuller(repo, repo_dir, branch=branch, depth=depth, parent=self.settings['nbapp']) diff --git a/nbgitpuller/plugin_hook_specs.py b/nbgitpuller/plugin_hook_specs.py index 7e640d0c..51358d9a 100644 --- a/nbgitpuller/plugin_hook_specs.py +++ b/nbgitpuller/plugin_hook_specs.py @@ -25,7 +25,7 @@ def handle_files(helper_args, query_line_args): This is a Queue that accepts messages to be displayed in the UI. You might tell the user what percent of the download is complete or any other progress that might inform the user. :param json query_line_args: this includes any argument you put on the nbgitpuller URL - :return two parameter json unzip_dir and origin_repo_path + :return two parameter json output_dir and origin_repo_path :rtype json object This function must be implemented by content-provider plugins in order to handle the downloading and decompression @@ -46,7 +46,8 @@ def handle_files(helper_args, query_line_args): source files. This function needs to return two pieces of information as a json object: - - unzip_dir -- the is the name of the folder you unzipped the archive into + - output_dir -- the is the name of the directory that will hold all the files you want GitPuller to expose + for comparison, when git is the source, this is name of git repository you are pulling - origin_repo_path -- this is path to the local git repo that "acts" like the remote origin you would use if the content-provider is git. 
diff --git a/nbgitpuller/static/js/index.js b/nbgitpuller/static/js/index.js
index d9b6af71..9bfe6ae2 100644
--- a/nbgitpuller/static/js/index.js
+++ b/nbgitpuller/static/js/index.js
@@ -143,7 +143,7 @@ var gs = new GitSync(
     get_body_data('depth'),
     get_body_data('targetpath'),
     get_body_data('path'),
-    get_body_data('content_provider')
+    get_body_data('provider')
 );
 
 var gsv = new GitSyncView(
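
The next patch separates what used to be one args dict into helper_args and query_line_args. To make the new calling convention concrete, driving handle_files_helper by hand (say, from a test) would look roughly like this -- a sketch under stated assumptions: the URL is made up, and a real run needs git available plus network access:

    import asyncio
    from queue import Queue
    import nbgitpuller.plugin_helper as ph

    helper_args = {
        "repo_parent_dir": "/home/jovyan/",   # where the unpacked content should land
        "download_q": Queue(),                # progress messages accumulate here
    }
    query_line_args = {
        "repo": "https://example.org/materials.zip",  # assumed archive URL
        "content_provider": "web",
    }

    result = asyncio.run(ph.handle_files_helper(helper_args, query_line_args))
    # per the diff below, result carries the extracted directory name and the
    # path of the local repo acting as origin:
    # {"output_dir": ..., "origin_repo_path": ...}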

From 8592d1f33fc07bd04f41fd4ecd9d935f3d99d31d Mon Sep 17 00:00:00 2001
From: Sean Morris
Date: Wed, 24 Nov 2021 11:45:57 -0800
Subject: [PATCH 25/40] Separated actual query_line_args from helper_args

We now pass two parameters helper_args and query_line_args to
handle_files_helper
---
 nbgitpuller/plugin_helper.py | 64 ++++++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/nbgitpuller/plugin_helper.py b/nbgitpuller/plugin_helper.py
index 02c73883..05c06f2e 100644
--- a/nbgitpuller/plugin_helper.py
+++ b/nbgitpuller/plugin_helper.py
@@ -122,9 +122,9 @@ async def execute_unarchive(ext, temp_download_file, temp_download_repo):
         yield e
 
 
-async def download_archive(args, temp_download_file):
+async def download_archive(repo_path, temp_download_file):
     """
-    :param map args: key-value pairs including the aiohttp session object and repo path
+    :param str repo_path: the git repo path
     :param str temp_download_file: the path to save the requested file to
 
     This requests the file from the repo(url) given and saves it to the disk
@@ -132,8 +132,8 @@ async def download_archive(args, temp_download_file):
     yield "Downloading archive ...\n"
     try:
         CHUNK_SIZE = 1024
-        async with args["client"] as session:
-            async with session.get(args["repo"]) as response:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(repo_path) as response:
                 with open(temp_download_file, 'ab') as fd:
                     count_chunks = 1
                     while True:
@@ -178,26 +178,40 @@ async def push_to_local_origin(temp_download_repo):
 dir_names = None
 
 
-async def handle_files_helper(args):
+async def handle_files_helper(helper_args, query_line_args):
     """
-    :param map args: key-value pairs including the repo, provider, extension, repo_parent_dir,
-                     the download function and download parameters in the case
-                     that the source needs to handle the download in a specific way(e.g. google
-                     requires a confirmation of the download)
+    :param dict helper_args: key-value pairs including the:
+        - download function
+        - download parameters in the case
+            that the source needs to handle the download in a specific way(e.g. google
+            requires a confirmation of the download)
+        - extension (e.g. zip, tar) ] [OPTIONAL] this may or may not be included. If the repo name contains
+            name of archive (e.g. example.zip) then this function can determine the extension for you; if not it
+            needs to be provided.
+    :param dict query_line_args:
+        - repo,
+        - provider,
+        - repo_parent_dir
     :return json object with the directory name of the download and
             the origin_repo_path
     :rtype json object
-
-    This does all the heavy lifting in order needed to set up your local
-    repos, origin, download the file, unarchiving and push the files
+
+    This does all the heavy lifting in the order needed to set up your local
+    repos, origin, download the file, unarchive and push the files
     back to the origin
     """
-    url = args["repo"].translate(str.maketrans('', '', string.punctuation))
-    provider = args["provider"]
-    repo_parent_dir = args["repo_parent_dir"]
+    url = query_line_args["repo"].translate(str.maketrans('', '', string.punctuation))
+    provider = query_line_args["content_provider"]
+    repo_parent_dir = helper_args["repo_parent_dir"]
     origin_repo = f"{repo_parent_dir}{CACHED_ORIGIN_NON_GIT_REPO}{provider}/{url}/"
     temp_download_repo = TEMP_DOWNLOAD_REPO_DIR
-    temp_download_file = f"{TEMP_DOWNLOAD_REPO_DIR}/download.{args['extension']}"
+    # you can optionally pass the extension of your archive(e.g zip) if it is not identifiable from the URL file name
+    # otherwise the extract_file_extension function will pull it off the repo name
+    if "extension" not in helper_args:
+        ext = extract_file_extension(query_line_args["repo"])
+    else:
+        ext = helper_args['extension']
+    temp_download_file = f"{TEMP_DOWNLOAD_REPO_DIR}/download.{ext}"
 
     async def gener():
         global dir_names
@@ -209,17 +223,16 @@ async def gener():
         async for c in clone_local_origin_repo(origin_repo, temp_download_repo):
             yield c
 
-        args["client"] = aiohttp.ClientSession()
         download_func = download_archive
-        download_args = args, temp_download_file
-        if "dowload_func" in args:
-            download_func = args["dowload_func"]
-            download_args = args["dowload_func_params"]
+        download_args = query_line_args["repo"], temp_download_file
+        if "dowload_func" in helper_args:
+            download_func = helper_args["dowload_func"]
+            download_args = helper_args["dowload_func_params"]
 
         async for d in download_func(*download_args):
             yield d
 
-        async for e in execute_unarchive(args["extension"], temp_download_file, temp_download_repo):
+        async for e in execute_unarchive(ext, temp_download_file, temp_download_repo):
             yield e
 
         os.remove(temp_download_file)
@@ -229,6 +242,7 @@ async def gener():
         unzipped_dirs = os.listdir(temp_download_repo)
         # name of the extracted directory
         dir_names = list(filter(lambda dir: ".git" not in dir and "__MACOSX" not in dir, unzipped_dirs))
+
         yield "\n\n"
         yield "Process Complete: Archive is finished importing into hub\n"
         yield f"The directory of your download is: {dir_names[0]}\n"
@@ -239,10 +253,10 @@ async def gener():
 
     try:
         async for line in gener():
-            args["download_q"].put_nowait(line)
+            helper_args["download_q"].put_nowait(line)
             await asyncio.sleep(0.1)
     except Exception as e:
-        args["download_q"].put_nowait(e)
+        helper_args["download_q"].put_nowait(e)
         raise e
-    args["download_q"].put_nowait(None)
-    return {"unzip_dir": dir_names[0], "origin_repo_path": origin_repo}
+    helper_args["download_q"].put_nowait(None)
+    return {"output_dir": dir_names[0], "origin_repo_path": origin_repo}

From ab5dd108be8325ac8d5154a0a869005cea73829c Mon Sep 17 00:00:00 2001
From: Sean Morris
Date: Wed, 24 Nov 2021 12:02:42 -0800
Subject: [PATCH 26/40] Fixed tests

- I needed to pull out the tests from test_gitpuller that should be
  in the PR related to command-line arguments
- changed the test case call to download_archive.
The signature had changed - removed unused import --- nbgitpuller/pull.py | 5 ++--- tests/test_gitpuller.py | 33 +-------------------------------- tests/test_plugin_helper.py | 4 +--- 3 files changed, 4 insertions(+), 38 deletions(-) diff --git a/nbgitpuller/pull.py b/nbgitpuller/pull.py index 5eb45f38..cc18ac97 100644 --- a/nbgitpuller/pull.py +++ b/nbgitpuller/pull.py @@ -305,13 +305,12 @@ def main(): parser.add_argument('git_url', help='Url of the repo to sync') parser.add_argument('branch_name', default=None, help='Branch of repo to sync', nargs='?') parser.add_argument('repo_dir', default='.', help='Path to clone repo under', nargs='?') - args = parser.parse_args() for line in GitPuller( args.git_url, - args.target_dir, - branch=args.branch_name + args.repo_dir, + branch=args.branch_name if args.branch_name else None ).pull(): print(line) diff --git a/tests/test_gitpuller.py b/tests/test_gitpuller.py index ef1393c1..977fecc4 100644 --- a/tests/test_gitpuller.py +++ b/tests/test_gitpuller.py @@ -99,7 +99,7 @@ def test_initialize(): def command_line_test_helper(remote_path, branch, pusher_path): work_dir = "/".join(os.path.dirname(os.path.abspath(__file__)).split("/")[:-1]) + "/nbgitpuller" try: - cmd = ['python3', 'pull.py', remote_path] + cmd = ['python3', 'pull.py', remote_path, branch, pusher_path] if branch is not None: cmd += [branch] if pusher_path is not None: @@ -113,37 +113,6 @@ def command_line_test_helper(remote_path, branch, pusher_path): return False -def test_command_line_existing_branch(): - branch = "master" - with Remote() as remote, Pusher(remote) as pusher: - pusher.push_file('README.md', '1') - remotepath = "file://%s" % os.path.abspath(remote.path) - pusherpath = os.path.abspath(pusher.path) - subprocess_result = command_line_test_helper(remotepath, branch, pusherpath) - assert subprocess_result - - -def test_command_line_no_branch_passed(): - # so it should use the default branch - branch = None - with Remote() as remote, Pusher(remote) as pusher: - pusher.push_file('README.md', '1') - remotepath = "file://%s" % os.path.abspath(remote.path) - pusherpath = os.path.abspath(pusher.path) - subprocess_result = command_line_test_helper(remotepath, branch, pusherpath) - assert subprocess_result - - -def test_command_line_non_existing_branch(): - branch = "wrong" - with Remote() as remote, Pusher(remote) as pusher: - pusher.push_file('README.md', '1') - remotepath = "file://%s" % os.path.abspath(remote.path) - pusherpath = os.path.abspath(pusher.path) - subprocess_result = command_line_test_helper(remotepath, branch, pusherpath) - assert not subprocess_result - - def test_branch_exists(): with Remote() as remote, Pusher(remote) as pusher: pusher.push_file('README.md', '1') diff --git a/tests/test_plugin_helper.py b/tests/test_plugin_helper.py index b191db74..12f5fbed 100644 --- a/tests/test_plugin_helper.py +++ b/tests/test_plugin_helper.py @@ -2,7 +2,6 @@ import pytest import shutil import nbgitpuller.plugin_helper as ph -import aiohttp from aioresponses import aioresponses test_files_dir = os.getcwd() + "/tests/test_files" @@ -95,9 +94,8 @@ async def test_download_archive(test_configuration): args["repo"] = "http://example.org/mocked-download-url" with aioresponses() as mocked: mocked.get(args["repo"], status=200, body=b'Pretend you are zip file being downloaded') - args["client"] = aiohttp.ClientSession() yield_str = "" - async for line in ph.download_archive(args, temp_archive_download + "downloaded.zip"): + async for line in ph.download_archive(args["repo"], 
temp_archive_download + "downloaded.zip"):
             yield_str += line
         assert 'Downloading archive' in yield_str
         assert os.path.isfile(temp_archive_download + "downloaded.zip")

From e8ae5ca0acd2999eae8c67477928c13294c7a548 Mon Sep 17 00:00:00 2001
From: Sean Morris
Date: Fri, 26 Nov 2021 12:28:06 -0800
Subject: [PATCH 27/40] Removed changes not meant to be merged

These tests are removed from this PR.
---
 tests/test_gitpuller.py | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/tests/test_gitpuller.py b/tests/test_gitpuller.py
index 977fecc4..0055b0da 100644
--- a/tests/test_gitpuller.py
+++ b/tests/test_gitpuller.py
@@ -100,10 +100,6 @@ def command_line_test_helper(remote_path, branch, pusher_path):
     work_dir = "/".join(os.path.dirname(os.path.abspath(__file__)).split("/")[:-1]) + "/nbgitpuller"
     try:
         cmd = ['python3', 'pull.py', remote_path, branch, pusher_path]
-        if branch is not None:
-            cmd += [branch]
-        if pusher_path is not None:
-            cmd += ['--target-dir', pusher_path]
         sp.check_output(
             cmd,
             cwd=work_dir
@@ -113,6 +109,36 @@ def command_line_test_helper(remote_path, branch, pusher_path):
     return False
 
 
+def test_command_line_existing_branch():
+    branch = "master"
+    with Remote() as remote, Pusher(remote) as pusher:
+        pusher.push_file('README.md', '1')
+        remotepath = "file://%s" % os.path.abspath(remote.path)
+        pusherpath = os.path.abspath(pusher.path)
+        subprocess_result = command_line_test_helper(remotepath, branch, pusherpath)
+        assert subprocess_result
+
+
+def test_command_line_default_branch():
+    branch = ""
+    with Remote() as remote, Pusher(remote) as pusher:
+        pusher.push_file('README.md', '1')
+        remotepath = "file://%s" % os.path.abspath(remote.path)
+        pusherpath = os.path.abspath(pusher.path)
+        subprocess_result = command_line_test_helper(remotepath, branch, pusherpath)
+        assert subprocess_result
+
+
+def test_command_line_non_existing_branch():
+    branch = "wrong"
+    with Remote() as remote, Pusher(remote) as pusher:
+        pusher.push_file('README.md', '1')
+        remotepath = "file://%s" % os.path.abspath(remote.path)
+        pusherpath = os.path.abspath(pusher.path)
+        subprocess_result = command_line_test_helper(remotepath, branch, pusherpath)
+        assert not subprocess_result
+
+
 def test_branch_exists():
     with Remote() as remote, Pusher(remote) as pusher:
         pusher.push_file('README.md', '1')
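
Among the review fixes in the next patch is a comment documenting the queue convention shared by the download side and the progress loop: a None placed on the queue marks the end of the stream. In isolation the pattern looks like this (standard library only; the names are illustrative):

    from queue import Queue

    q = Queue()
    for msg in ("Downloading archive ...", "Process Complete"):
        q.put_nowait(msg)
    q.put_nowait(None)  # sentinel: no more messages will arrive

    while True:
        item = q.get()
        if item is None:  # consumer exits cleanly on the sentinel
            break
        print(item)

This mirrors what _wait_for_sync_progress_queue does on the consuming side, with the addition that an Exception pulled off the queue is re-raised there.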

From 56ad1ee0e1eaca212faea2bfcb0c414227e05703 Mon Sep 17 00:00:00 2001
From: sean-morris
Date: Mon, 29 Nov 2021 15:20:29 -0800
Subject: [PATCH 28/40] Apply suggestions from code review

Co-authored-by: Erik Sundell
---
 nbgitpuller/handlers.py        | 12 ++++++------
 nbgitpuller/plugin_helper.py   |  1 +
 nbgitpuller/static/js/index.js |  8 ++++----
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py
index e14e73f9..06d12b97 100644
--- a/nbgitpuller/handlers.py
+++ b/nbgitpuller/handlers.py
@@ -58,12 +58,12 @@ def setup_plugins(self, content_provider):
         :return: returns the PluginManager object used to call the implemented hooks of the plugin
         :raises: ContentProviderException -- this occurs when the content_provider parameter is not found
         """
-        pm = pluggy.PluginManager("nbgitpuller")
-        pm.add_hookspecs(plugin_hook_specs)
-        num_loaded =pm.load_setuptools_entrypoints("nbgitpuller", name=content_provider)
+        plugin_manager = pluggy.PluginManager("nbgitpuller")
+        plugin_manager.add_hookspecs(plugin_hook_specs)
+        num_loaded = plugin_manager.load_setuptools_entrypoints("nbgitpuller", name=content_provider)
         if num_loaded == 0:
             raise ContentProviderException(f"The content_provider key you supplied in the URL could not be found: {content_provider}")
-        return pm
+        return plugin_manager
 
@@ -135,14 +135,14 @@ def get(self):
             # archive and not a git repo
             if content_provider is not None:
-                pm = self.setup_plugins(content_provider)
+                plugin_manager = self.setup_plugins(content_provider)
                 query_line_args = {k: v[0].decode() for k, v in self.request.arguments.items()}
                 download_q = Queue()
                 helper_args = dict()
                 helper_args["wait_for_sync_progress_queue"] = lambda: self._wait_for_sync_progress_queue(download_q)
                 helper_args["download_q"] = download_q
                 helper_args["repo_parent_dir"] = repo_parent_dir
-                results = pm.hook.handle_files(helper_args=helper_args,query_line_args=query_line_args)
+                results = plugin_manager.hook.handle_files(helper_args=helper_args,query_line_args=query_line_args)

diff --git a/nbgitpuller/plugin_helper.py b/nbgitpuller/plugin_helper.py
index 05c06f2e..4fcd4389 100644
--- a/nbgitpuller/plugin_helper.py
+++ b/nbgitpuller/plugin_helper.py
@@ -258,5 +258,6 @@ async def gener():
     except Exception as e:
         helper_args["download_q"].put_nowait(e)
         raise e
+    # mark the end of the queue with a None value
     helper_args["download_q"].put_nowait(None)
     return {"output_dir": dir_names[0], "origin_repo_path": origin_repo}

diff --git a/nbgitpuller/static/js/index.js b/nbgitpuller/static/js/index.js
index 9bfe6ae2..2adffcc4 100644
--- a/nbgitpuller/static/js/index.js
+++ b/nbgitpuller/static/js/index.js
@@ -2,14 +2,14 @@ import { Terminal } from 'xterm';
 import { FitAddon } from 'xterm-addon-fit';
 import css from '../../../node_modules/xterm/css/xterm.css';
 
-function GitSync(baseUrl, repo, branch, depth, targetpath, path, content_provider) {
+function GitSync(baseUrl, repo, branch, depth, targetpath, path, contentProvider) {
     // Class that talks to the API backend & emits events as appropriate
     this.baseUrl = baseUrl;
     this.repo = repo;
     this.branch = branch;
     this.depth = depth;
     this.targetpath = targetpath;
-    this.content_provider = content_provider;
+    this.contentProvider = contentProvider;
     this.redirectUrl = baseUrl + path;
     this.callbacks = {};
 }
@@ -42,8 +42,8 @@ GitSync.prototype.start = function() {
     if (typeof this.branch !== 'undefined' && this.branch != undefined) {
         syncUrlParams['branch'] = this.branch;
     }
-    if (typeof this.content_provider !== 'undefined' && this.content_provider != undefined) {
-        syncUrlParams['content_provider'] = this.content_provider;
+    if (typeof this.contentProvider !== 'undefined' && this.contentProvider != undefined) {
+        syncUrlParams['content_provider'] = this.contentProvider;
     }
 
     var syncUrl = this.baseUrl + 'git-pull/api?' + $.param(syncUrlParams);

From 782a35ba9027e93fde3d470962e609028aa9acb2 Mon Sep 17 00:00:00 2001
From: Sean Morris
Date: Mon, 29 Nov 2021 15:32:25 -0800
Subject: [PATCH 30/40] Refactored docstrings

- put description first in each docstring
  - not sure what came over me here!
- fixed parameter description in handlers.py --- nbgitpuller/handlers.py | 2 +- nbgitpuller/plugin_helper.py | 45 ++++++++++++++++---------------- nbgitpuller/plugin_hook_specs.py | 24 ++++++++--------- 3 files changed, 36 insertions(+), 35 deletions(-) diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py index 06d12b97..b131f5a2 100644 --- a/nbgitpuller/handlers.py +++ b/nbgitpuller/handlers.py @@ -72,7 +72,7 @@ def _wait_for_sync_progress_queue(self, queue): that are being sent to the UI so the user is kept aware of progress related to the downloading of archives and the merging of files into the user's home folder - :param queue: This is either the download_queue or the original pull queue + :param queue: download_queue or the original pull queue """ while True: if queue.empty(): diff --git a/nbgitpuller/plugin_helper.py b/nbgitpuller/plugin_helper.py index 4fcd4389..6a3ca7ce 100644 --- a/nbgitpuller/plugin_helper.py +++ b/nbgitpuller/plugin_helper.py @@ -18,10 +18,10 @@ async def execute_cmd(cmd, **kwargs): """ + Call given command, yielding output line by line + :param array cmd: the commands to be executed :param json kwargs: potential keyword args included with command - - Call given command, yielding output line by line """ yield '$ {}\n'.format(' '.join(cmd)) kwargs['stdout'] = subprocess.PIPE @@ -57,10 +57,10 @@ def flush(): async def initialize_local_repo(local_repo_path): """ - :param str local_repo_path: the locla path where the git repo is initialized - Sets up the a local repo that acts like a remote; yields the output from the git init + + :param str local_repo_path: the locla path where the git repo is initialized """ yield "Initializing repo ...\n" logging.info(f"Creating local_repo_path: {local_repo_path}") @@ -71,15 +71,15 @@ async def initialize_local_repo(local_repo_path): async def clone_local_origin_repo(origin_repo_path, temp_download_repo): """ - :param str origin_repo_path: the local path we used to git init into - :param str temp_download_repo: folder where the compressed archive - is downloaded to - Cloned the origin(which is local) to the folder, temp_download_repo. The folder, temp_download_repo, acts like the space where someone makes changes to master notebooks and then pushes the changes to origin. In other words, the folder, temp_download_repo, is where the compressed archive is downloaded, unarchived, and then pushed to the origin. + + :param str origin_repo_path: the local path we used to git init into + :param str temp_download_repo: folder where the compressed archive + is downloaded to """ yield "Cloning repo ...\n" if os.path.exists(temp_download_repo): @@ -94,10 +94,11 @@ async def clone_local_origin_repo(origin_repo_path, temp_download_repo): def extract_file_extension(url): """ + The file extension(eg. zip, tgz, etc) is extracted from the url to facilitate de-compressing the file + using the correct application -- (zip, tar). + :param str url: the url contains the extension we need to determine what kind of compression is used on the file being downloaded - - this is needed to unarchive various formats(eg. 
zip, tgz, etc) """ u = urlparse(url) url_arr = u.path.split(".") @@ -108,11 +109,11 @@ def extract_file_extension(url): async def execute_unarchive(ext, temp_download_file, temp_download_repo): """ + un-archives file using unzip or tar to the temp_download_repo + :param str ext: extension used to determine type of compression :param str temp_download_file: the file path to be unarchived :param str temp_download_repo: where the file is unarchived to - - un-archives file using unzip or tar to the temp_download_repo """ if ext == 'zip': cmd_arr = ['unzip', "-qo", temp_download_file, "-d", temp_download_repo] @@ -124,10 +125,10 @@ async def execute_unarchive(ext, temp_download_file, temp_download_repo): async def download_archive(repo_path, temp_download_file): """ + This requests the file from the repo(url) given and saves it to the disk + :param str repo_path: the git repo path :param str temp_download_file: the path to save the requested file to - - This requests the file from the repo(url) given and saves it to the disk """ yield "Downloading archive ...\n" try: @@ -153,10 +154,10 @@ async def download_archive(repo_path, temp_download_file): async def push_to_local_origin(temp_download_repo): """ + The unarchived files are pushed back to the origin + :param str temp_download_repo: the current working directly of folder where the archive had been downloaded and unarchived - - The unarchived files are pushed back to the origin """ async for e in execute_cmd(["git", "add", "."], cwd=temp_download_repo): yield e @@ -172,14 +173,18 @@ async def push_to_local_origin(temp_download_repo): yield e -# this is needed becuase in handle_files_helper I can not return +# this is needed because in handle_files_helper I can not return # from the async generator so it needs a global variable to hold the -# director name of the files downloaded +# directory names of the files downloaded dir_names = None async def handle_files_helper(helper_args, query_line_args): """ + This does all the heavy lifting in the order needed to set up your local + repos, origin, download the file, unarchive and push the files + back to the origin + :param dict helper_args: key-value pairs including the: - download function - download parameters in the case @@ -195,10 +200,6 @@ async def handle_files_helper(helper_args, query_line_args): :return json object with the directory name of the download and the origin_repo_path :rtype json object - - This does all the heavy lifting in the order needed to set up your local - repos, origin, download the file, unarchive and push the files - back to the origin """ url = query_line_args["repo"].translate(str.maketrans('', '', string.punctuation)) provider = query_line_args["content_provider"] diff --git a/nbgitpuller/plugin_hook_specs.py b/nbgitpuller/plugin_hook_specs.py index 51358d9a..46a877c1 100644 --- a/nbgitpuller/plugin_hook_specs.py +++ b/nbgitpuller/plugin_hook_specs.py @@ -16,18 +16,6 @@ @hookspec(firstresult=True) def handle_files(helper_args, query_line_args): """ - :param json helper_args: these keyword args are passed from the main thread of nbgitpuller and include: - - repo_parent_dir: save your downloaded archive here - - wait_for_sync_progress_queue: - A partial function with an infinite loop continuously checking the download_q for messages to show the - user in the UI. - - download_q: - This is a Queue that accepts messages to be displayed in the UI. You might tell the user what percent of - the download is complete or any other progress that might inform the user. 
- :param json query_line_args: this includes any argument you put on the nbgitpuller URL - :return two parameter json output_dir and origin_repo_path - :rtype json object - This function must be implemented by content-provider plugins in order to handle the downloading and decompression of a non-git sourced compressed archive. @@ -59,4 +47,16 @@ def handle_files(helper_args, query_line_args): nbgitpuller. Finally, you can always implement the entire download process yourself and not use the handle_files_helper function but please to sure understand what is being passed into and back to the nbgitpuller handlers. + + :param json helper_args: these keyword args are passed from the main thread of nbgitpuller and include: + - repo_parent_dir: save your downloaded archive here + - wait_for_sync_progress_queue: + A partial function with an infinite loop continuously checking the download_q for messages to show the + user in the UI. + - download_q: + This is a Queue that accepts messages to be displayed in the UI. You might tell the user what percent of + the download is complete or any other progress that might inform the user. + :param json query_line_args: this includes any argument you put on the nbgitpuller URL + :return two parameter json output_dir and origin_repo_path + :rtype json object """ From 972946486c7041759f8d9b7aa1407f795f52a212 Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Mon, 29 Nov 2021 18:25:20 -0800 Subject: [PATCH 31/40] Fix temp download dir to use the package tempfile - removed the global declaration of TEMP_DOWNLOAD_REPO_DIR - the directory is now created by tempfile.TemporaryDirectory --- nbgitpuller/plugin_helper.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/nbgitpuller/plugin_helper.py b/nbgitpuller/plugin_helper.py index 6a3ca7ce..9218cfea 100644 --- a/nbgitpuller/plugin_helper.py +++ b/nbgitpuller/plugin_helper.py @@ -7,15 +7,13 @@ import shutil from urllib.parse import urlparse from functools import partial - -# this is a temporary folder used to download the archive into before -# it is decompressed and brought into the users drive -TEMP_DOWNLOAD_REPO_DIR = "/tmp/temp_download_repo" +import tempfile # this is the path to the local origin repository that nbgitpuller uses to mimic # a remote repo in GitPuller CACHED_ORIGIN_NON_GIT_REPO = ".nbgitpuller/targets/" + async def execute_cmd(cmd, **kwargs): """ Call given command, yielding output line by line @@ -205,14 +203,14 @@ async def handle_files_helper(helper_args, query_line_args): provider = query_line_args["content_provider"] repo_parent_dir = helper_args["repo_parent_dir"] origin_repo = f"{repo_parent_dir}{CACHED_ORIGIN_NON_GIT_REPO}{provider}/{url}/" - temp_download_repo = TEMP_DOWNLOAD_REPO_DIR + temp_download_dir = tempfile.TemporaryDirectory(dir="/tmp") # you can optionally pass the extension of your archive(e.g zip) if it is not identifiable from the URL file name # otherwise the extract_file_extension function will pull it off the repo name if "extension" not in helper_args: ext = extract_file_extension(query_line_args["repo"]) else: ext = helper_args['extension'] - temp_download_file = f"{TEMP_DOWNLOAD_REPO_DIR}/download.{ext}" + temp_download_file = f"{temp_download_dir.name}/download.{ext}" async def gener(): global dir_names @@ -221,7 +219,7 @@ async def gener(): async for i in initialize_local_repo(origin_repo): yield i - async for c in clone_local_origin_repo(origin_repo, temp_download_repo): + async for c in clone_local_origin_repo(origin_repo, 
temp_download_dir.name): yield c download_func = download_archive @@ -233,21 +231,21 @@ async def gener(): async for d in download_func(*download_args): yield d - async for e in execute_unarchive(ext, temp_download_file, temp_download_repo): + async for e in execute_unarchive(ext, temp_download_file, temp_download_dir.name): yield e os.remove(temp_download_file) - async for p in push_to_local_origin(temp_download_repo): + async for p in push_to_local_origin(temp_download_dir.name): yield p - unzipped_dirs = os.listdir(temp_download_repo) + unzipped_dirs = os.listdir(temp_download_dir.name) # name of the extracted directory dir_names = list(filter(lambda dir: ".git" not in dir and "__MACOSX" not in dir, unzipped_dirs)) yield "\n\n" yield "Process Complete: Archive is finished importing into hub\n" yield f"The directory of your download is: {dir_names[0]}\n" - shutil.rmtree(temp_download_repo) # remove temporary download space + temp_download_dir.cleanup() # remove temporary download space except Exception as e: logging.exception(e) raise ValueError(e) From 602ef01f541efde0cdb55c57e7652079f9f6aede Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Mon, 29 Nov 2021 19:32:54 -0800 Subject: [PATCH 32/40] provider is now contentProvider in the html/js/query parameters --- nbgitpuller/handlers.py | 6 +++--- nbgitpuller/plugin_helper.py | 2 +- nbgitpuller/static/js/index.js | 4 ++-- nbgitpuller/templates/status.html | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py index b131f5a2..decf3771 100644 --- a/nbgitpuller/handlers.py +++ b/nbgitpuller/handlers.py @@ -111,7 +111,7 @@ def get(self): try: repo = self.get_argument('repo') branch = self.get_argument('branch', None) - content_provider = self.get_argument('content_provider', None) + content_provider = self.get_argument('contentProvider', None) depth = self.get_argument('depth', None) if depth: depth = int(depth) @@ -211,7 +211,7 @@ def get(self): repo = self.get_argument('repo') branch = self.get_argument('branch', None) depth = self.get_argument('depth', None) - content_provider = self.get_argument('content_provider', None) + content_provider = self.get_argument('contentProvider', None) urlPath = self.get_argument('urlpath', None) or \ self.get_argument('urlPath', None) subPath = self.get_argument('subpath', None) or \ @@ -242,7 +242,7 @@ def get(self): branch=branch, path=path, depth=depth, - provider=content_provider, + contentProvider=content_provider, targetpath=targetpath, version=__version__ )) diff --git a/nbgitpuller/plugin_helper.py b/nbgitpuller/plugin_helper.py index 9218cfea..17a26295 100644 --- a/nbgitpuller/plugin_helper.py +++ b/nbgitpuller/plugin_helper.py @@ -200,7 +200,7 @@ async def handle_files_helper(helper_args, query_line_args): :rtype json object """ url = query_line_args["repo"].translate(str.maketrans('', '', string.punctuation)) - provider = query_line_args["content_provider"] + provider = query_line_args["contentProvider"] repo_parent_dir = helper_args["repo_parent_dir"] origin_repo = f"{repo_parent_dir}{CACHED_ORIGIN_NON_GIT_REPO}{provider}/{url}/" temp_download_dir = tempfile.TemporaryDirectory(dir="/tmp") diff --git a/nbgitpuller/static/js/index.js b/nbgitpuller/static/js/index.js index 2adffcc4..f51332d1 100644 --- a/nbgitpuller/static/js/index.js +++ b/nbgitpuller/static/js/index.js @@ -43,7 +43,7 @@ GitSync.prototype.start = function() { syncUrlParams['branch'] = this.branch; } if (typeof this.contentProvider !== 'undefined' && 
this.contentProvider != undefined) { - syncUrlParams['content_provider'] = this.contentProvider; + syncUrlParams['contentProvider'] = this.contentProvider; } var syncUrl = this.baseUrl + 'git-pull/api?' + $.param(syncUrlParams); @@ -143,7 +143,7 @@ var gs = new GitSync( get_body_data('depth'), get_body_data('targetpath'), get_body_data('path'), - get_body_data('provider') + get_body_data('contentProvider') ); var gsv = new GitSyncView( diff --git a/nbgitpuller/templates/status.html b/nbgitpuller/templates/status.html index 5c8e1914..3d8a0e18 100644 --- a/nbgitpuller/templates/status.html +++ b/nbgitpuller/templates/status.html @@ -7,7 +7,7 @@ data-path="{{ path | urlencode }}" {% if branch %}data-branch="{{ branch | urlencode }}"{% endif %} {% if depth %}data-depth="{{ depth | urlencode }}"{% endif %} -{% if provider %}data-provider="{{ provider | urlencode }}"{% endif %} +{% if contentProvider %}data-content-provider="{{ contentProvider | urlencode }}"{% endif %} data-targetpath="{{ targetpath | urlencode }}" {% endblock %} From 3ebdc7e67400fca2b7e6e9fbed0d5c95a01e0a08 Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Tue, 30 Nov 2021 15:55:10 -0800 Subject: [PATCH 33/40] The download_func and download_func_params brought in separately - can now pass in a custom download function and/or custom download function parameters. - the temp_download_file is added to custom download_func_params -- I did this so that a plugin does not need to know about the temp_download_file nor try to handle it. - the download_archive function now uses keywords: repo and temp_download_file - moved the tempfile cleanup to a finally - removed the dir specification for tempfile --- nbgitpuller/plugin_helper.py | 37 ++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/nbgitpuller/plugin_helper.py b/nbgitpuller/plugin_helper.py index 17a26295..69a87306 100644 --- a/nbgitpuller/plugin_helper.py +++ b/nbgitpuller/plugin_helper.py @@ -121,18 +121,18 @@ async def execute_unarchive(ext, temp_download_file, temp_download_repo): yield e -async def download_archive(repo_path, temp_download_file): +async def download_archive(repo=None, temp_download_file=None): """ This requests the file from the repo(url) given and saves it to the disk - :param str repo_path: the git repo path + :param str repo: the git repo path :param str temp_download_file: the path to save the requested file to """ yield "Downloading archive ...\n" try: CHUNK_SIZE = 1024 async with aiohttp.ClientSession() as session: - async with session.get(repo_path) as response: + async with session.get(repo) as response: with open(temp_download_file, 'ab') as fd: count_chunks = 1 while True: @@ -184,8 +184,8 @@ async def handle_files_helper(helper_args, query_line_args): back to the origin :param dict helper_args: key-value pairs including the: - - download function - - download parameters in the case + - download_func download function + - download_func_params download parameters in the case that the source needs to handle the download in a specific way(e.g. google requires a confirmation of the download) - extension (e.g. zip, tar) ] [OPTIONAL] this may or may not be included. 
If the repo name contains
             name of archive (e.g. example.zip) then this function can determine the extension for you; if not it
             needs to be provided.
     :param dict query_line_args:
         - repo,
         - provider,
         - repo_parent_dir
     :return json object with the directory name of the download and
         the origin_repo_path
     :rtype json object
@@ -203,7 +203,7 @@ async def handle_files_helper(helper_args, query_line_args):
     provider = query_line_args["contentProvider"]
     repo_parent_dir = helper_args["repo_parent_dir"]
     origin_repo = f"{repo_parent_dir}{CACHED_ORIGIN_NON_GIT_REPO}{provider}/{url}/"
-    temp_download_dir = tempfile.TemporaryDirectory(dir="/tmp")
+    temp_download_dir = tempfile.TemporaryDirectory()
     # you can optionally pass the extension of your archive(e.g zip) if it is not identifiable from the URL file name
     # otherwise the extract_file_extension function will pull it off the repo name
     if "extension" not in helper_args:
@@ -223,12 +223,20 @@ async def gener():
             yield c
 
         download_func = download_archive
-        download_args = query_line_args["repo"], temp_download_file
-        if "dowload_func" in helper_args:
-            download_func = helper_args["dowload_func"]
-            download_args = helper_args["dowload_func_params"]
-
-        async for d in download_func(*download_args):
+        download_args = {
+            "repo": query_line_args["repo"],
+            "temp_download_file": temp_download_file
+        }
+        # you can pass your own download function as well as download function parameters
+        # if they are different from the standard download function and parameters. Notice I add
+        # the temp_download_file to the parameters
+        if "download_func" in helper_args:
+            download_func = helper_args["download_func"]
+        if "download_func_params" in helper_args:
+            helper_args["download_func_params"]["temp_download_file"] = temp_download_file
+            download_args = helper_args["download_func_params"]
+
+        async for d in download_func(**download_args):
             yield d
 
         async for e in execute_unarchive(ext, temp_download_file, temp_download_dir.name):
             yield e
 
@@ -245,11 +253,12 @@ async def gener():
         yield "\n\n"
         yield "Process Complete: Archive is finished importing into hub\n"
         yield f"The directory of your download is: {dir_names[0]}\n"
-        temp_download_dir.cleanup()  # remove temporary download space
+
     except Exception as e:
         logging.exception(e)
         raise ValueError(e)
-
+    finally:
+        temp_download_dir.cleanup()  # remove temporary download space
     try:
         async for line in gener():
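
With patch 33 in place, a plugin that cannot use the stock HTTP download (Google Drive's confirmation handshake is the motivating case) can swap in its own coroutine. A hedged sketch of such an override -- the function name and the confirm query parameter are assumptions, not part of this series:

    import aiohttp

    async def download_archive_custom(repo=None, temp_download_file=None):
        # same contract as the stock download_archive: an async generator that
        # yields progress strings and writes the archive to temp_download_file;
        # handle_files_helper injects temp_download_file into the params itself
        yield "Downloading archive (custom) ...\n"
        async with aiohttp.ClientSession() as session:
            async with session.get(repo, params={"confirm": "t"}) as response:
                with open(temp_download_file, "wb") as fd:
                    async for chunk in response.content.iter_chunked(1024):
                        fd.write(chunk)
        yield "Download finished.\n"

    helper_args["download_func"] = download_archive_custom
    helper_args["download_func_params"] = {"repo": query_line_args["repo"]}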
google - requires a confirmation of the download) - - extension (e.g. zip, tar) ] [OPTIONAL] this may or may not be included. If the repo name contains - name of archive (e.g. example.zip) then this function can determine the extension for you; if not it - needs to be provided. - :param dict query_line_args: - - repo, - - provider, - - repo_parent_dir - :return json object with the directory name of the download and - the origin_repo_path - :rtype json object + This class is needed to handle the use of dir_names inside the async generator as well as in the return object for + the function handle_files_helper. """ - url = query_line_args["repo"].translate(str.maketrans('', '', string.punctuation)) - provider = query_line_args["contentProvider"] - repo_parent_dir = helper_args["repo_parent_dir"] - origin_repo = f"{repo_parent_dir}{CACHED_ORIGIN_NON_GIT_REPO}{provider}/{url}/" - temp_download_dir = tempfile.TemporaryDirectory() - # you can optionally pass the extension of your archive(e.g zip) if it is not identifiable from the URL file name - # otherwise the extract_file_extension function will pull it off the repo name - if "extension" not in helper_args: - ext = extract_file_extension(query_line_args["repo"]) - else: - ext = helper_args['extension'] - temp_download_file = f"{temp_download_dir.name}/download.{ext}" + def __init__(self, helper_args, query_line_args): + """ + This sets up the helper_args and query_line_args for use in the handle_files_helper and gener functions. + + :param dict helper_args: key-value pairs including the: + - download_func download function + - download_func_params download parameters in the case + that the source needs to handle the download in a specific way(e.g. google + requires a confirmation of the download) + - extension (e.g. zip, tar) ] [OPTIONAL] this may or may not be included. If the repo name contains + name of archive (e.g. example.zip) then this function can determine the extension for you; if not it + needs to be provided. + :param dict query_line_args: + - repo, + - provider, + - repo_parent_dir + :param helper_args: + :param query_line_args: + """ + self.dir_names = None + self.url = query_line_args["repo"].translate(str.maketrans('', '', string.punctuation)) + self.content_provider = query_line_args["contentProvider"] + self.repo = query_line_args["repo"] + self.repo_parent_dir = helper_args["repo_parent_dir"] + self.download_q = helper_args["download_q"] + self.origin_repo = f"{self.repo_parent_dir}{CACHED_ORIGIN_NON_GIT_REPO}{self.content_provider}/{self.url}/" + self.temp_download_dir = tempfile.TemporaryDirectory() + + # you can optionally pass the extension of your archive(e.g zip) if it is not identifiable from the URL file name + # otherwise the extract_file_extension function will pull it off the repo name + if "extension" not in helper_args: + self.ext = extract_file_extension(query_line_args["repo"]) + else: + self.ext = helper_args['extension'] + self.temp_download_file = f"{self.temp_download_dir.name}/download.{self.ext}" + self.download_func = download_archive + self.download_args = { + "repo": self.repo, + "temp_download_file": self.temp_download_file + } + + # you can pass your own download function as well as download function parameters + # if they are different from the standard download function and parameters. 
Notice I add + # the temp_download_file to the parameters + if "download_func" in helper_args: + self.download_func = helper_args["download_func"] + if "download_func_params" in helper_args: + helper_args["download_func_params"]["temp_download_file"] = self.temp_download_file + self.download_args = helper_args["download_func_params"] + + async def gener(self): + """ + This does all the heavy lifting in the order needed to set up your local + repos, origin, download the file, unarchive and push the files + back to the origin + """ - async def gener(): - global dir_names try: - if not os.path.exists(origin_repo): - async for i in initialize_local_repo(origin_repo): + if not os.path.exists(self.origin_repo): + async for i in initialize_local_repo(self.origin_repo): yield i - async for c in clone_local_origin_repo(origin_repo, temp_download_dir.name): + async for c in clone_local_origin_repo(self.origin_repo, self.temp_download_dir.name): yield c - download_func = download_archive - download_args = { - "repo": query_line_args["repo"], - "temp_download_file": temp_download_file - } - # you can pass your own download function as well as download function parameters - # if they are different from the standard download function and parameters. Notice I add - # the temp_download_file to the parameters - if "download_func" in helper_args: - download_func = helper_args["download_func"] - if "download_func_params" in helper_args: - helper_args["download_func_params"]["temp_download_file"] = temp_download_file - download_args = helper_args["download_func_params"] - - async for d in download_func(**download_args): + async for d in self.download_func(**self.download_args): yield d - async for e in execute_unarchive(ext, temp_download_file, temp_download_dir.name): + async for e in execute_unarchive(self.ext, self.temp_download_file, self.temp_download_dir.name): yield e - os.remove(temp_download_file) - async for p in push_to_local_origin(temp_download_dir.name): + os.remove(self.temp_download_file) + async for p in push_to_local_origin(self.temp_download_dir.name): yield p - unzipped_dirs = os.listdir(temp_download_dir.name) + unzipped_dirs = os.listdir(self.temp_download_dir.name) # name of the extracted directory - dir_names = list(filter(lambda dir: ".git" not in dir and "__MACOSX" not in dir, unzipped_dirs)) + self.dir_names = list(filter(lambda dir: ".git" not in dir and "__MACOSX" not in dir, unzipped_dirs)) yield "\n\n" yield "Process Complete: Archive is finished importing into hub\n" - yield f"The directory of your download is: {dir_names[0]}\n" + yield f"The directory of your download is: {self.dir_names[0]}\n" except Exception as e: logging.exception(e) raise ValueError(e) finally: - temp_download_dir.cleanup() # remove temporary download space - try: - async for line in gener(): - helper_args["download_q"].put_nowait(line) - await asyncio.sleep(0.1) - except Exception as e: - helper_args["download_q"].put_nowait(e) - raise e - # mark the end of the queue with a None value - helper_args["download_q"].put_nowait(None) - return {"output_dir": dir_names[0], "origin_repo_path": origin_repo} + self.temp_download_dir.cleanup() # remove temporary download space + + async def handle_files_helper(self): + """ + This calls the async generator function and handle the storing of messages from the gener() function + into the download_q + + :return json object with the directory name of the download and + the origin_repo_path + :rtype json object + """ + try: + async for line in self.gener(): + 
self.download_q.put_nowait(line) + await asyncio.sleep(0.1) + except Exception as e: + self.download_q.put_nowait(e) + raise e + # mark the end of the queue with a None value + self.download_q.put_nowait(None) + return {"output_dir": self.dir_names[0], "origin_repo_path": self.origin_repo} From 3b14405fca03c0a163742f1e8b65f3e7e81969df Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Mon, 20 Dec 2021 14:23:49 -0800 Subject: [PATCH 35/40] Moved downloader-plugin util to own repo The downloader-plugin utilities and tests are now in their own repo with the other downloader plugins. --- .gitignore | 2 + dev-requirements.txt | 4 +- nbgitpuller/plugin_helper.py | 286 ----------------------------------- tests/test_files/test.txt | 13 -- tests/test_plugin_helper.py | 101 ------------- 5 files changed, 3 insertions(+), 403 deletions(-) delete mode 100644 nbgitpuller/plugin_helper.py delete mode 100644 tests/test_files/test.txt delete mode 100644 tests/test_plugin_helper.py diff --git a/.gitignore b/.gitignore index 40f71359..51520b99 100644 --- a/.gitignore +++ b/.gitignore @@ -21,7 +21,9 @@ docs/_build jupyterhub.sqlite jupyterhub_cookie_secret +/jupyterhub-proxy.pid node_modules/ package-lock.json nbgitpuller/static/dist + diff --git a/dev-requirements.txt b/dev-requirements.txt index 03f5a7ee..bb21db7a 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,6 +2,4 @@ jupyter-packaging>=0.10 pytest pytest-cov flake8 -nbclassic -aioresponses -pytest-asyncio \ No newline at end of file +nbclassic \ No newline at end of file diff --git a/nbgitpuller/plugin_helper.py b/nbgitpuller/plugin_helper.py deleted file mode 100644 index ae355ec2..00000000 --- a/nbgitpuller/plugin_helper.py +++ /dev/null @@ -1,286 +0,0 @@ -import string -import os -import logging -import aiohttp -import asyncio -import subprocess -import shutil -from urllib.parse import urlparse -from functools import partial -import tempfile - -# this is the path to the local origin repository that nbgitpuller uses to mimic -# a remote repo in GitPuller -CACHED_ORIGIN_NON_GIT_REPO = ".nbgitpuller/targets/" - - -async def execute_cmd(cmd, **kwargs): - """ - Call given command, yielding output line by line - - :param array cmd: the commands to be executed - :param json kwargs: potential keyword args included with command - """ - yield '$ {}\n'.format(' '.join(cmd)) - kwargs['stdout'] = subprocess.PIPE - kwargs['stderr'] = subprocess.STDOUT - - proc = subprocess.Popen(cmd, **kwargs) - - # Capture output for logging. - # Each line will be yielded as text. - # This should behave the same as .readline(), but splits on `\r` OR `\n`, - # not just `\n`. 
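# The comment above captures a subtle technique worth illustrating: yielding a
# "line" on \r as well as \n lets carriage-return progress redraws (git, curl)
# stream to the browser as they happen. A self-contained sketch of that
# buffering loop, assuming an in-memory byte string in place of proc.stdout:
def split_progress(stream_bytes):
    buf = []
    prev = b""
    for c in (stream_bytes[i:i + 1] for i in range(len(stream_bytes))):
        if prev == b"\r" and buf and c != b"\n":
            yield b"".join(buf).decode("utf8", "replace")  # flush on bare \r
            buf.clear()
        buf.append(c)
        if c == b"\n":
            yield b"".join(buf).decode("utf8", "replace")  # flush on newline
            buf.clear()
        prev = c
    if buf:
        yield b"".join(buf).decode("utf8", "replace")      # trailing partial line

assert list(split_progress(b"50%\r100%\ndone\n")) == ["50%\r", "100%\n", "done\n"]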
- buf = [] - - def flush(): - line = b''.join(buf).decode('utf8', 'replace') - buf[:] = [] - return line - - c_last = '' - try: - for c in iter(partial(proc.stdout.read, 1), b''): - if c_last == b'\r' and buf and c != b'\n': - yield flush() - buf.append(c) - if c == b'\n': - yield flush() - c_last = c - finally: - ret = proc.wait() - if ret != 0: - raise subprocess.CalledProcessError(ret, cmd) - - -async def initialize_local_repo(local_repo_path): - """ - Sets up the a local repo that acts like a remote; yields the - output from the git init - - :param str local_repo_path: the locla path where the git repo is initialized - """ - yield "Initializing repo ...\n" - logging.info(f"Creating local_repo_path: {local_repo_path}") - os.makedirs(local_repo_path, exist_ok=True) - async for e in execute_cmd(["git", "init", "--bare"], cwd=local_repo_path): - yield e - - -async def clone_local_origin_repo(origin_repo_path, temp_download_repo): - """ - Cloned the origin(which is local) to the folder, temp_download_repo. - The folder, temp_download_repo, acts like the space where someone makes changes - to master notebooks and then pushes the changes to origin. In other words, - the folder, temp_download_repo, is where the compressed archive is downloaded, - unarchived, and then pushed to the origin. - - :param str origin_repo_path: the local path we used to git init into - :param str temp_download_repo: folder where the compressed archive - is downloaded to - """ - yield "Cloning repo ...\n" - if os.path.exists(temp_download_repo): - shutil.rmtree(temp_download_repo) - logging.info(f"Creating temp_download_repo: {temp_download_repo}") - os.makedirs(temp_download_repo, exist_ok=True) - - cmd = ["git", "clone", f"file://{origin_repo_path}", temp_download_repo] - async for e in execute_cmd(cmd, cwd=temp_download_repo): - yield e - - -def extract_file_extension(url): - """ - The file extension(eg. zip, tgz, etc) is extracted from the url to facilitate de-compressing the file - using the correct application -- (zip, tar). 
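# extract_file_extension, continuing below, keys the unzip-vs-tar decision off
# the last dot-separated segment of the URL path. A hedged standalone sketch of
# the same idea, standard library only:
from urllib.parse import urlparse

def file_extension(url):
    path = urlparse(url).path            # drops query strings and fragments
    parts = path.split(".")
    if len(parts) >= 2:
        return parts[-1]
    raise ValueError(f"Could not determine compression type of: {url}")

assert file_extension("https://example.org/materials-sp20.tgz?dl=1") == "tgz"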
- - :param str url: the url contains the extension we need to determine - what kind of compression is used on the file being downloaded - """ - u = urlparse(url) - url_arr = u.path.split(".") - if len(url_arr) >= 2: - return url_arr[-1] - raise Exception(f"Could not determine compression type of: {url}") - - -async def execute_unarchive(ext, temp_download_file, temp_download_repo): - """ - un-archives file using unzip or tar to the temp_download_repo - - :param str ext: extension used to determine type of compression - :param str temp_download_file: the file path to be unarchived - :param str temp_download_repo: where the file is unarchived to - """ - if ext == 'zip': - cmd_arr = ['unzip', "-qo", temp_download_file, "-d", temp_download_repo] - else: - cmd_arr = ['tar', 'xzf', temp_download_file, '-C', temp_download_repo] - async for e in execute_cmd(cmd_arr, cwd=temp_download_repo): - yield e - - -async def download_archive(repo=None, temp_download_file=None): - """ - This requests the file from the repo(url) given and saves it to the disk - - :param str repo: the git repo path - :param str temp_download_file: the path to save the requested file to - """ - yield "Downloading archive ...\n" - try: - CHUNK_SIZE = 1024 - async with aiohttp.ClientSession() as session: - async with session.get(repo) as response: - with open(temp_download_file, 'ab') as fd: - count_chunks = 1 - while True: - count_chunks += 1 - if count_chunks % 1000 == 0: - display = count_chunks / 1000 - yield f"Downloading Progress ... {display}MB\n" - chunk = await response.content.read(CHUNK_SIZE) - if not chunk: - break - fd.write(chunk) - except Exception as e: - raise e - - yield "Archive Downloaded....\n" - - -async def push_to_local_origin(temp_download_repo): - """ - The unarchived files are pushed back to the origin - - :param str temp_download_repo: the current working directly of folder - where the archive had been downloaded and unarchived - """ - async for e in execute_cmd(["git", "add", "."], cwd=temp_download_repo): - yield e - commit_cmd = [ - "git", - "-c", "user.email=nbgitpuller@nbgitpuller.link", - "-c", "user.name=nbgitpuller", - "commit", "-q", "-m", "test", "--allow-empty" - ] - async for e in execute_cmd(commit_cmd, cwd=temp_download_repo): - yield e - async for e in execute_cmd(["git", "push", "origin", "master"], cwd=temp_download_repo): - yield e - - -class HandleFilesHelper: - """ - This class is needed to handle the use of dir_names inside the async generator as well as in the return object for - the function handle_files_helper. - """ - def __init__(self, helper_args, query_line_args): - """ - This sets up the helper_args and query_line_args for use in the handle_files_helper and gener functions. - - :param dict helper_args: key-value pairs including the: - - download_func download function - - download_func_params download parameters in the case - that the source needs to handle the download in a specific way(e.g. google - requires a confirmation of the download) - - extension (e.g. zip, tar) ] [OPTIONAL] this may or may not be included. If the repo name contains - name of archive (e.g. example.zip) then this function can determine the extension for you; if not it - needs to be provided. 
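# The class being deleted here orchestrates a four-step flow: init a bare repo
# that stands in for a remote, clone it over file://, drop the unarchived
# download into the clone, and push back, after which GitPuller syncs from it
# like any remote. A sketch of that flow with plain git; the paths are
# throwaway temporaries, and pushing HEAD sidesteps the master-vs-main default
# branch name (the original pushes "master"):
import pathlib, subprocess, tempfile

origin = pathlib.Path(tempfile.mkdtemp()) / "origin"
clone = pathlib.Path(tempfile.mkdtemp()) / "clone"
subprocess.run(["git", "init", "--bare", str(origin)], check=True)
subprocess.run(["git", "clone", f"file://{origin}", str(clone)], check=True)
(clone / "README.md").write_text("unarchived content\n")  # stand-in for the archive
subprocess.run(["git", "add", "."], cwd=clone, check=True)
subprocess.run(["git", "-c", "user.email=nbgitpuller@nbgitpuller.link",
                "-c", "user.name=nbgitpuller", "commit", "-m", "import"],
               cwd=clone, check=True)
subprocess.run(["git", "push", "origin", "HEAD"], cwd=clone, check=True)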
- :param dict query_line_args: - - repo, - - provider, - - repo_parent_dir - :param helper_args: - :param query_line_args: - """ - self.dir_names = None - self.url = query_line_args["repo"].translate(str.maketrans('', '', string.punctuation)) - self.content_provider = query_line_args["contentProvider"] - self.repo = query_line_args["repo"] - self.repo_parent_dir = helper_args["repo_parent_dir"] - self.download_q = helper_args["download_q"] - self.origin_repo = f"{self.repo_parent_dir}{CACHED_ORIGIN_NON_GIT_REPO}{self.content_provider}/{self.url}/" - self.temp_download_dir = tempfile.TemporaryDirectory() - - # you can optionally pass the extension of your archive(e.g zip) if it is not identifiable from the URL file name - # otherwise the extract_file_extension function will pull it off the repo name - if "extension" not in helper_args: - self.ext = extract_file_extension(query_line_args["repo"]) - else: - self.ext = helper_args['extension'] - self.temp_download_file = f"{self.temp_download_dir.name}/download.{self.ext}" - self.download_func = download_archive - self.download_args = { - "repo": self.repo, - "temp_download_file": self.temp_download_file - } - - # you can pass your own download function as well as download function parameters - # if they are different from the standard download function and parameters. Notice I add - # the temp_download_file to the parameters - if "download_func" in helper_args: - self.download_func = helper_args["download_func"] - if "download_func_params" in helper_args: - helper_args["download_func_params"]["temp_download_file"] = self.temp_download_file - self.download_args = helper_args["download_func_params"] - - async def gener(self): - """ - This does all the heavy lifting in the order needed to set up your local - repos, origin, download the file, unarchive and push the files - back to the origin - """ - - try: - if not os.path.exists(self.origin_repo): - async for i in initialize_local_repo(self.origin_repo): - yield i - - async for c in clone_local_origin_repo(self.origin_repo, self.temp_download_dir.name): - yield c - - async for d in self.download_func(**self.download_args): - yield d - - async for e in execute_unarchive(self.ext, self.temp_download_file, self.temp_download_dir.name): - yield e - - os.remove(self.temp_download_file) - async for p in push_to_local_origin(self.temp_download_dir.name): - yield p - - unzipped_dirs = os.listdir(self.temp_download_dir.name) - # name of the extracted directory - self.dir_names = list(filter(lambda dir: ".git" not in dir and "__MACOSX" not in dir, unzipped_dirs)) - - yield "\n\n" - yield "Process Complete: Archive is finished importing into hub\n" - yield f"The directory of your download is: {self.dir_names[0]}\n" - - except Exception as e: - logging.exception(e) - raise ValueError(e) - finally: - self.temp_download_dir.cleanup() # remove temporary download space - - async def handle_files_helper(self): - """ - This calls the async generator function and handle the storing of messages from the gener() function - into the download_q - - :return json object with the directory name of the download and - the origin_repo_path - :rtype json object - """ - try: - async for line in self.gener(): - self.download_q.put_nowait(line) - await asyncio.sleep(0.1) - except Exception as e: - self.download_q.put_nowait(e) - raise e - # mark the end of the queue with a None value - self.download_q.put_nowait(None) - return {"output_dir": self.dir_names[0], "origin_repo_path": self.origin_repo} diff --git 
a/tests/test_files/test.txt b/tests/test_files/test.txt deleted file mode 100644 index 8e435da9..00000000 --- a/tests/test_files/test.txt +++ /dev/null @@ -1,13 +0,0 @@ -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 diff --git a/tests/test_plugin_helper.py b/tests/test_plugin_helper.py deleted file mode 100644 index 12f5fbed..00000000 --- a/tests/test_plugin_helper.py +++ /dev/null @@ -1,101 +0,0 @@ -import os -import pytest -import shutil -import nbgitpuller.plugin_helper as ph -from aioresponses import aioresponses - -test_files_dir = os.getcwd() + "/tests/test_files" -archive_base = "/tmp/test_files" -repo_parent_dir = "/tmp/fake/" -temp_download_repo = "/tmp/download/" -temp_archive_download = "/tmp/archive_download/" -provider = "dropbox_test" -url = "http://test/this/repo" -CACHED_ORIGIN_NON_GIT_REPO = ".nbgitpuller/targets/" -origin_repo = f"{repo_parent_dir}{CACHED_ORIGIN_NON_GIT_REPO}{provider}/{url}/" - -repo_zip = 'file://' + archive_base + ".zip" -repo_tgz = 'file://' + archive_base + ".tar.gz" - - -@pytest.fixture -async def test_configuration(): - shutil.make_archive(archive_base, 'zip', test_files_dir) - shutil.make_archive(archive_base, 'gztar', test_files_dir) - os.makedirs(temp_archive_download, exist_ok=True) - os.makedirs(repo_parent_dir, exist_ok=True) - os.makedirs(temp_download_repo, exist_ok=True) - yield "test finishing" - os.remove(archive_base + ".zip") - os.remove(archive_base + ".tar.gz") - if os.path.isfile(temp_archive_download + "downloaded.zip"): - os.remove(temp_archive_download + "downloaded.zip") - shutil.rmtree(repo_parent_dir) - shutil.rmtree(temp_download_repo) - shutil.rmtree(temp_archive_download) - - -def test_extract_file_extension(): - url = "https://example.org/master/materials-sp20-external.tgz" - ext = ph.extract_file_extension(url) - assert "tgz" in ext - - -@pytest.mark.asyncio -async def test_initialize_local_repo(test_configuration): - yield_str = "" - async for line in ph.initialize_local_repo(origin_repo): - yield_str += line - assert "init --bare" in yield_str - assert os.path.isdir(origin_repo) - - -@pytest.mark.asyncio -async def test_clone_local_origin_repo(test_configuration): - async for line in ph.initialize_local_repo(origin_repo): - pass - - yield_str = "" - async for line in ph.clone_local_origin_repo(origin_repo, temp_download_repo): - yield_str += line - - assert "Cloning into" in yield_str - assert os.path.isdir(temp_download_repo + ".git") - - -@pytest.mark.asyncio -async def test_execute_unarchive(test_configuration): - yield_str = "" - async for line in ph.execute_unarchive("zip", archive_base + ".zip", temp_download_repo): - yield_str += line - assert os.path.isfile("/tmp/download/test.txt") - - -@pytest.mark.asyncio -async def test_push_to_local_origin(test_configuration): - async for line in ph.initialize_local_repo(origin_repo): - pass - - async for line in ph.clone_local_origin_repo(origin_repo, temp_download_repo): - pass - - async for line in ph.execute_unarchive("zip", archive_base + ".zip", temp_download_repo): - pass - - yield_str = "" - async for line in ph.push_to_local_origin(temp_download_repo): - yield_str += line - assert "[new branch]" in yield_str - - -@pytest.mark.asyncio -async def test_download_archive(test_configuration): - args = {} - args["repo"] = "http://example.org/mocked-download-url" - with aioresponses() as mocked: - mocked.get(args["repo"], status=200, body=b'Pretend you are zip file being downloaded') - yield_str = "" - async for line in ph.download_archive(args["repo"], temp_archive_download + 
"downloaded.zip"): - yield_str += line - assert 'Downloading archive' in yield_str - assert os.path.isfile(temp_archive_download + "downloaded.zip") From 613f863b50b25970fe6ffdd09cf44b547bc5950b Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Mon, 20 Dec 2021 14:23:49 -0800 Subject: [PATCH 36/40] Moved downloader-plugin util to own repo The downloader-plugin utilities and tests are now in their own repo with the other downloader plugins. --- .gitignore | 2 + dev-requirements.txt | 4 +- nbgitpuller/handlers.py | 1 + nbgitpuller/plugin_helper.py | 286 ----------------------------------- setup.py | 2 +- tests/test_files/test.txt | 13 -- tests/test_plugin_helper.py | 101 ------------- 7 files changed, 5 insertions(+), 404 deletions(-) delete mode 100644 nbgitpuller/plugin_helper.py delete mode 100644 tests/test_files/test.txt delete mode 100644 tests/test_plugin_helper.py diff --git a/.gitignore b/.gitignore index 40f71359..51520b99 100644 --- a/.gitignore +++ b/.gitignore @@ -21,7 +21,9 @@ docs/_build jupyterhub.sqlite jupyterhub_cookie_secret +/jupyterhub-proxy.pid node_modules/ package-lock.json nbgitpuller/static/dist + diff --git a/dev-requirements.txt b/dev-requirements.txt index 03f5a7ee..bb21db7a 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,6 +2,4 @@ jupyter-packaging>=0.10 pytest pytest-cov flake8 -nbclassic -aioresponses -pytest-asyncio \ No newline at end of file +nbclassic \ No newline at end of file diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py index decf3771..8152448b 100644 --- a/nbgitpuller/handlers.py +++ b/nbgitpuller/handlers.py @@ -1,6 +1,7 @@ from tornado import gen, web, locks import traceback import urllib.parse + from notebook.base.handlers import IPythonHandler import threading import json diff --git a/nbgitpuller/plugin_helper.py b/nbgitpuller/plugin_helper.py deleted file mode 100644 index ae355ec2..00000000 --- a/nbgitpuller/plugin_helper.py +++ /dev/null @@ -1,286 +0,0 @@ -import string -import os -import logging -import aiohttp -import asyncio -import subprocess -import shutil -from urllib.parse import urlparse -from functools import partial -import tempfile - -# this is the path to the local origin repository that nbgitpuller uses to mimic -# a remote repo in GitPuller -CACHED_ORIGIN_NON_GIT_REPO = ".nbgitpuller/targets/" - - -async def execute_cmd(cmd, **kwargs): - """ - Call given command, yielding output line by line - - :param array cmd: the commands to be executed - :param json kwargs: potential keyword args included with command - """ - yield '$ {}\n'.format(' '.join(cmd)) - kwargs['stdout'] = subprocess.PIPE - kwargs['stderr'] = subprocess.STDOUT - - proc = subprocess.Popen(cmd, **kwargs) - - # Capture output for logging. - # Each line will be yielded as text. - # This should behave the same as .readline(), but splits on `\r` OR `\n`, - # not just `\n`. 
- buf = [] - - def flush(): - line = b''.join(buf).decode('utf8', 'replace') - buf[:] = [] - return line - - c_last = '' - try: - for c in iter(partial(proc.stdout.read, 1), b''): - if c_last == b'\r' and buf and c != b'\n': - yield flush() - buf.append(c) - if c == b'\n': - yield flush() - c_last = c - finally: - ret = proc.wait() - if ret != 0: - raise subprocess.CalledProcessError(ret, cmd) - - -async def initialize_local_repo(local_repo_path): - """ - Sets up the a local repo that acts like a remote; yields the - output from the git init - - :param str local_repo_path: the locla path where the git repo is initialized - """ - yield "Initializing repo ...\n" - logging.info(f"Creating local_repo_path: {local_repo_path}") - os.makedirs(local_repo_path, exist_ok=True) - async for e in execute_cmd(["git", "init", "--bare"], cwd=local_repo_path): - yield e - - -async def clone_local_origin_repo(origin_repo_path, temp_download_repo): - """ - Cloned the origin(which is local) to the folder, temp_download_repo. - The folder, temp_download_repo, acts like the space where someone makes changes - to master notebooks and then pushes the changes to origin. In other words, - the folder, temp_download_repo, is where the compressed archive is downloaded, - unarchived, and then pushed to the origin. - - :param str origin_repo_path: the local path we used to git init into - :param str temp_download_repo: folder where the compressed archive - is downloaded to - """ - yield "Cloning repo ...\n" - if os.path.exists(temp_download_repo): - shutil.rmtree(temp_download_repo) - logging.info(f"Creating temp_download_repo: {temp_download_repo}") - os.makedirs(temp_download_repo, exist_ok=True) - - cmd = ["git", "clone", f"file://{origin_repo_path}", temp_download_repo] - async for e in execute_cmd(cmd, cwd=temp_download_repo): - yield e - - -def extract_file_extension(url): - """ - The file extension(eg. zip, tgz, etc) is extracted from the url to facilitate de-compressing the file - using the correct application -- (zip, tar). 
- - :param str url: the url contains the extension we need to determine - what kind of compression is used on the file being downloaded - """ - u = urlparse(url) - url_arr = u.path.split(".") - if len(url_arr) >= 2: - return url_arr[-1] - raise Exception(f"Could not determine compression type of: {url}") - - -async def execute_unarchive(ext, temp_download_file, temp_download_repo): - """ - un-archives file using unzip or tar to the temp_download_repo - - :param str ext: extension used to determine type of compression - :param str temp_download_file: the file path to be unarchived - :param str temp_download_repo: where the file is unarchived to - """ - if ext == 'zip': - cmd_arr = ['unzip', "-qo", temp_download_file, "-d", temp_download_repo] - else: - cmd_arr = ['tar', 'xzf', temp_download_file, '-C', temp_download_repo] - async for e in execute_cmd(cmd_arr, cwd=temp_download_repo): - yield e - - -async def download_archive(repo=None, temp_download_file=None): - """ - This requests the file from the repo(url) given and saves it to the disk - - :param str repo: the git repo path - :param str temp_download_file: the path to save the requested file to - """ - yield "Downloading archive ...\n" - try: - CHUNK_SIZE = 1024 - async with aiohttp.ClientSession() as session: - async with session.get(repo) as response: - with open(temp_download_file, 'ab') as fd: - count_chunks = 1 - while True: - count_chunks += 1 - if count_chunks % 1000 == 0: - display = count_chunks / 1000 - yield f"Downloading Progress ... {display}MB\n" - chunk = await response.content.read(CHUNK_SIZE) - if not chunk: - break - fd.write(chunk) - except Exception as e: - raise e - - yield "Archive Downloaded....\n" - - -async def push_to_local_origin(temp_download_repo): - """ - The unarchived files are pushed back to the origin - - :param str temp_download_repo: the current working directly of folder - where the archive had been downloaded and unarchived - """ - async for e in execute_cmd(["git", "add", "."], cwd=temp_download_repo): - yield e - commit_cmd = [ - "git", - "-c", "user.email=nbgitpuller@nbgitpuller.link", - "-c", "user.name=nbgitpuller", - "commit", "-q", "-m", "test", "--allow-empty" - ] - async for e in execute_cmd(commit_cmd, cwd=temp_download_repo): - yield e - async for e in execute_cmd(["git", "push", "origin", "master"], cwd=temp_download_repo): - yield e - - -class HandleFilesHelper: - """ - This class is needed to handle the use of dir_names inside the async generator as well as in the return object for - the function handle_files_helper. - """ - def __init__(self, helper_args, query_line_args): - """ - This sets up the helper_args and query_line_args for use in the handle_files_helper and gener functions. - - :param dict helper_args: key-value pairs including the: - - download_func download function - - download_func_params download parameters in the case - that the source needs to handle the download in a specific way(e.g. google - requires a confirmation of the download) - - extension (e.g. zip, tar) ] [OPTIONAL] this may or may not be included. If the repo name contains - name of archive (e.g. example.zip) then this function can determine the extension for you; if not it - needs to be provided. 
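# download_archive above streams the response in CHUNK_SIZE pieces and emits a
# progress line roughly every 1000 chunks (about 1MB at 1024 bytes). The same
# accounting with a plain iterable standing in for the aiohttp response, so the
# sketch runs anywhere; the output path is illustrative:
CHUNK_SIZE = 1024

def chunked(data, size=CHUNK_SIZE):
    for i in range(0, len(data), size):
        yield data[i:i + size]

def save_with_progress(data, path):
    with open(path, "wb") as fd:
        count_chunks = 1
        for chunk in chunked(data):
            count_chunks += 1
            if count_chunks % 1000 == 0:
                print(f"Downloading Progress ... {count_chunks / 1000}MB")
            fd.write(chunk)

save_with_progress(b"x" * (2000 * CHUNK_SIZE), "/tmp/download.bin")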
- :param dict query_line_args: - - repo, - - provider, - - repo_parent_dir - :param helper_args: - :param query_line_args: - """ - self.dir_names = None - self.url = query_line_args["repo"].translate(str.maketrans('', '', string.punctuation)) - self.content_provider = query_line_args["contentProvider"] - self.repo = query_line_args["repo"] - self.repo_parent_dir = helper_args["repo_parent_dir"] - self.download_q = helper_args["download_q"] - self.origin_repo = f"{self.repo_parent_dir}{CACHED_ORIGIN_NON_GIT_REPO}{self.content_provider}/{self.url}/" - self.temp_download_dir = tempfile.TemporaryDirectory() - - # you can optionally pass the extension of your archive(e.g zip) if it is not identifiable from the URL file name - # otherwise the extract_file_extension function will pull it off the repo name - if "extension" not in helper_args: - self.ext = extract_file_extension(query_line_args["repo"]) - else: - self.ext = helper_args['extension'] - self.temp_download_file = f"{self.temp_download_dir.name}/download.{self.ext}" - self.download_func = download_archive - self.download_args = { - "repo": self.repo, - "temp_download_file": self.temp_download_file - } - - # you can pass your own download function as well as download function parameters - # if they are different from the standard download function and parameters. Notice I add - # the temp_download_file to the parameters - if "download_func" in helper_args: - self.download_func = helper_args["download_func"] - if "download_func_params" in helper_args: - helper_args["download_func_params"]["temp_download_file"] = self.temp_download_file - self.download_args = helper_args["download_func_params"] - - async def gener(self): - """ - This does all the heavy lifting in the order needed to set up your local - repos, origin, download the file, unarchive and push the files - back to the origin - """ - - try: - if not os.path.exists(self.origin_repo): - async for i in initialize_local_repo(self.origin_repo): - yield i - - async for c in clone_local_origin_repo(self.origin_repo, self.temp_download_dir.name): - yield c - - async for d in self.download_func(**self.download_args): - yield d - - async for e in execute_unarchive(self.ext, self.temp_download_file, self.temp_download_dir.name): - yield e - - os.remove(self.temp_download_file) - async for p in push_to_local_origin(self.temp_download_dir.name): - yield p - - unzipped_dirs = os.listdir(self.temp_download_dir.name) - # name of the extracted directory - self.dir_names = list(filter(lambda dir: ".git" not in dir and "__MACOSX" not in dir, unzipped_dirs)) - - yield "\n\n" - yield "Process Complete: Archive is finished importing into hub\n" - yield f"The directory of your download is: {self.dir_names[0]}\n" - - except Exception as e: - logging.exception(e) - raise ValueError(e) - finally: - self.temp_download_dir.cleanup() # remove temporary download space - - async def handle_files_helper(self): - """ - This calls the async generator function and handle the storing of messages from the gener() function - into the download_q - - :return json object with the directory name of the download and - the origin_repo_path - :rtype json object - """ - try: - async for line in self.gener(): - self.download_q.put_nowait(line) - await asyncio.sleep(0.1) - except Exception as e: - self.download_q.put_nowait(e) - raise e - # mark the end of the queue with a None value - self.download_q.put_nowait(None) - return {"output_dir": self.dir_names[0], "origin_repo_path": self.origin_repo} diff --git a/setup.py 
b/setup.py index 7f096a66..5c3691b3 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, platforms='any', - install_requires=['notebook>=5.5.0', 'jupyter_server>=1.10.1', 'tornado', 'aiohttp', 'pluggy'], + install_requires=['notebook>=5.5.0', 'jupyter_server>=1.10.1', 'tornado', 'pluggy'], data_files=[ ('etc/jupyter/jupyter_server_config.d', ['nbgitpuller/etc/jupyter_server_config.d/nbgitpuller.json']), ('etc/jupyter/jupyter_notebook_config.d', ['nbgitpuller/etc/jupyter_notebook_config.d/nbgitpuller.json']) diff --git a/tests/test_files/test.txt b/tests/test_files/test.txt deleted file mode 100644 index 8e435da9..00000000 --- a/tests/test_files/test.txt +++ /dev/null @@ -1,13 +0,0 @@ -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 diff --git a/tests/test_plugin_helper.py b/tests/test_plugin_helper.py deleted file mode 100644 index 12f5fbed..00000000 --- a/tests/test_plugin_helper.py +++ /dev/null @@ -1,101 +0,0 @@ -import os -import pytest -import shutil -import nbgitpuller.plugin_helper as ph -from aioresponses import aioresponses - -test_files_dir = os.getcwd() + "/tests/test_files" -archive_base = "/tmp/test_files" -repo_parent_dir = "/tmp/fake/" -temp_download_repo = "/tmp/download/" -temp_archive_download = "/tmp/archive_download/" -provider = "dropbox_test" -url = "http://test/this/repo" -CACHED_ORIGIN_NON_GIT_REPO = ".nbgitpuller/targets/" -origin_repo = f"{repo_parent_dir}{CACHED_ORIGIN_NON_GIT_REPO}{provider}/{url}/" - -repo_zip = 'file://' + archive_base + ".zip" -repo_tgz = 'file://' + archive_base + ".tar.gz" - - -@pytest.fixture -async def test_configuration(): - shutil.make_archive(archive_base, 'zip', test_files_dir) - shutil.make_archive(archive_base, 'gztar', test_files_dir) - os.makedirs(temp_archive_download, exist_ok=True) - os.makedirs(repo_parent_dir, exist_ok=True) - os.makedirs(temp_download_repo, exist_ok=True) - yield "test finishing" - os.remove(archive_base + ".zip") - os.remove(archive_base + ".tar.gz") - if os.path.isfile(temp_archive_download + "downloaded.zip"): - os.remove(temp_archive_download + "downloaded.zip") - shutil.rmtree(repo_parent_dir) - shutil.rmtree(temp_download_repo) - shutil.rmtree(temp_archive_download) - - -def test_extract_file_extension(): - url = "https://example.org/master/materials-sp20-external.tgz" - ext = ph.extract_file_extension(url) - assert "tgz" in ext - - -@pytest.mark.asyncio -async def test_initialize_local_repo(test_configuration): - yield_str = "" - async for line in ph.initialize_local_repo(origin_repo): - yield_str += line - assert "init --bare" in yield_str - assert os.path.isdir(origin_repo) - - -@pytest.mark.asyncio -async def test_clone_local_origin_repo(test_configuration): - async for line in ph.initialize_local_repo(origin_repo): - pass - - yield_str = "" - async for line in ph.clone_local_origin_repo(origin_repo, temp_download_repo): - yield_str += line - - assert "Cloning into" in yield_str - assert os.path.isdir(temp_download_repo + ".git") - - -@pytest.mark.asyncio -async def test_execute_unarchive(test_configuration): - yield_str = "" - async for line in ph.execute_unarchive("zip", archive_base + ".zip", temp_download_repo): - yield_str += line - assert os.path.isfile("/tmp/download/test.txt") - - -@pytest.mark.asyncio -async def test_push_to_local_origin(test_configuration): - async for line in ph.initialize_local_repo(origin_repo): - pass - - async for line in ph.clone_local_origin_repo(origin_repo, temp_download_repo): - pass - - async for line in 
ph.execute_unarchive("zip", archive_base + ".zip", temp_download_repo): - pass - - yield_str = "" - async for line in ph.push_to_local_origin(temp_download_repo): - yield_str += line - assert "[new branch]" in yield_str - - -@pytest.mark.asyncio -async def test_download_archive(test_configuration): - args = {} - args["repo"] = "http://example.org/mocked-download-url" - with aioresponses() as mocked: - mocked.get(args["repo"], status=200, body=b'Pretend you are zip file being downloaded') - yield_str = "" - async for line in ph.download_archive(args["repo"], temp_archive_download + "downloaded.zip"): - yield_str += line - assert 'Downloading archive' in yield_str - assert os.path.isfile(temp_archive_download + "downloaded.zip") From f6185606818fceb2b9b5f76e7b34a4bb5473051a Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Mon, 10 Jan 2022 18:40:01 -0800 Subject: [PATCH 37/40] Removed nest_asyncio from __init__.py The nest_asyncio import and call are moved to the nbgitpuller downloader plugins --- nbgitpuller/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/nbgitpuller/__init__.py b/nbgitpuller/__init__.py index a2815bd6..b5941286 100644 --- a/nbgitpuller/__init__.py +++ b/nbgitpuller/__init__.py @@ -4,12 +4,6 @@ from notebook.utils import url_path_join from tornado.web import StaticFileHandler import os -import nest_asyncio - -# this allows us to nest usage of the event_loop from asyncio -# being used by tornado in jupyter distro -# Ref: https://medium.com/@vyshali.enukonda/how-to-get-around-runtimeerror-this-event-loop-is-already-running-3f26f67e762e -nest_asyncio.apply() def _jupyter_server_extension_paths(): From 367f3c7f4f2a2770e02011b4a50507665823f275 Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Sat, 15 Jan 2022 08:03:32 -0800 Subject: [PATCH 38/40] Moved downloader-plugin handling to puller thread The logic related to downloading compressed archives is now initiated by the same thread GitPuller uses. --- nbgitpuller/handlers.py | 45 ++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py index 8152448b..b42c4234 100644 --- a/nbgitpuller/handlers.py +++ b/nbgitpuller/handlers.py @@ -110,7 +110,9 @@ def get(self): return try: - repo = self.get_argument('repo') + q = Queue() + + self.repo = self.get_argument('repo') branch = self.get_argument('branch', None) content_provider = self.get_argument('contentProvider', None) depth = self.get_argument('depth', None) @@ -127,30 +129,34 @@ def get(self): # must be expanded. repo_parent_dir = os.path.join(os.path.expanduser(self.settings['server_root_dir']), os.getenv('NBGITPULLER_PARENTPATH', '')) - repo_dir = os.path.join(repo_parent_dir, self.get_argument('targetpath', repo.split('/')[-1])) + self.repo_dir = os.path.join(repo_parent_dir, self.get_argument('targetpath', self.repo.split('/')[-1])) # We gonna send out event streams!
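# The shape of this refactor, reduced to runnable form: the puller works in a
# background thread, pushes progress lines into a Queue, and the request
# handler drains the queue until a None sentinel arrives (exceptions are
# queued too, so the reader can re-raise them). A plain function stands in
# for GitPuller here:
import queue
import threading
import time

def pull_lines(q):
    try:
        for line in ("cloning\n", "pulling\n"):  # stand-in for gp.pull()
            q.put_nowait(line)
            time.sleep(0.01)
        q.put_nowait(None)                       # end-of-stream sentinel
    except Exception as e:
        q.put_nowait(e)                          # surface the error to the reader
        raise

q = queue.Queue()
threading.Thread(target=pull_lines, args=(q,)).start()
while (item := q.get()) is not None:
    if isinstance(item, Exception):
        raise item
    print(item, end="")                          # the handler emits SSE here instead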
self.set_header('content-type', 'text/event-stream') self.set_header('cache-control', 'no-cache') - # if content_provider is specified then we are dealing with compressed - # archive and not a git repo - if content_provider is not None: - plugin_manager = self.setup_plugins(content_provider) - query_line_args = {k: v[0].decode() for k, v in self.request.arguments.items()} - download_q = Queue() - helper_args = dict() - helper_args["wait_for_sync_progress_queue"] = lambda: self._wait_for_sync_progress_queue(download_q) - helper_args["download_q"] = download_q - helper_args["repo_parent_dir"] = repo_parent_dir - results = plugin_manager.hook.handle_files(helper_args=helper_args,query_line_args=query_line_args) - repo_dir = repo_parent_dir + results["output_dir"] - repo = "file://" + results["origin_repo_path"] - - gp = GitPuller(repo, repo_dir, branch=branch, depth=depth, parent=self.settings['nbapp']) - q = Queue() - def pull(): + # if content_provider is specified then we are dealing with compressed + # archive and not a git repo + if content_provider is not None: + plugin_manager = self.setup_plugins(content_provider) + query_line_args = {k: v[0].decode() for k, v in self.request.arguments.items()} + helper_args = dict() + helper_args["repo_parent_dir"] = repo_parent_dir + + try: + for line in plugin_manager.hook.handle_files(helper_args=helper_args,query_line_args=query_line_args): + q.put_nowait(line) + except Exception as e: + q.put_nowait(e) + raise e + + results = helper_args["handle_files_output"] + self.repo_dir = repo_parent_dir + results["output_dir"] + self.repo = "file://" + results["origin_repo_path"] + + gp = GitPuller(self.repo, self.repo_dir, branch=branch, depth=depth, parent=self.settings['nbapp']) + try: for line in gp.pull(): q.put_nowait(line) @@ -159,6 +165,7 @@ def pull(): except Exception as e: q.put_nowait(e) raise e + self.gp_thread = threading.Thread(target=pull) self.gp_thread.start() yield self._wait_for_sync_progress_queue(q) From 8893970a5c577ffad9a21ba520f7d353e2c3820f Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Wed, 19 Jan 2022 11:32:21 -0800 Subject: [PATCH 39/40] Moved downloader plugins handling to pull.py We moved all the logic related to pulling compressed archives to the GitPuller class; --- nbgitpuller/handlers.py | 64 ++-------------------------- nbgitpuller/plugin_hook_specs.py | 32 ++++---------- nbgitpuller/pull.py | 72 ++++++++++++++++++++++++++++---- 3 files changed, 76 insertions(+), 92 deletions(-) diff --git a/nbgitpuller/handlers.py b/nbgitpuller/handlers.py index b42c4234..07b3a279 100644 --- a/nbgitpuller/handlers.py +++ b/nbgitpuller/handlers.py @@ -11,18 +11,8 @@ from .pull import GitPuller from .version import __version__ -from . import plugin_hook_specs -import pluggy -class ContentProviderException(Exception): - """ - Custom Exception thrown when the content_provider key specifying - the downloader plugin is not installed or can not be found by the - name given - """ - def __init__(self, response=None): - self.response = response class SyncHandler(IPythonHandler): def __init__(self, *args, **kwargs): @@ -49,23 +39,6 @@ def emit(self, data): self.write('data: {}\n\n'.format(serialized_data)) yield self.flush() - def setup_plugins(self, content_provider): - """ - This automatically searches for and loads packages whose entrypoint is nbgitpuller. If found, - the plugin manager object is returned and used to execute the hook implemented by - the plugin. 
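# A condensed, runnable picture of the pluggy wiring this (relocating)
# docstring describes: declare a hookspec under the "nbgitpuller" project
# name, register an implementation, and call it through the hook relay.
# Direct registration stands in for load_setuptools_entrypoints, which needs
# an installed package; DummyDownloader is hypothetical:
import pluggy

hookspec = pluggy.HookspecMarker("nbgitpuller")
hookimpl = pluggy.HookimplMarker("nbgitpuller")

class Spec:
    @hookspec(firstresult=True)
    def handle_files(self, repo_parent_dir, other_kw_args):
        """Download and unpack a non-git archive; report output locations."""

class DummyDownloader:
    @hookimpl
    def handle_files(self, repo_parent_dir, other_kw_args):
        return {"output_dir": "hw", "origin_repo_path": "/tmp/origin"}

pm = pluggy.PluginManager("nbgitpuller")
pm.add_hookspecs(Spec)
pm.register(DummyDownloader())
print(pm.hook.handle_files(repo_parent_dir="/home/user/", other_kw_args={}))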
- :param content_provider: this is the name of the content_provider; each plugin is named to identify the - content_provider of the archive to be loaded(e.g. googledrive, dropbox, etc) - :return: returns the PluginManager object used to call the implemented hooks of the plugin - :raises: ContentProviderException -- this occurs when the content_provider parameter is not found - """ - plugin_manager = pluggy.PluginManager("nbgitpuller") - plugin_manager.add_hookspecs(plugin_hook_specs) - num_loaded = plugin_manager.load_setuptools_entrypoints("nbgitpuller", name=content_provider) - if num_loaded == 0: - raise ContentProviderException(f"The content_provider key you supplied in the URL could not be found: {content_provider}") - return plugin_manager - @gen.coroutine def _wait_for_sync_progress_queue(self, queue): """ @@ -112,7 +85,7 @@ def get(self): try: q = Queue() - self.repo = self.get_argument('repo') + repo = self.get_argument('repo') branch = self.get_argument('branch', None) content_provider = self.get_argument('contentProvider', None) depth = self.get_argument('depth', None) @@ -129,34 +102,14 @@ def get(self): # must be expanded. repo_parent_dir = os.path.join(os.path.expanduser(self.settings['server_root_dir']), os.getenv('NBGITPULLER_PARENTPATH', '')) - self.repo_dir = os.path.join(repo_parent_dir, self.get_argument('targetpath', self.repo.split('/')[-1])) + repo_dir = os.path.join(repo_parent_dir, self.get_argument('targetpath', repo.split('/')[-1])) # We gonna send out event streams! self.set_header('content-type', 'text/event-stream') self.set_header('cache-control', 'no-cache') def pull(): - # if content_provider is specified then we are dealing with compressed - # archive and not a git repo - if content_provider is not None: - plugin_manager = self.setup_plugins(content_provider) - query_line_args = {k: v[0].decode() for k, v in self.request.arguments.items()} - helper_args = dict() - helper_args["repo_parent_dir"] = repo_parent_dir - - try: - for line in plugin_manager.hook.handle_files(helper_args=helper_args,query_line_args=query_line_args): - q.put_nowait(line) - except Exception as e: - q.put_nowait(e) - raise e - - results = helper_args["handle_files_output"] - self.repo_dir = repo_parent_dir + results["output_dir"] - self.repo = "file://" + results["origin_repo_path"] - - gp = GitPuller(self.repo, self.repo_dir, branch=branch, depth=depth, parent=self.settings['nbapp']) - + gp = GitPuller(repo, repo_dir, branch=branch, depth=depth, parent=self.settings['nbapp'], content_provider=content_provider, repo_parent_dir=repo_parent_dir, other_kw_args=self.request.arguments.items()) try: for line in gp.pull(): q.put_nowait(line) @@ -171,17 +124,6 @@ def pull(): yield self._wait_for_sync_progress_queue(q) self.emit({'phase': 'finished'}) - except ContentProviderException as pe: - self.emit({ - 'phase': 'error', - 'message': str(pe), - 'output': '\n'.join([ - line.strip() - for line in traceback.format_exception( - type(pe), pe, pe.__traceback__ - ) - ]) - }) except Exception as e: self.emit({ 'phase': 'error', diff --git a/nbgitpuller/plugin_hook_specs.py b/nbgitpuller/plugin_hook_specs.py index 46a877c1..e6703ad9 100644 --- a/nbgitpuller/plugin_hook_specs.py +++ b/nbgitpuller/plugin_hook_specs.py @@ -14,24 +14,16 @@ @hookspec(firstresult=True) -def handle_files(helper_args, query_line_args): +def handle_files(repo_parent_dir, other_kw_args): """ This function must be implemented by content-provider plugins in order to handle the downloading and decompression of a non-git sourced 
compressed archive. - The helper_args contain three keyword arguments that are necessary to successfully save a - compressed archive: - - repo_parent_dir: save your downloaded archive here - - wait_for_sync_progress_queue: - A partial function with an infinite loop continuously checking the download_q for messages to show the - user in the UI. - - download_q: - This is a Queue that accepts messages to be displayed in the UI. You might tell the user what percent of - the download is complete or any other progress that might inform the user.to a user's jupyterhub home drive. + The repo_parent_dir is where you will save your downloaded archive - The parameter, query_line_args, contains all the arguments you put on the nbgitpuller URL link. This allows you - flexibility to pass information your content-provider download plugin may need to successfully download - source files. + The parameter, other_kw_args, contains all the arguments you put on the nbgitpuller URL link or passed to GitPuller + via CLI. This allows you flexibility to pass information your content-provider download plugin may need to + successfully download source files. This function needs to return two pieces of information as a json object: - output_dir -- the is the name of the directory that will hold all the files you want GitPuller to expose @@ -42,21 +34,15 @@ def handle_files(helper_args, query_line_args): Once the files are saved to the directory, git puller can handle all the standard functions needed to make sure source files are updated or created as needed. - I suggest you study the function handle_files_helper in the plugin_helper.py file to get a deep sense of how + I suggest you study the function handle_files_helper in file plugin_helper.py found in the + nbgitpuller-downloader-plugins repository to get a deep sense of how we handle the downloading of compressed archives. There is also more documentation in the docs section of nbgitpuller. Finally, you can always implement the entire download process yourself and not use the handle_files_helper function but please to sure understand what is being passed into and back to the nbgitpuller handlers. - :param json helper_args: these keyword args are passed from the main thread of nbgitpuller and include: - - repo_parent_dir: save your downloaded archive here - - wait_for_sync_progress_queue: - A partial function with an infinite loop continuously checking the download_q for messages to show the - user in the UI. - - download_q: - This is a Queue that accepts messages to be displayed in the UI. You might tell the user what percent of - the download is complete or any other progress that might inform the user. - :param json query_line_args: this includes any argument you put on the nbgitpuller URL + :param str repo_parent_dir: save your downloaded archive here + :param dict other_kw_args: this includes any argument you put on the nbgitpuller URL or pass via CLI as a dict :return two parameter json output_dir and origin_repo_path :rtype json object """ diff --git a/nbgitpuller/pull.py b/nbgitpuller/pull.py index cc18ac97..1bf81c9c 100644 --- a/nbgitpuller/pull.py +++ b/nbgitpuller/pull.py @@ -4,11 +4,22 @@ import time import argparse import datetime +import pluggy from traitlets import Integer, default from traitlets.config import Configurable from functools import partial +from . 
import plugin_hook_specs +class ContentProviderException(Exception): + """ + Custom Exception thrown when the content_provider key specifying + the downloader plugin is not installed or can not be found by the + name given + """ + def __init__(self, response=None): + self.response = response + def execute_cmd(cmd, **kwargs): """ Call given command, yielding output line by line @@ -45,6 +56,24 @@ def flush(): raise subprocess.CalledProcessError(ret, cmd) +def setup_plugins(content_provider): + """ + This automatically searches for and loads packages whose entrypoint is nbgitpuller. If found, + the plugin manager object is returned and used to execute the hook implemented by + the plugin. + :param content_provider: this is the name of the content_provider; each plugin is named to identify the + content_provider of the archive to be loaded(e.g. googledrive, dropbox, etc) + :return: returns the PluginManager object used to call the implemented hooks of the plugin + :raises: ContentProviderException -- this occurs when the content_provider parameter is not found + """ + plugin_manager = pluggy.PluginManager("nbgitpuller") + plugin_manager.add_hookspecs(plugin_hook_specs) + num_loaded = plugin_manager.load_setuptools_entrypoints("nbgitpuller", name=content_provider) + if num_loaded == 0: + raise ContentProviderException(f"The content_provider key you supplied in the URL could not be found: {content_provider}") + return plugin_manager + + class GitPuller(Configurable): depth = Integer( config=True, @@ -71,12 +100,9 @@ def __init__(self, git_url, repo_dir, **kwargs): self.git_url = git_url self.branch_name = kwargs.pop("branch") - - if self.branch_name is None: - self.branch_name = self.resolve_default_branch() - elif not self.branch_exists(self.branch_name): - raise ValueError(f"Branch: {self.branch_name} -- not found in repo: {self.git_url}") - + self.content_provider = kwargs.pop("content_provider") + self.repo_parent_dir = kwargs.pop("repo_parent_dir") + self.other_kw_args = kwargs.pop("other_kw_args") self.repo_dir = repo_dir newargs = {k: v for k, v in kwargs.items() if v is not None} super(GitPuller, self).__init__(**newargs) @@ -135,11 +161,35 @@ def resolve_default_branch(self): logging.exception(m) raise ValueError(m) + def handle_archive_download(self): + try: + plugin_manager = setup_plugins(self.content_provider) + other_kw_args = {k: v[0].decode() for k, v in self.other_kw_args} + yield from plugin_manager.hook.handle_files(repo_parent_dir=self.repo_parent_dir,other_kw_args=other_kw_args) + results = other_kw_args["handle_files_output"] + self.repo_dir = self.repo_parent_dir + results["output_dir"] + self.git_url = "file://" + results["origin_repo_path"] + except ContentProviderException as c: + raise c + + def handle_branch_name(self): + if self.branch_name is None: + self.branch_name = self.resolve_default_branch() + elif not self.branch_exists(self.branch_name): + raise ValueError(f"Branch: {self.branch_name} -- not found in repo: {self.git_url}") + def pull(self): """ - Pull selected repo from a remote git repository, + if compressed archive download first. 
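# handle_branch_name above encodes a small precedence rule: no branch given
# means "use the remote's default"; an explicit branch must exist or the pull
# fails fast. Isolated here with stubbed lookups (the real class shells out
# to git ls-remote for both answers):
def choose_branch(requested, remote_branches, default_branch):
    if requested is None:
        return default_branch                  # fall back to the remote default
    if requested not in remote_branches:
        raise ValueError(f"Branch: {requested} -- not found in repo")
    return requested

assert choose_branch(None, {"main", "dev"}, "main") == "main"
assert choose_branch("dev", {"main", "dev"}, "main") == "dev"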
+ Execute pull of repo from a git repository(remote or temporary local created for compressed archives), while preserving user changes """ + # if content_provider is specified then we are dealing with compressed archive and not a git repo + if self.content_provider is not None: + yield from self.handle_archive_download() + + self.handle_branch_name() + if not os.path.exists(self.repo_dir): yield from self.initialize_repo() else: @@ -305,12 +355,18 @@ def main(): parser.add_argument('git_url', help='Url of the repo to sync') parser.add_argument('branch_name', default=None, help='Branch of repo to sync', nargs='?') parser.add_argument('repo_dir', default='.', help='Path to clone repo under', nargs='?') + parser.add_argument('content_provider', default=None, help='If downloading compressed archive instead of using git repo set this(e.g. dropbox, googledrive, generic_web)', nargs='?') + parser.add_argument('repo_parent_dir', default='.', help='Only used if downloading compressed archive, location of download', nargs='?') + parser.add_argument('other_kw_args', default=None, help='you can pass any keyword args you want as a dict{"arg1":"value1","arg2":"value2"} -- could be used in downloader plugins', nargs='?') args = parser.parse_args() for line in GitPuller( args.git_url, args.repo_dir, - branch=args.branch_name if args.branch_name else None + branch=args.branch_name if args.branch_name else None, + content_provider=args.content_provider if args.content_provider else None, + repo_parent_dir=args.repo_parent_dir if args.repo_parent_dir else None, + other_kw_args=args.other_kw_args if args.other_kw_args else None ).pull(): print(line) From 7590c382e2b66a9cb99bcf081a915e62aeefe242 Mon Sep 17 00:00:00 2001 From: Sean Morris Date: Wed, 19 Jan 2022 14:24:04 -0800 Subject: [PATCH 40/40] Access downloader-plugin results from plugin instance variable These changes reflect changes to the downloader-plugins that now encapsulate classes and store the handle_files outputs in an instance variable that we access from nbgitpuller when the downloader plugin is complete. This required the addition of a function to automatically register nbgitpuller-downloader-plugin classes with pluggy. --- nbgitpuller/pull.py | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/nbgitpuller/pull.py b/nbgitpuller/pull.py index 1bf81c9c..6ffc34c1 100644 --- a/nbgitpuller/pull.py +++ b/nbgitpuller/pull.py @@ -5,10 +5,12 @@ import argparse import datetime import pluggy +import importlib_metadata +import inspect from traitlets import Integer, default from traitlets.config import Configurable from functools import partial -from . 
import plugin_hook_specs +import plugin_hook_specs class ContentProviderException(Exception): @@ -20,6 +22,7 @@ class ContentProviderException(Exception): def __init__(self, response=None): self.response = response + def execute_cmd(cmd, **kwargs): """ Call given command, yielding output line by line @@ -56,6 +59,19 @@ def flush(): raise subprocess.CalledProcessError(ret, cmd) +def load_downloader_plugin_classes_from_entrypoints(group, content_provider): + for dist in list(importlib_metadata.distributions()): + for ep in dist.entry_points: + if ep.group == group: + plugin = ep.load() + for name, cls in inspect.getmembers(plugin, inspect.isclass): + if cls.__module__ == ep.value and ep.name == content_provider: + for fn_name, fn in inspect.getmembers(cls, inspect.isfunction): + if fn_name == "handle_files": + return cls + return None + + def setup_plugins(content_provider): """ This automatically searches for and loads packages whose entrypoint is nbgitpuller. If found, @@ -68,10 +84,13 @@ def setup_plugins(content_provider): """ plugin_manager = pluggy.PluginManager("nbgitpuller") plugin_manager.add_hookspecs(plugin_hook_specs) - num_loaded = plugin_manager.load_setuptools_entrypoints("nbgitpuller", name=content_provider) - if num_loaded == 0: + download_class = load_downloader_plugin_classes_from_entrypoints("nbgitpuller", content_provider) + downloader_obj = download_class() + #num_loaded = plugin_manager.load_setuptools_entrypoints("nbgitpuller", name=content_provider) + if download_class is None: raise ContentProviderException(f"The content_provider key you supplied in the URL could not be found: {content_provider}") - return plugin_manager + plugin_manager.register(downloader_obj) + return {"plugin_manager": plugin_manager, "downloader_obj": downloader_obj } class GitPuller(Configurable): @@ -163,10 +182,12 @@ def resolve_default_branch(self): def handle_archive_download(self): try: - plugin_manager = setup_plugins(self.content_provider) + plugin_info = setup_plugins(self.content_provider) + plugin_manager = plugin_info["plugin_manager"] + downloader_obj = plugin_info["downloader_obj"] other_kw_args = {k: v[0].decode() for k, v in self.other_kw_args} yield from plugin_manager.hook.handle_files(repo_parent_dir=self.repo_parent_dir,other_kw_args=other_kw_args) - results = other_kw_args["handle_files_output"] + results = downloader_obj.handle_files_results self.repo_dir = self.repo_parent_dir + results["output_dir"] self.git_url = "file://" + results["origin_repo_path"] except ContentProviderException as c: @@ -353,11 +374,11 @@ def main(): parser = argparse.ArgumentParser(description='Synchronizes a github repository with a local repository.') parser.add_argument('git_url', help='Url of the repo to sync') - parser.add_argument('branch_name', default=None, help='Branch of repo to sync', nargs='?') - parser.add_argument('repo_dir', default='.', help='Path to clone repo under', nargs='?') - parser.add_argument('content_provider', default=None, help='If downloading compressed archive instead of using git repo set this(e.g. 
dropbox, googledrive, generic_web)', nargs='?') - parser.add_argument('repo_parent_dir', default='.', help='Only used if downloading compressed archive, location of download', nargs='?') - parser.add_argument('other_kw_args', default=None, help='you can pass any keyword args you want as a dict{"arg1":"value1","arg2":"value2"} -- could be used in downloader plugins', nargs='?') + parser.add_argument('repo_dir', help='Path to clone repo under', nargs='?') + parser.add_argument('--branch_name', default=None, help='Branch of repo to sync', nargs='?') + parser.add_argument('--content_provider', default=None, help='If downloading compressed archive instead of using git repo set this(e.g. dropbox, googledrive, generic_web)', nargs='?') + parser.add_argument('--repo_parent_dir', default='.', help='Only used if downloading compressed archive, location of download', nargs='?') + parser.add_argument('--other_kw_args', default=None, help='you can pass any keyword args you want as a dict{"arg1":"value1","arg2":"value2"} -- could be used in downloader plugins', nargs='?') args = parser.parse_args() for line in GitPuller(