diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
new file mode 100644
index 0000000..0286cd0
--- /dev/null
+++ b/.github/workflows/codeql-analysis.yml
@@ -0,0 +1,70 @@
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ main, development ]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [ main, development ]
+  schedule:
+    - cron: '42 3 * * 5'
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ 'python' ]
+        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
+        # Learn more about CodeQL language support at https://git.io/codeql-language-support
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v2
+
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v1
+      with:
+        languages: ${{ matrix.language }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+        # queries: ./path/to/local/query, your-org/your-repo/queries@main
+
+    # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
+    # If this step fails, then you should remove it and run the build manually (see below)
+    - name: Autobuild
+      uses: github/codeql-action/autobuild@v1
+
+    # ℹī¸ Command-line programs to run using the OS shell.
+    # 📚 https://git.io/JvXDl
+
+    # ✏ī¸ If the Autobuild fails above, remove it and uncomment the following three lines
+    #    and modify them (or add more) to build your code if your project
+    #    uses a compiled language
+
+    #- run: |
+    #   make bootstrap
+    #   make release
+
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@v1
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 0000000..1715b1c
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,46 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Python package
+
+on:
+  push:
+    branches: [ main, development ]
+  pull_request:
+    branches: [ main, development ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.7", "3.8", "3.9"]
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        # Kerberos headers must be installed for kerberos support
+        sudo apt-get install -y libkrb5-dev
+        # Minio support
+        wget https://dl.min.io/server/minio/release/linux-amd64/minio
+        chmod +x minio
+        sudo mv minio /usr/bin/minio
+        python -m pip install --upgrade pip
+        python -m pip install flake8 pytest flit
+        flit install
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
new file mode 100644
index 0000000..3bfabfc
--- /dev/null
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,36 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
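+#
+# Note: the deploy job below runs when a release is published; it assumes a
+# PYPI_API_TOKEN repository secret, and it pins the publish action to a
+# commit SHA on purpose since the action is third-party.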
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+    - name: Build package
+      run: python -m build
+    - name: Publish package
+      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.github/workflows/uploadandscan.yml b/.github/workflows/uploadandscan.yml
index 30b9aba..fccb3df 100644
--- a/.github/workflows/uploadandscan.yml
+++ b/.github/workflows/uploadandscan.yml
@@ -4,7 +4,13 @@
 name: Vericode Scan
 
 on:
-  workflow_dispatch
+  push:
+    branches: [ main, development ]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [ main, development ]
+  schedule:
+    - cron: '42 3 * * 5'
 
 jobs:
   build:
diff --git a/README.md b/README.md
index 64290aa..9067d8f 100644
--- a/README.md
+++ b/README.md
@@ -105,7 +105,7 @@ credentials:
   # In Yaml the - indiciates this is a list element.
   # Any additional stanzas will start with a similar -
   - realm:
-      # LVFS uses realms to determine which set of credentials to use for each URL.
+      # LVFS uses realms to determine which set of credentials to use for each URL.
       # Every credential stanza gets one realm.
       # LVFS will use the first stanza that matches the URL.
       # In this case we match anything using HDFSOverSSH
@@ -134,7 +134,7 @@ processing within the cluster. In that case, configuration is a lot easier.
 ```yaml
 credentials:
   - realm:
-      # For historic reasons, all HDFS connections use the HDFSOverSSH connection class
+      # For historic reasons, all HDFS connections use the HDFSOverSSH connection class
       # because SSH will be disabled when you don't configure it here.
       classname: HDFSOverSSH
       # This is the Hadoop username; there is no jumpbox
@@ -207,11 +207,11 @@ At any rate, this is not been tested with LVFS.
 For later reference, these are the possible modes for HDFSOverSSH:
 
 ssh_jump_host | ssh_username | username | password | webhdfs_root | use case
---------------|--------------|----------|----------|--------------------------------------
-* | * | * | * | None | Invalid, unconfigured
-not None | None | * | * | * | Invalid
-None | not None | * | * | * | Invalid
-* | * | None | not None | * | Invalid
+------------- | ------------ | -------- | -------- | ------------ | --------
+(any) | (any) | (any) | (any) | None | Invalid, unconfigured
+not None | None | (any) | (any) | (any) | Invalid
+None | not None | (any) | (any) | (any) | Invalid
+(any) | (any) | None | not None | (any) | Invalid
 not None | not None | not None | None | not None | HDP2 with SSH
 None | None | not None | None | not None | HDP2 without SSH
 not None | not None | not None | not None | not None | HDP3+AD with SSH
diff --git a/lvfs/__init__.py b/lvfs/__init__.py
index 50f1324..b701a40 100644
--- a/lvfs/__init__.py
+++ b/lvfs/__init__.py
@@ -11,7 +11,7 @@
     * Artifactory, for model and code reading and writing
 """
 
-__version__ = "1.1.1"
+__version__ = "1.1.2"
 __all__ = ["URL"]
 import copyreg
 import urllib
diff --git a/lvfs/credentials.py b/lvfs/credentials.py
index b46de68..d0b3a7d 100644
--- a/lvfs/credentials.py
+++ b/lvfs/credentials.py
@@ -26,7 +26,7 @@ def init_register(cls, search_paths: List[Path] = None):
             location: List of Paths to try to read the configuration from (must be local)
 
             If location is not specified, the following locations will be searched:
-            * `./lvfs.conf`
+            * `./lvfs.yml`
             * `~/.config/lvfs.yml`
             * `/etc/creds/lvfs.yml`
             * `/etc/secret/lvfs.yml`
diff --git a/lvfs/minio.py b/lvfs/minio.py
index d5f1a70..482eebf 100644
--- a/lvfs/minio.py
+++ b/lvfs/minio.py
@@ -58,7 +58,7 @@ def __bucket(self, required=True):
             raise ValueError(f"No bucket specified for Minio URL {self}")
         return bucket
 
-    def __path_without_bucket(self, required=True):
+    def __path_without_bucket(self):
         """ Get the path without the bucket from this URL.
 
             Accepts
@@ -67,22 +67,18 @@ def __path_without_bucket(self, required=True):
         """
         path = self.path
         path = path[1:] if path.startswith("/") else path
-        path = path.split("/", 1)[1] if "/" in path else None
-        path = path or None
-        if required and not path:
-            raise ValueError(f"No path specified for Minio URL {self}")
-        return path
+        return path.split("/", 1)[1] if "/" in path else ""
 
     def __connect(self):
-        creds = self.__creds
-        if creds not in self.__clients:
-            self.__clients[creds] = minio.Minio(
+        host, creds = self.host, self.__creds
+        if (host, creds) not in self.__clients:
+            self.__clients[(host, creds)] = minio.Minio(
                 self.host,
                 access_key=creds[0],
                 secret_key=creds[1],
                 secure=creds[2]
             )
-        return self.__clients[creds]
+        return self.__clients[(host, creds)]
 
     @_wrap_error
     async def read_binary(self) -> bytes:
@@ -122,12 +118,12 @@ async def ls(self, recursive: bool = False) -> List[URL]:
         """ Get the list of files in this directory, if it is one
 
             Returns a list of URL objects. Results are always absolute.
-            *DO NOT `root.join(file)`*
         """
         bucket = self.__bucket()
         if bucket:
             prefix = self.__path_without_bucket()
-            prefix = prefix if prefix.endswith("/") else prefix + "/"
+            if prefix and not prefix.endswith("/"):
+                prefix = prefix + "/"
             return [
                 # These paths are relative to the bucket, but join()
                 # is relative to this prefix. So slice it off.
@@ -175,13 +171,38 @@ async def mkdir(self, ignore_if_exists: bool = False):
             ignore_if_exists: boolean: DEPRECATED
                 Included for backward compatibility.
                 Existing directories are always ignored.
         """
-        # This doesn't really exist
+        # This doesn't really exist, but it sorta does for buckets
+
+    @_wrap_error
+    async def make_bucket(self):
+        """ Create a bucket.
+
+            The path is not used and no folders are created.
+            For filesystems without buckets, this method has no effect.
+            In other filesystems you may need special permissions to create buckets.
+            Creating buckets programmatically may be unwise on account of billing.
+        """
+        self.__connect().make_bucket(self.__bucket())
 
     def supports_permissions(self) -> bool:
         """ Some implementations, like blobs, do not always support permissions,
             If this method returns true, the file system supports permissions
         """
         return False
+
+    def supports_directories(self) -> bool:
+        """ Return whether the protocol supports first-class directories.
+
+            Notes
+            -----
+            If the filesystem supports directories, then:
+            - mkdir() and isdir() have meaning
+            - mkdir() followed by isdir() should be True
+            Otherwise:
+            - mkdir() has no effect and isdir() degrades to best-effort,
+              which usually means it will only be True if the directory has content
+        """
+        return False
 
     @_wrap_error
     async def unlink(self, ignore_if_missing: bool = False):
diff --git a/lvfs/url.py b/lvfs/url.py
index 0dae2cc..1f45b35 100644
--- a/lvfs/url.py
+++ b/lvfs/url.py
@@ -111,9 +111,9 @@ def parse(self) -> urllib.parse.ParseResult:
             Returns a ParseResult. Example:
 
-            >>> urllib.parse.urlparse("derk://admin@uhhuh/local/thing;xyz?key=value&key2=value2#4")
+            >>> urllib.parse.urlparse("derk://admin@uhhuh:8080/local/thing;xyz?key=value&key2=value2#4")
             ParseResult(
-                scheme='derk', netloc='admin@uhhuh', path='/local/thing', params='xyz',
+                scheme='derk', netloc='admin@uhhuh:8080', path='/local/thing', params='xyz',
                 query='key=value&key2=value2', fragment='4'
             )
         """
 
@@ -223,6 +223,20 @@ async def write_binary(self, content: bytes, overwrite: bool = True):
             so with the specific implementations you plan to use.
         """
         raise NotImplementedError
+
+    def supports_directories(self) -> bool:
+        """ Return whether the protocol supports first-class directories.
+
+            Notes
+            -----
+            If the filesystem supports directories, then:
+            - mkdir() and isdir() have meaning
+            - mkdir() followed by isdir() should be True
+            Otherwise:
+            - mkdir() has no effect and isdir() degrades to best-effort,
+              which usually means it will only be True if the directory has content
+        """
+        return True
 
     @abstractmethod
     async def ls(self, recursive: bool = False):
@@ -446,12 +460,39 @@ def _as_numpy_column(num_rows, pa_schema, pa_col, decimal_as="float"):
         else:
             return pandas_obj
 
-    async def read_csv(self, *, recursive: bool = False) -> pd.DataFrame:
+    async def read_csv(self, *, recursive: bool = False, **pandas_args) -> pd.DataFrame:
         """ Read one or many csv files
-
-            If this is a directory, read all the csv files within it.
-            If recursive, read all csv descended from it ad infinitum
+
+            Accepts
+            -------
+            recursive: bool: Whether to read all CSV files within the directory
+            pandas_args: dict: any other arguments to pass to read_csv(), which is very flexible
+
+            Notes
+            -----
+            - The CSV serialization library may one day change (on a minor version bump), in which
+              case extensively customizing the serialization may incur tech debt
+            - Recursion is not supported when writing, so round-trips require you to write to
+              a specific file, not a whole directory!
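+
+            Example
+            -------
+            A minimal sketch of a round trip (this path is hypothetical):
+
+                await URL.to("/tmp/table.csv").write_csv(frame)
+                frame = await URL.to("/tmp/table.csv").read_csv()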
""" - return await self._read_file(pd.read_csv, recursive=recursive) + return await self._read_file(lambda f: pd.read_csv(f, **pandas_args), recursive=recursive) + + async def write_csv(self, frame: pd.DataFrame, **pandas_args): + """ Write exactly one CSV file (not a directory) + + Accepts + ------- + frame: Pandas Dataframe: the frame to write to a file + pandas_args: dict: any other arguments to pass to to_csv(), which is very flexible + + Notes + ----- + The CSV serialization library may one day change (on a minor version bump), in which + case extensively customizing the serialization may incur tech debt + """ + file_handle = io.BytesIO() + frame.to_csv(file_handle, **pandas_args) + await self.write_binary(file_handle.getbuffer()) async def read_parquet(self, *, recursive: bool = False) -> pd.DataFrame: """ Read one or many parquet files @@ -883,3 +924,20 @@ async def write_stream(self, gen): """ chunks = [chunk async for chunk in gen] await self.write_binary(b"".join(chunks)) + + async def make_bucket(self): + """ Make a bucket. + + Notes + ----- + The path is not used and no folders are created. + For filesystems without buckets, this method has no effect. + In other filesystems you may need special permissions to create buckets. + Creating buckets programmatically may be unwise on account of billing. + + Errors + ------ + Creating a bucket that already exists will fail with an error, + provided the filesystem supports buckets and can recognize the bucket exists. + """ + pass diff --git a/pyproject.toml b/pyproject.toml index a49e1f2..fb0c4fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ author-email = " lowesopensource@lowes.com" home-page = "https://github.com/lowes" classifiers = ["License :: OSI Approved :: MIT License"] description-file = "README.md" -requires-python = ">=3.6" +requires-python = ">=3.7" requires = [ "pandas ~= 1.0", "pyyaml ~= 5.3", diff --git a/requirements.txt b/requirements.txt index 1bf327b..671bb0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,10 +20,10 @@ hdfs==2.5.8 idna==2.10 importlib-metadata==2.0.0 iniconfig==1.1.1 -minio==6.0.0 -numpy==1.19.2 +minio >= 6.0.0 +numpy >= 1.19.2 packaging==20.4 -pandas==1.1.3 +pandas >= 1.1.3 paramiko==2.7.2 pluggy==0.13.1 protobuf==3.13.0 @@ -44,11 +44,11 @@ pytest-timeout==1.4.2 python-dateutil==2.8.1 pytz==2020.1 PyYAML==5.4 -requests==2.24.0 +requests >= 2.24.0 rsa==4.7 six==1.15.0 toml==0.10.1 -urllib3==1.25.11 +urllib3 >= 1.26.0 zipp==3.3.1 keyring==21.5.0 hvac==0.10.5 diff --git a/test-installation.dockerfile b/test-installation.dockerfile deleted file mode 100644 index cd20a99..0000000 --- a/test-installation.dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM python:3.7 AS starter -RUN useradd -d /home/nemo -m nemo -WORKDIR /home/nemo/lvfs -RUN pip install flit -COPY lvfs /home/nemo/lvfs/lvfs -COPY README.md pyproject.toml /home/nemo/lvfs/ -RUN chown -R nemo /home/nemo -USER nemo -# Normally you don't override PATH because your normally don't use virtualenv inside docker -# But this is not a normal script. -ENV PATH /home/nemo/.local/bin:/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin - - -FROM starter AS wheel-build -RUN python3 -m venv .venv \ - && . .venv/bin/activate \ - && flit build - -FROM starter AS symlink-install -RUN python3 -m venv .venv \ - && . .venv/bin/activate \ - && flit install -s - -FROM starter AS normal-install -RUN python3 -m venv .venv \ - && . 
-    && flit install
-
-FROM starter as novenv-install
-RUN flit install
\ No newline at end of file
diff --git a/test-installation.sh b/test-installation.sh
deleted file mode 100755
index 1f8dfcd..0000000
--- a/test-installation.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-set -e
-docker build -t lvfs-test-installation -f test-installation.dockerfile .
\ No newline at end of file
diff --git a/test.dockerfile b/test.dockerfile
new file mode 100644
index 0000000..a856ec4
--- /dev/null
+++ b/test.dockerfile
@@ -0,0 +1,14 @@
+FROM python:3.9
+RUN mkdir -p /app/lvfs \
+&& pip install -U pip \
+&& pip install flit \
+&& wget "https://dl.min.io/server/minio/release/linux-amd64/minio" \
+&& chmod 755 minio \
+&& mv minio /usr/bin/minio
+WORKDIR /app
+COPY lvfs/__init__.py /app/lvfs/__init__.py
+COPY README.md pyproject.toml /app/
+RUN FLIT_ROOT_INSTALL=1 flit install -s
+
+COPY lvfs /app/lvfs
+COPY tests /app/tests
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..75cea37
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+set -e
+docker build -t test_lvfs -f test.dockerfile .
+docker run --rm test_lvfs python3 -m pytest
\ No newline at end of file
diff --git a/tests/test_generic.py b/tests/test_generic.py
new file mode 100644
index 0000000..8951cb0
--- /dev/null
+++ b/tests/test_generic.py
@@ -0,0 +1,121 @@
+from lvfs import URL
+from tests.test_minio import ensure_minio_is_running
+import pytest
+import pandas as pd
+
+argnames = "home,init"
+homes = [
+    (URL.to("/tmp/lvfs-test"), lambda home: home.mkdir()),
+    (URL.to("s3://localhost:9000/default"), lambda home: ensure_minio_is_running())
+]
+home_names = ["local", "minio"]
+
+@pytest.mark.parametrize(argnames, homes, ids=home_names)
+@pytest.mark.asyncio
+async def test_write_dicts(home: URL, init):
+    await init(home)
+    path = home.join("tmp")
+    for get, put in [
+        (path.read_json, path.write_json),
+        (path.read_yaml, path.write_yaml),
+        (path.read_pickle, path.write_pickle)
+    ]:
+        await put({"second_best_animal": "puppies"})
+        assert (await get())['second_best_animal'] == 'puppies'
+
+@pytest.mark.parametrize(argnames, homes, ids=home_names)
+@pytest.mark.asyncio
+async def test_write_tables(home: URL, init):
+    await init(home)
+    path = home.join("tmp")
+    df = pd.DataFrame({'a': ['Gwen', 'once', 'exclaimed', 'that', 'this', 'is', 'bananas'],
+                       'b': ['b', 'a', 'n', 'a', 'n', 'a', 's']})
+    for get, put in [
+        (path.read_parquet, path.write_parquet),
+        (path.read_orc, path.write_orc),
+        (path.read_csv, path.write_csv)
+    ]:
+        await put(df)
+        assert (await get())['b'].values.tolist() == list('bananas')
+
+@pytest.mark.parametrize(argnames, homes, ids=home_names)
+@pytest.mark.asyncio
+async def test_cp(home: URL, init):
+    await init(home)
+    the_truth = "There are three kinds of lies: lies, damned lies, and statistics."
+    home = home.join("test-cp")
+    await home.mkdir()
+    path_1 = home.join("tmp_1.txt")
+    path_2 = home.join("tmp_2.txt")
+    await path_1.write_text(the_truth)
+    await path_1.cp(path_2)
+
+    # These should be the only things in the directory
+    filenames = await home.ls()
+    assert len(filenames) == 2
+    assert path_1 in filenames
+    assert path_2 in filenames
+
+    # File content should agree
+    assert (await path_1.read_text()) == (await path_2.read_text())
+
+@pytest.mark.parametrize(argnames, homes, ids=home_names)
+@pytest.mark.asyncio
+async def test_mv(home: URL, init):
+    await init(home)
+    the_truth = "There are three kinds of lies: lies, damned lies, and statistics."
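+    # mv should act like a rename: the destination receives the content and the
+    # source ceases to exist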
+    home = home.join("test-mv")
+    await home.mkdir()
+    path_1 = home.join("tmp_1.txt")
+    path_2 = home.join("tmp_2.txt")
+    # Writing makes the file
+    assert not await path_1.exists()
+    await path_1.write_text(the_truth)
+    assert await path_1.exists()
+
+    # Then moving creates a new file and deletes the old
+    assert not await path_2.exists()
+    await path_1.mv(path_2)
+    assert not await path_1.exists()
+    assert await path_2.exists()
+
+    # The filenames should agree
+    filenames = await home.ls()
+    assert len(filenames) == 1
+    assert path_1 not in filenames
+    assert path_2 in filenames
+
+    # And lastly the content should match
+    assert the_truth == await path_2.read_text()
+
+@pytest.mark.parametrize(argnames, homes, ids=home_names)
+@pytest.mark.asyncio
+async def test_directories(home: URL, init):
+    await init(home)
+    item = home.join("test-directory")
+
+    # Should list correctly
+    await item.write_text("nothing")
+    assert item in await home.ls()
+
+    # Now delete the item, and it should be gone
+    await item.rm()
+    assert item not in await home.ls()
+
+    # Cool. Make a new directory
+    if home.supports_directories():
+        await item.mkdir()
+        assert item in await home.ls()
+        assert await item.isdir()
+
+    # Even if directories are not supported, its content will be empty
+    assert [] == await item.ls()
+
+    # Make sure you can use the directory too
+    await item.join("x").write_text("bogus")
+    # And at this point it should always list content
+    assert item.join("x") in await item.ls()
+
+    # Should be able to recursively delete even if directories are not supported
+    await item.rm()
+    assert item not in await home.ls()
\ No newline at end of file
diff --git a/tests/test_installation_methods.sh b/tests/test_installation_methods.sh
new file mode 100755
index 0000000..fae6a80
--- /dev/null
+++ b/tests/test_installation_methods.sh
@@ -0,0 +1,32 @@
+#!/bin/sh
+set -e
+
+# First, build a wheel
+(
+    python3 -m venv .venv \
+    && . .venv/bin/activate \
+    && flit build \
+    && py.test
+)
+
+# Next, try a symlink installation
+(
+    python3 -m venv .venv \
+    && . .venv/bin/activate \
+    && flit install -s \
+    && py.test
+)
+
+# Now try a normal installation
+(
+    python3 -m venv .venv \
+    && . .venv/bin/activate \
+    && flit install \
+    && py.test
+)
+
+# Last, try to install without a virtual environment
+(
+    flit install \
+    && py.test
+)
\ No newline at end of file
diff --git a/tests/test_local.py b/tests/test_local.py
new file mode 100644
index 0000000..c8a052c
--- /dev/null
+++ b/tests/test_local.py
@@ -0,0 +1,47 @@
+import os
+import warnings
+import pytest
+from lvfs import URL
+from tests import data_dir
+from tempfile import TemporaryDirectory
+
+
+@pytest.mark.asyncio
+async def test_read_json():
+    x = await URL.to(os.path.join(data_dir, "test_json.json")).read_json()
+    assert x['best_animal'] == 'kittens'
+
+@pytest.mark.asyncio
+async def test_read_yaml():
+    x = await URL.to(os.path.join(data_dir, "test_yaml.yml")).read_yaml()
+    assert all([v in ['do', 're', 'mi', 'fa'] for v in x.keys() if v != 'other notes'])
+    assert sorted(x['other notes']) == ['do', 'la', 'so', 'ti']
+
+@pytest.mark.asyncio
+async def test_read_pickle():
+    x = await URL.to(os.path.join(data_dir, "test_pkl.pkl")).read_pickle()
+    assert len(x) == 6
+    assert 'cucumbers' in x
+
+@pytest.mark.asyncio
+async def test_read_text():
+    x = await URL.to(os.path.join(data_dir, "dummy.txt")).read_text()
+    assert x == 'this file exists as a placeholder for the data directory.'
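+
+# Note: URL.to() resolves bare filesystem paths like the ones above to the
+# local backend, so these tests need no scheme prefix.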
+
+@pytest.mark.asyncio
+async def test_write_text():
+    the_truth = "There are three kinds of lies: lies, damned lies, and statistics."
+    with TemporaryDirectory() as tmpdir:
+        file_path = os.path.join(tmpdir, "tmp.txt")
+        await URL.to(file_path).write_text(the_truth)
+        x = await URL.to(file_path).read_text()
+        assert x == the_truth
+
+@pytest.mark.asyncio
+async def test_read_parquet():
+    warnings.simplefilter(action="ignore", category=DeprecationWarning)
+    warnings.simplefilter(action="ignore", category=FutureWarning)
+    x = await URL.to(os.path.join(data_dir, "test_parquet.pq")).read_parquet()
+    assert x.to_json() == '{"x":{"0":1,"1":2,"2":3},"y":{"0":4,"1":5,"2":6}}'
diff --git a/tests/test_minio.py b/tests/test_minio.py
new file mode 100644
index 0000000..db44997
--- /dev/null
+++ b/tests/test_minio.py
@@ -0,0 +1,52 @@
+from pathlib import Path
+import subprocess
+from lvfs import URL
+from lvfs.credentials import Credentials
+import pytest
+import logging
+import os
+import secrets
+
+MINHOME = URL.to("s3://localhost:9000/default")
+
+MINIO_PROCESSES = []
+async def ensure_minio_is_running():
+    if not MINIO_PROCESSES:
+        # Populate test credentials
+        access_key = secrets.token_hex(16)
+        secret_key = secrets.token_hex(16)
+        Credentials.register(dict(access_key=access_key, secret_key=secret_key), "Minio")
+
+        logging.info("Starting Minio server")
+        proc = subprocess.Popen(
+            ["/usr/bin/minio", "server", Path.cwd().joinpath("tests/data").as_posix()],
+            env=dict(MINIO_ROOT_USER=access_key, MINIO_ROOT_PASSWORD=secret_key, **os.environ)
+        )
+        try:
+            proc.wait(3)
+        except subprocess.TimeoutExpired:
+            # Good! It's still running
+            MINIO_PROCESSES.append(proc)
+            await MINHOME.make_bucket()
+        else:
+            raise RuntimeError("Minio failed to run")
+
+@pytest.mark.asyncio
+async def test_can_start_minio():
+    await ensure_minio_is_running()
+
+@pytest.mark.asyncio
+@pytest.mark.xfail
+async def test_create_duplicate_bucket():
+    # This already creates the default bucket
+    await ensure_minio_is_running()
+    # This checks that creating it again fails
+    await MINHOME.make_bucket()
+
+@pytest.mark.asyncio
+async def test_minio_json():
+    await ensure_minio_is_running()
+    basicjson = MINHOME.join("example.json")
+    await basicjson.write_json({"key": "value"})
+    assert (await basicjson.read_json()) == {"key": "value"}
\ No newline at end of file
diff --git a/tests/test_url_local.py b/tests/test_url_local.py
deleted file mode 100644
index 0d4325e..0000000
--- a/tests/test_url_local.py
+++ /dev/null
@@ -1,141 +0,0 @@
-import os
-import pandas as pd
-import warnings
-import asyncio
-import pytest
-from lvfs import URL
-from tests import data_dir
-from tempfile import TemporaryDirectory
-
-
-@pytest.mark.asyncio
-async def test_read_json():
-    x = await URL.to(os.path.join(data_dir, "test_json.json")).read_json()
-    assert x['best_animal'] == 'kittens'
-
-@pytest.mark.asyncio
-async def test_write_json():
-    with TemporaryDirectory() as tmpdir:
-        file_path = os.path.join(tmpdir, "tmp.json")
-        await URL.to(file_path).write_json({"second_best_animal": "puppies"})
-        x = await URL.to(file_path).read_json()
-        assert x['second_best_animal'] == 'puppies'
-
-@pytest.mark.asyncio
-async def test_read_yaml():
-    x = await URL.to(os.path.join(data_dir, "test_yaml.yml")).read_yaml()
-    assert all([v in ['do', 're', 'mi', 'fa'] for v in x.keys() if v != 'other notes'])
-    assert sorted(x['other notes']) == ['do', 'la', 'so', 'ti']
-
-@pytest.mark.asyncio
-async def test_write_yaml():
-    with TemporaryDirectory() as tmpdir:
-        file_path = os.path.join(tmpdir, "tmp.yml")
-        await URL.to(file_path).write_yaml({"second_best_animal": "puppies"})
-        x = await URL.to(file_path).read_yaml()
-        assert x['second_best_animal'] == 'puppies'
-
-@pytest.mark.asyncio
-async def test_read_pickle():
-    x = await URL.to(os.path.join(data_dir, "test_pkl.pkl")).read_pickle()
-    assert len(x) == 6
-    assert 'cucumbers' in x
-
-@pytest.mark.asyncio
-async def test_write_pickle():
-    with TemporaryDirectory() as tmpdir:
-        file_path = os.path.join(tmpdir, "tmp.pkl")
-        await URL.to(file_path).write_pickle({"where its at": "two turn tables"})
-        x = await URL.to(file_path).read_pickle()
-        assert x['where its at'] == 'two turn tables'
-
-@pytest.mark.asyncio
-async def test_read_text():
-    x = await URL.to(os.path.join(data_dir, "dummy.txt")).read_text()
-    assert x == 'this file exists as a placeholder for the data directory.'
-
-@pytest.mark.asyncio
-async def test_write_text():
-    the_truth = "There are three kinds of lies: lies, damned lies, and statistics."
-    with TemporaryDirectory() as tmpdir:
-        file_path = os.path.join(tmpdir, "tmp.txt")
-        await URL.to(file_path).write_text(the_truth)
-        x = await URL.to(file_path).read_text()
-        assert x == the_truth
-
-@pytest.mark.asyncio
-async def test_read_parquet():
-    warnings.simplefilter(action="ignore", category=DeprecationWarning)
-    warnings.simplefilter(action="ignore", category=FutureWarning)
-    x = await URL.to(os.path.join(data_dir, "test_parquet.pq")).read_parquet()
-    assert x.to_json() == '{"x":{"0":1,"1":2,"2":3},"y":{"0":4,"1":5,"2":6}}'
-
-@pytest.mark.asyncio
-async def test_write_parquet():
-    warnings.simplefilter(action="ignore", category=DeprecationWarning)
-    warnings.simplefilter(action="ignore", category=FutureWarning)
-    with TemporaryDirectory() as tmpdir:
-        file_path = os.path.join(tmpdir, "tmp.pq")
-        pd.DataFrame({'a': ['Gwen', 'once', 'exclaimed', 'that', 'this', 'is', 'bananas'],
-                      'b': ['b', 'a', 'n', 'a', 'n', 'a', 's']})\
-            .to_parquet(file_path)
-        x = await URL.to(file_path).read_parquet()
-        assert all(x['b'].values == ['b', 'a', 'n', 'a', 'n', 'a', 's'])
-
-@pytest.mark.asyncio
-async def test_ls():
-    the_truth = "There are three kinds of lies: lies, damned lies, and statistics."
-    with TemporaryDirectory() as tmpdir:
-        file_path = os.path.join(tmpdir, "tmp.txt")
-        await URL.to(file_path).write_text(the_truth)
-        x = list(await URL.to(tmpdir).ls())
-        assert len(x) == 1
-        assert x[0].path == file_path
-
-@pytest.mark.asyncio
-async def test_cp():
-    the_truth = "There are three kinds of lies: lies, damned lies, and statistics."
-    with TemporaryDirectory() as tmpdir:
-        file_path_1 = os.path.join(tmpdir, "tmp_1.txt")
-        file_path_2 = os.path.join(tmpdir, "tmp_2.txt")
-        await URL.to(file_path_1).write_text(the_truth)
-        await URL.to(file_path_1).cp(file_path_2)
-        x = list(await URL.to(tmpdir).ls())
-        t_1 = await URL.to(file_path_1).read_text()
-        t_2 = await URL.to(file_path_2).read_text()
-        assert len(x) == 2
-        assert file_path_1 in x
-        assert file_path_2 in x
-        assert t_1 == t_2
-
-@pytest.mark.asyncio
-async def test_rm():
-    the_truth = "There are three kinds of lies: lies, damned lies, and statistics."
-    with TemporaryDirectory() as tmpdir:
-        file_path = os.path.join(tmpdir, "tmp.txt")
-        await URL.to(file_path).write_text(the_truth)
-        await URL.to(file_path).rm()
-        x = list(await URL.to(tmpdir).ls())
-        assert len(x) == 0
-
-@pytest.mark.asyncio
-async def test_mv():
-    the_truth = "There are three kinds of lies: lies, damned lies, and statistics."
-    with TemporaryDirectory() as tmpdir:
-        file_path_1 = os.path.join(tmpdir, "tmp_1.txt")
-        file_path_2 = os.path.join(tmpdir, "tmp_2.txt")
-        await URL.to(file_path_1).write_text(the_truth)
-        t_1 = await URL.to(file_path_1).read_text()
-        await URL.to(file_path_1).mv(file_path_2)
-        x = list(await URL.to(tmpdir).ls())
-        t_2 = await URL.to(file_path_2).read_text()
-        assert len(x) == 1
-        assert file_path_1 not in x
-        assert file_path_2 in x
-        assert t_1 == t_2
-
-@pytest.mark.asyncio
-async def test_isdir():
-    with TemporaryDirectory() as tmpdir:
-        assert await URL.to(tmpdir).isdir()
-        assert not await URL.to(f"{tmpdir}_this_shouldnt_exist").isdir()