Skip to content

Commit

Permalink
v1.6.6 - support for S3 buckets as data sources
Browse files Browse the repository at this point in the history
  • Loading branch information
evilkermit committed Oct 27, 2022
1 parent b98a4a6 commit 3a926d9
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 17 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,6 @@ RUN unzip awscliv2.zip
RUN aws/install
RUN npm install aws-cdk aws-cdk-lib

RUN pip install seelabutk-substrate==1.5.5
RUN pip install seelabutk-substrate==1.6.6

ENV PATH "${PATH}:/root/.nvm/versions/node/v16.17.1/bin"
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ entry-points = {console_scripts = {substrate = "substrate.substrate:main"}}
license = {text = "CC-BY-4.0"}
readme = "README.md"
requires-python = ">=3.7"
version = "1.5.5"
version = "1.6.6"

[project.urls]
Homepage = "https://github.com/seelabutk/substrate/"
Expand Down
59 changes: 46 additions & 13 deletions src/substrate/substrate.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env python3
from argparse import ArgumentParser
import os
import subprocess
import sys
from urllib.request import urlretrieve
from urllib.parse import urlparse

import yaml
Expand All @@ -23,6 +23,7 @@ class Substrate():
def __init__(self, tool_name, path=None):
self.tool_name = tool_name
self.path, self.config = self._parse_yaml(path)
self._check_config()

self.target = None
if self.config.get('aws', None) is not None:
Expand All @@ -37,24 +38,59 @@ def __init__(self, tool_name, path=None):

self.target_obj = self.target(self.path, self.config, self.tool)

def _check_config(self):
    """Validate the parsed configuration before a deployment target is used.

    Exactly one of the top-level ``aws`` / ``docker`` options must be
    present. For ``aws`` the AWS credential environment variables must be
    set to a non-empty value; for ``docker`` the ``docker info`` command
    must succeed.

    Raises:
        Exception: if no target or both targets are configured, if a
            required AWS environment variable is missing or empty, or if
            ``docker info`` cannot be run successfully.
    """
    # yaml.load returns None for an empty document; treat that as "no
    # options given" so the user gets the message below rather than a
    # TypeError from the membership tests.
    config = self.config or {}

    wants_aws = 'aws' in config
    wants_docker = 'docker' in config

    if not wants_aws and not wants_docker:
        raise Exception(
            'No deployment target was chosen. Please choose either "aws" or "docker".'
        )

    if wants_aws and wants_docker:
        raise Exception(
            'The "aws" and "docker" options cannot be used simultaneously.'
        )

    if wants_aws:
        # Checked in this order so the first missing variable is reported.
        for variable in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
            if not os.environ.get(variable):
                raise Exception(
                    f'{variable} environment variable must be set to deploy to AWS.'
                )
        return

    # Only the docker target remains at this point.
    try:
        # Argument list, no shell: nothing here needs shell parsing.
        subprocess.check_call(
            ['docker', 'info'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL
        )
    except (subprocess.CalledProcessError, OSError) as exc:
        # Without a shell, a missing or non-executable docker binary shows
        # up as OSError (e.g. FileNotFoundError) instead of a non-zero
        # exit status, so both are mapped to the same message.
        raise Exception(
            "Docker doesn't appear to be running. Please check that it's installed "
            "and the daemon is running."
        ) from exc

def _get_data(self, config):
source_paths = config['data']['source']
data_paths = []
data_urls = []

for source_path in source_paths:
is_url = urlparse(source_path).scheme.startswith(('ftp', 'http'))
is_url = urlparse(source_path).scheme.startswith(('ftp', 'http', 's3'))

# Download the dataset if necessary to the target location
if is_url and self.target == DockerSwarm:
target_path = os.path.abspath(config['data']['target'])

os.makedirs(os.path.dirname(target_path), exist_ok=True)
urlretrieve(source_path, target_path)
raise Exception(
'You cannot use remote data sources with a local deployment. Please '
'download the dataset and reference it locally in data.source.'
)

if target_path not in data_paths:
data_paths.append(target_path)
elif is_url:
if is_url:
data_urls.append(source_path)
else:
data_paths.append(os.path.abspath(source_path))
Expand All @@ -79,9 +115,6 @@ def _parse_yaml(self, path):
with open(path, 'r', encoding='utf8') as stream:
_config = yaml.load(stream, Loader=yaml.Loader)

if _config.get('docker', None) and _config.get('aws', None):
sys.exit('The "docker" and "aws" options cannot be used simultaneously.')

return (path, _config)

def start(self):
Expand Down
15 changes: 13 additions & 2 deletions src/substrate/targets/aws_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def start(self):
while True:
print('Checking if AWS instance is ready…', end='')
try:
response = requests.get(f'http://{location}')
response = requests.get(f'http://{location}', timeout=10)
if response.status_code == 200:
break
except requests.exceptions.ConnectionError:
Expand Down Expand Up @@ -83,11 +83,21 @@ class _AWSStack(Stack): # pylint: disable=too-many-instance-attributes
def __init__(self, scope, _id, tool, config, data_urls, **kwargs):
self.tool = tool
self.config = config
self.data_urls = data_urls

self.leader_name = ''
self.nodes = []
self.use_https = self.config['aws'].get('https', False)

self.data_urls = [
data_url for data_url in data_urls if not data_url.startswith('s3')
]
self.data_buckets = [
data_url.split('|') for data_url in data_urls if data_url.startswith('s3')
]
for bucket in self.data_buckets:
if len(bucket) < 2:
bucket.append('*')

self.tool.upload_to_s3()

super().__init__(scope, _id, **kwargs)
Expand Down Expand Up @@ -165,6 +175,7 @@ def get_udata(self, _type):
f'aws s3 sync s3://{self.config["aws"]["bucket"]} /mnt/efs',
'mkdir -p /mnt/efs/data',
'cd /mnt/efs/data',
*[f'aws s3 sync {data_bucket[0]} /mnt/efs/data --exclude "*" --include "{data_bucket[1]}"' for data_bucket in self.data_buckets], # noqa: E501
*[f'curl -O {data_url}' for data_url in self.data_urls],
'cd -'
)
Expand Down

0 comments on commit 3a926d9

Please sign in to comment.