Skip to content

Commit

Permalink
Allow multiple groups for user definition
Browse files Browse the repository at this point in the history
  • Loading branch information
thomas-vogels committed Nov 1, 2021
1 parent ff28a0b commit 58cdcf9
Show file tree
Hide file tree
Showing 10 changed files with 303 additions and 172 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ RUN python3 -m venv "/opt/local/$PROJ_NAME/venv" && \
python3 -m pip install --upgrade pip==20.3.4 --disable-pip-version-check --no-cache-dir && \
python3 -m pip install --requirement /tmp/requirements-dev.txt --disable-pip-version-check --no-cache-dir

# Create an empty .pgpass file to help with create_user and update_user commands.
RUN echo '# Format to set password (used by create_user and update_user): *:5439:*:<user>:<password>' > /home/arthur/.pgpass \
# Create an empty .pgpass file to help with update_user command.
RUN echo '# Format to set password (used by update_user): *:5439:*:<user>:<password>' > /home/arthur/.pgpass \
&& chmod go= /home/arthur/.pgpass

# Note that at runtime we (can or may) mount the local directory here.
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ you can simply use `../arthur-redshift-etl/` to find your way back to this ETL c

Although the Redshift cluster can be administered using the AWS console and `psql`, some
helper scripts will make setting up the cluster consistently much easier.
(See below for `initialize` and `create_user`.)
(See below for `initialize`, `create_groups`, and `create_users`.)

Also, add the AWS IAM role that the database owner may assume within Redshift
to your settings file so that Redshift has the needed permissions to access the
Expand Down Expand Up @@ -163,7 +163,8 @@ Don't forget to run `terminate_emr_cluster.sh` when you're done.
| Sub-command | Goal |
| ---- | ---- |
| `initialize` | Create schemas, groups and users |
| `create_user` | Create (or configure) users that are not mentioned in the configuration file |
| `create_groups` | Create groups that are mentioned in the configuration file |
| `create_users` | Create users that are mentioned in the configuration file |

```shell
# The commands to set up the data warehouse users and groups or any database are run by ADMIN (connected to `dev`)
Expand Down
3 changes: 2 additions & 1 deletion etc/arthur_completion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,14 @@ _arthur_completion()
create_groups
create_index
create_schemas
create_user
create_users
delete_finished_pipelines
design
explain
extract
help
initialize
list_users
load
ls
ping
Expand Down
68 changes: 44 additions & 24 deletions python/etl/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,8 @@ def build_full_parser(prog_name):
InitializeSetupCommand,
ShowRandomPassword,
CreateGroupsCommand,
CreateUserCommand,
CreateUsersCommand,
ListUsersCommand,
UpdateUserCommand,
RunSqlCommand,
# Commands to help with table designs and uploading them
Expand Down Expand Up @@ -671,7 +672,7 @@ def __init__(self):
"Make sure that all groups mentioned in the configuration file actually exist."
" (This allows to specify a group (as reader or writer) on a schema when that"
" group does not appear with a user and thus may not have been previously"
" created using a 'create_user' call.)",
" created with 'create_users'.)",
)

def add_arguments(self, parser):
Expand All @@ -682,42 +683,59 @@ def callback(self, args):
etl.data_warehouse.create_groups(dry_run=args.dry_run)


class CreateUserCommand(SubCommand):
class CreateUsersCommand(SubCommand):
def __init__(self):
super().__init__(
"create_user",
"add new user",
"Add new user and set group membership, optionally create a personal schema."
"create_users",
"add users to cluster",
"Add users to cluster and set group membership."
" It is ok to re-initialize a user defined in a settings file."
" Note that you have to set a password for the user in your '~/.pgpass' file"
" before invoking this command. The password must be valid in Redshift,"
" so must contain upper-case and lower-case characters as well as numbers.",
" This will add them to any new groups."
" NOTE we currently do not remove users from groups.",
# Old command name that we want to phase out:
aliases=["create_user"],
)

def add_arguments(self, parser):
add_standard_arguments(parser, ["dry-run"])
parser.add_argument("-g", "--group", help="add user to specified group")
parser.add_argument("-g", "--group", help="DEPRECATED (specify group in configuration file)")
parser.add_argument(
"-a", "--add-user-schema", help="add new schema, writable for the user", action="store_true"
"-a", "--add-user-schema", help="DEPRECATED (use 'update_user' instead)", action="store_true"
)
parser.add_argument("username", help="name for new user")
parser.add_argument("name", help="name of user", nargs="*")

def callback(self, args):
if args.group:
logger.warning("Ignoring specified group, using configuration instead")
if args.add_user_schema:
logger.warning("Ignoring request to add user schema, use 'update_user' instead.")

with etl.db.log_error():
etl.data_warehouse.create_new_user(
args.username,
group=args.group,
add_user_schema=args.add_user_schema,
dry_run=args.dry_run,
)
etl.data_warehouse.create_users(args.name, dry_run=args.dry_run)


class ListUsersCommand(SubCommand):
    """Sub-command 'list_users': list the users and their groups as configured."""

    def __init__(self):
        super().__init__(
            "list_users",
            "list users as they are configured",
            "List all users and their groups in the way that they are configured.",
        )

    def add_arguments(self, parser):
        """Register the '-t/--transpose' flag on the given argument parser."""
        parser.add_argument("-t", "--transpose", help="group list by user's groups", action="store_true")

    def callback(self, args):
        """Delegate to etl.data_warehouse.list_users, passing along the transpose flag."""
        with etl.db.log_error():
            etl.data_warehouse.list_users(transpose=args.transpose)


class UpdateUserCommand(SubCommand):
def __init__(self):
super().__init__(
"update_user",
"update user's group, password, and path",
"For an existing user, update group membership, password, and search path."
"update user's password, their schema and search path",
"For an existing user, update password, create a schema, and update the search path."
" Note that you have to set a password for the user in your '~/.pgpass' file"
" before invoking this command if you want to update the password. The password must"
" be valid in Redshift, so must contain upper-case and lower-case characters as well"
Expand All @@ -726,17 +744,19 @@ def __init__(self):

def add_arguments(self, parser):
add_standard_arguments(parser, ["dry-run"])
parser.add_argument("-g", "--group", help="add user to specified group")
parser.add_argument("-g", "--group", help="DEPRECATED (specify group in configuration file)")
parser.add_argument(
"-a", "--add-user-schema", help="add new schema, writable for the user", action="store_true"
)
parser.add_argument("username", help="name of existing user")
parser.add_argument("name", help="name of user")

def callback(self, args):
if args.group:
logger.warning("Ignoring specified group, using configuration instead")

with etl.db.log_error():
etl.data_warehouse.update_user(
args.username,
group=args.group,
args.name,
add_user_schema=args.add_user_schema,
dry_run=args.dry_run,
)
Expand Down
5 changes: 2 additions & 3 deletions python/etl/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,8 @@ def set_safe_config_value(name: str, value: str) -> None:
def get_config_map() -> Dict[str, str]:
if _mapped_config is None:
return {}
else:
# Since the mapped config is flattened, we don't worry about a deep copy here.
return dict(_mapped_config)
# Since the mapped config is flattened, we don't worry about a deep copy here.
return dict(_mapped_config)


def _flatten_hierarchy(prefix, props):
Expand Down
2 changes: 1 addition & 1 deletion python/etl/config/default_settings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
{
# Default group specified as group of pseudo-user "default"
"name": "default",
"group": "analyst_ro"
"group": "analyst"
}
]
},
Expand Down
111 changes: 65 additions & 46 deletions python/etl/config/dw.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Data warehouse configuration based on config files for setup, sources, transformations, users."""

from typing import Dict
from typing import Dict, List

import etl.config.env
import etl.db
Expand All @@ -11,16 +11,20 @@

class DataWarehouseUser:
"""
Data warehouse users have always a name and group associated with them.
Data warehouse users have always a name and a list of groups associated with them.
Users may have a schema "belong" to them which they then have write access to.
This is useful for system users, mostly, since end users should treat the
data warehouse as read-only.
Users may have a schema "belong" to them which they then have write access to. This is useful
for system users, mostly, since end users should treat the data warehouse as read-only.
"""

def __init__(self, user_info):
self.name = user_info["name"]
self.group = user_info["group"]
if "group" in user_info:
self.groups = [user_info["group"]]
elif "groups" in user_info:
self.groups = user_info["groups"]
else:
self.groups = []
self.schema = user_info.get("schema")


Expand Down Expand Up @@ -155,79 +159,94 @@ def dsn(self):
return etl.db.parse_connection_string(etl.config.env.get(self._dsn_env_var))

@property
def groups(self):
def groups(self) -> List[str]:
return self.reader_groups + self.writer_groups

@property
def backup_name(self):
def backup_name(self) -> str:
return etl.names.as_backup_name(self.name)

@property
def staging_name(self):
def staging_name(self) -> str:
return etl.names.as_staging_name(self.name)


class DataWarehouseConfig:
"""Pretty interface to create objects from the settings files."""

def __init__(self, settings):
def __init__(self, settings) -> None:
dw_settings = settings["data_warehouse"]
schema_settings = settings.get("sources", []) + dw_settings.get("transformations", [])

# Environment variables with DSN
self._admin_access = dw_settings["admin_access"]
self._etl_access = dw_settings["etl_access"]
self._check_access_to_cluster()
root = DataWarehouseUser(dw_settings["owner"])
# Users are in the order from the config
other_users = [
DataWarehouseUser(user) for user in dw_settings.get("users", []) if user["name"] != "default"
]

# Note that the "owner," which is our super-user of sorts, comes first.
self.users = [root] + other_users
schema_owner_map = {u.schema: u.name for u in self.users if u.schema}
self.users = self._parse_users(dw_settings)
schema_owner_map = {user.schema: user.name for user in self.users if user.schema}

# Schemas (upstream sources followed by transformations, keeps order of settings file)
self.schemas = [
DataWarehouseSchema(
dict(info, owner=schema_owner_map.get(info["name"], root.name)), self._etl_access
)
for info in schema_settings
if not info.get("external", False)
]
self.schemas = self._parse_schemas(
filter(lambda info: not info.get("external"), schema_settings), schema_owner_map
)
self._schema_lookup = {schema.name: schema for schema in self.schemas}

# External schemas are kept separate (for example, we don't back them up).
self.external_schemas = [
DataWarehouseSchema(
dict(info, owner=schema_owner_map.get(info["name"], root.name)), self._etl_access
)
for info in schema_settings
if info.get("external", False)
]

# Schemas may grant access to groups that have no bootstrapped users, so create all
# mentioned user groups.
other_groups = {u.group for u in other_users} | {
g for schema in self.schemas for g in schema.reader_groups
}
self.external_schemas = self._parse_schemas(
filter(lambda info: info.get("external", False), schema_settings), schema_owner_map
)

# Groups are in sorted order after the root group
self.groups = [root.group] + sorted(other_groups)
try:
[self.default_group] = [
user["group"] for user in dw_settings["users"] if user["name"] == "default"
]
except ValueError:
raise ETLConfigError("Failed to find group of default user")
# Schemas may grant access to groups that have no bootstrapped users.
# So we "union" groups mentioned for users and groups mentioned for schemas.
self.groups = sorted(
{group for user in self.users for group in user.groups}.union(
{group for schema in self.schemas for group in schema.groups}
)
)
self.default_group = self._parse_default_group(dw_settings)
# Relation glob patterns indicating unacceptable load failures; matches everything if unset
required_patterns = dw_settings.get("required_for_success", [])
self.required_in_full_load_selector = etl.names.TableSelector(required_patterns)

# Map of SQL types to be able to automatically insert "expressions" into table design files.
self.type_maps = settings["type_maps"]

def _parse_default_group(self, dw_settings) -> str:
"""Return default group based on a user called "default"."""
try:
[default_user] = [user for user in dw_settings["users"] if user["name"] == "default"]
except ValueError:
raise ETLConfigError("failed to find user 'default'")
try:
return default_user["group"]
except KeyError:
raise ETLConfigError("Failed to find 'group' for user 'default'")

def _parse_schemas(self, partial_settings, schema_owner_map) -> List[DataWarehouseSchema]:
    """
    Build one DataWarehouseSchema per entry in partial_settings, keeping their order.

    The owner of each schema is looked up by schema name in schema_owner_map.
    Any schema that is not explicitly claimed there belongs to the owner.
    """
    parsed_schemas = []
    for schema_info in partial_settings:
        owner_name = schema_owner_map.get(schema_info["name"], self.owner.name)
        info_with_owner = dict(schema_info, owner=owner_name)
        parsed_schemas.append(DataWarehouseSchema(info_with_owner, self._etl_access))
    return parsed_schemas

def _parse_users(self, dw_settings) -> List[DataWarehouseUser]:
    """
    Return the list of users, always starting with the owner (our super-user of sorts).

    The remaining users follow in the order of the config. Entries named like the owner
    and the pseudo user "default" are skipped.
    """
    owner = DataWarehouseUser(dw_settings["owner"])
    skipped_names = (owner.name, "default")
    # The owner must always come first, so the list starts out with just the owner.
    users = [owner]
    for user_info in dw_settings["users"]:
        if user_info["name"] in skipped_names:
            continue
        users.append(DataWarehouseUser(user_info))
    return users

def _check_access_to_cluster(self):
"""
Make sure that ETL user and admin may connect and connect to different databases.
Expand Down
8 changes: 5 additions & 3 deletions python/etl/config/settings.schema
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,17 @@
"minItems": 1
},
"user_info": {
"type": "object",
"additionalProperties": false,
"required": [ "name" ],
"not": { "required": [ "group", "groups" ] },
"properties": {
"name": { "$ref": "#/$defs/identifier" },
"description": { "type": "string" },
"group": { "$ref": "#/$defs/identifier" },
"groups": { "$ref": "#/$defs/identifier_list" },
"schema": { "$ref": "#/$defs/identifier" }
},
"required": [ "name", "group" ],
"additionalProperties": false
"type": "object"
},
"glob_pattern_list": {
"type": "array",
Expand Down
Loading

0 comments on commit 58cdcf9

Please sign in to comment.