[DPE-3500] Share backup password + create&list backups (#351)
## Issue
Backups are not feasible in sharded clusters

## Solution
Make backups possible in sharded clusters + add tests.

Necessary changes to make backups feasible in a sharded cluster (sketched below):
1. Share the pbm password across the cluster.
2. Restart the shards with the new pbm URI.
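
A minimal sketch of the flow those two steps describe, condensed from the diff below; the relation key name and charm helpers are simplified for illustration, not the exact API of this repository:

```python
# Sketch only: the config server publishes the backup user's password through
# the sharding relation's secret fields; each shard applies it and restarts
# its pbm agent so the agent reconnects with the new credentials.
BACKUP_PASSWORD_KEY = "backup-password"  # illustrative key name


def publish_backup_password(charm, relation_secret_fields: dict) -> None:
    """Config-server side: expose the shared backup password to joining shards."""
    relation_secret_fields[BACKUP_PASSWORD_KEY] = charm.get_secret(
        "app", BACKUP_PASSWORD_KEY
    )


def apply_backup_password(charm, fetch_relation_field) -> bool:
    """Shard side: wait for the secret, rotate the local backup user, restart pbm."""
    backup_password = fetch_relation_field(BACKUP_PASSWORD_KEY)
    if not backup_password:
        return False  # caller should defer until the config server shares the secret
    charm.update_password("backup", new_password=backup_password)
    charm.connect_pbm_agent()  # restart pbm so its URI carries the new password
    return True
```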

## Other Changes
pbm was configured with the wrong URI; according to the documentation, it should use the standalone URI (see the sketch below).
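
For reference, a rough sketch of the two URI shapes involved; host names, port constant, and credentials are placeholders rather than values from this repository:

```python
from urllib.parse import quote_plus

MONGODB_PORT = 27017  # placeholder; the charm takes this from its Config class


def replica_set_uri(username: str, password: str, hosts: list, replset: str) -> str:
    # Shape of the URI regular clients use: every replica set member is listed.
    hosts_str = ",".join(f"{host}:{MONGODB_PORT}" for host in hosts)
    return (
        f"mongodb://{quote_plus(username)}:{quote_plus(password)}@"
        f"{hosts_str}/admin?replicaSet={replset}&authSource=admin"
    )


def standalone_uri(username: str, password: str) -> str:
    # Shape of the URI the pbm agent should use: a direct connection to the
    # local mongod only, matching the new `standalone` branch in the diff below.
    return (
        f"mongodb://{quote_plus(username)}:{quote_plus(password)}@"
        f"localhost:{MONGODB_PORT}/?authSource=admin"
    )
```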
MiaAltieri authored Feb 20, 2024
1 parent 3016d4d commit 5b8c969
Showing 6 changed files with 170 additions and 18 deletions.
13 changes: 12 additions & 1 deletion lib/charms/mongodb/v0/mongodb.py
@@ -22,6 +22,8 @@
wait_fixed,
)

from config import Config

# The unique Charmhub library identifier, never change it
LIBID = "49c69d9977574dd7942eb7b54f43355b"

@@ -30,7 +32,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 7
LIBPATCH = 8

# path to store mongodb keyFile
logger = logging.getLogger(__name__)
@@ -57,6 +59,7 @@ class MongoDBConfiguration:
roles: Set[str]
tls_external: bool
tls_internal: bool
standalone: bool = False

@property
def uri(self):
@@ -66,6 +69,14 @@ def uri(self):
auth_source = ""
if self.database != "admin":
auth_source = "&authSource=admin"

if self.standalone:
return (
f"mongodb://{quote_plus(self.username)}:"
f"{quote_plus(self.password)}@"
f"localhost:{Config.MONGODB_PORT}/?authSource=admin"
)

return (
f"mongodb://{quote_plus(self.username)}:"
f"{quote_plus(self.password)}@"
4 changes: 2 additions & 2 deletions lib/charms/mongodb/v1/mongodb_backups.py
@@ -40,7 +40,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 2
LIBPATCH = 3

logger = logging.getLogger(__name__)

@@ -57,7 +57,7 @@
REMAPPING_PATTERN = r"\ABackup doesn't match current cluster topology - it has different replica set names. Extra shards in the backup will cause this, for a simple example. The extra/unknown replica set names found in the backup are: ([^,\s]+)([.] Backup has no data for the config server or sole replicaset)?\Z"
PBM_STATUS_CMD = ["status", "-o", "json"]
MONGODB_SNAP_DATA_DIR = "/var/snap/charmed-mongodb/current"
BACKUP_RESTORE_MAX_ATTEMPTS = 5
BACKUP_RESTORE_MAX_ATTEMPTS = 10
BACKUP_RESTORE_ATTEMPT_COOLDOWN = 15


35 changes: 23 additions & 12 deletions lib/charms/mongodb/v1/shards_interface.py
@@ -30,7 +30,7 @@
ShardNotInClusterError,
ShardNotPlannedForRemovalError,
)
from charms.mongodb.v1.users import MongoDBUser, OperatorUser
from charms.mongodb.v1.users import BackupUser, MongoDBUser, OperatorUser
from ops.charm import CharmBase, EventBase, RelationBrokenEvent
from ops.framework import Object
from ops.model import (
@@ -55,10 +55,11 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 6
LIBPATCH = 7
KEYFILE_KEY = "key-file"
HOSTS_KEY = "host"
OPERATOR_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(OperatorUser.get_username())
BACKUP_PASSWORD_KEY = MongoDBUser.get_password_key_name_for_user(BackupUser.get_username())
FORBIDDEN_REMOVAL_ERR_CODE = 20
AUTH_FAILED_CODE = 18

@@ -118,6 +119,10 @@ def _on_relation_joined(self, event):
Config.Relations.APP_SCOPE,
OPERATOR_PASSWORD_KEY,
),
BACKUP_PASSWORD_KEY: self.charm.get_secret(
Config.Relations.APP_SCOPE,
BACKUP_PASSWORD_KEY,
),
KEYFILE_KEY: self.charm.get_secret(
Config.Relations.APP_SCOPE, Config.Secrets.SECRET_KEYFILE_NAME
),
@@ -450,7 +455,7 @@ def __init__(
self.database_requires = DatabaseRequires(
self.charm,
relation_name=self.relation_name,
additional_secret_fields=[KEYFILE_KEY, OPERATOR_PASSWORD_KEY],
additional_secret_fields=[KEYFILE_KEY, OPERATOR_PASSWORD_KEY, BACKUP_PASSWORD_KEY],
# a database isn't required for the relation between shards + config servers, but is a
# requirement for using `DatabaseRequires`
database_name="",
@@ -478,7 +483,6 @@ def _on_relation_changed(self, event):

# if re-using an old shard, re-set drained flag.
self.charm.unit_peer_data["drained"] = json.dumps(False)

self.charm.unit.status = MaintenanceStatus("Adding shard to config-server")

# shards rely on the config server for secrets
@@ -507,13 +511,19 @@ def _on_relation_changed(self, event):
operator_password = self.database_requires.fetch_relation_field(
event.relation.id, OPERATOR_PASSWORD_KEY
)
if not operator_password:
backup_password = self.database_requires.fetch_relation_field(
event.relation.id, BACKUP_PASSWORD_KEY
)
if not operator_password or not backup_password:
event.defer()
self.charm.unit.status = WaitingStatus("Waiting for secrets from config-server")
return

try:
self.update_operator_password(new_password=operator_password)
self.update_password(
username=OperatorUser.get_username(), new_password=operator_password
)
self.update_password(BackupUser.get_username(), new_password=backup_password)
except RetryError:
self.charm.unit.status = BlockedStatus("Shard not added to config-server")
logger.error(
@@ -522,6 +532,8 @@
event.defer()
return

# after updating the password of the backup user, restart pbm with correct password
self.charm._connect_pbm_agent()
self.charm.app_peer_data["mongos_hosts"] = json.dumps(self.get_mongos_hosts())

def pass_hook_checks(self, event):
@@ -685,8 +697,8 @@ def drained(self, mongos_hosts: Set[str], shard_name: str) -> bool:
self.charm.unit_peer_data["drained"] = json.dumps(drained)
return drained

def update_operator_password(self, new_password: str) -> None:
"""Updates the password for the operator user.
def update_password(self, username: str, new_password: str) -> None:
"""Updates the password for the given user.
Raises:
RetryError
@@ -696,8 +708,7 @@ def update_operator_password(self, new_password: str) -> None:

current_password = (
self.charm.get_secret(
Config.Relations.APP_SCOPE,
OPERATOR_PASSWORD_KEY,
Config.Relations.APP_SCOPE, MongoDBUser.get_password_key_name_for_user(username)
),
)

@@ -712,7 +723,7 @@ def update_operator_password(self, new_password: str) -> None:
# a library, for exceptions used in both charm code and lib code.
with MongoDBConnection(self.charm.mongodb_config) as mongo:
try:
mongo.set_user_password(OperatorUser.get_username(), new_password)
mongo.set_user_password(username, new_password)
except NotReadyError:
logger.error(
"Failed changing the password: Not all members healthy or finished initial sync."
Expand All @@ -724,7 +735,7 @@ def update_operator_password(self, new_password: str) -> None:

self.charm.set_secret(
Config.Relations.APP_SCOPE,
OPERATOR_PASSWORD_KEY,
MongoDBUser.get_password_key_name_for_user(username),
new_password,
)

7 changes: 5 additions & 2 deletions src/charm.py
@@ -225,7 +225,9 @@ def monitor_config(self) -> MongoDBConfiguration:
def backup_config(self) -> MongoDBConfiguration:
"""Generates a MongoDBConfiguration object for backup."""
self._check_or_set_user_password(BackupUser)
return self._get_mongodb_config_for_user(BackupUser, BackupUser.get_hosts())
return self._get_mongodb_config_for_user(
BackupUser, BackupUser.get_hosts(), standalone=True
)

@property
def unit_peer_data(self) -> Dict:
@@ -773,7 +775,7 @@ def _get_mongos_config_for_user(
)

def _get_mongodb_config_for_user(
self, user: MongoDBUser, hosts: Set[str]
self, user: MongoDBUser, hosts: Set[str], standalone: bool = False
) -> MongoDBConfiguration:
external_ca, _ = self.tls.get_tls_files(UNIT_SCOPE)
internal_ca, _ = self.tls.get_tls_files(APP_SCOPE)
@@ -787,6 +789,7 @@
roles=user.get_roles(),
tls_external=external_ca is not None,
tls_internal=internal_ca is not None,
standalone=standalone,
)

def _get_user_or_fail_event(self, event: ActionEvent, default_username: str) -> Optional[str]:
2 changes: 1 addition & 1 deletion src/config.py
@@ -17,7 +17,7 @@ class Config:
MONGODB_SNAP_DATA_DIR = "/var/snap/charmed-mongodb/current"
MONGOD_CONF_DIR = f"{MONGODB_SNAP_DATA_DIR}/etc/mongod"
MONGOD_CONF_FILE_PATH = f"{MONGOD_CONF_DIR}/mongod.conf"
SNAP_PACKAGES = [("charmed-mongodb", "6/edge", 93)]
SNAP_PACKAGES = [("charmed-mongodb", "6/edge", 111)]

# Keep these alphabetically sorted
class Actions:
127 changes: 127 additions & 0 deletions tests/integration/sharding_tests/test_sharding_backups.py
@@ -0,0 +1,127 @@
#!/usr/bin/env python3
# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.

import secrets
import string

import pytest
from pytest_operator.plugin import OpsTest
from tenacity import RetryError, Retrying, stop_after_delay, wait_fixed

from ..backup_tests import helpers as backup_helpers

S3_APP_NAME = "s3-integrator"
SHARD_ONE_APP_NAME = "shard-one"
SHARD_TWO_APP_NAME = "shard-two"
SHARD_APPS = [SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME]
CONFIG_SERVER_APP_NAME = "config-server-one"
SHARD_REL_NAME = "sharding"
CONFIG_SERVER_REL_NAME = "config-server"
S3_REL_NAME = "s3-credentials"
TIMEOUT = 10 * 60


@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_build_and_deploy(ops_test: OpsTest) -> None:
"""Build and deploy a sharded cluster."""
my_charm = await ops_test.build_charm(".")
await ops_test.model.deploy(
my_charm,
num_units=2,
config={"role": "config-server"},
application_name=CONFIG_SERVER_APP_NAME,
)
await ops_test.model.deploy(
my_charm, num_units=2, config={"role": "shard"}, application_name=SHARD_ONE_APP_NAME
)
await ops_test.model.deploy(
my_charm, num_units=1, config={"role": "shard"}, application_name=SHARD_TWO_APP_NAME
)

# deploy the s3 integrator charm
await ops_test.model.deploy(S3_APP_NAME, channel="edge")

await ops_test.model.wait_for_idle(
apps=[S3_APP_NAME, CONFIG_SERVER_APP_NAME, SHARD_ONE_APP_NAME, SHARD_TWO_APP_NAME],
idle_period=20,
raise_on_blocked=False,
timeout=TIMEOUT,
raise_on_error=False,
)


@pytest.mark.group(1)
async def test_set_credentials_in_cluster(ops_test: OpsTest, github_secrets) -> None:
"""Tests that sharded cluster can be configured for s3 configurations."""
await backup_helpers.set_credentials(ops_test, github_secrets, cloud="AWS")
choices = string.ascii_letters + string.digits
unique_path = "".join([secrets.choice(choices) for _ in range(4)])
configuration_parameters = {
"bucket": "data-charms-testing",
"path": f"mongodb-vm/test-{unique_path}",
"endpoint": "https://s3.amazonaws.com",
"region": "us-east-1",
}

# apply new configuration options
await ops_test.model.applications[S3_APP_NAME].set_config(configuration_parameters)
await ops_test.model.wait_for_idle(apps=[S3_APP_NAME], status="active", timeout=TIMEOUT)

# provide config-server to entire cluster and s3-integrator to config-server - integrations
# made in succession to test race conditions.
await ops_test.model.integrate(
f"{S3_APP_NAME}:{S3_REL_NAME}",
f"{CONFIG_SERVER_APP_NAME}:{S3_REL_NAME}",
)
await ops_test.model.integrate(
f"{SHARD_ONE_APP_NAME}:{SHARD_REL_NAME}",
f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}",
)
await ops_test.model.integrate(
f"{SHARD_TWO_APP_NAME}:{SHARD_REL_NAME}",
f"{CONFIG_SERVER_APP_NAME}:{CONFIG_SERVER_REL_NAME}",
)

await ops_test.model.wait_for_idle(
apps=[
CONFIG_SERVER_APP_NAME,
SHARD_ONE_APP_NAME,
SHARD_TWO_APP_NAME,
],
idle_period=20,
status="active",
timeout=TIMEOUT,
)


@pytest.mark.group(1)
async def test_create_and_list_backups_in_cluster(ops_test: OpsTest, github_secrets) -> None:
"""Tests that sharded cluster can successfully create and list backups."""
leader_unit = await backup_helpers.get_leader_unit(
ops_test, db_app_name=CONFIG_SERVER_APP_NAME
)
await backup_helpers.set_credentials(ops_test, github_secrets, cloud="AWS")
# verify backup list works
action = await leader_unit.run_action(action_name="list-backups")
list_result = await action.wait()
backups = list_result.results["backups"]
assert backups, "backups not outputted"

# verify backup is started
action = await leader_unit.run_action(action_name="create-backup")
backup_result = await action.wait()
assert "backup started" in backup_result.results["backup-status"], "backup didn't start"

# verify backup is present in the list of backups
# the action `create-backup` only confirms that the command was sent to `pbm`. Creating a
# backup can take a long time, so this function returns once the command was successfully
# sent to pbm. Therefore we retry listing the backups several times.
try:
for attempt in Retrying(stop=stop_after_delay(20), wait=wait_fixed(3)):
with attempt:
backups = await backup_helpers.count_logical_backups(leader_unit)
assert backups == 1
except RetryError:
assert backups == 1, "Backup not created."
