From 36356924623576ee3269bccf4f9a28e83cd0d88c Mon Sep 17 00:00:00 2001 From: Toby Jennings Date: Tue, 28 Jan 2025 13:17:11 -0600 Subject: [PATCH] fix: refactor config objects --- src/lsst/cmservice/common/butler.py | 22 +++++++++++----- src/lsst/cmservice/common/daemon.py | 1 + src/lsst/cmservice/common/htcondor.py | 20 +++++++-------- src/lsst/cmservice/common/slurm.py | 4 +-- src/lsst/cmservice/config.py | 37 +++++++++++++++++++-------- tests/common/test_config.py | 3 --- tests/db/test_handlers.py | 6 ++--- tests/db/test_micro.py | 6 ++--- 8 files changed, 59 insertions(+), 40 deletions(-) diff --git a/src/lsst/cmservice/common/butler.py b/src/lsst/cmservice/common/butler.py index 86aa3a21..1d00cd19 100644 --- a/src/lsst/cmservice/common/butler.py +++ b/src/lsst/cmservice/common/butler.py @@ -14,8 +14,11 @@ from lsst.utils.db_auth import DbAuth from ..common import errors +from ..common.logging import LOGGER from ..config import config +logger = LOGGER.bind(__name__) + # TODO this should be cached for the lifetime of the service def parse_butler_repos_from_environment() -> Mapping[str, str]: @@ -55,22 +58,27 @@ async def get_butler_config(repo: str, *, without_datastore: bool = False) -> Bu repo_uri = parse_butler_repos_from_environment()[repo] except KeyError: # No such repo known to the service - # TODO log an error, otherwise let butler try to sort it out + logger.warning("Butler repo %s not known to environment.", repo) repo_uri = repo - bc = ButlerConfig( - other=repo_uri, - without_datastore=without_datastore, - ) - db_url = url.make_url(bc["registry"]["db"]) + try: + bc = ButlerConfig( + other=repo_uri, + without_datastore=without_datastore, + ) + except FileNotFoundError: + # No such repo known to Butler + logger.error("Butler repo %s not known.", repo) + raise RuntimeError("Unknown Butler Repo %s", repo) try: db_auth_info = yaml.safe_load(await Path(config.butler.authentication_file).read_text()) except FileNotFoundError: - # TODO Log error or warning + logger.error("No Butler Registry authentication secrets found.") # delegate db auth info discovery to normal toolchain return bc + db_url = url.make_url(bc["registry"]["db"]) db_auth = DbAuth(authList=db_auth_info).getAuth( dialectname=db_url.drivername, username=db_url.username, diff --git a/src/lsst/cmservice/common/daemon.py b/src/lsst/cmservice/common/daemon.py index 02fbe09f..e897586d 100644 --- a/src/lsst/cmservice/common/daemon.py +++ b/src/lsst/cmservice/common/daemon.py @@ -13,6 +13,7 @@ async def daemon_iteration(session: async_scoped_session) -> None: iteration_start = datetime.now() + # TODO limit query to queues that do not have a "time_finished" queue_entries = await session.execute(select(Queue).where(Queue.time_next_check < iteration_start)) logger.debug("Daemon Iteration: %s", iteration_start) diff --git a/src/lsst/cmservice/common/htcondor.py b/src/lsst/cmservice/common/htcondor.py index 54609700..ea24052c 100644 --- a/src/lsst/cmservice/common/htcondor.py +++ b/src/lsst/cmservice/common/htcondor.py @@ -192,17 +192,17 @@ def build_htcondor_submit_environment() -> Mapping[str, str]: _CONDOR_COLLECTOR_HOST=config.htcondor.collector_host, _CONDOR_SCHEDD_HOST=config.htcondor.schedd_host, _CONDOR_SEC_CLIENT_AUTHENTICATION_METHODS=config.htcondor.authn_methods, - DAF_BUTLER_REPOSITORY_INDEX="/sdf/group/rubin/shared/data-repos.yaml", + _CONDOR_DAGMAN_MANAGER_JOB_APPEND_GETENV=str(config.htcondor.dagman_job_append_get_env), + DAF_BUTLER_REPOSITORY_INDEX=config.butler.repository_index, FS_REMOTE_DIR=config.htcondor.fs_remote_dir, - HOME="/sdf/home/l/lsstsvc1", - LSST_VERSION="w_latest", - LSST_DISTRIB_DIR="/sdf/group/rubin/sw", - # LSST_DISTRIB_DIR="/cvmfs/sw.lsst.eu/linux-x86_64/lsst_distrib", - PGPASSFILE="/sdf/home/l/lsstsvc1/.lsst/postgres-credentials.txt", - PGUSER="rubin", + HOME=config.htcondor.user_home, + LSST_VERSION=config.bps.lsst_version, + LSST_DISTRIB_DIR=config.bps.lsst_distrib_dir, + # FIX: because there is no db-auth.yaml in lsstsvc1's home directory + PGPASSFILE=f"{config.htcondor.user_home}/.lsst/postgres-credentials.txt", + PGUSER=config.butler.default_username, PATH=( - "/sdf/home/l/lsstsvc1/.local/bin:/sdf/home/l/lsstsvc1/bin:" - "/opt/slurm/slurm-curr/bin:/usr/local/bin:/usr/bin:" - "/usr/local/sbin:/usr/sbin" + f"{config.htcondor.user_home}/.local/bin:{config.htcondor.user_home}/bin:{config.slurm.home}:" + f"/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin" ), ) diff --git a/src/lsst/cmservice/common/slurm.py b/src/lsst/cmservice/common/slurm.py index 8b41d447..cb1743c9 100644 --- a/src/lsst/cmservice/common/slurm.py +++ b/src/lsst/cmservice/common/slurm.py @@ -65,7 +65,7 @@ async def submit_slurm_job( try: async with await open_process( [ - config.slurm.sbatch_bin, + f"{config.slurm.home}/sbatch", "-o", log_url, "--mem", @@ -119,7 +119,7 @@ async def check_slurm_job( return StatusEnum.running try: async with await open_process( - [config.slurm.sacct_bin, "--parsable", "-b", "-j", slurm_id] + [f"{config.slurm.home}/sacct", "--parsable", "-b", "-j", slurm_id] ) as slurm_check: # pragma: no cover await slurm_check.wait() if slurm_check.returncode != 0: diff --git a/src/lsst/cmservice/config.py b/src/lsst/cmservice/config.py index 500604ab..1b416ccd 100644 --- a/src/lsst/cmservice/config.py +++ b/src/lsst/cmservice/config.py @@ -19,6 +19,16 @@ class BpsConfiguration(BaseModel): FIXME: rename to LsstConfiguration and consolidate multiple models? """ + lsst_version: str = Field( + description="Default LSST version", + default="w_latest", + ) + + lsst_distrib_dir: str = Field( + description="Default distribution directory from which to setup lsst", + default="/sdf/group/rubin/sw", + ) + bps_bin: str = Field( description="Name of a bps client binary", default="bps", @@ -61,6 +71,11 @@ class ButlerConfiguration(BaseModel): default="~/.lsst/db-auth.yaml", ) + default_username: str = Field( + description="Default username to use for Butler registry authentication", + default="rubin", + ) + access_token: str | None = Field( description=("Gafaelfawr access token used to authenticate to a Butler server."), default=None, @@ -99,6 +114,11 @@ class HTCondorConfiguration(BaseModel): their serialization alias. """ + user_home: str = Field( + description=("Path to the user's home directory, as resolvable from an htcondor access node."), + default="/sdf/home/l/lsstsvc1", + ) + condor_home: str = Field( description=("Path to the Condor home directory. Equivalent to the condor ``RELEASE_DIR`` macro."), default="/opt/htcondor", @@ -193,6 +213,8 @@ class HTCondorConfiguration(BaseModel): ) +# TODO deprecate and remove "slurm"-specific logic from cm-service; it is +# unlikely that interfacing with slurm directly from k8s will be possible. class SlurmConfiguration(BaseModel): """Configuration settings for slurm client operations. @@ -202,21 +224,14 @@ class SlurmConfiguration(BaseModel): ---- Default SBATCH_* variables could work just as well, but it is useful to have this as a document of what settings are actually used. + + These settings should also apply to htcondor resource allocation jobs + that are equivalent to "allocateNodes" """ home: str = Field( description="Location of the installed slurm client binaries", - default="", - ) - - sacct_bin: str = Field( - description="Name of sacct slurm client binary", - default="sacct", - ) - - sbatch_bin: str = Field( - description="Name of sbatch slurm client binary", - default="sbatch", + default="/opt/slurm/slurm-curr/bin", ) memory: str = Field( diff --git a/tests/common/test_config.py b/tests/common/test_config.py index 1d56f258..77d69bbf 100644 --- a/tests/common/test_config.py +++ b/tests/common/test_config.py @@ -14,21 +14,18 @@ def test_config_nested_partial_update(monkeypatch: Any) -> None: config_a = Configuration() assert not config_a.db.echo assert config_a.htcondor.condor_q_bin == "condor_q" - assert config_a.slurm.sacct_bin == "sacct" assert config_a.asgi.port == 8080 assert config_a.logging.profile == "development" monkeypatch.setenv("LOGGING__PROFILE", "production") monkeypatch.setenv("ASGI__PORT", "5000") monkeypatch.setenv("HTCONDOR__CONDOR_Q_BIN", "/usr/local/bin/condor_q") - monkeypatch.setenv("SLURM__SACCT_BIN", "/opt/slurm/bin/sacct") monkeypatch.setenv("DB__ECHO", "1") # Partial updates in nested configuration models config_b = Configuration() assert config_b.db.echo assert config_b.htcondor.condor_q_bin == "/usr/local/bin/condor_q" - assert config_b.slurm.sacct_bin == "/opt/slurm/bin/sacct" assert config_b.asgi.port == 5000 assert config_b.logging.profile == "production" diff --git a/tests/db/test_handlers.py b/tests/db/test_handlers.py index c52f8328..02182eb2 100644 --- a/tests/db/test_handlers.py +++ b/tests/db/test_handlers.py @@ -246,9 +246,8 @@ async def test_handlers_group_level_db( dict( lsst_distrib_dir="lsst_distrib_dir", bps_core_yaml_template="bps_core_yaml_template", - bps_core_script_template="bps_core_script_template", + bps_core_script_template="stack_script_template", bps_panda_script_template="bps_panda_script_template", - bps_htcondor_script_template="bps_htcondor_script_template", manifest_script_template="stack_script_template", ) @@ -304,9 +303,8 @@ async def test_handlers_job_level_db( data = dict( lsst_distrib_dir="lsst_distrib_dir", bps_core_yaml_template="bps_core_yaml_template", - bps_core_script_template="bps_core_script_template", + bps_core_script_template="stack_script_template", bps_panda_script_template="bps_panda_script_template", - bps_htcondor_script_template="bps_htcondor_script_template", manifest_script_template="stack_script_template", ) diff --git a/tests/db/test_micro.py b/tests/db/test_micro.py index 99b13c99..e9c5d4b4 100644 --- a/tests/db/test_micro.py +++ b/tests/db/test_micro.py @@ -45,13 +45,13 @@ async def test_micro_db( with pytest.raises(errors.CMSpecificationError): await specification.get_script_template(session, "bad") - script_template = await specification.get_script_template(session, "bps_core_script_template") - assert script_template.name == "bps_core_script_template", "Script template name mismatch" + script_template = await specification.get_script_template(session, "stack_script_template") + assert script_template.name == "stack_script_template", "Script template name mismatch" await script_template.update_from_file( session, script_template.name, - "examples/templates/example_bps_core_script_template.yaml", + "examples/templates/example_stack_script_template.yaml", ) campaign = await interface.load_and_create_campaign(