diff --git a/.github/workflows/slurm-collector.yml b/.github/workflows/slurm-collector.yml
index 3f161be96..fd60bbbb7 100644
--- a/.github/workflows/slurm-collector.yml
+++ b/.github/workflows/slurm-collector.yml
@@ -80,3 +80,7 @@ jobs:
 
       - name: Test Slurm collector
         run: SKIP_COMPILATION=true ./scripts/test_slurm_collector.sh
+
+      - name: Test tls Slurm collector
+        run: SKIP_COMPILATION=true ./scripts/test_tls_slurm_collector.sh
+
diff --git a/containers/docker-centos7-slurm/collector_tls_config.yaml b/containers/docker-centos7-slurm/collector_tls_config.yaml
new file mode 100644
index 000000000..bb5d9bd3b
--- /dev/null
+++ b/containers/docker-centos7-slurm/collector_tls_config.yaml
@@ -0,0 +1,56 @@
+addr: "host.docker.internal"
+port: 8000
+record_prefix: "slurm"
+job_filter:
+  status: # A list of acceptable job statuses
+    - "completed"
+sacct_frequency: 2 # in seconds
+sender_frequency: 1 # in seconds
+sites:
+  - name: "SiteA"
+    only_if:
+      key: "Partition"
+      matches: "^part1$"
+  - name: "SiteB"
+    only_if:
+      key: "Partition"
+      matches: "^part2$"
+meta:
+  - name: Comment
+    key: "Comment"
+    key_type: Json
+components:
+  - name: "Cores"
+    key: "NCPUS"
+    scores:
+      - name: "HEPSPEC06"
+        value: 1.1
+        only_if:
+          key: "Partition"
+          matches: "^part1$"
+      - name: "HEPSPEC06"
+        value: 1.2
+        only_if:
+          key: "Partition"
+          matches: "^part2$"
+  - name: "SystemCPU"
+    key: "SystemCPU"
+    key_type: Time
+    only_if:
+      key: "Partition"
+      matches: "^part1$"
+  - name: "UserCPU"
+    key: "UserCPU"
+    key_type: Time
+    only_if:
+      key: "Partition"
+      matches: "^part1$"
+  - name: "Memory"
+    key: "ReqMem"
+    key_type: IntegerMega
+tls_config:
+  use_tls: true
+  ca_cert_path: "/client_certs/rootCA.pem"
+  client_cert_path: "/client_certs/client-cert.pem"
+  client_key_path: "/client_certs/client-key.pem"
+
diff --git a/containers/docker-centos7-slurm/tls_auditor_config.yaml b/containers/docker-centos7-slurm/tls_auditor_config.yaml
new file mode 100644
index 000000000..64502669d
--- /dev/null
+++ b/containers/docker-centos7-slurm/tls_auditor_config.yaml
@@ -0,0 +1,21 @@
+application:
+  port: 8000
+database:
+  host: "localhost"
+  port: 5432
+  username: "postgres"
+  password: "password"
+  database_name: "auditor"
+metrics:
+  database:
+    frequency: 30
+    metrics:
+      - RecordCount
+      - RecordCountPerSite
+      - RecordCountPerGroup
+      - RecordCountPerUser
+tls_config:
+  use_tls: true
+  ca_cert_path: "/server_certs/rootCA.pem"
+  server_cert_path: "/server_certs/server-cert.pem"
+  server_key_path: "/server_certs/server-key.pem"
diff --git a/scripts/test_slurm_collector.sh b/scripts/test_slurm_collector.sh
index 82982f9ef..606f86313 100755
--- a/scripts/test_slurm_collector.sh
+++ b/scripts/test_slurm_collector.sh
@@ -48,6 +48,28 @@ function start_container() {
         --project-directory=$DOCKER_PROJECT_DIR \
         --project-name="$COMPOSE_PROJECT_NAME" \
         cp ./containers/docker-centos7-slurm/collector_config.yaml slurm:/collector_config.yaml
+    # Copy the TLS-enabled collector config into the container
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        cp ./containers/docker-centos7-slurm/collector_tls_config.yaml slurm:/collector_tls_config.yaml
+
+    # Copy the TLS auditor config into the container
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        cp ./containers/docker-centos7-slurm/tls_auditor_config.yaml slurm:/tls_auditor_config.yaml
+
+    # Copy client tls certs to /client_certs, the directory collector_tls_config.yaml points at
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        cp ./scripts/certs slurm:/client_certs
+
+
     # Copy basic batch script
     docker compose \
         --file $DOCKER_COMPOSE_FILE \
@@ -125,6 +147,32 @@ function start_auditor() {
     done
 }
 
+# Start Auditor in the background with tls_config.yaml and wait for its health check.
+function start_tls_auditor() {
+    if [[ -z "${SKIP_COMPILATION}" ]]
+    then
+        compile_auditor
+    fi
+    if [ "$RELEASE_MODE" = true ]; then
+        AUDITOR_APPLICATION__ADDR=0.0.0.0 AUDITOR_DATABASE__DATABASE_NAME=$DB_NAME ./target/release/auditor auditor/configuration/tls_config.yaml &
+    else
+        AUDITOR_APPLICATION__ADDR=0.0.0.0 AUDITOR_DATABASE__DATABASE_NAME=$DB_NAME ./target/debug/auditor auditor/configuration/tls_config.yaml &
+    fi
+    AUDITOR_SERVER_PID=$!
+    COUNTER=0
+    until curl http://localhost:8000/health_check; do
+        >&2 echo "Auditor is still unavailable - sleeping"
+        let COUNTER=COUNTER+1
+        if [ "$COUNTER" -gt "30" ]; then
+            echo >&2 "Auditor did not come up in time."
+            stop_auditor $AUDITOR_SERVER_PID
+            echo >&2 "Exiting."
+            exit 1
+        fi
+        sleep 1
+    done
+}
+
 function start_slurm_collector() {
     if [[ -z "${SKIP_COMPILATION}" ]]
     then
@@ -133,16 +181,30 @@ function start_slurm_collector() {
     docker exec "${COMPOSE_PROJECT_NAME}-slurm-1" /auditor-slurm-collector /collector_config.yaml &
 }
 
+# Launch the collector in the container with the TLS config copied to /collector_tls_config.yaml.
+function start_tls_slurm_collector() {
+    if [[ -z "${SKIP_COMPILATION}" ]]
+    then
+        compile_collector
+    fi
+    docker exec "${COMPOSE_PROJECT_NAME}-slurm-1" /auditor-slurm-collector /collector_tls_config.yaml &
+}
+
+
 function stop_auditor() {
     echo >&2 "Stopping Auditor"
     kill $AUDITOR_SERVER_PID
 }
 
-function test_collector() {
+function submit_test_job_1() {
     # Run on partition1
     docker exec "${COMPOSE_PROJECT_NAME}-slurm-1" sh -c "sbatch --job-name=test_part1 --partition=part1 --comment=\"$COMMENT\" /batch.sh"
     sleep 20
+}
+
+# Check the record Auditor reports for the partition1 job (count, score, meta, site_id).
+function test_collector_1() {
 
     TEST1=$(curl -X GET http://localhost:8000/records | jq)
 
     if [ "$(echo $TEST1 | jq '. | length')" != 1 ]
@@ -189,9 +251,15 @@ function test_collector() {
         exit 1
     fi
 
+}
+
+function submit_test_job_2(){
     # Run on partition2
     docker exec "${COMPOSE_PROJECT_NAME}-slurm-1" sh -c "sbatch --job-name=test_part2 --partition=part2 /batch.sh"
     sleep 20
+}
+
+function test_collector_2(){
 
     TEST2=$(curl -X GET http://localhost:8000/records | jq)
 
@@ -236,11 +304,25 @@ if [[ -z "${SKIP_COMPILATION}" ]]
 then
     compile_collector
 fi
+
 start_container
 start_auditor
 start_slurm_collector
 
-test_collector
+submit_test_job_1
+test_collector_1
+submit_test_job_2
+test_collector_2
+
+stop_container
+stop_auditor
+
+# Testing TLS setup. NOTE(review): no job is submitted in this phase, so test_collector_2 presumably only re-reads records stored by the run above - confirm this exercises TLS.
+start_container
+start_tls_auditor
+start_tls_slurm_collector
+
+test_collector_2
 
 stop_container
 stop_auditor
diff --git a/scripts/test_tls_slurm_collector.sh b/scripts/test_tls_slurm_collector.sh
new file mode 100755
index 000000000..5c9d3b7d2
--- /dev/null
+++ b/scripts/test_tls_slurm_collector.sh
@@ -0,0 +1,285 @@
+#!/usr/bin/env bash
+# Integration test for the Slurm collector against an Auditor instance started
+# with the TLS configuration (tls_config.yaml / collector_tls_config.yaml).
+set -x
+set -eo pipefail
+
+# Docker
+DOCKER_COMPOSE_FILE=${DOCKER_COMPOSE_FILE:="containers/docker-centos7-slurm/docker-compose.yml"}
+DOCKER_PROJECT_DIR=${DOCKER_PROJECT_DIR:="."}
+COMPOSE_PROJECT_NAME=${COMPOSE_PROJECT_NAME:="auditor"}
+# Collector build
+RELEASE_MODE=${RELEASE_MODE:=false}
+TARGET_ARCH=${TARGET_ARCH:="x86_64-unknown-linux-musl"}
+DB_NAME=${DB_NAME:=$(uuidgen)}
+COMMENT="{ 'voms': '/atlas/Role=production', 'subject': '/some/thing' }"
+
+# Shut down the docker compose project.
+function stop_container () {
+    echo >&2 "Stopping container"
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        down
+}
+
+# Bring up the compose project, copy the collector binary, configs, certificates
+# and batch script into the slurm service, then wait for "scontrol ping".
+function start_container() {
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        up -d
+    # Copy slurm.conf to container
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        cp ./containers/docker-centos7-slurm/slurm.conf slurm:/etc/slurm/slurm.conf
+    # Copy Slurm collector to container, from the profile compile_collector builds
+    local collector_profile=debug
+    if [ "$RELEASE_MODE" = true ]; then collector_profile=release; fi
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        cp \
+        ./target/${TARGET_ARCH}/${collector_profile}/auditor-slurm-collector \
+        slurm:/auditor-slurm-collector
+    # Copy config for auditor
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        cp ./containers/docker-centos7-slurm/tls_auditor_config.yaml slurm:/tls_auditor_config.yaml
+    # Copy config for collector
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        cp ./containers/docker-centos7-slurm/collector_config.yaml slurm:/collector_config.yaml
+    # Copy tls_config for collector
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        cp ./containers/docker-centos7-slurm/collector_tls_config.yaml slurm:/collector_tls_config.yaml
+    # Copy client tls certs
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        cp ./scripts/certs slurm:/client_certs
+    # Copy server tls certs
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        cp ./auditor/certs slurm:/server_certs
+    # Copy basic batch script
+    docker compose \
+        --file $DOCKER_COMPOSE_FILE \
+        --project-directory=$DOCKER_PROJECT_DIR \
+        --project-name="$COMPOSE_PROJECT_NAME" \
+        cp ./containers/docker-centos7-slurm/batch5.sh slurm:/batch.sh
+
+    # docker exec "${COMPOSE_PROJECT_NAME}-slurm-1" chown slurm:slurm /auditor-slurm-collector /collector_config.yaml
+    docker exec "${COMPOSE_PROJECT_NAME}-slurm-1" mkdir -p /collector_logs
+    docker exec "${COMPOSE_PROJECT_NAME}-slurm-1" chown slurm:slurm /collector_logs
+
+    COUNTER=0
+    until docker exec "${COMPOSE_PROJECT_NAME}-slurm-1" scontrol ping; do
+        >&2 echo "Slurm container is still unavailable - sleeping"
+        let COUNTER=COUNTER+1
+        if [ "$COUNTER" -gt "30" ]; then
+            echo >&2 "Docker container did not come up in time."
+            echo >&2 "Docker logs:"
+            docker logs "${COMPOSE_PROJECT_NAME}-slurm-1"
+            docker exec "${COMPOSE_PROJECT_NAME}-slurm-1" cat /var/log/slurm/slurmctld.log
+            stop_container
+            echo >&2 "Exiting."
+            exit 1
+        fi
+        sleep 1
+    done
+}
+
+# Build the auditor binary in the profile selected by RELEASE_MODE.
+function compile_auditor() {
+    if [ "$RELEASE_MODE" = true ]; then
+        cargo build --bin auditor --release
+    else
+        cargo build --bin auditor
+    fi
+}
+
+# Build the collector for $TARGET_ARCH, passing -s to the linker via RUSTFLAGS.
+function compile_collector() {
+    if [ "$RELEASE_MODE" = true ]; then
+        RUSTFLAGS='-C link-args=-s' \
+            cargo build \
+            --target $TARGET_ARCH \
+            --bin auditor-slurm-collector \
+            --release
+    else
+        RUSTFLAGS='-C link-args=-s' \
+            cargo build \
+            --target $TARGET_ARCH \
+            --bin auditor-slurm-collector
+    fi
+}
+
+# Start Auditor in the background with tls_config.yaml, remember its PID in
+# AUDITOR_SERVER_PID and poll the health check once per second before giving up.
+function start_auditor() {
+    if [[ -z "${SKIP_COMPILATION}" ]]
+    then
+        compile_auditor
+    fi
+    if [ "$RELEASE_MODE" = true ]; then
+        AUDITOR_APPLICATION__ADDR=0.0.0.0 AUDITOR_DATABASE__DATABASE_NAME=$DB_NAME ./target/release/auditor auditor/configuration/tls_config.yaml &
+    else
+        AUDITOR_APPLICATION__ADDR=0.0.0.0 AUDITOR_DATABASE__DATABASE_NAME=$DB_NAME ./target/debug/auditor auditor/configuration/tls_config.yaml &
+    fi
+    AUDITOR_SERVER_PID=$!
+    COUNTER=0
+    # NOTE(review): polls plain http although the TLS config is in use - confirm this endpoint answers without TLS.
+    until curl http://localhost:8000/health_check; do
+        >&2 echo "Auditor is still unavailable - sleeping"
+        let COUNTER=COUNTER+1
+        if [ "$COUNTER" -gt "30" ]; then
+            echo >&2 "Auditor did not come up in time."
+            stop_auditor $AUDITOR_SERVER_PID
+            echo >&2 "Exiting."
+            exit 1
+        fi
+        sleep 1
+    done
+}
+
+# Run the collector inside the slurm container with the TLS configuration.
+function start_slurm_collector() {
+    if [[ -z "${SKIP_COMPILATION}" ]]
+    then
+        compile_collector
+    fi
+    docker exec "${COMPOSE_PROJECT_NAME}-slurm-1" /auditor-slurm-collector /collector_tls_config.yaml &
+}
+
+# Terminate the background Auditor process recorded in AUDITOR_SERVER_PID.
+function stop_auditor() {
+    echo >&2 "Stopping Auditor"
+    kill $AUDITOR_SERVER_PID
+}
+
+# Submit one job per partition and check the records Auditor reports for them.
+function test_collector() {
+    # Run on partition1
+    docker exec "${COMPOSE_PROJECT_NAME}-slurm-1" sh -c "sbatch --job-name=test_part1 --partition=part1 --comment=\"$COMMENT\" /batch.sh"
+    sleep 20
+
+    TEST1=$(curl -X GET http://localhost:8000/records | jq)
+
+    if [ "$(echo $TEST1 | jq '. | length')" != 1 ]
+    then
+        echo >&2 "Incorrect number of records in accounting database."
+        stop_container
+        stop_auditor
+        exit 1
+    fi
+
+    if [ "$(echo $TEST1 | jq '.[] | select(.record_id=="slurm-1") | .components | .[] | .scores | .[] | .value')" != 1.1 ]
+    then
+        echo >&2 "Incorrect score of record in accounting database. Returned record:"
+        echo >&2 $TEST1
+        stop_container
+        stop_auditor
+        exit 1
+    fi
+
+    if [ "$(echo $TEST1 | jq '.[] | select(.record_id=="slurm-1") | .meta | .voms | .[0]')" != '"%2Fatlas%2FRole=production"' ]
+    then
+        echo >&2 "Incorrect meta of record in accounting database. Returned record:"
+        echo >&2 $TEST1
+        stop_container
+        stop_auditor
+        exit 1
+    fi
+
+    if [ "$(echo $TEST1 | jq '.[] | select(.record_id=="slurm-1") | .meta | .subject | .[0]')" != '"%2Fsome%2Fthing"' ]
+    then
+        echo >&2 "Incorrect meta of record in accounting database. Returned record:"
+        echo >&2 $TEST1
+        stop_container
+        stop_auditor
+        exit 1
+    fi
+
+    if [ "$(echo $TEST1 | jq '.[] | select(.record_id=="slurm-1") | .meta | .site_id | .[0]')" != '"SiteA"' ]
+    then
+        echo >&2 "Incorrect site_id of record in accounting database. Returned record:"
+        echo >&2 $TEST1
+        stop_container
+        stop_auditor
+        exit 1
+    fi
+
+    # Run on partition2
+    docker exec "${COMPOSE_PROJECT_NAME}-slurm-1" sh -c "sbatch --job-name=test_part2 --partition=part2 /batch.sh"
+    sleep 20
+
+    TEST2=$(curl -X GET http://localhost:8000/records | jq)
+
+    if [ "$(echo $TEST2 | jq '. | length')" != 2 ]
+    then
+        echo >&2 "Incorrect number of records in accounting database."
+        stop_container
+        stop_auditor
+        exit 1
+    fi
+
+    if [ "$(echo $TEST2 | jq '.[] | select(.record_id=="slurm-2") | .components | .[] | .scores | .[] | .value')" != 1.2 ]
+    then
+        echo >&2 "Incorrect score of record in accounting database. Returned record:"
+        echo >&2 $TEST2
+        stop_container
+        stop_auditor
+        exit 1
+    fi
+
+    if [ "$(echo $TEST2 | jq '.[] | select(.record_id=="slurm-2") | .meta | .site_id | .[0]')" != '"SiteB"' ]
+    then
+        echo >&2 "Incorrect site_id of record in accounting database. Returned record:"
+        echo >&2 $TEST2
+        stop_container
+        stop_auditor
+        exit 1
+    fi
+}
+
+SKIP_DOCKER=true POSTGRES_DB=$DB_NAME ./scripts/init_db.sh
+
+# Cleanup run from the trap below: compose down and kill Auditor in a new session (setsid nohup).
+cleanup_exit() {
+    setsid nohup bash -c "
+        docker compose --file $DOCKER_COMPOSE_FILE --project-directory=$DOCKER_PROJECT_DIR --project-name=$COMPOSE_PROJECT_NAME down
+        kill $AUDITOR_SERVER_PID
+    "
+}
+trap "cleanup_exit" SIGINT SIGQUIT SIGTERM EXIT
+
+if [[ -z "${SKIP_COMPILATION}" ]]
+then
+    compile_collector
+fi
+start_container
+start_auditor
+start_slurm_collector
+
+test_collector
+
+stop_container
+stop_auditor
+