diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 0000000..cb16bba --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,68 @@ +name: Build Artifacts +on: + release: + types: [created] + push: + branches: + - '**' + workflow_dispatch: + inputs: + publish_docker: + description: "Publish image to ghcr.io/netcracker/pgskipper-backup-daemon" + type: boolean + default: false + required: false + +env: + TAG_NAME: ${{ github.event.release.tag_name || github.ref }} + PUSH: ${{ github.event_name != 'workflow_dispatch' || inputs.publish_docker }} + +jobs: + multiplatform_build: + strategy: + fail-fast: false + matrix: + component: + - name: pgskipper-backup-daemon + file: Dockerfile + context: "" + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${GITHUB_ACTOR} + password: ${{secrets.GITHUB_TOKEN}} + - name: Prepare Tag + run: echo "TAG_NAME=$(echo ${TAG_NAME} | sed 's@refs/tags/@@;s@refs/heads/@@;s@/@_@g')" >> $GITHUB_ENV + - name: Get package IDs for delete + id: get-ids-for-delete + uses: Netcracker/get-package-ids@v0.0.1 + with: + component-name: ${{ matrix.component.name }} + component-tag: ${{ env.TAG_NAME }} + access-token: ${{secrets.GITHUB_TOKEN}} + if: ${{ env.PUSH }} + - name: Build and push + uses: docker/build-push-action@v6 + with: + no-cache: true + context: ${{ matrix.component.context }} + file: ${{ matrix.component.file }} + platforms: linux/amd64 #,linux/arm64 + push: ${{ env.PUSH }} + tags: ghcr.io/netcracker/${{ matrix.component.name }}:${{ env.TAG_NAME }} + provenance: false + - uses: actions/delete-package-versions@v5 + with: + package-name: ${{ matrix.component.name }} + package-type: 'container' + package-version-ids: ${{ steps.get-ids-for-delete.outputs.ids-for-delete }} + if: ${{ steps.get-ids-for-delete.outputs.ids-for-delete != '' }} diff --git a/.github/workflows/clean.yaml b/.github/workflows/clean.yaml new file mode 100644 index 0000000..33e4bf5 --- /dev/null +++ b/.github/workflows/clean.yaml @@ -0,0 +1,35 @@ +name: Branch Deleted +on: delete + +env: + COMPONENT_NAME: pgskipper-backup-daemon + TAG_NAME: ${{ github.event.ref }} + +jobs: + delete: + if: github.event.ref_type == 'branch' + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Prepare Tag + run: echo "TAG_NAME=$(echo ${TAG_NAME} | sed 's@refs/heads/@@;s@/@_@g')" >> $GITHUB_ENV + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${GITHUB_ACTOR} + password: ${{secrets.GITHUB_TOKEN}} + - name: Get package IDs for delete + id: get-ids-for-delete + uses: Netcracker/get-package-ids@v0.0.1 + with: + component-name: ${{ matrix.component.name }} + component-tag: ${{ env.TAG_NAME }} + access-token: ${{ secrets.GITHUB_TOKEN }} + - uses: actions/delete-package-versions@v5 + with: + package-name: ${{ env.COMPONENT_NAME }} + package-type: 'container' + package-version-ids: ${{ steps.get-ids-for-delete.outputs.ids-for-delete }} + if: ${{ steps.get-ids-for-delete.outputs.ids-for-delete != '' }} diff --git a/.github/workflows/license.yaml b/.github/workflows/license.yaml new file mode 100644 index 0000000..82c392c --- /dev/null +++ b/.github/workflows/license.yaml @@ -0,0 +1,22 @@ +name: Add License Header +on: + push: + 
branches: + - 'main' +env: + COPYRIGHT_COMPANY: 'NetCracker Technology Corporation' + COPYRIGHT_YEAR: '2024-2025' +jobs: + license: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - run: docker run -v "${PWD}:/src" -i ghcr.io/google/addlicense -v -c "${{ env.COPYRIGHT_COMPANY }}" -y "${{ env.COPYRIGHT_YEAR }}" $(find . -type f -name "*.go" -o -type f -name "*.sh" -o -type f -name "*.py" | xargs echo) + - name: Create Pull Request + uses: peter-evans/create-pull-request@v7 + with: + commit-message: Auto-update license header + branch: license-update + title: Add License Header + body: Automated license header update + delete-branch: true \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..750dba1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +# Build output. +target + +.idea/* \ No newline at end of file diff --git a/CODE-OF-CONDUCT.md b/CODE-OF-CONDUCT.md new file mode 100644 index 0000000..f5b511b --- /dev/null +++ b/CODE-OF-CONDUCT.md @@ -0,0 +1,73 @@ +# Code of Conduct + +This repository is governed by the following code of conduct guidelines. + +We hold collaboration, trust, respect and transparency as core values for our community. +Our community welcomes participants from all over the world with different experiences, +opinions and ideas to share. + +We have adopted this code of conduct and require all contributors to agree with it to build a healthy, +safe and productive community for all. + +These guidelines aim to support a community where all people feel safe to participate, +introduce new ideas and inspire others, regardless of: + +* Age +* Gender +* Gender identity or expression +* Family status +* Marital status +* Ability +* Ethnicity +* Race +* Sex characteristics +* Sexual identity and orientation +* Education +* Native language +* Background +* Caste +* Religion +* Geographic location +* Socioeconomic status +* Personal appearance +* Any other dimension of diversity + +## Our Standards + +We welcome the following behavior: + +* Be respectful of different ideas, opinions and points of view +* Be constructive and professional +* Use inclusive language +* Be collaborative and show empathy +* Focus on the best results for the community + +The following behavior is unacceptable: + +* Violence, threats of violence, or inciting others to commit self-harm +* Personal attacks, trolling, intentionally spreading misinformation, insulting/derogatory comments +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Derogatory language +* Encouraging unacceptable behavior +* Other conduct which could reasonably be considered inappropriate in a professional community + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of the Code of Conduct +and are expected to take appropriate actions in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, +commits, code, wiki edits, issues, and other contributions that are not aligned +with this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors +that they deem inappropriate, threatening, offensive, or harmful. + +## Reporting + +If you believe you are experiencing unacceptable behavior as outlined above, +please report it to `opensourcegroup@netcracker.com`.
All complaints will be reviewed and investigated and will result in a response +that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality +with regard to the reporter of an incident. + +Please also report if you observe a potentially dangerous situation, someone in distress, or violations of these guidelines, +even if the situation is not happening to you. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..292ce26 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,12 @@ +# Contribution Guide + +We'd love to accept patches and contributions to this project. +Please, follow these guidelines to make the contribution process easy and effective for everyone involved. + +## Contributor License Agreement + +You must sign the [Contributor License Agreement](https://pages.netcracker.com/cla-main.html) in order to contribute. + +## Code of Conduct + +Please make sure to read and follow the [Code of Conduct](CODE-OF-CONDUCT.md). diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..a2e8f57 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,87 @@ +FROM --platform=$BUILDPLATFORM golang:1.22.5-alpine3.19 as builder + +ENV GO111MODULE=on + +# Copy the go source +COPY go /workspace + +WORKDIR /workspace + +RUN go mod tidy + +# Build +RUN CGO_ENABLED=0 GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o ./_output/bin/azure_restore ./cmd/main.go + +FROM ubuntu:22.04 + +ENV PGPASSWORD=password PG_CLUSTER_NAME=common STORAGE_ROOT=/backup-storage EXTERNAL_STORAGE_ROOT=/external \ + LC_ALL=en_US.UTF-8 \ + LANG=en_US.UTF-8 + +COPY docker/pip.conf /root/.pip/pip.conf +COPY docker/requirements.txt /root/requirements.txt + +RUN echo "deb [trusted=yes] http://apt.postgresql.org/pub/repos/apt jammy-pgdg main" >> /etc/apt/sources.list.d/pgdg.list +RUN cat /etc/apt/sources.list +RUN ls -la /etc/apt/ +RUN apt-get -y update +RUN apt-get -o DPkg::Options::="--force-confnew" -y dist-upgrade +RUN apt-get update && \ + apt-get install -y --allow-downgrades gcc-12 cpp-12 gcc-12-base libgcc-12-dev libstdc++6 libgcc-s1 libnsl2 +RUN apt-get --no-install-recommends install -y python3.11 python3-pip python3-dev libpq-dev cython3 + +RUN ls -ls /usr/bin/ +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get --no-install-recommends install -y comerr-dev \ + unzip \ + build-essential \ + manpages-dev \ + libkrb5-dev \ + libsasl2-dev libldap2-dev libssl-dev \ + postgresql-13 postgresql-14 postgresql-15 postgresql-16 \ + jq \ + openssl curl +RUN python3 -m pip install -U setuptools +RUN python3 -m pip install --no-cache-dir -r /root/requirements.txt \ + && python3 -m pip install --upgrade pip \ + && python3 -m pip install grpcio \ + && python3 -m pip install opentelemetry-distro opentelemetry-exporter-otlp opentelemetry-api opentelemetry-sdk opentelemetry-instrumentation-flask \ + && opentelemetry-bootstrap -a install \ + && pip3 uninstall -y pip \ + && apt-get remove -y --purge gcc-12 \ + && apt-get remove -y --purge python3-dev \ + && apt-get remove -y --purge libpq-dev \ + && apt-get remove -y --purge cython3 \ + && apt-get clean + +RUN ln -s /usr/bin/python3 /usr/bin/python + +COPY --from=builder --chown=${USER_UID} /workspace/_output/bin/azure_restore /opt/backup/ +COPY docker/postgres/ docker/health.sh /opt/backup/ +COPY docker/granular /opt/backup/granular +COPY docker/postgres/encryption.py /opt/backup/granular/encryption.py +COPY docker/external_scripts/azure_restore.sh /opt/backup/ +COPY maintenance /maintenance + +RUN mkdir -p 
/backup-storage/ && \ + mkdir -p /external/ && \ + chmod -R +x /opt/backup/ && \ + chmod -R 777 /opt/backup/ && \ + chmod -R 777 /backup-storage/ && \ + chmod -R 777 /external/ && \ + chmod -R g+w /maintenance/recovery/ && \ + chmod +x /maintenance/recovery/*.sh && \ + chgrp -R 0 /backup-storage/ && \ + chgrp -R 0 /external/ + + +#VOLUME /backup-storage +#VOLUME /external +# Volumes are defined to support read-only root file system +VOLUME /etc +VOLUME /opt/backup +VOLUME /tmp + +EXPOSE 8080 8081 8082 9000 + +CMD ["bash", "/opt/backup/start_backup_daemon.sh"] \ No newline at end of file diff --git a/README.md b/README.md index 3caa983..38dd9d4 100644 --- a/README.md +++ b/README.md @@ -1 +1,11 @@ -# pgskipper-backup-daemon \ No newline at end of file +# pgskipper-backup-daemon + +## Repository structure + +* `./docker` - directory with the backup daemon source code. +* `./go` - directory with the Go source code for the Azure-specific part of the daemon. +* `./docs` - directory with the documentation for the service. + +## Overview + +postgres-backup-daemon allows you to run backups periodically and to recover databases from these backups. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..8162261 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,15 @@ +# Security Reporting Process + +Please report any security issue to `opensourcegroup@netcracker.com`, where the issue will be triaged appropriately. + +If you know of a publicly disclosed security vulnerability, please IMMEDIATELY email `opensourcegroup@netcracker.com` +to inform the team about the vulnerability, so we may start the patch, release, and communication process. + +# Security Release Process + +If the vulnerability is found in the latest stable release, it will be fixed in a patch version for that release. +E.g., if an issue is found in the 2.5.0 release, version 2.5.1 with the fix will be released. +By default, older versions will not have security releases. + +If the issue doesn't affect any existing public releases, the fix for medium and high severity issues is made +in the main branch before a new version is released. For low-priority issues the fix can be planned for future releases. diff --git a/docker/external_scripts/azure_restore.sh b/docker/external_scripts/azure_restore.sh new file mode 100644 index 0000000..b04f9ca --- /dev/null +++ b/docker/external_scripts/azure_restore.sh @@ -0,0 +1,147 @@ +#!/usr/bin/bash +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
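The shell script below drives the daemon's external restore API with curl: it waits for `/v2/health` to answer, POSTs a JSON body to `/external/restore`, and then polls `/external/restore/<restoreId>` until the status becomes `Successful` or `Failed`. For orientation, here is a minimal Python sketch of that same flow; it is not part of this change, it assumes the daemon is reachable on `localhost:8080` exactly as the script does, and it relies on the script's own assumption that the POST response body is the plain restore id.

```python
# Minimal sketch of the flow implemented by docker/external_scripts/azure_restore.sh.
# Endpoints and field names are taken from the script; everything else is illustrative.
import time

import requests

DAEMON = "http://localhost:8080"


def azure_restore(restore_time: str, restore_as_separate: bool = False,
                  mirror_restore: bool = False, geo_restore: bool = False) -> str:
    # Wait (up to ~120 seconds) until the daemon's health endpoint answers.
    for _ in range(120):
        try:
            if requests.get(f"{DAEMON}/v2/health", timeout=2).ok:
                break
        except requests.RequestException:
            pass
        time.sleep(1)

    # Same JSON body the shell script builds ("subnet" carries the mirror flag).
    body = {
        "restore_time": restore_time,
        "restore_as_separate": str(restore_as_separate).lower(),
        "subnet": str(mirror_restore).lower(),
        "geo_restore": str(geo_restore).lower(),
    }
    # The script greps the raw response for a "restore-20..." id, i.e. it expects
    # the plain restore id in the response body.
    restore_id = requests.post(f"{DAEMON}/external/restore", json=body, timeout=30).text.strip()

    # Poll the tracking endpoint until the restore reaches a terminal status.
    for _ in range(1500):
        status = requests.get(f"{DAEMON}/external/restore/{restore_id}", timeout=3).json().get("status")
        if status in ("Successful", "Failed"):
            return status
        time.sleep(10)
    return "Unknown"


if __name__ == "__main__":
    print(azure_restore("2023-07-14T07:23:57Z"))
```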
+ + +#set -x +#set -e +function log { + echo -e "[$(date +'%Y-%d-%m-%T.%N')][az_restore] $2$1\e[m" +} + +[[ "${DEBUG}" == 'true' ]] && set -x + +timestamp=$1 + +#default values +restoreAsSeparate='false' +geoRestore='false' +mirrorRestore='false' + +usage(){ + echo "$0 < timestamp > [ --mirror-restore ] [ --restore-as-separate ] [ --geo-restore ]" + echo ' timestamp - format yyyy-mm-ddThh:mm:ssZ' + echo ' --mirror-restore - restore to another cloud, src instance is not stopped' + echo ' --restore-as-separate - restore to another ns/cloud, but src instance is not stopped and src svc is not changed' + echo ' --geo-restore - invoke geo-restore during restoration of Azure PG' + echo " Examples:" + echo " $0 2023-07-14T07:23:57Z" + echo " $0 2023-01-11T01:11:11Z --restore-as-separate" + echo " $0 2023-01-11T01:11:11Z --restore-as-separate --geo-restore" +} + +log "azure restore backup invoked" + +if [[ -z "$timestamp" ]]; then + echo 'Timestamp should be set' + exit 1 +fi + +isCorrectTimestamp=$(echo "$timestamp" | grep '^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9][0-9]:[0-9][0-9]:[0-9][0-9]Z*$' -c) +if [[ "$isCorrectTimestamp" -ne 1 ]]; then + log 'Timestamp is incorrect!' + log 'Timestamp should be in the following format: yyyy-mm-ddThh:mm:ssZ' + exit 1 +fi + + +COUNT_ARGS=$# +i=2 +while [ $i -le $COUNT_ARGS ]; do + argument="${!i}" + let i=i+1 + + # find tabs and spaces + echo "$argument" | grep -P ' |\t' > /dev/null + EXIT_CODE=$? + if [ "$EXIT_CODE" -eq 0 ]; then + echo '' + echo '|'"$argument"'| - contains a space or tab! Exit!' + echo '' + usage + exit 2 + fi + if [[ $argument == --restore-as-separate ]]; then restoreAsSeparate='true'; continue; fi + if [[ $argument == --mirror-restore ]]; then mirrorRestore='true'; continue; fi + if [[ $argument == --geo-restore ]]; then geoRestore='true'; continue; fi + + log 'ERROR! Wrong parameters!'; usage; exit 1; +done + +echo +echo "Set parameters: $@" +echo + +log 'Waiting up to 120 seconds for the API server...' + +for i in {1..120} +do + curl --fail --connect-timeout 1 --max-time 2 -o /dev/null -s localhost:8080/v2/health && break + sleep 1 + log "$i" +done + +echo +log 'Check API code for restore method' +checkApiCode=$(curl -XOPTION --max-time 30 -o /dev/null -s -w "%{http_code}\n" localhost:8080/external/restore) +if [[ "$checkApiCode" == '404' ]]; then + echo "ERROR! API to restore external DB is off" + exit 1 +fi + +if [[ "$checkApiCode" == '000' ]]; then + echo "ERROR! API to restore external DB is not ready" + exit 1 +fi + +request="{\"restore_time\":\"$timestamp\", \"restore_as_separate\":\"$restoreAsSeparate\", \"subnet\":\"$mirrorRestore\", \"geo_restore\":\"$geoRestore\"}" + +log "request to backup daemon:" +log "$request" + +rawResponse=$(curl --max-time 30 -v -XPOST -H "Content-Type: application/json" localhost:8080/external/restore -d "$request") + +log "response from backup daemon:" +log "$rawResponse" + +restoreId=$(echo "$rawResponse" | grep '^restore-20[0-9T]*$' || echo 'error') + +if [[ "$restoreId" == 'error' ]]; then + log "Error.
Expected restore id but found '$rawResponse'" + exit 1 +fi + +log "restoreId is: $restoreId" + +operationStatus='' +counter=0 + + +for i in {1..1500} +do + if [[ "$operationStatus" == 'Successful' ]]; then + break + fi + operationStatus=$(curl -XGET --max-time 3 localhost:8080/external/restore/$restoreId | jq -r '.status') + if [[ "$operationStatus" == 'Failed' ]]; then + log 'Restore operation is in Failed status' + exit 1 + fi + sleep 10 + log "waiting Successful status, current status: $operationStatus" +done + +log 'Successful status' + +exit 0 diff --git a/docker/granular/backups.py b/docker/granular/backups.py new file mode 100644 index 0000000..e6b7114 --- /dev/null +++ b/docker/granular/backups.py @@ -0,0 +1,413 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import json +import logging +import shutil +import os +import re +import time + +import storage_s3 +import configs +import utils +from itertools import groupby + + +class BackupNotFoundException(Exception): + + def __init__(self, backup_id, namespace, database=None): + super(Exception, self).__init__("Backup of database '%s' is not found in backup '%s' in namespace '%s'." + % (database, backup_id, namespace)) + + +class BackupBadStatusException(Exception): + + def __init__(self, backup_id, status=None, database=None): + super(Exception, self).__init__("Backup of database '%s' in backup '%s' has bad status for restore: %s" + % (database, backup_id, status)) + + +class BackupFailedException(Exception): + + def __init__(self, database, reason=None): + super(Exception, self).__init__("Backup of database '%s' has failed: %s." 
% (database, reason)) + + +class RestoreFailedException(Exception): + + def __init__(self, database, reason=None): + super(Exception, self).__init__("Restore of database '%s' has failed: %s" % (database, reason)) + + +class BackupStatus: + SUCCESSFUL = 'Successful' + FAILED = 'Failed' + IN_PROGRESS = 'In progress' + PLANNED = 'Planned' + UNKNOWN = 'Unknown' + CANCELED = 'Canceled' + + +def get_backup_status_id(status): + statuses = { + BackupStatus.SUCCESSFUL: 0, + BackupStatus.CANCELED: 7, + BackupStatus.FAILED: 6, + BackupStatus.IN_PROGRESS: 5, + BackupStatus.PLANNED: 4, + BackupStatus.UNKNOWN: -1 + } + return statuses[status] + +def is_valid_namespace(namespace): + return re.match("^[a-zA-z0-9_]+$", namespace) is not None + + +def backup_exists(backup_id, namespace=configs.default_namespace(), external_backup_storage=None): + return os.path.exists(build_backup_path(backup_id, namespace, external_backup_storage)) + + +def database_backup_exists(backup_id, database, namespace=configs.default_namespace(), external_backup_storage=None): + return os.path.exists(build_database_backup_path(backup_id, database, namespace, external_backup_storage)) + + +def build_namespace_path(namespace=configs.default_namespace()): + return '%s/%s' % (configs.backups_storage(), namespace) + + +def build_database_backup_path(backup_id, database, + namespace=configs.default_namespace(), external_backup_storage=None): + if configs.get_encryption(): + return '%s/%s_enc.dump' % ( + build_backup_path(backup_id, namespace, external_backup_storage), database) + else: + return '%s/%s.dump' % ( + build_backup_path(backup_id, namespace, external_backup_storage), database) + + +def build_roles_backup_path(backup_id, database, + namespace=configs.default_namespace(), external_backup_storage=None): + if configs.get_encryption(): + return "%s/%s.roles_enc.sql" % ( + build_backup_path(backup_id, namespace, external_backup_storage), database) + else: + return "%s/%s.roles.sql" % ( + build_backup_path(backup_id, namespace, external_backup_storage), database) + + +def build_database_backup_full_path(backup_id, database, storage_root, + namespace=configs.default_namespace(), + ): + if configs.get_encryption(): + return '%s/%s/%s/%s_enc.dump' % ( + storage_root, namespace, backup_id, database) + else: + return '%s/%s/%s/%s.dump' % ( + storage_root, namespace, backup_id, database) + + +def build_database_restore_report_path(backup_id, database, restore_tracking_id, namespace=configs.default_namespace()): + return '%s/%s.%s.report' % (build_backup_path(backup_id, namespace), database, restore_tracking_id) + + +def build_backup_path(backup_id, namespace=configs.default_namespace(), external_backup_storage=None): + return '%s/%s/%s' % (configs.backups_storage() if external_backup_storage is None else external_backup_storage, + namespace, backup_id) + + +def build_external_backup_root(external_backup_path): + return '%s/%s' % (os.getenv("EXTERNAL_STORAGE_ROOT"), external_backup_path) + + +def build_backup_status_file_path(backup_id, namespace=configs.default_namespace(), external_backup_storage=None): + return '%s/status.json' % build_backup_path(backup_id, namespace, external_backup_storage) + + +def build_restore_status_file_path(backup_id, tracking_id, namespace=configs.default_namespace(), + external_backup_storage=None): + return '%s/%s.json' % (build_backup_path(backup_id, namespace, external_backup_storage), tracking_id) + + +def get_key_name_by_backup_id(backup_id, namespace, external_backup_storage=None): + status_path = 
build_backup_status_file_path(backup_id, namespace, external_backup_storage) + with open(status_path) as f: + data = json.load(f) + return data.get("key_name") + + +def generate_id(): + return datetime.datetime.now().strftime("%Y%m%dT%H%M%S%f") + + +def generate_backup_id(): + return 'backup-%s' % generate_id() + + +def generate_restore_id(backup_id, namespace=configs.default_namespace()): + m = re.match("^backup-(?P[a-zA-Z0-9]+)$", backup_id) + backup_id = m.group('backupId') + return 'restore-%s-%s-%s' % (namespace, backup_id, generate_id()) + + +def extract_backup_id_from_tracking_id(tracking_id): + m = re.match("^restore-(?P[a-zA-Z0-9_]+)-(?P[a-zA-Z0-9]+)-[a-zA-Z0-9]+$", tracking_id) + return 'backup-%s' % m.group('backupId'), m.group('namespace') + + +# Kindly offered by https://stackoverflow.com/a/1094933/6519476 +def sizeof_fmt(num, suffix='B'): + for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']: + if abs(num) < 1024.0: + return "%3.1f%s%s" % (num, unit, suffix) + num /= 1024.0 + return "%.1f%s%s" % (num, 'Yi', suffix) + + +def calculate_expiration_timestamp(start_timestamp, period): + return start_timestamp + get_seconds(period) + + +def is_backup_completed(status): + return status == BackupStatus.SUCCESSFUL or status == BackupStatus.FAILED + + +def get_seconds(delta): + time_unit = delta.split() + + if len(time_unit) != 2 or not time_unit[0].isdigit(): + raise Exception("Malformed expiration period: %s." % delta) + + value = int(time_unit[0]) + unit = time_unit[1].lower() + + if unit == 'week' or unit == 'weeks': + td = datetime.timedelta(weeks=value) + elif unit == 'day' or unit == 'days': + td = datetime.timedelta(days=value) + elif unit == 'hour' or unit == 'hours': + td = datetime.timedelta(hours=value) + elif unit == 'minute' or unit == 'minutes': + td = datetime.timedelta(minutes=value) + elif unit == 'second' or unit == 'seconds': + td = datetime.timedelta(seconds=value) + else: + raise Exception("Time unit '%s' is not supported" % unit) + + return int(td.total_seconds()) + + +def is_database_protected(database): + return database in configs.protected_databases() or database in configs.protected_greenplum_databases() + + +def get_backup_create_date(backup_id): + return int(datetime.datetime.strptime(backup_id, "backup-%Y%m%dT%H%M%S%f").strftime('%s')) + + +def backup_expired(backup, expire_date): + return 1 if get_backup_create_date(os.path.basename(backup)) < expire_date else 0 + + +def get_s3_client(): + return storage_s3.AwsS3Vault() + + +def sweep_manager(): + """ Sweep procedure manager """ + if os.getenv("USE_EVICTION_POLICY_FIRST") is None or os.getenv("USE_EVICTION_POLICY_FIRST").lower() == 'false': + sweep_by_keep() + else: + sweep_by_policy() + + +def sweep_by_keep(): + log = logging.getLogger("Sweeper") + log.info("Start backups sweeping by keep.") + current_time = time.time() + storage = configs.backups_storage() + s3 = None + if os.environ['STORAGE_TYPE'] == "s3": + s3 = get_s3_client() + namespaces = s3.get_granular_namespaces(storage) + + for namespace in os.listdir(storage) if not s3 else namespaces: + log.info("Sweeping namespace: %s." 
% namespace) + expired_backups = [] + failed_expired_backups = [] + + healthy_backups = 0 + if s3: + backup_ids = s3.get_backup_ids(storage, namespace) + + for backup_id in os.listdir(build_namespace_path(namespace)) if not s3 else backup_ids: + status_file = build_backup_status_file_path(backup_id, namespace) + if s3: + try: + if s3.is_file_exists(status_file): + status_file = s3.read_object(status_file) + backup_details = json.loads(status_file) + else: + log.error("Cannot find status file in bucket with backup id {}".format(backup_id)) + failed_expired_backups.append(backup_id) + continue + except ValueError: + log.exception("Cannot read status file") + + elif not os.path.isfile(status_file): + failed_expired_backups.append(backup_id) + continue + else: + backup_details = utils.get_json_by_path(status_file) + + expires = backup_details.get('expires') + backup_status = backup_details.get('status') + timestamp = backup_details.get('timestamp') + + if backup_status == BackupStatus.SUCCESSFUL: + # We may get unicode string here so check `basestring`, but not `str` + if isinstance(expires, str) and expires.lower() == 'never': + continue + else: + healthy_backups += 1 + + if expires < current_time: + if backup_status == BackupStatus.SUCCESSFUL: + expired_backups.append((backup_id, timestamp)) # Keep timestamp to sort later. + else: + # We delete expired PLANNED and IN_PROGRESS backups as well + # since they probably hang if they have already expired, not not even finished. + failed_expired_backups.append(backup_id) + + if expired_backups and len(expired_backups) == healthy_backups: + # If all healthy are expired then keep the freshest. + # Sort by timestamp and take ID of the latest. + expired_backups.sort(key=lambda backup: backup[1]) + saved_backup = expired_backups.pop()[0] + log.info("All successful backups are expired. Keep backup '%s/%s' as last healthy." + % (saved_backup, namespace)) + + for i in failed_expired_backups: + log.info("Sweep out failed expired backup status of '%s/%s'." % (namespace, i)) + shutil.rmtree(build_backup_path(i, namespace)) if not s3 else s3.delete_objects(build_backup_path(i, namespace)) + + for i in expired_backups: + log.info("Sweep out expired backup '%s/%s'." % (namespace, i[0])) + shutil.rmtree(build_backup_path(i[0], namespace)) if not s3 else s3.delete_objects( + build_backup_path(i[0], namespace)) + + log.info("Backups sweeping finished.") + + +def sweep_by_policy(): + """ remove expired backups according EVICTION_POLICY """ + + log = logging.getLogger("Sweeper (policy)") + log.info("Start backups sweeping according eviction policy") + current_time = time.time() + storage = configs.backups_storage() + s3 = None + if os.environ['STORAGE_TYPE'] == "s3": + s3 = get_s3_client() + namespaces = s3.get_granular_namespaces(storage) + + start_point_time = time.time() + for namespace in os.listdir(storage) if not s3 else namespaces: + log.info("Sweeping namespace: %s." 
% namespace) + expired_backups = [] + failed_expired_backups = [] + success_backups = [] + log.debug("namespace: {}".format(namespace)) + current_policy = os.getenv("EVICTION_POLICY_GRANULAR_" + namespace) or os.getenv("EVICTION_POLICY") + log.debug("Current_policy: {}".format(current_policy)) + log.debug("policy for current namespace") + + for i in utils.parse(current_policy): + log.debug('{}/{}'.format(i.start, i.interval)) + + if s3: + backup_ids = s3.get_backup_ids(storage, namespace) + + for backup_id in os.listdir(build_namespace_path(namespace)) if not s3 else backup_ids: + status_file = build_backup_status_file_path(backup_id, namespace) + if s3: + try: + if s3.is_file_exists(status_file): + status_file = s3.read_object(status_file) + backup_details = json.loads(status_file) + else: + log.error("Cannot find status file in bucket with backup id {}".format(backup_id)) + failed_expired_backups.append(os.path.join(build_backup_path(backup_id, namespace))) + continue + except ValueError: + log.exception("Cannot read status file") + elif not os.path.isfile(status_file): + failed_expired_backups.append(os.path.join(build_backup_path(backup_id, namespace))) + continue + else: + backup_details = utils.get_json_by_path(status_file) + + expires = backup_details.get('expires') + backup_status = backup_details.get('status') + + if backup_status == BackupStatus.SUCCESSFUL or backup_status == BackupStatus.IN_PROGRESS: + if isinstance(expires, str) and expires.lower() == 'never': + log.debug("Skip backup {} as marked Never delete".format(backup_id)) + continue + success_backups.append(os.path.join(build_backup_path(backup_id, namespace))) + else: + failed_expired_backups.append(os.path.join(build_backup_path(backup_id, namespace))) + + log.debug("success backups:") + log.debug("\n".join(success_backups)) + log.debug("") + + log.debug("fail backups:") + log.debug("\n".join(failed_expired_backups)) + log.debug("") + + for rule in utils.parse(current_policy): + log.debug("current rule: {}/{}".format(rule.start, rule.interval)) + operateVersions = [t for t in success_backups if backup_expired(t, start_point_time - rule.start)] + log.debug("stage1 selected:") + log.debug("\n".join(operateVersions)) + if rule.interval == "delete": + # all versions should be evicted catched by this interval + expired_backups.extend(operateVersions) + else: + # group by interval and leave only first on each + thursday = 3 * 24 * 60 * 60 + for _, versionsIt in groupby(operateVersions, lambda t: int( + (get_backup_create_date(os.path.basename(t)) - thursday) / rule.interval)): + grouped = sorted(list(versionsIt), key=lambda t: get_backup_create_date(os.path.basename(t))) + expired_backups.extend(grouped[:-1]) + log.debug("stage2 expired:") + log.debug("\n".join(expired_backups)) + + expired_backups = list(set(expired_backups)) + log.debug("stage3 expired unique backups:") + log.debug("\n".join(expired_backups)) + + log.debug("Remove expired backups:") + for dir in expired_backups: + log.info("remove backup: {}".format(dir)) + shutil.rmtree(dir) if not s3 else s3.delete_objects(dir) + + log.debug("Remove failed backups:") + for dir in failed_expired_backups: + log.info("remove backup: {}".format(dir)) + shutil.rmtree(dir) if not s3 else s3.delete_objects(dir) + + log.info("Backups sweeping finished.") diff --git a/docker/granular/configs.py b/docker/granular/configs.py new file mode 100644 index 0000000..09fc731 --- /dev/null +++ b/docker/granular/configs.py @@ -0,0 +1,178 @@ +# Copyright 2024-2025 NetCracker Technology 
Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import utils +import logging +from utils import get_postgres_version_by_path + + +_PROTECTED_DATABASES = ['template0', 'template1', 'postgres', + 'rdsadmin', # aws rds + 'cloudsqladmin', # cloudsql + 'azure_maintenance', 'azure_sys', # azure postgresql + 'powa'] # powa +_PROTECTED_GREENPLUM_DATABASES = ['gpadmin', 'gpperfmon'] +_PROTECTED_ROLES = ['replicator', 'postgresadmin', 'psqladmin', 'azuresu', 'azure_pg_admin', 'replication'] + +log = logging.getLogger("configs") + +def backups_storage(version=None): + if is_external_pg(): + return '/backup-storage/external/granular' + + storageRoot = '/backup-storage' + if not version: + try: + version = utils.get_version_of_pgsql_server() + except Exception as e: + version = get_postgres_version_by_path(storageRoot) + log.info(f"version returned from exception block {version}") + log.exception(e) + storage_path = '/backup-storage/granular' + # if the pgsql server version is 10 or above, save backups + # in a version-specific folder + if [10, 0] <= version < [11, 0]: + storage_path = '/backup-storage/pg10/granular' + elif [11, 0] <= version < [12, 0]: + storage_path = '/backup-storage/pg11/granular' + elif [12, 0] <= version < [13, 0]: + storage_path = '/backup-storage/pg12/granular' + elif [13, 0] <= version < [14, 0]: + storage_path = '/backup-storage/pg13/granular' + elif [14, 0] <= version < [15, 0]: + storage_path = '/backup-storage/pg14/granular' + elif [15, 0] <= version < [16, 0]: + storage_path = '/backup-storage/pg15/granular' + elif version >= [16, 0]: + storage_path = '/backup-storage/pg16/granular' + return storage_path + + +def default_namespace(): + return 'default' + + +def default_backup_type(): + return 'all' + + +def default_backup_expiration_period(): + return '2 weeks' + + +def postgresql_user(): + return os.getenv('POSTGRES_USER') or 'postgres' + + +def postgresql_host(): + return os.getenv('POSTGRES_HOST') or 'localhost' + + +def postgresql_port(): + return os.getenv('POSTGRES_PORT') or '5432' + + +def postgres_password(): + return os.getenv('POSTGRES_PASSWORD') + + +def postgresql_no_role_password_flag(): + if is_external_pg(): + return "--no-role-passwords" + return "" + + +def protected_databases(): + return _PROTECTED_DATABASES + + +def protected_greenplum_databases(): + return _PROTECTED_GREENPLUM_DATABASES + + +def protected_roles(): + postgres_admin_user = os.getenv("POSTGRES_USER", "postgres") + return _PROTECTED_ROLES + [postgres_admin_user] + + +def eviction_interval(): + interval = os.getenv("GRANULAR_EVICTION", "3600") + if interval: + interval = int(interval) + return interval or 3600 + + +def granular_cron_pattern(): + return os.getenv("GRANULAR_BACKUP_SCHEDULE", "none") + +def diff_cron_pattern(): + return os.getenv("DIFF_SCHEDULE", "none") + +def incr_cron_pattern(): + return os.getenv("INCR_SCHEDULE", "none") + + +def get_parallel_jobs(): + return os.getenv("JOB_FLAG", "1") + +def dbs_to_granular_backup(): + databases =
os.getenv("DATABASES_TO_SCHEDULE") + if databases: + databases = databases.split(',') + return databases or [] + +def connection_properties(username = postgresql_user(), password = postgres_password(), database = 'postgres'): + return { + 'host': postgresql_host(), + 'port': postgresql_port(), + 'user': username, + 'password': password, + 'database': database, + 'connect_timeout': 5 + } + + +def get_encryption(): + encrypt_backups = os.getenv("KEY_SOURCE", 'false').lower() + return encrypt_backups != 'false' + + +def get_pgsql_bin_path(version): + major_version = version[0] + minor_version = version[1] + if major_version == 9: + if minor_version != 4: + return "/usr/pgsql-9.{}/bin".format(version[1]) + else: + # GPDB uses Postgresql 9.4 + return "/usr/local/greenplum-db/bin/" + elif major_version == 10: + return "/usr/lib/postgresql/10/bin" + elif major_version == 11: + return "/usr/lib/postgresql/11/bin" + elif major_version == 12: + return "/usr/lib/postgresql/12/bin" + elif major_version == 13: + return "/usr/lib/postgresql/13/bin" + elif major_version == 14: + return "/usr/lib/postgresql/14/bin" + elif major_version == 15: + return "/usr/lib/postgresql/15/bin" + elif major_version == 16: + return "/usr/lib/postgresql/16/bin" + + +def is_external_pg(): + return os.getenv("EXTERNAL_POSTGRESQL", "") != "" diff --git a/docker/granular/granular.py b/docker/granular/granular.py new file mode 100644 index 0000000..15b6ee6 --- /dev/null +++ b/docker/granular/granular.py @@ -0,0 +1,1039 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
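The module that follows (`docker/granular/granular.py`) exposes the granular backup and restore REST resources. As a quick orientation for reviewers, here is a hedged client sketch built only from what those resources accept: HTTP basic auth validated against PostgreSQL credentials (when auth is enabled), a JSON request body with fields such as `databases`, `namespace` and `keep`, a 202 response carrying `backupId`, and a status lookup that takes `namespace` as a query parameter. The URL paths, port and credentials are not visible in this diff, so the ones used here are placeholders.

```python
# Illustrative client for the granular backup API defined in granular.py.
# Accepted JSON fields and the auth behaviour come from the resource classes
# below; the routes, port and credentials are placeholders, not the real ones.
import requests

BASE = "http://localhost:8080"           # assumption: daemon address/port
AUTH = ("postgres", "password")          # placeholder; checked against PostgreSQL credentials

# Request a granular backup of two databases in the "default" namespace.
resp = requests.post(
    f"{BASE}/backup/request",            # hypothetical route
    json={"databases": ["db1", "db2"], "namespace": "default", "keep": "2 weeks"},
    auth=AUTH,
    timeout=30,
)
backup_id = resp.json()["backupId"]      # returned with HTTP 202 (Accepted)

# Poll the backup status; the namespace is passed as a query parameter.
status = requests.get(
    f"{BASE}/backup/status/{backup_id}",  # hypothetical route
    params={"namespace": "default"},
    auth=AUTH,
    timeout=30,
).json()
print(backup_id, status)
```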
+ +import http.client +import json +import logging +import os +import io + +import flask +import flask_restful +from flask import Flask +from apscheduler.schedulers.background import BackgroundScheduler + +import requests +import backups +import configs +import pg_backup +import pg_restore +import utils +import storage_s3 +import psycopg2 +import threading +from functools import wraps + +import shutil +from backups import build_backup_path, build_namespace_path, is_valid_namespace, build_backup_status_file_path +from flask_httpauth import HTTPBasicAuth + +from flask import request, abort, Response, stream_with_context + +from opentelemetry.instrumentation.flask import FlaskInstrumentor +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.resources import SERVICE_NAME, Resource + + +auth = HTTPBasicAuth() + + +def superuser_authorization(func_to_decorate): + @wraps(func_to_decorate) + def wrap(self, *args, **kwargs): + if utils.is_auth_needed(): + if request.authorization.username == configs.postgresql_user(): + return func_to_decorate(self, *args, **kwargs) + else: + abort(403, 'You are not authorized to perform such action') + else: + return func_to_decorate(self, *args, **kwargs) + + return wrap + + +@auth.verify_password +def authenticate_user(username, password): + if utils.is_auth_needed(): + + connection_properties = configs.connection_properties(username=username, password=password) + connect = None + try: + connect = psycopg2.connect(**connection_properties) + connect.cursor() + return True + except psycopg2.Error: + return False + finally: + if connect: + connect.close() + else: + return True + + +def common_authorization(func_to_decorate): + @wraps(func_to_decorate) + def wrap(self, *args, **kwargs): + if utils.is_auth_needed(): + content_type = request.headers.get('Content-Type') + + if content_type and content_type.split(";")[0] != 'application/json' \ + and request.headers.get('Content-Length'): + return "Invalid request body: Content Type is not json", http.client.BAD_REQUEST + + backup_request = request.get_json() or {} + + for k in list(backup_request.keys()): + if k not in self.allowed_fields: + return "Unknown field: %s" % k.encode('utf-8'), http.client.BAD_REQUEST + + databases = backup_request.get('databases') or [] + + cred = request.authorization + if not cred: + abort(401, 'Credentials should be provided for this endpoint') + + databases_count = len(databases) + if databases_count == 1: + dbname = databases[0] + connection_properties = \ + configs.connection_properties(username=cred.username, password=cred.password, database='postgres') + connect = None + try: + connect = psycopg2.connect(**connection_properties) + with connect.cursor() as cur: + cur.execute(""" + SELECT pg_catalog.pg_get_userbyid(d.datdba) as Owner + FROM pg_catalog.pg_database d WHERE d.datname = %s + ORDER BY 1; + """, (dbname,)) + database_owner = cur.fetchone()[0] + if database_owner == cred.username: + return func_to_decorate(self, *args, **kwargs) + else: + abort(403, 'You are not authorized to perform such action') + finally: + if connect: + connect.close() + elif not cred.username == configs.postgresql_user(): + abort(403, 'You are not authorized to perform such action') + else: + return func_to_decorate(self, *args, **kwargs) + else: + return func_to_decorate(self, *args, **kwargs) + + return wrap + + +if os.getenv("DEBUG") 
and os.getenv("DEBUG").lower() == 'true': + logging.getLogger().setLevel(logging.DEBUG) + + +def schedule_granular_backup(scheduler): + cron_pattern = configs.granular_cron_pattern() + if cron_pattern.lower() != 'none' and os.getenv("GRANULAR_BACKUP_SCHEDULE") != "": + if utils.is_mirror_env(): + logging.info('It is a mirror env') + return + logging.info('Start schedule granular backup') + databases = configs.dbs_to_granular_backup() + backup_request = {'databases': databases, 'namespace': 'schedule'} + items = cron_pattern.split(' ', 5) + minute, hour, day, month, day_of_week = items[0], items[1], items[2], items[3], items[4] + + granular_backup_request = GranularBackupRequestEndpoint() + + return scheduler.add_job( + granular_backup_request.perform_granular_backup, + 'cron', + [backup_request], + minute=minute, + hour=hour, + day=day, + month=month, + day_of_week=day_of_week) + + +def schedule_diff_backup(scheduler): + cron_pattern = configs.diff_cron_pattern() + logging.info(f'DIFF SHEDULE {os.getenv("DIFF_SCHEDULE")}') + if cron_pattern.lower() != 'none' and os.getenv("DIFF_SCHEDULE") is not None: + logging.info('Start schedule diff backup') + items = cron_pattern.split(' ', 5) + logging.info(f"{items} cron items") + minute, hour, day, month, day_of_week = items[0], items[1], items[2], items[3], items[4] + + diff_backup_request = DiffBackupRequestEndpoint() + + return scheduler.add_job( + diff_backup_request.perform_diff_backup, + 'cron', + minute=minute, + hour=hour, + day=day, + month=month, + day_of_week=day_of_week) + +class GranularBackupsListEndpoint(flask_restful.Resource): + + def __init__(self): + self.log = logging.getLogger('BackupsListEndpoint') + self.s3 = storage_s3.AwsS3Vault() if os.environ['STORAGE_TYPE'] == "s3" else None + + @auth.login_required + def get(self): + # for gke full backup + # if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): + # self.log.info('Getting GKE backup list') + # client = utils.GkeBackupApiCaller() + # response = client.backup_list() + # return response + status = {} + storage = configs.backups_storage() + + if self.s3: + if not self.s3.is_s3_storage_path_exist(storage): + return "Backups in s3 storage does not exist.", http.client.NOT_FOUND + namespaces = self.s3.get_granular_namespaces(storage) + elif not os.path.exists(storage): + return "Backups storage does not exist.", http.client.NOT_FOUND + + for namespace in os.listdir(storage) if not self.s3 else namespaces: + if not backups.is_valid_namespace(namespace): + continue + + status[namespace] = {} + if self.s3: + backup_ids = self.s3.get_backup_ids(storage, namespace) + for backup in os.listdir(backups.build_namespace_path(namespace)) if not self.s3 else backup_ids: + status_file = backups.build_backup_status_file_path(backup, namespace) + if self.s3: + try: + if self.s3.is_file_exists(status_file): + status_file = self.s3.read_object(status_file) + backup_status = json.loads(status_file) + status[namespace][backup] = { + 'status': backup_status.get('status'), + 'created': backup_status.get('created'), + 'expirationDate': backup_status.get('expirationDate') + } + else: + self.log.error("Cannot find status file in bucket with backup id {}".format(backup)) + status[namespace][backup] = {'status': 'Unknown'} + + except ValueError: + self.log.exception("Cannot read status file") + status[namespace][backup] = {'status': 'Unknown'} + + elif os.path.isfile(status_file): + with open(status_file, 'r') as f: + try: + backup_status = json.load(f) + status[namespace][backup] = { + 'status': 
backup_status.get('status'), + 'created': backup_status.get('created'), + 'expirationDate': backup_status.get('expirationDate') + } + except ValueError: + self.log.exception("Cannot read status file") + status[namespace][backup] = {'status': 'Unknown'} + else: + status[namespace][backup] = {'status': 'Unknown'} + + return status, http.client.OK + + +class GranularBackupRequestEndpoint(flask_restful.Resource): + + def __init__(self): + self.log = logging.getLogger('BackupRequestEndpoint') + self.allowed_fields = ['backupId', + 'namespace', + 'databases', + 'keep', + 'compressionLevel', + 'externalBackupPath'] + + def perform_granular_backup(self, backup_request): + # # for gke full backup + # if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): + # self.log.info('Perform GKE backup') + # client = utils.GkeBackupApiCaller() + # backup_id = client.perform_backup() + # if "error" not in backup_id: + # return { + # 'backupId': backup_id + # }, http.client.ACCEPTED + # else: + # return backup_id, http.client.BAD_REQUEST + + self.log.info('Perform granular backup') + + for k in list(backup_request.keys()): + if k not in self.allowed_fields: + self.log.exception("Unknown field: %s" % k.encode('utf-8')) + return "Unknown field: %s" % k.encode('utf-8'), http.client.BAD_REQUEST + + databases = backup_request.get('databases') or [] + namespace = backup_request.get('namespace') or configs.default_namespace() + if databases: + baselist = utils.get_database_list(databases) + + if not isinstance(databases, list) and not isinstance(databases, tuple): + self.log.exception("Field 'database' must be an array.") + return "Field 'database' must be an array.", http.client.BAD_REQUEST + + if not backups.is_valid_namespace(namespace): + self.log.exception("Invalid namespace name: %s." % namespace.encode('utf-8')) + return "Invalid namespace name: %s." % namespace.encode('utf-8'), http.client.BAD_REQUEST + + for database in databases: + if backups.is_database_protected(database): + self.log.exception("Database '%s' is not suitable for backup/restore." % database) + return "Database '%s' is not suitable for backup/restore." 
% database, http.client.FORBIDDEN + + if database not in baselist: + self.log.exception("Database '%s' does not exist" % database) + return "Database '%s' does not exist" % database, http.client.BAD_REQUEST + + backup_id = backups.generate_backup_id() + backup_request['backupId'] = backup_id + + worker = pg_backup.PostgreSQLDumpWorker(databases, backup_request) + + worker.start() + + return { + 'backupId': backup_id + }, http.client.ACCEPTED + + @auth.login_required + @common_authorization + def post(self): + content_type = request.headers.get('Content-Type') + + if content_type and content_type.split(";")[0] != 'application/json' \ + and request.headers.get('Content-Length'): + return "Invalid request body: Content Type is not json", http.client.BAD_REQUEST + + backup_request = request.get_json() or {} + + return self.perform_granular_backup(backup_request) + + +class GranularBackupStatusEndpoint(flask_restful.Resource): + + def __init__(self): + self.log = logging.getLogger('BackupRequestEndpoint') + self.s3 = storage_s3.AwsS3Vault() if os.environ['STORAGE_TYPE'] == "s3" else None + + @auth.login_required + def get(self, backup_id): + if not backup_id: + return "Backup ID is not specified.", http.client.BAD_REQUEST + # for gke full backup + # if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): + # self.log.info('Getting GKE backup status') + # client = utils.GkeBackupApiCaller() + # response = client.backup_status(backup_id) + # return response + + namespace = flask.request.args.get('namespace') or configs.default_namespace() + + if not backups.is_valid_namespace(namespace): + return "Invalid namespace name: %s." % namespace.encode('utf-8'), http.client.BAD_REQUEST + + external_backup_path = flask.request.args.get('externalBackupPath') or None + external_backup_root = None + if external_backup_path is not None: + external_backup_root = backups.build_external_backup_root(external_backup_path) + backup_status_file = backups.build_backup_status_file_path(backup_id, namespace, external_backup_root) + if self.s3: + try: + status = self.s3.read_object(backup_status_file) + logging.info(status) + + except: + return "Backup in bucket is not found.", http.client.NOT_FOUND + return json.loads(status), http.client.OK + else: + if not os.path.isfile(backup_status_file): + return "Backup is not found.", http.client.NOT_FOUND + + return utils.get_json_by_path(backup_status_file), http.client.OK + + +class GranularBackupStatusJSONEndpoint(flask_restful.Resource): + + def __init__(self): + self.log = logging.getLogger('BackupRequestEndpoint') + self.allowed_fields = ['backupId', 'namespace', 'externalBackupPath'] + self.s3 = storage_s3.AwsS3Vault() if os.environ['STORAGE_TYPE'] == "s3" else None + + @auth.login_required + def post(self): + backup_request = flask.request.get_json() or {} + + for k in list(backup_request.keys()): + if k not in self.allowed_fields: + return "Unknown field: %s" % k.encode('utf-8'), http.client.BAD_REQUEST + + backup_id = backup_request.get('backupId') + namespace = backup_request.get('namespace') or configs.default_namespace() + + if not backups.is_valid_namespace(namespace): + return "Invalid namespace name: %s." 
% namespace.encode('utf-8'), http.client.BAD_REQUEST + + if not backup_request: + return "Request body is empty.", http.client.BAD_REQUEST + + if not backup_id: + return "Backup ID is not specified.", http.client.BAD_REQUEST + + external_backup_path = backup_request.get('externalBackupPath') + external_backup_root = None + if external_backup_path is not None: + external_backup_root = backups.build_external_backup_root(external_backup_path) + status_path = backups.build_backup_status_file_path(backup_id, namespace, external_backup_root) + + if self.s3: + try: + status = self.s3.read_object(status_path) + logging.info(status) + + except: + return "Backup in bucket is not found.", http.client.NOT_FOUND + return json.loads(status), http.client.OK + else: + if not os.path.isfile(status_path): + return "Backup is not found.", http.client.NOT_FOUND + + with open(status_path) as f: + return json.load(f), http.client.OK + + +class GranularRestoreRequestEndpoint(flask_restful.Resource): + + def __init__(self): + self.log = logging.getLogger('BackupRequestEndpoint') + self.allowed_fields = ['backupId', 'namespace', 'databases', 'force', 'restoreRoles', 'databasesMapping', + 'externalBackupPath', 'singleTransaction', "dbaasClone"] + self.s3 = storage_s3.AwsS3Vault() if os.environ['STORAGE_TYPE'] == "s3" else None + + @auth.login_required + @superuser_authorization + def post(self): + restore_request = flask.request.get_json() or {} + # for gke full backup + # if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): + # self.log.info('Perform GKE restore') + # client = utils.GkeBackupApiCaller() + # response = client.restore(restore_request) + # return response + + for k in list(restore_request.keys()): + if k not in self.allowed_fields: + return "Unknown field: %s" % k.encode('utf-8'), http.client.BAD_REQUEST + + databases = restore_request.get('databases') or [] + databases_mapping = restore_request.get('databasesMapping') or {} + + if not isinstance(databases, list) and not isinstance(databases, tuple): + return "Field 'database' must be an array.", http.client.BAD_REQUEST + + if not isinstance(databases_mapping, dict): + return "Field 'database_mapping' must be a dictionary.", http.client.BAD_REQUEST + + backup_id = restore_request.get('backupId') + if not backup_id: + return "Backup ID is not specified.", http.client.BAD_REQUEST + + namespace = restore_request.get('namespace') or configs.default_namespace() + + if not backups.is_valid_namespace(namespace): + return "Invalid namespace name: %s." % namespace.encode('utf-8'), http.client.BAD_REQUEST + + external_backup_path = restore_request.get('externalBackupPath') + external_backup_root = None + if external_backup_path is not None: + external_backup_root = backups.build_external_backup_root(external_backup_path) + backup_details_file = backups.build_backup_status_file_path(backup_id, namespace, external_backup_root) + + if self.s3: + try: + status = self.s3.read_object(backup_details_file) + except: + return "Backup in bucket is not found.", http.client.NOT_FOUND + backup_details = json.loads(status) + + else: + if not os.path.isfile(backup_details_file): + return "Backup is not found.", http.client.NOT_FOUND + + with open(backup_details_file, 'r') as f: + backup_details = json.load(f) + + backup_status = backup_details['status'] + + if backup_status != backups.BackupStatus.SUCCESSFUL: + return "Backup status '%s' is unsuitable status for restore." 
% backup_status, http.client.FORBIDDEN + if self.s3: + databases = list(backup_details.get('databases', {}).keys()) + for database in databases: + if not self.s3.is_file_exists(backups.build_database_backup_path(backup_id, database, + namespace, external_backup_root)): + return "Backup in bucket is not found.", http.client.NOT_FOUND + elif not backups.backup_exists(backup_id, namespace, external_backup_root): + return "Backup is not found.", http.client.NOT_FOUND + + ghost_databases = [] + uncompleted_backups = [] + + databases = restore_request.get('databases') or list(backup_details.get('databases', {}).keys()) + + # dict of owners {"db": "db_owner", ..} + owners_mapping = {} + + for database in databases: + database_details = backup_details['databases'].get(database) + if not database_details: + ghost_databases.append(database) + continue + if database_details['status'] != backups.BackupStatus.SUCCESSFUL: + uncompleted_backups.append((database, database_details['status'])) + continue + + owners_mapping[database] = database_details.get('owner', 'postgres') + + if ghost_databases: + return "Databases are not found: %s." % ', '.join([db.encode('utf-8') for db in ghost_databases]), \ + http.client.NOT_FOUND + + if uncompleted_backups: + return "Database backup is in unsuitable status for restore: %s." \ + % ', '.join(['%s: %s' % (i[0].encode('utf-8'), i[1]) for i in uncompleted_backups]), \ + http.client.FORBIDDEN + + tracking_id = backups.generate_restore_id(backup_id, namespace) + restore_request['trackingId'] = tracking_id + + # force is false by default + force = False + force_param = restore_request.get('force') + + if force_param: + if isinstance(force_param, str): + force = force_param == 'true' + elif type(force_param) is bool: + force = force_param + + + # restore_roles is true by default + restore_roles = True + restore_roles_param = restore_request.get('restoreRoles', True) + + if restore_roles_param: + if isinstance(restore_roles_param, str): + restore_roles = restore_roles_param == 'true' + elif type(restore_roles_param) is bool: + restore_roles = restore_roles_param + single_transaction = False + single_transaction_param = restore_request.get('singleTransaction', True) + if single_transaction_param: + if isinstance(single_transaction_param, str): + single_transaction = single_transaction_param == 'true' + elif type(single_transaction_param) is bool: + single_transaction = single_transaction_param + + is_dbaas_clone= restore_request.get('dbaasClone') + worker = pg_restore.PostgreSQLRestoreWorker(databases, force, restore_request, databases_mapping, + owners_mapping, restore_roles,single_transaction, is_dbaas_clone) + + worker.start() + + return { + 'trackingId': tracking_id + }, http.client.ACCEPTED + + +class TerminateBackupEndpoint(flask_restful.Resource): + + def __init__(self): + self.log = logging.getLogger("TerminateBackupEndpoint") + + @auth.login_required + def post(self, backup_id): + self.log.info("Terminate request accepted for backup {}".format(backup_id)) + cancelled = False + + try: + for thread in threading.enumerate(): + if thread.name == str(backup_id): + thread.cancel() + cancelled = thread.is_cancelled() + if cancelled: + self.log.info("Backup {} terminated successfully".format(thread.name)) + return Response("Backup %s terminated successfully\n" % backup_id, status=200) + else: + self.log.info("There is no active backup with id {}".format(backup_id)) + return Response("There is no active backup with id: %s\n" % backup_id, status=404) + except Exception as e: 
+            self.log.exception("Backup {0} termination failed. \n {1}".format(backup_id, str(e)))
+            return Response("Backup {} termination failed".format(backup_id), status=500)
+
+
+class TerminateRestoreEndpoint(flask_restful.Resource):
+
+    def __init__(self):
+        self.log = logging.getLogger("TerminateRestoreEndpoint")
+
+    @auth.login_required
+    def post(self, tracking_id):
+        self.log.info("Terminate request accepted for id {}".format(tracking_id))
+        cancelled = False
+
+        try:
+            for thread in threading.enumerate():
+                if thread.name == str(tracking_id):
+                    thread.cancel()
+                    cancelled = thread.is_cancelled()
+            if cancelled:
+                self.log.info("Restore {} terminated successfully".format(tracking_id))
+                return Response("Restore %s terminated successfully\n" % tracking_id, status=200)
+            else:
+                self.log.info("There is no active restore with id {}".format(tracking_id))
+                return Response("There is no active restore with id: %s\n" % tracking_id, status=404)
+        except Exception as e:
+            self.log.exception("Restore {0} termination failed. \n {1}".format(tracking_id, str(e)))
+            return Response("Restore {} termination failed".format(tracking_id), status=500)
+
+
+class GranularRestoreStatusEndpoint(flask_restful.Resource):
+
+    def __init__(self):
+        self.log = logging.getLogger('GranularRestoreStatusEndpoint')
+        self.s3 = storage_s3.AwsS3Vault() if os.environ['STORAGE_TYPE'] == "s3" else None
+
+    @auth.login_required
+    @superuser_authorization
+    def get(self, tracking_id):
+        if not tracking_id:
+            return "Restore tracking ID is not specified.", http.client.BAD_REQUEST
+
+        # for gke full backup
+        # if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
+        #     self.log.info('Getting GKE restore status')
+        #     client = utils.GkeBackupApiCaller()
+        #     response = client.restore_status(tracking_id)
+        #     return response
+
+        try:
+            backup_id, namespace = backups.extract_backup_id_from_tracking_id(tracking_id)
+        except Exception as e:
+            self.log.exception(e)
+            return 'Malformed restore tracking ID.', http.client.BAD_REQUEST
+
+        external_backup_path = flask.request.args.get('externalBackupPath') or None
+        external_backup_root = None
+        if external_backup_path is not None:
+            external_backup_root = backups.build_external_backup_root(external_backup_path)
+        restore_status_file = backups.build_restore_status_file_path(backup_id, tracking_id, namespace,
+                                                                     external_backup_root)
+
+        if not backups.is_valid_namespace(namespace):
+            return "Invalid namespace name: %s."
% namespace.encode('utf-8'), http.client.BAD_REQUEST + if self.s3: + try: + status = self.s3.read_object(restore_status_file) + logging.info(status) + + except: + return "Backup in bucket is not found.", http.client.NOT_FOUND + return json.loads(status), http.client.OK + else: + if not os.path.isfile(restore_status_file): + return "Restore is not found.", http.client.NOT_FOUND + + return utils.get_json_by_path(restore_status_file), http.client.OK + + +class GranularRestoreStatusJSONEndpoint(flask_restful.Resource): + + def __init__(self): + self.log = logging.getLogger('BackupRequestEndpoint') + self.allowed_fields = ['trackingId', 'externalBackupPath'] + self.s3 = storage_s3.AwsS3Vault() if os.environ['STORAGE_TYPE'] == "s3" else None + + @auth.login_required + @superuser_authorization + def post(self): + tracking_request = flask.request.get_json() or {} + + for k in list(tracking_request.keys()): + if k not in self.allowed_fields: + return "Unknown field: %s" % k.encode('utf-8'), http.client.BAD_REQUEST + + if not tracking_request: + return "Restore tracking request has empty body.", http.client.BAD_REQUEST + + tracking_id = tracking_request.get('trackingId') + + if not tracking_id: + return "Restore tracking ID is not specified.", http.client.BAD_REQUEST + + try: + backup_id, namespace = backups.extract_backup_id_from_tracking_id(tracking_id) + except Exception as e: + self.log.exception(e) + return 'Malformed restore tracking ID.', http.client.BAD_REQUEST + + external_backup_path = tracking_request.get('externalBackupPath') + external_backup_root = None + if external_backup_path is not None: + external_backup_root = backups.build_external_backup_root(external_backup_path) + restore_status_file = backups.build_restore_status_file_path(backup_id, tracking_id, namespace, + external_backup_root) + if self.s3: + try: + status = self.s3.read_object(restore_status_file) + logging.info(status) + + except: + return "Backup in bucket is not found.", http.client.NOT_FOUND + return json.loads(status), http.client.OK + else: + if not os.path.isfile(restore_status_file): + return "Restore is not found.", http.client.NOT_FOUND + + with open(restore_status_file) as f: + return json.load(f), http.client.OK + + +class GranularBackupDeleteEndpoint(flask_restful.Resource): + + def __init__(self): + self.log = logging.getLogger('GranularBackupDeleteEndpoint') + self.s3 = storage_s3.AwsS3Vault() if os.environ['STORAGE_TYPE'] == "s3" else None + + @auth.login_required + @superuser_authorization + def get(self, backup_id): + return self.process_delete(backup_id) + + @auth.login_required + @superuser_authorization + def post(self, backup_id): + return self.process_delete(backup_id) + + def process_delete(self, backup_id): + self.log.info("Request to delete backup %s" % backup_id) + if not backup_id: + return self.response(backup_id, + "Backup ID is not specified.", + backups.BackupStatus.FAILED, + http.client.BAD_REQUEST) + + # for gke full backup + # if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): + # self.log.info('Perform GKE backup delete') + # client = utils.GkeBackupApiCaller() + # response = client.delete_backup(backup_id) + # return response + + namespace = flask.request.args.get('namespace') or configs.default_namespace() + + if not is_valid_namespace(namespace): + return self.response(backup_id, + "Invalid namespace name: %s." 
% namespace.encode('utf-8'), + backups.BackupStatus.FAILED, + http.client.BAD_REQUEST) + + backup_status_file = build_backup_status_file_path(backup_id, namespace) + if self.s3: + try: + self.s3.read_object(backup_status_file) + except: + return "Backup in bucket is not found.", http.client.NOT_FOUND + + elif not os.path.isfile(backup_status_file): + return self.response(backup_id, + "Backup is not found.", + backups.BackupStatus.FAILED, + http.client.NOT_FOUND) + + try: + dir = build_backup_path(backup_id, namespace) + if self.s3: + self.s3.delete_objects(dir) + else: + terminate = TerminateBackupEndpoint() + terminate.post(backup_id) + shutil.rmtree(dir) + + # remove namespace dir if no more backups in namespace + backup_list = os.listdir(build_namespace_path(namespace)) + if len(backup_list) == 0 and namespace != 'default': + shutil.rmtree(build_namespace_path(namespace)) + + except Exception as e: + self.log.exception(e) + return self.response(backup_id, + 'An error occurred while deleting backup {} : {}.'.format(backup_id, e), + backups.BackupStatus.FAILED, + http.client.INTERNAL_SERVER_ERROR) + + return self.response(backup_id, "Backup deleted successfully.", backups.BackupStatus.SUCCESSFUL, http.client.OK) + + def response(self, backup_id, message, status, code): + return { + 'backupId': backup_id, + 'message': message, + 'status': status + }, code + + +class GranularBackupHealthEndpoint(flask_restful.Resource): + + def __init__(self): + self.log = logging.getLogger('GranularBackupHealthEndpoint') + self.s3 = storage_s3.AwsS3Vault() if os.environ['STORAGE_TYPE'] == "s3" else None + + def get(self): + + status = {} + namespace = "schedule" + status[namespace] = {} + + namespace_path = backups.build_namespace_path(namespace) + if not os.path.exists(namespace_path): + return status, http.client.OK + + sorted_backups = sorted(os.listdir(namespace_path), reverse=True) + dump_count = len(sorted_backups) + space = os.statvfs(namespace_path) + free_space, total_space = space.f_bfree * space.f_bsize, space.f_blocks * space.f_bsize + status[namespace]['dump_count'] = dump_count + status[namespace]['total_space'] = total_space + status[namespace]['free_space'] = free_space + + if len(sorted_backups) > 0: + status[namespace]['backup'] = { + 'count': len(sorted_backups) + } + last_backup = sorted_backups[-1] + status_file = backups.build_backup_status_file_path(last_backup, namespace) + if os.path.isfile(status_file): + with open(status_file, 'r') as f: + try: + backup_status = json.load(f) + status[namespace]['last'] = { + 'id': last_backup, + 'status': backup_status.get('status'), + 'status_id': backups.get_backup_status_id(backup_status.get('status')), + 'created': backup_status.get('created'), + 'expires': backup_status.get('expires'), + 'expirationDate': backup_status.get('expirationDate') + } + return status, http.client.OK + except ValueError: + self.log.exception("Cannot read status file") + status[namespace]['last'] = { + 'id': last_backup, + 'status': backups.BackupStatus.UNKNOWN, + 'status_id': backups.get_backup_status_id(backups.BackupStatus.UNKNOWN) + } + else: + status[namespace]['last'] = { + 'id': last_backup, + 'status': backups.BackupStatus.UNKNOWN, + 'status_id': backups.get_backup_status_id(backups.BackupStatus.UNKNOWN) + } + + return status, http.client.OK + + +class GranularBackupDownloadEndpoint(flask_restful.Resource): + + def __init__(self): + self.log = logging.getLogger("GranularBackupDownloadEndpoint") + self.s3 = storage_s3.AwsS3Vault() if os.environ['STORAGE_TYPE'] 
== "s3" else None
+
+    @auth.login_required
+    def get(self, backup_id):
+        self.log.info("Download request accepted")
+
+        def generate(stream_path):
+            stream = io.FileIO(stream_path, "r", closefd=True)
+            with stream as f:
+                chunk_size = 4096
+                while True:
+                    data = f.read(chunk_size)
+                    if len(data) == 0:
+                        f.close()
+                        os.remove(stream_path)
+                        self.log.info("Download ends")
+                        return
+                    yield data
+
+        namespace = flask.request.args.get('namespace') or configs.default_namespace()
+        path_for_streaming = utils.get_backup_tar_file_path(backup_id, namespace)
+        if path_for_streaming:
+            return Response(stream_with_context(
+                generate(path_for_streaming)),
+                mimetype='application/octet-stream',
+                headers=[
+                    ('Content-Type', 'application/octet-stream'),
+                    ('Content-Disposition',
+                     "attachment; filename=pg_granular_backup_{}.tar.gz".format(
+                         backup_id))
+                ])
+        else:
+            return Response("Cannot find backup", status=404)
+
+
+class DiffBackupRequestEndpoint(flask_restful.Resource):
+
+    def __init__(self):
+        self.log = logging.getLogger('DifferentialBackup')
+        self.allowed_fields = ['timestamp']
+
+    def perform_diff_backup(self):
+        self.log.info('Perform diff backup')
+
+        backup_id = backups.generate_backup_id()
+        payload = {'timestamp': backup_id}
+        r = requests.post("http://pgbackrest:3000/backup/diff", payload)
+        if r.status_code == 200:
+            return {
+                'backupId': backup_id
+            }, http.client.ACCEPTED
+        else:
+            return "Differential backup request failed.", r.status_code
+
+    def post(self):
+        content_type = request.headers.get('Content-Type')
+
+        # if content_type and content_type.split(";")[0] != 'application/json' \
+        #         and request.headers.get('Content-Length'):
+        #     return "Invalid request body: Content Type is not json", http.client.BAD_REQUEST
+
+        return self.perform_diff_backup()
+
+
+class GranularBackupStatusInfoEndpoint(flask_restful.Resource):
+
+    def __init__(self):
+        self.log = logging.getLogger('GranularBackupStatusMetricEndpoint')
+        self.s3 = storage_s3.AwsS3Vault() if os.environ['STORAGE_TYPE'] == "s3" else None
+
+    @auth.login_required
+    def get(self):
+        self.log.info("Backups metric gathering")
+        storage = configs.backups_storage()
+        s3 = None
+        if os.environ['STORAGE_TYPE'] == "s3":
+            s3 = backups.get_s3_client()
+            namespaces = s3.get_granular_namespaces(storage)
+
+        all_backups = []
+
+        for namespace in os.listdir(storage) if not s3 else namespaces:
+            if s3:
+                backup_ids = s3.get_backup_ids(storage, namespace)
+
+            for backup_id in os.listdir(build_namespace_path(namespace)) if not s3 else backup_ids:
+                status_file = build_backup_status_file_path(backup_id, namespace)
+                if s3:
+                    try:
+                        if s3.is_file_exists(status_file):
+                            status_file = s3.read_object(status_file)
+                            backup_details = json.loads(status_file)
+                            all_backups.append(self.build_backup_info(backup_details))
+                            continue
+                        else:
+                            self.log.error("Cannot find status file in bucket with backup id {}".format(backup_id))
+                            failed_backup = {"backupId": backup_id, "namespace": namespace, "status": backups.BackupStatus.FAILED}
+                            all_backups.append(failed_backup)
+                            continue
+                    except ValueError:
+                        self.log.exception("Cannot read status file")
+
+                if not os.path.isfile(status_file):
+                    failed_backup = {"backupId": backup_id, "namespace": namespace, "status": backups.BackupStatus.FAILED}
+                    all_backups.append(failed_backup)
+                    continue
+                else:
+                    backup_details = utils.get_json_by_path(status_file)
+                    all_backups.append(self.build_backup_info(backup_details))
+        response = {"granular": all_backups}
+
+        return response, http.client.OK
+
+    def build_backup_info(self, backup):
+
+        backupInfo = {
"backupId": backup.get("backupId", "UNDEFINED"), + "namespace": backup.get("namespace", "UNDEFINED"), + "status": backup.get("status", "UNDEFINED"), + "expirationDate": backup.get("expirationDate", "UNDEFINED"), + "created": backup.get("created", "UNDEFINED"), + } + + return backupInfo + + + +app = Flask("GranularREST") +collector_endpoint = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", "") +if collector_endpoint != "": + collector_endpoint = "http://" + collector_endpoint + NAMESPACE_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/namespace' + ns = open(NAMESPACE_PATH).read() + resource = Resource(attributes={ + SERVICE_NAME: "postgresql-backup-daemon-" + ns + }) + provider = TracerProvider(resource=resource) + processor = BatchSpanProcessor(OTLPSpanExporter(endpoint=collector_endpoint, insecure=True)) + provider.add_span_processor(processor) + FlaskInstrumentor().instrument_app(app=app, tracer_provider=provider, excluded_urls="health,/health,v2/health,/v2/health") +api = flask_restful.Api(app) + +api.add_resource(GranularBackupsListEndpoint, '/backups') +api.add_resource(GranularBackupRequestEndpoint, '/backup/request') +api.add_resource(GranularBackupStatusEndpoint, '/backup/status/') +api.add_resource(GranularBackupStatusJSONEndpoint, '/backup/status') +api.add_resource(GranularRestoreRequestEndpoint, '/restore/request') +api.add_resource(TerminateBackupEndpoint, '/terminate/') +api.add_resource(TerminateRestoreEndpoint, '/restore/terminate/') +api.add_resource(GranularRestoreStatusEndpoint, '/restore/status/') +api.add_resource(GranularRestoreStatusJSONEndpoint, '/restore/status') +api.add_resource(GranularBackupDeleteEndpoint, '/delete/') +api.add_resource(GranularBackupHealthEndpoint, '/health') +api.add_resource(GranularBackupDownloadEndpoint, '/backup/download/') +api.add_resource(DiffBackupRequestEndpoint, '/backup/diff') +api.add_resource(GranularBackupStatusInfoEndpoint, '/backup/info') + +scheduler = BackgroundScheduler() +scheduler.start() +scheduler.add_job(backups.sweep_manager, 'interval', seconds=configs.eviction_interval()) +schedule_granular_backup(scheduler) + +# Add diff scheduler + +backrest_scheduler = BackgroundScheduler() +backrest_scheduler.start() +schedule_diff_backup(scheduler) \ No newline at end of file diff --git a/docker/granular/kube_utils.py b/docker/granular/kube_utils.py new file mode 100644 index 0000000..95ec364 --- /dev/null +++ b/docker/granular/kube_utils.py @@ -0,0 +1,36 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import logging +from kubernetes import client, config +from kubernetes.client.rest import ApiException + +NS_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" + +log = logging.getLogger("KubernetesAPI") +namespace = open(NS_PATH).read() + +config.load_incluster_config() +api_instance = client.CoreV1Api() + +def get_configmap(name): + try: + api_response = api_instance.read_namespaced_config_map(name, namespace) + return api_response + except ApiException as e: + if e.status != 404: + log.error(f'cannot get cm {namespace}/{name}') + raise e + \ No newline at end of file diff --git a/docker/granular/pg_backup.py b/docker/granular/pg_backup.py new file mode 100644 index 0000000..9f869f1 --- /dev/null +++ b/docker/granular/pg_backup.py @@ -0,0 +1,517 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import http.client +import datetime +import glob +import logging +import os +import subprocess +import time +from threading import Thread, Event +from subprocess import Popen, PIPE +import psycopg2 + +import utils +import backups +import configs +import encryption +import storage_s3 + + +class PostgreSQLDumpWorker(Thread): + + def __init__(self, databases, backup_request): + Thread.__init__(self) + + self.log = logging.getLogger("PostgreSQLDumpWorker") + + self.backup_id = backup_request.get('backupId') or backups.generate_backup_id() + self.name = self.backup_id + self.namespace = backup_request.get('namespace') or configs.default_namespace() + self.compression_level = backup_request.get('compressionLevel', 9) + self.keep = backup_request.get('keep') or configs.default_backup_expiration_period() + self.postgres_version = utils.get_version_of_pgsql_server() + self.is_standard_storage = True if backup_request.get('externalBackupPath') is None else False + self.location = configs.backups_storage(self.postgres_version) if self.is_standard_storage \ + else backups.build_external_backup_root(backup_request.get('externalBackupPath')) + self.external_backup_root = None if self.is_standard_storage else self.location + self.bin_path = configs.get_pgsql_bin_path(self.postgres_version) + self.parallel_jobs = configs.get_parallel_jobs() + self.databases = databases if databases else [] + self.backup_dir = backups.build_backup_path(self.backup_id, self.namespace, self.external_backup_root) + self.create_backup_dir() + self.s3 = storage_s3.AwsS3Vault() if os.environ['STORAGE_TYPE'] == "s3" else None + self._cancel_event = Event() + if configs.get_encryption(): + self.encryption = True + self.key = encryption.KeyManagement.get_object().get_password() + self.key_name = encryption.KeyManagement.get_key_name() + self.key_source = encryption.KeyManagement.get_key_source() + else: + self.encryption = False + self.status = { + 'backupId': self.backup_id, + 'namespace': self.namespace, + 'status': backups.BackupStatus.PLANNED + } + self.pg_dump_proc = None + + self.flush_status(self.external_backup_root) + + def cancel(self): + 
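+        # Terminate the in-flight pg_dump/openssl process (if any) first, then set the cancel
+        # event so run() can report the cancellation via on_cancel().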
self.kill_processes() + self._cancel_event.set() + self.log.info(self.log_msg("Worker stopped")) + + def is_cancelled(self): + return self._cancel_event.is_set() + + def log_msg(self, msg): + return "[backupId={}] {}".format(self.backup_id, msg) + + def kill_processes(self): + if self.pg_dump_proc: + self.log.info("kill backup process with pid: {}".format(self.pg_dump_proc.pid)) + self.pg_dump_proc.kill() + + def update_status(self, key, value, database=None, flush=False): + if database: + databases_section = self.status.get('databases') + + if not databases_section: + databases_section = {} + self.status['databases'] = databases_section + + database_details = databases_section.get(database) + if not database_details: + database_details = {} + databases_section[database] = database_details + + database_details[key] = value + databases_section[database] = database_details + self.status['databases'] = databases_section + else: + self.status[key] = value + if flush or self.s3: + self.flush_status(self.external_backup_root) + + def flush_status(self, external_backup_storage=None): + path = backups.build_backup_status_file_path(self.backup_id, self.namespace, external_backup_storage) + utils.write_in_json(path, self.status) + if self.s3: + try: + # upload dumpfile + self.s3.upload_file(path) + except Exception as e: + raise e + + def stderr_file(self, database): + return '{}/{}.error'.format(self.backup_dir, database) + + def populate_databases_list(self): + connection_properties = configs.connection_properties() + conn = None + try: + conn = psycopg2.connect(**connection_properties) + with conn.cursor() as cur: + cur.execute("SELECT datname " + "FROM pg_database " + "WHERE datallowconn = 't' and " + " datistemplate = 'f' and " + " datname not in ({0})".format(",".join("'{0}'".format(x) for x in + configs.protected_databases()))) + self.databases = [db[0] for db in cur] + finally: + if conn: + conn.close() + + def backup_single_database(self, database): + self.log.info(self.log_msg("Start processing database '{}'.".format(database))) + self.log.info(self.log_msg("Will use binaries: '{}' for backup.".format(self.bin_path))) + + if database == 'postgres': + raise backups.BackupFailedException( + database, "Database 'postgres' is not suitable for " + "backup/restore since Patroni always keeps " + "connection to the database.") + + self.update_status('status', backups.BackupStatus.IN_PROGRESS, database) + self.update_status('timestamp', int(time.time()), database) + iso_date = datetime.datetime.fromtimestamp(self.status['timestamp']).isoformat() + self.update_status('created', str(iso_date), database, flush=True) + + pg_dump_backup_path = backups.build_backup_path(self.backup_id, self.namespace, self.external_backup_root) + + # Some databases may contain special symbols like '=', + # '!' and others, so use this WA. 
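+        # (for example, a database literally named "my=db!" would otherwise have to be escaped on
+        # the pg_dump command line; passing it through PGDATABASE sidesteps that)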
+ os.environ['PGDATABASE'] = database + if configs.postgres_password(): + os.environ['PGPASSWORD'] = configs.postgres_password() + + if int(self.parallel_jobs) > 1: + command = ['{}/pg_dump'.format(self.bin_path), + '--format=directory', + '--file', os.path.join(pg_dump_backup_path, database), + '--user', configs.postgresql_user(), + '--host', configs.postgresql_host(), + '--port', configs.postgresql_port(), + # '--clean', + # '--create', + # '--if-exists', + '--blobs'] + + command.extend(['-j', self.parallel_jobs]) + + # Zero is corner-case in Python :( + if self.compression_level or self.compression_level == 0: + command.extend(['-Z', str(self.compression_level)]) + + with open(self.stderr_file(database), "w+") as stderr: + start = time.time() + if self.encryption: + pg_dump_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=stderr) + openssl_proc = subprocess.Popen( + "openssl enc -aes-256-cbc -nosalt -pass pass:%s" % self.key, + stdin=pg_dump_proc.stdout, shell=True, stderr=stderr) + self.pg_dump_proc = openssl_proc + exit_code = openssl_proc.wait() + else: + pg_dump_proc = subprocess.Popen(command, stderr=stderr) + self.pg_dump_proc = pg_dump_proc + exit_code = pg_dump_proc.wait() + + if exit_code != 0: + with open(self.stderr_file(database)) as f: + raise backups.BackupFailedException(database, '\n'.join(f.readlines())) + + self.pg_dump_proc = None + + + else: + command = ['{}/pg_dump'.format(self.bin_path), + '--format=custom', + '--user', configs.postgresql_user(), + '--host', configs.postgresql_host(), + '--port', configs.postgresql_port(), + # '--clean', + # '--create', + # '--if-exists', + '--blobs'] + + if self.compression_level or self.compression_level == 0: + command.extend(['-Z', str(self.compression_level)]) + + database_backup_path = backups.build_database_backup_path(self.backup_id, database, + self.namespace, self.external_backup_root) + + + with open(database_backup_path, 'w+') as dump, \ + open(self.stderr_file(database), "w+") as stderr: + start = time.time() + # in case of encryption lets redirect output of pg_dump to openssl + if self.encryption: + pg_dump_proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=stderr) + openssl_proc = subprocess.Popen( + "openssl enc -aes-256-cbc -nosalt -pass pass:%s" % self.key, + stdin=pg_dump_proc.stdout, stdout=dump, shell=True, stderr=stderr) + self.pg_dump_proc = openssl_proc + exit_code = openssl_proc.wait() + else: + pg_dump_proc = subprocess.Popen(command, stdout=dump, stderr=stderr) + self.pg_dump_proc = pg_dump_proc + exit_code = pg_dump_proc.wait() + + if exit_code != 0: + with open(self.stderr_file(database)) as f: + raise backups.BackupFailedException(database, '\n'.join(f.readlines())) + + self.pg_dump_proc = None + if self.s3: + try: + # upload dumpfile + self.s3.upload_file(database_backup_path) + # upload errorfile + self.s3.upload_file(self.stderr_file(database)) + except Exception as e: + raise e + + self.update_status('duration', (int(time.time() - start)), database) + owner = utils.get_owner_of_db(database) + self.log.info(self.log_msg("owner of database '%s'." % owner)) + self.update_status('owner', owner, database) + + roles = self.fetch_roles(database) + self.dump_roles_for_db(roles, database) + self.update_status('duration', (int(time.time() - start)), database) + self.log.info(self.log_msg("Finished processing of database '%s'." 
% database)) + if self.s3: + os.remove(database_backup_path) + + def fetch_roles(self, database): + connection_properties = configs.connection_properties() + rolesList = [] + conn = None + try: + conn = psycopg2.connect(**connection_properties) + conn.autocommit = True + with conn.cursor() as cur: + # check if there are any active connections to pgsql + cur.execute("SELECT pid " + "FROM pg_stat_activity " + "WHERE pid <> pg_backend_pid() " + " AND datname = %s LIMIT 1", (database,)) + pids = [p[0] for p in cur.fetchall()] + + cur.execute(""" + SELECT r2.rolname grantee + FROM + (SELECT datname AS objname, + datallowconn, (aclexplode(datacl)).grantor AS grantorI, + (aclexplode(datacl)).grantee AS granteeI, + (aclexplode(datacl)).privilege_type + FROM pg_database) AS db + JOIN pg_roles r1 ON db.grantorI = r1.oid + JOIN pg_roles r2 ON db.granteeI = r2.oid + WHERE db.objname = %s + AND db.privilege_type = 'CONNECT' AND r2.rolname not in ('postgresadmin'); + """, (database,)) + rolesList = [p[0] for p in cur.fetchall()] + finally: + if conn: + conn.close() + + self.log.debug(self.log_msg("Roles {} have been fetched for backup ".format(rolesList))) + + roles_backup_path = backups.build_roles_backup_path(self.backup_id, database, + self.namespace, self.external_backup_root) + database_backup_path = backups.build_database_backup_path(self.backup_id, database, + self.namespace, self.external_backup_root) + + pg_dump_backup_path = backups.build_backup_path(self.backup_id, self.namespace, self.external_backup_root) + path_for_parallel_flag_backup = os.path.join(pg_dump_backup_path, database) + + self.log.debug("Will try to fetch users to %s" % roles_backup_path) + with open(self.stderr_file(database), "w+") as stderr: + fetch_command = \ + "| grep -P 'ALTER TABLE.*OWNER TO.*' | " \ + "awk 'NF>1{print substr($NF, 1, length($NF)-1)}' | uniq" + if self.encryption: + encrypt = "openssl enc -aes-256-cbc -nosalt -d -pass " \ + "pass:'%s' < '%s' | %s/pg_restore " % \ + (self.key, database_backup_path, self.bin_path) + fetch_command = encrypt + fetch_command + else: + if int(self.parallel_jobs) > 1: + dump_version = self.get_pg_version_from_dump(path_for_parallel_flag_backup) + else: + dump_version = self.get_pg_version_from_dump(database_backup_path) + + pg_restore_options = "-f - " + if dump_version[0] == 9 and dump_version[1] == 4: + pg_restore_options = "" + + if int(self.parallel_jobs) > 1: + pg_restore_options += " -j {} ".format(self.parallel_jobs) + fetch_command = ("%s/pg_restore '%s' %s" + fetch_command) % \ + (self.bin_path, path_for_parallel_flag_backup, pg_restore_options) + else: + fetch_command = ("%s/pg_restore '%s' %s" + fetch_command) % \ + (self.bin_path, database_backup_path, pg_restore_options) + self.log.debug("Roles fetch command: %s." % fetch_command) + p = Popen(fetch_command, shell=True, stdout=PIPE, stderr=stderr) + output, err = p.communicate() + exit_code = p.returncode + self.log.info("Roles search result: {} type: {} . 
Exit code: {}".format(output, type(output), exit_code)) + if exit_code != 0: + raise backups.BackupFailedException(database, '\n'.join( + stderr.readlines())) + rolesFromRestore = [x for x in output.decode().split("\n") if x.strip()] + rolesList = list(set(rolesList + rolesFromRestore)) + roles = "|".join(list( + [x for x in rolesList if x not in configs.protected_roles()])) + self.log.debug("Selected roles template: %s " % roles) + return roles + + def dump_roles_for_db(self, roles, database): + roles_backup_path = backups.build_roles_backup_path(self.backup_id, database, + self.namespace, self.external_backup_root) + + with open(roles_backup_path, 'w+') as dump, \ + open(self.stderr_file(database), "w+") as stderr: + if roles: + cmd = "{}/pg_dumpall --roles-only -U {} --host {} --port {} {}" \ + "| grep -P '{}' ".format(self.bin_path, + configs.postgresql_user(), + configs.postgresql_host(), + configs.postgresql_port(), + configs.postgresql_no_role_password_flag(), + roles + ) + + if self.encryption: + encrypt_cmd = \ + "| openssl enc -aes-256-cbc -nosalt -pass" \ + " pass:'{}'".format(self.key) + cmd = cmd + encrypt_cmd + p = Popen(cmd, shell=True, stdout=dump, stderr=stderr) + output, err = p.communicate() + self.log.debug("Fetch roles command: {}".format(cmd)) + exit_code = p.returncode + if exit_code != 0: + # stderr.seek(0) + raise backups.BackupFailedException( + database, '\n'.join(stderr.readlines())) + else: + self.log.info("No roles to fetch") + if self.s3: + try: + logging.info("Streaming {} roles to AWS".format(database)) + self.s3.upload_file(roles_backup_path) + except Exception as e: + raise e + finally: + os.remove(roles_backup_path) + + def cleanup(self, database): + if self.s3: + self.s3.delete_file(self.stderr_file(database)) + + os.remove(self.stderr_file(database)) + + def on_success(self, database): + database_backup_path = backups.build_database_backup_path(self.backup_id, database, self.namespace, self.external_backup_root) + + pg_dump_backup_path = backups.build_backup_path(self.backup_id, self.namespace, self.external_backup_root) + path_for_parallel_flag_backup = os.path.join(pg_dump_backup_path, database) + + if self.s3: + if int(self.parallel_jobs) > 1: + size_bytes = self.s3.get_file_size(path_for_parallel_flag_backup) + else: + size_bytes = self.s3.get_file_size(database_backup_path) + else: + if int(self.parallel_jobs) > 1: + size_bytes = os.path.getsize(path_for_parallel_flag_backup) + else: + size_bytes = os.path.getsize(database_backup_path) + self.update_status('path', backups. 
+ build_database_backup_full_path( + self.backup_id, database, self.location, self.namespace), database) + self.update_status('sizeBytes', size_bytes, database) + self.update_status('size', backups.sizeof_fmt(size_bytes), database) + if self.encryption: + self.update_status('key_name', self.key_name) + self.update_status('key_source', self.key_source) + self.update_status( + 'status', backups.BackupStatus.SUCCESSFUL, database, flush=True + ) + + def on_failure(self, database, e): + self.log.exception("Failed to backup database {0} {1}.".format(database, str(e))) + self.update_status('details', str(e), database) + self.update_status('status', backups.BackupStatus.FAILED, database) + + for f in glob.glob(self.backup_dir + '/*.dump'): + os.remove(f) + + def on_cancel(self, database=None): + if database: + self.log.exception("Backup got canceled for database {0}".format(database)) + self.update_status('details', "Backup got canceled for database {0}".format(database), database) + self.update_status('status', backups.BackupStatus.CANCELED, database) + self.update_status('details', "Backup got canceled for database") + self.update_status("status", backups.BackupStatus.CANCELED, flush=True) + + def expire(self, start_timestamp=None, keep=configs.default_backup_expiration_period()): + + if not start_timestamp: + start_timestamp = int(time.time()) + + if keep.lower() == 'forever': + self.update_status('expires', 'Never') + self.update_status('expirationDate', 'Never', flush=True) + else: + expiration_timestamp = backups.calculate_expiration_timestamp(start_timestamp, keep) + self.update_status('expires', expiration_timestamp) + self.update_status('expirationDate', str(datetime.datetime.fromtimestamp(expiration_timestamp).isoformat()), flush=True) + + def process_backup_request(self): + self.update_status('status', backups.BackupStatus.IN_PROGRESS, flush=True) + self.log.info(self.log_msg("Backup request processing has been started. Databases to backup: '{}'.".format(self.databases))) + + start_timestamp = int(time.time()) + self.expire(keep=self.keep) + self.update_status('timestamp', start_timestamp) + self.update_status('created', str(datetime.datetime.fromtimestamp(start_timestamp).isoformat())) + + for database in self.databases: + if backups.is_database_protected(database): + return "Database '{}' is not suitable for backup/restore.".format(database, http.client.FORBIDDEN) + #if self.should_be_skipped(database): + # self.log.info("Skipping Logical Database: {}, because it's not suitable for the backup.".format(database)) + try: + self.backup_single_database(database) + self.on_success(database) + except Exception as e: # Call on_failure here to mark database backup failed on any exception. 
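+                # on_failure() records the error in the status file and removes partial *.dump files;
+                # the exception is re-raised so run() marks the whole backup as FAILED.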
+ self.on_failure(database, e) + raise e + finally: + self.cleanup(database) + + def should_be_skipped(self, database): + # if not external, shouldn't skip + if not configs.is_external_pg(): + return False + + # if external check for _DBAAS_METADATA table presence + # establish connect to logical db and check if metadata presented + connection_properties = configs.connection_properties(database=database) + conn = None + try: + conn = psycopg2.connect(**connection_properties) + with conn.cursor() as cur: + cur.execute("select 1 from pg_tables where upper(tablename) = '_DBAAS_METADATA'") + return cur.fetchone() == None + finally: + if conn: + conn.close() + + def create_backup_dir(self): + if not os.path.exists(self.backup_dir): + os.makedirs(self.backup_dir) + + def run(self): + try: + if not self.databases: + self.log.info(self.log_msg("No databases specified for backup. " + "According to the contract, all databases will be backup'ed.")) + self.populate_databases_list() + + self.process_backup_request() + self.update_status('status', backups.BackupStatus.SUCCESSFUL, flush=True) + self.log.info(self.log_msg("Backup request processing has been completed.")) + except Exception as e: + self.log.exception(self.log_msg("Backup request processing has failed.")) + self.update_status('details', str(e)) + self.update_status('status', backups.BackupStatus.FAILED, flush=True) + self.expire() + raise e + finally: + if self.is_cancelled(): + self.on_cancel() + + def get_pg_version_from_dump(self, database_backup_path): + return utils.get_pg_version_from_dump(database_backup_path, self.key_name if self.encryption else None, self.bin_path) + diff --git a/docker/granular/pg_restore.py b/docker/granular/pg_restore.py new file mode 100644 index 0000000..77d3ee2 --- /dev/null +++ b/docker/granular/pg_restore.py @@ -0,0 +1,692 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
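+
+# pg_restore implements PostgreSQLRestoreWorker, a thread that restores the per-database dumps
+# produced by pg_backup: it optionally downloads dumps from S3, recreates databases, restores
+# roles and ownership, and supports force mode (terminating active connections),
+# single-transaction restore and DBaaS-clone restores.
+#
+# Rough usage sketch, mirroring how the granular REST endpoint starts a restore
+# (argument values are illustrative):
+#
+#     worker = PostgreSQLRestoreWorker(databases, force, restore_request, databases_mapping,
+#                                      owners_mapping, restore_roles, single_transaction, is_dbaas_clone)
+#     worker.start()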
+ +import datetime +import json +import shutil + +from psycopg2.extensions import AsIs +import logging +from threading import Thread, Event +import os +import os.path +import subprocess +import time + +import psycopg2 + +import utils +import backups +import configs +import encryption +import storage_s3 + + +class PostgreSQLRestoreWorker(Thread): + + def __init__(self, databases, force, restore_request, databases_mapping, owners_mapping, restore_roles=True, single_transaction=False, dbaas_clone=False): + Thread.__init__(self) + + self.log = logging.getLogger("PostgreSQLRestoreWorker") + self.backup_id = restore_request.get('backupId') + + if not self.backup_id: + raise Exception("Backup ID is not specified.") + + self.databases = databases or [] + self.force = force + self.single_transaction = single_transaction + self.namespace = restore_request.get( + 'namespace') or configs.default_namespace() + self.tracking_id = restore_request.get( + 'trackingId') or backups.generate_restore_id(self.backup_id, + self.namespace) + self.name = self.tracking_id + self.is_standard_storage = True if restore_request.get('externalBackupPath') is None else False + self.restore_roles = restore_roles + self.postgres_version = utils.get_version_of_pgsql_server() + self.location = configs.backups_storage(self.postgres_version) if self.is_standard_storage \ + else backups.build_external_backup_root(restore_request.get('externalBackupPath')) + self.external_backup_root = None if self.is_standard_storage else self.location + self.databases_mapping = databases_mapping + self.owners_mapping = owners_mapping + self.bin_path = configs.get_pgsql_bin_path(self.postgres_version) + self.parallel_jobs = configs.get_parallel_jobs() + self.s3 = storage_s3.AwsS3Vault() if os.environ['STORAGE_TYPE'] == "s3" else None + if self.s3: + self.backup_dir = backups.build_backup_path(self.backup_id, self.namespace, self.external_backup_root) + self.create_backup_dir(self.backup_dir) + if configs.get_encryption(): + self.encryption = True + self.key_name = backups.get_key_name_by_backup_id(self.backup_id, + self.namespace, self.external_backup_root) + else: + self.encryption = False + if databases_mapping: + self.databases = list(databases_mapping.keys()) + self.status = { + 'trackingId': self.tracking_id, + 'namespace': self.namespace, + 'backupId': self.backup_id, + 'status': backups.BackupStatus.PLANNED + } + self._cancel_event = Event() + self.pg_restore_proc = None + self.flush_status(self.external_backup_root) + self.dbaas_clone=dbaas_clone + + def create_backup_dir(self, backup_dir): + if not os.path.exists(backup_dir): + os.makedirs(backup_dir) + + def log_msg(self, msg): + return "[trackingId=%s] %s" % (self.tracking_id, msg) + + def flush_status(self, external_backup_storage=None): + path = backups.build_restore_status_file_path(self.backup_id, self.tracking_id, self.namespace, + external_backup_storage) + utils.write_in_json(path, self.status) + if self.s3: + try: + # upload status file + self.s3.upload_file(path) + except Exception as e: + raise e + + def update_status(self, key, value, database=None, flush=False): + if database: + databases_section = self.status.get('databases') + + if not databases_section: + databases_section = {} + self.status['databases'] = databases_section + + database_details = databases_section.get(database) + if not database_details: + database_details = {} + databases_section[database] = database_details + + database_details[key] = value + databases_section[database] = database_details + 
self.status['databases'] = databases_section + else: + self.status[key] = value + + if flush or self.s3: + self.flush_status(self.external_backup_root) + + @staticmethod + def db_exists(database): + connection_properties = configs.connection_properties() + conn = None + try: + conn = psycopg2.connect(**connection_properties) + with conn.cursor() as cur: + cur.execute("select 1 from pg_database where datname = %s", (database,)) + return bool(cur.fetchone()) + finally: + if conn: + conn.close() + + def kill_pids_and_revoke_rights(self, database, cur, roles): + if not self.db_exists(database): + self.log.info(self.log_msg("skipping revoke, for not existing '%s' db" % database)) + return + for role in roles: + self.log.info(self.log_msg("Revoking grants from: {} on: {}.".format(role, database))) + cur.execute("REVOKE CONNECT ON DATABASE \"%(database)s\" from \"%(role)s\";", {"database": AsIs(database), + "role": AsIs(role)}) + # also, revoking connect rights from public + cur.execute("REVOKE CONNECT ON DATABASE \"%(database)s\" from PUBLIC;", {"database": AsIs(database)}) + # selecting pids to kill after revoking rights + cur.execute("SELECT pg_terminate_backend(pid) " + "FROM pg_stat_activity " + "WHERE pid <> pg_backend_pid() " + " AND datname = %s ", (database,)) + + def get_pg_version_from_dump(self, dump_path): + return utils.get_pg_version_from_dump(dump_path, self.key_name if self.encryption else None, self.bin_path) + + def restore_single_database(self, database): + self.log.info(self.log_msg("Start restoring database '%s'." + % database)) + + self.update_status('status', backups.BackupStatus.IN_PROGRESS, database) + self.update_status('source', backups.build_database_backup_full_path( + self.backup_id, database, self.location, + self.namespace), database, flush=True) + + if int(self.parallel_jobs) > 1: + pg_dump_backup_path = backups.build_backup_path(self.backup_id, self.namespace, self.external_backup_root) + dump_path = os.path.join(pg_dump_backup_path, database) + else: + dump_path = backups.build_database_backup_path(self.backup_id, database, + self.namespace, self.external_backup_root) + roles_backup_path = backups.build_roles_backup_path(self.backup_id, database, + self.namespace, self.external_backup_root) + stderr_path = backups.build_database_backup_path(self.backup_id, database, + self.namespace, self.external_backup_root) + stderr_path = stderr_path + '.stderr' + stdout_path = backups.build_database_backup_path(self.backup_id, database, + self.namespace, self.external_backup_root) + sql_script_path = stdout_path + '.sql' + stdout_path = stdout_path + '.stdout' + if self.s3: + try: + self.s3.download_file(dump_path) + if self.restore_roles: + self.s3.download_file(roles_backup_path) + except Exception as e: + raise e + new_bd_name = self.databases_mapping.get(database) or database + db_owner = self.owners_mapping.get(database, 'postgres') + dump_version = self.get_pg_version_from_dump(dump_path) + bin_path = configs.get_pgsql_bin_path(dump_version) + self.log.info(self.log_msg("Will use binaries: '%s' for restore." + % bin_path)) + os.environ['PGPASSWORD'] = configs.postgres_password() + + connection_properties = configs.connection_properties() + # Restoring the pg_restore command with the -C flag to save the role rights. 
If we restore to the old database and the same version of PG and backup + restore_without_psql = database == new_bd_name and self.postgres_version[0]==dump_version[0] and not configs.is_external_pg() and (not self.single_transaction) + roles = [] + conn = None + try: + conn = psycopg2.connect(**connection_properties) + conn.autocommit = True + with conn.cursor() as cur: + # check if there are any active connections to pgsql + cur.execute("SELECT pid " + "FROM pg_stat_activity " + "WHERE pid <> pg_backend_pid() " + " AND datname = %s LIMIT 1", (new_bd_name,)) + pids = [p[0] for p in cur.fetchall()] + + # get roles to revoke from + pg_user = configs.postgresql_user() + cur.execute(""" + SELECT r2.rolname grantee + FROM + (SELECT datname AS objname, + datallowconn, (aclexplode(datacl)).grantor AS grantorI, + (aclexplode(datacl)).grantee AS granteeI, + (aclexplode(datacl)).privilege_type + FROM pg_database) AS db + JOIN pg_roles r1 ON db.grantorI = r1.oid + JOIN pg_roles r2 ON db.granteeI = r2.oid + WHERE db.objname = %s + AND db.privilege_type = 'CONNECT' AND r2.rolname not in ('postgresadmin', %s); + """, (new_bd_name, pg_user,)) + + roles = [p[0] for p in cur.fetchall()] + + if pids and not self.force: + with open(stderr_path, 'a+') as f: + raise backups.RestoreFailedException("Not able to restore database {} with running connection". + format(new_bd_name), '\n'.join(f.readlines())) + + if pids and self.force: + # revoke grants for connection, to prevent new connection + self.kill_pids_and_revoke_rights(new_bd_name, cur, roles) + if (not self.single_transaction): + self.log.debug(self.log_msg("DROP DROP DATABASE IF EXISTS".format(new_bd_name))) + cur.execute('DROP DATABASE IF EXISTS \"%(database)s\"', {"database": AsIs(new_bd_name)}) + + if (not restore_without_psql) and (not self.single_transaction): + cur.execute('CREATE DATABASE \"%(database)s\"', {"database": AsIs(new_bd_name)}) + self.drop_lookup_func_for_db(new_bd_name) + + if self.restore_roles and os.path.isfile(roles_backup_path): + self.log.info(self.log_msg("Will try to restore roles")) + command = "psql --dbname=postgres --username {} --host {}" \ + " --port {} --echo-all --file {}" \ + .format(configs.postgresql_user(), configs.postgresql_host(), + configs.postgresql_port(), roles_backup_path) + if self.encryption: + command = \ + "openssl enc -aes-256-cbc -nosalt -d -pass " \ + "pass:'%s' < '%s' | psql" \ + " --username '%s' --dbname=postgres --host '%s' " \ + "--port '%s' --echo-all" % ( + encryption.KeyManagement.get_object().get_password_by_name( + self.key_name), + roles_backup_path, + configs.postgresql_user(), + configs.postgresql_host(), + configs.postgresql_port()) + + with open(stderr_path, 'a') as stderr: + with open(stdout_path, 'a') as stdout: + p = subprocess.Popen(command, shell=True, + stdout=stdout, stderr=stderr) + self.pg_restore_proc = p + exit_code = p.wait() + + if exit_code != 0: + with open(stderr_path, 'r') as f: + raise backups.RestoreFailedException(database, '\n'.join(f.readlines())) + else: + self.log.info(self.log_msg("Roles has been successfully restored")) + self.pg_restore_proc = None + + + if self.single_transaction: + self.log.info(self.log_msg("Attempt to restore the {} database to a single transaction".format(new_bd_name))) + try: + os.remove(sql_script_path) + except OSError: + pass + con_properties2 = configs.connection_properties(database=new_bd_name) + connDb = None + try: + connDb = psycopg2.connect(**con_properties2) + connDb.autocommit = True + with connDb.cursor() as curDb: + with 
open(sql_script_path, 'w') as file: + curDb.execute("select schemaname from pg_catalog.pg_tables where schemaname !~ '^pg_' AND schemaname <> 'information_schema' AND schemaname <>'public';") + schemas = [r[0] for r in curDb.fetchall()] + for schema in schemas: + self.log.debug(self.log_msg("Adding to the list for deletion: SCHEMA IF EXISTS {} CASCADE\n".format(schema))) + file.write(("DROP SCHEMA IF EXISTS {} CASCADE;\n".format(schema))) + curDb.execute("SELECT tablename FROM pg_catalog.pg_tables where schemaname !~ '^pg_' AND schemaname <> 'information_schema';") + tablenames = [r[0] for r in curDb.fetchall()] + for table in tablenames: + self.log.debug(self.log_msg("Adding to the list for deletion: TABLE IF EXISTS {} CASCADE".format(table))) + file.write(("DROP TABLE IF EXISTS {} CASCADE;\n".format(table))) + except psycopg2.OperationalError as err: + self.log.error(self.log_msg(err)) + if ('database "{}" does not exist'.format(new_bd_name)) in str(err): + self.log.info(self.log_msg("Try create database")) + cur.execute('CREATE DATABASE \"%(database)s\"', {"database": AsIs(new_bd_name)}) + self.drop_lookup_func_for_db(new_bd_name) + except psycopg2.Error as err: + self.log.error(self.log_msg(err)) + with open(stderr_path, 'r') as f: + raise backups.RestoreFailedException(database, '\n'.join(f.readlines())) + except Exception as err: + self.log.error(self.log_msg(err)) + with open(stderr_path, 'r') as f: + raise backups.RestoreFailedException(database, '\n'.join(f.readlines())) + finally: + if connDb: + connDb.close() + + pg_restore_options = "-f - " + + if self.dbaas_clone: + pg_restore_options = pg_restore_options + "--no-owner --no-acl" + + if int(self.parallel_jobs) > 1: + commandWriteToFile = "{}/pg_restore {} {} -j {} >> {}".format(bin_path, dump_path, pg_restore_options, self.parallel_jobs, sql_script_path) + else: + commandWriteToFile = "{}/pg_restore {} {} >> {}".format(bin_path, dump_path, pg_restore_options, sql_script_path) + commandRest = ' (echo "BEGIN;"; cat {}; echo "COMMIT;") | psql -v --single-transaction ON_ERROR_STOP=1 --echo-errors --dbname={} ' \ + '--user {} --host {} --port {}' .format(sql_script_path,new_bd_name, + configs.postgresql_user(),configs.postgresql_host(), + configs.postgresql_port()) + self.log.debug(self.log_msg("command WriteToFile: {} ".format(commandWriteToFile))) + self.log.debug(self.log_msg("command Restore: {} ".format( commandRest))) + with open(stderr_path, 'a') as stderr: + with open(stdout_path, 'a+') as stdout: + p = subprocess.Popen(commandWriteToFile, shell=True, + stdout=stdout, stderr=stderr) + self.pg_restore_proc = p + exit_code = p.wait() + self.log.debug(self.log_msg("exit_code: {}".format(exit_code))) + if exit_code != 0: + raise backups.RestoreFailedException(database, '\n'.join( + stderr.readlines())) + return() + else: + self.log.debug(self.log_msg("Restore file {} is recorded. 
Starting the restore".format(sql_script_path))) + p2 = subprocess.Popen(commandRest, shell=True, + stdout=stdout, stderr=stderr) + self.pg_restore_proc = p2 + exit_code2 = p2.wait() + if exit_code2 != 0: + raise backups.RestoreFailedException(database, '\n'.join( + stderr.readlines())) + else: + # when resetting to a single transaction, ON_ERROR_STOP=1 is sometimes ignored and + # exit_code==0, so we read the last 9 characters from stdout if there is a ROLLBACK, + # then we generate an exception + stdout.seek(stdout.tell()-9) + line = stdout.read(9) + self.log.debug(self.log_msg("the last 9 characters from stdout:{}".format(line))) + if "ROLLBACK" in line: + self.log.error("ROLLBACK") + raise backups.RestoreFailedException(database, "ROLLBACK") + self.log.info(self.log_msg("Successful restored")) + self.pg_restore_proc = None + try: + os.remove(sql_script_path) + except OSError: + pass + return() + # setting owner back + if self.restore_roles and not restore_without_psql: + cur.execute('ALTER DATABASE \"%(database)s\" OWNER TO \"%(owner)s\"', + {"database": AsIs(new_bd_name), "owner": AsIs(db_owner)}) + + if pids and self.force: + self.kill_pids_and_revoke_rights(new_bd_name, cur, roles) + self.log.info(self.log_msg("Target database {} created".format(new_bd_name))) + finally: + if conn: + conn.close() + # restoring database entities + # add restore options "-f -" for all pg versions except greenplum + pg_restore_options = "-f - " + if dump_version[0] == 9 and dump_version[1] == 4: + pg_restore_options = "" + if configs.is_external_pg() or self.dbaas_clone: + pg_restore_options = pg_restore_options + "--no-owner --no-acl" + + if int(self.parallel_jobs) > 1: + pg_restore_options += "-j {} ".format(self.parallel_jobs) + + command = "{}/pg_restore {} {}| psql -v ON_ERROR_STOP=1 --dbname={} " \ + "--user {} --host {} --port {}" \ + .format(bin_path, dump_path, + pg_restore_options, + new_bd_name, + configs.postgresql_user(), + configs.postgresql_host(), + configs.postgresql_port() + ) + + self.log.info(self.log_msg("DB version: {} dump version: {}, database: {}, new DB name: {} ". + format(self.postgres_version[0], dump_version[0], database,new_bd_name))) + if restore_without_psql: + command = "{}/pg_restore -C {} " \ + "--user {} --host {} --port {} --dbname=postgres --no-password" \ + .format(bin_path, dump_path,configs.postgresql_user(), + configs.postgresql_host(), configs.postgresql_port()) + + # Add support for the -j flag + if int(self.parallel_jobs) > 1: + command += " -j {} ".format(self.parallel_jobs) + + if self.encryption: + command = \ + "openssl enc -aes-256-cbc -d -nosalt -pass pass:{} < {}" \ + "| {}/pg_restore {}| psql -v ON_ERROR_STOP=1 --dbname={} " \ + "--user {} --host {} " \ + "--port {}".format( + encryption.KeyManagement.get_object().get_password(), + dump_path, + bin_path, + pg_restore_options, + new_bd_name, configs.postgresql_user(), + configs.postgresql_host(), + configs.postgresql_port()) + + # Add support for the -j flag + if int(self.parallel_jobs) > 1: + command += " -j {} ".format(self.parallel_jobs) + + self.log.debug(self.log_msg("Restore Command: {}".format(command))) + if database != new_bd_name: + self.log.info(self.log_msg("New database name: {} specified for database: {}". 
+ format(new_bd_name, database))) + + with open(stderr_path, 'a') as stderr: + with open(stdout_path, 'a') as stdout: + pg_restore_proc = subprocess.Popen(command,shell=True, stdout=stdout, stderr=stderr) + self.pg_restore_proc = pg_restore_proc + exit_code = pg_restore_proc.wait() + self.pg_restore_proc = None + + if pids and self.force: + conn = None + try: + conn = psycopg2.connect(**connection_properties) + conn.autocommit = True + with conn.cursor() as cur: + for role in roles: + cur.execute("GRANT CONNECT ON DATABASE \"%(database)s\" to \"%(role)s\";", + {"database": AsIs(new_bd_name), "role": AsIs(role)}) + self.log.info(self.log_msg( + "Rights for connection granted for db {} to role {}".format(new_bd_name, role))) + finally: + if conn: + conn.close() + + if self.restore_roles: + self.grant_connect_to_db_for_role(roles_backup_path, new_bd_name) + + if self.restore_roles and configs.is_external_pg(): + self.grant_connect_for_external_pg(roles_backup_path, new_bd_name) + + if exit_code != 0: + with open(stderr_path, 'r') as f: + raise backups.RestoreFailedException(database, '\n'.join(f.readlines())) + + if database != new_bd_name: + self.log.info(self.log_msg("Database '%s' has been successfully restored with new name '%s'." % + (database, new_bd_name))) + else: + self.log.info(self.log_msg("Database '%s' has been successfully restored." % database)) + + def run(self): + try: + self.process_restore_request() + self.update_status('status', backups.BackupStatus.SUCCESSFUL, flush=True) + self.log.info(self.log_msg("Backup has been successfully restored.")) + if self.s3: + shutil.rmtree(self.backup_dir) + except Exception as e: + self.log.exception(self.log_msg("Restore request processing has failed.")) + self.update_status('details', str(e)) + self.update_status('status', backups.BackupStatus.FAILED, flush=True) + if self.s3: + shutil.rmtree(self.backup_dir) + finally: + if self.is_cancelled(): + self.on_cancel() + + def process_restore_request(self): + self.log.info(self.log_msg("Start restore procedure.")) + start_timestamp = int(time.time()) + self.update_status('status', backups.BackupStatus.IN_PROGRESS) + self.update_status('timestamp', start_timestamp) + self.update_status('started', str(datetime.datetime.fromtimestamp(start_timestamp).isoformat()), flush=True) + + backup_status_file = backups.build_backup_status_file_path(self.backup_id, + self.namespace, self.external_backup_root) + if self.s3: + try: + status = self.s3.read_object(backup_status_file) + except Exception as e: + raise e + backup_details = json.loads(status) + + else: + if not os.path.isfile(backup_status_file): + raise backups.BackupNotFoundException(self.backup_id, self.namespace) + + backup_details = utils.get_json_by_path(backup_status_file) + + backup_status = backup_details['status'] + + if backup_status != backups.BackupStatus.SUCCESSFUL: + raise backups.BackupBadStatusException(self.backup_id, backup_status) + + if not self.databases: + self.log.info(self.log_msg("Databases not specified -> restore all databases from backup.")) + self.databases = list(backup_details.get('databases', {}).keys()) + + self.log.info(self.log_msg("Databases to restore: %s" % (self.databases,))) + + # Check physical dump files existence. 
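+        # With S3 storage the per-database dump object is checked in the bucket; with local storage
+        # a directory-format dump (parallel_jobs > 1) is checked as a directory, otherwise the single
+        # dump file is checked.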
+ for database in self.databases: + if self.s3: + is_backup_exist = self.s3.is_file_exists(backups.build_database_backup_path(self.backup_id, database, + self.namespace, self.external_backup_root)) + else: + if int(self.parallel_jobs) > 1: + pg_dump_backup_path = backups.build_backup_path(self.backup_id, self.namespace, self.external_backup_root) + path_for_parallel_flag_backup = os.path.join(pg_dump_backup_path, database) + is_backup_exist = os.path.exists(path_for_parallel_flag_backup) + else: + is_backup_exist = backups.database_backup_exists(self.backup_id, database, + self.namespace, self.external_backup_root) + if not is_backup_exist: + raise backups.BackupNotFoundException(self.backup_id, database, self.namespace) + + for database in self.databases: + try: + self.restore_single_database(database) + self.update_status('status', backups.BackupStatus.SUCCESSFUL, database) + self.update_status('completed', str(datetime.datetime.fromtimestamp(int(time.time())).isoformat()), + flush=True) + except Exception as e: + self.log.exception(self.log_msg("Restore has failed: %s " % str(e))) + self.update_status('details', str(e), database) + self.update_status('status', backups.BackupStatus.FAILED, database, flush=True) + raise e + finally: + try: + if int(self.parallel_jobs) > 1: + pg_dump_backup_path = backups.build_backup_path(self.backup_id, self.namespace, self.external_backup_root) + backup_path = os.path.join(pg_dump_backup_path, database) + else: + backup_path = backups.build_database_backup_path(self.backup_id, database, + self.namespace, self.external_backup_root) + os.remove(backup_path + '.stderr') + except OSError as ex: + self.log.exception(self.log_msg("Unable to remove stderr log due to: %s " % str(ex))) + + def grant_connect_to_db_for_role(self, roles_backup_path, db_name): + connection_properties = configs.connection_properties() + conn = None + try: + conn = psycopg2.connect(**connection_properties) + conn.autocommit = True + with conn.cursor() as cur: + with open(roles_backup_path, "r") as role_file: + for line in role_file: + if not line.startswith("CREATE ROLE"): + continue + try: + role_name = line.split('CREATE ROLE ')[1].split(';')[0] + cur.execute( + "GRANT CONNECT ON DATABASE \"%(database)s\" TO \"%(role)s\";", + {"database": AsIs(db_name), "role": AsIs(role_name)} + ) + self.log.info(self.log_msg( + "Rights for connection granted for db {} to role {}".format(db_name, role_name))) + except Exception as e: + self.log.error(self.log_msg("error grant connect on database {} to role {}" + .format(db_name, role_name)), e) + finally: + if conn: + conn.close() + + def cancel(self): + self.kill_processes() + self._cancel_event.set() + self.log.info(self.log_msg("Worker stopped")) + + def is_cancelled(self): + return self._cancel_event.is_set() + + def on_cancel(self, database=None): + if database: + self.log.exception("Restore got canceled for database {0}".format(database)) + self.update_status('details', "Restore got canceled for database {0}".format(database), database) + self.update_status('status', backups.BackupStatus.CANCELED, database) + self.update_status('details', "Backup got canceled for database") + self.update_status("status", backups.BackupStatus.CANCELED, flush=True) + + def kill_processes(self): + if self.pg_restore_proc: + self.log.info("kill restore process with pid: {}".format(self.pg_restore_proc.pid)) + self.pg_restore_proc.kill() + + def grant_connect_for_external_pg(self, roles_backup_path, db_name): + with open(roles_backup_path, "r") as role_file: + for 
line in role_file: + if not line.startswith("CREATE ROLE"): + continue + role_name = line.split('CREATE ROLE ')[1].split(';')[0] + self.log.debug(self.log_msg("Try restore TABLE, SEQUENCE, VIEW owners to '%s' without superuser" + " privileges." % role_name)) + con_properties = configs.connection_properties(database=db_name) + conn = None + try: + conn = psycopg2.connect(**con_properties) + conn.autocommit = True + with conn.cursor() as cur: + cur.execute( + "SELECT 'ALTER TABLE '||schemaname||'.'||tablename||' OWNER TO {};' FROM pg_tables " + "WHERE NOT schemaname IN ('pg_catalog', 'information_schema')".format(role_name)) + alter_roles = [r[0] for r in cur.fetchall()] + for alter_role in alter_roles: + self.log.debug(self.log_msg("Try execute command: %s" % alter_role)) + try: + cur.execute(alter_role) + except Exception as e: + self.log.info(self.log_msg("ERROR ALTER TABLE. Command: {} " + "ERROR: {}".format(alter_role, e))) + + cur.execute("SELECT 'ALTER SEQUENCE '||sequence_schema||'.'||sequence_name||' OWNER TO " + "{};' FROM information_schema.sequences WHERE NOT sequence_schema " + "IN ('pg_catalog', 'information_schema')".format(role_name)) + alter_sequences = [r[0] for r in cur.fetchall()] + for alter_sequence in alter_sequences: + self.log.debug(self.log_msg("Try execute command: %s" % alter_sequence)) + try: + cur.execute(alter_sequence) + except Exception as e: + self.log.info(self.log_msg("ERROR ALTER SEQUENCE. Command: {} " + "ERROR: {}".format(alter_sequence, e))) + + cur.execute("SELECT 'ALTER VIEW '||table_schema||'.'||table_name ||' OWNER TO {};' " + "FROM information_schema.views WHERE NOT table_schema " + "IN ('pg_catalog', 'information_schema')".format(role_name)) + alter_views = [r[0] for r in cur.fetchall()] + for alter_view in alter_views: + self.log.debug(self.log_msg("Try execute command: %s" % alter_view)) + try: + cur.execute(alter_view) + except Exception as e: + self.log.info(self.log_msg("ERROR ALTER VIEW. Command: {} " + "ERROR: {}".format(alter_view, e))) + cur.execute("GRANT {} TO {};".format(role_name, configs.postgresql_user())) + try: + cur.execute( + "ALTER ROLE {} WITH LOGIN;".format(role_name)) + except Exception as e: + self.log.info(self.log_msg("ERROR ALTER ROLE {} WITH LOGIN " + "ERROR: {}".format(role_name, e))) + finally: + if conn: + conn.close() + else: + self.log.info(self.log_msg("Username not found in the file %s. " + "ALTER tables OWNER TO was not executed" % roles_backup_path)) + + @staticmethod + def drop_lookup_func_for_db(db_name): + connection_properties = configs.connection_properties(database=db_name) + conn = None + try: + conn = psycopg2.connect(**connection_properties) + conn.autocommit = True + with conn.cursor() as cur: + cur.execute("drop function if exists lookup(name);") + finally: + if conn: + conn.close() diff --git a/docker/granular/storage_s3.py b/docker/granular/storage_s3.py new file mode 100644 index 0000000..f89cde7 --- /dev/null +++ b/docker/granular/storage_s3.py @@ -0,0 +1,140 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import boto3 + +import botocore +import botocore.exceptions +import urllib3 +import os +import logging +import configs +from retrying import retry + +try: + from io import StringIO +except ImportError: + from io import StringIO + +bucket = os.getenv("CONTAINER") +CONTAINER_SEG = "{}_segments".format(bucket) +PG_CLUSTER_NAME = os.getenv("PG_CLUSTER_NAME") + +RETRY_COUNT = 10 +RETRY_WAIT = 1000 + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +class AwsS3Vault: + __log = logging.getLogger("AwsS3Granular") + + def __init__(self, cluster_name=None, cache_enabled=False, + aws_s3_bucket_listing=None): + + self.bucket = bucket + self.console = None + self.cluster_name = cluster_name + self.cache_enabled = cache_enabled + self.cached_state = {} + self.aws_s3_bucket_listing = aws_s3_bucket_listing + self.aws_prefix = os.getenv("AWS_S3_PREFIX", "") + + def get_s3_client(self): + return boto3.client("s3", + region_name=os.getenv("AWS_DEFAULT_REGION") if os.getenv("AWS_DEFAULT_REGION") else None, + endpoint_url=os.getenv("AWS_S3_ENDPOINT_URL"), + aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), + aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), + verify=(False if os.getenv("AWS_S3_UNTRUSTED_CERT", "false").lower() == "true" else None)) + + @retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=RETRY_WAIT) + def upload_file(self, file_path): + return self.get_s3_client().upload_file(file_path, self.bucket, self.aws_prefix + file_path) + + @retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=RETRY_WAIT) + def delete_file(self, filename): + return self.get_s3_client().delete_object(Bucket=self.bucket, Key=self.aws_prefix + filename) + + @retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=RETRY_WAIT) + def delete_objects(self, filename): + objects_to_delete = self.get_s3_client().list_objects(Bucket=self.bucket, Prefix=self.aws_prefix + filename) + for obj in objects_to_delete.get('Contents', []): + self.get_s3_client().delete_object(Bucket=self.bucket, Key=obj['Key']) + + @retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=RETRY_WAIT) + def read_object(self, file_path): + self.__log.info("Reading object %s" % self.aws_prefix + file_path) + obj = self.get_s3_client().get_object(Bucket=self.bucket, Key=self.aws_prefix + file_path) + # self.__log.info(obj['Body'].read().decode('utf8')) + return obj['Body'].read().decode('utf8') + + @retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=RETRY_WAIT) + def get_file_size(self, file_path): + obj = self.get_s3_client().list_objects_v2(Bucket=self.bucket, Prefix=self.aws_prefix + file_path) + if 'Contents' in obj: + for field in obj["Contents"]: + return field["Size"] + else: + self.__log.info("Requested {} file not found".format(file_path)) + + @retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=RETRY_WAIT) + def download_file(self, filename): + logging.info("Downloading file {}" .format(self.aws_prefix + filename)) + try: + self.get_s3_client().download_file(self.bucket, self.aws_prefix + filename, filename) + except Exception as e: + raise e + return + + @retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=RETRY_WAIT) + def is_file_exists(self, file): + exists = True + try: + self.get_s3_client().head_object(Bucket=self.bucket, Key=self.aws_prefix + file) + except botocore.exceptions.ClientError as e: + exists = False + return exists + + def is_s3_storage_path_exist(self, storage): + bucket = 
self.get_s3_client().list_objects_v2(Bucket=self.bucket, Prefix=self.aws_prefix + storage) + if 'Contents' in bucket: + s3_storage_path = bucket['Contents'][0]["Key"] + return True if storage in s3_storage_path else False + return False + + def get_granular_namespaces(self, storage): + bucket = self.get_s3_client().list_objects_v2(Bucket=self.bucket, Prefix=self.aws_prefix + storage) + namespaces = [] + if 'Contents' in bucket: + for obj in bucket["Contents"]: + vault = obj["Key"].split(storage, 1)[1] + namespace = vault.split("/", 2)[1] + if namespace not in namespaces: + namespaces.append(namespace) + else: + pass + return namespaces + + def get_backup_ids(self, storage, namespace): + namespaced_path = self.aws_prefix + storage + "/" + namespace + bucket = self.get_s3_client().list_objects_v2(Bucket=self.bucket, Prefix=namespaced_path) + backup_ids = [] + if 'Contents' in bucket: + for obj in bucket["Contents"]: + vault = obj["Key"].split(storage, 2)[1] + backup_id = vault.split("/", 3)[2] + if backup_id not in backup_ids: + backup_ids.append(backup_id) + else: + pass + return backup_ids diff --git a/docker/granular/utils.py b/docker/granular/utils.py new file mode 100644 index 0000000..d285fe2 --- /dev/null +++ b/docker/granular/utils.py @@ -0,0 +1,325 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json + +import tarfile +import psycopg2 +from retrying import retry +import fcntl +import logging +import configs +import os +import re +import backups +import encryption +import subprocess +import fnmatch +import kube_utils + +import http.client + +from googleapiclient.discovery import build +from googleapiclient import errors + +log = logging.getLogger("utils") + +""" + sometimes write operations can take more than 5 seconds (storage problems) + hence, file will be locked for more than 5 seconds, for this cases + `stop_max_delay` should be increased, so `FILE_OPERATION_DELAY` + env can be used +""" + + +@retry(stop_max_delay=int(os.getenv("FILE_OPERATION_DELAY", '5000'))) +def get_json_by_path(path): + with open(path) as fd: + try: + fcntl.lockf(fd, fcntl.LOCK_SH | fcntl.LOCK_NB) + return json.load(fd) + except IOError: # another process accessing + log.info("trying to access locked file while reading") + raise + finally: + fcntl.lockf(fd, fcntl.LOCK_UN) + + +@retry(stop_max_delay=int(os.getenv("FILE_OPERATION_DELAY", '5000'))) +def write_in_json(path, data): + with open(path, 'w+') as fd: + try: + fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + json.dump(data, fd) + return data + except IOError: # another process accessing + log.info("trying to access locked file while writing") + raise + finally: + fcntl.lockf(fd, fcntl.LOCK_UN) + + +def execute_query(conn_properties, query): + conn = None + try: + conn = psycopg2.connect(**conn_properties) + with conn.cursor() as cur: + cur.execute(query) + return cur.fetchone()[0] + finally: + if conn: + conn.close() + +def get_version_of_pgsql_server(): + conn_properties = configs.connection_properties() + result = execute_query(conn_properties, 'SHOW SERVER_VERSION;') + return list(map(int, result.split(' ')[0].split('.'))) + + +# need to rewrite this one with usage of execute_query() +def get_owner_of_db(database): + conn_properties = configs.connection_properties() + conn = None + try: + conn = psycopg2.connect(**conn_properties) + with conn.cursor() as cur: + cur.execute(""" + SELECT pg_catalog.pg_get_userbyid(d.datdba) as "Owner" + FROM pg_catalog.pg_database d + WHERE d.datname = %s; + """, (database,)) + return cur.fetchone()[0] + finally: + if conn: + conn.close() + +def get_database_list(databases): + databases = tuple(databases) + conn_properties = configs.connection_properties() + conn = None + try: + conn = psycopg2.connect(**conn_properties) + with conn.cursor() as cur: + cur.execute("SELECT datname from pg_database where datname in %s;", (databases,)) + return [p[0] for p in cur.fetchall()] + finally: + if conn: + conn.close() + +def get_pg_version_from_dump(backup_path, key_name, bin_path): + + parallel_jobs = configs.get_parallel_jobs() + if key_name: + command = "openssl enc -aes-256-cbc -nosalt -d -pass " \ + "pass:'{}' < '{}' | {} -l ".format( + encryption.KeyManagement.get_object().get_password_by_name( + key_name), backup_path, bin_path) + else: + if int(parallel_jobs) > 1: + command = '{}/pg_restore -j {} -l {}'.format(bin_path, parallel_jobs, backup_path) + else: + command = '{}/pg_restore -l {}'.format(bin_path, backup_path) + + subprocess.check_output("ls -la {}".format(backup_path), shell=True) + + output = \ + subprocess.Popen(command, shell=True, + stdout=subprocess.PIPE).communicate()[0] + + # pg_restore -l output format example + # ; Archive created at 2019-02-11 09:38:35 UTC + # ; dbname: test1 + # ; Dumped from database version: 10.3 + # ; Dumped by pg_dump version: 10.6 + # ; Selected TOC Entries: + # ; + 
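+    # For the sample banner above, the loop below picks the
+    # "Dumped from database version" entry and returns it as a list of
+    # integers, e.g. "10.3" -> [10, 3].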
for item in output.decode().split(";"): + if "Dumped from database version" in item: + version_as_string = item.split(": ")[1] + return list(map(int, version_as_string.split(' ')[0].split('.'))) + return None + + + +class Rule: + magnifiers = { + "min": 60, + "h": 60 * 60, + "d": 60 * 60 * 24, + "m": 60 * 60 * 24 * 30, + "y": 60 * 60 * 24 * 30 * 12, + } + + def __init__(self, rule): + (startStr, intervalStr) = rule.strip().split("/") + self.start = self.__parseTimeSpec(startStr) + self.interval = "delete" if ( + intervalStr == "delete") else self.__parseTimeSpec(intervalStr) + + def __parseTimeSpec(self, spec): + import re + if (spec == "0"): + return 0 + + r = re.match("^(\\d+)(%s)$" % "|".join(list(self.magnifiers.keys())), spec) + if (r is None): + raise Exception( + "Incorrect eviction start/interval specification: %s" % spec) + + digit = int(r.groups()[0]) + magnifier = self.magnifiers[r.groups()[1]] + + return digit * magnifier + + def __str__(self): + return "%d/%d" % (self.start, self.interval) + + +def parse(rules): + rules = [Rule(r) for r in rules.split(",")] + return rules + + +def is_auth_needed(): + return os.getenv("AUTH", "false").lower() == "true" + + +def get_backup_tar_file_path(backup_id, namespace): + path = backups.build_backup_path(backup_id, namespace) + if os.path.exists(path): + tar_path = os.path.join(path, backup_id) + items = [x for x in os.listdir(path) if x.endswith('.dump') or x.endswith('.sql')] + with tarfile.open(tar_path + ".tar.gz", "w:gz") as tar: + for item in items: + tar.add(os.path.join(path, item), arcname=item) + full_path_tar = tar_path + ".tar.gz" + return full_path_tar + else: + return None + + +class GkeBackupApiCaller: + def __init__(self): + self.log = logging.getLogger('BackupRequestEndpoint') + self.service = build('sqladmin', 'v1beta4', cache_discovery=False) + self.project = os.getenv("GKE_PROJECT") + self.instance = os.getenv("GKE_INSTANCE") + + def perform_backup(self): + req = self.service.backupRuns().insert(project=self.project, instance=self.instance) + resp = req.execute() + self.log.info(json.dumps(resp, indent=2)) + if "error" in resp: + return resp + else: + return self.get_backup_id(resp["insertTime"]) + + def get_backup_id(self, insert_time): + req = self.service.backupRuns().list(project=self.project, instance=self.instance) + resp = req.execute() + self.log.info(json.dumps(resp, indent=2)) + for item in resp["items"]: + if item["startTime"] == insert_time: + self.log.info("Backup requested id: {}".format(item["id"])) + return item["id"] + else: + return "Can't perform backup" + + def delete_backup(self, backup_id): + req = self.service.backupRuns().delete(project=self.project, instance=self.instance, id=backup_id) + try: + resp = req.execute() + self.log.info(json.dumps(resp, indent=2)) + return { + "backupId": backup_id, + "message": resp["status"] + }, http.client.OK + + except errors.HttpError as e: + if e.resp["status"] == "404": + return "Backup with id {} not found".format(backup_id), http.client.NOT_FOUND + else: + return http.client.BAD_REQUEST + + def backup_status(self, backup_id): + req = self.service.backupRuns().get(project=self.project, instance=self.instance, id=backup_id) + try: + resp = req.execute() + self.log.info(json.dumps(resp, indent=2)) + return resp, http.client.OK + except errors.HttpError as e: + if e.resp["status"] == "404": + return "Backup with id {} not found".format(backup_id), http.client.NOT_FOUND + else: + return http.client.BAD_REQUEST + + def restore(self, restore_request): + body = { + 
"restoreBackupContext": { + "backupRunId": restore_request["backupId"] + } + } + req = self.service.instances().restoreBackup(project=self.project, instance=self.instance, body=body) + try: + resp = req.execute() + self.log.info(json.dumps(resp, indent=2)) + return { + 'trackingId': resp["name"] + }, http.client.ACCEPTED + except errors.HttpError as e: + if e.resp["status"] == "404": + return "Backup with id {} not found".format(restore_request["backupId"]), http.client.NOT_FOUND + else: + return "Bad request", http.client.BAD_REQUEST + + def restore_status(self, restore_id): + req = self.service.operations().get(project=self.project, operation=restore_id) + try: + resp = req.execute() + self.log.info(json.dumps(resp, indent=2)) + return resp, http.client.ACCEPTED + except errors.HttpError as e: + if e.resp["status"] == "404": + return "Restore with id {} not found".format(restore_id), http.client.NOT_FOUND + else: + return http.client.BAD_REQUEST + + def backup_list(self): + req = self.service.backupRuns().list(project=self.project, instance=self.instance) + try: + resp = req.execute() + self.log.info(json.dumps(resp, indent=2)) + result = {} + result["default"] = {} + for item in resp["items"]: + result["default"][item["id"]] = {"status": item["status"]} + return result, http.client.OK + except: + return http.client.BAD_REQUEST + + +def get_postgres_version_by_path(storage_path): + #storage_path = '/usr' + pg_dirs = fnmatch.filter(os.listdir(storage_path), 'pg*') + log.info(f"Possible directories for backup store using path method {pg_dirs}") + versions = sorted([int(re.search(r'\d+', x).group()) for x in pg_dirs], reverse=True) + version_postfix = "pg" + str(versions[0]) + log.info(f"PostgreSQL server version from path method is equal to {version_postfix}, " + f"so will save all backups in {version_postfix} dir") + version_as_list = [int(version_postfix.replace("pg", "")), 0] + return version_as_list + +def is_mirror_env(): + cm = kube_utils.get_configmap('mirror-config') + return cm is not None \ No newline at end of file diff --git a/docker/health.sh b/docker/health.sh new file mode 100755 index 0000000..07872fc --- /dev/null +++ b/docker/health.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +LOG_FILE=/tmp/health.log + +. /opt/backup/utils.sh + +function test_postgresql() { + local output=$(psql -h $POSTGRES_HOST -p $POSTGRES_PORT -U replicator -l 2>&1) + process_exit_code $? "${ouput}" +} + +if [ "$STORAGE_TYPE" = "swift" ]; then + test_postgresql >> ${LOG_FILE} + output=$(/opt/backup/scli ls $CONTAINER 2>&1) + process_exit_code $? 
"${ouput}" >> $LOG_FILE +elif [[ "${STORAGE_TYPE}" == "s3" ]]; then + test_postgresql >> ${LOG_FILE} + + unset flag_ssl_no_verify + if [[ "${AWS_S3_UNTRUSTED_CERT}" == "True" || "${AWS_S3_UNTRUSTED_CERT}" == "true" ]]; then + flag_ssl_no_verify="--no-verify-ssl" + fi + + output=$(aws ${flag_ssl_no_verify} --endpoint-url "${AWS_S3_ENDPOINT_URL}" s3 ls "s3://${CONTAINER}") + process_exit_code $? "${ouput}" >> $LOG_FILE +elif [[ "$STORAGE_TYPE" = "hostpath" ]] || [[ "$STORAGE_TYPE" = "pv" ]] || [[ "$STORAGE_TYPE" = "pv_label" ]] || [[ "$STORAGE_TYPE" = "provisioned" ]] || [[ "$STORAGE_TYPE" = "provisioned-default" ]]; then + test_postgresql >> ${LOG_FILE} +else + log "wrong storage type $STORAGE_TYPE" >> $LOG_FILE + exit 1 +fi diff --git a/docker/pip.conf b/docker/pip.conf new file mode 100755 index 0000000..d547ee0 --- /dev/null +++ b/docker/pip.conf @@ -0,0 +1,4 @@ +[global] +index-url = https://pypi.org/simple +break-system-packages = true +trusted-host = pypi.org \ No newline at end of file diff --git a/docker/postgres/aws-s3-backup.sh b/docker/postgres/aws-s3-backup.sh new file mode 100755 index 0000000..447f78a --- /dev/null +++ b/docker/postgres/aws-s3-backup.sh @@ -0,0 +1,197 @@ +#!/usr/bin/env bash +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +source utils.sh + +################################ +# +# $1 - bucket name +# $2 - backup id +# +################################ + +readonly BUCKET="$1" # Go to AWS S3 terminology +readonly BACKUP_ID="$2" +BACKUP_NAME="pg_${PG_CLUSTER_NAME}_backup_${BACKUP_ID}.tar.gz" + + +function log() { + log_module "$1" "aws-s3-backup" "$2" +} + +function log_info() { + log "INFO" "$1" +} + +function log_error() { + log "ERROR" "$1" + exit 1 +} + +function aws_process_exit_code() { + local exit_code=$1 + local message="$2" + if [ ${exit_code} -ne 0 ];then + log_error "${message}" + exit 1 + fi +} + +function smoke_aws_s3() { + log_info "validate provided configuration and test AWS S3 storage availability" + + if [[ -z "${AWS_S3_ENDPOINT_URL}" ]]; then + log_error "endpoint URL for AWS S3 must be specified in AWS_S3_ENDPOINT environment variable." + fi + + # http://docs.aws.amazon.com/cli/latest/topic/config-vars.html + if [[ -z "${AWS_ACCESS_KEY_ID}" ]]; then + log_error "access key for AWS S3 must be specified in AWS_ACCESS_KEY_ID environment variable." + fi + + if [[ -z "${AWS_SECRET_ACCESS_KEY}" ]]; then + log_error "secret access key for AWS S3 must be specified in AWS_SECRET_ACCESS_KEY environment variable." + fi + + if [[ -z "${BUCKET}" ]]; then + log_error "bucket name must be specified in CONTAINER environment variable." + fi + + local output=$(aws --endpoint-url "${AWS_S3_ENDPOINT_URL}" s3 ls "s3://${BUCKET}" 2>&1) + process_exit_code $? 
"${output}" +} + +function stream_backup_to_aws_s3() { + if [[ -z "${BACKUP_ID}" ]]; then + log_error "Backup id must be specified explicitly" + fi + + + unset flag_ssl_no_verify + + if [[ "${AWS_S3_UNTRUSTED_CERT}" == "True" || "${AWS_S3_UNTRUSTED_CERT}" == "true" ]]; then + flag_ssl_no_verify="--no-verify-ssl" + fi + + local s3_object_name="${BACKUP_ID}/${BACKUP_NAME}" + + # S3 required to specify "expected size" of backup to divide large stream + # into smaller pieces for multi-part upload correctly. + # But, unfortunately, we don't know exact backup size after gzip, + # so we tell S3 that our backup is of size of our database (but a cake is a lie). + local expected_backup_size=$(PGPASSWORD=$POSTGRES_PASSWORD psql --no-align \ + --tuples-only \ + -h "${POSTGRES_HOST}" \ + -p "${POSTGRES_PORT}" \ + -U "${POSTGRES_USER}" \ + -d postgres \ + -c "select sum(pg_database_size(db.name)) from (select datname as name from pg_database) db") + + log_info "expected backup size is approximately ${expected_backup_size} bytes." + + local validation_pipe="pg-backup-${BACKUP_ID}.pipe" + local pg_basebackup_stderr_file="pg-backup-${BACKUP_ID}.error.log" + local validation_stderr_file="pg-backup-validation-${BACKUP_ID}.error.log" + local storage_client_stdout_file="storage-client-${BACKUP_ID}.log" + + register_delete_on_exit "${validation_pipe}" "${storage_client_stdout_file}" "${validation_stderr_file}" "${pg_basebackup_stderr_file}" + + # Validate TAR stream on the fly. + # This will not validate backup data itself, but will check archive's integrity. + mkfifo "${validation_pipe}" + tar -tz <"${validation_pipe}" > /dev/null 2> "${validation_stderr_file}" & + + log_info "start backup streaming to AWS S3" + $PG_BASEBACKUP -h "${POSTGRES_HOST}" -p "${POSTGRES_PORT}" -U "${REPLICATION_USER}" -D - -X fetch --format=tar --gzip 2> "${pg_basebackup_stderr_file}" \ + | tee "${validation_pipe}" \ + | aws --endpoint-url "${AWS_S3_ENDPOINT_URL}" ${flag_ssl_no_verify} \ + s3 cp - "s3://${BUCKET}/${AWS_S3_PREFIX}/${s3_object_name}" \ + --expected-size "${expected_backup_size}" 2> "${storage_client_stdout_file}" + + # PIPESTATUS can be overridden, so need to keep it. + local exit_codes=(${PIPESTATUS[@]}) + local pg_basebackup_exit_code=${exit_codes[0]} + local storage_client_exit_code=${exit_codes[2]} + + # Wait for TAR validation to complete. + wait $! + local validation_exit_code=$? + + local storage_client_stdout="$(cat ${storage_client_stdout_file})" + local validation_stderr="$(cat ${validation_stderr_file})" + local pg_basebackup_log="$(cat ${pg_basebackup_stderr_file})" + + aws_process_exit_code "${pg_basebackup_exit_code}" "pg_basebackup has failed. Details: ${pg_basebackup_log}" + aws_process_exit_code "${validation_exit_code}" "Backup archive integrity validation not passed. This backup will be marked as failed. Details: ${validation_stderr}" + aws_process_exit_code "${storage_client_exit_code}" "Backup uploading to AWS S3 has failed. 
Details: ${storage_client_stdout}" + + log_info "completed" +} + +function main() { + + version="$(PGPASSWORD=$POSTGRES_PASSWORD psql -h "${POSTGRES_HOST}" -p "${POSTGRES_PORT}" -U "${POSTGRES_USER}" -d postgres -c "SHOW SERVER_VERSION;" -tA | egrep -o '[0-9]{1,}\.[0-9]{1,}')" + REPLICATION_USER="replicator" + + log "version of pgsql server is: ${version}" + + if python -c "import sys; sys.exit(0 if float("${version}") >= 16.0 else 1)"; then + log "Using pgsql 16 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/16/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_ID}).tar.gz" + elif python -c "import sys; sys.exit(0 if 15.0 <= float("${version}") < 16.0 else 1)"; then + log "Using pgsql 15 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/15/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_ID}).tar.gz" + elif python -c "import sys; sys.exit(0 if 14.0 <= float("${version}") < 15.0 else 1)"; then + log "Using pgsql 14 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/14/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_ID}).tar.gz" + elif python -c "import sys; sys.exit(0 if 13.0 <= float("${version}") < 14.0 else 1)"; then + log "Using pgsql 13 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/13/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_ID}).tar.gz" + elif python -c "import sys; sys.exit(0 if 12.0 <= float("${version}") < 13.0 else 1)"; then + log "Using pgsql 12 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/12/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_ID}).tar.gz" + elif python -c "import sys; sys.exit(0 if 11.0 <= float("${version}") < 12.0 else 1)"; then + log "Using pgsql 11 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/11/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_ID}).tar.gz" + elif python -c "import sys; sys.exit(0 if 10.0 <= float("${version}") < 11.0 else 1)"; then + log "Using pgsql 10 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/10/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_ID}).tar.gz" + else + if [ "${PG_CLUSTER_NAME}" != "gpdb" ] + then + log "Using pgsql 9.6 bins for pg_basebackup" + PG_BASEBACKUP="/usr/pgsql-9.6/bin/pg_basebackup" + else + log "Using gpdb bins for greenplum pg_basebackup" + TARGET_DB_ID="$(psql -h "${POSTGRES_HOST}" -p "${POSTGRES_PORT}" -U "${POSTGRES_USER}" -d postgres -c "select dbid from gp_segment_configuration where content = -1 and status = 'up' and role = 'p';" -tA )" + PG_BASEBACKUP="/usr/local/greenplum-db/bin/pg_basebackup --target-gp-dbid="${TARGET_DB_ID}"" + REPLICATION_USER=${POSTGRES_USER} + + fi + fi + + smoke_aws_s3 + stream_backup_to_aws_s3 +} + +main "$@" diff --git a/docker/postgres/backup-daemon.conf b/docker/postgres/backup-daemon.conf new file mode 100644 index 0000000..1fb414d --- /dev/null +++ b/docker/postgres/backup-daemon.conf @@ -0,0 +1,12 @@ +{ + schedule: "0 * * * *", + schedule: ${?BACKUP_SCHEDULE} + + eviction: "7d/delete" + eviction: ${?EVICTION_POLICY_BINARY} + eviction: ${?EVICTION_POLICY} + + storage: ${STORAGE_ROOT} + + command: "/opt/backup/postgres_backup.sh %(data_folder)s" +} \ No newline at end of file diff --git a/docker/postgres/configs.py b/docker/postgres/configs.py new file mode 100644 index 0000000..12d7afd --- /dev/null +++ b/docker/postgres/configs.py @@ -0,0 +1,41 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the 
"License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging.config +import os +from pyhocon import ConfigFactory + +GLOBAL_CONFIG_FILE_PATH = '/etc/backup-daemon.conf' +LOGGING_CONFIG_FILE_PATH = '/opt/backup/logging.conf' + + +def load_logging_configs(): + logging.config.fileConfig(LOGGING_CONFIG_FILE_PATH) + + +def load_configs(): + default_config = os.path.join(os.path.dirname(__file__), 'backup-daemon.conf') + if os.path.exists(GLOBAL_CONFIG_FILE_PATH): + conf = ConfigFactory.parse_file(GLOBAL_CONFIG_FILE_PATH).with_fallback(default_config) + else: + conf = ConfigFactory.parse_file(default_config) + + log = logging.getLogger("BackupDaemonConfiguration") + log.info("Loaded PostgreSQL backup configuration: %s" % json.dumps(conf)) + + return conf + +def is_external_pg(): + return os.getenv("EXTERNAL_POSTGRESQL", "") != "" diff --git a/docker/postgres/encryption.py b/docker/postgres/encryption.py new file mode 100644 index 0000000..1ede33d --- /dev/null +++ b/docker/postgres/encryption.py @@ -0,0 +1,253 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Crypto.Cipher import AES +import os +import io +import json +import logging +from kubernetes.client.rest import ApiException + + +class FileWrapper(object): + def __init__(self, file_path, encrypted): + self.__log = logging.getLogger("EncryptionHelper") + self.__file_path = file_path + self.__encrypted = encrypted + self.chunk_size = 4096 + if os.path.exists(file_path): + self.__file_size = os.path.getsize(file_path) + + def get_file_stream(self): + if self.__encrypted: + return self.get_decrypted() + else: + return io.FileIO(self.__file_path, "r", closefd=True) + + def put_file_stream(self, stream): + """ + This method saves file stream to a actual file on FS + in case of encryption, saves encrypted file and creates .key file with + metadata about encryption key. 
Used only for WAL archives + :param stream: file stream that should be saved + :return: sha256 - sha256 of processed file + """ + import hashlib + sha256 = hashlib.sha256() + if self.__encrypted: + password = KeyManagement.get_object().get_password() + cipher = EncryptionHelper.get_cipher_by_pw(password) + with io.FileIO(self.__file_path, "w", closefd=True) as target: + data = stream.read(self.chunk_size) + while True: + next_data = stream.read(self.chunk_size) + sha256.update(data) + if self.__encrypted: + # pad = False + # if len(data) % AES.block_size != 0: + # pad = True + pad = len(next_data) == 0 # try to pad only last chunk + data = cipher.encrypt(data, pad) + self.create_key_file() + target.write(data) + if len(next_data) == 0: + stream.close() + self.__log.info("Processed stream with sha256 {}".format( + sha256.hexdigest())) + return sha256.hexdigest() + else: + data = next_data + + def get_decrypted(self): + """ + in case of encryption everything is pretty simple, + need to decrypt file by chunks and then return it as a stream + :return: decrypted file stream + """ + self.__log.info("Will try to decrypt file: %s" % self.__file_path) + cipher = EncryptionHelper.get_cipher_by_pw( + self.get_password_for_file()) + import tempfile + decrypted = tempfile.TemporaryFile(mode='w+') + with io.FileIO(self.__file_path, "r", closefd=True) as encrypted: + number_of_chunks = self.__file_size / self.chunk_size + iteration = 0 + while True: + data = encrypted.read(self.chunk_size) + if len(data) == 0: + encrypted.close() + # return stream to the start + decrypted.seek(0) + return decrypted + data = cipher.decrypt(data) + if iteration == number_of_chunks: + data = cipher.unpad(data) + iteration = iteration + 1 + decrypted.write(data) + + def get_password_for_file(self): + """ + this method gets password by file name. + in case of WAL file metadata about the key is saved to .key file, + in case of full backup in .metrics. + :return: encryption key as a string + """ + if self.__file_path.endswith("tar.gz"): # full backup + key_filename = os.path.dirname(self.__file_path) + "/.metrics" + else: # WAL archive + key_filename = self.__file_path + ".key" + + if os.path.exists(key_filename): + with open(key_filename) as f: + data = json.load(f) + key_name, key_source = data["key_name"], data["key_source"] + self.__log.info("Will use key_name: {} and key_source: {} " + "for file decryption: {}". + format(key_name, key_source, self.__file_path)) + return KeyManagement(key_source, + key_name).get_password() + else: + # if file not exists for some of the reason, return default PW + return KeyManagement.get_object().get_password() + + def create_key_file(self): + key_info = { + "key_name": KeyManagement.get_key_name(), + "key_source": KeyManagement.get_key_source() + } + key_filename = self.__file_path + ".key" + with open(key_filename, 'w+') as outfile: + json.dump(key_info, outfile) + + +class EncryptionHelper(object): + def __init__(self, key_management): + self.log = logging.getLogger("EncryptionHelper") + self.__key_management = key_management + + @staticmethod + def evp_bytes_to_key(password, key_len=32, iv_len=16): + """ + Derive the key and the IV from the given password and salt. 
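+
+        Note: this appears to follow OpenSSL's classic EVP_BytesToKey scheme
+        (MD5 digest, one iteration, no salt), so with the default
+        key_len/iv_len it should yield the same key/IV pair as
+        `openssl enc -aes-256-cbc -nosalt -pass pass:<password>` when that
+        command derives its key with MD5.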
+ """ + from hashlib import md5 + d_tot = md5(password).digest() + d = [d_tot] + while len(d_tot) < (iv_len + key_len): + d.append(md5(d[-1] + password).digest()) + d_tot += d[-1] + return d_tot[:key_len], d_tot[key_len:key_len + iv_len] + + @staticmethod + def get_cipher_by_pw(password): + key, iv = EncryptionHelper.evp_bytes_to_key(password) + return CipherWrapper(iv, key) + + +class KeyManagement(object): + def __init__(self, pw_source, pw_name): + self.log = logging.getLogger("KeyManagement") + self.pw_source = pw_source + self.pw_name = pw_name + # self.pw_version = pw_version + + @staticmethod + def get_object(): + pw_source = os.getenv("KEY_SOURCE", 'kubernetes') + pw_name = os.getenv("KEY_NAME", "daemon-secret") + return KeyManagement(pw_source, pw_name) + + def get_password(self): + if self.pw_source.lower() == KeySources.KUBERNETES: + pw_name = os.getenv("KEY_NAME", "daemon-secret") + return KubernetesPassword(pw_name).get_password() + elif self.pw_source.lower() == KeySources.VAULT: + pw_name = os.getenv("KEY_NAME", "daemon-secret") + return VaultPassword(pw_name).get_password() + + def get_password_by_name(self, key_name): + self.log.info("Try to get key: {}".format(key_name)) + if self.pw_source.lower() == KeySources.KUBERNETES: + return KubernetesPassword(key_name).get_password() + elif self.pw_source.lower() == KeySources.VAULT: + return VaultPassword(key_name).get_password() + + @staticmethod + def get_key_name(): + return os.getenv("KEY_NAME", "daemon-secret").lower() + + @staticmethod + def get_key_source(): + return os.getenv("KEY_SOURCE", "kubernetes").lower() + + +# here should be some abstract class +class KubernetesPassword(object): + SA_PATH = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" + + def __init__(self, key_name): + self.log = logging.getLogger("KubernetesPassword") + self.key_name = key_name + + def get_password(self): + from kubernetes import config + from kubernetes.client.apis import core_v1_api + config.load_incluster_config() + api = core_v1_api.CoreV1Api() + # https://github.com/kubernetes-client/python/issues/363 + namespace = open(self.SA_PATH).read() + try: + api_response = api.read_namespaced_secret(self.key_name, namespace) + import base64 + return base64.b64decode(api_response.data.get("password")) + except ApiException as exc: + self.log.error(exc) + raise exc + + +class VaultPassword(object): + def __init__(self, key_name): + pass + + def get_password(self): + pass + + +class CipherWrapper(object): + def __init__(self, iv, key): + from Crypto.Cipher import AES + self.__cipher = AES.new(key, AES.MODE_CBC, iv) + + def encrypt(self, data, pad=False): + return self.__cipher.encrypt(self.pad(data) if pad else data) + + def decrypt(self, data, unpad=False): + return self.unpad(self.__cipher.decrypt(data)) \ + if unpad else self.__cipher.decrypt(data) + + def pad(self, data): + from Crypto.Util.Padding import pad + return pad(data, AES.block_size) + + def unpad(self, data): + from Crypto.Util.Padding import unpad + return unpad(data, AES.block_size) + + +class KeySources: + KUBERNETES = "kubernetes" + VAULT = "vault" + + def __init__(self): + pass diff --git a/docker/postgres/endpoints/__init__.py b/docker/postgres/endpoints/__init__.py new file mode 100644 index 0000000..0ae8f9c --- /dev/null +++ b/docker/postgres/endpoints/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['backup', 'status', 'wal', 'restore'] diff --git a/docker/postgres/endpoints/backup.py b/docker/postgres/endpoints/backup.py new file mode 100644 index 0000000..8d397e9 --- /dev/null +++ b/docker/postgres/endpoints/backup.py @@ -0,0 +1,154 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Set of endpoints to performs actions with physical backups. +""" + +import logging +import utils +from flask_restful import Resource +from flask import Response, request, stream_with_context + +import requests +from requests.exceptions import HTTPError +from flask_httpauth import HTTPBasicAuth + +auth = HTTPBasicAuth() + + +@auth.verify_password +def verify(username, password): + return utils.validate_user(username, password) + + +class BackupRequest(Resource): + __endpoints = [ + '/backup', + '/backups/request' + ] + + def __init__(self): + self.__log = logging.getLogger("BackupEndpoint") + + @staticmethod + def get_endpoints(): + return BackupRequest.__endpoints + + @auth.login_required + def post(self): + self.__log.debug("Endpoint /backup has been called.") + # Redirect backup request to underlying scheduler, which keeps status of scheduled backups. 
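+        # The scheduler listens on localhost:8085 within the same pod, and its
+        # JSON reply is returned to the caller as-is. A client would typically
+        # trigger this endpoint with something like
+        #   curl -u <user>:<password> -XPOST http://<backup-daemon>/backup
+        # (host and credentials here are illustrative, not fixed values).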
+ r = requests.post('http://localhost:8085/schedule') + + if not r.ok: + try: + r.raise_for_status() + except HTTPError as e: + self.__log.exception("Something went wrong when redirecting backup request to /schedule endpoint.", e) + + self.__log.debug(r.json()) + + return r.json() + + +class Eviction(Resource): + + __endpoints = [ + '/evict', + '/backups/delete' + ] + + def __init__(self, storage): + self.__storage = storage + + @staticmethod + def get_endpoints(): + return Eviction.__endpoints + + @auth.login_required + def delete(self): + backup_id = request.args.getlist('id')[0] + vaults = self.__storage.list() + vaults.reverse() + + # search for vault + vault_for_eviction = None + for vault in vaults: + if vault.get_id() == backup_id: + vault_for_eviction = vault + break + if vault_for_eviction: + self.__storage.evict(vault_for_eviction) + return "Ok" + else: + return "Not Found" + + +class Download(Resource): + + __endpoints = [ + '/get', + '/backups/download' + ] + + def __init__(self, storage): + self.__storage = storage + self.__log = logging.getLogger("DownloadBackupEndpoint") + + @staticmethod + def get_endpoints(): + return Download.__endpoints + + @auth.login_required + def get(self): + def generate(storage, vault): + stream = storage.get_backup_as_stream(vault) + with stream as f: + chunk_size = 4096 + while True: + data = f.read(chunk_size) + if len(data) == 0: + f.close() + return + yield data + + vaults = self.__storage.list() + vaults.reverse() + backup_id = request.args.getlist('id')[0] if request.args.getlist('id') else None + + # search for vault + vault_for_streaming = None + if backup_id: + for vault in vaults: + if vault.get_id() == backup_id: + vault_for_streaming = vault + break + else: + for vault in vaults: + if not vault.is_failed(): + vault_for_streaming = vault + break + if vault_for_streaming: + return Response(stream_with_context( + generate(self.__storage, vault_for_streaming)), + mimetype='application/octet-stream', + headers=[ + ('Content-Type', 'application/octet-stream'), + ('Content-Disposition', + "pg_backup_{}.tar.gz".format( + vault_for_streaming.get_id())) + ]) + else: + return Response("Cannot find backup", status=500) diff --git a/docker/postgres/endpoints/restore.py b/docker/postgres/endpoints/restore.py new file mode 100644 index 0000000..c5d031c --- /dev/null +++ b/docker/postgres/endpoints/restore.py @@ -0,0 +1,445 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Set of endpoints to performs actions with external restore. 
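+
+Currently this covers the /external/restore endpoint, which drives
+point-in-time restore of an externally managed PostgreSQL instance
+(AZURE or RDS, selected via the EXTERNAL_POSTGRESQL environment variable).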
+""" + +import datetime +import time +import errno +import hashlib +import json +import logging +import os +import subprocess +from threading import Thread +import boto3 +from dateutil import parser + +from kubernetes import client as k8s_client, config as k8s_config + +import utils +import fcntl +from flask import Response, request +from flask_httpauth import HTTPBasicAuth +from flask_restful import Resource + +auth = HTTPBasicAuth() + + +@auth.verify_password +def verify(username, password): + return utils.validate_user(username, password) + + +class ExternalRestoreRequest(Resource): + __endpoints = [ + '/external/restore', + ] + + def __init__(self, storage): + self.__log = logging.getLogger("ExternalRestoreEndpoint") + self.restore_folder = f"{storage.root}/external/restore" + self.allowable_db_types = ['AZURE', 'RDS'] + if not os.path.exists(self.restore_folder): + try: + os.makedirs(self.restore_folder) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + self.status_file = f"{self.restore_folder}/status" + + @staticmethod + def cleanup_restore_status(storage): + log=logging.getLogger("ExternalRestoreEndpoint") + restore_folder = f"{storage.root}/external/restore" + status_file = f"{restore_folder}/status" + if os.path.exists(status_file): + with open(status_file, 'r') as f: + status_map = json.load(f) + f.close() + restoreId = status_map['restoreId'] + restore_file = f"{restore_folder}/{restoreId}" + if os.path.exists(restore_file): + stuck_status = None + with open(restore_file, 'r') as o: + stuck_status = json.load(o) + o.close() + stuck_status['status'] = "Failed" + with open(restore_file, 'w') as o: + o.write(json.dumps(stuck_status)) + o.close() + os.remove(status_file) + + + @staticmethod + def get_endpoints(): + return ExternalRestoreRequest.__endpoints + + @auth.login_required + def post(self): + self.__log.debug("Endpoint /external/restore has been called") + + external_pg_type = os.getenv("EXTERNAL_POSTGRESQL", "FALSE").upper() + + if external_pg_type not in self.allowable_db_types: + return 404 + + req = request.get_json() + + if req.get('restore_time'): + restore_time = req.get('restore_time') + else: + self.__log.info("No restore_time provided") + return "No restore_time provided", 400 + + if req.get('restore_as_separate'): + restore_as_separate = req.get('restore_as_separate') + else: + restore_as_separate = "false" + + if req.get('geo_restore'): + geo_restore = req.get('geo_restore') + else: + geo_restore = "false" + + subnet = req.get('subnet', 'false') + if os.path.isfile(self.status_file): + with open(self.status_file, 'r') as f: + status_map = json.load(f) + f.close() + if status_map['status'] == "In Progress": + return status_map, 200 + else: + os.remove(self.status_file) + + restore_id = ExternalRestore.generate_restore_id() + + if external_pg_type == "AZURE": + restore = ExternalRestore(self.restore_folder, restore_id, restore_time, restore_as_separate, geo_restore, subnet) + + if external_pg_type == "RDS": + restore = RDSRestore(self.restore_folder, restore_id, restore_time, restore_as_separate) + + restore.start() + return Response(restore_id, 202) + + +class ExternalRestore(Thread): + def __init__(self, restore_folder, restore_id, restore_time, restore_as_separate, geo_restore, subnet): + Thread.__init__(self) + self.__log = logging.getLogger('ExternalRestore') + self.restore_id = restore_id + self.restore_time = restore_time + self.restore_folder = restore_folder + self.restore_as_separate = restore_as_separate + self.geo_restore = geo_restore 
+ self.subnet = subnet + + def run(self): + cmd_processed = self.__process_cmd(self.restore_id, self.restore_time, self.restore_folder, + self.restore_as_separate, self.geo_restore, self.subnet) + exit_code = subprocess.call(cmd_processed) + if exit_code != 0: + self.__log.error("Restore process has been failed") + else: + self.__log.info("Restore process successfully finished") + + def __process_cmd(self, restore_id, restore_time, restore_folder, restore_as_separate, geo_restore, subnet): + self.__log.info(f"restore_id {restore_id}, restore_time {restore_time}, " + f"restore_folder {restore_folder}, restore_as_separate {restore_as_separate}, " + f"subnet: {subnet}") + cmd_processed = self.__split_command_line( + f"/opt/backup/azure_restore " + f"--restore_id {restore_id} " + f"--restore_time {restore_time} " + f"--restore_folder {restore_folder} " + f"--restore_as_separate {restore_as_separate} " + f"--geo_restore {geo_restore} " + f"--subnet {subnet}") + return cmd_processed + + @staticmethod + def __split_command_line(cmd_line): + import shlex + lex = shlex.shlex(cmd_line) + lex.quotes = '"' + lex.whitespace_split = True + lex.commenters = '' + return list(lex) + + @staticmethod + def generate_id(): + return datetime.datetime.now().strftime("%Y%m%dT%H%M%S%f") + + @staticmethod + def generate_restore_id(): + return 'restore-%s' % ExternalRestore.generate_id() + + +class RDSRestore(Thread): + + NAMESPACE_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/namespace' + + def __init__(self, restore_folder, restore_id, restore_time, restore_as_separate): + Thread.__init__(self) + try: + self.__log = logging.getLogger('RDSRestore') + self.restore_id = restore_id + self.restore_time = restore_time + self.restore_folder = restore_folder + self.client = self.get_rds_client() + self.pg_service_name = 'pg-patroni' + self.namespace = open(self.NAMESPACE_PATH).read() + k8s_config.load_incluster_config() + self.k8s_core_api = k8s_client.CoreV1Api() + self.status = { + 'trackingId': self.restore_id, + 'namespace': self.namespace, + 'status': BackupStatus.PLANNED + } + self.restore_as_separate = restore_as_separate + except Exception as e: + print("RDS: Client Error: %s " % e) + raise Exception(e) + + def run(self): + try: + self.__log.info("RDS: Restore Cluster Running") + self.update_status("status", BackupStatus.IN_PROGRESS, True) + + external_name = self.get_service_external_name() + cluster_name = external_name.split(".")[0] + + response, restored_cluster_name = self.restore_cluster(cluster_name) + current_instances, restored_instances = self.restore_db_instances(cluster_name, restored_cluster_name) + self.wait_for_db_instance(restored_instances) + if self.restore_as_separate != 'true': + self.update_service_external_name(restored_cluster_name) + self.stop_db_cluster(cluster_name) + self.update_status("status", BackupStatus.SUCCESSFUL, True) + self.__log.info("RDS: Restore Cluster Successful") + except Exception as e: + self.__log.error("RDS: Restore failed %s" % e) + self.update_status("status", BackupStatus.FAILED, True) + + @staticmethod + def generate_id(): + return datetime.datetime.now().strftime("%Y%m%dT%H%M%S%f") + + @staticmethod + def generate_restore_id(): + return 'restore-%s' % RDSRestore.generate_id() + + def get_rds_client(self): + self.__log.info("RDS: Init RDS Client") + return boto3.client("rds") + + def restore_cluster(self, cluster_name): + + self.check_name(cluster_name) + + restored_cluster_name = self.get_restored_name(cluster_name) + self.__log.info("RDS: Restore cLuster: 
%s" % cluster_name) + self.__log.info("RDS: Restore cLuster with new name: %s" % restored_cluster_name) + + try: + security_groups = [] + instance_response = self.client.describe_db_instances( + Filters=[{'Name': 'db-cluster-id', 'Values': [cluster_name, ]}, ]) + if instance_response.get('DBInstances'): + security_groups = self.extract_vpc_security_group_ids( + instance_response.get('DBInstances')[0].get('VpcSecurityGroups')) + + cluster_response = self.client.describe_db_clusters(DBClusterIdentifier=cluster_name) + if cluster_response.get('DBClusters'): + cluster = cluster_response.get('DBClusters')[0] + subnet_group_name = cluster.get('DBSubnetGroup') + port = cluster.get('Port') + else: + raise Exception("Cluster response is empty. Can not read parameters to continue") + + restore_date = parser.parse(self.restore_time) + + response = self.client.restore_db_cluster_to_point_in_time( + DBClusterIdentifier=restored_cluster_name, + SourceDBClusterIdentifier=cluster_name, + RestoreToTime=restore_date, + DBSubnetGroupName=subnet_group_name, + Port=port, + VpcSecurityGroupIds=security_groups, + Tags=[{ + 'Key': 'SOURCE-CLUSTER', + 'Value': cluster_name + }, { + 'Key': 'RESTORE-TIME', + 'Value': self.restore_time + }, + ], + ) + + status_code = response.get('ResponseMetadata').get('HTTPStatusCode') + if status_code != 200: + raise Exception("Error occurred while cluster restoring. http code: %s" % status_code) + self.__log.info("RDS: Cluster %s restored successfully" % restored_cluster_name) + except Exception as e: + raise Exception("RDS: Client Error: %s " % e) + + return response, restored_cluster_name + + def wait_for_db_instance(self, restored_instances): + for instance_name in restored_instances: + self.__log.info("RDS: Wait For DB Instance: %s" % instance_name) + waiter = self.client.get_waiter('db_instance_available') + waiter.wait(DBInstanceIdentifier=instance_name) + self.__log.info("RDS: DB Instance %s is ready" % instance_name) + + def restore_db_instances(self, cluster_name, restored_cluster_name): + instance_response = self.client.describe_db_instances( + Filters=[{'Name': 'db-cluster-id', 'Values': [cluster_name, ]}, ]) + if instance_response.get('DBInstances'): + restored_instances = [] + current_instances = [] + for current_instance in instance_response.get('DBInstances'): + current_instance, restored_instance = self.create_db_instance(restored_cluster_name, current_instance) + restored_instances.append(restored_instance) + current_instances.append(current_instance) + else: + raise Exception("RDS: Instance response is empty. 
Can not read parameters to continue") + return current_instances, restored_instances + + def create_db_instance(self, restored_cluster_name, current_instance): + try: + current_instance_id = current_instance.get('DBInstanceIdentifier') + self.check_name(current_instance_id) + restored_instance_id = self.get_restored_name(current_instance_id) + self.__log.info("RDS: Create DB Instance: %s" % restored_instance_id) + + db_instance_class = current_instance.get('DBInstanceClass') + db_engine = current_instance.get('Engine') + # db_master_username = current_instance.get('MasterUsername') + + self.client.create_db_instance( + DBInstanceIdentifier=restored_instance_id, + DBInstanceClass=db_instance_class, + Engine=db_engine, + DBClusterIdentifier=restored_cluster_name, + Tags=[{ + 'Key': 'SOURCE-CLUSTER', + 'Value': current_instance_id + }, + ], + ) + except Exception as e: + raise Exception("RDS: Restore DB Instance Failed: %s" % e) + + return current_instance_id, restored_instance_id + + def extract_vpc_security_group_ids(self, current_instance_security_groups): + vpc_security_group_ids = [] + for group in current_instance_security_groups: + vpc_security_group_ids.append(group.get('VpcSecurityGroupId')) + return vpc_security_group_ids + + def get_cluster_name(self, instance_name): + instance_response = self.client.describe_db_instances(DBInstanceIdentifier=instance_name) + return instance_response.get('DBClusterIdentifier') + + def update_service_external_name(self, restored_cluster_name): + self.__log.info("RDS: update service external name: %s" % restored_cluster_name) + # instance_response = self.client.describe_db_instances( + # Filters=[{'Name': 'db-cluster-id', 'Values': [restored_cluster_name,]},]) + # if instance_response.get('DBInstances'): + # restored_db_endpoint = instance_response.get('DBInstances')[0].get('Endpoint').get('Address') + cluster_response = self.client.describe_db_clusters(DBClusterIdentifier=restored_cluster_name) + if cluster_response.get('DBClusters'): + restored_db_endpoint = cluster_response.get('DBClusters')[0].get('Endpoint') + else: + raise Exception("RDS: Restored instance response is empty. Can not read parameters to continue") + + service_patch = self.create_service(restored_db_endpoint) + self.k8s_core_api.patch_namespaced_service(name=self.pg_service_name, namespace=self.namespace, body=service_patch) + + def get_service_external_name(self): + services = self.k8s_core_api.list_namespaced_service( + namespace=self.namespace, + field_selector="metadata.name=%s" % self.pg_service_name) + external_name = services.items[0].spec.external_name + self.__log.info("RDS: Current service external name: %s" % external_name) + return external_name + + def create_service(self, external_name) -> k8s_client.V1Service: + return k8s_client.V1Service( + spec=k8s_client.V1ServiceSpec( + external_name=external_name + ), + ) + + def get_restored_name(self, name): + hashed_time = hashlib.md5(self.restore_id.encode('utf-8')).hexdigest()[0:8] + if "-restore-" in name: + old_name = name.split("-restore-") + return old_name[0] + "-restore-" + hashed_time + else: + return name + "-restore-" + hashed_time + + #RDS cluster or instance names must contain from 1 to 63 symbols. 17 of them are reserved for restored name. + def check_name(self, name): + if len(name) > 46: + raise Exception("Name :%s is too long. 
Must me not more than 46 symbols") + + def stop_db_instances(self, instances): + for instance in instances: + self.__log.info("RDS: Stop instance: %s" % instance) + self.client.stop_db_instance(DBInstanceIdentifier=instance) + self.__log.info("RDS: Stop instance: %s - Stopped" % instance) + + def stop_db_cluster(self, cluster_name): + self.__log.info("RDS: Stop cluster: %s" % cluster_name) + self.client.stop_db_cluster(DBClusterIdentifier=cluster_name) + self.__log.info("RDS: Stop cluster: %s - Stopped" % cluster_name) + + def build_restore_status_file_path(self, ): + return '%s/%s' % (self.restore_folder, self.restore_id) + + def flush_status(self): + path = self.build_restore_status_file_path() + self.write_in_json(path, self.status) + + def update_status(self, key, value, flush=False): + self.status[key] = value + if flush: + self.flush_status() + + def write_in_json(self, path, data): + with open(path, 'w') as fd: + try: + fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + json.dump(data, fd) + return data + except IOError: # another process accessing + self.__log.info("trying to access locked file while writing") + raise + finally: + fcntl.lockf(fd, fcntl.LOCK_UN) + + +class BackupStatus: + SUCCESSFUL = 'Successful' + FAILED = 'Failed' + IN_PROGRESS = 'In progress' + PLANNED = 'Planned' + UNKNOWN = 'Unknown' + CANCELED = 'Canceled' diff --git a/docker/postgres/endpoints/status.py b/docker/postgres/endpoints/status.py new file mode 100644 index 0000000..42a8e5d --- /dev/null +++ b/docker/postgres/endpoints/status.py @@ -0,0 +1,302 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Set of endpoints that provide information about backups. 
+""" + +import logging +import utils +import requests +import os.path +import tempfile +import json + +from flask import Response +from flask_restful import Resource +from flask_httpauth import HTTPBasicAuth + +import storage_s3 +import eviction + + +auth = HTTPBasicAuth() + + +@auth.verify_password +def verify(username, password): + return utils.validate_user(username, password) + + +class Status(Resource): + __endpoints = [ + '/health', + '/status' + ] + + def __init__(self, storage): + self.__log = logging.getLogger("HealthEndpoint") + self.__storage = storage + + @staticmethod + def get_endpoints(): + return Status.__endpoints + + def get(self): + self.__log.debug("Endpoint /health has been called.") + result = { + "status": "UP", + "storage": {}, + "encryption": "Off" + } + + result.update(self.__storage.get_backup_in_progress_metrics()) + + schedule_rs = requests.get('http://localhost:8085/schedule') + + if not schedule_rs.ok: + result['status'] = "PROBLEM" + eviction_rule = None + + try: + schedule_rs.raise_for_status() + except requests.HTTPError as e: + result['message'] = e.__str__ + else: + schedule_metrics = schedule_rs.json() + eviction_rule = schedule_metrics['eviction_rule'] + + result.update({ + 'backup': schedule_metrics + }) + + vaults = list([v for v in self.__storage.list() if v.is_back_up_archive_exists()]) + vaults.reverse() + + fs_free, fs_total = self.__storage.fs_space() + + dump_count = len(vaults) + successful_dump_count = len([x for x in vaults if not x.is_failed()]) + + result["storage"] = { + "dump_count": dump_count, + "successful_dump_count": successful_dump_count, + "size": self.__storage.size(), + "archive_size": self.__storage.archive_size(), + "free_space": fs_free, + "total_space": fs_total, + "type": self.__storage.get_type(), + "type_id": self.__storage.get_type_id() + } + + if eviction_rule: + outdated_vaults = eviction.evict(vaults, eviction_rule, + accessor=lambda x: x.create_time()) + + result['storage']['outdated_backup_count'] = len(outdated_vaults) + + # calculate last successful + for vault in vaults: + if not vault.is_failed(): + result["storage"]["lastSuccessful"] = vault.to_json() + break + + if len(vaults) > 0: + last_vault = vaults[:1][0] + result["storage"]["last"] = last_vault.to_json() + if last_vault.is_failed(): + result["status"] = "WARNING" + + if self.__storage.get_encryption(): + result["encryption"] = "On" + + if self.__log.isEnabledFor(logging.DEBUG): + debug_info = { + 'debug': True, + 'endpoint': '/health', + 'response': result + } + self.__log.debug(debug_info) + + return result + +class List(Resource): + __endpoints = [ + '/list', + '/backups/list' + ] + + def __init__(self, storage): + self.__storage = storage + + @staticmethod + def get_endpoints(): + return List.__endpoints + + @auth.login_required + def get(self): + result = { + } + + vaults = list([v for v in self.__storage.list() if v.is_back_up_archive_exists()]) + vaults.reverse() + + # calculate last successful + for vault in vaults: + result[vault.get_id()] = vault.to_json() + + return result + + +class BackupStatus(Resource): + __endpoints = [ + '/backup/status/' + ] + + def __init__(self, storage): + self.__log = logging.getLogger('BackupRequestEndpoint') + self.__storage = storage + + @staticmethod + def get_endpoints(): + return BackupStatus.__endpoints + + def get(self, backup_id): + vault = None + result = None + vaults = self.__storage.list() + vaults.reverse() + + for vault in vaults: + if vault.get_id() == backup_id: + result = vault + break + if not 
result: + self.__log.info("Backup %s not found" % backup_id) + return Response("Backup %s not found \n" % backup_id, status=404) + else: + if vault.is_locked(): + self.__log.info("In Progress %s" % backup_id) + return Response("In Progress \n", status=200) + elif vault.is_failed(): + self.__log.info("Backup Failed %s" % backup_id) + return Response("Backup Failed \n", status=200) + elif vault.is_done(): + self.__log.info("Backup Done %s" % backup_id) + return Response("Backup Done \n", status=200) + else: + self.__log.info("Backup Failed %s" % backup_id) + return Response("Backup Failed \n", status=200) + + +class Health(Resource): + + __endpoints = [ + '/v2/health' + ] + + def __init__(self, storage): + self.__log = logging.getLogger("HealthEndpoint") + self.__root = storage.root + + @staticmethod + def get_endpoints(): + return Health.__endpoints + + def get(self): + schedule_rs = requests.get('http://localhost:8085/schedule') + # we also have to check that granular app works correctly + protocol = "http" + if os.getenv("TLS", "false").lower() == "true": + protocol += "s" + gr_backups = requests.get(protocol + '://localhost:9000/health', verify=False) + if (schedule_rs.status_code == 200) and (self.volume_liveliness_check()) \ + and gr_backups.status_code == 200: + return Response("OK", status=200) + else: + return Response("Internal server error", status=500) + + def volume_liveliness_check(self): + mount_path = self.__root + if os.environ['STORAGE_TYPE'] == "s3": + try: + with open(mount_path + "/health", "w") as f: + f.write("Health check") + if self.s3_health_check(mount_path + "/health"): + return True + else: + return False + except (IOError, Exception) as ex: + self.__log.exception(ex) + return False + else: + try: + f = tempfile.TemporaryFile(mode='w+t', suffix='.txt', prefix='volume_check_', dir=mount_path) + f.write("Test") + f.seek(0) + contents = f.read() + f.close() + return True + except (IOError, Exception) as ex: + self.__log.exception(ex) + return False + + def s3_health_check(self, filepath): + bucket = os.getenv("CONTAINER") + prefixed_filepath = os.getenv("AWS_S3_PREFIX", "") + filepath + try: + storage_s3.AwsS3Vault.get_s3_client().upload_file(filepath, bucket, prefixed_filepath) + except (IOError, Exception) as ex: + self.__log.exception(ex) + return False + try: + storage_s3.AwsS3Vault.get_s3_client().delete_object(Bucket=bucket, Key=prefixed_filepath) + except (IOError, Exception) as ex: + self.__log.exception(ex) + return False + return True + + +class ExternalRestoreStatus(Resource): + __endpoints = [ + '/external/restore/' + ] + + def __init__(self, storage): + self.__log = logging.getLogger('ExternalRestoreStatusEndpoint') + self.restore_folder = f"{storage.root}/external/restore" + self.allowable_db_types = ['AZURE', 'RDS'] + + @staticmethod + def get_endpoints(): + return ExternalRestoreStatus.__endpoints + + @auth.login_required + def get(self, restore_id): + self.__log.info("Get restore status request") + + external_pg_type = os.getenv("EXTERNAL_POSTGRESQL", "FALSE").upper() + + if external_pg_type not in self.allowable_db_types: + return Response(response=json.dumps({"status": "Not Supported"}), status=501) + + restore_file_path = f"{self.restore_folder}/{restore_id}" + if os.path.isfile(restore_file_path): + with open(restore_file_path, 'r') as f: + status_map = json.load(f) + f.close() + return status_map, 200 + else: + self.__log.info(f"Restore process not found {restore_id}") + return Response(response=json.dumps({"status": "Not Found"}), status=404) 
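The status endpoints above are thin wrappers over the storage layer, so their contract is easiest to see from a small client sketch. This is only an illustration: the host name and credentials are placeholders, and the port numbers follow the gunicorn bindings configured in start_backup_daemon.sh later in this change (8080 for the public app, 8081 for the private one), assuming plain HTTP.

# client_sketch.py -- illustrative only, not part of this change
import requests

PUBLIC = "http://backup-daemon:8080"        # assumed service address
PRIVATE = "http://backup-daemon:8081"
AUTH = ("backup-user", "backup-password")   # placeholder Basic Auth credentials

# /health is served without authentication and reports storage metrics.
health = requests.get(PUBLIC + "/health").json()
print(health["status"], health["storage"].get("dump_count"))

# /list is registered on the private app and protected by HTTPBasicAuth.
backups = requests.get(PRIVATE + "/list", auth=AUTH).json()
for backup_id, meta in backups.items():
    print(backup_id, meta)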
diff --git a/docker/postgres/endpoints/wal.py b/docker/postgres/endpoints/wal.py new file mode 100644 index 0000000..028b68a --- /dev/null +++ b/docker/postgres/endpoints/wal.py @@ -0,0 +1,202 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" + Set of endpoints for PostgreSQL to manipulate with write-ahead logs (WAL) archives. +""" +import threading +from threading import Thread + +from flask_restful import Resource +from flask import Response, request, stream_with_context +from flask_httpauth import HTTPBasicAuth +import utils +import logging +from filelock import Timeout, FileLock + +auth = HTTPBasicAuth() + +RESP_NO_FILE = "Please send file using curl -XPOST -F 'file=@somefile'" +RESP_EXIST_SHA_DIFF = "Archive file already exists with different sha256." +RESP_EXIST_SHA_SAME = "Archive file already exists with same sha256." +RESP_SHA_MISMATCH = "Provided sha256 does not match sha256 of stream" +RESP_WAL_PROC_BUSY = "WAL processor is busy" + +@auth.verify_password +def verify(username, password): + return utils.validate_user(username, password) + + +class Upload(Resource): + __log = logging.getLogger("UploadArchive") + + __endpoints = [ + '/archive/put', + '/archive/upload' + ] + + __evict_lock = threading.Lock() + + def __init__(self, storage): + self.__storage = storage + self.wal_processing_lock = FileLock("/tmp/wal.processing.lock") + self.wal_eviction_lock = FileLock("/tmp/wal.eviction.lock") + + @staticmethod + def get_endpoints(): + return Upload.__endpoints + + def __evict_archive_if_rule_exist(self): + self.__log.info("Check if eviction rule is specified and run eviction if needed") + if self.__storage.is_archive_evict_policy_set(): + try: + with self.wal_eviction_lock.acquire(timeout=5): + self.__storage.evict_archive() + except Timeout: + self.__log.warning("Evict is in progress. Skip new evict.") + + def __process_wall_post(self): + filename = None + sha256 = None + if request.args.getlist('filename'): + filename = request.args.getlist('filename')[0] + if request.args.getlist('sha256'): + sha256 = request.args.getlist('sha256')[0] + + self.__log.info(request.args) + self.__log.info( + "Start upload processing for {} with sha256 {}".format( + filename, sha256)) + + if self.__storage.is_archive_exists(filename): + stored_sha = self.__storage.get_sha256sum_for_archive(filename) + calc_sha = self.__storage.calculate_sha256sum_for_archive(filename) + if stored_sha == sha256: + if calc_sha != sha256: + self.__log.warning( + "Looks like file in storage broken. " + "Will receive new one as replacement. " + "Calculated sha256: {}.".format(calc_sha)) + else: + return Response(RESP_EXIST_SHA_SAME, status=208) + elif not stored_sha and calc_sha == sha256: + self.__log.warning( + "Found file without sha. 
Will store metainfo for future.") + self.__storage.store_archive_checksum(filename, sha256) + return Response(RESP_EXIST_SHA_SAME, status=208) + else: + return Response(RESP_EXIST_SHA_DIFF, status=409) + + if 'file' not in request.files: + return Response(RESP_NO_FILE, status=400) + + file_obj = request.files['file'] + try: + self.__storage.store_archive_checksum(filename, sha256) + sha256_processed = self.__storage.put_archive_as_stream( + filename, file_obj.stream) + if sha256 == sha256_processed: + return Response("Ok", status=200) + else: + self.__log.info( + "Wrong result sha256 hash: {}".format(sha256_processed)) + self.__storage.delete_archive(filename) + return Response(RESP_SHA_MISMATCH, status=400) + except Exception: + self.__log.exception("Cannot store archive log.") + for i in range(5): + try: + self.__storage.delete_archive(filename) + break + except Exception: + self.__log.exception("Cannot cleanup failed archive log.") + return Response("Internal error occurred.", status=500) + + @auth.login_required + def post(self): + try: + thread = Thread(target=self.__evict_archive_if_rule_exist) + thread.start() + except Exception as e: + self.__log.exception("Cannot start archive eviction during post.") + try: + with self.wal_processing_lock.acquire(timeout=5): + return self.__process_wall_post() + except Timeout: + self.__log.warning("Cannot process WAL because another in progress") + return Response(RESP_WAL_PROC_BUSY, status=503) + + +class Download(Resource): + + __endpoints = [ + '/archive/get', + '/archive/download' + ] + + def __init__(self, storage): + self.__storage = storage + + @staticmethod + def get_endpoints(): + return Download.__endpoints + + @auth.login_required + def get(self): + def generate(storage, filename): + with storage.get_archive_as_stream(filename) as f: + chunk_size = 4096 + while True: + data = f.read(chunk_size) + if len(data) == 0: + f.close() + return + yield data + + file_name = request.args.getlist('filename')[ + 0] if request.args.getlist('filename') else None + if file_name: + if self.__storage.is_archive_exists(file_name): + return Response(stream_with_context( + generate(self.__storage, file_name)), + mimetype='application/octet-stream', + headers=[ + ('Content-Type', + 'application/octet-stream'), + ('Content-Disposition', file_name) + ]) + else: + return Response( + "Cannot find file {} in archive".format(file_name), + status=404) + + +class Delete(Resource): + + __endpoints = [ + '/archive/delete' + ] + + def __init__(self, storage): + self.__storage = storage + + @staticmethod + def get_endpoints(): + return Delete.__endpoints + + @auth.login_required + def delete(self): + filename = request.args.getlist('filename')[0] if request.args.getlist('filename') else None + if filename: + self.__storage.delete_archive(filename) diff --git a/docker/postgres/eviction.py b/docker/postgres/eviction.py new file mode 100644 index 0000000..01a0311 --- /dev/null +++ b/docker/postgres/eviction.py @@ -0,0 +1,78 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import time +from itertools import groupby + + +class Rule: + magnifiers = { + "min": 60, + "h": 60 * 60, + "d": 60 * 60 * 24, + "m": 60 * 60 * 24 * 30, + "y": 60 * 60 * 24 * 30 * 12, + } + + def __init__(self, rule): + (startStr, intervalStr) = rule.strip().split("/") + self.start = self.__parseTimeSpec(startStr) + self.interval = "delete" if (intervalStr == "delete") else self.__parseTimeSpec(intervalStr) + + def __parseTimeSpec(self, spec): + import re + if (spec == "0"): + return 0 + + r = re.match("^(\\d+)(%s)$" % "|".join(list(self.magnifiers.keys())), spec) + if (r is None): + raise Exception("Incorrect eviction start/interval specification: %s" % spec) + + digit = int(r.groups()[0]) + magnifier = self.magnifiers[r.groups()[1]] + + return digit * magnifier + + def __str__(self): + return "%d/%d" % (self.start, self.interval) + + +def parse(rules): + rules = [Rule(r) for r in rules.split(",")] + return rules + + +def evict(items, rules, start_point_time=None, accessor=lambda x: x): + """ + Calculate what to evict from given list of versions (version is timestamp value, when each lbackup was created) + """ + + if start_point_time is None: + start_point_time = time.time() + + evictionVersions = [] + # TODO: to cache rules + for rule in parse(rules): + operateVersions = [t for t in items if accessor(t) <= start_point_time - rule.start] + if (rule.interval == "delete"): + # all versions should be evicted catched by this interval + evictionVersions.extend(operateVersions) + else: + # group by interval and leave only first on each + thursday = 3 * 24 * 60 * 60 + for _, versionsIt in groupby(operateVersions, lambda t: int((accessor(t) - thursday) / rule.interval)): + grouped = sorted(list(versionsIt), key=lambda t: accessor(t)) + evictionVersions.extend(grouped[:-1]) + + return sorted(list(set(evictionVersions)), reverse=True) diff --git a/docker/postgres/fsutil.py b/docker/postgres/fsutil.py new file mode 100644 index 0000000..87ae273 --- /dev/null +++ b/docker/postgres/fsutil.py @@ -0,0 +1,71 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
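# (Illustration for the eviction rules implemented in eviction.py above; the rule string
#  and timestamps are hypothetical and not part of this module.)
# A rule string is a comma-separated list of "start/interval" pairs, where the units are
# min, h, d, m, y and an interval of "delete" evicts everything older than "start":
import time
import eviction

now = time.time()
day = 24 * 60 * 60
backups = [now - n * day for n in range(10)]   # one backup per day for ten days

# "0/1d"      -> thin out to at most one backup per day-sized bucket
# "7d/delete" -> drop everything seven or more days old
to_evict = eviction.evict(backups, "0/1d,7d/delete", start_point_time=now)
print("%d of %d timestamps selected for eviction" % (len(to_evict), len(backups)))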
+ +import os +import subprocess + + +def touch(filepath): + open(filepath, "w").close() + + +# def get_folder_size(dirpath): +# total_size = 0 +# for dirpath, dirname, filenames in os.walk(dirpath): +# for f in filenames: +# fp = os.path.join(dirpath, f) +# if os.path.exists(fp): +# total_size += os.path.getsize(fp) +# return total_size + +def get_folder_size(dirpath): + p1 = subprocess.Popen(["du", "-sb", dirpath], stdout=subprocess.PIPE) + size = p1.communicate()[0].split(b"\t")[0] + return size.decode() + + +def get_mount_point(pathname): + """Get the mount point of the filesystem containing pathname""" + pathname = os.path.normcase(os.path.realpath(pathname)) + parent_device = path_device = os.stat(pathname).st_dev + while parent_device == path_device: + mount_point = pathname + pathname = os.path.dirname(pathname) + if pathname == mount_point: break + parent_device = os.stat(pathname).st_dev + return mount_point + + +def get_mounted_device(pathname): + """Get the device mounted at pathname""" + # uses "/proc/mounts" + pathname = os.path.normcase(pathname) # might be unnecessary here + try: + with open("/proc/mounts", "r") as ifp: + for line in ifp: + fields = line.rstrip('\n').split() + # note that line above assumes that + # no mount points contain whitespace + if fields[1] == pathname: + return fields[0] + except EnvironmentError: + pass + return None # explicit + + +def get_fs_space(pathname): + """Get the free space and total of the filesystem containing pathname""" + + stat = os.statvfs(pathname) + return (stat.f_bfree * stat.f_bsize, stat.f_blocks * stat.f_bsize) diff --git a/docker/postgres/gunicorn/__init__.py b/docker/postgres/gunicorn/__init__.py new file mode 100644 index 0000000..342c6f0 --- /dev/null +++ b/docker/postgres/gunicorn/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/docker/postgres/gunicorn/archive.py b/docker/postgres/gunicorn/archive.py new file mode 100644 index 0000000..43ef9d0 --- /dev/null +++ b/docker/postgres/gunicorn/archive.py @@ -0,0 +1,35 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
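# (Illustration for the fsutil.py helpers above; the path is an example only.)
# The helpers are thin wrappers over `du`, /proc/mounts and statvfs:
import fsutil

free, total = fsutil.get_fs_space("/backup-storage")            # bytes free / total on the volume
print("free %d of %d bytes" % (free, total))
print("mount point:", fsutil.get_mount_point("/backup-storage"))
print("folder size:", fsutil.get_folder_size("/backup-storage"), "bytes")  # via `du -sb`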
+ +from flask import Flask +from flask_restful import Api + +import configs +import endpoints.wal +import storage + + +app = Flask('ArchiveEndpoints') +api = Api(app) + +conf = configs.load_configs() +storage_instance = storage.init_storage(storageRoot=conf['storage']) + +api.add_resource(endpoints.wal.Upload, *endpoints.wal.Upload.get_endpoints(), resource_class_args=(storage_instance, )) +api.add_resource(endpoints.wal.Download, *endpoints.wal.Download.get_endpoints(), resource_class_args=(storage_instance, )) +api.add_resource(endpoints.wal.Delete, *endpoints.wal.Delete.get_endpoints(), resource_class_args=(storage_instance, )) + + +if __name__ == '__main__': + app.run() diff --git a/docker/postgres/gunicorn/private.py b/docker/postgres/gunicorn/private.py new file mode 100644 index 0000000..a85eeea --- /dev/null +++ b/docker/postgres/gunicorn/private.py @@ -0,0 +1,35 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from flask_restful import Api +from flask import Flask + +import configs +import endpoints.backup +import endpoints.status +import storage + + +app = Flask("InternalServiceEndpoints") +api = Api(app) + +conf = configs.load_configs() +storage_instance = storage.init_storage(storageRoot=conf['storage']) + +api.add_resource(endpoints.status.List, *endpoints.status.List.get_endpoints(), resource_class_args=(storage_instance, )) +api.add_resource(endpoints.backup.Eviction, *endpoints.backup.Eviction.get_endpoints(), resource_class_args=(storage_instance, )) +api.add_resource(endpoints.backup.Download, *endpoints.backup.Download.get_endpoints(), resource_class_args=(storage_instance, )) + +if __name__ == '__main__': + app.run() diff --git a/docker/postgres/gunicorn/public.py b/docker/postgres/gunicorn/public.py new file mode 100644 index 0000000..861298d --- /dev/null +++ b/docker/postgres/gunicorn/public.py @@ -0,0 +1,60 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
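# (Illustration of the wiring pattern shared by the gunicorn entry points above and below;
#  the Ping resource is hypothetical and not part of this change.)
# Each entry point builds a Flask app, wraps it in flask_restful's Api, and hands the shared
# storage instance to every resource through resource_class_args:
from flask import Flask
from flask_restful import Api, Resource

import configs
import storage


class Ping(Resource):                       # hypothetical resource, for illustration only
    def __init__(self, storage_instance):
        self.__storage = storage_instance

    def get(self):
        return {"storage_type": self.__storage.get_type()}


app = Flask("ExampleEndpoints")
api = Api(app)

conf = configs.load_configs()
storage_instance = storage.init_storage(storageRoot=conf['storage'])

# resource_class_args forwards the storage instance to the resource constructor.
api.add_resource(Ping, '/ping', resource_class_args=(storage_instance,))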
+ +from flask_restful import Api +import os +from flask import Flask + +import configs +import endpoints.backup +import endpoints.restore +import endpoints.status +import storage + +from opentelemetry.instrumentation.flask import FlaskInstrumentor +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.resources import SERVICE_NAME, Resource + + +app = Flask("PublicEndpoints") +collector_endpoint = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", "") +if collector_endpoint != "": + collector_endpoint = "http://" + collector_endpoint + NAMESPACE_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/namespace' + ns = open(NAMESPACE_PATH).read() + resource = Resource(attributes={ + SERVICE_NAME: "postgresql-backup-daemon-" + ns + }) + provider = TracerProvider(resource=resource) + processor = BatchSpanProcessor(OTLPSpanExporter(endpoint=collector_endpoint, insecure=True)) + provider.add_span_processor(processor) + FlaskInstrumentor().instrument_app(app=app, tracer_provider=provider, excluded_urls="health,/health,v2/health,/v2/health") +api = Api(app) + +conf = configs.load_configs() +storage_instance = storage.init_storage(storageRoot=conf['storage']) + +endpoints.restore.ExternalRestoreRequest.cleanup_restore_status(storage_instance) + +api.add_resource(endpoints.status.Status, *endpoints.status.Status.get_endpoints(), resource_class_args=(storage_instance,)) +api.add_resource(endpoints.backup.BackupRequest, *endpoints.backup.BackupRequest.get_endpoints()) +api.add_resource(endpoints.status.Health, *endpoints.status.Health.get_endpoints(), resource_class_args=(storage_instance,)) +api.add_resource(endpoints.status.BackupStatus, *endpoints.status.BackupStatus.get_endpoints(), resource_class_args=(storage_instance,)) +api.add_resource(endpoints.status.ExternalRestoreStatus, *endpoints.status.ExternalRestoreStatus.get_endpoints(), resource_class_args=(storage_instance,)) +api.add_resource(endpoints.restore.ExternalRestoreRequest, *endpoints.restore.ExternalRestoreRequest.get_endpoints(), resource_class_args=(storage_instance,)) + +if __name__ == '__main__': + app.run() diff --git a/docker/postgres/locks.py b/docker/postgres/locks.py new file mode 100644 index 0000000..87bffae --- /dev/null +++ b/docker/postgres/locks.py @@ -0,0 +1,73 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import json +import logging +import os +import fsutil + +from retrying import retry + + +class BackupInProgressLock: + __in_progress_lock_file_path = '/tmp/backup.progress' + + def __init__(self): + self.__log = logging.getLogger(self.__class__.__name__) + + @retry(wait_fixed=300000) # Wait for 300 seconds before retry. 
+ def acquire_lock(self): + # could not lock the resource + #if not self.__lock.acquire(False): + # self.__log.info("New backup can not be started, because last backup is still in progress.") + # raise Exception + #else: + # result = self.__lock.acquire() + self.__log.info("Backup lock has been acquired. File created") + if not os.path.isfile(self.get_lock_file_path()): + fsutil.touch(self.get_lock_file_path()) + with open(self.get_lock_file_path(), "w+") as lock_file: + lock_details = { + 'lock_acquisition_time': datetime.datetime.now().isoformat() + } + lock_file.write(json.dumps(lock_details)) + + def release_lock(self): + os.remove(self.get_lock_file_path()) + self.__log.info("Backup lock has been released.") + + @staticmethod + def get_lock_file_path(): + return BackupInProgressLock.__in_progress_lock_file_path + + +def backup_lock(): + return BackupInProgressLock() + + +def get_backup_lock_file_path(): + return BackupInProgressLock.get_lock_file_path() + + +def update_lock_file(**kwargs): + with open(get_backup_lock_file_path(), "r") as f: + details = json.load(f) + + details.update(kwargs) + temp_lock = '%s.tmp' % get_backup_lock_file_path() + with open(temp_lock, "w") as f: + json.dump(details, f) + + os.rename(temp_lock, get_backup_lock_file_path()) diff --git a/docker/postgres/logging.conf b/docker/postgres/logging.conf new file mode 100644 index 0000000..1a79a1d --- /dev/null +++ b/docker/postgres/logging.conf @@ -0,0 +1,43 @@ +[loggers] +keys=root, gunicorn.error, gunicorn.access, werkzeug + +[handlers] +keys=console + +[formatters] +keys=generic, graylog + +[logger_root] +level=INFO +handlers=console + +[logger_gunicorn.error] +level=INFO +handlers=console +qualname=gunicorn.error + +[logger_werkzeug] +level=WARNING +handlers=console +qualname=werkzeug + +[logger_gunicorn.access] +level=WARNING +handlers=console +qualname=gunicorn.access + +[handler_console] +class=StreamHandler +formatter=graylog +args=(sys.stdout, ) + +[formatter_generic] +format=[%(asctime)s][%(levelname)-5s][category=%(name)s] %(message)s +datefmt=%Y-%m-%dT%H:%M:%S +class=logging.Formatter + + +[formatter_graylog] +format=[%(asctime)s,%(msecs)03d][%(levelname)s][category=%(name)s] %(message)s +datefmt=%Y-%m-%dT%H:%M:%S +class=logging.Formatter diff --git a/docker/postgres/postgres_backup.sh b/docker/postgres/postgres_backup.sh new file mode 100755 index 0000000..403e867 --- /dev/null +++ b/docker/postgres/postgres_backup.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
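# Overview: this script receives the backup destination directory as $1 and an optional
# encryption password as $2. main() detects the PostgreSQL server version to select a
# matching pg_basebackup binary, then streams the base backup through an on-the-fly tar
# integrity check and, depending on STORAGE_TYPE, writes it to the mounted volume, uploads
# it to Swift, delegates to aws-s3-backup.sh for S3, or triggers pgbackrest over HTTP.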
+ + +cd $(dirname "$0") + +readonly AWS_S3_STORAGE="s3" +readonly SWIFT_STORAGE="swift" + +# In case of Encryption password for encryption will be passed as +# 2nd input parameter, 1st one is data_folder +readonly ENCRYPTION_KEY="$2" + +BACKUP_DESTINATION_DIRECTORY="$1" +BACKUP_NAME="pg_${PG_CLUSTER_NAME}_backup_$(basename ${BACKUP_DESTINATION_DIRECTORY}).tar.gz" + +source utils.sh + +function test_swift() { + local out + out=$(/opt/backup/scli ls ${CONTAINER} 2>&1) + process_exit_code $? "$out" +} + +function do_backup() { + if [[ -z "${BACKUP_ID}" ]]; then + log "Backup id must be specified explicitly" + fi + log BACKUP_ID + log "BACKUP_ID" + local validation_pipe="pg-backup-${BACKUP_ID}.pipe" + local pg_basebackup_stderr_file="pg-backup-${BACKUP_ID}.error.log" + local validation_stderr_file="pg-backup-validation-${BACKUP_ID}.error.log" + + register_delete_on_exit "${validation_pipe}" "${validation_stderr_file}" "${pg_basebackup_stderr_file}" + + # Validate TAR stream on the fly. + # This will not validate backup data itself, but will check archive's integrity. + mkfifo "${validation_pipe}" + tar -tz <"${validation_pipe}" > /dev/null 2> "${validation_stderr_file}" & + + log "start backup streaming to mounted storage" + if [[ -n "$ENCRYPTION_KEY" ]]; then + BACKUP_NAME="pg_backup_$(basename ${BACKUP_DESTINATION_DIRECTORY})_enc.tar.gz" + log "Encryption key is set will encrypt backup" + $PG_BASEBACKUP -h "${POSTGRES_HOST}" -p "${POSTGRES_PORT}" -U "${REPLICATION_USER}" -D - -X fetch --format=tar --gzip 2> "${pg_basebackup_stderr_file}" \ + | tee "${validation_pipe}" | openssl enc -aes-256-cbc -nosalt -pass pass:"$ENCRYPTION_KEY" > "${BACKUP_DESTINATION_DIRECTORY}/${BACKUP_NAME}" + else + $PG_BASEBACKUP -h "${POSTGRES_HOST}" -p "${POSTGRES_PORT}" -U "${REPLICATION_USER}" -D - -X fetch --format=tar --gzip 2> "${pg_basebackup_stderr_file}" \ + | tee "${validation_pipe}" > "${BACKUP_DESTINATION_DIRECTORY}/${BACKUP_NAME}" + fi + + # PIPESTATUS can be overridden, so need to keep it. + local exit_codes=(${PIPESTATUS[@]}) + local pg_basebackup_exit_code=${exit_codes[0]} + + # Wait for TAR validation to complete. + wait $! + local validation_exit_code=$? + + local validation_stderr="$(cat ${validation_stderr_file})" + local pg_basebackup_log="$(cat ${pg_basebackup_stderr_file})" + + process_exit_code ${pg_basebackup_exit_code} "pg_basebackup has failed. Details: ${pg_basebackup_log}" + process_exit_code ${validation_exit_code} "Backup archive integrity validation not passed. This backup will be marked as failed. Details: ${validation_stderr}" +} + +function do_swift_backup() { + local pg_backup_pipe="pg-backup-${BACKUP_DESTINATION_DIRECTORY}.pipe" + local pg_basebackup_error_file="pg-backup-${BACKUP_DESTINATION_DIRECTORY}.error.log" + local pg_backup_validation_error_file="pg-backup-validation-${BACKUP_DESTINATION_DIRECTORY}.error.log" + local swift_upload_log_file="pg-backup-${BACKUP_DESTINATION_DIRECTORY}.log" + local swift_object_path="${CONTAINER}/${BACKUP_DESTINATION_DIRECTORY}/${BACKUP_NAME}" + + log "Streaming backup to Swift under path: ${swift_object_path}" + + mkfifo "${pg_backup_pipe}" + tar -tz <"${pg_backup_pipe}" > /dev/null 2> "${pg_backup_validation_error_file}" & + # Validate TAR stream on the fly. + # This will not validate backup data itself, but will check archive's integrity. 
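    # The pipeline below fans the pg_basebackup stream out with tee: one copy feeds the FIFO
    # consumed by the background tar check started above, the other is optionally encrypted
    # with openssl and uploaded to Swift via scli. PIPESTATUS is saved right after the
    # pipeline so the pg_basebackup and scli exit codes can be checked separately.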
+ if [[ -n "$ENCRYPTION_KEY" ]]; then + BACKUP_NAME="pg_backup_$(basename ${BACKUP_DESTINATION_DIRECTORY})_enc.tar.gz" + log "Encryption key is set will encrypt backup" + $PG_BASEBACKUP -h "${POSTGRES_HOST}" -p "${POSTGRES_PORT}" -U "${REPLICATION_USER}" -D - -X fetch --format=tar --gzip 2> "${pg_basebackup_error_file}" \ + | tee "${pg_backup_pipe}" \ + | openssl enc -aes-256-cbc -nosalt -pass pass:"$ENCRYPTION_KEY" \ + | /opt/backup/scli put "${swift_object_path}" 2>&1 > "${swift_upload_log_file}" + else + $PG_BASEBACKUP -h "${POSTGRES_HOST}" -p "${POSTGRES_PORT}" -U "${REPLICATION_USER}" -D - -X fetch --format=tar --gzip 2> "${pg_basebackup_error_file}" \ + | tee "${pg_backup_pipe}" \ + | /opt/backup/scli put "${swift_object_path}" 2>&1 > "${swift_upload_log_file}" + fi + + # PIPESTATUS can be overridden, so need to keep it. + local exit_codes=(${PIPESTATUS[@]}) + local pg_backup_exit_code=${exit_codes[0]} + local swift_upload_exit_code=${exit_codes[2]} + + # Wait for TAR validation to complete. + wait $! + local validation_exit_code=$? + + local swift_log="$(cat ${swift_upload_log_file})" + local validation_log="$(cat ${pg_backup_validation_error_file})" + local pg_basebackup_log="$(cat ${pg_basebackup_error_file})" + + rm "${pg_backup_pipe}" "${swift_upload_log_file}" "${pg_backup_validation_error_file}" "${pg_basebackup_error_file}" + + process_exit_code ${pg_backup_exit_code} "pg_basebackup has failed. Details: ${pg_basebackup_log}" + process_exit_code ${validation_exit_code} "Backup archive integrity validation not passed. This backup will be marked as failed. Details: ${validation_log}" + process_exit_code ${swift_upload_exit_code} "Backup uploading to Swift has failed. Details: ${swift_log}" + + log "PostgreSQL backup streaming to Swift completed successfully." +} + +function remove_backup() { + local out + out=$(rm -f "${BACKUP_DESTINATION_DIRECTORY}/${BACKUP_NAME}" 2>&1) + process_exit_code $? 
"$out" +} + +function main() { + version="$(PGPASSWORD=$POSTGRES_PASSWORD psql -h "${POSTGRES_HOST}" -p "${POSTGRES_PORT}" -U "${POSTGRES_USER}" -d postgres -c "SHOW SERVER_VERSION;" -tA | egrep -o '[0-9]{1,}\.[0-9]{1,}' | awk 'END{print $1}')" + REPLICATION_USER="replicator" + log "version of pgsql server is: ${version}" + if python -c "import sys; sys.exit(0 if float("${version}") >= 16.0 else 1)"; then + log "Using pgsql 16 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/16/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_DESTINATION_DIRECTORY}).tar.gz" + elif python -c "import sys; sys.exit(0 if 15.0 <= float("${version}") < 16.0 else 1)"; then + log "Using pgsql 15 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/15/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_DESTINATION_DIRECTORY}).tar.gz" + elif python -c "import sys; sys.exit(0 if 14.0 <= float("${version}") < 15.0 else 1)"; then + log "Using pgsql 14 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/14/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_DESTINATION_DIRECTORY}).tar.gz" + elif python -c "import sys; sys.exit(0 if 13.0 <= float("${version}") < 14.0 else 1)"; then + log "Using pgsql 13 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/13/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_DESTINATION_DIRECTORY}).tar.gz" + elif python -c "import sys; sys.exit(0 if 12.0 <= float("${version}") < 13.0 else 1)"; then + log "Using pgsql 12 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/12/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_DESTINATION_DIRECTORY}).tar.gz" + elif python -c "import sys; sys.exit(0 if 11.0 <= float("${version}") < 12.0 else 1)"; then + log "Using pgsql 11 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/11/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_DESTINATION_DIRECTORY}).tar.gz" + elif python -c "import sys; sys.exit(0 if 10.0 <= float("${version}") < 11.0 else 1)"; then + log "Using pgsql 10 bins for pg_basebackup" + PG_BASEBACKUP="/usr/lib/postgresql/10/bin/pg_basebackup" + BACKUP_NAME="pg_backup_$(basename ${BACKUP_DESTINATION_DIRECTORY}).tar.gz" + else + if [ "${PG_CLUSTER_NAME}" != "gpdb" ] + then + log "Using pgsql 9.6 bins for pg_basebackup" + PG_BASEBACKUP="/usr/pgsql-9.6/bin/pg_basebackup" + else + log "Using gpdb bins for greenplum pg_basebackup" + TARGET_DB_ID="$(psql -h "${POSTGRES_HOST}" -p "${POSTGRES_PORT}" -U "${POSTGRES_USER}" -d postgres -c "select dbid from gp_segment_configuration where content = -1 and status = 'up' and role = 'p';" -tA )" + PG_BASEBACKUP="/usr/local/greenplum-db/bin/pg_basebackup --target-gp-dbid="${TARGET_DB_ID}"" + REPLICATION_USER=${POSTGRES_USER} + + fi + fi + + if [ "$STORAGE_TYPE" == "${SWIFT_STORAGE}" ]; then + log "check swift is ready" + test_swift + log "do backup" + do_swift_backup + elif [[ "${STORAGE_TYPE}" == "${AWS_S3_STORAGE}" ]]; then + bash aws-s3-backup.sh "${CONTAINER}" "${BACKUP_DESTINATION_DIRECTORY}" + process_exit_code $? "PostgreSQL backup to AWS S3 has finished with an error." 
+ elif [[ "${STORAGE_TYPE}" == "pgbackrest" ]]; then + log "Using pgbackrest as external backuper" + BACKUP_ID=$(basename ${BACKUP_DESTINATION_DIRECTORY}) + log "'$BACKUP_ID'" + log "BACKUP_DESTINATION_DIRECTORY" + curl -H "Content-Type: application/json" -H "Accept: application/json" -d '{"timestamp": "'$BACKUP_ID'"}' -XPOST pgbackrest:3000/backup + elif [[ "$STORAGE_TYPE" == "hostpath" ]] || [[ "$STORAGE_TYPE" == "pv" ]] || [[ "$STORAGE_TYPE" == "pv_label" ]] || [[ "$STORAGE_TYPE" == "provisioned" ]] || [[ "$STORAGE_TYPE" == "provisioned-default" ]]; then + log "do backup" + do_backup + fi + log "completed" +} + +main "$@" diff --git a/docker/postgres/postgres_backup_daemon.py b/docker/postgres/postgres_backup_daemon.py new file mode 100644 index 0000000..745bf79 --- /dev/null +++ b/docker/postgres/postgres_backup_daemon.py @@ -0,0 +1,40 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +import configs +import postgres_backup_scheduler + + +if __name__ == "__main__": + configs.load_logging_configs() + conf = configs.load_configs() + + log = logging.getLogger("PostgreSQLBackupDaemon") + log.info("Backup daemon raised again.") + + backups_params = { + 'backup_command': conf.get_string("command"), + 'storage_root': conf.get_string("storage"), # TODO (vladislav.kaverin): Actually almost nobody needs storage root, only file-system storage. + 'eviction_rule': conf.get_string("eviction") + } + backups_schedule = conf.get_string("schedule") + + # Start scheduler and activate `/backup` endpoint. + backup_scheduler = postgres_backup_scheduler.start(backups_schedule, backups_params) + + + # scheduler <- backup_executor <- storage + # http_api <- storage \ No newline at end of file diff --git a/docker/postgres/postgres_backup_scheduler.py b/docker/postgres/postgres_backup_scheduler.py new file mode 100644 index 0000000..fa4b7bc --- /dev/null +++ b/docker/postgres/postgres_backup_scheduler.py @@ -0,0 +1,192 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
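# (Illustration of the scheduling calculation used by BackupsScheduler below; the cron
#  expression here is only an example, not a shipped default.)
import time
from datetime import datetime
from croniter import croniter

schedule = "0 3 * * *"                  # example: every day at 03:00
cron = croniter(schedule)

next_ts = cron.get_next()               # epoch seconds, as in __reschedule()
delay = max(next_ts - time.time(), 0)   # clamp to zero, mirroring the guard in __reschedule()
print("next backup at %s, in %d seconds" % (datetime.fromtimestamp(next_ts), delay))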
+ +import logging +from datetime import datetime +import threading +import time +import os + +from croniter import croniter +from flask_restful import Resource, Api +from flask import Flask +from queue import Queue, Empty +from storage import VAULT_NAME_FORMAT + +import locks +import storage +import workers + + +# TODO: move to postgres/endpoints +class SchedulerEndpoint(Resource): + + __endpoint = '/schedule' + + def __init__(self, scheduler): + self.__scheduler = scheduler + + @staticmethod + def get_endpoint(): + return SchedulerEndpoint.__endpoint + + def post(self): + return self.__scheduler.enqueue_backup("http-request") + + def get(self): + return self.__scheduler.get_metrics() + + +class BackupsScheduler: + def __init__(self, schedule, backup_options): + self.__log = logging.getLogger("BackupsScheduler") + self.__backup_options = backup_options + + self.__storage = storage.init_storage(storageRoot=self.__backup_options['storage_root']) + self.__task_queue = Queue() + + if schedule.lower() != 'none': + self.__log.info("Start backup scheduler with: %s" % schedule) + self.__cron = croniter(schedule) + self.__reschedule() + else: + self.__next_timestamp = None + self.__log.info("Skip backup schedule.") + + def __reschedule(self): + self.__next_timestamp = self.__cron.get_next() + self.__log.info("Scheduled next run at %s" % datetime.fromtimestamp(self.__next_timestamp).strftime( + "%Y-%m-%d %H:%M:%S")) + + # TODO check on negative value after substraction + delay = self.__next_timestamp - time.time() + if delay < 0: + self.__log.warn("Task execution performed longer than specified repeat interval") + delay = 0 + + self.timer = threading.Timer(delay, self.__execute_and_reschedule) + self.timer.setDaemon(True) + self.timer.start() + + def __execute_and_reschedule(self): + self.__log.info("[reason=schedule] Requesting new backup by schedule.") + self.enqueue_backup("schedule") + self.__reschedule() + + def run(self): + # Accept manual requests for backup. + _activate_endpoint(self) + self.__log.info("Waiting for backup requests...") + + while True: + try: + backup = self.__task_queue.get(True, timeout=1) + except Empty: + continue + + lock = locks.backup_lock() + try: + + + lock.acquire_lock() + self.__log.info("[reason={}] Spawn new backup worker. 
Backup requests queue length: {}".format + (backup.get_reason(), self.__task_queue.qsize())) + oldest_backup = self.__storage.get_oldest_backup() + worker = workers.spawn_backup_worker(self.__storage, + backup_command=self.__backup_options['backup_command'], + eviction_rule=self.__backup_options['eviction_rule'], + backup_id=backup.get_backup_id() + ) + + if oldest_backup: + self.__log.info("Id of latest backup: {}".format(oldest_backup.get_id())) + spent_time = oldest_backup.load_metrics().get('spent_time') + + if spent_time: + # time stored as milliseconds converting to seconds and double the value + time_out = spent_time / 1000 * 2 + self.__log.info("Setting timeout for backup process: {}".format(time_out)) + worker.join(time_out) + + else: + worker.join() + else: + worker.join() + + self.__log.info("Worker completed: {}".format(not worker.is_alive())) + + if worker.is_alive(): + self.__log.error("Backup worker for {} is not completed after timeout: {}".format(backup.get_backup_id(), time_out)) + worker.fail() + worker.kill() + raise Exception("Backup worker timeout exceeded") + + except: + self.__log.error("Error execute schedule callback", exc_info=1) + finally: + lock.release_lock() + self.__task_queue.task_done() + + def enqueue_backup(self, reason="manual"): + backup_id = datetime.now().strftime(VAULT_NAME_FORMAT) + backup = Backup(backup_id, reason) + + self.__task_queue.put(backup) + queue_size = self.__task_queue.qsize() + self.__log.info("[reason={}] New backup request has been received and added to queue. Queue length: {}".format + (reason, queue_size)) + + return { + 'accepted': True, + 'reason': reason, + 'backup_requests_in_queue': queue_size, + 'message': "PostgreSQL backup has been scheduled successfully.", + 'backup_id': backup_id + } + + def get_metrics(self): + return { + 'requests_in_queue': self.__task_queue.qsize(), + 'time_until_next_backup': 'none' if self.__next_timestamp is None else self.__next_timestamp - time.time(), + 'eviction_rule': self.__backup_options['eviction_rule'] + } + + +class Backup: + def __init__(self, backup_id, reason): + self.__backup_id = backup_id + self.__reason = reason + + def get_backup_id(self): + return self.__backup_id + + def get_reason(self): + return self.__reason + + +def start(backups_schedule, backups_params): + scheduler = BackupsScheduler(backups_schedule, backups_params) + scheduler.run() + + +def _activate_endpoint(scheduler): + app = Flask("ScheduleEndpoint") + api = Api(app) + api.add_resource(SchedulerEndpoint, SchedulerEndpoint.get_endpoint(), resource_class_args=(scheduler, )) + backup_endpoint_thread = threading.Thread(target=app.run, args=('127.0.0.1', 8085)) + backup_endpoint_thread.setDaemon(True) + backup_endpoint_thread.start() + + log = logging.getLogger("ScheduleEndpoint") + log.info("Endpoint `/schedule` has been activated.") diff --git a/docker/postgres/start_backup_daemon.sh b/docker/postgres/start_backup_daemon.sh new file mode 100644 index 0000000..a13acfd --- /dev/null +++ b/docker/postgres/start_backup_daemon.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +BACKUP_DAEMON_DIR="$(cd $(dirname $0); pwd)" + +DEFAULT_WORKERS_NUMBER=2 # Suppose that one accepted long-running request and another deals with quick ones. +DEFAULT_WORKERS_TIMEOUT=21600 # 6 hours + +PUBLIC_ENDPOINTS_WORKERS_NUMBER=${PUBLIC_ENDPOINTS_WORKERS_NUMBER:-${DEFAULT_WORKERS_NUMBER}} +PRIVATE_ENDPOINTS_WORKERS_NUMBER=${PRIVATE_ENDPOINTS_WORKERS_NUMBER:-${DEFAULT_WORKERS_NUMBER}} +ARCHIVE_ENDPOINTS_WORKERS_NUMBER=${ARCHIVE_ENDPOINTS_WORKERS_NUMBER:-${DEFAULT_WORKERS_NUMBER}} + +WORKERS_TIMEOUT=${WORKERS_TIMEOUT:-${DEFAULT_WORKERS_TIMEOUT}} +LOG_FORMAT=${LOG_FORMAT:-generic} + +function update_logging_configuration() { + sed -i s/formatter=.*/formatter=${LOG_FORMAT}/g ${BACKUP_DAEMON_DIR}/logging.conf + if [[ -z "${LOG_LEVEL}" ]]; then + return + fi + sed -i s/level=.*/level=${LOG_LEVEL}/g ${BACKUP_DAEMON_DIR}/logging.conf + +} + +function check_ipv6() { + if [[ -d "/proc/sys/net/ipv6" ]]; then + echo "IPv6 availiable will listen on [::]" + export LISTEN_ADDR="[::]" + else + echo "IPv6 is not availiable will listen on 0.0.0.0" + export LISTEN_ADDR="0.0.0.0" + fi +} + + +function ride_unicorn() { + params="--daemon --timeout ${WORKERS_TIMEOUT} --preload --enable-stdio-inheritance --log-config ${BACKUP_DAEMON_DIR}/logging.conf" + if [[ $TLS ]]; then + params="${params} --certfile=/certs/tls.crt --keyfile=/certs/tls.key" + fi + # 2 workers + # Roughly, one is for /health endpoint since it's quite fast operation and should not block, + # second one is for /backup which just schedules backups. + echo ${params} + gunicorn --chdir "${BACKUP_DAEMON_DIR}/gunicorn" \ + ${params} \ + -w ${PUBLIC_ENDPOINTS_WORKERS_NUMBER} \ + --pythonpath ${BACKUP_DAEMON_DIR} \ + -b "${LISTEN_ADDR}":8080 \ + public:app + + # 2 workers + # /get is long-running operation so it will be blocked + # /list and /evict are quite short. + gunicorn --chdir "${BACKUP_DAEMON_DIR}/gunicorn" \ + ${params} \ + -w ${PRIVATE_ENDPOINTS_WORKERS_NUMBER} \ + --pythonpath ${BACKUP_DAEMON_DIR} \ + -b "${LISTEN_ADDR}":8081 \ + private:app + + gunicorn --chdir "${BACKUP_DAEMON_DIR}/gunicorn" \ + ${params} \ + -w ${ARCHIVE_ENDPOINTS_WORKERS_NUMBER} \ + --pythonpath ${BACKUP_DAEMON_DIR} \ + -b "${LISTEN_ADDR}":8082 \ + archive:app + + # Granular backup are async, so one worker should be more than enough. + # Timeout is just 60 seconds since it should be enough to parse and validate any request. + gunicorn --chdir ${BACKUP_DAEMON_DIR}/granular \ + ${params} \ + -w 1 \ + --pythonpath ${BACKUP_DAEMON_DIR}/granular \ + -b "${LISTEN_ADDR}":9000 \ + granular:app +} + +function summon_daemon() { + python "${BACKUP_DAEMON_DIR}/postgres_backup_daemon.py" +} + +function check_user(){ + cur_user=$(id -u) + if [ "$cur_user" != "0" ] + then + echo "Adding randomly generated uid to passwd file..." + sed -i '/backup/d' /etc/passwd + if ! 
whoami &> /dev/null; then + if [ -w /etc/passwd ]; then + echo "${USER_NAME:-backup}:x:$(id -u):0:${USER_NAME:-backup} user:/backup-storage:/sbin/nologin" >> /etc/passwd + fi + fi + fi +} + +check_ipv6 +check_user +update_logging_configuration +ride_unicorn +summon_daemon \ No newline at end of file diff --git a/docker/postgres/storage.py b/docker/postgres/storage.py new file mode 100644 index 0000000..077507f --- /dev/null +++ b/docker/postgres/storage.py @@ -0,0 +1,529 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc +import binascii +import hashlib +import io +import json +import logging +import re +import os +import time +import locks +import utils +import errno +import subprocess +import configs +from retrying import retry +from traceback import format_exception + +try: + from io import StringIO +except ImportError: + from io import StringIO + + +class StorageLocationAlreadyExistsException(Exception): + pass + + +VAULT_NAME_FORMAT = "%Y%m%dT%H%M" +VAULT_DIRNAME_MATCHER = re.compile("\\d{8}T\\d{4}", re.IGNORECASE) +# pg_{}_archive_{} +ARCHIVE_NAME_MATCHER = re.compile("pg_(.*)_archive_(?P[\da-f]+(\.\d+\.backup)?(\.partial)?(\.bk)?)$", re.IGNORECASE) +# print(ARCHIVE_NAME_MATCHER.match("pg_common_archive_000000320000004600000066").group("name")) +# print(ARCHIVE_NAME_MATCHER.match("pg_common_archive_0000001B000000100000003E.bk").group("name")) +# print(ARCHIVE_NAME_MATCHER.match("pg_common_archive_0000003200000046000000A3.00000028.backup").group("name")) +# print(ARCHIVE_NAME_MATCHER.match("pg_common_archive_000000310000004400000020.partial").group("name")) +# print(ARCHIVE_NAME_MATCHER.match("pg_common_archive_0000001B0000001B00000015.partial.bk").group("name")) + +PG_CLUSTER_NAME = os.getenv("PG_CLUSTER_NAME") +ARCHIVE_EVICT_POLICY = os.getenv("ARCHIVE_EVICT_POLICY") +unit_pattern = re.compile("(?P\d*)(?P.+)") +memory_pattern = re.compile("\d+(\.\d+)?(?P[kmgt][bB]?)?") +memory_units = { + "ki": 1, "mi": 1024, "gi": 1048576, + "k": 1, "m": 1000, "g": 1000000, + "kb": 1, "mb": 1024, "gb": 1048576, "tb": 1073741824 +} +time_pattern = re.compile("\d+(\.\d+)?(?P(ms|s|min|h|d))?") +time_utins = { + "ms": 1, "s": 1000, "min": 60000, "h": 3600000, "d": 86400000 +} +log = logging.getLogger("Storage") + + +class StorageException(Exception): + def __init__(self, msg): + super(Exception, self).__init__(msg) + +#@retry(retry_on_exception=utils.retry_if_storage_error, wait_fixed=1000) +def init_storage(storageRoot): + if configs.is_external_pg(): + version_postfix = "external" + log.info("External Postgres is used, storage folder is \"external\"") + else: + version_postfix = utils.get_version_of_pgsql_server() + if version_postfix is None: + import fnmatch + pg_dirs = fnmatch.filter(os.listdir(storageRoot), 'pg*') + if not pg_dirs: + raise StorageException("No suitable directories are found, retrying") + log.info(f"Possible directories for backup store {pg_dirs}") + versions = sorted([int(re.search(r'\d+', x).group()) for x in 
pg_dirs], reverse=True) + version_postfix = "pg" + str(versions[0]) + + log.info(f"PostgreSQL server version is equal to {version_postfix}, " + f"so will save all backups in {version_postfix} dir") + storageRoot = os.path.join(storageRoot, version_postfix) + + # folders + for folder in ["archive", "granular"]: + dirs_to_create = os.path.join(storageRoot, folder) + if not os.path.exists(dirs_to_create): + try: + os.makedirs(dirs_to_create) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + if os.environ['STORAGE_TYPE'] not in ["s3", "swift"]: + storage = os.path.join(storageRoot, "granular") + for namespace in os.listdir(storage): + for backup_id in os.listdir(os.path.join(storage, namespace)): + status_file = os.path.join(storage, namespace, backup_id, "status.json") + if os.path.isfile(status_file): + with open(status_file , "r+") as f: + try: + status_json = json.load(f) + except ValueError as e: + log.error("Failed to read the status file {} Error: {}".format(status_file,e)) + else: + if status_json['status'] == 'In progress': + log.info("Attempt to change the status Path: {} ".format(status_file)) + status_json['status'] = 'Failed' + f.seek(0) + f.write(json.dumps(status_json)) + f.truncate() + + if os.environ['STORAGE_TYPE'] == "swift": + import storage_swift + return storage_swift.SwiftStorage(storageRoot) + if os.environ['STORAGE_TYPE'] == "s3": + import storage_s3 + return storage_s3.AwsS3Storage(storageRoot) + if os.environ['STORAGE_TYPE'] == 'pgbackrest': + import storage_pgbackrest + print("Backrest storage init") + return storage_pgbackrest.BackRestStorage(storageRoot) + import storage_fs + return storage_fs.FSStorage(storageRoot) + + +class Storage(metaclass=abc.ABCMeta): + __log = logging.getLogger("Storage") + + @abc.abstractmethod + def list(self): + """ + :return: list of available except locked one. + :rtype: list + """ + raise NotImplementedError + + @abc.abstractmethod + def size(self): + """ + :return: occupied space size in bytes + :rtype: int + """ + raise NotImplementedError + + @abc.abstractmethod + def archive_size(self): + """ + :return: occupied space size in bytes + :rtype: int + """ + raise NotImplementedError + + @abc.abstractmethod + def fs_space(self): + """ + :return: tuple (free, total) space on mount point where is root folder located + :rtype: (int, int) + """ + raise NotImplementedError + + @abc.abstractmethod + def open_vault(self): + """ + :return: + :rtype: (str, dict, StringIO) + """ + raise NotImplementedError + + @abc.abstractmethod + def evict_vault(self, vault): + """ + Removes vault and associated files from storage + :param vault: + :return: + """ + raise NotImplementedError + + @abc.abstractmethod + def prot_is_file_exists(self, filename): + """ + Internal method. Should not use outside storage. + :param filename: filename with path from entry point. For fs storage entrypoint is /backup-storage/. + :return: + """ + raise NotImplementedError + + @abc.abstractmethod + def prot_delete(self, filename): + """ + Internal method. Should not use outside storage. + :param filename: filename with path from entry point. For fs storage entrypoint is /backup-storage/. + :return: + """ + raise NotImplementedError + + @abc.abstractmethod + def prot_get_as_stream(self, filename): + """ + Internal method. Should not use outside storage. + :param filename: path to file from backup root. 
+ :type filename: string + :return: stream with requested file + :rtype: io.RawIOBase + """ + raise NotImplementedError + + @abc.abstractmethod + def prot_put_as_stream(self, filename, stream): + """ + Internal method. Should not use outside storage. + :param filename: + :type filename: string + :param stream: + :type stream: io.RawIOBase + :return: sha256 of processed stream + :rtype: string + """ + raise NotImplementedError + + @abc.abstractmethod + def prot_get_file_size(self, filename): + """ + Internal method. Should not use outside storage. + :param filename: path to file from backup root. + :type filename: string + :return: size of file in bytes + :rtype: int + """ + raise NotImplementedError + + @abc.abstractmethod + def prot_list_archive(self): + raise NotImplementedError + + @abc.abstractmethod + def get_type(self): + raise NotImplementedError + + @abc.abstractmethod + def get_type_id(self): + return -1 + + @abc.abstractmethod + def get_encryption(self): + raise NotImplementedError + + def is_archive_evict_policy_set(self): + return ARCHIVE_EVICT_POLICY is not None + + def evict(self, vault): + """ + Removes vault and associated files from storage + :param vault: + :return: + """ + self.evict_vault(vault) + self.evict_archive() + + def get_oldest_backup(self): + """ + :return: + :rtype: Vault + """ + vaults = reversed(self.list()) + for vault in vaults: + if not vault.is_failed() and vault.is_back_up_archive_exists(): + return vault + return None + + def evict_archive(self): + archives = self.prot_list_archive() + if archives: + self.__log.info("Stored archive list {}".format(archives)) + if ARCHIVE_EVICT_POLICY: + ups = unit_pattern.search(ARCHIVE_EVICT_POLICY) + evict_rule_unit = ups.group("unit") + evict_rule_value = int(ups.group("mult")) + if evict_rule_unit.lower() in memory_units: + value_kib = evict_rule_value * memory_units[evict_rule_unit.lower()] + self.__evict_by_size(archives, 1024 * value_kib) + elif evict_rule_unit.lower() in time_utins: + value_ms = evict_rule_value * time_utins[evict_rule_unit.lower()] + self.__evict_by_time(archives, int((time.time() * 1000 - value_ms))) + else: + self.__log.error( + "Cannot parse eviction policy {}. " + "Will use oldest backup to perform eviction.". + format(ARCHIVE_EVICT_POLICY)) + self.__evict_by_oldest_backup(archives) + + else: + self.__evict_by_oldest_backup(archives) + + def __evict_by_oldest_backup(self, archives): + oldest_backup = self.get_oldest_backup() + self.__log.info("Oldest available backup {}".format(oldest_backup)) + if oldest_backup: + bct = oldest_backup.create_timestamp() + else: + self.__log.warning( + "Does not have oldest backup. 
" + "Will use current time and remove all files in archive.") + bct = int(time.time() * 1000) + self.__evict_by_time(archives, bct) + + def __evict_by_time(self, archives, keep_time): + self.__log.info("Eviction timestamp {}".format(keep_time)) + for archive in archives: + self.__log.debug("Check WAL".format(archive)) + if keep_time > archive.timestamp: + self.delete_archive(archive.filename) + + def __evict_by_size(self, archives, keep_bytes): + self.__log.info("Eviction limit {} bytes".format(keep_bytes)) + sorted_archives = sorted(list(archives), key=lambda t: t.timestamp, reverse=True) + occupied_space = 0 + freed_space = 0 + for archive in sorted_archives: + self.__log.debug("Check WAL".format(archive)) + full_name = self.__get_fullfilename_for_archive(archive.filename) + size = self.prot_get_file_size(full_name) + occupied_space = occupied_space + size + if occupied_space > keep_bytes: + self.delete_archive(archive.filename) + freed_space = freed_space + size + self.__log.info("Occupied before: {} bytes. Freed: {} bytes.".format(occupied_space, freed_space)) + + def get_backup_as_stream(self, vault): + """ + :return: stream with requested backup + :rtype: io.RawIOBase + """ + self.__log.info("Get request for vault: %s" % vault) + backup_name = "{}/pg_{}_backup_{}.tar.gz".format(vault.get_id(), PG_CLUSTER_NAME, vault.get_id()) + + # for pg10 backups there is another naming convention `pg_backup_{timestamp}` + if not self.prot_is_file_exists(backup_name): + backup_name = "{}/pg_backup_{}.tar.gz".format(vault.get_id(), vault.get_id()) + + # encrypted backups + if not self.prot_is_file_exists(backup_name): + backup_name = "{}/pg_backup_{}_enc.tar.gz".format(vault.get_id(), + vault.get_id()) + + return self.prot_get_as_stream(backup_name) + + def __get_fullfilename_for_archive(self, filename): + """ + Returns full filename from storage root in form like 'archive/pg_common_archive_00000000010000000001(_enc)?' 
+ :param filename: + :return: + """ + if self.get_encryption(): + filename = filename + "_enc" + archive_full_name = "archive/pg_{}_archive_{}".format(PG_CLUSTER_NAME, filename) + return archive_full_name + + def __get_fullfilename_for_archive_sha(self, filename): + sha_name = "archive/pg_{}_archive_{}.sha".format(PG_CLUSTER_NAME, filename) + return sha_name + + def is_archive_exists(self, filename): + return self.prot_is_file_exists(self.__get_fullfilename_for_archive(filename)) + + def get_archive_as_stream(self, filename): + """ + :return: stream with requested archive + :rtype: io.RawIOBase + """ + self.__log.info("Get request for archive: %s" % filename) + return self.prot_get_as_stream(self.__get_fullfilename_for_archive(filename)) + + def put_archive_as_stream(self, filename, stream): + """ + :return: stream with requested archive + :rtype: io.RawIOBase + """ + self.__log.info("Put request for archive: %s" % filename) + return self.prot_put_as_stream(self.__get_fullfilename_for_archive(filename), stream) + + def store_archive_checksum(self, filename, sha256): + sha_name = self.__get_fullfilename_for_archive_sha(filename) + to_store = str(binascii.crc32(sha256.encode())) + "_" + sha256 + self.prot_put_as_stream(sha_name, io.BytesIO(to_store.encode())) + + def delete_archive(self, filename): + self.__log.info("Delete request for archive: %s" % filename) + self.prot_delete(self.__get_fullfilename_for_archive(filename)) + sha_filename = self.__get_fullfilename_for_archive_sha(filename) + if self.prot_is_file_exists(sha_filename): + self.prot_delete(sha_filename) + + def get_sha256sum_for_archive(self, filename): + sha_name = self.__get_fullfilename_for_archive_sha(filename) + if self.prot_is_file_exists(sha_name): + stream = self.prot_get_as_stream(sha_name) + with stream as f: + result = f.read().strip() + try: + crc = int(result.split("_")[0]) + sha256 = result.split("_")[1] + if (crc == binascii.crc32(sha256.encode())): + return sha256 + else: + self.__log.warning("CRC32 check failed for sha file {}.".format(filename)) + return None + except Exception as e: + return None + return None + + def calculate_sha256sum_for_archive(self, filename): + sha256 = hashlib.sha256() + chunk_size = 4096 + stream = self.get_archive_as_stream(filename) + with stream as f: + while True: + data = f.read(chunk_size) + if len(data) == 0: + stream.close() + self.__log.info( + "Calculated sha256 {} for local archive".format( + sha256.hexdigest())) + return sha256.hexdigest() + sha256.update(data) + + def get_backup_in_progress_metrics(self): + is_backup_in_progress = os.path.isfile(locks.get_backup_lock_file_path()) + in_progress_metrics = {'backup_is_in_progress': is_backup_in_progress} + + if is_backup_in_progress: + with open(locks.get_backup_lock_file_path(), "r") as f: + in_progress_metrics.update(json.load(f)) + + return in_progress_metrics + + +class Archive: + def __init__(self, filename, timestamp): + self.filename = filename + self.timestamp = timestamp + + def __repr__(self): + return "Archive name: {}, timestamp: {}".format(self.filename, self.timestamp) + + def __eq__(self, other): + if isinstance(other, Archive): + return self.filename == other.filename + return False + + def __ne__(self, other): + return not self.__eq__(other) + + +class Vault(metaclass=abc.ABCMeta): + def __init__(self): + super(Vault, self).__init__() + + self.metrics = {} + + @abc.abstractmethod + def get_id(self): raise NotImplementedError + + @abc.abstractmethod + def get_folder(self): raise NotImplementedError + + 
@abc.abstractmethod + def load_metrics(self): raise NotImplementedError + + @abc.abstractmethod + def is_locked(self): raise NotImplementedError + + @abc.abstractmethod + def is_failed(self): raise NotImplementedError + + @abc.abstractmethod + def is_done(self): raise NotImplemented + + @abc.abstractmethod + def create_time(self): raise NotImplementedError + + @abc.abstractmethod + def is_back_up_archive_exists(self): raise NotImplementedError + + def create_timestamp(self): + return int(self.create_time() * 1000) + + def __enter__(self): + self.start_timestamp = create_timestamp_as_long() + + def __exit__(self, exc_type, exc_val, exc_tb): + end_backup_timestamp = create_timestamp_as_long() + + self.metrics["end_backup_timestamp"] = end_backup_timestamp + self.metrics["spent_time"] = end_backup_timestamp - self.start_timestamp + + def __lt__(self, other): + return self.create_timestamp() < other.create_timestamp() + + def __gt__(self, other): + return self.create_timestamp() > other.create_timestamp() + + def to_json(self): + metrics = self.load_metrics() + + result = { + "id": self.get_id(), + "failed": self.is_failed(), + "locked": self.is_locked(), + "ts": self.create_timestamp(), + "metrics": metrics + } + + end_backup_timestamp = metrics.get("end_backup_timestamp") + if end_backup_timestamp: + result["end_timestamp_ago"] = create_timestamp_as_long() - end_backup_timestamp + + return result + + +def create_timestamp_as_long(): + return int(time.time() * 1000) diff --git a/docker/postgres/storage_fs.py b/docker/postgres/storage_fs.py new file mode 100644 index 0000000..b16148b --- /dev/null +++ b/docker/postgres/storage_fs.py @@ -0,0 +1,290 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
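+
+# Filesystem-backed storage. The snippet below is an illustrative comment only;
+# the concrete root path and "pg16" suffix are examples, not requirements:
+#
+#   fs = FSStorage("/backup-storage/pg16")
+#   with fs.open_vault("20240101T0100") as (folder, metrics, console):
+#       pass  # stream the backup archive into `folder`
+#   free_bytes, total_bytes = fs.fs_space()
+#
+# Backups are stored under <root>/<YYYYMMDDTHHMM>/, WAL files under
+# <root>/archive/, and vault state is tracked with marker files
+# (.lock, .failed, .metrics, .console) inside each vault folder.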
+ +import io + +import errno +import os +from datetime import datetime +import logging +from traceback import format_exception +import time +import json +import fsutil +import storage +from storage import VAULT_NAME_FORMAT, VAULT_DIRNAME_MATCHER, ARCHIVE_NAME_MATCHER, \ + StorageLocationAlreadyExistsException +import utils +import encryption + +try: + from io import StringIO +except ImportError: + from io import StringIO + +PG_CLUSTER_NAME = os.getenv("PG_CLUSTER_NAME") +STORAGE_ROOT = os.getenv("STORAGE_ROOT") + + +class FSStorage(storage.Storage): + __log = logging.getLogger("FSStorage") + + def __failed_filepath(self): + return ".failed" + + def __init__(self, root): + self.__log.info("Init storage object with storage root: %s" % root) + self.root = root + if utils.get_encryption(): + self.encryption = True + else: + self.encryption = False + + def fail(self, backup_id): + self.__log.info("create .failed file: {}".format(os.path.join(self.root, backup_id, self.__failed_filepath()))) + open(os.path.join(self.root, backup_id, self.__failed_filepath()), "w").close() + + def get_encryption(self): + return self.encryption + + def list(self): + if os.path.exists(self.root): + vaults = [FSVault(self.root + "/" + dirname) + for dirname in os.listdir(self.root) + if VAULT_DIRNAME_MATCHER.fullmatch(dirname) is not None] + vaults.sort(key=lambda v: v.create_time()) + return vaults + else: + return [] + + def size(self): + """ Returns whole storage size in bytes """ + return fsutil.get_folder_size(self.root) + + def archive_size(self): + """ Returns whole storage size in bytes """ + return fsutil.get_folder_size(os.path.join(self.root, "archive")) + + def fs_space(self): + """ Returns tuple (free, total) space on mount point where is root folder located """ + return fsutil.get_fs_space(self.root) + + def open_vault(self, backup_id): + """ + :return: + :rtype: (str, dict, StringIO) + """ + return FSVault("%s/%s" % (self.root, backup_id)) + + def evict_vault(self, vault): + self.__log.info("Evict vault: %s" % vault) + self.__log.debug("Delete folder: %s" % vault.folder) + for root, dirs, files in os.walk(vault.folder, topdown=False): + for name in files: + try: + os.remove(os.path.join(root, name)) + except OSError as e: # passing possible problems with permission + self.__log.exception(e) + for name in dirs: + try: + os.rmdir(os.path.join(root, name)) + except OSError as e: + self.__log.exception(e) + try: + os.rmdir(vault.folder) + except OSError as e: + self.__log.exception(e) + + def prot_is_file_exists(self, filename): + self.__log.debug("Check for file: %s" % filename) + return os.path.isfile("{}/{}".format(self.root, filename)) + + def prot_delete(self, filename): + self.__log.info("Delete file: %s" % filename) + os.remove("{}/{}".format(self.root, filename)) + + def prot_get_as_stream(self, filename): + """ + :param filename: path to file from backup root. 
+ :type filename: string + :return: stream with requested file + :rtype: io.RawIOBase + """ + self.__log.info("Get request for file: %s" % filename) + full_file_path = "{}/{}".format(self.root, filename) + + return encryption.FileWrapper(full_file_path, self.encryption) \ + .get_file_stream() + + def prot_put_as_stream(self, filename, stream): + """ + :param filename: + :type filename: string + :param stream: + :type stream: io.RawIOBase + :return: sha256 of processed stream + :rtype: string + """ + self.__log.info("Put request for file: %s" % filename) + fs_filename = "{}/{}".format(self.root, filename) + if not os.path.exists(os.path.dirname(fs_filename)): + try: + os.makedirs(os.path.dirname(fs_filename)) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + return encryption.FileWrapper(fs_filename, self.encryption) \ + .put_file_stream(stream) + + def prot_get_file_size(self, filename): + # self.__log.info("Size request for file: %s" % filename) + if os.path.exists("{}/{}".format(self.root, filename)): + return os.path.getsize("{}/{}".format(self.root, filename)) + return 0 + + def prot_list_archive(self): + archive_root = os.path.join(self.root, "archive") + if os.path.exists(archive_root): + self.__log.debug("Archive directory listing {}".format(os.listdir(archive_root))) + archives = [storage.Archive( + ARCHIVE_NAME_MATCHER.match(filename).group("name"), + os.path.getmtime(os.path.join(archive_root, filename)) * 1000) + for filename in os.listdir(archive_root) + if ARCHIVE_NAME_MATCHER.match(filename) is not None] + return archives + else: + self.__log.debug("Archive directory does not exist") + return [] + + def get_type(self): + return "Volume" + + def get_type_id(self): + return 0 + + +class FSVault(storage.Vault): + __log = logging.getLogger("FSVaultLock") + + def __init__(self, folder): + super(FSVault, self).__init__() + + self.folder = folder + self.metrics_filepath = self.folder + "/.metrics" + self.console = StringIO() + + def get_id(self): + return os.path.basename(self.folder) + + def get_folder(self): + return self.folder + + def load_metrics(self): + self.__log.debug("Load metrics from: %s" % self.metrics_filepath) + + if not os.path.isfile(self.metrics_filepath): + self.__log.warning("Metrics file: {} does not exists.".format(self.metrics_filepath)) + return {} + + with open(self.metrics_filepath, "r") as f: + try: + return json.load(f) + except Exception as e: + self.__log.exception(e) + self.__log.warning("Cannot load metrics file: {}. 
Metrics file can be damaged.".format(self.metrics_filepath)) + return {} + + def __lock_filepath(self): + return self.folder + "/.lock" + + def __failed_filepath(self): + return self.folder + "/.failed" + + def __console_filepath(self): + return self.folder + "/.console" + + def __metrics_filepath(self): + return self.folder + "/.metrics" + + def is_locked(self): + return os.path.exists(self.__lock_filepath()) + + def is_failed(self): + return os.path.exists(self.__failed_filepath()) + + def is_done(self): + if not os.path.isfile(self.__metrics_filepath()): + return False + with open(self.__metrics_filepath(), "r") as f: + j = json.load(f) + return j['exit_code'] == 0 + + def is_back_up_archive_exists(self): + # for pg10 backups there is another naming convention `pg_backup_{timestamp}` + return os.path.exists(self.folder + "/pg_{}_backup_{}.tar.gz".format(PG_CLUSTER_NAME, self.get_id())) \ + or os.path.exists(self.folder + "/pg_backup_{}.tar.gz".format(self.get_id())) \ + or os.path.exists(self.folder + "/pg_{}_backup_{}_enc.tar.gz".format(PG_CLUSTER_NAME, self.get_id())) \ + or os.path.exists(self.folder + "/pg_backup_{}_enc.tar.gz".format(self.get_id())) + + def __enter__(self): + self.__log.info("Init next vault: %s" % self.folder) + super(FSVault, self).__enter__() + + if not os.path.exists(self.folder): + os.makedirs(self.folder) + else: + raise StorageLocationAlreadyExistsException("Destination backup folder already exists: %s" % self.folder) + + self.__log.info("Create .lock file in vault: %s" % self.folder) + fsutil.touch(self.__lock_filepath()) + + return self.folder, self.metrics, self.console + + def create_time(self): + foldername = os.path.basename(self.folder) + d = datetime.strptime(foldername, VAULT_NAME_FORMAT) + return time.mktime(d.timetuple()) + + def __exit__(self, tpe, exception, tb): + self.__log.info("Close vault") + self.__log.info("Save metrics to: %s" % self.metrics_filepath) + + super(FSVault, self).__exit__(tpe, exception, tb) + + self.metrics["size"] = fsutil.get_folder_size(self.folder) + + if exception: + fsutil.touch(self.__failed_filepath()) + e = "\n".join(format_exception(tpe, exception, tb)) + self.__log.info("Don't remove vault .lock due exception in nested code") + self.__log.debug("Something wrong happened inside block uses vault: " + e) + self.metrics["exception"] = e + self.fail() + + with open(self.metrics_filepath, "w") as f: + json.dump(self.metrics, f) + + console_logs = self.console.getvalue() + self.console.close() + with open(self.__console_filepath(), "w") as f: + f.write(console_logs) + + self.__log.info("Remove lock for %s" % self.get_id()) + os.unlink(self.__lock_filepath()) + + def __repr__(self): + return "Vault(%s)" % os.path.basename(self.folder) + + def fail(self): + open(self.folder + self.__failed_filepath(), "w").close() diff --git a/docker/postgres/storage_pgbackrest.py b/docker/postgres/storage_pgbackrest.py new file mode 100644 index 0000000..2470df0 --- /dev/null +++ b/docker/postgres/storage_pgbackrest.py @@ -0,0 +1,243 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import io +import json +import logging +import time +from datetime import datetime +from traceback import format_exception + +import os + +import storage +from storage import VAULT_NAME_FORMAT, StorageLocationAlreadyExistsException, VAULT_DIRNAME_MATCHER +import fsutil +import requests + +try: + from io import StringIO +except ImportError: + from io import StringIO + +import subprocess + +class BackRestStorage(storage.Storage): + __log = logging.getLogger("PgBackRestFS") + + def __init__(self, root): + self.__log.info("Init storage object with storage root: %s" % root) + self.root = root + + def list(self): + + response = requests.get("http://pgbackrest:3000/list").json() + print(response) + vault = [BackRestVault(backup['annotation']['timestamp']) for backup in response] + return vault + + def size(self): + """ Returns whole storage size in bytes """ + return 0 + + def archive_size(self): + """ Returns whole storage size in bytes """ + return 0 + + def fs_space(self): + """ Returns tuple (free, total) space on mount point where is root folder located """ + return (1, 1) + + def get_type(self): + return "PgBackRest" + + def get_type_id(self): + return 2 + + def open_vault(self, backup_id): + """ + + :return: + :rtype: (str, dict, StringIO) + """ + vault = BackRestVault("%s/%s" % (self.root, backup_id)) + print(f'RETURNING VAULT {vault}') + return vault + + def prot_get_as_stream(self, filename): + pass + + def prot_get_file_size(self, filename): + pass + + def prot_is_file_exists(self, filename): + pass + + def prot_list_archive(self): + pass + + def prot_put_as_stream(self, filename, stream): + pass + + def size(self): + return 0 + + def evict_vault(self, vault): + self.__log.info("Evict vault: %s" % vault) + self.__log.debug("Delete folder: %s" % vault.folder) + for root, dirs, files in os.walk(vault.folder, topdown=False): + for name in files: + try: + os.remove(os.path.join(root, name)) + except OSError as e: # passing possible problems with permission + self.__log.exception(e) + for name in dirs: + try: + os.rmdir(os.path.join(root, name)) + except OSError as e: + self.__log.exception(e) + try: + os.rmdir(vault.folder) + except OSError as e: + self.__log.exception(e) + + def get_encryption(self): + pass + + def prot_delete(self, filename): + pass + +class BackRestVault(storage.Vault): + __log = logging.getLogger("BackRestVault") + + def __init__(self, timestamp=""): + super(BackRestVault, self).__init__() + + self.folder = timestamp + self.metrics_filepath = self.folder + "/.metrics" + self.console = StringIO() + + print(f'INIT VAULT {self.folder, self.metrics_filepath, self.console}') + + def get_id(self): + return os.path.basename(self.folder) + + def get_folder(self): + pass + + # + # + def __lock_filepath(self): + return self.folder + ".lock" + # + # def __failed_filepath(self): + # return self.folder + ".failed" + # + # def __metrics_filepath(self): + # return self.folder + ".metrics" + # + # def __console_filepath(self): + # return self.folder + ".console" + # + # def __is_file_exists(self, path): + # return subprocess.call(["/opt/backup/scli", "get", path], env=scli_env) == "Object Not Found" + # + # def __backup_archive_file_path(self): + # return "{}/pg_{}_backup_{}.tar.gz".format(self.get_folder(), PG_CLUSTER_NAME, self.get_id()) + # + # def is_locked(self): + # if self.cache_state: + # if not self.cached_state: + # self.__cache_current_state() + # 
return self.cached_state["is_locked"] + # return self.__is_file_exists("{}/{}".format(CONTAINER, self.__lock_filepath())) + # + def is_failed(self): + + return {} + # + def is_done(self): + return {} + # + def is_back_up_archive_exists(self): + return True + + def is_locked(self): + #TODO: work with root object from pgbackrest + pass + + def load_metrics(self): + return {} + + def __enter__(self): + self.__log.info("Init next vault: %s" % self.folder) + super(BackRestVault, self).__enter__() + + if not os.path.exists(self.folder): + os.makedirs(self.folder) + else: + raise StorageLocationAlreadyExistsException("Destination backup folder already exists: %s" % self.folder) + + self.__log.info("Create .lock file in vault: %s" % self.folder) + fsutil.touch(self.__lock_filepath()) + + return self.folder, self.metrics, self.console + + def create_time(self): + foldername = self.get_id() + d = datetime.strptime(foldername, VAULT_NAME_FORMAT) + return time.mktime(d.timetuple()) + # + # def __exit__(self, tpe, exception, tb): + # self.__log.info("Close vault") + # self.__log.info("Save metrics to: %s" % self.__metrics_filepath()) + # + # super(PgBackRestVault, self).__exit__(tpe, exception, tb) + # + # backup_name = "{}/pg_{}_backup_{}.tar.gz".format(self.get_folder(), PG_CLUSTER_NAME, self.get_id()) + # size_str = subprocess.check_output( + # ["/opt/backup/scli ls -l {} | grep {} | awk '{{A+=$2}} END{{print A}}'".format(CONTAINER_SEG, backup_name)] + # , shell=True + # , env=scli_env) + # try: + # self.metrics["size"] = int(size_str) + # except Exception as e: + # self.__log.error(e) + # self.metrics["size"] = -1 + # + # if exception: + # subprocess.call("echo .failed > /tmp/.failed", shell=True) + # subprocess.check_call(["/opt/backup/scli", "put", "/tmp/.failed", "{}/{}".format(CONTAINER, self.__failed_filepath())], env=scli_env) + # + # e = "\n".join(format_exception(tpe, exception, tb)) + # self.__log.info("Don't remove vault .lock due exception in nested code") + # self.__log.debug("Something wrong happened inside block uses vault: " + e) + # self.metrics["exception"] = e + # + # with open("/tmp/.metrics", "w") as f: + # json.dump(self.metrics, f) + # subprocess.check_call(["/opt/backup/scli", "put", "/tmp/.metrics", "{}/{}".format(CONTAINER, self.__metrics_filepath())], env=scli_env) + # + # console_logs = self.console.getvalue() + # self.console.close() + # with open("/tmp/.console", "w") as f: + # f.write(console_logs) + # subprocess.check_call(["/opt/backup/scli", "put", "/tmp/.console", "{}/{}".format(CONTAINER, self.__console_filepath())], env=scli_env) + # + # self.__log.info("Remove lock for %s" % self.get_id()) + # subprocess.check_call(["/opt/backup/scli", "delete", "{}/{}".format(CONTAINER, self.__lock_filepath())], env=scli_env) + # + # def __repr__(self): + # return "Vault(%s)" % self.get_id() diff --git a/docker/postgres/storage_s3.py b/docker/postgres/storage_s3.py new file mode 100644 index 0000000..8451173 --- /dev/null +++ b/docker/postgres/storage_s3.py @@ -0,0 +1,462 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess + +import boto3 +import botocore +import hashlib +import io + +import urllib3 + +import botocore.exceptions +import errno +import os +from datetime import datetime +import logging +from traceback import format_exception +import time +import json +import storage +from storage import VAULT_NAME_FORMAT, VAULT_DIRNAME_MATCHER, StorageLocationAlreadyExistsException, ARCHIVE_NAME_MATCHER +from retrying import retry + +try: + from io import StringIO +except ImportError: + from io import StringIO + +CONTAINER = os.getenv("CONTAINER") +CONTAINER_SEG = "{}_segments".format(CONTAINER) +PG_CLUSTER_NAME = os.getenv("PG_CLUSTER_NAME") + +RETRY_COUNT = 10 +RETRY_WAIT = 1000 + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +class StreamWrapper(io.RawIOBase): + + def __init__(self, object_body): + self.object_body = object_body + self.internal_closed = False + + def close(self, *args, **kwargs): + if not self.internal_closed: + self.object_body.close() + self.internal_closed = True + + def read(self, *args, **kwargs): + return self.object_body.read(*args, **kwargs) + + def __exit__(self, *args, **kwargs): + self.close() + + def __enter__(self, *args, **kwargs): + return super(StreamWrapper, self).__enter__(*args, **kwargs) + + +class AwsS3Storage(storage.Storage): + + __log = logging.getLogger("AwsS3Storage") + + def __init__(self, root): + self.__log.info("Init storage object with storage root: %s" % root) + self.root = root + self.aws_prefix = "%s/" % (os.getenv("AWS_S3_PREFIX", "")) + + def get_encryption(self): + pass + + def prot_put_as_stream(self, filename, stream): + """ + :param filename: + :type filename: string + :param stream: + :type stream: io.RawIOBase + :return: sha256 of processed stream + :rtype: string + """ + self.__log.info("Put request for file: %s" % filename) + sha256 = hashlib.sha256() + md5 = hashlib.md5() + chunk_size = 4096 + fs_filename = "{}/{}".format("/tmp", filename) + if not os.path.exists(os.path.dirname(fs_filename)): + try: + os.makedirs(os.path.dirname(fs_filename)) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + with io.FileIO(fs_filename, "w", closefd=True) as target: + while True: + data = stream.read(chunk_size) + if len(data) == 0: + stream.close() + self.__log.info("Processed stream with sha256 {}".format(sha256.hexdigest())) + break + sha256.update(data) + md5.update(data) + target.write(data) + + self.__log.info("Start uploading: %s" % filename) + # todo[anin] replace implementation + # AwsS3Vault.get_s3_client().upload_fileobj(data, CONTAINER, filename) + AwsS3Vault.get_s3_client().upload_file(fs_filename, CONTAINER, filename) + os.remove(fs_filename) + return sha256.hexdigest() + + @retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=RETRY_WAIT) + def prot_get_as_stream(self, filename): + self.__log.info("Get stream request for file: %s" % self.aws_prefix + filename) + object_body = AwsS3Vault.get_s3_resource().Bucket(CONTAINER).Object(self.aws_prefix + filename).get()['Body'] + return StreamWrapper(object_body) + + @retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=RETRY_WAIT) + def prot_delete_bundle(self, filename): + objects_to_delete = AwsS3Vault.get_s3_client().list_objects(Bucket=CONTAINER, Prefix=self.aws_prefix + filename) + for obj in objects_to_delete.get('Contents', []): + AwsS3Vault.get_s3_client().delete_object(Bucket=CONTAINER, Key=obj['Key']) + + def prot_delete(self, 
filename): + self.prot_delete_bundle(filename) + + def prot_is_file_exists(self, filename): + exists = True + try: + AwsS3Vault.get_s3_resource().Object(CONTAINER, self.aws_prefix + filename).get() + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'NoSuchKey': + exists = False + else: + raise + + return exists + + def prot_get_file_size(self, filename): + if self.prot_is_file_exists(filename): + return int(AwsS3Vault.get_s3_resource().Object(CONTAINER, filename).get()['Size']) + return 0 + + def is_valid_backup_id(self, backup_id): + try: + datetime.strptime(backup_id, storage.VAULT_NAME_FORMAT) + return True + except ValueError: + return False + + @retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=RETRY_WAIT) + def list(self): + bucket = AwsS3Vault.get_s3_client().list_objects(Bucket=CONTAINER) + aws_s3_vault_listing = [] + if 'Contents' in bucket: + # Collect backups ids only + aws_s3_vault_listing = [obj["Key"].split('/', 2)[1] + for obj in bucket['Contents'] + if '/' in obj['Key'] and self.is_valid_backup_id(obj["Key"].split('/', 2)[1])] + + vaults = [ + AwsS3Vault(backup_id + , bucket=CONTAINER + , cluster_name=PG_CLUSTER_NAME + , cache_enabled=True + , aws_s3_bucket_listing=(bucket['Contents'] if 'Contents' in bucket else None)) + for backup_id in aws_s3_vault_listing] + vaults.sort(key=lambda v: v.create_time()) + return vaults + + def size(self): + """ Returns whole storage size in bytes """ + total_size = 0 + bucket = AwsS3Vault.get_s3_client().list_objects(Bucket=CONTAINER) + + if 'Contents' not in bucket: + return 0 + + for obj in bucket["Contents"]: + total_size += obj['Size'] + return total_size + + def archive_size(self): + """ Returns whole storage size in bytes """ + total_size = 0 + bucket = AwsS3Vault.get_s3_client().list_objects(Bucket=CONTAINER, Prefix="archive/") + + if 'Contents' not in bucket: + return 0 + + for obj in bucket["Contents"]: + total_size += obj['Size'] + return total_size + + def fs_space(self): + """ Returns tuple (free, total) space on mount point where is root folder located """ + return (1, 1) + + def open_vault(self, backup_id): + """ + + :return: + :rtype: (str, dict, StringIO) + """ + return AwsS3Vault("%s" % (datetime.now().strftime(VAULT_NAME_FORMAT)), CONTAINER, cluster_name=PG_CLUSTER_NAME) + + def evict_vault(self, vault): + self.__log.info("Evict vault: %s" % vault) + + backup_id = vault.get_folder() + backup_name = "{}/{}/pg_backup_{}.tar.gz".format(self.aws_prefix, backup_id, vault.get_id()) + try: + self.prot_delete_bundle(backup_id) + except botocore.exceptions.ClientError as e: + return "Not Found" + + def prot_list_archive(self): + bucket = AwsS3Vault.get_s3_client().list_objects(Bucket=CONTAINER, Prefix="archive/", Delimiter="/") + aws_s3_archive_listing = [] + if 'Contents' in bucket: + # Collect archive ids only + aws_s3_archive_listing = [obj + for obj in bucket['Contents'] + if '/' in obj['Key'] and ARCHIVE_NAME_MATCHER.match(obj["Key"].split('/', 1)[1])] + self.__log.info("Archives: {}".format(aws_s3_archive_listing)) + if aws_s3_archive_listing: + archives = [storage.Archive( + ARCHIVE_NAME_MATCHER.match(archive["Key"].split('/', 1)[1]).group("name"), + 1000 * int(archive["LastModified"].strftime("%s"))) + for archive in aws_s3_archive_listing] + self.__log.info("Parsed archives: {}".format(archives)) + return archives + else: + return [] + + def get_type(self): + return "AWS S3" + + def get_type_id(self): + return 1 + + +class AwsS3VaultCreationException(Exception): + pass + + +class 
AwsS3Vault(storage.Vault): + __log = logging.getLogger("AwsS3Vault") + + @staticmethod + def get_s3_resource(): + return boto3.resource("s3", + region_name=os.getenv("AWS_DEFAULT_REGION") if os.getenv("AWS_DEFAULT_REGION") else None, + endpoint_url=os.getenv("AWS_S3_ENDPOINT_URL"), + aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), + aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), + verify=(False if os.getenv("AWS_S3_UNTRUSTED_CERT", "false").lower() == "true" else None)) + + @staticmethod + def get_s3_client(): + return AwsS3Vault.get_s3_resource().meta.client + + def __init__(self, backup_id, bucket, cluster_name=None, cache_enabled=False, + aws_s3_bucket_listing=None): + super(AwsS3Vault, self).__init__() + + self.backup_id = backup_id + self.aws_prefix = os.getenv("AWS_S3_PREFIX", "") + self.bucket = bucket + self.console = None + self.cluster_name = cluster_name + self.cache_enabled = cache_enabled + self.cached_state = {} + self.aws_s3_bucket_listing = aws_s3_bucket_listing + self.backup_name = "{}/{}/pg_backup_{}.tar.gz".format(self.aws_prefix, self.get_folder(), self.get_id()) + + def __get_s3_bucket(self): + return AwsS3Vault.get_s3_resource().Bucket(self.bucket) + + def __cache_current_state(self): + self.__log.debug("Cache current state") + # todo[anin] fix cache + if self.aws_s3_bucket_listing: + # aws_s3_bucket_listing = [ + # { + # 'LastModified': datetime.datetime(2017, 6, 27, 10, 40, 8, 957000, tzinfo=tzlocal()), + # 'ETag': '"b3b1d912e348a08d57cb9b3ab5a92bc0"', + # 'StorageClass': 'STANDARD', + # 'Key': '20170627T1040.console', + # 'Owner': {'DisplayName': 'platform', 'ID': 'platform'}, + # 'Size': 418 + # } + # ... + # ] + self.cached_state["is_locked"] = len([x for x in self.aws_s3_bucket_listing if self.__lock_filepath() in x['Key']]) == 1 + self.cached_state["is_failed"] = len([x for x in self.aws_s3_bucket_listing if self.__failed_filepath() in x['Key']]) == 1 + else: + self.cached_state["is_locked"] = self.__is_file_exists(self.bucket, self.__lock_filepath()) + self.cached_state["is_failed"] = self.__is_file_exists(self.bucket, self.__failed_filepath()) + self.__log.debug("State: {}".format(self.cached_state)) + + def get_id(self): + return os.path.basename(self.backup_id) + + def get_folder(self): + return self.backup_id + + def __get_backup_name(self): + return self.backup_name + + @retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=RETRY_WAIT) + def __load_metrics_from_s3(self): + try: + metrics = AwsS3Vault.get_s3_resource().Object(self.bucket, self.__metrics_filepath()).get() + return json.load(metrics["Body"]) + except Exception as e: + self.__log.exception(e) + self.__log.warning("Cannot load metrics from s3. 
Backup can be damaged") + return {} + + def load_metrics(self): + self.__log.debug("Load metrics from: %s" % self.__metrics_filepath()) + if self.cache_enabled: + if "metrics" not in self.cached_state: + self.cached_state["metrics"] = self.__load_metrics_from_s3() + return self.cached_state["metrics"] + else: + return self.__load_metrics_from_s3() + + + def __lock_filepath(self): + return "%s/%s/%s" % (self.aws_prefix, self.backup_id, self.backup_id + ".lock") + + def __failed_filepath(self): + return "%s/%s/%s" % (self.aws_prefix, self.backup_id, self.backup_id + ".failed") + + def __metrics_filepath(self): + return "%s/%s/%s" % (self.aws_prefix, self.backup_id, self.backup_id + ".metrics") + + def __console_filepath(self): + return "%s/%s/%s" % (self.aws_prefix, self.backup_id, self.backup_id + ".console") + + def __is_file_exists(self, bucket, obj): + exists = True + try: + AwsS3Vault.get_s3_resource().Object(bucket, obj).get() + except botocore.exceptions.ClientError as e: + if e.response['Error']['Code'] == 'NoSuchKey': + exists = False + else: + raise + + return exists + + def is_locked(self): + if self.cache_enabled: + if not self.cached_state: + self.__cache_current_state() + if "is_locked" in self.cached_state.keys(): + return self.cached_state["is_locked"] + + return self.__is_file_exists(self.bucket, self.__lock_filepath()) + + def is_failed(self): + if self.cache_enabled: + if not self.cached_state: + self.__cache_current_state() + if "is_failed" in self.cached_state.keys(): + return self.cached_state["is_failed"] + return self.__is_file_exists(self.bucket, self.__failed_filepath()) + + def is_done(self): + if not self.__is_file_exists(CONTAINER, self.__metrics_filepath()): + self.__log.info(self.__is_file_exists) + return False + j = self.__load_metrics_from_s3() + self.__log.info(j) + return j['exit_code'] == 0 + + def is_back_up_archive_exists(self): + return self.__is_file_exists(self.bucket, self.backup_name) + + def __enter__(self): + self.__log.info("Init next vault: %s" % self.backup_id) + super(AwsS3Vault, self).__enter__() + + if self.__is_file_exists(self.bucket, self.__metrics_filepath()): + raise AwsS3VaultCreationException("Destination backup folder already exists: %s" % self.backup_id) + + self.__log.info("Create .lock file in vault: %s" % self.backup_id) + lock_marker = "/tmp/.lock" + subprocess.call("echo .lock > %s" % lock_marker, shell=True) + self.__get_s3_bucket().upload_file(lock_marker, self.__lock_filepath()) + + self.console = StringIO() + + return self.backup_id, self.metrics, self.console + + def create_time(self): + folder_name = self.get_id() + d = datetime.strptime(folder_name, "%Y%m%dT%H%M") + return time.mktime(d.timetuple()) + + def __exit__(self, tpe, exception, tb): + self.__log.debug("Closing vault [%s]" % self) + + super(AwsS3Vault, self).__exit__(tpe, exception, tb) + + console_logs = self.console.getvalue() + self.console.close() + + console_marker = "/tmp/.console" + with open(console_marker, "w") as f: + f.write(console_logs) + self.__get_s3_bucket().upload_file(console_marker, self.__console_filepath()) + self.__log.info("Console logs are saved to: %s" % self.__console_filepath()) + + if exception is None: + self.__on_successful_upload() + else: + e = "\n".join(format_exception(tpe, exception, tb)) + self.__on_failed_upload(exception=e, output=console_logs) + + metrics_marker = "/tmp/.metrics" + with open(metrics_marker, "w") as f: + json.dump(self.metrics, f) + + self.__get_s3_bucket().upload_file(metrics_marker, 
self.__metrics_filepath()) + self.__log.info("Metrics are saved to: %s" % self.__metrics_filepath()) + + self.__log.info("Remove lock for %s" % self.get_id()) + self.__get_s3_bucket().Object(self.__lock_filepath()).delete() + + def __on_successful_upload(self): + self.__log.info("Backup %s is uploaded successfully." % self.backup_id) + size_str = self.__get_s3_bucket().Object(self.__get_backup_name()).get()["ContentLength"] + self.metrics["size"] = int(size_str) + + def __on_failed_upload(self, **kwargs): + failed_marker = "/tmp/.failed" + subprocess.call("echo .failed > %s" % failed_marker, shell=True) + self.__get_s3_bucket().upload_file(failed_marker, self.__failed_filepath()) + + self.__log.error("Something wrong happened inside block uses vault: " + kwargs['exception']) + self.__log.error("Backup script output: " + kwargs['output']) + self.metrics["exception"] = kwargs['exception'] + self.metrics["size"] = -1 + + def __repr__(self): + return "Vault(%s)" % self.get_id() + + def fail(self): + open(self.__failed_filepath(), "w").close() diff --git a/docker/postgres/storage_swift.py b/docker/postgres/storage_swift.py new file mode 100644 index 0000000..a0fc482 --- /dev/null +++ b/docker/postgres/storage_swift.py @@ -0,0 +1,281 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
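+
+# Swift-backed storage. Summary comment (based on the code below): every object
+# operation shells out to the /opt/backup/scli client, authenticated through the
+# ST_AUTH/ST_USER/ST_KEY/ST_TENANT variables derived from SWIFT_AUTH_URL,
+# SWIFT_USER, SWIFT_PASSWORD and TENANT_NAME. Backup objects are kept in the
+# CONTAINER container, with large-object segments in "<CONTAINER>_segments".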
+ +import hashlib +import io +import json +import logging +import time +from datetime import datetime +from traceback import format_exception + +import os + +import storage +from storage import VAULT_NAME_FORMAT, StorageLocationAlreadyExistsException + +try: + from io import StringIO +except ImportError: + from io import StringIO + +import subprocess + +CONTAINER = os.getenv("CONTAINER") +CONTAINER_SEG = "{}_segments".format(CONTAINER) +PG_CLUSTER_NAME = os.getenv("PG_CLUSTER_NAME") +scli_env = os.environ.copy() +scli_env["ST_AUTH"] = os.getenv("SWIFT_AUTH_URL") +scli_env["ST_USER"] = os.getenv("SWIFT_USER") +scli_env["ST_KEY"] = os.getenv("SWIFT_PASSWORD") +scli_env["ST_TENANT"] = os.getenv("TENANT_NAME") + + +class SwiftStorage(storage.Storage): + __log = logging.getLogger("SwiftStorage") + + def __init__(self, root): + self.__log.info("Init storage object with storage root: %s" % root) + self.root = root + + def list(self): + swift_vault_listing = subprocess.check_output(["/opt/backup/scli", "ls", CONTAINER], env=scli_env) + vaults = [SwiftVault(os.path.basename(backup_name[0:(len(backup_name) - len(os.path.basename(backup_name)) - 1)]) + , cache_state=True + ,swift_vault_listing=swift_vault_listing) + for backup_name in swift_vault_listing.split("\n") + if backup_name.endswith(".tar.gz")] + vaults.sort(key=lambda v: v.create_time()) + return vaults + + def size(self): + """ Returns whole storage size in bytes """ + return 0 + + def archive_size(self): + """ Returns whole storage size in bytes """ + return 0 + + def fs_space(self): + """ Returns tuple (free, total) space on mount point where is root folder located """ + return (1, 1) + + def open_vault(self): + """ + + :return: + :rtype: (str, dict, StringIO) + """ + return SwiftVault("%s" % (datetime.now().strftime(VAULT_NAME_FORMAT))) + + def evict(self, vault): + self.__log.info("Evict vault: %s" % vault) + backup_name = "{}/pg_{}_backup_{}.tar.gz".format(vault.get_folder(), PG_CLUSTER_NAME, vault.get_id()) + subprocess.check_call(["/opt/backup/scli", "delete", "{}/{}".format(CONTAINER, backup_name)], env=scli_env) + subprocess.call(["/opt/backup/scli", "delete", "{}/{}.lock".format(CONTAINER, vault.get_folder())], env=scli_env) + subprocess.call(["/opt/backup/scli", "delete", "{}/{}.failed".format(CONTAINER, vault.get_folder())], env=scli_env) + subprocess.call(["/opt/backup/scli", "delete", "{}/{}.console".format(CONTAINER, vault.get_folder())], env=scli_env) + subprocess.check_call(["/opt/backup/scli", "delete", "{}/{}.metrics".format(CONTAINER, vault.get_folder())], env=scli_env) + + def prot_is_file_exists(self, filename): + self.__log.info("File existence check for: %s" % filename) + swift_vault_listing = subprocess.check_output(["/opt/backup/scli", "ls", CONTAINER], env=scli_env) + return filename in swift_vault_listing + + def prot_delete(self, filename): + self.__log.info("Delete file: %s" % filename) + subprocess.check_call(["/opt/backup/scli", "delete", "{}/{}".format(CONTAINER, filename)], env=scli_env) + + def prot_get_as_stream(self, filename): + """ + :param filename: path to file from backup root. 
+ :type filename: string + :return: stream with requested file + :rtype: io.RawIOBase + """ + self.__log.info("Get request for file: %s" % filename) + process = subprocess.Popen(["/opt/backup/scli", "get", "{}/{}".format(CONTAINER, filename)], env=scli_env, stdout=subprocess.PIPE) + return process.stdout + + def prot_put_as_stream(self, filename, stream): + """ + :param filename: + :type filename: string + :param stream: + :type stream: io.RawIOBase + :return: sha256 of processed stream + :rtype: string + """ + self.__log.info("Put request for file: %s" % filename) + sha256 = hashlib.sha256() + chunk_size = 4096 + process = subprocess.Popen(["/opt/backup/scli", "put", "{}/{}".format(CONTAINER, filename)], env=scli_env, stdout=subprocess.PIPE, stdin=subprocess.PIPE) + + while True: + data = stream.read(chunk_size) + if len(data) == 0: + stream.close() + self.__log.info("Processed stream with sha256 {}".format(sha256.hexdigest())) + return sha256.hexdigest() + sha256.update(data) + process.stdin.write(data) + + def get_type(self): + return "Swift" + + def get_type_id(self): + return 2 + + +class SwiftVault(storage.Vault): + __log = logging.getLogger("SwiftVaultLock") + + def __init__(self, folder, cache_state=False, swift_vault_listing=None): + super(SwiftVault, self).__init__() + + self.folder = folder + self.console = None + self.cache_state = cache_state + self.cached_state = {} + self.swift_vault_listing = swift_vault_listing + + def __cache_current_state(self): + if self.cache_state: + swift_vault_listing = self.swift_vault_listing if self.swift_vault_listing else subprocess.check_output(["/opt/backup/scli", "ls", CONTAINER], env=scli_env) + self.cached_state["is_locked"] = self.__lock_filepath() in swift_vault_listing + self.cached_state["is_failed"] = self.__failed_filepath() in swift_vault_listing + + def get_id(self): + return os.path.basename(self.folder) + + def get_folder(self): + return self.folder + + def __load_metrics_from_swift(self): + try: + return json.loads(subprocess.check_output([ + "/opt/backup/scli", + "get", + "{}/{}".format(CONTAINER, self.__metrics_filepath())], env=scli_env)) + except Exception: + self.__log.warning("Cannot load metrics from swift. 
Backup can be damaged") + return {} + + def load_metrics(self): + self.__log.debug("Load metrics from: %s" % self.__metrics_filepath()) + if self.cache_state: + if "metrics" not in self.cached_state: + self.cached_state["metrics"] = self.__load_metrics_from_swift() + return self.cached_state["metrics"] + else: + return self.__load_metrics_from_swift() + + + def __lock_filepath(self): + return self.folder + ".lock" + + def __failed_filepath(self): + return self.folder + ".failed" + + def __metrics_filepath(self): + return self.folder + ".metrics" + + def __console_filepath(self): + return self.folder + ".console" + + def __is_file_exists(self, path): + return subprocess.call(["/opt/backup/scli", "get", path], env=scli_env) == "Object Not Found" + + def __backup_archive_file_path(self): + return "{}/pg_{}_backup_{}.tar.gz".format(self.get_folder(), PG_CLUSTER_NAME, self.get_id()) + + def is_locked(self): + if self.cache_state: + if not self.cached_state: + self.__cache_current_state() + return self.cached_state["is_locked"] + return self.__is_file_exists("{}/{}".format(CONTAINER, self.__lock_filepath())) + + def is_failed(self): + if self.cache_state: + if not self.cached_state: + self.__cache_current_state() + return self.cached_state["is_failed"] + return self.__is_file_exists("{}/{}".format(CONTAINER, self.__failed_filepath())) + + def is_done(self): + pass + + def is_back_up_archive_exists(self): + return self.__is_file_exists("{}/{}".format(CONTAINER, self.__backup_archive_file_path())) + + def __enter__(self): + self.__log.info("Init next vault: %s" % self.folder) + super(SwiftVault, self).__enter__() + + if self.__is_file_exists(self.__metrics_filepath()): + raise StorageLocationAlreadyExistsException("Destination backup folder already exists: %s" % self.folder) + + self.__log.info("Create .lock file in vault: %s" % self.folder) + subprocess.call("echo .lock > /tmp/.lock", shell=True) + subprocess.check_call(["/opt/backup/scli", "put", "/tmp/.lock", "{}/{}".format(CONTAINER, self.__lock_filepath())], env=scli_env) + self.console = StringIO() + return (self.folder, self.metrics, self.console) + + def create_time(self): + foldername = self.get_id() + d = datetime.strptime(foldername, VAULT_NAME_FORMAT) + return time.mktime(d.timetuple()) + + def __exit__(self, tpe, exception, tb): + self.__log.info("Close vault") + self.__log.info("Save metrics to: %s" % self.__metrics_filepath()) + + super(SwiftVault, self).__exit__(tpe, exception, tb) + + backup_name = "{}/pg_{}_backup_{}.tar.gz".format(self.get_folder(), PG_CLUSTER_NAME, self.get_id()) + size_str = subprocess.check_output( + ["/opt/backup/scli ls -l {} | grep {} | awk '{{A+=$2}} END{{print A}}'".format(CONTAINER_SEG, backup_name)] + , shell=True + , env=scli_env) + try: + self.metrics["size"] = int(size_str) + except Exception as e: + self.__log.error(e) + self.metrics["size"] = -1 + + if exception: + subprocess.call("echo .failed > /tmp/.failed", shell=True) + subprocess.check_call(["/opt/backup/scli", "put", "/tmp/.failed", "{}/{}".format(CONTAINER, self.__failed_filepath())], env=scli_env) + + e = "\n".join(format_exception(tpe, exception, tb)) + self.__log.info("Don't remove vault .lock due exception in nested code") + self.__log.debug("Something wrong happened inside block uses vault: " + e) + self.metrics["exception"] = e + + with open("/tmp/.metrics", "w") as f: + json.dump(self.metrics, f) + subprocess.check_call(["/opt/backup/scli", "put", "/tmp/.metrics", "{}/{}".format(CONTAINER, self.__metrics_filepath())], env=scli_env) 
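+        # The metrics file has been uploaded above; persist the captured console
+        # output next and finally remove the .lock marker so this vault is no
+        # longer reported as in-progress.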
+ + console_logs = self.console.getvalue() + self.console.close() + with open("/tmp/.console", "w") as f: + f.write(console_logs) + subprocess.check_call(["/opt/backup/scli", "put", "/tmp/.console", "{}/{}".format(CONTAINER, self.__console_filepath())], env=scli_env) + + self.__log.info("Remove lock for %s" % self.get_id()) + subprocess.check_call(["/opt/backup/scli", "delete", "{}/{}".format(CONTAINER, self.__lock_filepath())], env=scli_env) + + def __repr__(self): + return "Vault(%s)" % self.get_id() \ No newline at end of file diff --git a/docker/postgres/utils.py b/docker/postgres/utils.py new file mode 100644 index 0000000..6c88864 --- /dev/null +++ b/docker/postgres/utils.py @@ -0,0 +1,87 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import psycopg2 +import logging + +log = logging.getLogger("utils") + + +def execute_query(conn_properties, query): + conn = None + try: + conn = psycopg2.connect(**conn_properties) + with conn.cursor() as cur: + cur.execute(query) + return cur.fetchone()[0] + finally: + if conn: + conn.close() + +def get_version_of_pgsql_server(): + # this is very bad, need to reuse code from granular backups + conn_properties = { + 'host': os.getenv('POSTGRES_HOST'), + 'port': os.getenv('POSTGRES_PORT'), + 'user': os.getenv('POSTGRES_USER') or 'postgres', + 'password': os.getenv('POSTGRES_PASSWORD'), + 'database': 'postgres', + 'connect_timeout': 5, + } + try: + server_version = execute_query(conn_properties, 'SHOW SERVER_VERSION;') + except psycopg2.OperationalError as e: + log.exception(e) + return None + version_as_list = list(map(int, server_version.split(' ')[0].split('.'))) + if [10, 0] <= version_as_list < [11, 0]: + return "pg10" + elif [11, 0] <= version_as_list < [12, 0]: + return "pg11" + elif [12, 0] <= version_as_list < [13, 0]: + return "pg12" + elif [13, 0] <= version_as_list < [14, 0]: + return "pg13" + elif [14, 0] <= version_as_list < [15, 0]: + return "pg14" + elif [15, 0] <= version_as_list < [16, 0]: + return "pg15" + elif version_as_list >= [16, 0]: + return "pg16" + return "" + + +def retry_if_storage_error(exception): + """ + Return True if we should retry (in this case when it's an psycopg2.OperationalError can happen on db connect error) + False otherwise + """ + log.info("During initialization of storage next error occurred: %s", exception) + import storage + return isinstance(exception, storage.StorageException) + + +def get_encryption(): + encrypt_backups = os.getenv("KEY_SOURCE", 'false').lower() + return encrypt_backups != 'false' + + +def validate_user(username, password): + if not os.getenv("AUTH", "false").lower() == "false": + return username == os.getenv("POSTGRES_USER") and \ + password == os.getenv("POSTGRES_PASSWORD") + else: + return True + diff --git a/docker/postgres/utils.sh b/docker/postgres/utils.sh new file mode 100755 index 0000000..2f0270f --- /dev/null +++ b/docker/postgres/utils.sh @@ -0,0 +1,46 @@ +# Copyright 2024-2025 NetCracker Technology 
Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +export ST_AUTH="${SWIFT_AUTH_URL}" +export ST_USER="${SWIFT_USER}" +export ST_KEY="${SWIFT_PASSWORD}" +export ST_TENANT="${TENANT_NAME}" + +export PGPASSWORD="$(echo ${PGPASSWORD} | tr -d '\n' | tr -d '[[:space:]]')" + +function log() { + log_module "" "backup_uploader" "$1" +} + +function log_module() { + local priority="${1:-INFO}" + local module="$2" + local message="$3" + + local timestamp=$(date --iso-8601=seconds) + echo "[${timestamp}][${priority}][category=${module}] ${message}" +} + +function process_exit_code() { + local exit_code=$1 + local message="$2" + if [ ${exit_code} -ne 0 ];then + log_module "ERROR" "" "${message}" + exit 1 + fi +} + +function register_delete_on_exit() { + trap "rm -f $*" EXIT +} \ No newline at end of file diff --git a/docker/postgres/workers.py b/docker/postgres/workers.py new file mode 100644 index 0000000..0380ef4 --- /dev/null +++ b/docker/postgres/workers.py @@ -0,0 +1,127 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import eviction +import locks +import utils +import logging +import time +import encryption + +from multiprocessing import Process + + +class BackupProcessException(Exception): + pass + + +class PostgreSQLBackupWorker(Process): + def __init__(self, storage, backup_command, eviction_rule, backup_id): + Process.__init__(self) + self.__log = logging.getLogger("PostgreSQLBackupWorker") + + self.__storage = storage + self.__backup_command = backup_command + self.__eviction_rule = eviction_rule + self.__backup_id = backup_id + + if utils.get_encryption(): + self.encryption = True + self.key = encryption.KeyManagement.get_object().get_password() + self.key_name = encryption.KeyManagement.get_key_name() + self.key_source = encryption.KeyManagement.get_key_source() + self.__backup_command = backup_command + " %(key)s" + else: + self.encryption = False + + def run(self): + self.__perform_database_backup() + self.__cleanup_storage() + + def __perform_database_backup(self): + with self.__storage.open_vault(self.__backup_id) as (vault_folder, metrics, console): + backup_id = self.__backup_id.split('/')[-1] + + self.__log.info("[backup-id=%s] Start new backup streaming." 
% backup_id) + locks.update_lock_file(backup_id=backup_id) + + cmd_options = {"data_folder": vault_folder} + if self.encryption: + cmd_options["key"] = self.key + + cmd_processed = self.__split_command_line(self.__backup_command % cmd_options) + if not self.encryption: + self.__log.info("Run cmd template: %s\n\toptions: %s\n\tcmd: [%s]" % ( + self.__backup_command, str(cmd_options), ", ".join(cmd_processed))) + else: + self.__log.info("Run cmd template %s\n" % (self.__backup_command)) + import subprocess + process = subprocess.Popen(cmd_processed, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + while True: + output = process.stdout.readline() + if output.decode() == '': + if process.poll() is not None: + break + else: + time.sleep(0.1) + if output: + print((output.decode().strip())) + console.write(output.decode().strip()) + exit_code = process.poll() + metrics["exit_code"] = exit_code + if self.encryption: + metrics["key_name"] = self.key_name + metrics["key_source"] = self.key_source + + if exit_code != 0: + msg = "[backup-id=%s] Execution of '%s' was finished with non zero exit code: %d" % (backup_id, cmd_processed, exit_code) + self.__log.error(msg) + raise BackupProcessException(msg) + else: + self.__log.info("[backup-id=%s] Backup streaming has been successfully finished." % backup_id) + + def __cleanup_storage(self): + self.__log.info("Start eviction process by policy: %s" % self.__eviction_rule) + outdated_vaults = eviction.evict(self.__storage.list(), self.__eviction_rule, + accessor=lambda x: x.create_time()) + if len(outdated_vaults) > 0: + for vault in outdated_vaults: + self.__storage.evict(vault) + else: + self.__log.info("No obsolete vaults to evict") + + def fail(self): + self.__log.warning("Set failed status for backup") + self.__storage.fail(self.__backup_id) + + @staticmethod + def __split_command_line(cmd_line): + import shlex + lex = shlex.shlex(cmd_line) + lex.quotes = '"' + lex.whitespace_split = True + lex.commenters = '' + return list(lex) + + +def spawn_backup_worker(storage, backup_command, eviction_rule, backup_id): + process = PostgreSQLBackupWorker(storage, backup_command, eviction_rule, backup_id) + process.start() + return process + + +def spawn_worker(runnable, *args): + process = Process(target=runnable, args=args) + process.start() + return process diff --git a/docker/requirements.txt b/docker/requirements.txt new file mode 100644 index 0000000..b0c6294 --- /dev/null +++ b/docker/requirements.txt @@ -0,0 +1,54 @@ +aniso8601==9.0.1 +APScheduler==3.10.0 +boto3==1.21.21 +cachetools==4.2.4 +certifi==2023.7.22 +chardet==3.0.4 +charset-normalizer==2.0.12 +colorama==0.4.3 +croniter==1.3.8 +dictdiffer==0.9.0 +docutils==0.20 +filelock==3.4.1 +Flask==3.0.3 +Flask-HTTPAuth==4.5.0 +Flask-RESTful==0.3.9 +google-api-core==2.11.0 +google-api-python-client==2.68.0 +google-auth==2.16.1 +google-auth-httplib2==0.1.1 +googleapis-common-protos==1.57.0 +gunicorn==22.0.0 +httplib2==0.21.0 +idna==3.7 +importlib-metadata==4.8.3 +ipaddress==1.0.23 +jmespath==1.0.1 +kubernetes==27.2.0 +MarkupSafe==2.1.1 +oauth2client==4.1.3 +oauthlib==3.2.2 +protobuf==4.25.0 +psycopg2-binary==2.9.5 +pyasn1==0.5.0 +pyasn1-modules==0.3.0 +pycryptodome==3.17 +pyhocon==0.3.54 +pyparsing==3.1.0 +python-dateutil==2.9.0 +python-string-utils==1.0.0 +pytz==2022.6 +PyYAML==6.0.1 +requests==2.28.0 +requests-oauthlib==2.0.0 +retrying==1.3.3 +rsa==4.8 +ruamel.yaml==0.17.22 +six==1.16.0 +typing-extensions==4.5.0 +tzlocal==4.3 +uritemplate==4.1.1 +urllib3==1.26.17 +websocket-client==1.4.2 +Werkzeug==3.0.3 
+zipp==3.6.0 \ No newline at end of file diff --git a/go/cmd/main.go b/go/cmd/main.go new file mode 100644 index 0000000..448584c --- /dev/null +++ b/go/cmd/main.go @@ -0,0 +1,57 @@ +// Copyright 2024-2025 NetCracker Technology Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "flag" + + "github.com/Netcracker/pgskipper-backup-daemon/pkg/config" + "github.com/Netcracker/pgskipper-backup-daemon/pkg/util" + + "github.com/Netcracker/pgskipper-backup-daemon/pkg/azure" + "github.com/Netcracker/pgskipper-backup-daemon/pkg/k8s" +) + +var ( + restoreId = flag.String("restore_id", "", "Id of restore operation") + restoreTime = flag.String("restore_time", "", "Time for restore database from") + restoreFolder = flag.String("restore_folder", "", "Folder to save restore statuses") + restoreAsSeparate = flag.String("restore_as_separate", "false", "Flag to skip update external service") + geoRestore = flag.String("geo_restore", "false", "Flag to perform geo restore") + subnet = flag.String("subnet", "false", "override subnet for restored instance") +) + +func main() { + flag.Parse() + restoreCfg := config.NewRestoreConfig(*restoreId, *restoreTime, *restoreFolder, *restoreAsSeparate, *geoRestore, *subnet) + restoreClient := azure.NewRestoreClientWithRestoreConfig(restoreCfg) + + util.ConfigureAzLogging() + + restoreClient.RestoreDatabase() + + if restoreCfg.RestoreAsSeparate() { + return + } + + err := k8s.UpdateExternalService(restoreCfg.NewServerName()) + if err != nil { + panic(err) + } + err = k8s.UpdateExternalCM(restoreCfg.ServerName(), restoreCfg.NewServerName()) + if err != nil { + panic(err) + } +} diff --git a/go/go.mod b/go/go.mod new file mode 100644 index 0000000..74dae14 --- /dev/null +++ b/go/go.mod @@ -0,0 +1,65 @@ +module github.com/Netcracker/pgskipper-backup-daemon + +go 1.19 + +require ( + github.com/Azure/azure-sdk-for-go/sdk/azcore v1.4.0 + github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.2.2 + github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/postgresql/armpostgresqlflexibleservers/v2 v2.1.0 + go.uber.org/zap v1.24.0 + k8s.io/api v0.26.0 + k8s.io/apimachinery v0.26.0 + k8s.io/client-go v0.26.0 + sigs.k8s.io/controller-runtime v0.14.1 +) + +require ( + github.com/Azure/azure-sdk-for-go/sdk/internal v1.2.0 // indirect + github.com/AzureAD/microsoft-authentication-library-for-go v0.9.0 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/emicklei/go-restful/v3 v3.9.0 // indirect + github.com/evanphx/json-patch/v5 v5.6.0 // indirect + github.com/go-logr/logr v1.2.3 // indirect + github.com/go-openapi/jsonpointer v0.19.5 // indirect + github.com/go-openapi/jsonreference v0.20.0 // indirect + github.com/go-openapi/swag v0.19.14 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/golang-jwt/jwt/v4 v4.5.0 // indirect + github.com/golang/protobuf v1.5.2 // indirect + github.com/google/gnostic v0.5.7-v3refs // indirect + github.com/google/go-cmp v0.5.9 // indirect + github.com/google/gofuzz v1.1.0 // 
indirect + github.com/google/uuid v1.3.0 // indirect + github.com/imdario/mergo v0.3.9 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect + github.com/mailru/easyjson v0.7.6 // indirect + github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/spf13/pflag v1.0.5 // indirect + go.uber.org/atomic v1.7.0 // indirect + go.uber.org/multierr v1.6.0 // indirect + golang.org/x/crypto v0.14.0 // indirect + golang.org/x/net v0.17.0 // indirect + golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect + golang.org/x/sys v0.13.0 // indirect + golang.org/x/term v0.13.0 // indirect + golang.org/x/text v0.13.0 // indirect + golang.org/x/time v0.3.0 // indirect + google.golang.org/appengine v1.6.7 // indirect + google.golang.org/protobuf v1.28.1 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v2 v2.4.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/klog/v2 v2.80.1 // indirect + k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 // indirect + k8s.io/utils v0.0.0-20221128185143-99ec85e7a448 // indirect + sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect + sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect + sigs.k8s.io/yaml v1.3.0 // indirect +) diff --git a/go/go.sum b/go/go.sum new file mode 100644 index 0000000..b72491c --- /dev/null +++ b/go/go.sum @@ -0,0 +1,527 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= +cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= +cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= +cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= +cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= +cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= +cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4= +cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= +cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc= +cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk= +cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs= +cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOYc= +cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY= +cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= +cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= +cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= +cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= +cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= +cloud.google.com/go/bigquery 
v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= +cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= +cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= +cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= +cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= +cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= +cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= +cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= +cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= +cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= +cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= +cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= +dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.4.0 h1:rTnT/Jrcm+figWlYz4Ixzt0SJVR2cMC8lvZcimipiEY= +github.com/Azure/azure-sdk-for-go/sdk/azcore v1.4.0/go.mod h1:ON4tFdPTwRcgWEaVDrN3584Ef+b7GgSJaXxe5fW9t4M= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.2.2 h1:uqM+VoHjVH6zdlkLF2b6O0ZANcHoj3rO0PoQ3jglUJA= +github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.2.2/go.mod h1:twTKAa1E6hLmSDjLhaCkbTMQKc7p/rNLU40rLxGEOCI= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.2.0 h1:leh5DwKv6Ihwi+h60uHtn6UWAxBbZ0q8DwQVMzf61zw= +github.com/Azure/azure-sdk-for-go/sdk/internal v1.2.0/go.mod h1:eWRD7oawr1Mu1sLCawqVc0CUiF43ia3qQMxLscsKQ9w= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/postgresql/armpostgresqlflexibleservers/v2 v2.1.0 h1:8taYXqep2de3/KFzXp8JG1ZfL/OY8VEpvr9brYLF/zE= +github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/postgresql/armpostgresqlflexibleservers/v2 v2.1.0/go.mod h1:8Anbzn23yMdpl2DDY4qnFEPY9Vf6CoLphIt4mX59McI= +github.com/AzureAD/microsoft-authentication-library-for-go v0.9.0 h1:UE9n9rkJF62ArLb1F3DEjRt8O3jLwMWdSoypKV4f3MU= +github.com/AzureAD/microsoft-authentication-library-for-go v0.9.0/go.mod h1:kgDmCTgBzIEPFElEF+FK0SdjAor06dRq2Go927dnQ6o= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE= +github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= +github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= +github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= 
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dnaeon/go-vcr v1.1.0 h1:ReYa/UBrRyQdant9B4fNHGoCNKw6qh6P0fsdGmZpR7c= +github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= +github.com/emicklei/go-restful/v3 v3.9.0 h1:XwGDlfxEnQZzuopoqxwSEllNcCOM9DhhFyhFIIGKwxE= +github.com/emicklei/go-restful/v3 v3.9.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/evanphx/json-patch/v5 v5.6.0 h1:b91NhWfaz02IuVxO9faSllyAtNXHMPkC5J8sJCLunww= +github.com/evanphx/json-patch/v5 v5.6.0/go.mod h1:G79N1coSVB93tBe7j6PhzjmR3/2VvlbKOFpnXhI9Bw4= +github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY= +github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= +github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0= +github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/zapr v1.2.3 h1:a9vnzlIBPQBBkeaR9IuMUfmVOrQlkoC4YfPoFkX3T7A= +github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= +github.com/go-openapi/jsonpointer v0.19.5 h1:gZr+CIYByUqjcgeLXnQu2gHYQC9o73G2XUeOFYEICuY= +github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= +github.com/go-openapi/jsonreference v0.20.0 h1:MYlu0sBgChmCfJxxUKZ8g1cPWFOB37YSZqewK7OKeyA= +github.com/go-openapi/jsonreference v0.20.0/go.mod h1:Ag74Ico3lPc+zR+qjn4XBUmXymS4zJbYVCZmcgkasdo= +github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= +github.com/go-openapi/swag v0.19.14 h1:gm3vOOXfiuw5i9p5N9xJvfjvuofpyvLA9Wr6QfK5Fng= +github.com/go-openapi/swag v0.19.14/go.mod h1:QYRuS/SOXUCsnplDa677K7+DxSOj6IPNl/eQntq43wQ= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang-jwt/jwt/v4 v4.5.0 h1:7cYmW1XlMY7h7ii7UhUyChSgS5wUJEnm9uZVTGqOWzg= +github.com/golang-jwt/jwt/v4 v4.5.0/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache 
v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= +github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= +github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/gnostic v0.5.7-v3refs h1:FhTMOKj2VhjpouxvWJAV1TL304uMlb9zcDqkl6cEI54= +github.com/google/gnostic v0.5.7-v3refs/go.mod h1:73MKFl6jIHelAJNaBGFzt3SPtZULs9dYrGFt8OiIsHQ= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= 
+github.com/google/gofuzz v1.1.0 h1:Hsa8mG0dQ46ij8Sl2AYJDUv1oA9/d6Vk+3LG99Oe02g= +github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= +github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= +github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= +github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= +github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= +github.com/imdario/mergo v0.3.9 h1:UauaLniWCFHWd+Jp9oCEkTBj8VO/9DKg3PV3VCNMDIg= +github.com/imdario/mergo v0.3.9/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= +github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= +github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug 
v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mailru/easyjson v0.7.6 h1:8yTIVnZgCoiM1TgqoeTl+LfU5Jg6/xL3QhGQnimLYnA= +github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= +github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= +github.com/onsi/ginkgo/v2 v2.6.0 h1:9t9b9vRUbFq3C4qKFCGkVuq/fIHji802N1nrtkh1mNc= +github.com/onsi/gomega v1.24.1 h1:KORJXNNTzJXzu4ScJWssJfJMnJ+2QJqhoQSRwNlze9E= +github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8 h1:KoWmjvw+nsYOo29YJK9vDA65RGE3NrOnUtO7a+RF9HU= +github.com/pkg/browser v0.0.0-20210911075715-681adbf594b8/go.mod h1:HKlIX3XHQyzLZPlr7++PzdhaXEj94dEiJgZDTsxEqUI= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.14.0 h1:nJdhIvne2eSX/XRAFV9PcvFFRbrjbcTUj0VP62TMhnw= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= +github.com/prometheus/common v0.37.0 h1:ccBbHCgIiT9uSoFY0vX8H3zsNR5eLt17/RQLUvn8pXE= +github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo= +github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify 
v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= +go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= +go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= +go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk= +go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= +go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= +go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= +go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc= +golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= +golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= +golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= +golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= +golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= +golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= 
+golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= +golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= +golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= +golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= +golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= +golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net 
v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= +golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b h1:clP8eMhB30EHdc0bd2Twtq6kgU7yl5ub2cQLSdrv1Dg= +golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys 
v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210616045830-e2b7044e8c71/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek= +golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= +golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text 
v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod 
h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200122220014-bf1340f18c4a/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200227222343-706bc42d1f0d/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= +golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= +golang.org/x/tools v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8= +golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200618134242-20370b0cb4b2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200729194436-6467de6f59a7/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20200804011535-6c149bb5ef0d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20200825202427-b303f430e36d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gomodules.xyz/jsonpatch/v2 v2.2.0 h1:4pT439QV83L+G9FkcCriY6EkpcK6r6bK+A5FBUMI7qY= +google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= +google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= +google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api 
v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= +google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= +google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= +google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM= +google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz5138Fc= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= +google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= +google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA= +google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod 
h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= +google.golang.org/genproto v0.0.0-20200515170657-fc4c6c6a6587/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA= +google.golang.org/genproto v0.0.0-20200729003335-053ba62fc06f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20200804131852-c06518451d9c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20200825200019-8632dd797987/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/genproto v0.0.0-20201019141844-1ed22bb0c154/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= +google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.28.0/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60= +google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= +google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= +google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.28.1 
h1:d0NfwRgPtno5B1Wa6L2DAG+KivqkdutMf1UhdNx175w= +google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= +honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= +honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= +k8s.io/api v0.26.0 h1:IpPlZnxBpV1xl7TGk/X6lFtpgjgntCg8PJ+qrPHAC7I= +k8s.io/api v0.26.0/go.mod h1:k6HDTaIFC8yn1i6pSClSqIwLABIcLV9l5Q4EcngKnQg= +k8s.io/apiextensions-apiserver v0.26.0 h1:Gy93Xo1eg2ZIkNX/8vy5xviVSxwQulsnUdQ00nEdpDo= +k8s.io/apimachinery v0.26.0 h1:1feANjElT7MvPqp0JT6F3Ss6TWDwmcjLypwoPpEf7zg= +k8s.io/apimachinery v0.26.0/go.mod h1:tnPmbONNJ7ByJNz9+n9kMjNP8ON+1qoAIIC70lztu74= +k8s.io/client-go v0.26.0 h1:lT1D3OfO+wIi9UFolCrifbjUUgu7CpLca0AD8ghRLI8= +k8s.io/client-go v0.26.0/go.mod h1:I2Sh57A79EQsDmn7F7ASpmru1cceh3ocVT9KlX2jEZg= +k8s.io/klog/v2 v2.80.1 h1:atnLQ121W371wYYFawwYx1aEY2eUfs4l3J72wtgAwV4= +k8s.io/klog/v2 v2.80.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= +k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 h1:+70TFaan3hfJzs+7VK2o+OGxg8HsuBr/5f6tVAjDu6E= +k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280/go.mod h1:+Axhij7bCpeqhklhUTe3xmOn6bWxolyZEeyaFpjGtl4= +k8s.io/utils v0.0.0-20221128185143-99ec85e7a448 h1:KTgPnR10d5zhztWptI952TNtt/4u5h3IzDXkdIMuo2Y= +k8s.io/utils v0.0.0-20221128185143-99ec85e7a448/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= +rsc.io/quote/v3 v3.1.0/go.mod 
h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= +rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= +sigs.k8s.io/controller-runtime v0.14.1 h1:vThDes9pzg0Y+UbCPY3Wj34CGIYPgdmspPm2GIpxpzM= +sigs.k8s.io/controller-runtime v0.14.1/go.mod h1:GaRkrY8a7UZF0kqFFbUKG7n9ICiTY5T55P1RiE3UZlU= +sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k= +sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= +sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= +sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= +sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/go/pkg/azure/client.go b/go/pkg/azure/client.go new file mode 100644 index 0000000..1a85501 --- /dev/null +++ b/go/pkg/azure/client.go @@ -0,0 +1,335 @@ +// Copyright 2024-2025 NetCracker Technology Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package azure + +import ( + "context" + "fmt" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + server "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/postgresql/armpostgresqlflexibleservers/v2" + "github.com/Netcracker/pgskipper-backup-daemon/pkg/config" + "github.com/Netcracker/pgskipper-backup-daemon/pkg/k8s" + "github.com/Netcracker/pgskipper-backup-daemon/pkg/util" + "go.uber.org/zap" +) + +const ( + configFile = "/app/config/config.json" + currentStatusFile = "status" + statusSuccessful = "Successful" + statusInProgress = "In Progress" + statusFailed = "Failed" + + resourceGroup = "resourceGroup" + subscriptionId = "subscriptionId" + tenantId = "tenantId" + clientSecret = "clientSecret" + clientId = "clientId" + + restoreConfigCM = "external-restore-config" + mirrorCM = "mirror-config" + DRCM = "dr-config" +) + +var ( + logger = util.Logger +) + +type Client struct { + clientConfig + *config.RestoreConfig + creds *azidentity.ClientSecretCredential +} + +const ( + TypeTagKey = "type" + + DrMode = "dr" + MirrorMode = "mirror" +) + +type Status struct { + ServerName string `json:"serverName"` + NewServerName string `json:"newServerName"` + Status string `json:"status"` + RestoreId string `json:"restoreId"` + StatusFolder string `json:"-"` +} + +type clientConfig map[string]string + +func (c clientConfig) SubscribtionId() string { + return c[subscriptionId] +} + +func NewRestoreClientWithRestoreConfig(restoreCfg *config.RestoreConfig) *Client { + config, err := util.ReadConfigFile(configFile) + if err != nil { + panic(err) + } + + return &Client{ + clientConfig: config, + RestoreConfig: restoreCfg, + } +} + +func (azCl *Client) clientInfo(name string) string { + return azCl.clientConfig[name] +} + +func (azCl *Client) getStatus(status string) Status { + return Status{ + ServerName: 
azCl.ServerName(), + NewServerName: azCl.NewServerName(), + Status: status, + RestoreId: azCl.RestoreID(), + StatusFolder: azCl.StatusFolder(), + } +} + +func (azCl *Client) RestoreDatabase() { + status := azCl.getStatus(statusInProgress) + defer checkErrorStatus(status) + + ctx := context.Background() + client := azCl.getAzureClient() + oldServer, err := client.Get(ctx, azCl.ResourceGroup(), azCl.ServerName(), nil) + if err != nil { + logger.Error("cannot connect to Azure server", zap.Error(err)) + panic(err) + } + setStatus(status) + + azCl.checkRestoreValidity(oldServer) + + azCl.createNewServerFromRP(ctx, client, oldServer) + err = azCl.updateNewServerParameters(ctx, client, oldServer) + if err != nil { + panic(err) + } + + if azCl.StopSource() { + azCl.stopServer(ctx, client) + } + + if azCl.IsGeoRestore() { + dataToUpdate := make(map[string]string) + dataToUpdate[config.GeoHaKey] = string(*oldServer.Properties.HighAvailability.Mode) + dataToUpdate[config.GeoSubnetKey] = *oldServer.Properties.Network.DelegatedSubnetResourceID + dataToUpdate[config.GeoLocationKey] = azCl.Location() + dataToUpdate[config.GeoRGKey] = azCl.ResourceGroup() + dataToUpdate[config.GeoPrivateDNSZone] = *oldServer.Properties.Network.PrivateDNSZoneArmResourceID + dataToUpdate[config.MainLocationKey] = azCl.GeoLocation() + k8s.UpdateCMData(restoreConfigCM, dataToUpdate) + } + + status.Status = statusSuccessful + setStatus(status) +} + +func (azCl *Client) checkRestoreValidity(oldServer server.ServersClientGetResponse) { + isMirror, isDr := azCl.getSiteModes() + // check mirror restrictions + if isMirror { + logger.Info("Mirror config is found, only mirror restore of source server and restore for mirror server are allowed") + tagActualVal, tagIsPresent := oldServer.Tags[TypeTagKey] + isMirrorServer := tagIsPresent && *tagActualVal == MirrorMode + if isMirrorServer { + logger.Info("Pg server for restore is a mirror") + } else { + logger.Info("Pg server for restore is not a mirror") + } + if !isMirrorServer && !azCl.IsMirrorRestore() { + errorMsg := fmt.Sprintf("Cannot perform only mirror restore on %s from mirror side", *oldServer.Name) + logger.Error(errorMsg) + panic(errorMsg) + } + } + // check dr restrictions + if isDr { + logger.Info("Dr config is found, only dr restore of source server and restore for dr server are allowed") + tagActualVal, tagIsPresent := oldServer.Tags[TypeTagKey] + isDrServer := tagIsPresent && *tagActualVal == DrMode + if isDrServer { + logger.Info("Pg server for restore is a dr") + } else { + logger.Info("Pg server for restore is not a dr") + } + if !isDrServer && !azCl.IsGeoRestore() { + errorMsg := fmt.Sprintf("Cannot perform only dr restore on %s from dr side", *oldServer.Name) + logger.Error(errorMsg) + panic(errorMsg) + } + } +} + +func (azCl *Client) getSiteModes() (isMirror bool, isDR bool) { + isMirror, err := k8s.IsEnvTypeCmExist(mirrorCM) + if err != nil { + panic(err) + } + + isDR, err = k8s.IsEnvTypeCmExist(DRCM) + if err != nil { + panic(err) + } + + return +} + +func (azCl *Client) getAzureClient() *server.ServersClient { + client, err := server.NewServersClient(azCl.clientInfo(subscriptionId), azCl.getAzureCreds(), nil) + if err != nil { + logger.Error(fmt.Sprintf("failed to get server client: %v", err)) + panic(err) + } + return client +} + +func (azCl *Client) getAzureCreds() *azidentity.ClientSecretCredential { + if azCl.creds == nil { + creds, err := azidentity.NewClientSecretCredential(azCl.clientInfo(tenantId), azCl.clientInfo(clientId), 
azCl.clientInfo(clientSecret), nil) + if err != nil { + logger.Error(fmt.Sprintf("failed to obtain a credential: %v", err)) + panic(err) + } + azCl.creds = creds + } + return azCl.creds +} + +func (azCl *Client) createNewServerFromRP(ctx context.Context, client *server.ServersClient, oldServer server.ServersClientGetResponse) { + logger.Info(fmt.Sprintf("Starting restore of database %s with new name %s by point in time %s", *oldServer.Name, azCl.NewServerName(), azCl.RestoreTime())) + locationName := *oldServer.Location + avZone := oldServer.Properties.AvailabilityZone + restoreType := server.CreateModePointInTimeRestore + subnet := oldServer.Properties.Network.DelegatedSubnetResourceID + backup := oldServer.Properties.Backup + resourceGroup := azCl.ResourceGroup() + privateDnsZone := oldServer.Properties.Network.PrivateDNSZoneArmResourceID + // Select restore mode + if azCl.IsGeoRestore() { + locationName = azCl.GeoLocation() + resourceGroup = azCl.GeoResourceGroup() + restoreType = server.CreateModeGeoRestore + subnet = to.Ptr(azCl.GeoSubnet()) + avZone = nil + if len(azCl.GeoPrivateDNSZone()) != 0 { + logger.Info(fmt.Sprintf("geo restore in progress and dnsZone is specified, "+ + "overriding it to: %s", azCl.GeoPrivateDNSZone())) + privateDnsZone = to.Ptr(azCl.GeoPrivateDNSZone()) + } + // Disable Geo-Redundant backup on restored server due to Azure restrictions + disabledEnum := server.GeoRedundantBackupEnumDisabled + backup.GeoRedundantBackup = &disabledEnum + } else if azCl.IsMirrorRestore() { + resourceGroup = azCl.MirrorResourceGroup() + mirrorSubnet := azCl.MirrorSubnet() + subnet = &mirrorSubnet + if len(azCl.MirrorPrivateDNSZone()) != 0 { + logger.Info(fmt.Sprintf("mirror enabled and dnsZone is specified, "+ + "overriding it to: %s", azCl.MirrorPrivateDNSZone())) + privateDnsZone = to.Ptr(azCl.MirrorPrivateDNSZone()) + } + } + + newServer := server.Server{ + Location: &locationName, + Properties: &server.ServerProperties{ + AvailabilityZone: avZone, + Backup: backup, + CreateMode: &restoreType, + PointInTimeUTC: to.Ptr(func() time.Time { t, _ := time.Parse(time.RFC3339Nano, azCl.RestoreTime()); return t }()), + SourceServerResourceID: to.Ptr(fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.DBforPostgreSQL/flexibleServers/%s", azCl.clientInfo(subscriptionId), azCl.ResourceGroup(), *oldServer.Name)), + Network: &server.Network{ + DelegatedSubnetResourceID: subnet, + PrivateDNSZoneArmResourceID: privateDnsZone, + PublicNetworkAccess: oldServer.Properties.Network.PublicNetworkAccess, + }, + }, + } + + poller, err := client.BeginCreate(ctx, + resourceGroup, + azCl.NewServerName(), + newServer, + nil) + + if err != nil { + logger.Error("cannot create new Flexible Postgres database", zap.Error(err)) + panic(err) + } + + resCreate, err := poller.PollUntilDone(ctx, nil) + if err != nil { + panic(err) + } + + _ = resCreate +} + +func (azCl *Client) stopServer(ctx context.Context, client *server.ServersClient) { + logger.Info(fmt.Sprintf("stopping the server %s", azCl.ServerName())) + pollerStop, err := client.BeginStop(ctx, + azCl.ResourceGroup(), + azCl.ServerName(), + nil) + if err != nil { + logger.Warn(fmt.Sprintf("error during \"Stop server\" operation starting for the server %s", azCl.ServerName()), zap.Error(err)) + return + } + + resStop, err := pollerStop.PollUntilDone(ctx, nil) + if err != nil { + logger.Warn(fmt.Sprintf("cannot stop the server %s", azCl.ServerName()), zap.Error(err)) + return + } + _ = resStop +} + +func checkErrorStatus(status Status) { + 
if r := recover(); r != nil { + status.Status = statusFailed + setStatus(status) + panic(r) + } +} + +func setStatus(status Status) { + statusFilePath := fmt.Sprintf("%s/%s", status.StatusFolder, status.RestoreId) + err := util.WriteFile(statusFilePath, status) + if err != nil { + logger.Error("Cannot write to status file", zap.Error(err)) + } + + setCurrentStatus(status) +} + +// TODO expire +func setCurrentStatus(status Status) { + statusPath := fmt.Sprintf("%s/%s", status.StatusFolder, currentStatusFile) + if status.Status == statusInProgress { + err := util.WriteFile(statusPath, status) + if err != nil { + logger.Error("Cannot write to status file", zap.Error(err)) + } + } else { + util.DeleteFile(statusPath) + } +} diff --git a/go/pkg/azure/parameters.go b/go/pkg/azure/parameters.go new file mode 100644 index 0000000..7f2d36e --- /dev/null +++ b/go/pkg/azure/parameters.go @@ -0,0 +1,74 @@ +// Copyright 2024-2025 NetCracker Technology Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package azure + +import ( + "context" + "fmt" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + server "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/postgresql/armpostgresqlflexibleservers/v2" + "go.uber.org/zap" +) + +func (azCl *Client) updateNewServerParameters(ctx context.Context, client *server.ServersClient, oldServer server.ServersClientGetResponse) error { + HAMode := oldServer.Properties.HighAvailability.Mode + tags := oldServer.Tags + resourceGroup := azCl.ResourceGroup() + if azCl.IsMirrorRestore() { + resourceGroup = azCl.MirrorResourceGroup() + logger.Info("turning off HA, because we do override of subnet") + HAMode = to.Ptr(server.HighAvailabilityModeDisabled) + tags[TypeTagKey] = to.Ptr(MirrorMode) + } else if azCl.IsGeoRestore() { + resourceGroup = azCl.GeoResourceGroup() + geoHaModeStr := azCl.GeoHA() + if geoHaModeStr != "" { + logger.Info("changing HA mode, due to configuration parameters") + geoHaMode := server.HighAvailabilityMode(geoHaModeStr) + HAMode = &geoHaMode + } + tags[TypeTagKey] = to.Ptr(DrMode) + } + retentionDays := oldServer.Properties.Backup.BackupRetentionDays + logger.Info(fmt.Sprintf("Setting new server HA parameters: mode=%s, backup retension days=%d", *HAMode, *retentionDays)) + poller, err := client.BeginUpdate(ctx, + resourceGroup, + azCl.NewServerName(), + server.ServerForUpdate{ + Properties: &server.ServerPropertiesForUpdate{ + HighAvailability: &server.HighAvailability{ + Mode: HAMode, + }, + Backup: &server.Backup{BackupRetentionDays: retentionDays}, + }, + Tags: tags, + }, + nil) + + if err != nil { + logger.Error("cannot update new Flexible Postgres database", zap.Error(err)) + panic(err) + } + + resUpdate, err := poller.PollUntilDone(ctx, nil) + if err != nil { + logger.Error("error during polling HA update", zap.Error(err)) + panic(err) + } + + _ = resUpdate + + return nil +} diff --git a/go/pkg/config/restore_config.go b/go/pkg/config/restore_config.go new file mode 100644 index 0000000..1039ea9 --- /dev/null +++ 
b/go/pkg/config/restore_config.go
@@ -0,0 +1,176 @@
+// Copyright 2024-2025 NetCracker Technology Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package config
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/Netcracker/pgskipper-backup-daemon/pkg/k8s"
+	"github.com/Netcracker/pgskipper-backup-daemon/pkg/util"
+)
+
+const (
+	restoreConfigCM = "external-restore-config"
+
+	MirrorSubnetKey      = "mirror.subnet"
+	MirrorPrivateDNSZone = "mirror.privateDnsZone"
+	MirrorRGKey          = "mirror.resourceGroup"
+	GeoSubnetKey         = "geo.subnet"
+	GeoLocationKey       = "geo.location"
+	GeoRGKey             = "geo.resourceGroup"
+	GeoPrivateDNSZone    = "geo.privateDnsZone"
+	GeoHaKey             = "geo.ha"
+	MainLocationKey      = "main.location"
+	MainRGKey            = "main.resourceGroup"
+)
+
+var (
+	logger = util.Logger
+)
+
+// RestoreConfig holds the configuration of a restore operation.
+// If subnet is specified, the restore is treated as a mirror restore: the source server is not
+// stopped, an additional tag is added, and the subnet is overridden from the ConfigMap.
+// restoreAsSeparate restores the Azure PostgreSQL server as a separate instance: the source
+// server is not stopped and the Kubernetes service is not updated.
+type RestoreConfig struct {
+	serverName        string
+	newServerName     string
+	restoreId         string
+	restoreTime       string
+	statusFolder      string
+	restoreAsSeparate string
+	geoRestore        string
+	mirror            string
+	cmData            map[string]string
+}
+
+func (cfg *RestoreConfig) NewServerName() string {
+	return cfg.newServerName
+}
+
+func (cfg *RestoreConfig) ServerName() string {
+	return cfg.serverName
+}
+
+func (cfg *RestoreConfig) RestoreAsSeparate() bool {
+	return strings.ToLower(cfg.restoreAsSeparate) == "true"
+}
+
+func (cfg *RestoreConfig) IsMirrorRestore() bool {
+	return strings.ToLower(cfg.mirror) == "true"
+}
+
+func (cfg *RestoreConfig) MirrorResourceGroup() string {
+	return cfg.cmData[MirrorRGKey]
+}
+
+func (cfg *RestoreConfig) MirrorSubnet() string {
+	return cfg.cmData[MirrorSubnetKey]
+}
+
+func (cfg *RestoreConfig) MirrorPrivateDNSZone() string {
+	return cfg.cmData[MirrorPrivateDNSZone]
+}
+
+func (cfg *RestoreConfig) ResourceGroup() string {
+	return cfg.cmData[MainRGKey]
+}
+
+func (cfg *RestoreConfig) Location() string {
+	return cfg.cmData[MainLocationKey]
+}
+
+func (cfg *RestoreConfig) IsGeoRestore() bool {
+	return strings.ToLower(cfg.geoRestore) == "true"
+}
+
+func (cfg *RestoreConfig) GeoSubnet() string {
+	return cfg.cmData[GeoSubnetKey]
+}
+
+func (cfg *RestoreConfig) GeoHA() string {
+	return cfg.cmData[GeoHaKey]
+}
+
+func (cfg *RestoreConfig) GeoLocation() string {
+	return cfg.cmData[GeoLocationKey]
+}
+
+func (cfg *RestoreConfig) GeoResourceGroup() string {
+	return cfg.cmData[GeoRGKey]
+}
+
+func (cfg *RestoreConfig) GeoPrivateDNSZone() string {
+	return cfg.cmData[GeoPrivateDNSZone]
+}
+
+func (cfg *RestoreConfig) RestoreID() string {
+	return cfg.restoreId
+}
+
+func (cfg *RestoreConfig) StatusFolder() string {
+	return cfg.statusFolder
+}
+
+func (cfg *RestoreConfig) RestoreTime() string {
+	return cfg.restoreTime
+}
+
+// StopSource reports whether the source server has to be stopped, which is done only for a regular
restore +// restoreAsSeparate=false subnet=false +func (cfg *RestoreConfig) StopSource() bool { + return !cfg.RestoreAsSeparate() && !cfg.IsMirrorRestore() && !cfg.IsGeoRestore() +} + +func NewRestoreConfig(restoreId, restoreTime, statusFolder, restoreAsSeparate, geoRestore, subnet string) *RestoreConfig { + serverName, err := k8s.GetServerName() + if err != nil { + panic(err) + } + newServerName := GenerateNewName(serverName) + + var cmData map[string]string + cm, err := k8s.GetCM(restoreConfigCM) + if err != nil { + panic(err) + } + cmData = cm.Data + + logger.Info(fmt.Sprintf("restoreAsSeparate: %s, subnet: %s, geoRestore %s", restoreAsSeparate, subnet, geoRestore)) + return &RestoreConfig{ + serverName: serverName, + newServerName: newServerName, + restoreId: restoreId, + restoreTime: restoreTime, + statusFolder: statusFolder, + restoreAsSeparate: restoreAsSeparate, + geoRestore: geoRestore, + mirror: subnet, + cmData: cmData, + } +} + +func GenerateNewName(oldServerName string) string { + now := time.Now() + timeString := strconv.Itoa(int(now.Unix())) + if strings.HasSuffix(oldServerName, "-restored") { + oldServerSlice := strings.Split(oldServerName, "-") + oldServerSlice[len(oldServerSlice)-2] = timeString + return strings.Join(oldServerSlice, "-") + } + return fmt.Sprintf("%s-%s-restored", oldServerName, timeString) +} diff --git a/go/pkg/k8s/client.go b/go/pkg/k8s/client.go new file mode 100644 index 0000000..f7dea25 --- /dev/null +++ b/go/pkg/k8s/client.go @@ -0,0 +1,173 @@ +// Copyright 2024-2025 NetCracker Technology Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
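+// Note (illustrative, not part of the original change): RestoreConfig in go/pkg/config/restore_config.go
+// above distinguishes three restore flavours, and only a regular restore stops the source server.
+// Conceptually the decision reduces to
+//
+//	stopSource := !restoreAsSeparate && !mirrorRestore && !geoRestore
+//
+// which is what RestoreConfig.StopSource() returns. GenerateNewName, in turn, derives the target
+// server name by appending a Unix timestamp and the "-restored" suffix, so restoring an already
+// restored server only swaps the timestamp instead of stacking suffixes.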
+
+package k8s
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/Netcracker/pgskipper-backup-daemon/pkg/util"
+	"go.uber.org/zap"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/wait"
+	crclient "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+const (
+	serviceName     = "pg-patroni"
+	externalCMName  = "postgres-external"
+	connectionKey   = "connectionName"
+	mirrorSubnetKey = "mirror.subnet"
+)
+
+var (
+	logger    = util.Logger
+	k8sClient crclient.Client
+	namespace = ""
+)
+
+func init() {
+	var err error
+	namespace, err = util.GetNamespace()
+	if err != nil {
+		panic(err)
+	}
+	k8sClient, err = util.CreateClient()
+	if err != nil {
+		logger.Error("Can't create k8s client", zap.Error(err))
+		panic(err)
+	}
+}
+
+func UpdateExternalService(newDBName string) error {
+	extService, err := getExternalService()
+	if err != nil {
+		return err
+	}
+	extService.Spec.ExternalName = newDBName + ".postgres.database.azure.com"
+	if err = k8sClient.Update(context.TODO(), extService); err != nil {
+		logger.Error("Can't update external service", zap.Error(err))
+		return err
+	}
+	return nil
+}
+
+func UpdateExternalCM(dbName, newDBName string) error {
+	extCM, err := GetCM(externalCMName)
+	if err != nil {
+		return err
+	}
+	extCM.Data[connectionKey] = strings.ReplaceAll(extCM.Data[connectionKey], dbName, newDBName)
+	if err = k8sClient.Update(context.TODO(), extCM); err != nil {
+		logger.Error("Can't update external CM", zap.Error(err))
+		return err
+	}
+	return nil
+}
+
+func getExternalService() (*corev1.Service, error) {
+	extService := &corev1.Service{}
+	var k8sErr error
+	wait.PollImmediate(3*time.Second, 5*time.Minute, func() (done bool, err error) {
+		k8sErr = k8sClient.Get(context.TODO(), types.NamespacedName{
+			Name: serviceName, Namespace: namespace,
+		}, extService)
+		if k8sErr != nil {
+			logger.Error("Error while obtaining external service info, retrying...", zap.Error(k8sErr))
+			return false, nil
+		}
+		return true, nil
+	})
+	if k8sErr != nil {
+		logger.Error("Timeout exceeded", zap.Error(k8sErr))
+	}
+	return extService, k8sErr
+}
+
+func GetServerName() (string, error) {
+	cm, err := GetCM(externalCMName)
+	if err != nil {
+		return "", err
+	}
+	fullName := cm.Data[connectionKey]
+	splitStr := strings.Split(fullName, ".")
+	return splitStr[0], nil
+}
+
+func GetCM(cmName string) (*corev1.ConfigMap, error) {
+	cm := &corev1.ConfigMap{}
+	var k8sErr error
+	wait.PollImmediate(3*time.Second, 5*time.Minute, func() (done bool, err error) {
+		k8sErr = k8sClient.Get(context.TODO(), types.NamespacedName{
+			Name: cmName, Namespace: namespace,
+		}, cm)
+		if k8sErr != nil {
+			logger.Error(fmt.Sprintf("Error while obtaining %s ConfigMap, retrying...", cmName), zap.Error(k8sErr))
+			return false, nil
+		}
+		return true, nil
+	})
+	if k8sErr != nil {
+		logger.Error("Timeout exceeded", zap.Error(k8sErr))
+	}
+	return cm, k8sErr
+}
+
+// This CM must be created by Velero in case the environment has a specific type
+func IsEnvTypeCmExist(cmName string) (bool, error) {
+	err := wait.PollImmediate(3*time.Second, 5*time.Minute, func() (done bool, err error) {
+		cm := &corev1.ConfigMap{}
+		k8sErr := k8sClient.Get(context.TODO(), types.NamespacedName{
+			Name: cmName, Namespace: namespace,
+		}, cm)
+		if k8sErr != nil {
+			if errors.IsNotFound(k8sErr) {
+				logger.Info(fmt.Sprintf("ConfigMap %s is not found", cmName))
+				return false, k8sErr
+			}
+			logger.Error(fmt.Sprintf("Error while obtaining %s ConfigMap, retrying...", cmName), zap.Error(k8sErr))
+
return false, nil + } + return true, nil + }) + if err != nil { + if errors.IsNotFound(err) { + return false, nil + } + return false, err + } + return true, nil +} + +func UpdateCMData(cmName string, dataForUpdate map[string]string) error { + cm, err := GetCM(cmName) + if err != nil { + return err + } + for key, value := range dataForUpdate { + cm.Data[key] = value + } + + err = k8sClient.Update(context.TODO(), cm) + if err != nil { + logger.Error("Error during updating ext CM", zap.Error(err)) + return err + } + return nil +} diff --git a/go/pkg/util/transport.go b/go/pkg/util/transport.go new file mode 100644 index 0000000..0f46c2e --- /dev/null +++ b/go/pkg/util/transport.go @@ -0,0 +1,76 @@ +// Copyright 2024-2025 NetCracker Technology Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package util + +import ( + "fmt" + "net/http" + "time" + + "k8s.io/client-go/rest" + "k8s.io/client-go/transport" +) + +const retryNumEnv = "K8S_CLIENT_RETRY_NUM" + +type Retry struct { + nums int + http.RoundTripper +} + +func (r *Retry) RoundTrip(req *http.Request) (resp *http.Response, err error) { + logger := Logger + logger.Debug(fmt.Sprintf("executing request: %s with retries", req.URL)) + for i := 0; i < r.nums; i++ { + logger.Debug(fmt.Sprintf("attempt: %v of %v", i+1, r.nums)) + resp, err = r.RoundTripper.RoundTrip(req) + if err != nil && resp == nil { + logger.Error(fmt.Sprintf("received not retryable error %v", err)) + return + } else if err != nil || resp.StatusCode >= 500 { + logger.Warn(fmt.Sprintf("received retryable error %v, with status code: %d ,retrying...", err, resp.StatusCode)) + time.Sleep(10 * time.Second) + continue + } else { + logger.Debug("executed successfully, exiting") + return + } + } + logger.Warn("no more retries, giving up...") + return +} + +func UpdateTransport(cfg *rest.Config) { + tc, err := cfg.TransportConfig() + if err != nil { + panic(err) + } + rt, err := transport.New(tc) + if err != nil { + panic(err) + } + cfg.Transport = getRetryTransport(rt) + + // Security moved to transport level + cfg.TLSClientConfig = rest.TLSClientConfig{} +} + +func getRetryTransport(rt http.RoundTripper) *Retry { + retryNum := getEnvInt(retryNumEnv, 10) + return &Retry{ + nums: retryNum, + RoundTripper: rt, + } +} diff --git a/go/pkg/util/util.go b/go/pkg/util/util.go new file mode 100644 index 0000000..68a0620 --- /dev/null +++ b/go/pkg/util/util.go @@ -0,0 +1,150 @@ +// Copyright 2024-2025 NetCracker Technology Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package util + +import ( + "encoding/json" + "fmt" + "os" + "strconv" + + azlog "github.com/Azure/azure-sdk-for-go/sdk/azcore/log" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + crclient "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/config" +) + +const nsPath = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" + +var ( + Logger = getLogger() + AzureLogEvents = []azlog.Event{azlog.EventRequest, azlog.EventResponse, azlog.EventLRO, azlog.EventRetryPolicy} +) + +func getLogger() *zap.Logger { + atom := zap.NewAtomicLevel() + encoderCfg := zap.NewProductionEncoderConfig() + encoderCfg.TimeKey = "timestamp" + encoderCfg.EncodeTime = zapcore.ISO8601TimeEncoder + + logger := zap.New(zapcore.NewCore( + zapcore.NewJSONEncoder(encoderCfg), + zapcore.Lock(os.Stdout), + atom, + )) + defer logger.Sync() + return logger +} + +func CreateClient() (crclient.Client, error) { + clientConfig, err := config.GetConfig() + if err != nil { + return nil, err + } + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + UpdateTransport(clientConfig) + client, err := crclient.New(clientConfig, crclient.Options{Scheme: scheme}) + if err != nil { + return nil, err + } + return client, nil +} + +func ReadConfigFile(filePath string) (map[string]string, error) { + file, err := os.ReadFile(filePath) + if err != nil { + Logger.Error(fmt.Sprintf("cannot read config file: %s", filePath)) + return nil, err + } + var externalConfig map[string]string + err = json.Unmarshal(file, &externalConfig) + if err != nil { + Logger.Error(fmt.Sprintf("Failed to parse config file %s", filePath), zap.Error(err)) + return nil, err + } + return externalConfig, nil +} + +func WriteFile(filePath string, data interface{}) error { + file, err := os.Create(filePath) + if err != nil { + Logger.Error(fmt.Sprintf("cannot create %s file", filePath)) + return err + } + defer file.Close() + + dataStr, err := json.Marshal(data) + if err != nil { + Logger.Error("Cannot convert data to string") + return err + } + _, err = file.Write(dataStr) + if err != nil { + Logger.Error(fmt.Sprintf("cannot write into %s file", filePath)) + return err + } + return nil +} + +func DeleteFile(filepath string) { + err := os.Remove(filepath) + if err != nil { + Logger.Error("cannot remove current task file") + } +} + +func ConfigureAzLogging() { + azlog.SetListener(func(cls azlog.Event, msg string) { + prefixLog := "received event: " + switch cls { + case azlog.EventLRO: + prefixLog = "long running event: " + case azlog.EventRetryPolicy: + prefixLog = "retry event: " + } + Logger.Info("[azlog]" + prefixLog + msg) + }) + azlog.SetEvents(AzureLogEvents...) 
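+	// SetListener routes the log events emitted by the Azure SDK into the shared zap logger,
+	// and SetEvents narrows the subscription to the request/response, long-running-operation
+	// and retry-policy events listed in AzureLogEvents; other SDK event classes are dropped.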
+} + +func getEnvInt(name string, defValue int) int { + val := os.Getenv(name) + if val == "" { + return defValue + } + intVal, err := strconv.ParseInt(val, 10, 32) + if err != nil { + Logger.Warn(fmt.Sprintf("cannot parse %s env variable, value %d will be used", name, defValue)) + return defValue + } + return int(intVal) +} + +func GetNamespace() (string, error) { + return ReadFromFile(nsPath) +} + +func ReadFromFile(filePath string) (string, error) { + dat, err := os.ReadFile(filePath) + if err != nil { + return "", err + } + return string(dat), nil +} diff --git a/maintenance/recovery/pg_back_rest_recovery.py b/maintenance/recovery/pg_back_rest_recovery.py new file mode 100644 index 0000000..446b2da --- /dev/null +++ b/maintenance/recovery/pg_back_rest_recovery.py @@ -0,0 +1,306 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json + +from utils_oc import OpenshiftOrchestrator, OpenshiftPyClient +from recovery import cleanup_pg_pod_data_directory +from kubernetes import client +from kubernetes.client import configuration +from kubernetes.client import rest +from kubernetes.stream import stream +from kubernetes.stream.ws_client import ERROR_CHANNEL, STDOUT_CHANNEL, STDERR_CHANNEL +import kubernetes +from utils_common import retry + +import time +import requests +import logging +import os +import yaml + + +log = logging.getLogger() +log.setLevel(logging.DEBUG) +retry_count = 60 + +skip_tls_verify = os.getenv("OC_SKIP_TLS_VERIFY", "true") +oc_openshift_url = os.getenv("OC_OPENSHIFT_URL", None) +oc_project = os.getenv("POD_NAMESPACE", None) +pg_cluster_name = os.getenv("PG_CLUSTER_NAME", None) + +pg_dir = "/var/lib/pgsql/data" +pg_data_dir = "{}/postgresql_${{POD_IDENTITY}}".format(pg_dir) + + +class PgBackRestRecovery(): + def __init__(self): + try: + from kubernetes import config as k8s_config + k8s_config.load_incluster_config() + log.info("Using pyclient") + self.oc_client = OpenshiftPyClient() + self.oc_client.use_token(oc_url=oc_openshift_url, oc_token="", project=oc_project, skip_tls_verify=skip_tls_verify) + self._api_client = None + self.project = os.getenv("POD_NAMESPACE") + self.oc_orch = OpenshiftOrchestrator(self.oc_client, retry_count) + try: + self.apps_api = client.AppsV1Api(self._api_client) + except: + self.apps_api = client.AppsV1beta1Api(self._api_client) + + except Exception as e: + log.exception("Failed to create OpenshiftPyClient") + + + def get_patroni_replicas_ip(self, statefulsets): + r = requests.get("pg-patroni:8008") + return r.json()['replication'] + + + def get_patroni_statefulsets(self): + stateful_sets = self.apps_api.list_namespaced_stateful_set(self.project).items + return stateful_sets + + def patch_statefulset_cmd(self, stateful_set, stateful_set_name, cmd): + + log.info(f'Going to set {stateful_set_name} with {cmd} command') + + stateful_set.spec.template.spec.containers[0].command = cmd + + self.apps_api.patch_namespaced_stateful_set(stateful_set_name, self.project, stateful_set) + + + 
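+    # Note on get_patroni_replicas_ip above: requests needs an explicit URL scheme, so the call is
+    # presumably meant to hit the Patroni REST API of the leader along the lines of
+    #
+    #     r = requests.get("http://pg-patroni:8008")
+    #
+    # and read the "replication" list the leader reports for its standbys; without the "http://"
+    # prefix requests raises MissingSchema. The pg-patroni host matches the service name used
+    # elsewhere in this repository, and 8008 is Patroni's default REST port.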
def scale_statefulset(self, stateful_set_name, replicas): + + log.info(f'Going to set {stateful_set_name} with {replicas} replicas') + + self.apps_api.patch_namespaced_stateful_set_scale(stateful_set_name, self.project, {"spec": {"replicas": replicas}}) + + def patch_configmap(self, config_map_name, config_map): + + log.info(f'Going to replace {config_map_name}') + core_api = client.CoreV1Api(self._api_client) + core_api.replace_namespaced_config_map(config_map_name, self.project, config_map) + + + def delete_master_cm(self): + try: + log.info("Delete leader cm") + body = client.V1DeleteOptions() + core_api = client.CoreV1Api(self._api_client) + core_api.delete_namespaced_config_map("patroni-leader", self.project, body=body,) + except kubernetes.client.rest.ApiException as e: + if e.reason == "Not Found": + return + else: + return e + + def clean_patroni_cm(self): + log.info("Delete initialize key") + cmaps = client.CoreV1Api(self._api_client).list_namespaced_config_map(self.project).items + for cm in cmaps: + if cm.metadata.name == 'patroni-config': + if "initialize" in cm.metadata.annotations: + del cm.metadata.annotations["initialize"] + self.patch_configmap(cm.metadata.name, cm) + return cm + + def get_config_map(self, name): + cmaps = client.CoreV1Api(self._api_client).list_namespaced_config_map(self.project).items + for cm in cmaps: + if cm.metadata.name == name: + return cm + + def get_template_cm(self): + template_cm = self.get_config_map(f"patroni-{pg_cluster_name}.config.yaml") + if not template_cm: + log.info(f"Can't find patroni-{pg_cluster_name}.config.yaml, trying to find {pg_cluster_name}-patroni.config.yaml") + template_cm = self.get_config_map(f"{pg_cluster_name}-patroni.config.yaml") + return template_cm + + def create_custom_bootstrap_method(self, target, restore_type): + template_cm = self.get_template_cm() + patroni_config_data = template_cm.data['patroni-config-template.yaml'] + log.info(f"Data {patroni_config_data}") + dict_data = yaml.load(patroni_config_data,Loader=yaml.FullLoader) + dict_data["bootstrap"]["pgbackrest"] = { + "command": f"pgbackrest --stanza=patroni --delta --type={restore_type} --target='{target}' --target-action=promote restore", + "keep_existing_recovery_conf": "True", + "no_params": "True" + } + dict_data["bootstrap"]["method"] = "pgbackrest" + template_cm.data['patroni-config-template.yaml'] = yaml.dump(dict_data) + self.patch_configmap(template_cm.metadata.name, template_cm) + + def clean_custom_bootstrap_method(self): + log.info(f"Pop custom bootstrap method from patroni-template config map") + template_cm = self.get_template_cm() + patroni_config_data = template_cm.data['patroni-config-template.yaml'] + dict_data = yaml.load(patroni_config_data,Loader=yaml.FullLoader) + dict_data["bootstrap"].pop("method", None) + template_cm.data['patroni-config-template.yaml'] = yaml.dump(dict_data) + self.patch_configmap(template_cm.metadata.name, template_cm) + + def upgrade_stanza(self): + # wait for leader to upgrade stanza + from requests.adapters import HTTPAdapter, Retry + logging.basicConfig(level=logging.DEBUG) + s = requests.Session() + retries = Retry(total=3600, backoff_factor=1, status_forcelist=[ 502, 503, 504 ]) + s.mount('http://', HTTPAdapter(max_retries=retries)) + r = s.post("http://pgbackrest:3000/upgrade") + log.info(f'{r.status_code}, {r.text}') + + def restore_pod(self, pod_name, backup_id): + # wait for pod to restore + log.info(f'Will invoke restore command for pod {pod_name}') + from requests.adapters import HTTPAdapter, Retry + 
logging.basicConfig(level=logging.DEBUG) + s = requests.Session() + retries = Retry(total=3600, backoff_factor=1, status_forcelist=[ 502, 503, 504 ]) + s.mount('http://', HTTPAdapter(max_retries=retries)) + r = s.post(f"http://{pod_name}.backrest-headless:3000/restore", data={'backupId':backup_id}) + log.info(f'{r.status_code}, {r.text}') + return r.status_code + + def perform_restore(self): + backup_id = '' if not os.getenv("SET") else os.getenv("SET") + restore_type = '' if not os.getenv("TYPE") else os.getenv("TYPE") + target = '' if not os.getenv("TARGET") else os.getenv("TARGET") + + + http_codes = {} + stateful_sets = self.get_patroni_statefulsets() + for stateful_set in stateful_sets: + + stateful_set_name = stateful_set.metadata.name + pod_name = stateful_set.metadata.name + "-0" + + cmd = ["sh", "-c", "while true ; do sleep 3600; done"] + + self.patch_statefulset_cmd(stateful_set, stateful_set_name, cmd) + time.sleep(5) + #Just in case when pods could be scaled 0 + self.scale_statefulset(stateful_set_name,1) + if not self.wait_for_pod(pod_name, attempts=5): + raise Exception("Pod {} is not ready".format(pod_name)) + self.cleanup_patroni_data(pod_name, stateful_set_name, False) + time.sleep(15) + + + stateful_sets = self.get_patroni_statefulsets() + for stateful_set in stateful_sets: + stateful_set_name = stateful_set.metadata.name + pod_name = stateful_set.metadata.name + "-0" + if target: + log.info(f"Target has been provided, so starting PITR for pod {pod_name}") + self.create_custom_bootstrap_method(target, restore_type) + self.clean_patroni_cm() + self.delete_master_cm() + else: + log.info(f"Starting full restore procedure for pod {pod_name}") + http_codes[stateful_set_name] = self.restore_pod(pod_name, backup_id) + + if target or http_codes[stateful_set_name] == 200: + log.info(f"Restore return 200 http state, so remove sleep cmd") + self.patch_statefulset_cmd(stateful_set, stateful_set_name, []) + time.sleep(5) + self.scale_statefulset(stateful_set_name,0) + time.sleep(15) + self.scale_statefulset(stateful_set_name,1) + else: + log.error(f'Restore procedure for {stateful_set_name} ends with error. It was {http_codes[stateful_set_name]}') + return + if not self.wait_for_pod(pod_name, attempts=5): + raise Exception("Pod {} is not ready".format(pod_name)) + self.clean_custom_bootstrap_method() + self.upgrade_stanza() + print("Done") + + + + + def cleanup_patroni_data(self, pod_name, container_name, preserve_old_files): + log.info("Try to cleanup data directory for pod {}".format(pod_name)) + if preserve_old_files == "yes": + self.oc_client.oc_exec(pod_name, container_name, "sh -c 'mv {} {}_backup_$(date +%s); ls -ll {}'".format(pg_data_dir, pg_data_dir, pg_dir)) + log.info("Old files were preserved on volume. Cleanup if needed.") + self.oc_exec(pod_name, container_name, " sh -c 'rm -rf {}; mkdir {}; chmod 700 {}' ".format(pg_data_dir, pg_data_dir, pg_data_dir)) + + + @retry(tries=30, delay=5) + def oc_exec(self, pod_id, container_name, command): + log.debug(f"Try to execute '{command}' on pod {pod_id}") + core_api = client.CoreV1Api(self._api_client) + + exec_command = [ + '/bin/sh', '-c', command + ] + + try: + resp = stream(core_api.connect_get_namespaced_pod_exec, + pod_id, + self.project, + container=container_name, + command=exec_command, + stderr=True, stdin=False, + stdout=True, tty=False, _preload_content=True, _request_timeout=60) + + log.info(f"Command executed. 
Result: {resp}") + + if resp: + log.debug(f"Command output: {resp}") + if "No such file or directory" in resp or "cannot remove" in resp: + log.info("Directory already cleaned up or removal issue detected.") + return resp # Exit early if the directory is already cleaned up or a removal issue was detected + + return resp + + except Exception as e: + log.error(f"Exception occurred while executing command: {e}") + raise + + + def wait_for_pod(self, pod_name, attempts=5): + for i in range(1, attempts): + time.sleep(15) + status = self.get_pod_status(pod_name) + log.info("Pod state is {}".format(status)) + if status and status.lower() == "running": + return True + else: + log.info("Retrying...") + log.info("Can't get pod {} status".format(pod_name)) + return False + + + def get_pod_status(self, pod_name): + core_api = client.CoreV1Api(self._api_client) + pods = core_api.list_namespaced_pod(self.project).items + for x in pods: + if x.metadata.name == pod_name: + return x.status.phase + else: + log.info("Pod {} not found".format(pod_name)) + +if __name__ == "__main__": + recovery = PgBackRestRecovery() + recovery.perform_restore() + + + + + + diff --git a/maintenance/recovery/recovery.py b/maintenance/recovery/recovery.py new file mode 100644 index 0000000..528cd4c --- /dev/null +++ b/maintenance/recovery/recovery.py @@ -0,0 +1,790 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import random +import requests +import string +import json +import yaml +import logging +import time +import sys +import utils_oc +import utils_pg +from utils_common import RecoveryException +from utils_dcs import PatroniDCS, PatroniDCSEtcd, PatroniDCSKubernetes + +import dateutil.parser +import dateutil.zoneinfo +import dateutil.tz + +from multiprocessing.pool import ThreadPool +import concurrent.futures +import asyncio + + +retry_count = 60 +pg_dir = "/var/lib/pgsql/data" +pg_data_dir = "{}/postgresql_${{POD_IDENTITY}}".format(pg_dir) + +log = logging.getLogger() +log.setLevel(logging.DEBUG) + +ch = logging.StreamHandler(sys.stdout) +ch.setLevel(logging.INFO) +formatter = logging\ + .Formatter('%(asctime)s - %(thread)d - %(name)s:%(funcName)s#%(lineno)d - %(levelname)s - %(message)s') +ch.setFormatter(formatter) +log.addHandler(ch) + +ch = logging.FileHandler("recovery.debug.{}.log".format(int(time.time())), mode='w', encoding=None, delay=False) +ch.setLevel(logging.DEBUG) +formatter = logging \ + .Formatter('%(asctime)s - %(thread)d - %(name)s:%(funcName)s#%(lineno)d - %(levelname)s - %(message)s') +ch.setFormatter(formatter) +log.addHandler(ch) + +RECOVERY_EXCEPTION_NO_RESTORE_COMMAND = """FAILURE: cannot find restore_command in dcs and in patroni config template. +Put valid value into dcs (etcd or '{}-config' configmap) on path 'postgresql.recovery_conf.restore_command'. 
+If dcs uninitialized put it into configmap 'patroni-{}.config.yaml' on path 'bootstrap.dcs.postgresql.recovery_conf.restore_command' +Value can be '' if wal_archive is disabled and 'curl -v -S -f --connect-timeout 3 postgres-backup-daemon:8082/archive/get?filename=%f -o %p' if enabled. +""" + + +class PoolLogger(object): + + def __init__(self, callable): + self.__callable = callable + + def __call__(self, *args, **kwargs): + try: + result = self.__callable(*args, **kwargs) + except Exception as e: + log.exception("Task failed with error") + raise e + return result + + +class LoggingPool(ThreadPool): + + def apply_async(self, func, args=(), kwds={}, callback=None): + return ThreadPool.apply_async(self, PoolLogger(func), args, kwds, callback) + +# todo[anin] OpenshiftPyClient cannot work in parallel! + + + + +def cleanup_pg_pod_data_directory(oc_client, pod_id, preserve_old_files): + log.info("Try to cleanup data directory for pod {}".format(pod_id)) + if preserve_old_files == "yes": + oc_client.oc_exec(pod_id, "sh -c 'mv {} {}_backup_$(date +%s); ls -ll {}'".format(pg_data_dir, pg_data_dir, pg_dir)) + log.info("Old files were preserved on volume. Cleanup if needed.") + oc_client.oc_exec(pod_id, " sh -c 'rm -rf {}; mkdir {}; chmod 700 {}' ".format(pg_data_dir, pg_data_dir, pg_data_dir)) + + +def update_configmap(oc_client, dcs_storage, recovery_pod_id, pg_cluster_name, restore_version, + recovery_target_timeline, recovery_target_inclusive, + recovery_target_name, recovery_target_time, recovery_target_xid, recovery_target,): + log.info("Update configmap with actual bootstrap method") + patroni_config_cm = oc_client.get_configmap("patroni-{}.config.yaml".format(pg_cluster_name)) + if not patroni_config_cm: + log.info("Can't find patroni-{}.config.yaml, trying to find {}-patroni.config.yaml".format(pg_cluster_name, pg_cluster_name)) + patroni_config_cm = oc_client.get_configmap("{}-patroni.config.yaml".format(pg_cluster_name)) + patroni_config_data = patroni_config_cm['data']['patroni-config-template.yaml'] + log.debug("Patroni config template from configmap: \n {}".format(patroni_config_data)) + patroni_config = yaml.safe_load(patroni_config_data) + log.debug("Patroni config template from configmap (parsed): \n {}".format(patroni_config)) + + patroni_dsc, patroni_dsc_data = dcs_storage.get_dcs_config(oc_client, recovery_pod_id, pg_cluster_name) + + targets = { + "recovery_target_name": recovery_target_name, + "recovery_target_time": recovery_target_time, + "recovery_target_xid": recovery_target_xid, + "recovery_target": recovery_target + } + + recovery_target_key = "" + recovery_target_value = "" + + for (key, value) in list(targets.items()): + if value: + recovery_target_key = key + recovery_target_value = value + + patroni_config["bootstrap"]["daemon_recovery"] = { + "command": "/daemon-recovery.sh --restore-version={}".format(restore_version), + "recovery_conf": { + "recovery_target_action": "promote", + "recovery_target_timeline": recovery_target_timeline, + "recovery_target_inclusive": recovery_target_inclusive + } + } + + restore_command = get_restore_command(oc_client, dcs_storage, pg_cluster_name, recovery_pod_id) + # This is a WA for PG12. 
In PG12 next IF condition were introduced: + # https://github.com/postgres/postgres/blob/REL_12_STABLE/src/backend/access/transam/xlog.c#L5397 + if not restore_command: + restore_command = "echo" + patroni_config["bootstrap"]["daemon_recovery"]["recovery_conf"]["restore_command"] = restore_command + + if recovery_target_key: + patroni_config["bootstrap"]["daemon_recovery"]["recovery_conf"][recovery_target_key] = recovery_target_value + + patroni_config["bootstrap"]["method"] = "daemon_recovery" + log.debug("Patroni config template after update: {}".format(patroni_config)) + updated_patroni_config = yaml.safe_dump(patroni_config, default_flow_style=False, encoding="utf-8", allow_unicode=True) + log.debug("Patroni config template after update: {}".format(updated_patroni_config)) + # encoded_patroni_config = json.dumps(updated_patroni_config) + # log.debug("Encoded patroni config template after update: {}".format(encoded_patroni_config)) + patroni_config_cm['data']['patroni-config-template.yaml'] = updated_patroni_config.decode() + oc_client.apply_object(patroni_config_cm) + + +def remove_bootstrap_method(oc_client, pg_cluster_name): + log.info("Remove bootstrap method from configmap") + patroni_config_cm = oc_client.get_configmap("patroni-{}.config.yaml".format(pg_cluster_name)) + if not patroni_config_cm: + log.info("Can't find patroni-{}.config.yaml, trying to find {}-patroni.config.yaml".format(pg_cluster_name, pg_cluster_name)) + patroni_config_cm = oc_client.get_configmap("{}-patroni.config.yaml".format(pg_cluster_name)) + patroni_config_data = patroni_config_cm['data']['patroni-config-template.yaml'] + log.debug("Patroni config template from configmap: \n {}".format(patroni_config_data)) + patroni_config = yaml.safe_load(patroni_config_data) + log.debug("Patroni config template from configmap (parsed): \n {}".format(patroni_config)) + + patroni_config["bootstrap"].pop("method", None) + log.debug("Patroni config template after update: {}".format(patroni_config)) + updated_patroni_config = yaml.safe_dump(patroni_config, default_flow_style=False, encoding="utf-8", allow_unicode=True) + log.debug("Patroni config template after update: {}".format(updated_patroni_config)) + + patroni_config_cm['data']['patroni-config-template.yaml'] = updated_patroni_config.decode() + oc_client.apply_object(patroni_config_cm) + + + +def perform_bootstrap_recovery(oc_client, oc_orch, pg, dcs_storage, + pg_depl_name, pg_cluster_name, + preserve_old_files, restore_version, + recovery_target_timeline, recovery_target_inclusive, + recovery_target_name, recovery_target_time, recovery_target_xid, recovery_target, + deployment_type): + ''' + :type oc_client: OpenshiftClient + :type oc_orch: OpenshiftOrchestrator + :type pg: PostgresqlClient + :type dcs_storage: PatroniDCS + :param pg_depl_name: + :param pg_cluster_name: + :param preserve_old_files: + :param restore_version: + :param recovery_target_timeline: + :param recovery_target_inclusive: + :param recovery_target_name: + :param recovery_target_time: + :param recovery_target_xid: + :param recovery_target: + :param deployment_type: + :return: + ''' + log.info("Start recovery procedure using bootstrap config") + + log.info("Replace current postgresql deployments with test versions") + + pool = concurrent.futures.ThreadPoolExecutor(max_workers=4) + loop = asyncio.get_event_loop() + + if deployment_type == "dc": + deployments = oc_client.get_deployment_names(pg_depl_name) + tasks = [loop.run_in_executor( + pool, + oc_orch.replace_command_on_dc, + p, ["sh", "-c", 
"while true ; do sleep 3600; done"]) for p in deployments] + loop.run_until_complete(asyncio.gather(*tasks)) + elif deployment_type == "deployment": + log.info("Will set test version on postgresql deployments: {}".format(pg_depl_name)) + deployments = oc_client.get_deployment_names(pg_depl_name, deployment_type) + tasks = [loop.run_in_executor( + pool, + oc_orch.replace_command_on_deployment, + p, ["sh", "-c", "while true ; do sleep 3600; done"]) for p in deployments] + + loop.run_until_complete(asyncio.gather(*tasks)) + + elif deployment_type == "statefulset": + oc_orch.replace_command_on_statefulset("pg-patroni-node1", ["sh", "-c", "while true ; do sleep 3600; done"]) + oc_orch.replace_command_on_statefulset("pg-patroni-node2", ["sh", "-c", "while true ; do sleep 3600; done"]) + + + + initial_replicas_desc = oc_client.get_cluster_pods_desc(pg_cluster_name) + initial_replicas = [pod["metadata"]["name"] for pod in initial_replicas_desc] + + # in case of statefulset need to get number of replicas for scaling back + if deployment_type == "statefulset": + initial_replicas_num = oc_client.get_stateful_set_replicas_count("patroni") + + recovery_pod_id = list([pod for pod in initial_replicas_desc if not list([env for env in pod["spec"]["containers"][0]["env"] if env["name"] == "DR_MODE" and "value" in env and env["value"].lower() == "true"])])[0]["metadata"]["name"] + log.info("Will use for procedure pod {} from {}".format(recovery_pod_id, initial_replicas)) + + log.info("Update configmap") + + update_configmap( + oc_client, dcs_storage, recovery_pod_id, pg_cluster_name, restore_version, + recovery_target_timeline, recovery_target_inclusive, + recovery_target_name, recovery_target_time, recovery_target_xid, recovery_target) + + log.info("Cleanup initialization key") + dcs_storage.cleanup_initialization_key( + oc_client, pg_cluster_name, recovery_pod_id) + + log.info("Cleanup data directories") + for p in initial_replicas: + cleanup_pg_pod_data_directory( + oc_client, p, preserve_old_files) + + # restore command on deployments + if deployment_type == "dc": + recovery_dc = list([p for p in deployments if p in recovery_pod_id])[0] + log.info("Restore command for replica deployment and leave them shut down") + tasks = [loop.run_in_executor( + pool, + oc_orch.replace_command_on_dc, + p, None, False) for p in [d for d in deployments if d != recovery_dc]] + log.info("Restore command for deployment {}".format(recovery_dc)) + tasks.append(loop.run_in_executor( + pool, + oc_orch.replace_command_on_dc, + recovery_dc, None)) + + loop.run_until_complete(asyncio.gather(*tasks)) + + # determine new pod for recovery deployment + recovery_replicas = oc_client.get_replicas_names(recovery_dc) + recovery_pod_id = recovery_replicas[0] + elif deployment_type == "deployment": + deployments = oc_client.get_deployment_names(pg_depl_name, deployment_type) + log.info("Deployments: {}".format(deployments)) + recovery_dc = list([p for p in deployments if p in recovery_pod_id])[0] + log.info("Restore command for replica deployment and leave them shut down") + tasks = [loop.run_in_executor( + pool, + oc_orch.replace_command_on_deployment, + p, None, True) for p in [d for d in deployments if d != recovery_dc]] + log.info("Restore command for deployment {}".format(recovery_dc)) + loop.run_until_complete(asyncio.gather(*tasks)) + oc_orch.replace_command_on_deployment(recovery_dc, None) + time.sleep(60) + recovery_pod_id = oc_client.get_pods_by_label("app={}".format(pg_cluster_name))[0] + elif deployment_type == "statefulset": + 
# set command as None for statefulset + oc_orch.replace_command_on_statefulset("pg-patroni-node1", None, False) + oc_orch.replace_command_on_statefulset("pg-patroni-node2", None, False) + oc_orch.scale_stateful_set("pg-patroni-node1", 1) + recovery_pod_id = oc_client.get_cluster_pods(pg_cluster_name)[0] + log.info(recovery_pod_id) + # wait while pod is up + if not oc_client.is_pod_ready(recovery_pod_id, attempts=5): + raise Exception("Pod {} is not ready".format(recovery_pod_id)) + + log.info("Cleanup initialization key") + dcs_storage.cleanup_initialization_key( + oc_client, pg_cluster_name, recovery_pod_id) + + cleanup_pg_pod_data_directory(oc_client, recovery_pod_id, preserve_old_files) + + # wait while database will complete bootstrap and exit from recovery mode + log.info("Wait while patroni pod: {} will complete bootstrap from backup.".format(recovery_pod_id)) + oc_orch.wait_for_one_of_records_in_logs_since( + recovery_pod_id, + ["no action. I am ({}), the leader with the lock".format(recovery_pod_id)], # prefix can be "INFO:" or "[INFO][source=patroni]" + time.time(), + "Bootstrap from backup daemon is in progress.", + [" bootstrap in progress", " waiting for end of recovery after bootstrap"] + ) + log.info("Wait while postgres will exit from recovery mode.") + pg.wait_pg_recovery_complete(recovery_pod_id) + + log.info("Try to set password for postgres and replicator users") + + def split(line): + pos = line.find("=") + if pos > 0: + return line[0:pos], line[pos+1:] + else: + return line, "" + + pg_user, pg_password = oc_client.get_secret_data("postgres-credentials") + log.info( + pg.execute_local_query(recovery_pod_id, "ALTER USER {} WITH PASSWORD '{}'" + .format(pg_user, pg_password))) + replicator_user, replicator_password = oc_client.get_secret_data("replicator-credentials") + log.info( + pg.execute_local_query(recovery_pod_id, "ALTER USER {} WITH PASSWORD '{}'" + .format(replicator_user, replicator_password))) + + bootstrap_method_cleanup = loop.run_in_executor( + pool, + remove_bootstrap_method, + oc_client, + pg_cluster_name) + loop.run_until_complete(asyncio.gather(bootstrap_method_cleanup)) + # restore replicas + if deployment_type == "dc": + for dc in deployments: + if dc != recovery_dc: + log.info("Scale up dc {}".format(dc)) + oc_orch.ensure_scale(dc, 1) + oc_orch.wait_replicas(dc, 1, running=True) + elif deployment_type == "deployment": + for dc in deployments: + if dc != recovery_dc: + log.info("Scale up deployment {}".format(dc)) + oc_orch.ensure_scale(dc, 1, "deployment") + elif deployment_type == "statefulset": + oc_orch.scale_stateful_set("pg-patroni-node2", initial_replicas_num) + + # check if database working on all nodes + replicas = oc_client.get_cluster_pods(pg_cluster_name) + for pod_id in replicas: + if pod_id != recovery_pod_id: + cleanup_pg_pod_data_directory(oc_client, pod_id, preserve_old_files) + log.info("Wait while patroni on replica {} will complete bootstrap from master.".format(pod_id)) + if not oc_client.is_pod_ready(pod_id, attempts=5): + raise Exception("Pod {} is not ready".format(pod_id)) + oc_orch.wait_for_record_in_logs_since( + pod_id, + "no action. 
I am ({}), a secondary, and following a leader ({})".format(pod_id, recovery_pod_id), + time.time(), + "Bootstrap from master is in progress.", + " bootstrap from leader" + ) + log.info("Wait while replica database will start on pod {}.".format(pod_id)) + pg.wait_database(pod_id) + + + +def download_archive(oc_client, recovery_pod_id, restore_version): + if restore_version: + oc_client.oc_exec(recovery_pod_id, "sh -c 'cd {} ; curl -u postgres:\"$PG_ROOT_PASSWORD\" postgres-backup-daemon:8081/get?id={} | tar -xzf - '" + .format(pg_data_dir, restore_version)) + else: + oc_client.oc_exec(recovery_pod_id, "sh -c 'cd {} ; curl -u postgres:\"$PG_ROOT_PASSWORD\" postgres-backup-daemon:8081/get | tar -xzf - '" + .format(pg_data_dir)) + + +def update_string_value_in_map(map, key, value): + if value: + map[key] = value + else: + map.pop(key, None) + + +def prepare_recovery_conf(recovery_pod_id, patroni_dsc, + recovery_target_timeline, recovery_target_inclusive, + recovery_target_name, recovery_target_time, recovery_target_xid, recovery_target): + if "recovery_conf" not in patroni_dsc["postgresql"]: + patroni_dsc["postgresql"]["recovery_conf"] = {} + + recovery_conf = patroni_dsc["postgresql"]["recovery_conf"] + update_string_value_in_map(recovery_conf, "recovery_target_inclusive", recovery_target_inclusive) + update_string_value_in_map(recovery_conf, "recovery_target_timeline", recovery_target_timeline) + + update_string_value_in_map(recovery_conf, "recovery_target_name", recovery_target_name) + update_string_value_in_map(recovery_conf, "recovery_target_time", recovery_target_time) + update_string_value_in_map(recovery_conf, "recovery_target_xid", recovery_target_xid) + update_string_value_in_map(recovery_conf, "recovery_target", recovery_target) + +def is_recovery_target_specified(recovery_target_name, recovery_target_time, recovery_target_xid, recovery_target): + return recovery_target_name or recovery_target_time or recovery_target_xid or recovery_target + + +def validate_restore_version(backups, restore_version): + if not backups: + raise RecoveryException("FAILURE: cannot find any available backups.") + + if restore_version: + if restore_version not in backups: + raise RecoveryException("FAILURE: RESTORE_VERSION={} does not match any existing backup. 
" + "List of available backups: {}.".format(restore_version, backups)) + else: + log.info("Proceed with empty RESTORE_VERSION.") + + +def validate_recovery_target_parameters( + oc_client, dcs_storage, pg_cluster_name, backup_daemon_pod_id, restore_version, + recovery_target_name, recovery_target_time, recovery_target_xid, recovery_target): + ''' + + :type oc_client: OpenshiftClient + :type dcs_storage: PatroniDCS + :param pg_cluster_name: + :param backup_daemon_pod_id: + :param restore_version: + :param recovery_target_name: + :param recovery_target_time: + :param recovery_target_xid: + :param recovery_target: + :return: + ''' + if is_recovery_target_specified(recovery_target_name, recovery_target_time, recovery_target_xid, recovery_target) \ + and (not restore_version and not recovery_target_time): + raise RecoveryException("FAILURE: cannot perform PITR " + "without specified backup or recovery_target_time") + + if is_recovery_target_specified(recovery_target_name, recovery_target_time, recovery_target_xid, recovery_target): + archive_mode = get_dcs_config_params(oc_client, dcs_storage, pg_cluster_name, backup_daemon_pod_id, + ["postgresql", "parameters", "archive_mode"], use_template=True) + if archive_mode != "on": + raise RecoveryException("FAILURE: Cannot perform PITR recovery without WAL archive.") + + if not get_restore_command(oc_client, dcs_storage, pg_cluster_name, backup_daemon_pod_id): + raise RecoveryException("FAILURE: Cannot perform PITR recovery without restore_command.") + + +def check_if_replication_works(oc_client, pg, pg_cluster_name): + log.info("Start replication check") + replicas = oc_client.get_cluster_pods_desc(pg_cluster_name) + master_replicas = list([p for p in replicas if "pgtype" in p["metadata"]["labels"] and + p["metadata"]["labels"]["pgtype"] == "master"]) + + if len(master_replicas) > 1: + raise RecoveryException("FAILURE: Several masters detected. Healthy PostgreSQL cluster should have one master.") + + if len(master_replicas) == 0: + raise RecoveryException("FAILURE: Cannot find master. Healthy PostgreSQL cluster should have one master.") + + master_pod_id = master_replicas[0]["metadata"]["name"] + + id = int(time.time()) * 100000 + random.randint(1, 99999) + uuid = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(32)) + pg.execute_local_query(master_pod_id, "create table IF NOT EXISTS test_recovery (id bigint primary key not null, value text not null)") + pg.execute_local_query(master_pod_id, "insert into test_recovery values ({}, '{}')".format(id, uuid)) + resulted_uuid = pg.execute_local_query(master_pod_id, "select value from test_recovery where id={}".format(id)) + log.debug("Select result from master {}: {}".format(master_pod_id, resulted_uuid)) + + if resulted_uuid != uuid: + raise RecoveryException("FAILURE: Unexpected result '{}' on master while expected '{}'. 
" + "Master does not work or does not perform write operations.".format(resulted_uuid, uuid)) + + log.info("Record was inserted in master") + + replica_names = list([p["metadata"]["name"] for p in replicas]) + + for replica in replica_names: + if replica != master_pod_id: + query_result = None + for i in range(1, retry_count): + log.info("Try to check if database on pod {} receives data from master".format(replica)) + query_result = pg.execute_local_query(replica, "select value from test_recovery where id={}".format(id)) + log.debug("Query result from replica: {}".format(query_result)) + if query_result == uuid: + break + time.sleep(1) + + if query_result != uuid: + raise RecoveryException("FAILURE: Unexpected result '{}' while expected '{}'. " + "Replica was not able to receive changes from master.".format(query_result, uuid)) + + +def get_tzinfos(): + # todo[anin] implement for different versions of dateutils + if "get_zonefile_instance" in dir(dateutil.zoneinfo): + return dateutil.zoneinfo.get_zonefile_instance().zones + if "getzoneinfofile_stream" in dir(dateutil.zoneinfo): + return dateutil.zoneinfo.ZoneInfoFile(dateutil.zoneinfo.getzoneinfofile_stream()).zones + return {} + + +def enrich_backup(backup_info, tzinfos, cluster_tz_name): + backup_info["parsed"] = dateutil.parser.parse(backup_info["id"], tzinfos=tzinfos) + if not backup_info["parsed"].tzinfo: + backup_info["parsed"] = backup_info["parsed"].replace(tzinfo=tzinfos[cluster_tz_name]) + return backup_info + + +def safe_dict_get(data, path, default=None): + value = data + for x in path: + value = value.get(x) + if not value: + break + return value if value is not None else default + + +def get_dcs_config_params(oc_client, dcs_storage, pg_cluster_name, pod_id, path, use_template=True): + patroni_dsc, _ = dcs_storage.get_dcs_config(oc_client, pod_id, pg_cluster_name) + value = safe_dict_get(patroni_dsc, path) + if value is None and use_template: + log.warning("Cannot get '{}' from dcs. 
It is possible that dcs state was removed or lost.".format(path)) + patroni_template_cm = oc_client.get_configmap("patroni-{}.config.yaml".format(pg_cluster_name)) + if not patroni_template_cm: + log.info("Can't find patroni-{}.config.yaml, trying to find {}-patroni.config.yaml".format(pg_cluster_name, pg_cluster_name)) + patroni_template_cm = oc_client.get_configmap("{}-patroni.config.yaml".format(pg_cluster_name)) + patroni_template_data = patroni_template_cm.get("data", {}).get("patroni-config-template.yaml") + patroni_template = yaml.load(patroni_template_data) + log.debug("Patroni config template from configmap (parsed): \n {}".format(patroni_template)) + patroni_template_dcs = safe_dict_get(patroni_template, ["bootstrap", "dcs"], {}) + log.debug("Patroni dcs config template from configmap (parsed): \n {}".format(patroni_template_dcs)) + value = safe_dict_get(patroni_template_dcs, path) + return value + + +def get_restore_command(oc_client, dcs_storage, pg_cluster_name, backup_daemon_pod_id): + return get_dcs_config_params(oc_client, dcs_storage, pg_cluster_name, backup_daemon_pod_id, + ["postgresql", "recovery_conf", "restore_command"], use_template=True) + + +def perform_recovery(oc_openshift_url, oc_username, oc_password, oc_project, + pg_cluster_name, pg_depl_name, preserve_old_files, restore_version, + recovery_target_timeline, recovery_target_inclusive, + recovery_target_name, recovery_target_time, recovery_target_xid, recovery_target, + oc_path, oc_config_file, skip_tls_verify=False): + + # todo[anin] OpenshiftPyClient requires more testing (statefulsets for example) + # so leave OpenshiftShellClient as default for backward compatibility + try: + from kubernetes import config as k8s_config + k8s_config.load_incluster_config() + log.info("Using pyclient") + oc_client = utils_oc.OpenshiftPyClient() + oc_client.use_token(oc_url=oc_openshift_url, oc_token="", project=oc_project, skip_tls_verify=skip_tls_verify) + except Exception as e: + log.exception("Failed to create OpenshiftPyClient, proceeding with OpenshiftShellClient") + oc_client = utils_oc.OpenshiftShellClient(oc_path, oc_config_file) + oc_client.login(oc_url=oc_openshift_url, username=oc_username, password=oc_password, project=oc_project, + skip_tls_verify=skip_tls_verify) + + oc_orch = utils_oc.OpenshiftOrchestrator(oc_client, retry_count) + pg = utils_pg.PostgresqlClient(oc_client, retry_count) + + if oc_client.get_entity_safe("configmap", "{}-config".format(pg_cluster_name)): + log.info("Use kubernetes as DCS storage") + dcs_storage = PatroniDCSKubernetes() + elif oc_client.get_entity_safe("svc", "etcd"): + log.info("Use etcd as DCS storage") + dcs_storage = PatroniDCSEtcd() + else: + raise RecoveryException("Cannot find configmap {}-config or service etcd and guess dcs type." + .format(pg_cluster_name)) + deployment_type = "dc" + + # here we are trying to check if this is a deployment via operator + if oc_client.get_entity_safe("deployment", "postgres-backup-daemon"): + log.info("Deployments are used for Pods management") + deployment_type = "deployment" + + #Changed Order here of deployment check + if oc_client.get_entity_safe("statefulset", "patroni"): + log.info("Statefulset is used for Pod management") + deployment_type = "statefulset" + + deployment_type = "statefulset" + + log.info("Try to validate if backup daemon running") + backup_daemon_replicas = oc_client.get_postgres_backup_daemon_pod() + if not backup_daemon_replicas: + raise RecoveryException("FAILURE: cannot find backup daemon pod." 
+ "Recovery procedure requires running backup daemon pod.") + backup_daemon_pod_id = backup_daemon_replicas[0] + log.info("Found backup daemon pod {}".format(backup_daemon_pod_id)) + + log.info("Try to validate recovery target parameters") + validate_recovery_target_parameters( + oc_client, dcs_storage, pg_cluster_name, backup_daemon_pod_id, restore_version, + recovery_target_name, recovery_target_time, recovery_target_xid, recovery_target) + + log.info("Try to validate restore_command setting") + restore_command = get_restore_command(oc_client, dcs_storage, pg_cluster_name, backup_daemon_pod_id) + log.info("Recovery command printed in start ={}".format(restore_command)) + if restore_command is None: + raise RecoveryException(RECOVERY_EXCEPTION_NO_RESTORE_COMMAND.format(pg_cluster_name, pg_cluster_name)) + + ##Added backup status check here + log.info("Try to validate if the backup status is successful") + backup_status = check_backup_status(restore_version) + if not backup_status: + raise RecoveryException("FAILURE: Backup with id {} has an unsuccessful status. " + "Recovery cannot proceed.".format(restore_version)) + + + if restore_version: + log.info("Try to validate backup {} against list of backups from {}".format(restore_version, + backup_daemon_pod_id)) + + backup_list = requests.get("http://localhost:8081/list", auth=('postgres', os.getenv('POSTGRES_PASSWORD'))) + validate_restore_version(backup_list.json(), restore_version) + elif recovery_target_time: + log.info("Try to find backup id from specified recovery_target_time={}".format(recovery_target_time)) + backup_list = requests.get("http://localhost:8081/list", auth=('postgres', os.getenv('POSTGRES_PASSWORD'))) + cluster_tz_name = oc_client.oc_exec(backup_daemon_pod_id, 'date "+%Z"').strip() + log.debug("Cluster time zone: {}" + cluster_tz_name) + + tzinfos = get_tzinfos() + rt_time = dateutil.parser.parse(recovery_target_time, tzinfos=tzinfos) + if not rt_time.tzinfo: + rt_time = rt_time.replace(tzinfo=tzinfos[cluster_tz_name]) + + log.debug("Parsed time: {}".format(rt_time)) + possible_backup_list = list([p for p in [enrich_backup(p, tzinfos, cluster_tz_name) for p in [p for p in list(backup_list.json().values()) if not p["failed"]]] if p["parsed"] and p["parsed"] < rt_time]) + + if not possible_backup_list: + raise RecoveryException("FAILURE: Cannot find backup for specified recovery_target_time='{}'. " + "Try to specify restore_version manually. " + "Available backups: {}".format(recovery_target_time, backup_list.json())) + + possible_backup = max(possible_backup_list, key=lambda p: p["parsed"]) + restore_version = possible_backup["id"] + + log.info("Selected restore_version is {}".format(restore_version)) + else: + raise RecoveryException("FAILURE: Cannot perform recovery without restore_version and recovery_target_time. " + "Please specify at least one of them.") + + patroni_cm = oc_client.get_entity_safe("configmap", "patroni-{}.config.yaml".format(pg_cluster_name)) + if not patroni_cm: + log.info("Can't find patroni-{}.config.yaml, trying to find {}-patroni.config.yaml".format(pg_cluster_name, pg_cluster_name)) + patroni_cm = oc_client.get_entity_safe("configmap", "{}-patroni.config.yaml".format(pg_cluster_name)) + if not patroni_cm: + raise RecoveryException("FAILURE: Cannot find configmap patroni-{}.config.yaml. 
" + "Check if recovery scripts version complies with cluster version".format(pg_cluster_name)) + perform_bootstrap_recovery(oc_client, oc_orch, pg, dcs_storage, + pg_depl_name, pg_cluster_name, + preserve_old_files, restore_version, + recovery_target_timeline, recovery_target_inclusive, + recovery_target_name, recovery_target_time, recovery_target_xid, recovery_target, + deployment_type) + + if not os.getenv("SKIP_REPLICATION_CHECK", False): + check_if_replication_works(oc_client, pg, pg_cluster_name) + log.info("Recovery is completed successfully") + + +def check_backup_status(backup_id): + try: + url = f"http://postgres-backup-daemon:8080/backup/status/{backup_id}" + headers = {"Accept": "application/json"} + response = requests.get(url, headers=headers) + response_text = response.text + + log.info("Backup Status Check Result: %s", response_text) + + if "Backup Done" in response_text: + log.info("Backup Status: Backup done") + return True + else: + log.info("Backup Status: Backup not done") + raise Exception(f"Backup not done. Status: {response_text}") + + except requests.RequestException as e: + log.error("Error occurred while checking backup status: %s", e) + return False + + + +def prepare_parameters_and_perform_recovery(): + log.info("Start parameters preparation") + + # import env variables + oc_openshift_url = os.getenv("OC_OPENSHIFT_URL", None) + oc_username = os.getenv("OC_USERNAME", None) + oc_password = os.getenv("OC_PASSWORD", None) + oc_project = os.getenv("OC_PROJECT", None) + + pg_cluster_name = os.getenv("PG_CLUSTER_NAME", None) + pg_depl_name = os.getenv("PG_DEPL_NAME", None) + preserve_old_files = os.getenv("PRESERVE_OLD_FILES", "no") + + restore_version = os.getenv("RESTORE_VERSION", "") + + recovery_target_timeline = os.getenv("RECOVERY_TARGET_TIMELINE", "latest") + recovery_target_inclusive = os.getenv("RECOVERY_TARGET_INCLUSIVE", "true") + + recovery_target_name = os.getenv("RECOVERY_TARGET_NAME", None) + recovery_target_time = os.getenv("RECOVERY_TARGET_TIME", None) + recovery_target_xid = os.getenv("RECOVERY_TARGET_XID", None) + recovery_target = os.getenv("RECOVERY_TARGET", None) + + oc_path = os.getenv("OC_PATH", "oc") + skip_tls_verify = os.getenv("OC_SKIP_TLS_VERIFY", "true") + oc_config_file = os.getenv("OC_CONFIG_FILE", "./oc_config_file.yaml") + + # parse args + parser = argparse.ArgumentParser(description='Recovery procedure for Postgresql cluster') + parser.add_argument('--oc-openshift-url', dest='oc_openshift_url', default=None, help='address of openshift console') + parser.add_argument('--oc-username', dest='oc_username', default=None, help='user of openshift console') + parser.add_argument('--oc-password', dest='oc_password', default=None, help='password of openshift console') + parser.add_argument('--oc-project', dest='oc_project', default=None, help='address of openshift console') + + parser.add_argument('--pg-cluster-name', dest='pg_cluster_name', default=None, help='Postgresql cluster name') + parser.add_argument('--pg-depl-name', dest='pg_depl_name', default=None, + help='Template of postgresql deployment name like "pg-common-node"') + + parser.add_argument('--preserve-old-files', dest='preserve_old_files', default=None, choices=["yes", "no"], + help='If "yes" then store old files on volume otherwise remove old files.') + + parser.add_argument('--restore-version', dest='restore_version', default=None, + help='ID of backup to recovery procedure') + + parser.add_argument('--timeline', dest='recovery_target_timeline', default=None, + help='Only for 
point in time recovery. Desired timeline for recovery procedure')
+    parser.add_argument('--inclusive', dest='recovery_target_inclusive', default=None, choices=['true', 'false'],
+                        help='Only for point in time recovery. Specifies if recovery procedure should include specified recovery target')
+
+    parser.add_argument('--recovery-target-time', dest='recovery_target_time', default=None,
+                        help='Only for point in time recovery. Specifies recovery_target_time.')
+    parser.add_argument('--recovery-target-name', dest='recovery_target_name', default=None,
+                        help='Only for point in time recovery. Specifies recovery_target_name.')
+    parser.add_argument('--recovery-target-xid', dest='recovery_target_xid', default=None,
+                        help='Only for point in time recovery. Specifies recovery_target_xid.')
+    parser.add_argument('--recovery-target', dest='recovery_target', default=None, choices=['immediate'],
+                        help='Only for point in time recovery. Specifies recovery_target.')
+
+    parser.add_argument('--oc-path', dest='oc_path', default=None, help='path to oc client (with oc)')
+    parser.add_argument('--oc-config', dest='oc_config_file', default=None, help='path to oc client config file')
+    parser.add_argument('--oc-skip-tls-verify', dest='skip_tls_verify', default=None, choices=['true', 'false'],
+                        help='Set value to "true" to turn off ssl validation')
+
+    args = parser.parse_args()
+
+    # Note: this loop is meant to let CLI arguments override the environment-derived
+    # defaults above, but assigning into locals() has no effect inside a function in
+    # CPython, so in practice only the environment variables are applied.
+    for (key, value) in list(vars(args).items()):
+        if value and key in locals():
+            locals()[key] = value
+
+    if not pg_depl_name:
+        pg_depl_name = "pg-{}-node".format(pg_cluster_name)
+
+    log.info("Parameters were parsed")
+    log.debug("Local vars : {}".format(locals()))
+    log.debug("Global vars : {}".format(globals()))
+
+    try:
+        perform_recovery(oc_openshift_url, oc_username, oc_password, oc_project,
+                         pg_cluster_name, pg_depl_name,
+                         preserve_old_files, restore_version,
+                         recovery_target_timeline, recovery_target_inclusive,
+                         recovery_target_name, recovery_target_time, recovery_target_xid, recovery_target,
+                         oc_path, oc_config_file,
+                         # force_manual_recovery,
+                         skip_tls_verify=(True if skip_tls_verify == 'true' else False))
+    except RecoveryException as ex:
+        log.exception("Recovery procedure failed.")
+        log.error(str(ex))
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    prepare_parameters_and_perform_recovery()
diff --git a/maintenance/recovery/recovery.sh b/maintenance/recovery/recovery.sh
new file mode 100755
index 0000000..eb853a8
--- /dev/null
+++ b/maintenance/recovery/recovery.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Copyright 2024-2025 NetCracker Technology Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
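+#
+# Entry point of the recovery procedure: the script below loads the helper
+# functions and default environment variables, validates prerequisites
+# (jq, python3, PyYAML) and then runs recovery.py.
+#
+# A typical invocation looks like this (the backup id is only an illustrative
+# value; use an id returned by the backup daemon /list endpoint):
+#
+#   RESTORE_VERSION=20250101T120000 PG_CLUSTER_NAME=patroni ./recovery.sh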
+ + +source recovery_utils.sh +source recovery_setEnv.sh + +validate_binary jq +validate_binary python3 +validate_python_package "yaml" "PyYAML" + +python3 ./recovery.py \ No newline at end of file diff --git a/maintenance/recovery/recovery_launcher.sh b/maintenance/recovery/recovery_launcher.sh new file mode 100755 index 0000000..075d61b --- /dev/null +++ b/maintenance/recovery/recovery_launcher.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +source recovery_utils.sh +source recovery_setEnv.sh + +validate_binary jq + + +# check if need to setup RESTORE_VERSION +if [[ -z "${RESTORE_VERSION}" ]] ; then + log "Current RESTORE_VERSION value is empty." + + if [[ -n "${RECOVERY_TARGET_TIME}" ]] ; then + log "RECOVERY_TARGET_TIME is specified. Leave RESTORE_VERSION value is empty to allow procedure to guess RESTORE_VERSION." + fi + + backup_list_json=$(curl http://postgres-backup-daemon:8081/list) + backup_list=$(echo ${backup_list_json} | jq -r '.[].id') + for item in ${backup_list[@]}; do + echo ${item} + done + RESTORE_VERSION_CHECK=false + while [ ${RESTORE_VERSION_CHECK} == "false" ] ; do + log "Please select one of backups. Empty input means last available backup or guessed backup." + read RESTORE_VERSION + # empty value allowed + if [[ -z "${RESTORE_VERSION}" ]] ; then + RESTORE_VERSION_CHECK=true + fi + # check non empty value for correctness + for item in ${backup_list[@]}; do + if [[ "${RESTORE_VERSION}" == ${item} ]] ; then + RESTORE_VERSION_CHECK=true + break + fi + done + done +fi + + +if [[ -z "${RESTORE_VERSION}" ]] ; then + log "Will try to restore last available backup or guessed backup." + confirm || exit 1 +else + log "Will try to restore backup ${RESTORE_VERSION}." + confirm || exit 1 +fi + +RESTORE_VERSION=${RESTORE_VERSION} ./recovery.sh diff --git a/maintenance/recovery/recovery_setEnv.sh b/maintenance/recovery/recovery_setEnv.sh new file mode 100755 index 0000000..abe1a36 --- /dev/null +++ b/maintenance/recovery/recovery_setEnv.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
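+#
+# Default environment for the recovery scripts. Any variable below can be
+# overridden in the shell before calling recovery.sh; recovery.py requires at
+# least RESTORE_VERSION or RECOVERY_TARGET_TIME to be set. For a point-in-time
+# recovery set exactly one recovery target, for example (the timestamp is
+# illustrative):
+#
+#   export RECOVERY_TARGET_TIME="2025-01-01 12:00:00"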
+ + +NAMESPACE=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace) + +# user defined variables +export OC_PROJECT=${OC_PROJECT:-${NAMESPACE}} +export OC_CONFIG_FILE=${OC_CONFIG_FILE:-oc_config_file.yaml} +export PG_CLUSTER_NAME=${PG_CLUSTER_NAME:-patroni} +export PRESERVE_OLD_FILES=${PRESERVE_OLD_FILES:-no} +export RESTORE_VERSION=${RESTORE_VERSION:-} +export SKIP_REPLICATION_CHECK=${SKIP_REPLICATION_CHECK:-False} + +############################################################################# +#### Recovery target settings according to +#### https://www.postgresql.org/docs/9.6/static/recovery-target-settings.html +############################################################################# + +export RECOVERY_TARGET_INCLUSIVE=${RECOVERY_TARGET_INCLUSIVE:-true} +export RECOVERY_TARGET_TIMELINE=${RECOVERY_TARGET_TIMELINE:-latest} + +# specify only one parameter +export RECOVERY_TARGET_TIME=${RECOVERY_TARGET_TIME:-} +export RECOVERY_TARGET_NAME=${RECOVERY_TARGET_NAME:-} +export RECOVERY_TARGET_XID=${RECOVERY_TARGET_XID:-} +export RECOVERY_TARGET=${RECOVERY_TARGET:-} +############################################################################# + + +############################################################################# +#### oc client settings +############################################################################# +export OC_PATH=${OC_PATH:-oc} +export OC_SKIP_TLS_VERIFY=${OC_SKIP_TLS_VERIFY:-true} +############################################################################# + +# system vars (do not change) +export PG_DEPL_NAME=${PG_DEPL_NAME:-pg-${PG_CLUSTER_NAME}} + +export LOG_ERROR="\e[0;101m" +export LOG_SUCCESS="\e[0;32m" +export LOG_INFO="\e[0;104m" + diff --git a/maintenance/recovery/recovery_utils.sh b/maintenance/recovery/recovery_utils.sh new file mode 100755 index 0000000..237064f --- /dev/null +++ b/maintenance/recovery/recovery_utils.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +function log { + echo -e "[$(date +'%H:%M:%S')] $2$1\e[m" +} + +function validate_binary { + command -v $1 &> /dev/null + if [ $? -gt 0 ] ; then + log "Please install $1" + exit 1 + fi +} + +function validate_python_package { + local package="$1" + local package_pip_name="$2" + local package_version="$3" + python -c "import $package" + if [ $? -gt 0 ] ; then + log "Please install python package $package" + [[ -n "$package_pip_name" ]] && log "To install $package execute command 'sudo pip install $package_pip_name'." + exit 1 + fi + + if [[ -n "${package_version}" ]] ; then + package_current_version=$(pip3 show ${package_pip_name} | grep -e '^Version:' | cut -d ':' -f2) + if [ $? -gt 0 ] ; then + log "Cannot get version of $2 via pip" + exit 1 + fi + cmp_result=$(python -c "from pkg_resources import parse_version; print(parse_version('${package_current_version}'.strip()) >= parse_version('${package_version}'))") + if [ $? 
-gt 0 ] ; then + log "Cannot compare version of ${package_pip_name} with desired version ${package_version}. Current version: ${package_current_version}." + exit 1 + fi + if [[ "${cmp_result}" != "True" ]] ; then + log "Installed version of ${package_pip_name} is too old. Please install version ${package_version} or above. Current version: ${package_current_version}." + exit 1 + fi + fi + +} + +function get_replicas { + REPL_NAME=${1} + gr_replicas=($(oc get pods --config="$OC_CONFIG_FILE" | grep -v deploy | grep Running | grep ${REPL_NAME} | cut -d\ -f1)) + echo "${gr_replicas[*]}" +} + +function get_pod_ip(){ + oc --config="$OC_CONFIG_FILE" get pod $1 -o json | jq -r '.status.podIP' +} + +confirm() { + while(true); do + read -p "Continue (y/n)?" choice + case "$choice" in + y|Y ) return 0;; + n|N ) return 1;; + esac + done +} \ No newline at end of file diff --git a/maintenance/recovery/utils_common.py b/maintenance/recovery/utils_common.py new file mode 100644 index 0000000..4fe53a6 --- /dev/null +++ b/maintenance/recovery/utils_common.py @@ -0,0 +1,146 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import time + + +class RecoveryException(Exception): + """ + This class should be used if exception contains meaningful message to user. 
+ """ + + def __init__(self, *args): + super(RecoveryException, self).__init__(*args) + + +class Differ: + + def is_primitive_array(self, array): + for entity in array: + if isinstance(entity, list) or isinstance(entity, dict): + return False + return True + + def ensure_path(self, path, result): + target = result + for i in range(0, len(path)): + el = path[i] + nextEl = path[i + 1] if i + 1 < len(path) else None + if isinstance(el, str): + if el in target: + target = target[el] + else: + if nextEl is not None: + target[el] = {} if isinstance(nextEl, str) else [] + target = target[el] + else: + target[el] = None + else: + if len(target) > 0: + target = target[el] + else: + if nextEl is not None: + target.append({} if isinstance(nextEl, str) else []) + target = target[el] + else: + target[el] = None + + def set_value_for_path(self, data, path, result): + self.ensure_path(path, result) + target = result + for i in range(0, len(path) - 1): + el = path[i] + target = target[el] + target[path[len(path) - 1]] = data + + def trace_tree_for_diffs(self, source, data, path, result, keep_name=False): + if isinstance(data, list): + if source != data: + if source and self.is_primitive_array(source) or data and self.is_primitive_array(data): + self.set_value_for_path(data, path, result) + else: + if len(data) != len(source): + raise AssertionError("Cannot get diff for different arrays") + for i in range(0, len(data)): + newPath = copy.copy(path) + newPath.append(i) + self.trace_tree_for_diffs(source[i], data[i], newPath, result, keep_name=keep_name) + elif isinstance(data, dict): + if source != data: + for k, v in data.items(): + newPath = copy.copy(path) + newPath.append(k) + if k not in source: + self.set_value_for_path(data[k], newPath, result) + else: + self.trace_tree_for_diffs(source[k], data[k], newPath, result, keep_name=keep_name) + for k, v in source.items(): + if k not in data: + newPath = copy.copy(path) + newPath.append(k) + self.set_value_for_path(None, newPath, result) + else: + if source != data: + self.set_value_for_path(data, path, result) + elif keep_name and len(path) > 0 and path[len(path) - 1] == "name": + self.set_value_for_path(data, path, result) + + def get_json_diff(self, source, data, keep_name=False): + result = {} + self.trace_tree_for_diffs(source, data, [], result, keep_name=keep_name) + return result + + +def retry(exceptions=None, tries=5, delay=1, backoff=1, logger=None): + """ + :param exceptions: if defined - only specified exceptions will be checked + :type exceptions: tuple of Exception or Exception + :param tries: how much to try before fail. <=0 means no limits. + :param delay: basic delay between tries + :param backoff: delay increase factor after each retry + :param logger: + :type logger: logging.Logger + :return: + """ + def deco_retry(f): + + def handle_error(e, mtries, mdelay): + msg = "Error occurred during execution: {}. 
Will retry in {} seconds.".format(str(e), delay) + if logger: + logger.exception(msg) + else: + print(msg) + time.sleep(mdelay) + mtries -= 1 + mdelay *= backoff + return mtries, mdelay + + def f_retry(*args, **kwargs): + mtries, mdelay = tries, delay + while tries <= 0 or mtries > 1: + if exceptions: + try: + return f(*args, **kwargs) + except exceptions as e: + mtries, mdelay = handle_error(e, mtries, mdelay) + else: + try: + return f(*args, **kwargs) + except Exception as e: + mtries, mdelay = handle_error(e, mtries, mdelay) + return f(*args, **kwargs) + + return f_retry + return deco_retry diff --git a/maintenance/recovery/utils_dcs.py b/maintenance/recovery/utils_dcs.py new file mode 100644 index 0000000..16ca845 --- /dev/null +++ b/maintenance/recovery/utils_dcs.py @@ -0,0 +1,124 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +from abc import ABCMeta, abstractmethod + +log = logging.getLogger() + + +class PatroniDCS(metaclass=ABCMeta): + @abstractmethod + def get_dcs_config(self, client, recovery_pod_id, pg_cluster_name): + """ + returns 2 dict with data from dcs - config from dcs and dcs data itself. + :type client: OpenshiftClient + :type recovery_pod_id: str + :type pg_cluster_name: str + :rtype: tuple(dict, str) + """ + pass + + @abstractmethod + def update_dcs_config(self, oc_client, recovery_pod_id, patroni_dsc): + pass + + @abstractmethod + def cleanup_initialization_key(self, oc_client, pg_cluster_name, pod_id): + """ + :type oc_client: OpenshiftClient + :param pg_cluster_name: + :param pod_id: + :return: + """ + pass + + +class PatroniDCSEtcd(PatroniDCS): + + def get_dcs_config(self, oc_client, recovery_pod_id, pg_cluster_name): + patroni_dsc_data = json.loads(oc_client.oc_exec(recovery_pod_id, "curl etcd:2379/v2/keys/patroni/{}/config" + .format(pg_cluster_name)))["node"]["value"] + log.debug("Patroni dcd: {}".format(patroni_dsc_data)) + patroni_dsc = json.loads(patroni_dsc_data) + log.debug("Patroni dcd (parsed): {}".format(patroni_dsc)) + return patroni_dsc, patroni_dsc_data + + def update_dcs_config(self, oc_client, recovery_pod_id, patroni_dsc): + log.info("Start dsc configuration update.") + recovery_conf = patroni_dsc["postgresql"]["recovery_conf"] + recovery_conf["recovery_target_action"] = "promote" + with open("dsc.config.tmp", mode="w") as fd: + json.dump(patroni_dsc, fd) + oc_client.rsync("./", "{}:/tmp".format(recovery_pod_id)) + log.debug(oc_client.oc_exec(recovery_pod_id, + 'python -c \'import etcd, os; ' + 'fd=open("/tmp/dsc.config.tmp"); data=fd.read(); fd.close(); ' + 'client = etcd.Client(host="etcd", protocol="http", port=2379); ' + 'client.write("patroni/{}/config".format(os.getenv("PG_CLUST_NAME")), data)\'')) + + def cleanup_initialization_key(self, oc_client, pg_cluster_name, pod_id): + oc_client.oc_exec(pod_id, "sh -c '" + "curl -XDELETE -s etcd:2379/v2/keys/patroni/${PG_CLUST_NAME}/initialize; " + "curl -XDELETE 
etcd:2379/v2/keys/patroni/${PG_CLUST_NAME}/optime?recursive=true'") + + +class PatroniDCSKubernetes(PatroniDCS): + + def get_dcs_config(self, oc_client, recovery_pod_id, pg_cluster_name): + """ + :type oc_client: OpenshiftClient + :param recovery_pod_id: + :param pg_cluster_name: + :return: + """ + patroni_dcs_cm = oc_client.get_configmap("{}-config".format(pg_cluster_name)) + log.debug("DCS config map: {}".format(patroni_dcs_cm)) + patroni_dsc_data = patroni_dcs_cm["metadata"]["annotations"]["config"] + log.debug("Patroni dcs: {}".format(patroni_dsc_data)) + patroni_dsc = json.loads(patroni_dsc_data) + log.debug("Patroni dcd (parsed): {}".format(patroni_dsc)) + return patroni_dsc, patroni_dsc_data + + def update_dcs_config(self, oc_client, recovery_pod_id, patroni_dsc): + raise NotImplementedError() + + def cleanup_initialization_key(self, oc_client, pg_cluster_name, pod_id): + """ + :type oc_client: OpenshiftClient + :param pg_cluster_name: + :param pod_id: + :return: + """ + log.debug("Try to delete configmap {}-leader".format(pg_cluster_name)) + oc_client.delete_entity("configmap", "{}-leader".format(pg_cluster_name)) + + # in case of sync replication we should also remove leader key from cm + if oc_client.get_entity_safe("configmap", "{}-sync".format(pg_cluster_name)): + log.debug("Try to delete configmap {}-sync".format(pg_cluster_name)) + patroni_dcs_sync_cm = oc_client.get_configmap("{}-sync".format(pg_cluster_name)) + patroni_dcs_sync_cm["metadata"]["annotations"].pop('leader', None) + oc_client.replace_object(patroni_dcs_sync_cm) + + log.debug("Try to remove initialize key from configmap {}-config".format(pg_cluster_name)) + for _ in range(1, 5): + # this is a dirty WA because sometimes initialize did not removed + patroni_dcs_cm = oc_client.get_configmap("{}-config".format(pg_cluster_name)) + log.debug("DCS config map before : {}".format(patroni_dcs_cm)) + if "initialize" in patroni_dcs_cm["metadata"]["annotations"]: + del patroni_dcs_cm["metadata"]["annotations"]["initialize"] + oc_client.replace_object(patroni_dcs_cm) + patroni_dcs_cm = oc_client.get_configmap("{}-config".format(pg_cluster_name)) + log.debug("DCS config map after applying: {}".format(patroni_dcs_cm)) \ No newline at end of file diff --git a/maintenance/recovery/utils_oc.py b/maintenance/recovery/utils_oc.py new file mode 100644 index 0000000..915d559 --- /dev/null +++ b/maintenance/recovery/utils_oc.py @@ -0,0 +1,1113 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
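+# This module provides two interchangeable clients used by the recovery scripts:
+#   * OpenshiftShellClient - shells out to the `oc` CLI and parses its JSON output;
+#   * OpenshiftPyClient    - uses the `kubernetes` Python client library.
+# recovery.py tries the Python client first (when the in-cluster config can be
+# loaded) and falls back to the shell client for backward compatibility.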
+ +import datetime +import json +from abc import abstractmethod, ABCMeta +import subprocess +import logging +from utils_common import * +import pprint +import time + +log = logging.getLogger() + +try: + from kubernetes import client + from kubernetes.client import configuration + from kubernetes.client import rest + #from openshift import client as op_client + from kubernetes.client.rest import ApiException + from kubernetes.stream import stream + from kubernetes.stream.ws_client import ERROR_CHANNEL, STDOUT_CHANNEL, STDERR_CHANNEL + import kubernetes + from six import iteritems + + def to_dict(self): + """ + Returns the model properties as a dict + """ + result = {} + + for attr, _ in iteritems(self.swagger_types): + if attr in self.attribute_map: + attr_name = self.attribute_map[attr] + else: + attr_name = attr + + value = getattr(self, attr) + if isinstance(value, list): + result[attr_name] = list([x.to_dict() if hasattr(x, "to_dict") else x for x in value]) + elif hasattr(value, "to_dict"): + result[attr_name] = value.to_dict() + elif isinstance(value, dict): + result[attr_name] = dict([(item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") else item for item in list(value.items())]) + else: + result[attr_name] = value + + return result + + # kubernetes.client.models.v1_pod.V1Pod.to_dict = to_dict + # kubernetes.client.models.v1_container.V1Container.to_dict = to_dict + # kubernetes.client.models.v1_probe.V1Probe.to_dict = to_dict + # kubernetes.client.models.v1_config_map.V1ConfigMap.to_dict = to_dict + # kubernetes.client.models.v1_deployment.V1Deployment.to_dict = to_dict + # openshift.client.models.v1_deployment_config.V1DeploymentConfig.to_dict = to_dict + # kubernetes.client.models.v1_replication_controller.V1ReplicationController.to_dict = to_dict + # kubernetes.client.models.v1_stateful_set.V1StatefulSet.to_dict = to_dict + # kubernetes.client.models.v1beta1_stateful_set.V1beta1StatefulSet.to_dict = to_dict + # kubernetes.client.models.apps_v1beta1_deployment.AppsV1beta1Deployment.to_dict = to_dict + # kubernetes.client.models.v1_object_meta.V1ObjectMeta.to_dict = to_dict + # kubernetes.client.models.v1_stateful_set_status.V1StatefulSetStatus.to_dict = to_dict + # kubernetes.client.models.v1beta1_stateful_set_status.V1beta1StatefulSetStatus.to_dict = to_dict + + class ObjectEncoder(json.JSONEncoder): + + def default(self, o): + if isinstance(o, datetime.datetime): + return o.strftime("%Y-%m-%dT%H:%M:%SZ") + return super(ObjectEncoder, self).default(o) + + use_kube_client = True +except ImportError as e: + log.exception("Cannot use python client") + use_kube_client = False + + +def get_api_token(oc_url, username, password): + import base64 + import requests + import re + + auth_url = oc_url + "/oauth/authorize?" 
\ + "client_id=openshift-challenging-client&" \ + "response_type=token" + headers = {} + user_password = username + b":" + password + encoding = base64.b64encode(user_password) + headers["Authorization"] = b"Basic " + encoding + headers["X-CSRF-Token"] = b"1" + + result = requests.get(url=auth_url, headers=headers, verify=False, allow_redirects=False) + if result.status_code != 302: + raise ApiException(status=result.status_code, http_resp=result.text) + location = result.headers["Location"] + p = re.compile("access_token=([a-zA-Z0-9-_]+)") + token = p.findall(location)[0] + return token + + +class OpenshiftClient(metaclass=ABCMeta): + @abstractmethod + def login(self, oc_url, username, password, project, skip_tls_verify=False): + pass + + @abstractmethod + def use_token(self, oc_url, oc_token, project, skip_tls_verify=False): + pass + + @abstractmethod + def get_entities(self, entity_type): + """ + :param entity_type: + :return: parsed entity + :rtype: [] + """ + pass + + @abstractmethod + def get_entity(self, entity_type, entity_name): + """ + :param entity_type: + :param entity_name: + :return: parsed entity + :rtype: {} + """ + pass + + @abstractmethod + def get_entity_safe(self, entity_type, entity_name): + """ + :param entity_type: + :param entity_name: + :return: parsed entity or None if entity does not exist + :rtype: {} + """ + pass + + def get_configmap(self, cm_name): + """ + :param cm_name: + :return: parsed entity + :rtype: {} + """ + return self.get_entity_safe("configmap", cm_name) + + def get_deployment(self, dc_name, type="dc"): + """ + :param dc_name: + :return: parsed entity + :rtype: {} + """ + return self.get_entity(type, dc_name) + + def get_stateful_set(self, stateful_set_name): + return self.get_entity("statefulset", stateful_set_name) + + def get_env_for_pod(self, pod_id, env_name, default_value=None): + pod = self.get_entity("pod", pod_id) + envs = pod["spec"]["containers"][0]["env"] + env = list([x for x in envs if x["name"] == env_name]) + return default_value if not env else env[0]["value"] + + def get_env_for_dc(self, dc_name, env_name, default_value=None): + dc = self.get_entity("dc", dc_name) + envs = dc["spec"]["template"]["spec"]["containers"][0]["env"] + env = list([x for x in envs if x["name"] == env_name]) + return default_value if not env else env[0]["value"] + + @abstractmethod + def set_env_for_dc(self, dc_name, env_name, env_value): + pass + + @abstractmethod + def get_deployment_names(self, dc_name_part): + """ + :param dc_name_part: + :return: list of deployment configs names which contain `dc_name_part` + :rtype: [string] + """ + pass + + def get_deployment_replicas_count(self, dc_name, type="dc"): + """ + :param dc_name: + :return: replica parameter from deployment corresponding to dc_name + :rtype: int + """ + deployment = self.get_deployment(dc_name, type) + return int(deployment["spec"]["replicas"]) + + def get_stateful_set_replicas_count(self, stateful_set_name): + stateful_set = self.get_stateful_set(stateful_set_name) + return int(stateful_set.get("spec").get("replicas")) + + def get_running_stateful_set_replicas_count(self, stateful_set_name): + stateful_set = self.get_stateful_set(stateful_set_name) + status = stateful_set.get("status") + return min(int(status.get("ready_replicas") or "0"), + int(status.get("updated_replicas") or "0")) + + def get_liveness_probe_from_stateful_set(self, stateful_set_name): + stateful_set = self.get_stateful_set(stateful_set_name) + return 
stateful_set["spec"]["template"]["spec"]["containers"][0]["livenessProbe"] + + @abstractmethod + def get_replicas_desc(self, dc_name, running=True): + """ + :param dc_name: + :param running: + :return: list of replicas descriptions + :rtype: [] + """ + pass + + def get_replicas_names(self, dc_name, running=True): + """ + :param dc_name: + :param running: + :return: + :rtype: [string] + """ + pod_names = list([p["metadata"]["name"] for p in self.get_replicas_desc(dc_name, running)]) + return pod_names + + @abstractmethod + def delete_entity(self, entity_type, entity_name, ignore_not_found=True): + pass + + @abstractmethod + def oc_exec(self, pod_id, command): + """ + :param pod_id: + :param command: + :return: + :rtype: string + """ + pass + + @abstractmethod + def get_logs(self, pod_id, since=None): + """ + :param pod_id: + :param since: + :return: + :rtype: string + """ + pass + + @abstractmethod + def rsync(self, source, target): + pass + + @abstractmethod + def delete_pod(self, pod_id, grace_period=None): + pass + + @abstractmethod + def scale(self, dc_name, count, entity="dc"): + pass + + @abstractmethod + def apply_object(self, data): + pass + + @abstractmethod + def replace_object(self, data): + pass + + @abstractmethod + def get_cluster_pods(self, cluster_name, running=True): + pass + + @abstractmethod + def get_cluster_pods_desc(self, cluster_name, running=True): + pass + + @abstractmethod + def get_pods_by_label(self, label_selector): + pass + + def get_pod_status(self, pod_id): + items = self.get_entities("pod") + for x in items: + if x["metadata"]["name"] == pod_id: + return x["status"]["phase"] + else: + log.info("Pod {} not found".format(pod_id)) + + def is_pod_ready(self, pod_id, attempts=5): + for i in range(1, attempts): + time.sleep(5) + status = self.get_pod_status(pod_id) + log.info("Pod state is {}".format(status)) + if status.lower() == "running": + return True + else: + log.info("Retrying...") + log.info("Can't get pod {} status".format(pod_id)) + return False + + def get_postgres_backup_daemon_pod(self): + pods_from_dcs = self.get_pods_by_label("app=postgres-backup-daemon") + pods_from_deployments = self.get_pods_by_label("component=postgres-backup-daemon") + return pods_from_dcs + pods_from_deployments + + +class OpenshiftShellClient(OpenshiftClient): + def __init__(self, oc_path="oc", config_file="./oc_config_file.yaml"): + self.oc = oc_path + " --config={}".format(config_file) + + def login(self, oc_url, username, password, project, skip_tls_verify=False): + log.info("Log in as {} to {}".format(username, oc_url)) + subprocess.check_call( + '{} login -u "{}" -p "{}" {} {}' + .format(self.oc, + username, password, oc_url, + ("--insecure-skip-tls-verify=true" if skip_tls_verify else "")), + shell=True, stdout=subprocess.PIPE) + + if project: + log.info("Change project to {}".format(project)) + subprocess.check_call( + '{} project {}'.format(self.oc, project), + shell=True, stdout=subprocess.PIPE) + + def use_token(self, oc_url, oc_token, project, skip_tls_verify=False): + raise NotImplementedError("Cannot use token with oc client") + + def get_entities(self, entity_type): + entity = json.loads(subprocess.Popen( + '{} get {} -o json'.format(self.oc, entity_type), + shell=True, stdout=subprocess.PIPE).stdout.read()) + return entity["items"] + + def get_entity(self, entity_type, entity_name): + entity = json.loads(subprocess.Popen( + '{} get {} {} -o json'.format(self.oc, entity_type, entity_name), + shell=True, stdout=subprocess.PIPE).stdout.read()) + return entity + + 
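+    # The shell client simply builds `oc` commands and parses their JSON output;
+    # for example, get_entity("configmap", "patroni-config") runs roughly:
+    #   oc --config=./oc_config_file.yaml get configmap patroni-config -o json
+    # (the --config value is whatever path was passed to __init__).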
def get_entity_safe(self, entity_type, entity_name): + entities_data = json.loads(subprocess.Popen( + '{} get {} -o json'.format(self.oc, entity_type), shell=True, stdout=subprocess.PIPE).stdout.read()) + entities = list([p for p in entities_data["items"] if entity_name == p["metadata"]["name"]]) + if entities: + return entities[0] + return None + + def set_env_for_dc(self, dc_name, env_name, env_value): + log.info("Try to set env {}={} to deployment {}".format(env_name, env_value, dc_name)) + env_repr = "{}={}".format(env_name, env_value) if env_value else "{}-".format(env_name) + subprocess.check_call("{} set env dc {} {}".format(self.oc, dc_name, env_repr), shell=True) + + def get_deployment_names(self, dc_name_part, type="dc"): + deployments_data = json.loads(subprocess.Popen( + '{} get {} -o json'.format(self.oc, type), shell=True, stdout=subprocess.PIPE).stdout.read()) + deployments = list( + [p["metadata"]["name"] for p in [p for p in deployments_data["items"] if dc_name_part in p["metadata"]["name"]]]) + return deployments + + def get_replicas_desc(self, dc_name, running=True): + pods_data = json.loads(subprocess.Popen( + "{} get pods {} -o json".format(self.oc, ("-a" if not running else "")), + shell=True, stdout=subprocess.PIPE).stdout.read()) + pods = list( + [p for p in pods_data["items"] if "deploymentconfig" in p["metadata"]["labels"] and + dc_name in p["metadata"]["labels"]["deploymentconfig"]]) + if running: + pods = list([p for p in pods if "running" == p["status"]["phase"].lower()]) + return pods + + def delete_entity(self, entity_type, entity_name, ignore_not_found=True): + log.debug("Try to delete entity {} {}".format(entity_type, entity_name)) + inf_value = "true" if ignore_not_found else "false" + subprocess.check_call("{} delete {} {} --ignore-not-found={}" + .format(self.oc, entity_type, entity_name, inf_value), + shell=True) + + def oc_exec(self, pod_id, command): + log.info("Try to execute '{}' on pod {}".format(command, pod_id)) + process = subprocess.Popen("{} exec {} -- {}".format(self.oc, pod_id, command), shell=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if process.wait() != 0: + raise Exception("Error occured during execution. " + "Return code: {}, stderr: {}, stdout: {}" + .format(process.returncode, process.stderr.read(), process.stdout.read())) + return process.stdout.read().decode() + + def get_logs(self, pod_id, since=None): + log.debug("Try to obtain logs from pod {} for last {}s.".format(pod_id, since)) + cmd = "{} logs {} --since={}s".format(self.oc, pod_id, since) + if not since: + cmd = "{} logs {}".format(self.oc, pod_id) + process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + result = process.communicate() + log.debug("Received data len: {}".format(0 if not result[0] else len(result[0]))) + if process.returncode != 0 or result[1]: + log.warning("Error during log obtain. 
code{}, message: {}".format(process.returncode, result[1])) + return result[0].decode() + + def rsync(self, source, target): + subprocess.check_call("{} rsync {} {}".format(self.oc, source, target), shell=True) + + def delete_pod(self, pod_id, grace_period=None): + log.debug("Remove pod {} with grace-period {}".format(pod_id, grace_period)) + grace_period = "--grace-period={}".format(grace_period) if grace_period else "" + p = subprocess.Popen("{} delete pod {} {}".format(self.oc, pod_id, grace_period), + shell=True, stdin=subprocess.PIPE, stderr=subprocess.PIPE) + if p.wait() != 0: + error = p.stderr.read() + if 'pods "{}" not found'.format(pod_id) in error.decode(): + log.warning("Cannot remove pod {} - no such pod.".format(pod_id)) + pass + else: + raise Exception("Cannot remove pod. Error: {}".format(error)) + + @retry(tries=5) # handle case when DC version is Unknown + def scale(self, name, count, entity="dc"): + subprocess.check_call("{} scale --replicas={} {} {}" + .format(self.oc, count, entity, name), shell=True) + + def apply_object(self, data): + p = subprocess.Popen("{} apply -f -".format(self.oc), shell=True, stdin=subprocess.PIPE) + p.communicate(input=json.dumps(data).encode()) + + def replace_object(self, data): + p = subprocess.Popen("{} replace -f -".format(self.oc), shell=True, stdin=subprocess.PIPE) + p.communicate(input=json.dumps(data).encode()) + + def get_cluster_pods(self, cluster_name, running=True): + pods_data = json.loads( + subprocess.Popen("{} get pods {} --selector=\"pgcluster={}\" -o json" + .format(self.oc, ("-a" if not running else ""), cluster_name), + shell=True, stdout=subprocess.PIPE).stdout.read()) + pods = [pod["metadata"]["name"] for pod in pods_data["items"]] + return pods + + def get_cluster_pods_desc(self, cluster_name, running=True): + pods_data = json.loads( + subprocess.Popen("{} get pods {} --selector=\"pgcluster={}\" -o json" + .format(self.oc, ("-a" if not running else ""), cluster_name), + shell=True, stdout=subprocess.PIPE).stdout.read()) + return pods_data["items"] + + def get_pods_by_label(self, label_selector): + pods_data = json.loads( + subprocess.Popen("{0} get pods {1} -l={2} -o json".format(self.oc, "-a", label_selector), + shell=True, stdout=subprocess.PIPE).stdout.read()) + pods = [pod["metadata"]["name"] for pod in pods_data["items"]] + return pods + + +class OpenshiftPyClient(OpenshiftClient): + + def __init__(self): + super(OpenshiftPyClient, self).__init__() + self._api_client = None + self.project = None + + def login(self, oc_url, username, password, project, skip_tls_verify=False): + log.info("Log in as {} to {}".format(username, oc_url)) + + # Configuration for Kubernetes client + os_config = client.Configuration() + os_config.verify_ssl = not skip_tls_verify + os_config.assert_hostname = False + os_config.host = oc_url + + openshift_token = get_api_token(oc_url, b"admin", b"admin") + os_config.api_key = {"authorization": "Bearer " + openshift_token} + + self._api_client = client.ApiClient(configuration=os_config) + + log.info("Will use namespace {}".format(project)) + self.project = project + + # This method is called if OC_TOKEN env variable is presented, + # In our cases, this env is set only in case of robot tests + def use_token(self, oc_url, oc_token, project, skip_tls_verify=False): + log.info("Log in to {} with token".format(oc_url)) + from kubernetes import config + config.load_incluster_config() + log.info("Will use namespace {}".format(project)) + self.project = project + + def __list_entities(self, 
entity_type): + if entity_type == "pod": + core_api = client.CoreV1Api(self._api_client) + items = core_api.list_namespaced_pod(self.project).items + elif entity_type == "configmap": + core_api = client.CoreV1Api(self._api_client) + items = core_api.list_namespaced_config_map(self.project).items + elif entity_type == "dc": + core_api = client.CoreV1Api(self._api_client) + items = core_api.list_namespaced_deployment_config(self.project).items + elif entity_type == "rc": + core_api = client.CoreV1Api(self._api_client) + items = core_api.list_namespaced_replication_controller(self.project).items + elif entity_type == "statefulset": + try: + apps_api = client.AppsV1Api(self._api_client) + items = apps_api.list_namespaced_stateful_set(self.project).items + except: + apps_api = client.AppsV1beta1Api(self._api_client) + items = apps_api.list_namespaced_stateful_set(self.project).items + elif entity_type == "deployment": + try: + apps_api = client.AppsV1beta1Api(self._api_client) + items = apps_api.list_namespaced_deployment(self.project).items + except: + apps_api = client.AppsV1Api(self._api_client) + items = apps_api.list_namespaced_deployment(self.project).items + else: + raise NotImplementedError("Cannot list {}".format(entity_type)) + return items + + def __to_dict(self, entity): + data = entity.to_dict() + if isinstance(entity, kubernetes.client.models.v1_pod.V1Pod): + data["kind"] = "Pod" + elif isinstance(entity, + kubernetes.client.models.v1_config_map.V1ConfigMap): + data["kind"] = "ConfigMap" + elif isinstance(entity, + kubernetes.client.models.v1_deployment.V1Deployment): + data["kind"] = "Deployment" + elif isinstance(entity, + kubernetes.client.models.v1_replication_controller.V1ReplicationController): + data["kind"] = "ReplicationController" + elif isinstance(entity, kubernetes.client.models.v1_stateful_set.V1StatefulSet): + data["kind"] = "StatefulSet" + else: + raise NotImplementedError("Cannot transform to dict entity {}" + .format(entity)) + data = json.loads(json.dumps(data, cls=ObjectEncoder)) + return data + + def get_entities(self, entity_type): + items = self.__list_entities(entity_type) + return list([self.__to_dict(x) for x in items]) + + def get_entity(self, entity_type, entity_name): + items = self.__list_entities(entity_type) + if entity_type == "statefulset": + filtered_items = [item for item in items if "patroni" in item.metadata.name] + else: + filtered_items = list(filter(lambda x: x.metadata.name == entity_name, items)) + if entity_name == "pg-patroni-node1": + returned_value = self.__to_dict(filtered_items[0]) + elif entity_name == "pg-patroni-node2": + returned_value = self.__to_dict(filtered_items[1]) + else: + returned_value = self.__to_dict(filtered_items[0]) + return returned_value + + def get_entity_safe(self, entity_type, entity_name): + try: + return self.get_entity(entity_type, entity_name) + except Exception as e: + return None + + def set_env_for_dc(self, dc_name, env_name, env_value): + log.info("Try to set env {}={} to deployment {}".format(env_name, env_value, dc_name)) + core_api = client.CoreV1Api(self._api_client) + dc = core_api.read_namespaced_deployment_config(dc_name, self.project) + base_envs = dc.spec.template.spec.containers[0].env + base_env = list([x for x in base_envs if x.name == env_name]) + if env_value: + if base_env: + base_env[0].value = env_value + else: + base_envs.append(client.V1EnvVar(name=env_name, value=env_value)) + log.info(core_api.patch_namespaced_deployment_config(dc_name, self.project, dc)) + else: + if base_env: + 
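+                # The variable is present on the container: remove it from the spec; the
+                # update is then pushed via replace_namespaced_deployment_config (vs. the
+                # patch call used in the branch above).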
base_envs.remove(base_env[0]) + log.info(core_api.replace_namespaced_deployment_config(dc_name, self.project, dc)) + + def get_deployment_names(self, dc_name_part, type="dc"): + items = self.__list_entities(type) + deployments = list( + [p.metadata.name for p in [p for p in items if dc_name_part in p.metadata.name]]) + return deployments + + def get_replicas_desc(self, dc_name, type="dc", running=True): + items = self.__list_entities("pod") + pods = list( + [p for p in items if "deploymentconfig" in p.metadata.labels and + dc_name in p.metadata.labels["deploymentconfig"]]) + if running: + pods = list( + [self.__to_dict(x) for x in [p for p in pods if "running" == p.status.phase.lower()]]) + else: + pods = list([self.__to_dict(x) for x in pods]) + return pods + + def delete_entity(self, entity_type, entity_name, ignore_not_found=True): + log.debug("Try to delete entity {} {}".format(entity_type, entity_name)) + try: + if entity_type == "pod": + core_api = client.CoreV1Api(self._api_client) + core_api.delete_namespaced_pod(entity_name, self.project, {}) + elif entity_type == "configmap": + body = client.V1DeleteOptions() + core_api = client.CoreV1Api(self._api_client) + core_api.delete_namespaced_config_map(entity_name, self.project, body=body,) + elif entity_type == "dc": + core_api = client.CoreV1Api(self._api_client) + core_api.delete_namespaced_deployment_config(entity_name, + self.project, {}) + elif entity_type == "rc": + core_api = client.CoreV1Api(self._api_client) + core_api.delete_namespaced_replication_controller(entity_name, + self.project, {}) + elif entity_type == "statefulset": + core_api = client.AppsV1Api(self._api_client) + core_api.delete_namespaced_stateful_set(entity_name, + self.project, {}) + else: + raise NotImplementedError("Cannot delete {}".format(entity_type)) + except kubernetes.client.rest.ApiException as e: + if ignore_not_found and e.reason == "Not Found": + return + else: + raise e + + @retry(tries=30, delay=5) + def oc_exec(self, pod_id, command): + log.debug(f"Try to execute '{command}' on pod {pod_id}") + core_api = client.CoreV1Api(self._api_client) + + exec_command = [ + '/bin/sh', '-c', command + ] + + try: + resp = stream(core_api.connect_get_namespaced_pod_exec, + pod_id, + self.project, + command=exec_command, + stderr=True, stdin=False, + stdout=True, tty=False, _preload_content=True, _request_timeout=60) + + log.info(f"Command executed. Result: {resp}") + + if resp: + log.debug(f"Command output: {resp}") + if "No such file or directory" in resp or "cannot remove" in resp: + log.info("Directory already cleaned up or removal issue detected.") + return resp # Exit early if the directory is already cleaned up or a removal issue was detected + + return resp + + except Exception as e: + log.error(f"Exception occurred while executing command: {e}") + raise + + log.debug(f"Command '{command}' completed for pod {pod_id}") + return None + + def get_logs(self, pod_id, since=None): + log.debug("Try to obtain logs from pod {} for last {}s." 
+ .format(pod_id, since)) + # time.sleep(20) + core_api = client.CoreV1Api(self._api_client) + if since: + return core_api.read_namespaced_pod_log(pod_id, self.project, since_seconds=since) + else: + return core_api.read_namespaced_pod_log(pod_id, self.project) + + def rsync(self, source, target): + if ":" in source: + raise Exception("Cannot load files from pod yet") + (pod_id, target_dir) = target.split(":") + log.debug("Try to upload files from {} to pod {} in dir {}" + .format(source, pod_id, target_dir)) + + import tarfile + import tempfile + tempfile_fd = tempfile.TemporaryFile() + tar = tarfile.open(fileobj=tempfile_fd, mode='w:gz') + tar.add(source) + tar.close() + tempfile_fd.flush() + tempfile_fd.seek(0) + + core_api = client.CoreV1Api(self._api_client) + exec_command = ['tar', 'xzvf', '-', '-C', target_dir] + resp = stream(core_api.connect_get_namespaced_pod_exec, pod_id, + self.project, + command=exec_command, + stderr=True, stdin=True, + stdout=True, tty=False, + _preload_content=False) + + resp.write_stdin(tempfile_fd.read()) + resp.update(1) + if resp.peek_stdout(): + log.debug("STDOUT: %s" % resp.read_stdout()) + if resp.peek_stderr(): + log.debug("STDERR: %s" % resp.read_stderr()) + resp.close() + tempfile_fd.close() + error = resp.read_channel(ERROR_CHANNEL) + if error and "Success" != json.loads(error).get("status"): + raise Exception("Error occurred during execution: {}. " + .format(error)) + + def delete_pod(self, pod_id, grace_period=None): + log.debug("Try to remove pod {} with grace-period {}" + .format(pod_id, grace_period)) + core_api = client.CoreV1Api(self._api_client) + try: + status = core_api.delete_namespaced_pod(pod_id, self.project, {}, + grace_period_seconds=grace_period) + log.debug(status) + except ApiException as ae: + if 'pods \\"{}\\" not found'.format(pod_id) in ae.body: + log.warning("Cannot remove pod {} - no such pod." 
+                            .format(pod_id))
+            else:
+                raise ae
+
+    @retry(tries=5)  # handle case when DC version is Unknown
+    def scale(self, name, count, entity="dc"):
+        log.debug("Try to scale {} {} to {} replicas".format(entity, name, count))
+        if entity == "dc":
+            core_api = client.CoreV1Api(self._api_client)
+            data = core_api.patch_namespaced_deployment_config(name, self.project, {"spec": {"replicas": count}})
+        elif entity == "statefulset":
+            core_api = client.AppsV1Api(self._api_client)
+            data = core_api.patch_namespaced_stateful_set(name, self.project, {"spec": {"replicas": count}})
+        elif entity == "deployment":
+            try:
+                core_api = client.AppsV1beta1Api(self._api_client)
+                data = core_api.patch_namespaced_deployment(name, self.project, {"spec": {"replicas": count}})
+            except:
+                core_api = client.AppsV1Api(self._api_client)
+                data = core_api.patch_namespaced_deployment(name, self.project, {"spec": {"replicas": count}})
+        else:
+            raise NotImplementedError("Cannot scale entity {} of type {}".format(name, entity))
+        return self.__to_dict(data)
+
+    def get_json_diff(self, source, data):
+        return Differ().get_json_diff(source, data, keep_name=True)
+
+    def apply_object(self, data):
+        log.debug("Try to apply {}".format(data))
+        entity_type = data["kind"]
+        entity_name = data["metadata"]["name"]
+        if entity_type == "Pod":
+            core_api = client.CoreV1Api(self._api_client)
+            source = reset_last_applied(self.get_entity("pod", entity_name))
+            diff = self.get_json_diff(source, data)
+            if diff:
+                return self.__to_dict(core_api.patch_namespaced_pod(entity_name, self.project, diff))
+            return data
+        elif entity_type == "ConfigMap":
+            core_api = client.CoreV1Api(self._api_client)
+            source = self.get_entity("configmap", entity_name)
+            diff = self.get_json_diff(source, data)
+            if diff:
+                return self.__to_dict(core_api.patch_namespaced_config_map(entity_name, self.project, diff))
+            return data
+        elif entity_type == "DeploymentConfig":
+            core_api = client.CoreV1Api(self._api_client)
+            source = reset_last_applied(self.get_entity("dc", entity_name))
+            diff = self.get_json_diff(source, data)
+            if diff:
+                return self.__to_dict(core_api.patch_namespaced_deployment_config(entity_name, self.project, diff))
+            return data
+        elif entity_type == "ReplicationController":
+            # todo[anin] check container for same bug as above
+            core_api = client.CoreV1Api(self._api_client)
+            source = self.get_entity("rc", entity_name)
+            diff = self.get_json_diff(source, data)
+            if diff:
+                return self.__to_dict(core_api.patch_namespaced_replication_controller(entity_name, self.project, diff))
+            return data
+        elif entity_type == "StatefulSet":
+            # todo[anin] check container for same bug as above
+            apps_api = client.AppsV1Api(self._api_client)
+            source = reset_last_applied(self.get_entity("statefulset", entity_name))
+            # return self.__to_dict(apps_api.patch_namespaced_stateful_set(entity_name, self.project, data))
+            # TODO: Patching StatefulSets is currently not working; we return the object
+            # unchanged without patching (rolled back because of the release).
+            return data
+        elif entity_type == "Deployment":
+            try:
+                apps_api = client.AppsV1beta1Api(self._api_client)
+                source = reset_last_applied(self.get_entity("deployment", entity_name))
+            except:
+                apps_api = client.AppsV1Api(self._api_client)
+                source = reset_last_applied(self.get_entity("deployment", entity_name))
+
+            diff = self.get_json_diff(source, data)
+            if diff:
+                return self.__to_dict(apps_api.patch_namespaced_deployment(entity_name, self.project, diff))
+            return data
+        else:
+            raise NotImplementedError("Cannot apply 
{}".format(entity_type)) + + def replace_object(self, data): + log.debug("Try to apply {}".format(data)) + entity_type = data["kind"] + entity_name = data["metadata"]["name"] + if entity_type == "Pod": + core_api = client.CoreV1Api(self._api_client) + return self.__to_dict(core_api.replace_namespaced_pod(entity_name, self.project, data)) + elif entity_type == "ConfigMap": + core_api = client.CoreV1Api(self._api_client) + return self.__to_dict(core_api.replace_namespaced_config_map(entity_name, self.project, data)) + elif entity_type == "DeploymentConfig": + core_api = client.CoreV1Api(self._api_client) + return self.__to_dict(core_api.replace_namespaced_deployment_config(entity_name, self.project, data)) + elif entity_type == "ReplicationController": + core_api = client.CoreV1Api(self._api_client) + return self.__to_dict(core_api.replace_namespaced_replication_controller(entity_name, self.project, data)) + elif entity_type == "StatefulSet": + core_api = client.AppsV1Api(self._api_client) + return self.__to_dict(core_api.replace_namespaced_stateful_set(entity_name, self.project, data)) + else: + raise NotImplementedError("Cannot replace {}".format(entity_type)) + + def get_cluster_pods(self, cluster_name, running=True): + pods = [x for x in self.__list_entities("pod") if "pgcluster" in x.metadata.labels and x.metadata.labels["pgcluster"] == cluster_name] + pods = list([x.metadata.name for x in pods]) + return pods + + def get_cluster_pods_desc(self, cluster_name, running=True): + pods = [x for x in self.__list_entities("pod") if "pgcluster" in x.metadata.labels and x.metadata.labels["pgcluster"] == cluster_name] + pods = list([self.__to_dict(x) for x in pods]) + return pods + + def get_pods_by_label(self, label_selector): + core_api = client.CoreV1Api(self._api_client) + items = core_api.list_namespaced_pod(self.project, label_selector=label_selector).items + pods = list([x.metadata.name for x in items]) + return pods + + def get_secret_data(self, secret_name): + core_api = client.CoreV1Api() + try: + api_response = core_api.read_namespaced_secret(secret_name, self.project) + import base64 + data = api_response.data + password = base64.b64decode(data.get("password")).decode('utf-8') + user_data = data.get("user") + if not user_data: + user_data = data.get("username") + user = base64.b64decode(user_data).decode('utf-8') + return user, password + except ApiException as exc: + log.error(exc) + raise exc + + +def get_client(oc_path="oc", oc_config_file="./oc_config_file.yaml"): + """ + Returns wrapper over shell if oc client present + otherwise tries to return wrapper over kubernetes.client + :return: + :rtype: OpenshiftClient + """ + if use_kube_client: + return OpenshiftPyClient() + else: + return OpenshiftShellClient(oc_path, oc_config_file) + + +def reset_last_applied(entity): + entity["metadata"].pop("namespace", None) + entity["metadata"].pop("selfLink", None) + entity["metadata"].pop("uid", None) + entity["metadata"].pop("resourceVersion", None) + entity["metadata"].pop("generation", None) + entity["metadata"].pop("creationTimestamp", None) + entity["metadata"].pop("managedFields", None) + entity.pop("status", None) + return entity + + +class OpenshiftOrchestrator: + def __init__(self, client, retry_count=100): + self.oc = client + self.retry_count = retry_count + + def ensure_scale(self, dc_name, replicas, type="dc"): + log.info("Try to scale dc {} to {} replicas.".format(dc_name, replicas)) + for i in range(1, self.retry_count): + self.oc.scale(dc_name, replicas, type) + time.sleep(1) 
+ if replicas == self.oc.get_deployment_replicas_count(dc_name, type): + log.debug("dc {} was scaled successfully.".format(dc_name)) + return + raise Exception("Was not able to scale deployment {}".format(dc_name)) + + def wait_replicas(self, dc_name, replicas, running=False): + log.info("Wait {} replicas of dc {}".format(replicas, dc_name)) + replica_names = None + for i in range(1, self.retry_count): + replica_names = self.oc.get_replicas_names(dc_name, running=running) + log.info("Wait {} replicas of dc {}. Actual replicas: {}".format(replicas, dc_name, replica_names)) + log.debug("Wait {} replicas of dc {}. Actual replicas: {}".format(replicas, dc_name, replica_names)) + if len(replica_names) == replicas: + log.debug("Found {} replicas of dc {}.".format(replicas, dc_name)) + return + time.sleep(1) + raise Exception("Expected replicas count was {} but actual replicas: {}".format(replicas, replica_names)) + + def wait_replicas_statefulset(self, stateful_set_name, replicas_number): + for i in range(1, self.retry_count): + log.info("Waiting till all replicas are ready") + time.sleep(1) + ready_replicas = self.oc.get_running_stateful_set_replicas_count(stateful_set_name) + if replicas_number == ready_replicas: + log.debug("Statefulset {} was scaled successfully.".format(stateful_set_name)) + return + raise Exception("Was not able to scale statefulset {}".format(stateful_set_name)) + + def set_env_on_dc(self, dc_name, env_name, env_value, scale_up=True): + log.info("Try to set env {}={} to deployment {}".format(env_name, env_value, dc_name)) + log.info("Scale down before changes") + self.ensure_scale(dc_name, 0) + replica_names = self.oc.get_replicas_names(dc_name, running=False) + for replica in replica_names: + self.oc.delete_pod(replica, 1) + self.wait_replicas(dc_name, 0) + + log.info("Change env") + self.oc.set_env_for_dc(dc_name, env_name, env_value) + + log.debug("Check if env present on actual version of dc") + for i in range(1, self.retry_count): + dc = self.oc.get_deployment(dc_name) + envs = dc["spec"]["template"]["spec"]["containers"][0]["env"] + env = list([x for x in envs if x["name"] == env_name]) + version = dc["metadata"].get("resourceVersion") if dc["metadata"].get("resourceVersion") else dc["metadata"].get("resource_version") + log.debug("Env: {}. 
Version: {}".format(env, version)) + if env_value: + if env and env[0]["value"] == env_value and version != "Unknown": + break + else: + if not env and version != "Unknown": + break + log.debug("Wait for changes to apply") + time.sleep(1) + + if scale_up: + log.info("Scale up after changes") + self.ensure_scale(dc_name, 1) + self.wait_replicas(dc_name, 1, running=True) + + def replace_command_on_deployment(self, deployment_name, command, scale_down=False): + log.info("Try to set command {} to deployment {}".format(command, deployment_name)) + deployment_entity = self.oc.get_deployment(deployment_name, "deployment") + deployment_entity["spec"]["template"]["spec"]["containers"][0]["command"] = command + old_generation = deployment_entity["status"]["observed_generation"] + log.info("Observed generation before update: {}".format(old_generation)) + if scale_down: + deployment_entity["spec"]["replicas"] = 0 + self.oc.apply_object(deployment_entity) + for _ in range(1, self.retry_count): + updated_deployment = self.oc.get_deployment(deployment_name, "deployment") + new_generation = updated_deployment["status"].get("observed_generation") + ready_replicas = updated_deployment["status"].get("ready_replicas") + if new_generation == (old_generation + 1) and ready_replicas: + break + else: + time.sleep(1) + + + def replace_command_on_dc(self, dc_name, command, scale_up=True): + log.info("Try to set command {} to deployment config {}".format(command, dc_name)) + + log.info("Scale down before changes") + self.ensure_scale(dc_name, 0) + replica_names = self.oc.get_replicas_names(dc_name, running=False) + for replica in replica_names: + self.oc.delete_pod(replica, 1) + self.wait_replicas(dc_name, 0) + + log.info("Change command") + dc = self.oc.get_deployment(dc_name) + dc["spec"]["template"]["spec"]["containers"][0]["command"] = command + + last_applied = dc["metadata"]["annotations"]["kubectl.kubernetes.io/last-applied-configuration"] + if last_applied: + dc = reset_last_applied(dc) + log.debug(json.dumps(dc)) + self.oc.apply_object(dc) + else: + log.debug(json.dumps(dc)) + self.oc.replace_object(dc) + + log.debug("Check if command present on actual version of dc") + for i in range(1, self.retry_count): + dc = self.oc.get_deployment(dc_name) + container_def = dc["spec"]["template"]["spec"]["containers"][0] + current_command = None + if "command" in container_def: + current_command = container_def["command"] + version = dc["metadata"].get("resourceVersion") if dc["metadata"].get("resourceVersion") else dc["metadata"].get("resource_version") + log.debug("Command: {}. 
Version: {}".format(current_command, version)) + if current_command == command and version != "Unknown": + break + log.debug("Wait for changes to apply") + time.sleep(1) + + if scale_up: + log.info("Scale up after changes") + self.ensure_scale(dc_name, 1) + self.wait_replicas(dc_name, 1, running=True) + + def replace_command_on_statefulset(self, stateful_set_name, command, scale_up=True): + log.info("Try to set command {} to statefulset {}".format(command, stateful_set_name)) + + stateful_set = self.oc.get_stateful_set(stateful_set_name) + replicas_number = self.oc.get_stateful_set_replicas_count(stateful_set_name) + log.info("Scale down before changes") + self.scale_stateful_set(stateful_set_name, 0) + + log.info("Change command") + + stateful_set["spec"]["template"]["spec"]["containers"][0]["command"] = command + + stateful_set = reset_last_applied(stateful_set) + log.debug(json.dumps(stateful_set)) + self.oc.apply_object(stateful_set) + + time.sleep(5) + if scale_up: + log.info("Scale up after changes") + self.scale_stateful_set(stateful_set_name, replicas_number) + + def wait_for_one_of_records_in_logs_since(self, pod_id, records, start_time, + wait_message=None, restart_timer_records=None): + """ + Receives logs from specified pod and checks if logs contain one of records since start_time. + Process will wait for record until retry counter reaches self.retry_count or + logs will be filled with one of restart_timer_records. + Sleep interval is 5 seconds. Between intervals Method can inform user about process with wait_message + :param pod_id: + :param records: + :type records: list + :param start_time: + :param wait_message: + :param restart_timer_records: + :type restart_timer_records: list + :return: Nothing. + :raise: Exception if record was not found + """ + log.debug("Wait for records {} in pod {} from {}.".format(records, pod_id, start_time)) + counter = 0 + sleep_time = 20 + fetch_start_time = start_time + while counter < self.retry_count / 5: + fetch_end_time = time.time() + # add small overlap to ensure that we dont miss peace of logs + time_passed = int(fetch_end_time - fetch_start_time) + sleep_time + logs = self.oc.get_logs(pod_id, time_passed) + # check if logs contains expected record + for record in records: + record_count = logs.count(record) + if record_count > 0: + log.debug("Found record '{}' in logs. Found {} records.".format(record, record_count)) + return + # check if logs contains new restart records. + if restart_timer_records: + for record in restart_timer_records: + record_count = logs.count(record) + if record_count > 0: + if wait_message: + log.info(wait_message) + log.debug("Found record '{}' in logs. Will prolong wait time. 
".format(record)) + counter = 0 + time.sleep(sleep_time) + counter = counter + 1 + fetch_start_time = fetch_end_time + raise Exception("Cannot find records in logs") + + def wait_for_record_in_logs_since(self, pod_id, record, start_time, wait_message=None, restart_timer_record=None): + self.wait_for_one_of_records_in_logs_since(pod_id, + [record], + start_time, + wait_message, + [restart_timer_record] if restart_timer_record else None) + + def scale_stateful_set(self, stateful_set_name, replicas_number): + log.info("Try to scale statefulset {} to {} replicas.".format(stateful_set_name, replicas_number)) + self.oc.scale(stateful_set_name, replicas_number, entity="statefulset") + self.wait_replicas_statefulset(stateful_set_name, replicas_number) + + def get_liveness_probe_from_stateful_set(self, stateful_set_name): + return self.oc.get_liveness_probe_from_stateful_set(stateful_set_name) + + def return_liveness_readiness_probes_for_stateful_set(self, stateful_set_name, probe): + stateful_set = self.oc.get_stateful_set(stateful_set_name) + stateful_set["spec"]["template"]["spec"]["containers"][0]["readinessProbe"] = probe + stateful_set["spec"]["template"]["spec"]["containers"][0]["livenessProbe"] = probe + self.oc.apply_object(stateful_set) \ No newline at end of file diff --git a/maintenance/recovery/utils_pg.py b/maintenance/recovery/utils_pg.py new file mode 100644 index 0000000..39cf8e1 --- /dev/null +++ b/maintenance/recovery/utils_pg.py @@ -0,0 +1,87 @@ +# Copyright 2024-2025 NetCracker Technology Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import time
+import logging
+import sys
+from utils_common import RecoveryException
+
+log = logging.getLogger()
+
+
+class PostgresqlClient:
+
+    def __init__(self, oc_client, retry_count=100):
+        self.oc = oc_client
+        self.retry_count = retry_count
+
+    def execute_select_query(self, query):
+        import psycopg2
+        from psycopg2 import Error
+        import os
+        connection = None
+        try:
+            connection = psycopg2.connect(user="postgres",
+                                          password=os.getenv('POSTGRES_PASSWORD'),
+                                          host=os.getenv('POSTGRES_HOST'),
+                                          port="5432",
+                                          database="postgres")
+
+            cursor = connection.cursor()
+            cursor.execute(query)
+            for p in cursor.fetchall():
+                return p[0]
+        except (Exception, Error) as error:
+            log.error("Postgres communication failed: {}".format(error))
+            # Re-raise so that wait_db_response can distinguish the expected
+            # "server is still starting" errors from unexpected ones.
+            raise
+        finally:
+            if connection:
+                connection.close()
+
+    def execute_local_query(self, pod_id, query):
+        return self.oc.oc_exec(pod_id, "psql -h localhost -p 5432 postgres -t -c \"{}\"".format(query)).strip()
+
+    def wait_db_response(self, pod_id, query, result):
+        log.info("Start waiting for response '{}' for query '{}' from DB on {}".format(result, query, pod_id))
+        wait_database_start_time = time.time()
+        query_result = None
+        select_counter = 0
+        for i in range(1, self.retry_count):
+            log.debug("{} try to check DB response.".format(i))
+            if query_result == result:
+                select_counter = select_counter + 1
+                if select_counter == 5:
+                    break
+            else:
+                select_counter = 0
+            time.sleep(1)
+            try:
+                query_result = self.execute_select_query(query)
+                log.debug("Response from DB: {}".format(query_result))
+            except Exception as e:
+                if 'current phase is Pending' in str(e) or \
+                        'Is the server running on host' in str(e) or \
+                        'server closed the connection' in str(e):
+                    log.debug("One of the allowed errors occurred during the request.")
+                else:
+                    raise e
+        wait_database_time = time.time() - wait_database_start_time
+        if query_result == result and select_counter == 5:
+            log.info("SUCCESS: Received response {} in {} sec".format(query_result, wait_database_time))
+        else:
+            raise RecoveryException("FAILURE: Cannot get expected result '{}' for query '{}' "
+                                    "from DB on pod {} in {} sec. Check if the database is working properly."
+                                    .format(result, query, pod_id, wait_database_time))
+
+    def wait_pg_recovery_complete(self, pod_id):
+        self.wait_db_response(pod_id, "select pg_is_in_recovery()", False)
+
+    def wait_database(self, pod_id):
+        self.wait_db_response(pod_id, "select 1", 1)
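
Note: the snippet below is a minimal usage sketch, not part of the change itself, showing how the helpers added in this diff could be wired together during a recovery run. The "utils_oc" module name, the "postgres" namespace, the "patroni" statefulset/cluster name, the replica count and the OC_URL environment variable are illustrative assumptions; only get_client, OpenshiftOrchestrator, PostgresqlClient and their methods come from the code above, and POSTGRES_HOST/POSTGRES_PASSWORD must already be set for execute_select_query to connect.

import os

# Assumed module names; the real import paths depend on the package layout.
from utils_oc import get_client, OpenshiftOrchestrator
from utils_pg import PostgresqlClient

# get_client() returns the kubernetes.client-based wrapper when use_kube_client
# is set, otherwise the oc shell wrapper.
oc = get_client()
# Token-based login path, used when the OC_TOKEN env variable is present.
oc.use_token(os.getenv("OC_URL"), os.getenv("OC_TOKEN"), "postgres")

orchestrator = OpenshiftOrchestrator(oc, retry_count=100)
pg = PostgresqlClient(oc)

# Restart the (assumed) "patroni" statefulset and wait until PostgreSQL
# answers "select 1" again for every pod of the cluster.
orchestrator.scale_stateful_set("patroni", 0)
orchestrator.scale_stateful_set("patroni", 2)
for pod in oc.get_cluster_pods("patroni"):
    pg.wait_database(pod)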