diff --git a/.buildkite/auditbeat/auditbeat-pipeline.yml b/.buildkite/auditbeat/auditbeat-pipeline.yml index 801768c271ec..ed19c7d91644 100644 --- a/.buildkite/auditbeat/auditbeat-pipeline.yml +++ b/.buildkite/auditbeat/auditbeat-pipeline.yml @@ -32,6 +32,9 @@ steps: command: | cd auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -47,6 +50,9 @@ steps: command: | cd auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9}" @@ -62,6 +68,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -79,6 +88,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -97,6 +109,9 @@ steps: make -C auditbeat crosscompile env: GOX_FLAGS: "-arch amd64" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -115,6 +130,9 @@ steps: set -euo pipefail cd auditbeat mage unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" @@ -133,6 +151,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd auditbeat mage unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -147,6 +168,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd auditbeat mage unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -164,6 +188,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -180,6 +207,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -196,6 +226,9 @@ steps: command: | Set-Location -Path auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" diff --git a/.buildkite/filebeat/filebeat-pipeline.yml b/.buildkite/filebeat/filebeat-pipeline.yml index 7eedd9d76fba..053e8dbec419 100644 --- a/.buildkite/filebeat/filebeat-pipeline.yml +++ b/.buildkite/filebeat/filebeat-pipeline.yml @@ -30,6 +30,9 @@ steps: command: | cd filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -45,6 +48,9 @@ steps: command: | cd filebeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -60,6 +66,9 @@ steps: command: | cd filebeat mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" @@ -76,6 +85,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -94,6 +106,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -118,6 +133,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -136,6 +154,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd filebeat mage build unitTest + retry: + automatic: + - limit: 
3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -152,6 +173,9 @@ steps: command: | cd filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" @@ -172,6 +196,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -190,6 +217,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -208,6 +238,9 @@ steps: command: | Set-Location -Path filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" diff --git a/.buildkite/heartbeat/heartbeat-pipeline.yml b/.buildkite/heartbeat/heartbeat-pipeline.yml index 8091b2eead17..cadbcec1eca2 100644 --- a/.buildkite/heartbeat/heartbeat-pipeline.yml +++ b/.buildkite/heartbeat/heartbeat-pipeline.yml @@ -30,6 +30,9 @@ steps: command: | cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -45,6 +48,9 @@ steps: command: | cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9}" @@ -61,6 +67,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -78,6 +87,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -94,6 +106,9 @@ steps: command: | cd heartbeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -109,6 +124,9 @@ steps: command: | cd heartbeat mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -130,6 +148,9 @@ steps: command: | cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" @@ -151,6 +172,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -168,6 +192,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -188,6 +215,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -205,6 +235,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -222,6 +255,9 @@ steps: command: | Set-Location -Path heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" diff --git a/.buildkite/libbeat/pipeline.libbeat.yml b/.buildkite/libbeat/pipeline.libbeat.yml index 040ad9b1d669..bc77712c330b 100644 --- a/.buildkite/libbeat/pipeline.libbeat.yml +++ b/.buildkite/libbeat/pipeline.libbeat.yml @@ -21,6 +21,9 @@ steps: set -euo pipefail cd libbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -38,6 +41,9 @@ steps: set -euo pipefail cd libbeat mage goIntegTest + retry: + automatic: + - limit: 
3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -55,6 +61,9 @@ steps: set -euo pipefail cd libbeat mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -72,6 +81,9 @@ steps: set -euo pipefail cd libbeat make crosscompile + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -89,6 +101,9 @@ steps: set -euo pipefail cd libbeat make STRESS_TEST_OPTIONS='-timeout=20m -race -v -parallel 1' GOTEST_OUTPUT_OPTIONS=' | go-junit-report > libbeat-stress-test.xml' stress-tests + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -108,6 +123,9 @@ steps: set -euo pipefail cd libbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" diff --git a/.buildkite/metricbeat/pipeline.yml b/.buildkite/metricbeat/pipeline.yml index 1fb6bfcc2370..d15212d2ef32 100644 --- a/.buildkite/metricbeat/pipeline.yml +++ b/.buildkite/metricbeat/pipeline.yml @@ -32,6 +32,9 @@ steps: - label: ":linux: Ubuntu Unit Tests" key: "mandatory-linux-unit-test" command: "cd metricbeat && mage build unitTest" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -62,6 +65,9 @@ steps: echo "~~~ Running tests" export KUBECONFIG="$$PWD/kubecfg" cd metricbeat && mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -92,6 +98,9 @@ steps: echo "~~~ Running tests" export KUBECONFIG="$$PWD/kubecfg" cd metricbeat && mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -106,6 +115,9 @@ steps: - label: ":negative_squared_cross_mark: Cross compile" key: "mandatory-cross-compile" command: "make -C metricbeat crosscompile" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -122,6 +134,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -140,6 +155,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -162,6 +180,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -180,6 +201,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -198,6 +222,9 @@ steps: Set-Location -Path metricbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -221,6 +248,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd metricbeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -238,6 +268,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd metricbeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/packaging.pipeline.yml b/.buildkite/packaging.pipeline.yml index a7fdabb2268d..5fd559f458d3 100644 --- 
a/.buildkite/packaging.pipeline.yml +++ b/.buildkite/packaging.pipeline.yml @@ -12,17 +12,42 @@ env: PLATFORMS_ARM: "linux/arm64" steps: + # we use concurrency gates (https://buildkite.com/blog/concurrency-gates) + # to implement two FIFO queues for DRA-snapshot and DRA-staging + # this prevents parallel builds and possibility of publishing out of order DRA artifacts if the first job takes longer than the second + + - name: Start of concurrency group for DRA Snapshot + if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" + command: echo "--> Start of concurrency gate dra-snapshot" + concurrency_group: "dra-gate-snapshot-$BUILDKITE_BRANCH" + concurrency: 1 + key: start-gate-snapshot + + - name: Start of concurrency group for DRA Staging + if: build.branch =~ /^\d+\.\d+$$/ + command: echo "--> Start of concurrency gate dra-staging" + concurrency_group: "dra-gate-staging-$BUILDKITE_BRANCH" + concurrency: 1 + key: start-gate-staging + + - wait + - group: Beats dashboards key: dashboards steps: - label: Snapshot dashboards if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" + depends_on: start-gate-snapshot key: dashboards-snapshot # TODO: container with go and make agents: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 commands: - make build/distributions/dependencies.csv - make beats-dashboards @@ -34,12 +59,17 @@ steps: - label: Staging dashboards if: build.branch =~ /^\d+\.\d+$$/ + depends_on: start-gate-staging key: dashboards-staging # TODO: container with go and make agents: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 commands: - make build/distributions/dependencies.csv - make beats-dashboards @@ -52,6 +82,7 @@ steps: - group: Packaging snapshot if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" key: packaging-snapshot + depends_on: start-gate-snapshot steps: - label: "SNAPSHOT: {{matrix}}" env: @@ -63,6 +94,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* matrix: @@ -93,6 +128,10 @@ steps: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" instanceType: "${AWS_ARM_INSTANCE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* matrix: @@ -119,12 +158,16 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "c2-standard-16" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* - group: Packaging Staging - key: packaging-staging + depends_on: start-gate-staging ## Only for release if: build.branch =~ /^\d+\.\d+$$/ steps: @@ -138,6 +181,10 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* matrix: @@ -168,8 +215,12 @@ steps: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" instanceType: "${AWS_ARM_INSTANCE_TYPE}" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - - build/distributions/** + - build/distributions/**/* matrix: - auditbeat - filebeat @@ -194,6 +245,10 @@ steps: provider: gcp 
image: "${IMAGE_UBUNTU_X86_64}" machineType: "c2-standard-16" + timeout_in_minutes: 40 + retry: + automatic: + - limit: 1 artifact_paths: - build/distributions/**/* @@ -207,6 +262,7 @@ steps: env: DRA_WORKFLOW: snapshot depends_on: + - start-gate-snapshot - packaging-snapshot - dashboards-snapshot command: | @@ -225,6 +281,7 @@ steps: env: DRA_WORKFLOW: staging depends_on: + - start-gate-staging - packaging-staging - dashboards-staging command: | @@ -235,3 +292,17 @@ steps: provider: gcp image: "${IMAGE_UBUNTU_X86_64}" machineType: "${GCP_DEFAULT_MACHINE_TYPE}" + + - wait + + - command: echo "End of concurrency gate dra-snapshot <--" + if: build.branch =~ /^\d+\.\d+$$/ || build.branch == 'main' || build.env('RUN_SNAPSHOT') == "true" + concurrency_group: "dra-gate-snapshot-$BUILDKITE_BRANCH" + concurrency: 1 + key: end-gate-snapshot + + - command: echo "End of concurrency gate dra-staging <--" + if: build.branch =~ /^\d+\.\d+$$/ + concurrency_group: "dra-gate-staging-$BUILDKITE_BRANCH" + concurrency: 1 + key: end-gate-staging diff --git a/.buildkite/packetbeat/pipeline.packetbeat.yml b/.buildkite/packetbeat/pipeline.packetbeat.yml index c0f5c1e1a735..d510107a89c2 100644 --- a/.buildkite/packetbeat/pipeline.packetbeat.yml +++ b/.buildkite/packetbeat/pipeline.packetbeat.yml @@ -28,6 +28,9 @@ steps: command: | cd packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -43,6 +46,9 @@ steps: command: | cd packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9_X86_64}" @@ -58,6 +64,9 @@ steps: command: | Set-Location -Path packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -75,6 +84,9 @@ steps: command: | Set-Location -Path packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -96,6 +108,9 @@ steps: command: | Set-Location -Path packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -114,6 +129,9 @@ steps: Set-Location -Path packetbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -132,6 +150,9 @@ steps: Set-Location -Path packetbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -156,6 +177,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -174,6 +198,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -188,6 +215,9 @@ steps: key: "linux-arm64-unit-tests-extended" command: "cd packetbeat && mage build unitTest" if: build.env("BUILDKITE_PULL_REQUEST") == "false" || build.env("GITHUB_PR_LABELS") =~ /.*arm.*/ + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${AWS_IMAGE_UBUNTU_ARM_64}" diff --git a/.buildkite/pipeline-scheduler.yml b/.buildkite/pipeline-scheduler.yml new file mode 100644 index 000000000000..3f9b628bc63a --- /dev/null +++ b/.buildkite/pipeline-scheduler.yml @@ -0,0 +1,17 @@ +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/buildkite/pipeline-schema/main/schema.json
+
+# this intermediate pipeline is required because we can't specify a custom agent (k8s image) yet
+# in catalog-info: https://github.com/elastic/ci/blob/71e83d340e3b93ab43fcf16a7a70ac33bdeec6e9/terrazzo/terrazzo/constructs/buildkite/pipelines.py#L787-L842
+
+steps:
+  - label: ":pipeline: Generate trigger steps for $PIPELINES_TO_TRIGGER"
+    command: |
+      set -eo pipefail
+      .buildkite/pipeline-scheduler.py >steps.yml
+      echo "~~~ Printing pipeline steps"
+      yq . steps.yml
+      echo "~~~ Uploading steps"
+      buildkite-agent pipeline upload steps.yml
+    agents:
+      image: "docker.elastic.co/ci-agent-images/platform-ingest/buildkite-agent-beats-ci-with-hooks:0.1"
+      useCustomGlobalHooks: true
diff --git a/.buildkite/scripts/dra.sh b/.buildkite/scripts/dra.sh
index ec9d523bf3ce..5ce6e5884b99 100755
--- a/.buildkite/scripts/dra.sh
+++ b/.buildkite/scripts/dra.sh
@@ -70,11 +70,13 @@ docker run --rm \
   --artifact-set "main" \
   ${DRY_RUN} | tee rm-output.txt

-# extract the summary URL from a release manager output line like:
-# Report summary-18.22.0.html can be found at https://artifacts-staging.elastic.co/beats/18.22.0-ABCDEFGH/summary-18.22.0.html
-SUMMARY_URL=$(grep -E '^Report summary-.* can be found at ' rm-output.txt | grep -oP 'https://\S+' | awk '{print $1}')
-rm rm-output.txt
+if [[ "$DRY_RUN" != "--dry-run" ]]; then
+  # extract the summary URL from a release manager output line like:
+  # Report summary-18.22.0.html can be found at https://artifacts-staging.elastic.co/beats/18.22.0-ABCDEFGH/summary-18.22.0.html
+  SUMMARY_URL=$(grep -E '^Report summary-.* can be found at ' rm-output.txt | grep -oP 'https://\S+' | awk '{print $1}')
+  rm rm-output.txt

-# and make it easily clickable as a Builkite annotation
-printf "**Summary link:** [${SUMMARY_URL}](${SUMMARY_URL})\n" | buildkite-agent annotate --style=success
+  # and make it easily clickable as a Buildkite annotation
+  printf "**${DRA_WORKFLOW} summary link:** [${SUMMARY_URL}](${SUMMARY_URL})\n" | buildkite-agent annotate --style=success --append
+fi
diff --git a/.buildkite/winlogbeat/pipeline.winlogbeat.yml b/.buildkite/winlogbeat/pipeline.winlogbeat.yml
index c71858b45b0a..ff3327913492 100644
--- a/.buildkite/winlogbeat/pipeline.winlogbeat.yml
+++ b/.buildkite/winlogbeat/pipeline.winlogbeat.yml
@@ -24,6 +24,9 @@ steps:
   - label: ":ubuntu: Winlogbeat Crossccompile"
     key: "mandatory-cross-compile"
     command: "make -C winlogbeat crosscompile"
+    retry:
+      automatic:
+        - limit: 3
     agents:
       provider: "gcp"
       image: "${IMAGE_UBUNTU_X86_64}"
@@ -40,6 +43,9 @@ steps:
       Set-Location -Path winlogbeat
       mage build unitTest
     key: "mandatory-win-2016-unit-tests"
+    retry:
+      automatic:
+        - limit: 3
     agents:
       provider: "gcp"
       image: "${IMAGE_WIN_2016}"
@@ -58,6 +64,9 @@ steps:
       Set-Location -Path winlogbeat
       mage build unitTest
     key: "mandatory-win-2019-unit-tests"
+    retry:
+      automatic:
+        - limit: 3
     agents:
       provider: "gcp"
       image: "${IMAGE_WIN_2019}"
@@ -76,6 +85,9 @@ steps:
       Set-Location -Path winlogbeat
       mage build unitTest
     key: "mandatory-win-2022-unit-tests"
+    retry:
+      automatic:
+        - limit: 3
     agents:
       provider: "gcp"
       image: "${IMAGE_WIN_2022}"
@@ -99,6 +111,9 @@ steps:
       Set-Location -Path winlogbeat
       mage build unitTest
     key: "extended-win-10-unit-tests"
+    retry:
+      automatic:
+        - limit: 3
     agents:
       provider: "gcp"
       image: "${IMAGE_WIN_10}"
@@ -117,6 +132,9 @@ steps:
       Set-Location -Path winlogbeat
       mage build unitTest
     key: "extended-win-11-unit-tests"
+    retry:
+      automatic:
+        - limit: 3
     agents:
       provider: "gcp"
image: "${IMAGE_WIN_11}" diff --git a/.buildkite/x-pack/pipeline.xpack.auditbeat.yml b/.buildkite/x-pack/pipeline.xpack.auditbeat.yml index 36fcb9bebd99..80c298c725df 100644 --- a/.buildkite/x-pack/pipeline.xpack.auditbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.auditbeat.yml @@ -36,6 +36,9 @@ steps: echo "~~~ Will run tests with env var MODULE=$$MODULE" cd x-pack/auditbeat mage update build test + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -52,6 +55,9 @@ steps: command: | cd x-pack/auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9_X86_64}" @@ -68,6 +74,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -86,6 +95,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -108,6 +120,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -126,6 +141,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -144,6 +162,9 @@ steps: Set-Location -Path x-pack/auditbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -167,6 +188,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd x-pack/auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -183,6 +207,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd x-pack/auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" @@ -201,6 +228,9 @@ steps: command: | cd x-pack/auditbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${IMAGE_UBUNTU_ARM_64}" diff --git a/.buildkite/x-pack/pipeline.xpack.dockerlogbeat.yml b/.buildkite/x-pack/pipeline.xpack.dockerlogbeat.yml index 05aee81e4d80..a64f7851913b 100644 --- a/.buildkite/x-pack/pipeline.xpack.dockerlogbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.dockerlogbeat.yml @@ -28,6 +28,9 @@ steps: - label: ":ubuntu: Xpack/Dockerlogbeat Ubuntu Unit Tests" key: "mandatory-linux-unit-test" command: "cd x-pack/dockerlogbeat && mage build unitTest" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -44,6 +47,9 @@ steps: command: "cd x-pack/dockerlogbeat && mage goIntegTest" env: MODULE: $MODULE + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" diff --git a/.buildkite/x-pack/pipeline.xpack.filebeat.yml b/.buildkite/x-pack/pipeline.xpack.filebeat.yml index 795302bc2d99..b7e71e3c3c0a 100644 --- a/.buildkite/x-pack/pipeline.xpack.filebeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.filebeat.yml @@ -30,6 +30,9 @@ steps: command: | cd x-pack/filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -50,6 +53,9 @@ steps: defineModuleFromTheChangeSet x-pack/filebeat echo "~~~ Will run tests with env var MODULE=$$MODULE" cd 
x-pack/filebeat && mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -70,6 +76,9 @@ steps: defineModuleFromTheChangeSet x-pack/filebeat echo "~~~ Running tests with env var MODULE=$$MODULE" cd x-pack/filebeat && mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -86,6 +95,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -104,6 +116,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -122,6 +137,9 @@ steps: command: | cd x-pack/filebeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${IMAGE_UBUNTU_ARM_64}" @@ -142,6 +160,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -160,6 +181,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -178,6 +202,9 @@ steps: Set-Location -Path x-pack/filebeat mage build unitTest key: "x-pack-filebeat-extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -200,6 +227,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/filebeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -217,6 +247,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/filebeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.heartbeat.yml b/.buildkite/x-pack/pipeline.xpack.heartbeat.yml index 107dfa65f1b2..136706e698cc 100644 --- a/.buildkite/x-pack/pipeline.xpack.heartbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.heartbeat.yml @@ -39,6 +39,9 @@ steps: echo "~~~ Running tests" cd x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -59,6 +62,9 @@ steps: echo "~~~ Running tests" cd x-pack/heartbeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -76,6 +82,9 @@ steps: command: | Set-Location -Path x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -95,6 +104,9 @@ steps: command: | Set-Location -Path x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -118,6 +130,9 @@ steps: Set-Location -Path x-pack/heartbeat mage build test key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -136,6 +151,9 @@ steps: Set-Location -Path x-pack/heartbeat mage build test key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -153,6 +171,9 @@ steps: command: | Set-Location -Path x-pack/heartbeat 
mage build test + retry: + automatic: + - limit: 3 key: "extended-win-2019-unit-tests" agents: provider: "gcp" @@ -166,7 +187,7 @@ steps: notify: - github_commit_status: context: "x-pack/heartbeat: Windows 2019 Unit Tests" - + - group: "x-pack/heartbeat MacOS Extended Tests" key: "x-pack-heartbeat-extended-tests-macos" if: build.env("BUILDKITE_PULL_REQUEST") == "false" || build.env("GITHUB_PR_LABELS") =~ /.*macOS.*/ @@ -179,6 +200,9 @@ steps: installNodeJsDependencies cd x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -197,6 +221,9 @@ steps: installNodeJsDependencies cd x-pack/heartbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.libbeat.yml b/.buildkite/x-pack/pipeline.xpack.libbeat.yml index 14316a3ecd70..6bf456f6d83d 100644 --- a/.buildkite/x-pack/pipeline.xpack.libbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.libbeat.yml @@ -26,6 +26,9 @@ steps: command: | cd x-pack/libbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -42,6 +45,9 @@ steps: command: | cd x-pack/libbeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -58,6 +64,9 @@ steps: command: | cd x-pack/libbeat mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -74,6 +83,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -92,6 +104,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -114,6 +129,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -132,6 +150,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -150,6 +171,9 @@ steps: Set-Location -Path x-pack/libbeat mage -w reader\etw build goUnitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -172,6 +196,9 @@ steps: command: | cd x-pack/libbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${IMAGE_UBUNTU_ARM_64}" diff --git a/.buildkite/x-pack/pipeline.xpack.metricbeat.yml b/.buildkite/x-pack/pipeline.xpack.metricbeat.yml index 317b9069c556..4c1c31521f92 100644 --- a/.buildkite/x-pack/pipeline.xpack.metricbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.metricbeat.yml @@ -30,6 +30,9 @@ steps: command: | cd x-pack/metricbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -50,6 +53,9 @@ steps: defineModuleFromTheChangeSet x-pack/metricbeat echo "~~~ Will run tests with env var MODULE=$$MODULE" cd x-pack/metricbeat && mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -70,6 +76,9 @@ steps: defineModuleFromTheChangeSet x-pack/metricbeat 
echo "~~~ Running tests with env var MODULE=$$MODULE" cd x-pack/metricbeat && mage pythonIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -86,6 +95,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -104,6 +116,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -126,6 +141,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -144,6 +162,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -162,6 +183,9 @@ steps: Set-Location -Path x-pack/metricbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -185,6 +209,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/metricbeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -202,6 +229,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/metricbeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.osquerybeat.yml b/.buildkite/x-pack/pipeline.xpack.osquerybeat.yml index 8c9137cb423e..c8ecac79735b 100644 --- a/.buildkite/x-pack/pipeline.xpack.osquerybeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.osquerybeat.yml @@ -30,6 +30,9 @@ steps: command: | cd x-pack/osquerybeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -46,6 +49,9 @@ steps: command: | cd x-pack/osquerybeat mage goIntegTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -62,6 +68,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -80,6 +89,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -102,6 +114,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -120,6 +135,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -138,6 +156,9 @@ steps: Set-Location -Path x-pack/osquerybeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -160,6 +181,9 @@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/osquerybeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -175,6 +199,9 
@@ steps: set -euo pipefail source .buildkite/scripts/install_macos_tools.sh cd x-pack/osquerybeat && mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.packetbeat.yml b/.buildkite/x-pack/pipeline.xpack.packetbeat.yml index 77fdf2af8483..1ab71c30d7df 100644 --- a/.buildkite/x-pack/pipeline.xpack.packetbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.packetbeat.yml @@ -29,6 +29,9 @@ steps: command: | cd x-pack/packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -45,6 +48,9 @@ steps: command: | cd x-pack/packetbeat mage systemTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_UBUNTU_X86_64}" @@ -61,6 +67,9 @@ steps: command: | cd x-pack/packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_RHEL9_X86_64}" @@ -77,6 +86,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -95,6 +107,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -114,6 +129,9 @@ steps: command: | Set-Location -Path x-pack/packetbeat mage systemTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -136,6 +154,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -154,6 +175,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -172,6 +196,9 @@ steps: Set-Location -Path x-pack/packetbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -191,6 +218,9 @@ steps: command: | Set-Location -Path x-pack/packetbeat mage systemTest + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -214,6 +244,9 @@ steps: cd x-pack/packetbeat mage build unitTest if: build.env("GITHUB_PR_LABELS") =~ /.*arm.*/ + retry: + automatic: + - limit: 3 agents: provider: "aws" imagePrefix: "${IMAGE_UBUNTU_ARM_64}" @@ -236,6 +269,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd x-pack/packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_X86_64}" @@ -253,6 +289,9 @@ steps: source .buildkite/scripts/install_macos_tools.sh cd x-pack/packetbeat mage build unitTest + retry: + automatic: + - limit: 3 agents: provider: "orka" imagePrefix: "${IMAGE_MACOS_ARM}" diff --git a/.buildkite/x-pack/pipeline.xpack.winlogbeat.yml b/.buildkite/x-pack/pipeline.xpack.winlogbeat.yml index c07e537adf09..c6b5a6f59fe5 100644 --- a/.buildkite/x-pack/pipeline.xpack.winlogbeat.yml +++ b/.buildkite/x-pack/pipeline.xpack.winlogbeat.yml @@ -29,6 +29,9 @@ steps: mage build unitTest env: MODULE: $MODULE + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" @@ -47,6 +50,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "mandatory-win-2016-unit-tests" + retry: + automatic: + - limit: 3 
agents: provider: "gcp" image: "${IMAGE_WIN_2016}" @@ -65,6 +71,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "mandatory-win-2022-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2022}" @@ -88,6 +97,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "extended-win-10-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_10}" @@ -106,6 +118,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "extended-win-11-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_11}" @@ -124,6 +139,9 @@ steps: Set-Location -Path x-pack/winlogbeat mage build unitTest key: "extended-win-2019-unit-tests" + retry: + automatic: + - limit: 3 agents: provider: "gcp" image: "${IMAGE_WIN_2019}" diff --git a/.ci/jobs/packaging.yml b/.ci/jobs/packaging.yml index 6d4b136a5573..50cec32edd84 100644 --- a/.ci/jobs/packaging.yml +++ b/.ci/jobs/packaging.yml @@ -14,7 +14,7 @@ discover-pr-forks-trust: 'permission' discover-pr-origin: 'merge-current' discover-tags: true - head-filter-regex: '(7\.1[6789]|8\.\d+|PR-.*|v\d+\.\d+\.\d+)' + head-filter-regex: '(7\.1[6789]|8\.13|PR-.*|v8\.13\.\d+)' disable-pr-notifications: true notification-context: 'beats-packaging' repo: 'beats' @@ -28,11 +28,11 @@ ignore-tags-older-than: -1 ignore-tags-newer-than: 30 - named-branches: - - regex-name: - regex: '7\.1[6789]' + - exact-name: + name: '8.13' case-sensitive: true - regex-name: - regex: '8\.\d+' + regex: '7\.1[6789]' case-sensitive: true - change-request: ignore-target-only-changes: true diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7fcaca8ac9e1..bbd4255fd870 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -34,6 +34,7 @@ updates: - dependency-name: github.com/elastic/go-perf - dependency-name: github.com/elastic/go-seccomp-bpf - dependency-name: github.com/elastic/toutoumomoma + - dependency-name: github.com/elastic/ebpfevents ignore: # Skip github.com/elastic/mito because it requires documentation updates. - dependency-name: github.com/elastic/mito diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index a1f6bf4aadd1..e9e5ec09236c 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -94,8 +94,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] *Auditbeat* - Set field types to correctly match ECS in sessionmd processor {issue}38955[38955] {pull}38994[38994] -- Keep process info on exited processes, to avoid failing to enrich events in sessionmd processor {pull}39173[39173] - +- Fix failing to enrich process events in sessionmd processor {issue}38955[38955] {pull}39173[39173] {pull}39243[39243] - Prevent scenario of losing children-related file events in a directory for recursive fsnotify backend of auditbeat file integrity module {pull}39133[39133] - Allow extra syscalls by auditbeat required in FIM with kprobes back-end {pull}39361[39361] @@ -143,6 +142,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Updated Websocket input title to align with existing inputs {pull}39006[39006] - Restore netflow input on Windows {pull}39024[39024] - Upgrade azure-event-hubs-go and azure-storage-blob-go dependencies. {pull}38861[38861] +- Fix concurrency/error handling bugs in the AWS S3 input that could drop data and prevent ingestion of large buckets. 
{pull}39131[39131] *Heartbeat* @@ -160,6 +160,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Fix fields not being parsed correctly in postgresql/database {issue}25301[25301] {pull}37720[37720] - rabbitmq/queue - Change the mapping type of `rabbitmq.queue.consumers.utilisation.pct` to `scaled_float` from `long` because the values fall within the range of `[0.0, 1.0]`. Previously, conversion to integer resulted in reporting either `0` or `1`. - Fix timeout caused by the retrival of which indices are hidden {pull}39165[39165] +- Fix Azure Monitor support for multiple aggregation types {issue}39192[39192] {pull}39204[39204] *Osquerybeat* diff --git a/NOTICE.txt b/NOTICE.txt index f060baf40980..951b7e7785c1 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -12525,11 +12525,11 @@ various licenses: -------------------------------------------------------------------------------- Dependency : github.com/elastic/elastic-agent-autodiscover -Version: v0.6.13 +Version: v0.6.14 Licence type (autodetected): Apache-2.0 -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-autodiscover@v0.6.13/LICENSE: +Contents of probable licence file $GOMODCACHE/github.com/elastic/elastic-agent-autodiscover@v0.6.14/LICENSE: Apache License Version 2.0, January 2004 @@ -25433,11 +25433,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- Dependency : golang.org/x/net -Version: v0.21.0 +Version: v0.23.0 Licence type (autodetected): BSD-3-Clause -------------------------------------------------------------------------------- -Contents of probable licence file $GOMODCACHE/golang.org/x/net@v0.21.0/LICENSE: +Contents of probable licence file $GOMODCACHE/golang.org/x/net@v0.23.0/LICENSE: Copyright (c) 2009 The Go Authors. All rights reserved. 
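
Stepping back from the individual diffs: the pipeline changes above apply one recurring stanza, `retry: automatic: - limit: 3`, to nearly every test step, and the packaging pipeline brackets its work between paired concurrency steps. The following is a minimal sketch of both patterns on a hypothetical pipeline; the step labels, keys, and commands here are illustrative and not taken from the diff:

```yaml
steps:
  # FIFO gate: with concurrency 1, builds on the same branch queue here
  # one at a time, which prevents out-of-order artifact publishing.
  - label: "Start of concurrency gate"
    command: echo "--> start gate"
    concurrency_group: "example-gate-$BUILDKITE_BRANCH"
    concurrency: 1
    key: start-gate

  - wait

  # The retry stanza added throughout this change set: when the step
  # fails, Buildkite retries it automatically, up to 3 times.
  - label: "Unit tests"
    command: mage build unitTest
    depends_on: start-gate
    retry:
      automatic:
        - limit: 3

  - wait

  # Closing step in the same concurrency group releases the next queued build.
  - label: "End of concurrency gate"
    command: echo "end gate <--"
    concurrency_group: "example-gate-$BUILDKITE_BRANCH"
    concurrency: 1
    key: end-gate
```

Because the start and end steps share one `concurrency_group` with `concurrency: 1`, a second build cannot pass the start gate until the previous build has run its end gate; that is what serializes the DRA snapshot and staging publishing per branch, as the comment in packaging.pipeline.yml describes.
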
diff --git a/catalog-info.yaml b/catalog-info.yaml index 116e50246634..34d9e397ca3e 100644 --- a/catalog-info.yaml +++ b/catalog-info.yaml @@ -61,7 +61,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -108,7 +108,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -155,7 +155,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -202,7 +202,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -249,7 +249,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -296,7 +296,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -343,7 +343,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -390,7 +390,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -436,7 +436,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -483,7 +483,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -530,7 +530,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -577,7 +577,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -624,7 +624,7 @@ spec: ingest-fp: access_level: 
MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -671,7 +671,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -706,7 +706,7 @@ spec: release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json apiVersion: backstage.io/v1alpha1 @@ -788,7 +788,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -835,7 +835,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -882,7 +882,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -929,7 +929,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -976,7 +976,7 @@ spec: ingest-fp: access_level: MANAGE_BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -1015,7 +1015,7 @@ spec: release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json @@ -1040,24 +1040,33 @@ spec: spec: repository: elastic/beats pipeline_file: ".buildkite/packaging.pipeline.yml" - branch_configuration: "main" + branch_configuration: "main 8.14" # TODO enable after packaging backports for release branches # branch_configuration: "main 8.* 7.17" cancel_intermediate_builds: false skip_intermediate_builds: false + maximum_timeout_in_minutes: 90 provider_settings: + build_branches: true + build_pull_request_forks: false + build_pull_requests: false + build_tags: false + filter_condition: >- + build.branch =~ /^[0-9]+\.[0-9]+$$/ || build.branch == "main" + filter_enabled: true trigger_mode: code env: ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'true' SLACK_NOTIFICATIONS_CHANNEL: '#ingest-notifications' SLACK_NOTIFICATIONS_ON_SUCCESS: 'false' + SLACK_NOTIFICATIONS_SKIP_FOR_RETRIES: 'true' teams: ingest-fp: access_level: MANAGE_BUILD_AND_READ release-eng: access_level: BUILD_AND_READ everyone: - access_level: READ_ONLY + access_level: BUILD_AND_READ --- # yaml-language-server: 
$schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json
@@ -1087,29 +1096,56 @@ spec:
       skip_intermediate_builds: false
       provider_settings:
         trigger_mode: none
-      # TODO uncomment out after https://github.com/elastic/ingest-dev/issues/3235
-      # schedules:
-      #   # TODO to be replaced with a generic scheduler similar to https://github.com/elastic/logstash/pull/15705
-      #   Daily run of ironbank validation / main:
-      #     branch: main
-      #     cronline: 30 02 * * *
-      #     message: Daily trigger of IronBank validation on main
-      #   Daily run of ironbank validation / 8.14:
-      #     branch: 8.14
-      #     cronline: 30 02 * * *
-      #     message: Daily trigger of IronBank validation on 8.14
-      #   Daily run of ironbank validation / 8.13:
-      #     branch: 8.13
-      #     cronline: 30 02 * * *
-      #     message: Daily trigger of IronBank validation on 8.13
-      #   Daily run of ironbank validation / 7.17:
-      #     branch: 7.17
-      #     cronline: 30 02 * * *
-      #     message: Daily trigger of IronBank validation on 7.17
       teams:
         ingest-fp:
           access_level: MANAGE_BUILD_AND_READ
         release-eng:
           access_level: BUILD_AND_READ
         everyone:
-          access_level: READ_ONLY
+          access_level: BUILD_AND_READ
+
+---
+# yaml-language-server: $schema=https://gist.githubusercontent.com/elasticmachine/988b80dae436cafea07d9a4a460a011d/raw/rre.schema.json
+apiVersion: backstage.io/v1alpha1
+kind: Resource
+metadata:
+  name: beats-pipeline-scheduler
+  description: 'Scheduled runs of various Beats pipelines per release branch'
+  links:
+    - title: 'Scheduled runs of Beats pipelines per release branch'
+      url: https://buildkite.com/elastic/logstash-pipeline-scheduler
+spec:
+  type: buildkite-pipeline
+  owner: group:ingest-fp
+  system: buildkite
+  implementation:
+    apiVersion: buildkite.elastic.dev/v1
+    kind: Pipeline
+    metadata:
+      name: beats-pipeline-scheduler
+      description: ':alarm_clock: Scheduled runs of various Beats pipelines per release branch'
+    spec:
+      repository: elastic/beats
+      pipeline_file: ".buildkite/pipeline-scheduler.yml"
+      maximum_timeout_in_minutes: 240
+      schedules:
+        Daily run of Iron Bank validation:
+          branch: main
+          cronline: 30 02 * * *
+          message: Daily trigger of Iron Bank validation Pipeline per branch
+          env:
+            PIPELINES_TO_TRIGGER: 'beats-ironbank-validation'
+      skip_intermediate_builds: true
+      provider_settings:
+        trigger_mode: none
+      env:
+        ELASTIC_SLACK_NOTIFICATIONS_ENABLED: 'true'
+        SLACK_NOTIFICATIONS_CHANNEL: '#ingest-notifications'
+        SLACK_NOTIFICATIONS_ON_SUCCESS: 'false'
+      teams:
+        ingest-fp:
+          access_level: MANAGE_BUILD_AND_READ
+        release-eng:
+          access_level: BUILD_AND_READ
+        everyone:
+          access_level: BUILD_AND_READ
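
The new `beats-pipeline-scheduler` resource above wires a cron schedule to the intermediate `.buildkite/pipeline-scheduler.yml` pipeline, which runs `pipeline-scheduler.py` and uploads whatever steps the script prints via `buildkite-agent pipeline upload`. The script's output is not part of this diff, but Buildkite `trigger` steps are the standard way such a scheduler fans out to other pipelines, so a plausible generated `steps.yml` might look like the sketch below; the pipeline slug and branch list are assumptions based on `PIPELINES_TO_TRIGGER` and the release branches named in the removed TODO block:

```yaml
# Hypothetical output of pipeline-scheduler.py: one trigger step per
# (pipeline, release branch) pair listed in $PIPELINES_TO_TRIGGER.
steps:
  - trigger: beats-ironbank-validation
    label: "Trigger beats-ironbank-validation / main"
    build:
      branch: "main"
  - trigger: beats-ironbank-validation
    label: "Trigger beats-ironbank-validation / 8.14"
    build:
      branch: "8.14"
  - trigger: beats-ironbank-validation
    label: "Trigger beats-ironbank-validation / 7.17"
    build:
      branch: "7.17"
```
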
diff --git a/dev-tools/mage/kubernetes/kuberemote.go b/dev-tools/mage/kubernetes/kuberemote.go
index 8e9d9897d44f..e3062f00d1ad 100644
--- a/dev-tools/mage/kubernetes/kuberemote.go
+++ b/dev-tools/mage/kubernetes/kuberemote.go
@@ -250,7 +250,7 @@ func (r *KubeRemote) waitForPod(wait time.Duration, condition watchtools.Conditi
 	return nil, err
 }

-// portFoward runs the port forwarding so SSH rsync can be ran into the pod.
+// portForward runs the port forwarding so SSH rsync can be run into the pod.
 func (r *KubeRemote) portForward(ports []string, stopChannel, readyChannel chan struct{}, stdout, stderr io.Writer) (*portforward.PortForwarder, error) {
 	roundTripper, upgrader, err := spdy.RoundTripperFor(r.cfg)
 	if err != nil {
diff --git a/filebeat/docs/inputs/input-filestream-file-options.asciidoc b/filebeat/docs/inputs/input-filestream-file-options.asciidoc
index 47a8c819d9ea..5436d3863dc2 100644
--- a/filebeat/docs/inputs/input-filestream-file-options.asciidoc
+++ b/filebeat/docs/inputs/input-filestream-file-options.asciidoc
@@ -517,6 +517,30 @@ less than or equal to `prospector.scanner.check_interval`
 If `backoff.max` needs to be higher, it is recommended to close the file
 handler instead and let {beatname_uc} pick up the file again.

+[float]
+[id="{beatname_lc}-input-{type}-harvester-limit"]
+===== `harvester_limit`
+
+The `harvester_limit` option limits the number of harvesters that are started in
+parallel for one input. This directly relates to the maximum number of file
+handlers that are opened. The default for `harvester_limit` is 0, which means
+there is no limit. This configuration is useful if the number of files to be
+harvested exceeds the open file handler limit of the operating system.
+
+Setting a limit on the number of harvesters means that potentially not all files
+are opened in parallel. Therefore we recommend that you use this option in
+combination with the `close.on_state_change.*` options to make sure
+harvesters are stopped more often so that new files can be picked up.
+
+Currently, if a new harvester can be started again, the harvester is picked
+randomly. This means it's possible that the harvester for a file that was just
+closed and then updated again might be started instead of the harvester for a
+file that hasn't been harvested for a longer period of time.
+
+This configuration option applies per input. You can use this option to
+indirectly set higher priorities on certain inputs by assigning a higher
+limit of harvesters.
+
 [float]
 ===== `file_identity`
diff --git a/filebeat/docs/inputs/input-filestream.asciidoc b/filebeat/docs/inputs/input-filestream.asciidoc
index 47d1b24a8e85..54283d6cce79 100644
--- a/filebeat/docs/inputs/input-filestream.asciidoc
+++ b/filebeat/docs/inputs/input-filestream.asciidoc
@@ -11,8 +11,9 @@ Use the `filestream` input to read lines from active log files. It is the new,
 improved alternative to the `log` input. It comes with various improvements
 to the existing input:

-1. Checking of `close_*` options happens out of band. Thus, if an output is blocked,
-{beatname_uc} can close the reader and avoid keeping too many files open.
+1. Checking of `close.on_state_change.*` options happens out of
+band. Thus, if an output is blocked, {beatname_uc} can close the
+reader and avoid keeping too many files open.

 2. Detailed metrics are available for all files that match the `paths`
 configuration regardless of the `harvester_limit`. This way, you can keep
 track of all files,
diff --git a/filebeat/input/filestream/environment_test.go b/filebeat/input/filestream/environment_test.go
index 7c3c8ccd4d3b..88163258938a 100644
--- a/filebeat/input/filestream/environment_test.go
+++ b/filebeat/input/filestream/environment_test.go
@@ -448,7 +448,7 @@ func (e *inputTestingEnvironment) waitUntilHarvesterIsDone() {
 	}
 }

-// requireEventReceived requires that the list of messages has made it into the output.
+// requireEventsReceived requires that the list of messages has made it into the output.
func (e *inputTestingEnvironment) requireEventsReceived(events []string) { foundEvents := make([]bool, len(events)) checkedEventCount := 0 diff --git a/go.mod b/go.mod index 0805e9200c8d..ad13afabd8da 100644 --- a/go.mod +++ b/go.mod @@ -154,7 +154,7 @@ require ( golang.org/x/crypto v0.21.0 golang.org/x/lint v0.0.0-20210508222113-6edffad5e616 golang.org/x/mod v0.14.0 - golang.org/x/net v0.21.0 + golang.org/x/net v0.23.0 golang.org/x/oauth2 v0.10.0 golang.org/x/sync v0.5.0 golang.org/x/sys v0.18.0 @@ -203,7 +203,7 @@ require ( github.com/awslabs/kinesis-aggregation/go/v2 v2.0.0-20220623125934-28468a6701b5 github.com/elastic/bayeux v1.0.5 github.com/elastic/ebpfevents v0.6.0 - github.com/elastic/elastic-agent-autodiscover v0.6.13 + github.com/elastic/elastic-agent-autodiscover v0.6.14 github.com/elastic/elastic-agent-libs v0.7.5 github.com/elastic/elastic-agent-shipper-client v0.5.1-0.20230228231646-f04347b666f3 github.com/elastic/elastic-agent-system-metrics v0.9.2 diff --git a/go.sum b/go.sum index 57711b7a9feb..5c45bdee7488 100644 --- a/go.sum +++ b/go.sum @@ -551,8 +551,8 @@ github.com/elastic/dhcp v0.0.0-20200227161230-57ec251c7eb3 h1:lnDkqiRFKm0rxdljqr github.com/elastic/dhcp v0.0.0-20200227161230-57ec251c7eb3/go.mod h1:aPqzac6AYkipvp4hufTyMj5PDIphF3+At8zr7r51xjY= github.com/elastic/ebpfevents v0.6.0 h1:BrL3m7JFK7U6h2jkbk3xAWWs//IZnugCHEDds5u2v68= github.com/elastic/ebpfevents v0.6.0/go.mod h1:ESG9gw7N+n5yCCMgdg1IIJENKWSmX7+X0Fi9GUs9nvU= -github.com/elastic/elastic-agent-autodiscover v0.6.13 h1:zBeTxV+o2efEKntY+o6iMMNJ1AVjDXUqY3o6uzIkKaw= -github.com/elastic/elastic-agent-autodiscover v0.6.13/go.mod h1:7P6YVKxuBT0qE/VxuA87obwZUAEU0O44mCN3r4/6x8w= +github.com/elastic/elastic-agent-autodiscover v0.6.14 h1:0zJYNyv9GKTOiNqCHqEVboP+WioV73ia17Et+UlFbz8= +github.com/elastic/elastic-agent-autodiscover v0.6.14/go.mod h1:39/fHHlnyTK6oUNZfAhxJwBTVahO9tNasEIjzsxGMu8= github.com/elastic/elastic-agent-client/v7 v7.8.1 h1:J9wZc/0mUvSEok0X5iR5+n60Jgb+AWooKddb3XgPWqM= github.com/elastic/elastic-agent-client/v7 v7.8.1/go.mod h1:axl1nkdqc84YRFkeJGD9jExKNPUrOrzf3DFo2m653nY= github.com/elastic/elastic-agent-libs v0.7.5 h1:4UMqB3BREvhwecYTs/L23oQp1hs/XUkcunPlmTZn5yg= @@ -1960,8 +1960,8 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= -golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190130055435-99b60b757ec1/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= diff --git a/libbeat/monitoring/report/log/log.go b/libbeat/monitoring/report/log/log.go index 886e207593a3..e11e8228cf70 100644 --- a/libbeat/monitoring/report/log/log.go +++ b/libbeat/monitoring/report/log/log.go @@ -37,34 +37,36 @@ import ( // TODO: Replace this with a proper solution that uses the metric type from // where it is defined. 
See: https://github.com/elastic/beats/issues/5433 var gauges = map[string]bool{ - "libbeat.output.events.active": true, - "libbeat.pipeline.events.active": true, - "libbeat.pipeline.clients": true, - "libbeat.config.module.running": true, - "registrar.states.current": true, - "filebeat.events.active": true, - "filebeat.harvester.running": true, - "filebeat.harvester.open_files": true, - "beat.memstats.memory_total": true, - "beat.memstats.memory_alloc": true, - "beat.memstats.rss": true, - "beat.memstats.gc_next": true, - "beat.info.uptime.ms": true, - "beat.cgroup.memory.mem.usage.bytes": true, - "beat.cpu.user.ticks": true, - "beat.cpu.system.ticks": true, - "beat.cpu.total.value": true, - "beat.cpu.total.ticks": true, - "beat.handles.open": true, - "beat.handles.limit.hard": true, - "beat.handles.limit.soft": true, - "beat.runtime.goroutines": true, - "system.load.1": true, - "system.load.5": true, - "system.load.15": true, - "system.load.norm.1": true, - "system.load.norm.5": true, - "system.load.norm.15": true, + "libbeat.output.events.active": true, + "libbeat.pipeline.events.active": true, + "libbeat.pipeline.clients": true, + "libbeat.pipeline.queue.max_events": true, + "libbeat.pipeline.queue.filled.pct.events": true, + "libbeat.config.module.running": true, + "registrar.states.current": true, + "filebeat.events.active": true, + "filebeat.harvester.running": true, + "filebeat.harvester.open_files": true, + "beat.memstats.memory_total": true, + "beat.memstats.memory_alloc": true, + "beat.memstats.rss": true, + "beat.memstats.gc_next": true, + "beat.info.uptime.ms": true, + "beat.cgroup.memory.mem.usage.bytes": true, + "beat.cpu.user.ticks": true, + "beat.cpu.system.ticks": true, + "beat.cpu.total.value": true, + "beat.cpu.total.ticks": true, + "beat.handles.open": true, + "beat.handles.limit.hard": true, + "beat.handles.limit.soft": true, + "beat.runtime.goroutines": true, + "system.load.1": true, + "system.load.5": true, + "system.load.15": true, + "system.load.norm.1": true, + "system.load.norm.5": true, + "system.load.norm.15": true, } // isGauge returns true when the given metric key name represents a gauge value. @@ -249,16 +251,16 @@ func toKeyValuePairs(snaps map[string]monitoring.FlatSnapshot) []interface{} { for name, snap := range snaps { data := make(mapstr.M, snapshotLen(snap)) for k, v := range snap.Bools { - data.Put(k, v) //nolint:errcheck // All keys within the flat snapshot are unique and are for scalar values. + data.Put(k, v) } for k, v := range snap.Floats { - data.Put(k, v) //nolint:errcheck // All keys within the flat snapshot are unique and are for scalar values. + data.Put(k, v) } for k, v := range snap.Ints { - data.Put(k, v) //nolint:errcheck // All keys within the flat snapshot are unique and are for scalar values. + data.Put(k, v) } for k, v := range snap.Strings { - data.Put(k, v) //nolint:errcheck // All keys within the flat snapshot are unique and are for scalar values. 
+			data.Put(k, v)
 		}
 		if len(data) > 0 {
 			args = append(args, logp.Reflect(name, data))
diff --git a/libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc b/libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc
index 53292667f13b..616582101733 100644
--- a/libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc
+++ b/libbeat/processors/add_docker_metadata/docs/add_docker_metadata.asciidoc
@@ -5,6 +5,11 @@
 add_docker_metadata
 ++++
 
+ifeval::["{beatname_lc}"=="packetbeat"]
+{beatname_uc} currently has extremely limited capability to monitor and coexist with containers, for example Docker, Podman, or Kubernetes. Using the `add_docker_metadata` processor with {beatname_lc} is neither recommended nor supported.
+endif::[]
+
+ifeval::["{beatname_lc}"!="packetbeat"]
 The `add_docker_metadata` processor annotates each event with relevant metadata
 from Docker containers. At startup it detects a docker environment and caches the metadata.
 The events are annotated with Docker metadata, only if a valid configuration
@@ -88,3 +93,4 @@ forget metadata for a container, 60s by default.
 
 `labels.dedot`:: (Optional) Default to be false. If set to true, replace dots in
 labels with `_`.
+endif::[]
\ No newline at end of file
diff --git a/libbeat/publisher/pipeline/monitoring.go b/libbeat/publisher/pipeline/monitoring.go
index 69a21c2c71ca..cda329e0963a 100644
--- a/libbeat/publisher/pipeline/monitoring.go
+++ b/libbeat/publisher/pipeline/monitoring.go
@@ -17,7 +17,11 @@
 
 package pipeline
 
-import "github.com/elastic/elastic-agent-libs/monitoring"
+import (
+	"math"
+
+	"github.com/elastic/elastic-agent-libs/monitoring"
+)
 
 type observer interface {
 	pipelineObserver
@@ -67,8 +71,9 @@ type metricsObserverVars struct {
 	activeEvents *monitoring.Uint
 
 	// queue metrics
-	queueACKed     *monitoring.Uint
-	queueMaxEvents *monitoring.Uint
+	queueACKed       *monitoring.Uint
+	queueMaxEvents   *monitoring.Uint
+	percentQueueFull *monitoring.Float
 }
 
 func newMetricsObserver(metrics *monitoring.Registry) *metricsObserver {
@@ -92,7 +97,8 @@ func newMetricsObserver(metrics *monitoring.Registry) *metricsObserver {
 			queueACKed:     monitoring.NewUint(reg, "queue.acked"),
 			queueMaxEvents: monitoring.NewUint(reg, "queue.max_events"),
 
-			activeEvents: monitoring.NewUint(reg, "events.active"), // Gauge
+			activeEvents:     monitoring.NewUint(reg, "events.active"), // Gauge
+			percentQueueFull: monitoring.NewFloat(reg, "queue.filled.pct.events"),
 		},
 	}
 }
@@ -121,12 +127,24 @@ func (o *metricsObserver) clientClosed() { o.vars.clients.Dec() }
 func (o *metricsObserver) newEvent() {
 	o.vars.events.Inc()
 	o.vars.activeEvents.Inc()
+	o.setPercentageFull()
+}
+
+// setPercentageFull is used internally to set the `queue.filled.pct.events` metric
+func (o *metricsObserver) setPercentageFull() {
+	maxEvt := o.vars.queueMaxEvents.Get()
+	if maxEvt != 0 {
+		pct := float64(o.vars.activeEvents.Get()) / float64(maxEvt)
+		// Round to the nearest 0.0005 (0.05%).
+		pctRound := math.Round(pct/0.0005) * 0.0005
+		o.vars.percentQueueFull.Set(pctRound)
+	}
 }
 
 // (client) event is filtered out (on purpose or failed)
 func (o *metricsObserver) filteredEvent() {
 	o.vars.filtered.Inc()
 	o.vars.activeEvents.Dec()
+	o.setPercentageFull()
 }
 
 // (client) managed to push an event into the publisher pipeline
@@ -138,6 +156,7 @@ func (o *metricsObserver) publishedEvent() {
 func (o *metricsObserver) failedPublishEvent() {
 	o.vars.failed.Inc()
 	o.vars.activeEvents.Dec()
+	o.setPercentageFull()
 }
 
 //
@@ -148,11 +167,13 @@ func (o *metricsObserver) failedPublishEvent() {
 func (o *metricsObserver)
queueACKed(n int) { o.vars.queueACKed.Add(uint64(n)) o.vars.activeEvents.Sub(uint64(n)) + o.setPercentageFull() } // (queue) maximum queue event capacity func (o *metricsObserver) queueMaxEvents(n int) { o.vars.queueMaxEvents.Set(uint64(n)) + o.setPercentageFull() } // diff --git a/metricbeat/scripts/mage/package.go b/metricbeat/scripts/mage/package.go index e206881dd3ca..43e12652f4a5 100644 --- a/metricbeat/scripts/mage/package.go +++ b/metricbeat/scripts/mage/package.go @@ -40,7 +40,7 @@ const ( // not supported. You must declare a dependency on either // PrepareModulePackagingOSS or PrepareModulePackagingXPack. func CustomizePackaging() { - mg.Deps(customizeLightModulesPackaging) + mg.Deps(CustomizeLightModulesPackaging) var ( modulesDTarget = "modules.d" @@ -104,7 +104,7 @@ func CustomizePackaging() { // PrepareModulePackagingOSS generates build/package/modules and // build/package/modules.d directories for use in packaging. func PrepareModulePackagingOSS() error { - err := prepareLightModulesPackaging("module") + err := PrepareLightModulesPackaging("module") if err != nil { return err } @@ -116,7 +116,7 @@ func PrepareModulePackagingOSS() error { // PrepareModulePackagingXPack generates build/package/modules and // build/package/modules.d directories for use in packaging. func PrepareModulePackagingXPack() error { - err := prepareLightModulesPackaging("module", devtools.OSSBeatDir("module")) + err := PrepareLightModulesPackaging("module", devtools.OSSBeatDir("module")) if err != nil { return err } @@ -201,8 +201,8 @@ func GenerateDirModulesD() error { return nil } -// customizeLightModulesPackaging customizes packaging to add light modules -func customizeLightModulesPackaging() error { +// CustomizeLightModulesPackaging customizes packaging to add light modules +func CustomizeLightModulesPackaging() error { var ( moduleTarget = "module" module = devtools.PackageFile{ @@ -225,8 +225,8 @@ func customizeLightModulesPackaging() error { return nil } -// prepareLightModulesPackaging generates light modules -func prepareLightModulesPackaging(paths ...string) error { +// PrepareLightModulesPackaging generates light modules +func PrepareLightModulesPackaging(paths ...string) error { err := devtools.Clean([]string{dirModulesGenerated}) if err != nil { return err diff --git a/testing/environments/snapshot.yml b/testing/environments/snapshot.yml index a031c2184e5a..30002f9a255c 100644 --- a/testing/environments/snapshot.yml +++ b/testing/environments/snapshot.yml @@ -3,7 +3,7 @@ version: '2.3' services: elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-053650c4-SNAPSHOT + image: docker.elastic.co/elasticsearch/elasticsearch:8.15.0-aa640648-SNAPSHOT # When extend is used it merges healthcheck.tests, see: # https://github.com/docker/compose/issues/8962 # healthcheck: @@ -31,7 +31,7 @@ services: - "./docker/elasticsearch/users_roles:/usr/share/elasticsearch/config/users_roles" logstash: - image: docker.elastic.co/logstash/logstash:8.15.0-053650c4-SNAPSHOT + image: docker.elastic.co/logstash/logstash:8.15.0-aa640648-SNAPSHOT healthcheck: test: ["CMD", "curl", "-f", "http://localhost:9600/_node/stats"] retries: 600 @@ -44,7 +44,7 @@ services: - 5055:5055 kibana: - image: docker.elastic.co/kibana/kibana:8.15.0-053650c4-SNAPSHOT + image: docker.elastic.co/kibana/kibana:8.15.0-aa640648-SNAPSHOT environment: - "ELASTICSEARCH_USERNAME=kibana_system_user" - "ELASTICSEARCH_PASSWORD=testing" diff --git a/x-pack/agentbeat/magefile.go b/x-pack/agentbeat/magefile.go index 
874c79bf7a30..c7e6c561830d 100644 --- a/x-pack/agentbeat/magefile.go +++ b/x-pack/agentbeat/magefile.go @@ -20,6 +20,7 @@ import ( devtools "github.com/elastic/beats/v7/dev-tools/mage" "github.com/elastic/beats/v7/dev-tools/mage/target/build" + metricbeat "github.com/elastic/beats/v7/metricbeat/scripts/mage" packetbeat "github.com/elastic/beats/v7/packetbeat/scripts/mage" osquerybeat "github.com/elastic/beats/v7/x-pack/osquerybeat/scripts/mage" @@ -112,11 +113,19 @@ func CrossBuildDeps() error { return callForBeat("crossBuildExt", "osquerybeat") } +// PrepareLightModules prepares the module packaging. +func PrepareLightModules() error { + return metricbeat.PrepareLightModulesPackaging( + filepath.Join("..", "metricbeat", "module"), // x-pack/metricbeat + filepath.Join("..", "..", "metricbeat", "module"), // metricbeat (oss) + ) +} + // Package packages the Beat for distribution. // Use SNAPSHOT=true to build snapshots. // Use PLATFORMS to control the target platforms. // Use VERSION_QUALIFIER to control the version qualifier. -func Package() { +func Package() error { start := time.Now() defer func() { fmt.Println("package ran for", time.Since(start)) }() @@ -126,7 +135,14 @@ func Package() { // Add osquery distro binaries, required for the osquerybeat subcommand. osquerybeat.CustomizePackaging() - mg.SerialDeps(Update, osquerybeat.FetchOsqueryDistros, CrossBuildDeps, CrossBuild, devtools.Package, TestPackages) + // Add metricbeat lightweight modules. + if err := metricbeat.CustomizeLightModulesPackaging(); err != nil { + return err + } + + mg.SerialDeps(Update, PrepareLightModules, osquerybeat.FetchOsqueryDistros, CrossBuildDeps, CrossBuild, devtools.Package, TestPackages) + + return nil } // TestPackages tests the generated packages (i.e. file modes, owners, groups). 
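The `queue.filled.pct.events` gauge introduced in libbeat/publisher/pipeline/monitoring.go above is the active/max ratio rounded to 0.0005 (0.05%) increments. A small standalone sketch of that computation, useful for sanity-checking reported values (assumed helper name, not part of the patch):

package main

import (
	"fmt"
	"math"
)

// percentFull mirrors setPercentageFull: the active/max ratio is rounded
// to the nearest 0.0005 (0.05%). A max of 0 leaves the gauge unset.
func percentFull(active, max uint64) (float64, bool) {
	if max == 0 {
		return 0, false
	}
	pct := float64(active) / float64(max)
	return math.Round(pct/0.0005) * 0.0005, true
}

func main() {
	if pct, ok := percentFull(3123, 4096); ok {
		fmt.Println(pct) // 0.7625
	}
}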
diff --git a/x-pack/auditbeat/processors/sessionmd/add_session_metadata.go b/x-pack/auditbeat/processors/sessionmd/add_session_metadata.go index ff9fa54e556a..766e9623b9ea 100644 --- a/x-pack/auditbeat/processors/sessionmd/add_session_metadata.go +++ b/x-pack/auditbeat/processors/sessionmd/add_session_metadata.go @@ -96,13 +96,24 @@ func New(cfg *cfg.C) (beat.Processor, error) { } func (p *addSessionMetadata) Run(ev *beat.Event) (*beat.Event, error) { - _, err := ev.GetValue(p.config.PIDField) + pi, err := ev.GetValue(p.config.PIDField) if err != nil { // Do not attempt to enrich events without PID; it's not a supported event return ev, nil //nolint:nilerr // Running on events without PID is expected } - err = p.provider.UpdateDB(ev) + // Do not enrich failed syscalls, as there was no actual process change related to it + v, err := ev.GetValue("auditd.result") + if err == nil && v == "fail" { + return ev, nil + } + + pid, err := pidToUInt32(pi) + if err != nil { + return ev, nil //nolint:nilerr // Running on events with a different PID type is not a processor error + } + + err = p.provider.UpdateDB(ev, pid) if err != nil { return ev, err } @@ -136,7 +147,9 @@ func (p *addSessionMetadata) enrich(ev *beat.Event) (*beat.Event, error) { fullProcess, err := p.db.GetProcess(pid) if err != nil { - return nil, fmt.Errorf("pid %v not found in db: %w", pid, err) + e := fmt.Errorf("pid %v not found in db: %w", pid, err) + p.logger.Errorf("%v", e) + return nil, e } processMap := fullProcess.ToMap() diff --git a/x-pack/auditbeat/processors/sessionmd/processdb/db.go b/x-pack/auditbeat/processors/sessionmd/processdb/db.go index 2c7c228e2c1c..b8c624abe00a 100644 --- a/x-pack/auditbeat/processors/sessionmd/processdb/db.go +++ b/x-pack/auditbeat/processors/sessionmd/processdb/db.go @@ -238,7 +238,6 @@ func (db *DB) InsertFork(fork types.ProcessForkEvent) { pid := fork.ChildPIDs.Tgid ppid := fork.ParentPIDs.Tgid - db.scrapeAncestors(db.processes[pid]) if entry, ok := db.processes[ppid]; ok { entry.PIDs = pidInfoFromProto(fork.ChildPIDs) @@ -282,7 +281,6 @@ func (db *DB) InsertExec(exec types.ProcessExecEvent) { } db.processes[exec.PIDs.Tgid] = proc - db.scrapeAncestors(proc) entryLeaderPID := db.evaluateEntryLeader(proc) if entryLeaderPID != nil { db.entryLeaderRelationships[exec.PIDs.Tgid] = *entryLeaderPID @@ -568,6 +566,14 @@ func setSameAsProcess(process *types.Process) { } } +func (db *DB) HasProcess(pid uint32) bool { + db.mutex.RLock() + defer db.mutex.RUnlock() + + _, ok := db.processes[pid] + return ok +} + func (db *DB) GetProcess(pid uint32) (types.Process, error) { db.mutex.RLock() defer db.mutex.RUnlock() @@ -585,8 +591,6 @@ func (db *DB) GetProcess(pid uint32) (types.Process, error) { fillParent(&ret, parent) break } - db.logger.Debugf("failed to find %d in DB (parent of %d), attempting to scrape", process.PIDs.Ppid, pid) - db.scrapeAncestors(process) } } @@ -596,8 +600,6 @@ func (db *DB) GetProcess(pid uint32) (types.Process, error) { fillGroupLeader(&ret, groupLeader) break } - db.logger.Debugf("failed to find %d in DB (group leader of %d), attempting to scrape", process.PIDs.Pgid, pid) - db.scrapeAncestors(process) } } @@ -607,8 +609,6 @@ func (db *DB) GetProcess(pid uint32) (types.Process, error) { fillSessionLeader(&ret, sessionLeader) break } - db.logger.Debugf("failed to find %d in DB (session leader of %d), attempting to scrape", process.PIDs.Sid, pid) - db.scrapeAncestors(process) } } @@ -712,29 +712,6 @@ func getTTYType(major uint16, minor uint16) TTYType { return TTYUnknown } -func 
(db *DB) scrapeAncestors(proc Process) {
-	for _, pid := range []uint32{proc.PIDs.Pgid, proc.PIDs.Ppid, proc.PIDs.Sid} {
-		if _, exists := db.processes[pid]; pid == 0 || exists {
-			continue
-		}
-		procInfo, err := db.procfs.GetProcess(pid)
-		if err != nil {
-			db.logger.Debugf("couldn't get %v from procfs: %w", pid, err)
-			continue
-		}
-		p := Process{
-			PIDs:     pidInfoFromProto(procInfo.PIDs),
-			Creds:    credInfoFromProto(procInfo.Creds),
-			CTTY:     ttyDevFromProto(procInfo.CTTY),
-			Argv:     procInfo.Argv,
-			Cwd:      procInfo.Cwd,
-			Env:      procInfo.Env,
-			Filename: procInfo.Filename,
-		}
-		db.insertProcess(p)
-	}
-}
-
 func (db *DB) Close() {
 	close(db.stopChan)
 }
diff --git a/x-pack/auditbeat/processors/sessionmd/provider/ebpf_provider/ebpf_provider.go b/x-pack/auditbeat/processors/sessionmd/provider/ebpf_provider/ebpf_provider.go
index 2b9b540e037c..f1b8bae0b671 100644
--- a/x-pack/auditbeat/processors/sessionmd/provider/ebpf_provider/ebpf_provider.go
+++ b/x-pack/auditbeat/processors/sessionmd/provider/ebpf_provider/ebpf_provider.go
@@ -9,6 +9,7 @@ package ebpf_provider
 import (
 	"context"
 	"fmt"
+	"time"
 
 	"github.com/elastic/beats/v7/libbeat/beat"
 	"github.com/elastic/beats/v7/libbeat/ebpf"
@@ -151,7 +152,80 @@ func NewProvider(ctx context.Context, logger *logp.Logger, db *processdb.DB) (pr
 	return &p, nil
 }
 
-func (s prvdr) UpdateDB(ev *beat.Event) error {
-	// no-op for ebpf, DB is updated from pushed ebpf events
-	return nil
+const (
+	maxWaitLimit      = 200 * time.Millisecond // Maximum time UpdateDB will wait for process
+	combinedWaitLimit = 2 * time.Second        // Multiple UpdateDB calls will wait up to this amount within resetDuration
+	backoffDuration   = 10 * time.Second       // UpdateDB will stop waiting for processes for this time
+	resetDuration     = 5 * time.Second        // After this amount of time with no backoffs, the combinedWait will be reset
+)
+
+var (
+	combinedWait   = 0 * time.Millisecond
+	inBackoff      = false
+	backoffStart   = time.Now()
+	since          = time.Now()
+	backoffSkipped = 0
+)
+
+// With ebpf, process events are pushed to the DB by the above goroutine, so this doesn't actually update the DB.
+// It does try to sync the processor and ebpf events, so that the process is in the process db before continuing.
+//
+// It's possible that the event to enrich arrives before the process is inserted into the DB. In that case, this
+// will block continuing the enrichment until the process is seen (or the timeout is reached).
+//
+// If for some reason a lot of time has been spent waiting for missing processes, this also has a backoff timer during
+// which it will continue without waiting for missing events to arrive, so the processor doesn't become overly backed-up
+// waiting for these processes, at the cost of possibly not enriching some processes.
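The implementation that follows realizes this wait as a polling loop whose sleep doubles each round and whose final sleep is clamped so the total never exceeds maxWaitLimit. A simplified standalone sketch of just that loop (hypothetical helper, minus the logging and shared backoff bookkeeping):

package main

import "time"

// waitFor polls found with a doubling sleep, clamping the final interval
// so the total wait never exceeds maxWait.
func waitFor(found func() bool, maxWait time.Duration) bool {
	start := time.Now()
	next := 5 * time.Millisecond
	for {
		if found() {
			return true
		}
		waited := time.Since(start)
		if waited >= maxWait {
			return false
		}
		if next+waited > maxWait {
			next = maxWait - waited
		}
		time.Sleep(next)
		next *= 2
	}
}

func main() {
	deadline := time.Now().Add(50 * time.Millisecond)
	_ = waitFor(func() bool { return time.Now().After(deadline) }, 200*time.Millisecond)
}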
+func (s prvdr) UpdateDB(ev *beat.Event, pid uint32) error {
+	if s.db.HasProcess(pid) {
+		return nil
+	}
+
+	now := time.Now()
+	if inBackoff {
+		if now.Sub(backoffStart) > backoffDuration {
+			s.logger.Warnf("ended backoff, skipped %d processes", backoffSkipped)
+			inBackoff = false
+			combinedWait = 0 * time.Millisecond
+		} else {
+			backoffSkipped += 1
+			return nil
+		}
+	} else {
+		if combinedWait > combinedWaitLimit {
+			s.logger.Warn("starting backoff")
+			inBackoff = true
+			backoffStart = now
+			backoffSkipped = 0
+			return nil
+		}
+		// maintain a moving window of time for the delays we track
+		if now.Sub(since) > resetDuration {
+			since = now
+			combinedWait = 0 * time.Millisecond
+		}
+	}
+
+	start := now
+	nextWait := 5 * time.Millisecond
+	for {
+		waited := time.Since(start)
+		if s.db.HasProcess(pid) {
+			s.logger.Debugf("got process that was missing after %v", waited)
+			combinedWait = combinedWait + waited
+			return nil
+		}
+		if waited >= maxWaitLimit {
+			e := fmt.Errorf("process %v was not seen after %v", pid, waited)
+			s.logger.Warnf("%v", e)
+			combinedWait = combinedWait + waited
+			return e
+		}
+		time.Sleep(nextWait)
+		if nextWait*2+waited > maxWaitLimit {
+			nextWait = maxWaitLimit - waited
+		} else {
+			nextWait = nextWait * 2
+		}
+	}
 }
diff --git a/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider.go b/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider.go
index 2f99dd72b1fb..6525b860b6d2 100644
--- a/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider.go
+++ b/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider.go
@@ -41,16 +41,7 @@ func NewProvider(ctx context.Context, logger *logp.Logger, db *processdb.DB, rea
 }
 
 // UpdateDB will update the process DB with process info from procfs or the event itself
-func (s prvdr) UpdateDB(ev *beat.Event) error {
-	pi, err := ev.Fields.GetValue(s.pidField)
-	if err != nil {
-		return fmt.Errorf("event not supported, no pid")
-	}
-	pid, ok := pi.(int)
-	if !ok {
-		return fmt.Errorf("pid field not int")
-	}
-
+func (s prvdr) UpdateDB(ev *beat.Event, pid uint32) error {
 	syscall, err := ev.GetValue(syscallField)
 	if err != nil {
 		return fmt.Errorf("event not supported, no syscall data")
@@ -59,7 +50,7 @@ func (s prvdr) UpdateDB(ev *beat.Event) error {
 	switch syscall {
 	case "execveat", "execve":
 		pe := types.ProcessExecEvent{}
-		proc_info, err := s.reader.GetProcess(uint32(pid))
+		proc_info, err := s.reader.GetProcess(pid)
 		if err == nil {
 			pe.PIDs = proc_info.PIDs
 			pe.Creds = proc_info.Creds
@@ -72,7 +63,7 @@ func (s prvdr) UpdateDB(ev *beat.Event) error {
 			s.logger.Warnf("couldn't get process info from proc for pid %v: %w", pid, err)
 			// If process info couldn't be taken from procfs, populate with as much info as
 			// possible from the event
-			pe.PIDs.Tgid = uint32(pid)
+			pe.PIDs.Tgid = pid
 			var intr interface{}
 			var i int
 			var ok bool
@@ -106,7 +97,7 @@ func (s prvdr) UpdateDB(ev *beat.Event) error {
 	case "exit_group":
 		pe := types.ProcessExitEvent{
 			PIDs: types.PIDInfo{
-				Tgid: uint32(pid),
+				Tgid: pid,
 			},
 		}
 		s.db.InsertExit(pe)
@@ -122,8 +113,8 @@ func (s prvdr) UpdateDB(ev *beat.Event) error {
 		if result == "success" {
 			setsid_ev := types.ProcessSetsidEvent{
 				PIDs: types.PIDInfo{
-					Tgid: uint32(pid),
-					Sid:  uint32(pid),
+					Tgid: pid,
+					Sid:  pid,
 				},
 			}
 			s.db.InsertSetsid(setsid_ev)
diff --git a/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider_test.go
b/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider_test.go index 6fd333c47119..c438efcfe1ae 100644 --- a/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider_test.go +++ b/x-pack/auditbeat/processors/sessionmd/provider/procfs_provider/procfs_provider_test.go @@ -124,7 +124,7 @@ func TestExecveEvent(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) @@ -234,7 +234,7 @@ func TestExecveatEvent(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) @@ -317,7 +317,7 @@ func TestSetSidEvent(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) @@ -399,7 +399,7 @@ func TestSetSidEventFailed(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) @@ -470,7 +470,7 @@ func TestSetSidSessionLeaderNotScraped(t *testing.T) { provider, err := NewProvider(context.TODO(), &logger, db, reader, "process.pid") require.Nil(t, err, "error creating provider") - err = provider.UpdateDB(&event) + err = provider.UpdateDB(&event, expected.PIDs.Tgid) require.Nil(t, err) actual, err := db.GetProcess(pid) diff --git a/x-pack/auditbeat/processors/sessionmd/provider/provider.go b/x-pack/auditbeat/processors/sessionmd/provider/provider.go index e3fa1547806c..6452eb9e2bf7 100644 --- a/x-pack/auditbeat/processors/sessionmd/provider/provider.go +++ b/x-pack/auditbeat/processors/sessionmd/provider/provider.go @@ -11,5 +11,5 @@ import ( ) type Provider interface { - UpdateDB(*beat.Event) error + UpdateDB(*beat.Event, uint32) error } diff --git a/x-pack/filebeat/input/awss3/input.go b/x-pack/filebeat/input/awss3/input.go index 733de949f298..51e8c9808edb 100644 --- a/x-pack/filebeat/input/awss3/input.go +++ b/x-pack/filebeat/input/awss3/input.go @@ -13,6 +13,7 @@ import ( "time" awssdk "github.com/aws/aws-sdk-go-v2/aws" + "github.com/aws/aws-sdk-go-v2/aws/retry" "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/sqs" "github.com/aws/smithy-go" @@ -21,7 +22,6 @@ import ( v2 "github.com/elastic/beats/v7/filebeat/input/v2" "github.com/elastic/beats/v7/libbeat/beat" "github.com/elastic/beats/v7/libbeat/feature" - "github.com/elastic/beats/v7/libbeat/statestore" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" conf "github.com/elastic/elastic-agent-libs/config" "github.com/elastic/go-concert/unison" @@ -99,78 +99,88 @@ func (in *s3Input) Test(ctx v2.TestContext) error { } func (in *s3Input) Run(inputContext v2.Context, pipeline beat.Pipeline) error { - var err error + ctx := v2.GoContextFromCanceler(inputContext.Cancelation) - persistentStore, err := in.store.Access() - if err != nil { - return fmt.Errorf("can not access 
persistent store: %w", err) + if in.config.QueueURL != "" { + return in.runQueueReader(ctx, inputContext, pipeline) } - defer persistentStore.Close() + if in.config.BucketARN != "" || in.config.NonAWSBucketName != "" { + return in.runS3Poller(ctx, inputContext, pipeline) + } - states := newStates(inputContext) - err = states.readStatesFrom(persistentStore) - if err != nil { - return fmt.Errorf("can not start persistent store: %w", err) + return nil +} + +func (in *s3Input) runQueueReader( + ctx context.Context, + inputContext v2.Context, + pipeline beat.Pipeline, +) error { + configRegion := in.config.RegionName + urlRegion, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint) + if err != nil && configRegion == "" { + // Only report an error if we don't have a configured region + // to fall back on. + return fmt.Errorf("failed to get AWS region from queue_url: %w", err) + } else if configRegion != "" && configRegion != urlRegion { + inputContext.Logger.Warnf("configured region disagrees with queue_url region (%q != %q): using %q", configRegion, urlRegion, urlRegion) } - ctx := v2.GoContextFromCanceler(inputContext.Cancelation) + in.awsConfig.Region = urlRegion - if in.config.QueueURL != "" { - regionName, err := getRegionFromQueueURL(in.config.QueueURL, in.config.AWSConfig.Endpoint, in.config.RegionName) - if err != nil && in.config.RegionName == "" { - return fmt.Errorf("failed to get AWS region from queue_url: %w", err) - } - var warn regionMismatchError - if errors.As(err, &warn) { - // Warn of mismatch, but go ahead with configured region name. - inputContext.Logger.Warnf("%v: using %q", err, regionName) - } - in.awsConfig.Region = regionName + // Create SQS receiver and S3 notification processor. + receiver, err := in.createSQSReceiver(inputContext, pipeline) + if err != nil { + return fmt.Errorf("failed to initialize sqs receiver: %w", err) + } + defer receiver.metrics.Close() - // Create SQS receiver and S3 notification processor. - receiver, err := in.createSQSReceiver(inputContext, pipeline) - if err != nil { - return fmt.Errorf("failed to initialize sqs receiver: %w", err) - } - defer receiver.metrics.Close() + // Poll metrics periodically in the background + go pollSqsWaitingMetric(ctx, receiver) - // Poll metrics periodically in the background - go pollSqsWaitingMetric(ctx, receiver) + return receiver.Receive(ctx) +} - if err := receiver.Receive(ctx); err != nil { - return err - } +func (in *s3Input) runS3Poller( + ctx context.Context, + inputContext v2.Context, + pipeline beat.Pipeline, +) error { + // Create client for publishing events and receive notification of their ACKs. + client, err := pipeline.ConnectWith(beat.ClientConfig{ + EventListener: awscommon.NewEventACKHandler(), + Processing: beat.ProcessingConfig{ + // This input only produces events with basic types so normalization + // is not required. + EventNormalization: boolPtr(false), + }, + }) + if err != nil { + return fmt.Errorf("failed to create pipeline client: %w", err) } + defer client.Close() - if in.config.BucketARN != "" || in.config.NonAWSBucketName != "" { - // Create client for publishing events and receive notification of their ACKs. - client, err := pipeline.ConnectWith(beat.ClientConfig{ - EventListener: awscommon.NewEventACKHandler(), - Processing: beat.ProcessingConfig{ - // This input only produces events with basic types so normalization - // is not required. 
- EventNormalization: boolPtr(false), - }, - }) - if err != nil { - return fmt.Errorf("failed to create pipeline client: %w", err) - } - defer client.Close() + // Connect to the registry and create our states lookup + persistentStore, err := in.store.Access() + if err != nil { + return fmt.Errorf("can not access persistent store: %w", err) + } + defer persistentStore.Close() - // Create S3 receiver and S3 notification processor. - poller, err := in.createS3Lister(inputContext, ctx, client, persistentStore, states) - if err != nil { - return fmt.Errorf("failed to initialize s3 poller: %w", err) - } - defer poller.metrics.Close() + states, err := newStates(inputContext, persistentStore) + if err != nil { + return fmt.Errorf("can not start persistent store: %w", err) + } - if err := poller.Poll(ctx); err != nil { - return err - } + // Create S3 receiver and S3 notification processor. + poller, err := in.createS3Poller(inputContext, ctx, client, states) + if err != nil { + return fmt.Errorf("failed to initialize s3 poller: %w", err) } + defer poller.metrics.Close() - return nil + return poller.Poll(ctx) } func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*sqsReader, error) { @@ -215,8 +225,11 @@ func (in *s3Input) createSQSReceiver(ctx v2.Context, pipeline beat.Pipeline) (*s return nil, err } in.metrics = newInputMetrics(ctx.ID, nil, in.config.MaxNumberOfMessages) + s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), in.metrics, s3API, fileSelectors, in.config.BackupConfig, in.config.MaxNumberOfMessages) + sqsMessageHandler := newSQSS3EventProcessor(log.Named("sqs_s3_event"), in.metrics, sqsAPI, script, in.config.VisibilityTimeout, in.config.SQSMaxReceiveCount, pipeline, s3EventHandlerFactory, in.config.MaxNumberOfMessages) + sqsReader := newSQSReader(log.Named("sqs"), in.metrics, sqsAPI, in.config.MaxNumberOfMessages, sqsMessageHandler) return sqsReader, nil @@ -230,7 +243,7 @@ func (n nonAWSBucketResolver) ResolveEndpoint(region string, options s3.Endpoint return awssdk.Endpoint{URL: n.endpoint, SigningRegion: region, HostnameImmutable: true, Source: awssdk.EndpointSourceCustom}, nil } -func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, client beat.Client, persistentStore *statestore.Store, states *states) (*s3Poller, error) { +func (in *s3Input) createS3Poller(ctx v2.Context, cancelCtx context.Context, client beat.Client, states *states) (*s3Poller, error) { var bucketName string var bucketID string if in.config.NonAWSBucketName != "" { @@ -250,6 +263,12 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli o.EndpointOptions.UseFIPSEndpoint = awssdk.FIPSEndpointStateEnabled } o.UsePathStyle = in.config.PathStyle + + o.Retryer = retry.NewStandard(func(so *retry.StandardOptions) { + so.MaxAttempts = 5 + // Recover quickly when requests start working again + so.NoRetryIncrement = 100 + }) }) regionName, err := getRegionForBucket(cancelCtx, s3Client, bucketName) if err != nil { @@ -295,7 +314,6 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli client, s3EventHandlerFactory, states, - persistentStore, bucketID, in.config.BucketListPrefix, in.awsConfig.Region, @@ -308,7 +326,7 @@ func (in *s3Input) createS3Lister(ctx v2.Context, cancelCtx context.Context, cli var errBadQueueURL = errors.New("QueueURL is not in format: https://sqs.{REGION_ENDPOINT}.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME} or 
https://{VPC_ENDPOINT}.sqs.{REGION_ENDPOINT}.vpce.{ENDPOINT}/{ACCOUNT_NUMBER}/{QUEUE_NAME}") -func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (region string, err error) { +func getRegionFromQueueURL(queueURL, endpoint string) (string, error) { // get region from queueURL // Example for sqs queue: https://sqs.us-east-1.amazonaws.com/12345678912/test-s3-logs // Example for vpce: https://vpce-test.sqs.us-east-1.vpce.amazonaws.com/12345678912/sqs-queue @@ -321,11 +339,7 @@ func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (reg // check for sqs queue url if len(queueHostSplit) == 3 && queueHostSplit[0] == "sqs" { if queueHostSplit[2] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplit[2], "amazonaws.")) { - region = queueHostSplit[1] - if defaultRegion != "" && region != defaultRegion { - return defaultRegion, regionMismatchError{queueURLRegion: region, defaultRegion: defaultRegion} - } - return region, nil + return queueHostSplit[1], nil } } @@ -333,30 +347,13 @@ func getRegionFromQueueURL(queueURL string, endpoint, defaultRegion string) (reg queueHostSplitVPC := strings.SplitN(u.Host, ".", 5) if len(queueHostSplitVPC) == 5 && queueHostSplitVPC[1] == "sqs" { if queueHostSplitVPC[4] == endpoint || (endpoint == "" && strings.HasPrefix(queueHostSplitVPC[4], "amazonaws.")) { - region = queueHostSplitVPC[2] - if defaultRegion != "" && region != defaultRegion { - return defaultRegion, regionMismatchError{queueURLRegion: region, defaultRegion: defaultRegion} - } - return region, nil + return queueHostSplitVPC[2], nil } } - - if defaultRegion != "" { - return defaultRegion, nil - } } return "", errBadQueueURL } -type regionMismatchError struct { - queueURLRegion string - defaultRegion string -} - -func (e regionMismatchError) Error() string { - return fmt.Sprintf("configured region disagrees with queue_url region: %q != %q", e.queueURLRegion, e.defaultRegion) -} - func getRegionForBucket(ctx context.Context, s3Client *s3.Client, bucketName string) (string, error) { getBucketLocationOutput, err := s3Client.GetBucketLocation(ctx, &s3.GetBucketLocationInput{ Bucket: awssdk.String(bucketName), diff --git a/x-pack/filebeat/input/awss3/input_benchmark_test.go b/x-pack/filebeat/input/awss3/input_benchmark_test.go index e05e5b461ca6..5d22d1411687 100644 --- a/x-pack/filebeat/input/awss3/input_benchmark_test.go +++ b/x-pack/filebeat/input/awss3/input_benchmark_test.go @@ -8,7 +8,6 @@ import ( "context" "errors" "fmt" - "io/ioutil" "os" "path/filepath" "runtime" @@ -16,6 +15,8 @@ import ( "testing" "time" + "github.com/stretchr/testify/assert" + "github.com/elastic/beats/v7/libbeat/statestore" "github.com/elastic/beats/v7/libbeat/statestore/storetest" @@ -132,7 +133,7 @@ type constantS3 struct { var _ s3API = (*constantS3)(nil) func newConstantS3(t testing.TB) *constantS3 { - data, err := ioutil.ReadFile(cloudtrailTestFile) + data, err := os.ReadFile(cloudtrailTestFile) if err != nil { t.Fatal(err) } @@ -342,14 +343,11 @@ func benchmarkInputS3(t *testing.T, numberOfWorkers int) testing.BenchmarkResult return } - err = store.Set(awsS3WriteCommitPrefix+"bucket"+listPrefix, &commitWriteState{time.Time{}}) - if err != nil { - errChan <- err - return - } + states, err := newStates(inputCtx, store) + assert.NoError(t, err, "states creation should succeed") s3EventHandlerFactory := newS3ObjectProcessorFactory(log.Named("s3"), metrics, s3API, config.FileSelectors, backupConfig{}, numberOfWorkers) - s3Poller := newS3Poller(logp.NewLogger(inputName), 
metrics, s3API, client, s3EventHandlerFactory, newStates(inputCtx), store, "bucket", listPrefix, "region", "provider", numberOfWorkers, time.Second) + s3Poller := newS3Poller(logp.NewLogger(inputName), metrics, s3API, client, s3EventHandlerFactory, states, "bucket", listPrefix, "region", "provider", numberOfWorkers, time.Second) if err := s3Poller.Poll(ctx); err != nil { if !errors.Is(err, context.DeadlineExceeded) { diff --git a/x-pack/filebeat/input/awss3/input_test.go b/x-pack/filebeat/input/awss3/input_test.go index abc9f5c9a6a6..0a3053f7f1b9 100644 --- a/x-pack/filebeat/input/awss3/input_test.go +++ b/x-pack/filebeat/input/awss3/input_test.go @@ -54,7 +54,6 @@ func TestGetRegionFromQueueURL(t *testing.T) { name string queueURL string endpoint string - deflt string want string wantErr error }{ @@ -77,7 +76,6 @@ func TestGetRegionFromQueueURL(t *testing.T) { { name: "vpce_endpoint", queueURL: "https://vpce-test.sqs.us-east-2.vpce.amazonaws.com/12345678912/sqs-queue", - deflt: "", want: "us-east-2", }, { @@ -90,7 +88,7 @@ func TestGetRegionFromQueueURL(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got, err := getRegionFromQueueURL(test.queueURL, test.endpoint, test.deflt) + got, err := getRegionFromQueueURL(test.queueURL, test.endpoint) if !sameError(err, test.wantErr) { t.Errorf("unexpected error: got:%v want:%v", err, test.wantErr) } diff --git a/x-pack/filebeat/input/awss3/s3.go b/x-pack/filebeat/input/awss3/s3.go index 5aa8d31e95de..8909f78bb39d 100644 --- a/x-pack/filebeat/input/awss3/s3.go +++ b/x-pack/filebeat/input/awss3/s3.go @@ -11,34 +11,22 @@ import ( "sync" "time" - "github.com/gofrs/uuid" - "go.uber.org/multierr" + "github.com/aws/aws-sdk-go-v2/aws/ratelimit" "github.com/elastic/beats/v7/libbeat/beat" - "github.com/elastic/beats/v7/libbeat/statestore" + "github.com/elastic/beats/v7/libbeat/common/backoff" awscommon "github.com/elastic/beats/v7/x-pack/libbeat/common/aws" "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/go-concert/timed" ) -const maxCircuitBreaker = 5 - -type commitWriteState struct { - time.Time -} - -type s3ObjectInfo struct { - name string - key string - etag string - lastModified time.Time - listingID string -} +// var instead of const so it can be reduced during unit tests (instead of waiting +// through 10 minutes of retry backoff) +var readerLoopMaxCircuitBreaker = 10 type s3ObjectPayload struct { s3ObjectHandler s3ObjectHandler - s3ObjectInfo s3ObjectInfo - s3ObjectEvent s3EventV2 + objectState state } type s3Poller struct { @@ -48,15 +36,12 @@ type s3Poller struct { region string provider string bucketPollInterval time.Duration - workerSem *awscommon.Sem s3 s3API log *logp.Logger metrics *inputMetrics client beat.Client s3ObjectHandler s3ObjectHandlerFactory states *states - store *statestore.Store - workersListingMap *sync.Map workersProcessingMap *sync.Map } @@ -66,7 +51,6 @@ func newS3Poller(log *logp.Logger, client beat.Client, s3ObjectHandler s3ObjectHandlerFactory, states *states, - store *statestore.Store, bucket string, listPrefix string, awsRegion string, @@ -85,41 +69,17 @@ func newS3Poller(log *logp.Logger, region: awsRegion, provider: provider, bucketPollInterval: bucketPollInterval, - workerSem: awscommon.NewSem(numberOfWorkers), s3: s3, log: log, metrics: metrics, client: client, s3ObjectHandler: s3ObjectHandler, states: states, - store: store, - workersListingMap: new(sync.Map), workersProcessingMap: new(sync.Map), } } -func (p *s3Poller) handlePurgingLock(info s3ObjectInfo, 
isStored bool) { - id := stateID(info.name, info.key, info.etag, info.lastModified) - previousState := p.states.FindPreviousByID(id) - if !previousState.IsEmpty() { - if isStored { - previousState.MarkAsStored() - } else { - previousState.MarkAsError() - } - - p.states.Update(previousState, info.listingID) - } - - // Manage locks for purging. - if p.states.IsListingFullyStored(info.listingID) { - // locked on processing we unlock when all the object were ACKed - lock, _ := p.workersListingMap.Load(info.listingID) - lock.(*sync.Mutex).Unlock() - } -} - -func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) (s3ObjectHandler, s3EventV2) { +func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) s3ObjectHandler { event := s3EventV2{} event.AWSRegion = p.region event.Provider = p.provider @@ -129,275 +89,126 @@ func (p *s3Poller) createS3ObjectProcessor(ctx context.Context, state state) (s3 acker := awscommon.NewEventACKTracker(ctx) - return p.s3ObjectHandler.Create(ctx, p.log, p.client, acker, event), event + return p.s3ObjectHandler.Create(ctx, p.log, p.client, acker, event) } -func (p *s3Poller) ProcessObject(s3ObjectPayloadChan <-chan *s3ObjectPayload) error { - var errs []error +func (p *s3Poller) workerLoop(ctx context.Context, s3ObjectPayloadChan <-chan *s3ObjectPayload) { + rateLimitWaiter := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120) for s3ObjectPayload := range s3ObjectPayloadChan { - // Process S3 object (download, parse, create events). - err := s3ObjectPayload.s3ObjectHandler.ProcessS3Object() + objHandler := s3ObjectPayload.s3ObjectHandler + state := s3ObjectPayload.objectState - // Wait for all events to be ACKed before proceeding. - s3ObjectPayload.s3ObjectHandler.Wait() + // Process S3 object (download, parse, create events). + err := objHandler.ProcessS3Object() + if errors.Is(err, errS3DownloadFailed) { + // Download errors are ephemeral. Add a backoff delay, then skip to the + // next iteration so we don't mark the object as permanently failed. + rateLimitWaiter.Wait() + continue + } + // Reset the rate limit delay on results that aren't download errors. + rateLimitWaiter.Reset() - info := s3ObjectPayload.s3ObjectInfo + // Wait for downloaded objects to be ACKed. + objHandler.Wait() if err != nil { - event := s3ObjectPayload.s3ObjectEvent - errs = append(errs, - fmt.Errorf( - fmt.Sprintf("failed processing S3 event for object key %q in bucket %q: %%w", - event.S3.Object.Key, event.S3.Bucket.Name), - err)) - - p.handlePurgingLock(info, false) - continue + p.log.Errorf("failed processing S3 event for object key %q in bucket %q: %v", + state.Key, state.Bucket, err.Error()) + + // Non-retryable error. + state.Failed = true + } else { + state.Stored = true } - p.handlePurgingLock(info, true) + // Persist the result + p.states.AddState(state) // Metrics p.metrics.s3ObjectsAckedTotal.Inc() } - - return multierr.Combine(errs...) 
 }
 
-func (p *s3Poller) GetS3Objects(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) {
+func (p *s3Poller) readerLoop(ctx context.Context, s3ObjectPayloadChan chan<- *s3ObjectPayload) {
 	defer close(s3ObjectPayloadChan)
 
 	bucketName := getBucketNameFromARN(p.bucket)
 
+	errorBackoff := backoff.NewEqualJitterBackoff(ctx.Done(), 1, 120)
 	circuitBreaker := 0
 	paginator := p.s3.ListObjectsPaginator(bucketName, p.listPrefix)
 	for paginator.HasMorePages() {
 		page, err := paginator.NextPage(ctx)
 
-		if err != nil {
-			if !paginator.HasMorePages() {
-				break
-			}
+		if err != nil {
 			p.log.Warnw("Error when paginating listing.", "error", err)
-			circuitBreaker++
-			if circuitBreaker >= maxCircuitBreaker {
-				p.log.Warnw(fmt.Sprintf("%d consecutive error when paginating listing, breaking the circuit.", circuitBreaker), "error", err)
-				break
+			// QuotaExceededError is client-side rate limiting in the AWS sdk,
+			// don't include it in the circuit breaker count
+			if !errors.As(err, &ratelimit.QuotaExceededError{}) {
+				circuitBreaker++
+				if circuitBreaker >= readerLoopMaxCircuitBreaker {
+					p.log.Warnw(fmt.Sprintf("%d consecutive errors when paginating listing, breaking the circuit.", circuitBreaker), "error", err)
+					break
+				}
 			}
+			// add a backoff delay and try again
+			errorBackoff.Wait()
 			continue
 		}
+		// Reset the circuit breaker and the error backoff if a read is successful
+		circuitBreaker = 0
+		errorBackoff.Reset()
 
-		listingID, err := uuid.NewV4()
-		if err != nil {
-			p.log.Warnw("Error generating UUID for listing page.", "error", err)
-			continue
-		}
-
-		// lock for the listing page and state in workersListingMap
-		// this map is shared with the storedOp and will be unlocked there
-		lock := new(sync.Mutex)
-		lock.Lock()
-		p.workersListingMap.Store(listingID.String(), lock)
-
-		totProcessableObjects := 0
 		totListedObjects := len(page.Contents)
-		s3ObjectPayloadChanByPage := make(chan *s3ObjectPayload, totListedObjects)
 
 		// Metrics
 		p.metrics.s3ObjectsListedTotal.Add(uint64(totListedObjects))
 		for _, object := range page.Contents {
-			state := newState(bucketName, *object.Key, *object.ETag, p.listPrefix, *object.LastModified)
-			if p.states.MustSkip(state, p.store) {
+			state := newState(bucketName, *object.Key, *object.ETag, *object.LastModified)
+			if p.states.IsProcessed(state) {
 				p.log.Debugw("skipping state.", "state", state)
 				continue
 			}
 
-			// we have no previous state or the previous state
-			// is not stored: refresh the state
-			previousState := p.states.FindPrevious(state)
-			if previousState.IsEmpty() || !previousState.IsProcessed() {
-				p.states.Update(state, "")
-			}
-
-			s3Processor, event := p.createS3ObjectProcessor(ctx, state)
+			s3Processor := p.createS3ObjectProcessor(ctx, state)
 			if s3Processor == nil {
 				p.log.Debugw("empty s3 processor.", "state", state)
 				continue
 			}
 
-			totProcessableObjects++
-
-			s3ObjectPayloadChanByPage <- &s3ObjectPayload{
+			s3ObjectPayloadChan <- &s3ObjectPayload{
 				s3ObjectHandler: s3Processor,
-				s3ObjectInfo: s3ObjectInfo{
-					name:         bucketName,
-					key:          *object.Key,
-					etag:         *object.ETag,
-					lastModified: *object.LastModified,
-					listingID:    listingID.String(),
-				},
-				s3ObjectEvent: event,
-			}
-		}
-
-		if totProcessableObjects == 0 {
-			p.log.Debugw("0 processable objects on bucket pagination.", "bucket", p.bucket, "listPrefix", p.listPrefix, "listingID", listingID)
-			// nothing to be ACKed, unlock here
-			p.states.DeleteListing(listingID.String())
-			lock.Unlock()
-		} else {
-			listingInfo := &listingInfo{totObjects: totProcessableObjects}
-			p.states.AddListing(listingID.String(), listingInfo)
-
-			//
Metrics - p.metrics.s3ObjectsProcessedTotal.Add(uint64(totProcessableObjects)) - } - - close(s3ObjectPayloadChanByPage) - for s3ObjectPayload := range s3ObjectPayloadChanByPage { - s3ObjectPayloadChan <- s3ObjectPayload - } - } -} - -func (p *s3Poller) Purge(ctx context.Context) { - listingIDs := p.states.GetListingIDs() - p.log.Debugw("purging listing.", "listingIDs", listingIDs) - for _, listingID := range listingIDs { - // we lock here in order to process the purge only after - // full listing page is ACKed by all the workers - lock, loaded := p.workersListingMap.Load(listingID) - if !loaded { - // purge calls can overlap, GetListingIDs can return - // an outdated snapshot with listing already purged - p.states.DeleteListing(listingID) - p.log.Debugw("deleting already purged listing from states.", "listingID", listingID) - continue - } - - lock.(*sync.Mutex).Lock() - - states := map[string]*state{} - latestStoredTimeByBucketAndListPrefix := make(map[string]time.Time, 0) - - listingStates := p.states.GetStatesByListingID(listingID) - for i, state := range listingStates { - // it is not stored, keep - if !state.IsProcessed() { - p.log.Debugw("state not stored or with error, skip purge", "state", state) - continue + objectState: state, } - var latestStoredTime time.Time - states[state.ID] = &listingStates[i] - latestStoredTime, ok := latestStoredTimeByBucketAndListPrefix[state.Bucket+state.ListPrefix] - if !ok { - var commitWriteState commitWriteState - err := p.store.Get(awsS3WriteCommitPrefix+state.Bucket+state.ListPrefix, &commitWriteState) - if err == nil { - // we have no entry in the map, and we have no entry in the store - // set zero time - latestStoredTime = time.Time{} - p.log.Debugw("last stored time is zero time", "bucket", state.Bucket, "listPrefix", state.ListPrefix) - } else { - latestStoredTime = commitWriteState.Time - p.log.Debugw("last stored time is commitWriteState", "commitWriteState", commitWriteState, "bucket", state.Bucket, "listPrefix", state.ListPrefix) - } - } else { - p.log.Debugw("last stored time from memory", "latestStoredTime", latestStoredTime, "bucket", state.Bucket, "listPrefix", state.ListPrefix) - } - - if state.LastModified.After(latestStoredTime) { - p.log.Debugw("last stored time updated", "state.LastModified", state.LastModified, "bucket", state.Bucket, "listPrefix", state.ListPrefix) - latestStoredTimeByBucketAndListPrefix[state.Bucket+state.ListPrefix] = state.LastModified - } - } - - for key := range states { - p.states.Delete(key) - } - - if err := p.states.writeStates(p.store); err != nil { - p.log.Errorw("Failed to write states to the registry", "error", err) - } - - for bucketAndListPrefix, latestStoredTime := range latestStoredTimeByBucketAndListPrefix { - if err := p.store.Set(awsS3WriteCommitPrefix+bucketAndListPrefix, commitWriteState{latestStoredTime}); err != nil { - p.log.Errorw("Failed to write commit time to the registry", "error", err) - } - } - - // purge is done, we can unlock and clean - lock.(*sync.Mutex).Unlock() - p.workersListingMap.Delete(listingID) - p.states.DeleteListing(listingID) - - // Listing is removed from all states, we can finalize now - for _, state := range states { - processor, _ := p.createS3ObjectProcessor(ctx, *state) - if err := processor.FinalizeS3Object(); err != nil { - p.log.Errorw("Failed to finalize S3 object", "key", state.Key, "error", err) - } + p.metrics.s3ObjectsProcessedTotal.Inc() } } } func (p *s3Poller) Poll(ctx context.Context) error { - // This loop tries to keep the workers busy as much 
as possible while
-	// honoring the number in config opposed to a simpler loop that does one
-	// listing, sequentially processes every object and then does another listing
-	workerWg := new(sync.WaitGroup)
 	for ctx.Err() == nil {
-		// Determine how many S3 workers are available.
-		workers, err := p.workerSem.AcquireContext(p.numberOfWorkers, ctx)
-		if err != nil {
-			break
-		}
-
-		if workers == 0 {
-			continue
-		}
+		var workerWg sync.WaitGroup
+		workChan := make(chan *s3ObjectPayload)
 
-		s3ObjectPayloadChan := make(chan *s3ObjectPayload)
-
-		workerWg.Add(1)
-		go func() {
-			defer func() {
-				workerWg.Done()
-			}()
-
-			p.GetS3Objects(ctx, s3ObjectPayloadChan)
-			p.Purge(ctx)
-		}()
-
-		workerWg.Add(workers)
-		for i := 0; i < workers; i++ {
+		// Start the worker goroutines to listen on the work channel
+		for i := 0; i < p.numberOfWorkers; i++ {
+			workerWg.Add(1)
 			go func() {
-				defer func() {
-					workerWg.Done()
-					p.workerSem.Release(1)
-				}()
-				if err := p.ProcessObject(s3ObjectPayloadChan); err != nil {
-					p.log.Warnw("Failed processing S3 listing.", "error", err)
-				}
+				defer workerWg.Done()
+				p.workerLoop(ctx, workChan)
 			}()
 		}
 
-		err = timed.Wait(ctx, p.bucketPollInterval)
-		if err != nil {
-			if errors.Is(err, context.Canceled) {
-				// A canceled context is a normal shutdown.
-				return nil
-			}
+		// Start reading data and wait for its processing to be done
+		p.readerLoop(ctx, workChan)
+		workerWg.Wait()
 
-			return err
-		}
+		_ = timed.Wait(ctx, p.bucketPollInterval)
 	}
 
-	// Wait for all workers to finish.
-	workerWg.Wait()
-
 	if errors.Is(ctx.Err(), context.Canceled) {
 		// A canceled context is a normal shutdown.
 		return nil
diff --git a/x-pack/filebeat/input/awss3/s3_objects.go b/x-pack/filebeat/input/awss3/s3_objects.go
index 32911778336b..21dfa2243e7b 100644
--- a/x-pack/filebeat/input/awss3/s3_objects.go
+++ b/x-pack/filebeat/input/awss3/s3_objects.go
@@ -43,6 +43,11 @@ type s3ObjectProcessorFactory struct {
 	backupConfig backupConfig
 }
 
+// errS3DownloadFailed reports problems downloading an S3 object. Download errors
+// should never be treated as permanent; they are just an indication to apply a
+// retry backoff until the connection is healthy again.
+var errS3DownloadFailed = errors.New("S3 download failure")
+
 func newS3ObjectProcessorFactory(log *logp.Logger, metrics *inputMetrics, s3 s3API, sel []fileSelectorConfig, backupConfig backupConfig, maxWorkers int) *s3ObjectProcessorFactory {
 	if metrics == nil {
 		// Metrics are optional. Initialize a stub.
@@ -135,8 +140,9 @@ func (p *s3ObjectProcessor) ProcessS3Object() error {
 	// Request object (download).
 	contentType, meta, body, err := p.download()
 	if err != nil {
-		return fmt.Errorf("failed to get s3 object (elapsed_time_ns=%d): %w",
-			time.Since(start).Nanoseconds(), err)
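workerLoop (above) detects these wrapped download failures with errors.Is. Go 1.20's support for multiple %w verbs keeps both the sentinel and the underlying cause inspectable; a standalone illustration (assumed cause error, not part of the patch):

package main

import (
	"errors"
	"fmt"
)

var errS3DownloadFailed = errors.New("S3 download failure")

func main() {
	cause := errors.New("connection reset") // stand-in for a real SDK error
	// Same double-wrap shape as the return statement below.
	err := fmt.Errorf("%w: %w", errS3DownloadFailed, cause)
	fmt.Println(errors.Is(err, errS3DownloadFailed)) // true
	fmt.Println(errors.Is(err, cause))               // true
}

+		// Wrap errS3DownloadFailed into the result so the caller knows it's not a
+		// permanent failure.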
+ return fmt.Errorf("%w: %w", errS3DownloadFailed, err) } defer body.Close() p.s3Metadata = meta @@ -434,10 +440,7 @@ func (p *s3ObjectProcessor) FinalizeS3Object() error { if bucketName == "" { return nil } - backupKey := p.s3Obj.S3.Object.Key - if p.backupConfig.BackupToBucketPrefix != "" { - backupKey = fmt.Sprintf("%s%s", p.backupConfig.BackupToBucketPrefix, backupKey) - } + backupKey := p.backupConfig.BackupToBucketPrefix + p.s3Obj.S3.Object.Key _, err := p.s3.CopyObject(p.ctx, p.s3Obj.S3.Bucket.Name, bucketName, p.s3Obj.S3.Object.Key, backupKey) if err != nil { return fmt.Errorf("failed to copy object to backup bucket: %w", err) diff --git a/x-pack/filebeat/input/awss3/s3_objects_test.go b/x-pack/filebeat/input/awss3/s3_objects_test.go index 6732c12e0579..28e8f4f42a52 100644 --- a/x-pack/filebeat/input/awss3/s3_objects_test.go +++ b/x-pack/filebeat/input/awss3/s3_objects_test.go @@ -8,7 +8,8 @@ import ( "bytes" "context" "errors" - "io/ioutil" + "io" + "os" "path/filepath" "strings" "testing" @@ -27,7 +28,7 @@ import ( ) func newS3Object(t testing.TB, filename, contentType string) (s3EventV2, *s3.GetObjectOutput) { - data, err := ioutil.ReadFile(filename) + data, err := os.ReadFile(filename) if err != nil { t.Fatal(err) } @@ -39,7 +40,7 @@ func newS3GetObjectResponse(filename string, data []byte, contentType string) *s r := bytes.NewReader(data) getObjectOutput := s3.GetObjectOutput{} getObjectOutput.ContentLength = int64(r.Len()) - getObjectOutput.Body = ioutil.NopCloser(r) + getObjectOutput.Body = io.NopCloser(r) if contentType != "" { getObjectOutput.ContentType = &contentType } @@ -157,7 +158,7 @@ func TestS3ObjectProcessor(t *testing.T) { ack := awscommon.NewEventACKTracker(ctx) err := s3ObjProc.Create(ctx, logp.NewLogger(inputName), mockPublisher, ack, s3Event).ProcessS3Object() require.Error(t, err) - assert.True(t, errors.Is(err, errFakeConnectivityFailure), "expected errFakeConnectivityFailure error") + assert.True(t, errors.Is(err, errS3DownloadFailed), "expected errS3DownloadFailed") }) t.Run("no error empty result in download", func(t *testing.T) { diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go index b94ba7cfb09b..be1d65b796eb 100644 --- a/x-pack/filebeat/input/awss3/s3_test.go +++ b/x-pack/filebeat/input/awss3/s3_test.go @@ -13,7 +13,6 @@ import ( "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/s3/types" "github.com/golang/mock/gomock" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/elastic/beats/v7/libbeat/statestore" @@ -134,12 +133,16 @@ func TestS3Poller(t *testing.T) { Return(nil, errFakeConnectivityFailure) s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}, numberOfWorkers) - receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) + states, err := newStates(inputCtx, store) + require.NoError(t, err, "states creation must succeed") + receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval) require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx)) - assert.Equal(t, numberOfWorkers, receiver.workerSem.Available()) }) - t.Run("retry after Poll error", func(t *testing.T) { + t.Run("restart bucket scan after paging errors", func(t *testing.T) { + // Change the 
diff --git a/x-pack/filebeat/input/awss3/s3_objects_test.go b/x-pack/filebeat/input/awss3/s3_objects_test.go
index 6732c12e0579..28e8f4f42a52 100644
--- a/x-pack/filebeat/input/awss3/s3_objects_test.go
+++ b/x-pack/filebeat/input/awss3/s3_objects_test.go
@@ -8,7 +8,8 @@ import (
 	"bytes"
 	"context"
 	"errors"
-	"io/ioutil"
+	"io"
+	"os"
 	"path/filepath"
 	"strings"
 	"testing"
@@ -27,7 +28,7 @@ import (
 )
 
 func newS3Object(t testing.TB, filename, contentType string) (s3EventV2, *s3.GetObjectOutput) {
-	data, err := ioutil.ReadFile(filename)
+	data, err := os.ReadFile(filename)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -39,7 +40,7 @@ func newS3GetObjectResponse(filename string, data []byte, contentType string) *s
 	r := bytes.NewReader(data)
 	getObjectOutput := s3.GetObjectOutput{}
 	getObjectOutput.ContentLength = int64(r.Len())
-	getObjectOutput.Body = ioutil.NopCloser(r)
+	getObjectOutput.Body = io.NopCloser(r)
 	if contentType != "" {
 		getObjectOutput.ContentType = &contentType
 	}
@@ -157,7 +158,7 @@ func TestS3ObjectProcessor(t *testing.T) {
 		ack := awscommon.NewEventACKTracker(ctx)
 		err := s3ObjProc.Create(ctx, logp.NewLogger(inputName), mockPublisher, ack, s3Event).ProcessS3Object()
 		require.Error(t, err)
-		assert.True(t, errors.Is(err, errFakeConnectivityFailure), "expected errFakeConnectivityFailure error")
+		assert.True(t, errors.Is(err, errS3DownloadFailed), "expected errS3DownloadFailed")
 	})
 
 	t.Run("no error empty result in download", func(t *testing.T) {
diff --git a/x-pack/filebeat/input/awss3/s3_test.go b/x-pack/filebeat/input/awss3/s3_test.go
index b94ba7cfb09b..be1d65b796eb 100644
--- a/x-pack/filebeat/input/awss3/s3_test.go
+++ b/x-pack/filebeat/input/awss3/s3_test.go
@@ -13,7 +13,6 @@ import (
 	"github.com/aws/aws-sdk-go-v2/service/s3"
 	"github.com/aws/aws-sdk-go-v2/service/s3/types"
 	"github.com/golang/mock/gomock"
-	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
 	"github.com/elastic/beats/v7/libbeat/statestore"
@@ -134,12 +133,16 @@ func TestS3Poller(t *testing.T) {
 			Return(nil, errFakeConnectivityFailure)
 
 		s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}, numberOfWorkers)
-		receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval)
+		states, err := newStates(inputCtx, store)
+		require.NoError(t, err, "states creation must succeed")
+		receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval)
 		require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx))
-		assert.Equal(t, numberOfWorkers, receiver.workerSem.Available())
 	})
 
-	t.Run("retry after Poll error", func(t *testing.T) {
+	t.Run("restart bucket scan after paging errors", func(t *testing.T) {
+		// Change the restart limit to 2 consecutive errors, so the test doesn't
+		// take too long to run
+		readerLoopMaxCircuitBreaker = 2
 		storeReg := statestore.NewRegistry(storetest.NewMemoryStoreBackend())
 		store, err := storeReg.Get("test")
 		if err != nil {
@@ -176,13 +179,13 @@ func TestS3Poller(t *testing.T) {
 		// Initial Next gets an error.
 		mockPagerFirst.EXPECT().
 			HasMorePages().
-			Times(10).
+			Times(2).
 			DoAndReturn(func() bool {
 				return true
 			})
 		mockPagerFirst.EXPECT().
 			NextPage(gomock.Any()).
-			Times(5).
+			Times(2).
 			DoAndReturn(func(_ context.Context, optFns ...func(*s3.Options)) (*s3.ListObjectsV2Output, error) {
 				return nil, errFakeConnectivityFailure
 			})
@@ -257,8 +260,9 @@ func TestS3Poller(t *testing.T) {
 			Return(nil, errFakeConnectivityFailure)
 
 		s3ObjProc := newS3ObjectProcessorFactory(logp.NewLogger(inputName), nil, mockAPI, nil, backupConfig{}, numberOfWorkers)
-		receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, newStates(inputCtx), store, bucket, "key", "region", "provider", numberOfWorkers, pollInterval)
+		states, err := newStates(inputCtx, store)
+		require.NoError(t, err, "states creation must succeed")
+		receiver := newS3Poller(logp.NewLogger(inputName), nil, mockAPI, mockPublisher, s3ObjProc, states, bucket, "key", "region", "provider", numberOfWorkers, pollInterval)
 		require.Error(t, context.DeadlineExceeded, receiver.Poll(ctx))
-		assert.Equal(t, numberOfWorkers, receiver.workerSem.Available())
 	})
 }
diff --git a/x-pack/filebeat/input/awss3/state.go b/x-pack/filebeat/input/awss3/state.go
index 97fb8d538cd6..4b7e09f9e7fa 100644
--- a/x-pack/filebeat/input/awss3/state.go
+++ b/x-pack/filebeat/input/awss3/state.go
@@ -5,84 +5,52 @@
 package awss3
 
 import (
-	"fmt"
 	"time"
 )
 
 // state is used to communicate the publishing state of a s3 object
 type state struct {
-	// ID is used to identify the state in the store, and it is composed by
-	// Bucket + Key + Etag + LastModified.String(): changing this value or how it is
-	// composed will break backward compatibilities with entries already in the store.
-	ID string `json:"id" struct:"id"`
 	Bucket       string    `json:"bucket" struct:"bucket"`
 	Key          string    `json:"key" struct:"key"`
 	Etag         string    `json:"etag" struct:"etag"`
 	LastModified time.Time `json:"last_modified" struct:"last_modified"`
 
-	// ListPrefix is used for unique of the key in the store for awsS3WriteCommitPrefix
-	ListPrefix string `json:"list_prefix" struct:"list_prefix"`
-
 	// A state has Stored = true when all events are ACKed.
 	Stored bool `json:"stored" struct:"stored"`
-	// A state has Error = true when ProcessS3Object returned an error
-	Error bool `json:"error" struct:"error"`
+
+	// Failed is true when ProcessS3Object returned an error other than
+	// errS3DownloadFailed.
+	// Before 8.14, this field was called "error". However, that field was
+	// set for many ephemeral reasons including client-side rate limiting
+	// (see https://github.com/elastic/beats/issues/39114). Now that we
+	// don't treat download errors as permanent, the field name was changed
+	// so that users upgrading from old versions aren't prevented from
+	// retrying old download failures.
+	Failed bool `json:"failed" struct:"failed"`
 }
 
+// ID is used to identify the state in the store, and it is composed of
+// Bucket + Key + Etag + LastModified.String(): changing this value or how it is
+// composed will break backward compatibility with entries already in the store.
 func stateID(bucket, key, etag string, lastModified time.Time) string {
 	return bucket + key + etag + lastModified.String()
 }
 
 // newState creates a new s3 object state
-func newState(bucket, key, etag, listPrefix string, lastModified time.Time) state {
-	s := state{
+func newState(bucket, key, etag string, lastModified time.Time) state {
+	return state{
 		Bucket:       bucket,
 		Key:          key,
 		LastModified: lastModified,
 		Etag:         etag,
-		ListPrefix:   listPrefix,
-		Stored:       false,
-		Error:        false,
 	}
-
-	s.ID = stateID(s.Bucket, s.Key, s.Etag, s.LastModified)
-
-	return s
 }
 
-// MarkAsStored set the stored flag to true
-func (s *state) MarkAsStored() {
-	s.Stored = true
-}
-
-// MarkAsError set the error flag to true
-func (s *state) MarkAsError() {
-	s.Error = true
-}
-
-// IsProcessed checks if the state is either Stored or Error
-func (s *state) IsProcessed() bool {
-	return s.Stored || s.Error
+func (s *state) ID() string {
+	return stateID(s.Bucket, s.Key, s.Etag, s.LastModified)
 }
 
 // IsEqual checks if the two states point to the same s3 object.
 func (s *state) IsEqual(c *state) bool {
 	return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified)
 }
-
-// IsEmpty checks if the state is empty
-func (s *state) IsEmpty() bool {
-	c := state{}
-	return s.Bucket == c.Bucket && s.Key == c.Key && s.Etag == c.Etag && s.LastModified.Equal(c.LastModified)
-}
-
-// String returns string representation of the struct
-func (s *state) String() string {
-	return fmt.Sprintf(
-		"{ID: %v, Bucket: %v, Key: %v, Etag: %v, LastModified: %v}",
-		s.ID,
-		s.Bucket,
-		s.Key,
-		s.Etag,
-		s.LastModified)
-}
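Because ID() is derived purely from the four identifying fields, equal states always map to the same registry key, and the key format is part of the on-disk contract. A quick illustration of what a stored key looks like, using the stateID function above (bucket, key, and etag values are made up):

```go
package main

import (
	"fmt"
	"time"
)

// Copied from the diff above; the concatenation has no separators.
func stateID(bucket, key, etag string, lastModified time.Time) string {
	return bucket + key + etag + lastModified.String()
}

func main() {
	ts := time.Date(2022, time.June, 30, 14, 13, 0, 0, time.UTC)
	id := stateID("my-bucket", "logs/app.ndjson", "etag123", ts)
	// Prints:
	// filebeat::aws-s3::state::my-bucketlogs/app.ndjsonetag1232022-06-30 14:13:00 +0000 UTC
	fmt.Println("filebeat::aws-s3::state::" + id)
}
```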
diff --git a/x-pack/filebeat/input/awss3/state_test.go b/x-pack/filebeat/input/awss3/state_test.go
index 24a5e9d81b4e..375a44ce79e2 100644
--- a/x-pack/filebeat/input/awss3/state_test.go
+++ b/x-pack/filebeat/input/awss3/state_test.go
@@ -61,7 +61,7 @@ func TestStateIsEqual(t *testing.T) {
 				Key:          "/key/to/this/file/1",
 				Etag:         "etag",
 				LastModified: lastModifed,
-				Error:        true,
+				Failed:       true,
 			},
 			{
 				Bucket: "bucket a",
diff --git a/x-pack/filebeat/input/awss3/states.go b/x-pack/filebeat/input/awss3/states.go
index 449219a867f5..edbbcc73793e 100644
--- a/x-pack/filebeat/input/awss3/states.go
+++ b/x-pack/filebeat/input/awss3/states.go
@@ -15,278 +15,64 @@ import (
 	"github.com/elastic/beats/v7/libbeat/statestore"
 )
 
-const (
-	awsS3ObjectStatePrefix = "filebeat::aws-s3::state::"
-	awsS3WriteCommitPrefix = "filebeat::aws-s3::writeCommit::"
-)
-
-type listingInfo struct {
-	totObjects int
-
-	mu            sync.Mutex
-	storedObjects int
-	errorObjects  int
-	finalCheck    bool
-}
+const awsS3ObjectStatePrefix = "filebeat::aws-s3::state::"
 
 // states handles list of s3 object state. One must use newStates to instantiate a
 // file states registry. Using the zero-value is not safe.
 type states struct {
-	sync.RWMutex
 	log *logp.Logger
 
-	// states store
-	states []state
-
-	// idx maps state IDs to state indexes for fast lookup and modifications.
-	idx map[string]int
+	// Completed S3 object states, indexed by state ID.
+	// statesLock must be held to access states.
+	states     map[string]state
+	statesLock sync.Mutex
 
-	listingIDs        map[string]struct{}
-	listingInfo       *sync.Map
-	statesByListingID map[string][]state
+	// The store used to persist state changes to the registry.
+	// storeLock must be held to access store.
+	store     *statestore.Store
+	storeLock sync.Mutex
 }
 
 // newStates generates a new states registry.
-func newStates(ctx v2.Context) *states {
-	return &states{
-		log:               ctx.Logger.Named("states"),
-		states:            nil,
-		idx:               map[string]int{},
-		listingInfo:       new(sync.Map),
-		listingIDs:        map[string]struct{}{},
-		statesByListingID: map[string][]state{},
-	}
-}
-
-func (s *states) MustSkip(state state, store *statestore.Store) bool {
-	if !s.IsNew(state) {
-		s.log.Debugw("not new state in must skip", "state", state)
-		return true
-	}
-
-	previousState := s.FindPrevious(state)
-
-	// status is forgotten. if there is no previous state and
-	// the state.LastModified is before the last cleanStore
-	// write commit we can remove
-	var commitWriteState commitWriteState
-	err := store.Get(awsS3WriteCommitPrefix+state.Bucket+state.ListPrefix, &commitWriteState)
-	if err == nil && previousState.IsEmpty() &&
-		(state.LastModified.Before(commitWriteState.Time) || state.LastModified.Equal(commitWriteState.Time)) {
-		s.log.Debugw("state.LastModified older than writeCommitState in must skip", "state", state, "commitWriteState", commitWriteState)
-		return true
-	}
-
-	// the previous state is stored or has error: let's skip
-	if !previousState.IsEmpty() && previousState.IsProcessed() {
-		s.log.Debugw("previous state is stored or has error", "state", state)
-		return true
-	}
-
-	return false
-}
-
-func (s *states) Delete(id string) {
-	s.Lock()
-	defer s.Unlock()
-
-	index := s.findPrevious(id)
-	if index >= 0 {
-		last := len(s.states) - 1
-		s.states[last], s.states[index] = s.states[index], s.states[last]
-		s.states = s.states[:last]
-
-		s.idx = map[string]int{}
-		for i, state := range s.states {
-			s.idx[state.ID] = i
-		}
-	}
-}
-
-// IsListingFullyStored check if listing if fully stored
-// After first time the condition is met it will always return false
-func (s *states) IsListingFullyStored(listingID string) bool {
-	info, ok := s.listingInfo.Load(listingID)
-	if !ok {
-		return false
-	}
-	listingInfo, ok := info.(*listingInfo)
-	if !ok {
-		return false
-	}
-
-	listingInfo.mu.Lock()
-	defer listingInfo.mu.Unlock()
-	if listingInfo.finalCheck {
-		return false
-	}
-
-	listingInfo.finalCheck = (listingInfo.storedObjects + listingInfo.errorObjects) == listingInfo.totObjects
-
-	if (listingInfo.storedObjects + listingInfo.errorObjects) > listingInfo.totObjects {
-		s.log.Warnf("unexepected mixmatch between storedObjects (%d), errorObjects (%d) and totObjects (%d)",
-			listingInfo.storedObjects, listingInfo.errorObjects, listingInfo.totObjects)
-	}
-
-	return listingInfo.finalCheck
-}
-
-// AddListing add listing info
-func (s *states) AddListing(listingID string, listingInfo *listingInfo) {
-	s.Lock()
-	defer s.Unlock()
-	s.listingIDs[listingID] = struct{}{}
-	s.listingInfo.Store(listingID, listingInfo)
-}
-
-// DeleteListing delete listing info
-func (s *states) DeleteListing(listingID string) {
-	s.Lock()
-	defer s.Unlock()
-	delete(s.listingIDs, listingID)
-	delete(s.statesByListingID, listingID)
-	s.listingInfo.Delete(listingID)
-}
-
-// Update updates a state. If previous state didn't exist, new one is created
-func (s *states) Update(newState state, listingID string) {
-	s.Lock()
-	defer s.Unlock()
-
-	id := newState.ID
-	index := s.findPrevious(id)
-
-	if index >= 0 {
-		s.states[index] = newState
-	} else {
-		// No existing state found, add new one
-		s.idx[id] = len(s.states)
-		s.states = append(s.states, newState)
-		s.log.Debug("New state added for ", newState.ID)
-	}
-
-	if listingID == "" || !newState.IsProcessed() {
-		return
-	}
-
-	// here we increase the number of stored object
-	info, ok := s.listingInfo.Load(listingID)
-	if !ok {
-		return
-	}
-	listingInfo, ok := info.(*listingInfo)
-	if !ok {
-		return
-	}
-
-	listingInfo.mu.Lock()
-
-	if newState.Stored {
-		listingInfo.storedObjects++
-	}
-
-	if newState.Error {
-		listingInfo.errorObjects++
-	}
-
-	listingInfo.mu.Unlock()
-
-	if _, ok := s.statesByListingID[listingID]; !ok {
-		s.statesByListingID[listingID] = make([]state, 0)
+func newStates(ctx v2.Context, store *statestore.Store) (*states, error) {
+	states := &states{
+		log:    ctx.Logger.Named("states"),
+		states: map[string]state{},
+		store:  store,
 	}
-
-	s.statesByListingID[listingID] = append(s.statesByListingID[listingID], newState)
+	return states, states.loadFromRegistry()
 }
 
-// FindPrevious lookups a registered state, that matching the new state.
-// Returns a zero-state if no match is found.
-func (s *states) FindPrevious(newState state) state {
-	s.RLock()
-	defer s.RUnlock()
-	id := newState.ID
-	i := s.findPrevious(id)
-	if i < 0 {
-		return state{}
-	}
-	return s.states[i]
+func (s *states) IsProcessed(state state) bool {
+	s.statesLock.Lock()
+	defer s.statesLock.Unlock()
+	// Our in-memory table only stores completed objects
+	_, ok := s.states[state.ID()]
+	return ok
 }
 
-// FindPreviousByID lookups a registered state, that matching the id.
-// Returns a zero-state if no match is found.
-func (s *states) FindPreviousByID(id string) state {
-	s.RLock()
-	defer s.RUnlock()
-	i := s.findPrevious(id)
-	if i < 0 {
-		return state{}
-	}
-	return s.states[i]
-}
-
-func (s *states) IsNew(state state) bool {
-	s.RLock()
-	defer s.RUnlock()
-	id := state.ID
-	i := s.findPrevious(id)
-
-	if i < 0 {
-		return true
-	}
+func (s *states) AddState(state state) {
 
-	return !s.states[i].IsEqual(&state)
-}
+	id := state.ID()
+	// Update in-memory copy
+	s.statesLock.Lock()
+	s.states[id] = state
+	s.statesLock.Unlock()
 
-// findPrevious returns the previous state for the file.
-// In case no previous state exists, index -1 is returned
-func (s *states) findPrevious(id string) int {
-	if i, exists := s.idx[id]; exists {
-		return i
+	// Persist to the registry
+	s.storeLock.Lock()
+	key := awsS3ObjectStatePrefix + id
+	if err := s.store.Set(key, state); err != nil {
+		s.log.Errorw("Failed to write states to the registry", "error", err)
 	}
-	return -1
-}
-
-// GetStates creates copy of the file states.
-func (s *states) GetStates() []state {
-	s.RLock()
-	defer s.RUnlock()
-
-	newStates := make([]state, len(s.states))
-	copy(newStates, s.states)
-
-	return newStates
-}
-
-// GetListingIDs return a of the listing IDs
-func (s *states) GetListingIDs() []string {
-	s.RLock()
-	defer s.RUnlock()
-	listingIDs := make([]string, 0, len(s.listingIDs))
-	for listingID := range s.listingIDs {
-		listingIDs = append(listingIDs, listingID)
-	}
-
-	return listingIDs
-}
-
-// GetStatesByListingID return a copy of the states by listing ID
-func (s *states) GetStatesByListingID(listingID string) []state {
-	s.RLock()
-	defer s.RUnlock()
-
-	if _, ok := s.statesByListingID[listingID]; !ok {
-		return nil
-	}
-
-	newStates := make([]state, len(s.statesByListingID[listingID]))
-	copy(newStates, s.statesByListingID[listingID])
-	return newStates
+	s.storeLock.Unlock()
 }
 
-func (s *states) readStatesFrom(store *statestore.Store) error {
-	var states []state
+func (s *states) loadFromRegistry() error {
+	states := map[string]state{}
 
-	err := store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) {
+	s.storeLock.Lock()
+	err := s.store.Each(func(key string, dec statestore.ValueDecoder) (bool, error) {
 		if !strings.HasPrefix(key, awsS3ObjectStatePrefix) {
 			return true, nil
 		}
@@ -294,78 +80,30 @@ func (s *states) readStatesFrom(store *statestore.Store) error {
 		// try to decode. Ignore faulty/incompatible values.
 		var st state
 		if err := dec.Decode(&st); err != nil {
-			// XXX: Do we want to log here? In case we start to store other
-			// state types in the registry, then this operation will likely fail
-			// quite often, producing some false-positives in the logs...
-			return false, err
+			// Skip this key but continue iteration
+			s.log.Warnf("invalid S3 state loading object key %v", key)
+			//nolint:nilerr // One bad object shouldn't stop iteration
+			return true, nil
+		}
+		if !st.Stored && !st.Failed {
+			// This is from an older version where state could be stored in the
+			// registry even if the object wasn't processed, or if it encountered
+			// ephemeral download errors. We don't add these to the in-memory cache,
+			// so if we see them during a bucket scan we will still retry them.
+			return true, nil
 		}
 
-		st.ID = key[len(awsS3ObjectStatePrefix):]
-		states = append(states, st)
+		states[st.ID()] = st
 		return true, nil
 	})
+	s.storeLock.Unlock()
 	if err != nil {
 		return err
 	}
 
-	states = fixStates(states)
-
-	for _, state := range states {
-		s.Update(state, "")
-	}
-
-	return nil
-}
-
-// fixStates cleans up the registry states when updating from an older version
-// of filebeat potentially writing invalid entries.
-func fixStates(states []state) []state {
-	if len(states) == 0 {
-		return states
-	}
-
-	// we use a map of states here, so to identify and merge duplicate entries.
-	idx := map[string]*state{}
-	for i := range states {
-		state := &states[i]
-
-		old, exists := idx[state.ID]
-		if !exists {
-			idx[state.ID] = state
-		} else {
-			mergeStates(old, state) // overwrite the entry in 'old'
-		}
-	}
-
-	if len(idx) == len(states) {
-		return states
-	}
-
-	i := 0
-	newStates := make([]state, len(idx))
-	for _, state := range idx {
-		newStates[i] = *state
-		i++
-	}
-	return newStates
-}
-
-// mergeStates merges 2 states by trying to determine the 'newer' state.
-// The st state is overwritten with the updated fields.
-func mergeStates(st, other *state) {
-	// update file meta-data. As these are updated concurrently by the
-	// inputs, select the newer state based on the update timestamp.
-	if st.LastModified.Before(other.LastModified) {
-		st.LastModified = other.LastModified
-	}
-}
+	s.statesLock.Lock()
+	s.states = states
+	s.statesLock.Unlock()
 
-func (s *states) writeStates(store *statestore.Store) error {
-	for _, state := range s.GetStates() {
-		key := awsS3ObjectStatePrefix + state.ID
-		if err := store.Set(key, state); err != nil {
-			return err
-		}
-	}
 	return nil
 }
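Taken together, the new states type is a write-through cache: AddState updates the in-memory map and synchronously persists the entry, while newStates rebuilds the map from the registry on startup. A hedged sketch of how the poller is expected to use it; the wrapper function is illustrative, but newState, IsProcessed, AddState, and the Stored field are the real API from this diff:

```go
package awss3

import "time"

// markProcessed is an illustrative wrapper, not a function in this patch.
func markProcessed(states *states, bucket, key, etag string, lastModified time.Time) {
	s := newState(bucket, key, etag, lastModified)
	if states.IsProcessed(s) {
		return // already stored, or failed permanently in an earlier scan
	}

	// ... download, parse, and publish the object here ...

	s.Stored = true
	states.AddState(s) // updates the in-memory map and persists to the store
}
```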
diff --git a/x-pack/filebeat/input/awss3/states_test.go b/x-pack/filebeat/input/awss3/states_test.go
index 39dc4cf82e63..2f8bbf58fdfb 100644
--- a/x-pack/filebeat/input/awss3/states_test.go
+++ b/x-pack/filebeat/input/awss3/states_test.go
@@ -14,6 +14,7 @@ import (
 	"github.com/elastic/beats/v7/libbeat/statestore/storetest"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 
 	v2 "github.com/elastic/beats/v7/filebeat/input/v2"
 	"github.com/elastic/elastic-agent-libs/logp"
@@ -46,287 +47,92 @@ var inputCtx = v2.Context{
 	Cancelation: context.Background(),
 }
 
-func TestStatesIsNewAndMustSkip(t *testing.T) {
+func TestStatesAddStateAndIsProcessed(t *testing.T) {
 	type stateTestCase struct {
-		states            func() *states
-		state             state
-		mustBeNew         bool
-		persistentStoreKV map[string]interface{}
-		expectedMustSkip  bool
-		expectedIsNew     bool
+		// An initialization callback to invoke on the (initially empty) states.
+		statesEdit func(states *states)
+
+		// The state to call IsProcessed on and the expected result
+		state               state
+		expectedIsProcessed bool
+
+		// If true, the test will run statesEdit, then create a new states
+		// object from the same persistent store before calling IsProcessed
+		// (to test persistence between restarts).
+		shouldReload bool
 	}
 
 	lastModified := time.Date(2022, time.June, 30, 14, 13, 00, 0, time.UTC)
+	testState1 := newState("bucket", "key", "etag", lastModified)
+	testState2 := newState("bucket1", "key1", "etag1", lastModified)
 	tests := map[string]stateTestCase{
 		"with empty states": {
-			states: func() *states {
-				return newStates(inputCtx)
-			},
-			state:            newState("bucket", "key", "etag", "listPrefix", lastModified),
-			expectedMustSkip: false,
-			expectedIsNew:    true,
+			state:               testState1,
+			expectedIsProcessed: false,
 		},
 		"not existing state": {
-			states: func() *states {
-				states := newStates(inputCtx)
-				states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "")
-				return states
+			statesEdit: func(states *states) {
+				states.AddState(testState2)
 			},
-			state:            newState("bucket1", "key1", "etag1", "listPrefix1", lastModified),
-			expectedMustSkip: false,
-			expectedIsNew:    true,
+			state:               testState1,
+			expectedIsProcessed: false,
 		},
 		"existing state": {
-			states: func() *states {
-				states := newStates(inputCtx)
-				states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "")
-				return states
-			},
-			state:            newState("bucket", "key", "etag", "listPrefix", lastModified),
-			expectedMustSkip: true,
-			expectedIsNew:    false,
-		},
-		"with different etag": {
-			states: func() *states {
-				states := newStates(inputCtx)
-				states.Update(newState("bucket", "key", "etag1", "listPrefix", lastModified), "")
-				return states
-			},
-			state:            newState("bucket", "key", "etag2", "listPrefix", lastModified),
-			expectedMustSkip: false,
-			expectedIsNew:    true,
-		},
-		"with different lastmodified": {
-			states: func() *states {
-				states := newStates(inputCtx)
-				states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "")
-				return states
-			},
-			state:            newState("bucket", "key", "etag", "listPrefix", lastModified.Add(1*time.Second)),
-			expectedMustSkip: false,
-			expectedIsNew:    true,
-		},
-		"with stored state": {
-			states: func() *states {
-				states := newStates(inputCtx)
-				aState := newState("bucket", "key", "etag", "listPrefix", lastModified)
-				aState.Stored = true
-				states.Update(aState, "")
-				return states
+			statesEdit: func(states *states) {
+				states.AddState(testState1)
 			},
-			state:            newState("bucket", "key", "etag", "listPrefix", lastModified),
-			mustBeNew:        true,
-			expectedMustSkip: true,
-			expectedIsNew:    true,
+			state:               testState1,
+			expectedIsProcessed: true,
 		},
-		"with error state": {
-			states: func() *states {
-				states := newStates(inputCtx)
-				aState := newState("bucket", "key", "etag", "listPrefix", lastModified)
-				aState.Error = true
-				states.Update(aState, "")
-				return states
+		"existing stored state is persisted": {
+			statesEdit: func(states *states) {
+				state := testState1
+				state.Stored = true
+				states.AddState(state)
 			},
-			state:            newState("bucket", "key", "etag", "listPrefix", lastModified),
-			mustBeNew:        true,
-			expectedMustSkip: true,
-			expectedIsNew:    true,
+			state:               testState1,
+			shouldReload:        true,
+			expectedIsProcessed: true,
 		},
-		"before commit write": {
-			states: func() *states {
-				return newStates(inputCtx)
+		"existing failed state is persisted": {
+			statesEdit: func(states *states) {
+				state := testState1
+				state.Failed = true
+				states.AddState(state)
 			},
-			persistentStoreKV: map[string]interface{}{
-				awsS3WriteCommitPrefix + "bucket" + "listPrefix": &commitWriteState{lastModified},
-			},
-			state:            newState("bucket", "key", "etag", "listPrefix", lastModified.Add(-1*time.Second)),
-			expectedMustSkip: true,
-			expectedIsNew:    true,
+			state:               testState1,
+			shouldReload:        true,
+			expectedIsProcessed: true,
 		},
-		"same commit write": {
-			states: func() *states {
-				return newStates(inputCtx)
-			},
-			persistentStoreKV: map[string]interface{}{
-				awsS3WriteCommitPrefix + "bucket" + "listPrefix": &commitWriteState{lastModified},
+		"existing unprocessed state is not persisted": {
+			statesEdit: func(states *states) {
+				states.AddState(testState1)
 			},
-			state:            newState("bucket", "key", "etag", "listPrefix", lastModified),
-			expectedMustSkip: true,
-			expectedIsNew:    true,
-		},
-		"after commit write": {
-			states: func() *states {
-				return newStates(inputCtx)
-			},
-			persistentStoreKV: map[string]interface{}{
-				awsS3WriteCommitPrefix + "bucket" + "listPrefix": &commitWriteState{lastModified},
-			},
-			state:            newState("bucket", "key", "etag", "listPrefix", lastModified.Add(time.Second)),
-			expectedMustSkip: false,
-			expectedIsNew:    true,
+			state:               testState1,
+			shouldReload:        true,
+			expectedIsProcessed: false,
 		},
 	}
 
 	for name, test := range tests {
 		test := test
 		t.Run(name, func(t *testing.T) {
-			states := test.states()
 			store := openTestStatestore()
 			persistentStore, err := store.Access()
 			if err != nil {
 				t.Fatalf("unexpected err: %v", err)
 			}
 
-			for key, value := range test.persistentStoreKV {
-				_ = persistentStore.Set(key, value)
+			states, err := newStates(inputCtx, persistentStore)
+			require.NoError(t, err, "states creation must succeed")
+			if test.statesEdit != nil {
+				test.statesEdit(states)
 			}
-
-			if test.mustBeNew {
-				test.state.LastModified = test.state.LastModified.Add(1 * time.Second)
+			if test.shouldReload {
+				states, err = newStates(inputCtx, persistentStore)
+				require.NoError(t, err, "states creation must succeed")
 			}
 
-			isNew := states.IsNew(test.state)
-			assert.Equal(t, test.expectedIsNew, isNew)
-
-			mustSkip := states.MustSkip(test.state, persistentStore)
-			assert.Equal(t, test.expectedMustSkip, mustSkip)
+			isProcessed := states.IsProcessed(test.state)
+			assert.Equal(t, test.expectedIsProcessed, isProcessed)
 		})
 	}
 }
-
-func TestStatesDelete(t *testing.T) {
-	type stateTestCase struct {
-		states   func() *states
-		deleteID string
-		expected []state
-	}
-
-	lastModified := time.Date(2021, time.July, 22, 18, 38, 00, 0, time.UTC)
-	tests := map[string]stateTestCase{
-		"delete empty states": {
-			states: func() *states {
-				return newStates(inputCtx)
-			},
-			deleteID: "an id",
-			expected: []state{},
-		},
-		"delete not existing state": {
-			states: func() *states {
-				states := newStates(inputCtx)
-				states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "")
-				return states
-			},
-			deleteID: "an id",
-			expected: []state{
-				{
-					ID:           stateID("bucket", "key", "etag", lastModified),
-					Bucket:       "bucket",
-					Key:          "key",
-					Etag:         "etag",
-					ListPrefix:   "listPrefix",
-					LastModified: lastModified,
-				},
-			},
-		},
-		"delete only one existing": {
-			states: func() *states {
-				states := newStates(inputCtx)
-				states.Update(newState("bucket", "key", "etag", "listPrefix", lastModified), "")
-				return states
-			},
-			deleteID: stateID("bucket", "key", "etag", lastModified),
-			expected: []state{},
-		},
-		"delete first": {
-			states: func() *states {
-				states := newStates(inputCtx)
-				states.Update(newState("bucket", "key1", "etag1", "listPrefix", lastModified), "")
-				states.Update(newState("bucket", "key2", "etag2", "listPrefix", lastModified), "")
-				states.Update(newState("bucket", "key3", "etag3", "listPrefix", lastModified), "")
-				return states
-			},
-			deleteID: "bucketkey1etag1" + lastModified.String(),
-			expected: []state{
-				{
-					ID:           stateID("bucket", "key3", "etag3", lastModified),
-					Bucket:       "bucket",
-					Key:          "key3",
-					Etag:         "etag3",
-					ListPrefix:   "listPrefix",
-					LastModified: lastModified,
-				},
-				{
-					ID:           stateID("bucket", "key2", "etag2", lastModified),
-					Bucket:       "bucket",
-					Key:          "key2",
-					Etag:         "etag2",
-					ListPrefix:   "listPrefix",
-					LastModified: lastModified,
-				},
-			},
-		},
-		"delete last": {
-			states: func() *states {
-				states := newStates(inputCtx)
-				states.Update(newState("bucket", "key1", "etag1", "listPrefix", lastModified), "")
-				states.Update(newState("bucket", "key2", "etag2", "listPrefix", lastModified), "")
-				states.Update(newState("bucket", "key3", "etag3", "listPrefix", lastModified), "")
-				return states
-			},
-			deleteID: "bucketkey3etag3" + lastModified.String(),
-			expected: []state{
-				{
-					ID:           stateID("bucket", "key1", "etag1", lastModified),
-					Bucket:       "bucket",
-					Key:          "key1",
-					Etag:         "etag1",
-					ListPrefix:   "listPrefix",
-					LastModified: lastModified,
-				},
-				{
-					ID:           stateID("bucket", "key2", "etag2", lastModified),
-					Bucket:       "bucket",
-					Key:          "key2",
-					Etag:         "etag2",
-					ListPrefix:   "listPrefix",
-					LastModified: lastModified,
-				},
-			},
-		},
-		"delete any": {
-			states: func() *states {
-				states := newStates(inputCtx)
-				states.Update(newState("bucket", "key1", "etag1", "listPrefix", lastModified), "")
-				states.Update(newState("bucket", "key2", "etag2", "listPrefix", lastModified), "")
-				states.Update(newState("bucket", "key3", "etag3", "listPrefix", lastModified), "")
-				return states
-			},
-			deleteID: "bucketkey2etag2" + lastModified.String(),
-			expected: []state{
-				{
-					ID:           stateID("bucket", "key1", "etag1", lastModified),
-					Bucket:       "bucket",
-					Key:          "key1",
-					Etag:         "etag1",
-					ListPrefix:   "listPrefix",
-					LastModified: lastModified,
-				},
-				{
-					ID:           stateID("bucket", "key3", "etag3", lastModified),
-					Bucket:       "bucket",
-					Key:          "key3",
-					Etag:         "etag3",
-					ListPrefix:   "listPrefix",
-					LastModified: lastModified,
-				},
-			},
-		},
-	}
-
-	for name, test := range tests {
-		test := test
-		t.Run(name, func(t *testing.T) {
-			states := test.states()
-			states.Delete(test.deleteID)
-			assert.Equal(t, test.expected, states.GetStates())
-		})
-	}
-}
diff --git a/x-pack/metricbeat/module/azure/azure_test.go b/x-pack/metricbeat/module/azure/azure_test.go
new file mode 100644
index 000000000000..c3d67525ddb9
--- /dev/null
+++ b/x-pack/metricbeat/module/azure/azure_test.go
@@ -0,0 +1,39 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+
+package azure
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestGroupMetricsDefinitionsByResourceId(t *testing.T) {
+
+	t.Run("Group metrics definitions by resource ID", func(t *testing.T) {
+		metrics := []Metric{
+			{
+				ResourceId: "resource-1",
+				Namespace:  "namespace-1",
+				Names:      []string{"metric-1"},
+			},
+			{
+				ResourceId: "resource-1",
+				Namespace:  "namespace-1",
+				Names:      []string{"metric-2"},
+			},
+			{
+				ResourceId: "resource-1",
+				Namespace:  "namespace-1",
+				Names:      []string{"metric-3"},
+			},
+		}
+
+		metricsByResourceId := groupMetricsDefinitionsByResourceId(metrics)
+
+		assert.Equal(t, 1, len(metricsByResourceId))
+		assert.Equal(t, 3, len(metricsByResourceId["resource-1"]))
+	})
+}
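The helper under test is not itself shown in this diff; based on the assertions (one key, three grouped definitions), it presumably reduces to something like the following hypothetical reconstruction:

```go
// Hypothetical reconstruction, inferred only from the test above.
func groupMetricsDefinitionsByResourceId(metrics []Metric) map[string][]Metric {
	byResourceId := make(map[string][]Metric)
	for _, metric := range metrics {
		byResourceId[metric.ResourceId] = append(byResourceId[metric.ResourceId], metric)
	}
	return byResourceId
}
```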
diff --git a/x-pack/metricbeat/module/azure/client_test.go b/x-pack/metricbeat/module/azure/client_test.go
index 79b1742ded0f..c23326ac82b7 100644
--- a/x-pack/metricbeat/module/azure/client_test.go
+++ b/x-pack/metricbeat/module/azure/client_test.go
@@ -9,10 +9,12 @@ import (
 	"testing"
 	"time"
 
+	"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
 	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/monitor/armmonitor"
 	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/mock"
+	"github.com/stretchr/testify/require"
 )
 
 var (
@@ -35,6 +37,7 @@ var (
 		},
 	}}},
 	}
+	countUnit = armmonitor.MetricUnit("Count")
 )
 
 func mockMapResourceMetrics(client *Client, resources []*armresources.GenericResourceExpanded, resourceConfig ResourceConfig) ([]Metric, error) {
@@ -112,4 +115,157 @@ func TestGetMetricValues(t *testing.T) {
 		assert.Equal(t, len(client.ResourceConfigurations.Metrics[0].Values), 0)
 		m.AssertExpectations(t)
 	})
+
+	t.Run("multiple aggregation types", func(t *testing.T) {
+		client := NewMockClient()
+		referenceTime := time.Now().UTC()
+		client.ResourceConfigurations = ResourceConfiguration{
+			Metrics: []Metric{
+				{
+					Namespace:    "Microsoft.EventHub/Namespaces",
+					Names:        []string{"ActiveConnections"},
+					Aggregations: "Maximum,Minimum,Average",
+					TimeGrain:    "PT1M",
+				},
+			},
+		}
+
+		m := &MockService{}
+		m.On(
+			"GetMetricValues",
+			mock.Anything,
+			mock.Anything,
+			mock.Anything,
+			mock.Anything,
+			mock.Anything,
+			mock.Anything,
+			mock.Anything,
+		).Return(
+			[]armmonitor.Metric{{
+				ID: to.Ptr("test"),
+				Name: &armmonitor.LocalizableString{
+					Value:          to.Ptr("ActiveConnections"),
+					LocalizedValue: to.Ptr("ActiveConnections"),
+				},
+				Timeseries: []*armmonitor.TimeSeriesElement{{
+					Data: []*armmonitor.MetricValue{{
+						Average:   to.Ptr(1.0),
+						Maximum:   to.Ptr(2.0),
+						Minimum:   to.Ptr(3.0),
+						TimeStamp: to.Ptr(time.Now()),
+					}},
+				}},
+				Type:               to.Ptr("Microsoft.Insights/metrics"),
+				Unit:               &countUnit,
+				DisplayDescription: to.Ptr("Total Active Connections for Microsoft.EventHub."),
+				ErrorCode:          to.Ptr("Success"),
+			}},
+			"PT1M",
+			nil,
+		)
+
+		client.AzureMonitorService = m
+		mr := MockReporterV2{}
+
+		metricValues := client.GetMetricValues(referenceTime, client.ResourceConfigurations.Metrics, &mr)
+
+		require.Equal(t, len(metricValues), 1)
+		require.Equal(t, len(metricValues[0].Values), 1)
+
+		assert.Equal(t, *metricValues[0].Values[0].avg, 1.0)
+		assert.Equal(t, *metricValues[0].Values[0].max, 2.0)
+		assert.Equal(t, *metricValues[0].Values[0].min, 3.0)
+
+		require.Equal(t, len(client.ResourceConfigurations.Metrics[0].Values), 1)
+
+		m.AssertExpectations(t)
+	})
+
+	t.Run("single aggregation types", func(t *testing.T) {
+		client := NewMockClient()
+		referenceTime := time.Now().UTC()
+		timestamp := time.Now().UTC()
+		client.ResourceConfigurations = ResourceConfiguration{
+			Metrics: []Metric{
+				{
+					Namespace:    "Microsoft.EventHub/Namespaces",
+					Names:        []string{"ActiveConnections"},
+					Aggregations: "Maximum",
+					TimeGrain:    "PT1M",
+				}, {
+					Namespace:    "Microsoft.EventHub/Namespaces",
+					Names:        []string{"ActiveConnections"},
+					Aggregations: "Minimum",
+					TimeGrain:    "PT1M",
+				}, {
+					Namespace:    "Microsoft.EventHub/Namespaces",
+					Names:        []string{"ActiveConnections"},
+					Aggregations: "Average",
+					TimeGrain:    "PT1M",
+				},
+			},
+		}
+
+		m := &MockService{}
+
+		x := []struct {
+			aggregation string
+			data        []*armmonitor.MetricValue
+		}{
+			{aggregation: "Maximum", data: []*armmonitor.MetricValue{{Maximum: to.Ptr(3.0), TimeStamp: to.Ptr(timestamp)}}},
+			{aggregation: "Minimum", data: []*armmonitor.MetricValue{{Minimum: to.Ptr(1.0), TimeStamp: to.Ptr(timestamp)}}},
+			{aggregation: "Average", data: []*armmonitor.MetricValue{{Average: to.Ptr(2.0), TimeStamp: to.Ptr(timestamp)}}},
+		}
+
+		for _, v := range x {
+			m.On(
+				"GetMetricValues",
+				mock.Anything,
+				mock.Anything,
+				mock.Anything,
+				mock.Anything,
+				mock.Anything,
+				v.aggregation,
+				mock.Anything,
+			).Return(
+				[]armmonitor.Metric{{
+					ID: to.Ptr("test"),
+					Name: &armmonitor.LocalizableString{
+						Value:          to.Ptr("ActiveConnections"),
+						LocalizedValue: to.Ptr("ActiveConnections"),
+					},
+					Timeseries: []*armmonitor.TimeSeriesElement{{
+						Data: v.data,
+					}},
+					Type:               to.Ptr("Microsoft.Insights/metrics"),
+					Unit:               &countUnit,
+					DisplayDescription: to.Ptr("Total Active Connections for Microsoft.EventHub."),
+					ErrorCode:          to.Ptr("Success"),
+				}},
+				"PT1M",
+				nil,
+			).Once()
+		}
+
+		client.AzureMonitorService = m
+		mr := MockReporterV2{}
+
+		metricValues := client.GetMetricValues(referenceTime, client.ResourceConfigurations.Metrics, &mr)
+
+		require.Equal(t, 3, len(metricValues))
+
+		require.Equal(t, 1, len(metricValues[0].Values))
+		require.Equal(t, 1, len(metricValues[1].Values))
+		require.Equal(t, 1, len(metricValues[2].Values))
+
+		require.NotNil(t, metricValues[0].Values[0].max, "max value is nil")
+		require.NotNil(t, metricValues[1].Values[0].min, "min value is nil")
+		require.NotNil(t, metricValues[2].Values[0].avg, "avg value is nil")
+
+		assert.Equal(t, *metricValues[0].Values[0].max, 3.0)
+		assert.Equal(t, *metricValues[1].Values[0].min, 1.0)
+		assert.Equal(t, *metricValues[2].Values[0].avg, 2.0)
+
+		m.AssertExpectations(t)
+	})
 }
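These tests lean on the stricter mock wired up in mock_service.go further down: once Called receives all seven arguments, each .On(...) can pin the aggregation argument and be consumed with .Once(), so the same method returns different data per aggregation type. The mechanism in isolation, as a self-contained testify example with illustrative names:

```go
package azure_test

import (
	"testing"

	"github.com/stretchr/testify/mock"
)

// fakeService demonstrates per-argument expectations in isolation.
type fakeService struct{ mock.Mock }

func (f *fakeService) Get(aggregation string) string {
	args := f.Called(aggregation)
	return args.String(0)
}

func TestPerArgumentExpectations(t *testing.T) {
	f := &fakeService{}
	// One expectation per argument value; .Once() consumes each after one call.
	f.On("Get", "Maximum").Return("max-data").Once()
	f.On("Get", "Minimum").Return("min-data").Once()

	if got := f.Get("Minimum"); got != "min-data" {
		t.Fatalf("got %q", got)
	}
	if got := f.Get("Maximum"); got != "max-data" {
		t.Fatalf("got %q", got)
	}
	f.AssertExpectations(t)
}
```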
diff --git a/x-pack/metricbeat/module/azure/data.go b/x-pack/metricbeat/module/azure/data.go
index c46aee9da246..b2fffb404262 100644
--- a/x-pack/metricbeat/module/azure/data.go
+++ b/x-pack/metricbeat/module/azure/data.go
@@ -133,41 +133,8 @@ func mapToKeyValuePoints(metrics []Metric) []KeyValuePoint {
 	var points []KeyValuePoint
 	for _, metric := range metrics {
 		for _, value := range metric.Values {
-			point := KeyValuePoint{
-				Timestamp:  value.timestamp,
-				Dimensions: mapstr.M{},
-			}
-
 			metricName := managePropertyName(value.name)
-			switch {
-			case value.min != nil:
-				point.Key = fmt.Sprintf("%s.%s", metricName, "min")
-				point.Value = value.min
-			case value.max != nil:
-				point.Key = fmt.Sprintf("%s.%s", metricName, "max")
-				point.Value = value.max
-			case value.avg != nil:
-				point.Key = fmt.Sprintf("%s.%s", metricName, "avg")
-				point.Value = value.avg
-			case value.total != nil:
-				point.Key = fmt.Sprintf("%s.%s", metricName, "total")
-				point.Value = value.total
-			case value.count != nil:
-				point.Key = fmt.Sprintf("%s.%s", metricName, "count")
-				point.Value = value.count
-			}
-
-			point.Namespace = metric.Namespace
-			point.ResourceId = metric.ResourceId
-			point.ResourceSubId = metric.ResourceSubId
-			point.TimeGrain = metric.TimeGrain
 
-			// The number of dimensions in the metric definition and the
-			// number of dimensions in the metric values should be the same.
-			//
-			// But, since definitions and values are retrieved from different
-			// API endpoints, we need to make sure that we don't panic if the
-			// number of dimensions is different.
+			dimensions := mapstr.M{}
 			if len(metric.Dimensions) == len(value.dimensions) {
 				// Take the dimension name from the metric definition and the
 				// dimension value from the metric value.
@@ -180,11 +147,75 @@ func mapToKeyValuePoints(metrics []Metric) []KeyValuePoint {
 					// Dimensions from metric definition and metric value are
 					// not guaranteed to be in the same order, so we need to
 					// find by name the right value for each dimension.
-					_, _ = point.Dimensions.Put(dim.Name, getDimensionValue(dim.Name, value.dimensions))
+					_, _ = dimensions.Put(dim.Name, getDimensionValue(dim.Name, value.dimensions))
 				}
 			}
 
-			points = append(points, point)
+			if value.min != nil {
+				points = append(points, KeyValuePoint{
+					Key:           fmt.Sprintf("%s.%s", metricName, "min"),
+					Value:         value.min,
+					Namespace:     metric.Namespace,
+					ResourceId:    metric.ResourceId,
+					ResourceSubId: metric.ResourceSubId,
+					TimeGrain:     metric.TimeGrain,
+					Dimensions:    dimensions,
+					Timestamp:     value.timestamp,
+				})
+			}
+
+			if value.max != nil {
+				points = append(points, KeyValuePoint{
+					Key:           fmt.Sprintf("%s.%s", metricName, "max"),
+					Value:         value.max,
+					Namespace:     metric.Namespace,
+					ResourceId:    metric.ResourceId,
+					ResourceSubId: metric.ResourceSubId,
+					TimeGrain:     metric.TimeGrain,
+					Dimensions:    dimensions,
+					Timestamp:     value.timestamp,
+				})
+			}
+
+			if value.avg != nil {
+				points = append(points, KeyValuePoint{
+					Key:           fmt.Sprintf("%s.%s", metricName, "avg"),
+					Value:         value.avg,
+					Namespace:     metric.Namespace,
+					ResourceId:    metric.ResourceId,
+					ResourceSubId: metric.ResourceSubId,
+					TimeGrain:     metric.TimeGrain,
+					Dimensions:    dimensions,
+					Timestamp:     value.timestamp,
+				})
+			}
+
+			if value.total != nil {
+				points = append(points, KeyValuePoint{
+					Key:           fmt.Sprintf("%s.%s", metricName, "total"),
+					Value:         value.total,
+					Namespace:     metric.Namespace,
+					ResourceId:    metric.ResourceId,
+					ResourceSubId: metric.ResourceSubId,
+					TimeGrain:     metric.TimeGrain,
+					Dimensions:    dimensions,
+					Timestamp:     value.timestamp,
+				})
+			}
+
+			if value.count != nil {
+				points = append(points, KeyValuePoint{
+					Key:           fmt.Sprintf("%s.%s", metricName, "count"),
+					Value:         value.count,
+					Namespace:     metric.Namespace,
+					ResourceId:    metric.ResourceId,
+					ResourceSubId: metric.ResourceSubId,
+					TimeGrain:     metric.TimeGrain,
+					Dimensions:    dimensions,
+					Timestamp:     value.timestamp,
+				})
+			}
 		}
 	}
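The five `if value.X != nil` blocks in the new mapToKeyValuePoints differ only in the key suffix and the value field. A possible tightening, sketched as a closure that would sit inside the value loop, under the assumption that KeyValuePoint.Value accepts a *float64 (an editorial suggestion, not part of the patch):

```go
// addPoint appends one KeyValuePoint per non-nil aggregation value.
addPoint := func(suffix string, v *float64) {
	if v == nil {
		return
	}
	points = append(points, KeyValuePoint{
		Key:           fmt.Sprintf("%s.%s", metricName, suffix),
		Value:         v,
		Namespace:     metric.Namespace,
		ResourceId:    metric.ResourceId,
		ResourceSubId: metric.ResourceSubId,
		TimeGrain:     metric.TimeGrain,
		Dimensions:    dimensions,
		Timestamp:     value.timestamp,
	})
}

addPoint("min", value.min)
addPoint("max", value.max)
addPoint("avg", value.avg)
addPoint("total", value.total)
addPoint("count", value.count)
```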
diff --git a/x-pack/metricbeat/module/azure/data_test.go b/x-pack/metricbeat/module/azure/data_test.go
index 85b781ed64ec..1519f78982d2 100644
--- a/x-pack/metricbeat/module/azure/data_test.go
+++ b/x-pack/metricbeat/module/azure/data_test.go
@@ -62,7 +62,37 @@ func TestMapToKeyValuePoints(t *testing.T) {
 	resourceSubId := "test"
 	timeGrain := "PT1M"
 
-	t.Run("test aggregation types", func(t *testing.T) {
+	t.Run("test single aggregation type (single config)", func(t *testing.T) {
+
+		metrics := []Metric{{
+			Namespace:     namespace,
+			Names:         []string{"test"},
+			Aggregations:  "min",
+			Values:        []MetricValue{{name: metricName, min: &minValue, timestamp: timestamp}},
+			TimeGrain:     timeGrain,
+			ResourceId:    resourceId,
+			ResourceSubId: resourceSubId,
+		}}
+
+		actual := mapToKeyValuePoints(metrics)
+
+		expected := []KeyValuePoint{
+			{
+				Key:           fmt.Sprintf("%s.%s", metricName, "min"),
+				Value:         &minValue,
+				Namespace:     namespace,
+				TimeGrain:     timeGrain,
+				Timestamp:     timestamp,
+				ResourceId:    resourceId,
+				ResourceSubId: resourceSubId,
+				Dimensions:    map[string]interface{}{},
+			},
+		}
+
+		assert.Equal(t, expected, actual)
+	})
+
+	t.Run("test single aggregation types (multiple configs)", func(t *testing.T) {
 
 		metrics := []Metric{{
 			Namespace: namespace,
@@ -161,4 +191,79 @@ func TestMapToKeyValuePoints(t *testing.T) {
 
 		assert.Equal(t, expected, actual)
 	})
+
+	t.Run("test multiple aggregation types (multiple configs)", func(t *testing.T) {
+		metrics := []Metric{{
+			Namespace:    namespace,
+			Names:        []string{"test"},
+			Aggregations: "Minimum,Maximum,Average,Total,Count",
+			Values: []MetricValue{
+				{name: metricName, min: &minValue, timestamp: timestamp},
+				{name: metricName, max: &maxValue, timestamp: timestamp},
+				{name: metricName, avg: &avgValue, timestamp: timestamp},
+				{name: metricName, total: &totalValue, timestamp: timestamp},
+				{name: metricName, count: &countValue, timestamp: timestamp},
+			},
+			TimeGrain:     timeGrain,
+			ResourceId:    resourceId,
+			ResourceSubId: resourceSubId,
+		}}
+
+		actual := mapToKeyValuePoints(metrics)
+
+		expected := []KeyValuePoint{
+			{
+				Key:           fmt.Sprintf("%s.%s", metricName, "min"),
+				Value:         &minValue,
+				Namespace:     namespace,
+				TimeGrain:     timeGrain,
+				Timestamp:     timestamp,
+				ResourceId:    resourceId,
+				ResourceSubId: resourceSubId,
+				Dimensions:    map[string]interface{}{},
+			},
+			{
+				Key:           fmt.Sprintf("%s.%s", metricName, "max"),
+				Value:         &maxValue,
+				Namespace:     namespace,
+				TimeGrain:     timeGrain,
+				Timestamp:     timestamp,
+				ResourceId:    resourceId,
+				ResourceSubId: resourceSubId,
+				Dimensions:    map[string]interface{}{},
+			},
+			{
+				Key:           fmt.Sprintf("%s.%s", metricName, "avg"),
+				Value:         &avgValue,
+				Namespace:     namespace,
+				TimeGrain:     timeGrain,
+				Timestamp:     timestamp,
+				ResourceId:    resourceId,
+				ResourceSubId: resourceSubId,
+				Dimensions:    map[string]interface{}{},
+			},
+			{
+				Key:           fmt.Sprintf("%s.%s", metricName, "total"),
+				Value:         &totalValue,
+				Namespace:     namespace,
+				TimeGrain:     timeGrain,
+				Timestamp:     timestamp,
+				ResourceId:    resourceId,
+				ResourceSubId: resourceSubId,
+				Dimensions:    map[string]interface{}{},
+			},
+			{
+				Key:           fmt.Sprintf("%s.%s", metricName, "count"),
+				Value:         &countValue,
+				Namespace:     namespace,
+				TimeGrain:     timeGrain,
+				Timestamp:     timestamp,
+				ResourceId:    resourceId,
+				ResourceSubId: resourceSubId,
+				Dimensions:    map[string]interface{}{},
+			},
+		}
+
+		assert.Equal(t, expected, actual)
+	})
 }
diff --git a/x-pack/metricbeat/module/azure/metric_registry.go b/x-pack/metricbeat/module/azure/metric_registry.go
index cdaa9496b5d6..c127701c996e 100644
--- a/x-pack/metricbeat/module/azure/metric_registry.go
+++ b/x-pack/metricbeat/module/azure/metric_registry.go
@@ -5,6 +5,7 @@
 package azure
 
 import (
+	"fmt"
 	"strings"
 	"time"
 
@@ -118,8 +119,14 @@ func (m *MetricRegistry) buildMetricKey(metric Metric) string {
 	keyComponents := []string{
 		metric.Namespace,
 		metric.ResourceId,
+		metric.Aggregations,
+		metric.TimeGrain,
+		strings.Join(metric.Names, ","),
+	}
+
+	for _, dim := range metric.Dimensions {
+		keyComponents = append(keyComponents, fmt.Sprintf("%s=%s", dim.Name, dim.Value))
 	}
-	keyComponents = append(keyComponents, metric.Names...)
 
 	return strings.Join(keyComponents, ",")
 }
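With the extra key components, two configurations that share a resource and namespace but differ in aggregation, timegrain, or dimensions no longer collide in the registry, which is exactly what the tests below rely on. A self-contained illustration mirroring the new buildMetricKey logic on simplified types:

```go
package main

import (
	"fmt"
	"strings"
)

type dim struct{ Name, Value string }

// buildKey mirrors the new buildMetricKey: namespace, resource, aggregations,
// timegrain, joined metric names, then one component per dimension.
func buildKey(ns, res, aggs, grain string, names []string, dims []dim) string {
	parts := []string{ns, res, aggs, grain, strings.Join(names, ",")}
	for _, d := range dims {
		parts = append(parts, fmt.Sprintf("%s=%s", d.Name, d.Value))
	}
	return strings.Join(parts, ",")
}

func main() {
	a := buildKey("ns", "res", "Maximum", "PT1M", []string{"m1"}, nil)
	b := buildKey("ns", "res", "Minimum", "PT1M", []string{"m1"}, nil)
	fmt.Println(a != b) // true: before this change the two keys collided
}
```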
diff --git a/x-pack/metricbeat/module/azure/metric_registry_test.go b/x-pack/metricbeat/module/azure/metric_registry_test.go
index a0ecdc84b85d..63984aa6b59e 100644
--- a/x-pack/metricbeat/module/azure/metric_registry_test.go
+++ b/x-pack/metricbeat/module/azure/metric_registry_test.go
@@ -13,7 +13,7 @@ import (
 	"github.com/elastic/elastic-agent-libs/logp"
 )
 
-func TestNewMetricRegistry(t *testing.T) {
+func TestMetricRegistry(t *testing.T) {
 	logger := logp.NewLogger("test azure monitor")
 
 	t.Run("Collect metrics with a regular 5 minutes period", func(t *testing.T) {
@@ -90,4 +90,140 @@ func TestNewMetricRegistry(t *testing.T) {
 
 		assert.True(t, needsUpdate, "metric should not need update")
 	})
+
+	t.Run("Metrics with different aggregation types", func(t *testing.T) {
+		metricRegistry := NewMetricRegistry(logger)
+
+		referenceTime := time.Now().UTC()
+		lastCollectionAt := referenceTime.Add(-time.Minute * 10)
+
+		metric1 := Metric{
+			ResourceId:   "test",
+			Namespace:    "test",
+			Aggregations: "Maximum",
+		}
+		metric2 := Metric{
+			ResourceId:   "test",
+			Namespace:    "test",
+			Aggregations: "Minimum",
+		}
+
+		metricCollectionInfo := MetricCollectionInfo{
+			timeGrain: "PT5M",
+			timestamp: lastCollectionAt,
+		}
+
+		// Update metrics collection info for previous collection
+		metricRegistry.Update(metric1, metricCollectionInfo)
+		metricRegistry.Update(metric2, metricCollectionInfo)
+
+		// Update metric info for metric1
+		metricRegistry.Update(metric1, MetricCollectionInfo{
+			timeGrain: "PT5M",
+			timestamp: referenceTime,
+		})
+
+		// Check if metrics need update
+		metric1NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric1)
+		metric2NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric2)
+
+		assert.False(t, metric1NeedsUpdate, "metric should not need update")
+		assert.True(t, metric2NeedsUpdate, "metric should need update")
+	})
+
+	t.Run("Metrics with different dimensions", func(t *testing.T) {
+		metricRegistry := NewMetricRegistry(logger)
+
+		referenceTime := time.Now().UTC()
+		lastCollectionAt := referenceTime.Add(-time.Minute * 10)
+
+		metric1 := Metric{
+			ResourceId: "resource-id-1",
+			Namespace:  "namespace-1",
+			Names:      []string{"metric-name-1"},
+			Dimensions: []Dimension{
+				{Name: "dimension-1", Value: "*"},
+			},
+			TimeGrain: "PT1M",
+		}
+		metric2 := Metric{
+			ResourceId: "resource-id-1",
+			Namespace:  "namespace-1",
+			Names:      []string{"metric-name-1"},
+			Dimensions: []Dimension{
+				{Name: "dimension-2", Value: "*"},
+			},
+			TimeGrain: "PT1M",
+		}
+
+		metricCollectionInfo := MetricCollectionInfo{
+			timeGrain: "PT1M",
+			timestamp: lastCollectionAt,
+		}
+
+		// Update metrics collection info for previous collection
+		metricRegistry.Update(metric1, metricCollectionInfo)
+		metricRegistry.Update(metric2, metricCollectionInfo)
+
+		// Update metric info for metric1
+		metricRegistry.Update(metric1, MetricCollectionInfo{
+			timeGrain: "PT1M",
+			timestamp: referenceTime,
+		})
+
+		// Check if metrics need update
+		metric1NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric1)
+		metric2NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric2)
+
+		assert.False(t, metric1NeedsUpdate, "metric should not need update")
+		assert.True(t, metric2NeedsUpdate, "metric should need update")
+	})
+
+	t.Run("Metrics with different timegrain", func(t *testing.T) {
+		metricRegistry := NewMetricRegistry(logger)
+
+		referenceTime := time.Now().UTC()
+		lastCollectionAt := referenceTime.Add(-time.Minute * 10)
+
+		metric1 := Metric{
+			ResourceId: "resource-id-1",
+			Namespace:  "namespace-1",
+			Names:      []string{"metric-name-1"},
+			Dimensions: []Dimension{
+				{Name: "dimension-1", Value: "*"},
+			},
+			TimeGrain: "PT1M",
+		}
+		metric2 := Metric{
+			ResourceId: "resource-id-1",
+			Namespace:  "namespace-1",
+			Names:      []string{"metric-name-1"},
+			Dimensions: []Dimension{
+				{Name: "dimension-1", Value: "*"},
+			},
+			TimeGrain: "PT5M",
+		}
+
+		metricCollectionInfo := MetricCollectionInfo{
+			timeGrain: "PT1M",
+			timestamp: lastCollectionAt,
+		}
+
+		// Update metrics collection info for previous collection
+		metricRegistry.Update(metric1, metricCollectionInfo)
+		metricRegistry.Update(metric2, metricCollectionInfo)
+
+		// Update metric info for metric1
+		metricRegistry.Update(metric1, MetricCollectionInfo{
+			timeGrain: "PT1M",
+			timestamp: referenceTime,
+		})
+
+		// Check if metrics need update
+		metric1NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric1)
+		metric2NeedsUpdate := metricRegistry.NeedsUpdate(referenceTime, metric2)
+
+		assert.False(t, metric1NeedsUpdate, "metric should not need update")
+		assert.True(t, metric2NeedsUpdate, "metric should need update")
+	})
 }
diff --git a/x-pack/metricbeat/module/azure/mock_service.go b/x-pack/metricbeat/module/azure/mock_service.go
index 9626952fa6d1..293adc7c9a78 100644
--- a/x-pack/metricbeat/module/azure/mock_service.go
+++ b/x-pack/metricbeat/module/azure/mock_service.go
@@ -43,7 +43,7 @@ func (client *MockService) GetMetricNamespaces(resourceId string) (armmonitor.Me
 
 // GetMetricValues is a mock function for the azure service
 func (client *MockService) GetMetricValues(resourceId string, namespace string, timegrain string, timespan string, metricNames []string, aggregations string, filter string) ([]armmonitor.Metric, string, error) {
-	args := client.Called(resourceId, namespace)
+	args := client.Called(resourceId, namespace, timegrain, timespan, metricNames, aggregations, filter)
 	return args.Get(0).([]armmonitor.Metric), args.String(1), args.Error(2)
 }
diff --git a/x-pack/metricbeat/module/azure/service_interface.go b/x-pack/metricbeat/module/azure/service_interface.go
index cb524c7f6ea5..75ae48d3d6e4 100644
--- a/x-pack/metricbeat/module/azure/service_interface.go
+++ b/x-pack/metricbeat/module/azure/service_interface.go
@@ -15,5 +15,16 @@ type Service interface {
 	GetResourceDefinitions(id []string, group []string, rType string, query string) ([]*armresources.GenericResourceExpanded, error)
 	GetMetricDefinitionsWithRetry(resourceId string, namespace string) (armmonitor.MetricDefinitionCollection, error)
 	GetMetricNamespaces(resourceId string) (armmonitor.MetricNamespaceCollection, error)
-	GetMetricValues(resourceId string, namespace string, timegrain string, timespan string, metricNames []string, aggregations string, filter string) ([]armmonitor.Metric, string, error)
+	// GetMetricValues returns the metric values for the given resource ID, namespace, timegrain, timespan, metricNames, aggregations and filter.
+	//
+	// If the timegrain is empty, the default timegrain for the metric is used and returned.
+	GetMetricValues(
+		resourceId string, // resourceId is the ID of the resource to query (e.g. "/subscriptions/{subscriptionId}/resourceGroups/{resourceGroupName}/providers/{resourceProviderNamespace}/{resourceType}/{resourceName}")
+		namespace string, // namespace is the metric namespace to query (e.g. "Microsoft.Compute/virtualMachines")
+		timegrain string, // timegrain is the timegrain to use for the metric query (e.g. "PT1M"); if empty, returns the default timegrain for the metric.
+		timespan string, // timespan is the time interval to query (e.g. 2024-04-29T14:03:00Z/2024-04-29T14:04:00Z)
+		metricNames []string, // metricNames is the list of metric names to query (e.g. ["ServiceApiLatency", "Availability"])
+		aggregations string, // aggregations is the comma-separated list of aggregations to use for the metric query (e.g. "Average,Maximum,Minimum")
+		filter string, // filter is the filter to query for dimensions (e.g. "ActivityType eq '*' AND ActivityName eq '*' AND StatusCode eq '*' AND StatusCodeClass eq '*'")
+	) ([]armmonitor.Metric, string, error)
 }
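For reference, a hedged sketch of a call matching the documented signature; the resource ID, timespan, and `svc` (any concrete Service implementation) are illustrative, and the fragment assumes it sits in a function with an error return:

```go
// Illustrative values only; nothing here is taken from a real subscription.
metrics, usedTimegrain, err := svc.GetMetricValues(
	"/subscriptions/sub-id/resourceGroups/rg/providers/Microsoft.EventHub/namespaces/ns1",
	"Microsoft.EventHub/Namespaces",
	"PT1M",
	"2024-04-29T14:03:00Z/2024-04-29T14:04:00Z",
	[]string{"ActiveConnections"},
	"Average,Maximum,Minimum",
	"", // no dimension filter
)
if err != nil {
	return err
}
_ = usedTimegrain // "PT1M" here; the metric's default if "" had been passed
_ = metrics
```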