From b467d906b9b2737f76fa0dd84438e98777ac25f6 Mon Sep 17 00:00:00 2001 From: Philipp Thun Date: Tue, 21 Nov 2023 17:21:42 +0100 Subject: [PATCH 1/5] Don't emit thread_info for Puma The current 'thread_info' is tightly coupled to Thin/EventMachine; don't emit those metrics when running the Puma webserver; maybe we will come up with other thread-related metrics for Puma at a later point in time. --- .../metrics/periodic_updater.rb | 7 +-- .../metrics/prometheus_updater.rb | 52 +++++++++++-------- .../metrics/statsd_updater.rb | 2 +- .../metrics/periodic_updater_spec.rb | 34 ++++++++---- .../metrics/prometheus_updater_spec.rb | 4 +- .../metrics/statsd_updater_spec.rb | 4 +- 6 files changed, 63 insertions(+), 40 deletions(-) diff --git a/lib/cloud_controller/metrics/periodic_updater.rb b/lib/cloud_controller/metrics/periodic_updater.rb index b1780d79c3d..f4cd0c98890 100644 --- a/lib/cloud_controller/metrics/periodic_updater.rb +++ b/lib/cloud_controller/metrics/periodic_updater.rb @@ -90,9 +90,10 @@ def update_job_queue_length end def update_thread_info - local_thread_info = thread_info + return unless VCAP::CloudController::Config.config.get(:webserver) == 'thin' - [@statsd_updater, @prometheus_updater].each { |u| u.update_thread_info(local_thread_info) } + local_thread_info = thread_info_thin + [@statsd_updater, @prometheus_updater].each { |u| u.update_thread_info_thin(local_thread_info) } end def update_failed_job_count @@ -132,7 +133,7 @@ def update_vitals @prometheus_updater.update_vitals(prom_vitals) end - def thread_info + def thread_info_thin threadqueue = EM.instance_variable_get(:@threadqueue) || [] resultqueue = EM.instance_variable_get(:@resultqueue) || [] { diff --git a/lib/cloud_controller/metrics/prometheus_updater.rb b/lib/cloud_controller/metrics/prometheus_updater.rb index 447bd09e948..37c20bbf2ed 100644 --- a/lib/cloud_controller/metrics/prometheus_updater.rb +++ b/lib/cloud_controller/metrics/prometheus_updater.rb @@ -12,12 +12,6 @@ class PrometheusUpdater { type: :histogram, name: :cc_staging_failed_duration_seconds, docstring: 'Durations of failed staging events', buckets: DURATION_BUCKETS }, { type: :gauge, name: :cc_requests_outstanding_total, docstring: 'Requests outstanding' }, { type: :counter, name: :cc_requests_completed_total, docstring: 'Requests completed' }, - { type: :gauge, name: :cc_thread_info_thread_count, docstring: 'Thread count' }, - { type: :gauge, name: :cc_thread_info_event_machine_connection_count, docstring: 'EventMachine connection count' }, - { type: :gauge, name: :cc_thread_info_event_machine_threadqueue_size, docstring: 'EventMachine thread queue size' }, - { type: :gauge, name: :cc_thread_info_event_machine_threadqueue_num_waiting, docstring: 'EventMachine num waiting in thread' }, - { type: :gauge, name: :cc_thread_info_event_machine_resultqueue_size, docstring: 'EventMachine queue size' }, - { type: :gauge, name: :cc_thread_info_event_machine_resultqueue_num_waiting, docstring: 'EventMachine requests waiting in queue' }, { type: :gauge, name: :cc_vitals_started_at, docstring: 'CloudController Vitals: started_at' }, { type: :gauge, name: :cc_vitals_mem_bytes, docstring: 'CloudController Vitals: mem_bytes' }, { type: :gauge, name: :cc_vitals_cpu_load_avg, docstring: 'CloudController Vitals: cpu_load_avg' }, @@ -30,26 +24,21 @@ class PrometheusUpdater { type: :gauge, name: :cc_deployments_in_progress_total, docstring: 'Number of in progress deployments' } ].freeze + THIN_METRICS = [ + { type: :gauge, name: :cc_thread_info_thread_count, docstring: 'Thread count' }, + { type: :gauge, name: :cc_thread_info_event_machine_connection_count, docstring: 'EventMachine connection count' }, + { type: :gauge, name: :cc_thread_info_event_machine_threadqueue_size, docstring: 'EventMachine thread queue size' }, + { type: :gauge, name: :cc_thread_info_event_machine_threadqueue_num_waiting, docstring: 'EventMachine num waiting in thread' }, + { type: :gauge, name: :cc_thread_info_event_machine_resultqueue_size, docstring: 'EventMachine queue size' }, + { type: :gauge, name: :cc_thread_info_event_machine_resultqueue_num_waiting, docstring: 'EventMachine requests waiting in queue' } + ].freeze + def initialize(registry=Prometheus::Client.registry) @registry = registry # Register all metrics, to initialize them for discoverability - METRICS.map do |metric| - register_metric(metric[:type], metric[:name], metric[:docstring], labels: metric[:labels] || {}, buckets: metric[:buckets] || {}) unless @registry.exist?(metric[:name]) - end - end - - def register_metric(type, name, message, labels: {}, buckets: {}) - case type - when :gauge - @registry.gauge(name, docstring: message, labels: labels) - when :counter - @registry.counter(name, docstring: message, labels: labels) - when :histogram - @registry.histogram(name, docstring: message, labels: labels, buckets: buckets) - else - throw ArgumentError("Metric type #{type} does not exist.") - end + METRICS.each { |metric| register(metric) } + THIN_METRICS.each { |metric| register(metric) } if VCAP::CloudController::Config.config.get(:webserver) == 'thin' end def update_gauge_metric(metric, value, labels: {}) @@ -90,7 +79,7 @@ def update_job_queue_length(pending_job_count_by_queue) end end - def update_thread_info(thread_info) + def update_thread_info_thin(thread_info) update_gauge_metric(:cc_thread_info_thread_count, thread_info[:thread_count]) update_gauge_metric(:cc_thread_info_event_machine_connection_count, thread_info[:event_machine][:connection_count]) update_gauge_metric(:cc_thread_info_event_machine_threadqueue_size, thread_info[:event_machine][:threadqueue][:size]) @@ -131,6 +120,23 @@ def report_staging_failure_metrics(duration_ns) private + def register(metric) + register_metric(metric[:type], metric[:name], metric[:docstring], labels: metric[:labels] || {}, buckets: metric[:buckets] || {}) unless @registry.exist?(metric[:name]) + end + + def register_metric(type, name, message, labels: {}, buckets: {}) + case type + when :gauge + @registry.gauge(name, docstring: message, labels: labels) + when :counter + @registry.counter(name, docstring: message, labels: labels) + when :histogram + @registry.histogram(name, docstring: message, labels: labels, buckets: buckets) + else + throw ArgumentError("Metric type #{type} does not exist.") + end + end + def nanoseconds_to_seconds(time_ns) (time_ns / 1e9).to_f end diff --git a/lib/cloud_controller/metrics/statsd_updater.rb b/lib/cloud_controller/metrics/statsd_updater.rb index 49ce3d7aacc..bd5f55ce5cd 100644 --- a/lib/cloud_controller/metrics/statsd_updater.rb +++ b/lib/cloud_controller/metrics/statsd_updater.rb @@ -23,7 +23,7 @@ def update_job_queue_length(pending_job_count_by_queue, total) end end - def update_thread_info(thread_info) + def update_thread_info_thin(thread_info) @statsd.batch do |batch| batch.gauge('cc.thread_info.thread_count', thread_info[:thread_count]) batch.gauge('cc.thread_info.event_machine.connection_count', thread_info[:event_machine][:connection_count]) diff --git a/spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb b/spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb index 6482e27c3df..fd330697ee3 100644 --- a/spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb +++ b/spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb @@ -72,7 +72,7 @@ module VCAP::CloudController::Metrics before do allow(statsd_updater).to receive(:update_user_count) allow(statsd_updater).to receive(:update_job_queue_length) - allow(statsd_updater).to receive(:update_thread_info) + allow(statsd_updater).to receive(:update_thread_info_thin) allow(statsd_updater).to receive(:update_failed_job_count) allow(statsd_updater).to receive(:update_vitals) allow(statsd_updater).to receive(:update_log_counts) @@ -81,7 +81,7 @@ module VCAP::CloudController::Metrics allow(prometheus_updater).to receive(:update_user_count) allow(prometheus_updater).to receive(:update_job_queue_length) - allow(prometheus_updater).to receive(:update_thread_info) + allow(prometheus_updater).to receive(:update_thread_info_thin) allow(prometheus_updater).to receive(:update_failed_job_count) allow(prometheus_updater).to receive(:update_vitals) allow(prometheus_updater).to receive(:update_log_counts) @@ -449,10 +449,8 @@ module VCAP::CloudController::Metrics describe '#update_thread_info' do before do - allow(statsd_updater).to receive(:update_thread_info) - allow(prometheus_updater).to receive(:update_thread_info) - - periodic_updater.update_thread_info + allow(statsd_updater).to receive(:update_thread_info_thin) + allow(prometheus_updater).to receive(:update_thread_info_thin) end it 'contains EventMachine data and send it to all updaters' do @@ -471,8 +469,10 @@ module VCAP::CloudController::Metrics } } - expect(statsd_updater).to have_received(:update_thread_info).with(expected_thread_info) - expect(prometheus_updater).to have_received(:update_thread_info).with(expected_thread_info) + periodic_updater.update_thread_info + + expect(statsd_updater).to have_received(:update_thread_info_thin).with(expected_thread_info) + expect(prometheus_updater).to have_received(:update_thread_info_thin).with(expected_thread_info) end context 'when resultqueue and/or threadqueue is not a queue' do @@ -495,7 +495,23 @@ module VCAP::CloudController::Metrics } } - expect(statsd_updater).to have_received(:update_thread_info).with(expected_thread_info) + periodic_updater.update_thread_info + + expect(statsd_updater).to have_received(:update_thread_info_thin).with(expected_thread_info) + expect(prometheus_updater).to have_received(:update_thread_info_thin).with(expected_thread_info) + end + end + + context 'when Puma is configured as webserver' do + before do + TestConfig.override(webserver: 'puma') + end + + it 'does not send EventMachine data to updaters' do + periodic_updater.update_thread_info + + expect(statsd_updater).not_to have_received(:update_thread_info_thin) + expect(prometheus_updater).not_to have_received(:update_thread_info_thin) end end end diff --git a/spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb b/spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb index c2bac1460c9..49d6ddd6f83 100644 --- a/spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb +++ b/spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb @@ -92,7 +92,7 @@ module VCAP::CloudController::Metrics end end - describe '#update_thread_info' do + describe '#update_thread_info_thin' do it 'contains EventMachine data' do thread_info = { thread_count: 5, @@ -109,7 +109,7 @@ module VCAP::CloudController::Metrics } } - updater.update_thread_info(thread_info) + updater.update_thread_info_thin(thread_info) metric = prom_client.metrics.find { |m| m.name == :cc_thread_info_thread_count } expect(metric.get).to eq 5 diff --git a/spec/unit/lib/cloud_controller/metrics/statsd_updater_spec.rb b/spec/unit/lib/cloud_controller/metrics/statsd_updater_spec.rb index 4f8faefb30b..bcdb766738f 100644 --- a/spec/unit/lib/cloud_controller/metrics/statsd_updater_spec.rb +++ b/spec/unit/lib/cloud_controller/metrics/statsd_updater_spec.rb @@ -86,7 +86,7 @@ module VCAP::CloudController::Metrics end end - describe '#update_thread_info' do + describe '#update_thread_info_thin' do let(:batch) { double(:batch) } before do @@ -110,7 +110,7 @@ module VCAP::CloudController::Metrics } } - updater.update_thread_info(thread_info) + updater.update_thread_info_thin(thread_info) expect(batch).to have_received(:gauge).with('cc.thread_info.thread_count', 5) expect(batch).to have_received(:gauge).with('cc.thread_info.event_machine.connection_count', 10) From 5ea1f64b757001bd0e7dbc2cef6dfe411d2efbde Mon Sep 17 00:00:00 2001 From: Philipp Thun Date: Thu, 9 Nov 2023 14:03:21 +0100 Subject: [PATCH 2/5] Adapt vitals (mem and cpu) for Puma Return the summed up memory bytes and cpu for the Puma process and its worker subprocesses. --- lib/vcap/stats.rb | 25 ++++++++++++++++++++++--- spec/unit/lib/vcap/stats_spec.rb | 29 +++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 spec/unit/lib/vcap/stats_spec.rb diff --git a/lib/vcap/stats.rb b/lib/vcap/stats.rb index 59529e78315..72d1cb77b22 100644 --- a/lib/vcap/stats.rb +++ b/lib/vcap/stats.rb @@ -5,9 +5,14 @@ module VCAP class Stats class << self def process_memory_bytes_and_cpu - rss, pcpu = `ps -o rss=,pcpu= -p #{Process.pid}`.split.map(&:to_i) - rss_bytes = rss * 1024 - [rss_bytes, pcpu] + rss = [] + pcpu = [] + + ps_out = ps_pid + ps_out += ps_ppid if VCAP::CloudController::Config.config.get(:webserver) == 'puma' + ps_out.split.each_with_index { |e, i| i.even? ? rss << e : pcpu << e } + + [rss.map(&:to_i).sum * 1024, pcpu.map(&:to_f).sum.round] end def memory_used_bytes @@ -23,6 +28,20 @@ def memory_free_bytes def cpu_load_average Vmstat.load_average.one_minute end + + private + + def ps_pid + `ps -o rss=,pcpu= -p #{Process.pid}` + end + + def ps_ppid + if RUBY_PLATFORM.match?(/darwin/) + `ps ax -o ppid,rss,pcpu | awk '$1 == #{Process.pid} { print $2,$3 }'` + else + `ps -o rss=,pcpu= --ppid #{Process.pid}` + end + end end end end diff --git a/spec/unit/lib/vcap/stats_spec.rb b/spec/unit/lib/vcap/stats_spec.rb new file mode 100644 index 00000000000..bf6b78de5e7 --- /dev/null +++ b/spec/unit/lib/vcap/stats_spec.rb @@ -0,0 +1,29 @@ +require 'spec_helper' + +RSpec.describe VCAP::Stats do + describe '#process_memory_bytes_and_cpu' do + before do + allow(VCAP::Stats).to receive_messages(ps_pid: "123456 7.8\n", ps_ppid: "121212 3.4\n343434 5.6\n") + end + + it 'returns the memory bytes and cpu for the process' do + rss_bytes, pcpu = VCAP::Stats.process_memory_bytes_and_cpu + + expect(rss_bytes).to eq(126_418_944) + expect(pcpu).to eq(8) + end + + context 'when Puma is configured as webserver' do + before do + TestConfig.override(webserver: 'puma') + end + + it 'returns the summed up memory bytes and cpu for the process and its subprocesses' do + rss_bytes, pcpu = VCAP::Stats.process_memory_bytes_and_cpu + + expect(rss_bytes).to eq(602_216_448) + expect(pcpu).to eq(17) + end + end + end +end From bcf4bca25851694e27d21ce5ae8a109b4fd7ac95 Mon Sep 17 00:00:00 2001 From: Philipp Thun Date: Fri, 24 Nov 2023 16:15:36 +0100 Subject: [PATCH 3/5] Use statsd_updater and statsd_client from DependencyLocator - Replace VCAP::CloudController::Metrics::StatsdUpdater.new with CloudController::DependencyLocator.instance.statsd_updater - Replace Statsd.new with CloudController::DependencyLocator.instance. statsd_client - Add statsd_host + statsd_port to ClockSchema; don't rely on default values --- app/controllers/internal/staging_completion_controller.rb | 2 +- app/jobs/diego/sync.rb | 2 +- lib/cloud_controller/config_schemas/base/clock_schema.rb | 3 +++ lib/cloud_controller/diego/messenger.rb | 2 +- lib/cloud_controller/diego/processes_sync.rb | 2 +- lib/cloud_controller/metrics/request_metrics.rb | 2 +- lib/cloud_controller/metrics/statsd_updater.rb | 2 +- spec/unit/jobs/diego/sync_spec.rb | 5 ++++- spec/unit/lib/cloud_controller/clock/scheduler_spec.rb | 1 + 9 files changed, 14 insertions(+), 7 deletions(-) diff --git a/app/controllers/internal/staging_completion_controller.rb b/app/controllers/internal/staging_completion_controller.rb index 7694b8b130f..4ba721bd5a1 100644 --- a/app/controllers/internal/staging_completion_controller.rb +++ b/app/controllers/internal/staging_completion_controller.rb @@ -111,7 +111,7 @@ def report_metrics(bbs_staging_response) end def statsd_updater - @statsd_updater ||= VCAP::CloudController::Metrics::StatsdUpdater.new + CloudController::DependencyLocator.instance.statsd_updater end def prometheus_updater diff --git a/app/jobs/diego/sync.rb b/app/jobs/diego/sync.rb index 629ee2ca9c2..f13d2ad0858 100644 --- a/app/jobs/diego/sync.rb +++ b/app/jobs/diego/sync.rb @@ -6,7 +6,7 @@ module VCAP::CloudController module Jobs module Diego class Sync < VCAP::CloudController::Jobs::CCJob - def initialize(statsd=Statsd.new) + def initialize(statsd=CloudController::DependencyLocator.instance.statsd_client) @statsd = statsd end diff --git a/lib/cloud_controller/config_schemas/base/clock_schema.rb b/lib/cloud_controller/config_schemas/base/clock_schema.rb index c028fc44576..2ddac9017fb 100644 --- a/lib/cloud_controller/config_schemas/base/clock_schema.rb +++ b/lib/cloud_controller/config_schemas/base/clock_schema.rb @@ -175,6 +175,9 @@ class ClockSchema < VCAP::Config optional(:priorities) => Hash }, + statsd_host: String, + statsd_port: Integer, + max_labels_per_resource: Integer, max_annotations_per_resource: Integer, custom_metric_tag_prefix_list: Array diff --git a/lib/cloud_controller/diego/messenger.rb b/lib/cloud_controller/diego/messenger.rb index 576790aa54d..985ebedc7d4 100644 --- a/lib/cloud_controller/diego/messenger.rb +++ b/lib/cloud_controller/diego/messenger.rb @@ -4,7 +4,7 @@ module VCAP::CloudController module Diego class Messenger - def initialize(statsd_updater=VCAP::CloudController::Metrics::StatsdUpdater.new, prometheus_updater=CloudController::DependencyLocator.instance.prometheus_updater) + def initialize(statsd_updater=CloudController::DependencyLocator.instance.statsd_updater, prometheus_updater=CloudController::DependencyLocator.instance.prometheus_updater) @statsd_updater = statsd_updater @prometheus_updater = prometheus_updater end diff --git a/lib/cloud_controller/diego/processes_sync.rb b/lib/cloud_controller/diego/processes_sync.rb index 0a30b7cbd54..6393dc11310 100644 --- a/lib/cloud_controller/diego/processes_sync.rb +++ b/lib/cloud_controller/diego/processes_sync.rb @@ -10,7 +10,7 @@ class Error < StandardError class BBSFetchError < Error end - def initialize(config:, statsd_updater: VCAP::CloudController::Metrics::StatsdUpdater.new) + def initialize(config:, statsd_updater: CloudController::DependencyLocator.instance.statsd_updater) @config = config @workpool = WorkPool.new(50, store_exceptions: true) @statsd_updater = statsd_updater diff --git a/lib/cloud_controller/metrics/request_metrics.rb b/lib/cloud_controller/metrics/request_metrics.rb index 186cf9f5379..2d3e597a272 100644 --- a/lib/cloud_controller/metrics/request_metrics.rb +++ b/lib/cloud_controller/metrics/request_metrics.rb @@ -3,7 +3,7 @@ module VCAP::CloudController module Metrics class RequestMetrics - def initialize(statsd=Statsd.new, prometheus_updater=CloudController::DependencyLocator.instance.prometheus_updater) + def initialize(statsd=CloudController::DependencyLocator.instance.statsd_client, prometheus_updater=CloudController::DependencyLocator.instance.prometheus_updater) @counter = 0 @statsd = statsd @prometheus_updater = prometheus_updater diff --git a/lib/cloud_controller/metrics/statsd_updater.rb b/lib/cloud_controller/metrics/statsd_updater.rb index bd5f55ce5cd..0c2de56f6c6 100644 --- a/lib/cloud_controller/metrics/statsd_updater.rb +++ b/lib/cloud_controller/metrics/statsd_updater.rb @@ -2,7 +2,7 @@ module VCAP::CloudController::Metrics class StatsdUpdater - def initialize(statsd=Statsd.new) + def initialize(statsd=CloudController::DependencyLocator.instance.statsd_client) @statsd = statsd end diff --git a/spec/unit/jobs/diego/sync_spec.rb b/spec/unit/jobs/diego/sync_spec.rb index 67e2c9ab004..52dfce9f634 100644 --- a/spec/unit/jobs/diego/sync_spec.rb +++ b/spec/unit/jobs/diego/sync_spec.rb @@ -3,6 +3,7 @@ module VCAP::CloudController module Jobs::Diego RSpec.describe Sync, job_context: :clock do + let(:statsd_client) { instance_double(Statsd) } let(:processes_sync) { instance_double(Diego::ProcessesSync) } let(:tasks_sync) { instance_double(Diego::ProcessesSync) } @@ -10,9 +11,11 @@ module Jobs::Diego describe '#perform' do before do + allow_any_instance_of(CloudController::DependencyLocator).to receive(:statsd_client).and_return(statsd_client) allow(Diego::ProcessesSync).to receive(:new).and_return(processes_sync) allow(Diego::TasksSync).to receive(:new).and_return(tasks_sync) + allow(statsd_client).to receive(:timing) allow(processes_sync).to receive(:sync) allow(tasks_sync).to receive(:sync) end @@ -35,7 +38,7 @@ module Jobs::Diego expect(processes_sync).to receive(:sync) expect(tasks_sync).to receive(:sync) expect(Time).to receive(:now).twice # Ensure that we get two time measurements. _Hopefully_ they get turned into an elapsed time and passed in where they need to be! - expect_any_instance_of(Statsd).to receive(:timing).with('cc.diego_sync.duration', kind_of(Numeric)) + expect(statsd_client).to receive(:timing).with('cc.diego_sync.duration', kind_of(Numeric)) job.perform end diff --git a/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb b/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb index 917e5fa3f70..544e7f0a391 100644 --- a/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb +++ b/spec/unit/lib/cloud_controller/clock/scheduler_spec.rb @@ -157,6 +157,7 @@ module VCAP::CloudController end it 'schedules the frequent inline jobs' do + allow_any_instance_of(CloudController::DependencyLocator).to receive(:statsd_client).and_return(instance_double(Statsd)) allow(clock).to receive(:schedule_daily_job) allow(clock).to receive(:schedule_frequent_worker_job) expect(clock).to receive(:schedule_frequent_inline_job) do |args, &block| From 3b1e5432ac835e028c4def50755ffc7103beb974 Mon Sep 17 00:00:00 2001 From: Philipp Thun Date: Fri, 24 Nov 2023 16:16:33 +0100 Subject: [PATCH 4/5] Don't run periodic_updater on Puma workers When using Puma, the periodic_updater shall run in the main process only, thus direct invocations of 'update!' from a controller (executed within a worker process) must be removed. --- app/controllers/internal/metrics_controller.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/controllers/internal/metrics_controller.rb b/app/controllers/internal/metrics_controller.rb index b9c9ac4e730..1c7dde09a4f 100644 --- a/app/controllers/internal/metrics_controller.rb +++ b/app/controllers/internal/metrics_controller.rb @@ -8,7 +8,7 @@ class MetricsController < RestController::BaseController get '/internal/v4/metrics', :index def index - CloudController::DependencyLocator.instance.periodic_updater.update! + CloudController::DependencyLocator.instance.periodic_updater.update! unless VCAP::CloudController::Config.config.get(:webserver) == 'puma' [200, Prometheus::Client::Formats::Text.marshal(Prometheus::Client.registry)] end end From 64bb2242dc6761677e149b409f991632b08c160e Mon Sep 17 00:00:00 2001 From: Philipp Thun Date: Fri, 24 Nov 2023 16:28:26 +0100 Subject: [PATCH 5/5] Use DirectFileStore and suitable aggregation methods - When running Puma, the DirectFileStore data store is used for prometheus metrics. - Aggregation methods for (gauge) metrics are specified and applied when using the DirectFileStore. - For 'cc_requests_outstanding_total' the values per worker are summed up. - For metrics that are collected by the periodic_updater running in the main process, the aggregation method MOST_RECENT has been specified to eliminate the unnecessary 'pid' label. --- .../metrics/prometheus_updater.rb | 41 +++++++++++-------- lib/cloud_controller/runner.rb | 20 +++++++++ 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/lib/cloud_controller/metrics/prometheus_updater.rb b/lib/cloud_controller/metrics/prometheus_updater.rb index 37c20bbf2ed..64d0bc17b24 100644 --- a/lib/cloud_controller/metrics/prometheus_updater.rb +++ b/lib/cloud_controller/metrics/prometheus_updater.rb @@ -5,23 +5,23 @@ class PrometheusUpdater DURATION_BUCKETS = [5, 10, 30, 60, 300, 600, 890].freeze METRICS = [ - { type: :gauge, name: :cc_job_queues_length_total, docstring: 'Job queues length of worker processes', labels: [:queue] }, - { type: :gauge, name: :cc_failed_jobs_total, docstring: 'Number of failed jobs of worker processes', labels: [:queue] }, + { type: :gauge, name: :cc_job_queues_length_total, docstring: 'Job queues length of worker processes', labels: [:queue], aggregation: :most_recent }, + { type: :gauge, name: :cc_failed_jobs_total, docstring: 'Number of failed jobs of worker processes', labels: [:queue], aggregation: :most_recent }, { type: :counter, name: :cc_staging_requests_total, docstring: 'Number of staging requests' }, { type: :histogram, name: :cc_staging_succeeded_duration_seconds, docstring: 'Durations of successful staging events', buckets: DURATION_BUCKETS }, { type: :histogram, name: :cc_staging_failed_duration_seconds, docstring: 'Durations of failed staging events', buckets: DURATION_BUCKETS }, - { type: :gauge, name: :cc_requests_outstanding_total, docstring: 'Requests outstanding' }, + { type: :gauge, name: :cc_requests_outstanding_total, docstring: 'Requests outstanding', aggregation: :sum }, { type: :counter, name: :cc_requests_completed_total, docstring: 'Requests completed' }, - { type: :gauge, name: :cc_vitals_started_at, docstring: 'CloudController Vitals: started_at' }, - { type: :gauge, name: :cc_vitals_mem_bytes, docstring: 'CloudController Vitals: mem_bytes' }, - { type: :gauge, name: :cc_vitals_cpu_load_avg, docstring: 'CloudController Vitals: cpu_load_avg' }, - { type: :gauge, name: :cc_vitals_mem_used_bytes, docstring: 'CloudController Vitals: mem_used_bytes' }, - { type: :gauge, name: :cc_vitals_mem_free_bytes, docstring: 'CloudController Vitals: mem_free_bytes' }, - { type: :gauge, name: :cc_vitals_num_cores, docstring: 'CloudController Vitals: num_cores' }, - { type: :gauge, name: :cc_running_tasks_total, docstring: 'Total running tasks' }, - { type: :gauge, name: :cc_running_tasks_memory_bytes, docstring: 'Total memory consumed by running tasks' }, - { type: :gauge, name: :cc_users_total, docstring: 'Number of users' }, - { type: :gauge, name: :cc_deployments_in_progress_total, docstring: 'Number of in progress deployments' } + { type: :gauge, name: :cc_vitals_started_at, docstring: 'CloudController Vitals: started_at', aggregation: :most_recent }, + { type: :gauge, name: :cc_vitals_mem_bytes, docstring: 'CloudController Vitals: mem_bytes', aggregation: :most_recent }, + { type: :gauge, name: :cc_vitals_cpu_load_avg, docstring: 'CloudController Vitals: cpu_load_avg', aggregation: :most_recent }, + { type: :gauge, name: :cc_vitals_mem_used_bytes, docstring: 'CloudController Vitals: mem_used_bytes', aggregation: :most_recent }, + { type: :gauge, name: :cc_vitals_mem_free_bytes, docstring: 'CloudController Vitals: mem_free_bytes', aggregation: :most_recent }, + { type: :gauge, name: :cc_vitals_num_cores, docstring: 'CloudController Vitals: num_cores', aggregation: :most_recent }, + { type: :gauge, name: :cc_running_tasks_total, docstring: 'Total running tasks', aggregation: :most_recent }, + { type: :gauge, name: :cc_running_tasks_memory_bytes, docstring: 'Total memory consumed by running tasks', aggregation: :most_recent }, + { type: :gauge, name: :cc_users_total, docstring: 'Number of users', aggregation: :most_recent }, + { type: :gauge, name: :cc_deployments_in_progress_total, docstring: 'Number of in progress deployments', aggregation: :most_recent } ].freeze THIN_METRICS = [ @@ -121,17 +121,22 @@ def report_staging_failure_metrics(duration_ns) private def register(metric) - register_metric(metric[:type], metric[:name], metric[:docstring], labels: metric[:labels] || {}, buckets: metric[:buckets] || {}) unless @registry.exist?(metric[:name]) + return if @registry.exist?(metric[:name]) + + register_metric(metric[:type], metric[:name], metric[:docstring], labels: metric[:labels] || [], buckets: metric[:buckets] || [], aggregation: metric[:aggregation]) end - def register_metric(type, name, message, labels: {}, buckets: {}) + def register_metric(type, name, message, labels:, buckets:, aggregation:) + store_settings = {} + store_settings[:aggregation] = aggregation if aggregation.present? && Prometheus::Client.config.data_store.instance_of?(Prometheus::Client::DataStores::DirectFileStore) + case type when :gauge - @registry.gauge(name, docstring: message, labels: labels) + @registry.gauge(name, docstring: message, labels: labels, store_settings: store_settings) when :counter - @registry.counter(name, docstring: message, labels: labels) + @registry.counter(name, docstring: message, labels: labels, store_settings: store_settings) when :histogram - @registry.histogram(name, docstring: message, labels: labels, buckets: buckets) + @registry.histogram(name, docstring: message, labels: labels, buckets: buckets, store_settings: store_settings) else throw ArgumentError("Metric type #{type} does not exist.") end diff --git a/lib/cloud_controller/runner.rb b/lib/cloud_controller/runner.rb index c8e1bf72ebe..c90d91c53b9 100644 --- a/lib/cloud_controller/runner.rb +++ b/lib/cloud_controller/runner.rb @@ -13,6 +13,7 @@ require 'cloud_controller/secrets_fetcher' require 'cloud_controller/runners/thin_runner' require 'cloud_controller/runners/puma_runner' +require 'prometheus/client/data_stores/direct_file_store' module VCAP::CloudController class Runner @@ -95,6 +96,7 @@ def run! private def setup_cloud_controller + setup_metrics setup_logging setup_telemetry_logging setup_db @@ -112,6 +114,24 @@ def create_pidfile raise "ERROR: Can't create pid file #{@config.get(:pid_filename)}" end + def setup_metrics + return if @setup_metrics + + @setup_metrics = true + + return unless @config.get(:webserver) == 'puma' + + prometheus_dir = File.join(@config.get(:directories, :tmpdir), 'prometheus') + FileUtils.mkdir_p(prometheus_dir) + + # Resetting metrics on startup + Dir["#{prometheus_dir}/*.bin"].each do |file_path| + File.unlink(file_path) + end + + Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: prometheus_dir) + end + def setup_logging return if @setup_logging