Skip to content

Commit

Permalink
Expose Puma Stats as Prometheus Metrics
Browse files Browse the repository at this point in the history
The PeriodicUpdater fetches statistics [1] from Puma every 30 seconds
and uses the PrometheusUpdater to send the following metrics:

- cc_puma_worker_count
- cc_puma_worker_started_at (per worker with labels index and pid)
- cc_puma_worker_thread_count (per worker)
- cc_puma_worker_backlog (per worker)
  - docs: "requests that are waiting for an available thread"
  - seems like a good scaling indicator

Because we want to label worker metrics with each worker's pid (even though
the metrics are emitted by the main process), the BASE_RESERVED_LABELS constant
within Prometheus::Client::LabelSetValidator had to be modified (monkey-patched).

[1] https://puma.io/puma/file.stats.html
  • Loading branch information
philippthun committed Dec 5, 2023
1 parent 64115aa commit 118df23
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 0 deletions.
20 changes: 20 additions & 0 deletions lib/cloud_controller/metrics/periodic_updater.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def setup_updates
EM.add_periodic_timer(30) { catch_error { update_log_counts } }
EM.add_periodic_timer(30) { catch_error { update_task_stats } }
EM.add_periodic_timer(30) { catch_error { update_deploying_count } }
EM.add_periodic_timer(30) { catch_error { update_webserver_stats } }
end

def update!
Expand All @@ -35,6 +36,7 @@ def update!
update_log_counts
update_task_stats
update_deploying_count
update_webserver_stats
end

def catch_error
Expand Down Expand Up @@ -133,6 +135,24 @@ def update_vitals
@prometheus_updater.update_vitals(prom_vitals)
end

# Reads the current statistics from Puma and forwards the booted worker count plus one
# stats hash per worker to the Prometheus updater.
# Returns early (without touching Puma) unless the configured webserver is 'puma'.
def update_webserver_stats
  return unless VCAP::CloudController::Config.config.get(:webserver) == 'puma'

  puma_stats = Puma.stats_hash

  per_worker = puma_stats[:worker_status].map do |worker|
    last_status = worker[:last_status]
    {
      # started_at arrives as a timestamp string; it is published as epoch seconds.
      started_at: Time.parse(worker[:started_at]).utc.to_i,
      index: worker[:index],
      pid: worker[:pid],
      thread_count: last_status[:running],
      backlog: last_status[:backlog]
    }
  end

  @prometheus_updater.update_webserver_stats_puma(puma_stats[:booted_workers], per_worker)
end

def thread_info_thin
threadqueue = EM.instance_variable_get(:@threadqueue) || []
resultqueue = EM.instance_variable_get(:@resultqueue) || []
Expand Down
37 changes: 37 additions & 0 deletions lib/cloud_controller/metrics/prometheus_updater.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,19 @@

module VCAP::CloudController::Metrics
class PrometheusUpdater
# Worker metrics are labelled with the worker's pid, but the Prometheus client reserves
# that label by default. This swaps BASE_RESERVED_LABELS for a frozen copy without :pid;
# when :pid is not (or no longer) reserved, nothing is changed.
# The pid label should still be used with caution, i.e. only for metrics that are
# aggregated and thus don't have the pid label set automatically!
def self.allow_pid_label
  validator = Prometheus::Client::LabelSetValidator
  current_labels = validator.const_get(:BASE_RESERVED_LABELS)
  return unless current_labels.include?(:pid)

  permitted = current_labels.reject { |label| label == :pid }.freeze
  # The constant has to be removed first, otherwise const_set warns about re-initialization.
  validator.send(:remove_const, :BASE_RESERVED_LABELS)
  validator.const_set(:BASE_RESERVED_LABELS, permitted)
end

# Frozen list of bucket boundaries.
# NOTE(review): presumably histogram buckets for durations measured in seconds — confirm
# against the metric definitions that reference this constant.
DURATION_BUCKETS = [5, 10, 30, 60, 300, 600, 890].freeze

METRICS = [
Expand Down Expand Up @@ -33,12 +46,22 @@ class PrometheusUpdater
{ type: :gauge, name: :cc_thread_info_event_machine_resultqueue_num_waiting, docstring: 'EventMachine requests waiting in queue' }
].freeze

# Gauges that are only registered when Puma is the configured webserver.
# All of them use the :most_recent aggregation; the per-worker gauges carry the
# worker's index and pid as labels.
PUMA_METRICS = [
  {
    type: :gauge,
    name: :cc_puma_worker_count,
    docstring: 'Puma worker count',
    aggregation: :most_recent
  },
  {
    type: :gauge,
    name: :cc_puma_worker_started_at,
    docstring: 'Puma worker: started_at',
    labels: %i[index pid],
    aggregation: :most_recent
  },
  {
    type: :gauge,
    name: :cc_puma_worker_thread_count,
    docstring: 'Puma worker: thread count',
    labels: %i[index pid],
    aggregation: :most_recent
  },
  {
    type: :gauge,
    name: :cc_puma_worker_backlog,
    docstring: 'Puma worker: backlog',
    labels: %i[index pid],
    aggregation: :most_recent
  }
].freeze

# Builds an updater that writes to the given registry (the client's default registry
# unless another one is passed in).
def initialize(registry=Prometheus::Client.registry)
  # The :pid label must be permitted before any metric using it gets registered.
  self.class.allow_pid_label

  @registry = registry

  # Every metric is registered up front so that it is initialized and discoverable.
  METRICS.each { |metric| register(metric) }

  # Webserver-specific metrics are only registered for the configured webserver.
  webserver_metrics = case VCAP::CloudController::Config.config.get(:webserver)
                      when 'thin' then THIN_METRICS
                      when 'puma' then PUMA_METRICS
                      else []
                      end
  webserver_metrics.each { |metric| register(metric) }
end

def update_gauge_metric(metric, value, labels: {})
Expand Down Expand Up @@ -106,6 +129,20 @@ def update_task_stats(total_running_tasks, total_memory_in_bytes)
update_gauge_metric(:cc_running_tasks_memory_bytes, total_memory_in_bytes)
end

# Publishes Puma statistics as gauges.
#
# @param worker_count value written to the cc_puma_worker_count gauge
# @param worker_stats [Array<Hash>] one hash per worker; :index and :pid are used as labels,
#   every other key/value pair is written to the gauge named cc_puma_worker_<key>
# @return [Array<Hash>] the worker_stats argument, unchanged
#
# The hashes are only read and never modified, so callers can keep using them afterwards
# and may pass frozen hashes.
def update_webserver_stats_puma(worker_count, worker_stats)
  update_gauge_metric(:cc_puma_worker_count, worker_count)

  worker_stats.each do |stats|
    index = stats[:index]
    pid = stats[:pid]

    # except returns a copy without the label keys instead of deleting them from the input.
    stats.except(:index, :pid).each do |key, value|
      metric_key = :"cc_puma_worker_#{key.to_s.underscore}"
      update_gauge_metric(metric_key, value, labels: { index:, pid: })
    end
  end
end

# Calls increment_counter_metric for the :cc_staging_requests_total counter.
def start_staging_request_received
increment_counter_metric(:cc_staging_requests_total)
end
Expand Down
41 changes: 41 additions & 0 deletions spec/unit/lib/cloud_controller/metrics/periodic_updater_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,46 @@ module VCAP::CloudController::Metrics
end
end

describe '#update_webserver_stats' do
  before do
    allow(prometheus_updater).to receive(:update_webserver_stats_puma)
  end

  context 'when Puma is configured as webserver' do
    before do
      TestConfig.override(webserver: 'puma')
    end

    it 'sends stats to the prometheus updater' do
      stats_hash = {
        booted_workers: 2,
        worker_status: [
          { started_at: '2023-11-29T13:15:05Z', index: 0, pid: 123, last_status: { running: 1, backlog: 0 } },
          { started_at: '2023-11-29T13:15:10Z', index: 1, pid: 234, last_status: { running: 2, backlog: 1 } }
        ]
      }
      allow(Puma).to receive(:stats_hash).and_return(stats_hash)

      periodic_updater.update_webserver_stats

      # started_at is expected as epoch seconds, last_status[:running] as thread_count.
      expected_worker_count = 2
      expected_worker_stats = [
        { started_at: 1_701_263_705, index: 0, pid: 123, thread_count: 1, backlog: 0 },
        { started_at: 1_701_263_710, index: 1, pid: 234, thread_count: 2, backlog: 1 }
      ]
      expect(prometheus_updater).to have_received(:update_webserver_stats_puma).with(expected_worker_count, expected_worker_stats)
    end
  end

  context 'when Thin is configured as webserver' do
    # Set the webserver explicitly so this example does not depend on the default test
    # configuration; with a different default it would exercise the Puma path instead.
    before do
      TestConfig.override(webserver: 'thin')
    end

    it 'does not send stats to the prometheus updater' do
      periodic_updater.update_webserver_stats

      expect(prometheus_updater).not_to have_received(:update_webserver_stats_puma)
    end
  end
end

describe '#update!' do
it 'calls all update methods' do
expect(periodic_updater).to receive(:update_user_count).once
Expand All @@ -610,6 +650,7 @@ module VCAP::CloudController::Metrics
expect(periodic_updater).to receive(:update_log_counts).once
expect(periodic_updater).to receive(:update_task_stats).once
expect(periodic_updater).to receive(:update_deploying_count).once
expect(periodic_updater).to receive(:update_webserver_stats).once

periodic_updater.update!
end
Expand Down
31 changes: 31 additions & 0 deletions spec/unit/lib/cloud_controller/metrics/prometheus_updater_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,37 @@ module VCAP::CloudController::Metrics
end
end

describe '#update_webserver_stats_puma' do
  before do
    TestConfig.override(webserver: 'puma')
  end

  it 'contains Puma stats' do
    updater.update_webserver_stats_puma(
      2,
      [
        { started_at: 1_701_263_705, index: 0, pid: 123, thread_count: 1, backlog: 0 },
        { started_at: 1_701_263_710, index: 1, pid: 234, thread_count: 2, backlog: 1 }
      ]
    )

    # Looks up a registered metric by name.
    gauge = ->(name) { prom_client.metrics.find { |m| m.name == name } }
    first_worker = { index: 0, pid: 123 }
    second_worker = { index: 1, pid: 234 }

    expect(gauge.call(:cc_puma_worker_count).get).to eq(2)

    started_at = gauge.call(:cc_puma_worker_started_at)
    expect(started_at.get(labels: first_worker)).to eq(1_701_263_705)
    expect(started_at.get(labels: second_worker)).to eq(1_701_263_710)

    thread_count = gauge.call(:cc_puma_worker_thread_count)
    expect(thread_count.get(labels: first_worker)).to eq(1)
    expect(thread_count.get(labels: second_worker)).to eq(2)

    backlog = gauge.call(:cc_puma_worker_backlog)
    expect(backlog.get(labels: first_worker)).to eq(0)
    expect(backlog.get(labels: second_worker)).to eq(1)
  end
end

describe '#start_staging_request_received' do
it 'increments "cc_staging_requests_total"' do
updater.start_staging_request_received
Expand Down

0 comments on commit 118df23

Please sign in to comment.