Skip to content

Commit

Permalink
Prometheus on Thin (#3445)
Browse files Browse the repository at this point in the history
- Remove deprecated metrics and metrics that community discussions found not to be useful
- Make use of the DependencyLocator for retrieving a singleton of the PrometheusUpdater and PeriodicUpdater
- Change vitals_uptime to vitals_started_at
- Emit cc_staging_requests_total metric
- Apply Prometheus best practices such as naming, base units, using labels, and initialising metrics for discoverability
- Use counter metrics for metrics which do not decrease
- Remove metrics that are emitted on the scheduler VM. Those metrics cannot currently be collected and will still be emitted via statsd
  • Loading branch information
svkrieger authored Nov 20, 2023
1 parent d5effa4 commit aadd26d
Show file tree
Hide file tree
Showing 17 changed files with 358 additions and 443 deletions.
12 changes: 1 addition & 11 deletions app/controllers/internal/metrics_controller.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
require 'prometheus/client'
require 'prometheus/client/formats/text'
require 'cloud_controller/metrics/prometheus_updater'

module VCAP::CloudController
module Internal
Expand All @@ -9,16 +8,7 @@ class MetricsController < RestController::BaseController
get '/internal/v4/metrics', :index

# Action routed from `get '/internal/v4/metrics'`: triggers a metrics update, then responds
# with status 200 and the default Prometheus registry marshalled by Formats::Text.
# NOTE(review): this is a flattened diff view (file header: 1 addition & 11 deletions); removed and
# added lines are interleaved without +/- markers — confirm against the commit before reusing.
def index
# NOTE(review): the statements down to `periodic_updater.update!` appear to be the removed version,
# which built a fresh PeriodicUpdater (with new Statsd/Prometheus updaters) on every request.
periodic_updater = VCAP::CloudController::Metrics::PeriodicUpdater.new(
Time.now.utc,
Steno::Sink::Counter.new,
Steno.logger('cc.api'),
[
VCAP::CloudController::Metrics::StatsdUpdater.new,
VCAP::CloudController::Metrics::PrometheusUpdater.new
]
)
periodic_updater.update!
# NOTE(review): appears to be the single added line — the updater now comes from the DependencyLocator.
CloudController::DependencyLocator.instance.periodic_updater.update!
[200, Prometheus::Client::Formats::Text.marshal(Prometheus::Client.registry)]
end
end
Expand Down
2 changes: 1 addition & 1 deletion app/controllers/internal/staging_completion_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def statsd_updater
end

# Returns the PrometheusUpdater used by this controller.
# NOTE(review): flattened diff view (file header: 1 addition & 1 deletion) — the memoised `.new`
# line looks like the removed version and the DependencyLocator lookup like its replacement; confirm.
def prometheus_updater
@prometheus_updater ||= VCAP::CloudController::Metrics::PrometheusUpdater.new # this should be using singleton
CloudController::DependencyLocator.instance.prometheus_updater
end

attr_reader :stagers
Expand Down
4 changes: 1 addition & 3 deletions app/jobs/diego/sync.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@ module VCAP::CloudController
module Jobs
module Diego
class Sync < VCAP::CloudController::Jobs::CCJob
# Stores the statsd client the job uses to report timings.
# NOTE(review): flattened diff view (file header: 1 addition & 3 deletions) — the two-parameter
# signature and the @prometheus_updater assignment appear to be removed lines, the one-parameter
# signature the added line; as shown the two `def` lines share a single `end`. Confirm against the commit.
def initialize(statsd=Statsd.new, prometheus_updater=VCAP::CloudController::Metrics::PrometheusUpdater.new)
def initialize(statsd=Statsd.new)
@statsd = statsd
@prometheus_updater = prometheus_updater
end

def perform
Expand All @@ -27,7 +26,6 @@ def perform
elapsed_ms = ((finish - start) * 1000).round

@statsd.timing('cc.diego_sync.duration', elapsed_ms)
@prometheus_updater.report_diego_cell_sync_duration(elapsed_ms)
end
end

Expand Down
29 changes: 26 additions & 3 deletions lib/cloud_controller/dependency_locator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,28 @@ def runners
@dependencies[:runners] || register(:runners, VCAP::CloudController::Runners.new(config))
end

# Returns the registered PeriodicUpdater. When none is registered yet, builds one from the
# current UTC time, the locator's log counter, the 'cc.api' Steno logger and the locator's
# statsd/prometheus updaters, and hands it to `register`, whose result is returned.
def periodic_updater
  existing = @dependencies[:periodic_updater]
  return existing if existing

  updater = VCAP::CloudController::Metrics::PeriodicUpdater.new(
    Time.now.utc,
    log_counter,
    Steno.logger('cc.api'),
    statsd_updater,
    prometheus_updater
  )
  register(:periodic_updater, updater)
end

# Returns the registered PrometheusUpdater, creating and registering a new one when
# @dependencies holds none under :prometheus_updater.
# NOTE(review): flattened diff view — the `register ... unless` statement and the bare lookup that
# follows appear to be the removed version; the `||` one-liner appears to be the added line. Confirm.
def prometheus_updater
register(:prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new) unless @dependencies[:prometheus_updater]
@dependencies[:prometheus_updater]
@dependencies[:prometheus_updater] || register(:prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new)
end

# Returns the registered StatsdUpdater. When none is registered yet, wraps the locator's
# statsd client in a new StatsdUpdater and hands it to `register`, whose result is returned.
def statsd_updater
  existing = @dependencies[:statsd_updater]
  return existing if existing

  updater = VCAP::CloudController::Metrics::StatsdUpdater.new(statsd_client)
  register(:statsd_updater, updater)
end

# Returns the registered log counter. When none is registered yet, creates a
# Steno::Sink::Counter and hands it to `register`, whose result is returned.
def log_counter
  existing = @dependencies[:log_counter]
  return existing if existing

  register(:log_counter, Steno::Sink::Counter.new)
end

def stagers
Expand Down Expand Up @@ -328,8 +347,12 @@ def registry_buddy_client
end

# Returns the registered Statsd client. When none is registered, points Statsd.logger at the
# 'statsd.client' Steno logger and registers a client built from the :statsd_host and
# :statsd_port config values.
# NOTE(review): flattened diff view — the leading `@dependencies[:statsd_client] ||` line appears to
# be the removed form and the if/else wrapper (with the logger assignment) the added form; confirm.
def statsd_client
@dependencies[:statsd_client] ||
if @dependencies[:statsd_client]
@dependencies[:statsd_client]
else
# Logger is assigned before the client is constructed.
Statsd.logger = Steno.logger('statsd.client')
register(:statsd_client, Statsd.new(config.get(:statsd_host), config.get(:statsd_port)))
end
end

private
Expand Down
7 changes: 2 additions & 5 deletions lib/cloud_controller/deployment_updater/scheduler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,11 @@ def start
with_error_logging('cc.deployment_updater') do
config = CloudController::DependencyLocator.instance.config
statsd_client = CloudController::DependencyLocator.instance.statsd_client
prometheus_updater = CloudController::DependencyLocator.instance.prometheus_updater

update_step = proc {
update(
update_frequency: config.get(:deployment_updater, :update_frequency_in_seconds),
statsd_client: statsd_client,
prometheus_updater: prometheus_updater
statsd_client: statsd_client
)
}

Expand All @@ -42,7 +40,7 @@ def start

private

def update(update_frequency:, statsd_client:, prometheus_updater:)
def update(update_frequency:, statsd_client:)
logger = Steno.logger('cc.deployment_updater.scheduler')

update_start_time = Time.now
Expand All @@ -54,7 +52,6 @@ def update(update_frequency:, statsd_client:, prometheus_updater:)
## so feed in the entire value!
update_duration_ms = update_duration * 1000
statsd_client.timing('cc.deployments.update.duration', update_duration_ms)
prometheus_updater.report_deployment_duration(update_duration_ms)

logger.info("Update loop took #{update_duration}s")

Expand Down
4 changes: 3 additions & 1 deletion lib/cloud_controller/diego/messenger.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
module VCAP::CloudController
module Diego
class Messenger
# Stores the statsd and prometheus updaters used when reporting staging requests.
# By default the prometheus updater is taken from the DependencyLocator, while the statsd
# updater default is a freshly constructed StatsdUpdater.
# NOTE(review): flattened diff view (file header: 3 additions & 1 deletion) — the one-parameter
# signature appears to be the removed line and the two-parameter signature its replacement; as
# shown the two `def` lines share a single `end`. Confirm against the commit.
def initialize(statsd_updater=VCAP::CloudController::Metrics::StatsdUpdater.new)
def initialize(statsd_updater=VCAP::CloudController::Metrics::StatsdUpdater.new, prometheus_updater=CloudController::DependencyLocator.instance.prometheus_updater)
@statsd_updater = statsd_updater
@prometheus_updater = prometheus_updater
end

def send_stage_request(_config, staging_details)
Expand All @@ -15,6 +16,7 @@ def send_stage_request(_config, staging_details)

bbs_stager_client.stage(staging_guid, staging_details)
@statsd_updater.start_staging_request_received
@prometheus_updater.start_staging_request_received
end

def send_stop_staging_request(staging_guid)
Expand Down
33 changes: 21 additions & 12 deletions lib/cloud_controller/metrics/periodic_updater.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@

module VCAP::CloudController::Metrics
class PeriodicUpdater
def initialize(start_time, log_counter, logger=Steno.logger, updaters=[StatsdUpdater.new, PrometheusUpdater.new])
def initialize(start_time, log_counter, logger, statsd_updater, prometheus_updater)
@start_time = start_time
@updaters = updaters
@statsd_updater = statsd_updater
@prometheus_updater = prometheus_updater
@log_counter = log_counter
@logger = logger
@known_job_queues = {
Expand Down Expand Up @@ -45,9 +46,9 @@ def catch_error
# Reports the number of tasks in RUNNING_STATE and the sum of their memory_in_mb to the updaters.
# NOTE(review): flattened diff view — the first `sum` assignment, the nil guard and the
# `@updaters.each` call appear to be removed lines; the `|| 0` assignment and the two explicit
# updater calls appear to be the added lines. Confirm against the commit.
def update_task_stats
running_tasks = VCAP::CloudController::TaskModel.where(state: VCAP::CloudController::TaskModel::RUNNING_STATE)
running_task_count = running_tasks.count
running_task_memory = running_tasks.sum(:memory_in_mb)
running_task_memory = 0 if running_task_memory.nil?
@updaters.each { |u| u.update_task_stats(running_task_count, running_task_memory) }
# Falls back to 0 when the sum is nil.
running_task_memory = running_tasks.sum(:memory_in_mb) || 0
# statsd receives the memory_in_mb sum unchanged.
@statsd_updater.update_task_stats(running_task_count, running_task_memory)
# prometheus receives the sum multiplied by 1024 * 1024 (memory_in_mb converted to bytes).
@prometheus_updater.update_task_stats(running_task_count, running_task_memory * 1024 * 1024)
end

def update_log_counts
Expand All @@ -58,19 +59,19 @@ def update_log_counts
hash[level_name] = counts.fetch(level_name.to_s, 0)
end

@updaters.each { |u| u.update_log_counts(hash) }
@statsd_updater.update_log_counts(hash)
end

# Reads DeploymentModel.deploying_count and passes it to the updaters' update_deploying_count.
# NOTE(review): flattened diff view — the `@updaters.each` line appears to be the removed version
# and the explicit [@statsd_updater, @prometheus_updater] line its replacement; confirm.
def update_deploying_count
deploying_count = VCAP::CloudController::DeploymentModel.deploying_count

@updaters.each { |u| u.update_deploying_count(deploying_count) }
[@statsd_updater, @prometheus_updater].each { |u| u.update_deploying_count(deploying_count) }
end

# Reads User.count and passes it to the updaters' update_user_count.
# NOTE(review): flattened diff view — the `@updaters.each` line appears to be the removed version
# and the explicit [@statsd_updater, @prometheus_updater] line its replacement; confirm.
def update_user_count
user_count = VCAP::CloudController::User.count

@updaters.each { |u| u.update_user_count(user_count) }
[@statsd_updater, @prometheus_updater].each { |u| u.update_user_count(user_count) }
end

def update_job_queue_length
Expand All @@ -84,13 +85,14 @@ def update_job_queue_length
end

pending_job_count_by_queue.reverse_merge!(@known_job_queues)
@updaters.each { |u| u.update_job_queue_length(pending_job_count_by_queue, total) }
@statsd_updater.update_job_queue_length(pending_job_count_by_queue, total)
@prometheus_updater.update_job_queue_length(pending_job_count_by_queue)
end

# Captures the result of `thread_info` once and passes it to the updaters' update_thread_info.
# NOTE(review): flattened diff view — the `@updaters.each` line appears to be the removed version
# and the explicit [@statsd_updater, @prometheus_updater] line its replacement; confirm.
def update_thread_info
local_thread_info = thread_info

@updaters.each { |u| u.update_thread_info(local_thread_info) }
[@statsd_updater, @prometheus_updater].each { |u| u.update_thread_info(local_thread_info) }
end

def update_failed_job_count
Expand All @@ -104,7 +106,8 @@ def update_failed_job_count
end

failed_jobs_by_queue.reverse_merge!(@known_job_queues)
@updaters.each { |u| u.update_failed_job_count(failed_jobs_by_queue, total) }
@statsd_updater.update_failed_job_count(failed_jobs_by_queue, total)
@prometheus_updater.update_failed_job_count(failed_jobs_by_queue)
end

def update_vitals
Expand All @@ -120,7 +123,13 @@ def update_vitals
num_cores: VCAP::HostSystem.new.num_cores
}

@updaters.each { |u| u.update_vitals(vitals) }
@statsd_updater.update_vitals(vitals)

prom_vitals = vitals.clone
prom_vitals.delete(:uptime)
prom_vitals.delete(:cpu)
prom_vitals[:started_at] = @start_time.to_i
@prometheus_updater.update_vitals(prom_vitals)
end

def thread_info
Expand Down
Loading

0 comments on commit aadd26d

Please sign in to comment.