From fe5e1dd2cf506d6e6846e71ffc8a45991dda5ce1 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Sun, 7 Apr 2024 15:27:05 +0200 Subject: [PATCH] migrate: misc migrations across all dashboards --- dashboards/cluster.jsonnet | 4 +- dashboards/jupyterhub.jsonnet | 141 ++++++++----------- dashboards/jupyterhub.libsonnet | 1 + dashboards/support.jsonnet | 3 +- dashboards/usage-report.jsonnet | 10 +- dashboards/user.jsonnet | 15 +- global-dashboards/global-usage-stats.jsonnet | 19 ++- 7 files changed, 89 insertions(+), 104 deletions(-) diff --git a/dashboards/cluster.jsonnet b/dashboards/cluster.jsonnet index 656dae7..bb4446e 100755 --- a/dashboards/cluster.jsonnet +++ b/dashboards/cluster.jsonnet @@ -431,7 +431,7 @@ dashboard.new('Cluster Information') [ row.new('Cluster Utilization') + row.withPanels([ - userPods, + userPods, // FIXME: previously width 24 userNodes, nodepoolMemoryCommitment, nodepoolCPUCommitment, @@ -450,6 +450,6 @@ dashboard.new('Cluster Information') ]), ], panelWidth=12, - panelHeight=8, + panelHeight=10, ) ) diff --git a/dashboards/jupyterhub.jsonnet b/dashboards/jupyterhub.jsonnet index aacad20..2578b3c 100755 --- a/dashboards/jupyterhub.jsonnet +++ b/dashboards/jupyterhub.jsonnet @@ -22,8 +22,7 @@ local currentActiveUsers = ||| ) + ts.standardOptions.withDecimals(0) - // stack=true, - + ts.standardOptions.withMin(0) + // FIXME: not migrated config stack=true, + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -50,10 +49,9 @@ local dailyActiveUsers = Requires JupyterHub 3.1. |||, ) - // legend_hideZero=false, + // FIXME: not migrated config legend_hideZero=false, + ts.standardOptions.withDecimals(0) - // stack=true, - + ts.standardOptions.withMin(0) + // FIXME: not migrated config stack=true, + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -76,10 +74,9 @@ local weeklyActiveUsers = Requires JupyterHub 3.1. ||| ) - // legend_hideZero=false, + // FIXME: not migrated config legend_hideZero=false, + ts.standardOptions.withDecimals(0) - // stack=true, - + ts.standardOptions.withMin(0) + // FIXME: not migrated config stack=true, + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -102,10 +99,9 @@ local monthlyActiveUsers = Requires JupyterHub 3.1. 
||| ) - // legend_hideZero=false, + // FIXME: not migrated config legend_hideZero=false, + ts.standardOptions.withDecimals(0) - // stack=true, - + ts.standardOptions.withMin(0) + // FIXME: not migrated config stack=true, + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -121,11 +117,11 @@ local monthlyActiveUsers = local userMemoryDistribution = common.heatmapOptions + heatmap.new('User memory usage distribution') - // xBucketSize and interval must match to get correct values out of heatmaps - // xBucketSize='600s', - // yAxis_format='bytes', - // yAxis_min=0, - // color_colorScheme='interpolateViridis', + + heatmap.standardOptions.withUnit('bytes') + + heatmap.options.color.HeatmapColorOptions.withScheme('interpolateViridis') + + heatmap.options.calculation.xBuckets.withMode('size') + + heatmap.options.calculation.xBuckets.withValue('600s') // must align with interval + + heatmap.queryOptions.withInterval('600s') // must align with xBuckets value + heatmap.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -139,19 +135,18 @@ local userMemoryDistribution = ) by (pod) ||| % jupyterhub.onComponentLabel('singleuser-server', group_left='container'), - ), - // interval='600s', - // intervalFactor=1, + ) + + prometheus.withIntervalFactor(1), ]); local userCPUDistribution = common.heatmapOptions + heatmap.new('User CPU usage distribution') - // xBucketSize and interval must match to get correct values out of heatmaps - // xBucketSize='600s', - // yAxis_format='percentunit', - // yAxis_min=0, - // color_colorScheme='interpolateViridis', + + heatmap.standardOptions.withUnit('percentunit') + + heatmap.options.color.HeatmapColorOptions.withScheme('interpolateViridis') + + heatmap.options.calculation.xBuckets.withMode('size') + + heatmap.options.calculation.xBuckets.withValue('600s') // must align with interval + + heatmap.queryOptions.withInterval('600s') // must align with xBuckets value + heatmap.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -165,19 +160,18 @@ local userCPUDistribution = ) by (pod) ||| % jupyterhub.onComponentLabel('singleuser-server', group_left='container'), - ), - // interval='600s', - // intervalFactor=1, + ) + + prometheus.withIntervalFactor(1), ]); local userAgeDistribution = common.heatmapOptions + heatmap.new('User active age distribution') - // xBucketSize and interval must match to get correct values out of heatmaps - // xBucketSize='600s', - // yAxis_format='s', - // yAxis_min=0, - // color_colorScheme='interpolateViridis', + + heatmap.standardOptions.withUnit('s') + + heatmap.options.color.HeatmapColorOptions.withScheme('interpolateViridis') + + heatmap.options.calculation.xBuckets.withMode('size') + + heatmap.options.calculation.xBuckets.withValue('600s') // must align with interval + + heatmap.queryOptions.withInterval('600s') // must align with xBuckets value + heatmap.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -191,9 +185,8 @@ local userAgeDistribution = ) ||| % jupyterhub.onComponentLabel('singleuser-server'), - ), - // interval='600s', - // intervalFactor=1, + ) + + prometheus.withIntervalFactor(1), ]); // Hub diagnostics @@ -201,7 +194,6 @@ local hubResponseLatency = common.tsOptions + ts.new('Hub response latency') // formatY1='s', - + ts.standardOptions.withMin(0) + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -265,7 +257,6 @@ local hubResponseLatency = local hubResponseCodes = common.tsOptions + ts.new('Hub response status codes') - + ts.standardOptions.withMin(0) + 
ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -297,7 +288,6 @@ local hubDBUsage = ||| ) + ts.standardOptions.withDecimals(0) - + ts.standardOptions.withMin(0) + ts.standardOptions.withMax(1) // formatY1='percentunit', + ts.queryOptions.withTargets([ @@ -319,7 +309,6 @@ local serverStartTimes = + ts.new('Server Start Times') // formatY1='s', // lines=false, - + ts.standardOptions.withMin(0) // points=true, // pointradius=2, + ts.queryOptions.withTargets([ @@ -345,9 +334,8 @@ local serverSpawnFailures = ||| ) // lines=false, - + ts.standardOptions.withMin(0) // points=false, - // legend_hideZero=true, + // FIXME: not migrated config legend_hideZero=true, // bars=true, // pointradius=2, + ts.queryOptions.withTargets([ @@ -364,7 +352,6 @@ local usersPerNode = common.tsOptions + ts.new('Users per node') + ts.standardOptions.withDecimals(0) - + ts.standardOptions.withMin(0) + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -393,8 +380,7 @@ local nonRunningPods = ||| ) // decimalsY1=0, - + ts.standardOptions.withMin(0) - // stack=true, + // FIXME: not migrated config stack=true, + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -421,7 +407,6 @@ local sharedVolumeFreeSpace = ||| ) // decimalsY1=0, - + ts.standardOptions.withMin(0) + ts.standardOptions.withMax(1) // formatY1='percentunit', + ts.queryOptions.withTargets([ @@ -442,14 +427,13 @@ local sharedVolumeFreeSpace = local oldUserpods = common.tableOptions + table.new('Very old user pods') - + ts.panelOptions.withDescription( + + table.panelOptions.withDescription( ||| User pods that have been running for a long time (>8h). This often indicates problems with the idle culler ||| ) - // transform='timeseries_to_rows', // styles=[ // { // pattern: 'Value', @@ -458,11 +442,12 @@ local oldUserpods = // alias: 'Age', // }, // ], - // sort={ - // col: 2, - // desc: true, - // }, - + ts.queryOptions.withTargets([ + + table.options.withSortBy({ + col: 2, + desc: true, + }) + + table.queryOptions.withTransformations('timeseries_to_rows') + + table.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', ||| @@ -475,12 +460,12 @@ local oldUserpods = + prometheus.withLegendFormat('{{ namespace }}/{{ pod }}'), // instant=true ]); -// .hideColumn('Time') +// FIXME: not migrated config .hideColumn('Time') local highCPUUserPods = common.tableOptions + table.new('User Pods with high CPU usage (>0.5)') - + ts.panelOptions.withDescription( + + table.panelOptions.withDescription( ||| User pods using a lot of CPU @@ -488,7 +473,6 @@ local highCPUUserPods = unnecessarily. 
||| ) - // transform='timeseries_to_rows', // styles=[ // { // pattern: 'Value', @@ -497,11 +481,12 @@ local highCPUUserPods = // alias: 'CPU usage', // }, // ], - // sort={ - // col: 2, - // desc: true, - // }, - + ts.queryOptions.withTargets([ + + table.options.withSortBy({ + col: 2, + desc: true, + }) + + table.queryOptions.withTransformations('timeseries_to_rows') + + table.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', ||| @@ -515,19 +500,18 @@ local highCPUUserPods = + prometheus.withLegendFormat('{{ namespace }}/{{ pod }}'), // instant=true ]); -// .hideColumn('Time') +// FIXME: not migrated config .hideColumn('Time') local highMemoryUsagePods = common.tableOptions + table.new('User pods with high memory usage (>80% of limit)') - + ts.panelOptions.withDescription( + + table.panelOptions.withDescription( ||| User pods getting close to their memory limit Once they hit their memory limit, user kernels will start dying. ||| ) - // transform='timeseries_to_rows', // styles=[ // { // pattern: 'Value', @@ -536,11 +520,12 @@ local highMemoryUsagePods = // alias: '% of mem limit consumed', // }, // ], - // sort={ - // col: 2, - // desc: true, - // }, - + ts.queryOptions.withTargets([ + + table.options.withSortBy({ + col: 2, + desc: true, + }) + + table.queryOptions.withTransformations('timeseries_to_rows') + + table.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', ||| @@ -562,7 +547,7 @@ local highMemoryUsagePods = + prometheus.withLegendFormat('{{ namespace }}/{{ pod }}'), // instant=true ]); -// .hideColumn('Time') +// FIXME: not migrated config .hideColumn('Time') // Show images used by different users on the hub local notebookImagesUsed = @@ -573,10 +558,9 @@ local notebookImagesUsed = Number of user servers using a container image. ||| ) - // legend_hideZero=false, + // FIXME: not migrated config legend_hideZero=false, + ts.standardOptions.withDecimals(0) - // stack=false, - + ts.standardOptions.withMin(0) + // FIXME: not migrated config stack=false, + ts.queryOptions.withTargets([ prometheus.new( '$PROMETHEUS_DS', @@ -624,8 +608,8 @@ dashboard.new('JupyterHub Dashboard') serverSpawnFailures, hubResponseLatency, hubResponseCodes, - allComponentsCPU, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5 }, - allComponentsMemory, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5 }, + allComponentsCPU, // FIXME: previous height 12 + allComponentsMemory, // FIXME: previous height 12 hubDBUsage, nonRunningPods, usersPerNode, @@ -633,13 +617,12 @@ dashboard.new('JupyterHub Dashboard') ]), row.new('Anomalous user pods') + row.withPanels([ - oldUserpods, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5 }, - highCPUUserPods, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5 }, - highMemoryUsagePods, // FIXME: previously specified as, is it ok now? 
{ h: standardDims.h * 1.5 }, + oldUserpods, // FIXME: previous height 12 + highCPUUserPods, // FIXME: previous height 12 + highMemoryUsagePods, // FIXME: previous height 12 ]), ], - // FIXME: panelWidth and panelHeight specified like cluster.jsonnet without visual check panelWidth=12, - panelHeight=8, + panelHeight=10, ) ) diff --git a/dashboards/jupyterhub.libsonnet b/dashboards/jupyterhub.libsonnet index 6998ade..14145bc 100644 --- a/dashboards/jupyterhub.libsonnet +++ b/dashboards/jupyterhub.libsonnet @@ -80,6 +80,7 @@ local prometheus = grafonnet.query.prometheus; */ componentResourcePanel(title, metric, component='', formatY1=null, decimalsY1=null, multi=false):: ts.new(title) + // FIXME: not migrated config below commented out //decimalsY1=decimalsY1, //formatY1=formatY1, // show legend as a table with current, avg, max values diff --git a/dashboards/support.jsonnet b/dashboards/support.jsonnet index eee92c5..d4bc766 100755 --- a/dashboards/support.jsonnet +++ b/dashboards/support.jsonnet @@ -199,8 +199,7 @@ dashboard.new('NFS and Support Information') prometheusNetwork, ]), ], - // FIXME: panelWidth and panelHeight specified like cluster.jsonnet without visual check panelWidth=12, - panelHeight=8, + panelHeight=10, ) ) diff --git a/dashboards/usage-report.jsonnet b/dashboards/usage-report.jsonnet index 928fef9..74a936f 100755 --- a/dashboards/usage-report.jsonnet +++ b/dashboards/usage-report.jsonnet @@ -8,13 +8,17 @@ local common = import './common.libsonnet'; // FIXME: apply threshold coloring, provided like this historically, for all // four panels in this dashboard -// +// FIXME: not migrated config thresholds, should apply to all panels // thresholds=[ // { // value: 0, // color: 'green', // }, // ] +// IDEAS: +// + barGauge.standardOptions.thresholds.withMode('percentage') +// + barGauge.standardOptions.color.withMode('continuous-greens') +// local memoryUsageUserPods = common.barGaugeOptions @@ -155,7 +159,7 @@ dashboard.new('Usage Report') memoryUsageDaskSchedulerPods, memoryUsageGPUPods, ], - panelWidth=12, - panelHeight=8, + panelWidth=24, + panelHeight=10, ) ) diff --git a/dashboards/user.jsonnet b/dashboards/user.jsonnet index 9676365..fee329c 100755 --- a/dashboards/user.jsonnet +++ b/dashboards/user.jsonnet @@ -149,14 +149,13 @@ dashboard.new('User Diagnostics Dashboard') + dashboard.withPanels( grafonnet.util.grid.makeGrid( [ - memoryUsage, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5, w: standardDims.w * 2 } - cpuUsage, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5, w: standardDims.w * 2 } - homedirSharedUsage, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5, w: standardDims.w * 2 } - memoryRequests, // FIXME: previously specified as, is it ok now? { h: standardDims.h * 1.5, w: standardDims.w * 2 } - cpuRequests, // FIXME: previously specified as, is it ok now? 
{ h: standardDims.h * 1.5, w: standardDims.w * 2 } + memoryUsage, + cpuUsage, + homedirSharedUsage, + memoryRequests, + cpuRequests, ], - // FIXME: panelWidth and panelHeight specified like cluster.jsonnet without visual check - panelWidth=12, - panelHeight=8, + panelWidth=24, + panelHeight=16, ) ) diff --git a/global-dashboards/global-usage-stats.jsonnet b/global-dashboards/global-usage-stats.jsonnet index d023cc1..2dc25ce 100755 --- a/global-dashboards/global-usage-stats.jsonnet +++ b/global-dashboards/global-usage-stats.jsonnet @@ -9,6 +9,7 @@ local prometheus = grafonnet.query.prometheus; function(datasources) local weeklyActiveUsers = barGauge.new('Active users (over 7 days)') + // FIXME: not migrated config thresholds //thresholds=[ // { // value: 0, @@ -43,15 +44,13 @@ function(datasources) + dashboard.withUid('global-usage-dashboard') + dashboard.withTags(['jupyterhub', 'global']) + dashboard.withEditable(true) - // time_from='now-7d', + + dashboard.time.withFrom('now-7d') + dashboard.withPanels( - grafonnet.util.grid.makeGrid([ - weeklyActiveUsers, - ]) - // gridPos={ - // x: 0, - // y: 0, - // w: 25, - // h: 10, - // }, + grafonnet.util.grid.makeGrid( + [ + weeklyActiveUsers, + ], + panelWidth=24, + panelHeight=10, + ) )
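
Sketches for the FIXMEs left above. These are untested, and the grafonnet builder paths are assumptions unless they already appear in the patch; verify against the vendored grafonnet copy before use.

The repeated "not migrated config stack=true" FIXMEs on the active-user panels could map to the timeSeries panel's stacking field config, roughly:

    local grafonnet = import 'grafonnet/main.libsonnet';  // adjust to the vendored path
    local ts = grafonnet.panel.timeSeries;

    // old graph panel: stack=true
    ts.new('Current running users')
    // stacking moved into the custom field config in the new timeSeries panel
    + ts.fieldConfig.defaults.custom.stacking.withMode('normal')
    // stacked series read better with some fill (optional)
    + ts.fieldConfig.defaults.custom.withFillOpacity(50)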
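
The table panels keep the old sort={col, desc} shape and pass a bare string to
withTransformations. In the new table panel, sortBy takes column display names
and transformations are a list of {id, options} objects. A sketch using the
standard Grafana transformation ids 'seriesToRows' and 'organize' (exact option
shapes assumed; the 'Age' display name assumes the rename below has happened):

    local table = grafonnet.panel.table;
    local prometheus = grafonnet.query.prometheus;

    table.new('Very old user pods')
    + table.queryOptions.withTargets([
      prometheus.new('$PROMETHEUS_DS', 'kube_pod_created{...}')  // query elided, see the panel above
      + prometheus.withInstant(true),  // replaces the commented-out instant=true
    ])
    // sortBy wants display names, not column indexes like the old col: 2
    + table.options.withSortBy([{ displayName: 'Age', desc: true }])
    + table.queryOptions.withTransformations([
      { id: 'seriesToRows' },  // replaces transform='timeseries_to_rows'
      {
        id: 'organize',
        options: {
          excludeByName: { Time: true },   // replaces .hideColumn('Time')
          renameByName: { Value: 'Age' },  // replaces the old styles alias
        },
      },
    ])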
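
For the threshold-coloring FIXME in usage-report.jsonnet, the old thresholds
list maps onto standardOptions, building on the IDEAS comment already in the
patch (the step shape is assumed; the base step takes value null in the new
field config model):

    local barGauge = grafonnet.panel.barGauge;

    barGauge.new('Memory usage of user pods')  // hypothetical title
    + barGauge.standardOptions.color.withMode('thresholds')
    + barGauge.standardOptions.thresholds.withMode('absolute')
    + barGauge.standardOptions.thresholds.withSteps([
      { value: null, color: 'green' },  // old: { value: 0, color: 'green' }
    ])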
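
The componentResourcePanel FIXME in jupyterhub.libsonnet can probably reuse the
same standardOptions this patch already uses elsewhere, applied conditionally
since both parameters default to null:

    componentResourcePanel(title, metric, component='', formatY1=null, decimalsY1=null, multi=false)::
      ts.new(title)
      // replaces the commented-out formatY1/decimalsY1 kwargs
      + (if formatY1 != null then ts.standardOptions.withUnit(formatY1) else {})
      + (if decimalsY1 != null then ts.standardOptions.withDecimals(decimalsY1) else {})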