diff --git a/CHANGELOG.md b/CHANGELOG.md index 3135d97..b7fff5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## master / unreleased +* [ENHANCEMENT] Update cortex-mixin to support Cortex deployment from the Helm chart. #361 + ## 1.11.0 / 2021-12-30 * [CHANGE] Store gateway: set `-blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency`, diff --git a/cortex-mixin/config.libsonnet b/cortex-mixin/config.libsonnet index 06941b6..b2c7c64 100644 --- a/cortex-mixin/config.libsonnet +++ b/cortex-mixin/config.libsonnet @@ -8,6 +8,14 @@ // Enables chunks- or blocks- specific panels and dashboards. storage_engine: ['blocks'], + // HTTP URL prefix under which the Prometheus API is available. + prometheus_http_prefix: 'prometheus', + + // Disable unused panels depending on whether a component was installed or not. + cortex_gw_enabled: true, + query_scheduler_enabled: true, + ruler_enabled: true, + // For chunks backend, switch for chunk index type. // May contain 'bigtable', 'dynamodb' or 'cassandra'. chunk_index_backend: ['bigtable', 'dynamodb', 'cassandra'], @@ -54,8 +62,10 @@ // Name selectors for different application instances, using the "per_instance_label". instance_names: { - compactor: 'compactor.*', - alertmanager: 'alertmanager.*', + alertmanager: '.*alertmanager.*', + compactor: '.*compactor.*', + ingester: '.*ingester.*', + store_gateway: '.*store-gateway.*', }, // The label used to differentiate between different nodes (i.e. servers). 
diff --git a/cortex-mixin/dashboards/alertmanager-resources.libsonnet b/cortex-mixin/dashboards/alertmanager-resources.libsonnet index 4150602..63d6cca 100644 --- a/cortex-mixin/dashboards/alertmanager-resources.libsonnet +++ b/cortex-mixin/dashboards/alertmanager-resources.libsonnet @@ -4,7 +4,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'alertmanager-resources.json': ($.dashboard('Cortex / Alertmanager Resources') + { uid: '68b66aed90ccab448009089544a8d6c6' }) .addClusterSelectorTemplates(false) - .addRow( + .addRowIf( + $._config.cortex_gw_enabled, $.row('Gateway') .addPanel( $.containerCPUUsagePanel('CPU', $._config.job_names.gateway), @@ -61,7 +62,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.containerDiskSpaceUtilization('Disk Space Utilization', 'alertmanager'), + $.containerDiskSpaceUtilization('Disk Space Utilization', $._config.instance_names.alertmanager), ) ), } diff --git a/cortex-mixin/dashboards/compactor-resources.libsonnet b/cortex-mixin/dashboards/compactor-resources.libsonnet index 82a6bce..3e628e9 100644 --- a/cortex-mixin/dashboards/compactor-resources.libsonnet +++ b/cortex-mixin/dashboards/compactor-resources.libsonnet @@ -34,7 +34,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerDiskReadsPanel('Disk Reads', 'compactor'), ) .addPanel( - $.containerDiskSpaceUtilization('Disk Space Utilization', 'compactor'), + $.containerDiskSpaceUtilization('Disk Space Utilization', $._config.instance_names.compactor), ) ) + { templating+: { diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet index e426819..a46d2f4 100644 --- a/cortex-mixin/dashboards/dashboard-utils.libsonnet +++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet @@ -9,6 +9,9 @@ local utils = import 'mixin-utils/utils.libsonnet'; // - some links that propagate the selectred cluster. 
dashboard(title):: super.dashboard(title=title, datasource=$._config.dashboard_datasource) + { + refresh: '30s', + timezone: 'browser', + addRowIf(condition, row):: if condition then self.addRow(row) @@ -73,7 +76,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; namespaceMatcher():: if $._config.singleBinary then 'job=~"$job"' - else 'cluster=~"$cluster", namespace=~"$namespace"', + else 'namespace=~"$namespace"', jobSelector(job):: if $._config.singleBinary @@ -204,15 +207,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| sum by(%s, %s, device) ( rate( - node_disk_written_bytes_total[$__rate_interval] + %s[$__rate_interval] ) ) - + - %s ||| % [ $._config.per_node_label, $._config.per_instance_label, - $.filterNodeDiskContainer(containerName), + $.nodeDiskContainerBytesTotal(containerName, 'writes'), ], '{{%s}} - {{device}}' % $._config.per_instance_label ) + @@ -225,13 +226,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| sum by(%s, %s, device) ( rate( - node_disk_read_bytes_total[$__rate_interval] + %s[$__rate_interval] ) - ) + %s + ) ||| % [ $._config.per_node_label, $._config.per_instance_label, - $.filterNodeDiskContainer(containerName), + $.nodeDiskContainerBytesTotal(containerName, 'reads'), ], '{{%s}} - {{device}}' % $._config.per_instance_label ) + @@ -261,9 +262,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('percentunit') }, containerLabelMatcher(containerName):: - if containerName == 'ingester' - then 'label_name=~"ingester.*"' - else 'label_name="%s"' % containerName, + 'persistentvolumeclaim=~"%s"' % containerName, goHeapInUsePanel(title, jobName):: $.panel(title) + @@ -467,32 +466,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('percentunit') } ), - filterNodeDiskContainer(containerName):: + nodeDiskContainerBytesTotal(containerName, op):: ||| - ignoring(%s) group_right() ( - label_replace( - count by( - %s, - %s, - device - ) - ( - container_fs_writes_bytes_total{ 
- %s, - container="%s", - device!~".*sda.*" - } - ), - "device", - "$1", - "device", - "/dev/(.*)" - ) * 0 - ) + container_fs_%s_bytes_total{ + %s, + container="%s", + device!~".*sda.*|.*nvme0.*" + } ||| % [ - $._config.per_instance_label, - $._config.per_node_label, - $._config.per_instance_label, + op, $.namespaceMatcher(), containerName, ], diff --git a/cortex-mixin/dashboards/queries.libsonnet b/cortex-mixin/dashboards/queries.libsonnet index 259f5df..88fa99d 100644 --- a/cortex-mixin/dashboards/queries.libsonnet +++ b/cortex-mixin/dashboards/queries.libsonnet @@ -21,7 +21,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel('cortex_query_frontend_queue_length{%s}' % $.jobMatcher($._config.job_names.query_frontend), '{{cluster}} / {{namespace}} / {{%s}}' % $._config.per_instance_label), ) ) - .addRow( + .addRowIf( + $._config.query_scheduler_enabled, $.row('Query Scheduler') .addPanel( $.panel('Queue Duration') + diff --git a/cortex-mixin/dashboards/reads-resources.libsonnet b/cortex-mixin/dashboards/reads-resources.libsonnet index f0750c8..08129ec 100644 --- a/cortex-mixin/dashboards/reads-resources.libsonnet +++ b/cortex-mixin/dashboards/reads-resources.libsonnet @@ -4,7 +4,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'cortex-reads-resources.json': ($.dashboard('Cortex / Reads Resources') + { uid: '2fd2cda9eea8d8af9fbc0a5960425120' }) .addClusterSelectorTemplates(false) - .addRow( + .addRowIf( + $._config.cortex_gw_enabled, $.row('Gateway') .addPanel( $.containerCPUUsagePanel('CPU', $._config.job_names.gateway), @@ -28,7 +29,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.query_frontend), ) ) - .addRow( + .addRowIf( + $._config.query_scheduler_enabled, $.row('Query Scheduler') .addPanel( $.containerCPUUsagePanel('CPU', 'query-scheduler'), @@ -64,7 +66,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.goHeapInUsePanel('Memory (go heap inuse)', 
$._config.job_names.ingester), ) ) - .addRow( + .addRowIf( + $._config.ruler_enabled, $.row('Ruler') .addPanel( $.panel('Rules') + @@ -77,7 +80,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerCPUUsagePanel('CPU', 'ruler'), ) ) - .addRow( + .addRowIf( + $._config.ruler_enabled, $.row('') .addPanel( $.containerMemoryWorkingSetPanel('Memory (workingset)', 'ruler'), @@ -109,7 +113,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerDiskReadsPanel('Disk Reads', 'store-gateway'), ) .addPanel( - $.containerDiskSpaceUtilization('Disk Space Utilization', 'store-gateway'), + $.containerDiskSpaceUtilization('Disk Space Utilization', $._config.instance_names.store_gateway), ) ) + { templating+: { diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet index 9bc9b7d..4aa0bf9 100644 --- a/cortex-mixin/dashboards/reads.libsonnet +++ b/cortex-mixin/dashboards/reads.libsonnet @@ -1,6 +1,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { + local config = { + gateway_read_routes_regex: '(%s|api_prom)_api_v1_.+' % $._config.prometheus_http_prefix, + }, + 'cortex-reads.json': ($.dashboard('Cortex / Reads') + { uid: '8d6ba60eccc4b6eedfa329b24b1bd339' }) .addClusterSelectorTemplates() @@ -42,7 +46,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; rate( cortex_request_duration_seconds_count{ %(queryFrontend)s, - route=~"(prometheus|api_prom)_api_v1_query" + route=~"%(query_routes_regex)s" }[$__rate_interval] ) ) + @@ -51,10 +55,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; cortex_prometheus_rule_evaluations_total{ %(ruler)s }[$__rate_interval] - ) + ) or on() vector(0) ) ||| % { queryFrontend: $.jobMatcher($._config.job_names.query_frontend), + query_routes_regex: '(%s|api_prom)_api_v1_query' % $._config.prometheus_http_prefix, ruler: $.jobMatcher($._config.job_names.ruler), }, format='reqps') + $.panelDescription( @@ -73,12 +78,13 @@ local utils = 
import 'mixin-utils/utils.libsonnet'; rate( cortex_request_duration_seconds_count{ %(queryFrontend)s, - route=~"(prometheus|api_prom)_api_v1_query_range" + route=~"%(query_range_routes_regex)s" }[$__rate_interval] ) ) ||| % { queryFrontend: $.jobMatcher($._config.job_names.query_frontend), + query_range_routes_regex: '(%s|api_prom)_api_v1_query_range' % $._config.prometheus_http_prefix, }, format='reqps') + $.panelDescription( 'Range queries per second', @@ -89,20 +95,21 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) ) - .addRow( + .addRowIf( + $._config.cortex_gw_enabled, $.row('Gateway') .addPanel( $.panel('Requests / sec') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.gateway), config.gateway_read_routes_regex]) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', config.gateway_read_routes_regex)]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway), config.gateway_read_routes_regex], '' ) + { yaxes: $.yaxes('s') } ) @@ -111,21 +118,22 @@ local utils = import 
'mixin-utils/utils.libsonnet'; $.row('Query Frontend') .addPanel( $.panel('Requests / sec') + - $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.query_frontend), config.gateway_read_routes_regex]) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', config.gateway_read_routes_regex)]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend), config.gateway_read_routes_regex], '' ) + { yaxes: $.yaxes('s') } ) ) - .addRow( + .addRowIf( + $._config.query_scheduler_enabled, $.row('Query Scheduler') .addPanel( $.textPanel( @@ -165,16 +173,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Querier') .addPanel( $.panel('Requests / sec') + - $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.querier), config.gateway_read_routes_regex]) ) .addPanel( 
$.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) + utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', config.gateway_read_routes_regex)]) ) .addPanel( $.panel('Per %s p99 Latency' % $._config.per_instance_label) + $.hiddenLegendQueryPanel( - 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], '' + 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier), config.gateway_read_routes_regex], '' ) + { yaxes: $.yaxes('s') } ) diff --git a/cortex-mixin/dashboards/rollout-progress.libsonnet b/cortex-mixin/dashboards/rollout-progress.libsonnet index 16c5409..743c796 100644 --- a/cortex-mixin/dashboards/rollout-progress.libsonnet +++ b/cortex-mixin/dashboards/rollout-progress.libsonnet @@ -4,9 +4,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; local config = { namespace_matcher: $.namespaceMatcher(), gateway_job_matcher: $.jobMatcher($._config.job_names.gateway), + writes_job_matcher: $.jobMatcher($._config.job_names.distributor), + reads_job_matcher: $.jobMatcher($._config.job_names.querier), gateway_write_routes_regex: 'api_(v1|prom)_push', - gateway_read_routes_regex: '(prometheus|api_prom)_api_v1_.+', - all_services_regex: std.join('|', ['cortex-gw', 'distributor', 'ingester.*', 'query-frontend.*', 'query-scheduler.*', 'querier.*', 'compactor', 'store-gateway', 'ruler', 'alertmanager']), + gateway_read_routes_regex: '(%s|api_prom)_api_v1_.+' % $._config.prometheus_http_prefix, 
+ all_services_regex: std.join('|', ['cortex-gw', '.*distributor.*', '.*ingester.*', '.*query-frontend.*', '.*query-scheduler.*', '.*querier.*', '.*compactor.*', '.*store-gateway.*', '.*ruler.*', '.*alertmanager.*', '.*memcached.*']), }, 'cortex-rollout-progress.json': @@ -83,7 +85,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { color: 'green', value: 1 }, ], unit='percentunit', min=0, max=1) + { id: 1, - gridPos: { h: 8, w: 10, x: 0, y: 0 }, + gridPos: { h: 10, w: 10, x: 0, y: 0 }, }, // @@ -91,50 +93,50 @@ local utils = import 'mixin-utils/utils.libsonnet'; // $.panel('Writes - 2xx') + $.newStatPanel(||| - sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / - sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) + sum(rate(cortex_request_duration_seconds_count{%(writes_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / + sum(rate(cortex_request_duration_seconds_count{%(writes_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) or on() vector(0) ||| % config, thresholds=[ { color: 'green', value: null }, ]) + { id: 2, - gridPos: { h: 4, w: 2, x: 10, y: 0 }, + gridPos: { h: 5, w: 2, x: 10, y: 10 }, }, $.panel('Writes - 4xx') + $.newStatPanel(||| - sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / - sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) + sum(rate(cortex_request_duration_seconds_count{%(writes_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / + sum(rate(cortex_request_duration_seconds_count{%(writes_job_matcher)s, 
route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) or on() vector(0) ||| % config, thresholds=[ { color: 'green', value: null }, { color: 'orange', value: 0.2 }, { color: 'red', value: 0.5 }, ]) + { id: 3, - gridPos: { h: 4, w: 2, x: 12, y: 0 }, + gridPos: { h: 5, w: 2, x: 12, y: 10 }, }, $.panel('Writes - 5xx') + $.newStatPanel(||| - sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / - sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) + sum(rate(cortex_request_duration_seconds_count{%(writes_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / + sum(rate(cortex_request_duration_seconds_count{%(writes_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) or on() vector(0) ||| % config, thresholds=[ { color: 'green', value: null }, { color: 'red', value: 0.01 }, ]) + { id: 4, - gridPos: { h: 4, w: 2, x: 14, y: 0 }, + gridPos: { h: 5, w: 2, x: 14, y: 10 }, }, $.panel('Writes 99th Latency') + $.newStatPanel(||| - histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"})) + histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(writes_job_matcher)s, route=~"%(gateway_write_routes_regex)s"})) ||| % config, unit='s', thresholds=[ { color: 'green', value: null }, { color: 'orange', value: 0.2 }, { color: 'red', value: 0.5 }, ]) + { id: 5, - gridPos: { h: 4, w: 8, x: 16, y: 0 }, + gridPos: { h: 5, w: 8, x: 16, y: 10 }, }, // @@ -142,50 +144,50 @@ local utils = import 'mixin-utils/utils.libsonnet'; // $.panel('Reads - 2xx') + $.newStatPanel(||| - sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, 
route=~"%(gateway_read_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / - sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) + sum(rate(cortex_request_duration_seconds_count{%(reads_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / + sum(rate(cortex_request_duration_seconds_count{%(reads_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) or on() vector(0) ||| % config, thresholds=[ { color: 'green', value: null }, ]) + { id: 6, - gridPos: { h: 4, w: 2, x: 10, y: 4 }, + gridPos: { h: 5, w: 2, x: 10, y: 15 }, }, $.panel('Reads - 4xx') + $.newStatPanel(||| - sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / - sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) + sum(rate(cortex_request_duration_seconds_count{%(reads_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / + sum(rate(cortex_request_duration_seconds_count{%(reads_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) or on() vector(0) ||| % config, thresholds=[ { color: 'green', value: null }, { color: 'orange', value: 0.01 }, { color: 'red', value: 0.05 }, ]) + { id: 7, - gridPos: { h: 4, w: 2, x: 12, y: 4 }, + gridPos: { h: 5, w: 2, x: 12, y: 15 }, }, $.panel('Reads - 5xx') + $.newStatPanel(||| - sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / - sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) + sum(rate(cortex_request_duration_seconds_count{%(reads_job_matcher)s, 
route=~"%(gateway_read_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / + sum(rate(cortex_request_duration_seconds_count{%(reads_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) or on() vector(0) ||| % config, thresholds=[ { color: 'green', value: null }, { color: 'red', value: 0.01 }, ]) + { id: 8, - gridPos: { h: 4, w: 2, x: 14, y: 4 }, + gridPos: { h: 5, w: 2, x: 14, y: 15 }, }, $.panel('Reads 99th Latency') + $.newStatPanel(||| - histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"})) + histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(reads_job_matcher)s, route=~"%(gateway_read_routes_regex)s"})) ||| % config, unit='s', thresholds=[ { color: 'green', value: null }, { color: 'orange', value: 1 }, { color: 'red', value: 2.5 }, ]) + { id: 9, - gridPos: { h: 4, w: 8, x: 16, y: 4 }, + gridPos: { h: 5, w: 8, x: 16, y: 15 }, }, // @@ -218,7 +220,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, }, id: 10, - gridPos: { h: 8, w: 10, x: 0, y: 8 }, + gridPos: { h: 10, w: 6, x: 10, y: 0 }, }, // @@ -235,7 +237,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; count by(container, version) ( label_replace( kube_pod_container_info{%(namespace_matcher)s,container=~"%(all_services_regex)s"}, - "version", "$1", "image", ".*:(.+)-.*" + "version", "$1", "image", ".*:(.+)" ) ) ||| % config, @@ -264,7 +266,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { // Hide time. id: 'organize', - options: { excludeByName: { Time: true } }, + options: { excludeByName: { Time: true }, indexByName: { Time: 0, container: 1 } }, }, { // Sort by container. 
@@ -274,7 +276,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], id: 11, - gridPos: { h: 8, w: 6, x: 10, y: 8 }, + gridPos: { h: 10, w: 10, x: 0, y: 10 }, }, // @@ -283,15 +285,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency vs 24h ago') + $.queryPanel([||| 1 - ( - avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"} offset 24h))[1h:]) + avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(writes_job_matcher)s, route=~"%(gateway_write_routes_regex)s"} offset 24h))[1h:]) / - avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}))[1h:]) + avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(writes_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}))[1h:]) ) ||| % config, ||| 1 - ( - avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"} offset 24h))[1h:]) + avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(reads_job_matcher)s, route=~"%(gateway_read_routes_regex)s"} offset 24h))[1h:]) / - avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}))[1h:]) + avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(reads_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}))[1h:]) ) ||| % config], ['writes', 'reads']) + { yaxes: $.yaxes({ @@ -300,7 +302,7 @@ 
local utils = import 'mixin-utils/utils.libsonnet'; }), id: 12, - gridPos: { h: 8, w: 8, x: 16, y: 8 }, + gridPos: { h: 10, w: 8, x: 16, y: 0 }, }, ], diff --git a/cortex-mixin/dashboards/scaling.libsonnet b/cortex-mixin/dashboards/scaling.libsonnet index a01a7db..895dd56 100644 --- a/cortex-mixin/dashboards/scaling.libsonnet +++ b/cortex-mixin/dashboards/scaling.libsonnet @@ -1,7 +1,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; (import 'dashboard-utils.libsonnet') { - 'cortex-scaling.json': ($.dashboard('Cortex / Scaling') + { uid: '88c041017b96856c9176e07cf557bdcf' }) .addClusterSelectorTemplates() @@ -42,18 +41,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.tablePanel([ ||| sort_desc( - cluster_namespace_deployment_reason:required_replicas:count{cluster=~"$cluster", namespace=~"$namespace"} + cluster_namespace_deployment_reason:required_replicas:count{%(namespace_matcher)s} > ignoring(reason) group_left - cluster_namespace_deployment:actual_replicas:count{cluster=~"$cluster", namespace=~"$namespace"} + cluster_namespace_deployment:actual_replicas:count{%(namespace_matcher)s} ) - |||, + ||| % { + namespace_matcher: $.namespaceMatcher(), + }, ], { __name__: { alias: 'Cluster', type: 'hidden' }, cluster: { alias: 'Cluster' }, namespace: { alias: 'Namespace' }, deployment: { alias: 'Service' }, reason: { alias: 'Reason' }, - Value: { alias: 'Required Replicas', decimals: 0 }, + 'Value #A': { alias: 'Required Replicas', decimals: 0 }, }) ) ), diff --git a/cortex-mixin/dashboards/writes-resources.libsonnet b/cortex-mixin/dashboards/writes-resources.libsonnet index 64f83ef..408dd33 100644 --- a/cortex-mixin/dashboards/writes-resources.libsonnet +++ b/cortex-mixin/dashboards/writes-resources.libsonnet @@ -4,7 +4,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'cortex-writes-resources.json': ($.dashboard('Cortex / Writes Resources') + { uid: 'c0464f0d8bd026f776c9006b0591bb0b' }) .addClusterSelectorTemplates(false) - .addRow( + .addRowIf( 
+ $._config.cortex_gw_enabled, $.row('Gateway') .addPanel( $.containerCPUUsagePanel('CPU', $._config.job_names.gateway), @@ -62,7 +63,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerDiskReadsPanel('Disk Reads', 'ingester') ) .addPanel( - $.containerDiskSpaceUtilization('Disk Space Utilization', 'ingester'), + $.containerDiskSpaceUtilization('Disk Space Utilization', $._config.instance_names.ingester), ) ) + { diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index e99faee..c4a18fd 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -61,10 +61,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Requests / sec') + - $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.distributor), format='reqps') ) ) - .addRow( + .addRowIf( + $._config.cortex_gw_enabled, $.row('Gateway') .addPanel( $.panel('Requests / sec') +