From 4268feb9d7b6be75359e9fe0dcf765cf73fedd17 Mon Sep 17 00:00:00 2001 From: Leo Q Date: Wed, 7 Jun 2023 20:28:13 +0800 Subject: [PATCH] add alert for sd refresh failure (#12410) * add alert for sd refresh failure Due to config error or sd service down, prometheus may fail to refresh sd resource, which may lead to scrape fail or irrelavant metrics. Signed-off-by: Leo Q * apply suggestions Signed-off-by: Leo Q --------- Signed-off-by: Leo Q --- documentation/prometheus-mixin/alerts.libsonnet | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 0ee5d83c7a..3efb0f27d1 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -20,6 +20,20 @@ description: 'Prometheus %(prometheusName)s has failed to reload its configuration.' % $._config, }, }, + { + alert: 'PrometheusSDRefreshFailure', + expr: ||| + increase(prometheus_sd_refresh_failures_total{%(prometheusSelector)s}[10m]) > 0 + ||| % $._config, + 'for': '20m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Failed Prometheus SD refresh.', + description: 'Prometheus %(prometheusName)s has failed to refresh SD with mechanism {{$labels.mechanism}}.' % $._config, + }, + }, { alert: 'PrometheusNotificationQueueRunningFull', expr: |||