Files
prometheus/discovery/metrics_refresh.go
Will Bollock b70a871988 fix(discovery): delete expired refresh metrics on reload (#17614)
Building off config-specific Prometheus refresh metrics from an earlier
PR (https://github.com/prometheus/prometheus/pull/17138), this deletes
refresh metrics like `prometheus_sd_refresh_duration_seconds` and
`prometheus_sd_refresh_failures_total` when the underlying scrape job
configuration is removed on reload. This reduces unneeded cardinality
from scrape job specific metrics while still preserving metrics that
indicate overall health of a service discovery engine.

For example,
`prometheus_sd_refresh_failures_total{config="linode-servers",mechanism="linode"} 1`
will no longer be exported by Prometheus when the `linode-servers`
scrape job for the Linode service provider is removed. The generic,
service-discovery-specific `prometheus_sd_linode_failures_total` metric
will persist, however.

* fix: add targetsMtx lock for targets access

* test: validate refresh/discover metrics are gone

* ref: combine sdMetrics and refreshMetrics

Good idea from @bboreham to combine sdMetrics and refreshMetrics!
They're always passed around together and don't have much of a
reason not to be combined. mechanismMetrics makes it clear what kind of
metrics this is used for (service discovery mechanisms).

---------

Signed-off-by: Will Bollock <wbollock@linode.com>
2026-04-02 13:43:35 +01:00

97 lines
3.4 KiB
Go

// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package discovery
import (
"time"
"github.com/prometheus/client_golang/prometheus"
)
// RefreshMetricsVecs are metric vectors for the "refresh" package.
// We define them here in the "discovery" package in order to avoid a cyclic dependency between
// "discovery" and "refresh".
//
// Per-(mechanism, config) children are obtained through Instantiate and
// dropped again through DeleteLabelValues.
type RefreshMetricsVecs struct {
// failuresVec backs prometheus_sd_refresh_failures_total and is labelled
// by "mechanism" and "config".
failuresVec *prometheus.CounterVec
// durationVec backs prometheus_sd_refresh_duration_seconds (a summary)
// and is labelled by "mechanism" and "config".
durationVec *prometheus.SummaryVec
// durationHistVec backs prometheus_sd_refresh_duration_histogram_seconds
// and is labelled by "mechanism" only.
durationHistVec *prometheus.HistogramVec
// metricRegisterer is created in NewRefreshMetrics from the three vectors
// above; Register and Unregister delegate to it.
metricRegisterer MetricRegisterer
}
// Compile-time check that *RefreshMetricsVecs satisfies RefreshMetricsManager.
var _ RefreshMetricsManager = (*RefreshMetricsVecs)(nil)
// NewRefreshMetrics creates the refresh failure counter, duration summary and
// duration histogram vectors and wraps them, together with a MetricRegisterer
// built from reg, in a RefreshMetricsVecs.
//
// The failure counter and the summary are labelled by "mechanism" and
// "config"; the histogram is labelled by "mechanism" only.
func NewRefreshMetrics(reg prometheus.Registerer) RefreshMetricsManager {
	failures := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "prometheus_sd_refresh_failures_total",
			Help: "Number of refresh failures for the given SD mechanism.",
		},
		[]string{"mechanism", "config"},
	)

	duration := prometheus.NewSummaryVec(
		prometheus.SummaryOpts{
			Name:       "prometheus_sd_refresh_duration_seconds",
			Help:       "The duration of a refresh in seconds for the given SD mechanism.",
			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
		},
		[]string{"mechanism", "config"},
	)

	durationHist := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:                            "prometheus_sd_refresh_duration_histogram_seconds",
			Help:                            "The duration of a refresh for the given SD mechanism.",
			Buckets:                         []float64{.01, .1, 1, 10},
			NativeHistogramBucketFactor:     1.1,
			NativeHistogramMaxBucketNumber:  100,
			NativeHistogramMinResetDuration: time.Hour,
		},
		[]string{"mechanism"},
	)

	// Metric vectors, rather than individual metrics, are what get
	// registered, so that no series is visible until it has been recorded.
	collectors := []prometheus.Collector{failures, duration, durationHist}

	return &RefreshMetricsVecs{
		failuresVec:      failures,
		durationVec:      duration,
		durationHistVec:  durationHist,
		metricRegisterer: NewMetricRegisterer(reg, collectors),
	}
}
// Instantiate resolves the concrete metrics for one mechanism/config pair
// from the underlying vectors and bundles them into a RefreshMetrics.
//
// The failure counter and the duration summary are looked up by both mech
// and config; the duration histogram is looked up by mech alone.
func (m *RefreshMetricsVecs) Instantiate(mech, config string) *RefreshMetrics {
	failures := m.failuresVec.WithLabelValues(mech, config)
	duration := m.durationVec.WithLabelValues(mech, config)
	durationHist := m.durationHistVec.WithLabelValues(mech)

	return &RefreshMetrics{
		Failures:          failures,
		Duration:          duration,
		DurationHistogram: durationHist,
	}
}
// Register implements discovery.DiscovererMetrics.
// It delegates to the MetricRegisterer's RegisterMetrics and returns its
// error unchanged.
func (m *RefreshMetricsVecs) Register() error {
return m.metricRegisterer.RegisterMetrics()
}
// Unregister implements discovery.DiscovererMetrics.
// It delegates to the MetricRegisterer's UnregisterMetrics.
func (m *RefreshMetricsVecs) Unregister() {
m.metricRegisterer.UnregisterMetrics()
}
// DeleteLabelValues removes the failure counter and duration summary children
// for the given mechanism and config, e.g. once the corresponding scrape job
// has been removed.
//
// The duration histogram is left alone: it is labelled by mechanism only and
// therefore has no per-config child. The results of the deletions are not
// inspected.
func (m *RefreshMetricsVecs) DeleteLabelValues(mech, config string) {
	labelValues := []string{mech, config}

	m.failuresVec.DeleteLabelValues(labelValues...)
	m.durationVec.DeleteLabelValues(labelValues...)
}