diff --git a/docs/querying/functions.md b/docs/querying/functions.md index d895207da6..d274de0b72 100644 --- a/docs/querying/functions.md +++ b/docs/querying/functions.md @@ -293,6 +293,10 @@ boundaries, the function uses interpolation to estimate the fraction. With the resulting uncertainty, it becomes irrelevant if the boundaries are inclusive or exclusive. +Special case for native histograms with standard exponential buckets: +`NaN` observations are considered outside of any buckets in this case. +`histogram_fraction(-Inf, +Inf, b)` effectively returns the fraction of +non-`NaN` observations and may therefore be less than 1. ## `histogram_quantile()` @@ -385,9 +389,16 @@ Special cases for classic histograms: is applied within that bucket. Otherwise, the upper bound of the lowest bucket is returned for quantiles located in the lowest bucket. -Special cases for native histograms (relevant for the exact interpolation -happening within the zero bucket): +Special cases for native histograms: +* If a native histogram with standard exponential buckets has `NaN` + observations and the quantile falls into one of the existing exponential + buckets, the result is skewed towards higher values due to `NaN` + observations treated as `+Inf`. This is flagged with an info level + annotation. +* If a native histogram with standard exponential buckets has `NaN` + observations and the quantile falls above all of the existing exponential + buckets, `NaN` is returned. This is flagged with an info level annotation. * A zero bucket with finite width is assumed to contain no negative observations if the histogram has observations in positive buckets, but none in negative buckets. diff --git a/promql/functions.go b/promql/functions.go index 7e3099ec60..6dcb1823e7 100644 --- a/promql/functions.go +++ b/promql/functions.go @@ -1364,9 +1364,11 @@ func funcHistogramFraction(vals []parser.Value, args parser.Expressions, enh *Ev if !enh.enableDelayedNameRemoval { sample.Metric = sample.Metric.DropReserved(schema.IsMetadataLabel) } + hf, hfAnnos := HistogramFraction(lower, upper, sample.H, sample.Metric.Get(model.MetricNameLabel), args[0].PositionRange()) + annos.Merge(hfAnnos) enh.Out = append(enh.Out, Sample{ Metric: sample.Metric, - F: HistogramFraction(lower, upper, sample.H), + F: hf, DropName: true, }) } @@ -1410,9 +1412,11 @@ func funcHistogramQuantile(vals []parser.Value, args parser.Expressions, enh *Ev if !enh.enableDelayedNameRemoval { sample.Metric = sample.Metric.DropReserved(schema.IsMetadataLabel) } + hq, hqAnnos := HistogramQuantile(q, sample.H, sample.Metric.Get(model.MetricNameLabel), args[0].PositionRange()) + annos.Merge(hqAnnos) enh.Out = append(enh.Out, Sample{ Metric: sample.Metric, - F: HistogramQuantile(q, sample.H), + F: hq, DropName: true, }) } diff --git a/promql/promqltest/testdata/native_histograms.test b/promql/promqltest/testdata/native_histograms.test index dcf1547eda..6411fe44bd 100644 --- a/promql/promqltest/testdata/native_histograms.test +++ b/promql/promqltest/testdata/native_histograms.test @@ -44,6 +44,7 @@ eval instant at 1m histogram_fraction(1, 2, single_histogram) # We expect all values to fall in the range 0 < x <= 8. eval instant at 1m histogram_fraction(0, 8, single_histogram) + expect no_info {} 1 # Median is 1.414213562373095 (2**2**-1, or sqrt(2)) due to @@ -51,6 +52,7 @@ eval instant at 1m histogram_fraction(0, 8, single_histogram) # 2 is assumed where the bucket boundary would be if we increased the # resolution of the histogram by one step. eval instant at 1m histogram_quantile(0.5, single_histogram) + expect no_info {} 1.414213562373095 clear @@ -1328,3 +1330,46 @@ eval range from 0s to 60s step 15s last_over_time({__name__="http_request_durati {__name__="http_request_duration_seconds", pod="nginx-1"} {{count:3 sum:14 buckets:[1 2]}}x4 clear + +# Test native histogram quantile and fraction when the native histogram with exponential +# buckets has NaN observations. +load 1m + histogram_nan{case="100% NaNs"} {{schema:0 count:0 sum:0}} {{schema:0 count:3 sum:NaN}} + histogram_nan{case="20% NaNs"} {{schema:0 count:0 sum:0}} {{schema:0 count:15 sum:NaN buckets:[12]}} + +eval instant at 1m histogram_quantile(1, histogram_nan) + expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is NaN for metric name "histogram_nan" + {case="100% NaNs"} NaN + {case="20% NaNs"} NaN + +eval instant at 1m histogram_quantile(0.81, histogram_nan) + expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is NaN for metric name "histogram_nan" + {case="100% NaNs"} NaN + {case="20% NaNs"} NaN + +eval instant at 1m histogram_quantile(0.8, histogram_nan{case="100% NaNs"}) + expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is NaN for metric name "histogram_nan" + {case="100% NaNs"} NaN + +eval instant at 1m histogram_quantile(0.8, histogram_nan{case="20% NaNs"}) + expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is skewed higher for metric name "histogram_nan" + {case="20% NaNs"} 1 + +eval instant at 1m histogram_quantile(0.4, histogram_nan{case="100% NaNs"}) + expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is NaN for metric name "histogram_nan" + {case="100% NaNs"} NaN + +# histogram_quantile and histogram_fraction equivalence if quantile is not NaN +eval instant at 1m histogram_quantile(0.4, histogram_nan{case="20% NaNs"}) + expect info msg: PromQL info: input to histogram_quantile has NaN observations, result is skewed higher for metric name "histogram_nan" + {case="20% NaNs"} 0.7071067811865475 + +eval instant at 1m histogram_fraction(-Inf, 0.7071067811865475, histogram_nan) + expect info msg: PromQL info: input to histogram_fraction has NaN observations, which are excluded from all fractions for metric name "histogram_nan" + {case="100% NaNs"} 0.0 + {case="20% NaNs"} 0.4 + +eval instant at 1m histogram_fraction(-Inf, +Inf, histogram_nan) + expect info msg: PromQL info: input to histogram_fraction has NaN observations, which are excluded from all fractions for metric name "histogram_nan" + {case="100% NaNs"} 0.0 + {case="20% NaNs"} 0.8 diff --git a/promql/quantile.go b/promql/quantile.go index f21914cb94..1454974107 100644 --- a/promql/quantile.go +++ b/promql/quantile.go @@ -20,7 +20,9 @@ import ( "github.com/prometheus/prometheus/model/histogram" "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/promql/parser/posrange" "github.com/prometheus/prometheus/util/almost" + "github.com/prometheus/prometheus/util/annotations" ) // smallDeltaTolerance is the threshold for relative deltas between classic @@ -195,24 +197,34 @@ func BucketQuantile(q float64, buckets Buckets) (float64, bool, bool) { // // If q is NaN, NaN is returned. // +// If the native histogram has NaN observations and the quantile falls into +// an existing bucket, then an additional info level annotation is returned +// informing the user about possible skew to higher values as NaNs are +// considered +Inf in this case. +// +// If the native histogram has NaN observations and the quantile falls above +// all existing buckets, then NaN is returned along with an additional info +// level annotation informing the user that this has happened. +// // HistogramQuantile is for calculating the histogram_quantile() of native // histograms. See also: BucketQuantile for classic histograms. // // HistogramQuantile is exported as it may be used by other PromQL engine // implementations. -func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 { +func HistogramQuantile(q float64, h *histogram.FloatHistogram, metricName string, pos posrange.PositionRange) (float64, annotations.Annotations) { if q < 0 { - return math.Inf(-1) + return math.Inf(-1), nil } if q > 1 { - return math.Inf(+1) + return math.Inf(+1), nil } if h.Count == 0 || math.IsNaN(q) { - return math.NaN() + return math.NaN(), nil } var ( + annos annotations.Annotations bucket histogram.Bucket[float64] count float64 it histogram.BucketIterator[float64] @@ -255,12 +267,12 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 { if bucket.Lower == math.Inf(-1) { // first bucket, with lower bound -Inf if bucket.Upper <= 0 { - return bucket.Upper + return bucket.Upper, annos } bucket.Lower = 0 } else if bucket.Upper == math.Inf(1) { // last bucket, with upper bound +Inf - return bucket.Lower + return bucket.Lower, annos } } // Due to numerical inaccuracies, we could end up with a higher count @@ -270,21 +282,43 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 { } // We could have hit the highest bucket without even reaching the rank // (this should only happen if the histogram contains observations of - // the value NaN), in which case we simply return the upper limit of the - // highest explicit bucket. + // the value NaN, in which case Sum is also NaN), in which case we simply + // return NaN. + // See https://github.com/prometheus/prometheus/issues/16578 if count < rank { - return bucket.Upper + if math.IsNaN(h.Sum) { + return math.NaN(), annos.Add(annotations.NewNativeHistogramQuantileNaNResultInfo(metricName, pos)) + } + // This should not happen. Either NaNs are in the +Inf bucket (NHCB) and + // then count >= rank, or Sum is set to NaN. Might be a precision issue + // or something wrong with the histogram, fall back to returning the + // upper bound of the highest explicit bucket. + return bucket.Upper, annos } // NaN observations increase h.Count but not the total number of // observations in the buckets. Therefore, we have to use the forward - // iterator to find percentiles. We recognize histograms containing NaN - // observations by checking if their h.Sum is NaN. + // iterator to find percentiles. if math.IsNaN(h.Sum) || q < 0.5 { rank -= count - bucket.Count } else { rank = count - rank } + // We recognize histograms containing NaN observations by checking if their + // h.Sum is NaN and total number of observations is higher than the h.Count. + // If the latter is lost in precision, then the skew isn't worth reporting + // anyway. If the number is not greater, then the histogram observed -Inf + // and +Inf at some point, which made the Sum == NaN. + if math.IsNaN(h.Sum) { + // Detect if h.Count is greater than sum of buckets. + for it.Next() { + bucket = it.At() + count += bucket.Count + } + if count < h.Count { + annos.Add(annotations.NewNativeHistogramQuantileNaNSkewInfo(metricName, pos)) + } + } // The fraction of how far we are into the current bucket. fraction := rank / bucket.Count @@ -292,7 +326,7 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 { // Return linear interpolation for custom buckets and for quantiles that // end up in the zero bucket. if h.UsesCustomBuckets() || (bucket.Lower <= 0 && bucket.Upper >= 0) { - return bucket.Lower + (bucket.Upper-bucket.Lower)*fraction + return bucket.Lower + (bucket.Upper-bucket.Lower)*fraction, annos } // For exponential buckets, we interpolate on a logarithmic scale. On a @@ -305,10 +339,10 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 { logLower := math.Log2(math.Abs(bucket.Lower)) logUpper := math.Log2(math.Abs(bucket.Upper)) if bucket.Lower > 0 { // Positive bucket. - return math.Exp2(logLower + (logUpper-logLower)*fraction) + return math.Exp2(logLower + (logUpper-logLower)*fraction), annos } // Otherwise, we are in a negative bucket and have to mirror things. - return -math.Exp2(logUpper + (logLower-logUpper)*(1-fraction)) + return -math.Exp2(logUpper + (logLower-logUpper)*(1-fraction)), annos } // HistogramFraction calculates the fraction of observations between the @@ -343,23 +377,29 @@ func HistogramQuantile(q float64, h *histogram.FloatHistogram) float64 { // // If lower >= upper and the histogram has at least 1 observation, zero is returned. // +// If the histogram has NaN observations, these are not considered in any bucket +// thus histogram_fraction(-Inf, +Inf, v) might be less than 1.0. The function +// returns an info level annotation in this case. +// // HistogramFraction is exported as it may be used by other PromQL engine // implementations. -func HistogramFraction(lower, upper float64, h *histogram.FloatHistogram) float64 { +func HistogramFraction(lower, upper float64, h *histogram.FloatHistogram, metricName string, pos posrange.PositionRange) (float64, annotations.Annotations) { if h.Count == 0 || math.IsNaN(lower) || math.IsNaN(upper) { - return math.NaN() + return math.NaN(), nil } if lower >= upper { - return 0 + return 0, nil } var ( - rank, lowerRank, upperRank float64 - lowerSet, upperSet bool - it = h.AllBucketIterator() + count, rank, lowerRank, upperRank float64 + lowerSet, upperSet bool + it = h.AllBucketIterator() + annos annotations.Annotations ) for it.Next() { b := it.At() + count += b.Count zeroBucket := false // interpolateLinearly is used for custom buckets to be @@ -438,14 +478,28 @@ func HistogramFraction(lower, upper float64, h *histogram.FloatHistogram) float6 } rank += b.Count } - if !lowerSet || lowerRank > h.Count { - lowerRank = h.Count - } - if !upperSet || upperRank > h.Count { - upperRank = h.Count + if math.IsNaN(h.Sum) { + // There might be NaN observations, so we need to adjust + // the count to only include non `NaN` observations. + for it.Next() { + b := it.At() + count += b.Count + } + if count < h.Count { + annos.Add(annotations.NewNativeHistogramFractionNaNsInfo(metricName, pos)) + } + } else { + count = h.Count } - return (upperRank - lowerRank) / h.Count + if !lowerSet || lowerRank > count { + lowerRank = count + } + if !upperSet || upperRank > count { + upperRank = count + } + + return (upperRank - lowerRank) / h.Count, annos } // BucketFraction is a version of HistogramFraction for classic histograms. diff --git a/util/annotations/annotations.go b/util/annotations/annotations.go index 1b3f9e2ff2..5da360b69a 100644 --- a/util/annotations/annotations.go +++ b/util/annotations/annotations.go @@ -152,6 +152,9 @@ var ( IncompatibleTypesInBinOpInfo = fmt.Errorf("%w: incompatible sample types encountered for binary operator", PromQLInfo) HistogramIgnoredInAggregationInfo = fmt.Errorf("%w: ignored histogram in", PromQLInfo) HistogramIgnoredInMixedRangeInfo = fmt.Errorf("%w: ignored histograms in a range containing both floats and histograms for metric name", PromQLInfo) + NativeHistogramQuantileNaNResultInfo = fmt.Errorf("%w: input to histogram_quantile has NaN observations, result is NaN for metric name", PromQLInfo) + NativeHistogramQuantileNaNSkewInfo = fmt.Errorf("%w: input to histogram_quantile has NaN observations, result is skewed higher for metric name", PromQLInfo) + NativeHistogramFractionNaNsInfo = fmt.Errorf("%w: input to histogram_fraction has NaN observations, which are excluded from all fractions for metric name", PromQLInfo) ) type annoErr struct { @@ -324,3 +327,24 @@ func NewIncompatibleBucketLayoutInBinOpWarning(operator string, pos posrange.Pos Err: fmt.Errorf("%w %s", IncompatibleBucketLayoutInBinOpWarning, operator), } } + +func NewNativeHistogramQuantileNaNResultInfo(metricName string, pos posrange.PositionRange) error { + return annoErr{ + PositionRange: pos, + Err: fmt.Errorf("%w %q", NativeHistogramQuantileNaNResultInfo, metricName), + } +} + +func NewNativeHistogramQuantileNaNSkewInfo(metricName string, pos posrange.PositionRange) error { + return annoErr{ + PositionRange: pos, + Err: fmt.Errorf("%w %q", NativeHistogramQuantileNaNSkewInfo, metricName), + } +} + +func NewNativeHistogramFractionNaNsInfo(metricName string, pos posrange.PositionRange) error { + return annoErr{ + PositionRange: pos, + Err: fmt.Errorf("%w %q", NativeHistogramFractionNaNsInfo, metricName), + } +}