// The scoring algorithm is inspired by two JavaScript libraries:
// https://github.com/Nexucis/fuzzy (MIT License), used by the Prometheus UI,
// which itself was inspired by https://github.com/mattyork/fuzzy (MIT License).

// SubsequenceMatcher pre-computes the encoding of a fixed search pattern so
// that it can be scored against many candidate strings without repeating the
// ASCII check or rune conversion on the pattern for every call. The first
// Score call with a non-ASCII candidate lazily caches the pattern's rune
// slice, so Score may write to the matcher.
// It is not safe for concurrent use.
type SubsequenceMatcher struct {
	pattern      string
	patternLen   int    // byte length; used for the pre-check len(pattern) > len(text)
	patternASCII bool   // whether pattern is pure ASCII
	patternRunes []rune // pre-converted runes; set when !patternASCII or on first non-ASCII text
}

// NewSubsequenceMatcher returns a matcher for the given pattern.
// The rune form of the pattern is computed up front only when the pattern
// contains non-ASCII bytes; for ASCII patterns it is deferred until Score
// first sees a non-ASCII text.
func NewSubsequenceMatcher(pattern string) *SubsequenceMatcher {
	m := &SubsequenceMatcher{
		pattern:      pattern,
		patternLen:   len(pattern),
		patternASCII: isASCII(pattern),
	}
	if !m.patternASCII {
		m.patternRunes = decodeRunes(pattern)
	}
	return m
}

// Score computes a fuzzy match score between the matcher's pattern and text
// using a greedy character matching algorithm. Characters in pattern must
// appear in text in order (subsequence matching).
// The score is normalized to [0.0, 1.0] where:
//   - 1.0 means exact match only (an empty pattern also scores 1.0).
//   - 0.0 means no match (pattern is not a subsequence of text).
//   - Intermediate values reward consecutive matches and penalize gaps.
//
// This is a simple scorer for autocomplete ranking. It does not try every
// possible match, so it may miss the best score when the pattern can match the
// text in more than one way.
//
// Bytes that are not valid UTF-8 are compared by value: an invalid byte only
// matches the identical byte, never a different invalid byte and never a
// genuine U+FFFD replacement character.
//
// The raw scoring formula is: Σ(interval_size²) − Σ(gap_size / text_length) − trailing_gap / (2 * text_length).
// The result is normalized by pattern_length² (the maximum possible raw score).
func (m *SubsequenceMatcher) Score(text string) float64 {
	if m.pattern == "" {
		return 1.0
	}
	if text == "" {
		return 0.0
	}

	// Exact match: perfect score, checked before any allocation.
	if m.pattern == text {
		return 1.0
	}

	// Every matched character occupies the same bytes in pattern and text, so a
	// subsequence can never be longer in bytes than the text containing it.
	// This makes the byte-length comparison a safe early exit before any
	// allocation, for ASCII and non-ASCII input alike.
	if m.patternLen > len(text) {
		return 0.0
	}

	// For ASCII strings, use the string-native path that avoids rune conversion.
	// If pattern has non-ASCII runes but text is pure ASCII, no non-ASCII
	// pattern rune can ever match, so the pattern cannot be a subsequence.
	textASCII := isASCII(text)
	switch {
	case m.patternASCII && textASCII:
		return matchSubsequenceString(m.pattern, text)
	case !m.patternASCII && textASCII:
		return 0.0
	}
	if m.patternRunes == nil {
		// pattern is ASCII but text is not; convert and cache pattern runes.
		m.patternRunes = decodeRunes(m.pattern)
	}
	return matchSubsequenceRunes(m.patternRunes, decodeRunes(text))
}

// isASCII reports whether s contains only ASCII characters.
// Every byte of a multi-byte UTF-8 sequence, and every invalid byte, is
// >= 0x80, so a plain byte scan gives the same answer as decoding runes.
func isASCII(s string) bool {
	const runeSelf = 0x80 // first byte value that is not a single-byte (ASCII) character
	for i := 0; i < len(s); i++ {
		if s[i] >= runeSelf {
			return false
		}
	}
	return true
}

// decodeRunes converts s to a rune slice with one element per decoded
// character, like []rune(s), except for bytes that are not valid UTF-8.
// []rune(s) turns each such byte into U+FFFD, which would make any two invalid
// bytes (and a real U+FFFD) compare equal. Here every invalid byte b is mapped
// to the negative sentinel -1-b instead: distinct bytes get distinct values and
// none of them collides with a valid rune.
func decodeRunes(s string) []rune {
	const replacement = "\uFFFD"
	// len(s) is an upper bound on the rune count, so append never reallocates.
	runes := make([]rune, 0, len(s))
	for i, r := range s {
		// range yields U+FFFD both for a genuine replacement character and for
		// an invalid byte (advancing one byte); tell them apart by the encoding.
		if r == '\uFFFD' && !strings.HasPrefix(s[i:], replacement) {
			r = -1 - rune(s[i])
		}
		runes = append(runes, r)
	}
	return runes
}

// matchSubsequenceString is the string-native implementation of the scoring
// algorithm for ASCII inputs. It uses strings.IndexByte for character scanning,
// with divisions by textLen replaced by a precomputed reciprocal multiply.
// The caller must pass a non-empty pattern that is no longer than text.
// It returns the normalized score, or 0.0 if pattern is not a subsequence.
func matchSubsequenceString(pattern, text string) float64 {
	patternLen := len(pattern)
	textLen := len(text)
	invTextLen := 1.0 / float64(textLen)
	maxStart := textLen - patternLen

	// scoreFrom scores a match starting at startPos, where
	// text[startPos] == pattern[0] is guaranteed by the caller.
	// It returns the raw (unnormalized) score and true, or 0 and false if the
	// rest of the pattern cannot be found after startPos.
	scoreFrom := func(startPos int) (float64, bool) {
		i := startPos
		from := i
		to := i
		patternIdx := 1
		i++
		// Extend the initial consecutive run.
		for patternIdx < patternLen && i < textLen && text[i] == pattern[patternIdx] {
			to = i
			patternIdx++
			i++
		}
		var score float64
		// Leading gap: characters of text before the first matched one.
		if from > 0 {
			score -= float64(from) * invTextLen
		}
		size := to - from + 1
		score += float64(size * size)
		prevTo := to

		for patternIdx < patternLen {
			// Jump to the next occurrence of pattern[patternIdx].
			j := strings.IndexByte(text[i:], pattern[patternIdx])
			if j < 0 {
				return 0, false
			}
			i += j
			from = i
			to = i
			patternIdx++
			i++
			// Extend the consecutive run.
			for patternIdx < patternLen && i < textLen && text[i] == pattern[patternIdx] {
				to = i
				patternIdx++
				i++
			}
			if gap := from - prevTo - 1; gap > 0 {
				score -= float64(gap) * invTextLen
			}
			size = to - from + 1
			score += float64(size * size)
			prevTo = to
		}

		// Penalise unmatched trailing characters at half the leading/inner rate.
		if trailing := textLen - 1 - prevTo; trailing > 0 {
			score -= float64(trailing) * invTextLen * 0.5
		}
		return score, true
	}

	// A successful raw score is always positive (at least one interval of size
	// >= 1, total penalties below 1), so -1 safely marks "no match yet".
	bestScore := -1.0
	for i := 0; i <= maxStart; {
		// Scan for the first pattern character; starts beyond maxStart leave
		// too little text for the whole pattern.
		j := strings.IndexByte(text[i:maxStart+1], pattern[0])
		if j < 0 {
			break
		}
		i += j
		s, matched := scoreFrom(i)
		if !matched {
			// If the pattern cannot be completed from i, no later start can
			// succeed: text[i+1:] is a suffix of text[i:].
			break
		}
		if s > bestScore {
			bestScore = s
		}
		i++
	}

	if bestScore < 0 {
		return 0.0
	}
	// Normalize by pattern_length² (the maximum possible raw score).
	return bestScore / float64(patternLen*patternLen)
}

// matchSubsequenceRunes implements the scoring algorithm over pre-converted
// rune slices for the Unicode path. Both slices must be non-empty.
// It returns the normalized score, or 0.0 if patternSlice is not a
// subsequence of textSlice (including when it is longer than textSlice).
func matchSubsequenceRunes(patternSlice, textSlice []rune) float64 {
	patternLen := len(patternSlice)
	textLen := len(textSlice)
	invTextLen := 1.0 / float64(textLen)

	// matchFromPos tries to match all pattern characters as a subsequence of
	// text starting at startPos. Returns the raw score and true on success, or
	// 0 and false if the pattern cannot be fully matched.
	// The score is accumulated inline, tracking only prevTo, to avoid any allocation.
	matchFromPos := func(startPos int) (float64, bool) {
		patternIdx := 0
		i := startPos
		var score float64
		prevTo := -1 // end of the previous matched interval; -1 before the first one

		for i < textLen && patternIdx < patternLen {
			if textSlice[i] == patternSlice[patternIdx] {
				from := i
				to := i
				patternIdx++
				i++
				// Extend the consecutive run.
				for i < textLen && patternIdx < patternLen && textSlice[i] == patternSlice[patternIdx] {
					to = i
					patternIdx++
					i++
				}
				var gapSize int
				if prevTo < 0 {
					// Leading gap: everything before the first interval.
					gapSize = from
				} else {
					gapSize = from - prevTo - 1
				}
				if gapSize > 0 {
					score -= float64(gapSize) * invTextLen
				}
				size := to - from + 1
				score += float64(size * size)
				prevTo = to
			} else {
				i++
			}
		}

		if patternIdx < patternLen {
			return 0, false
		}

		// Penalize unmatched trailing characters at half the leading/inner gap rate.
		trailingGap := textLen - 1 - prevTo
		if trailingGap > 0 {
			score -= float64(trailingGap) * invTextLen * 0.5
		}

		return score, true
	}

	// A successful raw score is always positive, so -1 marks "no match yet".
	bestScore := -1.0
	// Only iterate while there are enough characters left for the pattern to fit.
	maxStart := textLen - patternLen
	for i := 0; i <= maxStart; i++ {
		if textSlice[i] != patternSlice[0] {
			continue
		}
		s, matched := matchFromPos(i)
		if !matched {
			// If matching fails from this position, no later position can succeed
			// since the remaining text is a suffix of what was just searched.
			break
		}
		if s > bestScore {
			bestScore = s
		}
	}

	if bestScore < 0 {
		return 0.0
	}

	// Normalize by pattern_length² (the maximum possible raw score).
	return bestScore / float64(patternLen*patternLen)
}
// generateNames builds n synthetic, metric-style names for benchmarks.
// It cycles through a fixed list of ten base names and appends "_<k>", where k
// is the number of full passes made through the list so far, so every name is
// unique. Only some of the base names contain the characters of the benchmark
// query, which gives a mix of matching and non-matching candidates.
func generateNames(n int) []string {
	stems := [...]string{
		"http_requests_total",
		"process_cpu_seconds",
		"go_goroutines",
		"node_memory_bytes",
		"prometheus_tsdb_head",
		"grpc_server_handled",
		"container_cpu_usage",
		"kube_pod_status",
		"up",
		"scrape_duration_seconds",
	}
	out := make([]string, n)
	for idx := range out {
		stem, round := stems[idx%len(stems)], idx/len(stems)
		out[idx] = fmt.Sprintf("%s_%d", stem, round)
	}
	return out
}

// generateNamesRunes returns n synthetic names containing non-ASCII characters
// for benchmarking the rune path. Non-ASCII characters are inserted at varying
// positions and with varying characters across prefixes, so the benchmark
// exercises both matching and non-matching rune paths.
func generateNamesRunes(n int) []string {
	names := make([]string, 0, n)
	// The match/no-match notes below refer to the benchmark query "éhttp_total".
	prefixes := []string{
		"é_http_requests_total",   // é prepended; matches "éhttp_total"
		"process_écu_seconds",     // é mid-name, but no 'h' follows it; no match
		"go_goroutines",           // ASCII; a non-ASCII pattern can never match
		"énode_memory_bytes",      // é prepended, different root (no 'h'); no match
		"prometheus_tsdb_head",    // ASCII; a non-ASCII pattern can never match
		"grpc_sérveur_handled",    // é mid-name; 'h' follows but no 't' after it; no match
		"é_http_container_total",  // é prepended; matches "éhttp_total"
		"kube_pod_status",         // ASCII; a non-ASCII pattern can never match
		"ñup",                     // other non-ASCII; contains no é; no match
		"scrape_duration_séconds", // é near end, no 'h' follows it; no match
	}
	for i := range n {
		base := prefixes[i%len(prefixes)]
		// The suffix is the number of completed passes through prefixes, which
		// keeps every generated name unique.
		names = append(names, fmt.Sprintf("%s_%d", base, i/len(prefixes)))
	}
	return names
}

// BenchmarkSubsequenceScoreString measures Score on the ASCII (string-native)
// path, with one sub-benchmark per candidate-list size. Names and the matcher
// are built outside the b.Loop body, so only the scoring calls are timed.
func BenchmarkSubsequenceScoreString(b *testing.B) {
	// A 10-character ASCII query. Of the generateNames base names it is a
	// subsequence of the "http_requests_total" ones only, so the run covers
	// both matching and non-matching candidates.
	const query = "http_total"

	for _, n := range []int{1000, 2000, 10000, 100000, 1000000} {
		names := generateNames(n)
		b.Run(fmt.Sprintf("names=%d", n), func(b *testing.B) {
			b.ReportAllocs()
			m := NewSubsequenceMatcher(query)
			for b.Loop() {
				for _, name := range names {
					m.Score(name)
				}
			}
		})
	}
}

// BenchmarkSubsequenceScoreRunes measures Score with a non-ASCII pattern, with
// one sub-benchmark per candidate-list size. Names and the matcher are built
// outside the b.Loop body, so only the scoring calls are timed.
func BenchmarkSubsequenceScoreRunes(b *testing.B) {
	// Non-ASCII query to force the rune path for non-ASCII names; it matches
	// only the two "é_http_..." prefixes of generateNamesRunes.
	const query = "éhttp_total"

	for _, n := range []int{1000, 2000, 10000, 100000, 1000000} {
		names := generateNamesRunes(n)
		b.Run(fmt.Sprintf("names=%d", n), func(b *testing.B) {
			b.ReportAllocs()
			m := NewSubsequenceMatcher(query)
			for b.Loop() {
				for _, name := range names {
					m.Score(name)
				}
			}
		})
	}
}

// TestSubsequenceScore is a table-driven test of Score. Each case either
// expects an exact zero (wantZero) or a score within 1e-9 of wantScore.
// The comments on the cases derive wantScore from the raw formula:
// Σ(interval_size²) − Σ(gap / text_length) − trailing / (2 * text_length),
// divided by pattern_length². Intervals are inclusive [from,to] positions.
func TestSubsequenceScore(t *testing.T) {
	tests := []struct {
		name      string
		pattern   string
		text      string
		wantScore float64 // expected score; ignored when wantZero is set
		wantZero  bool    // expect exactly 0.0 (no match)
	}{
		{
			name:      "empty pattern",
			pattern:   "",
			text:      "anything",
			wantScore: 1.0,
		},
		{
			name:     "empty text",
			pattern:  "abc",
			text:     "",
			wantZero: true,
		},
		{
			name:      "exact match",
			pattern:   "my awesome text",
			text:      "my awesome text",
			wantScore: 1.0,
		},
		{
			name:    "prefix match",
			pattern: "my",
			text:    "my awesome text",
			// intervals [0,1], leading=0, trailing=13. raw = 4 - 13/30, normalized by 4.
			wantScore: 107.0 / 120.0,
		},
		{
			name:    "substring match",
			pattern: "tex",
			text:    "my awesome text",
			// intervals [11,13], leading=11, trailing=1. raw = 9 - 11/15 - 1/30, normalized by 9.
			wantScore: 247.0 / 270.0,
		},
		{
			name:    "fuzzy match picks best starting position",
			pattern: "met",
			text:    "my awesome text",
			// Starting at the second 'm' (index 8) beats starting at index 0.
			// intervals [8,9] and [11,11], leading=8, inner gap=1, trailing=3. raw = 5 - 9/15 - 3/30, normalized by 9.
			wantScore: 43.0 / 90.0,
		},
		{
			name:      "prefers later position with better consecutive run",
			pattern:   "bac",
			text:      "babac",
			wantScore: 43.0 / 45.0, // match at [2,4], leading gap=2, trailing=0. raw = 9 - 2/5, normalized by 9.
		},
		{
			name:     "pattern longer than text",
			pattern:  "abcd",
			text:     "abc",
			wantZero: true,
		},
		{
			// The pattern is shorter than the text in bytes (3 < 4) but longer
			// in runes (3 > 2), so the byte-length pre-check does not reject it.
			name:     "pattern longer in runes than multi-byte text",
			pattern:  "abc",
			text:     "éé",
			wantZero: true,
		},
		{
			name:     "non-ASCII pattern with ASCII text",
			pattern:  "é",
			text:     "ab",
			wantZero: true,
		},
		{
			name:     "no subsequence match",
			pattern:  "xyz",
			text:     "abc",
			wantZero: true,
		},
		{
			name:      "unicode exact match",
			pattern:   "éàü",
			text:      "éàü",
			wantScore: 1.0,
		},
		{
			name:    "unicode prefix match",
			pattern: "éà",
			text:    "éàü",
			// Positions and lengths are counted in runes, so text length is 3.
			// intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4.
			wantScore: 23.0 / 24.0,
		},
		{
			name:     "unicode no match",
			pattern:  "üé",
			text:     "éàü",
			wantZero: true,
		},
		{
			name:     "unicode first char matches but pattern cannot complete",
			pattern:  "éàx",
			text:     "éàü",
			wantZero: true,
		},
		{
			name:    "unicode fuzzy match with gap between intervals",
			pattern: "éü",
			text:    "éàü",
			// intervals [0,0] and [2,2], leading=0, inner gap=1, trailing=0.
			// raw = 1 + 1 - 1/3, normalized by 4.
			wantScore: 5.0 / 12.0,
		},
		{
			name:    "mixed ascii and unicode",
			pattern: "aé",
			text:    "aéb",
			// intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4.
			wantScore: 23.0 / 24.0,
		},
		{
			name: "unicode chars sharing leading utf-8 byte do not match",
			// 'é' (U+00E9) encodes as [0xC3 0xA9] and 'ã' (U+00E3) as [0xC3 0xA3].
			// They share the leading byte but must not be treated as equal.
			pattern:  "é",
			text:     "ã",
			wantZero: true,
		},
		{
			name:      "single char exact match",
			pattern:   "a",
			text:      "a",
			wantScore: 1.0,
		},
		{
			name:    "consecutive match with leading gap",
			pattern: "oa",
			text:    "goat",
			// 'o'(1),'a'(2) form one interval [1,2], leading gap=1, trailing=1.
			// raw = 2² - 1/4 - 1/8 = 29/8, normalized by 2² = 4.
			wantScore: 29.0 / 32.0,
		},
		{
			name:    "repeated chars use greedy match",
			pattern: "abaa",
			text:    "abbaa",
			// Matches 'a'(0),'b'(1),'a'(3),'a'(4) as intervals [0,1] and [3,4].
			// raw = 2² + 2² - 1/5, normalized by 4² = 16.
			// A better match exists at 'a'(0),'b'(2),'a'(3),'a'(4), which would score 49/80,
			// but this test documents the current greedy behavior.
			wantScore: 39.0 / 80.0,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// A fresh matcher per case, so no cached state carries over between cases.
			got := NewSubsequenceMatcher(tt.pattern).Score(tt.text)
			if tt.wantZero {
				// No-match must be exactly zero, not merely close to it.
				require.Equal(t, 0.0, got)
				return
			}
			require.InDelta(t, tt.wantScore, got, 1e-9)
		})
	}
}

// TestSubsequenceScoreProperties checks general properties of Score rather
// than individual values: only an exact match reaches 1.0, scores stay within
// [0, 1] and are never NaN, a prefix match outranks a later substring match,
// and a consecutive match outranks a scattered one.
func TestSubsequenceScoreProperties(t *testing.T) {
	// Prefix match scores below 1.0; only exact match scores 1.0.
	// "pro" in "prometheus": intervals [0,2], trailing=7. raw = 9 - 7/20, normalized by 9.
	require.InDelta(t, 173.0/180.0, NewSubsequenceMatcher("pro").Score("prometheus"), 1e-9)

	// Exact match always scores 1.0.
	require.Equal(t, 1.0, NewSubsequenceMatcher("prometheus").Score("prometheus"))

	// Score is always in [0, 1]. The cases cover a scattered match, a match
	// at the very end of the text, a non-match, and a multi-interval match.
	cases := [][2]string{
		{"abc", "xaxbxcx"},
		{"z", "aaaaaz"},
		{"ab", "ba"},
		{"met", "my awesome text"},
	}
	for _, c := range cases {
		score := NewSubsequenceMatcher(c[0]).Score(c[1])
		require.True(t, score >= 0.0 && score <= 1.0,
			"score %v out of range for pattern=%q text=%q", score, c[0], c[1])
		require.False(t, math.IsNaN(score))
	}

	// Prefix scores higher than non-prefix substring: trailing characters are
	// penalized at half the rate of leading ones.
	prefixScore := NewSubsequenceMatcher("abc").Score("abcdef")
	suffixScore := NewSubsequenceMatcher("abc").Score("defabc")
	require.Greater(t, prefixScore, suffixScore)

	// Consecutive chars score higher than scattered.
	consecutiveScore := NewSubsequenceMatcher("abc").Score("xabcx")
	scatteredScore := NewSubsequenceMatcher("abc").Score("xaxbxcx")
	require.Greater(t, consecutiveScore, scatteredScore)
}