Merge pull request #18402 from roidelapluie/roidelapluie/strutil_subsequence

util/strutil: add subsequence matching implementation
2026-04-20 22:41:05 +08:00 · 2026-04-14 16:55:56 +02:00
parent 12de1243c0
commit 0b067888c7
2 changed files with 574 additions and 0 deletions
--- a/util/strutil/subsequence.go
+++ b/util/strutil/subsequence.go
@@ -0,0 +1,272 @@
+// Copyright The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The scoring algorithm is inspired by two JavaScript libraries:
+// https://github.com/Nexucis/fuzzy (MIT License), used by the Prometheus UI,
+// which itself was inspired by https://github.com/mattyork/fuzzy (MIT License).
+
+package strutil
+
+import "strings"
+
+// SubsequenceMatcher pre-computes the encoding of a fixed search pattern so
+// that it can be scored against many candidate strings without repeating the
+// ASCII check or rune conversion on the pattern for every call. For an ASCII
+// pattern, the rune slice is built lazily by the first Score call whose text
+// is not pure ASCII; because of that write it is not safe for concurrent use.
+type SubsequenceMatcher struct {
+	pattern      string
+	patternLen   int    // byte length; used for the pre-check len(pattern) > len(text)
+	patternASCII bool   // whether pattern is pure ASCII
+	patternRunes []rune // pre-converted runes; set when !patternASCII or on first Unicode text
+}
+
+// NewSubsequenceMatcher returns a matcher for pattern; a non-ASCII pattern is converted to runes here.
+func NewSubsequenceMatcher(pattern string) *SubsequenceMatcher {
+	if isASCII(pattern) {
+		return &SubsequenceMatcher{pattern: pattern, patternLen: len(pattern), patternASCII: true}
+	}
+	return &SubsequenceMatcher{pattern: pattern, patternLen: len(pattern), patternRunes: []rune(pattern)}
+}
+
+// Score computes a fuzzy match score between the matcher's pattern and text
+// using a greedy character matching algorithm. Characters in pattern must
+// appear in text in order (subsequence matching).
+// The score is normalized to [0.0, 1.0] where:
+//   - 1.0 means an exact match (an empty pattern also scores 1.0 for any text).
+//   - 0.0 means no match (pattern is not a subsequence of text).
+//   - Intermediate values reward consecutive matches and penalize gaps.
+//
+// This is a simple scorer for autocomplete ranking. It does not try every
+// possible match, so it may miss the best score when the pattern can match the
+// text in more than one way.
+//
+// The raw scoring formula is: Σ(interval_size²) − Σ(gap_size / text_length) − trailing_gap / (2 * text_length).
+// The result is normalized by pattern_length² (the maximum possible raw score).
+func (m *SubsequenceMatcher) Score(text string) float64 {
+	if m.pattern == "" {
+		return 1.0
+	}
+	if text == "" {
+		return 0.0
+	}
+
+	// Exact match: perfect score, checked before any allocation.
+	if m.pattern == text {
+		return 1.0
+	}
+
+	// In valid UTF-8 a subsequence never has more bytes than its text, so exit early before any allocation.
+	if m.patternLen > len(text) {
+		return 0.0
+	}
+
+	// For ASCII strings, use the string-native path that avoids []rune conversion.
+	// If pattern has non-ASCII runes but text is pure ASCII, no non-ASCII
+	// pattern rune can ever match, so the pattern cannot be a subsequence.
+	textASCII := isASCII(text)
+	switch {
+	case m.patternASCII && textASCII:
+		return matchSubsequenceString(m.pattern, text)
+	case !m.patternASCII && textASCII:
+		return 0.0
+	}
+	if m.patternRunes == nil {
+		// pattern is ASCII but text is Unicode; convert and cache pattern runes.
+		m.patternRunes = []rune(m.pattern)
+	}
+	return matchSubsequenceRunes(m.patternRunes, []rune(text))
+}
+
+// isASCII reports whether s contains only ASCII characters (every rune is below 0x80).
+func isASCII(s string) bool {
+	for _, c := range s {
+		if c >= 0x80 {
+			return false
+		}
+	}
+	return true
+}
+
+// matchSubsequenceString is the string-native implementation of the scoring
+// algorithm for ASCII inputs; pattern must be non-empty (pattern[0] is read).
+// It scans with strings.IndexByte and multiplies by a precomputed 1/textLen.
+func matchSubsequenceString(pattern, text string) float64 {
+	patternLen := len(pattern)
+	textLen := len(text)
+	invTextLen := 1.0 / float64(textLen)
+	maxStart := textLen - patternLen
+
+	// scoreFrom greedily scores a match anchored at startPos (the caller guarantees
+	// text[startPos] == pattern[0]); it returns false if the pattern cannot finish.
+	scoreFrom := func(startPos int) (float64, bool) {
+		i := startPos
+		from := i
+		to := i
+		patternIdx := 1
+		i++
+		// Extend the initial consecutive run.
+		for patternIdx < patternLen && i < textLen && text[i] == pattern[patternIdx] {
+			to = i
+			patternIdx++
+			i++
+		}
+		var score float64
+		if from > 0 {
+			score -= float64(from) * invTextLen
+		}
+		size := to - from + 1
+		score += float64(size * size)
+		prevTo := to
+
+		for patternIdx < patternLen {
+			// Jump to the next occurrence of pattern[patternIdx].
+			j := strings.IndexByte(text[i:], pattern[patternIdx])
+			if j < 0 {
+				return 0, false
+			}
+			i += j
+			from = i
+			to = i
+			patternIdx++
+			i++
+			// Extend the consecutive run.
+			for patternIdx < patternLen && i < textLen && text[i] == pattern[patternIdx] {
+				to = i
+				patternIdx++
+				i++
+			}
+			if gap := from - prevTo - 1; gap > 0 {
+				score -= float64(gap) * invTextLen
+			}
+			size = to - from + 1
+			score += float64(size * size)
+			prevTo = to
+		}
+
+		// Penalize unmatched trailing characters at half the leading/inner gap rate.
+		if trailing := textLen - 1 - prevTo; trailing > 0 {
+			score -= float64(trailing) * invTextLen * 0.5
+		}
+		return score, true
+	}
+
+	bestScore := -1.0
+	for i := 0; i <= maxStart; {
+		// Scan for the first pattern character, no further than maxStart so the rest can still fit.
+		j := strings.IndexByte(text[i:maxStart+1], pattern[0])
+		if j < 0 {
+			break
+		}
+		i += j
+		s, matched := scoreFrom(i)
+		if !matched {
+			// If the pattern cannot be completed from i, no later start can
+			// succeed: text[i+1:] is a proper suffix of text[i:].
+			break
+		}
+		if s > bestScore {
+			bestScore = s
+		}
+		i++
+	}
+
+	if bestScore < 0 {
+		return 0.0
+	}
+	return bestScore / float64(patternLen*patternLen)
+}
+
+// matchSubsequenceRunes implements the scoring algorithm over pre-converted
+// rune slices for the Unicode path; patternSlice must be non-empty.
+func matchSubsequenceRunes(patternSlice, textSlice []rune) float64 {
+	patternLen := len(patternSlice)
+	textLen := len(textSlice)
+	invTextLen := 1.0 / float64(textLen)
+
+	// matchFromPos tries to match all pattern characters as a subsequence of
+	// text starting at startPos. Returns the raw score and true on success, or
+	// 0 and false if the pattern cannot be fully matched.
+	// The score is accumulated inline, tracking only prevTo, to avoid any allocation.
+	matchFromPos := func(startPos int) (float64, bool) {
+		patternIdx := 0
+		i := startPos
+		var score float64
+		prevTo := -1
+
+		for i < textLen && patternIdx < patternLen {
+			if textSlice[i] == patternSlice[patternIdx] {
+				from := i
+				to := i
+				patternIdx++
+				i++
+				for i < textLen && patternIdx < patternLen && textSlice[i] == patternSlice[patternIdx] {
+					to = i
+					patternIdx++
+					i++
+				}
+				var gapSize int
+				if prevTo < 0 {
+					gapSize = from
+				} else {
+					gapSize = from - prevTo - 1
+				}
+				if gapSize > 0 {
+					score -= float64(gapSize) * invTextLen
+				}
+				size := to - from + 1
+				score += float64(size * size)
+				prevTo = to
+			} else {
+				i++
+			}
+		}
+
+		if patternIdx < patternLen {
+			return 0, false
+		}
+
+		// Penalize unmatched trailing characters at half the leading/inner gap rate.
+		trailingGap := textLen - 1 - prevTo
+		if trailingGap > 0 {
+			score -= float64(trailingGap) * invTextLen * 0.5
+		}
+
+		return score, true
+	}
+
+	bestScore := -1.0
+	// Only iterate while there are enough characters left for the pattern to fit.
+	maxStart := textLen - patternLen
+	for i := 0; i <= maxStart; i++ {
+		if textSlice[i] != patternSlice[0] {
+			continue
+		}
+		s, matched := matchFromPos(i)
+		if !matched {
+			// If matching fails from this position, no later position can succeed
+			// since the remaining text is a proper suffix of the text scanned here.
+			break
+		}
+		if s > bestScore {
+			bestScore = s
+		}
+	}
+
+	if bestScore < 0 {
+		return 0.0
+	}
+
+	// Normalize by pattern_length² (the maximum possible raw score).
+	return bestScore / float64(patternLen*patternLen)
+}
--- a/util/strutil/subsequence_test.go
+++ b/util/strutil/subsequence_test.go
@@ -0,0 +1,302 @@
+// Copyright The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package strutil
+
+import (
+	"fmt"
+	"math"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+// generateNames returns n synthetic metric-style names for benchmarking.
+// Name i is prefixes[i%len(prefixes)] followed by "_" and i/len(prefixes), so the prefixes cycle.
+func generateNames(n int) []string {
+	names := make([]string, 0, n)
+	prefixes := []string{
+		"http_requests_total",
+		"process_cpu_seconds",
+		"go_goroutines",
+		"node_memory_bytes",
+		"prometheus_tsdb_head",
+		"grpc_server_handled",
+		"container_cpu_usage",
+		"kube_pod_status",
+		"up",
+		"scrape_duration_seconds",
+	}
+	for i := range n {
+		base := prefixes[i%len(prefixes)]
+		names = append(names, fmt.Sprintf("%s_%d", base, i/len(prefixes)))
+	}
+	return names
+}
+
+// generateNamesRunes returns n synthetic names for benchmarking the rune path.
+// Most prefixes contain a non-ASCII character, placed at varying positions;
+// three are pure ASCII. Name i is prefixes[i%len(prefixes)] plus "_" and
+// i/len(prefixes). The per-prefix notes refer to the query "éhttp_total".
+func generateNamesRunes(n int) []string {
+	names := make([]string, 0, n)
+	prefixes := []string{
+		"é_http_requests_total",   // é prepended; matches "éhttp_total"
+		"process_écu_seconds",     // é mid-name; no match
+		"go_goroutines",           // ASCII; no match
+		"énode_memory_bytes",      // é prepended, different root; no match
+		"prometheus_tsdb_head",    // ASCII; no match
+		"grpc_sérveur_handled",    // é mid-name; no match
+		"é_http_container_total",  // é prepended; matches "éhttp_total"
+		"kube_pod_status",         // ASCII; no match
+		"ñup",                     // other non-ASCII; no match
+		"scrape_duration_séconds", // é near end; no match
+	}
+	for i := range n {
+		base := prefixes[i%len(prefixes)]
+		names = append(names, fmt.Sprintf("%s_%d", base, i/len(prefixes)))
+	}
+	return names
+}
+
+func BenchmarkSubsequenceScoreString(b *testing.B) {
+	// A 10-character ASCII query; the generated names are ASCII too, so Score takes the string path.
+	const query = "http_total"
+
+	for _, n := range []int{1000, 2000, 10000, 100000, 1000000} {
+		names := generateNames(n)
+		b.Run(fmt.Sprintf("names=%d", n), func(b *testing.B) {
+			b.ReportAllocs()
+			m := NewSubsequenceMatcher(query)
+			for b.Loop() {
+				for _, name := range names {
+					m.Score(name)
+				}
+			}
+		})
+	}
+}
+
+func BenchmarkSubsequenceScoreRunes(b *testing.B) {
+	// Non-ASCII query: non-ASCII names take the rune path, while ASCII names return 0 early.
+	const query = "éhttp_total"
+
+	for _, n := range []int{1000, 2000, 10000, 100000, 1000000} {
+		names := generateNamesRunes(n)
+		b.Run(fmt.Sprintf("names=%d", n), func(b *testing.B) {
+			b.ReportAllocs()
+			m := NewSubsequenceMatcher(query)
+			for b.Loop() {
+				for _, name := range names {
+					m.Score(name)
+				}
+			}
+		})
+	}
+}
+
+func TestSubsequenceScore(t *testing.T) {
+	tests := []struct {
+		name      string
+		pattern   string
+		text      string
+		wantScore float64
+		wantZero  bool
+	}{
+		{
+			name:      "empty pattern",
+			pattern:   "",
+			text:      "anything",
+			wantScore: 1.0,
+		},
+		{
+			name:     "empty text",
+			pattern:  "abc",
+			text:     "",
+			wantZero: true,
+		},
+		{
+			name:      "exact match",
+			pattern:   "my awesome text",
+			text:      "my awesome text",
+			wantScore: 1.0,
+		},
+		{
+			name:    "prefix match",
+			pattern: "my",
+			text:    "my awesome text",
+			// intervals [0,1], leading=0, trailing=13, text length 15. raw = 4 - 13/30, normalized by 4.
+			wantScore: 107.0 / 120.0,
+		},
+		{
+			name:    "substring match",
+			pattern: "tex",
+			text:    "my awesome text",
+			// intervals [11,13], leading=11, trailing=1, text length 15. raw = 9 - 11/15 - 1/30, normalized by 9.
+			wantScore: 247.0 / 270.0,
+		},
+		{
+			name:    "fuzzy match picks best starting position",
+			pattern: "met",
+			text:    "my awesome text",
+			// Start at the second 'm': intervals [8,9] and [11,11], leading=8, inner gap=1, trailing=3. raw = 5 - 9/15 - 3/30, normalized by 9.
+			wantScore: 43.0 / 90.0,
+		},
+		{
+			name:      "prefers later position with better consecutive run",
+			pattern:   "bac",
+			text:      "babac",
+			wantScore: 43.0 / 45.0, // match at [2,4], leading gap=2, trailing=0. raw = 9 - 2/5, normalized by 9.
+		},
+		{
+			name:     "pattern longer than text",
+			pattern:  "abcd",
+			text:     "abc",
+			wantZero: true,
+		},
+		{
+			name:     "pattern longer in runes than multi-byte text",
+			pattern:  "abc",
+			text:     "éé",
+			wantZero: true,
+		},
+		{
+			name:     "non-ASCII pattern with ASCII text",
+			pattern:  "é",
+			text:     "ab",
+			wantZero: true,
+		},
+		{
+			name:     "no subsequence match",
+			pattern:  "xyz",
+			text:     "abc",
+			wantZero: true,
+		},
+		{
+			name:      "unicode exact match",
+			pattern:   "éàü",
+			text:      "éàü",
+			wantScore: 1.0,
+		},
+		{
+			name:    "unicode prefix match",
+			pattern: "éà",
+			text:    "éàü",
+			// intervals [0,1], leading=0, trailing=1, text length 3 runes. raw = 4 - 1/6, normalized by 4.
+			wantScore: 23.0 / 24.0,
+		},
+		{
+			name:     "unicode no match",
+			pattern:  "üé",
+			text:     "éàü",
+			wantZero: true,
+		},
+		{
+			name:     "unicode first char matches but pattern cannot complete",
+			pattern:  "éàx",
+			text:     "éàü",
+			wantZero: true,
+		},
+		{
+			name:    "unicode fuzzy match with gap between intervals",
+			pattern: "éü",
+			text:    "éàü",
+			// intervals [0,0] and [2,2], leading=0, inner gap=1, trailing=0, text length 3 runes.
+			// raw = 1 + 1 - 1/3, normalized by 4.
+			wantScore: 5.0 / 12.0,
+		},
+		{
+			name:    "mixed ascii and unicode",
+			pattern: "aé",
+			text:    "aéb",
+			// intervals [0,1], leading=0, trailing=1, text length 3 runes. raw = 4 - 1/6, normalized by 4.
+			wantScore: 23.0 / 24.0,
+		},
+		{
+			name: "unicode chars sharing leading utf-8 byte do not match",
+			// 'é' (U+00E9) encodes as [0xC3 0xA9] and 'ã' (U+00E3) as [0xC3 0xA3].
+			// They share the leading byte but must not be treated as equal.
+			pattern:  "é",
+			text:     "ã",
+			wantZero: true,
+		},
+		{
+			name:      "single char exact match",
+			pattern:   "a",
+			text:      "a",
+			wantScore: 1.0,
+		},
+		{
+			name:    "consecutive match with leading gap",
+			pattern: "oa",
+			text:    "goat",
+			// 'o'(1),'a'(2) form one interval [1,2], leading gap=1, trailing=1.
+			// raw = 2² - 1/4 - 1/8 = 29/8, normalized by 2² = 4.
+			wantScore: 29.0 / 32.0,
+		},
+		{
+			name:    "repeated chars use greedy match",
+			pattern: "abaa",
+			text:    "abbaa",
+			// Matches 'a'(0),'b'(1),'a'(3),'a'(4) as intervals [0,1] and [3,4].
+			// raw = 2² + 2² - 1/5, normalized by 4² = 16.
+			// A better match exists at 'a'(0),'b'(2),'a'(3),'a'(4), which would score 49/80,
+			// but this test documents the current greedy behavior.
+			wantScore: 39.0 / 80.0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := NewSubsequenceMatcher(tt.pattern).Score(tt.text)
+			if tt.wantZero {
+				require.Equal(t, 0.0, got)
+				return
+			}
+			require.InDelta(t, tt.wantScore, got, 1e-9)
+		})
+	}
+}
+
+func TestSubsequenceScoreProperties(t *testing.T) {
+	// A prefix match scores below 1.0 because of the trailing-gap penalty.
+	// "pro" in "prometheus": intervals [0,2], trailing=7. raw = 9 - 7/20, normalized by 9.
+	require.InDelta(t, 173.0/180.0, NewSubsequenceMatcher("pro").Score("prometheus"), 1e-9)
+
+	// Exact match always scores 1.0.
+	require.Equal(t, 1.0, NewSubsequenceMatcher("prometheus").Score("prometheus"))
+
+	// Score stays in [0, 1] and is never NaN for these sample cases.
+	cases := [][2]string{
+		{"abc", "xaxbxcx"},
+		{"z", "aaaaaz"},
+		{"ab", "ba"},
+		{"met", "my awesome text"},
+	}
+	for _, c := range cases {
+		score := NewSubsequenceMatcher(c[0]).Score(c[1])
+		require.True(t, score >= 0.0 && score <= 1.0,
+			"score %v out of range for pattern=%q text=%q", score, c[0], c[1])
+		require.False(t, math.IsNaN(score))
+	}
+
+	// A match at the start of the text scores higher than the same run at the end.
+	prefixScore := NewSubsequenceMatcher("abc").Score("abcdef")
+	suffixScore := NewSubsequenceMatcher("abc").Score("defabc")
+	require.Greater(t, prefixScore, suffixScore)
+
+	// Consecutive chars score higher than scattered.
+	consecutiveScore := NewSubsequenceMatcher("abc").Score("xabcx")
+	scatteredScore := NewSubsequenceMatcher("abc").Score("xaxbxcx")
+	require.Greater(t, consecutiveScore, scatteredScore)
+}