mirror of
https://github.com/prometheus/prometheus
synced 2026-04-20 22:41:05 +08:00
Merge pull request #18402 from roidelapluie/roidelapluie/strutil_subsequence
util/strutil: add subsequence matching implementation
This commit is contained in:
272
util/strutil/subsequence.go
Normal file
272
util/strutil/subsequence.go
Normal file
@@ -0,0 +1,272 @@
|
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// The scoring algorithm is inspired by two JavaScript libraries:
|
||||
// https://github.com/Nexucis/fuzzy (MIT License), used by the Prometheus UI,
|
||||
// which itself was inspired by https://github.com/mattyork/fuzzy (MIT License).
|
||||
|
||||
package strutil
|
||||
|
||||
import "strings"
|
||||
|
||||
// SubsequenceMatcher scores many candidate strings against one fixed search
// pattern. The pattern's byte length and whether it is pure ASCII are computed
// once at construction, so they are not recomputed on every Score call. For an
// ASCII pattern the rune slice is only built, and then kept, the first time
// Score is given a non-ASCII candidate.
//
// Because Score may write that cached rune slice, a SubsequenceMatcher is not
// safe for concurrent use.
type SubsequenceMatcher struct {
	// pattern is the search pattern exactly as given to NewSubsequenceMatcher.
	pattern string
	// patternRunes is the pattern converted to runes. It is filled at
	// construction for a non-ASCII pattern and lazily by Score otherwise.
	patternRunes []rune
	// patternLen is len(pattern) in bytes; Score uses it to reject candidates
	// that are shorter than the pattern.
	patternLen int
	// patternASCII reports whether pattern contains only ASCII characters.
	patternASCII bool
}
|
||||
|
||||
// NewSubsequenceMatcher returns a matcher for the given pattern.
|
||||
func NewSubsequenceMatcher(pattern string) *SubsequenceMatcher {
|
||||
if isASCII(pattern) {
|
||||
return &SubsequenceMatcher{pattern: pattern, patternLen: len(pattern), patternASCII: true}
|
||||
}
|
||||
return &SubsequenceMatcher{pattern: pattern, patternLen: len(pattern), patternRunes: []rune(pattern)}
|
||||
}
|
||||
|
||||
// Score computes a fuzzy match score between the matcher's pattern and text
|
||||
// using a greedy character matching algorithm. Characters in pattern must
|
||||
// appear in text in order (subsequence matching).
|
||||
// The score is normalized to [0.0, 1.0] where:
|
||||
// - 1.0 means exact match only.
|
||||
// - 0.0 means no match (pattern is not a subsequence of text).
|
||||
// - Intermediate values reward consecutive matches and penalize gaps.
|
||||
//
|
||||
// This is a simple scorer for autocomplete ranking. It does not try every
|
||||
// possible match, so it may miss the best score when the pattern can match the
|
||||
// text in more than one way.
|
||||
//
|
||||
// The raw scoring formula is: Σ(interval_size²) − Σ(gap_size / text_length) − trailing_gap / (2 * text_length).
|
||||
// The result is normalized by pattern_length² (the maximum possible raw score).
|
||||
func (m *SubsequenceMatcher) Score(text string) float64 {
|
||||
if m.pattern == "" {
|
||||
return 1.0
|
||||
}
|
||||
if text == "" {
|
||||
return 0.0
|
||||
}
|
||||
|
||||
// Exact match: perfect score, checked before any allocation.
|
||||
if m.pattern == text {
|
||||
return 1.0
|
||||
}
|
||||
|
||||
// Byte length >= rune count, so this is a safe early exit before any allocation.
|
||||
if m.patternLen > len(text) {
|
||||
return 0.0
|
||||
}
|
||||
|
||||
// For ASCII strings, use the string-native path that avoids []rune conversion.
|
||||
// If pattern has non-ASCII runes but text is pure ASCII, no non-ASCII
|
||||
// pattern rune can ever match, so the pattern cannot be a subsequence.
|
||||
textASCII := isASCII(text)
|
||||
switch {
|
||||
case m.patternASCII && textASCII:
|
||||
return matchSubsequenceString(m.pattern, text)
|
||||
case !m.patternASCII && textASCII:
|
||||
return 0.0
|
||||
}
|
||||
if m.patternRunes == nil {
|
||||
// pattern is ASCII but text is Unicode; convert and cache pattern runes.
|
||||
m.patternRunes = []rune(m.pattern)
|
||||
}
|
||||
return matchSubsequenceRunes(m.patternRunes, []rune(text))
|
||||
}
|
||||
|
||||
// isASCII reports whether every byte of s is below 0x80, i.e. whether s is made
// up of ASCII characters only. The empty string counts as ASCII.
func isASCII(s string) bool {
	for i := 0; i < len(s); i++ {
		// Any byte with the high bit set belongs to a multi-byte (or invalid)
		// UTF-8 sequence, so s cannot be ASCII.
		if s[i] > 0x7f {
			return false
		}
	}
	return true
}
|
||||
|
||||
// matchSubsequenceString scores pattern against text working directly on the
// strings, one byte per character, so it is meant for ASCII inputs. Scanning
// for the next pattern character uses strings.IndexByte, and the per-character
// penalty 1/len(text) is computed once and multiplied in rather than dividing
// each time.
//
// Every position where pattern[0] occurs, and from which the pattern could
// still fit by length, is tried as a starting point. The best raw score among
// them is divided by len(pattern)² and returned; 0.0 is returned when the
// pattern is not a subsequence of text. pattern must not be empty.
func matchSubsequenceString(pattern, text string) float64 {
	var (
		pLen      = len(pattern)
		tLen      = len(text)
		perChar   = 1.0 / float64(tLen)
		lastStart = tLen - pLen
	)

	// rawScore greedily matches the whole pattern beginning at start and
	// returns the unnormalized score. The caller guarantees that
	// text[start] == pattern[0]. The second result is false when the pattern
	// cannot be completed from start.
	rawScore := func(start int) (float64, bool) {
		var total float64
		prevEnd := -1 // index of the last matched character of the previous run
		pos, matched := start, 0

		for matched < pLen {
			// Jump to the next occurrence of the pattern character we need.
			// On the first pass this is start itself.
			off := strings.IndexByte(text[pos:], pattern[matched])
			if off < 0 {
				return 0, false
			}
			runStart := pos + off
			matched++
			pos = runStart + 1

			// Keep consuming while text and pattern continue to agree.
			for matched < pLen && pos < tLen && text[pos] == pattern[matched] {
				matched++
				pos++
			}
			runEnd := pos - 1

			// Characters skipped before this run: the leading gap for the
			// first run, the inner gap for the others.
			if skipped := runStart - prevEnd - 1; skipped > 0 {
				total -= float64(skipped) * perChar
			}
			runLen := runEnd - runStart + 1
			total += float64(runLen * runLen)
			prevEnd = runEnd
		}

		// Unmatched characters after the last run cost half as much as
		// leading or inner ones.
		if tail := tLen - 1 - prevEnd; tail > 0 {
			total -= float64(tail) * perChar * 0.5
		}
		return total, true
	}

	best := -1.0
	start := 0
	for start <= lastStart {
		// Look for the first pattern character, but only where the rest of
		// the pattern could still fit.
		off := strings.IndexByte(text[start:lastStart+1], pattern[0])
		if off < 0 {
			break
		}
		start += off

		raw, ok := rawScore(start)
		if !ok {
			// A later start only sees a suffix of what this start saw, so it
			// cannot succeed either.
			break
		}
		if raw > best {
			best = raw
		}
		start++
	}

	if best < 0 {
		return 0.0
	}
	return best / float64(pLen*pLen)
}
|
||||
|
||||
// matchSubsequenceRunes scores patternSlice against textSlice, both already
// converted to runes. It is the path used when the text contains non-ASCII
// characters.
//
// Every position holding the first pattern rune, and from which the pattern
// could still fit by length, is tried as a starting point. The best raw score
// among them is divided by len(patternSlice)² and returned; 0.0 is returned
// when the pattern is not a subsequence of the text. patternSlice must not be
// empty.
func matchSubsequenceRunes(patternSlice, textSlice []rune) float64 {
	pLen, tLen := len(patternSlice), len(textSlice)
	perChar := 1.0 / float64(tLen)

	// rawScore greedily matches the whole pattern against the text from start
	// onwards and returns the unnormalized score, or 0 and false when some
	// pattern rune is left unmatched. The score is accumulated as the scan
	// proceeds, remembering only where the previous run ended, so nothing is
	// allocated.
	rawScore := func(start int) (float64, bool) {
		var total float64
		prevEnd := -1 // -1 until the first run has been recorded
		pos, matched := start, 0

		for pos < tLen && matched < pLen {
			if textSlice[pos] != patternSlice[matched] {
				pos++
				continue
			}

			// A run begins here; extend it while text and pattern agree.
			runStart := pos
			pos++
			matched++
			for pos < tLen && matched < pLen && textSlice[pos] == patternSlice[matched] {
				pos++
				matched++
			}
			runEnd := pos - 1

			// With prevEnd == -1 this is the number of leading characters
			// before the first run; afterwards it is the inner gap.
			if skipped := runStart - prevEnd - 1; skipped > 0 {
				total -= float64(skipped) * perChar
			}
			runLen := runEnd - runStart + 1
			total += float64(runLen * runLen)
			prevEnd = runEnd
		}

		if matched < pLen {
			return 0, false
		}

		// Unmatched characters after the last run cost half as much as
		// leading or inner ones.
		if tail := tLen - 1 - prevEnd; tail > 0 {
			total -= float64(tail) * perChar * 0.5
		}
		return total, true
	}

	best := -1.0
	// Starts beyond lastStart leave fewer runes than the pattern needs.
	lastStart := tLen - pLen
	for start := 0; start <= lastStart; start++ {
		if textSlice[start] == patternSlice[0] {
			raw, ok := rawScore(start)
			if !ok {
				// A later start only sees a suffix of what this start saw, so
				// it cannot succeed either.
				break
			}
			if raw > best {
				best = raw
			}
		}
	}

	if best < 0 {
		return 0.0
	}
	// pattern_length² is the largest raw score that can occur.
	return best / float64(pLen*pLen)
}
|
||||
302
util/strutil/subsequence_test.go
Normal file
302
util/strutil/subsequence_test.go
Normal file
@@ -0,0 +1,302 @@
|
||||
// Copyright The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package strutil
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// generateNames builds n synthetic, metric-like names for the benchmarks. It
// cycles through a fixed list of ten base names and appends "_<k>", where k
// counts how many full passes over the list came before. The base names are
// varied: some share characters with the benchmark query and some do not.
func generateNames(n int) []string {
	bases := [...]string{
		"http_requests_total",
		"process_cpu_seconds",
		"go_goroutines",
		"node_memory_bytes",
		"prometheus_tsdb_head",
		"grpc_server_handled",
		"container_cpu_usage",
		"kube_pod_status",
		"up",
		"scrape_duration_seconds",
	}
	out := make([]string, 0, n)
	for i := 0; i < n; i++ {
		slot, pass := i%len(bases), i/len(bases)
		out = append(out, fmt.Sprintf("%s_%d", bases[slot], pass))
	}
	return out
}
|
||||
|
||||
// generateNamesRunes builds n synthetic names for benchmarking the rune path.
// It cycles through a fixed list of ten base names and appends "_<k>", where k
// counts how many full passes over the list came before. Most base names carry
// a non-ASCII character, placed at different positions and not always the same
// one, so that the benchmark sees both candidates that match and candidates
// that do not.
func generateNamesRunes(n int) []string {
	bases := [...]string{
		"é_http_requests_total",   // leading é; matches "éhttp_total"
		"process_écu_seconds",     // é in the middle; no match
		"go_goroutines",           // pure ASCII; no match
		"énode_memory_bytes",      // leading é but a different root; no match
		"prometheus_tsdb_head",    // pure ASCII; no match
		"grpc_sérveur_handled",    // é in the middle; no match
		"é_http_container_total",  // leading é; matches "éhttp_total"
		"kube_pod_status",         // pure ASCII; no match
		"ñup",                     // a different non-ASCII rune; no match
		"scrape_duration_séconds", // é close to the end; no match
	}
	out := make([]string, 0, n)
	for i := 0; i < n; i++ {
		slot, pass := i%len(bases), i/len(bases)
		out = append(out, fmt.Sprintf("%s_%d", bases[slot], pass))
	}
	return out
}
|
||||
|
||||
func BenchmarkSubsequenceScoreString(b *testing.B) {
|
||||
// A 10-character query that partially matches many of the generated names.
|
||||
const query = "http_total"
|
||||
|
||||
for _, n := range []int{1000, 2000, 10000, 100000, 1000000} {
|
||||
names := generateNames(n)
|
||||
b.Run(fmt.Sprintf("names=%d", n), func(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
m := NewSubsequenceMatcher(query)
|
||||
for b.Loop() {
|
||||
for _, name := range names {
|
||||
m.Score(name)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSubsequenceScoreRunes(b *testing.B) {
|
||||
// Non-ASCII query to force the rune path; matches a subset of the generated names.
|
||||
const query = "éhttp_total"
|
||||
|
||||
for _, n := range []int{1000, 2000, 10000, 100000, 1000000} {
|
||||
names := generateNamesRunes(n)
|
||||
b.Run(fmt.Sprintf("names=%d", n), func(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
m := NewSubsequenceMatcher(query)
|
||||
for b.Loop() {
|
||||
for _, name := range names {
|
||||
m.Score(name)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSubsequenceScore(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
pattern string
|
||||
text string
|
||||
wantScore float64
|
||||
wantZero bool
|
||||
}{
|
||||
{
|
||||
name: "empty pattern",
|
||||
pattern: "",
|
||||
text: "anything",
|
||||
wantScore: 1.0,
|
||||
},
|
||||
{
|
||||
name: "empty text",
|
||||
pattern: "abc",
|
||||
text: "",
|
||||
wantZero: true,
|
||||
},
|
||||
{
|
||||
name: "exact match",
|
||||
pattern: "my awesome text",
|
||||
text: "my awesome text",
|
||||
wantScore: 1.0,
|
||||
},
|
||||
{
|
||||
name: "prefix match",
|
||||
pattern: "my",
|
||||
text: "my awesome text",
|
||||
// intervals [0,1], leading=0, trailing=13. raw = 4 - 13/30, normalized by 4.
|
||||
wantScore: 107.0 / 120.0,
|
||||
},
|
||||
{
|
||||
name: "substring match",
|
||||
pattern: "tex",
|
||||
text: "my awesome text",
|
||||
// intervals [11,13], leading=11, trailing=1. raw = 9 - 11/15 - 1/30, normalized by 9.
|
||||
wantScore: 247.0 / 270.0,
|
||||
},
|
||||
{
|
||||
name: "fuzzy match picks best starting position",
|
||||
pattern: "met",
|
||||
text: "my awesome text",
|
||||
// intervals [8,9] and [11,11], leading=8, inner gap=1, trailing=3. raw = 5 - 9/15 - 3/30, normalized by 9.
|
||||
wantScore: 43.0 / 90.0,
|
||||
},
|
||||
{
|
||||
name: "prefers later position with better consecutive run",
|
||||
pattern: "bac",
|
||||
text: "babac",
|
||||
wantScore: 43.0 / 45.0, // match at [2,4], leading gap=2, trailing=0. raw = 9 - 2/5, normalized by 9.
|
||||
},
|
||||
{
|
||||
name: "pattern longer than text",
|
||||
pattern: "abcd",
|
||||
text: "abc",
|
||||
wantZero: true,
|
||||
},
|
||||
{
|
||||
name: "pattern longer in runes than multi-byte text",
|
||||
pattern: "abc",
|
||||
text: "éé",
|
||||
wantZero: true,
|
||||
},
|
||||
{
|
||||
name: "non-ASCII pattern with ASCII text",
|
||||
pattern: "é",
|
||||
text: "ab",
|
||||
wantZero: true,
|
||||
},
|
||||
{
|
||||
name: "no subsequence match",
|
||||
pattern: "xyz",
|
||||
text: "abc",
|
||||
wantZero: true,
|
||||
},
|
||||
{
|
||||
name: "unicode exact match",
|
||||
pattern: "éàü",
|
||||
text: "éàü",
|
||||
wantScore: 1.0,
|
||||
},
|
||||
{
|
||||
name: "unicode prefix match",
|
||||
pattern: "éà",
|
||||
text: "éàü",
|
||||
// intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4.
|
||||
wantScore: 23.0 / 24.0,
|
||||
},
|
||||
{
|
||||
name: "unicode no match",
|
||||
pattern: "üé",
|
||||
text: "éàü",
|
||||
wantZero: true,
|
||||
},
|
||||
{
|
||||
name: "unicode first char matches but pattern cannot complete",
|
||||
pattern: "éàx",
|
||||
text: "éàü",
|
||||
wantZero: true,
|
||||
},
|
||||
{
|
||||
name: "unicode fuzzy match with gap between intervals",
|
||||
pattern: "éü",
|
||||
text: "éàü",
|
||||
// intervals [0,0] and [2,2], leading=0, inner gap=1, trailing=0.
|
||||
// raw = 1 + 1 - 1/3, normalized by 4.
|
||||
wantScore: 5.0 / 12.0,
|
||||
},
|
||||
{
|
||||
name: "mixed ascii and unicode",
|
||||
pattern: "aé",
|
||||
text: "aéb",
|
||||
// intervals [0,1], leading=0, trailing=1. raw = 4 - 1/6, normalized by 4.
|
||||
wantScore: 23.0 / 24.0,
|
||||
},
|
||||
{
|
||||
name: "unicode chars sharing leading utf-8 byte do not match",
|
||||
// 'é' (U+00E9) encodes as [0xC3 0xA9] and 'ã' (U+00E3) as [0xC3 0xA3].
|
||||
// They share the leading byte but must not be treated as equal.
|
||||
pattern: "é",
|
||||
text: "ã",
|
||||
wantZero: true,
|
||||
},
|
||||
{
|
||||
name: "single char exact match",
|
||||
pattern: "a",
|
||||
text: "a",
|
||||
wantScore: 1.0,
|
||||
},
|
||||
{
|
||||
name: "consecutive match with leading gap",
|
||||
pattern: "oa",
|
||||
text: "goat",
|
||||
// 'o'(1),'a'(2) form one interval [1,2], leading gap=1, trailing=1.
|
||||
// raw = 2² - 1/4 - 1/8 = 29/8, normalized by 2² = 4.
|
||||
wantScore: 29.0 / 32.0,
|
||||
},
|
||||
{
|
||||
name: "repeated chars use greedy match",
|
||||
pattern: "abaa",
|
||||
text: "abbaa",
|
||||
// Matches 'a'(0),'b'(1),'a'(3),'a'(4) as intervals [0,1] and [3,4].
|
||||
// raw = 2² + 2² - 1/5, normalized by 4² = 16.
|
||||
// A better match exists at 'a'(0),'b'(2),'a'(3),'a'(4), which would score 49/80,
|
||||
// but this test documents the current greedy behavior.
|
||||
wantScore: 39.0 / 80.0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
got := NewSubsequenceMatcher(tt.pattern).Score(tt.text)
|
||||
if tt.wantZero {
|
||||
require.Equal(t, 0.0, got)
|
||||
return
|
||||
}
|
||||
require.InDelta(t, tt.wantScore, got, 1e-9)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSubsequenceScoreProperties(t *testing.T) {
|
||||
// Prefix match scores below 1.0; only exact match scores 1.0.
|
||||
// "pro" in "prometheus": intervals [0,2], trailing=7. raw = 9 - 7/20, normalized by 9.
|
||||
require.InDelta(t, 173.0/180.0, NewSubsequenceMatcher("pro").Score("prometheus"), 1e-9)
|
||||
|
||||
// Exact match always scores 1.0.
|
||||
require.Equal(t, 1.0, NewSubsequenceMatcher("prometheus").Score("prometheus"))
|
||||
|
||||
// Score is always in [0, 1].
|
||||
cases := [][2]string{
|
||||
{"abc", "xaxbxcx"},
|
||||
{"z", "aaaaaz"},
|
||||
{"ab", "ba"},
|
||||
{"met", "my awesome text"},
|
||||
}
|
||||
for _, c := range cases {
|
||||
score := NewSubsequenceMatcher(c[0]).Score(c[1])
|
||||
require.True(t, score >= 0.0 && score <= 1.0,
|
||||
"score %v out of range for pattern=%q text=%q", score, c[0], c[1])
|
||||
require.False(t, math.IsNaN(score))
|
||||
}
|
||||
|
||||
// Prefix scores higher than non-prefix substring.
|
||||
prefixScore := NewSubsequenceMatcher("abc").Score("abcdef")
|
||||
suffixScore := NewSubsequenceMatcher("abc").Score("defabc")
|
||||
require.Greater(t, prefixScore, suffixScore)
|
||||
|
||||
// Consecutive chars score higher than scattered.
|
||||
consecutiveScore := NewSubsequenceMatcher("abc").Score("xabcx")
|
||||
scatteredScore := NewSubsequenceMatcher("abc").Score("xaxbxcx")
|
||||
require.Greater(t, consecutiveScore, scatteredScore)
|
||||
}
|
||||
Reference in New Issue
Block a user