Files
prometheus/util/strutil/subsequence.go
Julien 0b067888c7 Merge pull request #18402 from roidelapluie/roidelapluie/strutil_subsequence
util/strutil: add subsequence matching implementation
2026-04-14 16:55:56 +02:00

273 lines
8.0 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The scoring algorithm is inspired by two JavaScript libraries:
// https://github.com/Nexucis/fuzzy (MIT License), used by the Prometheus UI,
// which itself was inspired by https://github.com/mattyork/fuzzy (MIT License).
package strutil
import "strings"
// SubsequenceMatcher pre-computes the encoding of a fixed search pattern so
// that it can be scored against many candidate strings without repeating the
// ASCII check or rune conversion on the pattern for every call. The first
// Score call with a Unicode candidate lazily caches the pattern's rune slice.
// It is not safe for concurrent use.
type SubsequenceMatcher struct {
pattern string
patternLen int // byte length; used for the pre-check len(pattern) > len(text)
patternASCII bool // whether pattern is pure ASCII
patternRunes []rune // pre-converted runes; set when !patternASCII or on first Unicode text
}
// NewSubsequenceMatcher returns a matcher for the given pattern.
func NewSubsequenceMatcher(pattern string) *SubsequenceMatcher {
if isASCII(pattern) {
return &SubsequenceMatcher{pattern: pattern, patternLen: len(pattern), patternASCII: true}
}
return &SubsequenceMatcher{pattern: pattern, patternLen: len(pattern), patternRunes: []rune(pattern)}
}
// Score computes a fuzzy match score between the matcher's pattern and text
// using a greedy character matching algorithm. Characters in pattern must
// appear in text in order (subsequence matching).
// The score is normalized to [0.0, 1.0] where:
// - 1.0 means exact match only.
// - 0.0 means no match (pattern is not a subsequence of text).
// - Intermediate values reward consecutive matches and penalize gaps.
//
// This is a simple scorer for autocomplete ranking. It does not try every
// possible match, so it may miss the best score when the pattern can match the
// text in more than one way.
//
// The raw scoring formula is: Σ(interval_size²) Σ(gap_size / text_length) trailing_gap / (2 * text_length).
// The result is normalized by pattern_length² (the maximum possible raw score).
func (m *SubsequenceMatcher) Score(text string) float64 {
if m.pattern == "" {
return 1.0
}
if text == "" {
return 0.0
}
// Exact match: perfect score, checked before any allocation.
if m.pattern == text {
return 1.0
}
// Byte length >= rune count, so this is a safe early exit before any allocation.
if m.patternLen > len(text) {
return 0.0
}
// For ASCII strings, use the string-native path that avoids []rune conversion.
// If pattern has non-ASCII runes but text is pure ASCII, no non-ASCII
// pattern rune can ever match, so the pattern cannot be a subsequence.
textASCII := isASCII(text)
switch {
case m.patternASCII && textASCII:
return matchSubsequenceString(m.pattern, text)
case !m.patternASCII && textASCII:
return 0.0
}
if m.patternRunes == nil {
// pattern is ASCII but text is Unicode; convert and cache pattern runes.
m.patternRunes = []rune(m.pattern)
}
return matchSubsequenceRunes(m.patternRunes, []rune(text))
}
// isASCII reports whether s contains only ASCII characters.
func isASCII(s string) bool {
for _, c := range s {
if c >= 0x80 {
return false
}
}
return true
}
// matchSubsequenceString is the string-native implementation of the scoring
// algorithm for ASCII inputs. It uses strings.IndexByte for character scanning,
// with divisions by textLen replaced by a precomputed reciprocal multiply.
func matchSubsequenceString(pattern, text string) float64 {
patternLen := len(pattern)
textLen := len(text)
invTextLen := 1.0 / float64(textLen)
maxStart := textLen - patternLen
// scoreFrom scores a match starting at startPos, where
// text[startPos] == pattern[0] is guaranteed by the caller.
scoreFrom := func(startPos int) (float64, bool) {
i := startPos
from := i
to := i
patternIdx := 1
i++
// Extend the initial consecutive run.
for patternIdx < patternLen && i < textLen && text[i] == pattern[patternIdx] {
to = i
patternIdx++
i++
}
var score float64
if from > 0 {
score -= float64(from) * invTextLen
}
size := to - from + 1
score += float64(size * size)
prevTo := to
for patternIdx < patternLen {
// Jump to the next occurrence of pattern[patternIdx].
j := strings.IndexByte(text[i:], pattern[patternIdx])
if j < 0 {
return 0, false
}
i += j
from = i
to = i
patternIdx++
i++
// Extend the consecutive run.
for patternIdx < patternLen && i < textLen && text[i] == pattern[patternIdx] {
to = i
patternIdx++
i++
}
if gap := from - prevTo - 1; gap > 0 {
score -= float64(gap) * invTextLen
}
size = to - from + 1
score += float64(size * size)
prevTo = to
}
// Penalise unmatched trailing characters at half the leading/inner rate.
if trailing := textLen - 1 - prevTo; trailing > 0 {
score -= float64(trailing) * invTextLen * 0.5
}
return score, true
}
bestScore := -1.0
for i := 0; i <= maxStart; {
// Scan for the first pattern character.
j := strings.IndexByte(text[i:maxStart+1], pattern[0])
if j < 0 {
break
}
i += j
s, matched := scoreFrom(i)
if !matched {
// If the pattern cannot be completed from i, no later start can
// succeed: text[i+1:] is a strict subset of text[i:].
break
}
if s > bestScore {
bestScore = s
}
i++
}
if bestScore < 0 {
return 0.0
}
return bestScore / float64(patternLen*patternLen)
}
// matchSubsequenceRunes implements the scoring algorithm over pre-converted
// rune slices for the Unicode path.
func matchSubsequenceRunes(patternSlice, textSlice []rune) float64 {
patternLen := len(patternSlice)
textLen := len(textSlice)
invTextLen := 1.0 / float64(textLen)
// matchFromPos tries to match all pattern characters as a subsequence of
// text starting at startPos. Returns the raw score and true on success, or
// 0 and false if the pattern cannot be fully matched.
// The score is accumulated inline, tracking only prevTo, to avoid any allocation.
matchFromPos := func(startPos int) (float64, bool) {
patternIdx := 0
i := startPos
var score float64
prevTo := -1
for i < textLen && patternIdx < patternLen {
if textSlice[i] == patternSlice[patternIdx] {
from := i
to := i
patternIdx++
i++
for i < textLen && patternIdx < patternLen && textSlice[i] == patternSlice[patternIdx] {
to = i
patternIdx++
i++
}
var gapSize int
if prevTo < 0 {
gapSize = from
} else {
gapSize = from - prevTo - 1
}
if gapSize > 0 {
score -= float64(gapSize) * invTextLen
}
size := to - from + 1
score += float64(size * size)
prevTo = to
} else {
i++
}
}
if patternIdx < patternLen {
return 0, false
}
// Penalize unmatched trailing characters at half the leading/inner gap rate.
trailingGap := textLen - 1 - prevTo
if trailingGap > 0 {
score -= float64(trailingGap) * invTextLen * 0.5
}
return score, true
}
bestScore := -1.0
// Only iterate while there are enough characters left for the pattern to fit.
maxStart := textLen - patternLen
for i := 0; i <= maxStart; i++ {
if textSlice[i] != patternSlice[0] {
continue
}
s, matched := matchFromPos(i)
if !matched {
// If matching fails from this position, no later position can succeed
// since the remaining text is a strict subset.
break
}
if s > bestScore {
bestScore = s
}
}
if bestScore < 0 {
return 0.0
}
// Normalize by pattern_length² (the maximum possible raw score).
return bestScore / float64(patternLen*patternLen)
}