prometheus/util/strutil/subsequence.go

// Copyright The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// The scoring algorithm is inspired by two JavaScript libraries:
// https://github.com/Nexucis/fuzzy (MIT License), used by the Prometheus UI,
// which itself was inspired by https://github.com/mattyork/fuzzy (MIT License).

package strutil

import "strings"

// SubsequenceMatcher pre-computes the encoding of a fixed search pattern so
// that it can be scored against many candidate strings without repeating the
// ASCII check or rune conversion on the pattern for every call. The first
// Score call with a Unicode candidate lazily caches the pattern's rune slice.
// It is not safe for concurrent use.
type SubsequenceMatcher struct {
	pattern      string
	patternLen   int    // byte length; used for the pre-check len(pattern) > len(text)
	patternASCII bool   // whether pattern is pure ASCII
	patternRunes []rune // pre-converted runes; set when !patternASCII or on first Unicode text
}

// NewSubsequenceMatcher returns a matcher for the given pattern.
func NewSubsequenceMatcher(pattern string) *SubsequenceMatcher {
	if isASCII(pattern) {
		return &SubsequenceMatcher{pattern: pattern, patternLen: len(pattern), patternASCII: true}
	}
	return &SubsequenceMatcher{pattern: pattern, patternLen: len(pattern), patternRunes: []rune(pattern)}
}

// Score computes a fuzzy match score between the matcher's pattern and text
// using a greedy character matching algorithm. Characters in pattern must
// appear in text in order (subsequence matching).
// The score is normalized to [0.0, 1.0] where:
//   - 1.0 means exact match only.
//   - 0.0 means no match (pattern is not a subsequence of text).
//   - Intermediate values reward consecutive matches and penalize gaps.
//
// This is a simple scorer for autocomplete ranking. It does not try every
// possible match, so it may miss the best score when the pattern can match the
// text in more than one way.
//
// The raw scoring formula is: Σ(interval_size²) − Σ(gap_size / text_length) − trailing_gap / (2 * text_length).
// The result is normalized by pattern_length² (the maximum possible raw score).
func (m *SubsequenceMatcher) Score(text string) float64 {
	if m.pattern == "" {
		return 1.0
	}
	if text == "" {
		return 0.0
	}

	// Exact match: perfect score, checked before any allocation.
	if m.pattern == text {
		return 1.0
	}

	// Byte length >= rune count, so this is a safe early exit before any allocation.
	if m.patternLen > len(text) {
		return 0.0
	}

	// For ASCII strings, use the string-native path that avoids []rune conversion.
	// If pattern has non-ASCII runes but text is pure ASCII, no non-ASCII
	// pattern rune can ever match, so the pattern cannot be a subsequence.
	textASCII := isASCII(text)
	switch {
	case m.patternASCII && textASCII:
		return matchSubsequenceString(m.pattern, text)
	case !m.patternASCII && textASCII:
		return 0.0
	}
	if m.patternRunes == nil {
		// pattern is ASCII but text is Unicode; convert and cache pattern runes.
		m.patternRunes = []rune(m.pattern)
	}
	return matchSubsequenceRunes(m.patternRunes, []rune(text))
}

// isASCII reports whether s contains only ASCII characters.
func isASCII(s string) bool {
	for _, c := range s {
		if c >= 0x80 {
			return false
		}
	}
	return true
}

// matchSubsequenceString is the string-native implementation of the scoring
// algorithm for ASCII inputs. It uses strings.IndexByte for character scanning,
// with divisions by textLen replaced by a precomputed reciprocal multiply.
func matchSubsequenceString(pattern, text string) float64 {
	patternLen := len(pattern)
	textLen := len(text)
	invTextLen := 1.0 / float64(textLen)
	maxStart := textLen - patternLen

	// scoreFrom scores a match starting at startPos, where
	// text[startPos] == pattern[0] is guaranteed by the caller.
	scoreFrom := func(startPos int) (float64, bool) {
		i := startPos
		from := i
		to := i
		patternIdx := 1
		i++
		// Extend the initial consecutive run.
		for patternIdx < patternLen && i < textLen && text[i] == pattern[patternIdx] {
			to = i
			patternIdx++
			i++
		}
		var score float64
		if from > 0 {
			score -= float64(from) * invTextLen
		}
		size := to - from + 1
		score += float64(size * size)
		prevTo := to

		for patternIdx < patternLen {
			// Jump to the next occurrence of pattern[patternIdx].
			j := strings.IndexByte(text[i:], pattern[patternIdx])
			if j < 0 {
				return 0, false
			}
			i += j
			from = i
			to = i
			patternIdx++
			i++
			// Extend the consecutive run.
			for patternIdx < patternLen && i < textLen && text[i] == pattern[patternIdx] {
				to = i
				patternIdx++
				i++
			}
			if gap := from - prevTo - 1; gap > 0 {
				score -= float64(gap) * invTextLen
			}
			size = to - from + 1
			score += float64(size * size)
			prevTo = to
		}

		// Penalise unmatched trailing characters at half the leading/inner rate.
		if trailing := textLen - 1 - prevTo; trailing > 0 {
			score -= float64(trailing) * invTextLen * 0.5
		}
		return score, true
	}

	bestScore := -1.0
	for i := 0; i <= maxStart; {
		// Scan for the first pattern character.
		j := strings.IndexByte(text[i:maxStart+1], pattern[0])
		if j < 0 {
			break
		}
		i += j
		s, matched := scoreFrom(i)
		if !matched {
			// If the pattern cannot be completed from i, no later start can
			// succeed: text[i+1:] is a strict subset of text[i:].
			break
		}
		if s > bestScore {
			bestScore = s
		}
		i++
	}

	if bestScore < 0 {
		return 0.0
	}
	return bestScore / float64(patternLen*patternLen)
}

// matchSubsequenceRunes implements the scoring algorithm over pre-converted
// rune slices for the Unicode path.
func matchSubsequenceRunes(patternSlice, textSlice []rune) float64 {
	patternLen := len(patternSlice)
	textLen := len(textSlice)
	invTextLen := 1.0 / float64(textLen)

	// matchFromPos tries to match all pattern characters as a subsequence of
	// text starting at startPos. Returns the raw score and true on success, or
	// 0 and false if the pattern cannot be fully matched.
	// The score is accumulated inline, tracking only prevTo, to avoid any allocation.
	matchFromPos := func(startPos int) (float64, bool) {
		patternIdx := 0
		i := startPos
		var score float64
		prevTo := -1

		for i < textLen && patternIdx < patternLen {
			if textSlice[i] == patternSlice[patternIdx] {
				from := i
				to := i
				patternIdx++
				i++
				for i < textLen && patternIdx < patternLen && textSlice[i] == patternSlice[patternIdx] {
					to = i
					patternIdx++
					i++
				}
				var gapSize int
				if prevTo < 0 {
					gapSize = from
				} else {
					gapSize = from - prevTo - 1
				}
				if gapSize > 0 {
					score -= float64(gapSize) * invTextLen
				}
				size := to - from + 1
				score += float64(size * size)
				prevTo = to
			} else {
				i++
			}
		}

		if patternIdx < patternLen {
			return 0, false
		}

		// Penalize unmatched trailing characters at half the leading/inner gap rate.
		trailingGap := textLen - 1 - prevTo
		if trailingGap > 0 {
			score -= float64(trailingGap) * invTextLen * 0.5
		}

		return score, true
	}

	bestScore := -1.0
	// Only iterate while there are enough characters left for the pattern to fit.
	maxStart := textLen - patternLen
	for i := 0; i <= maxStart; i++ {
		if textSlice[i] != patternSlice[0] {
			continue
		}
		s, matched := matchFromPos(i)
		if !matched {
			// If matching fails from this position, no later position can succeed
			// since the remaining text is a strict subset.
			break
		}
		if s > bestScore {
			bestScore = s
		}
	}

	if bestScore < 0 {
		return 0.0
	}

	// Normalize by pattern_length² (the maximum possible raw score).
	return bestScore / float64(patternLen*patternLen)
}