package valpass // via https://github.com/adrg/strutil, MIT licensed // Copyright (c) 2019-2023 Adrian-George Bostan. import ( "strings" ) // Levenshtein represents the Levenshtein metric for measuring the similarity // between sequences. // // For more information see https://en.wikipedia.org/wiki/Levenshtein_distance. type Levenshtein struct { // CaseSensitive specifies if the string comparison is case sensitive. CaseSensitive bool // InsertCost represents the Levenshtein cost of a character insertion. InsertCost int // InsertCost represents the Levenshtein cost of a character deletion. DeleteCost int // InsertCost represents the Levenshtein cost of a character substitution. ReplaceCost int } // NewLevenshtein returns a new Levenshtein string metric. // // Default options: // // CaseSensitive: true // InsertCost: 1 // DeleteCost: 1 // ReplaceCost: 1 func NewLevenshtein() *Levenshtein { return &Levenshtein{ CaseSensitive: true, InsertCost: 1, DeleteCost: 1, ReplaceCost: 1, } } // Compare returns the Levenshtein similarity of a and b. The returned // similarity is a number between 0 and 1. Larger similarity numbers indicate // closer matches. func (m *Levenshtein) Compare(a, b string) float64 { distance, maxLen := m.distance(a, b) return 1 - float64(distance)/float64(maxLen) } // Distance returns the Levenshtein distance between a and b. Lower distances // indicate closer matches. A distance of 0 means the strings are identical. func (m *Levenshtein) Distance(a, b string) int { distance, _ := m.distance(a, b) return distance } // Min returns the value of the smallest argument, // or 0 if no arguments are provided. func Min(args ...int) int { if len(args) == 0 { return 0 } if len(args) == 1 { return args[0] } min := args[0] for _, arg := range args[1:] { if min > arg { min = arg } } return min } // Max returns the value of the largest argument, // or 0 if no arguments are provided. func Max(args ...int) int { if len(args) == 0 { return 0 } if len(args) == 1 { return args[0] } max := args[0] for _, arg := range args[1:] { if max < arg { max = arg } } return max } func (m *Levenshtein) distance(a, b string) (int, int) { // Lower terms if case insensitive comparison is specified. if !m.CaseSensitive { a = strings.ToLower(a) b = strings.ToLower(b) } runesA, runesB := []rune(a), []rune(b) // Check if both terms are empty. lenA, lenB := len(runesA), len(runesB) if lenA == 0 && lenB == 0 { return 0, 0 } // Check if one of the terms is empty. maxLen := Max(lenA, lenB) if lenA == 0 { return m.InsertCost * lenB, maxLen } if lenB == 0 { return m.DeleteCost * lenA, maxLen } // Initialize cost slice. prevCol := make([]int, lenB+1) for i := 0; i <= lenB; i++ { prevCol[i] = i } // Calculate distance. col := make([]int, lenB+1) for i := 0; i < lenA; i++ { col[0] = i + 1 for j := 0; j < lenB; j++ { delCost := prevCol[j+1] + m.DeleteCost insCost := col[j] + m.InsertCost subCost := prevCol[j] if runesA[i] != runesB[j] { subCost += m.ReplaceCost } col[j+1] = Min(delCost, insCost, subCost) } col, prevCol = prevCol, col } return prevCol[lenB], maxLen }