add fuzzy testing

2026-07-09 19:24:20 +02:00 · 2024-11-18 08:15:40 +01:00
parent f267b9da22
commit 340bb59796
2 changed files with 149 additions and 285 deletions
--- a/levenshtein.go
+++ b/levenshtein.go
@@ -0,0 +1,149 @@
+package valpass
+
+// via https://github.com/adrg/strutil, MIT licensed
+// Copyright (c) 2019-2023 Adrian-George Bostan.
+
+import (
+	"strings"
+)
+
+// Levenshtein represents the Levenshtein metric for measuring the similarity
+// between sequences.
+//
+// For more information see https://en.wikipedia.org/wiki/Levenshtein_distance.
+type Levenshtein struct {
+	// CaseSensitive specifies if the string comparison is case sensitive.
+	CaseSensitive bool
+
+	// InsertCost represents the Levenshtein cost of a character insertion.
+	InsertCost int
+
+	// DeleteCost represents the Levenshtein cost of a character deletion.
+	DeleteCost int
+
+	// ReplaceCost represents the Levenshtein cost of a character substitution.
+	ReplaceCost int
+}
+
+// NewLevenshtein builds a Levenshtein string metric preconfigured with the
+// default settings.
+//
+// Defaults:
+//
+//	CaseSensitive: true
+//	InsertCost: 1
+//	DeleteCost: 1
+//	ReplaceCost: 1
+func NewLevenshtein() *Levenshtein {
+	metric := new(Levenshtein)
+	metric.CaseSensitive = true
+	metric.InsertCost = 1
+	metric.DeleteCost = 1
+	metric.ReplaceCost = 1
+
+	return metric
+}
+
+// Compare returns the Levenshtein similarity of a and b, computed as
+// 1 - distance/maxLen, where maxLen is the rune length of the longer term.
+// Larger similarity numbers indicate closer matches; 1 means the distance
+// is 0. With the default unit costs the result lies between 0 and 1; custom
+// costs greater than 1 can push it below 0.
+//
+// Two empty terms are reported as a perfect match (1).
+func (m *Levenshtein) Compare(a, b string) float64 {
+	distance, maxLen := m.distance(a, b)
+	if maxLen == 0 {
+		// Both terms are empty: 0/0 would evaluate to NaN, so short-circuit.
+		return 1
+	}
+
+	return 1 - float64(distance)/float64(maxLen)
+}
+
+// Distance reports the Levenshtein distance between a and b, weighted by the
+// configured insert, delete and replace costs. Smaller values indicate
+// closer matches.
+func (m *Levenshtein) Distance(a, b string) int {
+	// The second result (rune length of the longer term) is only needed for
+	// normalization and is dropped here.
+	cost, _ := m.distance(a, b)
+
+	return cost
+}
+
+// Min reports the smallest of the given integers. Calling it without
+// arguments yields 0.
+func Min(args ...int) int {
+	if len(args) == 0 {
+		return 0
+	}
+
+	// Seed with the first value and scan the rest by index.
+	smallest := args[0]
+	for i := 1; i < len(args); i++ {
+		if args[i] < smallest {
+			smallest = args[i]
+		}
+	}
+
+	return smallest
+}
+
+// Max reports the largest of the given integers. Calling it without
+// arguments yields 0.
+func Max(args ...int) int {
+	if len(args) == 0 {
+		return 0
+	}
+
+	// Seed with the first value and scan the rest by index.
+	largest := args[0]
+	for i := 1; i < len(args); i++ {
+		if args[i] > largest {
+			largest = args[i]
+		}
+	}
+
+	return largest
+}
+
+// distance computes the weighted Levenshtein distance needed to turn a into
+// b, using a two-row dynamic programming table. It returns the distance
+// together with the rune length of the longer term, which Compare uses to
+// normalize the result. When CaseSensitive is false, both terms are
+// lowercased before comparison. Two empty terms yield (0, 0).
+func (m *Levenshtein) distance(a, b string) (int, int) {
+	// Lower terms if case insensitive comparison is specified.
+	if !m.CaseSensitive {
+		a = strings.ToLower(a)
+		b = strings.ToLower(b)
+	}
+	runesA, runesB := []rune(a), []rune(b)
+
+	// Check if both terms are empty.
+	lenA, lenB := len(runesA), len(runesB)
+	if lenA == 0 && lenB == 0 {
+		return 0, 0
+	}
+
+	// Check if one of the terms is empty.
+	maxLen := Max(lenA, lenB)
+	if lenA == 0 {
+		return m.InsertCost * lenB, maxLen
+	}
+	if lenB == 0 {
+		return m.DeleteCost * lenA, maxLen
+	}
+
+	// Initialize the first row: turning an empty prefix of a into the first
+	// j runes of b takes j insertions, so the base case must be scaled by
+	// InsertCost to agree with the empty-term shortcut above.
+	prevCol := make([]int, lenB+1)
+	for j := range prevCol {
+		prevCol[j] = j * m.InsertCost
+	}
+
+	// Calculate distance.
+	col := make([]int, lenB+1)
+	for i := 0; i < lenA; i++ {
+		// Turning the first i+1 runes of a into an empty prefix of b takes
+		// i+1 deletions, scaled by DeleteCost for the same reason.
+		col[0] = (i + 1) * m.DeleteCost
+		for j := 0; j < lenB; j++ {
+			delCost := prevCol[j+1] + m.DeleteCost
+			insCost := col[j] + m.InsertCost
+
+			// Matching runes carry the diagonal cost over unchanged.
+			subCost := prevCol[j]
+			if runesA[i] != runesB[j] {
+				subCost += m.ReplaceCost
+			}
+
+			col[j+1] = Min(delCost, insCost, subCost)
+		}
+
+		// Reuse the two rows: the row just filled becomes the previous one.
+		col, prevCol = prevCol, col
+	}
+
+	return prevCol[lenB], maxLen
+}
--- a/lib.bak
+++ b/lib.bak
@@ -1,285 +0,0 @@
-package valpass
-
-import (
-	"bytes"
-	"compress/flate"
-	"fmt"
-	"math"
-	"strings"
-)
-
-/*
- * Contains the raw  dictionary data and some flags.  Must be provided
- * by the user
- */
-type Dictionary struct {
-	Words    []string // the actual dictionary
-	Submatch bool     // if true 'foo' would match 'foobar'
-}
-
-/*
- * Options define how to operate the validation
- */
-type Options struct {
-	Compress         int      // minimum compression rate in percent
-	CharDistribution float64  // minimum char distribution in percent
-	Entropy          float64  // minimum entropy value in bits/char
-	Dictionary       []string // if set, lookup given dictionary, the caller provides it
-	UTF8             bool     // if true work on unicode utf-8 space, not just bytes
-}
-
-/*
- * Default validation config, a compromise of comfort and security, as always.
- */
-const (
-	MIN_ENTROPY  float64 = 3.0
-	MIN_COMPRESS int     = 10
-	MIN_DICT     bool    = false
-	MIN_DIST     float64 = 10.0
-	MAX_UTF8     int     = 2164864 // max characters encodable with utf8
-	MAX_CHARS    int     = 95      // maximum printable US ASCII chars
-	MIN_DICT_LEN int     = 5000
-
-	//  we start  our ascii  arrays  at char(32),  so to  have max  95
-	// elements in the slice, we subtract 32 from each ascii code
-	MIN_ASCII int = 32
-)
-
-type Result struct {
-	Ok bool
-	Options
-}
-
-func Validate(passphrase string, opts ...Options) (Result, error) {
-	result := Result{Ok: true}
-	options := Options{
-		MIN_COMPRESS,
-		MIN_DIST,
-		MIN_ENTROPY,
-		nil,
-		false,
-	}
-
-	if len(opts) == 1 {
-		options = opts[0]
-	}
-
-	if options.Entropy > 0 {
-		var entropy float64
-		var err error
-
-		switch options.UTF8 {
-		case true:
-			entropy, err = GetEntropyUTF8(passphrase)
-			if err != nil {
-				return result, err
-			}
-		default:
-			entropy, err = GetEntropyAscii(passphrase)
-			if err != nil {
-				return result, err
-			}
-		}
-
-		if entropy <= options.Entropy {
-			result.Ok = false
-		}
-
-		result.Entropy = entropy
-	}
-
-	if options.Compress > 0 {
-		compression, err := GetCompression([]byte(passphrase))
-		if err != nil {
-			return result, err
-		}
-
-		if compression >= options.Compress {
-			result.Ok = false
-		}
-
-		result.Compress = compression
-	}
-
-	if options.CharDistribution > 0 {
-		var dist float64
-
-		switch options.UTF8 {
-		case true:
-			dist = GetDistributionUTF8(passphrase)
-		default:
-			dist = GetDistributionAscii(passphrase)
-		}
-		if dist <= options.CharDistribution {
-			result.Ok = false
-		}
-
-		result.CharDistribution = dist
-	}
-
-	if len(options.Dictionary) > 0 {
-
-	}
-
-	return result, nil
-}
-
-/*
- * we  compress with  Flate level  9 (max)  and see  if the  result is
- * smaller than the password, in which case it could be compressed and
- * contains repeating characters;  OR it is larger  than the password,
- * in which case it could NOT be compressed, which is what we want.
- */
-func GetCompression(passphrase []byte) (int, error) {
-	var b bytes.Buffer
-	flater, _ := flate.NewWriter(&b, 9)
-
-	if _, err := flater.Write(passphrase); err != nil {
-		return 0, fmt.Errorf("failed to write to flate writer: %w", err)
-	}
-
-	if err := flater.Flush(); err != nil {
-		return 0, fmt.Errorf("failed to flush flate writer: %w", err)
-	}
-
-	if err := flater.Close(); err != nil {
-		return 0, fmt.Errorf("failed to close flate writer: %w", err)
-	}
-
-	// use floats to avoid division by zero panic
-	length := float32(len(passphrase))
-	compressed := float32(len(b.Bytes()))
-
-	if compressed >= length {
-		return 0, nil
-	}
-
-	percent := 100 - (compressed / (length / 100))
-
-	return int(percent), nil
-}
-
-/*
- * Return the  entropy as bits/rune, where  rune is a unicode  char in
- * utf8 space.
- */
-func GetEntropyUTF8(passphrase string) (float64, error) {
-	var entropy float64
-	length := len(passphrase)
-
-	wherechar := make([]int, MAX_UTF8)
-	hist := make([]int, length)
-	var histlen int
-
-	for i := 0; i < MAX_UTF8; i++ {
-		wherechar[i] = -1
-	}
-
-	for _, char := range passphrase {
-		if wherechar[char] == -1 {
-			wherechar[char] = histlen
-			histlen++
-		}
-
-		hist[wherechar[char]]++
-	}
-
-	for i := 0; i < histlen; i++ {
-		diff := float64(hist[i]) / float64(length)
-		entropy -= diff * math.Log2(diff)
-	}
-
-	return entropy, nil
-}
-
-/* same thing for us ascii */
-func GetEntropyAscii(passphrase string) (float64, error) {
-	var entropy float64
-	length := len(passphrase)
-
-	wherechar := make([]int, MAX_CHARS)
-	hist := make([]int, length)
-	var histlen int
-
-	for i := 0; i < MAX_CHARS; i++ {
-		wherechar[i] = -1
-	}
-
-	for _, char := range []byte(passphrase) {
-		if char < MIN_ASCII || char > 126 {
-			return 0, fmt.Errorf("non-printable ASCII character encountered: %c", char)
-		}
-		if wherechar[char-MIN_ASCII] == -1 {
-			wherechar[char-MIN_ASCII] = histlen
-			histlen++
-		}
-
-		hist[wherechar[char-MIN_ASCII]]++
-	}
-
-	for i := 0; i < histlen; i++ {
-		diff := float64(hist[i]) / float64(length)
-		entropy -= diff * math.Log2(diff)
-	}
-
-	return entropy, nil
-}
-
-/*
- * Return character distribution
- */
-func GetDistributionUTF8(passphrase string) float64 {
-	hash := make([]int, MAX_UTF8)
-	var chars float64
-
-	for _, char := range passphrase {
-		hash[char]++
-	}
-
-	for i := 0; i < MAX_UTF8; i++ {
-		if hash[i] > 0 {
-			chars++
-		}
-	}
-	return chars / (float64(MAX_UTF8) / 100)
-}
-
-func GetDistributionAscii(passphrase string) float64 {
-	hash := make([]int, MAX_CHARS)
-	var chars float64
-
-	for _, char := range []byte(passphrase) {
-		hash[int(char)-MIN_ASCII]++
-	}
-
-	for i := 0; i < MAX_CHARS; i++ {
-		if hash[i] > 0 {
-			chars++
-		}
-	}
-	return chars / (float64(MAX_CHARS) / 100)
-}
-
-func GetDictMatch(passphrase string, dict *Dictionary) (bool, error) {
-	if len(dict.Words) < MIN_DICT_LEN {
-		return false, fmt.Errorf("provided dictionary is too small")
-	}
-
-	lcpass := strings.ToLower(passphrase)
-
-	if dict.Submatch {
-		for _, word := range dict.Words {
-			if strings.Contains(strings.ToLower(word), lcpass) {
-				return true, nil
-			}
-		}
-	} else {
-		for _, word := range dict.Words {
-			if lcpass == strings.ToLower(word) {
-				return true, nil
-			}
-		}
-	}
-
-	return false, nil
-}