add fuzzy testing

2026-02-04 03:00:57 +01:00 · 2024-11-18 08:15:40 +01:00
parent f267b9da22
commit 340bb59796
2 changed files with 149 additions and 285 deletions
--- a/levenshtein.go
+++ b/levenshtein.go
@@ -0,0 +1,149 @@
 package valpass
 // via https://github.com/adrg/strutil, MIT licensed
 // Copyright (c) 2019-2023 Adrian-George Bostan.
 import (
 	"strings"
 )
 // Levenshtein represents the Levenshtein metric for measuring the similarity
 // between sequences. The three cost fields weight the individual edit
 // operations; NewLevenshtein sets all of them to 1.
 //
 //	For more information see https://en.wikipedia.org/wiki/Levenshtein_distance.
 type Levenshtein struct {
 	// CaseSensitive specifies if the string comparison is case sensitive.
 	// When false, both inputs are lower-cased before they are compared.
 	CaseSensitive bool
 	// InsertCost represents the Levenshtein cost of a character insertion.
 	InsertCost int
 	// DeleteCost represents the Levenshtein cost of a character deletion.
 	DeleteCost int
 	// ReplaceCost represents the Levenshtein cost of a character substitution.
 	ReplaceCost int
 }
 // NewLevenshtein allocates a Levenshtein string metric preset with the
 // package defaults.
 //
 // Defaults:
 //
 //	CaseSensitive: true
 //	InsertCost: 1
 //	DeleteCost: 1
 //	ReplaceCost: 1
 func NewLevenshtein() *Levenshtein {
 	metric := new(Levenshtein)
 	metric.CaseSensitive = true
 	// Every edit operation weighs the same by default.
 	metric.InsertCost, metric.DeleteCost, metric.ReplaceCost = 1, 1, 1
 	return metric
 }
 // Compare returns the Levenshtein similarity of a and b, computed as
 // 1 - distance/maxLen, where maxLen is the length in runes of the longer
 // input. With the default costs of 1 the result is a number between 0 and 1.
 // Larger similarity numbers indicate closer matches. Two empty strings are
 // identical and therefore yield 1.
 func (m *Levenshtein) Compare(a, b string) float64 {
 	distance, maxLen := m.distance(a, b)
 	if maxLen == 0 {
 		// Both inputs are empty: dividing 0 by 0 would produce NaN, so
 		// report a perfect match instead.
 		return 1
 	}
 	return 1 - float64(distance)/float64(maxLen)
 }
 // Distance reports how many weighted edit operations separate a from b.
 // Smaller values mean the strings are closer; 0 means they are identical.
 func (m *Levenshtein) Distance(a, b string) (dist int) {
 	// The second result (length of the longer input) is only needed by Compare.
 	dist, _ = m.distance(a, b)
 	return dist
 }
 // Min picks the smallest of its arguments. Calling it without arguments
 // yields 0.
 func Min(args ...int) int {
 	var smallest int
 	for idx, candidate := range args {
 		// The first argument seeds the result; later ones replace it only
 		// when they are strictly smaller.
 		if idx == 0 || candidate < smallest {
 			smallest = candidate
 		}
 	}
 	return smallest
 }
 // Max picks the largest of its arguments. Calling it without arguments
 // yields 0.
 func Max(args ...int) int {
 	var largest int
 	for idx, candidate := range args {
 		// The first argument seeds the result; later ones replace it only
 		// when they are strictly larger.
 		if idx == 0 || candidate > largest {
 			largest = candidate
 		}
 	}
 	return largest
 }
 // distance computes the weighted Levenshtein distance for turning a into b
 // and additionally returns the length, in runes, of the longer input.
 // Both inputs are lower-cased first when m.CaseSensitive is false. Two empty
 // inputs yield (0, 0).
 func (m *Levenshtein) distance(a, b string) (int, int) {
 	// Lower terms if case insensitive comparison is specified.
 	if !m.CaseSensitive {
 		a = strings.ToLower(a)
 		b = strings.ToLower(b)
 	}
 	// Work on runes so multi-byte characters count as a single edit.
 	runesA, runesB := []rune(a), []rune(b)
 	lenA, lenB := len(runesA), len(runesB)
 	// Check if both terms are empty.
 	if lenA == 0 && lenB == 0 {
 		return 0, 0
 	}
 	// Check if one of the terms is empty.
 	maxLen := Max(lenA, lenB)
 	if lenA == 0 {
 		return m.InsertCost * lenB, maxLen
 	}
 	if lenB == 0 {
 		return m.DeleteCost * lenA, maxLen
 	}
 	// prevCol[j] is the cost of turning the already processed prefix of a
 	// into b[:j]. For the empty prefix that means j insertions, so the row is
 	// seeded with the configured insert cost rather than a fixed cost of 1.
 	prevCol := make([]int, lenB+1)
 	for j := 1; j <= lenB; j++ {
 		prevCol[j] = j * m.InsertCost
 	}
 	// Calculate distance, keeping only the previous and the current row.
 	col := make([]int, lenB+1)
 	for i := 0; i < lenA; i++ {
 		// Turning a[:i+1] into the empty string takes i+1 deletions.
 		col[0] = (i + 1) * m.DeleteCost
 		for j := 0; j < lenB; j++ {
 			delCost := prevCol[j+1] + m.DeleteCost
 			insCost := col[j] + m.InsertCost
 			subCost := prevCol[j]
 			if runesA[i] != runesB[j] {
 				subCost += m.ReplaceCost
 			}
 			col[j+1] = Min(delCost, insCost, subCost)
 		}
 		// The row just filled becomes the previous row of the next round.
 		col, prevCol = prevCol, col
 	}
 	return prevCol[lenB], maxLen
 }
--- a/lib.bak
+++ b/lib.bak
@@ -1,285 +0,0 @@
 package valpass
 import (
 	"bytes"
 	"compress/flate"
 	"fmt"
 	"math"
 	"strings"
 )
 // Dictionary contains the raw dictionary data and some flags. It must be
 // provided by the user. GetDictMatch rejects a Dictionary whose Words slice
 // holds fewer than MIN_DICT_LEN entries.
 type Dictionary struct {
 	Words    []string // the actual dictionary; entries are lower-cased before comparison
 	Submatch bool     // if true a passphrase 'foo' matches the word 'foobar' (substring test), otherwise whole words only
 }
 // Options define how to operate the validation. A numeric threshold of 0
 // disables the corresponding check in Validate. Result embeds this struct
 // to report the values that were measured.
 type Options struct {
 	Compress         int      // compression rate in percent; Validate rejects a passphrase at or above it
 	CharDistribution float64  // char distribution in percent; Validate rejects a passphrase at or below it
 	Entropy          float64  // entropy in bits/char; Validate rejects a passphrase at or below it
 	Dictionary       []string // caller-provided word list; Validate only tests it for non-emptiness and performs no lookup
 	UTF8             bool     // if true work on unicode utf-8 space, not just bytes
 }
 // Default validation config, a compromise of comfort and security, as always.
 // MIN_COMPRESS, MIN_DIST and MIN_ENTROPY are the thresholds Validate falls
 // back to unless exactly one Options value is passed. The remaining
 // constants size the lookup tables of the entropy and distribution helpers
 // or bound the dictionary size.
 const (
 	MIN_ENTROPY  float64 = 3.0
 	MIN_COMPRESS int     = 10
 	MIN_DICT     bool    = false
 	MIN_DIST     float64 = 10.0
 	MAX_UTF8     int     = 2164864 // max characters encodable with utf8 (2^7 + 2^11 + 2^16 + 2^21); sizes the UTF-8 lookup tables
 	MAX_CHARS    int     = 95      // maximum printable US ASCII chars (codes 32 through 126)
 	MIN_DICT_LEN int     = 5000
 	// We start our ASCII arrays at char(32), so to have at most 95
 	// elements in the slice, we subtract 32 from each ASCII code.
 	MIN_ASCII int = 32
 )
 // Result is what Validate returns. Ok starts out true and is cleared as soon
 // as one enabled check fails. The embedded Options fields Entropy, Compress
 // and CharDistribution are filled with the values measured for the
 // passphrase, not with the thresholds that were applied.
 type Result struct {
 	Ok bool
 	Options
 }
 // Validate runs the enabled quality checks on passphrase and reports the
 // measured values together with an overall verdict.
 //
 // The defaults (MIN_COMPRESS, MIN_DIST, MIN_ENTROPY, ASCII mode) apply
 // unless exactly one Options value is passed, in which case it replaces them
 // entirely. A check whose threshold is 0 is skipped. The first error
 // returned by a measurement aborts validation and is handed back to the
 // caller together with the result gathered so far.
 func Validate(passphrase string, opts ...Options) (Result, error) {
 	options := Options{
 		Compress:         MIN_COMPRESS,
 		CharDistribution: MIN_DIST,
 		Entropy:          MIN_ENTROPY,
 	}
 	if len(opts) == 1 {
 		options = opts[0]
 	}
 	result := Result{Ok: true}
 	// Entropy check: the measured value must exceed the threshold.
 	if options.Entropy > 0 {
 		measure := GetEntropyAscii
 		if options.UTF8 {
 			measure = GetEntropyUTF8
 		}
 		entropy, err := measure(passphrase)
 		if err != nil {
 			return result, err
 		}
 		if entropy <= options.Entropy {
 			result.Ok = false
 		}
 		result.Entropy = entropy
 	}
 	// Compression check: the measured rate must stay below the threshold.
 	if options.Compress > 0 {
 		rate, err := GetCompression([]byte(passphrase))
 		if err != nil {
 			return result, err
 		}
 		if rate >= options.Compress {
 			result.Ok = false
 		}
 		result.Compress = rate
 	}
 	// Character distribution check: the measured value must exceed the threshold.
 	if options.CharDistribution > 0 {
 		spread := GetDistributionAscii
 		if options.UTF8 {
 			spread = GetDistributionUTF8
 		}
 		dist := spread(passphrase)
 		if dist <= options.CharDistribution {
 			result.Ok = false
 		}
 		result.CharDistribution = dist
 	}
 	if len(options.Dictionary) > 0 {
 		// Placeholder: no dictionary lookup is performed here.
 	}
 	return result, nil
 }
 // GetCompression reports by how many percent passphrase shrinks when it is
 // compressed with Flate at the maximum level.
 //
 // If the compressed form is smaller than the passphrase it could be
 // compressed and contains repeating characters; the saving is returned as a
 // truncated percentage. If it is not smaller, the passphrase could NOT be
 // compressed, which is what we want, and 0 is returned. An empty passphrase
 // also yields 0. Errors from the flate writer are wrapped and returned.
 func GetCompression(passphrase []byte) (int, error) {
 	var b bytes.Buffer
 	flater, err := flate.NewWriter(&b, flate.BestCompression)
 	if err != nil {
 		return 0, fmt.Errorf("failed to create flate writer: %w", err)
 	}
 	if _, err := flater.Write(passphrase); err != nil {
 		return 0, fmt.Errorf("failed to write to flate writer: %w", err)
 	}
 	// The explicit flush is kept on purpose: it contributes to the
 	// compressed size the percentage is based on.
 	if err := flater.Flush(); err != nil {
 		return 0, fmt.Errorf("failed to flush flate writer: %w", err)
 	}
 	if err := flater.Close(); err != nil {
 		return 0, fmt.Errorf("failed to close flate writer: %w", err)
 	}
 	// use floats to avoid division by zero panic
 	length := float32(len(passphrase))
 	compressed := float32(b.Len())
 	if compressed >= length {
 		return 0, nil
 	}
 	percent := 100 - (compressed / (length / 100))
 	return int(percent), nil
 }
 // GetEntropyUTF8 returns the Shannon entropy of passphrase in bits/rune,
 // where a rune is a unicode char in utf8 space. Probabilities are relative
 // to the number of runes, not bytes, so multi-byte characters are weighted
 // correctly. An empty passphrase has an entropy of 0. The returned error is
 // always nil; it exists for symmetry with GetEntropyAscii.
 func GetEntropyUTF8(passphrase string) (float64, error) {
 	// slot maps every distinct rune to its position in hist, which keeps the
 	// counters in first-seen order and the floating point sum deterministic.
 	slot := make(map[rune]int)
 	var hist []int
 	total := 0
 	for _, char := range passphrase {
 		idx, seen := slot[char]
 		if !seen {
 			idx = len(hist)
 			slot[char] = idx
 			hist = append(hist, 0)
 		}
 		hist[idx]++
 		total++
 	}
 	var entropy float64
 	for _, count := range hist {
 		diff := float64(count) / float64(total)
 		entropy -= diff * math.Log2(diff)
 	}
 	return entropy, nil
 }
 // GetEntropyAscii returns the Shannon entropy of passphrase in bits/char,
 // working on bytes in the printable US ASCII range only. It returns an error
 // as soon as it meets a byte outside of that range (below MIN_ASCII or at or
 // above MIN_ASCII+MAX_CHARS). An empty passphrase has an entropy of 0.
 func GetEntropyAscii(passphrase string) (float64, error) {
 	// wherechar maps a character (offset by MIN_ASCII) to its counter in
 	// hist; -1 marks characters which have not been seen yet.
 	wherechar := make([]int, MAX_CHARS)
 	for i := range wherechar {
 		wherechar[i] = -1
 	}
 	hist := make([]int, 0, MAX_CHARS)
 	for _, char := range []byte(passphrase) {
 		// MIN_ASCII is a typed int constant, so the byte has to be
 		// converted before it can be compared or used as an offset.
 		code := int(char)
 		if code < MIN_ASCII || code >= MIN_ASCII+MAX_CHARS {
 			return 0, fmt.Errorf("non-printable ASCII character encountered: %c", char)
 		}
 		pos := code - MIN_ASCII
 		if wherechar[pos] == -1 {
 			wherechar[pos] = len(hist)
 			hist = append(hist, 0)
 		}
 		hist[wherechar[pos]]++
 	}
 	var entropy float64
 	length := float64(len(passphrase))
 	for _, count := range hist {
 		diff := float64(count) / length
 		entropy -= diff * math.Log2(diff)
 	}
 	return entropy, nil
 }
 // GetDistributionUTF8 returns the character distribution of passphrase: the
 // number of distinct runes it contains, expressed as a percentage of
 // MAX_UTF8. An empty passphrase yields 0.
 func GetDistributionUTF8(passphrase string) float64 {
 	// A set of the runes actually present is enough to count distinct
 	// characters; no table covering the whole UTF-8 space is required.
 	seen := make(map[rune]struct{})
 	for _, char := range passphrase {
 		seen[char] = struct{}{}
 	}
 	return float64(len(seen)) / (float64(MAX_UTF8) / 100)
 }
 func GetDistributionAscii(passphrase string) float64 {
 	hash := make([]int, MAX_CHARS)
 	var chars float64
 	for _, char := range []byte(passphrase) {
 		hash[int(char)-MIN_ASCII]++
 	}
 	for i := 0; i < MAX_CHARS; i++ {
 		if hash[i] > 0 {
 			chars++
 		}
 	}
 	return chars / (float64(MAX_CHARS) / 100)
 }
 // GetDictMatch looks passphrase up in dict, ignoring case. With
 // dict.Submatch set, a word matches when it contains the passphrase;
 // otherwise the word has to equal the passphrase. A dictionary with fewer
 // than MIN_DICT_LEN words is refused with an error.
 func GetDictMatch(passphrase string, dict *Dictionary) (bool, error) {
 	if len(dict.Words) < MIN_DICT_LEN {
 		return false, fmt.Errorf("provided dictionary is too small")
 	}
 	needle := strings.ToLower(passphrase)
 	// Choose the comparison once instead of duplicating the loop.
 	matches := func(candidate string) bool { return candidate == needle }
 	if dict.Submatch {
 		matches = func(candidate string) bool { return strings.Contains(candidate, needle) }
 	}
 	for _, word := range dict.Words {
 		if matches(strings.ToLower(word)) {
 			return true, nil
 		}
 	}
 	return false, nil
 }