// levenshtein.go — part of package valpass; requires the "strings" import.
//
// Reconstructed from patch 340bb597969f1ef68273052adf25e6a842247818
// ("add fuzzy testing"), which creates levenshtein.go and deletes lib.bak.
//
// via https://github.com/adrg/strutil, MIT licensed
// Copyright (c) 2019-2023 Adrian-George Bostan.

// Levenshtein represents the Levenshtein metric for measuring the similarity
// between sequences.
//
// For more information see https://en.wikipedia.org/wiki/Levenshtein_distance.
type Levenshtein struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool

	// InsertCost represents the Levenshtein cost of a character insertion.
	InsertCost int

	// DeleteCost represents the Levenshtein cost of a character deletion.
	DeleteCost int

	// ReplaceCost represents the Levenshtein cost of a character substitution.
	ReplaceCost int
}

// NewLevenshtein returns a new Levenshtein string metric.
//
// Default options:
//
//	CaseSensitive: true
//	InsertCost: 1
//	DeleteCost: 1
//	ReplaceCost: 1
func NewLevenshtein() *Levenshtein {
	return &Levenshtein{
		CaseSensitive: true,
		InsertCost:    1,
		DeleteCost:    1,
		ReplaceCost:   1,
	}
}

// Compare returns the Levenshtein similarity of a and b. Larger similarity
// numbers indicate closer matches; 1 means the strings are identical.
//
// The similarity is computed as 1 - distance/maxLen, where maxLen is the rune
// length of the longer term. With the default unit costs the result lies
// between 0 and 1; with costs greater than 1 the distance can exceed maxLen
// and the result can become negative.
//
// Two empty strings are identical and yield 1.
func (m *Levenshtein) Compare(a, b string) float64 {
	distance, maxLen := m.distance(a, b)
	if maxLen == 0 {
		// Both terms are empty. Dividing 0 by 0 would produce NaN, so
		// report a perfect match explicitly.
		return 1
	}

	return 1 - float64(distance)/float64(maxLen)
}

// Distance returns the Levenshtein distance between a and b. Lower distances
// indicate closer matches. A distance of 0 means the strings are identical.
func (m *Levenshtein) Distance(a, b string) int {
	distance, _ := m.distance(a, b)
	return distance
}

// Min returns the value of the smallest argument,
// or 0 if no arguments are provided.
func Min(args ...int) int {
	if len(args) == 0 {
		return 0
	}

	smallest := args[0]
	for _, arg := range args[1:] {
		if arg < smallest {
			smallest = arg
		}
	}

	return smallest
}

// Max returns the value of the largest argument,
// or 0 if no arguments are provided.
func Max(args ...int) int {
	if len(args) == 0 {
		return 0
	}

	largest := args[0]
	for _, arg := range args[1:] {
		if arg > largest {
			largest = arg
		}
	}

	return largest
}

// distance computes the weighted edit distance needed to turn a into b and
// also returns the rune length of the longer term. Both terms are compared
// rune by rune; when m.CaseSensitive is false they are lower-cased first.
//
// Only two rows of the cost matrix are kept at any time.
func (m *Levenshtein) distance(a, b string) (int, int) {
	// Lower terms if case insensitive comparison is specified.
	if !m.CaseSensitive {
		a = strings.ToLower(a)
		b = strings.ToLower(b)
	}
	runesA, runesB := []rune(a), []rune(b)

	lenA, lenB := len(runesA), len(runesB)
	maxLen := Max(lenA, lenB)

	// If one term is empty the distance is the cost of inserting or deleting
	// every rune of the other one. When both are empty this yields (0, 0).
	switch {
	case lenA == 0:
		return m.InsertCost * lenB, maxLen
	case lenB == 0:
		return m.DeleteCost * lenA, maxLen
	}

	// prevRow[j] is the cost of building the first j runes of b from an
	// empty prefix of a, i.e. j insertions. The configured insertion cost
	// must be applied here so that the row agrees with the shortcut above.
	prevRow := make([]int, lenB+1)
	for j := 0; j <= lenB; j++ {
		prevRow[j] = j * m.InsertCost
	}

	curRow := make([]int, lenB+1)
	for i := 0; i < lenA; i++ {
		// Reducing the first i+1 runes of a to an empty prefix of b takes
		// i+1 deletions.
		curRow[0] = (i + 1) * m.DeleteCost

		for j := 0; j < lenB; j++ {
			delCost := prevRow[j+1] + m.DeleteCost
			insCost := curRow[j] + m.InsertCost

			subCost := prevRow[j]
			if runesA[i] != runesB[j] {
				subCost += m.ReplaceCost
			}

			curRow[j+1] = Min(delCost, insCost, subCost)
		}

		// The row just computed becomes the previous row of the next pass.
		curRow, prevRow = prevRow, curRow
	}

	return prevRow[lenB], maxLen
}