commit 623e7863bfa17276c8c39033ad8639e183d41234
parent 3975d504e3a0a45d47f0dd2b8e936b77215fc18d
Author: n0tr1v <n0tr1v@protonmail.com>
Date: Tue, 30 May 2023 20:21:24 -0700
add levenshtein library
Diffstat:
3 files changed, 164 insertions(+), 0 deletions(-)
diff --git a/pkg/levenshtein/License.txt b/pkg/levenshtein/License.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Agniva De Sarker
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+\ No newline at end of file
diff --git a/pkg/levenshtein/levenshtein.go b/pkg/levenshtein/levenshtein.go
@@ -0,0 +1,89 @@
+// Package levenshtein is a Go implementation to calculate Levenshtein Distance.
+//
+// Implementation taken from
+// https://gist.github.com/andrei-m/982927#gistcomment-1931258
+package levenshtein
+
+import "unicode/utf8"
+
+// minLengthThreshold is the length of the string beyond which
+// an allocation will be made. Strings smaller than this will be
+// zero alloc.
+const minLengthThreshold = 32
+
+// ComputeDistance computes the levenshtein distance between the two
+// strings passed as an argument. The return value is the levenshtein distance
+//
+// Works on runes (Unicode code points) but does not normalize
+// the input strings. See https://blog.golang.org/normalization
+// and the golang.org/x/text/unicode/norm package.
+func ComputeDistance(a, b string) int {
+ if len(a) == 0 {
+ return utf8.RuneCountInString(b)
+ }
+
+ if len(b) == 0 {
+ return utf8.RuneCountInString(a)
+ }
+
+ if a == b {
+ return 0
+ }
+
+ // We need to convert to []rune if the strings are non-ASCII.
+ // This could be avoided by using utf8.RuneCountInString
+ // and then doing some juggling with rune indices,
+ // but leads to far more bounds checks. It is a reasonable trade-off.
+ s1 := []rune(a)
+ s2 := []rune(b)
+
+ // swap to save some memory O(min(a,b)) instead of O(a)
+ if len(s1) > len(s2) {
+ s1, s2 = s2, s1
+ }
+ lenS1 := len(s1)
+ lenS2 := len(s2)
+
+ // Init the row.
+ var x []uint16
+ if lenS1+1 > minLengthThreshold {
+ x = make([]uint16, lenS1+1)
+ } else {
+ // We make a small optimization here for small strings.
+ // Because a slice of constant length is effectively an array,
+ // it does not allocate. So we can re-slice it to the right length
+ // as long as it is below a desired threshold.
+ x = make([]uint16, minLengthThreshold)
+ x = x[:lenS1+1]
+ }
+
+ // we start from 1 because index 0 is already 0.
+ for i := 1; i < len(x); i++ {
+ x[i] = uint16(i)
+ }
+
+ // make a dummy bounds check to prevent the 2 bounds check down below.
+ // The one inside the loop is particularly costly.
+ _ = x[lenS1]
+ // fill in the rest
+ for i := 1; i <= lenS2; i++ {
+ prev := uint16(i)
+ for j := 1; j <= lenS1; j++ {
+ current := x[j-1] // match
+ if s2[i-1] != s1[j-1] {
+ current = min(min(x[j-1]+1, prev+1), x[j]+1)
+ }
+ x[j-1] = prev
+ prev = current
+ }
+ x[lenS1] = prev
+ }
+ return int(x[lenS1])
+}
+
+func min(a, b uint16) uint16 {
+ if a < b {
+ return a
+ }
+ return b
+}
diff --git a/pkg/levenshtein/levenshtein_test.go b/pkg/levenshtein/levenshtein_test.go
@@ -0,0 +1,53 @@
+package levenshtein
+
+import (
+ "testing"
+)
+
+func TestSanity(t *testing.T) {
+ tests := []struct {
+ a, b string
+ want int
+ }{
+ {"", "hello", 5},
+ {"hello", "", 5},
+ {"hello", "hello", 0},
+ {"ab", "aa", 1},
+ {"ab", "ba", 2},
+ {"ab", "aaa", 2},
+ {"bbb", "a", 3},
+ {"kitten", "sitting", 3},
+ {"distance", "difference", 5},
+ {"levenshtein", "frankenstein", 6},
+ {"resume and cafe", "resumes and cafes", 2},
+ {"a very long string that is meant to exceed", "another very long string that is meant to exceed", 6},
+ }
+ for i, d := range tests {
+ n := ComputeDistance(d.a, d.b)
+ if n != d.want {
+ t.Errorf("Test[%d]: ComputeDistance(%q,%q) returned %v, want %v",
+ i, d.a, d.b, n, d.want)
+ }
+ }
+}
+
+func TestUnicode(t *testing.T) {
+ tests := []struct {
+ a, b string
+ want int
+ }{
+ // Testing acutes and umlauts
+ {"resumé and café", "resumés and cafés", 2},
+ {"resume and cafe", "resumé and café", 2},
+ {"Hafþór Júlíus Björnsson", "Hafþor Julius Bjornsson", 4},
+ // Only 2 characters are less in the 2nd string
+ {"།་གམ་འས་པ་་མ།", "།་གམའས་པ་་མ", 2},
+ }
+ for i, d := range tests {
+ n := ComputeDistance(d.a, d.b)
+ if n != d.want {
+ t.Errorf("Test[%d]: ComputeDistance(%q,%q) returned %v, want %v",
+ i, d.a, d.b, n, d.want)
+ }
+ }
+}