[ dkforest ].git.dasho

commit 623e7863bfa17276c8c39033ad8639e183d41234
parent 3975d504e3a0a45d47f0dd2b8e936b77215fc18d
Author: n0tr1v <n0tr1v@protonmail.com>
Date:   Tue, 30 May 2023 20:21:24 -0700

add levenshtein library

Diffstat:
A pkg/levenshtein/License.txt  | 22 ++++++++++++++++++++++
A pkg/levenshtein/levenshtein.go  | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A pkg/levenshtein/levenshtein_test.go  | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 164 insertions(+), 0 deletions(-)
diff --git a/pkg/levenshtein/License.txt  b/pkg/levenshtein/License.txt 
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Agniva De Sarker
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+\ No newline at end of file
diff --git a/pkg/levenshtein/levenshtein.go b/pkg/levenshtein/levenshtein.go
@@ -0,0 +1,89 @@
+// Package levenshtein is a Go implementation to calculate Levenshtein Distance.
+//
+// Implementation taken from
+// https://gist.github.com/andrei-m/982927#gistcomment-1931258
+package levenshtein
+
+import "unicode/utf8"
+
+// minLengthThreshold is the length of the string beyond which
+// an allocation will be made. Strings smaller than this will be
+// zero alloc.
+const minLengthThreshold = 32
+
+// ComputeDistance computes the levenshtein distance between the two
+// strings passed as an argument. The return value is the levenshtein distance
+//
+// Works on runes (Unicode code points) but does not normalize
+// the input strings. See https://blog.golang.org/normalization
+// and the golang.org/x/text/unicode/norm package.
+func ComputeDistance(a, b string) int {
+	if len(a) == 0 {
+		return utf8.RuneCountInString(b)
+	}
+
+	if len(b) == 0 {
+		return utf8.RuneCountInString(a)
+	}
+
+	if a == b {
+		return 0
+	}
+
+	// We need to convert to []rune if the strings are non-ASCII.
+	// This could be avoided by using utf8.RuneCountInString
+	// and then doing some juggling with rune indices,
+	// but leads to far more bounds checks. It is a reasonable trade-off.
+	s1 := []rune(a)
+	s2 := []rune(b)
+
+	// swap to save some memory O(min(a,b)) instead of O(a)
+	if len(s1) > len(s2) {
+		s1, s2 = s2, s1
+	}
+	lenS1 := len(s1)
+	lenS2 := len(s2)
+
+	// Init the row.
+	var x []uint16
+	if lenS1+1 > minLengthThreshold {
+		x = make([]uint16, lenS1+1)
+	} else {
+		// We make a small optimization here for small strings.
+		// Because a slice of constant length is effectively an array,
+		// it does not allocate. So we can re-slice it to the right length
+		// as long as it is below a desired threshold.
+		x = make([]uint16, minLengthThreshold)
+		x = x[:lenS1+1]
+	}
+
+	// we start from 1 because index 0 is already 0.
+	for i := 1; i < len(x); i++ {
+		x[i] = uint16(i)
+	}
+
+	// make a dummy bounds check to prevent the 2 bounds check down below.
+	// The one inside the loop is particularly costly.
+	_ = x[lenS1]
+	// fill in the rest
+	for i := 1; i <= lenS2; i++ {
+		prev := uint16(i)
+		for j := 1; j <= lenS1; j++ {
+			current := x[j-1] // match
+			if s2[i-1] != s1[j-1] {
+				current = min(min(x[j-1]+1, prev+1), x[j]+1)
+			}
+			x[j-1] = prev
+			prev = current
+		}
+		x[lenS1] = prev
+	}
+	return int(x[lenS1])
+}
+
+func min(a, b uint16) uint16 {
+	if a < b {
+		return a
+	}
+	return b
+}
diff --git a/pkg/levenshtein/levenshtein_test.go b/pkg/levenshtein/levenshtein_test.go
@@ -0,0 +1,53 @@
+package levenshtein
+
+import (
+	"testing"
+)
+
+func TestSanity(t *testing.T) {
+	tests := []struct {
+		a, b string
+		want int
+	}{
+		{"", "hello", 5},
+		{"hello", "", 5},
+		{"hello", "hello", 0},
+		{"ab", "aa", 1},
+		{"ab", "ba", 2},
+		{"ab", "aaa", 2},
+		{"bbb", "a", 3},
+		{"kitten", "sitting", 3},
+		{"distance", "difference", 5},
+		{"levenshtein", "frankenstein", 6},
+		{"resume and cafe", "resumes and cafes", 2},
+		{"a very long string that is meant to exceed", "another very long string that is meant to exceed", 6},
+	}
+	for i, d := range tests {
+		n := ComputeDistance(d.a, d.b)
+		if n != d.want {
+			t.Errorf("Test[%d]: ComputeDistance(%q,%q) returned %v, want %v",
+				i, d.a, d.b, n, d.want)
+		}
+	}
+}
+
+func TestUnicode(t *testing.T) {
+	tests := []struct {
+		a, b string
+		want int
+	}{
+		// Testing acutes and umlauts
+		{"resumé and café", "resumés and cafés", 2},
+		{"resume and cafe", "resumé and café", 2},
+		{"Hafþór Júlíus Björnsson", "Hafþor Julius Bjornsson", 4},
+		// Only 2 characters are less in the 2nd string
+		{"།་གམ་འས་པ་་མ།", "།་གམའས་པ་་མ", 2},
+	}
+	for i, d := range tests {
+		n := ComputeDistance(d.a, d.b)
+		if n != d.want {
+			t.Errorf("Test[%d]: ComputeDistance(%q,%q) returned %v, want %v",
+				i, d.a, d.b, n, d.want)
+		}
+	}
+}

	dkforest A forum and chat platform (onion)
	git clone https://git.dasho.dev/n0tr1v/dkforest.git
	Log \| Files \| Refs \| LICENSE

A	pkg/levenshtein/License.txt	\|	22	++++++++++++++++++++++
A	pkg/levenshtein/levenshtein.go	\|	89	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	pkg/levenshtein/levenshtein_test.go	\|	53	+++++++++++++++++++++++++++++++++++++++++++++++++++++