dkforest

A forum and chat platform (onion)
git clone https://git.dasho.dev/n0tr1v/dkforest.git
Log | Files | Refs | LICENSE

commit 623e7863bfa17276c8c39033ad8639e183d41234
parent 3975d504e3a0a45d47f0dd2b8e936b77215fc18d
Author: n0tr1v <n0tr1v@protonmail.com>
Date:   Tue, 30 May 2023 20:21:24 -0700

add levenshtein library

Diffstat:
Apkg/levenshtein/License.txt | 22++++++++++++++++++++++
Apkg/levenshtein/levenshtein.go | 89+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apkg/levenshtein/levenshtein_test.go | 53+++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 164 insertions(+), 0 deletions(-)

diff --git a/pkg/levenshtein/License.txt b/pkg/levenshtein/License.txt @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Agniva De Sarker + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +\ No newline at end of file diff --git a/pkg/levenshtein/levenshtein.go b/pkg/levenshtein/levenshtein.go @@ -0,0 +1,89 @@ +// Package levenshtein is a Go implementation to calculate Levenshtein Distance. +// +// Implementation taken from +// https://gist.github.com/andrei-m/982927#gistcomment-1931258 +package levenshtein + +import "unicode/utf8" + +// minLengthThreshold is the length of the string beyond which +// an allocation will be made. Strings smaller than this will be +// zero alloc. +const minLengthThreshold = 32 + +// ComputeDistance computes the levenshtein distance between the two +// strings passed as an argument. The return value is the levenshtein distance +// +// Works on runes (Unicode code points) but does not normalize +// the input strings. See https://blog.golang.org/normalization +// and the golang.org/x/text/unicode/norm package. +func ComputeDistance(a, b string) int { + if len(a) == 0 { + return utf8.RuneCountInString(b) + } + + if len(b) == 0 { + return utf8.RuneCountInString(a) + } + + if a == b { + return 0 + } + + // We need to convert to []rune if the strings are non-ASCII. + // This could be avoided by using utf8.RuneCountInString + // and then doing some juggling with rune indices, + // but leads to far more bounds checks. It is a reasonable trade-off. + s1 := []rune(a) + s2 := []rune(b) + + // swap to save some memory O(min(a,b)) instead of O(a) + if len(s1) > len(s2) { + s1, s2 = s2, s1 + } + lenS1 := len(s1) + lenS2 := len(s2) + + // Init the row. + var x []uint16 + if lenS1+1 > minLengthThreshold { + x = make([]uint16, lenS1+1) + } else { + // We make a small optimization here for small strings. + // Because a slice of constant length is effectively an array, + // it does not allocate. So we can re-slice it to the right length + // as long as it is below a desired threshold. + x = make([]uint16, minLengthThreshold) + x = x[:lenS1+1] + } + + // we start from 1 because index 0 is already 0. + for i := 1; i < len(x); i++ { + x[i] = uint16(i) + } + + // make a dummy bounds check to prevent the 2 bounds check down below. + // The one inside the loop is particularly costly. + _ = x[lenS1] + // fill in the rest + for i := 1; i <= lenS2; i++ { + prev := uint16(i) + for j := 1; j <= lenS1; j++ { + current := x[j-1] // match + if s2[i-1] != s1[j-1] { + current = min(min(x[j-1]+1, prev+1), x[j]+1) + } + x[j-1] = prev + prev = current + } + x[lenS1] = prev + } + return int(x[lenS1]) +} + +func min(a, b uint16) uint16 { + if a < b { + return a + } + return b +} diff --git a/pkg/levenshtein/levenshtein_test.go b/pkg/levenshtein/levenshtein_test.go @@ -0,0 +1,53 @@ +package levenshtein + +import ( + "testing" +) + +func TestSanity(t *testing.T) { + tests := []struct { + a, b string + want int + }{ + {"", "hello", 5}, + {"hello", "", 5}, + {"hello", "hello", 0}, + {"ab", "aa", 1}, + {"ab", "ba", 2}, + {"ab", "aaa", 2}, + {"bbb", "a", 3}, + {"kitten", "sitting", 3}, + {"distance", "difference", 5}, + {"levenshtein", "frankenstein", 6}, + {"resume and cafe", "resumes and cafes", 2}, + {"a very long string that is meant to exceed", "another very long string that is meant to exceed", 6}, + } + for i, d := range tests { + n := ComputeDistance(d.a, d.b) + if n != d.want { + t.Errorf("Test[%d]: ComputeDistance(%q,%q) returned %v, want %v", + i, d.a, d.b, n, d.want) + } + } +} + +func TestUnicode(t *testing.T) { + tests := []struct { + a, b string + want int + }{ + // Testing acutes and umlauts + {"resumé and café", "resumés and cafés", 2}, + {"resume and cafe", "resumé and café", 2}, + {"Hafþór Júlíus Björnsson", "Hafþor Julius Bjornsson", 4}, + // Only 2 characters are less in the 2nd string + {"།་གམ་འས་པ་་མ།", "།་གམའས་པ་་མ", 2}, + } + for i, d := range tests { + n := ComputeDistance(d.a, d.b) + if n != d.want { + t.Errorf("Test[%d]: ComputeDistance(%q,%q) returned %v, want %v", + i, d.a, d.b, n, d.want) + } + } +}