dkforest

A forum and chat platform (onion)
git clone https://git.dasho.dev/n0tr1v/dkforest.git
Log | Files | Refs | LICENSE

levenshtein.go (2347B)


      1 // Package levenshtein is a Go implementation to calculate Levenshtein Distance.
      2 //
      3 // Implementation taken from
      4 // https://gist.github.com/andrei-m/982927#gistcomment-1931258
      5 package levenshtein
      6 
      7 import "unicode/utf8"
      8 
      9 // minLengthThreshold is the length of the string beyond which
     10 // an allocation will be made. Strings smaller than this will be
     11 // zero alloc.
     12 const minLengthThreshold = 32
     13 
     14 // ComputeDistance computes the levenshtein distance between the two
     15 // strings passed as an argument. The return value is the levenshtein distance
     16 //
     17 // Works on runes (Unicode code points) but does not normalize
     18 // the input strings. See https://blog.golang.org/normalization
     19 // and the golang.org/x/text/unicode/norm package.
     20 func ComputeDistance(a, b string) int {
     21 	if len(a) == 0 {
     22 		return utf8.RuneCountInString(b)
     23 	}
     24 
     25 	if len(b) == 0 {
     26 		return utf8.RuneCountInString(a)
     27 	}
     28 
     29 	if a == b {
     30 		return 0
     31 	}
     32 
     33 	// We need to convert to []rune if the strings are non-ASCII.
     34 	// This could be avoided by using utf8.RuneCountInString
     35 	// and then doing some juggling with rune indices,
     36 	// but leads to far more bounds checks. It is a reasonable trade-off.
     37 	s1 := []rune(a)
     38 	s2 := []rune(b)
     39 
     40 	// swap to save some memory O(min(a,b)) instead of O(a)
     41 	if len(s1) > len(s2) {
     42 		s1, s2 = s2, s1
     43 	}
     44 	lenS1 := len(s1)
     45 	lenS2 := len(s2)
     46 
     47 	// Init the row.
     48 	var x []uint16
     49 	if lenS1+1 > minLengthThreshold {
     50 		x = make([]uint16, lenS1+1)
     51 	} else {
     52 		// We make a small optimization here for small strings.
     53 		// Because a slice of constant length is effectively an array,
     54 		// it does not allocate. So we can re-slice it to the right length
     55 		// as long as it is below a desired threshold.
     56 		x = make([]uint16, minLengthThreshold)
     57 		x = x[:lenS1+1]
     58 	}
     59 
     60 	// we start from 1 because index 0 is already 0.
     61 	for i := 1; i < len(x); i++ {
     62 		x[i] = uint16(i)
     63 	}
     64 
     65 	// make a dummy bounds check to prevent the 2 bounds check down below.
     66 	// The one inside the loop is particularly costly.
     67 	_ = x[lenS1]
     68 	// fill in the rest
     69 	for i := 1; i <= lenS2; i++ {
     70 		prev := uint16(i)
     71 		for j := 1; j <= lenS1; j++ {
     72 			current := x[j-1] // match
     73 			if s2[i-1] != s1[j-1] {
     74 				current = min(min(x[j-1]+1, prev+1), x[j]+1)
     75 			}
     76 			x[j-1] = prev
     77 			prev = current
     78 		}
     79 		x[lenS1] = prev
     80 	}
     81 	return int(x[lenS1])
     82 }
     83 
     84 func min(a, b uint16) uint16 {
     85 	if a < b {
     86 		return a
     87 	}
     88 	return b
     89 }