damerau_levenshtein_distance.cc (3577B)
1 // Copyright 2022 The Abseil Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "absl/strings/internal/damerau_levenshtein_distance.h" 16 17 #include <algorithm> 18 #include <array> 19 #include <numeric> 20 21 #include "absl/strings/string_view.h" 22 namespace absl { 23 ABSL_NAMESPACE_BEGIN 24 namespace strings_internal { 25 // Calculate DamerauLevenshtein (adjacent transpositions) distance 26 // between two strings, 27 // https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance. The 28 // algorithm follows the condition that no substring is edited more than once. 29 // While this can reduce is larger distance, it's a) a much simpler algorithm 30 // and b) more realistic for the case that typographic mistakes should be 31 // detected. 32 // When the distance is larger than cutoff, or one of the strings has more 33 // than MAX_SIZE=100 characters, the code returns min(MAX_SIZE, cutoff) + 1. 34 uint8_t CappedDamerauLevenshteinDistance(absl::string_view s1, 35 absl::string_view s2, uint8_t cutoff) { 36 const uint8_t MAX_SIZE = 100; 37 const uint8_t _cutoff = std::min(MAX_SIZE, cutoff); 38 const uint8_t cutoff_plus_1 = static_cast<uint8_t>(_cutoff + 1); 39 40 if (s1.size() > s2.size()) std::swap(s1, s2); 41 if (s1.size() + _cutoff < s2.size() || s2.size() > MAX_SIZE) 42 return cutoff_plus_1; 43 44 if (s1.empty()) 45 return static_cast<uint8_t>(s2.size()); 46 47 // Lower diagonal bound: y = x - lower_diag 48 const uint8_t lower_diag = 49 _cutoff - static_cast<uint8_t>(s2.size() - s1.size()); 50 // Upper diagonal bound: y = x + upper_diag 51 const uint8_t upper_diag = _cutoff; 52 53 // d[i][j] is the number of edits required to convert s1[0, i] to s2[0, j] 54 std::array<std::array<uint8_t, MAX_SIZE + 2>, MAX_SIZE + 2> d; 55 std::iota(d[0].begin(), d[0].begin() + upper_diag + 1, 0); 56 d[0][cutoff_plus_1] = cutoff_plus_1; 57 for (size_t i = 1; i <= s1.size(); ++i) { 58 // Deduce begin of relevant window. 59 size_t j_begin = 1; 60 if (i > lower_diag) { 61 j_begin = i - lower_diag; 62 d[i][j_begin - 1] = cutoff_plus_1; 63 } else { 64 d[i][0] = static_cast<uint8_t>(i); 65 } 66 67 // Deduce end of relevant window. 68 size_t j_end = i + upper_diag; 69 if (j_end > s2.size()) { 70 j_end = s2.size(); 71 } else { 72 d[i][j_end + 1] = cutoff_plus_1; 73 } 74 75 for (size_t j = j_begin; j <= j_end; ++j) { 76 const uint8_t deletion_distance = d[i - 1][j] + 1; 77 const uint8_t insertion_distance = d[i][j - 1] + 1; 78 const uint8_t mismatched_tail_cost = s1[i - 1] == s2[j - 1] ? 0 : 1; 79 const uint8_t mismatch_distance = d[i - 1][j - 1] + mismatched_tail_cost; 80 uint8_t transposition_distance = _cutoff + 1; 81 if (i > 1 && j > 1 && s1[i - 1] == s2[j - 2] && s1[i - 2] == s2[j - 1]) 82 transposition_distance = d[i - 2][j - 2] + 1; 83 d[i][j] = std::min({cutoff_plus_1, deletion_distance, insertion_distance, 84 mismatch_distance, transposition_distance}); 85 } 86 } 87 return d[s1.size()][s2.size()]; 88 } 89 90 } // namespace strings_internal 91 92 ABSL_NAMESPACE_END 93 } // namespace absl