tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

damerau_levenshtein_distance.cc (3577B)


      1 // Copyright 2022 The Abseil Authors
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     https://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 #include "absl/strings/internal/damerau_levenshtein_distance.h"
     16 
     17 #include <algorithm>
     18 #include <array>
     19 #include <numeric>
     20 
     21 #include "absl/strings/string_view.h"
     22 namespace absl {
     23 ABSL_NAMESPACE_BEGIN
     24 namespace strings_internal {
     25 // Calculate DamerauLevenshtein (adjacent transpositions) distance
     26 // between two strings,
     27 // https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance. The
     28 // algorithm follows the condition that no substring is edited more than once.
     29 // While this can reduce is larger distance, it's a) a much simpler algorithm
     30 // and b) more realistic for the case that typographic mistakes should be
     31 // detected.
     32 // When the distance is larger than cutoff, or one of the strings has more
     33 // than MAX_SIZE=100 characters, the code returns min(MAX_SIZE, cutoff) + 1.
     34 uint8_t CappedDamerauLevenshteinDistance(absl::string_view s1,
     35                                         absl::string_view s2, uint8_t cutoff) {
     36  const uint8_t MAX_SIZE = 100;
     37  const uint8_t _cutoff = std::min(MAX_SIZE, cutoff);
     38  const uint8_t cutoff_plus_1 = static_cast<uint8_t>(_cutoff + 1);
     39 
     40  if (s1.size() > s2.size()) std::swap(s1, s2);
     41  if (s1.size() + _cutoff < s2.size() || s2.size() > MAX_SIZE)
     42    return cutoff_plus_1;
     43 
     44  if (s1.empty())
     45    return static_cast<uint8_t>(s2.size());
     46 
     47  // Lower diagonal bound: y = x - lower_diag
     48  const uint8_t lower_diag =
     49      _cutoff - static_cast<uint8_t>(s2.size() - s1.size());
     50  // Upper diagonal bound: y = x + upper_diag
     51  const uint8_t upper_diag = _cutoff;
     52 
     53  // d[i][j] is the number of edits required to convert s1[0, i] to s2[0, j]
     54  std::array<std::array<uint8_t, MAX_SIZE + 2>, MAX_SIZE + 2> d;
     55  std::iota(d[0].begin(), d[0].begin() + upper_diag + 1, 0);
     56  d[0][cutoff_plus_1] = cutoff_plus_1;
     57  for (size_t i = 1; i <= s1.size(); ++i) {
     58    // Deduce begin of relevant window.
     59    size_t j_begin = 1;
     60    if (i > lower_diag) {
     61      j_begin = i - lower_diag;
     62      d[i][j_begin - 1] = cutoff_plus_1;
     63    } else {
     64      d[i][0] = static_cast<uint8_t>(i);
     65    }
     66 
     67    // Deduce end of relevant window.
     68    size_t j_end = i + upper_diag;
     69    if (j_end > s2.size()) {
     70      j_end = s2.size();
     71    } else {
     72      d[i][j_end + 1] = cutoff_plus_1;
     73    }
     74 
     75    for (size_t j = j_begin; j <= j_end; ++j) {
     76      const uint8_t deletion_distance = d[i - 1][j] + 1;
     77      const uint8_t insertion_distance = d[i][j - 1] + 1;
     78      const uint8_t mismatched_tail_cost = s1[i - 1] == s2[j - 1] ? 0 : 1;
     79      const uint8_t mismatch_distance = d[i - 1][j - 1] + mismatched_tail_cost;
     80      uint8_t transposition_distance = _cutoff + 1;
     81      if (i > 1 && j > 1 && s1[i - 1] == s2[j - 2] && s1[i - 2] == s2[j - 1])
     82        transposition_distance = d[i - 2][j - 2] + 1;
     83      d[i][j] = std::min({cutoff_plus_1, deletion_distance, insertion_distance,
     84                          mismatch_distance, transposition_distance});
     85    }
     86  }
     87  return d[s1.size()][s2.size()];
     88 }
     89 
     90 }  // namespace strings_internal
     91 
     92 ABSL_NAMESPACE_END
     93 }  // namespace absl