neovim

Neovim text editor
git clone https://git.dasho.dev/neovim.git
Log | Files | Refs | README

linematch.c (14517B)


      1 #include <assert.h>
      2 #include <math.h>
      3 #include <stdbool.h>
      4 #include <stddef.h>
      5 #include <stdint.h>
      6 
      7 #include "nvim/linematch.h"
      8 #include "nvim/macros_defs.h"
      9 #include "nvim/memory.h"
     10 #include "nvim/pos_defs.h"
     11 #include "xdiff/xdiff.h"
     12 
     13 #define LN_MAX_BUFS 8
     14 #define LN_DECISION_MAX 255  // pow(2, LN_MAX_BUFS(8)) - 1 = 255
     15 
     16 // struct for running the diff linematch algorithm
     17 typedef struct diffcmppath_S diffcmppath_T;
     18 struct diffcmppath_S {
     19  int df_lev_score;  // to keep track of the total score of this path
     20  size_t df_path_n;   // current index of this path
     21  int df_choice_mem[LN_DECISION_MAX + 1];
     22  int df_choice[LN_DECISION_MAX];
     23  diffcmppath_T *df_decision[LN_DECISION_MAX];  // to keep track of this path traveled
     24  size_t df_optimal_choice;
     25 };
     26 
     27 #include "linematch.c.generated.h"
     28 
     29 static size_t line_len(const mmfile_t *m)
     30 {
     31  char *s = m->ptr;
     32  char *end = memchr(s, '\n', (size_t)m->size);
     33  return end ? (size_t)(end - s) : (size_t)m->size;
     34 }
     35 
     36 #define MATCH_CHAR_MAX_LEN 800
     37 
     38 /// Same as matching_chars but ignore whitespace
     39 ///
     40 /// @param s1
     41 /// @param s2
     42 static int matching_chars_iwhite(const mmfile_t *s1, const mmfile_t *s2)
     43 {
     44  // the newly processed strings that will be compared
     45  // delete the white space characters
     46  mmfile_t sp[2];
     47  char p[2][MATCH_CHAR_MAX_LEN];
     48  for (int k = 0; k < 2; k++) {
     49    const mmfile_t *s = k == 0 ? s1 : s2;
     50    size_t pi = 0;
     51    size_t slen = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s));
     52    for (size_t i = 0; i <= slen; i++) {
     53      char e = s->ptr[i];
     54      if (e != ' ' && e != '\t') {
     55        p[k][pi] = e;
     56        pi++;
     57      }
     58    }
     59 
     60    sp[k] = (mmfile_t){
     61      .ptr = p[k],
     62      .size = (int)pi,
     63    };
     64  }
     65  return matching_chars(&sp[0], &sp[1]);
     66 }
     67 
     68 /// Return matching characters between "s1" and "s2" whilst respecting sequence order.
     69 /// Consider the case of two strings 'AAACCC' and 'CCCAAA', the
     70 /// return value from this function will be 3, either to match
     71 /// the 3 C's, or the 3 A's.
     72 ///
     73 /// Examples:
     74 ///   matching_chars("aabc", "acba")               -> 2  // 'a' and 'b' in common
     75 ///   matching_chars("123hello567", "he123ll567o") -> 8  // '123', 'll' and '567' in common
     76 ///   matching_chars("abcdefg", "gfedcba")         -> 1  // all characters in common,
     77 ///                                                      // but only at most 1 in sequence
     78 ///
     79 /// @param m1
     80 /// @param m2
     81 static int matching_chars(const mmfile_t *m1, const mmfile_t *m2)
     82 {
     83  size_t s1len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m1));
     84  size_t s2len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m2));
     85  char *s1 = m1->ptr;
     86  char *s2 = m2->ptr;
     87  int matrix[2][MATCH_CHAR_MAX_LEN] = { 0 };
     88  bool icur = 1;  // save space by storing only two rows for i axis
     89  for (size_t i = 0; i < s1len; i++) {
     90    icur = !icur;
     91    int *e1 = matrix[icur];
     92    int *e2 = matrix[!icur];
     93    for (size_t j = 0; j < s2len; j++) {
     94      // skip char in s1
     95      if (e2[j + 1] > e1[j + 1]) {
     96        e1[j + 1] = e2[j + 1];
     97      }
     98      // skip char in s2
     99      if (e1[j] > e1[j + 1]) {
    100        e1[j + 1] = e1[j];
    101      }
    102      // compare char in s1 and s2
    103      if ((s1[i] == s2[j]) && (e2[j] + 1) > e1[j + 1]) {
    104        e1[j + 1] = e2[j] + 1;
    105      }
    106    }
    107  }
    108  return matrix[icur][s2len];
    109 }
    110 
    111 /// count the matching characters between a variable number of strings "sp"
    112 /// mark the strings that have already been compared to extract them later
    113 /// without re-running the character match counting.
    114 /// @param sp
    115 /// @param fomvals
    116 /// @param n
    117 static int count_n_matched_chars(mmfile_t **sp, const size_t n, bool iwhite)
    118 {
    119  int matched_chars = 0;
    120  int matched = 0;
    121  for (size_t i = 0; i < n; i++) {
    122    for (size_t j = i + 1; j < n; j++) {
    123      if (sp[i]->ptr != NULL && sp[j]->ptr != NULL) {
    124        matched++;
    125        // TODO(lewis6991): handle whitespace ignoring higher up in the stack
    126        matched_chars += iwhite ? matching_chars_iwhite(sp[i], sp[j])
    127                                : matching_chars(sp[i], sp[j]);
    128      }
    129    }
    130  }
    131 
    132  // prioritize a match of 3 (or more lines) equally to a match of 2 lines
    133  if (matched >= 2) {
    134    matched_chars *= 2;
    135    matched_chars /= matched;
    136  }
    137 
    138  return matched_chars;
    139 }
    140 
    141 mmfile_t fastforward_buf_to_lnum(mmfile_t s, linenr_T lnum)
    142 {
    143  for (int i = 0; i < lnum - 1; i++) {
    144    char *line_end = memchr(s.ptr, '\n', (size_t)s.size);
    145    s.size = line_end ? (int)(s.size - (line_end - s.ptr)) : 0;
    146    s.ptr = line_end;
    147    if (!s.ptr) {
    148      break;
    149    }
    150    s.ptr++;
    151    s.size--;
    152  }
    153  return s;
    154 }
    155 
    156 /// try all the different ways to compare these lines and use the one that
    157 /// results in the most matching characters
    158 /// @param df_iters
    159 /// @param paths
    160 /// @param npaths
    161 /// @param path_idx
    162 /// @param choice
    163 /// @param diffcmppath
    164 /// @param diff_len
    165 /// @param ndiffs
    166 /// @param diff_blk
    167 static void try_possible_paths(const int *df_iters, const size_t *paths, const int npaths,
    168                               const int path_idx, int *choice, diffcmppath_T *diffcmppath,
    169                               const int *diff_len, const size_t ndiffs, const mmfile_t **diff_blk,
    170                               bool iwhite)
    171 {
    172  if (path_idx == npaths) {
    173    if ((*choice) > 0) {
    174      int from_vals[LN_MAX_BUFS] = { 0 };
    175      const int *to_vals = df_iters;
    176      mmfile_t mm[LN_MAX_BUFS];  // stack memory for current_lines
    177      mmfile_t *current_lines[LN_MAX_BUFS];
    178      for (size_t k = 0; k < ndiffs; k++) {
    179        from_vals[k] = df_iters[k];
    180        // get the index at all of the places
    181        if ((*choice) & (1 << k)) {
    182          from_vals[k]--;
    183          mm[k] = fastforward_buf_to_lnum(*diff_blk[k], df_iters[k]);
    184        } else {
    185          mm[k] = (mmfile_t){ 0 };
    186        }
    187        current_lines[k] = &mm[k];
    188      }
    189      size_t unwrapped_idx_from = unwrap_indexes(from_vals, diff_len, ndiffs);
    190      size_t unwrapped_idx_to = unwrap_indexes(to_vals, diff_len, ndiffs);
    191      int matched_chars = count_n_matched_chars(current_lines, ndiffs, iwhite);
    192      int score = diffcmppath[unwrapped_idx_from].df_lev_score + matched_chars;
    193      if (score > diffcmppath[unwrapped_idx_to].df_lev_score) {
    194        diffcmppath[unwrapped_idx_to].df_path_n = 1;
    195        diffcmppath[unwrapped_idx_to].df_decision[0] = &diffcmppath[unwrapped_idx_from];
    196        diffcmppath[unwrapped_idx_to].df_choice[0] = *choice;
    197        diffcmppath[unwrapped_idx_to].df_lev_score = score;
    198      } else if (score == diffcmppath[unwrapped_idx_to].df_lev_score) {
    199        size_t k = diffcmppath[unwrapped_idx_to].df_path_n++;
    200        diffcmppath[unwrapped_idx_to].df_decision[k] = &diffcmppath[unwrapped_idx_from];
    201        diffcmppath[unwrapped_idx_to].df_choice[k] = *choice;
    202      }
    203    }
    204    return;
    205  }
    206  size_t bit_place = paths[path_idx];
    207  *(choice) |= (1 << bit_place);  // set it to 1
    208  try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice,
    209                     diffcmppath, diff_len, ndiffs, diff_blk, iwhite);
    210  *(choice) &= ~(1 << bit_place);  // set it to 0
    211  try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice,
    212                     diffcmppath, diff_len, ndiffs, diff_blk, iwhite);
    213 }
    214 
    215 /// unwrap indexes to access n dimensional tensor
    216 /// @param values
    217 /// @param diff_len
    218 /// @param ndiffs
    219 static size_t unwrap_indexes(const int *values, const int *diff_len, const size_t ndiffs)
    220 {
    221  size_t num_unwrap_scalar = 1;
    222  for (size_t k = 0; k < ndiffs; k++) {
    223    num_unwrap_scalar *= (size_t)diff_len[k] + 1;
    224  }
    225 
    226  size_t path_idx = 0;
    227  for (size_t k = 0; k < ndiffs; k++) {
    228    num_unwrap_scalar /= (size_t)diff_len[k] + 1;
    229 
    230    int n = values[k];
    231    path_idx += num_unwrap_scalar * (size_t)n;
    232  }
    233  return path_idx;
    234 }
    235 
    236 /// populate the values of the linematch algorithm tensor, and find the best
    237 /// decision for how to compare the relevant lines from each of the buffers at
    238 /// each point in the tensor
    239 /// @param df_iters
    240 /// @param ch_dim
    241 /// @param diffcmppath
    242 /// @param diff_len
    243 /// @param ndiffs
    244 /// @param diff_blk
    245 static void populate_tensor(int *df_iters, const size_t ch_dim, diffcmppath_T *diffcmppath,
    246                            const int *diff_len, const size_t ndiffs, const mmfile_t **diff_blk,
    247                            bool iwhite)
    248 {
    249  if (ch_dim == ndiffs) {
    250    int npaths = 0;
    251    size_t paths[LN_MAX_BUFS];
    252 
    253    for (size_t j = 0; j < ndiffs; j++) {
    254      if (df_iters[j] > 0) {
    255        paths[npaths] = j;
    256        npaths++;
    257      }
    258    }
    259    int choice = 0;
    260    size_t unwrapper_idx_to = unwrap_indexes(df_iters, diff_len, ndiffs);
    261    diffcmppath[unwrapper_idx_to].df_lev_score = -1;
    262    try_possible_paths(df_iters, paths, npaths, 0, &choice, diffcmppath,
    263                       diff_len, ndiffs, diff_blk, iwhite);
    264    return;
    265  }
    266 
    267  for (int i = 0; i <= diff_len[ch_dim]; i++) {
    268    df_iters[ch_dim] = i;
    269    populate_tensor(df_iters, ch_dim + 1, diffcmppath, diff_len,
    270                    ndiffs, diff_blk, iwhite);
    271  }
    272 }
    273 
    274 /// algorithm to find an optimal alignment of lines of a diff block with 2 or
    275 /// more files. The algorithm is generalized to work for any number of files
    276 /// which corresponds to another dimension added to the tensor used in the
    277 /// algorithm
    278 ///
    279 /// for questions and information about the linematch algorithm please contact
    280 /// Jonathon White (jonathonwhite@protonmail.com)
    281 ///
    282 /// for explanation, a summary of the algorithm in 3 dimensions (3 files
    283 ///     compared) follows
    284 ///
    285 /// The 3d case (for 3 buffers) of the algorithm implemented when diffopt
    286 /// 'linematch' is enabled. The algorithm constructs a 3d tensor to
    287 /// compare a diff between 3 buffers. The dimensions of the tensor are
    288 /// the length of the diff in each buffer plus 1 A path is constructed by
    289 /// moving from one edge of the cube/3d tensor to the opposite edge.
    290 /// Motions from one cell of the cube to the next represent decisions. In
    291 /// a 3d cube, there are a total of 7 decisions that can be made,
    292 /// represented by the enum df_path3_choice which is defined in
    293 /// buffer_defs.h a comparison of buffer 0 and 1 represents a motion
    294 /// toward the opposite edge of the cube with components along the 0 and
    295 /// 1 axes.  a comparison of buffer 0, 1, and 2 represents a motion
    296 /// toward the opposite edge of the cube with components along the 0, 1,
    297 /// and 2 axes. A skip of buffer 0 represents a motion along only the 0
    298 /// axis. For each action, a point value is awarded, and the path is
    299 /// saved for reference later, if it is found to have been the optimal
    300 /// path. The optimal path has the highest score.  The score is
    301 /// calculated as the summation of the total characters matching between
    302 /// all of the lines which were compared. The structure of the algorithm
    303 /// is that of a dynamic programming problem.  We can calculate a point
    304 /// i,j,k in the cube as a function of i-1, j-1, and k-1. To find the
    305 /// score and path at point i,j,k, we must determine which path we want
    306 /// to use, this is done by looking at the possibilities and choosing
    307 /// the one which results in the local highest score.  The total highest
    308 /// scored path is, then in the end represented by the cell in the
    309 /// opposite corner from the start location.  The entire algorithm
    310 /// consists of populating the 3d cube with the optimal paths from which
    311 /// it may have came.
    312 ///
    313 /// Optimizations:
    314 /// As the function to calculate the cell of a tensor at point i,j,k is a
    315 /// function of the cells at i-1, j-1, k-1, the whole tensor doesn't need
    316 /// to be stored in memory at once. In the case of the 3d cube, only two
    317 /// slices (along k and j axis) are stored in memory. For the 2d matrix
    318 /// (for 2 files), only two rows are stored at a time. The next/previous
    319 /// slice (or row) is always calculated from the other, and they alternate
    320 /// at each iteration.
    321 /// In the 3d case, 3 arrays are populated to memorize the score (matched
    322 /// characters) of the 3 buffers, so a redundant calculation of the
    323 /// scores does not occur
    324 /// @param diff_blk
    325 /// @param diff_len
    326 /// @param ndiffs
    327 /// @param [out] [allocated] decisions
    328 /// @return the length of decisions
    329 size_t linematch_nbuffers(const mmfile_t **diff_blk, const int *diff_len, const size_t ndiffs,
    330                          int **decisions, bool iwhite)
    331 {
    332  assert(ndiffs <= LN_MAX_BUFS);
    333 
    334  size_t memsize = 1;
    335  size_t memsize_decisions = 0;
    336  for (size_t i = 0; i < ndiffs; i++) {
    337    assert(diff_len[i] >= 0);
    338    memsize *= (size_t)(diff_len[i] + 1);
    339    memsize_decisions += (size_t)diff_len[i];
    340  }
    341 
    342  // create the flattened path matrix
    343  diffcmppath_T *diffcmppath = xmalloc(sizeof(diffcmppath_T) * memsize);
    344  // allocate memory here
    345  size_t n = (size_t)pow(2.0, (double)ndiffs);
    346  for (size_t i = 0; i < memsize; i++) {
    347    diffcmppath[i].df_lev_score = 0;
    348    diffcmppath[i].df_path_n = 0;
    349    for (size_t j = 0; j < n; j++) {
    350      diffcmppath[i].df_choice_mem[j] = -1;
    351    }
    352  }
    353 
    354  // memory for avoiding repetitive calculations of score
    355  int df_iters[LN_MAX_BUFS];
    356  populate_tensor(df_iters, 0, diffcmppath, diff_len, ndiffs, diff_blk, iwhite);
    357 
    358  const size_t u = unwrap_indexes(diff_len, diff_len, ndiffs);
    359  diffcmppath_T *startNode = &diffcmppath[u];
    360 
    361  *decisions = xmalloc(sizeof(int) * memsize_decisions);
    362  size_t n_optimal = 0;
    363  test_charmatch_paths(startNode, 0);
    364  while (startNode->df_path_n > 0) {
    365    size_t j = startNode->df_optimal_choice;
    366    (*decisions)[n_optimal++] = startNode->df_choice[j];
    367    startNode = startNode->df_decision[j];
    368  }
    369  // reverse array
    370  for (size_t i = 0; i < (n_optimal / 2); i++) {
    371    int tmp = (*decisions)[i];
    372    (*decisions)[i] = (*decisions)[n_optimal - 1 - i];
    373    (*decisions)[n_optimal - 1 - i] = tmp;
    374  }
    375 
    376  xfree(diffcmppath);
    377 
    378  return n_optimal;
    379 }
    380 
    381 // returns the minimum amount of path changes from start to end
    382 static size_t test_charmatch_paths(diffcmppath_T *node, int lastdecision)
    383 {
    384  // memoization
    385  if (node->df_choice_mem[lastdecision] == -1) {
    386    if (node->df_path_n == 0) {
    387      // we have reached the end of the tree
    388      node->df_choice_mem[lastdecision] = 0;
    389    } else {
    390      size_t minimum_turns = SIZE_MAX;  // the minimum amount of turns required to reach the end
    391      for (size_t i = 0; i < node->df_path_n; i++) {
    392        // recurse
    393        size_t t = test_charmatch_paths(node->df_decision[i], node->df_choice[i]) +
    394                   (lastdecision != node->df_choice[i] ? 1 : 0);
    395        if (t < minimum_turns) {
    396          node->df_optimal_choice = i;
    397          minimum_turns = t;
    398        }
    399      }
    400      node->df_choice_mem[lastdecision] = (int)minimum_turns;
    401    }
    402  }
    403  return (size_t)node->df_choice_mem[lastdecision];
    404 }