linematch.c (14517B)
1 #include <assert.h> 2 #include <math.h> 3 #include <stdbool.h> 4 #include <stddef.h> 5 #include <stdint.h> 6 7 #include "nvim/linematch.h" 8 #include "nvim/macros_defs.h" 9 #include "nvim/memory.h" 10 #include "nvim/pos_defs.h" 11 #include "xdiff/xdiff.h" 12 13 #define LN_MAX_BUFS 8 14 #define LN_DECISION_MAX 255 // pow(2, LN_MAX_BUFS(8)) - 1 = 255 15 16 // struct for running the diff linematch algorithm 17 typedef struct diffcmppath_S diffcmppath_T; 18 struct diffcmppath_S { 19 int df_lev_score; // to keep track of the total score of this path 20 size_t df_path_n; // current index of this path 21 int df_choice_mem[LN_DECISION_MAX + 1]; 22 int df_choice[LN_DECISION_MAX]; 23 diffcmppath_T *df_decision[LN_DECISION_MAX]; // to keep track of this path traveled 24 size_t df_optimal_choice; 25 }; 26 27 #include "linematch.c.generated.h" 28 29 static size_t line_len(const mmfile_t *m) 30 { 31 char *s = m->ptr; 32 char *end = memchr(s, '\n', (size_t)m->size); 33 return end ? (size_t)(end - s) : (size_t)m->size; 34 } 35 36 #define MATCH_CHAR_MAX_LEN 800 37 38 /// Same as matching_chars but ignore whitespace 39 /// 40 /// @param s1 41 /// @param s2 42 static int matching_chars_iwhite(const mmfile_t *s1, const mmfile_t *s2) 43 { 44 // the newly processed strings that will be compared 45 // delete the white space characters 46 mmfile_t sp[2]; 47 char p[2][MATCH_CHAR_MAX_LEN]; 48 for (int k = 0; k < 2; k++) { 49 const mmfile_t *s = k == 0 ? s1 : s2; 50 size_t pi = 0; 51 size_t slen = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s)); 52 for (size_t i = 0; i <= slen; i++) { 53 char e = s->ptr[i]; 54 if (e != ' ' && e != '\t') { 55 p[k][pi] = e; 56 pi++; 57 } 58 } 59 60 sp[k] = (mmfile_t){ 61 .ptr = p[k], 62 .size = (int)pi, 63 }; 64 } 65 return matching_chars(&sp[0], &sp[1]); 66 } 67 68 /// Return matching characters between "s1" and "s2" whilst respecting sequence order. 69 /// Consider the case of two strings 'AAACCC' and 'CCCAAA', the 70 /// return value from this function will be 3, either to match 71 /// the 3 C's, or the 3 A's. 72 /// 73 /// Examples: 74 /// matching_chars("aabc", "acba") -> 2 // 'a' and 'b' in common 75 /// matching_chars("123hello567", "he123ll567o") -> 8 // '123', 'll' and '567' in common 76 /// matching_chars("abcdefg", "gfedcba") -> 1 // all characters in common, 77 /// // but only at most 1 in sequence 78 /// 79 /// @param m1 80 /// @param m2 81 static int matching_chars(const mmfile_t *m1, const mmfile_t *m2) 82 { 83 size_t s1len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m1)); 84 size_t s2len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m2)); 85 char *s1 = m1->ptr; 86 char *s2 = m2->ptr; 87 int matrix[2][MATCH_CHAR_MAX_LEN] = { 0 }; 88 bool icur = 1; // save space by storing only two rows for i axis 89 for (size_t i = 0; i < s1len; i++) { 90 icur = !icur; 91 int *e1 = matrix[icur]; 92 int *e2 = matrix[!icur]; 93 for (size_t j = 0; j < s2len; j++) { 94 // skip char in s1 95 if (e2[j + 1] > e1[j + 1]) { 96 e1[j + 1] = e2[j + 1]; 97 } 98 // skip char in s2 99 if (e1[j] > e1[j + 1]) { 100 e1[j + 1] = e1[j]; 101 } 102 // compare char in s1 and s2 103 if ((s1[i] == s2[j]) && (e2[j] + 1) > e1[j + 1]) { 104 e1[j + 1] = e2[j] + 1; 105 } 106 } 107 } 108 return matrix[icur][s2len]; 109 } 110 111 /// count the matching characters between a variable number of strings "sp" 112 /// mark the strings that have already been compared to extract them later 113 /// without re-running the character match counting. 114 /// @param sp 115 /// @param fomvals 116 /// @param n 117 static int count_n_matched_chars(mmfile_t **sp, const size_t n, bool iwhite) 118 { 119 int matched_chars = 0; 120 int matched = 0; 121 for (size_t i = 0; i < n; i++) { 122 for (size_t j = i + 1; j < n; j++) { 123 if (sp[i]->ptr != NULL && sp[j]->ptr != NULL) { 124 matched++; 125 // TODO(lewis6991): handle whitespace ignoring higher up in the stack 126 matched_chars += iwhite ? matching_chars_iwhite(sp[i], sp[j]) 127 : matching_chars(sp[i], sp[j]); 128 } 129 } 130 } 131 132 // prioritize a match of 3 (or more lines) equally to a match of 2 lines 133 if (matched >= 2) { 134 matched_chars *= 2; 135 matched_chars /= matched; 136 } 137 138 return matched_chars; 139 } 140 141 mmfile_t fastforward_buf_to_lnum(mmfile_t s, linenr_T lnum) 142 { 143 for (int i = 0; i < lnum - 1; i++) { 144 char *line_end = memchr(s.ptr, '\n', (size_t)s.size); 145 s.size = line_end ? (int)(s.size - (line_end - s.ptr)) : 0; 146 s.ptr = line_end; 147 if (!s.ptr) { 148 break; 149 } 150 s.ptr++; 151 s.size--; 152 } 153 return s; 154 } 155 156 /// try all the different ways to compare these lines and use the one that 157 /// results in the most matching characters 158 /// @param df_iters 159 /// @param paths 160 /// @param npaths 161 /// @param path_idx 162 /// @param choice 163 /// @param diffcmppath 164 /// @param diff_len 165 /// @param ndiffs 166 /// @param diff_blk 167 static void try_possible_paths(const int *df_iters, const size_t *paths, const int npaths, 168 const int path_idx, int *choice, diffcmppath_T *diffcmppath, 169 const int *diff_len, const size_t ndiffs, const mmfile_t **diff_blk, 170 bool iwhite) 171 { 172 if (path_idx == npaths) { 173 if ((*choice) > 0) { 174 int from_vals[LN_MAX_BUFS] = { 0 }; 175 const int *to_vals = df_iters; 176 mmfile_t mm[LN_MAX_BUFS]; // stack memory for current_lines 177 mmfile_t *current_lines[LN_MAX_BUFS]; 178 for (size_t k = 0; k < ndiffs; k++) { 179 from_vals[k] = df_iters[k]; 180 // get the index at all of the places 181 if ((*choice) & (1 << k)) { 182 from_vals[k]--; 183 mm[k] = fastforward_buf_to_lnum(*diff_blk[k], df_iters[k]); 184 } else { 185 mm[k] = (mmfile_t){ 0 }; 186 } 187 current_lines[k] = &mm[k]; 188 } 189 size_t unwrapped_idx_from = unwrap_indexes(from_vals, diff_len, ndiffs); 190 size_t unwrapped_idx_to = unwrap_indexes(to_vals, diff_len, ndiffs); 191 int matched_chars = count_n_matched_chars(current_lines, ndiffs, iwhite); 192 int score = diffcmppath[unwrapped_idx_from].df_lev_score + matched_chars; 193 if (score > diffcmppath[unwrapped_idx_to].df_lev_score) { 194 diffcmppath[unwrapped_idx_to].df_path_n = 1; 195 diffcmppath[unwrapped_idx_to].df_decision[0] = &diffcmppath[unwrapped_idx_from]; 196 diffcmppath[unwrapped_idx_to].df_choice[0] = *choice; 197 diffcmppath[unwrapped_idx_to].df_lev_score = score; 198 } else if (score == diffcmppath[unwrapped_idx_to].df_lev_score) { 199 size_t k = diffcmppath[unwrapped_idx_to].df_path_n++; 200 diffcmppath[unwrapped_idx_to].df_decision[k] = &diffcmppath[unwrapped_idx_from]; 201 diffcmppath[unwrapped_idx_to].df_choice[k] = *choice; 202 } 203 } 204 return; 205 } 206 size_t bit_place = paths[path_idx]; 207 *(choice) |= (1 << bit_place); // set it to 1 208 try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice, 209 diffcmppath, diff_len, ndiffs, diff_blk, iwhite); 210 *(choice) &= ~(1 << bit_place); // set it to 0 211 try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice, 212 diffcmppath, diff_len, ndiffs, diff_blk, iwhite); 213 } 214 215 /// unwrap indexes to access n dimensional tensor 216 /// @param values 217 /// @param diff_len 218 /// @param ndiffs 219 static size_t unwrap_indexes(const int *values, const int *diff_len, const size_t ndiffs) 220 { 221 size_t num_unwrap_scalar = 1; 222 for (size_t k = 0; k < ndiffs; k++) { 223 num_unwrap_scalar *= (size_t)diff_len[k] + 1; 224 } 225 226 size_t path_idx = 0; 227 for (size_t k = 0; k < ndiffs; k++) { 228 num_unwrap_scalar /= (size_t)diff_len[k] + 1; 229 230 int n = values[k]; 231 path_idx += num_unwrap_scalar * (size_t)n; 232 } 233 return path_idx; 234 } 235 236 /// populate the values of the linematch algorithm tensor, and find the best 237 /// decision for how to compare the relevant lines from each of the buffers at 238 /// each point in the tensor 239 /// @param df_iters 240 /// @param ch_dim 241 /// @param diffcmppath 242 /// @param diff_len 243 /// @param ndiffs 244 /// @param diff_blk 245 static void populate_tensor(int *df_iters, const size_t ch_dim, diffcmppath_T *diffcmppath, 246 const int *diff_len, const size_t ndiffs, const mmfile_t **diff_blk, 247 bool iwhite) 248 { 249 if (ch_dim == ndiffs) { 250 int npaths = 0; 251 size_t paths[LN_MAX_BUFS]; 252 253 for (size_t j = 0; j < ndiffs; j++) { 254 if (df_iters[j] > 0) { 255 paths[npaths] = j; 256 npaths++; 257 } 258 } 259 int choice = 0; 260 size_t unwrapper_idx_to = unwrap_indexes(df_iters, diff_len, ndiffs); 261 diffcmppath[unwrapper_idx_to].df_lev_score = -1; 262 try_possible_paths(df_iters, paths, npaths, 0, &choice, diffcmppath, 263 diff_len, ndiffs, diff_blk, iwhite); 264 return; 265 } 266 267 for (int i = 0; i <= diff_len[ch_dim]; i++) { 268 df_iters[ch_dim] = i; 269 populate_tensor(df_iters, ch_dim + 1, diffcmppath, diff_len, 270 ndiffs, diff_blk, iwhite); 271 } 272 } 273 274 /// algorithm to find an optimal alignment of lines of a diff block with 2 or 275 /// more files. The algorithm is generalized to work for any number of files 276 /// which corresponds to another dimension added to the tensor used in the 277 /// algorithm 278 /// 279 /// for questions and information about the linematch algorithm please contact 280 /// Jonathon White (jonathonwhite@protonmail.com) 281 /// 282 /// for explanation, a summary of the algorithm in 3 dimensions (3 files 283 /// compared) follows 284 /// 285 /// The 3d case (for 3 buffers) of the algorithm implemented when diffopt 286 /// 'linematch' is enabled. The algorithm constructs a 3d tensor to 287 /// compare a diff between 3 buffers. The dimensions of the tensor are 288 /// the length of the diff in each buffer plus 1 A path is constructed by 289 /// moving from one edge of the cube/3d tensor to the opposite edge. 290 /// Motions from one cell of the cube to the next represent decisions. In 291 /// a 3d cube, there are a total of 7 decisions that can be made, 292 /// represented by the enum df_path3_choice which is defined in 293 /// buffer_defs.h a comparison of buffer 0 and 1 represents a motion 294 /// toward the opposite edge of the cube with components along the 0 and 295 /// 1 axes. a comparison of buffer 0, 1, and 2 represents a motion 296 /// toward the opposite edge of the cube with components along the 0, 1, 297 /// and 2 axes. A skip of buffer 0 represents a motion along only the 0 298 /// axis. For each action, a point value is awarded, and the path is 299 /// saved for reference later, if it is found to have been the optimal 300 /// path. The optimal path has the highest score. The score is 301 /// calculated as the summation of the total characters matching between 302 /// all of the lines which were compared. The structure of the algorithm 303 /// is that of a dynamic programming problem. We can calculate a point 304 /// i,j,k in the cube as a function of i-1, j-1, and k-1. To find the 305 /// score and path at point i,j,k, we must determine which path we want 306 /// to use, this is done by looking at the possibilities and choosing 307 /// the one which results in the local highest score. The total highest 308 /// scored path is, then in the end represented by the cell in the 309 /// opposite corner from the start location. The entire algorithm 310 /// consists of populating the 3d cube with the optimal paths from which 311 /// it may have came. 312 /// 313 /// Optimizations: 314 /// As the function to calculate the cell of a tensor at point i,j,k is a 315 /// function of the cells at i-1, j-1, k-1, the whole tensor doesn't need 316 /// to be stored in memory at once. In the case of the 3d cube, only two 317 /// slices (along k and j axis) are stored in memory. For the 2d matrix 318 /// (for 2 files), only two rows are stored at a time. The next/previous 319 /// slice (or row) is always calculated from the other, and they alternate 320 /// at each iteration. 321 /// In the 3d case, 3 arrays are populated to memorize the score (matched 322 /// characters) of the 3 buffers, so a redundant calculation of the 323 /// scores does not occur 324 /// @param diff_blk 325 /// @param diff_len 326 /// @param ndiffs 327 /// @param [out] [allocated] decisions 328 /// @return the length of decisions 329 size_t linematch_nbuffers(const mmfile_t **diff_blk, const int *diff_len, const size_t ndiffs, 330 int **decisions, bool iwhite) 331 { 332 assert(ndiffs <= LN_MAX_BUFS); 333 334 size_t memsize = 1; 335 size_t memsize_decisions = 0; 336 for (size_t i = 0; i < ndiffs; i++) { 337 assert(diff_len[i] >= 0); 338 memsize *= (size_t)(diff_len[i] + 1); 339 memsize_decisions += (size_t)diff_len[i]; 340 } 341 342 // create the flattened path matrix 343 diffcmppath_T *diffcmppath = xmalloc(sizeof(diffcmppath_T) * memsize); 344 // allocate memory here 345 size_t n = (size_t)pow(2.0, (double)ndiffs); 346 for (size_t i = 0; i < memsize; i++) { 347 diffcmppath[i].df_lev_score = 0; 348 diffcmppath[i].df_path_n = 0; 349 for (size_t j = 0; j < n; j++) { 350 diffcmppath[i].df_choice_mem[j] = -1; 351 } 352 } 353 354 // memory for avoiding repetitive calculations of score 355 int df_iters[LN_MAX_BUFS]; 356 populate_tensor(df_iters, 0, diffcmppath, diff_len, ndiffs, diff_blk, iwhite); 357 358 const size_t u = unwrap_indexes(diff_len, diff_len, ndiffs); 359 diffcmppath_T *startNode = &diffcmppath[u]; 360 361 *decisions = xmalloc(sizeof(int) * memsize_decisions); 362 size_t n_optimal = 0; 363 test_charmatch_paths(startNode, 0); 364 while (startNode->df_path_n > 0) { 365 size_t j = startNode->df_optimal_choice; 366 (*decisions)[n_optimal++] = startNode->df_choice[j]; 367 startNode = startNode->df_decision[j]; 368 } 369 // reverse array 370 for (size_t i = 0; i < (n_optimal / 2); i++) { 371 int tmp = (*decisions)[i]; 372 (*decisions)[i] = (*decisions)[n_optimal - 1 - i]; 373 (*decisions)[n_optimal - 1 - i] = tmp; 374 } 375 376 xfree(diffcmppath); 377 378 return n_optimal; 379 } 380 381 // returns the minimum amount of path changes from start to end 382 static size_t test_charmatch_paths(diffcmppath_T *node, int lastdecision) 383 { 384 // memoization 385 if (node->df_choice_mem[lastdecision] == -1) { 386 if (node->df_path_n == 0) { 387 // we have reached the end of the tree 388 node->df_choice_mem[lastdecision] = 0; 389 } else { 390 size_t minimum_turns = SIZE_MAX; // the minimum amount of turns required to reach the end 391 for (size_t i = 0; i < node->df_path_n; i++) { 392 // recurse 393 size_t t = test_charmatch_paths(node->df_decision[i], node->df_choice[i]) + 394 (lastdecision != node->df_choice[i] ? 1 : 0); 395 if (t < minimum_turns) { 396 node->df_optimal_choice = i; 397 minimum_turns = t; 398 } 399 } 400 node->df_choice_mem[lastdecision] = (int)minimum_turns; 401 } 402 } 403 return (size_t)node->df_choice_mem[lastdecision]; 404 }