xpatience.c (10597B)
1 /* 2 * LibXDiff by Davide Libenzi ( File Differential Library ) 3 * Copyright (C) 2003-2016 Davide Libenzi, Johannes E. Schindelin 4 * 5 * This library is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Lesser General Public 7 * License as published by the Free Software Foundation; either 8 * version 2.1 of the License, or (at your option) any later version. 9 * 10 * This library is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Lesser General Public License for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public 16 * License along with this library; if not, see 17 * <http://www.gnu.org/licenses/>. 18 * 19 * Davide Libenzi <davidel@xmailserver.org> 20 * 21 */ 22 #include "xinclude.h" 23 24 /* 25 * The basic idea of patience diff is to find lines that are unique in 26 * both files. These are intuitively the ones that we want to see as 27 * common lines. 28 * 29 * The maximal ordered sequence of such line pairs (where ordered means 30 * that the order in the sequence agrees with the order of the lines in 31 * both files) naturally defines an initial set of common lines. 32 * 33 * Now, the algorithm tries to extend the set of common lines by growing 34 * the line ranges where the files have identical lines. 35 * 36 * Between those common lines, the patience diff algorithm is applied 37 * recursively, until no unique line pairs can be found; these line ranges 38 * are handled by the well-known Myers algorithm. 39 */ 40 41 #define NON_UNIQUE ULONG_MAX 42 43 /* 44 * This is a hash mapping from line hash to line numbers in the first and 45 * second file. 46 */ 47 struct hashmap { 48 int nr, alloc; 49 struct entry { 50 unsigned long hash; 51 /* 52 * 0 = unused entry, 1 = first line, 2 = second, etc. 53 * line2 is NON_UNIQUE if the line is not unique 54 * in either the first or the second file. 55 */ 56 unsigned long line1, line2; 57 /* 58 * "next" & "previous" are used for the longest common 59 * sequence; 60 * initially, "next" reflects only the order in file1. 61 */ 62 struct entry *next, *previous; 63 64 /* 65 * If 1, this entry can serve as an anchor. See 66 * Documentation/diff-options.txt for more information. 67 */ 68 unsigned anchor : 1; 69 } *entries, *first, *last; 70 /* were common records found? */ 71 unsigned long has_matches; 72 mmfile_t *file1, *file2; 73 xdfenv_t *env; 74 xpparam_t const *xpp; 75 }; 76 77 static int is_anchor(xpparam_t const *xpp, const char *line) 78 { 79 int i; 80 for (i = 0; i < (int)xpp->anchors_nr; i++) { 81 if (!strncmp(line, xpp->anchors[i], strlen(xpp->anchors[i]))) 82 return 1; 83 } 84 return 0; 85 } 86 87 /* The argument "pass" is 1 for the first file, 2 for the second. */ 88 static void insert_record(xpparam_t const *xpp, int line, struct hashmap *map, 89 int pass) 90 { 91 xrecord_t **records = pass == 1 ? 92 map->env->xdf1.recs : map->env->xdf2.recs; 93 xrecord_t *record = records[line - 1]; 94 /* 95 * After xdl_prepare_env() (or more precisely, due to 96 * xdl_classify_record()), the "ha" member of the records (AKA lines) 97 * is _not_ the hash anymore, but a linearized version of it. In 98 * other words, the "ha" member is guaranteed to start with 0 and 99 * the second record's ha can only be 0 or 1, etc. 100 * 101 * So we multiply ha by 2 in the hope that the hashing was 102 * "unique enough". 103 */ 104 int index = (int)((record->ha << 1) % map->alloc); 105 106 while (map->entries[index].line1) { 107 if (map->entries[index].hash != record->ha) { 108 if (++index >= map->alloc) 109 index = 0; 110 continue; 111 } 112 if (pass == 2) 113 map->has_matches = 1; 114 if (pass == 1 || map->entries[index].line2) 115 map->entries[index].line2 = NON_UNIQUE; 116 else 117 map->entries[index].line2 = line; 118 return; 119 } 120 if (pass == 2) 121 return; 122 map->entries[index].line1 = line; 123 map->entries[index].hash = record->ha; 124 map->entries[index].anchor = is_anchor(xpp, map->env->xdf1.recs[line - 1]->ptr); 125 if (!map->first) 126 map->first = map->entries + index; 127 if (map->last) { 128 map->last->next = map->entries + index; 129 map->entries[index].previous = map->last; 130 } 131 map->last = map->entries + index; 132 map->nr++; 133 } 134 135 /* 136 * This function has to be called for each recursion into the inter-hunk 137 * parts, as previously non-unique lines can become unique when being 138 * restricted to a smaller part of the files. 139 * 140 * It is assumed that env has been prepared using xdl_prepare(). 141 */ 142 static int fill_hashmap(mmfile_t *file1, mmfile_t *file2, 143 xpparam_t const *xpp, xdfenv_t *env, 144 struct hashmap *result, 145 int line1, int count1, int line2, int count2) 146 { 147 result->file1 = file1; 148 result->file2 = file2; 149 result->xpp = xpp; 150 result->env = env; 151 152 /* We know exactly how large we want the hash map */ 153 result->alloc = count1 * 2; 154 result->entries = (struct entry *) 155 xdl_malloc(result->alloc * sizeof(struct entry)); 156 if (!result->entries) 157 return -1; 158 memset(result->entries, 0, result->alloc * sizeof(struct entry)); 159 160 /* First, fill with entries from the first file */ 161 while (count1--) 162 insert_record(xpp, line1++, result, 1); 163 164 /* Then search for matches in the second file */ 165 while (count2--) 166 insert_record(xpp, line2++, result, 2); 167 168 return 0; 169 } 170 171 /* 172 * Find the longest sequence with a smaller last element (meaning a smaller 173 * line2, as we construct the sequence with entries ordered by line1). 174 */ 175 static int binary_search(struct entry **sequence, int longest, 176 struct entry *entry) 177 { 178 int left = -1, right = longest; 179 180 while (left + 1 < right) { 181 int middle = left + (right - left) / 2; 182 /* by construction, no two entries can be equal */ 183 if (sequence[middle]->line2 > entry->line2) 184 right = middle; 185 else 186 left = middle; 187 } 188 /* return the index in "sequence", _not_ the sequence length */ 189 return left; 190 } 191 192 /* 193 * The idea is to start with the list of common unique lines sorted by 194 * the order in file1. For each of these pairs, the longest (partial) 195 * sequence whose last element's line2 is smaller is determined. 196 * 197 * For efficiency, the sequences are kept in a list containing exactly one 198 * item per sequence length: the sequence with the smallest last 199 * element (in terms of line2). 200 */ 201 static struct entry *find_longest_common_sequence(struct hashmap *map) 202 { 203 struct entry **sequence = xdl_malloc(map->nr * sizeof(struct entry *)); 204 int longest = 0, i; 205 struct entry *entry; 206 207 /* 208 * If not -1, this entry in sequence must never be overridden. 209 * Therefore, overriding entries before this has no effect, so 210 * do not do that either. 211 */ 212 int anchor_i = -1; 213 214 // Added to silence Coverity. 215 if (sequence == NULL) 216 return map->first; 217 218 for (entry = map->first; entry; entry = entry->next) { 219 if (!entry->line2 || entry->line2 == NON_UNIQUE) 220 continue; 221 i = binary_search(sequence, longest, entry); 222 entry->previous = i < 0 ? NULL : sequence[i]; 223 ++i; 224 if (i <= anchor_i) 225 continue; 226 sequence[i] = entry; 227 if (entry->anchor) { 228 anchor_i = i; 229 longest = anchor_i + 1; 230 } else if (i == longest) { 231 longest++; 232 } 233 } 234 235 /* No common unique lines were found */ 236 if (!longest) { 237 xdl_free(sequence); 238 return NULL; 239 } 240 241 /* Iterate starting at the last element, adjusting the "next" members */ 242 entry = sequence[longest - 1]; 243 entry->next = NULL; 244 while (entry->previous) { 245 entry->previous->next = entry; 246 entry = entry->previous; 247 } 248 xdl_free(sequence); 249 return entry; 250 } 251 252 static int match(struct hashmap *map, int line1, int line2) 253 { 254 xrecord_t *record1 = map->env->xdf1.recs[line1 - 1]; 255 xrecord_t *record2 = map->env->xdf2.recs[line2 - 1]; 256 return record1->ha == record2->ha; 257 } 258 259 static int patience_diff(mmfile_t *file1, mmfile_t *file2, 260 xpparam_t const *xpp, xdfenv_t *env, 261 int line1, int count1, int line2, int count2); 262 263 static int walk_common_sequence(struct hashmap *map, struct entry *first, 264 int line1, int count1, int line2, int count2) 265 { 266 int end1 = line1 + count1, end2 = line2 + count2; 267 int next1, next2; 268 269 for (;;) { 270 /* Try to grow the line ranges of common lines */ 271 if (first) { 272 next1 = first->line1; 273 next2 = first->line2; 274 while (next1 > line1 && next2 > line2 && 275 match(map, next1 - 1, next2 - 1)) { 276 next1--; 277 next2--; 278 } 279 } else { 280 next1 = end1; 281 next2 = end2; 282 } 283 while (line1 < next1 && line2 < next2 && 284 match(map, line1, line2)) { 285 line1++; 286 line2++; 287 } 288 289 /* Recurse */ 290 if (next1 > line1 || next2 > line2) { 291 if (patience_diff(map->file1, map->file2, 292 map->xpp, map->env, 293 line1, next1 - line1, 294 line2, next2 - line2)) 295 return -1; 296 } 297 298 if (!first) 299 return 0; 300 301 while (first->next && 302 first->next->line1 == first->line1 + 1 && 303 first->next->line2 == first->line2 + 1) 304 first = first->next; 305 306 line1 = first->line1 + 1; 307 line2 = first->line2 + 1; 308 309 first = first->next; 310 } 311 } 312 313 static int fall_back_to_classic_diff(struct hashmap *map, 314 int line1, int count1, int line2, int count2) 315 { 316 xpparam_t xpp; 317 318 memset(&xpp, 0, sizeof(xpp)); 319 xpp.flags = map->xpp->flags & ~XDF_DIFF_ALGORITHM_MASK; 320 321 return xdl_fall_back_diff(map->env, &xpp, 322 line1, count1, line2, count2); 323 } 324 325 /* 326 * Recursively find the longest common sequence of unique lines, 327 * and if none was found, ask xdl_do_diff() to do the job. 328 * 329 * This function assumes that env was prepared with xdl_prepare_env(). 330 */ 331 static int patience_diff(mmfile_t *file1, mmfile_t *file2, 332 xpparam_t const *xpp, xdfenv_t *env, 333 int line1, int count1, int line2, int count2) 334 { 335 struct hashmap map; 336 struct entry *first; 337 int result = 0; 338 339 /* trivial case: one side is empty */ 340 if (!count1) { 341 while(count2--) 342 env->xdf2.rchg[line2++ - 1] = 1; 343 return 0; 344 } else if (!count2) { 345 while(count1--) 346 env->xdf1.rchg[line1++ - 1] = 1; 347 return 0; 348 } 349 350 memset(&map, 0, sizeof(map)); 351 if (fill_hashmap(file1, file2, xpp, env, &map, 352 line1, count1, line2, count2)) 353 return -1; 354 355 /* are there any matching lines at all? */ 356 if (!map.has_matches) { 357 while(count1--) 358 env->xdf1.rchg[line1++ - 1] = 1; 359 while(count2--) 360 env->xdf2.rchg[line2++ - 1] = 1; 361 xdl_free(map.entries); 362 return 0; 363 } 364 365 first = find_longest_common_sequence(&map); 366 if (first) 367 result = walk_common_sequence(&map, first, 368 line1, count1, line2, count2); 369 else 370 result = fall_back_to_classic_diff(&map, 371 line1, count1, line2, count2); 372 373 xdl_free(map.entries); 374 return result; 375 } 376 377 int xdl_do_patience_diff(mmfile_t *file1, mmfile_t *file2, 378 xpparam_t const *xpp, xdfenv_t *env) 379 { 380 if (xdl_prepare_env(file1, file2, xpp, env) < 0) 381 return -1; 382 383 /* environment is cleaned up in xdl_diff() */ 384 return patience_diff(file1, file2, xpp, env, 385 1, env->xdf1.nrec, 1, env->xdf2.nrec); 386 }