1 /* Analyze file differences for GNU DIFF. 2 Copyright (C) 1988, 1989, 1992, 1993, 1997 Free Software Foundation, Inc. 3 4 This file is part of GNU DIFF. 5 6 GNU DIFF is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 GNU DIFF is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 */ 17 18 /* The basic algorithm is described in: 19 "An O(ND) Difference Algorithm and its Variations", Eugene Myers, 20 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; 21 see especially section 4.2, which describes the variation used below. 22 Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE 23 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) 24 at the price of producing suboptimal output for large inputs with 25 many differences. 26 27 The basic algorithm was independently discovered as described in: 28 "Algorithms for Approximate String Matching", E. Ukkonen, 29 Information and Control Vol. 64, 1985, pp. 100-118. */ 30 31 #include "diff.h" 32 #include "cmpbuf.h" 33 34 extern int no_discards; 35 36 static int *xvec, *yvec; /* Vectors being compared. */ 37 static int *fdiag; /* Vector, indexed by diagonal, containing 38 1 + the X coordinate of the point furthest 39 along the given diagonal in the forward 40 search of the edit matrix. */ 41 static int *bdiag; /* Vector, indexed by diagonal, containing 42 the X coordinate of the point furthest 43 along the given diagonal in the backward 44 search of the edit matrix. */ 45 static int too_expensive; /* Edit scripts longer than this are too 46 expensive to compute. */ 47 48 #define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */ 49 50 struct partition 51 { 52 int xmid, ymid; /* Midpoints of this partition. */ 53 int lo_minimal; /* Nonzero if low half will be analyzed minimally. */ 54 int hi_minimal; /* Likewise for high half. */ 55 }; 56 57 static int diag PARAMS((int, int, int, int, int, struct partition *)); 58 static struct change *add_change PARAMS((int, int, int, int, struct change *)); 59 static struct change *build_reverse_script PARAMS((struct file_data const[])); 60 static struct change *build_script PARAMS((struct file_data const[])); 61 static void briefly_report PARAMS((int, struct file_data const[])); 62 static void compareseq PARAMS((int, int, int, int, int)); 63 static void discard_confusing_lines PARAMS((struct file_data[])); 64 static void shift_boundaries PARAMS((struct file_data[])); 65 66 /* Find the midpoint of the shortest edit script for a specified 67 portion of the two files. 68 69 Scan from the beginnings of the files, and simultaneously from the ends, 70 doing a breadth-first search through the space of edit-sequence. 71 When the two searches meet, we have found the midpoint of the shortest 72 edit sequence. 73 74 If MINIMAL is nonzero, find the minimal edit script regardless 75 of expense. Otherwise, if the search is too expensive, use 76 heuristics to stop the search and report a suboptimal answer. 77 78 Set PART->(XMID,YMID) to the midpoint (XMID,YMID). The diagonal number 79 XMID - YMID equals the number of inserted lines minus the number 80 of deleted lines (counting only lines before the midpoint). 81 Return the approximate edit cost; this is the total number of 82 lines inserted or deleted (counting only lines before the midpoint), 83 unless a heuristic is used to terminate the search prematurely. 84 85 Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script for the 86 left half of the partition is known; similarly for PART->RIGHT_MINIMAL. 87 88 This function assumes that the first lines of the specified portions 89 of the two files do not match, and likewise that the last lines do not 90 match. The caller must trim matching lines from the beginning and end 91 of the portions it is going to specify. 92 93 If we return the "wrong" partitions, 94 the worst this can do is cause suboptimal diff output. 95 It cannot cause incorrect diff output. */ 96 97 static int 98 diag (xoff, xlim, yoff, ylim, minimal, part) 99 int xoff, xlim, yoff, ylim, minimal; 100 struct partition *part; 101 { 102 int *const fd = fdiag; /* Give the compiler a chance. */ 103 int *const bd = bdiag; /* Additional help for the compiler. */ 104 int const *const xv = xvec; /* Still more help for the compiler. */ 105 int const *const yv = yvec; /* And more and more . . . */ 106 int const dmin = xoff - ylim; /* Minimum valid diagonal. */ 107 int const dmax = xlim - yoff; /* Maximum valid diagonal. */ 108 int const fmid = xoff - yoff; /* Center diagonal of top-down search. */ 109 int const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ 110 int fmin = fmid, fmax = fmid; /* Limits of top-down search. */ 111 int bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */ 112 int c; /* Cost. */ 113 int odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd 114 diagonal with respect to the northwest. */ 115 116 fd[fmid] = xoff; 117 bd[bmid] = xlim; 118 119 for (c = 1;; ++c) 120 { 121 int d; /* Active diagonal. */ 122 int big_snake = 0; 123 124 /* Extend the top-down search by an edit step in each diagonal. */ 125 fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin; 126 fmax < dmax ? fd[++fmax + 1] = -1 : --fmax; 127 for (d = fmax; d >= fmin; d -= 2) 128 { 129 int x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1]; 130 131 if (tlo >= thi) 132 x = tlo + 1; 133 else 134 x = thi; 135 oldx = x; 136 y = x - d; 137 while (x < xlim && y < ylim && xv[x] == yv[y]) 138 ++x, ++y; 139 if (x - oldx > SNAKE_LIMIT) 140 big_snake = 1; 141 fd[d] = x; 142 if (odd && bmin <= d && d <= bmax && bd[d] <= x) 143 { 144 part->xmid = x; 145 part->ymid = y; 146 part->lo_minimal = part->hi_minimal = 1; 147 return 2 * c - 1; 148 } 149 } 150 151 /* Similarly extend the bottom-up search. */ 152 bmin > dmin ? bd[--bmin - 1] = INT_MAX : ++bmin; 153 bmax < dmax ? bd[++bmax + 1] = INT_MAX : --bmax; 154 for (d = bmax; d >= bmin; d -= 2) 155 { 156 int x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1]; 157 158 if (tlo < thi) 159 x = tlo; 160 else 161 x = thi - 1; 162 oldx = x; 163 y = x - d; 164 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) 165 --x, --y; 166 if (oldx - x > SNAKE_LIMIT) 167 big_snake = 1; 168 bd[d] = x; 169 if (!odd && fmin <= d && d <= fmax && x <= fd[d]) 170 { 171 part->xmid = x; 172 part->ymid = y; 173 part->lo_minimal = part->hi_minimal = 1; 174 return 2 * c; 175 } 176 } 177 178 if (minimal) 179 continue; 180 181 /* Heuristic: check occasionally for a diagonal that has made 182 lots of progress compared with the edit distance. 183 If we have any such, find the one that has made the most 184 progress and return it as if it had succeeded. 185 186 With this heuristic, for files with a constant small density 187 of changes, the algorithm is linear in the file size. */ 188 189 if (c > 200 && big_snake && heuristic) 190 { 191 int best; 192 193 best = 0; 194 for (d = fmax; d >= fmin; d -= 2) 195 { 196 int dd = d - fmid; 197 int x = fd[d]; 198 int y = x - d; 199 int v = (x - xoff) * 2 - dd; 200 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 201 { 202 if (v > best 203 && xoff + SNAKE_LIMIT <= x && x < xlim 204 && yoff + SNAKE_LIMIT <= y && y < ylim) 205 { 206 /* We have a good enough best diagonal; 207 now insist that it end with a significant snake. */ 208 int k; 209 210 for (k = 1; xv[x - k] == yv[y - k]; k++) 211 if (k == SNAKE_LIMIT) 212 { 213 best = v; 214 part->xmid = x; 215 part->ymid = y; 216 break; 217 } 218 } 219 } 220 } 221 if (best > 0) 222 { 223 part->lo_minimal = 1; 224 part->hi_minimal = 0; 225 return 2 * c - 1; 226 } 227 228 best = 0; 229 for (d = bmax; d >= bmin; d -= 2) 230 { 231 int dd = d - bmid; 232 int x = bd[d]; 233 int y = x - d; 234 int v = (xlim - x) * 2 + dd; 235 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 236 { 237 if (v > best 238 && xoff < x && x <= xlim - SNAKE_LIMIT 239 && yoff < y && y <= ylim - SNAKE_LIMIT) 240 { 241 /* We have a good enough best diagonal; 242 now insist that it end with a significant snake. */ 243 int k; 244 245 for (k = 0; xv[x + k] == yv[y + k]; k++) 246 if (k == SNAKE_LIMIT - 1) 247 { 248 best = v; 249 part->xmid = x; 250 part->ymid = y; 251 break; 252 } 253 } 254 } 255 } 256 if (best > 0) 257 { 258 part->lo_minimal = 0; 259 part->hi_minimal = 1; 260 return 2 * c - 1; 261 } 262 } 263 264 /* Heuristic: if we've gone well beyond the call of duty, 265 give up and report halfway between our best results so far. */ 266 if (c >= too_expensive) 267 { 268 int fxybest, fxbest; 269 int bxybest, bxbest; 270 271 fxbest = bxbest = 0; /* Pacify `gcc -Wall'. */ 272 273 /* Find forward diagonal that maximizes X + Y. */ 274 fxybest = -1; 275 for (d = fmax; d >= fmin; d -= 2) 276 { 277 int x = min (fd[d], xlim); 278 int y = x - d; 279 if (ylim < y) 280 x = ylim + d, y = ylim; 281 if (fxybest < x + y) 282 { 283 fxybest = x + y; 284 fxbest = x; 285 } 286 } 287 288 /* Find backward diagonal that minimizes X + Y. */ 289 bxybest = INT_MAX; 290 for (d = bmax; d >= bmin; d -= 2) 291 { 292 int x = max (xoff, bd[d]); 293 int y = x - d; 294 if (y < yoff) 295 x = yoff + d, y = yoff; 296 if (x + y < bxybest) 297 { 298 bxybest = x + y; 299 bxbest = x; 300 } 301 } 302 303 /* Use the better of the two diagonals. */ 304 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) 305 { 306 part->xmid = fxbest; 307 part->ymid = fxybest - fxbest; 308 part->lo_minimal = 1; 309 part->hi_minimal = 0; 310 } 311 else 312 { 313 part->xmid = bxbest; 314 part->ymid = bxybest - bxbest; 315 part->lo_minimal = 0; 316 part->hi_minimal = 1; 317 } 318 return 2 * c - 1; 319 } 320 } 321 } 322 323 /* Compare in detail contiguous subsequences of the two files 325 which are known, as a whole, to match each other. 326 327 The results are recorded in the vectors files[N].changed_flag, by 328 storing a 1 in the element for each line that is an insertion or deletion. 329 330 The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. 331 332 Note that XLIM, YLIM are exclusive bounds. 333 All line numbers are origin-0 and discarded lines are not counted. 334 335 If MINIMAL is nonzero, find a minimal difference no matter how 336 expensive it is. */ 337 338 static void 339 compareseq (xoff, xlim, yoff, ylim, minimal) 340 int xoff, xlim, yoff, ylim, minimal; 341 { 342 int * const xv = xvec; /* Help the compiler. */ 343 int * const yv = yvec; 344 345 /* Slide down the bottom initial diagonal. */ 346 while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) 347 ++xoff, ++yoff; 348 /* Slide up the top initial diagonal. */ 349 while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) 350 --xlim, --ylim; 351 352 /* Handle simple cases. */ 353 if (xoff == xlim) 354 while (yoff < ylim) 355 files[1].changed_flag[files[1].realindexes[yoff++]] = 1; 356 else if (yoff == ylim) 357 while (xoff < xlim) 358 files[0].changed_flag[files[0].realindexes[xoff++]] = 1; 359 else 360 { 361 int c; 362 struct partition part; 363 364 /* Find a point of correspondence in the middle of the files. */ 365 366 c = diag (xoff, xlim, yoff, ylim, minimal, &part); 367 368 if (c == 1) 369 { 370 /* This should be impossible, because it implies that 371 one of the two subsequences is empty, 372 and that case was handled above without calling `diag'. 373 Let's verify that this is true. */ 374 abort (); 375 #if 0 376 /* The two subsequences differ by a single insert or delete; 377 record it and we are done. */ 378 if (part.xmid - part.ymid < xoff - yoff) 379 files[1].changed_flag[files[1].realindexes[part.ymid - 1]] = 1; 380 else 381 files[0].changed_flag[files[0].realindexes[part.xmid]] = 1; 382 #endif 383 } 384 else 385 { 386 /* Use the partitions to split this problem into subproblems. */ 387 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal); 388 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal); 389 } 390 } 391 } 392 393 /* Discard lines from one file that have no matches in the other file. 395 396 A line which is discarded will not be considered by the actual 397 comparison algorithm; it will be as if that line were not in the file. 398 The file's `realindexes' table maps virtual line numbers 399 (which don't count the discarded lines) into real line numbers; 400 this is how the actual comparison algorithm produces results 401 that are comprehensible when the discarded lines are counted. 402 403 When we discard a line, we also mark it as a deletion or insertion 404 so that it will be printed in the output. */ 405 406 static void 407 discard_confusing_lines (filevec) 408 struct file_data filevec[]; 409 { 410 unsigned int f, i; 411 char *discarded[2]; 412 int *equiv_count[2]; 413 int *p; 414 415 /* Allocate our results. */ 416 p = (int *) xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines) 417 * (2 * sizeof (int))); 418 for (f = 0; f < 2; f++) 419 { 420 filevec[f].undiscarded = p; p += filevec[f].buffered_lines; 421 filevec[f].realindexes = p; p += filevec[f].buffered_lines; 422 } 423 424 /* Set up equiv_count[F][I] as the number of lines in file F 425 that fall in equivalence class I. */ 426 427 p = (int *) xmalloc (filevec[0].equiv_max * (2 * sizeof (int))); 428 equiv_count[0] = p; 429 equiv_count[1] = p + filevec[0].equiv_max; 430 bzero (p, filevec[0].equiv_max * (2 * sizeof (int))); 431 432 for (i = 0; i < filevec[0].buffered_lines; ++i) 433 ++equiv_count[0][filevec[0].equivs[i]]; 434 for (i = 0; i < filevec[1].buffered_lines; ++i) 435 ++equiv_count[1][filevec[1].equivs[i]]; 436 437 /* Set up tables of which lines are going to be discarded. */ 438 439 discarded[0] = xmalloc (sizeof (char) 440 * (filevec[0].buffered_lines 441 + filevec[1].buffered_lines)); 442 discarded[1] = discarded[0] + filevec[0].buffered_lines; 443 bzero (discarded[0], sizeof (char) * (filevec[0].buffered_lines 444 + filevec[1].buffered_lines)); 445 446 /* Mark to be discarded each line that matches no line of the other file. 447 If a line matches many lines, mark it as provisionally discardable. */ 448 449 for (f = 0; f < 2; f++) 450 { 451 unsigned int end = filevec[f].buffered_lines; 452 char *discards = discarded[f]; 453 int *counts = equiv_count[1 - f]; 454 int *equivs = filevec[f].equivs; 455 unsigned int many = 5; 456 unsigned int tem = end / 64; 457 458 /* Multiply MANY by approximate square root of number of lines. 459 That is the threshold for provisionally discardable lines. */ 460 while ((tem = tem >> 2) > 0) 461 many *= 2; 462 463 for (i = 0; i < end; i++) 464 { 465 int nmatch; 466 if (equivs[i] == 0) 467 continue; 468 nmatch = counts[equivs[i]]; 469 if (nmatch == 0) 470 discards[i] = 1; 471 else if (nmatch > many) 472 discards[i] = 2; 473 } 474 } 475 476 /* Don't really discard the provisional lines except when they occur 477 in a run of discardables, with nonprovisionals at the beginning 478 and end. */ 479 480 for (f = 0; f < 2; f++) 481 { 482 unsigned int end = filevec[f].buffered_lines; 483 register char *discards = discarded[f]; 484 485 for (i = 0; i < end; i++) 486 { 487 /* Cancel provisional discards not in middle of run of discards. */ 488 if (discards[i] == 2) 489 discards[i] = 0; 490 else if (discards[i] != 0) 491 { 492 /* We have found a nonprovisional discard. */ 493 register int j; 494 unsigned int length; 495 unsigned int provisional = 0; 496 497 /* Find end of this run of discardable lines. 498 Count how many are provisionally discardable. */ 499 for (j = i; j < end; j++) 500 { 501 if (discards[j] == 0) 502 break; 503 if (discards[j] == 2) 504 ++provisional; 505 } 506 507 /* Cancel provisional discards at end, and shrink the run. */ 508 while (j > i && discards[j - 1] == 2) 509 discards[--j] = 0, --provisional; 510 511 /* Now we have the length of a run of discardable lines 512 whose first and last are not provisional. */ 513 length = j - i; 514 515 /* If 1/4 of the lines in the run are provisional, 516 cancel discarding of all provisional lines in the run. */ 517 if (provisional * 4 > length) 518 { 519 while (j > i) 520 if (discards[--j] == 2) 521 discards[j] = 0; 522 } 523 else 524 { 525 register unsigned int consec; 526 unsigned int minimum = 1; 527 unsigned int tem = length / 4; 528 529 /* MINIMUM is approximate square root of LENGTH/4. 530 A subrun of two or more provisionals can stand 531 when LENGTH is at least 16. 532 A subrun of 4 or more can stand when LENGTH >= 64. */ 533 while ((tem = tem >> 2) > 0) 534 minimum *= 2; 535 minimum++; 536 537 /* Cancel any subrun of MINIMUM or more provisionals 538 within the larger run. */ 539 for (j = 0, consec = 0; j < length; j++) 540 if (discards[i + j] != 2) 541 consec = 0; 542 else if (minimum == ++consec) 543 /* Back up to start of subrun, to cancel it all. */ 544 j -= consec; 545 else if (minimum < consec) 546 discards[i + j] = 0; 547 548 /* Scan from beginning of run 549 until we find 3 or more nonprovisionals in a row 550 or until the first nonprovisional at least 8 lines in. 551 Until that point, cancel any provisionals. */ 552 for (j = 0, consec = 0; j < length; j++) 553 { 554 if (j >= 8 && discards[i + j] == 1) 555 break; 556 if (discards[i + j] == 2) 557 consec = 0, discards[i + j] = 0; 558 else if (discards[i + j] == 0) 559 consec = 0; 560 else 561 consec++; 562 if (consec == 3) 563 break; 564 } 565 566 /* I advances to the last line of the run. */ 567 i += length - 1; 568 569 /* Same thing, from end. */ 570 for (j = 0, consec = 0; j < length; j++) 571 { 572 if (j >= 8 && discards[i - j] == 1) 573 break; 574 if (discards[i - j] == 2) 575 consec = 0, discards[i - j] = 0; 576 else if (discards[i - j] == 0) 577 consec = 0; 578 else 579 consec++; 580 if (consec == 3) 581 break; 582 } 583 } 584 } 585 } 586 } 587 588 /* Actually discard the lines. */ 589 for (f = 0; f < 2; f++) 590 { 591 char *discards = discarded[f]; 592 unsigned int end = filevec[f].buffered_lines; 593 unsigned int j = 0; 594 for (i = 0; i < end; ++i) 595 if (no_discards || discards[i] == 0) 596 { 597 filevec[f].undiscarded[j] = filevec[f].equivs[i]; 598 filevec[f].realindexes[j++] = i; 599 } 600 else 601 filevec[f].changed_flag[i] = 1; 602 filevec[f].nondiscarded_lines = j; 603 } 604 605 free (discarded[0]); 606 free (equiv_count[0]); 607 } 608 609 /* Adjust inserts/deletes of identical lines to join changes 611 as much as possible. 612 613 We do something when a run of changed lines include a 614 line at one end and have an excluded, identical line at the other. 615 We are free to choose which identical line is included. 616 `compareseq' usually chooses the one at the beginning, 617 but usually it is cleaner to consider the following identical line 618 to be the "change". */ 619 620 int inhibit; 621 622 static void 623 shift_boundaries (filevec) 624 struct file_data filevec[]; 625 { 626 int f; 627 628 if (inhibit) 629 return; 630 631 for (f = 0; f < 2; f++) 632 { 633 char *changed = filevec[f].changed_flag; 634 char const *other_changed = filevec[1-f].changed_flag; 635 int const *equivs = filevec[f].equivs; 636 int i = 0; 637 int j = 0; 638 int i_end = filevec[f].buffered_lines; 639 640 while (1) 641 { 642 int runlength, start, corresponding; 643 644 /* Scan forwards to find beginning of another run of changes. 645 Also keep track of the corresponding point in the other file. */ 646 647 while (i < i_end && changed[i] == 0) 648 { 649 while (other_changed[j++]) 650 continue; 651 i++; 652 } 653 654 if (i == i_end) 655 break; 656 657 start = i; 658 659 /* Find the end of this run of changes. */ 660 661 while (changed[++i]) 662 continue; 663 while (other_changed[j]) 664 j++; 665 666 do 667 { 668 /* Record the length of this run of changes, so that 669 we can later determine whether the run has grown. */ 670 runlength = i - start; 671 672 /* Move the changed region back, so long as the 673 previous unchanged line matches the last changed one. 674 This merges with previous changed regions. */ 675 676 while (start && equivs[start - 1] == equivs[i - 1]) 677 { 678 changed[--start] = 1; 679 changed[--i] = 0; 680 while (changed[start - 1]) 681 start--; 682 while (other_changed[--j]) 683 continue; 684 } 685 686 /* Set CORRESPONDING to the end of the changed run, at the last 687 point where it corresponds to a changed run in the other file. 688 CORRESPONDING == I_END means no such point has been found. */ 689 corresponding = other_changed[j - 1] ? i : i_end; 690 691 /* Move the changed region forward, so long as the 692 first changed line matches the following unchanged one. 693 This merges with following changed regions. 694 Do this second, so that if there are no merges, 695 the changed region is moved forward as far as possible. */ 696 697 while (i != i_end && equivs[start] == equivs[i]) 698 { 699 changed[start++] = 0; 700 changed[i++] = 1; 701 while (changed[i]) 702 i++; 703 while (other_changed[++j]) 704 corresponding = i; 705 } 706 } 707 while (runlength != i - start); 708 709 /* If possible, move the fully-merged run of changes 710 back to a corresponding run in the other file. */ 711 712 while (corresponding < i) 713 { 714 changed[--start] = 1; 715 changed[--i] = 0; 716 while (other_changed[--j]) 717 continue; 718 } 719 } 720 } 721 } 722 723 /* Cons an additional entry onto the front of an edit script OLD. 725 LINE0 and LINE1 are the first affected lines in the two files (origin 0). 726 DELETED is the number of lines deleted here from file 0. 727 INSERTED is the number of lines inserted here in file 1. 728 729 If DELETED is 0 then LINE0 is the number of the line before 730 which the insertion was done; vice versa for INSERTED and LINE1. */ 731 732 static struct change * 733 add_change (line0, line1, deleted, inserted, old) 734 int line0, line1, deleted, inserted; 735 struct change *old; 736 { 737 struct change *new = (struct change *) xmalloc (sizeof (struct change)); 738 739 new->line0 = line0; 740 new->line1 = line1; 741 new->inserted = inserted; 742 new->deleted = deleted; 743 new->link = old; 744 return new; 745 } 746 747 /* Scan the tables of which lines are inserted and deleted, 748 producing an edit script in reverse order. */ 749 750 static struct change * 751 build_reverse_script (filevec) 752 struct file_data const filevec[]; 753 { 754 struct change *script = 0; 755 char *changed0 = filevec[0].changed_flag; 756 char *changed1 = filevec[1].changed_flag; 757 int len0 = filevec[0].buffered_lines; 758 int len1 = filevec[1].buffered_lines; 759 760 /* Note that changedN[len0] does exist, and contains 0. */ 761 762 int i0 = 0, i1 = 0; 763 764 while (i0 < len0 || i1 < len1) 765 { 766 if (changed0[i0] || changed1[i1]) 767 { 768 int line0 = i0, line1 = i1; 769 770 /* Find # lines changed here in each file. */ 771 while (changed0[i0]) ++i0; 772 while (changed1[i1]) ++i1; 773 774 /* Record this change. */ 775 script = add_change (line0, line1, i0 - line0, i1 - line1, script); 776 } 777 778 /* We have reached lines in the two files that match each other. */ 779 i0++, i1++; 780 } 781 782 return script; 783 } 784 785 /* Scan the tables of which lines are inserted and deleted, 786 producing an edit script in forward order. */ 787 788 static struct change * 789 build_script (filevec) 790 struct file_data const filevec[]; 791 { 792 struct change *script = 0; 793 char *changed0 = filevec[0].changed_flag; 794 char *changed1 = filevec[1].changed_flag; 795 int i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines; 796 797 /* Note that changedN[-1] does exist, and contains 0. */ 798 799 while (i0 >= 0 || i1 >= 0) 800 { 801 if (changed0[i0 - 1] || changed1[i1 - 1]) 802 { 803 int line0 = i0, line1 = i1; 804 805 /* Find # lines changed here in each file. */ 806 while (changed0[i0 - 1]) --i0; 807 while (changed1[i1 - 1]) --i1; 808 809 /* Record this change. */ 810 script = add_change (i0, i1, line0 - i0, line1 - i1, script); 811 } 812 813 /* We have reached lines in the two files that match each other. */ 814 i0--, i1--; 815 } 816 817 return script; 818 } 819 820 /* If CHANGES, briefly report that two files differed. */ 822 static void 823 briefly_report (changes, filevec) 824 int changes; 825 struct file_data const filevec[]; 826 { 827 if (changes) 828 message (no_details_flag ? "Files %s and %s differ\n" 829 : "Binary files %s and %s differ\n", 830 filevec[0].name, filevec[1].name); 831 } 832 833 /* Report the differences of two files. DEPTH is the current directory 834 depth. */ 835 int 836 diff_2_files (filevec, depth) 837 struct file_data filevec[]; 838 int depth; 839 { 840 int diags; 841 int i; 842 struct change *e, *p; 843 struct change *script; 844 int changes; 845 846 847 /* If we have detected that either file is binary, 848 compare the two files as binary. This can happen 849 only when the first chunk is read. 850 Also, --brief without any --ignore-* options means 851 we can speed things up by treating the files as binary. */ 852 853 if (read_files (filevec, no_details_flag & ~ignore_some_changes)) 854 { 855 /* Files with different lengths must be different. */ 856 if (filevec[0].stat.st_size != filevec[1].stat.st_size 857 && (filevec[0].desc < 0 || S_ISREG (filevec[0].stat.st_mode)) 858 && (filevec[1].desc < 0 || S_ISREG (filevec[1].stat.st_mode))) 859 changes = 1; 860 861 /* Standard input equals itself. */ 862 else if (filevec[0].desc == filevec[1].desc) 863 changes = 0; 864 865 else 866 /* Scan both files, a buffer at a time, looking for a difference. */ 867 { 868 /* Allocate same-sized buffers for both files. */ 869 size_t buffer_size = buffer_lcm (STAT_BLOCKSIZE (filevec[0].stat), 870 STAT_BLOCKSIZE (filevec[1].stat)); 871 for (i = 0; i < 2; i++) 872 filevec[i].buffer = xrealloc (filevec[i].buffer, buffer_size); 873 874 for (;; filevec[0].buffered_chars = filevec[1].buffered_chars = 0) 875 { 876 /* Read a buffer's worth from both files. */ 877 for (i = 0; i < 2; i++) 878 if (0 <= filevec[i].desc) 879 while (filevec[i].buffered_chars != buffer_size) 880 { 881 int r = read (filevec[i].desc, 882 filevec[i].buffer 883 + filevec[i].buffered_chars, 884 buffer_size - filevec[i].buffered_chars); 885 if (r == 0) 886 break; 887 if (r < 0) 888 pfatal_with_name (filevec[i].name); 889 filevec[i].buffered_chars += r; 890 } 891 892 /* If the buffers differ, the files differ. */ 893 if (filevec[0].buffered_chars != filevec[1].buffered_chars 894 || (filevec[0].buffered_chars != 0 895 && memcmp (filevec[0].buffer, 896 filevec[1].buffer, 897 filevec[0].buffered_chars) != 0)) 898 { 899 changes = 1; 900 break; 901 } 902 903 /* If we reach end of file, the files are the same. */ 904 if (filevec[0].buffered_chars != buffer_size) 905 { 906 changes = 0; 907 break; 908 } 909 } 910 } 911 912 briefly_report (changes, filevec); 913 } 914 else 915 { 916 /* Allocate vectors for the results of comparison: 917 a flag for each line of each file, saying whether that line 918 is an insertion or deletion. 919 Allocate an extra element, always zero, at each end of each vector. */ 920 921 size_t s = filevec[0].buffered_lines + filevec[1].buffered_lines + 4; 922 filevec[0].changed_flag = xmalloc (s); 923 bzero (filevec[0].changed_flag, s); 924 filevec[0].changed_flag++; 925 filevec[1].changed_flag = filevec[0].changed_flag 926 + filevec[0].buffered_lines + 2; 927 928 /* Some lines are obviously insertions or deletions 929 because they don't match anything. Detect them now, and 930 avoid even thinking about them in the main comparison algorithm. */ 931 932 discard_confusing_lines (filevec); 933 934 /* Now do the main comparison algorithm, considering just the 935 undiscarded lines. */ 936 937 xvec = filevec[0].undiscarded; 938 yvec = filevec[1].undiscarded; 939 diags = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3; 940 fdiag = (int *) xmalloc (diags * (2 * sizeof (int))); 941 bdiag = fdiag + diags; 942 fdiag += filevec[1].nondiscarded_lines + 1; 943 bdiag += filevec[1].nondiscarded_lines + 1; 944 945 /* Set TOO_EXPENSIVE to be approximate square root of input size, 946 bounded below by 256. */ 947 too_expensive = 1; 948 for (i = filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines; 949 i != 0; i >>= 2) 950 too_expensive <<= 1; 951 too_expensive = max (256, too_expensive); 952 953 files[0] = filevec[0]; 954 files[1] = filevec[1]; 955 956 compareseq (0, filevec[0].nondiscarded_lines, 957 0, filevec[1].nondiscarded_lines, no_discards); 958 959 free (fdiag - (filevec[1].nondiscarded_lines + 1)); 960 961 /* Modify the results slightly to make them prettier 962 in cases where that can validly be done. */ 963 964 shift_boundaries (filevec); 965 966 /* Get the results of comparison in the form of a chain 967 of `struct change's -- an edit script. */ 968 969 if (output_style == OUTPUT_ED) 970 script = build_reverse_script (filevec); 971 else 972 script = build_script (filevec); 973 974 /* Set CHANGES if we had any diffs. 975 If some changes are ignored, we must scan the script to decide. */ 976 if (ignore_blank_lines_flag || ignore_regexp_list) 977 { 978 struct change *next = script; 979 changes = 0; 980 981 while (next && changes == 0) 982 { 983 struct change *this, *end; 984 int first0, last0, first1, last1, deletes, inserts; 985 986 /* Find a set of changes that belong together. */ 987 this = next; 988 end = find_change (next); 989 990 /* Disconnect them from the rest of the changes, making them 991 a hunk, and remember the rest for next iteration. */ 992 next = end->link; 993 end->link = 0; 994 995 /* Determine whether this hunk is really a difference. */ 996 analyze_hunk (this, &first0, &last0, &first1, &last1, 997 &deletes, &inserts); 998 999 /* Reconnect the script so it will all be freed properly. */ 1000 end->link = next; 1001 1002 if (deletes || inserts) 1003 changes = 1; 1004 } 1005 } 1006 else 1007 changes = (script != 0); 1008 1009 if (no_details_flag) 1010 briefly_report (changes, filevec); 1011 else 1012 { 1013 if (changes || ! no_diff_means_no_output) 1014 { 1015 /* Record info for starting up output, 1016 to be used if and when we have some output to print. */ 1017 setup_output (files[0].name, files[1].name, depth); 1018 1019 switch (output_style) 1020 { 1021 case OUTPUT_CONTEXT: 1022 print_context_script (script, 0); 1023 break; 1024 1025 case OUTPUT_UNIFIED: 1026 print_context_script (script, 1); 1027 break; 1028 1029 case OUTPUT_ED: 1030 print_ed_script (script); 1031 break; 1032 1033 case OUTPUT_FORWARD_ED: 1034 pr_forward_ed_script (script); 1035 break; 1036 1037 case OUTPUT_RCS: 1038 print_rcs_script (script); 1039 break; 1040 1041 case OUTPUT_NORMAL: 1042 print_normal_script (script); 1043 break; 1044 1045 case OUTPUT_IFDEF: 1046 print_ifdef_script (script); 1047 break; 1048 1049 case OUTPUT_SDIFF: 1050 print_sdiff_script (script); 1051 } 1052 1053 finish_output (); 1054 } 1055 } 1056 1057 free (filevec[0].undiscarded); 1058 1059 free (filevec[0].changed_flag - 1); 1060 1061 for (i = 1; i >= 0; --i) 1062 free (filevec[i].equivs); 1063 1064 for (i = 0; i < 2; ++i) 1065 free (filevec[i].linbuf + filevec[i].linbuf_base); 1066 1067 for (e = script; e; e = p) 1068 { 1069 p = e->link; 1070 free (e); 1071 } 1072 1073 if (! ROBUST_OUTPUT_STYLE (output_style)) 1074 for (i = 0; i < 2; ++i) 1075 if (filevec[i].missing_newline) 1076 { 1077 diff_error ("No newline at end of file %s", filevec[i].name, ""); 1078 changes = 2; 1079 } 1080 } 1081 1082 if (filevec[0].buffer != filevec[1].buffer) 1083 free (filevec[0].buffer); 1084 free (filevec[1].buffer); 1085 1086 return changes; 1087 } 1088