Home | History | Annotate | Line # | Download | only in libgettextpo
      1 /* Functions to make fuzzy comparisons between strings
      2    Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006 Free Software Foundation, Inc.
      3 
      4    This program is free software; you can redistribute it and/or modify
      5    it under the terms of the GNU General Public License as published by
      6    the Free Software Foundation; either version 2 of the License, or (at
      7    your option) any later version.
      8 
      9    This program is distributed in the hope that it will be useful, but
     10    WITHOUT ANY WARRANTY; without even the implied warranty of
     11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     12    General Public License for more details.
     13 
     14    You should have received a copy of the GNU General Public License
     15    along with this program; if not, write to the Free Software
     16    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
     17 
     18 
     19    Derived from GNU diff 2.7, analyze.c et al.
     20 
     21    The basic idea is to consider two strings as similar if, when
     22    transforming the first string into the second string through a
     23    sequence of edits (inserts and deletes of one character each),
     24    this sequence is short - or equivalently, if the ordered list
     25    of characters that are untouched by these edits is long.  For a
     26    good introduction to the subject, read about the "Levenshtein
     27    distance" in Wikipedia.
     28 
     29    The basic algorithm is described in:
     30    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
     31    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
     32    see especially section 4.2, which describes the variation used below.
     33 
     34    The basic algorithm was independently discovered as described in:
     35    "Algorithms for Approximate String Matching", E. Ukkonen,
     36    Information and Control Vol. 64, 1985, pp. 100-118.
     37 
     38    Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE
     39    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
     40    at the price of producing suboptimal output for large inputs with
     41    many differences.
     42 
     43    Modified to work on strings rather than files
     44    by Peter Miller <pmiller (at) agso.gov.au>, October 1995 */
     45 
     46 #include <config.h>
     47 
     48 /* Specification.  */
     49 #include "fstrcmp.h"
     50 
     51 #include <string.h>
     52 #include <stdio.h>
     53 #include <stdlib.h>
     54 #include <limits.h>
     55 
     56 #include "lock.h"
     57 #include "tls.h"
     58 #include "xalloc.h"
     59 
     60 #ifndef uintptr_t
     61 # define uintptr_t unsigned long
     62 #endif
     63 
     64 
     65 /*
     66  * Context of comparison operation.
     67  */
     68 struct context
     69 {
     70   /*
     71    * Data on one input string being compared.
     72    */
     73   struct string_data
     74   {
     75     /* The string to be compared. */
     76     const char *data;
     77 
     78     /* The length of the string to be compared. */
     79     int data_length;
     80 
     81     /* The number of characters inserted or deleted. */
     82     int edit_count;
     83   }
     84   string[2];
     85 
     86   #ifdef MINUS_H_FLAG
     87 
     88   /* This corresponds to the diff -H flag.  With this heuristic, for
     89      strings with a constant small density of changes, the algorithm is
     90      linear in the strings size.  This is unlikely in typical uses of
     91      fstrcmp, and so is usually compiled out.  Besides, there is no
     92      interface to set it true.  */
     93   int heuristic;
     94 
     95   #endif
     96 
     97   /* Vector, indexed by diagonal, containing 1 + the X coordinate of the
     98      point furthest along the given diagonal in the forward search of the
     99      edit matrix.  */
    100   int *fdiag;
    101 
    102   /* Vector, indexed by diagonal, containing the X coordinate of the point
    103      furthest along the given diagonal in the backward search of the edit
    104      matrix.  */
    105   int *bdiag;
    106 
    107   /* Edit scripts longer than this are too expensive to compute.  */
    108   int too_expensive;
    109 
    110   /* Snakes bigger than this are considered `big'.  */
    111   #define SNAKE_LIMIT	20
    112 };
    113 
    114 struct partition
    115 {
    116   /* Midpoints of this partition.  */
    117   int xmid, ymid;
    118 
    119   /* Nonzero if low half will be analyzed minimally.  */
    120   int lo_minimal;
    121 
    122   /* Likewise for high half.  */
    123   int hi_minimal;
    124 };
    125 
    126 
    127 /* NAME
    128 	diag - find diagonal path
    129 
    130    SYNOPSIS
    131 	int diag(int xoff, int xlim, int yoff, int ylim, int minimal,
    132 		 struct partition *part, struct context *ctxt);
    133 
    134    DESCRIPTION
    135 	Find the midpoint of the shortest edit script for a specified
    136 	portion of the two strings.
    137 
    138 	Scan from the beginnings of the strings, and simultaneously from
    139 	the ends, doing a breadth-first search through the space of
    140 	edit-sequence.  When the two searches meet, we have found the
    141 	midpoint of the shortest edit sequence.
    142 
    143 	If MINIMAL is nonzero, find the minimal edit script regardless
    144 	of expense.  Otherwise, if the search is too expensive, use
    145 	heuristics to stop the search and report a suboptimal answer.
    146 
    147    RETURNS
    148 	Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
    149 	number XMID - YMID equals the number of inserted characters
    150 	minus the number of deleted characters (counting only characters
    151 	before the midpoint).  Return the approximate edit cost; this is
    152 	the total number of characters inserted or deleted (counting
    153 	only characters before the midpoint), unless a heuristic is used
    154 	to terminate the search prematurely.
    155 
    156 	Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
    157 	for the left half of the partition is known; similarly for
    158 	PART->RIGHT_MINIMAL.
    159 
    160    CAVEAT
    161 	This function assumes that the first characters of the specified
    162 	portions of the two strings do not match, and likewise that the
    163 	last characters do not match.  The caller must trim matching
    164 	characters from the beginning and end of the portions it is
    165 	going to specify.
    166 
    167 	If we return the "wrong" partitions, the worst this can do is
    168 	cause suboptimal diff output.  It cannot cause incorrect diff
    169 	output.  */
    170 
    171 static int
    172 diag (int xoff, int xlim, int yoff, int ylim, int minimal,
    173       struct partition *part, struct context *ctxt)
    174 {
    175   int *const fd = ctxt->fdiag;	/* Give the compiler a chance. */
    176   int *const bd = ctxt->bdiag;	/* Additional help for the compiler. */
    177   const char *const xv = ctxt->string[0].data;	/* Still more help for the compiler. */
    178   const char *const yv = ctxt->string[1].data;	/* And more and more . . . */
    179   const int dmin = xoff - ylim;	/* Minimum valid diagonal. */
    180   const int dmax = xlim - yoff;	/* Maximum valid diagonal. */
    181   const int fmid = xoff - yoff;	/* Center diagonal of top-down search. */
    182   const int bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
    183   int fmin = fmid;
    184   int fmax = fmid;		/* Limits of top-down search. */
    185   int bmin = bmid;
    186   int bmax = bmid;		/* Limits of bottom-up search. */
    187   int c;			/* Cost. */
    188   int odd = (fmid - bmid) & 1;
    189 
    190   /*
    191    * True if southeast corner is on an odd diagonal with respect
    192    * to the northwest.
    193    */
    194   fd[fmid] = xoff;
    195   bd[bmid] = xlim;
    196   for (c = 1;; ++c)
    197     {
    198       int d;			/* Active diagonal. */
    199       int big_snake;
    200 
    201       big_snake = 0;
    202       /* Extend the top-down search by an edit step in each diagonal. */
    203       if (fmin > dmin)
    204 	fd[--fmin - 1] = -1;
    205       else
    206 	++fmin;
    207       if (fmax < dmax)
    208 	fd[++fmax + 1] = -1;
    209       else
    210 	--fmax;
    211       for (d = fmax; d >= fmin; d -= 2)
    212 	{
    213 	  int x;
    214 	  int y;
    215 	  int oldx;
    216 	  int tlo;
    217 	  int thi;
    218 
    219 	  tlo = fd[d - 1],
    220 	    thi = fd[d + 1];
    221 
    222 	  if (tlo >= thi)
    223 	    x = tlo + 1;
    224 	  else
    225 	    x = thi;
    226 	  oldx = x;
    227 	  y = x - d;
    228 	  while (x < xlim && y < ylim && xv[x] == yv[y])
    229 	    {
    230 	      ++x;
    231 	      ++y;
    232 	    }
    233 	  if (x - oldx > SNAKE_LIMIT)
    234 	    big_snake = 1;
    235 	  fd[d] = x;
    236 	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
    237 	    {
    238 	      part->xmid = x;
    239 	      part->ymid = y;
    240 	      part->lo_minimal = part->hi_minimal = 1;
    241 	      return 2 * c - 1;
    242 	    }
    243 	}
    244       /* Similarly extend the bottom-up search.  */
    245       if (bmin > dmin)
    246 	bd[--bmin - 1] = INT_MAX;
    247       else
    248 	++bmin;
    249       if (bmax < dmax)
    250 	bd[++bmax + 1] = INT_MAX;
    251       else
    252 	--bmax;
    253       for (d = bmax; d >= bmin; d -= 2)
    254 	{
    255 	  int x;
    256 	  int y;
    257 	  int oldx;
    258 	  int tlo;
    259 	  int thi;
    260 
    261 	  tlo = bd[d - 1],
    262 	    thi = bd[d + 1];
    263 	  if (tlo < thi)
    264 	    x = tlo;
    265 	  else
    266 	    x = thi - 1;
    267 	  oldx = x;
    268 	  y = x - d;
    269 	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
    270 	    {
    271 	      --x;
    272 	      --y;
    273 	    }
    274 	  if (oldx - x > SNAKE_LIMIT)
    275 	    big_snake = 1;
    276 	  bd[d] = x;
    277 	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
    278 	    {
    279 	      part->xmid = x;
    280 	      part->ymid = y;
    281 	      part->lo_minimal = part->hi_minimal = 1;
    282 	      return 2 * c;
    283 	    }
    284 	}
    285 
    286       if (minimal)
    287 	continue;
    288 
    289 #ifdef MINUS_H_FLAG
    290       /* Heuristic: check occasionally for a diagonal that has made lots
    291          of progress compared with the edit distance.  If we have any
    292          such, find the one that has made the most progress and return
    293          it as if it had succeeded.
    294 
    295          With this heuristic, for strings with a constant small density
    296          of changes, the algorithm is linear in the strings size.  */
    297       if (c > 200 && big_snake && ctxt->heuristic)
    298 	{
    299 	  int best;
    300 
    301 	  best = 0;
    302 	  for (d = fmax; d >= fmin; d -= 2)
    303 	    {
    304 	      int dd;
    305 	      int x;
    306 	      int y;
    307 	      int v;
    308 
    309 	      dd = d - fmid;
    310 	      x = fd[d];
    311 	      y = x - d;
    312 	      v = (x - xoff) * 2 - dd;
    313 
    314 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
    315 		{
    316 		  if
    317 		    (
    318 		      v > best
    319 		      &&
    320 		      xoff + SNAKE_LIMIT <= x
    321 		      &&
    322 		      x < xlim
    323 		      &&
    324 		      yoff + SNAKE_LIMIT <= y
    325 		      &&
    326 		      y < ylim
    327 		    )
    328 		    {
    329 		      /* We have a good enough best diagonal; now insist
    330 			 that it end with a significant snake.  */
    331 		      int k;
    332 
    333 		      for (k = 1; xv[x - k] == yv[y - k]; k++)
    334 			{
    335 			  if (k == SNAKE_LIMIT)
    336 			    {
    337 			      best = v;
    338 			      part->xmid = x;
    339 			      part->ymid = y;
    340 			      break;
    341 			    }
    342 			}
    343 		    }
    344 		}
    345 	    }
    346 	  if (best > 0)
    347 	    {
    348 	      part->lo_minimal = 1;
    349 	      part->hi_minimal = 0;
    350 	      return 2 * c - 1;
    351 	    }
    352 	  best = 0;
    353 	  for (d = bmax; d >= bmin; d -= 2)
    354 	    {
    355 	      int dd;
    356 	      int x;
    357 	      int y;
    358 	      int v;
    359 
    360 	      dd = d - bmid;
    361 	      x = bd[d];
    362 	      y = x - d;
    363 	      v = (xlim - x) * 2 + dd;
    364 
    365 	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
    366 		{
    367 		  if (v > best && xoff < x && x <= xlim - SNAKE_LIMIT &&
    368 		      yoff < y && y <= ylim - SNAKE_LIMIT)
    369 		    {
    370 		      /* We have a good enough best diagonal; now insist
    371 			 that it end with a significant snake.  */
    372 		      int k;
    373 
    374 		      for (k = 0; xv[x + k] == yv[y + k]; k++)
    375 			{
    376 			  if (k == SNAKE_LIMIT - 1)
    377 			    {
    378 			      best = v;
    379 			      part->xmid = x;
    380 			      part->ymid = y;
    381 			      break;
    382 			    }
    383 			}
    384 		    }
    385 		}
    386 	    }
    387 	  if (best > 0)
    388 	    {
    389 	      part->lo_minimal = 0;
    390 	      part->hi_minimal = 1;
    391 	      return 2 * c - 1;
    392 	    }
    393 	}
    394 #endif /* MINUS_H_FLAG */
    395 
    396       /* Heuristic: if we've gone well beyond the call of duty, give up
    397 	 and report halfway between our best results so far.  */
    398       if (c >= ctxt->too_expensive)
    399 	{
    400 	  int fxybest;
    401 	  int fxbest;
    402 	  int bxybest;
    403 	  int bxbest;
    404 
    405 	  /* Pacify `gcc -Wall'. */
    406 	  fxbest = 0;
    407 	  bxbest = 0;
    408 
    409 	  /* Find forward diagonal that maximizes X + Y.  */
    410 	  fxybest = -1;
    411 	  for (d = fmax; d >= fmin; d -= 2)
    412 	    {
    413 	      int x;
    414 	      int y;
    415 
    416 	      x = fd[d] < xlim ? fd[d] : xlim;
    417 	      y = x - d;
    418 
    419 	      if (ylim < y)
    420 		{
    421 		  x = ylim + d;
    422 		  y = ylim;
    423 		}
    424 	      if (fxybest < x + y)
    425 		{
    426 		  fxybest = x + y;
    427 		  fxbest = x;
    428 		}
    429 	    }
    430 	  /* Find backward diagonal that minimizes X + Y.  */
    431 	  bxybest = INT_MAX;
    432 	  for (d = bmax; d >= bmin; d -= 2)
    433 	    {
    434 	      int x;
    435 	      int y;
    436 
    437 	      x = xoff > bd[d] ? xoff : bd[d];
    438 	      y = x - d;
    439 
    440 	      if (y < yoff)
    441 		{
    442 		  x = yoff + d;
    443 		  y = yoff;
    444 		}
    445 	      if (x + y < bxybest)
    446 		{
    447 		  bxybest = x + y;
    448 		  bxbest = x;
    449 		}
    450 	    }
    451 	  /* Use the better of the two diagonals.  */
    452 	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
    453 	    {
    454 	      part->xmid = fxbest;
    455 	      part->ymid = fxybest - fxbest;
    456 	      part->lo_minimal = 1;
    457 	      part->hi_minimal = 0;
    458 	    }
    459 	  else
    460 	    {
    461 	      part->xmid = bxbest;
    462 	      part->ymid = bxybest - bxbest;
    463 	      part->lo_minimal = 0;
    464 	      part->hi_minimal = 1;
    465 	    }
    466 	  return 2 * c - 1;
    467 	}
    468     }
    469 }
    470 
    471 
    472 /* NAME
    473 	compareseq - find edit sequence
    474 
    475    SYNOPSIS
    476 	void compareseq(int xoff, int xlim, int yoff, int ylim, int minimal,
    477 			struct context *ctxt);
    478 
    479    DESCRIPTION
    480 	Compare in detail contiguous subsequences of the two strings
    481 	which are known, as a whole, to match each other.
    482 
    483 	The subsequence of string 0 is [XOFF, XLIM) and likewise for
    484 	string 1.
    485 
    486 	Note that XLIM, YLIM are exclusive bounds.  All character
    487 	numbers are origin-0.
    488 
    489 	If MINIMAL is nonzero, find a minimal difference no matter how
    490 	expensive it is.  */
    491 
    492 static void
    493 compareseq (int xoff, int xlim, int yoff, int ylim, int minimal,
    494 	    struct context *ctxt)
    495 {
    496   const char *const xv = ctxt->string[0].data;	/* Help the compiler.  */
    497   const char *const yv = ctxt->string[1].data;
    498 
    499   /* Slide down the bottom initial diagonal. */
    500   while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
    501     {
    502       ++xoff;
    503       ++yoff;
    504     }
    505 
    506   /* Slide up the top initial diagonal. */
    507   while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
    508     {
    509       --xlim;
    510       --ylim;
    511     }
    512 
    513   /* Handle simple cases. */
    514   if (xoff == xlim)
    515     {
    516       while (yoff < ylim)
    517 	{
    518 	  ctxt->string[1].edit_count++;
    519 	  ++yoff;
    520 	}
    521     }
    522   else if (yoff == ylim)
    523     {
    524       while (xoff < xlim)
    525 	{
    526 	  ctxt->string[0].edit_count++;
    527 	  ++xoff;
    528 	}
    529     }
    530   else
    531     {
    532       int c;
    533       struct partition part;
    534 
    535       /* Find a point of correspondence in the middle of the strings.  */
    536       c = diag (xoff, xlim, yoff, ylim, minimal, &part, ctxt);
    537       if (c == 1)
    538 	{
    539 #if 0
    540 	  /* This should be impossible, because it implies that one of
    541 	     the two subsequences is empty, and that case was handled
    542 	     above without calling `diag'.  Let's verify that this is
    543 	     true.  */
    544 	  abort ();
    545 #else
    546 	  /* The two subsequences differ by a single insert or delete;
    547 	     record it and we are done.  */
    548 	  if (part.xmid - part.ymid < xoff - yoff)
    549 	    ctxt->string[1].edit_count++;
    550 	  else
    551 	    ctxt->string[0].edit_count++;
    552 #endif
    553 	}
    554       else
    555 	{
    556 	  /* Use the partitions to split this problem into subproblems.  */
    557 	  compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt);
    558 	  compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt);
    559 	}
    560     }
    561 }
    562 
    563 
    564 /* Because fstrcmp is typically called multiple times, attempt to minimize
    565    the number of memory allocations performed.  Thus, let a call reuse the
    566    memory already allocated by the previous call, if it is sufficient.
    567    To make it multithread-safe, without need for a lock that protects the
    568    already allocated memory, store the allocated memory per thread.  Free
    569    it only when the thread exits.  */
    570 
    571 static gl_tls_key_t buffer_key;	/* TLS key for a 'int *' */
    572 static gl_tls_key_t bufmax_key;	/* TLS key for a 'size_t' */
    573 
    574 static void
    575 keys_init (void)
    576 {
    577   gl_tls_key_init (buffer_key, free);
    578   gl_tls_key_init (bufmax_key, NULL);
    579   /* The per-thread initial values are NULL and 0, respectively.  */
    580 }
    581 
    582 /* Ensure that keys_init is called once only.  */
    583 gl_once_define(static, keys_init_once)
    584 
    585 
    586 /* NAME
    587 	fstrcmp - fuzzy string compare
    588 
    589    SYNOPSIS
    590 	double fstrcmp(const char *, const char *);
    591 
    592    DESCRIPTION
    593 	The fstrcmp function may be used to compare two string for
    594 	similarity.  It is very useful in reducing "cascade" or
    595 	"secondary" errors in compilers or other situations where
    596 	symbol tables occur.
    597 
    598    RETURNS
    599 	double; 0 if the strings are entirly dissimilar, 1 if the
    600 	strings are identical, and a number in between if they are
    601 	similar.  */
    602 
    603 double
    604 fstrcmp (const char *string1, const char *string2)
    605 {
    606   struct context ctxt;
    607   int i;
    608 
    609   size_t fdiag_len;
    610   int *buffer;
    611   size_t bufmax;
    612 
    613   /* set the info for each string.  */
    614   ctxt.string[0].data = string1;
    615   ctxt.string[0].data_length = strlen (string1);
    616   ctxt.string[1].data = string2;
    617   ctxt.string[1].data_length = strlen (string2);
    618 
    619   /* short-circuit obvious comparisons */
    620   if (ctxt.string[0].data_length == 0 && ctxt.string[1].data_length == 0)
    621     return 1.0;
    622   if (ctxt.string[0].data_length == 0 || ctxt.string[1].data_length == 0)
    623     return 0.0;
    624 
    625   /* Set TOO_EXPENSIVE to be approximate square root of input size,
    626      bounded below by 256.  */
    627   ctxt.too_expensive = 1;
    628   for (i = ctxt.string[0].data_length + ctxt.string[1].data_length;
    629        i != 0;
    630        i >>= 2)
    631     ctxt.too_expensive <<= 1;
    632   if (ctxt.too_expensive < 256)
    633     ctxt.too_expensive = 256;
    634 
    635   /* Allocate memory for fdiag and bdiag from a thread-local pool.  */
    636   fdiag_len = ctxt.string[0].data_length + ctxt.string[1].data_length + 3;
    637   gl_once (keys_init_once, keys_init);
    638   buffer = (int *) gl_tls_get (buffer_key);
    639   bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key);
    640   if (fdiag_len > bufmax)
    641     {
    642       /* Need more memory.  */
    643       bufmax = 2 * bufmax;
    644       if (fdiag_len > bufmax)
    645 	bufmax = fdiag_len;
    646       /* Calling xrealloc would be a waste: buffer's contents does not need
    647 	 to be preserved.  */
    648       if (buffer != NULL)
    649 	free (buffer);
    650       buffer = (int *) xmalloc (bufmax * (2 * sizeof (int)));
    651       gl_tls_set (buffer_key, buffer);
    652       gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax);
    653     }
    654   ctxt.fdiag = buffer + ctxt.string[1].data_length + 1;
    655   ctxt.bdiag = ctxt.fdiag + fdiag_len;
    656 
    657   /* Now do the main comparison algorithm */
    658   ctxt.string[0].edit_count = 0;
    659   ctxt.string[1].edit_count = 0;
    660   compareseq (0, ctxt.string[0].data_length, 0, ctxt.string[1].data_length, 0,
    661 	      &ctxt);
    662 
    663   /* The result is
    664 	((number of chars in common) / (average length of the strings)).
    665      This is admittedly biased towards finding that the strings are
    666      similar, however it does produce meaningful results.  */
    667   return ((double) (ctxt.string[0].data_length + ctxt.string[1].data_length
    668 		    - ctxt.string[1].edit_count - ctxt.string[0].edit_count)
    669 	  / (ctxt.string[0].data_length + ctxt.string[1].data_length));
    670 }
    671