Home | History | Annotate | Line # | Download | only in gcc
spellcheck.cc revision 1.1.1.1
      1 /* Find near-matches for strings.
      2    Copyright (C) 2015-2022 Free Software Foundation, Inc.
      3 
      4 This file is part of GCC.
      5 
      6 GCC is free software; you can redistribute it and/or modify it under
      7 the terms of the GNU General Public License as published by the Free
      8 Software Foundation; either version 3, or (at your option) any later
      9 version.
     10 
     11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
     12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
     13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     14 for more details.
     15 
     16 You should have received a copy of the GNU General Public License
     17 along with GCC; see the file COPYING3.  If not see
     18 <http://www.gnu.org/licenses/>.  */
     19 
     20 #include "config.h"
     21 #include "system.h"
     22 #include "coretypes.h"
     23 #include "tm.h"
     24 #include "tree.h"
     25 #include "spellcheck.h"
     26 #include "selftest.h"
     27 
     28 /* Cost of a case transformation.  */
     29 #define CASE_COST 1
     30 
     31 /* Cost of another kind of edit.  */
     32 #define BASE_COST 2
     33 
     34 /* Get the edit distance between the two strings: the minimal
     35    number of edits that are needed to change one string into another,
     36    where edits can be one-character insertions, removals, or substitutions,
     37    or transpositions of two adjacent characters (counting as one "edit").
     38 
     39    This implementation uses a modified variant of the Wagner-Fischer
     40    algorithm for the Damerau-Levenshtein distance; specifically, the
     41    "optimal string alignment distance" or "restricted edit distance"
     42    variant.  This implementation has been further modified to take
     43    case into account.  */
     44 
     45 edit_distance_t
     46 get_edit_distance (const char *s, int len_s,
     47 		   const char *t, int len_t)
     48 {
     49   const bool debug = false;
     50 
     51   if (debug)
     52     {
     53       printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
     54       printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
     55     }
     56 
     57   if (len_s == 0)
     58     return BASE_COST * len_t;
     59   if (len_t == 0)
     60     return BASE_COST * len_s;
     61 
     62   /* We effectively build a matrix where each (i, j) contains the
     63      distance between the prefix strings s[0:j] and t[0:i].
     64      Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
     65      we simply keep track of the last two rows, v_one_ago and v_two_ago,
     66      and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
     67      allocation and memory accesses in favor of three (len_s + 1)
     68      allocations.  These could potentially be
     69      statically-allocated if we impose a maximum length on the
     70      strings of interest.  */
     71   edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
     72   edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
     73   edit_distance_t *v_next = new edit_distance_t[len_s + 1];
     74 
     75   /* The first row is for the case of an empty target string, which
     76      we can reach by deleting every character in the source string.  */
     77   for (int i = 0; i < len_s + 1; i++)
     78     v_one_ago[i] = i * BASE_COST;
     79 
     80   /* Build successive rows.  */
     81   for (int i = 0; i < len_t; i++)
     82     {
     83       if (debug)
     84 	{
     85 	  printf ("i:%i v_one_ago = ", i);
     86 	  for (int j = 0; j < len_s + 1; j++)
     87 	    printf ("%i ", v_one_ago[j]);
     88 	  printf ("\n");
     89 	}
     90 
     91       /* The initial column is for the case of an empty source string; we
     92 	 can reach prefixes of the target string of length i
     93 	 by inserting i characters.  */
     94       v_next[0] = (i + 1) * BASE_COST;
     95 
     96       /* Build the rest of the row by considering neighbors to
     97 	 the north, west and northwest.  */
     98       for (int j = 0; j < len_s; j++)
     99 	{
    100 	  edit_distance_t cost;
    101 
    102 	  if (s[j] == t[i])
    103 	    cost = 0;
    104 	  else if (TOLOWER (s[j]) == TOLOWER (t[i]))
    105 	    cost = CASE_COST;
    106 	  else
    107 	    cost = BASE_COST;
    108 	  edit_distance_t deletion     = v_next[j] + BASE_COST;
    109 	  edit_distance_t insertion    = v_one_ago[j + 1] + BASE_COST;
    110 	  edit_distance_t substitution = v_one_ago[j] + cost;
    111 	  edit_distance_t cheapest = MIN (deletion, insertion);
    112 	  cheapest = MIN (cheapest, substitution);
    113 	  if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
    114 	    {
    115 	      edit_distance_t transposition = v_two_ago[j - 1] + BASE_COST;
    116 	      cheapest = MIN (cheapest, transposition);
    117 	    }
    118 	  v_next[j + 1] = cheapest;
    119 	}
    120 
    121       /* Prepare to move on to next row.  */
    122       for (int j = 0; j < len_s + 1; j++)
    123 	{
    124 	  v_two_ago[j] = v_one_ago[j];
    125 	  v_one_ago[j] = v_next[j];
    126 	}
    127     }
    128 
    129   if (debug)
    130     {
    131       printf ("final v_next = ");
    132       for (int j = 0; j < len_s + 1; j++)
    133 	printf ("%i ", v_next[j]);
    134       printf ("\n");
    135     }
    136 
    137   edit_distance_t result = v_next[len_s];
    138   delete[] v_two_ago;
    139   delete[] v_one_ago;
    140   delete[] v_next;
    141   return result;
    142 }
    143 
    144 /* Get the edit distance between two nil-terminated strings.  */
    145 
    146 edit_distance_t
    147 get_edit_distance (const char *s, const char *t)
    148 {
    149   return get_edit_distance (s, strlen (s), t, strlen (t));
    150 }
    151 
    152 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
    153    an autovec of non-NULL strings, determine which element within
    154    CANDIDATES has the lowest edit distance to TARGET.  If there are
    155    multiple elements with the same minimal distance, the first in the
    156    vector wins.
    157 
    158    If more than half of the letters were misspelled, the suggestion is
    159    likely to be meaningless, so return NULL for this case.  */
    160 
    161 const char *
    162 find_closest_string (const char *target,
    163 		     const auto_vec<const char *> *candidates)
    164 {
    165   gcc_assert (target);
    166   gcc_assert (candidates);
    167 
    168   int i;
    169   const char *candidate;
    170   best_match<const char *, const char *> bm (target);
    171   FOR_EACH_VEC_ELT (*candidates, i, candidate)
    172     {
    173       gcc_assert (candidate);
    174       bm.consider (candidate);
    175     }
    176 
    177   return bm.get_best_meaningful_candidate ();
    178 }
    179 
    180 /* Generate the maximum edit distance for which we consider a suggestion
    181    to be meaningful, given a goal of length GOAL_LEN and a candidate of
    182    length CANDIDATE_LEN.
    183 
    184    This is a third of the length of the candidate or of the goal,
    185    whichever is bigger.  */
    186 
    187 edit_distance_t
    188 get_edit_distance_cutoff (size_t goal_len, size_t candidate_len)
    189 {
    190   size_t max_length = MAX (goal_len, candidate_len);
    191   size_t min_length = MIN (goal_len, candidate_len);
    192 
    193   gcc_assert (max_length >= min_length);
    194 
    195   /* Special case: don't offer suggestions for a pair of
    196      length == 1 strings (or empty strings).  */
    197   if (max_length <= 1)
    198     return 0;
    199 
    200   /* If the lengths are close, then round down.  */
    201   if (max_length - min_length <= 1)
    202     /* ...but allow an edit distance of at least 1.  */
    203     return BASE_COST * MAX (max_length / 3, 1);
    204 
    205   /* Otherwise, round up (thus giving a little extra leeway to some cases
    206      involving insertions/deletions).  */
    207   return BASE_COST * (max_length + 2) / 3;
    208 }
    209 
    210 #if CHECKING_P
    211 
    212 namespace selftest {
    213 
    214 /* Selftests.  */
    215 
    216 /* Verify that get_edit_distance (A, B) equals the expected value.  */
    217 
    218 static void
    219 test_get_edit_distance_one_way (const char *a, const char *b,
    220 				edit_distance_t expected)
    221 {
    222   edit_distance_t actual = get_edit_distance (a, b);
    223   ASSERT_EQ (actual, expected);
    224 }
    225 
    226 /* Verify that both
    227      get_edit_distance (A, B)
    228    and
    229      get_edit_distance (B, A)
    230    equal the expected value, to ensure that the function is symmetric.  */
    231 
    232 static void
    233 test_get_edit_distance_both_ways (const char *a, const char *b,
    234 			     edit_distance_t expected)
    235 {
    236   test_get_edit_distance_one_way (a, b, expected);
    237   test_get_edit_distance_one_way (b, a, expected);
    238 }
    239 
    240 /* Verify get_edit_distance for a variety of pairs of pre-canned
    241    inputs, comparing against known-good values.  */
    242 
    243 static void
    244 test_edit_distances ()
    245 {
    246   test_get_edit_distance_both_ways ("", "nonempty",
    247 				    BASE_COST * strlen ("nonempty"));
    248   test_get_edit_distance_both_ways ("saturday", "sunday",
    249 				    BASE_COST * 3);
    250   test_get_edit_distance_both_ways ("foo", "m_foo", BASE_COST * 2);
    251   test_get_edit_distance_both_ways ("hello_world", "HelloWorld", 4);
    252   test_get_edit_distance_both_ways
    253     ("the quick brown fox jumps over the lazy dog", "dog", BASE_COST * 40);
    254   test_get_edit_distance_both_ways
    255     ("the quick brown fox jumps over the lazy dog",
    256      "the quick brown dog jumps over the lazy fox",
    257      BASE_COST * 4);
    258   test_get_edit_distance_both_ways
    259     ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
    260      "All your base are belong to us",
    261      BASE_COST * 44);
    262   test_get_edit_distance_both_ways ("foo", "FOO", 3);
    263   test_get_edit_distance_both_ways ("fee", "deed", BASE_COST * 2);
    264   test_get_edit_distance_both_ways ("coorzd1", "coordx1", BASE_COST * 2);
    265   test_get_edit_distance_both_ways ("assert", "sqrt", BASE_COST * 3);
    266   test_get_edit_distance_both_ways ("PATH_MAX", "INT8_MAX", BASE_COST * 3);
    267   test_get_edit_distance_both_ways ("time", "nice", BASE_COST * 2);
    268   test_get_edit_distance_both_ways ("bar", "carg", BASE_COST * 2);
    269   test_get_edit_distance_both_ways ("gtk_widget_show_all",
    270 				    "GtkWidgetShowAll", 10);
    271   test_get_edit_distance_both_ways ("m_bar", "bar", BASE_COST * 2);
    272   test_get_edit_distance_both_ways ("MACRO", "MACRAME", BASE_COST * 3);
    273   test_get_edit_distance_both_ways ("ab", "ac", BASE_COST * 1);
    274   test_get_edit_distance_both_ways ("ab", "a", BASE_COST * 1);
    275   test_get_edit_distance_both_ways ("a", "b", BASE_COST * 1);
    276   test_get_edit_distance_both_ways ("nanl", "name", BASE_COST * 2);
    277   test_get_edit_distance_both_ways ("char", "bar", BASE_COST * 2);
    278   test_get_edit_distance_both_ways ("-optimize", "fsanitize", BASE_COST * 5);
    279   test_get_edit_distance_both_ways ("__DATE__", "__i386__", BASE_COST * 4);
    280 
    281   /* Examples where transposition helps.  */
    282   test_get_edit_distance_both_ways ("ab", "ba", BASE_COST * 1);
    283   test_get_edit_distance_both_ways ("ba", "abc", BASE_COST * 2);
    284   test_get_edit_distance_both_ways ("coorzd1", "coordz1", BASE_COST * 1);
    285   test_get_edit_distance_both_ways ("abcdefghijklmnopqrstuvwxyz",
    286 				    "bacdefghijklmnopqrstuvwxzy",
    287 				    BASE_COST * 2);
    288   test_get_edit_distance_both_ways ("saturday", "sundya", BASE_COST * 4);
    289   test_get_edit_distance_both_ways ("signed", "singed", BASE_COST * 1);
    290 }
    291 
    292 /* Subroutine of test_get_edit_distance_cutoff, for emulating the
    293    spellchecking cutoff in up to GCC 8.  */
    294 
    295 static edit_distance_t
    296 get_old_cutoff (size_t goal_len, size_t candidate_len)
    297 {
    298   return BASE_COST * MAX (goal_len, candidate_len) / 2;
    299 }
    300 
    301 /* Verify that the cutoff for "meaningfulness" of suggestions is at least as
    302    conservative as in older GCC releases.
    303 
    304    This should ensure that we don't offer additional meaningless
    305    suggestions (apart from those for which transposition has helped).  */
    306 
    307 static void
    308 test_get_edit_distance_cutoff ()
    309 {
    310   for (size_t goal_len = 0; goal_len < 30; goal_len++)
    311     for (size_t candidate_len = 0; candidate_len < 30; candidate_len++)
    312       ASSERT_TRUE (get_edit_distance_cutoff (goal_len, candidate_len)
    313 		   <= get_old_cutoff (goal_len, candidate_len));
    314 }
    315 
    316 /* Assert that CANDIDATE is offered as a suggestion for TARGET.  */
    317 
    318 static void
    319 assert_suggested_for (const location &loc, const char *candidate,
    320 		      const char *target)
    321 {
    322   auto_vec<const char *> candidates;
    323   candidates.safe_push (candidate);
    324   ASSERT_EQ_AT (loc, candidate, find_closest_string (target, &candidates));
    325 }
    326 
    327 /* Assert that CANDIDATE is offered as a suggestion for TARGET.  */
    328 
    329 #define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET)			\
    330   SELFTEST_BEGIN_STMT							\
    331     assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET);	\
    332   SELFTEST_END_STMT
    333 
    334 /* Assert that CANDIDATE is not offered as a suggestion for TARGET.  */
    335 
    336 static void
    337 assert_not_suggested_for (const location &loc, const char *candidate,
    338 			  const char *target)
    339 {
    340   auto_vec<const char *> candidates;
    341   candidates.safe_push (candidate);
    342   ASSERT_EQ_AT (loc, NULL, find_closest_string (target, &candidates));
    343 }
    344 
    345 /* Assert that CANDIDATE is not offered as a suggestion for TARGET.  */
    346 
    347 #define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET)			\
    348   SELFTEST_BEGIN_STMT							\
    349     assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET);	\
    350   SELFTEST_END_STMT
    351 
    352 /* Verify that we offer varous suggestions that are meaningful,
    353    and that we don't offer various other ones that aren't (PR c/82967).  */
    354 
    355 static void
    356 test_suggestions ()
    357 {
    358   /* Good suggestions.  */
    359 
    360   ASSERT_SUGGESTED_FOR ("m_bar", "bar");
    361   // dist == 2, max_length == 5, min_length == 3
    362 
    363   ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME");
    364   // dist == 3, max_length == 7, min_length == 5
    365 
    366   ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll");
    367   // dist == 7, max_length == 16, min_length = 19
    368 
    369   ASSERT_SUGGESTED_FOR ("ab", "ac");
    370   // dist == 1, max_length == min_length = 2
    371 
    372   ASSERT_SUGGESTED_FOR ("ab", "a");
    373   // dist == 1, max_length == 2, min_length = 1
    374 
    375   /* Bad suggestions.  */
    376 
    377   ASSERT_NOT_SUGGESTED_FOR ("a", "b");
    378   // dist == 1, max_length == min_length = 1
    379 
    380   ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert");
    381   // dist == 3, max_length 6, min_length == 4
    382 
    383   ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX");
    384   // dist == 3, max_length == min_length == 8
    385 
    386   ASSERT_NOT_SUGGESTED_FOR ("nice", "time");
    387   ASSERT_NOT_SUGGESTED_FOR ("nanl", "name");
    388   // dist == 2, max_length == min_length == 4
    389 
    390   ASSERT_NOT_SUGGESTED_FOR ("carg", "bar");
    391   ASSERT_NOT_SUGGESTED_FOR ("char", "bar");
    392   // dist == 2, max_length == 4, min_length == 3
    393 
    394   ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize");
    395   // dist == 5, max_length == min_length == 9
    396 
    397   ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__");
    398   // dist == 4, max_length == min_length == 8
    399 
    400   ASSERT_NOT_SUGGESTED_FOR ("start_input_device", "InputDevice");
    401   // dist == 9, max_length == 18, min_length == 11
    402 }
    403 
    404 /* Verify that find_closest_string is sane.  */
    405 
    406 static void
    407 test_find_closest_string ()
    408 {
    409   auto_vec<const char *> candidates;
    410 
    411   /* Verify that it can handle an empty vec.  */
    412   ASSERT_EQ (NULL, find_closest_string ("", &candidates));
    413 
    414   /* Verify that it works sanely for non-empty vecs.  */
    415   candidates.safe_push ("apple");
    416   candidates.safe_push ("banana");
    417   candidates.safe_push ("cherry");
    418 
    419   ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
    420   ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
    421   ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
    422   ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
    423 
    424   /* The order of the vec can matter, but it should not matter for these
    425      inputs.  */
    426   candidates.truncate (0);
    427   candidates.safe_push ("cherry");
    428   candidates.safe_push ("banana");
    429   candidates.safe_push ("apple");
    430   ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
    431   ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
    432   ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
    433   ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
    434 
    435   /* If the goal string somehow makes it into the candidate list, offering
    436      it as a suggestion will be nonsensical.  Verify that we don't offer such
    437      suggestions.  */
    438   ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
    439 
    440   /* Example from PR 69968 where transposition helps.  */
    441   candidates.truncate (0);
    442   candidates.safe_push("coordx");
    443   candidates.safe_push("coordy");
    444   candidates.safe_push("coordz");
    445   candidates.safe_push("coordx1");
    446   candidates.safe_push("coordy1");
    447   candidates.safe_push("coordz1");
    448   ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
    449 
    450   candidates.truncate (0);
    451   candidates.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
    452   candidates.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
    453   candidates.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
    454   ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
    455 		find_closest_string ("DWARF_GNAT_ENCODINGS_all",
    456 				     &candidates));
    457 
    458   /* The same as the previous test, but with a different order of
    459      candidates.  */
    460   candidates.truncate (0);
    461   candidates.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
    462   candidates.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
    463   candidates.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
    464   ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
    465 		find_closest_string ("DWARF_GNAT_ENCODINGS_all",
    466 				     &candidates));
    467 }
    468 
    469 /* Test data for test_metric_conditions.  */
    470 
    471 static const char * const test_data[] = {
    472   "",
    473   "foo",
    474   "food",
    475   "boo",
    476   "1234567890123456789012345678901234567890123456789012345678901234567890",
    477   "abc",
    478   "ac",
    479   "ca",
    480 };
    481 
    482 /* Verify that get_edit_distance appears to be a sane distance function,
    483    even though it doesn't satisfy the conditions for being a metric.  (This
    484    is because the triangle inequality fails to hold: the distance between
    485    "ca" and "ac" is 1, and so is the distance between "abc" and "ac", but
    486    the distance between "abc" and "ca" is 3. Algorithms that calculate the
    487    true Levenshtein-Damerau metric are much more expensive.)  */
    488 
    489 static void
    490 test_metric_conditions ()
    491 {
    492   const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]);
    493 
    494   for (int i = 0; i < num_test_cases; i++)
    495     {
    496       for (int j = 0; j < num_test_cases; j++)
    497 	{
    498 	  edit_distance_t dist_ij
    499 	    = get_edit_distance (test_data[i], test_data[j]);
    500 
    501 	  /* Identity of indiscernibles: d(i, j) > 0 iff i == j.  */
    502 	  if (i == j)
    503 	    ASSERT_EQ (dist_ij, 0);
    504 	  else
    505 	    ASSERT_TRUE (dist_ij > 0);
    506 
    507 	  /* Symmetry: d(i, j) == d(j, i).  */
    508 	  edit_distance_t dist_ji
    509 	    = get_edit_distance (test_data[j], test_data[i]);
    510 	  ASSERT_EQ (dist_ij, dist_ji);
    511 	}
    512     }
    513 }
    514 
    515 /* Run all of the selftests within this file.  */
    516 
    517 void
    518 spellcheck_cc_tests ()
    519 {
    520   test_edit_distances ();
    521   test_get_edit_distance_cutoff ();
    522   test_suggestions ();
    523   test_find_closest_string ();
    524   test_metric_conditions ();
    525 }
    526 
    527 } // namespace selftest
    528 
    529 #endif /* #if CHECKING_P */
    530