spellcheck.cc revision 1.1.1.1 1 1.1 mrg /* Find near-matches for strings.
2 1.1 mrg Copyright (C) 2015-2022 Free Software Foundation, Inc.
3 1.1 mrg
4 1.1 mrg This file is part of GCC.
5 1.1 mrg
6 1.1 mrg GCC is free software; you can redistribute it and/or modify it under
7 1.1 mrg the terms of the GNU General Public License as published by the Free
8 1.1 mrg Software Foundation; either version 3, or (at your option) any later
9 1.1 mrg version.
10 1.1 mrg
11 1.1 mrg GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 1.1 mrg WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 1.1 mrg FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 1.1 mrg for more details.
15 1.1 mrg
16 1.1 mrg You should have received a copy of the GNU General Public License
17 1.1 mrg along with GCC; see the file COPYING3. If not see
18 1.1 mrg <http://www.gnu.org/licenses/>. */
19 1.1 mrg
20 1.1 mrg #include "config.h"
21 1.1 mrg #include "system.h"
22 1.1 mrg #include "coretypes.h"
23 1.1 mrg #include "tm.h"
24 1.1 mrg #include "tree.h"
25 1.1 mrg #include "spellcheck.h"
26 1.1 mrg #include "selftest.h"
27 1.1 mrg
28 1.1 mrg /* Cost of a case transformation. */
29 1.1 mrg #define CASE_COST 1
30 1.1 mrg
31 1.1 mrg /* Cost of another kind of edit. */
32 1.1 mrg #define BASE_COST 2
33 1.1 mrg
34 1.1 mrg /* Get the edit distance between the two strings: the minimal
35 1.1 mrg number of edits that are needed to change one string into another,
36 1.1 mrg where edits can be one-character insertions, removals, or substitutions,
37 1.1 mrg or transpositions of two adjacent characters (counting as one "edit").
38 1.1 mrg
39 1.1 mrg This implementation uses a modified variant of the Wagner-Fischer
40 1.1 mrg algorithm for the Damerau-Levenshtein distance; specifically, the
41 1.1 mrg "optimal string alignment distance" or "restricted edit distance"
42 1.1 mrg variant. This implementation has been further modified to take
43 1.1 mrg case into account. */
44 1.1 mrg
45 1.1 mrg edit_distance_t
46 1.1 mrg get_edit_distance (const char *s, int len_s,
47 1.1 mrg const char *t, int len_t)
48 1.1 mrg {
49 1.1 mrg const bool debug = false;
50 1.1 mrg
51 1.1 mrg if (debug)
52 1.1 mrg {
53 1.1 mrg printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
54 1.1 mrg printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
55 1.1 mrg }
56 1.1 mrg
57 1.1 mrg if (len_s == 0)
58 1.1 mrg return BASE_COST * len_t;
59 1.1 mrg if (len_t == 0)
60 1.1 mrg return BASE_COST * len_s;
61 1.1 mrg
62 1.1 mrg /* We effectively build a matrix where each (i, j) contains the
63 1.1 mrg distance between the prefix strings s[0:j] and t[0:i].
64 1.1 mrg Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
65 1.1 mrg we simply keep track of the last two rows, v_one_ago and v_two_ago,
66 1.1 mrg and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
67 1.1 mrg allocation and memory accesses in favor of three (len_s + 1)
68 1.1 mrg allocations. These could potentially be
69 1.1 mrg statically-allocated if we impose a maximum length on the
70 1.1 mrg strings of interest. */
71 1.1 mrg edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
72 1.1 mrg edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
73 1.1 mrg edit_distance_t *v_next = new edit_distance_t[len_s + 1];
74 1.1 mrg
75 1.1 mrg /* The first row is for the case of an empty target string, which
76 1.1 mrg we can reach by deleting every character in the source string. */
77 1.1 mrg for (int i = 0; i < len_s + 1; i++)
78 1.1 mrg v_one_ago[i] = i * BASE_COST;
79 1.1 mrg
80 1.1 mrg /* Build successive rows. */
81 1.1 mrg for (int i = 0; i < len_t; i++)
82 1.1 mrg {
83 1.1 mrg if (debug)
84 1.1 mrg {
85 1.1 mrg printf ("i:%i v_one_ago = ", i);
86 1.1 mrg for (int j = 0; j < len_s + 1; j++)
87 1.1 mrg printf ("%i ", v_one_ago[j]);
88 1.1 mrg printf ("\n");
89 1.1 mrg }
90 1.1 mrg
91 1.1 mrg /* The initial column is for the case of an empty source string; we
92 1.1 mrg can reach prefixes of the target string of length i
93 1.1 mrg by inserting i characters. */
94 1.1 mrg v_next[0] = (i + 1) * BASE_COST;
95 1.1 mrg
96 1.1 mrg /* Build the rest of the row by considering neighbors to
97 1.1 mrg the north, west and northwest. */
98 1.1 mrg for (int j = 0; j < len_s; j++)
99 1.1 mrg {
100 1.1 mrg edit_distance_t cost;
101 1.1 mrg
102 1.1 mrg if (s[j] == t[i])
103 1.1 mrg cost = 0;
104 1.1 mrg else if (TOLOWER (s[j]) == TOLOWER (t[i]))
105 1.1 mrg cost = CASE_COST;
106 1.1 mrg else
107 1.1 mrg cost = BASE_COST;
108 1.1 mrg edit_distance_t deletion = v_next[j] + BASE_COST;
109 1.1 mrg edit_distance_t insertion = v_one_ago[j + 1] + BASE_COST;
110 1.1 mrg edit_distance_t substitution = v_one_ago[j] + cost;
111 1.1 mrg edit_distance_t cheapest = MIN (deletion, insertion);
112 1.1 mrg cheapest = MIN (cheapest, substitution);
113 1.1 mrg if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
114 1.1 mrg {
115 1.1 mrg edit_distance_t transposition = v_two_ago[j - 1] + BASE_COST;
116 1.1 mrg cheapest = MIN (cheapest, transposition);
117 1.1 mrg }
118 1.1 mrg v_next[j + 1] = cheapest;
119 1.1 mrg }
120 1.1 mrg
121 1.1 mrg /* Prepare to move on to next row. */
122 1.1 mrg for (int j = 0; j < len_s + 1; j++)
123 1.1 mrg {
124 1.1 mrg v_two_ago[j] = v_one_ago[j];
125 1.1 mrg v_one_ago[j] = v_next[j];
126 1.1 mrg }
127 1.1 mrg }
128 1.1 mrg
129 1.1 mrg if (debug)
130 1.1 mrg {
131 1.1 mrg printf ("final v_next = ");
132 1.1 mrg for (int j = 0; j < len_s + 1; j++)
133 1.1 mrg printf ("%i ", v_next[j]);
134 1.1 mrg printf ("\n");
135 1.1 mrg }
136 1.1 mrg
137 1.1 mrg edit_distance_t result = v_next[len_s];
138 1.1 mrg delete[] v_two_ago;
139 1.1 mrg delete[] v_one_ago;
140 1.1 mrg delete[] v_next;
141 1.1 mrg return result;
142 1.1 mrg }
143 1.1 mrg
144 1.1 mrg /* Get the edit distance between two nil-terminated strings. */
145 1.1 mrg
146 1.1 mrg edit_distance_t
147 1.1 mrg get_edit_distance (const char *s, const char *t)
148 1.1 mrg {
149 1.1 mrg return get_edit_distance (s, strlen (s), t, strlen (t));
150 1.1 mrg }
151 1.1 mrg
152 1.1 mrg /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
153 1.1 mrg an autovec of non-NULL strings, determine which element within
154 1.1 mrg CANDIDATES has the lowest edit distance to TARGET. If there are
155 1.1 mrg multiple elements with the same minimal distance, the first in the
156 1.1 mrg vector wins.
157 1.1 mrg
158 1.1 mrg If more than half of the letters were misspelled, the suggestion is
159 1.1 mrg likely to be meaningless, so return NULL for this case. */
160 1.1 mrg
161 1.1 mrg const char *
162 1.1 mrg find_closest_string (const char *target,
163 1.1 mrg const auto_vec<const char *> *candidates)
164 1.1 mrg {
165 1.1 mrg gcc_assert (target);
166 1.1 mrg gcc_assert (candidates);
167 1.1 mrg
168 1.1 mrg int i;
169 1.1 mrg const char *candidate;
170 1.1 mrg best_match<const char *, const char *> bm (target);
171 1.1 mrg FOR_EACH_VEC_ELT (*candidates, i, candidate)
172 1.1 mrg {
173 1.1 mrg gcc_assert (candidate);
174 1.1 mrg bm.consider (candidate);
175 1.1 mrg }
176 1.1 mrg
177 1.1 mrg return bm.get_best_meaningful_candidate ();
178 1.1 mrg }
179 1.1 mrg
180 1.1 mrg /* Generate the maximum edit distance for which we consider a suggestion
181 1.1 mrg to be meaningful, given a goal of length GOAL_LEN and a candidate of
182 1.1 mrg length CANDIDATE_LEN.
183 1.1 mrg
184 1.1 mrg This is a third of the length of the candidate or of the goal,
185 1.1 mrg whichever is bigger. */
186 1.1 mrg
187 1.1 mrg edit_distance_t
188 1.1 mrg get_edit_distance_cutoff (size_t goal_len, size_t candidate_len)
189 1.1 mrg {
190 1.1 mrg size_t max_length = MAX (goal_len, candidate_len);
191 1.1 mrg size_t min_length = MIN (goal_len, candidate_len);
192 1.1 mrg
193 1.1 mrg gcc_assert (max_length >= min_length);
194 1.1 mrg
195 1.1 mrg /* Special case: don't offer suggestions for a pair of
196 1.1 mrg length == 1 strings (or empty strings). */
197 1.1 mrg if (max_length <= 1)
198 1.1 mrg return 0;
199 1.1 mrg
200 1.1 mrg /* If the lengths are close, then round down. */
201 1.1 mrg if (max_length - min_length <= 1)
202 1.1 mrg /* ...but allow an edit distance of at least 1. */
203 1.1 mrg return BASE_COST * MAX (max_length / 3, 1);
204 1.1 mrg
205 1.1 mrg /* Otherwise, round up (thus giving a little extra leeway to some cases
206 1.1 mrg involving insertions/deletions). */
207 1.1 mrg return BASE_COST * (max_length + 2) / 3;
208 1.1 mrg }
209 1.1 mrg
210 1.1 mrg #if CHECKING_P
211 1.1 mrg
212 1.1 mrg namespace selftest {
213 1.1 mrg
214 1.1 mrg /* Selftests. */
215 1.1 mrg
216 1.1 mrg /* Verify that get_edit_distance (A, B) equals the expected value. */
217 1.1 mrg
218 1.1 mrg static void
219 1.1 mrg test_get_edit_distance_one_way (const char *a, const char *b,
220 1.1 mrg edit_distance_t expected)
221 1.1 mrg {
222 1.1 mrg edit_distance_t actual = get_edit_distance (a, b);
223 1.1 mrg ASSERT_EQ (actual, expected);
224 1.1 mrg }
225 1.1 mrg
226 1.1 mrg /* Verify that both
227 1.1 mrg get_edit_distance (A, B)
228 1.1 mrg and
229 1.1 mrg get_edit_distance (B, A)
230 1.1 mrg equal the expected value, to ensure that the function is symmetric. */
231 1.1 mrg
232 1.1 mrg static void
233 1.1 mrg test_get_edit_distance_both_ways (const char *a, const char *b,
234 1.1 mrg edit_distance_t expected)
235 1.1 mrg {
236 1.1 mrg test_get_edit_distance_one_way (a, b, expected);
237 1.1 mrg test_get_edit_distance_one_way (b, a, expected);
238 1.1 mrg }
239 1.1 mrg
240 1.1 mrg /* Verify get_edit_distance for a variety of pairs of pre-canned
241 1.1 mrg inputs, comparing against known-good values. */
242 1.1 mrg
243 1.1 mrg static void
244 1.1 mrg test_edit_distances ()
245 1.1 mrg {
246 1.1 mrg test_get_edit_distance_both_ways ("", "nonempty",
247 1.1 mrg BASE_COST * strlen ("nonempty"));
248 1.1 mrg test_get_edit_distance_both_ways ("saturday", "sunday",
249 1.1 mrg BASE_COST * 3);
250 1.1 mrg test_get_edit_distance_both_ways ("foo", "m_foo", BASE_COST * 2);
251 1.1 mrg test_get_edit_distance_both_ways ("hello_world", "HelloWorld", 4);
252 1.1 mrg test_get_edit_distance_both_ways
253 1.1 mrg ("the quick brown fox jumps over the lazy dog", "dog", BASE_COST * 40);
254 1.1 mrg test_get_edit_distance_both_ways
255 1.1 mrg ("the quick brown fox jumps over the lazy dog",
256 1.1 mrg "the quick brown dog jumps over the lazy fox",
257 1.1 mrg BASE_COST * 4);
258 1.1 mrg test_get_edit_distance_both_ways
259 1.1 mrg ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
260 1.1 mrg "All your base are belong to us",
261 1.1 mrg BASE_COST * 44);
262 1.1 mrg test_get_edit_distance_both_ways ("foo", "FOO", 3);
263 1.1 mrg test_get_edit_distance_both_ways ("fee", "deed", BASE_COST * 2);
264 1.1 mrg test_get_edit_distance_both_ways ("coorzd1", "coordx1", BASE_COST * 2);
265 1.1 mrg test_get_edit_distance_both_ways ("assert", "sqrt", BASE_COST * 3);
266 1.1 mrg test_get_edit_distance_both_ways ("PATH_MAX", "INT8_MAX", BASE_COST * 3);
267 1.1 mrg test_get_edit_distance_both_ways ("time", "nice", BASE_COST * 2);
268 1.1 mrg test_get_edit_distance_both_ways ("bar", "carg", BASE_COST * 2);
269 1.1 mrg test_get_edit_distance_both_ways ("gtk_widget_show_all",
270 1.1 mrg "GtkWidgetShowAll", 10);
271 1.1 mrg test_get_edit_distance_both_ways ("m_bar", "bar", BASE_COST * 2);
272 1.1 mrg test_get_edit_distance_both_ways ("MACRO", "MACRAME", BASE_COST * 3);
273 1.1 mrg test_get_edit_distance_both_ways ("ab", "ac", BASE_COST * 1);
274 1.1 mrg test_get_edit_distance_both_ways ("ab", "a", BASE_COST * 1);
275 1.1 mrg test_get_edit_distance_both_ways ("a", "b", BASE_COST * 1);
276 1.1 mrg test_get_edit_distance_both_ways ("nanl", "name", BASE_COST * 2);
277 1.1 mrg test_get_edit_distance_both_ways ("char", "bar", BASE_COST * 2);
278 1.1 mrg test_get_edit_distance_both_ways ("-optimize", "fsanitize", BASE_COST * 5);
279 1.1 mrg test_get_edit_distance_both_ways ("__DATE__", "__i386__", BASE_COST * 4);
280 1.1 mrg
281 1.1 mrg /* Examples where transposition helps. */
282 1.1 mrg test_get_edit_distance_both_ways ("ab", "ba", BASE_COST * 1);
283 1.1 mrg test_get_edit_distance_both_ways ("ba", "abc", BASE_COST * 2);
284 1.1 mrg test_get_edit_distance_both_ways ("coorzd1", "coordz1", BASE_COST * 1);
285 1.1 mrg test_get_edit_distance_both_ways ("abcdefghijklmnopqrstuvwxyz",
286 1.1 mrg "bacdefghijklmnopqrstuvwxzy",
287 1.1 mrg BASE_COST * 2);
288 1.1 mrg test_get_edit_distance_both_ways ("saturday", "sundya", BASE_COST * 4);
289 1.1 mrg test_get_edit_distance_both_ways ("signed", "singed", BASE_COST * 1);
290 1.1 mrg }
291 1.1 mrg
292 1.1 mrg /* Subroutine of test_get_edit_distance_cutoff, for emulating the
293 1.1 mrg spellchecking cutoff in up to GCC 8. */
294 1.1 mrg
295 1.1 mrg static edit_distance_t
296 1.1 mrg get_old_cutoff (size_t goal_len, size_t candidate_len)
297 1.1 mrg {
298 1.1 mrg return BASE_COST * MAX (goal_len, candidate_len) / 2;
299 1.1 mrg }
300 1.1 mrg
301 1.1 mrg /* Verify that the cutoff for "meaningfulness" of suggestions is at least as
302 1.1 mrg conservative as in older GCC releases.
303 1.1 mrg
304 1.1 mrg This should ensure that we don't offer additional meaningless
305 1.1 mrg suggestions (apart from those for which transposition has helped). */
306 1.1 mrg
307 1.1 mrg static void
308 1.1 mrg test_get_edit_distance_cutoff ()
309 1.1 mrg {
310 1.1 mrg for (size_t goal_len = 0; goal_len < 30; goal_len++)
311 1.1 mrg for (size_t candidate_len = 0; candidate_len < 30; candidate_len++)
312 1.1 mrg ASSERT_TRUE (get_edit_distance_cutoff (goal_len, candidate_len)
313 1.1 mrg <= get_old_cutoff (goal_len, candidate_len));
314 1.1 mrg }
315 1.1 mrg
316 1.1 mrg /* Assert that CANDIDATE is offered as a suggestion for TARGET. */
317 1.1 mrg
318 1.1 mrg static void
319 1.1 mrg assert_suggested_for (const location &loc, const char *candidate,
320 1.1 mrg const char *target)
321 1.1 mrg {
322 1.1 mrg auto_vec<const char *> candidates;
323 1.1 mrg candidates.safe_push (candidate);
324 1.1 mrg ASSERT_EQ_AT (loc, candidate, find_closest_string (target, &candidates));
325 1.1 mrg }
326 1.1 mrg
327 1.1 mrg /* Assert that CANDIDATE is offered as a suggestion for TARGET. */
328 1.1 mrg
329 1.1 mrg #define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET) \
330 1.1 mrg SELFTEST_BEGIN_STMT \
331 1.1 mrg assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
332 1.1 mrg SELFTEST_END_STMT
333 1.1 mrg
334 1.1 mrg /* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
335 1.1 mrg
336 1.1 mrg static void
337 1.1 mrg assert_not_suggested_for (const location &loc, const char *candidate,
338 1.1 mrg const char *target)
339 1.1 mrg {
340 1.1 mrg auto_vec<const char *> candidates;
341 1.1 mrg candidates.safe_push (candidate);
342 1.1 mrg ASSERT_EQ_AT (loc, NULL, find_closest_string (target, &candidates));
343 1.1 mrg }
344 1.1 mrg
345 1.1 mrg /* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
346 1.1 mrg
347 1.1 mrg #define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET) \
348 1.1 mrg SELFTEST_BEGIN_STMT \
349 1.1 mrg assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
350 1.1 mrg SELFTEST_END_STMT
351 1.1 mrg
352 1.1 mrg /* Verify that we offer varous suggestions that are meaningful,
353 1.1 mrg and that we don't offer various other ones that aren't (PR c/82967). */
354 1.1 mrg
355 1.1 mrg static void
356 1.1 mrg test_suggestions ()
357 1.1 mrg {
358 1.1 mrg /* Good suggestions. */
359 1.1 mrg
360 1.1 mrg ASSERT_SUGGESTED_FOR ("m_bar", "bar");
361 1.1 mrg // dist == 2, max_length == 5, min_length == 3
362 1.1 mrg
363 1.1 mrg ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME");
364 1.1 mrg // dist == 3, max_length == 7, min_length == 5
365 1.1 mrg
366 1.1 mrg ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll");
367 1.1 mrg // dist == 7, max_length == 16, min_length = 19
368 1.1 mrg
369 1.1 mrg ASSERT_SUGGESTED_FOR ("ab", "ac");
370 1.1 mrg // dist == 1, max_length == min_length = 2
371 1.1 mrg
372 1.1 mrg ASSERT_SUGGESTED_FOR ("ab", "a");
373 1.1 mrg // dist == 1, max_length == 2, min_length = 1
374 1.1 mrg
375 1.1 mrg /* Bad suggestions. */
376 1.1 mrg
377 1.1 mrg ASSERT_NOT_SUGGESTED_FOR ("a", "b");
378 1.1 mrg // dist == 1, max_length == min_length = 1
379 1.1 mrg
380 1.1 mrg ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert");
381 1.1 mrg // dist == 3, max_length 6, min_length == 4
382 1.1 mrg
383 1.1 mrg ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX");
384 1.1 mrg // dist == 3, max_length == min_length == 8
385 1.1 mrg
386 1.1 mrg ASSERT_NOT_SUGGESTED_FOR ("nice", "time");
387 1.1 mrg ASSERT_NOT_SUGGESTED_FOR ("nanl", "name");
388 1.1 mrg // dist == 2, max_length == min_length == 4
389 1.1 mrg
390 1.1 mrg ASSERT_NOT_SUGGESTED_FOR ("carg", "bar");
391 1.1 mrg ASSERT_NOT_SUGGESTED_FOR ("char", "bar");
392 1.1 mrg // dist == 2, max_length == 4, min_length == 3
393 1.1 mrg
394 1.1 mrg ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize");
395 1.1 mrg // dist == 5, max_length == min_length == 9
396 1.1 mrg
397 1.1 mrg ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__");
398 1.1 mrg // dist == 4, max_length == min_length == 8
399 1.1 mrg
400 1.1 mrg ASSERT_NOT_SUGGESTED_FOR ("start_input_device", "InputDevice");
401 1.1 mrg // dist == 9, max_length == 18, min_length == 11
402 1.1 mrg }
403 1.1 mrg
404 1.1 mrg /* Verify that find_closest_string is sane. */
405 1.1 mrg
406 1.1 mrg static void
407 1.1 mrg test_find_closest_string ()
408 1.1 mrg {
409 1.1 mrg auto_vec<const char *> candidates;
410 1.1 mrg
411 1.1 mrg /* Verify that it can handle an empty vec. */
412 1.1 mrg ASSERT_EQ (NULL, find_closest_string ("", &candidates));
413 1.1 mrg
414 1.1 mrg /* Verify that it works sanely for non-empty vecs. */
415 1.1 mrg candidates.safe_push ("apple");
416 1.1 mrg candidates.safe_push ("banana");
417 1.1 mrg candidates.safe_push ("cherry");
418 1.1 mrg
419 1.1 mrg ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
420 1.1 mrg ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
421 1.1 mrg ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
422 1.1 mrg ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
423 1.1 mrg
424 1.1 mrg /* The order of the vec can matter, but it should not matter for these
425 1.1 mrg inputs. */
426 1.1 mrg candidates.truncate (0);
427 1.1 mrg candidates.safe_push ("cherry");
428 1.1 mrg candidates.safe_push ("banana");
429 1.1 mrg candidates.safe_push ("apple");
430 1.1 mrg ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
431 1.1 mrg ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
432 1.1 mrg ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
433 1.1 mrg ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
434 1.1 mrg
435 1.1 mrg /* If the goal string somehow makes it into the candidate list, offering
436 1.1 mrg it as a suggestion will be nonsensical. Verify that we don't offer such
437 1.1 mrg suggestions. */
438 1.1 mrg ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
439 1.1 mrg
440 1.1 mrg /* Example from PR 69968 where transposition helps. */
441 1.1 mrg candidates.truncate (0);
442 1.1 mrg candidates.safe_push("coordx");
443 1.1 mrg candidates.safe_push("coordy");
444 1.1 mrg candidates.safe_push("coordz");
445 1.1 mrg candidates.safe_push("coordx1");
446 1.1 mrg candidates.safe_push("coordy1");
447 1.1 mrg candidates.safe_push("coordz1");
448 1.1 mrg ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
449 1.1 mrg
450 1.1 mrg candidates.truncate (0);
451 1.1 mrg candidates.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
452 1.1 mrg candidates.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
453 1.1 mrg candidates.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
454 1.1 mrg ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
455 1.1 mrg find_closest_string ("DWARF_GNAT_ENCODINGS_all",
456 1.1 mrg &candidates));
457 1.1 mrg
458 1.1 mrg /* The same as the previous test, but with a different order of
459 1.1 mrg candidates. */
460 1.1 mrg candidates.truncate (0);
461 1.1 mrg candidates.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
462 1.1 mrg candidates.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
463 1.1 mrg candidates.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
464 1.1 mrg ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
465 1.1 mrg find_closest_string ("DWARF_GNAT_ENCODINGS_all",
466 1.1 mrg &candidates));
467 1.1 mrg }
468 1.1 mrg
469 1.1 mrg /* Test data for test_metric_conditions. */
470 1.1 mrg
471 1.1 mrg static const char * const test_data[] = {
472 1.1 mrg "",
473 1.1 mrg "foo",
474 1.1 mrg "food",
475 1.1 mrg "boo",
476 1.1 mrg "1234567890123456789012345678901234567890123456789012345678901234567890",
477 1.1 mrg "abc",
478 1.1 mrg "ac",
479 1.1 mrg "ca",
480 1.1 mrg };
481 1.1 mrg
482 1.1 mrg /* Verify that get_edit_distance appears to be a sane distance function,
483 1.1 mrg even though it doesn't satisfy the conditions for being a metric. (This
484 1.1 mrg is because the triangle inequality fails to hold: the distance between
485 1.1 mrg "ca" and "ac" is 1, and so is the distance between "abc" and "ac", but
486 1.1 mrg the distance between "abc" and "ca" is 3. Algorithms that calculate the
487 1.1 mrg true Levenshtein-Damerau metric are much more expensive.) */
488 1.1 mrg
489 1.1 mrg static void
490 1.1 mrg test_metric_conditions ()
491 1.1 mrg {
492 1.1 mrg const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]);
493 1.1 mrg
494 1.1 mrg for (int i = 0; i < num_test_cases; i++)
495 1.1 mrg {
496 1.1 mrg for (int j = 0; j < num_test_cases; j++)
497 1.1 mrg {
498 1.1 mrg edit_distance_t dist_ij
499 1.1 mrg = get_edit_distance (test_data[i], test_data[j]);
500 1.1 mrg
501 1.1 mrg /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */
502 1.1 mrg if (i == j)
503 1.1 mrg ASSERT_EQ (dist_ij, 0);
504 1.1 mrg else
505 1.1 mrg ASSERT_TRUE (dist_ij > 0);
506 1.1 mrg
507 1.1 mrg /* Symmetry: d(i, j) == d(j, i). */
508 1.1 mrg edit_distance_t dist_ji
509 1.1 mrg = get_edit_distance (test_data[j], test_data[i]);
510 1.1 mrg ASSERT_EQ (dist_ij, dist_ji);
511 1.1 mrg }
512 1.1 mrg }
513 1.1 mrg }
514 1.1 mrg
515 1.1 mrg /* Run all of the selftests within this file. */
516 1.1 mrg
517 1.1 mrg void
518 1.1 mrg spellcheck_cc_tests ()
519 1.1 mrg {
520 1.1 mrg test_edit_distances ();
521 1.1 mrg test_get_edit_distance_cutoff ();
522 1.1 mrg test_suggestions ();
523 1.1 mrg test_find_closest_string ();
524 1.1 mrg test_metric_conditions ();
525 1.1 mrg }
526 1.1 mrg
527 1.1 mrg } // namespace selftest
528 1.1 mrg
529 1.1 mrg #endif /* #if CHECKING_P */
530