Home | History | Annotate | Line # | Download | only in info
      1 /*	$NetBSD: search.c,v 1.1.1.1 2016/01/14 00:11:29 christos Exp $	*/
      2 
      3 /* search.c -- searching large bodies of text.
      4    Id: search.c,v 1.3 2004/04/11 17:56:46 karl Exp
      5 
      6    Copyright (C) 1993, 1997, 1998, 2002, 2004 Free Software Foundation, Inc.
      7 
      8    This program is free software; you can redistribute it and/or modify
      9    it under the terms of the GNU General Public License as published by
     10    the Free Software Foundation; either version 2, or (at your option)
     11    any later version.
     12 
     13    This program is distributed in the hope that it will be useful,
     14    but WITHOUT ANY WARRANTY; without even the implied warranty of
     15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     16    GNU General Public License for more details.
     17 
     18    You should have received a copy of the GNU General Public License
     19    along with this program; if not, write to the Free Software
     20    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
     21 
     22    Written by Brian Fox (bfox (at) ai.mit.edu). */
     23 
     24 #include "info.h"
     25 
     26 #include "search.h"
     27 #include "nodes.h"
     28 
     29 /* The search functions take two arguments:
     30 
     31      1) a string to search for, and
     32 
     33      2) a pointer to a SEARCH_BINDING which contains the buffer, start,
     34         and end of the search.
     35 
     36    They return a long, which is the offset from the start of the buffer
     37    at which the match was found.  An offset of -1 indicates failure. */
     38 
     39 /* A function which makes a binding with buffer and bounds. */
     40 SEARCH_BINDING *
     41 make_binding (char *buffer, long int start, long int end)
     42 {
     43   SEARCH_BINDING *binding;
     44 
     45   binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING));
     46   binding->buffer = buffer;
     47   binding->start = start;
     48   binding->end = end;
     49   binding->flags = 0;
     50 
     51   return (binding);
     52 }
     53 
     54 /* Make a copy of BINDING without duplicating the data. */
     55 SEARCH_BINDING *
     56 copy_binding (SEARCH_BINDING *binding)
     57 {
     58   SEARCH_BINDING *copy;
     59 
     60   copy = make_binding (binding->buffer, binding->start, binding->end);
     61   copy->flags = binding->flags;
     62   return (copy);
     63 }
     64 
     65 
     66 /* **************************************************************** */
     68 /*                                                                  */
     69 /*                 The Actual Searching Functions                   */
     70 /*                                                                  */
     71 /* **************************************************************** */
     72 
     73 /* Search forwards or backwards for the text delimited by BINDING.
     74    The search is forwards if BINDING->start is greater than BINDING->end. */
     75 long
     76 search (char *string, SEARCH_BINDING *binding)
     77 {
     78   long result;
     79 
     80   /* If the search is backwards, then search backwards, otherwise forwards. */
     81   if (binding->start > binding->end)
     82     result = search_backward (string, binding);
     83   else
     84     result = search_forward (string, binding);
     85 
     86   return (result);
     87 }
     88 
     89 /* Search forwards for STRING through the text delimited in BINDING. */
     90 long
     91 search_forward (char *string, SEARCH_BINDING *binding)
     92 {
     93   register int c, i, len;
     94   register char *buff, *end;
     95   char *alternate = (char *)NULL;
     96 
     97   len = strlen (string);
     98 
     99   /* We match characters in the search buffer against STRING and ALTERNATE.
    100      ALTERNATE is a case reversed version of STRING; this is cheaper than
    101      case folding each character before comparison.   Alternate is only
    102      used if the case folding bit is turned on in the passed BINDING. */
    103 
    104   if (binding->flags & S_FoldCase)
    105     {
    106       alternate = xstrdup (string);
    107 
    108       for (i = 0; i < len; i++)
    109         {
    110           if (islower (alternate[i]))
    111             alternate[i] = toupper (alternate[i]);
    112           else if (isupper (alternate[i]))
    113             alternate[i] = tolower (alternate[i]);
    114         }
    115     }
    116 
    117   buff = binding->buffer + binding->start;
    118   end = binding->buffer + binding->end + 1;
    119 
    120   while (buff < (end - len))
    121     {
    122       for (i = 0; i < len; i++)
    123         {
    124           c = buff[i];
    125 
    126           if ((c != string[i]) && (!alternate || c != alternate[i]))
    127             break;
    128         }
    129 
    130       if (!string[i])
    131         {
    132           if (alternate)
    133             free (alternate);
    134           if (binding->flags & S_SkipDest)
    135             buff += len;
    136           return ((long) (buff - binding->buffer));
    137         }
    138 
    139       buff++;
    140     }
    141 
    142   if (alternate)
    143     free (alternate);
    144 
    145   return ((long) -1);
    146 }
    147 
    148 /* Search for STRING backwards through the text delimited in BINDING. */
    149 long
    150 search_backward (char *input_string, SEARCH_BINDING *binding)
    151 {
    152   register int c, i, len;
    153   register char *buff, *end;
    154   char *string;
    155   char *alternate = (char *)NULL;
    156 
    157   len = strlen (input_string);
    158 
    159   /* Reverse the characters in the search string. */
    160   string = (char *)xmalloc (1 + len);
    161   for (c = 0, i = len - 1; input_string[c]; c++, i--)
    162     string[i] = input_string[c];
    163 
    164   string[c] = '\0';
    165 
    166   /* We match characters in the search buffer against STRING and ALTERNATE.
    167      ALTERNATE is a case reversed version of STRING; this is cheaper than
    168      case folding each character before comparison.   ALTERNATE is only
    169      used if the case folding bit is turned on in the passed BINDING. */
    170 
    171   if (binding->flags & S_FoldCase)
    172     {
    173       alternate = xstrdup (string);
    174 
    175       for (i = 0; i < len; i++)
    176         {
    177           if (islower (alternate[i]))
    178             alternate[i] = toupper (alternate[i]);
    179           else if (isupper (alternate[i]))
    180             alternate[i] = tolower (alternate[i]);
    181         }
    182     }
    183 
    184   buff = binding->buffer + binding->start - 1;
    185   end = binding->buffer + binding->end;
    186 
    187   while (buff > (end + len))
    188     {
    189       for (i = 0; i < len; i++)
    190         {
    191           c = *(buff - i);
    192 
    193           if (c != string[i] && (!alternate || c != alternate[i]))
    194             break;
    195         }
    196 
    197       if (!string[i])
    198         {
    199           free (string);
    200           if (alternate)
    201             free (alternate);
    202 
    203           if (binding->flags & S_SkipDest)
    204             buff -= len;
    205           return ((long) (1 + (buff - binding->buffer)));
    206         }
    207 
    208       buff--;
    209     }
    210 
    211   free (string);
    212   if (alternate)
    213     free (alternate);
    214 
    215   return ((long) -1);
    216 }
    217 
    218 /* Find STRING in LINE, returning the offset of the end of the string.
    219    Return an offset of -1 if STRING does not appear in LINE.  The search
    220    is bound by the end of the line (i.e., either NEWLINE or 0). */
    221 int
    222 string_in_line (char *string, char *line)
    223 {
    224   register int end;
    225   SEARCH_BINDING binding;
    226 
    227   /* Find the end of the line. */
    228   for (end = 0; line[end] && line[end] != '\n'; end++);
    229 
    230   /* Search for STRING within these confines. */
    231   binding.buffer = line;
    232   binding.start = 0;
    233   binding.end = end;
    234   binding.flags = S_FoldCase | S_SkipDest;
    235 
    236   return (search_forward (string, &binding));
    237 }
    238 
    239 /* Return non-zero if STRING is the first text to appear at BINDING. */
    240 int
    241 looking_at (char *string, SEARCH_BINDING *binding)
    242 {
    243   long search_end;
    244 
    245   search_end = search (string, binding);
    246 
    247   /* If the string was not found, SEARCH_END is -1.  If the string was found,
    248      but not right away, SEARCH_END is != binding->start.  Otherwise, the
    249      string was found at binding->start. */
    250   return (search_end == binding->start);
    251 }
    252 
    253 /* **************************************************************** */
    255 /*                                                                  */
    256 /*                    Small String Searches                         */
    257 /*                                                                  */
    258 /* **************************************************************** */
    259 
    260 /* Function names that start with "skip" are passed a string, and return
    261    an offset from the start of that string.  Function names that start
    262    with "find" are passed a SEARCH_BINDING, and return an absolute position
    263    marker of the item being searched for.  "Find" functions return a value
    264    of -1 if the item being looked for couldn't be found. */
    265 
    266 /* Return the index of the first non-whitespace character in STRING. */
    267 int
    268 skip_whitespace (char *string)
    269 {
    270   register int i;
    271 
    272   for (i = 0; string && whitespace (string[i]); i++);
    273   return (i);
    274 }
    275 
    276 /* Return the index of the first non-whitespace or newline character in
    277    STRING. */
    278 int
    279 skip_whitespace_and_newlines (char *string)
    280 {
    281   register int i;
    282 
    283   for (i = 0; string && whitespace_or_newline (string[i]); i++);
    284   return (i);
    285 }
    286 
    287 /* Return the index of the first whitespace character in STRING. */
    288 int
    289 skip_non_whitespace (char *string)
    290 {
    291   register int i;
    292 
    293   for (i = 0; string && string[i] && !whitespace (string[i]); i++);
    294   return (i);
    295 }
    296 
    297 /* Return the index of the first non-node character in STRING.  Note that
    298    this function contains quite a bit of hair to ignore periods in some
    299    special cases.  This is because we here at GNU ship some info files which
    300    contain nodenames that contain periods.  No such nodename can start with
    301    a period, or continue with whitespace, newline, or ')' immediately following
    302    the period.  If second argument NEWLINES_OKAY is non-zero, newlines should
    303    be skipped while parsing out the nodename specification. */
    304 int
    305 skip_node_characters (char *string, int newlines_okay)
    306 {
    307   register int c, i = 0;
    308   int paren_seen = 0;
    309   int paren = 0;
    310 
    311   /* Handle special case.  This is when another function has parsed out the
    312      filename component of the node name, and we just want to parse out the
    313      nodename proper.  In that case, a period at the start of the nodename
    314      indicates an empty nodename. */
    315   if (string && *string == '.')
    316     return (0);
    317 
    318   if (string && *string == '(')
    319     {
    320       paren++;
    321       paren_seen++;
    322       i++;
    323     }
    324 
    325   for (; string && (c = string[i]); i++)
    326     {
    327       if (paren)
    328         {
    329           if (c == '(')
    330             paren++;
    331           else if (c == ')')
    332             paren--;
    333 
    334           continue;
    335         }
    336 
    337       /* If the character following the close paren is a space or period,
    338          then this node name has no more characters associated with it. */
    339       if (c == '\t' ||
    340           c == ','  ||
    341           c == INFO_TAGSEP ||
    342           ((!newlines_okay) && (c == '\n')) ||
    343           ((paren_seen && string[i - 1] == ')') &&
    344            (c == ' ' || c == '.')) ||
    345           (c == '.' &&
    346            (
    347 #if 0
    348 /* This test causes a node name ending in a period, like `This.', not to
    349    be found.  The trailing . is stripped.  This occurs in the jargon
    350    file (`I see no X here.' is a node name).  */
    351            (!string[i + 1]) ||
    352 #endif
    353             (whitespace_or_newline (string[i + 1])) ||
    354             (string[i + 1] == ')'))))
    355         break;
    356     }
    357   return (i);
    358 }
    359 
    360 
    361 /* **************************************************************** */
    363 /*                                                                  */
    364 /*                   Searching FILE_BUFFER's                        */
    365 /*                                                                  */
    366 /* **************************************************************** */
    367 
    368 /* Return the absolute position of the first occurence of a node separator in
    369    BINDING-buffer.  The search starts at BINDING->start.  Return -1 if no node
    370    separator was found. */
    371 long
    372 find_node_separator (SEARCH_BINDING *binding)
    373 {
    374   register long i;
    375   char *body;
    376 
    377   body = binding->buffer;
    378 
    379   /* A node is started by [^L]^_[^L]\n.  That is to say, the C-l's are
    380      optional, but the DELETE and NEWLINE are not.  This separator holds
    381      true for all separated elements in an Info file, including the tags
    382      table (if present) and the indirect tags table (if present). */
    383   for (i = binding->start; i < binding->end - 1; i++)
    384     if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) &&
    385          (body[i + 2] == '\n' ||
    386           (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) ||
    387         ((body[i] == INFO_COOKIE) &&
    388          (body[i + 1] == '\n' ||
    389           (body[i + 1] == INFO_FF && body[i + 2] == '\n'))))
    390       return (i);
    391   return (-1);
    392 }
    393 
    394 /* Return the length of the node separator characters that BODY is
    395    currently pointing at. */
    396 int
    397 skip_node_separator (char *body)
    398 {
    399   register int i;
    400 
    401   i = 0;
    402 
    403   if (body[i] == INFO_FF)
    404     i++;
    405 
    406   if (body[i++] != INFO_COOKIE)
    407     return (0);
    408 
    409   if (body[i] == INFO_FF)
    410     i++;
    411 
    412   if (body[i++] != '\n')
    413     return (0);
    414 
    415   return (i);
    416 }
    417 
    418 /* Return the number of characters from STRING to the start of
    419    the next line. */
    420 int
    421 skip_line (char *string)
    422 {
    423   register int i;
    424 
    425   for (i = 0; string && string[i] && string[i] != '\n'; i++);
    426 
    427   if (string[i] == '\n')
    428     i++;
    429 
    430   return (i);
    431 }
    432 
    433 /* Return the absolute position of the beginning of a tags table in this
    434    binding starting the search at binding->start. */
    435 long
    436 find_tags_table (SEARCH_BINDING *binding)
    437 {
    438   SEARCH_BINDING tmp_search;
    439   long position;
    440 
    441   tmp_search.buffer = binding->buffer;
    442   tmp_search.start = binding->start;
    443   tmp_search.end = binding->end;
    444   tmp_search.flags = S_FoldCase;
    445 
    446   while ((position = find_node_separator (&tmp_search)) != -1 )
    447     {
    448       tmp_search.start = position;
    449       tmp_search.start += skip_node_separator (tmp_search.buffer
    450           + tmp_search.start);
    451 
    452       if (looking_at (TAGS_TABLE_BEG_LABEL, &tmp_search))
    453         return (position);
    454     }
    455   return (-1);
    456 }
    457 
    458 /* Return the absolute position of the node named NODENAME in BINDING.
    459    This is a brute force search, and we wish to avoid it when possible.
    460    This function is called when a tag (indirect or otherwise) doesn't
    461    really point to the right node.  It returns the absolute position of
    462    the separator preceding the node. */
    463 long
    464 find_node_in_binding (char *nodename, SEARCH_BINDING *binding)
    465 {
    466   long position;
    467   int offset, namelen;
    468   SEARCH_BINDING tmp_search;
    469 
    470   namelen = strlen (nodename);
    471 
    472   tmp_search.buffer = binding->buffer;
    473   tmp_search.start = binding->start;
    474   tmp_search.end = binding->end;
    475   tmp_search.flags = 0;
    476 
    477   while ((position = find_node_separator (&tmp_search)) != -1)
    478     {
    479       tmp_search.start = position;
    480       tmp_search.start += skip_node_separator
    481         (tmp_search.buffer + tmp_search.start);
    482 
    483       offset = string_in_line
    484         (INFO_NODE_LABEL, tmp_search.buffer + tmp_search.start);
    485 
    486       if (offset == -1)
    487         continue;
    488 
    489       tmp_search.start += offset;
    490       tmp_search.start += skip_whitespace (tmp_search.buffer + tmp_search.start);
    491       offset = skip_node_characters
    492         (tmp_search.buffer + tmp_search.start, DONT_SKIP_NEWLINES);
    493 
    494       /* Notice that this is an exact match.  You cannot grovel through
    495          the buffer with this function looking for random nodes. */
    496        if ((offset == namelen) &&
    497            (tmp_search.buffer[tmp_search.start] == nodename[0]) &&
    498            (strncmp (tmp_search.buffer + tmp_search.start, nodename, offset) == 0))
    499          return (position);
    500     }
    501   return (-1);
    502 }
    503