Home | History | Annotate | Line # | Download | only in indent
lexi.c revision 1.147
      1  1.147  rillig /*	$NetBSD: lexi.c,v 1.147 2021/11/19 19:55:15 rillig Exp $	*/
      2    1.3     tls 
      3   1.16   kamil /*-
      4   1.16   kamil  * SPDX-License-Identifier: BSD-4-Clause
      5   1.16   kamil  *
      6   1.16   kamil  * Copyright (c) 1985 Sun Microsystems, Inc.
      7    1.5     mrg  * Copyright (c) 1980, 1993
      8    1.5     mrg  *	The Regents of the University of California.  All rights reserved.
      9    1.1     cgd  * All rights reserved.
     10    1.1     cgd  *
     11    1.1     cgd  * Redistribution and use in source and binary forms, with or without
     12    1.1     cgd  * modification, are permitted provided that the following conditions
     13    1.1     cgd  * are met:
     14    1.1     cgd  * 1. Redistributions of source code must retain the above copyright
     15    1.1     cgd  *    notice, this list of conditions and the following disclaimer.
     16    1.1     cgd  * 2. Redistributions in binary form must reproduce the above copyright
     17    1.1     cgd  *    notice, this list of conditions and the following disclaimer in the
     18    1.1     cgd  *    documentation and/or other materials provided with the distribution.
     19    1.1     cgd  * 3. All advertising materials mentioning features or use of this software
     20    1.1     cgd  *    must display the following acknowledgement:
     21    1.1     cgd  *	This product includes software developed by the University of
     22    1.1     cgd  *	California, Berkeley and its contributors.
     23    1.1     cgd  * 4. Neither the name of the University nor the names of its contributors
     24    1.1     cgd  *    may be used to endorse or promote products derived from this software
     25    1.1     cgd  *    without specific prior written permission.
     26    1.1     cgd  *
     27    1.1     cgd  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     28    1.1     cgd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29    1.1     cgd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30    1.1     cgd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     31    1.1     cgd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     32    1.1     cgd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     33    1.1     cgd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     34    1.1     cgd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     35    1.1     cgd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     36    1.1     cgd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     37    1.1     cgd  * SUCH DAMAGE.
     38    1.1     cgd  */
     39    1.1     cgd 
     40   1.16   kamil #if 0
     41   1.16   kamil static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
     42   1.16   kamil #endif
     43   1.16   kamil 
     44    1.6   lukem #include <sys/cdefs.h>
     45   1.16   kamil #if defined(__NetBSD__)
     46  1.147  rillig __RCSID("$NetBSD: lexi.c,v 1.147 2021/11/19 19:55:15 rillig Exp $");
     47   1.16   kamil #elif defined(__FreeBSD__)
     48   1.16   kamil __FBSDID("$FreeBSD: head/usr.bin/indent/lexi.c 337862 2018-08-15 18:19:45Z pstef $");
     49   1.16   kamil #endif
     50    1.1     cgd 
     51  1.142  rillig #include <assert.h>
     52    1.1     cgd #include <ctype.h>
     53    1.1     cgd #include <stdlib.h>
     54    1.1     cgd #include <string.h>
     55   1.16   kamil 
     56   1.16   kamil #include "indent.h"
     57    1.1     cgd 
     58  1.127  rillig /*
     59  1.127  rillig  * While inside lexi_alnum, this constant just marks a type, independently of
     60  1.127  rillig  * the parentheses level.
     61  1.127  rillig  */
     62  1.135  rillig #define lsym_type lsym_type_outside_parentheses
     63  1.127  rillig 
     64   1.60  rillig /* must be sorted alphabetically, is used in binary search */
     65   1.62  rillig static const struct keyword {
     66   1.62  rillig     const char *name;
     67  1.125  rillig     lexer_symbol lsym;
     68   1.62  rillig } keywords[] = {
     69  1.127  rillig     {"_Bool", lsym_type},
     70  1.127  rillig     {"_Complex", lsym_type},
     71  1.127  rillig     {"_Imaginary", lsym_type},
     72  1.127  rillig     {"auto", lsym_storage_class},
     73  1.127  rillig     {"bool", lsym_type},
     74  1.134  rillig     {"break", lsym_word},
     75  1.127  rillig     {"case", lsym_case_label},
     76  1.127  rillig     {"char", lsym_type},
     77  1.127  rillig     {"complex", lsym_type},
     78  1.127  rillig     {"const", lsym_type},
     79  1.134  rillig     {"continue", lsym_word},
     80  1.127  rillig     {"default", lsym_case_label},
     81  1.127  rillig     {"do", lsym_do},
     82  1.127  rillig     {"double", lsym_type},
     83  1.127  rillig     {"else", lsym_else},
     84  1.127  rillig     {"enum", lsym_tag},
     85  1.127  rillig     {"extern", lsym_storage_class},
     86  1.127  rillig     {"float", lsym_type},
     87  1.127  rillig     {"for", lsym_for},
     88  1.134  rillig     {"goto", lsym_word},
     89  1.127  rillig     {"if", lsym_if},
     90  1.127  rillig     {"imaginary", lsym_type},
     91  1.134  rillig     {"inline", lsym_word},
     92  1.127  rillig     {"int", lsym_type},
     93  1.127  rillig     {"long", lsym_type},
     94  1.127  rillig     {"offsetof", lsym_offsetof},
     95  1.127  rillig     {"register", lsym_storage_class},
     96  1.134  rillig     {"restrict", lsym_word},
     97  1.129  rillig     {"return", lsym_return},
     98  1.127  rillig     {"short", lsym_type},
     99  1.127  rillig     {"signed", lsym_type},
    100  1.127  rillig     {"sizeof", lsym_sizeof},
    101  1.127  rillig     {"static", lsym_storage_class},
    102  1.127  rillig     {"struct", lsym_tag},
    103  1.127  rillig     {"switch", lsym_switch},
    104  1.127  rillig     {"typedef", lsym_typedef},
    105  1.127  rillig     {"union", lsym_tag},
    106  1.127  rillig     {"unsigned", lsym_type},
    107  1.127  rillig     {"void", lsym_type},
    108  1.127  rillig     {"volatile", lsym_type},
    109  1.127  rillig     {"while", lsym_while}
    110    1.1     cgd };
    111    1.1     cgd 
    112   1.84  rillig static struct {
    113   1.64  rillig     const char **items;
    114   1.64  rillig     unsigned int len;
    115   1.64  rillig     unsigned int cap;
    116   1.64  rillig } typenames;
    117   1.16   kamil 
    118   1.16   kamil /*
    119   1.16   kamil  * The transition table below was rewritten by hand from lx's output, given
    120   1.16   kamil  * the following definitions. lx is Katherine Flavel's lexer generator.
    121   1.16   kamil  *
    122   1.16   kamil  * O  = /[0-7]/;        D  = /[0-9]/;          NZ = /[1-9]/;
    123   1.16   kamil  * H  = /[a-f0-9]/i;    B  = /[0-1]/;          HP = /0x/i;
    124   1.16   kamil  * BP = /0b/i;          E  = /e[+\-]?/i D+;    P  = /p[+\-]?/i D+;
    125   1.16   kamil  * FS = /[fl]/i;        IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?;
    126   1.16   kamil  *
    127   1.16   kamil  * D+           E  FS? -> $float;
    128   1.16   kamil  * D*    "." D+ E? FS? -> $float;
    129   1.16   kamil  * D+    "."    E? FS? -> $float;    HP H+           IS? -> $int;
    130   1.16   kamil  * HP H+        P  FS? -> $float;    NZ D*           IS? -> $int;
    131   1.16   kamil  * HP H* "." H+ P  FS? -> $float;    "0" O*          IS? -> $int;
    132   1.16   kamil  * HP H+ "."    P  FS  -> $float;    BP B+           IS? -> $int;
    133   1.16   kamil  */
    134   1.71  rillig /* INDENT OFF */
    135   1.82  rillig static const unsigned char lex_number_state[][26] = {
    136   1.16   kamil     /*                examples:
    137   1.16   kamil                                      00
    138   1.16   kamil              s                      0xx
    139   1.16   kamil              t                    00xaa
    140   1.16   kamil              a     11       101100xxa..
    141   1.16   kamil              r   11ee0001101lbuuxx.a.pp
    142   1.16   kamil              t.01.e+008bLuxll0Ll.aa.p+0
    143   1.16   kamil     states:  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
    144   1.83  rillig     [0] =   "uuiifuufiuuiiuiiiiiuiuuuuu",	/* (other) */
    145   1.83  rillig     [1] =   "CEIDEHHHIJQ  U  Q  VUVVZZZ",	/* 0 */
    146   1.83  rillig     [2] =   "DEIDEHHHIJQ  U  Q  VUVVZZZ",	/* 1 */
    147   1.83  rillig     [3] =   "DEIDEHHHIJ   U     VUVVZZZ",	/* 2 3 4 5 6 7 */
    148   1.83  rillig     [4] =   "DEJDEHHHJJ   U     VUVVZZZ",	/* 8 9 */
    149   1.83  rillig     [5] =   "             U     VUVV   ",	/* A a C c D d */
    150   1.83  rillig     [6] =   "  K          U     VUVV   ",	/* B b */
    151   1.83  rillig     [7] =   "  FFF   FF   U     VUVV   ",	/* E e */
    152   1.83  rillig     [8] =   "    f  f     U     VUVV  f",	/* F f */
    153   1.83  rillig     [9] =   "  LLf  fL  PR   Li  L    f",	/* L */
    154   1.83  rillig     [10] =  "  OOf  fO   S P O i O    f",	/* l */
    155   1.83  rillig     [11] =  "                    FFX   ",	/* P p */
    156   1.83  rillig     [12] =  "  MM    M  i  iiM   M     ",	/* U u */
    157   1.83  rillig     [13] =  "  N                       ",	/* X x */
    158   1.83  rillig     [14] =  "     G                 Y  ",	/* + - */
    159   1.83  rillig     [15] =  "B EE    EE   T      W     ",	/* . */
    160   1.16   kamil     /*       ABCDEFGHIJKLMNOPQRSTUVWXYZ */
    161    1.1     cgd };
    162   1.71  rillig /* INDENT ON */
    163    1.1     cgd 
    164  1.115  rillig static const unsigned char lex_number_row[] = {
    165   1.56  rillig     ['0'] = 1,
    166   1.56  rillig     ['1'] = 2,
    167   1.56  rillig     ['2'] = 3, ['3'] = 3, ['4'] = 3, ['5'] = 3, ['6'] = 3, ['7'] = 3,
    168   1.56  rillig     ['8'] = 4, ['9'] = 4,
    169   1.56  rillig     ['A'] = 5, ['a'] = 5, ['C'] = 5, ['c'] = 5, ['D'] = 5, ['d'] = 5,
    170   1.56  rillig     ['B'] = 6, ['b'] = 6,
    171   1.56  rillig     ['E'] = 7, ['e'] = 7,
    172   1.56  rillig     ['F'] = 8, ['f'] = 8,
    173   1.56  rillig     ['L'] = 9,
    174   1.56  rillig     ['l'] = 10,
    175   1.56  rillig     ['P'] = 11, ['p'] = 11,
    176   1.56  rillig     ['U'] = 12, ['u'] = 12,
    177   1.56  rillig     ['X'] = 13, ['x'] = 13,
    178   1.56  rillig     ['+'] = 14, ['-'] = 14,
    179   1.56  rillig     ['.'] = 15,
    180   1.56  rillig };
    181   1.36  rillig 
    182   1.25  rillig static void
    183   1.25  rillig check_size_token(size_t desired_size)
    184   1.25  rillig {
    185   1.58  rillig     if (token.e + desired_size >= token.l)
    186   1.58  rillig 	buf_expand(&token, desired_size);
    187   1.25  rillig }
    188   1.25  rillig 
    189   1.87  rillig static void
    190   1.87  rillig token_add_char(char ch)
    191   1.87  rillig {
    192   1.87  rillig     check_size_token(1);
    193   1.87  rillig     *token.e++ = ch;
    194   1.87  rillig }
    195   1.87  rillig 
    196   1.20  rillig #ifdef debug
    197  1.100  rillig static const char *
    198  1.100  rillig lsym_name(lexer_symbol sym)
    199   1.20  rillig {
    200   1.20  rillig     static const char *const name[] = {
    201  1.100  rillig 	"eof",
    202  1.100  rillig 	"preprocessing",
    203  1.100  rillig 	"newline",
    204  1.100  rillig 	"form_feed",
    205  1.100  rillig 	"comment",
    206  1.100  rillig 	"lparen_or_lbracket",
    207  1.100  rillig 	"rparen_or_rbracket",
    208  1.100  rillig 	"lbrace",
    209  1.100  rillig 	"rbrace",
    210  1.100  rillig 	"period",
    211  1.100  rillig 	"unary_op",
    212  1.100  rillig 	"binary_op",
    213  1.100  rillig 	"postfix_op",
    214  1.100  rillig 	"question",
    215  1.100  rillig 	"colon",
    216  1.100  rillig 	"comma",
    217  1.100  rillig 	"semicolon",
    218  1.100  rillig 	"typedef",
    219  1.100  rillig 	"storage_class",
    220  1.135  rillig 	"type_outside_parentheses",
    221  1.134  rillig 	"type_in_parentheses",
    222  1.100  rillig 	"tag",
    223  1.100  rillig 	"case_label",
    224  1.100  rillig 	"string_prefix",
    225  1.120  rillig 	"sizeof",
    226  1.121  rillig 	"offsetof",
    227  1.134  rillig 	"word",
    228  1.100  rillig 	"funcname",
    229  1.100  rillig 	"do",
    230  1.100  rillig 	"else",
    231  1.100  rillig 	"for",
    232  1.100  rillig 	"if",
    233  1.100  rillig 	"switch",
    234  1.100  rillig 	"while",
    235  1.129  rillig 	"return",
    236   1.20  rillig     };
    237   1.20  rillig 
    238  1.100  rillig     return name[sym];
    239   1.20  rillig }
    240   1.20  rillig 
    241   1.20  rillig static void
    242   1.72  rillig debug_print_buf(const char *name, const struct buffer *buf)
    243   1.20  rillig {
    244   1.72  rillig     if (buf->s < buf->e) {
    245  1.101  rillig 	debug_printf("%s ", name);
    246  1.101  rillig 	debug_vis_range("\"", buf->s, buf->e, "\"\n");
    247   1.20  rillig     }
    248   1.20  rillig }
    249   1.20  rillig 
    250  1.112  rillig #define debug_ps_bool(name) \
    251  1.113  rillig         if (ps.name != prev_ps.name) \
    252  1.113  rillig 	    debug_println("[%c] ps." #name, ps.name ? 'x' : ' ')
    253  1.112  rillig #define debug_ps_int(name) \
    254  1.113  rillig 	if (ps.name != prev_ps.name) \
    255  1.113  rillig 	    debug_println("%3d ps." #name, ps.name)
    256  1.112  rillig 
    257  1.101  rillig static void
    258  1.107  rillig debug_lexi(lexer_symbol lsym)
    259   1.20  rillig {
    260  1.113  rillig     /*
    261  1.113  rillig      * Watch out for 'rolled back parser state' in the debug output; the
    262  1.113  rillig      * differences around these are unreliable.
    263  1.113  rillig      */
    264  1.113  rillig     static struct parser_state prev_ps;
    265  1.113  rillig 
    266  1.104  rillig     debug_println("");
    267  1.134  rillig     debug_printf("line %d: %s", line_no, lsym_name(lsym));
    268  1.116  rillig     debug_vis_range(" \"", token.s, token.e, "\"\n");
    269  1.122  rillig 
    270   1.72  rillig     debug_print_buf("label", &lab);
    271   1.72  rillig     debug_print_buf("code", &code);
    272   1.72  rillig     debug_print_buf("comment", &com);
    273  1.112  rillig 
    274  1.122  rillig     debug_println("    ps.prev_token = %s", lsym_name(ps.prev_token));
    275  1.130  rillig     debug_ps_bool(next_col_1);
    276  1.117  rillig     debug_ps_bool(curr_col_1);
    277  1.112  rillig     debug_ps_bool(next_unary);
    278  1.147  rillig     debug_ps_bool(is_function_definition);
    279  1.112  rillig     debug_ps_bool(want_blank);
    280  1.112  rillig     debug_ps_int(paren_level);
    281  1.112  rillig     debug_ps_int(p_l_follow);
    282  1.131  rillig     if (ps.paren_level != prev_ps.paren_level) {
    283  1.131  rillig 	debug_printf("    ps.paren_indents:");
    284  1.131  rillig 	for (int i = 0; i < ps.paren_level; i++)
    285  1.131  rillig 	    debug_printf(" %d", ps.paren_indents[i]);
    286  1.131  rillig 	debug_println("");
    287  1.131  rillig     }
    288  1.112  rillig     debug_ps_int(cast_mask);
    289  1.112  rillig     debug_ps_int(not_cast_mask);
    290  1.112  rillig 
    291  1.112  rillig     debug_ps_int(comment_delta);
    292  1.112  rillig     debug_ps_int(n_comment_delta);
    293  1.112  rillig     debug_ps_int(com_ind);
    294  1.112  rillig 
    295  1.112  rillig     debug_ps_bool(block_init);
    296  1.112  rillig     debug_ps_int(block_init_level);
    297  1.112  rillig     debug_ps_bool(init_or_struct);
    298  1.112  rillig 
    299  1.112  rillig     debug_ps_int(ind_level);
    300  1.112  rillig     debug_ps_int(ind_level_follow);
    301  1.112  rillig 
    302  1.137  rillig     debug_ps_int(decl_level);
    303  1.112  rillig     debug_ps_bool(decl_on_line);
    304  1.112  rillig     debug_ps_bool(in_decl);
    305  1.112  rillig     debug_ps_int(just_saw_decl);
    306  1.112  rillig     debug_ps_bool(in_parameter_declaration);
    307  1.112  rillig     debug_ps_bool(decl_indent_done);
    308  1.112  rillig 
    309  1.112  rillig     debug_ps_bool(in_stmt);
    310  1.112  rillig     debug_ps_bool(ind_stmt);
    311  1.112  rillig     debug_ps_bool(is_case_label);
    312  1.112  rillig 
    313  1.112  rillig     debug_ps_bool(search_stmt);
    314  1.113  rillig 
    315  1.113  rillig     prev_ps = ps;
    316  1.101  rillig }
    317   1.96  rillig #endif
    318   1.20  rillig 
    319  1.104  rillig /* ARGSUSED */
    320  1.101  rillig static lexer_symbol
    321  1.107  rillig lexi_end(lexer_symbol lsym)
    322  1.101  rillig {
    323  1.101  rillig #ifdef debug
    324  1.107  rillig     debug_lexi(lsym);
    325  1.101  rillig #endif
    326  1.100  rillig     return lsym;
    327   1.20  rillig }
    328   1.20  rillig 
    329   1.43  rillig static void
    330   1.43  rillig lex_number(void)
    331   1.43  rillig {
    332  1.115  rillig     for (unsigned char s = 'A'; s != 'f' && s != 'i' && s != 'u';) {
    333  1.141  rillig 	unsigned char ch = (unsigned char)inp_peek();
    334   1.94  rillig 	if (ch >= array_length(lex_number_row) || lex_number_row[ch] == 0)
    335   1.56  rillig 	    break;
    336   1.75  rillig 
    337  1.115  rillig 	unsigned char row = lex_number_row[ch];
    338   1.82  rillig 	if (lex_number_state[row][s - 'A'] == ' ') {
    339   1.71  rillig 	    /*-
    340   1.82  rillig 	     * lex_number_state[0][s - 'A'] now indicates the type:
    341   1.74  rillig 	     * f = floating, i = integer, u = unknown
    342   1.56  rillig 	     */
    343  1.138  rillig 	    return;
    344   1.43  rillig 	}
    345   1.75  rillig 
    346   1.82  rillig 	s = lex_number_state[row][s - 'A'];
    347  1.133  rillig 	token_add_char(inp_next());
    348   1.43  rillig     }
    349   1.43  rillig }
    350   1.43  rillig 
    351  1.145  rillig static bool
    352  1.146  rillig is_identifier_start(char ch)
    353  1.146  rillig {
    354  1.146  rillig     return isalpha((unsigned char)ch) || ch == '_' || ch == '$';
    355  1.146  rillig }
    356  1.146  rillig 
    357  1.146  rillig static bool
    358  1.145  rillig is_identifier_part(char ch)
    359  1.145  rillig {
    360  1.145  rillig     return isalnum((unsigned char)ch) || ch == '_' || ch == '$';
    361  1.145  rillig }
    362  1.145  rillig 
    363   1.43  rillig static void
    364   1.43  rillig lex_word(void)
    365   1.43  rillig {
    366  1.145  rillig     while (is_identifier_part(inp_peek()) || inp_peek() == '\\' ) {
    367  1.141  rillig 	if (inp_peek() == '\\') {
    368  1.142  rillig 	    if (inp_lookahead(1) == '\n') {
    369  1.142  rillig 		inp_skip();
    370  1.142  rillig 		inp_skip();
    371   1.43  rillig 	    } else
    372   1.43  rillig 		break;
    373   1.43  rillig 	}
    374   1.75  rillig 
    375  1.133  rillig 	token_add_char(inp_next());
    376   1.43  rillig     }
    377   1.43  rillig }
    378   1.43  rillig 
    379   1.43  rillig static void
    380   1.43  rillig lex_char_or_string(void)
    381   1.43  rillig {
    382  1.132  rillig     for (char delim = token.e[-1];;) {
    383  1.141  rillig 	if (inp_peek() == '\n') {
    384   1.52  rillig 	    diag(1, "Unterminated literal");
    385   1.52  rillig 	    return;
    386   1.52  rillig 	}
    387   1.75  rillig 
    388  1.133  rillig 	token_add_char(inp_next());
    389   1.52  rillig 	if (token.e[-1] == delim)
    390   1.52  rillig 	    return;
    391   1.75  rillig 
    392   1.52  rillig 	if (token.e[-1] == '\\') {
    393  1.141  rillig 	    if (inp_peek() == '\n')
    394   1.52  rillig 		++line_no;
    395  1.133  rillig 	    token_add_char(inp_next());
    396   1.52  rillig 	}
    397   1.52  rillig     }
    398   1.43  rillig }
    399   1.43  rillig 
    400   1.84  rillig /* Guess whether the current token is a declared type. */
    401   1.57  rillig static bool
    402  1.107  rillig probably_typename(void)
    403   1.57  rillig {
    404  1.107  rillig     if (ps.block_init || ps.in_stmt)
    405   1.70  rillig 	return false;
    406  1.142  rillig     if (inp_peek() == '*' && inp_lookahead(1) != '=')
    407   1.70  rillig 	goto maybe;
    408  1.145  rillig     /* XXX: is_identifier_start */
    409  1.141  rillig     if (isalpha((unsigned char)inp_peek()))
    410   1.70  rillig 	goto maybe;
    411   1.70  rillig     return false;
    412   1.70  rillig maybe:
    413  1.110  rillig     return ps.prev_token == lsym_semicolon ||
    414  1.110  rillig 	ps.prev_token == lsym_lbrace ||
    415  1.110  rillig 	ps.prev_token == lsym_rbrace;
    416   1.57  rillig }
    417   1.57  rillig 
    418   1.84  rillig static int
    419   1.84  rillig bsearch_typenames(const char *key)
    420   1.84  rillig {
    421   1.84  rillig     const char **arr = typenames.items;
    422   1.84  rillig     int lo = 0;
    423   1.84  rillig     int hi = (int)typenames.len - 1;
    424   1.84  rillig 
    425   1.84  rillig     while (lo <= hi) {
    426   1.84  rillig 	int mid = (int)((unsigned)(lo + hi) >> 1);
    427   1.84  rillig 	int cmp = strcmp(arr[mid], key);
    428   1.84  rillig 	if (cmp < 0)
    429   1.84  rillig 	    lo = mid + 1;
    430   1.84  rillig 	else if (cmp > 0)
    431   1.84  rillig 	    hi = mid - 1;
    432   1.84  rillig 	else
    433   1.84  rillig 	    return mid;
    434   1.84  rillig     }
    435   1.84  rillig     return -(lo + 1);
    436   1.84  rillig }
    437   1.84  rillig 
    438   1.63  rillig static bool
    439   1.63  rillig is_typename(void)
    440   1.63  rillig {
    441   1.84  rillig     if (opt.auto_typedefs &&
    442   1.84  rillig 	token.e - token.s >= 2 && memcmp(token.e - 2, "_t", 2) == 0)
    443   1.84  rillig 	return true;
    444   1.63  rillig 
    445   1.84  rillig     return bsearch_typenames(token.s) >= 0;
    446   1.63  rillig }
    447   1.63  rillig 
    448  1.115  rillig static int
    449  1.115  rillig cmp_keyword_by_name(const void *key, const void *elem)
    450  1.115  rillig {
    451  1.115  rillig     return strcmp(key, ((const struct keyword *)elem)->name);
    452  1.115  rillig }
    453  1.115  rillig 
    454  1.138  rillig /* Read an alphanumeric token into 'token', or return lsym_eof. */
    455  1.100  rillig static lexer_symbol
    456  1.107  rillig lexi_alnum(void)
    457    1.1     cgd {
    458  1.141  rillig     if (isdigit((unsigned char)inp_peek()) ||
    459  1.142  rillig 	    (inp_peek() == '.' && isdigit((unsigned char)inp_lookahead(1)))) {
    460   1.89  rillig 	lex_number();
    461  1.145  rillig     } else if (is_identifier_part(inp_peek())) {
    462   1.89  rillig 	lex_word();
    463  1.102  rillig     } else
    464  1.102  rillig 	return lsym_eof;	/* just as a placeholder */
    465  1.102  rillig 
    466   1.89  rillig     *token.e = '\0';
    467   1.16   kamil 
    468   1.89  rillig     if (token.s[0] == 'L' && token.s[1] == '\0' &&
    469  1.141  rillig 	    (inp_peek() == '"' || inp_peek() == '\''))
    470  1.100  rillig 	return lsym_string_prefix;
    471   1.16   kamil 
    472  1.133  rillig     while (ch_isblank(inp_peek()))
    473  1.133  rillig 	inp_skip();
    474   1.89  rillig 
    475  1.110  rillig     if (ps.prev_token == lsym_tag && ps.p_l_follow == 0) {
    476  1.107  rillig 	ps.next_unary = true;
    477  1.135  rillig 	return lsym_type_outside_parentheses;
    478   1.16   kamil     }
    479    1.6   lukem 
    480   1.89  rillig     /* Operator after identifier is binary unless last token was 'struct'. */
    481  1.110  rillig     ps.next_unary = ps.prev_token == lsym_tag;
    482   1.16   kamil 
    483   1.89  rillig     const struct keyword *kw = bsearch(token.s, keywords,
    484   1.94  rillig 	array_length(keywords), sizeof(keywords[0]), cmp_keyword_by_name);
    485  1.134  rillig     bool is_type = false;
    486   1.89  rillig     if (kw == NULL) {
    487   1.89  rillig 	if (is_typename()) {
    488  1.134  rillig 	    is_type = true;
    489  1.107  rillig 	    ps.next_unary = true;
    490   1.89  rillig 	    goto found_typename;
    491   1.16   kamil 	}
    492   1.89  rillig 
    493   1.89  rillig     } else {			/* we have a keyword */
    494  1.134  rillig 	is_type = kw->lsym == lsym_type;
    495  1.107  rillig 	ps.next_unary = true;
    496  1.127  rillig 	if (kw->lsym != lsym_tag && kw->lsym != lsym_type)
    497  1.125  rillig 	    return kw->lsym;
    498  1.118  rillig 
    499  1.118  rillig found_typename:
    500  1.118  rillig 	if (ps.p_l_follow > 0) {
    501  1.118  rillig 	    /* inside parentheses: cast, param list, offsetof or sizeof */
    502  1.118  rillig 	    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
    503  1.118  rillig 	}
    504  1.118  rillig 	if (ps.prev_token != lsym_period && ps.prev_token != lsym_unary_op) {
    505  1.126  rillig 	    if (kw != NULL && kw->lsym == lsym_tag)
    506  1.100  rillig 		return lsym_tag;
    507  1.118  rillig 	    if (ps.p_l_follow == 0)
    508  1.135  rillig 		return lsym_type_outside_parentheses;
    509   1.90  rillig 	}
    510   1.90  rillig     }
    511   1.89  rillig 
    512  1.141  rillig     if (inp_peek() == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
    513  1.107  rillig 	!ps.in_parameter_declaration && !ps.block_init) {
    514   1.89  rillig 
    515  1.143  rillig 	for (const char *p = inp_p(), *e = inp_line_end(); p < e;)
    516   1.89  rillig 	    if (*p++ == ')' && (*p == ';' || *p == ','))
    517  1.118  rillig 		goto no_function_definition;
    518   1.89  rillig 
    519  1.147  rillig 	ps.is_function_definition = true;
    520  1.107  rillig 	if (ps.in_decl)
    521  1.107  rillig 	    ps.in_parameter_declaration = true;
    522  1.100  rillig 	return lsym_funcname;
    523  1.118  rillig no_function_definition:;
    524   1.89  rillig 
    525  1.136  rillig     } else if (ps.p_l_follow == 0 && probably_typename()) {
    526  1.107  rillig 	ps.next_unary = true;
    527  1.135  rillig 	return lsym_type_outside_parentheses;
    528   1.89  rillig     }
    529   1.89  rillig 
    530  1.134  rillig     return is_type ? lsym_type_in_parentheses : lsym_word;
    531   1.89  rillig }
    532   1.75  rillig 
    533   1.89  rillig /* Reads the next token, placing it in the global variable "token". */
    534  1.100  rillig lexer_symbol
    535  1.106  rillig lexi(void)
    536   1.89  rillig {
    537   1.90  rillig     token.e = token.s;
    538  1.130  rillig     ps.curr_col_1 = ps.next_col_1;
    539  1.130  rillig     ps.next_col_1 = false;
    540   1.75  rillig 
    541  1.141  rillig     while (ch_isblank(inp_peek())) {
    542  1.117  rillig 	ps.curr_col_1 = false;
    543  1.133  rillig 	inp_skip();
    544   1.89  rillig     }
    545   1.75  rillig 
    546  1.107  rillig     lexer_symbol alnum_lsym = lexi_alnum();
    547  1.100  rillig     if (alnum_lsym != lsym_eof)
    548  1.107  rillig 	return lexi_end(alnum_lsym);
    549   1.16   kamil 
    550   1.16   kamil     /* Scan a non-alphanumeric token */
    551   1.16   kamil 
    552   1.90  rillig     check_size_token(3);	/* for things like "<<=" */
    553  1.133  rillig     *token.e++ = inp_next();
    554   1.50  rillig     *token.e = '\0';
    555   1.16   kamil 
    556  1.100  rillig     lexer_symbol lsym;
    557   1.89  rillig     bool unary_delim = false;	/* whether the current token forces a
    558   1.89  rillig 				 * following operator to be unary */
    559   1.89  rillig 
    560  1.132  rillig     switch (token.e[-1]) {
    561   1.16   kamil     case '\n':
    562  1.107  rillig 	unary_delim = ps.next_unary;
    563  1.130  rillig 	ps.next_col_1 = true;
    564   1.47  rillig 	/* if data has been exhausted, the newline is a dummy. */
    565  1.100  rillig 	lsym = had_eof ? lsym_eof : lsym_newline;
    566   1.16   kamil 	break;
    567   1.16   kamil 
    568   1.43  rillig     case '\'':
    569   1.43  rillig     case '"':
    570   1.44  rillig 	lex_char_or_string();
    571  1.134  rillig 	lsym = lsym_word;
    572   1.16   kamil 	break;
    573    1.6   lukem 
    574   1.40  rillig     case '(':
    575   1.40  rillig     case '[':
    576   1.16   kamil 	unary_delim = true;
    577  1.100  rillig 	lsym = lsym_lparen_or_lbracket;
    578   1.16   kamil 	break;
    579   1.16   kamil 
    580   1.40  rillig     case ')':
    581   1.40  rillig     case ']':
    582  1.100  rillig 	lsym = lsym_rparen_or_rbracket;
    583   1.16   kamil 	break;
    584   1.16   kamil 
    585   1.16   kamil     case '#':
    586  1.107  rillig 	unary_delim = ps.next_unary;
    587  1.100  rillig 	lsym = lsym_preprocessing;
    588   1.16   kamil 	break;
    589   1.16   kamil 
    590   1.16   kamil     case '?':
    591   1.16   kamil 	unary_delim = true;
    592  1.100  rillig 	lsym = lsym_question;
    593   1.16   kamil 	break;
    594   1.16   kamil 
    595   1.40  rillig     case ':':
    596  1.100  rillig 	lsym = lsym_colon;
    597   1.16   kamil 	unary_delim = true;
    598   1.16   kamil 	break;
    599   1.16   kamil 
    600   1.40  rillig     case ';':
    601   1.16   kamil 	unary_delim = true;
    602  1.100  rillig 	lsym = lsym_semicolon;
    603   1.16   kamil 	break;
    604   1.16   kamil 
    605   1.40  rillig     case '{':
    606   1.16   kamil 	unary_delim = true;
    607  1.100  rillig 	lsym = lsym_lbrace;
    608   1.16   kamil 	break;
    609   1.16   kamil 
    610   1.40  rillig     case '}':
    611   1.16   kamil 	unary_delim = true;
    612  1.100  rillig 	lsym = lsym_rbrace;
    613   1.16   kamil 	break;
    614   1.16   kamil 
    615   1.69  rillig     case '\f':
    616  1.107  rillig 	unary_delim = ps.next_unary;
    617  1.130  rillig 	ps.next_col_1 = true;
    618  1.100  rillig 	lsym = lsym_form_feed;
    619   1.16   kamil 	break;
    620   1.16   kamil 
    621   1.40  rillig     case ',':
    622   1.16   kamil 	unary_delim = true;
    623  1.100  rillig 	lsym = lsym_comma;
    624   1.16   kamil 	break;
    625   1.16   kamil 
    626   1.16   kamil     case '.':
    627   1.16   kamil 	unary_delim = false;
    628  1.100  rillig 	lsym = lsym_period;
    629   1.16   kamil 	break;
    630    1.1     cgd 
    631   1.16   kamil     case '-':
    632   1.90  rillig     case '+':
    633  1.107  rillig 	lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
    634   1.16   kamil 	unary_delim = true;
    635   1.16   kamil 
    636  1.141  rillig 	if (inp_peek() == token.e[-1]) {	/* ++, -- */
    637  1.141  rillig 	    *token.e++ = inp_next();
    638  1.134  rillig 	    if (ps.prev_token == lsym_word ||
    639  1.110  rillig 		    ps.prev_token == lsym_rparen_or_rbracket) {
    640  1.107  rillig 		lsym = ps.next_unary ? lsym_unary_op : lsym_postfix_op;
    641    1.1     cgd 		unary_delim = false;
    642   1.16   kamil 	    }
    643   1.75  rillig 
    644  1.141  rillig 	} else if (inp_peek() == '=') {	/* += */
    645  1.141  rillig 	    *token.e++ = inp_next();
    646   1.75  rillig 
    647  1.141  rillig 	} else if (inp_peek() == '>') {	/* -> */
    648  1.141  rillig 	    *token.e++ = inp_next();
    649   1.16   kamil 	    unary_delim = false;
    650  1.100  rillig 	    lsym = lsym_unary_op;
    651  1.107  rillig 	    ps.want_blank = false;
    652   1.16   kamil 	}
    653   1.90  rillig 	break;
    654   1.16   kamil 
    655   1.16   kamil     case '=':
    656  1.107  rillig 	if (ps.init_or_struct)
    657  1.107  rillig 	    ps.block_init = true;
    658  1.141  rillig 	if (inp_peek() == '=') {	/* == */
    659  1.141  rillig 	    *token.e++ = inp_next();
    660   1.67  rillig 	    *token.e = '\0';
    661   1.16   kamil 	}
    662  1.100  rillig 	lsym = lsym_binary_op;
    663   1.16   kamil 	unary_delim = true;
    664   1.16   kamil 	break;
    665   1.16   kamil 
    666   1.16   kamil     case '>':
    667   1.16   kamil     case '<':
    668   1.16   kamil     case '!':			/* ops like <, <<, <=, !=, etc */
    669  1.141  rillig 	if (inp_peek() == '>' || inp_peek() == '<' || inp_peek() == '=')
    670  1.141  rillig 	    *token.e++ = inp_next();
    671  1.141  rillig 	if (inp_peek() == '=')
    672  1.133  rillig 	    *token.e++ = inp_next();
    673  1.107  rillig 	lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
    674   1.16   kamil 	unary_delim = true;
    675   1.16   kamil 	break;
    676   1.16   kamil 
    677   1.16   kamil     case '*':
    678   1.16   kamil 	unary_delim = true;
    679  1.107  rillig 	if (!ps.next_unary) {
    680  1.141  rillig 	    if (inp_peek() == '=')
    681  1.141  rillig 		*token.e++ = inp_next();
    682  1.100  rillig 	    lsym = lsym_binary_op;
    683   1.16   kamil 	    break;
    684   1.16   kamil 	}
    685   1.75  rillig 
    686  1.141  rillig 	while (inp_peek() == '*' || isspace((unsigned char)inp_peek())) {
    687  1.141  rillig 	    if (inp_peek() == '*')
    688   1.87  rillig 		token_add_char('*');
    689  1.133  rillig 	    inp_skip();
    690   1.16   kamil 	}
    691   1.75  rillig 
    692   1.16   kamil 	if (ps.in_decl) {
    693  1.144  rillig 	    const char *tp = inp_p(), *e = inp_line_end();
    694    1.6   lukem 
    695  1.146  rillig 	    while (tp < e) {
    696  1.146  rillig 		if (isspace((unsigned char)*tp))
    697  1.146  rillig 		    tp++;
    698  1.146  rillig 		else if (is_identifier_start(*tp)) {
    699  1.146  rillig 		    tp++;
    700  1.146  rillig 		    while (tp < e && is_identifier_part(*tp))
    701  1.146  rillig 			tp++;
    702  1.146  rillig 		} else
    703  1.146  rillig 		    break;
    704  1.146  rillig 	    }
    705  1.144  rillig 
    706  1.144  rillig 	    if (tp < e && *tp == '(')
    707  1.147  rillig 		ps.is_function_definition = true;
    708   1.16   kamil 	}
    709   1.75  rillig 
    710  1.100  rillig 	lsym = lsym_unary_op;
    711   1.16   kamil 	break;
    712    1.1     cgd 
    713   1.16   kamil     default:
    714  1.141  rillig 	if (token.e[-1] == '/' && (inp_peek() == '*' || inp_peek() == '/')) {
    715  1.133  rillig 	    *token.e++ = inp_next();
    716  1.100  rillig 	    lsym = lsym_comment;
    717  1.107  rillig 	    unary_delim = ps.next_unary;
    718   1.16   kamil 	    break;
    719    1.1     cgd 	}
    720   1.75  rillig 
    721  1.132  rillig 	/* handle '||', '&&', etc., and also things as in 'int *****i' */
    722  1.141  rillig 	while (token.e[-1] == inp_peek() || inp_peek() == '=')
    723  1.133  rillig 	    token_add_char(inp_next());
    724   1.75  rillig 
    725  1.107  rillig 	lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
    726   1.16   kamil 	unary_delim = true;
    727   1.47  rillig     }
    728   1.16   kamil 
    729  1.107  rillig     ps.next_unary = unary_delim;
    730   1.75  rillig 
    731   1.25  rillig     check_size_token(1);
    732   1.50  rillig     *token.e = '\0';
    733   1.75  rillig 
    734  1.107  rillig     return lexi_end(lsym);
    735    1.1     cgd }
    736   1.16   kamil 
    737    1.6   lukem void
    738  1.128  rillig register_typename(const char *name)
    739    1.1     cgd {
    740   1.64  rillig     if (typenames.len >= typenames.cap) {
    741   1.64  rillig 	typenames.cap = 16 + 2 * typenames.cap;
    742   1.64  rillig 	typenames.items = xrealloc(typenames.items,
    743   1.64  rillig 	    sizeof(typenames.items[0]) * typenames.cap);
    744   1.64  rillig     }
    745   1.16   kamil 
    746   1.84  rillig     int pos = bsearch_typenames(name);
    747   1.64  rillig     if (pos >= 0)
    748   1.64  rillig 	return;			/* already in the list */
    749   1.75  rillig 
    750   1.64  rillig     pos = -(pos + 1);
    751   1.64  rillig     memmove(typenames.items + pos + 1, typenames.items + pos,
    752   1.73  rillig 	sizeof(typenames.items[0]) * (typenames.len++ - (unsigned)pos));
    753   1.64  rillig     typenames.items[pos] = xstrdup(name);
    754    1.1     cgd }
    755