Home | History | Annotate | Line # | Download | only in indent
lexi.c revision 1.172
      1  1.172  rillig /*	$NetBSD: lexi.c,v 1.172 2022/02/13 12:43:26 rillig Exp $	*/
      2    1.3     tls 
      3   1.16   kamil /*-
      4   1.16   kamil  * SPDX-License-Identifier: BSD-4-Clause
      5   1.16   kamil  *
      6   1.16   kamil  * Copyright (c) 1985 Sun Microsystems, Inc.
      7    1.5     mrg  * Copyright (c) 1980, 1993
      8    1.5     mrg  *	The Regents of the University of California.  All rights reserved.
      9    1.1     cgd  * All rights reserved.
     10    1.1     cgd  *
     11    1.1     cgd  * Redistribution and use in source and binary forms, with or without
     12    1.1     cgd  * modification, are permitted provided that the following conditions
     13    1.1     cgd  * are met:
     14    1.1     cgd  * 1. Redistributions of source code must retain the above copyright
     15    1.1     cgd  *    notice, this list of conditions and the following disclaimer.
     16    1.1     cgd  * 2. Redistributions in binary form must reproduce the above copyright
     17    1.1     cgd  *    notice, this list of conditions and the following disclaimer in the
     18    1.1     cgd  *    documentation and/or other materials provided with the distribution.
     19    1.1     cgd  * 3. All advertising materials mentioning features or use of this software
     20    1.1     cgd  *    must display the following acknowledgement:
     21    1.1     cgd  *	This product includes software developed by the University of
     22    1.1     cgd  *	California, Berkeley and its contributors.
     23    1.1     cgd  * 4. Neither the name of the University nor the names of its contributors
     24    1.1     cgd  *    may be used to endorse or promote products derived from this software
     25    1.1     cgd  *    without specific prior written permission.
     26    1.1     cgd  *
     27    1.1     cgd  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     28    1.1     cgd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29    1.1     cgd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30    1.1     cgd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     31    1.1     cgd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     32    1.1     cgd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     33    1.1     cgd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     34    1.1     cgd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     35    1.1     cgd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     36    1.1     cgd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     37    1.1     cgd  * SUCH DAMAGE.
     38    1.1     cgd  */
     39    1.1     cgd 
     40   1.16   kamil #if 0
     41   1.16   kamil static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
     42   1.16   kamil #endif
     43   1.16   kamil 
     44    1.6   lukem #include <sys/cdefs.h>
     45   1.16   kamil #if defined(__NetBSD__)
     46  1.172  rillig __RCSID("$NetBSD: lexi.c,v 1.172 2022/02/13 12:43:26 rillig Exp $");
     47   1.16   kamil #elif defined(__FreeBSD__)
     48   1.16   kamil __FBSDID("$FreeBSD: head/usr.bin/indent/lexi.c 337862 2018-08-15 18:19:45Z pstef $");
     49   1.16   kamil #endif
     50    1.1     cgd 
     51    1.1     cgd #include <stdlib.h>
     52    1.1     cgd #include <string.h>
     53   1.16   kamil 
     54   1.16   kamil #include "indent.h"
     55    1.1     cgd 
     56  1.127  rillig /*
     57  1.127  rillig  * While inside lexi_alnum, this constant just marks a type, independently of
     58  1.127  rillig  * the parentheses level.
     59  1.127  rillig  */
     60  1.135  rillig #define lsym_type lsym_type_outside_parentheses
     61  1.127  rillig 
/*
 * Reserved words and the lexer symbol each one maps to.
 *
 * lexi_alnum looks a word up in this table with bsearch(), comparing names
 * via strcmp() (see cmp_keyword_by_name), so the entries have to be in
 * strcmp() order; in ASCII that puts the '_'-prefixed names before the
 * lowercase ones.
 *
 * Entries marked lsym_type are only a placeholder: lexi_alnum later returns
 * either lsym_type_outside_parentheses or lsym_type_in_parentheses for them.
 */
     62   1.60  rillig /* must be sorted alphabetically, is used in binary search */
     63   1.62  rillig static const struct keyword {
     64   1.62  rillig     const char *name;
     65  1.125  rillig     lexer_symbol lsym;
     66   1.62  rillig } keywords[] = {
     67  1.127  rillig     {"_Bool", lsym_type},
     68  1.127  rillig     {"_Complex", lsym_type},
     69  1.127  rillig     {"_Imaginary", lsym_type},
     70  1.127  rillig     {"auto", lsym_storage_class},
     71  1.127  rillig     {"bool", lsym_type},
     72  1.134  rillig     {"break", lsym_word},
     73  1.127  rillig     {"case", lsym_case_label},
     74  1.127  rillig     {"char", lsym_type},
     75  1.127  rillig     {"complex", lsym_type},
     76  1.127  rillig     {"const", lsym_type},
     77  1.134  rillig     {"continue", lsym_word},
     78  1.127  rillig     {"default", lsym_case_label},
     79  1.127  rillig     {"do", lsym_do},
     80  1.127  rillig     {"double", lsym_type},
     81  1.127  rillig     {"else", lsym_else},
     82  1.127  rillig     {"enum", lsym_tag},
     83  1.127  rillig     {"extern", lsym_storage_class},
     84  1.127  rillig     {"float", lsym_type},
     85  1.127  rillig     {"for", lsym_for},
     86  1.134  rillig     {"goto", lsym_word},
     87  1.127  rillig     {"if", lsym_if},
     88  1.127  rillig     {"imaginary", lsym_type},
     89  1.134  rillig     {"inline", lsym_word},
     90  1.127  rillig     {"int", lsym_type},
     91  1.127  rillig     {"long", lsym_type},
     92  1.127  rillig     {"offsetof", lsym_offsetof},
     93  1.127  rillig     {"register", lsym_storage_class},
     94  1.134  rillig     {"restrict", lsym_word},
     95  1.129  rillig     {"return", lsym_return},
     96  1.127  rillig     {"short", lsym_type},
     97  1.127  rillig     {"signed", lsym_type},
     98  1.127  rillig     {"sizeof", lsym_sizeof},
     99  1.127  rillig     {"static", lsym_storage_class},
    100  1.127  rillig     {"struct", lsym_tag},
    101  1.127  rillig     {"switch", lsym_switch},
    102  1.127  rillig     {"typedef", lsym_typedef},
    103  1.127  rillig     {"union", lsym_tag},
    104  1.127  rillig     {"unsigned", lsym_type},
    105  1.127  rillig     {"void", lsym_type},
    106  1.127  rillig     {"volatile", lsym_type},
    107  1.127  rillig     {"while", lsym_while}
    108    1.1     cgd };
    109    1.1     cgd 
/*
 * Additional names that is_typename treats as types.
 *
 * bsearch_typenames searches items[0 .. len-1] with a strcmp()-based binary
 * search, so those entries have to be kept in strcmp() order.
 * NOTE(review): 'cap' is presumably the allocated capacity of 'items'; the
 * code that grows the list is not in this part of the file -- confirm.
 */
    110   1.84  rillig static struct {
    111   1.64  rillig     const char **items;
    112   1.64  rillig     unsigned int len;
    113   1.64  rillig     unsigned int cap;
    114   1.64  rillig } typenames;
    115   1.16   kamil 
    116   1.16   kamil /*
    117   1.16   kamil  * The transition table below was rewritten by hand from lx's output, given
    118   1.16   kamil  * the following definitions. lx is Katherine Flavel's lexer generator.
    119   1.16   kamil  *
    120   1.16   kamil  * O  = /[0-7]/;        D  = /[0-9]/;          NZ = /[1-9]/;
    121   1.16   kamil  * H  = /[a-f0-9]/i;    B  = /[0-1]/;          HP = /0x/i;
    122   1.16   kamil  * BP = /0b/i;          E  = /e[+\-]?/i D+;    P  = /p[+\-]?/i D+;
    123   1.16   kamil  * FS = /[fl]/i;        IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?;
    124   1.16   kamil  *
    125   1.16   kamil  * D+           E  FS? -> $float;
    126   1.16   kamil  * D*    "." D+ E? FS? -> $float;
    127   1.16   kamil  * D+    "."    E? FS? -> $float;    HP H+           IS? -> $int;
    128   1.16   kamil  * HP H+        P  FS? -> $float;    NZ D*           IS? -> $int;
    129   1.16   kamil  * HP H* "." H+ P  FS? -> $float;    "0" O*          IS? -> $int;
    130   1.16   kamil  * HP H+ "."    P  FS  -> $float;    BP B+           IS? -> $int;
    131   1.16   kamil  */
/*
 * How lex_number reads the table:
 *
 *  - A state is a letter; the scan starts in state 'A'.  The column of a
 *    state is 'state - 'A'', hence 26 columns for the states 'A' to 'Z'.
 *  - The row is selected by the input character via lex_number_row.
 *  - An entry holds the next state.  A blank entry means that the character
 *    cannot continue the number in this state, so the token ends there.
 *  - The lowercase entries 'f' and 'i' in rows 1 to 15 are final states;
 *    reaching one of them ends the scan after that character was consumed.
 *  - Row 0 is never selected by an input character; for each state it
 *    records the kind of number read so far: f = floating, i = integer,
 *    u = unknown.
 *
 * Each row is exactly 26 characters long, so the string literals fill the
 * array elements completely, without a terminating '\0'.
 */
    132   1.71  rillig /* INDENT OFF */
    133   1.82  rillig static const unsigned char lex_number_state[][26] = {
    134   1.16   kamil     /*                examples:
    135   1.16   kamil                                      00
    136   1.16   kamil              s                      0xx
    137   1.16   kamil              t                    00xaa
    138   1.16   kamil              a     11       101100xxa..
    139   1.16   kamil              r   11ee0001101lbuuxx.a.pp
    140   1.16   kamil              t.01.e+008bLuxll0Ll.aa.p+0
    141   1.16   kamil     states:  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
    142   1.83  rillig     [0] =   "uuiifuufiuuiiuiiiiiuiuuuuu",	/* (other) */
    143   1.83  rillig     [1] =   "CEIDEHHHIJQ  U  Q  VUVVZZZ",	/* 0 */
    144   1.83  rillig     [2] =   "DEIDEHHHIJQ  U  Q  VUVVZZZ",	/* 1 */
    145   1.83  rillig     [3] =   "DEIDEHHHIJ   U     VUVVZZZ",	/* 2 3 4 5 6 7 */
    146   1.83  rillig     [4] =   "DEJDEHHHJJ   U     VUVVZZZ",	/* 8 9 */
    147   1.83  rillig     [5] =   "             U     VUVV   ",	/* A a C c D d */
    148   1.83  rillig     [6] =   "  K          U     VUVV   ",	/* B b */
    149   1.83  rillig     [7] =   "  FFF   FF   U     VUVV   ",	/* E e */
    150   1.83  rillig     [8] =   "    f  f     U     VUVV  f",	/* F f */
    151   1.83  rillig     [9] =   "  LLf  fL  PR   Li  L    f",	/* L */
    152   1.83  rillig     [10] =  "  OOf  fO   S P O i O    f",	/* l */
    153   1.83  rillig     [11] =  "                    FFX   ",	/* P p */
    154   1.83  rillig     [12] =  "  MM    M  i  iiM   M     ",	/* U u */
    155   1.83  rillig     [13] =  "  N                       ",	/* X x */
    156   1.83  rillig     [14] =  "     G                 Y  ",	/* + - */
    157   1.83  rillig     [15] =  "B EE    EE   T      W     ",	/* . */
    158   1.16   kamil     /*       ABCDEFGHIJKLMNOPQRSTUVWXYZ */
    159    1.1     cgd };
    160   1.71  rillig /* INDENT ON */
    161    1.1     cgd 
/*
 * Maps an input character to its row in lex_number_state.
 *
 * Characters that are not listed get the value 0, which lex_number takes as
 * "cannot be part of a number".  The array only extends up to the largest
 * designated character, which is why lex_number compares the character
 * against the array length before indexing.
 */
    162  1.115  rillig static const unsigned char lex_number_row[] = {
    163   1.56  rillig     ['0'] = 1,
    164   1.56  rillig     ['1'] = 2,
    165   1.56  rillig     ['2'] = 3, ['3'] = 3, ['4'] = 3, ['5'] = 3, ['6'] = 3, ['7'] = 3,
    166   1.56  rillig     ['8'] = 4, ['9'] = 4,
    167   1.56  rillig     ['A'] = 5, ['a'] = 5, ['C'] = 5, ['c'] = 5, ['D'] = 5, ['d'] = 5,
    168   1.56  rillig     ['B'] = 6, ['b'] = 6,
    169   1.56  rillig     ['E'] = 7, ['e'] = 7,
    170   1.56  rillig     ['F'] = 8, ['f'] = 8,
    171   1.56  rillig     ['L'] = 9,
    172   1.56  rillig     ['l'] = 10,
    173   1.56  rillig     ['P'] = 11, ['p'] = 11,
    174   1.56  rillig     ['U'] = 12, ['u'] = 12,
    175   1.56  rillig     ['X'] = 13, ['x'] = 13,
    176   1.56  rillig     ['+'] = 14, ['-'] = 14,
    177   1.56  rillig     ['.'] = 15,
    178   1.56  rillig };
    179   1.36  rillig 
    180   1.25  rillig static void
    181   1.25  rillig check_size_token(size_t desired_size)
    182   1.25  rillig {
    183   1.58  rillig     if (token.e + desired_size >= token.l)
    184   1.58  rillig 	buf_expand(&token, desired_size);
    185   1.25  rillig }
    186   1.25  rillig 
    187   1.87  rillig static void
    188   1.87  rillig token_add_char(char ch)
    189   1.87  rillig {
    190   1.87  rillig     check_size_token(1);
    191   1.87  rillig     *token.e++ = ch;
    192   1.87  rillig }
    193   1.87  rillig 
    194   1.20  rillig #ifdef debug
    195  1.100  rillig static const char *
    196  1.100  rillig lsym_name(lexer_symbol sym)
    197   1.20  rillig {
    198   1.20  rillig     static const char *const name[] = {
    199  1.100  rillig 	"eof",
    200  1.100  rillig 	"preprocessing",
    201  1.100  rillig 	"newline",
    202  1.100  rillig 	"form_feed",
    203  1.100  rillig 	"comment",
    204  1.100  rillig 	"lparen_or_lbracket",
    205  1.100  rillig 	"rparen_or_rbracket",
    206  1.100  rillig 	"lbrace",
    207  1.100  rillig 	"rbrace",
    208  1.100  rillig 	"period",
    209  1.100  rillig 	"unary_op",
    210  1.100  rillig 	"binary_op",
    211  1.100  rillig 	"postfix_op",
    212  1.100  rillig 	"question",
    213  1.100  rillig 	"colon",
    214  1.100  rillig 	"comma",
    215  1.100  rillig 	"semicolon",
    216  1.100  rillig 	"typedef",
    217  1.100  rillig 	"storage_class",
    218  1.135  rillig 	"type_outside_parentheses",
    219  1.134  rillig 	"type_in_parentheses",
    220  1.100  rillig 	"tag",
    221  1.100  rillig 	"case_label",
    222  1.120  rillig 	"sizeof",
    223  1.121  rillig 	"offsetof",
    224  1.134  rillig 	"word",
    225  1.100  rillig 	"funcname",
    226  1.100  rillig 	"do",
    227  1.100  rillig 	"else",
    228  1.100  rillig 	"for",
    229  1.100  rillig 	"if",
    230  1.100  rillig 	"switch",
    231  1.100  rillig 	"while",
    232  1.129  rillig 	"return",
    233   1.20  rillig     };
    234   1.20  rillig 
    235  1.100  rillig     return name[sym];
    236   1.20  rillig }
    237   1.20  rillig 
    238   1.20  rillig static void
    239   1.72  rillig debug_print_buf(const char *name, const struct buffer *buf)
    240   1.20  rillig {
    241   1.72  rillig     if (buf->s < buf->e) {
    242  1.101  rillig 	debug_printf("%s ", name);
    243  1.101  rillig 	debug_vis_range("\"", buf->s, buf->e, "\"\n");
    244   1.20  rillig     }
    245   1.20  rillig }
    246   1.20  rillig 
/*
 * Tell whether the debug output lists every parser state field (true) or
 * only those that changed since the previous token (false).
 */
static bool
debug_full_parser_state(void)
{
    const bool print_unchanged_fields = true;

    return print_unchanged_fields;
}
    252  1.168  rillig 
/*
 * Print one field of the parser state 'ps' to the debug log.
 *
 * If the field differs from the copy in 'prev_ps' (a variable that must be
 * in scope at the point of use), the old and the new value are printed;
 * otherwise the current value is printed only if debug_full_parser_state()
 * says so.
 *
 * The bodies are wrapped in 'do { ... } while (0)' so that each invocation
 * is a single statement: it needs its terminating semicolon and cannot
 * capture an 'else' that follows it.
 */
#define debug_ps_bool(name) \
	do { \
	    if (ps.name != prev_ps.name) \
		debug_println("[%c] -> [%c] ps." #name, \
		    prev_ps.name ? 'x' : ' ', ps.name ? 'x' : ' '); \
	    else if (debug_full_parser_state()) \
		debug_println("       [%c] ps." #name, \
		    ps.name ? 'x' : ' '); \
	} while (0)
#define debug_ps_int(name) \
	do { \
	    if (ps.name != prev_ps.name) \
		debug_println("%3d -> %3d ps." #name, \
		    prev_ps.name, ps.name); \
	    else if (debug_full_parser_state()) \
		debug_println("       %3d ps." #name, ps.name); \
	} while (0)
    264  1.112  rillig 
    265  1.171  rillig static bool
    266  1.171  rillig ps_paren_has_changed(const struct parser_state *prev_ps)
    267  1.171  rillig {
    268  1.171  rillig     const paren_level_props *prev = prev_ps->paren, *curr = ps.paren;
    269  1.171  rillig 
    270  1.172  rillig     if (prev_ps->nparen != ps.nparen)
    271  1.171  rillig 	return true;
    272  1.171  rillig 
    273  1.172  rillig     for (int i = 0; i < ps.nparen; i++) {
    274  1.171  rillig 	if (curr[i].indent != prev[i].indent ||
    275  1.171  rillig 	    curr[i].maybe_cast != prev[i].maybe_cast ||
    276  1.171  rillig 	    curr[i].no_cast != prev[i].no_cast)
    277  1.171  rillig 	    return true;
    278  1.171  rillig     }
    279  1.171  rillig     return false;
    280  1.171  rillig }
    281  1.171  rillig 
    282  1.171  rillig static void
    283  1.171  rillig debug_ps_paren(const struct parser_state *prev_ps)
    284  1.171  rillig {
    285  1.171  rillig     if (!debug_full_parser_state() && !ps_paren_has_changed(prev_ps))
    286  1.171  rillig 	return;
    287  1.171  rillig 
    288  1.171  rillig     debug_printf("           ps.paren:");
    289  1.172  rillig     for (int i = 0; i < ps.nparen; i++) {
    290  1.171  rillig 	const paren_level_props *props = ps.paren + i;
    291  1.171  rillig 	const char *cast = props->no_cast ? "(no cast)"
    292  1.171  rillig 	    : props->maybe_cast ? "(cast)"
    293  1.171  rillig 	    : "";
    294  1.171  rillig 	debug_printf(" %s%d", cast, props->indent);
    295  1.171  rillig     }
    296  1.172  rillig     if (ps.nparen == 0)
    297  1.171  rillig 	debug_printf(" none");
    298  1.171  rillig     debug_println("");
    299  1.171  rillig }
    300  1.171  rillig 
    301  1.101  rillig static void
    302  1.107  rillig debug_lexi(lexer_symbol lsym)
    303   1.20  rillig {
    304  1.113  rillig     /*
    305  1.113  rillig      * Watch out for 'rolled back parser state' in the debug output; the
    306  1.113  rillig      * differences around these are unreliable.
    307  1.113  rillig      */
    308  1.113  rillig     static struct parser_state prev_ps;
    309  1.113  rillig 
    310  1.104  rillig     debug_println("");
    311  1.134  rillig     debug_printf("line %d: %s", line_no, lsym_name(lsym));
    312  1.116  rillig     debug_vis_range(" \"", token.s, token.e, "\"\n");
    313  1.122  rillig 
    314   1.72  rillig     debug_print_buf("label", &lab);
    315   1.72  rillig     debug_print_buf("code", &code);
    316   1.72  rillig     debug_print_buf("comment", &com);
    317  1.112  rillig 
    318  1.168  rillig     debug_println("           ps.prev_token = %s", lsym_name(ps.prev_token));
    319  1.130  rillig     debug_ps_bool(next_col_1);
    320  1.117  rillig     debug_ps_bool(curr_col_1);
    321  1.112  rillig     debug_ps_bool(next_unary);
    322  1.147  rillig     debug_ps_bool(is_function_definition);
    323  1.112  rillig     debug_ps_bool(want_blank);
    324  1.172  rillig     debug_ps_int(line_start_nparen);
    325  1.172  rillig     debug_ps_int(nparen);
    326  1.171  rillig     debug_ps_paren(&prev_ps);
    327  1.112  rillig 
    328  1.112  rillig     debug_ps_int(comment_delta);
    329  1.112  rillig     debug_ps_int(n_comment_delta);
    330  1.112  rillig     debug_ps_int(com_ind);
    331  1.112  rillig 
    332  1.112  rillig     debug_ps_bool(block_init);
    333  1.112  rillig     debug_ps_int(block_init_level);
    334  1.112  rillig     debug_ps_bool(init_or_struct);
    335  1.112  rillig 
    336  1.112  rillig     debug_ps_int(ind_level);
    337  1.112  rillig     debug_ps_int(ind_level_follow);
    338  1.112  rillig 
    339  1.137  rillig     debug_ps_int(decl_level);
    340  1.112  rillig     debug_ps_bool(decl_on_line);
    341  1.112  rillig     debug_ps_bool(in_decl);
    342  1.112  rillig     debug_ps_int(just_saw_decl);
    343  1.164  rillig     debug_ps_bool(in_func_def_params);
    344  1.112  rillig     debug_ps_bool(decl_indent_done);
    345  1.112  rillig 
    346  1.152  rillig     debug_ps_bool(in_stmt_or_decl);
    347  1.151  rillig     debug_ps_bool(in_stmt_cont);
    348  1.112  rillig     debug_ps_bool(is_case_label);
    349  1.112  rillig 
    350  1.112  rillig     debug_ps_bool(search_stmt);
    351  1.113  rillig 
    352  1.113  rillig     prev_ps = ps;
    353  1.101  rillig }
    354   1.96  rillig #endif
    355   1.20  rillig 
    356  1.101  rillig static lexer_symbol
    357  1.107  rillig lexi_end(lexer_symbol lsym)
    358  1.101  rillig {
    359  1.101  rillig #ifdef debug
    360  1.107  rillig     debug_lexi(lsym);
    361  1.101  rillig #endif
    362  1.100  rillig     return lsym;
    363   1.20  rillig }
    364   1.20  rillig 
    365   1.43  rillig static void
    366   1.43  rillig lex_number(void)
    367   1.43  rillig {
    368  1.115  rillig     for (unsigned char s = 'A'; s != 'f' && s != 'i' && s != 'u';) {
    369  1.141  rillig 	unsigned char ch = (unsigned char)inp_peek();
    370   1.94  rillig 	if (ch >= array_length(lex_number_row) || lex_number_row[ch] == 0)
    371   1.56  rillig 	    break;
    372   1.75  rillig 
    373  1.115  rillig 	unsigned char row = lex_number_row[ch];
    374   1.82  rillig 	if (lex_number_state[row][s - 'A'] == ' ') {
    375   1.71  rillig 	    /*-
    376   1.82  rillig 	     * lex_number_state[0][s - 'A'] now indicates the type:
    377   1.74  rillig 	     * f = floating, i = integer, u = unknown
    378   1.56  rillig 	     */
    379  1.138  rillig 	    return;
    380   1.43  rillig 	}
    381   1.75  rillig 
    382   1.82  rillig 	s = lex_number_state[row][s - 'A'];
    383  1.133  rillig 	token_add_char(inp_next());
    384   1.43  rillig     }
    385   1.43  rillig }
    386   1.43  rillig 
/* Whether 'ch' can start an identifier: a letter, '_' or '$'. */
static bool
is_identifier_start(char ch)
{
    if (ch_isalpha(ch))
	return true;
    return ch == '_' || ch == '$';
}
    392  1.146  rillig 
/* Whether 'ch' can continue an identifier: a letter, a digit, '_' or '$'. */
static bool
is_identifier_part(char ch)
{
    if (ch_isalnum(ch))
	return true;
    return ch == '_' || ch == '$';
}
    398  1.145  rillig 
/*
 * Append the remaining characters of an identifier to the token.  A
 * backslash-newline inside the identifier is skipped without being added to
 * the token.
 */
static void
lex_word(void)
{
    for (;;) {
	if (is_identifier_part(inp_peek())) {
	    token_add_char(inp_next());
	    continue;
	}

	if (inp_peek() != '\\' || inp_lookahead(1) != '\n')
	    break;

	inp_skip();		/* the backslash */
	inp_skip();		/* the newline */
    }
}
    412   1.43  rillig 
    413   1.43  rillig static void
    414   1.43  rillig lex_char_or_string(void)
    415   1.43  rillig {
    416  1.132  rillig     for (char delim = token.e[-1];;) {
    417  1.141  rillig 	if (inp_peek() == '\n') {
    418   1.52  rillig 	    diag(1, "Unterminated literal");
    419   1.52  rillig 	    return;
    420   1.52  rillig 	}
    421   1.75  rillig 
    422  1.133  rillig 	token_add_char(inp_next());
    423   1.52  rillig 	if (token.e[-1] == delim)
    424   1.52  rillig 	    return;
    425   1.75  rillig 
    426   1.52  rillig 	if (token.e[-1] == '\\') {
    427  1.141  rillig 	    if (inp_peek() == '\n')
    428   1.52  rillig 		++line_no;
    429  1.133  rillig 	    token_add_char(inp_next());
    430   1.52  rillig 	}
    431   1.52  rillig     }
    432   1.43  rillig }
    433   1.43  rillig 
    434   1.84  rillig /* Guess whether the current token is a declared type. */
    435   1.57  rillig static bool
    436  1.107  rillig probably_typename(void)
    437   1.57  rillig {
    438  1.153  rillig     if (ps.prev_token == lsym_storage_class)
    439  1.153  rillig 	return true;
    440  1.153  rillig     if (ps.block_init)
    441  1.153  rillig 	return false;
    442  1.153  rillig     if (ps.in_stmt_or_decl)	/* XXX: this condition looks incorrect */
    443   1.70  rillig 	return false;
    444  1.142  rillig     if (inp_peek() == '*' && inp_lookahead(1) != '=')
    445   1.70  rillig 	goto maybe;
    446  1.145  rillig     /* XXX: is_identifier_start */
    447  1.148  rillig     if (ch_isalpha(inp_peek()))
    448   1.70  rillig 	goto maybe;
    449   1.70  rillig     return false;
    450   1.70  rillig maybe:
    451  1.110  rillig     return ps.prev_token == lsym_semicolon ||
    452  1.110  rillig 	ps.prev_token == lsym_lbrace ||
    453  1.110  rillig 	ps.prev_token == lsym_rbrace;
    454   1.57  rillig }
    455   1.57  rillig 
    456   1.84  rillig static int
    457   1.84  rillig bsearch_typenames(const char *key)
    458   1.84  rillig {
    459   1.84  rillig     const char **arr = typenames.items;
    460   1.84  rillig     int lo = 0;
    461   1.84  rillig     int hi = (int)typenames.len - 1;
    462   1.84  rillig 
    463   1.84  rillig     while (lo <= hi) {
    464   1.84  rillig 	int mid = (int)((unsigned)(lo + hi) >> 1);
    465   1.84  rillig 	int cmp = strcmp(arr[mid], key);
    466   1.84  rillig 	if (cmp < 0)
    467   1.84  rillig 	    lo = mid + 1;
    468   1.84  rillig 	else if (cmp > 0)
    469   1.84  rillig 	    hi = mid - 1;
    470   1.84  rillig 	else
    471   1.84  rillig 	    return mid;
    472   1.84  rillig     }
    473   1.84  rillig     return -(lo + 1);
    474   1.84  rillig }
    475   1.84  rillig 
    476   1.63  rillig static bool
    477   1.63  rillig is_typename(void)
    478   1.63  rillig {
    479   1.84  rillig     if (opt.auto_typedefs &&
    480   1.84  rillig 	token.e - token.s >= 2 && memcmp(token.e - 2, "_t", 2) == 0)
    481   1.84  rillig 	return true;
    482   1.63  rillig 
    483   1.84  rillig     return bsearch_typenames(token.s) >= 0;
    484   1.63  rillig }
    485   1.63  rillig 
    486  1.115  rillig static int
    487  1.115  rillig cmp_keyword_by_name(const void *key, const void *elem)
    488  1.115  rillig {
    489  1.115  rillig     return strcmp(key, ((const struct keyword *)elem)->name);
    490  1.115  rillig }
    491  1.115  rillig 
    492  1.165  rillig /*
    493  1.166  rillig  * Looking at something like 'function_name(...)' in a line, guess whether
    494  1.165  rillig  * this starts a function definition or a declaration.
    495  1.165  rillig  */
    496  1.155  rillig static bool
    497  1.155  rillig probably_looking_at_definition(void)
    498  1.155  rillig {
    499  1.158  rillig     int paren_level = 0;
    500  1.158  rillig     for (const char *p = inp_p(), *e = inp_line_end(); p < e; p++) {
    501  1.158  rillig 	if (*p == '(')
    502  1.158  rillig 	    paren_level++;
    503  1.158  rillig 	if (*p == ')' && --paren_level == 0) {
    504  1.158  rillig 	    p++;
    505  1.166  rillig 
    506  1.158  rillig 	    while (p < e && (ch_isspace(*p) || is_identifier_part(*p)))
    507  1.166  rillig 		p++;		/* '__dead' or '__unused' */
    508  1.166  rillig 
    509  1.166  rillig 	    if (p == e)		/* func(...) */
    510  1.166  rillig 		break;
    511  1.166  rillig 	    if (*p == ';')	/* func(...); */
    512  1.165  rillig 		return false;
    513  1.166  rillig 	    if (*p == ',')	/* double abs(), pi; */
    514  1.166  rillig 		return false;
    515  1.166  rillig 	    if (*p == '(')	/* func(...) __attribute__((...)) */
    516  1.166  rillig 		paren_level++;	/* func(...) __printflike(...) */
    517  1.165  rillig 	    else
    518  1.166  rillig 		break;		/* func(...) { ... */
    519  1.158  rillig 	}
    520  1.158  rillig     }
    521  1.158  rillig 
    522  1.158  rillig     /*
    523  1.158  rillig      * To further reduce the cases where indent wrongly treats an incomplete
    524  1.158  rillig      * function declaration as a function definition, thus adding a newline
    525  1.158  rillig      * before the function name, it may be worth looking for parameter names,
    526  1.158  rillig      * as these are often omitted in function declarations and only included
    527  1.158  rillig      * in function definitions. Or just increase the lookahead to more than
    528  1.158  rillig      * just the current line of input, until the next '{'.
    529  1.158  rillig      */
    530  1.155  rillig     return true;
    531  1.155  rillig }
    532  1.155  rillig 
/*
 * Try to read a number or a word from the input and classify it.
 *
 * If the next input character starts neither a number nor an identifier,
 * nothing is consumed and lsym_eof is returned as a "not handled" marker;
 * lexi then scans a non-alphanumeric token instead.
 *
 * Otherwise the text is stored in 'token' (terminated with '\0'), trailing
 * blanks in the input are skipped, and the returned symbol says how the
 * token is to be treated.  Along the way, ps.next_unary, ps.in_enum, the
 * maybe_cast flag of the innermost parenthesis level,
 * ps.is_function_definition and ps.in_func_def_params may be updated.
 */
    533  1.138  rillig /* Read an alphanumeric token into 'token', or return lsym_eof. */
    534  1.100  rillig static lexer_symbol
    535  1.107  rillig lexi_alnum(void)
    536    1.1     cgd {
    537  1.148  rillig     if (ch_isdigit(inp_peek()) ||
    538  1.148  rillig 	    (inp_peek() == '.' && ch_isdigit(inp_lookahead(1)))) {
    539   1.89  rillig 	lex_number();
    540  1.168  rillig     } else if (is_identifier_start(inp_peek())) {
    541   1.89  rillig 	lex_word();
    542  1.167  rillig 
	/*
	 * The single letter 'L' directly followed by a quote starts a wide
	 * character or string literal; the whole literal, including the
	 * prefix, becomes one lsym_word token.
	 */
    543  1.167  rillig 	if (token.s[0] == 'L' && token.e - token.s == 1 &&
    544  1.167  rillig 		(inp_peek() == '"' || inp_peek() == '\'')) {
    545  1.167  rillig 	    token_add_char(inp_next());
    546  1.167  rillig 	    lex_char_or_string();
    547  1.167  rillig 	    ps.next_unary = false;
    548  1.167  rillig 
    549  1.167  rillig 	    check_size_token(1);
    550  1.167  rillig 	    *token.e = '\0';
    551  1.167  rillig 
    552  1.167  rillig 	    return lsym_word;
    553  1.167  rillig 	}
    554  1.102  rillig     } else
    555  1.102  rillig 	return lsym_eof;	/* just as a placeholder */
    556  1.102  rillig 
    /* Terminate the token, the lookups below treat token.s as a string. */
    557   1.89  rillig     *token.e = '\0';
    558   1.16   kamil 
    559  1.133  rillig     while (ch_isblank(inp_peek()))
    560  1.133  rillig 	inp_skip();
    561   1.89  rillig 
    562  1.154  rillig     ps.next_unary = ps.prev_token == lsym_tag;	/* for 'struct s *' */
    563  1.154  rillig 
    /*
     * Outside of parentheses, whatever follows 'enum', 'struct' or 'union'
     * is taken as part of the type, without looking at the keywords.
     */
    564  1.172  rillig     if (ps.prev_token == lsym_tag && ps.nparen == 0)
    565  1.135  rillig 	return lsym_type_outside_parentheses;
    566   1.16   kamil 
    567   1.89  rillig     const struct keyword *kw = bsearch(token.s, keywords,
    568   1.94  rillig 	array_length(keywords), sizeof(keywords[0]), cmp_keyword_by_name);
    569  1.134  rillig     bool is_type = false;
    570   1.89  rillig     if (kw == NULL) {
	/* Not a keyword, but it may still be a known type name. */
    571   1.89  rillig 	if (is_typename()) {
    572  1.134  rillig 	    is_type = true;
    573  1.107  rillig 	    ps.next_unary = true;
    574  1.169  rillig 	    if (ps.in_enum == in_enum_enum)
    575  1.169  rillig 		ps.in_enum = in_enum_type;
	    /* continues inside the keyword branch below, with kw == NULL */
    576   1.89  rillig 	    goto found_typename;
    577   1.16   kamil 	}
    578   1.89  rillig 
    579   1.89  rillig     } else {			/* we have a keyword */
    580  1.134  rillig 	is_type = kw->lsym == lsym_type;
    581  1.107  rillig 	ps.next_unary = true;
	/* Keywords other than tags and types map directly to their symbol. */
    582  1.127  rillig 	if (kw->lsym != lsym_tag && kw->lsym != lsym_type)
    583  1.125  rillig 	    return kw->lsym;
    584  1.118  rillig 
	/*
	 * Reached for the keywords that are tags or types, and via goto for
	 * type names that are not keywords; kw is NULL in the latter case.
	 */
    585  1.118  rillig found_typename:
    586  1.172  rillig 	if (ps.nparen > 0) {
    587  1.118  rillig 	    /* inside parentheses: cast, param list, offsetof or sizeof */
    588  1.172  rillig 	    if (!ps.paren[ps.nparen - 1].no_cast)
    589  1.172  rillig 		ps.paren[ps.nparen - 1].maybe_cast = true;
    590  1.118  rillig 	}
    591  1.118  rillig 	if (ps.prev_token != lsym_period && ps.prev_token != lsym_unary_op) {
    592  1.169  rillig 	    if (kw != NULL && kw->lsym == lsym_tag) {
		/* 'enum' is the only tag keyword that starts with 'e'. */
    593  1.169  rillig 		if (token.s[0] == 'e' /* enum */)
    594  1.169  rillig 		    ps.in_enum = in_enum_enum;
    595  1.100  rillig 		return lsym_tag;
    596  1.169  rillig 	    }
    597  1.172  rillig 	    if (ps.nparen == 0)
    598  1.135  rillig 		return lsym_type_outside_parentheses;
    599   1.90  rillig 	}
	/* Otherwise fall out of the block; is_type decides at the end. */
    600   1.90  rillig     }
    601   1.89  rillig 
    /*
     * A word directly followed by '(' may be the name of a function that is
     * being declared or defined.
     * NOTE(review): 'ps.tos <= 1 && ps.ind_level == 0' presumably restricts
     * this to the top level of the file -- confirm against the parser.
     */
    602  1.141  rillig     if (inp_peek() == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
    603  1.164  rillig 	!ps.in_func_def_params && !ps.block_init) {
    604   1.89  rillig 
    605  1.172  rillig 	if (ps.nparen == 0 && probably_looking_at_definition()) {
    606  1.155  rillig 	    ps.is_function_definition = true;
    607  1.155  rillig 	    if (ps.in_decl)
    608  1.164  rillig 		ps.in_func_def_params = true;
    609  1.155  rillig 	    return lsym_funcname;
    610  1.155  rillig 	}
    611   1.89  rillig 
    612  1.172  rillig     } else if (ps.nparen == 0 && probably_typename()) {
    613  1.107  rillig 	ps.next_unary = true;
    614  1.135  rillig 	return lsym_type_outside_parentheses;
    615   1.89  rillig     }
    616   1.89  rillig 
    /* A type that got this far was not reported as 'outside parentheses'. */
    617  1.134  rillig     return is_type ? lsym_type_in_parentheses : lsym_word;
    618   1.89  rillig }
    619   1.75  rillig 
    620  1.163  rillig static bool
    621  1.163  rillig is_asterisk_unary(void)
    622  1.163  rillig {
    623  1.164  rillig     if (ps.next_unary || ps.in_func_def_params)
    624  1.163  rillig 	return true;
    625  1.163  rillig     if (ps.prev_token == lsym_word ||
    626  1.163  rillig 	    ps.prev_token == lsym_rparen_or_rbracket)
    627  1.163  rillig 	return false;
    628  1.172  rillig     return ps.in_decl && ps.nparen > 0;
    629  1.163  rillig }
    630  1.163  rillig 
    631  1.161  rillig static void
    632  1.161  rillig lex_asterisk_unary(void)
    633  1.161  rillig {
    634  1.161  rillig     while (inp_peek() == '*' || ch_isspace(inp_peek())) {
    635  1.161  rillig 	if (inp_peek() == '*')
    636  1.161  rillig 	    token_add_char('*');
    637  1.161  rillig 	inp_skip();
    638  1.161  rillig     }
    639  1.161  rillig 
    640  1.161  rillig     if (ps.in_decl) {
    641  1.161  rillig 	const char *tp = inp_p(), *e = inp_line_end();
    642  1.161  rillig 
    643  1.161  rillig 	while (tp < e) {
    644  1.161  rillig 	    if (ch_isspace(*tp))
    645  1.161  rillig 		tp++;
    646  1.161  rillig 	    else if (is_identifier_start(*tp)) {
    647  1.161  rillig 		tp++;
    648  1.161  rillig 		while (tp < e && is_identifier_part(*tp))
    649  1.161  rillig 		    tp++;
    650  1.161  rillig 	    } else
    651  1.161  rillig 		break;
    652  1.161  rillig 	}
    653  1.161  rillig 
    654  1.161  rillig 	if (tp < e && *tp == '(')
    655  1.161  rillig 	    ps.is_function_definition = true;
    656  1.161  rillig     }
    657  1.161  rillig }
    658  1.161  rillig 
    659   1.89  rillig /* Reads the next token, placing it in the global variable "token". */
                       /*
                        * Returns the result of lexi_end() for the lexer symbol that classifies
                        * the token.  Alphanumeric tokens are delegated to lexi_alnum(); every
                        * other token starts with a single character that is classified by the
                        * switch below.  Each case computes a local next_unary, which is stored
                        * in ps.next_unary only after the switch, so the cases themselves still
                        * see the value left behind by the previous token.
                        */
    660  1.100  rillig lexer_symbol
    661  1.106  rillig lexi(void)
    662   1.89  rillig {
                           /* Drop the previous token text: the end goes back to the start. */
    663   1.90  rillig     token.e = token.s;
    664  1.130  rillig     ps.curr_col_1 = ps.next_col_1;
    665  1.130  rillig     ps.next_col_1 = false;
    666   1.75  rillig 
                           /* A token that is preceded by blanks does not start in column 1. */
    667  1.141  rillig     while (ch_isblank(inp_peek())) {
    668  1.117  rillig 	ps.curr_col_1 = false;
    669  1.133  rillig 	inp_skip();
    670   1.89  rillig     }
    671   1.75  rillig 
                           /*
                            * lsym_eof from lexi_alnum() is taken to mean that no alphanumeric
                            * token starts here; anything else is returned right away.
                            */
    672  1.107  rillig     lexer_symbol alnum_lsym = lexi_alnum();
    673  1.100  rillig     if (alnum_lsym != lsym_eof)
    674  1.107  rillig 	return lexi_end(alnum_lsym);
    675   1.16   kamil 
    676   1.16   kamil     /* Scan a non-alphanumeric token */
    677   1.16   kamil 
    678   1.90  rillig     check_size_token(3);	/* for things like "<<=" */
    679  1.133  rillig     *token.e++ = inp_next();
    680   1.50  rillig     *token.e = '\0';
    681   1.16   kamil 
    682  1.100  rillig     lexer_symbol lsym;
    683  1.159  rillig     bool next_unary;
    684   1.89  rillig 
                           /* Dispatch on the character that was just appended to the token. */
    685  1.132  rillig     switch (token.e[-1]) {
    686  1.160  rillig 
    687  1.160  rillig     /* INDENT OFF */
    688  1.160  rillig     case '(':
    689  1.160  rillig     case '[':	lsym = lsym_lparen_or_lbracket;	next_unary = true;	break;
    690  1.160  rillig     case ')':
    691  1.160  rillig     case ']':	lsym = lsym_rparen_or_rbracket;	next_unary = false;	break;
    692  1.160  rillig     case '?':	lsym = lsym_question;		next_unary = true;	break;
    693  1.160  rillig     case ':':	lsym = lsym_colon;		next_unary = true;	break;
    694  1.160  rillig     case ';':	lsym = lsym_semicolon;		next_unary = true;	break;
    695  1.160  rillig     case '{':	lsym = lsym_lbrace;		next_unary = true;	break;
    696  1.160  rillig     case '}':	lsym = lsym_rbrace;		next_unary = true;	break;
    697  1.160  rillig     case ',':	lsym = lsym_comma;		next_unary = true;	break;
    698  1.160  rillig     case '.':	lsym = lsym_period;		next_unary = false;	break;
    699  1.160  rillig     /* INDENT ON */
    700  1.160  rillig 
                           /*
                            * Newline, form feed, '#' and comments leave next_unary as it was;
                            * after a newline or form feed, the next token starts in column 1.
                            */
    701   1.16   kamil     case '\n':
    702  1.159  rillig 	/* if data has been exhausted, the '\n' is a dummy. */
    703  1.159  rillig 	lsym = had_eof ? lsym_eof : lsym_newline;
    704  1.159  rillig 	next_unary = ps.next_unary;
    705  1.130  rillig 	ps.next_col_1 = true;
    706   1.16   kamil 	break;
    707   1.16   kamil 
    708   1.69  rillig     case '\f':
    709  1.159  rillig 	lsym = lsym_form_feed;
    710  1.159  rillig 	next_unary = ps.next_unary;
    711  1.130  rillig 	ps.next_col_1 = true;
    712   1.16   kamil 	break;
    713   1.16   kamil 
    714  1.160  rillig     case '#':
    715  1.160  rillig 	lsym = lsym_preprocessing;
    716  1.160  rillig 	next_unary = ps.next_unary;
    717   1.16   kamil 	break;
    718   1.16   kamil 
                           /* Character constants and string literals are reported as words. */
    719  1.160  rillig     case '\'':
    720  1.160  rillig     case '"':
    721  1.160  rillig 	lex_char_or_string();
    722  1.160  rillig 	lsym = lsym_word;
    723  1.159  rillig 	next_unary = false;
    724   1.16   kamil 	break;
    725    1.1     cgd 
    726   1.16   kamil     case '-':
    727   1.90  rillig     case '+':
    728  1.107  rillig 	lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
    729  1.159  rillig 	next_unary = true;
    730   1.16   kamil 
    731  1.159  rillig 	if (inp_peek() == token.e[-1]) {	/* '++' or '--' */
    732  1.141  rillig 	    *token.e++ = inp_next();
                       	    /*
                       	     * Directly after a word or a closing parenthesis/bracket, the
                       	     * operator is postfix unless a unary operator was expected.
                       	     */
    733  1.134  rillig 	    if (ps.prev_token == lsym_word ||
    734  1.110  rillig 		    ps.prev_token == lsym_rparen_or_rbracket) {
    735  1.107  rillig 		lsym = ps.next_unary ? lsym_unary_op : lsym_postfix_op;
    736  1.159  rillig 		next_unary = false;
    737   1.16   kamil 	    }
    738   1.75  rillig 
    739  1.159  rillig 	} else if (inp_peek() == '=') {	/* '+=' or '-=' */
    740  1.141  rillig 	    *token.e++ = inp_next();
    741   1.75  rillig 
    742  1.159  rillig 	} else if (inp_peek() == '>') {	/* '->' */
    743  1.141  rillig 	    *token.e++ = inp_next();
                       	    /*
                       	     * '->' is reported as lsym_unary_op and clears ps.want_blank;
                       	     * NOTE(review): presumably to avoid blanks around it -- confirm.
                       	     */
    744  1.100  rillig 	    lsym = lsym_unary_op;
    745  1.159  rillig 	    next_unary = false;
    746  1.107  rillig 	    ps.want_blank = false;
    747   1.16   kamil 	}
    748   1.90  rillig 	break;
    749   1.16   kamil 
    750   1.16   kamil     case '=':
                       	/* An '=' seen while ps.init_or_struct is set turns on ps.block_init. */
    751  1.107  rillig 	if (ps.init_or_struct)
    752  1.107  rillig 	    ps.block_init = true;
    753  1.141  rillig 	if (inp_peek() == '=') {	/* == */
    754  1.141  rillig 	    *token.e++ = inp_next();
    755   1.67  rillig 	    *token.e = '\0';
    756   1.16   kamil 	}
    757  1.100  rillig 	lsym = lsym_binary_op;
    758  1.159  rillig 	next_unary = true;
    759   1.16   kamil 	break;
    760   1.16   kamil 
    761   1.16   kamil     case '>':
    762   1.16   kamil     case '<':
    763   1.16   kamil     case '!':			/* ops like <, <<, <=, !=, etc */
                       	/*
                       	 * At most two more characters are appended here, for a token of up
                       	 * to 3 characters; this relies on the check_size_token(3) above.
                       	 */
    764  1.141  rillig 	if (inp_peek() == '>' || inp_peek() == '<' || inp_peek() == '=')
    765  1.141  rillig 	    *token.e++ = inp_next();
    766  1.141  rillig 	if (inp_peek() == '=')
    767  1.133  rillig 	    *token.e++ = inp_next();
    768  1.107  rillig 	lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
    769  1.159  rillig 	next_unary = true;
    770   1.16   kamil 	break;
    771   1.16   kamil 
    772   1.16   kamil     case '*':
                       	/*
                       	 * The unary form may take in further '*' and whitespace, see
                       	 * lex_asterisk_unary(); the binary form only checks for '*='.
                       	 */
    773  1.163  rillig 	if (is_asterisk_unary()) {
    774  1.162  rillig 	    lex_asterisk_unary();
    775  1.162  rillig 	    lsym = lsym_unary_op;
    776  1.162  rillig 	    next_unary = true;
    777  1.162  rillig 	} else {
    778  1.141  rillig 	    if (inp_peek() == '=')
    779  1.141  rillig 		*token.e++ = inp_next();
    780  1.100  rillig 	    lsym = lsym_binary_op;
    781  1.159  rillig 	    next_unary = true;
    782   1.16   kamil 	}
    783   1.16   kamil 	break;
    784    1.1     cgd 
    785   1.16   kamil     default:
                       	/* A '/' followed by '*' or '/' starts a comment. */
    786  1.141  rillig 	if (token.e[-1] == '/' && (inp_peek() == '*' || inp_peek() == '/')) {
    787  1.133  rillig 	    *token.e++ = inp_next();
    788  1.100  rillig 	    lsym = lsym_comment;
    789  1.159  rillig 	    next_unary = ps.next_unary;
    790   1.16   kamil 	    break;
    791    1.1     cgd 	}
    792   1.75  rillig 
    793  1.132  rillig 	/* handle '||', '&&', etc., and also things as in 'int *****i' */
    794  1.141  rillig 	while (token.e[-1] == inp_peek() || inp_peek() == '=')
    795  1.133  rillig 	    token_add_char(inp_next());
    796   1.75  rillig 
    797  1.107  rillig 	lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
    798  1.159  rillig 	next_unary = true;
    799   1.47  rillig     }
    800   1.16   kamil 
                           /*
                            * In the in_enum_enum and in_enum_type states, a '{' moves on to
                            * in_enum_brace, any other token handled here resets the state to
                            * in_enum_no.  A '}' resets the state in any case.
                            */
    801  1.169  rillig     if (ps.in_enum == in_enum_enum || ps.in_enum == in_enum_type)
    802  1.169  rillig 	ps.in_enum = lsym == lsym_lbrace ? in_enum_brace : in_enum_no;
    803  1.169  rillig     if (lsym == lsym_rbrace)
    804  1.169  rillig 	ps.in_enum = in_enum_no;
    805  1.169  rillig 
                           /* Only now replace the value that the cases in the switch have read. */
    806  1.159  rillig     ps.next_unary = next_unary;
    807   1.75  rillig 
                           /* NOTE(review): presumably makes room for the terminating '\0'. */
    808   1.25  rillig     check_size_token(1);
    809   1.50  rillig     *token.e = '\0';
    810   1.75  rillig 
    811  1.107  rillig     return lexi_end(lsym);
    812    1.1     cgd }
    813   1.16   kamil 
    814    1.6   lukem void
    815  1.128  rillig register_typename(const char *name)
    816    1.1     cgd {
    817   1.64  rillig     if (typenames.len >= typenames.cap) {
    818   1.64  rillig 	typenames.cap = 16 + 2 * typenames.cap;
    819   1.64  rillig 	typenames.items = xrealloc(typenames.items,
    820   1.64  rillig 	    sizeof(typenames.items[0]) * typenames.cap);
    821   1.64  rillig     }
    822   1.16   kamil 
    823   1.84  rillig     int pos = bsearch_typenames(name);
    824   1.64  rillig     if (pos >= 0)
    825   1.64  rillig 	return;			/* already in the list */
    826   1.75  rillig 
    827   1.64  rillig     pos = -(pos + 1);
    828   1.64  rillig     memmove(typenames.items + pos + 1, typenames.items + pos,
    829   1.73  rillig 	sizeof(typenames.items[0]) * (typenames.len++ - (unsigned)pos));
    830   1.64  rillig     typenames.items[pos] = xstrdup(name);
    831    1.1     cgd }
    832