Home | History | Annotate | Line # | Download | only in indent
lexi.c revision 1.122
      1  1.122  rillig /*	$NetBSD: lexi.c,v 1.122 2021/10/31 17:22:47 rillig Exp $	*/
      2    1.3     tls 
      3   1.16   kamil /*-
      4   1.16   kamil  * SPDX-License-Identifier: BSD-4-Clause
      5   1.16   kamil  *
      6   1.16   kamil  * Copyright (c) 1985 Sun Microsystems, Inc.
      7    1.5     mrg  * Copyright (c) 1980, 1993
      8    1.5     mrg  *	The Regents of the University of California.  All rights reserved.
      9    1.1     cgd  * All rights reserved.
     10    1.1     cgd  *
     11    1.1     cgd  * Redistribution and use in source and binary forms, with or without
     12    1.1     cgd  * modification, are permitted provided that the following conditions
     13    1.1     cgd  * are met:
     14    1.1     cgd  * 1. Redistributions of source code must retain the above copyright
     15    1.1     cgd  *    notice, this list of conditions and the following disclaimer.
     16    1.1     cgd  * 2. Redistributions in binary form must reproduce the above copyright
     17    1.1     cgd  *    notice, this list of conditions and the following disclaimer in the
     18    1.1     cgd  *    documentation and/or other materials provided with the distribution.
     19    1.1     cgd  * 3. All advertising materials mentioning features or use of this software
     20    1.1     cgd  *    must display the following acknowledgement:
     21    1.1     cgd  *	This product includes software developed by the University of
     22    1.1     cgd  *	California, Berkeley and its contributors.
     23    1.1     cgd  * 4. Neither the name of the University nor the names of its contributors
     24    1.1     cgd  *    may be used to endorse or promote products derived from this software
     25    1.1     cgd  *    without specific prior written permission.
     26    1.1     cgd  *
     27    1.1     cgd  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     28    1.1     cgd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29    1.1     cgd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30    1.1     cgd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     31    1.1     cgd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     32    1.1     cgd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     33    1.1     cgd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     34    1.1     cgd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     35    1.1     cgd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     36    1.1     cgd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     37    1.1     cgd  * SUCH DAMAGE.
     38    1.1     cgd  */
     39    1.1     cgd 
     40   1.16   kamil #if 0
     41   1.16   kamil static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
     42   1.16   kamil #endif
     43   1.16   kamil 
     44    1.6   lukem #include <sys/cdefs.h>
     45   1.16   kamil #if defined(__NetBSD__)
     46  1.122  rillig __RCSID("$NetBSD: lexi.c,v 1.122 2021/10/31 17:22:47 rillig Exp $");
     47   1.16   kamil #elif defined(__FreeBSD__)
     48   1.16   kamil __FBSDID("$FreeBSD: head/usr.bin/indent/lexi.c 337862 2018-08-15 18:19:45Z pstef $");
     49   1.16   kamil #endif
     50    1.1     cgd 
     51    1.1     cgd #include <ctype.h>
     52    1.1     cgd #include <stdlib.h>
     53    1.1     cgd #include <string.h>
     54   1.16   kamil 
     55   1.16   kamil #include "indent.h"
     56    1.1     cgd 
     57   1.60  rillig /* must be sorted alphabetically, is used in binary search */
     58   1.62  rillig static const struct keyword {
     59   1.62  rillig     const char *name;
     60   1.62  rillig     enum keyword_kind kind;
     61   1.62  rillig } keywords[] = {
     62   1.62  rillig     {"_Bool", kw_type},
     63   1.62  rillig     {"_Complex", kw_type},
     64   1.62  rillig     {"_Imaginary", kw_type},
     65   1.62  rillig     {"auto", kw_storage_class},
     66   1.62  rillig     {"bool", kw_type},
     67  1.119  rillig     {"break", kw_other},
     68   1.62  rillig     {"case", kw_case_or_default},
     69   1.62  rillig     {"char", kw_type},
     70   1.62  rillig     {"complex", kw_type},
     71   1.62  rillig     {"const", kw_type},
     72  1.119  rillig     {"continue", kw_other},
     73   1.62  rillig     {"default", kw_case_or_default},
     74   1.97  rillig     {"do", kw_do},
     75   1.62  rillig     {"double", kw_type},
     76   1.97  rillig     {"else", kw_else},
     77  1.119  rillig     {"enum", kw_tag},
     78   1.62  rillig     {"extern", kw_storage_class},
     79   1.62  rillig     {"float", kw_type},
     80   1.98  rillig     {"for", kw_for},
     81  1.119  rillig     {"goto", kw_other},
     82   1.98  rillig     {"if", kw_if},
     83   1.62  rillig     {"imaginary", kw_type},
     84  1.119  rillig     {"inline", kw_other},
     85   1.62  rillig     {"int", kw_type},
     86   1.62  rillig     {"long", kw_type},
     87   1.62  rillig     {"offsetof", kw_offsetof},
     88   1.62  rillig     {"register", kw_storage_class},
     89  1.119  rillig     {"restrict", kw_other},
     90  1.119  rillig     {"return", kw_other},
     91   1.62  rillig     {"short", kw_type},
     92   1.62  rillig     {"signed", kw_type},
     93   1.62  rillig     {"sizeof", kw_sizeof},
     94   1.62  rillig     {"static", kw_storage_class},
     95  1.119  rillig     {"struct", kw_tag},
     96   1.62  rillig     {"switch", kw_switch},
     97   1.62  rillig     {"typedef", kw_typedef},
     98  1.119  rillig     {"union", kw_tag},
     99   1.62  rillig     {"unsigned", kw_type},
    100   1.62  rillig     {"void", kw_type},
    101   1.62  rillig     {"volatile", kw_type},
    102   1.98  rillig     {"while", kw_while}
    103    1.1     cgd };
    104    1.1     cgd 
    105   1.84  rillig static struct {
    106   1.64  rillig     const char **items;
    107   1.64  rillig     unsigned int len;
    108   1.64  rillig     unsigned int cap;
    109   1.64  rillig } typenames;
    110   1.16   kamil 
    111   1.16   kamil /*
    112   1.16   kamil  * The transition table below was rewritten by hand from lx's output, given
    113   1.16   kamil  * the following definitions. lx is Katherine Flavel's lexer generator.
    114   1.16   kamil  *
    115   1.16   kamil  * O  = /[0-7]/;        D  = /[0-9]/;          NZ = /[1-9]/;
    116   1.16   kamil  * H  = /[a-f0-9]/i;    B  = /[0-1]/;          HP = /0x/i;
    117   1.16   kamil  * BP = /0b/i;          E  = /e[+\-]?/i D+;    P  = /p[+\-]?/i D+;
    118   1.16   kamil  * FS = /[fl]/i;        IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?;
    119   1.16   kamil  *
    120   1.16   kamil  * D+           E  FS? -> $float;
    121   1.16   kamil  * D*    "." D+ E? FS? -> $float;
    122   1.16   kamil  * D+    "."    E? FS? -> $float;    HP H+           IS? -> $int;
    123   1.16   kamil  * HP H+        P  FS? -> $float;    NZ D*           IS? -> $int;
    124   1.16   kamil  * HP H* "." H+ P  FS? -> $float;    "0" O*          IS? -> $int;
    125   1.16   kamil  * HP H+ "."    P  FS  -> $float;    BP B+           IS? -> $int;
    126   1.16   kamil  */
    127   1.71  rillig /* INDENT OFF */
    128   1.82  rillig static const unsigned char lex_number_state[][26] = {
    129   1.16   kamil     /*                examples:
    130   1.16   kamil                                      00
    131   1.16   kamil              s                      0xx
    132   1.16   kamil              t                    00xaa
    133   1.16   kamil              a     11       101100xxa..
    134   1.16   kamil              r   11ee0001101lbuuxx.a.pp
    135   1.16   kamil              t.01.e+008bLuxll0Ll.aa.p+0
    136   1.16   kamil     states:  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
    137   1.83  rillig     [0] =   "uuiifuufiuuiiuiiiiiuiuuuuu",	/* (other) */
    138   1.83  rillig     [1] =   "CEIDEHHHIJQ  U  Q  VUVVZZZ",	/* 0 */
    139   1.83  rillig     [2] =   "DEIDEHHHIJQ  U  Q  VUVVZZZ",	/* 1 */
    140   1.83  rillig     [3] =   "DEIDEHHHIJ   U     VUVVZZZ",	/* 2 3 4 5 6 7 */
    141   1.83  rillig     [4] =   "DEJDEHHHJJ   U     VUVVZZZ",	/* 8 9 */
    142   1.83  rillig     [5] =   "             U     VUVV   ",	/* A a C c D d */
    143   1.83  rillig     [6] =   "  K          U     VUVV   ",	/* B b */
    144   1.83  rillig     [7] =   "  FFF   FF   U     VUVV   ",	/* E e */
    145   1.83  rillig     [8] =   "    f  f     U     VUVV  f",	/* F f */
    146   1.83  rillig     [9] =   "  LLf  fL  PR   Li  L    f",	/* L */
    147   1.83  rillig     [10] =  "  OOf  fO   S P O i O    f",	/* l */
    148   1.83  rillig     [11] =  "                    FFX   ",	/* P p */
    149   1.83  rillig     [12] =  "  MM    M  i  iiM   M     ",	/* U u */
    150   1.83  rillig     [13] =  "  N                       ",	/* X x */
    151   1.83  rillig     [14] =  "     G                 Y  ",	/* + - */
    152   1.83  rillig     [15] =  "B EE    EE   T      W     ",	/* . */
    153   1.16   kamil     /*       ABCDEFGHIJKLMNOPQRSTUVWXYZ */
    154    1.1     cgd };
    155   1.71  rillig /* INDENT ON */
    156    1.1     cgd 
    157  1.115  rillig static const unsigned char lex_number_row[] = {
    158   1.56  rillig     ['0'] = 1,
    159   1.56  rillig     ['1'] = 2,
    160   1.56  rillig     ['2'] = 3, ['3'] = 3, ['4'] = 3, ['5'] = 3, ['6'] = 3, ['7'] = 3,
    161   1.56  rillig     ['8'] = 4, ['9'] = 4,
    162   1.56  rillig     ['A'] = 5, ['a'] = 5, ['C'] = 5, ['c'] = 5, ['D'] = 5, ['d'] = 5,
    163   1.56  rillig     ['B'] = 6, ['b'] = 6,
    164   1.56  rillig     ['E'] = 7, ['e'] = 7,
    165   1.56  rillig     ['F'] = 8, ['f'] = 8,
    166   1.56  rillig     ['L'] = 9,
    167   1.56  rillig     ['l'] = 10,
    168   1.56  rillig     ['P'] = 11, ['p'] = 11,
    169   1.56  rillig     ['U'] = 12, ['u'] = 12,
    170   1.56  rillig     ['X'] = 13, ['x'] = 13,
    171   1.56  rillig     ['+'] = 14, ['-'] = 14,
    172   1.56  rillig     ['.'] = 15,
    173   1.56  rillig };
    174   1.36  rillig 
    175   1.32  rillig static char
    176   1.32  rillig inbuf_peek(void)
    177   1.32  rillig {
    178   1.78  rillig     return *inp.s;
    179   1.32  rillig }
    180   1.32  rillig 
    181   1.66  rillig void
    182   1.32  rillig inbuf_skip(void)
    183   1.32  rillig {
    184   1.78  rillig     inp.s++;
    185   1.78  rillig     if (inp.s >= inp.e)
    186   1.81  rillig 	inbuf_read_line();
    187   1.32  rillig }
    188   1.32  rillig 
    189   1.66  rillig char
    190   1.32  rillig inbuf_next(void)
    191   1.32  rillig {
    192   1.32  rillig     char ch = inbuf_peek();
    193   1.32  rillig     inbuf_skip();
    194   1.32  rillig     return ch;
    195   1.32  rillig }
    196   1.32  rillig 
    197   1.25  rillig static void
    198   1.25  rillig check_size_token(size_t desired_size)
    199   1.25  rillig {
    200   1.58  rillig     if (token.e + desired_size >= token.l)
    201   1.58  rillig 	buf_expand(&token, desired_size);
    202   1.25  rillig }
    203   1.25  rillig 
    204   1.87  rillig static void
    205   1.87  rillig token_add_char(char ch)
    206   1.87  rillig {
    207   1.87  rillig     check_size_token(1);
    208   1.87  rillig     *token.e++ = ch;
    209   1.87  rillig }
    210   1.87  rillig 
    211   1.20  rillig #ifdef debug
    212  1.100  rillig static const char *
    213  1.100  rillig lsym_name(lexer_symbol sym)
    214   1.20  rillig {
    215   1.20  rillig     static const char *const name[] = {
    216  1.100  rillig 	"eof",
    217  1.100  rillig 	"preprocessing",
    218  1.100  rillig 	"newline",
    219  1.100  rillig 	"form_feed",
    220  1.100  rillig 	"comment",
    221  1.100  rillig 	"lparen_or_lbracket",
    222  1.100  rillig 	"rparen_or_rbracket",
    223  1.100  rillig 	"lbrace",
    224  1.100  rillig 	"rbrace",
    225  1.100  rillig 	"period",
    226  1.100  rillig 	"unary_op",
    227  1.100  rillig 	"binary_op",
    228  1.100  rillig 	"postfix_op",
    229  1.100  rillig 	"question",
    230  1.100  rillig 	"colon",
    231  1.100  rillig 	"comma",
    232  1.100  rillig 	"semicolon",
    233  1.100  rillig 	"typedef",
    234  1.100  rillig 	"storage_class",
    235  1.100  rillig 	"type",
    236  1.100  rillig 	"tag",
    237  1.100  rillig 	"case_label",
    238  1.100  rillig 	"string_prefix",
    239  1.120  rillig 	"sizeof",
    240  1.121  rillig 	"offsetof",
    241  1.100  rillig 	"ident",
    242  1.100  rillig 	"funcname",
    243  1.100  rillig 	"do",
    244  1.100  rillig 	"else",
    245  1.100  rillig 	"for",
    246  1.100  rillig 	"if",
    247  1.100  rillig 	"switch",
    248  1.100  rillig 	"while",
    249   1.20  rillig     };
    250   1.20  rillig 
    251  1.100  rillig     return name[sym];
    252   1.20  rillig }
    253   1.20  rillig 
    254  1.101  rillig static const char *
    255  1.103  rillig kw_name(enum keyword_kind kw)
    256  1.103  rillig {
    257  1.115  rillig     static const char *const name[] = {
    258  1.101  rillig 	"0",
    259  1.101  rillig 	"offsetof",
    260  1.101  rillig 	"sizeof",
    261  1.119  rillig 	"tag",
    262  1.101  rillig 	"type",
    263  1.101  rillig 	"for",
    264  1.101  rillig 	"if",
    265  1.101  rillig 	"while",
    266  1.101  rillig 	"do",
    267  1.101  rillig 	"else",
    268  1.101  rillig 	"switch",
    269  1.101  rillig 	"case_or_default",
    270  1.101  rillig 	"storage_class",
    271  1.101  rillig 	"typedef",
    272  1.119  rillig 	"other",
    273  1.101  rillig     };
    274  1.101  rillig 
    275  1.101  rillig     return name[kw];
    276  1.101  rillig }
    277  1.101  rillig 
    278   1.20  rillig static void
    279   1.72  rillig debug_print_buf(const char *name, const struct buffer *buf)
    280   1.20  rillig {
    281   1.72  rillig     if (buf->s < buf->e) {
    282  1.101  rillig 	debug_printf("%s ", name);
    283  1.101  rillig 	debug_vis_range("\"", buf->s, buf->e, "\"\n");
    284   1.20  rillig     }
    285   1.20  rillig }
    286   1.20  rillig 
    287  1.112  rillig #define debug_ps_bool(name) \
    288  1.113  rillig         if (ps.name != prev_ps.name) \
    289  1.113  rillig 	    debug_println("[%c] ps." #name, ps.name ? 'x' : ' ')
    290  1.112  rillig #define debug_ps_int(name) \
    291  1.113  rillig 	if (ps.name != prev_ps.name) \
    292  1.113  rillig 	    debug_println("%3d ps." #name, ps.name)
    293  1.112  rillig #define debug_ps_keyword(name) \
    294  1.112  rillig 	if (ps.name != kw_0) \
    295  1.113  rillig 	    debug_println("    ps." #name " = %s", kw_name(ps.name))
    296  1.112  rillig 
    297  1.101  rillig static void
    298  1.107  rillig debug_lexi(lexer_symbol lsym)
    299   1.20  rillig {
    300  1.113  rillig     /*
    301  1.113  rillig      * Watch out for 'rolled back parser state' in the debug output; the
    302  1.113  rillig      * differences around these are unreliable.
    303  1.113  rillig      */
    304  1.113  rillig     static struct parser_state prev_ps;
    305  1.113  rillig 
    306  1.104  rillig     debug_println("");
    307  1.116  rillig     debug_printf("line %d: %s", line_no, lsym_name(lsym));
    308  1.122  rillig     if (ps.curr_keyword != kw_0)
    309  1.122  rillig 	debug_printf(" %s", kw_name(ps.curr_keyword));
    310  1.116  rillig     debug_vis_range(" \"", token.s, token.e, "\"\n");
    311  1.122  rillig 
    312   1.72  rillig     debug_print_buf("label", &lab);
    313   1.72  rillig     debug_print_buf("code", &code);
    314   1.72  rillig     debug_print_buf("comment", &com);
    315  1.112  rillig 
    316  1.122  rillig     debug_println("    ps.prev_token = %s", lsym_name(ps.prev_token));
    317  1.112  rillig     debug_ps_keyword(prev_keyword);
    318  1.117  rillig     debug_ps_bool(curr_newline);
    319  1.117  rillig     debug_ps_bool(curr_col_1);
    320  1.112  rillig     debug_ps_bool(next_unary);
    321  1.112  rillig     // procname
    322  1.112  rillig     debug_ps_bool(want_blank);
    323  1.112  rillig     debug_ps_int(paren_level);
    324  1.112  rillig     debug_ps_int(p_l_follow);
    325  1.112  rillig     // paren_indents
    326  1.112  rillig     debug_ps_int(cast_mask);
    327  1.112  rillig     debug_ps_int(not_cast_mask);
    328  1.112  rillig 
    329  1.112  rillig     debug_ps_int(comment_delta);
    330  1.112  rillig     debug_ps_int(n_comment_delta);
    331  1.112  rillig     debug_ps_int(com_ind);
    332  1.112  rillig 
    333  1.112  rillig     debug_ps_bool(block_init);
    334  1.112  rillig     debug_ps_int(block_init_level);
    335  1.112  rillig     debug_ps_bool(init_or_struct);
    336  1.112  rillig 
    337  1.112  rillig     debug_ps_int(ind_level);
    338  1.112  rillig     debug_ps_int(ind_level_follow);
    339  1.112  rillig 
    340  1.112  rillig     debug_ps_int(decl_nest);
    341  1.112  rillig     debug_ps_bool(decl_on_line);
    342  1.112  rillig     debug_ps_bool(in_decl);
    343  1.112  rillig     debug_ps_int(just_saw_decl);
    344  1.112  rillig     debug_ps_bool(in_parameter_declaration);
    345  1.112  rillig     debug_ps_bool(decl_indent_done);
    346  1.112  rillig 
    347  1.112  rillig     debug_ps_bool(in_stmt);
    348  1.112  rillig     debug_ps_bool(ind_stmt);
    349  1.112  rillig     debug_ps_bool(is_case_label);
    350  1.112  rillig 
    351  1.112  rillig     debug_ps_bool(search_stmt);
    352  1.113  rillig 
    353  1.113  rillig     prev_ps = ps;
    354  1.101  rillig }
    355   1.96  rillig #endif
    356   1.20  rillig 
    357  1.104  rillig /* ARGSUSED */
    358  1.101  rillig static lexer_symbol
    359  1.107  rillig lexi_end(lexer_symbol lsym)
    360  1.101  rillig {
    361  1.101  rillig #ifdef debug
    362  1.107  rillig     debug_lexi(lsym);
    363  1.101  rillig #endif
    364  1.100  rillig     return lsym;
    365   1.20  rillig }
    366   1.20  rillig 
    367   1.43  rillig static void
    368   1.43  rillig lex_number(void)
    369   1.43  rillig {
    370  1.115  rillig     for (unsigned char s = 'A'; s != 'f' && s != 'i' && s != 'u';) {
    371  1.115  rillig 	unsigned char ch = (unsigned char)*inp.s;
    372   1.94  rillig 	if (ch >= array_length(lex_number_row) || lex_number_row[ch] == 0)
    373   1.56  rillig 	    break;
    374   1.75  rillig 
    375  1.115  rillig 	unsigned char row = lex_number_row[ch];
    376   1.82  rillig 	if (lex_number_state[row][s - 'A'] == ' ') {
    377   1.71  rillig 	    /*-
    378   1.82  rillig 	     * lex_number_state[0][s - 'A'] now indicates the type:
    379   1.74  rillig 	     * f = floating, i = integer, u = unknown
    380   1.56  rillig 	     */
    381   1.43  rillig 	    break;
    382   1.43  rillig 	}
    383   1.75  rillig 
    384   1.82  rillig 	s = lex_number_state[row][s - 'A'];
    385   1.87  rillig 	token_add_char(inbuf_next());
    386   1.43  rillig     }
    387   1.43  rillig }
    388   1.43  rillig 
    389   1.43  rillig static void
    390   1.43  rillig lex_word(void)
    391   1.43  rillig {
    392   1.78  rillig     while (isalnum((unsigned char)*inp.s) ||
    393   1.95  rillig 	    *inp.s == '\\' ||
    394   1.95  rillig 	    *inp.s == '_' || *inp.s == '$') {
    395   1.75  rillig 
    396   1.78  rillig 	if (*inp.s == '\\') {
    397   1.78  rillig 	    if (inp.s[1] == '\n') {
    398   1.78  rillig 		inp.s += 2;
    399   1.78  rillig 		if (inp.s >= inp.e)
    400   1.81  rillig 		    inbuf_read_line();
    401   1.43  rillig 	    } else
    402   1.43  rillig 		break;
    403   1.43  rillig 	}
    404   1.75  rillig 
    405   1.87  rillig 	token_add_char(inbuf_next());
    406   1.43  rillig     }
    407   1.43  rillig }
    408   1.43  rillig 
    409   1.43  rillig static void
    410   1.43  rillig lex_char_or_string(void)
    411   1.43  rillig {
    412   1.52  rillig     for (char delim = *token.s;;) {
    413   1.78  rillig 	if (*inp.s == '\n') {
    414   1.52  rillig 	    diag(1, "Unterminated literal");
    415   1.52  rillig 	    return;
    416   1.52  rillig 	}
    417   1.75  rillig 
    418   1.87  rillig 	token_add_char(inbuf_next());
    419   1.52  rillig 	if (token.e[-1] == delim)
    420   1.52  rillig 	    return;
    421   1.75  rillig 
    422   1.52  rillig 	if (token.e[-1] == '\\') {
    423   1.78  rillig 	    if (*inp.s == '\n')
    424   1.52  rillig 		++line_no;
    425   1.87  rillig 	    token_add_char(inbuf_next());
    426   1.52  rillig 	}
    427   1.52  rillig     }
    428   1.43  rillig }
    429   1.43  rillig 
    430   1.84  rillig /* Guess whether the current token is a declared type. */
    431   1.57  rillig static bool
    432  1.107  rillig probably_typename(void)
    433   1.57  rillig {
    434  1.109  rillig     if (ps.p_l_follow > 0)
    435   1.70  rillig 	return false;
    436  1.107  rillig     if (ps.block_init || ps.in_stmt)
    437   1.70  rillig 	return false;
    438   1.78  rillig     if (inp.s[0] == '*' && inp.s[1] != '=')
    439   1.70  rillig 	goto maybe;
    440   1.78  rillig     if (isalpha((unsigned char)*inp.s))
    441   1.70  rillig 	goto maybe;
    442   1.70  rillig     return false;
    443   1.70  rillig maybe:
    444  1.110  rillig     return ps.prev_token == lsym_semicolon ||
    445  1.110  rillig 	ps.prev_token == lsym_lbrace ||
    446  1.110  rillig 	ps.prev_token == lsym_rbrace;
    447   1.57  rillig }
    448   1.57  rillig 
    449   1.84  rillig static int
    450   1.84  rillig bsearch_typenames(const char *key)
    451   1.84  rillig {
    452   1.84  rillig     const char **arr = typenames.items;
    453   1.84  rillig     int lo = 0;
    454   1.84  rillig     int hi = (int)typenames.len - 1;
    455   1.84  rillig 
    456   1.84  rillig     while (lo <= hi) {
    457   1.84  rillig 	int mid = (int)((unsigned)(lo + hi) >> 1);
    458   1.84  rillig 	int cmp = strcmp(arr[mid], key);
    459   1.84  rillig 	if (cmp < 0)
    460   1.84  rillig 	    lo = mid + 1;
    461   1.84  rillig 	else if (cmp > 0)
    462   1.84  rillig 	    hi = mid - 1;
    463   1.84  rillig 	else
    464   1.84  rillig 	    return mid;
    465   1.84  rillig     }
    466   1.84  rillig     return -(lo + 1);
    467   1.84  rillig }
    468   1.84  rillig 
    469   1.63  rillig static bool
    470   1.63  rillig is_typename(void)
    471   1.63  rillig {
    472   1.84  rillig     if (opt.auto_typedefs &&
    473   1.84  rillig 	token.e - token.s >= 2 && memcmp(token.e - 2, "_t", 2) == 0)
    474   1.84  rillig 	return true;
    475   1.63  rillig 
    476   1.84  rillig     return bsearch_typenames(token.s) >= 0;
    477   1.63  rillig }
    478   1.63  rillig 
    479  1.115  rillig static int
    480  1.115  rillig cmp_keyword_by_name(const void *key, const void *elem)
    481  1.115  rillig {
    482  1.115  rillig     return strcmp(key, ((const struct keyword *)elem)->name);
    483  1.115  rillig }
    484  1.115  rillig 
    485   1.90  rillig /* Read an alphanumeric token into 'token', or return end_of_file. */
    486  1.100  rillig static lexer_symbol
    487  1.107  rillig lexi_alnum(void)
    488    1.1     cgd {
    489   1.89  rillig     if (isdigit((unsigned char)*inp.s) ||
    490   1.89  rillig 	(inp.s[0] == '.' && isdigit((unsigned char)inp.s[1]))) {
    491   1.89  rillig 	lex_number();
    492  1.103  rillig     } else if (isalnum((unsigned char)*inp.s) ||
    493  1.103  rillig 	    *inp.s == '_' || *inp.s == '$') {
    494   1.89  rillig 	lex_word();
    495  1.102  rillig     } else
    496  1.102  rillig 	return lsym_eof;	/* just as a placeholder */
    497  1.102  rillig 
    498   1.89  rillig     *token.e = '\0';
    499   1.16   kamil 
    500   1.89  rillig     if (token.s[0] == 'L' && token.s[1] == '\0' &&
    501   1.89  rillig 	(*inp.s == '"' || *inp.s == '\''))
    502  1.100  rillig 	return lsym_string_prefix;
    503   1.16   kamil 
    504  1.111  rillig     while (ch_isblank(inbuf_peek()))
    505   1.32  rillig 	inbuf_skip();
    506   1.89  rillig 
    507  1.110  rillig     if (ps.prev_token == lsym_tag && ps.p_l_follow == 0) {
    508  1.107  rillig 	ps.next_unary = true;
    509  1.100  rillig 	return lsym_type;
    510   1.16   kamil     }
    511    1.6   lukem 
    512   1.89  rillig     /* Operator after identifier is binary unless last token was 'struct'. */
    513  1.110  rillig     ps.next_unary = ps.prev_token == lsym_tag;
    514   1.16   kamil 
    515   1.89  rillig     const struct keyword *kw = bsearch(token.s, keywords,
    516   1.94  rillig 	array_length(keywords), sizeof(keywords[0]), cmp_keyword_by_name);
    517   1.89  rillig     if (kw == NULL) {
    518   1.89  rillig 	if (is_typename()) {
    519  1.107  rillig 	    ps.curr_keyword = kw_type;
    520  1.107  rillig 	    ps.next_unary = true;
    521   1.89  rillig 	    goto found_typename;
    522   1.16   kamil 	}
    523   1.89  rillig 
    524   1.89  rillig     } else {			/* we have a keyword */
    525  1.107  rillig 	ps.curr_keyword = kw->kind;
    526  1.107  rillig 	ps.next_unary = true;
    527   1.89  rillig 
    528  1.118  rillig 	/* INDENT OFF */
    529   1.89  rillig 	switch (kw->kind) {
    530  1.119  rillig 	case kw_tag:
    531  1.118  rillig 	case kw_type:		goto found_typename;
    532  1.118  rillig 	case kw_case_or_default: return lsym_case_label;
    533  1.118  rillig 	case kw_for:		return lsym_for;
    534  1.118  rillig 	case kw_if:		return lsym_if;
    535  1.118  rillig 	case kw_else:		return lsym_else;
    536  1.118  rillig 	case kw_switch:		return lsym_switch;
    537  1.118  rillig 	case kw_while:		return lsym_while;
    538  1.118  rillig 	case kw_do:		return lsym_do;
    539  1.118  rillig 	case kw_storage_class:	return lsym_storage_class;
    540  1.118  rillig 	case kw_typedef:	return lsym_typedef;
    541  1.121  rillig 	case kw_offsetof:	return lsym_offsetof;
    542  1.120  rillig 	case kw_sizeof:		return lsym_sizeof;
    543  1.118  rillig 	default:		return lsym_ident;
    544  1.118  rillig 	}
    545  1.118  rillig 	/* INDENT ON */
    546  1.118  rillig 
    547  1.118  rillig found_typename:
    548  1.118  rillig 	if (ps.p_l_follow > 0) {
    549  1.118  rillig 	    /* inside parentheses: cast, param list, offsetof or sizeof */
    550  1.118  rillig 	    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
    551  1.118  rillig 	}
    552  1.118  rillig 	if (ps.prev_token != lsym_period && ps.prev_token != lsym_unary_op) {
    553  1.119  rillig 	    if (kw != NULL && kw->kind == kw_tag)
    554  1.100  rillig 		return lsym_tag;
    555  1.118  rillig 	    if (ps.p_l_follow == 0)
    556  1.118  rillig 		return lsym_type;
    557   1.90  rillig 	}
    558   1.90  rillig     }
    559   1.89  rillig 
    560  1.107  rillig     if (*inp.s == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
    561  1.107  rillig 	!ps.in_parameter_declaration && !ps.block_init) {
    562   1.89  rillig 
    563   1.89  rillig 	for (const char *p = inp.s; p < inp.e;)
    564   1.89  rillig 	    if (*p++ == ')' && (*p == ';' || *p == ','))
    565  1.118  rillig 		goto no_function_definition;
    566   1.89  rillig 
    567  1.107  rillig 	strncpy(ps.procname, token.s, sizeof ps.procname - 1);
    568  1.107  rillig 	if (ps.in_decl)
    569  1.107  rillig 	    ps.in_parameter_declaration = true;
    570  1.100  rillig 	return lsym_funcname;
    571  1.118  rillig no_function_definition:;
    572   1.89  rillig 
    573  1.107  rillig     } else if (probably_typename()) {
    574  1.107  rillig 	ps.curr_keyword = kw_type;
    575  1.107  rillig 	ps.next_unary = true;
    576  1.100  rillig 	return lsym_type;
    577   1.89  rillig     }
    578   1.89  rillig 
    579  1.110  rillig     if (ps.prev_token == lsym_type)	/* if this is a declared variable,
    580   1.89  rillig 					 * then following sign is unary */
    581  1.107  rillig 	ps.next_unary = true;	/* will make "int a -1" work */
    582   1.89  rillig 
    583  1.100  rillig     return lsym_ident;		/* the ident is not in the list */
    584   1.89  rillig }
    585   1.75  rillig 
    586   1.89  rillig /* Reads the next token, placing it in the global variable "token". */
    587  1.100  rillig lexer_symbol
    588  1.106  rillig lexi(void)
    589   1.89  rillig {
    590   1.90  rillig     token.e = token.s;
    591  1.117  rillig     ps.curr_col_1 = ps.curr_newline;
    592  1.117  rillig     ps.curr_newline = false;
    593  1.107  rillig     ps.prev_keyword = ps.curr_keyword;
    594  1.107  rillig     ps.curr_keyword = kw_0;
    595   1.75  rillig 
    596  1.111  rillig     while (ch_isblank(*inp.s)) {
    597  1.117  rillig 	ps.curr_col_1 = false;
    598   1.89  rillig 	inbuf_skip();
    599   1.89  rillig     }
    600   1.75  rillig 
    601  1.107  rillig     lexer_symbol alnum_lsym = lexi_alnum();
    602  1.100  rillig     if (alnum_lsym != lsym_eof)
    603  1.107  rillig 	return lexi_end(alnum_lsym);
    604   1.16   kamil 
    605   1.16   kamil     /* Scan a non-alphanumeric token */
    606   1.16   kamil 
    607   1.90  rillig     check_size_token(3);	/* for things like "<<=" */
    608   1.90  rillig     *token.e++ = inbuf_next();
    609   1.50  rillig     *token.e = '\0';
    610   1.16   kamil 
    611  1.100  rillig     lexer_symbol lsym;
    612   1.89  rillig     bool unary_delim = false;	/* whether the current token forces a
    613   1.89  rillig 				 * following operator to be unary */
    614   1.89  rillig 
    615   1.50  rillig     switch (*token.s) {
    616   1.16   kamil     case '\n':
    617  1.107  rillig 	unary_delim = ps.next_unary;
    618  1.117  rillig 	ps.curr_newline = true;
    619   1.47  rillig 	/* if data has been exhausted, the newline is a dummy. */
    620  1.100  rillig 	lsym = had_eof ? lsym_eof : lsym_newline;
    621   1.16   kamil 	break;
    622   1.16   kamil 
    623   1.43  rillig     case '\'':
    624   1.43  rillig     case '"':
    625   1.44  rillig 	lex_char_or_string();
    626  1.100  rillig 	lsym = lsym_ident;
    627   1.16   kamil 	break;
    628    1.6   lukem 
    629   1.40  rillig     case '(':
    630   1.40  rillig     case '[':
    631   1.16   kamil 	unary_delim = true;
    632  1.100  rillig 	lsym = lsym_lparen_or_lbracket;
    633   1.16   kamil 	break;
    634   1.16   kamil 
    635   1.40  rillig     case ')':
    636   1.40  rillig     case ']':
    637  1.100  rillig 	lsym = lsym_rparen_or_rbracket;
    638   1.16   kamil 	break;
    639   1.16   kamil 
    640   1.16   kamil     case '#':
    641  1.107  rillig 	unary_delim = ps.next_unary;
    642  1.100  rillig 	lsym = lsym_preprocessing;
    643   1.16   kamil 	break;
    644   1.16   kamil 
    645   1.16   kamil     case '?':
    646   1.16   kamil 	unary_delim = true;
    647  1.100  rillig 	lsym = lsym_question;
    648   1.16   kamil 	break;
    649   1.16   kamil 
    650   1.40  rillig     case ':':
    651  1.100  rillig 	lsym = lsym_colon;
    652   1.16   kamil 	unary_delim = true;
    653   1.16   kamil 	break;
    654   1.16   kamil 
    655   1.40  rillig     case ';':
    656   1.16   kamil 	unary_delim = true;
    657  1.100  rillig 	lsym = lsym_semicolon;
    658   1.16   kamil 	break;
    659   1.16   kamil 
    660   1.40  rillig     case '{':
    661   1.16   kamil 	unary_delim = true;
    662  1.100  rillig 	lsym = lsym_lbrace;
    663   1.16   kamil 	break;
    664   1.16   kamil 
    665   1.40  rillig     case '}':
    666   1.16   kamil 	unary_delim = true;
    667  1.100  rillig 	lsym = lsym_rbrace;
    668   1.16   kamil 	break;
    669   1.16   kamil 
    670   1.69  rillig     case '\f':
    671  1.107  rillig 	unary_delim = ps.next_unary;
    672  1.117  rillig 	ps.curr_newline = true;
    673  1.100  rillig 	lsym = lsym_form_feed;
    674   1.16   kamil 	break;
    675   1.16   kamil 
    676   1.40  rillig     case ',':
    677   1.16   kamil 	unary_delim = true;
    678  1.100  rillig 	lsym = lsym_comma;
    679   1.16   kamil 	break;
    680   1.16   kamil 
    681   1.16   kamil     case '.':
    682   1.16   kamil 	unary_delim = false;
    683  1.100  rillig 	lsym = lsym_period;
    684   1.16   kamil 	break;
    685    1.1     cgd 
    686   1.16   kamil     case '-':
    687   1.90  rillig     case '+':
    688  1.107  rillig 	lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
    689   1.16   kamil 	unary_delim = true;
    690   1.16   kamil 
    691   1.90  rillig 	if (*inp.s == token.s[0]) {	/* ++, -- */
    692   1.78  rillig 	    *token.e++ = *inp.s++;
    693  1.110  rillig 	    if (ps.prev_token == lsym_ident ||
    694  1.110  rillig 		    ps.prev_token == lsym_rparen_or_rbracket) {
    695  1.107  rillig 		lsym = ps.next_unary ? lsym_unary_op : lsym_postfix_op;
    696    1.1     cgd 		unary_delim = false;
    697   1.16   kamil 	    }
    698   1.75  rillig 
    699   1.90  rillig 	} else if (*inp.s == '=') {	/* += */
    700   1.78  rillig 	    *token.e++ = *inp.s++;
    701   1.75  rillig 
    702   1.90  rillig 	} else if (*inp.s == '>') {	/* -> */
    703   1.78  rillig 	    *token.e++ = *inp.s++;
    704   1.16   kamil 	    unary_delim = false;
    705  1.100  rillig 	    lsym = lsym_unary_op;
    706  1.107  rillig 	    ps.want_blank = false;
    707   1.16   kamil 	}
    708   1.90  rillig 	break;
    709   1.16   kamil 
    710   1.16   kamil     case '=':
    711  1.107  rillig 	if (ps.init_or_struct)
    712  1.107  rillig 	    ps.block_init = true;
    713   1.78  rillig 	if (*inp.s == '=') {	/* == */
    714   1.78  rillig 	    *token.e++ = *inp.s++;
    715   1.67  rillig 	    *token.e = '\0';
    716   1.16   kamil 	}
    717  1.100  rillig 	lsym = lsym_binary_op;
    718   1.16   kamil 	unary_delim = true;
    719   1.16   kamil 	break;
    720   1.16   kamil 
    721   1.16   kamil     case '>':
    722   1.16   kamil     case '<':
    723   1.16   kamil     case '!':			/* ops like <, <<, <=, !=, etc */
    724   1.78  rillig 	if (*inp.s == '>' || *inp.s == '<' || *inp.s == '=')
    725   1.50  rillig 	    *token.e++ = inbuf_next();
    726   1.78  rillig 	if (*inp.s == '=')
    727   1.78  rillig 	    *token.e++ = *inp.s++;
    728  1.107  rillig 	lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
    729   1.16   kamil 	unary_delim = true;
    730   1.16   kamil 	break;
    731   1.16   kamil 
    732   1.16   kamil     case '*':
    733   1.16   kamil 	unary_delim = true;
    734  1.107  rillig 	if (!ps.next_unary) {
    735   1.78  rillig 	    if (*inp.s == '=')
    736   1.78  rillig 		*token.e++ = *inp.s++;
    737  1.100  rillig 	    lsym = lsym_binary_op;
    738   1.16   kamil 	    break;
    739   1.16   kamil 	}
    740   1.75  rillig 
    741   1.78  rillig 	while (*inp.s == '*' || isspace((unsigned char)*inp.s)) {
    742   1.87  rillig 	    if (*inp.s == '*')
    743   1.87  rillig 		token_add_char('*');
    744   1.32  rillig 	    inbuf_skip();
    745   1.16   kamil 	}
    746   1.75  rillig 
    747   1.16   kamil 	if (ps.in_decl) {
    748   1.78  rillig 	    char *tp = inp.s;
    749    1.6   lukem 
    750   1.16   kamil 	    while (isalpha((unsigned char)*tp) ||
    751  1.103  rillig 		    isspace((unsigned char)*tp)) {
    752   1.78  rillig 		if (++tp >= inp.e)
    753   1.81  rillig 		    inbuf_read_line();
    754   1.16   kamil 	    }
    755   1.16   kamil 	    if (*tp == '(')
    756   1.16   kamil 		ps.procname[0] = ' ';
    757   1.16   kamil 	}
    758   1.75  rillig 
    759  1.100  rillig 	lsym = lsym_unary_op;
    760   1.16   kamil 	break;
    761    1.1     cgd 
    762   1.16   kamil     default:
    763   1.78  rillig 	if (token.s[0] == '/' && (*inp.s == '*' || *inp.s == '/')) {
    764   1.16   kamil 	    /* it is start of comment */
    765   1.50  rillig 	    *token.e++ = inbuf_next();
    766    1.1     cgd 
    767  1.100  rillig 	    lsym = lsym_comment;
    768  1.107  rillig 	    unary_delim = ps.next_unary;
    769   1.16   kamil 	    break;
    770    1.1     cgd 	}
    771   1.75  rillig 
    772   1.78  rillig 	while (token.e[-1] == *inp.s || *inp.s == '=') {
    773   1.87  rillig 	    /* handle '||', '&&', etc., and also things as in 'int *****i' */
    774   1.87  rillig 	    token_add_char(inbuf_next());
    775   1.16   kamil 	}
    776   1.75  rillig 
    777  1.107  rillig 	lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
    778   1.16   kamil 	unary_delim = true;
    779   1.47  rillig     }
    780   1.16   kamil 
    781   1.95  rillig     if (inp.s >= inp.e)		/* check for input buffer empty */
    782   1.81  rillig 	inbuf_read_line();
    783   1.75  rillig 
    784  1.107  rillig     ps.next_unary = unary_delim;
    785   1.75  rillig 
    786   1.25  rillig     check_size_token(1);
    787   1.50  rillig     *token.e = '\0';
    788   1.75  rillig 
    789  1.107  rillig     return lexi_end(lsym);
    790    1.1     cgd }
    791   1.16   kamil 
    792    1.6   lukem void
    793   1.64  rillig add_typename(const char *name)
    794    1.1     cgd {
    795   1.64  rillig     if (typenames.len >= typenames.cap) {
    796   1.64  rillig 	typenames.cap = 16 + 2 * typenames.cap;
    797   1.64  rillig 	typenames.items = xrealloc(typenames.items,
    798   1.64  rillig 	    sizeof(typenames.items[0]) * typenames.cap);
    799   1.64  rillig     }
    800   1.16   kamil 
    801   1.84  rillig     int pos = bsearch_typenames(name);
    802   1.64  rillig     if (pos >= 0)
    803   1.64  rillig 	return;			/* already in the list */
    804   1.75  rillig 
    805   1.64  rillig     pos = -(pos + 1);
    806   1.64  rillig     memmove(typenames.items + pos + 1, typenames.items + pos,
    807   1.73  rillig 	sizeof(typenames.items[0]) * (typenames.len++ - (unsigned)pos));
    808   1.64  rillig     typenames.items[pos] = xstrdup(name);
    809    1.1     cgd }
    810