Home | History | Annotate | Line # | Download | only in indent
lexi.c revision 1.201
      1  1.201  rillig /*	$NetBSD: lexi.c,v 1.201 2023/05/20 01:28:14 rillig Exp $	*/
      2    1.3     tls 
      3   1.16   kamil /*-
      4   1.16   kamil  * SPDX-License-Identifier: BSD-4-Clause
      5   1.16   kamil  *
      6   1.16   kamil  * Copyright (c) 1985 Sun Microsystems, Inc.
      7    1.5     mrg  * Copyright (c) 1980, 1993
      8    1.5     mrg  *	The Regents of the University of California.  All rights reserved.
      9    1.1     cgd  * All rights reserved.
     10    1.1     cgd  *
     11    1.1     cgd  * Redistribution and use in source and binary forms, with or without
     12    1.1     cgd  * modification, are permitted provided that the following conditions
     13    1.1     cgd  * are met:
     14    1.1     cgd  * 1. Redistributions of source code must retain the above copyright
     15    1.1     cgd  *    notice, this list of conditions and the following disclaimer.
     16    1.1     cgd  * 2. Redistributions in binary form must reproduce the above copyright
     17    1.1     cgd  *    notice, this list of conditions and the following disclaimer in the
     18    1.1     cgd  *    documentation and/or other materials provided with the distribution.
     19    1.1     cgd  * 3. All advertising materials mentioning features or use of this software
     20    1.1     cgd  *    must display the following acknowledgement:
     21    1.1     cgd  *	This product includes software developed by the University of
     22    1.1     cgd  *	California, Berkeley and its contributors.
     23    1.1     cgd  * 4. Neither the name of the University nor the names of its contributors
     24    1.1     cgd  *    may be used to endorse or promote products derived from this software
     25    1.1     cgd  *    without specific prior written permission.
     26    1.1     cgd  *
     27    1.1     cgd  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     28    1.1     cgd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29    1.1     cgd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30    1.1     cgd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     31    1.1     cgd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     32    1.1     cgd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     33    1.1     cgd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     34    1.1     cgd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     35    1.1     cgd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     36    1.1     cgd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     37    1.1     cgd  * SUCH DAMAGE.
     38    1.1     cgd  */
     39    1.1     cgd 
     40    1.6   lukem #include <sys/cdefs.h>
     41  1.201  rillig __RCSID("$NetBSD: lexi.c,v 1.201 2023/05/20 01:28:14 rillig Exp $");
     42    1.1     cgd 
     43    1.1     cgd #include <stdlib.h>
     44    1.1     cgd #include <string.h>
     45   1.16   kamil 
     46   1.16   kamil #include "indent.h"
     47    1.1     cgd 
     48  1.181  rillig /* In lexi_alnum, this constant marks a type, independent of parentheses. */
     49  1.135  rillig #define lsym_type lsym_type_outside_parentheses
     50  1.191  rillig #define lsym_type_modifier lsym_storage_class
     51  1.127  rillig 
     52   1.60  rillig /* must be sorted alphabetically, is used in binary search */
     53   1.62  rillig static const struct keyword {
     54  1.198  rillig 	const char name[12];
     55  1.198  rillig 	lexer_symbol lsym;
     56   1.62  rillig } keywords[] = {
     57  1.198  rillig 	{"_Bool", lsym_type},
     58  1.198  rillig 	{"_Complex", lsym_type},
     59  1.198  rillig 	{"_Imaginary", lsym_type},
     60  1.198  rillig 	{"auto", lsym_storage_class},
     61  1.198  rillig 	{"bool", lsym_type},
     62  1.198  rillig 	{"break", lsym_word},
     63  1.198  rillig 	{"case", lsym_case_label},
     64  1.198  rillig 	{"char", lsym_type},
     65  1.198  rillig 	{"complex", lsym_type},
     66  1.198  rillig 	{"const", lsym_type_modifier},
     67  1.198  rillig 	{"continue", lsym_word},
     68  1.198  rillig 	{"default", lsym_case_label},
     69  1.198  rillig 	{"do", lsym_do},
     70  1.198  rillig 	{"double", lsym_type},
     71  1.198  rillig 	{"else", lsym_else},
     72  1.198  rillig 	{"enum", lsym_tag},
     73  1.198  rillig 	{"extern", lsym_storage_class},
     74  1.198  rillig 	{"float", lsym_type},
     75  1.198  rillig 	{"for", lsym_for},
     76  1.198  rillig 	{"goto", lsym_word},
     77  1.198  rillig 	{"if", lsym_if},
     78  1.198  rillig 	{"imaginary", lsym_type},
     79  1.198  rillig 	{"inline", lsym_word},
     80  1.198  rillig 	{"int", lsym_type},
     81  1.198  rillig 	{"long", lsym_type},
     82  1.198  rillig 	{"offsetof", lsym_offsetof},
     83  1.198  rillig 	{"register", lsym_storage_class},
     84  1.198  rillig 	{"restrict", lsym_word},
     85  1.198  rillig 	{"return", lsym_return},
     86  1.198  rillig 	{"short", lsym_type},
     87  1.198  rillig 	{"signed", lsym_type},
     88  1.198  rillig 	{"sizeof", lsym_sizeof},
     89  1.198  rillig 	{"static", lsym_storage_class},
     90  1.198  rillig 	{"struct", lsym_tag},
     91  1.198  rillig 	{"switch", lsym_switch},
     92  1.198  rillig 	{"typedef", lsym_typedef},
     93  1.198  rillig 	{"union", lsym_tag},
     94  1.198  rillig 	{"unsigned", lsym_type},
     95  1.198  rillig 	{"void", lsym_type},
     96  1.198  rillig 	{"volatile", lsym_type_modifier},
     97  1.198  rillig 	{"while", lsym_while}
     98    1.1     cgd };
     99    1.1     cgd 
/*
 * Growable array of registered type names.  register_typename inserts
 * each name at its strcmp-sorted position, which is what allows
 * bsearch_typenames to do a binary search.
 */
static struct {
	const char **items;	/* the names, in strcmp order */
	unsigned len;		/* number of names in use */
	unsigned cap;		/* number of slots allocated for 'items' */
} typenames;
    105   1.16   kamil 
/*-
 * The transition table below was rewritten by hand from lx's output, given
 * the following definitions. lx is Katherine Flavel's lexer generator.
 *
 * O  = /[0-7]/;        D  = /[0-9]/;          NZ = /[1-9]/;
 * H  = /[a-f0-9]/i;    B  = /[0-1]/;          HP = /0x/i;
 * BP = /0b/i;          E  = /e[+\-]?/i D+;    P  = /p[+\-]?/i D+;
 * FS = /[fl]/i;        IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?;
 *
 * D+           E  FS? -> $float;
 * D*    "." D+ E? FS? -> $float;
 * D+    "."    E? FS? -> $float;    HP H+           IS? -> $int;
 * HP H+        P  FS? -> $float;    NZ D*           IS? -> $int;
 * HP H* "." H+ P  FS? -> $float;    "0" O*          IS? -> $int;
 * HP H+ "."    P  FS  -> $float;    BP B+           IS? -> $int;
 *
 * How lex_number reads the table: the column is the current state
 * ('A' to 'Z', starting in 'A'), the row is the class of the next input
 * character as given by lex_number_row.  The cell holds the next state;
 * a blank means that the character does not continue the number, and
 * the lowercase cells 'f' and 'i' end the scan after consuming the
 * character.  Row 0 is not a transition row; it records for each state
 * the kind of number read so far (f = floating, i = integer,
 * u = unknown).
 */
/* INDENT OFF */
static const unsigned char lex_number_state[][26] = {
	/*                examples:
	                                 00
	         s                      0xx
	         t                    00xaa
	         a     11       101100xxa..
	         r   11ee0001101lbuuxx.a.pp
	         t.01.e+008bLuxll0Ll.aa.p+0
	states:  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
	[0] =   "uuiifuufiuuiiuiiiiiuiuuuuu",	/* (other) */
	[1] =   "CEIDEHHHIJQ  U  Q  VUVVZZZ",	/* 0 */
	[2] =   "DEIDEHHHIJQ  U  Q  VUVVZZZ",	/* 1 */
	[3] =   "DEIDEHHHIJ   U     VUVVZZZ",	/* 2 3 4 5 6 7 */
	[4] =   "DEJDEHHHJJ   U     VUVVZZZ",	/* 8 9 */
	[5] =   "             U     VUVV   ",	/* A a C c D d */
	[6] =   "  K          U     VUVV   ",	/* B b */
	[7] =   "  FFF   FF   U     VUVV   ",	/* E e */
	[8] =   "    f  f     U     VUVV  f",	/* F f */
	[9] =   "  LLf  fL  PR   Li  L    f",	/* L */
	[10] =  "  OOf  fO   S P O i O    f",	/* l */
	[11] =  "                    FFX   ",	/* P p */
	[12] =  "  MM    M  i  iiM   M     ",	/* U u */
	[13] =  "  N                       ",	/* X x */
	[14] =  "     G                 Y  ",	/* + - */
	[15] =  "B EE    EE   T      W     ",	/* . */
	/*       ABCDEFGHIJKLMNOPQRSTUVWXYZ */
};
/* INDENT ON */
    151    1.1     cgd 
/*
 * Maps an input character to its row in lex_number_state.  Characters
 * that are not listed get row 0, which lex_number treats as "cannot be
 * part of a number".
 */
static const unsigned char lex_number_row[] = {
	/* digits */
	['0'] = 1,
	['1'] = 2,
	['2'] = 3, ['3'] = 3, ['4'] = 3,
	['5'] = 3, ['6'] = 3, ['7'] = 3,
	['8'] = 4, ['9'] = 4,

	/* letters, upper case next to lower case */
	['A'] = 5,  ['a'] = 5,
	['B'] = 6,  ['b'] = 6,
	['C'] = 5,  ['c'] = 5,
	['D'] = 5,  ['d'] = 5,
	['E'] = 7,  ['e'] = 7,
	['F'] = 8,  ['f'] = 8,
	['L'] = 9,  ['l'] = 10,
	['P'] = 11, ['p'] = 11,
	['U'] = 12, ['u'] = 12,
	['X'] = 13, ['x'] = 13,

	/* punctuation */
	['+'] = 14, ['-'] = 14,
	['.'] = 15,
};
    169   1.36  rillig 
    170   1.25  rillig static void
    171   1.87  rillig token_add_char(char ch)
    172   1.87  rillig {
    173  1.198  rillig 	buf_add_char(&token, ch);
    174   1.87  rillig }
    175   1.87  rillig 
    176   1.43  rillig static void
    177   1.43  rillig lex_number(void)
    178   1.43  rillig {
    179  1.198  rillig 	for (unsigned char s = 'A'; s != 'f' && s != 'i' && s != 'u';) {
    180  1.198  rillig 		unsigned char ch = (unsigned char)inp.st[0];
    181  1.198  rillig 		if (ch == '\\' && inp.st[1] == '\n') {
    182  1.198  rillig 			inp.st++;
    183  1.198  rillig 			inp_skip();
    184  1.198  rillig 			line_no++;
    185  1.198  rillig 			continue;
    186  1.198  rillig 		}
    187  1.199  rillig 		if (ch >= array_length(lex_number_row)
    188  1.199  rillig 		    || lex_number_row[ch] == 0)
    189  1.198  rillig 			break;
    190  1.198  rillig 
    191  1.198  rillig 		unsigned char row = lex_number_row[ch];
    192  1.198  rillig 		if (lex_number_state[row][s - 'A'] == ' ') {
    193  1.198  rillig 			/*-
    194  1.198  rillig 		         * lex_number_state[0][s - 'A'] now indicates the type:
    195  1.198  rillig 		         * f = floating, i = integer, u = unknown
    196  1.198  rillig 		         */
    197  1.198  rillig 			return;
    198  1.198  rillig 		}
    199  1.198  rillig 
    200  1.198  rillig 		s = lex_number_state[row][s - 'A'];
    201  1.198  rillig 		token_add_char(inp_next());
    202   1.43  rillig 	}
    203   1.43  rillig }
    204   1.43  rillig 
/* Whether 'ch' may begin an identifier: a letter, '_' or '$'. */
static bool
is_identifier_start(char ch)
{
	if (ch_isalpha(ch))
		return true;
	return ch == '_' || ch == '$';
}
    210  1.146  rillig 
/* Whether 'ch' may continue an identifier: a letter, digit, '_' or '$'. */
static bool
is_identifier_part(char ch)
{
	if (ch_isalnum(ch))
		return true;
	return ch == '_' || ch == '$';
}
    216  1.145  rillig 
    217   1.43  rillig static void
    218   1.43  rillig lex_word(void)
    219   1.43  rillig {
    220  1.198  rillig 	for (;;) {
    221  1.198  rillig 		if (is_identifier_part(inp.st[0]))
    222  1.198  rillig 			token_add_char(*inp.st++);
    223  1.198  rillig 		else if (inp.st[0] == '\\' && inp.st[1] == '\n') {
    224  1.198  rillig 			inp.st++;
    225  1.198  rillig 			inp_skip();
    226  1.198  rillig 			line_no++;
    227  1.198  rillig 		} else
    228  1.198  rillig 			return;
    229  1.198  rillig 	}
    230   1.43  rillig }
    231   1.43  rillig 
    232   1.43  rillig static void
    233   1.43  rillig lex_char_or_string(void)
    234   1.43  rillig {
    235  1.198  rillig 	for (char delim = token.mem[token.len - 1];;) {
    236  1.198  rillig 		if (inp.st[0] == '\n') {
    237  1.198  rillig 			diag(1, "Unterminated literal");
    238  1.198  rillig 			return;
    239  1.198  rillig 		}
    240  1.198  rillig 
    241  1.198  rillig 		token_add_char(*inp.st++);
    242  1.198  rillig 		if (token.mem[token.len - 1] == delim)
    243  1.198  rillig 			return;
    244  1.198  rillig 
    245  1.198  rillig 		if (token.mem[token.len - 1] == '\\') {
    246  1.198  rillig 			if (inp.st[0] == '\n')
    247  1.198  rillig 				++line_no;
    248  1.198  rillig 			token_add_char(inp_next());
    249  1.198  rillig 		}
    250   1.52  rillig 	}
    251   1.43  rillig }
    252   1.43  rillig 
    253   1.84  rillig /* Guess whether the current token is a declared type. */
    254   1.57  rillig static bool
    255  1.107  rillig probably_typename(void)
    256   1.57  rillig {
    257  1.198  rillig 	if (ps.prev_token == lsym_storage_class)
    258  1.198  rillig 		return true;
    259  1.198  rillig 	if (ps.block_init)
    260  1.198  rillig 		return false;
    261  1.198  rillig 	if (ps.in_stmt_or_decl)	/* XXX: this condition looks incorrect */
    262  1.198  rillig 		return false;
    263  1.198  rillig 	if (inp.st[0] == '*' && inp.st[1] != '=')
    264  1.198  rillig 		goto maybe;
    265  1.198  rillig 	/* XXX: is_identifier_start */
    266  1.198  rillig 	if (ch_isalpha(inp.st[0]))
    267  1.198  rillig 		goto maybe;
    268   1.70  rillig 	return false;
    269   1.70  rillig maybe:
    270  1.198  rillig 	return ps.prev_token == lsym_semicolon ||
    271  1.198  rillig 	    ps.prev_token == lsym_lbrace ||
    272  1.198  rillig 	    ps.prev_token == lsym_rbrace;
    273   1.57  rillig }
    274   1.57  rillig 
    275   1.84  rillig static int
    276   1.84  rillig bsearch_typenames(const char *key)
    277   1.84  rillig {
    278  1.198  rillig 	const char **arr = typenames.items;
    279  1.198  rillig 	int lo = 0;
    280  1.198  rillig 	int hi = (int)typenames.len - 1;
    281  1.198  rillig 
    282  1.198  rillig 	while (lo <= hi) {
    283  1.198  rillig 		int mid = (int)((unsigned)(lo + hi) >> 1);
    284  1.198  rillig 		int cmp = strcmp(arr[mid], key);
    285  1.198  rillig 		if (cmp < 0)
    286  1.198  rillig 			lo = mid + 1;
    287  1.198  rillig 		else if (cmp > 0)
    288  1.198  rillig 			hi = mid - 1;
    289  1.198  rillig 		else
    290  1.198  rillig 			return mid;
    291  1.198  rillig 	}
    292  1.198  rillig 	return -(lo + 1);
    293   1.84  rillig }
    294   1.84  rillig 
    295   1.63  rillig static bool
    296   1.63  rillig is_typename(void)
    297   1.63  rillig {
    298  1.198  rillig 	if (opt.auto_typedefs &&
    299  1.189  rillig 	    token.len >= 2 && memcmp(token.mem + token.len - 2, "_t", 2) == 0)
    300  1.198  rillig 		return true;
    301   1.63  rillig 
    302  1.198  rillig 	return bsearch_typenames(token.st) >= 0;
    303   1.63  rillig }
    304   1.63  rillig 
    305  1.115  rillig static int
    306  1.115  rillig cmp_keyword_by_name(const void *key, const void *elem)
    307  1.115  rillig {
    308  1.198  rillig 	return strcmp(key, ((const struct keyword *)elem)->name);
    309  1.115  rillig }
    310  1.115  rillig 
    311  1.165  rillig /*
    312  1.166  rillig  * Looking at something like 'function_name(...)' in a line, guess whether
    313  1.165  rillig  * this starts a function definition or a declaration.
    314  1.165  rillig  */
    315  1.155  rillig static bool
    316  1.155  rillig probably_looking_at_definition(void)
    317  1.155  rillig {
    318  1.198  rillig 	int paren_level = 0;
    319  1.198  rillig 	for (const char *p = inp.st; *p != '\n'; p++) {
    320  1.198  rillig 		if (*p == '(')
    321  1.198  rillig 			paren_level++;
    322  1.198  rillig 		if (*p == ')' && --paren_level == 0) {
    323  1.198  rillig 			p++;
    324  1.198  rillig 
    325  1.199  rillig 			while (*p != '\n'
    326  1.199  rillig 			    && (ch_isspace(*p) || is_identifier_part(*p)))
    327  1.198  rillig 				p++;	/* '__dead' or '__unused' */
    328  1.198  rillig 
    329  1.198  rillig 			if (*p == '\n')	/* func(...) */
    330  1.198  rillig 				break;
    331  1.198  rillig 			if (*p == ';')	/* func(...); */
    332  1.198  rillig 				return false;
    333  1.198  rillig 			if (*p == ',')	/* double abs(), pi; */
    334  1.198  rillig 				return false;
    335  1.198  rillig 			if (*p == '(')	/* func(...) __attribute__((...)) */
    336  1.198  rillig 				paren_level++;	/* func(...) __printflike(...)
    337  1.198  rillig 						 */
    338  1.198  rillig 			else
    339  1.198  rillig 				break;	/* func(...) { ... */
    340  1.198  rillig 		}
    341  1.198  rillig 	}
    342  1.198  rillig 
    343  1.198  rillig 	/* To further reduce the cases where indent wrongly treats an
    344  1.198  rillig 	 * incomplete function declaration as a function definition, thus
    345  1.198  rillig 	 * adding a newline before the function name, it may be worth looking
    346  1.198  rillig 	 * for parameter names, as these are often omitted in function
    347  1.198  rillig 	 * declarations and only included in function definitions. Or just
    348  1.198  rillig 	 * increase the lookahead to more than just the current line of input,
    349  1.198  rillig 	 * until the next '{'. */
    350  1.198  rillig 	return true;
    351  1.155  rillig }
    352  1.155  rillig 
    353  1.138  rillig /* Read an alphanumeric token into 'token', or return lsym_eof. */
    354  1.100  rillig static lexer_symbol
    355  1.107  rillig lexi_alnum(void)
    356    1.1     cgd {
    357  1.198  rillig 	if (ch_isdigit(inp.st[0]) ||
    358  1.197  rillig 	    (inp.st[0] == '.' && ch_isdigit(inp.st[1]))) {
    359  1.198  rillig 		lex_number();
    360  1.198  rillig 	} else if (is_identifier_start(inp.st[0])) {
    361  1.198  rillig 		lex_word();
    362  1.198  rillig 
    363  1.198  rillig 		if (token.len == 1 && token.st[0] == 'L' &&
    364  1.198  rillig 		    (inp.st[0] == '"' || inp.st[0] == '\'')) {
    365  1.198  rillig 			token_add_char(*inp.st++);
    366  1.198  rillig 			lex_char_or_string();
    367  1.198  rillig 			ps.next_unary = false;
    368  1.198  rillig 			return lsym_word;
    369  1.198  rillig 		}
    370  1.198  rillig 	} else
    371  1.198  rillig 		return lsym_eof;	/* just as a placeholder */
    372  1.198  rillig 
    373  1.198  rillig 	while (ch_isblank(inp.st[0]))
    374  1.198  rillig 		inp.st++;
    375  1.198  rillig 
    376  1.198  rillig 	ps.next_unary = ps.prev_token == lsym_tag
    377  1.198  rillig 	    || ps.prev_token == lsym_typedef;
    378  1.198  rillig 
    379  1.198  rillig 	if (ps.prev_token == lsym_tag && ps.nparen == 0)
    380  1.198  rillig 		return lsym_type_outside_parentheses;
    381  1.198  rillig 
    382  1.198  rillig 	token_add_char('\0');
    383  1.198  rillig 	token.len--;
    384  1.198  rillig 	const struct keyword *kw = bsearch(token.st, keywords,
    385  1.198  rillig 	    array_length(keywords), sizeof(keywords[0]), cmp_keyword_by_name);
    386  1.201  rillig 	lexer_symbol lsym = lsym_word;
    387  1.201  rillig 	if (kw != NULL) {
    388  1.201  rillig 		if (kw->lsym == lsym_type)
    389  1.201  rillig 			lsym = lsym_type_in_parentheses;
    390  1.201  rillig 		ps.next_unary = true;
    391  1.201  rillig 		if (kw->lsym == lsym_tag || kw->lsym == lsym_type)
    392  1.198  rillig 			goto found_typename;
    393  1.201  rillig 		return kw->lsym;
    394  1.201  rillig 	}
    395  1.198  rillig 
    396  1.201  rillig 	if (is_typename()) {
    397  1.201  rillig 		lsym = lsym_type_in_parentheses;
    398  1.198  rillig 		ps.next_unary = true;
    399  1.201  rillig 		if (ps.in_enum == in_enum_enum)
    400  1.201  rillig 			ps.in_enum = in_enum_type;
    401  1.118  rillig found_typename:
    402  1.198  rillig 		if (ps.nparen > 0) {
    403  1.198  rillig 			/* inside parentheses: cast, param list, offsetof or
    404  1.198  rillig 			 * sizeof */
    405  1.198  rillig 			if (ps.paren[ps.nparen - 1].cast == cast_unknown)
    406  1.198  rillig 				ps.paren[ps.nparen - 1].cast = cast_maybe;
    407  1.198  rillig 		}
    408  1.199  rillig 		if (ps.prev_token != lsym_period
    409  1.199  rillig 		    && ps.prev_token != lsym_unary_op) {
    410  1.198  rillig 			if (kw != NULL && kw->lsym == lsym_tag) {
    411  1.198  rillig 				if (token.st[0] == 'e' /* enum */)
    412  1.198  rillig 					ps.in_enum = in_enum_enum;
    413  1.198  rillig 				return lsym_tag;
    414  1.198  rillig 			}
    415  1.198  rillig 			if (ps.nparen == 0)
    416  1.198  rillig 				return lsym_type_outside_parentheses;
    417  1.198  rillig 		}
    418   1.90  rillig 	}
    419   1.89  rillig 
    420  1.198  rillig 	if (inp.st[0] == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
    421  1.189  rillig 	    !ps.in_func_def_params && !ps.block_init) {
    422   1.89  rillig 
    423  1.198  rillig 		if (ps.nparen == 0 && probably_looking_at_definition()) {
    424  1.198  rillig 			ps.is_function_definition = true;
    425  1.198  rillig 			if (ps.in_decl)
    426  1.198  rillig 				ps.in_func_def_params = true;
    427  1.198  rillig 			return lsym_funcname;
    428  1.198  rillig 		}
    429  1.198  rillig 
    430  1.198  rillig 	} else if (ps.nparen == 0 && probably_typename()) {
    431  1.198  rillig 		ps.next_unary = true;
    432  1.198  rillig 		return lsym_type_outside_parentheses;
    433  1.155  rillig 	}
    434   1.89  rillig 
    435  1.201  rillig 	return lsym;
    436   1.89  rillig }
    437   1.75  rillig 
    438  1.163  rillig static bool
    439  1.163  rillig is_asterisk_unary(void)
    440  1.163  rillig {
    441  1.198  rillig 	if (ps.next_unary || ps.in_func_def_params)
    442  1.198  rillig 		return true;
    443  1.198  rillig 	if (ps.prev_token == lsym_word ||
    444  1.163  rillig 	    ps.prev_token == lsym_rparen_or_rbracket)
    445  1.198  rillig 		return false;
    446  1.198  rillig 	return ps.in_decl && ps.nparen > 0;
    447  1.163  rillig }
    448  1.163  rillig 
    449  1.200  rillig static bool
    450  1.200  rillig probably_in_function_definition(void)
    451  1.200  rillig {
    452  1.200  rillig 	for (const char *tp = inp.st; *tp != '\n';) {
    453  1.200  rillig 		if (ch_isspace(*tp))
    454  1.200  rillig 			tp++;
    455  1.200  rillig 		else if (is_identifier_start(*tp)) {
    456  1.200  rillig 			tp++;
    457  1.200  rillig 			while (is_identifier_part(*tp))
    458  1.200  rillig 				tp++;
    459  1.200  rillig 		} else
    460  1.200  rillig 			return *tp == '(';
    461  1.200  rillig 	}
    462  1.200  rillig 	return false;
    463  1.200  rillig }
    464  1.200  rillig 
    465  1.161  rillig static void
    466  1.161  rillig lex_asterisk_unary(void)
    467  1.161  rillig {
    468  1.198  rillig 	while (inp.st[0] == '*' || ch_isspace(inp.st[0])) {
    469  1.198  rillig 		if (inp.st[0] == '*')
    470  1.198  rillig 			token_add_char('*');
    471  1.198  rillig 		inp_skip();
    472  1.198  rillig 	}
    473  1.198  rillig 
    474  1.200  rillig 	if (ps.in_decl && probably_in_function_definition())
    475  1.200  rillig 		ps.is_function_definition = true;
    476  1.161  rillig }
    477  1.161  rillig 
/* Advance *pp past any characters for which ch_isblank is true. */
static void
skip_blank(const char **pp)
{
	const char *p = *pp;

	while (ch_isblank(*p))
		p++;
	*pp = p;
}
    484  1.193  rillig 
/*
 * If the text at *pp starts with 's', advance *pp past it and return
 * true.  Otherwise leave *pp unchanged and return false.
 */
static bool
skip_string(const char **pp, const char *s)
{
	const size_t n = strlen(s);

	if (strncmp(*pp, s, n) != 0)
		return false;

	*pp += n;
	return true;
}
    495  1.193  rillig 
    496  1.194  rillig static void
    497  1.193  rillig lex_indent_comment(void)
    498  1.193  rillig {
    499  1.198  rillig 	const char *p = inp.mem;
    500  1.193  rillig 
    501  1.198  rillig 	skip_blank(&p);
    502  1.198  rillig 	if (!skip_string(&p, "/*"))
    503  1.198  rillig 		return;
    504  1.198  rillig 	skip_blank(&p);
    505  1.198  rillig 	if (!skip_string(&p, "INDENT"))
    506  1.198  rillig 		return;
    507  1.198  rillig 
    508  1.198  rillig 	enum indent_enabled enabled;
    509  1.198  rillig 	skip_blank(&p);
    510  1.198  rillig 	if (*p == '*' || skip_string(&p, "ON"))
    511  1.198  rillig 		enabled = indent_last_off_line;
    512  1.198  rillig 	else if (skip_string(&p, "OFF"))
    513  1.198  rillig 		enabled = indent_off;
    514  1.198  rillig 	else
    515  1.198  rillig 		return;
    516  1.198  rillig 
    517  1.198  rillig 	skip_blank(&p);
    518  1.198  rillig 	if (!skip_string(&p, "*/\n"))
    519  1.198  rillig 		return;
    520  1.193  rillig 
    521  1.198  rillig 	if (lab.len > 0 || code.len > 0 || com.len > 0)
    522  1.198  rillig 		output_line();
    523  1.193  rillig 
    524  1.198  rillig 	indent_enabled = enabled;
    525  1.193  rillig }
    526  1.193  rillig 
    527   1.89  rillig /* Reads the next token, placing it in the global variable "token". */
    528  1.100  rillig lexer_symbol
    529  1.106  rillig lexi(void)
    530   1.89  rillig {
    531  1.198  rillig 	token.len = 0;
    532  1.198  rillig 	ps.curr_col_1 = ps.next_col_1;
    533  1.198  rillig 	ps.next_col_1 = false;
    534  1.198  rillig 
    535  1.198  rillig 	for (;;) {
    536  1.198  rillig 		if (ch_isblank(inp.st[0])) {
    537  1.198  rillig 			ps.curr_col_1 = false;
    538  1.198  rillig 			inp.st++;
    539  1.198  rillig 		} else if (inp.st[0] == '\\' && inp.st[1] == '\n') {
    540  1.198  rillig 			inp.st++;
    541  1.198  rillig 			inp_skip();
    542  1.198  rillig 			line_no++;
    543  1.198  rillig 		} else
    544  1.198  rillig 			break;
    545  1.198  rillig 	}
    546  1.198  rillig 
    547  1.198  rillig 	lexer_symbol alnum_lsym = lexi_alnum();
    548  1.198  rillig 	if (alnum_lsym != lsym_eof) {
    549  1.198  rillig 		debug_parser_state(alnum_lsym);
    550  1.198  rillig 		return alnum_lsym;
    551  1.198  rillig 	}
    552   1.75  rillig 
    553  1.198  rillig 	/* Scan a non-alphanumeric token */
    554   1.16   kamil 
    555  1.198  rillig 	token_add_char(inp_next());
    556   1.16   kamil 
    557  1.198  rillig 	lexer_symbol lsym;
    558  1.198  rillig 	bool next_unary;
    559   1.89  rillig 
    560  1.198  rillig 	switch (token.mem[token.len - 1]) {
    561  1.160  rillig 
    562  1.160  rillig     /* INDENT OFF */
    563  1.160  rillig     case '(':
    564  1.160  rillig     case '[':	lsym = lsym_lparen_or_lbracket;	next_unary = true;	break;
    565  1.160  rillig     case ')':
    566  1.160  rillig     case ']':	lsym = lsym_rparen_or_rbracket;	next_unary = false;	break;
    567  1.160  rillig     case '?':	lsym = lsym_question;		next_unary = true;	break;
    568  1.160  rillig     case ':':	lsym = lsym_colon;		next_unary = true;	break;
    569  1.160  rillig     case ';':	lsym = lsym_semicolon;		next_unary = true;	break;
    570  1.160  rillig     case '{':	lsym = lsym_lbrace;		next_unary = true;	break;
    571  1.160  rillig     case '}':	lsym = lsym_rbrace;		next_unary = true;	break;
    572  1.160  rillig     case ',':	lsym = lsym_comma;		next_unary = true;	break;
    573  1.160  rillig     case '.':	lsym = lsym_period;		next_unary = false;	break;
    574  1.160  rillig     /* INDENT ON */
    575  1.160  rillig 
    576  1.198  rillig 	case '\n':
    577  1.198  rillig 		/* if data has been exhausted, the '\n' is a dummy. */
    578  1.198  rillig 		lsym = had_eof ? lsym_eof : lsym_newline;
    579  1.198  rillig 		next_unary = ps.next_unary;
    580  1.198  rillig 		ps.next_col_1 = true;
    581  1.198  rillig 		break;
    582  1.198  rillig 
    583  1.198  rillig 	case '#':
    584  1.198  rillig 		lsym = lsym_preprocessing;
    585  1.198  rillig 		next_unary = ps.next_unary;
    586  1.198  rillig 		break;
    587  1.198  rillig 
    588  1.198  rillig 	case '\'':
    589  1.198  rillig 	case '"':
    590  1.198  rillig 		lex_char_or_string();
    591  1.198  rillig 		lsym = lsym_word;
    592  1.159  rillig 		next_unary = false;
    593  1.198  rillig 		break;
    594  1.198  rillig 
    595  1.198  rillig 	case '-':
    596  1.198  rillig 	case '+':
    597  1.198  rillig 		lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
    598  1.198  rillig 		next_unary = true;
    599  1.198  rillig 
    600  1.199  rillig 		/* '++' or '--' */
    601  1.199  rillig 		if (inp.st[0] == token.mem[token.len - 1]) {
    602  1.198  rillig 			token_add_char(*inp.st++);
    603  1.198  rillig 			if (ps.prev_token == lsym_word ||
    604  1.198  rillig 			    ps.prev_token == lsym_rparen_or_rbracket) {
    605  1.199  rillig 				lsym = ps.next_unary
    606  1.199  rillig 				    ? lsym_unary_op : lsym_postfix_op;
    607  1.198  rillig 				next_unary = false;
    608  1.198  rillig 			}
    609  1.198  rillig 
    610  1.198  rillig 		} else if (inp.st[0] == '=') {	/* '+=' or '-=' */
    611  1.198  rillig 			token_add_char(*inp.st++);
    612  1.198  rillig 
    613  1.198  rillig 		} else if (inp.st[0] == '>') {	/* '->' */
    614  1.198  rillig 			token_add_char(*inp.st++);
    615  1.198  rillig 			lsym = lsym_unary_op;
    616  1.198  rillig 			next_unary = false;
    617  1.198  rillig 			ps.want_blank = false;
    618  1.198  rillig 		}
    619  1.198  rillig 		break;
    620  1.198  rillig 
    621  1.198  rillig 	case '=':
    622  1.198  rillig 		if (ps.init_or_struct)
    623  1.198  rillig 			ps.block_init = true;
    624  1.198  rillig 		if (inp.st[0] == '=')
    625  1.198  rillig 			token_add_char(*inp.st++);
    626  1.198  rillig 		lsym = lsym_binary_op;
    627  1.198  rillig 		next_unary = true;
    628  1.198  rillig 		break;
    629   1.75  rillig 
    630  1.198  rillig 	case '>':
    631  1.198  rillig 	case '<':
    632  1.198  rillig 	case '!':		/* ops like <, <<, <=, !=, etc */
    633  1.198  rillig 		if (inp.st[0] == '>' || inp.st[0] == '<' || inp.st[0] == '=')
    634  1.198  rillig 			token_add_char(*inp.st++);
    635  1.198  rillig 		if (inp.st[0] == '=')
    636  1.198  rillig 			token_add_char(*inp.st++);
    637  1.198  rillig 		lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
    638  1.198  rillig 		next_unary = true;
    639  1.198  rillig 		break;
    640   1.75  rillig 
    641  1.198  rillig 	case '*':
    642  1.198  rillig 		if (is_asterisk_unary()) {
    643  1.198  rillig 			lex_asterisk_unary();
    644  1.198  rillig 			lsym = lsym_unary_op;
    645  1.198  rillig 			next_unary = true;
    646  1.198  rillig 		} else {
    647  1.198  rillig 			if (inp.st[0] == '=')
    648  1.198  rillig 				token_add_char(*inp.st++);
    649  1.198  rillig 			lsym = lsym_binary_op;
    650  1.198  rillig 			next_unary = true;
    651  1.198  rillig 		}
    652  1.198  rillig 		break;
    653    1.1     cgd 
    654  1.198  rillig 	default:
    655  1.198  rillig 		if (token.mem[token.len - 1] == '/'
    656  1.198  rillig 		    && (inp.st[0] == '*' || inp.st[0] == '/')) {
    657  1.198  rillig 			enum indent_enabled prev = indent_enabled;
    658  1.198  rillig 			lex_indent_comment();
    659  1.198  rillig 			if (prev == indent_on && indent_enabled == indent_off)
    660  1.198  rillig 				clear_indent_off_text();
    661  1.198  rillig 			token_add_char(*inp.st++);
    662  1.198  rillig 			lsym = lsym_comment;
    663  1.198  rillig 			next_unary = ps.next_unary;
    664  1.198  rillig 			break;
    665  1.198  rillig 		}
    666  1.198  rillig 
    667  1.199  rillig 		/* things like '||', '&&', '<<=', 'int *****i' */
    668  1.199  rillig 		while (inp.st[0] == token.mem[token.len - 1]
    669  1.199  rillig 		    || inp.st[0] == '=')
    670  1.198  rillig 			token_add_char(*inp.st++);
    671  1.198  rillig 
    672  1.198  rillig 		lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
    673  1.198  rillig 		next_unary = true;
    674  1.198  rillig 	}
    675  1.198  rillig 
    676  1.198  rillig 	if (ps.in_enum == in_enum_enum || ps.in_enum == in_enum_type)
    677  1.198  rillig 		ps.in_enum = lsym == lsym_lbrace ? in_enum_brace : in_enum_no;
    678  1.198  rillig 	if (lsym == lsym_rbrace)
    679  1.198  rillig 		ps.in_enum = in_enum_no;
    680  1.169  rillig 
    681  1.198  rillig 	ps.next_unary = next_unary;
    682   1.75  rillig 
    683  1.198  rillig 	debug_parser_state(lsym);
    684  1.198  rillig 	return lsym;
    685    1.1     cgd }
    686   1.16   kamil 
    687    1.6   lukem void
    688  1.128  rillig register_typename(const char *name)
    689    1.1     cgd {
    690  1.198  rillig 	if (typenames.len >= typenames.cap) {
    691  1.198  rillig 		typenames.cap = 16 + 2 * typenames.cap;
    692  1.198  rillig 		typenames.items = nonnull(realloc(typenames.items,
    693  1.198  rillig 		    sizeof(typenames.items[0]) * typenames.cap));
    694  1.198  rillig 	}
    695  1.198  rillig 
    696  1.198  rillig 	int pos = bsearch_typenames(name);
    697  1.198  rillig 	if (pos >= 0)
    698  1.198  rillig 		return;		/* already in the list */
    699  1.198  rillig 
    700  1.198  rillig 	pos = -(pos + 1);
    701  1.198  rillig 	memmove(typenames.items + pos + 1, typenames.items + pos,
    702  1.198  rillig 	    sizeof(typenames.items[0]) * (typenames.len++ - (unsigned)pos));
    703  1.198  rillig 	typenames.items[pos] = nonnull(strdup(name));
    704    1.1     cgd }
    705