lexi.c revision 1.172 1 1.172 rillig /* $NetBSD: lexi.c,v 1.172 2022/02/13 12:43:26 rillig Exp $ */
2 1.3 tls
3 1.16 kamil /*-
4 1.16 kamil * SPDX-License-Identifier: BSD-4-Clause
5 1.16 kamil *
6 1.16 kamil * Copyright (c) 1985 Sun Microsystems, Inc.
7 1.5 mrg * Copyright (c) 1980, 1993
8 1.5 mrg * The Regents of the University of California. All rights reserved.
9 1.1 cgd * All rights reserved.
10 1.1 cgd *
11 1.1 cgd * Redistribution and use in source and binary forms, with or without
12 1.1 cgd * modification, are permitted provided that the following conditions
13 1.1 cgd * are met:
14 1.1 cgd * 1. Redistributions of source code must retain the above copyright
15 1.1 cgd * notice, this list of conditions and the following disclaimer.
16 1.1 cgd * 2. Redistributions in binary form must reproduce the above copyright
17 1.1 cgd * notice, this list of conditions and the following disclaimer in the
18 1.1 cgd * documentation and/or other materials provided with the distribution.
19 1.1 cgd * 3. All advertising materials mentioning features or use of this software
20 1.1 cgd * must display the following acknowledgement:
21 1.1 cgd * This product includes software developed by the University of
22 1.1 cgd * California, Berkeley and its contributors.
23 1.1 cgd * 4. Neither the name of the University nor the names of its contributors
24 1.1 cgd * may be used to endorse or promote products derived from this software
25 1.1 cgd * without specific prior written permission.
26 1.1 cgd *
27 1.1 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 1.1 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 1.1 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 1.1 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 1.1 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 1.1 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 1.1 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 1.1 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 1.1 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 1.1 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 1.1 cgd * SUCH DAMAGE.
38 1.1 cgd */
39 1.1 cgd
40 1.16 kamil #if 0
41 1.16 kamil static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";
42 1.16 kamil #endif
43 1.16 kamil
44 1.6 lukem #include <sys/cdefs.h>
45 1.16 kamil #if defined(__NetBSD__)
46 1.172 rillig __RCSID("$NetBSD: lexi.c,v 1.172 2022/02/13 12:43:26 rillig Exp $");
47 1.16 kamil #elif defined(__FreeBSD__)
48 1.16 kamil __FBSDID("$FreeBSD: head/usr.bin/indent/lexi.c 337862 2018-08-15 18:19:45Z pstef $");
49 1.16 kamil #endif
50 1.1 cgd
51 1.1 cgd #include <stdlib.h>
52 1.1 cgd #include <string.h>
53 1.16 kamil
54 1.16 kamil #include "indent.h"
55 1.1 cgd
56 1.127 rillig /*
57 1.127 rillig * While inside lexi_alnum, this constant just marks a type, independently of
58 1.127 rillig * the parentheses level.
59 1.127 rillig */
60 1.135 rillig #define lsym_type lsym_type_outside_parentheses
61 1.127 rillig
62 1.60 rillig /* must be sorted alphabetically, is used in binary search */
63 1.62 rillig static const struct keyword {
64 1.62 rillig const char *name;
65 1.125 rillig lexer_symbol lsym;
66 1.62 rillig } keywords[] = {
67 1.127 rillig {"_Bool", lsym_type},
68 1.127 rillig {"_Complex", lsym_type},
69 1.127 rillig {"_Imaginary", lsym_type},
70 1.127 rillig {"auto", lsym_storage_class},
71 1.127 rillig {"bool", lsym_type},
72 1.134 rillig {"break", lsym_word},
73 1.127 rillig {"case", lsym_case_label},
74 1.127 rillig {"char", lsym_type},
75 1.127 rillig {"complex", lsym_type},
76 1.127 rillig {"const", lsym_type},
77 1.134 rillig {"continue", lsym_word},
78 1.127 rillig {"default", lsym_case_label},
79 1.127 rillig {"do", lsym_do},
80 1.127 rillig {"double", lsym_type},
81 1.127 rillig {"else", lsym_else},
82 1.127 rillig {"enum", lsym_tag},
83 1.127 rillig {"extern", lsym_storage_class},
84 1.127 rillig {"float", lsym_type},
85 1.127 rillig {"for", lsym_for},
86 1.134 rillig {"goto", lsym_word},
87 1.127 rillig {"if", lsym_if},
88 1.127 rillig {"imaginary", lsym_type},
89 1.134 rillig {"inline", lsym_word},
90 1.127 rillig {"int", lsym_type},
91 1.127 rillig {"long", lsym_type},
92 1.127 rillig {"offsetof", lsym_offsetof},
93 1.127 rillig {"register", lsym_storage_class},
94 1.134 rillig {"restrict", lsym_word},
95 1.129 rillig {"return", lsym_return},
96 1.127 rillig {"short", lsym_type},
97 1.127 rillig {"signed", lsym_type},
98 1.127 rillig {"sizeof", lsym_sizeof},
99 1.127 rillig {"static", lsym_storage_class},
100 1.127 rillig {"struct", lsym_tag},
101 1.127 rillig {"switch", lsym_switch},
102 1.127 rillig {"typedef", lsym_typedef},
103 1.127 rillig {"union", lsym_tag},
104 1.127 rillig {"unsigned", lsym_type},
105 1.127 rillig {"void", lsym_type},
106 1.127 rillig {"volatile", lsym_type},
107 1.127 rillig {"while", lsym_while}
108 1.1 cgd };
109 1.1 cgd
/*
 * The registered type names, kept sorted so that bsearch_typenames can
 * search them.  The array grows on demand in register_typename.
 */
static struct {
    const char **items;         /* the names, in strcmp order */
    unsigned int len;           /* number of names in use */
    unsigned int cap;           /* number of allocated slots */
} typenames;
115 1.16 kamil
/*
 * The transition table below was rewritten by hand from lx's output, given
 * the following definitions. lx is Katherine Flavel's lexer generator.
 *
 * O  = /[0-7]/;        D  = /[0-9]/;          NZ = /[1-9]/;
 * H  = /[a-f0-9]/i;    B  = /[0-1]/;          HP = /0x/i;
 * BP = /0b/i;          E  = /e[+\-]?/i D+;    P  = /p[+\-]?/i D+;
 * FS = /[fl]/i;        IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?;
 *
 * D+           E   FS? -> $float;
 * D*    "."    D+  E?  FS? -> $float;
 * D+    "."    E?  FS? -> $float;    HP H+           IS? -> $int;
 * HP H+        P   FS? -> $float;    NZ D*           IS? -> $int;
 * HP H* "." H+ P   FS? -> $float;    "0" O*          IS? -> $int;
 * HP H+ "."    P   FS  -> $float;    BP B+           IS? -> $int;
 *
 * Layout of the table: there is one column per state 'A'..'Z' and one row
 * per character class (see lex_number_row).  An entry is the state to go to,
 * a blank means that the character cannot continue the number.  Row 0 is
 * not a transition row, it classifies each state: f = floating, i = integer,
 * u = unknown.
 *
 * Every row must be exactly 26 characters wide; a shorter row would shift
 * its transitions to the wrong states and leave '\0' entries behind.
 */
/* INDENT OFF */
static const unsigned char lex_number_state[][26] = {
    /*                examples:
                                     00
             s                      0xx
             t                    00xaa
             a     11       101100xxa..
             r   11ee0001101lbuuxx.a.pp
             t.01.e+008bLuxll0Ll.aa.p+0
    states:  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
    [0]  = "uuiifuufiuuiiuiiiiiuiuuuuu",        /* (other) */
    [1]  = "CEIDEHHHIJQ  U  Q  VUVVZZZ",        /* 0 */
    [2]  = "DEIDEHHHIJQ  U  Q  VUVVZZZ",        /* 1 */
    [3]  = "DEIDEHHHIJ   U     VUVVZZZ",        /* 2 3 4 5 6 7 */
    [4]  = "DEJDEHHHJJ   U     VUVVZZZ",        /* 8 9 */
    [5]  = "             U     VUVV   ",        /* A a C c D d */
    [6]  = "  K          U     VUVV   ",        /* B b */
    [7]  = "  FFF   FF   U     VUVV   ",        /* E e */
    [8]  = "    f  f     U     VUVV  f",        /* F f */
    [9]  = "  LLf  fL  PR   Li  L    f",        /* L */
    [10] = "  OOf  fO   S P O i O    f",        /* l */
    [11] = "                    FFX   ",        /* P p */
    [12] = "  MM    M  i  iiM   M     ",        /* U u */
    [13] = "  N                       ",        /* X x */
    [14] = "     G                 Y  ",        /* + - */
    [15] = "B EE    EE   T      W     ",        /* . */
    /*      ABCDEFGHIJKLMNOPQRSTUVWXYZ */
};
/* INDENT ON */
161 1.1 cgd
/*
 * Maps a character to its row in lex_number_state.  Characters that are not
 * listed map to 0, which lex_number treats as "not part of a number".
 */
static const unsigned char lex_number_row[] = {
    /* digits */
    ['0'] = 1,
    ['1'] = 2,
    ['2'] = 3,
    ['3'] = 3,
    ['4'] = 3,
    ['5'] = 3,
    ['6'] = 3,
    ['7'] = 3,
    ['8'] = 4,
    ['9'] = 4,

    /* letters that only occur as hexadecimal digits */
    ['A'] = 5, ['a'] = 5,
    ['C'] = 5, ['c'] = 5,
    ['D'] = 5, ['d'] = 5,

    /* letters that have an additional meaning */
    ['B'] = 6, ['b'] = 6,
    ['E'] = 7, ['e'] = 7,
    ['F'] = 8, ['f'] = 8,
    ['L'] = 9,
    ['l'] = 10,
    ['P'] = 11, ['p'] = 11,
    ['U'] = 12, ['u'] = 12,
    ['X'] = 13, ['x'] = 13,

    /* punctuation */
    ['+'] = 14, ['-'] = 14,
    ['.'] = 15,
};
179 1.36 rillig
180 1.25 rillig static void
181 1.25 rillig check_size_token(size_t desired_size)
182 1.25 rillig {
183 1.58 rillig if (token.e + desired_size >= token.l)
184 1.58 rillig buf_expand(&token, desired_size);
185 1.25 rillig }
186 1.25 rillig
187 1.87 rillig static void
188 1.87 rillig token_add_char(char ch)
189 1.87 rillig {
190 1.87 rillig check_size_token(1);
191 1.87 rillig *token.e++ = ch;
192 1.87 rillig }
193 1.87 rillig
194 1.20 rillig #ifdef debug
195 1.100 rillig static const char *
196 1.100 rillig lsym_name(lexer_symbol sym)
197 1.20 rillig {
198 1.20 rillig static const char *const name[] = {
199 1.100 rillig "eof",
200 1.100 rillig "preprocessing",
201 1.100 rillig "newline",
202 1.100 rillig "form_feed",
203 1.100 rillig "comment",
204 1.100 rillig "lparen_or_lbracket",
205 1.100 rillig "rparen_or_rbracket",
206 1.100 rillig "lbrace",
207 1.100 rillig "rbrace",
208 1.100 rillig "period",
209 1.100 rillig "unary_op",
210 1.100 rillig "binary_op",
211 1.100 rillig "postfix_op",
212 1.100 rillig "question",
213 1.100 rillig "colon",
214 1.100 rillig "comma",
215 1.100 rillig "semicolon",
216 1.100 rillig "typedef",
217 1.100 rillig "storage_class",
218 1.135 rillig "type_outside_parentheses",
219 1.134 rillig "type_in_parentheses",
220 1.100 rillig "tag",
221 1.100 rillig "case_label",
222 1.120 rillig "sizeof",
223 1.121 rillig "offsetof",
224 1.134 rillig "word",
225 1.100 rillig "funcname",
226 1.100 rillig "do",
227 1.100 rillig "else",
228 1.100 rillig "for",
229 1.100 rillig "if",
230 1.100 rillig "switch",
231 1.100 rillig "while",
232 1.129 rillig "return",
233 1.20 rillig };
234 1.20 rillig
235 1.100 rillig return name[sym];
236 1.20 rillig }
237 1.20 rillig
238 1.20 rillig static void
239 1.72 rillig debug_print_buf(const char *name, const struct buffer *buf)
240 1.20 rillig {
241 1.72 rillig if (buf->s < buf->e) {
242 1.101 rillig debug_printf("%s ", name);
243 1.101 rillig debug_vis_range("\"", buf->s, buf->e, "\"\n");
244 1.20 rillig }
245 1.20 rillig }
246 1.20 rillig
/*
 * Tell whether the debug log should list every member of the parser state
 * after each token (true) or only the members that have changed (false).
 * Currently this is hard-wired to the full listing.
 */
static bool
debug_full_parser_state(void)
{
    const bool full = true;

    return full;
}
252 1.168 rillig
/*
 * Log the parser state member 'name', either as a transition from its
 * previous value if it has changed, or as its plain value if the full parser
 * state is requested.  debug_ps_bool is for boolean members, debug_ps_int
 * for integer members.
 *
 * The macros read the current state from 'ps' and the previous state from
 * an object named 'prev_ps', which must be in scope where they are used.
 *
 * Each body is wrapped in 'do { ... } while (0)' so that a use of the macro
 * is a single statement; a bare 'if ... else if' would capture the 'else'
 * of a surrounding unbraced 'if'.
 */
#define debug_ps_bool(name) \
    do { \
        if (ps.name != prev_ps.name) \
            debug_println("[%c] -> [%c] ps." #name, \
                prev_ps.name ? 'x' : ' ', ps.name ? 'x' : ' '); \
        else if (debug_full_parser_state()) \
            debug_println(" [%c] ps." #name, ps.name ? 'x' : ' '); \
    } while (0)
#define debug_ps_int(name) \
    do { \
        if (ps.name != prev_ps.name) \
            debug_println("%3d -> %3d ps." #name, prev_ps.name, ps.name); \
        else if (debug_full_parser_state()) \
            debug_println(" %3d ps." #name, ps.name); \
    } while (0)
264 1.112 rillig
265 1.171 rillig static bool
266 1.171 rillig ps_paren_has_changed(const struct parser_state *prev_ps)
267 1.171 rillig {
268 1.171 rillig const paren_level_props *prev = prev_ps->paren, *curr = ps.paren;
269 1.171 rillig
270 1.172 rillig if (prev_ps->nparen != ps.nparen)
271 1.171 rillig return true;
272 1.171 rillig
273 1.172 rillig for (int i = 0; i < ps.nparen; i++) {
274 1.171 rillig if (curr[i].indent != prev[i].indent ||
275 1.171 rillig curr[i].maybe_cast != prev[i].maybe_cast ||
276 1.171 rillig curr[i].no_cast != prev[i].no_cast)
277 1.171 rillig return true;
278 1.171 rillig }
279 1.171 rillig return false;
280 1.171 rillig }
281 1.171 rillig
282 1.171 rillig static void
283 1.171 rillig debug_ps_paren(const struct parser_state *prev_ps)
284 1.171 rillig {
285 1.171 rillig if (!debug_full_parser_state() && !ps_paren_has_changed(prev_ps))
286 1.171 rillig return;
287 1.171 rillig
288 1.171 rillig debug_printf(" ps.paren:");
289 1.172 rillig for (int i = 0; i < ps.nparen; i++) {
290 1.171 rillig const paren_level_props *props = ps.paren + i;
291 1.171 rillig const char *cast = props->no_cast ? "(no cast)"
292 1.171 rillig : props->maybe_cast ? "(cast)"
293 1.171 rillig : "";
294 1.171 rillig debug_printf(" %s%d", cast, props->indent);
295 1.171 rillig }
296 1.172 rillig if (ps.nparen == 0)
297 1.171 rillig debug_printf(" none");
298 1.171 rillig debug_println("");
299 1.171 rillig }
300 1.171 rillig
301 1.101 rillig static void
302 1.107 rillig debug_lexi(lexer_symbol lsym)
303 1.20 rillig {
304 1.113 rillig /*
305 1.113 rillig * Watch out for 'rolled back parser state' in the debug output; the
306 1.113 rillig * differences around these are unreliable.
307 1.113 rillig */
308 1.113 rillig static struct parser_state prev_ps;
309 1.113 rillig
310 1.104 rillig debug_println("");
311 1.134 rillig debug_printf("line %d: %s", line_no, lsym_name(lsym));
312 1.116 rillig debug_vis_range(" \"", token.s, token.e, "\"\n");
313 1.122 rillig
314 1.72 rillig debug_print_buf("label", &lab);
315 1.72 rillig debug_print_buf("code", &code);
316 1.72 rillig debug_print_buf("comment", &com);
317 1.112 rillig
318 1.168 rillig debug_println(" ps.prev_token = %s", lsym_name(ps.prev_token));
319 1.130 rillig debug_ps_bool(next_col_1);
320 1.117 rillig debug_ps_bool(curr_col_1);
321 1.112 rillig debug_ps_bool(next_unary);
322 1.147 rillig debug_ps_bool(is_function_definition);
323 1.112 rillig debug_ps_bool(want_blank);
324 1.172 rillig debug_ps_int(line_start_nparen);
325 1.172 rillig debug_ps_int(nparen);
326 1.171 rillig debug_ps_paren(&prev_ps);
327 1.112 rillig
328 1.112 rillig debug_ps_int(comment_delta);
329 1.112 rillig debug_ps_int(n_comment_delta);
330 1.112 rillig debug_ps_int(com_ind);
331 1.112 rillig
332 1.112 rillig debug_ps_bool(block_init);
333 1.112 rillig debug_ps_int(block_init_level);
334 1.112 rillig debug_ps_bool(init_or_struct);
335 1.112 rillig
336 1.112 rillig debug_ps_int(ind_level);
337 1.112 rillig debug_ps_int(ind_level_follow);
338 1.112 rillig
339 1.137 rillig debug_ps_int(decl_level);
340 1.112 rillig debug_ps_bool(decl_on_line);
341 1.112 rillig debug_ps_bool(in_decl);
342 1.112 rillig debug_ps_int(just_saw_decl);
343 1.164 rillig debug_ps_bool(in_func_def_params);
344 1.112 rillig debug_ps_bool(decl_indent_done);
345 1.112 rillig
346 1.152 rillig debug_ps_bool(in_stmt_or_decl);
347 1.151 rillig debug_ps_bool(in_stmt_cont);
348 1.112 rillig debug_ps_bool(is_case_label);
349 1.112 rillig
350 1.112 rillig debug_ps_bool(search_stmt);
351 1.113 rillig
352 1.113 rillig prev_ps = ps;
353 1.101 rillig }
354 1.96 rillig #endif
355 1.20 rillig
356 1.101 rillig static lexer_symbol
357 1.107 rillig lexi_end(lexer_symbol lsym)
358 1.101 rillig {
359 1.101 rillig #ifdef debug
360 1.107 rillig debug_lexi(lsym);
361 1.101 rillig #endif
362 1.100 rillig return lsym;
363 1.20 rillig }
364 1.20 rillig
365 1.43 rillig static void
366 1.43 rillig lex_number(void)
367 1.43 rillig {
368 1.115 rillig for (unsigned char s = 'A'; s != 'f' && s != 'i' && s != 'u';) {
369 1.141 rillig unsigned char ch = (unsigned char)inp_peek();
370 1.94 rillig if (ch >= array_length(lex_number_row) || lex_number_row[ch] == 0)
371 1.56 rillig break;
372 1.75 rillig
373 1.115 rillig unsigned char row = lex_number_row[ch];
374 1.82 rillig if (lex_number_state[row][s - 'A'] == ' ') {
375 1.71 rillig /*-
376 1.82 rillig * lex_number_state[0][s - 'A'] now indicates the type:
377 1.74 rillig * f = floating, i = integer, u = unknown
378 1.56 rillig */
379 1.138 rillig return;
380 1.43 rillig }
381 1.75 rillig
382 1.82 rillig s = lex_number_state[row][s - 'A'];
383 1.133 rillig token_add_char(inp_next());
384 1.43 rillig }
385 1.43 rillig }
386 1.43 rillig
/*
 * Tell whether 'ch' can start an identifier: a letter (as per ch_isalpha),
 * an underscore or a dollar sign.
 */
static bool
is_identifier_start(char ch)
{
    if (ch_isalpha(ch))
        return true;
    return ch == '_' || ch == '$';
}
392 1.146 rillig
/*
 * Tell whether 'ch' can continue an identifier: a letter or digit (as per
 * ch_isalnum), an underscore or a dollar sign.
 */
static bool
is_identifier_part(char ch)
{
    if (ch_isalnum(ch))
        return true;
    return ch == '_' || ch == '$';
}
398 1.145 rillig
/*
 * Append the characters of an identifier to the token.  A backslash that is
 * directly followed by a newline is skipped without ending the identifier
 * and without becoming part of the token.
 */
static void
lex_word(void)
{
    for (;;) {
        if (is_identifier_part(inp_peek())) {
            token_add_char(inp_next());
            continue;
        }

        if (inp_peek() != '\\' || inp_lookahead(1) != '\n')
            return;

        /* skip the backslash and the newline */
        inp_skip();
        inp_skip();
    }
}
412 1.43 rillig
413 1.43 rillig static void
414 1.43 rillig lex_char_or_string(void)
415 1.43 rillig {
416 1.132 rillig for (char delim = token.e[-1];;) {
417 1.141 rillig if (inp_peek() == '\n') {
418 1.52 rillig diag(1, "Unterminated literal");
419 1.52 rillig return;
420 1.52 rillig }
421 1.75 rillig
422 1.133 rillig token_add_char(inp_next());
423 1.52 rillig if (token.e[-1] == delim)
424 1.52 rillig return;
425 1.75 rillig
426 1.52 rillig if (token.e[-1] == '\\') {
427 1.141 rillig if (inp_peek() == '\n')
428 1.52 rillig ++line_no;
429 1.133 rillig token_add_char(inp_next());
430 1.52 rillig }
431 1.52 rillig }
432 1.43 rillig }
433 1.43 rillig
434 1.84 rillig /* Guess whether the current token is a declared type. */
435 1.57 rillig static bool
436 1.107 rillig probably_typename(void)
437 1.57 rillig {
438 1.153 rillig if (ps.prev_token == lsym_storage_class)
439 1.153 rillig return true;
440 1.153 rillig if (ps.block_init)
441 1.153 rillig return false;
442 1.153 rillig if (ps.in_stmt_or_decl) /* XXX: this condition looks incorrect */
443 1.70 rillig return false;
444 1.142 rillig if (inp_peek() == '*' && inp_lookahead(1) != '=')
445 1.70 rillig goto maybe;
446 1.145 rillig /* XXX: is_identifier_start */
447 1.148 rillig if (ch_isalpha(inp_peek()))
448 1.70 rillig goto maybe;
449 1.70 rillig return false;
450 1.70 rillig maybe:
451 1.110 rillig return ps.prev_token == lsym_semicolon ||
452 1.110 rillig ps.prev_token == lsym_lbrace ||
453 1.110 rillig ps.prev_token == lsym_rbrace;
454 1.57 rillig }
455 1.57 rillig
456 1.84 rillig static int
457 1.84 rillig bsearch_typenames(const char *key)
458 1.84 rillig {
459 1.84 rillig const char **arr = typenames.items;
460 1.84 rillig int lo = 0;
461 1.84 rillig int hi = (int)typenames.len - 1;
462 1.84 rillig
463 1.84 rillig while (lo <= hi) {
464 1.84 rillig int mid = (int)((unsigned)(lo + hi) >> 1);
465 1.84 rillig int cmp = strcmp(arr[mid], key);
466 1.84 rillig if (cmp < 0)
467 1.84 rillig lo = mid + 1;
468 1.84 rillig else if (cmp > 0)
469 1.84 rillig hi = mid - 1;
470 1.84 rillig else
471 1.84 rillig return mid;
472 1.84 rillig }
473 1.84 rillig return -(lo + 1);
474 1.84 rillig }
475 1.84 rillig
476 1.63 rillig static bool
477 1.63 rillig is_typename(void)
478 1.63 rillig {
479 1.84 rillig if (opt.auto_typedefs &&
480 1.84 rillig token.e - token.s >= 2 && memcmp(token.e - 2, "_t", 2) == 0)
481 1.84 rillig return true;
482 1.63 rillig
483 1.84 rillig return bsearch_typenames(token.s) >= 0;
484 1.63 rillig }
485 1.63 rillig
486 1.115 rillig static int
487 1.115 rillig cmp_keyword_by_name(const void *key, const void *elem)
488 1.115 rillig {
489 1.115 rillig return strcmp(key, ((const struct keyword *)elem)->name);
490 1.115 rillig }
491 1.115 rillig
492 1.165 rillig /*
493 1.166 rillig * Looking at something like 'function_name(...)' in a line, guess whether
494 1.165 rillig * this starts a function definition or a declaration.
495 1.165 rillig */
496 1.155 rillig static bool
497 1.155 rillig probably_looking_at_definition(void)
498 1.155 rillig {
499 1.158 rillig int paren_level = 0;
500 1.158 rillig for (const char *p = inp_p(), *e = inp_line_end(); p < e; p++) {
501 1.158 rillig if (*p == '(')
502 1.158 rillig paren_level++;
503 1.158 rillig if (*p == ')' && --paren_level == 0) {
504 1.158 rillig p++;
505 1.166 rillig
506 1.158 rillig while (p < e && (ch_isspace(*p) || is_identifier_part(*p)))
507 1.166 rillig p++; /* '__dead' or '__unused' */
508 1.166 rillig
509 1.166 rillig if (p == e) /* func(...) */
510 1.166 rillig break;
511 1.166 rillig if (*p == ';') /* func(...); */
512 1.165 rillig return false;
513 1.166 rillig if (*p == ',') /* double abs(), pi; */
514 1.166 rillig return false;
515 1.166 rillig if (*p == '(') /* func(...) __attribute__((...)) */
516 1.166 rillig paren_level++; /* func(...) __printflike(...) */
517 1.165 rillig else
518 1.166 rillig break; /* func(...) { ... */
519 1.158 rillig }
520 1.158 rillig }
521 1.158 rillig
522 1.158 rillig /*
523 1.158 rillig * To further reduce the cases where indent wrongly treats an incomplete
524 1.158 rillig * function declaration as a function definition, thus adding a newline
525 1.158 rillig * before the function name, it may be worth looking for parameter names,
526 1.158 rillig * as these are often omitted in function declarations and only included
527 1.158 rillig * in function definitions. Or just increase the lookahead to more than
528 1.158 rillig * just the current line of input, until the next '{'.
529 1.158 rillig */
530 1.155 rillig return true;
531 1.155 rillig }
532 1.155 rillig
533 1.138 rillig /* Read an alphanumeric token into 'token', or return lsym_eof. */
534 1.100 rillig static lexer_symbol
535 1.107 rillig lexi_alnum(void)
536 1.1 cgd {
537 1.148 rillig if (ch_isdigit(inp_peek()) ||
538 1.148 rillig (inp_peek() == '.' && ch_isdigit(inp_lookahead(1)))) {
539 1.89 rillig lex_number();
540 1.168 rillig } else if (is_identifier_start(inp_peek())) {
541 1.89 rillig lex_word();
542 1.167 rillig
543 1.167 rillig if (token.s[0] == 'L' && token.e - token.s == 1 &&
544 1.167 rillig (inp_peek() == '"' || inp_peek() == '\'')) {
545 1.167 rillig token_add_char(inp_next());
546 1.167 rillig lex_char_or_string();
547 1.167 rillig ps.next_unary = false;
548 1.167 rillig
549 1.167 rillig check_size_token(1);
550 1.167 rillig *token.e = '\0';
551 1.167 rillig
552 1.167 rillig return lsym_word;
553 1.167 rillig }
554 1.102 rillig } else
555 1.102 rillig return lsym_eof; /* just as a placeholder */
556 1.102 rillig
557 1.89 rillig *token.e = '\0';
558 1.16 kamil
559 1.133 rillig while (ch_isblank(inp_peek()))
560 1.133 rillig inp_skip();
561 1.89 rillig
562 1.154 rillig ps.next_unary = ps.prev_token == lsym_tag; /* for 'struct s *' */
563 1.154 rillig
564 1.172 rillig if (ps.prev_token == lsym_tag && ps.nparen == 0)
565 1.135 rillig return lsym_type_outside_parentheses;
566 1.16 kamil
567 1.89 rillig const struct keyword *kw = bsearch(token.s, keywords,
568 1.94 rillig array_length(keywords), sizeof(keywords[0]), cmp_keyword_by_name);
569 1.134 rillig bool is_type = false;
570 1.89 rillig if (kw == NULL) {
571 1.89 rillig if (is_typename()) {
572 1.134 rillig is_type = true;
573 1.107 rillig ps.next_unary = true;
574 1.169 rillig if (ps.in_enum == in_enum_enum)
575 1.169 rillig ps.in_enum = in_enum_type;
576 1.89 rillig goto found_typename;
577 1.16 kamil }
578 1.89 rillig
579 1.89 rillig } else { /* we have a keyword */
580 1.134 rillig is_type = kw->lsym == lsym_type;
581 1.107 rillig ps.next_unary = true;
582 1.127 rillig if (kw->lsym != lsym_tag && kw->lsym != lsym_type)
583 1.125 rillig return kw->lsym;
584 1.118 rillig
585 1.118 rillig found_typename:
586 1.172 rillig if (ps.nparen > 0) {
587 1.118 rillig /* inside parentheses: cast, param list, offsetof or sizeof */
588 1.172 rillig if (!ps.paren[ps.nparen - 1].no_cast)
589 1.172 rillig ps.paren[ps.nparen - 1].maybe_cast = true;
590 1.118 rillig }
591 1.118 rillig if (ps.prev_token != lsym_period && ps.prev_token != lsym_unary_op) {
592 1.169 rillig if (kw != NULL && kw->lsym == lsym_tag) {
593 1.169 rillig if (token.s[0] == 'e' /* enum */)
594 1.169 rillig ps.in_enum = in_enum_enum;
595 1.100 rillig return lsym_tag;
596 1.169 rillig }
597 1.172 rillig if (ps.nparen == 0)
598 1.135 rillig return lsym_type_outside_parentheses;
599 1.90 rillig }
600 1.90 rillig }
601 1.89 rillig
602 1.141 rillig if (inp_peek() == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
603 1.164 rillig !ps.in_func_def_params && !ps.block_init) {
604 1.89 rillig
605 1.172 rillig if (ps.nparen == 0 && probably_looking_at_definition()) {
606 1.155 rillig ps.is_function_definition = true;
607 1.155 rillig if (ps.in_decl)
608 1.164 rillig ps.in_func_def_params = true;
609 1.155 rillig return lsym_funcname;
610 1.155 rillig }
611 1.89 rillig
612 1.172 rillig } else if (ps.nparen == 0 && probably_typename()) {
613 1.107 rillig ps.next_unary = true;
614 1.135 rillig return lsym_type_outside_parentheses;
615 1.89 rillig }
616 1.89 rillig
617 1.134 rillig return is_type ? lsym_type_in_parentheses : lsym_word;
618 1.89 rillig }
619 1.75 rillig
620 1.163 rillig static bool
621 1.163 rillig is_asterisk_unary(void)
622 1.163 rillig {
623 1.164 rillig if (ps.next_unary || ps.in_func_def_params)
624 1.163 rillig return true;
625 1.163 rillig if (ps.prev_token == lsym_word ||
626 1.163 rillig ps.prev_token == lsym_rparen_or_rbracket)
627 1.163 rillig return false;
628 1.172 rillig return ps.in_decl && ps.nparen > 0;
629 1.163 rillig }
630 1.163 rillig
631 1.161 rillig static void
632 1.161 rillig lex_asterisk_unary(void)
633 1.161 rillig {
634 1.161 rillig while (inp_peek() == '*' || ch_isspace(inp_peek())) {
635 1.161 rillig if (inp_peek() == '*')
636 1.161 rillig token_add_char('*');
637 1.161 rillig inp_skip();
638 1.161 rillig }
639 1.161 rillig
640 1.161 rillig if (ps.in_decl) {
641 1.161 rillig const char *tp = inp_p(), *e = inp_line_end();
642 1.161 rillig
643 1.161 rillig while (tp < e) {
644 1.161 rillig if (ch_isspace(*tp))
645 1.161 rillig tp++;
646 1.161 rillig else if (is_identifier_start(*tp)) {
647 1.161 rillig tp++;
648 1.161 rillig while (tp < e && is_identifier_part(*tp))
649 1.161 rillig tp++;
650 1.161 rillig } else
651 1.161 rillig break;
652 1.161 rillig }
653 1.161 rillig
654 1.161 rillig if (tp < e && *tp == '(')
655 1.161 rillig ps.is_function_definition = true;
656 1.161 rillig }
657 1.161 rillig }
658 1.161 rillig
659 1.89 rillig /* Reads the next token, placing it in the global variable "token". */
660 1.100 rillig lexer_symbol
661 1.106 rillig lexi(void)
662 1.89 rillig {
663 1.90 rillig token.e = token.s;
664 1.130 rillig ps.curr_col_1 = ps.next_col_1;
665 1.130 rillig ps.next_col_1 = false;
666 1.75 rillig
667 1.141 rillig while (ch_isblank(inp_peek())) {
668 1.117 rillig ps.curr_col_1 = false;
669 1.133 rillig inp_skip();
670 1.89 rillig }
671 1.75 rillig
672 1.107 rillig lexer_symbol alnum_lsym = lexi_alnum();
673 1.100 rillig if (alnum_lsym != lsym_eof)
674 1.107 rillig return lexi_end(alnum_lsym);
675 1.16 kamil
676 1.16 kamil /* Scan a non-alphanumeric token */
677 1.16 kamil
678 1.90 rillig check_size_token(3); /* for things like "<<=" */
679 1.133 rillig *token.e++ = inp_next();
680 1.50 rillig *token.e = '\0';
681 1.16 kamil
682 1.100 rillig lexer_symbol lsym;
683 1.159 rillig bool next_unary;
684 1.89 rillig
685 1.132 rillig switch (token.e[-1]) {
686 1.160 rillig
687 1.160 rillig /* INDENT OFF */
688 1.160 rillig case '(':
689 1.160 rillig case '[': lsym = lsym_lparen_or_lbracket; next_unary = true; break;
690 1.160 rillig case ')':
691 1.160 rillig case ']': lsym = lsym_rparen_or_rbracket; next_unary = false; break;
692 1.160 rillig case '?': lsym = lsym_question; next_unary = true; break;
693 1.160 rillig case ':': lsym = lsym_colon; next_unary = true; break;
694 1.160 rillig case ';': lsym = lsym_semicolon; next_unary = true; break;
695 1.160 rillig case '{': lsym = lsym_lbrace; next_unary = true; break;
696 1.160 rillig case '}': lsym = lsym_rbrace; next_unary = true; break;
697 1.160 rillig case ',': lsym = lsym_comma; next_unary = true; break;
698 1.160 rillig case '.': lsym = lsym_period; next_unary = false; break;
699 1.160 rillig /* INDENT ON */
700 1.160 rillig
701 1.16 kamil case '\n':
702 1.159 rillig /* if data has been exhausted, the '\n' is a dummy. */
703 1.159 rillig lsym = had_eof ? lsym_eof : lsym_newline;
704 1.159 rillig next_unary = ps.next_unary;
705 1.130 rillig ps.next_col_1 = true;
706 1.16 kamil break;
707 1.16 kamil
708 1.69 rillig case '\f':
709 1.159 rillig lsym = lsym_form_feed;
710 1.159 rillig next_unary = ps.next_unary;
711 1.130 rillig ps.next_col_1 = true;
712 1.16 kamil break;
713 1.16 kamil
714 1.160 rillig case '#':
715 1.160 rillig lsym = lsym_preprocessing;
716 1.160 rillig next_unary = ps.next_unary;
717 1.16 kamil break;
718 1.16 kamil
719 1.160 rillig case '\'':
720 1.160 rillig case '"':
721 1.160 rillig lex_char_or_string();
722 1.160 rillig lsym = lsym_word;
723 1.159 rillig next_unary = false;
724 1.16 kamil break;
725 1.1 cgd
726 1.16 kamil case '-':
727 1.90 rillig case '+':
728 1.107 rillig lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
729 1.159 rillig next_unary = true;
730 1.16 kamil
731 1.159 rillig if (inp_peek() == token.e[-1]) { /* '++' or '--' */
732 1.141 rillig *token.e++ = inp_next();
733 1.134 rillig if (ps.prev_token == lsym_word ||
734 1.110 rillig ps.prev_token == lsym_rparen_or_rbracket) {
735 1.107 rillig lsym = ps.next_unary ? lsym_unary_op : lsym_postfix_op;
736 1.159 rillig next_unary = false;
737 1.16 kamil }
738 1.75 rillig
739 1.159 rillig } else if (inp_peek() == '=') { /* '+=' or '-=' */
740 1.141 rillig *token.e++ = inp_next();
741 1.75 rillig
742 1.159 rillig } else if (inp_peek() == '>') { /* '->' */
743 1.141 rillig *token.e++ = inp_next();
744 1.100 rillig lsym = lsym_unary_op;
745 1.159 rillig next_unary = false;
746 1.107 rillig ps.want_blank = false;
747 1.16 kamil }
748 1.90 rillig break;
749 1.16 kamil
750 1.16 kamil case '=':
751 1.107 rillig if (ps.init_or_struct)
752 1.107 rillig ps.block_init = true;
753 1.141 rillig if (inp_peek() == '=') { /* == */
754 1.141 rillig *token.e++ = inp_next();
755 1.67 rillig *token.e = '\0';
756 1.16 kamil }
757 1.100 rillig lsym = lsym_binary_op;
758 1.159 rillig next_unary = true;
759 1.16 kamil break;
760 1.16 kamil
761 1.16 kamil case '>':
762 1.16 kamil case '<':
763 1.16 kamil case '!': /* ops like <, <<, <=, !=, etc */
764 1.141 rillig if (inp_peek() == '>' || inp_peek() == '<' || inp_peek() == '=')
765 1.141 rillig *token.e++ = inp_next();
766 1.141 rillig if (inp_peek() == '=')
767 1.133 rillig *token.e++ = inp_next();
768 1.107 rillig lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
769 1.159 rillig next_unary = true;
770 1.16 kamil break;
771 1.16 kamil
772 1.16 kamil case '*':
773 1.163 rillig if (is_asterisk_unary()) {
774 1.162 rillig lex_asterisk_unary();
775 1.162 rillig lsym = lsym_unary_op;
776 1.162 rillig next_unary = true;
777 1.162 rillig } else {
778 1.141 rillig if (inp_peek() == '=')
779 1.141 rillig *token.e++ = inp_next();
780 1.100 rillig lsym = lsym_binary_op;
781 1.159 rillig next_unary = true;
782 1.16 kamil }
783 1.16 kamil break;
784 1.1 cgd
785 1.16 kamil default:
786 1.141 rillig if (token.e[-1] == '/' && (inp_peek() == '*' || inp_peek() == '/')) {
787 1.133 rillig *token.e++ = inp_next();
788 1.100 rillig lsym = lsym_comment;
789 1.159 rillig next_unary = ps.next_unary;
790 1.16 kamil break;
791 1.1 cgd }
792 1.75 rillig
793 1.132 rillig /* handle '||', '&&', etc., and also things as in 'int *****i' */
794 1.141 rillig while (token.e[-1] == inp_peek() || inp_peek() == '=')
795 1.133 rillig token_add_char(inp_next());
796 1.75 rillig
797 1.107 rillig lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
798 1.159 rillig next_unary = true;
799 1.47 rillig }
800 1.16 kamil
801 1.169 rillig if (ps.in_enum == in_enum_enum || ps.in_enum == in_enum_type)
802 1.169 rillig ps.in_enum = lsym == lsym_lbrace ? in_enum_brace : in_enum_no;
803 1.169 rillig if (lsym == lsym_rbrace)
804 1.169 rillig ps.in_enum = in_enum_no;
805 1.169 rillig
806 1.159 rillig ps.next_unary = next_unary;
807 1.75 rillig
808 1.25 rillig check_size_token(1);
809 1.50 rillig *token.e = '\0';
810 1.75 rillig
811 1.107 rillig return lexi_end(lsym);
812 1.1 cgd }
813 1.16 kamil
814 1.6 lukem void
815 1.128 rillig register_typename(const char *name)
816 1.1 cgd {
817 1.64 rillig if (typenames.len >= typenames.cap) {
818 1.64 rillig typenames.cap = 16 + 2 * typenames.cap;
819 1.64 rillig typenames.items = xrealloc(typenames.items,
820 1.64 rillig sizeof(typenames.items[0]) * typenames.cap);
821 1.64 rillig }
822 1.16 kamil
823 1.84 rillig int pos = bsearch_typenames(name);
824 1.64 rillig if (pos >= 0)
825 1.64 rillig return; /* already in the list */
826 1.75 rillig
827 1.64 rillig pos = -(pos + 1);
828 1.64 rillig memmove(typenames.items + pos + 1, typenames.items + pos,
829 1.73 rillig sizeof(typenames.items[0]) * (typenames.len++ - (unsigned)pos));
830 1.64 rillig typenames.items[pos] = xstrdup(name);
831 1.1 cgd }
832