lexi.c revision 1.143 1 1.143 rillig /* $NetBSD: lexi.c,v 1.143 2021/11/19 17:30:10 rillig Exp $ */
2 1.3 tls
3 1.16 kamil /*-
4 1.16 kamil * SPDX-License-Identifier: BSD-4-Clause
5 1.16 kamil *
6 1.16 kamil * Copyright (c) 1985 Sun Microsystems, Inc.
7 1.5 mrg * Copyright (c) 1980, 1993
8 1.5 mrg * The Regents of the University of California. All rights reserved.
9 1.1 cgd * All rights reserved.
10 1.1 cgd *
11 1.1 cgd * Redistribution and use in source and binary forms, with or without
12 1.1 cgd * modification, are permitted provided that the following conditions
13 1.1 cgd * are met:
14 1.1 cgd * 1. Redistributions of source code must retain the above copyright
15 1.1 cgd * notice, this list of conditions and the following disclaimer.
16 1.1 cgd * 2. Redistributions in binary form must reproduce the above copyright
17 1.1 cgd * notice, this list of conditions and the following disclaimer in the
18 1.1 cgd * documentation and/or other materials provided with the distribution.
19 1.1 cgd * 3. All advertising materials mentioning features or use of this software
20 1.1 cgd * must display the following acknowledgement:
21 1.1 cgd * This product includes software developed by the University of
22 1.1 cgd * California, Berkeley and its contributors.
23 1.1 cgd * 4. Neither the name of the University nor the names of its contributors
24 1.1 cgd * may be used to endorse or promote products derived from this software
25 1.1 cgd * without specific prior written permission.
26 1.1 cgd *
27 1.1 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 1.1 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 1.1 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 1.1 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 1.1 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 1.1 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 1.1 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 1.1 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 1.1 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 1.1 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 1.1 cgd * SUCH DAMAGE.
38 1.1 cgd */
39 1.1 cgd
40 1.16 kamil #if 0
41 1.16 kamil static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";
42 1.16 kamil #endif
43 1.16 kamil
44 1.6 lukem #include <sys/cdefs.h>
45 1.16 kamil #if defined(__NetBSD__)
46 1.143 rillig __RCSID("$NetBSD: lexi.c,v 1.143 2021/11/19 17:30:10 rillig Exp $");
47 1.16 kamil #elif defined(__FreeBSD__)
48 1.16 kamil __FBSDID("$FreeBSD: head/usr.bin/indent/lexi.c 337862 2018-08-15 18:19:45Z pstef $");
49 1.16 kamil #endif
50 1.1 cgd
51 1.142 rillig #include <assert.h>
52 1.1 cgd #include <ctype.h>
53 1.1 cgd #include <stdlib.h>
54 1.1 cgd #include <string.h>
55 1.16 kamil
56 1.16 kamil #include "indent.h"
57 1.1 cgd
/*
 * Within this file, 'lsym_type' is shorthand for
 * lsym_type_outside_parentheses.  The keywords table and lexi_alnum use it
 * merely as a marker for "this word is a type"; whether the token is finally
 * reported as lsym_type_outside_parentheses or lsym_type_in_parentheses is
 * decided later in lexi_alnum, independently of this marker.
 */
#define lsym_type lsym_type_outside_parentheses
63 1.127 rillig
/*
 * The words that the lexer treats specially, together with the lexer symbol
 * that lexi_alnum starts from when it sees them.
 *
 * Must be sorted in strcmp order (so the names starting with '_' come before
 * the lowercase names), since lexi_alnum looks words up using bsearch with
 * cmp_keyword_by_name.
 */
static const struct keyword {
    const char *name;		/* the spelling of the keyword */
    lexer_symbol lsym;		/* the symbol associated with the keyword */
} keywords[] = {
    {"_Bool", lsym_type},
    {"_Complex", lsym_type},
    {"_Imaginary", lsym_type},
    {"auto", lsym_storage_class},
    {"bool", lsym_type},
    {"break", lsym_word},
    {"case", lsym_case_label},
    {"char", lsym_type},
    {"complex", lsym_type},
    {"const", lsym_type},
    {"continue", lsym_word},
    {"default", lsym_case_label},
    {"do", lsym_do},
    {"double", lsym_type},
    {"else", lsym_else},
    {"enum", lsym_tag},
    {"extern", lsym_storage_class},
    {"float", lsym_type},
    {"for", lsym_for},
    {"goto", lsym_word},
    {"if", lsym_if},
    {"imaginary", lsym_type},
    {"inline", lsym_word},
    {"int", lsym_type},
    {"long", lsym_type},
    {"offsetof", lsym_offsetof},
    {"register", lsym_storage_class},
    {"restrict", lsym_word},
    {"return", lsym_return},
    {"short", lsym_type},
    {"signed", lsym_type},
    {"sizeof", lsym_sizeof},
    {"static", lsym_storage_class},
    {"struct", lsym_tag},
    {"switch", lsym_switch},
    {"typedef", lsym_typedef},
    {"union", lsym_tag},
    {"unsigned", lsym_type},
    {"void", lsym_type},
    {"volatile", lsym_type},
    {"while", lsym_while}
};
111 1.1 cgd
/*
 * The additional type names registered via register_typename.  The items
 * are kept sorted in strcmp order, as register_typename inserts each new
 * name at the position computed by bsearch_typenames.
 */
static struct {
    const char **items;		/* the registered names, sorted */
    unsigned int len;		/* number of names in use */
    unsigned int cap;		/* number of allocated slots in 'items' */
} typenames;
117 1.16 kamil
118 1.16 kamil /*
119 1.16 kamil * The transition table below was rewritten by hand from lx's output, given
120 1.16 kamil * the following definitions. lx is Katherine Flavel's lexer generator.
121 1.16 kamil *
122 1.16 kamil * O = /[0-7]/; D = /[0-9]/; NZ = /[1-9]/;
123 1.16 kamil * H = /[a-f0-9]/i; B = /[0-1]/; HP = /0x/i;
124 1.16 kamil * BP = /0b/i; E = /e[+\-]?/i D+; P = /p[+\-]?/i D+;
125 1.16 kamil * FS = /[fl]/i; IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?;
126 1.16 kamil *
127 1.16 kamil * D+ E FS? -> $float;
128 1.16 kamil * D* "." D+ E? FS? -> $float;
129 1.16 kamil * D+ "." E? FS? -> $float; HP H+ IS? -> $int;
130 1.16 kamil * HP H+ P FS? -> $float; NZ D* IS? -> $int;
131 1.16 kamil * HP H* "." H+ P FS? -> $float; "0" O* IS? -> $int;
132 1.16 kamil * HP H+ "." P FS -> $float; BP B+ IS? -> $int;
133 1.16 kamil */
134 1.71 rillig /* INDENT OFF */
135 1.82 rillig static const unsigned char lex_number_state[][26] = {
136 1.16 kamil /* examples:
137 1.16 kamil 00
138 1.16 kamil s 0xx
139 1.16 kamil t 00xaa
140 1.16 kamil a 11 101100xxa..
141 1.16 kamil r 11ee0001101lbuuxx.a.pp
142 1.16 kamil t.01.e+008bLuxll0Ll.aa.p+0
143 1.16 kamil states: ABCDEFGHIJKLMNOPQRSTUVWXYZ */
144 1.83 rillig [0] = "uuiifuufiuuiiuiiiiiuiuuuuu", /* (other) */
145 1.83 rillig [1] = "CEIDEHHHIJQ U Q VUVVZZZ", /* 0 */
146 1.83 rillig [2] = "DEIDEHHHIJQ U Q VUVVZZZ", /* 1 */
147 1.83 rillig [3] = "DEIDEHHHIJ U VUVVZZZ", /* 2 3 4 5 6 7 */
148 1.83 rillig [4] = "DEJDEHHHJJ U VUVVZZZ", /* 8 9 */
149 1.83 rillig [5] = " U VUVV ", /* A a C c D d */
150 1.83 rillig [6] = " K U VUVV ", /* B b */
151 1.83 rillig [7] = " FFF FF U VUVV ", /* E e */
152 1.83 rillig [8] = " f f U VUVV f", /* F f */
153 1.83 rillig [9] = " LLf fL PR Li L f", /* L */
154 1.83 rillig [10] = " OOf fO S P O i O f", /* l */
155 1.83 rillig [11] = " FFX ", /* P p */
156 1.83 rillig [12] = " MM M i iiM M ", /* U u */
157 1.83 rillig [13] = " N ", /* X x */
158 1.83 rillig [14] = " G Y ", /* + - */
159 1.83 rillig [15] = "B EE EE T W ", /* . */
160 1.16 kamil /* ABCDEFGHIJKLMNOPQRSTUVWXYZ */
161 1.1 cgd };
162 1.71 rillig /* INDENT ON */
163 1.1 cgd
/*
 * Maps an input character to its row in lex_number_state.  Characters that
 * are not listed get the value 0 from the implicit zero-initialization;
 * lex_number treats 0 as "this character cannot continue a number".  The
 * array only extends up to the largest listed character, therefore
 * lex_number checks the character against the array length before indexing.
 */
static const unsigned char lex_number_row[] = {
    ['0'] = 1,
    ['1'] = 2,
    ['2'] = 3, ['3'] = 3, ['4'] = 3, ['5'] = 3, ['6'] = 3, ['7'] = 3,
    ['8'] = 4, ['9'] = 4,
    ['A'] = 5, ['a'] = 5, ['C'] = 5, ['c'] = 5, ['D'] = 5, ['d'] = 5,
    ['B'] = 6, ['b'] = 6,
    ['E'] = 7, ['e'] = 7,
    ['F'] = 8, ['f'] = 8,
    ['L'] = 9,
    ['l'] = 10,
    ['P'] = 11, ['p'] = 11,
    ['U'] = 12, ['u'] = 12,
    ['X'] = 13, ['x'] = 13,
    ['+'] = 14, ['-'] = 14,
    ['.'] = 15,
};
181 1.36 rillig
182 1.25 rillig static void
183 1.25 rillig check_size_token(size_t desired_size)
184 1.25 rillig {
185 1.58 rillig if (token.e + desired_size >= token.l)
186 1.58 rillig buf_expand(&token, desired_size);
187 1.25 rillig }
188 1.25 rillig
189 1.87 rillig static void
190 1.87 rillig token_add_char(char ch)
191 1.87 rillig {
192 1.87 rillig check_size_token(1);
193 1.87 rillig *token.e++ = ch;
194 1.87 rillig }
195 1.87 rillig
196 1.20 rillig #ifdef debug
197 1.100 rillig static const char *
198 1.100 rillig lsym_name(lexer_symbol sym)
199 1.20 rillig {
200 1.20 rillig static const char *const name[] = {
201 1.100 rillig "eof",
202 1.100 rillig "preprocessing",
203 1.100 rillig "newline",
204 1.100 rillig "form_feed",
205 1.100 rillig "comment",
206 1.100 rillig "lparen_or_lbracket",
207 1.100 rillig "rparen_or_rbracket",
208 1.100 rillig "lbrace",
209 1.100 rillig "rbrace",
210 1.100 rillig "period",
211 1.100 rillig "unary_op",
212 1.100 rillig "binary_op",
213 1.100 rillig "postfix_op",
214 1.100 rillig "question",
215 1.100 rillig "colon",
216 1.100 rillig "comma",
217 1.100 rillig "semicolon",
218 1.100 rillig "typedef",
219 1.100 rillig "storage_class",
220 1.135 rillig "type_outside_parentheses",
221 1.134 rillig "type_in_parentheses",
222 1.100 rillig "tag",
223 1.100 rillig "case_label",
224 1.100 rillig "string_prefix",
225 1.120 rillig "sizeof",
226 1.121 rillig "offsetof",
227 1.134 rillig "word",
228 1.100 rillig "funcname",
229 1.100 rillig "do",
230 1.100 rillig "else",
231 1.100 rillig "for",
232 1.100 rillig "if",
233 1.100 rillig "switch",
234 1.100 rillig "while",
235 1.129 rillig "return",
236 1.20 rillig };
237 1.20 rillig
238 1.100 rillig return name[sym];
239 1.20 rillig }
240 1.20 rillig
241 1.20 rillig static void
242 1.72 rillig debug_print_buf(const char *name, const struct buffer *buf)
243 1.20 rillig {
244 1.72 rillig if (buf->s < buf->e) {
245 1.101 rillig debug_printf("%s ", name);
246 1.101 rillig debug_vis_range("\"", buf->s, buf->e, "\"\n");
247 1.20 rillig }
248 1.20 rillig }
249 1.20 rillig
/*
 * Print a single field of the parser state 'ps', but only if its value
 * differs from the same field in 'prev_ps'; both names must be in scope
 * where the macro is used.  debug_ps_bool prints the field as a checkbox,
 * debug_ps_int prints its numeric value.
 *
 * The bodies are wrapped in do { } while (0) so that each macro behaves
 * like a single statement, even when used as the branch of an if-else.
 */
#define debug_ps_bool(name) \
    do { \
	if (ps.name != prev_ps.name) \
	    debug_println("[%c] ps." #name, ps.name ? 'x' : ' '); \
    } while (0)
#define debug_ps_int(name) \
    do { \
	if (ps.name != prev_ps.name) \
	    debug_println("%3d ps." #name, ps.name); \
    } while (0)
256 1.112 rillig
/*
 * Debug output for a single token: print the line number, the name of the
 * lexer symbol and the token text, then the non-empty label, code and
 * comment buffers, then those fields of the parser state that differ from
 * the state remembered from the previous call.
 */
static void
debug_lexi(lexer_symbol lsym)
{
    /*
     * Watch out for 'rolled back parser state' in the debug output; the
     * differences around these are unreliable.
     */
    static struct parser_state prev_ps;

    debug_println("");
    debug_printf("line %d: %s", line_no, lsym_name(lsym));
    debug_vis_range(" \"", token.s, token.e, "\"\n");

    debug_print_buf("label", &lab);
    debug_print_buf("code", &code);
    debug_print_buf("comment", &com);

    /* The previous token is printed unconditionally. */
    debug_println(" ps.prev_token = %s", lsym_name(ps.prev_token));
    debug_ps_bool(next_col_1);
    debug_ps_bool(curr_col_1);
    debug_ps_bool(next_unary);
    /* procname is a string, so it needs strcmp instead of the macros. */
    if (strcmp(ps.procname, prev_ps.procname) != 0)
	debug_println(" ps.procname = '%s'", ps.procname);
    debug_ps_bool(want_blank);
    debug_ps_int(paren_level);
    debug_ps_int(p_l_follow);
    /* The indentations are only listed when the nesting level changed. */
    if (ps.paren_level != prev_ps.paren_level) {
	debug_printf(" ps.paren_indents:");
	for (int i = 0; i < ps.paren_level; i++)
	    debug_printf(" %d", ps.paren_indents[i]);
	debug_println("");
    }
    debug_ps_int(cast_mask);
    debug_ps_int(not_cast_mask);

    debug_ps_int(comment_delta);
    debug_ps_int(n_comment_delta);
    debug_ps_int(com_ind);

    debug_ps_bool(block_init);
    debug_ps_int(block_init_level);
    debug_ps_bool(init_or_struct);

    debug_ps_int(ind_level);
    debug_ps_int(ind_level_follow);

    debug_ps_int(decl_level);
    debug_ps_bool(decl_on_line);
    debug_ps_bool(in_decl);
    debug_ps_int(just_saw_decl);
    debug_ps_bool(in_parameter_declaration);
    debug_ps_bool(decl_indent_done);

    debug_ps_bool(in_stmt);
    debug_ps_bool(ind_stmt);
    debug_ps_bool(is_case_label);

    debug_ps_bool(search_stmt);

    /* Remember the current state as the baseline for the next call. */
    prev_ps = ps;
}
318 1.96 rillig #endif
319 1.20 rillig
/*
 * Common exit point for the tokens returned by lexi.  In a debug build, the
 * token and the changed parts of the parser state are logged; in any case
 * the symbol is returned unchanged.
 */
/* ARGSUSED */
static lexer_symbol
lexi_end(lexer_symbol lsym)
{
#ifdef debug
    debug_lexi(lsym);
#endif
    return lsym;
}
329 1.20 rillig
330 1.43 rillig static void
331 1.43 rillig lex_number(void)
332 1.43 rillig {
333 1.115 rillig for (unsigned char s = 'A'; s != 'f' && s != 'i' && s != 'u';) {
334 1.141 rillig unsigned char ch = (unsigned char)inp_peek();
335 1.94 rillig if (ch >= array_length(lex_number_row) || lex_number_row[ch] == 0)
336 1.56 rillig break;
337 1.75 rillig
338 1.115 rillig unsigned char row = lex_number_row[ch];
339 1.82 rillig if (lex_number_state[row][s - 'A'] == ' ') {
340 1.71 rillig /*-
341 1.82 rillig * lex_number_state[0][s - 'A'] now indicates the type:
342 1.74 rillig * f = floating, i = integer, u = unknown
343 1.56 rillig */
344 1.138 rillig return;
345 1.43 rillig }
346 1.75 rillig
347 1.82 rillig s = lex_number_state[row][s - 'A'];
348 1.133 rillig token_add_char(inp_next());
349 1.43 rillig }
350 1.43 rillig }
351 1.43 rillig
/*
 * Append the characters of a word to the token.  A word consists of
 * alphanumeric characters, '_' and '$'.  A backslash that is directly
 * followed by a newline is skipped together with that newline, and the word
 * continues on the next line; any other backslash ends the word.
 *
 * After such a line continuation, the next character is checked like any
 * other character.  Previously it was appended to the token unconditionally,
 * so that a blank or an operator following the continuation became part of
 * the word.
 */
static void
lex_word(void)
{
    for (;;) {
	if (inp_peek() == '\\') {
	    if (inp_lookahead(1) != '\n')
		break;
	    inp_skip();		/* the backslash */
	    inp_skip();		/* the newline */
	    continue;		/* re-check the character after it */
	}

	if (!(isalnum((unsigned char)inp_peek()) ||
		inp_peek() == '_' || inp_peek() == '$'))
	    break;

	token_add_char(inp_next());
    }
}
370 1.43 rillig
371 1.43 rillig static void
372 1.43 rillig lex_char_or_string(void)
373 1.43 rillig {
374 1.132 rillig for (char delim = token.e[-1];;) {
375 1.141 rillig if (inp_peek() == '\n') {
376 1.52 rillig diag(1, "Unterminated literal");
377 1.52 rillig return;
378 1.52 rillig }
379 1.75 rillig
380 1.133 rillig token_add_char(inp_next());
381 1.52 rillig if (token.e[-1] == delim)
382 1.52 rillig return;
383 1.75 rillig
384 1.52 rillig if (token.e[-1] == '\\') {
385 1.141 rillig if (inp_peek() == '\n')
386 1.52 rillig ++line_no;
387 1.133 rillig token_add_char(inp_next());
388 1.52 rillig }
389 1.52 rillig }
390 1.43 rillig }
391 1.43 rillig
392 1.84 rillig /* Guess whether the current token is a declared type. */
393 1.57 rillig static bool
394 1.107 rillig probably_typename(void)
395 1.57 rillig {
396 1.107 rillig if (ps.block_init || ps.in_stmt)
397 1.70 rillig return false;
398 1.142 rillig if (inp_peek() == '*' && inp_lookahead(1) != '=')
399 1.70 rillig goto maybe;
400 1.141 rillig if (isalpha((unsigned char)inp_peek()))
401 1.70 rillig goto maybe;
402 1.70 rillig return false;
403 1.70 rillig maybe:
404 1.110 rillig return ps.prev_token == lsym_semicolon ||
405 1.110 rillig ps.prev_token == lsym_lbrace ||
406 1.110 rillig ps.prev_token == lsym_rbrace;
407 1.57 rillig }
408 1.57 rillig
409 1.84 rillig static int
410 1.84 rillig bsearch_typenames(const char *key)
411 1.84 rillig {
412 1.84 rillig const char **arr = typenames.items;
413 1.84 rillig int lo = 0;
414 1.84 rillig int hi = (int)typenames.len - 1;
415 1.84 rillig
416 1.84 rillig while (lo <= hi) {
417 1.84 rillig int mid = (int)((unsigned)(lo + hi) >> 1);
418 1.84 rillig int cmp = strcmp(arr[mid], key);
419 1.84 rillig if (cmp < 0)
420 1.84 rillig lo = mid + 1;
421 1.84 rillig else if (cmp > 0)
422 1.84 rillig hi = mid - 1;
423 1.84 rillig else
424 1.84 rillig return mid;
425 1.84 rillig }
426 1.84 rillig return -(lo + 1);
427 1.84 rillig }
428 1.84 rillig
429 1.63 rillig static bool
430 1.63 rillig is_typename(void)
431 1.63 rillig {
432 1.84 rillig if (opt.auto_typedefs &&
433 1.84 rillig token.e - token.s >= 2 && memcmp(token.e - 2, "_t", 2) == 0)
434 1.84 rillig return true;
435 1.63 rillig
436 1.84 rillig return bsearch_typenames(token.s) >= 0;
437 1.63 rillig }
438 1.63 rillig
439 1.115 rillig static int
440 1.115 rillig cmp_keyword_by_name(const void *key, const void *elem)
441 1.115 rillig {
442 1.115 rillig return strcmp(key, ((const struct keyword *)elem)->name);
443 1.115 rillig }
444 1.115 rillig
/*
 * Read an alphanumeric token (a number or a word) into 'token' and classify
 * it.  If the input does not start with such a token, nothing is consumed
 * and lsym_eof is returned as a placeholder.
 *
 * Besides returning the symbol, this function updates ps.next_unary and,
 * depending on the kind of word, ps.cast_mask, ps.procname and
 * ps.in_parameter_declaration.
 */
static lexer_symbol
lexi_alnum(void)
{
    /* A number starts with a digit, or with a '.' followed by a digit. */
    if (isdigit((unsigned char)inp_peek()) ||
	(inp_peek() == '.' && isdigit((unsigned char)inp_lookahead(1)))) {
	lex_number();
    } else if (isalnum((unsigned char)inp_peek()) ||
	inp_peek() == '_' || inp_peek() == '$') {
	lex_word();
    } else
	return lsym_eof;	/* just as a placeholder */

    *token.e = '\0';

    /* A lone 'L' directly before a quote is the prefix of a literal. */
    if (token.s[0] == 'L' && token.s[1] == '\0' &&
	(inp_peek() == '"' || inp_peek() == '\''))
	return lsym_string_prefix;

    /* Skip the blanks, so that inp_peek below sees what follows the word. */
    while (ch_isblank(inp_peek()))
	inp_skip();

    /* The word after 'struct', 'union' or 'enum' is reported as a type. */
    if (ps.prev_token == lsym_tag && ps.p_l_follow == 0) {
	ps.next_unary = true;
	return lsym_type_outside_parentheses;
    }

    /* Operator after identifier is binary unless last token was 'struct'. */
    ps.next_unary = ps.prev_token == lsym_tag;

    const struct keyword *kw = bsearch(token.s, keywords,
	array_length(keywords), sizeof(keywords[0]), cmp_keyword_by_name);
    bool is_type = false;
    if (kw == NULL) {
	if (is_typename()) {
	    is_type = true;
	    ps.next_unary = true;
	    /* Continue in the keyword branch below, with kw == NULL. */
	    goto found_typename;
	}

    } else {			/* we have a keyword */
	/* In this file, lsym_type means lsym_type_outside_parentheses. */
	is_type = kw->lsym == lsym_type;
	ps.next_unary = true;
	/* All keywords except tags and types are returned as they are. */
	if (kw->lsym != lsym_tag && kw->lsym != lsym_type)
	    return kw->lsym;

found_typename:
	if (ps.p_l_follow > 0) {
	    /* inside parentheses: cast, param list, offsetof or sizeof */
	    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
	}
	if (ps.prev_token != lsym_period && ps.prev_token != lsym_unary_op) {
	    if (kw != NULL && kw->lsym == lsym_tag)
		return lsym_tag;
	    if (ps.p_l_follow == 0)
		return lsym_type_outside_parentheses;
	}
    }

    if (inp_peek() == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
	!ps.in_parameter_declaration && !ps.block_init) {

	/*
	 * Scan the rest of the line; a ')' that is directly followed by ';'
	 * or ',' means that this is not a function definition.
	 */
	for (const char *p = inp_p(), *e = inp_line_end(); p < e;)
	    if (*p++ == ')' && (*p == ';' || *p == ','))
		goto no_function_definition;

	/*
	 * Copies at most sizeof ps.procname - 1 bytes, so the last byte of
	 * ps.procname is never written here.  NOTE(review): the termination
	 * of a truncated name relies on that last byte already being '\0';
	 * confirm against the initialization of ps.
	 */
	strncpy(ps.procname, token.s, sizeof ps.procname - 1);
	if (ps.in_decl)
	    ps.in_parameter_declaration = true;
	return lsym_funcname;
no_function_definition:;

    } else if (ps.p_l_follow == 0 && probably_typename()) {
	ps.next_unary = true;
	return lsym_type_outside_parentheses;
    }

    return is_type ? lsym_type_in_parentheses : lsym_word;
}
524 1.75 rillig
/*
 * Reads the next token, placing it in the global variable "token".
 *
 * Returns the lexer symbol that classifies the token.  As a side effect,
 * the column-1 flags and ps.next_unary are updated, and some tokens modify
 * further fields of the parser state (see the individual cases).
 */
lexer_symbol
lexi(void)
{
    /* Start with an empty token. */
    token.e = token.s;
    ps.curr_col_1 = ps.next_col_1;
    ps.next_col_1 = false;

    /* A token that is preceded by blanks does not start in column 1. */
    while (ch_isblank(inp_peek())) {
	ps.curr_col_1 = false;
	inp_skip();
    }

    /* lsym_eof is the placeholder for "not an alphanumeric token". */
    lexer_symbol alnum_lsym = lexi_alnum();
    if (alnum_lsym != lsym_eof)
	return lexi_end(alnum_lsym);

    /* Scan a non-alphanumeric token */

    check_size_token(3);	/* for things like "<<=" */
    *token.e++ = inp_next();
    *token.e = '\0';

    lexer_symbol lsym;
    bool unary_delim = false;	/* whether the current token forces a
				 * following operator to be unary */

    /* Dispatch on the first character of the token. */
    switch (token.e[-1]) {
    case '\n':
	unary_delim = ps.next_unary;
	ps.next_col_1 = true;
	/* if data has been exhausted, the newline is a dummy. */
	lsym = had_eof ? lsym_eof : lsym_newline;
	break;

    case '\'':
    case '"':
	/* Character and string literals are reported as words. */
	lex_char_or_string();
	lsym = lsym_word;
	break;

    case '(':
    case '[':
	unary_delim = true;
	lsym = lsym_lparen_or_lbracket;
	break;

    case ')':
    case ']':
	lsym = lsym_rparen_or_rbracket;
	break;

    case '#':
	unary_delim = ps.next_unary;
	lsym = lsym_preprocessing;
	break;

    case '?':
	unary_delim = true;
	lsym = lsym_question;
	break;

    case ':':
	lsym = lsym_colon;
	unary_delim = true;
	break;

    case ';':
	unary_delim = true;
	lsym = lsym_semicolon;
	break;

    case '{':
	unary_delim = true;
	lsym = lsym_lbrace;
	break;

    case '}':
	unary_delim = true;
	lsym = lsym_rbrace;
	break;

    case '\f':
	unary_delim = ps.next_unary;
	ps.next_col_1 = true;
	lsym = lsym_form_feed;
	break;

    case ',':
	unary_delim = true;
	lsym = lsym_comma;
	break;

    case '.':
	unary_delim = false;
	lsym = lsym_period;
	break;

    case '-':
    case '+':
	lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
	unary_delim = true;

	if (inp_peek() == token.e[-1]) {	/* ++, -- */
	    *token.e++ = inp_next();
	    /* After a word or a closing bracket, it may be a postfix. */
	    if (ps.prev_token == lsym_word ||
		ps.prev_token == lsym_rparen_or_rbracket) {
		lsym = ps.next_unary ? lsym_unary_op : lsym_postfix_op;
		unary_delim = false;
	    }

	} else if (inp_peek() == '=') {	/* += */
	    *token.e++ = inp_next();

	} else if (inp_peek() == '>') {	/* -> */
	    /*
	     * NOTE(review): this branch does not check that the first
	     * character was '-', so '+>' is handled like '->' as well.
	     */
	    *token.e++ = inp_next();
	    unary_delim = false;
	    lsym = lsym_unary_op;
	    ps.want_blank = false;
	}
	break;

    case '=':
	if (ps.init_or_struct)
	    ps.block_init = true;
	if (inp_peek() == '=') {	/* == */
	    *token.e++ = inp_next();
	    *token.e = '\0';
	}
	lsym = lsym_binary_op;
	unary_delim = true;
	break;

    case '>':
    case '<':
    case '!':			/* ops like <, <<, <=, !=, etc */
	/* At most two more characters, covered by check_size_token(3). */
	if (inp_peek() == '>' || inp_peek() == '<' || inp_peek() == '=')
	    *token.e++ = inp_next();
	if (inp_peek() == '=')
	    *token.e++ = inp_next();
	lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
	unary_delim = true;
	break;

    case '*':
	unary_delim = true;
	if (!ps.next_unary) {
	    /* binary '*' or '*=' */
	    if (inp_peek() == '=')
		*token.e++ = inp_next();
	    lsym = lsym_binary_op;
	    break;
	}

	/*
	 * Unary '*': collect all directly following '*' into the same
	 * token, dropping the whitespace between them.
	 */
	while (inp_peek() == '*' || isspace((unsigned char)inp_peek())) {
	    if (inp_peek() == '*')
		token_add_char('*');
	    inp_skip();
	}

	if (ps.in_decl) {
	    /*
	     * Look ahead, without consuming anything, over the letters and
	     * whitespace that follow; if a '(' comes next, the first
	     * character of ps.procname is set to ' '.
	     */
	    const char *tp = inp_p();

	    while (isalpha((unsigned char)*tp) ||
		isspace((unsigned char)*tp)) {
		if (++tp >= inp_line_end()) {
		    /*
		     * The lookahead reached the end of the line, so more
		     * input is needed.  'tp' points into the input, so it
		     * is only usable if reading the line left the current
		     * input position unchanged; otherwise give up.
		     */
		    const char *p_before = inp_p();
		    inp_read_line();
		    if (inp_p() != p_before)
			abort();
		}
	    }
	    if (*tp == '(')
		ps.procname[0] = ' ';
	}

	lsym = lsym_unary_op;
	break;

    default:
	/* Both comment styles are announced by their first 2 characters. */
	if (token.e[-1] == '/' && (inp_peek() == '*' || inp_peek() == '/')) {
	    *token.e++ = inp_next();
	    lsym = lsym_comment;
	    unary_delim = ps.next_unary;
	    break;
	}

	/* handle '||', '&&', etc., and also things as in 'int *****i' */
	while (token.e[-1] == inp_peek() || inp_peek() == '=')
	    token_add_char(inp_next());

	lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
	unary_delim = true;
    }

    ps.next_unary = unary_delim;

    /* Terminate the token, as the cases above may have extended it. */
    check_size_token(1);
    *token.e = '\0';

    return lexi_end(lsym);
}
726 1.16 kamil
727 1.6 lukem void
728 1.128 rillig register_typename(const char *name)
729 1.1 cgd {
730 1.64 rillig if (typenames.len >= typenames.cap) {
731 1.64 rillig typenames.cap = 16 + 2 * typenames.cap;
732 1.64 rillig typenames.items = xrealloc(typenames.items,
733 1.64 rillig sizeof(typenames.items[0]) * typenames.cap);
734 1.64 rillig }
735 1.16 kamil
736 1.84 rillig int pos = bsearch_typenames(name);
737 1.64 rillig if (pos >= 0)
738 1.64 rillig return; /* already in the list */
739 1.75 rillig
740 1.64 rillig pos = -(pos + 1);
741 1.64 rillig memmove(typenames.items + pos + 1, typenames.items + pos,
742 1.73 rillig sizeof(typenames.items[0]) * (typenames.len++ - (unsigned)pos));
743 1.64 rillig typenames.items[pos] = xstrdup(name);
744 1.1 cgd }
745