lexi.c revision 1.106 1 1.106 rillig /* $NetBSD: lexi.c,v 1.106 2021/10/28 21:56:26 rillig Exp $ */
2 1.3 tls
3 1.16 kamil /*-
4 1.16 kamil * SPDX-License-Identifier: BSD-4-Clause
5 1.16 kamil *
6 1.16 kamil * Copyright (c) 1985 Sun Microsystems, Inc.
7 1.5 mrg * Copyright (c) 1980, 1993
8 1.5 mrg * The Regents of the University of California. All rights reserved.
9 1.1 cgd * All rights reserved.
10 1.1 cgd *
11 1.1 cgd * Redistribution and use in source and binary forms, with or without
12 1.1 cgd * modification, are permitted provided that the following conditions
13 1.1 cgd * are met:
14 1.1 cgd * 1. Redistributions of source code must retain the above copyright
15 1.1 cgd * notice, this list of conditions and the following disclaimer.
16 1.1 cgd * 2. Redistributions in binary form must reproduce the above copyright
17 1.1 cgd * notice, this list of conditions and the following disclaimer in the
18 1.1 cgd * documentation and/or other materials provided with the distribution.
19 1.1 cgd * 3. All advertising materials mentioning features or use of this software
20 1.1 cgd * must display the following acknowledgement:
21 1.1 cgd * This product includes software developed by the University of
22 1.1 cgd * California, Berkeley and its contributors.
23 1.1 cgd * 4. Neither the name of the University nor the names of its contributors
24 1.1 cgd * may be used to endorse or promote products derived from this software
25 1.1 cgd * without specific prior written permission.
26 1.1 cgd *
27 1.1 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 1.1 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 1.1 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 1.1 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 1.1 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 1.1 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 1.1 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 1.1 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 1.1 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 1.1 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 1.1 cgd * SUCH DAMAGE.
38 1.1 cgd */
39 1.1 cgd
40 1.16 kamil #if 0
41 1.16 kamil static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";
42 1.16 kamil #endif
43 1.16 kamil
44 1.6 lukem #include <sys/cdefs.h>
45 1.16 kamil #if defined(__NetBSD__)
46 1.106 rillig __RCSID("$NetBSD: lexi.c,v 1.106 2021/10/28 21:56:26 rillig Exp $");
47 1.16 kamil #elif defined(__FreeBSD__)
48 1.16 kamil __FBSDID("$FreeBSD: head/usr.bin/indent/lexi.c 337862 2018-08-15 18:19:45Z pstef $");
49 1.16 kamil #endif
50 1.1 cgd
51 1.93 rillig #include <sys/param.h>
52 1.20 rillig #include <assert.h>
53 1.1 cgd #include <stdio.h>
54 1.1 cgd #include <ctype.h>
55 1.1 cgd #include <stdlib.h>
56 1.1 cgd #include <string.h>
57 1.16 kamil
58 1.16 kamil #include "indent.h"
59 1.1 cgd
60 1.60 rillig /* must be sorted alphabetically, is used in binary search */
61 1.62 rillig static const struct keyword {
62 1.62 rillig const char *name;
63 1.62 rillig enum keyword_kind kind;
64 1.62 rillig } keywords[] = {
65 1.62 rillig {"_Bool", kw_type},
66 1.62 rillig {"_Complex", kw_type},
67 1.62 rillig {"_Imaginary", kw_type},
68 1.62 rillig {"auto", kw_storage_class},
69 1.62 rillig {"bool", kw_type},
70 1.62 rillig {"break", kw_jump},
71 1.62 rillig {"case", kw_case_or_default},
72 1.62 rillig {"char", kw_type},
73 1.62 rillig {"complex", kw_type},
74 1.62 rillig {"const", kw_type},
75 1.62 rillig {"continue", kw_jump},
76 1.62 rillig {"default", kw_case_or_default},
77 1.97 rillig {"do", kw_do},
78 1.62 rillig {"double", kw_type},
79 1.97 rillig {"else", kw_else},
80 1.62 rillig {"enum", kw_struct_or_union_or_enum},
81 1.62 rillig {"extern", kw_storage_class},
82 1.62 rillig {"float", kw_type},
83 1.98 rillig {"for", kw_for},
84 1.62 rillig {"goto", kw_jump},
85 1.98 rillig {"if", kw_if},
86 1.62 rillig {"imaginary", kw_type},
87 1.62 rillig {"inline", kw_inline_or_restrict},
88 1.62 rillig {"int", kw_type},
89 1.62 rillig {"long", kw_type},
90 1.62 rillig {"offsetof", kw_offsetof},
91 1.62 rillig {"register", kw_storage_class},
92 1.62 rillig {"restrict", kw_inline_or_restrict},
93 1.62 rillig {"return", kw_jump},
94 1.62 rillig {"short", kw_type},
95 1.62 rillig {"signed", kw_type},
96 1.62 rillig {"sizeof", kw_sizeof},
97 1.62 rillig {"static", kw_storage_class},
98 1.62 rillig {"struct", kw_struct_or_union_or_enum},
99 1.62 rillig {"switch", kw_switch},
100 1.62 rillig {"typedef", kw_typedef},
101 1.62 rillig {"union", kw_struct_or_union_or_enum},
102 1.62 rillig {"unsigned", kw_type},
103 1.62 rillig {"void", kw_type},
104 1.62 rillig {"volatile", kw_type},
105 1.98 rillig {"while", kw_while}
106 1.1 cgd };
107 1.1 cgd
108 1.84 rillig static struct {
109 1.64 rillig const char **items;
110 1.64 rillig unsigned int len;
111 1.64 rillig unsigned int cap;
112 1.64 rillig } typenames;
113 1.16 kamil
114 1.16 kamil /*
115 1.16 kamil * The transition table below was rewritten by hand from lx's output, given
116 1.16 kamil * the following definitions. lx is Katherine Flavel's lexer generator.
117 1.16 kamil *
118 1.16 kamil * O = /[0-7]/; D = /[0-9]/; NZ = /[1-9]/;
119 1.16 kamil * H = /[a-f0-9]/i; B = /[0-1]/; HP = /0x/i;
120 1.16 kamil * BP = /0b/i; E = /e[+\-]?/i D+; P = /p[+\-]?/i D+;
121 1.16 kamil * FS = /[fl]/i; IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?;
122 1.16 kamil *
123 1.16 kamil * D+ E FS? -> $float;
124 1.16 kamil * D* "." D+ E? FS? -> $float;
125 1.16 kamil * D+ "." E? FS? -> $float; HP H+ IS? -> $int;
126 1.16 kamil * HP H+ P FS? -> $float; NZ D* IS? -> $int;
127 1.16 kamil * HP H* "." H+ P FS? -> $float; "0" O* IS? -> $int;
128 1.16 kamil * HP H+ "." P FS -> $float; BP B+ IS? -> $int;
129 1.16 kamil */
130 1.71 rillig /* INDENT OFF */
131 1.82 rillig static const unsigned char lex_number_state[][26] = {
132 1.16 kamil /* examples:
133 1.16 kamil 00
134 1.16 kamil s 0xx
135 1.16 kamil t 00xaa
136 1.16 kamil a 11 101100xxa..
137 1.16 kamil r 11ee0001101lbuuxx.a.pp
138 1.16 kamil t.01.e+008bLuxll0Ll.aa.p+0
139 1.16 kamil states: ABCDEFGHIJKLMNOPQRSTUVWXYZ */
140 1.83 rillig [0] = "uuiifuufiuuiiuiiiiiuiuuuuu", /* (other) */
141 1.83 rillig [1] = "CEIDEHHHIJQ U Q VUVVZZZ", /* 0 */
142 1.83 rillig [2] = "DEIDEHHHIJQ U Q VUVVZZZ", /* 1 */
143 1.83 rillig [3] = "DEIDEHHHIJ U VUVVZZZ", /* 2 3 4 5 6 7 */
144 1.83 rillig [4] = "DEJDEHHHJJ U VUVVZZZ", /* 8 9 */
145 1.83 rillig [5] = " U VUVV ", /* A a C c D d */
146 1.83 rillig [6] = " K U VUVV ", /* B b */
147 1.83 rillig [7] = " FFF FF U VUVV ", /* E e */
148 1.83 rillig [8] = " f f U VUVV f", /* F f */
149 1.83 rillig [9] = " LLf fL PR Li L f", /* L */
150 1.83 rillig [10] = " OOf fO S P O i O f", /* l */
151 1.83 rillig [11] = " FFX ", /* P p */
152 1.83 rillig [12] = " MM M i iiM M ", /* U u */
153 1.83 rillig [13] = " N ", /* X x */
154 1.83 rillig [14] = " G Y ", /* + - */
155 1.83 rillig [15] = "B EE EE T W ", /* . */
156 1.16 kamil /* ABCDEFGHIJKLMNOPQRSTUVWXYZ */
157 1.1 cgd };
158 1.71 rillig /* INDENT ON */
159 1.1 cgd
160 1.82 rillig static const uint8_t lex_number_row[] = {
161 1.56 rillig ['0'] = 1,
162 1.56 rillig ['1'] = 2,
163 1.56 rillig ['2'] = 3, ['3'] = 3, ['4'] = 3, ['5'] = 3, ['6'] = 3, ['7'] = 3,
164 1.56 rillig ['8'] = 4, ['9'] = 4,
165 1.56 rillig ['A'] = 5, ['a'] = 5, ['C'] = 5, ['c'] = 5, ['D'] = 5, ['d'] = 5,
166 1.56 rillig ['B'] = 6, ['b'] = 6,
167 1.56 rillig ['E'] = 7, ['e'] = 7,
168 1.56 rillig ['F'] = 8, ['f'] = 8,
169 1.56 rillig ['L'] = 9,
170 1.56 rillig ['l'] = 10,
171 1.56 rillig ['P'] = 11, ['p'] = 11,
172 1.56 rillig ['U'] = 12, ['u'] = 12,
173 1.56 rillig ['X'] = 13, ['x'] = 13,
174 1.56 rillig ['+'] = 14, ['-'] = 14,
175 1.56 rillig ['.'] = 15,
176 1.56 rillig };
177 1.36 rillig
178 1.32 rillig static char
179 1.32 rillig inbuf_peek(void)
180 1.32 rillig {
181 1.78 rillig return *inp.s;
182 1.32 rillig }
183 1.32 rillig
184 1.66 rillig void
185 1.32 rillig inbuf_skip(void)
186 1.32 rillig {
187 1.78 rillig inp.s++;
188 1.78 rillig if (inp.s >= inp.e)
189 1.81 rillig inbuf_read_line();
190 1.32 rillig }
191 1.32 rillig
192 1.66 rillig char
193 1.32 rillig inbuf_next(void)
194 1.32 rillig {
195 1.32 rillig char ch = inbuf_peek();
196 1.32 rillig inbuf_skip();
197 1.32 rillig return ch;
198 1.32 rillig }
199 1.32 rillig
200 1.25 rillig static void
201 1.25 rillig check_size_token(size_t desired_size)
202 1.25 rillig {
203 1.58 rillig if (token.e + desired_size >= token.l)
204 1.58 rillig buf_expand(&token, desired_size);
205 1.25 rillig }
206 1.25 rillig
207 1.87 rillig static void
208 1.87 rillig token_add_char(char ch)
209 1.87 rillig {
210 1.87 rillig check_size_token(1);
211 1.87 rillig *token.e++ = ch;
212 1.87 rillig }
213 1.87 rillig
214 1.16 kamil static int
215 1.62 rillig cmp_keyword_by_name(const void *key, const void *elem)
216 1.16 kamil {
217 1.62 rillig return strcmp(key, ((const struct keyword *)elem)->name);
218 1.27 rillig }
219 1.27 rillig
220 1.20 rillig #ifdef debug
221 1.100 rillig static const char *
222 1.100 rillig lsym_name(lexer_symbol sym)
223 1.20 rillig {
224 1.20 rillig static const char *const name[] = {
225 1.100 rillig "eof",
226 1.100 rillig "preprocessing",
227 1.100 rillig "newline",
228 1.100 rillig "form_feed",
229 1.100 rillig "comment",
230 1.100 rillig "lparen_or_lbracket",
231 1.100 rillig "rparen_or_rbracket",
232 1.100 rillig "lbrace",
233 1.100 rillig "rbrace",
234 1.100 rillig "period",
235 1.100 rillig "unary_op",
236 1.100 rillig "binary_op",
237 1.100 rillig "postfix_op",
238 1.100 rillig "question",
239 1.100 rillig "colon",
240 1.100 rillig "comma",
241 1.100 rillig "semicolon",
242 1.100 rillig "typedef",
243 1.100 rillig "storage_class",
244 1.100 rillig "type",
245 1.100 rillig "tag",
246 1.100 rillig "case_label",
247 1.100 rillig "string_prefix",
248 1.100 rillig "ident",
249 1.100 rillig "funcname",
250 1.100 rillig "do",
251 1.100 rillig "else",
252 1.100 rillig "for",
253 1.100 rillig "if",
254 1.100 rillig "switch",
255 1.100 rillig "while",
256 1.20 rillig };
257 1.20 rillig
258 1.100 rillig assert(array_length(name) == (int)lsym_while + 1);
259 1.20 rillig
260 1.100 rillig return name[sym];
261 1.20 rillig }
262 1.20 rillig
263 1.101 rillig static const char *
264 1.103 rillig kw_name(enum keyword_kind kw)
265 1.103 rillig {
266 1.101 rillig static const char *name[] = {
267 1.101 rillig "0",
268 1.101 rillig "offsetof",
269 1.101 rillig "sizeof",
270 1.101 rillig "struct_or_union_or_enum",
271 1.101 rillig "type",
272 1.101 rillig "for",
273 1.101 rillig "if",
274 1.101 rillig "while",
275 1.101 rillig "do",
276 1.101 rillig "else",
277 1.101 rillig "switch",
278 1.101 rillig "case_or_default",
279 1.101 rillig "jump",
280 1.101 rillig "storage_class",
281 1.101 rillig "typedef",
282 1.101 rillig "inline_or_restrict",
283 1.101 rillig };
284 1.101 rillig
285 1.101 rillig return name[kw];
286 1.101 rillig }
287 1.101 rillig
288 1.20 rillig static void
289 1.72 rillig debug_print_buf(const char *name, const struct buffer *buf)
290 1.20 rillig {
291 1.72 rillig if (buf->s < buf->e) {
292 1.101 rillig debug_printf("%s ", name);
293 1.101 rillig debug_vis_range("\"", buf->s, buf->e, "\"\n");
294 1.20 rillig }
295 1.20 rillig }
296 1.20 rillig
297 1.101 rillig static void
298 1.104 rillig debug_lexi(const struct parser_state *state, lexer_symbol lsym)
299 1.20 rillig {
300 1.104 rillig debug_println("");
301 1.101 rillig debug_printf("line %d\n", line_no);
302 1.104 rillig if (state != &ps)
303 1.104 rillig debug_println("transient state");
304 1.72 rillig debug_print_buf("label", &lab);
305 1.72 rillig debug_print_buf("code", &code);
306 1.72 rillig debug_print_buf("comment", &com);
307 1.101 rillig debug_printf("lexi returns '%s'", lsym_name(lsym));
308 1.105 rillig if (state->curr_keyword != kw_0)
309 1.105 rillig debug_printf(" keyword '%s'", kw_name(state->curr_keyword));
310 1.105 rillig if (state->prev_keyword != kw_0)
311 1.105 rillig debug_printf(" previous keyword '%s'", kw_name(state->prev_keyword));
312 1.104 rillig debug_println("");
313 1.101 rillig debug_print_buf("token", &token);
314 1.101 rillig }
315 1.96 rillig #endif
316 1.20 rillig
317 1.104 rillig /* ARGSUSED */
318 1.101 rillig static lexer_symbol
319 1.104 rillig lexi_end(const struct parser_state *state, lexer_symbol lsym)
320 1.101 rillig {
321 1.101 rillig #ifdef debug
322 1.104 rillig debug_lexi(state, lsym);
323 1.101 rillig #endif
324 1.100 rillig return lsym;
325 1.20 rillig }
326 1.20 rillig
327 1.43 rillig static void
328 1.43 rillig lex_number(void)
329 1.43 rillig {
330 1.71 rillig for (uint8_t s = 'A'; s != 'f' && s != 'i' && s != 'u';) {
331 1.78 rillig uint8_t ch = (uint8_t)*inp.s;
332 1.94 rillig if (ch >= array_length(lex_number_row) || lex_number_row[ch] == 0)
333 1.56 rillig break;
334 1.75 rillig
335 1.82 rillig uint8_t row = lex_number_row[ch];
336 1.82 rillig if (lex_number_state[row][s - 'A'] == ' ') {
337 1.71 rillig /*-
338 1.82 rillig * lex_number_state[0][s - 'A'] now indicates the type:
339 1.74 rillig * f = floating, i = integer, u = unknown
340 1.56 rillig */
341 1.43 rillig break;
342 1.43 rillig }
343 1.75 rillig
344 1.82 rillig s = lex_number_state[row][s - 'A'];
345 1.87 rillig token_add_char(inbuf_next());
346 1.43 rillig }
347 1.43 rillig }
348 1.43 rillig
349 1.43 rillig static void
350 1.43 rillig lex_word(void)
351 1.43 rillig {
352 1.78 rillig while (isalnum((unsigned char)*inp.s) ||
353 1.95 rillig *inp.s == '\\' ||
354 1.95 rillig *inp.s == '_' || *inp.s == '$') {
355 1.75 rillig
356 1.78 rillig if (*inp.s == '\\') {
357 1.78 rillig if (inp.s[1] == '\n') {
358 1.78 rillig inp.s += 2;
359 1.78 rillig if (inp.s >= inp.e)
360 1.81 rillig inbuf_read_line();
361 1.43 rillig } else
362 1.43 rillig break;
363 1.43 rillig }
364 1.75 rillig
365 1.87 rillig token_add_char(inbuf_next());
366 1.43 rillig }
367 1.43 rillig }
368 1.43 rillig
369 1.43 rillig static void
370 1.43 rillig lex_char_or_string(void)
371 1.43 rillig {
372 1.52 rillig for (char delim = *token.s;;) {
373 1.78 rillig if (*inp.s == '\n') {
374 1.52 rillig diag(1, "Unterminated literal");
375 1.52 rillig return;
376 1.52 rillig }
377 1.75 rillig
378 1.87 rillig token_add_char(inbuf_next());
379 1.52 rillig if (token.e[-1] == delim)
380 1.52 rillig return;
381 1.75 rillig
382 1.52 rillig if (token.e[-1] == '\\') {
383 1.78 rillig if (*inp.s == '\n')
384 1.52 rillig ++line_no;
385 1.87 rillig token_add_char(inbuf_next());
386 1.52 rillig }
387 1.52 rillig }
388 1.43 rillig }
389 1.43 rillig
390 1.84 rillig /* Guess whether the current token is a declared type. */
391 1.57 rillig static bool
392 1.84 rillig probably_typename(const struct parser_state *state)
393 1.57 rillig {
394 1.70 rillig if (state->p_l_follow != 0)
395 1.70 rillig return false;
396 1.70 rillig if (state->block_init || state->in_stmt)
397 1.70 rillig return false;
398 1.78 rillig if (inp.s[0] == '*' && inp.s[1] != '=')
399 1.70 rillig goto maybe;
400 1.78 rillig if (isalpha((unsigned char)*inp.s))
401 1.70 rillig goto maybe;
402 1.70 rillig return false;
403 1.70 rillig maybe:
404 1.100 rillig return state->last_token == lsym_semicolon ||
405 1.100 rillig state->last_token == lsym_lbrace ||
406 1.100 rillig state->last_token == lsym_rbrace;
407 1.57 rillig }
408 1.57 rillig
409 1.84 rillig static int
410 1.84 rillig bsearch_typenames(const char *key)
411 1.84 rillig {
412 1.84 rillig const char **arr = typenames.items;
413 1.84 rillig int lo = 0;
414 1.84 rillig int hi = (int)typenames.len - 1;
415 1.84 rillig
416 1.84 rillig while (lo <= hi) {
417 1.84 rillig int mid = (int)((unsigned)(lo + hi) >> 1);
418 1.84 rillig int cmp = strcmp(arr[mid], key);
419 1.84 rillig if (cmp < 0)
420 1.84 rillig lo = mid + 1;
421 1.84 rillig else if (cmp > 0)
422 1.84 rillig hi = mid - 1;
423 1.84 rillig else
424 1.84 rillig return mid;
425 1.84 rillig }
426 1.84 rillig return -(lo + 1);
427 1.84 rillig }
428 1.84 rillig
429 1.63 rillig static bool
430 1.63 rillig is_typename(void)
431 1.63 rillig {
432 1.84 rillig if (opt.auto_typedefs &&
433 1.84 rillig token.e - token.s >= 2 && memcmp(token.e - 2, "_t", 2) == 0)
434 1.84 rillig return true;
435 1.63 rillig
436 1.84 rillig return bsearch_typenames(token.s) >= 0;
437 1.63 rillig }
438 1.63 rillig
439 1.90 rillig /* Read an alphanumeric token into 'token', or return end_of_file. */
440 1.100 rillig static lexer_symbol
441 1.89 rillig lexi_alnum(struct parser_state *state)
442 1.1 cgd {
443 1.89 rillig if (isdigit((unsigned char)*inp.s) ||
444 1.89 rillig (inp.s[0] == '.' && isdigit((unsigned char)inp.s[1]))) {
445 1.89 rillig lex_number();
446 1.103 rillig } else if (isalnum((unsigned char)*inp.s) ||
447 1.103 rillig *inp.s == '_' || *inp.s == '$') {
448 1.89 rillig lex_word();
449 1.102 rillig } else
450 1.102 rillig return lsym_eof; /* just as a placeholder */
451 1.102 rillig
452 1.89 rillig *token.e = '\0';
453 1.16 kamil
454 1.89 rillig if (token.s[0] == 'L' && token.s[1] == '\0' &&
455 1.89 rillig (*inp.s == '"' || *inp.s == '\''))
456 1.100 rillig return lsym_string_prefix;
457 1.16 kamil
458 1.89 rillig while (is_hspace(inbuf_peek()))
459 1.32 rillig inbuf_skip();
460 1.89 rillig
461 1.103 rillig if (state->last_token == lsym_tag && state->p_l_follow == 0) {
462 1.92 rillig state->next_unary = true;
463 1.100 rillig return lsym_type;
464 1.16 kamil }
465 1.6 lukem
466 1.89 rillig /* Operator after identifier is binary unless last token was 'struct'. */
467 1.100 rillig state->next_unary = state->last_token == lsym_tag;
468 1.16 kamil
469 1.89 rillig const struct keyword *kw = bsearch(token.s, keywords,
470 1.94 rillig array_length(keywords), sizeof(keywords[0]), cmp_keyword_by_name);
471 1.89 rillig if (kw == NULL) {
472 1.89 rillig if (is_typename()) {
473 1.105 rillig state->curr_keyword = kw_type;
474 1.92 rillig state->next_unary = true;
475 1.89 rillig goto found_typename;
476 1.16 kamil }
477 1.89 rillig
478 1.89 rillig } else { /* we have a keyword */
479 1.105 rillig state->curr_keyword = kw->kind;
480 1.92 rillig state->next_unary = true;
481 1.89 rillig
482 1.89 rillig switch (kw->kind) {
483 1.89 rillig case kw_switch:
484 1.100 rillig return lsym_switch;
485 1.89 rillig
486 1.89 rillig case kw_case_or_default:
487 1.100 rillig return lsym_case_label;
488 1.89 rillig
489 1.89 rillig case kw_struct_or_union_or_enum:
490 1.89 rillig case kw_type:
491 1.89 rillig found_typename:
492 1.89 rillig if (state->p_l_follow != 0) {
493 1.89 rillig /* inside parens: cast, param list, offsetof or sizeof */
494 1.89 rillig state->cast_mask |= (1 << state->p_l_follow) & ~state->not_cast_mask;
495 1.89 rillig }
496 1.100 rillig if (state->last_token == lsym_period ||
497 1.105 rillig state->last_token == lsym_unary_op)
498 1.89 rillig break;
499 1.89 rillig if (kw != NULL && kw->kind == kw_struct_or_union_or_enum)
500 1.100 rillig return lsym_tag;
501 1.89 rillig if (state->p_l_follow != 0)
502 1.89 rillig break;
503 1.100 rillig return lsym_type;
504 1.75 rillig
505 1.98 rillig case kw_for:
506 1.100 rillig return lsym_for;
507 1.98 rillig
508 1.98 rillig case kw_if:
509 1.100 rillig return lsym_if;
510 1.98 rillig
511 1.98 rillig case kw_while:
512 1.100 rillig return lsym_while;
513 1.75 rillig
514 1.97 rillig case kw_do:
515 1.100 rillig return lsym_do;
516 1.97 rillig
517 1.97 rillig case kw_else:
518 1.100 rillig return lsym_else;
519 1.16 kamil
520 1.89 rillig case kw_storage_class:
521 1.100 rillig return lsym_storage_class;
522 1.16 kamil
523 1.89 rillig case kw_typedef:
524 1.100 rillig return lsym_typedef;
525 1.16 kamil
526 1.89 rillig default: /* all others are treated like any other
527 1.16 kamil * identifier */
528 1.100 rillig return lsym_ident;
529 1.90 rillig }
530 1.90 rillig }
531 1.89 rillig
532 1.89 rillig if (*inp.s == '(' && state->tos <= 1 && state->ind_level == 0 &&
533 1.89 rillig !state->in_parameter_declaration && !state->block_init) {
534 1.89 rillig
535 1.89 rillig for (const char *p = inp.s; p < inp.e;)
536 1.89 rillig if (*p++ == ')' && (*p == ';' || *p == ','))
537 1.89 rillig goto not_proc;
538 1.89 rillig
539 1.89 rillig strncpy(state->procname, token.s, sizeof state->procname - 1);
540 1.89 rillig if (state->in_decl)
541 1.89 rillig state->in_parameter_declaration = true;
542 1.100 rillig return lsym_funcname;
543 1.89 rillig not_proc:;
544 1.89 rillig
545 1.89 rillig } else if (probably_typename(state)) {
546 1.105 rillig state->curr_keyword = kw_type;
547 1.92 rillig state->next_unary = true;
548 1.100 rillig return lsym_type;
549 1.89 rillig }
550 1.89 rillig
551 1.100 rillig if (state->last_token == lsym_type) /* if this is a declared variable,
552 1.89 rillig * then following sign is unary */
553 1.92 rillig state->next_unary = true; /* will make "int a -1" work */
554 1.89 rillig
555 1.100 rillig return lsym_ident; /* the ident is not in the list */
556 1.89 rillig }
557 1.75 rillig
558 1.89 rillig /* Reads the next token, placing it in the global variable "token". */
559 1.100 rillig lexer_symbol
560 1.106 rillig lexi(void)
561 1.89 rillig {
562 1.106 rillig struct parser_state *state = &ps;
563 1.106 rillig
564 1.90 rillig token.e = token.s;
565 1.90 rillig state->col_1 = state->last_nl;
566 1.89 rillig state->last_nl = false;
567 1.105 rillig state->prev_keyword = state->curr_keyword;
568 1.105 rillig state->curr_keyword = kw_0;
569 1.75 rillig
570 1.89 rillig while (is_hspace(*inp.s)) {
571 1.89 rillig state->col_1 = false;
572 1.89 rillig inbuf_skip();
573 1.89 rillig }
574 1.75 rillig
575 1.100 rillig lexer_symbol alnum_lsym = lexi_alnum(state);
576 1.100 rillig if (alnum_lsym != lsym_eof)
577 1.104 rillig return lexi_end(state, alnum_lsym);
578 1.16 kamil
579 1.16 kamil /* Scan a non-alphanumeric token */
580 1.16 kamil
581 1.90 rillig check_size_token(3); /* for things like "<<=" */
582 1.90 rillig *token.e++ = inbuf_next();
583 1.50 rillig *token.e = '\0';
584 1.16 kamil
585 1.100 rillig lexer_symbol lsym;
586 1.89 rillig bool unary_delim = false; /* whether the current token forces a
587 1.89 rillig * following operator to be unary */
588 1.89 rillig
589 1.50 rillig switch (*token.s) {
590 1.16 kamil case '\n':
591 1.92 rillig unary_delim = state->next_unary;
592 1.16 kamil state->last_nl = true; /* remember that we just had a newline */
593 1.47 rillig /* if data has been exhausted, the newline is a dummy. */
594 1.100 rillig lsym = had_eof ? lsym_eof : lsym_newline;
595 1.16 kamil break;
596 1.16 kamil
597 1.43 rillig case '\'':
598 1.43 rillig case '"':
599 1.44 rillig lex_char_or_string();
600 1.100 rillig lsym = lsym_ident;
601 1.16 kamil break;
602 1.6 lukem
603 1.40 rillig case '(':
604 1.40 rillig case '[':
605 1.16 kamil unary_delim = true;
606 1.100 rillig lsym = lsym_lparen_or_lbracket;
607 1.16 kamil break;
608 1.16 kamil
609 1.40 rillig case ')':
610 1.40 rillig case ']':
611 1.100 rillig lsym = lsym_rparen_or_rbracket;
612 1.16 kamil break;
613 1.16 kamil
614 1.16 kamil case '#':
615 1.92 rillig unary_delim = state->next_unary;
616 1.100 rillig lsym = lsym_preprocessing;
617 1.16 kamil break;
618 1.16 kamil
619 1.16 kamil case '?':
620 1.16 kamil unary_delim = true;
621 1.100 rillig lsym = lsym_question;
622 1.16 kamil break;
623 1.16 kamil
624 1.40 rillig case ':':
625 1.100 rillig lsym = lsym_colon;
626 1.16 kamil unary_delim = true;
627 1.16 kamil break;
628 1.16 kamil
629 1.40 rillig case ';':
630 1.16 kamil unary_delim = true;
631 1.100 rillig lsym = lsym_semicolon;
632 1.16 kamil break;
633 1.16 kamil
634 1.40 rillig case '{':
635 1.16 kamil unary_delim = true;
636 1.100 rillig lsym = lsym_lbrace;
637 1.16 kamil break;
638 1.16 kamil
639 1.40 rillig case '}':
640 1.16 kamil unary_delim = true;
641 1.100 rillig lsym = lsym_rbrace;
642 1.16 kamil break;
643 1.16 kamil
644 1.69 rillig case '\f':
645 1.92 rillig unary_delim = state->next_unary;
646 1.74 rillig state->last_nl = true; /* remember this, so we can set 'state->col_1'
647 1.16 kamil * right */
648 1.100 rillig lsym = lsym_form_feed;
649 1.16 kamil break;
650 1.16 kamil
651 1.40 rillig case ',':
652 1.16 kamil unary_delim = true;
653 1.100 rillig lsym = lsym_comma;
654 1.16 kamil break;
655 1.16 kamil
656 1.16 kamil case '.':
657 1.16 kamil unary_delim = false;
658 1.100 rillig lsym = lsym_period;
659 1.16 kamil break;
660 1.1 cgd
661 1.16 kamil case '-':
662 1.90 rillig case '+':
663 1.100 rillig lsym = state->next_unary ? lsym_unary_op : lsym_binary_op;
664 1.16 kamil unary_delim = true;
665 1.16 kamil
666 1.90 rillig if (*inp.s == token.s[0]) { /* ++, -- */
667 1.78 rillig *token.e++ = *inp.s++;
668 1.100 rillig if (state->last_token == lsym_ident ||
669 1.100 rillig state->last_token == lsym_rparen_or_rbracket) {
670 1.100 rillig lsym = state->next_unary ? lsym_unary_op : lsym_postfix_op;
671 1.1 cgd unary_delim = false;
672 1.16 kamil }
673 1.75 rillig
674 1.90 rillig } else if (*inp.s == '=') { /* += */
675 1.78 rillig *token.e++ = *inp.s++;
676 1.75 rillig
677 1.90 rillig } else if (*inp.s == '>') { /* -> */
678 1.78 rillig *token.e++ = *inp.s++;
679 1.16 kamil unary_delim = false;
680 1.100 rillig lsym = lsym_unary_op;
681 1.16 kamil state->want_blank = false;
682 1.16 kamil }
683 1.90 rillig break;
684 1.16 kamil
685 1.16 kamil case '=':
686 1.86 rillig if (state->init_or_struct)
687 1.54 rillig state->block_init = true;
688 1.78 rillig if (*inp.s == '=') { /* == */
689 1.78 rillig *token.e++ = *inp.s++;
690 1.67 rillig *token.e = '\0';
691 1.16 kamil }
692 1.100 rillig lsym = lsym_binary_op;
693 1.16 kamil unary_delim = true;
694 1.16 kamil break;
695 1.16 kamil
696 1.16 kamil case '>':
697 1.16 kamil case '<':
698 1.16 kamil case '!': /* ops like <, <<, <=, !=, etc */
699 1.78 rillig if (*inp.s == '>' || *inp.s == '<' || *inp.s == '=')
700 1.50 rillig *token.e++ = inbuf_next();
701 1.78 rillig if (*inp.s == '=')
702 1.78 rillig *token.e++ = *inp.s++;
703 1.100 rillig lsym = state->next_unary ? lsym_unary_op : lsym_binary_op;
704 1.16 kamil unary_delim = true;
705 1.16 kamil break;
706 1.16 kamil
707 1.16 kamil case '*':
708 1.16 kamil unary_delim = true;
709 1.92 rillig if (!state->next_unary) {
710 1.78 rillig if (*inp.s == '=')
711 1.78 rillig *token.e++ = *inp.s++;
712 1.100 rillig lsym = lsym_binary_op;
713 1.16 kamil break;
714 1.16 kamil }
715 1.75 rillig
716 1.78 rillig while (*inp.s == '*' || isspace((unsigned char)*inp.s)) {
717 1.87 rillig if (*inp.s == '*')
718 1.87 rillig token_add_char('*');
719 1.32 rillig inbuf_skip();
720 1.16 kamil }
721 1.75 rillig
722 1.16 kamil if (ps.in_decl) {
723 1.78 rillig char *tp = inp.s;
724 1.6 lukem
725 1.16 kamil while (isalpha((unsigned char)*tp) ||
726 1.103 rillig isspace((unsigned char)*tp)) {
727 1.78 rillig if (++tp >= inp.e)
728 1.81 rillig inbuf_read_line();
729 1.16 kamil }
730 1.16 kamil if (*tp == '(')
731 1.16 kamil ps.procname[0] = ' ';
732 1.16 kamil }
733 1.75 rillig
734 1.100 rillig lsym = lsym_unary_op;
735 1.16 kamil break;
736 1.1 cgd
737 1.16 kamil default:
738 1.78 rillig if (token.s[0] == '/' && (*inp.s == '*' || *inp.s == '/')) {
739 1.16 kamil /* it is start of comment */
740 1.50 rillig *token.e++ = inbuf_next();
741 1.1 cgd
742 1.100 rillig lsym = lsym_comment;
743 1.92 rillig unary_delim = state->next_unary;
744 1.16 kamil break;
745 1.1 cgd }
746 1.75 rillig
747 1.78 rillig while (token.e[-1] == *inp.s || *inp.s == '=') {
748 1.87 rillig /* handle '||', '&&', etc., and also things as in 'int *****i' */
749 1.87 rillig token_add_char(inbuf_next());
750 1.16 kamil }
751 1.75 rillig
752 1.100 rillig lsym = state->next_unary ? lsym_unary_op : lsym_binary_op;
753 1.16 kamil unary_delim = true;
754 1.47 rillig }
755 1.16 kamil
756 1.95 rillig if (inp.s >= inp.e) /* check for input buffer empty */
757 1.81 rillig inbuf_read_line();
758 1.75 rillig
759 1.92 rillig state->next_unary = unary_delim;
760 1.75 rillig
761 1.25 rillig check_size_token(1);
762 1.50 rillig *token.e = '\0';
763 1.75 rillig
764 1.104 rillig return lexi_end(state, lsym);
765 1.1 cgd }
766 1.16 kamil
767 1.6 lukem void
768 1.64 rillig add_typename(const char *name)
769 1.1 cgd {
770 1.64 rillig if (typenames.len >= typenames.cap) {
771 1.64 rillig typenames.cap = 16 + 2 * typenames.cap;
772 1.64 rillig typenames.items = xrealloc(typenames.items,
773 1.64 rillig sizeof(typenames.items[0]) * typenames.cap);
774 1.64 rillig }
775 1.16 kamil
776 1.84 rillig int pos = bsearch_typenames(name);
777 1.64 rillig if (pos >= 0)
778 1.64 rillig return; /* already in the list */
779 1.75 rillig
780 1.64 rillig pos = -(pos + 1);
781 1.64 rillig memmove(typenames.items + pos + 1, typenames.items + pos,
782 1.73 rillig sizeof(typenames.items[0]) * (typenames.len++ - (unsigned)pos));
783 1.64 rillig typenames.items[pos] = xstrdup(name);
784 1.1 cgd }
785