lexi.c revision 1.119 1 1.119 rillig /* $NetBSD: lexi.c,v 1.119 2021/10/31 09:52:37 rillig Exp $ */
2 1.3 tls
3 1.16 kamil /*-
4 1.16 kamil * SPDX-License-Identifier: BSD-4-Clause
5 1.16 kamil *
6 1.16 kamil * Copyright (c) 1985 Sun Microsystems, Inc.
7 1.5 mrg * Copyright (c) 1980, 1993
8 1.5 mrg * The Regents of the University of California. All rights reserved.
9 1.1 cgd * All rights reserved.
10 1.1 cgd *
11 1.1 cgd * Redistribution and use in source and binary forms, with or without
12 1.1 cgd * modification, are permitted provided that the following conditions
13 1.1 cgd * are met:
14 1.1 cgd * 1. Redistributions of source code must retain the above copyright
15 1.1 cgd * notice, this list of conditions and the following disclaimer.
16 1.1 cgd * 2. Redistributions in binary form must reproduce the above copyright
17 1.1 cgd * notice, this list of conditions and the following disclaimer in the
18 1.1 cgd * documentation and/or other materials provided with the distribution.
19 1.1 cgd * 3. All advertising materials mentioning features or use of this software
20 1.1 cgd * must display the following acknowledgement:
21 1.1 cgd * This product includes software developed by the University of
22 1.1 cgd * California, Berkeley and its contributors.
23 1.1 cgd * 4. Neither the name of the University nor the names of its contributors
24 1.1 cgd * may be used to endorse or promote products derived from this software
25 1.1 cgd * without specific prior written permission.
26 1.1 cgd *
27 1.1 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 1.1 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 1.1 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 1.1 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 1.1 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 1.1 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 1.1 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 1.1 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 1.1 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 1.1 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 1.1 cgd * SUCH DAMAGE.
38 1.1 cgd */
39 1.1 cgd
40 1.16 kamil #if 0
41 1.16 kamil static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";
42 1.16 kamil #endif
43 1.16 kamil
44 1.6 lukem #include <sys/cdefs.h>
45 1.16 kamil #if defined(__NetBSD__)
46 1.119 rillig __RCSID("$NetBSD: lexi.c,v 1.119 2021/10/31 09:52:37 rillig Exp $");
47 1.16 kamil #elif defined(__FreeBSD__)
48 1.16 kamil __FBSDID("$FreeBSD: head/usr.bin/indent/lexi.c 337862 2018-08-15 18:19:45Z pstef $");
49 1.16 kamil #endif
50 1.1 cgd
51 1.1 cgd #include <ctype.h>
52 1.1 cgd #include <stdlib.h>
53 1.1 cgd #include <string.h>
54 1.16 kamil
55 1.16 kamil #include "indent.h"
56 1.1 cgd
57 1.60 rillig /* must be sorted alphabetically, is used in binary search */
58 1.62 rillig static const struct keyword {
59 1.62 rillig const char *name;
60 1.62 rillig enum keyword_kind kind;
61 1.62 rillig } keywords[] = {
62 1.62 rillig {"_Bool", kw_type},
63 1.62 rillig {"_Complex", kw_type},
64 1.62 rillig {"_Imaginary", kw_type},
65 1.62 rillig {"auto", kw_storage_class},
66 1.62 rillig {"bool", kw_type},
67 1.119 rillig {"break", kw_other},
68 1.62 rillig {"case", kw_case_or_default},
69 1.62 rillig {"char", kw_type},
70 1.62 rillig {"complex", kw_type},
71 1.62 rillig {"const", kw_type},
72 1.119 rillig {"continue", kw_other},
73 1.62 rillig {"default", kw_case_or_default},
74 1.97 rillig {"do", kw_do},
75 1.62 rillig {"double", kw_type},
76 1.97 rillig {"else", kw_else},
77 1.119 rillig {"enum", kw_tag},
78 1.62 rillig {"extern", kw_storage_class},
79 1.62 rillig {"float", kw_type},
80 1.98 rillig {"for", kw_for},
81 1.119 rillig {"goto", kw_other},
82 1.98 rillig {"if", kw_if},
83 1.62 rillig {"imaginary", kw_type},
84 1.119 rillig {"inline", kw_other},
85 1.62 rillig {"int", kw_type},
86 1.62 rillig {"long", kw_type},
87 1.62 rillig {"offsetof", kw_offsetof},
88 1.62 rillig {"register", kw_storage_class},
89 1.119 rillig {"restrict", kw_other},
90 1.119 rillig {"return", kw_other},
91 1.62 rillig {"short", kw_type},
92 1.62 rillig {"signed", kw_type},
93 1.62 rillig {"sizeof", kw_sizeof},
94 1.62 rillig {"static", kw_storage_class},
95 1.119 rillig {"struct", kw_tag},
96 1.62 rillig {"switch", kw_switch},
97 1.62 rillig {"typedef", kw_typedef},
98 1.119 rillig {"union", kw_tag},
99 1.62 rillig {"unsigned", kw_type},
100 1.62 rillig {"void", kw_type},
101 1.62 rillig {"volatile", kw_type},
102 1.98 rillig {"while", kw_while}
103 1.1 cgd };
104 1.1 cgd
/*
 * Growable, sorted list of additional type names; it is searched by
 * bsearch_typenames() and extended by add_typename().
 */
static struct {
    const char **items;		/* the names, kept in strcmp() order */
    unsigned int len;		/* number of names in use */
    unsigned int cap;		/* number of slots allocated for items */
} typenames;
110 1.16 kamil
111 1.16 kamil /*
112 1.16 kamil * The transition table below was rewritten by hand from lx's output, given
113 1.16 kamil * the following definitions. lx is Katherine Flavel's lexer generator.
114 1.16 kamil *
115 1.16 kamil * O = /[0-7]/; D = /[0-9]/; NZ = /[1-9]/;
116 1.16 kamil * H = /[a-f0-9]/i; B = /[0-1]/; HP = /0x/i;
117 1.16 kamil * BP = /0b/i; E = /e[+\-]?/i D+; P = /p[+\-]?/i D+;
118 1.16 kamil * FS = /[fl]/i; IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?;
119 1.16 kamil *
120 1.16 kamil * D+ E FS? -> $float;
121 1.16 kamil * D* "." D+ E? FS? -> $float;
122 1.16 kamil * D+ "." E? FS? -> $float; HP H+ IS? -> $int;
123 1.16 kamil * HP H+ P FS? -> $float; NZ D* IS? -> $int;
124 1.16 kamil * HP H* "." H+ P FS? -> $float; "0" O* IS? -> $int;
125 1.16 kamil * HP H+ "." P FS -> $float; BP B+ IS? -> $int;
126 1.16 kamil */
127 1.71 rillig /* INDENT OFF */
/*
 * State transition table for lex_number().
 *
 * Each row belongs to one character class (see lex_number_row), each of the
 * 26 columns to one state 'A'..'Z'.  An entry is either the next state, a
 * final state in lowercase, or ' ' if the character cannot continue the
 * number in that state.  Row 0 is never used for a transition; it records
 * the kind of number each state stands for (f = floating, i = integer,
 * u = unknown).
 *
 * Every row string must be exactly 26 characters long; a shorter string
 * would be padded with '\0', which is neither a state nor ' '.
 */
static const unsigned char lex_number_state[][26] = {
    /*                examples:
                                     00
             s                      0xx
             t                    00xaa
             a     11       101100xxa..
             r   11ee0001101lbuuxx.a.pp
             t.01.e+008bLuxll0Ll.aa.p+0
    states:  ABCDEFGHIJKLMNOPQRSTUVWXYZ */
    [0]   = "uuiifuufiuuiiuiiiiiuiuuuuu",	/* (other) */
    [1]   = "CEIDEHHHIJQ  U  Q  VUVVZZZ",	/* 0 */
    [2]   = "DEIDEHHHIJQ  U  Q  VUVVZZZ",	/* 1 */
    [3]   = "DEIDEHHHIJ   U     VUVVZZZ",	/* 2 3 4 5 6 7 */
    [4]   = "DEJDEHHHJJ   U     VUVVZZZ",	/* 8 9 */
    [5]   = "             U     VUVV   ",	/* A a C c D d */
    [6]   = "  K          U     VUVV   ",	/* B b */
    [7]   = "  FFF   FF   U     VUVV   ",	/* E e */
    [8]   = "    f  f     U     VUVV  f",	/* F f */
    [9]   = "  LLf  fL  PR   Li  L    f",	/* L */
    [10]  = "  OOf  fO   S P O i O    f",	/* l */
    [11]  = "                    FFX   ",	/* P p */
    [12]  = "  MM    M  i  iiM   M     ",	/* U u */
    [13]  = "  N                       ",	/* X x */
    [14]  = "     G                 Y  ",	/* + - */
    [15]  = "B EE    EE   T      W     ",	/* . */
    /*       ABCDEFGHIJKLMNOPQRSTUVWXYZ */
};
155 1.71 rillig /* INDENT ON */
156 1.1 cgd
/*
 * Maps a character to its row in lex_number_state.  Characters that are not
 * listed map to 0, which lex_number() treats as "not part of a number".
 * The array ends at the largest listed character, 'x'.
 */
static const unsigned char lex_number_row[] = {
    /* digits */
    ['0'] = 1,
    ['1'] = 2,
    ['2'] = 3, ['3'] = 3, ['4'] = 3,
    ['5'] = 3, ['6'] = 3, ['7'] = 3,
    ['8'] = 4, ['9'] = 4,

    /* letters that are only hexadecimal digits */
    ['A'] = 5, ['C'] = 5, ['D'] = 5,
    ['a'] = 5, ['c'] = 5, ['d'] = 5,

    /* letters with an additional meaning */
    ['B'] = 6, ['b'] = 6,
    ['E'] = 7, ['e'] = 7,
    ['F'] = 8, ['f'] = 8,
    ['L'] = 9,
    ['l'] = 10,
    ['P'] = 11, ['p'] = 11,
    ['U'] = 12, ['u'] = 12,
    ['X'] = 13, ['x'] = 13,

    /* punctuation */
    ['+'] = 14, ['-'] = 14,
    ['.'] = 15,
};
174 1.36 rillig
175 1.32 rillig static char
176 1.32 rillig inbuf_peek(void)
177 1.32 rillig {
178 1.78 rillig return *inp.s;
179 1.32 rillig }
180 1.32 rillig
181 1.66 rillig void
182 1.32 rillig inbuf_skip(void)
183 1.32 rillig {
184 1.78 rillig inp.s++;
185 1.78 rillig if (inp.s >= inp.e)
186 1.81 rillig inbuf_read_line();
187 1.32 rillig }
188 1.32 rillig
189 1.66 rillig char
190 1.32 rillig inbuf_next(void)
191 1.32 rillig {
192 1.32 rillig char ch = inbuf_peek();
193 1.32 rillig inbuf_skip();
194 1.32 rillig return ch;
195 1.32 rillig }
196 1.32 rillig
197 1.25 rillig static void
198 1.25 rillig check_size_token(size_t desired_size)
199 1.25 rillig {
200 1.58 rillig if (token.e + desired_size >= token.l)
201 1.58 rillig buf_expand(&token, desired_size);
202 1.25 rillig }
203 1.25 rillig
204 1.87 rillig static void
205 1.87 rillig token_add_char(char ch)
206 1.87 rillig {
207 1.87 rillig check_size_token(1);
208 1.87 rillig *token.e++ = ch;
209 1.87 rillig }
210 1.87 rillig
211 1.20 rillig #ifdef debug
212 1.100 rillig static const char *
213 1.100 rillig lsym_name(lexer_symbol sym)
214 1.20 rillig {
215 1.20 rillig static const char *const name[] = {
216 1.100 rillig "eof",
217 1.100 rillig "preprocessing",
218 1.100 rillig "newline",
219 1.100 rillig "form_feed",
220 1.100 rillig "comment",
221 1.100 rillig "lparen_or_lbracket",
222 1.100 rillig "rparen_or_rbracket",
223 1.100 rillig "lbrace",
224 1.100 rillig "rbrace",
225 1.100 rillig "period",
226 1.100 rillig "unary_op",
227 1.100 rillig "binary_op",
228 1.100 rillig "postfix_op",
229 1.100 rillig "question",
230 1.100 rillig "colon",
231 1.100 rillig "comma",
232 1.100 rillig "semicolon",
233 1.100 rillig "typedef",
234 1.100 rillig "storage_class",
235 1.100 rillig "type",
236 1.100 rillig "tag",
237 1.100 rillig "case_label",
238 1.100 rillig "string_prefix",
239 1.100 rillig "ident",
240 1.100 rillig "funcname",
241 1.100 rillig "do",
242 1.100 rillig "else",
243 1.100 rillig "for",
244 1.100 rillig "if",
245 1.100 rillig "switch",
246 1.100 rillig "while",
247 1.20 rillig };
248 1.20 rillig
249 1.100 rillig return name[sym];
250 1.20 rillig }
251 1.20 rillig
252 1.101 rillig static const char *
253 1.103 rillig kw_name(enum keyword_kind kw)
254 1.103 rillig {
255 1.115 rillig static const char *const name[] = {
256 1.101 rillig "0",
257 1.101 rillig "offsetof",
258 1.101 rillig "sizeof",
259 1.119 rillig "tag",
260 1.101 rillig "type",
261 1.101 rillig "for",
262 1.101 rillig "if",
263 1.101 rillig "while",
264 1.101 rillig "do",
265 1.101 rillig "else",
266 1.101 rillig "switch",
267 1.101 rillig "case_or_default",
268 1.101 rillig "storage_class",
269 1.101 rillig "typedef",
270 1.119 rillig "other",
271 1.101 rillig };
272 1.101 rillig
273 1.101 rillig return name[kw];
274 1.101 rillig }
275 1.101 rillig
276 1.20 rillig static void
277 1.72 rillig debug_print_buf(const char *name, const struct buffer *buf)
278 1.20 rillig {
279 1.72 rillig if (buf->s < buf->e) {
280 1.101 rillig debug_printf("%s ", name);
281 1.101 rillig debug_vis_range("\"", buf->s, buf->e, "\"\n");
282 1.20 rillig }
283 1.20 rillig }
284 1.20 rillig
285 1.112 rillig #define debug_ps_bool(name) \
286 1.113 rillig if (ps.name != prev_ps.name) \
287 1.113 rillig debug_println("[%c] ps." #name, ps.name ? 'x' : ' ')
288 1.112 rillig #define debug_ps_int(name) \
289 1.113 rillig if (ps.name != prev_ps.name) \
290 1.113 rillig debug_println("%3d ps." #name, ps.name)
291 1.112 rillig #define debug_ps_keyword(name) \
292 1.112 rillig if (ps.name != kw_0) \
293 1.113 rillig debug_println(" ps." #name " = %s", kw_name(ps.name))
294 1.112 rillig
295 1.101 rillig static void
296 1.107 rillig debug_lexi(lexer_symbol lsym)
297 1.20 rillig {
298 1.113 rillig /*
299 1.113 rillig * Watch out for 'rolled back parser state' in the debug output; the
300 1.113 rillig * differences around these are unreliable.
301 1.113 rillig */
302 1.113 rillig static struct parser_state prev_ps;
303 1.113 rillig
304 1.104 rillig debug_println("");
305 1.116 rillig debug_printf("line %d: %s", line_no, lsym_name(lsym));
306 1.116 rillig debug_vis_range(" \"", token.s, token.e, "\"\n");
307 1.72 rillig debug_print_buf("label", &lab);
308 1.72 rillig debug_print_buf("code", &code);
309 1.72 rillig debug_print_buf("comment", &com);
310 1.112 rillig
311 1.112 rillig // prev_token
312 1.112 rillig debug_ps_keyword(prev_keyword);
313 1.112 rillig debug_ps_keyword(curr_keyword);
314 1.117 rillig debug_ps_bool(curr_newline);
315 1.117 rillig debug_ps_bool(curr_col_1);
316 1.112 rillig debug_ps_bool(next_unary);
317 1.112 rillig // procname
318 1.112 rillig debug_ps_bool(want_blank);
319 1.112 rillig debug_ps_int(paren_level);
320 1.112 rillig debug_ps_int(p_l_follow);
321 1.112 rillig // paren_indents
322 1.112 rillig debug_ps_int(cast_mask);
323 1.112 rillig debug_ps_int(not_cast_mask);
324 1.112 rillig
325 1.112 rillig debug_ps_int(comment_delta);
326 1.112 rillig debug_ps_int(n_comment_delta);
327 1.112 rillig debug_ps_int(com_ind);
328 1.112 rillig
329 1.112 rillig debug_ps_bool(block_init);
330 1.112 rillig debug_ps_int(block_init_level);
331 1.112 rillig debug_ps_bool(init_or_struct);
332 1.112 rillig
333 1.112 rillig debug_ps_int(ind_level);
334 1.112 rillig debug_ps_int(ind_level_follow);
335 1.112 rillig
336 1.112 rillig debug_ps_int(decl_nest);
337 1.112 rillig debug_ps_bool(decl_on_line);
338 1.112 rillig debug_ps_bool(in_decl);
339 1.112 rillig debug_ps_int(just_saw_decl);
340 1.112 rillig debug_ps_bool(in_parameter_declaration);
341 1.112 rillig debug_ps_bool(decl_indent_done);
342 1.112 rillig
343 1.112 rillig debug_ps_bool(in_stmt);
344 1.112 rillig debug_ps_bool(ind_stmt);
345 1.112 rillig debug_ps_bool(is_case_label);
346 1.112 rillig
347 1.112 rillig debug_ps_bool(search_stmt);
348 1.113 rillig
349 1.113 rillig prev_ps = ps;
350 1.101 rillig }
351 1.96 rillig #endif
352 1.20 rillig
353 1.104 rillig /* ARGSUSED */
354 1.101 rillig static lexer_symbol
355 1.107 rillig lexi_end(lexer_symbol lsym)
356 1.101 rillig {
357 1.101 rillig #ifdef debug
358 1.107 rillig debug_lexi(lsym);
359 1.101 rillig #endif
360 1.100 rillig return lsym;
361 1.20 rillig }
362 1.20 rillig
363 1.43 rillig static void
364 1.43 rillig lex_number(void)
365 1.43 rillig {
366 1.115 rillig for (unsigned char s = 'A'; s != 'f' && s != 'i' && s != 'u';) {
367 1.115 rillig unsigned char ch = (unsigned char)*inp.s;
368 1.94 rillig if (ch >= array_length(lex_number_row) || lex_number_row[ch] == 0)
369 1.56 rillig break;
370 1.75 rillig
371 1.115 rillig unsigned char row = lex_number_row[ch];
372 1.82 rillig if (lex_number_state[row][s - 'A'] == ' ') {
373 1.71 rillig /*-
374 1.82 rillig * lex_number_state[0][s - 'A'] now indicates the type:
375 1.74 rillig * f = floating, i = integer, u = unknown
376 1.56 rillig */
377 1.43 rillig break;
378 1.43 rillig }
379 1.75 rillig
380 1.82 rillig s = lex_number_state[row][s - 'A'];
381 1.87 rillig token_add_char(inbuf_next());
382 1.43 rillig }
383 1.43 rillig }
384 1.43 rillig
385 1.43 rillig static void
386 1.43 rillig lex_word(void)
387 1.43 rillig {
388 1.78 rillig while (isalnum((unsigned char)*inp.s) ||
389 1.95 rillig *inp.s == '\\' ||
390 1.95 rillig *inp.s == '_' || *inp.s == '$') {
391 1.75 rillig
392 1.78 rillig if (*inp.s == '\\') {
393 1.78 rillig if (inp.s[1] == '\n') {
394 1.78 rillig inp.s += 2;
395 1.78 rillig if (inp.s >= inp.e)
396 1.81 rillig inbuf_read_line();
397 1.43 rillig } else
398 1.43 rillig break;
399 1.43 rillig }
400 1.75 rillig
401 1.87 rillig token_add_char(inbuf_next());
402 1.43 rillig }
403 1.43 rillig }
404 1.43 rillig
405 1.43 rillig static void
406 1.43 rillig lex_char_or_string(void)
407 1.43 rillig {
408 1.52 rillig for (char delim = *token.s;;) {
409 1.78 rillig if (*inp.s == '\n') {
410 1.52 rillig diag(1, "Unterminated literal");
411 1.52 rillig return;
412 1.52 rillig }
413 1.75 rillig
414 1.87 rillig token_add_char(inbuf_next());
415 1.52 rillig if (token.e[-1] == delim)
416 1.52 rillig return;
417 1.75 rillig
418 1.52 rillig if (token.e[-1] == '\\') {
419 1.78 rillig if (*inp.s == '\n')
420 1.52 rillig ++line_no;
421 1.87 rillig token_add_char(inbuf_next());
422 1.52 rillig }
423 1.52 rillig }
424 1.43 rillig }
425 1.43 rillig
426 1.84 rillig /* Guess whether the current token is a declared type. */
427 1.57 rillig static bool
428 1.107 rillig probably_typename(void)
429 1.57 rillig {
430 1.109 rillig if (ps.p_l_follow > 0)
431 1.70 rillig return false;
432 1.107 rillig if (ps.block_init || ps.in_stmt)
433 1.70 rillig return false;
434 1.78 rillig if (inp.s[0] == '*' && inp.s[1] != '=')
435 1.70 rillig goto maybe;
436 1.78 rillig if (isalpha((unsigned char)*inp.s))
437 1.70 rillig goto maybe;
438 1.70 rillig return false;
439 1.70 rillig maybe:
440 1.110 rillig return ps.prev_token == lsym_semicolon ||
441 1.110 rillig ps.prev_token == lsym_lbrace ||
442 1.110 rillig ps.prev_token == lsym_rbrace;
443 1.57 rillig }
444 1.57 rillig
445 1.84 rillig static int
446 1.84 rillig bsearch_typenames(const char *key)
447 1.84 rillig {
448 1.84 rillig const char **arr = typenames.items;
449 1.84 rillig int lo = 0;
450 1.84 rillig int hi = (int)typenames.len - 1;
451 1.84 rillig
452 1.84 rillig while (lo <= hi) {
453 1.84 rillig int mid = (int)((unsigned)(lo + hi) >> 1);
454 1.84 rillig int cmp = strcmp(arr[mid], key);
455 1.84 rillig if (cmp < 0)
456 1.84 rillig lo = mid + 1;
457 1.84 rillig else if (cmp > 0)
458 1.84 rillig hi = mid - 1;
459 1.84 rillig else
460 1.84 rillig return mid;
461 1.84 rillig }
462 1.84 rillig return -(lo + 1);
463 1.84 rillig }
464 1.84 rillig
465 1.63 rillig static bool
466 1.63 rillig is_typename(void)
467 1.63 rillig {
468 1.84 rillig if (opt.auto_typedefs &&
469 1.84 rillig token.e - token.s >= 2 && memcmp(token.e - 2, "_t", 2) == 0)
470 1.84 rillig return true;
471 1.63 rillig
472 1.84 rillig return bsearch_typenames(token.s) >= 0;
473 1.63 rillig }
474 1.63 rillig
475 1.115 rillig static int
476 1.115 rillig cmp_keyword_by_name(const void *key, const void *elem)
477 1.115 rillig {
478 1.115 rillig return strcmp(key, ((const struct keyword *)elem)->name);
479 1.115 rillig }
480 1.115 rillig
481 1.90 rillig /* Read an alphanumeric token into 'token', or return end_of_file. */
482 1.100 rillig static lexer_symbol
483 1.107 rillig lexi_alnum(void)
484 1.1 cgd {
485 1.89 rillig if (isdigit((unsigned char)*inp.s) ||
486 1.89 rillig (inp.s[0] == '.' && isdigit((unsigned char)inp.s[1]))) {
487 1.89 rillig lex_number();
488 1.103 rillig } else if (isalnum((unsigned char)*inp.s) ||
489 1.103 rillig *inp.s == '_' || *inp.s == '$') {
490 1.89 rillig lex_word();
491 1.102 rillig } else
492 1.102 rillig return lsym_eof; /* just as a placeholder */
493 1.102 rillig
494 1.89 rillig *token.e = '\0';
495 1.16 kamil
496 1.89 rillig if (token.s[0] == 'L' && token.s[1] == '\0' &&
497 1.89 rillig (*inp.s == '"' || *inp.s == '\''))
498 1.100 rillig return lsym_string_prefix;
499 1.16 kamil
500 1.111 rillig while (ch_isblank(inbuf_peek()))
501 1.32 rillig inbuf_skip();
502 1.89 rillig
503 1.110 rillig if (ps.prev_token == lsym_tag && ps.p_l_follow == 0) {
504 1.107 rillig ps.next_unary = true;
505 1.100 rillig return lsym_type;
506 1.16 kamil }
507 1.6 lukem
508 1.89 rillig /* Operator after identifier is binary unless last token was 'struct'. */
509 1.110 rillig ps.next_unary = ps.prev_token == lsym_tag;
510 1.16 kamil
511 1.89 rillig const struct keyword *kw = bsearch(token.s, keywords,
512 1.94 rillig array_length(keywords), sizeof(keywords[0]), cmp_keyword_by_name);
513 1.89 rillig if (kw == NULL) {
514 1.89 rillig if (is_typename()) {
515 1.107 rillig ps.curr_keyword = kw_type;
516 1.107 rillig ps.next_unary = true;
517 1.89 rillig goto found_typename;
518 1.16 kamil }
519 1.89 rillig
520 1.89 rillig } else { /* we have a keyword */
521 1.107 rillig ps.curr_keyword = kw->kind;
522 1.107 rillig ps.next_unary = true;
523 1.89 rillig
524 1.118 rillig /* INDENT OFF */
525 1.89 rillig switch (kw->kind) {
526 1.119 rillig case kw_tag:
527 1.118 rillig case kw_type: goto found_typename;
528 1.118 rillig case kw_case_or_default: return lsym_case_label;
529 1.118 rillig case kw_for: return lsym_for;
530 1.118 rillig case kw_if: return lsym_if;
531 1.118 rillig case kw_else: return lsym_else;
532 1.118 rillig case kw_switch: return lsym_switch;
533 1.118 rillig case kw_while: return lsym_while;
534 1.118 rillig case kw_do: return lsym_do;
535 1.118 rillig case kw_storage_class: return lsym_storage_class;
536 1.118 rillig case kw_typedef: return lsym_typedef;
537 1.118 rillig default: return lsym_ident;
538 1.118 rillig }
539 1.118 rillig /* INDENT ON */
540 1.118 rillig
541 1.118 rillig found_typename:
542 1.118 rillig if (ps.p_l_follow > 0) {
543 1.118 rillig /* inside parentheses: cast, param list, offsetof or sizeof */
544 1.118 rillig ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
545 1.118 rillig }
546 1.118 rillig if (ps.prev_token != lsym_period && ps.prev_token != lsym_unary_op) {
547 1.119 rillig if (kw != NULL && kw->kind == kw_tag)
548 1.100 rillig return lsym_tag;
549 1.118 rillig if (ps.p_l_follow == 0)
550 1.118 rillig return lsym_type;
551 1.90 rillig }
552 1.90 rillig }
553 1.89 rillig
554 1.107 rillig if (*inp.s == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
555 1.107 rillig !ps.in_parameter_declaration && !ps.block_init) {
556 1.89 rillig
557 1.89 rillig for (const char *p = inp.s; p < inp.e;)
558 1.89 rillig if (*p++ == ')' && (*p == ';' || *p == ','))
559 1.118 rillig goto no_function_definition;
560 1.89 rillig
561 1.107 rillig strncpy(ps.procname, token.s, sizeof ps.procname - 1);
562 1.107 rillig if (ps.in_decl)
563 1.107 rillig ps.in_parameter_declaration = true;
564 1.100 rillig return lsym_funcname;
565 1.118 rillig no_function_definition:;
566 1.89 rillig
567 1.107 rillig } else if (probably_typename()) {
568 1.107 rillig ps.curr_keyword = kw_type;
569 1.107 rillig ps.next_unary = true;
570 1.100 rillig return lsym_type;
571 1.89 rillig }
572 1.89 rillig
573 1.110 rillig if (ps.prev_token == lsym_type) /* if this is a declared variable,
574 1.89 rillig * then following sign is unary */
575 1.107 rillig ps.next_unary = true; /* will make "int a -1" work */
576 1.89 rillig
577 1.100 rillig return lsym_ident; /* the ident is not in the list */
578 1.89 rillig }
579 1.75 rillig
580 1.89 rillig /* Reads the next token, placing it in the global variable "token". */
581 1.100 rillig lexer_symbol
582 1.106 rillig lexi(void)
583 1.89 rillig {
584 1.90 rillig token.e = token.s;
585 1.117 rillig ps.curr_col_1 = ps.curr_newline;
586 1.117 rillig ps.curr_newline = false;
587 1.107 rillig ps.prev_keyword = ps.curr_keyword;
588 1.107 rillig ps.curr_keyword = kw_0;
589 1.75 rillig
590 1.111 rillig while (ch_isblank(*inp.s)) {
591 1.117 rillig ps.curr_col_1 = false;
592 1.89 rillig inbuf_skip();
593 1.89 rillig }
594 1.75 rillig
595 1.107 rillig lexer_symbol alnum_lsym = lexi_alnum();
596 1.100 rillig if (alnum_lsym != lsym_eof)
597 1.107 rillig return lexi_end(alnum_lsym);
598 1.16 kamil
599 1.16 kamil /* Scan a non-alphanumeric token */
600 1.16 kamil
601 1.90 rillig check_size_token(3); /* for things like "<<=" */
602 1.90 rillig *token.e++ = inbuf_next();
603 1.50 rillig *token.e = '\0';
604 1.16 kamil
605 1.100 rillig lexer_symbol lsym;
606 1.89 rillig bool unary_delim = false; /* whether the current token forces a
607 1.89 rillig * following operator to be unary */
608 1.89 rillig
609 1.50 rillig switch (*token.s) {
610 1.16 kamil case '\n':
611 1.107 rillig unary_delim = ps.next_unary;
612 1.117 rillig ps.curr_newline = true;
613 1.47 rillig /* if data has been exhausted, the newline is a dummy. */
614 1.100 rillig lsym = had_eof ? lsym_eof : lsym_newline;
615 1.16 kamil break;
616 1.16 kamil
617 1.43 rillig case '\'':
618 1.43 rillig case '"':
619 1.44 rillig lex_char_or_string();
620 1.100 rillig lsym = lsym_ident;
621 1.16 kamil break;
622 1.6 lukem
623 1.40 rillig case '(':
624 1.40 rillig case '[':
625 1.16 kamil unary_delim = true;
626 1.100 rillig lsym = lsym_lparen_or_lbracket;
627 1.16 kamil break;
628 1.16 kamil
629 1.40 rillig case ')':
630 1.40 rillig case ']':
631 1.100 rillig lsym = lsym_rparen_or_rbracket;
632 1.16 kamil break;
633 1.16 kamil
634 1.16 kamil case '#':
635 1.107 rillig unary_delim = ps.next_unary;
636 1.100 rillig lsym = lsym_preprocessing;
637 1.16 kamil break;
638 1.16 kamil
639 1.16 kamil case '?':
640 1.16 kamil unary_delim = true;
641 1.100 rillig lsym = lsym_question;
642 1.16 kamil break;
643 1.16 kamil
644 1.40 rillig case ':':
645 1.100 rillig lsym = lsym_colon;
646 1.16 kamil unary_delim = true;
647 1.16 kamil break;
648 1.16 kamil
649 1.40 rillig case ';':
650 1.16 kamil unary_delim = true;
651 1.100 rillig lsym = lsym_semicolon;
652 1.16 kamil break;
653 1.16 kamil
654 1.40 rillig case '{':
655 1.16 kamil unary_delim = true;
656 1.100 rillig lsym = lsym_lbrace;
657 1.16 kamil break;
658 1.16 kamil
659 1.40 rillig case '}':
660 1.16 kamil unary_delim = true;
661 1.100 rillig lsym = lsym_rbrace;
662 1.16 kamil break;
663 1.16 kamil
664 1.69 rillig case '\f':
665 1.107 rillig unary_delim = ps.next_unary;
666 1.117 rillig ps.curr_newline = true;
667 1.100 rillig lsym = lsym_form_feed;
668 1.16 kamil break;
669 1.16 kamil
670 1.40 rillig case ',':
671 1.16 kamil unary_delim = true;
672 1.100 rillig lsym = lsym_comma;
673 1.16 kamil break;
674 1.16 kamil
675 1.16 kamil case '.':
676 1.16 kamil unary_delim = false;
677 1.100 rillig lsym = lsym_period;
678 1.16 kamil break;
679 1.1 cgd
680 1.16 kamil case '-':
681 1.90 rillig case '+':
682 1.107 rillig lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
683 1.16 kamil unary_delim = true;
684 1.16 kamil
685 1.90 rillig if (*inp.s == token.s[0]) { /* ++, -- */
686 1.78 rillig *token.e++ = *inp.s++;
687 1.110 rillig if (ps.prev_token == lsym_ident ||
688 1.110 rillig ps.prev_token == lsym_rparen_or_rbracket) {
689 1.107 rillig lsym = ps.next_unary ? lsym_unary_op : lsym_postfix_op;
690 1.1 cgd unary_delim = false;
691 1.16 kamil }
692 1.75 rillig
693 1.90 rillig } else if (*inp.s == '=') { /* += */
694 1.78 rillig *token.e++ = *inp.s++;
695 1.75 rillig
696 1.90 rillig } else if (*inp.s == '>') { /* -> */
697 1.78 rillig *token.e++ = *inp.s++;
698 1.16 kamil unary_delim = false;
699 1.100 rillig lsym = lsym_unary_op;
700 1.107 rillig ps.want_blank = false;
701 1.16 kamil }
702 1.90 rillig break;
703 1.16 kamil
704 1.16 kamil case '=':
705 1.107 rillig if (ps.init_or_struct)
706 1.107 rillig ps.block_init = true;
707 1.78 rillig if (*inp.s == '=') { /* == */
708 1.78 rillig *token.e++ = *inp.s++;
709 1.67 rillig *token.e = '\0';
710 1.16 kamil }
711 1.100 rillig lsym = lsym_binary_op;
712 1.16 kamil unary_delim = true;
713 1.16 kamil break;
714 1.16 kamil
715 1.16 kamil case '>':
716 1.16 kamil case '<':
717 1.16 kamil case '!': /* ops like <, <<, <=, !=, etc */
718 1.78 rillig if (*inp.s == '>' || *inp.s == '<' || *inp.s == '=')
719 1.50 rillig *token.e++ = inbuf_next();
720 1.78 rillig if (*inp.s == '=')
721 1.78 rillig *token.e++ = *inp.s++;
722 1.107 rillig lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
723 1.16 kamil unary_delim = true;
724 1.16 kamil break;
725 1.16 kamil
726 1.16 kamil case '*':
727 1.16 kamil unary_delim = true;
728 1.107 rillig if (!ps.next_unary) {
729 1.78 rillig if (*inp.s == '=')
730 1.78 rillig *token.e++ = *inp.s++;
731 1.100 rillig lsym = lsym_binary_op;
732 1.16 kamil break;
733 1.16 kamil }
734 1.75 rillig
735 1.78 rillig while (*inp.s == '*' || isspace((unsigned char)*inp.s)) {
736 1.87 rillig if (*inp.s == '*')
737 1.87 rillig token_add_char('*');
738 1.32 rillig inbuf_skip();
739 1.16 kamil }
740 1.75 rillig
741 1.16 kamil if (ps.in_decl) {
742 1.78 rillig char *tp = inp.s;
743 1.6 lukem
744 1.16 kamil while (isalpha((unsigned char)*tp) ||
745 1.103 rillig isspace((unsigned char)*tp)) {
746 1.78 rillig if (++tp >= inp.e)
747 1.81 rillig inbuf_read_line();
748 1.16 kamil }
749 1.16 kamil if (*tp == '(')
750 1.16 kamil ps.procname[0] = ' ';
751 1.16 kamil }
752 1.75 rillig
753 1.100 rillig lsym = lsym_unary_op;
754 1.16 kamil break;
755 1.1 cgd
756 1.16 kamil default:
757 1.78 rillig if (token.s[0] == '/' && (*inp.s == '*' || *inp.s == '/')) {
758 1.16 kamil /* it is start of comment */
759 1.50 rillig *token.e++ = inbuf_next();
760 1.1 cgd
761 1.100 rillig lsym = lsym_comment;
762 1.107 rillig unary_delim = ps.next_unary;
763 1.16 kamil break;
764 1.1 cgd }
765 1.75 rillig
766 1.78 rillig while (token.e[-1] == *inp.s || *inp.s == '=') {
767 1.87 rillig /* handle '||', '&&', etc., and also things as in 'int *****i' */
768 1.87 rillig token_add_char(inbuf_next());
769 1.16 kamil }
770 1.75 rillig
771 1.107 rillig lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op;
772 1.16 kamil unary_delim = true;
773 1.47 rillig }
774 1.16 kamil
775 1.95 rillig if (inp.s >= inp.e) /* check for input buffer empty */
776 1.81 rillig inbuf_read_line();
777 1.75 rillig
778 1.107 rillig ps.next_unary = unary_delim;
779 1.75 rillig
780 1.25 rillig check_size_token(1);
781 1.50 rillig *token.e = '\0';
782 1.75 rillig
783 1.107 rillig return lexi_end(lsym);
784 1.1 cgd }
785 1.16 kamil
786 1.6 lukem void
787 1.64 rillig add_typename(const char *name)
788 1.1 cgd {
789 1.64 rillig if (typenames.len >= typenames.cap) {
790 1.64 rillig typenames.cap = 16 + 2 * typenames.cap;
791 1.64 rillig typenames.items = xrealloc(typenames.items,
792 1.64 rillig sizeof(typenames.items[0]) * typenames.cap);
793 1.64 rillig }
794 1.16 kamil
795 1.84 rillig int pos = bsearch_typenames(name);
796 1.64 rillig if (pos >= 0)
797 1.64 rillig return; /* already in the list */
798 1.75 rillig
799 1.64 rillig pos = -(pos + 1);
800 1.64 rillig memmove(typenames.items + pos + 1, typenames.items + pos,
801 1.73 rillig sizeof(typenames.items[0]) * (typenames.len++ - (unsigned)pos));
802 1.64 rillig typenames.items[pos] = xstrdup(name);
803 1.1 cgd }
804