1 1.242 rillig /* $NetBSD: lexi.c,v 1.242 2023/12/03 21:44:42 rillig Exp $ */ 2 1.3 tls 3 1.16 kamil /*- 4 1.16 kamil * SPDX-License-Identifier: BSD-4-Clause 5 1.16 kamil * 6 1.16 kamil * Copyright (c) 1985 Sun Microsystems, Inc. 7 1.5 mrg * Copyright (c) 1980, 1993 8 1.5 mrg * The Regents of the University of California. All rights reserved. 9 1.1 cgd * All rights reserved. 10 1.1 cgd * 11 1.1 cgd * Redistribution and use in source and binary forms, with or without 12 1.1 cgd * modification, are permitted provided that the following conditions 13 1.1 cgd * are met: 14 1.1 cgd * 1. Redistributions of source code must retain the above copyright 15 1.1 cgd * notice, this list of conditions and the following disclaimer. 16 1.1 cgd * 2. Redistributions in binary form must reproduce the above copyright 17 1.1 cgd * notice, this list of conditions and the following disclaimer in the 18 1.1 cgd * documentation and/or other materials provided with the distribution. 19 1.1 cgd * 3. All advertising materials mentioning features or use of this software 20 1.1 cgd * must display the following acknowledgement: 21 1.1 cgd * This product includes software developed by the University of 22 1.1 cgd * California, Berkeley and its contributors. 23 1.1 cgd * 4. Neither the name of the University nor the names of its contributors 24 1.1 cgd * may be used to endorse or promote products derived from this software 25 1.1 cgd * without specific prior written permission. 26 1.1 cgd * 27 1.1 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 1.1 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 1.1 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 1.1 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 1.1 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 1.1 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 1.1 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 1.1 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 1.1 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 1.1 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 1.1 cgd * SUCH DAMAGE. 38 1.1 cgd */ 39 1.1 cgd 40 1.6 lukem #include <sys/cdefs.h> 41 1.242 rillig __RCSID("$NetBSD: lexi.c,v 1.242 2023/12/03 21:44:42 rillig Exp $"); 42 1.1 cgd 43 1.1 cgd #include <stdlib.h> 44 1.1 cgd #include <string.h> 45 1.16 kamil 46 1.16 kamil #include "indent.h" 47 1.1 cgd 48 1.60 rillig /* must be sorted alphabetically, is used in binary search */ 49 1.62 rillig static const struct keyword { 50 1.198 rillig const char name[12]; 51 1.198 rillig lexer_symbol lsym; 52 1.62 rillig } keywords[] = { 53 1.198 rillig {"_Bool", lsym_type}, 54 1.235 rillig {"_Complex", lsym_modifier}, 55 1.235 rillig {"_Imaginary", lsym_modifier}, 56 1.209 rillig {"auto", lsym_modifier}, 57 1.198 rillig {"bool", lsym_type}, 58 1.198 rillig {"break", lsym_word}, 59 1.210 rillig {"case", lsym_case}, 60 1.198 rillig {"char", lsym_type}, 61 1.235 rillig {"complex", lsym_modifier}, 62 1.209 rillig {"const", lsym_modifier}, 63 1.198 rillig {"continue", lsym_word}, 64 1.210 rillig {"default", lsym_default}, 65 1.198 rillig {"do", lsym_do}, 66 1.198 rillig {"double", lsym_type}, 67 1.198 rillig {"else", lsym_else}, 68 1.198 rillig {"enum", lsym_tag}, 69 1.209 rillig {"extern", lsym_modifier}, 70 1.198 rillig {"float", lsym_type}, 71 1.198 rillig {"for", lsym_for}, 72 1.198 rillig {"goto", lsym_word}, 73 1.198 rillig {"if", lsym_if}, 74 1.235 rillig {"imaginary", lsym_modifier}, 75 1.209 rillig {"inline", lsym_modifier}, 76 1.198 rillig {"int", lsym_type}, 77 1.198 rillig {"long", lsym_type}, 78 1.198 rillig {"offsetof", lsym_offsetof}, 79 1.209 rillig {"register", lsym_modifier}, 80 1.198 rillig {"restrict", lsym_word}, 81 1.198 rillig {"return", lsym_return}, 82 1.198 rillig {"short", lsym_type}, 83 1.198 rillig {"signed", lsym_type}, 84 1.198 rillig {"sizeof", lsym_sizeof}, 85 1.209 rillig {"static", lsym_modifier}, 86 1.198 rillig {"struct", lsym_tag}, 87 1.198 rillig {"switch", lsym_switch}, 88 1.198 rillig {"typedef", lsym_typedef}, 89 1.198 rillig {"union", lsym_tag}, 90 1.198 rillig {"unsigned", lsym_type}, 91 1.198 rillig {"void", lsym_type}, 92 1.209 rillig {"volatile", lsym_modifier}, 93 1.198 rillig {"while", lsym_while} 94 1.1 cgd }; 95 1.1 cgd 96 1.84 rillig static struct { 97 1.198 rillig const char **items; 98 1.198 rillig unsigned int len; 99 1.198 rillig unsigned int cap; 100 1.64 rillig } typenames; 101 1.16 kamil 102 1.196 rillig /*- 103 1.16 kamil * The transition table below was rewritten by hand from lx's output, given 104 1.16 kamil * the following definitions. lx is Katherine Flavel's lexer generator. 105 1.16 kamil * 106 1.16 kamil * O = /[0-7]/; D = /[0-9]/; NZ = /[1-9]/; 107 1.16 kamil * H = /[a-f0-9]/i; B = /[0-1]/; HP = /0x/i; 108 1.16 kamil * BP = /0b/i; E = /e[+\-]?/i D+; P = /p[+\-]?/i D+; 109 1.16 kamil * FS = /[fl]/i; IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?; 110 1.16 kamil * 111 1.16 kamil * D+ E FS? -> $float; 112 1.16 kamil * D* "." D+ E? FS? -> $float; 113 1.16 kamil * D+ "." E? FS? -> $float; HP H+ IS? -> $int; 114 1.16 kamil * HP H+ P FS? -> $float; NZ D* IS? -> $int; 115 1.16 kamil * HP H* "." H+ P FS? -> $float; "0" O* IS? -> $int; 116 1.16 kamil * HP H+ "." P FS -> $float; BP B+ IS? -> $int; 117 1.16 kamil */ 118 1.71 rillig /* INDENT OFF */ 119 1.82 rillig static const unsigned char lex_number_state[][26] = { 120 1.199 rillig /* examples: 121 1.199 rillig 00 122 1.199 rillig s 0xx 123 1.199 rillig t 00xaa 124 1.199 rillig a 11 101100xxa.. 125 1.199 rillig r 11ee0001101lbuuxx.a.pp 126 1.199 rillig t.01.e+008bLuxll0Ll.aa.p+0 127 1.199 rillig states: ABCDEFGHIJKLMNOPQRSTUVWXYZ */ 128 1.199 rillig [0] = "uuiifuufiuuiiuiiiiiuiuuuuu", /* (other) */ 129 1.199 rillig [1] = "CEIDEHHHIJQ U Q VUVVZZZ", /* 0 */ 130 1.199 rillig [2] = "DEIDEHHHIJQ U Q VUVVZZZ", /* 1 */ 131 1.199 rillig [3] = "DEIDEHHHIJ U VUVVZZZ", /* 2 3 4 5 6 7 */ 132 1.199 rillig [4] = "DEJDEHHHJJ U VUVVZZZ", /* 8 9 */ 133 1.199 rillig [5] = " U VUVV ", /* A a C c D d */ 134 1.199 rillig [6] = " K U VUVV ", /* B b */ 135 1.199 rillig [7] = " FFF FF U VUVV ", /* E e */ 136 1.199 rillig [8] = " f f U VUVV f", /* F f */ 137 1.199 rillig [9] = " LLf fL PR Li L f", /* L */ 138 1.199 rillig [10] = " OOf fO S P O i O f", /* l */ 139 1.199 rillig [11] = " FFX ", /* P p */ 140 1.199 rillig [12] = " MM M i iiM M ", /* U u */ 141 1.199 rillig [13] = " N ", /* X x */ 142 1.199 rillig [14] = " G Y ", /* + - */ 143 1.199 rillig [15] = "B EE EE T W ", /* . */ 144 1.199 rillig /* ABCDEFGHIJKLMNOPQRSTUVWXYZ */ 145 1.1 cgd }; 146 1.71 rillig /* INDENT ON */ 147 1.1 cgd 148 1.115 rillig static const unsigned char lex_number_row[] = { 149 1.198 rillig ['0'] = 1, 150 1.198 rillig ['1'] = 2, 151 1.198 rillig ['2'] = 3, ['3'] = 3, ['4'] = 3, ['5'] = 3, ['6'] = 3, ['7'] = 3, 152 1.198 rillig ['8'] = 4, ['9'] = 4, 153 1.198 rillig ['A'] = 5, ['a'] = 5, ['C'] = 5, ['c'] = 5, ['D'] = 5, ['d'] = 5, 154 1.198 rillig ['B'] = 6, ['b'] = 6, 155 1.198 rillig ['E'] = 7, ['e'] = 7, 156 1.198 rillig ['F'] = 8, ['f'] = 8, 157 1.198 rillig ['L'] = 9, 158 1.198 rillig ['l'] = 10, 159 1.198 rillig ['P'] = 11, ['p'] = 11, 160 1.198 rillig ['U'] = 12, ['u'] = 12, 161 1.198 rillig ['X'] = 13, ['x'] = 13, 162 1.198 rillig ['+'] = 14, ['-'] = 14, 163 1.198 rillig ['.'] = 15, 164 1.56 rillig }; 165 1.36 rillig 166 1.215 rillig 167 1.225 rillig static bool 168 1.225 rillig is_identifier_start(char ch) 169 1.225 rillig { 170 1.225 rillig return ch_isalpha(ch) || ch == '_' || ch == '$'; 171 1.225 rillig } 172 1.225 rillig 173 1.225 rillig static bool 174 1.225 rillig is_identifier_part(char ch) 175 1.225 rillig { 176 1.225 rillig return ch_isalnum(ch) || ch == '_' || ch == '$'; 177 1.225 rillig } 178 1.225 rillig 179 1.25 rillig static void 180 1.87 rillig token_add_char(char ch) 181 1.87 rillig { 182 1.198 rillig buf_add_char(&token, ch); 183 1.87 rillig } 184 1.87 rillig 185 1.232 rillig static bool 186 1.232 rillig skip_line_continuation(void) 187 1.232 rillig { 188 1.242 rillig if (in.p[0] == '\\' && in.p[1] == '\n') { 189 1.242 rillig in.p++; 190 1.232 rillig inp_skip(); 191 1.242 rillig in.token_end_line++; 192 1.232 rillig return true; 193 1.232 rillig } 194 1.232 rillig return false; 195 1.232 rillig } 196 1.232 rillig 197 1.43 rillig static void 198 1.43 rillig lex_number(void) 199 1.43 rillig { 200 1.198 rillig for (unsigned char s = 'A'; s != 'f' && s != 'i' && s != 'u';) { 201 1.242 rillig unsigned char ch = (unsigned char)*in.p; 202 1.232 rillig if (skip_line_continuation()) 203 1.198 rillig continue; 204 1.199 rillig if (ch >= array_length(lex_number_row) 205 1.199 rillig || lex_number_row[ch] == 0) 206 1.198 rillig break; 207 1.198 rillig 208 1.198 rillig unsigned char row = lex_number_row[ch]; 209 1.198 rillig if (lex_number_state[row][s - 'A'] == ' ') { 210 1.237 rillig // lex_number_state[0][s - 'A'] now indicates the type: 211 1.237 rillig // f = floating, i = integer, u = unknown 212 1.198 rillig return; 213 1.198 rillig } 214 1.198 rillig 215 1.198 rillig s = lex_number_state[row][s - 'A']; 216 1.198 rillig token_add_char(inp_next()); 217 1.43 rillig } 218 1.43 rillig } 219 1.43 rillig 220 1.43 rillig static void 221 1.43 rillig lex_word(void) 222 1.43 rillig { 223 1.198 rillig for (;;) { 224 1.242 rillig if (is_identifier_part(*in.p)) 225 1.242 rillig token_add_char(*in.p++); 226 1.232 rillig else if (skip_line_continuation()) 227 1.232 rillig continue; 228 1.232 rillig else 229 1.198 rillig return; 230 1.198 rillig } 231 1.43 rillig } 232 1.43 rillig 233 1.43 rillig static void 234 1.43 rillig lex_char_or_string(void) 235 1.43 rillig { 236 1.212 rillig for (char delim = token.s[token.len - 1];;) { 237 1.242 rillig if (*in.p == '\n') { 238 1.198 rillig diag(1, "Unterminated literal"); 239 1.198 rillig return; 240 1.198 rillig } 241 1.198 rillig 242 1.242 rillig token_add_char(*in.p++); 243 1.212 rillig if (token.s[token.len - 1] == delim) 244 1.198 rillig return; 245 1.198 rillig 246 1.212 rillig if (token.s[token.len - 1] == '\\') { 247 1.242 rillig if (*in.p == '\n') 248 1.242 rillig in.token_end_line++; 249 1.198 rillig token_add_char(inp_next()); 250 1.198 rillig } 251 1.52 rillig } 252 1.43 rillig } 253 1.43 rillig 254 1.84 rillig /* Guess whether the current token is a declared type. */ 255 1.57 rillig static bool 256 1.107 rillig probably_typename(void) 257 1.57 rillig { 258 1.211 rillig if (ps.prev_lsym == lsym_modifier) 259 1.198 rillig return true; 260 1.221 rillig if (ps.in_init) 261 1.198 rillig return false; 262 1.198 rillig if (ps.in_stmt_or_decl) /* XXX: this condition looks incorrect */ 263 1.198 rillig return false; 264 1.220 rillig if (ps.prev_lsym == lsym_semicolon 265 1.220 rillig || ps.prev_lsym == lsym_lbrace 266 1.220 rillig || ps.prev_lsym == lsym_rbrace) { 267 1.242 rillig if (in.p[0] == '*' && in.p[1] != '=') 268 1.220 rillig return true; 269 1.220 rillig /* XXX: is_identifier_start */ 270 1.242 rillig if (ch_isalpha(in.p[0])) 271 1.220 rillig return true; 272 1.220 rillig } 273 1.70 rillig return false; 274 1.57 rillig } 275 1.57 rillig 276 1.84 rillig static int 277 1.84 rillig bsearch_typenames(const char *key) 278 1.84 rillig { 279 1.198 rillig const char **arr = typenames.items; 280 1.225 rillig unsigned lo = 0; 281 1.225 rillig unsigned hi = typenames.len; 282 1.198 rillig 283 1.225 rillig while (lo < hi) { 284 1.225 rillig unsigned mid = (lo + hi) / 2; 285 1.198 rillig int cmp = strcmp(arr[mid], key); 286 1.198 rillig if (cmp < 0) 287 1.198 rillig lo = mid + 1; 288 1.198 rillig else if (cmp > 0) 289 1.225 rillig hi = mid; 290 1.198 rillig else 291 1.225 rillig return (int)mid; 292 1.198 rillig } 293 1.225 rillig return -1 - (int)lo; 294 1.84 rillig } 295 1.84 rillig 296 1.63 rillig static bool 297 1.63 rillig is_typename(void) 298 1.63 rillig { 299 1.236 rillig if (ps.prev_lsym == lsym_tag) 300 1.236 rillig return true; 301 1.198 rillig if (opt.auto_typedefs && 302 1.212 rillig token.len >= 2 && memcmp(token.s + token.len - 2, "_t", 2) == 0) 303 1.198 rillig return true; 304 1.63 rillig 305 1.212 rillig return bsearch_typenames(token.s) >= 0; 306 1.63 rillig } 307 1.63 rillig 308 1.225 rillig void 309 1.225 rillig register_typename(const char *name) 310 1.225 rillig { 311 1.225 rillig if (typenames.len >= typenames.cap) { 312 1.225 rillig typenames.cap = 16 + 2 * typenames.cap; 313 1.225 rillig typenames.items = nonnull(realloc(typenames.items, 314 1.225 rillig sizeof(typenames.items[0]) * typenames.cap)); 315 1.225 rillig } 316 1.225 rillig 317 1.225 rillig int pos = bsearch_typenames(name); 318 1.225 rillig if (pos >= 0) 319 1.225 rillig return; /* already in the list */ 320 1.225 rillig 321 1.225 rillig pos = -1 - pos; 322 1.225 rillig memmove(typenames.items + pos + 1, typenames.items + pos, 323 1.225 rillig sizeof(typenames.items[0]) * (typenames.len++ - (unsigned)pos)); 324 1.225 rillig typenames.items[pos] = nonnull(strdup(name)); 325 1.225 rillig } 326 1.225 rillig 327 1.115 rillig static int 328 1.115 rillig cmp_keyword_by_name(const void *key, const void *elem) 329 1.115 rillig { 330 1.198 rillig return strcmp(key, ((const struct keyword *)elem)->name); 331 1.115 rillig } 332 1.115 rillig 333 1.165 rillig /* 334 1.231 rillig * Looking at the '(', guess whether this starts a function definition or a 335 1.231 rillig * function declaration. 336 1.165 rillig */ 337 1.155 rillig static bool 338 1.234 rillig probably_function_definition(const char *p) 339 1.155 rillig { 340 1.236 rillig // TODO: Don't look at characters in comments, see lsym_funcname.c. 341 1.198 rillig int paren_level = 0; 342 1.234 rillig for (; *p != '\n'; p++) { 343 1.198 rillig if (*p == '(') 344 1.198 rillig paren_level++; 345 1.198 rillig if (*p == ')' && --paren_level == 0) { 346 1.198 rillig p++; 347 1.198 rillig 348 1.199 rillig while (*p != '\n' 349 1.199 rillig && (ch_isspace(*p) || is_identifier_part(*p))) 350 1.198 rillig p++; /* '__dead' or '__unused' */ 351 1.198 rillig 352 1.198 rillig if (*p == '\n') /* func(...) */ 353 1.198 rillig break; 354 1.198 rillig if (*p == ';') /* func(...); */ 355 1.198 rillig return false; 356 1.198 rillig if (*p == ',') /* double abs(), pi; */ 357 1.198 rillig return false; 358 1.198 rillig if (*p == '(') /* func(...) __attribute__((...)) */ 359 1.198 rillig paren_level++; /* func(...) __printflike(...) 360 1.198 rillig */ 361 1.198 rillig else 362 1.198 rillig break; /* func(...) { ... */ 363 1.198 rillig } 364 1.219 rillig 365 1.219 rillig if (paren_level == 1 && p[0] == '*' && p[1] == ',') 366 1.219 rillig return false; 367 1.198 rillig } 368 1.198 rillig 369 1.231 rillig /* 370 1.231 rillig * To further reduce the cases where indent wrongly treats an 371 1.198 rillig * incomplete function declaration as a function definition, thus 372 1.198 rillig * adding a newline before the function name, it may be worth looking 373 1.198 rillig * for parameter names, as these are often omitted in function 374 1.198 rillig * declarations and only included in function definitions. Or just 375 1.198 rillig * increase the lookahead to more than just the current line of input, 376 1.231 rillig * until the next '{'. 377 1.231 rillig */ 378 1.198 rillig return true; 379 1.155 rillig } 380 1.155 rillig 381 1.100 rillig static lexer_symbol 382 1.107 rillig lexi_alnum(void) 383 1.1 cgd { 384 1.242 rillig if (ch_isdigit(in.p[0]) || 385 1.242 rillig (in.p[0] == '.' && ch_isdigit(in.p[1]))) { 386 1.198 rillig lex_number(); 387 1.242 rillig } else if (is_identifier_start(in.p[0])) { 388 1.198 rillig lex_word(); 389 1.198 rillig 390 1.212 rillig if (token.len == 1 && token.s[0] == 'L' && 391 1.242 rillig (in.p[0] == '"' || in.p[0] == '\'')) { 392 1.242 rillig token_add_char(*in.p++); 393 1.198 rillig lex_char_or_string(); 394 1.198 rillig ps.next_unary = false; 395 1.198 rillig return lsym_word; 396 1.198 rillig } 397 1.198 rillig } else 398 1.198 rillig return lsym_eof; /* just as a placeholder */ 399 1.198 rillig 400 1.242 rillig while (ch_isblank(*in.p)) 401 1.242 rillig in.p++; 402 1.198 rillig 403 1.211 rillig ps.next_unary = ps.prev_lsym == lsym_tag 404 1.238 rillig || ps.prev_lsym == lsym_typedef 405 1.242 rillig || (ps.prev_lsym == lsym_modifier && *in.p == '*'); 406 1.198 rillig 407 1.228 rillig if (ps.prev_lsym == lsym_tag && ps.paren.len == 0) 408 1.230 rillig return lsym_type; 409 1.239 rillig if (ps.spaced_expr_psym == psym_for_exprs 410 1.239 rillig && ps.prev_lsym == lsym_lparen && ps.paren.len == 1 411 1.242 rillig && *in.p == '*') { 412 1.239 rillig ps.next_unary = true; 413 1.239 rillig return lsym_type; 414 1.239 rillig } 415 1.198 rillig 416 1.237 rillig token_add_char('\0'); // Terminate in non-debug mode as well. 417 1.198 rillig token.len--; 418 1.212 rillig const struct keyword *kw = bsearch(token.s, keywords, 419 1.198 rillig array_length(keywords), sizeof(keywords[0]), cmp_keyword_by_name); 420 1.201 rillig lexer_symbol lsym = lsym_word; 421 1.201 rillig if (kw != NULL) { 422 1.236 rillig lsym = kw->lsym; 423 1.201 rillig ps.next_unary = true; 424 1.236 rillig if (lsym == lsym_tag || lsym == lsym_type) 425 1.198 rillig goto found_typename; 426 1.236 rillig return lsym; 427 1.201 rillig } 428 1.198 rillig 429 1.201 rillig if (is_typename()) { 430 1.230 rillig lsym = lsym_type; 431 1.198 rillig ps.next_unary = true; 432 1.118 rillig found_typename: 433 1.211 rillig if (ps.prev_lsym != lsym_period 434 1.211 rillig && ps.prev_lsym != lsym_unary_op) { 435 1.236 rillig if (lsym == lsym_tag) 436 1.198 rillig return lsym_tag; 437 1.228 rillig if (ps.paren.len == 0) 438 1.230 rillig return lsym_type; 439 1.198 rillig } 440 1.90 rillig } 441 1.89 rillig 442 1.242 rillig const char *p = in.p; 443 1.234 rillig if (*p == ')') 444 1.234 rillig p++; 445 1.234 rillig if (*p == '(' && ps.psyms.len < 3 && ps.ind_level == 0 && 446 1.221 rillig !ps.in_func_def_params && !ps.in_init) { 447 1.89 rillig 448 1.242 rillig bool maybe_function_definition = *in.p == ')' 449 1.234 rillig ? ps.paren.len == 1 && ps.prev_lsym != lsym_unary_op 450 1.234 rillig : ps.paren.len == 0; 451 1.234 rillig if (maybe_function_definition 452 1.234 rillig && probably_function_definition(p)) { 453 1.222 rillig ps.line_has_func_def = true; 454 1.198 rillig if (ps.in_decl) 455 1.198 rillig ps.in_func_def_params = true; 456 1.198 rillig return lsym_funcname; 457 1.198 rillig } 458 1.198 rillig 459 1.228 rillig } else if (ps.paren.len == 0 && probably_typename()) { 460 1.198 rillig ps.next_unary = true; 461 1.230 rillig return lsym_type; 462 1.155 rillig } 463 1.89 rillig 464 1.201 rillig return lsym; 465 1.89 rillig } 466 1.75 rillig 467 1.234 rillig static void 468 1.234 rillig check_parenthesized_function_definition(void) 469 1.234 rillig { 470 1.242 rillig const char *p = in.p; 471 1.234 rillig while (ch_isblank(*p)) 472 1.234 rillig p++; 473 1.234 rillig if (is_identifier_start(*p)) 474 1.234 rillig while (is_identifier_part(*p)) 475 1.234 rillig p++; 476 1.234 rillig while (ch_isblank(*p)) 477 1.234 rillig p++; 478 1.234 rillig if (*p == ')') { 479 1.234 rillig p++; 480 1.234 rillig while (ch_isblank(*p)) 481 1.234 rillig p++; 482 1.234 rillig if (*p == '(' && probably_function_definition(p)) 483 1.234 rillig ps.line_has_func_def = true; 484 1.234 rillig } 485 1.234 rillig } 486 1.234 rillig 487 1.163 rillig static bool 488 1.234 rillig is_asterisk_unary(void) 489 1.163 rillig { 490 1.242 rillig const char *p = in.p; 491 1.233 rillig while (*p == '*' || ch_isblank(*p)) 492 1.233 rillig p++; 493 1.233 rillig if (*p == ')') 494 1.204 rillig return true; 495 1.198 rillig if (ps.next_unary || ps.in_func_def_params) 496 1.198 rillig return true; 497 1.211 rillig if (ps.prev_lsym == lsym_word || 498 1.211 rillig ps.prev_lsym == lsym_rparen || 499 1.211 rillig ps.prev_lsym == lsym_rbracket) 500 1.198 rillig return false; 501 1.228 rillig return ps.in_decl && ps.paren.len > 0; 502 1.163 rillig } 503 1.163 rillig 504 1.200 rillig static bool 505 1.200 rillig probably_in_function_definition(void) 506 1.200 rillig { 507 1.242 rillig for (const char *p = in.p; *p != '\n';) { 508 1.231 rillig if (ch_isspace(*p)) 509 1.231 rillig p++; 510 1.231 rillig else if (is_identifier_start(*p)) { 511 1.231 rillig p++; 512 1.231 rillig while (is_identifier_part(*p)) 513 1.231 rillig p++; 514 1.200 rillig } else 515 1.231 rillig return *p == '('; 516 1.200 rillig } 517 1.200 rillig return false; 518 1.200 rillig } 519 1.200 rillig 520 1.161 rillig static void 521 1.234 rillig lex_asterisk_unary(void) 522 1.161 rillig { 523 1.242 rillig while (*in.p == '*' || ch_isspace(*in.p)) { 524 1.242 rillig if (*in.p == '*') 525 1.198 rillig token_add_char('*'); 526 1.242 rillig if (*in.p == '\n') 527 1.242 rillig in.token_end_line++; 528 1.198 rillig inp_skip(); 529 1.198 rillig } 530 1.198 rillig 531 1.200 rillig if (ps.in_decl && probably_in_function_definition()) 532 1.222 rillig ps.line_has_func_def = true; 533 1.161 rillig } 534 1.161 rillig 535 1.225 rillig static bool 536 1.225 rillig skip(const char **pp, const char *s) 537 1.193 rillig { 538 1.225 rillig size_t len = strlen(s); 539 1.198 rillig while (ch_isblank(**pp)) 540 1.198 rillig (*pp)++; 541 1.198 rillig if (strncmp(*pp, s, len) == 0) { 542 1.198 rillig *pp += len; 543 1.198 rillig return true; 544 1.198 rillig } 545 1.198 rillig return false; 546 1.193 rillig } 547 1.193 rillig 548 1.194 rillig static void 549 1.193 rillig lex_indent_comment(void) 550 1.193 rillig { 551 1.242 rillig const char *p = in.line.s; 552 1.225 rillig if (skip(&p, "/*") && skip(&p, "INDENT")) { 553 1.225 rillig enum indent_enabled enabled; 554 1.225 rillig if (skip(&p, "ON") || *p == '*') 555 1.225 rillig enabled = indent_last_off_line; 556 1.225 rillig else if (skip(&p, "OFF")) 557 1.225 rillig enabled = indent_off; 558 1.225 rillig else 559 1.225 rillig return; 560 1.225 rillig if (skip(&p, "*/\n")) { 561 1.225 rillig if (lab.len > 0 || code.len > 0 || com.len > 0) 562 1.225 rillig output_line(); 563 1.225 rillig indent_enabled = enabled; 564 1.225 rillig } 565 1.225 rillig } 566 1.193 rillig } 567 1.193 rillig 568 1.89 rillig /* Reads the next token, placing it in the global variable "token". */ 569 1.100 rillig lexer_symbol 570 1.106 rillig lexi(void) 571 1.89 rillig { 572 1.223 rillig buf_clear(&token); 573 1.198 rillig 574 1.198 rillig for (;;) { 575 1.242 rillig if (ch_isblank(*in.p)) 576 1.242 rillig in.p++; 577 1.232 rillig else if (skip_line_continuation()) 578 1.232 rillig continue; 579 1.232 rillig else 580 1.198 rillig break; 581 1.198 rillig } 582 1.242 rillig in.token_start_line = in.token_end_line; 583 1.198 rillig 584 1.198 rillig lexer_symbol alnum_lsym = lexi_alnum(); 585 1.205 rillig if (alnum_lsym != lsym_eof) 586 1.198 rillig return alnum_lsym; 587 1.75 rillig 588 1.198 rillig /* Scan a non-alphanumeric token */ 589 1.16 kamil 590 1.198 rillig token_add_char(inp_next()); 591 1.16 kamil 592 1.198 rillig lexer_symbol lsym; 593 1.198 rillig bool next_unary; 594 1.89 rillig 595 1.212 rillig switch (token.s[token.len - 1]) { 596 1.160 rillig 597 1.220 rillig case '#': 598 1.220 rillig lsym = lsym_preprocessing; 599 1.220 rillig next_unary = ps.next_unary; 600 1.208 rillig break; 601 1.208 rillig 602 1.198 rillig case '\n': 603 1.198 rillig /* if data has been exhausted, the '\n' is a dummy. */ 604 1.198 rillig lsym = had_eof ? lsym_eof : lsym_newline; 605 1.198 rillig next_unary = ps.next_unary; 606 1.198 rillig break; 607 1.198 rillig 608 1.220 rillig /* INDENT OFF */ 609 1.220 rillig case ')': lsym = lsym_rparen; next_unary = false; break; 610 1.220 rillig case '[': lsym = lsym_lbracket; next_unary = true; break; 611 1.220 rillig case ']': lsym = lsym_rbracket; next_unary = false; break; 612 1.220 rillig case '{': lsym = lsym_lbrace; next_unary = true; break; 613 1.220 rillig case '}': lsym = lsym_rbrace; next_unary = true; break; 614 1.220 rillig case '.': lsym = lsym_period; next_unary = false; break; 615 1.220 rillig case '?': lsym = lsym_question; next_unary = true; break; 616 1.220 rillig case ',': lsym = lsym_comma; next_unary = true; break; 617 1.220 rillig case ';': lsym = lsym_semicolon; next_unary = true; break; 618 1.220 rillig /* INDENT ON */ 619 1.198 rillig 620 1.234 rillig case '(': 621 1.242 rillig if (in.p == in.line.s + 1) 622 1.234 rillig check_parenthesized_function_definition(); 623 1.234 rillig lsym = lsym_lparen; 624 1.234 rillig next_unary = true; 625 1.234 rillig break; 626 1.234 rillig 627 1.231 rillig case '+': 628 1.198 rillig case '-': 629 1.198 rillig lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op; 630 1.198 rillig next_unary = true; 631 1.198 rillig 632 1.199 rillig /* '++' or '--' */ 633 1.242 rillig if (*in.p == token.s[token.len - 1]) { 634 1.242 rillig token_add_char(*in.p++); 635 1.211 rillig if (ps.prev_lsym == lsym_word || 636 1.211 rillig ps.prev_lsym == lsym_rparen || 637 1.211 rillig ps.prev_lsym == lsym_rbracket) { 638 1.199 rillig lsym = ps.next_unary 639 1.199 rillig ? lsym_unary_op : lsym_postfix_op; 640 1.198 rillig next_unary = false; 641 1.198 rillig } 642 1.198 rillig 643 1.242 rillig } else if (*in.p == '=') { /* '+=' or '-=' */ 644 1.242 rillig token_add_char(*in.p++); 645 1.198 rillig 646 1.242 rillig } else if (*in.p == '>') { /* '->' */ 647 1.242 rillig token_add_char(*in.p++); 648 1.198 rillig lsym = lsym_unary_op; 649 1.198 rillig next_unary = false; 650 1.198 rillig ps.want_blank = false; 651 1.198 rillig } 652 1.198 rillig break; 653 1.198 rillig 654 1.220 rillig case ':': 655 1.220 rillig lsym = ps.quest_level > 0 656 1.225 rillig ? (ps.quest_level--, lsym_question_colon) 657 1.225 rillig : ps.in_var_decl ? lsym_other_colon : lsym_label_colon; 658 1.220 rillig next_unary = true; 659 1.220 rillig break; 660 1.220 rillig 661 1.220 rillig case '*': 662 1.242 rillig if (*in.p == '=') { 663 1.242 rillig token_add_char(*in.p++); 664 1.220 rillig lsym = lsym_binary_op; 665 1.234 rillig } else if (is_asterisk_unary()) { 666 1.234 rillig lex_asterisk_unary(); 667 1.220 rillig lsym = lsym_unary_op; 668 1.220 rillig } else 669 1.220 rillig lsym = lsym_binary_op; 670 1.220 rillig next_unary = true; 671 1.220 rillig break; 672 1.220 rillig 673 1.198 rillig case '=': 674 1.221 rillig if (ps.in_var_decl) 675 1.221 rillig ps.in_init = true; 676 1.242 rillig if (*in.p == '=') 677 1.242 rillig token_add_char(*in.p++); 678 1.198 rillig lsym = lsym_binary_op; 679 1.198 rillig next_unary = true; 680 1.198 rillig break; 681 1.75 rillig 682 1.198 rillig case '>': 683 1.198 rillig case '<': 684 1.225 rillig case '!': /* ops like <, <<, <=, !=, etc. */ 685 1.242 rillig if (*in.p == '>' || *in.p == '<' || *in.p == '=') 686 1.242 rillig token_add_char(*in.p++); 687 1.242 rillig if (*in.p == '=') 688 1.242 rillig token_add_char(*in.p++); 689 1.198 rillig lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op; 690 1.198 rillig next_unary = true; 691 1.198 rillig break; 692 1.75 rillig 693 1.220 rillig case '\'': 694 1.220 rillig case '"': 695 1.220 rillig lex_char_or_string(); 696 1.220 rillig lsym = lsym_word; 697 1.220 rillig next_unary = false; 698 1.198 rillig break; 699 1.1 cgd 700 1.198 rillig default: 701 1.212 rillig if (token.s[token.len - 1] == '/' 702 1.242 rillig && (*in.p == '*' || *in.p == '/')) { 703 1.198 rillig enum indent_enabled prev = indent_enabled; 704 1.198 rillig lex_indent_comment(); 705 1.198 rillig if (prev == indent_on && indent_enabled == indent_off) 706 1.223 rillig buf_clear(&out.indent_off_text); 707 1.242 rillig token_add_char(*in.p++); 708 1.198 rillig lsym = lsym_comment; 709 1.198 rillig next_unary = ps.next_unary; 710 1.198 rillig break; 711 1.198 rillig } 712 1.198 rillig 713 1.225 rillig /* punctuation like '%', '&&', '/', '^', '||', '~' */ 714 1.214 rillig lsym = ps.next_unary ? lsym_unary_op : lsym_binary_op; 715 1.242 rillig if (*in.p == token.s[token.len - 1]) 716 1.242 rillig token_add_char(*in.p++), lsym = lsym_binary_op; 717 1.242 rillig if (*in.p == '=') 718 1.242 rillig token_add_char(*in.p++), lsym = lsym_binary_op; 719 1.198 rillig 720 1.198 rillig next_unary = true; 721 1.198 rillig } 722 1.198 rillig 723 1.198 rillig ps.next_unary = next_unary; 724 1.75 rillig 725 1.198 rillig return lsym; 726 1.1 cgd } 727