lexi.c revision 1.1.1.2 1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 1985 Sun Microsystems, Inc.
5 * Copyright (c) 1980, 1993
6 * The Regents of the University of California. All rights reserved.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed by the University of
20 * California, Berkeley and its contributors.
21 * 4. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
38 #if 0
39 #ifndef lint
40 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";
41 #endif /* not lint */
42 #endif
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD: head/usr.bin/indent/lexi.c 337862 2018-08-15 18:19:45Z pstef $");
45
46 /*
47 * Here we have the token scanner for indent. It scans off one token and puts
48 * it in the global variable "token". It returns a code, indicating the type
49 * of token scanned.
50 */
51
52 #include <err.h>
53 #include <stdio.h>
54 #include <ctype.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #include <sys/param.h>
58
59 #include "indent_globs.h"
60 #include "indent_codes.h"
61 #include "indent.h"
62
63 struct templ {
64 const char *rwd;
65 int rwcode;
66 };
67
68 /*
69 * This table has to be sorted alphabetically, because it'll be used in binary
70 * search. For the same reason, string must be the first thing in struct templ.
71 */
72 struct templ specials[] =
73 {
74 {"_Bool", 4},
75 {"_Complex", 4},
76 {"_Imaginary", 4},
77 {"auto", 10},
78 {"bool", 4},
79 {"break", 9},
80 {"case", 8},
81 {"char", 4},
82 {"complex", 4},
83 {"const", 4},
84 {"continue", 12},
85 {"default", 8},
86 {"do", 6},
87 {"double", 4},
88 {"else", 6},
89 {"enum", 3},
90 {"extern", 10},
91 {"float", 4},
92 {"for", 5},
93 {"global", 4},
94 {"goto", 9},
95 {"if", 5},
96 {"imaginary", 4},
97 {"inline", 12},
98 {"int", 4},
99 {"long", 4},
100 {"offsetof", 1},
101 {"register", 10},
102 {"restrict", 12},
103 {"return", 9},
104 {"short", 4},
105 {"signed", 4},
106 {"sizeof", 2},
107 {"static", 10},
108 {"struct", 3},
109 {"switch", 7},
110 {"typedef", 11},
111 {"union", 3},
112 {"unsigned", 4},
113 {"void", 4},
114 {"volatile", 4},
115 {"while", 5}
116 };
117
118 const char **typenames;
119 int typename_count;
120 int typename_top = -1;
121
122 /*
123 * The transition table below was rewritten by hand from lx's output, given
124 * the following definitions. lx is Katherine Flavel's lexer generator.
125 *
126 * O = /[0-7]/; D = /[0-9]/; NZ = /[1-9]/;
127 * H = /[a-f0-9]/i; B = /[0-1]/; HP = /0x/i;
128 * BP = /0b/i; E = /e[+\-]?/i D+; P = /p[+\-]?/i D+;
129 * FS = /[fl]/i; IS = /u/i /(l|L|ll|LL)/? | /(l|L|ll|LL)/ /u/i?;
130 *
131 * D+ E FS? -> $float;
132 * D* "." D+ E? FS? -> $float;
133 * D+ "." E? FS? -> $float; HP H+ IS? -> $int;
134 * HP H+ P FS? -> $float; NZ D* IS? -> $int;
135 * HP H* "." H+ P FS? -> $float; "0" O* IS? -> $int;
136 * HP H+ "." P FS -> $float; BP B+ IS? -> $int;
137 */
138 static char const *table[] = {
139 /* examples:
140 00
141 s 0xx
142 t 00xaa
143 a 11 101100xxa..
144 r 11ee0001101lbuuxx.a.pp
145 t.01.e+008bLuxll0Ll.aa.p+0
146 states: ABCDEFGHIJKLMNOPQRSTUVWXYZ */
147 ['0'] = "CEIDEHHHIJQ U Q VUVVZZZ",
148 ['1'] = "DEIDEHHHIJQ U Q VUVVZZZ",
149 ['7'] = "DEIDEHHHIJ U VUVVZZZ",
150 ['9'] = "DEJDEHHHJJ U VUVVZZZ",
151 ['a'] = " U VUVV ",
152 ['b'] = " K U VUVV ",
153 ['e'] = " FFF FF U VUVV ",
154 ['f'] = " f f U VUVV f",
155 ['u'] = " MM M i iiM M ",
156 ['x'] = " N ",
157 ['p'] = " FFX ",
158 ['L'] = " LLf fL PR Li L f",
159 ['l'] = " OOf fO S P O i O f",
160 ['+'] = " G Y ",
161 ['.'] = "B EE EE T W ",
162 /* ABCDEFGHIJKLMNOPQRSTUVWXYZ */
163 [0] = "uuiifuufiuuiiuiiiiiuiuuuuu",
164 };
165
166 static int
167 strcmp_type(const void *e1, const void *e2)
168 {
169 return (strcmp(e1, *(const char * const *)e2));
170 }
171
172 int
173 lexi(struct parser_state *state)
174 {
175 int unary_delim; /* this is set to 1 if the current token
176 * forces a following operator to be unary */
177 int code; /* internal code to be returned */
178 char qchar; /* the delimiter character for a string */
179
180 e_token = s_token; /* point to start of place to save token */
181 unary_delim = false;
182 state->col_1 = state->last_nl; /* tell world that this token started
183 * in column 1 iff the last thing
184 * scanned was a newline */
185 state->last_nl = false;
186
187 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
188 state->col_1 = false; /* leading blanks imply token is not in column
189 * 1 */
190 if (++buf_ptr >= buf_end)
191 fill_buffer();
192 }
193
194 /* Scan an alphanumeric token */
195 if (isalnum((unsigned char)*buf_ptr) ||
196 *buf_ptr == '_' || *buf_ptr == '$' ||
197 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
198 /*
199 * we have a character or number
200 */
201 struct templ *p;
202
203 if (isdigit((unsigned char)*buf_ptr) ||
204 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
205 char s;
206 unsigned char i;
207
208 for (s = 'A'; s != 'f' && s != 'i' && s != 'u'; ) {
209 i = (unsigned char)*buf_ptr;
210 if (i >= nitems(table) || table[i] == NULL ||
211 table[i][s - 'A'] == ' ') {
212 s = table[0][s - 'A'];
213 break;
214 }
215 s = table[i][s - 'A'];
216 CHECK_SIZE_TOKEN(1);
217 *e_token++ = *buf_ptr++;
218 if (buf_ptr >= buf_end)
219 fill_buffer();
220 }
221 /* s now indicates the type: f(loating), i(integer), u(nknown) */
222 }
223 else
224 while (isalnum((unsigned char)*buf_ptr) ||
225 *buf_ptr == BACKSLASH ||
226 *buf_ptr == '_' || *buf_ptr == '$') {
227 /* fill_buffer() terminates buffer with newline */
228 if (*buf_ptr == BACKSLASH) {
229 if (*(buf_ptr + 1) == '\n') {
230 buf_ptr += 2;
231 if (buf_ptr >= buf_end)
232 fill_buffer();
233 } else
234 break;
235 }
236 CHECK_SIZE_TOKEN(1);
237 /* copy it over */
238 *e_token++ = *buf_ptr++;
239 if (buf_ptr >= buf_end)
240 fill_buffer();
241 }
242 *e_token = '\0';
243
244 if (s_token[0] == 'L' && s_token[1] == '\0' &&
245 (*buf_ptr == '"' || *buf_ptr == '\''))
246 return (strpfx);
247
248 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
249 if (++buf_ptr >= buf_end)
250 fill_buffer();
251 }
252 state->keyword = 0;
253 if (state->last_token == structure && !state->p_l_follow) {
254 /* if last token was 'struct' and we're not
255 * in parentheses, then this token
256 * should be treated as a declaration */
257 state->last_u_d = true;
258 return (decl);
259 }
260 /*
261 * Operator after identifier is binary unless last token was 'struct'
262 */
263 state->last_u_d = (state->last_token == structure);
264
265 p = bsearch(s_token,
266 specials,
267 sizeof(specials) / sizeof(specials[0]),
268 sizeof(specials[0]),
269 strcmp_type);
270 if (p == NULL) { /* not a special keyword... */
271 char *u;
272
273 /* ... so maybe a type_t or a typedef */
274 if ((opt.auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
275 strcmp(u, "_t") == 0) || (typename_top >= 0 &&
276 bsearch(s_token, typenames, typename_top + 1,
277 sizeof(typenames[0]), strcmp_type))) {
278 state->keyword = 4; /* a type name */
279 state->last_u_d = true;
280 goto found_typename;
281 }
282 } else { /* we have a keyword */
283 state->keyword = p->rwcode;
284 state->last_u_d = true;
285 switch (p->rwcode) {
286 case 7: /* it is a switch */
287 return (swstmt);
288 case 8: /* a case or default */
289 return (casestmt);
290
291 case 3: /* a "struct" */
292 /* FALLTHROUGH */
293 case 4: /* one of the declaration keywords */
294 found_typename:
295 if (state->p_l_follow) {
296 /* inside parens: cast, param list, offsetof or sizeof */
297 state->cast_mask |= (1 << state->p_l_follow) & ~state->not_cast_mask;
298 }
299 if (state->last_token == period || state->last_token == unary_op) {
300 state->keyword = 0;
301 break;
302 }
303 if (p != NULL && p->rwcode == 3)
304 return (structure);
305 if (state->p_l_follow)
306 break;
307 return (decl);
308
309 case 5: /* if, while, for */
310 return (sp_paren);
311
312 case 6: /* do, else */
313 return (sp_nparen);
314
315 case 10: /* storage class specifier */
316 return (storage);
317
318 case 11: /* typedef */
319 return (type_def);
320
321 default: /* all others are treated like any other
322 * identifier */
323 return (ident);
324 } /* end of switch */
325 } /* end of if (found_it) */
326 if (*buf_ptr == '(' && state->tos <= 1 && state->ind_level == 0 &&
327 state->in_parameter_declaration == 0 && state->block_init == 0) {
328 char *tp = buf_ptr;
329 while (tp < buf_end)
330 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
331 goto not_proc;
332 strncpy(state->procname, token, sizeof state->procname - 1);
333 if (state->in_decl)
334 state->in_parameter_declaration = 1;
335 return (funcname);
336 not_proc:;
337 }
338 /*
339 * The following hack attempts to guess whether or not the current
340 * token is in fact a declaration keyword -- one that has been
341 * typedefd
342 */
343 else if (!state->p_l_follow && !state->block_init &&
344 !state->in_stmt &&
345 ((*buf_ptr == '*' && buf_ptr[1] != '=') ||
346 isalpha((unsigned char)*buf_ptr)) &&
347 (state->last_token == semicolon || state->last_token == lbrace ||
348 state->last_token == rbrace)) {
349 state->keyword = 4; /* a type name */
350 state->last_u_d = true;
351 return decl;
352 }
353 if (state->last_token == decl) /* if this is a declared variable,
354 * then following sign is unary */
355 state->last_u_d = true; /* will make "int a -1" work */
356 return (ident); /* the ident is not in the list */
357 } /* end of procesing for alpanum character */
358
359 /* Scan a non-alphanumeric token */
360
361 CHECK_SIZE_TOKEN(3); /* things like "<<=" */
362 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
363 * moved here */
364 *e_token = '\0';
365 if (++buf_ptr >= buf_end)
366 fill_buffer();
367
368 switch (*token) {
369 case '\n':
370 unary_delim = state->last_u_d;
371 state->last_nl = true; /* remember that we just had a newline */
372 code = (had_eof ? 0 : newline);
373
374 /*
375 * if data has been exhausted, the newline is a dummy, and we should
376 * return code to stop
377 */
378 break;
379
380 case '\'': /* start of quoted character */
381 case '"': /* start of string */
382 qchar = *token;
383 do { /* copy the string */
384 while (1) { /* move one character or [/<char>]<char> */
385 if (*buf_ptr == '\n') {
386 diag2(1, "Unterminated literal");
387 goto stop_lit;
388 }
389 CHECK_SIZE_TOKEN(2);
390 *e_token = *buf_ptr++;
391 if (buf_ptr >= buf_end)
392 fill_buffer();
393 if (*e_token == BACKSLASH) { /* if escape, copy extra char */
394 if (*buf_ptr == '\n') /* check for escaped newline */
395 ++line_no;
396 *++e_token = *buf_ptr++;
397 ++e_token; /* we must increment this again because we
398 * copied two chars */
399 if (buf_ptr >= buf_end)
400 fill_buffer();
401 }
402 else
403 break; /* we copied one character */
404 } /* end of while (1) */
405 } while (*e_token++ != qchar);
406 stop_lit:
407 code = ident;
408 break;
409
410 case ('('):
411 case ('['):
412 unary_delim = true;
413 code = lparen;
414 break;
415
416 case (')'):
417 case (']'):
418 code = rparen;
419 break;
420
421 case '#':
422 unary_delim = state->last_u_d;
423 code = preesc;
424 break;
425
426 case '?':
427 unary_delim = true;
428 code = question;
429 break;
430
431 case (':'):
432 code = colon;
433 unary_delim = true;
434 break;
435
436 case (';'):
437 unary_delim = true;
438 code = semicolon;
439 break;
440
441 case ('{'):
442 unary_delim = true;
443
444 /*
445 * if (state->in_or_st) state->block_init = 1;
446 */
447 /* ? code = state->block_init ? lparen : lbrace; */
448 code = lbrace;
449 break;
450
451 case ('}'):
452 unary_delim = true;
453 /* ? code = state->block_init ? rparen : rbrace; */
454 code = rbrace;
455 break;
456
457 case 014: /* a form feed */
458 unary_delim = state->last_u_d;
459 state->last_nl = true; /* remember this so we can set 'state->col_1'
460 * right */
461 code = form_feed;
462 break;
463
464 case (','):
465 unary_delim = true;
466 code = comma;
467 break;
468
469 case '.':
470 unary_delim = false;
471 code = period;
472 break;
473
474 case '-':
475 case '+': /* check for -, +, --, ++ */
476 code = (state->last_u_d ? unary_op : binary_op);
477 unary_delim = true;
478
479 if (*buf_ptr == token[0]) {
480 /* check for doubled character */
481 *e_token++ = *buf_ptr++;
482 /* buffer overflow will be checked at end of loop */
483 if (state->last_token == ident || state->last_token == rparen) {
484 code = (state->last_u_d ? unary_op : postop);
485 /* check for following ++ or -- */
486 unary_delim = false;
487 }
488 }
489 else if (*buf_ptr == '=')
490 /* check for operator += */
491 *e_token++ = *buf_ptr++;
492 else if (*buf_ptr == '>') {
493 /* check for operator -> */
494 *e_token++ = *buf_ptr++;
495 unary_delim = false;
496 code = unary_op;
497 state->want_blank = false;
498 }
499 break; /* buffer overflow will be checked at end of
500 * switch */
501
502 case '=':
503 if (state->in_or_st)
504 state->block_init = 1;
505 if (*buf_ptr == '=') {/* == */
506 *e_token++ = '='; /* Flip =+ to += */
507 buf_ptr++;
508 *e_token = 0;
509 }
510 code = binary_op;
511 unary_delim = true;
512 break;
513 /* can drop thru!!! */
514
515 case '>':
516 case '<':
517 case '!': /* ops like <, <<, <=, !=, etc */
518 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
519 *e_token++ = *buf_ptr;
520 if (++buf_ptr >= buf_end)
521 fill_buffer();
522 }
523 if (*buf_ptr == '=')
524 *e_token++ = *buf_ptr++;
525 code = (state->last_u_d ? unary_op : binary_op);
526 unary_delim = true;
527 break;
528
529 case '*':
530 unary_delim = true;
531 if (!state->last_u_d) {
532 if (*buf_ptr == '=')
533 *e_token++ = *buf_ptr++;
534 code = binary_op;
535 break;
536 }
537 while (*buf_ptr == '*' || isspace((unsigned char)*buf_ptr)) {
538 if (*buf_ptr == '*') {
539 CHECK_SIZE_TOKEN(1);
540 *e_token++ = *buf_ptr;
541 }
542 if (++buf_ptr >= buf_end)
543 fill_buffer();
544 }
545 if (ps.in_decl) {
546 char *tp = buf_ptr;
547
548 while (isalpha((unsigned char)*tp) ||
549 isspace((unsigned char)*tp)) {
550 if (++tp >= buf_end)
551 fill_buffer();
552 }
553 if (*tp == '(')
554 ps.procname[0] = ' ';
555 }
556 code = unary_op;
557 break;
558
559 default:
560 if (token[0] == '/' && *buf_ptr == '*') {
561 /* it is start of comment */
562 *e_token++ = '*';
563
564 if (++buf_ptr >= buf_end)
565 fill_buffer();
566
567 code = comment;
568 unary_delim = state->last_u_d;
569 break;
570 }
571 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
572 /*
573 * handle ||, &&, etc, and also things as in int *****i
574 */
575 CHECK_SIZE_TOKEN(1);
576 *e_token++ = *buf_ptr;
577 if (++buf_ptr >= buf_end)
578 fill_buffer();
579 }
580 code = (state->last_u_d ? unary_op : binary_op);
581 unary_delim = true;
582
583
584 } /* end of switch */
585 if (buf_ptr >= buf_end) /* check for input buffer empty */
586 fill_buffer();
587 state->last_u_d = unary_delim;
588 CHECK_SIZE_TOKEN(1);
589 *e_token = '\0'; /* null terminate the token */
590 return (code);
591 }
592
593 /* Initialize constant transition table */
594 void
595 init_constant_tt(void)
596 {
597 table['-'] = table['+'];
598 table['8'] = table['9'];
599 table['2'] = table['3'] = table['4'] = table['5'] = table['6'] = table['7'];
600 table['A'] = table['C'] = table['D'] = table['c'] = table['d'] = table['a'];
601 table['B'] = table['b'];
602 table['E'] = table['e'];
603 table['U'] = table['u'];
604 table['X'] = table['x'];
605 table['P'] = table['p'];
606 table['F'] = table['f'];
607 }
608
609 void
610 alloc_typenames(void)
611 {
612
613 typenames = (const char **)malloc(sizeof(typenames[0]) *
614 (typename_count = 16));
615 if (typenames == NULL)
616 err(1, NULL);
617 }
618
619 void
620 add_typename(const char *key)
621 {
622 int comparison;
623 const char *copy;
624
625 if (typename_top + 1 >= typename_count) {
626 typenames = realloc((void *)typenames,
627 sizeof(typenames[0]) * (typename_count *= 2));
628 if (typenames == NULL)
629 err(1, NULL);
630 }
631 if (typename_top == -1)
632 typenames[++typename_top] = copy = strdup(key);
633 else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
634 /* take advantage of sorted input */
635 if (comparison == 0) /* remove duplicates */
636 return;
637 typenames[++typename_top] = copy = strdup(key);
638 }
639 else {
640 int p;
641
642 for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
643 /* find place for the new key */;
644 if (comparison == 0) /* remove duplicates */
645 return;
646 memmove(&typenames[p + 1], &typenames[p],
647 sizeof(typenames[0]) * (++typename_top - p));
648 typenames[p] = copy = strdup(key);
649 }
650
651 if (copy == NULL)
652 err(1, NULL);
653 }
654