lexi.c revision 1.15 1 /* $NetBSD: lexi.c,v 1.15 2019/02/03 03:19:29 mrg Exp $ */
2
3 /*
4 * Copyright (c) 1980, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1976 Board of Trustees of the University of Illinois.
34 * Copyright (c) 1985 Sun Microsystems, Inc.
35 * All rights reserved.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 */
65
66 #include <sys/cdefs.h>
67 #ifndef lint
68 #if 0
69 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";
70 #else
71 __RCSID("$NetBSD: lexi.c,v 1.15 2019/02/03 03:19:29 mrg Exp $");
72 #endif
73 #endif /* not lint */
74
75 /*
76 * Here we have the token scanner for indent. It scans off one token and puts
77 * it in the global variable "token". It returns a code, indicating the type
78 * of token scanned.
79 */
80
81 #include <stdio.h>
82 #include <ctype.h>
83 #include <stdlib.h>
84 #include <string.h>
85 #include "indent_globs.h"
86 #include "indent_codes.h"
87
/*
 * Character class values stored in the chartype table and compared against
 * by lexi(): alphanum marks characters that can be part of an identifier or
 * number, opchar marks characters that can be part of an operator.
 */
#define alphanum 1
#define opchar 3
90
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on once the word has been recognised.
 */
struct templ {
	const char *rwd;	/* keyword text; a null pointer ends the table */
	int         rwcode;	/* keyword class (see the table below) */
};

/*
 * Reserved words known to the scanner.  The class codes are interpreted by
 * the switch in lexi():
 *   1  switch
 *   2  case, default
 *   3  struct-like keywords (struct, union, enum)
 *   4  declaration keywords
 *   5  if, while, for
 *   6  else, do
 *   7  sizeof
 *   0  everything else (handled like an ordinary identifier)
 * The array is deliberately oversized so that addkey() can append entries
 * at run time; the list is terminated by an entry whose rwd is null.
 */
struct templ specials[1000] = {
	{ "switch",   1 },
	{ "case",     2 },
	{ "break",    0 },
	{ "struct",   3 },
	{ "union",    3 },
	{ "enum",     3 },
	{ "default",  2 },
	{ "int",      4 },
	{ "char",     4 },
	{ "float",    4 },
	{ "double",   4 },
	{ "long",     4 },
	{ "short",    4 },
	{ "typedef",  4 },
	{ "unsigned", 4 },
	{ "register", 4 },
	{ "static",   4 },
	{ "global",   4 },
	{ "extern",   4 },
	{ "void",     4 },
	{ "goto",     0 },
	{ "return",   0 },
	{ "if",       5 },
	{ "while",    5 },
	{ "for",      5 },
	{ "else",     6 },
	{ "do",       6 },
	{ "sizeof",   7 },
	{ 0,          0 }	/* end-of-table marker */
};
128
/*
 * Classification of each 7-bit ASCII character, indexed by character code.
 * It lets the scanner decide quickly what kind of token a character starts:
 * 1 = alphanumeric (letters, digits, '_' and '$'), 3 = operator character,
 * 0 = anything else.
 */
char chartype[128] = {
	/* 0x00 - 0x0f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2f: space and punctuation */
	0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
	/* 0x30 - 0x3f: digits, then punctuation from colon to question mark */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
	/* 0x40 - 0x4f: at sign, then upper-case A through O */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 - 0x5f: upper-case P through Z, brackets, caret, underscore */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
	/* 0x60 - 0x6f: back quote, then lower-case a through o */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 - 0x7f: lower-case p through z, braces, bar, tilde, DEL */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
150
151
152
153
154 int
155 lexi(void)
156 {
157 int unary_delim; /* this is set to 1 if the current token
158 *
159 * forces a following operator to be unary */
160 static int last_code; /* the last token type returned */
161 static int l_struct; /* set to 1 if the last token was 'struct' */
162 int code; /* internal code to be returned */
163 char qchar; /* the delimiter character for a string */
164
165 e_token = s_token; /* point to start of place to save token */
166 unary_delim = false;
167 ps.col_1 = ps.last_nl; /* tell world that this token started in
168 * column 1 iff the last thing scanned was nl */
169 ps.last_nl = false;
170
171 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
172 ps.col_1 = false; /* leading blanks imply token is not
173 * in column 1 */
174 if (++buf_ptr >= buf_end)
175 fill_buffer();
176 }
177
178 /* Scan an alphanumeric token */
179 if (chartype[(int) *buf_ptr] == alphanum ||
180 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
181 /*
182 * we have a character or number
183 */
184 const char *j; /* used for searching thru list of
185 * reserved words */
186 struct templ *p;
187
188 if (isdigit((unsigned char)*buf_ptr) ||
189 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
190 int seendot = 0, seenexp = 0, seensfx = 0;
191 if (*buf_ptr == '0' &&
192 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
193 *e_token++ = *buf_ptr++;
194 *e_token++ = *buf_ptr++;
195 while (isxdigit((unsigned char)*buf_ptr)) {
196 CHECK_SIZE_TOKEN;
197 *e_token++ = *buf_ptr++;
198 }
199 } else {
200 while (1) {
201 if (*buf_ptr == '.') {
202 if (seendot)
203 break;
204 else
205 seendot++;
206 }
207 CHECK_SIZE_TOKEN;
208 *e_token++ = *buf_ptr++;
209 if (!isdigit((unsigned char)*buf_ptr)
210 && *buf_ptr != '.') {
211 if ((*buf_ptr != 'E'
212 && *buf_ptr != 'e') || seenexp)
213 break;
214 else {
215 seenexp++;
216 seendot++;
217 CHECK_SIZE_TOKEN;
218 *e_token++ = *buf_ptr++;
219 if (*buf_ptr == '+' || *buf_ptr == '-')
220 *e_token++ = *buf_ptr++;
221 }
222 }
223 }
224 }
225 if (*buf_ptr == 'F' || *buf_ptr == 'f') {
226 /* float constant */
227 *e_token++ = *buf_ptr++;
228 } else {
229 /* integer constant */
230 while (1) {
231 if (!(seensfx & 1) &&
232 (*buf_ptr == 'U' ||
233 *buf_ptr == 'u')) {
234 CHECK_SIZE_TOKEN;
235 *e_token++ = *buf_ptr++;
236 seensfx |= 1;
237 continue;
238 }
239 if (!(seensfx & 2) &&
240 (*buf_ptr == 'L' ||
241 *buf_ptr == 'l')) {
242 CHECK_SIZE_TOKEN;
243 if (buf_ptr[1] == buf_ptr[0])
244 *e_token++ = *buf_ptr++;
245 *e_token++ = *buf_ptr++;
246 seensfx |= 2;
247 continue;
248 }
249 break;
250 }
251 }
252 } else
253 while (chartype[(int) *buf_ptr] == alphanum) { /* copy it over */
254 CHECK_SIZE_TOKEN;
255 *e_token++ = *buf_ptr++;
256 if (buf_ptr >= buf_end)
257 fill_buffer();
258 }
259 *e_token++ = '\0';
260 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
261 if (++buf_ptr >= buf_end)
262 fill_buffer();
263 }
264 ps.its_a_keyword = false;
265 ps.sizeof_keyword = false;
266 if (l_struct) { /* if last token was 'struct', then this token
267 * should be treated as a declaration */
268 l_struct = false;
269 last_code = ident;
270 ps.last_u_d = true;
271 return (decl);
272 }
273 ps.last_u_d = false; /* Operator after indentifier is
274 * binary */
275 last_code = ident; /* Remember that this is the code we
276 * will return */
277
278 /*
279 * This loop will check if the token is a keyword.
280 */
281 for (p = specials; (j = p->rwd) != 0; p++) {
282 char *pt = s_token; /* point at scanned token */
283 if (*j++ != *pt++ || *j++ != *pt++)
284 continue; /* This test depends on the
285 * fact that identifiers are
286 * always at least 1 character
287 * long (ie. the first two
288 * bytes of the identifier are
289 * always meaningful) */
290 if (pt[-1] == 0)
291 break; /* If its a one-character identifier */
292 while (*pt++ == *j)
293 if (*j++ == 0)
294 goto found_keyword; /* I wish that C had a
295 * multi-level break... */
296 }
297 if (p->rwd) { /* we have a keyword */
298 found_keyword:
299 ps.its_a_keyword = true;
300 ps.last_u_d = true;
301 switch (p->rwcode) {
302 case 1:/* it is a switch */
303 return (swstmt);
304 case 2:/* a case or default */
305 return (casestmt);
306
307 case 3:/* a "struct" */
308 if (ps.p_l_follow)
309 break; /* inside parens: cast */
310 l_struct = true;
311
312 /*
313 * Next time around, we will want to know that we have had a
314 * 'struct'
315 */
316 /* FALLTHROUGH */
317 case 4:/* one of the declaration keywords */
318 if (ps.p_l_follow) {
319 ps.cast_mask |= 1 << ps.p_l_follow;
320 break; /* inside parens: cast */
321 }
322 last_code = decl;
323 return (decl);
324
325 case 5:/* if, while, for */
326 return (sp_paren);
327
328 case 6:/* do, else */
329 return (sp_nparen);
330
331 case 7:
332 ps.sizeof_keyword = true;
333 /* FALLTHROUGH */
334 default: /* all others are treated like any
335 * other identifier */
336 return (ident);
337 } /* end of switch */
338 } /* end of if (found_it) */
339 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
340 char *tp = buf_ptr;
341 while (tp < buf_end)
342 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
343 goto not_proc;
344 strncpy(ps.procname, token, sizeof ps.procname - 1);
345 ps.in_parameter_declaration = 1;
346 rparen_count = 1;
347 not_proc: ;
348 }
349 /*
350 * The following hack attempts to guess whether or not the current
351 * token is in fact a declaration keyword -- one that has been
352 * typedefd
353 */
354 if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
355 isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
356 && !ps.p_l_follow
357 && !ps.block_init
358 && (ps.last_token == rparen || ps.last_token == semicolon ||
359 ps.last_token == decl ||
360 ps.last_token == lbrace || ps.last_token == rbrace)) {
361 ps.its_a_keyword = true;
362 ps.last_u_d = true;
363 last_code = decl;
364 return decl;
365 }
366 if (last_code == decl) /* if this is a declared variable,
367 * then following sign is unary */
368 ps.last_u_d = true; /* will make "int a -1" work */
369 last_code = ident;
370 return (ident); /* the ident is not in the list */
371 } /* end of procesing for alpanum character */
372 /* Scan a non-alphanumeric token */
373 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
374 * moved here */
375 *e_token = '\0';
376 if (++buf_ptr >= buf_end)
377 fill_buffer();
378
379 switch (*token) {
380 case '\n':
381 unary_delim = ps.last_u_d;
382 ps.last_nl = true; /* remember that we just had a newline */
383 code = (had_eof ? 0 : newline);
384
385 /*
386 * if data has been exausted, the newline is a dummy, and we should
387 * return code to stop
388 */
389 break;
390
391 case '\'': /* start of quoted character */
392 case '"': /* start of string */
393 qchar = *token;
394 if (troff) {
395 e_token[-1] = '`';
396 if (qchar == '"')
397 *e_token++ = '`';
398 e_token = chfont(&bodyf, &stringf, e_token);
399 }
400 do { /* copy the string */
401 while (1) { /* move one character or
402 * [/<char>]<char> */
403 if (*buf_ptr == '\n') {
404 printf("%d: Unterminated literal\n", line_no);
405 goto stop_lit;
406 }
407 CHECK_SIZE_TOKEN; /* Only have to do this
408 * once in this loop,
409 * since CHECK_SIZE
410 * guarantees that there
411 * are at least 5
412 * entries left */
413 *e_token = *buf_ptr++;
414 if (buf_ptr >= buf_end)
415 fill_buffer();
416 if (*e_token == BACKSLASH) { /* if escape, copy extra
417 * char */
418 if (*buf_ptr == '\n') /* check for escaped
419 * newline */
420 ++line_no;
421 if (troff) {
422 *++e_token = BACKSLASH;
423 if (*buf_ptr == BACKSLASH)
424 *++e_token = BACKSLASH;
425 }
426 *++e_token = *buf_ptr++;
427 ++e_token; /* we must increment
428 * this again because we
429 * copied two chars */
430 if (buf_ptr >= buf_end)
431 fill_buffer();
432 } else
433 break; /* we copied one character */
434 } /* end of while (1) */
435 } while (*e_token++ != qchar);
436 if (troff) {
437 e_token = chfont(&stringf, &bodyf, e_token - 1);
438 if (qchar == '"')
439 *e_token++ = '\'';
440 }
441 stop_lit:
442 code = ident;
443 break;
444
445 case ('('):
446 case ('['):
447 unary_delim = true;
448 code = lparen;
449 break;
450
451 case (')'):
452 case (']'):
453 code = rparen;
454 break;
455
456 case '#':
457 unary_delim = ps.last_u_d;
458 code = preesc;
459 break;
460
461 case '?':
462 unary_delim = true;
463 code = question;
464 break;
465
466 case (':'):
467 code = colon;
468 unary_delim = true;
469 break;
470
471 case (';'):
472 unary_delim = true;
473 code = semicolon;
474 break;
475
476 case ('{'):
477 unary_delim = true;
478
479 /*
480 * if (ps.in_or_st) ps.block_init = 1;
481 */
482 /* ? code = ps.block_init ? lparen : lbrace; */
483 code = lbrace;
484 break;
485
486 case ('}'):
487 unary_delim = true;
488 /* ? code = ps.block_init ? rparen : rbrace; */
489 code = rbrace;
490 break;
491
492 case 014: /* a form feed */
493 unary_delim = ps.last_u_d;
494 ps.last_nl = true; /* remember this so we can set
495 * 'ps.col_1' right */
496 code = form_feed;
497 break;
498
499 case (','):
500 unary_delim = true;
501 code = comma;
502 break;
503
504 case '.':
505 unary_delim = false;
506 code = period;
507 break;
508
509 case '-':
510 case '+': /* check for -, +, --, ++ */
511 code = (ps.last_u_d ? unary_op : binary_op);
512 unary_delim = true;
513
514 if (*buf_ptr == token[0]) {
515 /* check for doubled character */
516 *e_token++ = *buf_ptr++;
517 /* buffer overflow will be checked at end of loop */
518 if (last_code == ident || last_code == rparen) {
519 code = (ps.last_u_d ? unary_op : postop);
520 /* check for following ++ or -- */
521 unary_delim = false;
522 }
523 } else
524 if (*buf_ptr == '=')
525 /* check for operator += */
526 *e_token++ = *buf_ptr++;
527 else
528 if (*buf_ptr == '>') {
529 /* check for operator -> */
530 *e_token++ = *buf_ptr++;
531 if (!pointer_as_binop) {
532 unary_delim = false;
533 code = unary_op;
534 ps.want_blank = false;
535 }
536 }
537 break; /* buffer overflow will be checked at end of
538 * switch */
539
540 case '=':
541 if (ps.in_or_st)
542 ps.block_init = 1;
543 #ifdef undef
544 if (chartype[*buf_ptr] == opchar) { /* we have two char
545 * assignment */
546 e_token[-1] = *buf_ptr++;
547 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
548 *e_token++ = *buf_ptr++;
549 *e_token++ = '='; /* Flip =+ to += */
550 *e_token = 0;
551 }
552 #else
553 if (*buf_ptr == '=') { /* == */
554 *e_token++ = '='; /* Flip =+ to += */
555 buf_ptr++;
556 *e_token = 0;
557 }
558 #endif
559 code = binary_op;
560 unary_delim = true;
561 break;
562 /* can drop thru!!! */
563
564 case '>':
565 case '<':
566 case '!': /* ops like <, <<, <=, !=, etc */
567 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
568 *e_token++ = *buf_ptr;
569 if (++buf_ptr >= buf_end)
570 fill_buffer();
571 }
572 if (*buf_ptr == '=')
573 *e_token++ = *buf_ptr++;
574 code = (ps.last_u_d ? unary_op : binary_op);
575 unary_delim = true;
576 break;
577
578 default:
579 if (token[0] == '/' && *buf_ptr == '*') {
580 /* it is start of comment */
581 *e_token++ = '*';
582
583 if (++buf_ptr >= buf_end)
584 fill_buffer();
585
586 code = comment;
587 unary_delim = ps.last_u_d;
588 break;
589 }
590 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
591 /*
592 * handle ||, &&, etc, and also things as in int *****i
593 */
594 *e_token++ = *buf_ptr;
595 if (++buf_ptr >= buf_end)
596 fill_buffer();
597 }
598 code = (ps.last_u_d ? unary_op : binary_op);
599 unary_delim = true;
600
601
602 } /* end of switch */
603 if (code != newline) {
604 l_struct = false;
605 last_code = code;
606 }
607 if (buf_ptr >= buf_end) /* check for input buffer empty */
608 fill_buffer();
609 ps.last_u_d = unary_delim;
610 *e_token = '\0'; /* null terminate the token */
611 return (code);
612 }
613 /*
614 * Add the given keyword to the keyword table, using val as the keyword type
615 */
616 void
617 addkey(char *key, int val)
618 {
619 struct templ *p = specials;
620 while (p->rwd)
621 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
622 return;
623 else
624 p++;
625 if (p >= specials + sizeof specials / sizeof specials[0])
626 return; /* For now, table overflows are silently
627 * ignored */
628 p->rwd = key;
629 p->rwcode = val;
630 p[1].rwd = 0;
631 p[1].rwcode = 0;
632 }
633