lexi.c revision 1.11 1 /* $NetBSD: lexi.c,v 1.11 2002/05/26 22:53:38 wiz Exp $ */
2
3 /*
4 * Copyright (c) 1980, 1993
5 * The Regents of the University of California. All rights reserved.
6 * Copyright (c) 1976 Board of Trustees of the University of Illinois.
7 * Copyright (c) 1985 Sun Microsystems, Inc.
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39 #include <sys/cdefs.h>
40 #ifndef lint
41 #if 0
42 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";
43 #else
44 __RCSID("$NetBSD: lexi.c,v 1.11 2002/05/26 22:53:38 wiz Exp $");
45 #endif
46 #endif /* not lint */
47
48 /*
49 * Here we have the token scanner for indent. It scans off one token and puts
50 * it in the global variable "token". It returns a code, indicating the type
51 * of token scanned.
52 */
53
54 #include <stdio.h>
55 #include <ctype.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include "indent_globs.h"
59 #include "indent_codes.h"
60
/* Character classes stored in chartype[] */
#define alphanum 1		/* may appear in an identifier or number */
#define opchar 3		/* operator character */

/*
 * One entry of the keyword table: the keyword's spelling and the class
 * code that lexi() switches on when the scanned token matches it.
 */
struct templ {
	char   *rwd;		/* keyword text; a null pointer ends the table */
	int     rwcode;		/* keyword class, see the table below */
};

/*
 * Keyword table searched by lexi() and extended at run time by addkey().
 * The array is deliberately much larger than the built-in list so that
 * addkey() has room to append entries; the list in use is always
 * terminated by an entry whose rwd is null.
 *
 * Meaning of rwcode as interpreted by lexi():
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords (types, storage classes, ...)
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 *	other	treated like an ordinary identifier
 */
struct templ specials[1000] =
{
	{"switch", 1},
	{"case", 2},
	{"break", 0},
	{"struct", 3},
	{"union", 3},
	{"enum", 3},
	{"default", 2},
	{"int", 4},
	{"char", 4},
	{"float", 4},
	{"double", 4},
	{"long", 4},
	{"short", 4},
	{"typedef", 4},		/* was misspelled "typdef", so never matched */
	{"unsigned", 4},
	{"register", 4},
	{"static", 4},
	{"global", 4},
	{"extern", 4},
	{"void", 4},
	{"goto", 0},
	{"return", 0},
	{"if", 5},
	{"while", 5},
	{"for", 5},
	{"else", 6},
	{"do", 6},
	{"sizeof", 7},
	{0, 0}
};
101
/*
 * Classification of the 128 ASCII codes, indexed by character value:
 *	0	neither alphanumeric nor operator
 *	1	alphanumeric (letters, digits, underscore, dollar sign)
 *	3	operator character
 * Each row below covers sixteen consecutive codes.
 */
char chartype[128] =
{
	/* 0x00 - 0x0f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2f: space and punctuation */
	0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
	/* 0x30 - 0x3f: digits, then punctuation */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
	/* 0x40 - 0x4f: at sign, upper case A to O */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 - 0x5f: upper case P to Z, brackets, caret, underscore */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
	/* 0x60 - 0x6f: backquote, lower case a to o */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 - 0x7f: lower case p to z, braces, bar, tilde, DEL */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
123
124
125
126
127 int
128 lexi(void)
129 {
130 int unary_delim; /* this is set to 1 if the current token
131 *
132 * forces a following operator to be unary */
133 static int last_code; /* the last token type returned */
134 static int l_struct; /* set to 1 if the last token was 'struct' */
135 int code; /* internal code to be returned */
136 char qchar; /* the delimiter character for a string */
137
138 e_token = s_token; /* point to start of place to save token */
139 unary_delim = false;
140 ps.col_1 = ps.last_nl; /* tell world that this token started in
141 * column 1 iff the last thing scanned was nl */
142 ps.last_nl = false;
143
144 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
145 ps.col_1 = false; /* leading blanks imply token is not
146 * in column 1 */
147 if (++buf_ptr >= buf_end)
148 fill_buffer();
149 }
150
151 /* Scan an alphanumeric token */
152 if (chartype[(int) *buf_ptr] == alphanum ||
153 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
154 /*
155 * we have a character or number
156 */
157 char *j; /* used for searching thru list of
158 *
159 * reserved words */
160 struct templ *p;
161
162 if (isdigit((unsigned char)*buf_ptr) ||
163 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
164 int seendot = 0, seenexp = 0, seensfx = 0;
165 if (*buf_ptr == '0' &&
166 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
167 *e_token++ = *buf_ptr++;
168 *e_token++ = *buf_ptr++;
169 while (isxdigit((unsigned char)*buf_ptr)) {
170 CHECK_SIZE_TOKEN;
171 *e_token++ = *buf_ptr++;
172 }
173 } else {
174 while (1) {
175 if (*buf_ptr == '.') {
176 if (seendot)
177 break;
178 else
179 seendot++;
180 }
181 CHECK_SIZE_TOKEN;
182 *e_token++ = *buf_ptr++;
183 if (!isdigit((unsigned char)*buf_ptr)
184 && *buf_ptr != '.') {
185 if ((*buf_ptr != 'E'
186 && *buf_ptr != 'e') || seenexp)
187 break;
188 else {
189 seenexp++;
190 seendot++;
191 CHECK_SIZE_TOKEN;
192 *e_token++ = *buf_ptr++;
193 if (*buf_ptr == '+' || *buf_ptr == '-')
194 *e_token++ = *buf_ptr++;
195 }
196 }
197 }
198 }
199 if (*buf_ptr == 'F' || *buf_ptr == 'f') {
200 /* float constant */
201 *e_token++ = *buf_ptr++;
202 } else {
203 /* integer constant */
204 while (1) {
205 if (!(seensfx & 1) &&
206 (*buf_ptr == 'U' ||
207 *buf_ptr == 'u')) {
208 CHECK_SIZE_TOKEN;
209 *e_token++ = *buf_ptr++;
210 seensfx |= 1;
211 continue;
212 }
213 if (!(seensfx & 2) &&
214 (*buf_ptr == 'L' ||
215 *buf_ptr == 'l')) {
216 CHECK_SIZE_TOKEN;
217 if (buf_ptr[1] == buf_ptr[0])
218 *e_token++ = *buf_ptr++;
219 *e_token++ = *buf_ptr++;
220 seensfx |= 2;
221 continue;
222 }
223 break;
224 }
225 }
226 } else
227 while (chartype[(int) *buf_ptr] == alphanum) { /* copy it over */
228 CHECK_SIZE_TOKEN;
229 *e_token++ = *buf_ptr++;
230 if (buf_ptr >= buf_end)
231 fill_buffer();
232 }
233 *e_token++ = '\0';
234 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
235 if (++buf_ptr >= buf_end)
236 fill_buffer();
237 }
238 ps.its_a_keyword = false;
239 ps.sizeof_keyword = false;
240 if (l_struct) { /* if last token was 'struct', then this token
241 * should be treated as a declaration */
242 l_struct = false;
243 last_code = ident;
244 ps.last_u_d = true;
245 return (decl);
246 }
247 ps.last_u_d = false; /* Operator after indentifier is
248 * binary */
249 last_code = ident; /* Remember that this is the code we
250 * will return */
251
252 /*
253 * This loop will check if the token is a keyword.
254 */
255 for (p = specials; (j = p->rwd) != 0; p++) {
256 char *p = s_token; /* point at scanned token */
257 if (*j++ != *p++ || *j++ != *p++)
258 continue; /* This test depends on the
259 * fact that identifiers are
260 * always at least 1 character
261 * long (ie. the first two
262 * bytes of the identifier are
263 * always meaningful) */
264 if (p[-1] == 0)
265 break; /* If its a one-character identifier */
266 while (*p++ == *j)
267 if (*j++ == 0)
268 goto found_keyword; /* I wish that C had a
269 * multi-level break... */
270 }
271 if (p->rwd) { /* we have a keyword */
272 found_keyword:
273 ps.its_a_keyword = true;
274 ps.last_u_d = true;
275 switch (p->rwcode) {
276 case 1:/* it is a switch */
277 return (swstmt);
278 case 2:/* a case or default */
279 return (casestmt);
280
281 case 3:/* a "struct" */
282 if (ps.p_l_follow)
283 break; /* inside parens: cast */
284 l_struct = true;
285
286 /*
287 * Next time around, we will want to know that we have had a
288 * 'struct'
289 */
290 case 4:/* one of the declaration keywords */
291 if (ps.p_l_follow) {
292 ps.cast_mask |= 1 << ps.p_l_follow;
293 break; /* inside parens: cast */
294 }
295 last_code = decl;
296 return (decl);
297
298 case 5:/* if, while, for */
299 return (sp_paren);
300
301 case 6:/* do, else */
302 return (sp_nparen);
303
304 case 7:
305 ps.sizeof_keyword = true;
306 default: /* all others are treated like any
307 * other identifier */
308 return (ident);
309 } /* end of switch */
310 } /* end of if (found_it) */
311 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
312 char *tp = buf_ptr;
313 while (tp < buf_end)
314 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
315 goto not_proc;
316 strncpy(ps.procname, token, sizeof ps.procname - 1);
317 ps.in_parameter_declaration = 1;
318 rparen_count = 1;
319 not_proc: ;
320 }
321 /*
322 * The following hack attempts to guess whether or not the current
323 * token is in fact a declaration keyword -- one that has been
324 * typedefd
325 */
326 if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
327 isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
328 && !ps.p_l_follow
329 && !ps.block_init
330 && (ps.last_token == rparen || ps.last_token == semicolon ||
331 ps.last_token == decl ||
332 ps.last_token == lbrace || ps.last_token == rbrace)) {
333 ps.its_a_keyword = true;
334 ps.last_u_d = true;
335 last_code = decl;
336 return decl;
337 }
338 if (last_code == decl) /* if this is a declared variable,
339 * then following sign is unary */
340 ps.last_u_d = true; /* will make "int a -1" work */
341 last_code = ident;
342 return (ident); /* the ident is not in the list */
343 } /* end of procesing for alpanum character */
344 /* Scan a non-alphanumeric token */
345 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
346 * moved here */
347 *e_token = '\0';
348 if (++buf_ptr >= buf_end)
349 fill_buffer();
350
351 switch (*token) {
352 case '\n':
353 unary_delim = ps.last_u_d;
354 ps.last_nl = true; /* remember that we just had a newline */
355 code = (had_eof ? 0 : newline);
356
357 /*
358 * if data has been exausted, the newline is a dummy, and we should
359 * return code to stop
360 */
361 break;
362
363 case '\'': /* start of quoted character */
364 case '"': /* start of string */
365 qchar = *token;
366 if (troff) {
367 e_token[-1] = '`';
368 if (qchar == '"')
369 *e_token++ = '`';
370 e_token = chfont(&bodyf, &stringf, e_token);
371 }
372 do { /* copy the string */
373 while (1) { /* move one character or
374 * [/<char>]<char> */
375 if (*buf_ptr == '\n') {
376 printf("%d: Unterminated literal\n", line_no);
377 goto stop_lit;
378 }
379 CHECK_SIZE_TOKEN; /* Only have to do this
380 * once in this loop,
381 * since CHECK_SIZE
382 * guarantees that there
383 * are at least 5
384 * entries left */
385 *e_token = *buf_ptr++;
386 if (buf_ptr >= buf_end)
387 fill_buffer();
388 if (*e_token == BACKSLASH) { /* if escape, copy extra
389 * char */
390 if (*buf_ptr == '\n') /* check for escaped
391 * newline */
392 ++line_no;
393 if (troff) {
394 *++e_token = BACKSLASH;
395 if (*buf_ptr == BACKSLASH)
396 *++e_token = BACKSLASH;
397 }
398 *++e_token = *buf_ptr++;
399 ++e_token; /* we must increment
400 * this again because we
401 * copied two chars */
402 if (buf_ptr >= buf_end)
403 fill_buffer();
404 } else
405 break; /* we copied one character */
406 } /* end of while (1) */
407 } while (*e_token++ != qchar);
408 if (troff) {
409 e_token = chfont(&stringf, &bodyf, e_token - 1);
410 if (qchar == '"')
411 *e_token++ = '\'';
412 }
413 stop_lit:
414 code = ident;
415 break;
416
417 case ('('):
418 case ('['):
419 unary_delim = true;
420 code = lparen;
421 break;
422
423 case (')'):
424 case (']'):
425 code = rparen;
426 break;
427
428 case '#':
429 unary_delim = ps.last_u_d;
430 code = preesc;
431 break;
432
433 case '?':
434 unary_delim = true;
435 code = question;
436 break;
437
438 case (':'):
439 code = colon;
440 unary_delim = true;
441 break;
442
443 case (';'):
444 unary_delim = true;
445 code = semicolon;
446 break;
447
448 case ('{'):
449 unary_delim = true;
450
451 /*
452 * if (ps.in_or_st) ps.block_init = 1;
453 */
454 /* ? code = ps.block_init ? lparen : lbrace; */
455 code = lbrace;
456 break;
457
458 case ('}'):
459 unary_delim = true;
460 /* ? code = ps.block_init ? rparen : rbrace; */
461 code = rbrace;
462 break;
463
464 case 014: /* a form feed */
465 unary_delim = ps.last_u_d;
466 ps.last_nl = true; /* remember this so we can set
467 * 'ps.col_1' right */
468 code = form_feed;
469 break;
470
471 case (','):
472 unary_delim = true;
473 code = comma;
474 break;
475
476 case '.':
477 unary_delim = false;
478 code = period;
479 break;
480
481 case '-':
482 case '+': /* check for -, +, --, ++ */
483 code = (ps.last_u_d ? unary_op : binary_op);
484 unary_delim = true;
485
486 if (*buf_ptr == token[0]) {
487 /* check for doubled character */
488 *e_token++ = *buf_ptr++;
489 /* buffer overflow will be checked at end of loop */
490 if (last_code == ident || last_code == rparen) {
491 code = (ps.last_u_d ? unary_op : postop);
492 /* check for following ++ or -- */
493 unary_delim = false;
494 }
495 } else
496 if (*buf_ptr == '=')
497 /* check for operator += */
498 *e_token++ = *buf_ptr++;
499 else
500 if (*buf_ptr == '>') {
501 /* check for operator -> */
502 *e_token++ = *buf_ptr++;
503 if (!pointer_as_binop) {
504 unary_delim = false;
505 code = unary_op;
506 ps.want_blank = false;
507 }
508 }
509 break; /* buffer overflow will be checked at end of
510 * switch */
511
512 case '=':
513 if (ps.in_or_st)
514 ps.block_init = 1;
515 #ifdef undef
516 if (chartype[*buf_ptr] == opchar) { /* we have two char
517 * assignment */
518 e_token[-1] = *buf_ptr++;
519 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
520 *e_token++ = *buf_ptr++;
521 *e_token++ = '='; /* Flip =+ to += */
522 *e_token = 0;
523 }
524 #else
525 if (*buf_ptr == '=') { /* == */
526 *e_token++ = '='; /* Flip =+ to += */
527 buf_ptr++;
528 *e_token = 0;
529 }
530 #endif
531 code = binary_op;
532 unary_delim = true;
533 break;
534 /* can drop thru!!! */
535
536 case '>':
537 case '<':
538 case '!': /* ops like <, <<, <=, !=, etc */
539 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
540 *e_token++ = *buf_ptr;
541 if (++buf_ptr >= buf_end)
542 fill_buffer();
543 }
544 if (*buf_ptr == '=')
545 *e_token++ = *buf_ptr++;
546 code = (ps.last_u_d ? unary_op : binary_op);
547 unary_delim = true;
548 break;
549
550 default:
551 if (token[0] == '/' && *buf_ptr == '*') {
552 /* it is start of comment */
553 *e_token++ = '*';
554
555 if (++buf_ptr >= buf_end)
556 fill_buffer();
557
558 code = comment;
559 unary_delim = ps.last_u_d;
560 break;
561 }
562 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
563 /*
564 * handle ||, &&, etc, and also things as in int *****i
565 */
566 *e_token++ = *buf_ptr;
567 if (++buf_ptr >= buf_end)
568 fill_buffer();
569 }
570 code = (ps.last_u_d ? unary_op : binary_op);
571 unary_delim = true;
572
573
574 } /* end of switch */
575 if (code != newline) {
576 l_struct = false;
577 last_code = code;
578 }
579 if (buf_ptr >= buf_end) /* check for input buffer empty */
580 fill_buffer();
581 ps.last_u_d = unary_delim;
582 *e_token = '\0'; /* null terminate the token */
583 return (code);
584 }
585 /*
586 * Add the given keyword to the keyword table, using val as the keyword type
587 */
588 void
589 addkey(char *key, int val)
590 {
591 struct templ *p = specials;
592 while (p->rwd)
593 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
594 return;
595 else
596 p++;
597 if (p >= specials + sizeof specials / sizeof specials[0])
598 return; /* For now, table overflows are silently
599 * ignored */
600 p->rwd = key;
601 p->rwcode = val;
602 p[1].rwd = 0;
603 p[1].rwcode = 0;
604 }
605