lexi.c revision 1.9 1 /* $NetBSD: lexi.c,v 1.9 1999/03/15 20:28:45 kristerw Exp $ */
2
3 /*
4 * Copyright (c) 1980, 1993
5 * The Regents of the University of California. All rights reserved.
6 * Copyright (c) 1976 Board of Trustees of the University of Illinois.
7 * Copyright (c) 1985 Sun Microsystems, Inc.
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39 #include <sys/cdefs.h>
40 #ifndef lint
41 #if 0
42 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";
43 #else
44 __RCSID("$NetBSD: lexi.c,v 1.9 1999/03/15 20:28:45 kristerw Exp $");
45 #endif
46 #endif /* not lint */
47
48 /*
49 * Here we have the token scanner for indent. It scans off one token and puts
50 * it in the global variable "token". It returns a code, indicating the type
51 * of token scanned.
52 */
53
54 #include <stdio.h>
55 #include <ctype.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include "indent_globs.h"
59 #include "indent_codes.h"
60
/*
 * Character classes stored in chartype[]; an entry of 0 means the character
 * belongs to neither class.
 */
enum {
	alphanum = 1,		/* alphanumeric character */
	opchar = 3		/* operator character */
};
63
/*
 * One entry of the reserved-word table.
 *
 * rwcode selects how lexi() classifies the word:
 *	0	no special handling (returned as an ordinary identifier)
 *	1	"switch"
 *	2	"case" / "default"
 *	3	"struct", "union", "enum" (the following tag is a declaration)
 *	4	declaration keyword
 *	5	keyword followed by a parenthesized expression (if, while, for)
 *	6	keyword not followed by parentheses (else, do)
 *	7	"sizeof"
 */
struct templ {
	char   *rwd;		/* the reserved word; 0 terminates the table */
	int     rwcode;		/* classification code, see above */
};

/*
 * Reserved-word table.  The built-in keywords come first and the list is
 * terminated by an entry whose rwd is 0; addkey() appends further entries in
 * the unused space behind the terminator.
 */
struct templ specials[1000] =
{
	{"switch", 1},
	{"case", 2},
	{"break", 0},
	{"struct", 3},
	{"union", 3},
	{"enum", 3},
	{"default", 2},
	{"int", 4},
	{"char", 4},
	{"float", 4},
	{"double", 4},
	{"long", 4},
	{"short", 4},
	{"typedef", 4},
	{"unsigned", 4},
	{"register", 4},
	{"static", 4},
	{"global", 4},
	{"extern", 4},
	{"void", 4},
	{"goto", 0},
	{"return", 0},
	{"if", 5},
	{"while", 5},
	{"for", 5},
	{"else", 6},
	{"do", 6},
	{"sizeof", 7},
	{0, 0}
};
101
/*
 * Character classification table, indexed by 7-bit character code.  It is
 * used to decide what type each character is:
 *	1	alphanumeric
 *	3	operator
 *	0	neither
 * Each row below covers sixteen consecutive character codes.
 */
char chartype[128] = {
	/* 0x00 - 0x0f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2f: blank and punctuation; '$' counts as alphanumeric */
	0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
	/* 0x30 - 0x3f: digits, colon, semicolon, then four operators */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
	/* 0x40 - 0x4f: at sign, then upper-case A to O */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 - 0x5f: upper-case P to Z, brackets, caret, underscore */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
	/* 0x60 - 0x6f: back quote, then lower-case a to o */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 - 0x7f: lower-case p to z, braces, bar, tilde, DEL */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
123
124
125
126
127 int
128 lexi()
129 {
130 int unary_delim; /* this is set to 1 if the current token
131 *
132 * forces a following operator to be unary */
133 static int last_code; /* the last token type returned */
134 static int l_struct; /* set to 1 if the last token was 'struct' */
135 int code; /* internal code to be returned */
136 char qchar; /* the delimiter character for a string */
137
138 e_token = s_token; /* point to start of place to save token */
139 unary_delim = false;
140 ps.col_1 = ps.last_nl; /* tell world that this token started in
141 * column 1 iff the last thing scanned was nl */
142 ps.last_nl = false;
143
144 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
145 ps.col_1 = false; /* leading blanks imply token is not
146 * in column 1 */
147 if (++buf_ptr >= buf_end)
148 fill_buffer();
149 }
150
151 /* Scan an alphanumeric token */
152 if (chartype[(int) *buf_ptr] == alphanum ||
153 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
154 /*
155 * we have a character or number
156 */
157 char *j; /* used for searching thru list of
158 *
159 * reserved words */
160 struct templ *p;
161
162 if (isdigit((unsigned char)*buf_ptr) ||
163 (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
164 int seendot = 0, seenexp = 0;
165 if (*buf_ptr == '0' &&
166 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
167 *e_token++ = *buf_ptr++;
168 *e_token++ = *buf_ptr++;
169 while (isxdigit((unsigned char)*buf_ptr)) {
170 CHECK_SIZE_TOKEN;
171 *e_token++ = *buf_ptr++;
172 }
173 } else {
174 while (1) {
175 if (*buf_ptr == '.') {
176 if (seendot)
177 break;
178 else
179 seendot++;
180 }
181 CHECK_SIZE_TOKEN;
182 *e_token++ = *buf_ptr++;
183 if (!isdigit((unsigned char)*buf_ptr)
184 && *buf_ptr != '.') {
185 if ((*buf_ptr != 'E'
186 && *buf_ptr != 'e') || seenexp)
187 break;
188 else {
189 seenexp++;
190 seendot++;
191 CHECK_SIZE_TOKEN;
192 *e_token++ = *buf_ptr++;
193 if (*buf_ptr == '+' || *buf_ptr == '-')
194 *e_token++ = *buf_ptr++;
195 }
196 }
197 }
198 }
199 if (*buf_ptr == 'F' || *buf_ptr == 'f') {
200 /* float constant */
201 *e_token++ = *buf_ptr++;
202 } else {
203 /* integer constant (U, L, UL, LL, ULL) */
204 if (*buf_ptr == 'U' || *buf_ptr == 'u')
205 *e_token++ = *buf_ptr++;
206 if (*buf_ptr == 'L' || *buf_ptr == 'l')
207 *e_token++ = *buf_ptr++;
208 if (*buf_ptr == 'L' || *buf_ptr == 'l')
209 *e_token++ = *buf_ptr++;
210 }
211 } else
212 while (chartype[(int) *buf_ptr] == alphanum) { /* copy it over */
213 CHECK_SIZE_TOKEN;
214 *e_token++ = *buf_ptr++;
215 if (buf_ptr >= buf_end)
216 fill_buffer();
217 }
218 *e_token++ = '\0';
219 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
220 if (++buf_ptr >= buf_end)
221 fill_buffer();
222 }
223 ps.its_a_keyword = false;
224 ps.sizeof_keyword = false;
225 if (l_struct) { /* if last token was 'struct', then this token
226 * should be treated as a declaration */
227 l_struct = false;
228 last_code = ident;
229 ps.last_u_d = true;
230 return (decl);
231 }
232 ps.last_u_d = false; /* Operator after indentifier is
233 * binary */
234 last_code = ident; /* Remember that this is the code we
235 * will return */
236
237 /*
238 * This loop will check if the token is a keyword.
239 */
240 for (p = specials; (j = p->rwd) != 0; p++) {
241 char *p = s_token; /* point at scanned token */
242 if (*j++ != *p++ || *j++ != *p++)
243 continue; /* This test depends on the
244 * fact that identifiers are
245 * always at least 1 character
246 * long (ie. the first two
247 * bytes of the identifier are
248 * always meaningful) */
249 if (p[-1] == 0)
250 break; /* If its a one-character identifier */
251 while (*p++ == *j)
252 if (*j++ == 0)
253 goto found_keyword; /* I wish that C had a
254 * multi-level break... */
255 }
256 if (p->rwd) { /* we have a keyword */
257 found_keyword:
258 ps.its_a_keyword = true;
259 ps.last_u_d = true;
260 switch (p->rwcode) {
261 case 1:/* it is a switch */
262 return (swstmt);
263 case 2:/* a case or default */
264 return (casestmt);
265
266 case 3:/* a "struct" */
267 if (ps.p_l_follow)
268 break; /* inside parens: cast */
269 l_struct = true;
270
271 /*
272 * Next time around, we will want to know that we have had a
273 * 'struct'
274 */
275 case 4:/* one of the declaration keywords */
276 if (ps.p_l_follow) {
277 ps.cast_mask |= 1 << ps.p_l_follow;
278 break; /* inside parens: cast */
279 }
280 last_code = decl;
281 return (decl);
282
283 case 5:/* if, while, for */
284 return (sp_paren);
285
286 case 6:/* do, else */
287 return (sp_nparen);
288
289 case 7:
290 ps.sizeof_keyword = true;
291 default: /* all others are treated like any
292 * other identifier */
293 return (ident);
294 } /* end of switch */
295 } /* end of if (found_it) */
296 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
297 char *tp = buf_ptr;
298 while (tp < buf_end)
299 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
300 goto not_proc;
301 strncpy(ps.procname, token, sizeof ps.procname - 1);
302 ps.in_parameter_declaration = 1;
303 rparen_count = 1;
304 not_proc: ;
305 }
306 /*
307 * The following hack attempts to guess whether or not the current
308 * token is in fact a declaration keyword -- one that has been
309 * typedefd
310 */
311 if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
312 isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
313 && !ps.p_l_follow
314 && !ps.block_init
315 && (ps.last_token == rparen || ps.last_token == semicolon ||
316 ps.last_token == decl ||
317 ps.last_token == lbrace || ps.last_token == rbrace)) {
318 ps.its_a_keyword = true;
319 ps.last_u_d = true;
320 last_code = decl;
321 return decl;
322 }
323 if (last_code == decl) /* if this is a declared variable,
324 * then following sign is unary */
325 ps.last_u_d = true; /* will make "int a -1" work */
326 last_code = ident;
327 return (ident); /* the ident is not in the list */
328 } /* end of procesing for alpanum character */
329 /* Scan a non-alphanumeric token */
330 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
331 * moved here */
332 *e_token = '\0';
333 if (++buf_ptr >= buf_end)
334 fill_buffer();
335
336 switch (*token) {
337 case '\n':
338 unary_delim = ps.last_u_d;
339 ps.last_nl = true; /* remember that we just had a newline */
340 code = (had_eof ? 0 : newline);
341
342 /*
343 * if data has been exausted, the newline is a dummy, and we should
344 * return code to stop
345 */
346 break;
347
348 case '\'': /* start of quoted character */
349 case '"': /* start of string */
350 qchar = *token;
351 if (troff) {
352 e_token[-1] = '`';
353 if (qchar == '"')
354 *e_token++ = '`';
355 e_token = chfont(&bodyf, &stringf, e_token);
356 }
357 do { /* copy the string */
358 while (1) { /* move one character or
359 * [/<char>]<char> */
360 if (*buf_ptr == '\n') {
361 printf("%d: Unterminated literal\n", line_no);
362 goto stop_lit;
363 }
364 CHECK_SIZE_TOKEN; /* Only have to do this
365 * once in this loop,
366 * since CHECK_SIZE
367 * guarantees that there
368 * are at least 5
369 * entries left */
370 *e_token = *buf_ptr++;
371 if (buf_ptr >= buf_end)
372 fill_buffer();
373 if (*e_token == BACKSLASH) { /* if escape, copy extra
374 * char */
375 if (*buf_ptr == '\n') /* check for escaped
376 * newline */
377 ++line_no;
378 if (troff) {
379 *++e_token = BACKSLASH;
380 if (*buf_ptr == BACKSLASH)
381 *++e_token = BACKSLASH;
382 }
383 *++e_token = *buf_ptr++;
384 ++e_token; /* we must increment
385 * this again because we
386 * copied two chars */
387 if (buf_ptr >= buf_end)
388 fill_buffer();
389 } else
390 break; /* we copied one character */
391 } /* end of while (1) */
392 } while (*e_token++ != qchar);
393 if (troff) {
394 e_token = chfont(&stringf, &bodyf, e_token - 1);
395 if (qchar == '"')
396 *e_token++ = '\'';
397 }
398 stop_lit:
399 code = ident;
400 break;
401
402 case ('('):
403 case ('['):
404 unary_delim = true;
405 code = lparen;
406 break;
407
408 case (')'):
409 case (']'):
410 code = rparen;
411 break;
412
413 case '#':
414 unary_delim = ps.last_u_d;
415 code = preesc;
416 break;
417
418 case '?':
419 unary_delim = true;
420 code = question;
421 break;
422
423 case (':'):
424 code = colon;
425 unary_delim = true;
426 break;
427
428 case (';'):
429 unary_delim = true;
430 code = semicolon;
431 break;
432
433 case ('{'):
434 unary_delim = true;
435
436 /*
437 * if (ps.in_or_st) ps.block_init = 1;
438 */
439 /* ? code = ps.block_init ? lparen : lbrace; */
440 code = lbrace;
441 break;
442
443 case ('}'):
444 unary_delim = true;
445 /* ? code = ps.block_init ? rparen : rbrace; */
446 code = rbrace;
447 break;
448
449 case 014: /* a form feed */
450 unary_delim = ps.last_u_d;
451 ps.last_nl = true; /* remember this so we can set
452 * 'ps.col_1' right */
453 code = form_feed;
454 break;
455
456 case (','):
457 unary_delim = true;
458 code = comma;
459 break;
460
461 case '.':
462 unary_delim = false;
463 code = period;
464 break;
465
466 case '-':
467 case '+': /* check for -, +, --, ++ */
468 code = (ps.last_u_d ? unary_op : binary_op);
469 unary_delim = true;
470
471 if (*buf_ptr == token[0]) {
472 /* check for doubled character */
473 *e_token++ = *buf_ptr++;
474 /* buffer overflow will be checked at end of loop */
475 if (last_code == ident || last_code == rparen) {
476 code = (ps.last_u_d ? unary_op : postop);
477 /* check for following ++ or -- */
478 unary_delim = false;
479 }
480 } else
481 if (*buf_ptr == '=')
482 /* check for operator += */
483 *e_token++ = *buf_ptr++;
484 else
485 if (*buf_ptr == '>') {
486 /* check for operator -> */
487 *e_token++ = *buf_ptr++;
488 if (!pointer_as_binop) {
489 unary_delim = false;
490 code = unary_op;
491 ps.want_blank = false;
492 }
493 }
494 break; /* buffer overflow will be checked at end of
495 * switch */
496
497 case '=':
498 if (ps.in_or_st)
499 ps.block_init = 1;
500 #ifdef undef
501 if (chartype[*buf_ptr] == opchar) { /* we have two char
502 * assignment */
503 e_token[-1] = *buf_ptr++;
504 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
505 *e_token++ = *buf_ptr++;
506 *e_token++ = '='; /* Flip =+ to += */
507 *e_token = 0;
508 }
509 #else
510 if (*buf_ptr == '=') { /* == */
511 *e_token++ = '='; /* Flip =+ to += */
512 buf_ptr++;
513 *e_token = 0;
514 }
515 #endif
516 code = binary_op;
517 unary_delim = true;
518 break;
519 /* can drop thru!!! */
520
521 case '>':
522 case '<':
523 case '!': /* ops like <, <<, <=, !=, etc */
524 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
525 *e_token++ = *buf_ptr;
526 if (++buf_ptr >= buf_end)
527 fill_buffer();
528 }
529 if (*buf_ptr == '=')
530 *e_token++ = *buf_ptr++;
531 code = (ps.last_u_d ? unary_op : binary_op);
532 unary_delim = true;
533 break;
534
535 default:
536 if (token[0] == '/' && *buf_ptr == '*') {
537 /* it is start of comment */
538 *e_token++ = '*';
539
540 if (++buf_ptr >= buf_end)
541 fill_buffer();
542
543 code = comment;
544 unary_delim = ps.last_u_d;
545 break;
546 }
547 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
548 /*
549 * handle ||, &&, etc, and also things as in int *****i
550 */
551 *e_token++ = *buf_ptr;
552 if (++buf_ptr >= buf_end)
553 fill_buffer();
554 }
555 code = (ps.last_u_d ? unary_op : binary_op);
556 unary_delim = true;
557
558
559 } /* end of switch */
560 if (code != newline) {
561 l_struct = false;
562 last_code = code;
563 }
564 if (buf_ptr >= buf_end) /* check for input buffer empty */
565 fill_buffer();
566 ps.last_u_d = unary_delim;
567 *e_token = '\0'; /* null terminate the token */
568 return (code);
569 }
570 /*
571 * Add the given keyword to the keyword table, using val as the keyword type
572 */
573 void
574 addkey(key, val)
575 char *key;
576 int val;
577 {
578 struct templ *p = specials;
579 while (p->rwd)
580 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
581 return;
582 else
583 p++;
584 if (p >= specials + sizeof specials / sizeof specials[0])
585 return; /* For now, table overflows are silently
586 * ignored */
587 p->rwd = key;
588 p->rwcode = val;
589 p[1].rwd = 0;
590 p[1].rwcode = 0;
591 }
592