lexi.c revision 1.7 1 /* $NetBSD: lexi.c,v 1.7 1998/08/25 20:59:38 ross Exp $ */
2
3 /*
4 * Copyright (c) 1980, 1993
5 * The Regents of the University of California. All rights reserved.
6 * Copyright (c) 1976 Board of Trustees of the University of Illinois.
7 * Copyright (c) 1985 Sun Microsystems, Inc.
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39 #include <sys/cdefs.h>
40 #ifndef lint
41 #if 0
42 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";
43 #else
44 __RCSID("$NetBSD: lexi.c,v 1.7 1998/08/25 20:59:38 ross Exp $");
45 #endif
46 #endif /* not lint */
47
48 /*
49 * Here we have the token scanner for indent. It scans off one token and puts
50 * it in the global variable "token". It returns a code, indicating the type
51 * of token scanned.
52 */
53
54 #include <stdio.h>
55 #include <ctype.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include "indent_globs.h"
59 #include "indent_codes.h"
60
/*
 * Character classes stored in chartype[]; an entry of 0 means the
 * character belongs to neither class.
 */
#define alphanum	1	/* may appear in an identifier or number */
#define opchar		3	/* may appear in an operator */
63
/*
 * One entry of the keyword table: the keyword text and the class code
 * that lexi() switches on once the keyword has been recognised.
 */
struct templ {
    char       *rwd;		/* keyword text; a null pointer ends the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * Table of reserved words.  It is sized generously so that addkey() can
 * append further entries at run time; the list is terminated by an entry
 * whose rwd is a null pointer.
 *
 * rwcode values as interpreted by lexi():
 *	0  no special handling (returned as a plain identifier)
 *	1  switch
 *	2  case, default
 *	3  struct, union, enum
 *	4  declaration keywords
 *	5  if, while, for
 *	6  else, do
 *	7  sizeof
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
101
/*
 * Character classification table, indexed by 7-bit character code.
 * Each entry is 1 (alphanumeric), 3 (operator character) or 0 (neither);
 * the scanner uses it to decide how a token starts and continues.
 */
char        chartype[128] =
{
    /* 0x00 - 0x0f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 - 0x1f: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . / */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 0x30 - 0x3f: 0-9 : ; < = > ? */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 0x40 - 0x4f: @ A-O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 - 0x5f: P-Z [ \ ] ^ _ */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 0x60 - 0x6f: ` a-o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 - 0x7f: p-z { | } ~ DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
123
124
125
126
127 int
128 lexi()
129 {
130 int unary_delim; /* this is set to 1 if the current token
131 *
132 * forces a following operator to be unary */
133 static int last_code; /* the last token type returned */
134 static int l_struct; /* set to 1 if the last token was 'struct' */
135 int code; /* internal code to be returned */
136 char qchar; /* the delimiter character for a string */
137
138 e_token = s_token; /* point to start of place to save token */
139 unary_delim = false;
140 ps.col_1 = ps.last_nl; /* tell world that this token started in
141 * column 1 iff the last thing scanned was nl */
142 ps.last_nl = false;
143
144 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
145 ps.col_1 = false; /* leading blanks imply token is not
146 * in column 1 */
147 if (++buf_ptr >= buf_end)
148 fill_buffer();
149 }
150
151 /* Scan an alphanumeric token */
152 if (chartype[(int) *buf_ptr] == alphanum ||
153 (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
154 /*
155 * we have a character or number
156 */
157 char *j; /* used for searching thru list of
158 *
159 * reserved words */
160 struct templ *p;
161
162 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
163 int seendot = 0, seenexp = 0;
164 if (*buf_ptr == '0' &&
165 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
166 *e_token++ = *buf_ptr++;
167 *e_token++ = *buf_ptr++;
168 while (isxdigit(*buf_ptr)) {
169 CHECK_SIZE_TOKEN;
170 *e_token++ = *buf_ptr++;
171 }
172 } else {
173 while (1) {
174 if (*buf_ptr == '.') {
175 if (seendot)
176 break;
177 else
178 seendot++;
179 }
180 CHECK_SIZE_TOKEN;
181 *e_token++ = *buf_ptr++;
182 if (!isdigit(*buf_ptr)
183 && *buf_ptr != '.') {
184 if ((*buf_ptr != 'E'
185 && *buf_ptr != 'e') || seenexp)
186 break;
187 else {
188 seenexp++;
189 seendot++;
190 CHECK_SIZE_TOKEN;
191 *e_token++ = *buf_ptr++;
192 if (*buf_ptr == '+' || *buf_ptr == '-')
193 *e_token++ = *buf_ptr++;
194 }
195 }
196 }
197 }
198 if (*buf_ptr == 'L' || *buf_ptr == 'l')
199 *e_token++ = *buf_ptr++;
200 } else
201 while (chartype[(int) *buf_ptr] == alphanum) { /* copy it over */
202 CHECK_SIZE_TOKEN;
203 *e_token++ = *buf_ptr++;
204 if (buf_ptr >= buf_end)
205 fill_buffer();
206 }
207 *e_token++ = '\0';
208 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
209 if (++buf_ptr >= buf_end)
210 fill_buffer();
211 }
212 ps.its_a_keyword = false;
213 ps.sizeof_keyword = false;
214 if (l_struct) { /* if last token was 'struct', then this token
215 * should be treated as a declaration */
216 l_struct = false;
217 last_code = ident;
218 ps.last_u_d = true;
219 return (decl);
220 }
221 ps.last_u_d = false; /* Operator after indentifier is
222 * binary */
223 last_code = ident; /* Remember that this is the code we
224 * will return */
225
226 /*
227 * This loop will check if the token is a keyword.
228 */
229 for (p = specials; (j = p->rwd) != 0; p++) {
230 char *p = s_token; /* point at scanned token */
231 if (*j++ != *p++ || *j++ != *p++)
232 continue; /* This test depends on the
233 * fact that identifiers are
234 * always at least 1 character
235 * long (ie. the first two
236 * bytes of the identifier are
237 * always meaningful) */
238 if (p[-1] == 0)
239 break; /* If its a one-character identifier */
240 while (*p++ == *j)
241 if (*j++ == 0)
242 goto found_keyword; /* I wish that C had a
243 * multi-level break... */
244 }
245 if (p->rwd) { /* we have a keyword */
246 found_keyword:
247 ps.its_a_keyword = true;
248 ps.last_u_d = true;
249 switch (p->rwcode) {
250 case 1:/* it is a switch */
251 return (swstmt);
252 case 2:/* a case or default */
253 return (casestmt);
254
255 case 3:/* a "struct" */
256 if (ps.p_l_follow)
257 break; /* inside parens: cast */
258 l_struct = true;
259
260 /*
261 * Next time around, we will want to know that we have had a
262 * 'struct'
263 */
264 case 4:/* one of the declaration keywords */
265 if (ps.p_l_follow) {
266 ps.cast_mask |= 1 << ps.p_l_follow;
267 break; /* inside parens: cast */
268 }
269 last_code = decl;
270 return (decl);
271
272 case 5:/* if, while, for */
273 return (sp_paren);
274
275 case 6:/* do, else */
276 return (sp_nparen);
277
278 case 7:
279 ps.sizeof_keyword = true;
280 default: /* all others are treated like any
281 * other identifier */
282 return (ident);
283 } /* end of switch */
284 } /* end of if (found_it) */
285 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
286 char *tp = buf_ptr;
287 while (tp < buf_end)
288 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
289 goto not_proc;
290 strncpy(ps.procname, token, sizeof ps.procname - 1);
291 ps.in_parameter_declaration = 1;
292 rparen_count = 1;
293 not_proc: ;
294 }
295 /*
296 * The following hack attempts to guess whether or not the current
297 * token is in fact a declaration keyword -- one that has been
298 * typedefd
299 */
300 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
301 && !ps.p_l_follow
302 && !ps.block_init
303 && (ps.last_token == rparen || ps.last_token == semicolon ||
304 ps.last_token == decl ||
305 ps.last_token == lbrace || ps.last_token == rbrace)) {
306 ps.its_a_keyword = true;
307 ps.last_u_d = true;
308 last_code = decl;
309 return decl;
310 }
311 if (last_code == decl) /* if this is a declared variable,
312 * then following sign is unary */
313 ps.last_u_d = true; /* will make "int a -1" work */
314 last_code = ident;
315 return (ident); /* the ident is not in the list */
316 } /* end of procesing for alpanum character */
317 /* Scan a non-alphanumeric token */
318 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
319 * moved here */
320 *e_token = '\0';
321 if (++buf_ptr >= buf_end)
322 fill_buffer();
323
324 switch (*token) {
325 case '\n':
326 unary_delim = ps.last_u_d;
327 ps.last_nl = true; /* remember that we just had a newline */
328 code = (had_eof ? 0 : newline);
329
330 /*
331 * if data has been exausted, the newline is a dummy, and we should
332 * return code to stop
333 */
334 break;
335
336 case '\'': /* start of quoted character */
337 case '"': /* start of string */
338 qchar = *token;
339 if (troff) {
340 e_token[-1] = '`';
341 if (qchar == '"')
342 *e_token++ = '`';
343 e_token = chfont(&bodyf, &stringf, e_token);
344 }
345 do { /* copy the string */
346 while (1) { /* move one character or
347 * [/<char>]<char> */
348 if (*buf_ptr == '\n') {
349 printf("%d: Unterminated literal\n", line_no);
350 goto stop_lit;
351 }
352 CHECK_SIZE_TOKEN; /* Only have to do this
353 * once in this loop,
354 * since CHECK_SIZE
355 * guarantees that there
356 * are at least 5
357 * entries left */
358 *e_token = *buf_ptr++;
359 if (buf_ptr >= buf_end)
360 fill_buffer();
361 if (*e_token == BACKSLASH) { /* if escape, copy extra
362 * char */
363 if (*buf_ptr == '\n') /* check for escaped
364 * newline */
365 ++line_no;
366 if (troff) {
367 *++e_token = BACKSLASH;
368 if (*buf_ptr == BACKSLASH)
369 *++e_token = BACKSLASH;
370 }
371 *++e_token = *buf_ptr++;
372 ++e_token; /* we must increment
373 * this again because we
374 * copied two chars */
375 if (buf_ptr >= buf_end)
376 fill_buffer();
377 } else
378 break; /* we copied one character */
379 } /* end of while (1) */
380 } while (*e_token++ != qchar);
381 if (troff) {
382 e_token = chfont(&stringf, &bodyf, e_token - 1);
383 if (qchar == '"')
384 *e_token++ = '\'';
385 }
386 stop_lit:
387 code = ident;
388 break;
389
390 case ('('):
391 case ('['):
392 unary_delim = true;
393 code = lparen;
394 break;
395
396 case (')'):
397 case (']'):
398 code = rparen;
399 break;
400
401 case '#':
402 unary_delim = ps.last_u_d;
403 code = preesc;
404 break;
405
406 case '?':
407 unary_delim = true;
408 code = question;
409 break;
410
411 case (':'):
412 code = colon;
413 unary_delim = true;
414 break;
415
416 case (';'):
417 unary_delim = true;
418 code = semicolon;
419 break;
420
421 case ('{'):
422 unary_delim = true;
423
424 /*
425 * if (ps.in_or_st) ps.block_init = 1;
426 */
427 /* ? code = ps.block_init ? lparen : lbrace; */
428 code = lbrace;
429 break;
430
431 case ('}'):
432 unary_delim = true;
433 /* ? code = ps.block_init ? rparen : rbrace; */
434 code = rbrace;
435 break;
436
437 case 014: /* a form feed */
438 unary_delim = ps.last_u_d;
439 ps.last_nl = true; /* remember this so we can set
440 * 'ps.col_1' right */
441 code = form_feed;
442 break;
443
444 case (','):
445 unary_delim = true;
446 code = comma;
447 break;
448
449 case '.':
450 unary_delim = false;
451 code = period;
452 break;
453
454 case '-':
455 case '+': /* check for -, +, --, ++ */
456 code = (ps.last_u_d ? unary_op : binary_op);
457 unary_delim = true;
458
459 if (*buf_ptr == token[0]) {
460 /* check for doubled character */
461 *e_token++ = *buf_ptr++;
462 /* buffer overflow will be checked at end of loop */
463 if (last_code == ident || last_code == rparen) {
464 code = (ps.last_u_d ? unary_op : postop);
465 /* check for following ++ or -- */
466 unary_delim = false;
467 }
468 } else
469 if (*buf_ptr == '=')
470 /* check for operator += */
471 *e_token++ = *buf_ptr++;
472 else
473 if (*buf_ptr == '>') {
474 /* check for operator -> */
475 *e_token++ = *buf_ptr++;
476 if (!pointer_as_binop) {
477 unary_delim = false;
478 code = unary_op;
479 ps.want_blank = false;
480 }
481 }
482 break; /* buffer overflow will be checked at end of
483 * switch */
484
485 case '=':
486 if (ps.in_or_st)
487 ps.block_init = 1;
488 #ifdef undef
489 if (chartype[*buf_ptr] == opchar) { /* we have two char
490 * assignment */
491 e_token[-1] = *buf_ptr++;
492 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
493 *e_token++ = *buf_ptr++;
494 *e_token++ = '='; /* Flip =+ to += */
495 *e_token = 0;
496 }
497 #else
498 if (*buf_ptr == '=') { /* == */
499 *e_token++ = '='; /* Flip =+ to += */
500 buf_ptr++;
501 *e_token = 0;
502 }
503 #endif
504 code = binary_op;
505 unary_delim = true;
506 break;
507 /* can drop thru!!! */
508
509 case '>':
510 case '<':
511 case '!': /* ops like <, <<, <=, !=, etc */
512 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
513 *e_token++ = *buf_ptr;
514 if (++buf_ptr >= buf_end)
515 fill_buffer();
516 }
517 if (*buf_ptr == '=')
518 *e_token++ = *buf_ptr++;
519 code = (ps.last_u_d ? unary_op : binary_op);
520 unary_delim = true;
521 break;
522
523 default:
524 if (token[0] == '/' && *buf_ptr == '*') {
525 /* it is start of comment */
526 *e_token++ = '*';
527
528 if (++buf_ptr >= buf_end)
529 fill_buffer();
530
531 code = comment;
532 unary_delim = ps.last_u_d;
533 break;
534 }
535 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
536 /*
537 * handle ||, &&, etc, and also things as in int *****i
538 */
539 *e_token++ = *buf_ptr;
540 if (++buf_ptr >= buf_end)
541 fill_buffer();
542 }
543 code = (ps.last_u_d ? unary_op : binary_op);
544 unary_delim = true;
545
546
547 } /* end of switch */
548 if (code != newline) {
549 l_struct = false;
550 last_code = code;
551 }
552 if (buf_ptr >= buf_end) /* check for input buffer empty */
553 fill_buffer();
554 ps.last_u_d = unary_delim;
555 *e_token = '\0'; /* null terminate the token */
556 return (code);
557 }
558 /*
559 * Add the given keyword to the keyword table, using val as the keyword type
560 */
561 void
562 addkey(key, val)
563 char *key;
564 int val;
565 {
566 struct templ *p = specials;
567 while (p->rwd)
568 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
569 return;
570 else
571 p++;
572 if (p >= specials + sizeof specials / sizeof specials[0])
573 return; /* For now, table overflows are silently
574 * ignored */
575 p->rwd = key;
576 p->rwcode = val;
577 p[1].rwd = 0;
578 p[1].rwcode = 0;
579 }
580