lexi.c revision 1.6 1 /* $NetBSD: lexi.c,v 1.6 1997/10/19 03:17:25 lukem Exp $ */
2
3 /*
4 * Copyright (c) 1980, 1993
5 * The Regents of the University of California. All rights reserved.
6 * Copyright (c) 1976 Board of Trustees of the University of Illinois.
7 * Copyright (c) 1985 Sun Microsystems, Inc.
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39 #include <sys/cdefs.h>
40 #ifndef lint
41 #if 0
42 static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";
43 #else
44 __RCSID("$NetBSD: lexi.c,v 1.6 1997/10/19 03:17:25 lukem Exp $");
45 #endif
46 #endif /* not lint */
47
48 /*
49 * Here we have the token scanner for indent. It scans off one token and puts
50 * it in the global variable "token". It returns a code, indicating the type
51 * of token scanned.
52 */
53
54 #include <stdio.h>
55 #include <ctype.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include "indent_globs.h"
59 #include "indent_codes.h"
60
/*
 * Character classes stored in chartype[] and tested by lexi().
 */
enum {
    alphanum = 1,		/* may appear in an identifier or number */
    opchar = 3			/* may appear in an operator */
};
63
/*
 * One entry of the reserved-word table searched by lexi() and extended by
 * addkey().
 */
struct templ {
    char       *rwd;		/* keyword spelling; a null pointer marks the
				 * end of the table */
    int         rwcode;		/* keyword class, see the list below */
};

/*
 * The reserved-word table.  The array is deliberately larger than the
 * built-in list so that addkey() can append entries; the list is terminated
 * by an entry whose rwd is 0.
 *
 * rwcode classes, as interpreted by the switch in lexi():
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords (types and storage classes)
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 *	0	(or any other value) treated like an ordinary identifier
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef", which never
				 * matched the real keyword */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},
    {"extern", 4},
    {"void", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
101
/*
 * Per-character class table indexed by 7-bit ASCII code: 1 marks characters
 * that can be part of an identifier or number, 3 marks characters that can
 * be part of an operator, 0 marks everything else.  Each row below covers
 * sixteen consecutive codes.
 */
char        chartype[128] =
{
    /* 000-017: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 020-037: control characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 040-057: space and punctuation up to the slash */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
    /* 060-077: digits, then colon through question mark */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
    /* 100-117: at sign, then A through O */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 120-137: P through Z, brackets, caret, underscore */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
    /* 140-157: backquote, then a through o */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 160-177: p through z, braces, bar, tilde, DEL */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
123
124
125
126
127 int
128 lexi()
129 {
130 int unary_delim; /* this is set to 1 if the current token
131 *
132 * forces a following operator to be unary */
133 static int last_code; /* the last token type returned */
134 static int l_struct; /* set to 1 if the last token was 'struct' */
135 int code; /* internal code to be returned */
136 char qchar; /* the delimiter character for a string */
137
138 e_token = s_token; /* point to start of place to save token */
139 unary_delim = false;
140 ps.col_1 = ps.last_nl; /* tell world that this token started in
141 * column 1 iff the last thing scanned was nl */
142 ps.last_nl = false;
143
144 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
145 ps.col_1 = false; /* leading blanks imply token is not
146 * in column 1 */
147 if (++buf_ptr >= buf_end)
148 fill_buffer();
149 }
150
151 /* Scan an alphanumeric token */
152 if (chartype[(int) *buf_ptr] == alphanum ||
153 (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
154 /*
155 * we have a character or number
156 */
157 char *j; /* used for searching thru list of
158 *
159 * reserved words */
160 struct templ *p;
161
162 if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
163 int seendot = 0, seenexp = 0;
164 if (*buf_ptr == '0' &&
165 (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
166 *e_token++ = *buf_ptr++;
167 *e_token++ = *buf_ptr++;
168 while (isxdigit(*buf_ptr)) {
169 CHECK_SIZE_TOKEN;
170 *e_token++ = *buf_ptr++;
171 }
172 } else
173 while (1) {
174 if (*buf_ptr == '.')
175 if (seendot)
176 break;
177 else
178 seendot++;
179 CHECK_SIZE_TOKEN;
180 *e_token++ = *buf_ptr++;
181 if (!isdigit(*buf_ptr) && *buf_ptr != '.')
182 if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
183 break;
184 else {
185 seenexp++;
186 seendot++;
187 CHECK_SIZE_TOKEN;
188 *e_token++ = *buf_ptr++;
189 if (*buf_ptr == '+' || *buf_ptr == '-')
190 *e_token++ = *buf_ptr++;
191 }
192 }
193 if (*buf_ptr == 'L' || *buf_ptr == 'l')
194 *e_token++ = *buf_ptr++;
195 } else
196 while (chartype[(int) *buf_ptr] == alphanum) { /* copy it over */
197 CHECK_SIZE_TOKEN;
198 *e_token++ = *buf_ptr++;
199 if (buf_ptr >= buf_end)
200 fill_buffer();
201 }
202 *e_token++ = '\0';
203 while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
204 if (++buf_ptr >= buf_end)
205 fill_buffer();
206 }
207 ps.its_a_keyword = false;
208 ps.sizeof_keyword = false;
209 if (l_struct) { /* if last token was 'struct', then this token
210 * should be treated as a declaration */
211 l_struct = false;
212 last_code = ident;
213 ps.last_u_d = true;
214 return (decl);
215 }
216 ps.last_u_d = false; /* Operator after indentifier is
217 * binary */
218 last_code = ident; /* Remember that this is the code we
219 * will return */
220
221 /*
222 * This loop will check if the token is a keyword.
223 */
224 for (p = specials; (j = p->rwd) != 0; p++) {
225 char *p = s_token; /* point at scanned token */
226 if (*j++ != *p++ || *j++ != *p++)
227 continue; /* This test depends on the
228 * fact that identifiers are
229 * always at least 1 character
230 * long (ie. the first two
231 * bytes of the identifier are
232 * always meaningful) */
233 if (p[-1] == 0)
234 break; /* If its a one-character identifier */
235 while (*p++ == *j)
236 if (*j++ == 0)
237 goto found_keyword; /* I wish that C had a
238 * multi-level break... */
239 }
240 if (p->rwd) { /* we have a keyword */
241 found_keyword:
242 ps.its_a_keyword = true;
243 ps.last_u_d = true;
244 switch (p->rwcode) {
245 case 1:/* it is a switch */
246 return (swstmt);
247 case 2:/* a case or default */
248 return (casestmt);
249
250 case 3:/* a "struct" */
251 if (ps.p_l_follow)
252 break; /* inside parens: cast */
253 l_struct = true;
254
255 /*
256 * Next time around, we will want to know that we have had a
257 * 'struct'
258 */
259 case 4:/* one of the declaration keywords */
260 if (ps.p_l_follow) {
261 ps.cast_mask |= 1 << ps.p_l_follow;
262 break; /* inside parens: cast */
263 }
264 last_code = decl;
265 return (decl);
266
267 case 5:/* if, while, for */
268 return (sp_paren);
269
270 case 6:/* do, else */
271 return (sp_nparen);
272
273 case 7:
274 ps.sizeof_keyword = true;
275 default: /* all others are treated like any
276 * other identifier */
277 return (ident);
278 } /* end of switch */
279 } /* end of if (found_it) */
280 if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
281 char *tp = buf_ptr;
282 while (tp < buf_end)
283 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
284 goto not_proc;
285 strncpy(ps.procname, token, sizeof ps.procname - 1);
286 ps.in_parameter_declaration = 1;
287 rparen_count = 1;
288 not_proc: ;
289 }
290 /*
291 * The following hack attempts to guess whether or not the current
292 * token is in fact a declaration keyword -- one that has been
293 * typedefd
294 */
295 if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
296 && !ps.p_l_follow
297 && !ps.block_init
298 && (ps.last_token == rparen || ps.last_token == semicolon ||
299 ps.last_token == decl ||
300 ps.last_token == lbrace || ps.last_token == rbrace)) {
301 ps.its_a_keyword = true;
302 ps.last_u_d = true;
303 last_code = decl;
304 return decl;
305 }
306 if (last_code == decl) /* if this is a declared variable,
307 * then following sign is unary */
308 ps.last_u_d = true; /* will make "int a -1" work */
309 last_code = ident;
310 return (ident); /* the ident is not in the list */
311 } /* end of procesing for alpanum character */
312 /* Scan a non-alphanumeric token */
313 *e_token++ = *buf_ptr; /* if it is only a one-character token, it is
314 * moved here */
315 *e_token = '\0';
316 if (++buf_ptr >= buf_end)
317 fill_buffer();
318
319 switch (*token) {
320 case '\n':
321 unary_delim = ps.last_u_d;
322 ps.last_nl = true; /* remember that we just had a newline */
323 code = (had_eof ? 0 : newline);
324
325 /*
326 * if data has been exausted, the newline is a dummy, and we should
327 * return code to stop
328 */
329 break;
330
331 case '\'': /* start of quoted character */
332 case '"': /* start of string */
333 qchar = *token;
334 if (troff) {
335 e_token[-1] = '`';
336 if (qchar == '"')
337 *e_token++ = '`';
338 e_token = chfont(&bodyf, &stringf, e_token);
339 }
340 do { /* copy the string */
341 while (1) { /* move one character or
342 * [/<char>]<char> */
343 if (*buf_ptr == '\n') {
344 printf("%d: Unterminated literal\n", line_no);
345 goto stop_lit;
346 }
347 CHECK_SIZE_TOKEN; /* Only have to do this
348 * once in this loop,
349 * since CHECK_SIZE
350 * guarantees that there
351 * are at least 5
352 * entries left */
353 *e_token = *buf_ptr++;
354 if (buf_ptr >= buf_end)
355 fill_buffer();
356 if (*e_token == BACKSLASH) { /* if escape, copy extra
357 * char */
358 if (*buf_ptr == '\n') /* check for escaped
359 * newline */
360 ++line_no;
361 if (troff) {
362 *++e_token = BACKSLASH;
363 if (*buf_ptr == BACKSLASH)
364 *++e_token = BACKSLASH;
365 }
366 *++e_token = *buf_ptr++;
367 ++e_token; /* we must increment
368 * this again because we
369 * copied two chars */
370 if (buf_ptr >= buf_end)
371 fill_buffer();
372 } else
373 break; /* we copied one character */
374 } /* end of while (1) */
375 } while (*e_token++ != qchar);
376 if (troff) {
377 e_token = chfont(&stringf, &bodyf, e_token - 1);
378 if (qchar == '"')
379 *e_token++ = '\'';
380 }
381 stop_lit:
382 code = ident;
383 break;
384
385 case ('('):
386 case ('['):
387 unary_delim = true;
388 code = lparen;
389 break;
390
391 case (')'):
392 case (']'):
393 code = rparen;
394 break;
395
396 case '#':
397 unary_delim = ps.last_u_d;
398 code = preesc;
399 break;
400
401 case '?':
402 unary_delim = true;
403 code = question;
404 break;
405
406 case (':'):
407 code = colon;
408 unary_delim = true;
409 break;
410
411 case (';'):
412 unary_delim = true;
413 code = semicolon;
414 break;
415
416 case ('{'):
417 unary_delim = true;
418
419 /*
420 * if (ps.in_or_st) ps.block_init = 1;
421 */
422 /* ? code = ps.block_init ? lparen : lbrace; */
423 code = lbrace;
424 break;
425
426 case ('}'):
427 unary_delim = true;
428 /* ? code = ps.block_init ? rparen : rbrace; */
429 code = rbrace;
430 break;
431
432 case 014: /* a form feed */
433 unary_delim = ps.last_u_d;
434 ps.last_nl = true; /* remember this so we can set
435 * 'ps.col_1' right */
436 code = form_feed;
437 break;
438
439 case (','):
440 unary_delim = true;
441 code = comma;
442 break;
443
444 case '.':
445 unary_delim = false;
446 code = period;
447 break;
448
449 case '-':
450 case '+': /* check for -, +, --, ++ */
451 code = (ps.last_u_d ? unary_op : binary_op);
452 unary_delim = true;
453
454 if (*buf_ptr == token[0]) {
455 /* check for doubled character */
456 *e_token++ = *buf_ptr++;
457 /* buffer overflow will be checked at end of loop */
458 if (last_code == ident || last_code == rparen) {
459 code = (ps.last_u_d ? unary_op : postop);
460 /* check for following ++ or -- */
461 unary_delim = false;
462 }
463 } else
464 if (*buf_ptr == '=')
465 /* check for operator += */
466 *e_token++ = *buf_ptr++;
467 else
468 if (*buf_ptr == '>') {
469 /* check for operator -> */
470 *e_token++ = *buf_ptr++;
471 if (!pointer_as_binop) {
472 unary_delim = false;
473 code = unary_op;
474 ps.want_blank = false;
475 }
476 }
477 break; /* buffer overflow will be checked at end of
478 * switch */
479
480 case '=':
481 if (ps.in_or_st)
482 ps.block_init = 1;
483 #ifdef undef
484 if (chartype[*buf_ptr] == opchar) { /* we have two char
485 * assignment */
486 e_token[-1] = *buf_ptr++;
487 if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
488 *e_token++ = *buf_ptr++;
489 *e_token++ = '='; /* Flip =+ to += */
490 *e_token = 0;
491 }
492 #else
493 if (*buf_ptr == '=') { /* == */
494 *e_token++ = '='; /* Flip =+ to += */
495 buf_ptr++;
496 *e_token = 0;
497 }
498 #endif
499 code = binary_op;
500 unary_delim = true;
501 break;
502 /* can drop thru!!! */
503
504 case '>':
505 case '<':
506 case '!': /* ops like <, <<, <=, !=, etc */
507 if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
508 *e_token++ = *buf_ptr;
509 if (++buf_ptr >= buf_end)
510 fill_buffer();
511 }
512 if (*buf_ptr == '=')
513 *e_token++ = *buf_ptr++;
514 code = (ps.last_u_d ? unary_op : binary_op);
515 unary_delim = true;
516 break;
517
518 default:
519 if (token[0] == '/' && *buf_ptr == '*') {
520 /* it is start of comment */
521 *e_token++ = '*';
522
523 if (++buf_ptr >= buf_end)
524 fill_buffer();
525
526 code = comment;
527 unary_delim = ps.last_u_d;
528 break;
529 }
530 while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
531 /*
532 * handle ||, &&, etc, and also things as in int *****i
533 */
534 *e_token++ = *buf_ptr;
535 if (++buf_ptr >= buf_end)
536 fill_buffer();
537 }
538 code = (ps.last_u_d ? unary_op : binary_op);
539 unary_delim = true;
540
541
542 } /* end of switch */
543 if (code != newline) {
544 l_struct = false;
545 last_code = code;
546 }
547 if (buf_ptr >= buf_end) /* check for input buffer empty */
548 fill_buffer();
549 ps.last_u_d = unary_delim;
550 *e_token = '\0'; /* null terminate the token */
551 return (code);
552 }
553 /*
554 * Add the given keyword to the keyword table, using val as the keyword type
555 */
556 void
557 addkey(key, val)
558 char *key;
559 int val;
560 {
561 struct templ *p = specials;
562 while (p->rwd)
563 if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
564 return;
565 else
566 p++;
567 if (p >= specials + sizeof specials / sizeof specials[0])
568 return; /* For now, table overflows are silently
569 * ignored */
570 p->rwd = key;
571 p->rwcode = val;
572 p[1].rwd = 0;
573 p[1].rwcode = 0;
574 }
575