Home | History | Annotate | Line # | Download | only in indent
lexi.c revision 1.11
      1 /*	$NetBSD: lexi.c,v 1.11 2002/05/26 22:53:38 wiz Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1980, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
      7  * Copyright (c) 1985 Sun Microsystems, Inc.
      8  * All rights reserved.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the University of
     21  *	California, Berkeley and its contributors.
     22  * 4. Neither the name of the University nor the names of its contributors
     23  *    may be used to endorse or promote products derived from this software
     24  *    without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     36  * SUCH DAMAGE.
     37  */
     38 
     39 #include <sys/cdefs.h>
     40 #ifndef lint
     41 #if 0
     42 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
     43 #else
     44 __RCSID("$NetBSD: lexi.c,v 1.11 2002/05/26 22:53:38 wiz Exp $");
     45 #endif
     46 #endif				/* not lint */
     47 
     48 /*
     49  * Here we have the token scanner for indent.  It scans off one token and puts
     50  * it in the global variable "token".  It returns a code, indicating the type
     51  * of token scanned.
     52  */
     53 
     54 #include <stdio.h>
     55 #include <ctype.h>
     56 #include <stdlib.h>
     57 #include <string.h>
     58 #include "indent_globs.h"
     59 #include "indent_codes.h"
     60 
/*
 * Character classes stored in the chartype[] table below.  Entries that
 * belong to neither class are 0.
 */
enum {
	alphanum = 1,		/* letter, digit, '_' or '$' */
	opchar = 3		/* character that can appear in an operator */
};
     63 
/*
 * One entry of the reserved-word table: the spelling of the word and the
 * class code that lexi() switches on when the word is recognized.
 */
struct templ {
	char   *rwd;		/* keyword text; a null pointer ends the table */
	int     rwcode;		/* keyword class, see the list below */
};

/*
 * Reserved-word table searched by lexi() and extended by addkey().
 *
 * rwcode values as interpreted by lexi():
 *	0  treated like an ordinary identifier
 *	1  "switch"
 *	2  "case" / "default"
 *	3  "struct", "union", "enum"
 *	4  declaration keyword (type or storage class)
 *	5  keyword followed by a parenthesized expression
 *	6  keyword not followed by a parenthesized expression
 *	7  "sizeof"
 *
 * The array is deliberately oversized so that addkey() has room to append
 * user-supplied keywords after the built-in ones.
 */
struct templ specials[1000] =
{
	{"switch", 1},
	{"case", 2},
	{"break", 0},
	{"struct", 3},
	{"union", 3},
	{"enum", 3},
	{"default", 2},
	{"int", 4},
	{"char", 4},
	{"float", 4},
	{"double", 4},
	{"long", 4},
	{"short", 4},
	{"typedef", 4},		/* was misspelled "typdef" and never matched */
	{"unsigned", 4},
	{"register", 4},
	{"static", 4},
	{"global", 4},
	{"extern", 4},
	{"void", 4},
	{"goto", 0},
	{"return", 0},
	{"if", 5},
	{"while", 5},
	{"for", 5},
	{"else", 6},
	{"do", 6},
	{"sizeof", 7},
	{0, 0}
};
    101 
/*
 * Character class of each 7-bit ASCII code, used to decide whether a
 * character is alphanumeric (1), an operator character (3) or neither (0).
 * Each row below covers sixteen consecutive codes.
 */
char    chartype[128] =
{
	/* 0x00-0x0f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10-0x1f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20-0x2f: space ! " # $ % & ' ( ) * + , - . / */
	0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
	/* 0x30-0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
	/* 0x40-0x4f: @ A B C D E F G H I J K L M N O */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50-0x5f: P through Z, then [ backslash ] ^ _ */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
	/* 0x60-0x6f: ` a b c d e f g h i j k l m n o */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70-0x7f: p through z, then { | } ~ DEL */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
    123 
    124 
    125 
    126 
    127 int
    128 lexi(void)
    129 {
    130 	int     unary_delim;	/* this is set to 1 if the current token
    131 				 *
    132 				 * forces a following operator to be unary */
    133 	static int last_code;	/* the last token type returned */
    134 	static int l_struct;	/* set to 1 if the last token was 'struct' */
    135 	int     code;		/* internal code to be returned */
    136 	char    qchar;		/* the delimiter character for a string */
    137 
    138 	e_token = s_token;	/* point to start of place to save token */
    139 	unary_delim = false;
    140 	ps.col_1 = ps.last_nl;	/* tell world that this token started in
    141 				 * column 1 iff the last thing scanned was nl */
    142 	ps.last_nl = false;
    143 
    144 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
    145 		ps.col_1 = false;	/* leading blanks imply token is not
    146 					 * in column 1 */
    147 		if (++buf_ptr >= buf_end)
    148 			fill_buffer();
    149 	}
    150 
    151 	/* Scan an alphanumeric token */
    152 	if (chartype[(int) *buf_ptr] == alphanum ||
    153 	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
    154 		/*
    155 		 * we have a character or number
    156 		 */
    157 		char   *j;	/* used for searching thru list of
    158 				 *
    159 				 * reserved words */
    160 		struct templ *p;
    161 
    162 		if (isdigit((unsigned char)*buf_ptr) ||
    163 		    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
    164 			int     seendot = 0, seenexp = 0, seensfx = 0;
    165 			if (*buf_ptr == '0' &&
    166 			    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
    167 				*e_token++ = *buf_ptr++;
    168 				*e_token++ = *buf_ptr++;
    169 				while (isxdigit((unsigned char)*buf_ptr)) {
    170 					CHECK_SIZE_TOKEN;
    171 					*e_token++ = *buf_ptr++;
    172 				}
    173 			} else {
    174 				while (1) {
    175 					if (*buf_ptr == '.') {
    176 						if (seendot)
    177 							break;
    178 						else
    179 							seendot++;
    180 					}
    181 					CHECK_SIZE_TOKEN;
    182 					*e_token++ = *buf_ptr++;
    183 					if (!isdigit((unsigned char)*buf_ptr)
    184 					&& *buf_ptr != '.') {
    185 						if ((*buf_ptr != 'E'
    186 						&& *buf_ptr != 'e') || seenexp)
    187 							break;
    188 						else {
    189 							seenexp++;
    190 							seendot++;
    191 							CHECK_SIZE_TOKEN;
    192 							*e_token++ = *buf_ptr++;
    193 							if (*buf_ptr == '+' || *buf_ptr == '-')
    194 								*e_token++ = *buf_ptr++;
    195 						}
    196 					}
    197 				}
    198 			}
    199 			if (*buf_ptr == 'F' || *buf_ptr == 'f') {
    200 				/* float constant */
    201 				*e_token++ = *buf_ptr++;
    202 			} else {
    203 				/* integer constant */
    204 				while (1) {
    205 					if (!(seensfx & 1) &&
    206 					    (*buf_ptr == 'U' ||
    207 					     *buf_ptr == 'u')) {
    208 						CHECK_SIZE_TOKEN;
    209 						*e_token++ = *buf_ptr++;
    210 						seensfx |= 1;
    211 						continue;
    212 					}
    213 					if (!(seensfx & 2) &&
    214 					    (*buf_ptr == 'L' ||
    215 					     *buf_ptr == 'l')) {
    216 						CHECK_SIZE_TOKEN;
    217 						if (buf_ptr[1] == buf_ptr[0])
    218 							*e_token++ = *buf_ptr++;
    219 						*e_token++ = *buf_ptr++;
    220 						seensfx |= 2;
    221 						continue;
    222 					}
    223 					break;
    224 				}
    225 			}
    226 		} else
    227 			while (chartype[(int) *buf_ptr] == alphanum) {	/* copy it over */
    228 				CHECK_SIZE_TOKEN;
    229 				*e_token++ = *buf_ptr++;
    230 				if (buf_ptr >= buf_end)
    231 					fill_buffer();
    232 			}
    233 		*e_token++ = '\0';
    234 		while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
    235 			if (++buf_ptr >= buf_end)
    236 				fill_buffer();
    237 		}
    238 		ps.its_a_keyword = false;
    239 		ps.sizeof_keyword = false;
    240 		if (l_struct) {	/* if last token was 'struct', then this token
    241 				 * should be treated as a declaration */
    242 			l_struct = false;
    243 			last_code = ident;
    244 			ps.last_u_d = true;
    245 			return (decl);
    246 		}
    247 		ps.last_u_d = false;	/* Operator after indentifier is
    248 					 * binary */
    249 		last_code = ident;	/* Remember that this is the code we
    250 					 * will return */
    251 
    252 		/*
    253 		 * This loop will check if the token is a keyword.
    254 		 */
    255 		for (p = specials; (j = p->rwd) != 0; p++) {
    256 			char   *p = s_token;	/* point at scanned token */
    257 			if (*j++ != *p++ || *j++ != *p++)
    258 				continue;	/* This test depends on the
    259 						 * fact that identifiers are
    260 						 * always at least 1 character
    261 						 * long (ie. the first two
    262 						 * bytes of the identifier are
    263 						 * always meaningful) */
    264 			if (p[-1] == 0)
    265 				break;	/* If its a one-character identifier */
    266 			while (*p++ == *j)
    267 				if (*j++ == 0)
    268 					goto found_keyword;	/* I wish that C had a
    269 								 * multi-level break... */
    270 		}
    271 		if (p->rwd) {	/* we have a keyword */
    272 	found_keyword:
    273 			ps.its_a_keyword = true;
    274 			ps.last_u_d = true;
    275 			switch (p->rwcode) {
    276 			case 1:/* it is a switch */
    277 				return (swstmt);
    278 			case 2:/* a case or default */
    279 				return (casestmt);
    280 
    281 			case 3:/* a "struct" */
    282 				if (ps.p_l_follow)
    283 					break;	/* inside parens: cast */
    284 				l_struct = true;
    285 
    286 				/*
    287 				 * Next time around, we will want to know that we have had a
    288 				 * 'struct'
    289 				 */
    290 			case 4:/* one of the declaration keywords */
    291 				if (ps.p_l_follow) {
    292 					ps.cast_mask |= 1 << ps.p_l_follow;
    293 					break;	/* inside parens: cast */
    294 				}
    295 				last_code = decl;
    296 				return (decl);
    297 
    298 			case 5:/* if, while, for */
    299 				return (sp_paren);
    300 
    301 			case 6:/* do, else */
    302 				return (sp_nparen);
    303 
    304 			case 7:
    305 				ps.sizeof_keyword = true;
    306 			default:	/* all others are treated like any
    307 					 * other identifier */
    308 				return (ident);
    309 			}	/* end of switch */
    310 		}		/* end of if (found_it) */
    311 		if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
    312 			char   *tp = buf_ptr;
    313 			while (tp < buf_end)
    314 				if (*tp++ == ')' && (*tp == ';' || *tp == ','))
    315 					goto not_proc;
    316 			strncpy(ps.procname, token, sizeof ps.procname - 1);
    317 			ps.in_parameter_declaration = 1;
    318 			rparen_count = 1;
    319 	not_proc:	;
    320 		}
    321 		/*
    322 		 * The following hack attempts to guess whether or not the current
    323 		 * token is in fact a declaration keyword -- one that has been
    324 		 * typedefd
    325 		 */
    326 		if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
    327 		    isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
    328 		    && !ps.p_l_follow
    329 		    && !ps.block_init
    330 		    && (ps.last_token == rparen || ps.last_token == semicolon ||
    331 			ps.last_token == decl ||
    332 			ps.last_token == lbrace || ps.last_token == rbrace)) {
    333 			ps.its_a_keyword = true;
    334 			ps.last_u_d = true;
    335 			last_code = decl;
    336 			return decl;
    337 		}
    338 		if (last_code == decl)	/* if this is a declared variable,
    339 					 * then following sign is unary */
    340 			ps.last_u_d = true;	/* will make "int a -1" work */
    341 		last_code = ident;
    342 		return (ident);	/* the ident is not in the list */
    343 	}			/* end of procesing for alpanum character */
    344 	/* Scan a non-alphanumeric token */
    345 	*e_token++ = *buf_ptr;	/* if it is only a one-character token, it is
    346 				 * moved here */
    347 	*e_token = '\0';
    348 	if (++buf_ptr >= buf_end)
    349 		fill_buffer();
    350 
    351 	switch (*token) {
    352 	case '\n':
    353 		unary_delim = ps.last_u_d;
    354 		ps.last_nl = true;	/* remember that we just had a newline */
    355 		code = (had_eof ? 0 : newline);
    356 
    357 		/*
    358 		 * if data has been exausted, the newline is a dummy, and we should
    359 		 * return code to stop
    360 		 */
    361 		break;
    362 
    363 	case '\'':		/* start of quoted character */
    364 	case '"':		/* start of string */
    365 		qchar = *token;
    366 		if (troff) {
    367 			e_token[-1] = '`';
    368 			if (qchar == '"')
    369 				*e_token++ = '`';
    370 			e_token = chfont(&bodyf, &stringf, e_token);
    371 		}
    372 		do {		/* copy the string */
    373 			while (1) {	/* move one character or
    374 					 * [/<char>]<char> */
    375 				if (*buf_ptr == '\n') {
    376 					printf("%d: Unterminated literal\n", line_no);
    377 					goto stop_lit;
    378 				}
    379 				CHECK_SIZE_TOKEN;	/* Only have to do this
    380 							 * once in this loop,
    381 							 * since CHECK_SIZE
    382 							 * guarantees that there
    383 							 * are at least 5
    384 							 * entries left */
    385 				*e_token = *buf_ptr++;
    386 				if (buf_ptr >= buf_end)
    387 					fill_buffer();
    388 				if (*e_token == BACKSLASH) {	/* if escape, copy extra
    389 								 * char */
    390 					if (*buf_ptr == '\n')	/* check for escaped
    391 								 * newline */
    392 						++line_no;
    393 					if (troff) {
    394 						*++e_token = BACKSLASH;
    395 						if (*buf_ptr == BACKSLASH)
    396 							*++e_token = BACKSLASH;
    397 					}
    398 					*++e_token = *buf_ptr++;
    399 					++e_token;	/* we must increment
    400 							 * this again because we
    401 							 * copied two chars */
    402 					if (buf_ptr >= buf_end)
    403 						fill_buffer();
    404 				} else
    405 					break;	/* we copied one character */
    406 			}	/* end of while (1) */
    407 		} while (*e_token++ != qchar);
    408 		if (troff) {
    409 			e_token = chfont(&stringf, &bodyf, e_token - 1);
    410 			if (qchar == '"')
    411 				*e_token++ = '\'';
    412 		}
    413 stop_lit:
    414 		code = ident;
    415 		break;
    416 
    417 	case ('('):
    418 	case ('['):
    419 		unary_delim = true;
    420 		code = lparen;
    421 		break;
    422 
    423 	case (')'):
    424 	case (']'):
    425 		code = rparen;
    426 		break;
    427 
    428 	case '#':
    429 		unary_delim = ps.last_u_d;
    430 		code = preesc;
    431 		break;
    432 
    433 	case '?':
    434 		unary_delim = true;
    435 		code = question;
    436 		break;
    437 
    438 	case (':'):
    439 		code = colon;
    440 		unary_delim = true;
    441 		break;
    442 
    443 	case (';'):
    444 		unary_delim = true;
    445 		code = semicolon;
    446 		break;
    447 
    448 	case ('{'):
    449 		unary_delim = true;
    450 
    451 		/*
    452 		 * if (ps.in_or_st) ps.block_init = 1;
    453 		 */
    454 		/* ?	code = ps.block_init ? lparen : lbrace; */
    455 		code = lbrace;
    456 		break;
    457 
    458 	case ('}'):
    459 		unary_delim = true;
    460 		/* ?	code = ps.block_init ? rparen : rbrace; */
    461 		code = rbrace;
    462 		break;
    463 
    464 	case 014:		/* a form feed */
    465 		unary_delim = ps.last_u_d;
    466 		ps.last_nl = true;	/* remember this so we can set
    467 					 * 'ps.col_1' right */
    468 		code = form_feed;
    469 		break;
    470 
    471 	case (','):
    472 		unary_delim = true;
    473 		code = comma;
    474 		break;
    475 
    476 	case '.':
    477 		unary_delim = false;
    478 		code = period;
    479 		break;
    480 
    481 	case '-':
    482 	case '+':		/* check for -, +, --, ++ */
    483 		code = (ps.last_u_d ? unary_op : binary_op);
    484 		unary_delim = true;
    485 
    486 		if (*buf_ptr == token[0]) {
    487 			/* check for doubled character */
    488 			*e_token++ = *buf_ptr++;
    489 			/* buffer overflow will be checked at end of loop */
    490 			if (last_code == ident || last_code == rparen) {
    491 				code = (ps.last_u_d ? unary_op : postop);
    492 				/* check for following ++ or -- */
    493 				unary_delim = false;
    494 			}
    495 		} else
    496 			if (*buf_ptr == '=')
    497 				/* check for operator += */
    498 				*e_token++ = *buf_ptr++;
    499 			else
    500 				if (*buf_ptr == '>') {
    501 					/* check for operator -> */
    502 					*e_token++ = *buf_ptr++;
    503 					if (!pointer_as_binop) {
    504 						unary_delim = false;
    505 						code = unary_op;
    506 						ps.want_blank = false;
    507 					}
    508 				}
    509 		break;		/* buffer overflow will be checked at end of
    510 				 * switch */
    511 
    512 	case '=':
    513 		if (ps.in_or_st)
    514 			ps.block_init = 1;
    515 #ifdef undef
    516 		if (chartype[*buf_ptr] == opchar) {	/* we have two char
    517 							 * assignment */
    518 			e_token[-1] = *buf_ptr++;
    519 			if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
    520 				*e_token++ = *buf_ptr++;
    521 			*e_token++ = '=';	/* Flip =+ to += */
    522 			*e_token = 0;
    523 		}
    524 #else
    525 		if (*buf_ptr == '=') {	/* == */
    526 			*e_token++ = '=';	/* Flip =+ to += */
    527 			buf_ptr++;
    528 			*e_token = 0;
    529 		}
    530 #endif
    531 		code = binary_op;
    532 		unary_delim = true;
    533 		break;
    534 		/* can drop thru!!! */
    535 
    536 	case '>':
    537 	case '<':
    538 	case '!':		/* ops like <, <<, <=, !=, etc */
    539 		if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
    540 			*e_token++ = *buf_ptr;
    541 			if (++buf_ptr >= buf_end)
    542 				fill_buffer();
    543 		}
    544 		if (*buf_ptr == '=')
    545 			*e_token++ = *buf_ptr++;
    546 		code = (ps.last_u_d ? unary_op : binary_op);
    547 		unary_delim = true;
    548 		break;
    549 
    550 	default:
    551 		if (token[0] == '/' && *buf_ptr == '*') {
    552 			/* it is start of comment */
    553 			*e_token++ = '*';
    554 
    555 			if (++buf_ptr >= buf_end)
    556 				fill_buffer();
    557 
    558 			code = comment;
    559 			unary_delim = ps.last_u_d;
    560 			break;
    561 		}
    562 		while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
    563 			/*
    564 		         * handle ||, &&, etc, and also things as in int *****i
    565 		         */
    566 			*e_token++ = *buf_ptr;
    567 			if (++buf_ptr >= buf_end)
    568 				fill_buffer();
    569 		}
    570 		code = (ps.last_u_d ? unary_op : binary_op);
    571 		unary_delim = true;
    572 
    573 
    574 	}			/* end of switch */
    575 	if (code != newline) {
    576 		l_struct = false;
    577 		last_code = code;
    578 	}
    579 	if (buf_ptr >= buf_end)	/* check for input buffer empty */
    580 		fill_buffer();
    581 	ps.last_u_d = unary_delim;
    582 	*e_token = '\0';	/* null terminate the token */
    583 	return (code);
    584 }
    585 /*
    586  * Add the given keyword to the keyword table, using val as the keyword type
    587  */
    588 void
    589 addkey(char *key, int val)
    590 {
    591 	struct templ *p = specials;
    592 	while (p->rwd)
    593 		if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
    594 			return;
    595 		else
    596 			p++;
    597 	if (p >= specials + sizeof specials / sizeof specials[0])
    598 		return;		/* For now, table overflows are silently
    599 				 * ignored */
    600 	p->rwd = key;
    601 	p->rwcode = val;
    602 	p[1].rwd = 0;
    603 	p[1].rwcode = 0;
    604 }
    605