Home | History | Annotate | Line # | Download | only in indent
lexi.c revision 1.7
      1 /*	$NetBSD: lexi.c,v 1.7 1998/08/25 20:59:38 ross Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1980, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
      7  * Copyright (c) 1985 Sun Microsystems, Inc.
      8  * All rights reserved.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the University of
     21  *	California, Berkeley and its contributors.
     22  * 4. Neither the name of the University nor the names of its contributors
     23  *    may be used to endorse or promote products derived from this software
     24  *    without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     36  * SUCH DAMAGE.
     37  */
     38 
     39 #include <sys/cdefs.h>
     40 #ifndef lint
     41 #if 0
     42 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
     43 #else
     44 __RCSID("$NetBSD: lexi.c,v 1.7 1998/08/25 20:59:38 ross Exp $");
     45 #endif
     46 #endif				/* not lint */
     47 
     48 /*
     49  * Here we have the token scanner for indent.  It scans off one token and puts
     50  * it in the global variable "token".  It returns a code, indicating the type
     51  * of token scanned.
     52  */
     53 
     54 #include <stdio.h>
     55 #include <ctype.h>
     56 #include <stdlib.h>
     57 #include <string.h>
     58 #include "indent_globs.h"
     59 #include "indent_codes.h"
     60 
/*
 * Character classes stored in chartype[]; a table entry of 0 means the
 * character belongs to neither class.
 */
#define alphanum 1	/* character is part of an identifier or number */
#define opchar 3	/* character is an operator character */
     63 
/*
 * One entry of the reserved-word table.
 *
 * rwd    - spelling of the keyword (a null pointer terminates the table)
 * rwcode - keyword class, as dispatched on by lexi():
 *            0  no special treatment (returned as a plain identifier)
 *            1  "switch"
 *            2  "case" / "default"
 *            3  "struct" / "union" / "enum"
 *            4  declaration keyword (type or storage class)
 *            5  keyword followed by a parenthesised expression
 *            6  keyword not followed by a parenthesised expression
 *            7  "sizeof"
 */
struct templ {
	char   *rwd;
	int     rwcode;
};

/*
 * The reserved-word table.  It is deliberately much larger than its
 * initial contents so that addkey() can append further keywords at run
 * time; the slots after the {0, 0} terminator are zero-initialized.
 */
struct templ specials[1000] =
{
	{"switch", 1},
	{"case", 2},
	{"break", 0},
	{"struct", 3},
	{"union", 3},
	{"enum", 3},
	{"default", 2},
	{"int", 4},
	{"char", 4},
	{"float", 4},
	{"double", 4},
	{"long", 4},
	{"short", 4},
	{"typedef", 4},		/* was misspelled "typdef", so the real
				 * keyword was never matched */
	{"unsigned", 4},
	{"register", 4},
	{"static", 4},
	{"global", 4},
	{"extern", 4},
	{"void", 4},
	{"goto", 0},
	{"return", 0},
	{"if", 5},
	{"while", 5},
	{"for", 5},
	{"else", 6},
	{"do", 6},
	{"sizeof", 7},
	{0, 0}
};
    101 
/*
 * Lexical class of each 7-bit ASCII character, indexed by character code.
 * An entry is 1 for characters that make up identifiers and numbers
 * (letters, digits, '_' and '$'), 3 for operator characters, and 0 for
 * everything else.  Each row below covers sixteen consecutive codes.
 */
char    chartype[128] =
{
	/* 0x00 - 0x0f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2f: space and punctuation up to the slash */
	0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
	/* 0x30 - 0x3f: digits, then colon through question mark */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
	/* 0x40 - 0x4f: at-sign, then A through O */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 - 0x5f: P through Z, brackets, caret, underscore */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
	/* 0x60 - 0x6f: backquote, then a through o */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 - 0x7f: p through z, braces, bar, tilde, DEL */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
    123 
    124 
    125 
    126 
    127 int
    128 lexi()
    129 {
    130 	int     unary_delim;	/* this is set to 1 if the current token
    131 				 *
    132 				 * forces a following operator to be unary */
    133 	static int last_code;	/* the last token type returned */
    134 	static int l_struct;	/* set to 1 if the last token was 'struct' */
    135 	int     code;		/* internal code to be returned */
    136 	char    qchar;		/* the delimiter character for a string */
    137 
    138 	e_token = s_token;	/* point to start of place to save token */
    139 	unary_delim = false;
    140 	ps.col_1 = ps.last_nl;	/* tell world that this token started in
    141 				 * column 1 iff the last thing scanned was nl */
    142 	ps.last_nl = false;
    143 
    144 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
    145 		ps.col_1 = false;	/* leading blanks imply token is not
    146 					 * in column 1 */
    147 		if (++buf_ptr >= buf_end)
    148 			fill_buffer();
    149 	}
    150 
    151 	/* Scan an alphanumeric token */
    152 	if (chartype[(int) *buf_ptr] == alphanum ||
    153 	    (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
    154 		/*
    155 		 * we have a character or number
    156 		 */
    157 		char   *j;	/* used for searching thru list of
    158 				 *
    159 				 * reserved words */
    160 		struct templ *p;
    161 
    162 		if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
    163 			int     seendot = 0, seenexp = 0;
    164 			if (*buf_ptr == '0' &&
    165 			    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
    166 				*e_token++ = *buf_ptr++;
    167 				*e_token++ = *buf_ptr++;
    168 				while (isxdigit(*buf_ptr)) {
    169 					CHECK_SIZE_TOKEN;
    170 					*e_token++ = *buf_ptr++;
    171 				}
    172 			} else {
    173 				while (1) {
    174 					if (*buf_ptr == '.') {
    175 						if (seendot)
    176 							break;
    177 						else
    178 							seendot++;
    179 					}
    180 					CHECK_SIZE_TOKEN;
    181 					*e_token++ = *buf_ptr++;
    182 					if (!isdigit(*buf_ptr)
    183 					&& *buf_ptr != '.') {
    184 						if ((*buf_ptr != 'E'
    185 						&& *buf_ptr != 'e') || seenexp)
    186 							break;
    187 						else {
    188 							seenexp++;
    189 							seendot++;
    190 							CHECK_SIZE_TOKEN;
    191 							*e_token++ = *buf_ptr++;
    192 							if (*buf_ptr == '+' || *buf_ptr == '-')
    193 								*e_token++ = *buf_ptr++;
    194 						}
    195 					}
    196 				}
    197 			}
    198 			if (*buf_ptr == 'L' || *buf_ptr == 'l')
    199 				*e_token++ = *buf_ptr++;
    200 		} else
    201 			while (chartype[(int) *buf_ptr] == alphanum) {	/* copy it over */
    202 				CHECK_SIZE_TOKEN;
    203 				*e_token++ = *buf_ptr++;
    204 				if (buf_ptr >= buf_end)
    205 					fill_buffer();
    206 			}
    207 		*e_token++ = '\0';
    208 		while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
    209 			if (++buf_ptr >= buf_end)
    210 				fill_buffer();
    211 		}
    212 		ps.its_a_keyword = false;
    213 		ps.sizeof_keyword = false;
    214 		if (l_struct) {	/* if last token was 'struct', then this token
    215 				 * should be treated as a declaration */
    216 			l_struct = false;
    217 			last_code = ident;
    218 			ps.last_u_d = true;
    219 			return (decl);
    220 		}
    221 		ps.last_u_d = false;	/* Operator after indentifier is
    222 					 * binary */
    223 		last_code = ident;	/* Remember that this is the code we
    224 					 * will return */
    225 
    226 		/*
    227 		 * This loop will check if the token is a keyword.
    228 		 */
    229 		for (p = specials; (j = p->rwd) != 0; p++) {
    230 			char   *p = s_token;	/* point at scanned token */
    231 			if (*j++ != *p++ || *j++ != *p++)
    232 				continue;	/* This test depends on the
    233 						 * fact that identifiers are
    234 						 * always at least 1 character
    235 						 * long (ie. the first two
    236 						 * bytes of the identifier are
    237 						 * always meaningful) */
    238 			if (p[-1] == 0)
    239 				break;	/* If its a one-character identifier */
    240 			while (*p++ == *j)
    241 				if (*j++ == 0)
    242 					goto found_keyword;	/* I wish that C had a
    243 								 * multi-level break... */
    244 		}
    245 		if (p->rwd) {	/* we have a keyword */
    246 	found_keyword:
    247 			ps.its_a_keyword = true;
    248 			ps.last_u_d = true;
    249 			switch (p->rwcode) {
    250 			case 1:/* it is a switch */
    251 				return (swstmt);
    252 			case 2:/* a case or default */
    253 				return (casestmt);
    254 
    255 			case 3:/* a "struct" */
    256 				if (ps.p_l_follow)
    257 					break;	/* inside parens: cast */
    258 				l_struct = true;
    259 
    260 				/*
    261 				 * Next time around, we will want to know that we have had a
    262 				 * 'struct'
    263 				 */
    264 			case 4:/* one of the declaration keywords */
    265 				if (ps.p_l_follow) {
    266 					ps.cast_mask |= 1 << ps.p_l_follow;
    267 					break;	/* inside parens: cast */
    268 				}
    269 				last_code = decl;
    270 				return (decl);
    271 
    272 			case 5:/* if, while, for */
    273 				return (sp_paren);
    274 
    275 			case 6:/* do, else */
    276 				return (sp_nparen);
    277 
    278 			case 7:
    279 				ps.sizeof_keyword = true;
    280 			default:	/* all others are treated like any
    281 					 * other identifier */
    282 				return (ident);
    283 			}	/* end of switch */
    284 		}		/* end of if (found_it) */
    285 		if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
    286 			char   *tp = buf_ptr;
    287 			while (tp < buf_end)
    288 				if (*tp++ == ')' && (*tp == ';' || *tp == ','))
    289 					goto not_proc;
    290 			strncpy(ps.procname, token, sizeof ps.procname - 1);
    291 			ps.in_parameter_declaration = 1;
    292 			rparen_count = 1;
    293 	not_proc:	;
    294 		}
    295 		/*
    296 		 * The following hack attempts to guess whether or not the current
    297 		 * token is in fact a declaration keyword -- one that has been
    298 		 * typedefd
    299 		 */
    300 		if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
    301 		    && !ps.p_l_follow
    302 		    && !ps.block_init
    303 		    && (ps.last_token == rparen || ps.last_token == semicolon ||
    304 			ps.last_token == decl ||
    305 			ps.last_token == lbrace || ps.last_token == rbrace)) {
    306 			ps.its_a_keyword = true;
    307 			ps.last_u_d = true;
    308 			last_code = decl;
    309 			return decl;
    310 		}
    311 		if (last_code == decl)	/* if this is a declared variable,
    312 					 * then following sign is unary */
    313 			ps.last_u_d = true;	/* will make "int a -1" work */
    314 		last_code = ident;
    315 		return (ident);	/* the ident is not in the list */
    316 	}			/* end of procesing for alpanum character */
    317 	/* Scan a non-alphanumeric token */
    318 	*e_token++ = *buf_ptr;	/* if it is only a one-character token, it is
    319 				 * moved here */
    320 	*e_token = '\0';
    321 	if (++buf_ptr >= buf_end)
    322 		fill_buffer();
    323 
    324 	switch (*token) {
    325 	case '\n':
    326 		unary_delim = ps.last_u_d;
    327 		ps.last_nl = true;	/* remember that we just had a newline */
    328 		code = (had_eof ? 0 : newline);
    329 
    330 		/*
    331 		 * if data has been exausted, the newline is a dummy, and we should
    332 		 * return code to stop
    333 		 */
    334 		break;
    335 
    336 	case '\'':		/* start of quoted character */
    337 	case '"':		/* start of string */
    338 		qchar = *token;
    339 		if (troff) {
    340 			e_token[-1] = '`';
    341 			if (qchar == '"')
    342 				*e_token++ = '`';
    343 			e_token = chfont(&bodyf, &stringf, e_token);
    344 		}
    345 		do {		/* copy the string */
    346 			while (1) {	/* move one character or
    347 					 * [/<char>]<char> */
    348 				if (*buf_ptr == '\n') {
    349 					printf("%d: Unterminated literal\n", line_no);
    350 					goto stop_lit;
    351 				}
    352 				CHECK_SIZE_TOKEN;	/* Only have to do this
    353 							 * once in this loop,
    354 							 * since CHECK_SIZE
    355 							 * guarantees that there
    356 							 * are at least 5
    357 							 * entries left */
    358 				*e_token = *buf_ptr++;
    359 				if (buf_ptr >= buf_end)
    360 					fill_buffer();
    361 				if (*e_token == BACKSLASH) {	/* if escape, copy extra
    362 								 * char */
    363 					if (*buf_ptr == '\n')	/* check for escaped
    364 								 * newline */
    365 						++line_no;
    366 					if (troff) {
    367 						*++e_token = BACKSLASH;
    368 						if (*buf_ptr == BACKSLASH)
    369 							*++e_token = BACKSLASH;
    370 					}
    371 					*++e_token = *buf_ptr++;
    372 					++e_token;	/* we must increment
    373 							 * this again because we
    374 							 * copied two chars */
    375 					if (buf_ptr >= buf_end)
    376 						fill_buffer();
    377 				} else
    378 					break;	/* we copied one character */
    379 			}	/* end of while (1) */
    380 		} while (*e_token++ != qchar);
    381 		if (troff) {
    382 			e_token = chfont(&stringf, &bodyf, e_token - 1);
    383 			if (qchar == '"')
    384 				*e_token++ = '\'';
    385 		}
    386 stop_lit:
    387 		code = ident;
    388 		break;
    389 
    390 	case ('('):
    391 	case ('['):
    392 		unary_delim = true;
    393 		code = lparen;
    394 		break;
    395 
    396 	case (')'):
    397 	case (']'):
    398 		code = rparen;
    399 		break;
    400 
    401 	case '#':
    402 		unary_delim = ps.last_u_d;
    403 		code = preesc;
    404 		break;
    405 
    406 	case '?':
    407 		unary_delim = true;
    408 		code = question;
    409 		break;
    410 
    411 	case (':'):
    412 		code = colon;
    413 		unary_delim = true;
    414 		break;
    415 
    416 	case (';'):
    417 		unary_delim = true;
    418 		code = semicolon;
    419 		break;
    420 
    421 	case ('{'):
    422 		unary_delim = true;
    423 
    424 		/*
    425 		 * if (ps.in_or_st) ps.block_init = 1;
    426 		 */
    427 		/* ?	code = ps.block_init ? lparen : lbrace; */
    428 		code = lbrace;
    429 		break;
    430 
    431 	case ('}'):
    432 		unary_delim = true;
    433 		/* ?	code = ps.block_init ? rparen : rbrace; */
    434 		code = rbrace;
    435 		break;
    436 
    437 	case 014:		/* a form feed */
    438 		unary_delim = ps.last_u_d;
    439 		ps.last_nl = true;	/* remember this so we can set
    440 					 * 'ps.col_1' right */
    441 		code = form_feed;
    442 		break;
    443 
    444 	case (','):
    445 		unary_delim = true;
    446 		code = comma;
    447 		break;
    448 
    449 	case '.':
    450 		unary_delim = false;
    451 		code = period;
    452 		break;
    453 
    454 	case '-':
    455 	case '+':		/* check for -, +, --, ++ */
    456 		code = (ps.last_u_d ? unary_op : binary_op);
    457 		unary_delim = true;
    458 
    459 		if (*buf_ptr == token[0]) {
    460 			/* check for doubled character */
    461 			*e_token++ = *buf_ptr++;
    462 			/* buffer overflow will be checked at end of loop */
    463 			if (last_code == ident || last_code == rparen) {
    464 				code = (ps.last_u_d ? unary_op : postop);
    465 				/* check for following ++ or -- */
    466 				unary_delim = false;
    467 			}
    468 		} else
    469 			if (*buf_ptr == '=')
    470 				/* check for operator += */
    471 				*e_token++ = *buf_ptr++;
    472 			else
    473 				if (*buf_ptr == '>') {
    474 					/* check for operator -> */
    475 					*e_token++ = *buf_ptr++;
    476 					if (!pointer_as_binop) {
    477 						unary_delim = false;
    478 						code = unary_op;
    479 						ps.want_blank = false;
    480 					}
    481 				}
    482 		break;		/* buffer overflow will be checked at end of
    483 				 * switch */
    484 
    485 	case '=':
    486 		if (ps.in_or_st)
    487 			ps.block_init = 1;
    488 #ifdef undef
    489 		if (chartype[*buf_ptr] == opchar) {	/* we have two char
    490 							 * assignment */
    491 			e_token[-1] = *buf_ptr++;
    492 			if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
    493 				*e_token++ = *buf_ptr++;
    494 			*e_token++ = '=';	/* Flip =+ to += */
    495 			*e_token = 0;
    496 		}
    497 #else
    498 		if (*buf_ptr == '=') {	/* == */
    499 			*e_token++ = '=';	/* Flip =+ to += */
    500 			buf_ptr++;
    501 			*e_token = 0;
    502 		}
    503 #endif
    504 		code = binary_op;
    505 		unary_delim = true;
    506 		break;
    507 		/* can drop thru!!! */
    508 
    509 	case '>':
    510 	case '<':
    511 	case '!':		/* ops like <, <<, <=, !=, etc */
    512 		if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
    513 			*e_token++ = *buf_ptr;
    514 			if (++buf_ptr >= buf_end)
    515 				fill_buffer();
    516 		}
    517 		if (*buf_ptr == '=')
    518 			*e_token++ = *buf_ptr++;
    519 		code = (ps.last_u_d ? unary_op : binary_op);
    520 		unary_delim = true;
    521 		break;
    522 
    523 	default:
    524 		if (token[0] == '/' && *buf_ptr == '*') {
    525 			/* it is start of comment */
    526 			*e_token++ = '*';
    527 
    528 			if (++buf_ptr >= buf_end)
    529 				fill_buffer();
    530 
    531 			code = comment;
    532 			unary_delim = ps.last_u_d;
    533 			break;
    534 		}
    535 		while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
    536 			/*
    537 		         * handle ||, &&, etc, and also things as in int *****i
    538 		         */
    539 			*e_token++ = *buf_ptr;
    540 			if (++buf_ptr >= buf_end)
    541 				fill_buffer();
    542 		}
    543 		code = (ps.last_u_d ? unary_op : binary_op);
    544 		unary_delim = true;
    545 
    546 
    547 	}			/* end of switch */
    548 	if (code != newline) {
    549 		l_struct = false;
    550 		last_code = code;
    551 	}
    552 	if (buf_ptr >= buf_end)	/* check for input buffer empty */
    553 		fill_buffer();
    554 	ps.last_u_d = unary_delim;
    555 	*e_token = '\0';	/* null terminate the token */
    556 	return (code);
    557 }
    558 /*
    559  * Add the given keyword to the keyword table, using val as the keyword type
    560  */
    561 void
    562 addkey(key, val)
    563 	char   *key;
    564 	int     val;
    565 {
    566 	struct templ *p = specials;
    567 	while (p->rwd)
    568 		if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
    569 			return;
    570 		else
    571 			p++;
    572 	if (p >= specials + sizeof specials / sizeof specials[0])
    573 		return;		/* For now, table overflows are silently
    574 				 * ignored */
    575 	p->rwd = key;
    576 	p->rwcode = val;
    577 	p[1].rwd = 0;
    578 	p[1].rwcode = 0;
    579 }
    580