Home | History | Annotate | Line # | Download | only in indent
lexi.c revision 1.9
      1 /*	$NetBSD: lexi.c,v 1.9 1999/03/15 20:28:45 kristerw Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1980, 1993
      5  *	The Regents of the University of California.  All rights reserved.
      6  * Copyright (c) 1976 Board of Trustees of the University of Illinois.
      7  * Copyright (c) 1985 Sun Microsystems, Inc.
      8  * All rights reserved.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  * 3. All advertising materials mentioning features or use of this software
     19  *    must display the following acknowledgement:
     20  *	This product includes software developed by the University of
     21  *	California, Berkeley and its contributors.
     22  * 4. Neither the name of the University nor the names of its contributors
     23  *    may be used to endorse or promote products derived from this software
     24  *    without specific prior written permission.
     25  *
     26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     36  * SUCH DAMAGE.
     37  */
     38 
     39 #include <sys/cdefs.h>
     40 #ifndef lint
     41 #if 0
     42 static char sccsid[] = "@(#)lexi.c	8.1 (Berkeley) 6/6/93";
     43 #else
     44 __RCSID("$NetBSD: lexi.c,v 1.9 1999/03/15 20:28:45 kristerw Exp $");
     45 #endif
     46 #endif				/* not lint */
     47 
     48 /*
     49  * Here we have the token scanner for indent.  It scans off one token and puts
     50  * it in the global variable "token".  It returns a code, indicating the type
     51  * of token scanned.
     52  */
     53 
     54 #include <stdio.h>
     55 #include <ctype.h>
     56 #include <stdlib.h>
     57 #include <string.h>
     58 #include "indent_globs.h"
     59 #include "indent_codes.h"
     60 
/* Character classes stored in chartype[] and tested by the scanner. */
#define alphanum	1	/* character may appear in a word or number */
#define opchar		3	/* character may appear in an operator */
     63 
/*
 * One entry of the reserved-word table.  lexi() dispatches on rwcode:
 *	1	switch
 *	2	case, default
 *	3	struct, union, enum
 *	4	declaration keywords
 *	5	if, while, for
 *	6	else, do
 *	7	sizeof
 * Any other code (0 here) is treated like an ordinary identifier.
 */
struct templ {
	char   *rwd;		/* keyword text; a null pointer ends the table */
	int     rwcode;		/* keyword class, see above */
};

/*
 * The reserved-word table.  It is terminated by an entry whose rwd is 0 and
 * is sized generously so that addkey() can append further keywords at run
 * time.
 */
struct templ specials[1000] =
{
	{"switch", 1},
	{"case", 2},
	{"break", 0},
	{"struct", 3},
	{"union", 3},
	{"enum", 3},
	{"default", 2},
	{"int", 4},
	{"char", 4},
	{"float", 4},
	{"double", 4},
	{"long", 4},
	{"short", 4},
	{"typedef", 4},		/* was misspelled "typdef" and never matched */
	{"unsigned", 4},
	{"register", 4},
	{"static", 4},
	{"global", 4},
	{"extern", 4},
	{"void", 4},
	{"goto", 0},
	{"return", 0},
	{"if", 5},
	{"while", 5},
	{"for", 5},
	{"else", 6},
	{"do", 6},
	{"sizeof", 7},
	{0, 0}
};
    101 
/*
 * Character class table, indexed by 7-bit character code.  An entry is
 * 1 (alphanum) for letters, digits, '_' and '$', 3 (opchar) for characters
 * that can form operators, and 0 for everything else.
 */
char    chartype[128] =
{
	/* 0x00 - 0x0f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 - 0x1f: control characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 - 0x2f: space and punctuation up to the slash */
	0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,
	/* 0x30 - 0x3f: digits, then colon through question mark */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,
	/* 0x40 - 0x4f: at sign, then upper case A to O */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 - 0x5f: upper case P to Z, brackets, caret, underscore */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,
	/* 0x60 - 0x6f: back quote, then lower case a to o */
	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 - 0x7f: lower case p to z, braces, bar, tilde, DEL */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0
};
    123 
    124 
    125 
    126 
    127 int
    128 lexi()
    129 {
    130 	int     unary_delim;	/* this is set to 1 if the current token
    131 				 *
    132 				 * forces a following operator to be unary */
    133 	static int last_code;	/* the last token type returned */
    134 	static int l_struct;	/* set to 1 if the last token was 'struct' */
    135 	int     code;		/* internal code to be returned */
    136 	char    qchar;		/* the delimiter character for a string */
    137 
    138 	e_token = s_token;	/* point to start of place to save token */
    139 	unary_delim = false;
    140 	ps.col_1 = ps.last_nl;	/* tell world that this token started in
    141 				 * column 1 iff the last thing scanned was nl */
    142 	ps.last_nl = false;
    143 
    144 	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
    145 		ps.col_1 = false;	/* leading blanks imply token is not
    146 					 * in column 1 */
    147 		if (++buf_ptr >= buf_end)
    148 			fill_buffer();
    149 	}
    150 
    151 	/* Scan an alphanumeric token */
    152 	if (chartype[(int) *buf_ptr] == alphanum ||
    153 	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
    154 		/*
    155 		 * we have a character or number
    156 		 */
    157 		char   *j;	/* used for searching thru list of
    158 				 *
    159 				 * reserved words */
    160 		struct templ *p;
    161 
    162 		if (isdigit((unsigned char)*buf_ptr) ||
    163 		    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
    164 			int     seendot = 0, seenexp = 0;
    165 			if (*buf_ptr == '0' &&
    166 			    (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
    167 				*e_token++ = *buf_ptr++;
    168 				*e_token++ = *buf_ptr++;
    169 				while (isxdigit((unsigned char)*buf_ptr)) {
    170 					CHECK_SIZE_TOKEN;
    171 					*e_token++ = *buf_ptr++;
    172 				}
    173 			} else {
    174 				while (1) {
    175 					if (*buf_ptr == '.') {
    176 						if (seendot)
    177 							break;
    178 						else
    179 							seendot++;
    180 					}
    181 					CHECK_SIZE_TOKEN;
    182 					*e_token++ = *buf_ptr++;
    183 					if (!isdigit((unsigned char)*buf_ptr)
    184 					&& *buf_ptr != '.') {
    185 						if ((*buf_ptr != 'E'
    186 						&& *buf_ptr != 'e') || seenexp)
    187 							break;
    188 						else {
    189 							seenexp++;
    190 							seendot++;
    191 							CHECK_SIZE_TOKEN;
    192 							*e_token++ = *buf_ptr++;
    193 							if (*buf_ptr == '+' || *buf_ptr == '-')
    194 								*e_token++ = *buf_ptr++;
    195 						}
    196 					}
    197 				}
    198 			}
    199 			if (*buf_ptr == 'F' || *buf_ptr == 'f') {
    200 				/* float constant */
    201 				*e_token++ = *buf_ptr++;
    202 			} else {
    203 				/* integer constant (U, L, UL, LL, ULL) */
    204 				if (*buf_ptr == 'U' || *buf_ptr == 'u')
    205 					*e_token++ = *buf_ptr++;
    206 				if (*buf_ptr == 'L' || *buf_ptr == 'l')
    207 					*e_token++ = *buf_ptr++;
    208 				if (*buf_ptr == 'L' || *buf_ptr == 'l')
    209 					*e_token++ = *buf_ptr++;
    210 			}
    211 		} else
    212 			while (chartype[(int) *buf_ptr] == alphanum) {	/* copy it over */
    213 				CHECK_SIZE_TOKEN;
    214 				*e_token++ = *buf_ptr++;
    215 				if (buf_ptr >= buf_end)
    216 					fill_buffer();
    217 			}
    218 		*e_token++ = '\0';
    219 		while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
    220 			if (++buf_ptr >= buf_end)
    221 				fill_buffer();
    222 		}
    223 		ps.its_a_keyword = false;
    224 		ps.sizeof_keyword = false;
    225 		if (l_struct) {	/* if last token was 'struct', then this token
    226 				 * should be treated as a declaration */
    227 			l_struct = false;
    228 			last_code = ident;
    229 			ps.last_u_d = true;
    230 			return (decl);
    231 		}
    232 		ps.last_u_d = false;	/* Operator after indentifier is
    233 					 * binary */
    234 		last_code = ident;	/* Remember that this is the code we
    235 					 * will return */
    236 
    237 		/*
    238 		 * This loop will check if the token is a keyword.
    239 		 */
    240 		for (p = specials; (j = p->rwd) != 0; p++) {
    241 			char   *p = s_token;	/* point at scanned token */
    242 			if (*j++ != *p++ || *j++ != *p++)
    243 				continue;	/* This test depends on the
    244 						 * fact that identifiers are
    245 						 * always at least 1 character
    246 						 * long (ie. the first two
    247 						 * bytes of the identifier are
    248 						 * always meaningful) */
    249 			if (p[-1] == 0)
    250 				break;	/* If its a one-character identifier */
    251 			while (*p++ == *j)
    252 				if (*j++ == 0)
    253 					goto found_keyword;	/* I wish that C had a
    254 								 * multi-level break... */
    255 		}
    256 		if (p->rwd) {	/* we have a keyword */
    257 	found_keyword:
    258 			ps.its_a_keyword = true;
    259 			ps.last_u_d = true;
    260 			switch (p->rwcode) {
    261 			case 1:/* it is a switch */
    262 				return (swstmt);
    263 			case 2:/* a case or default */
    264 				return (casestmt);
    265 
    266 			case 3:/* a "struct" */
    267 				if (ps.p_l_follow)
    268 					break;	/* inside parens: cast */
    269 				l_struct = true;
    270 
    271 				/*
    272 				 * Next time around, we will want to know that we have had a
    273 				 * 'struct'
    274 				 */
    275 			case 4:/* one of the declaration keywords */
    276 				if (ps.p_l_follow) {
    277 					ps.cast_mask |= 1 << ps.p_l_follow;
    278 					break;	/* inside parens: cast */
    279 				}
    280 				last_code = decl;
    281 				return (decl);
    282 
    283 			case 5:/* if, while, for */
    284 				return (sp_paren);
    285 
    286 			case 6:/* do, else */
    287 				return (sp_nparen);
    288 
    289 			case 7:
    290 				ps.sizeof_keyword = true;
    291 			default:	/* all others are treated like any
    292 					 * other identifier */
    293 				return (ident);
    294 			}	/* end of switch */
    295 		}		/* end of if (found_it) */
    296 		if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
    297 			char   *tp = buf_ptr;
    298 			while (tp < buf_end)
    299 				if (*tp++ == ')' && (*tp == ';' || *tp == ','))
    300 					goto not_proc;
    301 			strncpy(ps.procname, token, sizeof ps.procname - 1);
    302 			ps.in_parameter_declaration = 1;
    303 			rparen_count = 1;
    304 	not_proc:	;
    305 		}
    306 		/*
    307 		 * The following hack attempts to guess whether or not the current
    308 		 * token is in fact a declaration keyword -- one that has been
    309 		 * typedefd
    310 		 */
    311 		if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
    312 		    isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
    313 		    && !ps.p_l_follow
    314 		    && !ps.block_init
    315 		    && (ps.last_token == rparen || ps.last_token == semicolon ||
    316 			ps.last_token == decl ||
    317 			ps.last_token == lbrace || ps.last_token == rbrace)) {
    318 			ps.its_a_keyword = true;
    319 			ps.last_u_d = true;
    320 			last_code = decl;
    321 			return decl;
    322 		}
    323 		if (last_code == decl)	/* if this is a declared variable,
    324 					 * then following sign is unary */
    325 			ps.last_u_d = true;	/* will make "int a -1" work */
    326 		last_code = ident;
    327 		return (ident);	/* the ident is not in the list */
    328 	}			/* end of procesing for alpanum character */
    329 	/* Scan a non-alphanumeric token */
    330 	*e_token++ = *buf_ptr;	/* if it is only a one-character token, it is
    331 				 * moved here */
    332 	*e_token = '\0';
    333 	if (++buf_ptr >= buf_end)
    334 		fill_buffer();
    335 
    336 	switch (*token) {
    337 	case '\n':
    338 		unary_delim = ps.last_u_d;
    339 		ps.last_nl = true;	/* remember that we just had a newline */
    340 		code = (had_eof ? 0 : newline);
    341 
    342 		/*
    343 		 * if data has been exausted, the newline is a dummy, and we should
    344 		 * return code to stop
    345 		 */
    346 		break;
    347 
    348 	case '\'':		/* start of quoted character */
    349 	case '"':		/* start of string */
    350 		qchar = *token;
    351 		if (troff) {
    352 			e_token[-1] = '`';
    353 			if (qchar == '"')
    354 				*e_token++ = '`';
    355 			e_token = chfont(&bodyf, &stringf, e_token);
    356 		}
    357 		do {		/* copy the string */
    358 			while (1) {	/* move one character or
    359 					 * [/<char>]<char> */
    360 				if (*buf_ptr == '\n') {
    361 					printf("%d: Unterminated literal\n", line_no);
    362 					goto stop_lit;
    363 				}
    364 				CHECK_SIZE_TOKEN;	/* Only have to do this
    365 							 * once in this loop,
    366 							 * since CHECK_SIZE
    367 							 * guarantees that there
    368 							 * are at least 5
    369 							 * entries left */
    370 				*e_token = *buf_ptr++;
    371 				if (buf_ptr >= buf_end)
    372 					fill_buffer();
    373 				if (*e_token == BACKSLASH) {	/* if escape, copy extra
    374 								 * char */
    375 					if (*buf_ptr == '\n')	/* check for escaped
    376 								 * newline */
    377 						++line_no;
    378 					if (troff) {
    379 						*++e_token = BACKSLASH;
    380 						if (*buf_ptr == BACKSLASH)
    381 							*++e_token = BACKSLASH;
    382 					}
    383 					*++e_token = *buf_ptr++;
    384 					++e_token;	/* we must increment
    385 							 * this again because we
    386 							 * copied two chars */
    387 					if (buf_ptr >= buf_end)
    388 						fill_buffer();
    389 				} else
    390 					break;	/* we copied one character */
    391 			}	/* end of while (1) */
    392 		} while (*e_token++ != qchar);
    393 		if (troff) {
    394 			e_token = chfont(&stringf, &bodyf, e_token - 1);
    395 			if (qchar == '"')
    396 				*e_token++ = '\'';
    397 		}
    398 stop_lit:
    399 		code = ident;
    400 		break;
    401 
    402 	case ('('):
    403 	case ('['):
    404 		unary_delim = true;
    405 		code = lparen;
    406 		break;
    407 
    408 	case (')'):
    409 	case (']'):
    410 		code = rparen;
    411 		break;
    412 
    413 	case '#':
    414 		unary_delim = ps.last_u_d;
    415 		code = preesc;
    416 		break;
    417 
    418 	case '?':
    419 		unary_delim = true;
    420 		code = question;
    421 		break;
    422 
    423 	case (':'):
    424 		code = colon;
    425 		unary_delim = true;
    426 		break;
    427 
    428 	case (';'):
    429 		unary_delim = true;
    430 		code = semicolon;
    431 		break;
    432 
    433 	case ('{'):
    434 		unary_delim = true;
    435 
    436 		/*
    437 		 * if (ps.in_or_st) ps.block_init = 1;
    438 		 */
    439 		/* ?	code = ps.block_init ? lparen : lbrace; */
    440 		code = lbrace;
    441 		break;
    442 
    443 	case ('}'):
    444 		unary_delim = true;
    445 		/* ?	code = ps.block_init ? rparen : rbrace; */
    446 		code = rbrace;
    447 		break;
    448 
    449 	case 014:		/* a form feed */
    450 		unary_delim = ps.last_u_d;
    451 		ps.last_nl = true;	/* remember this so we can set
    452 					 * 'ps.col_1' right */
    453 		code = form_feed;
    454 		break;
    455 
    456 	case (','):
    457 		unary_delim = true;
    458 		code = comma;
    459 		break;
    460 
    461 	case '.':
    462 		unary_delim = false;
    463 		code = period;
    464 		break;
    465 
    466 	case '-':
    467 	case '+':		/* check for -, +, --, ++ */
    468 		code = (ps.last_u_d ? unary_op : binary_op);
    469 		unary_delim = true;
    470 
    471 		if (*buf_ptr == token[0]) {
    472 			/* check for doubled character */
    473 			*e_token++ = *buf_ptr++;
    474 			/* buffer overflow will be checked at end of loop */
    475 			if (last_code == ident || last_code == rparen) {
    476 				code = (ps.last_u_d ? unary_op : postop);
    477 				/* check for following ++ or -- */
    478 				unary_delim = false;
    479 			}
    480 		} else
    481 			if (*buf_ptr == '=')
    482 				/* check for operator += */
    483 				*e_token++ = *buf_ptr++;
    484 			else
    485 				if (*buf_ptr == '>') {
    486 					/* check for operator -> */
    487 					*e_token++ = *buf_ptr++;
    488 					if (!pointer_as_binop) {
    489 						unary_delim = false;
    490 						code = unary_op;
    491 						ps.want_blank = false;
    492 					}
    493 				}
    494 		break;		/* buffer overflow will be checked at end of
    495 				 * switch */
    496 
    497 	case '=':
    498 		if (ps.in_or_st)
    499 			ps.block_init = 1;
    500 #ifdef undef
    501 		if (chartype[*buf_ptr] == opchar) {	/* we have two char
    502 							 * assignment */
    503 			e_token[-1] = *buf_ptr++;
    504 			if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
    505 				*e_token++ = *buf_ptr++;
    506 			*e_token++ = '=';	/* Flip =+ to += */
    507 			*e_token = 0;
    508 		}
    509 #else
    510 		if (*buf_ptr == '=') {	/* == */
    511 			*e_token++ = '=';	/* Flip =+ to += */
    512 			buf_ptr++;
    513 			*e_token = 0;
    514 		}
    515 #endif
    516 		code = binary_op;
    517 		unary_delim = true;
    518 		break;
    519 		/* can drop thru!!! */
    520 
    521 	case '>':
    522 	case '<':
    523 	case '!':		/* ops like <, <<, <=, !=, etc */
    524 		if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
    525 			*e_token++ = *buf_ptr;
    526 			if (++buf_ptr >= buf_end)
    527 				fill_buffer();
    528 		}
    529 		if (*buf_ptr == '=')
    530 			*e_token++ = *buf_ptr++;
    531 		code = (ps.last_u_d ? unary_op : binary_op);
    532 		unary_delim = true;
    533 		break;
    534 
    535 	default:
    536 		if (token[0] == '/' && *buf_ptr == '*') {
    537 			/* it is start of comment */
    538 			*e_token++ = '*';
    539 
    540 			if (++buf_ptr >= buf_end)
    541 				fill_buffer();
    542 
    543 			code = comment;
    544 			unary_delim = ps.last_u_d;
    545 			break;
    546 		}
    547 		while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
    548 			/*
    549 		         * handle ||, &&, etc, and also things as in int *****i
    550 		         */
    551 			*e_token++ = *buf_ptr;
    552 			if (++buf_ptr >= buf_end)
    553 				fill_buffer();
    554 		}
    555 		code = (ps.last_u_d ? unary_op : binary_op);
    556 		unary_delim = true;
    557 
    558 
    559 	}			/* end of switch */
    560 	if (code != newline) {
    561 		l_struct = false;
    562 		last_code = code;
    563 	}
    564 	if (buf_ptr >= buf_end)	/* check for input buffer empty */
    565 		fill_buffer();
    566 	ps.last_u_d = unary_delim;
    567 	*e_token = '\0';	/* null terminate the token */
    568 	return (code);
    569 }
    570 /*
    571  * Add the given keyword to the keyword table, using val as the keyword type
    572  */
    573 void
    574 addkey(key, val)
    575 	char   *key;
    576 	int     val;
    577 {
    578 	struct templ *p = specials;
    579 	while (p->rwd)
    580 		if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
    581 			return;
    582 		else
    583 			p++;
    584 	if (p >= specials + sizeof specials / sizeof specials[0])
    585 		return;		/* For now, table overflows are silently
    586 				 * ignored */
    587 	p->rwd = key;
    588 	p->rwcode = val;
    589 	p[1].rwd = 0;
    590 	p[1].rwcode = 0;
    591 }
    592