Home | History | Annotate | Line # | Download | only in src
      1 /*	$NetBSD: nfa.c,v 1.3 2017/01/02 17:45:27 christos Exp $	*/
      2 
      3 /* nfa - NFA construction routines */
      4 
      5 /*  Copyright (c) 1990 The Regents of the University of California. */
      6 /*  All rights reserved. */
      7 
      8 /*  This code is derived from software contributed to Berkeley by */
      9 /*  Vern Paxson. */
     10 
     11 /*  The United States Government has rights in this work pursuant */
     12 /*  to contract no. DE-AC03-76SF00098 between the United States */
     13 /*  Department of Energy and the University of California. */
     14 
     15 /*  This file is part of flex. */
     16 
     17 /*  Redistribution and use in source and binary forms, with or without */
     18 /*  modification, are permitted provided that the following conditions */
     19 /*  are met: */
     20 
     21 /*  1. Redistributions of source code must retain the above copyright */
     22 /*     notice, this list of conditions and the following disclaimer. */
     23 /*  2. Redistributions in binary form must reproduce the above copyright */
     24 /*     notice, this list of conditions and the following disclaimer in the */
     25 /*     documentation and/or other materials provided with the distribution. */
     26 
     27 /*  Neither the name of the University nor the names of its contributors */
     28 /*  may be used to endorse or promote products derived from this software */
     29 /*  without specific prior written permission. */
     30 
     31 /*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
     32 /*  IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
     33 /*  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
     34 /*  PURPOSE. */
     35 #include "flexdef.h"
     36 __RCSID("$NetBSD: nfa.c,v 1.3 2017/01/02 17:45:27 christos Exp $");
     37 
     38 
     39 
     40 /* declare functions that have forward references */
     41 
     42 int	dupmachine(int);
     43 void	mkxtion(int, int);
     44 
     45 
     46 /* add_accept - add an accepting state to a machine
     47  *
     48  * accepting_number becomes mach's accepting number.
     49  */
     50 
     51 void    add_accept (int mach, int accepting_number)
     52 {
     53 	/* Hang the accepting number off an epsilon state.  if it is associated
     54 	 * with a state that has a non-epsilon out-transition, then the state
     55 	 * will accept BEFORE it makes that transition, i.e., one character
     56 	 * too soon.
     57 	 */
     58 
     59 	if (transchar[finalst[mach]] == SYM_EPSILON)
     60 		accptnum[finalst[mach]] = accepting_number;
     61 
     62 	else {
     63 		int     astate = mkstate (SYM_EPSILON);
     64 
     65 		accptnum[astate] = accepting_number;
     66 		(void) link_machines (mach, astate);
     67 	}
     68 }
     69 
     70 
     71 /* copysingl - make a given number of copies of a singleton machine
     72  *
     73  * synopsis
     74  *
     75  *   newsng = copysingl( singl, num );
     76  *
     77  *     newsng - a new singleton composed of num copies of singl
     78  *     singl  - a singleton machine
     79  *     num    - the number of copies of singl to be present in newsng
     80  */
     81 
     82 int     copysingl (int singl, int num)
     83 {
     84 	int     copy, i;
     85 
     86 	copy = mkstate (SYM_EPSILON);
     87 
     88 	for (i = 1; i <= num; ++i)
     89 		copy = link_machines (copy, dupmachine (singl));
     90 
     91 	return copy;
     92 }
     93 
     94 
     95 /* dumpnfa - debugging routine to write out an nfa */
     96 
     97 void    dumpnfa (int state1)
     98 {
     99 	int     sym, tsp1, tsp2, anum, ns;
    100 
    101 	fprintf (stderr,
    102 		 _
    103 		 ("\n\n********** beginning dump of nfa with start state %d\n"),
    104 		 state1);
    105 
    106 	/* We probably should loop starting at firstst[state1] and going to
    107 	 * lastst[state1], but they're not maintained properly when we "or"
    108 	 * all of the rules together.  So we use our knowledge that the machine
    109 	 * starts at state 1 and ends at lastnfa.
    110 	 */
    111 
    112 	/* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
    113 	for (ns = 1; ns <= lastnfa; ++ns) {
    114 		fprintf (stderr, _("state # %4d\t"), ns);
    115 
    116 		sym = transchar[ns];
    117 		tsp1 = trans1[ns];
    118 		tsp2 = trans2[ns];
    119 		anum = accptnum[ns];
    120 
    121 		fprintf (stderr, "%3d:  %4d, %4d", sym, tsp1, tsp2);
    122 
    123 		if (anum != NIL)
    124 			fprintf (stderr, "  [%d]", anum);
    125 
    126 		fprintf (stderr, "\n");
    127 	}
    128 
    129 	fprintf (stderr, _("********** end of dump\n"));
    130 }
    131 
    132 
    133 /* dupmachine - make a duplicate of a given machine
    134  *
    135  * synopsis
    136  *
    137  *   copy = dupmachine( mach );
    138  *
    139  *     copy - holds duplicate of mach
    140  *     mach - machine to be duplicated
    141  *
    142  * note that the copy of mach is NOT an exact duplicate; rather, all the
    143  * transition states values are adjusted so that the copy is self-contained,
    144  * as the original should have been.
    145  *
    146  * also note that the original MUST be contiguous, with its low and high
    147  * states accessible by the arrays firstst and lastst
    148  */
    149 
    150 int     dupmachine (int mach)
    151 {
    152 	int     i, init, state_offset;
    153 	int     state = 0;
    154 	int     last = lastst[mach];
    155 
    156 	for (i = firstst[mach]; i <= last; ++i) {
    157 		state = mkstate (transchar[i]);
    158 
    159 		if (trans1[i] != NO_TRANSITION) {
    160 			mkxtion (finalst[state], trans1[i] + state - i);
    161 
    162 			if (transchar[i] == SYM_EPSILON &&
    163 			    trans2[i] != NO_TRANSITION)
    164 					mkxtion (finalst[state],
    165 						 trans2[i] + state - i);
    166 		}
    167 
    168 		accptnum[state] = accptnum[i];
    169 	}
    170 
    171 	if (state == 0)
    172 		flexfatal (_("empty machine in dupmachine()"));
    173 
    174 	state_offset = state - i + 1;
    175 
    176 	init = mach + state_offset;
    177 	firstst[init] = firstst[mach] + state_offset;
    178 	finalst[init] = finalst[mach] + state_offset;
    179 	lastst[init] = lastst[mach] + state_offset;
    180 
    181 	return init;
    182 }
    183 
    184 
    185 /* finish_rule - finish up the processing for a rule
    186  *
    187  * An accepting number is added to the given machine.  If variable_trail_rule
    188  * is true then the rule has trailing context and both the head and trail
    189  * are variable size.  Otherwise if headcnt or trailcnt is non-zero then
    190  * the machine recognizes a pattern with trailing context and headcnt is
    191  * the number of characters in the matched part of the pattern, or zero
    192  * if the matched part has variable length.  trailcnt is the number of
    193  * trailing context characters in the pattern, or zero if the trailing
    194  * context has variable length.
    195  */
    196 
    197 void    finish_rule (int mach, int variable_trail_rule, int headcnt, int trailcnt,
    198 		     int pcont_act)
    199 {
    200 	char    action_text[MAXLINE];
    201 
    202 	add_accept (mach, num_rules);
    203 
    204 	/* We did this in new_rule(), but it often gets the wrong
    205 	 * number because we do it before we start parsing the current rule.
    206 	 */
    207 	rule_linenum[num_rules] = linenum;
    208 
    209 	/* If this is a continued action, then the line-number has already
    210 	 * been updated, giving us the wrong number.
    211 	 */
    212 	if (continued_action)
    213 		--rule_linenum[num_rules];
    214 
    215 
    216 	/* If the previous rule was continued action, then we inherit the
    217 	 * previous newline flag, possibly overriding the current one.
    218 	 */
    219 	if (pcont_act && rule_has_nl[num_rules - 1])
    220 		rule_has_nl[num_rules] = true;
    221 
    222 	snprintf (action_text, sizeof(action_text), "case %d:\n", num_rules);
    223 	add_action (action_text);
    224 	if (rule_has_nl[num_rules]) {
    225 		snprintf (action_text, sizeof(action_text), "/* rule %d can match eol */\n",
    226 			 num_rules);
    227 		add_action (action_text);
    228 	}
    229 
    230 
    231 	if (variable_trail_rule) {
    232 		rule_type[num_rules] = RULE_VARIABLE;
    233 
    234 		if (performance_report > 0)
    235 			fprintf (stderr,
    236 				 _
    237 				 ("Variable trailing context rule at line %d\n"),
    238 				 rule_linenum[num_rules]);
    239 
    240 		variable_trailing_context_rules = true;
    241 	}
    242 
    243 	else {
    244 		rule_type[num_rules] = RULE_NORMAL;
    245 
    246 		if (headcnt > 0 || trailcnt > 0) {
    247 			/* Do trailing context magic to not match the trailing
    248 			 * characters.
    249 			 */
    250 			char   *scanner_cp = "YY_G(yy_c_buf_p) = yy_cp";
    251 			char   *scanner_bp = "yy_bp";
    252 
    253 			add_action
    254 				("*yy_cp = YY_G(yy_hold_char); /* undo effects of setting up yytext */\n");
    255 
    256 			if (headcnt > 0) {
    257 				if (rule_has_nl[num_rules]) {
    258 					snprintf (action_text, sizeof(action_text),
    259 						"YY_LINENO_REWIND_TO(%s + %d);\n", scanner_bp, headcnt);
    260 					add_action (action_text);
    261 				}
    262 				snprintf (action_text, sizeof(action_text), "%s = %s + %d;\n",
    263 					 scanner_cp, scanner_bp, headcnt);
    264 				add_action (action_text);
    265 			}
    266 
    267 			else {
    268 				if (rule_has_nl[num_rules]) {
    269 					snprintf (action_text, sizeof(action_text),
    270 						 "YY_LINENO_REWIND_TO(yy_cp - %d);\n", trailcnt);
    271 					add_action (action_text);
    272 				}
    273 
    274 				snprintf (action_text, sizeof(action_text), "%s -= %d;\n",
    275 					 scanner_cp, trailcnt);
    276 				add_action (action_text);
    277 			}
    278 
    279 			add_action
    280 				("YY_DO_BEFORE_ACTION; /* set up yytext again */\n");
    281 		}
    282 	}
    283 
    284 	/* Okay, in the action code at this point yytext and yyleng have
    285 	 * their proper final values for this rule, so here's the point
    286 	 * to do any user action.  But don't do it for continued actions,
    287 	 * as that'll result in multiple YY_RULE_SETUP's.
    288 	 */
    289 	if (!continued_action)
    290 		add_action ("YY_RULE_SETUP\n");
    291 
    292 	line_directive_out(NULL, 1);
    293         add_action("[[");
    294 }
    295 
    296 
    297 /* link_machines - connect two machines together
    298  *
    299  * synopsis
    300  *
    301  *   new = link_machines( first, last );
    302  *
    303  *     new    - a machine constructed by connecting first to last
    304  *     first  - the machine whose successor is to be last
    305  *     last   - the machine whose predecessor is to be first
    306  *
    307  * note: this routine concatenates the machine first with the machine
    308  *  last to produce a machine new which will pattern-match first first
    309  *  and then last, and will fail if either of the sub-patterns fails.
    310  *  FIRST is set to new by the operation.  last is unmolested.
    311  */
    312 
    313 int     link_machines (int first, int last)
    314 {
    315 	if (first == NIL)
    316 		return last;
    317 
    318 	else if (last == NIL)
    319 		return first;
    320 
    321 	else {
    322 		mkxtion (finalst[first], last);
    323 		finalst[first] = finalst[last];
    324 		lastst[first] = MAX (lastst[first], lastst[last]);
    325 		firstst[first] = MIN (firstst[first], firstst[last]);
    326 
    327 		return first;
    328 	}
    329 }
    330 
    331 
    332 /* mark_beginning_as_normal - mark each "beginning" state in a machine
    333  *                            as being a "normal" (i.e., not trailing context-
    334  *                            associated) states
    335  *
    336  * The "beginning" states are the epsilon closure of the first state
    337  */
    338 
    339 void    mark_beginning_as_normal (int mach)
    340 {
    341 	switch (state_type[mach]) {
    342 	case STATE_NORMAL:
    343 		/* Oh, we've already visited here. */
    344 		return;
    345 
    346 	case STATE_TRAILING_CONTEXT:
    347 		state_type[mach] = STATE_NORMAL;
    348 
    349 		if (transchar[mach] == SYM_EPSILON) {
    350 			if (trans1[mach] != NO_TRANSITION)
    351 				mark_beginning_as_normal (trans1[mach]);
    352 
    353 			if (trans2[mach] != NO_TRANSITION)
    354 				mark_beginning_as_normal (trans2[mach]);
    355 		}
    356 		break;
    357 
    358 	default:
    359 		flexerror (_
    360 			   ("bad state type in mark_beginning_as_normal()"));
    361 		break;
    362 	}
    363 }
    364 
    365 
    366 /* mkbranch - make a machine that branches to two machines
    367  *
    368  * synopsis
    369  *
    370  *   branch = mkbranch( first, second );
    371  *
    372  *     branch - a machine which matches either first's pattern or second's
    373  *     first, second - machines whose patterns are to be or'ed (the | operator)
    374  *
    375  * Note that first and second are NEITHER destroyed by the operation.  Also,
    376  * the resulting machine CANNOT be used with any other "mk" operation except
    377  * more mkbranch's.  Compare with mkor()
    378  */
    379 
    380 int     mkbranch (int first, int second)
    381 {
    382 	int     eps;
    383 
    384 	if (first == NO_TRANSITION)
    385 		return second;
    386 
    387 	else if (second == NO_TRANSITION)
    388 		return first;
    389 
    390 	eps = mkstate (SYM_EPSILON);
    391 
    392 	mkxtion (eps, first);
    393 	mkxtion (eps, second);
    394 
    395 	return eps;
    396 }
    397 
    398 
    399 /* mkclos - convert a machine into a closure
    400  *
    401  * synopsis
    402  *   new = mkclos( state );
    403  *
    404  * new - a new state which matches the closure of "state"
    405  */
    406 
    407 int     mkclos (int state)
    408 {
    409 	return mkopt (mkposcl (state));
    410 }
    411 
    412 
    413 /* mkopt - make a machine optional
    414  *
    415  * synopsis
    416  *
    417  *   new = mkopt( mach );
    418  *
    419  *     new  - a machine which optionally matches whatever mach matched
    420  *     mach - the machine to make optional
    421  *
    422  * notes:
    423  *     1. mach must be the last machine created
    424  *     2. mach is destroyed by the call
    425  */
    426 
    427 int     mkopt (int mach)
    428 {
    429 	int     eps;
    430 
    431 	if (!SUPER_FREE_EPSILON (finalst[mach])) {
    432 		eps = mkstate (SYM_EPSILON);
    433 		mach = link_machines (mach, eps);
    434 	}
    435 
    436 	/* Can't skimp on the following if FREE_EPSILON(mach) is true because
    437 	 * some state interior to "mach" might point back to the beginning
    438 	 * for a closure.
    439 	 */
    440 	eps = mkstate (SYM_EPSILON);
    441 	mach = link_machines (eps, mach);
    442 
    443 	mkxtion (mach, finalst[mach]);
    444 
    445 	return mach;
    446 }
    447 
    448 
    449 /* mkor - make a machine that matches either one of two machines
    450  *
    451  * synopsis
    452  *
    453  *   new = mkor( first, second );
    454  *
    455  *     new - a machine which matches either first's pattern or second's
    456  *     first, second - machines whose patterns are to be or'ed (the | operator)
    457  *
    458  * note that first and second are both destroyed by the operation
    459  * the code is rather convoluted because an attempt is made to minimize
    460  * the number of epsilon states needed
    461  */
    462 
    463 int     mkor (int first, int second)
    464 {
    465 	int     eps, orend;
    466 
    467 	if (first == NIL)
    468 		return second;
    469 
    470 	else if (second == NIL)
    471 		return first;
    472 
    473 	else {
    474 		/* See comment in mkopt() about why we can't use the first
    475 		 * state of "first" or "second" if they satisfy "FREE_EPSILON".
    476 		 */
    477 		eps = mkstate (SYM_EPSILON);
    478 
    479 		first = link_machines (eps, first);
    480 
    481 		mkxtion (first, second);
    482 
    483 		if (SUPER_FREE_EPSILON (finalst[first]) &&
    484 		    accptnum[finalst[first]] == NIL) {
    485 			orend = finalst[first];
    486 			mkxtion (finalst[second], orend);
    487 		}
    488 
    489 		else if (SUPER_FREE_EPSILON (finalst[second]) &&
    490 			 accptnum[finalst[second]] == NIL) {
    491 			orend = finalst[second];
    492 			mkxtion (finalst[first], orend);
    493 		}
    494 
    495 		else {
    496 			eps = mkstate (SYM_EPSILON);
    497 
    498 			first = link_machines (first, eps);
    499 			orend = finalst[first];
    500 
    501 			mkxtion (finalst[second], orend);
    502 		}
    503 	}
    504 
    505 	finalst[first] = orend;
    506 	return first;
    507 }
    508 
    509 
    510 /* mkposcl - convert a machine into a positive closure
    511  *
    512  * synopsis
    513  *   new = mkposcl( state );
    514  *
    515  *    new - a machine matching the positive closure of "state"
    516  */
    517 
    518 int     mkposcl (int state)
    519 {
    520 	int     eps;
    521 
    522 	if (SUPER_FREE_EPSILON (finalst[state])) {
    523 		mkxtion (finalst[state], state);
    524 		return state;
    525 	}
    526 
    527 	else {
    528 		eps = mkstate (SYM_EPSILON);
    529 		mkxtion (eps, state);
    530 		return link_machines (state, eps);
    531 	}
    532 }
    533 
    534 
    535 /* mkrep - make a replicated machine
    536  *
    537  * synopsis
    538  *   new = mkrep( mach, lb, ub );
    539  *
    540  *    new - a machine that matches whatever "mach" matched from "lb"
    541  *          number of times to "ub" number of times
    542  *
    543  * note
    544  *   if "ub" is INFINITE_REPEAT then "new" matches "lb" or more occurrences of "mach"
    545  */
    546 
    547 int     mkrep (int mach, int lb, int ub)
    548 {
    549 	int     base_mach, tail, copy, i;
    550 
    551 	base_mach = copysingl (mach, lb - 1);
    552 
    553 	if (ub == INFINITE_REPEAT) {
    554 		copy = dupmachine (mach);
    555 		mach = link_machines (mach,
    556 				      link_machines (base_mach,
    557 						     mkclos (copy)));
    558 	}
    559 
    560 	else {
    561 		tail = mkstate (SYM_EPSILON);
    562 
    563 		for (i = lb; i < ub; ++i) {
    564 			copy = dupmachine (mach);
    565 			tail = mkopt (link_machines (copy, tail));
    566 		}
    567 
    568 		mach =
    569 			link_machines (mach,
    570 				       link_machines (base_mach, tail));
    571 	}
    572 
    573 	return mach;
    574 }
    575 
    576 
    577 /* mkstate - create a state with a transition on a given symbol
    578  *
    579  * synopsis
    580  *
    581  *   state = mkstate( sym );
    582  *
    583  *     state - a new state matching sym
    584  *     sym   - the symbol the new state is to have an out-transition on
    585  *
    586  * note that this routine makes new states in ascending order through the
    587  * state array (and increments LASTNFA accordingly).  The routine DUPMACHINE
    588  * relies on machines being made in ascending order and that they are
    589  * CONTIGUOUS.  Change it and you will have to rewrite DUPMACHINE (kludge
    590  * that it admittedly is)
    591  */
    592 
    593 int     mkstate (int sym)
    594 {
    595 	if (++lastnfa >= current_mns) {
    596 		if ((current_mns += MNS_INCREMENT) >= maximum_mns)
    597 			lerr(_
    598 				("input rules are too complicated (>= %d NFA states)"),
    599 current_mns);
    600 
    601 		++num_reallocs;
    602 
    603 		firstst = reallocate_integer_array (firstst, current_mns);
    604 		lastst = reallocate_integer_array (lastst, current_mns);
    605 		finalst = reallocate_integer_array (finalst, current_mns);
    606 		transchar =
    607 			reallocate_integer_array (transchar, current_mns);
    608 		trans1 = reallocate_integer_array (trans1, current_mns);
    609 		trans2 = reallocate_integer_array (trans2, current_mns);
    610 		accptnum =
    611 			reallocate_integer_array (accptnum, current_mns);
    612 		assoc_rule =
    613 			reallocate_integer_array (assoc_rule, current_mns);
    614 		state_type =
    615 			reallocate_integer_array (state_type, current_mns);
    616 	}
    617 
    618 	firstst[lastnfa] = lastnfa;
    619 	finalst[lastnfa] = lastnfa;
    620 	lastst[lastnfa] = lastnfa;
    621 	transchar[lastnfa] = sym;
    622 	trans1[lastnfa] = NO_TRANSITION;
    623 	trans2[lastnfa] = NO_TRANSITION;
    624 	accptnum[lastnfa] = NIL;
    625 	assoc_rule[lastnfa] = num_rules;
    626 	state_type[lastnfa] = current_state_type;
    627 
    628 	/* Fix up equivalence classes base on this transition.  Note that any
    629 	 * character which has its own transition gets its own equivalence
    630 	 * class.  Thus only characters which are only in character classes
    631 	 * have a chance at being in the same equivalence class.  E.g. "a|b"
    632 	 * puts 'a' and 'b' into two different equivalence classes.  "[ab]"
    633 	 * puts them in the same equivalence class (barring other differences
    634 	 * elsewhere in the input).
    635 	 */
    636 
    637 	if (sym < 0) {
    638 		/* We don't have to update the equivalence classes since
    639 		 * that was already done when the ccl was created for the
    640 		 * first time.
    641 		 */
    642 	}
    643 
    644 	else if (sym == SYM_EPSILON)
    645 		++numeps;
    646 
    647 	else {
    648 		check_char (sym);
    649 
    650 		if (useecs)
    651 			/* Map NUL's to csize. */
    652 			mkechar (sym ? sym : csize, nextecm, ecgroup);
    653 	}
    654 
    655 	return lastnfa;
    656 }
    657 
    658 
    659 /* mkxtion - make a transition from one state to another
    660  *
    661  * synopsis
    662  *
    663  *   mkxtion( statefrom, stateto );
    664  *
    665  *     statefrom - the state from which the transition is to be made
    666  *     stateto   - the state to which the transition is to be made
    667  */
    668 
    669 void    mkxtion (int statefrom, int stateto)
    670 {
    671 	if (trans1[statefrom] == NO_TRANSITION)
    672 		trans1[statefrom] = stateto;
    673 
    674 	else if ((transchar[statefrom] != SYM_EPSILON) ||
    675 		 (trans2[statefrom] != NO_TRANSITION))
    676 		flexfatal (_("found too many transitions in mkxtion()"));
    677 
    678 	else {			/* second out-transition for an epsilon state */
    679 		++eps2;
    680 		trans2[statefrom] = stateto;
    681 	}
    682 }
    683 
    684 /* new_rule - initialize for a new rule */
    685 
    686 void    new_rule (void)
    687 {
    688 	if (++num_rules >= current_max_rules) {
    689 		++num_reallocs;
    690 		current_max_rules += MAX_RULES_INCREMENT;
    691 		rule_type = reallocate_integer_array (rule_type,
    692 						      current_max_rules);
    693 		rule_linenum = reallocate_integer_array (rule_linenum,
    694 							 current_max_rules);
    695 		rule_useful = reallocate_integer_array (rule_useful,
    696 							current_max_rules);
    697 		rule_has_nl = reallocate_bool_array (rule_has_nl,
    698 						     current_max_rules);
    699 	}
    700 
    701 	if (num_rules > MAX_RULE)
    702 		lerr (_("too many rules (> %d)!"), MAX_RULE);
    703 
    704 	rule_linenum[num_rules] = linenum;
    705 	rule_useful[num_rules] = false;
    706 	rule_has_nl[num_rules] = false;
    707 }
    708