Home | History | Annotate | Line # | Download | only in src
      1 /* ccl - routines for character classes */
      2 
      3 /*  Copyright (c) 1990 The Regents of the University of California. */
      4 /*  All rights reserved. */
      5 
      6 /*  This code is derived from software contributed to Berkeley by */
      7 /*  Vern Paxson. */
      8 
      9 /*  The United States Government has rights in this work pursuant */
     10 /*  to contract no. DE-AC03-76SF00098 between the United States */
     11  /*  Department of Energy and the University of California. */
     12 
     13 /*  This file is part of flex. */
     14 
     15 /*  Redistribution and use in source and binary forms, with or without */
     16 /*  modification, are permitted provided that the following conditions */
     17 /*  are met: */
     18 
     19 /*  1. Redistributions of source code must retain the above copyright */
     20 /*     notice, this list of conditions and the following disclaimer. */
     21 /*  2. Redistributions in binary form must reproduce the above copyright */
     22 /*     notice, this list of conditions and the following disclaimer in the */
     23 /*     documentation and/or other materials provided with the distribution. */
     24 
     25 /*  Neither the name of the University nor the names of its contributors */
     26 /*  may be used to endorse or promote products derived from this software */
     27 /*  without specific prior written permission. */
     28 
     29 /*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
     30 /*  IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
     31 /*  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
     32 /*  PURPOSE. */
     33 #include "flexdef.h"
     34 __RCSID("$NetBSD: ccl.c,v 1.3 2017/01/02 17:45:27 christos Exp $");
     35 
     36 
     37 /* return true if the chr is in the ccl. Takes negation into account. */
     38 static bool
     39 ccl_contains (const int cclp, const int ch)
     40 {
     41 	int     ind, len, i;
     42 
     43 	len = ccllen[cclp];
     44 	ind = cclmap[cclp];
     45 
     46 	for (i = 0; i < len; ++i)
     47 		if (ccltbl[ind + i] == ch)
     48 			return !cclng[cclp];
     49 
     50     return cclng[cclp];
     51 }
     52 
     53 
     54 /* ccladd - add a single character to a ccl */
     55 
     56 void ccladd (int cclp, int ch)
     57 {
     58 	int     ind, len, newpos, i;
     59 
     60 	check_char (ch);
     61 
     62 	len = ccllen[cclp];
     63 	ind = cclmap[cclp];
     64 
     65 	/* check to see if the character is already in the ccl */
     66 
     67 	for (i = 0; i < len; ++i)
     68 		if (ccltbl[ind + i] == ch)
     69 			return;
     70 
     71 	/* mark newlines */
     72 	if (ch == nlch)
     73 		ccl_has_nl[cclp] = true;
     74 
     75 	newpos = ind + len;
     76 
     77 	if (newpos >= current_max_ccl_tbl_size) {
     78 		current_max_ccl_tbl_size += MAX_CCL_TBL_SIZE_INCREMENT;
     79 
     80 		++num_reallocs;
     81 
     82 		ccltbl = reallocate_Character_array (ccltbl,
     83 						     current_max_ccl_tbl_size);
     84 	}
     85 
     86 	ccllen[cclp] = len + 1;
     87 	ccltbl[newpos] = (unsigned char) ch;
     88 }
     89 
     90 /* dump_cclp - same thing as list_character_set, but for cclps.  */
     91 
     92 static void    dump_cclp (FILE* file, int cclp)
     93 {
     94 	int i;
     95 
     96 	putc ('[', file);
     97 
     98 	for (i = 0; i < csize; ++i) {
     99 		if (ccl_contains(cclp, i)){
    100 			int start_char = i;
    101 
    102 			putc (' ', file);
    103 
    104 			fputs (readable_form (i), file);
    105 
    106 			while (++i < csize && ccl_contains(cclp,i)) ;
    107 
    108 			if (i - 1 > start_char)
    109 				/* this was a run */
    110 				fprintf (file, "-%s",
    111 					 readable_form (i - 1));
    112 
    113 			putc (' ', file);
    114 		}
    115 	}
    116 
    117 	putc (']', file);
    118 }
    119 
    120 
    121 
    122 /* ccl_set_diff - create a new ccl as the set difference of the two given ccls. */
    123 int
    124 ccl_set_diff (int a, int b)
    125 {
    126     int  d, ch;
    127 
    128     /* create new class  */
    129     d = cclinit();
    130 
    131     /* In order to handle negation, we spin through all possible chars,
    132      * addding each char in a that is not in b.
    133      * (This could be O(n^2), but n is small and bounded.)
    134      */
    135 	for ( ch = 0; ch < csize; ++ch )
    136         if (ccl_contains (a, ch) && !ccl_contains(b, ch))
    137             ccladd (d, ch);
    138 
    139     /* debug */
    140     if (0){
    141         fprintf(stderr, "ccl_set_diff (");
    142             fprintf(stderr, "\n    ");
    143             dump_cclp (stderr, a);
    144             fprintf(stderr, "\n    ");
    145             dump_cclp (stderr, b);
    146             fprintf(stderr, "\n    ");
    147             dump_cclp (stderr, d);
    148         fprintf(stderr, "\n)\n");
    149     }
    150     return d;
    151 }
    152 
    153 /* ccl_set_union - create a new ccl as the set union of the two given ccls. */
    154 int
    155 ccl_set_union (int a, int b)
    156 {
    157     int  d, i;
    158 
    159     /* create new class  */
    160     d = cclinit();
    161 
    162     /* Add all of a */
    163     for (i = 0; i < ccllen[a]; ++i)
    164 		ccladd (d, ccltbl[cclmap[a] + i]);
    165 
    166     /* Add all of b */
    167     for (i = 0; i < ccllen[b]; ++i)
    168 		ccladd (d, ccltbl[cclmap[b] + i]);
    169 
    170     /* debug */
    171     if (0){
    172         fprintf(stderr, "ccl_set_union (%d + %d = %d", a, b, d);
    173             fprintf(stderr, "\n    ");
    174             dump_cclp (stderr, a);
    175             fprintf(stderr, "\n    ");
    176             dump_cclp (stderr, b);
    177             fprintf(stderr, "\n    ");
    178             dump_cclp (stderr, d);
    179         fprintf(stderr, "\n)\n");
    180     }
    181     return d;
    182 }
    183 
    184 
    185 /* cclinit - return an empty ccl */
    186 
    187 int     cclinit (void)
    188 {
    189 	if (++lastccl >= current_maxccls) {
    190 		current_maxccls += MAX_CCLS_INCREMENT;
    191 
    192 		++num_reallocs;
    193 
    194 		cclmap =
    195 			reallocate_integer_array (cclmap, current_maxccls);
    196 		ccllen =
    197 			reallocate_integer_array (ccllen, current_maxccls);
    198 		cclng = reallocate_integer_array (cclng, current_maxccls);
    199 		ccl_has_nl =
    200 			reallocate_bool_array (ccl_has_nl,
    201 					       current_maxccls);
    202 	}
    203 
    204 	if (lastccl == 1)
    205 		/* we're making the first ccl */
    206 		cclmap[lastccl] = 0;
    207 
    208 	else
    209 		/* The new pointer is just past the end of the last ccl.
    210 		 * Since the cclmap points to the \first/ character of a
    211 		 * ccl, adding the length of the ccl to the cclmap pointer
    212 		 * will produce a cursor to the first free space.
    213 		 */
    214 		cclmap[lastccl] =
    215 			cclmap[lastccl - 1] + ccllen[lastccl - 1];
    216 
    217 	ccllen[lastccl] = 0;
    218 	cclng[lastccl] = 0;	/* ccl's start out life un-negated */
    219 	ccl_has_nl[lastccl] = false;
    220 
    221 	return lastccl;
    222 }
    223 
    224 
    225 /* cclnegate - negate the given ccl */
    226 
    227 void    cclnegate (int cclp)
    228 {
    229 	cclng[cclp] = 1;
    230 	ccl_has_nl[cclp] = !ccl_has_nl[cclp];
    231 }
    232 
    233 
    234 /* list_character_set - list the members of a set of characters in CCL form
    235  *
    236  * Writes to the given file a character-class representation of those
    237  * characters present in the given CCL.  A character is present if it
    238  * has a non-zero value in the cset array.
    239  */
    240 
    241 void    list_character_set (FILE *file, int cset[])
    242 {
    243 	int i;
    244 
    245 	putc ('[', file);
    246 
    247 	for (i = 0; i < csize; ++i) {
    248 		if (cset[i]) {
    249 			int start_char = i;
    250 
    251 			putc (' ', file);
    252 
    253 			fputs (readable_form (i), file);
    254 
    255 			while (++i < csize && cset[i]) ;
    256 
    257 			if (i - 1 > start_char)
    258 				/* this was a run */
    259 				fprintf (file, "-%s",
    260 					 readable_form (i - 1));
    261 
    262 			putc (' ', file);
    263 		}
    264 	}
    265 
    266 	putc (']', file);
    267 }
    268 
    269 /** Determines if the range [c1-c2] is unambiguous in a case-insensitive
    270  * scanner.  Specifically, if a lowercase or uppercase character, x, is in the
    271  * range [c1-c2], then we require that UPPERCASE(x) and LOWERCASE(x) must also
    272  * be in the range. If not, then this range is ambiguous, and the function
    273  * returns false.  For example, [@-_] spans [a-z] but not [A-Z].  Beware that
    274  * [a-z] will be labeled ambiguous because it does not include [A-Z].
    275  *
    276  * @param c1 the lower end of the range
    277  * @param c2 the upper end of the range
    278  * @return true if [c1-c2] is not ambiguous for a caseless scanner.
    279  */
    280 bool range_covers_case (int c1, int c2)
    281 {
    282 	int     i, o;
    283 
    284 	for (i = c1; i <= c2; i++) {
    285 		if (has_case (i)) {
    286 			o = reverse_case (i);
    287 			if (o < c1 || c2 < o)
    288 				return false;
    289 		}
    290 	}
    291 	return true;
    292 }
    293 
    294 /** Reverse the case of a character, if possible.
    295  * @return c if case-reversal does not apply.
    296  */
    297 int reverse_case (int c)
    298 {
    299 	return isupper (c) ? tolower (c) : (islower (c) ? toupper (c) : c);
    300 }
    301 
    302 /** Return true if c is uppercase or lowercase. */
    303 bool has_case (int c)
    304 {
    305 	return (isupper (c) || islower (c)) ? true : false;
    306 }
    307