Home | History | Annotate | Line # | Download | only in make
str.c revision 1.100
      1  1.100    rillig /*	$NetBSD: str.c,v 1.100 2023/12/16 21:26:07 rillig Exp $	*/
      2   1.10  christos 
      3   1.79    rillig /*
      4   1.13  christos  * Copyright (c) 1988, 1989, 1990, 1993
      5   1.13  christos  *	The Regents of the University of California.  All rights reserved.
      6   1.20       agc  *
      7   1.20       agc  * This code is derived from software contributed to Berkeley by
      8   1.20       agc  * Adam de Boor.
      9   1.20       agc  *
     10   1.20       agc  * Redistribution and use in source and binary forms, with or without
     11   1.20       agc  * modification, are permitted provided that the following conditions
     12   1.20       agc  * are met:
     13   1.20       agc  * 1. Redistributions of source code must retain the above copyright
     14   1.20       agc  *    notice, this list of conditions and the following disclaimer.
     15   1.20       agc  * 2. Redistributions in binary form must reproduce the above copyright
     16   1.20       agc  *    notice, this list of conditions and the following disclaimer in the
     17   1.20       agc  *    documentation and/or other materials provided with the distribution.
     18   1.20       agc  * 3. Neither the name of the University nor the names of its contributors
     19   1.20       agc  *    may be used to endorse or promote products derived from this software
     20   1.20       agc  *    without specific prior written permission.
     21   1.20       agc  *
     22   1.20       agc  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     23   1.20       agc  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     24   1.20       agc  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     25   1.20       agc  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     26   1.20       agc  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     27   1.20       agc  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     28   1.20       agc  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     29   1.20       agc  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     30   1.20       agc  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     31   1.20       agc  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     32   1.20       agc  * SUCH DAMAGE.
     33   1.20       agc  */
     34   1.20       agc 
     35   1.79    rillig /*
     36    1.1       cgd  * Copyright (c) 1989 by Berkeley Softworks
     37    1.1       cgd  * All rights reserved.
     38    1.1       cgd  *
     39    1.1       cgd  * This code is derived from software contributed to Berkeley by
     40    1.1       cgd  * Adam de Boor.
     41    1.1       cgd  *
     42    1.1       cgd  * Redistribution and use in source and binary forms, with or without
     43    1.1       cgd  * modification, are permitted provided that the following conditions
     44    1.1       cgd  * are met:
     45    1.1       cgd  * 1. Redistributions of source code must retain the above copyright
     46    1.1       cgd  *    notice, this list of conditions and the following disclaimer.
     47    1.1       cgd  * 2. Redistributions in binary form must reproduce the above copyright
     48    1.1       cgd  *    notice, this list of conditions and the following disclaimer in the
     49    1.1       cgd  *    documentation and/or other materials provided with the distribution.
     50    1.1       cgd  * 3. All advertising materials mentioning features or use of this software
     51    1.1       cgd  *    must display the following acknowledgement:
     52    1.1       cgd  *	This product includes software developed by the University of
     53    1.1       cgd  *	California, Berkeley and its contributors.
     54    1.1       cgd  * 4. Neither the name of the University nor the names of its contributors
     55    1.1       cgd  *    may be used to endorse or promote products derived from this software
     56    1.1       cgd  *    without specific prior written permission.
     57    1.1       cgd  *
     58    1.1       cgd  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     59    1.1       cgd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     60    1.1       cgd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     61    1.1       cgd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     62    1.1       cgd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     63    1.1       cgd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     64    1.1       cgd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     65    1.1       cgd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     66    1.1       cgd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     67    1.1       cgd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     68    1.1       cgd  * SUCH DAMAGE.
     69    1.1       cgd  */
     70    1.1       cgd 
     71   1.65    rillig #include "make.h"
     72    1.1       cgd 
     73   1.65    rillig /*	"@(#)str.c	5.8 (Berkeley) 6/1/90"	*/
     74  1.100    rillig MAKE_RCSID("$NetBSD: str.c,v 1.100 2023/12/16 21:26:07 rillig Exp $");
     75   1.87    rillig 
     76   1.87    rillig 
     77   1.87    rillig static HashTable interned_strings;
     78   1.87    rillig 
     79    1.1       cgd 
     80   1.59    rillig /* Return the concatenation of s1 and s2, freshly allocated. */
     81    1.1       cgd char *
     82   1.59    rillig str_concat2(const char *s1, const char *s2)
     83    1.1       cgd {
     84   1.57    rillig 	size_t len1 = strlen(s1);
     85   1.57    rillig 	size_t len2 = strlen(s2);
     86   1.59    rillig 	char *result = bmake_malloc(len1 + len2 + 1);
     87   1.28  christos 	memcpy(result, s1, len1);
     88   1.28  christos 	memcpy(result + len1, s2, len2 + 1);
     89   1.59    rillig 	return result;
     90   1.59    rillig }
     91    1.1       cgd 
     92   1.59    rillig /* Return the concatenation of s1, s2 and s3, freshly allocated. */
     93   1.59    rillig char *
     94   1.59    rillig str_concat3(const char *s1, const char *s2, const char *s3)
     95   1.59    rillig {
     96   1.59    rillig 	size_t len1 = strlen(s1);
     97   1.59    rillig 	size_t len2 = strlen(s2);
     98   1.59    rillig 	size_t len3 = strlen(s3);
     99   1.59    rillig 	char *result = bmake_malloc(len1 + len2 + len3 + 1);
    100   1.59    rillig 	memcpy(result, s1, len1);
    101   1.59    rillig 	memcpy(result + len1, s2, len2);
    102   1.59    rillig 	memcpy(result + len1 + len2, s3, len3 + 1);
    103   1.45    rillig 	return result;
    104    1.1       cgd }
    105    1.1       cgd 
    106   1.76    rillig /*
    107   1.76    rillig  * Fracture a string into an array of words (as delineated by tabs or spaces)
    108   1.73    rillig  * taking quotation marks into account.
    109   1.64    rillig  *
    110   1.82    rillig  * If expand is true, quotes are removed and escape sequences such as \r, \t,
    111   1.73    rillig  * etc... are expanded. In this case, return NULL on parse errors.
    112   1.64    rillig  *
    113   1.73    rillig  * Returns the fractured words, which must be freed later using Words_Free,
    114   1.73    rillig  * unless the returned Words.words was NULL.
    115    1.1       cgd  */
    116   1.84    rillig SubstringWords
    117   1.84    rillig Substring_Words(const char *str, bool expand)
    118    1.1       cgd {
    119   1.56    rillig 	size_t str_len;
    120   1.56    rillig 	char *words_buf;
    121   1.61    rillig 	size_t words_cap;
    122   1.84    rillig 	Substring *words;
    123   1.61    rillig 	size_t words_len;
    124   1.56    rillig 	char inquote;
    125   1.56    rillig 	char *word_start;
    126   1.56    rillig 	char *word_end;
    127   1.55    rillig 	const char *str_p;
    128   1.55    rillig 
    129   1.72    rillig 	/* XXX: why only hspace, not whitespace? */
    130   1.72    rillig 	cpp_skip_hspace(&str);	/* skip leading space chars. */
    131    1.1       cgd 
    132   1.41    rillig 	/* words_buf holds the words, separated by '\0'. */
    133   1.55    rillig 	str_len = strlen(str);
    134   1.75    rillig 	words_buf = bmake_malloc(str_len + 1);
    135    1.1       cgd 
    136   1.70    rillig 	words_cap = str_len / 5 > 50 ? str_len / 5 : 50;
    137   1.84    rillig 	words = bmake_malloc((words_cap + 1) * sizeof(words[0]));
    138   1.35       sjg 
    139   1.35       sjg 	/*
    140    1.1       cgd 	 * copy the string; at the same time, parse backslashes,
    141   1.41    rillig 	 * quotes and build the word list.
    142    1.1       cgd 	 */
    143   1.55    rillig 	words_len = 0;
    144   1.55    rillig 	inquote = '\0';
    145   1.55    rillig 	word_start = words_buf;
    146   1.55    rillig 	word_end = words_buf;
    147   1.80    rillig 	for (str_p = str;; str_p++) {
    148   1.41    rillig 		char ch = *str_p;
    149   1.56    rillig 		switch (ch) {
    150    1.1       cgd 		case '"':
    151    1.1       cgd 		case '\'':
    152   1.78    rillig 			if (inquote != '\0') {
    153    1.1       cgd 				if (inquote == ch)
    154    1.4       cgd 					inquote = '\0';
    155    1.1       cgd 				else
    156    1.1       cgd 					break;
    157   1.56    rillig 			} else {
    158   1.69    rillig 				inquote = ch;
    159    1.6       jtc 				/* Don't miss "" or '' */
    160   1.41    rillig 				if (word_start == NULL && str_p[1] == inquote) {
    161   1.30  christos 					if (!expand) {
    162   1.41    rillig 						word_start = word_end;
    163   1.41    rillig 						*word_end++ = ch;
    164   1.30  christos 					} else
    165   1.41    rillig 						word_start = word_end + 1;
    166   1.41    rillig 					str_p++;
    167   1.25  christos 					inquote = '\0';
    168    1.6       jtc 					break;
    169    1.6       jtc 				}
    170    1.6       jtc 			}
    171    1.8       jtc 			if (!expand) {
    172   1.41    rillig 				if (word_start == NULL)
    173   1.41    rillig 					word_start = word_end;
    174   1.41    rillig 				*word_end++ = ch;
    175    1.8       jtc 			}
    176    1.1       cgd 			continue;
    177    1.1       cgd 		case ' ':
    178    1.1       cgd 		case '\t':
    179    1.8       jtc 		case '\n':
    180   1.78    rillig 			if (inquote != '\0')
    181    1.1       cgd 				break;
    182   1.41    rillig 			if (word_start == NULL)
    183    1.1       cgd 				continue;
    184    1.1       cgd 			/* FALLTHROUGH */
    185    1.1       cgd 		case '\0':
    186    1.1       cgd 			/*
    187   1.41    rillig 			 * end of a token -- make sure there's enough words
    188    1.1       cgd 			 * space and save off a pointer.
    189    1.1       cgd 			 */
    190   1.41    rillig 			if (word_start == NULL)
    191   1.56    rillig 				goto done;
    192    1.8       jtc 
    193   1.41    rillig 			*word_end++ = '\0';
    194   1.41    rillig 			if (words_len == words_cap) {
    195   1.84    rillig 				words_cap *= 2;
    196   1.86    rillig 				words = bmake_realloc(words,
    197   1.86    rillig 				    (words_cap + 1) * sizeof(words[0]));
    198    1.1       cgd 			}
    199   1.84    rillig 			words[words_len++] =
    200   1.84    rillig 			    Substring_Init(word_start, word_end - 1);
    201   1.41    rillig 			word_start = NULL;
    202   1.31  christos 			if (ch == '\n' || ch == '\0') {
    203   1.77    rillig 				if (expand && inquote != '\0') {
    204   1.84    rillig 					SubstringWords res;
    205   1.83    rillig 
    206   1.41    rillig 					free(words);
    207   1.41    rillig 					free(words_buf);
    208   1.83    rillig 
    209   1.83    rillig 					res.words = NULL;
    210   1.83    rillig 					res.len = 0;
    211   1.83    rillig 					res.freeIt = NULL;
    212   1.83    rillig 					return res;
    213   1.31  christos 				}
    214    1.1       cgd 				goto done;
    215   1.31  christos 			}
    216    1.1       cgd 			continue;
    217    1.1       cgd 		case '\\':
    218    1.8       jtc 			if (!expand) {
    219   1.41    rillig 				if (word_start == NULL)
    220   1.41    rillig 					word_start = word_end;
    221   1.41    rillig 				*word_end++ = '\\';
    222   1.93    rillig 				/* catch lonely '\' at end of string */
    223   1.41    rillig 				if (str_p[1] == '\0')
    224   1.26       erh 					continue;
    225   1.41    rillig 				ch = *++str_p;
    226    1.8       jtc 				break;
    227    1.8       jtc 			}
    228   1.13  christos 
    229   1.41    rillig 			switch (ch = *++str_p) {
    230    1.1       cgd 			case '\0':
    231    1.1       cgd 			case '\n':
    232    1.1       cgd 				/* hmmm; fix it up as best we can */
    233    1.1       cgd 				ch = '\\';
    234   1.74    rillig 				str_p--;
    235    1.1       cgd 				break;
    236    1.1       cgd 			case 'b':
    237    1.1       cgd 				ch = '\b';
    238    1.1       cgd 				break;
    239    1.1       cgd 			case 'f':
    240    1.1       cgd 				ch = '\f';
    241    1.1       cgd 				break;
    242    1.1       cgd 			case 'n':
    243    1.1       cgd 				ch = '\n';
    244    1.1       cgd 				break;
    245    1.1       cgd 			case 'r':
    246    1.1       cgd 				ch = '\r';
    247    1.1       cgd 				break;
    248    1.1       cgd 			case 't':
    249    1.1       cgd 				ch = '\t';
    250    1.1       cgd 				break;
    251    1.1       cgd 			}
    252    1.1       cgd 			break;
    253    1.1       cgd 		}
    254   1.41    rillig 		if (word_start == NULL)
    255   1.41    rillig 			word_start = word_end;
    256   1.41    rillig 		*word_end++ = ch;
    257    1.1       cgd 	}
    258   1.56    rillig done:
    259   1.84    rillig 	words[words_len] = Substring_Init(NULL, NULL);	/* useful for argv */
    260   1.83    rillig 
    261   1.83    rillig 	{
    262   1.84    rillig 		SubstringWords result;
    263   1.83    rillig 
    264   1.83    rillig 		result.words = words;
    265   1.83    rillig 		result.len = words_len;
    266   1.83    rillig 		result.freeIt = words_buf;
    267   1.83    rillig 		return result;
    268   1.83    rillig 	}
    269    1.1       cgd }
    270    1.1       cgd 
    271   1.84    rillig Words
    272   1.84    rillig Str_Words(const char *str, bool expand)
    273   1.84    rillig {
    274   1.84    rillig 	SubstringWords swords;
    275   1.84    rillig 	Words words;
    276   1.84    rillig 	size_t i;
    277   1.84    rillig 
    278   1.84    rillig 	swords = Substring_Words(str, expand);
    279   1.84    rillig 	if (swords.words == NULL) {
    280   1.84    rillig 		words.words = NULL;
    281   1.84    rillig 		words.len = 0;
    282   1.84    rillig 		words.freeIt = NULL;
    283   1.84    rillig 		return words;
    284   1.84    rillig 	}
    285   1.84    rillig 
    286   1.84    rillig 	words.words = bmake_malloc((swords.len + 1) * sizeof(words.words[0]));
    287   1.84    rillig 	words.len = swords.len;
    288   1.84    rillig 	words.freeIt = swords.freeIt;
    289   1.84    rillig 	for (i = 0; i < swords.len + 1; i++)
    290   1.84    rillig 		words.words[i] = UNCONST(swords.words[i].start);
    291   1.84    rillig 	free(swords.words);
    292   1.84    rillig 	return words;
    293   1.84    rillig }
    294   1.84    rillig 
    295    1.1       cgd /*
    296   1.91    rillig  * XXX: In the extreme edge case that one of the characters is from the basic
    297   1.91    rillig  * execution character set and the other isn't, the result of the comparison
    298   1.91    rillig  * differs depending on whether plain char is signed or unsigned.
    299   1.91    rillig  *
    300   1.91    rillig  * An example is the character range from \xE4 to 'a', where \xE4 may come
    301   1.91    rillig  * from U+00E4 'Latin small letter A with diaeresis'.
    302   1.91    rillig  *
    303   1.91    rillig  * If char is signed, \xE4 evaluates to -28, the first half of the condition
    304   1.91    rillig  * becomes -28 <= '0' && '0' <= 'a', which evaluates to true.
    305   1.91    rillig  *
    306   1.91    rillig  * If char is unsigned, \xE4 evaluates to 228, the second half of the
    307   1.91    rillig  * condition becomes 'a' <= '0' && '0' <= 228, which evaluates to false.
    308   1.91    rillig  */
    309   1.91    rillig static bool
    310   1.91    rillig in_range(char e1, char c, char e2)
    311   1.91    rillig {
    312   1.91    rillig 	return (e1 <= c && c <= e2) || (e2 <= c && c <= e1);
    313   1.91    rillig }
    314   1.91    rillig 
    315   1.91    rillig /*
    316   1.94    rillig  * Test if a string matches a pattern like "*.[ch]". The pattern matching
    317   1.94    rillig  * characters are '*', '?' and '[]', as in fnmatch(3).
    318   1.13  christos  *
    319   1.93    rillig  * See varmod-match.mk for examples and edge cases.
    320    1.1       cgd  */
    321   1.98    rillig StrMatchResult
    322   1.51    rillig Str_Match(const char *str, const char *pat)
    323    1.1       cgd {
    324   1.98    rillig 	StrMatchResult res = { NULL, false };
    325  1.100    rillig 	bool asterisk = false;
    326  1.100    rillig 	const char *fixed_str = str;
    327  1.100    rillig 	const char *fixed_pat = pat;
    328   1.96    rillig 
    329   1.95    rillig match_fixed_length:
    330   1.96    rillig 	str = fixed_str;
    331   1.96    rillig 	pat = fixed_pat;
    332   1.95    rillig 	for (; *pat != '\0' && *pat != '*'; str++, pat++) {
    333   1.90    rillig 		if (*str == '\0')
    334   1.98    rillig 			return res;
    335   1.90    rillig 
    336   1.92    rillig 		if (*pat == '?')	/* match any single character */
    337   1.92    rillig 			continue;
    338   1.51    rillig 
    339   1.93    rillig 		if (*pat == '[') {	/* match a character from a list */
    340   1.82    rillig 			bool neg = pat[1] == '^';
    341   1.63    rillig 			pat += neg ? 2 : 1;
    342   1.37       sjg 
    343   1.99    rillig 		next_char_in_list:
    344   1.99    rillig 			if (*pat == '\0')
    345   1.99    rillig 				res.error = "Unfinished character list";
    346   1.99    rillig 			if (*pat == ']' || *pat == '\0') {
    347   1.99    rillig 				if (neg)
    348   1.99    rillig 					goto end_of_char_list;
    349  1.100    rillig 				goto no_match;
    350   1.99    rillig 			}
    351   1.99    rillig 			if (*pat == *str)
    352   1.99    rillig 				goto end_of_char_list;
    353   1.99    rillig 			if (pat[1] == '-' && pat[2] == '\0') {
    354   1.99    rillig 				res.error = "Unfinished character range";
    355   1.99    rillig 				res.matched = neg;
    356   1.99    rillig 				return res;
    357   1.99    rillig 			}
    358   1.99    rillig 			if (pat[1] == '-') {
    359   1.99    rillig 				if (in_range(pat[0], *str, pat[2]))
    360   1.99    rillig 					goto end_of_char_list;
    361   1.99    rillig 				pat += 2;
    362    1.1       cgd 			}
    363   1.99    rillig 			pat++;
    364   1.99    rillig 			goto next_char_in_list;
    365   1.99    rillig 
    366   1.99    rillig 		end_of_char_list:
    367   1.71    rillig 			if (neg && *pat != ']' && *pat != '\0')
    368  1.100    rillig 				goto no_match;
    369   1.71    rillig 			while (*pat != ']' && *pat != '\0')
    370   1.51    rillig 				pat++;
    371   1.71    rillig 			if (*pat == '\0')
    372   1.51    rillig 				pat--;
    373   1.92    rillig 			continue;
    374    1.1       cgd 		}
    375   1.51    rillig 
    376   1.92    rillig 		if (*pat == '\\')	/* match the next character exactly */
    377   1.51    rillig 			pat++;
    378   1.95    rillig 		if (*pat != *str)
    379  1.100    rillig 			goto no_match;
    380   1.95    rillig 	}
    381   1.51    rillig 
    382  1.100    rillig 	if (*pat == '*') {
    383  1.100    rillig 		asterisk = true;
    384  1.100    rillig 		while (*pat == '*')
    385  1.100    rillig 			pat++;
    386   1.98    rillig 		if (*pat == '\0') {
    387  1.100    rillig 			res.matched = true;
    388   1.98    rillig 			return res;
    389   1.98    rillig 		}
    390  1.100    rillig 		fixed_str = str;
    391  1.100    rillig 		fixed_pat = pat;
    392  1.100    rillig 		goto match_fixed_length;
    393  1.100    rillig 	}
    394  1.100    rillig 	if (asterisk && *str != '\0') {
    395  1.100    rillig 		fixed_str += strlen(str);
    396  1.100    rillig 		goto match_fixed_length;
    397    1.1       cgd 	}
    398  1.100    rillig 	res.matched = *str == '\0';
    399  1.100    rillig 	return res;
    400   1.95    rillig 
    401  1.100    rillig no_match:
    402  1.100    rillig 	if (!asterisk)
    403   1.98    rillig 		return res;
    404  1.100    rillig 	fixed_str++;
    405   1.95    rillig 	goto match_fixed_length;
    406    1.4       cgd }
    407   1.87    rillig 
    408   1.87    rillig void
    409   1.87    rillig Str_Intern_Init(void)
    410   1.87    rillig {
    411   1.87    rillig 	HashTable_Init(&interned_strings);
    412   1.87    rillig }
    413   1.87    rillig 
    414   1.88    rillig void
    415   1.88    rillig Str_Intern_End(void)
    416   1.88    rillig {
    417   1.88    rillig #ifdef CLEANUP
    418   1.88    rillig 	HashTable_Done(&interned_strings);
    419   1.88    rillig #endif
    420   1.88    rillig }
    421   1.88    rillig 
    422   1.87    rillig /* Return a canonical instance of str, with unlimited lifetime. */
    423   1.87    rillig const char *
    424   1.87    rillig Str_Intern(const char *str)
    425   1.87    rillig {
    426   1.87    rillig 	return HashTable_CreateEntry(&interned_strings, str, NULL)->key;
    427   1.87    rillig }
    428