Home | History | Annotate | Line # | Download | only in make
      1  1.106    rillig /*	$NetBSD: str.c,v 1.106 2025/06/28 21:09:43 rillig Exp $	*/
      2   1.10  christos 
      3   1.79    rillig /*
      4   1.13  christos  * Copyright (c) 1988, 1989, 1990, 1993
      5   1.13  christos  *	The Regents of the University of California.  All rights reserved.
      6   1.20       agc  *
      7   1.20       agc  * This code is derived from software contributed to Berkeley by
      8   1.20       agc  * Adam de Boor.
      9   1.20       agc  *
     10   1.20       agc  * Redistribution and use in source and binary forms, with or without
     11   1.20       agc  * modification, are permitted provided that the following conditions
     12   1.20       agc  * are met:
     13   1.20       agc  * 1. Redistributions of source code must retain the above copyright
     14   1.20       agc  *    notice, this list of conditions and the following disclaimer.
     15   1.20       agc  * 2. Redistributions in binary form must reproduce the above copyright
     16   1.20       agc  *    notice, this list of conditions and the following disclaimer in the
     17   1.20       agc  *    documentation and/or other materials provided with the distribution.
     18   1.20       agc  * 3. Neither the name of the University nor the names of its contributors
     19   1.20       agc  *    may be used to endorse or promote products derived from this software
     20   1.20       agc  *    without specific prior written permission.
     21   1.20       agc  *
     22   1.20       agc  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     23   1.20       agc  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     24   1.20       agc  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     25   1.20       agc  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     26   1.20       agc  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     27   1.20       agc  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     28   1.20       agc  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     29   1.20       agc  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     30   1.20       agc  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     31   1.20       agc  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     32   1.20       agc  * SUCH DAMAGE.
     33   1.20       agc  */
     34   1.20       agc 
     35   1.79    rillig /*
     36    1.1       cgd  * Copyright (c) 1989 by Berkeley Softworks
     37    1.1       cgd  * All rights reserved.
     38    1.1       cgd  *
     39    1.1       cgd  * This code is derived from software contributed to Berkeley by
     40    1.1       cgd  * Adam de Boor.
     41    1.1       cgd  *
     42    1.1       cgd  * Redistribution and use in source and binary forms, with or without
     43    1.1       cgd  * modification, are permitted provided that the following conditions
     44    1.1       cgd  * are met:
     45    1.1       cgd  * 1. Redistributions of source code must retain the above copyright
     46    1.1       cgd  *    notice, this list of conditions and the following disclaimer.
     47    1.1       cgd  * 2. Redistributions in binary form must reproduce the above copyright
     48    1.1       cgd  *    notice, this list of conditions and the following disclaimer in the
     49    1.1       cgd  *    documentation and/or other materials provided with the distribution.
     50    1.1       cgd  * 3. All advertising materials mentioning features or use of this software
     51    1.1       cgd  *    must display the following acknowledgement:
     52    1.1       cgd  *	This product includes software developed by the University of
     53    1.1       cgd  *	California, Berkeley and its contributors.
     54    1.1       cgd  * 4. Neither the name of the University nor the names of its contributors
     55    1.1       cgd  *    may be used to endorse or promote products derived from this software
     56    1.1       cgd  *    without specific prior written permission.
     57    1.1       cgd  *
     58    1.1       cgd  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     59    1.1       cgd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     60    1.1       cgd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     61    1.1       cgd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     62    1.1       cgd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     63    1.1       cgd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     64    1.1       cgd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     65    1.1       cgd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     66    1.1       cgd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     67    1.1       cgd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     68    1.1       cgd  * SUCH DAMAGE.
     69    1.1       cgd  */
     70    1.1       cgd 
     71   1.65    rillig #include "make.h"
     72    1.1       cgd 
     73   1.65    rillig /*	"@(#)str.c	5.8 (Berkeley) 6/1/90"	*/
     74  1.106    rillig MAKE_RCSID("$NetBSD: str.c,v 1.106 2025/06/28 21:09:43 rillig Exp $");
     75   1.87    rillig 
     76   1.87    rillig 
     77   1.87    rillig static HashTable interned_strings;
     78   1.87    rillig 
     79    1.1       cgd 
     80   1.59    rillig /* Return the concatenation of s1 and s2, freshly allocated. */
     81    1.1       cgd char *
     82   1.59    rillig str_concat2(const char *s1, const char *s2)
     83    1.1       cgd {
     84   1.57    rillig 	size_t len1 = strlen(s1);
     85   1.57    rillig 	size_t len2 = strlen(s2);
     86   1.59    rillig 	char *result = bmake_malloc(len1 + len2 + 1);
     87   1.28  christos 	memcpy(result, s1, len1);
     88   1.28  christos 	memcpy(result + len1, s2, len2 + 1);
     89   1.59    rillig 	return result;
     90   1.59    rillig }
     91    1.1       cgd 
     92   1.59    rillig /* Return the concatenation of s1, s2 and s3, freshly allocated. */
     93   1.59    rillig char *
     94   1.59    rillig str_concat3(const char *s1, const char *s2, const char *s3)
     95   1.59    rillig {
     96   1.59    rillig 	size_t len1 = strlen(s1);
     97   1.59    rillig 	size_t len2 = strlen(s2);
     98   1.59    rillig 	size_t len3 = strlen(s3);
     99   1.59    rillig 	char *result = bmake_malloc(len1 + len2 + len3 + 1);
    100   1.59    rillig 	memcpy(result, s1, len1);
    101   1.59    rillig 	memcpy(result + len1, s2, len2);
    102   1.59    rillig 	memcpy(result + len1 + len2, s3, len3 + 1);
    103   1.45    rillig 	return result;
    104    1.1       cgd }
    105    1.1       cgd 
    106   1.76    rillig /*
    107   1.76    rillig  * Fracture a string into an array of words (as delineated by tabs or spaces)
    108   1.73    rillig  * taking quotation marks into account.
    109   1.64    rillig  *
    110  1.102    rillig  * A string that is empty or only contains whitespace nevertheless results in
    111  1.102    rillig  * a single word.  This is unexpected in many places, and the caller needs to
    112  1.102    rillig  * correct for this edge case.
    113  1.102    rillig  *
    114   1.82    rillig  * If expand is true, quotes are removed and escape sequences such as \r, \t,
    115   1.73    rillig  * etc... are expanded. In this case, return NULL on parse errors.
    116   1.64    rillig  *
    117   1.73    rillig  * Returns the fractured words, which must be freed later using Words_Free,
    118   1.73    rillig  * unless the returned Words.words was NULL.
    119    1.1       cgd  */
    120   1.84    rillig SubstringWords
    121   1.84    rillig Substring_Words(const char *str, bool expand)
    122    1.1       cgd {
    123   1.56    rillig 	size_t str_len;
    124   1.56    rillig 	char *words_buf;
    125   1.61    rillig 	size_t words_cap;
    126   1.84    rillig 	Substring *words;
    127   1.61    rillig 	size_t words_len;
    128   1.56    rillig 	char inquote;
    129   1.56    rillig 	char *word_start;
    130   1.56    rillig 	char *word_end;
    131   1.55    rillig 	const char *str_p;
    132   1.55    rillig 
    133   1.72    rillig 	/* XXX: why only hspace, not whitespace? */
    134   1.72    rillig 	cpp_skip_hspace(&str);	/* skip leading space chars. */
    135    1.1       cgd 
    136   1.41    rillig 	/* words_buf holds the words, separated by '\0'. */
    137   1.55    rillig 	str_len = strlen(str);
    138   1.75    rillig 	words_buf = bmake_malloc(str_len + 1);
    139    1.1       cgd 
    140   1.70    rillig 	words_cap = str_len / 5 > 50 ? str_len / 5 : 50;
    141   1.84    rillig 	words = bmake_malloc((words_cap + 1) * sizeof(words[0]));
    142   1.35       sjg 
    143   1.35       sjg 	/*
    144    1.1       cgd 	 * copy the string; at the same time, parse backslashes,
    145   1.41    rillig 	 * quotes and build the word list.
    146    1.1       cgd 	 */
    147   1.55    rillig 	words_len = 0;
    148   1.55    rillig 	inquote = '\0';
    149   1.55    rillig 	word_start = words_buf;
    150   1.55    rillig 	word_end = words_buf;
    151   1.80    rillig 	for (str_p = str;; str_p++) {
    152   1.41    rillig 		char ch = *str_p;
    153   1.56    rillig 		switch (ch) {
    154    1.1       cgd 		case '"':
    155    1.1       cgd 		case '\'':
    156   1.78    rillig 			if (inquote != '\0') {
    157    1.1       cgd 				if (inquote == ch)
    158    1.4       cgd 					inquote = '\0';
    159    1.1       cgd 				else
    160    1.1       cgd 					break;
    161   1.56    rillig 			} else {
    162   1.69    rillig 				inquote = ch;
    163    1.6       jtc 				/* Don't miss "" or '' */
    164   1.41    rillig 				if (word_start == NULL && str_p[1] == inquote) {
    165   1.30  christos 					if (!expand) {
    166   1.41    rillig 						word_start = word_end;
    167   1.41    rillig 						*word_end++ = ch;
    168   1.30  christos 					} else
    169   1.41    rillig 						word_start = word_end + 1;
    170   1.41    rillig 					str_p++;
    171   1.25  christos 					inquote = '\0';
    172    1.6       jtc 					break;
    173    1.6       jtc 				}
    174    1.6       jtc 			}
    175    1.8       jtc 			if (!expand) {
    176   1.41    rillig 				if (word_start == NULL)
    177   1.41    rillig 					word_start = word_end;
    178   1.41    rillig 				*word_end++ = ch;
    179    1.8       jtc 			}
    180    1.1       cgd 			continue;
    181    1.1       cgd 		case ' ':
    182    1.1       cgd 		case '\t':
    183    1.8       jtc 		case '\n':
    184   1.78    rillig 			if (inquote != '\0')
    185    1.1       cgd 				break;
    186   1.41    rillig 			if (word_start == NULL)
    187    1.1       cgd 				continue;
    188    1.1       cgd 			/* FALLTHROUGH */
    189    1.1       cgd 		case '\0':
    190    1.1       cgd 			/*
    191   1.41    rillig 			 * end of a token -- make sure there's enough words
    192    1.1       cgd 			 * space and save off a pointer.
    193    1.1       cgd 			 */
    194   1.41    rillig 			if (word_start == NULL)
    195   1.56    rillig 				goto done;
    196    1.8       jtc 
    197   1.41    rillig 			*word_end++ = '\0';
    198   1.41    rillig 			if (words_len == words_cap) {
    199   1.84    rillig 				words_cap *= 2;
    200   1.86    rillig 				words = bmake_realloc(words,
    201   1.86    rillig 				    (words_cap + 1) * sizeof(words[0]));
    202    1.1       cgd 			}
    203   1.84    rillig 			words[words_len++] =
    204   1.84    rillig 			    Substring_Init(word_start, word_end - 1);
    205   1.41    rillig 			word_start = NULL;
    206   1.31  christos 			if (ch == '\n' || ch == '\0') {
    207   1.77    rillig 				if (expand && inquote != '\0') {
    208   1.84    rillig 					SubstringWords res;
    209   1.83    rillig 
    210   1.41    rillig 					free(words);
    211   1.41    rillig 					free(words_buf);
    212   1.83    rillig 
    213   1.83    rillig 					res.words = NULL;
    214   1.83    rillig 					res.len = 0;
    215   1.83    rillig 					res.freeIt = NULL;
    216   1.83    rillig 					return res;
    217   1.31  christos 				}
    218    1.1       cgd 				goto done;
    219   1.31  christos 			}
    220    1.1       cgd 			continue;
    221    1.1       cgd 		case '\\':
    222    1.8       jtc 			if (!expand) {
    223   1.41    rillig 				if (word_start == NULL)
    224   1.41    rillig 					word_start = word_end;
    225   1.41    rillig 				*word_end++ = '\\';
    226   1.93    rillig 				/* catch lonely '\' at end of string */
    227   1.41    rillig 				if (str_p[1] == '\0')
    228   1.26       erh 					continue;
    229   1.41    rillig 				ch = *++str_p;
    230    1.8       jtc 				break;
    231    1.8       jtc 			}
    232   1.13  christos 
    233   1.41    rillig 			switch (ch = *++str_p) {
    234    1.1       cgd 			case '\0':
    235    1.1       cgd 			case '\n':
    236    1.1       cgd 				/* hmmm; fix it up as best we can */
    237    1.1       cgd 				ch = '\\';
    238   1.74    rillig 				str_p--;
    239    1.1       cgd 				break;
    240    1.1       cgd 			case 'b':
    241    1.1       cgd 				ch = '\b';
    242    1.1       cgd 				break;
    243    1.1       cgd 			case 'f':
    244    1.1       cgd 				ch = '\f';
    245    1.1       cgd 				break;
    246    1.1       cgd 			case 'n':
    247    1.1       cgd 				ch = '\n';
    248    1.1       cgd 				break;
    249    1.1       cgd 			case 'r':
    250    1.1       cgd 				ch = '\r';
    251    1.1       cgd 				break;
    252    1.1       cgd 			case 't':
    253    1.1       cgd 				ch = '\t';
    254    1.1       cgd 				break;
    255    1.1       cgd 			}
    256    1.1       cgd 			break;
    257    1.1       cgd 		}
    258   1.41    rillig 		if (word_start == NULL)
    259   1.41    rillig 			word_start = word_end;
    260   1.41    rillig 		*word_end++ = ch;
    261    1.1       cgd 	}
    262   1.56    rillig done:
    263   1.84    rillig 	words[words_len] = Substring_Init(NULL, NULL);	/* useful for argv */
    264   1.83    rillig 
    265   1.83    rillig 	{
    266   1.84    rillig 		SubstringWords result;
    267   1.83    rillig 
    268   1.83    rillig 		result.words = words;
    269   1.83    rillig 		result.len = words_len;
    270   1.83    rillig 		result.freeIt = words_buf;
    271   1.83    rillig 		return result;
    272   1.83    rillig 	}
    273    1.1       cgd }
    274    1.1       cgd 
    275   1.84    rillig Words
    276   1.84    rillig Str_Words(const char *str, bool expand)
    277   1.84    rillig {
    278   1.84    rillig 	SubstringWords swords;
    279   1.84    rillig 	Words words;
    280   1.84    rillig 	size_t i;
    281   1.84    rillig 
    282   1.84    rillig 	swords = Substring_Words(str, expand);
    283   1.84    rillig 	if (swords.words == NULL) {
    284   1.84    rillig 		words.words = NULL;
    285   1.84    rillig 		words.len = 0;
    286   1.84    rillig 		words.freeIt = NULL;
    287   1.84    rillig 		return words;
    288   1.84    rillig 	}
    289   1.84    rillig 
    290   1.84    rillig 	words.words = bmake_malloc((swords.len + 1) * sizeof(words.words[0]));
    291   1.84    rillig 	words.len = swords.len;
    292   1.84    rillig 	words.freeIt = swords.freeIt;
    293   1.84    rillig 	for (i = 0; i < swords.len + 1; i++)
    294   1.84    rillig 		words.words[i] = UNCONST(swords.words[i].start);
    295   1.84    rillig 	free(swords.words);
    296   1.84    rillig 	return words;
    297   1.84    rillig }
    298   1.84    rillig 
    299    1.1       cgd /*
    300   1.94    rillig  * Test if a string matches a pattern like "*.[ch]". The pattern matching
    301   1.94    rillig  * characters are '*', '?' and '[]', as in fnmatch(3).
    302   1.13  christos  *
    303   1.93    rillig  * See varmod-match.mk for examples and edge cases.
    304    1.1       cgd  */
    305   1.98    rillig StrMatchResult
    306   1.51    rillig Str_Match(const char *str, const char *pat)
    307    1.1       cgd {
    308   1.98    rillig 	StrMatchResult res = { NULL, false };
    309  1.100    rillig 	bool asterisk = false;
    310  1.100    rillig 	const char *fixed_str = str;
    311  1.100    rillig 	const char *fixed_pat = pat;
    312   1.96    rillig 
    313   1.95    rillig match_fixed_length:
    314   1.96    rillig 	str = fixed_str;
    315   1.96    rillig 	pat = fixed_pat;
    316   1.95    rillig 	for (; *pat != '\0' && *pat != '*'; str++, pat++) {
    317   1.90    rillig 		if (*str == '\0')
    318   1.98    rillig 			return res;
    319   1.90    rillig 
    320   1.92    rillig 		if (*pat == '?')	/* match any single character */
    321   1.92    rillig 			continue;
    322   1.51    rillig 
    323   1.93    rillig 		if (*pat == '[') {	/* match a character from a list */
    324   1.82    rillig 			bool neg = pat[1] == '^';
    325   1.63    rillig 			pat += neg ? 2 : 1;
    326   1.37       sjg 
    327   1.99    rillig 		next_char_in_list:
    328   1.99    rillig 			if (*pat == '\0')
    329   1.99    rillig 				res.error = "Unfinished character list";
    330   1.99    rillig 			if (*pat == ']' || *pat == '\0') {
    331   1.99    rillig 				if (neg)
    332   1.99    rillig 					goto end_of_char_list;
    333  1.100    rillig 				goto no_match;
    334   1.99    rillig 			}
    335   1.99    rillig 			if (*pat == *str)
    336   1.99    rillig 				goto end_of_char_list;
    337   1.99    rillig 			if (pat[1] == '-' && pat[2] == '\0') {
    338   1.99    rillig 				res.error = "Unfinished character range";
    339   1.99    rillig 				res.matched = neg;
    340   1.99    rillig 				return res;
    341   1.99    rillig 			}
    342   1.99    rillig 			if (pat[1] == '-') {
    343  1.103    rillig 				unsigned char e1 = (unsigned char)pat[0];
    344  1.103    rillig 				unsigned char c = (unsigned char)*str;
    345  1.103    rillig 				unsigned char e2 = (unsigned char)pat[2];
    346  1.103    rillig 				if ((e1 <= c && c <= e2)
    347  1.103    rillig 				    || (e2 <= c && c <= e1))
    348   1.99    rillig 					goto end_of_char_list;
    349   1.99    rillig 				pat += 2;
    350    1.1       cgd 			}
    351   1.99    rillig 			pat++;
    352   1.99    rillig 			goto next_char_in_list;
    353   1.99    rillig 
    354   1.99    rillig 		end_of_char_list:
    355   1.71    rillig 			if (neg && *pat != ']' && *pat != '\0')
    356  1.100    rillig 				goto no_match;
    357   1.71    rillig 			while (*pat != ']' && *pat != '\0')
    358   1.51    rillig 				pat++;
    359  1.104    rillig 			if (*pat == '\0') {
    360  1.104    rillig 				res.error = "Unfinished character list";
    361   1.51    rillig 				pat--;
    362  1.104    rillig 			}
    363   1.92    rillig 			continue;
    364    1.1       cgd 		}
    365   1.51    rillig 
    366  1.106    rillig 		if (*pat == '\\') {	/* match the next character exactly */
    367   1.51    rillig 			pat++;
    368  1.106    rillig 			if (*pat == '\0')
    369  1.106    rillig 				res.error = "Unfinished backslash at the end";
    370  1.106    rillig 		}
    371  1.101    rillig 		if (*pat != *str) {
    372  1.101    rillig 			if (asterisk && str == fixed_str) {
    373  1.101    rillig 				while (*str != '\0' && *str != *pat)
    374  1.101    rillig 					str++;
    375  1.101    rillig 				fixed_str = str;
    376  1.101    rillig 				goto match_fixed_length;
    377  1.101    rillig 			}
    378  1.100    rillig 			goto no_match;
    379  1.101    rillig 		}
    380   1.95    rillig 	}
    381   1.51    rillig 
    382  1.100    rillig 	if (*pat == '*') {
    383  1.100    rillig 		asterisk = true;
    384  1.100    rillig 		while (*pat == '*')
    385  1.100    rillig 			pat++;
    386   1.98    rillig 		if (*pat == '\0') {
    387  1.100    rillig 			res.matched = true;
    388   1.98    rillig 			return res;
    389   1.98    rillig 		}
    390  1.100    rillig 		fixed_str = str;
    391  1.100    rillig 		fixed_pat = pat;
    392  1.100    rillig 		goto match_fixed_length;
    393  1.100    rillig 	}
    394  1.100    rillig 	if (asterisk && *str != '\0') {
    395  1.100    rillig 		fixed_str += strlen(str);
    396  1.100    rillig 		goto match_fixed_length;
    397    1.1       cgd 	}
    398  1.100    rillig 	res.matched = *str == '\0';
    399  1.100    rillig 	return res;
    400   1.95    rillig 
    401  1.100    rillig no_match:
    402  1.100    rillig 	if (!asterisk)
    403   1.98    rillig 		return res;
    404  1.100    rillig 	fixed_str++;
    405   1.95    rillig 	goto match_fixed_length;
    406    1.4       cgd }
    407   1.87    rillig 
    408   1.87    rillig void
    409   1.87    rillig Str_Intern_Init(void)
    410   1.87    rillig {
    411   1.87    rillig 	HashTable_Init(&interned_strings);
    412   1.87    rillig }
    413   1.87    rillig 
    414  1.105    rillig #ifdef CLEANUP
    415   1.88    rillig void
    416   1.88    rillig Str_Intern_End(void)
    417   1.88    rillig {
    418   1.88    rillig 	HashTable_Done(&interned_strings);
    419  1.105    rillig }
    420   1.88    rillig #endif
    421   1.88    rillig 
    422   1.87    rillig /* Return a canonical instance of str, with unlimited lifetime. */
    423   1.87    rillig const char *
    424   1.87    rillig Str_Intern(const char *str)
    425   1.87    rillig {
    426   1.87    rillig 	return HashTable_CreateEntry(&interned_strings, str, NULL)->key;
    427   1.87    rillig }
    428