# $NetBSD: varmod-subst-regex.mk,v 1.12 2024/07/20 11:05:12 rillig Exp $
#
# Tests for the :C,from,to, variable modifier.

# Enable lint mode (-dL), in which unmatched subexpressions are reported.
.MAKEFLAGS: -dL

# The recipes of these targets echo expanded expressions, several of which
# generate error messages.  The .if conditions below are checked at parse
# time and fail with .error if the result differs from the expected value.
all: mod-regex-compile-error
all: mod-regex-limits-{1,2,3,4,5,6}
all: mod-regex-errors-{1,2}
all: unmatched-subexpression

# The expression expands to 4 words.  Of these words, none matches
# the regular expression "a b" since these words don't contain any
# whitespace.
.if ${:Ua b b c:C,a b,,} != "a b b c"
.  error
.endif

# Using the '1' modifier does not change anything.  The '1' modifier just
# means to apply at most 1 replacement in the whole expression.
.if ${:Ua b b c:C,a b,,1} != "a b b c"
.  error
.endif

# The 'W' modifier treats the whole variable value as a single big word,
# containing whitespace.  This big word matches the regular expression,
# therefore it gets replaced.  Whitespace is preserved after replacing.
.if ${:Ua b b c:C,a b,,W} != " b c"
.  error
.endif

# The 'g' modifier does not have any effect here since each of the words
# contains the character 'b' at most a single time.
.if ${:Ua b b c:C,b,,g} != "a c"
.  error
.endif

# The first :C modifier has the 'W' modifier, which makes the whole
# expression a single word.  The 'g' modifier then replaces all occurrences
# of "1 2" with "___".  The 'W' modifier only applies to this single :C
# modifier.  This is demonstrated by the :C modifier that follows.  If the
# 'W' modifier were preserved, only a single underscore would have been
# replaced with an 'x'.
.if ${:U1 2 3 1 2 3:C,1 2,___,Wg:C,_,x,} != "x__ 3 x__ 3"
.  error
.endif

# The regular expression does not match in the first word.
# It matches once in the second word, and the \0\0 doubles that word.
# In the third word, the regular expression matches as early as possible,
# and since the matches must not overlap, the next possible match would
# start at the 6, but at that point, there is only one character left,
# and that cannot match the regular expression "..".  Therefore only the
# "45" is doubled in the third word.
.if ${:U1 23 456:C,..,\0\0,} != "1 2323 45456"
.  error
.endif

# The modifier '1' applies the replacement at most once, across the whole
# expression value, no matter whether it is a single big word or many small
# words.
#
# Up to 2020-08-28, the manual page said that the modifiers '1' and 'g'
# were orthogonal, which was wrong.  It doesn't make sense to specify both
# 'g' and '1' at the same time.
.if ${:U12345 12345:C,.,\0\0,1} != "112345 12345"
.  error
.endif

# A regular expression that matches the empty string applies before every
# single character of the word.
# XXX: Most other places where regular expressions are used match at the end
# of the string as well.
.if ${:U1a2b3c:C,a*,*,g} != "*1**2*b*3*c"
.  error
.endif

# A dot in the regular expression matches any character, even a newline.
# In most other contexts where regular expressions are used, a dot matches
# any character except newline.  In make, regcomp is called without
# REG_NEWLINE, thus newline is an ordinary character.
.if ${:U"${.newline}":C,.,.,g} != "..."
.  error
.endif


# Like the ':S' modifier, the ':C' modifier matches on an expression
# that contains no words at all, but only if the regular expression matches an
# empty string, for example, when the regular expression is anchored at the
# beginning or the end of the word.  An unanchored regular expression that
# matches the empty string is uncommon in practice, as it would match before
# each character of the word.
#
# Each condition compares ':S' (left) with ':C' (right) on an empty value.
# First without any pattern flag:
.if "<${:U:S,,unanchored,}> <${:U:C,.?,unanchored,}>" != "<> <unanchored>"
.  error
.endif
.if "<${:U:S,^,prefix,}> <${:U:C,^,prefix,}>" != "<prefix> <prefix>"
.  error
.endif
.if "<${:U:S,$,suffix,}> <${:U:C,$,suffix,}>" != "<suffix> <suffix>"
.  error
.endif
.if "<${:U:S,^$,whole,}> <${:U:C,^$,whole,}>" != "<whole> <whole>"
.  error
.endif
# The same comparisons with the 'g' flag; the expected results are the same.
.if "<${:U:S,,unanchored,g}> <${:U:C,.?,unanchored,g}>" != "<> <unanchored>"
.  error
.endif
.if "<${:U:S,^,prefix,g}> <${:U:C,^,prefix,g}>" != "<prefix> <prefix>"
.  error
.endif
.if "<${:U:S,$,suffix,g}> <${:U:C,$,suffix,g}>" != "<suffix> <suffix>"
.  error
.endif
.if "<${:U:S,^$,whole,g}> <${:U:C,^$,whole,g}>" != "<whole> <whole>"
.  error
.endif
# The same comparisons with the 'W' flag; the expected results are the same.
.if "<${:U:S,,unanchored,W}> <${:U:C,.?,unanchored,W}>" != "<> <unanchored>"
.  error
.endif
.if "<${:U:S,^,prefix,W}> <${:U:C,^,prefix,W}>" != "<prefix> <prefix>"
.  error
.endif
.if "<${:U:S,$,suffix,W}> <${:U:C,$,suffix,W}>" != "<suffix> <suffix>"
.  error
.endif
.if "<${:U:S,^$,whole,W}> <${:U:C,^$,whole,W}>" != "<whole> <whole>"
.  error
.endif


# Multiple asterisks form an invalid regular expression.  This produces an
# error message and (as of 2020-08-28) stops parsing in the middle of the
# expression.  The unparsed part of the expression is then copied
# verbatim to the output, which is unexpected and can lead to strange shell
# commands being run.
mod-regex-compile-error:
	@echo $@: ${:Uword1 word2:C,****,____,g:C,word,____,:Q}.

# The tests below generate error messages but as of 2020-08-28 just continue
# parsing and execution as if nothing bad had happened.
# The label in each echo names the backreference that is used twice in the
# replacement ("11" for \1\1, "22" for \2\2) and tells whether the regular
# expression has a capturing group with that number ("ok") or not
# ("missing").
mod-regex-limits-1:
	@echo $@:11-missing:${:U1 23 456:C,..,\1\1,:Q}
mod-regex-limits-2:
	@echo $@:11-ok:${:U1 23 456:C,(.).,\1\1,:Q}
mod-regex-limits-3:
	@echo $@:22-missing:${:U1 23 456:C,..,\2\2,:Q}
mod-regex-limits-4:
	@echo $@:22-missing:${:U1 23 456:C,(.).,\2\2,:Q}
mod-regex-limits-5:
	@echo $@:22-ok:${:U1 23 456:C,(.)(.),\2\2,:Q}
mod-regex-limits-6:
	# The :C modifier only handles single-digit capturing groups,
	# which is enough for all practical use cases.  The replacement
	# below ends with \10\11\12 although the regular expression has
	# 20 capturing groups.
	@echo $@:capture:${:UabcdefghijABCDEFGHIJrest:C,(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.),\9\8\7\6\5\4\3\2\1\0\10\11\12,}

# The regular expression "[" consists of only an opening bracket, without
# the closing bracket; presumably regcomp rejects it as invalid.
mod-regex-errors-1:
	@echo $@: ${UNDEF:Uvalue:C,[,,}

mod-regex-errors-2:
	# If the replacement pattern produces a parse error because of an
	# unknown modifier, the parse error is ignored in ParseModifierPart
	# and the faulty expression expands to "".
	@echo $@: ${word:L:C,.*,x${:U:Z}y,W}

# In regular expressions with alternatives, not all capturing groups are
# always set; some may be missing.  Make calls these "unmatched
# subexpressions".
#
# From var.c 1.16 from 1996-12-24 until just before var.c 1.933 from
# 2021-06-21, unmatched subexpressions produced an "error message" but did
# not have any further effect since the "error handling" didn't influence
# the exit status.
#
# Before 2021-06-21 there was no way to turn off this warning, thus the
# combination of alternative matches and capturing groups was seldom used, if
# at all.
#
# Since var.c 1.933 from 2021-06-21, the error message is only printed in lint
# mode (-dL), but not in default mode.
#
# As an alternative to the change from var.c 1.933 from 2021-06-21, a possible
# mitigation would have been to add a new modifier 'U' to the already existing
# '1Wg' modifiers of the ':C' modifier.  That modifier could have been used in
# the modifier ':C,(a.)|(b.),\1\2,U' to treat unmatched subexpressions as
# empty.  This approach would have created a syntactical ambiguity since the
# modifiers ':S' and ':C' are open-ended (see mod-subst-chain), that is, they
# do not need to be followed by a ':' to separate them from the next modifier.
# Luckily the modifier :U does not make sense after :C, therefore this case
# does not happen in practice.
unmatched-subexpression:
	# In each of the following cases, if the regular expression matches at
	# all, the subexpression \1 matches as well.
	@echo $@.ok: ${:U1 1 2 3 5 8 13 21 34:C,1(.*),one\1,}

	# In the following cases:
	#	* The subexpression \1 is only defined for 1 and 13.
	#	* The subexpression \2 is only defined for 2 and 21.
	#	* If the regular expression does not match at all, the
	#	  replacement string is not analyzed, thus no error messages.
	# In total, there are 5 error messages about unmatched subexpressions;
	# the case for 1 is listed twice, like the word 1 in the combined
	# expression at the end.
	@echo $@.1: ${:U 1:C,1(.*)|2(.*),(\1)(\2),:Q}		# missing \2
	@echo $@.1: ${:U 1:C,1(.*)|2(.*),(\1)(\2),:Q}		# missing \2
	@echo $@.2: ${:U 2:C,1(.*)|2(.*),(\1)(\2),:Q}		# missing \1
	@echo $@.3: ${:U 3:C,1(.*)|2(.*),(\1)(\2),:Q}
	@echo $@.5: ${:U 5:C,1(.*)|2(.*),(\1)(\2),:Q}
	@echo $@.8: ${:U 8:C,1(.*)|2(.*),(\1)(\2),:Q}
	@echo $@.13: ${:U 13:C,1(.*)|2(.*),(\1)(\2),:Q}	# missing \2
	@echo $@.21: ${:U 21:C,1(.*)|2(.*),(\1)(\2),:Q}	# missing \1
	@echo $@.34: ${:U 34:C,1(.*)|2(.*),(\1)(\2),:Q}

	# And now all together: 5 error messages for 1, 1, 2, 13, 21.
	@echo $@.all: ${:U1 1 2 3 5 8 13 21 34:C,1(.*)|2(.*),(\1)(\2),:Q}