lex.cc revision 1.1.1.3 1 1.1 mrg /* CPP Library - lexical analysis.
2 1.1.1.3 mrg Copyright (C) 2000-2024 Free Software Foundation, Inc.
3 1.1 mrg Contributed by Per Bothner, 1994-95.
4 1.1 mrg Based on CCCP program by Paul Rubin, June 1986
5 1.1 mrg Adapted to ANSI C, Richard Stallman, Jan 1987
6 1.1 mrg Broken out to separate file, Zack Weinberg, Mar 2000
7 1.1 mrg
8 1.1 mrg This program is free software; you can redistribute it and/or modify it
9 1.1 mrg under the terms of the GNU General Public License as published by the
10 1.1 mrg Free Software Foundation; either version 3, or (at your option) any
11 1.1 mrg later version.
12 1.1 mrg
13 1.1 mrg This program is distributed in the hope that it will be useful,
14 1.1 mrg but WITHOUT ANY WARRANTY; without even the implied warranty of
15 1.1 mrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 1.1 mrg GNU General Public License for more details.
17 1.1 mrg
18 1.1 mrg You should have received a copy of the GNU General Public License
19 1.1 mrg along with this program; see the file COPYING3. If not see
20 1.1 mrg <http://www.gnu.org/licenses/>. */
21 1.1 mrg
22 1.1 mrg #include "config.h"
23 1.1 mrg #include "system.h"
24 1.1 mrg #include "cpplib.h"
25 1.1 mrg #include "internal.h"
26 1.1 mrg
/* Spelling category recorded for each token type in token_spellings.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};

/* One row of the token spelling table: the spelling category of a token
   type together with a string naming it.  */
struct token_spelling
{
  enum spell_type category;
  const unsigned char *name;
};

/* Spellings of the digraph tokens.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Expand TTYPE_TABLE into one token_spelling initializer per token type.
   OP entries are always SPELL_OPERATOR and use the string S as their name;
   TK entries take their category from S and use the stringified
   enumerator E as their name.  */
#define OP(e, s) { SPELL_OPERATOR, UC s },
#define TK(e, s) { SPELL_ ## s, UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category / name of TOKEN by indexing the table
   with its type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)

/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
#define UCS_LIMIT 0x10FFFF

/* Forward declarations of file-local (static) helpers.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
69 1.1 mrg
70 1.1 mrg
71 1.1 mrg /* Utility routine:
72 1.1 mrg
73 1.1 mrg Compares, the token TOKEN to the NUL-terminated string STRING.
74 1.1 mrg TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
75 1.1 mrg int
76 1.1 mrg cpp_ideq (const cpp_token *token, const char *string)
77 1.1 mrg {
78 1.1 mrg if (token->type != CPP_NAME)
79 1.1 mrg return 0;
80 1.1 mrg
81 1.1 mrg return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
82 1.1 mrg }
83 1.1 mrg
84 1.1 mrg /* Record a note TYPE at byte POS into the current cleaned logical
85 1.1 mrg line. */
86 1.1 mrg static void
87 1.1 mrg add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
88 1.1 mrg {
89 1.1 mrg if (buffer->notes_used == buffer->notes_cap)
90 1.1 mrg {
91 1.1 mrg buffer->notes_cap = buffer->notes_cap * 2 + 200;
92 1.1 mrg buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
93 1.1 mrg buffer->notes_cap);
94 1.1 mrg }
95 1.1 mrg
96 1.1 mrg buffer->notes[buffer->notes_used].pos = pos;
97 1.1 mrg buffer->notes[buffer->notes_used].type = type;
98 1.1 mrg buffer->notes_used++;
99 1.1 mrg }
100 1.1 mrg
101 1.1 mrg
102 1.1 mrg /* Fast path to find line special characters using optimized character
104 1.1 mrg scanning algorithms. Anything complicated falls back to the slow
105 1.1 mrg path below. Since this loop is very hot it's worth doing these kinds
106 1.1 mrg of optimizations.
107 1.1 mrg
108 1.1 mrg One of the paths through the ifdefs should provide
109 1.1 mrg
110 1.1 mrg const uchar *search_line_fast (const uchar *s, const uchar *end);
111 1.1 mrg
112 1.1 mrg Between S and END, search for \n, \r, \\, ?. Return a pointer to
113 1.1 mrg the found character.
114 1.1 mrg
115 1.1 mrg Note that the last character of the buffer is *always* a newline,
116 1.1 mrg as forced by _cpp_convert_input. This fact can be used to avoid
117 1.1 mrg explicitly looking for the end of the buffer. */
118 1.1 mrg
/* Configure gives us an ifdef test.  Give WORDS_BIGENDIAN a value of 0 when
   it is not defined, so that the code below can use it in ordinary
   expressions as well as in #if.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array bound
   evaluates to 1 for an accepted size and to -1, which is not a valid
   array size, for any other.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
138 1.1 mrg
139 1.1 mrg /* Return X with the first N bytes forced to values that won't match one
140 1.1 mrg of the interesting characters. Note that NUL is not interesting. */
141 1.1 mrg
142 1.1 mrg static inline word_type
143 1.1 mrg acc_char_mask_misalign (word_type val, unsigned int n)
144 1.1 mrg {
145 1.1 mrg word_type mask = -1;
146 1.1 mrg if (WORDS_BIGENDIAN)
147 1.1 mrg mask >>= n * 8;
148 1.1 mrg else
149 1.1 mrg mask <<= n * 8;
150 1.1 mrg return val & mask;
151 1.1 mrg }
152 1.1 mrg
153 1.1 mrg /* Return X replicated to all byte positions within WORD_TYPE. */
154 1.1 mrg
155 1.1 mrg static inline word_type
156 1.1 mrg acc_char_replicate (uchar x)
157 1.1 mrg {
158 1.1 mrg word_type ret;
159 1.1 mrg
160 1.1 mrg ret = (x << 24) | (x << 16) | (x << 8) | x;
161 1.1 mrg if (sizeof(word_type) == 8)
162 1.1 mrg ret = (ret << 16 << 16) | ret;
163 1.1 mrg return ret;
164 1.1 mrg }
165 1.1 mrg
/* Return non-zero if some byte of VAL is (probably) C.  The callers in
   search_line_acc_char pass C as a word holding the wanted character
   replicated into every byte.  Except on Alpha the result may be a false
   positive, which is why acc_char_index re-examines the bytes.  */

static inline word_type
acc_char_cmp (word_type val, word_type c)
{
#if defined(__GNUC__) && defined(__alpha__)
  /* We can get exact results using a compare-bytes instruction.
     Get (val == c) via (0 >= (val ^ c)).  */
  return __builtin_alpha_cmpbge (0, val ^ c);
#else
  /* MAGIC becomes 0x7efefeff, or 0x7efefefefefefeff when word_type is
     8 bytes.  The 32-bit shift is written as two 16-bit shifts,
     presumably so the expression stays valid for a 4-byte word_type.  */
  word_type magic = 0x7efefefeU;
  if (sizeof(word_type) == 8)
    magic = (magic << 16 << 16) | 0xfefefefeU;
  magic |= 1;

  /* After the XOR every byte of VAL that equalled the corresponding byte
     of C is zero.  */
  val ^= c;
  /* Only the bit positions that are clear in MAGIC are kept; they reveal
     where the addition's carries differ from what a word with no zero
     byte would produce.  */
  return ((val + magic) ^ ~val) & ~magic;
#endif
}
185 1.1 mrg
186 1.1 mrg /* Given the result of acc_char_cmp is non-zero, return the index of
187 1.1 mrg the found character. If this was a false positive, return -1. */
188 1.1 mrg
189 1.1 mrg static inline int
190 1.1 mrg acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
191 1.1 mrg word_type val ATTRIBUTE_UNUSED)
192 1.1 mrg {
193 1.1 mrg #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
194 1.1 mrg /* The cmpbge instruction sets *bits* of the result corresponding to
195 1.1 mrg matches in the bytes with no false positives. */
196 1.1 mrg return __builtin_ctzl (cmp);
197 1.1 mrg #else
198 1.1 mrg unsigned int i;
199 1.1 mrg
200 1.1 mrg /* ??? It would be nice to force unrolling here,
201 1.1 mrg and have all of these constants folded. */
202 1.1 mrg for (i = 0; i < sizeof(word_type); ++i)
203 1.1 mrg {
204 1.1 mrg uchar c;
205 1.1 mrg if (WORDS_BIGENDIAN)
206 1.1 mrg c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
207 1.1 mrg else
208 1.1 mrg c = (val >> i * 8) & 0xff;
209 1.1 mrg
210 1.1 mrg if (c == '\n' || c == '\r' || c == '\\' || c == '?')
211 1.1 mrg return i;
212 1.1 mrg }
213 1.1 mrg
214 1.1 mrg return -1;
215 1.1 mrg #endif
216 1.1 mrg }
217 1.1 mrg
218 1.1 mrg /* A version of the fast scanner using bit fiddling techniques.
219 1.1 mrg
220 1.1 mrg For 32-bit words, one would normally perform 16 comparisons and
221 1.1 mrg 16 branches. With this algorithm one performs 24 arithmetic
222 1.1 mrg operations and one branch. Whether this is faster with a 32-bit
223 1.1 mrg word size is going to be somewhat system dependent.
224 1.1 mrg
225 1.1 mrg For 64-bit words, we eliminate twice the number of comparisons
226 1.1 mrg and branches without increasing the number of arithmetic operations.
227 1.1 mrg It's almost certainly going to be a win with 64-bit word size. */
228 1.1 mrg
229 1.1 mrg static const uchar * search_line_acc_char (const uchar *, const uchar *)
230 1.1 mrg ATTRIBUTE_UNUSED;
231 1.1 mrg
232 1.1 mrg static const uchar *
233 1.1 mrg search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
234 1.1 mrg {
235 1.1 mrg const word_type repl_nl = acc_char_replicate ('\n');
236 1.1 mrg const word_type repl_cr = acc_char_replicate ('\r');
237 1.1 mrg const word_type repl_bs = acc_char_replicate ('\\');
238 1.1 mrg const word_type repl_qm = acc_char_replicate ('?');
239 1.1 mrg
240 1.1 mrg unsigned int misalign;
241 1.1 mrg const word_type *p;
242 1.1 mrg word_type val, t;
243 1.1 mrg
244 1.1 mrg /* Align the buffer. Mask out any bytes from before the beginning. */
245 1.1 mrg p = (word_type *)((uintptr_t)s & -sizeof(word_type));
246 1.1 mrg val = *p;
247 1.1 mrg misalign = (uintptr_t)s & (sizeof(word_type) - 1);
248 1.1 mrg if (misalign)
249 1.1 mrg val = acc_char_mask_misalign (val, misalign);
250 1.1 mrg
251 1.1 mrg /* Main loop. */
252 1.1 mrg while (1)
253 1.1 mrg {
254 1.1 mrg t = acc_char_cmp (val, repl_nl);
255 1.1 mrg t |= acc_char_cmp (val, repl_cr);
256 1.1 mrg t |= acc_char_cmp (val, repl_bs);
257 1.1 mrg t |= acc_char_cmp (val, repl_qm);
258 1.1 mrg
259 1.1 mrg if (__builtin_expect (t != 0, 0))
260 1.1 mrg {
261 1.1 mrg int i = acc_char_index (t, val);
262 1.1 mrg if (i >= 0)
263 1.1 mrg return (const uchar *)p + i;
264 1.1 mrg }
265 1.1 mrg
266 1.1 mrg val = *++p;
267 1.1 mrg }
268 1.1 mrg }
269 1.1 mrg
270 1.1 mrg /* Disable on Solaris 2/x86 until the following problem can be properly
271 1.1 mrg autoconfed:
272 1.1 mrg
273 1.1 mrg The Solaris 10+ assembler tags objects with the instruction set
274 1.1 mrg extensions used, so SSE4.2 executables cannot run on machines that
275 1.1 mrg don't support that extension. */
276 1.1 mrg
277 1.1 mrg #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
278 1.1 mrg
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 is '\n', row 1 is '\r', row 2 is '\\' and row 3 is '?'; each row
   holds 16 copies of its character and the array is 16-byte aligned, so
   a row can be read as one 8-byte or one 16-byte vector.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
293 1.1 mrg
/* A version of the fast scanner using MMX vectorized byte compare insns.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   unused; the loop only terminates when such a character is found.  */

static const uchar *
#ifndef __SSE__
__attribute__((__target__("sse")))
#endif
search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

  /* Only the first 8 of the 16 replicated bytes in each row are used.  */
  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;

  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     8-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 8 bytes at a time.  The first block has already
     been loaded and has its own mask, so enter the loop body past the
     load.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;

    start:
      /* OR together the byte-wise equality results for all four
         characters, then collapse them to one bit per byte.  */
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);

  __builtin_ia32_emms ();

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
359 1.1 mrg
360 1.1 mrg /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
361 1.1 mrg
362 1.1 mrg static const uchar *
363 1.1 mrg #ifndef __SSE2__
364 1.1 mrg __attribute__((__target__("sse2")))
365 1.1 mrg #endif
366 1.1 mrg search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
367 1.1 mrg {
368 1.1 mrg typedef char v16qi __attribute__ ((__vector_size__ (16)));
369 1.1 mrg
370 1.1 mrg const v16qi repl_nl = *(const v16qi *)repl_chars[0];
371 1.1 mrg const v16qi repl_cr = *(const v16qi *)repl_chars[1];
372 1.1 mrg const v16qi repl_bs = *(const v16qi *)repl_chars[2];
373 1.1 mrg const v16qi repl_qm = *(const v16qi *)repl_chars[3];
374 1.1 mrg
375 1.1 mrg unsigned int misalign, found, mask;
376 1.1 mrg const v16qi *p;
377 1.1 mrg v16qi data, t;
378 1.1 mrg
379 1.1 mrg /* Align the source pointer. */
380 1.1 mrg misalign = (uintptr_t)s & 15;
381 1.1 mrg p = (const v16qi *)((uintptr_t)s & -16);
382 1.1 mrg data = *p;
383 1.1 mrg
384 1.1 mrg /* Create a mask for the bytes that are valid within the first
385 1.1 mrg 16-byte block. The Idea here is that the AND with the mask
386 1.1 mrg within the loop is "free", since we need some AND or TEST
387 1.1 mrg insn in order to set the flags for the branch anyway. */
388 1.1 mrg mask = -1u << misalign;
389 1.1 mrg
390 1.1 mrg /* Main loop processing 16 bytes at a time. */
391 1.1 mrg goto start;
392 1.1 mrg do
393 1.1 mrg {
394 1.1 mrg data = *++p;
395 1.1 mrg mask = -1;
396 1.1 mrg
397 1.1 mrg start:
398 1.1 mrg t = data == repl_nl;
399 1.1 mrg t |= data == repl_cr;
400 1.1 mrg t |= data == repl_bs;
401 1.1 mrg t |= data == repl_qm;
402 1.1 mrg found = __builtin_ia32_pmovmskb128 (t);
403 1.1 mrg found &= mask;
404 1.1 mrg }
405 1.1 mrg while (!found);
406 1.1 mrg
407 1.1 mrg /* FOUND contains 1 in bits for which we matched a relevant
408 1.1 mrg character. Conversion to the byte index is trivial. */
409 1.1 mrg found = __builtin_ctz(found);
410 1.1 mrg return (const uchar *)p + found;
411 1.1 mrg }
412 1.1 mrg
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Returns a pointer to the first \n, \r, ? or \\ at or after S.  END is
   only used to decide whether an unaligned 16-byte read at S is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for; only the first 4 elements are
     meaningful, which is the length passed to the string insns below.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 means one of the 16 bytes just read matched.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  F receives
         the carry flag; the loop ends once it is set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
        break;

      s += 16;
    }
#else
  /* The asm loop adds 16 to S before each comparison, so step back one
     block first.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm ( ".balign 16\n"
        "0: add $16, %1\n"
        " %vpcmpestri\t$0, (%1), %2\n"
        " jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

 found:
  /* INDEX is the offset of the match within the 16 bytes at S.  */
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
493 1.1 mrg
494 1.1 mrg /* Check the CPU capabilities. */
495 1.1 mrg
496 1.1 mrg #include "../gcc/config/i386/cpuid.h"
497 1.1 mrg
/* Signature shared by the search_line_* implementations above.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
/* The implementation in use; assigned by init_vectorized_lexer.  */
static search_line_fast_type search_line_fast;

#define HAVE_init_vectorized_lexer 1
/* Point search_line_fast at the fastest scanner that may be used.

   MINIMUM is the instruction-set level this file itself was compiled
   for (3 = SSE4.2, 2 = SSE2, 1 = SSE, 0 = none of these), so at least
   that level is assumed without asking the CPU.  Beyond that the choice
   follows the feature bits reported by CPUID, falling back to
   search_line_acc_char when nothing better is indicated.  */
static inline void
init_vectorized_lexer (void)
{
  unsigned dummy, ecx = 0, edx = 0;
  search_line_fast_type impl = search_line_acc_char;
  int minimum = 0;

#if defined(__SSE4_2__)
  minimum = 3;
#elif defined(__SSE2__)
  minimum = 2;
#elif defined(__SSE__)
  minimum = 1;
#endif

  if (minimum == 3)
    impl = search_line_sse42;
  /* The "|| minimum == 2" lets the SSE2 choice below be made even when
     the CPUID query fails; ECX and EDX are then still zero.  */
  else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
    {
      /* MINIMUM cannot be 3 here, as that case was taken above.  */
      if (minimum == 3 || (ecx & bit_SSE4_2))
        impl = search_line_sse42;
      else if (minimum == 2 || (edx & bit_SSE2))
        impl = search_line_sse2;
      else if (minimum == 1 || (edx & bit_SSE))
        impl = search_line_mmx;
    }
  /* Only reached when the leaf 1 query failed: consult the extended
     leaf for the MMXEXT and CMOV bits instead.  */
  else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
    {
      if (minimum == 1
          || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
        impl = search_line_mmx;
    }

  search_line_fast = impl;
}
537 1.1 mrg
538 1.1 mrg #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
539 1.1 mrg
/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the AltiVec version.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   unused; the loop only terminates when such a character is found.  */

ATTRIBUTE_NO_SANITIZE_UNDEFINED
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  The loads start at S itself,
     so no bytes before S are ever examined and no masking is needed.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      data = __builtin_vec_vsx_ld (0, s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore S to point to the 16 bytes we just processed.  */
  s -= 16;

  {
#define N  (sizeof(vc) / sizeof(long))

    /* View the match vector T as N unsigned longs.  */
    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past every
       all-zero word skipped on the way.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  The first byte in memory is the
       most significant one on big-endian, hence the leading-zero count
       there and the trailing-zero count otherwise.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}
639 1.1 mrg
640 1.1 mrg #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
641 1.1 mrg
/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

/* Returns a pointer to the first \n, \r, \\ or ? at or after S.  END is
   unused; the loop only terminates when such a character is found.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block is already
     loaded and masked, so enter the loop body past the load.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
#define N  (sizeof(vc) / sizeof(long))

    /* View the match vector T as N unsigned longs.  */
    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past every
       all-zero word skipped on the way.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHROUGH */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  This variant is big-endian only,
       so the first byte in memory is the most significant one.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
758 1.1 mrg
759 1.1 mrg #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
760 1.1 mrg #include "arm_neon.h"
761 1.1 mrg
/* This doesn't have to be the exact page size, but no system may use
   a size smaller than this.  ARMv8 requires a minimum page size of
   4k.  The impact of being conservative here is a small number of
   cases will take the slightly slower entry path into the main
   loop.  */

#define AARCH64_MIN_PAGE_SIZE 4096

/* AArch64 NEON version of the fast scanner.  Returns a pointer to the
   first \n, \r, \\ or ? at or after S.  END is unused; the main loop only
   terminates when such a character is found.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* A distinct bit (0x01 ... 0x80) for each byte of an 8-byte half; used
     to turn a vector of 0xff/0x00 match bytes into a bit mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  /* Per-lane shift that moves the bits of one 8-byte half into the upper
     byte of the 16-bit result.  */
#ifdef __ARM_BIG_ENDIAN
  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
#else
  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
#endif

  unsigned int found;
  const uint8_t *p;
  uint8x16_t data;
  uint8x16_t t;
  uint16x8_t m;
  uint8x16_t u, v, w;

  /* Align the source pointer.  */
  p = (const uint8_t *)((uintptr_t)s & -16);

  /* Assuming random string start positions, with a 4k page size we'll take
     the slow path about 0.37% of the time.  */
  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
                         - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
                        < 16, 0))
    {
      /* Slow path: the string starts near a possible page boundary.
         Read the aligned block containing S and ignore matches in the
         bytes that precede S.  */
      uint32_t misalign, mask;

      misalign = (uintptr_t)s & 15;
      mask = (-1u << misalign) & 0xffff;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      /* Reduce the match vector to a 16-bit mask right away, since the
         bits for bytes before S have to be cleared.  */
      t = vandq_u8 (t, xmask);
      m = vpaddlq_u8 (t);
      m = vshlq_u16 (m, shift);
      found = vaddvq_u16 (m);
      found &= mask;
      if (found)
        return (const uchar*)p + __builtin_ctz (found);
    }
  else
    {
      /* Fast path: an unaligned 16-byte read starting at S itself.  */
      data = vld1q_u8 ((const uint8_t *) s);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      /* Any non-zero byte in T means a match within these 16 bytes.  */
      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
        goto done;
    }

  /* Main loop over aligned 16-byte blocks, starting with the block after
     the one P points at.  */
  do
    {
      p += 16;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
    } while (!vpaddd_u64 ((uint64x2_t)t));

done:
  /* Now that we've found the terminating substring, work out precisely where
     we need to stop.  */
  t = vandq_u8 (t, xmask);
  m = vpaddlq_u8 (t);
  m = vshlq_u16 (m, shift);
  found = vaddvq_u16 (m);
  /* P is still below S only when the match came from the unaligned first
     read, in which case the bit index is relative to S rather than P.  */
  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
          + __builtin_ctz (found));
}
853 1.1 mrg
854 1.1 mrg #elif defined (__ARM_NEON)
855 1.1 mrg #include "arm_neon.h"
856 1.1 mrg
/* 32-bit ARM NEON line scanner.  Return a pointer to the first byte at
   or after S that is '\n', '\r', '\\' or '?'.  END is unused: the loop
   ends only when one of those bytes is found, so the data starting at
   S must contain one (presumably guaranteed by the caller -- confirm).
   All loads are 16-byte aligned; bytes of the first block that precede
   S are masked out of the result.  */
857 1.1 mrg static const uchar *
858 1.1 mrg search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
859 1.1 mrg {
860 1.1 mrg const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
861 1.1 mrg const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
862 1.1 mrg const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
863 1.1 mrg const uint8x16_t repl_qm = vdupq_n_u8 ('?');
/* Byte I of each 8-byte half is given the bit 1 << I.  */
864 1.1 mrg const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
865 1.1 mrg
866 1.1 mrg unsigned int misalign, found, mask;
867 1.1 mrg const uint8_t *p;
868 1.1 mrg uint8x16_t data;
869 1.1 mrg
870 1.1 mrg /* Align the source pointer. */
871 1.1 mrg misalign = (uintptr_t)s & 15;
872 1.1 mrg p = (const uint8_t *)((uintptr_t)s & -16);
873 1.1 mrg data = vld1q_u8 (p);
874 1.1 mrg
875 1.1 mrg /* Create a mask for the bytes that are valid within the first
876 1.1 mrg 16-byte block. The Idea here is that the AND with the mask
877 1.1 mrg within the loop is "free", since we need some AND or TEST
878 1.1 mrg insn in order to set the flags for the branch anyway. */
879 1.1 mrg mask = (-1u << misalign) & 0xffff;
880 1.1 mrg
881 1.1 mrg /* Main loop, processing 16 bytes at a time. */
/* The first block is already loaded, so enter the loop body at START,
   skipping the pointer advance, the load and the mask reset.  */
882 1.1 mrg goto start;
883 1.1 mrg
884 1.1 mrg do
885 1.1 mrg {
886 1.1 mrg uint8x8_t l;
887 1.1 mrg uint16x4_t m;
888 1.1 mrg uint32x2_t n;
889 1.1 mrg uint8x16_t t, u, v, w;
890 1.1 mrg
891 1.1 mrg p += 16;
892 1.1 mrg data = vld1q_u8 (p);
893 1.1 mrg mask = 0xffff;
894 1.1 mrg
895 1.1 mrg start:
896 1.1 mrg t = vceqq_u8 (data, repl_nl);
897 1.1 mrg u = vceqq_u8 (data, repl_cr);
898 1.1 mrg v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
899 1.1 mrg w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
900 1.1 mrg t = vandq_u8 (vorrq_u8 (v, w), xmask);
/* Three rounds of pairwise addition collapse the per-byte bits into two
   32-bit lanes, each holding an 8-bit mask for one 8-byte half.  */
901 1.1 mrg l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
902 1.1 mrg m = vpaddl_u8 (l);
903 1.1 mrg n = vpaddl_u16 (m);
904 1.1 mrg
/* Viewed as one 64-bit value, shifting right by 24 brings the upper
   lane's mask down to bits 8..15; OR it with the lower lane's mask to
   form a 16-bit result in lane 0.  */
905 1.1 mrg found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
906 1.1 mrg vshr_n_u64 ((uint64x1_t) n, 24)), 0);
907 1.1 mrg found &= mask;
908 1.1 mrg }
909 1.1 mrg while (!found);
910 1.1 mrg
911 1.1 mrg /* FOUND contains 1 in bits for which we matched a relevant
912 1.1 mrg character. Conversion to the byte index is trivial. */
913 1.1 mrg found = __builtin_ctz (found);
914 1.1 mrg return (const uchar *)p + found;
915 1.1 mrg }
916 1.1 mrg
917 1.1 mrg #else
918 1.1 mrg
919 1.1 mrg /* We only have one accelerated alternative. Use a direct call so that
920 1.1 mrg we encourage inlining. */
921 1.1 mrg
922 1.1 mrg #define search_line_fast search_line_acc_char
923 1.1 mrg
924 1.1 mrg #endif
925 1.1 mrg
/* Lexer start-up hook.  Calls init_vectorized_lexer when the build
   defines HAVE_init_vectorized_lexer; otherwise it does nothing.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
935 1.1 mrg
936 1.1 mrg /* Returns with a logical line that contains no escaped newlines or
937 1.1 mrg trigraphs. This is a time-critical inner loop. */
/* The line is taken from PFILE->buffer starting at buffer->next_line and
   is cleaned in place.  On return buffer->cur and buffer->line_base
   point at the start of the line, a '\n' has been stored at the end of
   the cleaned text, buffer->next_line points just past the last source
   byte consumed, and the line notes describe every escaped newline and
   trigraph seen, followed by a sentinel note.  */
938 1.1 mrg void
939 1.1 mrg _cpp_clean_line (cpp_reader *pfile)
940 1.1 mrg {
941 1.1 mrg cpp_buffer *buffer;
942 1.1 mrg const uchar *s;
943 1.1 mrg uchar c, *d, *p;
944 1.1 mrg
945 1.1 mrg buffer = pfile->buffer;
946 1.1 mrg buffer->cur_note = buffer->notes_used = 0;
947 1.1 mrg buffer->cur = buffer->line_base = buffer->next_line;
948 1.1 mrg buffer->need_line = false;
949 1.1 mrg s = buffer->next_line;
950 1.1 mrg
951 1.1 mrg if (!buffer->from_stage3)
952 1.1 mrg {
953 1.1 mrg const uchar *pbackslash = NULL;
954 1.1 mrg
955 1.1 mrg /* Fast path. This is the common case of an un-escaped line with
956 1.1 mrg no trigraphs. The primary win here is by not writing any
957 1.1 mrg data back to memory until we have to. */
958 1.1 mrg while (1)
959 1.1 mrg {
960 1.1 mrg /* Perform an optimized search for \n, \r, \\, ?. */
961 1.1 mrg s = search_line_fast (s, buffer->rlimit);
962 1.1 mrg
963 1.1 mrg c = *s;
964 1.1 mrg if (c == '\\')
965 1.1 mrg {
966 1.1 mrg /* Record the location of the backslash and continue. */
967 1.1 mrg pbackslash = s++;
968 1.1 mrg }
969 1.1 mrg else if (__builtin_expect (c == '?', 0))
970 1.1 mrg {
971 1.1 mrg if (__builtin_expect (s[1] == '?', false)
972 1.1 mrg && _cpp_trigraph_map[s[2]])
973 1.1 mrg {
974 1.1 mrg /* Have a trigraph. We may or may not have to convert
975 1.1 mrg it. Add a line note regardless, for -Wtrigraphs. */
976 1.1 mrg add_line_note (buffer, s, s[2]);
977 1.1 mrg if (CPP_OPTION (pfile, trigraphs))
978 1.1 mrg {
979 1.1 mrg /* We do, and that means we have to switch to the
980 1.1 mrg slow path. */
981 1.1 mrg d = (uchar *) s;
982 1.1 mrg *d = _cpp_trigraph_map[s[2]];
983 1.1 mrg s += 2;
984 1.1 mrg goto slow_path;
985 1.1 mrg }
986 1.1 mrg }
987 1.1 mrg /* Not a trigraph. Continue on fast-path. */
988 1.1 mrg s++;
989 1.1 mrg }
990 1.1 mrg else
991 1.1 mrg break;
992 1.1 mrg }
993 1.1 mrg
994 1.1 mrg /* This must be \r or \n. We're either done, or we'll be forced
995 1.1 mrg to write back to the buffer and continue on the slow path. */
996 1.1 mrg d = (uchar *) s;
997 1.1 mrg
998 1.1 mrg if (__builtin_expect (s == buffer->rlimit, false))
999 1.1 mrg goto done;
1000 1.1 mrg
1001 1.1 mrg /* DOS line ending? */
1002 1.1 mrg if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1003 1.1 mrg {
1004 1.1 mrg s++;
1005 1.1 mrg if (s == buffer->rlimit)
1006 1.1 mrg goto done;
1007 1.1 mrg }
1008 1.1 mrg
/* Without a backslash on the line there can be no escaped newline.  */
1009 1.1 mrg if (__builtin_expect (pbackslash == NULL, true))
1010 1.1 mrg goto done;
1011 1.1 mrg
1012 1.1 mrg /* Check for escaped newline. */
/* Only the last backslash seen matters: step back from the line end
   over any is_nvspace characters and see whether we land just after
   it.  */
1013 1.1 mrg p = d;
1014 1.1 mrg while (is_nvspace (p[-1]))
1015 1.1 mrg p--;
1016 1.1 mrg if (p - 1 != pbackslash)
1017 1.1 mrg goto done;
1018 1.1 mrg
1019 1.1 mrg /* Have an escaped newline; process it and proceed to
1020 1.1 mrg the slow path. */
/* D is pre-incremented before each store below, so P - 2 makes the next
   byte overwrite the backslash.  buffer->next_line temporarily records
   the backslash position, which the slow path uses as the lower bound
   when looking for further escaped newlines.  */
1021 1.1 mrg add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1022 1.1 mrg d = p - 2;
1023 1.1 mrg buffer->next_line = p - 1;
1024 1.1 mrg
/* Slow path: S reads the original text and D writes the cleaned text
   into the same buffer, one byte at a time.  */
1025 1.1 mrg slow_path:
1026 1.1 mrg while (1)
1027 1.1 mrg {
1028 1.1 mrg c = *++s;
1029 1.1 mrg *++d = c;
1030 1.1 mrg
1031 1.1 mrg if (c == '\n' || c == '\r')
1032 1.1 mrg {
1033 1.1 mrg /* Handle DOS line endings. */
1034 1.1 mrg if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1035 1.1 mrg s++;
1036 1.1 mrg if (s == buffer->rlimit)
1037 1.1 mrg break;
1038 1.1 mrg
1039 1.1 mrg /* Escaped? */
1040 1.1 mrg p = d;
1041 1.1 mrg while (p != buffer->next_line && is_nvspace (p[-1]))
1042 1.1 mrg p--;
1043 1.1 mrg if (p == buffer->next_line || p[-1] != '\\')
1044 1.1 mrg break;
1045 1.1 mrg
1046 1.1 mrg add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1047 1.1 mrg d = p - 2;
1048 1.1 mrg buffer->next_line = p - 1;
1049 1.1 mrg }
1050 1.1 mrg else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1051 1.1 mrg {
1052 1.1 mrg /* Add a note regardless, for the benefit of -Wtrigraphs. */
1053 1.1 mrg add_line_note (buffer, d, s[2]);
1054 1.1 mrg if (CPP_OPTION (pfile, trigraphs))
1055 1.1 mrg {
1056 1.1 mrg *d = _cpp_trigraph_map[s[2]];
1057 1.1 mrg s += 2;
1058 1.1 mrg }
1059 1.1 mrg }
1060 1.1 mrg }
1061 1.1 mrg }
1062 1.1 mrg else
1063 1.1 mrg {
/* For a from_stage3 buffer only the end of the line is located;
   backslashes and trigraphs are left untouched.  */
1064 1.1 mrg while (*s != '\n' && *s != '\r')
1065 1.1 mrg s++;
1066 1.1 mrg d = (uchar *) s;
1067 1.1 mrg
1068 1.1 mrg /* Handle DOS line endings. */
1069 1.1 mrg if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1070 1.1 mrg s++;
1071 1.1 mrg }
1072 1.1 mrg
1073 1.1 mrg done:
/* Terminate the cleaned line with '\n' whatever the source line ending
   was.  */
1074 1.1 mrg *d = '\n';
1075 1.1 mrg /* A sentinel note that should never be processed. */
1076 1.1 mrg add_line_note (buffer, d + 1, '\n');
1077 1.1 mrg buffer->next_line = s + 1;
1078 1.1 mrg }
1079 1.1.1.3 mrg
1080 1.1.1.3 mrg template <bool lexing_raw_string>
1081 1.1.1.3 mrg static bool get_fresh_line_impl (cpp_reader *pfile);
1082 1.1 mrg
1083 1.1 mrg /* Return true if the trigraph indicated by NOTE should be warned
1084 1.1 mrg about in a comment. */
1085 1.1 mrg static bool
1086 1.1 mrg warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1087 1.1 mrg {
1088 1.1 mrg const uchar *p;
1089 1.1 mrg
1090 1.1 mrg /* Within comments we don't warn about trigraphs, unless the
1091 1.1 mrg trigraph forms an escaped newline, as that may change
1092 1.1 mrg behavior. */
1093 1.1 mrg if (note->type != '/')
1094 1.1 mrg return false;
1095 1.1 mrg
1096 1.1 mrg /* If -trigraphs, then this was an escaped newline iff the next note
1097 1.1 mrg is coincident. */
1098 1.1 mrg if (CPP_OPTION (pfile, trigraphs))
1099 1.1 mrg return note[1].pos == note->pos;
1100 1.1 mrg
1101 1.1 mrg /* Otherwise, see if this forms an escaped newline. */
1102 1.1 mrg p = note->pos + 3;
1103 1.1 mrg while (is_nvspace (*p))
1104 1.1 mrg p++;
1105 1.1 mrg
1106 1.1 mrg /* There might have been escaped newlines between the trigraph and the
1107 1.1 mrg newline we found. Hence the position test. */
1108 1.1 mrg return (*p == '\n' && p < note[1].pos);
1109 1.1 mrg }
1110 1.1 mrg
1111 1.1 mrg /* Process the notes created by add_line_note as far as the current
1112 1.1 mrg location. */
1113 1.1 mrg void
1114 1.1 mrg _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1115 1.1 mrg {
1116 1.1 mrg cpp_buffer *buffer = pfile->buffer;
1117 1.1 mrg
1118 1.1 mrg for (;;)
1119 1.1 mrg {
1120 1.1 mrg _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1121 1.1 mrg unsigned int col;
1122 1.1 mrg
1123 1.1 mrg if (note->pos > buffer->cur)
1124 1.1 mrg break;
1125 1.1 mrg
1126 1.1 mrg buffer->cur_note++;
1127 1.1 mrg col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1128 1.1 mrg
1129 1.1 mrg if (note->type == '\\' || note->type == ' ')
1130 1.1 mrg {
1131 1.1 mrg if (note->type == ' ' && !in_comment)
1132 1.1 mrg cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1133 1.1 mrg "backslash and newline separated by space");
1134 1.1 mrg
1135 1.1 mrg if (buffer->next_line > buffer->rlimit)
1136 1.1 mrg {
1137 1.1 mrg cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1138 1.1 mrg "backslash-newline at end of file");
1139 1.1 mrg /* Prevent "no newline at end of file" warning. */
1140 1.1 mrg buffer->next_line = buffer->rlimit;
1141 1.1 mrg }
1142 1.1 mrg
1143 1.1 mrg buffer->line_base = note->pos;
1144 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
1145 1.1 mrg }
1146 1.1 mrg else if (_cpp_trigraph_map[note->type])
1147 1.1 mrg {
1148 1.1 mrg if (CPP_OPTION (pfile, warn_trigraphs)
1149 1.1 mrg && (!in_comment || warn_in_comment (pfile, note)))
1150 1.1 mrg {
1151 1.1 mrg if (CPP_OPTION (pfile, trigraphs))
1152 1.1 mrg cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1153 1.1 mrg pfile->line_table->highest_line, col,
1154 1.1 mrg "trigraph ??%c converted to %c",
1155 1.1 mrg note->type,
1156 1.1 mrg (int) _cpp_trigraph_map[note->type]);
1157 1.1 mrg else
1158 1.1 mrg {
1159 1.1 mrg cpp_warning_with_line
1160 1.1 mrg (pfile, CPP_W_TRIGRAPHS,
1161 1.1 mrg pfile->line_table->highest_line, col,
1162 1.1 mrg "trigraph ??%c ignored, use -trigraphs to enable",
1163 1.1 mrg note->type);
1164 1.1 mrg }
1165 1.1 mrg }
1166 1.1 mrg }
1167 1.1 mrg else if (note->type == 0)
1168 1.1 mrg /* Already processed in lex_raw_string. */;
1169 1.1 mrg else
1170 1.1 mrg abort ();
1171 1.1 mrg }
1172 1.1 mrg }
1173 1.1 mrg
1174 1.1 mrg namespace bidi {
1175 1.1 mrg enum class kind {
1176 1.1 mrg NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1177 1.1 mrg };
1178 1.1 mrg
1179 1.1 mrg /* All the UTF-8 encodings of bidi characters start with E2. */
1180 1.1 mrg constexpr uchar utf8_start = 0xe2;
1181 1.1 mrg
1182 1.1 mrg struct context
1183 1.1 mrg {
1184 1.1 mrg context () {}
1185 1.1 mrg context (location_t loc, kind k, bool pdf, bool ucn)
1186 1.1 mrg : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1187 1.1 mrg {
1188 1.1 mrg }
1189 1.1 mrg
1190 1.1 mrg kind get_pop_kind () const
1191 1.1 mrg {
1192 1.1 mrg return m_pdf ? kind::PDF : kind::PDI;
1193 1.1 mrg }
1194 1.1 mrg bool ucn_p () const
1195 1.1 mrg {
1196 1.1 mrg return m_ucn;
1197 1.1 mrg }
1198 1.1 mrg
1199 1.1 mrg location_t m_loc;
1200 1.1 mrg kind m_kind;
1201 1.1 mrg unsigned m_pdf : 1;
1202 1.1 mrg unsigned m_ucn : 1;
1203 1.1 mrg };
1204 1.1 mrg
1205 1.1 mrg /* A vector holding currently open bidi contexts. We use a char for
1206 1.1 mrg each context, its LSB is 1 if it represents a PDF context, 0 if it
1207 1.1 mrg represents a PDI context. The next bit is 1 if this context was open
1208 1.1 mrg by a bidi character written as a UCN, and 0 when it was UTF-8. */
1209 1.1 mrg semi_embedded_vec <context, 16> vec;
1210 1.1 mrg
1211 1.1 mrg /* Close the whole comment/identifier/string literal/character constant
1212 1.1 mrg context. */
1213 1.1 mrg void on_close ()
1214 1.1 mrg {
1215 1.1 mrg vec.truncate (0);
1216 1.1 mrg }
1217 1.1 mrg
1218 1.1 mrg /* Pop the last element in the vector. */
1219 1.1 mrg void pop ()
1220 1.1 mrg {
1221 1.1 mrg unsigned int len = vec.count ();
1222 1.1 mrg gcc_checking_assert (len > 0);
1223 1.1 mrg vec.truncate (len - 1);
1224 1.1 mrg }
1225 1.1 mrg
1226 1.1 mrg /* Return the pop kind of the context of the Ith element. */
1227 1.1 mrg kind pop_kind_at (unsigned int i)
1228 1.1 mrg {
1229 1.1 mrg return vec[i].get_pop_kind ();
1230 1.1 mrg }
1231 1.1 mrg
1232 1.1 mrg /* Return the pop kind of the context that is currently opened. */
1233 1.1 mrg kind current_ctx ()
1234 1.1 mrg {
1235 1.1 mrg unsigned int len = vec.count ();
1236 1.1 mrg if (len == 0)
1237 1.1 mrg return kind::NONE;
1238 1.1 mrg return vec[len - 1].get_pop_kind ();
1239 1.1 mrg }
1240 1.1 mrg
1241 1.1 mrg /* Return true if the current context comes from a UCN origin, that is,
1242 1.1 mrg the bidi char which started this bidi context was written as a UCN. */
1243 1.1 mrg bool current_ctx_ucn_p ()
1244 1.1 mrg {
1245 1.1 mrg unsigned int len = vec.count ();
1246 1.1 mrg gcc_checking_assert (len > 0);
1247 1.1 mrg return vec[len - 1].m_ucn;
1248 1.1 mrg }
1249 1.1 mrg
1250 1.1 mrg location_t current_ctx_loc ()
1251 1.1 mrg {
1252 1.1 mrg unsigned int len = vec.count ();
1253 1.1 mrg gcc_checking_assert (len > 0);
1254 1.1 mrg return vec[len - 1].m_loc;
1255 1.1 mrg }
1256 1.1 mrg
1257 1.1 mrg /* We've read a bidi char, update the current vector as necessary.
1258 1.1 mrg LOC is only valid when K is not kind::NONE. */
1259 1.1 mrg void on_char (kind k, bool ucn_p, location_t loc)
1260 1.1 mrg {
1261 1.1 mrg switch (k)
1262 1.1 mrg {
1263 1.1 mrg case kind::LRE:
1264 1.1 mrg case kind::RLE:
1265 1.1 mrg case kind::LRO:
1266 1.1 mrg case kind::RLO:
1267 1.1 mrg vec.push (context (loc, k, true, ucn_p));
1268 1.1 mrg break;
1269 1.1 mrg case kind::LRI:
1270 1.1 mrg case kind::RLI:
1271 1.1 mrg case kind::FSI:
1272 1.1 mrg vec.push (context (loc, k, false, ucn_p));
1273 1.1 mrg break;
1274 1.1 mrg /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1275 1.1 mrg whose scope has not yet been terminated. */
1276 1.1 mrg case kind::PDF:
1277 1.1 mrg if (current_ctx () == kind::PDF)
1278 1.1 mrg pop ();
1279 1.1 mrg break;
1280 1.1 mrg /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1281 1.1 mrg scope has not yet been terminated, as well as the scopes of
1282 1.1 mrg any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1283 1.1 mrg yet been terminated. */
1284 1.1 mrg case kind::PDI:
1285 1.1 mrg for (int i = vec.count () - 1; i >= 0; --i)
1286 1.1 mrg if (pop_kind_at (i) == kind::PDI)
1287 1.1 mrg {
1288 1.1 mrg vec.truncate (i);
1289 1.1 mrg break;
1290 1.1 mrg }
1291 1.1 mrg break;
1292 1.1 mrg case kind::LTR:
1293 1.1 mrg case kind::RTL:
1294 1.1 mrg /* These aren't popped by a PDF/PDI. */
1295 1.1 mrg break;
1296 1.1 mrg ATTR_LIKELY case kind::NONE:
1297 1.1 mrg break;
1298 1.1 mrg default:
1299 1.1 mrg abort ();
1300 1.1 mrg }
1301 1.1 mrg }
1302 1.1 mrg
1303 1.1 mrg /* Return a descriptive string for K. */
1304 1.1 mrg const char *to_str (kind k)
1305 1.1 mrg {
1306 1.1 mrg switch (k)
1307 1.1 mrg {
1308 1.1 mrg case kind::LRE:
1309 1.1 mrg return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1310 1.1 mrg case kind::RLE:
1311 1.1 mrg return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1312 1.1 mrg case kind::LRO:
1313 1.1 mrg return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1314 1.1 mrg case kind::RLO:
1315 1.1 mrg return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1316 1.1 mrg case kind::LRI:
1317 1.1 mrg return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1318 1.1 mrg case kind::RLI:
1319 1.1 mrg return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1320 1.1 mrg case kind::FSI:
1321 1.1 mrg return "U+2068 (FIRST STRONG ISOLATE)";
1322 1.1 mrg case kind::PDF:
1323 1.1 mrg return "U+202C (POP DIRECTIONAL FORMATTING)";
1324 1.1 mrg case kind::PDI:
1325 1.1 mrg return "U+2069 (POP DIRECTIONAL ISOLATE)";
1326 1.1 mrg case kind::LTR:
1327 1.1 mrg return "U+200E (LEFT-TO-RIGHT MARK)";
1328 1.1 mrg case kind::RTL:
1329 1.1 mrg return "U+200F (RIGHT-TO-LEFT MARK)";
1330 1.1 mrg default:
1331 1.1 mrg abort ();
1332 1.1 mrg }
1333 1.1 mrg }
1334 1.1 mrg }
1335 1.1 mrg
1336 1.1 mrg /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1337 1.1 mrg within the current line in FILE, with the caret at START. */
1338 1.1 mrg
1339 1.1 mrg static location_t
1340 1.1 mrg get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1341 1.1 mrg const unsigned char *const start,
1342 1.1 mrg size_t num_bytes)
1343 1.1 mrg {
1344 1.1 mrg gcc_checking_assert (num_bytes > 0);
1345 1.1 mrg
1346 1.1 mrg /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1347 1.1 mrg to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1348 1.1 mrg whereas linemap_position_for_column is 1-based. */
1349 1.1 mrg
1350 1.1 mrg /* Get 0-based offsets within the line. */
1351 1.1 mrg size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1352 1.1 mrg size_t end_offset = start_offset + num_bytes - 1;
1353 1.1 mrg
1354 1.1 mrg /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1355 1.1 mrg location_t start_loc = linemap_position_for_column (pfile->line_table,
1356 1.1 mrg start_offset + 1);
1357 1.1 mrg location_t end_loc = linemap_position_for_column (pfile->line_table,
1358 1.1 mrg end_offset + 1);
1359 1.1 mrg
1360 1.1 mrg if (start_loc == end_loc)
1361 1.1 mrg return start_loc;
1362 1.1 mrg
1363 1.1 mrg source_range src_range;
1364 1.1 mrg src_range.m_start = start_loc;
1365 1.1.1.3 mrg src_range.m_finish = end_loc;
1366 1.1.1.3 mrg location_t combined_loc
1367 1.1.1.3 mrg = pfile->line_table->get_or_create_combined_loc (start_loc,
1368 1.1.1.3 mrg src_range,
1369 1.1.1.3 mrg nullptr,
1370 1.1 mrg 0);
1371 1.1 mrg return combined_loc;
1372 1.1 mrg }
1373 1.1 mrg
1374 1.1 mrg /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1375 1.1 mrg
1376 1.1 mrg static bidi::kind
1377 1.1 mrg get_bidi_utf8_1 (const unsigned char *const p)
1378 1.1 mrg {
1379 1.1 mrg gcc_checking_assert (p[0] == bidi::utf8_start);
1380 1.1 mrg
1381 1.1 mrg if (p[1] == 0x80)
1382 1.1 mrg switch (p[2])
1383 1.1 mrg {
1384 1.1 mrg case 0xaa:
1385 1.1 mrg return bidi::kind::LRE;
1386 1.1 mrg case 0xab:
1387 1.1 mrg return bidi::kind::RLE;
1388 1.1 mrg case 0xac:
1389 1.1 mrg return bidi::kind::PDF;
1390 1.1 mrg case 0xad:
1391 1.1 mrg return bidi::kind::LRO;
1392 1.1 mrg case 0xae:
1393 1.1 mrg return bidi::kind::RLO;
1394 1.1 mrg case 0x8e:
1395 1.1 mrg return bidi::kind::LTR;
1396 1.1 mrg case 0x8f:
1397 1.1 mrg return bidi::kind::RTL;
1398 1.1 mrg default:
1399 1.1 mrg break;
1400 1.1 mrg }
1401 1.1 mrg else if (p[1] == 0x81)
1402 1.1 mrg switch (p[2])
1403 1.1 mrg {
1404 1.1 mrg case 0xa6:
1405 1.1 mrg return bidi::kind::LRI;
1406 1.1 mrg case 0xa7:
1407 1.1 mrg return bidi::kind::RLI;
1408 1.1 mrg case 0xa8:
1409 1.1 mrg return bidi::kind::FSI;
1410 1.1 mrg case 0xa9:
1411 1.1 mrg return bidi::kind::PDI;
1412 1.1 mrg default:
1413 1.1 mrg break;
1414 1.1 mrg }
1415 1.1 mrg
1416 1.1 mrg return bidi::kind::NONE;
1417 1.1 mrg }
1418 1.1 mrg
1419 1.1 mrg /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1420 1.1 mrg If the kind is not NONE, write the location to *OUT.*/
1421 1.1 mrg
1422 1.1 mrg static bidi::kind
1423 1.1 mrg get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1424 1.1 mrg {
1425 1.1 mrg bidi::kind result = get_bidi_utf8_1 (p);
1426 1.1 mrg if (result != bidi::kind::NONE)
1427 1.1 mrg {
1428 1.1 mrg /* We have a sequence of 3 bytes starting at P. */
1429 1.1 mrg *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1430 1.1 mrg }
1431 1.1 mrg return result;
1432 1.1 mrg }
1433 1.1 mrg
1434 1.1 mrg /* Parse a UCN where P points just past \u or \U and return its bidi code. */
1435 1.1 mrg
1436 1.1.1.3 mrg static bidi::kind
1437 1.1 mrg get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1438 1.1 mrg {
1439 1.1 mrg /* 6.4.3 Universal Character Names
1440 1.1 mrg \u hex-quad
1441 1.1.1.3 mrg \U hex-quad hex-quad
1442 1.1 mrg \u { simple-hexadecimal-digit-sequence }
1443 1.1 mrg where \unnnn means \U0000nnnn. */
1444 1.1.1.3 mrg
1445 1.1 mrg *end = p + 4;
1446 1.1 mrg if (is_U)
1447 1.1 mrg {
1448 1.1 mrg if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1449 1.1 mrg return bidi::kind::NONE;
1450 1.1 mrg /* Skip 4B so we can treat \u and \U the same below. */
1451 1.1.1.3 mrg p += 4;
1452 1.1.1.3 mrg *end += 4;
1453 1.1.1.3 mrg }
1454 1.1.1.3 mrg else if (p[0] == '{')
1455 1.1.1.3 mrg {
1456 1.1.1.3 mrg p++;
1457 1.1.1.3 mrg while (*p == '0')
1458 1.1.1.3 mrg p++;
1459 1.1.1.3 mrg if (p[0] != '2'
1460 1.1.1.3 mrg || p[1] != '0'
1461 1.1.1.3 mrg || !ISXDIGIT (p[2])
1462 1.1.1.3 mrg || !ISXDIGIT (p[3])
1463 1.1.1.3 mrg || p[4] != '}')
1464 1.1.1.3 mrg return bidi::kind::NONE;
1465 1.1 mrg *end = p + 5;
1466 1.1 mrg }
1467 1.1 mrg
1468 1.1 mrg /* All code points we are looking for start with 20xx. */
1469 1.1 mrg if (p[0] != '2' || p[1] != '0')
1470 1.1 mrg return bidi::kind::NONE;
1471 1.1 mrg else if (p[2] == '2')
1472 1.1 mrg switch (p[3])
1473 1.1 mrg {
1474 1.1 mrg case 'a':
1475 1.1 mrg case 'A':
1476 1.1 mrg return bidi::kind::LRE;
1477 1.1 mrg case 'b':
1478 1.1 mrg case 'B':
1479 1.1 mrg return bidi::kind::RLE;
1480 1.1 mrg case 'c':
1481 1.1 mrg case 'C':
1482 1.1 mrg return bidi::kind::PDF;
1483 1.1 mrg case 'd':
1484 1.1 mrg case 'D':
1485 1.1 mrg return bidi::kind::LRO;
1486 1.1 mrg case 'e':
1487 1.1 mrg case 'E':
1488 1.1 mrg return bidi::kind::RLO;
1489 1.1 mrg default:
1490 1.1 mrg break;
1491 1.1 mrg }
1492 1.1 mrg else if (p[2] == '6')
1493 1.1 mrg switch (p[3])
1494 1.1 mrg {
1495 1.1 mrg case '6':
1496 1.1 mrg return bidi::kind::LRI;
1497 1.1 mrg case '7':
1498 1.1 mrg return bidi::kind::RLI;
1499 1.1 mrg case '8':
1500 1.1 mrg return bidi::kind::FSI;
1501 1.1 mrg case '9':
1502 1.1 mrg return bidi::kind::PDI;
1503 1.1 mrg default:
1504 1.1 mrg break;
1505 1.1 mrg }
1506 1.1 mrg else if (p[2] == '0')
1507 1.1 mrg switch (p[3])
1508 1.1 mrg {
1509 1.1 mrg case 'e':
1510 1.1 mrg case 'E':
1511 1.1 mrg return bidi::kind::LTR;
1512 1.1 mrg case 'f':
1513 1.1 mrg case 'F':
1514 1.1 mrg return bidi::kind::RTL;
1515 1.1 mrg default:
1516 1.1 mrg break;
1517 1.1 mrg }
1518 1.1 mrg
1519 1.1 mrg return bidi::kind::NONE;
1520 1.1 mrg }
1521 1.1 mrg
1522 1.1.1.3 mrg /* Parse a UCN where P points just past \u or \U and return its bidi code.
1523 1.1 mrg If the kind is not NONE, write the location to *OUT. */
1524 1.1 mrg
1525 1.1.1.3 mrg static bidi::kind
1526 1.1 mrg get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1527 1.1 mrg location_t *out)
1528 1.1.1.3 mrg {
1529 1.1.1.3 mrg const unsigned char *end;
1530 1.1 mrg bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1531 1.1 mrg if (result != bidi::kind::NONE)
1532 1.1 mrg {
1533 1.1.1.3 mrg const unsigned char *start = p - 2;
1534 1.1 mrg size_t num_bytes = end - start;
1535 1.1 mrg *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1536 1.1 mrg }
1537 1.1 mrg return result;
1538 1.1 mrg }
1539 1.1.1.3 mrg
1540 1.1.1.3 mrg /* Parse a named universal character escape where P points just past \N and
1541 1.1.1.3 mrg return its bidi code. If the kind is not NONE, write the location to
1542 1.1.1.3 mrg *OUT. */
1543 1.1.1.3 mrg
1544 1.1.1.3 mrg static bidi::kind
1545 1.1.1.3 mrg get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1546 1.1.1.3 mrg {
1547 1.1.1.3 mrg bidi::kind result = bidi::kind::NONE;
1548 1.1.1.3 mrg if (*p != '{')
1549 1.1.1.3 mrg return bidi::kind::NONE;
1550 1.1.1.3 mrg if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1551 1.1.1.3 mrg {
1552 1.1.1.3 mrg if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1553 1.1.1.3 mrg result = bidi::kind::LTR;
1554 1.1.1.3 mrg else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1555 1.1.1.3 mrg result = bidi::kind::LRE;
1556 1.1.1.3 mrg else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1557 1.1.1.3 mrg result = bidi::kind::LRO;
1558 1.1.1.3 mrg else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1559 1.1.1.3 mrg result = bidi::kind::LRI;
1560 1.1.1.3 mrg }
1561 1.1.1.3 mrg else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1562 1.1.1.3 mrg {
1563 1.1.1.3 mrg if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1564 1.1.1.3 mrg result = bidi::kind::RTL;
1565 1.1.1.3 mrg else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1566 1.1.1.3 mrg result = bidi::kind::RLE;
1567 1.1.1.3 mrg else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1568 1.1.1.3 mrg result = bidi::kind::RLO;
1569 1.1.1.3 mrg else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1570 1.1.1.3 mrg result = bidi::kind::RLI;
1571 1.1.1.3 mrg }
1572 1.1.1.3 mrg else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1573 1.1.1.3 mrg {
1574 1.1.1.3 mrg if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1575 1.1.1.3 mrg result = bidi::kind::PDF;
1576 1.1.1.3 mrg else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1577 1.1.1.3 mrg result = bidi::kind::PDI;
1578 1.1.1.3 mrg }
1579 1.1.1.3 mrg else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1580 1.1.1.3 mrg result = bidi::kind::FSI;
1581 1.1.1.3 mrg if (result != bidi::kind::NONE)
1582 1.1.1.3 mrg *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1583 1.1.1.3 mrg (strchr ((const char *)
1584 1.1.1.3 mrg (p + 1), '}')
1585 1.1.1.3 mrg - (const char *) p)
1586 1.1.1.3 mrg + 3);
1587 1.1.1.3 mrg return result;
1588 1.1.1.3 mrg }
1589 1.1 mrg
1590 1.1 mrg /* Subclass of rich_location for reporting on unpaired UTF-8
1591 1.1 mrg bidirectional control character(s).
1592 1.1 mrg Escape the source lines on output, and show all unclosed
1593 1.1 mrg bidi context, labelling everything. */
1594 1.1 mrg
1595 1.1 mrg class unpaired_bidi_rich_location : public rich_location
1596 1.1 mrg {
1597 1.1 mrg public:
1598 1.1 mrg class custom_range_label : public range_label
1599 1.1 mrg {
1600 1.1.1.3 mrg public:
1601 1.1 mrg label_text get_text (unsigned range_idx) const final override
1602 1.1 mrg {
1603 1.1 mrg /* range 0 is the primary location; each subsequent range i + 1
1604 1.1 mrg is for bidi::vec[i]. */
1605 1.1 mrg if (range_idx > 0)
1606 1.1 mrg {
1607 1.1 mrg const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1608 1.1 mrg return label_text::borrow (bidi::to_str (ctxt.m_kind));
1609 1.1 mrg }
1610 1.1 mrg else
1611 1.1 mrg return label_text::borrow (_("end of bidirectional context"));
1612 1.1 mrg }
1613 1.1 mrg };
1614 1.1 mrg
1615 1.1 mrg unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1616 1.1 mrg : rich_location (pfile->line_table, loc, &m_custom_label)
1617 1.1 mrg {
1618 1.1 mrg set_escape_on_output (true);
1619 1.1 mrg for (unsigned i = 0; i < bidi::vec.count (); i++)
1620 1.1 mrg add_range (bidi::vec[i].m_loc,
1621 1.1 mrg SHOW_RANGE_WITHOUT_CARET,
1622 1.1 mrg &m_custom_label);
1623 1.1 mrg }
1624 1.1 mrg
1625 1.1 mrg private:
1626 1.1 mrg custom_range_label m_custom_label;
1627 1.1 mrg };
1628 1.1 mrg
1629 1.1 mrg /* We're closing a bidi context, that is, we've encountered a newline,
1630 1.1 mrg are closing a C-style comment, or are at the end of a string literal,
1631 1.1 mrg character constant, or identifier. Warn if this context was not
1632 1.1 mrg properly terminated by a PDI or PDF. P points to the last character
1633 1.1 mrg in this context. */
1634 1.1 mrg
1635 1.1 mrg static void
1636 1.1 mrg maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1637 1.1 mrg {
1638 1.1 mrg const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1639 1.1 mrg if (bidi::vec.count () > 0
1640 1.1 mrg && (warn_bidi & bidirectional_unpaired
1641 1.1 mrg && (!bidi::current_ctx_ucn_p ()
1642 1.1 mrg || (warn_bidi & bidirectional_ucn))))
1643 1.1 mrg {
1644 1.1 mrg const location_t loc
1645 1.1 mrg = linemap_position_for_column (pfile->line_table,
1646 1.1 mrg CPP_BUF_COLUMN (pfile->buffer, p));
1647 1.1 mrg unpaired_bidi_rich_location rich_loc (pfile, loc);
1648 1.1 mrg /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1649 1.1 mrg forms of a diagnostic, so fake it for now. */
1650 1.1 mrg if (bidi::vec.count () > 1)
1651 1.1 mrg cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1652 1.1 mrg "unpaired UTF-8 bidirectional control characters "
1653 1.1 mrg "detected");
1654 1.1 mrg else
1655 1.1 mrg cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1656 1.1 mrg "unpaired UTF-8 bidirectional control character "
1657 1.1 mrg "detected");
1658 1.1 mrg }
1659 1.1 mrg /* We're done with this context. */
1660 1.1 mrg bidi::on_close ();
1661 1.1 mrg }
1662 1.1 mrg
1663 1.1 mrg /* We're at the beginning or in the middle of an identifier/comment/string
1664 1.1 mrg literal/character constant. Warn if we've encountered a bidi character.
1665 1.1 mrg KIND says which bidi control character it was; UCN_P is true iff this bidi
1666 1.1 mrg control character was written as a UCN. LOC is the location of the
1667 1.1 mrg character, but is only valid if KIND != bidi::kind::NONE. */
1668 1.1 mrg
1669 1.1 mrg static void
1670 1.1 mrg maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1671 1.1 mrg bool ucn_p, location_t loc)
1672 1.1 mrg {
1673 1.1 mrg if (__builtin_expect (kind == bidi::kind::NONE, 1))
1674 1.1 mrg return;
1675 1.1 mrg
1676 1.1 mrg const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1677 1.1 mrg
1678 1.1 mrg if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1679 1.1 mrg {
1680 1.1 mrg rich_location rich_loc (pfile->line_table, loc);
1681 1.1 mrg rich_loc.set_escape_on_output (true);
1682 1.1 mrg
1683 1.1 mrg /* It seems excessive to warn about a PDI/PDF that is closing
1684 1.1 mrg an opened context because we've already warned about the
1685 1.1 mrg opening character. Except warn when we have a UCN x UTF-8
1686 1.1 mrg mismatch, if UCN checking is enabled. */
1687 1.1 mrg if (kind == bidi::current_ctx ())
1688 1.1 mrg {
1689 1.1 mrg if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1690 1.1 mrg && bidi::current_ctx_ucn_p () != ucn_p)
1691 1.1 mrg {
1692 1.1 mrg rich_loc.add_range (bidi::current_ctx_loc ());
1693 1.1 mrg cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1694 1.1 mrg "UTF-8 vs UCN mismatch when closing "
1695 1.1 mrg "a context by \"%s\"", bidi::to_str (kind));
1696 1.1 mrg }
1697 1.1 mrg }
1698 1.1 mrg else if (warn_bidi & bidirectional_any
1699 1.1 mrg && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1700 1.1 mrg {
1701 1.1 mrg if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1702 1.1 mrg cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1703 1.1 mrg "\"%s\" is closing an unopened context",
1704 1.1 mrg bidi::to_str (kind));
1705 1.1 mrg else
1706 1.1 mrg cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1707 1.1 mrg "found problematic Unicode character \"%s\"",
1708 1.1 mrg bidi::to_str (kind));
1709 1.1 mrg }
1710 1.1 mrg }
1711 1.1 mrg /* We're done with this context. */
1712 1.1 mrg bidi::on_char (kind, ucn_p, loc);
1713 1.1 mrg }
1714 1.1.1.3 mrg
/* Byte-value thresholds used by the UTF-8 checks in this file: a byte in
   the range [utf8_continuation, utf8_signifier) is accepted as a
   continuation byte, and a byte >= utf8_signifier is treated as the
   lead byte of a multi-byte sequence.  */
1715 1.1.1.3 mrg static const cppchar_t utf8_continuation = 0x80;
1716 1.1.1.3 mrg static const cppchar_t utf8_signifier = 0xC0;
1717 1.1.1.3 mrg
1718 1.1.1.3 mrg /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1719 1.1.1.3 mrg at PFILE->buffer->cur. Return a pointer after the diagnosed
1720 1.1.1.3 mrg invalid character. */
1721 1.1.1.3 mrg
1722 1.1.1.3 mrg static const uchar *
1723 1.1.1.3 mrg _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1724 1.1.1.3 mrg {
1725 1.1.1.3 mrg cpp_buffer *buffer = pfile->buffer;
1726 1.1.1.3 mrg const uchar *cur = buffer->cur;
1727 1.1.1.3 mrg bool pedantic = (CPP_PEDANTIC (pfile)
1728 1.1.1.3 mrg && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1729 1.1.1.3 mrg
1730 1.1.1.3 mrg if (cur[0] < utf8_signifier
1731 1.1.1.3 mrg || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1732 1.1.1.3 mrg {
1733 1.1.1.3 mrg if (pedantic)
1734 1.1.1.3 mrg cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1735 1.1.1.3 mrg pfile->line_table->highest_line,
1736 1.1.1.3 mrg CPP_BUF_COL (buffer),
1737 1.1.1.3 mrg "invalid UTF-8 character <%x>",
1738 1.1.1.3 mrg cur[0]);
1739 1.1.1.3 mrg else
1740 1.1.1.3 mrg cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1741 1.1.1.3 mrg pfile->line_table->highest_line,
1742 1.1.1.3 mrg CPP_BUF_COL (buffer),
1743 1.1.1.3 mrg "invalid UTF-8 character <%x>",
1744 1.1.1.3 mrg cur[0]);
1745 1.1.1.3 mrg return cur + 1;
1746 1.1.1.3 mrg }
1747 1.1.1.3 mrg else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1748 1.1.1.3 mrg {
1749 1.1.1.3 mrg if (pedantic)
1750 1.1.1.3 mrg cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1751 1.1.1.3 mrg pfile->line_table->highest_line,
1752 1.1.1.3 mrg CPP_BUF_COL (buffer),
1753 1.1.1.3 mrg "invalid UTF-8 character <%x><%x>",
1754 1.1.1.3 mrg cur[0], cur[1]);
1755 1.1.1.3 mrg else
1756 1.1.1.3 mrg cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1757 1.1.1.3 mrg pfile->line_table->highest_line,
1758 1.1.1.3 mrg CPP_BUF_COL (buffer),
1759 1.1.1.3 mrg "invalid UTF-8 character <%x><%x>",
1760 1.1.1.3 mrg cur[0], cur[1]);
1761 1.1.1.3 mrg return cur + 2;
1762 1.1.1.3 mrg }
1763 1.1.1.3 mrg else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1764 1.1.1.3 mrg {
1765 1.1.1.3 mrg if (pedantic)
1766 1.1.1.3 mrg cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1767 1.1.1.3 mrg pfile->line_table->highest_line,
1768 1.1.1.3 mrg CPP_BUF_COL (buffer),
1769 1.1.1.3 mrg "invalid UTF-8 character <%x><%x><%x>",
1770 1.1.1.3 mrg cur[0], cur[1], cur[2]);
1771 1.1.1.3 mrg else
1772 1.1.1.3 mrg cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1773 1.1.1.3 mrg pfile->line_table->highest_line,
1774 1.1.1.3 mrg CPP_BUF_COL (buffer),
1775 1.1.1.3 mrg "invalid UTF-8 character <%x><%x><%x>",
1776 1.1.1.3 mrg cur[0], cur[1], cur[2]);
1777 1.1.1.3 mrg return cur + 3;
1778 1.1.1.3 mrg }
1779 1.1.1.3 mrg else
1780 1.1.1.3 mrg {
1781 1.1.1.3 mrg if (pedantic)
1782 1.1.1.3 mrg cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1783 1.1.1.3 mrg pfile->line_table->highest_line,
1784 1.1.1.3 mrg CPP_BUF_COL (buffer),
1785 1.1.1.3 mrg "invalid UTF-8 character <%x><%x><%x><%x>",
1786 1.1.1.3 mrg cur[0], cur[1], cur[2], cur[3]);
1787 1.1.1.3 mrg else
1788 1.1.1.3 mrg cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1789 1.1.1.3 mrg pfile->line_table->highest_line,
1790 1.1.1.3 mrg CPP_BUF_COL (buffer),
1791 1.1.1.3 mrg "invalid UTF-8 character <%x><%x><%x><%x>",
1792 1.1.1.3 mrg cur[0], cur[1], cur[2], cur[3]);
1793 1.1.1.3 mrg return cur + 4;
1794 1.1.1.3 mrg }
1795 1.1.1.3 mrg }
1796 1.1.1.3 mrg
1797 1.1.1.3 mrg /* Helper function of *skip_*_comment and lex*_string. For C,
1798 1.1.1.3 mrg character at CUR[-1] with MSB set handle -Wbidi-chars* and
1799 1.1.1.3 mrg -Winvalid-utf8 diagnostics and return pointer to first character
1800 1.1.1.3 mrg that should be processed next. */
1801 1.1.1.3 mrg
1802 1.1.1.3 mrg static inline const uchar *
1803 1.1.1.3 mrg _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1804 1.1.1.3 mrg const uchar *cur, bool warn_bidi_p,
1805 1.1.1.3 mrg bool warn_invalid_utf8_p)
1806 1.1.1.3 mrg {
1807 1.1.1.3 mrg /* If this is a beginning of a UTF-8 encoding, it might be
1808 1.1.1.3 mrg a bidirectional control character. */
1809 1.1.1.3 mrg if (c == bidi::utf8_start && warn_bidi_p)
1810 1.1.1.3 mrg {
1811 1.1.1.3 mrg location_t loc;
1812 1.1.1.3 mrg bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1813 1.1.1.3 mrg maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1814 1.1.1.3 mrg }
1815 1.1.1.3 mrg if (!warn_invalid_utf8_p)
1816 1.1.1.3 mrg return cur;
1817 1.1.1.3 mrg if (c >= utf8_signifier)
1818 1.1.1.3 mrg {
1819 1.1.1.3 mrg cppchar_t s;
1820 1.1.1.3 mrg const uchar *pstr = cur - 1;
1821 1.1.1.3 mrg if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1822 1.1.1.3 mrg && s <= UCS_LIMIT)
1823 1.1.1.3 mrg return pstr;
1824 1.1.1.3 mrg }
1825 1.1.1.3 mrg pfile->buffer->cur = cur - 1;
1826 1.1.1.3 mrg return _cpp_warn_invalid_utf8 (pfile);
1827 1.1.1.3 mrg }
1828 1.1 mrg
1829 1.1 mrg /* Skip a C-style block comment. We find the end of the comment by
1830 1.1 mrg seeing if an asterisk is before every '/' we encounter. Returns
1831 1.1 mrg nonzero if comment terminated by EOF, zero otherwise.
1832 1.1 mrg
1833 1.1 mrg Buffer->cur points to the initial asterisk of the comment. */
1834 1.1 mrg bool
1835 1.1 mrg _cpp_skip_block_comment (cpp_reader *pfile)
1836 1.1 mrg {
1837 1.1 mrg cpp_buffer *buffer = pfile->buffer;
1838 1.1 mrg const uchar *cur = buffer->cur;
1839 1.1 mrg uchar c;
1840 1.1.1.3 mrg const bool warn_bidi_p = pfile->warn_bidi_p ();
1841 1.1.1.3 mrg const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1842 1.1 mrg const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1843 1.1 mrg
1844 1.1 mrg cur++;
1845 1.1 mrg if (*cur == '/')
1846 1.1 mrg cur++;
1847 1.1 mrg
1848 1.1 mrg for (;;)
1849 1.1 mrg {
1850 1.1 mrg /* People like decorating comments with '*', so check for '/'
1851 1.1 mrg instead for efficiency. */
1852 1.1 mrg c = *cur++;
1853 1.1 mrg
1854 1.1 mrg if (c == '/')
1855 1.1 mrg {
1856 1.1 mrg if (cur[-2] == '*')
1857 1.1 mrg {
1858 1.1 mrg if (warn_bidi_p)
1859 1.1 mrg maybe_warn_bidi_on_close (pfile, cur);
1860 1.1 mrg break;
1861 1.1 mrg }
1862 1.1 mrg
1863 1.1 mrg /* Warn about potential nested comments, but not if the '/'
1864 1.1 mrg comes immediately before the true comment delimiter.
1865 1.1 mrg Don't bother to get it right across escaped newlines. */
1866 1.1 mrg if (CPP_OPTION (pfile, warn_comments)
1867 1.1 mrg && cur[0] == '*' && cur[1] != '/')
1868 1.1 mrg {
1869 1.1 mrg buffer->cur = cur;
1870 1.1 mrg cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1871 1.1 mrg pfile->line_table->highest_line,
1872 1.1 mrg CPP_BUF_COL (buffer),
1873 1.1 mrg "\"/*\" within comment");
1874 1.1 mrg }
1875 1.1 mrg }
1876 1.1 mrg else if (c == '\n')
1877 1.1 mrg {
1878 1.1 mrg unsigned int cols;
1879 1.1 mrg buffer->cur = cur - 1;
1880 1.1 mrg if (warn_bidi_p)
1881 1.1 mrg maybe_warn_bidi_on_close (pfile, cur);
1882 1.1 mrg _cpp_process_line_notes (pfile, true);
1883 1.1 mrg if (buffer->next_line >= buffer->rlimit)
1884 1.1 mrg return true;
1885 1.1 mrg _cpp_clean_line (pfile);
1886 1.1 mrg
1887 1.1 mrg cols = buffer->next_line - buffer->line_base;
1888 1.1 mrg CPP_INCREMENT_LINE (pfile, cols);
1889 1.1 mrg
1890 1.1 mrg cur = buffer->cur;
1891 1.1.1.3 mrg }
1892 1.1.1.3 mrg else if (__builtin_expect (c >= utf8_continuation, 0)
1893 1.1.1.3 mrg && warn_bidi_or_invalid_utf8_p)
1894 1.1.1.3 mrg cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1895 1.1 mrg warn_invalid_utf8_p);
1896 1.1 mrg }
1897 1.1 mrg
1898 1.1 mrg buffer->cur = cur;
1899 1.1 mrg _cpp_process_line_notes (pfile, true);
1900 1.1 mrg return false;
1901 1.1 mrg }
1902 1.1 mrg
1903 1.1 mrg /* Skip a C++ line comment, leaving buffer->cur pointing to the
1904 1.1 mrg terminating newline. Handles escaped newlines. Returns nonzero
1905 1.1 mrg if a multiline comment. */
1906 1.1 mrg static int
1907 1.1 mrg skip_line_comment (cpp_reader *pfile)
1908 1.1 mrg {
1909 1.1 mrg cpp_buffer *buffer = pfile->buffer;
1910 1.1 mrg location_t orig_line = pfile->line_table->highest_line;
1911 1.1.1.3 mrg const bool warn_bidi_p = pfile->warn_bidi_p ();
1912 1.1.1.3 mrg const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1913 1.1 mrg const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1914 1.1.1.3 mrg
1915 1.1 mrg if (!warn_bidi_or_invalid_utf8_p)
1916 1.1 mrg while (*buffer->cur != '\n')
1917 1.1.1.3 mrg buffer->cur++;
1918 1.1 mrg else if (!warn_invalid_utf8_p)
1919 1.1 mrg {
1920 1.1 mrg while (*buffer->cur != '\n'
1921 1.1 mrg && *buffer->cur != bidi::utf8_start)
1922 1.1 mrg buffer->cur++;
1923 1.1 mrg if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1924 1.1 mrg {
1925 1.1 mrg while (*buffer->cur != '\n')
1926 1.1 mrg {
1927 1.1 mrg if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1928 1.1 mrg {
1929 1.1 mrg location_t loc;
1930 1.1 mrg bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1931 1.1 mrg maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1932 1.1 mrg }
1933 1.1 mrg buffer->cur++;
1934 1.1 mrg }
1935 1.1 mrg maybe_warn_bidi_on_close (pfile, buffer->cur);
1936 1.1 mrg }
1937 1.1.1.3 mrg }
1938 1.1.1.3 mrg else
1939 1.1.1.3 mrg {
1940 1.1.1.3 mrg while (*buffer->cur != '\n')
1941 1.1.1.3 mrg {
1942 1.1.1.3 mrg if (*buffer->cur < utf8_continuation)
1943 1.1.1.3 mrg {
1944 1.1.1.3 mrg buffer->cur++;
1945 1.1.1.3 mrg continue;
1946 1.1.1.3 mrg }
1947 1.1.1.3 mrg buffer->cur
1948 1.1.1.3 mrg = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1949 1.1.1.3 mrg warn_bidi_p, warn_invalid_utf8_p);
1950 1.1.1.3 mrg }
1951 1.1.1.3 mrg if (warn_bidi_p)
1952 1.1.1.3 mrg maybe_warn_bidi_on_close (pfile, buffer->cur);
1953 1.1 mrg }
1954 1.1 mrg
1955 1.1 mrg _cpp_process_line_notes (pfile, true);
1956 1.1 mrg return orig_line != pfile->line_table->highest_line;
1957 1.1 mrg }
1958 1.1 mrg
1959 1.1 mrg /* Skips whitespace, saving the next non-whitespace character. */
1960 1.1 mrg static void
1961 1.1 mrg skip_whitespace (cpp_reader *pfile, cppchar_t c)
1962 1.1 mrg {
1963 1.1 mrg cpp_buffer *buffer = pfile->buffer;
1964 1.1 mrg bool saw_NUL = false;
1965 1.1 mrg
1966 1.1 mrg do
1967 1.1 mrg {
1968 1.1 mrg /* Horizontal space always OK. */
1969 1.1 mrg if (c == ' ' || c == '\t')
1970 1.1 mrg ;
1971 1.1 mrg /* Just \f \v or \0 left. */
1972 1.1 mrg else if (c == '\0')
1973 1.1 mrg saw_NUL = true;
1974 1.1 mrg else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1975 1.1 mrg cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1976 1.1 mrg CPP_BUF_COL (buffer),
1977 1.1 mrg "%s in preprocessing directive",
1978 1.1 mrg c == '\f' ? "form feed" : "vertical tab");
1979 1.1 mrg
1980 1.1 mrg c = *buffer->cur++;
1981 1.1 mrg }
1982 1.1 mrg /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1983 1.1 mrg while (is_nvspace (c));
1984 1.1 mrg
1985 1.1 mrg if (saw_NUL)
1986 1.1 mrg {
1987 1.1 mrg encoding_rich_location rich_loc (pfile);
1988 1.1 mrg cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1989 1.1 mrg "null character(s) ignored");
1990 1.1 mrg }
1991 1.1 mrg
1992 1.1 mrg buffer->cur--;
1993 1.1 mrg }
1994 1.1 mrg
1995 1.1 mrg /* See if the characters of a number token are valid in a name (no
1996 1.1 mrg '.', '+' or '-'). */
1997 1.1 mrg static int
1998 1.1 mrg name_p (cpp_reader *pfile, const cpp_string *string)
1999 1.1 mrg {
2000 1.1 mrg unsigned int i;
2001 1.1 mrg
2002 1.1 mrg for (i = 0; i < string->len; i++)
2003 1.1 mrg if (!is_idchar (string->text[i]))
2004 1.1 mrg return 0;
2005 1.1 mrg
2006 1.1 mrg return 1;
2007 1.1 mrg }
2008 1.1 mrg
2009 1.1 mrg /* After parsing an identifier or other sequence, produce a warning about
2010 1.1 mrg sequences not in NFC/NFKC. */
2011 1.1 mrg static void
2012 1.1 mrg warn_about_normalization (cpp_reader *pfile,
2013 1.1.1.3 mrg const cpp_token *token,
2014 1.1.1.3 mrg const struct normalize_state *s,
2015 1.1 mrg bool identifier)
2016 1.1 mrg {
2017 1.1 mrg if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2018 1.1 mrg && !pfile->state.skipping)
2019 1.1 mrg {
2020 1.1 mrg location_t loc = token->src_loc;
2021 1.1 mrg
2022 1.1 mrg /* If possible, create a location range for the token. */
2023 1.1 mrg if (loc >= RESERVED_LOCATION_COUNT
2024 1.1 mrg && token->type != CPP_EOF
2025 1.1 mrg /* There must be no line notes to process. */
2026 1.1 mrg && (!(pfile->buffer->cur
2027 1.1 mrg >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2028 1.1 mrg && !pfile->overlaid_buffer)))
2029 1.1 mrg {
2030 1.1 mrg source_range tok_range;
2031 1.1 mrg tok_range.m_start = loc;
2032 1.1 mrg tok_range.m_finish
2033 1.1 mrg = linemap_position_for_column (pfile->line_table,
2034 1.1 mrg CPP_BUF_COLUMN (pfile->buffer,
2035 1.1.1.3 mrg pfile->buffer->cur));
2036 1.1.1.3 mrg loc = pfile->line_table->get_or_create_combined_loc (loc, tok_range,
2037 1.1 mrg nullptr, 0);
2038 1.1 mrg }
2039 1.1 mrg
2040 1.1 mrg encoding_rich_location rich_loc (pfile, loc);
2041 1.1 mrg
2042 1.1 mrg /* Make sure that the token is printed using UCNs, even
2043 1.1 mrg if we'd otherwise happily print UTF-8. */
2044 1.1 mrg unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2045 1.1 mrg size_t sz;
2046 1.1 mrg
2047 1.1 mrg sz = cpp_spell_token (pfile, token, buf, false) - buf;
2048 1.1 mrg if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2049 1.1 mrg cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2050 1.1.1.3 mrg "`%.*s' is not in NFKC", (int) sz, buf);
2051 1.1 mrg else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2052 1.1 mrg cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2053 1.1 mrg "`%.*s' is not in NFC", (int) sz, buf);
2054 1.1 mrg else
2055 1.1 mrg cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2056 1.1 mrg "`%.*s' is not in NFC", (int) sz, buf);
2057 1.1 mrg free (buf);
2058 1.1 mrg }
2059 1.1 mrg }
2060 1.1.1.3 mrg
2061 1.1.1.3 mrg /* Returns TRUE if the byte sequence starting at buffer->cur is a valid
2062 1.1.1.3 mrg extended character in an identifier. If FIRST is TRUE, then the character
2063 1.1.1.3 mrg must be valid at the beginning of an identifier as well. If the return
2064 1.1.1.3 mrg value is TRUE, then pfile->buffer->cur has been moved to point to the next
2065 1.1 mrg byte after the extended character. */
2066 1.1 mrg
2067 1.1 mrg static bool
2068 1.1 mrg forms_identifier_p (cpp_reader *pfile, int first,
2069 1.1 mrg struct normalize_state *state)
2070 1.1 mrg {
2071 1.1 mrg cpp_buffer *buffer = pfile->buffer;
2072 1.1 mrg const bool warn_bidi_p = pfile->warn_bidi_p ();
2073 1.1 mrg
2074 1.1 mrg if (*buffer->cur == '$')
2075 1.1 mrg {
2076 1.1 mrg if (!CPP_OPTION (pfile, dollars_in_ident))
2077 1.1 mrg return false;
2078 1.1 mrg
2079 1.1 mrg buffer->cur++;
2080 1.1 mrg if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2081 1.1 mrg {
2082 1.1 mrg CPP_OPTION (pfile, warn_dollars) = 0;
2083 1.1 mrg cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2084 1.1 mrg }
2085 1.1 mrg
2086 1.1 mrg return true;
2087 1.1 mrg }
2088 1.1 mrg
2089 1.1 mrg /* Is this a syntactically valid UCN or a valid UTF-8 char? */
2090 1.1 mrg if (CPP_OPTION (pfile, extended_identifiers))
2091 1.1 mrg {
2092 1.1 mrg cppchar_t s;
2093 1.1 mrg if (*buffer->cur >= utf8_signifier)
2094 1.1 mrg {
2095 1.1 mrg if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2096 1.1 mrg && warn_bidi_p)
2097 1.1 mrg {
2098 1.1 mrg location_t loc;
2099 1.1 mrg bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2100 1.1 mrg maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2101 1.1 mrg }
2102 1.1 mrg if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2103 1.1 mrg state, &s))
2104 1.1 mrg return true;
2105 1.1 mrg }
2106 1.1.1.3 mrg else if (*buffer->cur == '\\'
2107 1.1.1.3 mrg && (buffer->cur[1] == 'u'
2108 1.1.1.3 mrg || buffer->cur[1] == 'U'
2109 1.1 mrg || buffer->cur[1] == 'N'))
2110 1.1 mrg {
2111 1.1 mrg buffer->cur += 2;
2112 1.1 mrg if (warn_bidi_p)
2113 1.1 mrg {
2114 1.1.1.3 mrg location_t loc;
2115 1.1.1.3 mrg bidi::kind kind;
2116 1.1.1.3 mrg if (buffer->cur[-1] == 'N')
2117 1.1.1.3 mrg kind = get_bidi_named (pfile, buffer->cur, &loc);
2118 1.1.1.3 mrg else
2119 1.1.1.3 mrg kind = get_bidi_ucn (pfile, buffer->cur,
2120 1.1 mrg buffer->cur[-1] == 'U', &loc);
2121 1.1 mrg maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2122 1.1 mrg }
2123 1.1 mrg if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2124 1.1 mrg state, &s, NULL, NULL))
2125 1.1 mrg return true;
2126 1.1 mrg buffer->cur -= 2;
2127 1.1 mrg }
2128 1.1 mrg }
2129 1.1 mrg
2130 1.1 mrg return false;
2131 1.1 mrg }
2132 1.1 mrg
2133 1.1 mrg /* Helper function to issue error about improper __VA_OPT__ use. */
2134 1.1 mrg static void
2135 1.1 mrg maybe_va_opt_error (cpp_reader *pfile)
2136 1.1 mrg {
2137 1.1 mrg if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2138 1.1 mrg {
2139 1.1 mrg /* __VA_OPT__ should not be accepted at all, but allow it in
2140 1.1 mrg system headers. */
2141 1.1.1.3 mrg if (!_cpp_in_system_header (pfile))
2142 1.1.1.3 mrg {
2143 1.1.1.3 mrg if (CPP_OPTION (pfile, cplusplus))
2144 1.1.1.3 mrg cpp_error (pfile, CPP_DL_PEDWARN,
2145 1.1.1.3 mrg "__VA_OPT__ is not available until C++20");
2146 1.1.1.3 mrg else
2147 1.1.1.3 mrg cpp_error (pfile, CPP_DL_PEDWARN,
2148 1.1.1.3 mrg "__VA_OPT__ is not available until C23");
2149 1.1 mrg }
2150 1.1 mrg }
2151 1.1 mrg else if (!pfile->state.va_args_ok)
2152 1.1 mrg {
2153 1.1 mrg /* __VA_OPT__ should only appear in the replacement list of a
2154 1.1 mrg variadic macro. */
2155 1.1 mrg cpp_error (pfile, CPP_DL_PEDWARN,
2156 1.1 mrg "__VA_OPT__ can only appear in the expansion"
2157 1.1 mrg " of a C++20 variadic macro");
2158 1.1 mrg }
2159 1.1 mrg }
2160 1.1.1.3 mrg
2161 1.1.1.3 mrg /* Helper function to perform diagnostics that are needed (rarely)
2162 1.1.1.3 mrg when an identifier is lexed. */
2163 1.1.1.3 mrg static void
2164 1.1.1.3 mrg identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
2165 1.1.1.3 mrg {
2166 1.1.1.3 mrg if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
2167 1.1.1.3 mrg || pfile->state.skipping, 1))
2168 1.1.1.3 mrg return;
2169 1.1.1.3 mrg
2170 1.1.1.3 mrg /* It is allowed to poison the same identifier twice. */
2171 1.1.1.3 mrg if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2172 1.1.1.3 mrg {
2173 1.1.1.3 mrg cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2174 1.1.1.3 mrg NODE_NAME (node));
2175 1.1.1.3 mrg const auto data = (cpp_hashnode_extra *)
2176 1.1.1.3 mrg ht_lookup (pfile->extra_hash_table, node->ident, HT_NO_INSERT);
2177 1.1.1.3 mrg if (data && data->poisoned_loc)
2178 1.1.1.3 mrg cpp_error_at (pfile, CPP_DL_NOTE, data->poisoned_loc, "poisoned here");
2179 1.1.1.3 mrg }
2180 1.1.1.3 mrg
2181 1.1.1.3 mrg /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2182 1.1.1.3 mrg replacement list of a variadic macro. */
2183 1.1.1.3 mrg if (node == pfile->spec_nodes.n__VA_ARGS__
2184 1.1.1.3 mrg && !pfile->state.va_args_ok)
2185 1.1.1.3 mrg {
2186 1.1.1.3 mrg if (CPP_OPTION (pfile, cplusplus))
2187 1.1.1.3 mrg cpp_error (pfile, CPP_DL_PEDWARN,
2188 1.1.1.3 mrg "__VA_ARGS__ can only appear in the expansion"
2189 1.1.1.3 mrg " of a C++11 variadic macro");
2190 1.1.1.3 mrg else
2191 1.1.1.3 mrg cpp_error (pfile, CPP_DL_PEDWARN,
2192 1.1.1.3 mrg "__VA_ARGS__ can only appear in the expansion"
2193 1.1.1.3 mrg " of a C99 variadic macro");
2194 1.1.1.3 mrg }
2195 1.1.1.3 mrg
2196 1.1.1.3 mrg /* __VA_OPT__ should only appear in the replacement list of a
2197 1.1.1.3 mrg variadic macro. */
2198 1.1.1.3 mrg if (node == pfile->spec_nodes.n__VA_OPT__)
2199 1.1.1.3 mrg maybe_va_opt_error (pfile);
2200 1.1.1.3 mrg
2201 1.1.1.3 mrg /* For -Wc++-compat, warn about use of C++ named operators. */
2202 1.1.1.3 mrg if (node->flags & NODE_WARN_OPERATOR)
2203 1.1.1.3 mrg cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2204 1.1.1.3 mrg "identifier \"%s\" is a special operator name in C++",
2205 1.1.1.3 mrg NODE_NAME (node));
2206 1.1.1.3 mrg }
2207 1.1 mrg
2208 1.1 mrg /* Helper function to get the cpp_hashnode of the identifier BASE. */
2209 1.1 mrg static cpp_hashnode *
2210 1.1 mrg lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2211 1.1 mrg {
2212 1.1 mrg cpp_hashnode *result;
2213 1.1 mrg const uchar *cur;
2214 1.1 mrg unsigned int len;
2215 1.1 mrg unsigned int hash = HT_HASHSTEP (0, *base);
2216 1.1 mrg
2217 1.1 mrg cur = base + 1;
2218 1.1 mrg while (ISIDNUM (*cur))
2219 1.1 mrg {
2220 1.1 mrg hash = HT_HASHSTEP (hash, *cur);
2221 1.1 mrg cur++;
2222 1.1 mrg }
2223 1.1 mrg len = cur - base;
2224 1.1 mrg hash = HT_HASHFINISH (hash, len);
2225 1.1 mrg result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2226 1.1.1.3 mrg base, len, hash, HT_ALLOC));
2227 1.1 mrg identifier_diagnostics_on_lex (pfile, result);
2228 1.1 mrg return result;
2229 1.1 mrg }
2230 1.1 mrg
2231 1.1 mrg /* Get the cpp_hashnode of an identifier specified by NAME in
2232 1.1 mrg the current cpp_reader object. If none is found, NULL is returned. */
2233 1.1 mrg cpp_hashnode *
2234 1.1 mrg _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2235 1.1 mrg {
2236 1.1 mrg cpp_hashnode *result;
2237 1.1 mrg result = lex_identifier_intern (pfile, (uchar *) name);
2238 1.1 mrg return result;
2239 1.1 mrg }
2240 1.1.1.3 mrg
2241 1.1.1.3 mrg /* Lex an identifier starting at BASE. BUFFER->CUR is expected to point
2242 1.1.1.3 mrg one past the first character at BASE, which may be a (possibly multi-byte)
2243 1.1 mrg character if STARTS_UCN is true. */
2244 1.1 mrg static cpp_hashnode *
2245 1.1 mrg lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2246 1.1 mrg struct normalize_state *nst, cpp_hashnode **spelling)
2247 1.1 mrg {
2248 1.1 mrg cpp_hashnode *result;
2249 1.1 mrg const uchar *cur;
2250 1.1 mrg unsigned int len;
2251 1.1 mrg unsigned int hash = HT_HASHSTEP (0, *base);
2252 1.1 mrg const bool warn_bidi_p = pfile->warn_bidi_p ();
2253 1.1 mrg
2254 1.1 mrg cur = pfile->buffer->cur;
2255 1.1 mrg if (! starts_ucn)
2256 1.1 mrg {
2257 1.1 mrg while (ISIDNUM (*cur))
2258 1.1 mrg {
2259 1.1 mrg hash = HT_HASHSTEP (hash, *cur);
2260 1.1 mrg cur++;
2261 1.1 mrg }
2262 1.1 mrg NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2263 1.1 mrg }
2264 1.1 mrg pfile->buffer->cur = cur;
2265 1.1 mrg if (starts_ucn || forms_identifier_p (pfile, false, nst))
2266 1.1 mrg {
2267 1.1 mrg /* Slower version for identifiers containing UCNs
2268 1.1 mrg or extended chars (including $). */
2269 1.1 mrg do {
2270 1.1 mrg while (ISIDNUM (*pfile->buffer->cur))
2271 1.1 mrg {
2272 1.1 mrg NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2273 1.1 mrg pfile->buffer->cur++;
2274 1.1 mrg }
2275 1.1 mrg } while (forms_identifier_p (pfile, false, nst));
2276 1.1 mrg if (warn_bidi_p)
2277 1.1 mrg maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2278 1.1 mrg result = _cpp_interpret_identifier (pfile, base,
2279 1.1 mrg pfile->buffer->cur - base);
2280 1.1 mrg *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2281 1.1 mrg }
2282 1.1 mrg else
2283 1.1 mrg {
2284 1.1 mrg len = cur - base;
2285 1.1 mrg hash = HT_HASHFINISH (hash, len);
2286 1.1 mrg
2287 1.1 mrg result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2288 1.1 mrg base, len, hash, HT_ALLOC));
2289 1.1 mrg *spelling = result;
2290 1.1 mrg }
2291 1.1.1.3 mrg
2292 1.1.1.3 mrg return result;
2293 1.1 mrg }
2294 1.1.1.3 mrg
2295 1.1.1.3 mrg /* Struct to hold the return value of the scan_cur_identifier () helper
2296 1.1 mrg function below. */
2297 1.1.1.3 mrg
2298 1.1.1.3 mrg struct scan_id_result
2299 1.1.1.3 mrg {
2300 1.1.1.3 mrg cpp_hashnode *node;
2301 1.1.1.3 mrg normalize_state nst;
2302 1.1.1.3 mrg
2303 1.1.1.3 mrg scan_id_result ()
2304 1.1.1.3 mrg : node (nullptr)
2305 1.1.1.3 mrg {
2306 1.1.1.3 mrg nst = INITIAL_NORMALIZE_STATE;
2307 1.1 mrg }
2308 1.1.1.3 mrg
2309 1.1.1.3 mrg explicit operator bool () const { return node; }
2310 1.1.1.3 mrg };
2311 1.1.1.3 mrg
2312 1.1.1.3 mrg /* Helper function to scan an entire identifier beginning at
2313 1.1.1.3 mrg pfile->buffer->cur, and possibly containing extended characters (UCNs
2314 1.1.1.3 mrg and/or UTF-8). Returns the cpp_hashnode for the identifier on success, or
2315 1.1.1.3 mrg else nullptr, as well as a normalize_state so that normalization warnings
2316 1.1.1.3 mrg may be issued once the token lexing is complete. */
2317 1.1.1.3 mrg
2318 1.1.1.3 mrg static scan_id_result
2319 1.1.1.3 mrg scan_cur_identifier (cpp_reader *pfile)
2320 1.1.1.3 mrg {
2321 1.1.1.3 mrg const auto buffer = pfile->buffer;
2322 1.1.1.3 mrg const auto begin = buffer->cur;
2323 1.1.1.3 mrg scan_id_result result;
2324 1.1.1.3 mrg if (ISIDST (*buffer->cur))
2325 1.1.1.3 mrg {
2326 1.1.1.3 mrg ++buffer->cur;
2327 1.1.1.3 mrg cpp_hashnode *ignore;
2328 1.1.1.3 mrg result.node = lex_identifier (pfile, begin, false, &result.nst, &ignore);
2329 1.1.1.3 mrg }
2330 1.1.1.3 mrg else if (forms_identifier_p (pfile, true, &result.nst))
2331 1.1.1.3 mrg {
2332 1.1.1.3 mrg /* buffer->cur has been moved already by the call
2333 1.1.1.3 mrg to forms_identifier_p. */
2334 1.1.1.3 mrg cpp_hashnode *ignore;
2335 1.1.1.3 mrg result.node = lex_identifier (pfile, begin, true, &result.nst, &ignore);
2336 1.1 mrg }
2337 1.1 mrg return result;
2338 1.1 mrg }
2339 1.1 mrg
2340 1.1 mrg /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2341 1.1 mrg static void
2342 1.1 mrg lex_number (cpp_reader *pfile, cpp_string *number,
2343 1.1 mrg struct normalize_state *nst)
2344 1.1 mrg {
2345 1.1 mrg const uchar *cur;
2346 1.1 mrg const uchar *base;
2347 1.1 mrg uchar *dest;
2348 1.1 mrg
2349 1.1 mrg base = pfile->buffer->cur - 1;
2350 1.1 mrg do
2351 1.1 mrg {
2352 1.1 mrg const uchar *adj_digit_sep = NULL;
2353 1.1 mrg cur = pfile->buffer->cur;
2354 1.1 mrg
2355 1.1 mrg /* N.B. ISIDNUM does not include $. */
2356 1.1 mrg while (ISIDNUM (*cur)
2357 1.1 mrg || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2358 1.1 mrg || DIGIT_SEP (*cur)
2359 1.1 mrg || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2360 1.1 mrg {
2361 1.1 mrg NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2362 1.1 mrg /* Adjacent digit separators do not form part of the pp-number syntax.
2363 1.1 mrg However, they can safely be diagnosed here as an error, since '' is
2364 1.1 mrg not a valid preprocessing token. */
2365 1.1 mrg if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2366 1.1 mrg adj_digit_sep = cur;
2367 1.1 mrg cur++;
2368 1.1 mrg }
2369 1.1 mrg /* A number can't end with a digit separator. */
2370 1.1 mrg while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2371 1.1 mrg --cur;
2372 1.1 mrg if (adj_digit_sep && adj_digit_sep < cur)
2373 1.1 mrg cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2374 1.1 mrg
2375 1.1 mrg pfile->buffer->cur = cur;
2376 1.1 mrg }
2377 1.1 mrg while (forms_identifier_p (pfile, false, nst));
2378 1.1 mrg
2379 1.1 mrg number->len = cur - base;
2380 1.1 mrg dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2381 1.1 mrg memcpy (dest, base, number->len);
2382 1.1 mrg dest[number->len] = '\0';
2383 1.1 mrg number->text = dest;
2384 1.1 mrg }
2385 1.1 mrg
2386 1.1 mrg /* Create a token of type TYPE with a literal spelling. */
2387 1.1 mrg static void
2388 1.1 mrg create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2389 1.1 mrg unsigned int len, enum cpp_ttype type)
2390 1.1 mrg {
2391 1.1 mrg token->type = type;
2392 1.1 mrg token->val.str.len = len;
2393 1.1 mrg token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2394 1.1 mrg }
2395 1.1.1.3 mrg
2396 1.1.1.3 mrg /* Like create_literal(), but construct it from two separate strings
2397 1.1.1.3 mrg which are concatenated. LEN2 may be 0 if no second string is
2398 1.1.1.3 mrg required. */
2399 1.1.1.3 mrg static void
2400 1.1.1.3 mrg create_literal2 (cpp_reader *pfile, cpp_token *token, const uchar *base1,
2401 1.1.1.3 mrg unsigned int len1, const uchar *base2, unsigned int len2,
2402 1.1.1.3 mrg enum cpp_ttype type)
2403 1.1.1.3 mrg {
2404 1.1.1.3 mrg token->type = type;
2405 1.1.1.3 mrg token->val.str.len = len1 + len2;
2406 1.1.1.3 mrg uchar *const dest = _cpp_unaligned_alloc (pfile, len1 + len2 + 1);
2407 1.1.1.3 mrg memcpy (dest, base1, len1);
2408 1.1.1.3 mrg if (len2)
2409 1.1.1.3 mrg memcpy (dest+len1, base2, len2);
2410 1.1.1.3 mrg dest[len1 + len2] = 0;
2411 1.1.1.3 mrg token->val.str.text = dest;
2412 1.1.1.3 mrg }
2413 1.1 mrg
2414 1.1 mrg const uchar *
2415 1.1 mrg cpp_alloc_token_string (cpp_reader *pfile,
2416 1.1 mrg const unsigned char *ptr, unsigned len)
2417 1.1 mrg {
2418 1.1 mrg uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2419 1.1 mrg
2420 1.1 mrg dest[len] = 0;
2421 1.1 mrg memcpy (dest, ptr, len);
2422 1.1 mrg return dest;
2423 1.1 mrg }
2424 1.1 mrg
2425 1.1 mrg /* A pair of raw buffer pointers. The currently open one is [1], the
2426 1.1 mrg first one is [0]. Used for string literal lexing. */
2427 1.1 mrg struct lit_accum {
2428 1.1 mrg _cpp_buff *first;
2429 1.1 mrg _cpp_buff *last;
2430 1.1 mrg const uchar *rpos;
2431 1.1 mrg size_t accum;
2432 1.1 mrg
2433 1.1 mrg lit_accum ()
2434 1.1 mrg : first (NULL), last (NULL), rpos (0), accum (0)
2435 1.1 mrg {
2436 1.1 mrg }
2437 1.1 mrg
2438 1.1 mrg void append (cpp_reader *, const uchar *, size_t);
2439 1.1 mrg
2440 1.1 mrg void read_begin (cpp_reader *);
2441 1.1 mrg bool reading_p () const
2442 1.1 mrg {
2443 1.1 mrg return rpos != NULL;
2444 1.1 mrg }
2445 1.1 mrg char read_char ()
2446 1.1 mrg {
2447 1.1 mrg char c = *rpos++;
2448 1.1 mrg if (rpos == BUFF_FRONT (last))
2449 1.1 mrg rpos = NULL;
2450 1.1 mrg return c;
2451 1.1.1.3 mrg }
2452 1.1.1.3 mrg
2453 1.1.1.3 mrg void create_literal2 (cpp_reader *pfile, cpp_token *token,
2454 1.1.1.3 mrg const uchar *base1, unsigned int len1,
2455 1.1.1.3 mrg const uchar *base2, unsigned int len2,
2456 1.1 mrg enum cpp_ttype type);
2457 1.1 mrg };
2458 1.1 mrg
2459 1.1 mrg /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2460 1.1 mrg sequence running from FIRST to LAST, starting the sequence if it is empty, and add LEN to ACCUM. */
2461 1.1 mrg
2462 1.1 mrg void
2463 1.1 mrg lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2464 1.1 mrg {
2465 1.1 mrg if (!last)
2466 1.1 mrg /* Starting. */
2467 1.1 mrg first = last = _cpp_get_buff (pfile, len);
2468 1.1 mrg else if (len > BUFF_ROOM (last))
2469 1.1 mrg {
2470 1.1 mrg /* There is insufficient room in the buffer. Copy what we can,
2471 1.1 mrg and then either extend or create a new one. */
2472 1.1 mrg size_t room = BUFF_ROOM (last);
2473 1.1 mrg memcpy (BUFF_FRONT (last), base, room);
2474 1.1 mrg BUFF_FRONT (last) += room;
2475 1.1 mrg base += room;
2476 1.1 mrg len -= room;
2477 1.1 mrg accum += room;
2478 1.1 mrg
/* No read may be in progress here.  NOTE(review): presumably because RPOS
   points into the old LAST, which the call below may replace -- confirm
   against _cpp_append_extend_buff.  */
2479 1.1 mrg gcc_checking_assert (!rpos);
2480 1.1 mrg
2481 1.1 mrg last = _cpp_append_extend_buff (pfile, last, len);
2482 1.1 mrg }
2483 1.1 mrg
/* Common tail: copy the (remaining) LEN bytes into LAST.  */
2484 1.1 mrg memcpy (BUFF_FRONT (last), base, len);
2485 1.1 mrg BUFF_FRONT (last) += len;
2486 1.1 mrg accum += len;
2487 1.1 mrg }
2488 1.1 mrg
/* Start reading back the bytes that are about to be appended: make sure
   LAST has room for at least 4 more bytes, then point RPOS at the current
   front of LAST.  LAST is used without a NULL check, so append must have
   been called at least once beforehand.  */
2489 1.1 mrg void
2490 1.1 mrg lit_accum::read_begin (cpp_reader *pfile)
2491 1.1 mrg {
2492 1.1 mrg /* We never accumulate more than 4 chars to read. */
/* Despite the blank line, the assignment to LAST below is the body of
   this `if'.  */
2493 1.1 mrg if (BUFF_ROOM (last) < 4)
2494 1.1 mrg
2495 1.1 mrg last = _cpp_append_extend_buff (pfile, last, 4);
2496 1.1 mrg rpos = BUFF_FRONT (last);
2497 1.1 mrg }
2498 1.1.1.3 mrg
2499 1.1.1.3 mrg /* Helper function to check if a string format macro, say from inttypes.h, is
2500 1.1.1.3 mrg placed touching a string literal, in which case it could be parsed as a C++11
2501 1.1.1.3 mrg user-defined string literal thus breaking the program. Return TRUE if the
2502 1.1.1.3 mrg UDL should be ignored for now and preserved for potential macro
2503 1.1 mrg expansion. */
/* SRC_LOC is the location reported in the warning, SUFFIX_BEGIN points at
   the first character of the candidate suffix, and NODE is the identifier
   node for that suffix.  */
2504 1.1 mrg
2505 1.1.1.3 mrg static bool
2506 1.1.1.3 mrg maybe_ignore_udl_macro_suffix (cpp_reader *pfile, location_t src_loc,
2507 1.1 mrg const uchar *suffix_begin, cpp_hashnode *node)
2508 1.1 mrg {
2509 1.1 mrg /* User-defined literals outside of namespace std must start with a single
2510 1.1 mrg underscore, so assume anything of that form really is a UDL suffix.
2511 1.1 mrg We don't need to worry about UDLs defined inside namespace std because
2512 1.1 mrg their names are reserved, so cannot be used as macro names in valid
2513 1.1.1.3 mrg programs. */
/* Returning false here makes lex_string and lex_raw_string keep the
   suffix as part of a user-defined literal token.  */
2514 1.1.1.3 mrg if ((suffix_begin[0] == '_' && suffix_begin[1] != '_')
2515 1.1 mrg || !cpp_macro_p (node))
2516 1.1.1.3 mrg return false;
2517 1.1.1.3 mrg
2518 1.1.1.3 mrg /* Maybe raise a warning here; caller should arrange not to consume
2519 1.1.1.3 mrg the tokens. */
2520 1.1.1.3 mrg if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2521 1.1.1.3 mrg cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, src_loc, 0,
2522 1.1.1.3 mrg "invalid suffix on literal; C++11 requires a space "
2523 1.1.1.3 mrg "between literal and string macro");
2524 1.1.1.3 mrg return true;
2525 1.1.1.3 mrg }
2526 1.1.1.3 mrg
2527 1.1.1.3 mrg /* Like create_literal2(), but also prepend all the accumulated data from
2528 1.1.1.3 mrg the lit_accum struct. */
/* TOKEN gets type TYPE and a freshly allocated, NUL-terminated string made
   of, in order: the contents of every buffer from FIRST onwards, LEN1
   bytes at BASE1, and LEN2 bytes at BASE2.  BASE2 is not read when LEN2 is
   zero.  The buffers themselves are not released here.  */
2529 1.1.1.3 mrg void
2530 1.1.1.3 mrg lit_accum::create_literal2 (cpp_reader *pfile, cpp_token *token,
2531 1.1.1.3 mrg const uchar *base1, unsigned int len1,
2532 1.1.1.3 mrg const uchar *base2, unsigned int len2,
2533 1.1.1.3 mrg enum cpp_ttype type)
2534 1.1.1.3 mrg {
2535 1.1.1.3 mrg const unsigned int tot_len = accum + len1 + len2;
2536 1.1.1.3 mrg uchar *dest = _cpp_unaligned_alloc (pfile, tot_len + 1);
2537 1.1.1.3 mrg token->type = type;
2538 1.1.1.3 mrg token->val.str.len = tot_len;
2539 1.1.1.3 mrg token->val.str.text = dest;
/* Copy the filled part of each accumulated buffer, in chain order.  */
2540 1.1.1.3 mrg for (_cpp_buff *buf = first; buf; buf = buf->next)
2541 1.1.1.3 mrg {
2542 1.1.1.3 mrg size_t len = BUFF_FRONT (buf) - buf->base;
2543 1.1.1.3 mrg memcpy (dest, buf->base, len);
2544 1.1.1.3 mrg dest += len;
2545 1.1.1.3 mrg }
2546 1.1.1.3 mrg memcpy (dest, base1, len1);
2547 1.1.1.3 mrg dest += len1;
2548 1.1.1.3 mrg if (len2)
2549 1.1.1.3 mrg memcpy (dest, base2, len2);
2550 1.1.1.3 mrg dest += len2;
2551 1.1 mrg *dest = '\0';
2552 1.1 mrg }
2553 1.1 mrg
2554 1.1 mrg /* Lexes a raw string. The stored string contains the spelling,
2555 1.1 mrg including double quotes, delimiter string, '(' and ')', any leading
2556 1.1 mrg 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2557 1.1 mrg the type of the literal, or CPP_OTHER if it was not properly
2558 1.1 mrg terminated.
2559 1.1 mrg
2560 1.1 mrg BASE is the start of the token. Updates pfile->buffer->cur to just
2561 1.1 mrg after the lexed string.
2562 1.1 mrg
2563 1.1 mrg The spelling is NUL-terminated, but it is not guaranteed that this
2564 1.1 mrg is the first NUL since embedded NULs are preserved. */
2565 1.1 mrg
2566 1.1 mrg static void
2567 1.1 mrg lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2568 1.1 mrg {
2569 1.1 mrg const uchar *pos = base;
2570 1.1.1.3 mrg const bool warn_bidi_p = pfile->warn_bidi_p ();
2571 1.1.1.3 mrg const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2572 1.1 mrg const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2573 1.1 mrg
2574 1.1 mrg /* 'tis a pity this information isn't passed down from the lexer's
2575 1.1 mrg initial categorization of the token. */
2576 1.1 mrg enum cpp_ttype type = CPP_STRING;
2577 1.1 mrg
2578 1.1 mrg if (*pos == 'L')
2579 1.1 mrg {
2580 1.1 mrg type = CPP_WSTRING;
2581 1.1 mrg pos++;
2582 1.1 mrg }
2583 1.1 mrg else if (*pos == 'U')
2584 1.1 mrg {
2585 1.1 mrg type = CPP_STRING32;
2586 1.1 mrg pos++;
2587 1.1 mrg }
2588 1.1 mrg else if (*pos == 'u')
2589 1.1 mrg {
2590 1.1 mrg if (pos[1] == '8')
2591 1.1 mrg {
2592 1.1 mrg type = CPP_UTF8STRING;
2593 1.1 mrg pos++;
2594 1.1 mrg }
2595 1.1 mrg else
2596 1.1 mrg type = CPP_STRING16;
2597 1.1 mrg pos++;
2598 1.1 mrg }
2599 1.1 mrg
2600 1.1 mrg gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2601 1.1 mrg pos += 2;
2602 1.1 mrg
2603 1.1 mrg _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2604 1.1 mrg
2605 1.1 mrg /* Skip notes before the ". */
2606 1.1 mrg while (note->pos < pos)
2607 1.1 mrg ++note;
2608 1.1 mrg
2609 1.1 mrg lit_accum accum;
2610 1.1 mrg
/* PREFIX collects the delimiter (at most 16 characters) and then the '"'
   stored when '(' is seen, so that it can be compared directly against
   the text following a ')'.  */
2611 1.1 mrg uchar prefix[17];
2612 1.1 mrg unsigned prefix_len = 0;
/* PHASE_PREFIX: still reading the delimiter before '('.  PHASE_NONE:
   scanning the body.  Values from PHASE_SUFFIX upwards are the index into
   PREFIX of the next character expected while matching the closing
   delimiter after a ')'.  */
2613 1.1 mrg enum Phase
2614 1.1 mrg {
2615 1.1 mrg PHASE_PREFIX = -2,
2616 1.1 mrg PHASE_NONE = -1,
2617 1.1 mrg PHASE_SUFFIX = 0
2618 1.1 mrg } phase = PHASE_PREFIX;
2619 1.1 mrg
2620 1.1 mrg for (;;)
2621 1.1 mrg {
2622 1.1 mrg gcc_checking_assert (note->pos >= pos);
2623 1.1 mrg
2624 1.1 mrg /* Undo any escaped newlines and trigraphs. */
2625 1.1 mrg if (!accum.reading_p () && note->pos == pos)
2626 1.1 mrg switch (note->type)
2627 1.1 mrg {
2628 1.1 mrg case '\\':
2629 1.1 mrg case ' ':
2630 1.1 mrg /* Restore backslash followed by newline. */
2631 1.1 mrg accum.append (pfile, base, pos - base);
2632 1.1 mrg base = pos;
2633 1.1 mrg accum.read_begin (pfile);
2634 1.1 mrg accum.append (pfile, UC"\\", 1);
2635 1.1 mrg
2636 1.1 mrg after_backslash:
2637 1.1 mrg if (note->type == ' ')
2638 1.1 mrg /* GNU backslash whitespace newline extension. FIXME
2639 1.1 mrg could be any sequence of non-vertical space. When we
2640 1.1 mrg can properly restore any such sequence, we should
2641 1.1 mrg mark this note as handled so _cpp_process_line_notes
2642 1.1 mrg doesn't warn. */
2643 1.1 mrg accum.append (pfile, UC" ", 1);
2644 1.1 mrg
2645 1.1 mrg accum.append (pfile, UC"\n", 1);
2646 1.1 mrg note++;
2647 1.1 mrg break;
2648 1.1 mrg
2649 1.1 mrg case '\n':
2650 1.1 mrg /* This can happen for ??/<NEWLINE> when trigraphs are not
2651 1.1 mrg being interpreted. */
2652 1.1 mrg gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2653 1.1 mrg note->type = 0;
2654 1.1 mrg note++;
2655 1.1 mrg break;
2656 1.1 mrg
2657 1.1 mrg default:
2658 1.1 mrg gcc_checking_assert (_cpp_trigraph_map[note->type]);
2659 1.1 mrg
2660 1.1 mrg /* Don't warn about this trigraph in
2661 1.1 mrg _cpp_process_line_notes, since trigraphs show up as
2662 1.1 mrg trigraphs in raw strings. */
2663 1.1 mrg uchar type = note->type;
2664 1.1 mrg note->type = 0;
2665 1.1 mrg
2666 1.1 mrg if (CPP_OPTION (pfile, trigraphs))
2667 1.1 mrg {
2668 1.1 mrg accum.append (pfile, base, pos - base);
2669 1.1 mrg base = pos;
2670 1.1 mrg accum.read_begin (pfile);
2671 1.1 mrg accum.append (pfile, UC"??", 2);
2672 1.1 mrg accum.append (pfile, &type, 1);
2673 1.1 mrg
2674 1.1 mrg /* ??/ followed by newline gets two line notes, one for
2675 1.1 mrg the trigraph and one for the backslash/newline. */
2676 1.1 mrg if (type == '/' && note[1].pos == pos)
2677 1.1 mrg {
2678 1.1 mrg note++;
2679 1.1 mrg gcc_assert (note->type == '\\' || note->type == ' ');
2680 1.1 mrg goto after_backslash;
2681 1.1 mrg }
2682 1.1 mrg /* Skip the replacement character. */
2683 1.1 mrg base = ++pos;
2684 1.1 mrg }
2685 1.1 mrg
2686 1.1 mrg note++;
2687 1.1 mrg break;
2688 1.1 mrg }
2689 1.1 mrg
2690 1.1 mrg /* Now get a char to process. Either from an expanded note, or
2691 1.1 mrg from the line buffer. */
2692 1.1 mrg bool read_note = accum.reading_p ();
2693 1.1 mrg char c = read_note ? accum.read_char () : *pos++;
2694 1.1 mrg
2695 1.1 mrg if (phase == PHASE_PREFIX)
2696 1.1 mrg {
2697 1.1 mrg if (c == '(')
2698 1.1 mrg {
2699 1.1 mrg /* Done. */
2700 1.1 mrg phase = PHASE_NONE;
2701 1.1 mrg prefix[prefix_len++] = '"';
2702 1.1 mrg }
2703 1.1 mrg else if (prefix_len < 16
2704 1.1 mrg /* Prefix chars are any of the basic character set,
2705 1.1 mrg [lex.charset] except for '
2706 1.1 mrg ()\\\t\v\f\n'. Optimized for a contiguous
2707 1.1 mrg alphabet. */
2708 1.1 mrg /* Unlike a switch, this collapses down to one or
2709 1.1 mrg two shift and bitmask operations on an ASCII
2710 1.1 mrg system, with an outlier or two. */
2711 1.1 mrg && (('Z' - 'A' == 25
2712 1.1 mrg ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2713 1.1 mrg : ISIDST (c))
2714 1.1 mrg || (c >= '0' && c <= '9')
2715 1.1 mrg || c == '_' || c == '{' || c == '}'
2716 1.1 mrg || c == '[' || c == ']' || c == '#'
2717 1.1 mrg || c == '<' || c == '>' || c == '%'
2718 1.1 mrg || c == ':' || c == ';' || c == '.' || c == '?'
2719 1.1 mrg || c == '*' || c == '+' || c == '-' || c == '/'
2720 1.1 mrg || c == '^' || c == '&' || c == '|' || c == '~'
2721 1.1 mrg || c == '!' || c == '=' || c == ','
2722 1.1 mrg || c == '"' || c == '\''))
2723 1.1 mrg prefix[prefix_len++] = c;
2724 1.1 mrg else
2725 1.1 mrg {
2726 1.1 mrg /* Something is wrong. */
2727 1.1 mrg int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2728 1.1 mrg if (prefix_len == 16)
2729 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2730 1.1 mrg col, "raw string delimiter longer "
2731 1.1 mrg "than 16 characters");
2732 1.1 mrg else if (c == '\n')
2733 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2734 1.1 mrg col, "invalid new-line in raw "
2735 1.1 mrg "string delimiter");
2736 1.1 mrg else
2737 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2738 1.1 mrg col, "invalid character '%c' in "
2739 1.1 mrg "raw string delimiter", c);
2740 1.1 mrg type = CPP_OTHER;
2741 1.1 mrg phase = PHASE_NONE;
2742 1.1 mrg /* Continue until we get a close quote, that's probably
2743 1.1 mrg the best failure mode. */
2744 1.1 mrg prefix_len = 0;
2745 1.1 mrg }
/* A newline is not consumed here: it falls through so that the
   line-advancing code further down handles it.  */
2746 1.1 mrg if (c != '\n')
2747 1.1 mrg continue;
2748 1.1 mrg }
2749 1.1 mrg
/* Matching the closing delimiter.  A mismatch returns to body scanning and
   C is re-examined below; matching the last entry of PREFIX (the '"')
   ends the literal.  */
2750 1.1 mrg if (phase != PHASE_NONE)
2751 1.1 mrg {
2752 1.1 mrg if (prefix[phase] != c)
2753 1.1 mrg phase = PHASE_NONE;
2754 1.1 mrg else if (unsigned (phase + 1) == prefix_len)
2755 1.1 mrg break;
2756 1.1 mrg else
2757 1.1 mrg {
2758 1.1 mrg phase = Phase (phase + 1);
2759 1.1 mrg continue;
2760 1.1 mrg }
2761 1.1 mrg }
2762 1.1 mrg
2763 1.1 mrg if (!prefix_len && c == '"')
2764 1.1 mrg /* Failure mode lexing. */
2765 1.1 mrg goto out;
2766 1.1 mrg else if (prefix_len && c == ')')
2767 1.1 mrg phase = PHASE_SUFFIX;
2768 1.1 mrg else if (!read_note && c == '\n')
2769 1.1 mrg {
2770 1.1 mrg pos--;
2771 1.1.1.3 mrg pfile->buffer->cur = pos;
2772 1.1.1.3 mrg if ((pfile->state.in_directive || pfile->state.parsing_args
2773 1.1.1.3 mrg || pfile->state.in_deferred_pragma)
2774 1.1 mrg && pfile->buffer->next_line >= pfile->buffer->rlimit)
2775 1.1 mrg {
2776 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2777 1.1 mrg "unterminated raw string");
2778 1.1 mrg type = CPP_OTHER;
2779 1.1 mrg goto out;
2780 1.1 mrg }
2781 1.1 mrg
/* Save this line's text, including the newline POS now points at (hence
   the + 1), before moving on to the next line.  */
2782 1.1 mrg accum.append (pfile, base, pos - base + 1);
2783 1.1 mrg _cpp_process_line_notes (pfile, false);
2784 1.1 mrg
2785 1.1 mrg if (pfile->buffer->next_line < pfile->buffer->rlimit)
2786 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
2787 1.1 mrg pfile->buffer->need_line = true;
2788 1.1.1.3 mrg
2789 1.1 mrg if (!get_fresh_line_impl<true> (pfile))
2790 1.1 mrg {
2791 1.1 mrg /* We ran out of file and failed to get a line. */
2792 1.1 mrg location_t src_loc = token->src_loc;
2793 1.1 mrg token->type = CPP_EOF;
2794 1.1 mrg /* Tell the compiler the line number of the EOF token. */
2795 1.1 mrg token->src_loc = pfile->line_table->highest_line;
2796 1.1 mrg token->flags = BOL;
2797 1.1 mrg if (accum.first)
2798 1.1 mrg _cpp_release_buff (pfile, accum.first);
2799 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2800 1.1.1.3 mrg "unterminated raw string");
2801 1.1.1.3 mrg
2802 1.1.1.3 mrg /* Now pop the buffer that get_fresh_line_impl() did not. Popping
2803 1.1.1.3 mrg is not safe if processing a directive, however this cannot
2804 1.1.1.3 mrg happen as we already checked above that a line would be
2805 1.1.1.3 mrg available, and get_fresh_line_impl() can't fail in this
2806 1.1.1.3 mrg case. */
2807 1.1 mrg gcc_assert (!pfile->state.in_directive);
2808 1.1.1.3 mrg _cpp_pop_buffer (pfile);
2809 1.1 mrg
2810 1.1 mrg return;
2811 1.1 mrg }
2812 1.1 mrg
2813 1.1 mrg pos = base = pfile->buffer->cur;
2814 1.1 mrg note = &pfile->buffer->notes[pfile->buffer->cur_note];
2815 1.1.1.3 mrg }
2816 1.1.1.3 mrg else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2817 1.1.1.3 mrg && warn_bidi_or_invalid_utf8_p)
2818 1.1.1.3 mrg pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2819 1.1 mrg warn_invalid_utf8_p);
2820 1.1 mrg }
2821 1.1 mrg
2822 1.1 mrg if (warn_bidi_p)
2823 1.1 mrg maybe_warn_bidi_on_close (pfile, pos);
2824 1.1 mrg
/* Look for an identifier directly after the closing quote.  Unless it is
   to be left alone as a possible macro, it becomes the suffix of a
   user-defined literal.  */
2825 1.1 mrg if (CPP_OPTION (pfile, user_literals))
2826 1.1.1.3 mrg {
2827 1.1.1.3 mrg const uchar *const suffix_begin = pos;
2828 1.1 mrg pfile->buffer->cur = pos;
2829 1.1.1.3 mrg
2830 1.1.1.3 mrg if (const auto sr = scan_cur_identifier (pfile))
2831 1.1.1.3 mrg {
2832 1.1.1.3 mrg if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2833 1.1.1.3 mrg suffix_begin, sr.node))
2834 1.1.1.3 mrg pfile->buffer->cur = suffix_begin;
2835 1.1.1.3 mrg else
2836 1.1.1.3 mrg {
2837 1.1.1.3 mrg type = cpp_userdef_string_add_type (type);
2838 1.1.1.3 mrg accum.create_literal2 (pfile, token, base, suffix_begin - base,
2839 1.1.1.3 mrg NODE_NAME (sr.node), NODE_LEN (sr.node),
2840 1.1.1.3 mrg type);
2841 1.1.1.3 mrg if (accum.first)
2842 1.1.1.3 mrg _cpp_release_buff (pfile, accum.first);
2843 1.1.1.3 mrg warn_about_normalization (pfile, token, &sr.nst, true);
2844 1.1.1.3 mrg return;
2845 1.1 mrg }
2846 1.1 mrg }
2847 1.1 mrg }
2848 1.1 mrg
2849 1.1 mrg out:
/* Build the token straight from [BASE, POS) when nothing was accumulated;
   otherwise prepend the accumulated text and release its buffers.  */
2850 1.1 mrg pfile->buffer->cur = pos;
2851 1.1 mrg if (!accum.accum)
2852 1.1 mrg create_literal (pfile, token, base, pos - base, type);
2853 1.1 mrg else
2854 1.1.1.3 mrg {
2855 1.1 mrg accum.create_literal2 (pfile, token, base, pos - base, nullptr, 0, type);
2856 1.1 mrg _cpp_release_buff (pfile, accum.first);
2857 1.1 mrg }
2858 1.1 mrg }
2859 1.1 mrg
2860 1.1 mrg /* Lexes a string, character constant, or angle-bracketed header file
2861 1.1 mrg name. The stored string contains the spelling, including opening
2862 1.1 mrg quote and any leading 'L', 'u', 'U' or 'u8' and optional
2863 1.1 mrg 'R' modifier. It returns the type of the literal, or CPP_OTHER
2864 1.1 mrg if it was not properly terminated, or CPP_LESS for an unterminated
2865 1.1 mrg header name which must be relexed as normal tokens.
2866 1.1 mrg
2867 1.1 mrg The spelling is NUL-terminated, but it is not guaranteed that this
2868 1.1 mrg is the first NUL since embedded NULs are preserved. */
/* The result is delivered through TOKEN (the function itself returns
   void).  BASE is the start of the token.  On the CPP_LESS path only
   TOKEN->type is set here.  */
2869 1.1 mrg static void
2870 1.1 mrg lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2871 1.1 mrg {
2872 1.1 mrg bool saw_NUL = false;
2873 1.1 mrg const uchar *cur;
2874 1.1 mrg cppchar_t terminator;
2875 1.1 mrg enum cpp_ttype type;
2876 1.1 mrg
/* Step over any encoding prefix (L, U, u or u8) to reach the character
   that decides what kind of literal this is.  */
2877 1.1 mrg cur = base;
2878 1.1 mrg terminator = *cur++;
2879 1.1 mrg if (terminator == 'L' || terminator == 'U')
2880 1.1 mrg terminator = *cur++;
2881 1.1 mrg else if (terminator == 'u')
2882 1.1 mrg {
2883 1.1 mrg terminator = *cur++;
2884 1.1 mrg if (terminator == '8')
2885 1.1 mrg terminator = *cur++;
2886 1.1 mrg }
/* Raw strings are handled entirely by lex_raw_string.  */
2887 1.1 mrg if (terminator == 'R')
2888 1.1 mrg {
2889 1.1 mrg lex_raw_string (pfile, token, base);
2890 1.1 mrg return;
2891 1.1 mrg }
2892 1.1 mrg if (terminator == '"')
2893 1.1 mrg type = (*base == 'L' ? CPP_WSTRING :
2894 1.1 mrg *base == 'U' ? CPP_STRING32 :
2895 1.1 mrg *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2896 1.1 mrg : CPP_STRING);
2897 1.1 mrg else if (terminator == '\'')
2898 1.1 mrg type = (*base == 'L' ? CPP_WCHAR :
2899 1.1 mrg *base == 'U' ? CPP_CHAR32 :
2900 1.1 mrg *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2901 1.1 mrg : CPP_CHAR);
2902 1.1 mrg else
/* Neither quote character: treat the token as a header name, which ends
   at '>'.  */
2903 1.1 mrg terminator = '>', type = CPP_HEADER_NAME;
2904 1.1 mrg
2905 1.1.1.3 mrg const bool warn_bidi_p = pfile->warn_bidi_p ();
2906 1.1.1.3 mrg const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2907 1.1 mrg const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2908 1.1 mrg for (;;)
2909 1.1 mrg {
2910 1.1 mrg cppchar_t c = *cur++;
2911 1.1 mrg
2912 1.1 mrg /* In #include-style directives, terminators are not escapable. */
2913 1.1 mrg if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2914 1.1.1.3 mrg {
2915 1.1 mrg if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2916 1.1 mrg {
2917 1.1.1.3 mrg location_t loc;
2918 1.1.1.3 mrg bidi::kind kind;
2919 1.1.1.3 mrg if (cur[0] == 'N')
2920 1.1.1.3 mrg kind = get_bidi_named (pfile, cur + 1, &loc);
2921 1.1.1.3 mrg else
2922 1.1 mrg kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2923 1.1 mrg maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2924 1.1 mrg }
/* Skip the character after the backslash so that an escaped terminator
   does not end the literal.  */
2925 1.1 mrg cur++;
2926 1.1 mrg }
2927 1.1 mrg else if (c == terminator)
2928 1.1 mrg {
2929 1.1 mrg if (warn_bidi_p)
2930 1.1 mrg maybe_warn_bidi_on_close (pfile, cur - 1);
2931 1.1 mrg break;
2932 1.1 mrg }
2933 1.1 mrg else if (c == '\n')
2934 1.1 mrg {
2935 1.1 mrg cur--;
2936 1.1 mrg /* Unmatched quotes always yield undefined behavior, but
2937 1.1 mrg greedy lexing means that what appears to be an unterminated
2938 1.1 mrg header name may actually be a legitimate sequence of tokens. */
2939 1.1 mrg if (terminator == '>')
2940 1.1 mrg {
2941 1.1 mrg token->type = CPP_LESS;
2942 1.1 mrg return;
2943 1.1 mrg }
2944 1.1 mrg type = CPP_OTHER;
2945 1.1 mrg break;
2946 1.1 mrg }
2947 1.1 mrg else if (c == '\0')
2948 1.1.1.3 mrg saw_NUL = true;
2949 1.1.1.3 mrg else if (__builtin_expect (c >= utf8_continuation, 0)
2950 1.1.1.3 mrg && warn_bidi_or_invalid_utf8_p)
2951 1.1.1.3 mrg cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2952 1.1 mrg warn_invalid_utf8_p);
2953 1.1 mrg }
2954 1.1 mrg
2955 1.1 mrg if (saw_NUL && !pfile->state.skipping)
2956 1.1 mrg cpp_error (pfile, CPP_DL_WARNING,
2957 1.1 mrg "null character(s) preserved in literal");
2958 1.1 mrg
2959 1.1 mrg if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2960 1.1 mrg cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2961 1.1 mrg (int) terminator);
2962 1.1.1.3 mrg
/* CUR is now just past the literal (or at the newline of an unterminated
   one).  Remember it so that a scanned suffix can be un-consumed.  */
2963 1.1.1.3 mrg pfile->buffer->cur = cur;
2964 1.1.1.3 mrg const uchar *const suffix_begin = cur;
2965 1.1 mrg
2966 1.1 mrg if (CPP_OPTION (pfile, user_literals))
2967 1.1.1.3 mrg {
2968 1.1.1.3 mrg if (const auto sr = scan_cur_identifier (pfile))
2969 1.1.1.3 mrg {
2970 1.1.1.3 mrg if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2971 1.1.1.3 mrg suffix_begin, sr.node))
2972 1.1.1.3 mrg pfile->buffer->cur = suffix_begin;
2973 1.1.1.3 mrg else
2974 1.1.1.3 mrg {
2975 1.1.1.3 mrg /* Grab user defined literal suffix. */
/* NOTE(review): both conversions are applied in turn; presumably each
   leaves types outside its own family unchanged -- confirm in cpplib.  */
2976 1.1.1.3 mrg type = cpp_userdef_char_add_type (type);
2977 1.1.1.3 mrg type = cpp_userdef_string_add_type (type);
2978 1.1.1.3 mrg create_literal2 (pfile, token, base, suffix_begin - base,
2979 1.1.1.3 mrg NODE_NAME (sr.node), NODE_LEN (sr.node), type);
2980 1.1.1.3 mrg warn_about_normalization (pfile, token, &sr.nst, true);
2981 1.1.1.3 mrg return;
2982 1.1 mrg }
2983 1.1 mrg }
2984 1.1 mrg }
2985 1.1 mrg else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2986 1.1.1.3 mrg && !pfile->state.skipping)
2987 1.1.1.3 mrg {
2988 1.1.1.3 mrg const auto sr = scan_cur_identifier (pfile);
2989 1.1.1.3 mrg /* Maybe raise a warning, but do not consume the tokens. */
2990 1.1.1.3 mrg pfile->buffer->cur = suffix_begin;
2991 1.1.1.3 mrg if (sr && cpp_macro_p (sr.node))
2992 1.1.1.3 mrg cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2993 1.1.1.3 mrg token->src_loc, 0, "C++11 requires a space "
2994 1.1.1.3 mrg "between string literal and macro");
2995 1.1 mrg }
2996 1.1 mrg
/* No suffix was attached: the token is exactly [BASE, CUR).  */
2997 1.1 mrg create_literal (pfile, token, base, cur - base, type);
2998 1.1 mrg }
2999 1.1 mrg
3000 1.1 mrg /* Return a pointer to PFILE's own comment table (not a copy). The client
3001 1.1 mrg may not make any assumption about the ordering of the table. */
3002 1.1 mrg cpp_comment_table *
3003 1.1 mrg cpp_get_comments (cpp_reader *pfile)
3004 1.1 mrg {
3005 1.1 mrg return &pfile->comments;
3006 1.1 mrg }
3007 1.1 mrg
3008 1.1 mrg /* Append a comment to the end of the comment table. */
/* The text of TOKEN is copied into xmalloc'd, NUL-terminated storage and
   recorded together with TOKEN's source location.  The table starts with
   room for 256 entries and doubles whenever it is full.  */
3009 1.1 mrg static void
3010 1.1 mrg store_comment (cpp_reader *pfile, cpp_token *token)
3011 1.1 mrg {
3012 1.1 mrg int len;
3013 1.1 mrg
/* First use: create the table.  */
3014 1.1 mrg if (pfile->comments.allocated == 0)
3015 1.1 mrg {
3016 1.1 mrg pfile->comments.allocated = 256;
3017 1.1 mrg pfile->comments.entries = (cpp_comment *) xmalloc
3018 1.1 mrg (pfile->comments.allocated * sizeof (cpp_comment));
3019 1.1 mrg }
3020 1.1 mrg
/* Table full: double its capacity.  */
3021 1.1 mrg if (pfile->comments.count == pfile->comments.allocated)
3022 1.1 mrg {
3023 1.1 mrg pfile->comments.allocated *= 2;
3024 1.1 mrg pfile->comments.entries = (cpp_comment *) xrealloc
3025 1.1 mrg (pfile->comments.entries,
3026 1.1 mrg pfile->comments.allocated * sizeof (cpp_comment));
3027 1.1 mrg }
3028 1.1 mrg
3029 1.1 mrg len = token->val.str.len;
3030 1.1 mrg
3031 1.1 mrg /* Copy comment. Note, token may not be NUL-terminated. */
3032 1.1 mrg pfile->comments.entries[pfile->comments.count].comment =
3033 1.1 mrg (char *) xmalloc (sizeof (char) * (len + 1));
3034 1.1 mrg memcpy (pfile->comments.entries[pfile->comments.count].comment,
3035 1.1 mrg token->val.str.text, len);
3036 1.1 mrg pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
3037 1.1 mrg
3038 1.1 mrg /* Set source location. */
3039 1.1 mrg pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
3040 1.1 mrg
3041 1.1 mrg /* Increment the count of entries in the comment table. */
3042 1.1 mrg pfile->comments.count++;
3043 1.1 mrg }
3044 1.1 mrg
3045 1.1 mrg /* The stored comment includes the comment start and any terminator. */
/* Turn the comment text running from FROM up to pfile->buffer->cur into a
   CPP_COMMENT in TOKEN, then record it with store_comment.  FROM points
   just past the comment's initial '/'.  TYPE is compared against '/' to
   detect a C++ comment, which is rewritten as a C comment when in a
   directive or when parsing macro arguments.  The stored text is not
   NUL-terminated; its length is TOKEN->val.str.len.  */
3046 1.1 mrg static void
3047 1.1 mrg save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
3048 1.1 mrg cppchar_t type)
3049 1.1 mrg {
3050 1.1 mrg unsigned char *buffer;
3051 1.1 mrg unsigned int len, clen, i;
3052 1.1 mrg
3053 1.1 mrg len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
3054 1.1 mrg
3055 1.1 mrg /* C++ comments probably (not definitely) have moved past a new
3056 1.1 mrg line, which we don't want to save in the comment. */
3057 1.1 mrg if (is_vspace (pfile->buffer->cur[-1]))
3058 1.1 mrg len--;
3059 1.1 mrg
3060 1.1 mrg /* If we are currently in a directive or in argument parsing, then
3061 1.1 mrg we need to store all C++ comments as C comments internally, and
3062 1.1 mrg so we need to allocate a little extra space in that case.
3063 1.1 mrg
3064 1.1 mrg Note that the only time we encounter a directive here is
3065 1.1 mrg when we are saving comments in a "#define". */
/* The two extra bytes hold the closing star and slash added below.  */
3066 1.1 mrg clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3067 1.1 mrg && type == '/') ? len + 2 : len;
3068 1.1 mrg
3069 1.1 mrg buffer = _cpp_unaligned_alloc (pfile, clen);
3070 1.1 mrg
3071 1.1 mrg token->type = CPP_COMMENT;
3072 1.1 mrg token->val.str.len = clen;
3073 1.1 mrg token->val.str.text = buffer;
3074 1.1 mrg
3075 1.1 mrg buffer[0] = '/';
3076 1.1 mrg memcpy (buffer + 1, from, len - 1);
3077 1.1 mrg
3078 1.1 mrg /* Finish conversion to a C comment, if necessary. */
3079 1.1 mrg if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3080 1.1 mrg {
/* Replace the second character (the second '/' of a C++ comment) with a
   star and append the C comment terminator.  */
3081 1.1 mrg buffer[1] = '*';
3082 1.1 mrg buffer[clen - 2] = '*';
3083 1.1 mrg buffer[clen - 1] = '/';
3084 1.1 mrg /* As there can be in a C++ comments illegal sequences for C comments
3085 1.1 mrg we need to filter them out. */
/* Any '/' next to a star inside the body is replaced with '|' so that the
   result contains no nested comment opener or early terminator.  */
3086 1.1 mrg for (i = 2; i < (clen - 2); i++)
3087 1.1 mrg if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3088 1.1 mrg buffer[i] = '|';
3089 1.1 mrg }
3090 1.1 mrg
3091 1.1 mrg /* Finally store this comment for use by clients of libcpp. */
3092 1.1 mrg store_comment (pfile, token);
3093 1.1 mrg }
3094 1.1 mrg
3095 1.1 mrg /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3096 1.1 mrg comment. */
3097 1.1 mrg
3098 1.1 mrg static bool
3099 1.1 mrg fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3100 1.1 mrg {
3101 1.1 mrg const unsigned char *from = comment_start + 1;
3102 1.1 mrg
3103 1.1 mrg switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3104 1.1 mrg {
3105 1.1 mrg /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3106 1.1 mrg don't recognize any comments. The latter only checks attributes,
3107 1.1 mrg the former doesn't warn. */
3108 1.1 mrg case 0:
3109 1.1 mrg default:
3110 1.1 mrg return false;
3111 1.1 mrg /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3112 1.1 mrg content it has. */
3113 1.1 mrg case 1:
3114 1.1 mrg return true;
3115 1.1 mrg case 2:
3116 1.1 mrg /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3117 1.1 mrg .*falls?[ \t-]*thr(u|ough).* regex. */
3118 1.1 mrg for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3119 1.1 mrg from++)
3120 1.1 mrg {
3121 1.1 mrg /* Is there anything like strpbrk with upper boundary, or
3122 1.1 mrg memchr looking for 2 characters rather than just one? */
3123 1.1 mrg if (from[0] != 'f' && from[0] != 'F')
3124 1.1 mrg continue;
3125 1.1 mrg if (from[1] != 'a' && from[1] != 'A')
3126 1.1 mrg continue;
3127 1.1 mrg if (from[2] != 'l' && from[2] != 'L')
3128 1.1 mrg continue;
3129 1.1 mrg if (from[3] != 'l' && from[3] != 'L')
3130 1.1 mrg continue;
3131 1.1 mrg from += sizeof "fall" - 1;
3132 1.1 mrg if (from[0] == 's' || from[0] == 'S')
3133 1.1 mrg from++;
3134 1.1 mrg while (*from == ' ' || *from == '\t' || *from == '-')
3135 1.1 mrg from++;
3136 1.1 mrg if (from[0] != 't' && from[0] != 'T')
3137 1.1 mrg continue;
3138 1.1 mrg if (from[1] != 'h' && from[1] != 'H')
3139 1.1 mrg continue;
3140 1.1 mrg if (from[2] != 'r' && from[2] != 'R')
3141 1.1 mrg continue;
3142 1.1 mrg if (from[3] == 'u' || from[3] == 'U')
3143 1.1 mrg return true;
3144 1.1 mrg if (from[3] != 'o' && from[3] != 'O')
3145 1.1 mrg continue;
3146 1.1 mrg if (from[4] != 'u' && from[4] != 'U')
3147 1.1 mrg continue;
3148 1.1 mrg if (from[5] != 'g' && from[5] != 'G')
3149 1.1 mrg continue;
3150 1.1 mrg if (from[6] != 'h' && from[6] != 'H')
3151 1.1 mrg continue;
3152 1.1 mrg return true;
3153 1.1 mrg }
3154 1.1 mrg return false;
3155 1.1 mrg case 3:
3156 1.1 mrg case 4:
3157 1.1 mrg break;
3158 1.1 mrg }
3159 1.1 mrg
3160 1.1 mrg /* Whole comment contents:
3161 1.1 mrg -fallthrough
3162 1.1 mrg @fallthrough@
3163 1.1 mrg */
3164 1.1 mrg if (*from == '-' || *from == '@')
3165 1.1 mrg {
3166 1.1 mrg size_t len = sizeof "fallthrough" - 1;
3167 1.1 mrg if ((size_t) (pfile->buffer->cur - from - 1) < len)
3168 1.1 mrg return false;
3169 1.1 mrg if (memcmp (from + 1, "fallthrough", len))
3170 1.1 mrg return false;
3171 1.1 mrg if (*from == '@')
3172 1.1 mrg {
3173 1.1 mrg if (from[len + 1] != '@')
3174 1.1 mrg return false;
3175 1.1 mrg len++;
3176 1.1 mrg }
3177 1.1 mrg from += 1 + len;
3178 1.1 mrg }
3179 1.1 mrg /* Whole comment contents (regex):
3180 1.1 mrg lint -fallthrough[ \t]*
3181 1.1 mrg */
3182 1.1 mrg else if (*from == 'l')
3183 1.1 mrg {
3184 1.1 mrg size_t len = sizeof "int -fallthrough" - 1;
3185 1.1 mrg if ((size_t) (pfile->buffer->cur - from - 1) < len)
3186 1.1 mrg return false;
3187 1.1 mrg if (memcmp (from + 1, "int -fallthrough", len))
3188 1.1 mrg return false;
3189 1.1 mrg from += 1 + len;
3190 1.1 mrg while (*from == ' ' || *from == '\t')
3191 1.1 mrg from++;
3192 1.1 mrg }
3193 1.1 mrg /* Whole comment contents (regex):
3194 1.1 mrg [ \t]*FALLTHR(U|OUGH)[ \t]*
3195 1.1 mrg */
3196 1.1 mrg else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3197 1.1 mrg {
3198 1.1 mrg while (*from == ' ' || *from == '\t')
3199 1.1 mrg from++;
3200 1.1 mrg if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
3201 1.1 mrg return false;
3202 1.1 mrg if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3203 1.1 mrg return false;
3204 1.1 mrg from += sizeof "FALLTHR" - 1;
3205 1.1 mrg if (*from == 'U')
3206 1.1 mrg from++;
3207 1.1 mrg else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
3208 1.1 mrg return false;
3209 1.1 mrg else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3210 1.1 mrg return false;
3211 1.1 mrg else
3212 1.1 mrg from += sizeof "OUGH" - 1;
3213 1.1 mrg while (*from == ' ' || *from == '\t')
3214 1.1 mrg from++;
3215 1.1 mrg }
3216 1.1 mrg /* Whole comment contents (regex):
3217 1.1 mrg [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3218 1.1 mrg [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3219 1.1 mrg [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3220 1.1 mrg */
3221 1.1 mrg else
3222 1.1 mrg {
3223 1.1 mrg while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3224 1.1 mrg from++;
3225 1.1 mrg unsigned char f = *from;
3226 1.1 mrg bool all_upper = false;
3227 1.1 mrg if (f == 'E' || f == 'e')
3228 1.1 mrg {
3229 1.1 mrg if ((size_t) (pfile->buffer->cur - from)
3230 1.1 mrg < sizeof "else fallthru" - 1)
3231 1.1 mrg return false;
3232 1.1 mrg if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3233 1.1 mrg all_upper = true;
3234 1.1 mrg else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3235 1.1 mrg return false;
3236 1.1 mrg from += sizeof "else" - 1;
3237 1.1 mrg if (*from == ',')
3238 1.1 mrg from++;
3239 1.1 mrg if (*from != ' ')
3240 1.1 mrg return false;
3241 1.1 mrg from++;
3242 1.1 mrg if (all_upper && *from == 'f')
3243 1.1 mrg return false;
3244 1.1 mrg if (f == 'e' && *from == 'F')
3245 1.1 mrg return false;
3246 1.1 mrg f = *from;
3247 1.1 mrg }
3248 1.1 mrg else if (f == 'I' || f == 'i')
3249 1.1 mrg {
3250 1.1 mrg if ((size_t) (pfile->buffer->cur - from)
3251 1.1 mrg < sizeof "intentional fallthru" - 1)
3252 1.1 mrg return false;
3253 1.1 mrg if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3254 1.1 mrg sizeof "NTENTIONAL" - 1) == 0)
3255 1.1 mrg all_upper = true;
3256 1.1 mrg else if (memcmp (from + 1, "ntentional",
3257 1.1 mrg sizeof "ntentional" - 1))
3258 1.1 mrg return false;
3259 1.1 mrg from += sizeof "intentional" - 1;
3260 1.1 mrg if (*from == ' ')
3261 1.1 mrg {
3262 1.1 mrg from++;
3263 1.1 mrg if (all_upper && *from == 'f')
3264 1.1 mrg return false;
3265 1.1 mrg }
3266 1.1 mrg else if (all_upper)
3267 1.1 mrg {
3268 1.1 mrg if (memcmp (from, "LY F", sizeof "LY F" - 1))
3269 1.1 mrg return false;
3270 1.1 mrg from += sizeof "LY " - 1;
3271 1.1 mrg }
3272 1.1 mrg else
3273 1.1 mrg {
3274 1.1 mrg if (memcmp (from, "ly ", sizeof "ly " - 1))
3275 1.1 mrg return false;
3276 1.1 mrg from += sizeof "ly " - 1;
3277 1.1 mrg }
3278 1.1 mrg if (f == 'i' && *from == 'F')
3279 1.1 mrg return false;
3280 1.1 mrg f = *from;
3281 1.1 mrg }
3282 1.1 mrg if (f != 'F' && f != 'f')
3283 1.1 mrg return false;
3284 1.1 mrg if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3285 1.1 mrg return false;
3286 1.1 mrg if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3287 1.1 mrg all_upper = true;
3288 1.1 mrg else if (all_upper)
3289 1.1 mrg return false;
3290 1.1 mrg else if (memcmp (from + 1, "all", sizeof "all" - 1))
3291 1.1 mrg return false;
3292 1.1 mrg from += sizeof "fall" - 1;
3293 1.1 mrg if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3294 1.1 mrg from += 2;
3295 1.1 mrg else if (*from == ' ' || *from == '-')
3296 1.1 mrg from++;
3297 1.1 mrg else if (*from != (all_upper ? 'T' : 't'))
3298 1.1 mrg return false;
3299 1.1 mrg if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3300 1.1 mrg return false;
3301 1.1 mrg if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3302 1.1 mrg return false;
3303 1.1 mrg if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3304 1.1 mrg {
3305 1.1 mrg if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3306 1.1 mrg return false;
3307 1.1 mrg if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3308 1.1 mrg sizeof "hrough" - 1))
3309 1.1 mrg return false;
3310 1.1 mrg from += sizeof "through" - 1;
3311 1.1 mrg }
3312 1.1 mrg else
3313 1.1 mrg from += sizeof "thru" - 1;
3314 1.1 mrg while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3315 1.1 mrg from++;
3316 1.1 mrg if (*from == '-')
3317 1.1 mrg {
3318 1.1 mrg from++;
3319 1.1 mrg if (*comment_start == '*')
3320 1.1 mrg {
3321 1.1 mrg do
3322 1.1 mrg {
3323 1.1 mrg while (*from && *from != '*'
3324 1.1 mrg && *from != '\n' && *from != '\r')
3325 1.1 mrg from++;
3326 1.1 mrg if (*from != '*' || from[1] == '/')
3327 1.1 mrg break;
3328 1.1 mrg from++;
3329 1.1 mrg }
3330 1.1 mrg while (1);
3331 1.1 mrg }
3332 1.1 mrg else
3333 1.1 mrg while (*from && *from != '\n' && *from != '\r')
3334 1.1 mrg from++;
3335 1.1 mrg }
3336 1.1 mrg }
3337 1.1 mrg /* C block comment. */
3338 1.1 mrg if (*comment_start == '*')
3339 1.1 mrg {
3340 1.1 mrg if (*from != '*' || from[1] != '/')
3341 1.1 mrg return false;
3342 1.1 mrg }
3343 1.1 mrg /* C++ line comment. */
3344 1.1 mrg else if (*from != '\n')
3345 1.1 mrg return false;
3346 1.1 mrg
3347 1.1 mrg return true;
3348 1.1 mrg }
3349 1.1 mrg
3350 1.1 mrg /* Allocate COUNT tokens for RUN. */
3351 1.1 mrg void
3352 1.1 mrg _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3353 1.1 mrg {
3354 1.1 mrg run->base = XNEWVEC (cpp_token, count);
3355 1.1 mrg run->limit = run->base + count;
3356 1.1 mrg run->next = NULL;
3357 1.1 mrg }
3358 1.1 mrg
3359 1.1 mrg /* Returns the next tokenrun, or creates one if there is none. */
3360 1.1 mrg static tokenrun *
3361 1.1 mrg next_tokenrun (tokenrun *run)
3362 1.1 mrg {
3363 1.1 mrg if (run->next == NULL)
3364 1.1 mrg {
3365 1.1 mrg run->next = XNEW (tokenrun);
3366 1.1 mrg run->next->prev = run;
3367 1.1 mrg _cpp_init_tokenrun (run->next, 250);
3368 1.1 mrg }
3369 1.1 mrg
3370 1.1 mrg return run->next;
3371 1.1 mrg }
3372 1.1 mrg
3373 1.1 mrg /* Return the number of not yet processed token in a given
3374 1.1 mrg context. */
3375 1.1 mrg int
3376 1.1 mrg _cpp_remaining_tokens_num_in_context (cpp_context *context)
3377 1.1 mrg {
3378 1.1 mrg if (context->tokens_kind == TOKENS_KIND_DIRECT)
3379 1.1 mrg return (LAST (context).token - FIRST (context).token);
3380 1.1 mrg else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3381 1.1 mrg || context->tokens_kind == TOKENS_KIND_EXTENDED)
3382 1.1 mrg return (LAST (context).ptoken - FIRST (context).ptoken);
3383 1.1 mrg else
3384 1.1 mrg abort ();
3385 1.1 mrg }
3386 1.1 mrg
3387 1.1 mrg /* Returns the token present at index INDEX in a given context. If
3388 1.1 mrg INDEX is zero, the next token to be processed is returned. */
3389 1.1 mrg static const cpp_token*
3390 1.1 mrg _cpp_token_from_context_at (cpp_context *context, int index)
3391 1.1 mrg {
3392 1.1 mrg if (context->tokens_kind == TOKENS_KIND_DIRECT)
3393 1.1 mrg return &(FIRST (context).token[index]);
3394 1.1 mrg else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3395 1.1 mrg || context->tokens_kind == TOKENS_KIND_EXTENDED)
3396 1.1 mrg return FIRST (context).ptoken[index];
3397 1.1 mrg else
3398 1.1 mrg abort ();
3399 1.1 mrg }
3400 1.1 mrg
3401 1.1 mrg /* Look ahead in the input stream. */
/* Return the token INDEX positions ahead without consuming anything;
   INDEX zero denotes the next token to be processed.  Contexts that
   have a predecessor (context->prev) are searched first, starting
   from pfile->context.  If they hold too few tokens, further tokens
   are lexed with _cpp_lex_token and afterwards handed back through
   _cpp_backup_tokens_direct.  Lexing stops early at a CPP_EOF or
   CPP_PRAGMA token, in which case that token is returned instead of
   the one at INDEX.  */
3402 1.1 mrg const cpp_token *
3403 1.1 mrg cpp_peek_token (cpp_reader *pfile, int index)
3404 1.1 mrg {
3405 1.1 mrg cpp_context *context = pfile->context;
3406 1.1 mrg const cpp_token *peektok;
3407 1.1 mrg int count;
3408 1.1 mrg
3409 1.1 mrg /* First, scan through any pending cpp_context objects. */
3410 1.1 mrg while (context->prev)
3411 1.1 mrg {
3412 1.1 mrg ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3413 1.1 mrg
  /* The wanted token lives in this context.  */
3414 1.1 mrg if (index < (int) sz)
3415 1.1 mrg return _cpp_token_from_context_at (context, index);
  /* Otherwise skip over everything this context still holds and
     continue with the context underneath it.  */
3416 1.1 mrg index -= (int) sz;
3417 1.1 mrg context = context->prev;
3418 1.1 mrg }
3419 1.1 mrg
3420 1.1 mrg /* We will have to read some new tokens after all (and do so
3421 1.1 mrg without invalidating preceding tokens). */
  /* COUNT keeps the starting value of INDEX so that the number of
     tokens lexed below can be recovered as COUNT - INDEX.  */
3422 1.1 mrg count = index;
3423 1.1 mrg pfile->keep_tokens++;
3424 1.1 mrg
3425 1.1 mrg /* For peeked tokens temporarily disable line_change reporting,
3426 1.1 mrg until the tokens are parsed for real. */
3427 1.1 mrg void (*line_change) (cpp_reader *, const cpp_token *, int)
3428 1.1 mrg = pfile->cb.line_change;
3429 1.1 mrg pfile->cb.line_change = NULL;
3430 1.1 mrg
  /* Each trip through the loop lexes one token and decrements INDEX
     once: normally in the loop condition, by hand on the two early
     exits.  */
3431 1.1 mrg do
3432 1.1 mrg {
3433 1.1 mrg peektok = _cpp_lex_token (pfile);
3434 1.1 mrg if (peektok->type == CPP_EOF)
3435 1.1 mrg {
  /* Stand in for the decrement the loop condition would have done.  */
3436 1.1 mrg index--;
3437 1.1 mrg break;
3438 1.1 mrg }
3439 1.1 mrg else if (peektok->type == CPP_PRAGMA)
3440 1.1 mrg {
3441 1.1 mrg /* Don't peek past a pragma. */
3442 1.1 mrg if (peektok == &pfile->directive_result)
3443 1.1 mrg /* Save the pragma in the buffer. */
3444 1.1 mrg *pfile->cur_token++ = *peektok;
3445 1.1 mrg index--;
3446 1.1 mrg break;
3447 1.1 mrg }
3448 1.1 mrg }
3449 1.1 mrg while (index--);
3450 1.1 mrg
  /* Hand back every token lexed above, then undo the temporary
     changes to keep_tokens and the line_change callback.  */
3451 1.1 mrg _cpp_backup_tokens_direct (pfile, count - index);
3452 1.1 mrg pfile->keep_tokens--;
3453 1.1 mrg pfile->cb.line_change = line_change;
3454 1.1 mrg
3455 1.1 mrg return peektok;
3456 1.1 mrg }
3457 1.1 mrg
3458 1.1 mrg /* Allocate a single token that is invalidated at the same time as the
3459 1.1 mrg rest of the tokens on the line. Has its line and col set to the
3460 1.1 mrg same as the last lexed token, so that diagnostics appear in the
3461 1.1 mrg right place. */
/* The new token is taken from the slot at pfile->cur_token, which is
   then advanced.  If lookahead tokens are stored from that slot
   onwards (pfile->lookaheads is non-zero) they are first moved up by
   one position, spilling into the following token run where the
   current run is too short.  Only src_loc of the returned token is
   set here; it is copied from the token just before the original
   cur_token.  */
3462 1.1 mrg cpp_token *
3463 1.1 mrg _cpp_temp_token (cpp_reader *pfile)
3464 1.1 mrg {
3465 1.1 mrg cpp_token *old, *result;
  /* SZ is the number of slots from cur_token to the end of the current
     run; LA is the number of lookahead tokens.  */
3466 1.1 mrg ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3467 1.1 mrg ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3468 1.1 mrg
  /* The token immediately before the insertion point supplies the
     location for the new token.  */
3469 1.1 mrg old = pfile->cur_token - 1;
3470 1.1 mrg /* Any pre-existing lookaheads must not be clobbered. */
3471 1.1 mrg if (la)
3472 1.1 mrg {
  /* The lookaheads reach the end of this run (or beyond), so moving
     them up by one pushes a token into the next run.  */
3473 1.1 mrg if (sz <= la)
3474 1.1 mrg {
3475 1.1 mrg tokenrun *next = next_tokenrun (pfile->cur_run);
3476 1.1 mrg
  /* LA - SZ lookaheads already sit at the start of the next run;
     move those up first to free its first slot.  */
3477 1.1 mrg if (sz < la)
3478 1.1 mrg memmove (next->base + 1, next->base,
3479 1.1 mrg (la - sz) * sizeof (cpp_token));
3480 1.1 mrg
  /* The last slot of the current run is copied into the first slot
     of the next run.  */
3481 1.1 mrg next->base[0] = pfile->cur_run->limit[-1];
3482 1.1 mrg }
3483 1.1 mrg
  /* Move up the lookaheads that stay inside the current run: at most
     SZ - 1 of them, since the final slot has nowhere to go within
     this run.  */
3484 1.1 mrg if (sz > 1)
3485 1.1 mrg memmove (pfile->cur_token + 1, pfile->cur_token,
3486 1.1 mrg MIN (la, sz - 1) * sizeof (cpp_token));
3487 1.1 mrg }
3488 1.1 mrg
  /* No room left in the current run: continue at the start of the
     next one.  NOTE(review): given how SZ is computed above, the two
     halves of this condition look equivalent -- confirm.  */
3489 1.1 mrg if (!sz && pfile->cur_token == pfile->cur_run->limit)
3490 1.1 mrg {
3491 1.1 mrg pfile->cur_run = next_tokenrun (pfile->cur_run);
3492 1.1 mrg pfile->cur_token = pfile->cur_run->base;
3493 1.1 mrg }
3494 1.1 mrg
3495 1.1 mrg result = pfile->cur_token++;
3496 1.1 mrg result->src_loc = old->src_loc;
3497 1.1 mrg return result;
3498 1.1 mrg }
3499 1.1 mrg
3500 1.1 mrg /* We're at the beginning of a logical line (so not in
3501 1.1 mrg directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3502 1.1 mrg if we should enter deferred_pragma mode to tokenize the rest of the
3503 1.1 mrg line as a module control-line. */
/* The function peeks at one or two further tokens with
   _cpp_lex_direct, counting them in BACKUP, and hands them back
   through _cpp_backup_tokens_direct before returning.  If the line
   is accepted, pfile->state is left in deferred-pragma mode, the
   leading keyword tokens are flagged NO_EXPAND and their nodes are
   replaced by the second entry of the matching n_modules pair.  If
   not, the state fields changed here are restored to the values
   asserted on entry.  */
3504 1.1 mrg
3505 1.1 mrg static void
3506 1.1 mrg cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3507 1.1 mrg {
3508 1.1 mrg unsigned backup = 0; /* Tokens we peeked. */
3509 1.1 mrg cpp_hashnode *node = result->val.node.node;
3510 1.1 mrg cpp_token *peek = result;
3511 1.1 mrg cpp_token *keyword = peek;
3512 1.1 mrg cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
  /* Stays zero for 'module'; non-zero once an import keyword is seen.  */
3513 1.1 mrg int header_count = 0;
3514 1.1 mrg
3515 1.1 mrg /* Make sure the incoming state is as we expect it. This way we
3516 1.1 mrg can restore it using constants. */
3517 1.1 mrg gcc_checking_assert (!pfile->state.in_deferred_pragma
3518 1.1 mrg && !pfile->state.skipping
3519 1.1 mrg && !pfile->state.parsing_args
3520 1.1 mrg && !pfile->state.angled_headers
3521 1.1 mrg && (pfile->state.save_comments
3522 1.1 mrg == !CPP_OPTION (pfile, discard_comments)));
3523 1.1 mrg
3524 1.1 mrg /* Enter directives mode sufficiently for peeking. We don't have
3525 1.1 mrg to actually set in_directive. */
3526 1.1 mrg pfile->state.in_deferred_pragma = true;
3527 1.1 mrg
3528 1.1 mrg /* These two fields are needed to process tokenization in deferred
3529 1.1 mrg pragma mode. They are not used outside deferred pragma mode or
3530 1.1 mrg directives mode. */
3531 1.1 mrg pfile->state.pragma_allow_expansion = true;
3532 1.1 mrg pfile->directive_line = result->src_loc;
3533 1.1 mrg
3534 1.1 mrg /* Saving comments is incompatible with directives mode. */
3535 1.1 mrg pfile->state.save_comments = 0;
3536 1.1 mrg
  /* A leading 'export' must be followed by another NODE_MODULE name,
     which then becomes KEYWORD.  */
3537 1.1 mrg if (node == n_modules[spec_nodes::M_EXPORT][0])
3538 1.1 mrg {
3539 1.1 mrg peek = _cpp_lex_direct (pfile);
3540 1.1 mrg keyword = peek;
3541 1.1 mrg backup++;
3542 1.1 mrg if (keyword->type != CPP_NAME)
3543 1.1 mrg goto not_module;
3544 1.1 mrg node = keyword->val.node.node;
3545 1.1 mrg if (!(node->flags & NODE_MODULE))
3546 1.1 mrg goto not_module;
3547 1.1 mrg }
3548 1.1 mrg
  /* NOTE(review): the meaning of the 2 and 16 terms below is not
     visible in this function; the value is only tested for non-zero
     here and later stored in state.directive_file_token -- confirm
     against the code that consumes that field.  */
3549 1.1 mrg if (node == n_modules[spec_nodes::M__IMPORT][0])
3550 1.1 mrg /* __import */
3551 1.1 mrg header_count = backup + 2 + 16;
3552 1.1 mrg else if (node == n_modules[spec_nodes::M_IMPORT][0])
3553 1.1 mrg /* import */
3554 1.1 mrg header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3555 1.1 mrg else if (node == n_modules[spec_nodes::M_MODULE][0])
3556 1.1 mrg ; /* module */
3557 1.1 mrg else
3558 1.1 mrg goto not_module;
3559 1.1 mrg
3560 1.1 mrg /* We've seen [export] {module|import|__import}. Check the next token. */
3561 1.1 mrg if (header_count)
3562 1.1 mrg /* After '{,__}import' a header name may appear. */
3563 1.1 mrg pfile->state.angled_headers = true;
3564 1.1 mrg peek = _cpp_lex_direct (pfile);
3565 1.1 mrg backup++;
3566 1.1 mrg
3567 1.1 mrg /* ... import followed by identifier, ':', '<' or
3568 1.1 mrg header-name preprocessing tokens, or module
3569 1.1 mrg followed by cpp-identifier, ':' or ';' preprocessing
3570 1.1 mrg tokens. C++ keywords are not yet relevant. */
3571 1.1 mrg if (peek->type == CPP_NAME
3572 1.1 mrg || peek->type == CPP_COLON
3573 1.1 mrg || (header_count
3574 1.1 mrg ? (peek->type == CPP_LESS
3575 1.1 mrg || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3576 1.1 mrg || peek->type == CPP_HEADER_NAME)
3577 1.1 mrg : peek->type == CPP_SEMICOLON))
3578 1.1 mrg {
  /* When expansion is disallowed, prevent_expansion is bumped here.  */
3579 1.1 mrg pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3580 1.1 mrg if (!pfile->state.pragma_allow_expansion)
3581 1.1 mrg pfile->state.prevent_expansion++;
3582 1.1 mrg
  /* Only the 'module' form (header_count == 0) is rejected inside an
     included file.  */
3583 1.1 mrg if (!header_count && linemap_included_from
3584 1.1 mrg (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3585 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3586 1.1 mrg "module control-line cannot be in included file");
3587 1.1 mrg
3588 1.1 mrg /* The first one or two tokens cannot be macro names. */
  /* IX runs from BACKUP - 1 down to zero; zero selects RESULT and any
     other value selects KEYWORD.  With BACKUP == 1 only RESULT is
     visited.  */
3589 1.1 mrg for (int ix = backup; ix--;)
3590 1.1 mrg {
3591 1.1 mrg cpp_token *tok = ix ? keyword : result;
3592 1.1 mrg cpp_hashnode *node = tok->val.node.node;
3593 1.1 mrg
3594 1.1 mrg /* Don't attempt to expand the token. */
3595 1.1 mrg tok->flags |= NO_EXPAND;
3596 1.1 mrg if (_cpp_defined_macro_p (node)
3597 1.1 mrg && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3598 1.1 mrg && !cpp_fun_like_macro_p (node))
3599 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3600 1.1 mrg "module control-line \"%s\" cannot be"
3601 1.1 mrg " an object-like macro",
3602 1.1 mrg NODE_NAME (node));
3603 1.1 mrg }
3604 1.1 mrg
3605 1.1 mrg /* Map to underbar variants. */
3606 1.1 mrg keyword->val.node.node = n_modules[header_count
3607 1.1 mrg ? spec_nodes::M_IMPORT
3608 1.1 mrg : spec_nodes::M_MODULE][1];
  /* BACKUP is 2 only when a leading 'export' was consumed above, in
     which case RESULT is that 'export' token.  */
3609 1.1 mrg if (backup != 1)
3610 1.1 mrg result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3611 1.1 mrg
3612 1.1 mrg /* Maybe tell the tokenizer we expect a header-name down the
3613 1.1 mrg road. */
3614 1.1 mrg pfile->state.directive_file_token = header_count;
3615 1.1 mrg }
3616 1.1 mrg else
3617 1.1 mrg {
3618 1.1 mrg not_module:
3619 1.1 mrg /* Drop out of directive mode. */
3620 1.1 mrg /* We asserted save_comments had this value upon entry. */
3621 1.1 mrg pfile->state.save_comments
3622 1.1 mrg = !CPP_OPTION (pfile, discard_comments);
3623 1.1 mrg pfile->state.in_deferred_pragma = false;
3624 1.1 mrg /* Do not let this remain on. */
3625 1.1 mrg pfile->state.angled_headers = false;
3626 1.1 mrg }
3627 1.1 mrg
3628 1.1 mrg /* In either case we want to backup the peeked tokens. */
3629 1.1 mrg if (backup)
3630 1.1 mrg {
3631 1.1 mrg /* If we saw EOL, we should drop it, because this isn't a module
3632 1.1 mrg control-line after all. */
3633 1.1 mrg bool eol = peek->type == CPP_PRAGMA_EOL;
  /* A lone peeked EOL (BACKUP == 1) is simply not handed back.  */
3634 1.1 mrg if (!eol || backup > 1)
3635 1.1 mrg {
3636 1.1 mrg /* Put the peeked tokens back */
3637 1.1 mrg _cpp_backup_tokens_direct (pfile, backup);
3638 1.1 mrg /* But if the last one was an EOL, forget it. */
3639 1.1 mrg if (eol)
3640 1.1 mrg pfile->lookaheads--;
3641 1.1 mrg }
3642 1.1 mrg }
3643 1.1 mrg }
3644 1.1 mrg
3645 1.1 mrg /* Lex a token into RESULT (external interface). Takes care of issues
3646 1.1 mrg like directive handling, token lookahead, multiple include
3647 1.1 mrg optimization and skipping. */
/* Returns the next token, taken from the stored lookaheads when
   pfile->lookaheads is non-zero and from _cpp_lex_direct otherwise.
   The loop keeps fetching tokens while a directive yields only
   CPP_PADDING, and while pfile->state.skipping is set outside a
   directive, until CPP_EOF is reached.  */
3648 1.1 mrg const cpp_token *
3649 1.1 mrg _cpp_lex_token (cpp_reader *pfile)
3650 1.1 mrg {
3651 1.1 mrg cpp_token *result;
3652 1.1 mrg
3653 1.1 mrg for (;;)
3654 1.1 mrg {
  /* Move on to the following token run once this one is used up.  */
3655 1.1 mrg if (pfile->cur_token == pfile->cur_run->limit)
3656 1.1 mrg {
3657 1.1 mrg pfile->cur_run = next_tokenrun (pfile->cur_run);
3658 1.1 mrg pfile->cur_token = pfile->cur_run->base;
3659 1.1 mrg }
3660 1.1 mrg /* We assume that the current token is somewhere in the current
3661 1.1 mrg run. */
3662 1.1 mrg if (pfile->cur_token < pfile->cur_run->base
3663 1.1 mrg || pfile->cur_token >= pfile->cur_run->limit)
3664 1.1 mrg abort ();
3665 1.1 mrg
  /* A pending lookahead is simply the token already stored at
     cur_token; otherwise a new token has to be lexed.  */
3666 1.1 mrg if (pfile->lookaheads)
3667 1.1 mrg {
3668 1.1 mrg pfile->lookaheads--;
3669 1.1 mrg result = pfile->cur_token++;
3670 1.1 mrg }
3671 1.1 mrg else
3672 1.1 mrg result = _cpp_lex_direct (pfile);
3673 1.1 mrg
  /* Extra processing for tokens flagged BOL.  */
3674 1.1 mrg if (result->flags & BOL)
3675 1.1 mrg {
3676 1.1 mrg /* Is this a directive. If _cpp_handle_directive returns
3677 1.1 mrg false, it is an assembler #. */
3678 1.1 mrg if (result->type == CPP_HASH
3679 1.1 mrg /* 6.10.3 p 11: Directives in a list of macro arguments
3680 1.1 mrg gives undefined behavior. This implementation
3681 1.1 mrg handles the directive as normal. */
3682 1.1 mrg && pfile->state.parsing_args != 1)
3683 1.1 mrg {
3684 1.1 mrg if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3685 1.1 mrg {
  /* A CPP_PADDING directive result means there is nothing to return
     for this directive; go round and fetch the next token.  */
3686 1.1 mrg if (pfile->directive_result.type == CPP_PADDING)
3687 1.1 mrg continue;
3688 1.1 mrg result = &pfile->directive_result;
3689 1.1 mrg }
3690 1.1 mrg }
3691 1.1 mrg else if (pfile->state.in_deferred_pragma)
3692 1.1 mrg result = &pfile->directive_result;
3693 1.1 mrg else if (result->type == CPP_NAME
3694 1.1 mrg && (result->val.node.node->flags & NODE_MODULE)
3695 1.1 mrg && !pfile->state.skipping
3696 1.1 mrg /* Unlike regular directives, we do not deal with
3697 1.1 mrg tokenizing module directives as macro arguments.
3698 1.1 mrg That's not permitted. */
3699 1.1 mrg && !pfile->state.parsing_args)
3700 1.1 mrg {
3701 1.1 mrg /* P1857. Before macro expansion, At start of logical
3702 1.1 mrg line ... */
3703 1.1 mrg /* We don't have to consider lookaheads at this point. */
3704 1.1 mrg gcc_checking_assert (!pfile->lookaheads);
3705 1.1 mrg
3706 1.1 mrg cpp_maybe_module_directive (pfile, result);
3707 1.1 mrg }
3708 1.1 mrg
  /* Report the line change to the client, unless skipping.  */
3709 1.1 mrg if (pfile->cb.line_change && !pfile->state.skipping)
3710 1.1 mrg pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3711 1.1 mrg }
3712 1.1 mrg
3713 1.1 mrg /* We don't skip tokens in directives. */
3714 1.1 mrg if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3715 1.1 mrg break;
3716 1.1 mrg
3717 1.1 mrg /* Outside a directive, invalidate controlling macros. At file
3718 1.1 mrg EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3719 1.1 mrg get here and MI optimization works. */
3720 1.1 mrg pfile->mi_valid = false;
3721 1.1 mrg
  /* While skipping, keep discarding tokens until CPP_EOF shows up.  */
3722 1.1 mrg if (!pfile->state.skipping || result->type == CPP_EOF)
3723 1.1 mrg break;
3724 1.1 mrg }
3725 1.1 mrg
3726 1.1 mrg return result;
3727 1.1 mrg }
3728 1.1 mrg
3729 1.1.1.3 mrg /* Returns true if a fresh line has been loaded. */
/* More precisely: returns true when the current buffer does not need
   a new line (need_line is clear) or when _cpp_clean_line has been
   run to prepare the next one.  Returns false when no line can be
   provided: inside a directive, while parsing macro arguments at the
   end of a buffer, or at the end of a buffer that is not popped here
   (no previous buffer, or return_at_eof set).  Buffers that end and
   may be popped are popped, and the search continues in the buffer
   that is current afterwards.

   LEXING_RAW_STRING selects when pfile->state.in_directive blocks
   progress: if false, immediately on entry; if true, only once the
   current buffer has no further line to offer.  */
3730 1.1.1.3 mrg template <bool lexing_raw_string>
3731 1.1.1.3 mrg static bool
3732 1.1 mrg get_fresh_line_impl (cpp_reader *pfile)
3733 1.1.1.3 mrg {
3734 1.1.1.3 mrg /* We can't get a new line until we leave the current directive, unless we
3735 1.1.1.3 mrg are lexing a raw string, in which case it will be OK as long as we don't
3736 1.1.1.3 mrg pop the current buffer. */
3737 1.1 mrg if (!lexing_raw_string && pfile->state.in_directive)
3738 1.1 mrg return false;
3739 1.1 mrg
3740 1.1 mrg for (;;)
3741 1.1 mrg {
  /* Re-read on every iteration: _cpp_pop_buffer below is expected to
     change pfile->buffer.  */
3742 1.1 mrg cpp_buffer *buffer = pfile->buffer;
3743 1.1 mrg
  /* Nothing to do when the buffer does not need a new line.  */
3744 1.1 mrg if (!buffer->need_line)
3745 1.1 mrg return true;
3746 1.1 mrg
  /* There is still unread input before rlimit: prepare the next line.  */
3747 1.1 mrg if (buffer->next_line < buffer->rlimit)
3748 1.1 mrg {
3749 1.1 mrg _cpp_clean_line (pfile);
3750 1.1 mrg return true;
3751 1.1 mrg }
3752 1.1.1.3 mrg
3753 1.1.1.3 mrg /* We can't change buffers until we leave the current directive. */
3754 1.1.1.3 mrg if (lexing_raw_string && pfile->state.in_directive)
3755 1.1.1.3 mrg return false;
3756 1.1 mrg
3757 1.1 mrg /* First, get out of parsing arguments state. */
3758 1.1 mrg if (pfile->state.parsing_args)
3759 1.1 mrg return false;
3760 1.1 mrg
3761 1.1 mrg /* End of buffer. Non-empty files should end in a newline. */
3762 1.1 mrg if (buffer->buf != buffer->rlimit
3763 1.1 mrg && buffer->next_line > buffer->rlimit
3764 1.1 mrg && !buffer->from_stage3)
3765 1.1 mrg {
3766 1.1 mrg /* Clip to buffer size. */
3767 1.1 mrg buffer->next_line = buffer->rlimit;
3768 1.1 mrg }
3769 1.1 mrg
  /* A buffer with a predecessor, and without return_at_eof, is popped
     and the loop retries; otherwise this is reported as the end.  */
3770 1.1 mrg if (buffer->prev && !buffer->return_at_eof)
3771 1.1 mrg _cpp_pop_buffer (pfile);
3772 1.1 mrg else
3773 1.1 mrg {
3774 1.1 mrg /* End of translation. Do not pop the buffer yet. Increment
3775 1.1 mrg line number so that the EOF token is on a line of its own
3776 1.1 mrg (_cpp_lex_direct doesn't increment in that case, because
3777 1.1 mrg it's hard for it to distinguish this special case). */
3778 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
3779 1.1 mrg return false;
3780 1.1 mrg }
3781 1.1 mrg }
3782 1.1 mrg }
3783 1.1.1.3 mrg
/* Fetch a fresh line when not lexing a raw string: forwards to
   get_fresh_line_impl with LEXING_RAW_STRING false, so it returns
   false straight away while pfile->state.in_directive is set.
   Returns whatever get_fresh_line_impl returns.  */
3784 1.1.1.3 mrg bool
3785 1.1.1.3 mrg _cpp_get_fresh_line (cpp_reader *pfile)
3786 1.1.1.3 mrg {
3787 1.1.1.3 mrg return get_fresh_line_impl<false> (pfile);
3788 1.1.1.3 mrg }
3789 1.1.1.3 mrg
3790 1.1 mrg
/* Helper for lexing operators that have a one-character-longer
   variant.  Relies on two names being in scope at the point of use:
   `buffer', whose `cur' member points at the next unread character,
   and `result', the token being filled in.

   If the next character equals CHAR it is consumed and the token
   type becomes THEN_TYPE; otherwise nothing is consumed and the type
   becomes ELSE_TYPE.  The arguments are parenthesized so that any
   expression may be passed without mis-binding against `==' or `='.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      result->type = (ELSE_TYPE);                       \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
    }                                                   \
  while (0)
3799 1.1 mrg
3800 1.1 mrg /* Lex a token into pfile->cur_token, which is also incremented, to
3801 1.1 mrg get diagnostics pointing to the correct location.
3802 1.1 mrg
3803 1.1 mrg Does not handle issues such as token lookahead, multiple-include
3804 1.1 mrg optimization, directives, skipping etc. This function is only
3805 1.1 mrg suitable for use by _cpp_lex_token, and in special cases like
3806 1.1 mrg lex_expansion_token which doesn't care for any of these issues.
3807 1.1 mrg
3808 1.1 mrg When meeting a newline, returns CPP_EOF if parsing a directive,
3809 1.1 mrg otherwise returns to the start of the token buffer if permissible.
3810 1.1 mrg Returns the location of the lexed token. */
3811 1.1 mrg cpp_token *
3812 1.1 mrg _cpp_lex_direct (cpp_reader *pfile)
3813 1.1.1.3 mrg {
3814 1.1 mrg cppchar_t c = 0;
3815 1.1 mrg cpp_buffer *buffer;
3816 1.1 mrg const unsigned char *comment_start;
3817 1.1 mrg bool fallthrough_comment = false;
3818 1.1 mrg cpp_token *result = pfile->cur_token++;
3819 1.1 mrg
3820 1.1 mrg fresh_line:
3821 1.1 mrg result->flags = 0;
3822 1.1 mrg buffer = pfile->buffer;
3823 1.1 mrg if (buffer->need_line)
3824 1.1 mrg {
3825 1.1 mrg if (pfile->state.in_deferred_pragma)
3826 1.1 mrg {
3827 1.1 mrg /* This can happen in cases like:
3828 1.1 mrg #define loop(x) whatever
3829 1.1 mrg #pragma omp loop
3830 1.1 mrg where when trying to expand loop we need to peek
3831 1.1 mrg next token after loop, but aren't still in_deferred_pragma
3832 1.1 mrg mode but are in in_directive mode, so buffer->need_line
3833 1.1 mrg is set, a CPP_EOF is peeked. */
3834 1.1 mrg result->type = CPP_PRAGMA_EOL;
3835 1.1 mrg pfile->state.in_deferred_pragma = false;
3836 1.1 mrg if (!pfile->state.pragma_allow_expansion)
3837 1.1.1.3 mrg pfile->state.prevent_expansion--;
3838 1.1 mrg result->src_loc = pfile->line_table->highest_line;
3839 1.1 mrg return result;
3840 1.1 mrg }
3841 1.1 mrg if (!_cpp_get_fresh_line (pfile))
3842 1.1 mrg {
3843 1.1 mrg result->type = CPP_EOF;
3844 1.1 mrg /* Not a real EOF in a directive or arg parsing -- we refuse
3845 1.1 mrg to advance to the next file now, and will once we're out
3846 1.1 mrg of those modes. */
3847 1.1 mrg if (!pfile->state.in_directive && !pfile->state.parsing_args)
3848 1.1 mrg {
3849 1.1 mrg /* Tell the compiler the line number of the EOF token. */
3850 1.1 mrg result->src_loc = pfile->line_table->highest_line;
3851 1.1 mrg result->flags = BOL;
3852 1.1 mrg /* Now pop the buffer that _cpp_get_fresh_line did not. */
3853 1.1 mrg _cpp_pop_buffer (pfile);
3854 1.1.1.3 mrg }
3855 1.1.1.3 mrg else if (c == 0)
3856 1.1 mrg result->src_loc = pfile->line_table->highest_line;
3857 1.1 mrg return result;
3858 1.1 mrg }
3859 1.1 mrg if (buffer != pfile->buffer)
3860 1.1 mrg fallthrough_comment = false;
3861 1.1 mrg if (!pfile->keep_tokens)
3862 1.1 mrg {
3863 1.1 mrg pfile->cur_run = &pfile->base_run;
3864 1.1 mrg result = pfile->base_run.base;
3865 1.1 mrg pfile->cur_token = result + 1;
3866 1.1 mrg }
3867 1.1 mrg result->flags = BOL;
3868 1.1 mrg if (pfile->state.parsing_args == 2)
3869 1.1 mrg result->flags |= PREV_WHITE;
3870 1.1 mrg }
3871 1.1 mrg buffer = pfile->buffer;
3872 1.1 mrg update_tokens_line:
3873 1.1 mrg result->src_loc = pfile->line_table->highest_line;
3874 1.1 mrg
3875 1.1 mrg skipped_white:
3876 1.1 mrg if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3877 1.1 mrg && !pfile->overlaid_buffer)
3878 1.1 mrg {
3879 1.1 mrg _cpp_process_line_notes (pfile, false);
3880 1.1 mrg result->src_loc = pfile->line_table->highest_line;
3881 1.1 mrg }
3882 1.1 mrg c = *buffer->cur++;
3883 1.1 mrg
3884 1.1 mrg if (pfile->forced_token_location)
3885 1.1 mrg result->src_loc = pfile->forced_token_location;
3886 1.1 mrg else
3887 1.1 mrg result->src_loc = linemap_position_for_column (pfile->line_table,
3888 1.1 mrg CPP_BUF_COLUMN (buffer, buffer->cur));
3889 1.1 mrg
3890 1.1 mrg switch (c)
3891 1.1 mrg {
3892 1.1 mrg case ' ': case '\t': case '\f': case '\v': case '\0':
3893 1.1 mrg result->flags |= PREV_WHITE;
3894 1.1 mrg skip_whitespace (pfile, c);
3895 1.1 mrg goto skipped_white;
3896 1.1 mrg
3897 1.1 mrg case '\n':
3898 1.1 mrg /* Increment the line, unless this is the last line ... */
3899 1.1 mrg if (buffer->cur < buffer->rlimit
3900 1.1 mrg /* ... or this is a #include, (where _cpp_stack_file needs to
3901 1.1 mrg unwind by one line) ... */
3902 1.1 mrg || (pfile->state.in_directive > 1
3903 1.1 mrg /* ... except traditional-cpp increments this elsewhere. */
3904 1.1 mrg && !CPP_OPTION (pfile, traditional)))
3905 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
3906 1.1 mrg buffer->need_line = true;
3907 1.1 mrg if (pfile->state.in_deferred_pragma)
3908 1.1 mrg {
3909 1.1 mrg /* Produce the PRAGMA_EOL on this line. File reading
3910 1.1 mrg ensures there is always a \n at end of the buffer, thus
3911 1.1 mrg in a deferred pragma we always see CPP_PRAGMA_EOL before
3912 1.1 mrg any CPP_EOF. */
3913 1.1 mrg result->type = CPP_PRAGMA_EOL;
3914 1.1 mrg result->flags &= ~PREV_WHITE;
3915 1.1 mrg pfile->state.in_deferred_pragma = false;
3916 1.1 mrg if (!pfile->state.pragma_allow_expansion)
3917 1.1 mrg pfile->state.prevent_expansion--;
3918 1.1 mrg return result;
3919 1.1 mrg }
3920 1.1 mrg goto fresh_line;
3921 1.1 mrg
3922 1.1 mrg case '0': case '1': case '2': case '3': case '4':
3923 1.1 mrg case '5': case '6': case '7': case '8': case '9':
3924 1.1 mrg {
3925 1.1 mrg struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3926 1.1 mrg result->type = CPP_NUMBER;
3927 1.1.1.3 mrg lex_number (pfile, &result->val.str, &nst);
3928 1.1 mrg warn_about_normalization (pfile, result, &nst, false);
3929 1.1 mrg break;
3930 1.1 mrg }
3931 1.1 mrg
3932 1.1 mrg case 'L':
3933 1.1 mrg case 'u':
3934 1.1 mrg case 'U':
3935 1.1 mrg case 'R':
3936 1.1 mrg /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3937 1.1 mrg wide strings or raw strings. */
3938 1.1 mrg if (c == 'L' || CPP_OPTION (pfile, rliterals)
3939 1.1 mrg || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3940 1.1 mrg {
3941 1.1 mrg if ((*buffer->cur == '\'' && c != 'R')
3942 1.1 mrg || *buffer->cur == '"'
3943 1.1 mrg || (*buffer->cur == 'R'
3944 1.1 mrg && c != 'R'
3945 1.1 mrg && buffer->cur[1] == '"'
3946 1.1 mrg && CPP_OPTION (pfile, rliterals))
3947 1.1 mrg || (*buffer->cur == '8'
3948 1.1 mrg && c == 'u'
3949 1.1 mrg && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3950 1.1 mrg && CPP_OPTION (pfile, utf8_char_literals)))
3951 1.1 mrg || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3952 1.1 mrg && CPP_OPTION (pfile, rliterals)))))
3953 1.1 mrg {
3954 1.1 mrg lex_string (pfile, result, buffer->cur - 1);
3955 1.1 mrg break;
3956 1.1 mrg }
3957 1.1 mrg }
3958 1.1 mrg /* Fall through. */
3959 1.1 mrg
3960 1.1 mrg case '_':
3961 1.1 mrg case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3962 1.1 mrg case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3963 1.1 mrg case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3964 1.1 mrg case 's': case 't': case 'v': case 'w': case 'x':
3965 1.1 mrg case 'y': case 'z':
3966 1.1 mrg case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3967 1.1 mrg case 'G': case 'H': case 'I': case 'J': case 'K':
3968 1.1 mrg case 'M': case 'N': case 'O': case 'P': case 'Q':
3969 1.1 mrg case 'S': case 'T': case 'V': case 'W': case 'X':
3970 1.1 mrg case 'Y': case 'Z':
3971 1.1 mrg result->type = CPP_NAME;
3972 1.1 mrg {
3973 1.1.1.3 mrg struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3974 1.1.1.3 mrg const auto node = lex_identifier (pfile, buffer->cur - 1, false, &nst,
3975 1.1.1.3 mrg &result->val.node.spelling);
3976 1.1.1.3 mrg result->val.node.node = node;
3977 1.1.1.3 mrg identifier_diagnostics_on_lex (pfile, node);
3978 1.1 mrg warn_about_normalization (pfile, result, &nst, true);
3979 1.1 mrg }
3980 1.1 mrg
3981 1.1 mrg /* Convert named operators to their proper types. */
3982 1.1 mrg if (result->val.node.node->flags & NODE_OPERATOR)
3983 1.1 mrg {
3984 1.1 mrg result->flags |= NAMED_OP;
3985 1.1 mrg result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3986 1.1 mrg }
3987 1.1 mrg
3988 1.1 mrg /* Signal FALLTHROUGH comment followed by another token. */
3989 1.1 mrg if (fallthrough_comment)
3990 1.1 mrg result->flags |= PREV_FALLTHROUGH;
3991 1.1 mrg break;
3992 1.1 mrg
3993 1.1 mrg case '\'':
3994 1.1 mrg case '"':
3995 1.1 mrg lex_string (pfile, result, buffer->cur - 1);
3996 1.1 mrg break;
3997 1.1 mrg
3998 1.1 mrg case '/':
3999 1.1 mrg /* A potential block or line comment. */
4000 1.1 mrg comment_start = buffer->cur;
4001 1.1 mrg c = *buffer->cur;
4002 1.1 mrg
4003 1.1 mrg if (c == '*')
4004 1.1 mrg {
4005 1.1 mrg if (_cpp_skip_block_comment (pfile))
4006 1.1 mrg cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
4007 1.1 mrg }
4008 1.1 mrg else if (c == '/' && ! CPP_OPTION (pfile, traditional))
4009 1.1 mrg {
4010 1.1 mrg /* Don't warn for system headers. */
4011 1.1 mrg if (_cpp_in_system_header (pfile))
4012 1.1 mrg ;
4013 1.1 mrg /* Warn about comments if pedantically GNUC89, and not
4014 1.1 mrg in system headers. */
4015 1.1 mrg else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
4016 1.1 mrg && CPP_PEDANTIC (pfile)
4017 1.1 mrg && ! buffer->warned_cplusplus_comments)
4018 1.1 mrg {
4019 1.1 mrg if (cpp_error (pfile, CPP_DL_PEDWARN,
4020 1.1 mrg "C++ style comments are not allowed in ISO C90"))
4021 1.1 mrg cpp_error (pfile, CPP_DL_NOTE,
4022 1.1 mrg "(this will be reported only once per input file)");
4023 1.1 mrg buffer->warned_cplusplus_comments = 1;
4024 1.1 mrg }
4025 1.1 mrg /* Or if specifically desired via -Wc90-c99-compat. */
4026 1.1 mrg else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
4027 1.1 mrg && ! CPP_OPTION (pfile, cplusplus)
4028 1.1 mrg && ! buffer->warned_cplusplus_comments)
4029 1.1 mrg {
4030 1.1 mrg if (cpp_error (pfile, CPP_DL_WARNING,
4031 1.1 mrg "C++ style comments are incompatible with C90"))
4032 1.1 mrg cpp_error (pfile, CPP_DL_NOTE,
4033 1.1 mrg "(this will be reported only once per input file)");
4034 1.1 mrg buffer->warned_cplusplus_comments = 1;
4035 1.1 mrg }
4036 1.1 mrg /* In C89/C94, C++ style comments are forbidden. */
4037 1.1 mrg else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
4038 1.1 mrg || CPP_OPTION (pfile, lang) == CLK_STDC94))
4039 1.1 mrg {
4040 1.1 mrg /* But don't be confused about valid code such as
4041 1.1 mrg - // immediately followed by *,
4042 1.1 mrg - // in a preprocessing directive,
4043 1.1 mrg - // in an #if 0 block. */
4044 1.1 mrg if (buffer->cur[1] == '*'
4045 1.1 mrg || pfile->state.in_directive
4046 1.1 mrg || pfile->state.skipping)
4047 1.1 mrg {
4048 1.1 mrg result->type = CPP_DIV;
4049 1.1 mrg break;
4050 1.1 mrg }
4051 1.1 mrg else if (! buffer->warned_cplusplus_comments)
4052 1.1 mrg {
4053 1.1 mrg if (cpp_error (pfile, CPP_DL_ERROR,
4054 1.1 mrg "C++ style comments are not allowed in "
4055 1.1 mrg "ISO C90"))
4056 1.1 mrg cpp_error (pfile, CPP_DL_NOTE,
4057 1.1 mrg "(this will be reported only once per input "
4058 1.1 mrg "file)");
4059 1.1 mrg buffer->warned_cplusplus_comments = 1;
4060 1.1 mrg }
4061 1.1 mrg }
4062 1.1 mrg if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4063 1.1 mrg cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
4064 1.1 mrg }
4065 1.1 mrg else if (c == '=')
4066 1.1 mrg {
4067 1.1 mrg buffer->cur++;
4068 1.1 mrg result->type = CPP_DIV_EQ;
4069 1.1 mrg break;
4070 1.1 mrg }
4071 1.1 mrg else
4072 1.1 mrg {
4073 1.1 mrg result->type = CPP_DIV;
4074 1.1 mrg break;
4075 1.1 mrg }
4076 1.1 mrg
4077 1.1 mrg if (fallthrough_comment_p (pfile, comment_start))
4078 1.1 mrg fallthrough_comment = true;
4079 1.1 mrg
4080 1.1 mrg if (pfile->cb.comment)
4081 1.1 mrg {
4082 1.1 mrg size_t len = pfile->buffer->cur - comment_start;
4083 1.1 mrg pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4084 1.1 mrg len + 1);
4085 1.1 mrg }
4086 1.1 mrg
4087 1.1 mrg if (!pfile->state.save_comments)
4088 1.1 mrg {
4089 1.1 mrg result->flags |= PREV_WHITE;
4090 1.1 mrg goto update_tokens_line;
4091 1.1 mrg }
4092 1.1 mrg
4093 1.1 mrg if (fallthrough_comment)
4094 1.1 mrg result->flags |= PREV_FALLTHROUGH;
4095 1.1 mrg
4096 1.1 mrg /* Save the comment as a token in its own right. */
4097 1.1 mrg save_comment (pfile, result, comment_start, c);
4098 1.1 mrg break;
4099 1.1 mrg
4100 1.1 mrg case '<':
4101 1.1 mrg if (pfile->state.angled_headers)
4102 1.1 mrg {
4103 1.1 mrg lex_string (pfile, result, buffer->cur - 1);
4104 1.1 mrg if (result->type != CPP_LESS)
4105 1.1 mrg break;
4106 1.1 mrg }
4107 1.1 mrg
4108 1.1 mrg result->type = CPP_LESS;
4109 1.1 mrg if (*buffer->cur == '=')
4110 1.1 mrg {
4111 1.1 mrg buffer->cur++, result->type = CPP_LESS_EQ;
4112 1.1 mrg if (*buffer->cur == '>'
4113 1.1 mrg && CPP_OPTION (pfile, cplusplus)
4114 1.1 mrg && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4115 1.1 mrg buffer->cur++, result->type = CPP_SPACESHIP;
4116 1.1 mrg }
4117 1.1 mrg else if (*buffer->cur == '<')
4118 1.1 mrg {
4119 1.1 mrg buffer->cur++;
4120 1.1 mrg IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4121 1.1 mrg }
4122 1.1 mrg else if (CPP_OPTION (pfile, digraphs))
4123 1.1 mrg {
4124 1.1 mrg if (*buffer->cur == ':')
4125 1.1 mrg {
4126 1.1 mrg /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4127 1.1 mrg three characters are <:: and the subsequent character
4128 1.1 mrg is neither : nor >, the < is treated as a preprocessor
4129 1.1 mrg token by itself". */
4130 1.1 mrg if (CPP_OPTION (pfile, cplusplus)
4131 1.1 mrg && CPP_OPTION (pfile, lang) != CLK_CXX98
4132 1.1 mrg && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4133 1.1 mrg && buffer->cur[1] == ':'
4134 1.1 mrg && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4135 1.1 mrg break;
4136 1.1 mrg
4137 1.1 mrg buffer->cur++;
4138 1.1 mrg result->flags |= DIGRAPH;
4139 1.1 mrg result->type = CPP_OPEN_SQUARE;
4140 1.1 mrg }
4141 1.1 mrg else if (*buffer->cur == '%')
4142 1.1 mrg {
4143 1.1 mrg buffer->cur++;
4144 1.1 mrg result->flags |= DIGRAPH;
4145 1.1 mrg result->type = CPP_OPEN_BRACE;
4146 1.1 mrg }
4147 1.1 mrg }
4148 1.1 mrg break;
4149 1.1 mrg
4150 1.1 mrg case '>':
4151 1.1 mrg result->type = CPP_GREATER;
4152 1.1 mrg if (*buffer->cur == '=')
4153 1.1 mrg buffer->cur++, result->type = CPP_GREATER_EQ;
4154 1.1 mrg else if (*buffer->cur == '>')
4155 1.1 mrg {
4156 1.1 mrg buffer->cur++;
4157 1.1 mrg IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4158 1.1 mrg }
4159 1.1 mrg break;
4160 1.1 mrg
4161 1.1 mrg case '%':
4162 1.1 mrg result->type = CPP_MOD;
4163 1.1 mrg if (*buffer->cur == '=')
4164 1.1 mrg buffer->cur++, result->type = CPP_MOD_EQ;
4165 1.1 mrg else if (CPP_OPTION (pfile, digraphs))
4166 1.1 mrg {
4167 1.1 mrg if (*buffer->cur == ':')
4168 1.1 mrg {
4169 1.1 mrg buffer->cur++;
4170 1.1 mrg result->flags |= DIGRAPH;
4171 1.1 mrg result->type = CPP_HASH;
4172 1.1 mrg if (*buffer->cur == '%' && buffer->cur[1] == ':')
4173 1.1 mrg buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4174 1.1 mrg }
4175 1.1 mrg else if (*buffer->cur == '>')
4176 1.1 mrg {
4177 1.1 mrg buffer->cur++;
4178 1.1 mrg result->flags |= DIGRAPH;
4179 1.1 mrg result->type = CPP_CLOSE_BRACE;
4180 1.1 mrg }
4181 1.1 mrg }
4182 1.1 mrg break;
4183 1.1 mrg
4184 1.1 mrg case '.':
4185 1.1 mrg result->type = CPP_DOT;
4186 1.1 mrg if (ISDIGIT (*buffer->cur))
4187 1.1 mrg {
4188 1.1 mrg struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4189 1.1 mrg result->type = CPP_NUMBER;
4190 1.1.1.3 mrg lex_number (pfile, &result->val.str, &nst);
4191 1.1 mrg warn_about_normalization (pfile, result, &nst, false);
4192 1.1 mrg }
4193 1.1 mrg else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4194 1.1 mrg buffer->cur += 2, result->type = CPP_ELLIPSIS;
4195 1.1 mrg else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4196 1.1 mrg buffer->cur++, result->type = CPP_DOT_STAR;
4197 1.1 mrg break;
4198 1.1 mrg
4199 1.1 mrg case '+':
4200 1.1 mrg result->type = CPP_PLUS;
4201 1.1 mrg if (*buffer->cur == '+')
4202 1.1 mrg buffer->cur++, result->type = CPP_PLUS_PLUS;
4203 1.1 mrg else if (*buffer->cur == '=')
4204 1.1 mrg buffer->cur++, result->type = CPP_PLUS_EQ;
4205 1.1 mrg break;
4206 1.1 mrg
4207 1.1 mrg case '-':
4208 1.1 mrg result->type = CPP_MINUS;
4209 1.1 mrg if (*buffer->cur == '>')
4210 1.1 mrg {
4211 1.1 mrg buffer->cur++;
4212 1.1 mrg result->type = CPP_DEREF;
4213 1.1 mrg if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4214 1.1 mrg buffer->cur++, result->type = CPP_DEREF_STAR;
4215 1.1 mrg }
4216 1.1 mrg else if (*buffer->cur == '-')
4217 1.1 mrg buffer->cur++, result->type = CPP_MINUS_MINUS;
4218 1.1 mrg else if (*buffer->cur == '=')
4219 1.1 mrg buffer->cur++, result->type = CPP_MINUS_EQ;
4220 1.1 mrg break;
4221 1.1 mrg
4222 1.1 mrg case '&':
4223 1.1 mrg result->type = CPP_AND;
4224 1.1 mrg if (*buffer->cur == '&')
4225 1.1 mrg buffer->cur++, result->type = CPP_AND_AND;
4226 1.1 mrg else if (*buffer->cur == '=')
4227 1.1 mrg buffer->cur++, result->type = CPP_AND_EQ;
4228 1.1 mrg break;
4229 1.1 mrg
4230 1.1 mrg case '|':
4231 1.1 mrg result->type = CPP_OR;
4232 1.1 mrg if (*buffer->cur == '|')
4233 1.1 mrg buffer->cur++, result->type = CPP_OR_OR;
4234 1.1 mrg else if (*buffer->cur == '=')
4235 1.1 mrg buffer->cur++, result->type = CPP_OR_EQ;
4236 1.1 mrg break;
4237 1.1 mrg
4238 1.1 mrg case ':':
4239 1.1.1.2 mrg result->type = CPP_COLON;
4240 1.1.1.2 mrg if (*buffer->cur == ':')
4241 1.1.1.2 mrg {
4242 1.1.1.2 mrg if (CPP_OPTION (pfile, scope))
4243 1.1.1.2 mrg buffer->cur++, result->type = CPP_SCOPE;
4244 1.1.1.2 mrg else
4245 1.1.1.2 mrg result->flags |= COLON_SCOPE;
4246 1.1 mrg }
4247 1.1 mrg else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4248 1.1 mrg {
4249 1.1 mrg buffer->cur++;
4250 1.1 mrg result->flags |= DIGRAPH;
4251 1.1 mrg result->type = CPP_CLOSE_SQUARE;
4252 1.1 mrg }
4253 1.1 mrg break;
4254 1.1 mrg
4255 1.1 mrg case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4256 1.1 mrg case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4257 1.1 mrg case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4258 1.1 mrg case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4259 1.1 mrg case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4260 1.1 mrg
4261 1.1 mrg case '?': result->type = CPP_QUERY; break;
4262 1.1 mrg case '~': result->type = CPP_COMPL; break;
4263 1.1 mrg case ',': result->type = CPP_COMMA; break;
4264 1.1 mrg case '(': result->type = CPP_OPEN_PAREN; break;
4265 1.1 mrg case ')': result->type = CPP_CLOSE_PAREN; break;
4266 1.1 mrg case '[': result->type = CPP_OPEN_SQUARE; break;
4267 1.1 mrg case ']': result->type = CPP_CLOSE_SQUARE; break;
4268 1.1 mrg case '{': result->type = CPP_OPEN_BRACE; break;
4269 1.1 mrg case '}': result->type = CPP_CLOSE_BRACE; break;
4270 1.1 mrg case ';': result->type = CPP_SEMICOLON; break;
4271 1.1 mrg
4272 1.1 mrg /* @ is a punctuator in Objective-C. */
4273 1.1 mrg case '@': result->type = CPP_ATSIGN; break;
4274 1.1 mrg
4275 1.1 mrg default:
4276 1.1 mrg {
4277 1.1.1.3 mrg const uchar *base = --buffer->cur;
4278 1.1 mrg static int no_warn_cnt;
4279 1.1 mrg
4280 1.1 mrg /* Check for an extended identifier ($ or UCN or UTF-8). */
4281 1.1 mrg struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4282 1.1 mrg if (forms_identifier_p (pfile, true, &nst))
4283 1.1 mrg {
4284 1.1.1.3 mrg result->type = CPP_NAME;
4285 1.1.1.3 mrg const auto node = lex_identifier (pfile, base, true, &nst,
4286 1.1.1.3 mrg &result->val.node.spelling);
4287 1.1.1.3 mrg result->val.node.node = node;
4288 1.1.1.3 mrg identifier_diagnostics_on_lex (pfile, node);
4289 1.1 mrg warn_about_normalization (pfile, result, &nst, true);
4290 1.1 mrg break;
4291 1.1 mrg }
4292 1.1 mrg
4293 1.1 mrg /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4294 1.1 mrg single token. */
4295 1.1 mrg buffer->cur++;
4296 1.1 mrg if (c >= utf8_signifier)
4297 1.1 mrg {
4298 1.1 mrg const uchar *pstr = base;
4299 1.1 mrg cppchar_t s;
4300 1.1.1.3 mrg if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4301 1.1.1.3 mrg {
4302 1.1.1.3 mrg if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4303 1.1.1.3 mrg {
4304 1.1.1.3 mrg buffer->cur = base;
4305 1.1.1.3 mrg _cpp_warn_invalid_utf8 (pfile);
4306 1.1.1.3 mrg }
4307 1.1.1.3 mrg buffer->cur = pstr;
4308 1.1.1.3 mrg }
4309 1.1.1.3 mrg else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4310 1.1.1.3 mrg {
4311 1.1.1.3 mrg buffer->cur = base;
4312 1.1.1.3 mrg const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4313 1.1.1.3 mrg buffer->cur = base + 1;
4314 1.1.1.3 mrg no_warn_cnt = end - buffer->cur;
4315 1.1.1.3 mrg }
4316 1.1.1.3 mrg }
4317 1.1.1.3 mrg else if (c >= utf8_continuation
4318 1.1.1.3 mrg && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4319 1.1.1.3 mrg {
4320 1.1.1.3 mrg if (no_warn_cnt)
4321 1.1.1.3 mrg --no_warn_cnt;
4322 1.1.1.3 mrg else
4323 1.1.1.3 mrg {
4324 1.1.1.3 mrg buffer->cur = base;
4325 1.1.1.3 mrg _cpp_warn_invalid_utf8 (pfile);
4326 1.1.1.3 mrg buffer->cur = base + 1;
4327 1.1 mrg }
4328 1.1 mrg }
4329 1.1 mrg create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4330 1.1 mrg break;
4331 1.1 mrg }
4332 1.1 mrg
4333 1.1 mrg }
4334 1.1 mrg
4335 1.1 mrg /* Potentially convert the location of the token to a range. */
4336 1.1 mrg if (result->src_loc >= RESERVED_LOCATION_COUNT
4337 1.1 mrg && result->type != CPP_EOF)
4338 1.1 mrg {
4339 1.1 mrg /* Ensure that any line notes are processed, so that we have the
4340 1.1 mrg correct physical line/column for the end-point of the token even
4341 1.1 mrg when a logical line is split via one or more backslashes. */
4342 1.1 mrg if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4343 1.1 mrg && !pfile->overlaid_buffer)
4344 1.1 mrg _cpp_process_line_notes (pfile, false);
4345 1.1 mrg
4346 1.1 mrg source_range tok_range;
4347 1.1 mrg tok_range.m_start = result->src_loc;
4348 1.1 mrg tok_range.m_finish
4349 1.1 mrg = linemap_position_for_column (pfile->line_table,
4350 1.1 mrg CPP_BUF_COLUMN (buffer, buffer->cur));
4351 1.1.1.3 mrg
4352 1.1.1.3 mrg result->src_loc
4353 1.1.1.3 mrg = pfile->line_table->get_or_create_combined_loc (result->src_loc,
4354 1.1 mrg tok_range, nullptr, 0);
4355 1.1 mrg }
4356 1.1 mrg
4357 1.1 mrg return result;
4358 1.1 mrg }
4359 1.1 mrg
4360 1.1 mrg /* An upper bound on the number of bytes needed to spell TOKEN.
4361 1.1 mrg Does not include preceding whitespace. */
4362 1.1 mrg unsigned int
4363 1.1 mrg cpp_token_len (const cpp_token *token)
4364 1.1 mrg {
4365 1.1 mrg unsigned int len;
4366 1.1 mrg
4367 1.1 mrg switch (TOKEN_SPELL (token))
4368 1.1 mrg {
4369 1.1 mrg default: len = 6; break;
4370 1.1 mrg case SPELL_LITERAL: len = token->val.str.len; break;
4371 1.1 mrg case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4372 1.1 mrg }
4373 1.1 mrg
4374 1.1 mrg return len;
4375 1.1 mrg }
4376 1.1 mrg
/* Decode the single UTF-8 sequence that starts at NAME and store it in
   BUFFER as a \U escape: a backslash, 'U' and eight lower-case hex
   digits.  Exactly ten bytes are written and no terminating NUL is
   added.  Return the number of bytes of NAME that were consumed.
   Calls abort if a trailing byte is not a UTF-8 continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char lead = *name;
  unsigned long code_point;
  unsigned int probe;
  int seq_len = 0;
  int shift;
  int k;

  /* The number of leading one bits in the first byte gives the length
     of the whole sequence.  */
  probe = lead;
  while (probe & 0x80)
    {
      seq_len++;
      probe <<= 1;
    }

  /* The bits below the length marker are the most significant payload
     bits.  */
  code_point = lead & (0x7F >> seq_len);

  /* Every following byte contributes six further bits.  */
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char trail = name[k];

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 0, shift = 28; k < 8; k++, shift -= 4)
    buffer[2 + k] = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
4410 1.1 mrg
4411 1.1 mrg /* Given a token TYPE corresponding to a digraph, return a pointer to
4412 1.1 mrg the spelling of the digraph. */
4413 1.1 mrg static const unsigned char *
4414 1.1 mrg cpp_digraph2name (enum cpp_ttype type)
4415 1.1 mrg {
4416 1.1 mrg return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4417 1.1 mrg }
4418 1.1 mrg
4419 1.1.1.3 mrg /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4420 1.1 mrg The buffer must already contain enough space to hold the
4421 1.1 mrg token's spelling. Returns a pointer to the character after the
4422 1.1 mrg last character written. */
4423 1.1 mrg unsigned char *
4424 1.1 mrg _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4425 1.1 mrg {
4426 1.1 mrg size_t i;
4427 1.1 mrg const unsigned char *name = NODE_NAME (ident);
4428 1.1 mrg
4429 1.1 mrg for (i = 0; i < NODE_LEN (ident); i++)
4430 1.1 mrg if (name[i] & ~0x7F)
4431 1.1 mrg {
4432 1.1 mrg i += utf8_to_ucn (buffer, name + i) - 1;
4433 1.1 mrg buffer += 10;
4434 1.1 mrg }
4435 1.1 mrg else
4436 1.1 mrg *buffer++ = name[i];
4437 1.1 mrg
4438 1.1 mrg return buffer;
4439 1.1 mrg }
4440 1.1 mrg
4441 1.1.1.3 mrg /* Write the spelling of a token TOKEN to BUFFER. The buffer must
4442 1.1 mrg already contain enough space to hold the token's spelling.
4443 1.1 mrg Returns a pointer to the character after the last character written.
4444 1.1 mrg FORSTRING is true if this is to be the spelling after translation
4445 1.1 mrg phase 1 (with the original spelling of extended identifiers), false
4446 1.1 mrg if extended identifiers should always be written using UCNs (there is
4447 1.1 mrg no option for always writing them in the internal UTF-8 form).
4448 1.1 mrg FIXME: Would be nice if we didn't need the PFILE argument. */
4449 1.1 mrg unsigned char *
4450 1.1 mrg cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4451 1.1 mrg unsigned char *buffer, bool forstring)
4452 1.1 mrg {
4453 1.1 mrg switch (TOKEN_SPELL (token))
4454 1.1 mrg {
4455 1.1 mrg case SPELL_OPERATOR:
4456 1.1 mrg {
4457 1.1 mrg const unsigned char *spelling;
4458 1.1 mrg unsigned char c;
4459 1.1 mrg
4460 1.1 mrg if (token->flags & DIGRAPH)
4461 1.1 mrg spelling = cpp_digraph2name (token->type);
4462 1.1 mrg else if (token->flags & NAMED_OP)
4463 1.1 mrg goto spell_ident;
4464 1.1 mrg else
4465 1.1 mrg spelling = TOKEN_NAME (token);
4466 1.1 mrg
4467 1.1 mrg while ((c = *spelling++) != '\0')
4468 1.1 mrg *buffer++ = c;
4469 1.1 mrg }
4470 1.1 mrg break;
4471 1.1 mrg
4472 1.1 mrg spell_ident:
4473 1.1 mrg case SPELL_IDENT:
4474 1.1 mrg if (forstring)
4475 1.1 mrg {
4476 1.1 mrg memcpy (buffer, NODE_NAME (token->val.node.spelling),
4477 1.1 mrg NODE_LEN (token->val.node.spelling));
4478 1.1 mrg buffer += NODE_LEN (token->val.node.spelling);
4479 1.1 mrg }
4480 1.1 mrg else
4481 1.1 mrg buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4482 1.1 mrg break;
4483 1.1 mrg
4484 1.1 mrg case SPELL_LITERAL:
4485 1.1 mrg memcpy (buffer, token->val.str.text, token->val.str.len);
4486 1.1 mrg buffer += token->val.str.len;
4487 1.1 mrg break;
4488 1.1 mrg
4489 1.1 mrg case SPELL_NONE:
4490 1.1 mrg cpp_error (pfile, CPP_DL_ICE,
4491 1.1 mrg "unspellable token %s", TOKEN_NAME (token));
4492 1.1 mrg break;
4493 1.1 mrg }
4494 1.1 mrg
4495 1.1 mrg return buffer;
4496 1.1 mrg }
4497 1.1 mrg
4498 1.1 mrg /* Returns TOKEN spelt as a null-terminated string. The string is
4499 1.1 mrg freed when the reader is destroyed. Useful for diagnostics. */
4500 1.1 mrg unsigned char *
4501 1.1 mrg cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4502 1.1 mrg {
4503 1.1 mrg unsigned int len = cpp_token_len (token) + 1;
4504 1.1 mrg unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4505 1.1 mrg
4506 1.1 mrg end = cpp_spell_token (pfile, token, start, false);
4507 1.1 mrg end[0] = '\0';
4508 1.1 mrg
4509 1.1 mrg return start;
4510 1.1 mrg }
4511 1.1 mrg
4512 1.1 mrg /* Returns a pointer to a string which spells the token defined by
4513 1.1 mrg TYPE and FLAGS. Used by C front ends, which really should move to
4514 1.1 mrg using cpp_token_as_text. */
4515 1.1 mrg const char *
4516 1.1 mrg cpp_type2name (enum cpp_ttype type, unsigned char flags)
4517 1.1 mrg {
4518 1.1 mrg if (flags & DIGRAPH)
4519 1.1 mrg return (const char *) cpp_digraph2name (type);
4520 1.1 mrg else if (flags & NAMED_OP)
4521 1.1 mrg return cpp_named_operator2name (type);
4522 1.1 mrg
4523 1.1 mrg return (const char *) token_spellings[type].name;
4524 1.1 mrg }
4525 1.1 mrg
4526 1.1 mrg /* Writes the spelling of token to FP, without any preceding space.
4527 1.1 mrg Separated from cpp_spell_token for efficiency - to avoid stdio
4528 1.1 mrg double-buffering. */
4529 1.1 mrg void
4530 1.1 mrg cpp_output_token (const cpp_token *token, FILE *fp)
4531 1.1 mrg {
4532 1.1 mrg switch (TOKEN_SPELL (token))
4533 1.1 mrg {
4534 1.1 mrg case SPELL_OPERATOR:
4535 1.1 mrg {
4536 1.1 mrg const unsigned char *spelling;
4537 1.1 mrg int c;
4538 1.1 mrg
4539 1.1 mrg if (token->flags & DIGRAPH)
4540 1.1 mrg spelling = cpp_digraph2name (token->type);
4541 1.1 mrg else if (token->flags & NAMED_OP)
4542 1.1 mrg goto spell_ident;
4543 1.1 mrg else
4544 1.1 mrg spelling = TOKEN_NAME (token);
4545 1.1 mrg
4546 1.1 mrg c = *spelling;
4547 1.1 mrg do
4548 1.1 mrg putc (c, fp);
4549 1.1 mrg while ((c = *++spelling) != '\0');
4550 1.1 mrg }
4551 1.1 mrg break;
4552 1.1 mrg
4553 1.1 mrg spell_ident:
4554 1.1 mrg case SPELL_IDENT:
4555 1.1 mrg {
4556 1.1 mrg size_t i;
4557 1.1 mrg const unsigned char * name = NODE_NAME (token->val.node.node);
4558 1.1 mrg
4559 1.1 mrg for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4560 1.1 mrg if (name[i] & ~0x7F)
4561 1.1 mrg {
4562 1.1 mrg unsigned char buffer[10];
4563 1.1 mrg i += utf8_to_ucn (buffer, name + i) - 1;
4564 1.1 mrg fwrite (buffer, 1, 10, fp);
4565 1.1 mrg }
4566 1.1 mrg else
4567 1.1 mrg fputc (NODE_NAME (token->val.node.node)[i], fp);
4568 1.1 mrg }
4569 1.1 mrg break;
4570 1.1 mrg
4571 1.1 mrg case SPELL_LITERAL:
4572 1.1 mrg if (token->type == CPP_HEADER_NAME)
4573 1.1 mrg fputc ('"', fp);
4574 1.1 mrg fwrite (token->val.str.text, 1, token->val.str.len, fp);
4575 1.1 mrg if (token->type == CPP_HEADER_NAME)
4576 1.1 mrg fputc ('"', fp);
4577 1.1 mrg break;
4578 1.1 mrg
4579 1.1 mrg case SPELL_NONE:
4580 1.1 mrg /* An error, most probably. */
4581 1.1 mrg break;
4582 1.1 mrg }
4583 1.1 mrg }
4584 1.1 mrg
4585 1.1 mrg /* Compare two tokens. */
4586 1.1 mrg int
4587 1.1 mrg _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4588 1.1 mrg {
4589 1.1 mrg if (a->type == b->type && a->flags == b->flags)
4590 1.1 mrg switch (TOKEN_SPELL (a))
4591 1.1 mrg {
4592 1.1 mrg default: /* Keep compiler happy. */
4593 1.1 mrg case SPELL_OPERATOR:
4594 1.1 mrg /* token_no is used to track where multiple consecutive ##
4595 1.1 mrg tokens were originally located. */
4596 1.1 mrg return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4597 1.1 mrg case SPELL_NONE:
4598 1.1 mrg return (a->type != CPP_MACRO_ARG
4599 1.1 mrg || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4600 1.1 mrg && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4601 1.1 mrg case SPELL_IDENT:
4602 1.1 mrg return (a->val.node.node == b->val.node.node
4603 1.1 mrg && a->val.node.spelling == b->val.node.spelling);
4604 1.1 mrg case SPELL_LITERAL:
4605 1.1 mrg return (a->val.str.len == b->val.str.len
4606 1.1 mrg && !memcmp (a->val.str.text, b->val.str.text,
4607 1.1 mrg a->val.str.len));
4608 1.1 mrg }
4609 1.1 mrg
4610 1.1 mrg return 0;
4611 1.1 mrg }
4612 1.1 mrg
4613 1.1 mrg /* Returns nonzero if a space should be inserted to avoid an
4614 1.1 mrg accidental token paste for output. For simplicity, it is
4615 1.1 mrg conservative, and occasionally advises a space where one is not
4616 1.1 mrg needed, e.g. "." and ".2". */
4617 1.1 mrg int
4618 1.1 mrg cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4619 1.1 mrg const cpp_token *token2)
4620 1.1 mrg {
4621 1.1 mrg enum cpp_ttype a = token1->type, b = token2->type;
4622 1.1 mrg cppchar_t c;
4623 1.1 mrg
4624 1.1 mrg if (token1->flags & NAMED_OP)
4625 1.1 mrg a = CPP_NAME;
4626 1.1 mrg if (token2->flags & NAMED_OP)
4627 1.1 mrg b = CPP_NAME;
4628 1.1 mrg
4629 1.1 mrg c = EOF;
4630 1.1 mrg if (token2->flags & DIGRAPH)
4631 1.1 mrg c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4632 1.1 mrg else if (token_spellings[b].category == SPELL_OPERATOR)
4633 1.1 mrg c = token_spellings[b].name[0];
4634 1.1 mrg
4635 1.1 mrg /* Quickly get everything that can paste with an '='. */
4636 1.1 mrg if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4637 1.1 mrg return 1;
4638 1.1 mrg
4639 1.1 mrg switch (a)
4640 1.1 mrg {
4641 1.1 mrg case CPP_GREATER: return c == '>';
4642 1.1 mrg case CPP_LESS: return c == '<' || c == '%' || c == ':';
4643 1.1 mrg case CPP_PLUS: return c == '+';
4644 1.1 mrg case CPP_MINUS: return c == '-' || c == '>';
4645 1.1 mrg case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4646 1.1 mrg case CPP_MOD: return c == ':' || c == '>';
4647 1.1 mrg case CPP_AND: return c == '&';
4648 1.1 mrg case CPP_OR: return c == '|';
4649 1.1 mrg case CPP_COLON: return c == ':' || c == '>';
4650 1.1 mrg case CPP_DEREF: return c == '*';
4651 1.1 mrg case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4652 1.1 mrg case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4653 1.1 mrg case CPP_PRAGMA:
4654 1.1 mrg case CPP_NAME: return ((b == CPP_NUMBER
4655 1.1 mrg && name_p (pfile, &token2->val.str))
4656 1.1 mrg || b == CPP_NAME
4657 1.1 mrg || b == CPP_CHAR || b == CPP_STRING); /* L */
4658 1.1 mrg case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4659 1.1 mrg || b == CPP_CHAR
4660 1.1 mrg || c == '.' || c == '+' || c == '-');
4661 1.1 mrg /* UCNs */
4662 1.1 mrg case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4663 1.1 mrg && b == CPP_NAME)
4664 1.1 mrg || (CPP_OPTION (pfile, objc)
4665 1.1 mrg && token1->val.str.text[0] == '@'
4666 1.1 mrg && (b == CPP_NAME || b == CPP_STRING)));
4667 1.1 mrg case CPP_LESS_EQ: return c == '>';
4668 1.1 mrg case CPP_STRING:
4669 1.1 mrg case CPP_WSTRING:
4670 1.1 mrg case CPP_UTF8STRING:
4671 1.1 mrg case CPP_STRING16:
4672 1.1 mrg case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4673 1.1 mrg && (b == CPP_NAME
4674 1.1 mrg || (TOKEN_SPELL (token2) == SPELL_LITERAL
4675 1.1 mrg && ISIDST (token2->val.str.text[0]))));
4676 1.1 mrg
4677 1.1 mrg default: break;
4678 1.1 mrg }
4679 1.1 mrg
4680 1.1 mrg return 0;
4681 1.1 mrg }
4682 1.1 mrg
4683 1.1 mrg /* Output all the remaining tokens on the current line, and a newline
4684 1.1 mrg character, to FP. Leading whitespace is removed. If there are
4685 1.1 mrg macros, special token padding is not performed. */
4686 1.1 mrg void
4687 1.1 mrg cpp_output_line (cpp_reader *pfile, FILE *fp)
4688 1.1 mrg {
4689 1.1 mrg const cpp_token *token;
4690 1.1 mrg
4691 1.1 mrg token = cpp_get_token (pfile);
4692 1.1 mrg while (token->type != CPP_EOF)
4693 1.1 mrg {
4694 1.1 mrg cpp_output_token (token, fp);
4695 1.1 mrg token = cpp_get_token (pfile);
4696 1.1 mrg if (token->flags & PREV_WHITE)
4697 1.1 mrg putc (' ', fp);
4698 1.1 mrg }
4699 1.1 mrg
4700 1.1 mrg putc ('\n', fp);
4701 1.1 mrg }
4702 1.1 mrg
4703 1.1 mrg /* Return a string representation of all the remaining tokens on the
4704 1.1 mrg current line. The result is allocated using xmalloc and must be
4705 1.1 mrg freed by the caller. */
4706 1.1 mrg unsigned char *
4707 1.1 mrg cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4708 1.1 mrg {
4709 1.1 mrg const cpp_token *token;
4710 1.1 mrg unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4711 1.1 mrg unsigned int alloced = 120 + out;
4712 1.1 mrg unsigned char *result = (unsigned char *) xmalloc (alloced);
4713 1.1 mrg
4714 1.1 mrg /* If DIR_NAME is empty, there are no initial contents. */
4715 1.1 mrg if (dir_name)
4716 1.1 mrg {
4717 1.1 mrg sprintf ((char *) result, "#%s ", dir_name);
4718 1.1 mrg out += 2;
4719 1.1 mrg }
4720 1.1 mrg
4721 1.1 mrg token = cpp_get_token (pfile);
4722 1.1 mrg while (token->type != CPP_EOF)
4723 1.1 mrg {
4724 1.1 mrg unsigned char *last;
4725 1.1 mrg /* Include room for a possible space and the terminating nul. */
4726 1.1 mrg unsigned int len = cpp_token_len (token) + 2;
4727 1.1 mrg
4728 1.1 mrg if (out + len > alloced)
4729 1.1 mrg {
4730 1.1 mrg alloced *= 2;
4731 1.1 mrg if (out + len > alloced)
4732 1.1 mrg alloced = out + len;
4733 1.1 mrg result = (unsigned char *) xrealloc (result, alloced);
4734 1.1 mrg }
4735 1.1 mrg
4736 1.1 mrg last = cpp_spell_token (pfile, token, &result[out], 0);
4737 1.1 mrg out = last - result;
4738 1.1 mrg
4739 1.1 mrg token = cpp_get_token (pfile);
4740 1.1 mrg if (token->flags & PREV_WHITE)
4741 1.1 mrg result[out++] = ' ';
4742 1.1 mrg }
4743 1.1 mrg
4744 1.1 mrg result[out] = '\0';
4745 1.1 mrg return result;
4746 1.1 mrg }
4747 1.1 mrg
/* Memory buffers.  Changing MIN_BUFF_SIZE or the two sizing formulas
   below can have a dramatic effect on performance.  The values here
   are reasonable defaults, but might be tuned.  If you adjust them, be
   sure to test across a range of uses of cpplib, including heavy
   nested function-like macro expansion.  Also check the change in
   peak memory usage (NJAMD is a good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the bytes between BUFF's cur
   and limit.  Every macro argument is parenthesized so that an
   argument containing a lower-precedence operator expands as the
   caller intended.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4762 1.1 mrg
4763 1.1 mrg /* Create a new allocation buffer. Place the control block at the end
4764 1.1 mrg of the buffer, so that buffer overflows will cause immediate chaos. */
4765 1.1 mrg static _cpp_buff *
4766 1.1 mrg new_buff (size_t len)
4767 1.1 mrg {
4768 1.1 mrg _cpp_buff *result;
4769 1.1 mrg unsigned char *base;
4770 1.1 mrg
4771 1.1 mrg if (len < MIN_BUFF_SIZE)
4772 1.1 mrg len = MIN_BUFF_SIZE;
4773 1.1 mrg len = CPP_ALIGN (len);
4774 1.1.1.3 mrg
4775 1.1 mrg #ifdef ENABLE_VALGRIND_WORKAROUNDS
4776 1.1 mrg /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4777 1.1 mrg struct first. */
4778 1.1 mrg size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4779 1.1 mrg base = XNEWVEC (unsigned char, len + slen);
4780 1.1 mrg result = (_cpp_buff *) base;
4781 1.1 mrg base += slen;
4782 1.1 mrg #else
4783 1.1 mrg base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4784 1.1 mrg result = (_cpp_buff *) (base + len);
4785 1.1 mrg #endif
4786 1.1 mrg result->base = base;
4787 1.1 mrg result->cur = base;
4788 1.1 mrg result->limit = base + len;
4789 1.1 mrg result->next = NULL;
4790 1.1 mrg return result;
4791 1.1 mrg }
4792 1.1 mrg
4793 1.1 mrg /* Place a chain of unwanted allocation buffers on the free list. */
4794 1.1 mrg void
4795 1.1 mrg _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4796 1.1 mrg {
4797 1.1 mrg _cpp_buff *end = buff;
4798 1.1 mrg
4799 1.1 mrg while (end->next)
4800 1.1 mrg end = end->next;
4801 1.1 mrg end->next = pfile->free_buffs;
4802 1.1 mrg pfile->free_buffs = buff;
4803 1.1 mrg }
4804 1.1 mrg
4805 1.1 mrg /* Return a free buffer of size at least MIN_SIZE. */
4806 1.1 mrg _cpp_buff *
4807 1.1 mrg _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4808 1.1 mrg {
4809 1.1 mrg _cpp_buff *result, **p;
4810 1.1 mrg
4811 1.1 mrg for (p = &pfile->free_buffs;; p = &(*p)->next)
4812 1.1 mrg {
4813 1.1 mrg size_t size;
4814 1.1 mrg
4815 1.1 mrg if (*p == NULL)
4816 1.1 mrg return new_buff (min_size);
4817 1.1 mrg result = *p;
4818 1.1 mrg size = result->limit - result->base;
4819 1.1 mrg /* Return a buffer that's big enough, but don't waste one that's
4820 1.1 mrg way too big. */
4821 1.1 mrg if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4822 1.1 mrg break;
4823 1.1 mrg }
4824 1.1 mrg
4825 1.1 mrg *p = result->next;
4826 1.1 mrg result->next = NULL;
4827 1.1 mrg result->cur = result->base;
4828 1.1 mrg return result;
4829 1.1 mrg }
4830 1.1 mrg
4831 1.1 mrg /* Creates a new buffer with enough space to hold the uncommitted
4832 1.1 mrg remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4833 1.1 mrg the excess bytes to the new buffer. Chains the new buffer after
4834 1.1 mrg BUFF, and returns the new buffer. */
4835 1.1 mrg _cpp_buff *
4836 1.1 mrg _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4837 1.1 mrg {
4838 1.1 mrg size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4839 1.1 mrg _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4840 1.1 mrg
4841 1.1 mrg buff->next = new_buff;
4842 1.1 mrg memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4843 1.1 mrg return new_buff;
4844 1.1 mrg }
4845 1.1 mrg
4846 1.1 mrg /* Creates a new buffer with enough space to hold the uncommitted
4847 1.1 mrg remaining bytes of the buffer pointed to by BUFF, and at least
4848 1.1 mrg MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4849 1.1 mrg Chains the new buffer before the buffer pointed to by BUFF, and
4850 1.1 mrg updates the pointer to point to the new buffer. */
4851 1.1 mrg void
4852 1.1 mrg _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4853 1.1 mrg {
4854 1.1 mrg _cpp_buff *new_buff, *old_buff = *pbuff;
4855 1.1 mrg size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4856 1.1 mrg
4857 1.1 mrg new_buff = _cpp_get_buff (pfile, size);
4858 1.1 mrg memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4859 1.1 mrg new_buff->next = old_buff;
4860 1.1 mrg *pbuff = new_buff;
4861 1.1 mrg }
4862 1.1 mrg
4863 1.1 mrg /* Free a chain of buffers starting at BUFF. */
4864 1.1 mrg void
4865 1.1 mrg _cpp_free_buff (_cpp_buff *buff)
4866 1.1 mrg {
4867 1.1 mrg _cpp_buff *next;
4868 1.1 mrg
4869 1.1 mrg for (; buff; buff = next)
4870 1.1 mrg {
4871 1.1.1.3 mrg next = buff->next;
4872 1.1 mrg #ifdef ENABLE_VALGRIND_WORKAROUNDS
4873 1.1 mrg free (buff);
4874 1.1 mrg #else
4875 1.1 mrg free (buff->base);
4876 1.1 mrg #endif
4877 1.1 mrg }
4878 1.1 mrg }
4879 1.1 mrg
4880 1.1 mrg /* Allocate permanent, unaligned storage of length LEN. */
4881 1.1 mrg unsigned char *
4882 1.1 mrg _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4883 1.1 mrg {
4884 1.1 mrg _cpp_buff *buff = pfile->u_buff;
4885 1.1 mrg unsigned char *result = buff->cur;
4886 1.1 mrg
4887 1.1 mrg if (len > (size_t) (buff->limit - result))
4888 1.1 mrg {
4889 1.1 mrg buff = _cpp_get_buff (pfile, len);
4890 1.1 mrg buff->next = pfile->u_buff;
4891 1.1 mrg pfile->u_buff = buff;
4892 1.1 mrg result = buff->cur;
4893 1.1 mrg }
4894 1.1 mrg
4895 1.1 mrg buff->cur = result + len;
4896 1.1 mrg return result;
4897 1.1 mrg }
4898 1.1 mrg
4899 1.1 mrg /* Allocate permanent, unaligned storage of length LEN from a_buff.
4900 1.1 mrg That buffer is used for growing allocations when saving macro
4901 1.1 mrg replacement lists in a #define, and when parsing an answer to an
4902 1.1 mrg assertion in #assert, #unassert or #if (and therefore possibly
4903 1.1 mrg whilst expanding macros). It therefore must not be used by any
4904 1.1 mrg code that they might call: specifically the lexer and the guts of
4905 1.1 mrg the macro expander.
4906 1.1 mrg
4907 1.1 mrg All existing other uses clearly fit this restriction: storing
4908 1.1 mrg registered pragmas during initialization. */
4909 1.1 mrg unsigned char *
4910 1.1 mrg _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4911 1.1 mrg {
4912 1.1 mrg _cpp_buff *buff = pfile->a_buff;
4913 1.1 mrg unsigned char *result = buff->cur;
4914 1.1 mrg
4915 1.1 mrg if (len > (size_t) (buff->limit - result))
4916 1.1 mrg {
4917 1.1 mrg buff = _cpp_get_buff (pfile, len);
4918 1.1 mrg buff->next = pfile->a_buff;
4919 1.1 mrg pfile->a_buff = buff;
4920 1.1 mrg result = buff->cur;
4921 1.1 mrg }
4922 1.1 mrg
4923 1.1 mrg buff->cur = result + len;
4924 1.1 mrg return result;
4925 1.1 mrg }
4926 1.1 mrg
4927 1.1 mrg /* Commit or allocate storage from a buffer. */
4928 1.1 mrg
4929 1.1 mrg void *
4930 1.1 mrg _cpp_commit_buff (cpp_reader *pfile, size_t size)
4931 1.1 mrg {
4932 1.1 mrg void *ptr = BUFF_FRONT (pfile->a_buff);
4933 1.1 mrg
4934 1.1 mrg if (pfile->hash_table->alloc_subobject)
4935 1.1 mrg {
4936 1.1 mrg void *copy = pfile->hash_table->alloc_subobject (size);
4937 1.1 mrg memcpy (copy, ptr, size);
4938 1.1 mrg ptr = copy;
4939 1.1 mrg }
4940 1.1 mrg else
4941 1.1 mrg BUFF_FRONT (pfile->a_buff) += size;
4942 1.1 mrg
4943 1.1 mrg return ptr;
4944 1.1 mrg }
4945 1.1 mrg
4946 1.1 mrg /* Say which field of TOK is in use. */
4947 1.1 mrg
4948 1.1 mrg enum cpp_token_fld_kind
4949 1.1 mrg cpp_token_val_index (const cpp_token *tok)
4950 1.1 mrg {
4951 1.1 mrg switch (TOKEN_SPELL (tok))
4952 1.1 mrg {
4953 1.1 mrg case SPELL_IDENT:
4954 1.1 mrg return CPP_TOKEN_FLD_NODE;
4955 1.1 mrg case SPELL_LITERAL:
4956 1.1 mrg return CPP_TOKEN_FLD_STR;
4957 1.1 mrg case SPELL_OPERATOR:
4958 1.1 mrg /* Operands which were originally spelled as ident keep around
4959 1.1 mrg the node for the exact spelling. */
4960 1.1 mrg if (tok->flags & NAMED_OP)
4961 1.1 mrg return CPP_TOKEN_FLD_NODE;
4962 1.1 mrg else if (tok->type == CPP_PASTE)
4963 1.1 mrg return CPP_TOKEN_FLD_TOKEN_NO;
4964 1.1 mrg else
4965 1.1 mrg return CPP_TOKEN_FLD_NONE;
4966 1.1 mrg case SPELL_NONE:
4967 1.1 mrg if (tok->type == CPP_MACRO_ARG)
4968 1.1 mrg return CPP_TOKEN_FLD_ARG_NO;
4969 1.1 mrg else if (tok->type == CPP_PADDING)
4970 1.1 mrg return CPP_TOKEN_FLD_SOURCE;
4971 1.1 mrg else if (tok->type == CPP_PRAGMA)
4972 1.1 mrg return CPP_TOKEN_FLD_PRAGMA;
4973 1.1 mrg /* fall through */
4974 1.1 mrg default:
4975 1.1 mrg return CPP_TOKEN_FLD_NONE;
4976 1.1 mrg }
4977 1.1 mrg }
4978 1.1 mrg
4979 1.1 mrg /* All tokens lexed in R after calling this function will be forced to
4980 1.1 mrg have their location_t to be P, until
4981 1.1 mrg cpp_stop_forcing_token_locations is called for R. */
4982 1.1 mrg
4983 1.1 mrg void
4984 1.1 mrg cpp_force_token_locations (cpp_reader *r, location_t loc)
4985 1.1 mrg {
4986 1.1 mrg r->forced_token_location = loc;
4987 1.1 mrg }
4988 1.1 mrg
4989 1.1 mrg /* Go back to assigning locations naturally for lexed tokens. */
4990 1.1 mrg
4991 1.1 mrg void
4992 1.1 mrg cpp_stop_forcing_token_locations (cpp_reader *r)
4993 1.1 mrg {
4994 1.1 mrg r->forced_token_location = 0;
4995 1.1 mrg }
4996 1.1 mrg
/* PEEK points at a backslash.  If that backslash escapes an end of
   line (\n, \r or \r\n), step past the whole sequence, and keep doing
   so for as long as the character reached is itself a backslash.  The
   pointer is never moved to LIMIT or beyond: when the character after
   an escaped end of line would be at or past LIMIT, the position of
   the backslash is returned instead.  A backslash that does not escape
   an end of line is returned unchanged.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A \r may be followed by a \n that belongs to the same
           line ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;
      if (*peek != '\\')
        return peek;

      /* The user might be perverse: go round again for the next
         backslash.  */
    }
}
5026 1.1 mrg
5027 1.1 mrg static const unsigned char *
5028 1.1 mrg do_peek_next (const unsigned char *peek, const unsigned char *limit)
5029 1.1 mrg {
5030 1.1 mrg if (__builtin_expect (*peek == '\\', false))
5031 1.1 mrg peek = do_peek_backslash (peek, limit);
5032 1.1 mrg return peek;
5033 1.1 mrg }
5034 1.1 mrg
/* Step one character backwards from PEEK without going below BOUND.

   Returns NULL when PEEK is already at BOUND.  Otherwise the character
   before PEEK is examined.  If it is not a line ending, a pointer to
   it is returned.  If it is a line ending (\n, \r, or the \n of a
   \r\n pair) that is immediately preceded by a backslash, the whole
   <backslash><line ending> sequence is skipped and the search
   continues backwards from the backslash, so the result may be NULL
   if the backslash sits at BOUND.  A line ending that is not preceded
   by a backslash, or whose preceding characters would lie below
   BOUND, is returned as is.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Compare against the carriage-return character, not the letter r:
     an ordinary r preceded by a backslash is not an escaped newline.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset, relative to PEEK, of the candidate backslash.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* \r\n pair: the backslash, if any, is one further back.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
5063 1.1 mrg
5064 1.1 mrg /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
5065 1.1 mrg space. Otherwise return NULL. */
5066 1.1 mrg
5067 1.1 mrg static const unsigned char *
5068 1.1 mrg do_peek_ident (const char *match, const unsigned char *peek,
5069 1.1 mrg const unsigned char *limit)
5070 1.1 mrg {
5071 1.1 mrg for (; *++match; peek++)
5072 1.1 mrg if (*peek != *match)
5073 1.1 mrg {
5074 1.1 mrg peek = do_peek_next (peek, limit);
5075 1.1 mrg if (*peek != *match)
5076 1.1 mrg return NULL;
5077 1.1 mrg }
5078 1.1 mrg
5079 1.1 mrg /* Must now not be looking at an identifier char. */
5080 1.1 mrg peek = do_peek_next (peek, limit);
5081 1.1 mrg if (ISIDNUM (*peek))
5082 1.1 mrg return NULL;
5083 1.1 mrg
5084 1.1 mrg /* Skip control-line whitespace. */
5085 1.1 mrg ws:
5086 1.1 mrg while (*peek == ' ' || *peek == '\t')
5087 1.1 mrg peek++;
5088 1.1 mrg if (__builtin_expect (*peek == '\\', false))
5089 1.1 mrg {
5090 1.1 mrg peek = do_peek_backslash (peek, limit);
5091 1.1 mrg if (*peek != '\\')
5092 1.1 mrg goto ws;
5093 1.1 mrg }
5094 1.1 mrg
5095 1.1 mrg return peek;
5096 1.1 mrg }
5097 1.1 mrg
5098 1.1 mrg /* Are we looking at a module control line starting as PEEK - 1? */
5099 1.1 mrg
5100 1.1 mrg static bool
5101 1.1 mrg do_peek_module (cpp_reader *pfile, unsigned char c,
5102 1.1 mrg const unsigned char *peek, const unsigned char *limit)
5103 1.1 mrg {
5104 1.1 mrg bool import = false;
5105 1.1 mrg
5106 1.1 mrg if (__builtin_expect (c == 'e', false))
5107 1.1 mrg {
5108 1.1 mrg if (!((peek[0] == 'x' || peek[0] == '\\')
5109 1.1 mrg && (peek = do_peek_ident ("export", peek, limit))))
5110 1.1 mrg return false;
5111 1.1 mrg
5112 1.1 mrg /* export, peek for import or module. No need to peek __import
5113 1.1 mrg here. */
5114 1.1 mrg if (peek[0] == 'i')
5115 1.1 mrg {
5116 1.1 mrg if (!((peek[1] == 'm' || peek[1] == '\\')
5117 1.1 mrg && (peek = do_peek_ident ("import", peek + 1, limit))))
5118 1.1 mrg return false;
5119 1.1 mrg import = true;
5120 1.1 mrg }
5121 1.1 mrg else if (peek[0] == 'm')
5122 1.1 mrg {
5123 1.1 mrg if (!((peek[1] == 'o' || peek[1] == '\\')
5124 1.1 mrg && (peek = do_peek_ident ("module", peek + 1, limit))))
5125 1.1 mrg return false;
5126 1.1 mrg }
5127 1.1 mrg else
5128 1.1 mrg return false;
5129 1.1 mrg }
5130 1.1 mrg else if (__builtin_expect (c == 'i', false))
5131 1.1 mrg {
5132 1.1 mrg if (!((peek[0] == 'm' || peek[0] == '\\')
5133 1.1 mrg && (peek = do_peek_ident ("import", peek, limit))))
5134 1.1 mrg return false;
5135 1.1 mrg import = true;
5136 1.1 mrg }
5137 1.1 mrg else if (__builtin_expect (c == '_', false))
5138 1.1 mrg {
5139 1.1 mrg /* Needed for translated includes. */
5140 1.1 mrg if (!((peek[0] == '_' || peek[0] == '\\')
5141 1.1 mrg && (peek = do_peek_ident ("__import", peek, limit))))
5142 1.1 mrg return false;
5143 1.1 mrg import = true;
5144 1.1 mrg }
5145 1.1 mrg else if (__builtin_expect (c == 'm', false))
5146 1.1 mrg {
5147 1.1 mrg if (!((peek[0] == 'o' || peek[0] == '\\')
5148 1.1 mrg && (peek = do_peek_ident ("module", peek, limit))))
5149 1.1 mrg return false;
5150 1.1 mrg }
5151 1.1 mrg else
5152 1.1 mrg return false;
5153 1.1 mrg
5154 1.1 mrg /* Peek the next character to see if it's good enough. We'll be at
5155 1.1 mrg the first non-whitespace char, including skipping an escaped
5156 1.1 mrg newline. */
5157 1.1 mrg /* ... import followed by identifier, ':', '<' or header-name
5158 1.1 mrg preprocessing tokens, or module followed by identifier, ':' or
5159 1.1 mrg ';' preprocessing tokens. */
5160 1.1 mrg unsigned char p = *peek++;
5161 1.1 mrg
5162 1.1 mrg /* A character literal is ... single quotes, ... optionally preceded
5163 1.1 mrg by u8, u, U, or L */
5164 1.1 mrg /* A string-literal is a ... double quotes, optionally prefixed by
5165 1.1 mrg R, u8, u8R, u, uR, U, UR, L, or LR */
5166 1.1 mrg if (p == 'u')
5167 1.1 mrg {
5168 1.1 mrg peek = do_peek_next (peek, limit);
5169 1.1 mrg if (*peek == '8')
5170 1.1 mrg {
5171 1.1 mrg peek++;
5172 1.1 mrg goto peek_u8;
5173 1.1 mrg }
5174 1.1 mrg goto peek_u;
5175 1.1 mrg }
5176 1.1 mrg else if (p == 'U' || p == 'L')
5177 1.1 mrg {
5178 1.1 mrg peek_u8:
5179 1.1 mrg peek = do_peek_next (peek, limit);
5180 1.1 mrg peek_u:
5181 1.1 mrg if (*peek == '\"' || *peek == '\'')
5182 1.1 mrg return false;
5183 1.1 mrg
5184 1.1 mrg if (*peek == 'R')
5185 1.1 mrg goto peek_R;
5186 1.1 mrg /* Identifier. Ok. */
5187 1.1 mrg }
5188 1.1 mrg else if (p == 'R')
5189 1.1 mrg {
5190 1.1 mrg peek_R:
5191 1.1 mrg if (CPP_OPTION (pfile, rliterals))
5192 1.1 mrg {
5193 1.1 mrg peek = do_peek_next (peek, limit);
5194 1.1 mrg if (*peek == '\"')
5195 1.1 mrg return false;
5196 1.1 mrg }
5197 1.1 mrg /* Identifier. Ok. */
5198 1.1 mrg }
5199 1.1 mrg else if ('Z' - 'A' == 25
5200 1.1 mrg ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5201 1.1 mrg : ISIDST (p))
5202 1.1 mrg {
5203 1.1 mrg /* Identifier. Ok. */
5204 1.1 mrg }
5205 1.1 mrg else if (p == '<')
5206 1.1 mrg {
5207 1.1 mrg /* Maybe angle header, ok for import. Reject
5208 1.1 mrg '<=', '<<' digraph:'<:'. */
5209 1.1 mrg if (!import)
5210 1.1 mrg return false;
5211 1.1 mrg peek = do_peek_next (peek, limit);
5212 1.1 mrg if (*peek == '=' || *peek == '<'
5213 1.1 mrg || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5214 1.1 mrg return false;
5215 1.1 mrg }
5216 1.1 mrg else if (p == ';')
5217 1.1 mrg {
5218 1.1 mrg /* SEMICOLON, ok for module. */
5219 1.1 mrg if (import)
5220 1.1 mrg return false;
5221 1.1 mrg }
5222 1.1 mrg else if (p == '"')
5223 1.1 mrg {
5224 1.1 mrg /* STRING, ok for import. */
5225 1.1 mrg if (!import)
5226 1.1 mrg return false;
5227 1.1 mrg }
5228 1.1 mrg else if (p == ':')
5229 1.1 mrg {
5230 1.1 mrg /* Maybe COLON, ok. Reject '::', digraph:':>'. */
5231 1.1 mrg peek = do_peek_next (peek, limit);
5232 1.1 mrg if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5233 1.1 mrg return false;
5234 1.1 mrg }
5235 1.1 mrg else
5236 1.1 mrg /* FIXME: Detect a unicode character, excluding those not
5237 1.1 mrg permitted as the initial character. [lex.name]/1. I presume
5238 1.1 mrg we need to check the \[uU] spellings, and directly using
5239 1.1 mrg Unicode in say UTF8 form? Or perhaps we do the phase-1
5240 1.1 mrg conversion of UTF8 to universal-character-names? */
5241 1.1 mrg return false;
5242 1.1 mrg
5243 1.1 mrg return true;
5244 1.1 mrg }
5245 1.1 mrg
5246 1.1 mrg /* Directives-only scanning. Somewhat more relaxed than correct
5247 1.1 mrg parsing -- some ill-formed programs will not be rejected. */
/* PFILE is the reader whose current buffer is scanned; the outer loop
   pops one buffer per completed pass and runs until pfile->buffer is
   null.  DATA is handed unchanged to every CB invocation.  CB is
   invoked with one of three tasks:
     CPP_DO_print, followed by a line count, a start pointer and a
       byte length, for the text scanned since the last restart;
     CPP_DO_location, followed by pfile->line_table->highest_line,
       after a directive or module control line has been handled and
       input remains in the buffer;
     CPP_DO_token, followed by a token pointer and its spelling
       location, for each token of a module control line.
   Module control lines are only looked for when the module_directives
   option is set.  */
5248 1.1 mrg
5249 1.1 mrg void
5250 1.1 mrg cpp_directive_only_process (cpp_reader *pfile,
5251 1.1 mrg void *data,
5252 1.1 mrg void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5253 1.1 mrg {
5254 1.1 mrg bool module_p = CPP_OPTION (pfile, module_directives);
5255 1.1 mrg
5256 1.1 mrg do
5257 1.1 mrg {
5258 1.1 mrg restart:
/* pfile->buffer is re-read on every restart; presumably directive
   handling may have changed the current buffer (TODO confirm).  */
5259 1.1 mrg /* Buffer initialization, but no line cleaning. */
5260 1.1 mrg cpp_buffer *buffer = pfile->buffer;
5261 1.1 mrg buffer->cur_note = buffer->notes_used = 0;
5262 1.1 mrg buffer->cur = buffer->line_base = buffer->next_line;
5263 1.1 mrg buffer->need_line = false;
5264 1.1 mrg /* Files always end in a newline or carriage return. We rely on this for
5265 1.1 mrg character peeking safety. */
5266 1.1 mrg gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5267 1.1 mrg
/* BASE is where the pending CPP_DO_print range starts, LINE_COUNT is
   the number of line breaks seen since BASE, and LINE_START is the
   start of the current physical line.  */
5268 1.1 mrg const unsigned char *base = buffer->cur;
5269 1.1 mrg unsigned line_count = 0;
5270 1.1 mrg const unsigned char *line_start = base;
5271 1.1 mrg
/* BOL is set here and at each unescaped line break, and cleared at
   dflt.  RAW is set when a double quote is found to follow an R
   prefix, and makes the literal scanner parse a raw string.  */
5272 1.1 mrg bool bol = true;
5273 1.1 mrg bool raw = false;
5274 1.1 mrg
/* LWM bounds the backward scans done with do_peek_prev below.  It is
   moved to POS after each comment and to POS - 1 after each literal.  */
5275 1.1 mrg const unsigned char *lwm = base;
5276 1.1 mrg for (const unsigned char *pos = base, *limit = buffer->rlimit;
5277 1.1 mrg pos < limit;)
5278 1.1 mrg {
5279 1.1 mrg unsigned char c = *pos++;
5280 1.1 mrg /* This matches the switch in _cpp_lex_direct. */
5281 1.1 mrg switch (c)
5282 1.1 mrg {
5283 1.1 mrg case ' ': case '\t': case '\f': case '\v':
5284 1.1 mrg /* Whitespace, do nothing. */
5285 1.1 mrg break;
5286 1.1 mrg
5287 1.1 mrg case '\r': /* MAC line ending, or Windows \r\n */
5288 1.1 mrg if (*pos == '\n')
5289 1.1 mrg pos++;
5290 1.1 mrg /* FALLTHROUGH */
5291 1.1 mrg
5292 1.1 mrg case '\n':
5293 1.1 mrg bol = true;
5294 1.1 mrg
5295 1.1 mrg next_line:
5296 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
5297 1.1 mrg line_count++;
5298 1.1 mrg line_start = pos;
5299 1.1 mrg break;
5300 1.1 mrg
5301 1.1 mrg case '\\':
5302 1.1 mrg /* <backslash><newline> is removed, and doesn't undo any
5303 1.1 mrg preceding escape or whatnot. */
5304 1.1 mrg if (*pos == '\n')
5305 1.1 mrg {
5306 1.1 mrg pos++;
5307 1.1 mrg goto next_line;
5308 1.1 mrg }
5309 1.1 mrg else if (*pos == '\r')
5310 1.1 mrg {
5311 1.1 mrg if (pos[1] == '\n')
5312 1.1 mrg pos++;
5313 1.1 mrg pos++;
5314 1.1 mrg goto next_line;
5315 1.1 mrg }
5316 1.1 mrg goto dflt;
5317 1.1 mrg
5318 1.1 mrg case '#':
5319 1.1 mrg if (bol)
5320 1.1 mrg {
5321 1.1 mrg /* Line directive. */
5322 1.1 mrg if (pos - 1 > base && !pfile->state.skipping)
5323 1.1 mrg cb (pfile, CPP_DO_print, data,
5324 1.1 mrg line_count, base, pos - 1 - base);
5325 1.1 mrg
5326 1.1 mrg /* Prep things for directive handling. */
5327 1.1 mrg buffer->next_line = pos;
5328 1.1 mrg buffer->need_line = true;
5329 1.1 mrg bool ok = _cpp_get_fresh_line (pfile);
5330 1.1 mrg gcc_checking_assert (ok);
5331 1.1 mrg
5332 1.1 mrg /* Ensure proper column numbering for generated
5333 1.1 mrg error messages. */
5334 1.1 mrg buffer->line_base -= pos - line_start;
5335 1.1 mrg
5336 1.1 mrg _cpp_handle_directive (pfile, line_start + 1 != pos);
5337 1.1 mrg
5338 1.1 mrg /* Sanitize the line settings. Duplicate #include's can
5339 1.1 mrg mess things up. */
5340 1.1 mrg // FIXME: Necessary?
5341 1.1 mrg pfile->line_table->highest_location
5342 1.1 mrg = pfile->line_table->highest_line;
5343 1.1 mrg
5344 1.1 mrg if (!pfile->state.skipping
5345 1.1 mrg && pfile->buffer->next_line < pfile->buffer->rlimit)
5346 1.1 mrg cb (pfile, CPP_DO_location, data,
5347 1.1 mrg pfile->line_table->highest_line);
5348 1.1 mrg
5349 1.1 mrg goto restart;
5350 1.1 mrg }
5351 1.1 mrg goto dflt;
5352 1.1 mrg
5353 1.1 mrg case '/':
5354 1.1 mrg {
5355 1.1 mrg const unsigned char *peek = do_peek_next (pos, limit);
5356 1.1 mrg if (!(*peek == '/' || *peek == '*'))
5357 1.1 mrg goto dflt;
5358 1.1 mrg
5359 1.1 mrg /* Line or block comment */
5360 1.1 mrg bool is_block = *peek == '*';
5361 1.1 mrg bool star = false;
5362 1.1 mrg bool esc = false;
5363 1.1 mrg location_t sloc
5364 1.1 mrg = linemap_position_for_column (pfile->line_table,
5365 1.1 mrg pos - line_start);
5366 1.1 mrg
5367 1.1 mrg while (pos < limit)
5368 1.1 mrg {
5369 1.1 mrg char c = *pos++;
5370 1.1 mrg switch (c)
5371 1.1 mrg {
5372 1.1 mrg case '\\':
5373 1.1 mrg esc = true;
5374 1.1 mrg break;
5375 1.1 mrg
5376 1.1 mrg case '\r':
5377 1.1 mrg if (*pos == '\n')
5378 1.1 mrg pos++;
5379 1.1 mrg /* FALLTHROUGH */
5380 1.1 mrg
5381 1.1 mrg case '\n':
5382 1.1 mrg {
5383 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
5384 1.1 mrg line_count++;
5385 1.1 mrg line_start = pos;
5386 1.1 mrg if (!esc && !is_block)
5387 1.1 mrg {
5388 1.1 mrg bol = true;
5389 1.1 mrg goto done_comment;
5390 1.1 mrg }
5391 1.1 mrg }
5392 1.1 mrg if (!esc)
5393 1.1 mrg star = false;
5394 1.1 mrg esc = false;
5395 1.1 mrg break;
5396 1.1 mrg
5397 1.1 mrg case '*':
5398 1.1 mrg if (pos > peek)
5399 1.1 mrg star = is_block;
5400 1.1 mrg esc = false;
5401 1.1 mrg break;
5402 1.1 mrg
5403 1.1 mrg case '/':
5404 1.1 mrg if (star)
5405 1.1 mrg goto done_comment;
5406 1.1 mrg /* FALLTHROUGH */
5407 1.1 mrg
5408 1.1 mrg default:
5409 1.1 mrg star = false;
5410 1.1 mrg esc = false;
5411 1.1 mrg break;
5412 1.1 mrg }
5413 1.1 mrg }
5414 1.1 mrg if (pos < limit || is_block)
5415 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5416 1.1 mrg "unterminated comment");
5417 1.1 mrg done_comment:
5418 1.1 mrg lwm = pos;
5419 1.1 mrg break;
5420 1.1 mrg }
5421 1.1 mrg
5422 1.1 mrg case '\'':
5423 1.1 mrg if (!CPP_OPTION (pfile, digit_separators))
5424 1.1 mrg goto delimited_string;
5425 1.1 mrg
5426 1.1 mrg /* Possibly a number punctuator. */
5427 1.1 mrg if (!ISIDNUM (*do_peek_next (pos, limit)))
5428 1.1 mrg goto delimited_string;
5429 1.1 mrg
5430 1.1 mrg goto quote_peek;
5431 1.1 mrg
5432 1.1 mrg case '\"':
5433 1.1 mrg if (!CPP_OPTION (pfile, rliterals))
5434 1.1 mrg goto delimited_string;
5435 1.1 mrg
5436 1.1 mrg quote_peek:
5437 1.1 mrg {
5438 1.1 mrg /* For ' see if it's a number punctuator
5439 1.1 mrg \.?<digit>(<digit>|<identifier-nondigit>
5440 1.1 mrg |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5441 1.1 mrg /* For " see if it's a raw string
5442 1.1 mrg {U,L,u,u8}R. This includes CPP_NUMBER detection,
5443 1.1 mrg because that could be 0e+R. */
5444 1.1 mrg const unsigned char *peek = pos - 1;
5445 1.1 mrg bool quote_first = c == '"';
5446 1.1 mrg bool quote_eight = false;
5447 1.1 mrg bool maybe_number_start = false;
5448 1.1 mrg bool want_number = false;
5449 1.1 mrg
5450 1.1 mrg while ((peek = do_peek_prev (peek, lwm)))
5451 1.1 mrg {
5452 1.1 mrg unsigned char p = *peek;
5453 1.1 mrg if (quote_first)
5454 1.1 mrg {
5455 1.1 mrg if (!raw)
5456 1.1 mrg {
5457 1.1 mrg if (p != 'R')
5458 1.1 mrg break;
5459 1.1 mrg raw = true;
5460 1.1 mrg continue;
5461 1.1 mrg }
5462 1.1 mrg
5463 1.1 mrg quote_first = false;
5464 1.1 mrg if (p == 'L' || p == 'U' || p == 'u')
5465 1.1 mrg ;
5466 1.1 mrg else if (p == '8')
5467 1.1 mrg quote_eight = true;
5468 1.1 mrg else
5469 1.1 mrg goto second_raw;
5470 1.1 mrg }
5471 1.1 mrg else if (quote_eight)
5472 1.1 mrg {
5473 1.1 mrg if (p != 'u')
5474 1.1 mrg {
5475 1.1 mrg raw = false;
5476 1.1 mrg break;
5477 1.1 mrg }
5478 1.1 mrg quote_eight = false;
5479 1.1 mrg }
5480 1.1 mrg else if (c == '"')
5481 1.1 mrg {
5482 1.1 mrg second_raw:;
5483 1.1 mrg if (!want_number && ISIDNUM (p))
5484 1.1 mrg {
5485 1.1 mrg raw = false;
5486 1.1 mrg break;
5487 1.1 mrg }
5488 1.1 mrg }
5489 1.1 mrg
5490 1.1 mrg if (ISDIGIT (p))
5491 1.1 mrg maybe_number_start = true;
5492 1.1 mrg else if (p == '.')
5493 1.1 mrg want_number = true;
5494 1.1 mrg else if (ISIDNUM (p))
5495 1.1 mrg maybe_number_start = false;
5496 1.1 mrg else if (p == '+' || p == '-')
5497 1.1 mrg {
5498 1.1 mrg if (const unsigned char *peek_prev
5499 1.1 mrg = do_peek_prev (peek, lwm))
5500 1.1 mrg {
5501 1.1 mrg p = *peek_prev;
5502 1.1 mrg if (p == 'e' || p == 'E'
5503 1.1 mrg || p == 'p' || p == 'P')
5504 1.1 mrg {
5505 1.1 mrg want_number = true;
5506 1.1 mrg maybe_number_start = false;
5507 1.1 mrg }
5508 1.1 mrg else
5509 1.1 mrg break;
5510 1.1 mrg }
5511 1.1 mrg else
5512 1.1 mrg break;
5513 1.1 mrg }
5514 1.1 mrg else if (p == '\'' || p == '\"')
5515 1.1 mrg {
5516 1.1 mrg /* If this is lwm, this must be the end of a
5517 1.1 mrg previous string. So this is a trailing
5518 1.1 mrg literal type, (a) if those are allowed,
5519 1.1 mrg and (b) maybe_start is false. Otherwise
5520 1.1 mrg this must be a CPP_NUMBER because we've
5521 1.1 mrg met another ', and we'd have checked that
5522 1.1 mrg in its own right. */
5523 1.1 mrg if (peek == lwm && CPP_OPTION (pfile, uliterals))
5524 1.1 mrg {
5525 1.1 mrg if (!maybe_number_start && !want_number)
5526 1.1 mrg /* Must be a literal type. */
5527 1.1 mrg raw = false;
5528 1.1 mrg }
5529 1.1 mrg else if (p == '\''
5530 1.1 mrg && CPP_OPTION (pfile, digit_separators))
5531 1.1 mrg maybe_number_start = true;
5532 1.1 mrg break;
5533 1.1 mrg }
5534 1.1 mrg else if (c == '\'')
5535 1.1 mrg break;
5536 1.1 mrg else if (!quote_first && !quote_eight)
5537 1.1 mrg break;
5538 1.1 mrg }
5539 1.1 mrg
5540 1.1 mrg if (maybe_number_start)
5541 1.1 mrg {
5542 1.1 mrg if (c == '\'')
5543 1.1 mrg /* A CPP NUMBER. */
5544 1.1 mrg goto dflt;
5545 1.1 mrg raw = false;
5546 1.1 mrg }
5547 1.1 mrg
5548 1.1 mrg goto delimited_string;
5549 1.1 mrg }
5550 1.1 mrg
5551 1.1 mrg delimited_string:
5552 1.1 mrg {
5553 1.1 mrg /* (Possibly raw) string or char literal. */
5554 1.1 mrg unsigned char end = c;
5555 1.1 mrg int delim_len = -1;
5556 1.1 mrg const unsigned char *delim = NULL;
5557 1.1 mrg location_t sloc = linemap_position_for_column (pfile->line_table,
5558 1.1 mrg pos - line_start);
5559 1.1 mrg int esc = 0;
5560 1.1 mrg
5561 1.1 mrg if (raw)
5562 1.1 mrg {
5563 1.1 mrg /* There can be no line breaks in the delimiter. */
5564 1.1 mrg delim = pos;
5565 1.1 mrg for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5566 1.1 mrg {
5567 1.1 mrg if (delim_len == 16)
5568 1.1 mrg {
5569 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR,
5570 1.1 mrg sloc, 0,
5571 1.1 mrg "raw string delimiter"
5572 1.1 mrg " longer than %d"
5573 1.1 mrg " characters",
5574 1.1 mrg delim_len);
5575 1.1 mrg raw = false;
5576 1.1 mrg pos = delim;
5577 1.1 mrg break;
5578 1.1 mrg }
5579 1.1 mrg if (strchr (") \\\t\v\f\n", c))
5580 1.1 mrg {
5581 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR,
5582 1.1 mrg sloc, 0,
5583 1.1 mrg "invalid character '%c'"
5584 1.1 mrg " in raw string"
5585 1.1 mrg " delimiter", c);
5586 1.1 mrg raw = false;
5587 1.1 mrg pos = delim;
5588 1.1 mrg break;
5589 1.1 mrg }
5590 1.1 mrg if (pos >= limit)
5591 1.1 mrg goto bad_string;
5592 1.1 mrg }
5593 1.1 mrg }
5594 1.1 mrg
5595 1.1 mrg while (pos < limit)
5596 1.1 mrg {
5597 1.1 mrg char c = *pos++;
5598 1.1 mrg switch (c)
5599 1.1 mrg {
5600 1.1 mrg case '\\':
5601 1.1 mrg if (!raw)
5602 1.1 mrg esc++;
5603 1.1 mrg break;
5604 1.1 mrg
5605 1.1 mrg case '\r':
5606 1.1 mrg if (*pos == '\n')
5607 1.1 mrg pos++;
5608 1.1 mrg /* FALLTHROUGH */
5609 1.1 mrg
5610 1.1 mrg case '\n':
5611 1.1 mrg {
5612 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
5613 1.1 mrg line_count++;
5614 1.1 mrg line_start = pos;
5615 1.1 mrg }
5616 1.1 mrg if (esc)
5617 1.1 mrg esc--;
5618 1.1 mrg break;
5619 1.1 mrg
5620 1.1 mrg case ')':
5621 1.1 mrg if (raw
5622 1.1 mrg && pos + delim_len + 1 < limit
5623 1.1 mrg && pos[delim_len] == end
5624 1.1 mrg && !memcmp (delim, pos, delim_len))
5625 1.1 mrg {
5626 1.1 mrg pos += delim_len + 1;
5627 1.1 mrg raw = false;
5628 1.1 mrg goto done_string;
5629 1.1 mrg }
5630 1.1 mrg break;
5631 1.1 mrg
5632 1.1 mrg default:
5633 1.1 mrg if (!raw && !(esc & 1) && c == end)
5634 1.1 mrg goto done_string;
5635 1.1 mrg esc = 0;
5636 1.1 mrg break;
5637 1.1 mrg }
5638 1.1 mrg }
5639 1.1 mrg bad_string:
5640 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5641 1.1 mrg "unterminated literal");
5642 1.1 mrg
5643 1.1 mrg done_string:
5644 1.1 mrg raw = false;
5645 1.1 mrg lwm = pos - 1;
5646 1.1 mrg }
5647 1.1 mrg goto dflt;
5648 1.1 mrg
5649 1.1 mrg case '_':
5650 1.1 mrg case 'e':
5651 1.1 mrg case 'i':
5652 1.1 mrg case 'm':
5653 1.1 mrg if (bol && module_p && !pfile->state.skipping
5654 1.1 mrg && do_peek_module (pfile, c, pos, limit))
5655 1.1 mrg {
5656 1.1 mrg /* We've seen the start of a module control line.
5657 1.1 mrg Start up the tokenizer. */
5658 1.1 mrg pos--; /* Backup over the first character. */
5659 1.1 mrg
5660 1.1 mrg /* Backup over whitespace to start of line. */
5661 1.1 mrg while (pos > line_start
5662 1.1 mrg && (pos[-1] == ' ' || pos[-1] == '\t'))
5663 1.1 mrg pos--;
5664 1.1 mrg
5665 1.1 mrg if (pos > base)
5666 1.1 mrg cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5667 1.1 mrg
5668 1.1 mrg /* Prep things for directive handling. */
5669 1.1 mrg buffer->next_line = pos;
5670 1.1 mrg buffer->need_line = true;
5671 1.1 mrg
5672 1.1 mrg /* Now get tokens until the PRAGMA_EOL. */
5673 1.1 mrg do
5674 1.1 mrg {
5675 1.1 mrg location_t spelling;
5676 1.1 mrg const cpp_token *tok
5677 1.1 mrg = cpp_get_token_with_location (pfile, &spelling);
5678 1.1 mrg
5679 1.1 mrg gcc_assert (pfile->state.in_deferred_pragma
5680 1.1 mrg || tok->type == CPP_PRAGMA_EOL);
5681 1.1 mrg cb (pfile, CPP_DO_token, data, tok, spelling);
5682 1.1 mrg }
5683 1.1 mrg while (pfile->state.in_deferred_pragma);
5684 1.1 mrg
5685 1.1 mrg if (pfile->buffer->next_line < pfile->buffer->rlimit)
5686 1.1 mrg cb (pfile, CPP_DO_location, data,
5687 1.1 mrg pfile->line_table->highest_line);
5688 1.1 mrg
5689 1.1 mrg pfile->mi_valid = false;
5690 1.1 mrg goto restart;
5691 1.1 mrg }
5692 1.1 mrg goto dflt;
5693 1.1 mrg
5694 1.1 mrg default:
5695 1.1 mrg dflt:
5696 1.1 mrg bol = false;
5697 1.1 mrg pfile->mi_valid = false;
5698 1.1 mrg break;
5699 1.1 mrg }
5700 1.1 mrg }
5701 1.1 mrg
/* End of buffer reached: hand any text remaining since BASE to CB,
   unless we are skipping.  */
5702 1.1 mrg if (buffer->rlimit > base && !pfile->state.skipping)
5703 1.1 mrg {
5704 1.1 mrg const unsigned char *limit = buffer->rlimit;
5705 1.1 mrg /* If the file was not newline terminated, add rlimit, which is
5706 1.1 mrg guaranteed to point to a newline, to the end of our range. */
5707 1.1 mrg if (limit[-1] != '\n')
5708 1.1 mrg {
5709 1.1 mrg limit++;
5710 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
5711 1.1 mrg line_count++;
5712 1.1 mrg }
5713 1.1 mrg cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5714 1.1 mrg }
5715 1.1 mrg
5716 1.1 mrg _cpp_pop_buffer (pfile);
5717 1.1 mrg }
5718 1.1 mrg while (pfile->buffer);
5719 }
5720