lex.cc revision 1.1.1.3 1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2024 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
26
/* Category tag stored in every token_spellings entry.  OP() table
   entries are always tagged SPELL_OPERATOR; TK() entries name their
   own category via the second macro argument (SPELL_ ## s).  */
27 enum spell_type
28 {
29 SPELL_OPERATOR = 0,
30 SPELL_IDENT,
31 SPELL_LITERAL,
32 SPELL_NONE
33 };
34
/* One row of the token_spellings table, indexed by token type.  */
35 struct token_spelling
36 {
/* Spelling category of this token type.  */
37 enum spell_type category;
/* Text associated with the token type: the operator string for OP()
   entries, the stringified enumerator name for TK() entries.  */
38 const unsigned char *name;
39 };
40
/* Spellings of the six digraph tokens, as unsigned-character strings.
   NOTE(review): the array order presumably has to match the index
   computed by the code that consults this table (not visible here) --
   confirm before reordering.  */
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
43
/* Expand TTYPE_TABLE into the token_spellings initializer.  OP(e, s)
   yields an operator entry whose name is the string S; TK(e, s) yields
   an entry of category SPELL_<s> whose name is the stringified
   enumerator E.  Both helper macros are undefined again afterwards.  */
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
49
/* Accessors for the table row selected by TOKEN's type: its spelling
   category and its name string respectively.  */
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52
53 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive. */
54 #define UCS_LIMIT 0x10FFFF
55
/* Forward declarations of file-local (static) helpers.  */
56 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
57 static int skip_line_comment (cpp_reader *);
58 static void skip_whitespace (cpp_reader *, cppchar_t);
59 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
60 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
61 static void store_comment (cpp_reader *, cpp_token *);
62 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
63 unsigned int, enum cpp_ttype);
64 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
65 static int name_p (cpp_reader *, const cpp_string *);
66 static tokenrun *next_tokenrun (tokenrun *);
67
68 static _cpp_buff *new_buff (size_t);
69
70
71 /* Utility routine:
72
73 Compares, the token TOKEN to the NUL-terminated string STRING.
74 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
75 int
76 cpp_ideq (const cpp_token *token, const char *string)
77 {
78 if (token->type != CPP_NAME)
79 return 0;
80
81 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
82 }
83
84 /* Record a note TYPE at byte POS into the current cleaned logical
85 line. */
86 static void
87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
88 {
89 if (buffer->notes_used == buffer->notes_cap)
90 {
91 buffer->notes_cap = buffer->notes_cap * 2 + 200;
92 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
93 buffer->notes_cap);
94 }
95
96 buffer->notes[buffer->notes_used].pos = pos;
97 buffer->notes[buffer->notes_used].type = type;
98 buffer->notes_used++;
99 }
100
101
102 /* Fast path to find line special characters using optimized character
104 scanning algorithms. Anything complicated falls back to the slow
105 path below. Since this loop is very hot it's worth doing these kinds
106 of optimizations.
107
108 One of the paths through the ifdefs should provide
109
110 const uchar *search_line_fast (const uchar *s, const uchar *end);
111
112 Between S and END, search for \n, \r, \\, ?. Return a pointer to
113 the found character.
114
115 Note that the last character of the buffer is *always* a newline,
116 as forced by _cpp_convert_input. This fact can be used to avoid
117 explicitly looking for the end of the buffer. */
118
119 /* Configure gives us an ifdef test. */
120 #ifndef WORDS_BIGENDIAN
121 #define WORDS_BIGENDIAN 0
122 #endif
123
124 /* We'd like the largest integer that fits into a register. There's nothing
125 in <stdint.h> that gives us that. For most hosts this is unsigned long,
126 but MS decided on an LLP64 model. Thankfully when building with GCC we
127 can get the "real" word size. */
/* With GCC, ask for an integer in the target's word mode.  */
128 #ifdef __GNUC__
129 typedef unsigned int word_type __attribute__((__mode__(__word__)));
130 #else
/* Without GCC extensions, fall back to unsigned long.  */
131 typedef unsigned long word_type;
132 #endif
133
134 /* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array
   bound below evaluates to 1 when sizeof (word_type) is 4 or 8, and
   to -1 (an invalid bound) for any other size. */
136 typedef char check_word_type_size
137 [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
138
139 /* Return X with the first N bytes forced to values that won't match one
140 of the interesting characters. Note that NUL is not interesting. */
141
142 static inline word_type
143 acc_char_mask_misalign (word_type val, unsigned int n)
144 {
145 word_type mask = -1;
146 if (WORDS_BIGENDIAN)
147 mask >>= n * 8;
148 else
149 mask <<= n * 8;
150 return val & mask;
151 }
152
153 /* Return X replicated to all byte positions within WORD_TYPE. */
154
155 static inline word_type
156 acc_char_replicate (uchar x)
157 {
158 word_type ret;
159
160 ret = (x << 24) | (x << 16) | (x << 8) | x;
161 if (sizeof(word_type) == 8)
162 ret = (ret << 16 << 16) | ret;
163 return ret;
164 }
165
166 /* Return non-zero if some byte of VAL is (probably) C. */
167
168 static inline word_type
169 acc_char_cmp (word_type val, word_type c)
170 {
171 #if defined(__GNUC__) && defined(__alpha__)
172 /* We can get exact results using a compare-bytes instruction.
173 Get (val == c) via (0 >= (val ^ c)). */
174 return __builtin_alpha_cmpbge (0, val ^ c);
175 #else
176 word_type magic = 0x7efefefeU;
177 if (sizeof(word_type) == 8)
178 magic = (magic << 16 << 16) | 0xfefefefeU;
179 magic |= 1;
180
181 val ^= c;
182 return ((val + magic) ^ ~val) & ~magic;
183 #endif
184 }
185
186 /* Given the result of acc_char_cmp is non-zero, return the index of
187 the found character. If this was a false positive, return -1. */
188
189 static inline int
190 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
191 word_type val ATTRIBUTE_UNUSED)
192 {
193 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
194 /* The cmpbge instruction sets *bits* of the result corresponding to
195 matches in the bytes with no false positives. */
196 return __builtin_ctzl (cmp);
197 #else
198 unsigned int i;
199
200 /* ??? It would be nice to force unrolling here,
201 and have all of these constants folded. */
202 for (i = 0; i < sizeof(word_type); ++i)
203 {
204 uchar c;
205 if (WORDS_BIGENDIAN)
206 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
207 else
208 c = (val >> i * 8) & 0xff;
209
210 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
211 return i;
212 }
213
214 return -1;
215 #endif
216 }
217
218 /* A version of the fast scanner using bit fiddling techniques.
219
220 For 32-bit words, one would normally perform 16 comparisons and
221 16 branches. With this algorithm one performs 24 arithmetic
222 operations and one branch. Whether this is faster with a 32-bit
223 word size is going to be somewhat system dependent.
224
225 For 64-bit words, we eliminate twice the number of comparisons
226 and branches without increasing the number of arithmetic operations.
227 It's almost certainly going to be a win with 64-bit word size. */
228
229 static const uchar * search_line_acc_char (const uchar *, const uchar *)
230 ATTRIBUTE_UNUSED;
231
232 static const uchar *
233 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
234 {
235 const word_type repl_nl = acc_char_replicate ('\n');
236 const word_type repl_cr = acc_char_replicate ('\r');
237 const word_type repl_bs = acc_char_replicate ('\\');
238 const word_type repl_qm = acc_char_replicate ('?');
239
240 unsigned int misalign;
241 const word_type *p;
242 word_type val, t;
243
244 /* Align the buffer. Mask out any bytes from before the beginning. */
245 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
246 val = *p;
247 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
248 if (misalign)
249 val = acc_char_mask_misalign (val, misalign);
250
251 /* Main loop. */
252 while (1)
253 {
254 t = acc_char_cmp (val, repl_nl);
255 t |= acc_char_cmp (val, repl_cr);
256 t |= acc_char_cmp (val, repl_bs);
257 t |= acc_char_cmp (val, repl_qm);
258
259 if (__builtin_expect (t != 0, 0))
260 {
261 int i = acc_char_index (t, val);
262 if (i >= 0)
263 return (const uchar *)p + i;
264 }
265
266 val = *++p;
267 }
268 }
269
270 /* Disable on Solaris 2/x86 until the following problem can be properly
271 autoconfed:
272
273 The Solaris 10+ assembler tags objects with the instruction set
274 extensions used, so SSE4.2 executables cannot run on machines that
275 don't support that extension. */
276
277 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
278
279 /* Replicated character data to be shared between implementations.
280 Recall that outside of a context with vector support we can't
281 define compatible vector types, therefore these are all defined
282 in terms of raw characters. */
283 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
/* Row 0: newline.  */
284 { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
285 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
/* Row 1: carriage return.  */
286 { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
287 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
/* Row 2: backslash.  */
288 { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
289 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
/* Row 3: question mark.  */
290 { '?', '?', '?', '?', '?', '?', '?', '?',
291 '?', '?', '?', '?', '?', '?', '?', '?' },
292 };
293
294 /* A version of the fast scanner using MMX vectorized byte compare insns.
295
296 This uses the PMOVMSKB instruction which was introduced with "MMX2",
297 which was packaged into SSE1; it is also present in the AMD MMX
298 extension. Mark the function as using "sse" so that we emit a real
299 "emms" instruction, rather than the 3dNOW "femms" instruction. */
300
301 static const uchar *
302 #ifndef __SSE__
303 __attribute__((__target__("sse")))
304 #endif
305 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
306 {
307 typedef char v8qi __attribute__ ((__vector_size__ (8)));
308 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
309
310 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
311 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
312 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
313 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
314
315 unsigned int misalign, found, mask;
316 const v8qi *p;
317 v8qi data, t, c;
318
319 /* Align the source pointer. While MMX doesn't generate unaligned data
320 faults, this allows us to safely scan to the end of the buffer without
321 reading beyond the end of the last page. */
322 misalign = (uintptr_t)s & 7;
323 p = (const v8qi *)((uintptr_t)s & -8);
324 data = *p;
325
326 /* Create a mask for the bytes that are valid within the first
327 16-byte block. The Idea here is that the AND with the mask
328 within the loop is "free", since we need some AND or TEST
329 insn in order to set the flags for the branch anyway. */
330 mask = -1u << misalign;
331
332 /* Main loop processing 8 bytes at a time. */
333 goto start;
334 do
335 {
336 data = *++p;
337 mask = -1;
338
339 start:
340 t = __builtin_ia32_pcmpeqb(data, repl_nl);
341 c = __builtin_ia32_pcmpeqb(data, repl_cr);
342 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343 c = __builtin_ia32_pcmpeqb(data, repl_bs);
344 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
345 c = __builtin_ia32_pcmpeqb(data, repl_qm);
346 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
347 found = __builtin_ia32_pmovmskb (t);
348 found &= mask;
349 }
350 while (!found);
351
352 __builtin_ia32_emms ();
353
354 /* FOUND contains 1 in bits for which we matched a relevant
355 character. Conversion to the byte index is trivial. */
356 found = __builtin_ctz(found);
357 return (const uchar *)p + found;
358 }
359
360 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
361
362 static const uchar *
363 #ifndef __SSE2__
364 __attribute__((__target__("sse2")))
365 #endif
366 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
367 {
368 typedef char v16qi __attribute__ ((__vector_size__ (16)));
369
370 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
371 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
372 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
373 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
374
375 unsigned int misalign, found, mask;
376 const v16qi *p;
377 v16qi data, t;
378
379 /* Align the source pointer. */
380 misalign = (uintptr_t)s & 15;
381 p = (const v16qi *)((uintptr_t)s & -16);
382 data = *p;
383
384 /* Create a mask for the bytes that are valid within the first
385 16-byte block. The Idea here is that the AND with the mask
386 within the loop is "free", since we need some AND or TEST
387 insn in order to set the flags for the branch anyway. */
388 mask = -1u << misalign;
389
390 /* Main loop processing 16 bytes at a time. */
391 goto start;
392 do
393 {
394 data = *++p;
395 mask = -1;
396
397 start:
398 t = data == repl_nl;
399 t |= data == repl_cr;
400 t |= data == repl_bs;
401 t |= data == repl_qm;
402 found = __builtin_ia32_pmovmskb128 (t);
403 found &= mask;
404 }
405 while (!found);
406
407 /* FOUND contains 1 in bits for which we matched a relevant
408 character. Conversion to the byte index is trivial. */
409 found = __builtin_ctz(found);
410 return (const uchar *)p + found;
411 }
412
413 #ifdef HAVE_SSE4
414 /* A version of the fast scanner using SSE 4.2 vectorized string insns. */
415
416 static const uchar *
417 #ifndef __SSE4_2__
418 __attribute__((__target__("sse4.2")))
419 #endif
420 search_line_sse42 (const uchar *s, const uchar *end)
421 {
422 typedef char v16qi __attribute__ ((__vector_size__ (16)));
423 static const v16qi search = { '\n', '\r', '?', '\\' };
424
425 uintptr_t si = (uintptr_t)s;
426 uintptr_t index;
427
428 /* Check for unaligned input. */
429 if (si & 15)
430 {
431 v16qi sv;
432
433 if (__builtin_expect (end - s < 16, 0)
434 && __builtin_expect ((si & 0xfff) > 0xff0, 0))
435 {
436 /* There are less than 16 bytes left in the buffer, and less
437 than 16 bytes left on the page. Reading 16 bytes at this
438 point might generate a spurious page fault. Defer to the
439 SSE2 implementation, which already handles alignment. */
440 return search_line_sse2 (s, end);
441 }
442
443 /* ??? The builtin doesn't understand that the PCMPESTRI read from
444 memory need not be aligned. */
445 sv = __builtin_ia32_loaddqu ((const char *) s);
446 index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
447
448 if (__builtin_expect (index < 16, 0))
449 goto found;
450
451 /* Advance the pointer to an aligned address. We will re-scan a
452 few bytes, but we no longer need care for reading past the
453 end of a page, since we're guaranteed a match. */
454 s = (const uchar *)((si + 15) & -16);
455 }
456
457 /* Main loop, processing 16 bytes at a time. */
458 #ifdef __GCC_ASM_FLAG_OUTPUTS__
459 while (1)
460 {
461 char f;
462
463 /* By using inline assembly instead of the builtin,
464 we can use the result, as well as the flags set. */
465 __asm ("%vpcmpestri\t$0, %2, %3"
466 : "=c"(index), "=@ccc"(f)
467 : "m"(*s), "x"(search), "a"(4), "d"(16));
468 if (f)
469 break;
470
471 s += 16;
472 }
473 #else
474 s -= 16;
475 /* By doing the whole loop in inline assembly,
476 we can make proper use of the flags set. */
477 __asm ( ".balign 16\n"
478 "0: add $16, %1\n"
479 " %vpcmpestri\t$0, (%1), %2\n"
480 " jnc 0b"
481 : "=&c"(index), "+r"(s)
482 : "x"(search), "a"(4), "d"(16));
483 #endif
484
485 found:
486 return s + index;
487 }
488
489 #else
490 /* Work around out-dated assemblers without sse4 support. */
491 #define search_line_sse42 search_line_sse2
492 #endif
493
494 /* Check the CPU capabilities. */
495
496 #include "../gcc/config/i386/cpuid.h"
497
/* Signature shared by all search_line_* implementations: take the
   start of the search and the buffer end, return a pointer to the
   character found.  */
498 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
/* The implementation in use; assigned by init_vectorized_lexer.  */
499 static search_line_fast_type search_line_fast;
500
501 #define HAVE_init_vectorized_lexer 1
502 static inline void
503 init_vectorized_lexer (void)
504 {
505 unsigned dummy, ecx = 0, edx = 0;
506 search_line_fast_type impl = search_line_acc_char;
507 int minimum = 0;
508
509 #if defined(__SSE4_2__)
510 minimum = 3;
511 #elif defined(__SSE2__)
512 minimum = 2;
513 #elif defined(__SSE__)
514 minimum = 1;
515 #endif
516
517 if (minimum == 3)
518 impl = search_line_sse42;
519 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
520 {
521 if (minimum == 3 || (ecx & bit_SSE4_2))
522 impl = search_line_sse42;
523 else if (minimum == 2 || (edx & bit_SSE2))
524 impl = search_line_sse2;
525 else if (minimum == 1 || (edx & bit_SSE))
526 impl = search_line_mmx;
527 }
528 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
529 {
530 if (minimum == 1
531 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
532 impl = search_line_mmx;
533 }
534
535 search_line_fast = impl;
536 }
537
538 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
539
540 /* A vection of the fast scanner using AltiVec vectorized byte compares
541 and VSX unaligned loads (when VSX is available). This is otherwise
542 the same as the AltiVec version. */
543
544 ATTRIBUTE_NO_SANITIZE_UNDEFINED
545 static const uchar *
546 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
547 {
548 typedef __attribute__((altivec(vector))) unsigned char vc;
549
550 const vc repl_nl = {
551 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
552 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
553 };
554 const vc repl_cr = {
555 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
556 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
557 };
558 const vc repl_bs = {
559 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
560 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
561 };
562 const vc repl_qm = {
563 '?', '?', '?', '?', '?', '?', '?', '?',
564 '?', '?', '?', '?', '?', '?', '?', '?',
565 };
566 const vc zero = { 0 };
567
568 vc data, t;
569
570 /* Main loop processing 16 bytes at a time. */
571 do
572 {
573 vc m_nl, m_cr, m_bs, m_qm;
574
575 data = __builtin_vec_vsx_ld (0, s);
576 s += 16;
577
578 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
579 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
580 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
581 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
582 t = (m_nl | m_cr) | (m_bs | m_qm);
583
584 /* T now contains 0xff in bytes for which we matched one of the relevant
585 characters. We want to exit the loop if any byte in T is non-zero.
586 Below is the expansion of vec_any_ne(t, zero). */
587 }
588 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
589
590 /* Restore s to to point to the 16 bytes we just processed. */
591 s -= 16;
592
593 {
594 #define N (sizeof(vc) / sizeof(long))
595
596 union {
597 vc v;
598 /* Statically assert that N is 2 or 4. */
599 unsigned long l[(N == 2 || N == 4) ? N : -1];
600 } u;
601 unsigned long l, i = 0;
602
603 u.v = t;
604
605 /* Find the first word of T that is non-zero. */
606 switch (N)
607 {
608 case 4:
609 l = u.l[i++];
610 if (l != 0)
611 break;
612 s += sizeof(unsigned long);
613 l = u.l[i++];
614 if (l != 0)
615 break;
616 s += sizeof(unsigned long);
617 /* FALLTHRU */
618 case 2:
619 l = u.l[i++];
620 if (l != 0)
621 break;
622 s += sizeof(unsigned long);
623 l = u.l[i];
624 }
625
626 /* L now contains 0xff in bytes for which we matched one of the
627 relevant characters. We can find the byte index by finding
628 its bit index and dividing by 8. */
629 #ifdef __BIG_ENDIAN__
630 l = __builtin_clzl(l) >> 3;
631 #else
632 l = __builtin_ctzl(l) >> 3;
633 #endif
634 return s + l;
635
636 #undef N
637 }
638 }
639
640 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
641
642 /* A vection of the fast scanner using AltiVec vectorized byte compares.
643 This cannot be used for little endian because vec_lvsl/lvsr are
644 deprecated for little endian and the code won't work properly. */
645 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
646 so we can't compile this function without -maltivec on the command line
647 (or implied by some other switch). */
648
649 static const uchar *
650 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
651 {
652 typedef __attribute__((altivec(vector))) unsigned char vc;
653
654 const vc repl_nl = {
655 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
656 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
657 };
658 const vc repl_cr = {
659 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
660 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
661 };
662 const vc repl_bs = {
663 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
664 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
665 };
666 const vc repl_qm = {
667 '?', '?', '?', '?', '?', '?', '?', '?',
668 '?', '?', '?', '?', '?', '?', '?', '?',
669 };
670 const vc ones = {
671 -1, -1, -1, -1, -1, -1, -1, -1,
672 -1, -1, -1, -1, -1, -1, -1, -1,
673 };
674 const vc zero = { 0 };
675
676 vc data, mask, t;
677
678 /* Altivec loads automatically mask addresses with -16. This lets us
679 issue the first load as early as possible. */
680 data = __builtin_vec_ld(0, (const vc *)s);
681
682 /* Discard bytes before the beginning of the buffer. Do this by
683 beginning with all ones and shifting in zeros according to the
684 mis-alignment. The LVSR instruction pulls the exact shift we
685 want from the address. */
686 mask = __builtin_vec_lvsr(0, s);
687 mask = __builtin_vec_perm(zero, ones, mask);
688 data &= mask;
689
690 /* While altivec loads mask addresses, we still need to align S so
691 that the offset we compute at the end is correct. */
692 s = (const uchar *)((uintptr_t)s & -16);
693
694 /* Main loop processing 16 bytes at a time. */
695 goto start;
696 do
697 {
698 vc m_nl, m_cr, m_bs, m_qm;
699
700 s += 16;
701 data = __builtin_vec_ld(0, (const vc *)s);
702
703 start:
704 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
705 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
706 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
707 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
708 t = (m_nl | m_cr) | (m_bs | m_qm);
709
710 /* T now contains 0xff in bytes for which we matched one of the relevant
711 characters. We want to exit the loop if any byte in T is non-zero.
712 Below is the expansion of vec_any_ne(t, zero). */
713 }
714 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
715
716 {
717 #define N (sizeof(vc) / sizeof(long))
718
719 union {
720 vc v;
721 /* Statically assert that N is 2 or 4. */
722 unsigned long l[(N == 2 || N == 4) ? N : -1];
723 } u;
724 unsigned long l, i = 0;
725
726 u.v = t;
727
728 /* Find the first word of T that is non-zero. */
729 switch (N)
730 {
731 case 4:
732 l = u.l[i++];
733 if (l != 0)
734 break;
735 s += sizeof(unsigned long);
736 l = u.l[i++];
737 if (l != 0)
738 break;
739 s += sizeof(unsigned long);
740 /* FALLTHROUGH */
741 case 2:
742 l = u.l[i++];
743 if (l != 0)
744 break;
745 s += sizeof(unsigned long);
746 l = u.l[i];
747 }
748
749 /* L now contains 0xff in bytes for which we matched one of the
750 relevant characters. We can find the byte index by finding
751 its bit index and dividing by 8. */
752 l = __builtin_clzl(l) >> 3;
753 return s + l;
754
755 #undef N
756 }
757 }
758
759 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
760 #include "arm_neon.h"
761
762 /* This doesn't have to be the exact page size, but no system may use
763 a size smaller than this. ARMv8 requires a minimum page size of
764 4k. The impact of being conservative here is a small number of
765 cases will take the slightly slower entry path into the main
766 loop. */
767
768 #define AARCH64_MIN_PAGE_SIZE 4096
769
770 static const uchar *
771 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
772 {
773 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
774 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
775 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
776 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
777 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
778
779 #ifdef __ARM_BIG_ENDIAN
780 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
781 #else
782 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
783 #endif
784
785 unsigned int found;
786 const uint8_t *p;
787 uint8x16_t data;
788 uint8x16_t t;
789 uint16x8_t m;
790 uint8x16_t u, v, w;
791
792 /* Align the source pointer. */
793 p = (const uint8_t *)((uintptr_t)s & -16);
794
795 /* Assuming random string start positions, with a 4k page size we'll take
796 the slow path about 0.37% of the time. */
797 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
798 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
799 < 16, 0))
800 {
801 /* Slow path: the string starts near a possible page boundary. */
802 uint32_t misalign, mask;
803
804 misalign = (uintptr_t)s & 15;
805 mask = (-1u << misalign) & 0xffff;
806 data = vld1q_u8 (p);
807 t = vceqq_u8 (data, repl_nl);
808 u = vceqq_u8 (data, repl_cr);
809 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
810 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
811 t = vorrq_u8 (v, w);
812 t = vandq_u8 (t, xmask);
813 m = vpaddlq_u8 (t);
814 m = vshlq_u16 (m, shift);
815 found = vaddvq_u16 (m);
816 found &= mask;
817 if (found)
818 return (const uchar*)p + __builtin_ctz (found);
819 }
820 else
821 {
822 data = vld1q_u8 ((const uint8_t *) s);
823 t = vceqq_u8 (data, repl_nl);
824 u = vceqq_u8 (data, repl_cr);
825 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
826 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
827 t = vorrq_u8 (v, w);
828 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
829 goto done;
830 }
831
832 do
833 {
834 p += 16;
835 data = vld1q_u8 (p);
836 t = vceqq_u8 (data, repl_nl);
837 u = vceqq_u8 (data, repl_cr);
838 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
839 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
840 t = vorrq_u8 (v, w);
841 } while (!vpaddd_u64 ((uint64x2_t)t));
842
843 done:
844 /* Now that we've found the terminating substring, work out precisely where
845 we need to stop. */
846 t = vandq_u8 (t, xmask);
847 m = vpaddlq_u8 (t);
848 m = vshlq_u16 (m, shift);
849 found = vaddvq_u16 (m);
850 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
851 + __builtin_ctz (found));
852 }
853
854 #elif defined (__ARM_NEON)
855 #include "arm_neon.h"
856
857 static const uchar *
858 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
859 {
860 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
861 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
862 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
863 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
864 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
865
866 unsigned int misalign, found, mask;
867 const uint8_t *p;
868 uint8x16_t data;
869
870 /* Align the source pointer. */
871 misalign = (uintptr_t)s & 15;
872 p = (const uint8_t *)((uintptr_t)s & -16);
873 data = vld1q_u8 (p);
874
875 /* Create a mask for the bytes that are valid within the first
876 16-byte block. The Idea here is that the AND with the mask
877 within the loop is "free", since we need some AND or TEST
878 insn in order to set the flags for the branch anyway. */
879 mask = (-1u << misalign) & 0xffff;
880
881 /* Main loop, processing 16 bytes at a time. */
882 goto start;
883
884 do
885 {
886 uint8x8_t l;
887 uint16x4_t m;
888 uint32x2_t n;
889 uint8x16_t t, u, v, w;
890
891 p += 16;
892 data = vld1q_u8 (p);
893 mask = 0xffff;
894
895 start:
896 t = vceqq_u8 (data, repl_nl);
897 u = vceqq_u8 (data, repl_cr);
898 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
899 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
900 t = vandq_u8 (vorrq_u8 (v, w), xmask);
901 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
902 m = vpaddl_u8 (l);
903 n = vpaddl_u16 (m);
904
905 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
906 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
907 found &= mask;
908 }
909 while (!found);
910
911 /* FOUND contains 1 in bits for which we matched a relevant
912 character. Conversion to the byte index is trivial. */
913 found = __builtin_ctz (found);
914 return (const uchar *)p + found;
915 }
916
917 #else
918
919 /* We only have one accelerated alternative. Use a direct call so that
920 we encourage inlining. */
921
922 #define search_line_fast search_line_acc_char
923
924 #endif
925
926 /* Initialize the lexer if needed. */
927
928 void
929 _cpp_init_lexer (void)
930 {
/* Only configurations that provide a run-time scanner selection define
   HAVE_init_vectorized_lexer; everywhere else this function is empty.  */
931 #ifdef HAVE_init_vectorized_lexer
932 init_vectorized_lexer ();
933 #endif
934 }
935
936 /* Returns with a logical line that contains no escaped newlines or
937 trigraphs. This is a time-critical inner loop. */
938 void
939 _cpp_clean_line (cpp_reader *pfile)
940 {
941 cpp_buffer *buffer;
942 const uchar *s;
943 uchar c, *d, *p;
944
945 buffer = pfile->buffer;
946 buffer->cur_note = buffer->notes_used = 0;
947 buffer->cur = buffer->line_base = buffer->next_line;
948 buffer->need_line = false;
949 s = buffer->next_line;
950
951 if (!buffer->from_stage3)
952 {
953 const uchar *pbackslash = NULL;
954
955 /* Fast path. This is the common case of an un-escaped line with
956 no trigraphs. The primary win here is by not writing any
957 data back to memory until we have to. */
958 while (1)
959 {
960 /* Perform an optimized search for \n, \r, \\, ?. */
961 s = search_line_fast (s, buffer->rlimit);
962
963 c = *s;
964 if (c == '\\')
965 {
966 /* Record the location of the backslash and continue. */
967 pbackslash = s++;
968 }
969 else if (__builtin_expect (c == '?', 0))
970 {
971 if (__builtin_expect (s[1] == '?', false)
972 && _cpp_trigraph_map[s[2]])
973 {
974 /* Have a trigraph. We may or may not have to convert
975 it. Add a line note regardless, for -Wtrigraphs. */
976 add_line_note (buffer, s, s[2]);
977 if (CPP_OPTION (pfile, trigraphs))
978 {
979 /* We do, and that means we have to switch to the
980 slow path. */
981 d = (uchar *) s;
982 *d = _cpp_trigraph_map[s[2]];
983 s += 2;
984 goto slow_path;
985 }
986 }
987 /* Not a trigraph. Continue on fast-path. */
988 s++;
989 }
990 else
991 break;
992 }
993
994 /* This must be \r or \n. We're either done, or we'll be forced
995 to write back to the buffer and continue on the slow path. */
996 d = (uchar *) s;
997
998 if (__builtin_expect (s == buffer->rlimit, false))
999 goto done;
1000
1001 /* DOS line ending? */
1002 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1003 {
1004 s++;
1005 if (s == buffer->rlimit)
1006 goto done;
1007 }
1008
1009 if (__builtin_expect (pbackslash == NULL, true))
1010 goto done;
1011
1012 /* Check for escaped newline. */
1013 p = d;
1014 while (is_nvspace (p[-1]))
1015 p--;
1016 if (p - 1 != pbackslash)
1017 goto done;
1018
1019 /* Have an escaped newline; process it and proceed to
1020 the slow path. */
1021 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1022 d = p - 2;
1023 buffer->next_line = p - 1;
1024
1025 slow_path:
1026 while (1)
1027 {
1028 c = *++s;
1029 *++d = c;
1030
1031 if (c == '\n' || c == '\r')
1032 {
1033 /* Handle DOS line endings. */
1034 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1035 s++;
1036 if (s == buffer->rlimit)
1037 break;
1038
1039 /* Escaped? */
1040 p = d;
1041 while (p != buffer->next_line && is_nvspace (p[-1]))
1042 p--;
1043 if (p == buffer->next_line || p[-1] != '\\')
1044 break;
1045
1046 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1047 d = p - 2;
1048 buffer->next_line = p - 1;
1049 }
1050 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1051 {
1052 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1053 add_line_note (buffer, d, s[2]);
1054 if (CPP_OPTION (pfile, trigraphs))
1055 {
1056 *d = _cpp_trigraph_map[s[2]];
1057 s += 2;
1058 }
1059 }
1060 }
1061 }
1062 else
1063 {
1064 while (*s != '\n' && *s != '\r')
1065 s++;
1066 d = (uchar *) s;
1067
1068 /* Handle DOS line endings. */
1069 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1070 s++;
1071 }
1072
1073 done:
1074 *d = '\n';
1075 /* A sentinel note that should never be processed. */
1076 add_line_note (buffer, d + 1, '\n');
1077 buffer->next_line = s + 1;
1078 }
1079
1080 template <bool lexing_raw_string>
1081 static bool get_fresh_line_impl (cpp_reader *pfile);
1082
1083 /* Return true if the trigraph indicated by NOTE should be warned
1084 about in a comment. */
1085 static bool
1086 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1087 {
1088 const uchar *p;
1089
1090 /* Within comments we don't warn about trigraphs, unless the
1091 trigraph forms an escaped newline, as that may change
1092 behavior. */
1093 if (note->type != '/')
1094 return false;
1095
1096 /* If -trigraphs, then this was an escaped newline iff the next note
1097 is coincident. */
1098 if (CPP_OPTION (pfile, trigraphs))
1099 return note[1].pos == note->pos;
1100
1101 /* Otherwise, see if this forms an escaped newline. */
1102 p = note->pos + 3;
1103 while (is_nvspace (*p))
1104 p++;
1105
1106 /* There might have been escaped newlines between the trigraph and the
1107 newline we found. Hence the position test. */
1108 return (*p == '\n' && p < note[1].pos);
1109 }
1110
1111 /* Process the notes created by add_line_note as far as the current
1112 location. */
1113 void
1114 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1115 {
1116 cpp_buffer *buffer = pfile->buffer;
1117
1118 for (;;)
1119 {
1120 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1121 unsigned int col;
1122
1123 if (note->pos > buffer->cur)
1124 break;
1125
1126 buffer->cur_note++;
1127 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1128
1129 if (note->type == '\\' || note->type == ' ')
1130 {
1131 if (note->type == ' ' && !in_comment)
1132 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1133 "backslash and newline separated by space");
1134
1135 if (buffer->next_line > buffer->rlimit)
1136 {
1137 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1138 "backslash-newline at end of file");
1139 /* Prevent "no newline at end of file" warning. */
1140 buffer->next_line = buffer->rlimit;
1141 }
1142
1143 buffer->line_base = note->pos;
1144 CPP_INCREMENT_LINE (pfile, 0);
1145 }
1146 else if (_cpp_trigraph_map[note->type])
1147 {
1148 if (CPP_OPTION (pfile, warn_trigraphs)
1149 && (!in_comment || warn_in_comment (pfile, note)))
1150 {
1151 if (CPP_OPTION (pfile, trigraphs))
1152 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1153 pfile->line_table->highest_line, col,
1154 "trigraph ??%c converted to %c",
1155 note->type,
1156 (int) _cpp_trigraph_map[note->type]);
1157 else
1158 {
1159 cpp_warning_with_line
1160 (pfile, CPP_W_TRIGRAPHS,
1161 pfile->line_table->highest_line, col,
1162 "trigraph ??%c ignored, use -trigraphs to enable",
1163 note->type);
1164 }
1165 }
1166 }
1167 else if (note->type == 0)
1168 /* Already processed in lex_raw_string. */;
1169 else
1170 abort ();
1171 }
1172 }
1173
1174 namespace bidi {
1175 enum class kind {
1176 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1177 };
1178
1179 /* All the UTF-8 encodings of bidi characters start with E2. */
1180 constexpr uchar utf8_start = 0xe2;
1181
1182 struct context
1183 {
1184 context () {}
1185 context (location_t loc, kind k, bool pdf, bool ucn)
1186 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1187 {
1188 }
1189
1190 kind get_pop_kind () const
1191 {
1192 return m_pdf ? kind::PDF : kind::PDI;
1193 }
1194 bool ucn_p () const
1195 {
1196 return m_ucn;
1197 }
1198
1199 location_t m_loc;
1200 kind m_kind;
1201 unsigned m_pdf : 1;
1202 unsigned m_ucn : 1;
1203 };
1204
1205 /* A vector holding currently open bidi contexts. We use a char for
1206 each context, its LSB is 1 if it represents a PDF context, 0 if it
1207 represents a PDI context. The next bit is 1 if this context was open
1208 by a bidi character written as a UCN, and 0 when it was UTF-8. */
1209 semi_embedded_vec <context, 16> vec;
1210
1211 /* Close the whole comment/identifier/string literal/character constant
1212 context. */
1213 void on_close ()
1214 {
1215 vec.truncate (0);
1216 }
1217
1218 /* Pop the last element in the vector. */
1219 void pop ()
1220 {
1221 unsigned int len = vec.count ();
1222 gcc_checking_assert (len > 0);
1223 vec.truncate (len - 1);
1224 }
1225
1226 /* Return the pop kind of the context of the Ith element. */
1227 kind pop_kind_at (unsigned int i)
1228 {
1229 return vec[i].get_pop_kind ();
1230 }
1231
1232 /* Return the pop kind of the context that is currently opened. */
1233 kind current_ctx ()
1234 {
1235 unsigned int len = vec.count ();
1236 if (len == 0)
1237 return kind::NONE;
1238 return vec[len - 1].get_pop_kind ();
1239 }
1240
1241 /* Return true if the current context comes from a UCN origin, that is,
1242 the bidi char which started this bidi context was written as a UCN. */
1243 bool current_ctx_ucn_p ()
1244 {
1245 unsigned int len = vec.count ();
1246 gcc_checking_assert (len > 0);
1247 return vec[len - 1].m_ucn;
1248 }
1249
1250 location_t current_ctx_loc ()
1251 {
1252 unsigned int len = vec.count ();
1253 gcc_checking_assert (len > 0);
1254 return vec[len - 1].m_loc;
1255 }
1256
1257 /* We've read a bidi char, update the current vector as necessary.
1258 LOC is only valid when K is not kind::NONE. */
1259 void on_char (kind k, bool ucn_p, location_t loc)
1260 {
1261 switch (k)
1262 {
1263 case kind::LRE:
1264 case kind::RLE:
1265 case kind::LRO:
1266 case kind::RLO:
1267 vec.push (context (loc, k, true, ucn_p));
1268 break;
1269 case kind::LRI:
1270 case kind::RLI:
1271 case kind::FSI:
1272 vec.push (context (loc, k, false, ucn_p));
1273 break;
1274 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1275 whose scope has not yet been terminated. */
1276 case kind::PDF:
1277 if (current_ctx () == kind::PDF)
1278 pop ();
1279 break;
1280 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1281 scope has not yet been terminated, as well as the scopes of
1282 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1283 yet been terminated. */
1284 case kind::PDI:
1285 for (int i = vec.count () - 1; i >= 0; --i)
1286 if (pop_kind_at (i) == kind::PDI)
1287 {
1288 vec.truncate (i);
1289 break;
1290 }
1291 break;
1292 case kind::LTR:
1293 case kind::RTL:
1294 /* These aren't popped by a PDF/PDI. */
1295 break;
1296 ATTR_LIKELY case kind::NONE:
1297 break;
1298 default:
1299 abort ();
1300 }
1301 }
1302
1303 /* Return a descriptive string for K. */
1304 const char *to_str (kind k)
1305 {
1306 switch (k)
1307 {
1308 case kind::LRE:
1309 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1310 case kind::RLE:
1311 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1312 case kind::LRO:
1313 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1314 case kind::RLO:
1315 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1316 case kind::LRI:
1317 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1318 case kind::RLI:
1319 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1320 case kind::FSI:
1321 return "U+2068 (FIRST STRONG ISOLATE)";
1322 case kind::PDF:
1323 return "U+202C (POP DIRECTIONAL FORMATTING)";
1324 case kind::PDI:
1325 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1326 case kind::LTR:
1327 return "U+200E (LEFT-TO-RIGHT MARK)";
1328 case kind::RTL:
1329 return "U+200F (RIGHT-TO-LEFT MARK)";
1330 default:
1331 abort ();
1332 }
1333 }
1334 }
1335
1336 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1337 within the current line in FILE, with the caret at START. */
1338
1339 static location_t
1340 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1341 const unsigned char *const start,
1342 size_t num_bytes)
1343 {
1344 gcc_checking_assert (num_bytes > 0);
1345
1346 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1347 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1348 whereas linemap_position_for_column is 1-based. */
1349
1350 /* Get 0-based offsets within the line. */
1351 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1352 size_t end_offset = start_offset + num_bytes - 1;
1353
1354 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1355 location_t start_loc = linemap_position_for_column (pfile->line_table,
1356 start_offset + 1);
1357 location_t end_loc = linemap_position_for_column (pfile->line_table,
1358 end_offset + 1);
1359
1360 if (start_loc == end_loc)
1361 return start_loc;
1362
1363 source_range src_range;
1364 src_range.m_start = start_loc;
1365 src_range.m_finish = end_loc;
1366 location_t combined_loc
1367 = pfile->line_table->get_or_create_combined_loc (start_loc,
1368 src_range,
1369 nullptr,
1370 0);
1371 return combined_loc;
1372 }
1373
1374 /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1375
1376 static bidi::kind
1377 get_bidi_utf8_1 (const unsigned char *const p)
1378 {
1379 gcc_checking_assert (p[0] == bidi::utf8_start);
1380
1381 if (p[1] == 0x80)
1382 switch (p[2])
1383 {
1384 case 0xaa:
1385 return bidi::kind::LRE;
1386 case 0xab:
1387 return bidi::kind::RLE;
1388 case 0xac:
1389 return bidi::kind::PDF;
1390 case 0xad:
1391 return bidi::kind::LRO;
1392 case 0xae:
1393 return bidi::kind::RLO;
1394 case 0x8e:
1395 return bidi::kind::LTR;
1396 case 0x8f:
1397 return bidi::kind::RTL;
1398 default:
1399 break;
1400 }
1401 else if (p[1] == 0x81)
1402 switch (p[2])
1403 {
1404 case 0xa6:
1405 return bidi::kind::LRI;
1406 case 0xa7:
1407 return bidi::kind::RLI;
1408 case 0xa8:
1409 return bidi::kind::FSI;
1410 case 0xa9:
1411 return bidi::kind::PDI;
1412 default:
1413 break;
1414 }
1415
1416 return bidi::kind::NONE;
1417 }
1418
1419 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1420 If the kind is not NONE, write the location to *OUT.*/
1421
1422 static bidi::kind
1423 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1424 {
1425 bidi::kind result = get_bidi_utf8_1 (p);
1426 if (result != bidi::kind::NONE)
1427 {
1428 /* We have a sequence of 3 bytes starting at P. */
1429 *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1430 }
1431 return result;
1432 }
1433
1434 /* Parse a UCN where P points just past \u or \U and return its bidi code. */
1435
1436 static bidi::kind
1437 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1438 {
1439 /* 6.4.3 Universal Character Names
1440 \u hex-quad
1441 \U hex-quad hex-quad
1442 \u { simple-hexadecimal-digit-sequence }
1443 where \unnnn means \U0000nnnn. */
1444
1445 *end = p + 4;
1446 if (is_U)
1447 {
1448 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1449 return bidi::kind::NONE;
1450 /* Skip 4B so we can treat \u and \U the same below. */
1451 p += 4;
1452 *end += 4;
1453 }
1454 else if (p[0] == '{')
1455 {
1456 p++;
1457 while (*p == '0')
1458 p++;
1459 if (p[0] != '2'
1460 || p[1] != '0'
1461 || !ISXDIGIT (p[2])
1462 || !ISXDIGIT (p[3])
1463 || p[4] != '}')
1464 return bidi::kind::NONE;
1465 *end = p + 5;
1466 }
1467
1468 /* All code points we are looking for start with 20xx. */
1469 if (p[0] != '2' || p[1] != '0')
1470 return bidi::kind::NONE;
1471 else if (p[2] == '2')
1472 switch (p[3])
1473 {
1474 case 'a':
1475 case 'A':
1476 return bidi::kind::LRE;
1477 case 'b':
1478 case 'B':
1479 return bidi::kind::RLE;
1480 case 'c':
1481 case 'C':
1482 return bidi::kind::PDF;
1483 case 'd':
1484 case 'D':
1485 return bidi::kind::LRO;
1486 case 'e':
1487 case 'E':
1488 return bidi::kind::RLO;
1489 default:
1490 break;
1491 }
1492 else if (p[2] == '6')
1493 switch (p[3])
1494 {
1495 case '6':
1496 return bidi::kind::LRI;
1497 case '7':
1498 return bidi::kind::RLI;
1499 case '8':
1500 return bidi::kind::FSI;
1501 case '9':
1502 return bidi::kind::PDI;
1503 default:
1504 break;
1505 }
1506 else if (p[2] == '0')
1507 switch (p[3])
1508 {
1509 case 'e':
1510 case 'E':
1511 return bidi::kind::LTR;
1512 case 'f':
1513 case 'F':
1514 return bidi::kind::RTL;
1515 default:
1516 break;
1517 }
1518
1519 return bidi::kind::NONE;
1520 }
1521
1522 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1523 If the kind is not NONE, write the location to *OUT. */
1524
1525 static bidi::kind
1526 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1527 location_t *out)
1528 {
1529 const unsigned char *end;
1530 bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
1531 if (result != bidi::kind::NONE)
1532 {
1533 const unsigned char *start = p - 2;
1534 size_t num_bytes = end - start;
1535 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1536 }
1537 return result;
1538 }
1539
1540 /* Parse a named universal character escape where P points just past \N and
1541 return its bidi code. If the kind is not NONE, write the location to
1542 *OUT. */
1543
1544 static bidi::kind
1545 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1546 {
1547 bidi::kind result = bidi::kind::NONE;
1548 if (*p != '{')
1549 return bidi::kind::NONE;
1550 if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1551 {
1552 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1553 result = bidi::kind::LTR;
1554 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1555 result = bidi::kind::LRE;
1556 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1557 result = bidi::kind::LRO;
1558 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1559 result = bidi::kind::LRI;
1560 }
1561 else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1562 {
1563 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1564 result = bidi::kind::RTL;
1565 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1566 result = bidi::kind::RLE;
1567 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1568 result = bidi::kind::RLO;
1569 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1570 result = bidi::kind::RLI;
1571 }
1572 else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1573 {
1574 if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1575 result = bidi::kind::PDF;
1576 else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1577 result = bidi::kind::PDI;
1578 }
1579 else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1580 result = bidi::kind::FSI;
1581 if (result != bidi::kind::NONE)
1582 *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1583 (strchr ((const char *)
1584 (p + 1), '}')
1585 - (const char *) p)
1586 + 3);
1587 return result;
1588 }
1589
1590 /* Subclass of rich_location for reporting on unpaired UTF-8
1591 bidirectional control character(s).
1592 Escape the source lines on output, and show all unclosed
1593 bidi context, labelling everything. */
1594
1595 class unpaired_bidi_rich_location : public rich_location
1596 {
1597 public:
1598 class custom_range_label : public range_label
1599 {
1600 public:
1601 label_text get_text (unsigned range_idx) const final override
1602 {
1603 /* range 0 is the primary location; each subsequent range i + 1
1604 is for bidi::vec[i]. */
1605 if (range_idx > 0)
1606 {
1607 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1608 return label_text::borrow (bidi::to_str (ctxt.m_kind));
1609 }
1610 else
1611 return label_text::borrow (_("end of bidirectional context"));
1612 }
1613 };
1614
1615 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1616 : rich_location (pfile->line_table, loc, &m_custom_label)
1617 {
1618 set_escape_on_output (true);
1619 for (unsigned i = 0; i < bidi::vec.count (); i++)
1620 add_range (bidi::vec[i].m_loc,
1621 SHOW_RANGE_WITHOUT_CARET,
1622 &m_custom_label);
1623 }
1624
1625 private:
1626 custom_range_label m_custom_label;
1627 };
1628
1629 /* We're closing a bidi context, that is, we've encountered a newline,
1630 are closing a C-style comment, or are at the end of a string literal,
1631 character constant, or identifier. Warn if this context was not
1632 properly terminated by a PDI or PDF. P points to the last character
1633 in this context. */
1634
1635 static void
1636 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1637 {
1638 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1639 if (bidi::vec.count () > 0
1640 && (warn_bidi & bidirectional_unpaired
1641 && (!bidi::current_ctx_ucn_p ()
1642 || (warn_bidi & bidirectional_ucn))))
1643 {
1644 const location_t loc
1645 = linemap_position_for_column (pfile->line_table,
1646 CPP_BUF_COLUMN (pfile->buffer, p));
1647 unpaired_bidi_rich_location rich_loc (pfile, loc);
1648 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1649 forms of a diagnostic, so fake it for now. */
1650 if (bidi::vec.count () > 1)
1651 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1652 "unpaired UTF-8 bidirectional control characters "
1653 "detected");
1654 else
1655 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1656 "unpaired UTF-8 bidirectional control character "
1657 "detected");
1658 }
1659 /* We're done with this context. */
1660 bidi::on_close ();
1661 }
1662
1663 /* We're at the beginning or in the middle of an identifier/comment/string
1664 literal/character constant. Warn if we've encountered a bidi character.
1665 KIND says which bidi control character it was; UCN_P is true iff this bidi
1666 control character was written as a UCN. LOC is the location of the
1667 character, but is only valid if KIND != bidi::kind::NONE. */
1668
1669 static void
1670 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1671 bool ucn_p, location_t loc)
1672 {
1673 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1674 return;
1675
1676 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1677
1678 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1679 {
1680 rich_location rich_loc (pfile->line_table, loc);
1681 rich_loc.set_escape_on_output (true);
1682
1683 /* It seems excessive to warn about a PDI/PDF that is closing
1684 an opened context because we've already warned about the
1685 opening character. Except warn when we have a UCN x UTF-8
1686 mismatch, if UCN checking is enabled. */
1687 if (kind == bidi::current_ctx ())
1688 {
1689 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1690 && bidi::current_ctx_ucn_p () != ucn_p)
1691 {
1692 rich_loc.add_range (bidi::current_ctx_loc ());
1693 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1694 "UTF-8 vs UCN mismatch when closing "
1695 "a context by \"%s\"", bidi::to_str (kind));
1696 }
1697 }
1698 else if (warn_bidi & bidirectional_any
1699 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1700 {
1701 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1702 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1703 "\"%s\" is closing an unopened context",
1704 bidi::to_str (kind));
1705 else
1706 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1707 "found problematic Unicode character \"%s\"",
1708 bidi::to_str (kind));
1709 }
1710 }
1711 /* We're done with this context. */
1712 bidi::on_char (kind, ucn_p, loc);
1713 }
1714
/* Byte-value thresholds used below when scanning UTF-8: a byte in the
   range [utf8_continuation, utf8_signifier) is accepted as a
   continuation byte, and a byte >= utf8_signifier is treated as the
   start of a multibyte sequence.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1717
1718 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1719 at PFILE->buffer->cur. Return a pointer after the diagnosed
1720 invalid character. */
1721
1722 static const uchar *
1723 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
1724 {
1725 cpp_buffer *buffer = pfile->buffer;
1726 const uchar *cur = buffer->cur;
1727 bool pedantic = (CPP_PEDANTIC (pfile)
1728 && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1729
1730 if (cur[0] < utf8_signifier
1731 || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1732 {
1733 if (pedantic)
1734 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1735 pfile->line_table->highest_line,
1736 CPP_BUF_COL (buffer),
1737 "invalid UTF-8 character <%x>",
1738 cur[0]);
1739 else
1740 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1741 pfile->line_table->highest_line,
1742 CPP_BUF_COL (buffer),
1743 "invalid UTF-8 character <%x>",
1744 cur[0]);
1745 return cur + 1;
1746 }
1747 else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1748 {
1749 if (pedantic)
1750 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1751 pfile->line_table->highest_line,
1752 CPP_BUF_COL (buffer),
1753 "invalid UTF-8 character <%x><%x>",
1754 cur[0], cur[1]);
1755 else
1756 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1757 pfile->line_table->highest_line,
1758 CPP_BUF_COL (buffer),
1759 "invalid UTF-8 character <%x><%x>",
1760 cur[0], cur[1]);
1761 return cur + 2;
1762 }
1763 else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1764 {
1765 if (pedantic)
1766 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1767 pfile->line_table->highest_line,
1768 CPP_BUF_COL (buffer),
1769 "invalid UTF-8 character <%x><%x><%x>",
1770 cur[0], cur[1], cur[2]);
1771 else
1772 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1773 pfile->line_table->highest_line,
1774 CPP_BUF_COL (buffer),
1775 "invalid UTF-8 character <%x><%x><%x>",
1776 cur[0], cur[1], cur[2]);
1777 return cur + 3;
1778 }
1779 else
1780 {
1781 if (pedantic)
1782 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1783 pfile->line_table->highest_line,
1784 CPP_BUF_COL (buffer),
1785 "invalid UTF-8 character <%x><%x><%x><%x>",
1786 cur[0], cur[1], cur[2], cur[3]);
1787 else
1788 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1789 pfile->line_table->highest_line,
1790 CPP_BUF_COL (buffer),
1791 "invalid UTF-8 character <%x><%x><%x><%x>",
1792 cur[0], cur[1], cur[2], cur[3]);
1793 return cur + 4;
1794 }
1795 }
1796
1797 /* Helper function of *skip_*_comment and lex*_string. For C,
1798 character at CUR[-1] with MSB set handle -Wbidi-chars* and
1799 -Winvalid-utf8 diagnostics and return pointer to first character
1800 that should be processed next. */
1801
1802 static inline const uchar *
1803 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1804 const uchar *cur, bool warn_bidi_p,
1805 bool warn_invalid_utf8_p)
1806 {
1807 /* If this is a beginning of a UTF-8 encoding, it might be
1808 a bidirectional control character. */
1809 if (c == bidi::utf8_start && warn_bidi_p)
1810 {
1811 location_t loc;
1812 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1813 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1814 }
1815 if (!warn_invalid_utf8_p)
1816 return cur;
1817 if (c >= utf8_signifier)
1818 {
1819 cppchar_t s;
1820 const uchar *pstr = cur - 1;
1821 if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1822 && s <= UCS_LIMIT)
1823 return pstr;
1824 }
1825 pfile->buffer->cur = cur - 1;
1826 return _cpp_warn_invalid_utf8 (pfile);
1827 }
1828
1829 /* Skip a C-style block comment. We find the end of the comment by
1830 seeing if an asterisk is before every '/' we encounter. Returns
1831 nonzero if comment terminated by EOF, zero otherwise.
1832
1833 Buffer->cur points to the initial asterisk of the comment. */
1834 bool
1835 _cpp_skip_block_comment (cpp_reader *pfile)
1836 {
1837 cpp_buffer *buffer = pfile->buffer;
1838 const uchar *cur = buffer->cur;
1839 uchar c;
1840 const bool warn_bidi_p = pfile->warn_bidi_p ();
1841 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1842 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1843
1844 cur++;
1845 if (*cur == '/')
1846 cur++;
1847
1848 for (;;)
1849 {
1850 /* People like decorating comments with '*', so check for '/'
1851 instead for efficiency. */
1852 c = *cur++;
1853
1854 if (c == '/')
1855 {
1856 if (cur[-2] == '*')
1857 {
1858 if (warn_bidi_p)
1859 maybe_warn_bidi_on_close (pfile, cur);
1860 break;
1861 }
1862
1863 /* Warn about potential nested comments, but not if the '/'
1864 comes immediately before the true comment delimiter.
1865 Don't bother to get it right across escaped newlines. */
1866 if (CPP_OPTION (pfile, warn_comments)
1867 && cur[0] == '*' && cur[1] != '/')
1868 {
1869 buffer->cur = cur;
1870 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1871 pfile->line_table->highest_line,
1872 CPP_BUF_COL (buffer),
1873 "\"/*\" within comment");
1874 }
1875 }
1876 else if (c == '\n')
1877 {
1878 unsigned int cols;
1879 buffer->cur = cur - 1;
1880 if (warn_bidi_p)
1881 maybe_warn_bidi_on_close (pfile, cur);
1882 _cpp_process_line_notes (pfile, true);
1883 if (buffer->next_line >= buffer->rlimit)
1884 return true;
1885 _cpp_clean_line (pfile);
1886
1887 cols = buffer->next_line - buffer->line_base;
1888 CPP_INCREMENT_LINE (pfile, cols);
1889
1890 cur = buffer->cur;
1891 }
1892 else if (__builtin_expect (c >= utf8_continuation, 0)
1893 && warn_bidi_or_invalid_utf8_p)
1894 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1895 warn_invalid_utf8_p);
1896 }
1897
1898 buffer->cur = cur;
1899 _cpp_process_line_notes (pfile, true);
1900 return false;
1901 }
1902
1903 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1904 terminating newline. Handles escaped newlines. Returns nonzero
1905 if a multiline comment. */
1906 static int
1907 skip_line_comment (cpp_reader *pfile)
1908 {
1909 cpp_buffer *buffer = pfile->buffer;
1910 location_t orig_line = pfile->line_table->highest_line;
1911 const bool warn_bidi_p = pfile->warn_bidi_p ();
1912 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1913 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1914
1915 if (!warn_bidi_or_invalid_utf8_p)
1916 while (*buffer->cur != '\n')
1917 buffer->cur++;
1918 else if (!warn_invalid_utf8_p)
1919 {
1920 while (*buffer->cur != '\n'
1921 && *buffer->cur != bidi::utf8_start)
1922 buffer->cur++;
1923 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1924 {
1925 while (*buffer->cur != '\n')
1926 {
1927 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1928 {
1929 location_t loc;
1930 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1931 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1932 }
1933 buffer->cur++;
1934 }
1935 maybe_warn_bidi_on_close (pfile, buffer->cur);
1936 }
1937 }
1938 else
1939 {
1940 while (*buffer->cur != '\n')
1941 {
1942 if (*buffer->cur < utf8_continuation)
1943 {
1944 buffer->cur++;
1945 continue;
1946 }
1947 buffer->cur
1948 = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1949 warn_bidi_p, warn_invalid_utf8_p);
1950 }
1951 if (warn_bidi_p)
1952 maybe_warn_bidi_on_close (pfile, buffer->cur);
1953 }
1954
1955 _cpp_process_line_notes (pfile, true);
1956 return orig_line != pfile->line_table->highest_line;
1957 }
1958
1959 /* Skips whitespace, saving the next non-whitespace character. */
1960 static void
1961 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1962 {
1963 cpp_buffer *buffer = pfile->buffer;
1964 bool saw_NUL = false;
1965
1966 do
1967 {
1968 /* Horizontal space always OK. */
1969 if (c == ' ' || c == '\t')
1970 ;
1971 /* Just \f \v or \0 left. */
1972 else if (c == '\0')
1973 saw_NUL = true;
1974 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1975 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1976 CPP_BUF_COL (buffer),
1977 "%s in preprocessing directive",
1978 c == '\f' ? "form feed" : "vertical tab");
1979
1980 c = *buffer->cur++;
1981 }
1982 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1983 while (is_nvspace (c));
1984
1985 if (saw_NUL)
1986 {
1987 encoding_rich_location rich_loc (pfile);
1988 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1989 "null character(s) ignored");
1990 }
1991
1992 buffer->cur--;
1993 }
1994
1995 /* See if the characters of a number token are valid in a name (no
1996 '.', '+' or '-'). */
1997 static int
1998 name_p (cpp_reader *pfile, const cpp_string *string)
1999 {
2000 unsigned int i;
2001
2002 for (i = 0; i < string->len; i++)
2003 if (!is_idchar (string->text[i]))
2004 return 0;
2005
2006 return 1;
2007 }
2008
2009 /* After parsing an identifier or other sequence, produce a warning about
2010 sequences not in NFC/NFKC. */
2011 static void
2012 warn_about_normalization (cpp_reader *pfile,
2013 const cpp_token *token,
2014 const struct normalize_state *s,
2015 bool identifier)
2016 {
2017 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2018 && !pfile->state.skipping)
2019 {
2020 location_t loc = token->src_loc;
2021
2022 /* If possible, create a location range for the token. */
2023 if (loc >= RESERVED_LOCATION_COUNT
2024 && token->type != CPP_EOF
2025 /* There must be no line notes to process. */
2026 && (!(pfile->buffer->cur
2027 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2028 && !pfile->overlaid_buffer)))
2029 {
2030 source_range tok_range;
2031 tok_range.m_start = loc;
2032 tok_range.m_finish
2033 = linemap_position_for_column (pfile->line_table,
2034 CPP_BUF_COLUMN (pfile->buffer,
2035 pfile->buffer->cur));
2036 loc = pfile->line_table->get_or_create_combined_loc (loc, tok_range,
2037 nullptr, 0);
2038 }
2039
2040 encoding_rich_location rich_loc (pfile, loc);
2041
2042 /* Make sure that the token is printed using UCNs, even
2043 if we'd otherwise happily print UTF-8. */
2044 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2045 size_t sz;
2046
2047 sz = cpp_spell_token (pfile, token, buf, false) - buf;
2048 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2049 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2050 "`%.*s' is not in NFKC", (int) sz, buf);
2051 else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2052 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2053 "`%.*s' is not in NFC", (int) sz, buf);
2054 else
2055 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2056 "`%.*s' is not in NFC", (int) sz, buf);
2057 free (buf);
2058 }
2059 }
2060
2061 /* Returns TRUE if the byte sequence starting at buffer->cur is a valid
2062 extended character in an identifier. If FIRST is TRUE, then the character
2063 must be valid at the beginning of an identifier as well. If the return
2064 value is TRUE, then pfile->buffer->cur has been moved to point to the next
2065 byte after the extended character. */
2066
2067 static bool
2068 forms_identifier_p (cpp_reader *pfile, int first,
2069 struct normalize_state *state)
2070 {
2071 cpp_buffer *buffer = pfile->buffer;
2072 const bool warn_bidi_p = pfile->warn_bidi_p ();
2073
2074 if (*buffer->cur == '$')
2075 {
2076 if (!CPP_OPTION (pfile, dollars_in_ident))
2077 return false;
2078
2079 buffer->cur++;
2080 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2081 {
2082 CPP_OPTION (pfile, warn_dollars) = 0;
2083 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
2084 }
2085
2086 return true;
2087 }
2088
2089 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
2090 if (CPP_OPTION (pfile, extended_identifiers))
2091 {
2092 cppchar_t s;
2093 if (*buffer->cur >= utf8_signifier)
2094 {
2095 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2096 && warn_bidi_p)
2097 {
2098 location_t loc;
2099 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2100 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2101 }
2102 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2103 state, &s))
2104 return true;
2105 }
2106 else if (*buffer->cur == '\\'
2107 && (buffer->cur[1] == 'u'
2108 || buffer->cur[1] == 'U'
2109 || buffer->cur[1] == 'N'))
2110 {
2111 buffer->cur += 2;
2112 if (warn_bidi_p)
2113 {
2114 location_t loc;
2115 bidi::kind kind;
2116 if (buffer->cur[-1] == 'N')
2117 kind = get_bidi_named (pfile, buffer->cur, &loc);
2118 else
2119 kind = get_bidi_ucn (pfile, buffer->cur,
2120 buffer->cur[-1] == 'U', &loc);
2121 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2122 }
2123 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2124 state, &s, NULL, NULL))
2125 return true;
2126 buffer->cur -= 2;
2127 }
2128 }
2129
2130 return false;
2131 }
2132
2133 /* Helper function to issue error about improper __VA_OPT__ use. */
2134 static void
2135 maybe_va_opt_error (cpp_reader *pfile)
2136 {
2137 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2138 {
2139 /* __VA_OPT__ should not be accepted at all, but allow it in
2140 system headers. */
2141 if (!_cpp_in_system_header (pfile))
2142 {
2143 if (CPP_OPTION (pfile, cplusplus))
2144 cpp_error (pfile, CPP_DL_PEDWARN,
2145 "__VA_OPT__ is not available until C++20");
2146 else
2147 cpp_error (pfile, CPP_DL_PEDWARN,
2148 "__VA_OPT__ is not available until C23");
2149 }
2150 }
2151 else if (!pfile->state.va_args_ok)
2152 {
2153 /* __VA_OPT__ should only appear in the replacement list of a
2154 variadic macro. */
2155 cpp_error (pfile, CPP_DL_PEDWARN,
2156 "__VA_OPT__ can only appear in the expansion"
2157 " of a C++20 variadic macro");
2158 }
2159 }
2160
2161 /* Helper function to perform diagnostics that are needed (rarely)
2162 when an identifier is lexed. */
2163 static void
2164 identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
2165 {
2166 if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
2167 || pfile->state.skipping, 1))
2168 return;
2169
2170 /* It is allowed to poison the same identifier twice. */
2171 if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2172 {
2173 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2174 NODE_NAME (node));
2175 const auto data = (cpp_hashnode_extra *)
2176 ht_lookup (pfile->extra_hash_table, node->ident, HT_NO_INSERT);
2177 if (data && data->poisoned_loc)
2178 cpp_error_at (pfile, CPP_DL_NOTE, data->poisoned_loc, "poisoned here");
2179 }
2180
2181 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2182 replacement list of a variadic macro. */
2183 if (node == pfile->spec_nodes.n__VA_ARGS__
2184 && !pfile->state.va_args_ok)
2185 {
2186 if (CPP_OPTION (pfile, cplusplus))
2187 cpp_error (pfile, CPP_DL_PEDWARN,
2188 "__VA_ARGS__ can only appear in the expansion"
2189 " of a C++11 variadic macro");
2190 else
2191 cpp_error (pfile, CPP_DL_PEDWARN,
2192 "__VA_ARGS__ can only appear in the expansion"
2193 " of a C99 variadic macro");
2194 }
2195
2196 /* __VA_OPT__ should only appear in the replacement list of a
2197 variadic macro. */
2198 if (node == pfile->spec_nodes.n__VA_OPT__)
2199 maybe_va_opt_error (pfile);
2200
2201 /* For -Wc++-compat, warn about use of C++ named operators. */
2202 if (node->flags & NODE_WARN_OPERATOR)
2203 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2204 "identifier \"%s\" is a special operator name in C++",
2205 NODE_NAME (node));
2206 }
2207
2208 /* Helper function to get the cpp_hashnode of the identifier BASE. */
2209 static cpp_hashnode *
2210 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2211 {
2212 cpp_hashnode *result;
2213 const uchar *cur;
2214 unsigned int len;
2215 unsigned int hash = HT_HASHSTEP (0, *base);
2216
2217 cur = base + 1;
2218 while (ISIDNUM (*cur))
2219 {
2220 hash = HT_HASHSTEP (hash, *cur);
2221 cur++;
2222 }
2223 len = cur - base;
2224 hash = HT_HASHFINISH (hash, len);
2225 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2226 base, len, hash, HT_ALLOC));
2227 identifier_diagnostics_on_lex (pfile, result);
2228 return result;
2229 }
2230
2231 /* Get the cpp_hashnode of an identifier specified by NAME in
2232 the current cpp_reader object. If none is found, NULL is returned. */
2233 cpp_hashnode *
2234 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2235 {
2236 cpp_hashnode *result;
2237 result = lex_identifier_intern (pfile, (uchar *) name);
2238 return result;
2239 }
2240
2241 /* Lex an identifier starting at BASE. BUFFER->CUR is expected to point
2242 one past the first character at BASE, which may be a (possibly multi-byte)
2243 character if STARTS_UCN is true. */
2244 static cpp_hashnode *
2245 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2246 struct normalize_state *nst, cpp_hashnode **spelling)
2247 {
2248 cpp_hashnode *result;
2249 const uchar *cur;
2250 unsigned int len;
2251 unsigned int hash = HT_HASHSTEP (0, *base);
2252 const bool warn_bidi_p = pfile->warn_bidi_p ();
2253
2254 cur = pfile->buffer->cur;
2255 if (! starts_ucn)
2256 {
2257 while (ISIDNUM (*cur))
2258 {
2259 hash = HT_HASHSTEP (hash, *cur);
2260 cur++;
2261 }
2262 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2263 }
2264 pfile->buffer->cur = cur;
2265 if (starts_ucn || forms_identifier_p (pfile, false, nst))
2266 {
2267 /* Slower version for identifiers containing UCNs
2268 or extended chars (including $). */
2269 do {
2270 while (ISIDNUM (*pfile->buffer->cur))
2271 {
2272 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2273 pfile->buffer->cur++;
2274 }
2275 } while (forms_identifier_p (pfile, false, nst));
2276 if (warn_bidi_p)
2277 maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2278 result = _cpp_interpret_identifier (pfile, base,
2279 pfile->buffer->cur - base);
2280 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2281 }
2282 else
2283 {
2284 len = cur - base;
2285 hash = HT_HASHFINISH (hash, len);
2286
2287 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2288 base, len, hash, HT_ALLOC));
2289 *spelling = result;
2290 }
2291
2292 return result;
2293 }
2294
2295 /* Struct to hold the return value of the scan_cur_identifier () helper
2296 function below. */
2297
2298 struct scan_id_result
2299 {
2300 cpp_hashnode *node;
2301 normalize_state nst;
2302
2303 scan_id_result ()
2304 : node (nullptr)
2305 {
2306 nst = INITIAL_NORMALIZE_STATE;
2307 }
2308
2309 explicit operator bool () const { return node; }
2310 };
2311
2312 /* Helper function to scan an entire identifier beginning at
2313 pfile->buffer->cur, and possibly containing extended characters (UCNs
2314 and/or UTF-8). Returns the cpp_hashnode for the identifier on success, or
2315 else nullptr, as well as a normalize_state so that normalization warnings
2316 may be issued once the token lexing is complete. */
2317
2318 static scan_id_result
2319 scan_cur_identifier (cpp_reader *pfile)
2320 {
2321 const auto buffer = pfile->buffer;
2322 const auto begin = buffer->cur;
2323 scan_id_result result;
2324 if (ISIDST (*buffer->cur))
2325 {
2326 ++buffer->cur;
2327 cpp_hashnode *ignore;
2328 result.node = lex_identifier (pfile, begin, false, &result.nst, &ignore);
2329 }
2330 else if (forms_identifier_p (pfile, true, &result.nst))
2331 {
2332 /* buffer->cur has been moved already by the call
2333 to forms_identifier_p. */
2334 cpp_hashnode *ignore;
2335 result.node = lex_identifier (pfile, begin, true, &result.nst, &ignore);
2336 }
2337 return result;
2338 }
2339
2340 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2341 static void
2342 lex_number (cpp_reader *pfile, cpp_string *number,
2343 struct normalize_state *nst)
2344 {
2345 const uchar *cur;
2346 const uchar *base;
2347 uchar *dest;
2348
2349 base = pfile->buffer->cur - 1;
2350 do
2351 {
2352 const uchar *adj_digit_sep = NULL;
2353 cur = pfile->buffer->cur;
2354
2355 /* N.B. ISIDNUM does not include $. */
2356 while (ISIDNUM (*cur)
2357 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2358 || DIGIT_SEP (*cur)
2359 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2360 {
2361 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2362 /* Adjacent digit separators do not form part of the pp-number syntax.
2363 However, they can safely be diagnosed here as an error, since '' is
2364 not a valid preprocessing token. */
2365 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2366 adj_digit_sep = cur;
2367 cur++;
2368 }
2369 /* A number can't end with a digit separator. */
2370 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2371 --cur;
2372 if (adj_digit_sep && adj_digit_sep < cur)
2373 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2374
2375 pfile->buffer->cur = cur;
2376 }
2377 while (forms_identifier_p (pfile, false, nst));
2378
2379 number->len = cur - base;
2380 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2381 memcpy (dest, base, number->len);
2382 dest[number->len] = '\0';
2383 number->text = dest;
2384 }
2385
2386 /* Create a token of type TYPE with a literal spelling. */
2387 static void
2388 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2389 unsigned int len, enum cpp_ttype type)
2390 {
2391 token->type = type;
2392 token->val.str.len = len;
2393 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2394 }
2395
2396 /* Like create_literal(), but construct it from two separate strings
2397 which are concatenated. LEN2 may be 0 if no second string is
2398 required. */
2399 static void
2400 create_literal2 (cpp_reader *pfile, cpp_token *token, const uchar *base1,
2401 unsigned int len1, const uchar *base2, unsigned int len2,
2402 enum cpp_ttype type)
2403 {
2404 token->type = type;
2405 token->val.str.len = len1 + len2;
2406 uchar *const dest = _cpp_unaligned_alloc (pfile, len1 + len2 + 1);
2407 memcpy (dest, base1, len1);
2408 if (len2)
2409 memcpy (dest+len1, base2, len2);
2410 dest[len1 + len2] = 0;
2411 token->val.str.text = dest;
2412 }
2413
2414 const uchar *
2415 cpp_alloc_token_string (cpp_reader *pfile,
2416 const unsigned char *ptr, unsigned len)
2417 {
2418 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2419
2420 dest[len] = 0;
2421 memcpy (dest, ptr, len);
2422 return dest;
2423 }
2424
2425 /* A pair of raw buffer pointers. The currently open one is [1], the
2426 first one is [0]. Used for string literal lexing. */
2427 struct lit_accum {
2428 _cpp_buff *first;
2429 _cpp_buff *last;
2430 const uchar *rpos;
2431 size_t accum;
2432
2433 lit_accum ()
2434 : first (NULL), last (NULL), rpos (0), accum (0)
2435 {
2436 }
2437
2438 void append (cpp_reader *, const uchar *, size_t);
2439
2440 void read_begin (cpp_reader *);
2441 bool reading_p () const
2442 {
2443 return rpos != NULL;
2444 }
2445 char read_char ()
2446 {
2447 char c = *rpos++;
2448 if (rpos == BUFF_FRONT (last))
2449 rpos = NULL;
2450 return c;
2451 }
2452
2453 void create_literal2 (cpp_reader *pfile, cpp_token *token,
2454 const uchar *base1, unsigned int len1,
2455 const uchar *base2, unsigned int len2,
2456 enum cpp_ttype type);
2457 };
2458
2459 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2460 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2461
2462 void
2463 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2464 {
2465 if (!last)
2466 /* Starting. */
2467 first = last = _cpp_get_buff (pfile, len);
2468 else if (len > BUFF_ROOM (last))
2469 {
2470 /* There is insufficient room in the buffer. Copy what we can,
2471 and then either extend or create a new one. */
2472 size_t room = BUFF_ROOM (last);
2473 memcpy (BUFF_FRONT (last), base, room);
2474 BUFF_FRONT (last) += room;
2475 base += room;
2476 len -= room;
2477 accum += room;
2478
2479 gcc_checking_assert (!rpos);
2480
2481 last = _cpp_append_extend_buff (pfile, last, len);
2482 }
2483
2484 memcpy (BUFF_FRONT (last), base, len);
2485 BUFF_FRONT (last) += len;
2486 accum += len;
2487 }
2488
2489 void
2490 lit_accum::read_begin (cpp_reader *pfile)
2491 {
2492 /* We never accumulate more than 4 chars to read. */
2493 if (BUFF_ROOM (last) < 4)
2494
2495 last = _cpp_append_extend_buff (pfile, last, 4);
2496 rpos = BUFF_FRONT (last);
2497 }
2498
2499 /* Helper function to check if a string format macro, say from inttypes.h, is
2500 placed touching a string literal, in which case it could be parsed as a C++11
2501 user-defined string literal thus breaking the program. Return TRUE if the
2502 UDL should be ignored for now and preserved for potential macro
2503 expansion. */
2504
2505 static bool
2506 maybe_ignore_udl_macro_suffix (cpp_reader *pfile, location_t src_loc,
2507 const uchar *suffix_begin, cpp_hashnode *node)
2508 {
2509 /* User-defined literals outside of namespace std must start with a single
2510 underscore, so assume anything of that form really is a UDL suffix.
2511 We don't need to worry about UDLs defined inside namespace std because
2512 their names are reserved, so cannot be used as macro names in valid
2513 programs. */
2514 if ((suffix_begin[0] == '_' && suffix_begin[1] != '_')
2515 || !cpp_macro_p (node))
2516 return false;
2517
2518 /* Maybe raise a warning here; caller should arrange not to consume
2519 the tokens. */
2520 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2521 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, src_loc, 0,
2522 "invalid suffix on literal; C++11 requires a space "
2523 "between literal and string macro");
2524 return true;
2525 }
2526
2527 /* Like create_literal2(), but also prepend all the accumulated data from
2528 the lit_accum struct. */
2529 void
2530 lit_accum::create_literal2 (cpp_reader *pfile, cpp_token *token,
2531 const uchar *base1, unsigned int len1,
2532 const uchar *base2, unsigned int len2,
2533 enum cpp_ttype type)
2534 {
2535 const unsigned int tot_len = accum + len1 + len2;
2536 uchar *dest = _cpp_unaligned_alloc (pfile, tot_len + 1);
2537 token->type = type;
2538 token->val.str.len = tot_len;
2539 token->val.str.text = dest;
2540 for (_cpp_buff *buf = first; buf; buf = buf->next)
2541 {
2542 size_t len = BUFF_FRONT (buf) - buf->base;
2543 memcpy (dest, buf->base, len);
2544 dest += len;
2545 }
2546 memcpy (dest, base1, len1);
2547 dest += len1;
2548 if (len2)
2549 memcpy (dest, base2, len2);
2550 dest += len2;
2551 *dest = '\0';
2552 }
2553
/* Lexes a raw string.  The stored string contains the spelling,
   including double quotes, delimiter string, '(' and ')', any leading
   'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
   the type of the literal, or CPP_OTHER if it was not properly
   terminated.

   BASE is the start of the token.  Updates pfile->buffer->cur to just
   after the lexed string.

   The spelling is NUL-terminated, but it is not guaranteed that this
   is the first NUL since embedded NULs are preserved.

   A raw string may span several source lines, and line notes (escaped
   newlines, trigraphs) have to be undone inside it.  Text that cannot be
   taken straight from the current line buffer is therefore collected in
   a lit_accum; BASE..POS always denotes the not yet collected part of
   the current line.  If the input ends inside the string, TOKEN is
   turned into CPP_EOF instead and an error is reported.  */

static void
lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
{
  const uchar *pos = base;
  const bool warn_bidi_p = pfile->warn_bidi_p ();
  const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
  const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;

  /* 'tis a pity this information isn't passed down from the lexer's
     initial categorization of the token.  */
  enum cpp_ttype type = CPP_STRING;

  /* Re-derive the literal type from the encoding prefix and skip it.  */
  if (*pos == 'L')
    {
      type = CPP_WSTRING;
      pos++;
    }
  else if (*pos == 'U')
    {
      type = CPP_STRING32;
      pos++;
    }
  else if (*pos == 'u')
    {
      if (pos[1] == '8')
        {
          type = CPP_UTF8STRING;
          pos++;
        }
      else
        type = CPP_STRING16;
      pos++;
    }

  /* The caller only hands us text of the form R"...; step over R".  */
  gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
  pos += 2;

  _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];

  /* Skip notes before the ".  */
  while (note->pos < pos)
    ++note;

  lit_accum accum;

  /* PREFIX collects the delimiter (at most 16 characters) followed by a
     '"', i.e. exactly the text that must follow a ')' to close the
     string.  */
  uchar prefix[17];
  unsigned prefix_len = 0;
  /* PHASE_PREFIX: still reading the delimiter.  PHASE_NONE: inside the
     string body.  PHASE_SUFFIX and above: a ')' has been seen and the
     value is the index into PREFIX of the next character to match.  */
  enum Phase
  {
    PHASE_PREFIX = -2,
    PHASE_NONE = -1,
    PHASE_SUFFIX = 0
  } phase = PHASE_PREFIX;

  for (;;)
    {
      gcc_checking_assert (note->pos >= pos);

      /* Undo any escaped newlines and trigraphs.  The original text is
         appended to ACCUM after read_begin (), so that the characters
         are then fed back through read_char () below.  */
      if (!accum.reading_p () && note->pos == pos)
        switch (note->type)
          {
          case '\\':
          case ' ':
            /* Restore backslash followed by newline.  */
            accum.append (pfile, base, pos - base);
            base = pos;
            accum.read_begin (pfile);
            accum.append (pfile, UC"\\", 1);

          after_backslash:
            if (note->type == ' ')
              /* GNU backslash whitespace newline extension.  FIXME
                 could be any sequence of non-vertical space.  When we
                 can properly restore any such sequence, we should
                 mark this note as handled so _cpp_process_line_notes
                 doesn't warn.  */
              accum.append (pfile, UC" ", 1);

            accum.append (pfile, UC"\n", 1);
            note++;
            break;

          case '\n':
            /* This can happen for ??/<NEWLINE> when trigraphs are not
               being interpreted.  */
            gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
            note->type = 0;
            note++;
            break;

          default:
            gcc_checking_assert (_cpp_trigraph_map[note->type]);

            /* Don't warn about this trigraph in
               _cpp_process_line_notes, since trigraphs show up as
               trigraphs in raw strings.  */
            uchar type = note->type;
            note->type = 0;

            if (CPP_OPTION (pfile, trigraphs))
              {
                /* Put back the three-character spelling of the
                   trigraph.  */
                accum.append (pfile, base, pos - base);
                base = pos;
                accum.read_begin (pfile);
                accum.append (pfile, UC"??", 2);
                accum.append (pfile, &type, 1);

                /* ??/ followed by newline gets two line notes, one for
                   the trigraph and one for the backslash/newline.  */
                if (type == '/' && note[1].pos == pos)
                  {
                    note++;
                    gcc_assert (note->type == '\\' || note->type == ' ');
                    goto after_backslash;
                  }
                /* Skip the replacement character.  */
                base = ++pos;
              }

            note++;
            break;
          }

      /* Now get a char to process.  Either from an expanded note, or
         from the line buffer.  */
      bool read_note = accum.reading_p ();
      char c = read_note ? accum.read_char () : *pos++;

      if (phase == PHASE_PREFIX)
        {
          if (c == '(')
            {
              /* Done.  Terminate the delimiter with the closing quote.  */
              phase = PHASE_NONE;
              prefix[prefix_len++] = '"';
            }
          else if (prefix_len < 16
                   /* Prefix chars are any of the basic character set,
                      [lex.charset] except for '
                      ()\\\t\v\f\n'.  Optimized for a contiguous
                      alphabet.  */
                   /* Unlike a switch, this collapses down to one or
                      two shift and bitmask operations on an ASCII
                      system, with an outlier or two.  */
                   && (('Z' - 'A' == 25
                        ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
                        : ISIDST (c))
                       || (c >= '0' && c <= '9')
                       || c == '_' || c == '{' || c == '}'
                       || c == '[' || c == ']' || c == '#'
                       || c == '<' || c == '>' || c == '%'
                       || c == ':' || c == ';' || c == '.' || c == '?'
                       || c == '*' || c == '+' || c == '-' || c == '/'
                       || c == '^' || c == '&' || c == '|' || c == '~'
                       || c == '!' || c == '=' || c == ','
                       || c == '"' || c == '\''))
            prefix[prefix_len++] = c;
          else
            {
              /* Something is wrong.  */
              int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
              if (prefix_len == 16)
                cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
                                     col, "raw string delimiter longer "
                                     "than 16 characters");
              else if (c == '\n')
                cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
                                     col, "invalid new-line in raw "
                                     "string delimiter");
              else
                cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
                                     col, "invalid character '%c' in "
                                     "raw string delimiter", c);
              type = CPP_OTHER;
              phase = PHASE_NONE;
              /* Continue until we get a close quote, that's probably
                 the best failure mode.  */
              prefix_len = 0;
            }
          /* A newline still needs the line handling further down;
             everything else in the delimiter is done with.  */
          if (c != '\n')
            continue;
        }

      if (phase != PHASE_NONE)
        {
          /* Matching the text after a ')' against PREFIX.  */
          if (prefix[phase] != c)
            /* Mismatch: back to the string body.  C is examined again
               below, so a ')' restarts the match.  */
            phase = PHASE_NONE;
          else if (unsigned (phase + 1) == prefix_len)
            /* The whole delimiter and the closing quote matched.  */
            break;
          else
            {
              phase = Phase (phase + 1);
              continue;
            }
        }

      if (!prefix_len && c == '"')
        /* Failure mode lexing.  */
        goto out;
      else if (prefix_len && c == ')')
        phase = PHASE_SUFFIX;
      else if (!read_note && c == '\n')
        {
          /* End of the current line: the string continues on the next
             one, if there is one.  */
          pos--;
          pfile->buffer->cur = pos;
          if ((pfile->state.in_directive || pfile->state.parsing_args
               || pfile->state.in_deferred_pragma)
              && pfile->buffer->next_line >= pfile->buffer->rlimit)
            {
              cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
                                   "unterminated raw string");
              type = CPP_OTHER;
              goto out;
            }

          /* Save the rest of this line, newline included, before the
             line buffer is replaced.  */
          accum.append (pfile, base, pos - base + 1);
          _cpp_process_line_notes (pfile, false);

          if (pfile->buffer->next_line < pfile->buffer->rlimit)
            CPP_INCREMENT_LINE (pfile, 0);
          pfile->buffer->need_line = true;

          if (!get_fresh_line_impl<true> (pfile))
            {
              /* We ran out of file and failed to get a line.  */
              location_t src_loc = token->src_loc;
              token->type = CPP_EOF;
              /* Tell the compiler the line number of the EOF token.  */
              token->src_loc = pfile->line_table->highest_line;
              token->flags = BOL;
              if (accum.first)
                _cpp_release_buff (pfile, accum.first);
              cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
                                   "unterminated raw string");

              /* Now pop the buffer that get_fresh_line_impl() did not.  Popping
                 is not safe if processing a directive, however this cannot
                 happen as we already checked above that a line would be
                 available, and get_fresh_line_impl() can't fail in this
                 case.  */
              gcc_assert (!pfile->state.in_directive);
              _cpp_pop_buffer (pfile);

              return;
            }

          /* Carry on at the start of the fresh line.  */
          pos = base = pfile->buffer->cur;
          note = &pfile->buffer->notes[pfile->buffer->cur_note];
        }
      else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
               && warn_bidi_or_invalid_utf8_p)
        pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
                                          warn_invalid_utf8_p);
    }

  if (warn_bidi_p)
    maybe_warn_bidi_on_close (pfile, pos);

  /* The string was terminated properly; look for a user-defined literal
     suffix directly after the closing quote.  */
  if (CPP_OPTION (pfile, user_literals))
    {
      const uchar *const suffix_begin = pos;
      pfile->buffer->cur = pos;

      if (const auto sr = scan_cur_identifier (pfile))
        {
          if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
                                             suffix_begin, sr.node))
            /* The identifier names a macro: leave it to be lexed as a
               separate token.  */
            pfile->buffer->cur = suffix_begin;
          else
            {
              type = cpp_userdef_string_add_type (type);
              accum.create_literal2 (pfile, token, base, suffix_begin - base,
                                     NODE_NAME (sr.node), NODE_LEN (sr.node),
                                     type);
              if (accum.first)
                _cpp_release_buff (pfile, accum.first);
              warn_about_normalization (pfile, token, &sr.nst, true);
              return;
            }
        }
    }

 out:
  /* No suffix (or failure mode): the spelling ends at POS.  Take the
     cheap path when nothing had to be accumulated.  */
  pfile->buffer->cur = pos;
  if (!accum.accum)
    create_literal (pfile, token, base, pos - base, type);
  else
    {
      accum.create_literal2 (pfile, token, base, pos - base, nullptr, 0, type);
      _cpp_release_buff (pfile, accum.first);
    }
}
2859
2860 /* Lexes a string, character constant, or angle-bracketed header file
2861 name. The stored string contains the spelling, including opening
2862 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2863 'R' modifier. It returns the type of the literal, or CPP_OTHER
2864 if it was not properly terminated, or CPP_LESS for an unterminated
2865 header name which must be relexed as normal tokens.
2866
2867 The spelling is NUL-terminated, but it is not guaranteed that this
2868 is the first NUL since embedded NULs are preserved. */
2869 static void
2870 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2871 {
2872 bool saw_NUL = false;
2873 const uchar *cur;
2874 cppchar_t terminator;
2875 enum cpp_ttype type;
2876
2877 cur = base;
2878 terminator = *cur++;
2879 if (terminator == 'L' || terminator == 'U')
2880 terminator = *cur++;
2881 else if (terminator == 'u')
2882 {
2883 terminator = *cur++;
2884 if (terminator == '8')
2885 terminator = *cur++;
2886 }
2887 if (terminator == 'R')
2888 {
2889 lex_raw_string (pfile, token, base);
2890 return;
2891 }
2892 if (terminator == '"')
2893 type = (*base == 'L' ? CPP_WSTRING :
2894 *base == 'U' ? CPP_STRING32 :
2895 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2896 : CPP_STRING);
2897 else if (terminator == '\'')
2898 type = (*base == 'L' ? CPP_WCHAR :
2899 *base == 'U' ? CPP_CHAR32 :
2900 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2901 : CPP_CHAR);
2902 else
2903 terminator = '>', type = CPP_HEADER_NAME;
2904
2905 const bool warn_bidi_p = pfile->warn_bidi_p ();
2906 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2907 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2908 for (;;)
2909 {
2910 cppchar_t c = *cur++;
2911
2912 /* In #include-style directives, terminators are not escapable. */
2913 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2914 {
2915 if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2916 {
2917 location_t loc;
2918 bidi::kind kind;
2919 if (cur[0] == 'N')
2920 kind = get_bidi_named (pfile, cur + 1, &loc);
2921 else
2922 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
2923 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2924 }
2925 cur++;
2926 }
2927 else if (c == terminator)
2928 {
2929 if (warn_bidi_p)
2930 maybe_warn_bidi_on_close (pfile, cur - 1);
2931 break;
2932 }
2933 else if (c == '\n')
2934 {
2935 cur--;
2936 /* Unmatched quotes always yield undefined behavior, but
2937 greedy lexing means that what appears to be an unterminated
2938 header name may actually be a legitimate sequence of tokens. */
2939 if (terminator == '>')
2940 {
2941 token->type = CPP_LESS;
2942 return;
2943 }
2944 type = CPP_OTHER;
2945 break;
2946 }
2947 else if (c == '\0')
2948 saw_NUL = true;
2949 else if (__builtin_expect (c >= utf8_continuation, 0)
2950 && warn_bidi_or_invalid_utf8_p)
2951 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2952 warn_invalid_utf8_p);
2953 }
2954
2955 if (saw_NUL && !pfile->state.skipping)
2956 cpp_error (pfile, CPP_DL_WARNING,
2957 "null character(s) preserved in literal");
2958
2959 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2960 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2961 (int) terminator);
2962
2963 pfile->buffer->cur = cur;
2964 const uchar *const suffix_begin = cur;
2965
2966 if (CPP_OPTION (pfile, user_literals))
2967 {
2968 if (const auto sr = scan_cur_identifier (pfile))
2969 {
2970 if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2971 suffix_begin, sr.node))
2972 pfile->buffer->cur = suffix_begin;
2973 else
2974 {
2975 /* Grab user defined literal suffix. */
2976 type = cpp_userdef_char_add_type (type);
2977 type = cpp_userdef_string_add_type (type);
2978 create_literal2 (pfile, token, base, suffix_begin - base,
2979 NODE_NAME (sr.node), NODE_LEN (sr.node), type);
2980 warn_about_normalization (pfile, token, &sr.nst, true);
2981 return;
2982 }
2983 }
2984 }
2985 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2986 && !pfile->state.skipping)
2987 {
2988 const auto sr = scan_cur_identifier (pfile);
2989 /* Maybe raise a warning, but do not consume the tokens. */
2990 pfile->buffer->cur = suffix_begin;
2991 if (sr && cpp_macro_p (sr.node))
2992 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2993 token->src_loc, 0, "C++11 requires a space "
2994 "between string literal and macro");
2995 }
2996
2997 create_literal (pfile, token, base, cur - base, type);
2998 }
2999
3000 /* Return the comment table. The client may not make any assumption
3001 about the ordering of the table. */
3002 cpp_comment_table *
3003 cpp_get_comments (cpp_reader *pfile)
3004 {
3005 return &pfile->comments;
3006 }
3007
3008 /* Append a comment to the end of the comment table. */
3009 static void
3010 store_comment (cpp_reader *pfile, cpp_token *token)
3011 {
3012 int len;
3013
3014 if (pfile->comments.allocated == 0)
3015 {
3016 pfile->comments.allocated = 256;
3017 pfile->comments.entries = (cpp_comment *) xmalloc
3018 (pfile->comments.allocated * sizeof (cpp_comment));
3019 }
3020
3021 if (pfile->comments.count == pfile->comments.allocated)
3022 {
3023 pfile->comments.allocated *= 2;
3024 pfile->comments.entries = (cpp_comment *) xrealloc
3025 (pfile->comments.entries,
3026 pfile->comments.allocated * sizeof (cpp_comment));
3027 }
3028
3029 len = token->val.str.len;
3030
3031 /* Copy comment. Note, token may not be NULL terminated. */
3032 pfile->comments.entries[pfile->comments.count].comment =
3033 (char *) xmalloc (sizeof (char) * (len + 1));
3034 memcpy (pfile->comments.entries[pfile->comments.count].comment,
3035 token->val.str.text, len);
3036 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
3037
3038 /* Set source location. */
3039 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
3040
3041 /* Increment the count of entries in the comment table. */
3042 pfile->comments.count++;
3043 }
3044
3045 /* The stored comment includes the comment start and any terminator. */
3046 static void
3047 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
3048 cppchar_t type)
3049 {
3050 unsigned char *buffer;
3051 unsigned int len, clen, i;
3052
3053 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
3054
3055 /* C++ comments probably (not definitely) have moved past a new
3056 line, which we don't want to save in the comment. */
3057 if (is_vspace (pfile->buffer->cur[-1]))
3058 len--;
3059
3060 /* If we are currently in a directive or in argument parsing, then
3061 we need to store all C++ comments as C comments internally, and
3062 so we need to allocate a little extra space in that case.
3063
3064 Note that the only time we encounter a directive here is
3065 when we are saving comments in a "#define". */
3066 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3067 && type == '/') ? len + 2 : len;
3068
3069 buffer = _cpp_unaligned_alloc (pfile, clen);
3070
3071 token->type = CPP_COMMENT;
3072 token->val.str.len = clen;
3073 token->val.str.text = buffer;
3074
3075 buffer[0] = '/';
3076 memcpy (buffer + 1, from, len - 1);
3077
3078 /* Finish conversion to a C comment, if necessary. */
3079 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3080 {
3081 buffer[1] = '*';
3082 buffer[clen - 2] = '*';
3083 buffer[clen - 1] = '/';
3084 /* As there can be in a C++ comments illegal sequences for C comments
3085 we need to filter them out. */
3086 for (i = 2; i < (clen - 2); i++)
3087 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3088 buffer[i] = '|';
3089 }
3090
3091 /* Finally store this comment for use by clients of libcpp. */
3092 store_comment (pfile, token);
3093 }
3094
3095 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3096 comment. */
3097
3098 static bool
3099 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3100 {
3101 const unsigned char *from = comment_start + 1;
3102
3103 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3104 {
3105 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3106 don't recognize any comments. The latter only checks attributes,
3107 the former doesn't warn. */
3108 case 0:
3109 default:
3110 return false;
3111 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3112 content it has. */
3113 case 1:
3114 return true;
3115 case 2:
3116 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3117 .*falls?[ \t-]*thr(u|ough).* regex. */
3118 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3119 from++)
3120 {
3121 /* Is there anything like strpbrk with upper boundary, or
3122 memchr looking for 2 characters rather than just one? */
3123 if (from[0] != 'f' && from[0] != 'F')
3124 continue;
3125 if (from[1] != 'a' && from[1] != 'A')
3126 continue;
3127 if (from[2] != 'l' && from[2] != 'L')
3128 continue;
3129 if (from[3] != 'l' && from[3] != 'L')
3130 continue;
3131 from += sizeof "fall" - 1;
3132 if (from[0] == 's' || from[0] == 'S')
3133 from++;
3134 while (*from == ' ' || *from == '\t' || *from == '-')
3135 from++;
3136 if (from[0] != 't' && from[0] != 'T')
3137 continue;
3138 if (from[1] != 'h' && from[1] != 'H')
3139 continue;
3140 if (from[2] != 'r' && from[2] != 'R')
3141 continue;
3142 if (from[3] == 'u' || from[3] == 'U')
3143 return true;
3144 if (from[3] != 'o' && from[3] != 'O')
3145 continue;
3146 if (from[4] != 'u' && from[4] != 'U')
3147 continue;
3148 if (from[5] != 'g' && from[5] != 'G')
3149 continue;
3150 if (from[6] != 'h' && from[6] != 'H')
3151 continue;
3152 return true;
3153 }
3154 return false;
3155 case 3:
3156 case 4:
3157 break;
3158 }
3159
3160 /* Whole comment contents:
3161 -fallthrough
3162 @fallthrough@
3163 */
3164 if (*from == '-' || *from == '@')
3165 {
3166 size_t len = sizeof "fallthrough" - 1;
3167 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3168 return false;
3169 if (memcmp (from + 1, "fallthrough", len))
3170 return false;
3171 if (*from == '@')
3172 {
3173 if (from[len + 1] != '@')
3174 return false;
3175 len++;
3176 }
3177 from += 1 + len;
3178 }
3179 /* Whole comment contents (regex):
3180 lint -fallthrough[ \t]*
3181 */
3182 else if (*from == 'l')
3183 {
3184 size_t len = sizeof "int -fallthrough" - 1;
3185 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3186 return false;
3187 if (memcmp (from + 1, "int -fallthrough", len))
3188 return false;
3189 from += 1 + len;
3190 while (*from == ' ' || *from == '\t')
3191 from++;
3192 }
3193 /* Whole comment contents (regex):
3194 [ \t]*FALLTHR(U|OUGH)[ \t]*
3195 */
3196 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3197 {
3198 while (*from == ' ' || *from == '\t')
3199 from++;
3200 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
3201 return false;
3202 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3203 return false;
3204 from += sizeof "FALLTHR" - 1;
3205 if (*from == 'U')
3206 from++;
3207 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
3208 return false;
3209 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3210 return false;
3211 else
3212 from += sizeof "OUGH" - 1;
3213 while (*from == ' ' || *from == '\t')
3214 from++;
3215 }
3216 /* Whole comment contents (regex):
3217 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3218 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3219 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3220 */
3221 else
3222 {
3223 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3224 from++;
3225 unsigned char f = *from;
3226 bool all_upper = false;
3227 if (f == 'E' || f == 'e')
3228 {
3229 if ((size_t) (pfile->buffer->cur - from)
3230 < sizeof "else fallthru" - 1)
3231 return false;
3232 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
3233 all_upper = true;
3234 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3235 return false;
3236 from += sizeof "else" - 1;
3237 if (*from == ',')
3238 from++;
3239 if (*from != ' ')
3240 return false;
3241 from++;
3242 if (all_upper && *from == 'f')
3243 return false;
3244 if (f == 'e' && *from == 'F')
3245 return false;
3246 f = *from;
3247 }
3248 else if (f == 'I' || f == 'i')
3249 {
3250 if ((size_t) (pfile->buffer->cur - from)
3251 < sizeof "intentional fallthru" - 1)
3252 return false;
3253 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3254 sizeof "NTENTIONAL" - 1) == 0)
3255 all_upper = true;
3256 else if (memcmp (from + 1, "ntentional",
3257 sizeof "ntentional" - 1))
3258 return false;
3259 from += sizeof "intentional" - 1;
3260 if (*from == ' ')
3261 {
3262 from++;
3263 if (all_upper && *from == 'f')
3264 return false;
3265 }
3266 else if (all_upper)
3267 {
3268 if (memcmp (from, "LY F", sizeof "LY F" - 1))
3269 return false;
3270 from += sizeof "LY " - 1;
3271 }
3272 else
3273 {
3274 if (memcmp (from, "ly ", sizeof "ly " - 1))
3275 return false;
3276 from += sizeof "ly " - 1;
3277 }
3278 if (f == 'i' && *from == 'F')
3279 return false;
3280 f = *from;
3281 }
3282 if (f != 'F' && f != 'f')
3283 return false;
3284 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3285 return false;
3286 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3287 all_upper = true;
3288 else if (all_upper)
3289 return false;
3290 else if (memcmp (from + 1, "all", sizeof "all" - 1))
3291 return false;
3292 from += sizeof "fall" - 1;
3293 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3294 from += 2;
3295 else if (*from == ' ' || *from == '-')
3296 from++;
3297 else if (*from != (all_upper ? 'T' : 't'))
3298 return false;
3299 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3300 return false;
3301 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3302 return false;
3303 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3304 {
3305 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3306 return false;
3307 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3308 sizeof "hrough" - 1))
3309 return false;
3310 from += sizeof "through" - 1;
3311 }
3312 else
3313 from += sizeof "thru" - 1;
3314 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3315 from++;
3316 if (*from == '-')
3317 {
3318 from++;
3319 if (*comment_start == '*')
3320 {
3321 do
3322 {
3323 while (*from && *from != '*'
3324 && *from != '\n' && *from != '\r')
3325 from++;
3326 if (*from != '*' || from[1] == '/')
3327 break;
3328 from++;
3329 }
3330 while (1);
3331 }
3332 else
3333 while (*from && *from != '\n' && *from != '\r')
3334 from++;
3335 }
3336 }
3337 /* C block comment. */
3338 if (*comment_start == '*')
3339 {
3340 if (*from != '*' || from[1] != '/')
3341 return false;
3342 }
3343 /* C++ line comment. */
3344 else if (*from != '\n')
3345 return false;
3346
3347 return true;
3348 }
3349
3350 /* Allocate COUNT tokens for RUN. */
3351 void
3352 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3353 {
3354 run->base = XNEWVEC (cpp_token, count);
3355 run->limit = run->base + count;
3356 run->next = NULL;
3357 }
3358
3359 /* Returns the next tokenrun, or creates one if there is none. */
3360 static tokenrun *
3361 next_tokenrun (tokenrun *run)
3362 {
3363 if (run->next == NULL)
3364 {
3365 run->next = XNEW (tokenrun);
3366 run->next->prev = run;
3367 _cpp_init_tokenrun (run->next, 250);
3368 }
3369
3370 return run->next;
3371 }
3372
3373 /* Return the number of not yet processed token in a given
3374 context. */
3375 int
3376 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3377 {
3378 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3379 return (LAST (context).token - FIRST (context).token);
3380 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3381 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3382 return (LAST (context).ptoken - FIRST (context).ptoken);
3383 else
3384 abort ();
3385 }
3386
3387 /* Returns the token present at index INDEX in a given context. If
3388 INDEX is zero, the next token to be processed is returned. */
3389 static const cpp_token*
3390 _cpp_token_from_context_at (cpp_context *context, int index)
3391 {
3392 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3393 return &(FIRST (context).token[index]);
3394 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3395 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3396 return FIRST (context).ptoken[index];
3397 else
3398 abort ();
3399 }
3400
3401 /* Look ahead in the input stream. */
3402 const cpp_token *
3403 cpp_peek_token (cpp_reader *pfile, int index)
3404 {
3405 cpp_context *context = pfile->context;
3406 const cpp_token *peektok;
3407 int count;
3408
3409 /* First, scan through any pending cpp_context objects. */
3410 while (context->prev)
3411 {
3412 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3413
3414 if (index < (int) sz)
3415 return _cpp_token_from_context_at (context, index);
3416 index -= (int) sz;
3417 context = context->prev;
3418 }
3419
3420 /* We will have to read some new tokens after all (and do so
3421 without invalidating preceding tokens). */
3422 count = index;
3423 pfile->keep_tokens++;
3424
3425 /* For peeked tokens temporarily disable line_change reporting,
3426 until the tokens are parsed for real. */
3427 void (*line_change) (cpp_reader *, const cpp_token *, int)
3428 = pfile->cb.line_change;
3429 pfile->cb.line_change = NULL;
3430
3431 do
3432 {
3433 peektok = _cpp_lex_token (pfile);
3434 if (peektok->type == CPP_EOF)
3435 {
3436 index--;
3437 break;
3438 }
3439 else if (peektok->type == CPP_PRAGMA)
3440 {
3441 /* Don't peek past a pragma. */
3442 if (peektok == &pfile->directive_result)
3443 /* Save the pragma in the buffer. */
3444 *pfile->cur_token++ = *peektok;
3445 index--;
3446 break;
3447 }
3448 }
3449 while (index--);
3450
3451 _cpp_backup_tokens_direct (pfile, count - index);
3452 pfile->keep_tokens--;
3453 pfile->cb.line_change = line_change;
3454
3455 return peektok;
3456 }
3457
3458 /* Allocate a single token that is invalidated at the same time as the
3459 rest of the tokens on the line. Has its line and col set to the
3460 same as the last lexed token, so that diagnostics appear in the
3461 right place. */
3462 cpp_token *
3463 _cpp_temp_token (cpp_reader *pfile)
3464 {
3465 cpp_token *old, *result;
3466 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3467 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3468
3469 old = pfile->cur_token - 1;
3470 /* Any pre-existing lookaheads must not be clobbered. */
3471 if (la)
3472 {
3473 if (sz <= la)
3474 {
3475 tokenrun *next = next_tokenrun (pfile->cur_run);
3476
3477 if (sz < la)
3478 memmove (next->base + 1, next->base,
3479 (la - sz) * sizeof (cpp_token));
3480
3481 next->base[0] = pfile->cur_run->limit[-1];
3482 }
3483
3484 if (sz > 1)
3485 memmove (pfile->cur_token + 1, pfile->cur_token,
3486 MIN (la, sz - 1) * sizeof (cpp_token));
3487 }
3488
3489 if (!sz && pfile->cur_token == pfile->cur_run->limit)
3490 {
3491 pfile->cur_run = next_tokenrun (pfile->cur_run);
3492 pfile->cur_token = pfile->cur_run->base;
3493 }
3494
3495 result = pfile->cur_token++;
3496 result->src_loc = old->src_loc;
3497 return result;
3498 }
3499
3500 /* We're at the beginning of a logical line (so not in
3501 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3502 if we should enter deferred_pragma mode to tokenize the rest of the
3503 line as a module control-line. */
3504
3505 static void
3506 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3507 {
3508 unsigned backup = 0; /* Tokens we peeked. */
3509 cpp_hashnode *node = result->val.node.node;
3510 cpp_token *peek = result;
3511 cpp_token *keyword = peek;
3512 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3513 int header_count = 0;
3514
3515 /* Make sure the incoming state is as we expect it. This way we
3516 can restore it using constants. */
3517 gcc_checking_assert (!pfile->state.in_deferred_pragma
3518 && !pfile->state.skipping
3519 && !pfile->state.parsing_args
3520 && !pfile->state.angled_headers
3521 && (pfile->state.save_comments
3522 == !CPP_OPTION (pfile, discard_comments)));
3523
3524 /* Enter directives mode sufficiently for peeking. We don't have
3525 to actually set in_directive. */
3526 pfile->state.in_deferred_pragma = true;
3527
3528 /* These two fields are needed to process tokenization in deferred
3529 pragma mode. They are not used outside deferred pragma mode or
3530 directives mode. */
3531 pfile->state.pragma_allow_expansion = true;
3532 pfile->directive_line = result->src_loc;
3533
3534 /* Saving comments is incompatible with directives mode. */
3535 pfile->state.save_comments = 0;
3536
3537 if (node == n_modules[spec_nodes::M_EXPORT][0])
3538 {
3539 peek = _cpp_lex_direct (pfile);
3540 keyword = peek;
3541 backup++;
3542 if (keyword->type != CPP_NAME)
3543 goto not_module;
3544 node = keyword->val.node.node;
3545 if (!(node->flags & NODE_MODULE))
3546 goto not_module;
3547 }
3548
3549 if (node == n_modules[spec_nodes::M__IMPORT][0])
3550 /* __import */
3551 header_count = backup + 2 + 16;
3552 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3553 /* import */
3554 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3555 else if (node == n_modules[spec_nodes::M_MODULE][0])
3556 ; /* module */
3557 else
3558 goto not_module;
3559
3560 /* We've seen [export] {module|import|__import}. Check the next token. */
3561 if (header_count)
3562 /* After '{,__}import' a header name may appear. */
3563 pfile->state.angled_headers = true;
3564 peek = _cpp_lex_direct (pfile);
3565 backup++;
3566
3567 /* ... import followed by identifier, ':', '<' or
3568 header-name preprocessing tokens, or module
3569 followed by cpp-identifier, ':' or ';' preprocessing
3570 tokens. C++ keywords are not yet relevant. */
3571 if (peek->type == CPP_NAME
3572 || peek->type == CPP_COLON
3573 || (header_count
3574 ? (peek->type == CPP_LESS
3575 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3576 || peek->type == CPP_HEADER_NAME)
3577 : peek->type == CPP_SEMICOLON))
3578 {
3579 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3580 if (!pfile->state.pragma_allow_expansion)
3581 pfile->state.prevent_expansion++;
3582
3583 if (!header_count && linemap_included_from
3584 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3585 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3586 "module control-line cannot be in included file");
3587
3588 /* The first one or two tokens cannot be macro names. */
3589 for (int ix = backup; ix--;)
3590 {
3591 cpp_token *tok = ix ? keyword : result;
3592 cpp_hashnode *node = tok->val.node.node;
3593
3594 /* Don't attempt to expand the token. */
3595 tok->flags |= NO_EXPAND;
3596 if (_cpp_defined_macro_p (node)
3597 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3598 && !cpp_fun_like_macro_p (node))
3599 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3600 "module control-line \"%s\" cannot be"
3601 " an object-like macro",
3602 NODE_NAME (node));
3603 }
3604
3605 /* Map to underbar variants. */
3606 keyword->val.node.node = n_modules[header_count
3607 ? spec_nodes::M_IMPORT
3608 : spec_nodes::M_MODULE][1];
3609 if (backup != 1)
3610 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3611
3612 /* Maybe tell the tokenizer we expect a header-name down the
3613 road. */
3614 pfile->state.directive_file_token = header_count;
3615 }
3616 else
3617 {
3618 not_module:
3619 /* Drop out of directive mode. */
3620 /* We aaserted save_comments had this value upon entry. */
3621 pfile->state.save_comments
3622 = !CPP_OPTION (pfile, discard_comments);
3623 pfile->state.in_deferred_pragma = false;
3624 /* Do not let this remain on. */
3625 pfile->state.angled_headers = false;
3626 }
3627
3628 /* In either case we want to backup the peeked tokens. */
3629 if (backup)
3630 {
3631 /* If we saw EOL, we should drop it, because this isn't a module
3632 control-line after all. */
3633 bool eol = peek->type == CPP_PRAGMA_EOL;
3634 if (!eol || backup > 1)
3635 {
3636 /* Put put the peeked tokens back */
3637 _cpp_backup_tokens_direct (pfile, backup);
3638 /* But if the last one was an EOL, forget it. */
3639 if (eol)
3640 pfile->lookaheads--;
3641 }
3642 }
3643 }
3644
3645 /* Lex a token into RESULT (external interface). Takes care of issues
3646 like directive handling, token lookahead, multiple include
3647 optimization and skipping. */
3648 const cpp_token *
3649 _cpp_lex_token (cpp_reader *pfile)
3650 {
3651 cpp_token *result;
3652
3653 for (;;)
3654 {
3655 if (pfile->cur_token == pfile->cur_run->limit)
3656 {
3657 pfile->cur_run = next_tokenrun (pfile->cur_run);
3658 pfile->cur_token = pfile->cur_run->base;
3659 }
3660 /* We assume that the current token is somewhere in the current
3661 run. */
3662 if (pfile->cur_token < pfile->cur_run->base
3663 || pfile->cur_token >= pfile->cur_run->limit)
3664 abort ();
3665
3666 if (pfile->lookaheads)
3667 {
3668 pfile->lookaheads--;
3669 result = pfile->cur_token++;
3670 }
3671 else
3672 result = _cpp_lex_direct (pfile);
3673
3674 if (result->flags & BOL)
3675 {
3676 /* Is this a directive. If _cpp_handle_directive returns
3677 false, it is an assembler #. */
3678 if (result->type == CPP_HASH
3679 /* 6.10.3 p 11: Directives in a list of macro arguments
3680 gives undefined behavior. This implementation
3681 handles the directive as normal. */
3682 && pfile->state.parsing_args != 1)
3683 {
3684 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3685 {
3686 if (pfile->directive_result.type == CPP_PADDING)
3687 continue;
3688 result = &pfile->directive_result;
3689 }
3690 }
3691 else if (pfile->state.in_deferred_pragma)
3692 result = &pfile->directive_result;
3693 else if (result->type == CPP_NAME
3694 && (result->val.node.node->flags & NODE_MODULE)
3695 && !pfile->state.skipping
3696 /* Unlike regular directives, we do not deal with
3697 tokenizing module directives as macro arguments.
3698 That's not permitted. */
3699 && !pfile->state.parsing_args)
3700 {
3701 /* P1857. Before macro expansion, At start of logical
3702 line ... */
3703 /* We don't have to consider lookaheads at this point. */
3704 gcc_checking_assert (!pfile->lookaheads);
3705
3706 cpp_maybe_module_directive (pfile, result);
3707 }
3708
3709 if (pfile->cb.line_change && !pfile->state.skipping)
3710 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3711 }
3712
3713 /* We don't skip tokens in directives. */
3714 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3715 break;
3716
3717 /* Outside a directive, invalidate controlling macros. At file
3718 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3719 get here and MI optimization works. */
3720 pfile->mi_valid = false;
3721
3722 if (!pfile->state.skipping || result->type == CPP_EOF)
3723 break;
3724 }
3725
3726 return result;
3727 }
3728
3729 /* Returns true if a fresh line has been loaded. */
3730 template <bool lexing_raw_string>
3731 static bool
3732 get_fresh_line_impl (cpp_reader *pfile)
3733 {
3734 /* We can't get a new line until we leave the current directive, unless we
3735 are lexing a raw string, in which case it will be OK as long as we don't
3736 pop the current buffer. */
3737 if (!lexing_raw_string && pfile->state.in_directive)
3738 return false;
3739
3740 for (;;)
3741 {
3742 cpp_buffer *buffer = pfile->buffer;
3743
3744 if (!buffer->need_line)
3745 return true;
3746
3747 if (buffer->next_line < buffer->rlimit)
3748 {
3749 _cpp_clean_line (pfile);
3750 return true;
3751 }
3752
3753 /* We can't change buffers until we leave the current directive. */
3754 if (lexing_raw_string && pfile->state.in_directive)
3755 return false;
3756
3757 /* First, get out of parsing arguments state. */
3758 if (pfile->state.parsing_args)
3759 return false;
3760
3761 /* End of buffer. Non-empty files should end in a newline. */
3762 if (buffer->buf != buffer->rlimit
3763 && buffer->next_line > buffer->rlimit
3764 && !buffer->from_stage3)
3765 {
3766 /* Clip to buffer size. */
3767 buffer->next_line = buffer->rlimit;
3768 }
3769
3770 if (buffer->prev && !buffer->return_at_eof)
3771 _cpp_pop_buffer (pfile);
3772 else
3773 {
3774 /* End of translation. Do not pop the buffer yet. Increment
3775 line number so that the EOF token is on a line of its own
3776 (_cpp_lex_direct doesn't increment in that case, because
3777 it's hard for it to distinguish this special case). */
3778 CPP_INCREMENT_LINE (pfile, 0);
3779 return false;
3780 }
3781 }
3782 }
3783
3784 bool
3785 _cpp_get_fresh_line (cpp_reader *pfile)
3786 {
3787 return get_fresh_line_impl<false> (pfile);
3788 }
3789
3790
/* Set RESULT->type from the character at BUFFER->cur: if it equals
   CHAR, consume it and use THEN_TYPE; otherwise leave the cursor where
   it is and use ELSE_TYPE.  Variables named `buffer' and `result' must
   be in scope where the macro is used.  Every argument is
   parenthesized so that an expression argument cannot re-associate
   with the surrounding comparison or assignment.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do                                            \
    {                                           \
      if (*buffer->cur == (CHAR))               \
        {                                       \
          buffer->cur++;                        \
          result->type = (THEN_TYPE);           \
        }                                       \
      else                                      \
        result->type = (ELSE_TYPE);             \
    }                                           \
  while (0)
3799
3800 /* Lex a token into pfile->cur_token, which is also incremented, to
3801 get diagnostics pointing to the correct location.
3802
3803 Does not handle issues such as token lookahead, multiple-include
3804 optimization, directives, skipping etc. This function is only
3805 suitable for use by _cpp_lex_token, and in special cases like
3806 lex_expansion_token which doesn't care for any of these issues.
3807
3808 When meeting a newline, returns CPP_EOF if parsing a directive,
3809 otherwise returns to the start of the token buffer if permissible.
3810 Returns the location of the lexed token. */
3811 cpp_token *
3812 _cpp_lex_direct (cpp_reader *pfile)
3813 {
3814 cppchar_t c = 0;
3815 cpp_buffer *buffer;
3816 const unsigned char *comment_start;
3817 bool fallthrough_comment = false;
3818 cpp_token *result = pfile->cur_token++;
3819
3820 fresh_line:
3821 result->flags = 0;
3822 buffer = pfile->buffer;
3823 if (buffer->need_line)
3824 {
3825 if (pfile->state.in_deferred_pragma)
3826 {
3827 /* This can happen in cases like:
3828 #define loop(x) whatever
3829 #pragma omp loop
3830 where when trying to expand loop we need to peek
3831 next token after loop, but aren't still in_deferred_pragma
3832 mode but are in in_directive mode, so buffer->need_line
3833 is set, a CPP_EOF is peeked. */
3834 result->type = CPP_PRAGMA_EOL;
3835 pfile->state.in_deferred_pragma = false;
3836 if (!pfile->state.pragma_allow_expansion)
3837 pfile->state.prevent_expansion--;
3838 result->src_loc = pfile->line_table->highest_line;
3839 return result;
3840 }
3841 if (!_cpp_get_fresh_line (pfile))
3842 {
3843 result->type = CPP_EOF;
3844 /* Not a real EOF in a directive or arg parsing -- we refuse
3845 to advance to the next file now, and will once we're out
3846 of those modes. */
3847 if (!pfile->state.in_directive && !pfile->state.parsing_args)
3848 {
3849 /* Tell the compiler the line number of the EOF token. */
3850 result->src_loc = pfile->line_table->highest_line;
3851 result->flags = BOL;
3852 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3853 _cpp_pop_buffer (pfile);
3854 }
3855 else if (c == 0)
3856 result->src_loc = pfile->line_table->highest_line;
3857 return result;
3858 }
3859 if (buffer != pfile->buffer)
3860 fallthrough_comment = false;
3861 if (!pfile->keep_tokens)
3862 {
3863 pfile->cur_run = &pfile->base_run;
3864 result = pfile->base_run.base;
3865 pfile->cur_token = result + 1;
3866 }
3867 result->flags = BOL;
3868 if (pfile->state.parsing_args == 2)
3869 result->flags |= PREV_WHITE;
3870 }
3871 buffer = pfile->buffer;
3872 update_tokens_line:
3873 result->src_loc = pfile->line_table->highest_line;
3874
3875 skipped_white:
3876 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3877 && !pfile->overlaid_buffer)
3878 {
3879 _cpp_process_line_notes (pfile, false);
3880 result->src_loc = pfile->line_table->highest_line;
3881 }
3882 c = *buffer->cur++;
3883
3884 if (pfile->forced_token_location)
3885 result->src_loc = pfile->forced_token_location;
3886 else
3887 result->src_loc = linemap_position_for_column (pfile->line_table,
3888 CPP_BUF_COLUMN (buffer, buffer->cur));
3889
3890 switch (c)
3891 {
3892 case ' ': case '\t': case '\f': case '\v': case '\0':
3893 result->flags |= PREV_WHITE;
3894 skip_whitespace (pfile, c);
3895 goto skipped_white;
3896
3897 case '\n':
3898 /* Increment the line, unless this is the last line ... */
3899 if (buffer->cur < buffer->rlimit
3900 /* ... or this is a #include, (where _cpp_stack_file needs to
3901 unwind by one line) ... */
3902 || (pfile->state.in_directive > 1
3903 /* ... except traditional-cpp increments this elsewhere. */
3904 && !CPP_OPTION (pfile, traditional)))
3905 CPP_INCREMENT_LINE (pfile, 0);
3906 buffer->need_line = true;
3907 if (pfile->state.in_deferred_pragma)
3908 {
3909 /* Produce the PRAGMA_EOL on this line. File reading
3910 ensures there is always a \n at end of the buffer, thus
3911 in a deferred pragma we always see CPP_PRAGMA_EOL before
3912 any CPP_EOF. */
3913 result->type = CPP_PRAGMA_EOL;
3914 result->flags &= ~PREV_WHITE;
3915 pfile->state.in_deferred_pragma = false;
3916 if (!pfile->state.pragma_allow_expansion)
3917 pfile->state.prevent_expansion--;
3918 return result;
3919 }
3920 goto fresh_line;
3921
3922 case '0': case '1': case '2': case '3': case '4':
3923 case '5': case '6': case '7': case '8': case '9':
3924 {
3925 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3926 result->type = CPP_NUMBER;
3927 lex_number (pfile, &result->val.str, &nst);
3928 warn_about_normalization (pfile, result, &nst, false);
3929 break;
3930 }
3931
3932 case 'L':
3933 case 'u':
3934 case 'U':
3935 case 'R':
3936 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3937 wide strings or raw strings. */
3938 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3939 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3940 {
3941 if ((*buffer->cur == '\'' && c != 'R')
3942 || *buffer->cur == '"'
3943 || (*buffer->cur == 'R'
3944 && c != 'R'
3945 && buffer->cur[1] == '"'
3946 && CPP_OPTION (pfile, rliterals))
3947 || (*buffer->cur == '8'
3948 && c == 'u'
3949 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3950 && CPP_OPTION (pfile, utf8_char_literals)))
3951 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3952 && CPP_OPTION (pfile, rliterals)))))
3953 {
3954 lex_string (pfile, result, buffer->cur - 1);
3955 break;
3956 }
3957 }
3958 /* Fall through. */
3959
3960 case '_':
3961 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3962 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3963 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3964 case 's': case 't': case 'v': case 'w': case 'x':
3965 case 'y': case 'z':
3966 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3967 case 'G': case 'H': case 'I': case 'J': case 'K':
3968 case 'M': case 'N': case 'O': case 'P': case 'Q':
3969 case 'S': case 'T': case 'V': case 'W': case 'X':
3970 case 'Y': case 'Z':
3971 result->type = CPP_NAME;
3972 {
3973 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3974 const auto node = lex_identifier (pfile, buffer->cur - 1, false, &nst,
3975 &result->val.node.spelling);
3976 result->val.node.node = node;
3977 identifier_diagnostics_on_lex (pfile, node);
3978 warn_about_normalization (pfile, result, &nst, true);
3979 }
3980
3981 /* Convert named operators to their proper types. */
3982 if (result->val.node.node->flags & NODE_OPERATOR)
3983 {
3984 result->flags |= NAMED_OP;
3985 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3986 }
3987
3988 /* Signal FALLTHROUGH comment followed by another token. */
3989 if (fallthrough_comment)
3990 result->flags |= PREV_FALLTHROUGH;
3991 break;
3992
3993 case '\'':
3994 case '"':
3995 lex_string (pfile, result, buffer->cur - 1);
3996 break;
3997
3998 case '/':
3999 /* A potential block or line comment. */
4000 comment_start = buffer->cur;
4001 c = *buffer->cur;
4002
4003 if (c == '*')
4004 {
4005 if (_cpp_skip_block_comment (pfile))
4006 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
4007 }
4008 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
4009 {
4010 /* Don't warn for system headers. */
4011 if (_cpp_in_system_header (pfile))
4012 ;
4013 /* Warn about comments if pedantically GNUC89, and not
4014 in system headers. */
4015 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
4016 && CPP_PEDANTIC (pfile)
4017 && ! buffer->warned_cplusplus_comments)
4018 {
4019 if (cpp_error (pfile, CPP_DL_PEDWARN,
4020 "C++ style comments are not allowed in ISO C90"))
4021 cpp_error (pfile, CPP_DL_NOTE,
4022 "(this will be reported only once per input file)");
4023 buffer->warned_cplusplus_comments = 1;
4024 }
4025 /* Or if specifically desired via -Wc90-c99-compat. */
4026 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
4027 && ! CPP_OPTION (pfile, cplusplus)
4028 && ! buffer->warned_cplusplus_comments)
4029 {
4030 if (cpp_error (pfile, CPP_DL_WARNING,
4031 "C++ style comments are incompatible with C90"))
4032 cpp_error (pfile, CPP_DL_NOTE,
4033 "(this will be reported only once per input file)");
4034 buffer->warned_cplusplus_comments = 1;
4035 }
4036 /* In C89/C94, C++ style comments are forbidden. */
4037 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
4038 || CPP_OPTION (pfile, lang) == CLK_STDC94))
4039 {
4040 /* But don't be confused about valid code such as
4041 - // immediately followed by *,
4042 - // in a preprocessing directive,
4043 - // in an #if 0 block. */
4044 if (buffer->cur[1] == '*'
4045 || pfile->state.in_directive
4046 || pfile->state.skipping)
4047 {
4048 result->type = CPP_DIV;
4049 break;
4050 }
4051 else if (! buffer->warned_cplusplus_comments)
4052 {
4053 if (cpp_error (pfile, CPP_DL_ERROR,
4054 "C++ style comments are not allowed in "
4055 "ISO C90"))
4056 cpp_error (pfile, CPP_DL_NOTE,
4057 "(this will be reported only once per input "
4058 "file)");
4059 buffer->warned_cplusplus_comments = 1;
4060 }
4061 }
4062 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4063 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
4064 }
4065 else if (c == '=')
4066 {
4067 buffer->cur++;
4068 result->type = CPP_DIV_EQ;
4069 break;
4070 }
4071 else
4072 {
4073 result->type = CPP_DIV;
4074 break;
4075 }
4076
4077 if (fallthrough_comment_p (pfile, comment_start))
4078 fallthrough_comment = true;
4079
4080 if (pfile->cb.comment)
4081 {
4082 size_t len = pfile->buffer->cur - comment_start;
4083 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4084 len + 1);
4085 }
4086
4087 if (!pfile->state.save_comments)
4088 {
4089 result->flags |= PREV_WHITE;
4090 goto update_tokens_line;
4091 }
4092
4093 if (fallthrough_comment)
4094 result->flags |= PREV_FALLTHROUGH;
4095
4096 /* Save the comment as a token in its own right. */
4097 save_comment (pfile, result, comment_start, c);
4098 break;
4099
4100 case '<':
4101 if (pfile->state.angled_headers)
4102 {
4103 lex_string (pfile, result, buffer->cur - 1);
4104 if (result->type != CPP_LESS)
4105 break;
4106 }
4107
4108 result->type = CPP_LESS;
4109 if (*buffer->cur == '=')
4110 {
4111 buffer->cur++, result->type = CPP_LESS_EQ;
4112 if (*buffer->cur == '>'
4113 && CPP_OPTION (pfile, cplusplus)
4114 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4115 buffer->cur++, result->type = CPP_SPACESHIP;
4116 }
4117 else if (*buffer->cur == '<')
4118 {
4119 buffer->cur++;
4120 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4121 }
4122 else if (CPP_OPTION (pfile, digraphs))
4123 {
4124 if (*buffer->cur == ':')
4125 {
4126 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4127 three characters are <:: and the subsequent character
4128 is neither : nor >, the < is treated as a preprocessor
4129 token by itself". */
4130 if (CPP_OPTION (pfile, cplusplus)
4131 && CPP_OPTION (pfile, lang) != CLK_CXX98
4132 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4133 && buffer->cur[1] == ':'
4134 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4135 break;
4136
4137 buffer->cur++;
4138 result->flags |= DIGRAPH;
4139 result->type = CPP_OPEN_SQUARE;
4140 }
4141 else if (*buffer->cur == '%')
4142 {
4143 buffer->cur++;
4144 result->flags |= DIGRAPH;
4145 result->type = CPP_OPEN_BRACE;
4146 }
4147 }
4148 break;
4149
4150 case '>':
4151 result->type = CPP_GREATER;
4152 if (*buffer->cur == '=')
4153 buffer->cur++, result->type = CPP_GREATER_EQ;
4154 else if (*buffer->cur == '>')
4155 {
4156 buffer->cur++;
4157 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4158 }
4159 break;
4160
4161 case '%':
4162 result->type = CPP_MOD;
4163 if (*buffer->cur == '=')
4164 buffer->cur++, result->type = CPP_MOD_EQ;
4165 else if (CPP_OPTION (pfile, digraphs))
4166 {
4167 if (*buffer->cur == ':')
4168 {
4169 buffer->cur++;
4170 result->flags |= DIGRAPH;
4171 result->type = CPP_HASH;
4172 if (*buffer->cur == '%' && buffer->cur[1] == ':')
4173 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4174 }
4175 else if (*buffer->cur == '>')
4176 {
4177 buffer->cur++;
4178 result->flags |= DIGRAPH;
4179 result->type = CPP_CLOSE_BRACE;
4180 }
4181 }
4182 break;
4183
4184 case '.':
4185 result->type = CPP_DOT;
4186 if (ISDIGIT (*buffer->cur))
4187 {
4188 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4189 result->type = CPP_NUMBER;
4190 lex_number (pfile, &result->val.str, &nst);
4191 warn_about_normalization (pfile, result, &nst, false);
4192 }
4193 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4194 buffer->cur += 2, result->type = CPP_ELLIPSIS;
4195 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4196 buffer->cur++, result->type = CPP_DOT_STAR;
4197 break;
4198
4199 case '+':
4200 result->type = CPP_PLUS;
4201 if (*buffer->cur == '+')
4202 buffer->cur++, result->type = CPP_PLUS_PLUS;
4203 else if (*buffer->cur == '=')
4204 buffer->cur++, result->type = CPP_PLUS_EQ;
4205 break;
4206
4207 case '-':
4208 result->type = CPP_MINUS;
4209 if (*buffer->cur == '>')
4210 {
4211 buffer->cur++;
4212 result->type = CPP_DEREF;
4213 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4214 buffer->cur++, result->type = CPP_DEREF_STAR;
4215 }
4216 else if (*buffer->cur == '-')
4217 buffer->cur++, result->type = CPP_MINUS_MINUS;
4218 else if (*buffer->cur == '=')
4219 buffer->cur++, result->type = CPP_MINUS_EQ;
4220 break;
4221
4222 case '&':
4223 result->type = CPP_AND;
4224 if (*buffer->cur == '&')
4225 buffer->cur++, result->type = CPP_AND_AND;
4226 else if (*buffer->cur == '=')
4227 buffer->cur++, result->type = CPP_AND_EQ;
4228 break;
4229
4230 case '|':
4231 result->type = CPP_OR;
4232 if (*buffer->cur == '|')
4233 buffer->cur++, result->type = CPP_OR_OR;
4234 else if (*buffer->cur == '=')
4235 buffer->cur++, result->type = CPP_OR_EQ;
4236 break;
4237
4238 case ':':
4239 result->type = CPP_COLON;
4240 if (*buffer->cur == ':')
4241 {
4242 if (CPP_OPTION (pfile, scope))
4243 buffer->cur++, result->type = CPP_SCOPE;
4244 else
4245 result->flags |= COLON_SCOPE;
4246 }
4247 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4248 {
4249 buffer->cur++;
4250 result->flags |= DIGRAPH;
4251 result->type = CPP_CLOSE_SQUARE;
4252 }
4253 break;
4254
4255 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4256 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4257 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4258 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4259 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4260
4261 case '?': result->type = CPP_QUERY; break;
4262 case '~': result->type = CPP_COMPL; break;
4263 case ',': result->type = CPP_COMMA; break;
4264 case '(': result->type = CPP_OPEN_PAREN; break;
4265 case ')': result->type = CPP_CLOSE_PAREN; break;
4266 case '[': result->type = CPP_OPEN_SQUARE; break;
4267 case ']': result->type = CPP_CLOSE_SQUARE; break;
4268 case '{': result->type = CPP_OPEN_BRACE; break;
4269 case '}': result->type = CPP_CLOSE_BRACE; break;
4270 case ';': result->type = CPP_SEMICOLON; break;
4271
4272 /* @ is a punctuator in Objective-C. */
4273 case '@': result->type = CPP_ATSIGN; break;
4274
4275 default:
4276 {
4277 const uchar *base = --buffer->cur;
4278 static int no_warn_cnt;
4279
4280 /* Check for an extended identifier ($ or UCN or UTF-8). */
4281 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4282 if (forms_identifier_p (pfile, true, &nst))
4283 {
4284 result->type = CPP_NAME;
4285 const auto node = lex_identifier (pfile, base, true, &nst,
4286 &result->val.node.spelling);
4287 result->val.node.node = node;
4288 identifier_diagnostics_on_lex (pfile, node);
4289 warn_about_normalization (pfile, result, &nst, true);
4290 break;
4291 }
4292
4293 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4294 single token. */
4295 buffer->cur++;
4296 if (c >= utf8_signifier)
4297 {
4298 const uchar *pstr = base;
4299 cppchar_t s;
4300 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4301 {
4302 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4303 {
4304 buffer->cur = base;
4305 _cpp_warn_invalid_utf8 (pfile);
4306 }
4307 buffer->cur = pstr;
4308 }
4309 else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4310 {
4311 buffer->cur = base;
4312 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4313 buffer->cur = base + 1;
4314 no_warn_cnt = end - buffer->cur;
4315 }
4316 }
4317 else if (c >= utf8_continuation
4318 && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4319 {
4320 if (no_warn_cnt)
4321 --no_warn_cnt;
4322 else
4323 {
4324 buffer->cur = base;
4325 _cpp_warn_invalid_utf8 (pfile);
4326 buffer->cur = base + 1;
4327 }
4328 }
4329 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4330 break;
4331 }
4332
4333 }
4334
4335 /* Potentially convert the location of the token to a range. */
4336 if (result->src_loc >= RESERVED_LOCATION_COUNT
4337 && result->type != CPP_EOF)
4338 {
4339 /* Ensure that any line notes are processed, so that we have the
4340 correct physical line/column for the end-point of the token even
4341 when a logical line is split via one or more backslashes. */
4342 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4343 && !pfile->overlaid_buffer)
4344 _cpp_process_line_notes (pfile, false);
4345
4346 source_range tok_range;
4347 tok_range.m_start = result->src_loc;
4348 tok_range.m_finish
4349 = linemap_position_for_column (pfile->line_table,
4350 CPP_BUF_COLUMN (buffer, buffer->cur));
4351
4352 result->src_loc
4353 = pfile->line_table->get_or_create_combined_loc (result->src_loc,
4354 tok_range, nullptr, 0);
4355 }
4356
4357 return result;
4358 }
4359
4360 /* An upper bound on the number of bytes needed to spell TOKEN.
4361 Does not include preceding whitespace. */
4362 unsigned int
4363 cpp_token_len (const cpp_token *token)
4364 {
4365 unsigned int len;
4366
4367 switch (TOKEN_SPELL (token))
4368 {
4369 default: len = 6; break;
4370 case SPELL_LITERAL: len = token->val.str.len; break;
4371 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4372 }
4373
4374 return len;
4375 }
4376
/* Decode the UTF-8 sequence starting at NAME and store the matching
   \U escape (a backslash, 'U' and eight lower-case hex digits) in
   BUFFER.  Exactly 10 bytes are written to BUFFER; no terminating
   NUL is added.  Return the number of bytes of NAME that made up the
   sequence, as derived from the leading byte.  Aborts if a trailing
   byte does not have the 10xxxxxx continuation form.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  unsigned lead;
  unsigned long code_point;

  /* The number of leading one bits in the first byte gives the
     length of the whole sequence.  */
  for (lead = *name; lead & 0x80; lead <<= 1)
    seq_len++;

  /* Payload bits of the leading byte.  */
  code_point = *name & (0x7F >> seq_len);

  /* Fold in six payload bits from every continuation byte.  */
  for (int k = 1; k < seq_len; k++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (int digit = 0; digit < 8; digit++)
    buffer[2 + digit]
      = hex_digits[(code_point >> (4 * (7 - digit))) & 0xF];

  return seq_len;
}
4410
4411 /* Given a token TYPE corresponding to a digraph, return a pointer to
4412 the spelling of the digraph. */
4413 static const unsigned char *
4414 cpp_digraph2name (enum cpp_ttype type)
4415 {
4416 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4417 }
4418
4419 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4420 The buffer must already contain enough space to hold the
4421 token's spelling. Returns a pointer to the character after the
4422 last character written. */
4423 unsigned char *
4424 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4425 {
4426 size_t i;
4427 const unsigned char *name = NODE_NAME (ident);
4428
4429 for (i = 0; i < NODE_LEN (ident); i++)
4430 if (name[i] & ~0x7F)
4431 {
4432 i += utf8_to_ucn (buffer, name + i) - 1;
4433 buffer += 10;
4434 }
4435 else
4436 *buffer++ = name[i];
4437
4438 return buffer;
4439 }
4440
4441 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
4442 already contain enough space to hold the token's spelling.
4443 Returns a pointer to the character after the last character written.
4444 FORSTRING is true if this is to be the spelling after translation
4445 phase 1 (with the original spelling of extended identifiers), false
4446 if extended identifiers should always be written using UCNs (there is
4447 no option for always writing them in the internal UTF-8 form).
4448 FIXME: Would be nice if we didn't need the PFILE argument. */
4449 unsigned char *
4450 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4451 unsigned char *buffer, bool forstring)
4452 {
4453 switch (TOKEN_SPELL (token))
4454 {
4455 case SPELL_OPERATOR:
4456 {
4457 const unsigned char *spelling;
4458 unsigned char c;
4459
4460 if (token->flags & DIGRAPH)
4461 spelling = cpp_digraph2name (token->type);
4462 else if (token->flags & NAMED_OP)
4463 goto spell_ident;
4464 else
4465 spelling = TOKEN_NAME (token);
4466
4467 while ((c = *spelling++) != '\0')
4468 *buffer++ = c;
4469 }
4470 break;
4471
4472 spell_ident:
4473 case SPELL_IDENT:
4474 if (forstring)
4475 {
4476 memcpy (buffer, NODE_NAME (token->val.node.spelling),
4477 NODE_LEN (token->val.node.spelling));
4478 buffer += NODE_LEN (token->val.node.spelling);
4479 }
4480 else
4481 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4482 break;
4483
4484 case SPELL_LITERAL:
4485 memcpy (buffer, token->val.str.text, token->val.str.len);
4486 buffer += token->val.str.len;
4487 break;
4488
4489 case SPELL_NONE:
4490 cpp_error (pfile, CPP_DL_ICE,
4491 "unspellable token %s", TOKEN_NAME (token));
4492 break;
4493 }
4494
4495 return buffer;
4496 }
4497
4498 /* Returns TOKEN spelt as a null-terminated string. The string is
4499 freed when the reader is destroyed. Useful for diagnostics. */
4500 unsigned char *
4501 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4502 {
4503 unsigned int len = cpp_token_len (token) + 1;
4504 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4505
4506 end = cpp_spell_token (pfile, token, start, false);
4507 end[0] = '\0';
4508
4509 return start;
4510 }
4511
4512 /* Returns a pointer to a string which spells the token defined by
4513 TYPE and FLAGS. Used by C front ends, which really should move to
4514 using cpp_token_as_text. */
4515 const char *
4516 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4517 {
4518 if (flags & DIGRAPH)
4519 return (const char *) cpp_digraph2name (type);
4520 else if (flags & NAMED_OP)
4521 return cpp_named_operator2name (type);
4522
4523 return (const char *) token_spellings[type].name;
4524 }
4525
4526 /* Writes the spelling of token to FP, without any preceding space.
4527 Separated from cpp_spell_token for efficiency - to avoid stdio
4528 double-buffering. */
4529 void
4530 cpp_output_token (const cpp_token *token, FILE *fp)
4531 {
4532 switch (TOKEN_SPELL (token))
4533 {
4534 case SPELL_OPERATOR:
4535 {
4536 const unsigned char *spelling;
4537 int c;
4538
4539 if (token->flags & DIGRAPH)
4540 spelling = cpp_digraph2name (token->type);
4541 else if (token->flags & NAMED_OP)
4542 goto spell_ident;
4543 else
4544 spelling = TOKEN_NAME (token);
4545
4546 c = *spelling;
4547 do
4548 putc (c, fp);
4549 while ((c = *++spelling) != '\0');
4550 }
4551 break;
4552
4553 spell_ident:
4554 case SPELL_IDENT:
4555 {
4556 size_t i;
4557 const unsigned char * name = NODE_NAME (token->val.node.node);
4558
4559 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4560 if (name[i] & ~0x7F)
4561 {
4562 unsigned char buffer[10];
4563 i += utf8_to_ucn (buffer, name + i) - 1;
4564 fwrite (buffer, 1, 10, fp);
4565 }
4566 else
4567 fputc (NODE_NAME (token->val.node.node)[i], fp);
4568 }
4569 break;
4570
4571 case SPELL_LITERAL:
4572 if (token->type == CPP_HEADER_NAME)
4573 fputc ('"', fp);
4574 fwrite (token->val.str.text, 1, token->val.str.len, fp);
4575 if (token->type == CPP_HEADER_NAME)
4576 fputc ('"', fp);
4577 break;
4578
4579 case SPELL_NONE:
4580 /* An error, most probably. */
4581 break;
4582 }
4583 }
4584
4585 /* Compare two tokens. */
4586 int
4587 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4588 {
4589 if (a->type == b->type && a->flags == b->flags)
4590 switch (TOKEN_SPELL (a))
4591 {
4592 default: /* Keep compiler happy. */
4593 case SPELL_OPERATOR:
4594 /* token_no is used to track where multiple consecutive ##
4595 tokens were originally located. */
4596 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4597 case SPELL_NONE:
4598 return (a->type != CPP_MACRO_ARG
4599 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4600 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4601 case SPELL_IDENT:
4602 return (a->val.node.node == b->val.node.node
4603 && a->val.node.spelling == b->val.node.spelling);
4604 case SPELL_LITERAL:
4605 return (a->val.str.len == b->val.str.len
4606 && !memcmp (a->val.str.text, b->val.str.text,
4607 a->val.str.len));
4608 }
4609
4610 return 0;
4611 }
4612
4613 /* Returns nonzero if a space should be inserted to avoid an
4614 accidental token paste for output. For simplicity, it is
4615 conservative, and occasionally advises a space where one is not
4616 needed, e.g. "." and ".2". */
4617 int
4618 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4619 const cpp_token *token2)
4620 {
4621 enum cpp_ttype a = token1->type, b = token2->type;
4622 cppchar_t c;
4623
4624 if (token1->flags & NAMED_OP)
4625 a = CPP_NAME;
4626 if (token2->flags & NAMED_OP)
4627 b = CPP_NAME;
4628
4629 c = EOF;
4630 if (token2->flags & DIGRAPH)
4631 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4632 else if (token_spellings[b].category == SPELL_OPERATOR)
4633 c = token_spellings[b].name[0];
4634
4635 /* Quickly get everything that can paste with an '='. */
4636 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4637 return 1;
4638
4639 switch (a)
4640 {
4641 case CPP_GREATER: return c == '>';
4642 case CPP_LESS: return c == '<' || c == '%' || c == ':';
4643 case CPP_PLUS: return c == '+';
4644 case CPP_MINUS: return c == '-' || c == '>';
4645 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4646 case CPP_MOD: return c == ':' || c == '>';
4647 case CPP_AND: return c == '&';
4648 case CPP_OR: return c == '|';
4649 case CPP_COLON: return c == ':' || c == '>';
4650 case CPP_DEREF: return c == '*';
4651 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4652 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4653 case CPP_PRAGMA:
4654 case CPP_NAME: return ((b == CPP_NUMBER
4655 && name_p (pfile, &token2->val.str))
4656 || b == CPP_NAME
4657 || b == CPP_CHAR || b == CPP_STRING); /* L */
4658 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4659 || b == CPP_CHAR
4660 || c == '.' || c == '+' || c == '-');
4661 /* UCNs */
4662 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4663 && b == CPP_NAME)
4664 || (CPP_OPTION (pfile, objc)
4665 && token1->val.str.text[0] == '@'
4666 && (b == CPP_NAME || b == CPP_STRING)));
4667 case CPP_LESS_EQ: return c == '>';
4668 case CPP_STRING:
4669 case CPP_WSTRING:
4670 case CPP_UTF8STRING:
4671 case CPP_STRING16:
4672 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4673 && (b == CPP_NAME
4674 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4675 && ISIDST (token2->val.str.text[0]))));
4676
4677 default: break;
4678 }
4679
4680 return 0;
4681 }
4682
4683 /* Output all the remaining tokens on the current line, and a newline
4684 character, to FP. Leading whitespace is removed. If there are
4685 macros, special token padding is not performed. */
4686 void
4687 cpp_output_line (cpp_reader *pfile, FILE *fp)
4688 {
4689 const cpp_token *token;
4690
4691 token = cpp_get_token (pfile);
4692 while (token->type != CPP_EOF)
4693 {
4694 cpp_output_token (token, fp);
4695 token = cpp_get_token (pfile);
4696 if (token->flags & PREV_WHITE)
4697 putc (' ', fp);
4698 }
4699
4700 putc ('\n', fp);
4701 }
4702
4703 /* Return a string representation of all the remaining tokens on the
4704 current line. The result is allocated using xmalloc and must be
4705 freed by the caller. */
4706 unsigned char *
4707 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4708 {
4709 const cpp_token *token;
4710 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4711 unsigned int alloced = 120 + out;
4712 unsigned char *result = (unsigned char *) xmalloc (alloced);
4713
4714 /* If DIR_NAME is empty, there are no initial contents. */
4715 if (dir_name)
4716 {
4717 sprintf ((char *) result, "#%s ", dir_name);
4718 out += 2;
4719 }
4720
4721 token = cpp_get_token (pfile);
4722 while (token->type != CPP_EOF)
4723 {
4724 unsigned char *last;
4725 /* Include room for a possible space and the terminating nul. */
4726 unsigned int len = cpp_token_len (token) + 2;
4727
4728 if (out + len > alloced)
4729 {
4730 alloced *= 2;
4731 if (out + len > alloced)
4732 alloced = out + len;
4733 result = (unsigned char *) xrealloc (result, alloced);
4734 }
4735
4736 last = cpp_spell_token (pfile, token, &result[out], 0);
4737 out = last - result;
4738
4739 token = cpp_get_token (pfile);
4740 if (token->flags & PREV_WHITE)
4741 result[out++] = ' ';
4742 }
4743
4744 result[out] = '\0';
4745 return result;
4746 }
4747
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the distance from BUFF's
   cur pointer to its limit.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
/* MIN_EXTRA is parenthesized so that an argument containing a
   low-precedence operator still expands as a single operand.  */
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4762
4763 /* Create a new allocation buffer. Place the control block at the end
4764 of the buffer, so that buffer overflows will cause immediate chaos. */
4765 static _cpp_buff *
4766 new_buff (size_t len)
4767 {
4768 _cpp_buff *result;
4769 unsigned char *base;
4770
4771 if (len < MIN_BUFF_SIZE)
4772 len = MIN_BUFF_SIZE;
4773 len = CPP_ALIGN (len);
4774
4775 #ifdef ENABLE_VALGRIND_WORKAROUNDS
4776 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4777 struct first. */
4778 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4779 base = XNEWVEC (unsigned char, len + slen);
4780 result = (_cpp_buff *) base;
4781 base += slen;
4782 #else
4783 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4784 result = (_cpp_buff *) (base + len);
4785 #endif
4786 result->base = base;
4787 result->cur = base;
4788 result->limit = base + len;
4789 result->next = NULL;
4790 return result;
4791 }
4792
4793 /* Place a chain of unwanted allocation buffers on the free list. */
4794 void
4795 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4796 {
4797 _cpp_buff *end = buff;
4798
4799 while (end->next)
4800 end = end->next;
4801 end->next = pfile->free_buffs;
4802 pfile->free_buffs = buff;
4803 }
4804
4805 /* Return a free buffer of size at least MIN_SIZE. */
4806 _cpp_buff *
4807 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4808 {
4809 _cpp_buff *result, **p;
4810
4811 for (p = &pfile->free_buffs;; p = &(*p)->next)
4812 {
4813 size_t size;
4814
4815 if (*p == NULL)
4816 return new_buff (min_size);
4817 result = *p;
4818 size = result->limit - result->base;
4819 /* Return a buffer that's big enough, but don't waste one that's
4820 way too big. */
4821 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4822 break;
4823 }
4824
4825 *p = result->next;
4826 result->next = NULL;
4827 result->cur = result->base;
4828 return result;
4829 }
4830
4831 /* Creates a new buffer with enough space to hold the uncommitted
4832 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4833 the excess bytes to the new buffer. Chains the new buffer after
4834 BUFF, and returns the new buffer. */
4835 _cpp_buff *
4836 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4837 {
4838 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4839 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4840
4841 buff->next = new_buff;
4842 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4843 return new_buff;
4844 }
4845
4846 /* Creates a new buffer with enough space to hold the uncommitted
4847 remaining bytes of the buffer pointed to by BUFF, and at least
4848 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4849 Chains the new buffer before the buffer pointed to by BUFF, and
4850 updates the pointer to point to the new buffer. */
4851 void
4852 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4853 {
4854 _cpp_buff *new_buff, *old_buff = *pbuff;
4855 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4856
4857 new_buff = _cpp_get_buff (pfile, size);
4858 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4859 new_buff->next = old_buff;
4860 *pbuff = new_buff;
4861 }
4862
4863 /* Free a chain of buffers starting at BUFF. */
4864 void
4865 _cpp_free_buff (_cpp_buff *buff)
4866 {
4867 _cpp_buff *next;
4868
4869 for (; buff; buff = next)
4870 {
4871 next = buff->next;
4872 #ifdef ENABLE_VALGRIND_WORKAROUNDS
4873 free (buff);
4874 #else
4875 free (buff->base);
4876 #endif
4877 }
4878 }
4879
4880 /* Allocate permanent, unaligned storage of length LEN. */
4881 unsigned char *
4882 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4883 {
4884 _cpp_buff *buff = pfile->u_buff;
4885 unsigned char *result = buff->cur;
4886
4887 if (len > (size_t) (buff->limit - result))
4888 {
4889 buff = _cpp_get_buff (pfile, len);
4890 buff->next = pfile->u_buff;
4891 pfile->u_buff = buff;
4892 result = buff->cur;
4893 }
4894
4895 buff->cur = result + len;
4896 return result;
4897 }
4898
4899 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4900 That buffer is used for growing allocations when saving macro
4901 replacement lists in a #define, and when parsing an answer to an
4902 assertion in #assert, #unassert or #if (and therefore possibly
4903 whilst expanding macros). It therefore must not be used by any
4904 code that they might call: specifically the lexer and the guts of
4905 the macro expander.
4906
4907 All existing other uses clearly fit this restriction: storing
4908 registered pragmas during initialization. */
4909 unsigned char *
4910 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4911 {
4912 _cpp_buff *buff = pfile->a_buff;
4913 unsigned char *result = buff->cur;
4914
4915 if (len > (size_t) (buff->limit - result))
4916 {
4917 buff = _cpp_get_buff (pfile, len);
4918 buff->next = pfile->a_buff;
4919 pfile->a_buff = buff;
4920 result = buff->cur;
4921 }
4922
4923 buff->cur = result + len;
4924 return result;
4925 }
4926
4927 /* Commit or allocate storage from a buffer. */
4928
4929 void *
4930 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4931 {
4932 void *ptr = BUFF_FRONT (pfile->a_buff);
4933
4934 if (pfile->hash_table->alloc_subobject)
4935 {
4936 void *copy = pfile->hash_table->alloc_subobject (size);
4937 memcpy (copy, ptr, size);
4938 ptr = copy;
4939 }
4940 else
4941 BUFF_FRONT (pfile->a_buff) += size;
4942
4943 return ptr;
4944 }
4945
4946 /* Say which field of TOK is in use. */
4947
4948 enum cpp_token_fld_kind
4949 cpp_token_val_index (const cpp_token *tok)
4950 {
4951 switch (TOKEN_SPELL (tok))
4952 {
4953 case SPELL_IDENT:
4954 return CPP_TOKEN_FLD_NODE;
4955 case SPELL_LITERAL:
4956 return CPP_TOKEN_FLD_STR;
4957 case SPELL_OPERATOR:
4958 /* Operands which were originally spelled as ident keep around
4959 the node for the exact spelling. */
4960 if (tok->flags & NAMED_OP)
4961 return CPP_TOKEN_FLD_NODE;
4962 else if (tok->type == CPP_PASTE)
4963 return CPP_TOKEN_FLD_TOKEN_NO;
4964 else
4965 return CPP_TOKEN_FLD_NONE;
4966 case SPELL_NONE:
4967 if (tok->type == CPP_MACRO_ARG)
4968 return CPP_TOKEN_FLD_ARG_NO;
4969 else if (tok->type == CPP_PADDING)
4970 return CPP_TOKEN_FLD_SOURCE;
4971 else if (tok->type == CPP_PRAGMA)
4972 return CPP_TOKEN_FLD_PRAGMA;
4973 /* fall through */
4974 default:
4975 return CPP_TOKEN_FLD_NONE;
4976 }
4977 }
4978
4979 /* All tokens lexed in R after calling this function will be forced to
4980 have their location_t to be P, until
4981 cpp_stop_forcing_token_locations is called for R. */
4982
4983 void
4984 cpp_force_token_locations (cpp_reader *r, location_t loc)
4985 {
4986 r->forced_token_location = loc;
4987 }
4988
4989 /* Go back to assigning locations naturally for lexed tokens. */
4990
4991 void
4992 cpp_stop_forcing_token_locations (cpp_reader *r)
4993 {
4994 r->forced_token_location = 0;
4995 }
4996
/* PEEK points at a backslash.  If the backslash escapes an end of
   line (LF, CR or CR LF), look past it, and keep going for as long as
   the character found there starts another escaped end of line.
   Never advance to LIMIT or beyond: if the position after an escaped
   end of line is not below LIMIT, the pointer stays on the backslash
   that introduced it.  Return the resulting position, which is PEEK
   itself when the backslash escapes nothing.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A lone CR, or a CR LF pair.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* Not an escaped end of line.  */
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse.  */
      if (*peek != '\\')
        return peek;
    }
}
5026
5027 static const unsigned char *
5028 do_peek_next (const unsigned char *peek, const unsigned char *limit)
5029 {
5030 if (__builtin_expect (*peek == '\\', false))
5031 peek = do_peek_backslash (peek, limit);
5032 return peek;
5033 }
5034
/* Step backwards from PEEK to the preceding character, never going
   below BOUND.  Return NULL if PEEK is already at BOUND.

   If the preceding character is an end of line (LF, CR, or the LF of
   a CR LF pair) that is itself preceded by a backslash, the line
   ending is an escaped one: continue stepping backwards from the
   backslash, which may again yield NULL if the backslash sits at
   BOUND.  Otherwise return a pointer to the preceding character.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* Only line endings need a closer look.  The carriage return must
     be tested as '\r'; testing for the letter 'r' would treat the
     text "\r" as an escaped line ending and miss a real backslash-CR.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset from PEEK of the character in front of the line
         ending: -1, or -2 when the line ending is CR LF.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);
    }

  return peek;
}
5063
5064 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
5065 space. Otherwise return NULL. */
5066
5067 static const unsigned char *
5068 do_peek_ident (const char *match, const unsigned char *peek,
5069 const unsigned char *limit)
5070 {
5071 for (; *++match; peek++)
5072 if (*peek != *match)
5073 {
5074 peek = do_peek_next (peek, limit);
5075 if (*peek != *match)
5076 return NULL;
5077 }
5078
5079 /* Must now not be looking at an identifier char. */
5080 peek = do_peek_next (peek, limit);
5081 if (ISIDNUM (*peek))
5082 return NULL;
5083
5084 /* Skip control-line whitespace. */
5085 ws:
5086 while (*peek == ' ' || *peek == '\t')
5087 peek++;
5088 if (__builtin_expect (*peek == '\\', false))
5089 {
5090 peek = do_peek_backslash (peek, limit);
5091 if (*peek != '\\')
5092 goto ws;
5093 }
5094
5095 return peek;
5096 }
5097
5098 /* Are we looking at a module control line starting as PEEK - 1? */
5099
5100 static bool
5101 do_peek_module (cpp_reader *pfile, unsigned char c,
5102 const unsigned char *peek, const unsigned char *limit)
5103 {
5104 bool import = false;
5105
5106 if (__builtin_expect (c == 'e', false))
5107 {
5108 if (!((peek[0] == 'x' || peek[0] == '\\')
5109 && (peek = do_peek_ident ("export", peek, limit))))
5110 return false;
5111
5112 /* export, peek for import or module. No need to peek __import
5113 here. */
5114 if (peek[0] == 'i')
5115 {
5116 if (!((peek[1] == 'm' || peek[1] == '\\')
5117 && (peek = do_peek_ident ("import", peek + 1, limit))))
5118 return false;
5119 import = true;
5120 }
5121 else if (peek[0] == 'm')
5122 {
5123 if (!((peek[1] == 'o' || peek[1] == '\\')
5124 && (peek = do_peek_ident ("module", peek + 1, limit))))
5125 return false;
5126 }
5127 else
5128 return false;
5129 }
5130 else if (__builtin_expect (c == 'i', false))
5131 {
5132 if (!((peek[0] == 'm' || peek[0] == '\\')
5133 && (peek = do_peek_ident ("import", peek, limit))))
5134 return false;
5135 import = true;
5136 }
5137 else if (__builtin_expect (c == '_', false))
5138 {
5139 /* Needed for translated includes. */
5140 if (!((peek[0] == '_' || peek[0] == '\\')
5141 && (peek = do_peek_ident ("__import", peek, limit))))
5142 return false;
5143 import = true;
5144 }
5145 else if (__builtin_expect (c == 'm', false))
5146 {
5147 if (!((peek[0] == 'o' || peek[0] == '\\')
5148 && (peek = do_peek_ident ("module", peek, limit))))
5149 return false;
5150 }
5151 else
5152 return false;
5153
5154 /* Peek the next character to see if it's good enough. We'll be at
5155 the first non-whitespace char, including skipping an escaped
5156 newline. */
5157 /* ... import followed by identifier, ':', '<' or header-name
5158 preprocessing tokens, or module followed by identifier, ':' or
5159 ';' preprocessing tokens. */
5160 unsigned char p = *peek++;
5161
5162 /* A character literal is ... single quotes, ... optionally preceded
5163 by u8, u, U, or L */
5164 /* A string-literal is a ... double quotes, optionally prefixed by
5165 R, u8, u8R, u, uR, U, UR, L, or LR */
5166 if (p == 'u')
5167 {
5168 peek = do_peek_next (peek, limit);
5169 if (*peek == '8')
5170 {
5171 peek++;
5172 goto peek_u8;
5173 }
5174 goto peek_u;
5175 }
5176 else if (p == 'U' || p == 'L')
5177 {
5178 peek_u8:
5179 peek = do_peek_next (peek, limit);
5180 peek_u:
5181 if (*peek == '\"' || *peek == '\'')
5182 return false;
5183
5184 if (*peek == 'R')
5185 goto peek_R;
5186 /* Identifier. Ok. */
5187 }
5188 else if (p == 'R')
5189 {
5190 peek_R:
5191 if (CPP_OPTION (pfile, rliterals))
5192 {
5193 peek = do_peek_next (peek, limit);
5194 if (*peek == '\"')
5195 return false;
5196 }
5197 /* Identifier. Ok. */
5198 }
5199 else if ('Z' - 'A' == 25
5200 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5201 : ISIDST (p))
5202 {
5203 /* Identifier. Ok. */
5204 }
5205 else if (p == '<')
5206 {
5207 /* Maybe angle header, ok for import. Reject
5208 '<=', '<<' digraph:'<:'. */
5209 if (!import)
5210 return false;
5211 peek = do_peek_next (peek, limit);
5212 if (*peek == '=' || *peek == '<'
5213 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5214 return false;
5215 }
5216 else if (p == ';')
5217 {
5218 /* SEMICOLON, ok for module. */
5219 if (import)
5220 return false;
5221 }
5222 else if (p == '"')
5223 {
5224 /* STRING, ok for import. */
5225 if (!import)
5226 return false;
5227 }
5228 else if (p == ':')
5229 {
5230 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
5231 peek = do_peek_next (peek, limit);
5232 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5233 return false;
5234 }
5235 else
5236 /* FIXME: Detect a unicode character, excluding those not
5237 permitted as the initial character. [lex.name]/1. I presume
5238 we need to check the \[uU] spellings, and directly using
5239 Unicode in say UTF8 form? Or perhaps we do the phase-1
5240 conversion of UTF8 to universal-character-names? */
5241 return false;
5242
5243 return true;
5244 }
5245
5246 /* Directives-only scanning. Somewhat more relaxed than correct
5247 parsing -- some ill-formed programs will not be rejected.

   For each buffer reachable through PFILE->buffer, the text from
   buffer->next_line up to buffer->rlimit is walked character by
   character without being tokenized.  The walk follows just enough
   structure (line ends, backslash-newline, comments, character,
   string and raw string literals, and ' inside numbers) to decide
   whether a '#', or a line accepted by do_peek_module when the
   module_directives option is set, really begins a line.

   CB is called with PFILE, a task code, DATA and task-specific
   arguments:
     CPP_DO_print     a line count, a pointer to the pending text and
                      its length;
     CPP_DO_location  pfile->line_table->highest_line, after a
                      directive or module control line when more of
                      the buffer remains;
     CPP_DO_token     a token and its location, once per token of a
                      module control line.

   A '#' found at the beginning of a line is passed to
   _cpp_handle_directive.  When the end of a buffer is reached the
   buffer is popped; the function returns once PFILE->buffer is
   null.  */
5248
5249 void
5250 cpp_directive_only_process (cpp_reader *pfile,
5251 void *data,
5252 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5253 {
/* Module control lines are only looked for when this option is set.  */
5254 bool module_p = CPP_OPTION (pfile, module_directives);
5255
5256 do
5257 {
/* Re-entered after every directive and module control line: all of
   the scan state below is rebuilt from pfile->buffer and its
   next_line pointer.  */
5258 restart:
5259 /* Buffer initialization, but no line cleaning. */
5260 cpp_buffer *buffer = pfile->buffer;
5261 buffer->cur_note = buffer->notes_used = 0;
5262 buffer->cur = buffer->line_base = buffer->next_line;
5263 buffer->need_line = false;
5264 /* Files always end in a newline or carriage return. We rely on this for
5265 character peeking safety. */
5266 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5267
/* BASE is the start of the text not yet handed to CB as CPP_DO_print,
   LINE_COUNT the number of line ends consumed since BASE, and
   LINE_START the start of the current physical line (used to compute
   columns).  */
5268 const unsigned char *base = buffer->cur;
5269 unsigned line_count = 0;
5270 const unsigned char *line_start = base;
5271
/* BOL stays true while nothing but whitespace and comments has been
   seen on the current logical line.  RAW is set when an 'R' was
   found directly before a '"', so that the literal is scanned as a
   raw string.  */
5272 bool bol = true;
5273 bool raw = false;
5274
/* Low water mark: the lower bound given to do_peek_prev.  It is moved
   forward after every comment and literal.  */
5275 const unsigned char *lwm = base;
5276 for (const unsigned char *pos = base, *limit = buffer->rlimit;
5277 pos < limit;)
5278 {
5279 unsigned char c = *pos++;
5280 /* This matches the switch in _cpp_lex_direct. */
5281 switch (c)
5282 {
5283 case ' ': case '\t': case '\f': case '\v':
5284 /* Whitespace, do nothing. */
5285 break;
5286
5287 case '\r': /* MAC line ending, or Windows \r\n */
5288 if (*pos == '\n')
5289 pos++;
5290 /* FALLTHROUGH */
5291
5292 case '\n':
5293 bol = true;
5294
5295 next_line:
/* Also the target for <backslash><newline>, which counts a line but
   leaves BOL as it was.  */
5296 CPP_INCREMENT_LINE (pfile, 0);
5297 line_count++;
5298 line_start = pos;
5299 break;
5300
5301 case '\\':
5302 /* <backslash><newline> is removed, and doesn't undo any
5303 preceding escape or whatnot. */
5304 if (*pos == '\n')
5305 {
5306 pos++;
5307 goto next_line;
5308 }
5309 else if (*pos == '\r')
5310 {
5311 if (pos[1] == '\n')
5312 pos++;
5313 pos++;
5314 goto next_line;
5315 }
5316 goto dflt;
5317
5318 case '#':
5319 if (bol)
5320 {
5321 /* Line directive. */
/* Hand any text that precedes the '#' to the callback first.  */
5322 if (pos - 1 > base && !pfile->state.skipping)
5323 cb (pfile, CPP_DO_print, data,
5324 line_count, base, pos - 1 - base);
5325
5326 /* Prep things for directive handling. */
5327 buffer->next_line = pos;
5328 buffer->need_line = true;
5329 bool ok = _cpp_get_fresh_line (pfile);
5330 gcc_checking_assert (ok);
5331
5332 /* Ensure proper column numbering for generated
5333 error messages. */
5334 buffer->line_base -= pos - line_start;
5335
/* The second argument is true when the '#' was not the first
   character on its line.  */
5336 _cpp_handle_directive (pfile, line_start + 1 != pos);
5337
5338 /* Sanitize the line settings. Duplicate #include's can
5339 mess things up. */
5340 // FIXME: Necessary?
5341 pfile->line_table->highest_location
5342 = pfile->line_table->highest_line;
5343
/* Report the new location unless skipping or the current buffer has
   no text left.  */
5344 if (!pfile->state.skipping
5345 && pfile->buffer->next_line < pfile->buffer->rlimit)
5346 cb (pfile, CPP_DO_location, data,
5347 pfile->line_table->highest_line);
5348
5349 goto restart;
5350 }
5351 goto dflt;
5352
5353 case '/':
5354 {
/* Only a following '/' or '*' makes this a comment.  PEEK comes from
   do_peek_next (presumably the next character with any
   backslash-newline skipped -- confirm against its definition).  */
5355 const unsigned char *peek = do_peek_next (pos, limit);
5356 if (!(*peek == '/' || *peek == '*'))
5357 goto dflt;
5358
5359 /* Line or block comment */
/* STAR: the previous character was a '*' inside a block comment.
   ESC: the previous character was a backslash.  SLOC is the location
   used for the "unterminated comment" diagnostic.  */
5360 bool is_block = *peek == '*';
5361 bool star = false;
5362 bool esc = false;
5363 location_t sloc
5364 = linemap_position_for_column (pfile->line_table,
5365 pos - line_start);
5366
5367 while (pos < limit)
5368 {
5369 char c = *pos++;
5370 switch (c)
5371 {
5372 case '\\':
5373 esc = true;
5374 break;
5375
5376 case '\r':
5377 if (*pos == '\n')
5378 pos++;
5379 /* FALLTHROUGH */
5380
5381 case '\n':
5382 {
5383 CPP_INCREMENT_LINE (pfile, 0);
5384 line_count++;
5385 line_start = pos;
/* A line end that is not escaped finishes a line comment.  */
5386 if (!esc && !is_block)
5387 {
5388 bol = true;
5389 goto done_comment;
5390 }
5391 }
5392 if (!esc)
5393 star = false;
5394 esc = false;
5395 break;
5396
5397 case '*':
/* NOTE(review): POS was already advanced past this '*', so the test
   below also holds when PEEK addresses the '*' that opened the
   comment; confirm that slash-star-slash is meant to be accepted as
   a complete comment here.  */
5398 if (pos > peek)
5399 star = is_block;
5400 esc = false;
5401 break;
5402
5403 case '/':
5404 if (star)
5405 goto done_comment;
5406 /* FALLTHROUGH */
5407
5408 default:
5409 star = false;
5410 esc = false;
5411 break;
5412 }
5413 }
/* Reached only when the loop ran out of input (every other exit
   jumps to done_comment), so POS < LIMIT cannot hold here and only
   an unfinished block comment is diagnosed.  */
5414 if (pos < limit || is_block)
5415 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5416 "unterminated comment");
5417 done_comment:
/* Raise the low water mark to the end of the comment.  */
5418 lwm = pos;
5419 break;
5420 }
5421
5422 case '\'':
/* Without digit separators a ' always opens a character literal.  */
5423 if (!CPP_OPTION (pfile, digit_separators))
5424 goto delimited_string;
5425
5426 /* Possibly a number punctuator. */
5427 if (!ISIDNUM (*do_peek_next (pos, limit)))
5428 goto delimited_string;
5429
5430 goto quote_peek;
5431
5432 case '\"':
/* Without raw literals a '"' always opens an ordinary string.  */
5433 if (!CPP_OPTION (pfile, rliterals))
5434 goto delimited_string;
5435
5436 quote_peek:
5437 {
5438 /* For ' see if it's a number punctuator
5439 \.?<digit>(<digit>|<identifier-nondigit>
5440 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5441 /* For " see if it's a raw string
5442 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5443 because that could be 0e+R. */
/* The characters before the quote are examined from right to left
   through do_peek_prev, never going below LWM.
   QUOTE_FIRST: still at the one or two characters directly before a
   '"' (the 'R', then a possible encoding prefix).
   QUOTE_EIGHT: an '8' was seen and is only valid if a 'u' precedes.
   MAYBE_NUMBER_START: the leftmost character examined so far was a
   digit (or a ' when digit separators are enabled).
   WANT_NUMBER: a '.' or an exponent sign was seen.  */
5444 const unsigned char *peek = pos - 1;
5445 bool quote_first = c == '"';
5446 bool quote_eight = false;
5447 bool maybe_number_start = false;
5448 bool want_number = false;
5449
5450 while ((peek = do_peek_prev (peek, lwm)))
5451 {
5452 unsigned char p = *peek;
5453 if (quote_first)
5454 {
/* First step back from a '"': only an 'R' keeps raw string detection
   alive.  */
5455 if (!raw)
5456 {
5457 if (p != 'R')
5458 break;
5459 raw = true;
5460 continue;
5461 }
5462
/* Second step back: an optional L, U, u or 8 before the 'R'.  */
5463 quote_first = false;
5464 if (p == 'L' || p == 'U' || p == 'u')
5465 ;
5466 else if (p == '8')
5467 quote_eight = true;
5468 else
5469 goto second_raw;
5470 }
5471 else if (quote_eight)
5472 {
/* An '8' before the 'R' is only accepted when a 'u' precedes it.  */
5473 if (p != 'u')
5474 {
5475 raw = false;
5476 break;
5477 }
5478 quote_eight = false;
5479 }
5480 else if (c == '"')
5481 {
5482 second_raw:;
/* An identifier character this far left cancels the raw string
   (presumably because the R is then the tail of a longer
   identifier), unless WANT_NUMBER is set.  */
5483 if (!want_number && ISIDNUM (p))
5484 {
5485 raw = false;
5486 break;
5487 }
5488 }
5489
/* Classify P for number detection.  */
5490 if (ISDIGIT (p))
5491 maybe_number_start = true;
5492 else if (p == '.')
5493 want_number = true;
5494 else if (ISIDNUM (p))
5495 maybe_number_start = false;
5496 else if (p == '+' || p == '-')
5497 {
/* A sign only continues the run when it directly follows an exponent
   letter; otherwise the search stops.  */
5498 if (const unsigned char *peek_prev
5499 = do_peek_prev (peek, lwm))
5500 {
5501 p = *peek_prev;
5502 if (p == 'e' || p == 'E'
5503 || p == 'p' || p == 'P')
5504 {
5505 want_number = true;
5506 maybe_number_start = false;
5507 }
5508 else
5509 break;
5510 }
5511 else
5512 break;
5513 }
5514 else if (p == '\'' || p == '\"')
5515 {
5516 /* If this is lwm, this must be the end of a
5517 previous string. So this is a trailing
5518 literal type, (a) if those are allowed,
5519 and (b) maybe_start is false. Otherwise
5520 this must be a CPP_NUMBER because we've
5521 met another ', and we'd have checked that
5522 in its own right. */
5523 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5524 {
5525 if (!maybe_number_start && !want_number)
5526 /* Must be a literal type. */
5527 raw = false;
5528 }
5529 else if (p == '\''
5530 && CPP_OPTION (pfile, digit_separators))
5531 maybe_number_start = true;
5532 break;
5533 }
5534 else if (c == '\'')
5535 break;
5536 else if (!quote_first && !quote_eight)
5537 break;
5538 }
5539
/* The run before the quote starts like a number: a ' then belongs to
   that number, and a '"' cannot open a raw string.  */
5540 if (maybe_number_start)
5541 {
5542 if (c == '\'')
5543 /* A CPP NUMBER. */
5544 goto dflt;
5545 raw = false;
5546 }
5547
5548 goto delimited_string;
5549 }
5550
5551 delimited_string:
5552 {
5553 /* (Possibly raw) string or char literal. */
/* END is the closing quote character.  DELIM and DELIM_LEN describe
   a raw string's delimiter (DELIM_LEN stays -1 for other literals).
   ESC counts pending backslashes in a non-raw literal.  SLOC is the
   location used for diagnostics about this literal.  */
5554 unsigned char end = c;
5555 int delim_len = -1;
5556 const unsigned char *delim = NULL;
5557 location_t sloc = linemap_position_for_column (pfile->line_table,
5558 pos - line_start);
5559 int esc = 0;
5560
5561 if (raw)
5562 {
5563 /* There can be no line breaks in the delimiter. */
5564 delim = pos;
5565 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5566 {
/* After either diagnostic below RAW is cleared and POS is reset to
   DELIM, so the text is rescanned as an ordinary literal.  */
5567 if (delim_len == 16)
5568 {
5569 cpp_error_with_line (pfile, CPP_DL_ERROR,
5570 sloc, 0,
5571 "raw string delimiter"
5572 " longer than %d"
5573 " characters",
5574 delim_len);
5575 raw = false;
5576 pos = delim;
5577 break;
5578 }
5579 if (strchr (") \\\t\v\f\n", c))
5580 {
5581 cpp_error_with_line (pfile, CPP_DL_ERROR,
5582 sloc, 0,
5583 "invalid character '%c'"
5584 " in raw string"
5585 " delimiter", c);
5586 raw = false;
5587 pos = delim;
5588 break;
5589 }
5590 if (pos >= limit)
5591 goto bad_string;
5592 }
5593 }
5594
5595 while (pos < limit)
5596 {
5597 char c = *pos++;
5598 switch (c)
5599 {
5600 case '\\':
5601 if (!raw)
5602 esc++;
5603 break;
5604
5605 case '\r':
5606 if (*pos == '\n')
5607 pos++;
5608 /* FALLTHROUGH */
5609
5610 case '\n':
5611 {
5612 CPP_INCREMENT_LINE (pfile, 0);
5613 line_count++;
5614 line_start = pos;
5615 }
/* A line end uses up one pending backslash.  */
5616 if (esc)
5617 esc--;
5618 break;
5619
5620 case ')':
/* A raw string ends at ')' followed by the delimiter and END.
   NOTE(review): the bound requires a further character before LIMIT
   after the closing quote, so a terminator that ends exactly at
   LIMIT is not matched -- confirm this is intended.
   NOTE(review): ESC is not reset in this case, so a backslash
   directly before ')' is still counted when the next character is
   examined -- confirm this is intended.  */
5621 if (raw
5622 && pos + delim_len + 1 < limit
5623 && pos[delim_len] == end
5624 && !memcmp (delim, pos, delim_len))
5625 {
5626 pos += delim_len + 1;
5627 raw = false;
5628 goto done_string;
5629 }
5630 break;
5631
5632 default:
/* END closes a non-raw literal unless an odd number of backslashes
   is pending.  */
5633 if (!raw && !(esc & 1) && c == end)
5634 goto done_string;
5635 esc = 0;
5636 break;
5637 }
5638 }
5639 bad_string:
5640 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5641 "unterminated literal");
5642
5643 done_string:
/* The last character consumed becomes the new low water mark.  */
5644 raw = false;
5645 lwm = pos - 1;
5646 }
5647 goto dflt;
5648
5649 case '_':
5650 case 'e':
5651 case 'i':
5652 case 'm':
/* These characters may begin a module control line; do_peek_module
   makes the actual decision.  */
5653 if (bol && module_p && !pfile->state.skipping
5654 && do_peek_module (pfile, c, pos, limit))
5655 {
5656 /* We've seen the start of a module control line.
5657 Start up the tokenizer. */
5658 pos--; /* Backup over the first character. */
5659
5660 /* Backup over whitespace to start of line. */
5661 while (pos > line_start
5662 && (pos[-1] == ' ' || pos[-1] == '\t'))
5663 pos--;
5664
/* Hand any text that precedes the control line to the callback.  */
5665 if (pos > base)
5666 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5667
5668 /* Prep things for directive handling. */
5669 buffer->next_line = pos;
5670 buffer->need_line = true;
5671
5672 /* Now get tokens until the PRAGMA_EOL. */
5673 do
5674 {
5675 location_t spelling;
5676 const cpp_token *tok
5677 = cpp_get_token_with_location (pfile, &spelling);
5678
/* Once in_deferred_pragma is clear, the token just read must be
   CPP_PRAGMA_EOL.  */
5679 gcc_assert (pfile->state.in_deferred_pragma
5680 || tok->type == CPP_PRAGMA_EOL);
5681 cb (pfile, CPP_DO_token, data, tok, spelling);
5682 }
5683 while (pfile->state.in_deferred_pragma);
5684
5685 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5686 cb (pfile, CPP_DO_location, data,
5687 pfile->line_table->highest_line);
5688
5689 pfile->mi_valid = false;
5690 goto restart;
5691 }
5692 goto dflt;
5693
5694 default:
5695 dflt:
/* Any other character: the line is no longer at its beginning, and
   pfile->mi_valid is cleared.  */
5696 bol = false;
5697 pfile->mi_valid = false;
5698 break;
5699 }
5700 }
5701
/* End of this buffer: hand whatever text is still pending to the
   callback, then pop the buffer.  */
5702 if (buffer->rlimit > base && !pfile->state.skipping)
5703 {
5704 const unsigned char *limit = buffer->rlimit;
5705 /* If the file was not newline terminated, add rlimit, which is
5706 guaranteed to point to a newline, to the end of our range. */
5707 if (limit[-1] != '\n')
5708 {
5709 limit++;
5710 CPP_INCREMENT_LINE (pfile, 0);
5711 line_count++;
5712 }
5713 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5714 }
5715
5716 _cpp_pop_buffer (pfile);
5717 }
5718 while (pfile->buffer);
5719 }
5720