lex.cc revision 1.1.1.1 1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2022 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
26
/* Spelling category of a token type, stored per type in the
   token_spellings table.  SPELL_OPERATOR is what every OP() entry of
   TTYPE_TABLE receives; the other values are chosen by the second
   argument of the TK() entries.  */
27 enum spell_type
28 {
29 SPELL_OPERATOR = 0,
30 SPELL_IDENT,
31 SPELL_LITERAL,
32 SPELL_NONE
33 };
34
/* One row of the token_spellings table: the spelling category of a
   token type together with a name string for it.  */
35 struct token_spelling
36 {
37 enum spell_type category;
38 const unsigned char *name;
39 };
40
/* Spellings of the digraphs.  NOTE(review): the code that indexes this
   array is not visible here; the order presumably mirrors the order of
   the digraph-capable token types -- confirm before reordering.  */
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
43
/* Expand TTYPE_TABLE into one token_spelling row per token type.
   OP (e, s) rows are SPELL_OPERATOR and are named by the operator text S;
   TK (e, s) rows get category SPELL_<s> and are named by the stringified
   enumerator E.  */
44 #define OP(e, s) { SPELL_OPERATOR, UC s },
45 #define TK(e, s) { SPELL_ ## s, UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
49
/* Look up the spelling category and the name for TOKEN's type.  */
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52
/* Forward declarations of file-local helpers.  */
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void store_comment (cpp_reader *, cpp_token *);
59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60 unsigned int, enum cpp_ttype);
61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62 static int name_p (cpp_reader *, const cpp_string *);
63 static tokenrun *next_tokenrun (tokenrun *);
64
65 static _cpp_buff *new_buff (size_t);
66
67
68 /* Utility routine:
69
70 Compares, the token TOKEN to the NUL-terminated string STRING.
71 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
72 int
73 cpp_ideq (const cpp_token *token, const char *string)
74 {
75 if (token->type != CPP_NAME)
76 return 0;
77
78 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
79 }
80
81 /* Record a note TYPE at byte POS into the current cleaned logical
82 line. */
83 static void
84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85 {
86 if (buffer->notes_used == buffer->notes_cap)
87 {
88 buffer->notes_cap = buffer->notes_cap * 2 + 200;
89 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90 buffer->notes_cap);
91 }
92
93 buffer->notes[buffer->notes_used].pos = pos;
94 buffer->notes[buffer->notes_used].type = type;
95 buffer->notes_used++;
96 }
97
98
99 /* Fast path to find line special characters using optimized character
101 scanning algorithms. Anything complicated falls back to the slow
102 path below. Since this loop is very hot it's worth doing these kinds
103 of optimizations.
104
105 One of the paths through the ifdefs should provide
106
107 const uchar *search_line_fast (const uchar *s, const uchar *end);
108
109 Between S and END, search for \n, \r, \\, ?. Return a pointer to
110 the found character.
111
112 Note that the last character of the buffer is *always* a newline,
113 as forced by _cpp_convert_input. This fact can be used to avoid
114 explicitly looking for the end of the buffer. */
115
116 /* Configure gives us an ifdef test; give the macro a value in every
    case so that it can be tested with an ordinary `if' below. */
117 #ifndef WORDS_BIGENDIAN
118 #define WORDS_BIGENDIAN 0
119 #endif
120
121 /* We'd like the largest integer that fits into a register. There's nothing
122 in <stdint.h> that gives us that. For most hosts this is unsigned long,
123 but MS decided on an LLP64 model. Thankfully when building with GCC we
124 can get the "real" word size. */
125 #ifdef __GNUC__
126 typedef unsigned int word_type __attribute__((__mode__(__word__)));
127 #else
128 typedef unsigned long word_type;
129 #endif
130
131 /* The code below is only expecting sizes 4 or 8.
132 Die at compile-time if this expectation is violated: the array size
    below evaluates to 1 when the size is acceptable and to -1, which is
    not a valid array size, otherwise. */
133 typedef char check_word_type_size
134 [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
135
136 /* Return X with the first N bytes forced to values that won't match one
137 of the interesting characters. Note that NUL is not interesting. */
138
139 static inline word_type
140 acc_char_mask_misalign (word_type val, unsigned int n)
141 {
142 word_type mask = -1;
143 if (WORDS_BIGENDIAN)
144 mask >>= n * 8;
145 else
146 mask <<= n * 8;
147 return val & mask;
148 }
149
150 /* Return X replicated to all byte positions within WORD_TYPE. */
151
152 static inline word_type
153 acc_char_replicate (uchar x)
154 {
155 word_type ret;
156
157 ret = (x << 24) | (x << 16) | (x << 8) | x;
158 if (sizeof(word_type) == 8)
159 ret = (ret << 16 << 16) | ret;
160 return ret;
161 }
162
163 /* Return non-zero if some byte of VAL is (probably) C. */
164
165 static inline word_type
166 acc_char_cmp (word_type val, word_type c)
167 {
168 #if defined(__GNUC__) && defined(__alpha__)
169 /* We can get exact results using a compare-bytes instruction.
170 Get (val == c) via (0 >= (val ^ c)). */
171 return __builtin_alpha_cmpbge (0, val ^ c);
172 #else
173 word_type magic = 0x7efefefeU;
174 if (sizeof(word_type) == 8)
175 magic = (magic << 16 << 16) | 0xfefefefeU;
176 magic |= 1;
177
178 val ^= c;
179 return ((val + magic) ^ ~val) & ~magic;
180 #endif
181 }
182
183 /* Given the result of acc_char_cmp is non-zero, return the index of
184 the found character. If this was a false positive, return -1. */
185
186 static inline int
187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
188 word_type val ATTRIBUTE_UNUSED)
189 {
190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
191 /* The cmpbge instruction sets *bits* of the result corresponding to
192 matches in the bytes with no false positives. */
193 return __builtin_ctzl (cmp);
194 #else
195 unsigned int i;
196
197 /* ??? It would be nice to force unrolling here,
198 and have all of these constants folded. */
199 for (i = 0; i < sizeof(word_type); ++i)
200 {
201 uchar c;
202 if (WORDS_BIGENDIAN)
203 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
204 else
205 c = (val >> i * 8) & 0xff;
206
207 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
208 return i;
209 }
210
211 return -1;
212 #endif
213 }
214
215 /* A version of the fast scanner using bit fiddling techniques.
216
217 For 32-bit words, one would normally perform 16 comparisons and
218 16 branches. With this algorithm one performs 24 arithmetic
219 operations and one branch. Whether this is faster with a 32-bit
220 word size is going to be somewhat system dependent.
221
222 For 64-bit words, we eliminate twice the number of comparisons
223 and branches without increasing the number of arithmetic operations.
224 It's almost certainly going to be a win with 64-bit word size. */
225
226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
227 ATTRIBUTE_UNUSED;
228
229 static const uchar *
230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
231 {
232 const word_type repl_nl = acc_char_replicate ('\n');
233 const word_type repl_cr = acc_char_replicate ('\r');
234 const word_type repl_bs = acc_char_replicate ('\\');
235 const word_type repl_qm = acc_char_replicate ('?');
236
237 unsigned int misalign;
238 const word_type *p;
239 word_type val, t;
240
241 /* Align the buffer. Mask out any bytes from before the beginning. */
242 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
243 val = *p;
244 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
245 if (misalign)
246 val = acc_char_mask_misalign (val, misalign);
247
248 /* Main loop. */
249 while (1)
250 {
251 t = acc_char_cmp (val, repl_nl);
252 t |= acc_char_cmp (val, repl_cr);
253 t |= acc_char_cmp (val, repl_bs);
254 t |= acc_char_cmp (val, repl_qm);
255
256 if (__builtin_expect (t != 0, 0))
257 {
258 int i = acc_char_index (t, val);
259 if (i >= 0)
260 return (const uchar *)p + i;
261 }
262
263 val = *++p;
264 }
265 }
266
267 /* Disable on Solaris 2/x86 until the following problem can be properly
268 autoconfed:
269
270 The Solaris 10+ assembler tags objects with the instruction set
271 extensions used, so SSE4.2 executables cannot run on machines that
272 don't support that extension. */
273
274 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
275
276 /* Replicated character data to be shared between implementations.
277 Recall that outside of a context with vector support we can't
278 define compatible vector types, therefore these are all defined
279 in terms of raw characters.

    Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?'.  Each row is
    16 bytes; the MMX scanner reads the first 8 bytes of a row and the
    SSE2 scanner reads all 16. */
280 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
281 { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
282 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
283 { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
284 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
285 { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
286 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
287 { '?', '?', '?', '?', '?', '?', '?', '?',
288 '?', '?', '?', '?', '?', '?', '?', '?' },
289 };
290
291 /* A version of the fast scanner using MMX vectorized byte compare insns.
292
293 This uses the PMOVMSKB instruction which was introduced with "MMX2",
294 which was packaged into SSE1; it is also present in the AMD MMX
295 extension. Mark the function as using "sse" so that we emit a real
296 "emms" instruction, rather than the 3dNOW "femms" instruction.

    Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
    END is unused; the loop only stops when it finds a match. */
297
298 static const uchar *
299 #ifndef __SSE__
300 __attribute__((__target__("sse")))
301 #endif
302 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
303 {
304 typedef char v8qi __attribute__ ((__vector_size__ (8)));
305 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
306
/* Only the first 8 bytes of each 16-byte repl_chars row are used. */
307 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
308 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
309 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
310 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
311
312 unsigned int misalign, found, mask;
313 const v8qi *p;
314 v8qi data, t, c;
315
316 /* Align the source pointer. While MMX doesn't generate unaligned data
317 faults, this allows us to safely scan to the end of the buffer without
318 reading beyond the end of the last page. */
319 misalign = (uintptr_t)s & 7;
320 p = (const v8qi *)((uintptr_t)s & -8);
321 data = *p;
322
323 /* Create a mask for the bytes that are valid within the first
324 8-byte block: the low MISALIGN bits, which correspond to bytes in
    front of S, are cleared. The Idea here is that the AND with the mask
325 within the loop is "free", since we need some AND or TEST
326 insn in order to set the flags for the branch anyway. */
327 mask = -1u << misalign;
328
329 /* Main loop processing 8 bytes at a time. The first block has already
    been loaded, so jump past the load at the top of the loop body. */
330 goto start;
331 do
332 {
333 data = *++p;
334 mask = -1;
335
336 start:
337 t = __builtin_ia32_pcmpeqb(data, repl_nl);
338 c = __builtin_ia32_pcmpeqb(data, repl_cr);
339 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
340 c = __builtin_ia32_pcmpeqb(data, repl_bs);
341 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
342 c = __builtin_ia32_pcmpeqb(data, repl_qm);
343 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
344 found = __builtin_ia32_pmovmskb (t);
345 found &= mask;
346 }
347 while (!found);
348
349 __builtin_ia32_emms ();
350
351 /* FOUND contains 1 in bits for which we matched a relevant
352 character. Conversion to the byte index is trivial. */
353 found = __builtin_ctz(found);
354 return (const uchar *)p + found;
355 }
356
357 /* A version of the fast scanner using SSE2 vectorized byte compare insns.

    Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
    END is unused; the loop only stops when it finds a match. */
358
359 static const uchar *
360 #ifndef __SSE2__
361 __attribute__((__target__("sse2")))
362 #endif
363 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
364 {
365 typedef char v16qi __attribute__ ((__vector_size__ (16)));
366
367 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
368 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
369 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
370 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
371
372 unsigned int misalign, found, mask;
373 const v16qi *p;
374 v16qi data, t;
375
376 /* Align the source pointer. The first load therefore starts at or
    before S and never crosses a 16-byte boundary. */
377 misalign = (uintptr_t)s & 15;
378 p = (const v16qi *)((uintptr_t)s & -16);
379 data = *p;
380
381 /* Create a mask for the bytes that are valid within the first
382 16-byte block: the low MISALIGN bits, which correspond to bytes in
    front of S, are cleared. The Idea here is that the AND with the mask
383 within the loop is "free", since we need some AND or TEST
384 insn in order to set the flags for the branch anyway. */
385 mask = -1u << misalign;
386
387 /* Main loop processing 16 bytes at a time. The first block has already
    been loaded, so jump past the load at the top of the loop body. */
388 goto start;
389 do
390 {
391 data = *++p;
392 mask = -1;
393
394 start:
395 t = data == repl_nl;
396 t |= data == repl_cr;
397 t |= data == repl_bs;
398 t |= data == repl_qm;
399 found = __builtin_ia32_pmovmskb128 (t);
400 found &= mask;
401 }
402 while (!found);
403
404 /* FOUND contains 1 in bits for which we matched a relevant
405 character. Conversion to the byte index is trivial. */
406 found = __builtin_ctz(found);
407 return (const uchar *)p + found;
408 }
409
410 #ifdef HAVE_SSE4
411 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

    Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
    END is only used to decide whether an unaligned 16-byte load at S is
    safe; the main loop stops only when it finds a match. */
412
413 static const uchar *
414 #ifndef __SSE4_2__
415 __attribute__((__target__("sse4.2")))
416 #endif
417 search_line_sse42 (const uchar *s, const uchar *end)
418 {
419 typedef char v16qi __attribute__ ((__vector_size__ (16)));
/* The four characters searched for; the PCMPESTRI uses below pass an
   explicit length of 4 for this operand. */
420 static const v16qi search = { '\n', '\r', '?', '\\' };
421
422 uintptr_t si = (uintptr_t)s;
423 uintptr_t index;
424
425 /* Check for unaligned input. */
426 if (si & 15)
427 {
428 v16qi sv;
429
430 if (__builtin_expect (end - s < 16, 0)
431 && __builtin_expect ((si & 0xfff) > 0xff0, 0))
432 {
433 /* There are less than 16 bytes left in the buffer, and less
434 than 16 bytes left on the page. Reading 16 bytes at this
435 point might generate a spurious page fault. Defer to the
436 SSE2 implementation, which already handles alignment. */
437 return search_line_sse2 (s, end);
438 }
439
440 /* ??? The builtin doesn't understand that the PCMPESTRI read from
441 memory need not be aligned. */
442 sv = __builtin_ia32_loaddqu ((const char *) s);
443 index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
444
/* An index below 16 is the offset of a match within the block just
   read at S. */
445 if (__builtin_expect (index < 16, 0))
446 goto found;
447
448 /* Advance the pointer to an aligned address. We will re-scan a
449 few bytes, but we no longer need care for reading past the
450 end of a page, since we're guaranteed a match. */
451 s = (const uchar *)((si + 15) & -16);
452 }
453
454 /* Main loop, processing 16 bytes at a time. */
455 #ifdef __GCC_ASM_FLAG_OUTPUTS__
456 while (1)
457 {
458 char f;
459
460 /* By using inline assembly instead of the builtin,
461 we can use the result, as well as the flags set. */
462 __asm ("%vpcmpestri\t$0, %2, %3"
463 : "=c"(index), "=@ccc"(f)
464 : "m"(*s), "x"(search), "a"(4), "d"(16));
465 if (f)
466 break;
467
468 s += 16;
469 }
470 #else
/* The asm loop below adds 16 to S before each compare, so step back
   one block first. */
471 s -= 16;
472 /* By doing the whole loop in inline assembly,
473 we can make proper use of the flags set. */
474 __asm ( ".balign 16\n"
475 "0: add $16, %1\n"
476 " %vpcmpestri\t$0, (%1), %2\n"
477 " jnc 0b"
478 : "=&c"(index), "+r"(s)
479 : "x"(search), "a"(4), "d"(16));
480 #endif
481
482 found:
483 return s + index;
484 }
485
486 #else
487 /* Work around out-dated assemblers without sse4 support: without
    HAVE_SSE4 the SSE4.2 scanner is not compiled, so make its name refer
    to the SSE2 scanner and let init_vectorized_lexer use the name
    unconditionally. */
488 #define search_line_sse42 search_line_sse2
489 #endif
490
491 /* Check the CPU capabilities. */
492
493 #include "../gcc/config/i386/cpuid.h"
494
495 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
496 static search_line_fast_type search_line_fast;
497
498 #define HAVE_init_vectorized_lexer 1
499 static inline void
500 init_vectorized_lexer (void)
501 {
502 unsigned dummy, ecx = 0, edx = 0;
503 search_line_fast_type impl = search_line_acc_char;
504 int minimum = 0;
505
506 #if defined(__SSE4_2__)
507 minimum = 3;
508 #elif defined(__SSE2__)
509 minimum = 2;
510 #elif defined(__SSE__)
511 minimum = 1;
512 #endif
513
514 if (minimum == 3)
515 impl = search_line_sse42;
516 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
517 {
518 if (minimum == 3 || (ecx & bit_SSE4_2))
519 impl = search_line_sse42;
520 else if (minimum == 2 || (edx & bit_SSE2))
521 impl = search_line_sse2;
522 else if (minimum == 1 || (edx & bit_SSE))
523 impl = search_line_mmx;
524 }
525 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
526 {
527 if (minimum == 1
528 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
529 impl = search_line_mmx;
530 }
531
532 search_line_fast = impl;
533 }
534
535 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
536
537 /* A version of the fast scanner using AltiVec vectorized byte compares
538 and VSX unaligned loads (when VSX is available). This is otherwise
539 the same as the AltiVec version.

    Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
    END is unused; the loop only stops when it finds a match. Every load
    starts at S itself, so no masking of leading bytes is needed. */
540
541 ATTRIBUTE_NO_SANITIZE_UNDEFINED
542 static const uchar *
543 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
544 {
545 typedef __attribute__((altivec(vector))) unsigned char vc;
546
547 const vc repl_nl = {
548 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
549 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
550 };
551 const vc repl_cr = {
552 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
553 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
554 };
555 const vc repl_bs = {
556 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
557 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
558 };
559 const vc repl_qm = {
560 '?', '?', '?', '?', '?', '?', '?', '?',
561 '?', '?', '?', '?', '?', '?', '?', '?',
562 };
563 const vc zero = { 0 };
564
565 vc data, t;
566
567 /* Main loop processing 16 bytes at a time. */
568 do
569 {
570 vc m_nl, m_cr, m_bs, m_qm;
571
572 data = __builtin_vec_vsx_ld (0, s);
573 s += 16;
574
575 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
576 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
577 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
578 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
579 t = (m_nl | m_cr) | (m_bs | m_qm);
580
581 /* T now contains 0xff in bytes for which we matched one of the relevant
582 characters. We want to exit the loop if any byte in T is non-zero.
583 Below is the expansion of vec_any_ne(t, zero). */
584 }
585 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
586
587 /* Restore s to point to the 16 bytes we just processed. */
588 s -= 16;
589
590 {
591 #define N (sizeof(vc) / sizeof(long))
592
593 union {
594 vc v;
595 /* Statically assert that N is 2 or 4. */
596 unsigned long l[(N == 2 || N == 4) ? N : -1];
597 } u;
598 unsigned long l, i = 0;
599
600 u.v = t;
601
602 /* Find the first word of T that is non-zero, advancing S past every
    all-zero word that precedes it. */
603 switch (N)
604 {
605 case 4:
606 l = u.l[i++];
607 if (l != 0)
608 break;
609 s += sizeof(unsigned long);
610 l = u.l[i++];
611 if (l != 0)
612 break;
613 s += sizeof(unsigned long);
614 /* FALLTHRU */
615 case 2:
616 l = u.l[i++];
617 if (l != 0)
618 break;
619 s += sizeof(unsigned long);
620 l = u.l[i];
621 }
622
623 /* L now contains 0xff in bytes for which we matched one of the
624 relevant characters. We can find the byte index by finding
625 its bit index and dividing by 8. The first byte in memory is the
    most significant one on big-endian hosts, hence clz there and ctz
    otherwise. */
626 #ifdef __BIG_ENDIAN__
627 l = __builtin_clzl(l) >> 3;
628 #else
629 l = __builtin_ctzl(l) >> 3;
630 #endif
631 return s + l;
632
633 #undef N
634 }
635 }
636
637 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
638
639 /* A version of the fast scanner using AltiVec vectorized byte compares.
640 This cannot be used for little endian because vec_lvsl/lvsr are
641 deprecated for little endian and the code won't work properly.

    Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
    END is unused; the loop only stops when it finds a match. */
642 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
643 so we can't compile this function without -maltivec on the command line
644 (or implied by some other switch). */
645
646 static const uchar *
647 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
648 {
649 typedef __attribute__((altivec(vector))) unsigned char vc;
650
651 const vc repl_nl = {
652 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
653 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
654 };
655 const vc repl_cr = {
656 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
657 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
658 };
659 const vc repl_bs = {
660 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
661 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
662 };
663 const vc repl_qm = {
664 '?', '?', '?', '?', '?', '?', '?', '?',
665 '?', '?', '?', '?', '?', '?', '?', '?',
666 };
667 const vc ones = {
668 -1, -1, -1, -1, -1, -1, -1, -1,
669 -1, -1, -1, -1, -1, -1, -1, -1,
670 };
671 const vc zero = { 0 };
672
673 vc data, mask, t;
674
675 /* Altivec loads automatically mask addresses with -16. This lets us
676 issue the first load as early as possible. */
677 data = __builtin_vec_ld(0, (const vc *)s);
678
679 /* Discard bytes before the beginning of the buffer. Do this by
680 beginning with all ones and shifting in zeros according to the
681 mis-alignment. The LVSR instruction pulls the exact shift we
682 want from the address. */
683 mask = __builtin_vec_lvsr(0, s);
684 mask = __builtin_vec_perm(zero, ones, mask);
685 data &= mask;
686
687 /* While altivec loads mask addresses, we still need to align S so
688 that the offset we compute at the end is correct. */
689 s = (const uchar *)((uintptr_t)s & -16);
690
691 /* Main loop processing 16 bytes at a time. The first block has already
    been loaded and masked, so jump past the load at the top of the loop
    body. */
692 goto start;
693 do
694 {
695 vc m_nl, m_cr, m_bs, m_qm;
696
697 s += 16;
698 data = __builtin_vec_ld(0, (const vc *)s);
699
700 start:
701 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
702 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
703 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
704 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
705 t = (m_nl | m_cr) | (m_bs | m_qm);
706
707 /* T now contains 0xff in bytes for which we matched one of the relevant
708 characters. We want to exit the loop if any byte in T is non-zero.
709 Below is the expansion of vec_any_ne(t, zero). */
710 }
711 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
712
713 {
714 #define N (sizeof(vc) / sizeof(long))
715
716 union {
717 vc v;
718 /* Statically assert that N is 2 or 4. */
719 unsigned long l[(N == 2 || N == 4) ? N : -1];
720 } u;
721 unsigned long l, i = 0;
722
723 u.v = t;
724
725 /* Find the first word of T that is non-zero, advancing S past every
    all-zero word that precedes it. */
726 switch (N)
727 {
728 case 4:
729 l = u.l[i++];
730 if (l != 0)
731 break;
732 s += sizeof(unsigned long);
733 l = u.l[i++];
734 if (l != 0)
735 break;
736 s += sizeof(unsigned long);
737 /* FALLTHROUGH */
738 case 2:
739 l = u.l[i++];
740 if (l != 0)
741 break;
742 s += sizeof(unsigned long);
743 l = u.l[i];
744 }
745
746 /* L now contains 0xff in bytes for which we matched one of the
747 relevant characters. We can find the byte index by finding
748 its bit index and dividing by 8. This version is big-endian only,
    so the first byte in memory is the most significant one and clz is
    the right count. */
749 l = __builtin_clzl(l) >> 3;
750 return s + l;
751
752 #undef N
753 }
754 }
755
756 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
757 #include "arm_neon.h"
758
759 /* This doesn't have to be the exact page size, but no system may use
760 a size smaller than this. ARMv8 requires a minimum page size of
761 4k. The impact of being conservative here is a small number of
762 cases will take the slightly slower entry path into the main
763 loop. */
764
765 #define AARCH64_MIN_PAGE_SIZE 4096
766
/* A version of the fast scanner using AArch64 Advanced SIMD byte compares.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused; the main loop only stops when it finds a match. */
767 static const uchar *
768 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
769 {
770 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
771 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
772 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
773 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
/* XMASK and SHIFT are used together to condense a vector of per-byte
   compare results into an integer with one bit per byte, which
   __builtin_ctz then turns into the index of the first match. */
774 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
775
776 #ifdef __ARM_BIG_ENDIAN
777 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
778 #else
779 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
780 #endif
781
782 unsigned int found;
783 const uint8_t *p;
784 uint8x16_t data;
785 uint8x16_t t;
786 uint16x8_t m;
787 uint8x16_t u, v, w;
788
789 /* Align the source pointer. */
790 p = (const uint8_t *)((uintptr_t)s & -16);
791
792 /* Assuming random string start positions, with a 4k page size we'll take
793 the slow path about 0.37% of the time. */
794 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
795 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
796 < 16, 0))
797 {
798 /* Slow path: the string starts near a possible page boundary.
    Load the aligned block containing S instead and mask off the
    matches that lie in front of S. */
799 uint32_t misalign, mask;
800
801 misalign = (uintptr_t)s & 15;
802 mask = (-1u << misalign) & 0xffff;
803 data = vld1q_u8 (p);
804 t = vceqq_u8 (data, repl_nl);
805 u = vceqq_u8 (data, repl_cr);
806 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
807 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
808 t = vorrq_u8 (v, w);
809 t = vandq_u8 (t, xmask);
810 m = vpaddlq_u8 (t);
811 m = vshlq_u16 (m, shift);
812 found = vaddvq_u16 (m);
813 found &= mask;
814 if (found)
815 return (const uchar*)p + __builtin_ctz (found);
816 }
817 else
818 {
/* Fast entry: at least 16 bytes remain before the next possible page
   boundary, so an unaligned 16-byte load starting at S itself is safe. */
819 data = vld1q_u8 ((const uint8_t *) s);
820 t = vceqq_u8 (data, repl_nl);
821 u = vceqq_u8 (data, repl_cr);
822 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
823 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
824 t = vorrq_u8 (v, w);
825 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
826 goto done;
827 }
828
/* Main loop over aligned 16-byte blocks, starting with the block that
   follows the aligned block containing S. */
829 do
830 {
831 p += 16;
832 data = vld1q_u8 (p);
833 t = vceqq_u8 (data, repl_nl);
834 u = vceqq_u8 (data, repl_cr);
835 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
836 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
837 t = vorrq_u8 (v, w);
838 } while (!vpaddd_u64 ((uint64x2_t)t));
839
840 done:
841 /* Now that we've found the terminating substring, work out precisely where
842 we need to stop. */
843 t = vandq_u8 (t, xmask);
844 m = vpaddlq_u8 (t);
845 m = vshlq_u16 (m, shift);
846 found = vaddvq_u16 (m);
/* P is still below S only when we arrive here from the unaligned load
   at S; in that case the match index is relative to S, not to P. */
847 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
848 + __builtin_ctz (found));
849 }
850
851 #elif defined (__ARM_NEON)
852 #include "arm_neon.h"
853
/* A version of the fast scanner using NEON vectorized byte compares.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused; the loop only stops when it finds a match. */
854 static const uchar *
855 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
856 {
857 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
858 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
859 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
860 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
/* ANDed with the compare results before the pairwise additions below
   condense them into FOUND. */
861 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
862
863 unsigned int misalign, found, mask;
864 const uint8_t *p;
865 uint8x16_t data;
866
867 /* Align the source pointer. */
868 misalign = (uintptr_t)s & 15;
869 p = (const uint8_t *)((uintptr_t)s & -16);
870 data = vld1q_u8 (p);
871
872 /* Create a mask for the bytes that are valid within the first
873 16-byte block: the low MISALIGN bits, which correspond to bytes in
    front of S, are cleared. The Idea here is that the AND with the mask
874 within the loop is "free", since we need some AND or TEST
875 insn in order to set the flags for the branch anyway. */
876 mask = (-1u << misalign) & 0xffff;
877
878 /* Main loop, processing 16 bytes at a time. The first block has already
    been loaded, so jump past the load at the top of the loop body. */
879 goto start;
880
881 do
882 {
883 uint8x8_t l;
884 uint16x4_t m;
885 uint32x2_t n;
886 uint8x16_t t, u, v, w;
887
888 p += 16;
889 data = vld1q_u8 (p);
890 mask = 0xffff;
891
892 start:
893 t = vceqq_u8 (data, repl_nl);
894 u = vceqq_u8 (data, repl_cr);
895 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
896 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
897 t = vandq_u8 (vorrq_u8 (v, w), xmask);
898 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
899 m = vpaddl_u8 (l);
900 n = vpaddl_u16 (m);
901
902 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
903 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
904 found &= mask;
905 }
906 while (!found);
907
908 /* FOUND contains 1 in bits for which we matched a relevant
909 character. Conversion to the byte index is trivial. */
910 found = __builtin_ctz (found);
911 return (const uchar *)p + found;
912 }
913
914 #else
915
916 /* We only have one accelerated alternative, the word-at-a-time
    search_line_acc_char. Use a direct call so that
917 we encourage inlining. */
918
919 #define search_line_fast search_line_acc_char
920
921 #endif
922
923 /* Initialize the lexer if needed. Work is only required when a
    run-time choice between several line scanners exists, which is
    signalled by HAVE_init_vectorized_lexer; otherwise this is a no-op. */
924
925 void
926 _cpp_init_lexer (void)
927 {
928 #ifdef HAVE_init_vectorized_lexer
929 init_vectorized_lexer ();
930 #endif
931 }
932
933 /* Returns with a logical line that contains no escaped newlines or
934 trigraphs. This is a time-critical inner loop.

    The line starts at PFILE->buffer->next_line. On entry the buffer's
    note list is reset, and cur and line_base are set to the start of the
    line. Escaped newlines are always spliced out; trigraphs are replaced
    only when the trigraphs option is on. Both rewrites happen in place
    (the cleaned text is never longer than the input), and each one is
    recorded with add_line_note. On return the cleaned line is terminated
    by '\n', a sentinel note has been added just past that terminator, and
    next_line points at the first byte of the following physical line.

    For a buffer marked from_stage3 no cleaning is done; only the end of
    the line is located. */
935 void
936 _cpp_clean_line (cpp_reader *pfile)
937 {
938 cpp_buffer *buffer;
/* S is the read position in the raw input and D the write position in
   the cleaned output; P is a scratch pointer for scanning backwards
   over trailing whitespace. */
939 const uchar *s;
940 uchar c, *d, *p;
941
942 buffer = pfile->buffer;
943 buffer->cur_note = buffer->notes_used = 0;
944 buffer->cur = buffer->line_base = buffer->next_line;
945 buffer->need_line = false;
946 s = buffer->next_line;
947
948 if (!buffer->from_stage3)
949 {
/* Most recent backslash seen on this physical line by the fast path,
   if any. */
950 const uchar *pbackslash = NULL;
951
952 /* Fast path. This is the common case of an un-escaped line with
953 no trigraphs. The primary win here is by not writing any
954 data back to memory until we have to. */
955 while (1)
956 {
957 /* Perform an optimized search for \n, \r, \\, ?. */
958 s = search_line_fast (s, buffer->rlimit);
959
960 c = *s;
961 if (c == '\\')
962 {
963 /* Record the location of the backslash and continue. */
964 pbackslash = s++;
965 }
966 else if (__builtin_expect (c == '?', 0))
967 {
968 if (__builtin_expect (s[1] == '?', false)
969 && _cpp_trigraph_map[s[2]])
970 {
971 /* Have a trigraph. We may or may not have to convert
972 it. Add a line note regardless, for -Wtrigraphs. */
973 add_line_note (buffer, s, s[2]);
974 if (CPP_OPTION (pfile, trigraphs))
975 {
976 /* We do, and that means we have to switch to the
977 slow path. The replacement character overwrites the
    first '?', and S is left on the last character of the
    trigraph because the slow path pre-increments. */
978 d = (uchar *) s;
979 *d = _cpp_trigraph_map[s[2]];
980 s += 2;
981 goto slow_path;
982 }
983 }
984 /* Not a trigraph. Continue on fast-path. */
985 s++;
986 }
987 else
988 break;
989 }
990
991 /* This must be \r or \n. We're either done, or we'll be forced
992 to write back to the buffer and continue on the slow path. */
993 d = (uchar *) s;
994
995 if (__builtin_expect (s == buffer->rlimit, false))
996 goto done;
997
998 /* DOS line ending? */
999 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1000 {
1001 s++;
1002 if (s == buffer->rlimit)
1003 goto done;
1004 }
1005
1006 if (__builtin_expect (pbackslash == NULL, true))
1007 goto done;
1008
1009 /* Check for escaped newline: the last backslash must be followed only
     by whitespace up to the line terminator. */
1010 p = d;
1011 while (is_nvspace (p[-1]))
1012 p--;
1013 if (p - 1 != pbackslash)
1014 goto done;
1015
1016 /* Have an escaped newline; process it and proceed to
1017 the slow path. The note type is ' ' when whitespace separated the
     backslash from the newline and '\\' otherwise. */
1018 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
/* The slow path pre-increments D, so the next byte copied lands on top
   of the backslash. next_line is borrowed to remember where the text of
   the continuation line begins in the output; the slow path uses it to
   bound its backward scan, and it gets its final value at `done'. */
1019 d = p - 2;
1020 buffer->next_line = p - 1;
1021
1022 slow_path:
1023 while (1)
1024 {
1025 c = *++s;
1026 *++d = c;
1027
1028 if (c == '\n' || c == '\r')
1029 {
1030 /* Handle DOS line endings. */
1031 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1032 s++;
1033 if (s == buffer->rlimit)
1034 break;
1035
1036 /* Escaped? */
1037 p = d;
1038 while (p != buffer->next_line && is_nvspace (p[-1]))
1039 p--;
1040 if (p == buffer->next_line || p[-1] != '\\')
1041 break;
1042
1043 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1044 d = p - 2;
1045 buffer->next_line = p - 1;
1046 }
1047 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1048 {
1049 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1050 add_line_note (buffer, d, s[2]);
1051 if (CPP_OPTION (pfile, trigraphs))
1052 {
1053 *d = _cpp_trigraph_map[s[2]];
1054 s += 2;
1055 }
1056 }
1057 }
1058 }
1059 else
1060 {
/* No cleaning for this kind of buffer: just find the end of the line. */
1061 while (*s != '\n' && *s != '\r')
1062 s++;
1063 d = (uchar *) s;
1064
1065 /* Handle DOS line endings. */
1066 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1067 s++;
1068 }
1069
1070 done:
/* Terminate the cleaned line; this also turns a '\r' terminator into
   '\n'. */
1071 *d = '\n';
1072 /* A sentinel note that should never be processed. */
1073 add_line_note (buffer, d + 1, '\n');
1074 buffer->next_line = s + 1;
1075 }
1076
1077 /* Return true if the trigraph indicated by NOTE should be warned
1078 about in a comment. */
1079 static bool
1080 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1081 {
1082 const uchar *p;
1083
1084 /* Within comments we don't warn about trigraphs, unless the
1085 trigraph forms an escaped newline, as that may change
1086 behavior. */
1087 if (note->type != '/')
1088 return false;
1089
1090 /* If -trigraphs, then this was an escaped newline iff the next note
1091 is coincident. */
1092 if (CPP_OPTION (pfile, trigraphs))
1093 return note[1].pos == note->pos;
1094
1095 /* Otherwise, see if this forms an escaped newline. */
1096 p = note->pos + 3;
1097 while (is_nvspace (*p))
1098 p++;
1099
1100 /* There might have been escaped newlines between the trigraph and the
1101 newline we found. Hence the position test. */
1102 return (*p == '\n' && p < note[1].pos);
1103 }
1104
1105 /* Process the notes created by add_line_note as far as the current
1106 location. */
1107 void
1108 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1109 {
1110 cpp_buffer *buffer = pfile->buffer;
1111
1112 for (;;)
1113 {
1114 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1115 unsigned int col;
1116
1117 if (note->pos > buffer->cur)
1118 break;
1119
1120 buffer->cur_note++;
1121 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1122
1123 if (note->type == '\\' || note->type == ' ')
1124 {
1125 if (note->type == ' ' && !in_comment)
1126 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1127 "backslash and newline separated by space");
1128
1129 if (buffer->next_line > buffer->rlimit)
1130 {
1131 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1132 "backslash-newline at end of file");
1133 /* Prevent "no newline at end of file" warning. */
1134 buffer->next_line = buffer->rlimit;
1135 }
1136
1137 buffer->line_base = note->pos;
1138 CPP_INCREMENT_LINE (pfile, 0);
1139 }
1140 else if (_cpp_trigraph_map[note->type])
1141 {
1142 if (CPP_OPTION (pfile, warn_trigraphs)
1143 && (!in_comment || warn_in_comment (pfile, note)))
1144 {
1145 if (CPP_OPTION (pfile, trigraphs))
1146 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1147 pfile->line_table->highest_line, col,
1148 "trigraph ??%c converted to %c",
1149 note->type,
1150 (int) _cpp_trigraph_map[note->type]);
1151 else
1152 {
1153 cpp_warning_with_line
1154 (pfile, CPP_W_TRIGRAPHS,
1155 pfile->line_table->highest_line, col,
1156 "trigraph ??%c ignored, use -trigraphs to enable",
1157 note->type);
1158 }
1159 }
1160 }
1161 else if (note->type == 0)
1162 /* Already processed in lex_raw_string. */;
1163 else
1164 abort ();
1165 }
1166 }
1167
1168 namespace bidi {
1169 enum class kind {
1170 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1171 };
1172
1173 /* All the UTF-8 encodings of bidi characters start with E2. */
1174 constexpr uchar utf8_start = 0xe2;
1175
1176 struct context
1177 {
1178 context () {}
1179 context (location_t loc, kind k, bool pdf, bool ucn)
1180 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1181 {
1182 }
1183
1184 kind get_pop_kind () const
1185 {
1186 return m_pdf ? kind::PDF : kind::PDI;
1187 }
1188 bool ucn_p () const
1189 {
1190 return m_ucn;
1191 }
1192
1193 location_t m_loc;
1194 kind m_kind;
1195 unsigned m_pdf : 1;
1196 unsigned m_ucn : 1;
1197 };
1198
1199 /* A vector holding currently open bidi contexts. We use a char for
1200 each context, its LSB is 1 if it represents a PDF context, 0 if it
1201 represents a PDI context. The next bit is 1 if this context was open
1202 by a bidi character written as a UCN, and 0 when it was UTF-8. */
1203 semi_embedded_vec <context, 16> vec;
1204
1205 /* Close the whole comment/identifier/string literal/character constant
1206 context. */
1207 void on_close ()
1208 {
1209 vec.truncate (0);
1210 }
1211
1212 /* Pop the last element in the vector. */
1213 void pop ()
1214 {
1215 unsigned int len = vec.count ();
1216 gcc_checking_assert (len > 0);
1217 vec.truncate (len - 1);
1218 }
1219
1220 /* Return the pop kind of the context of the Ith element. */
1221 kind pop_kind_at (unsigned int i)
1222 {
1223 return vec[i].get_pop_kind ();
1224 }
1225
1226 /* Return the pop kind of the context that is currently opened. */
1227 kind current_ctx ()
1228 {
1229 unsigned int len = vec.count ();
1230 if (len == 0)
1231 return kind::NONE;
1232 return vec[len - 1].get_pop_kind ();
1233 }
1234
1235 /* Return true if the current context comes from a UCN origin, that is,
1236 the bidi char which started this bidi context was written as a UCN. */
1237 bool current_ctx_ucn_p ()
1238 {
1239 unsigned int len = vec.count ();
1240 gcc_checking_assert (len > 0);
1241 return vec[len - 1].m_ucn;
1242 }
1243
1244 location_t current_ctx_loc ()
1245 {
1246 unsigned int len = vec.count ();
1247 gcc_checking_assert (len > 0);
1248 return vec[len - 1].m_loc;
1249 }
1250
1251 /* We've read a bidi char, update the current vector as necessary.
1252 LOC is only valid when K is not kind::NONE. */
1253 void on_char (kind k, bool ucn_p, location_t loc)
1254 {
1255 switch (k)
1256 {
1257 case kind::LRE:
1258 case kind::RLE:
1259 case kind::LRO:
1260 case kind::RLO:
1261 vec.push (context (loc, k, true, ucn_p));
1262 break;
1263 case kind::LRI:
1264 case kind::RLI:
1265 case kind::FSI:
1266 vec.push (context (loc, k, false, ucn_p));
1267 break;
1268 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1269 whose scope has not yet been terminated. */
1270 case kind::PDF:
1271 if (current_ctx () == kind::PDF)
1272 pop ();
1273 break;
1274 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1275 scope has not yet been terminated, as well as the scopes of
1276 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1277 yet been terminated. */
1278 case kind::PDI:
1279 for (int i = vec.count () - 1; i >= 0; --i)
1280 if (pop_kind_at (i) == kind::PDI)
1281 {
1282 vec.truncate (i);
1283 break;
1284 }
1285 break;
1286 case kind::LTR:
1287 case kind::RTL:
1288 /* These aren't popped by a PDF/PDI. */
1289 break;
1290 ATTR_LIKELY case kind::NONE:
1291 break;
1292 default:
1293 abort ();
1294 }
1295 }
1296
1297 /* Return a descriptive string for K. */
1298 const char *to_str (kind k)
1299 {
1300 switch (k)
1301 {
1302 case kind::LRE:
1303 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1304 case kind::RLE:
1305 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1306 case kind::LRO:
1307 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1308 case kind::RLO:
1309 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1310 case kind::LRI:
1311 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1312 case kind::RLI:
1313 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1314 case kind::FSI:
1315 return "U+2068 (FIRST STRONG ISOLATE)";
1316 case kind::PDF:
1317 return "U+202C (POP DIRECTIONAL FORMATTING)";
1318 case kind::PDI:
1319 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1320 case kind::LTR:
1321 return "U+200E (LEFT-TO-RIGHT MARK)";
1322 case kind::RTL:
1323 return "U+200F (RIGHT-TO-LEFT MARK)";
1324 default:
1325 abort ();
1326 }
1327 }
1328 }
1329
1330 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1331 within the current line in FILE, with the caret at START. */
1332
1333 static location_t
1334 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1335 const unsigned char *const start,
1336 size_t num_bytes)
1337 {
1338 gcc_checking_assert (num_bytes > 0);
1339
1340 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1341 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1342 whereas linemap_position_for_column is 1-based. */
1343
1344 /* Get 0-based offsets within the line. */
1345 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1346 size_t end_offset = start_offset + num_bytes - 1;
1347
1348 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1349 location_t start_loc = linemap_position_for_column (pfile->line_table,
1350 start_offset + 1);
1351 location_t end_loc = linemap_position_for_column (pfile->line_table,
1352 end_offset + 1);
1353
1354 if (start_loc == end_loc)
1355 return start_loc;
1356
1357 source_range src_range;
1358 src_range.m_start = start_loc;
1359 src_range.m_finish = end_loc;
1360 location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1361 start_loc,
1362 src_range,
1363 NULL);
1364 return combined_loc;
1365 }
1366
1367 /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1368
1369 static bidi::kind
1370 get_bidi_utf8_1 (const unsigned char *const p)
1371 {
1372 gcc_checking_assert (p[0] == bidi::utf8_start);
1373
1374 if (p[1] == 0x80)
1375 switch (p[2])
1376 {
1377 case 0xaa:
1378 return bidi::kind::LRE;
1379 case 0xab:
1380 return bidi::kind::RLE;
1381 case 0xac:
1382 return bidi::kind::PDF;
1383 case 0xad:
1384 return bidi::kind::LRO;
1385 case 0xae:
1386 return bidi::kind::RLO;
1387 case 0x8e:
1388 return bidi::kind::LTR;
1389 case 0x8f:
1390 return bidi::kind::RTL;
1391 default:
1392 break;
1393 }
1394 else if (p[1] == 0x81)
1395 switch (p[2])
1396 {
1397 case 0xa6:
1398 return bidi::kind::LRI;
1399 case 0xa7:
1400 return bidi::kind::RLI;
1401 case 0xa8:
1402 return bidi::kind::FSI;
1403 case 0xa9:
1404 return bidi::kind::PDI;
1405 default:
1406 break;
1407 }
1408
1409 return bidi::kind::NONE;
1410 }
1411
1412 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1413 If the kind is not NONE, write the location to *OUT.*/
1414
1415 static bidi::kind
1416 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1417 {
1418 bidi::kind result = get_bidi_utf8_1 (p);
1419 if (result != bidi::kind::NONE)
1420 {
1421 /* We have a sequence of 3 bytes starting at P. */
1422 *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1423 }
1424 return result;
1425 }
1426
1427 /* Parse a UCN where P points just past \u or \U and return its bidi code. */
1428
1429 static bidi::kind
1430 get_bidi_ucn_1 (const unsigned char *p, bool is_U)
1431 {
1432 /* 6.4.3 Universal Character Names
1433 \u hex-quad
1434 \U hex-quad hex-quad
1435 where \unnnn means \U0000nnnn. */
1436
1437 if (is_U)
1438 {
1439 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1440 return bidi::kind::NONE;
1441 /* Skip 4B so we can treat \u and \U the same below. */
1442 p += 4;
1443 }
1444
1445 /* All code points we are looking for start with 20xx. */
1446 if (p[0] != '2' || p[1] != '0')
1447 return bidi::kind::NONE;
1448 else if (p[2] == '2')
1449 switch (p[3])
1450 {
1451 case 'a':
1452 case 'A':
1453 return bidi::kind::LRE;
1454 case 'b':
1455 case 'B':
1456 return bidi::kind::RLE;
1457 case 'c':
1458 case 'C':
1459 return bidi::kind::PDF;
1460 case 'd':
1461 case 'D':
1462 return bidi::kind::LRO;
1463 case 'e':
1464 case 'E':
1465 return bidi::kind::RLO;
1466 default:
1467 break;
1468 }
1469 else if (p[2] == '6')
1470 switch (p[3])
1471 {
1472 case '6':
1473 return bidi::kind::LRI;
1474 case '7':
1475 return bidi::kind::RLI;
1476 case '8':
1477 return bidi::kind::FSI;
1478 case '9':
1479 return bidi::kind::PDI;
1480 default:
1481 break;
1482 }
1483 else if (p[2] == '0')
1484 switch (p[3])
1485 {
1486 case 'e':
1487 case 'E':
1488 return bidi::kind::LTR;
1489 case 'f':
1490 case 'F':
1491 return bidi::kind::RTL;
1492 default:
1493 break;
1494 }
1495
1496 return bidi::kind::NONE;
1497 }
1498
1499 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1500 If the kind is not NONE, write the location to *OUT.*/
1501
1502 static bidi::kind
1503 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1504 location_t *out)
1505 {
1506 bidi::kind result = get_bidi_ucn_1 (p, is_U);
1507 if (result != bidi::kind::NONE)
1508 {
1509 const unsigned char *start = p - 2;
1510 size_t num_bytes = 2 + (is_U ? 8 : 4);
1511 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1512 }
1513 return result;
1514 }
1515
1516 /* Subclass of rich_location for reporting on unpaired UTF-8
1517 bidirectional control character(s).
1518 Escape the source lines on output, and show all unclosed
1519 bidi context, labelling everything. */
1520
1521 class unpaired_bidi_rich_location : public rich_location
1522 {
1523 public:
1524 class custom_range_label : public range_label
1525 {
1526 public:
1527 label_text get_text (unsigned range_idx) const FINAL OVERRIDE
1528 {
1529 /* range 0 is the primary location; each subsequent range i + 1
1530 is for bidi::vec[i]. */
1531 if (range_idx > 0)
1532 {
1533 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1534 return label_text::borrow (bidi::to_str (ctxt.m_kind));
1535 }
1536 else
1537 return label_text::borrow (_("end of bidirectional context"));
1538 }
1539 };
1540
1541 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1542 : rich_location (pfile->line_table, loc, &m_custom_label)
1543 {
1544 set_escape_on_output (true);
1545 for (unsigned i = 0; i < bidi::vec.count (); i++)
1546 add_range (bidi::vec[i].m_loc,
1547 SHOW_RANGE_WITHOUT_CARET,
1548 &m_custom_label);
1549 }
1550
1551 private:
1552 custom_range_label m_custom_label;
1553 };
1554
1555 /* We're closing a bidi context, that is, we've encountered a newline,
1556 are closing a C-style comment, or are at the end of a string literal,
1557 character constant, or identifier. Warn if this context was not
1558 properly terminated by a PDI or PDF. P points to the last character
1559 in this context. */
1560
1561 static void
1562 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1563 {
1564 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1565 if (bidi::vec.count () > 0
1566 && (warn_bidi & bidirectional_unpaired
1567 && (!bidi::current_ctx_ucn_p ()
1568 || (warn_bidi & bidirectional_ucn))))
1569 {
1570 const location_t loc
1571 = linemap_position_for_column (pfile->line_table,
1572 CPP_BUF_COLUMN (pfile->buffer, p));
1573 unpaired_bidi_rich_location rich_loc (pfile, loc);
1574 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1575 forms of a diagnostic, so fake it for now. */
1576 if (bidi::vec.count () > 1)
1577 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1578 "unpaired UTF-8 bidirectional control characters "
1579 "detected");
1580 else
1581 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1582 "unpaired UTF-8 bidirectional control character "
1583 "detected");
1584 }
1585 /* We're done with this context. */
1586 bidi::on_close ();
1587 }
1588
1589 /* We're at the beginning or in the middle of an identifier/comment/string
1590 literal/character constant. Warn if we've encountered a bidi character.
1591 KIND says which bidi control character it was; UCN_P is true iff this bidi
1592 control character was written as a UCN. LOC is the location of the
1593 character, but is only valid if KIND != bidi::kind::NONE. */
1594
1595 static void
1596 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1597 bool ucn_p, location_t loc)
1598 {
1599 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1600 return;
1601
1602 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1603
1604 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1605 {
1606 rich_location rich_loc (pfile->line_table, loc);
1607 rich_loc.set_escape_on_output (true);
1608
1609 /* It seems excessive to warn about a PDI/PDF that is closing
1610 an opened context because we've already warned about the
1611 opening character. Except warn when we have a UCN x UTF-8
1612 mismatch, if UCN checking is enabled. */
1613 if (kind == bidi::current_ctx ())
1614 {
1615 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1616 && bidi::current_ctx_ucn_p () != ucn_p)
1617 {
1618 rich_loc.add_range (bidi::current_ctx_loc ());
1619 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1620 "UTF-8 vs UCN mismatch when closing "
1621 "a context by \"%s\"", bidi::to_str (kind));
1622 }
1623 }
1624 else if (warn_bidi & bidirectional_any
1625 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1626 {
1627 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1628 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1629 "\"%s\" is closing an unopened context",
1630 bidi::to_str (kind));
1631 else
1632 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1633 "found problematic Unicode character \"%s\"",
1634 bidi::to_str (kind));
1635 }
1636 }
1637 /* We're done with this context. */
1638 bidi::on_char (kind, ucn_p, loc);
1639 }
1640
1641 /* Skip a C-style block comment. We find the end of the comment by
1642 seeing if an asterisk is before every '/' we encounter. Returns
1643 nonzero if comment terminated by EOF, zero otherwise.
1644
1645 Buffer->cur points to the initial asterisk of the comment. */
1646 bool
1647 _cpp_skip_block_comment (cpp_reader *pfile)
1648 {
1649 cpp_buffer *buffer = pfile->buffer;
1650 const uchar *cur = buffer->cur;
1651 uchar c;
1652 const bool warn_bidi_p = pfile->warn_bidi_p ();
1653
1654 cur++;
1655 if (*cur == '/')
1656 cur++;
1657
1658 for (;;)
1659 {
1660 /* People like decorating comments with '*', so check for '/'
1661 instead for efficiency. */
1662 c = *cur++;
1663
1664 if (c == '/')
1665 {
1666 if (cur[-2] == '*')
1667 {
1668 if (warn_bidi_p)
1669 maybe_warn_bidi_on_close (pfile, cur);
1670 break;
1671 }
1672
1673 /* Warn about potential nested comments, but not if the '/'
1674 comes immediately before the true comment delimiter.
1675 Don't bother to get it right across escaped newlines. */
1676 if (CPP_OPTION (pfile, warn_comments)
1677 && cur[0] == '*' && cur[1] != '/')
1678 {
1679 buffer->cur = cur;
1680 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1681 pfile->line_table->highest_line,
1682 CPP_BUF_COL (buffer),
1683 "\"/*\" within comment");
1684 }
1685 }
1686 else if (c == '\n')
1687 {
1688 unsigned int cols;
1689 buffer->cur = cur - 1;
1690 if (warn_bidi_p)
1691 maybe_warn_bidi_on_close (pfile, cur);
1692 _cpp_process_line_notes (pfile, true);
1693 if (buffer->next_line >= buffer->rlimit)
1694 return true;
1695 _cpp_clean_line (pfile);
1696
1697 cols = buffer->next_line - buffer->line_base;
1698 CPP_INCREMENT_LINE (pfile, cols);
1699
1700 cur = buffer->cur;
1701 }
1702 /* If this is a beginning of a UTF-8 encoding, it might be
1703 a bidirectional control character. */
1704 else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
1705 {
1706 location_t loc;
1707 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1708 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1709 }
1710 }
1711
1712 buffer->cur = cur;
1713 _cpp_process_line_notes (pfile, true);
1714 return false;
1715 }
1716
1717 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1718 terminating newline. Handles escaped newlines. Returns nonzero
1719 if a multiline comment. */
1720 static int
1721 skip_line_comment (cpp_reader *pfile)
1722 {
1723 cpp_buffer *buffer = pfile->buffer;
1724 location_t orig_line = pfile->line_table->highest_line;
1725 const bool warn_bidi_p = pfile->warn_bidi_p ();
1726
1727 if (!warn_bidi_p)
1728 while (*buffer->cur != '\n')
1729 buffer->cur++;
1730 else
1731 {
1732 while (*buffer->cur != '\n'
1733 && *buffer->cur != bidi::utf8_start)
1734 buffer->cur++;
1735 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1736 {
1737 while (*buffer->cur != '\n')
1738 {
1739 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1740 {
1741 location_t loc;
1742 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1743 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1744 }
1745 buffer->cur++;
1746 }
1747 maybe_warn_bidi_on_close (pfile, buffer->cur);
1748 }
1749 }
1750
1751 _cpp_process_line_notes (pfile, true);
1752 return orig_line != pfile->line_table->highest_line;
1753 }
1754
1755 /* Skips whitespace, saving the next non-whitespace character. */
1756 static void
1757 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1758 {
1759 cpp_buffer *buffer = pfile->buffer;
1760 bool saw_NUL = false;
1761
1762 do
1763 {
1764 /* Horizontal space always OK. */
1765 if (c == ' ' || c == '\t')
1766 ;
1767 /* Just \f \v or \0 left. */
1768 else if (c == '\0')
1769 saw_NUL = true;
1770 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1771 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1772 CPP_BUF_COL (buffer),
1773 "%s in preprocessing directive",
1774 c == '\f' ? "form feed" : "vertical tab");
1775
1776 c = *buffer->cur++;
1777 }
1778 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1779 while (is_nvspace (c));
1780
1781 if (saw_NUL)
1782 {
1783 encoding_rich_location rich_loc (pfile);
1784 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1785 "null character(s) ignored");
1786 }
1787
1788 buffer->cur--;
1789 }
1790
1791 /* See if the characters of a number token are valid in a name (no
1792 '.', '+' or '-'). */
1793 static int
1794 name_p (cpp_reader *pfile, const cpp_string *string)
1795 {
1796 unsigned int i;
1797
1798 for (i = 0; i < string->len; i++)
1799 if (!is_idchar (string->text[i]))
1800 return 0;
1801
1802 return 1;
1803 }
1804
1805 /* After parsing an identifier or other sequence, produce a warning about
1806 sequences not in NFC/NFKC. */
1807 static void
1808 warn_about_normalization (cpp_reader *pfile,
1809 const cpp_token *token,
1810 const struct normalize_state *s)
1811 {
1812 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1813 && !pfile->state.skipping)
1814 {
1815 location_t loc = token->src_loc;
1816
1817 /* If possible, create a location range for the token. */
1818 if (loc >= RESERVED_LOCATION_COUNT
1819 && token->type != CPP_EOF
1820 /* There must be no line notes to process. */
1821 && (!(pfile->buffer->cur
1822 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
1823 && !pfile->overlaid_buffer)))
1824 {
1825 source_range tok_range;
1826 tok_range.m_start = loc;
1827 tok_range.m_finish
1828 = linemap_position_for_column (pfile->line_table,
1829 CPP_BUF_COLUMN (pfile->buffer,
1830 pfile->buffer->cur));
1831 loc = COMBINE_LOCATION_DATA (pfile->line_table,
1832 loc, tok_range, NULL);
1833 }
1834
1835 encoding_rich_location rich_loc (pfile, loc);
1836
1837 /* Make sure that the token is printed using UCNs, even
1838 if we'd otherwise happily print UTF-8. */
1839 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1840 size_t sz;
1841
1842 sz = cpp_spell_token (pfile, token, buf, false) - buf;
1843 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1844 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1845 "`%.*s' is not in NFKC", (int) sz, buf);
1846 else if (CPP_OPTION (pfile, cplusplus))
1847 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1848 "`%.*s' is not in NFC", (int) sz, buf);
1849 else
1850 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1851 "`%.*s' is not in NFC", (int) sz, buf);
1852 free (buf);
1853 }
1854 }
1855
1856 static const cppchar_t utf8_signifier = 0xC0;
1857
1858 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1859 an identifier. FIRST is TRUE if this starts an identifier. */
1860
1861 static bool
1862 forms_identifier_p (cpp_reader *pfile, int first,
1863 struct normalize_state *state)
1864 {
1865 cpp_buffer *buffer = pfile->buffer;
1866 const bool warn_bidi_p = pfile->warn_bidi_p ();
1867
1868 if (*buffer->cur == '$')
1869 {
1870 if (!CPP_OPTION (pfile, dollars_in_ident))
1871 return false;
1872
1873 buffer->cur++;
1874 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1875 {
1876 CPP_OPTION (pfile, warn_dollars) = 0;
1877 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1878 }
1879
1880 return true;
1881 }
1882
1883 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
1884 if (CPP_OPTION (pfile, extended_identifiers))
1885 {
1886 cppchar_t s;
1887 if (*buffer->cur >= utf8_signifier)
1888 {
1889 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
1890 && warn_bidi_p)
1891 {
1892 location_t loc;
1893 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1894 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1895 }
1896 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1897 state, &s))
1898 return true;
1899 }
1900 else if (*buffer->cur == '\\'
1901 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1902 {
1903 buffer->cur += 2;
1904 if (warn_bidi_p)
1905 {
1906 location_t loc;
1907 bidi::kind kind = get_bidi_ucn (pfile,
1908 buffer->cur,
1909 buffer->cur[-1] == 'U',
1910 &loc);
1911 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
1912 }
1913 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1914 state, &s, NULL, NULL))
1915 return true;
1916 buffer->cur -= 2;
1917 }
1918 }
1919
1920 return false;
1921 }
1922
1923 /* Helper function to issue error about improper __VA_OPT__ use. */
1924 static void
1925 maybe_va_opt_error (cpp_reader *pfile)
1926 {
1927 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1928 {
1929 /* __VA_OPT__ should not be accepted at all, but allow it in
1930 system headers. */
1931 if (!_cpp_in_system_header (pfile))
1932 cpp_error (pfile, CPP_DL_PEDWARN,
1933 "__VA_OPT__ is not available until C++20");
1934 }
1935 else if (!pfile->state.va_args_ok)
1936 {
1937 /* __VA_OPT__ should only appear in the replacement list of a
1938 variadic macro. */
1939 cpp_error (pfile, CPP_DL_PEDWARN,
1940 "__VA_OPT__ can only appear in the expansion"
1941 " of a C++20 variadic macro");
1942 }
1943 }
1944
1945 /* Helper function to get the cpp_hashnode of the identifier BASE. */
1946 static cpp_hashnode *
1947 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1948 {
1949 cpp_hashnode *result;
1950 const uchar *cur;
1951 unsigned int len;
1952 unsigned int hash = HT_HASHSTEP (0, *base);
1953
1954 cur = base + 1;
1955 while (ISIDNUM (*cur))
1956 {
1957 hash = HT_HASHSTEP (hash, *cur);
1958 cur++;
1959 }
1960 len = cur - base;
1961 hash = HT_HASHFINISH (hash, len);
1962 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1963 base, len, hash, HT_ALLOC));
1964
1965 /* Rarely, identifiers require diagnostics when lexed. */
1966 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1967 && !pfile->state.skipping, 0))
1968 {
1969 /* It is allowed to poison the same identifier twice. */
1970 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1971 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1972 NODE_NAME (result));
1973
1974 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1975 replacement list of a variadic macro. */
1976 if (result == pfile->spec_nodes.n__VA_ARGS__
1977 && !pfile->state.va_args_ok)
1978 {
1979 if (CPP_OPTION (pfile, cplusplus))
1980 cpp_error (pfile, CPP_DL_PEDWARN,
1981 "__VA_ARGS__ can only appear in the expansion"
1982 " of a C++11 variadic macro");
1983 else
1984 cpp_error (pfile, CPP_DL_PEDWARN,
1985 "__VA_ARGS__ can only appear in the expansion"
1986 " of a C99 variadic macro");
1987 }
1988
1989 if (result == pfile->spec_nodes.n__VA_OPT__)
1990 maybe_va_opt_error (pfile);
1991
1992 /* For -Wc++-compat, warn about use of C++ named operators. */
1993 if (result->flags & NODE_WARN_OPERATOR)
1994 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1995 "identifier \"%s\" is a special operator name in C++",
1996 NODE_NAME (result));
1997 }
1998
1999 return result;
2000 }
2001
2002 /* Get the cpp_hashnode of an identifier specified by NAME in
2003 the current cpp_reader object. If none is found, NULL is returned. */
2004 cpp_hashnode *
2005 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2006 {
2007 cpp_hashnode *result;
2008 result = lex_identifier_intern (pfile, (uchar *) name);
2009 return result;
2010 }
2011
2012 /* Lex an identifier starting at BUFFER->CUR - 1. */
2013 static cpp_hashnode *
2014 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2015 struct normalize_state *nst, cpp_hashnode **spelling)
2016 {
2017 cpp_hashnode *result;
2018 const uchar *cur;
2019 unsigned int len;
2020 unsigned int hash = HT_HASHSTEP (0, *base);
2021 const bool warn_bidi_p = pfile->warn_bidi_p ();
2022
2023 cur = pfile->buffer->cur;
2024 if (! starts_ucn)
2025 {
2026 while (ISIDNUM (*cur))
2027 {
2028 hash = HT_HASHSTEP (hash, *cur);
2029 cur++;
2030 }
2031 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2032 }
2033 pfile->buffer->cur = cur;
2034 if (starts_ucn || forms_identifier_p (pfile, false, nst))
2035 {
2036 /* Slower version for identifiers containing UCNs
2037 or extended chars (including $). */
2038 do {
2039 while (ISIDNUM (*pfile->buffer->cur))
2040 {
2041 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2042 pfile->buffer->cur++;
2043 }
2044 } while (forms_identifier_p (pfile, false, nst));
2045 if (warn_bidi_p)
2046 maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2047 result = _cpp_interpret_identifier (pfile, base,
2048 pfile->buffer->cur - base);
2049 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2050 }
2051 else
2052 {
2053 len = cur - base;
2054 hash = HT_HASHFINISH (hash, len);
2055
2056 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2057 base, len, hash, HT_ALLOC));
2058 *spelling = result;
2059 }
2060
2061 /* Rarely, identifiers require diagnostics when lexed. */
2062 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2063 && !pfile->state.skipping, 0))
2064 {
2065 /* It is allowed to poison the same identifier twice. */
2066 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2067 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2068 NODE_NAME (result));
2069
2070 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2071 replacement list of a variadic macro. */
2072 if (result == pfile->spec_nodes.n__VA_ARGS__
2073 && !pfile->state.va_args_ok)
2074 {
2075 if (CPP_OPTION (pfile, cplusplus))
2076 cpp_error (pfile, CPP_DL_PEDWARN,
2077 "__VA_ARGS__ can only appear in the expansion"
2078 " of a C++11 variadic macro");
2079 else
2080 cpp_error (pfile, CPP_DL_PEDWARN,
2081 "__VA_ARGS__ can only appear in the expansion"
2082 " of a C99 variadic macro");
2083 }
2084
2085 /* __VA_OPT__ should only appear in the replacement list of a
2086 variadic macro. */
2087 if (result == pfile->spec_nodes.n__VA_OPT__)
2088 maybe_va_opt_error (pfile);
2089
2090 /* For -Wc++-compat, warn about use of C++ named operators. */
2091 if (result->flags & NODE_WARN_OPERATOR)
2092 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2093 "identifier \"%s\" is a special operator name in C++",
2094 NODE_NAME (result));
2095 }
2096
2097 return result;
2098 }
2099
2100 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2101 static void
2102 lex_number (cpp_reader *pfile, cpp_string *number,
2103 struct normalize_state *nst)
2104 {
2105 const uchar *cur;
2106 const uchar *base;
2107 uchar *dest;
2108
2109 base = pfile->buffer->cur - 1;
2110 do
2111 {
2112 const uchar *adj_digit_sep = NULL;
2113 cur = pfile->buffer->cur;
2114
2115 /* N.B. ISIDNUM does not include $. */
2116 while (ISIDNUM (*cur)
2117 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2118 || DIGIT_SEP (*cur)
2119 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2120 {
2121 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2122 /* Adjacent digit separators do not form part of the pp-number syntax.
2123 However, they can safely be diagnosed here as an error, since '' is
2124 not a valid preprocessing token. */
2125 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2126 adj_digit_sep = cur;
2127 cur++;
2128 }
2129 /* A number can't end with a digit separator. */
2130 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2131 --cur;
2132 if (adj_digit_sep && adj_digit_sep < cur)
2133 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2134
2135 pfile->buffer->cur = cur;
2136 }
2137 while (forms_identifier_p (pfile, false, nst));
2138
2139 number->len = cur - base;
2140 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2141 memcpy (dest, base, number->len);
2142 dest[number->len] = '\0';
2143 number->text = dest;
2144 }
2145
2146 /* Create a token of type TYPE with a literal spelling. */
2147 static void
2148 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2149 unsigned int len, enum cpp_ttype type)
2150 {
2151 token->type = type;
2152 token->val.str.len = len;
2153 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2154 }
2155
2156 const uchar *
2157 cpp_alloc_token_string (cpp_reader *pfile,
2158 const unsigned char *ptr, unsigned len)
2159 {
2160 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2161
2162 dest[len] = 0;
2163 memcpy (dest, ptr, len);
2164 return dest;
2165 }
2166
2167 /* A pair of raw buffer pointers. The currently open one is [1], the
2168 first one is [0]. Used for string literal lexing. */
2169 struct lit_accum {
2170 _cpp_buff *first;
2171 _cpp_buff *last;
2172 const uchar *rpos;
2173 size_t accum;
2174
2175 lit_accum ()
2176 : first (NULL), last (NULL), rpos (0), accum (0)
2177 {
2178 }
2179
2180 void append (cpp_reader *, const uchar *, size_t);
2181
2182 void read_begin (cpp_reader *);
2183 bool reading_p () const
2184 {
2185 return rpos != NULL;
2186 }
2187 char read_char ()
2188 {
2189 char c = *rpos++;
2190 if (rpos == BUFF_FRONT (last))
2191 rpos = NULL;
2192 return c;
2193 }
2194 };
2195
2196 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2197 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2198
2199 void
2200 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2201 {
2202 if (!last)
2203 /* Starting. */
2204 first = last = _cpp_get_buff (pfile, len);
2205 else if (len > BUFF_ROOM (last))
2206 {
2207 /* There is insufficient room in the buffer. Copy what we can,
2208 and then either extend or create a new one. */
2209 size_t room = BUFF_ROOM (last);
2210 memcpy (BUFF_FRONT (last), base, room);
2211 BUFF_FRONT (last) += room;
2212 base += room;
2213 len -= room;
2214 accum += room;
2215
2216 gcc_checking_assert (!rpos);
2217
2218 last = _cpp_append_extend_buff (pfile, last, len);
2219 }
2220
2221 memcpy (BUFF_FRONT (last), base, len);
2222 BUFF_FRONT (last) += len;
2223 accum += len;
2224 }
2225
2226 void
2227 lit_accum::read_begin (cpp_reader *pfile)
2228 {
2229 /* We never accumulate more than 4 chars to read. */
2230 if (BUFF_ROOM (last) < 4)
2231
2232 last = _cpp_append_extend_buff (pfile, last, 4);
2233 rpos = BUFF_FRONT (last);
2234 }
2235
2236 /* Returns true if a macro has been defined.
2237 This might not work if compile with -save-temps,
2238 or preprocess separately from compilation. */
2239
2240 static bool
2241 is_macro(cpp_reader *pfile, const uchar *base)
2242 {
2243 const uchar *cur = base;
2244 if (! ISIDST (*cur))
2245 return false;
2246 unsigned int hash = HT_HASHSTEP (0, *cur);
2247 ++cur;
2248 while (ISIDNUM (*cur))
2249 {
2250 hash = HT_HASHSTEP (hash, *cur);
2251 ++cur;
2252 }
2253 hash = HT_HASHFINISH (hash, cur - base);
2254
2255 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2256 base, cur - base, hash, HT_NO_INSERT));
2257
2258 return result && cpp_macro_p (result);
2259 }
2260
2261 /* Returns true if a literal suffix does not have the expected form
2262 and is defined as a macro. */
2263
2264 static bool
2265 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2266 {
2267 /* User-defined literals outside of namespace std must start with a single
2268 underscore, so assume anything of that form really is a UDL suffix.
2269 We don't need to worry about UDLs defined inside namespace std because
2270 their names are reserved, so cannot be used as macro names in valid
2271 programs. */
2272 if (base[0] == '_' && base[1] != '_')
2273 return false;
2274 return is_macro (pfile, base);
2275 }
2276
2277 /* Lexes a raw string. The stored string contains the spelling,
2278 including double quotes, delimiter string, '(' and ')', any leading
2279 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2280 the type of the literal, or CPP_OTHER if it was not properly
2281 terminated.
2282
2283 BASE is the start of the token. Updates pfile->buffer->cur to just
2284 after the lexed string.
2285
2286 The spelling is NUL-terminated, but it is not guaranteed that this
2287 is the first NUL since embedded NULs are preserved. */
/* If the input runs out before the string is closed, TOKEN is instead
   turned into a CPP_EOF token, an "unterminated raw string" error is
   issued and the buffer is popped.  */
2288
2289 static void
2290 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2291 {
2292 const uchar *pos = base;
2293 const bool warn_bidi_p = pfile->warn_bidi_p ();
2294
2295 /* 'tis a pity this information isn't passed down from the lexer's
2296 initial categorization of the token. */
2297 enum cpp_ttype type = CPP_STRING;
2298
/* Work out the literal type from the encoding prefix and step over it.  */
2299 if (*pos == 'L')
2300 {
2301 type = CPP_WSTRING;
2302 pos++;
2303 }
2304 else if (*pos == 'U')
2305 {
2306 type = CPP_STRING32;
2307 pos++;
2308 }
2309 else if (*pos == 'u')
2310 {
2311 if (pos[1] == '8')
2312 {
2313 type = CPP_UTF8STRING;
2314 pos++;
2315 }
2316 else
2317 type = CPP_STRING16;
2318 pos++;
2319 }
2320
2321 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2322 pos += 2;
2323
2324 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2325
2326 /* Skip notes before the ". */
2327 while (note->pos < pos)
2328 ++note;
2329
/* Collects the spelling whenever it cannot be taken directly from the
   current line buffer (line notes to undo, or a string that spans
   several lines).  */
2330 lit_accum accum;
2331
/* PREFIX holds the delimiter (at most 16 chars) followed by the '"'
   that is stored once the opening '(' is seen; PREFIX_LEN counts both.
   PREFIX_LEN is reset to zero when the delimiter turns out to be
   invalid.  */
2332 uchar prefix[17];
2333 unsigned prefix_len = 0;
/* PHASE_PREFIX: still reading the delimiter.  PHASE_NONE: inside the
   string, not matching anything.  Values from PHASE_SUFFIX upwards are
   the index in PREFIX of the next char expected after a ')'.  */
2334 enum Phase
2335 {
2336 PHASE_PREFIX = -2,
2337 PHASE_NONE = -1,
2338 PHASE_SUFFIX = 0
2339 } phase = PHASE_PREFIX;
2340
2341 for (;;)
2342 {
2343 gcc_checking_assert (note->pos >= pos);
2344
2345 /* Undo any escaped newlines and trigraphs. */
2346 if (!accum.reading_p () && note->pos == pos)
2347 switch (note->type)
2348 {
2349 case '\\':
2350 case ' ':
2351 /* Restore backslash followed by newline. */
2352 accum.append (pfile, base, pos - base);
2353 base = pos;
2354 accum.read_begin (pfile);
2355 accum.append (pfile, UC"\\", 1);
2356
2357 after_backslash:
2358 if (note->type == ' ')
2359 /* GNU backslash whitespace newline extension. FIXME
2360 could be any sequence of non-vertical space. When we
2361 can properly restore any such sequence, we should
2362 mark this note as handled so _cpp_process_line_notes
2363 doesn't warn. */
2364 accum.append (pfile, UC" ", 1);
2365
2366 accum.append (pfile, UC"\n", 1);
2367 note++;
2368 break;
2369
2370 case '\n':
2371 /* This can happen for ??/<NEWLINE> when trigraphs are not
2372 being interpreted. */
2373 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2374 note->type = 0;
2375 note++;
2376 break;
2377
2378 default:
2379 gcc_checking_assert (_cpp_trigraph_map[note->type]);
2380
2381 /* Don't warn about this trigraph in
2382 _cpp_process_line_notes, since trigraphs show up as
2383 trigraphs in raw strings. */
/* Note that this local TYPE shadows the function-level literal TYPE
   for the rest of this case.  */
2384 uchar type = note->type;
2385 note->type = 0;
2386
2387 if (CPP_OPTION (pfile, trigraphs))
2388 {
2389 accum.append (pfile, base, pos - base);
2390 base = pos;
2391 accum.read_begin (pfile);
2392 accum.append (pfile, UC"??", 2);
2393 accum.append (pfile, &type, 1);
2394
2395 /* ??/ followed by newline gets two line notes, one for
2396 the trigraph and one for the backslash/newline. */
2397 if (type == '/' && note[1].pos == pos)
2398 {
2399 note++;
2400 gcc_assert (note->type == '\\' || note->type == ' ');
2401 goto after_backslash;
2402 }
2403 /* Skip the replacement character. */
2404 base = ++pos;
2405 }
2406
2407 note++;
2408 break;
2409 }
2410
2411 /* Now get a char to process. Either from an expanded note, or
2412 from the line buffer. */
2413 bool read_note = accum.reading_p ();
2414 char c = read_note ? accum.read_char () : *pos++;
2415
2416 if (phase == PHASE_PREFIX)
2417 {
2418 if (c == '(')
2419 {
2420 /* Done. */
2421 phase = PHASE_NONE;
2422 prefix[prefix_len++] = '"';
2423 }
2424 else if (prefix_len < 16
2425 /* Prefix chars are any of the basic character set,
2426 [lex.charset] except for '
2427 ()\\\t\v\f\n'. Optimized for a contiguous
2428 alphabet. */
2429 /* Unlike a switch, this collapses down to one or
2430 two shift and bitmask operations on an ASCII
2431 system, with an outlier or two. */
2432 && (('Z' - 'A' == 25
2433 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2434 : ISIDST (c))
2435 || (c >= '0' && c <= '9')
2436 || c == '_' || c == '{' || c == '}'
2437 || c == '[' || c == ']' || c == '#'
2438 || c == '<' || c == '>' || c == '%'
2439 || c == ':' || c == ';' || c == '.' || c == '?'
2440 || c == '*' || c == '+' || c == '-' || c == '/'
2441 || c == '^' || c == '&' || c == '|' || c == '~'
2442 || c == '!' || c == '=' || c == ','
2443 || c == '"' || c == '\''))
2444 prefix[prefix_len++] = c;
2445 else
2446 {
2447 /* Something is wrong. */
2448 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2449 if (prefix_len == 16)
2450 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2451 col, "raw string delimiter longer "
2452 "than 16 characters");
2453 else if (c == '\n')
2454 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2455 col, "invalid new-line in raw "
2456 "string delimiter");
2457 else
2458 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2459 col, "invalid character '%c' in "
2460 "raw string delimiter", c);
2461 type = CPP_OTHER;
2462 phase = PHASE_NONE;
2463 /* Continue until we get a close quote, that's probably
2464 the best failure mode. */
2465 prefix_len = 0;
2466 }
/* A newline still has to go through the line handling further down.  */
2467 if (c != '\n')
2468 continue;
2469 }
2470
/* PHASE is never PHASE_PREFIX here, so anything other than PHASE_NONE
   means the closing delimiter is being matched: compare C with the
   next expected char and stop once the whole of PREFIX has matched.  */
2471 if (phase != PHASE_NONE)
2472 {
2473 if (prefix[phase] != c)
2474 phase = PHASE_NONE;
2475 else if (unsigned (phase + 1) == prefix_len)
2476 break;
2477 else
2478 {
2479 phase = Phase (phase + 1);
2480 continue;
2481 }
2482 }
2483
2484 if (!prefix_len && c == '"')
2485 /* Failure mode lexing. */
2486 goto out;
2487 else if (prefix_len && c == ')')
/* A ')' may begin the closing delimiter; start matching PREFIX.  */
2488 phase = PHASE_SUFFIX;
2489 else if (!read_note && c == '\n')
2490 {
/* End of the current line: back up onto the newline, then either give
   up or save this line's text and fetch the next line.  */
2491 pos--;
2492 pfile->buffer->cur = pos;
2493 if (pfile->state.in_directive
2494 || (pfile->state.parsing_args
2495 && pfile->buffer->next_line >= pfile->buffer->rlimit))
2496 {
2497 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2498 "unterminated raw string");
2499 type = CPP_OTHER;
2500 goto out;
2501 }
2502
/* The + 1 keeps the newline itself in the stored spelling.  */
2503 accum.append (pfile, base, pos - base + 1);
2504 _cpp_process_line_notes (pfile, false);
2505
2506 if (pfile->buffer->next_line < pfile->buffer->rlimit)
2507 CPP_INCREMENT_LINE (pfile, 0);
2508 pfile->buffer->need_line = true;
2509
2510 if (!_cpp_get_fresh_line (pfile))
2511 {
2512 /* We ran out of file and failed to get a line. */
2513 location_t src_loc = token->src_loc;
2514 token->type = CPP_EOF;
2515 /* Tell the compiler the line number of the EOF token. */
2516 token->src_loc = pfile->line_table->highest_line;
2517 token->flags = BOL;
2518 if (accum.first)
2519 _cpp_release_buff (pfile, accum.first);
2520 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2521 "unterminated raw string");
2522 /* Now pop the buffer that _cpp_get_fresh_line did not. */
2523 _cpp_pop_buffer (pfile);
2524 return;
2525 }
2526
/* Restart scanning at the beginning of the new line.  */
2527 pos = base = pfile->buffer->cur;
2528 note = &pfile->buffer->notes[pfile->buffer->cur_note];
2529 }
2530 else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
2531 && warn_bidi_p)
2532 {
2533 location_t loc;
2534 bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
2535 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2536 }
2537 }
2538
2539 if (warn_bidi_p)
2540 maybe_warn_bidi_on_close (pfile, pos);
2541
2542 if (CPP_OPTION (pfile, user_literals))
2543 {
2544 /* If a string format macro, say from inttypes.h, is placed touching
2545 a string literal it could be parsed as a C++11 user-defined string
2546 literal thus breaking the program. */
2547 if (is_macro_not_literal_suffix (pfile, pos))
2548 {
2549 /* Raise a warning, but do not consume subsequent tokens. */
2550 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2551 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2552 token->src_loc, 0,
2553 "invalid suffix on literal; C++11 requires "
2554 "a space between literal and string macro");
2555 }
2556 /* Grab user defined literal suffix. */
2557 else if (ISIDST (*pos))
2558 {
2559 type = cpp_userdef_string_add_type (type);
2560 ++pos;
2561
2562 while (ISIDNUM (*pos))
2563 ++pos;
2564 }
2565 }
2566
2567 out:
2568 pfile->buffer->cur = pos;
/* If nothing was accumulated, the whole spelling lies contiguously at
   BASE.  Otherwise concatenate the accumulated buffers with the part
   of the current line from BASE up to POS.  */
2569 if (!accum.accum)
2570 create_literal (pfile, token, base, pos - base, type);
2571 else
2572 {
2573 size_t extra_len = pos - base;
2574 uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2575
2576 token->type = type;
2577 token->val.str.len = accum.accum + extra_len;
2578 token->val.str.text = dest;
2579 for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2580 {
2581 size_t len = BUFF_FRONT (buf) - buf->base;
2582 memcpy (dest, buf->base, len);
2583 dest += len;
2584 }
2585 _cpp_release_buff (pfile, accum.first);
2586 memcpy (dest, base, extra_len);
2587 dest[extra_len] = '\0';
2588 }
2589 }
2590
2591 /* Lexes a string, character constant, or angle-bracketed header file
2592 name. The stored string contains the spelling, including opening
2593 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2594 'R' modifier. It returns the type of the literal, or CPP_OTHER
2595 if it was not properly terminated, or CPP_LESS for an unterminated
2596 header name which must be relexed as normal tokens.
2597
2598 The spelling is NUL-terminated, but it is not guaranteed that this
2599 is the first NUL since embedded NULs are preserved. */
2600 static void
2601 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2602 {
2603 bool saw_NUL = false;
2604 const uchar *cur;
2605 cppchar_t terminator;
2606 enum cpp_ttype type;
2607
2608 cur = base;
2609 terminator = *cur++;
2610 if (terminator == 'L' || terminator == 'U')
2611 terminator = *cur++;
2612 else if (terminator == 'u')
2613 {
2614 terminator = *cur++;
2615 if (terminator == '8')
2616 terminator = *cur++;
2617 }
2618 if (terminator == 'R')
2619 {
2620 lex_raw_string (pfile, token, base);
2621 return;
2622 }
2623 if (terminator == '"')
2624 type = (*base == 'L' ? CPP_WSTRING :
2625 *base == 'U' ? CPP_STRING32 :
2626 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2627 : CPP_STRING);
2628 else if (terminator == '\'')
2629 type = (*base == 'L' ? CPP_WCHAR :
2630 *base == 'U' ? CPP_CHAR32 :
2631 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2632 : CPP_CHAR);
2633 else
2634 terminator = '>', type = CPP_HEADER_NAME;
2635
2636 const bool warn_bidi_p = pfile->warn_bidi_p ();
2637 for (;;)
2638 {
2639 cppchar_t c = *cur++;
2640
2641 /* In #include-style directives, terminators are not escapable. */
2642 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2643 {
2644 if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
2645 {
2646 location_t loc;
2647 bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
2648 &loc);
2649 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2650 }
2651 cur++;
2652 }
2653 else if (c == terminator)
2654 {
2655 if (warn_bidi_p)
2656 maybe_warn_bidi_on_close (pfile, cur - 1);
2657 break;
2658 }
2659 else if (c == '\n')
2660 {
2661 cur--;
2662 /* Unmatched quotes always yield undefined behavior, but
2663 greedy lexing means that what appears to be an unterminated
2664 header name may actually be a legitimate sequence of tokens. */
2665 if (terminator == '>')
2666 {
2667 token->type = CPP_LESS;
2668 return;
2669 }
2670 type = CPP_OTHER;
2671 break;
2672 }
2673 else if (c == '\0')
2674 saw_NUL = true;
2675 else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
2676 {
2677 location_t loc;
2678 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
2679 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2680 }
2681 }
2682
2683 if (saw_NUL && !pfile->state.skipping)
2684 cpp_error (pfile, CPP_DL_WARNING,
2685 "null character(s) preserved in literal");
2686
2687 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2688 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2689 (int) terminator);
2690
2691 if (CPP_OPTION (pfile, user_literals))
2692 {
2693 /* If a string format macro, say from inttypes.h, is placed touching
2694 a string literal it could be parsed as a C++11 user-defined string
2695 literal thus breaking the program. */
2696 if (is_macro_not_literal_suffix (pfile, cur))
2697 {
2698 /* Raise a warning, but do not consume subsequent tokens. */
2699 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2700 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2701 token->src_loc, 0,
2702 "invalid suffix on literal; C++11 requires "
2703 "a space between literal and string macro");
2704 }
2705 /* Grab user defined literal suffix. */
2706 else if (ISIDST (*cur))
2707 {
2708 type = cpp_userdef_char_add_type (type);
2709 type = cpp_userdef_string_add_type (type);
2710 ++cur;
2711
2712 while (ISIDNUM (*cur))
2713 ++cur;
2714 }
2715 }
2716 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2717 && is_macro (pfile, cur)
2718 && !pfile->state.skipping)
2719 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2720 token->src_loc, 0, "C++11 requires a space "
2721 "between string literal and macro");
2722
2723 pfile->buffer->cur = cur;
2724 create_literal (pfile, token, base, cur - base, type);
2725 }
2726
2727 /* Return the comment table. The client may not make any assumption
2728 about the ordering of the table. */
2729 cpp_comment_table *
2730 cpp_get_comments (cpp_reader *pfile)
2731 {
2732 return &pfile->comments;
2733 }
2734
2735 /* Append a comment to the end of the comment table. */
2736 static void
2737 store_comment (cpp_reader *pfile, cpp_token *token)
2738 {
2739 int len;
2740
2741 if (pfile->comments.allocated == 0)
2742 {
2743 pfile->comments.allocated = 256;
2744 pfile->comments.entries = (cpp_comment *) xmalloc
2745 (pfile->comments.allocated * sizeof (cpp_comment));
2746 }
2747
2748 if (pfile->comments.count == pfile->comments.allocated)
2749 {
2750 pfile->comments.allocated *= 2;
2751 pfile->comments.entries = (cpp_comment *) xrealloc
2752 (pfile->comments.entries,
2753 pfile->comments.allocated * sizeof (cpp_comment));
2754 }
2755
2756 len = token->val.str.len;
2757
2758 /* Copy comment. Note, token may not be NULL terminated. */
2759 pfile->comments.entries[pfile->comments.count].comment =
2760 (char *) xmalloc (sizeof (char) * (len + 1));
2761 memcpy (pfile->comments.entries[pfile->comments.count].comment,
2762 token->val.str.text, len);
2763 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2764
2765 /* Set source location. */
2766 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2767
2768 /* Increment the count of entries in the comment table. */
2769 pfile->comments.count++;
2770 }
2771
2772 /* The stored comment includes the comment start and any terminator. */
2773 static void
2774 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2775 cppchar_t type)
2776 {
2777 unsigned char *buffer;
2778 unsigned int len, clen, i;
2779
2780 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
2781
2782 /* C++ comments probably (not definitely) have moved past a new
2783 line, which we don't want to save in the comment. */
2784 if (is_vspace (pfile->buffer->cur[-1]))
2785 len--;
2786
2787 /* If we are currently in a directive or in argument parsing, then
2788 we need to store all C++ comments as C comments internally, and
2789 so we need to allocate a little extra space in that case.
2790
2791 Note that the only time we encounter a directive here is
2792 when we are saving comments in a "#define". */
2793 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2794 && type == '/') ? len + 2 : len;
2795
2796 buffer = _cpp_unaligned_alloc (pfile, clen);
2797
2798 token->type = CPP_COMMENT;
2799 token->val.str.len = clen;
2800 token->val.str.text = buffer;
2801
2802 buffer[0] = '/';
2803 memcpy (buffer + 1, from, len - 1);
2804
2805 /* Finish conversion to a C comment, if necessary. */
2806 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2807 {
2808 buffer[1] = '*';
2809 buffer[clen - 2] = '*';
2810 buffer[clen - 1] = '/';
2811 /* As there can be in a C++ comments illegal sequences for C comments
2812 we need to filter them out. */
2813 for (i = 2; i < (clen - 2); i++)
2814 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2815 buffer[i] = '|';
2816 }
2817
2818 /* Finally store this comment for use by clients of libcpp. */
2819 store_comment (pfile, token);
2820 }
2821
2822 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2823 comment. */
/* COMMENT_START[0] is '*' for a C block comment; anything else is
   treated as a C++ line comment.  The comment text is examined from
   COMMENT_START + 1, with pfile->buffer->cur used as the upper bound
   in the length checks.  What counts as "recognized" depends on the
   cpp_warn_implicit_fallthrough option level.  */
2824
2825 static bool
2826 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2827 {
2828 const unsigned char *from = comment_start + 1;
2829
2830 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2831 {
2832 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2833 don't recognize any comments. The latter only checks attributes,
2834 the former doesn't warn. */
2835 case 0:
2836 default:
2837 return false;
2838 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2839 content it has. */
2840 case 1:
2841 return true;
2842 case 2:
2843 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2844 .*falls?[ \t-]*thr(u|ough).* regex. */
/* Try every start position that still leaves room for the shortest
   match, "fallthru".  */
2845 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2846 from++)
2847 {
2848 /* Is there anything like strpbrk with upper boundary, or
2849 memchr looking for 2 characters rather than just one? */
2850 if (from[0] != 'f' && from[0] != 'F')
2851 continue;
2852 if (from[1] != 'a' && from[1] != 'A')
2853 continue;
2854 if (from[2] != 'l' && from[2] != 'L')
2855 continue;
2856 if (from[3] != 'l' && from[3] != 'L')
2857 continue;
2858 from += sizeof "fall" - 1;
2859 if (from[0] == 's' || from[0] == 'S')
2860 from++;
2861 while (*from == ' ' || *from == '\t' || *from == '-')
2862 from++;
2863 if (from[0] != 't' && from[0] != 'T')
2864 continue;
2865 if (from[1] != 'h' && from[1] != 'H')
2866 continue;
2867 if (from[2] != 'r' && from[2] != 'R')
2868 continue;
2869 if (from[3] == 'u' || from[3] == 'U')
2870 return true;
2871 if (from[3] != 'o' && from[3] != 'O')
2872 continue;
2873 if (from[4] != 'u' && from[4] != 'U')
2874 continue;
2875 if (from[5] != 'g' && from[5] != 'G')
2876 continue;
2877 if (from[6] != 'h' && from[6] != 'H')
2878 continue;
2879 return true;
2880 }
2881 return false;
/* Levels 3 and 4 leave the switch and use the whole-comment matching
   below.  */
2882 case 3:
2883 case 4:
2884 break;
2885 }
2886
2887 /* Whole comment contents:
2888 -fallthrough
2889 @fallthrough@
2890 */
2891 if (*from == '-' || *from == '@')
2892 {
2893 size_t len = sizeof "fallthrough" - 1;
2894 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2895 return false;
2896 if (memcmp (from + 1, "fallthrough", len))
2897 return false;
2898 if (*from == '@')
2899 {
2900 if (from[len + 1] != '@')
2901 return false;
/* Count the closing '@' as part of what was matched.  */
2902 len++;
2903 }
2904 from += 1 + len;
2905 }
2906 /* Whole comment contents (regex):
2907 lint -fallthrough[ \t]*
2908 */
2909 else if (*from == 'l')
2910 {
2911 size_t len = sizeof "int -fallthrough" - 1;
2912 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2913 return false;
2914 if (memcmp (from + 1, "int -fallthrough", len))
2915 return false;
2916 from += 1 + len;
2917 while (*from == ' ' || *from == '\t')
2918 from++;
2919 }
2920 /* Whole comment contents (regex):
2921 [ \t]*FALLTHR(U|OUGH)[ \t]*
2922 */
2923 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2924 {
2925 while (*from == ' ' || *from == '\t')
2926 from++;
2927 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
2928 return false;
2929 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2930 return false;
2931 from += sizeof "FALLTHR" - 1;
2932 if (*from == 'U')
2933 from++;
2934 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
2935 return false;
2936 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2937 return false;
2938 else
2939 from += sizeof "OUGH" - 1;
2940 while (*from == ' ' || *from == '\t')
2941 from++;
2942 }
2943 /* Whole comment contents (regex):
2944 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2945 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2946 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2947 */
2948 else
2949 {
2950 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2951 from++;
/* F is the first letter of the word currently being matched;
   ALL_UPPER records that an all-capitals spelling has been seen, which
   the rest of the phrase then has to follow.  */
2952 unsigned char f = *from;
2953 bool all_upper = false;
2954 if (f == 'E' || f == 'e')
2955 {
2956 if ((size_t) (pfile->buffer->cur - from)
2957 < sizeof "else fallthru" - 1)
2958 return false;
2959 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2960 all_upper = true;
2961 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2962 return false;
2963 from += sizeof "else" - 1;
2964 if (*from == ',')
2965 from++;
2966 if (*from != ' ')
2967 return false;
2968 from++;
2969 if (all_upper && *from == 'f')
2970 return false;
2971 if (f == 'e' && *from == 'F')
2972 return false;
2973 f = *from;
2974 }
2975 else if (f == 'I' || f == 'i')
2976 {
2977 if ((size_t) (pfile->buffer->cur - from)
2978 < sizeof "intentional fallthru" - 1)
2979 return false;
2980 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2981 sizeof "NTENTIONAL" - 1) == 0)
2982 all_upper = true;
2983 else if (memcmp (from + 1, "ntentional",
2984 sizeof "ntentional" - 1))
2985 return false;
2986 from += sizeof "intentional" - 1;
2987 if (*from == ' ')
2988 {
2989 from++;
2990 if (all_upper && *from == 'f')
2991 return false;
2992 }
2993 else if (all_upper)
2994 {
/* The comparison includes the following 'F', but only "LY " is
   stepped over, so FROM ends up on that 'F'.  */
2995 if (memcmp (from, "LY F", sizeof "LY F" - 1))
2996 return false;
2997 from += sizeof "LY " - 1;
2998 }
2999 else
3000 {
3001 if (memcmp (from, "ly ", sizeof "ly " - 1))
3002 return false;
3003 from += sizeof "ly " - 1;
3004 }
3005 if (f == 'i' && *from == 'F')
3006 return false;
3007 f = *from;
3008 }
/* Whether or not a leading word was consumed, FROM must now be at
   the "fall" part.  */
3009 if (f != 'F' && f != 'f')
3010 return false;
3011 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3012 return false;
3013 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3014 all_upper = true;
3015 else if (all_upper)
3016 return false;
3017 else if (memcmp (from + 1, "all", sizeof "all" - 1))
3018 return false;
3019 from += sizeof "fall" - 1;
3020 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3021 from += 2;
3022 else if (*from == ' ' || *from == '-')
3023 from++;
3024 else if (*from != (all_upper ? 'T' : 't'))
3025 return false;
/* Accept 't' unless the phrase is all upper case, and 'T' only when
   "fall" itself began with a capital 'F'.  */
3026 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3027 return false;
3028 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3029 return false;
3030 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3031 {
3032 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3033 return false;
3034 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3035 sizeof "hrough" - 1))
3036 return false;
3037 from += sizeof "through" - 1;
3038 }
3039 else
3040 from += sizeof "thru" - 1;
3041 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3042 from++;
3043 if (*from == '-')
3044 {
/* Optional "-..." tail: skip free-form text up to the end of the
   comment or the end of the line.  */
3045 from++;
3046 if (*comment_start == '*')
3047 {
3048 do
3049 {
3050 while (*from && *from != '*'
3051 && *from != '\n' && *from != '\r')
3052 from++;
/* Stop unless this is a '*' that does not close the comment.  */
3053 if (*from != '*' || from[1] == '/')
3054 break;
3055 from++;
3056 }
3057 while (1);
3058 }
3059 else
3060 while (*from && *from != '\n' && *from != '\r')
3061 from++;
3062 }
3063 }
/* Whichever form matched, it must be followed directly by the end of
   the comment.  */
3064 /* C block comment. */
3065 if (*comment_start == '*')
3066 {
3067 if (*from != '*' || from[1] != '/')
3068 return false;
3069 }
3070 /* C++ line comment. */
3071 else if (*from != '\n')
3072 return false;
3073
3074 return true;
3075 }
3076
3077 /* Allocate COUNT tokens for RUN. */
3078 void
3079 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3080 {
3081 run->base = XNEWVEC (cpp_token, count);
3082 run->limit = run->base + count;
3083 run->next = NULL;
3084 }
3085
3086 /* Returns the next tokenrun, or creates one if there is none. */
3087 static tokenrun *
3088 next_tokenrun (tokenrun *run)
3089 {
3090 if (run->next == NULL)
3091 {
3092 run->next = XNEW (tokenrun);
3093 run->next->prev = run;
3094 _cpp_init_tokenrun (run->next, 250);
3095 }
3096
3097 return run->next;
3098 }
3099
3100 /* Return the number of not yet processed token in a given
3101 context. */
3102 int
3103 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3104 {
3105 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3106 return (LAST (context).token - FIRST (context).token);
3107 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3108 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3109 return (LAST (context).ptoken - FIRST (context).ptoken);
3110 else
3111 abort ();
3112 }
3113
3114 /* Returns the token present at index INDEX in a given context. If
3115 INDEX is zero, the next token to be processed is returned. */
3116 static const cpp_token*
3117 _cpp_token_from_context_at (cpp_context *context, int index)
3118 {
3119 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3120 return &(FIRST (context).token[index]);
3121 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3122 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3123 return FIRST (context).ptoken[index];
3124 else
3125 abort ();
3126 }
3127
3128 /* Look ahead in the input stream. */
3129 const cpp_token *
3130 cpp_peek_token (cpp_reader *pfile, int index)
3131 {
3132 cpp_context *context = pfile->context;
3133 const cpp_token *peektok;
3134 int count;
3135
3136 /* First, scan through any pending cpp_context objects. */
3137 while (context->prev)
3138 {
3139 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3140
3141 if (index < (int) sz)
3142 return _cpp_token_from_context_at (context, index);
3143 index -= (int) sz;
3144 context = context->prev;
3145 }
3146
3147 /* We will have to read some new tokens after all (and do so
3148 without invalidating preceding tokens). */
3149 count = index;
3150 pfile->keep_tokens++;
3151
3152 /* For peeked tokens temporarily disable line_change reporting,
3153 until the tokens are parsed for real. */
3154 void (*line_change) (cpp_reader *, const cpp_token *, int)
3155 = pfile->cb.line_change;
3156 pfile->cb.line_change = NULL;
3157
3158 do
3159 {
3160 peektok = _cpp_lex_token (pfile);
3161 if (peektok->type == CPP_EOF)
3162 {
3163 index--;
3164 break;
3165 }
3166 else if (peektok->type == CPP_PRAGMA)
3167 {
3168 /* Don't peek past a pragma. */
3169 if (peektok == &pfile->directive_result)
3170 /* Save the pragma in the buffer. */
3171 *pfile->cur_token++ = *peektok;
3172 index--;
3173 break;
3174 }
3175 }
3176 while (index--);
3177
3178 _cpp_backup_tokens_direct (pfile, count - index);
3179 pfile->keep_tokens--;
3180 pfile->cb.line_change = line_change;
3181
3182 return peektok;
3183 }
3184
3185 /* Allocate a single token that is invalidated at the same time as the
3186 rest of the tokens on the line. Has its line and col set to the
3187 same as the last lexed token, so that diagnostics appear in the
3188 right place. */
3189 cpp_token *
3190 _cpp_temp_token (cpp_reader *pfile)
3191 {
3192 cpp_token *old, *result;
3193 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3194 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3195
3196 old = pfile->cur_token - 1;
3197 /* Any pre-existing lookaheads must not be clobbered. */
3198 if (la)
3199 {
3200 if (sz <= la)
3201 {
3202 tokenrun *next = next_tokenrun (pfile->cur_run);
3203
3204 if (sz < la)
3205 memmove (next->base + 1, next->base,
3206 (la - sz) * sizeof (cpp_token));
3207
3208 next->base[0] = pfile->cur_run->limit[-1];
3209 }
3210
3211 if (sz > 1)
3212 memmove (pfile->cur_token + 1, pfile->cur_token,
3213 MIN (la, sz - 1) * sizeof (cpp_token));
3214 }
3215
3216 if (!sz && pfile->cur_token == pfile->cur_run->limit)
3217 {
3218 pfile->cur_run = next_tokenrun (pfile->cur_run);
3219 pfile->cur_token = pfile->cur_run->base;
3220 }
3221
3222 result = pfile->cur_token++;
3223 result->src_loc = old->src_loc;
3224 return result;
3225 }
3226
3227 /* We're at the beginning of a logical line (so not in
3228 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3229 if we should enter deferred_pragma mode to tokenize the rest of the
3230 line as a module control-line. */
3231
3232 static void
3233 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3234 {
3235 unsigned backup = 0; /* Tokens we peeked. */
3236 cpp_hashnode *node = result->val.node.node;
3237 cpp_token *peek = result;
3238 cpp_token *keyword = peek;
3239 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3240 int header_count = 0;
3241
3242 /* Make sure the incoming state is as we expect it. This way we
3243 can restore it using constants. */
3244 gcc_checking_assert (!pfile->state.in_deferred_pragma
3245 && !pfile->state.skipping
3246 && !pfile->state.parsing_args
3247 && !pfile->state.angled_headers
3248 && (pfile->state.save_comments
3249 == !CPP_OPTION (pfile, discard_comments)));
3250
3251 /* Enter directives mode sufficiently for peeking. We don't have
3252 to actually set in_directive. */
3253 pfile->state.in_deferred_pragma = true;
3254
3255 /* These two fields are needed to process tokenization in deferred
3256 pragma mode. They are not used outside deferred pragma mode or
3257 directives mode. */
3258 pfile->state.pragma_allow_expansion = true;
3259 pfile->directive_line = result->src_loc;
3260
3261 /* Saving comments is incompatible with directives mode. */
3262 pfile->state.save_comments = 0;
3263
3264 if (node == n_modules[spec_nodes::M_EXPORT][0])
3265 {
3266 peek = _cpp_lex_direct (pfile);
3267 keyword = peek;
3268 backup++;
3269 if (keyword->type != CPP_NAME)
3270 goto not_module;
3271 node = keyword->val.node.node;
3272 if (!(node->flags & NODE_MODULE))
3273 goto not_module;
3274 }
3275
3276 if (node == n_modules[spec_nodes::M__IMPORT][0])
3277 /* __import */
3278 header_count = backup + 2 + 16;
3279 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3280 /* import */
3281 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3282 else if (node == n_modules[spec_nodes::M_MODULE][0])
3283 ; /* module */
3284 else
3285 goto not_module;
3286
3287 /* We've seen [export] {module|import|__import}. Check the next token. */
3288 if (header_count)
3289 /* After '{,__}import' a header name may appear. */
3290 pfile->state.angled_headers = true;
3291 peek = _cpp_lex_direct (pfile);
3292 backup++;
3293
3294 /* ... import followed by identifier, ':', '<' or
3295 header-name preprocessing tokens, or module
3296 followed by cpp-identifier, ':' or ';' preprocessing
3297 tokens. C++ keywords are not yet relevant. */
3298 if (peek->type == CPP_NAME
3299 || peek->type == CPP_COLON
3300 || (header_count
3301 ? (peek->type == CPP_LESS
3302 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3303 || peek->type == CPP_HEADER_NAME)
3304 : peek->type == CPP_SEMICOLON))
3305 {
3306 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3307 if (!pfile->state.pragma_allow_expansion)
3308 pfile->state.prevent_expansion++;
3309
3310 if (!header_count && linemap_included_from
3311 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3312 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3313 "module control-line cannot be in included file");
3314
3315 /* The first one or two tokens cannot be macro names. */
3316 for (int ix = backup; ix--;)
3317 {
3318 cpp_token *tok = ix ? keyword : result;
3319 cpp_hashnode *node = tok->val.node.node;
3320
3321 /* Don't attempt to expand the token. */
3322 tok->flags |= NO_EXPAND;
3323 if (_cpp_defined_macro_p (node)
3324 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3325 && !cpp_fun_like_macro_p (node))
3326 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3327 "module control-line \"%s\" cannot be"
3328 " an object-like macro",
3329 NODE_NAME (node));
3330 }
3331
3332 /* Map to underbar variants. */
3333 keyword->val.node.node = n_modules[header_count
3334 ? spec_nodes::M_IMPORT
3335 : spec_nodes::M_MODULE][1];
3336 if (backup != 1)
3337 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3338
3339 /* Maybe tell the tokenizer we expect a header-name down the
3340 road. */
3341 pfile->state.directive_file_token = header_count;
3342 }
3343 else
3344 {
3345 not_module:
3346 /* Drop out of directive mode. */
3347 /* We aaserted save_comments had this value upon entry. */
3348 pfile->state.save_comments
3349 = !CPP_OPTION (pfile, discard_comments);
3350 pfile->state.in_deferred_pragma = false;
3351 /* Do not let this remain on. */
3352 pfile->state.angled_headers = false;
3353 }
3354
3355 /* In either case we want to backup the peeked tokens. */
3356 if (backup)
3357 {
3358 /* If we saw EOL, we should drop it, because this isn't a module
3359 control-line after all. */
3360 bool eol = peek->type == CPP_PRAGMA_EOL;
3361 if (!eol || backup > 1)
3362 {
3363 /* Put put the peeked tokens back */
3364 _cpp_backup_tokens_direct (pfile, backup);
3365 /* But if the last one was an EOL, forget it. */
3366 if (eol)
3367 pfile->lookaheads--;
3368 }
3369 }
3370 }
3371
3372 /* Lex a token into RESULT (external interface). Takes care of issues
3373 like directive handling, token lookahead, multiple include
3374 optimization and skipping. */
3375 const cpp_token *
3376 _cpp_lex_token (cpp_reader *pfile)
3377 {
3378 cpp_token *result;
3379
3380 for (;;)
3381 {
3382 if (pfile->cur_token == pfile->cur_run->limit)
3383 {
3384 pfile->cur_run = next_tokenrun (pfile->cur_run);
3385 pfile->cur_token = pfile->cur_run->base;
3386 }
3387 /* We assume that the current token is somewhere in the current
3388 run. */
3389 if (pfile->cur_token < pfile->cur_run->base
3390 || pfile->cur_token >= pfile->cur_run->limit)
3391 abort ();
3392
3393 if (pfile->lookaheads)
3394 {
3395 pfile->lookaheads--;
3396 result = pfile->cur_token++;
3397 }
3398 else
3399 result = _cpp_lex_direct (pfile);
3400
3401 if (result->flags & BOL)
3402 {
3403 /* Is this a directive. If _cpp_handle_directive returns
3404 false, it is an assembler #. */
3405 if (result->type == CPP_HASH
3406 /* 6.10.3 p 11: Directives in a list of macro arguments
3407 gives undefined behavior. This implementation
3408 handles the directive as normal. */
3409 && pfile->state.parsing_args != 1)
3410 {
3411 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3412 {
3413 if (pfile->directive_result.type == CPP_PADDING)
3414 continue;
3415 result = &pfile->directive_result;
3416 }
3417 }
3418 else if (pfile->state.in_deferred_pragma)
3419 result = &pfile->directive_result;
3420 else if (result->type == CPP_NAME
3421 && (result->val.node.node->flags & NODE_MODULE)
3422 && !pfile->state.skipping
3423 /* Unlike regular directives, we do not deal with
3424 tokenizing module directives as macro arguments.
3425 That's not permitted. */
3426 && !pfile->state.parsing_args)
3427 {
3428 /* P1857. Before macro expansion, At start of logical
3429 line ... */
3430 /* We don't have to consider lookaheads at this point. */
3431 gcc_checking_assert (!pfile->lookaheads);
3432
3433 cpp_maybe_module_directive (pfile, result);
3434 }
3435
3436 if (pfile->cb.line_change && !pfile->state.skipping)
3437 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3438 }
3439
3440 /* We don't skip tokens in directives. */
3441 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3442 break;
3443
3444 /* Outside a directive, invalidate controlling macros. At file
3445 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3446 get here and MI optimization works. */
3447 pfile->mi_valid = false;
3448
3449 if (!pfile->state.skipping || result->type == CPP_EOF)
3450 break;
3451 }
3452
3453 return result;
3454 }
3455
3456 /* Returns true if a fresh line has been loaded. */
3457 bool
3458 _cpp_get_fresh_line (cpp_reader *pfile)
3459 {
3460 /* We can't get a new line until we leave the current directive. */
3461 if (pfile->state.in_directive)
3462 return false;
3463
3464 for (;;)
3465 {
3466 cpp_buffer *buffer = pfile->buffer;
3467
3468 if (!buffer->need_line)
3469 return true;
3470
3471 if (buffer->next_line < buffer->rlimit)
3472 {
3473 _cpp_clean_line (pfile);
3474 return true;
3475 }
3476
3477 /* First, get out of parsing arguments state. */
3478 if (pfile->state.parsing_args)
3479 return false;
3480
3481 /* End of buffer. Non-empty files should end in a newline. */
3482 if (buffer->buf != buffer->rlimit
3483 && buffer->next_line > buffer->rlimit
3484 && !buffer->from_stage3)
3485 {
3486 /* Clip to buffer size. */
3487 buffer->next_line = buffer->rlimit;
3488 }
3489
3490 if (buffer->prev && !buffer->return_at_eof)
3491 _cpp_pop_buffer (pfile);
3492 else
3493 {
3494 /* End of translation. Do not pop the buffer yet. Increment
3495 line number so that the EOF token is on a line of its own
3496 (_cpp_lex_direct doesn't increment in that case, because
3497 it's hard for it to distinguish this special case). */
3498 CPP_INCREMENT_LINE (pfile, 0);
3499 return false;
3500 }
3501 }
3502 }
3503
/* Helper for the lexer's switch: when the next input character is
   CHAR, consume it and give the token THEN_TYPE; otherwise give it
   ELSE_TYPE.  Expects variables named `buffer' and `result' to be in
   scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do                                            \
    {                                           \
      if (*buffer->cur == (CHAR))               \
        {                                       \
          buffer->cur++;                        \
          result->type = (THEN_TYPE);           \
        }                                       \
      else                                      \
        result->type = (ELSE_TYPE);             \
    }                                           \
  while (0)
3512
3513 /* Lex a token into pfile->cur_token, which is also incremented, to
3514 get diagnostics pointing to the correct location.
3515
3516 Does not handle issues such as token lookahead, multiple-include
3517 optimization, directives, skipping etc. This function is only
3518 suitable for use by _cpp_lex_token, and in special cases like
3519 lex_expansion_token which doesn't care for any of these issues.
3520
3521 When meeting a newline, returns CPP_EOF if parsing a directive,
3522 otherwise returns to the start of the token buffer if permissible.
3523 Returns the location of the lexed token. */
3524 cpp_token *
3525 _cpp_lex_direct (cpp_reader *pfile)
3526 {
3527 cppchar_t c;
3528 cpp_buffer *buffer;
3529 const unsigned char *comment_start;
3530 bool fallthrough_comment = false;
3531 cpp_token *result = pfile->cur_token++;
3532
3533 fresh_line:
3534 result->flags = 0;
3535 buffer = pfile->buffer;
3536 if (buffer->need_line)
3537 {
3538 if (pfile->state.in_deferred_pragma)
3539 {
3540 /* This can happen in cases like:
3541 #define loop(x) whatever
3542 #pragma omp loop
3543 where when trying to expand loop we need to peek
3544 next token after loop, but aren't still in_deferred_pragma
3545 mode but are in in_directive mode, so buffer->need_line
3546 is set, a CPP_EOF is peeked. */
3547 result->type = CPP_PRAGMA_EOL;
3548 pfile->state.in_deferred_pragma = false;
3549 if (!pfile->state.pragma_allow_expansion)
3550 pfile->state.prevent_expansion--;
3551 return result;
3552 }
3553 if (!_cpp_get_fresh_line (pfile))
3554 {
3555 result->type = CPP_EOF;
3556 /* Not a real EOF in a directive or arg parsing -- we refuse
3557 to advance to the next file now, and will once we're out
3558 of those modes. */
3559 if (!pfile->state.in_directive && !pfile->state.parsing_args)
3560 {
3561 /* Tell the compiler the line number of the EOF token. */
3562 result->src_loc = pfile->line_table->highest_line;
3563 result->flags = BOL;
3564 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3565 _cpp_pop_buffer (pfile);
3566 }
3567 return result;
3568 }
3569 if (buffer != pfile->buffer)
3570 fallthrough_comment = false;
3571 if (!pfile->keep_tokens)
3572 {
3573 pfile->cur_run = &pfile->base_run;
3574 result = pfile->base_run.base;
3575 pfile->cur_token = result + 1;
3576 }
3577 result->flags = BOL;
3578 if (pfile->state.parsing_args == 2)
3579 result->flags |= PREV_WHITE;
3580 }
3581 buffer = pfile->buffer;
3582 update_tokens_line:
3583 result->src_loc = pfile->line_table->highest_line;
3584
3585 skipped_white:
3586 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3587 && !pfile->overlaid_buffer)
3588 {
3589 _cpp_process_line_notes (pfile, false);
3590 result->src_loc = pfile->line_table->highest_line;
3591 }
3592 c = *buffer->cur++;
3593
3594 if (pfile->forced_token_location)
3595 result->src_loc = pfile->forced_token_location;
3596 else
3597 result->src_loc = linemap_position_for_column (pfile->line_table,
3598 CPP_BUF_COLUMN (buffer, buffer->cur));
3599
3600 switch (c)
3601 {
3602 case ' ': case '\t': case '\f': case '\v': case '\0':
3603 result->flags |= PREV_WHITE;
3604 skip_whitespace (pfile, c);
3605 goto skipped_white;
3606
3607 case '\n':
3608 /* Increment the line, unless this is the last line ... */
3609 if (buffer->cur < buffer->rlimit
3610 /* ... or this is a #include, (where _cpp_stack_file needs to
3611 unwind by one line) ... */
3612 || (pfile->state.in_directive > 1
3613 /* ... except traditional-cpp increments this elsewhere. */
3614 && !CPP_OPTION (pfile, traditional)))
3615 CPP_INCREMENT_LINE (pfile, 0);
3616 buffer->need_line = true;
3617 if (pfile->state.in_deferred_pragma)
3618 {
3619 /* Produce the PRAGMA_EOL on this line. File reading
3620 ensures there is always a \n at end of the buffer, thus
3621 in a deferred pragma we always see CPP_PRAGMA_EOL before
3622 any CPP_EOF. */
3623 result->type = CPP_PRAGMA_EOL;
3624 result->flags &= ~PREV_WHITE;
3625 pfile->state.in_deferred_pragma = false;
3626 if (!pfile->state.pragma_allow_expansion)
3627 pfile->state.prevent_expansion--;
3628 return result;
3629 }
3630 goto fresh_line;
3631
3632 case '0': case '1': case '2': case '3': case '4':
3633 case '5': case '6': case '7': case '8': case '9':
3634 {
3635 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3636 result->type = CPP_NUMBER;
3637 lex_number (pfile, &result->val.str, &nst);
3638 warn_about_normalization (pfile, result, &nst);
3639 break;
3640 }
3641
3642 case 'L':
3643 case 'u':
3644 case 'U':
3645 case 'R':
3646 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3647 wide strings or raw strings. */
3648 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3649 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3650 {
3651 if ((*buffer->cur == '\'' && c != 'R')
3652 || *buffer->cur == '"'
3653 || (*buffer->cur == 'R'
3654 && c != 'R'
3655 && buffer->cur[1] == '"'
3656 && CPP_OPTION (pfile, rliterals))
3657 || (*buffer->cur == '8'
3658 && c == 'u'
3659 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3660 && CPP_OPTION (pfile, utf8_char_literals)))
3661 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3662 && CPP_OPTION (pfile, rliterals)))))
3663 {
3664 lex_string (pfile, result, buffer->cur - 1);
3665 break;
3666 }
3667 }
3668 /* Fall through. */
3669
3670 case '_':
3671 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3672 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3673 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3674 case 's': case 't': case 'v': case 'w': case 'x':
3675 case 'y': case 'z':
3676 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3677 case 'G': case 'H': case 'I': case 'J': case 'K':
3678 case 'M': case 'N': case 'O': case 'P': case 'Q':
3679 case 'S': case 'T': case 'V': case 'W': case 'X':
3680 case 'Y': case 'Z':
3681 result->type = CPP_NAME;
3682 {
3683 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3684 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3685 &nst,
3686 &result->val.node.spelling);
3687 warn_about_normalization (pfile, result, &nst);
3688 }
3689
3690 /* Convert named operators to their proper types. */
3691 if (result->val.node.node->flags & NODE_OPERATOR)
3692 {
3693 result->flags |= NAMED_OP;
3694 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3695 }
3696
3697 /* Signal FALLTHROUGH comment followed by another token. */
3698 if (fallthrough_comment)
3699 result->flags |= PREV_FALLTHROUGH;
3700 break;
3701
3702 case '\'':
3703 case '"':
3704 lex_string (pfile, result, buffer->cur - 1);
3705 break;
3706
3707 case '/':
3708 /* A potential block or line comment. */
3709 comment_start = buffer->cur;
3710 c = *buffer->cur;
3711
3712 if (c == '*')
3713 {
3714 if (_cpp_skip_block_comment (pfile))
3715 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3716 }
3717 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3718 {
3719 /* Don't warn for system headers. */
3720 if (_cpp_in_system_header (pfile))
3721 ;
3722 /* Warn about comments if pedantically GNUC89, and not
3723 in system headers. */
3724 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3725 && CPP_PEDANTIC (pfile)
3726 && ! buffer->warned_cplusplus_comments)
3727 {
3728 if (cpp_error (pfile, CPP_DL_PEDWARN,
3729 "C++ style comments are not allowed in ISO C90"))
3730 cpp_error (pfile, CPP_DL_NOTE,
3731 "(this will be reported only once per input file)");
3732 buffer->warned_cplusplus_comments = 1;
3733 }
3734 /* Or if specifically desired via -Wc90-c99-compat. */
3735 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3736 && ! CPP_OPTION (pfile, cplusplus)
3737 && ! buffer->warned_cplusplus_comments)
3738 {
3739 if (cpp_error (pfile, CPP_DL_WARNING,
3740 "C++ style comments are incompatible with C90"))
3741 cpp_error (pfile, CPP_DL_NOTE,
3742 "(this will be reported only once per input file)");
3743 buffer->warned_cplusplus_comments = 1;
3744 }
3745 /* In C89/C94, C++ style comments are forbidden. */
3746 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3747 || CPP_OPTION (pfile, lang) == CLK_STDC94))
3748 {
3749 /* But don't be confused about valid code such as
3750 - // immediately followed by *,
3751 - // in a preprocessing directive,
3752 - // in an #if 0 block. */
3753 if (buffer->cur[1] == '*'
3754 || pfile->state.in_directive
3755 || pfile->state.skipping)
3756 {
3757 result->type = CPP_DIV;
3758 break;
3759 }
3760 else if (! buffer->warned_cplusplus_comments)
3761 {
3762 if (cpp_error (pfile, CPP_DL_ERROR,
3763 "C++ style comments are not allowed in "
3764 "ISO C90"))
3765 cpp_error (pfile, CPP_DL_NOTE,
3766 "(this will be reported only once per input "
3767 "file)");
3768 buffer->warned_cplusplus_comments = 1;
3769 }
3770 }
3771 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3772 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3773 }
3774 else if (c == '=')
3775 {
3776 buffer->cur++;
3777 result->type = CPP_DIV_EQ;
3778 break;
3779 }
3780 else
3781 {
3782 result->type = CPP_DIV;
3783 break;
3784 }
3785
3786 if (fallthrough_comment_p (pfile, comment_start))
3787 fallthrough_comment = true;
3788
3789 if (pfile->cb.comment)
3790 {
3791 size_t len = pfile->buffer->cur - comment_start;
3792 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3793 len + 1);
3794 }
3795
3796 if (!pfile->state.save_comments)
3797 {
3798 result->flags |= PREV_WHITE;
3799 goto update_tokens_line;
3800 }
3801
3802 if (fallthrough_comment)
3803 result->flags |= PREV_FALLTHROUGH;
3804
3805 /* Save the comment as a token in its own right. */
3806 save_comment (pfile, result, comment_start, c);
3807 break;
3808
3809 case '<':
3810 if (pfile->state.angled_headers)
3811 {
3812 lex_string (pfile, result, buffer->cur - 1);
3813 if (result->type != CPP_LESS)
3814 break;
3815 }
3816
3817 result->type = CPP_LESS;
3818 if (*buffer->cur == '=')
3819 {
3820 buffer->cur++, result->type = CPP_LESS_EQ;
3821 if (*buffer->cur == '>'
3822 && CPP_OPTION (pfile, cplusplus)
3823 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3824 buffer->cur++, result->type = CPP_SPACESHIP;
3825 }
3826 else if (*buffer->cur == '<')
3827 {
3828 buffer->cur++;
3829 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3830 }
3831 else if (CPP_OPTION (pfile, digraphs))
3832 {
3833 if (*buffer->cur == ':')
3834 {
3835 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3836 three characters are <:: and the subsequent character
3837 is neither : nor >, the < is treated as a preprocessor
3838 token by itself". */
3839 if (CPP_OPTION (pfile, cplusplus)
3840 && CPP_OPTION (pfile, lang) != CLK_CXX98
3841 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3842 && buffer->cur[1] == ':'
3843 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3844 break;
3845
3846 buffer->cur++;
3847 result->flags |= DIGRAPH;
3848 result->type = CPP_OPEN_SQUARE;
3849 }
3850 else if (*buffer->cur == '%')
3851 {
3852 buffer->cur++;
3853 result->flags |= DIGRAPH;
3854 result->type = CPP_OPEN_BRACE;
3855 }
3856 }
3857 break;
3858
3859 case '>':
3860 result->type = CPP_GREATER;
3861 if (*buffer->cur == '=')
3862 buffer->cur++, result->type = CPP_GREATER_EQ;
3863 else if (*buffer->cur == '>')
3864 {
3865 buffer->cur++;
3866 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3867 }
3868 break;
3869
3870 case '%':
3871 result->type = CPP_MOD;
3872 if (*buffer->cur == '=')
3873 buffer->cur++, result->type = CPP_MOD_EQ;
3874 else if (CPP_OPTION (pfile, digraphs))
3875 {
3876 if (*buffer->cur == ':')
3877 {
3878 buffer->cur++;
3879 result->flags |= DIGRAPH;
3880 result->type = CPP_HASH;
3881 if (*buffer->cur == '%' && buffer->cur[1] == ':')
3882 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3883 }
3884 else if (*buffer->cur == '>')
3885 {
3886 buffer->cur++;
3887 result->flags |= DIGRAPH;
3888 result->type = CPP_CLOSE_BRACE;
3889 }
3890 }
3891 break;
3892
3893 case '.':
3894 result->type = CPP_DOT;
3895 if (ISDIGIT (*buffer->cur))
3896 {
3897 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3898 result->type = CPP_NUMBER;
3899 lex_number (pfile, &result->val.str, &nst);
3900 warn_about_normalization (pfile, result, &nst);
3901 }
3902 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3903 buffer->cur += 2, result->type = CPP_ELLIPSIS;
3904 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3905 buffer->cur++, result->type = CPP_DOT_STAR;
3906 break;
3907
3908 case '+':
3909 result->type = CPP_PLUS;
3910 if (*buffer->cur == '+')
3911 buffer->cur++, result->type = CPP_PLUS_PLUS;
3912 else if (*buffer->cur == '=')
3913 buffer->cur++, result->type = CPP_PLUS_EQ;
3914 break;
3915
3916 case '-':
3917 result->type = CPP_MINUS;
3918 if (*buffer->cur == '>')
3919 {
3920 buffer->cur++;
3921 result->type = CPP_DEREF;
3922 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3923 buffer->cur++, result->type = CPP_DEREF_STAR;
3924 }
3925 else if (*buffer->cur == '-')
3926 buffer->cur++, result->type = CPP_MINUS_MINUS;
3927 else if (*buffer->cur == '=')
3928 buffer->cur++, result->type = CPP_MINUS_EQ;
3929 break;
3930
3931 case '&':
3932 result->type = CPP_AND;
3933 if (*buffer->cur == '&')
3934 buffer->cur++, result->type = CPP_AND_AND;
3935 else if (*buffer->cur == '=')
3936 buffer->cur++, result->type = CPP_AND_EQ;
3937 break;
3938
3939 case '|':
3940 result->type = CPP_OR;
3941 if (*buffer->cur == '|')
3942 buffer->cur++, result->type = CPP_OR_OR;
3943 else if (*buffer->cur == '=')
3944 buffer->cur++, result->type = CPP_OR_EQ;
3945 break;
3946
3947 case ':':
3948 result->type = CPP_COLON;
3949 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3950 buffer->cur++, result->type = CPP_SCOPE;
3951 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3952 {
3953 buffer->cur++;
3954 result->flags |= DIGRAPH;
3955 result->type = CPP_CLOSE_SQUARE;
3956 }
3957 break;
3958
3959 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3960 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3961 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3962 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3963 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3964
3965 case '?': result->type = CPP_QUERY; break;
3966 case '~': result->type = CPP_COMPL; break;
3967 case ',': result->type = CPP_COMMA; break;
3968 case '(': result->type = CPP_OPEN_PAREN; break;
3969 case ')': result->type = CPP_CLOSE_PAREN; break;
3970 case '[': result->type = CPP_OPEN_SQUARE; break;
3971 case ']': result->type = CPP_CLOSE_SQUARE; break;
3972 case '{': result->type = CPP_OPEN_BRACE; break;
3973 case '}': result->type = CPP_CLOSE_BRACE; break;
3974 case ';': result->type = CPP_SEMICOLON; break;
3975
3976 /* @ is a punctuator in Objective-C. */
3977 case '@': result->type = CPP_ATSIGN; break;
3978
3979 default:
3980 {
3981 const uchar *base = --buffer->cur;
3982
3983 /* Check for an extended identifier ($ or UCN or UTF-8). */
3984 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3985 if (forms_identifier_p (pfile, true, &nst))
3986 {
3987 result->type = CPP_NAME;
3988 result->val.node.node = lex_identifier (pfile, base, true, &nst,
3989 &result->val.node.spelling);
3990 warn_about_normalization (pfile, result, &nst);
3991 break;
3992 }
3993
3994 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
3995 single token. */
3996 buffer->cur++;
3997 if (c >= utf8_signifier)
3998 {
3999 const uchar *pstr = base;
4000 cppchar_t s;
4001 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4002 buffer->cur = pstr;
4003 }
4004 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4005 break;
4006 }
4007
4008 }
4009
4010 /* Potentially convert the location of the token to a range. */
4011 if (result->src_loc >= RESERVED_LOCATION_COUNT
4012 && result->type != CPP_EOF)
4013 {
4014 /* Ensure that any line notes are processed, so that we have the
4015 correct physical line/column for the end-point of the token even
4016 when a logical line is split via one or more backslashes. */
4017 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4018 && !pfile->overlaid_buffer)
4019 _cpp_process_line_notes (pfile, false);
4020
4021 source_range tok_range;
4022 tok_range.m_start = result->src_loc;
4023 tok_range.m_finish
4024 = linemap_position_for_column (pfile->line_table,
4025 CPP_BUF_COLUMN (buffer, buffer->cur));
4026
4027 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4028 result->src_loc,
4029 tok_range, NULL);
4030 }
4031
4032 return result;
4033 }
4034
4035 /* An upper bound on the number of bytes needed to spell TOKEN.
4036 Does not include preceding whitespace. */
4037 unsigned int
4038 cpp_token_len (const cpp_token *token)
4039 {
4040 unsigned int len;
4041
4042 switch (TOKEN_SPELL (token))
4043 {
4044 default: len = 6; break;
4045 case SPELL_LITERAL: len = token->val.str.len; break;
4046 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4047 }
4048
4049 return len;
4050 }
4051
/* Decode the UTF-8 sequence starting at NAME and store it in BUFFER as
   a \U escape with eight lower-case hexadecimal digits, i.e. exactly
   10 bytes with no terminating NUL.  Returns the number of bytes of
   NAME that made up the sequence.  Calls abort if a continuation byte
   is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  unsigned lead;
  unsigned long code;
  int k;

  /* The number of leading one bits in the first byte is the length of
     the whole sequence.  */
  for (lead = *name; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* The rest of the first byte holds the most significant bits.  */
  code = *name & (0x7F >> seq_len);

  for (k = 1; k < seq_len; k++)
    {
      const unsigned char cont = *++name;

      code = (code << 6) | (cont & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((cont & ~0x3F) != 0x80)
        abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  for (k = 28; k >= 0; k -= 4)
    *buffer++ = hex_digits[(code >> k) & 0xF];

  return seq_len;
}
4085
4086 /* Given a token TYPE corresponding to a digraph, return a pointer to
4087 the spelling of the digraph. */
4088 static const unsigned char *
4089 cpp_digraph2name (enum cpp_ttype type)
4090 {
4091 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4092 }
4093
4094 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4095 The buffer must already contain the enough space to hold the
4096 token's spelling. Returns a pointer to the character after the
4097 last character written. */
4098 unsigned char *
4099 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4100 {
4101 size_t i;
4102 const unsigned char *name = NODE_NAME (ident);
4103
4104 for (i = 0; i < NODE_LEN (ident); i++)
4105 if (name[i] & ~0x7F)
4106 {
4107 i += utf8_to_ucn (buffer, name + i) - 1;
4108 buffer += 10;
4109 }
4110 else
4111 *buffer++ = name[i];
4112
4113 return buffer;
4114 }
4115
4116 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
4117 already contain the enough space to hold the token's spelling.
4118 Returns a pointer to the character after the last character written.
4119 FORSTRING is true if this is to be the spelling after translation
4120 phase 1 (with the original spelling of extended identifiers), false
4121 if extended identifiers should always be written using UCNs (there is
4122 no option for always writing them in the internal UTF-8 form).
4123 FIXME: Would be nice if we didn't need the PFILE argument. */
4124 unsigned char *
4125 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4126 unsigned char *buffer, bool forstring)
4127 {
4128 switch (TOKEN_SPELL (token))
4129 {
4130 case SPELL_OPERATOR:
4131 {
4132 const unsigned char *spelling;
4133 unsigned char c;
4134
4135 if (token->flags & DIGRAPH)
4136 spelling = cpp_digraph2name (token->type);
4137 else if (token->flags & NAMED_OP)
4138 goto spell_ident;
4139 else
4140 spelling = TOKEN_NAME (token);
4141
4142 while ((c = *spelling++) != '\0')
4143 *buffer++ = c;
4144 }
4145 break;
4146
4147 spell_ident:
4148 case SPELL_IDENT:
4149 if (forstring)
4150 {
4151 memcpy (buffer, NODE_NAME (token->val.node.spelling),
4152 NODE_LEN (token->val.node.spelling));
4153 buffer += NODE_LEN (token->val.node.spelling);
4154 }
4155 else
4156 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4157 break;
4158
4159 case SPELL_LITERAL:
4160 memcpy (buffer, token->val.str.text, token->val.str.len);
4161 buffer += token->val.str.len;
4162 break;
4163
4164 case SPELL_NONE:
4165 cpp_error (pfile, CPP_DL_ICE,
4166 "unspellable token %s", TOKEN_NAME (token));
4167 break;
4168 }
4169
4170 return buffer;
4171 }
4172
4173 /* Returns TOKEN spelt as a null-terminated string. The string is
4174 freed when the reader is destroyed. Useful for diagnostics. */
4175 unsigned char *
4176 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4177 {
4178 unsigned int len = cpp_token_len (token) + 1;
4179 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4180
4181 end = cpp_spell_token (pfile, token, start, false);
4182 end[0] = '\0';
4183
4184 return start;
4185 }
4186
4187 /* Returns a pointer to a string which spells the token defined by
4188 TYPE and FLAGS. Used by C front ends, which really should move to
4189 using cpp_token_as_text. */
4190 const char *
4191 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4192 {
4193 if (flags & DIGRAPH)
4194 return (const char *) cpp_digraph2name (type);
4195 else if (flags & NAMED_OP)
4196 return cpp_named_operator2name (type);
4197
4198 return (const char *) token_spellings[type].name;
4199 }
4200
4201 /* Writes the spelling of token to FP, without any preceding space.
4202 Separated from cpp_spell_token for efficiency - to avoid stdio
4203 double-buffering. */
4204 void
4205 cpp_output_token (const cpp_token *token, FILE *fp)
4206 {
4207 switch (TOKEN_SPELL (token))
4208 {
4209 case SPELL_OPERATOR:
4210 {
4211 const unsigned char *spelling;
4212 int c;
4213
4214 if (token->flags & DIGRAPH)
4215 spelling = cpp_digraph2name (token->type);
4216 else if (token->flags & NAMED_OP)
4217 goto spell_ident;
4218 else
4219 spelling = TOKEN_NAME (token);
4220
4221 c = *spelling;
4222 do
4223 putc (c, fp);
4224 while ((c = *++spelling) != '\0');
4225 }
4226 break;
4227
4228 spell_ident:
4229 case SPELL_IDENT:
4230 {
4231 size_t i;
4232 const unsigned char * name = NODE_NAME (token->val.node.node);
4233
4234 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4235 if (name[i] & ~0x7F)
4236 {
4237 unsigned char buffer[10];
4238 i += utf8_to_ucn (buffer, name + i) - 1;
4239 fwrite (buffer, 1, 10, fp);
4240 }
4241 else
4242 fputc (NODE_NAME (token->val.node.node)[i], fp);
4243 }
4244 break;
4245
4246 case SPELL_LITERAL:
4247 if (token->type == CPP_HEADER_NAME)
4248 fputc ('"', fp);
4249 fwrite (token->val.str.text, 1, token->val.str.len, fp);
4250 if (token->type == CPP_HEADER_NAME)
4251 fputc ('"', fp);
4252 break;
4253
4254 case SPELL_NONE:
4255 /* An error, most probably. */
4256 break;
4257 }
4258 }
4259
4260 /* Compare two tokens. */
4261 int
4262 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4263 {
4264 if (a->type == b->type && a->flags == b->flags)
4265 switch (TOKEN_SPELL (a))
4266 {
4267 default: /* Keep compiler happy. */
4268 case SPELL_OPERATOR:
4269 /* token_no is used to track where multiple consecutive ##
4270 tokens were originally located. */
4271 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4272 case SPELL_NONE:
4273 return (a->type != CPP_MACRO_ARG
4274 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4275 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4276 case SPELL_IDENT:
4277 return (a->val.node.node == b->val.node.node
4278 && a->val.node.spelling == b->val.node.spelling);
4279 case SPELL_LITERAL:
4280 return (a->val.str.len == b->val.str.len
4281 && !memcmp (a->val.str.text, b->val.str.text,
4282 a->val.str.len));
4283 }
4284
4285 return 0;
4286 }
4287
4288 /* Returns nonzero if a space should be inserted to avoid an
4289 accidental token paste for output. For simplicity, it is
4290 conservative, and occasionally advises a space where one is not
4291 needed, e.g. "." and ".2". */
4292 int
4293 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4294 const cpp_token *token2)
4295 {
4296 enum cpp_ttype a = token1->type, b = token2->type;
4297 cppchar_t c;
4298
4299 if (token1->flags & NAMED_OP)
4300 a = CPP_NAME;
4301 if (token2->flags & NAMED_OP)
4302 b = CPP_NAME;
4303
4304 c = EOF;
4305 if (token2->flags & DIGRAPH)
4306 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4307 else if (token_spellings[b].category == SPELL_OPERATOR)
4308 c = token_spellings[b].name[0];
4309
4310 /* Quickly get everything that can paste with an '='. */
4311 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4312 return 1;
4313
4314 switch (a)
4315 {
4316 case CPP_GREATER: return c == '>';
4317 case CPP_LESS: return c == '<' || c == '%' || c == ':';
4318 case CPP_PLUS: return c == '+';
4319 case CPP_MINUS: return c == '-' || c == '>';
4320 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4321 case CPP_MOD: return c == ':' || c == '>';
4322 case CPP_AND: return c == '&';
4323 case CPP_OR: return c == '|';
4324 case CPP_COLON: return c == ':' || c == '>';
4325 case CPP_DEREF: return c == '*';
4326 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4327 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4328 case CPP_PRAGMA:
4329 case CPP_NAME: return ((b == CPP_NUMBER
4330 && name_p (pfile, &token2->val.str))
4331 || b == CPP_NAME
4332 || b == CPP_CHAR || b == CPP_STRING); /* L */
4333 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4334 || b == CPP_CHAR
4335 || c == '.' || c == '+' || c == '-');
4336 /* UCNs */
4337 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4338 && b == CPP_NAME)
4339 || (CPP_OPTION (pfile, objc)
4340 && token1->val.str.text[0] == '@'
4341 && (b == CPP_NAME || b == CPP_STRING)));
4342 case CPP_LESS_EQ: return c == '>';
4343 case CPP_STRING:
4344 case CPP_WSTRING:
4345 case CPP_UTF8STRING:
4346 case CPP_STRING16:
4347 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4348 && (b == CPP_NAME
4349 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4350 && ISIDST (token2->val.str.text[0]))));
4351
4352 default: break;
4353 }
4354
4355 return 0;
4356 }
4357
4358 /* Output all the remaining tokens on the current line, and a newline
4359 character, to FP. Leading whitespace is removed. If there are
4360 macros, special token padding is not performed. */
4361 void
4362 cpp_output_line (cpp_reader *pfile, FILE *fp)
4363 {
4364 const cpp_token *token;
4365
4366 token = cpp_get_token (pfile);
4367 while (token->type != CPP_EOF)
4368 {
4369 cpp_output_token (token, fp);
4370 token = cpp_get_token (pfile);
4371 if (token->flags & PREV_WHITE)
4372 putc (' ', fp);
4373 }
4374
4375 putc ('\n', fp);
4376 }
4377
4378 /* Return a string representation of all the remaining tokens on the
4379 current line. The result is allocated using xmalloc and must be
4380 freed by the caller. */
4381 unsigned char *
4382 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4383 {
4384 const cpp_token *token;
4385 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4386 unsigned int alloced = 120 + out;
4387 unsigned char *result = (unsigned char *) xmalloc (alloced);
4388
4389 /* If DIR_NAME is empty, there are no initial contents. */
4390 if (dir_name)
4391 {
4392 sprintf ((char *) result, "#%s ", dir_name);
4393 out += 2;
4394 }
4395
4396 token = cpp_get_token (pfile);
4397 while (token->type != CPP_EOF)
4398 {
4399 unsigned char *last;
4400 /* Include room for a possible space and the terminating nul. */
4401 unsigned int len = cpp_token_len (token) + 2;
4402
4403 if (out + len > alloced)
4404 {
4405 alloced *= 2;
4406 if (out + len > alloced)
4407 alloced = out + len;
4408 result = (unsigned char *) xrealloc (result, alloced);
4409 }
4410
4411 last = cpp_spell_token (pfile, token, &result[out], 0);
4412 out = last - result;
4413
4414 token = cpp_get_token (pfile);
4415 if (token->flags & PREV_WHITE)
4416 result[out++] = ' ';
4417 }
4418
4419 result[out] = '\0';
4420 return result;
4421 }
4422
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the space left between its
   CUR and LIMIT pointers.  Every macro argument is parenthesized so
   that an expression argument is evaluated as a unit.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
  ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4437
4438 /* Create a new allocation buffer. Place the control block at the end
4439 of the buffer, so that buffer overflows will cause immediate chaos. */
4440 static _cpp_buff *
4441 new_buff (size_t len)
4442 {
4443 _cpp_buff *result;
4444 unsigned char *base;
4445
4446 if (len < MIN_BUFF_SIZE)
4447 len = MIN_BUFF_SIZE;
4448 len = CPP_ALIGN (len);
4449
4450 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4451 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4452 struct first. */
4453 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4454 base = XNEWVEC (unsigned char, len + slen);
4455 result = (_cpp_buff *) base;
4456 base += slen;
4457 #else
4458 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4459 result = (_cpp_buff *) (base + len);
4460 #endif
4461 result->base = base;
4462 result->cur = base;
4463 result->limit = base + len;
4464 result->next = NULL;
4465 return result;
4466 }
4467
4468 /* Place a chain of unwanted allocation buffers on the free list. */
4469 void
4470 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4471 {
4472 _cpp_buff *end = buff;
4473
4474 while (end->next)
4475 end = end->next;
4476 end->next = pfile->free_buffs;
4477 pfile->free_buffs = buff;
4478 }
4479
4480 /* Return a free buffer of size at least MIN_SIZE. */
4481 _cpp_buff *
4482 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4483 {
4484 _cpp_buff *result, **p;
4485
4486 for (p = &pfile->free_buffs;; p = &(*p)->next)
4487 {
4488 size_t size;
4489
4490 if (*p == NULL)
4491 return new_buff (min_size);
4492 result = *p;
4493 size = result->limit - result->base;
4494 /* Return a buffer that's big enough, but don't waste one that's
4495 way too big. */
4496 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4497 break;
4498 }
4499
4500 *p = result->next;
4501 result->next = NULL;
4502 result->cur = result->base;
4503 return result;
4504 }
4505
4506 /* Creates a new buffer with enough space to hold the uncommitted
4507 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4508 the excess bytes to the new buffer. Chains the new buffer after
4509 BUFF, and returns the new buffer. */
4510 _cpp_buff *
4511 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4512 {
4513 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4514 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4515
4516 buff->next = new_buff;
4517 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4518 return new_buff;
4519 }
4520
4521 /* Creates a new buffer with enough space to hold the uncommitted
4522 remaining bytes of the buffer pointed to by BUFF, and at least
4523 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4524 Chains the new buffer before the buffer pointed to by BUFF, and
4525 updates the pointer to point to the new buffer. */
4526 void
4527 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4528 {
4529 _cpp_buff *new_buff, *old_buff = *pbuff;
4530 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4531
4532 new_buff = _cpp_get_buff (pfile, size);
4533 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4534 new_buff->next = old_buff;
4535 *pbuff = new_buff;
4536 }
4537
4538 /* Free a chain of buffers starting at BUFF. */
4539 void
4540 _cpp_free_buff (_cpp_buff *buff)
4541 {
4542 _cpp_buff *next;
4543
4544 for (; buff; buff = next)
4545 {
4546 next = buff->next;
4547 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4548 free (buff);
4549 #else
4550 free (buff->base);
4551 #endif
4552 }
4553 }
4554
4555 /* Allocate permanent, unaligned storage of length LEN. */
4556 unsigned char *
4557 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4558 {
4559 _cpp_buff *buff = pfile->u_buff;
4560 unsigned char *result = buff->cur;
4561
4562 if (len > (size_t) (buff->limit - result))
4563 {
4564 buff = _cpp_get_buff (pfile, len);
4565 buff->next = pfile->u_buff;
4566 pfile->u_buff = buff;
4567 result = buff->cur;
4568 }
4569
4570 buff->cur = result + len;
4571 return result;
4572 }
4573
4574 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4575 That buffer is used for growing allocations when saving macro
4576 replacement lists in a #define, and when parsing an answer to an
4577 assertion in #assert, #unassert or #if (and therefore possibly
4578 whilst expanding macros). It therefore must not be used by any
4579 code that they might call: specifically the lexer and the guts of
4580 the macro expander.
4581
4582 All existing other uses clearly fit this restriction: storing
4583 registered pragmas during initialization. */
4584 unsigned char *
4585 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4586 {
4587 _cpp_buff *buff = pfile->a_buff;
4588 unsigned char *result = buff->cur;
4589
4590 if (len > (size_t) (buff->limit - result))
4591 {
4592 buff = _cpp_get_buff (pfile, len);
4593 buff->next = pfile->a_buff;
4594 pfile->a_buff = buff;
4595 result = buff->cur;
4596 }
4597
4598 buff->cur = result + len;
4599 return result;
4600 }
4601
4602 /* Commit or allocate storage from a buffer. */
4603
4604 void *
4605 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4606 {
4607 void *ptr = BUFF_FRONT (pfile->a_buff);
4608
4609 if (pfile->hash_table->alloc_subobject)
4610 {
4611 void *copy = pfile->hash_table->alloc_subobject (size);
4612 memcpy (copy, ptr, size);
4613 ptr = copy;
4614 }
4615 else
4616 BUFF_FRONT (pfile->a_buff) += size;
4617
4618 return ptr;
4619 }
4620
4621 /* Say which field of TOK is in use. */
4622
4623 enum cpp_token_fld_kind
4624 cpp_token_val_index (const cpp_token *tok)
4625 {
4626 switch (TOKEN_SPELL (tok))
4627 {
4628 case SPELL_IDENT:
4629 return CPP_TOKEN_FLD_NODE;
4630 case SPELL_LITERAL:
4631 return CPP_TOKEN_FLD_STR;
4632 case SPELL_OPERATOR:
4633 /* Operands which were originally spelled as ident keep around
4634 the node for the exact spelling. */
4635 if (tok->flags & NAMED_OP)
4636 return CPP_TOKEN_FLD_NODE;
4637 else if (tok->type == CPP_PASTE)
4638 return CPP_TOKEN_FLD_TOKEN_NO;
4639 else
4640 return CPP_TOKEN_FLD_NONE;
4641 case SPELL_NONE:
4642 if (tok->type == CPP_MACRO_ARG)
4643 return CPP_TOKEN_FLD_ARG_NO;
4644 else if (tok->type == CPP_PADDING)
4645 return CPP_TOKEN_FLD_SOURCE;
4646 else if (tok->type == CPP_PRAGMA)
4647 return CPP_TOKEN_FLD_PRAGMA;
4648 /* fall through */
4649 default:
4650 return CPP_TOKEN_FLD_NONE;
4651 }
4652 }
4653
4654 /* All tokens lexed in R after calling this function will be forced to
4655 have their location_t to be P, until
4656 cpp_stop_forcing_token_locations is called for R. */
4657
4658 void
4659 cpp_force_token_locations (cpp_reader *r, location_t loc)
4660 {
4661 r->forced_token_location = loc;
4662 }
4663
4664 /* Go back to assigning locations naturally for lexed tokens. */
4665
4666 void
4667 cpp_stop_forcing_token_locations (cpp_reader *r)
4668 {
4669 r->forced_token_location = 0;
4670 }
4671
/* PEEK points at a backslash.  If that backslash escapes an end of
   line (LF, CR or CR LF), return the position just past the line
   ending, repeating for as long as another escaped end of line
   follows immediately.  The position is never advanced to LIMIT or
   beyond; in that case, or when the backslash does not escape an end
   of line, the last position reached is returned.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* CR, optionally followed by LF.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and chain escaped newlines.  */
      if (*peek != '\\')
        return peek;
    }
}
4701
4702 static const unsigned char *
4703 do_peek_next (const unsigned char *peek, const unsigned char *limit)
4704 {
4705 if (__builtin_expect (*peek == '\\', false))
4706 peek = do_peek_backslash (peek, limit);
4707 return peek;
4708 }
4709
/* Step back one character from PEEK and return its position, or NULL
   if PEEK is already at BOUND.  Nothing before BOUND is examined.

   If the character stepped onto is a line ending (LF, CR, or the LF
   of a CR LF pair) that is immediately preceded by a backslash, the
   escaped end of line is skipped and the search continues backwards
   from the backslash.  A line ending that is not escaped, or that sits
   at BOUND, is returned as is.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* The carriage return must be tested as '\r'; testing the letter
     'r' would treat an ordinary r after a backslash as a line
     continuation and miss real CR line endings.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset from PEEK of the character before the line ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* CR LF pair: look in front of the CR as well.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }

  return peek;
}
4738
4739 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4740 space. Otherwise return NULL. */
4741
4742 static const unsigned char *
4743 do_peek_ident (const char *match, const unsigned char *peek,
4744 const unsigned char *limit)
4745 {
4746 for (; *++match; peek++)
4747 if (*peek != *match)
4748 {
4749 peek = do_peek_next (peek, limit);
4750 if (*peek != *match)
4751 return NULL;
4752 }
4753
4754 /* Must now not be looking at an identifier char. */
4755 peek = do_peek_next (peek, limit);
4756 if (ISIDNUM (*peek))
4757 return NULL;
4758
4759 /* Skip control-line whitespace. */
4760 ws:
4761 while (*peek == ' ' || *peek == '\t')
4762 peek++;
4763 if (__builtin_expect (*peek == '\\', false))
4764 {
4765 peek = do_peek_backslash (peek, limit);
4766 if (*peek != '\\')
4767 goto ws;
4768 }
4769
4770 return peek;
4771 }
4772
4773 /* Are we looking at a module control line starting as PEEK - 1? */
/* C is the character at PEEK[-1]; PEEK points just after it, and LIMIT
   bounds the escaped-newline look-ahead done by the do_peek_* helpers.
   PFILE is consulted only for the rliterals and digraphs options.

   Returns true when the text spells "import", "__import" or "module",
   optionally preceded by "export" (not for "__import"), and the first
   character after the keyword and its trailing whitespace can begin
   one of the tokens accepted by the checks in the second half of the
   function.  Returns false otherwise.  */
4774
4775 static bool
4776 do_peek_module (cpp_reader *pfile, unsigned char c,
4777 const unsigned char *peek, const unsigned char *limit)
4778 {
/* Set once the keyword is known to be import/__import rather than
   module; the two accept different following tokens.  */
4779 bool import = false;
4780
/* Each branch first checks the second character cheaply (or a
   backslash, which may start an escaped newline) before paying for the
   full do_peek_ident comparison.  */
4781 if (__builtin_expect (c == 'e', false))
4782 {
4783 if (!((peek[0] == 'x' || peek[0] == '\\')
4784 && (peek = do_peek_ident ("export", peek, limit))))
4785 return false;
4786
4787 /* export, peek for import or module. No need to peek __import
4788 here. */
4789 if (peek[0] == 'i')
4790 {
4791 if (!((peek[1] == 'm' || peek[1] == '\\')
4792 && (peek = do_peek_ident ("import", peek + 1, limit))))
4793 return false;
4794 import = true;
4795 }
4796 else if (peek[0] == 'm')
4797 {
4798 if (!((peek[1] == 'o' || peek[1] == '\\')
4799 && (peek = do_peek_ident ("module", peek + 1, limit))))
4800 return false;
4801 }
4802 else
4803 return false;
4804 }
4805 else if (__builtin_expect (c == 'i', false))
4806 {
4807 if (!((peek[0] == 'm' || peek[0] == '\\')
4808 && (peek = do_peek_ident ("import", peek, limit))))
4809 return false;
4810 import = true;
4811 }
4812 else if (__builtin_expect (c == '_', false))
4813 {
4814 /* Needed for translated includes. */
4815 if (!((peek[0] == '_' || peek[0] == '\\')
4816 && (peek = do_peek_ident ("__import", peek, limit))))
4817 return false;
4818 import = true;
4819 }
4820 else if (__builtin_expect (c == 'm', false))
4821 {
4822 if (!((peek[0] == 'o' || peek[0] == '\\')
4823 && (peek = do_peek_ident ("module", peek, limit))))
4824 return false;
4825 }
4826 else
4827 return false;
4828
4829 /* Peek the next character to see if it's good enough. We'll be at
4830 the first non-whitespace char, including skipping an escaped
4831 newline. */
4832 /* ... import followed by identifier, ':', '<' or header-name
4833 preprocessing tokens, or module followed by identifier, ':' or
4834 ';' preprocessing tokens. */
4835 unsigned char p = *peek++;
4836
4837 /* A character literal is ... single quotes, ... optionally preceded
4838 by u8, u, U, or L */
4839 /* A string-literal is a ... double quotes, optionally prefixed by
4840 R, u8, u8R, u, uR, U, UR, L, or LR */
/* The next three branches reject an encoding prefix that is directly
   followed by a quote, since that would be a literal rather than an
   identifier.  Note the labels: peek_u8 and peek_u are entered by goto
   from the 'u' branch, and peek_R from the peek_u code.  */
4841 if (p == 'u')
4842 {
4843 peek = do_peek_next (peek, limit);
4844 if (*peek == '8')
4845 {
4846 peek++;
4847 goto peek_u8;
4848 }
4849 goto peek_u;
4850 }
4851 else if (p == 'U' || p == 'L')
4852 {
4853 peek_u8:
4854 peek = do_peek_next (peek, limit);
4855 peek_u:
4856 if (*peek == '\"' || *peek == '\'')
4857 return false;
4858
4859 if (*peek == 'R')
4860 goto peek_R;
4861 /* Identifier. Ok. */
4862 }
4863 else if (p == 'R')
4864 {
4865 peek_R:
4866 if (CPP_OPTION (pfile, rliterals))
4867 {
4868 peek = do_peek_next (peek, limit);
4869 if (*peek == '\"')
4870 return false;
4871 }
4872 /* Identifier. Ok. */
4873 }
/* 'Z' - 'A' == 25 is a compile-time constant: when it holds, plain
   range comparisons are used to recognise an identifier start,
   otherwise ISIDST is used (presumably for character sets where the
   letters are not contiguous).  */
4874 else if ('Z' - 'A' == 25
4875 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4876 : ISIDST (p))
4877 {
4878 /* Identifier. Ok. */
4879 }
4880 else if (p == '<')
4881 {
4882 /* Maybe angle header, ok for import. Reject
4883 '<=', '<<' digraph:'<:'. */
4884 if (!import)
4885 return false;
4886 peek = do_peek_next (peek, limit);
4887 if (*peek == '=' || *peek == '<'
4888 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4889 return false;
4890 }
4891 else if (p == ';')
4892 {
4893 /* SEMICOLON, ok for module. */
4894 if (import)
4895 return false;
4896 }
4897 else if (p == '"')
4898 {
4899 /* STRING, ok for import. */
4900 if (!import)
4901 return false;
4902 }
4903 else if (p == ':')
4904 {
4905 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
4906 peek = do_peek_next (peek, limit);
4907 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4908 return false;
4909 }
4910 else
4911 /* FIXME: Detect a unicode character, excluding those not
4912 permitted as the initial character. [lex.name]/1. I presume
4913 we need to check the \[uU] spellings, and directly using
4914 Unicode in say UTF8 form? Or perhaps we do the phase-1
4915 conversion of UTF8 to universal-character-names? */
4916 return false;
4917
4918 return true;
4919 }
4920
4921 /* Directives-only scanning. Somewhat more relaxed than correct
4922 parsing -- some ill-formed programs will not be rejected. */
/* Scans each buffer of PFILE character by character, handing
   preprocessing directives (a '#' at the beginning of a line) to
   _cpp_handle_directive and, when the module_directives option is
   set, tokenizing module control lines.  Everything else is passed
   through untouched.  Comments and string/character literals are
   skipped so that their contents are not mistaken for directives.

   CB is called with DATA and one of:
     CPP_DO_print     (line_count, base, length): a run of raw text;
     CPP_DO_location  (highest_line): after a directive or module line
                      when more of the buffer remains;
     CPP_DO_token     (tok, spelling): each token of a module control
                      line.

   After a directive or module line the scan restarts from
   buffer->next_line of whatever buffer is then current.  Each buffer
   is popped when exhausted, and the function returns once
   pfile->buffer is null.  */
4923
4924 void
4925 cpp_directive_only_process (cpp_reader *pfile,
4926 void *data,
4927 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4928 {
4929 bool module_p = CPP_OPTION (pfile, module_directives);
4930
4931 do
4932 {
4933 restart:
4934 /* Buffer initialization, but no line cleaning. */
4935 cpp_buffer *buffer = pfile->buffer;
4936 buffer->cur_note = buffer->notes_used = 0;
4937 buffer->cur = buffer->line_base = buffer->next_line;
4938 buffer->need_line = false;
4939 /* Files always end in a newline or carriage return. We rely on this for
4940 character peeking safety. */
4941 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4942
/* BASE is the start of the text not yet reported through CPP_DO_print;
   LINE_COUNT counts the line endings seen since BASE; LINE_START is
   the start of the current physical line.  */
4943 const unsigned char *base = buffer->cur;
4944 unsigned line_count = 0;
4945 const unsigned char *line_start = base;
4946
/* BOL is true while only whitespace (or comments) has been seen since
   the start of the line.  RAW is set by the quote_peek scan when the
   quote being examined opens a raw string literal.  */
4947 bool bol = true;
4948 bool raw = false;
4949
/* Low-water mark: the backward scans with do_peek_prev never look
   before LWM.  It is moved forward after each comment and literal.  */
4950 const unsigned char *lwm = base;
4951 for (const unsigned char *pos = base, *limit = buffer->rlimit;
4952 pos < limit;)
4953 {
4954 unsigned char c = *pos++;
4955 /* This matches the switch in _cpp_lex_direct. */
4956 switch (c)
4957 {
4958 case ' ': case '\t': case '\f': case '\v':
4959 /* Whitespace, do nothing. */
4960 break;
4961
4962 case '\r': /* MAC line ending, or Windows \r\n */
4963 if (*pos == '\n')
4964 pos++;
4965 /* FALLTHROUGH */
4966
4967 case '\n':
4968 bol = true;
4969
4970 next_line:
4971 CPP_INCREMENT_LINE (pfile, 0);
4972 line_count++;
4973 line_start = pos;
4974 break;
4975
4976 case '\\':
4977 /* <backslash><newline> is removed, and doesn't undo any
4978 preceding escape or whatnot. */
4979 if (*pos == '\n')
4980 {
4981 pos++;
4982 goto next_line;
4983 }
4984 else if (*pos == '\r')
4985 {
4986 if (pos[1] == '\n')
4987 pos++;
4988 pos++;
4989 goto next_line;
4990 }
4991 goto dflt;
4992
4993 case '#':
4994 if (bol)
4995 {
4996 /* Line directive. */
4997 if (pos - 1 > base && !pfile->state.skipping)
4998 cb (pfile, CPP_DO_print, data,
4999 line_count, base, pos - 1 - base);
5000
5001 /* Prep things for directive handling. */
5002 buffer->next_line = pos;
5003 buffer->need_line = true;
5004 bool ok = _cpp_get_fresh_line (pfile);
5005 gcc_checking_assert (ok);
5006
5007 /* Ensure proper column numbering for generated
5008 error messages. */
5009 buffer->line_base -= pos - line_start;
5010
/* The second argument is true when the '#' was not the first
   character of the line, i.e. it was preceded by whitespace.  */
5011 _cpp_handle_directive (pfile, line_start + 1 != pos);
5012
5013 /* Sanitize the line settings. Duplicate #include's can
5014 mess things up. */
5015 // FIXME: Necessary?
5016 pfile->line_table->highest_location
5017 = pfile->line_table->highest_line;
5018
5019 if (!pfile->state.skipping
5020 && pfile->buffer->next_line < pfile->buffer->rlimit)
5021 cb (pfile, CPP_DO_location, data,
5022 pfile->line_table->highest_line);
5023
5024 goto restart;
5025 }
5026 goto dflt;
5027
5028 case '/':
5029 {
5030 const unsigned char *peek = do_peek_next (pos, limit);
5031 if (!(*peek == '/' || *peek == '*'))
5032 goto dflt;
5033
5034 /* Line or block comment */
/* STAR is true when the previous significant character was a '*'
   inside a block comment; ESC is true right after a backslash.  */
5035 bool is_block = *peek == '*';
5036 bool star = false;
5037 bool esc = false;
5038 location_t sloc
5039 = linemap_position_for_column (pfile->line_table,
5040 pos - line_start);
5041
5042 while (pos < limit)
5043 {
5044 char c = *pos++;
5045 switch (c)
5046 {
5047 case '\\':
5048 esc = true;
5049 break;
5050
5051 case '\r':
5052 if (*pos == '\n')
5053 pos++;
5054 /* FALLTHROUGH */
5055
5056 case '\n':
5057 {
5058 CPP_INCREMENT_LINE (pfile, 0);
5059 line_count++;
5060 line_start = pos;
5061 if (!esc && !is_block)
5062 {
5063 bol = true;
5064 goto done_comment;
5065 }
5066 }
5067 if (!esc)
5068 star = false;
5069 esc = false;
5070 break;
5071
5072 case '*':
5073 if (pos > peek)
5074 star = is_block;
5075 esc = false;
5076 break;
5077
5078 case '/':
5079 if (star)
5080 goto done_comment;
5081 /* FALLTHROUGH */
5082
5083 default:
5084 star = false;
5085 esc = false;
5086 break;
5087 }
5088 }
5089 if (pos < limit || is_block)
5090 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5091 "unterminated comment");
5092 done_comment:
5093 lwm = pos;
5094 break;
5095 }
5096
5097 case '\'':
5098 if (!CPP_OPTION (pfile, digit_separators))
5099 goto delimited_string;
5100
5101 /* Possibly a number punctuator. */
5102 if (!ISIDNUM (*do_peek_next (pos, limit)))
5103 goto delimited_string;
5104
5105 goto quote_peek;
5106
5107 case '\"':
5108 if (!CPP_OPTION (pfile, rliterals))
5109 goto delimited_string;
5110
5111 quote_peek:
5112 {
5113 /* For ' see if it's a number punctuator
5114 \.?<digit>(<digit>|<identifier-nondigit>
5115 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5116 /* For " see if it's a raw string
5117 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5118 because that could be 0e+R. */
/* The scan below walks backwards from the quote, one character at a
   time via do_peek_prev, stopping at LWM.  */
5119 const unsigned char *peek = pos - 1;
5120 bool quote_first = c == '"';
5121 bool quote_eight = false;
5122 bool maybe_number_start = false;
5123 bool want_number = false;
5124
5125 while ((peek = do_peek_prev (peek, lwm)))
5126 {
5127 unsigned char p = *peek;
5128 if (quote_first)
5129 {
5130 if (!raw)
5131 {
5132 if (p != 'R')
5133 break;
5134 raw = true;
5135 continue;
5136 }
5137
5138 quote_first = false;
5139 if (p == 'L' || p == 'U' || p == 'u')
5140 ;
5141 else if (p == '8')
5142 quote_eight = true;
5143 else
5144 goto second_raw;
5145 }
5146 else if (quote_eight)
5147 {
5148 if (p != 'u')
5149 {
5150 raw = false;
5151 break;
5152 }
5153 quote_eight = false;
5154 }
5155 else if (c == '"')
5156 {
5157 second_raw:;
5158 if (!want_number && ISIDNUM (p))
5159 {
5160 raw = false;
5161 break;
5162 }
5163 }
5164
5165 if (ISDIGIT (p))
5166 maybe_number_start = true;
5167 else if (p == '.')
5168 want_number = true;
5169 else if (ISIDNUM (p))
5170 maybe_number_start = false;
5171 else if (p == '+' || p == '-')
5172 {
5173 if (const unsigned char *peek_prev
5174 = do_peek_prev (peek, lwm))
5175 {
5176 p = *peek_prev;
5177 if (p == 'e' || p == 'E'
5178 || p == 'p' || p == 'P')
5179 {
5180 want_number = true;
5181 maybe_number_start = false;
5182 }
5183 else
5184 break;
5185 }
5186 else
5187 break;
5188 }
5189 else if (p == '\'' || p == '\"')
5190 {
5191 /* If this is lwm, this must be the end of a
5192 previous string. So this is a trailing
5193 literal type, (a) if those are allowed,
5194 and (b) maybe_start is false. Otherwise
5195 this must be a CPP_NUMBER because we've
5196 met another ', and we'd have checked that
5197 in its own right. */
5198 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5199 {
5200 if (!maybe_number_start && !want_number)
5201 /* Must be a literal type. */
5202 raw = false;
5203 }
5204 else if (p == '\''
5205 && CPP_OPTION (pfile, digit_separators))
5206 maybe_number_start = true;
5207 break;
5208 }
5209 else if (c == '\'')
5210 break;
5211 else if (!quote_first && !quote_eight)
5212 break;
5213 }
5214
5215 if (maybe_number_start)
5216 {
5217 if (c == '\'')
5218 /* A CPP NUMBER. */
5219 goto dflt;
5220 raw = false;
5221 }
5222
5223 goto delimited_string;
5224 }
5225
5226 delimited_string:
5227 {
5228 /* (Possibly raw) string or char literal. */
/* END is the closing quote character to look for.  For a raw string,
   DELIM/DELIM_LEN describe the delimiter between the quote and '('.
   ESC counts pending backslashes in a non-raw literal.  */
5229 unsigned char end = c;
5230 int delim_len = -1;
5231 const unsigned char *delim = NULL;
5232 location_t sloc = linemap_position_for_column (pfile->line_table,
5233 pos - line_start);
5234 int esc = 0;
5235
5236 if (raw)
5237 {
5238 /* There can be no line breaks in the delimiter. */
5239 delim = pos;
5240 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5241 {
5242 if (delim_len == 16)
5243 {
5244 cpp_error_with_line (pfile, CPP_DL_ERROR,
5245 sloc, 0,
5246 "raw string delimiter"
5247 " longer than %d"
5248 " characters",
5249 delim_len);
5250 raw = false;
5251 pos = delim;
5252 break;
5253 }
5254 if (strchr (") \\\t\v\f\n", c))
5255 {
5256 cpp_error_with_line (pfile, CPP_DL_ERROR,
5257 sloc, 0,
5258 "invalid character '%c'"
5259 " in raw string"
5260 " delimiter", c);
5261 raw = false;
5262 pos = delim;
5263 break;
5264 }
5265 if (pos >= limit)
5266 goto bad_string;
5267 }
5268 }
5269
5270 while (pos < limit)
5271 {
5272 char c = *pos++;
5273 switch (c)
5274 {
5275 case '\\':
5276 if (!raw)
5277 esc++;
5278 break;
5279
5280 case '\r':
5281 if (*pos == '\n')
5282 pos++;
5283 /* FALLTHROUGH */
5284
5285 case '\n':
5286 {
5287 CPP_INCREMENT_LINE (pfile, 0);
5288 line_count++;
5289 line_start = pos;
5290 }
5291 if (esc)
5292 esc--;
5293 break;
5294
5295 case ')':
/* A raw string ends only at ')' followed by the same delimiter and
   the closing quote.  */
5296 if (raw
5297 && pos + delim_len + 1 < limit
5298 && pos[delim_len] == end
5299 && !memcmp (delim, pos, delim_len))
5300 {
5301 pos += delim_len + 1;
5302 raw = false;
5303 goto done_string;
5304 }
5305 break;
5306
5307 default:
/* A non-raw literal ends at the closing quote unless an odd number of
   backslashes immediately precedes it.  */
5308 if (!raw && !(esc & 1) && c == end)
5309 goto done_string;
5310 esc = 0;
5311 break;
5312 }
5313 }
5314 bad_string:
5315 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5316 "unterminated literal");
5317
5318 done_string:
5319 raw = false;
5320 lwm = pos - 1;
5321 }
5322 goto dflt;
5323
5324 case '_':
5325 case 'e':
5326 case 'i':
5327 case 'm':
5328 if (bol && module_p && !pfile->state.skipping
5329 && do_peek_module (pfile, c, pos, limit))
5330 {
5331 /* We've seen the start of a module control line.
5332 Start up the tokenizer. */
5333 pos--; /* Backup over the first character. */
5334
5335 /* Backup over whitespace to start of line. */
5336 while (pos > line_start
5337 && (pos[-1] == ' ' || pos[-1] == '\t'))
5338 pos--;
5339
5340 if (pos > base)
5341 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5342
5343 /* Prep things for directive handling. */
5344 buffer->next_line = pos;
5345 buffer->need_line = true;
5346
5347 /* Now get tokens until the PRAGMA_EOL. */
5348 do
5349 {
5350 location_t spelling;
5351 const cpp_token *tok
5352 = cpp_get_token_with_location (pfile, &spelling);
5353
5354 gcc_assert (pfile->state.in_deferred_pragma
5355 || tok->type == CPP_PRAGMA_EOL);
5356 cb (pfile, CPP_DO_token, data, tok, spelling);
5357 }
5358 while (pfile->state.in_deferred_pragma);
5359
5360 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5361 cb (pfile, CPP_DO_location, data,
5362 pfile->line_table->highest_line);
5363
5364 pfile->mi_valid = false;
5365 goto restart;
5366 }
5367 goto dflt;
5368
5369 default:
5370 dflt:
/* Any other character: the line no longer consists of whitespace
   only, and the mi_valid flag is cleared.  */
5371 bol = false;
5372 pfile->mi_valid = false;
5373 break;
5374 }
5375 }
5376
/* End of this buffer: report whatever text remains since BASE.  */
5377 if (buffer->rlimit > base && !pfile->state.skipping)
5378 {
5379 const unsigned char *limit = buffer->rlimit;
5380 /* If the file was not newline terminated, add rlimit, which is
5381 guaranteed to point to a newline, to the end of our range. */
5382 if (limit[-1] != '\n')
5383 {
5384 limit++;
5385 CPP_INCREMENT_LINE (pfile, 0);
5386 line_count++;
5387 }
5388 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5389 }
5390
5391 _cpp_pop_buffer (pfile);
5392 }
5393 while (pfile->buffer);
5394 }
5395