lex.cc revision 1.2 1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2022 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
26
/* Spelling category of a token type.  One of these is stored for
   every token type in the token_spellings table below: entries
   generated by the OP macro are SPELL_OPERATOR, entries generated by
   the TK macro take whichever SPELL_* suffix the table supplies.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
34
/* One entry of the token_spellings table: the spelling category of a
   token type together with a string naming it.  */
struct token_spelling
{
  /* How tokens of this type are spelled.  */
  enum spell_type category;
  /* For OP entries the operator text given in the table; for TK
     entries the stringified enumerator name.  */
  const unsigned char *name;
};
40
/* Spellings of the six digraphs.
   NOTE(review): the index order presumably has to agree with the
   digraph token numbering used by the consumers of this table --
   confirm before reordering.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Expand TTYPE_TABLE into one token_spelling initializer per token
   type.  OP entries record the operator's text, TK entries record the
   spelling category named by S and the enumerator name as a string.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category / name for TOKEN via its type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52
/* Forward declarations of the file-local helpers.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
66
67
68 /* Utility routine:
69
70 Compares, the token TOKEN to the NUL-terminated string STRING.
71 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
72 int
73 cpp_ideq (const cpp_token *token, const char *string)
74 {
75 if (token->type != CPP_NAME)
76 return 0;
77
78 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
79 }
80
81 /* Record a note TYPE at byte POS into the current cleaned logical
82 line. */
83 static void
84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85 {
86 if (buffer->notes_used == buffer->notes_cap)
87 {
88 buffer->notes_cap = buffer->notes_cap * 2 + 200;
89 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90 buffer->notes_cap);
91 }
92
93 buffer->notes[buffer->notes_used].pos = pos;
94 buffer->notes[buffer->notes_used].type = type;
95 buffer->notes_used++;
96 }
97
98
99 /* Fast path to find line special characters using optimized character
101 scanning algorithms. Anything complicated falls back to the slow
102 path below. Since this loop is very hot it's worth doing these kinds
103 of optimizations.
104
105 One of the paths through the ifdefs should provide
106
107 const uchar *search_line_fast (const uchar *s, const uchar *end);
108
109 Between S and END, search for \n, \r, \\, ?. Return a pointer to
110 the found character.
111
112 Note that the last character of the buffer is *always* a newline,
113 as forced by _cpp_convert_input. This fact can be used to avoid
114 explicitly looking for the end of the buffer. */
115
/* Configure gives us an ifdef test.  Default the macro to 0 so that it
   can also be used as an ordinary value in C conditionals below.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array bound
   evaluates to 1 for an accepted size and to -1, which is not a valid
   array bound, for any other size.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
135
136 /* Return X with the first N bytes forced to values that won't match one
137 of the interesting characters. Note that NUL is not interesting. */
138
139 static inline word_type
140 acc_char_mask_misalign (word_type val, unsigned int n)
141 {
142 word_type mask = -1;
143 if (WORDS_BIGENDIAN)
144 mask >>= n * 8;
145 else
146 mask <<= n * 8;
147 return val & mask;
148 }
149
150 /* Return X replicated to all byte positions within WORD_TYPE. */
151
152 static inline word_type
153 acc_char_replicate (uchar x)
154 {
155 word_type ret;
156
157 ret = (x << 24) | (x << 16) | (x << 8) | x;
158 if (sizeof(word_type) == 8)
159 ret = (ret << 16 << 16) | ret;
160 return ret;
161 }
162
163 /* Return non-zero if some byte of VAL is (probably) C. */
164
165 static inline word_type
166 acc_char_cmp (word_type val, word_type c)
167 {
168 #if defined(__GNUC__) && defined(__alpha__)
169 /* We can get exact results using a compare-bytes instruction.
170 Get (val == c) via (0 >= (val ^ c)). */
171 return __builtin_alpha_cmpbge (0, val ^ c);
172 #else
173 word_type magic = 0x7efefefeU;
174 if (sizeof(word_type) == 8)
175 magic = (magic << 16 << 16) | 0xfefefefeU;
176 magic |= 1;
177
178 val ^= c;
179 return ((val + magic) ^ ~val) & ~magic;
180 #endif
181 }
182
183 /* Given the result of acc_char_cmp is non-zero, return the index of
184 the found character. If this was a false positive, return -1. */
185
186 static inline int
187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
188 word_type val ATTRIBUTE_UNUSED)
189 {
190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
191 /* The cmpbge instruction sets *bits* of the result corresponding to
192 matches in the bytes with no false positives. */
193 return __builtin_ctzl (cmp);
194 #else
195 unsigned int i;
196
197 /* ??? It would be nice to force unrolling here,
198 and have all of these constants folded. */
199 for (i = 0; i < sizeof(word_type); ++i)
200 {
201 uchar c;
202 if (WORDS_BIGENDIAN)
203 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
204 else
205 c = (val >> i * 8) & 0xff;
206
207 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
208 return i;
209 }
210
211 return -1;
212 #endif
213 }
214
215 /* A version of the fast scanner using bit fiddling techniques.
216
217 For 32-bit words, one would normally perform 16 comparisons and
218 16 branches. With this algorithm one performs 24 arithmetic
219 operations and one branch. Whether this is faster with a 32-bit
220 word size is going to be somewhat system dependent.
221
222 For 64-bit words, we eliminate twice the number of comparisons
223 and branches without increasing the number of arithmetic operations.
224 It's almost certainly going to be a win with 64-bit word size. */
225
226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
227 ATTRIBUTE_UNUSED;
228
229 static const uchar *
230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
231 {
232 const word_type repl_nl = acc_char_replicate ('\n');
233 const word_type repl_cr = acc_char_replicate ('\r');
234 const word_type repl_bs = acc_char_replicate ('\\');
235 const word_type repl_qm = acc_char_replicate ('?');
236
237 unsigned int misalign;
238 const word_type *p;
239 word_type val, t;
240
241 /* Align the buffer. Mask out any bytes from before the beginning. */
242 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
243 val = *p;
244 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
245 if (misalign)
246 val = acc_char_mask_misalign (val, misalign);
247
248 /* Main loop. */
249 while (1)
250 {
251 t = acc_char_cmp (val, repl_nl);
252 t |= acc_char_cmp (val, repl_cr);
253 t |= acc_char_cmp (val, repl_bs);
254 t |= acc_char_cmp (val, repl_qm);
255
256 if (__builtin_expect (t != 0, 0))
257 {
258 int i = acc_char_index (t, val);
259 if (i >= 0)
260 return (const uchar *)p + i;
261 }
262
263 val = *++p;
264 }
265 }
266
267 /* Disable on Solaris 2/x86 until the following problem can be properly
268 autoconfed:
269
270 The Solaris 10+ assembler tags objects with the instruction set
271 extensions used, so SSE4.2 executables cannot run on machines that
272 don't support that extension. */
273
274 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
275
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 is '\n', row 1 is '\r', row 2 is '\\' and row 3 is '?'; the
   scanners below load a row, or the first half of one, as a vector
   through a pointer cast, hence the 16-byte alignment.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
290
291 /* A version of the fast scanner using MMX vectorized byte compare insns.
292
293 This uses the PMOVMSKB instruction which was introduced with "MMX2",
294 which was packaged into SSE1; it is also present in the AMD MMX
295 extension. Mark the function as using "sse" so that we emit a real
296 "emms" instruction, rather than the 3dNOW "femms" instruction. */
297
298 static const uchar *
299 #ifndef __SSE__
300 __attribute__((__target__("sse")))
301 #endif
302 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
303 {
304 typedef char v8qi __attribute__ ((__vector_size__ (8)));
305 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
306
307 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
308 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
309 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
310 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
311
312 unsigned int misalign, found, mask;
313 const v8qi *p;
314 v8qi data, t, c;
315
316 /* Align the source pointer. While MMX doesn't generate unaligned data
317 faults, this allows us to safely scan to the end of the buffer without
318 reading beyond the end of the last page. */
319 misalign = (uintptr_t)s & 7;
320 p = (const v8qi *)((uintptr_t)s & -8);
321 data = *p;
322
323 /* Create a mask for the bytes that are valid within the first
324 16-byte block. The Idea here is that the AND with the mask
325 within the loop is "free", since we need some AND or TEST
326 insn in order to set the flags for the branch anyway. */
327 mask = -1u << misalign;
328
329 /* Main loop processing 8 bytes at a time. */
330 goto start;
331 do
332 {
333 data = *++p;
334 mask = -1;
335
336 start:
337 t = __builtin_ia32_pcmpeqb(data, repl_nl);
338 c = __builtin_ia32_pcmpeqb(data, repl_cr);
339 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
340 c = __builtin_ia32_pcmpeqb(data, repl_bs);
341 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
342 c = __builtin_ia32_pcmpeqb(data, repl_qm);
343 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
344 found = __builtin_ia32_pmovmskb (t);
345 found &= mask;
346 }
347 while (!found);
348
349 __builtin_ia32_emms ();
350
351 /* FOUND contains 1 in bits for which we matched a relevant
352 character. Conversion to the byte index is trivial. */
353 found = __builtin_ctz(found);
354 return (const uchar *)p + found;
355 }
356
357 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
358
359 static const uchar *
360 #ifndef __SSE2__
361 __attribute__((__target__("sse2")))
362 #endif
363 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
364 {
365 typedef char v16qi __attribute__ ((__vector_size__ (16)));
366
367 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
368 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
369 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
370 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
371
372 unsigned int misalign, found, mask;
373 const v16qi *p;
374 v16qi data, t;
375
376 /* Align the source pointer. */
377 misalign = (uintptr_t)s & 15;
378 p = (const v16qi *)((uintptr_t)s & -16);
379 data = *p;
380
381 /* Create a mask for the bytes that are valid within the first
382 16-byte block. The Idea here is that the AND with the mask
383 within the loop is "free", since we need some AND or TEST
384 insn in order to set the flags for the branch anyway. */
385 mask = -1u << misalign;
386
387 /* Main loop processing 16 bytes at a time. */
388 goto start;
389 do
390 {
391 data = *++p;
392 mask = -1;
393
394 start:
395 t = data == repl_nl;
396 t |= data == repl_cr;
397 t |= data == repl_bs;
398 t |= data == repl_qm;
399 found = __builtin_ia32_pmovmskb128 (t);
400 found &= mask;
401 }
402 while (!found);
403
404 /* FOUND contains 1 in bits for which we matched a relevant
405 character. Conversion to the byte index is trivial. */
406 found = __builtin_ctz(found);
407 return (const uchar *)p + found;
408 }
409
410 #ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END
   is only used to decide whether an unaligned 16-byte read at S is
   safe; see below.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for.  Only the first four elements
     are meaningful; the explicit length 4 is passed to PCMPESTRI.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is the offset of a match within the 16 bytes
         just examined; S is still the original, unaligned pointer.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  F receives
         the carry flag; the loop ends once it is set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
        break;

      s += 16;
    }
#else
  /* The assembly loop adds 16 to S before each compare, so step back
     one block first.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  The loop repeats while
     the carry flag is clear.  */
  __asm (      ".balign 16\n"
        "0: add $16, %1\n"
        " %vpcmpestri\t$0, (%1), %2\n"
        " jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

 found:
  /* INDEX is the offset of the match relative to the block at S.  */
  return s + index;
}
485
486 #else
487 /* Work around out-dated assemblers without sse4 support. */
488 #define search_line_sse42 search_line_sse2
489 #endif
490
491 /* Check the CPU capabilities. */
492
493 #include "../gcc/config/i386/cpuid.h"
494
/* Signature shared by all scanner implementations, and the pointer
   through which the selected one is called.  The pointer is set by
   init_vectorized_lexer.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
static search_line_fast_type search_line_fast;
497
498 #define HAVE_init_vectorized_lexer 1
499 static inline void
500 init_vectorized_lexer (void)
501 {
502 unsigned dummy, ecx = 0, edx = 0;
503 search_line_fast_type impl = search_line_acc_char;
504 int minimum = 0;
505
506 #if defined(__SSE4_2__)
507 minimum = 3;
508 #elif defined(__SSE2__)
509 minimum = 2;
510 #elif defined(__SSE__)
511 minimum = 1;
512 #endif
513
514 if (minimum == 3)
515 impl = search_line_sse42;
516 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
517 {
518 if (minimum == 3 || (ecx & bit_SSE4_2))
519 impl = search_line_sse42;
520 else if (minimum == 2 || (edx & bit_SSE2))
521 impl = search_line_sse2;
522 else if (minimum == 1 || (edx & bit_SSE))
523 impl = search_line_mmx;
524 }
525 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
526 {
527 if (minimum == 1
528 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
529 impl = search_line_mmx;
530 }
531
532 search_line_fast = impl;
533 }
534
535 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
536
/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the AltiVec version.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END
   is not consulted.  */

ATTRIBUTE_NO_SANITIZE_UNDEFINED
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  The load starts at S
     itself, so no leading bytes have to be masked off.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      data = __builtin_vec_vsx_ld (0, s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore S to point to the 16 bytes we just processed.  */
  s -= 16;

  {
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past
       every word that is skipped.  The last word is not tested: the
       loop above only exits when some byte matched, so if all earlier
       words are zero the match is in the last one.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  The first byte in memory is
       the most significant one on big-endian and the least
       significant one on little-endian, hence clz versus ctz.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}
636
637 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
638
/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END
   is not consulted.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  The discarded bytes become NUL, which is
     not one of the characters searched for.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block is
     already loaded and masked, so enter the loop past the load.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past
       every word that is skipped.  The last word is not tested: the
       loop above only exits when some byte matched, so if all earlier
       words are zero the match is in the last one.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHROUGH */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  This variant is big-endian
       only, so the first byte in memory is the most significant.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
755
756 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
757 #include "arm_neon.h"
758
759 /* This doesn't have to be the exact page size, but no system may use
760 a size smaller than this. ARMv8 requires a minimum page size of
761 4k. The impact of being conservative here is a small number of
762 cases will take the slightly slower entry path into the main
763 loop. */
764
765 #define AARCH64_MIN_PAGE_SIZE 4096
766
/* AArch64 Advanced SIMD version of the fast scanner.  Starting at S,
   return a pointer to the first \n, \r, \\ or ?.  END is not
   consulted.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives every byte within each 8-byte half its own bit, so that the
     0xff/0x00 compare results can be folded into a bit mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  /* Per-lane shift that moves the sums of one 8-byte half into bits
     8..15 of the final mask.  */
#ifdef __ARM_BIG_ENDIAN
  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
#else
  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
#endif

  unsigned int found;
  const uint8_t *p;
  uint8x16_t data;
  uint8x16_t t;
  uint16x8_t m;
  uint8x16_t u, v, w;

  /* Align the source pointer.  */
  p = (const uint8_t *)((uintptr_t)s & -16);

  /* Assuming random string start positions, with a 4k page size we'll take
     the slow path about 0.37% of the time.  */
  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
                         - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
                        < 16, 0))
    {
      /* Slow path: the string starts near a possible page boundary.
         Load the aligned block containing S instead of 16 bytes from S
         itself, and ignore the matches that lie before S.  */
      uint32_t misalign, mask;

      misalign = (uintptr_t)s & 15;
      mask = (-1u << misalign) & 0xffff;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      /* Reduce the per-byte results to one bit per byte in FOUND.  */
      t = vandq_u8 (t, xmask);
      m = vpaddlq_u8 (t);
      m = vshlq_u16 (m, shift);
      found = vaddvq_u16 (m);
      found &= mask;
      if (found)
        return (const uchar*)p + __builtin_ctz (found);
    }
  else
    {
      /* Fast path: an unaligned load of 16 bytes starting at S.  */
      data = vld1q_u8 ((const uint8_t *) s);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      /* Any non-zero byte in T means there is a match in this block;
         P is still the aligned-down address here.  */
      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
        goto done;
    }

  /* Main loop over aligned 16-byte blocks, beginning with the block
     after the one P points to.  After the fast path this may rescan a
     few bytes that were already examined.  */
  do
    {
      p += 16;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
    } while (!vpaddd_u64 ((uint64x2_t)t));

done:
  /* Now that we've found the terminating substring, work out precisely where
     we need to stop.  */
  t = vandq_u8 (t, xmask);
  m = vpaddlq_u8 (t);
  m = vshlq_u16 (m, shift);
  found = vaddvq_u16 (m);
  /* When we arrive here straight from the fast path, the block that
     was examined started at S and P is still below S; otherwise the
     block started at P.  */
  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
          + __builtin_ctz (found));
}
850
851 #elif defined (__ARM_NEON)
852 #include "arm_neon.h"
853
854 static const uchar *
855 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
856 {
857 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
858 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
859 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
860 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
861 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
862
863 unsigned int misalign, found, mask;
864 const uint8_t *p;
865 uint8x16_t data;
866
867 /* Align the source pointer. */
868 misalign = (uintptr_t)s & 15;
869 p = (const uint8_t *)((uintptr_t)s & -16);
870 data = vld1q_u8 (p);
871
872 /* Create a mask for the bytes that are valid within the first
873 16-byte block. The Idea here is that the AND with the mask
874 within the loop is "free", since we need some AND or TEST
875 insn in order to set the flags for the branch anyway. */
876 mask = (-1u << misalign) & 0xffff;
877
878 /* Main loop, processing 16 bytes at a time. */
879 goto start;
880
881 do
882 {
883 uint8x8_t l;
884 uint16x4_t m;
885 uint32x2_t n;
886 uint8x16_t t, u, v, w;
887
888 p += 16;
889 data = vld1q_u8 (p);
890 mask = 0xffff;
891
892 start:
893 t = vceqq_u8 (data, repl_nl);
894 u = vceqq_u8 (data, repl_cr);
895 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
896 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
897 t = vandq_u8 (vorrq_u8 (v, w), xmask);
898 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
899 m = vpaddl_u8 (l);
900 n = vpaddl_u16 (m);
901
902 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
903 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
904 found &= mask;
905 }
906 while (!found);
907
908 /* FOUND contains 1 in bits for which we matched a relevant
909 character. Conversion to the byte index is trivial. */
910 found = __builtin_ctz (found);
911 return (const uchar *)p + found;
912 }
913
914 #else
915
916 /* We only have one accelerated alternative. Use a direct call so that
917 we encourage inlining. */
918
919 #define search_line_fast search_line_acc_char
920
921 #endif
922
/* One-time lexer setup.  When a run-time selectable vectorized
   scanner is available, choose the implementation to use; otherwise
   there is nothing to do.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
932
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at PFILE's buffer->next_line.  On return
   buffer->cur and buffer->line_base point at its start, its end is
   marked with a '\n' stored in place, buffer->next_line points past
   the physical text consumed, and the buffer's notes record every
   escaped newline and trigraph met, followed by a sentinel note.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  /* S is the read position, D the write position; they only diverge
     once something has been spliced out on the slow path.  */
  const uchar *s;
  uchar c, *d, *p;

  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Position of the most recent backslash seen on the fast path.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  The replacement character is
                         written over the first '?', and S is left on
                         the last character of the trigraph.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline: skip backwards over horizontal
         whitespace and see whether we land just after the last
         backslash.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type is ' ' when whitespace separated
         the backslash from the newline and '\\' otherwise.  D is set
         so that the next character is stored over the backslash.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Slow path: copy the rest of the line down to D one character
         at a time, splicing out escaped newlines and, if enabled,
         replacing trigraphs.  buffer->next_line is reused here as the
         lower bound for the backwards whitespace scan.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* from_stage3 buffers are not scanned for escaped newlines or
         trigraphs; just find the end of the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line; this also turns a '\r' line ending
     into '\n'.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
1076
1077 /* Return true if the trigraph indicated by NOTE should be warned
1078 about in a comment. */
1079 static bool
1080 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1081 {
1082 const uchar *p;
1083
1084 /* Within comments we don't warn about trigraphs, unless the
1085 trigraph forms an escaped newline, as that may change
1086 behavior. */
1087 if (note->type != '/')
1088 return false;
1089
1090 /* If -trigraphs, then this was an escaped newline iff the next note
1091 is coincident. */
1092 if (CPP_OPTION (pfile, trigraphs))
1093 return note[1].pos == note->pos;
1094
1095 /* Otherwise, see if this forms an escaped newline. */
1096 p = note->pos + 3;
1097 while (is_nvspace (*p))
1098 p++;
1099
1100 /* There might have been escaped newlines between the trigraph and the
1101 newline we found. Hence the position test. */
1102 return (*p == '\n' && p < note[1].pos);
1103 }
1104
1105 /* Process the notes created by add_line_note as far as the current
1106 location. */
1107 void
1108 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1109 {
1110 cpp_buffer *buffer = pfile->buffer;
1111
1112 for (;;)
1113 {
1114 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1115 unsigned int col;
1116
1117 if (note->pos > buffer->cur)
1118 break;
1119
1120 buffer->cur_note++;
1121 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1122
1123 if (note->type == '\\' || note->type == ' ')
1124 {
1125 if (note->type == ' ' && !in_comment)
1126 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1127 "backslash and newline separated by space");
1128
1129 if (buffer->next_line > buffer->rlimit)
1130 {
1131 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1132 "backslash-newline at end of file");
1133 /* Prevent "no newline at end of file" warning. */
1134 buffer->next_line = buffer->rlimit;
1135 }
1136
1137 buffer->line_base = note->pos;
1138 CPP_INCREMENT_LINE (pfile, 0);
1139 }
1140 else if (_cpp_trigraph_map[note->type])
1141 {
1142 if (CPP_OPTION (pfile, warn_trigraphs)
1143 && (!in_comment || warn_in_comment (pfile, note)))
1144 {
1145 if (CPP_OPTION (pfile, trigraphs))
1146 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1147 pfile->line_table->highest_line, col,
1148 "trigraph ??%c converted to %c",
1149 note->type,
1150 (int) _cpp_trigraph_map[note->type]);
1151 else
1152 {
1153 cpp_warning_with_line
1154 (pfile, CPP_W_TRIGRAPHS,
1155 pfile->line_table->highest_line, col,
1156 "trigraph ??%c ignored, use -trigraphs to enable",
1157 note->type);
1158 }
1159 }
1160 }
1161 else if (note->type == 0)
1162 /* Already processed in lex_raw_string. */;
1163 else
1164 abort ();
1165 }
1166 }
1167
1168 namespace bidi {
1169 enum class kind {
1170 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1171 };
1172
1173 /* All the UTF-8 encodings of bidi characters start with E2. */
1174 constexpr uchar utf8_start = 0xe2;
1175
1176 struct context
1177 {
1178 context () {}
1179 context (location_t loc, kind k, bool pdf, bool ucn)
1180 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1181 {
1182 }
1183
1184 kind get_pop_kind () const
1185 {
1186 return m_pdf ? kind::PDF : kind::PDI;
1187 }
1188 bool ucn_p () const
1189 {
1190 return m_ucn;
1191 }
1192
1193 location_t m_loc;
1194 kind m_kind;
1195 unsigned m_pdf : 1;
1196 unsigned m_ucn : 1;
1197 };
1198
1199 /* A vector holding currently open bidi contexts. We use a char for
1200 each context, its LSB is 1 if it represents a PDF context, 0 if it
1201 represents a PDI context. The next bit is 1 if this context was open
1202 by a bidi character written as a UCN, and 0 when it was UTF-8. */
1203 semi_embedded_vec <context, 16> vec;
1204
1205 /* Close the whole comment/identifier/string literal/character constant
1206 context. */
1207 void on_close ()
1208 {
1209 vec.truncate (0);
1210 }
1211
1212 /* Pop the last element in the vector. */
1213 void pop ()
1214 {
1215 unsigned int len = vec.count ();
1216 gcc_checking_assert (len > 0);
1217 vec.truncate (len - 1);
1218 }
1219
1220 /* Return the pop kind of the context of the Ith element. */
1221 kind pop_kind_at (unsigned int i)
1222 {
1223 return vec[i].get_pop_kind ();
1224 }
1225
1226 /* Return the pop kind of the context that is currently opened. */
1227 kind current_ctx ()
1228 {
1229 unsigned int len = vec.count ();
1230 if (len == 0)
1231 return kind::NONE;
1232 return vec[len - 1].get_pop_kind ();
1233 }
1234
1235 /* Return true if the current context comes from a UCN origin, that is,
1236 the bidi char which started this bidi context was written as a UCN. */
1237 bool current_ctx_ucn_p ()
1238 {
1239 unsigned int len = vec.count ();
1240 gcc_checking_assert (len > 0);
1241 return vec[len - 1].m_ucn;
1242 }
1243
1244 location_t current_ctx_loc ()
1245 {
1246 unsigned int len = vec.count ();
1247 gcc_checking_assert (len > 0);
1248 return vec[len - 1].m_loc;
1249 }
1250
1251 /* We've read a bidi char, update the current vector as necessary.
1252 LOC is only valid when K is not kind::NONE. */
1253 void on_char (kind k, bool ucn_p, location_t loc)
1254 {
1255 switch (k)
1256 {
1257 case kind::LRE:
1258 case kind::RLE:
1259 case kind::LRO:
1260 case kind::RLO:
1261 vec.push (context (loc, k, true, ucn_p));
1262 break;
1263 case kind::LRI:
1264 case kind::RLI:
1265 case kind::FSI:
1266 vec.push (context (loc, k, false, ucn_p));
1267 break;
1268 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1269 whose scope has not yet been terminated. */
1270 case kind::PDF:
1271 if (current_ctx () == kind::PDF)
1272 pop ();
1273 break;
1274 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1275 scope has not yet been terminated, as well as the scopes of
1276 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1277 yet been terminated. */
1278 case kind::PDI:
1279 for (int i = vec.count () - 1; i >= 0; --i)
1280 if (pop_kind_at (i) == kind::PDI)
1281 {
1282 vec.truncate (i);
1283 break;
1284 }
1285 break;
1286 case kind::LTR:
1287 case kind::RTL:
1288 /* These aren't popped by a PDF/PDI. */
1289 break;
1290 ATTR_LIKELY case kind::NONE:
1291 break;
1292 default:
1293 abort ();
1294 }
1295 }
1296
1297 /* Return a descriptive string for K. */
1298 const char *to_str (kind k)
1299 {
1300 switch (k)
1301 {
1302 case kind::LRE:
1303 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1304 case kind::RLE:
1305 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1306 case kind::LRO:
1307 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1308 case kind::RLO:
1309 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1310 case kind::LRI:
1311 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1312 case kind::RLI:
1313 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1314 case kind::FSI:
1315 return "U+2068 (FIRST STRONG ISOLATE)";
1316 case kind::PDF:
1317 return "U+202C (POP DIRECTIONAL FORMATTING)";
1318 case kind::PDI:
1319 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1320 case kind::LTR:
1321 return "U+200E (LEFT-TO-RIGHT MARK)";
1322 case kind::RTL:
1323 return "U+200F (RIGHT-TO-LEFT MARK)";
1324 default:
1325 abort ();
1326 }
1327 }
1328 }
1329
1330 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1331 within the current line in FILE, with the caret at START. */
1332
1333 static location_t
1334 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1335 const unsigned char *const start,
1336 size_t num_bytes)
1337 {
1338 gcc_checking_assert (num_bytes > 0);
1339
1340 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1341 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1342 whereas linemap_position_for_column is 1-based. */
1343
1344 /* Get 0-based offsets within the line. */
1345 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1346 size_t end_offset = start_offset + num_bytes - 1;
1347
1348 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1349 location_t start_loc = linemap_position_for_column (pfile->line_table,
1350 start_offset + 1);
1351 location_t end_loc = linemap_position_for_column (pfile->line_table,
1352 end_offset + 1);
1353
1354 if (start_loc == end_loc)
1355 return start_loc;
1356
1357 source_range src_range;
1358 src_range.m_start = start_loc;
1359 src_range.m_finish = end_loc;
1360 location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1361 start_loc,
1362 src_range,
1363 NULL);
1364 return combined_loc;
1365 }
1366
1367 /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1368
1369 static bidi::kind
1370 get_bidi_utf8_1 (const unsigned char *const p)
1371 {
1372 gcc_checking_assert (p[0] == bidi::utf8_start);
1373
1374 if (p[1] == 0x80)
1375 switch (p[2])
1376 {
1377 case 0xaa:
1378 return bidi::kind::LRE;
1379 case 0xab:
1380 return bidi::kind::RLE;
1381 case 0xac:
1382 return bidi::kind::PDF;
1383 case 0xad:
1384 return bidi::kind::LRO;
1385 case 0xae:
1386 return bidi::kind::RLO;
1387 case 0x8e:
1388 return bidi::kind::LTR;
1389 case 0x8f:
1390 return bidi::kind::RTL;
1391 default:
1392 break;
1393 }
1394 else if (p[1] == 0x81)
1395 switch (p[2])
1396 {
1397 case 0xa6:
1398 return bidi::kind::LRI;
1399 case 0xa7:
1400 return bidi::kind::RLI;
1401 case 0xa8:
1402 return bidi::kind::FSI;
1403 case 0xa9:
1404 return bidi::kind::PDI;
1405 default:
1406 break;
1407 }
1408
1409 return bidi::kind::NONE;
1410 }
1411
1412 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1413 If the kind is not NONE, write the location to *OUT.*/
1414
1415 static bidi::kind
1416 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1417 {
1418 bidi::kind result = get_bidi_utf8_1 (p);
1419 if (result != bidi::kind::NONE)
1420 {
1421 /* We have a sequence of 3 bytes starting at P. */
1422 *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1423 }
1424 return result;
1425 }
1426
1427 /* Parse a UCN where P points just past \u or \U and return its bidi code. */
1428
1429 static bidi::kind
1430 get_bidi_ucn_1 (const unsigned char *p, bool is_U)
1431 {
1432 /* 6.4.3 Universal Character Names
1433 \u hex-quad
1434 \U hex-quad hex-quad
1435 where \unnnn means \U0000nnnn. */
1436
1437 if (is_U)
1438 {
1439 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1440 return bidi::kind::NONE;
1441 /* Skip 4B so we can treat \u and \U the same below. */
1442 p += 4;
1443 }
1444
1445 /* All code points we are looking for start with 20xx. */
1446 if (p[0] != '2' || p[1] != '0')
1447 return bidi::kind::NONE;
1448 else if (p[2] == '2')
1449 switch (p[3])
1450 {
1451 case 'a':
1452 case 'A':
1453 return bidi::kind::LRE;
1454 case 'b':
1455 case 'B':
1456 return bidi::kind::RLE;
1457 case 'c':
1458 case 'C':
1459 return bidi::kind::PDF;
1460 case 'd':
1461 case 'D':
1462 return bidi::kind::LRO;
1463 case 'e':
1464 case 'E':
1465 return bidi::kind::RLO;
1466 default:
1467 break;
1468 }
1469 else if (p[2] == '6')
1470 switch (p[3])
1471 {
1472 case '6':
1473 return bidi::kind::LRI;
1474 case '7':
1475 return bidi::kind::RLI;
1476 case '8':
1477 return bidi::kind::FSI;
1478 case '9':
1479 return bidi::kind::PDI;
1480 default:
1481 break;
1482 }
1483 else if (p[2] == '0')
1484 switch (p[3])
1485 {
1486 case 'e':
1487 case 'E':
1488 return bidi::kind::LTR;
1489 case 'f':
1490 case 'F':
1491 return bidi::kind::RTL;
1492 default:
1493 break;
1494 }
1495
1496 return bidi::kind::NONE;
1497 }
1498
1499 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1500 If the kind is not NONE, write the location to *OUT.*/
1501
1502 static bidi::kind
1503 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1504 location_t *out)
1505 {
1506 bidi::kind result = get_bidi_ucn_1 (p, is_U);
1507 if (result != bidi::kind::NONE)
1508 {
1509 const unsigned char *start = p - 2;
1510 size_t num_bytes = 2 + (is_U ? 8 : 4);
1511 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1512 }
1513 return result;
1514 }
1515
1516 /* Subclass of rich_location for reporting on unpaired UTF-8
1517 bidirectional control character(s).
1518 Escape the source lines on output, and show all unclosed
1519 bidi context, labelling everything. */
1520
1521 class unpaired_bidi_rich_location : public rich_location
1522 {
1523 public:
1524 class custom_range_label : public range_label
1525 {
1526 public:
1527 label_text get_text (unsigned range_idx) const FINAL OVERRIDE
1528 {
1529 /* range 0 is the primary location; each subsequent range i + 1
1530 is for bidi::vec[i]. */
1531 if (range_idx > 0)
1532 {
1533 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1534 return label_text::borrow (bidi::to_str (ctxt.m_kind));
1535 }
1536 else
1537 return label_text::borrow (_("end of bidirectional context"));
1538 }
1539 };
1540
1541 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1542 : rich_location (pfile->line_table, loc, &m_custom_label)
1543 {
1544 set_escape_on_output (true);
1545 for (unsigned i = 0; i < bidi::vec.count (); i++)
1546 add_range (bidi::vec[i].m_loc,
1547 SHOW_RANGE_WITHOUT_CARET,
1548 &m_custom_label);
1549 }
1550
1551 private:
1552 custom_range_label m_custom_label;
1553 };
1554
1555 /* We're closing a bidi context, that is, we've encountered a newline,
1556 are closing a C-style comment, or are at the end of a string literal,
1557 character constant, or identifier. Warn if this context was not
1558 properly terminated by a PDI or PDF. P points to the last character
1559 in this context. */
1560
1561 static void
1562 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1563 {
1564 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1565 if (bidi::vec.count () > 0
1566 && (warn_bidi & bidirectional_unpaired
1567 && (!bidi::current_ctx_ucn_p ()
1568 || (warn_bidi & bidirectional_ucn))))
1569 {
1570 const location_t loc
1571 = linemap_position_for_column (pfile->line_table,
1572 CPP_BUF_COLUMN (pfile->buffer, p));
1573 unpaired_bidi_rich_location rich_loc (pfile, loc);
1574 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1575 forms of a diagnostic, so fake it for now. */
1576 if (bidi::vec.count () > 1)
1577 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1578 "unpaired UTF-8 bidirectional control characters "
1579 "detected");
1580 else
1581 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1582 "unpaired UTF-8 bidirectional control character "
1583 "detected");
1584 }
1585 /* We're done with this context. */
1586 bidi::on_close ();
1587 }
1588
1589 /* We're at the beginning or in the middle of an identifier/comment/string
1590 literal/character constant. Warn if we've encountered a bidi character.
1591 KIND says which bidi control character it was; UCN_P is true iff this bidi
1592 control character was written as a UCN. LOC is the location of the
1593 character, but is only valid if KIND != bidi::kind::NONE. */
1594
1595 static void
1596 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1597 bool ucn_p, location_t loc)
1598 {
1599 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1600 return;
1601
1602 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1603
1604 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1605 {
1606 rich_location rich_loc (pfile->line_table, loc);
1607 rich_loc.set_escape_on_output (true);
1608
1609 /* It seems excessive to warn about a PDI/PDF that is closing
1610 an opened context because we've already warned about the
1611 opening character. Except warn when we have a UCN x UTF-8
1612 mismatch, if UCN checking is enabled. */
1613 if (kind == bidi::current_ctx ())
1614 {
1615 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1616 && bidi::current_ctx_ucn_p () != ucn_p)
1617 {
1618 rich_loc.add_range (bidi::current_ctx_loc ());
1619 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1620 "UTF-8 vs UCN mismatch when closing "
1621 "a context by \"%s\"", bidi::to_str (kind));
1622 }
1623 }
1624 else if (warn_bidi & bidirectional_any
1625 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1626 {
1627 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1628 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1629 "\"%s\" is closing an unopened context",
1630 bidi::to_str (kind));
1631 else
1632 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1633 "found problematic Unicode character \"%s\"",
1634 bidi::to_str (kind));
1635 }
1636 }
1637 /* We're done with this context. */
1638 bidi::on_char (kind, ucn_p, loc);
1639 }
1640
1641 /* Skip a C-style block comment. We find the end of the comment by
1642 seeing if an asterisk is before every '/' we encounter. Returns
1643 nonzero if comment terminated by EOF, zero otherwise.
1644
1645 Buffer->cur points to the initial asterisk of the comment. */
1646 bool
1647 _cpp_skip_block_comment (cpp_reader *pfile)
1648 {
1649 cpp_buffer *buffer = pfile->buffer;
1650 const uchar *cur = buffer->cur;
1651 uchar c;
1652 const bool warn_bidi_p = pfile->warn_bidi_p ();
1653
1654 cur++;
1655 if (*cur == '/')
1656 cur++;
1657
1658 for (;;)
1659 {
1660 /* People like decorating comments with '*', so check for '/'
1661 instead for efficiency. */
1662 c = *cur++;
1663
1664 if (c == '/')
1665 {
1666 if (cur[-2] == '*')
1667 {
1668 if (warn_bidi_p)
1669 maybe_warn_bidi_on_close (pfile, cur);
1670 break;
1671 }
1672
1673 /* Warn about potential nested comments, but not if the '/'
1674 comes immediately before the true comment delimiter.
1675 Don't bother to get it right across escaped newlines. */
1676 if (CPP_OPTION (pfile, warn_comments)
1677 && cur[0] == '*' && cur[1] != '/')
1678 {
1679 buffer->cur = cur;
1680 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1681 pfile->line_table->highest_line,
1682 CPP_BUF_COL (buffer),
1683 "\"/*\" within comment");
1684 }
1685 }
1686 else if (c == '\n')
1687 {
1688 unsigned int cols;
1689 buffer->cur = cur - 1;
1690 if (warn_bidi_p)
1691 maybe_warn_bidi_on_close (pfile, cur);
1692 _cpp_process_line_notes (pfile, true);
1693 if (buffer->next_line >= buffer->rlimit)
1694 return true;
1695 _cpp_clean_line (pfile);
1696
1697 cols = buffer->next_line - buffer->line_base;
1698 CPP_INCREMENT_LINE (pfile, cols);
1699
1700 cur = buffer->cur;
1701 }
1702 /* If this is a beginning of a UTF-8 encoding, it might be
1703 a bidirectional control character. */
1704 else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
1705 {
1706 location_t loc;
1707 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1708 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1709 }
1710 }
1711
1712 buffer->cur = cur;
1713 _cpp_process_line_notes (pfile, true);
1714 return false;
1715 }
1716
1717 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1718 terminating newline. Handles escaped newlines. Returns nonzero
1719 if a multiline comment. */
1720 static int
1721 skip_line_comment (cpp_reader *pfile)
1722 {
1723 cpp_buffer *buffer = pfile->buffer;
1724 location_t orig_line = pfile->line_table->highest_line;
1725 const bool warn_bidi_p = pfile->warn_bidi_p ();
1726
1727 if (!warn_bidi_p)
1728 while (*buffer->cur != '\n')
1729 buffer->cur++;
1730 else
1731 {
1732 while (*buffer->cur != '\n'
1733 && *buffer->cur != bidi::utf8_start)
1734 buffer->cur++;
1735 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1736 {
1737 while (*buffer->cur != '\n')
1738 {
1739 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1740 {
1741 location_t loc;
1742 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1743 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1744 }
1745 buffer->cur++;
1746 }
1747 maybe_warn_bidi_on_close (pfile, buffer->cur);
1748 }
1749 }
1750
1751 _cpp_process_line_notes (pfile, true);
1752 return orig_line != pfile->line_table->highest_line;
1753 }
1754
1755 /* Skips whitespace, saving the next non-whitespace character. */
1756 static void
1757 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1758 {
1759 cpp_buffer *buffer = pfile->buffer;
1760 bool saw_NUL = false;
1761
1762 do
1763 {
1764 /* Horizontal space always OK. */
1765 if (c == ' ' || c == '\t')
1766 ;
1767 /* Just \f \v or \0 left. */
1768 else if (c == '\0')
1769 saw_NUL = true;
1770 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1771 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1772 CPP_BUF_COL (buffer),
1773 "%s in preprocessing directive",
1774 c == '\f' ? "form feed" : "vertical tab");
1775
1776 c = *buffer->cur++;
1777 }
1778 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1779 while (is_nvspace (c));
1780
1781 if (saw_NUL)
1782 {
1783 encoding_rich_location rich_loc (pfile);
1784 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1785 "null character(s) ignored");
1786 }
1787
1788 buffer->cur--;
1789 }
1790
1791 /* See if the characters of a number token are valid in a name (no
1792 '.', '+' or '-'). */
1793 static int
1794 name_p (cpp_reader *pfile, const cpp_string *string)
1795 {
1796 unsigned int i;
1797
1798 for (i = 0; i < string->len; i++)
1799 if (!is_idchar (string->text[i]))
1800 return 0;
1801
1802 return 1;
1803 }
1804
1805 /* After parsing an identifier or other sequence, produce a warning about
1806 sequences not in NFC/NFKC. */
1807 static void
1808 warn_about_normalization (cpp_reader *pfile,
1809 const cpp_token *token,
1810 const struct normalize_state *s)
1811 {
1812 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1813 && !pfile->state.skipping)
1814 {
1815 location_t loc = token->src_loc;
1816
1817 /* If possible, create a location range for the token. */
1818 if (loc >= RESERVED_LOCATION_COUNT
1819 && token->type != CPP_EOF
1820 /* There must be no line notes to process. */
1821 && (!(pfile->buffer->cur
1822 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
1823 && !pfile->overlaid_buffer)))
1824 {
1825 source_range tok_range;
1826 tok_range.m_start = loc;
1827 tok_range.m_finish
1828 = linemap_position_for_column (pfile->line_table,
1829 CPP_BUF_COLUMN (pfile->buffer,
1830 pfile->buffer->cur));
1831 loc = COMBINE_LOCATION_DATA (pfile->line_table,
1832 loc, tok_range, NULL);
1833 }
1834
1835 encoding_rich_location rich_loc (pfile, loc);
1836
1837 /* Make sure that the token is printed using UCNs, even
1838 if we'd otherwise happily print UTF-8. */
1839 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1840 size_t sz;
1841
1842 sz = cpp_spell_token (pfile, token, buf, false) - buf;
1843 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1844 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1845 "`%.*s' is not in NFKC", (int) sz, buf);
1846 else if (CPP_OPTION (pfile, cplusplus))
1847 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1848 "`%.*s' is not in NFC", (int) sz, buf);
1849 else
1850 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1851 "`%.*s' is not in NFC", (int) sz, buf);
1852 free (buf);
1853 }
1854 }
1855
1856 static const cppchar_t utf8_signifier = 0xC0;
1857
1858 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1859 an identifier. FIRST is TRUE if this starts an identifier. */
1860
1861 static bool
1862 forms_identifier_p (cpp_reader *pfile, int first,
1863 struct normalize_state *state)
1864 {
1865 cpp_buffer *buffer = pfile->buffer;
1866 const bool warn_bidi_p = pfile->warn_bidi_p ();
1867
1868 if (*buffer->cur == '$')
1869 {
1870 if (!CPP_OPTION (pfile, dollars_in_ident))
1871 return false;
1872
1873 buffer->cur++;
1874 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1875 {
1876 CPP_OPTION (pfile, warn_dollars) = 0;
1877 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1878 }
1879
1880 return true;
1881 }
1882
1883 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
1884 if (CPP_OPTION (pfile, extended_identifiers))
1885 {
1886 cppchar_t s;
1887 if (*buffer->cur >= utf8_signifier)
1888 {
1889 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
1890 && warn_bidi_p)
1891 {
1892 location_t loc;
1893 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1894 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1895 }
1896 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1897 state, &s))
1898 return true;
1899 }
1900 else if (*buffer->cur == '\\'
1901 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1902 {
1903 buffer->cur += 2;
1904 if (warn_bidi_p)
1905 {
1906 location_t loc;
1907 bidi::kind kind = get_bidi_ucn (pfile,
1908 buffer->cur,
1909 buffer->cur[-1] == 'U',
1910 &loc);
1911 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
1912 }
1913 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1914 state, &s, NULL, NULL))
1915 return true;
1916 buffer->cur -= 2;
1917 }
1918 }
1919
1920 return false;
1921 }
1922
1923 /* Helper function to issue error about improper __VA_OPT__ use. */
1924 static void
1925 maybe_va_opt_error (cpp_reader *pfile)
1926 {
1927 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1928 {
1929 /* __VA_OPT__ should not be accepted at all, but allow it in
1930 system headers. */
1931 if (!_cpp_in_system_header (pfile))
1932 cpp_error (pfile, CPP_DL_PEDWARN,
1933 "__VA_OPT__ is not available until C++20");
1934 }
1935 else if (!pfile->state.va_args_ok)
1936 {
1937 /* __VA_OPT__ should only appear in the replacement list of a
1938 variadic macro. */
1939 cpp_error (pfile, CPP_DL_PEDWARN,
1940 "__VA_OPT__ can only appear in the expansion"
1941 " of a C++20 variadic macro");
1942 }
1943 }
1944
1945 /* Helper function to get the cpp_hashnode of the identifier BASE. */
1946 static cpp_hashnode *
1947 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1948 {
1949 cpp_hashnode *result;
1950 const uchar *cur;
1951 unsigned int len;
1952 unsigned int hash = HT_HASHSTEP (0, *base);
1953
1954 cur = base + 1;
1955 while (ISIDNUM (*cur))
1956 {
1957 hash = HT_HASHSTEP (hash, *cur);
1958 cur++;
1959 }
1960 len = cur - base;
1961 hash = HT_HASHFINISH (hash, len);
1962 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1963 base, len, hash, HT_ALLOC));
1964
1965 /* Rarely, identifiers require diagnostics when lexed. */
1966 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1967 && !pfile->state.skipping, 0))
1968 {
1969 /* It is allowed to poison the same identifier twice. */
1970 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1971 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1972 NODE_NAME (result));
1973
1974 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1975 replacement list of a variadic macro. */
1976 if (result == pfile->spec_nodes.n__VA_ARGS__
1977 && !pfile->state.va_args_ok)
1978 {
1979 if (CPP_OPTION (pfile, cplusplus))
1980 cpp_error (pfile, CPP_DL_PEDWARN,
1981 "__VA_ARGS__ can only appear in the expansion"
1982 " of a C++11 variadic macro");
1983 else
1984 cpp_error (pfile, CPP_DL_PEDWARN,
1985 "__VA_ARGS__ can only appear in the expansion"
1986 " of a C99 variadic macro");
1987 }
1988
1989 if (result == pfile->spec_nodes.n__VA_OPT__)
1990 maybe_va_opt_error (pfile);
1991
1992 /* For -Wc++-compat, warn about use of C++ named operators. */
1993 if (result->flags & NODE_WARN_OPERATOR)
1994 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1995 "identifier \"%s\" is a special operator name in C++",
1996 NODE_NAME (result));
1997 }
1998
1999 return result;
2000 }
2001
2002 /* Get the cpp_hashnode of an identifier specified by NAME in
2003 the current cpp_reader object. If none is found, NULL is returned. */
2004 cpp_hashnode *
2005 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2006 {
2007 cpp_hashnode *result;
2008 result = lex_identifier_intern (pfile, (uchar *) name);
2009 return result;
2010 }
2011
2012 /* Lex an identifier starting at BUFFER->CUR - 1. */
2013 static cpp_hashnode *
2014 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2015 struct normalize_state *nst, cpp_hashnode **spelling)
2016 {
2017 cpp_hashnode *result;
2018 const uchar *cur;
2019 unsigned int len;
2020 unsigned int hash = HT_HASHSTEP (0, *base);
2021 const bool warn_bidi_p = pfile->warn_bidi_p ();
2022
2023 cur = pfile->buffer->cur;
2024 if (! starts_ucn)
2025 {
2026 while (ISIDNUM (*cur))
2027 {
2028 hash = HT_HASHSTEP (hash, *cur);
2029 cur++;
2030 }
2031 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2032 }
2033 pfile->buffer->cur = cur;
2034 if (starts_ucn || forms_identifier_p (pfile, false, nst))
2035 {
2036 /* Slower version for identifiers containing UCNs
2037 or extended chars (including $). */
2038 do {
2039 while (ISIDNUM (*pfile->buffer->cur))
2040 {
2041 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2042 pfile->buffer->cur++;
2043 }
2044 } while (forms_identifier_p (pfile, false, nst));
2045 if (warn_bidi_p)
2046 maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2047 result = _cpp_interpret_identifier (pfile, base,
2048 pfile->buffer->cur - base);
2049 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2050 }
2051 else
2052 {
2053 len = cur - base;
2054 hash = HT_HASHFINISH (hash, len);
2055
2056 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2057 base, len, hash, HT_ALLOC));
2058 *spelling = result;
2059 }
2060
2061 /* Rarely, identifiers require diagnostics when lexed. */
2062 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2063 && !pfile->state.skipping, 0))
2064 {
2065 /* It is allowed to poison the same identifier twice. */
2066 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2067 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2068 NODE_NAME (result));
2069
2070 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2071 replacement list of a variadic macro. */
2072 if (result == pfile->spec_nodes.n__VA_ARGS__
2073 && !pfile->state.va_args_ok)
2074 {
2075 if (CPP_OPTION (pfile, cplusplus))
2076 cpp_error (pfile, CPP_DL_PEDWARN,
2077 "__VA_ARGS__ can only appear in the expansion"
2078 " of a C++11 variadic macro");
2079 else
2080 cpp_error (pfile, CPP_DL_PEDWARN,
2081 "__VA_ARGS__ can only appear in the expansion"
2082 " of a C99 variadic macro");
2083 }
2084
2085 /* __VA_OPT__ should only appear in the replacement list of a
2086 variadic macro. */
2087 if (result == pfile->spec_nodes.n__VA_OPT__)
2088 maybe_va_opt_error (pfile);
2089
2090 /* For -Wc++-compat, warn about use of C++ named operators. */
2091 if (result->flags & NODE_WARN_OPERATOR)
2092 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2093 "identifier \"%s\" is a special operator name in C++",
2094 NODE_NAME (result));
2095 }
2096
2097 return result;
2098 }
2099
2100 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2101 static void
2102 lex_number (cpp_reader *pfile, cpp_string *number,
2103 struct normalize_state *nst)
2104 {
2105 const uchar *cur;
2106 const uchar *base;
2107 uchar *dest;
2108
2109 base = pfile->buffer->cur - 1;
2110 do
2111 {
2112 const uchar *adj_digit_sep = NULL;
2113 cur = pfile->buffer->cur;
2114
2115 /* N.B. ISIDNUM does not include $. */
2116 while (ISIDNUM (*cur)
2117 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2118 || DIGIT_SEP (*cur)
2119 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2120 {
2121 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2122 /* Adjacent digit separators do not form part of the pp-number syntax.
2123 However, they can safely be diagnosed here as an error, since '' is
2124 not a valid preprocessing token. */
2125 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2126 adj_digit_sep = cur;
2127 cur++;
2128 }
2129 /* A number can't end with a digit separator. */
2130 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2131 --cur;
2132 if (adj_digit_sep && adj_digit_sep < cur)
2133 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2134
2135 pfile->buffer->cur = cur;
2136 }
2137 while (forms_identifier_p (pfile, false, nst));
2138
2139 number->len = cur - base;
2140 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2141 memcpy (dest, base, number->len);
2142 dest[number->len] = '\0';
2143 number->text = dest;
2144 }
2145
2146 /* Create a token of type TYPE with a literal spelling. */
2147 static void
2148 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2149 unsigned int len, enum cpp_ttype type)
2150 {
2151 token->type = type;
2152 token->val.str.len = len;
2153 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2154 }
2155
2156 const uchar *
2157 cpp_alloc_token_string (cpp_reader *pfile,
2158 const unsigned char *ptr, unsigned len)
2159 {
2160 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2161
2162 dest[len] = 0;
2163 memcpy (dest, ptr, len);
2164 return dest;
2165 }
2166
2167 /* A pair of raw buffer pointers. The currently open one is [1], the
2168 first one is [0]. Used for string literal lexing. */
2169 struct lit_accum {
2170 _cpp_buff *first;
2171 _cpp_buff *last;
2172 const uchar *rpos;
2173 size_t accum;
2174
2175 lit_accum ()
2176 : first (NULL), last (NULL), rpos (0), accum (0)
2177 {
2178 }
2179
2180 void append (cpp_reader *, const uchar *, size_t);
2181
2182 void read_begin (cpp_reader *);
2183 bool reading_p () const
2184 {
2185 return rpos != NULL;
2186 }
2187 char read_char ()
2188 {
2189 char c = *rpos++;
2190 if (rpos == BUFF_FRONT (last))
2191 rpos = NULL;
2192 return c;
2193 }
2194 };
2195
2196 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2197 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2198
2199 void
2200 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2201 {
2202 if (!last)
2203 /* Starting. */
2204 first = last = _cpp_get_buff (pfile, len);
2205 else if (len > BUFF_ROOM (last))
2206 {
2207 /* There is insufficient room in the buffer. Copy what we can,
2208 and then either extend or create a new one. */
2209 size_t room = BUFF_ROOM (last);
2210 memcpy (BUFF_FRONT (last), base, room);
2211 BUFF_FRONT (last) += room;
2212 base += room;
2213 len -= room;
2214 accum += room;
2215
2216 gcc_checking_assert (!rpos);
2217
2218 last = _cpp_append_extend_buff (pfile, last, len);
2219 }
2220
2221 memcpy (BUFF_FRONT (last), base, len);
2222 BUFF_FRONT (last) += len;
2223 accum += len;
2224 }
2225
2226 void
2227 lit_accum::read_begin (cpp_reader *pfile)
2228 {
2229 /* We never accumulate more than 4 chars to read. */
2230 if (BUFF_ROOM (last) < 4)
2231
2232 last = _cpp_append_extend_buff (pfile, last, 4);
2233 rpos = BUFF_FRONT (last);
2234 }
2235
2236 /* Returns true if a macro has been defined.
2237 This might not work if compile with -save-temps,
2238 or preprocess separately from compilation. */
2239
2240 static bool
2241 is_macro(cpp_reader *pfile, const uchar *base)
2242 {
2243 const uchar *cur = base;
2244 if (! ISIDST (*cur))
2245 return false;
2246 unsigned int hash = HT_HASHSTEP (0, *cur);
2247 ++cur;
2248 while (ISIDNUM (*cur))
2249 {
2250 hash = HT_HASHSTEP (hash, *cur);
2251 ++cur;
2252 }
2253 hash = HT_HASHFINISH (hash, cur - base);
2254
2255 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2256 base, cur - base, hash, HT_NO_INSERT));
2257
2258 return result && cpp_macro_p (result);
2259 }
2260
2261 /* Returns true if a literal suffix does not have the expected form
2262 and is defined as a macro. */
2263
2264 static bool
2265 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2266 {
2267 /* User-defined literals outside of namespace std must start with a single
2268 underscore, so assume anything of that form really is a UDL suffix.
2269 We don't need to worry about UDLs defined inside namespace std because
2270 their names are reserved, so cannot be used as macro names in valid
2271 programs. */
2272 if (base[0] == '_' && base[1] != '_')
2273 return false;
2274 return is_macro (pfile, base);
2275 }
2276
/* Lexes a raw string.  The stored string contains the spelling,
   including double quotes, delimiter string, '(' and ')', any leading
   'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
   the type of the literal, or CPP_OTHER if it was not properly
   terminated.

   BASE is the start of the token.  Updates pfile->buffer->cur to just
   after the lexed string.

   The spelling is NUL-terminated, but it is not guaranteed that this
   is the first NUL since embedded NULs are preserved.

   Line notes (escaped newlines and trigraphs) that fall inside the
   literal are undone so that the spelling reflects the physical
   source text.  Whenever that happens, or the literal spans several
   lines, the spelling is gathered piecewise in a lit_accum and copied
   into the token at the end; otherwise it is copied straight from the
   line buffer.

   If the input runs out before the literal is terminated, an error is
   issued, TOKEN is turned into a CPP_EOF token and the buffer is
   popped.  */

static void
lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
{
  const uchar *pos = base;
  const bool warn_bidi_p = pfile->warn_bidi_p ();

  /* 'tis a pity this information isn't passed down from the lexer's
     initial categorization of the token.  */
  enum cpp_ttype type = CPP_STRING;

  /* Work out the literal type from the encoding prefix, stepping POS
     over it.  */
  if (*pos == 'L')
    {
      type = CPP_WSTRING;
      pos++;
    }
  else if (*pos == 'U')
    {
      type = CPP_STRING32;
      pos++;
    }
  else if (*pos == 'u')
    {
      if (pos[1] == '8')
        {
          type = CPP_UTF8STRING;
          pos++;
        }
      else
        type = CPP_STRING16;
      pos++;
    }

  gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
  pos += 2;

  _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];

  /* Skip notes before the ".  */
  while (note->pos < pos)
    ++note;

  lit_accum accum;

  /* PREFIX collects the delimiter (at most 16 characters) and, once the
     opening '(' has been seen, a closing '"' after it, so that after a
     ')' the terminator can be matched against PREFIX directly.  */
  uchar prefix[17];
  unsigned prefix_len = 0;
  /* PHASE_PREFIX: still reading the delimiter.  PHASE_NONE: inside the
     string body.  Values >= PHASE_SUFFIX: a ')' has been seen and PHASE
     is the index into PREFIX of the next character expected.  */
  enum Phase
  {
    PHASE_PREFIX = -2,
    PHASE_NONE = -1,
    PHASE_SUFFIX = 0
  } phase = PHASE_PREFIX;

  for (;;)
    {
      gcc_checking_assert (note->pos >= pos);

      /* Undo any escaped newlines and trigraphs.  */
      if (!accum.reading_p () && note->pos == pos)
        switch (note->type)
          {
          case '\\':
          case ' ':
            /* Restore backslash followed by newline.  First flush the
               text lexed so far, then append the restored characters
               so that they are read back one at a time below.  */
            accum.append (pfile, base, pos - base);
            base = pos;
            accum.read_begin (pfile);
            accum.append (pfile, UC"\\", 1);

          after_backslash:
            if (note->type == ' ')
              /* GNU backslash whitespace newline extension.  FIXME
                 could be any sequence of non-vertical space.  When we
                 can properly restore any such sequence, we should
                 mark this note as handled so _cpp_process_line_notes
                 doesn't warn.  */
              accum.append (pfile, UC" ", 1);

            accum.append (pfile, UC"\n", 1);
            note++;
            break;

          case '\n':
            /* This can happen for ??/<NEWLINE> when trigraphs are not
               being interpreted.  */
            gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
            note->type = 0;
            note++;
            break;

          default:
            gcc_checking_assert (_cpp_trigraph_map[note->type]);

            /* Don't warn about this trigraph in
               _cpp_process_line_notes, since trigraphs show up as
               trigraphs in raw strings.  */
            uchar type = note->type;
            note->type = 0;

            if (CPP_OPTION (pfile, trigraphs))
              {
                /* Put the three-character trigraph spelling back.  */
                accum.append (pfile, base, pos - base);
                base = pos;
                accum.read_begin (pfile);
                accum.append (pfile, UC"??", 2);
                accum.append (pfile, &type, 1);

                /* ??/ followed by newline gets two line notes, one for
                   the trigraph and one for the backslash/newline.  */
                if (type == '/' && note[1].pos == pos)
                  {
                    note++;
                    gcc_assert (note->type == '\\' || note->type == ' ');
                    goto after_backslash;
                  }
                /* Skip the replacement character.  */
                base = ++pos;
              }

            note++;
            break;
          }

      /* Now get a char to process.  Either from an expanded note, or
         from the line buffer.  */
      bool read_note = accum.reading_p ();
      char c = read_note ? accum.read_char () : *pos++;

      if (phase == PHASE_PREFIX)
        {
          if (c == '(')
            {
              /* Done.  Record the '"' that must follow the closing
                 delimiter.  */
              phase = PHASE_NONE;
              prefix[prefix_len++] = '"';
            }
          else if (prefix_len < 16
                   /* Prefix chars are any of the basic character set,
                      [lex.charset] except for space, '(', ')', '\\',
                      '\t', '\v', '\f' and '\n'.  Optimized for a
                      contiguous alphabet.  */
                   /* Unlike a switch, this collapses down to one or
                      two shift and bitmask operations on an ASCII
                      system, with an outlier or two.  */
                   && (('Z' - 'A' == 25
                        ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
                        : ISIDST (c))
                       || (c >= '0' && c <= '9')
                       || c == '_' || c == '{' || c == '}'
                       || c == '[' || c == ']' || c == '#'
                       || c == '<' || c == '>' || c == '%'
                       || c == ':' || c == ';' || c == '.' || c == '?'
                       || c == '*' || c == '+' || c == '-' || c == '/'
                       || c == '^' || c == '&' || c == '|' || c == '~'
                       || c == '!' || c == '=' || c == ','
                       || c == '"' || c == '\''))
            prefix[prefix_len++] = c;
          else
            {
              /* Something is wrong.  */
              int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
              if (prefix_len == 16)
                cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
                                     col, "raw string delimiter longer "
                                     "than 16 characters");
              else if (c == '\n')
                cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
                                     col, "invalid new-line in raw "
                                     "string delimiter");
              else
                cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
                                     col, "invalid character '%c' in "
                                     "raw string delimiter", c);
              type = CPP_OTHER;
              phase = PHASE_NONE;
              /* Continue until we get a close quote, that's probably
                 the best failure mode.  A zero PREFIX_LEN marks this
                 failure mode below.  */
              prefix_len = 0;
            }
          /* A newline still needs the line handling further down.  */
          if (c != '\n')
            continue;
        }

      if (phase != PHASE_NONE)
        {
          /* Matching the terminator that follows a ')'.  */
          if (prefix[phase] != c)
            phase = PHASE_NONE;
          else if (unsigned (phase + 1) == prefix_len)
            /* Matched the whole delimiter and the closing '"'.  */
            break;
          else
            {
              phase = Phase (phase + 1);
              continue;
            }
        }

      if (!prefix_len && c == '"')
        /* Failure mode lexing.  */
        goto out;
      else if (prefix_len && c == ')')
        phase = PHASE_SUFFIX;
      else if (!read_note && c == '\n')
        {
          /* End of the current line inside the literal.  */
          pos--;
          pfile->buffer->cur = pos;
          if (pfile->state.in_directive
              || (pfile->state.parsing_args
                  && pfile->buffer->next_line >= pfile->buffer->rlimit))
            {
              cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
                                   "unterminated raw string");
              type = CPP_OTHER;
              goto out;
            }

          /* Save this line's part of the spelling, newline included,
             before moving on to the next line.  */
          accum.append (pfile, base, pos - base + 1);
          _cpp_process_line_notes (pfile, false);

          if (pfile->buffer->next_line < pfile->buffer->rlimit)
            CPP_INCREMENT_LINE (pfile, 0);
          pfile->buffer->need_line = true;

          if (!_cpp_get_fresh_line (pfile))
            {
              /* We ran out of file and failed to get a line.  */
              location_t src_loc = token->src_loc;
              token->type = CPP_EOF;
              /* Tell the compiler the line number of the EOF token.  */
              token->src_loc = pfile->line_table->highest_line;
              token->flags = BOL;
              if (accum.first)
                _cpp_release_buff (pfile, accum.first);
              cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
                                   "unterminated raw string");
              /* Now pop the buffer that _cpp_get_fresh_line did not.  */
              _cpp_pop_buffer (pfile);
              return;
            }

          /* Carry on lexing from the start of the fresh line.  */
          pos = base = pfile->buffer->cur;
          note = &pfile->buffer->notes[pfile->buffer->cur_note];
        }
      else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
               && warn_bidi_p)
        {
          location_t loc;
          bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
          maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
        }
    }

  if (warn_bidi_p)
    maybe_warn_bidi_on_close (pfile, pos);

  if (CPP_OPTION (pfile, user_literals))
    {
      /* If a string format macro, say from inttypes.h, is placed touching
         a string literal it could be parsed as a C++11 user-defined string
         literal thus breaking the program.  */
      if (is_macro_not_literal_suffix (pfile, pos))
        {
          /* Raise a warning, but do not consume subsequent tokens.  */
          if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
            cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
                                   token->src_loc, 0,
                                   "invalid suffix on literal; C++11 requires "
                                   "a space between literal and string macro");
        }
      /* Grab user defined literal suffix.  */
      else if (ISIDST (*pos))
        {
          type = cpp_userdef_string_add_type (type);
          ++pos;

          while (ISIDNUM (*pos))
            ++pos;
        }
    }

 out:
  pfile->buffer->cur = pos;
  if (!accum.accum)
    /* Nothing was accumulated: the spelling is contiguous in the line
       buffer.  */
    create_literal (pfile, token, base, pos - base, type);
  else
    {
      /* Concatenate the accumulated pieces and the not yet accumulated
         tail [BASE, POS) into one NUL-terminated spelling.  */
      size_t extra_len = pos - base;
      uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);

      token->type = type;
      token->val.str.len = accum.accum + extra_len;
      token->val.str.text = dest;
      for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
        {
          size_t len = BUFF_FRONT (buf) - buf->base;
          memcpy (dest, buf->base, len);
          dest += len;
        }
      _cpp_release_buff (pfile, accum.first);
      memcpy (dest, base, extra_len);
      dest[extra_len] = '\0';
    }
}
2590
2591 /* Lexes a string, character constant, or angle-bracketed header file
2592 name. The stored string contains the spelling, including opening
2593 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2594 'R' modifier. It returns the type of the literal, or CPP_OTHER
2595 if it was not properly terminated, or CPP_LESS for an unterminated
2596 header name which must be relexed as normal tokens.
2597
2598 The spelling is NUL-terminated, but it is not guaranteed that this
2599 is the first NUL since embedded NULs are preserved. */
2600 static void
2601 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2602 {
2603 bool saw_NUL = false;
2604 const uchar *cur;
2605 cppchar_t terminator;
2606 enum cpp_ttype type;
2607
2608 cur = base;
2609 terminator = *cur++;
2610 if (terminator == 'L' || terminator == 'U')
2611 terminator = *cur++;
2612 else if (terminator == 'u')
2613 {
2614 terminator = *cur++;
2615 if (terminator == '8')
2616 terminator = *cur++;
2617 }
2618 if (terminator == 'R')
2619 {
2620 lex_raw_string (pfile, token, base);
2621 return;
2622 }
2623 if (terminator == '"')
2624 type = (*base == 'L' ? CPP_WSTRING :
2625 *base == 'U' ? CPP_STRING32 :
2626 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2627 : CPP_STRING);
2628 else if (terminator == '\'')
2629 type = (*base == 'L' ? CPP_WCHAR :
2630 *base == 'U' ? CPP_CHAR32 :
2631 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2632 : CPP_CHAR);
2633 else
2634 terminator = '>', type = CPP_HEADER_NAME;
2635
2636 const bool warn_bidi_p = pfile->warn_bidi_p ();
2637 for (;;)
2638 {
2639 cppchar_t c = *cur++;
2640
2641 /* In #include-style directives, terminators are not escapable. */
2642 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2643 {
2644 if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
2645 {
2646 location_t loc;
2647 bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
2648 &loc);
2649 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2650 }
2651 cur++;
2652 }
2653 else if (c == terminator)
2654 {
2655 if (warn_bidi_p)
2656 maybe_warn_bidi_on_close (pfile, cur - 1);
2657 break;
2658 }
2659 else if (c == '\n')
2660 {
2661 cur--;
2662 /* Unmatched quotes always yield undefined behavior, but
2663 greedy lexing means that what appears to be an unterminated
2664 header name may actually be a legitimate sequence of tokens. */
2665 if (terminator == '>')
2666 {
2667 token->type = CPP_LESS;
2668 return;
2669 }
2670 type = CPP_OTHER;
2671 break;
2672 }
2673 else if (c == '\0')
2674 saw_NUL = true;
2675 else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
2676 {
2677 location_t loc;
2678 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
2679 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2680 }
2681 }
2682
2683 if (saw_NUL && !pfile->state.skipping)
2684 cpp_error (pfile, CPP_DL_WARNING,
2685 "null character(s) preserved in literal");
2686
2687 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2688 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2689 (int) terminator);
2690
2691 if (CPP_OPTION (pfile, user_literals))
2692 {
2693 /* If a string format macro, say from inttypes.h, is placed touching
2694 a string literal it could be parsed as a C++11 user-defined string
2695 literal thus breaking the program. */
2696 if (is_macro_not_literal_suffix (pfile, cur))
2697 {
2698 /* Raise a warning, but do not consume subsequent tokens. */
2699 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2700 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2701 token->src_loc, 0,
2702 "invalid suffix on literal; C++11 requires "
2703 "a space between literal and string macro");
2704 }
2705 /* Grab user defined literal suffix. */
2706 else if (ISIDST (*cur))
2707 {
2708 type = cpp_userdef_char_add_type (type);
2709 type = cpp_userdef_string_add_type (type);
2710 ++cur;
2711
2712 while (ISIDNUM (*cur))
2713 ++cur;
2714 }
2715 }
2716 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2717 && is_macro (pfile, cur)
2718 && !pfile->state.skipping)
2719 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2720 token->src_loc, 0, "C++11 requires a space "
2721 "between string literal and macro");
2722
2723 pfile->buffer->cur = cur;
2724 create_literal (pfile, token, base, cur - base, type);
2725 }
2726
2727 /* Return the comment table. The client may not make any assumption
2728 about the ordering of the table. */
2729 cpp_comment_table *
2730 cpp_get_comments (cpp_reader *pfile)
2731 {
2732 return &pfile->comments;
2733 }
2734
2735 /* Append a comment to the end of the comment table. */
2736 static void
2737 store_comment (cpp_reader *pfile, cpp_token *token)
2738 {
2739 int len;
2740
2741 if (pfile->comments.allocated == 0)
2742 {
2743 pfile->comments.allocated = 256;
2744 pfile->comments.entries = (cpp_comment *) xmalloc
2745 (pfile->comments.allocated * sizeof (cpp_comment));
2746 }
2747
2748 if (pfile->comments.count == pfile->comments.allocated)
2749 {
2750 pfile->comments.allocated *= 2;
2751 pfile->comments.entries = (cpp_comment *) xrealloc
2752 (pfile->comments.entries,
2753 pfile->comments.allocated * sizeof (cpp_comment));
2754 }
2755
2756 len = token->val.str.len;
2757
2758 /* Copy comment. Note, token may not be NULL terminated. */
2759 pfile->comments.entries[pfile->comments.count].comment =
2760 (char *) xmalloc (sizeof (char) * (len + 1));
2761 memcpy (pfile->comments.entries[pfile->comments.count].comment,
2762 token->val.str.text, len);
2763 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2764
2765 /* Set source location. */
2766 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2767
2768 /* Increment the count of entries in the comment table. */
2769 pfile->comments.count++;
2770 }
2771
2772 /* The stored comment includes the comment start and any terminator. */
2773 static void
2774 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2775 cppchar_t type)
2776 {
2777 unsigned char *buffer;
2778 unsigned int len, clen, i;
2779 int convert_to_c = (pfile->state.in_directive || pfile->state.parsing_args)
2780 && type == '/';
2781
2782 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
2783
2784 /* C++ comments probably (not definitely) have moved past a new
2785 line, which we don't want to save in the comment. */
2786 if (is_vspace (pfile->buffer->cur[-1]))
2787 len--;
2788
2789 /* If we are currently in a directive or in argument parsing, then
2790 we need to store all C++ comments as C comments internally, and
2791 so we need to allocate a little extra space in that case.
2792
2793 Note that the only time we encounter a directive here is
2794 when we are saving comments in a "#define". */
2795 clen = convert_to_c ? len + 2 : len;
2796
2797 buffer = _cpp_unaligned_alloc (pfile, clen);
2798
2799 token->type = CPP_COMMENT;
2800 token->val.str.len = clen;
2801 token->val.str.text = buffer;
2802
2803 buffer[0] = '/';
2804 memcpy (buffer + 1, from, len - 1);
2805
2806 /* Finish conversion to a C comment, if necessary. */
2807 if (convert_to_c)
2808 {
2809 buffer[1] = '*';
2810 buffer[clen - 2] = '*';
2811 buffer[clen - 1] = '/';
2812 /* As there can be in a C++ comments illegal sequences for C comments
2813 we need to filter them out. */
2814 for (i = 2; i < (clen - 2); i++)
2815 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2816 buffer[i] = '|';
2817 }
2818
2819 /* Finally store this comment for use by clients of libcpp. */
2820 store_comment (pfile, token);
2821 }
2822
/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
   comment.

   COMMENT_START points at the second character of the comment opener:
   '*' for a C block comment, anything else is treated as a C++ line
   comment.  The comment text proper therefore starts at
   COMMENT_START + 1.  pfile->buffer->cur is used as the upper bound of
   the text examined (presumably just past the end of the comment).

   What is recognized depends on the cpp_warn_implicit_fallthrough
   level; see the individual cases below.  */

static bool
fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
{
  const unsigned char *from = comment_start + 1;

  switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
    {
      /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
         don't recognize any comments.  The latter only checks attributes,
         the former doesn't warn.  */
    case 0:
    default:
      return false;
      /* -Wimplicit-fallthrough=1 considers any comment, no matter what
         content it has.  */
    case 1:
      return true;
    case 2:
      /* -Wimplicit-fallthrough=2 looks for (case insensitive)
         .*falls?[ \t-]*thr(u|ough).* regex.  */
      /* Try every start position that still leaves room for the
         shortest match, "fallthru".  */
      for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
           from++)
        {
          /* Is there anything like strpbrk with upper boundary, or
             memchr looking for 2 characters rather than just one?  */
          if (from[0] != 'f' && from[0] != 'F')
            continue;
          if (from[1] != 'a' && from[1] != 'A')
            continue;
          if (from[2] != 'l' && from[2] != 'L')
            continue;
          if (from[3] != 'l' && from[3] != 'L')
            continue;
          from += sizeof "fall" - 1;
          /* Optional 's', then any run of blanks, tabs and dashes.  */
          if (from[0] == 's' || from[0] == 'S')
            from++;
          while (*from == ' ' || *from == '\t' || *from == '-')
            from++;
          if (from[0] != 't' && from[0] != 'T')
            continue;
          if (from[1] != 'h' && from[1] != 'H')
            continue;
          if (from[2] != 'r' && from[2] != 'R')
            continue;
          /* "thru" ...  */
          if (from[3] == 'u' || from[3] == 'U')
            return true;
          /* ... or "through".  */
          if (from[3] != 'o' && from[3] != 'O')
            continue;
          if (from[4] != 'u' && from[4] != 'U')
            continue;
          if (from[5] != 'g' && from[5] != 'G')
            continue;
          if (from[6] != 'h' && from[6] != 'H')
            continue;
          return true;
        }
      return false;
      /* Levels 3 and 4 require the whole comment to match one of the
         patterns handled after the switch.  */
    case 3:
    case 4:
      break;
    }

  /* Whole comment contents:
     -fallthrough
     @fallthrough@
   */
  if (*from == '-' || *from == '@')
    {
      size_t len = sizeof "fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "fallthrough", len))
        return false;
      if (*from == '@')
        {
          /* The '@' form needs a closing '@' as well.  */
          if (from[len + 1] != '@')
            return false;
          len++;
        }
      from += 1 + len;
    }
  /* Whole comment contents (regex):
     lint -fallthrough[ \t]*
   */
  else if (*from == 'l')
    {
      size_t len = sizeof "int -fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
        return false;
      if (memcmp (from + 1, "int -fallthrough", len))
        return false;
      from += 1 + len;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t]*FALLTHR(U|OUGH)[ \t]*
   */
  else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
    {
      while (*from == ' ' || *from == '\t')
        from++;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
        return false;
      if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
        return false;
      from += sizeof "FALLTHR" - 1;
      if (*from == 'U')
        from++;
      else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
        return false;
      else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
        return false;
      else
        from += sizeof "OUGH" - 1;
      while (*from == ' ' || *from == '\t')
        from++;
    }
  /* Whole comment contents (regex):
     [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
   */
  else
    {
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      /* F is the first letter of the word being matched; ALL_UPPER is
         set once a word has been seen spelled entirely in upper case,
         after which the remaining words must be upper case too.  */
      unsigned char f = *from;
      bool all_upper = false;
      if (f == 'E' || f == 'e')
        {
          /* Optional leading "else" word.  */
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "else fallthru" - 1)
            return false;
          if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
            return false;
          from += sizeof "else" - 1;
          if (*from == ',')
            from++;
          if (*from != ' ')
            return false;
          from++;
          /* Reject mixed-case combinations such as "ELSE fall..." and
             "else Fall...".  */
          if (all_upper && *from == 'f')
            return false;
          if (f == 'e' && *from == 'F')
            return false;
          f = *from;
        }
      else if (f == 'I' || f == 'i')
        {
          /* Optional leading "intentional" or "intentionally" word.  */
          if ((size_t) (pfile->buffer->cur - from)
              < sizeof "intentional fallthru" - 1)
            return false;
          if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
                                  sizeof "NTENTIONAL" - 1) == 0)
            all_upper = true;
          else if (memcmp (from + 1, "ntentional",
                           sizeof "ntentional" - 1))
            return false;
          from += sizeof "intentional" - 1;
          if (*from == ' ')
            {
              from++;
              if (all_upper && *from == 'f')
                return false;
            }
          else if (all_upper)
            {
              /* "INTENTIONALLY " must be followed by an upper-case
                 'F'; only "LY " is consumed here.  */
              if (memcmp (from, "LY F", sizeof "LY F" - 1))
                return false;
              from += sizeof "LY " - 1;
            }
          else
            {
              if (memcmp (from, "ly ", sizeof "ly " - 1))
                return false;
              from += sizeof "ly " - 1;
            }
          if (f == 'i' && *from == 'F')
            return false;
          f = *from;
        }
      /* Now the "fall" word itself.  */
      if (f != 'F' && f != 'f')
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
        return false;
      if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
        all_upper = true;
      else if (all_upper)
        return false;
      else if (memcmp (from + 1, "all", sizeof "all" - 1))
        return false;
      from += sizeof "fall" - 1;
      /* Optional "s ", " " or "-" between "fall" and "thr...".  */
      if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
        from += 2;
      else if (*from == ' ' || *from == '-')
        from++;
      else if (*from != (all_upper ? 'T' : 't'))
        return false;
      /* The 't' must be lower case unless the text is all upper case;
         an upper-case 'T' is otherwise only accepted after a
         capitalized "Fall".  */
      if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
        return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
        return false;
      if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
        {
          /* Not "thru"; it has to be "through" then.  */
          if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
            return false;
          if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
                      sizeof "hrough" - 1))
            return false;
          from += sizeof "through" - 1;
        }
      else
        from += sizeof "thru" - 1;
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
        from++;
      if (*from == '-')
        {
          /* A dash may introduce arbitrary trailing text on the same
             line; skip it.  */
          from++;
          if (*comment_start == '*')
            {
              /* In a block comment, stop at the closing asterisk-slash
                 pair (or at a line end or NUL); a lone '*' is part of
                 the trailing text.  */
              do
                {
                  while (*from && *from != '*'
                         && *from != '\n' && *from != '\r')
                    from++;
                  if (*from != '*' || from[1] == '/')
                    break;
                  from++;
                }
              while (1);
            }
          else
            while (*from && *from != '\n' && *from != '\r')
              from++;
        }
    }
  /* Whatever pattern matched, it must be followed directly by the end
     of the comment.  */
  /* C block comment.  */
  if (*comment_start == '*')
    {
      if (*from != '*' || from[1] != '/')
        return false;
    }
  /* C++ line comment.  */
  else if (*from != '\n')
    return false;

  return true;
}
3077
3078 /* Allocate COUNT tokens for RUN. */
3079 void
3080 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3081 {
3082 run->base = XNEWVEC (cpp_token, count);
3083 run->limit = run->base + count;
3084 run->next = NULL;
3085 }
3086
3087 /* Returns the next tokenrun, or creates one if there is none. */
3088 static tokenrun *
3089 next_tokenrun (tokenrun *run)
3090 {
3091 if (run->next == NULL)
3092 {
3093 run->next = XNEW (tokenrun);
3094 run->next->prev = run;
3095 _cpp_init_tokenrun (run->next, 250);
3096 }
3097
3098 return run->next;
3099 }
3100
3101 /* Return the number of not yet processed token in a given
3102 context. */
3103 int
3104 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3105 {
3106 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3107 return (LAST (context).token - FIRST (context).token);
3108 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3109 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3110 return (LAST (context).ptoken - FIRST (context).ptoken);
3111 else
3112 abort ();
3113 }
3114
3115 /* Returns the token present at index INDEX in a given context. If
3116 INDEX is zero, the next token to be processed is returned. */
3117 static const cpp_token*
3118 _cpp_token_from_context_at (cpp_context *context, int index)
3119 {
3120 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3121 return &(FIRST (context).token[index]);
3122 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3123 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3124 return FIRST (context).ptoken[index];
3125 else
3126 abort ();
3127 }
3128
3129 /* Look ahead in the input stream. */
3130 const cpp_token *
3131 cpp_peek_token (cpp_reader *pfile, int index)
3132 {
3133 cpp_context *context = pfile->context;
3134 const cpp_token *peektok;
3135 int count;
3136
3137 /* First, scan through any pending cpp_context objects. */
3138 while (context->prev)
3139 {
3140 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3141
3142 if (index < (int) sz)
3143 return _cpp_token_from_context_at (context, index);
3144 index -= (int) sz;
3145 context = context->prev;
3146 }
3147
3148 /* We will have to read some new tokens after all (and do so
3149 without invalidating preceding tokens). */
3150 count = index;
3151 pfile->keep_tokens++;
3152
3153 /* For peeked tokens temporarily disable line_change reporting,
3154 until the tokens are parsed for real. */
3155 void (*line_change) (cpp_reader *, const cpp_token *, int)
3156 = pfile->cb.line_change;
3157 pfile->cb.line_change = NULL;
3158
3159 do
3160 {
3161 peektok = _cpp_lex_token (pfile);
3162 if (peektok->type == CPP_EOF)
3163 {
3164 index--;
3165 break;
3166 }
3167 else if (peektok->type == CPP_PRAGMA)
3168 {
3169 /* Don't peek past a pragma. */
3170 if (peektok == &pfile->directive_result)
3171 /* Save the pragma in the buffer. */
3172 *pfile->cur_token++ = *peektok;
3173 index--;
3174 break;
3175 }
3176 }
3177 while (index--);
3178
3179 _cpp_backup_tokens_direct (pfile, count - index);
3180 pfile->keep_tokens--;
3181 pfile->cb.line_change = line_change;
3182
3183 return peektok;
3184 }
3185
3186 /* Allocate a single token that is invalidated at the same time as the
3187 rest of the tokens on the line. Has its line and col set to the
3188 same as the last lexed token, so that diagnostics appear in the
3189 right place. */
3190 cpp_token *
3191 _cpp_temp_token (cpp_reader *pfile)
3192 {
3193 cpp_token *old, *result;
3194 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3195 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3196
3197 old = pfile->cur_token - 1;
3198 /* Any pre-existing lookaheads must not be clobbered. */
3199 if (la)
3200 {
3201 if (sz <= la)
3202 {
3203 tokenrun *next = next_tokenrun (pfile->cur_run);
3204
3205 if (sz < la)
3206 memmove (next->base + 1, next->base,
3207 (la - sz) * sizeof (cpp_token));
3208
3209 next->base[0] = pfile->cur_run->limit[-1];
3210 }
3211
3212 if (sz > 1)
3213 memmove (pfile->cur_token + 1, pfile->cur_token,
3214 MIN (la, sz - 1) * sizeof (cpp_token));
3215 }
3216
3217 if (!sz && pfile->cur_token == pfile->cur_run->limit)
3218 {
3219 pfile->cur_run = next_tokenrun (pfile->cur_run);
3220 pfile->cur_token = pfile->cur_run->base;
3221 }
3222
3223 result = pfile->cur_token++;
3224 result->src_loc = old->src_loc;
3225 return result;
3226 }
3227
3228 /* We're at the beginning of a logical line (so not in
3229 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3230 if we should enter deferred_pragma mode to tokenize the rest of the
3231 line as a module control-line. */
3232
3233 static void
3234 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3235 {
3236 unsigned backup = 0; /* Tokens we peeked. */
3237 cpp_hashnode *node = result->val.node.node;
3238 cpp_token *peek = result;
3239 cpp_token *keyword = peek;
3240 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3241 int header_count = 0;
3242
3243 /* Make sure the incoming state is as we expect it. This way we
3244 can restore it using constants. */
3245 gcc_checking_assert (!pfile->state.in_deferred_pragma
3246 && !pfile->state.skipping
3247 && !pfile->state.parsing_args
3248 && !pfile->state.angled_headers
3249 && (pfile->state.save_comments
3250 == !CPP_OPTION (pfile, discard_comments)));
3251
3252 /* Enter directives mode sufficiently for peeking. We don't have
3253 to actually set in_directive. */
3254 pfile->state.in_deferred_pragma = true;
3255
3256 /* These two fields are needed to process tokenization in deferred
3257 pragma mode. They are not used outside deferred pragma mode or
3258 directives mode. */
3259 pfile->state.pragma_allow_expansion = true;
3260 pfile->directive_line = result->src_loc;
3261
3262 /* Saving comments is incompatible with directives mode. */
3263 pfile->state.save_comments = 0;
3264
3265 if (node == n_modules[spec_nodes::M_EXPORT][0])
3266 {
3267 peek = _cpp_lex_direct (pfile);
3268 keyword = peek;
3269 backup++;
3270 if (keyword->type != CPP_NAME)
3271 goto not_module;
3272 node = keyword->val.node.node;
3273 if (!(node->flags & NODE_MODULE))
3274 goto not_module;
3275 }
3276
3277 if (node == n_modules[spec_nodes::M__IMPORT][0])
3278 /* __import */
3279 header_count = backup + 2 + 16;
3280 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3281 /* import */
3282 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3283 else if (node == n_modules[spec_nodes::M_MODULE][0])
3284 ; /* module */
3285 else
3286 goto not_module;
3287
3288 /* We've seen [export] {module|import|__import}. Check the next token. */
3289 if (header_count)
3290 /* After '{,__}import' a header name may appear. */
3291 pfile->state.angled_headers = true;
3292 peek = _cpp_lex_direct (pfile);
3293 backup++;
3294
3295 /* ... import followed by identifier, ':', '<' or
3296 header-name preprocessing tokens, or module
3297 followed by cpp-identifier, ':' or ';' preprocessing
3298 tokens. C++ keywords are not yet relevant. */
3299 if (peek->type == CPP_NAME
3300 || peek->type == CPP_COLON
3301 || (header_count
3302 ? (peek->type == CPP_LESS
3303 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3304 || peek->type == CPP_HEADER_NAME)
3305 : peek->type == CPP_SEMICOLON))
3306 {
3307 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3308 if (!pfile->state.pragma_allow_expansion)
3309 pfile->state.prevent_expansion++;
3310
3311 if (!header_count && linemap_included_from
3312 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3313 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3314 "module control-line cannot be in included file");
3315
3316 /* The first one or two tokens cannot be macro names. */
3317 for (int ix = backup; ix--;)
3318 {
3319 cpp_token *tok = ix ? keyword : result;
3320 cpp_hashnode *node = tok->val.node.node;
3321
3322 /* Don't attempt to expand the token. */
3323 tok->flags |= NO_EXPAND;
3324 if (_cpp_defined_macro_p (node)
3325 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3326 && !cpp_fun_like_macro_p (node))
3327 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3328 "module control-line \"%s\" cannot be"
3329 " an object-like macro",
3330 NODE_NAME (node));
3331 }
3332
3333 /* Map to underbar variants. */
3334 keyword->val.node.node = n_modules[header_count
3335 ? spec_nodes::M_IMPORT
3336 : spec_nodes::M_MODULE][1];
3337 if (backup != 1)
3338 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3339
3340 /* Maybe tell the tokenizer we expect a header-name down the
3341 road. */
3342 pfile->state.directive_file_token = header_count;
3343 }
3344 else
3345 {
3346 not_module:
3347 /* Drop out of directive mode. */
3348 /* We aaserted save_comments had this value upon entry. */
3349 pfile->state.save_comments
3350 = !CPP_OPTION (pfile, discard_comments);
3351 pfile->state.in_deferred_pragma = false;
3352 /* Do not let this remain on. */
3353 pfile->state.angled_headers = false;
3354 }
3355
3356 /* In either case we want to backup the peeked tokens. */
3357 if (backup)
3358 {
3359 /* If we saw EOL, we should drop it, because this isn't a module
3360 control-line after all. */
3361 bool eol = peek->type == CPP_PRAGMA_EOL;
3362 if (!eol || backup > 1)
3363 {
3364 /* Put put the peeked tokens back */
3365 _cpp_backup_tokens_direct (pfile, backup);
3366 /* But if the last one was an EOL, forget it. */
3367 if (eol)
3368 pfile->lookaheads--;
3369 }
3370 }
3371 }
3372
3373 /* Lex a token into RESULT (external interface). Takes care of issues
3374 like directive handling, token lookahead, multiple include
3375 optimization and skipping. */
3376 const cpp_token *
3377 _cpp_lex_token (cpp_reader *pfile)
3378 {
3379 cpp_token *result;
3380
3381 for (;;)
3382 {
3383 if (pfile->cur_token == pfile->cur_run->limit)
3384 {
3385 pfile->cur_run = next_tokenrun (pfile->cur_run);
3386 pfile->cur_token = pfile->cur_run->base;
3387 }
3388 /* We assume that the current token is somewhere in the current
3389 run. */
3390 if (pfile->cur_token < pfile->cur_run->base
3391 || pfile->cur_token >= pfile->cur_run->limit)
3392 abort ();
3393
3394 if (pfile->lookaheads)
3395 {
3396 pfile->lookaheads--;
3397 result = pfile->cur_token++;
3398 }
3399 else
3400 result = _cpp_lex_direct (pfile);
3401
3402 if (result->flags & BOL)
3403 {
3404 /* Is this a directive. If _cpp_handle_directive returns
3405 false, it is an assembler #. */
3406 if (result->type == CPP_HASH
3407 /* 6.10.3 p 11: Directives in a list of macro arguments
3408 gives undefined behavior. This implementation
3409 handles the directive as normal. */
3410 && pfile->state.parsing_args != 1)
3411 {
3412 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3413 {
3414 if (pfile->directive_result.type == CPP_PADDING)
3415 continue;
3416 result = &pfile->directive_result;
3417 }
3418 }
3419 else if (pfile->state.in_deferred_pragma)
3420 result = &pfile->directive_result;
3421 else if (result->type == CPP_NAME
3422 && (result->val.node.node->flags & NODE_MODULE)
3423 && !pfile->state.skipping
3424 /* Unlike regular directives, we do not deal with
3425 tokenizing module directives as macro arguments.
3426 That's not permitted. */
3427 && !pfile->state.parsing_args)
3428 {
3429 /* P1857. Before macro expansion, At start of logical
3430 line ... */
3431 /* We don't have to consider lookaheads at this point. */
3432 gcc_checking_assert (!pfile->lookaheads);
3433
3434 cpp_maybe_module_directive (pfile, result);
3435 }
3436
3437 if (pfile->cb.line_change && !pfile->state.skipping)
3438 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3439 }
3440
3441 /* We don't skip tokens in directives. */
3442 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3443 break;
3444
3445 /* Outside a directive, invalidate controlling macros. At file
3446 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3447 get here and MI optimization works. */
3448 pfile->mi_valid = false;
3449
3450 if (!pfile->state.skipping || result->type == CPP_EOF)
3451 break;
3452 }
3453
3454 return result;
3455 }
3456
3457 /* Returns true if a fresh line has been loaded. */
3458 bool
3459 _cpp_get_fresh_line (cpp_reader *pfile)
3460 {
3461 /* We can't get a new line until we leave the current directive. */
3462 if (pfile->state.in_directive)
3463 return false;
3464
3465 for (;;)
3466 {
3467 cpp_buffer *buffer = pfile->buffer;
3468
3469 if (!buffer->need_line)
3470 return true;
3471
3472 if (buffer->next_line < buffer->rlimit)
3473 {
3474 _cpp_clean_line (pfile);
3475 return true;
3476 }
3477
3478 /* First, get out of parsing arguments state. */
3479 if (pfile->state.parsing_args)
3480 return false;
3481
3482 /* End of buffer. Non-empty files should end in a newline. */
3483 if (buffer->buf != buffer->rlimit
3484 && buffer->next_line > buffer->rlimit
3485 && !buffer->from_stage3)
3486 {
3487 /* Clip to buffer size. */
3488 buffer->next_line = buffer->rlimit;
3489 }
3490
3491 if (buffer->prev && !buffer->return_at_eof)
3492 _cpp_pop_buffer (pfile);
3493 else
3494 {
3495 /* End of translation. Do not pop the buffer yet. Increment
3496 line number so that the EOF token is on a line of its own
3497 (_cpp_lex_direct doesn't increment in that case, because
3498 it's hard for it to distinguish this special case). */
3499 CPP_INCREMENT_LINE (pfile, 0);
3500 return false;
3501 }
3502 }
3503 }
3504
/* Store ELSE_TYPE in result->type; then, if the next unread character
   of `buffer' equals CHAR, consume that character and store THEN_TYPE
   instead.  Relies on pointers named `buffer' and `result' being in
   scope where the macro is used.  Every macro argument is
   parenthesized so that an argument containing a low-precedence
   operator (for example a conditional expression) cannot bind to the
   surrounding `==' or `='.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
  do							\
    {							\
      result->type = (ELSE_TYPE);			\
      if (*buffer->cur == (CHAR))			\
        buffer->cur++, result->type = (THEN_TYPE);	\
    }							\
  while (0)
3513
3514 /* Lex a token into pfile->cur_token, which is also incremented, to
3515 get diagnostics pointing to the correct location.
3516
3517 Does not handle issues such as token lookahead, multiple-include
3518 optimization, directives, skipping etc. This function is only
3519 suitable for use by _cpp_lex_token, and in special cases like
3520 lex_expansion_token which doesn't care for any of these issues.
3521
3522 When meeting a newline, returns CPP_EOF if parsing a directive,
3523 otherwise returns to the start of the token buffer if permissible.
3524 Returns the location of the lexed token. */
3525 cpp_token *
3526 _cpp_lex_direct (cpp_reader *pfile)
3527 {
3528 cppchar_t c;
3529 cpp_buffer *buffer;
3530 const unsigned char *comment_start;
3531 bool fallthrough_comment = false;
3532 cpp_token *result = pfile->cur_token++;
3533
3534 fresh_line:
3535 result->flags = 0;
3536 buffer = pfile->buffer;
3537 if (buffer->need_line)
3538 {
3539 if (pfile->state.in_deferred_pragma)
3540 {
3541 /* This can happen in cases like:
3542 #define loop(x) whatever
3543 #pragma omp loop
3544 where when trying to expand loop we need to peek
3545 next token after loop, but aren't still in_deferred_pragma
3546 mode but are in in_directive mode, so buffer->need_line
3547 is set, a CPP_EOF is peeked. */
3548 result->type = CPP_PRAGMA_EOL;
3549 pfile->state.in_deferred_pragma = false;
3550 if (!pfile->state.pragma_allow_expansion)
3551 pfile->state.prevent_expansion--;
3552 return result;
3553 }
3554 if (!_cpp_get_fresh_line (pfile))
3555 {
3556 result->type = CPP_EOF;
3557 /* Not a real EOF in a directive or arg parsing -- we refuse
3558 to advance to the next file now, and will once we're out
3559 of those modes. */
3560 if (!pfile->state.in_directive && !pfile->state.parsing_args)
3561 {
3562 /* Tell the compiler the line number of the EOF token. */
3563 result->src_loc = pfile->line_table->highest_line;
3564 result->flags = BOL;
3565 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3566 _cpp_pop_buffer (pfile);
3567 }
3568 return result;
3569 }
3570 if (buffer != pfile->buffer)
3571 fallthrough_comment = false;
3572 if (!pfile->keep_tokens)
3573 {
3574 pfile->cur_run = &pfile->base_run;
3575 result = pfile->base_run.base;
3576 pfile->cur_token = result + 1;
3577 }
3578 result->flags = BOL;
3579 if (pfile->state.parsing_args == 2)
3580 result->flags |= PREV_WHITE;
3581 }
3582 buffer = pfile->buffer;
3583 update_tokens_line:
3584 result->src_loc = pfile->line_table->highest_line;
3585
3586 skipped_white:
3587 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3588 && !pfile->overlaid_buffer)
3589 {
3590 _cpp_process_line_notes (pfile, false);
3591 result->src_loc = pfile->line_table->highest_line;
3592 }
3593 c = *buffer->cur++;
3594
3595 if (pfile->forced_token_location)
3596 result->src_loc = pfile->forced_token_location;
3597 else
3598 result->src_loc = linemap_position_for_column (pfile->line_table,
3599 CPP_BUF_COLUMN (buffer, buffer->cur));
3600
3601 switch (c)
3602 {
3603 case ' ': case '\t': case '\f': case '\v': case '\0':
3604 result->flags |= PREV_WHITE;
3605 skip_whitespace (pfile, c);
3606 goto skipped_white;
3607
3608 case '\n':
3609 /* Increment the line, unless this is the last line ... */
3610 if (buffer->cur < buffer->rlimit
3611 /* ... or this is a #include, (where _cpp_stack_file needs to
3612 unwind by one line) ... */
3613 || (pfile->state.in_directive > 1
3614 /* ... except traditional-cpp increments this elsewhere. */
3615 && !CPP_OPTION (pfile, traditional)))
3616 CPP_INCREMENT_LINE (pfile, 0);
3617 buffer->need_line = true;
3618 if (pfile->state.in_deferred_pragma)
3619 {
3620 /* Produce the PRAGMA_EOL on this line. File reading
3621 ensures there is always a \n at end of the buffer, thus
3622 in a deferred pragma we always see CPP_PRAGMA_EOL before
3623 any CPP_EOF. */
3624 result->type = CPP_PRAGMA_EOL;
3625 result->flags &= ~PREV_WHITE;
3626 pfile->state.in_deferred_pragma = false;
3627 if (!pfile->state.pragma_allow_expansion)
3628 pfile->state.prevent_expansion--;
3629 return result;
3630 }
3631 goto fresh_line;
3632
3633 case '0': case '1': case '2': case '3': case '4':
3634 case '5': case '6': case '7': case '8': case '9':
3635 {
3636 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3637 result->type = CPP_NUMBER;
3638 lex_number (pfile, &result->val.str, &nst);
3639 warn_about_normalization (pfile, result, &nst);
3640 break;
3641 }
3642
3643 case 'L':
3644 case 'u':
3645 case 'U':
3646 case 'R':
3647 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3648 wide strings or raw strings. */
3649 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3650 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3651 {
3652 if ((*buffer->cur == '\'' && c != 'R')
3653 || *buffer->cur == '"'
3654 || (*buffer->cur == 'R'
3655 && c != 'R'
3656 && buffer->cur[1] == '"'
3657 && CPP_OPTION (pfile, rliterals))
3658 || (*buffer->cur == '8'
3659 && c == 'u'
3660 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3661 && CPP_OPTION (pfile, utf8_char_literals)))
3662 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3663 && CPP_OPTION (pfile, rliterals)))))
3664 {
3665 lex_string (pfile, result, buffer->cur - 1);
3666 break;
3667 }
3668 }
3669 /* Fall through. */
3670
3671 case '_':
3672 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3673 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3674 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3675 case 's': case 't': case 'v': case 'w': case 'x':
3676 case 'y': case 'z':
3677 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3678 case 'G': case 'H': case 'I': case 'J': case 'K':
3679 case 'M': case 'N': case 'O': case 'P': case 'Q':
3680 case 'S': case 'T': case 'V': case 'W': case 'X':
3681 case 'Y': case 'Z':
3682 result->type = CPP_NAME;
3683 {
3684 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3685 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3686 &nst,
3687 &result->val.node.spelling);
3688 warn_about_normalization (pfile, result, &nst);
3689 }
3690
3691 /* Convert named operators to their proper types. */
3692 if (result->val.node.node->flags & NODE_OPERATOR)
3693 {
3694 result->flags |= NAMED_OP;
3695 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3696 }
3697
3698 /* Signal FALLTHROUGH comment followed by another token. */
3699 if (fallthrough_comment)
3700 result->flags |= PREV_FALLTHROUGH;
3701 break;
3702
3703 case '\'':
3704 case '"':
3705 lex_string (pfile, result, buffer->cur - 1);
3706 break;
3707
3708 case '/':
3709 /* A potential block or line comment. */
3710 comment_start = buffer->cur;
3711 c = *buffer->cur;
3712
3713 if (c == '*')
3714 {
3715 if (_cpp_skip_block_comment (pfile))
3716 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3717 }
3718 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3719 {
3720 /* Don't warn for system headers. */
3721 if (_cpp_in_system_header (pfile))
3722 ;
3723 /* Warn about comments if pedantically GNUC89, and not
3724 in system headers. */
3725 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3726 && CPP_PEDANTIC (pfile)
3727 && ! buffer->warned_cplusplus_comments)
3728 {
3729 if (cpp_error (pfile, CPP_DL_PEDWARN,
3730 "C++ style comments are not allowed in ISO C90"))
3731 cpp_error (pfile, CPP_DL_NOTE,
3732 "(this will be reported only once per input file)");
3733 buffer->warned_cplusplus_comments = 1;
3734 }
3735 /* Or if specifically desired via -Wc90-c99-compat. */
3736 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3737 && ! CPP_OPTION (pfile, cplusplus)
3738 && ! buffer->warned_cplusplus_comments)
3739 {
3740 if (cpp_error (pfile, CPP_DL_WARNING,
3741 "C++ style comments are incompatible with C90"))
3742 cpp_error (pfile, CPP_DL_NOTE,
3743 "(this will be reported only once per input file)");
3744 buffer->warned_cplusplus_comments = 1;
3745 }
3746 /* In C89/C94, C++ style comments are forbidden. */
3747 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3748 || CPP_OPTION (pfile, lang) == CLK_STDC94))
3749 {
3750 /* But don't be confused about valid code such as
3751 - // immediately followed by *,
3752 - // in a preprocessing directive,
3753 - // in an #if 0 block. */
3754 if (buffer->cur[1] == '*'
3755 || pfile->state.in_directive
3756 || pfile->state.skipping)
3757 {
3758 result->type = CPP_DIV;
3759 break;
3760 }
3761 else if (! buffer->warned_cplusplus_comments)
3762 {
3763 if (cpp_error (pfile, CPP_DL_ERROR,
3764 "C++ style comments are not allowed in "
3765 "ISO C90"))
3766 cpp_error (pfile, CPP_DL_NOTE,
3767 "(this will be reported only once per input "
3768 "file)");
3769 buffer->warned_cplusplus_comments = 1;
3770 }
3771 }
3772 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3773 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3774 }
3775 else if (c == '=')
3776 {
3777 buffer->cur++;
3778 result->type = CPP_DIV_EQ;
3779 break;
3780 }
3781 else
3782 {
3783 result->type = CPP_DIV;
3784 break;
3785 }
3786
3787 if (fallthrough_comment_p (pfile, comment_start))
3788 fallthrough_comment = true;
3789
3790 if (pfile->cb.comment)
3791 {
3792 size_t len = pfile->buffer->cur - comment_start;
3793 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3794 len + 1);
3795 }
3796
3797 if (!pfile->state.save_comments)
3798 {
3799 result->flags |= PREV_WHITE;
3800 goto update_tokens_line;
3801 }
3802
3803 if (fallthrough_comment)
3804 result->flags |= PREV_FALLTHROUGH;
3805
3806 /* Save the comment as a token in its own right. */
3807 save_comment (pfile, result, comment_start, c);
3808 break;
3809
3810 case '<':
3811 if (pfile->state.angled_headers)
3812 {
3813 lex_string (pfile, result, buffer->cur - 1);
3814 if (result->type != CPP_LESS)
3815 break;
3816 }
3817
3818 result->type = CPP_LESS;
3819 if (*buffer->cur == '=')
3820 {
3821 buffer->cur++, result->type = CPP_LESS_EQ;
3822 if (*buffer->cur == '>'
3823 && CPP_OPTION (pfile, cplusplus)
3824 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3825 buffer->cur++, result->type = CPP_SPACESHIP;
3826 }
3827 else if (*buffer->cur == '<')
3828 {
3829 buffer->cur++;
3830 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3831 }
3832 else if (CPP_OPTION (pfile, digraphs))
3833 {
3834 if (*buffer->cur == ':')
3835 {
3836 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3837 three characters are <:: and the subsequent character
3838 is neither : nor >, the < is treated as a preprocessor
3839 token by itself". */
3840 if (CPP_OPTION (pfile, cplusplus)
3841 && CPP_OPTION (pfile, lang) != CLK_CXX98
3842 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3843 && buffer->cur[1] == ':'
3844 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3845 break;
3846
3847 buffer->cur++;
3848 result->flags |= DIGRAPH;
3849 result->type = CPP_OPEN_SQUARE;
3850 }
3851 else if (*buffer->cur == '%')
3852 {
3853 buffer->cur++;
3854 result->flags |= DIGRAPH;
3855 result->type = CPP_OPEN_BRACE;
3856 }
3857 }
3858 break;
3859
3860 case '>':
3861 result->type = CPP_GREATER;
3862 if (*buffer->cur == '=')
3863 buffer->cur++, result->type = CPP_GREATER_EQ;
3864 else if (*buffer->cur == '>')
3865 {
3866 buffer->cur++;
3867 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3868 }
3869 break;
3870
3871 case '%':
3872 result->type = CPP_MOD;
3873 if (*buffer->cur == '=')
3874 buffer->cur++, result->type = CPP_MOD_EQ;
3875 else if (CPP_OPTION (pfile, digraphs))
3876 {
3877 if (*buffer->cur == ':')
3878 {
3879 buffer->cur++;
3880 result->flags |= DIGRAPH;
3881 result->type = CPP_HASH;
3882 if (*buffer->cur == '%' && buffer->cur[1] == ':')
3883 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3884 }
3885 else if (*buffer->cur == '>')
3886 {
3887 buffer->cur++;
3888 result->flags |= DIGRAPH;
3889 result->type = CPP_CLOSE_BRACE;
3890 }
3891 }
3892 break;
3893
3894 case '.':
3895 result->type = CPP_DOT;
3896 if (ISDIGIT (*buffer->cur))
3897 {
3898 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3899 result->type = CPP_NUMBER;
3900 lex_number (pfile, &result->val.str, &nst);
3901 warn_about_normalization (pfile, result, &nst);
3902 }
3903 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3904 buffer->cur += 2, result->type = CPP_ELLIPSIS;
3905 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3906 buffer->cur++, result->type = CPP_DOT_STAR;
3907 break;
3908
3909 case '+':
3910 result->type = CPP_PLUS;
3911 if (*buffer->cur == '+')
3912 buffer->cur++, result->type = CPP_PLUS_PLUS;
3913 else if (*buffer->cur == '=')
3914 buffer->cur++, result->type = CPP_PLUS_EQ;
3915 break;
3916
3917 case '-':
3918 result->type = CPP_MINUS;
3919 if (*buffer->cur == '>')
3920 {
3921 buffer->cur++;
3922 result->type = CPP_DEREF;
3923 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3924 buffer->cur++, result->type = CPP_DEREF_STAR;
3925 }
3926 else if (*buffer->cur == '-')
3927 buffer->cur++, result->type = CPP_MINUS_MINUS;
3928 else if (*buffer->cur == '=')
3929 buffer->cur++, result->type = CPP_MINUS_EQ;
3930 break;
3931
3932 case '&':
3933 result->type = CPP_AND;
3934 if (*buffer->cur == '&')
3935 buffer->cur++, result->type = CPP_AND_AND;
3936 else if (*buffer->cur == '=')
3937 buffer->cur++, result->type = CPP_AND_EQ;
3938 break;
3939
3940 case '|':
3941 result->type = CPP_OR;
3942 if (*buffer->cur == '|')
3943 buffer->cur++, result->type = CPP_OR_OR;
3944 else if (*buffer->cur == '=')
3945 buffer->cur++, result->type = CPP_OR_EQ;
3946 break;
3947
3948 case ':':
3949 result->type = CPP_COLON;
3950 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
3951 buffer->cur++, result->type = CPP_SCOPE;
3952 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3953 {
3954 buffer->cur++;
3955 result->flags |= DIGRAPH;
3956 result->type = CPP_CLOSE_SQUARE;
3957 }
3958 break;
3959
3960 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3961 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3962 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3963 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3964 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3965
3966 case '?': result->type = CPP_QUERY; break;
3967 case '~': result->type = CPP_COMPL; break;
3968 case ',': result->type = CPP_COMMA; break;
3969 case '(': result->type = CPP_OPEN_PAREN; break;
3970 case ')': result->type = CPP_CLOSE_PAREN; break;
3971 case '[': result->type = CPP_OPEN_SQUARE; break;
3972 case ']': result->type = CPP_CLOSE_SQUARE; break;
3973 case '{': result->type = CPP_OPEN_BRACE; break;
3974 case '}': result->type = CPP_CLOSE_BRACE; break;
3975 case ';': result->type = CPP_SEMICOLON; break;
3976
3977 /* @ is a punctuator in Objective-C. */
3978 case '@': result->type = CPP_ATSIGN; break;
3979
3980 default:
3981 {
3982 const uchar *base = --buffer->cur;
3983
3984 /* Check for an extended identifier ($ or UCN or UTF-8). */
3985 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3986 if (forms_identifier_p (pfile, true, &nst))
3987 {
3988 result->type = CPP_NAME;
3989 result->val.node.node = lex_identifier (pfile, base, true, &nst,
3990 &result->val.node.spelling);
3991 warn_about_normalization (pfile, result, &nst);
3992 break;
3993 }
3994
3995 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
3996 single token. */
3997 buffer->cur++;
3998 if (c >= utf8_signifier)
3999 {
4000 const uchar *pstr = base;
4001 cppchar_t s;
4002 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4003 buffer->cur = pstr;
4004 }
4005 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4006 break;
4007 }
4008
4009 }
4010
4011 /* Potentially convert the location of the token to a range. */
4012 if (result->src_loc >= RESERVED_LOCATION_COUNT
4013 && result->type != CPP_EOF)
4014 {
4015 /* Ensure that any line notes are processed, so that we have the
4016 correct physical line/column for the end-point of the token even
4017 when a logical line is split via one or more backslashes. */
4018 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4019 && !pfile->overlaid_buffer)
4020 _cpp_process_line_notes (pfile, false);
4021
4022 source_range tok_range;
4023 tok_range.m_start = result->src_loc;
4024 tok_range.m_finish
4025 = linemap_position_for_column (pfile->line_table,
4026 CPP_BUF_COLUMN (buffer, buffer->cur));
4027
4028 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4029 result->src_loc,
4030 tok_range, NULL);
4031 }
4032
4033 return result;
4034 }
4035
4036 /* An upper bound on the number of bytes needed to spell TOKEN.
4037 Does not include preceding whitespace. */
4038 unsigned int
4039 cpp_token_len (const cpp_token *token)
4040 {
4041 unsigned int len;
4042
4043 switch (TOKEN_SPELL (token))
4044 {
4045 default: len = 6; break;
4046 case SPELL_LITERAL: len = token->val.str.len; break;
4047 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4048 }
4049
4050 return len;
4051 }
4052
/* Decode the UTF-8 sequence starting at NAME and write it to BUFFER as
   a \U escape followed by eight lowercase hexadecimal digits (always
   ten bytes, not NUL-terminated).  Returns the number of bytes read
   from NAME, which is the count of leading one bits in the first
   byte.  Aborts when a continuation byte is not of the form
   10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned probe = *name;
  int seq_len = 0;
  int idx;
  int shift;

  /* Count the leading one bits of the first byte.  */
  while (probe & 0x80)
    {
      seq_len++;
      probe <<= 1;
    }

  /* Payload bits of the first byte.  */
  code_point = *name & (0x7F >> seq_len);

  /* Append six payload bits from each continuation byte.  */
  for (idx = 1; idx < seq_len; idx++)
    {
      const unsigned char trail = *++name;

      code_point = (code_point << 6) | (trail & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  /* Most significant nibble first.  */
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
4086
4087 /* Given a token TYPE corresponding to a digraph, return a pointer to
4088 the spelling of the digraph. */
4089 static const unsigned char *
4090 cpp_digraph2name (enum cpp_ttype type)
4091 {
4092 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4093 }
4094
4095 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4096 The buffer must already contain the enough space to hold the
4097 token's spelling. Returns a pointer to the character after the
4098 last character written. */
4099 unsigned char *
4100 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4101 {
4102 size_t i;
4103 const unsigned char *name = NODE_NAME (ident);
4104
4105 for (i = 0; i < NODE_LEN (ident); i++)
4106 if (name[i] & ~0x7F)
4107 {
4108 i += utf8_to_ucn (buffer, name + i) - 1;
4109 buffer += 10;
4110 }
4111 else
4112 *buffer++ = name[i];
4113
4114 return buffer;
4115 }
4116
4117 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
4118 already contain the enough space to hold the token's spelling.
4119 Returns a pointer to the character after the last character written.
4120 FORSTRING is true if this is to be the spelling after translation
4121 phase 1 (with the original spelling of extended identifiers), false
4122 if extended identifiers should always be written using UCNs (there is
4123 no option for always writing them in the internal UTF-8 form).
4124 FIXME: Would be nice if we didn't need the PFILE argument. */
4125 unsigned char *
4126 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4127 unsigned char *buffer, bool forstring)
4128 {
4129 switch (TOKEN_SPELL (token))
4130 {
4131 case SPELL_OPERATOR:
4132 {
4133 const unsigned char *spelling;
4134 unsigned char c;
4135
4136 if (token->flags & DIGRAPH)
4137 spelling = cpp_digraph2name (token->type);
4138 else if (token->flags & NAMED_OP)
4139 goto spell_ident;
4140 else
4141 spelling = TOKEN_NAME (token);
4142
4143 while ((c = *spelling++) != '\0')
4144 *buffer++ = c;
4145 }
4146 break;
4147
4148 spell_ident:
4149 case SPELL_IDENT:
4150 if (forstring)
4151 {
4152 memcpy (buffer, NODE_NAME (token->val.node.spelling),
4153 NODE_LEN (token->val.node.spelling));
4154 buffer += NODE_LEN (token->val.node.spelling);
4155 }
4156 else
4157 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4158 break;
4159
4160 case SPELL_LITERAL:
4161 memcpy (buffer, token->val.str.text, token->val.str.len);
4162 buffer += token->val.str.len;
4163 break;
4164
4165 case SPELL_NONE:
4166 cpp_error (pfile, CPP_DL_ICE,
4167 "unspellable token %s", TOKEN_NAME (token));
4168 break;
4169 }
4170
4171 return buffer;
4172 }
4173
4174 /* Returns TOKEN spelt as a null-terminated string. The string is
4175 freed when the reader is destroyed. Useful for diagnostics. */
4176 unsigned char *
4177 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4178 {
4179 unsigned int len = cpp_token_len (token) + 1;
4180 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4181
4182 end = cpp_spell_token (pfile, token, start, false);
4183 end[0] = '\0';
4184
4185 return start;
4186 }
4187
4188 /* Returns a pointer to a string which spells the token defined by
4189 TYPE and FLAGS. Used by C front ends, which really should move to
4190 using cpp_token_as_text. */
4191 const char *
4192 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4193 {
4194 if (flags & DIGRAPH)
4195 return (const char *) cpp_digraph2name (type);
4196 else if (flags & NAMED_OP)
4197 return cpp_named_operator2name (type);
4198
4199 return (const char *) token_spellings[type].name;
4200 }
4201
4202 /* Writes the spelling of token to FP, without any preceding space.
4203 Separated from cpp_spell_token for efficiency - to avoid stdio
4204 double-buffering. */
4205 void
4206 cpp_output_token (const cpp_token *token, FILE *fp)
4207 {
4208 switch (TOKEN_SPELL (token))
4209 {
4210 case SPELL_OPERATOR:
4211 {
4212 const unsigned char *spelling;
4213 int c;
4214
4215 if (token->flags & DIGRAPH)
4216 spelling = cpp_digraph2name (token->type);
4217 else if (token->flags & NAMED_OP)
4218 goto spell_ident;
4219 else
4220 spelling = TOKEN_NAME (token);
4221
4222 c = *spelling;
4223 do
4224 putc (c, fp);
4225 while ((c = *++spelling) != '\0');
4226 }
4227 break;
4228
4229 spell_ident:
4230 case SPELL_IDENT:
4231 {
4232 size_t i;
4233 const unsigned char * name = NODE_NAME (token->val.node.node);
4234
4235 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4236 if (name[i] & ~0x7F)
4237 {
4238 unsigned char buffer[10];
4239 i += utf8_to_ucn (buffer, name + i) - 1;
4240 fwrite (buffer, 1, 10, fp);
4241 }
4242 else
4243 fputc (NODE_NAME (token->val.node.node)[i], fp);
4244 }
4245 break;
4246
4247 case SPELL_LITERAL:
4248 if (token->type == CPP_HEADER_NAME)
4249 fputc ('"', fp);
4250 fwrite (token->val.str.text, 1, token->val.str.len, fp);
4251 if (token->type == CPP_HEADER_NAME)
4252 fputc ('"', fp);
4253 break;
4254
4255 case SPELL_NONE:
4256 /* An error, most probably. */
4257 break;
4258 }
4259 }
4260
4261 /* Compare two tokens. */
4262 int
4263 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4264 {
4265 if (a->type == b->type && a->flags == b->flags)
4266 switch (TOKEN_SPELL (a))
4267 {
4268 default: /* Keep compiler happy. */
4269 case SPELL_OPERATOR:
4270 /* token_no is used to track where multiple consecutive ##
4271 tokens were originally located. */
4272 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4273 case SPELL_NONE:
4274 return (a->type != CPP_MACRO_ARG
4275 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4276 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4277 case SPELL_IDENT:
4278 return (a->val.node.node == b->val.node.node
4279 && a->val.node.spelling == b->val.node.spelling);
4280 case SPELL_LITERAL:
4281 return (a->val.str.len == b->val.str.len
4282 && !memcmp (a->val.str.text, b->val.str.text,
4283 a->val.str.len));
4284 }
4285
4286 return 0;
4287 }
4288
4289 /* Returns nonzero if a space should be inserted to avoid an
4290 accidental token paste for output. For simplicity, it is
4291 conservative, and occasionally advises a space where one is not
4292 needed, e.g. "." and ".2". */
4293 int
4294 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4295 const cpp_token *token2)
4296 {
4297 enum cpp_ttype a = token1->type, b = token2->type;
4298 cppchar_t c;
4299
4300 if (token1->flags & NAMED_OP)
4301 a = CPP_NAME;
4302 if (token2->flags & NAMED_OP)
4303 b = CPP_NAME;
4304
4305 c = EOF;
4306 if (token2->flags & DIGRAPH)
4307 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4308 else if (token_spellings[b].category == SPELL_OPERATOR)
4309 c = token_spellings[b].name[0];
4310
4311 /* Quickly get everything that can paste with an '='. */
4312 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4313 return 1;
4314
4315 switch (a)
4316 {
4317 case CPP_GREATER: return c == '>';
4318 case CPP_LESS: return c == '<' || c == '%' || c == ':';
4319 case CPP_PLUS: return c == '+';
4320 case CPP_MINUS: return c == '-' || c == '>';
4321 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4322 case CPP_MOD: return c == ':' || c == '>';
4323 case CPP_AND: return c == '&';
4324 case CPP_OR: return c == '|';
4325 case CPP_COLON: return c == ':' || c == '>';
4326 case CPP_DEREF: return c == '*';
4327 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4328 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4329 case CPP_PRAGMA:
4330 case CPP_NAME: return ((b == CPP_NUMBER
4331 && name_p (pfile, &token2->val.str))
4332 || b == CPP_NAME
4333 || b == CPP_CHAR || b == CPP_STRING); /* L */
4334 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4335 || b == CPP_CHAR
4336 || c == '.' || c == '+' || c == '-');
4337 /* UCNs */
4338 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4339 && b == CPP_NAME)
4340 || (CPP_OPTION (pfile, objc)
4341 && token1->val.str.text[0] == '@'
4342 && (b == CPP_NAME || b == CPP_STRING)));
4343 case CPP_LESS_EQ: return c == '>';
4344 case CPP_STRING:
4345 case CPP_WSTRING:
4346 case CPP_UTF8STRING:
4347 case CPP_STRING16:
4348 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4349 && (b == CPP_NAME
4350 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4351 && ISIDST (token2->val.str.text[0]))));
4352
4353 default: break;
4354 }
4355
4356 return 0;
4357 }
4358
4359 /* Output all the remaining tokens on the current line, and a newline
4360 character, to FP. Leading whitespace is removed. If there are
4361 macros, special token padding is not performed. */
4362 void
4363 cpp_output_line (cpp_reader *pfile, FILE *fp)
4364 {
4365 const cpp_token *token;
4366
4367 token = cpp_get_token (pfile);
4368 while (token->type != CPP_EOF)
4369 {
4370 cpp_output_token (token, fp);
4371 token = cpp_get_token (pfile);
4372 if (token->flags & PREV_WHITE)
4373 putc (' ', fp);
4374 }
4375
4376 putc ('\n', fp);
4377 }
4378
4379 /* Return a string representation of all the remaining tokens on the
4380 current line. The result is allocated using xmalloc and must be
4381 freed by the caller. */
4382 unsigned char *
4383 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4384 {
4385 const cpp_token *token;
4386 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4387 unsigned int alloced = 120 + out;
4388 unsigned char *result = (unsigned char *) xmalloc (alloced);
4389
4390 /* If DIR_NAME is empty, there are no initial contents. */
4391 if (dir_name)
4392 {
4393 sprintf ((char *) result, "#%s ", dir_name);
4394 out += 2;
4395 }
4396
4397 token = cpp_get_token (pfile);
4398 while (token->type != CPP_EOF)
4399 {
4400 unsigned char *last;
4401 /* Include room for a possible space and the terminating nul. */
4402 unsigned int len = cpp_token_len (token) + 2;
4403
4404 if (out + len > alloced)
4405 {
4406 alloced *= 2;
4407 if (out + len > alloced)
4408 alloced = out + len;
4409 result = (unsigned char *) xrealloc (result, alloced);
4410 }
4411
4412 last = cpp_spell_token (pfile, token, &result[out], 0);
4413 out = last - result;
4414
4415 token = cpp_get_token (pfile);
4416 if (token->flags & PREV_WHITE)
4417 result[out++] = ' ';
4418 }
4419
4420 result[out] = '\0';
4421 return result;
4422 }
4423
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that is
   still handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the bytes between BUFF's
   cur and limit.  Every macro argument is parenthesized so that an
   argument containing an operator expands as a unit.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4438
4439 /* Create a new allocation buffer. Place the control block at the end
4440 of the buffer, so that buffer overflows will cause immediate chaos. */
4441 static _cpp_buff *
4442 new_buff (size_t len)
4443 {
4444 _cpp_buff *result;
4445 unsigned char *base;
4446
4447 if (len < MIN_BUFF_SIZE)
4448 len = MIN_BUFF_SIZE;
4449 len = CPP_ALIGN (len);
4450
4451 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4452 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4453 struct first. */
4454 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4455 base = XNEWVEC (unsigned char, len + slen);
4456 result = (_cpp_buff *) base;
4457 base += slen;
4458 #else
4459 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4460 result = (_cpp_buff *) (base + len);
4461 #endif
4462 result->base = base;
4463 result->cur = base;
4464 result->limit = base + len;
4465 result->next = NULL;
4466 return result;
4467 }
4468
4469 /* Place a chain of unwanted allocation buffers on the free list. */
4470 void
4471 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4472 {
4473 _cpp_buff *end = buff;
4474
4475 while (end->next)
4476 end = end->next;
4477 end->next = pfile->free_buffs;
4478 pfile->free_buffs = buff;
4479 }
4480
4481 /* Return a free buffer of size at least MIN_SIZE. */
4482 _cpp_buff *
4483 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4484 {
4485 _cpp_buff *result, **p;
4486
4487 for (p = &pfile->free_buffs;; p = &(*p)->next)
4488 {
4489 size_t size;
4490
4491 if (*p == NULL)
4492 return new_buff (min_size);
4493 result = *p;
4494 size = result->limit - result->base;
4495 /* Return a buffer that's big enough, but don't waste one that's
4496 way too big. */
4497 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4498 break;
4499 }
4500
4501 *p = result->next;
4502 result->next = NULL;
4503 result->cur = result->base;
4504 return result;
4505 }
4506
4507 /* Creates a new buffer with enough space to hold the uncommitted
4508 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4509 the excess bytes to the new buffer. Chains the new buffer after
4510 BUFF, and returns the new buffer. */
4511 _cpp_buff *
4512 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4513 {
4514 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4515 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4516
4517 buff->next = new_buff;
4518 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4519 return new_buff;
4520 }
4521
4522 /* Creates a new buffer with enough space to hold the uncommitted
4523 remaining bytes of the buffer pointed to by BUFF, and at least
4524 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4525 Chains the new buffer before the buffer pointed to by BUFF, and
4526 updates the pointer to point to the new buffer. */
4527 void
4528 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4529 {
4530 _cpp_buff *new_buff, *old_buff = *pbuff;
4531 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4532
4533 new_buff = _cpp_get_buff (pfile, size);
4534 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4535 new_buff->next = old_buff;
4536 *pbuff = new_buff;
4537 }
4538
4539 /* Free a chain of buffers starting at BUFF. */
4540 void
4541 _cpp_free_buff (_cpp_buff *buff)
4542 {
4543 _cpp_buff *next;
4544
4545 for (; buff; buff = next)
4546 {
4547 next = buff->next;
4548 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4549 free (buff);
4550 #else
4551 free (buff->base);
4552 #endif
4553 }
4554 }
4555
4556 /* Allocate permanent, unaligned storage of length LEN. */
4557 unsigned char *
4558 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4559 {
4560 _cpp_buff *buff = pfile->u_buff;
4561 unsigned char *result = buff->cur;
4562
4563 if (len > (size_t) (buff->limit - result))
4564 {
4565 buff = _cpp_get_buff (pfile, len);
4566 buff->next = pfile->u_buff;
4567 pfile->u_buff = buff;
4568 result = buff->cur;
4569 }
4570
4571 buff->cur = result + len;
4572 return result;
4573 }
4574
4575 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4576 That buffer is used for growing allocations when saving macro
4577 replacement lists in a #define, and when parsing an answer to an
4578 assertion in #assert, #unassert or #if (and therefore possibly
4579 whilst expanding macros). It therefore must not be used by any
4580 code that they might call: specifically the lexer and the guts of
4581 the macro expander.
4582
4583 All existing other uses clearly fit this restriction: storing
4584 registered pragmas during initialization. */
4585 unsigned char *
4586 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4587 {
4588 _cpp_buff *buff = pfile->a_buff;
4589 unsigned char *result = buff->cur;
4590
4591 if (len > (size_t) (buff->limit - result))
4592 {
4593 buff = _cpp_get_buff (pfile, len);
4594 buff->next = pfile->a_buff;
4595 pfile->a_buff = buff;
4596 result = buff->cur;
4597 }
4598
4599 buff->cur = result + len;
4600 return result;
4601 }
4602
4603 /* Commit or allocate storage from a buffer. */
4604
4605 void *
4606 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4607 {
4608 void *ptr = BUFF_FRONT (pfile->a_buff);
4609
4610 if (pfile->hash_table->alloc_subobject)
4611 {
4612 void *copy = pfile->hash_table->alloc_subobject (size);
4613 memcpy (copy, ptr, size);
4614 ptr = copy;
4615 }
4616 else
4617 BUFF_FRONT (pfile->a_buff) += size;
4618
4619 return ptr;
4620 }
4621
4622 /* Say which field of TOK is in use. */
4623
4624 enum cpp_token_fld_kind
4625 cpp_token_val_index (const cpp_token *tok)
4626 {
4627 switch (TOKEN_SPELL (tok))
4628 {
4629 case SPELL_IDENT:
4630 return CPP_TOKEN_FLD_NODE;
4631 case SPELL_LITERAL:
4632 return CPP_TOKEN_FLD_STR;
4633 case SPELL_OPERATOR:
4634 /* Operands which were originally spelled as ident keep around
4635 the node for the exact spelling. */
4636 if (tok->flags & NAMED_OP)
4637 return CPP_TOKEN_FLD_NODE;
4638 else if (tok->type == CPP_PASTE)
4639 return CPP_TOKEN_FLD_TOKEN_NO;
4640 else
4641 return CPP_TOKEN_FLD_NONE;
4642 case SPELL_NONE:
4643 if (tok->type == CPP_MACRO_ARG)
4644 return CPP_TOKEN_FLD_ARG_NO;
4645 else if (tok->type == CPP_PADDING)
4646 return CPP_TOKEN_FLD_SOURCE;
4647 else if (tok->type == CPP_PRAGMA)
4648 return CPP_TOKEN_FLD_PRAGMA;
4649 /* fall through */
4650 default:
4651 return CPP_TOKEN_FLD_NONE;
4652 }
4653 }
4654
4655 /* All tokens lexed in R after calling this function will be forced to
4656 have their location_t to be P, until
4657 cpp_stop_forcing_token_locations is called for R. */
4658
4659 void
4660 cpp_force_token_locations (cpp_reader *r, location_t loc)
4661 {
4662 r->forced_token_location = loc;
4663 }
4664
4665 /* Go back to assigning locations naturally for lexed tokens. */
4666
4667 void
4668 cpp_stop_forcing_token_locations (cpp_reader *r)
4669 {
4670 r->forced_token_location = 0;
4671 }
4672
/* PEEK points at a backslash.  If that backslash escapes an end of
   line ("\n", "\r" or "\r\n"), return a pointer just past the line
   ending, repeating for as long as that position holds another
   escaped line ending.  If the position past the line ending would
   not be below LIMIT, do not advance.  When the backslash does not
   escape a line ending, PEEK itself is returned.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      /* Bytes covered by the backslash plus the line ending.  */
      unsigned skip;

      if (__builtin_expect (peek[1] == '\n', true))
        skip = 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        skip = peek[2] == '\n' ? 3 : 2;
      else
        return peek;

      const unsigned char *probe = peek + skip;
      if (!__builtin_expect (probe < limit, true))
        return peek;

      peek = probe;
      /* The user might be perverse and escape several line endings
         in a row.  */
      if (*peek != '\\')
        return peek;
    }
}
4702
4703 static const unsigned char *
4704 do_peek_next (const unsigned char *peek, const unsigned char *limit)
4705 {
4706 if (__builtin_expect (*peek == '\\', false))
4707 peek = do_peek_backslash (peek, limit);
4708 return peek;
4709 }
4710
/* Step backwards from PEEK and return a pointer to the previous
   character, or NULL if PEEK is already at BOUND.  If the previous
   character is a line ending ("\n", "\r" or "\r\n") that is escaped
   by a backslash, the escaped line ending is skipped and the search
   continues before the backslash.  Nothing before BOUND is ever
   read.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* Only a line ending can be part of an escaped newline.  The
     second test is for a carriage return, not the letter 'r'.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset from PEEK of the character that would have to be the
         escaping backslash.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n" pair: the backslash, if any, is one further back.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4739
4740 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4741 space. Otherwise return NULL. */
4742
4743 static const unsigned char *
4744 do_peek_ident (const char *match, const unsigned char *peek,
4745 const unsigned char *limit)
4746 {
4747 for (; *++match; peek++)
4748 if (*peek != *match)
4749 {
4750 peek = do_peek_next (peek, limit);
4751 if (*peek != *match)
4752 return NULL;
4753 }
4754
4755 /* Must now not be looking at an identifier char. */
4756 peek = do_peek_next (peek, limit);
4757 if (ISIDNUM (*peek))
4758 return NULL;
4759
4760 /* Skip control-line whitespace. */
4761 ws:
4762 while (*peek == ' ' || *peek == '\t')
4763 peek++;
4764 if (__builtin_expect (*peek == '\\', false))
4765 {
4766 peek = do_peek_backslash (peek, limit);
4767 if (*peek != '\\')
4768 goto ws;
4769 }
4770
4771 return peek;
4772 }
4773
4774 /* Are we looking at a module control line starting as PEEK - 1? */
4775
4776 static bool
4777 do_peek_module (cpp_reader *pfile, unsigned char c,
4778 const unsigned char *peek, const unsigned char *limit)
4779 {
4780 bool import = false;
4781
4782 if (__builtin_expect (c == 'e', false))
4783 {
4784 if (!((peek[0] == 'x' || peek[0] == '\\')
4785 && (peek = do_peek_ident ("export", peek, limit))))
4786 return false;
4787
4788 /* export, peek for import or module. No need to peek __import
4789 here. */
4790 if (peek[0] == 'i')
4791 {
4792 if (!((peek[1] == 'm' || peek[1] == '\\')
4793 && (peek = do_peek_ident ("import", peek + 1, limit))))
4794 return false;
4795 import = true;
4796 }
4797 else if (peek[0] == 'm')
4798 {
4799 if (!((peek[1] == 'o' || peek[1] == '\\')
4800 && (peek = do_peek_ident ("module", peek + 1, limit))))
4801 return false;
4802 }
4803 else
4804 return false;
4805 }
4806 else if (__builtin_expect (c == 'i', false))
4807 {
4808 if (!((peek[0] == 'm' || peek[0] == '\\')
4809 && (peek = do_peek_ident ("import", peek, limit))))
4810 return false;
4811 import = true;
4812 }
4813 else if (__builtin_expect (c == '_', false))
4814 {
4815 /* Needed for translated includes. */
4816 if (!((peek[0] == '_' || peek[0] == '\\')
4817 && (peek = do_peek_ident ("__import", peek, limit))))
4818 return false;
4819 import = true;
4820 }
4821 else if (__builtin_expect (c == 'm', false))
4822 {
4823 if (!((peek[0] == 'o' || peek[0] == '\\')
4824 && (peek = do_peek_ident ("module", peek, limit))))
4825 return false;
4826 }
4827 else
4828 return false;
4829
4830 /* Peek the next character to see if it's good enough. We'll be at
4831 the first non-whitespace char, including skipping an escaped
4832 newline. */
4833 /* ... import followed by identifier, ':', '<' or header-name
4834 preprocessing tokens, or module followed by identifier, ':' or
4835 ';' preprocessing tokens. */
4836 unsigned char p = *peek++;
4837
4838 /* A character literal is ... single quotes, ... optionally preceded
4839 by u8, u, U, or L */
4840 /* A string-literal is a ... double quotes, optionally prefixed by
4841 R, u8, u8R, u, uR, U, UR, L, or LR */
4842 if (p == 'u')
4843 {
4844 peek = do_peek_next (peek, limit);
4845 if (*peek == '8')
4846 {
4847 peek++;
4848 goto peek_u8;
4849 }
4850 goto peek_u;
4851 }
4852 else if (p == 'U' || p == 'L')
4853 {
4854 peek_u8:
4855 peek = do_peek_next (peek, limit);
4856 peek_u:
4857 if (*peek == '\"' || *peek == '\'')
4858 return false;
4859
4860 if (*peek == 'R')
4861 goto peek_R;
4862 /* Identifier. Ok. */
4863 }
4864 else if (p == 'R')
4865 {
4866 peek_R:
4867 if (CPP_OPTION (pfile, rliterals))
4868 {
4869 peek = do_peek_next (peek, limit);
4870 if (*peek == '\"')
4871 return false;
4872 }
4873 /* Identifier. Ok. */
4874 }
4875 else if ('Z' - 'A' == 25
4876 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4877 : ISIDST (p))
4878 {
4879 /* Identifier. Ok. */
4880 }
4881 else if (p == '<')
4882 {
4883 /* Maybe angle header, ok for import. Reject
4884 '<=', '<<' digraph:'<:'. */
4885 if (!import)
4886 return false;
4887 peek = do_peek_next (peek, limit);
4888 if (*peek == '=' || *peek == '<'
4889 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4890 return false;
4891 }
4892 else if (p == ';')
4893 {
4894 /* SEMICOLON, ok for module. */
4895 if (import)
4896 return false;
4897 }
4898 else if (p == '"')
4899 {
4900 /* STRING, ok for import. */
4901 if (!import)
4902 return false;
4903 }
4904 else if (p == ':')
4905 {
4906 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
4907 peek = do_peek_next (peek, limit);
4908 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4909 return false;
4910 }
4911 else
4912 /* FIXME: Detect a unicode character, excluding those not
4913 permitted as the initial character. [lex.name]/1. I presume
4914 we need to check the \[uU] spellings, and directly using
4915 Unicode in say UTF8 form? Or perhaps we do the phase-1
4916 conversion of UTF8 to universal-character-names? */
4917 return false;
4918
4919 return true;
4920 }
4921
4922 /* Directives-only scanning. Somewhat more relaxed than correct
4923 parsing -- some ill-formed programs will not be rejected. */
4924
4925 void
4926 cpp_directive_only_process (cpp_reader *pfile,
4927 void *data,
4928 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4929 {
4930 bool module_p = CPP_OPTION (pfile, module_directives);
4931
4932 do
4933 {
4934 restart:
4935 /* Buffer initialization, but no line cleaning. */
4936 cpp_buffer *buffer = pfile->buffer;
4937 buffer->cur_note = buffer->notes_used = 0;
4938 buffer->cur = buffer->line_base = buffer->next_line;
4939 buffer->need_line = false;
4940 /* Files always end in a newline or carriage return. We rely on this for
4941 character peeking safety. */
4942 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4943
4944 const unsigned char *base = buffer->cur;
4945 unsigned line_count = 0;
4946 const unsigned char *line_start = base;
4947
4948 bool bol = true;
4949 bool raw = false;
4950
4951 const unsigned char *lwm = base;
4952 for (const unsigned char *pos = base, *limit = buffer->rlimit;
4953 pos < limit;)
4954 {
4955 unsigned char c = *pos++;
4956 /* This matches the switch in _cpp_lex_direct. */
4957 switch (c)
4958 {
4959 case ' ': case '\t': case '\f': case '\v':
4960 /* Whitespace, do nothing. */
4961 break;
4962
4963 case '\r': /* MAC line ending, or Windows \r\n */
4964 if (*pos == '\n')
4965 pos++;
4966 /* FALLTHROUGH */
4967
4968 case '\n':
4969 bol = true;
4970
4971 next_line:
4972 CPP_INCREMENT_LINE (pfile, 0);
4973 line_count++;
4974 line_start = pos;
4975 break;
4976
4977 case '\\':
4978 /* <backslash><newline> is removed, and doesn't undo any
4979 preceeding escape or whatnot. */
4980 if (*pos == '\n')
4981 {
4982 pos++;
4983 goto next_line;
4984 }
4985 else if (*pos == '\r')
4986 {
4987 if (pos[1] == '\n')
4988 pos++;
4989 pos++;
4990 goto next_line;
4991 }
4992 goto dflt;
4993
4994 case '#':
4995 if (bol)
4996 {
4997 /* Line directive. */
4998 if (pos - 1 > base && !pfile->state.skipping)
4999 cb (pfile, CPP_DO_print, data,
5000 line_count, base, pos - 1 - base);
5001
5002 /* Prep things for directive handling. */
5003 buffer->next_line = pos;
5004 buffer->need_line = true;
5005 bool ok = _cpp_get_fresh_line (pfile);
5006 gcc_checking_assert (ok);
5007
5008 /* Ensure proper column numbering for generated
5009 error messages. */
5010 buffer->line_base -= pos - line_start;
5011
5012 _cpp_handle_directive (pfile, line_start + 1 != pos);
5013
5014 /* Sanitize the line settings. Duplicate #include's can
5015 mess things up. */
5016 // FIXME: Necessary?
5017 pfile->line_table->highest_location
5018 = pfile->line_table->highest_line;
5019
5020 if (!pfile->state.skipping
5021 && pfile->buffer->next_line < pfile->buffer->rlimit)
5022 cb (pfile, CPP_DO_location, data,
5023 pfile->line_table->highest_line);
5024
5025 goto restart;
5026 }
5027 goto dflt;
5028
5029 case '/':
5030 {
5031 const unsigned char *peek = do_peek_next (pos, limit);
5032 if (!(*peek == '/' || *peek == '*'))
5033 goto dflt;
5034
5035 /* Line or block comment */
5036 bool is_block = *peek == '*';
5037 bool star = false;
5038 bool esc = false;
5039 location_t sloc
5040 = linemap_position_for_column (pfile->line_table,
5041 pos - line_start);
5042
5043 while (pos < limit)
5044 {
5045 char c = *pos++;
5046 switch (c)
5047 {
5048 case '\\':
5049 esc = true;
5050 break;
5051
5052 case '\r':
5053 if (*pos == '\n')
5054 pos++;
5055 /* FALLTHROUGH */
5056
5057 case '\n':
5058 {
5059 CPP_INCREMENT_LINE (pfile, 0);
5060 line_count++;
5061 line_start = pos;
5062 if (!esc && !is_block)
5063 {
5064 bol = true;
5065 goto done_comment;
5066 }
5067 }
5068 if (!esc)
5069 star = false;
5070 esc = false;
5071 break;
5072
5073 case '*':
5074 if (pos > peek)
5075 star = is_block;
5076 esc = false;
5077 break;
5078
5079 case '/':
5080 if (star)
5081 goto done_comment;
5082 /* FALLTHROUGH */
5083
5084 default:
5085 star = false;
5086 esc = false;
5087 break;
5088 }
5089 }
5090 if (pos < limit || is_block)
5091 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5092 "unterminated comment");
5093 done_comment:
5094 lwm = pos;
5095 break;
5096 }
5097
5098 case '\'':
5099 if (!CPP_OPTION (pfile, digit_separators))
5100 goto delimited_string;
5101
5102 /* Possibly a number punctuator. */
5103 if (!ISIDNUM (*do_peek_next (pos, limit)))
5104 goto delimited_string;
5105
5106 goto quote_peek;
5107
5108 case '\"':
5109 if (!CPP_OPTION (pfile, rliterals))
5110 goto delimited_string;
5111
5112 quote_peek:
5113 {
5114 /* For ' see if it's a number punctuator
5115 \.?<digit>(<digit>|<identifier-nondigit>
5116 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5117 /* For " see if it's a raw string
5118 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5119 because that could be 0e+R. */
5120 const unsigned char *peek = pos - 1;
5121 bool quote_first = c == '"';
5122 bool quote_eight = false;
5123 bool maybe_number_start = false;
5124 bool want_number = false;
5125
5126 while ((peek = do_peek_prev (peek, lwm)))
5127 {
5128 unsigned char p = *peek;
5129 if (quote_first)
5130 {
5131 if (!raw)
5132 {
5133 if (p != 'R')
5134 break;
5135 raw = true;
5136 continue;
5137 }
5138
5139 quote_first = false;
5140 if (p == 'L' || p == 'U' || p == 'u')
5141 ;
5142 else if (p == '8')
5143 quote_eight = true;
5144 else
5145 goto second_raw;
5146 }
5147 else if (quote_eight)
5148 {
5149 if (p != 'u')
5150 {
5151 raw = false;
5152 break;
5153 }
5154 quote_eight = false;
5155 }
5156 else if (c == '"')
5157 {
5158 second_raw:;
5159 if (!want_number && ISIDNUM (p))
5160 {
5161 raw = false;
5162 break;
5163 }
5164 }
5165
5166 if (ISDIGIT (p))
5167 maybe_number_start = true;
5168 else if (p == '.')
5169 want_number = true;
5170 else if (ISIDNUM (p))
5171 maybe_number_start = false;
5172 else if (p == '+' || p == '-')
5173 {
5174 if (const unsigned char *peek_prev
5175 = do_peek_prev (peek, lwm))
5176 {
5177 p = *peek_prev;
5178 if (p == 'e' || p == 'E'
5179 || p == 'p' || p == 'P')
5180 {
5181 want_number = true;
5182 maybe_number_start = false;
5183 }
5184 else
5185 break;
5186 }
5187 else
5188 break;
5189 }
5190 else if (p == '\'' || p == '\"')
5191 {
5192 /* If this is lwm, this must be the end of a
5193 previous string. So this is a trailing
5194 literal type, (a) if those are allowed,
5195 and (b) maybe_start is false. Otherwise
5196 this must be a CPP_NUMBER because we've
5197 met another ', and we'd have checked that
5198 in its own right. */
5199 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5200 {
5201 if (!maybe_number_start && !want_number)
5202 /* Must be a literal type. */
5203 raw = false;
5204 }
5205 else if (p == '\''
5206 && CPP_OPTION (pfile, digit_separators))
5207 maybe_number_start = true;
5208 break;
5209 }
5210 else if (c == '\'')
5211 break;
5212 else if (!quote_first && !quote_eight)
5213 break;
5214 }
5215
5216 if (maybe_number_start)
5217 {
5218 if (c == '\'')
5219 /* A CPP NUMBER. */
5220 goto dflt;
5221 raw = false;
5222 }
5223
5224 goto delimited_string;
5225 }
5226
5227 delimited_string:
5228 {
5229 /* (Possibly raw) string or char literal. */
5230 unsigned char end = c;
5231 int delim_len = -1;
5232 const unsigned char *delim = NULL;
5233 location_t sloc = linemap_position_for_column (pfile->line_table,
5234 pos - line_start);
5235 int esc = 0;
5236
5237 if (raw)
5238 {
5239 /* There can be no line breaks in the delimiter. */
5240 delim = pos;
5241 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5242 {
5243 if (delim_len == 16)
5244 {
5245 cpp_error_with_line (pfile, CPP_DL_ERROR,
5246 sloc, 0,
5247 "raw string delimiter"
5248 " longer than %d"
5249 " characters",
5250 delim_len);
5251 raw = false;
5252 pos = delim;
5253 break;
5254 }
5255 if (strchr (") \\\t\v\f\n", c))
5256 {
5257 cpp_error_with_line (pfile, CPP_DL_ERROR,
5258 sloc, 0,
5259 "invalid character '%c'"
5260 " in raw string"
5261 " delimiter", c);
5262 raw = false;
5263 pos = delim;
5264 break;
5265 }
5266 if (pos >= limit)
5267 goto bad_string;
5268 }
5269 }
5270
5271 while (pos < limit)
5272 {
5273 char c = *pos++;
5274 switch (c)
5275 {
5276 case '\\':
5277 if (!raw)
5278 esc++;
5279 break;
5280
5281 case '\r':
5282 if (*pos == '\n')
5283 pos++;
5284 /* FALLTHROUGH */
5285
5286 case '\n':
5287 {
5288 CPP_INCREMENT_LINE (pfile, 0);
5289 line_count++;
5290 line_start = pos;
5291 }
5292 if (esc)
5293 esc--;
5294 break;
5295
5296 case ')':
5297 if (raw
5298 && pos + delim_len + 1 < limit
5299 && pos[delim_len] == end
5300 && !memcmp (delim, pos, delim_len))
5301 {
5302 pos += delim_len + 1;
5303 raw = false;
5304 goto done_string;
5305 }
5306 break;
5307
5308 default:
5309 if (!raw && !(esc & 1) && c == end)
5310 goto done_string;
5311 esc = 0;
5312 break;
5313 }
5314 }
5315 bad_string:
5316 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5317 "unterminated literal");
5318
5319 done_string:
5320 raw = false;
5321 lwm = pos - 1;
5322 }
5323 goto dflt;
5324
5325 case '_':
5326 case 'e':
5327 case 'i':
5328 case 'm':
5329 if (bol && module_p && !pfile->state.skipping
5330 && do_peek_module (pfile, c, pos, limit))
5331 {
5332 /* We've seen the start of a module control line.
5333 Start up the tokenizer. */
5334 pos--; /* Backup over the first character. */
5335
5336 /* Backup over whitespace to start of line. */
5337 while (pos > line_start
5338 && (pos[-1] == ' ' || pos[-1] == '\t'))
5339 pos--;
5340
5341 if (pos > base)
5342 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5343
5344 /* Prep things for directive handling. */
5345 buffer->next_line = pos;
5346 buffer->need_line = true;
5347
5348 /* Now get tokens until the PRAGMA_EOL. */
5349 do
5350 {
5351 location_t spelling;
5352 const cpp_token *tok
5353 = cpp_get_token_with_location (pfile, &spelling);
5354
5355 gcc_assert (pfile->state.in_deferred_pragma
5356 || tok->type == CPP_PRAGMA_EOL);
5357 cb (pfile, CPP_DO_token, data, tok, spelling);
5358 }
5359 while (pfile->state.in_deferred_pragma);
5360
5361 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5362 cb (pfile, CPP_DO_location, data,
5363 pfile->line_table->highest_line);
5364
5365 pfile->mi_valid = false;
5366 goto restart;
5367 }
5368 goto dflt;
5369
5370 default:
5371 dflt:
5372 bol = false;
5373 pfile->mi_valid = false;
5374 break;
5375 }
5376 }
5377
5378 if (buffer->rlimit > base && !pfile->state.skipping)
5379 {
5380 const unsigned char *limit = buffer->rlimit;
5381 /* If the file was not newline terminated, add rlimit, which is
5382 guaranteed to point to a newline, to the end of our range. */
5383 if (limit[-1] != '\n')
5384 {
5385 limit++;
5386 CPP_INCREMENT_LINE (pfile, 0);
5387 line_count++;
5388 }
5389 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5390 }
5391
5392 _cpp_pop_buffer (pfile);
5393 }
5394 while (pfile->buffer);
5395 }
5396