lex.cc revision 1.3 1 /* CPP Library - lexical analysis.
2 Copyright (C) 2000-2022 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
26
/* How the spelling of a token type is obtained; stored in the
   `category' field of each token_spellings entry.  */
enum spell_type
{
  SPELL_OPERATOR = 0,	/* Category given to every OP () table entry.  */
  SPELL_IDENT = 1,
  SPELL_LITERAL = 2,
  SPELL_NONE = 3
};
34
/* One entry of the token_spellings table.  The field order matters:
   the OP and TK table macros initialize entries positionally.  */
struct token_spelling
{
  enum spell_type category;	/* Spelling category of the token type.  */
  const unsigned char *name;	/* Fixed spelling (OP) or enumerator name (TK).  */
};
40
/* The spellings of the six digraphs.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Expand TTYPE_TABLE into one token_spelling entry per token type.
   An OP entry is always SPELL_OPERATOR and carries the string S; a TK
   entry takes its category from S (pasted onto SPELL_) and uses the
   stringified enumerator name E as its spelling.  */
#define OP(e, s) { SPELL_OPERATOR, UC s },
#define TK(e, s) { SPELL_ ## s, UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category / name of TOKEN through its type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52
/* Forward declarations of the file-local helpers.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
			    unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
66
67
68 /* Utility routine:
69
70 Compares, the token TOKEN to the NUL-terminated string STRING.
71 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
72 int
73 cpp_ideq (const cpp_token *token, const char *string)
74 {
75 if (token->type != CPP_NAME)
76 return 0;
77
78 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
79 }
80
81 /* Record a note TYPE at byte POS into the current cleaned logical
82 line. */
83 static void
84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85 {
86 if (buffer->notes_used == buffer->notes_cap)
87 {
88 buffer->notes_cap = buffer->notes_cap * 2 + 200;
89 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90 buffer->notes_cap);
91 }
92
93 buffer->notes[buffer->notes_used].pos = pos;
94 buffer->notes[buffer->notes_used].type = type;
95 buffer->notes_used++;
96 }
97
98
99 /* Fast path to find line special characters using optimized character
101 scanning algorithms. Anything complicated falls back to the slow
102 path below. Since this loop is very hot it's worth doing these kinds
103 of optimizations.
104
105 One of the paths through the ifdefs should provide
106
107 const uchar *search_line_fast (const uchar *s, const uchar *end);
108
109 Between S and END, search for \n, \r, \\, ?. Return a pointer to
110 the found character.
111
112 Note that the last character of the buffer is *always* a newline,
113 as forced by _cpp_convert_input. This fact can be used to avoid
114 explicitly looking for the end of the buffer. */
115
/* Configure only gives us an #ifdef-style test.  Supply the value 0
   when the macro is absent so that it can be used in ordinary
   expressions as well.  */
#if !defined (WORDS_BIGENDIAN)
# define WORDS_BIGENDIAN 0
#endif

/* We want the largest integer that fits into a register, and nothing
   in <stdint.h> provides that.  On most hosts it is unsigned long, but
   MS chose an LLP64 model.  When building with GCC we can ask for the
   "real" word size directly.  */
#if defined (__GNUC__)
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below only copes with word sizes of 4 or 8.  Any other size
   gives this array a negative bound, which fails to compile.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 4 || sizeof (word_type) == 8) ? 1 : -1];
135
136 /* Return X with the first N bytes forced to values that won't match one
137 of the interesting characters. Note that NUL is not interesting. */
138
139 static inline word_type
140 acc_char_mask_misalign (word_type val, unsigned int n)
141 {
142 word_type mask = -1;
143 if (WORDS_BIGENDIAN)
144 mask >>= n * 8;
145 else
146 mask <<= n * 8;
147 return val & mask;
148 }
149
150 /* Return X replicated to all byte positions within WORD_TYPE. */
151
152 static inline word_type
153 acc_char_replicate (uchar x)
154 {
155 word_type ret;
156
157 ret = (x << 24) | (x << 16) | (x << 8) | x;
158 if (sizeof(word_type) == 8)
159 ret = (ret << 16 << 16) | ret;
160 return ret;
161 }
162
163 /* Return non-zero if some byte of VAL is (probably) C. */
164
165 static inline word_type
166 acc_char_cmp (word_type val, word_type c)
167 {
168 #if defined(__GNUC__) && defined(__alpha__)
169 /* We can get exact results using a compare-bytes instruction.
170 Get (val == c) via (0 >= (val ^ c)). */
171 return __builtin_alpha_cmpbge (0, val ^ c);
172 #else
173 word_type magic = 0x7efefefeU;
174 if (sizeof(word_type) == 8)
175 magic = (magic << 16 << 16) | 0xfefefefeU;
176 magic |= 1;
177
178 val ^= c;
179 return ((val + magic) ^ ~val) & ~magic;
180 #endif
181 }
182
183 /* Given the result of acc_char_cmp is non-zero, return the index of
184 the found character. If this was a false positive, return -1. */
185
186 static inline int
187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
188 word_type val ATTRIBUTE_UNUSED)
189 {
190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
191 /* The cmpbge instruction sets *bits* of the result corresponding to
192 matches in the bytes with no false positives. */
193 return __builtin_ctzl (cmp);
194 #else
195 unsigned int i;
196
197 /* ??? It would be nice to force unrolling here,
198 and have all of these constants folded. */
199 for (i = 0; i < sizeof(word_type); ++i)
200 {
201 uchar c;
202 if (WORDS_BIGENDIAN)
203 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
204 else
205 c = (val >> i * 8) & 0xff;
206
207 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
208 return i;
209 }
210
211 return -1;
212 #endif
213 }
214
215 /* A version of the fast scanner using bit fiddling techniques.
216
217 For 32-bit words, one would normally perform 16 comparisons and
218 16 branches. With this algorithm one performs 24 arithmetic
219 operations and one branch. Whether this is faster with a 32-bit
220 word size is going to be somewhat system dependent.
221
222 For 64-bit words, we eliminate twice the number of comparisons
223 and branches without increasing the number of arithmetic operations.
224 It's almost certainly going to be a win with 64-bit word size. */
225
226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
227 ATTRIBUTE_UNUSED;
228
229 static const uchar *
230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
231 {
232 const word_type repl_nl = acc_char_replicate ('\n');
233 const word_type repl_cr = acc_char_replicate ('\r');
234 const word_type repl_bs = acc_char_replicate ('\\');
235 const word_type repl_qm = acc_char_replicate ('?');
236
237 unsigned int misalign;
238 const word_type *p;
239 word_type val, t;
240
241 /* Align the buffer. Mask out any bytes from before the beginning. */
242 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
243 val = *p;
244 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
245 if (misalign)
246 val = acc_char_mask_misalign (val, misalign);
247
248 /* Main loop. */
249 while (1)
250 {
251 t = acc_char_cmp (val, repl_nl);
252 t |= acc_char_cmp (val, repl_cr);
253 t |= acc_char_cmp (val, repl_bs);
254 t |= acc_char_cmp (val, repl_qm);
255
256 if (__builtin_expect (t != 0, 0))
257 {
258 int i = acc_char_index (t, val);
259 if (i >= 0)
260 return (const uchar *)p + i;
261 }
262
263 val = *++p;
264 }
265 }
266
267 /* Disable on Solaris 2/x86 until the following problem can be properly
268 autoconfed:
269
270 The Solaris 10+ assembler tags objects with the instruction set
271 extensions used, so SSE4.2 executables cannot run on machines that
272 don't support that extension. */
273
274 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
275
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Row 0 is '\n', row 1 is '\r', row 2 is
   '\\' and row 3 is '?'.  */
#define REPL16(C) { C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL16 ('\n'),
  REPL16 ('\r'),
  REPL16 ('\\'),
  REPL16 ('?'),
};
#undef REPL16
290
291 /* A version of the fast scanner using MMX vectorized byte compare insns.
292
293 This uses the PMOVMSKB instruction which was introduced with "MMX2",
294 which was packaged into SSE1; it is also present in the AMD MMX
295 extension. Mark the function as using "sse" so that we emit a real
296 "emms" instruction, rather than the 3dNOW "femms" instruction. */
297
298 static const uchar *
299 #ifndef __SSE__
300 __attribute__((__target__("sse")))
301 #endif
302 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
303 {
304 typedef char v8qi __attribute__ ((__vector_size__ (8)));
305 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
306
307 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
308 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
309 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
310 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
311
312 unsigned int misalign, found, mask;
313 const v8qi *p;
314 v8qi data, t, c;
315
316 /* Align the source pointer. While MMX doesn't generate unaligned data
317 faults, this allows us to safely scan to the end of the buffer without
318 reading beyond the end of the last page. */
319 misalign = (uintptr_t)s & 7;
320 p = (const v8qi *)((uintptr_t)s & -8);
321 data = *p;
322
323 /* Create a mask for the bytes that are valid within the first
324 16-byte block. The Idea here is that the AND with the mask
325 within the loop is "free", since we need some AND or TEST
326 insn in order to set the flags for the branch anyway. */
327 mask = -1u << misalign;
328
329 /* Main loop processing 8 bytes at a time. */
330 goto start;
331 do
332 {
333 data = *++p;
334 mask = -1;
335
336 start:
337 t = __builtin_ia32_pcmpeqb(data, repl_nl);
338 c = __builtin_ia32_pcmpeqb(data, repl_cr);
339 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
340 c = __builtin_ia32_pcmpeqb(data, repl_bs);
341 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
342 c = __builtin_ia32_pcmpeqb(data, repl_qm);
343 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
344 found = __builtin_ia32_pmovmskb (t);
345 found &= mask;
346 }
347 while (!found);
348
349 __builtin_ia32_emms ();
350
351 /* FOUND contains 1 in bits for which we matched a relevant
352 character. Conversion to the byte index is trivial. */
353 found = __builtin_ctz(found);
354 return (const uchar *)p + found;
355 }
356
357 /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
358
359 static const uchar *
360 #ifndef __SSE2__
361 __attribute__((__target__("sse2")))
362 #endif
363 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
364 {
365 typedef char v16qi __attribute__ ((__vector_size__ (16)));
366
367 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
368 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
369 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
370 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
371
372 unsigned int misalign, found, mask;
373 const v16qi *p;
374 v16qi data, t;
375
376 /* Align the source pointer. */
377 misalign = (uintptr_t)s & 15;
378 p = (const v16qi *)((uintptr_t)s & -16);
379 data = *p;
380
381 /* Create a mask for the bytes that are valid within the first
382 16-byte block. The Idea here is that the AND with the mask
383 within the loop is "free", since we need some AND or TEST
384 insn in order to set the flags for the branch anyway. */
385 mask = -1u << misalign;
386
387 /* Main loop processing 16 bytes at a time. */
388 goto start;
389 do
390 {
391 data = *++p;
392 mask = -1;
393
394 start:
395 t = data == repl_nl;
396 t |= data == repl_cr;
397 t |= data == repl_bs;
398 t |= data == repl_qm;
399 found = __builtin_ia32_pmovmskb128 (t);
400 found &= mask;
401 }
402 while (!found);
403
404 /* FOUND contains 1 in bits for which we matched a relevant
405 character. Conversion to the byte index is trivial. */
406 found = __builtin_ctz(found);
407 return (const uchar *)p + found;
408 }
409
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  const uintptr_t addr = (uintptr_t)s;
  uintptr_t index;

  /* Unaligned input gets one unaligned probe before the main loop.  */
  if ((addr & 15) != 0)
    {
      v16qi head;

      /* There are less than 16 bytes left in the buffer, and less
	 than 16 bytes left on the page.  Reading 16 bytes at this
	 point might generate a spurious page fault.  Defer to the
	 SSE2 implementation, which already handles alignment.  */
      if (__builtin_expect (end - s < 16, 0)
	  && __builtin_expect ((addr & 0xfff) > 0xff0, 0))
	return search_line_sse2 (s, end);

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
	 memory need not be aligned.  */
      head = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, head, 16, 0);

      /* An index below 16 is a match inside the probed bytes.  */
      if (__builtin_expect (index < 16, 0))
	return s + index;

      /* Advance the pointer to an aligned address.  We will re-scan a
	 few bytes, but we no longer need care for reading past the
	 end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((addr + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  for (;; s += 16)
    {
      char f;

      /* By using inline assembly instead of the builtin,
	 we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
	     : "=c"(index), "=@ccc"(f)
	     : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
	break;
    }
#else
  /* The assembly loop below adds 16 before each probe.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
	"0:	add $16, %1\n"
	"	%vpcmpestri\t$0, (%1), %2\n"
	"	jnc 0b"
	: "=&c"(index), "+r"(s)
	: "x"(search), "a"(4), "d"(16));
#endif

  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
490
491 /* Check the CPU capabilities. */
492
493 #include "../gcc/config/i386/cpuid.h"
494
/* Signature shared by all the scanner implementations, and the pointer
   through which the selected one is called.  The pointer is assigned
   by init_vectorized_lexer.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
static search_line_fast_type search_line_fast;
497
498 #define HAVE_init_vectorized_lexer 1
499 static inline void
500 init_vectorized_lexer (void)
501 {
502 unsigned dummy, ecx = 0, edx = 0;
503 search_line_fast_type impl = search_line_acc_char;
504 int minimum = 0;
505
506 #if defined(__SSE4_2__)
507 minimum = 3;
508 #elif defined(__SSE2__)
509 minimum = 2;
510 #elif defined(__SSE__)
511 minimum = 1;
512 #endif
513
514 if (minimum == 3)
515 impl = search_line_sse42;
516 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
517 {
518 if (minimum == 3 || (ecx & bit_SSE4_2))
519 impl = search_line_sse42;
520 else if (minimum == 2 || (edx & bit_SSE2))
521 impl = search_line_sse2;
522 else if (minimum == 1 || (edx & bit_SSE))
523 impl = search_line_mmx;
524 }
525 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
526 {
527 if (minimum == 1
528 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
529 impl = search_line_mmx;
530 }
531
532 search_line_fast = impl;
533 }
534
535 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
536
537 /* A vection of the fast scanner using AltiVec vectorized byte compares
538 and VSX unaligned loads (when VSX is available). This is otherwise
539 the same as the AltiVec version. */
540
541 ATTRIBUTE_NO_SANITIZE_UNDEFINED
542 static const uchar *
543 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
544 {
545 typedef __attribute__((altivec(vector))) unsigned char vc;
546
547 const vc repl_nl = {
548 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
549 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
550 };
551 const vc repl_cr = {
552 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
553 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
554 };
555 const vc repl_bs = {
556 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
557 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
558 };
559 const vc repl_qm = {
560 '?', '?', '?', '?', '?', '?', '?', '?',
561 '?', '?', '?', '?', '?', '?', '?', '?',
562 };
563 const vc zero = { 0 };
564
565 vc data, t;
566
567 /* Main loop processing 16 bytes at a time. */
568 do
569 {
570 vc m_nl, m_cr, m_bs, m_qm;
571
572 data = __builtin_vec_vsx_ld (0, s);
573 s += 16;
574
575 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
576 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
577 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
578 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
579 t = (m_nl | m_cr) | (m_bs | m_qm);
580
581 /* T now contains 0xff in bytes for which we matched one of the relevant
582 characters. We want to exit the loop if any byte in T is non-zero.
583 Below is the expansion of vec_any_ne(t, zero). */
584 }
585 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
586
587 /* Restore s to to point to the 16 bytes we just processed. */
588 s -= 16;
589
590 {
591 #define N (sizeof(vc) / sizeof(long))
592
593 union {
594 vc v;
595 /* Statically assert that N is 2 or 4. */
596 unsigned long l[(N == 2 || N == 4) ? N : -1];
597 } u;
598 unsigned long l, i = 0;
599
600 u.v = t;
601
602 /* Find the first word of T that is non-zero. */
603 switch (N)
604 {
605 case 4:
606 l = u.l[i++];
607 if (l != 0)
608 break;
609 s += sizeof(unsigned long);
610 l = u.l[i++];
611 if (l != 0)
612 break;
613 s += sizeof(unsigned long);
614 /* FALLTHRU */
615 case 2:
616 l = u.l[i++];
617 if (l != 0)
618 break;
619 s += sizeof(unsigned long);
620 l = u.l[i];
621 }
622
623 /* L now contains 0xff in bytes for which we matched one of the
624 relevant characters. We can find the byte index by finding
625 its bit index and dividing by 8. */
626 #ifdef __BIG_ENDIAN__
627 l = __builtin_clzl(l) >> 3;
628 #else
629 l = __builtin_ctzl(l) >> 3;
630 #endif
631 return s + l;
632
633 #undef N
634 }
635 }
636
637 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
638
639 /* A vection of the fast scanner using AltiVec vectorized byte compares.
640 This cannot be used for little endian because vec_lvsl/lvsr are
641 deprecated for little endian and the code won't work properly. */
642 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
643 so we can't compile this function without -maltivec on the command line
644 (or implied by some other switch). */
645
646 static const uchar *
647 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
648 {
649 typedef __attribute__((altivec(vector))) unsigned char vc;
650
651 const vc repl_nl = {
652 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
653 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
654 };
655 const vc repl_cr = {
656 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
657 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
658 };
659 const vc repl_bs = {
660 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
661 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
662 };
663 const vc repl_qm = {
664 '?', '?', '?', '?', '?', '?', '?', '?',
665 '?', '?', '?', '?', '?', '?', '?', '?',
666 };
667 const vc ones = {
668 -1, -1, -1, -1, -1, -1, -1, -1,
669 -1, -1, -1, -1, -1, -1, -1, -1,
670 };
671 const vc zero = { 0 };
672
673 vc data, mask, t;
674
675 /* Altivec loads automatically mask addresses with -16. This lets us
676 issue the first load as early as possible. */
677 data = __builtin_vec_ld(0, (const vc *)s);
678
679 /* Discard bytes before the beginning of the buffer. Do this by
680 beginning with all ones and shifting in zeros according to the
681 mis-alignment. The LVSR instruction pulls the exact shift we
682 want from the address. */
683 mask = __builtin_vec_lvsr(0, s);
684 mask = __builtin_vec_perm(zero, ones, mask);
685 data &= mask;
686
687 /* While altivec loads mask addresses, we still need to align S so
688 that the offset we compute at the end is correct. */
689 s = (const uchar *)((uintptr_t)s & -16);
690
691 /* Main loop processing 16 bytes at a time. */
692 goto start;
693 do
694 {
695 vc m_nl, m_cr, m_bs, m_qm;
696
697 s += 16;
698 data = __builtin_vec_ld(0, (const vc *)s);
699
700 start:
701 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
702 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
703 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
704 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
705 t = (m_nl | m_cr) | (m_bs | m_qm);
706
707 /* T now contains 0xff in bytes for which we matched one of the relevant
708 characters. We want to exit the loop if any byte in T is non-zero.
709 Below is the expansion of vec_any_ne(t, zero). */
710 }
711 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
712
713 {
714 #define N (sizeof(vc) / sizeof(long))
715
716 union {
717 vc v;
718 /* Statically assert that N is 2 or 4. */
719 unsigned long l[(N == 2 || N == 4) ? N : -1];
720 } u;
721 unsigned long l, i = 0;
722
723 u.v = t;
724
725 /* Find the first word of T that is non-zero. */
726 switch (N)
727 {
728 case 4:
729 l = u.l[i++];
730 if (l != 0)
731 break;
732 s += sizeof(unsigned long);
733 l = u.l[i++];
734 if (l != 0)
735 break;
736 s += sizeof(unsigned long);
737 /* FALLTHROUGH */
738 case 2:
739 l = u.l[i++];
740 if (l != 0)
741 break;
742 s += sizeof(unsigned long);
743 l = u.l[i];
744 }
745
746 /* L now contains 0xff in bytes for which we matched one of the
747 relevant characters. We can find the byte index by finding
748 its bit index and dividing by 8. */
749 l = __builtin_clzl(l) >> 3;
750 return s + l;
751
752 #undef N
753 }
754 }
755
756 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
757 #include "arm_neon.h"
758
759 /* This doesn't have to be the exact page size, but no system may use
760 a size smaller than this. ARMv8 requires a minimum page size of
761 4k. The impact of being conservative here is a small number of
762 cases will take the slightly slower entry path into the main
763 loop. */
764
765 #define AARCH64_MIN_PAGE_SIZE 4096
766
767 static const uchar *
768 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
769 {
770 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
771 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
772 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
773 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
774 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
775
776 #ifdef __ARM_BIG_ENDIAN
777 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
778 #else
779 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
780 #endif
781
782 unsigned int found;
783 const uint8_t *p;
784 uint8x16_t data;
785 uint8x16_t t;
786 uint16x8_t m;
787 uint8x16_t u, v, w;
788
789 /* Align the source pointer. */
790 p = (const uint8_t *)((uintptr_t)s & -16);
791
792 /* Assuming random string start positions, with a 4k page size we'll take
793 the slow path about 0.37% of the time. */
794 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
795 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
796 < 16, 0))
797 {
798 /* Slow path: the string starts near a possible page boundary. */
799 uint32_t misalign, mask;
800
801 misalign = (uintptr_t)s & 15;
802 mask = (-1u << misalign) & 0xffff;
803 data = vld1q_u8 (p);
804 t = vceqq_u8 (data, repl_nl);
805 u = vceqq_u8 (data, repl_cr);
806 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
807 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
808 t = vorrq_u8 (v, w);
809 t = vandq_u8 (t, xmask);
810 m = vpaddlq_u8 (t);
811 m = vshlq_u16 (m, shift);
812 found = vaddvq_u16 (m);
813 found &= mask;
814 if (found)
815 return (const uchar*)p + __builtin_ctz (found);
816 }
817 else
818 {
819 data = vld1q_u8 ((const uint8_t *) s);
820 t = vceqq_u8 (data, repl_nl);
821 u = vceqq_u8 (data, repl_cr);
822 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
823 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
824 t = vorrq_u8 (v, w);
825 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
826 goto done;
827 }
828
829 do
830 {
831 p += 16;
832 data = vld1q_u8 (p);
833 t = vceqq_u8 (data, repl_nl);
834 u = vceqq_u8 (data, repl_cr);
835 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
836 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
837 t = vorrq_u8 (v, w);
838 } while (!vpaddd_u64 ((uint64x2_t)t));
839
840 done:
841 /* Now that we've found the terminating substring, work out precisely where
842 we need to stop. */
843 t = vandq_u8 (t, xmask);
844 m = vpaddlq_u8 (t);
845 m = vshlq_u16 (m, shift);
846 found = vaddvq_u16 (m);
847 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
848 + __builtin_ctz (found));
849 }
850
851 #elif defined (__ARM_NEON)
852 #include "arm_neon.h"
853
854 static const uchar *
855 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
856 {
857 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
858 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
859 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
860 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
861 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
862
863 unsigned int misalign, found, mask;
864 const uint8_t *p;
865 uint8x16_t data;
866
867 /* Align the source pointer. */
868 misalign = (uintptr_t)s & 15;
869 p = (const uint8_t *)((uintptr_t)s & -16);
870 data = vld1q_u8 (p);
871
872 /* Create a mask for the bytes that are valid within the first
873 16-byte block. The Idea here is that the AND with the mask
874 within the loop is "free", since we need some AND or TEST
875 insn in order to set the flags for the branch anyway. */
876 mask = (-1u << misalign) & 0xffff;
877
878 /* Main loop, processing 16 bytes at a time. */
879 goto start;
880
881 do
882 {
883 uint8x8_t l;
884 uint16x4_t m;
885 uint32x2_t n;
886 uint8x16_t t, u, v, w;
887
888 p += 16;
889 data = vld1q_u8 (p);
890 mask = 0xffff;
891
892 start:
893 t = vceqq_u8 (data, repl_nl);
894 u = vceqq_u8 (data, repl_cr);
895 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
896 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
897 t = vandq_u8 (vorrq_u8 (v, w), xmask);
898 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
899 m = vpaddl_u8 (l);
900 n = vpaddl_u16 (m);
901
902 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
903 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
904 found &= mask;
905 }
906 while (!found);
907
908 /* FOUND contains 1 in bits for which we matched a relevant
909 character. Conversion to the byte index is trivial. */
910 found = __builtin_ctz (found);
911 return (const uchar *)p + found;
912 }
913
914 #else
915
/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  Here search_line_fast is simply another name
   for the word-at-a-time scanner.  */

#define search_line_fast search_line_acc_char
920
921 #endif
922
/* Initialize the lexer if needed.  There is work to do only when a
   run-time scanner selection routine was compiled in.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
932
933 /* Returns with a logical line that contains no escaped newlines or
934 trigraphs. This is a time-critical inner loop. */
935 void
936 _cpp_clean_line (cpp_reader *pfile)
937 {
938 cpp_buffer *buffer;
939 const uchar *s;
940 uchar c, *d, *p;
941
942 buffer = pfile->buffer;
943 buffer->cur_note = buffer->notes_used = 0;
944 buffer->cur = buffer->line_base = buffer->next_line;
945 buffer->need_line = false;
946 s = buffer->next_line;
947
948 if (!buffer->from_stage3)
949 {
950 const uchar *pbackslash = NULL;
951
952 /* Fast path. This is the common case of an un-escaped line with
953 no trigraphs. The primary win here is by not writing any
954 data back to memory until we have to. */
955 while (1)
956 {
957 /* Perform an optimized search for \n, \r, \\, ?. */
958 s = search_line_fast (s, buffer->rlimit);
959
960 c = *s;
961 if (c == '\\')
962 {
963 /* Record the location of the backslash and continue. */
964 pbackslash = s++;
965 }
966 else if (__builtin_expect (c == '?', 0))
967 {
968 if (__builtin_expect (s[1] == '?', false)
969 && _cpp_trigraph_map[s[2]])
970 {
971 /* Have a trigraph. We may or may not have to convert
972 it. Add a line note regardless, for -Wtrigraphs. */
973 add_line_note (buffer, s, s[2]);
974 if (CPP_OPTION (pfile, trigraphs))
975 {
976 /* We do, and that means we have to switch to the
977 slow path. */
978 d = (uchar *) s;
979 *d = _cpp_trigraph_map[s[2]];
980 s += 2;
981 goto slow_path;
982 }
983 }
984 /* Not a trigraph. Continue on fast-path. */
985 s++;
986 }
987 else
988 break;
989 }
990
991 /* This must be \r or \n. We're either done, or we'll be forced
992 to write back to the buffer and continue on the slow path. */
993 d = (uchar *) s;
994
995 if (__builtin_expect (s == buffer->rlimit, false))
996 goto done;
997
998 /* DOS line ending? */
999 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1000 {
1001 s++;
1002 if (s == buffer->rlimit)
1003 goto done;
1004 }
1005
1006 if (__builtin_expect (pbackslash == NULL, true))
1007 goto done;
1008
1009 /* Check for escaped newline. */
1010 p = d;
1011 while (is_nvspace (p[-1]))
1012 p--;
1013 if (p - 1 != pbackslash)
1014 goto done;
1015
1016 /* Have an escaped newline; process it and proceed to
1017 the slow path. */
1018 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1019 d = p - 2;
1020 buffer->next_line = p - 1;
1021
1022 slow_path:
1023 while (1)
1024 {
1025 c = *++s;
1026 *++d = c;
1027
1028 if (c == '\n' || c == '\r')
1029 {
1030 /* Handle DOS line endings. */
1031 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1032 s++;
1033 if (s == buffer->rlimit)
1034 break;
1035
1036 /* Escaped? */
1037 p = d;
1038 while (p != buffer->next_line && is_nvspace (p[-1]))
1039 p--;
1040 if (p == buffer->next_line || p[-1] != '\\')
1041 break;
1042
1043 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1044 d = p - 2;
1045 buffer->next_line = p - 1;
1046 }
1047 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1048 {
1049 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1050 add_line_note (buffer, d, s[2]);
1051 if (CPP_OPTION (pfile, trigraphs))
1052 {
1053 *d = _cpp_trigraph_map[s[2]];
1054 s += 2;
1055 }
1056 }
1057 }
1058 }
1059 else
1060 {
1061 while (*s != '\n' && *s != '\r')
1062 s++;
1063 d = (uchar *) s;
1064
1065 /* Handle DOS line endings. */
1066 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1067 s++;
1068 }
1069
1070 done:
1071 *d = '\n';
1072 /* A sentinel note that should never be processed. */
1073 add_line_note (buffer, d + 1, '\n');
1074 buffer->next_line = s + 1;
1075 }
1076
1077 /* Return true if the trigraph indicated by NOTE should be warned
1078 about in a comment. */
1079 static bool
1080 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1081 {
1082 const uchar *p;
1083
1084 /* Within comments we don't warn about trigraphs, unless the
1085 trigraph forms an escaped newline, as that may change
1086 behavior. */
1087 if (note->type != '/')
1088 return false;
1089
1090 /* If -trigraphs, then this was an escaped newline iff the next note
1091 is coincident. */
1092 if (CPP_OPTION (pfile, trigraphs))
1093 return note[1].pos == note->pos;
1094
1095 /* Otherwise, see if this forms an escaped newline. */
1096 p = note->pos + 3;
1097 while (is_nvspace (*p))
1098 p++;
1099
1100 /* There might have been escaped newlines between the trigraph and the
1101 newline we found. Hence the position test. */
1102 return (*p == '\n' && p < note[1].pos);
1103 }
1104
1105 /* Process the notes created by add_line_note as far as the current
1106 location. */
1107 void
1108 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1109 {
1110 cpp_buffer *buffer = pfile->buffer;
1111
1112 for (;;)
1113 {
1114 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1115 unsigned int col;
1116
1117 if (note->pos > buffer->cur)
1118 break;
1119
1120 buffer->cur_note++;
1121 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1122
1123 if (note->type == '\\' || note->type == ' ')
1124 {
1125 if (note->type == ' ' && !in_comment)
1126 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1127 "backslash and newline separated by space");
1128
1129 if (buffer->next_line > buffer->rlimit)
1130 {
1131 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1132 "backslash-newline at end of file");
1133 /* Prevent "no newline at end of file" warning. */
1134 buffer->next_line = buffer->rlimit;
1135 }
1136
1137 buffer->line_base = note->pos;
1138 CPP_INCREMENT_LINE (pfile, 0);
1139 }
1140 else if (_cpp_trigraph_map[note->type])
1141 {
1142 if (CPP_OPTION (pfile, warn_trigraphs)
1143 && (!in_comment || warn_in_comment (pfile, note)))
1144 {
1145 if (CPP_OPTION (pfile, trigraphs))
1146 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1147 pfile->line_table->highest_line, col,
1148 "trigraph ??%c converted to %c",
1149 note->type,
1150 (int) _cpp_trigraph_map[note->type]);
1151 else
1152 {
1153 cpp_warning_with_line
1154 (pfile, CPP_W_TRIGRAPHS,
1155 pfile->line_table->highest_line, col,
1156 "trigraph ??%c ignored, use -trigraphs to enable",
1157 note->type);
1158 }
1159 }
1160 }
1161 else if (note->type == 0)
1162 /* Already processed in lex_raw_string. */;
1163 else
1164 abort ();
1165 }
1166 }
1167
1168 namespace bidi {
1169 enum class kind {
1170 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1171 };
1172
1173 /* All the UTF-8 encodings of bidi characters start with E2. */
1174 constexpr uchar utf8_start = 0xe2;
1175
1176 struct context
1177 {
1178 context () {}
1179 context (location_t loc, kind k, bool pdf, bool ucn)
1180 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1181 {
1182 }
1183
1184 kind get_pop_kind () const
1185 {
1186 return m_pdf ? kind::PDF : kind::PDI;
1187 }
1188 bool ucn_p () const
1189 {
1190 return m_ucn;
1191 }
1192
1193 location_t m_loc;
1194 kind m_kind;
1195 unsigned m_pdf : 1;
1196 unsigned m_ucn : 1;
1197 };
1198
1199 /* A vector holding currently open bidi contexts. We use a char for
1200 each context, its LSB is 1 if it represents a PDF context, 0 if it
1201 represents a PDI context. The next bit is 1 if this context was open
1202 by a bidi character written as a UCN, and 0 when it was UTF-8. */
1203 semi_embedded_vec <context, 16> vec;
1204
1205 /* Close the whole comment/identifier/string literal/character constant
1206 context. */
1207 void on_close ()
1208 {
1209 vec.truncate (0);
1210 }
1211
1212 /* Pop the last element in the vector. */
1213 void pop ()
1214 {
1215 unsigned int len = vec.count ();
1216 gcc_checking_assert (len > 0);
1217 vec.truncate (len - 1);
1218 }
1219
1220 /* Return the pop kind of the context of the Ith element. */
1221 kind pop_kind_at (unsigned int i)
1222 {
1223 return vec[i].get_pop_kind ();
1224 }
1225
1226 /* Return the pop kind of the context that is currently opened. */
1227 kind current_ctx ()
1228 {
1229 unsigned int len = vec.count ();
1230 if (len == 0)
1231 return kind::NONE;
1232 return vec[len - 1].get_pop_kind ();
1233 }
1234
1235 /* Return true if the current context comes from a UCN origin, that is,
1236 the bidi char which started this bidi context was written as a UCN. */
1237 bool current_ctx_ucn_p ()
1238 {
1239 unsigned int len = vec.count ();
1240 gcc_checking_assert (len > 0);
1241 return vec[len - 1].m_ucn;
1242 }
1243
1244 location_t current_ctx_loc ()
1245 {
1246 unsigned int len = vec.count ();
1247 gcc_checking_assert (len > 0);
1248 return vec[len - 1].m_loc;
1249 }
1250
1251 /* We've read a bidi char, update the current vector as necessary.
1252 LOC is only valid when K is not kind::NONE. */
1253 void on_char (kind k, bool ucn_p, location_t loc)
1254 {
1255 switch (k)
1256 {
1257 case kind::LRE:
1258 case kind::RLE:
1259 case kind::LRO:
1260 case kind::RLO:
1261 vec.push (context (loc, k, true, ucn_p));
1262 break;
1263 case kind::LRI:
1264 case kind::RLI:
1265 case kind::FSI:
1266 vec.push (context (loc, k, false, ucn_p));
1267 break;
1268 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1269 whose scope has not yet been terminated. */
1270 case kind::PDF:
1271 if (current_ctx () == kind::PDF)
1272 pop ();
1273 break;
1274 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1275 scope has not yet been terminated, as well as the scopes of
1276 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1277 yet been terminated. */
1278 case kind::PDI:
1279 for (int i = vec.count () - 1; i >= 0; --i)
1280 if (pop_kind_at (i) == kind::PDI)
1281 {
1282 vec.truncate (i);
1283 break;
1284 }
1285 break;
1286 case kind::LTR:
1287 case kind::RTL:
1288 /* These aren't popped by a PDF/PDI. */
1289 break;
1290 ATTR_LIKELY case kind::NONE:
1291 break;
1292 default:
1293 abort ();
1294 }
1295 }
1296
1297 /* Return a descriptive string for K. */
1298 const char *to_str (kind k)
1299 {
1300 switch (k)
1301 {
1302 case kind::LRE:
1303 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1304 case kind::RLE:
1305 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1306 case kind::LRO:
1307 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1308 case kind::RLO:
1309 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1310 case kind::LRI:
1311 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1312 case kind::RLI:
1313 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1314 case kind::FSI:
1315 return "U+2068 (FIRST STRONG ISOLATE)";
1316 case kind::PDF:
1317 return "U+202C (POP DIRECTIONAL FORMATTING)";
1318 case kind::PDI:
1319 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1320 case kind::LTR:
1321 return "U+200E (LEFT-TO-RIGHT MARK)";
1322 case kind::RTL:
1323 return "U+200F (RIGHT-TO-LEFT MARK)";
1324 default:
1325 abort ();
1326 }
1327 }
1328 }
1329
1330 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1331 within the current line in FILE, with the caret at START. */
1332
1333 static location_t
1334 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1335 const unsigned char *const start,
1336 size_t num_bytes)
1337 {
1338 gcc_checking_assert (num_bytes > 0);
1339
1340 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1341 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1342 whereas linemap_position_for_column is 1-based. */
1343
1344 /* Get 0-based offsets within the line. */
1345 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1346 size_t end_offset = start_offset + num_bytes - 1;
1347
1348 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1349 location_t start_loc = linemap_position_for_column (pfile->line_table,
1350 start_offset + 1);
1351 location_t end_loc = linemap_position_for_column (pfile->line_table,
1352 end_offset + 1);
1353
1354 if (start_loc == end_loc)
1355 return start_loc;
1356
1357 source_range src_range;
1358 src_range.m_start = start_loc;
1359 src_range.m_finish = end_loc;
1360 location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1361 start_loc,
1362 src_range,
1363 NULL);
1364 return combined_loc;
1365 }
1366
1367 /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1368
1369 static bidi::kind
1370 get_bidi_utf8_1 (const unsigned char *const p)
1371 {
1372 gcc_checking_assert (p[0] == bidi::utf8_start);
1373
1374 if (p[1] == 0x80)
1375 switch (p[2])
1376 {
1377 case 0xaa:
1378 return bidi::kind::LRE;
1379 case 0xab:
1380 return bidi::kind::RLE;
1381 case 0xac:
1382 return bidi::kind::PDF;
1383 case 0xad:
1384 return bidi::kind::LRO;
1385 case 0xae:
1386 return bidi::kind::RLO;
1387 case 0x8e:
1388 return bidi::kind::LTR;
1389 case 0x8f:
1390 return bidi::kind::RTL;
1391 default:
1392 break;
1393 }
1394 else if (p[1] == 0x81)
1395 switch (p[2])
1396 {
1397 case 0xa6:
1398 return bidi::kind::LRI;
1399 case 0xa7:
1400 return bidi::kind::RLI;
1401 case 0xa8:
1402 return bidi::kind::FSI;
1403 case 0xa9:
1404 return bidi::kind::PDI;
1405 default:
1406 break;
1407 }
1408
1409 return bidi::kind::NONE;
1410 }
1411
1412 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1413 If the kind is not NONE, write the location to *OUT.*/
1414
1415 static bidi::kind
1416 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1417 {
1418 bidi::kind result = get_bidi_utf8_1 (p);
1419 if (result != bidi::kind::NONE)
1420 {
1421 /* We have a sequence of 3 bytes starting at P. */
1422 *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1423 }
1424 return result;
1425 }
1426
1427 /* Parse a UCN where P points just past \u or \U and return its bidi code. */
1428
1429 static bidi::kind
1430 get_bidi_ucn_1 (const unsigned char *p, bool is_U)
1431 {
1432 /* 6.4.3 Universal Character Names
1433 \u hex-quad
1434 \U hex-quad hex-quad
1435 where \unnnn means \U0000nnnn. */
1436
1437 if (is_U)
1438 {
1439 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1440 return bidi::kind::NONE;
1441 /* Skip 4B so we can treat \u and \U the same below. */
1442 p += 4;
1443 }
1444
1445 /* All code points we are looking for start with 20xx. */
1446 if (p[0] != '2' || p[1] != '0')
1447 return bidi::kind::NONE;
1448 else if (p[2] == '2')
1449 switch (p[3])
1450 {
1451 case 'a':
1452 case 'A':
1453 return bidi::kind::LRE;
1454 case 'b':
1455 case 'B':
1456 return bidi::kind::RLE;
1457 case 'c':
1458 case 'C':
1459 return bidi::kind::PDF;
1460 case 'd':
1461 case 'D':
1462 return bidi::kind::LRO;
1463 case 'e':
1464 case 'E':
1465 return bidi::kind::RLO;
1466 default:
1467 break;
1468 }
1469 else if (p[2] == '6')
1470 switch (p[3])
1471 {
1472 case '6':
1473 return bidi::kind::LRI;
1474 case '7':
1475 return bidi::kind::RLI;
1476 case '8':
1477 return bidi::kind::FSI;
1478 case '9':
1479 return bidi::kind::PDI;
1480 default:
1481 break;
1482 }
1483 else if (p[2] == '0')
1484 switch (p[3])
1485 {
1486 case 'e':
1487 case 'E':
1488 return bidi::kind::LTR;
1489 case 'f':
1490 case 'F':
1491 return bidi::kind::RTL;
1492 default:
1493 break;
1494 }
1495
1496 return bidi::kind::NONE;
1497 }
1498
1499 /* Parse a UCN where P points just past \u or \U and return its bidi code.
1500 If the kind is not NONE, write the location to *OUT.*/
1501
1502 static bidi::kind
1503 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1504 location_t *out)
1505 {
1506 bidi::kind result = get_bidi_ucn_1 (p, is_U);
1507 if (result != bidi::kind::NONE)
1508 {
1509 const unsigned char *start = p - 2;
1510 size_t num_bytes = 2 + (is_U ? 8 : 4);
1511 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1512 }
1513 return result;
1514 }
1515
1516 /* Subclass of rich_location for reporting on unpaired UTF-8
1517 bidirectional control character(s).
1518 Escape the source lines on output, and show all unclosed
1519 bidi context, labelling everything. */
1520
1521 class unpaired_bidi_rich_location : public rich_location
1522 {
1523 public:
1524 class custom_range_label : public range_label
1525 {
1526 public:
1527 label_text get_text (unsigned range_idx) const FINAL OVERRIDE
1528 {
1529 /* range 0 is the primary location; each subsequent range i + 1
1530 is for bidi::vec[i]. */
1531 if (range_idx > 0)
1532 {
1533 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1534 return label_text::borrow (bidi::to_str (ctxt.m_kind));
1535 }
1536 else
1537 return label_text::borrow (_("end of bidirectional context"));
1538 }
1539 };
1540
1541 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1542 : rich_location (pfile->line_table, loc, &m_custom_label)
1543 {
1544 set_escape_on_output (true);
1545 for (unsigned i = 0; i < bidi::vec.count (); i++)
1546 add_range (bidi::vec[i].m_loc,
1547 SHOW_RANGE_WITHOUT_CARET,
1548 &m_custom_label);
1549 }
1550
1551 private:
1552 custom_range_label m_custom_label;
1553 };
1554
1555 /* We're closing a bidi context, that is, we've encountered a newline,
1556 are closing a C-style comment, or are at the end of a string literal,
1557 character constant, or identifier. Warn if this context was not
1558 properly terminated by a PDI or PDF. P points to the last character
1559 in this context. */
1560
1561 static void
1562 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1563 {
1564 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1565 if (bidi::vec.count () > 0
1566 && (warn_bidi & bidirectional_unpaired
1567 && (!bidi::current_ctx_ucn_p ()
1568 || (warn_bidi & bidirectional_ucn))))
1569 {
1570 const location_t loc
1571 = linemap_position_for_column (pfile->line_table,
1572 CPP_BUF_COLUMN (pfile->buffer, p));
1573 unpaired_bidi_rich_location rich_loc (pfile, loc);
1574 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1575 forms of a diagnostic, so fake it for now. */
1576 if (bidi::vec.count () > 1)
1577 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1578 "unpaired UTF-8 bidirectional control characters "
1579 "detected");
1580 else
1581 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1582 "unpaired UTF-8 bidirectional control character "
1583 "detected");
1584 }
1585 /* We're done with this context. */
1586 bidi::on_close ();
1587 }
1588
1589 /* We're at the beginning or in the middle of an identifier/comment/string
1590 literal/character constant. Warn if we've encountered a bidi character.
1591 KIND says which bidi control character it was; UCN_P is true iff this bidi
1592 control character was written as a UCN. LOC is the location of the
1593 character, but is only valid if KIND != bidi::kind::NONE. */
1594
1595 static void
1596 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1597 bool ucn_p, location_t loc)
1598 {
1599 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1600 return;
1601
1602 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1603
1604 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1605 {
1606 rich_location rich_loc (pfile->line_table, loc);
1607 rich_loc.set_escape_on_output (true);
1608
1609 /* It seems excessive to warn about a PDI/PDF that is closing
1610 an opened context because we've already warned about the
1611 opening character. Except warn when we have a UCN x UTF-8
1612 mismatch, if UCN checking is enabled. */
1613 if (kind == bidi::current_ctx ())
1614 {
1615 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1616 && bidi::current_ctx_ucn_p () != ucn_p)
1617 {
1618 rich_loc.add_range (bidi::current_ctx_loc ());
1619 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1620 "UTF-8 vs UCN mismatch when closing "
1621 "a context by \"%s\"", bidi::to_str (kind));
1622 }
1623 }
1624 else if (warn_bidi & bidirectional_any
1625 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1626 {
1627 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1628 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1629 "\"%s\" is closing an unopened context",
1630 bidi::to_str (kind));
1631 else
1632 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1633 "found problematic Unicode character \"%s\"",
1634 bidi::to_str (kind));
1635 }
1636 }
1637 /* We're done with this context. */
1638 bidi::on_char (kind, ucn_p, loc);
1639 }
1640
1641 /* Skip a C-style block comment. We find the end of the comment by
1642 seeing if an asterisk is before every '/' we encounter. Returns
1643 nonzero if comment terminated by EOF, zero otherwise.
1644
1645 Buffer->cur points to the initial asterisk of the comment. */
1646 bool
1647 _cpp_skip_block_comment (cpp_reader *pfile)
1648 {
1649 cpp_buffer *buffer = pfile->buffer;
1650 const uchar *cur = buffer->cur;
1651 uchar c;
1652 const bool warn_bidi_p = pfile->warn_bidi_p ();
1653
1654 cur++;
1655 if (*cur == '/')
1656 cur++;
1657
1658 for (;;)
1659 {
1660 /* People like decorating comments with '*', so check for '/'
1661 instead for efficiency. */
1662 c = *cur++;
1663
1664 if (c == '/')
1665 {
1666 if (cur[-2] == '*')
1667 {
1668 if (warn_bidi_p)
1669 maybe_warn_bidi_on_close (pfile, cur);
1670 break;
1671 }
1672
1673 /* Warn about potential nested comments, but not if the '/'
1674 comes immediately before the true comment delimiter.
1675 Don't bother to get it right across escaped newlines. */
1676 if (CPP_OPTION (pfile, warn_comments)
1677 && cur[0] == '*' && cur[1] != '/')
1678 {
1679 buffer->cur = cur;
1680 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1681 pfile->line_table->highest_line,
1682 CPP_BUF_COL (buffer),
1683 "\"/*\" within comment");
1684 }
1685 }
1686 else if (c == '\n')
1687 {
1688 unsigned int cols;
1689 buffer->cur = cur - 1;
1690 if (warn_bidi_p)
1691 maybe_warn_bidi_on_close (pfile, cur);
1692 _cpp_process_line_notes (pfile, true);
1693 if (buffer->next_line >= buffer->rlimit)
1694 return true;
1695 _cpp_clean_line (pfile);
1696
1697 cols = buffer->next_line - buffer->line_base;
1698 CPP_INCREMENT_LINE (pfile, cols);
1699
1700 cur = buffer->cur;
1701 }
1702 /* If this is a beginning of a UTF-8 encoding, it might be
1703 a bidirectional control character. */
1704 else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
1705 {
1706 location_t loc;
1707 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1708 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1709 }
1710 }
1711
1712 buffer->cur = cur;
1713 _cpp_process_line_notes (pfile, true);
1714 return false;
1715 }
1716
1717 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1718 terminating newline. Handles escaped newlines. Returns nonzero
1719 if a multiline comment. */
1720 static int
1721 skip_line_comment (cpp_reader *pfile)
1722 {
1723 cpp_buffer *buffer = pfile->buffer;
1724 location_t orig_line = pfile->line_table->highest_line;
1725 const bool warn_bidi_p = pfile->warn_bidi_p ();
1726
1727 if (!warn_bidi_p)
1728 while (*buffer->cur != '\n')
1729 buffer->cur++;
1730 else
1731 {
1732 while (*buffer->cur != '\n'
1733 && *buffer->cur != bidi::utf8_start)
1734 buffer->cur++;
1735 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1736 {
1737 while (*buffer->cur != '\n')
1738 {
1739 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1740 {
1741 location_t loc;
1742 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1743 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1744 }
1745 buffer->cur++;
1746 }
1747 maybe_warn_bidi_on_close (pfile, buffer->cur);
1748 }
1749 }
1750
1751 _cpp_process_line_notes (pfile, true);
1752 return orig_line != pfile->line_table->highest_line;
1753 }
1754
1755 /* Skips whitespace, saving the next non-whitespace character. */
1756 static void
1757 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1758 {
1759 cpp_buffer *buffer = pfile->buffer;
1760 bool saw_NUL = false;
1761
1762 do
1763 {
1764 /* Horizontal space always OK. */
1765 if (c == ' ' || c == '\t')
1766 ;
1767 /* Just \f \v or \0 left. */
1768 else if (c == '\0')
1769 saw_NUL = true;
1770 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1771 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1772 CPP_BUF_COL (buffer),
1773 "%s in preprocessing directive",
1774 c == '\f' ? "form feed" : "vertical tab");
1775
1776 c = *buffer->cur++;
1777 }
1778 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1779 while (is_nvspace (c));
1780
1781 if (saw_NUL)
1782 {
1783 encoding_rich_location rich_loc (pfile);
1784 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1785 "null character(s) ignored");
1786 }
1787
1788 buffer->cur--;
1789 }
1790
1791 /* See if the characters of a number token are valid in a name (no
1792 '.', '+' or '-'). */
1793 static int
1794 name_p (cpp_reader *pfile, const cpp_string *string)
1795 {
1796 unsigned int i;
1797
1798 for (i = 0; i < string->len; i++)
1799 if (!is_idchar (string->text[i]))
1800 return 0;
1801
1802 return 1;
1803 }
1804
1805 /* After parsing an identifier or other sequence, produce a warning about
1806 sequences not in NFC/NFKC. */
1807 static void
1808 warn_about_normalization (cpp_reader *pfile,
1809 const cpp_token *token,
1810 const struct normalize_state *s)
1811 {
1812 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1813 && !pfile->state.skipping)
1814 {
1815 location_t loc = token->src_loc;
1816
1817 /* If possible, create a location range for the token. */
1818 if (loc >= RESERVED_LOCATION_COUNT
1819 && token->type != CPP_EOF
1820 /* There must be no line notes to process. */
1821 && (!(pfile->buffer->cur
1822 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
1823 && !pfile->overlaid_buffer)))
1824 {
1825 source_range tok_range;
1826 tok_range.m_start = loc;
1827 tok_range.m_finish
1828 = linemap_position_for_column (pfile->line_table,
1829 CPP_BUF_COLUMN (pfile->buffer,
1830 pfile->buffer->cur));
1831 loc = COMBINE_LOCATION_DATA (pfile->line_table,
1832 loc, tok_range, NULL);
1833 }
1834
1835 encoding_rich_location rich_loc (pfile, loc);
1836
1837 /* Make sure that the token is printed using UCNs, even
1838 if we'd otherwise happily print UTF-8. */
1839 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1840 size_t sz;
1841
1842 sz = cpp_spell_token (pfile, token, buf, false) - buf;
1843 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1844 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1845 "`%.*s' is not in NFKC", (int) sz, buf);
1846 else if (CPP_OPTION (pfile, cplusplus))
1847 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1848 "`%.*s' is not in NFC", (int) sz, buf);
1849 else
1850 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1851 "`%.*s' is not in NFC", (int) sz, buf);
1852 free (buf);
1853 }
1854 }
1855
1856 static const cppchar_t utf8_signifier = 0xC0;
1857
1858 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1859 an identifier. FIRST is TRUE if this starts an identifier. */
1860
1861 static bool
1862 forms_identifier_p (cpp_reader *pfile, int first,
1863 struct normalize_state *state)
1864 {
1865 cpp_buffer *buffer = pfile->buffer;
1866 const bool warn_bidi_p = pfile->warn_bidi_p ();
1867
1868 if (*buffer->cur == '$')
1869 {
1870 if (!CPP_OPTION (pfile, dollars_in_ident))
1871 return false;
1872
1873 buffer->cur++;
1874 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1875 {
1876 CPP_OPTION (pfile, warn_dollars) = 0;
1877 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1878 }
1879
1880 return true;
1881 }
1882
1883 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
1884 if (CPP_OPTION (pfile, extended_identifiers))
1885 {
1886 cppchar_t s;
1887 if (*buffer->cur >= utf8_signifier)
1888 {
1889 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
1890 && warn_bidi_p)
1891 {
1892 location_t loc;
1893 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1894 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1895 }
1896 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1897 state, &s))
1898 return true;
1899 }
1900 else if (*buffer->cur == '\\'
1901 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1902 {
1903 buffer->cur += 2;
1904 if (warn_bidi_p)
1905 {
1906 location_t loc;
1907 bidi::kind kind = get_bidi_ucn (pfile,
1908 buffer->cur,
1909 buffer->cur[-1] == 'U',
1910 &loc);
1911 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
1912 }
1913 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1914 state, &s, NULL, NULL))
1915 return true;
1916 buffer->cur -= 2;
1917 }
1918 }
1919
1920 return false;
1921 }
1922
1923 /* Helper function to issue error about improper __VA_OPT__ use. */
1924 static void
1925 maybe_va_opt_error (cpp_reader *pfile)
1926 {
1927 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1928 {
1929 /* __VA_OPT__ should not be accepted at all, but allow it in
1930 system headers. */
1931 if (!_cpp_in_system_header (pfile))
1932 cpp_error (pfile, CPP_DL_PEDWARN,
1933 "__VA_OPT__ is not available until C++20");
1934 }
1935 else if (!pfile->state.va_args_ok)
1936 {
1937 /* __VA_OPT__ should only appear in the replacement list of a
1938 variadic macro. */
1939 cpp_error (pfile, CPP_DL_PEDWARN,
1940 "__VA_OPT__ can only appear in the expansion"
1941 " of a C++20 variadic macro");
1942 }
1943 }
1944
1945 /* Helper function to get the cpp_hashnode of the identifier BASE. */
1946 static cpp_hashnode *
1947 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1948 {
1949 cpp_hashnode *result;
1950 const uchar *cur;
1951 unsigned int len;
1952 unsigned int hash = HT_HASHSTEP (0, *base);
1953
1954 cur = base + 1;
1955 while (ISIDNUM (*cur))
1956 {
1957 hash = HT_HASHSTEP (hash, *cur);
1958 cur++;
1959 }
1960 len = cur - base;
1961 hash = HT_HASHFINISH (hash, len);
1962 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1963 base, len, hash, HT_ALLOC));
1964
1965 /* Rarely, identifiers require diagnostics when lexed. */
1966 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1967 && !pfile->state.skipping, 0))
1968 {
1969 /* It is allowed to poison the same identifier twice. */
1970 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1971 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1972 NODE_NAME (result));
1973
1974 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1975 replacement list of a variadic macro. */
1976 if (result == pfile->spec_nodes.n__VA_ARGS__
1977 && !pfile->state.va_args_ok)
1978 {
1979 if (CPP_OPTION (pfile, cplusplus))
1980 cpp_error (pfile, CPP_DL_PEDWARN,
1981 "__VA_ARGS__ can only appear in the expansion"
1982 " of a C++11 variadic macro");
1983 else
1984 cpp_error (pfile, CPP_DL_PEDWARN,
1985 "__VA_ARGS__ can only appear in the expansion"
1986 " of a C99 variadic macro");
1987 }
1988
1989 if (result == pfile->spec_nodes.n__VA_OPT__)
1990 maybe_va_opt_error (pfile);
1991
1992 /* For -Wc++-compat, warn about use of C++ named operators. */
1993 if (result->flags & NODE_WARN_OPERATOR)
1994 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1995 "identifier \"%s\" is a special operator name in C++",
1996 NODE_NAME (result));
1997 }
1998
1999 return result;
2000 }
2001
2002 /* Get the cpp_hashnode of an identifier specified by NAME in
2003 the current cpp_reader object. If none is found, NULL is returned. */
2004 cpp_hashnode *
2005 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2006 {
2007 cpp_hashnode *result;
2008 result = lex_identifier_intern (pfile, (uchar *) name);
2009 return result;
2010 }
2011
2012 /* Lex an identifier starting at BUFFER->CUR - 1. */
2013 static cpp_hashnode *
2014 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2015 struct normalize_state *nst, cpp_hashnode **spelling)
2016 {
2017 cpp_hashnode *result;
2018 const uchar *cur;
2019 unsigned int len;
2020 unsigned int hash = HT_HASHSTEP (0, *base);
2021 const bool warn_bidi_p = pfile->warn_bidi_p ();
2022
2023 cur = pfile->buffer->cur;
2024 if (! starts_ucn)
2025 {
2026 while (ISIDNUM (*cur))
2027 {
2028 hash = HT_HASHSTEP (hash, *cur);
2029 cur++;
2030 }
2031 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2032 }
2033 pfile->buffer->cur = cur;
2034 if (starts_ucn || forms_identifier_p (pfile, false, nst))
2035 {
2036 /* Slower version for identifiers containing UCNs
2037 or extended chars (including $). */
2038 do {
2039 while (ISIDNUM (*pfile->buffer->cur))
2040 {
2041 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2042 pfile->buffer->cur++;
2043 }
2044 } while (forms_identifier_p (pfile, false, nst));
2045 if (warn_bidi_p)
2046 maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2047 result = _cpp_interpret_identifier (pfile, base,
2048 pfile->buffer->cur - base);
2049 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2050 }
2051 else
2052 {
2053 len = cur - base;
2054 hash = HT_HASHFINISH (hash, len);
2055
2056 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2057 base, len, hash, HT_ALLOC));
2058 *spelling = result;
2059 }
2060
2061 /* Rarely, identifiers require diagnostics when lexed. */
2062 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2063 && !pfile->state.skipping, 0))
2064 {
2065 /* It is allowed to poison the same identifier twice. */
2066 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2067 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2068 NODE_NAME (result));
2069
2070 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2071 replacement list of a variadic macro. */
2072 if (result == pfile->spec_nodes.n__VA_ARGS__
2073 && !pfile->state.va_args_ok)
2074 {
2075 if (CPP_OPTION (pfile, cplusplus))
2076 cpp_error (pfile, CPP_DL_PEDWARN,
2077 "__VA_ARGS__ can only appear in the expansion"
2078 " of a C++11 variadic macro");
2079 else
2080 cpp_error (pfile, CPP_DL_PEDWARN,
2081 "__VA_ARGS__ can only appear in the expansion"
2082 " of a C99 variadic macro");
2083 }
2084
2085 /* __VA_OPT__ should only appear in the replacement list of a
2086 variadic macro. */
2087 if (result == pfile->spec_nodes.n__VA_OPT__)
2088 maybe_va_opt_error (pfile);
2089
2090 /* For -Wc++-compat, warn about use of C++ named operators. */
2091 if (result->flags & NODE_WARN_OPERATOR)
2092 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2093 "identifier \"%s\" is a special operator name in C++",
2094 NODE_NAME (result));
2095 }
2096
2097 return result;
2098 }
2099
2100 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2101 static void
2102 lex_number (cpp_reader *pfile, cpp_string *number,
2103 struct normalize_state *nst)
2104 {
2105 const uchar *cur;
2106 const uchar *base;
2107 uchar *dest;
2108
2109 base = pfile->buffer->cur - 1;
2110 do
2111 {
2112 const uchar *adj_digit_sep = NULL;
2113 cur = pfile->buffer->cur;
2114
2115 /* N.B. ISIDNUM does not include $. */
2116 while (ISIDNUM (*cur)
2117 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2118 || DIGIT_SEP (*cur)
2119 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2120 {
2121 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2122 /* Adjacent digit separators do not form part of the pp-number syntax.
2123 However, they can safely be diagnosed here as an error, since '' is
2124 not a valid preprocessing token. */
2125 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2126 adj_digit_sep = cur;
2127 cur++;
2128 }
2129 /* A number can't end with a digit separator. */
2130 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2131 --cur;
2132 if (adj_digit_sep && adj_digit_sep < cur)
2133 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2134
2135 pfile->buffer->cur = cur;
2136 }
2137 while (forms_identifier_p (pfile, false, nst));
2138
2139 number->len = cur - base;
2140 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2141 memcpy (dest, base, number->len);
2142 dest[number->len] = '\0';
2143 number->text = dest;
2144 }
2145
2146 /* Create a token of type TYPE with a literal spelling. */
2147 static void
2148 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2149 unsigned int len, enum cpp_ttype type)
2150 {
2151 token->type = type;
2152 token->val.str.len = len;
2153 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2154 }
2155
2156 const uchar *
2157 cpp_alloc_token_string (cpp_reader *pfile,
2158 const unsigned char *ptr, unsigned len)
2159 {
2160 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2161
2162 dest[len] = 0;
2163 memcpy (dest, ptr, len);
2164 return dest;
2165 }
2166
2167 /* A pair of raw buffer pointers. The currently open one is [1], the
2168 first one is [0]. Used for string literal lexing. */
2169 struct lit_accum {
2170 _cpp_buff *first;
2171 _cpp_buff *last;
2172 const uchar *rpos;
2173 size_t accum;
2174
2175 lit_accum ()
2176 : first (NULL), last (NULL), rpos (0), accum (0)
2177 {
2178 }
2179
2180 void append (cpp_reader *, const uchar *, size_t);
2181
2182 void read_begin (cpp_reader *);
2183 bool reading_p () const
2184 {
2185 return rpos != NULL;
2186 }
2187 char read_char ()
2188 {
2189 char c = *rpos++;
2190 if (rpos == BUFF_FRONT (last))
2191 rpos = NULL;
2192 return c;
2193 }
2194 };
2195
2196 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2197 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2198
2199 void
2200 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2201 {
2202 if (!last)
2203 /* Starting. */
2204 first = last = _cpp_get_buff (pfile, len);
2205 else if (len > BUFF_ROOM (last))
2206 {
2207 /* There is insufficient room in the buffer. Copy what we can,
2208 and then either extend or create a new one. */
2209 size_t room = BUFF_ROOM (last);
2210 memcpy (BUFF_FRONT (last), base, room);
2211 BUFF_FRONT (last) += room;
2212 base += room;
2213 len -= room;
2214 accum += room;
2215
2216 gcc_checking_assert (!rpos);
2217
2218 last = _cpp_append_extend_buff (pfile, last, len);
2219 }
2220
2221 memcpy (BUFF_FRONT (last), base, len);
2222 BUFF_FRONT (last) += len;
2223 accum += len;
2224 }
2225
2226 void
2227 lit_accum::read_begin (cpp_reader *pfile)
2228 {
2229 /* We never accumulate more than 4 chars to read. */
2230 if (BUFF_ROOM (last) < 4)
2231
2232 last = _cpp_append_extend_buff (pfile, last, 4);
2233 rpos = BUFF_FRONT (last);
2234 }
2235
2236 /* Returns true if a macro has been defined.
2237 This might not work if compile with -save-temps,
2238 or preprocess separately from compilation. */
2239
2240 static bool
2241 is_macro(cpp_reader *pfile, const uchar *base)
2242 {
2243 const uchar *cur = base;
2244 if (! ISIDST (*cur))
2245 return false;
2246 unsigned int hash = HT_HASHSTEP (0, *cur);
2247 ++cur;
2248 while (ISIDNUM (*cur))
2249 {
2250 hash = HT_HASHSTEP (hash, *cur);
2251 ++cur;
2252 }
2253 hash = HT_HASHFINISH (hash, cur - base);
2254
2255 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2256 base, cur - base, hash, HT_NO_INSERT));
2257
2258 return result && cpp_macro_p (result);
2259 }
2260
2261 /* Returns true if a literal suffix does not have the expected form
2262 and is defined as a macro. */
2263
2264 static bool
2265 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2266 {
2267 /* User-defined literals outside of namespace std must start with a single
2268 underscore, so assume anything of that form really is a UDL suffix.
2269 We don't need to worry about UDLs defined inside namespace std because
2270 their names are reserved, so cannot be used as macro names in valid
2271 programs. */
2272 if (base[0] == '_' && base[1] != '_')
2273 return false;
2274 return is_macro (pfile, base);
2275 }
2276
2277 /* Lexes a raw string. The stored string contains the spelling,
2278 including double quotes, delimiter string, '(' and ')', any leading
2279 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2280 the type of the literal, or CPP_OTHER if it was not properly
2281 terminated.
2282
2283 BASE is the start of the token. Updates pfile->buffer->cur to just
2284 after the lexed string.
2285
2286 The spelling is NUL-terminated, but it is not guaranteed that this
2287 is the first NUL since embedded NULs are preserved. */
2288
2289 static void
2290 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2291 {
2292 const uchar *pos = base;
2293 const bool warn_bidi_p = pfile->warn_bidi_p ();
2294
2295 /* 'tis a pity this information isn't passed down from the lexer's
2296 initial categorization of the token. */
2297 enum cpp_ttype type = CPP_STRING;
2298
2299 if (*pos == 'L')
2300 {
2301 type = CPP_WSTRING;
2302 pos++;
2303 }
2304 else if (*pos == 'U')
2305 {
2306 type = CPP_STRING32;
2307 pos++;
2308 }
2309 else if (*pos == 'u')
2310 {
2311 if (pos[1] == '8')
2312 {
2313 type = CPP_UTF8STRING;
2314 pos++;
2315 }
2316 else
2317 type = CPP_STRING16;
2318 pos++;
2319 }
2320
2321 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2322 pos += 2;
2323
2324 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2325
2326 /* Skip notes before the ". */
2327 while (note->pos < pos)
2328 ++note;
2329
2330 lit_accum accum;
2331
2332 uchar prefix[17];
2333 unsigned prefix_len = 0;
2334 enum Phase
2335 {
2336 PHASE_PREFIX = -2,
2337 PHASE_NONE = -1,
2338 PHASE_SUFFIX = 0
2339 } phase = PHASE_PREFIX;
2340
2341 for (;;)
2342 {
2343 gcc_checking_assert (note->pos >= pos);
2344
2345 /* Undo any escaped newlines and trigraphs. */
2346 if (!accum.reading_p () && note->pos == pos)
2347 switch (note->type)
2348 {
2349 case '\\':
2350 case ' ':
2351 /* Restore backslash followed by newline. */
2352 accum.append (pfile, base, pos - base);
2353 base = pos;
2354 accum.read_begin (pfile);
2355 accum.append (pfile, UC"\\", 1);
2356
2357 after_backslash:
2358 if (note->type == ' ')
2359 /* GNU backslash whitespace newline extension. FIXME
2360 could be any sequence of non-vertical space. When we
2361 can properly restore any such sequence, we should
2362 mark this note as handled so _cpp_process_line_notes
2363 doesn't warn. */
2364 accum.append (pfile, UC" ", 1);
2365
2366 accum.append (pfile, UC"\n", 1);
2367 note++;
2368 break;
2369
2370 case '\n':
2371 /* This can happen for ??/<NEWLINE> when trigraphs are not
2372 being interpretted. */
2373 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2374 note->type = 0;
2375 note++;
2376 break;
2377
2378 default:
2379 gcc_checking_assert (_cpp_trigraph_map[note->type]);
2380
2381 /* Don't warn about this trigraph in
2382 _cpp_process_line_notes, since trigraphs show up as
2383 trigraphs in raw strings. */
2384 uchar type = note->type;
2385 note->type = 0;
2386
2387 if (CPP_OPTION (pfile, trigraphs))
2388 {
2389 accum.append (pfile, base, pos - base);
2390 base = pos;
2391 accum.read_begin (pfile);
2392 accum.append (pfile, UC"??", 2);
2393 accum.append (pfile, &type, 1);
2394
2395 /* ??/ followed by newline gets two line notes, one for
2396 the trigraph and one for the backslash/newline. */
2397 if (type == '/' && note[1].pos == pos)
2398 {
2399 note++;
2400 gcc_assert (note->type == '\\' || note->type == ' ');
2401 goto after_backslash;
2402 }
2403 /* Skip the replacement character. */
2404 base = ++pos;
2405 }
2406
2407 note++;
2408 break;
2409 }
2410
2411 /* Now get a char to process. Either from an expanded note, or
2412 from the line buffer. */
2413 bool read_note = accum.reading_p ();
2414 char c = read_note ? accum.read_char () : *pos++;
2415
2416 if (phase == PHASE_PREFIX)
2417 {
2418 if (c == '(')
2419 {
2420 /* Done. */
2421 phase = PHASE_NONE;
2422 prefix[prefix_len++] = '"';
2423 }
2424 else if (prefix_len < 16
2425 /* Prefix chars are any of the basic character set,
2426 [lex.charset] except for '
2427 ()\\\t\v\f\n'. Optimized for a contiguous
2428 alphabet. */
2429 /* Unlike a switch, this collapses down to one or
2430 two shift and bitmask operations on an ASCII
2431 system, with an outlier or two. */
2432 && (('Z' - 'A' == 25
2433 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2434 : ISIDST (c))
2435 || (c >= '0' && c <= '9')
2436 || c == '_' || c == '{' || c == '}'
2437 || c == '[' || c == ']' || c == '#'
2438 || c == '<' || c == '>' || c == '%'
2439 || c == ':' || c == ';' || c == '.' || c == '?'
2440 || c == '*' || c == '+' || c == '-' || c == '/'
2441 || c == '^' || c == '&' || c == '|' || c == '~'
2442 || c == '!' || c == '=' || c == ','
2443 || c == '"' || c == '\''))
2444 prefix[prefix_len++] = c;
2445 else
2446 {
2447 /* Something is wrong. */
2448 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2449 if (prefix_len == 16)
2450 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2451 col, "raw string delimiter longer "
2452 "than 16 characters");
2453 else if (c == '\n')
2454 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2455 col, "invalid new-line in raw "
2456 "string delimiter");
2457 else
2458 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2459 col, "invalid character '%c' in "
2460 "raw string delimiter", c);
2461 type = CPP_OTHER;
2462 phase = PHASE_NONE;
2463 /* Continue until we get a close quote, that's probably
2464 the best failure mode. */
2465 prefix_len = 0;
2466 }
2467 if (c != '\n')
2468 continue;
2469 }
2470
2471 if (phase != PHASE_NONE)
2472 {
2473 if (prefix[phase] != c)
2474 phase = PHASE_NONE;
2475 else if (unsigned (phase + 1) == prefix_len)
2476 break;
2477 else
2478 {
2479 phase = Phase (phase + 1);
2480 continue;
2481 }
2482 }
2483
2484 if (!prefix_len && c == '"')
2485 /* Failure mode lexing. */
2486 goto out;
2487 else if (prefix_len && c == ')')
2488 phase = PHASE_SUFFIX;
2489 else if (!read_note && c == '\n')
2490 {
2491 pos--;
2492 pfile->buffer->cur = pos;
2493 if (pfile->state.in_directive
2494 || (pfile->state.parsing_args
2495 && pfile->buffer->next_line >= pfile->buffer->rlimit))
2496 {
2497 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2498 "unterminated raw string");
2499 type = CPP_OTHER;
2500 goto out;
2501 }
2502
2503 accum.append (pfile, base, pos - base + 1);
2504 _cpp_process_line_notes (pfile, false);
2505
2506 if (pfile->buffer->next_line < pfile->buffer->rlimit)
2507 CPP_INCREMENT_LINE (pfile, 0);
2508 pfile->buffer->need_line = true;
2509
2510 if (!_cpp_get_fresh_line (pfile))
2511 {
2512 /* We ran out of file and failed to get a line. */
2513 location_t src_loc = token->src_loc;
2514 token->type = CPP_EOF;
2515 /* Tell the compiler the line number of the EOF token. */
2516 token->src_loc = pfile->line_table->highest_line;
2517 token->flags = BOL;
2518 if (accum.first)
2519 _cpp_release_buff (pfile, accum.first);
2520 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2521 "unterminated raw string");
2522 /* Now pop the buffer that _cpp_get_fresh_line did not. */
2523 _cpp_pop_buffer (pfile);
2524 return;
2525 }
2526
2527 pos = base = pfile->buffer->cur;
2528 note = &pfile->buffer->notes[pfile->buffer->cur_note];
2529 }
2530 else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
2531 && warn_bidi_p)
2532 {
2533 location_t loc;
2534 bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
2535 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2536 }
2537 }
2538
2539 if (warn_bidi_p)
2540 maybe_warn_bidi_on_close (pfile, pos);
2541
2542 if (CPP_OPTION (pfile, user_literals))
2543 {
2544 /* If a string format macro, say from inttypes.h, is placed touching
2545 a string literal it could be parsed as a C++11 user-defined string
2546 literal thus breaking the program. */
2547 if (is_macro_not_literal_suffix (pfile, pos))
2548 {
2549 /* Raise a warning, but do not consume subsequent tokens. */
2550 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2551 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2552 token->src_loc, 0,
2553 "invalid suffix on literal; C++11 requires "
2554 "a space between literal and string macro");
2555 }
2556 /* Grab user defined literal suffix. */
2557 else if (ISIDST (*pos))
2558 {
2559 type = cpp_userdef_string_add_type (type);
2560 ++pos;
2561
2562 while (ISIDNUM (*pos))
2563 ++pos;
2564 }
2565 }
2566
2567 out:
2568 pfile->buffer->cur = pos;
2569 if (!accum.accum)
2570 create_literal (pfile, token, base, pos - base, type);
2571 else
2572 {
2573 size_t extra_len = pos - base;
2574 uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2575
2576 token->type = type;
2577 token->val.str.len = accum.accum + extra_len;
2578 token->val.str.text = dest;
2579 for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2580 {
2581 size_t len = BUFF_FRONT (buf) - buf->base;
2582 memcpy (dest, buf->base, len);
2583 dest += len;
2584 }
2585 _cpp_release_buff (pfile, accum.first);
2586 memcpy (dest, base, extra_len);
2587 dest[extra_len] = '\0';
2588 }
2589 }
2590
2591 /* Lexes a string, character constant, or angle-bracketed header file
2592 name. The stored string contains the spelling, including opening
2593 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2594 'R' modifier. It returns the type of the literal, or CPP_OTHER
2595 if it was not properly terminated, or CPP_LESS for an unterminated
2596 header name which must be relexed as normal tokens.
2597
2598 The spelling is NUL-terminated, but it is not guaranteed that this
2599 is the first NUL since embedded NULs are preserved. */
2600 static void
2601 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2602 {
2603 bool saw_NUL = false;
2604 const uchar *cur;
2605 cppchar_t terminator;
2606 enum cpp_ttype type;
2607
2608 cur = base;
2609 terminator = *cur++;
2610 if (terminator == 'L' || terminator == 'U')
2611 terminator = *cur++;
2612 else if (terminator == 'u')
2613 {
2614 terminator = *cur++;
2615 if (terminator == '8')
2616 terminator = *cur++;
2617 }
2618 if (terminator == 'R')
2619 {
2620 lex_raw_string (pfile, token, base);
2621 return;
2622 }
2623 if (terminator == '"')
2624 type = (*base == 'L' ? CPP_WSTRING :
2625 *base == 'U' ? CPP_STRING32 :
2626 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2627 : CPP_STRING);
2628 else if (terminator == '\'')
2629 type = (*base == 'L' ? CPP_WCHAR :
2630 *base == 'U' ? CPP_CHAR32 :
2631 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2632 : CPP_CHAR);
2633 else
2634 terminator = '>', type = CPP_HEADER_NAME;
2635
2636 const bool warn_bidi_p = pfile->warn_bidi_p ();
2637 for (;;)
2638 {
2639 cppchar_t c = *cur++;
2640
2641 /* In #include-style directives, terminators are not escapable. */
2642 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2643 {
2644 if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
2645 {
2646 location_t loc;
2647 bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
2648 &loc);
2649 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2650 }
2651 cur++;
2652 }
2653 else if (c == terminator)
2654 {
2655 if (warn_bidi_p)
2656 maybe_warn_bidi_on_close (pfile, cur - 1);
2657 break;
2658 }
2659 else if (c == '\n')
2660 {
2661 cur--;
2662 /* Unmatched quotes always yield undefined behavior, but
2663 greedy lexing means that what appears to be an unterminated
2664 header name may actually be a legitimate sequence of tokens. */
2665 if (terminator == '>')
2666 {
2667 token->type = CPP_LESS;
2668 return;
2669 }
2670 type = CPP_OTHER;
2671 break;
2672 }
2673 else if (c == '\0')
2674 saw_NUL = true;
2675 else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
2676 {
2677 location_t loc;
2678 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
2679 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2680 }
2681 }
2682
2683 if (saw_NUL && !pfile->state.skipping)
2684 cpp_error (pfile, CPP_DL_WARNING,
2685 "null character(s) preserved in literal");
2686
2687 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2688 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2689 (int) terminator);
2690
2691 if (CPP_OPTION (pfile, user_literals))
2692 {
2693 /* If a string format macro, say from inttypes.h, is placed touching
2694 a string literal it could be parsed as a C++11 user-defined string
2695 literal thus breaking the program. */
2696 if (is_macro_not_literal_suffix (pfile, cur))
2697 {
2698 /* Raise a warning, but do not consume subsequent tokens. */
2699 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2700 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2701 token->src_loc, 0,
2702 "invalid suffix on literal; C++11 requires "
2703 "a space between literal and string macro");
2704 }
2705 /* Grab user defined literal suffix. */
2706 else if (ISIDST (*cur))
2707 {
2708 type = cpp_userdef_char_add_type (type);
2709 type = cpp_userdef_string_add_type (type);
2710 ++cur;
2711
2712 while (ISIDNUM (*cur))
2713 ++cur;
2714 }
2715 }
2716 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2717 && is_macro (pfile, cur)
2718 && !pfile->state.skipping)
2719 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2720 token->src_loc, 0, "C++11 requires a space "
2721 "between string literal and macro");
2722
2723 pfile->buffer->cur = cur;
2724 create_literal (pfile, token, base, cur - base, type);
2725 }
2726
2727 /* Return the comment table. The client may not make any assumption
2728 about the ordering of the table. */
2729 cpp_comment_table *
2730 cpp_get_comments (cpp_reader *pfile)
2731 {
2732 return &pfile->comments;
2733 }
2734
2735 /* Append a comment to the end of the comment table. */
2736 static void
2737 store_comment (cpp_reader *pfile, cpp_token *token)
2738 {
2739 int len;
2740
2741 if (pfile->comments.allocated == 0)
2742 {
2743 pfile->comments.allocated = 256;
2744 pfile->comments.entries = (cpp_comment *) xmalloc
2745 (pfile->comments.allocated * sizeof (cpp_comment));
2746 }
2747
2748 if (pfile->comments.count == pfile->comments.allocated)
2749 {
2750 pfile->comments.allocated *= 2;
2751 pfile->comments.entries = (cpp_comment *) xrealloc
2752 (pfile->comments.entries,
2753 pfile->comments.allocated * sizeof (cpp_comment));
2754 }
2755
2756 len = token->val.str.len;
2757
2758 /* Copy comment. Note, token may not be NULL terminated. */
2759 pfile->comments.entries[pfile->comments.count].comment =
2760 (char *) xmalloc (sizeof (char) * (len + 1));
2761 memcpy (pfile->comments.entries[pfile->comments.count].comment,
2762 token->val.str.text, len);
2763 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2764
2765 /* Set source location. */
2766 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2767
2768 /* Increment the count of entries in the comment table. */
2769 pfile->comments.count++;
2770 }
2771
2772 /* The stored comment includes the comment start and any terminator. */
2773 static void
2774 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2775 cppchar_t type)
2776 {
2777 unsigned char *buffer;
2778 unsigned int len, clen, i;
2779 int convert_to_c = (pfile->state.in_directive || pfile->state.parsing_args)
2780 && type == '/';
2781
2782 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
2783
2784 /* C++ comments probably (not definitely) have moved past a new
2785 line, which we don't want to save in the comment. */
2786 if (is_vspace (pfile->buffer->cur[-1]))
2787 len--;
2788
2789 /* If we are currently in a directive or in argument parsing, then
2790 we need to store all C++ comments as C comments internally, and
2791 so we need to allocate a little extra space in that case.
2792
2793 Note that the only time we encounter a directive here is
2794 when we are saving comments in a "#define". */
2795 clen = convert_to_c ? len + 2 : len;
2796
2797 buffer = _cpp_unaligned_alloc (pfile, clen);
2798
2799 token->type = CPP_COMMENT;
2800 token->val.str.len = clen;
2801 token->val.str.text = buffer;
2802
2803 buffer[0] = '/';
2804 memcpy (buffer + 1, from, len - 1);
2805
2806 /* Finish conversion to a C comment, if necessary. */
2807 if (convert_to_c)
2808 {
2809 buffer[1] = '*';
2810 buffer[clen - 2] = '*';
2811 buffer[clen - 1] = '/';
2812 /* As there can be in a C++ comments illegal sequences for C comments
2813 we need to filter them out. */
2814 for (i = 2; i < (clen - 2); i++)
2815 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2816 buffer[i] = '|';
2817 }
2818
2819 /* Finally store this comment for use by clients of libcpp. */
2820 store_comment (pfile, token);
2821 }
2822
2823 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2824 comment. */
2825
2826 static bool
2827 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2828 {
2829 const unsigned char *from = comment_start + 1;
2830
2831 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2832 {
2833 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2834 don't recognize any comments. The latter only checks attributes,
2835 the former doesn't warn. */
2836 case 0:
2837 default:
2838 return false;
2839 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2840 content it has. */
2841 case 1:
2842 return true;
2843 case 2:
2844 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2845 .*falls?[ \t-]*thr(u|ough).* regex. */
2846 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2847 from++)
2848 {
2849 /* Is there anything like strpbrk with upper boundary, or
2850 memchr looking for 2 characters rather than just one? */
2851 if (from[0] != 'f' && from[0] != 'F')
2852 continue;
2853 if (from[1] != 'a' && from[1] != 'A')
2854 continue;
2855 if (from[2] != 'l' && from[2] != 'L')
2856 continue;
2857 if (from[3] != 'l' && from[3] != 'L')
2858 continue;
2859 from += sizeof "fall" - 1;
2860 if (from[0] == 's' || from[0] == 'S')
2861 from++;
2862 while (*from == ' ' || *from == '\t' || *from == '-')
2863 from++;
2864 if (from[0] != 't' && from[0] != 'T')
2865 continue;
2866 if (from[1] != 'h' && from[1] != 'H')
2867 continue;
2868 if (from[2] != 'r' && from[2] != 'R')
2869 continue;
2870 if (from[3] == 'u' || from[3] == 'U')
2871 return true;
2872 if (from[3] != 'o' && from[3] != 'O')
2873 continue;
2874 if (from[4] != 'u' && from[4] != 'U')
2875 continue;
2876 if (from[5] != 'g' && from[5] != 'G')
2877 continue;
2878 if (from[6] != 'h' && from[6] != 'H')
2879 continue;
2880 return true;
2881 }
2882 return false;
2883 case 3:
2884 case 4:
2885 break;
2886 }
2887
2888 /* Whole comment contents:
2889 -fallthrough
2890 @fallthrough@
2891 */
2892 if (*from == '-' || *from == '@')
2893 {
2894 size_t len = sizeof "fallthrough" - 1;
2895 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2896 return false;
2897 if (memcmp (from + 1, "fallthrough", len))
2898 return false;
2899 if (*from == '@')
2900 {
2901 if (from[len + 1] != '@')
2902 return false;
2903 len++;
2904 }
2905 from += 1 + len;
2906 }
2907 /* Whole comment contents (regex):
2908 lint -fallthrough[ \t]*
2909 */
2910 else if (*from == 'l')
2911 {
2912 size_t len = sizeof "int -fallthrough" - 1;
2913 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2914 return false;
2915 if (memcmp (from + 1, "int -fallthrough", len))
2916 return false;
2917 from += 1 + len;
2918 while (*from == ' ' || *from == '\t')
2919 from++;
2920 }
2921 /* Whole comment contents (regex):
2922 [ \t]*FALLTHR(U|OUGH)[ \t]*
2923 */
2924 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2925 {
2926 while (*from == ' ' || *from == '\t')
2927 from++;
2928 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
2929 return false;
2930 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2931 return false;
2932 from += sizeof "FALLTHR" - 1;
2933 if (*from == 'U')
2934 from++;
2935 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
2936 return false;
2937 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2938 return false;
2939 else
2940 from += sizeof "OUGH" - 1;
2941 while (*from == ' ' || *from == '\t')
2942 from++;
2943 }
2944 /* Whole comment contents (regex):
2945 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2946 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2947 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2948 */
2949 else
2950 {
2951 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2952 from++;
2953 unsigned char f = *from;
2954 bool all_upper = false;
2955 if (f == 'E' || f == 'e')
2956 {
2957 if ((size_t) (pfile->buffer->cur - from)
2958 < sizeof "else fallthru" - 1)
2959 return false;
2960 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2961 all_upper = true;
2962 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2963 return false;
2964 from += sizeof "else" - 1;
2965 if (*from == ',')
2966 from++;
2967 if (*from != ' ')
2968 return false;
2969 from++;
2970 if (all_upper && *from == 'f')
2971 return false;
2972 if (f == 'e' && *from == 'F')
2973 return false;
2974 f = *from;
2975 }
2976 else if (f == 'I' || f == 'i')
2977 {
2978 if ((size_t) (pfile->buffer->cur - from)
2979 < sizeof "intentional fallthru" - 1)
2980 return false;
2981 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2982 sizeof "NTENTIONAL" - 1) == 0)
2983 all_upper = true;
2984 else if (memcmp (from + 1, "ntentional",
2985 sizeof "ntentional" - 1))
2986 return false;
2987 from += sizeof "intentional" - 1;
2988 if (*from == ' ')
2989 {
2990 from++;
2991 if (all_upper && *from == 'f')
2992 return false;
2993 }
2994 else if (all_upper)
2995 {
2996 if (memcmp (from, "LY F", sizeof "LY F" - 1))
2997 return false;
2998 from += sizeof "LY " - 1;
2999 }
3000 else
3001 {
3002 if (memcmp (from, "ly ", sizeof "ly " - 1))
3003 return false;
3004 from += sizeof "ly " - 1;
3005 }
3006 if (f == 'i' && *from == 'F')
3007 return false;
3008 f = *from;
3009 }
3010 if (f != 'F' && f != 'f')
3011 return false;
3012 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3013 return false;
3014 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3015 all_upper = true;
3016 else if (all_upper)
3017 return false;
3018 else if (memcmp (from + 1, "all", sizeof "all" - 1))
3019 return false;
3020 from += sizeof "fall" - 1;
3021 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3022 from += 2;
3023 else if (*from == ' ' || *from == '-')
3024 from++;
3025 else if (*from != (all_upper ? 'T' : 't'))
3026 return false;
3027 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3028 return false;
3029 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3030 return false;
3031 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3032 {
3033 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3034 return false;
3035 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3036 sizeof "hrough" - 1))
3037 return false;
3038 from += sizeof "through" - 1;
3039 }
3040 else
3041 from += sizeof "thru" - 1;
3042 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3043 from++;
3044 if (*from == '-')
3045 {
3046 from++;
3047 if (*comment_start == '*')
3048 {
3049 do
3050 {
3051 while (*from && *from != '*'
3052 && *from != '\n' && *from != '\r')
3053 from++;
3054 if (*from != '*' || from[1] == '/')
3055 break;
3056 from++;
3057 }
3058 while (1);
3059 }
3060 else
3061 while (*from && *from != '\n' && *from != '\r')
3062 from++;
3063 }
3064 }
3065 /* C block comment. */
3066 if (*comment_start == '*')
3067 {
3068 if (*from != '*' || from[1] != '/')
3069 return false;
3070 }
3071 /* C++ line comment. */
3072 else if (*from != '\n')
3073 return false;
3074
3075 return true;
3076 }
3077
3078 /* Allocate COUNT tokens for RUN. */
3079 void
3080 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3081 {
3082 run->base = XNEWVEC (cpp_token, count);
3083 run->limit = run->base + count;
3084 run->next = NULL;
3085 }
3086
3087 /* Returns the next tokenrun, or creates one if there is none. */
3088 static tokenrun *
3089 next_tokenrun (tokenrun *run)
3090 {
3091 if (run->next == NULL)
3092 {
3093 run->next = XNEW (tokenrun);
3094 run->next->prev = run;
3095 _cpp_init_tokenrun (run->next, 250);
3096 }
3097
3098 return run->next;
3099 }
3100
3101 /* Return the number of not yet processed token in a given
3102 context. */
3103 int
3104 _cpp_remaining_tokens_num_in_context (cpp_context *context)
3105 {
3106 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3107 return (LAST (context).token - FIRST (context).token);
3108 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3109 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3110 return (LAST (context).ptoken - FIRST (context).ptoken);
3111 else
3112 abort ();
3113 }
3114
3115 /* Returns the token present at index INDEX in a given context. If
3116 INDEX is zero, the next token to be processed is returned. */
3117 static const cpp_token*
3118 _cpp_token_from_context_at (cpp_context *context, int index)
3119 {
3120 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3121 return &(FIRST (context).token[index]);
3122 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3123 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3124 return FIRST (context).ptoken[index];
3125 else
3126 abort ();
3127 }
3128
3129 /* Look ahead in the input stream. */
3130 const cpp_token *
3131 cpp_peek_token (cpp_reader *pfile, int index)
3132 {
3133 cpp_context *context = pfile->context;
3134 const cpp_token *peektok;
3135 int count;
3136
3137 /* First, scan through any pending cpp_context objects. */
3138 while (context->prev)
3139 {
3140 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3141
3142 if (index < (int) sz)
3143 return _cpp_token_from_context_at (context, index);
3144 index -= (int) sz;
3145 context = context->prev;
3146 }
3147
3148 /* We will have to read some new tokens after all (and do so
3149 without invalidating preceding tokens). */
3150 count = index;
3151 pfile->keep_tokens++;
3152
3153 /* For peeked tokens temporarily disable line_change reporting,
3154 until the tokens are parsed for real. */
3155 void (*line_change) (cpp_reader *, const cpp_token *, int)
3156 = pfile->cb.line_change;
3157 pfile->cb.line_change = NULL;
3158
3159 do
3160 {
3161 peektok = _cpp_lex_token (pfile);
3162 if (peektok->type == CPP_EOF)
3163 {
3164 index--;
3165 break;
3166 }
3167 else if (peektok->type == CPP_PRAGMA)
3168 {
3169 /* Don't peek past a pragma. */
3170 if (peektok == &pfile->directive_result)
3171 /* Save the pragma in the buffer. */
3172 *pfile->cur_token++ = *peektok;
3173 index--;
3174 break;
3175 }
3176 }
3177 while (index--);
3178
3179 _cpp_backup_tokens_direct (pfile, count - index);
3180 pfile->keep_tokens--;
3181 pfile->cb.line_change = line_change;
3182
3183 return peektok;
3184 }
3185
3186 /* Allocate a single token that is invalidated at the same time as the
3187 rest of the tokens on the line. Has its line and col set to the
3188 same as the last lexed token, so that diagnostics appear in the
3189 right place. */
3190 cpp_token *
3191 _cpp_temp_token (cpp_reader *pfile)
3192 {
3193 cpp_token *old, *result;
3194 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3195 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3196
3197 old = pfile->cur_token - 1;
3198 /* Any pre-existing lookaheads must not be clobbered. */
3199 if (la)
3200 {
3201 if (sz <= la)
3202 {
3203 tokenrun *next = next_tokenrun (pfile->cur_run);
3204
3205 if (sz < la)
3206 memmove (next->base + 1, next->base,
3207 (la - sz) * sizeof (cpp_token));
3208
3209 next->base[0] = pfile->cur_run->limit[-1];
3210 }
3211
3212 if (sz > 1)
3213 memmove (pfile->cur_token + 1, pfile->cur_token,
3214 MIN (la, sz - 1) * sizeof (cpp_token));
3215 }
3216
3217 if (!sz && pfile->cur_token == pfile->cur_run->limit)
3218 {
3219 pfile->cur_run = next_tokenrun (pfile->cur_run);
3220 pfile->cur_token = pfile->cur_run->base;
3221 }
3222
3223 result = pfile->cur_token++;
3224 result->src_loc = old->src_loc;
3225 return result;
3226 }
3227
3228 /* We're at the beginning of a logical line (so not in
3229 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3230 if we should enter deferred_pragma mode to tokenize the rest of the
3231 line as a module control-line. */
3232
3233 static void
3234 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3235 {
3236 unsigned backup = 0; /* Tokens we peeked. */
3237 cpp_hashnode *node = result->val.node.node;
3238 cpp_token *peek = result;
3239 cpp_token *keyword = peek;
3240 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3241 int header_count = 0;
3242
3243 /* Make sure the incoming state is as we expect it. This way we
3244 can restore it using constants. */
3245 gcc_checking_assert (!pfile->state.in_deferred_pragma
3246 && !pfile->state.skipping
3247 && !pfile->state.parsing_args
3248 && !pfile->state.angled_headers
3249 && (pfile->state.save_comments
3250 == !CPP_OPTION (pfile, discard_comments)));
3251
3252 /* Enter directives mode sufficiently for peeking. We don't have
3253 to actually set in_directive. */
3254 pfile->state.in_deferred_pragma = true;
3255
3256 /* These two fields are needed to process tokenization in deferred
3257 pragma mode. They are not used outside deferred pragma mode or
3258 directives mode. */
3259 pfile->state.pragma_allow_expansion = true;
3260 pfile->directive_line = result->src_loc;
3261
3262 /* Saving comments is incompatible with directives mode. */
3263 pfile->state.save_comments = 0;
3264
3265 if (node == n_modules[spec_nodes::M_EXPORT][0])
3266 {
3267 peek = _cpp_lex_direct (pfile);
3268 keyword = peek;
3269 backup++;
3270 if (keyword->type != CPP_NAME)
3271 goto not_module;
3272 node = keyword->val.node.node;
3273 if (!(node->flags & NODE_MODULE))
3274 goto not_module;
3275 }
3276
3277 if (node == n_modules[spec_nodes::M__IMPORT][0])
3278 /* __import */
3279 header_count = backup + 2 + 16;
3280 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3281 /* import */
3282 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3283 else if (node == n_modules[spec_nodes::M_MODULE][0])
3284 ; /* module */
3285 else
3286 goto not_module;
3287
3288 /* We've seen [export] {module|import|__import}. Check the next token. */
3289 if (header_count)
3290 /* After '{,__}import' a header name may appear. */
3291 pfile->state.angled_headers = true;
3292 peek = _cpp_lex_direct (pfile);
3293 backup++;
3294
3295 /* ... import followed by identifier, ':', '<' or
3296 header-name preprocessing tokens, or module
3297 followed by cpp-identifier, ':' or ';' preprocessing
3298 tokens. C++ keywords are not yet relevant. */
3299 if (peek->type == CPP_NAME
3300 || peek->type == CPP_COLON
3301 || (header_count
3302 ? (peek->type == CPP_LESS
3303 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3304 || peek->type == CPP_HEADER_NAME)
3305 : peek->type == CPP_SEMICOLON))
3306 {
3307 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3308 if (!pfile->state.pragma_allow_expansion)
3309 pfile->state.prevent_expansion++;
3310
3311 if (!header_count && linemap_included_from
3312 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3313 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3314 "module control-line cannot be in included file");
3315
3316 /* The first one or two tokens cannot be macro names. */
3317 for (int ix = backup; ix--;)
3318 {
3319 cpp_token *tok = ix ? keyword : result;
3320 cpp_hashnode *node = tok->val.node.node;
3321
3322 /* Don't attempt to expand the token. */
3323 tok->flags |= NO_EXPAND;
3324 if (_cpp_defined_macro_p (node)
3325 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3326 && !cpp_fun_like_macro_p (node))
3327 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3328 "module control-line \"%s\" cannot be"
3329 " an object-like macro",
3330 NODE_NAME (node));
3331 }
3332
3333 /* Map to underbar variants. */
3334 keyword->val.node.node = n_modules[header_count
3335 ? spec_nodes::M_IMPORT
3336 : spec_nodes::M_MODULE][1];
3337 if (backup != 1)
3338 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3339
3340 /* Maybe tell the tokenizer we expect a header-name down the
3341 road. */
3342 pfile->state.directive_file_token = header_count;
3343 }
3344 else
3345 {
3346 not_module:
3347 /* Drop out of directive mode. */
3348 /* We aaserted save_comments had this value upon entry. */
3349 pfile->state.save_comments
3350 = !CPP_OPTION (pfile, discard_comments);
3351 pfile->state.in_deferred_pragma = false;
3352 /* Do not let this remain on. */
3353 pfile->state.angled_headers = false;
3354 }
3355
3356 /* In either case we want to backup the peeked tokens. */
3357 if (backup)
3358 {
3359 /* If we saw EOL, we should drop it, because this isn't a module
3360 control-line after all. */
3361 bool eol = peek->type == CPP_PRAGMA_EOL;
3362 if (!eol || backup > 1)
3363 {
3364 /* Put put the peeked tokens back */
3365 _cpp_backup_tokens_direct (pfile, backup);
3366 /* But if the last one was an EOL, forget it. */
3367 if (eol)
3368 pfile->lookaheads--;
3369 }
3370 }
3371 }
3372
3373 /* Lex a token into RESULT (external interface). Takes care of issues
3374 like directive handling, token lookahead, multiple include
3375 optimization and skipping. */
3376 const cpp_token *
3377 _cpp_lex_token (cpp_reader *pfile)
3378 {
3379 cpp_token *result;
3380
3381 for (;;)
3382 {
3383 if (pfile->cur_token == pfile->cur_run->limit)
3384 {
3385 pfile->cur_run = next_tokenrun (pfile->cur_run);
3386 pfile->cur_token = pfile->cur_run->base;
3387 }
3388 /* We assume that the current token is somewhere in the current
3389 run. */
3390 if (pfile->cur_token < pfile->cur_run->base
3391 || pfile->cur_token >= pfile->cur_run->limit)
3392 abort ();
3393
3394 if (pfile->lookaheads)
3395 {
3396 pfile->lookaheads--;
3397 result = pfile->cur_token++;
3398 }
3399 else
3400 result = _cpp_lex_direct (pfile);
3401
3402 if (result->flags & BOL)
3403 {
3404 /* Is this a directive. If _cpp_handle_directive returns
3405 false, it is an assembler #. */
3406 if (result->type == CPP_HASH
3407 /* 6.10.3 p 11: Directives in a list of macro arguments
3408 gives undefined behavior. This implementation
3409 handles the directive as normal. */
3410 && pfile->state.parsing_args != 1)
3411 {
3412 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3413 {
3414 if (pfile->directive_result.type == CPP_PADDING)
3415 continue;
3416 result = &pfile->directive_result;
3417 }
3418 }
3419 else if (pfile->state.in_deferred_pragma)
3420 result = &pfile->directive_result;
3421 else if (result->type == CPP_NAME
3422 && (result->val.node.node->flags & NODE_MODULE)
3423 && !pfile->state.skipping
3424 /* Unlike regular directives, we do not deal with
3425 tokenizing module directives as macro arguments.
3426 That's not permitted. */
3427 && !pfile->state.parsing_args)
3428 {
3429 /* P1857. Before macro expansion, At start of logical
3430 line ... */
3431 /* We don't have to consider lookaheads at this point. */
3432 gcc_checking_assert (!pfile->lookaheads);
3433
3434 cpp_maybe_module_directive (pfile, result);
3435 }
3436
3437 if (pfile->cb.line_change && !pfile->state.skipping)
3438 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3439 }
3440
3441 /* We don't skip tokens in directives. */
3442 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3443 break;
3444
3445 /* Outside a directive, invalidate controlling macros. At file
3446 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3447 get here and MI optimization works. */
3448 pfile->mi_valid = false;
3449
3450 if (!pfile->state.skipping || result->type == CPP_EOF)
3451 break;
3452 }
3453
3454 return result;
3455 }
3456
3457 /* Returns true if a fresh line has been loaded. */
3458 bool
3459 _cpp_get_fresh_line (cpp_reader *pfile)
3460 {
3461 /* We can't get a new line until we leave the current directive. */
3462 if (pfile->state.in_directive)
3463 return false;
3464
3465 for (;;)
3466 {
3467 cpp_buffer *buffer = pfile->buffer;
3468
3469 if (!buffer->need_line)
3470 return true;
3471
3472 if (buffer->next_line < buffer->rlimit)
3473 {
3474 _cpp_clean_line (pfile);
3475 return true;
3476 }
3477
3478 /* First, get out of parsing arguments state. */
3479 if (pfile->state.parsing_args)
3480 return false;
3481
3482 /* End of buffer. Non-empty files should end in a newline. */
3483 if (buffer->buf != buffer->rlimit
3484 && buffer->next_line > buffer->rlimit
3485 && !buffer->from_stage3)
3486 {
3487 /* Clip to buffer size. */
3488 buffer->next_line = buffer->rlimit;
3489 }
3490
3491 if (buffer->prev && !buffer->return_at_eof)
3492 _cpp_pop_buffer (pfile);
3493 else
3494 {
3495 /* End of translation. Do not pop the buffer yet. Increment
3496 line number so that the EOF token is on a line of its own
3497 (_cpp_lex_direct doesn't increment in that case, because
3498 it's hard for it to distinguish this special case). */
3499 CPP_INCREMENT_LINE (pfile, 0);
3500 return false;
3501 }
3502 }
3503 }
3504
/* Helper for two-character operators.  Expects `buffer' and `result'
   to be in scope where it is expanded.  RESULT's type becomes
   ELSE_TYPE, unless the next buffer character is CHAR, in which case
   that character is consumed and the type becomes THEN_TYPE.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do                                            \
    {                                           \
      result->type = ELSE_TYPE;                 \
      if (*buffer->cur == CHAR)                 \
        {                                       \
          buffer->cur++;                        \
          result->type = THEN_TYPE;             \
        }                                       \
    }                                           \
  while (0)
3513
3514 /* Lex a token into pfile->cur_token, which is also incremented, to
3515 get diagnostics pointing to the correct location.
3516
3517 Does not handle issues such as token lookahead, multiple-include
3518 optimization, directives, skipping etc. This function is only
3519 suitable for use by _cpp_lex_token, and in special cases like
3520 lex_expansion_token which doesn't care for any of these issues.
3521
3522 When meeting a newline, returns CPP_EOF if parsing a directive,
3523 otherwise returns to the start of the token buffer if permissible.
3524 Returns the location of the lexed token. */
3525 cpp_token *
3526 _cpp_lex_direct (cpp_reader *pfile)
3527 {
3528 cppchar_t c;
3529 cpp_buffer *buffer;
3530 const unsigned char *comment_start;
3531 bool fallthrough_comment = false;
3532 cpp_token *result = pfile->cur_token++;
3533
3534 fresh_line:
3535 result->flags = 0;
3536 buffer = pfile->buffer;
3537 if (buffer->need_line)
3538 {
3539 if (pfile->state.in_deferred_pragma)
3540 {
3541 /* This can happen in cases like:
3542 #define loop(x) whatever
3543 #pragma omp loop
3544 where when trying to expand loop we need to peek
3545 next token after loop, but aren't still in_deferred_pragma
3546 mode but are in in_directive mode, so buffer->need_line
3547 is set, a CPP_EOF is peeked. */
3548 result->type = CPP_PRAGMA_EOL;
3549 pfile->state.in_deferred_pragma = false;
3550 if (!pfile->state.pragma_allow_expansion)
3551 pfile->state.prevent_expansion--;
3552 return result;
3553 }
3554 if (!_cpp_get_fresh_line (pfile))
3555 {
3556 result->type = CPP_EOF;
3557 /* Not a real EOF in a directive or arg parsing -- we refuse
3558 to advance to the next file now, and will once we're out
3559 of those modes. */
3560 if (!pfile->state.in_directive && !pfile->state.parsing_args)
3561 {
3562 /* Tell the compiler the line number of the EOF token. */
3563 result->src_loc = pfile->line_table->highest_line;
3564 result->flags = BOL;
3565 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3566 _cpp_pop_buffer (pfile);
3567 }
3568 return result;
3569 }
3570 if (buffer != pfile->buffer)
3571 fallthrough_comment = false;
3572 if (!pfile->keep_tokens)
3573 {
3574 pfile->cur_run = &pfile->base_run;
3575 result = pfile->base_run.base;
3576 pfile->cur_token = result + 1;
3577 }
3578 result->flags = BOL;
3579 if (pfile->state.parsing_args == 2)
3580 result->flags |= PREV_WHITE;
3581 }
3582 buffer = pfile->buffer;
3583 update_tokens_line:
3584 result->src_loc = pfile->line_table->highest_line;
3585
3586 skipped_white:
3587 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3588 && !pfile->overlaid_buffer)
3589 {
3590 _cpp_process_line_notes (pfile, false);
3591 result->src_loc = pfile->line_table->highest_line;
3592 }
3593 c = *buffer->cur++;
3594
3595 if (pfile->forced_token_location)
3596 result->src_loc = pfile->forced_token_location;
3597 else
3598 result->src_loc = linemap_position_for_column (pfile->line_table,
3599 CPP_BUF_COLUMN (buffer, buffer->cur));
3600
3601 switch (c)
3602 {
3603 case ' ': case '\t': case '\f': case '\v': case '\0':
3604 result->flags |= PREV_WHITE;
3605 skip_whitespace (pfile, c);
3606 goto skipped_white;
3607
3608 case '\n':
3609 /* Increment the line, unless this is the last line ... */
3610 if (buffer->cur < buffer->rlimit
3611 /* ... or this is a #include, (where _cpp_stack_file needs to
3612 unwind by one line) ... */
3613 || (pfile->state.in_directive > 1
3614 /* ... except traditional-cpp increments this elsewhere. */
3615 && !CPP_OPTION (pfile, traditional)))
3616 CPP_INCREMENT_LINE (pfile, 0);
3617 buffer->need_line = true;
3618 if (pfile->state.in_deferred_pragma)
3619 {
3620 /* Produce the PRAGMA_EOL on this line. File reading
3621 ensures there is always a \n at end of the buffer, thus
3622 in a deferred pragma we always see CPP_PRAGMA_EOL before
3623 any CPP_EOF. */
3624 result->type = CPP_PRAGMA_EOL;
3625 result->flags &= ~PREV_WHITE;
3626 pfile->state.in_deferred_pragma = false;
3627 if (!pfile->state.pragma_allow_expansion)
3628 pfile->state.prevent_expansion--;
3629 return result;
3630 }
3631 goto fresh_line;
3632
3633 case '0': case '1': case '2': case '3': case '4':
3634 case '5': case '6': case '7': case '8': case '9':
3635 {
3636 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3637 result->type = CPP_NUMBER;
3638 lex_number (pfile, &result->val.str, &nst);
3639 warn_about_normalization (pfile, result, &nst);
3640 break;
3641 }
3642
3643 case 'L':
3644 case 'u':
3645 case 'U':
3646 case 'R':
3647 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3648 wide strings or raw strings. */
3649 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3650 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3651 {
3652 if ((*buffer->cur == '\'' && c != 'R')
3653 || *buffer->cur == '"'
3654 || (*buffer->cur == 'R'
3655 && c != 'R'
3656 && buffer->cur[1] == '"'
3657 && CPP_OPTION (pfile, rliterals))
3658 || (*buffer->cur == '8'
3659 && c == 'u'
3660 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3661 && CPP_OPTION (pfile, utf8_char_literals)))
3662 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3663 && CPP_OPTION (pfile, rliterals)))))
3664 {
3665 lex_string (pfile, result, buffer->cur - 1);
3666 break;
3667 }
3668 }
3669 /* Fall through. */
3670
3671 case '_':
3672 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3673 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3674 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3675 case 's': case 't': case 'v': case 'w': case 'x':
3676 case 'y': case 'z':
3677 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3678 case 'G': case 'H': case 'I': case 'J': case 'K':
3679 case 'M': case 'N': case 'O': case 'P': case 'Q':
3680 case 'S': case 'T': case 'V': case 'W': case 'X':
3681 case 'Y': case 'Z':
3682 result->type = CPP_NAME;
3683 {
3684 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3685 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3686 &nst,
3687 &result->val.node.spelling);
3688 warn_about_normalization (pfile, result, &nst);
3689 }
3690
3691 /* Convert named operators to their proper types. */
3692 if (result->val.node.node->flags & NODE_OPERATOR)
3693 {
3694 result->flags |= NAMED_OP;
3695 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3696 }
3697
3698 /* Signal FALLTHROUGH comment followed by another token. */
3699 if (fallthrough_comment)
3700 result->flags |= PREV_FALLTHROUGH;
3701 break;
3702
3703 case '\'':
3704 case '"':
3705 lex_string (pfile, result, buffer->cur - 1);
3706 break;
3707
3708 case '/':
3709 /* A potential block or line comment. */
3710 comment_start = buffer->cur;
3711 c = *buffer->cur;
3712
3713 if (c == '*')
3714 {
3715 if (_cpp_skip_block_comment (pfile))
3716 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3717 }
3718 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3719 {
3720 /* Don't warn for system headers. */
3721 if (_cpp_in_system_header (pfile))
3722 ;
3723 /* Warn about comments if pedantically GNUC89, and not
3724 in system headers. */
3725 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3726 && CPP_PEDANTIC (pfile)
3727 && ! buffer->warned_cplusplus_comments)
3728 {
3729 if (cpp_error (pfile, CPP_DL_PEDWARN,
3730 "C++ style comments are not allowed in ISO C90"))
3731 cpp_error (pfile, CPP_DL_NOTE,
3732 "(this will be reported only once per input file)");
3733 buffer->warned_cplusplus_comments = 1;
3734 }
3735 /* Or if specifically desired via -Wc90-c99-compat. */
3736 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3737 && ! CPP_OPTION (pfile, cplusplus)
3738 && ! buffer->warned_cplusplus_comments)
3739 {
3740 if (cpp_error (pfile, CPP_DL_WARNING,
3741 "C++ style comments are incompatible with C90"))
3742 cpp_error (pfile, CPP_DL_NOTE,
3743 "(this will be reported only once per input file)");
3744 buffer->warned_cplusplus_comments = 1;
3745 }
3746 /* In C89/C94, C++ style comments are forbidden. */
3747 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3748 || CPP_OPTION (pfile, lang) == CLK_STDC94))
3749 {
3750 /* But don't be confused about valid code such as
3751 - // immediately followed by *,
3752 - // in a preprocessing directive,
3753 - // in an #if 0 block. */
3754 if (buffer->cur[1] == '*'
3755 || pfile->state.in_directive
3756 || pfile->state.skipping)
3757 {
3758 result->type = CPP_DIV;
3759 break;
3760 }
3761 else if (! buffer->warned_cplusplus_comments)
3762 {
3763 if (cpp_error (pfile, CPP_DL_ERROR,
3764 "C++ style comments are not allowed in "
3765 "ISO C90"))
3766 cpp_error (pfile, CPP_DL_NOTE,
3767 "(this will be reported only once per input "
3768 "file)");
3769 buffer->warned_cplusplus_comments = 1;
3770 }
3771 }
3772 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3773 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3774 }
3775 else if (c == '=')
3776 {
3777 buffer->cur++;
3778 result->type = CPP_DIV_EQ;
3779 break;
3780 }
3781 else
3782 {
3783 result->type = CPP_DIV;
3784 break;
3785 }
3786
3787 if (fallthrough_comment_p (pfile, comment_start))
3788 fallthrough_comment = true;
3789
3790 if (pfile->cb.comment)
3791 {
3792 size_t len = pfile->buffer->cur - comment_start;
3793 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3794 len + 1);
3795 }
3796
3797 if (!pfile->state.save_comments)
3798 {
3799 result->flags |= PREV_WHITE;
3800 goto update_tokens_line;
3801 }
3802
3803 if (fallthrough_comment)
3804 result->flags |= PREV_FALLTHROUGH;
3805
3806 /* Save the comment as a token in its own right. */
3807 save_comment (pfile, result, comment_start, c);
3808 break;
3809
3810 case '<':
3811 if (pfile->state.angled_headers)
3812 {
3813 lex_string (pfile, result, buffer->cur - 1);
3814 if (result->type != CPP_LESS)
3815 break;
3816 }
3817
3818 result->type = CPP_LESS;
3819 if (*buffer->cur == '=')
3820 {
3821 buffer->cur++, result->type = CPP_LESS_EQ;
3822 if (*buffer->cur == '>'
3823 && CPP_OPTION (pfile, cplusplus)
3824 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3825 buffer->cur++, result->type = CPP_SPACESHIP;
3826 }
3827 else if (*buffer->cur == '<')
3828 {
3829 buffer->cur++;
3830 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3831 }
3832 else if (CPP_OPTION (pfile, digraphs))
3833 {
3834 if (*buffer->cur == ':')
3835 {
3836 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3837 three characters are <:: and the subsequent character
3838 is neither : nor >, the < is treated as a preprocessor
3839 token by itself". */
3840 if (CPP_OPTION (pfile, cplusplus)
3841 && CPP_OPTION (pfile, lang) != CLK_CXX98
3842 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3843 && buffer->cur[1] == ':'
3844 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3845 break;
3846
3847 buffer->cur++;
3848 result->flags |= DIGRAPH;
3849 result->type = CPP_OPEN_SQUARE;
3850 }
3851 else if (*buffer->cur == '%')
3852 {
3853 buffer->cur++;
3854 result->flags |= DIGRAPH;
3855 result->type = CPP_OPEN_BRACE;
3856 }
3857 }
3858 break;
3859
3860 case '>':
3861 result->type = CPP_GREATER;
3862 if (*buffer->cur == '=')
3863 buffer->cur++, result->type = CPP_GREATER_EQ;
3864 else if (*buffer->cur == '>')
3865 {
3866 buffer->cur++;
3867 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3868 }
3869 break;
3870
3871 case '%':
3872 result->type = CPP_MOD;
3873 if (*buffer->cur == '=')
3874 buffer->cur++, result->type = CPP_MOD_EQ;
3875 else if (CPP_OPTION (pfile, digraphs))
3876 {
3877 if (*buffer->cur == ':')
3878 {
3879 buffer->cur++;
3880 result->flags |= DIGRAPH;
3881 result->type = CPP_HASH;
3882 if (*buffer->cur == '%' && buffer->cur[1] == ':')
3883 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3884 }
3885 else if (*buffer->cur == '>')
3886 {
3887 buffer->cur++;
3888 result->flags |= DIGRAPH;
3889 result->type = CPP_CLOSE_BRACE;
3890 }
3891 }
3892 break;
3893
3894 case '.':
3895 result->type = CPP_DOT;
3896 if (ISDIGIT (*buffer->cur))
3897 {
3898 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3899 result->type = CPP_NUMBER;
3900 lex_number (pfile, &result->val.str, &nst);
3901 warn_about_normalization (pfile, result, &nst);
3902 }
3903 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3904 buffer->cur += 2, result->type = CPP_ELLIPSIS;
3905 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3906 buffer->cur++, result->type = CPP_DOT_STAR;
3907 break;
3908
3909 case '+':
3910 result->type = CPP_PLUS;
3911 if (*buffer->cur == '+')
3912 buffer->cur++, result->type = CPP_PLUS_PLUS;
3913 else if (*buffer->cur == '=')
3914 buffer->cur++, result->type = CPP_PLUS_EQ;
3915 break;
3916
3917 case '-':
3918 result->type = CPP_MINUS;
3919 if (*buffer->cur == '>')
3920 {
3921 buffer->cur++;
3922 result->type = CPP_DEREF;
3923 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3924 buffer->cur++, result->type = CPP_DEREF_STAR;
3925 }
3926 else if (*buffer->cur == '-')
3927 buffer->cur++, result->type = CPP_MINUS_MINUS;
3928 else if (*buffer->cur == '=')
3929 buffer->cur++, result->type = CPP_MINUS_EQ;
3930 break;
3931
3932 case '&':
3933 result->type = CPP_AND;
3934 if (*buffer->cur == '&')
3935 buffer->cur++, result->type = CPP_AND_AND;
3936 else if (*buffer->cur == '=')
3937 buffer->cur++, result->type = CPP_AND_EQ;
3938 break;
3939
3940 case '|':
3941 result->type = CPP_OR;
3942 if (*buffer->cur == '|')
3943 buffer->cur++, result->type = CPP_OR_OR;
3944 else if (*buffer->cur == '=')
3945 buffer->cur++, result->type = CPP_OR_EQ;
3946 break;
3947
3948 case ':':
3949 result->type = CPP_COLON;
3950 if (*buffer->cur == ':')
3951 {
3952 if (CPP_OPTION (pfile, scope))
3953 buffer->cur++, result->type = CPP_SCOPE;
3954 else
3955 result->flags |= COLON_SCOPE;
3956 }
3957 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3958 {
3959 buffer->cur++;
3960 result->flags |= DIGRAPH;
3961 result->type = CPP_CLOSE_SQUARE;
3962 }
3963 break;
3964
3965 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3966 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3967 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3968 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3969 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3970
3971 case '?': result->type = CPP_QUERY; break;
3972 case '~': result->type = CPP_COMPL; break;
3973 case ',': result->type = CPP_COMMA; break;
3974 case '(': result->type = CPP_OPEN_PAREN; break;
3975 case ')': result->type = CPP_CLOSE_PAREN; break;
3976 case '[': result->type = CPP_OPEN_SQUARE; break;
3977 case ']': result->type = CPP_CLOSE_SQUARE; break;
3978 case '{': result->type = CPP_OPEN_BRACE; break;
3979 case '}': result->type = CPP_CLOSE_BRACE; break;
3980 case ';': result->type = CPP_SEMICOLON; break;
3981
3982 /* @ is a punctuator in Objective-C. */
3983 case '@': result->type = CPP_ATSIGN; break;
3984
3985 default:
3986 {
3987 const uchar *base = --buffer->cur;
3988
3989 /* Check for an extended identifier ($ or UCN or UTF-8). */
3990 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3991 if (forms_identifier_p (pfile, true, &nst))
3992 {
3993 result->type = CPP_NAME;
3994 result->val.node.node = lex_identifier (pfile, base, true, &nst,
3995 &result->val.node.spelling);
3996 warn_about_normalization (pfile, result, &nst);
3997 break;
3998 }
3999
4000 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4001 single token. */
4002 buffer->cur++;
4003 if (c >= utf8_signifier)
4004 {
4005 const uchar *pstr = base;
4006 cppchar_t s;
4007 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4008 buffer->cur = pstr;
4009 }
4010 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4011 break;
4012 }
4013
4014 }
4015
4016 /* Potentially convert the location of the token to a range. */
4017 if (result->src_loc >= RESERVED_LOCATION_COUNT
4018 && result->type != CPP_EOF)
4019 {
4020 /* Ensure that any line notes are processed, so that we have the
4021 correct physical line/column for the end-point of the token even
4022 when a logical line is split via one or more backslashes. */
4023 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4024 && !pfile->overlaid_buffer)
4025 _cpp_process_line_notes (pfile, false);
4026
4027 source_range tok_range;
4028 tok_range.m_start = result->src_loc;
4029 tok_range.m_finish
4030 = linemap_position_for_column (pfile->line_table,
4031 CPP_BUF_COLUMN (buffer, buffer->cur));
4032
4033 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4034 result->src_loc,
4035 tok_range, NULL);
4036 }
4037
4038 return result;
4039 }
4040
4041 /* An upper bound on the number of bytes needed to spell TOKEN.
4042 Does not include preceding whitespace. */
4043 unsigned int
4044 cpp_token_len (const cpp_token *token)
4045 {
4046 unsigned int len;
4047
4048 switch (TOKEN_SPELL (token))
4049 {
4050 default: len = 6; break;
4051 case SPELL_LITERAL: len = token->val.str.len; break;
4052 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4053 }
4054
4055 return len;
4056 }
4057
/* Decode the UTF-8 sequence starting at NAME and store the matching
   \U escape (backslash, 'U' and eight lower-case hex digits) in
   BUFFER.  Exactly 10 bytes are written to BUFFER; no terminating NUL
   is added.  Returns the number of bytes consumed from NAME.  Aborts
   if a trailing byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;

  /* The count of leading one bits in the first byte is the length of
     the whole sequence.  */
  for (unsigned lead = *name; lead & 0x80; lead <<= 1)
    seq_len++;

  /* Start with the payload bits of the lead byte, then append six
     bits from each trailing byte.  */
  unsigned long code_point = *name & (0x7F >> seq_len);
  for (int k = 1; k < seq_len; k++)
    {
      const unsigned char trail = *++name;

      code_point = (code_point << 6) | (trail & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (int digit = 0; digit < 8; digit++)
    buffer[2 + digit] = hex_digits[(code_point >> (4 * (7 - digit))) & 0xF];

  return seq_len;
}
4091
4092 /* Given a token TYPE corresponding to a digraph, return a pointer to
4093 the spelling of the digraph. */
4094 static const unsigned char *
4095 cpp_digraph2name (enum cpp_ttype type)
4096 {
4097 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4098 }
4099
4100 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4101 The buffer must already contain the enough space to hold the
4102 token's spelling. Returns a pointer to the character after the
4103 last character written. */
4104 unsigned char *
4105 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4106 {
4107 size_t i;
4108 const unsigned char *name = NODE_NAME (ident);
4109
4110 for (i = 0; i < NODE_LEN (ident); i++)
4111 if (name[i] & ~0x7F)
4112 {
4113 i += utf8_to_ucn (buffer, name + i) - 1;
4114 buffer += 10;
4115 }
4116 else
4117 *buffer++ = name[i];
4118
4119 return buffer;
4120 }
4121
4122 /* Write the spelling of a token TOKEN to BUFFER. The buffer must
4123 already contain the enough space to hold the token's spelling.
4124 Returns a pointer to the character after the last character written.
4125 FORSTRING is true if this is to be the spelling after translation
4126 phase 1 (with the original spelling of extended identifiers), false
4127 if extended identifiers should always be written using UCNs (there is
4128 no option for always writing them in the internal UTF-8 form).
4129 FIXME: Would be nice if we didn't need the PFILE argument. */
4130 unsigned char *
4131 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4132 unsigned char *buffer, bool forstring)
4133 {
4134 switch (TOKEN_SPELL (token))
4135 {
4136 case SPELL_OPERATOR:
4137 {
4138 const unsigned char *spelling;
4139 unsigned char c;
4140
4141 if (token->flags & DIGRAPH)
4142 spelling = cpp_digraph2name (token->type);
4143 else if (token->flags & NAMED_OP)
4144 goto spell_ident;
4145 else
4146 spelling = TOKEN_NAME (token);
4147
4148 while ((c = *spelling++) != '\0')
4149 *buffer++ = c;
4150 }
4151 break;
4152
4153 spell_ident:
4154 case SPELL_IDENT:
4155 if (forstring)
4156 {
4157 memcpy (buffer, NODE_NAME (token->val.node.spelling),
4158 NODE_LEN (token->val.node.spelling));
4159 buffer += NODE_LEN (token->val.node.spelling);
4160 }
4161 else
4162 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4163 break;
4164
4165 case SPELL_LITERAL:
4166 memcpy (buffer, token->val.str.text, token->val.str.len);
4167 buffer += token->val.str.len;
4168 break;
4169
4170 case SPELL_NONE:
4171 cpp_error (pfile, CPP_DL_ICE,
4172 "unspellable token %s", TOKEN_NAME (token));
4173 break;
4174 }
4175
4176 return buffer;
4177 }
4178
4179 /* Returns TOKEN spelt as a null-terminated string. The string is
4180 freed when the reader is destroyed. Useful for diagnostics. */
4181 unsigned char *
4182 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4183 {
4184 unsigned int len = cpp_token_len (token) + 1;
4185 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4186
4187 end = cpp_spell_token (pfile, token, start, false);
4188 end[0] = '\0';
4189
4190 return start;
4191 }
4192
4193 /* Returns a pointer to a string which spells the token defined by
4194 TYPE and FLAGS. Used by C front ends, which really should move to
4195 using cpp_token_as_text. */
4196 const char *
4197 cpp_type2name (enum cpp_ttype type, unsigned char flags)
4198 {
4199 if (flags & DIGRAPH)
4200 return (const char *) cpp_digraph2name (type);
4201 else if (flags & NAMED_OP)
4202 return cpp_named_operator2name (type);
4203
4204 return (const char *) token_spellings[type].name;
4205 }
4206
4207 /* Writes the spelling of token to FP, without any preceding space.
4208 Separated from cpp_spell_token for efficiency - to avoid stdio
4209 double-buffering. */
4210 void
4211 cpp_output_token (const cpp_token *token, FILE *fp)
4212 {
4213 switch (TOKEN_SPELL (token))
4214 {
4215 case SPELL_OPERATOR:
4216 {
4217 const unsigned char *spelling;
4218 int c;
4219
4220 if (token->flags & DIGRAPH)
4221 spelling = cpp_digraph2name (token->type);
4222 else if (token->flags & NAMED_OP)
4223 goto spell_ident;
4224 else
4225 spelling = TOKEN_NAME (token);
4226
4227 c = *spelling;
4228 do
4229 putc (c, fp);
4230 while ((c = *++spelling) != '\0');
4231 }
4232 break;
4233
4234 spell_ident:
4235 case SPELL_IDENT:
4236 {
4237 size_t i;
4238 const unsigned char * name = NODE_NAME (token->val.node.node);
4239
4240 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4241 if (name[i] & ~0x7F)
4242 {
4243 unsigned char buffer[10];
4244 i += utf8_to_ucn (buffer, name + i) - 1;
4245 fwrite (buffer, 1, 10, fp);
4246 }
4247 else
4248 fputc (NODE_NAME (token->val.node.node)[i], fp);
4249 }
4250 break;
4251
4252 case SPELL_LITERAL:
4253 if (token->type == CPP_HEADER_NAME)
4254 fputc ('"', fp);
4255 fwrite (token->val.str.text, 1, token->val.str.len, fp);
4256 if (token->type == CPP_HEADER_NAME)
4257 fputc ('"', fp);
4258 break;
4259
4260 case SPELL_NONE:
4261 /* An error, most probably. */
4262 break;
4263 }
4264 }
4265
4266 /* Compare two tokens. */
4267 int
4268 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4269 {
4270 if (a->type == b->type && a->flags == b->flags)
4271 switch (TOKEN_SPELL (a))
4272 {
4273 default: /* Keep compiler happy. */
4274 case SPELL_OPERATOR:
4275 /* token_no is used to track where multiple consecutive ##
4276 tokens were originally located. */
4277 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4278 case SPELL_NONE:
4279 return (a->type != CPP_MACRO_ARG
4280 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4281 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4282 case SPELL_IDENT:
4283 return (a->val.node.node == b->val.node.node
4284 && a->val.node.spelling == b->val.node.spelling);
4285 case SPELL_LITERAL:
4286 return (a->val.str.len == b->val.str.len
4287 && !memcmp (a->val.str.text, b->val.str.text,
4288 a->val.str.len));
4289 }
4290
4291 return 0;
4292 }
4293
4294 /* Returns nonzero if a space should be inserted to avoid an
4295 accidental token paste for output. For simplicity, it is
4296 conservative, and occasionally advises a space where one is not
4297 needed, e.g. "." and ".2". */
4298 int
4299 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4300 const cpp_token *token2)
4301 {
4302 enum cpp_ttype a = token1->type, b = token2->type;
4303 cppchar_t c;
4304
4305 if (token1->flags & NAMED_OP)
4306 a = CPP_NAME;
4307 if (token2->flags & NAMED_OP)
4308 b = CPP_NAME;
4309
4310 c = EOF;
4311 if (token2->flags & DIGRAPH)
4312 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4313 else if (token_spellings[b].category == SPELL_OPERATOR)
4314 c = token_spellings[b].name[0];
4315
4316 /* Quickly get everything that can paste with an '='. */
4317 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4318 return 1;
4319
4320 switch (a)
4321 {
4322 case CPP_GREATER: return c == '>';
4323 case CPP_LESS: return c == '<' || c == '%' || c == ':';
4324 case CPP_PLUS: return c == '+';
4325 case CPP_MINUS: return c == '-' || c == '>';
4326 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4327 case CPP_MOD: return c == ':' || c == '>';
4328 case CPP_AND: return c == '&';
4329 case CPP_OR: return c == '|';
4330 case CPP_COLON: return c == ':' || c == '>';
4331 case CPP_DEREF: return c == '*';
4332 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4333 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4334 case CPP_PRAGMA:
4335 case CPP_NAME: return ((b == CPP_NUMBER
4336 && name_p (pfile, &token2->val.str))
4337 || b == CPP_NAME
4338 || b == CPP_CHAR || b == CPP_STRING); /* L */
4339 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4340 || b == CPP_CHAR
4341 || c == '.' || c == '+' || c == '-');
4342 /* UCNs */
4343 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4344 && b == CPP_NAME)
4345 || (CPP_OPTION (pfile, objc)
4346 && token1->val.str.text[0] == '@'
4347 && (b == CPP_NAME || b == CPP_STRING)));
4348 case CPP_LESS_EQ: return c == '>';
4349 case CPP_STRING:
4350 case CPP_WSTRING:
4351 case CPP_UTF8STRING:
4352 case CPP_STRING16:
4353 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4354 && (b == CPP_NAME
4355 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4356 && ISIDST (token2->val.str.text[0]))));
4357
4358 default: break;
4359 }
4360
4361 return 0;
4362 }
4363
4364 /* Output all the remaining tokens on the current line, and a newline
4365 character, to FP. Leading whitespace is removed. If there are
4366 macros, special token padding is not performed. */
4367 void
4368 cpp_output_line (cpp_reader *pfile, FILE *fp)
4369 {
4370 const cpp_token *token;
4371
4372 token = cpp_get_token (pfile);
4373 while (token->type != CPP_EOF)
4374 {
4375 cpp_output_token (token, fp);
4376 token = cpp_get_token (pfile);
4377 if (token->flags & PREV_WHITE)
4378 putc (' ', fp);
4379 }
4380
4381 putc ('\n', fp);
4382 }
4383
4384 /* Return a string representation of all the remaining tokens on the
4385 current line. The result is allocated using xmalloc and must be
4386 freed by the caller. */
4387 unsigned char *
4388 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4389 {
4390 const cpp_token *token;
4391 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4392 unsigned int alloced = 120 + out;
4393 unsigned char *result = (unsigned char *) xmalloc (alloced);
4394
4395 /* If DIR_NAME is empty, there are no initial contents. */
4396 if (dir_name)
4397 {
4398 sprintf ((char *) result, "#%s ", dir_name);
4399 out += 2;
4400 }
4401
4402 token = cpp_get_token (pfile);
4403 while (token->type != CPP_EOF)
4404 {
4405 unsigned char *last;
4406 /* Include room for a possible space and the terminating nul. */
4407 unsigned int len = cpp_token_len (token) + 2;
4408
4409 if (out + len > alloced)
4410 {
4411 alloced *= 2;
4412 if (out + len > alloced)
4413 alloced = out + len;
4414 result = (unsigned char *) xrealloc (result, alloced);
4415 }
4416
4417 last = cpp_spell_token (pfile, token, &result[out], 0);
4418 out = last - result;
4419
4420 token = cpp_get_token (pfile);
4421 if (token->flags & PREV_WHITE)
4422 result[out++] = ' ';
4423 }
4424
4425 result[out] = '\0';
4426 return result;
4427 }
4428
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   BUFF is extended: MIN_EXTRA plus twice the bytes between BUFF->cur
   and BUFF->limit.  Every macro argument is parenthesized so that an
   argument expression containing low-precedence operators expands
   as intended.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
  ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4443
4444 /* Create a new allocation buffer. Place the control block at the end
4445 of the buffer, so that buffer overflows will cause immediate chaos. */
4446 static _cpp_buff *
4447 new_buff (size_t len)
4448 {
4449 _cpp_buff *result;
4450 unsigned char *base;
4451
4452 if (len < MIN_BUFF_SIZE)
4453 len = MIN_BUFF_SIZE;
4454 len = CPP_ALIGN (len);
4455
4456 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4457 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4458 struct first. */
4459 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4460 base = XNEWVEC (unsigned char, len + slen);
4461 result = (_cpp_buff *) base;
4462 base += slen;
4463 #else
4464 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4465 result = (_cpp_buff *) (base + len);
4466 #endif
4467 result->base = base;
4468 result->cur = base;
4469 result->limit = base + len;
4470 result->next = NULL;
4471 return result;
4472 }
4473
4474 /* Place a chain of unwanted allocation buffers on the free list. */
4475 void
4476 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4477 {
4478 _cpp_buff *end = buff;
4479
4480 while (end->next)
4481 end = end->next;
4482 end->next = pfile->free_buffs;
4483 pfile->free_buffs = buff;
4484 }
4485
4486 /* Return a free buffer of size at least MIN_SIZE. */
4487 _cpp_buff *
4488 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4489 {
4490 _cpp_buff *result, **p;
4491
4492 for (p = &pfile->free_buffs;; p = &(*p)->next)
4493 {
4494 size_t size;
4495
4496 if (*p == NULL)
4497 return new_buff (min_size);
4498 result = *p;
4499 size = result->limit - result->base;
4500 /* Return a buffer that's big enough, but don't waste one that's
4501 way too big. */
4502 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4503 break;
4504 }
4505
4506 *p = result->next;
4507 result->next = NULL;
4508 result->cur = result->base;
4509 return result;
4510 }
4511
4512 /* Creates a new buffer with enough space to hold the uncommitted
4513 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4514 the excess bytes to the new buffer. Chains the new buffer after
4515 BUFF, and returns the new buffer. */
4516 _cpp_buff *
4517 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4518 {
4519 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4520 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4521
4522 buff->next = new_buff;
4523 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4524 return new_buff;
4525 }
4526
4527 /* Creates a new buffer with enough space to hold the uncommitted
4528 remaining bytes of the buffer pointed to by BUFF, and at least
4529 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4530 Chains the new buffer before the buffer pointed to by BUFF, and
4531 updates the pointer to point to the new buffer. */
4532 void
4533 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4534 {
4535 _cpp_buff *new_buff, *old_buff = *pbuff;
4536 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4537
4538 new_buff = _cpp_get_buff (pfile, size);
4539 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4540 new_buff->next = old_buff;
4541 *pbuff = new_buff;
4542 }
4543
4544 /* Free a chain of buffers starting at BUFF. */
4545 void
4546 _cpp_free_buff (_cpp_buff *buff)
4547 {
4548 _cpp_buff *next;
4549
4550 for (; buff; buff = next)
4551 {
4552 next = buff->next;
4553 #ifdef ENABLE_VALGRIND_ANNOTATIONS
4554 free (buff);
4555 #else
4556 free (buff->base);
4557 #endif
4558 }
4559 }
4560
4561 /* Allocate permanent, unaligned storage of length LEN. */
4562 unsigned char *
4563 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4564 {
4565 _cpp_buff *buff = pfile->u_buff;
4566 unsigned char *result = buff->cur;
4567
4568 if (len > (size_t) (buff->limit - result))
4569 {
4570 buff = _cpp_get_buff (pfile, len);
4571 buff->next = pfile->u_buff;
4572 pfile->u_buff = buff;
4573 result = buff->cur;
4574 }
4575
4576 buff->cur = result + len;
4577 return result;
4578 }
4579
4580 /* Allocate permanent, unaligned storage of length LEN from a_buff.
4581 That buffer is used for growing allocations when saving macro
4582 replacement lists in a #define, and when parsing an answer to an
4583 assertion in #assert, #unassert or #if (and therefore possibly
4584 whilst expanding macros). It therefore must not be used by any
4585 code that they might call: specifically the lexer and the guts of
4586 the macro expander.
4587
4588 All existing other uses clearly fit this restriction: storing
4589 registered pragmas during initialization. */
4590 unsigned char *
4591 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4592 {
4593 _cpp_buff *buff = pfile->a_buff;
4594 unsigned char *result = buff->cur;
4595
4596 if (len > (size_t) (buff->limit - result))
4597 {
4598 buff = _cpp_get_buff (pfile, len);
4599 buff->next = pfile->a_buff;
4600 pfile->a_buff = buff;
4601 result = buff->cur;
4602 }
4603
4604 buff->cur = result + len;
4605 return result;
4606 }
4607
4608 /* Commit or allocate storage from a buffer. */
4609
4610 void *
4611 _cpp_commit_buff (cpp_reader *pfile, size_t size)
4612 {
4613 void *ptr = BUFF_FRONT (pfile->a_buff);
4614
4615 if (pfile->hash_table->alloc_subobject)
4616 {
4617 void *copy = pfile->hash_table->alloc_subobject (size);
4618 memcpy (copy, ptr, size);
4619 ptr = copy;
4620 }
4621 else
4622 BUFF_FRONT (pfile->a_buff) += size;
4623
4624 return ptr;
4625 }
4626
4627 /* Say which field of TOK is in use. */
4628
4629 enum cpp_token_fld_kind
4630 cpp_token_val_index (const cpp_token *tok)
4631 {
4632 switch (TOKEN_SPELL (tok))
4633 {
4634 case SPELL_IDENT:
4635 return CPP_TOKEN_FLD_NODE;
4636 case SPELL_LITERAL:
4637 return CPP_TOKEN_FLD_STR;
4638 case SPELL_OPERATOR:
4639 /* Operands which were originally spelled as ident keep around
4640 the node for the exact spelling. */
4641 if (tok->flags & NAMED_OP)
4642 return CPP_TOKEN_FLD_NODE;
4643 else if (tok->type == CPP_PASTE)
4644 return CPP_TOKEN_FLD_TOKEN_NO;
4645 else
4646 return CPP_TOKEN_FLD_NONE;
4647 case SPELL_NONE:
4648 if (tok->type == CPP_MACRO_ARG)
4649 return CPP_TOKEN_FLD_ARG_NO;
4650 else if (tok->type == CPP_PADDING)
4651 return CPP_TOKEN_FLD_SOURCE;
4652 else if (tok->type == CPP_PRAGMA)
4653 return CPP_TOKEN_FLD_PRAGMA;
4654 /* fall through */
4655 default:
4656 return CPP_TOKEN_FLD_NONE;
4657 }
4658 }
4659
4660 /* All tokens lexed in R after calling this function will be forced to
4661 have their location_t to be P, until
4662 cpp_stop_forcing_token_locations is called for R. */
4663
4664 void
4665 cpp_force_token_locations (cpp_reader *r, location_t loc)
4666 {
4667 r->forced_token_location = loc;
4668 }
4669
4670 /* Go back to assigning locations naturally for lexed tokens. */
4671
4672 void
4673 cpp_stop_forcing_token_locations (cpp_reader *r)
4674 {
4675 r->forced_token_location = 0;
4676 }
4677
/* PEEK points at a backslash.  If that backslash escapes an end of
   line (LF, CR or CR LF), return the position just after the line
   ending, repeating for as long as another escaped line ending
   follows immediately.  A step that would land at or beyond LIMIT
   is not taken.  If the backslash escapes nothing, PEEK is returned
   unchanged.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* CR alone, or CR LF taken as one line ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        return peek;

      if (__builtin_expect (!(after < limit), false))
        return peek;

      peek = after;

      /* The user might be perverse and escape several line endings
         in a row.  */
      if (*peek != '\\')
        return peek;
    }
}
4707
4708 static const unsigned char *
4709 do_peek_next (const unsigned char *peek, const unsigned char *limit)
4710 {
4711 if (__builtin_expect (*peek == '\\', false))
4712 peek = do_peek_backslash (peek, limit);
4713 return peek;
4714 }
4715
/* Step backwards from PEEK and return the position of the previous
   character, or NULL if PEEK is already at BOUND.  If the previous
   character is a line ending (LF, CR, or the LF of a CR LF pair)
   that is preceded by a backslash, the escaped line ending is
   skipped and the search continues from the backslash.  Nothing
   before BOUND is ever read.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* The carriage-return test must use the escape '\r'; comparing
     with the letter 'r' would treat a backslash followed by "r" as
     an escaped line ending and miss escaped CR line endings.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      /* A line ending at BOUND cannot be preceded by a backslash.  */
      if (peek == bound)
        return peek;

      /* IX indexes the character in front of the line ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* CR LF: look in front of the CR, if there is anything.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        /* Escaped line ending: continue from the backslash.  */
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4744
4745 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4746 space. Otherwise return NULL. */
4747
4748 static const unsigned char *
4749 do_peek_ident (const char *match, const unsigned char *peek,
4750 const unsigned char *limit)
4751 {
4752 for (; *++match; peek++)
4753 if (*peek != *match)
4754 {
4755 peek = do_peek_next (peek, limit);
4756 if (*peek != *match)
4757 return NULL;
4758 }
4759
4760 /* Must now not be looking at an identifier char. */
4761 peek = do_peek_next (peek, limit);
4762 if (ISIDNUM (*peek))
4763 return NULL;
4764
4765 /* Skip control-line whitespace. */
4766 ws:
4767 while (*peek == ' ' || *peek == '\t')
4768 peek++;
4769 if (__builtin_expect (*peek == '\\', false))
4770 {
4771 peek = do_peek_backslash (peek, limit);
4772 if (*peek != '\\')
4773 goto ws;
4774 }
4775
4776 return peek;
4777 }
4778
4779 /* Are we looking at a module control line starting as PEEK - 1? */
4780
4781 static bool
4782 do_peek_module (cpp_reader *pfile, unsigned char c,
4783 const unsigned char *peek, const unsigned char *limit)
4784 {
4785 bool import = false;
4786
4787 if (__builtin_expect (c == 'e', false))
4788 {
4789 if (!((peek[0] == 'x' || peek[0] == '\\')
4790 && (peek = do_peek_ident ("export", peek, limit))))
4791 return false;
4792
4793 /* export, peek for import or module. No need to peek __import
4794 here. */
4795 if (peek[0] == 'i')
4796 {
4797 if (!((peek[1] == 'm' || peek[1] == '\\')
4798 && (peek = do_peek_ident ("import", peek + 1, limit))))
4799 return false;
4800 import = true;
4801 }
4802 else if (peek[0] == 'm')
4803 {
4804 if (!((peek[1] == 'o' || peek[1] == '\\')
4805 && (peek = do_peek_ident ("module", peek + 1, limit))))
4806 return false;
4807 }
4808 else
4809 return false;
4810 }
4811 else if (__builtin_expect (c == 'i', false))
4812 {
4813 if (!((peek[0] == 'm' || peek[0] == '\\')
4814 && (peek = do_peek_ident ("import", peek, limit))))
4815 return false;
4816 import = true;
4817 }
4818 else if (__builtin_expect (c == '_', false))
4819 {
4820 /* Needed for translated includes. */
4821 if (!((peek[0] == '_' || peek[0] == '\\')
4822 && (peek = do_peek_ident ("__import", peek, limit))))
4823 return false;
4824 import = true;
4825 }
4826 else if (__builtin_expect (c == 'm', false))
4827 {
4828 if (!((peek[0] == 'o' || peek[0] == '\\')
4829 && (peek = do_peek_ident ("module", peek, limit))))
4830 return false;
4831 }
4832 else
4833 return false;
4834
4835 /* Peek the next character to see if it's good enough. We'll be at
4836 the first non-whitespace char, including skipping an escaped
4837 newline. */
4838 /* ... import followed by identifier, ':', '<' or header-name
4839 preprocessing tokens, or module followed by identifier, ':' or
4840 ';' preprocessing tokens. */
4841 unsigned char p = *peek++;
4842
4843 /* A character literal is ... single quotes, ... optionally preceded
4844 by u8, u, U, or L */
4845 /* A string-literal is a ... double quotes, optionally prefixed by
4846 R, u8, u8R, u, uR, U, UR, L, or LR */
4847 if (p == 'u')
4848 {
4849 peek = do_peek_next (peek, limit);
4850 if (*peek == '8')
4851 {
4852 peek++;
4853 goto peek_u8;
4854 }
4855 goto peek_u;
4856 }
4857 else if (p == 'U' || p == 'L')
4858 {
4859 peek_u8:
4860 peek = do_peek_next (peek, limit);
4861 peek_u:
4862 if (*peek == '\"' || *peek == '\'')
4863 return false;
4864
4865 if (*peek == 'R')
4866 goto peek_R;
4867 /* Identifier. Ok. */
4868 }
4869 else if (p == 'R')
4870 {
4871 peek_R:
4872 if (CPP_OPTION (pfile, rliterals))
4873 {
4874 peek = do_peek_next (peek, limit);
4875 if (*peek == '\"')
4876 return false;
4877 }
4878 /* Identifier. Ok. */
4879 }
4880 else if ('Z' - 'A' == 25
4881 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4882 : ISIDST (p))
4883 {
4884 /* Identifier. Ok. */
4885 }
4886 else if (p == '<')
4887 {
4888 /* Maybe angle header, ok for import. Reject
4889 '<=', '<<' digraph:'<:'. */
4890 if (!import)
4891 return false;
4892 peek = do_peek_next (peek, limit);
4893 if (*peek == '=' || *peek == '<'
4894 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4895 return false;
4896 }
4897 else if (p == ';')
4898 {
4899 /* SEMICOLON, ok for module. */
4900 if (import)
4901 return false;
4902 }
4903 else if (p == '"')
4904 {
4905 /* STRING, ok for import. */
4906 if (!import)
4907 return false;
4908 }
4909 else if (p == ':')
4910 {
4911 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
4912 peek = do_peek_next (peek, limit);
4913 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4914 return false;
4915 }
4916 else
4917 /* FIXME: Detect a unicode character, excluding those not
4918 permitted as the initial character. [lex.name]/1. I presume
4919 we need to check the \[uU] spellings, and directly using
4920 Unicode in say UTF8 form? Or perhaps we do the phase-1
4921 conversion of UTF8 to universal-character-names? */
4922 return false;
4923
4924 return true;
4925 }
4926
4927 /* Directives-only scanning. Somewhat more relaxed than correct
4928 parsing -- some ill-formed programs will not be rejected. */
4929
4930 void
4931 cpp_directive_only_process (cpp_reader *pfile,
4932 void *data,
4933 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4934 {
4935 bool module_p = CPP_OPTION (pfile, module_directives);
4936
4937 do
4938 {
4939 restart:
4940 /* Buffer initialization, but no line cleaning. */
4941 cpp_buffer *buffer = pfile->buffer;
4942 buffer->cur_note = buffer->notes_used = 0;
4943 buffer->cur = buffer->line_base = buffer->next_line;
4944 buffer->need_line = false;
4945 /* Files always end in a newline or carriage return. We rely on this for
4946 character peeking safety. */
4947 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4948
4949 const unsigned char *base = buffer->cur;
4950 unsigned line_count = 0;
4951 const unsigned char *line_start = base;
4952
4953 bool bol = true;
4954 bool raw = false;
4955
4956 const unsigned char *lwm = base;
4957 for (const unsigned char *pos = base, *limit = buffer->rlimit;
4958 pos < limit;)
4959 {
4960 unsigned char c = *pos++;
4961 /* This matches the switch in _cpp_lex_direct. */
4962 switch (c)
4963 {
4964 case ' ': case '\t': case '\f': case '\v':
4965 /* Whitespace, do nothing. */
4966 break;
4967
4968 case '\r': /* MAC line ending, or Windows \r\n */
4969 if (*pos == '\n')
4970 pos++;
4971 /* FALLTHROUGH */
4972
4973 case '\n':
4974 bol = true;
4975
4976 next_line:
4977 CPP_INCREMENT_LINE (pfile, 0);
4978 line_count++;
4979 line_start = pos;
4980 break;
4981
4982 case '\\':
4983 /* <backslash><newline> is removed, and doesn't undo any
4984 preceeding escape or whatnot. */
4985 if (*pos == '\n')
4986 {
4987 pos++;
4988 goto next_line;
4989 }
4990 else if (*pos == '\r')
4991 {
4992 if (pos[1] == '\n')
4993 pos++;
4994 pos++;
4995 goto next_line;
4996 }
4997 goto dflt;
4998
4999 case '#':
5000 if (bol)
5001 {
5002 /* Line directive. */
5003 if (pos - 1 > base && !pfile->state.skipping)
5004 cb (pfile, CPP_DO_print, data,
5005 line_count, base, pos - 1 - base);
5006
5007 /* Prep things for directive handling. */
5008 buffer->next_line = pos;
5009 buffer->need_line = true;
5010 bool ok = _cpp_get_fresh_line (pfile);
5011 gcc_checking_assert (ok);
5012
5013 /* Ensure proper column numbering for generated
5014 error messages. */
5015 buffer->line_base -= pos - line_start;
5016
5017 _cpp_handle_directive (pfile, line_start + 1 != pos);
5018
5019 /* Sanitize the line settings. Duplicate #include's can
5020 mess things up. */
5021 // FIXME: Necessary?
5022 pfile->line_table->highest_location
5023 = pfile->line_table->highest_line;
5024
5025 if (!pfile->state.skipping
5026 && pfile->buffer->next_line < pfile->buffer->rlimit)
5027 cb (pfile, CPP_DO_location, data,
5028 pfile->line_table->highest_line);
5029
5030 goto restart;
5031 }
5032 goto dflt;
5033
5034 case '/':
5035 {
5036 const unsigned char *peek = do_peek_next (pos, limit);
5037 if (!(*peek == '/' || *peek == '*'))
5038 goto dflt;
5039
5040 /* Line or block comment */
5041 bool is_block = *peek == '*';
5042 bool star = false;
5043 bool esc = false;
5044 location_t sloc
5045 = linemap_position_for_column (pfile->line_table,
5046 pos - line_start);
5047
5048 while (pos < limit)
5049 {
5050 char c = *pos++;
5051 switch (c)
5052 {
5053 case '\\':
5054 esc = true;
5055 break;
5056
5057 case '\r':
5058 if (*pos == '\n')
5059 pos++;
5060 /* FALLTHROUGH */
5061
5062 case '\n':
5063 {
5064 CPP_INCREMENT_LINE (pfile, 0);
5065 line_count++;
5066 line_start = pos;
5067 if (!esc && !is_block)
5068 {
5069 bol = true;
5070 goto done_comment;
5071 }
5072 }
5073 if (!esc)
5074 star = false;
5075 esc = false;
5076 break;
5077
5078 case '*':
5079 if (pos > peek)
5080 star = is_block;
5081 esc = false;
5082 break;
5083
5084 case '/':
5085 if (star)
5086 goto done_comment;
5087 /* FALLTHROUGH */
5088
5089 default:
5090 star = false;
5091 esc = false;
5092 break;
5093 }
5094 }
5095 if (pos < limit || is_block)
5096 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5097 "unterminated comment");
5098 done_comment:
5099 lwm = pos;
5100 break;
5101 }
5102
5103 case '\'':
5104 if (!CPP_OPTION (pfile, digit_separators))
5105 goto delimited_string;
5106
5107 /* Possibly a number punctuator. */
5108 if (!ISIDNUM (*do_peek_next (pos, limit)))
5109 goto delimited_string;
5110
5111 goto quote_peek;
5112
5113 case '\"':
5114 if (!CPP_OPTION (pfile, rliterals))
5115 goto delimited_string;
5116
5117 quote_peek:
5118 {
5119 /* For ' see if it's a number punctuator
5120 \.?<digit>(<digit>|<identifier-nondigit>
5121 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5122 /* For " see if it's a raw string
5123 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5124 because that could be 0e+R. */
5125 const unsigned char *peek = pos - 1;
5126 bool quote_first = c == '"';
5127 bool quote_eight = false;
5128 bool maybe_number_start = false;
5129 bool want_number = false;
5130
5131 while ((peek = do_peek_prev (peek, lwm)))
5132 {
5133 unsigned char p = *peek;
5134 if (quote_first)
5135 {
5136 if (!raw)
5137 {
5138 if (p != 'R')
5139 break;
5140 raw = true;
5141 continue;
5142 }
5143
5144 quote_first = false;
5145 if (p == 'L' || p == 'U' || p == 'u')
5146 ;
5147 else if (p == '8')
5148 quote_eight = true;
5149 else
5150 goto second_raw;
5151 }
5152 else if (quote_eight)
5153 {
5154 if (p != 'u')
5155 {
5156 raw = false;
5157 break;
5158 }
5159 quote_eight = false;
5160 }
5161 else if (c == '"')
5162 {
5163 second_raw:;
5164 if (!want_number && ISIDNUM (p))
5165 {
5166 raw = false;
5167 break;
5168 }
5169 }
5170
5171 if (ISDIGIT (p))
5172 maybe_number_start = true;
5173 else if (p == '.')
5174 want_number = true;
5175 else if (ISIDNUM (p))
5176 maybe_number_start = false;
5177 else if (p == '+' || p == '-')
5178 {
5179 if (const unsigned char *peek_prev
5180 = do_peek_prev (peek, lwm))
5181 {
5182 p = *peek_prev;
5183 if (p == 'e' || p == 'E'
5184 || p == 'p' || p == 'P')
5185 {
5186 want_number = true;
5187 maybe_number_start = false;
5188 }
5189 else
5190 break;
5191 }
5192 else
5193 break;
5194 }
5195 else if (p == '\'' || p == '\"')
5196 {
5197 /* If this is lwm, this must be the end of a
5198 previous string. So this is a trailing
5199 literal type, (a) if those are allowed,
5200 and (b) maybe_start is false. Otherwise
5201 this must be a CPP_NUMBER because we've
5202 met another ', and we'd have checked that
5203 in its own right. */
5204 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5205 {
5206 if (!maybe_number_start && !want_number)
5207 /* Must be a literal type. */
5208 raw = false;
5209 }
5210 else if (p == '\''
5211 && CPP_OPTION (pfile, digit_separators))
5212 maybe_number_start = true;
5213 break;
5214 }
5215 else if (c == '\'')
5216 break;
5217 else if (!quote_first && !quote_eight)
5218 break;
5219 }
5220
5221 if (maybe_number_start)
5222 {
5223 if (c == '\'')
5224 /* A CPP NUMBER. */
5225 goto dflt;
5226 raw = false;
5227 }
5228
5229 goto delimited_string;
5230 }
5231
5232 delimited_string:
5233 {
5234 /* (Possibly raw) string or char literal. */
5235 unsigned char end = c;
5236 int delim_len = -1;
5237 const unsigned char *delim = NULL;
5238 location_t sloc = linemap_position_for_column (pfile->line_table,
5239 pos - line_start);
5240 int esc = 0;
5241
5242 if (raw)
5243 {
5244 /* There can be no line breaks in the delimiter. */
5245 delim = pos;
5246 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5247 {
5248 if (delim_len == 16)
5249 {
5250 cpp_error_with_line (pfile, CPP_DL_ERROR,
5251 sloc, 0,
5252 "raw string delimiter"
5253 " longer than %d"
5254 " characters",
5255 delim_len);
5256 raw = false;
5257 pos = delim;
5258 break;
5259 }
5260 if (strchr (") \\\t\v\f\n", c))
5261 {
5262 cpp_error_with_line (pfile, CPP_DL_ERROR,
5263 sloc, 0,
5264 "invalid character '%c'"
5265 " in raw string"
5266 " delimiter", c);
5267 raw = false;
5268 pos = delim;
5269 break;
5270 }
5271 if (pos >= limit)
5272 goto bad_string;
5273 }
5274 }
5275
5276 while (pos < limit)
5277 {
5278 char c = *pos++;
5279 switch (c)
5280 {
5281 case '\\':
5282 if (!raw)
5283 esc++;
5284 break;
5285
5286 case '\r':
5287 if (*pos == '\n')
5288 pos++;
5289 /* FALLTHROUGH */
5290
5291 case '\n':
5292 {
5293 CPP_INCREMENT_LINE (pfile, 0);
5294 line_count++;
5295 line_start = pos;
5296 }
5297 if (esc)
5298 esc--;
5299 break;
5300
5301 case ')':
5302 if (raw
5303 && pos + delim_len + 1 < limit
5304 && pos[delim_len] == end
5305 && !memcmp (delim, pos, delim_len))
5306 {
5307 pos += delim_len + 1;
5308 raw = false;
5309 goto done_string;
5310 }
5311 break;
5312
5313 default:
5314 if (!raw && !(esc & 1) && c == end)
5315 goto done_string;
5316 esc = 0;
5317 break;
5318 }
5319 }
5320 bad_string:
5321 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5322 "unterminated literal");
5323
5324 done_string:
5325 raw = false;
5326 lwm = pos - 1;
5327 }
5328 goto dflt;
5329
5330 case '_':
5331 case 'e':
5332 case 'i':
5333 case 'm':
5334 if (bol && module_p && !pfile->state.skipping
5335 && do_peek_module (pfile, c, pos, limit))
5336 {
5337 /* We've seen the start of a module control line.
5338 Start up the tokenizer. */
5339 pos--; /* Backup over the first character. */
5340
5341 /* Backup over whitespace to start of line. */
5342 while (pos > line_start
5343 && (pos[-1] == ' ' || pos[-1] == '\t'))
5344 pos--;
5345
5346 if (pos > base)
5347 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5348
5349 /* Prep things for directive handling. */
5350 buffer->next_line = pos;
5351 buffer->need_line = true;
5352
5353 /* Now get tokens until the PRAGMA_EOL. */
5354 do
5355 {
5356 location_t spelling;
5357 const cpp_token *tok
5358 = cpp_get_token_with_location (pfile, &spelling);
5359
5360 gcc_assert (pfile->state.in_deferred_pragma
5361 || tok->type == CPP_PRAGMA_EOL);
5362 cb (pfile, CPP_DO_token, data, tok, spelling);
5363 }
5364 while (pfile->state.in_deferred_pragma);
5365
5366 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5367 cb (pfile, CPP_DO_location, data,
5368 pfile->line_table->highest_line);
5369
5370 pfile->mi_valid = false;
5371 goto restart;
5372 }
5373 goto dflt;
5374
5375 default:
5376 dflt:
5377 bol = false;
5378 pfile->mi_valid = false;
5379 break;
5380 }
5381 }
5382
5383 if (buffer->rlimit > base && !pfile->state.skipping)
5384 {
5385 const unsigned char *limit = buffer->rlimit;
5386 /* If the file was not newline terminated, add rlimit, which is
5387 guaranteed to point to a newline, to the end of our range. */
5388 if (limit[-1] != '\n')
5389 {
5390 limit++;
5391 CPP_INCREMENT_LINE (pfile, 0);
5392 line_count++;
5393 }
5394 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5395 }
5396
5397 _cpp_pop_buffer (pfile);
5398 }
5399 while (pfile->buffer);
5400 }
5401