Home | History | Annotate | Line # | Download | only in libcpp
lex.cc revision 1.1.1.3
      1 /* CPP Library - lexical analysis.
      2    Copyright (C) 2000-2024 Free Software Foundation, Inc.
      3    Contributed by Per Bothner, 1994-95.
      4    Based on CCCP program by Paul Rubin, June 1986
      5    Adapted to ANSI C, Richard Stallman, Jan 1987
      6    Broken out to separate file, Zack Weinberg, Mar 2000
      7 
      8 This program is free software; you can redistribute it and/or modify it
      9 under the terms of the GNU General Public License as published by the
     10 Free Software Foundation; either version 3, or (at your option) any
     11 later version.
     12 
     13 This program is distributed in the hope that it will be useful,
     14 but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     16 GNU General Public License for more details.
     17 
     18 You should have received a copy of the GNU General Public License
     19 along with this program; see the file COPYING3.  If not see
     20 <http://www.gnu.org/licenses/>.  */
     21 
     22 #include "config.h"
     23 #include "system.h"
     24 #include "cpplib.h"
     25 #include "internal.h"
     26 
/* Spelling category stored with every entry of the token spelling
   table.  OP() entries are always SPELL_OPERATOR; TK() entries name
   one of the categories through their second argument, which is
   pasted onto the SPELL_ prefix.  */
enum spell_type
{
  SPELL_OPERATOR = 0,	/* Entry generated by OP().  */
  SPELL_IDENT    = 1,
  SPELL_LITERAL  = 2,
  SPELL_NONE     = 3
};
     34 
/* One entry of the token spelling table: how a token type is spelled
   and the text associated with it.  */
struct token_spelling
{
  /* Which of the SPELL_* categories this token type belongs to.  */
  enum spell_type category;
  /* For OP() entries the operator text; for TK() entries the
     stringified enumerator name.  */
  const unsigned char *name;
};
     40 
/* Spellings of the six digraphs.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Expand TTYPE_TABLE into one token_spelling initializer per token
   type.  OP() entries record the operator text S; TK() entries record
   category SPELL_<S> and the stringified enumerator name E.  The two
   helper macros only exist for the duration of the expansion.
   NOTE(review): TTYPE_TABLE and N_TTYPES are defined outside this
   file; presumably in cpplib.h -- confirm.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK
     49 
/* Look up the spelling category / name of TOKEN (a pointer to a token)
   by indexing token_spellings with its type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)

/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
#define UCS_LIMIT 0x10FFFF
     55 
/* Forward declarations of the file-local helpers.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
			    unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
     69 
     70 
     71 /* Utility routine:
     72 
     73    Compares, the token TOKEN to the NUL-terminated string STRING.
     74    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
     75 int
     76 cpp_ideq (const cpp_token *token, const char *string)
     77 {
     78   if (token->type != CPP_NAME)
     79     return 0;
     80 
     81   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
     82 }
     83 
     84 /* Record a note TYPE at byte POS into the current cleaned logical
     85    line.  */
     86 static void
     87 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
     88 {
     89   if (buffer->notes_used == buffer->notes_cap)
     90     {
     91       buffer->notes_cap = buffer->notes_cap * 2 + 200;
     92       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
     93                                   buffer->notes_cap);
     94     }
     95 
     96   buffer->notes[buffer->notes_used].pos = pos;
     97   buffer->notes[buffer->notes_used].type = type;
     98   buffer->notes_used++;
     99 }
    100 
    101 
    102 /* Fast path to find line special characters using optimized character
    104    scanning algorithms.  Anything complicated falls back to the slow
    105    path below.  Since this loop is very hot it's worth doing these kinds
    106    of optimizations.
    107 
    108    One of the paths through the ifdefs should provide
    109 
    110      const uchar *search_line_fast (const uchar *s, const uchar *end);
    111 
    112    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
    113    the found character.
    114 
    115    Note that the last character of the buffer is *always* a newline,
    116    as forced by _cpp_convert_input.  This fact can be used to avoid
    117    explicitly looking for the end of the buffer.  */
    118 
/* Configure gives us an ifdef test.  Turn it into a value that can be
   used in ordinary `if' conditions: 0 unless WORDS_BIGENDIAN was
   already defined.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array size
   evaluates to 1 when the size is acceptable and to -1 (an error)
   otherwise.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
    138 
    139 /* Return X with the first N bytes forced to values that won't match one
    140    of the interesting characters.  Note that NUL is not interesting.  */
    141 
    142 static inline word_type
    143 acc_char_mask_misalign (word_type val, unsigned int n)
    144 {
    145   word_type mask = -1;
    146   if (WORDS_BIGENDIAN)
    147     mask >>= n * 8;
    148   else
    149     mask <<= n * 8;
    150   return val & mask;
    151 }
    152 
    153 /* Return X replicated to all byte positions within WORD_TYPE.  */
    154 
    155 static inline word_type
    156 acc_char_replicate (uchar x)
    157 {
    158   word_type ret;
    159 
    160   ret = (x << 24) | (x << 16) | (x << 8) | x;
    161   if (sizeof(word_type) == 8)
    162     ret = (ret << 16 << 16) | ret;
    163   return ret;
    164 }
    165 
    166 /* Return non-zero if some byte of VAL is (probably) C.  */
    167 
    168 static inline word_type
    169 acc_char_cmp (word_type val, word_type c)
    170 {
    171 #if defined(__GNUC__) && defined(__alpha__)
    172   /* We can get exact results using a compare-bytes instruction.
    173      Get (val == c) via (0 >= (val ^ c)).  */
    174   return __builtin_alpha_cmpbge (0, val ^ c);
    175 #else
    176   word_type magic = 0x7efefefeU;
    177   if (sizeof(word_type) == 8)
    178     magic = (magic << 16 << 16) | 0xfefefefeU;
    179   magic |= 1;
    180 
    181   val ^= c;
    182   return ((val + magic) ^ ~val) & ~magic;
    183 #endif
    184 }
    185 
    186 /* Given the result of acc_char_cmp is non-zero, return the index of
    187    the found character.  If this was a false positive, return -1.  */
    188 
    189 static inline int
    190 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
    191 		word_type val ATTRIBUTE_UNUSED)
    192 {
    193 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
    194   /* The cmpbge instruction sets *bits* of the result corresponding to
    195      matches in the bytes with no false positives.  */
    196   return __builtin_ctzl (cmp);
    197 #else
    198   unsigned int i;
    199 
    200   /* ??? It would be nice to force unrolling here,
    201      and have all of these constants folded.  */
    202   for (i = 0; i < sizeof(word_type); ++i)
    203     {
    204       uchar c;
    205       if (WORDS_BIGENDIAN)
    206 	c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
    207       else
    208 	c = (val >> i * 8) & 0xff;
    209 
    210       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
    211 	return i;
    212     }
    213 
    214   return -1;
    215 #endif
    216 }
    217 
    218 /* A version of the fast scanner using bit fiddling techniques.
    219 
    220    For 32-bit words, one would normally perform 16 comparisons and
    221    16 branches.  With this algorithm one performs 24 arithmetic
    222    operations and one branch.  Whether this is faster with a 32-bit
    223    word size is going to be somewhat system dependent.
    224 
    225    For 64-bit words, we eliminate twice the number of comparisons
    226    and branches without increasing the number of arithmetic operations.
    227    It's almost certainly going to be a win with 64-bit word size.  */
    228 
    229 static const uchar * search_line_acc_char (const uchar *, const uchar *)
    230   ATTRIBUTE_UNUSED;
    231 
    232 static const uchar *
    233 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    234 {
    235   const word_type repl_nl = acc_char_replicate ('\n');
    236   const word_type repl_cr = acc_char_replicate ('\r');
    237   const word_type repl_bs = acc_char_replicate ('\\');
    238   const word_type repl_qm = acc_char_replicate ('?');
    239 
    240   unsigned int misalign;
    241   const word_type *p;
    242   word_type val, t;
    243 
    244   /* Align the buffer.  Mask out any bytes from before the beginning.  */
    245   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
    246   val = *p;
    247   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
    248   if (misalign)
    249     val = acc_char_mask_misalign (val, misalign);
    250 
    251   /* Main loop.  */
    252   while (1)
    253     {
    254       t  = acc_char_cmp (val, repl_nl);
    255       t |= acc_char_cmp (val, repl_cr);
    256       t |= acc_char_cmp (val, repl_bs);
    257       t |= acc_char_cmp (val, repl_qm);
    258 
    259       if (__builtin_expect (t != 0, 0))
    260 	{
    261 	  int i = acc_char_index (t, val);
    262 	  if (i >= 0)
    263 	    return (const uchar *)p + i;
    264 	}
    265 
    266       val = *++p;
    267     }
    268 }
    269 
    270 /* Disable on Solaris 2/x86 until the following problem can be properly
    271    autoconfed:
    272 
    273    The Solaris 10+ assembler tags objects with the instruction set
    274    extensions used, so SSE4.2 executables cannot run on machines that
    275    don't support that extension.  */
    276 
    277 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
    278 
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Row 0 is newline, row 1 carriage
   return, row 2 backslash and row 3 question mark.  */
#define REPL16(c) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL16 ('\n'),
  REPL16 ('\r'),
  REPL16 ('\\'),
  REPL16 ('?'),
};
#undef REPL16
    293 
    294 /* A version of the fast scanner using MMX vectorized byte compare insns.
    295 
    296    This uses the PMOVMSKB instruction which was introduced with "MMX2",
    297    which was packaged into SSE1; it is also present in the AMD MMX
    298    extension.  Mark the function as using "sse" so that we emit a real
    299    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
    300 
    301 static const uchar *
    302 #ifndef __SSE__
    303 __attribute__((__target__("sse")))
    304 #endif
    305 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    306 {
    307   typedef char v8qi __attribute__ ((__vector_size__ (8)));
    308   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
    309 
    310   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
    311   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
    312   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
    313   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
    314 
    315   unsigned int misalign, found, mask;
    316   const v8qi *p;
    317   v8qi data, t, c;
    318 
    319   /* Align the source pointer.  While MMX doesn't generate unaligned data
    320      faults, this allows us to safely scan to the end of the buffer without
    321      reading beyond the end of the last page.  */
    322   misalign = (uintptr_t)s & 7;
    323   p = (const v8qi *)((uintptr_t)s & -8);
    324   data = *p;
    325 
    326   /* Create a mask for the bytes that are valid within the first
    327      16-byte block.  The Idea here is that the AND with the mask
    328      within the loop is "free", since we need some AND or TEST
    329      insn in order to set the flags for the branch anyway.  */
    330   mask = -1u << misalign;
    331 
    332   /* Main loop processing 8 bytes at a time.  */
    333   goto start;
    334   do
    335     {
    336       data = *++p;
    337       mask = -1;
    338 
    339     start:
    340       t = __builtin_ia32_pcmpeqb(data, repl_nl);
    341       c = __builtin_ia32_pcmpeqb(data, repl_cr);
    342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
    343       c = __builtin_ia32_pcmpeqb(data, repl_bs);
    344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
    345       c = __builtin_ia32_pcmpeqb(data, repl_qm);
    346       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
    347       found = __builtin_ia32_pmovmskb (t);
    348       found &= mask;
    349     }
    350   while (!found);
    351 
    352   __builtin_ia32_emms ();
    353 
    354   /* FOUND contains 1 in bits for which we matched a relevant
    355      character.  Conversion to the byte index is trivial.  */
    356   found = __builtin_ctz(found);
    357   return (const uchar *)p + found;
    358 }
    359 
    360 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
    361 
    362 static const uchar *
    363 #ifndef __SSE2__
    364 __attribute__((__target__("sse2")))
    365 #endif
    366 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    367 {
    368   typedef char v16qi __attribute__ ((__vector_size__ (16)));
    369 
    370   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
    371   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
    372   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
    373   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
    374 
    375   unsigned int misalign, found, mask;
    376   const v16qi *p;
    377   v16qi data, t;
    378 
    379   /* Align the source pointer.  */
    380   misalign = (uintptr_t)s & 15;
    381   p = (const v16qi *)((uintptr_t)s & -16);
    382   data = *p;
    383 
    384   /* Create a mask for the bytes that are valid within the first
    385      16-byte block.  The Idea here is that the AND with the mask
    386      within the loop is "free", since we need some AND or TEST
    387      insn in order to set the flags for the branch anyway.  */
    388   mask = -1u << misalign;
    389 
    390   /* Main loop processing 16 bytes at a time.  */
    391   goto start;
    392   do
    393     {
    394       data = *++p;
    395       mask = -1;
    396 
    397     start:
    398       t  = data == repl_nl;
    399       t |= data == repl_cr;
    400       t |= data == repl_bs;
    401       t |= data == repl_qm;
    402       found = __builtin_ia32_pmovmskb128 (t);
    403       found &= mask;
    404     }
    405   while (!found);
    406 
    407   /* FOUND contains 1 in bits for which we matched a relevant
    408      character.  Conversion to the byte index is trivial.  */
    409   found = __builtin_ctz(found);
    410   return (const uchar *)p + found;
    411 }
    412 
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Returns a pointer to the first \n, \r, ? or \\ at or after S.  END
   is only used to decide whether an unaligned 16-byte read at S could
   run past the end of the buffer near a page boundary.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The four characters searched for; the explicit length of 4 passed
     to PCMPESTRI below limits the comparison to these.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
	  && __builtin_expect ((si & 0xfff) > 0xff0, 0))
	{
	  /* There are less than 16 bytes left in the buffer, and less
	     than 16 bytes left on the page.  Reading 16 bytes at this
	     point might generate a spurious page fault.  Defer to the
	     SSE2 implementation, which already handles alignment.  */
	  return search_line_sse2 (s, end);
	}

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
	 memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is the offset of a match within SV.  */
      if (__builtin_expect (index < 16, 0))
	goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
	 few bytes, but we no longer need care for reading past the
	 end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      char f;

      /* By using inline assembly instead of the builtin,
	 we can use the result, as well as the flags set.  F receives
	 the carry flag; the loop ends once it is set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
	     : "=c"(index), "=@ccc"(f)
	     : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
	break;

      s += 16;
    }
#else
  /* Pre-decrement, because the assembly loop advances S by 16 before
     each comparison.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  The loop repeats for as
     long as the carry flag stays clear.  */
  __asm (      ".balign 16\n"
	"0:	add $16, %1\n"
	"	%vpcmpestri\t$0, (%1), %2\n"
	"	jnc 0b"
	: "=&c"(index), "+r"(s)
	: "x"(search), "a"(4), "d"(16));
#endif

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
    493 
/* Check the CPU capabilities.  */

#include "../gcc/config/i386/cpuid.h"

/* Signature shared by all scanner implementations, and the pointer
   through which the selected one is called.  The pointer is assigned
   by init_vectorized_lexer.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
static search_line_fast_type search_line_fast;
    500 
    501 #define HAVE_init_vectorized_lexer 1
    502 static inline void
    503 init_vectorized_lexer (void)
    504 {
    505   unsigned dummy, ecx = 0, edx = 0;
    506   search_line_fast_type impl = search_line_acc_char;
    507   int minimum = 0;
    508 
    509 #if defined(__SSE4_2__)
    510   minimum = 3;
    511 #elif defined(__SSE2__)
    512   minimum = 2;
    513 #elif defined(__SSE__)
    514   minimum = 1;
    515 #endif
    516 
    517   if (minimum == 3)
    518     impl = search_line_sse42;
    519   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
    520     {
    521       if (minimum == 3 || (ecx & bit_SSE4_2))
    522         impl = search_line_sse42;
    523       else if (minimum == 2 || (edx & bit_SSE2))
    524 	impl = search_line_sse2;
    525       else if (minimum == 1 || (edx & bit_SSE))
    526 	impl = search_line_mmx;
    527     }
    528   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
    529     {
    530       if (minimum == 1
    531 	  || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
    532 	impl = search_line_mmx;
    533     }
    534 
    535   search_line_fast = impl;
    536 }
    537 
    538 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
    539 
/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the AltiVec version.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is not consulted.  */

ATTRIBUTE_NO_SANITIZE_UNDEFINED
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  The load is taken
     directly from S, without aligning it first.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      data = __builtin_vec_vsx_ld (0, s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
	 characters.  We want to exit the loop if any byte in T is non-zero.
	 Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore s to point to the 16 bytes we just processed.  */
  s -= 16;

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past each
       all-zero word.  The last word is not tested, because the loop
       above only exits when some byte of T is non-zero.  */
    switch (N)
      {
      case 4:
	l = u.l[i++];
	if (l != 0)
	  break;
	s += sizeof(unsigned long);
	l = u.l[i++];
	if (l != 0)
	  break;
	s += sizeof(unsigned long);
	/* FALLTHRU */
      case 2:
	l = u.l[i++];
	if (l != 0)
	  break;
	s += sizeof(unsigned long);
	l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  The first byte in memory is
       the most significant one on big-endian hosts and the least
       significant one otherwise, hence the choice between counting
       leading and trailing zeros.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}
    639 
    640 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
    641 
/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is not consulted.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first iteration is
     entered at START, because the first block has already been loaded
     and masked above.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
	 characters.  We want to exit the loop if any byte in T is non-zero.
	 Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past each
       all-zero word.  The last word is not tested, because the loop
       above only exits when some byte of T is non-zero.  */
    switch (N)
      {
      case 4:
	l = u.l[i++];
	if (l != 0)
	  break;
	s += sizeof(unsigned long);
	l = u.l[i++];
	if (l != 0)
	  break;
	s += sizeof(unsigned long);
	/* FALLTHROUGH */
      case 2:
	l = u.l[i++];
	if (l != 0)
	  break;
	s += sizeof(unsigned long);
	l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  This variant is only built for
       big-endian hosts, so the first byte in memory is the most
       significant one and leading zeros are counted.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
    758 
    759 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
    760 #include "arm_neon.h"
    761 
/* This doesn't have to be the exact page size, but no system may use
   a size smaller than this.  ARMv8 requires a minimum page size of
   4k.  The impact of being conservative here is a small number of
   cases will take the slightly slower entry path into the main
   loop.  */

#define AARCH64_MIN_PAGE_SIZE 4096

/* A version of the fast scanner using AArch64 Advanced SIMD byte
   compares.  Returns a pointer to the first \n, \r, \\ or ? at or
   after S.  END is not consulted.  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives each byte of an 8-byte half a distinct bit (0x01 ... 0x80),
     so that the matching bytes of a half can be summed into one
     8-bit mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  /* Per-lane shift that moves the mask of one half of the vector into
     bits 8-15 of the combined 16-bit result; which half depends on the
     lane order.  */
#ifdef __ARM_BIG_ENDIAN
  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
#else
  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
#endif

  unsigned int found;
  const uint8_t *p;
  uint8x16_t data;
  uint8x16_t t;
  uint16x8_t m;
  uint8x16_t u, v, w;

  /* Align the source pointer.  */
  p = (const uint8_t *)((uintptr_t)s & -16);

  /* Assuming random string start positions, with a 4k page size we'll take
     the slow path about 0.37% of the time.  */
  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
			 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
			< 16, 0))
    {
      /* Slow path: the string starts near a possible page boundary.
	 Load the aligned block containing S and ignore the bytes in
	 front of S by masking the match bits.  */
      uint32_t misalign, mask;

      misalign = (uintptr_t)s & 15;
      mask = (-1u << misalign) & 0xffff;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      /* Reduce T to a 16-bit mask with one bit per byte.  */
      t = vandq_u8 (t, xmask);
      m = vpaddlq_u8 (t);
      m = vshlq_u16 (m, shift);
      found = vaddvq_u16 (m);
      found &= mask;
      if (found)
	return (const uchar*)p + __builtin_ctz (found);
    }
  else
    {
      /* Fast path: load 16 bytes directly from S, aligned or not.  P
	 still holds the aligned-down address, which may lie before S;
	 the code at DONE accounts for that.  */
      data = vld1q_u8 ((const uint8_t *) s);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
	goto done;
    }

  /* Main loop, processing one aligned 16-byte block at a time, starting
     with the block after the one P points to.  */
  do
    {
      p += 16;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
    } while (!vpaddd_u64 ((uint64x2_t)t));

done:
  /* Now that we've found the terminating substring, work out precisely where
     we need to stop.  */
  t = vandq_u8 (t, xmask);
  m = vpaddlq_u8 (t);
  m = vshlq_u16 (m, shift);
  found = vaddvq_u16 (m);
  /* When we arrive here straight from the fast path, T describes the
     block loaded at S while P may still be below S, so the offset is
     applied to S.  After the main loop P is the block that matched.  */
  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
	  + __builtin_ctz (found));
}
    853 
    854 #elif defined (__ARM_NEON)
    855 #include "arm_neon.h"
    856 
        /* ARM NEON variant of the line scanner.  Starting at S, examine 16
           bytes per iteration and return a pointer to the first byte at or
           after S that is one of '\n', '\r', '\\' or '?'.

           END is not consulted: the loop only exits once a match has been
           found, so this presumably relies on the caller's buffer always
           containing one of those bytes at or after S -- confirm against the
           callers.  */
    857 static const uchar *
    858 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    859 {
    860   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
    861   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
    862   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
    863   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
          /* Each 8-byte half holds the byte values 0x01, 0x02, ..., 0x80, so
             ANDing a comparison result with XMASK leaves a different single bit
             in every matching byte of a half.  */
    864   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
    865 
    866   unsigned int misalign, found, mask;
    867   const uint8_t *p;
    868   uint8x16_t data;
    869 
    870   /* Align the source pointer.  The first load therefore starts at the
             16-byte boundary at or below S and may include bytes that precede
             S; MASK (below) discards any matches among them.  */
    871   misalign = (uintptr_t)s & 15;
    872   p = (const uint8_t *)((uintptr_t)s & -16);
    873   data = vld1q_u8 (p);
    874 
    875   /* Create a mask for the bytes that are valid within the first
    876      16-byte block.  The idea here is that the AND with the mask
    877      within the loop is "free", since we need some AND or TEST
    878      insn in order to set the flags for the branch anyway.  */
    879   mask = (-1u << misalign) & 0xffff;
    880 
    881   /* Main loop, processing 16 bytes at a time.  The first block has
             already been loaded and uses the partial MASK, so enter the loop
             body at START, past the load.  */
    882   goto start;
    883 
    884   do
    885     {
    886       uint8x8_t l;
    887       uint16x4_t m;
    888       uint32x2_t n;
    889       uint8x16_t t, u, v, w;
    890 
              /* Every block after the first lies entirely at or after S, so all
                 16 of its bytes count.  */
    891       p += 16;
    892       data = vld1q_u8 (p);
    893       mask = 0xffff;
    894 
    895     start:
              /* Compare against each of the four characters and OR the
                 results together, then keep one bit per matching byte.  */
    896       t = vceqq_u8 (data, repl_nl);
    897       u = vceqq_u8 (data, repl_cr);
    898       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
    899       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
    900       t = vandq_u8 (vorrq_u8 (v, w), xmask);
              /* Three rounds of adjacent-lane addition fold each 8-byte half
                 into a single sum; because the bits are distinct the sum is
                 the half's 8-bit match mask, one per 32-bit lane of N.  */
    901       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
    902       m = vpaddl_u8 (l);
    903       n = vpaddl_u16 (m);
    904 
              /* Merge the two per-half masks: N viewed as one 64-bit value is
                 ORed with itself shifted right by 24 and lane 0 is extracted,
                 which is intended to yield one bit per byte of the block.  */
    905       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
    906 	      vshr_n_u64 ((uint64x1_t) n, 24)), 0);
    907       found &= mask;
    908     }
    909   while (!found);
    910 
    911   /* FOUND contains 1 in bits for which we matched a relevant
    912      character.  Conversion to the byte index is trivial.  */
    913   found = __builtin_ctz (found);
    914   return (const uchar *)p + found;
    915 }
    916 
    917 #else
    918 
    919 /* We only have one accelerated alternative.  Use a direct call so that
    920    we encourage inlining.  This is the fallback branch of the
           conditional chain above: no target-specific search_line_fast was
           compiled, so the name is simply aliased to search_line_acc_char.  */
    921 
    922 #define search_line_fast  search_line_acc_char
    923 
    924 #endif
    925 
/* Initialize the lexer.  When HAVE_init_vectorized_lexer is defined this
   calls init_vectorized_lexer; otherwise there is nothing to do.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
    935 
    936 /* Returns with a logical line that contains no escaped newlines or
    937    trigraphs.  This is a time-critical inner loop.

           The line starts at PFILE->buffer->next_line.  buffer->cur and
           buffer->line_base are set to that start before scanning, and the
           note list is reset.  On return the byte at the end of the cleaned
           line has been overwritten with '\n', a sentinel note has been added
           just past it, and buffer->next_line points one past the last source
           byte consumed.

           Escaped newlines, and trigraphs when the trigraphs option is on,
           are removed by copying the rest of the line down over them in
           place; each such spot is recorded with add_line_note.  Buffers
           flagged from_stage3 skip all of that and only have their line end
           located.  */
    938 void
    939 _cpp_clean_line (cpp_reader *pfile)
    940 {
    941   cpp_buffer *buffer;
    942   const uchar *s;
    943   uchar c, *d, *p;
    944 
    945   buffer = pfile->buffer;
          /* Discard the notes of the previous line and start this one.  */
    946   buffer->cur_note = buffer->notes_used = 0;
    947   buffer->cur = buffer->line_base = buffer->next_line;
    948   buffer->need_line = false;
    949   s = buffer->next_line;
    950 
    951   if (!buffer->from_stage3)
    952     {
              /* Most recent backslash seen on the fast path, if any.  */
    953       const uchar *pbackslash = NULL;
    954 
    955       /* Fast path.  This is the common case of an un-escaped line with
    956 	 no trigraphs.  The primary win here is by not writing any
    957 	 data back to memory until we have to.  */
    958       while (1)
    959 	{
    960 	  /* Perform an optimized search for \n, \r, \\, ?.  */
    961 	  s = search_line_fast (s, buffer->rlimit);
    962 
    963 	  c = *s;
    964 	  if (c == '\\')
    965 	    {
    966 	      /* Record the location of the backslash and continue.  */
    967 	      pbackslash = s++;
    968 	    }
    969 	  else if (__builtin_expect (c == '?', 0))
    970 	    {
    971 	      if (__builtin_expect (s[1] == '?', false)
    972 		   && _cpp_trigraph_map[s[2]])
    973 		{
    974 		  /* Have a trigraph.  We may or may not have to convert
    975 		     it.  Add a line note regardless, for -Wtrigraphs.  */
    976 		  add_line_note (buffer, s, s[2]);
    977 		  if (CPP_OPTION (pfile, trigraphs))
    978 		    {
    979 		      /* We do, and that means we have to switch to the
    980 		         slow path.  Overwrite the first '?' with the
                                 replacement character and leave S on the last
                                 byte of the trigraph; the slow path pre-increments
                                 both pointers, so it copies what follows down
                                 over the two remaining bytes.  */
    981 		      d = (uchar *) s;
    982 		      *d = _cpp_trigraph_map[s[2]];
    983 		      s += 2;
    984 		      goto slow_path;
    985 		    }
    986 		}
    987 	      /* Not a trigraph.  Continue on fast-path.  */
    988 	      s++;
    989 	    }
    990 	  else
    991 	    break;
    992 	}
    993 
    994       /* This must be \r or \n.  We're either done, or we'll be forced
    995 	 to write back to the buffer and continue on the slow path.  */
    996       d = (uchar *) s;
    997 
              /* The terminator sits exactly at rlimit: nothing follows it.  */
    998       if (__builtin_expect (s == buffer->rlimit, false))
    999 	goto done;
   1000 
   1001       /* DOS line ending? */
   1002       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
   1003 	{
   1004 	  s++;
   1005 	  if (s == buffer->rlimit)
   1006 	    goto done;
   1007 	}
   1008 
              /* No backslash was seen on this line, so the newline cannot be
                 escaped.  */
   1009       if (__builtin_expect (pbackslash == NULL, true))
   1010 	goto done;
   1011 
   1012       /* Check for escaped newline.  Step back from the terminator over
                 any bytes accepted by is_nvspace; the newline is escaped only
                 if the byte before them is the last backslash recorded.  */
   1013       p = d;
   1014       while (is_nvspace (p[-1]))
   1015 	p--;
   1016       if (p - 1 != pbackslash)
   1017 	goto done;
   1018 
   1019       /* Have an escaped newline; process it and proceed to
   1020 	 the slow path.  The note type is ' ' when whitespace separated
                 the backslash from the newline and '\\' otherwise.  D is set so
                 that the slow path's first store (*++d) lands on the backslash,
                 and next_line temporarily marks the start of the continued
                 text, which bounds the backward scan in the slow path.  */
   1021       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
   1022       d = p - 2;
   1023       buffer->next_line = p - 1;
   1024 
   1025     slow_path:
              /* Copy one byte per iteration from S (read position) to D
                 (write position), which now trails S.  */
   1026       while (1)
   1027 	{
   1028 	  c = *++s;
   1029 	  *++d = c;
   1030 
   1031 	  if (c == '\n' || c == '\r')
   1032 	    {
   1033 	      /* Handle DOS line endings.  */
   1034 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
   1035 		s++;
   1036 	      if (s == buffer->rlimit)
   1037 		break;
   1038 
   1039 	      /* Escaped?  Scan back through the already-copied text, but
                         not past the point where the previous continuation was
                         spliced.  */
   1040 	      p = d;
   1041 	      while (p != buffer->next_line && is_nvspace (p[-1]))
   1042 		p--;
   1043 	      if (p == buffer->next_line || p[-1] != '\\')
   1044 		break;
   1045 
   1046 	      add_line_note (buffer, p - 1, p != d ? ' ': '\\');
   1047 	      d = p - 2;
   1048 	      buffer->next_line = p - 1;
   1049 	    }
   1050 	  else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
   1051 	    {
   1052 	      /* Add a note regardless, for the benefit of -Wtrigraphs.  */
   1053 	      add_line_note (buffer, d, s[2]);
   1054 	      if (CPP_OPTION (pfile, trigraphs))
   1055 		{
                          /* Replace the '?' just copied and skip the rest of
                             the trigraph in the source.  */
   1056 		  *d = _cpp_trigraph_map[s[2]];
   1057 		  s += 2;
   1058 		}
   1059 	    }
   1060 	}
   1061     }
   1062   else
   1063     {
              /* No escaped-newline or trigraph handling on this path: just
                 find the line terminator.  */
   1064       while (*s != '\n' && *s != '\r')
   1065 	s++;
   1066       d = (uchar *) s;
   1067 
   1068       /* Handle DOS line endings.  */
   1069       if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
   1070 	s++;
   1071     }
   1072 
   1073  done:
          /* D is the end of the cleaned line, S the last source byte used.  */
   1074   *d = '\n';
   1075   /* A sentinel note that should never be processed.  */
   1076   add_line_note (buffer, d + 1, '\n');
   1077   buffer->next_line = s + 1;
   1078 }
   1079 
   1080 template <bool lexing_raw_string>
   1081 static bool get_fresh_line_impl (cpp_reader *pfile);
   1082 
   1083 /* Return true if the trigraph indicated by NOTE should be warned
   1084    about in a comment.  */
   1085 static bool
   1086 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
   1087 {
   1088   const uchar *p;
   1089 
   1090   /* Within comments we don't warn about trigraphs, unless the
   1091      trigraph forms an escaped newline, as that may change
   1092      behavior.  */
   1093   if (note->type != '/')
   1094     return false;
   1095 
   1096   /* If -trigraphs, then this was an escaped newline iff the next note
   1097      is coincident.  */
   1098   if (CPP_OPTION (pfile, trigraphs))
   1099     return note[1].pos == note->pos;
   1100 
   1101   /* Otherwise, see if this forms an escaped newline.  */
   1102   p = note->pos + 3;
   1103   while (is_nvspace (*p))
   1104     p++;
   1105 
   1106   /* There might have been escaped newlines between the trigraph and the
   1107      newline we found.  Hence the position test.  */
   1108   return (*p == '\n' && p < note[1].pos);
   1109 }
   1110 
   1111 /* Process the notes created by add_line_note as far as the current
   1112    location.  */
   1113 void
   1114 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
   1115 {
   1116   cpp_buffer *buffer = pfile->buffer;
   1117 
   1118   for (;;)
   1119     {
   1120       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
   1121       unsigned int col;
   1122 
   1123       if (note->pos > buffer->cur)
   1124 	break;
   1125 
   1126       buffer->cur_note++;
   1127       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
   1128 
   1129       if (note->type == '\\' || note->type == ' ')
   1130 	{
   1131 	  if (note->type == ' ' && !in_comment)
   1132 	    cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
   1133 				 "backslash and newline separated by space");
   1134 
   1135 	  if (buffer->next_line > buffer->rlimit)
   1136 	    {
   1137 	      cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
   1138 				   "backslash-newline at end of file");
   1139 	      /* Prevent "no newline at end of file" warning.  */
   1140 	      buffer->next_line = buffer->rlimit;
   1141 	    }
   1142 
   1143 	  buffer->line_base = note->pos;
   1144 	  CPP_INCREMENT_LINE (pfile, 0);
   1145 	}
   1146       else if (_cpp_trigraph_map[note->type])
   1147 	{
   1148 	  if (CPP_OPTION (pfile, warn_trigraphs)
   1149 	      && (!in_comment || warn_in_comment (pfile, note)))
   1150 	    {
   1151 	      if (CPP_OPTION (pfile, trigraphs))
   1152 		cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
   1153                                        pfile->line_table->highest_line, col,
   1154 				       "trigraph ??%c converted to %c",
   1155 				       note->type,
   1156 				       (int) _cpp_trigraph_map[note->type]);
   1157 	      else
   1158 		{
   1159 		  cpp_warning_with_line
   1160 		    (pfile, CPP_W_TRIGRAPHS,
   1161                      pfile->line_table->highest_line, col,
   1162 		     "trigraph ??%c ignored, use -trigraphs to enable",
   1163 		     note->type);
   1164 		}
   1165 	    }
   1166 	}
   1167       else if (note->type == 0)
   1168 	/* Already processed in lex_raw_string.  */;
   1169       else
   1170 	abort ();
   1171     }
   1172 }
   1173 
   1174 namespace bidi {
   1175   enum class kind {
   1176     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
   1177   };
   1178 
   1179   /* All the UTF-8 encodings of bidi characters start with E2.  */
   1180   constexpr uchar utf8_start = 0xe2;
   1181 
   1182   struct context
   1183   {
   1184     context () {}
   1185     context (location_t loc, kind k, bool pdf, bool ucn)
   1186     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
   1187     {
   1188     }
   1189 
   1190     kind get_pop_kind () const
   1191     {
   1192       return m_pdf ? kind::PDF : kind::PDI;
   1193     }
   1194     bool ucn_p () const
   1195     {
   1196       return m_ucn;
   1197     }
   1198 
   1199     location_t m_loc;
   1200     kind m_kind;
   1201     unsigned m_pdf : 1;
   1202     unsigned m_ucn : 1;
   1203   };
   1204 
   1205   /* A vector holding currently open bidi contexts.  We use a char for
   1206      each context, its LSB is 1 if it represents a PDF context, 0 if it
   1207      represents a PDI context.  The next bit is 1 if this context was open
   1208      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
   1209   semi_embedded_vec <context, 16> vec;
   1210 
   1211   /* Close the whole comment/identifier/string literal/character constant
   1212      context.  */
   1213   void on_close ()
   1214   {
   1215     vec.truncate (0);
   1216   }
   1217 
   1218   /* Pop the last element in the vector.  */
   1219   void pop ()
   1220   {
   1221     unsigned int len = vec.count ();
   1222     gcc_checking_assert (len > 0);
   1223     vec.truncate (len - 1);
   1224   }
   1225 
   1226   /* Return the pop kind of the context of the Ith element.  */
   1227   kind pop_kind_at (unsigned int i)
   1228   {
   1229     return vec[i].get_pop_kind ();
   1230   }
   1231 
   1232   /* Return the pop kind of the context that is currently opened.  */
   1233   kind current_ctx ()
   1234   {
   1235     unsigned int len = vec.count ();
   1236     if (len == 0)
   1237       return kind::NONE;
   1238     return vec[len - 1].get_pop_kind ();
   1239   }
   1240 
   1241   /* Return true if the current context comes from a UCN origin, that is,
   1242      the bidi char which started this bidi context was written as a UCN.  */
   1243   bool current_ctx_ucn_p ()
   1244   {
   1245     unsigned int len = vec.count ();
   1246     gcc_checking_assert (len > 0);
   1247     return vec[len - 1].m_ucn;
   1248   }
   1249 
   1250   location_t current_ctx_loc ()
   1251   {
   1252     unsigned int len = vec.count ();
   1253     gcc_checking_assert (len > 0);
   1254     return vec[len - 1].m_loc;
   1255   }
   1256 
   1257   /* We've read a bidi char, update the current vector as necessary.
   1258      LOC is only valid when K is not kind::NONE.  */
   1259   void on_char (kind k, bool ucn_p, location_t loc)
   1260   {
   1261     switch (k)
   1262       {
   1263       case kind::LRE:
   1264       case kind::RLE:
   1265       case kind::LRO:
   1266       case kind::RLO:
   1267 	vec.push (context (loc, k, true, ucn_p));
   1268 	break;
   1269       case kind::LRI:
   1270       case kind::RLI:
   1271       case kind::FSI:
   1272 	vec.push (context (loc, k, false, ucn_p));
   1273 	break;
   1274       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
   1275 	 whose scope has not yet been terminated.  */
   1276       case kind::PDF:
   1277 	if (current_ctx () == kind::PDF)
   1278 	  pop ();
   1279 	break;
   1280       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
   1281 	 scope has not yet been terminated, as well as the scopes of
   1282 	 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
   1283 	 yet been terminated.  */
   1284       case kind::PDI:
   1285 	for (int i = vec.count () - 1; i >= 0; --i)
   1286 	  if (pop_kind_at (i) == kind::PDI)
   1287 	    {
   1288 	      vec.truncate (i);
   1289 	      break;
   1290 	    }
   1291 	break;
   1292       case kind::LTR:
   1293       case kind::RTL:
   1294 	/* These aren't popped by a PDF/PDI.  */
   1295 	break;
   1296       ATTR_LIKELY case kind::NONE:
   1297 	break;
   1298       default:
   1299 	abort ();
   1300       }
   1301   }
   1302 
   1303   /* Return a descriptive string for K.  */
   1304   const char *to_str (kind k)
   1305   {
   1306     switch (k)
   1307       {
   1308       case kind::LRE:
   1309 	return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
   1310       case kind::RLE:
   1311 	return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
   1312       case kind::LRO:
   1313 	return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
   1314       case kind::RLO:
   1315 	return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
   1316       case kind::LRI:
   1317 	return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
   1318       case kind::RLI:
   1319 	return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
   1320       case kind::FSI:
   1321 	return "U+2068 (FIRST STRONG ISOLATE)";
   1322       case kind::PDF:
   1323 	return "U+202C (POP DIRECTIONAL FORMATTING)";
   1324       case kind::PDI:
   1325 	return "U+2069 (POP DIRECTIONAL ISOLATE)";
   1326       case kind::LTR:
   1327 	return "U+200E (LEFT-TO-RIGHT MARK)";
   1328       case kind::RTL:
   1329 	return "U+200F (RIGHT-TO-LEFT MARK)";
   1330       default:
   1331 	abort ();
   1332       }
   1333   }
   1334 }
   1335 
   1336 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
   1337    within the current line in FILE, with the caret at START.  */
   1338 
   1339 static location_t
   1340 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
   1341 					 const unsigned char *const start,
   1342 					 size_t num_bytes)
   1343 {
   1344   gcc_checking_assert (num_bytes > 0);
   1345 
   1346   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
   1347      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
   1348      whereas linemap_position_for_column is 1-based.  */
   1349 
   1350   /* Get 0-based offsets within the line.  */
   1351   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
   1352   size_t end_offset = start_offset + num_bytes - 1;
   1353 
   1354   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
   1355   location_t start_loc = linemap_position_for_column (pfile->line_table,
   1356 						      start_offset + 1);
   1357   location_t end_loc = linemap_position_for_column (pfile->line_table,
   1358 						     end_offset + 1);
   1359 
   1360   if (start_loc == end_loc)
   1361     return start_loc;
   1362 
   1363   source_range src_range;
   1364   src_range.m_start = start_loc;
   1365   src_range.m_finish = end_loc;
   1366   location_t combined_loc
   1367     = pfile->line_table->get_or_create_combined_loc (start_loc,
   1368 						     src_range,
   1369 						     nullptr,
   1370 						     0);
   1371   return combined_loc;
   1372 }
   1373 
   1374 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
   1375 
   1376 static bidi::kind
   1377 get_bidi_utf8_1 (const unsigned char *const p)
   1378 {
   1379   gcc_checking_assert (p[0] == bidi::utf8_start);
   1380 
   1381   if (p[1] == 0x80)
   1382     switch (p[2])
   1383       {
   1384       case 0xaa:
   1385 	return bidi::kind::LRE;
   1386       case 0xab:
   1387 	return bidi::kind::RLE;
   1388       case 0xac:
   1389 	return bidi::kind::PDF;
   1390       case 0xad:
   1391 	return bidi::kind::LRO;
   1392       case 0xae:
   1393 	return bidi::kind::RLO;
   1394       case 0x8e:
   1395 	return bidi::kind::LTR;
   1396       case 0x8f:
   1397 	return bidi::kind::RTL;
   1398       default:
   1399 	break;
   1400       }
   1401   else if (p[1] == 0x81)
   1402     switch (p[2])
   1403       {
   1404       case 0xa6:
   1405 	return bidi::kind::LRI;
   1406       case 0xa7:
   1407 	return bidi::kind::RLI;
   1408       case 0xa8:
   1409 	return bidi::kind::FSI;
   1410       case 0xa9:
   1411 	return bidi::kind::PDI;
   1412       default:
   1413 	break;
   1414       }
   1415 
   1416   return bidi::kind::NONE;
   1417 }
   1418 
   1419 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
   1420    If the kind is not NONE, write the location to *OUT.*/
   1421 
   1422 static bidi::kind
   1423 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
   1424 {
   1425   bidi::kind result = get_bidi_utf8_1 (p);
   1426   if (result != bidi::kind::NONE)
   1427     {
   1428       /* We have a sequence of 3 bytes starting at P.  */
   1429       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
   1430     }
   1431   return result;
   1432 }
   1433 
   1434 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
   1435 
   1436 static bidi::kind
   1437 get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
   1438 {
   1439   /* 6.4.3 Universal Character Names
   1440       \u hex-quad
   1441       \U hex-quad hex-quad
   1442       \u { simple-hexadecimal-digit-sequence }
   1443      where \unnnn means \U0000nnnn.  */
   1444 
   1445   *end = p + 4;
   1446   if (is_U)
   1447     {
   1448       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
   1449 	return bidi::kind::NONE;
   1450       /* Skip 4B so we can treat \u and \U the same below.  */
   1451       p += 4;
   1452       *end += 4;
   1453     }
   1454   else if (p[0] == '{')
   1455     {
   1456       p++;
   1457       while (*p == '0')
   1458 	p++;
   1459       if (p[0] != '2'
   1460 	  || p[1] != '0'
   1461 	  || !ISXDIGIT (p[2])
   1462 	  || !ISXDIGIT (p[3])
   1463 	  || p[4] != '}')
   1464 	return bidi::kind::NONE;
   1465       *end = p + 5;
   1466     }
   1467 
   1468   /* All code points we are looking for start with 20xx.  */
   1469   if (p[0] != '2' || p[1] != '0')
   1470     return bidi::kind::NONE;
   1471   else if (p[2] == '2')
   1472     switch (p[3])
   1473       {
   1474       case 'a':
   1475       case 'A':
   1476 	return bidi::kind::LRE;
   1477       case 'b':
   1478       case 'B':
   1479 	return bidi::kind::RLE;
   1480       case 'c':
   1481       case 'C':
   1482 	return bidi::kind::PDF;
   1483       case 'd':
   1484       case 'D':
   1485 	return bidi::kind::LRO;
   1486       case 'e':
   1487       case 'E':
   1488 	return bidi::kind::RLO;
   1489       default:
   1490 	break;
   1491       }
   1492   else if (p[2] == '6')
   1493     switch (p[3])
   1494       {
   1495       case '6':
   1496 	return bidi::kind::LRI;
   1497       case '7':
   1498 	return bidi::kind::RLI;
   1499       case '8':
   1500 	return bidi::kind::FSI;
   1501       case '9':
   1502 	return bidi::kind::PDI;
   1503       default:
   1504 	break;
   1505       }
   1506   else if (p[2] == '0')
   1507     switch (p[3])
   1508       {
   1509       case 'e':
   1510       case 'E':
   1511 	return bidi::kind::LTR;
   1512       case 'f':
   1513       case 'F':
   1514 	return bidi::kind::RTL;
   1515       default:
   1516 	break;
   1517       }
   1518 
   1519   return bidi::kind::NONE;
   1520 }
   1521 
   1522 /* Parse a UCN where P points just past \u or \U and return its bidi code.
   1523    If the kind is not NONE, write the location to *OUT.  */
   1524 
   1525 static bidi::kind
   1526 get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
   1527 	      location_t *out)
   1528 {
   1529   const unsigned char *end;
   1530   bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
   1531   if (result != bidi::kind::NONE)
   1532     {
   1533       const unsigned char *start = p - 2;
   1534       size_t num_bytes = end - start;
   1535       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
   1536     }
   1537   return result;
   1538 }
   1539 
   1540 /* Parse a named universal character escape where P points just past \N and
   1541    return its bidi code.  If the kind is not NONE, write the location to
   1542    *OUT.  */
   1543 
   1544 static bidi::kind
   1545 get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
   1546 {
   1547   bidi::kind result = bidi::kind::NONE;
   1548   if (*p != '{')
   1549     return bidi::kind::NONE;
   1550   if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
   1551     {
   1552       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
   1553 	result = bidi::kind::LTR;
   1554       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
   1555 	result = bidi::kind::LRE;
   1556       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
   1557 	result = bidi::kind::LRO;
   1558       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
   1559 	result = bidi::kind::LRI;
   1560     }
   1561   else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
   1562     {
   1563       if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
   1564 	result = bidi::kind::RTL;
   1565       else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
   1566 	result = bidi::kind::RLE;
   1567       else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
   1568 	result = bidi::kind::RLO;
   1569       else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
   1570 	result = bidi::kind::RLI;
   1571     }
   1572   else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
   1573     {
   1574       if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
   1575 	result = bidi::kind::PDF;
   1576       else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
   1577 	result = bidi::kind::PDI;
   1578     }
   1579   else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
   1580     result = bidi::kind::FSI;
   1581   if (result != bidi::kind::NONE)
   1582     *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
   1583 						    (strchr ((const char *)
   1584 							     (p + 1), '}')
   1585 						     - (const char *) p)
   1586 						    + 3);
   1587   return result;
   1588 }
   1589 
   1590 /* Subclass of rich_location for reporting on unpaired UTF-8
   1591    bidirectional control character(s).
   1592    Escape the source lines on output, and show all unclosed
   1593    bidi context, labelling everything.  */
   1594 
   1595 class unpaired_bidi_rich_location : public rich_location
   1596 {
   1597  public:
   1598   class custom_range_label : public range_label
   1599   {
   1600    public:
   1601      label_text get_text (unsigned range_idx) const final override
   1602      {
   1603        /* range 0 is the primary location; each subsequent range i + 1
   1604 	  is for bidi::vec[i].  */
   1605        if (range_idx > 0)
   1606 	 {
   1607 	   const bidi::context &ctxt (bidi::vec[range_idx - 1]);
   1608 	   return label_text::borrow (bidi::to_str (ctxt.m_kind));
   1609 	 }
   1610        else
   1611 	 return label_text::borrow (_("end of bidirectional context"));
   1612      }
   1613   };
   1614 
   1615   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
   1616   : rich_location (pfile->line_table, loc, &m_custom_label)
   1617   {
   1618     set_escape_on_output (true);
   1619     for (unsigned i = 0; i < bidi::vec.count (); i++)
   1620       add_range (bidi::vec[i].m_loc,
   1621 		 SHOW_RANGE_WITHOUT_CARET,
   1622 		 &m_custom_label);
   1623   }
   1624 
   1625  private:
   1626    custom_range_label m_custom_label;
   1627 };
   1628 
   1629 /* We're closing a bidi context, that is, we've encountered a newline,
   1630    are closing a C-style comment, or are at the end of a string literal,
   1631    character constant, or identifier.  Warn if this context was not
   1632    properly terminated by a PDI or PDF.  P points to the last character
   1633    in this context.  */
   1634 
   1635 static void
   1636 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
   1637 {
   1638   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
   1639   if (bidi::vec.count () > 0
   1640       && (warn_bidi & bidirectional_unpaired
   1641 	  && (!bidi::current_ctx_ucn_p ()
   1642 	      || (warn_bidi & bidirectional_ucn))))
   1643     {
   1644       const location_t loc
   1645 	= linemap_position_for_column (pfile->line_table,
   1646 				       CPP_BUF_COLUMN (pfile->buffer, p));
   1647       unpaired_bidi_rich_location rich_loc (pfile, loc);
   1648       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
   1649 	 forms of a diagnostic, so fake it for now.  */
   1650       if (bidi::vec.count () > 1)
   1651 	cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1652 			"unpaired UTF-8 bidirectional control characters "
   1653 			"detected");
   1654       else
   1655 	cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1656 			"unpaired UTF-8 bidirectional control character "
   1657 			"detected");
   1658     }
   1659   /* We're done with this context.  */
   1660   bidi::on_close ();
   1661 }
   1662 
   1663 /* We're at the beginning or in the middle of an identifier/comment/string
   1664    literal/character constant.  Warn if we've encountered a bidi character.
   1665    KIND says which bidi control character it was; UCN_P is true iff this bidi
   1666    control character was written as a UCN.  LOC is the location of the
   1667    character, but is only valid if KIND != bidi::kind::NONE.  */
   1668 
   1669 static void
   1670 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
   1671 			 bool ucn_p, location_t loc)
   1672 {
   1673   if (__builtin_expect (kind == bidi::kind::NONE, 1))
   1674     return;
   1675 
   1676   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
   1677 
   1678   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
   1679     {
   1680       rich_location rich_loc (pfile->line_table, loc);
   1681       rich_loc.set_escape_on_output (true);
   1682 
   1683       /* It seems excessive to warn about a PDI/PDF that is closing
   1684 	 an opened context because we've already warned about the
   1685 	 opening character.  Except warn when we have a UCN x UTF-8
   1686 	 mismatch, if UCN checking is enabled.  */
   1687       if (kind == bidi::current_ctx ())
   1688 	{
   1689 	  if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
   1690 	      && bidi::current_ctx_ucn_p () != ucn_p)
   1691 	    {
   1692 	      rich_loc.add_range (bidi::current_ctx_loc ());
   1693 	      cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1694 			      "UTF-8 vs UCN mismatch when closing "
   1695 			      "a context by \"%s\"", bidi::to_str (kind));
   1696 	    }
   1697 	}
   1698       else if (warn_bidi & bidirectional_any
   1699 	       && (!ucn_p || (warn_bidi & bidirectional_ucn)))
   1700 	{
   1701 	  if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
   1702 	    cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1703 			    "\"%s\" is closing an unopened context",
   1704 			    bidi::to_str (kind));
   1705 	  else
   1706 	    cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1707 			    "found problematic Unicode character \"%s\"",
   1708 			    bidi::to_str (kind));
   1709 	}
   1710     }
   1711   /* We're done with this context.  */
   1712   bidi::on_char (kind, ucn_p, loc);
   1713 }
   1714 
        /* Byte-value thresholds used when diagnosing invalid UTF-8 below: a
           byte B is treated as a continuation byte when
           utf8_continuation <= B < utf8_signifier, and a byte below
           utf8_signifier in the leading position is reported on its own.  */
   1715 static const cppchar_t utf8_continuation = 0x80;
   1716 static const cppchar_t utf8_signifier = 0xC0;
   1717 
   1718 /* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
   1719    at PFILE->buffer->cur.  Return a pointer after the diagnosed
   1720    invalid character.  */
   1721 
   1722 static const uchar *
   1723 _cpp_warn_invalid_utf8 (cpp_reader *pfile)
   1724 {
   1725   cpp_buffer *buffer = pfile->buffer;
   1726   const uchar *cur = buffer->cur;
   1727   bool pedantic = (CPP_PEDANTIC (pfile)
   1728 		   && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
   1729 
   1730   if (cur[0] < utf8_signifier
   1731       || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
   1732     {
   1733       if (pedantic)
   1734 	cpp_error_with_line (pfile, CPP_DL_PEDWARN,
   1735 			     pfile->line_table->highest_line,
   1736 			     CPP_BUF_COL (buffer),
   1737 			     "invalid UTF-8 character <%x>",
   1738 			     cur[0]);
   1739       else
   1740 	cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
   1741 			       pfile->line_table->highest_line,
   1742 			       CPP_BUF_COL (buffer),
   1743 			       "invalid UTF-8 character <%x>",
   1744 			       cur[0]);
   1745       return cur + 1;
   1746     }
   1747   else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
   1748     {
   1749       if (pedantic)
   1750 	cpp_error_with_line (pfile, CPP_DL_PEDWARN,
   1751 			     pfile->line_table->highest_line,
   1752 			     CPP_BUF_COL (buffer),
   1753 			     "invalid UTF-8 character <%x><%x>",
   1754 			     cur[0], cur[1]);
   1755       else
   1756 	cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
   1757 			       pfile->line_table->highest_line,
   1758 			       CPP_BUF_COL (buffer),
   1759 			       "invalid UTF-8 character <%x><%x>",
   1760 			       cur[0], cur[1]);
   1761       return cur + 2;
   1762     }
   1763   else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
   1764     {
   1765       if (pedantic)
   1766 	cpp_error_with_line (pfile, CPP_DL_PEDWARN,
   1767 			     pfile->line_table->highest_line,
   1768 			     CPP_BUF_COL (buffer),
   1769 			     "invalid UTF-8 character <%x><%x><%x>",
   1770 			     cur[0], cur[1], cur[2]);
   1771       else
   1772 	cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
   1773 			       pfile->line_table->highest_line,
   1774 			       CPP_BUF_COL (buffer),
   1775 			       "invalid UTF-8 character <%x><%x><%x>",
   1776 			       cur[0], cur[1], cur[2]);
   1777       return cur + 3;
   1778     }
   1779   else
   1780     {
   1781       if (pedantic)
   1782 	cpp_error_with_line (pfile, CPP_DL_PEDWARN,
   1783 			     pfile->line_table->highest_line,
   1784 			     CPP_BUF_COL (buffer),
   1785 			     "invalid UTF-8 character <%x><%x><%x><%x>",
   1786 			     cur[0], cur[1], cur[2], cur[3]);
   1787       else
   1788 	cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
   1789 			       pfile->line_table->highest_line,
   1790 			       CPP_BUF_COL (buffer),
   1791 			       "invalid UTF-8 character <%x><%x><%x><%x>",
   1792 			       cur[0], cur[1], cur[2], cur[3]);
   1793       return cur + 4;
   1794     }
   1795 }
   1796 
   1797 /* Helper function of *skip_*_comment and lex*_string.  For C,
   1798    character at CUR[-1] with MSB set handle -Wbidi-chars* and
   1799    -Winvalid-utf8 diagnostics and return pointer to first character
   1800    that should be processed next.  */
   1801 
   1802 static inline const uchar *
   1803 _cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
   1804 			    const uchar *cur, bool warn_bidi_p,
   1805 			    bool warn_invalid_utf8_p)
   1806 {
   1807   /* If this is a beginning of a UTF-8 encoding, it might be
   1808      a bidirectional control character.  */
   1809   if (c == bidi::utf8_start && warn_bidi_p)
   1810     {
   1811       location_t loc;
   1812       bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
   1813       maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   1814     }
   1815   if (!warn_invalid_utf8_p)
   1816     return cur;
   1817   if (c >= utf8_signifier)
   1818     {
   1819       cppchar_t s;
   1820       const uchar *pstr = cur - 1;
   1821       if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
   1822 	  && s <= UCS_LIMIT)
   1823 	return pstr;
   1824     }
   1825   pfile->buffer->cur = cur - 1;
   1826   return _cpp_warn_invalid_utf8 (pfile);
   1827 }
   1828 
   1829 /* Skip a C-style block comment.  We find the end of the comment by
   1830    seeing if an asterisk is before every '/' we encounter.  Returns
   1831    nonzero if comment terminated by EOF, zero otherwise.
   1832 
   1833    Buffer->cur points to the initial asterisk of the comment.  */
   1834 bool
   1835 _cpp_skip_block_comment (cpp_reader *pfile)
   1836 {
   1837   cpp_buffer *buffer = pfile->buffer;
   1838   const uchar *cur = buffer->cur;
   1839   uchar c;
   1840   const bool warn_bidi_p = pfile->warn_bidi_p ();
   1841   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
   1842   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
   1843 
   1844   cur++;
   1845   if (*cur == '/')
   1846     cur++;
   1847 
   1848   for (;;)
   1849     {
   1850       /* People like decorating comments with '*', so check for '/'
   1851 	 instead for efficiency.  */
   1852       c = *cur++;
   1853 
   1854       if (c == '/')
   1855 	{
   1856 	  if (cur[-2] == '*')
   1857 	    {
   1858 	      if (warn_bidi_p)
   1859 		maybe_warn_bidi_on_close (pfile, cur);
   1860 	      break;
   1861 	    }
   1862 
   1863 	  /* Warn about potential nested comments, but not if the '/'
   1864 	     comes immediately before the true comment delimiter.
   1865 	     Don't bother to get it right across escaped newlines.  */
   1866 	  if (CPP_OPTION (pfile, warn_comments)
   1867 	      && cur[0] == '*' && cur[1] != '/')
   1868 	    {
   1869 	      buffer->cur = cur;
   1870 	      cpp_warning_with_line (pfile, CPP_W_COMMENTS,
   1871 				     pfile->line_table->highest_line,
   1872 				     CPP_BUF_COL (buffer),
   1873 				     "\"/*\" within comment");
   1874 	    }
   1875 	}
   1876       else if (c == '\n')
   1877 	{
   1878 	  unsigned int cols;
   1879 	  buffer->cur = cur - 1;
   1880 	  if (warn_bidi_p)
   1881 	    maybe_warn_bidi_on_close (pfile, cur);
   1882 	  _cpp_process_line_notes (pfile, true);
   1883 	  if (buffer->next_line >= buffer->rlimit)
   1884 	    return true;
   1885 	  _cpp_clean_line (pfile);
   1886 
   1887 	  cols = buffer->next_line - buffer->line_base;
   1888 	  CPP_INCREMENT_LINE (pfile, cols);
   1889 
   1890 	  cur = buffer->cur;
   1891 	}
   1892       else if (__builtin_expect (c >= utf8_continuation, 0)
   1893 	       && warn_bidi_or_invalid_utf8_p)
   1894 	cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
   1895 					  warn_invalid_utf8_p);
   1896     }
   1897 
   1898   buffer->cur = cur;
   1899   _cpp_process_line_notes (pfile, true);
   1900   return false;
   1901 }
   1902 
   1903 /* Skip a C++ line comment, leaving buffer->cur pointing to the
   1904    terminating newline.  Handles escaped newlines.  Returns nonzero
   1905    if a multiline comment.  */
   1906 static int
   1907 skip_line_comment (cpp_reader *pfile)
   1908 {
   1909   cpp_buffer *buffer = pfile->buffer;
   1910   location_t orig_line = pfile->line_table->highest_line;
   1911   const bool warn_bidi_p = pfile->warn_bidi_p ();
   1912   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
   1913   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
   1914 
   1915   if (!warn_bidi_or_invalid_utf8_p)
   1916     while (*buffer->cur != '\n')
   1917       buffer->cur++;
   1918   else if (!warn_invalid_utf8_p)
   1919     {
   1920       while (*buffer->cur != '\n'
   1921 	     && *buffer->cur != bidi::utf8_start)
   1922 	buffer->cur++;
   1923       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
   1924 	{
   1925 	  while (*buffer->cur != '\n')
   1926 	    {
   1927 	      if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
   1928 		{
   1929 		  location_t loc;
   1930 		  bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
   1931 		  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   1932 		}
   1933 	      buffer->cur++;
   1934 	    }
   1935 	  maybe_warn_bidi_on_close (pfile, buffer->cur);
   1936 	}
   1937     }
   1938   else
   1939     {
   1940       while (*buffer->cur != '\n')
   1941 	{
   1942 	  if (*buffer->cur < utf8_continuation)
   1943 	    {
   1944 	      buffer->cur++;
   1945 	      continue;
   1946 	    }
   1947 	  buffer->cur
   1948 	    = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
   1949 					  warn_bidi_p, warn_invalid_utf8_p);
   1950 	}
   1951       if (warn_bidi_p)
   1952 	maybe_warn_bidi_on_close (pfile, buffer->cur);
   1953     }
   1954 
   1955   _cpp_process_line_notes (pfile, true);
   1956   return orig_line != pfile->line_table->highest_line;
   1957 }
   1958 
   1959 /* Skips whitespace, saving the next non-whitespace character.  */
   1960 static void
   1961 skip_whitespace (cpp_reader *pfile, cppchar_t c)
   1962 {
   1963   cpp_buffer *buffer = pfile->buffer;
   1964   bool saw_NUL = false;
   1965 
   1966   do
   1967     {
   1968       /* Horizontal space always OK.  */
   1969       if (c == ' ' || c == '\t')
   1970 	;
   1971       /* Just \f \v or \0 left.  */
   1972       else if (c == '\0')
   1973 	saw_NUL = true;
   1974       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
   1975 	cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
   1976 			     CPP_BUF_COL (buffer),
   1977 			     "%s in preprocessing directive",
   1978 			     c == '\f' ? "form feed" : "vertical tab");
   1979 
   1980       c = *buffer->cur++;
   1981     }
   1982   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
   1983   while (is_nvspace (c));
   1984 
   1985   if (saw_NUL)
   1986     {
   1987       encoding_rich_location rich_loc (pfile);
   1988       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
   1989 		    "null character(s) ignored");
   1990     }
   1991 
   1992   buffer->cur--;
   1993 }
   1994 
   1995 /* See if the characters of a number token are valid in a name (no
   1996    '.', '+' or '-').  */
   1997 static int
   1998 name_p (cpp_reader *pfile, const cpp_string *string)
   1999 {
   2000   unsigned int i;
   2001 
   2002   for (i = 0; i < string->len; i++)
   2003     if (!is_idchar (string->text[i]))
   2004       return 0;
   2005 
   2006   return 1;
   2007 }
   2008 
   2009 /* After parsing an identifier or other sequence, produce a warning about
   2010    sequences not in NFC/NFKC.  */
   2011 static void
   2012 warn_about_normalization (cpp_reader *pfile,
   2013 			  const cpp_token *token,
   2014 			  const struct normalize_state *s,
   2015 			  bool identifier)
   2016 {
   2017   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
   2018       && !pfile->state.skipping)
   2019     {
   2020       location_t loc = token->src_loc;
   2021 
   2022       /* If possible, create a location range for the token.  */
   2023       if (loc >= RESERVED_LOCATION_COUNT
   2024 	  && token->type != CPP_EOF
   2025 	  /* There must be no line notes to process.  */
   2026 	  && (!(pfile->buffer->cur
   2027 		>= pfile->buffer->notes[pfile->buffer->cur_note].pos
   2028 		&& !pfile->overlaid_buffer)))
   2029 	{
   2030 	  source_range tok_range;
   2031 	  tok_range.m_start = loc;
   2032 	  tok_range.m_finish
   2033 	    = linemap_position_for_column (pfile->line_table,
   2034 					   CPP_BUF_COLUMN (pfile->buffer,
   2035 							   pfile->buffer->cur));
   2036 	  loc = pfile->line_table->get_or_create_combined_loc (loc, tok_range,
   2037 							       nullptr, 0);
   2038 	}
   2039 
   2040       encoding_rich_location rich_loc (pfile, loc);
   2041 
   2042       /* Make sure that the token is printed using UCNs, even
   2043 	 if we'd otherwise happily print UTF-8.  */
   2044       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
   2045       size_t sz;
   2046 
   2047       sz = cpp_spell_token (pfile, token, buf, false) - buf;
   2048       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
   2049 	cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
   2050 			"`%.*s' is not in NFKC", (int) sz, buf);
   2051       else if (identifier && CPP_OPTION (pfile, xid_identifiers))
   2052 	cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
   2053 				  "`%.*s' is not in NFC", (int) sz, buf);
   2054       else
   2055 	cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
   2056 			"`%.*s' is not in NFC", (int) sz, buf);
   2057       free (buf);
   2058     }
   2059 }
   2060 
   2061 /* Returns TRUE if the byte sequence starting at buffer->cur is a valid
   2062    extended character in an identifier.  If FIRST is TRUE, then the character
   2063    must be valid at the beginning of an identifier as well.  If the return
   2064    value is TRUE, then pfile->buffer->cur has been moved to point to the next
   2065    byte after the extended character.  */
   2066 
   2067 static bool
   2068 forms_identifier_p (cpp_reader *pfile, int first,
   2069 		    struct normalize_state *state)
   2070 {
   2071   cpp_buffer *buffer = pfile->buffer;
   2072   const bool warn_bidi_p = pfile->warn_bidi_p ();
   2073 
   2074   if (*buffer->cur == '$')
   2075     {
   2076       if (!CPP_OPTION (pfile, dollars_in_ident))
   2077 	return false;
   2078 
   2079       buffer->cur++;
   2080       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
   2081 	{
   2082 	  CPP_OPTION (pfile, warn_dollars) = 0;
   2083 	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
   2084 	}
   2085 
   2086       return true;
   2087     }
   2088 
   2089   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
   2090   if (CPP_OPTION (pfile, extended_identifiers))
   2091     {
   2092       cppchar_t s;
   2093       if (*buffer->cur >= utf8_signifier)
   2094 	{
   2095 	  if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
   2096 	      && warn_bidi_p)
   2097 	    {
   2098 	      location_t loc;
   2099 	      bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
   2100 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   2101 	    }
   2102 	  if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
   2103 			       state, &s))
   2104 	    return true;
   2105 	}
   2106       else if (*buffer->cur == '\\'
   2107 	       && (buffer->cur[1] == 'u'
   2108 		   || buffer->cur[1] == 'U'
   2109 		   || buffer->cur[1] == 'N'))
   2110 	{
   2111 	  buffer->cur += 2;
   2112 	  if (warn_bidi_p)
   2113 	    {
   2114 	      location_t loc;
   2115 	      bidi::kind kind;
   2116 	      if (buffer->cur[-1] == 'N')
   2117 		kind = get_bidi_named (pfile, buffer->cur, &loc);
   2118 	      else
   2119 		kind = get_bidi_ucn (pfile, buffer->cur,
   2120 				     buffer->cur[-1] == 'U', &loc);
   2121 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
   2122 	    }
   2123 	  if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
   2124 			      state, &s, NULL, NULL))
   2125 	    return true;
   2126 	  buffer->cur -= 2;
   2127 	}
   2128     }
   2129 
   2130   return false;
   2131 }
   2132 
   2133 /* Helper function to issue error about improper __VA_OPT__ use.  */
   2134 static void
   2135 maybe_va_opt_error (cpp_reader *pfile)
   2136 {
   2137   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
   2138     {
   2139       /* __VA_OPT__ should not be accepted at all, but allow it in
   2140 	 system headers.  */
   2141       if (!_cpp_in_system_header (pfile))
   2142 	{
   2143 	  if (CPP_OPTION (pfile, cplusplus))
   2144 	    cpp_error (pfile, CPP_DL_PEDWARN,
   2145 		       "__VA_OPT__ is not available until C++20");
   2146 	  else
   2147 	    cpp_error (pfile, CPP_DL_PEDWARN,
   2148 		       "__VA_OPT__ is not available until C23");
   2149 	}
   2150     }
   2151   else if (!pfile->state.va_args_ok)
   2152     {
   2153       /* __VA_OPT__ should only appear in the replacement list of a
   2154 	 variadic macro.  */
   2155       cpp_error (pfile, CPP_DL_PEDWARN,
   2156 		 "__VA_OPT__ can only appear in the expansion"
   2157 		 " of a C++20 variadic macro");
   2158     }
   2159 }
   2160 
   2161 /* Helper function to perform diagnostics that are needed (rarely)
   2162    when an identifier is lexed.  */
   2163 static void
   2164 identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
   2165 {
   2166   if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
   2167 			|| pfile->state.skipping, 1))
   2168     return;
   2169 
   2170   /* It is allowed to poison the same identifier twice.  */
   2171   if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
   2172     {
   2173       cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
   2174 		 NODE_NAME (node));
   2175       const auto data = (cpp_hashnode_extra *)
   2176 	ht_lookup (pfile->extra_hash_table, node->ident, HT_NO_INSERT);
   2177       if (data && data->poisoned_loc)
   2178 	cpp_error_at (pfile, CPP_DL_NOTE, data->poisoned_loc, "poisoned here");
   2179     }
   2180 
   2181   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
   2182      replacement list of a variadic macro.  */
   2183   if (node == pfile->spec_nodes.n__VA_ARGS__
   2184       && !pfile->state.va_args_ok)
   2185     {
   2186       if (CPP_OPTION (pfile, cplusplus))
   2187 	cpp_error (pfile, CPP_DL_PEDWARN,
   2188 		   "__VA_ARGS__ can only appear in the expansion"
   2189 		   " of a C++11 variadic macro");
   2190       else
   2191 	cpp_error (pfile, CPP_DL_PEDWARN,
   2192 		   "__VA_ARGS__ can only appear in the expansion"
   2193 		   " of a C99 variadic macro");
   2194     }
   2195 
   2196   /* __VA_OPT__ should only appear in the replacement list of a
   2197      variadic macro.  */
   2198   if (node == pfile->spec_nodes.n__VA_OPT__)
   2199     maybe_va_opt_error (pfile);
   2200 
   2201   /* For -Wc++-compat, warn about use of C++ named operators.  */
   2202   if (node->flags & NODE_WARN_OPERATOR)
   2203     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
   2204 		 "identifier \"%s\" is a special operator name in C++",
   2205 		 NODE_NAME (node));
   2206 }
   2207 
   2208 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
   2209 static cpp_hashnode *
   2210 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
   2211 {
   2212   cpp_hashnode *result;
   2213   const uchar *cur;
   2214   unsigned int len;
   2215   unsigned int hash = HT_HASHSTEP (0, *base);
   2216 
   2217   cur = base + 1;
   2218   while (ISIDNUM (*cur))
   2219     {
   2220       hash = HT_HASHSTEP (hash, *cur);
   2221       cur++;
   2222     }
   2223   len = cur - base;
   2224   hash = HT_HASHFINISH (hash, len);
   2225   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
   2226 					      base, len, hash, HT_ALLOC));
   2227   identifier_diagnostics_on_lex (pfile, result);
   2228   return result;
   2229 }
   2230 
   2231 /* Get the cpp_hashnode of an identifier specified by NAME in
   2232    the current cpp_reader object.  If none is found, NULL is returned.  */
   2233 cpp_hashnode *
   2234 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
   2235 {
   2236   cpp_hashnode *result;
   2237   result = lex_identifier_intern (pfile, (uchar *) name);
   2238   return result;
   2239 }
   2240 
   2241 /* Lex an identifier starting at BASE.  BUFFER->CUR is expected to point
   2242    one past the first character at BASE, which may be a (possibly multi-byte)
   2243    character if STARTS_UCN is true.  */
   2244 static cpp_hashnode *
   2245 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
   2246 		struct normalize_state *nst, cpp_hashnode **spelling)
   2247 {
   2248   cpp_hashnode *result;
   2249   const uchar *cur;
   2250   unsigned int len;
   2251   unsigned int hash = HT_HASHSTEP (0, *base);
   2252   const bool warn_bidi_p = pfile->warn_bidi_p ();
   2253 
   2254   cur = pfile->buffer->cur;
   2255   if (! starts_ucn)
   2256     {
   2257       while (ISIDNUM (*cur))
   2258 	{
   2259 	  hash = HT_HASHSTEP (hash, *cur);
   2260 	  cur++;
   2261 	}
   2262       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
   2263     }
   2264   pfile->buffer->cur = cur;
   2265   if (starts_ucn || forms_identifier_p (pfile, false, nst))
   2266     {
   2267       /* Slower version for identifiers containing UCNs
   2268 	 or extended chars (including $).  */
   2269       do {
   2270 	while (ISIDNUM (*pfile->buffer->cur))
   2271 	  {
   2272 	    NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
   2273 	    pfile->buffer->cur++;
   2274 	  }
   2275       } while (forms_identifier_p (pfile, false, nst));
   2276       if (warn_bidi_p)
   2277 	maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
   2278       result = _cpp_interpret_identifier (pfile, base,
   2279 					  pfile->buffer->cur - base);
   2280       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
   2281     }
   2282   else
   2283     {
   2284       len = cur - base;
   2285       hash = HT_HASHFINISH (hash, len);
   2286 
   2287       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
   2288 						  base, len, hash, HT_ALLOC));
   2289       *spelling = result;
   2290     }
   2291 
   2292   return result;
   2293 }
   2294 
   2295 /* Struct to hold the return value of the scan_cur_identifier () helper
   2296    function below.  */
   2297 
   2298 struct scan_id_result
   2299 {
   2300   cpp_hashnode *node;
   2301   normalize_state nst;
   2302 
   2303   scan_id_result ()
   2304     : node (nullptr)
   2305   {
   2306     nst = INITIAL_NORMALIZE_STATE;
   2307   }
   2308 
   2309   explicit operator bool () const { return node; }
   2310 };
   2311 
   2312 /* Helper function to scan an entire identifier beginning at
   2313    pfile->buffer->cur, and possibly containing extended characters (UCNs
   2314    and/or UTF-8).  Returns the cpp_hashnode for the identifier on success, or
   2315    else nullptr, as well as a normalize_state so that normalization warnings
   2316    may be issued once the token lexing is complete.  */
   2317 
   2318 static scan_id_result
   2319 scan_cur_identifier (cpp_reader *pfile)
   2320 {
   2321   const auto buffer = pfile->buffer;
   2322   const auto begin = buffer->cur;
   2323   scan_id_result result;
   2324   if (ISIDST (*buffer->cur))
   2325     {
   2326       ++buffer->cur;
   2327       cpp_hashnode *ignore;
   2328       result.node = lex_identifier (pfile, begin, false, &result.nst, &ignore);
   2329     }
   2330   else if (forms_identifier_p (pfile, true, &result.nst))
   2331     {
   2332       /* buffer->cur has been moved already by the call
   2333 	 to forms_identifier_p.  */
   2334       cpp_hashnode *ignore;
   2335       result.node = lex_identifier (pfile, begin, true, &result.nst, &ignore);
   2336     }
   2337   return result;
   2338 }
   2339 
   2340 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
   2341 static void
   2342 lex_number (cpp_reader *pfile, cpp_string *number,
   2343 	    struct normalize_state *nst)
   2344 {
   2345   const uchar *cur;
   2346   const uchar *base;
   2347   uchar *dest;
   2348 
   2349   base = pfile->buffer->cur - 1;
   2350   do
   2351     {
   2352       const uchar *adj_digit_sep = NULL;
   2353       cur = pfile->buffer->cur;
   2354 
   2355       /* N.B. ISIDNUM does not include $.  */
   2356       while (ISIDNUM (*cur)
   2357 	     || (*cur == '.' && !DIGIT_SEP (cur[-1]))
   2358 	     || DIGIT_SEP (*cur)
   2359 	     || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
   2360 	{
   2361 	  NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
   2362 	  /* Adjacent digit separators do not form part of the pp-number syntax.
   2363 	     However, they can safely be diagnosed here as an error, since '' is
   2364 	     not a valid preprocessing token.  */
   2365 	  if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
   2366 	    adj_digit_sep = cur;
   2367 	  cur++;
   2368 	}
   2369       /* A number can't end with a digit separator.  */
   2370       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
   2371 	--cur;
   2372       if (adj_digit_sep && adj_digit_sep < cur)
   2373 	cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
   2374 
   2375       pfile->buffer->cur = cur;
   2376     }
   2377   while (forms_identifier_p (pfile, false, nst));
   2378 
   2379   number->len = cur - base;
   2380   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
   2381   memcpy (dest, base, number->len);
   2382   dest[number->len] = '\0';
   2383   number->text = dest;
   2384 }
   2385 
   2386 /* Create a token of type TYPE with a literal spelling.  */
   2387 static void
   2388 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
   2389 		unsigned int len, enum cpp_ttype type)
   2390 {
   2391   token->type = type;
   2392   token->val.str.len = len;
   2393   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
   2394 }
   2395 
   2396 /* Like create_literal(), but construct it from two separate strings
   2397    which are concatenated.  LEN2 may be 0 if no second string is
   2398    required.  */
   2399 static void
   2400 create_literal2 (cpp_reader *pfile, cpp_token *token, const uchar *base1,
   2401 		 unsigned int len1, const uchar *base2, unsigned int len2,
   2402 		 enum cpp_ttype type)
   2403 {
   2404   token->type = type;
   2405   token->val.str.len = len1 + len2;
   2406   uchar *const dest = _cpp_unaligned_alloc (pfile, len1 + len2 + 1);
   2407   memcpy (dest, base1, len1);
   2408   if (len2)
   2409     memcpy (dest+len1, base2, len2);
   2410   dest[len1 + len2] = 0;
   2411   token->val.str.text = dest;
   2412 }
   2413 
   2414 const uchar *
   2415 cpp_alloc_token_string (cpp_reader *pfile,
   2416 			const unsigned char *ptr, unsigned len)
   2417 {
   2418   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
   2419 
   2420   dest[len] = 0;
   2421   memcpy (dest, ptr, len);
   2422   return dest;
   2423 }
   2424 
   2425 /* A pair of raw buffer pointers.  The currently open one is [1], the
   2426    first one is [0].  Used for string literal lexing.  */
   2427 struct lit_accum {
   2428   _cpp_buff *first;
   2429   _cpp_buff *last;
   2430   const uchar *rpos;
   2431   size_t accum;
   2432 
   2433   lit_accum ()
   2434     : first (NULL), last (NULL), rpos (0), accum (0)
   2435   {
   2436   }
   2437 
   2438   void append (cpp_reader *, const uchar *, size_t);
   2439 
   2440   void read_begin (cpp_reader *);
   2441   bool reading_p () const
   2442   {
   2443     return rpos != NULL;
   2444   }
   2445   char read_char ()
   2446   {
   2447     char c = *rpos++;
   2448     if (rpos == BUFF_FRONT (last))
   2449       rpos = NULL;
   2450     return c;
   2451   }
   2452 
   2453   void create_literal2 (cpp_reader *pfile, cpp_token *token,
   2454 			const uchar *base1, unsigned int len1,
   2455 			const uchar *base2, unsigned int len2,
   2456 			enum cpp_ttype type);
   2457 };
   2458 
   2459 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
   2460    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
   2461 
   2462 void
   2463 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
   2464 {
   2465   if (!last)
   2466     /* Starting.  */
   2467     first = last = _cpp_get_buff (pfile, len);
   2468   else if (len > BUFF_ROOM (last))
   2469     {
   2470       /* There is insufficient room in the buffer.  Copy what we can,
   2471 	 and then either extend or create a new one.  */
   2472       size_t room = BUFF_ROOM (last);
   2473       memcpy (BUFF_FRONT (last), base, room);
   2474       BUFF_FRONT (last) += room;
   2475       base += room;
   2476       len -= room;
   2477       accum += room;
   2478 
   2479       gcc_checking_assert (!rpos);
   2480 
   2481       last = _cpp_append_extend_buff (pfile, last, len);
   2482     }
   2483 
   2484   memcpy (BUFF_FRONT (last), base, len);
   2485   BUFF_FRONT (last) += len;
   2486   accum += len;
   2487 }
   2488 
   2489 void
   2490 lit_accum::read_begin (cpp_reader *pfile)
   2491 {
   2492   /* We never accumulate more than 4 chars to read.  */
   2493   if (BUFF_ROOM (last) < 4)
   2494 
   2495     last = _cpp_append_extend_buff (pfile, last, 4);
   2496   rpos = BUFF_FRONT (last);
   2497 }
   2498 
   2499 /* Helper function to check if a string format macro, say from inttypes.h, is
   2500    placed touching a string literal, in which case it could be parsed as a C++11
   2501    user-defined string literal thus breaking the program.  Return TRUE if the
   2502    UDL should be ignored for now and preserved for potential macro
   2503    expansion.  */
   2504 
   2505 static bool
   2506 maybe_ignore_udl_macro_suffix (cpp_reader *pfile, location_t src_loc,
   2507 			       const uchar *suffix_begin, cpp_hashnode *node)
   2508 {
   2509   /* User-defined literals outside of namespace std must start with a single
   2510      underscore, so assume anything of that form really is a UDL suffix.
   2511      We don't need to worry about UDLs defined inside namespace std because
   2512      their names are reserved, so cannot be used as macro names in valid
   2513      programs.  */
   2514   if ((suffix_begin[0] == '_' && suffix_begin[1] != '_')
   2515       || !cpp_macro_p (node))
   2516     return false;
   2517 
   2518   /* Maybe raise a warning here; caller should arrange not to consume
   2519      the tokens.  */
   2520   if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
   2521     cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, src_loc, 0,
   2522 			   "invalid suffix on literal; C++11 requires a space "
   2523 			   "between literal and string macro");
   2524   return true;
   2525 }
   2526 
   2527 /* Like create_literal2(), but also prepend all the accumulated data from
   2528    the lit_accum struct.  */
   2529 void
   2530 lit_accum::create_literal2 (cpp_reader *pfile, cpp_token *token,
   2531 			    const uchar *base1, unsigned int len1,
   2532 			    const uchar *base2, unsigned int len2,
   2533 			    enum cpp_ttype type)
   2534 {
   2535   const unsigned int tot_len = accum + len1 + len2;
   2536   uchar *dest = _cpp_unaligned_alloc (pfile, tot_len + 1);
   2537   token->type = type;
   2538   token->val.str.len = tot_len;
   2539   token->val.str.text = dest;
   2540   for (_cpp_buff *buf = first; buf; buf = buf->next)
   2541     {
   2542       size_t len = BUFF_FRONT (buf) - buf->base;
   2543       memcpy (dest, buf->base, len);
   2544       dest += len;
   2545     }
   2546   memcpy (dest, base1, len1);
   2547   dest += len1;
   2548   if (len2)
   2549     memcpy (dest, base2, len2);
   2550   dest += len2;
   2551   *dest = '\0';
   2552 }
   2553 
   2554 /* Lexes a raw string.  The stored string contains the spelling,
   2555    including double quotes, delimiter string, '(' and ')', any leading
   2556    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
   2557    the type of the literal, or CPP_OTHER if it was not properly
   2558    terminated.
   2559 
   2560    BASE is the start of the token.  Updates pfile->buffer->cur to just
   2561    after the lexed string.
   2562 
           A raw string may extend over several source lines, in which case
           further lines are fetched here.  If the input runs out first, an
           "unterminated raw string" error is issued; TOKEN then becomes
           CPP_OTHER (inside a directive, macro arguments or a deferred
           pragma) or CPP_EOF (otherwise).  When the user_literals option
           is set, an identifier directly after the closing quote is
           normally taken as a user-defined-literal suffix and made part of
           the token.

   2563    The spelling is NUL-terminated, but it is not guaranteed that this
   2564    is the first NUL since embedded NULs are preserved.  */
   2565 
   2566 static void
   2567 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
   2568 {
   2569   const uchar *pos = base;
   2570   const bool warn_bidi_p = pfile->warn_bidi_p ();
   2571   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
   2572   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
   2573 
   2574   /* 'tis a pity this information isn't passed down from the lexer's
   2575      initial categorization of the token.  */
   2576   enum cpp_ttype type = CPP_STRING;
   2577 
          /* Step over the encoding prefix, if any, and select the matching
             string type.  */
   2578   if (*pos == 'L')
   2579     {
   2580       type = CPP_WSTRING;
   2581       pos++;
   2582     }
   2583   else if (*pos == 'U')
   2584     {
   2585       type = CPP_STRING32;
   2586       pos++;
   2587     }
   2588   else if (*pos == 'u')
   2589     {
   2590       if (pos[1] == '8')
   2591 	{
   2592 	  type = CPP_UTF8STRING;
   2593 	  pos++;
   2594 	}
   2595       else
   2596 	type = CPP_STRING16;
   2597       pos++;
   2598     }
   2599 
   2600   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
   2601   pos += 2;
   2602 
   2603   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
   2604 
   2605   /* Skip notes before the ".  */
   2606   while (note->pos < pos)
   2607     ++note;
   2608 
   2609   lit_accum accum;
   2610 
          /* PREFIX collects the delimiter (at most 16 characters) and then a
             closing '"', i.e. the text that must follow a ')' to end the
             literal.  */
   2611   uchar prefix[17];
   2612   unsigned prefix_len = 0;
          /* PHASE_PREFIX: still reading the delimiter.  PHASE_NONE: inside
             the string body.  Values from PHASE_SUFFIX upwards index into
             PREFIX while the text after a ')' is compared against it.  */
   2613   enum Phase
   2614   {
   2615    PHASE_PREFIX = -2,
   2616    PHASE_NONE = -1,
   2617    PHASE_SUFFIX = 0
   2618   } phase = PHASE_PREFIX;
   2619 
   2620   for (;;)
   2621     {
   2622       gcc_checking_assert (note->pos >= pos);
   2623 
   2624       /* Undo any escaped newlines and trigraphs.  */
   2625       if (!accum.reading_p () && note->pos == pos)
   2626 	switch (note->type)
   2627 	  {
   2628 	  case '\\':
   2629 	  case ' ':
   2630 	    /* Restore backslash followed by newline.  */
   2631 	    accum.append (pfile, base, pos - base);
   2632 	    base = pos;
   2633 	    accum.read_begin (pfile);
   2634 	    accum.append (pfile, UC"\\", 1);
   2635 
   2636 	  after_backslash:
   2637 	    if (note->type == ' ')
   2638 	      /* GNU backslash whitespace newline extension.  FIXME
   2639 		 could be any sequence of non-vertical space.  When we
   2640 		 can properly restore any such sequence, we should
   2641 		 mark this note as handled so _cpp_process_line_notes
   2642 		 doesn't warn.  */
   2643 	      accum.append (pfile, UC" ", 1);
   2644 
   2645 	    accum.append (pfile, UC"\n", 1);
   2646 	    note++;
   2647 	    break;
   2648 
   2649 	  case '\n':
   2650 	    /* This can happen for ??/<NEWLINE> when trigraphs are not
   2651 	       being interpreted.  */
   2652 	    gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
   2653 	    note->type = 0;
   2654 	    note++;
   2655 	    break;
   2656 
   2657 	  default:
   2658 	    gcc_checking_assert (_cpp_trigraph_map[note->type]);
   2659 
   2660 	    /* Don't warn about this trigraph in
   2661 	       _cpp_process_line_notes, since trigraphs show up as
   2662 	       trigraphs in raw strings.  */
   2663 	    uchar type = note->type;
   2664 	    note->type = 0;
   2665 
   2666 	    if (CPP_OPTION (pfile, trigraphs))
   2667 	      {
   2668 		accum.append (pfile, base, pos - base);
   2669 		base = pos;
   2670 		accum.read_begin (pfile);
   2671 		accum.append (pfile, UC"??", 2);
   2672 		accum.append (pfile, &type, 1);
   2673 
   2674 		/* ??/ followed by newline gets two line notes, one for
   2675 		   the trigraph and one for the backslash/newline.  */
   2676 		if (type == '/' && note[1].pos == pos)
   2677 		  {
   2678 		    note++;
   2679 		    gcc_assert (note->type == '\\' || note->type == ' ');
   2680 		    goto after_backslash;
   2681 		  }
   2682 		/* Skip the replacement character.  */
   2683 		base = ++pos;
   2684 	      }
   2685 
   2686 	    note++;
   2687 	    break;
   2688 	  }
   2689 
   2690       /* Now get a char to process.  Either from an expanded note, or
   2691 	 from the line buffer.  */
   2692       bool read_note = accum.reading_p ();
   2693       char c = read_note ? accum.read_char () : *pos++;
   2694 
   2695       if (phase == PHASE_PREFIX)
   2696 	{
   2697 	  if (c == '(')
   2698 	    {
   2699 	      /* Done.  */
   2700 	      phase = PHASE_NONE;
   2701 	      prefix[prefix_len++] = '"';
   2702 	    }
   2703 	  else if (prefix_len < 16
   2704 		   /* Prefix chars are any of the basic character set,
   2705 		      [lex.charset] except for '
   2706 		      ()\\\t\v\f\n'. Optimized for a contiguous
   2707 		      alphabet.  */
   2708 		   /* Unlike a switch, this collapses down to one or
   2709 		      two shift and bitmask operations on an ASCII
   2710 		      system, with an outlier or two.   */
   2711 		   && (('Z' - 'A' == 25
   2712 			? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
   2713 			: ISIDST (c))
   2714 		       || (c >= '0' && c <= '9')
   2715 		       || c == '_' || c == '{' || c == '}'
   2716 		       || c == '[' || c == ']' || c == '#'
   2717 		       || c == '<' || c == '>' || c == '%'
   2718 		       || c == ':' || c == ';' || c == '.' || c == '?'
   2719 		       || c == '*' || c == '+' || c == '-' || c == '/'
   2720 		       || c == '^' || c == '&' || c == '|' || c == '~'
   2721 		       || c == '!' || c == '=' || c == ','
   2722 		       || c == '"' || c == '\''))
   2723 	    prefix[prefix_len++] = c;
   2724 	  else
   2725 	    {
   2726 	      /* Something is wrong.  */
   2727 	      int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
   2728 	      if (prefix_len == 16)
   2729 		cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
   2730 				     col, "raw string delimiter longer "
   2731 				     "than 16 characters");
   2732 	      else if (c == '\n')
   2733 		cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
   2734 				     col, "invalid new-line in raw "
   2735 				     "string delimiter");
   2736 	      else
   2737 		cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
   2738 				     col, "invalid character '%c' in "
   2739 				     "raw string delimiter", c);
   2740 	      type = CPP_OTHER;
   2741 	      phase = PHASE_NONE;
   2742 	      /* Continue until we get a close quote, that's probably
   2743 		 the best failure mode.  */
   2744 	      prefix_len = 0;
   2745 	    }
        	  /* A newline is not swallowed here; it drops through to the
        	     end-of-line handling further down.  */
   2746 	  if (c != '\n')
   2747 	    continue;
   2748 	}
   2749 
              /* PHASE can no longer be PHASE_PREFIX here, so a value other
        	 than PHASE_NONE means we are part-way through the closing
        	 sequence: compare C with its next expected character.  */
   2750       if (phase != PHASE_NONE)
   2751 	{
   2752 	  if (prefix[phase] != c)
   2753 	    phase = PHASE_NONE;
   2754 	  else if (unsigned (phase + 1) == prefix_len)
        	    /* The whole closing sequence matched: end of the literal.  */
   2755 	    break;
   2756 	  else
   2757 	    {
   2758 	      phase = Phase (phase + 1);
   2759 	      continue;
   2760 	    }
   2761 	}
   2762 
   2763       if (!prefix_len && c == '"')
   2764 	/* Failure mode lexing.  */
   2765 	goto out;
   2766       else if (prefix_len && c == ')')
        	/* A ')' may begin the closing sequence.  */
   2767 	phase = PHASE_SUFFIX;
   2768       else if (!read_note && c == '\n')
   2769 	{
        	  /* End of the current line: step back onto the newline.  */
   2770 	  pos--;
   2771 	  pfile->buffer->cur = pos;
   2772 	  if ((pfile->state.in_directive || pfile->state.parsing_args
   2773 	       || pfile->state.in_deferred_pragma)
   2774 	      && pfile->buffer->next_line >= pfile->buffer->rlimit)
   2775 	    {
   2776 	      cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
   2777 				   "unterminated raw string");
   2778 	      type = CPP_OTHER;
   2779 	      goto out;
   2780 	    }
   2781 
        	  /* Keep what was read so far, newline included (hence the
        	     + 1), before moving on to the next line.  */
   2782 	  accum.append (pfile, base, pos - base + 1);
   2783 	  _cpp_process_line_notes (pfile, false);
   2784 
   2785 	  if (pfile->buffer->next_line < pfile->buffer->rlimit)
   2786 	    CPP_INCREMENT_LINE (pfile, 0);
   2787 	  pfile->buffer->need_line = true;
   2788 
   2789 	  if (!get_fresh_line_impl<true> (pfile))
   2790 	    {
   2791 	      /* We ran out of file and failed to get a line.  */
   2792 	      location_t src_loc = token->src_loc;
   2793 	      token->type = CPP_EOF;
   2794 	      /* Tell the compiler the line number of the EOF token.  */
   2795 	      token->src_loc = pfile->line_table->highest_line;
   2796 	      token->flags = BOL;
   2797 	      if (accum.first)
   2798 		_cpp_release_buff (pfile, accum.first);
   2799 	      cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
   2800 				   "unterminated raw string");
   2801 
   2802 	      /* Now pop the buffer that get_fresh_line_impl() did not.  Popping
   2803 		 is not safe if processing a directive, however this cannot
   2804 		 happen as we already checked above that a line would be
   2805 		 available, and get_fresh_line_impl() can't fail in this
   2806 		 case.  */
   2807 	      gcc_assert (!pfile->state.in_directive);
   2808 	      _cpp_pop_buffer (pfile);
   2809 
   2810 	      return;
   2811 	    }
   2812 
        	  /* Continue scanning at the start of the new line.  */
   2813 	  pos = base = pfile->buffer->cur;
   2814 	  note = &pfile->buffer->notes[pfile->buffer->cur_note];
   2815 	}
   2816       else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
   2817 	       && warn_bidi_or_invalid_utf8_p)
   2818 	pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
   2819 					  warn_invalid_utf8_p);
   2820     }
   2821 
   2822   if (warn_bidi_p)
   2823     maybe_warn_bidi_on_close (pfile, pos);
   2824 
   2825   if (CPP_OPTION (pfile, user_literals))
   2826     {
   2827       const uchar *const suffix_begin = pos;
   2828       pfile->buffer->cur = pos;
   2829 
   2830       if (const auto sr = scan_cur_identifier (pfile))
   2831 	{
   2832 	  if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
   2833 					     suffix_begin, sr.node))
        	      /* Suffix not accepted: leave the identifier unconsumed.  */
   2834 	      pfile->buffer->cur = suffix_begin;
   2835 	  else
   2836 	    {
   2837 	      type = cpp_userdef_string_add_type (type);
   2838 	      accum.create_literal2 (pfile, token, base, suffix_begin - base,
   2839 				     NODE_NAME (sr.node), NODE_LEN (sr.node),
   2840 				     type);
   2841 	      if (accum.first)
   2842 		_cpp_release_buff (pfile, accum.first);
   2843 	      warn_about_normalization (pfile, token, &sr.nst, true);
   2844 	      return;
   2845 	    }
   2846 	}
   2847     }
   2848 
   2849  out:
   2850   pfile->buffer->cur = pos;
          /* With nothing accumulated the spelling is the single span from
             BASE to POS; otherwise the accumulated text is combined with
             that final span.  */
   2851   if (!accum.accum)
   2852     create_literal (pfile, token, base, pos - base, type);
   2853   else
   2854     {
   2855       accum.create_literal2 (pfile, token, base, pos - base, nullptr, 0, type);
   2856       _cpp_release_buff (pfile, accum.first);
   2857     }
   2858 }
   2859 
   2860 /* Lexes a string, character constant, or angle-bracketed header file
   2861    name.  The stored string contains the spelling, including opening
   2862    quote and any leading 'L', 'u', 'U' or 'u8' and optional
   2863    'R' modifier.  The type determined is that of the literal, or CPP_OTHER
   2864    if it was not properly terminated, or CPP_LESS for an unterminated
   2865    header name which must be relexed as normal tokens.
   2866 
           BASE is the start of the token.  Raw strings are handed over to
           lex_raw_string.  pfile->buffer->cur is updated to just after the
           literal (and after a user-defined-literal suffix, when one is
           accepted), except in the CPP_LESS case, where it is left
           untouched.

   2867    The spelling is NUL-terminated, but it is not guaranteed that this
   2868    is the first NUL since embedded NULs are preserved.  */
   2869 static void
   2870 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
   2871 {
   2872   bool saw_NUL = false;
   2873   const uchar *cur;
   2874   cppchar_t terminator;
   2875   enum cpp_ttype type;
   2876 
          /* Step over any encoding prefix; TERMINATOR ends up holding the
             first character after it.  */
   2877   cur = base;
   2878   terminator = *cur++;
   2879   if (terminator == 'L' || terminator == 'U')
   2880     terminator = *cur++;
   2881   else if (terminator == 'u')
   2882     {
   2883       terminator = *cur++;
   2884       if (terminator == '8')
   2885 	terminator = *cur++;
   2886     }
   2887   if (terminator == 'R')
   2888     {
   2889       lex_raw_string (pfile, token, base);
   2890       return;
   2891     }
   2892   if (terminator == '"')
   2893     type = (*base == 'L' ? CPP_WSTRING :
   2894 	    *base == 'U' ? CPP_STRING32 :
   2895 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
   2896 			 : CPP_STRING);
   2897   else if (terminator == '\'')
   2898     type = (*base == 'L' ? CPP_WCHAR :
   2899 	    *base == 'U' ? CPP_CHAR32 :
   2900 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
   2901 			 : CPP_CHAR);
   2902   else
            /* Neither kind of quote: treat it as an angle-bracketed header
               name, which ends at '>'.  */
   2903     terminator = '>', type = CPP_HEADER_NAME;
   2904 
   2905   const bool warn_bidi_p = pfile->warn_bidi_p ();
   2906   const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
   2907   const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
   2908   for (;;)
   2909     {
   2910       cppchar_t c = *cur++;
   2911 
   2912       /* In #include-style directives, terminators are not escapable.  */
   2913       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
   2914 	{
   2915 	  if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
   2916 	    {
   2917 	      location_t loc;
   2918 	      bidi::kind kind;
   2919 	      if (cur[0] == 'N')
   2920 		kind = get_bidi_named (pfile, cur + 1, &loc);
   2921 	      else
   2922 		kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
   2923 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
   2924 	    }
        	  /* Skip the escaped character so that it cannot end the
        	     literal.  */
   2925 	  cur++;
   2926 	}
   2927       else if (c == terminator)
   2928 	{
   2929 	  if (warn_bidi_p)
   2930 	    maybe_warn_bidi_on_close (pfile, cur - 1);
   2931 	  break;
   2932 	}
   2933       else if (c == '\n')
   2934 	{
        	  /* Leave the newline itself unconsumed.  */
   2935 	  cur--;
   2936 	  /* Unmatched quotes always yield undefined behavior, but
   2937 	     greedy lexing means that what appears to be an unterminated
   2938 	     header name may actually be a legitimate sequence of tokens.  */
   2939 	  if (terminator == '>')
   2940 	    {
   2941 	      token->type = CPP_LESS;
   2942 	      return;
   2943 	    }
   2944 	  type = CPP_OTHER;
   2945 	  break;
   2946 	}
   2947       else if (c == '\0')
   2948 	saw_NUL = true;
   2949       else if (__builtin_expect (c >= utf8_continuation, 0)
   2950 	       && warn_bidi_or_invalid_utf8_p)
   2951 	cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
   2952 					  warn_invalid_utf8_p);
   2953     }
   2954 
   2955   if (saw_NUL && !pfile->state.skipping)
   2956     cpp_error (pfile, CPP_DL_WARNING,
   2957 	       "null character(s) preserved in literal");
   2958 
   2959   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
   2960     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
   2961 	       (int) terminator);
   2962 
   2963   pfile->buffer->cur = cur;
   2964   const uchar *const suffix_begin = cur;
   2965 
   2966   if (CPP_OPTION (pfile, user_literals))
   2967     {
   2968       if (const auto sr = scan_cur_identifier (pfile))
   2969 	{
   2970 	  if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
   2971 					     suffix_begin, sr.node))
        	    /* Suffix not accepted: leave the identifier unconsumed.  */
   2972 	    pfile->buffer->cur = suffix_begin;
   2973 	  else
   2974 	    {
   2975 	      /* Grab user defined literal suffix.  */
        	      /* NOTE(review): both conversions are applied in turn;
        		 presumably each leaves types outside its own family
        		 unchanged -- confirm against their definitions.  */
   2976 	      type = cpp_userdef_char_add_type (type);
   2977 	      type = cpp_userdef_string_add_type (type);
   2978 	      create_literal2 (pfile, token, base, suffix_begin - base,
   2979 			       NODE_NAME (sr.node), NODE_LEN (sr.node), type);
   2980 	      warn_about_normalization (pfile, token, &sr.nst, true);
   2981 	      return;
   2982 	    }
   2983 	}
   2984     }
   2985   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
   2986 	   && !pfile->state.skipping)
   2987     {
   2988       const auto sr = scan_cur_identifier (pfile);
   2989       /* Maybe raise a warning, but do not consume the tokens.  */
   2990       pfile->buffer->cur = suffix_begin;
   2991       if (sr && cpp_macro_p (sr.node))
   2992 	cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
   2993 			       token->src_loc, 0, "C++11 requires a space "
   2994 			       "between string literal and macro");
   2995     }
   2996 
          /* No suffix was attached: the spelling runs from BASE to CUR.  */
   2997   create_literal (pfile, token, base, cur - base, type);
   2998 }
   2999 
   3000 /* Return the comment table. The client may not make any assumption
   3001    about the ordering of the table.
           The result is the address of the table held inside PFILE
           (pfile->comments); no copy is made.  */
   3002 cpp_comment_table *
   3003 cpp_get_comments (cpp_reader *pfile)
   3004 {
   3005   return &pfile->comments;
   3006 }
   3007 
   3008 /* Append a comment to the end of the comment table. */
   3009 static void
   3010 store_comment (cpp_reader *pfile, cpp_token *token)
   3011 {
   3012   int len;
   3013 
   3014   if (pfile->comments.allocated == 0)
   3015     {
   3016       pfile->comments.allocated = 256;
   3017       pfile->comments.entries = (cpp_comment *) xmalloc
   3018 	(pfile->comments.allocated * sizeof (cpp_comment));
   3019     }
   3020 
   3021   if (pfile->comments.count == pfile->comments.allocated)
   3022     {
   3023       pfile->comments.allocated *= 2;
   3024       pfile->comments.entries = (cpp_comment *) xrealloc
   3025 	(pfile->comments.entries,
   3026 	 pfile->comments.allocated * sizeof (cpp_comment));
   3027     }
   3028 
   3029   len = token->val.str.len;
   3030 
   3031   /* Copy comment. Note, token may not be NULL terminated. */
   3032   pfile->comments.entries[pfile->comments.count].comment =
   3033     (char *) xmalloc (sizeof (char) * (len + 1));
   3034   memcpy (pfile->comments.entries[pfile->comments.count].comment,
   3035 	  token->val.str.text, len);
   3036   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
   3037 
   3038   /* Set source location. */
   3039   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
   3040 
   3041   /* Increment the count of entries in the comment table. */
   3042   pfile->comments.count++;
   3043 }
   3044 
   3045 /* The stored comment includes the comment start and any terminator.  */
   3046 static void
   3047 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
   3048 	      cppchar_t type)
   3049 {
   3050   unsigned char *buffer;
   3051   unsigned int len, clen, i;
   3052 
   3053   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
   3054 
   3055   /* C++ comments probably (not definitely) have moved past a new
   3056      line, which we don't want to save in the comment.  */
   3057   if (is_vspace (pfile->buffer->cur[-1]))
   3058     len--;
   3059 
   3060   /* If we are currently in a directive or in argument parsing, then
   3061      we need to store all C++ comments as C comments internally, and
   3062      so we need to allocate a little extra space in that case.
   3063 
   3064      Note that the only time we encounter a directive here is
   3065      when we are saving comments in a "#define".  */
   3066   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
   3067 	  && type == '/') ? len + 2 : len;
   3068 
   3069   buffer = _cpp_unaligned_alloc (pfile, clen);
   3070 
   3071   token->type = CPP_COMMENT;
   3072   token->val.str.len = clen;
   3073   token->val.str.text = buffer;
   3074 
   3075   buffer[0] = '/';
   3076   memcpy (buffer + 1, from, len - 1);
   3077 
   3078   /* Finish conversion to a C comment, if necessary.  */
   3079   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
   3080     {
   3081       buffer[1] = '*';
   3082       buffer[clen - 2] = '*';
   3083       buffer[clen - 1] = '/';
   3084       /* As there can be in a C++ comments illegal sequences for C comments
   3085          we need to filter them out.  */
   3086       for (i = 2; i < (clen - 2); i++)
   3087         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
   3088           buffer[i] = '|';
   3089     }
   3090 
   3091   /* Finally store this comment for use by clients of libcpp. */
   3092   store_comment (pfile, token);
   3093 }
   3094 
   3095 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
   3096    comment.

           *COMMENT_START is '*' for a C block comment; anything else is
           treated as a C++ line comment.  The comment text is examined
           from COMMENT_START + 1, with length checks made against
           pfile->buffer->cur.  What counts as recognized depends on the
           cpp_warn_implicit_fallthrough option: 1 accepts every comment,
           2 accepts any comment containing a fallthrough-like phrase, 3
           and 4 require the whole comment to match one of a set of
           patterns (4 being the stricter of the two), and any other value
           accepts nothing.  */
   3097 
   3098 static bool
   3099 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
   3100 {
          /* FROM walks over the comment contents, which start right after
             *COMMENT_START.  */
   3101   const unsigned char *from = comment_start + 1;
   3102 
   3103   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
   3104     {
   3105       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
   3106 	 don't recognize any comments.  The latter only checks attributes,
   3107 	 the former doesn't warn.  */
   3108     case 0:
   3109     default:
   3110       return false;
   3111       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
   3112 	 content it has.  */
   3113     case 1:
   3114       return true;
   3115     case 2:
   3116       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
   3117 	 .*falls?[ \t-]*thr(u|ough).* regex.  */
   3118       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
   3119 	   from++)
   3120 	{
   3121 	  /* Is there anything like strpbrk with upper boundary, or
   3122 	     memchr looking for 2 characters rather than just one?  */
   3123 	  if (from[0] != 'f' && from[0] != 'F')
   3124 	    continue;
   3125 	  if (from[1] != 'a' && from[1] != 'A')
   3126 	    continue;
   3127 	  if (from[2] != 'l' && from[2] != 'L')
   3128 	    continue;
   3129 	  if (from[3] != 'l' && from[3] != 'L')
   3130 	    continue;
   3131 	  from += sizeof "fall" - 1;
   3132 	  if (from[0] == 's' || from[0] == 'S')
   3133 	    from++;
   3134 	  while (*from == ' ' || *from == '\t' || *from == '-')
   3135 	    from++;
   3136 	  if (from[0] != 't' && from[0] != 'T')
   3137 	    continue;
   3138 	  if (from[1] != 'h' && from[1] != 'H')
   3139 	    continue;
   3140 	  if (from[2] != 'r' && from[2] != 'R')
   3141 	    continue;
   3142 	  if (from[3] == 'u' || from[3] == 'U')
   3143 	    return true;
   3144 	  if (from[3] != 'o' && from[3] != 'O')
   3145 	    continue;
   3146 	  if (from[4] != 'u' && from[4] != 'U')
   3147 	    continue;
   3148 	  if (from[5] != 'g' && from[5] != 'G')
   3149 	    continue;
   3150 	  if (from[6] != 'h' && from[6] != 'H')
   3151 	    continue;
   3152 	  return true;
   3153 	}
   3154       return false;
              /* Levels 3 and 4 both use the whole-comment matching that
        	 follows the switch.  */
   3155     case 3:
   3156     case 4:
   3157       break;
   3158     }
   3159 
   3160   /* Whole comment contents:
   3161      -fallthrough
   3162      @fallthrough@
   3163    */
   3164   if (*from == '-' || *from == '@')
   3165     {
   3166       size_t len = sizeof "fallthrough" - 1;
   3167       if ((size_t) (pfile->buffer->cur - from - 1) < len)
   3168 	return false;
   3169       if (memcmp (from + 1, "fallthrough", len))
   3170 	return false;
   3171       if (*from == '@')
   3172 	{
   3173 	  if (from[len + 1] != '@')
   3174 	    return false;
   3175 	  len++;
   3176 	}
   3177       from += 1 + len;
   3178     }
   3179   /* Whole comment contents (regex):
   3180      lint -fallthrough[ \t]*
   3181    */
   3182   else if (*from == 'l')
   3183     {
   3184       size_t len = sizeof "int -fallthrough" - 1;
   3185       if ((size_t) (pfile->buffer->cur - from - 1) < len)
   3186 	return false;
   3187       if (memcmp (from + 1, "int -fallthrough", len))
   3188 	return false;
   3189       from += 1 + len;
   3190       while (*from == ' ' || *from == '\t')
   3191 	from++;
   3192     }
   3193   /* Whole comment contents (regex):
   3194      [ \t]*FALLTHR(U|OUGH)[ \t]*
             This strict upper-case form is what level 4 accepts here; level
             3 goes through the more permissive final branch instead.
   3195    */
   3196   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
   3197     {
   3198       while (*from == ' ' || *from == '\t')
   3199 	from++;
   3200       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
   3201 	return false;
   3202       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
   3203 	return false;
   3204       from += sizeof "FALLTHR" - 1;
   3205       if (*from == 'U')
   3206 	from++;
   3207       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
   3208 	return false;
   3209       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
   3210 	return false;
   3211       else
   3212 	from += sizeof "OUGH" - 1;
   3213       while (*from == ' ' || *from == '\t')
   3214 	from++;
   3215     }
   3216   /* Whole comment contents (regex):
   3217      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
   3218      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
   3219      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
   3220    */
   3221   else
   3222     {
   3223       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
   3224 	from++;
              /* F is the first letter of the word being examined.  ALL_UPPER
        	 is set once an all-upper-case word has been matched; the
        	 words after it must then be upper case as well.  */
   3225       unsigned char f = *from;
   3226       bool all_upper = false;
   3227       if (f == 'E' || f == 'e')
   3228 	{
   3229 	  if ((size_t) (pfile->buffer->cur - from)
   3230 	      < sizeof "else fallthru" - 1)
   3231 	    return false;
   3232 	  if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
   3233 	    all_upper = true;
   3234 	  else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
   3235 	    return false;
   3236 	  from += sizeof "else" - 1;
   3237 	  if (*from == ',')
   3238 	    from++;
   3239 	  if (*from != ' ')
   3240 	    return false;
   3241 	  from++;
   3242 	  if (all_upper && *from == 'f')
   3243 	    return false;
   3244 	  if (f == 'e' && *from == 'F')
   3245 	    return false;
   3246 	  f = *from;
   3247 	}
   3248       else if (f == 'I' || f == 'i')
   3249 	{
   3250 	  if ((size_t) (pfile->buffer->cur - from)
   3251 	      < sizeof "intentional fallthru" - 1)
   3252 	    return false;
   3253 	  if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
   3254 				  sizeof "NTENTIONAL" - 1) == 0)
   3255 	    all_upper = true;
   3256 	  else if (memcmp (from + 1, "ntentional",
   3257 			   sizeof "ntentional" - 1))
   3258 	    return false;
   3259 	  from += sizeof "intentional" - 1;
   3260 	  if (*from == ' ')
   3261 	    {
   3262 	      from++;
   3263 	      if (all_upper && *from == 'f')
   3264 		return false;
   3265 	    }
   3266 	  else if (all_upper)
   3267 	    {
   3268 	      if (memcmp (from, "LY F", sizeof "LY F" - 1))
   3269 		return false;
   3270 	      from += sizeof "LY " - 1;
   3271 	    }
   3272 	  else
   3273 	    {
   3274 	      if (memcmp (from, "ly ", sizeof "ly " - 1))
   3275 		return false;
   3276 	      from += sizeof "ly " - 1;
   3277 	    }
   3278 	  if (f == 'i' && *from == 'F')
   3279 	    return false;
   3280 	  f = *from;
   3281 	}
   3282       if (f != 'F' && f != 'f')
   3283 	return false;
   3284       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
   3285 	return false;
   3286       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
   3287 	all_upper = true;
   3288       else if (all_upper)
   3289 	return false;
   3290       else if (memcmp (from + 1, "all", sizeof "all" - 1))
   3291 	return false;
   3292       from += sizeof "fall" - 1;
   3293       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
   3294 	from += 2;
   3295       else if (*from == ' ' || *from == '-')
   3296 	from++;
   3297       else if (*from != (all_upper ? 'T' : 't'))
   3298 	return false;
   3299       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
   3300 	return false;
   3301       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
   3302 	return false;
   3303       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
   3304 	{
   3305 	  if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
   3306 	    return false;
   3307 	  if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
   3308 		      sizeof "hrough" - 1))
   3309 	    return false;
   3310 	  from += sizeof "through" - 1;
   3311 	}
   3312       else
   3313 	from += sizeof "thru" - 1;
   3314       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
   3315 	from++;
              /* A '-' introduces free-form trailing text, which is skipped up
        	 to the end of the comment or of the line.  */
   3316       if (*from == '-')
   3317 	{
   3318 	  from++;
   3319 	  if (*comment_start == '*')
   3320 	    {
   3321 	      do
   3322 		{
   3323 		  while (*from && *from != '*'
   3324 			 && *from != '\n' && *from != '\r')
   3325 		    from++;
   3326 		  if (*from != '*' || from[1] == '/')
   3327 		    break;
   3328 		  from++;
   3329 		}
   3330 	      while (1);
   3331 	    }
   3332 	  else
   3333 	    while (*from && *from != '\n' && *from != '\r')
   3334 	      from++;
   3335 	}
   3336     }
          /* Whatever pattern matched must be followed directly by the end of
             the comment.  */
   3337   /* C block comment.  */
   3338   if (*comment_start == '*')
   3339     {
   3340       if (*from != '*' || from[1] != '/')
   3341 	return false;
   3342     }
   3343   /* C++ line comment.  */
   3344   else if (*from != '\n')
   3345     return false;
   3346 
   3347   return true;
   3348 }
   3349 
   3350 /* Allocate COUNT tokens for RUN.  */
   3351 void
   3352 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
   3353 {
   3354   run->base = XNEWVEC (cpp_token, count);
   3355   run->limit = run->base + count;
   3356   run->next = NULL;
   3357 }
   3358 
   3359 /* Returns the next tokenrun, or creates one if there is none.  */
   3360 static tokenrun *
   3361 next_tokenrun (tokenrun *run)
   3362 {
   3363   if (run->next == NULL)
   3364     {
   3365       run->next = XNEW (tokenrun);
   3366       run->next->prev = run;
   3367       _cpp_init_tokenrun (run->next, 250);
   3368     }
   3369 
   3370   return run->next;
   3371 }
   3372 
   3373 /* Return the number of not yet processed token in a given
   3374    context.  */
   3375 int
   3376 _cpp_remaining_tokens_num_in_context (cpp_context *context)
   3377 {
   3378   if (context->tokens_kind == TOKENS_KIND_DIRECT)
   3379     return (LAST (context).token - FIRST (context).token);
   3380   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
   3381 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
   3382     return (LAST (context).ptoken - FIRST (context).ptoken);
   3383   else
   3384       abort ();
   3385 }
   3386 
   3387 /* Returns the token present at index INDEX in a given context.  If
   3388    INDEX is zero, the next token to be processed is returned.  */
   3389 static const cpp_token*
   3390 _cpp_token_from_context_at (cpp_context *context, int index)
   3391 {
   3392   if (context->tokens_kind == TOKENS_KIND_DIRECT)
   3393     return &(FIRST (context).token[index]);
   3394   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
   3395 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
   3396     return FIRST (context).ptoken[index];
   3397  else
   3398    abort ();
   3399 }
   3400 
   3401 /* Look ahead in the input stream.
           Returns the token INDEX positions ahead (0 being the next token)
           without consuming it.  Tokens pending in the stacked cpp_context
           objects are considered first; beyond those, new tokens are lexed
           and then backed up again.  The look-ahead stops early at a
           CPP_EOF or CPP_PRAGMA token, in which case that token is
           returned instead.  */
   3402 const cpp_token *
   3403 cpp_peek_token (cpp_reader *pfile, int index)
   3404 {
   3405   cpp_context *context = pfile->context;
   3406   const cpp_token *peektok;
   3407   int count;
   3408 
   3409   /* First, scan through any pending cpp_context objects.  */
   3410   while (context->prev)
   3411     {
   3412       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
   3413 
   3414       if (index < (int) sz)
   3415         return _cpp_token_from_context_at (context, index);
   3416       index -= (int) sz;
   3417       context = context->prev;
   3418     }
   3419 
   3420   /* We will have to read some new tokens after all (and do so
   3421      without invalidating preceding tokens).  */
   3422   count = index;
   3423   pfile->keep_tokens++;
   3424 
   3425   /* For peeked tokens temporarily disable line_change reporting,
   3426      until the tokens are parsed for real.  */
   3427   void (*line_change) (cpp_reader *, const cpp_token *, int)
   3428     = pfile->cb.line_change;
   3429   pfile->cb.line_change = NULL;
   3430 
   3431   do
   3432     {
   3433       peektok = _cpp_lex_token (pfile);
   3434       if (peektok->type == CPP_EOF)
   3435 	{
        	  /* Do the decrement the loop condition would have done, so
        	     the token count computed below stays right.  */
   3436 	  index--;
   3437 	  break;
   3438 	}
   3439       else if (peektok->type == CPP_PRAGMA)
   3440 	{
   3441 	  /* Don't peek past a pragma.  */
   3442 	  if (peektok == &pfile->directive_result)
   3443 	    /* Save the pragma in the buffer.  */
   3444 	    *pfile->cur_token++ = *peektok;
   3445 	  index--;
   3446 	  break;
   3447 	}
   3448     }
   3449   while (index--);
   3450 
          /* INDEX has been decremented once per token lexed above, so
             COUNT - INDEX is the number of tokens to hand back.  */
   3451   _cpp_backup_tokens_direct (pfile, count - index);
   3452   pfile->keep_tokens--;
   3453   pfile->cb.line_change = line_change;
   3454 
   3455   return peektok;
   3456 }
   3457 
   3458 /* Allocate a single token that is invalidated at the same time as the
   3459    rest of the tokens on the line.  Has its line and col set to the
   3460    same as the last lexed token, so that diagnostics appear in the
   3461    right place.
           The new token takes the slot at pfile->cur_token, which is then
           advanced; any lookahead tokens stored from that slot onwards are
           first moved up by one position.  */
   3462 cpp_token *
   3463 _cpp_temp_token (cpp_reader *pfile)
   3464 {
   3465   cpp_token *old, *result;
          /* SZ: token slots left in the current run, counted from cur_token.
             LA: the value of pfile->lookaheads.  */
   3466   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
   3467   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
   3468 
   3469   old = pfile->cur_token - 1;
   3470   /* Any pre-existing lookaheads must not be clobbered.  */
   3471   if (la)
   3472     {
   3473       if (sz <= la)
   3474         {
                  /* Moving the lookaheads up by one pushes the token in the
                     last slot of this run over into the next run; the LA - SZ
                     tokens at the start of the next run (presumably
                     lookaheads that already spilled over) move up first to
                     make room for it.  */
   3475           tokenrun *next = next_tokenrun (pfile->cur_run);
   3476 
   3477           if (sz < la)
   3478             memmove (next->base + 1, next->base,
   3479                      (la - sz) * sizeof (cpp_token));
   3480 
   3481           next->base[0] = pfile->cur_run->limit[-1];
   3482         }
   3483 
              /* Shift the lookaheads held in this run up by one slot, which
                 frees the slot at cur_token.  */
   3484       if (sz > 1)
   3485         memmove (pfile->cur_token + 1, pfile->cur_token,
   3486                  MIN (la, sz - 1) * sizeof (cpp_token));
   3487     }
   3488 
          /* No room left in this run: continue in the next one.  */
   3489   if (!sz && pfile->cur_token == pfile->cur_run->limit)
   3490     {
   3491       pfile->cur_run = next_tokenrun (pfile->cur_run);
   3492       pfile->cur_token = pfile->cur_run->base;
   3493     }
   3494 
   3495   result = pfile->cur_token++;
   3496   result->src_loc = old->src_loc;
   3497   return result;
   3498 }
   3499 
   3500 /* We're at the beginning of a logical line (so not in
   3501   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
   3502   if we should enter deferred_pragma mode to tokenize the rest of the
   3503   line as a module control-line.  */
        /* PFILE is the reader; RESULT is the token just lexed at the start
           of the line.  Nothing is returned: the verdict is left in PFILE's
           state.  When the line is accepted, in_deferred_pragma stays set,
           the keyword token(s) get NO_EXPAND and have their nodes replaced
           by n_modules[...][1] entries, and directive_file_token is set.
           When it is rejected, save_comments, in_deferred_pragma and
           angled_headers are put back to the values asserted on entry.
           In both cases the tokens lexed here for inspection are handed
           back with _cpp_backup_tokens_direct, except that a lone
           end-of-line token is dropped.  */
   3504 
   3505 static void
   3506 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
   3507 {
   3508   unsigned backup = 0; /* Tokens we peeked.  */
   3509   cpp_hashnode *node = result->val.node.node;
   3510   cpp_token *peek = result;
   3511   cpp_token *keyword = peek;
          /* Table of module-related identifier nodes.  Column [0] is what
             the lexed names are compared against below; column [1] is what
             the tokens are rewritten to once the line is accepted.  */
   3512   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
          /* Stays zero for 'module'; non-zero for 'import' / '__import'.
             NOTE(review): what the "+ 2" and "+ 16" components encode is
             not visible in this function -- here the value is only tested
             for non-zero and stored in state.directive_file_token.  */
   3513   int header_count = 0;
   3514 
   3515   /* Make sure the incoming state is as we expect it.  This way we
   3516      can restore it using constants.  */
   3517   gcc_checking_assert (!pfile->state.in_deferred_pragma
   3518 		       && !pfile->state.skipping
   3519 		       && !pfile->state.parsing_args
   3520 		       && !pfile->state.angled_headers
   3521 		       && (pfile->state.save_comments
   3522 			   == !CPP_OPTION (pfile, discard_comments)));
   3523 
   3524   /* Enter directives mode sufficiently for peeking.  We don't have
   3525      to actually set in_directive.  */
   3526   pfile->state.in_deferred_pragma = true;
   3527 
   3528   /* These two fields are needed to process tokenization in deferred
   3529      pragma mode.  They are not used outside deferred pragma mode or
   3530      directives mode.  */
   3531   pfile->state.pragma_allow_expansion = true;
   3532   pfile->directive_line = result->src_loc;
   3533 
   3534   /* Saving comments is incompatible with directives mode.   */
   3535   pfile->state.save_comments = 0;
   3536 
          /* 'export' may come before the real keyword: lex one more token
             and require it to be another NODE_MODULE name.  */
   3537   if (node == n_modules[spec_nodes::M_EXPORT][0])
   3538     {
   3539       peek = _cpp_lex_direct (pfile);
   3540       keyword = peek;
   3541       backup++;
   3542       if (keyword->type != CPP_NAME)
   3543 	goto not_module;
   3544       node = keyword->val.node.node;
   3545       if (!(node->flags & NODE_MODULE))
   3546 	goto not_module;
   3547     }
   3548 
          /* Classify the keyword.  Anything other than __import, import
             or module (for instance a second 'export') is rejected.  */
   3549   if (node == n_modules[spec_nodes::M__IMPORT][0])
   3550     /* __import  */
   3551     header_count = backup + 2 + 16;
   3552   else if (node == n_modules[spec_nodes::M_IMPORT][0])
   3553     /* import  */
   3554     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
   3555   else if (node == n_modules[spec_nodes::M_MODULE][0])
   3556     ; /* module  */
   3557   else
   3558     goto not_module;
   3559 
   3560   /* We've seen [export] {module|import|__import}.  Check the next token.  */
   3561   if (header_count)
   3562     /* After '{,__}import' a header name may appear.  */
   3563     pfile->state.angled_headers = true;
   3564   peek = _cpp_lex_direct (pfile);
   3565   backup++;
   3566 
   3567   /* ... import followed by identifier, ':', '<' or
   3568      header-name preprocessing tokens, or module
   3569      followed by cpp-identifier, ':' or ';' preprocessing
   3570      tokens.  C++ keywords are not yet relevant.  */
   3571   if (peek->type == CPP_NAME
   3572       || peek->type == CPP_COLON
   3573       ||  (header_count
   3574 	   ? (peek->type == CPP_LESS
   3575 	      || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
   3576 	      || peek->type == CPP_HEADER_NAME)
   3577 	   : peek->type == CPP_SEMICOLON))
   3578     {
              /* Expansion stays allowed unless the input is already
                 preprocessed; in that case bump prevent_expansion (the
                 end-of-line handling in _cpp_lex_direct undoes the bump).  */
   3579       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
   3580       if (!pfile->state.pragma_allow_expansion)
   3581 	pfile->state.prevent_expansion++;
   3582 
              /* Only 'module' lines (header_count == 0) are diagnosed when
                 the current line map was entered through an include.  */
   3583       if (!header_count && linemap_included_from
   3584 	  (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
   3585 	cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
   3586 			     "module control-line cannot be in included file");
   3587 
   3588       /* The first one or two tokens cannot be macro names.  */
              /* IX counts down from BACKUP - 1 to 0.  A non-zero IX picks
                 KEYWORD and IX == 0 picks RESULT; without a leading
                 'export' the loop runs once and RESULT is KEYWORD.  */
   3589       for (int ix = backup; ix--;)
   3590 	{
   3591 	  cpp_token *tok = ix ? keyword : result;
        	  /* This NODE hides the function-level NODE inside the loop.  */
   3592 	  cpp_hashnode *node = tok->val.node.node;
   3593 
   3594 	  /* Don't attempt to expand the token.  */
   3595 	  tok->flags |= NO_EXPAND;
   3596 	  if (_cpp_defined_macro_p (node)
   3597 	      && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
   3598 	      && !cpp_fun_like_macro_p (node))
   3599 	    cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
   3600 				 "module control-line \"%s\" cannot be"
   3601 				 " an object-like macro",
   3602 				 NODE_NAME (node));
   3603 	}
   3604 
   3605       /* Map to underbar variants.  */
   3606       keyword->val.node.node = n_modules[header_count
   3607 					 ? spec_nodes::M_IMPORT
   3608 					 : spec_nodes::M_MODULE][1];
              /* BACKUP is 2 here only when 'export' came first; RESULT is
                 then that 'export' token and is rewritten as well.  */
   3609       if (backup != 1)
   3610 	result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
   3611 
   3612       /* Maybe tell the tokenizer we expect a header-name down the
   3613 	 road.  */
   3614       pfile->state.directive_file_token = header_count;
   3615     }
   3616   else
   3617     {
              /* Reached by falling into this else-arm and by the gotos
                 above.  */
   3618     not_module:
   3619       /* Drop out of directive mode.  */
   3620       /* We asserted save_comments had this value upon entry.  */
   3621       pfile->state.save_comments
   3622 	= !CPP_OPTION (pfile, discard_comments);
   3623       pfile->state.in_deferred_pragma = false;
   3624       /* Do not let this remain on.  */
   3625       pfile->state.angled_headers = false;
   3626     }
   3627 
   3628   /* In either case we want to backup the peeked tokens.  */
   3629   if (backup)
   3630     {
   3631       /* If we saw EOL, we should drop it, because this isn't a module
   3632 	 control-line after all.  */
   3633       bool eol = peek->type == CPP_PRAGMA_EOL;
              /* When the only peeked token is the EOL, nothing is put
                 back at all.  */
   3634       if (!eol || backup > 1)
   3635 	{
   3636 	  /* Put the peeked tokens back.  */
   3637 	  _cpp_backup_tokens_direct (pfile, backup);
   3638 	  /* But if the last one was an EOL, forget it.  */
   3639 	  if (eol)
   3640 	    pfile->lookaheads--;
   3641 	}
   3642     }
   3643 }
   3644 
   3645 /* Lex a token into RESULT (external interface).  Takes care of issues
   3646    like directive handling, token lookahead, multiple include
   3647    optimization and skipping.  */
        /* PFILE is the reader.  The returned pointer refers either to a
           slot in the current token run or to pfile->directive_result.
           The loop goes round again when a directive left only
           CPP_PADDING behind, and -- outside directives and deferred
           pragmas -- for as long as pfile->state.skipping is set and the
           token is not CPP_EOF.  */
   3648 const cpp_token *
   3649 _cpp_lex_token (cpp_reader *pfile)
   3650 {
   3651   cpp_token *result;
   3652 
   3653   for (;;)
   3654     {
              /* At the end of the current run: continue at the base of
                 the next one.  */
   3655       if (pfile->cur_token == pfile->cur_run->limit)
   3656 	{
   3657 	  pfile->cur_run = next_tokenrun (pfile->cur_run);
   3658 	  pfile->cur_token = pfile->cur_run->base;
   3659 	}
   3660       /* We assume that the current token is somewhere in the current
   3661 	 run.  */
   3662       if (pfile->cur_token < pfile->cur_run->base
   3663 	  || pfile->cur_token >= pfile->cur_run->limit)
   3664 	abort ();
   3665 
              /* Tokens are pending as lookaheads: consume the next stored
                 slot instead of lexing afresh.  */
   3666       if (pfile->lookaheads)
   3667 	{
   3668 	  pfile->lookaheads--;
   3669 	  result = pfile->cur_token++;
   3670 	}
   3671       else
   3672 	result = _cpp_lex_direct (pfile);
   3673 
              /* Start-of-line work happens only for tokens flagged BOL.  */
   3674       if (result->flags & BOL)
   3675 	{
   3676 	  /* Is this a directive.  If _cpp_handle_directive returns
   3677 	     false, it is an assembler #.  */
   3678 	  if (result->type == CPP_HASH
   3679 	      /* 6.10.3 p 11: Directives in a list of macro arguments
   3680 		 gives undefined behavior.  This implementation
   3681 		 handles the directive as normal.  */
   3682 	      && pfile->state.parsing_args != 1)
   3683 	    {
   3684 	      if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
   3685 		{
        		  /* The directive left nothing to hand out: lex the
        		     next token instead.  */
   3686 		  if (pfile->directive_result.type == CPP_PADDING)
   3687 		    continue;
   3688 		  result = &pfile->directive_result;
   3689 		}
   3690 	    }
        	  /* A BOL token that is not a directive '#' while a deferred
        	     pragma is active: hand out directive_result instead.  */
   3691 	  else if (pfile->state.in_deferred_pragma)
   3692 	    result = &pfile->directive_result;
   3693 	  else if (result->type == CPP_NAME
   3694 		   && (result->val.node.node->flags & NODE_MODULE)
   3695 		   && !pfile->state.skipping
   3696 		   /* Unlike regular directives, we do not deal with
   3697 		      tokenizing module directives as macro arguments.
   3698 		      That's not permitted.  */
   3699 		   && !pfile->state.parsing_args)
   3700 	    {
   3701 	      /* P1857.  Before macro expansion, At start of logical
   3702 		 line ... */
   3703 	      /* We don't have to consider lookaheads at this point.  */
   3704 	      gcc_checking_assert (!pfile->lookaheads);
   3705 
   3706 	      cpp_maybe_module_directive (pfile, result);
   3707 	    }
   3708 
        	  /* Report the line change through the client callback when one
        	     is installed and tokens are not being skipped.  */
   3709 	  if (pfile->cb.line_change && !pfile->state.skipping)
   3710 	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
   3711 	}
   3712 
   3713       /* We don't skip tokens in directives.  */
   3714       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
   3715 	break;
   3716 
   3717       /* Outside a directive, invalidate controlling macros.  At file
   3718 	 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
   3719 	 get here and MI optimization works.  */
   3720       pfile->mi_valid = false;
   3721 
              /* While skipping, every token except CPP_EOF is discarded by
                 going round the loop again.  */
   3722       if (!pfile->state.skipping || result->type == CPP_EOF)
   3723 	break;
   3724     }
   3725 
   3726   return result;
   3727 }
   3728 
   3729 /* Returns true if a fresh line has been loaded.  */
        /* In detail: returns true when pfile->buffer does not need a line,
           or when the buffer still had unread text and _cpp_clean_line was
           run on it.  Returns false when
             - LEXING_RAW_STRING is false and a directive is in progress;
             - the buffer is exhausted and either a directive is in
               progress (raw-string case) or macro arguments are being
               parsed;
             - the buffer is exhausted and it either has no previous
               buffer or has return_at_eof set; the line number is
               incremented before returning in this case.
           Otherwise the exhausted buffer is popped and the test is
           repeated on whatever pfile->buffer is afterwards.  */
   3730 template <bool lexing_raw_string>
   3731 static bool
   3732 get_fresh_line_impl (cpp_reader *pfile)
   3733 {
   3734   /* We can't get a new line until we leave the current directive, unless we
   3735      are lexing a raw string, in which case it will be OK as long as we don't
   3736      pop the current buffer.  */
   3737   if (!lexing_raw_string && pfile->state.in_directive)
   3738     return false;
   3739 
   3740   for (;;)
   3741     {
              /* Fetched anew on every iteration; presumably
                 _cpp_pop_buffer below changes pfile->buffer.  */
   3742       cpp_buffer *buffer = pfile->buffer;
   3743 
              /* The current line has not been used up yet.  */
   3744       if (!buffer->need_line)
   3745 	return true;
   3746 
              /* Text remains before rlimit: prepare the next line.  */
   3747       if (buffer->next_line < buffer->rlimit)
   3748 	{
   3749 	  _cpp_clean_line (pfile);
   3750 	  return true;
   3751 	}
   3752 
   3753       /* We can't change buffers until we leave the current directive.  */
   3754       if (lexing_raw_string && pfile->state.in_directive)
   3755 	return false;
   3756 
   3757       /* First, get out of parsing arguments state.  */
   3758       if (pfile->state.parsing_args)
   3759 	return false;
   3760 
   3761       /* End of buffer.  Non-empty files should end in a newline.  */
   3762       if (buffer->buf != buffer->rlimit
   3763 	  && buffer->next_line > buffer->rlimit
   3764 	  && !buffer->from_stage3)
   3765 	{
   3766 	  /* Clip to buffer size.  */
   3767 	  buffer->next_line = buffer->rlimit;
   3768 	}
   3769 
              /* A nested buffer that is not marked return_at_eof is popped
                 and the loop retries; otherwise stop here.  */
   3770       if (buffer->prev && !buffer->return_at_eof)
   3771 	_cpp_pop_buffer (pfile);
   3772       else
   3773 	{
   3774 	  /* End of translation.  Do not pop the buffer yet. Increment
   3775 	     line number so that the EOF token is on a line of its own
   3776 	     (_cpp_lex_direct doesn't increment in that case, because
   3777 	     it's hard for it to distinguish this special case). */
   3778 	  CPP_INCREMENT_LINE (pfile, 0);
   3779 	  return false;
   3780 	}
   3781     }
   3782 }
   3783 
        /* Non-template entry point: runs get_fresh_line_impl with
           lexing_raw_string false, so no line is loaded while
           pfile->state.in_directive is set.  Returns that function's
           result unchanged.  */
   3784 bool
   3785 _cpp_get_fresh_line (cpp_reader *pfile)
   3786 {
   3787   return get_fresh_line_impl<false> (pfile);
   3788 }
   3789 
   3790 
        /* Two-way operator helper used by the punctuator cases of
           _cpp_lex_direct.  It relies on two names being in scope where it
           is expanded: RESULT, the token being filled in, and BUFFER, whose
           CUR member points at the next unread character.  RESULT's type is
           first set to ELSE_TYPE; if the next character equals CHAR it is
           consumed and the type becomes THEN_TYPE.  The arguments are
           substituted without parentheses, so pass simple constants.  */
   3791 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
   3792   do							\
   3793     {							\
   3794       result->type = ELSE_TYPE;				\
   3795       if (*buffer->cur == CHAR)				\
   3796 	buffer->cur++, result->type = THEN_TYPE;	\
   3797     }							\
   3798   while (0)
   3799 
   3800 /* Lex a token into pfile->cur_token, which is also incremented, to
   3801    get diagnostics pointing to the correct location.
   3802 
   3803    Does not handle issues such as token lookahead, multiple-include
   3804    optimization, directives, skipping etc.  This function is only
   3805    suitable for use by _cpp_lex_token, and in special cases like
   3806    lex_expansion_token which doesn't care for any of these issues.
   3807 
   3808    When meeting a newline, returns CPP_EOF if parsing a directive,
   3809    otherwise returns to the start of the token buffer if permissible.
   3810    Returns the location of the lexed token.  */
   3811 cpp_token *
   3812 _cpp_lex_direct (cpp_reader *pfile)
   3813 {
   3814   cppchar_t c = 0;
   3815   cpp_buffer *buffer;
   3816   const unsigned char *comment_start;
   3817   bool fallthrough_comment = false;
   3818   cpp_token *result = pfile->cur_token++;
   3819 
   3820  fresh_line:
   3821   result->flags = 0;
   3822   buffer = pfile->buffer;
   3823   if (buffer->need_line)
   3824     {
   3825       if (pfile->state.in_deferred_pragma)
   3826 	{
   3827 	  /* This can happen in cases like:
   3828 	     #define loop(x) whatever
   3829 	     #pragma omp loop
   3830 	     where when trying to expand loop we need to peek
   3831 	     next token after loop, but aren't still in_deferred_pragma
   3832 	     mode but are in in_directive mode, so buffer->need_line
   3833 	     is set, a CPP_EOF is peeked.  */
   3834 	  result->type = CPP_PRAGMA_EOL;
   3835 	  pfile->state.in_deferred_pragma = false;
   3836 	  if (!pfile->state.pragma_allow_expansion)
   3837 	    pfile->state.prevent_expansion--;
   3838 	  result->src_loc = pfile->line_table->highest_line;
   3839 	  return result;
   3840 	}
   3841       if (!_cpp_get_fresh_line (pfile))
   3842 	{
   3843 	  result->type = CPP_EOF;
   3844 	  /* Not a real EOF in a directive or arg parsing -- we refuse
   3845   	     to advance to the next file now, and will once we're out
   3846   	     of those modes.  */
   3847 	  if (!pfile->state.in_directive && !pfile->state.parsing_args)
   3848 	    {
   3849 	      /* Tell the compiler the line number of the EOF token.  */
   3850 	      result->src_loc = pfile->line_table->highest_line;
   3851 	      result->flags = BOL;
   3852 	      /* Now pop the buffer that _cpp_get_fresh_line did not.  */
   3853 	      _cpp_pop_buffer (pfile);
   3854 	    }
   3855 	  else if (c == 0)
   3856 	    result->src_loc = pfile->line_table->highest_line;
   3857 	  return result;
   3858 	}
   3859       if (buffer != pfile->buffer)
   3860 	fallthrough_comment = false;
   3861       if (!pfile->keep_tokens)
   3862 	{
   3863 	  pfile->cur_run = &pfile->base_run;
   3864 	  result = pfile->base_run.base;
   3865 	  pfile->cur_token = result + 1;
   3866 	}
   3867       result->flags = BOL;
   3868       if (pfile->state.parsing_args == 2)
   3869 	result->flags |= PREV_WHITE;
   3870     }
   3871   buffer = pfile->buffer;
   3872  update_tokens_line:
   3873   result->src_loc = pfile->line_table->highest_line;
   3874 
   3875  skipped_white:
   3876   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
   3877       && !pfile->overlaid_buffer)
   3878     {
   3879       _cpp_process_line_notes (pfile, false);
   3880       result->src_loc = pfile->line_table->highest_line;
   3881     }
   3882   c = *buffer->cur++;
   3883 
   3884   if (pfile->forced_token_location)
   3885     result->src_loc = pfile->forced_token_location;
   3886   else
   3887     result->src_loc = linemap_position_for_column (pfile->line_table,
   3888 					  CPP_BUF_COLUMN (buffer, buffer->cur));
   3889 
   3890   switch (c)
   3891     {
   3892     case ' ': case '\t': case '\f': case '\v': case '\0':
   3893       result->flags |= PREV_WHITE;
   3894       skip_whitespace (pfile, c);
   3895       goto skipped_white;
   3896 
   3897     case '\n':
   3898       /* Increment the line, unless this is the last line ...  */
   3899       if (buffer->cur < buffer->rlimit
   3900 	  /* ... or this is a #include, (where _cpp_stack_file needs to
   3901 	     unwind by one line) ...  */
   3902 	  || (pfile->state.in_directive > 1
   3903 	      /* ... except traditional-cpp increments this elsewhere.  */
   3904 	      && !CPP_OPTION (pfile, traditional)))
   3905 	CPP_INCREMENT_LINE (pfile, 0);
   3906       buffer->need_line = true;
   3907       if (pfile->state.in_deferred_pragma)
   3908 	{
   3909 	  /* Produce the PRAGMA_EOL on this line.  File reading
   3910 	     ensures there is always a \n at end of the buffer, thus
   3911 	     in a deferred pragma we always see CPP_PRAGMA_EOL before
   3912 	     any CPP_EOF.  */
   3913 	  result->type = CPP_PRAGMA_EOL;
   3914 	  result->flags &= ~PREV_WHITE;
   3915 	  pfile->state.in_deferred_pragma = false;
   3916 	  if (!pfile->state.pragma_allow_expansion)
   3917 	    pfile->state.prevent_expansion--;
   3918 	  return result;
   3919 	}
   3920       goto fresh_line;
   3921 
   3922     case '0': case '1': case '2': case '3': case '4':
   3923     case '5': case '6': case '7': case '8': case '9':
   3924       {
   3925 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3926 	result->type = CPP_NUMBER;
   3927 	lex_number (pfile, &result->val.str, &nst);
   3928 	warn_about_normalization (pfile, result, &nst, false);
   3929 	break;
   3930       }
   3931 
   3932     case 'L':
   3933     case 'u':
   3934     case 'U':
   3935     case 'R':
   3936       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
   3937 	 wide strings or raw strings.  */
   3938       if (c == 'L' || CPP_OPTION (pfile, rliterals)
   3939 	  || (c != 'R' && CPP_OPTION (pfile, uliterals)))
   3940 	{
   3941 	  if ((*buffer->cur == '\'' && c != 'R')
   3942 	      || *buffer->cur == '"'
   3943 	      || (*buffer->cur == 'R'
   3944 		  && c != 'R'
   3945 		  && buffer->cur[1] == '"'
   3946 		  && CPP_OPTION (pfile, rliterals))
   3947 	      || (*buffer->cur == '8'
   3948 		  && c == 'u'
   3949 		  && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
   3950 				&& CPP_OPTION (pfile, utf8_char_literals)))
   3951 		      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
   3952 			  && CPP_OPTION (pfile, rliterals)))))
   3953 	    {
   3954 	      lex_string (pfile, result, buffer->cur - 1);
   3955 	      break;
   3956 	    }
   3957 	}
   3958       /* Fall through.  */
   3959 
   3960     case '_':
   3961     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
   3962     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
   3963     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
   3964     case 's': case 't':           case 'v': case 'w': case 'x':
   3965     case 'y': case 'z':
   3966     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
   3967     case 'G': case 'H': case 'I': case 'J': case 'K':
   3968     case 'M': case 'N': case 'O': case 'P': case 'Q':
   3969     case 'S': case 'T':           case 'V': case 'W': case 'X':
   3970     case 'Y': case 'Z':
   3971       result->type = CPP_NAME;
   3972       {
   3973 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3974 	const auto node = lex_identifier (pfile, buffer->cur - 1, false, &nst,
   3975 					  &result->val.node.spelling);
   3976 	result->val.node.node = node;
   3977 	identifier_diagnostics_on_lex (pfile, node);
   3978 	warn_about_normalization (pfile, result, &nst, true);
   3979       }
   3980 
   3981       /* Convert named operators to their proper types.  */
   3982       if (result->val.node.node->flags & NODE_OPERATOR)
   3983 	{
   3984 	  result->flags |= NAMED_OP;
   3985 	  result->type = (enum cpp_ttype) result->val.node.node->directive_index;
   3986 	}
   3987 
   3988       /* Signal FALLTHROUGH comment followed by another token.  */
   3989       if (fallthrough_comment)
   3990 	result->flags |= PREV_FALLTHROUGH;
   3991       break;
   3992 
   3993     case '\'':
   3994     case '"':
   3995       lex_string (pfile, result, buffer->cur - 1);
   3996       break;
   3997 
   3998     case '/':
   3999       /* A potential block or line comment.  */
   4000       comment_start = buffer->cur;
   4001       c = *buffer->cur;
   4002 
   4003       if (c == '*')
   4004 	{
   4005 	  if (_cpp_skip_block_comment (pfile))
   4006 	    cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
   4007 	}
   4008       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
   4009 	{
   4010 	  /* Don't warn for system headers.  */
   4011 	  if (_cpp_in_system_header (pfile))
   4012 	    ;
   4013 	  /* Warn about comments if pedantically GNUC89, and not
   4014 	     in system headers.  */
   4015 	  else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
   4016 		   && CPP_PEDANTIC (pfile)
   4017 		   && ! buffer->warned_cplusplus_comments)
   4018 	    {
   4019 	      if (cpp_error (pfile, CPP_DL_PEDWARN,
   4020 			     "C++ style comments are not allowed in ISO C90"))
   4021 		cpp_error (pfile, CPP_DL_NOTE,
   4022 			   "(this will be reported only once per input file)");
   4023 	      buffer->warned_cplusplus_comments = 1;
   4024 	    }
   4025 	  /* Or if specifically desired via -Wc90-c99-compat.  */
   4026 	  else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
   4027 		   && ! CPP_OPTION (pfile, cplusplus)
   4028 		   && ! buffer->warned_cplusplus_comments)
   4029 	    {
   4030 	      if (cpp_error (pfile, CPP_DL_WARNING,
   4031 			     "C++ style comments are incompatible with C90"))
   4032 		cpp_error (pfile, CPP_DL_NOTE,
   4033 			   "(this will be reported only once per input file)");
   4034 	      buffer->warned_cplusplus_comments = 1;
   4035 	    }
   4036 	  /* In C89/C94, C++ style comments are forbidden.  */
   4037 	  else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
   4038 		    || CPP_OPTION (pfile, lang) == CLK_STDC94))
   4039 	    {
   4040 	      /* But don't be confused about valid code such as
   4041 	         - // immediately followed by *,
   4042 		 - // in a preprocessing directive,
   4043 		 - // in an #if 0 block.  */
   4044 	      if (buffer->cur[1] == '*'
   4045 		  || pfile->state.in_directive
   4046 		  || pfile->state.skipping)
   4047 		{
   4048 		  result->type = CPP_DIV;
   4049 		  break;
   4050 		}
   4051 	      else if (! buffer->warned_cplusplus_comments)
   4052 		{
   4053 		  if (cpp_error (pfile, CPP_DL_ERROR,
   4054 				 "C++ style comments are not allowed in "
   4055 				 "ISO C90"))
   4056 		    cpp_error (pfile, CPP_DL_NOTE,
   4057 			       "(this will be reported only once per input "
   4058 			       "file)");
   4059 		  buffer->warned_cplusplus_comments = 1;
   4060 		}
   4061 	    }
   4062 	  if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
   4063 	    cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
   4064 	}
   4065       else if (c == '=')
   4066 	{
   4067 	  buffer->cur++;
   4068 	  result->type = CPP_DIV_EQ;
   4069 	  break;
   4070 	}
   4071       else
   4072 	{
   4073 	  result->type = CPP_DIV;
   4074 	  break;
   4075 	}
   4076 
   4077       if (fallthrough_comment_p (pfile, comment_start))
   4078 	fallthrough_comment = true;
   4079 
   4080       if (pfile->cb.comment)
   4081 	{
   4082 	  size_t len = pfile->buffer->cur - comment_start;
   4083 	  pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
   4084 			     len + 1);
   4085 	}
   4086 
   4087       if (!pfile->state.save_comments)
   4088 	{
   4089 	  result->flags |= PREV_WHITE;
   4090 	  goto update_tokens_line;
   4091 	}
   4092 
   4093       if (fallthrough_comment)
   4094 	result->flags |= PREV_FALLTHROUGH;
   4095 
   4096       /* Save the comment as a token in its own right.  */
   4097       save_comment (pfile, result, comment_start, c);
   4098       break;
   4099 
   4100     case '<':
   4101       if (pfile->state.angled_headers)
   4102 	{
   4103 	  lex_string (pfile, result, buffer->cur - 1);
   4104 	  if (result->type != CPP_LESS)
   4105 	    break;
   4106 	}
   4107 
   4108       result->type = CPP_LESS;
   4109       if (*buffer->cur == '=')
   4110 	{
   4111 	  buffer->cur++, result->type = CPP_LESS_EQ;
   4112 	  if (*buffer->cur == '>'
   4113 	      && CPP_OPTION (pfile, cplusplus)
   4114 	      && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
   4115 	    buffer->cur++, result->type = CPP_SPACESHIP;
   4116 	}
   4117       else if (*buffer->cur == '<')
   4118 	{
   4119 	  buffer->cur++;
   4120 	  IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
   4121 	}
   4122       else if (CPP_OPTION (pfile, digraphs))
   4123 	{
   4124 	  if (*buffer->cur == ':')
   4125 	    {
   4126 	      /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
   4127 		 three characters are <:: and the subsequent character
   4128 		 is neither : nor >, the < is treated as a preprocessor
   4129 		 token by itself".  */
   4130 	      if (CPP_OPTION (pfile, cplusplus)
   4131 		  && CPP_OPTION (pfile, lang) != CLK_CXX98
   4132 		  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
   4133 		  && buffer->cur[1] == ':'
   4134 		  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
   4135 		break;
   4136 
   4137 	      buffer->cur++;
   4138 	      result->flags |= DIGRAPH;
   4139 	      result->type = CPP_OPEN_SQUARE;
   4140 	    }
   4141 	  else if (*buffer->cur == '%')
   4142 	    {
   4143 	      buffer->cur++;
   4144 	      result->flags |= DIGRAPH;
   4145 	      result->type = CPP_OPEN_BRACE;
   4146 	    }
   4147 	}
   4148       break;
   4149 
   4150     case '>':
   4151       result->type = CPP_GREATER;
   4152       if (*buffer->cur == '=')
   4153 	buffer->cur++, result->type = CPP_GREATER_EQ;
   4154       else if (*buffer->cur == '>')
   4155 	{
   4156 	  buffer->cur++;
   4157 	  IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
   4158 	}
   4159       break;
   4160 
   4161     case '%':
   4162       result->type = CPP_MOD;
   4163       if (*buffer->cur == '=')
   4164 	buffer->cur++, result->type = CPP_MOD_EQ;
   4165       else if (CPP_OPTION (pfile, digraphs))
   4166 	{
   4167 	  if (*buffer->cur == ':')
   4168 	    {
   4169 	      buffer->cur++;
   4170 	      result->flags |= DIGRAPH;
   4171 	      result->type = CPP_HASH;
   4172 	      if (*buffer->cur == '%' && buffer->cur[1] == ':')
   4173 		buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
   4174 	    }
   4175 	  else if (*buffer->cur == '>')
   4176 	    {
   4177 	      buffer->cur++;
   4178 	      result->flags |= DIGRAPH;
   4179 	      result->type = CPP_CLOSE_BRACE;
   4180 	    }
   4181 	}
   4182       break;
   4183 
   4184     case '.':
   4185       result->type = CPP_DOT;
   4186       if (ISDIGIT (*buffer->cur))
   4187 	{
   4188 	  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   4189 	  result->type = CPP_NUMBER;
   4190 	  lex_number (pfile, &result->val.str, &nst);
   4191 	  warn_about_normalization (pfile, result, &nst, false);
   4192 	}
   4193       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
   4194 	buffer->cur += 2, result->type = CPP_ELLIPSIS;
   4195       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
   4196 	buffer->cur++, result->type = CPP_DOT_STAR;
   4197       break;
   4198 
   4199     case '+':
   4200       result->type = CPP_PLUS;
   4201       if (*buffer->cur == '+')
   4202 	buffer->cur++, result->type = CPP_PLUS_PLUS;
   4203       else if (*buffer->cur == '=')
   4204 	buffer->cur++, result->type = CPP_PLUS_EQ;
   4205       break;
   4206 
   4207     case '-':
   4208       result->type = CPP_MINUS;
   4209       if (*buffer->cur == '>')
   4210 	{
   4211 	  buffer->cur++;
   4212 	  result->type = CPP_DEREF;
   4213 	  if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
   4214 	    buffer->cur++, result->type = CPP_DEREF_STAR;
   4215 	}
   4216       else if (*buffer->cur == '-')
   4217 	buffer->cur++, result->type = CPP_MINUS_MINUS;
   4218       else if (*buffer->cur == '=')
   4219 	buffer->cur++, result->type = CPP_MINUS_EQ;
   4220       break;
   4221 
   4222     case '&':
   4223       result->type = CPP_AND;
   4224       if (*buffer->cur == '&')
   4225 	buffer->cur++, result->type = CPP_AND_AND;
   4226       else if (*buffer->cur == '=')
   4227 	buffer->cur++, result->type = CPP_AND_EQ;
   4228       break;
   4229 
   4230     case '|':
   4231       result->type = CPP_OR;
   4232       if (*buffer->cur == '|')
   4233 	buffer->cur++, result->type = CPP_OR_OR;
   4234       else if (*buffer->cur == '=')
   4235 	buffer->cur++, result->type = CPP_OR_EQ;
   4236       break;
   4237 
   4238     case ':':
   4239       result->type = CPP_COLON;
   4240       if (*buffer->cur == ':')
   4241 	{
   4242 	  if (CPP_OPTION (pfile, scope))
   4243 	    buffer->cur++, result->type = CPP_SCOPE;
   4244 	  else
   4245 	    result->flags |= COLON_SCOPE;
   4246 	}
   4247       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
   4248 	{
   4249 	  buffer->cur++;
   4250 	  result->flags |= DIGRAPH;
   4251 	  result->type = CPP_CLOSE_SQUARE;
   4252 	}
   4253       break;
   4254 
   4255     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
   4256     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
   4257     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
   4258     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
   4259     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
   4260 
   4261     case '?': result->type = CPP_QUERY; break;
   4262     case '~': result->type = CPP_COMPL; break;
   4263     case ',': result->type = CPP_COMMA; break;
   4264     case '(': result->type = CPP_OPEN_PAREN; break;
   4265     case ')': result->type = CPP_CLOSE_PAREN; break;
   4266     case '[': result->type = CPP_OPEN_SQUARE; break;
   4267     case ']': result->type = CPP_CLOSE_SQUARE; break;
   4268     case '{': result->type = CPP_OPEN_BRACE; break;
   4269     case '}': result->type = CPP_CLOSE_BRACE; break;
   4270     case ';': result->type = CPP_SEMICOLON; break;
   4271 
   4272       /* @ is a punctuator in Objective-C.  */
   4273     case '@': result->type = CPP_ATSIGN; break;
   4274 
   4275     default:
   4276       {
   4277 	const uchar *base = --buffer->cur;
   4278 	static int no_warn_cnt;
   4279 
   4280 	/* Check for an extended identifier ($ or UCN or UTF-8).  */
   4281 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   4282 	if (forms_identifier_p (pfile, true, &nst))
   4283 	  {
   4284 	    result->type = CPP_NAME;
   4285 	    const auto node = lex_identifier (pfile, base, true, &nst,
   4286 					      &result->val.node.spelling);
   4287 	    result->val.node.node = node;
   4288 	    identifier_diagnostics_on_lex (pfile, node);
   4289 	    warn_about_normalization (pfile, result, &nst, true);
   4290 	    break;
   4291 	  }
   4292 
   4293 	/* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
   4294 	   single token.  */
   4295 	buffer->cur++;
   4296 	if (c >= utf8_signifier)
   4297 	  {
   4298 	    const uchar *pstr = base;
   4299 	    cppchar_t s;
   4300 	    if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
   4301 	      {
   4302 		if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
   4303 		  {
   4304 		    buffer->cur = base;
   4305 		    _cpp_warn_invalid_utf8 (pfile);
   4306 		  }
   4307 		buffer->cur = pstr;
   4308 	      }
   4309 	    else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
   4310 	      {
   4311 		buffer->cur = base;
   4312 		const uchar *end = _cpp_warn_invalid_utf8 (pfile);
   4313 		buffer->cur = base + 1;
   4314 		no_warn_cnt = end - buffer->cur;
   4315 	      }
   4316 	  }
   4317 	else if (c >= utf8_continuation
   4318 		 && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
   4319 	  {
   4320 	    if (no_warn_cnt)
   4321 	      --no_warn_cnt;
   4322 	    else
   4323 	      {
   4324 		buffer->cur = base;
   4325 		_cpp_warn_invalid_utf8 (pfile);
   4326 		buffer->cur = base + 1;
   4327 	      }
   4328 	  }
   4329 	create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
   4330 	break;
   4331       }
   4332 
   4333     }
   4334 
   4335   /* Potentially convert the location of the token to a range.  */
   4336   if (result->src_loc >= RESERVED_LOCATION_COUNT
   4337       && result->type != CPP_EOF)
   4338     {
   4339       /* Ensure that any line notes are processed, so that we have the
   4340 	 correct physical line/column for the end-point of the token even
   4341 	 when a logical line is split via one or more backslashes.  */
   4342       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
   4343 	  && !pfile->overlaid_buffer)
   4344 	_cpp_process_line_notes (pfile, false);
   4345 
   4346       source_range tok_range;
   4347       tok_range.m_start = result->src_loc;
   4348       tok_range.m_finish
   4349 	= linemap_position_for_column (pfile->line_table,
   4350 				       CPP_BUF_COLUMN (buffer, buffer->cur));
   4351 
   4352       result->src_loc
   4353 	= pfile->line_table->get_or_create_combined_loc (result->src_loc,
   4354 							 tok_range, nullptr, 0);
   4355     }
   4356 
   4357   return result;
   4358 }
   4359 
   4360 /* An upper bound on the number of bytes needed to spell TOKEN.
   4361    Does not include preceding whitespace.  */
   4362 unsigned int
   4363 cpp_token_len (const cpp_token *token)
   4364 {
   4365   unsigned int len;
   4366 
   4367   switch (TOKEN_SPELL (token))
   4368     {
   4369     default:		len = 6;				break;
   4370     case SPELL_LITERAL:	len = token->val.str.len;		break;
   4371     case SPELL_IDENT:	len = NODE_LEN (token->val.node.node) * 10;	break;
   4372     }
   4373 
   4374   return len;
   4375 }
   4376 
/* Decode the UTF-8 sequence that starts at NAME and store the
   corresponding \UXXXXXXXX escape (eight lower-case hex digits, no
   terminating NUL) in BUFFER.  Exactly 10 bytes of BUFFER are written.
   Returns the number of bytes of NAME that the sequence occupied.
   Aborts if a trailing byte is not a UTF-8 continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  unsigned lead = *name;
  unsigned long code_point;

  /* The count of leading one bits in the first byte gives the length
     of the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload bits of the first byte are whatever follows the
     length marker.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each trailing byte contributes its low six bits.  */
  for (int k = 1; k < seq_len; k++)
    {
      const unsigned char trail = *++name;

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (int digit = 0; digit < 8; digit++)
    buffer[2 + digit] = hex_digits[(code_point >> (4 * (7 - digit))) & 0xF];

  return seq_len;
}
   4410 
   4411 /* Given a token TYPE corresponding to a digraph, return a pointer to
   4412    the spelling of the digraph.  */
   4413 static const unsigned char *
   4414 cpp_digraph2name (enum cpp_ttype type)
   4415 {
   4416   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
   4417 }
   4418 
   4419 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
   4420    The buffer must already contain enough space to hold the
   4421    token's spelling.  Returns a pointer to the character after the
   4422    last character written.  */
   4423 unsigned char *
   4424 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
   4425 {
   4426   size_t i;
   4427   const unsigned char *name = NODE_NAME (ident);
   4428 
   4429   for (i = 0; i < NODE_LEN (ident); i++)
   4430     if (name[i] & ~0x7F)
   4431       {
   4432 	i += utf8_to_ucn (buffer, name + i) - 1;
   4433 	buffer += 10;
   4434       }
   4435     else
   4436       *buffer++ = name[i];
   4437 
   4438   return buffer;
   4439 }
   4440 
   4441 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
   4442    already contain enough space to hold the token's spelling.
   4443    Returns a pointer to the character after the last character written.
   4444    FORSTRING is true if this is to be the spelling after translation
   4445    phase 1 (with the original spelling of extended identifiers), false
   4446    if extended identifiers should always be written using UCNs (there is
   4447    no option for always writing them in the internal UTF-8 form).
   4448    FIXME: Would be nice if we didn't need the PFILE argument.  */
   4449 unsigned char *
   4450 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
   4451 		 unsigned char *buffer, bool forstring)
   4452 {
   4453   switch (TOKEN_SPELL (token))
   4454     {
   4455     case SPELL_OPERATOR:
   4456       {
   4457 	const unsigned char *spelling;
   4458 	unsigned char c;
   4459 
   4460 	if (token->flags & DIGRAPH)
   4461 	  spelling = cpp_digraph2name (token->type);
   4462 	else if (token->flags & NAMED_OP)
   4463 	  goto spell_ident;
   4464 	else
   4465 	  spelling = TOKEN_NAME (token);
   4466 
   4467 	while ((c = *spelling++) != '\0')
   4468 	  *buffer++ = c;
   4469       }
   4470       break;
   4471 
   4472     spell_ident:
   4473     case SPELL_IDENT:
   4474       if (forstring)
   4475 	{
   4476 	  memcpy (buffer, NODE_NAME (token->val.node.spelling),
   4477 		  NODE_LEN (token->val.node.spelling));
   4478 	  buffer += NODE_LEN (token->val.node.spelling);
   4479 	}
   4480       else
   4481 	buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
   4482       break;
   4483 
   4484     case SPELL_LITERAL:
   4485       memcpy (buffer, token->val.str.text, token->val.str.len);
   4486       buffer += token->val.str.len;
   4487       break;
   4488 
   4489     case SPELL_NONE:
   4490       cpp_error (pfile, CPP_DL_ICE,
   4491 		 "unspellable token %s", TOKEN_NAME (token));
   4492       break;
   4493     }
   4494 
   4495   return buffer;
   4496 }
   4497 
   4498 /* Returns TOKEN spelt as a null-terminated string.  The string is
   4499    freed when the reader is destroyed.  Useful for diagnostics.  */
   4500 unsigned char *
   4501 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
   4502 {
   4503   unsigned int len = cpp_token_len (token) + 1;
   4504   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
   4505 
   4506   end = cpp_spell_token (pfile, token, start, false);
   4507   end[0] = '\0';
   4508 
   4509   return start;
   4510 }
   4511 
   4512 /* Returns a pointer to a string which spells the token defined by
   4513    TYPE and FLAGS.  Used by C front ends, which really should move to
   4514    using cpp_token_as_text.  */
   4515 const char *
   4516 cpp_type2name (enum cpp_ttype type, unsigned char flags)
   4517 {
   4518   if (flags & DIGRAPH)
   4519     return (const char *) cpp_digraph2name (type);
   4520   else if (flags & NAMED_OP)
   4521     return cpp_named_operator2name (type);
   4522 
   4523   return (const char *) token_spellings[type].name;
   4524 }
   4525 
   4526 /* Writes the spelling of token to FP, without any preceding space.
   4527    Separated from cpp_spell_token for efficiency - to avoid stdio
   4528    double-buffering.  */
   4529 void
   4530 cpp_output_token (const cpp_token *token, FILE *fp)
   4531 {
   4532   switch (TOKEN_SPELL (token))
   4533     {
   4534     case SPELL_OPERATOR:
   4535       {
   4536 	const unsigned char *spelling;
   4537 	int c;
   4538 
   4539 	if (token->flags & DIGRAPH)
   4540 	  spelling = cpp_digraph2name (token->type);
   4541 	else if (token->flags & NAMED_OP)
   4542 	  goto spell_ident;
   4543 	else
   4544 	  spelling = TOKEN_NAME (token);
   4545 
   4546 	c = *spelling;
   4547 	do
   4548 	  putc (c, fp);
   4549 	while ((c = *++spelling) != '\0');
   4550       }
   4551       break;
   4552 
   4553     spell_ident:
   4554     case SPELL_IDENT:
   4555       {
   4556 	size_t i;
   4557 	const unsigned char * name = NODE_NAME (token->val.node.node);
   4558 
   4559 	for (i = 0; i < NODE_LEN (token->val.node.node); i++)
   4560 	  if (name[i] & ~0x7F)
   4561 	    {
   4562 	      unsigned char buffer[10];
   4563 	      i += utf8_to_ucn (buffer, name + i) - 1;
   4564 	      fwrite (buffer, 1, 10, fp);
   4565 	    }
   4566 	  else
   4567 	    fputc (NODE_NAME (token->val.node.node)[i], fp);
   4568       }
   4569       break;
   4570 
   4571     case SPELL_LITERAL:
   4572       if (token->type == CPP_HEADER_NAME)
   4573 	fputc ('"', fp);
   4574       fwrite (token->val.str.text, 1, token->val.str.len, fp);
   4575       if (token->type == CPP_HEADER_NAME)
   4576 	fputc ('"', fp);
   4577       break;
   4578 
   4579     case SPELL_NONE:
   4580       /* An error, most probably.  */
   4581       break;
   4582     }
   4583 }
   4584 
   4585 /* Compare two tokens.  */
   4586 int
   4587 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
   4588 {
   4589   if (a->type == b->type && a->flags == b->flags)
   4590     switch (TOKEN_SPELL (a))
   4591       {
   4592       default:			/* Keep compiler happy.  */
   4593       case SPELL_OPERATOR:
   4594 	/* token_no is used to track where multiple consecutive ##
   4595 	   tokens were originally located.  */
   4596 	return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
   4597       case SPELL_NONE:
   4598 	return (a->type != CPP_MACRO_ARG
   4599 		|| (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
   4600 		    && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
   4601       case SPELL_IDENT:
   4602 	return (a->val.node.node == b->val.node.node
   4603 		&& a->val.node.spelling == b->val.node.spelling);
   4604       case SPELL_LITERAL:
   4605 	return (a->val.str.len == b->val.str.len
   4606 		&& !memcmp (a->val.str.text, b->val.str.text,
   4607 			    a->val.str.len));
   4608       }
   4609 
   4610   return 0;
   4611 }
   4612 
   4613 /* Returns nonzero if a space should be inserted to avoid an
   4614    accidental token paste for output.  For simplicity, it is
   4615    conservative, and occasionally advises a space where one is not
   4616    needed, e.g. "." and ".2".  */
   4617 int
   4618 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
   4619 		 const cpp_token *token2)
   4620 {
   4621   enum cpp_ttype a = token1->type, b = token2->type;
   4622   cppchar_t c;
   4623 
   4624   if (token1->flags & NAMED_OP)
   4625     a = CPP_NAME;
   4626   if (token2->flags & NAMED_OP)
   4627     b = CPP_NAME;
   4628 
   4629   c = EOF;
   4630   if (token2->flags & DIGRAPH)
   4631     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
   4632   else if (token_spellings[b].category == SPELL_OPERATOR)
   4633     c = token_spellings[b].name[0];
   4634 
   4635   /* Quickly get everything that can paste with an '='.  */
   4636   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
   4637     return 1;
   4638 
   4639   switch (a)
   4640     {
   4641     case CPP_GREATER:	return c == '>';
   4642     case CPP_LESS:	return c == '<' || c == '%' || c == ':';
   4643     case CPP_PLUS:	return c == '+';
   4644     case CPP_MINUS:	return c == '-' || c == '>';
   4645     case CPP_DIV:	return c == '/' || c == '*'; /* Comments.  */
   4646     case CPP_MOD:	return c == ':' || c == '>';
   4647     case CPP_AND:	return c == '&';
   4648     case CPP_OR:	return c == '|';
   4649     case CPP_COLON:	return c == ':' || c == '>';
   4650     case CPP_DEREF:	return c == '*';
   4651     case CPP_DOT:	return c == '.' || c == '%' || b == CPP_NUMBER;
   4652     case CPP_HASH:	return c == '#' || c == '%'; /* Digraph form.  */
   4653     case CPP_PRAGMA:
   4654     case CPP_NAME:	return ((b == CPP_NUMBER
   4655 				 && name_p (pfile, &token2->val.str))
   4656 				|| b == CPP_NAME
   4657 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
   4658     case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
   4659 				|| b == CPP_CHAR
   4660 				|| c == '.' || c == '+' || c == '-');
   4661 				      /* UCNs */
   4662     case CPP_OTHER:	return ((token1->val.str.text[0] == '\\'
   4663 				 && b == CPP_NAME)
   4664 				|| (CPP_OPTION (pfile, objc)
   4665 				    && token1->val.str.text[0] == '@'
   4666 				    && (b == CPP_NAME || b == CPP_STRING)));
   4667     case CPP_LESS_EQ:	return c == '>';
   4668     case CPP_STRING:
   4669     case CPP_WSTRING:
   4670     case CPP_UTF8STRING:
   4671     case CPP_STRING16:
   4672     case CPP_STRING32:	return (CPP_OPTION (pfile, user_literals)
   4673 				&& (b == CPP_NAME
   4674 				    || (TOKEN_SPELL (token2) == SPELL_LITERAL
   4675 					&& ISIDST (token2->val.str.text[0]))));
   4676 
   4677     default:		break;
   4678     }
   4679 
   4680   return 0;
   4681 }
   4682 
   4683 /* Output all the remaining tokens on the current line, and a newline
   4684    character, to FP.  Leading whitespace is removed.  If there are
   4685    macros, special token padding is not performed.  */
   4686 void
   4687 cpp_output_line (cpp_reader *pfile, FILE *fp)
   4688 {
   4689   const cpp_token *token;
   4690 
   4691   token = cpp_get_token (pfile);
   4692   while (token->type != CPP_EOF)
   4693     {
   4694       cpp_output_token (token, fp);
   4695       token = cpp_get_token (pfile);
   4696       if (token->flags & PREV_WHITE)
   4697 	putc (' ', fp);
   4698     }
   4699 
   4700   putc ('\n', fp);
   4701 }
   4702 
   4703 /* Return a string representation of all the remaining tokens on the
   4704    current line.  The result is allocated using xmalloc and must be
   4705    freed by the caller.  */
   4706 unsigned char *
   4707 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
   4708 {
   4709   const cpp_token *token;
   4710   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
   4711   unsigned int alloced = 120 + out;
   4712   unsigned char *result = (unsigned char *) xmalloc (alloced);
   4713 
   4714   /* If DIR_NAME is empty, there are no initial contents.  */
   4715   if (dir_name)
   4716     {
   4717       sprintf ((char *) result, "#%s ", dir_name);
   4718       out += 2;
   4719     }
   4720 
   4721   token = cpp_get_token (pfile);
   4722   while (token->type != CPP_EOF)
   4723     {
   4724       unsigned char *last;
   4725       /* Include room for a possible space and the terminating nul.  */
   4726       unsigned int len = cpp_token_len (token) + 2;
   4727 
   4728       if (out + len > alloced)
   4729 	{
   4730 	  alloced *= 2;
   4731 	  if (out + len > alloced)
   4732 	    alloced = out + len;
   4733 	  result = (unsigned char *) xrealloc (result, alloced);
   4734 	}
   4735 
   4736       last = cpp_spell_token (pfile, token, &result[out], 0);
   4737       out = last - result;
   4738 
   4739       token = cpp_get_token (pfile);
   4740       if (token->flags & PREV_WHITE)
   4741 	result[out++] = ' ';
   4742     }
   4743 
   4744   result[out] = '\0';
   4745   return result;
   4746 }
   4747 
   4748 /* Memory buffers.  Changing these three constants can have a dramatic
   4749    effect on performance.  The values here are reasonable defaults,
   4750    but might be tuned.  If you adjust them, be sure to test across a
   4751    range of uses of cpplib, including heavy nested function-like macro
   4752    expansion.  Also check the change in peak memory usage (NJAMD is a
   4753    good tool for this).  */
   4754 #define MIN_BUFF_SIZE 8000
   4755 #define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
   4756 #define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
   4757 	(MIN_EXTRA + ((BUFF)->limit - (BUFF)->cur) * 2)
   4758 
   4759 #if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
   4760   #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
   4761 #endif
   4762 
   4763 /* Create a new allocation buffer.  Place the control block at the end
   4764    of the buffer, so that buffer overflows will cause immediate chaos.  */
   4765 static _cpp_buff *
   4766 new_buff (size_t len)
   4767 {
   4768   _cpp_buff *result;
   4769   unsigned char *base;
   4770 
   4771   if (len < MIN_BUFF_SIZE)
   4772     len = MIN_BUFF_SIZE;
   4773   len = CPP_ALIGN (len);
   4774 
   4775 #ifdef ENABLE_VALGRIND_WORKAROUNDS
   4776   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
   4777      struct first.  */
   4778   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
   4779   base = XNEWVEC (unsigned char, len + slen);
   4780   result = (_cpp_buff *) base;
   4781   base += slen;
   4782 #else
   4783   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
   4784   result = (_cpp_buff *) (base + len);
   4785 #endif
   4786   result->base = base;
   4787   result->cur = base;
   4788   result->limit = base + len;
   4789   result->next = NULL;
   4790   return result;
   4791 }
   4792 
   4793 /* Place a chain of unwanted allocation buffers on the free list.  */
   4794 void
   4795 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
   4796 {
   4797   _cpp_buff *end = buff;
   4798 
   4799   while (end->next)
   4800     end = end->next;
   4801   end->next = pfile->free_buffs;
   4802   pfile->free_buffs = buff;
   4803 }
   4804 
   4805 /* Return a free buffer of size at least MIN_SIZE.  */
   4806 _cpp_buff *
   4807 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
   4808 {
   4809   _cpp_buff *result, **p;
   4810 
   4811   for (p = &pfile->free_buffs;; p = &(*p)->next)
   4812     {
   4813       size_t size;
   4814 
   4815       if (*p == NULL)
   4816 	return new_buff (min_size);
   4817       result = *p;
   4818       size = result->limit - result->base;
   4819       /* Return a buffer that's big enough, but don't waste one that's
   4820          way too big.  */
   4821       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
   4822 	break;
   4823     }
   4824 
   4825   *p = result->next;
   4826   result->next = NULL;
   4827   result->cur = result->base;
   4828   return result;
   4829 }
   4830 
   4831 /* Creates a new buffer with enough space to hold the uncommitted
   4832    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
   4833    the excess bytes to the new buffer.  Chains the new buffer after
   4834    BUFF, and returns the new buffer.  */
   4835 _cpp_buff *
   4836 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
   4837 {
   4838   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
   4839   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
   4840 
   4841   buff->next = new_buff;
   4842   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
   4843   return new_buff;
   4844 }
   4845 
   4846 /* Creates a new buffer with enough space to hold the uncommitted
   4847    remaining bytes of the buffer pointed to by BUFF, and at least
   4848    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
   4849    Chains the new buffer before the buffer pointed to by BUFF, and
   4850    updates the pointer to point to the new buffer.  */
   4851 void
   4852 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
   4853 {
   4854   _cpp_buff *new_buff, *old_buff = *pbuff;
   4855   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
   4856 
   4857   new_buff = _cpp_get_buff (pfile, size);
   4858   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
   4859   new_buff->next = old_buff;
   4860   *pbuff = new_buff;
   4861 }
   4862 
   4863 /* Free a chain of buffers starting at BUFF.  */
   4864 void
   4865 _cpp_free_buff (_cpp_buff *buff)
   4866 {
   4867   _cpp_buff *next;
   4868 
   4869   for (; buff; buff = next)
   4870     {
   4871       next = buff->next;
   4872 #ifdef ENABLE_VALGRIND_WORKAROUNDS
   4873       free (buff);
   4874 #else
   4875       free (buff->base);
   4876 #endif
   4877     }
   4878 }
   4879 
   4880 /* Allocate permanent, unaligned storage of length LEN.  */
   4881 unsigned char *
   4882 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
   4883 {
   4884   _cpp_buff *buff = pfile->u_buff;
   4885   unsigned char *result = buff->cur;
   4886 
   4887   if (len > (size_t) (buff->limit - result))
   4888     {
   4889       buff = _cpp_get_buff (pfile, len);
   4890       buff->next = pfile->u_buff;
   4891       pfile->u_buff = buff;
   4892       result = buff->cur;
   4893     }
   4894 
   4895   buff->cur = result + len;
   4896   return result;
   4897 }
   4898 
   4899 /* Allocate permanent, unaligned storage of length LEN from a_buff.
   4900    That buffer is used for growing allocations when saving macro
   4901    replacement lists in a #define, and when parsing an answer to an
   4902    assertion in #assert, #unassert or #if (and therefore possibly
   4903    whilst expanding macros).  It therefore must not be used by any
   4904    code that they might call: specifically the lexer and the guts of
   4905    the macro expander.
   4906 
   4907    All existing other uses clearly fit this restriction: storing
   4908    registered pragmas during initialization.  */
   4909 unsigned char *
   4910 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
   4911 {
   4912   _cpp_buff *buff = pfile->a_buff;
   4913   unsigned char *result = buff->cur;
   4914 
   4915   if (len > (size_t) (buff->limit - result))
   4916     {
   4917       buff = _cpp_get_buff (pfile, len);
   4918       buff->next = pfile->a_buff;
   4919       pfile->a_buff = buff;
   4920       result = buff->cur;
   4921     }
   4922 
   4923   buff->cur = result + len;
   4924   return result;
   4925 }
   4926 
   4927 /* Commit or allocate storage from a buffer.  */
   4928 
   4929 void *
   4930 _cpp_commit_buff (cpp_reader *pfile, size_t size)
   4931 {
   4932   void *ptr = BUFF_FRONT (pfile->a_buff);
   4933 
   4934   if (pfile->hash_table->alloc_subobject)
   4935     {
   4936       void *copy = pfile->hash_table->alloc_subobject (size);
   4937       memcpy (copy, ptr, size);
   4938       ptr = copy;
   4939     }
   4940   else
   4941     BUFF_FRONT (pfile->a_buff) += size;
   4942 
   4943   return ptr;
   4944 }
   4945 
   4946 /* Say which field of TOK is in use.  */
   4947 
   4948 enum cpp_token_fld_kind
   4949 cpp_token_val_index (const cpp_token *tok)
   4950 {
   4951   switch (TOKEN_SPELL (tok))
   4952     {
   4953     case SPELL_IDENT:
   4954       return CPP_TOKEN_FLD_NODE;
   4955     case SPELL_LITERAL:
   4956       return CPP_TOKEN_FLD_STR;
   4957     case SPELL_OPERATOR:
   4958       /* Operands which were originally spelled as ident keep around
   4959          the node for the exact spelling.  */
   4960       if (tok->flags & NAMED_OP)
   4961 	return CPP_TOKEN_FLD_NODE;
   4962       else if (tok->type == CPP_PASTE)
   4963 	return CPP_TOKEN_FLD_TOKEN_NO;
   4964       else
   4965 	return CPP_TOKEN_FLD_NONE;
   4966     case SPELL_NONE:
   4967       if (tok->type == CPP_MACRO_ARG)
   4968 	return CPP_TOKEN_FLD_ARG_NO;
   4969       else if (tok->type == CPP_PADDING)
   4970 	return CPP_TOKEN_FLD_SOURCE;
   4971       else if (tok->type == CPP_PRAGMA)
   4972 	return CPP_TOKEN_FLD_PRAGMA;
   4973       /* fall through */
   4974     default:
   4975       return CPP_TOKEN_FLD_NONE;
   4976     }
   4977 }
   4978 
/* All tokens lexed in R after calling this function will be forced to
   have their location_t to be LOC, until
   cpp_stop_forcing_token_locations is called for R.  The forcing is
   requested by storing LOC in R's forced_token_location field.  */

void
cpp_force_token_locations (cpp_reader *r, location_t loc)
{
  r->forced_token_location = loc;
}
   4988 
/* Go back to assigning locations naturally for lexed tokens, by
   resetting R's forced_token_location field to zero.  */

void
cpp_stop_forcing_token_locations (cpp_reader *r)
{
  r->forced_token_location = 0;
}
   4996 
/* PEEK points at a backslash.  While that backslash escapes an end of
   line (LF, CR or CR-LF), step past the backslash and the line ending;
   repeat for as long as the next character is again such a backslash.
   Never advance to LIMIT or beyond: if stepping would reach it, the
   position found so far is returned instead.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      unsigned skip;

      if (__builtin_expect (peek[1] == '\n', true))
        skip = 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A CR may be followed by the LF of a CR-LF pair.  */
        skip = peek[2] == '\n' ? 3 : 2;
      else
        /* Not an escaped end of line.  */
        return peek;

      if (!__builtin_expect (peek + skip < limit, true))
        return peek;

      peek += skip;

      /* The user might be perverse and chain several continuations.  */
      if (*peek != '\\')
        return peek;
    }
}
   5026 
   5027 static const unsigned char *
   5028 do_peek_next (const unsigned char *peek, const unsigned char *limit)
   5029 {
   5030   if (__builtin_expect (*peek == '\\', false))
   5031     peek = do_peek_backslash (peek, limit);
   5032   return peek;
   5033 }
   5034 
/* Step backwards from PEEK and return a pointer to the preceding
   character, looking through backslash-escaped line endings (LF, CR
   or CR-LF) in the same way that do_peek_backslash does going
   forwards.  BOUND is the lowest address that may be examined.

   Returns NULL if PEEK is already at BOUND (including when an escaping
   backslash turns out to be the first character).  If the preceding
   character is a line ending that is not escaped, or that sits at
   BOUND, the line ending itself is returned.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* A carriage return is '\r'; comparing against the letter 'r' would
     treat "\r" spelt with a real letter as a line continuation and
     miss genuine CR line endings.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
	return peek;

      /* Offset, relative to PEEK, of the character in front of the
	 line ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
	{
	  /* CR-LF: look in front of the CR as well, if there is
	     anything there.  */
	  if (peek + ix == bound)
	    return peek;
	  ix--;
	}

      /* An escaped line ending is invisible: continue from the
	 backslash.  */
      if (peek[ix] == '\\')
	return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
   5063 
   5064 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
   5065    space.  Otherwise return NULL.  */
   5066 
   5067 static const unsigned char *
   5068 do_peek_ident (const char *match, const unsigned char *peek,
   5069 	       const unsigned char *limit)
   5070 {
   5071   for (; *++match; peek++)
   5072     if (*peek != *match)
   5073       {
   5074 	peek = do_peek_next (peek, limit);
   5075 	if (*peek != *match)
   5076 	  return NULL;
   5077       }
   5078 
   5079   /* Must now not be looking at an identifier char.  */
   5080   peek = do_peek_next (peek, limit);
   5081   if (ISIDNUM (*peek))
   5082     return NULL;
   5083 
   5084   /* Skip control-line whitespace.  */
   5085  ws:
   5086   while (*peek == ' ' || *peek == '\t')
   5087     peek++;
   5088   if (__builtin_expect (*peek == '\\', false))
   5089     {
   5090       peek = do_peek_backslash (peek, limit);
   5091       if (*peek != '\\')
   5092 	goto ws;
   5093     }
   5094 
   5095   return peek;
   5096 }
   5097 
/* Are we looking at a module control line starting at PEEK - 1?

   C is the character at PEEK - 1, i.e. the first character of the
   candidate keyword, and PEEK points just after it.  LIMIT bounds the
   look-ahead performed through the do_peek_* helpers.  Returns true
   when the text spells "export import", "export module", "import",
   "__import" or "module" and the first character after the keyword
   (and any control-line whitespace) could begin a token that is
   acceptable there; returns false otherwise.  Nothing is consumed:
   only local copies of the pointers are advanced.  */

static bool
do_peek_module (cpp_reader *pfile, unsigned char c,
		const unsigned char *peek, const unsigned char *limit)
{
  /* Set when the keyword found is import (or __import) rather than
     module; the two accept different following tokens.  */
  bool import = false;

  /* In each branch the second character is tested cheaply first (a
     backslash may start an escaped line ending inside the keyword)
     before do_peek_ident matches the rest.  */
  if (__builtin_expect (c == 'e', false))
    {
      if (!((peek[0] == 'x' || peek[0] == '\\')
	    && (peek = do_peek_ident ("export", peek, limit))))
	return false;

      /* export, peek for import or module.  No need to peek __import
	 here.  */
      if (peek[0] == 'i')
	{
	  if (!((peek[1] == 'm' || peek[1] == '\\')
		&& (peek = do_peek_ident ("import", peek + 1, limit))))
	    return false;
	  import = true;
	}
      else if (peek[0] == 'm')
	{
	  if (!((peek[1] == 'o' || peek[1] == '\\')
		&& (peek = do_peek_ident ("module", peek + 1, limit))))
	    return false;
	}
      else
	return false;
    }
  else if (__builtin_expect (c == 'i', false))
    {
      if (!((peek[0] == 'm' || peek[0] == '\\')
	    && (peek = do_peek_ident ("import", peek, limit))))
	return false;
      import = true;
    }
  else if (__builtin_expect (c == '_', false))
    {
      /* Needed for translated includes.   */
      if (!((peek[0] == '_' || peek[0] == '\\')
	    && (peek = do_peek_ident ("__import", peek, limit))))
	return false;
      import = true;
    }
  else if (__builtin_expect (c == 'm', false))
    {
      if (!((peek[0] == 'o' || peek[0] == '\\')
	    && (peek = do_peek_ident ("module", peek, limit))))
	return false;
    }
  else
    return false;

  /* Peek the next character to see if it's good enough.  We'll be at
     the first non-whitespace char, including skipping an escaped
     newline.  */
  /* ... import followed by identifier, ':', '<' or header-name
     preprocessing tokens, or module followed by identifier, ':' or
     ';' preprocessing tokens.  */
  unsigned char p = *peek++;

  /* A character literal is ... single quotes, ... optionally preceded
     by u8, u, U, or L */
  /* A string-literal is a ... double quotes, optionally prefixed by
     R, u8, u8R, u, uR, U, UR, L, or LR */
  /* The branches below therefore reject a literal prefix that is
     directly followed by a quote, and otherwise treat the prefix
     letters as the start of an identifier.  */
  if (p == 'u')
    {
      peek = do_peek_next (peek, limit);
      if (*peek == '8')
	{
	  peek++;
	  goto peek_u8;
	}
      goto peek_u;
    }
  else if (p == 'U' || p == 'L')
    {
    peek_u8:
      peek = do_peek_next (peek, limit);
    peek_u:
      if (*peek == '\"' || *peek == '\'')
	return false;

      if (*peek == 'R')
	goto peek_R;
      /* Identifier. Ok.  */
    }
  else if (p == 'R')
    {
    peek_R:
      /* An R prefix only introduces a raw string when raw literals are
	 enabled.  */
      if (CPP_OPTION (pfile, rliterals))
	{
	  peek = do_peek_next (peek, limit);
	  if (*peek == '\"')
	    return false;
	}
      /* Identifier. Ok.  */
    }
  else if ('Z' - 'A' == 25
	   ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
	   : ISIDST (p))
    {
      /* Identifier.  Ok. */
    }
  else if (p == '<')
    {
      /* Maybe angle header, ok for import.  Reject
	 '<=', '<<' digraph:'<:'.  */
      if (!import)
	return false;
      peek = do_peek_next (peek, limit);
      if (*peek == '=' || *peek == '<'
	  || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
	return false;
    }
  else if (p == ';')
    {
      /* SEMICOLON, ok for module.  */
      if (import)
	return false;
    }
  else if (p == '"')
    {
      /* STRING, ok for import.  */
      if (!import)
	return false;
    }
  else if (p == ':')
    {
      /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
      peek = do_peek_next (peek, limit);
      if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
	return false;
    }
  else
    /* FIXME: Detect a unicode character, excluding those not
       permitted as the initial character. [lex.name]/1.  I presume
       we need to check the \[uU] spellings, and directly using
       Unicode in say UTF8 form?  Or perhaps we do the phase-1
       conversion of UTF8 to universal-character-names?  */
    return false;

  return true;
}
   5245 
   5246 /* Directives-only scanning.  Somewhat more relaxed than correct
   5247    parsing -- some ill-formed programs will not be rejected.  */
   5248 
   5249 void
   5250 cpp_directive_only_process (cpp_reader *pfile,
   5251 			    void *data,
   5252 			    void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
   5253 {
   5254   bool module_p = CPP_OPTION (pfile, module_directives);
   5255 
   5256   do
   5257     {
   5258     restart:
   5259       /* Buffer initialization, but no line cleaning. */
   5260       cpp_buffer *buffer = pfile->buffer;
   5261       buffer->cur_note = buffer->notes_used = 0;
   5262       buffer->cur = buffer->line_base = buffer->next_line;
   5263       buffer->need_line = false;
   5264       /* Files always end in a newline or carriage return.  We rely on this for
   5265 	 character peeking safety.  */
   5266       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
   5267 
   5268       const unsigned char *base = buffer->cur;
   5269       unsigned line_count = 0;
   5270       const unsigned char *line_start = base;
   5271 
   5272       bool bol = true;
   5273       bool raw = false;
   5274 
   5275       const unsigned char *lwm = base;
   5276       for (const unsigned char *pos = base, *limit = buffer->rlimit;
   5277 	   pos < limit;)
   5278 	{
   5279 	  unsigned char c = *pos++;
   5280 	  /* This matches the switch in _cpp_lex_direct.  */
   5281 	  switch (c)
   5282 	    {
   5283 	    case ' ': case '\t': case '\f': case '\v':
   5284 	      /* Whitespace, do nothing.  */
   5285 	      break;
   5286 
   5287 	    case '\r': /* MAC line ending, or Windows \r\n  */
   5288 	      if (*pos == '\n')
   5289 		pos++;
   5290 	      /* FALLTHROUGH */
   5291 
   5292 	    case '\n':
   5293 	      bol = true;
   5294 
   5295 	    next_line:
   5296 	      CPP_INCREMENT_LINE (pfile, 0);
   5297 	      line_count++;
   5298 	      line_start = pos;
   5299 	      break;
   5300 
   5301 	    case '\\':
   5302 	      /* <backslash><newline> is removed, and doesn't undo any
   5303 		 preceeding escape or whatnot.  */
   5304 	      if (*pos == '\n')
   5305 		{
   5306 		  pos++;
   5307 		  goto next_line;
   5308 		}
   5309 	      else if (*pos == '\r')
   5310 		{
   5311 		  if (pos[1] == '\n')
   5312 		    pos++;
   5313 		  pos++;
   5314 		  goto next_line;
   5315 		}
   5316 	      goto dflt;
   5317 
   5318 	    case '#':
   5319 	      if (bol)
   5320 		{
   5321 		  /* Line directive.  */
   5322 		  if (pos - 1 > base && !pfile->state.skipping)
   5323 		    cb (pfile, CPP_DO_print, data,
   5324 			line_count, base, pos - 1 - base);
   5325 
   5326 		  /* Prep things for directive handling. */
   5327 		  buffer->next_line = pos;
   5328 		  buffer->need_line = true;
   5329 		  bool ok = _cpp_get_fresh_line (pfile);
   5330 		  gcc_checking_assert (ok);
   5331 
   5332 		  /* Ensure proper column numbering for generated
   5333 		     error messages. */
   5334 		  buffer->line_base -= pos - line_start;
   5335 
   5336 		  _cpp_handle_directive (pfile, line_start + 1 != pos);
   5337 
   5338 		  /* Sanitize the line settings.  Duplicate #include's can
   5339 		     mess things up. */
   5340 		  // FIXME: Necessary?
   5341 		  pfile->line_table->highest_location
   5342 		    = pfile->line_table->highest_line;
   5343 
   5344 		  if (!pfile->state.skipping
   5345 		      && pfile->buffer->next_line < pfile->buffer->rlimit)
   5346 		    cb (pfile, CPP_DO_location, data,
   5347 			pfile->line_table->highest_line);
   5348 
   5349 		  goto restart;
   5350 		}
   5351 	      goto dflt;
   5352 
   5353 	    case '/':
   5354 	      {
   5355 		const unsigned char *peek = do_peek_next (pos, limit);
   5356 		if (!(*peek == '/' || *peek == '*'))
   5357 		  goto dflt;
   5358 
   5359 		/* Line or block comment  */
   5360 		bool is_block = *peek == '*';
   5361 		bool star = false;
   5362 		bool esc = false;
   5363 		location_t sloc
   5364 		  = linemap_position_for_column (pfile->line_table,
   5365 						 pos - line_start);
   5366 
   5367 		while (pos < limit)
   5368 		  {
   5369 		    char c = *pos++;
   5370 		    switch (c)
   5371 		      {
   5372 		      case '\\':
   5373 			esc = true;
   5374 			break;
   5375 
   5376 		      case '\r':
   5377 			if (*pos == '\n')
   5378 			  pos++;
   5379 			/* FALLTHROUGH  */
   5380 
   5381 		      case '\n':
   5382 			{
   5383 			  CPP_INCREMENT_LINE (pfile, 0);
   5384 			  line_count++;
   5385 			  line_start = pos;
   5386 			  if (!esc && !is_block)
   5387 			    {
   5388 			      bol = true;
   5389 			      goto done_comment;
   5390 			    }
   5391 			}
   5392 			if (!esc)
   5393 			  star = false;
   5394 			esc = false;
   5395 			break;
   5396 
   5397 		      case '*':
   5398 			if (pos > peek)
   5399 			  star = is_block;
   5400 			esc = false;
   5401 			break;
   5402 
   5403 		      case '/':
   5404 			if (star)
   5405 			  goto done_comment;
   5406 			/* FALLTHROUGH  */
   5407 
   5408 		      default:
   5409 			star = false;
   5410 			esc = false;
   5411 			break;
   5412 		      }
   5413 		  }
   5414 		if (pos < limit || is_block)
   5415 		  cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
   5416 				       "unterminated comment");
   5417 	      done_comment:
   5418 		lwm = pos;
   5419 		break;
   5420 	      }
   5421 
   5422 	    case '\'':
   5423 	      if (!CPP_OPTION (pfile, digit_separators))
   5424 		goto delimited_string;
   5425 
   5426 	      /* Possibly a number punctuator.  */
   5427 	      if (!ISIDNUM (*do_peek_next (pos, limit)))
   5428 		goto delimited_string;
   5429 
   5430 	      goto quote_peek;
   5431 
   5432 	    case '\"':
   5433 	      if (!CPP_OPTION (pfile, rliterals))
   5434 		goto delimited_string;
   5435 
   5436 	    quote_peek:
   5437 	      {
   5438 		/* For ' see if it's a number punctuator
   5439 		   \.?<digit>(<digit>|<identifier-nondigit>
   5440 		   |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
   5441 		/* For " see if it's a raw string
   5442 		   {U,L,u,u8}R.  This includes CPP_NUMBER detection,
   5443 		   because that could be 0e+R.  */
   5444 		const unsigned char *peek = pos - 1;
   5445 		bool quote_first = c == '"';
   5446 		bool quote_eight = false;
   5447 		bool maybe_number_start = false;
   5448 		bool want_number = false;
   5449 
   5450 		while ((peek = do_peek_prev (peek, lwm)))
   5451 		  {
   5452 		    unsigned char p = *peek;
   5453 		    if (quote_first)
   5454 		      {
   5455 			if (!raw)
   5456 			  {
   5457 			    if (p != 'R')
   5458 			      break;
   5459 			    raw = true;
   5460 			    continue;
   5461 			  }
   5462 
   5463 			quote_first = false;
   5464 			if (p == 'L' || p == 'U' || p == 'u')
   5465 			  ;
   5466 			else if (p == '8')
   5467 			  quote_eight = true;
   5468 			else
   5469 			  goto second_raw;
   5470 		      }
   5471 		    else if (quote_eight)
   5472 		      {
   5473 			if (p != 'u')
   5474 			  {
   5475 			    raw = false;
   5476 			    break;
   5477 			  }
   5478 			quote_eight = false;
   5479 		      }
   5480 		    else if (c == '"')
   5481 		      {
   5482 		      second_raw:;
   5483 			if (!want_number && ISIDNUM (p))
   5484 			  {
   5485 			    raw = false;
   5486 			    break;
   5487 			  }
   5488 		      }
   5489 
   5490 		    if (ISDIGIT (p))
   5491 		      maybe_number_start = true;
   5492 		    else if (p == '.')
   5493 		      want_number = true;
   5494 		    else if (ISIDNUM (p))
   5495 		      maybe_number_start = false;
   5496 		    else if (p == '+' || p == '-')
   5497 		      {
   5498 			if (const unsigned char *peek_prev
   5499 			    = do_peek_prev (peek, lwm))
   5500 			  {
   5501 			    p = *peek_prev;
   5502 			    if (p == 'e' || p == 'E'
   5503 				|| p == 'p' || p == 'P')
   5504 			      {
   5505 				want_number = true;
   5506 				maybe_number_start = false;
   5507 			      }
   5508 			    else
   5509 			      break;
   5510 			  }
   5511 			else
   5512 			  break;
   5513 		      }
   5514 		    else if (p == '\'' || p == '\"')
   5515 		      {
   5516 			/* If this is lwm, this must be the end of a
   5517 			   previous string.  So this is a trailing
   5518 			   literal type, (a) if those are allowed,
   5519 			     and (b) maybe_start is false.  Otherwise
   5520 			     this must be a CPP_NUMBER because we've
   5521 			     met another ', and we'd have checked that
   5522 			     in its own right.  */
   5523 			if (peek == lwm && CPP_OPTION (pfile, uliterals))
   5524 			  {
   5525 			    if  (!maybe_number_start && !want_number)
   5526 			      /* Must be a literal type.  */
   5527 			      raw = false;
   5528 			  }
   5529 			else if (p == '\''
   5530 				 && CPP_OPTION (pfile, digit_separators))
   5531 			  maybe_number_start = true;
   5532 			break;
   5533 		      }
   5534 		    else if (c == '\'')
   5535 		      break;
   5536 		    else if (!quote_first && !quote_eight)
   5537 		      break;
   5538 		  }
   5539 
   5540 		if (maybe_number_start)
   5541 		  {
   5542 		    if (c == '\'')
   5543 		      /* A CPP NUMBER.  */
   5544 		      goto dflt;
   5545 		    raw = false;
   5546 		  }
   5547 
   5548 		goto delimited_string;
   5549 	      }
   5550 
   5551 	    delimited_string:
   5552 	      {
   5553 		/* (Possibly raw) string or char literal.  */
   5554 		unsigned char end = c;
   5555 		int delim_len = -1;
   5556 		const unsigned char *delim = NULL;
   5557 		location_t sloc = linemap_position_for_column (pfile->line_table,
   5558 							       pos - line_start);
   5559 		int esc = 0;
   5560 
   5561 		if (raw)
   5562 		  {
   5563 		    /* There can be no line breaks in the delimiter.  */
   5564 		    delim = pos;
   5565 		    for (delim_len = 0; (c = *pos++) != '('; delim_len++)
   5566 		      {
   5567 			if (delim_len == 16)
   5568 			  {
   5569 			    cpp_error_with_line (pfile, CPP_DL_ERROR,
   5570 						 sloc, 0,
   5571 						 "raw string delimiter"
   5572 						 " longer than %d"
   5573 						 " characters",
   5574 						 delim_len);
   5575 			    raw = false;
   5576 			    pos = delim;
   5577 			    break;
   5578 			  }
   5579 			if (strchr (") \\\t\v\f\n", c))
   5580 			  {
   5581 			    cpp_error_with_line (pfile, CPP_DL_ERROR,
   5582 						 sloc, 0,
   5583 						 "invalid character '%c'"
   5584 						 " in raw string"
   5585 						 " delimiter", c);
   5586 			    raw = false;
   5587 			    pos = delim;
   5588 			    break;
   5589 			  }
   5590 			if (pos >= limit)
   5591 			  goto bad_string;
   5592 		      }
   5593 		  }
   5594 
   5595 		while (pos < limit)
   5596 		  {
   5597 		    char c = *pos++;
   5598 		    switch (c)
   5599 		      {
   5600 		      case '\\':
   5601 			if (!raw)
   5602 			  esc++;
   5603 			break;
   5604 
   5605 		      case '\r':
   5606 			if (*pos == '\n')
   5607 			  pos++;
   5608 			/* FALLTHROUGH  */
   5609 
   5610 		      case '\n':
   5611 			{
   5612 			  CPP_INCREMENT_LINE (pfile, 0);
   5613 			  line_count++;
   5614 			  line_start = pos;
   5615 			}
   5616 			if (esc)
   5617 			  esc--;
   5618 			break;
   5619 
   5620 		      case ')':
   5621 			if (raw
   5622 			    && pos + delim_len + 1 < limit
   5623 			    && pos[delim_len] == end
   5624 			    && !memcmp (delim, pos, delim_len))
   5625 			  {
   5626 			    pos += delim_len + 1;
   5627 			    raw = false;
   5628 			    goto done_string;
   5629 			  }
   5630 			break;
   5631 
   5632 		      default:
   5633 			if (!raw && !(esc & 1) && c == end)
   5634 			  goto done_string;
   5635 			esc = 0;
   5636 			break;
   5637 		      }
   5638 		  }
   5639 	      bad_string:
   5640 		cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
   5641 				     "unterminated literal");
   5642 
   5643 	      done_string:
   5644 		raw = false;
   5645 		lwm = pos - 1;
   5646 	      }
   5647 	      goto dflt;
   5648 
   5649 	    case '_':
   5650 	    case 'e':
   5651 	    case 'i':
   5652 	    case 'm':
   5653 	      if (bol && module_p && !pfile->state.skipping
   5654 		  && do_peek_module (pfile, c, pos, limit))
   5655 		{
   5656 		  /* We've seen the start of a module control line.
   5657 		     Start up the tokenizer.  */
   5658 		  pos--; /* Backup over the first character.  */
   5659 
   5660 		  /* Backup over whitespace to start of line.  */
   5661 		  while (pos > line_start
   5662 			 && (pos[-1] == ' ' || pos[-1] == '\t'))
   5663 		    pos--;
   5664 
   5665 		  if (pos > base)
   5666 		    cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
   5667 
   5668 		  /* Prep things for directive handling. */
   5669 		  buffer->next_line = pos;
   5670 		  buffer->need_line = true;
   5671 
   5672 		  /* Now get tokens until the PRAGMA_EOL.  */
   5673 		  do
   5674 		    {
   5675 		      location_t spelling;
   5676 		      const cpp_token *tok
   5677 			= cpp_get_token_with_location (pfile, &spelling);
   5678 
   5679 		      gcc_assert (pfile->state.in_deferred_pragma
   5680 				  || tok->type == CPP_PRAGMA_EOL);
   5681 		      cb (pfile, CPP_DO_token, data, tok, spelling);
   5682 		    }
   5683 		  while (pfile->state.in_deferred_pragma);
   5684 
   5685 		  if (pfile->buffer->next_line < pfile->buffer->rlimit)
   5686 		    cb (pfile, CPP_DO_location, data,
   5687 			pfile->line_table->highest_line);
   5688 
   5689 		  pfile->mi_valid = false;
   5690 		  goto restart;
   5691 		}
   5692 	      goto dflt;
   5693 
   5694 	    default:
   5695 	    dflt:
   5696 	      bol = false;
   5697 	      pfile->mi_valid = false;
   5698 	      break;
   5699 	    }
   5700 	}
   5701 
   5702       if (buffer->rlimit > base && !pfile->state.skipping)
   5703 	{
   5704 	  const unsigned char *limit = buffer->rlimit;
   5705 	  /* If the file was not newline terminated, add rlimit, which is
   5706 	     guaranteed to point to a newline, to the end of our range.  */
   5707 	  if (limit[-1] != '\n')
   5708 	    {
   5709 	      limit++;
   5710 	      CPP_INCREMENT_LINE (pfile, 0);
   5711 	      line_count++;
   5712 	    }
   5713 	  cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
   5714 	}
   5715 
   5716       _cpp_pop_buffer (pfile);
   5717     }
   5718   while (pfile->buffer);
   5719 }
   5720