Home | History | Annotate | Line # | Download | only in libcpp
lex.cc revision 1.3
      1 /* CPP Library - lexical analysis.
      2    Copyright (C) 2000-2022 Free Software Foundation, Inc.
      3    Contributed by Per Bothner, 1994-95.
      4    Based on CCCP program by Paul Rubin, June 1986
      5    Adapted to ANSI C, Richard Stallman, Jan 1987
      6    Broken out to separate file, Zack Weinberg, Mar 2000
      7 
      8 This program is free software; you can redistribute it and/or modify it
      9 under the terms of the GNU General Public License as published by the
     10 Free Software Foundation; either version 3, or (at your option) any
     11 later version.
     12 
     13 This program is distributed in the hope that it will be useful,
     14 but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     16 GNU General Public License for more details.
     17 
     18 You should have received a copy of the GNU General Public License
     19 along with this program; see the file COPYING3.  If not see
     20 <http://www.gnu.org/licenses/>.  */
     21 
     22 #include "config.h"
     23 #include "system.h"
     24 #include "cpplib.h"
     25 #include "internal.h"
     26 
/* Spelling category of a token type, stored in the CATEGORY field of
   each token_spellings entry and read back through TOKEN_SPELL.  */
enum spell_type
{
  /* Assigned by every OP() entry of TTYPE_TABLE.  */
  SPELL_OPERATOR = 0,
  /* A TK() entry picks its category by pasting its second argument
     onto "SPELL_".  NOTE(review): how each of the categories below is
     turned back into text is decided by code outside this excerpt.  */
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
     34 
/* One row of the token_spellings table: the spelling category of a
   token type together with a name string for it.  */
struct token_spelling
{
  enum spell_type category;	/* Read through TOKEN_SPELL.  */
  const unsigned char *name;	/* Read through TOKEN_NAME.  */
};
     40 
/* Spellings of the digraphs.  NOTE(review): the users of this table
   are outside this excerpt; presumably the order has to match the way
   they index it -- confirm before reordering.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Build token_spellings by expanding TTYPE_TABLE.  An OP(e, s) entry
   becomes { SPELL_OPERATOR, s }; a TK(e, s) entry becomes
   { SPELL_<s>, "<e>" }, i.e. the category named by S and the
   stringified enumerator E.  Both helper macros are undefined again
   right after the table has been built.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category / name of TOKEN by its type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
     52 
/* Forward declarations of helpers that are local to this file.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
			    unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
     66 
     67 
     68 /* Utility routine:
     69 
     70    Compares, the token TOKEN to the NUL-terminated string STRING.
     71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
     72 int
     73 cpp_ideq (const cpp_token *token, const char *string)
     74 {
     75   if (token->type != CPP_NAME)
     76     return 0;
     77 
     78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
     79 }
     80 
     81 /* Record a note TYPE at byte POS into the current cleaned logical
     82    line.  */
     83 static void
     84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
     85 {
     86   if (buffer->notes_used == buffer->notes_cap)
     87     {
     88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
     89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
     90                                   buffer->notes_cap);
     91     }
     92 
     93   buffer->notes[buffer->notes_used].pos = pos;
     94   buffer->notes[buffer->notes_used].type = type;
     95   buffer->notes_used++;
     96 }
     97 
     98 
     99 /* Fast path to find line special characters using optimized character
    101    scanning algorithms.  Anything complicated falls back to the slow
    102    path below.  Since this loop is very hot it's worth doing these kinds
    103    of optimizations.
    104 
    105    One of the paths through the ifdefs should provide
    106 
    107      const uchar *search_line_fast (const uchar *s, const uchar *end);
    108 
    109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
    110    the found character.
    111 
    112    Note that the last character of the buffer is *always* a newline,
    113    as forced by _cpp_convert_input.  This fact can be used to avoid
    114    explicitly looking for the end of the buffer.  */
    115 
/* Configure gives us an ifdef test.  Turn it into a value that can be
   tested in ordinary C conditions: 0 unless configure defined it.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array
   bound evaluates to 1 when the size is acceptable and to -1, an
   invalid bound, when it is not.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
    135 
    136 /* Return X with the first N bytes forced to values that won't match one
    137    of the interesting characters.  Note that NUL is not interesting.  */
    138 
    139 static inline word_type
    140 acc_char_mask_misalign (word_type val, unsigned int n)
    141 {
    142   word_type mask = -1;
    143   if (WORDS_BIGENDIAN)
    144     mask >>= n * 8;
    145   else
    146     mask <<= n * 8;
    147   return val & mask;
    148 }
    149 
    150 /* Return X replicated to all byte positions within WORD_TYPE.  */
    151 
    152 static inline word_type
    153 acc_char_replicate (uchar x)
    154 {
    155   word_type ret;
    156 
    157   ret = (x << 24) | (x << 16) | (x << 8) | x;
    158   if (sizeof(word_type) == 8)
    159     ret = (ret << 16 << 16) | ret;
    160   return ret;
    161 }
    162 
    163 /* Return non-zero if some byte of VAL is (probably) C.  */
    164 
    165 static inline word_type
    166 acc_char_cmp (word_type val, word_type c)
    167 {
    168 #if defined(__GNUC__) && defined(__alpha__)
    169   /* We can get exact results using a compare-bytes instruction.
    170      Get (val == c) via (0 >= (val ^ c)).  */
    171   return __builtin_alpha_cmpbge (0, val ^ c);
    172 #else
    173   word_type magic = 0x7efefefeU;
    174   if (sizeof(word_type) == 8)
    175     magic = (magic << 16 << 16) | 0xfefefefeU;
    176   magic |= 1;
    177 
    178   val ^= c;
    179   return ((val + magic) ^ ~val) & ~magic;
    180 #endif
    181 }
    182 
    183 /* Given the result of acc_char_cmp is non-zero, return the index of
    184    the found character.  If this was a false positive, return -1.  */
    185 
    186 static inline int
    187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
    188 		word_type val ATTRIBUTE_UNUSED)
    189 {
    190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
    191   /* The cmpbge instruction sets *bits* of the result corresponding to
    192      matches in the bytes with no false positives.  */
    193   return __builtin_ctzl (cmp);
    194 #else
    195   unsigned int i;
    196 
    197   /* ??? It would be nice to force unrolling here,
    198      and have all of these constants folded.  */
    199   for (i = 0; i < sizeof(word_type); ++i)
    200     {
    201       uchar c;
    202       if (WORDS_BIGENDIAN)
    203 	c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
    204       else
    205 	c = (val >> i * 8) & 0xff;
    206 
    207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
    208 	return i;
    209     }
    210 
    211   return -1;
    212 #endif
    213 }
    214 
    215 /* A version of the fast scanner using bit fiddling techniques.
    216 
    217    For 32-bit words, one would normally perform 16 comparisons and
    218    16 branches.  With this algorithm one performs 24 arithmetic
    219    operations and one branch.  Whether this is faster with a 32-bit
    220    word size is going to be somewhat system dependent.
    221 
    222    For 64-bit words, we eliminate twice the number of comparisons
    223    and branches without increasing the number of arithmetic operations.
    224    It's almost certainly going to be a win with 64-bit word size.  */
    225 
    226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
    227   ATTRIBUTE_UNUSED;
    228 
    229 static const uchar *
    230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    231 {
    232   const word_type repl_nl = acc_char_replicate ('\n');
    233   const word_type repl_cr = acc_char_replicate ('\r');
    234   const word_type repl_bs = acc_char_replicate ('\\');
    235   const word_type repl_qm = acc_char_replicate ('?');
    236 
    237   unsigned int misalign;
    238   const word_type *p;
    239   word_type val, t;
    240 
    241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
    242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
    243   val = *p;
    244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
    245   if (misalign)
    246     val = acc_char_mask_misalign (val, misalign);
    247 
    248   /* Main loop.  */
    249   while (1)
    250     {
    251       t  = acc_char_cmp (val, repl_nl);
    252       t |= acc_char_cmp (val, repl_cr);
    253       t |= acc_char_cmp (val, repl_bs);
    254       t |= acc_char_cmp (val, repl_qm);
    255 
    256       if (__builtin_expect (t != 0, 0))
    257 	{
    258 	  int i = acc_char_index (t, val);
    259 	  if (i >= 0)
    260 	    return (const uchar *)p + i;
    261 	}
    262 
    263       val = *++p;
    264     }
    265 }
    266 
    267 /* Disable on Solaris 2/x86 until the following problem can be properly
    268    autoconfed:
    269 
    270    The Solaris 10+ assembler tags objects with the instruction set
    271    extensions used, so SSE4.2 executables cannot run on machines that
    272    don't support that extension.  */
    273 
    274 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
    275 
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?'.  The MMX
   scanner loads the first 8 bytes of a row as a vector and the SSE2
   scanner all 16 of them.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
    290 
    291 /* A version of the fast scanner using MMX vectorized byte compare insns.
    292 
    293    This uses the PMOVMSKB instruction which was introduced with "MMX2",
    294    which was packaged into SSE1; it is also present in the AMD MMX
    295    extension.  Mark the function as using "sse" so that we emit a real
    296    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
    297 
    298 static const uchar *
    299 #ifndef __SSE__
    300 __attribute__((__target__("sse")))
    301 #endif
    302 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    303 {
    304   typedef char v8qi __attribute__ ((__vector_size__ (8)));
    305   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
    306 
    307   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
    308   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
    309   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
    310   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
    311 
    312   unsigned int misalign, found, mask;
    313   const v8qi *p;
    314   v8qi data, t, c;
    315 
    316   /* Align the source pointer.  While MMX doesn't generate unaligned data
    317      faults, this allows us to safely scan to the end of the buffer without
    318      reading beyond the end of the last page.  */
    319   misalign = (uintptr_t)s & 7;
    320   p = (const v8qi *)((uintptr_t)s & -8);
    321   data = *p;
    322 
    323   /* Create a mask for the bytes that are valid within the first
    324      16-byte block.  The Idea here is that the AND with the mask
    325      within the loop is "free", since we need some AND or TEST
    326      insn in order to set the flags for the branch anyway.  */
    327   mask = -1u << misalign;
    328 
    329   /* Main loop processing 8 bytes at a time.  */
    330   goto start;
    331   do
    332     {
    333       data = *++p;
    334       mask = -1;
    335 
    336     start:
    337       t = __builtin_ia32_pcmpeqb(data, repl_nl);
    338       c = __builtin_ia32_pcmpeqb(data, repl_cr);
    339       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
    340       c = __builtin_ia32_pcmpeqb(data, repl_bs);
    341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
    342       c = __builtin_ia32_pcmpeqb(data, repl_qm);
    343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
    344       found = __builtin_ia32_pmovmskb (t);
    345       found &= mask;
    346     }
    347   while (!found);
    348 
    349   __builtin_ia32_emms ();
    350 
    351   /* FOUND contains 1 in bits for which we matched a relevant
    352      character.  Conversion to the byte index is trivial.  */
    353   found = __builtin_ctz(found);
    354   return (const uchar *)p + found;
    355 }
    356 
    357 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
    358 
    359 static const uchar *
    360 #ifndef __SSE2__
    361 __attribute__((__target__("sse2")))
    362 #endif
    363 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    364 {
    365   typedef char v16qi __attribute__ ((__vector_size__ (16)));
    366 
    367   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
    368   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
    369   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
    370   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
    371 
    372   unsigned int misalign, found, mask;
    373   const v16qi *p;
    374   v16qi data, t;
    375 
    376   /* Align the source pointer.  */
    377   misalign = (uintptr_t)s & 15;
    378   p = (const v16qi *)((uintptr_t)s & -16);
    379   data = *p;
    380 
    381   /* Create a mask for the bytes that are valid within the first
    382      16-byte block.  The Idea here is that the AND with the mask
    383      within the loop is "free", since we need some AND or TEST
    384      insn in order to set the flags for the branch anyway.  */
    385   mask = -1u << misalign;
    386 
    387   /* Main loop processing 16 bytes at a time.  */
    388   goto start;
    389   do
    390     {
    391       data = *++p;
    392       mask = -1;
    393 
    394     start:
    395       t  = data == repl_nl;
    396       t |= data == repl_cr;
    397       t |= data == repl_bs;
    398       t |= data == repl_qm;
    399       found = __builtin_ia32_pmovmskb128 (t);
    400       found &= mask;
    401     }
    402   while (!found);
    403 
    404   /* FOUND contains 1 in bits for which we matched a relevant
    405      character.  Conversion to the byte index is trivial.  */
    406   found = __builtin_ctz(found);
    407   return (const uchar *)p + found;
    408 }
    409 
    410 #ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Starting at S, return a pointer to the first '\n', '\r', '?' or '\\'.
   END is used only to decide whether an unaligned 16-byte read at S
   would be safe; the scan itself stops at the first match.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters looked for; only the first four elements
     are used, matching the length 4 passed to PCMPESTRI below.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
	  && __builtin_expect ((si & 0xfff) > 0xff0, 0))
	{
	  /* There are less than 16 bytes left in the buffer, and less
	     than 16 bytes left on the page.  Reading 16 bytes at this
	     point might generate a spurious page fault.  Defer to the
	     SSE2 implementation, which already handles alignment.  */
	  return search_line_sse2 (s, end);
	}

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
	 memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is a hit within the 16 bytes just read.  */
      if (__builtin_expect (index < 16, 0))
	goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
	 few bytes, but we no longer need care for reading past the
	 end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      /* Receives the carry flag left by PCMPESTRI; the loop ends as
	 soon as it is set.  */
      char f;

      /* By using inline assembly instead of the builtin,
	 we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
	     : "=c"(index), "=@ccc"(f)
	     : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (f)
	break;

      s += 16;
    }
#else
  /* The assembly loop advances S by 16 before every comparison, so
     step back once to make its first comparison start at S.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
	"0:	add $16, %1\n"
	"	%vpcmpestri\t$0, (%1), %2\n"
	"	jnc 0b"
	: "=&c"(index), "+r"(s)
	: "x"(search), "a"(4), "d"(16));
#endif

 found:
  /* INDEX is the offset of the match within the block at S.  */
  return s + index;
}
    485 
    486 #else
    487 /* Work around out-dated assemblers without sse4 support.  */
    488 #define search_line_sse42 search_line_sse2
    489 #endif
    490 
    491 /* Check the CPU capabilities.  */
    492 
    493 #include "../gcc/config/i386/cpuid.h"
    494 
/* Signature shared by all scanner implementations, and the pointer
   through which the selected one is called.  The pointer is set by
   init_vectorized_lexer.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
static search_line_fast_type search_line_fast;
    497 
    498 #define HAVE_init_vectorized_lexer 1
    499 static inline void
    500 init_vectorized_lexer (void)
    501 {
    502   unsigned dummy, ecx = 0, edx = 0;
    503   search_line_fast_type impl = search_line_acc_char;
    504   int minimum = 0;
    505 
    506 #if defined(__SSE4_2__)
    507   minimum = 3;
    508 #elif defined(__SSE2__)
    509   minimum = 2;
    510 #elif defined(__SSE__)
    511   minimum = 1;
    512 #endif
    513 
    514   if (minimum == 3)
    515     impl = search_line_sse42;
    516   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
    517     {
    518       if (minimum == 3 || (ecx & bit_SSE4_2))
    519         impl = search_line_sse42;
    520       else if (minimum == 2 || (edx & bit_SSE2))
    521 	impl = search_line_sse2;
    522       else if (minimum == 1 || (edx & bit_SSE))
    523 	impl = search_line_mmx;
    524     }
    525   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
    526     {
    527       if (minimum == 1
    528 	  || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
    529 	impl = search_line_mmx;
    530     }
    531 
    532   search_line_fast = impl;
    533 }
    534 
    535 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
    536 
    537 /* A vection of the fast scanner using AltiVec vectorized byte compares
    538    and VSX unaligned loads (when VSX is available).  This is otherwise
    539    the same as the AltiVec version.  */
    540 
    541 ATTRIBUTE_NO_SANITIZE_UNDEFINED
    542 static const uchar *
    543 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    544 {
    545   typedef __attribute__((altivec(vector))) unsigned char vc;
    546 
    547   const vc repl_nl = {
    548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    549     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
    550   };
    551   const vc repl_cr = {
    552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    553     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
    554   };
    555   const vc repl_bs = {
    556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    557     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
    558   };
    559   const vc repl_qm = {
    560     '?', '?', '?', '?', '?', '?', '?', '?',
    561     '?', '?', '?', '?', '?', '?', '?', '?',
    562   };
    563   const vc zero = { 0 };
    564 
    565   vc data, t;
    566 
    567   /* Main loop processing 16 bytes at a time.  */
    568   do
    569     {
    570       vc m_nl, m_cr, m_bs, m_qm;
    571 
    572       data = __builtin_vec_vsx_ld (0, s);
    573       s += 16;
    574 
    575       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
    576       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
    577       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
    578       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
    579       t = (m_nl | m_cr) | (m_bs | m_qm);
    580 
    581       /* T now contains 0xff in bytes for which we matched one of the relevant
    582 	 characters.  We want to exit the loop if any byte in T is non-zero.
    583 	 Below is the expansion of vec_any_ne(t, zero).  */
    584     }
    585   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
    586 
    587   /* Restore s to to point to the 16 bytes we just processed.  */
    588   s -= 16;
    589 
    590   {
    591 #define N  (sizeof(vc) / sizeof(long))
    592 
    593     union {
    594       vc v;
    595       /* Statically assert that N is 2 or 4.  */
    596       unsigned long l[(N == 2 || N == 4) ? N : -1];
    597     } u;
    598     unsigned long l, i = 0;
    599 
    600     u.v = t;
    601 
    602     /* Find the first word of T that is non-zero.  */
    603     switch (N)
    604       {
    605       case 4:
    606 	l = u.l[i++];
    607 	if (l != 0)
    608 	  break;
    609 	s += sizeof(unsigned long);
    610 	l = u.l[i++];
    611 	if (l != 0)
    612 	  break;
    613 	s += sizeof(unsigned long);
    614 	/* FALLTHRU */
    615       case 2:
    616 	l = u.l[i++];
    617 	if (l != 0)
    618 	  break;
    619 	s += sizeof(unsigned long);
    620 	l = u.l[i];
    621       }
    622 
    623     /* L now contains 0xff in bytes for which we matched one of the
    624        relevant characters.  We can find the byte index by finding
    625        its bit index and dividing by 8.  */
    626 #ifdef __BIG_ENDIAN__
    627     l = __builtin_clzl(l) >> 3;
    628 #else
    629     l = __builtin_ctzl(l) >> 3;
    630 #endif
    631     return s + l;
    632 
    633 #undef N
    634   }
    635 }
    636 
    637 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
    638 
    639 /* A vection of the fast scanner using AltiVec vectorized byte compares.
    640    This cannot be used for little endian because vec_lvsl/lvsr are
    641    deprecated for little endian and the code won't work properly.  */
    642 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
    643    so we can't compile this function without -maltivec on the command line
    644    (or implied by some other switch).  */
    645 
    646 static const uchar *
    647 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    648 {
    649   typedef __attribute__((altivec(vector))) unsigned char vc;
    650 
    651   const vc repl_nl = {
    652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    653     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
    654   };
    655   const vc repl_cr = {
    656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    657     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
    658   };
    659   const vc repl_bs = {
    660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    661     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
    662   };
    663   const vc repl_qm = {
    664     '?', '?', '?', '?', '?', '?', '?', '?',
    665     '?', '?', '?', '?', '?', '?', '?', '?',
    666   };
    667   const vc ones = {
    668     -1, -1, -1, -1, -1, -1, -1, -1,
    669     -1, -1, -1, -1, -1, -1, -1, -1,
    670   };
    671   const vc zero = { 0 };
    672 
    673   vc data, mask, t;
    674 
    675   /* Altivec loads automatically mask addresses with -16.  This lets us
    676      issue the first load as early as possible.  */
    677   data = __builtin_vec_ld(0, (const vc *)s);
    678 
    679   /* Discard bytes before the beginning of the buffer.  Do this by
    680      beginning with all ones and shifting in zeros according to the
    681      mis-alignment.  The LVSR instruction pulls the exact shift we
    682      want from the address.  */
    683   mask = __builtin_vec_lvsr(0, s);
    684   mask = __builtin_vec_perm(zero, ones, mask);
    685   data &= mask;
    686 
    687   /* While altivec loads mask addresses, we still need to align S so
    688      that the offset we compute at the end is correct.  */
    689   s = (const uchar *)((uintptr_t)s & -16);
    690 
    691   /* Main loop processing 16 bytes at a time.  */
    692   goto start;
    693   do
    694     {
    695       vc m_nl, m_cr, m_bs, m_qm;
    696 
    697       s += 16;
    698       data = __builtin_vec_ld(0, (const vc *)s);
    699 
    700     start:
    701       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
    702       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
    703       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
    704       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
    705       t = (m_nl | m_cr) | (m_bs | m_qm);
    706 
    707       /* T now contains 0xff in bytes for which we matched one of the relevant
    708 	 characters.  We want to exit the loop if any byte in T is non-zero.
    709 	 Below is the expansion of vec_any_ne(t, zero).  */
    710     }
    711   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
    712 
    713   {
    714 #define N  (sizeof(vc) / sizeof(long))
    715 
    716     union {
    717       vc v;
    718       /* Statically assert that N is 2 or 4.  */
    719       unsigned long l[(N == 2 || N == 4) ? N : -1];
    720     } u;
    721     unsigned long l, i = 0;
    722 
    723     u.v = t;
    724 
    725     /* Find the first word of T that is non-zero.  */
    726     switch (N)
    727       {
    728       case 4:
    729 	l = u.l[i++];
    730 	if (l != 0)
    731 	  break;
    732 	s += sizeof(unsigned long);
    733 	l = u.l[i++];
    734 	if (l != 0)
    735 	  break;
    736 	s += sizeof(unsigned long);
    737 	/* FALLTHROUGH */
    738       case 2:
    739 	l = u.l[i++];
    740 	if (l != 0)
    741 	  break;
    742 	s += sizeof(unsigned long);
    743 	l = u.l[i];
    744       }
    745 
    746     /* L now contains 0xff in bytes for which we matched one of the
    747        relevant characters.  We can find the byte index by finding
    748        its bit index and dividing by 8.  */
    749     l = __builtin_clzl(l) >> 3;
    750     return s + l;
    751 
    752 #undef N
    753   }
    754 }
    755 
    756 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
    757 #include "arm_neon.h"
    758 
/* This doesn't have to be the exact page size, but no system may use
   a size smaller than this.  ARMv8 requires a minimum page size of
   4k.  The impact of being conservative here is a small number of
   cases will take the slightly slower entry path into the main
   loop.  The value is used to tell whether a 16-byte read starting
   at an unaligned address could cross into the next page.  */

#define AARCH64_MIN_PAGE_SIZE 4096
    766 
    767 static const uchar *
    768 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    769 {
    770   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
    771   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
    772   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
    773   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
    774   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
    775 
    776 #ifdef __ARM_BIG_ENDIAN
    777   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
    778 #else
    779   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
    780 #endif
    781 
    782   unsigned int found;
    783   const uint8_t *p;
    784   uint8x16_t data;
    785   uint8x16_t t;
    786   uint16x8_t m;
    787   uint8x16_t u, v, w;
    788 
    789   /* Align the source pointer.  */
    790   p = (const uint8_t *)((uintptr_t)s & -16);
    791 
    792   /* Assuming random string start positions, with a 4k page size we'll take
    793      the slow path about 0.37% of the time.  */
    794   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
    795 			 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
    796 			< 16, 0))
    797     {
    798       /* Slow path: the string starts near a possible page boundary.  */
    799       uint32_t misalign, mask;
    800 
    801       misalign = (uintptr_t)s & 15;
    802       mask = (-1u << misalign) & 0xffff;
    803       data = vld1q_u8 (p);
    804       t = vceqq_u8 (data, repl_nl);
    805       u = vceqq_u8 (data, repl_cr);
    806       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
    807       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
    808       t = vorrq_u8 (v, w);
    809       t = vandq_u8 (t, xmask);
    810       m = vpaddlq_u8 (t);
    811       m = vshlq_u16 (m, shift);
    812       found = vaddvq_u16 (m);
    813       found &= mask;
    814       if (found)
    815 	return (const uchar*)p + __builtin_ctz (found);
    816     }
    817   else
    818     {
    819       data = vld1q_u8 ((const uint8_t *) s);
    820       t = vceqq_u8 (data, repl_nl);
    821       u = vceqq_u8 (data, repl_cr);
    822       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
    823       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
    824       t = vorrq_u8 (v, w);
    825       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
    826 	goto done;
    827     }
    828 
    829   do
    830     {
    831       p += 16;
    832       data = vld1q_u8 (p);
    833       t = vceqq_u8 (data, repl_nl);
    834       u = vceqq_u8 (data, repl_cr);
    835       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
    836       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
    837       t = vorrq_u8 (v, w);
    838     } while (!vpaddd_u64 ((uint64x2_t)t));
    839 
    840 done:
    841   /* Now that we've found the terminating substring, work out precisely where
    842      we need to stop.  */
    843   t = vandq_u8 (t, xmask);
    844   m = vpaddlq_u8 (t);
    845   m = vshlq_u16 (m, shift);
    846   found = vaddvq_u16 (m);
    847   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
    848 	  + __builtin_ctz (found));
    849 }
    850 
    851 #elif defined (__ARM_NEON)
    852 #include "arm_neon.h"
    853 
     854 static const uchar *
     855 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
     856 {
           /* ARM NEON variant.  Returns a pointer to the first '\n', '\r',
              '\\' or '?' found at or after S, testing 16 bytes per iteration.
              END is not used: the loop below only stops on a match.
              NOTE(review): this presumably relies on the caller's buffer
              always containing a newline somewhere after S -- confirm.  */
     857   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
     858   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
     859   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
     860   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
           /* Byte lane I of each 8-byte half carries the single bit 1 << I.  */
     861   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
     862 
     863   unsigned int misalign, found, mask;
     864   const uint8_t *p;
     865   uint8x16_t data;
     866 
     867   /* Align the source pointer.  */
     868   misalign = (uintptr_t)s & 15;
     869   p = (const uint8_t *)((uintptr_t)s & -16);
     870   data = vld1q_u8 (p);
     871 
     872   /* Create a mask for the bytes that are valid within the first
     873      16-byte block.  The idea here is that the AND with the mask
     874      within the loop is "free", since we need some AND or TEST
     875      insn in order to set the flags for the branch anyway.
              The block was loaded from the aligned address P, which may lie
              up to 15 bytes before S; clearing the low MISALIGN bits makes
              matches in those leading bytes invisible.  */
     876   mask = (-1u << misalign) & 0xffff;
     877 
     878   /* Main loop, processing 16 bytes at a time.  Enter it in the middle
              so that the first block, already loaded above, is tested with
              the partial MASK before P is advanced.  */
     879   goto start;
     880 
     881   do
     882     {
     883       uint8x8_t l;
     884       uint16x4_t m;
     885       uint32x2_t n;
     886       uint8x16_t t, u, v, w;
     887 
     888       p += 16;
     889       data = vld1q_u8 (p);
               /* Every byte of a subsequent block is valid.  */
     890       mask = 0xffff;
     891 
     892     start:
               /* Each matching byte lane becomes 0xff, then is reduced to its
                  own bit by the AND with XMASK.  */
     893       t = vceqq_u8 (data, repl_nl);
     894       u = vceqq_u8 (data, repl_cr);
     895       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
     896       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
     897       t = vandq_u8 (vorrq_u8 (v, w), xmask);
               /* Pairwise additions fold the 16 lanes down to two 32-bit
                  lanes, one 8-bit match mask per half of the block.  */
     898       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
     899       m = vpaddl_u8 (l);
     900       n = vpaddl_u16 (m);
     901 
               /* Viewing N as one 64-bit value, the shift by 24 moves the
                  upper half's mask to bits 8..15 of the low word, so the
                  low 16 bits of FOUND hold one bit per byte of the block.  */
     902       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
     903 	      vshr_n_u64 ((uint64x1_t) n, 24)), 0);
     904       found &= mask;
     905     }
     906   while (!found);
     907 
     908   /* FOUND contains 1 in bits for which we matched a relevant
     909      character.  Conversion to the byte index is trivial.  */
     910   found = __builtin_ctz (found);
     911   return (const uchar *)p + found;
     912 }
    913 
    914 #else
    915 
    916 /* We only have one accelerated alternative.  Use a direct call so that
    917    we encourage inlining.  */
    918 
    919 #define search_line_fast  search_line_acc_char
    920 
    921 #endif
    922 
    923 /* Initialize the lexer if needed.  */
    924 
    925 void
    926 _cpp_init_lexer (void)
    927 {
    928 #ifdef HAVE_init_vectorized_lexer
    929   init_vectorized_lexer ();
    930 #endif
    931 }
    932 
     933 /* Returns with a logical line that contains no escaped newlines or
     934    trigraphs.  This is a time-critical inner loop.
            The line starts at the buffer's old next_line; on return
            buffer->cur and buffer->line_base point at that start, the line
            has been terminated in place with '\n', and buffer->next_line
            points just past the terminator that was consumed.  A line note
            is recorded for every escaped newline and every trigraph seen,
            followed by one sentinel note.  */
     935 void
     936 _cpp_clean_line (cpp_reader *pfile)
     937 {
     938   cpp_buffer *buffer;
           /* S is the read position; D is the write position, which only
              falls behind S once the slow path starts compacting the line.  */
     939   const uchar *s;
     940   uchar c, *d, *p;
     941 
     942   buffer = pfile->buffer;
     943   buffer->cur_note = buffer->notes_used = 0;
     944   buffer->cur = buffer->line_base = buffer->next_line;
     945   buffer->need_line = false;
     946   s = buffer->next_line;
     947 
     948   if (!buffer->from_stage3)
     949     {
               /* Most recent backslash seen on the fast path, if any.  */
     950       const uchar *pbackslash = NULL;
     951 
     952       /* Fast path.  This is the common case of an un-escaped line with
     953 	 no trigraphs.  The primary win here is by not writing any
     954 	 data back to memory until we have to.  */
     955       while (1)
     956 	{
     957 	  /* Perform an optimized search for \n, \r, \\, ?.  */
     958 	  s = search_line_fast (s, buffer->rlimit);
     959 
     960 	  c = *s;
     961 	  if (c == '\\')
     962 	    {
     963 	      /* Record the location of the backslash and continue.  */
     964 	      pbackslash = s++;
     965 	    }
     966 	  else if (__builtin_expect (c == '?', 0))
     967 	    {
     968 	      if (__builtin_expect (s[1] == '?', false)
     969 		   && _cpp_trigraph_map[s[2]])
     970 		{
     971 		  /* Have a trigraph.  We may or may not have to convert
     972 		     it.  Add a line note regardless, for -Wtrigraphs.  */
     973 		  add_line_note (buffer, s, s[2]);
     974 		  if (CPP_OPTION (pfile, trigraphs))
     975 		    {
     976 		      /* We do, and that means we have to switch to the
     977 		         slow path.  The replacement character is stored
                                 over the first '?', and S is left on the
                                 trigraph's last byte because the slow path
                                 pre-increments before reading.  */
     978 		      d = (uchar *) s;
     979 		      *d = _cpp_trigraph_map[s[2]];
     980 		      s += 2;
     981 		      goto slow_path;
     982 		    }
     983 		}
     984 	      /* Not a trigraph.  Continue on fast-path.  */
     985 	      s++;
     986 	    }
     987 	  else
     988 	    break;
     989 	}
     990 
     991       /* This must be \r or \n.  We're either done, or we'll be forced
     992 	 to write back to the buffer and continue on the slow path.  */
     993       d = (uchar *) s;
     994 
     995       if (__builtin_expect (s == buffer->rlimit, false))
     996 	goto done;
     997 
     998       /* DOS line ending? */
     999       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
    1000 	{
    1001 	  s++;
    1002 	  if (s == buffer->rlimit)
    1003 	    goto done;
    1004 	}
    1005 
    1006       if (__builtin_expect (pbackslash == NULL, true))
    1007 	goto done;
    1008 
    1009       /* Check for escaped newline.  Walk back from the line end over
                  trailing non-vertical whitespace; the newline is escaped
                  only if what precedes it is the last backslash we saw.  */
    1010       p = d;
    1011       while (is_nvspace (p[-1]))
    1012 	p--;
    1013       if (p - 1 != pbackslash)
    1014 	goto done;
    1015 
    1016       /* Have an escaped newline; process it and proceed to
    1017 	 the slow path.  The note type is ' ' when whitespace separated
                  the backslash from the newline, '\\' otherwise.  */
    1018       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
               /* The slow path stores with *++d, so D = P - 2 makes the next
                  byte land on the backslash at P - 1.  next_line is reused
                  here to remember where the continuation line begins; the
                  slow path's backward scan stops there.  */
    1019       d = p - 2;
    1020       buffer->next_line = p - 1;
    1021 
    1022     slow_path:
               /* Copy the rest of the logical line down one byte at a time,
                  dropping escaped newlines and, when enabled, converting
                  trigraphs.  */
    1023       while (1)
    1024 	{
    1025 	  c = *++s;
    1026 	  *++d = c;
    1027 
    1028 	  if (c == '\n' || c == '\r')
    1029 	    {
    1030 	      /* Handle DOS line endings.  */
    1031 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
    1032 		s++;
    1033 	      if (s == buffer->rlimit)
    1034 		break;
    1035 
    1036 	      /* Escaped?  */
    1037 	      p = d;
    1038 	      while (p != buffer->next_line && is_nvspace (p[-1]))
    1039 		p--;
    1040 	      if (p == buffer->next_line || p[-1] != '\\')
    1041 		break;
    1042 
    1043 	      add_line_note (buffer, p - 1, p != d ? ' ': '\\');
    1044 	      d = p - 2;
    1045 	      buffer->next_line = p - 1;
    1046 	    }
    1047 	  else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
    1048 	    {
    1049 	      /* Add a note regardless, for the benefit of -Wtrigraphs.  */
    1050 	      add_line_note (buffer, d, s[2]);
    1051 	      if (CPP_OPTION (pfile, trigraphs))
    1052 		{
    1053 		  *d = _cpp_trigraph_map[s[2]];
    1054 		  s += 2;
    1055 		}
    1056 	    }
    1057 	}
    1058     }
    1059   else
    1060     {
               /* from_stage3 buffers get no trigraph or escaped-newline
                  processing; only the end of the line is located.
                  NOTE(review): presumably such text has already been through
                  these translation phases -- confirm.  */
    1061       while (*s != '\n' && *s != '\r')
    1062 	s++;
    1063       d = (uchar *) s;
    1064 
    1065       /* Handle DOS line endings.  */
    1066       if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
    1067 	s++;
    1068     }
    1069 
    1070  done:
           /* Terminate the cleaned line at the write position.  */
    1071   *d = '\n';
    1072   /* A sentinel note that should never be processed.  */
    1073   add_line_note (buffer, d + 1, '\n');
    1074   buffer->next_line = s + 1;
    1075 }
   1076 
   1077 /* Return true if the trigraph indicated by NOTE should be warned
   1078    about in a comment.  */
   1079 static bool
   1080 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
   1081 {
   1082   const uchar *p;
   1083 
   1084   /* Within comments we don't warn about trigraphs, unless the
   1085      trigraph forms an escaped newline, as that may change
   1086      behavior.  */
   1087   if (note->type != '/')
   1088     return false;
   1089 
   1090   /* If -trigraphs, then this was an escaped newline iff the next note
   1091      is coincident.  */
   1092   if (CPP_OPTION (pfile, trigraphs))
   1093     return note[1].pos == note->pos;
   1094 
   1095   /* Otherwise, see if this forms an escaped newline.  */
   1096   p = note->pos + 3;
   1097   while (is_nvspace (*p))
   1098     p++;
   1099 
   1100   /* There might have been escaped newlines between the trigraph and the
   1101      newline we found.  Hence the position test.  */
   1102   return (*p == '\n' && p < note[1].pos);
   1103 }
   1104 
    1105 /* Process the notes created by add_line_note as far as the current
    1106    location.
            Notes positioned at or before buffer->cur are consumed in order.
            A nonzero IN_COMMENT suppresses the "separated by space" warning
            and limits trigraph warnings to those warn_in_comment accepts.  */
    1107 void
    1108 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
    1109 {
    1110   cpp_buffer *buffer = pfile->buffer;
    1111 
    1112   for (;;)
    1113     {
    1114       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
    1115       unsigned int col;
    1116 
               /* There is no explicit bound on cur_note; the loop ends at
                  the first note that lies beyond the current position.  */
    1117       if (note->pos > buffer->cur)
    1118 	break;
    1119 
    1120       buffer->cur_note++;
    1121       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
    1122 
               /* '\\' and ' ' notes mark an escaped newline; ' ' means
                  whitespace separated the backslash from the newline.  */
    1123       if (note->type == '\\' || note->type == ' ')
    1124 	{
    1125 	  if (note->type == ' ' && !in_comment)
    1126 	    cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
    1127 				 "backslash and newline separated by space");
    1128 
    1129 	  if (buffer->next_line > buffer->rlimit)
    1130 	    {
    1131 	      cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
    1132 				   "backslash-newline at end of file");
    1133 	      /* Prevent "no newline at end of file" warning.  */
    1134 	      buffer->next_line = buffer->rlimit;
    1135 	    }
    1136 
                   /* A new physical line starts at the note's position.  */
    1137 	  buffer->line_base = note->pos;
    1138 	  CPP_INCREMENT_LINE (pfile, 0);
    1139 	}
               /* Any other note type with a trigraph mapping is the third
                  character of a trigraph.  */
    1140       else if (_cpp_trigraph_map[note->type])
    1141 	{
    1142 	  if (CPP_OPTION (pfile, warn_trigraphs)
    1143 	      && (!in_comment || warn_in_comment (pfile, note)))
    1144 	    {
    1145 	      if (CPP_OPTION (pfile, trigraphs))
    1146 		cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
    1147                                        pfile->line_table->highest_line, col,
    1148 				       "trigraph ??%c converted to %c",
    1149 				       note->type,
    1150 				       (int) _cpp_trigraph_map[note->type]);
    1151 	      else
    1152 		{
    1153 		  cpp_warning_with_line
    1154 		    (pfile, CPP_W_TRIGRAPHS,
    1155                      pfile->line_table->highest_line, col,
    1156 		     "trigraph ??%c ignored, use -trigraphs to enable",
    1157 		     note->type);
    1158 		}
    1159 	    }
    1160 	}
    1161       else if (note->type == 0)
    1162 	/* Already processed in lex_raw_string.  */;
    1163       else
                 /* Unknown note type: treated as an internal error.  */
    1164 	abort ();
    1165     }
    1166 }
   1167 
    1168 namespace bidi {
           /* The bidirectional control characters that are tracked, plus
              NONE for "not a bidi control character".  */
    1169   enum class kind {
    1170     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
    1171   };
    1172 
    1173   /* All the UTF-8 encodings of bidi characters start with E2.  */
    1174   constexpr uchar utf8_start = 0xe2;
    1175 
           /* One open bidi context, created when an opening control
              character is read.  */
    1176   struct context
    1177   {
             /* Default constructor; it initializes none of the members.  */
    1178     context () {}
    1179     context (location_t loc, kind k, bool pdf, bool ucn)
    1180     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
    1181     {
    1182     }
    1183 
             /* The kind of character that closes this context.  */
    1184     kind get_pop_kind () const
    1185     {
    1186       return m_pdf ? kind::PDF : kind::PDI;
    1187     }
             /* True if the opening character was written as a UCN.  */
    1188     bool ucn_p () const
    1189     {
    1190       return m_ucn;
    1191     }
    1192 
             /* Location of the character that opened the context.  */
    1193     location_t m_loc;
             /* Which character opened the context.  */
    1194     kind m_kind;
             /* 1 if a PDF closes this context, 0 if a PDI does.  */
    1195     unsigned m_pdf : 1;
             /* 1 if the opener was written as a UCN, 0 if as UTF-8.  */
    1196     unsigned m_ucn : 1;
    1197   };
    1198 
    1199   /* A vector holding currently open bidi contexts, innermost last.
    1200      Each element records where the context was opened, which control
    1201      character opened it, whether a PDF (rather than a PDI) closes it,
    1202      and whether the opener was written as a UCN rather than UTF-8.  */
    1203   semi_embedded_vec <context, 16> vec;
    1204 
    1205   /* Close the whole comment/identifier/string literal/character constant
    1206      context.  */
    1207   void on_close ()
    1208   {
    1209     vec.truncate (0);
    1210   }
    1211 
    1212   /* Pop the last element in the vector.  */
    1213   void pop ()
    1214   {
    1215     unsigned int len = vec.count ();
    1216     gcc_checking_assert (len > 0);
    1217     vec.truncate (len - 1);
    1218   }
    1219 
    1220   /* Return the pop kind of the context of the Ith element.  */
    1221   kind pop_kind_at (unsigned int i)
    1222   {
    1223     return vec[i].get_pop_kind ();
    1224   }
    1225 
    1226   /* Return the pop kind of the context that is currently opened, or
              kind::NONE when no context is open.  */
    1227   kind current_ctx ()
    1228   {
    1229     unsigned int len = vec.count ();
    1230     if (len == 0)
    1231       return kind::NONE;
    1232     return vec[len - 1].get_pop_kind ();
    1233   }
    1234 
    1235   /* Return true if the current context comes from a UCN origin, that is,
    1236      the bidi char which started this bidi context was written as a UCN.  */
    1237   bool current_ctx_ucn_p ()
    1238   {
    1239     unsigned int len = vec.count ();
    1240     gcc_checking_assert (len > 0);
    1241     return vec[len - 1].m_ucn;
    1242   }
    1243 
           /* Return the location of the character that opened the current
              context.  At least one context must be open.  */
    1244   location_t current_ctx_loc ()
    1245   {
    1246     unsigned int len = vec.count ();
    1247     gcc_checking_assert (len > 0);
    1248     return vec[len - 1].m_loc;
    1249   }
    1250 
    1251   /* We've read a bidi char, update the current vector as necessary.
    1252      LOC is only valid when K is not kind::NONE.  */
    1253   void on_char (kind k, bool ucn_p, location_t loc)
    1254   {
    1255     switch (k)
    1256       {
             /* Embeddings and overrides open a context closed by PDF.  */
    1257       case kind::LRE:
    1258       case kind::RLE:
    1259       case kind::LRO:
    1260       case kind::RLO:
    1261 	vec.push (context (loc, k, true, ucn_p));
    1262 	break;
             /* Isolates open a context closed by PDI.  */
    1263       case kind::LRI:
    1264       case kind::RLI:
    1265       case kind::FSI:
    1266 	vec.push (context (loc, k, false, ucn_p));
    1267 	break;
    1268       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
    1269 	 whose scope has not yet been terminated.  Here it is ignored
                 unless the innermost open context is one of those.  */
    1270       case kind::PDF:
    1271 	if (current_ctx () == kind::PDF)
    1272 	  pop ();
    1273 	break;
    1274       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
    1275 	 scope has not yet been terminated, as well as the scopes of
    1276 	 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
    1277 	 yet been terminated.  */
    1278       case kind::PDI:
    1279 	for (int i = vec.count () - 1; i >= 0; --i)
    1280 	  if (pop_kind_at (i) == kind::PDI)
    1281 	    {
    1282 	      vec.truncate (i);
    1283 	      break;
    1284 	    }
    1285 	break;
    1286       case kind::LTR:
    1287       case kind::RTL:
    1288 	/* These aren't popped by a PDF/PDI.  */
    1289 	break;
    1290       ATTR_LIKELY case kind::NONE:
    1291 	break;
    1292       default:
    1293 	abort ();
    1294       }
    1295   }
    1296 
    1297   /* Return a descriptive string for K.  Aborts for kind::NONE, which
              has no description.  */
    1298   const char *to_str (kind k)
    1299   {
    1300     switch (k)
    1301       {
    1302       case kind::LRE:
    1303 	return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
    1304       case kind::RLE:
    1305 	return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
    1306       case kind::LRO:
    1307 	return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
    1308       case kind::RLO:
    1309 	return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
    1310       case kind::LRI:
    1311 	return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
    1312       case kind::RLI:
    1313 	return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
    1314       case kind::FSI:
    1315 	return "U+2068 (FIRST STRONG ISOLATE)";
    1316       case kind::PDF:
    1317 	return "U+202C (POP DIRECTIONAL FORMATTING)";
    1318       case kind::PDI:
    1319 	return "U+2069 (POP DIRECTIONAL ISOLATE)";
    1320       case kind::LTR:
    1321 	return "U+200E (LEFT-TO-RIGHT MARK)";
    1322       case kind::RTL:
    1323 	return "U+200F (RIGHT-TO-LEFT MARK)";
    1324       default:
    1325 	abort ();
    1326       }
    1327   }
    1328 }
   1329 
   1330 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
   1331    within the current line in FILE, with the caret at START.  */
   1332 
   1333 static location_t
   1334 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
   1335 					 const unsigned char *const start,
   1336 					 size_t num_bytes)
   1337 {
   1338   gcc_checking_assert (num_bytes > 0);
   1339 
   1340   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
   1341      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
   1342      whereas linemap_position_for_column is 1-based.  */
   1343 
   1344   /* Get 0-based offsets within the line.  */
   1345   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
   1346   size_t end_offset = start_offset + num_bytes - 1;
   1347 
   1348   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
   1349   location_t start_loc = linemap_position_for_column (pfile->line_table,
   1350 						      start_offset + 1);
   1351   location_t end_loc = linemap_position_for_column (pfile->line_table,
   1352 						     end_offset + 1);
   1353 
   1354   if (start_loc == end_loc)
   1355     return start_loc;
   1356 
   1357   source_range src_range;
   1358   src_range.m_start = start_loc;
   1359   src_range.m_finish = end_loc;
   1360   location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
   1361 						   start_loc,
   1362 						   src_range,
   1363 						   NULL);
   1364   return combined_loc;
   1365 }
   1366 
   1367 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
   1368 
   1369 static bidi::kind
   1370 get_bidi_utf8_1 (const unsigned char *const p)
   1371 {
   1372   gcc_checking_assert (p[0] == bidi::utf8_start);
   1373 
   1374   if (p[1] == 0x80)
   1375     switch (p[2])
   1376       {
   1377       case 0xaa:
   1378 	return bidi::kind::LRE;
   1379       case 0xab:
   1380 	return bidi::kind::RLE;
   1381       case 0xac:
   1382 	return bidi::kind::PDF;
   1383       case 0xad:
   1384 	return bidi::kind::LRO;
   1385       case 0xae:
   1386 	return bidi::kind::RLO;
   1387       case 0x8e:
   1388 	return bidi::kind::LTR;
   1389       case 0x8f:
   1390 	return bidi::kind::RTL;
   1391       default:
   1392 	break;
   1393       }
   1394   else if (p[1] == 0x81)
   1395     switch (p[2])
   1396       {
   1397       case 0xa6:
   1398 	return bidi::kind::LRI;
   1399       case 0xa7:
   1400 	return bidi::kind::RLI;
   1401       case 0xa8:
   1402 	return bidi::kind::FSI;
   1403       case 0xa9:
   1404 	return bidi::kind::PDI;
   1405       default:
   1406 	break;
   1407       }
   1408 
   1409   return bidi::kind::NONE;
   1410 }
   1411 
   1412 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
   1413    If the kind is not NONE, write the location to *OUT.*/
   1414 
   1415 static bidi::kind
   1416 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
   1417 {
   1418   bidi::kind result = get_bidi_utf8_1 (p);
   1419   if (result != bidi::kind::NONE)
   1420     {
   1421       /* We have a sequence of 3 bytes starting at P.  */
   1422       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
   1423     }
   1424   return result;
   1425 }
   1426 
   1427 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
   1428 
   1429 static bidi::kind
   1430 get_bidi_ucn_1 (const unsigned char *p, bool is_U)
   1431 {
   1432   /* 6.4.3 Universal Character Names
   1433       \u hex-quad
   1434       \U hex-quad hex-quad
   1435      where \unnnn means \U0000nnnn.  */
   1436 
   1437   if (is_U)
   1438     {
   1439       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
   1440 	return bidi::kind::NONE;
   1441       /* Skip 4B so we can treat \u and \U the same below.  */
   1442       p += 4;
   1443     }
   1444 
   1445   /* All code points we are looking for start with 20xx.  */
   1446   if (p[0] != '2' || p[1] != '0')
   1447     return bidi::kind::NONE;
   1448   else if (p[2] == '2')
   1449     switch (p[3])
   1450       {
   1451       case 'a':
   1452       case 'A':
   1453 	return bidi::kind::LRE;
   1454       case 'b':
   1455       case 'B':
   1456 	return bidi::kind::RLE;
   1457       case 'c':
   1458       case 'C':
   1459 	return bidi::kind::PDF;
   1460       case 'd':
   1461       case 'D':
   1462 	return bidi::kind::LRO;
   1463       case 'e':
   1464       case 'E':
   1465 	return bidi::kind::RLO;
   1466       default:
   1467 	break;
   1468       }
   1469   else if (p[2] == '6')
   1470     switch (p[3])
   1471       {
   1472       case '6':
   1473 	return bidi::kind::LRI;
   1474       case '7':
   1475 	return bidi::kind::RLI;
   1476       case '8':
   1477 	return bidi::kind::FSI;
   1478       case '9':
   1479 	return bidi::kind::PDI;
   1480       default:
   1481 	break;
   1482       }
   1483   else if (p[2] == '0')
   1484     switch (p[3])
   1485       {
   1486       case 'e':
   1487       case 'E':
   1488 	return bidi::kind::LTR;
   1489       case 'f':
   1490       case 'F':
   1491 	return bidi::kind::RTL;
   1492       default:
   1493 	break;
   1494       }
   1495 
   1496   return bidi::kind::NONE;
   1497 }
   1498 
   1499 /* Parse a UCN where P points just past \u or \U and return its bidi code.
   1500    If the kind is not NONE, write the location to *OUT.*/
   1501 
   1502 static bidi::kind
   1503 get_bidi_ucn (cpp_reader *pfile,  const unsigned char *p, bool is_U,
   1504 	      location_t *out)
   1505 {
   1506   bidi::kind result = get_bidi_ucn_1 (p, is_U);
   1507   if (result != bidi::kind::NONE)
   1508     {
   1509       const unsigned char *start = p - 2;
   1510       size_t num_bytes = 2 + (is_U ? 8 : 4);
   1511       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
   1512     }
   1513   return result;
   1514 }
   1515 
    1516 /* Subclass of rich_location for reporting on unpaired UTF-8
    1517    bidirectional control character(s).
    1518    Escape the source lines on output, and show all unclosed
    1519    bidi context, labelling everything.  */
    1520 
    1521 class unpaired_bidi_rich_location : public rich_location
    1522 {
    1523  public:
           /* Supplies the label text for every range of the rich location.  */
    1524   class custom_range_label : public range_label
    1525   {
    1526    public:
    1527      label_text get_text (unsigned range_idx) const FINAL OVERRIDE
    1528      {
    1529        /* range 0 is the primary location; each subsequent range i + 1
    1530 	  is for bidi::vec[i].  */
    1531        if (range_idx > 0)
    1532 	 {
    1533 	   const bidi::context &ctxt (bidi::vec[range_idx - 1]);
    1534 	   return label_text::borrow (bidi::to_str (ctxt.m_kind));
    1535 	 }
    1536        else
    1537 	 return label_text::borrow (_("end of bidirectional context"));
    1538      }
    1539   };
    1540 
           /* LOC becomes the primary range; one caret-less range is then
              added for each context still open in bidi::vec, in vector
              order, which is what get_text's indexing relies on.  The same
              label object serves every range.  */
    1541   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
    1542   : rich_location (pfile->line_table, loc, &m_custom_label)
    1543   {
    1544     set_escape_on_output (true);
    1545     for (unsigned i = 0; i < bidi::vec.count (); i++)
    1546       add_range (bidi::vec[i].m_loc,
    1547 		 SHOW_RANGE_WITHOUT_CARET,
    1548 		 &m_custom_label);
    1549   }
    1550 
    1551  private:
    1552    custom_range_label m_custom_label;
    1553 };
   1554 
    1555 /* We're closing a bidi context, that is, we've encountered a newline,
    1556    are closing a C-style comment, or are at the end of a string literal,
    1557    character constant, or identifier.  Warn if this context was not
    1558    properly terminated by a PDI or PDF.  P points to the last character
    1559    in this context.  */
    1560 
    1561 static void
    1562 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
    1563 {
    1564   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
           /* Warn only if some context is still open, unpaired checking is
              enabled, and the innermost context either was opened by a
              UTF-8 character or UCN checking is enabled as well.  */
    1565   if (bidi::vec.count () > 0
    1566       && (warn_bidi & bidirectional_unpaired
    1567 	  && (!bidi::current_ctx_ucn_p ()
    1568 	      || (warn_bidi & bidirectional_ucn))))
    1569     {
               /* NOTE(review): CPP_BUF_COLUMN is used here without adding 1,
                  unlike other column computations in this file -- confirm
                  this is the intended column for the primary location.  */
    1570       const location_t loc
    1571 	= linemap_position_for_column (pfile->line_table,
    1572 				       CPP_BUF_COLUMN (pfile->buffer, p));
    1573       unpaired_bidi_rich_location rich_loc (pfile, loc);
    1574       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
    1575 	 forms of a diagnostic, so fake it for now.  */
    1576       if (bidi::vec.count () > 1)
    1577 	cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
    1578 			"unpaired UTF-8 bidirectional control characters "
    1579 			"detected");
    1580       else
    1581 	cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
    1582 			"unpaired UTF-8 bidirectional control character "
    1583 			"detected");
    1584     }
    1585   /* We're done with this context.  All open contexts are discarded,
              whether or not a warning was issued.  */
    1586   bidi::on_close ();
    1587 }
   1588 
    1589 /* We're at the beginning or in the middle of an identifier/comment/string
    1590    literal/character constant.  Warn if we've encountered a bidi character.
    1591    KIND says which bidi control character it was; UCN_P is true iff this bidi
    1592    control character was written as a UCN.  LOC is the location of the
    1593    character, but is only valid if KIND != bidi::kind::NONE.  */
    1594 
    1595 static void
    1596 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
    1597 			 bool ucn_p, location_t loc)
    1598 {
           /* Nothing to warn about and nothing to record for NONE.  */
    1599   if (__builtin_expect (kind == bidi::kind::NONE, 1))
    1600     return;
    1601 
    1602   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
    1603 
    1604   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
    1605     {
    1606       rich_location rich_loc (pfile->line_table, loc);
    1607       rich_loc.set_escape_on_output (true);
    1608 
    1609       /* It seems excessive to warn about a PDI/PDF that is closing
    1610 	 an opened context because we've already warned about the
    1611 	 opening character.  Except warn when we have a UCN x UTF-8
    1612 	 mismatch, if UCN checking is enabled.  */
    1613       if (kind == bidi::current_ctx ())
    1614 	{
                   /* Note the exact comparison: the mismatch warning is
                      given only when WARN_BIDI is precisely the unpaired
                      and ucn bits.  */
    1615 	  if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
    1616 	      && bidi::current_ctx_ucn_p () != ucn_p)
    1617 	    {
    1618 	      rich_loc.add_range (bidi::current_ctx_loc ());
    1619 	      cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
    1620 			      "UTF-8 vs UCN mismatch when closing "
    1621 			      "a context by \"%s\"", bidi::to_str (kind));
    1622 	    }
    1623 	}
               /* Otherwise, in "any" mode, report the character itself;
                  one written as a UCN only if UCN checking is enabled.  */
    1624       else if (warn_bidi & bidirectional_any
    1625 	       && (!ucn_p || (warn_bidi & bidirectional_ucn)))
    1626 	{
    1627 	  if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
    1628 	    cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
    1629 			    "\"%s\" is closing an unopened context",
    1630 			    bidi::to_str (kind));
    1631 	  else
    1632 	    cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
    1633 			    "found problematic Unicode character \"%s\"",
    1634 			    bidi::to_str (kind));
    1635 	}
    1636     }
    1637   /* Record the character in the vector of open bidi contexts.  */
    1638   bidi::on_char (kind, ucn_p, loc);
    1639 }
   1640 
    1641 /* Skip a C-style block comment.  We find the end of the comment by
    1642    seeing if an asterisk is before every '/' we encounter.  Returns
    1643    true if the comment was terminated by EOF, false otherwise.
    1644 
    1645    Buffer->cur points to the initial asterisk of the comment.  */
    1646 bool
    1647 _cpp_skip_block_comment (cpp_reader *pfile)
    1648 {
    1649   cpp_buffer *buffer = pfile->buffer;
    1650   const uchar *cur = buffer->cur;
    1651   uchar c;
    1652   const bool warn_bidi_p = pfile->warn_bidi_p ();
    1653 
           /* Step past the opening asterisk.  Also step past a '/' that
              immediately follows it, so that the opening asterisk cannot be
              taken for the asterisk of a closing delimiter.  */
    1654   cur++;
    1655   if (*cur == '/')
    1656     cur++;
    1657 
    1658   for (;;)
    1659     {
    1660       /* People like decorating comments with '*', so check for '/'
    1661 	 instead for efficiency.  */
    1662       c = *cur++;
    1663 
    1664       if (c == '/')
    1665 	{
                   /* CUR is already past the '/', so cur[-2] is the
                      character before it.  */
    1666 	  if (cur[-2] == '*')
    1667 	    {
    1668 	      if (warn_bidi_p)
    1669 		maybe_warn_bidi_on_close (pfile, cur);
    1670 	      break;
    1671 	    }
    1672 
    1673 	  /* Warn about potential nested comments, but not if the '/'
    1674 	     comes immediately before the true comment delimiter.
    1675 	     Don't bother to get it right across escaped newlines.  */
    1676 	  if (CPP_OPTION (pfile, warn_comments)
    1677 	      && cur[0] == '*' && cur[1] != '/')
    1678 	    {
    1679 	      buffer->cur = cur;
    1680 	      cpp_warning_with_line (pfile, CPP_W_COMMENTS,
    1681 				     pfile->line_table->highest_line,
    1682 				     CPP_BUF_COL (buffer),
    1683 				     "\"/*\" within comment");
    1684 	    }
    1685 	}
    1686       else if (c == '\n')
    1687 	{
    1688 	  unsigned int cols;
                   /* Leave buffer->cur on the newline while the line's
                      notes are processed.  */
    1689 	  buffer->cur = cur - 1;
    1690 	  if (warn_bidi_p)
    1691 	    maybe_warn_bidi_on_close (pfile, cur);
    1692 	  _cpp_process_line_notes (pfile, true);
                   /* No further line to read: the comment is unterminated.  */
    1693 	  if (buffer->next_line >= buffer->rlimit)
    1694 	    return true;
    1695 	  _cpp_clean_line (pfile);
    1696 
    1697 	  cols = buffer->next_line - buffer->line_base;
    1698 	  CPP_INCREMENT_LINE (pfile, cols);
    1699 
                   /* Continue scanning at the start of the new line.  */
    1700 	  cur = buffer->cur;
    1701 	}
    1702       /* If this is a beginning of a UTF-8 encoding, it might be
    1703 	 a bidirectional control character.  */
    1704       else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
    1705 	{
    1706 	  location_t loc;
    1707 	  bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
    1708 	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
    1709 	}
    1710     }
    1711 
           /* Normal termination: CUR is just past the closing delimiter.  */
    1712   buffer->cur = cur;
    1713   _cpp_process_line_notes (pfile, true);
    1714   return false;
    1715 }
   1716 
    1717 /* Skip a C++ line comment, leaving buffer->cur pointing to the
    1718    terminating newline.  Handles escaped newlines.  Returns nonzero
    1719    if a multiline comment.  */
    1720 static int
    1721 skip_line_comment (cpp_reader *pfile)
    1722 {
    1723   cpp_buffer *buffer = pfile->buffer;
           /* Remembered so that a line change during note processing can be
              detected at the end.  */
    1724   location_t orig_line = pfile->line_table->highest_line;
    1725   const bool warn_bidi_p = pfile->warn_bidi_p ();
    1726 
    1727   if (!warn_bidi_p)
    1728     while (*buffer->cur != '\n')
    1729       buffer->cur++;
    1730   else
    1731     {
               /* Scan quickly up to the newline or the first byte that
                  could begin a bidi control character.  */
    1732       while (*buffer->cur != '\n'
    1733 	     && *buffer->cur != bidi::utf8_start)
    1734 	buffer->cur++;
    1735       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
    1736 	{
                   /* Such a byte was seen: examine each candidate in the
                      remainder of the line, then close the bidi context.
                      The close check is reached only on this path.  */
    1737 	  while (*buffer->cur != '\n')
    1738 	    {
    1739 	      if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
    1740 		{
    1741 		  location_t loc;
    1742 		  bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
    1743 		  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
    1744 		}
    1745 	      buffer->cur++;
    1746 	    }
    1747 	  maybe_warn_bidi_on_close (pfile, buffer->cur);
    1748 	}
    1749     }
    1750 
    1751   _cpp_process_line_notes (pfile, true);
    1752   return orig_line != pfile->line_table->highest_line;
    1753 }
   1754 
    1755 /* Skips whitespace, leaving buffer->cur at the next non-whitespace
            character.  C is the first whitespace character; the rest are
            read from buffer->cur.  Null characters are skipped too, with a
            single warning.  */
    1756 static void
    1757 skip_whitespace (cpp_reader *pfile, cppchar_t c)
    1758 {
    1759   cpp_buffer *buffer = pfile->buffer;
    1760   bool saw_NUL = false;
    1761 
    1762   do
    1763     {
    1764       /* Horizontal space always OK.  */
    1765       if (c == ' ' || c == '\t')
    1766 	;
    1767       /* Just \f \v or \0 left.  */
    1768       else if (c == '\0')
    1769 	saw_NUL = true;
               /* Form feed or vertical tab inside a directive.  */
    1770       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
    1771 	cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
    1772 			     CPP_BUF_COL (buffer),
    1773 			     "%s in preprocessing directive",
    1774 			     c == '\f' ? "form feed" : "vertical tab");
    1775 
    1776       c = *buffer->cur++;
    1777     }
    1778   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
    1779   while (is_nvspace (c));
    1780 
           /* Warn once, however many null characters were skipped.  */
    1781   if (saw_NUL)
    1782     {
    1783       encoding_rich_location rich_loc (pfile);
    1784       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
    1785 		    "null character(s) ignored");
    1786     }
    1787 
           /* The loop read one character too many; step back onto it.  */
    1788   buffer->cur--;
    1789 }
   1790 
   1791 /* See if the characters of a number token are valid in a name (no
   1792    '.', '+' or '-').  */
   1793 static int
   1794 name_p (cpp_reader *pfile, const cpp_string *string)
   1795 {
   1796   unsigned int i;
   1797 
   1798   for (i = 0; i < string->len; i++)
   1799     if (!is_idchar (string->text[i]))
   1800       return 0;
   1801 
   1802   return 1;
   1803 }
   1804 
   1805 /* After parsing an identifier or other sequence, produce a warning about
   1806    sequences not in NFC/NFKC.  */
   1807 static void
   1808 warn_about_normalization (cpp_reader *pfile,
   1809 			  const cpp_token *token,
   1810 			  const struct normalize_state *s)
   1811 {
   1812   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
   1813       && !pfile->state.skipping)
   1814     {
   1815       location_t loc = token->src_loc;
   1816 
   1817       /* If possible, create a location range for the token.  */
   1818       if (loc >= RESERVED_LOCATION_COUNT
   1819 	  && token->type != CPP_EOF
   1820 	  /* There must be no line notes to process.  */
   1821 	  && (!(pfile->buffer->cur
   1822 		>= pfile->buffer->notes[pfile->buffer->cur_note].pos
   1823 		&& !pfile->overlaid_buffer)))
   1824 	{
   1825 	  source_range tok_range;
   1826 	  tok_range.m_start = loc;
   1827 	  tok_range.m_finish
   1828 	    = linemap_position_for_column (pfile->line_table,
   1829 					   CPP_BUF_COLUMN (pfile->buffer,
   1830 							   pfile->buffer->cur));
   1831 	  loc = COMBINE_LOCATION_DATA (pfile->line_table,
   1832 				       loc, tok_range, NULL);
   1833 	}
   1834 
   1835       encoding_rich_location rich_loc (pfile, loc);
   1836 
   1837       /* Make sure that the token is printed using UCNs, even
   1838 	 if we'd otherwise happily print UTF-8.  */
   1839       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
   1840       size_t sz;
   1841 
   1842       sz = cpp_spell_token (pfile, token, buf, false) - buf;
   1843       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
   1844 	cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
   1845 			"`%.*s' is not in NFKC", (int) sz, buf);
   1846       else if (CPP_OPTION (pfile, cplusplus))
   1847 	cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
   1848 				  "`%.*s' is not in NFC", (int) sz, buf);
   1849       else
   1850 	cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
   1851 			"`%.*s' is not in NFC", (int) sz, buf);
   1852       free (buf);
   1853     }
   1854 }
   1855 
   1856 static const cppchar_t utf8_signifier = 0xC0;
   1857 
   1858 /* Returns TRUE if the sequence starting at buffer->cur is valid in
   1859    an identifier.  FIRST is TRUE if this starts an identifier.  */
   1860 
   1861 static bool
   1862 forms_identifier_p (cpp_reader *pfile, int first,
   1863 		    struct normalize_state *state)
   1864 {
   1865   cpp_buffer *buffer = pfile->buffer;
   1866   const bool warn_bidi_p = pfile->warn_bidi_p ();
   1867 
   1868   if (*buffer->cur == '$')
   1869     {
   1870       if (!CPP_OPTION (pfile, dollars_in_ident))
   1871 	return false;
   1872 
   1873       buffer->cur++;
   1874       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
   1875 	{
   1876 	  CPP_OPTION (pfile, warn_dollars) = 0;
   1877 	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
   1878 	}
   1879 
   1880       return true;
   1881     }
   1882 
   1883   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
   1884   if (CPP_OPTION (pfile, extended_identifiers))
   1885     {
   1886       cppchar_t s;
   1887       if (*buffer->cur >= utf8_signifier)
   1888 	{
   1889 	  if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
   1890 	      && warn_bidi_p)
   1891 	    {
   1892 	      location_t loc;
   1893 	      bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
   1894 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   1895 	    }
   1896 	  if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
   1897 			       state, &s))
   1898 	    return true;
   1899 	}
   1900       else if (*buffer->cur == '\\'
   1901 	       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
   1902 	{
   1903 	  buffer->cur += 2;
   1904 	  if (warn_bidi_p)
   1905 	    {
   1906 	      location_t loc;
   1907 	      bidi::kind kind = get_bidi_ucn (pfile,
   1908 					      buffer->cur,
   1909 					      buffer->cur[-1] == 'U',
   1910 					      &loc);
   1911 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
   1912 	    }
   1913 	  if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
   1914 			      state, &s, NULL, NULL))
   1915 	    return true;
   1916 	  buffer->cur -= 2;
   1917 	}
   1918     }
   1919 
   1920   return false;
   1921 }
   1922 
   1923 /* Helper function to issue error about improper __VA_OPT__ use.  */
   1924 static void
   1925 maybe_va_opt_error (cpp_reader *pfile)
   1926 {
   1927   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
   1928     {
   1929       /* __VA_OPT__ should not be accepted at all, but allow it in
   1930 	 system headers.  */
   1931       if (!_cpp_in_system_header (pfile))
   1932 	cpp_error (pfile, CPP_DL_PEDWARN,
   1933 		   "__VA_OPT__ is not available until C++20");
   1934     }
   1935   else if (!pfile->state.va_args_ok)
   1936     {
   1937       /* __VA_OPT__ should only appear in the replacement list of a
   1938 	 variadic macro.  */
   1939       cpp_error (pfile, CPP_DL_PEDWARN,
   1940 		 "__VA_OPT__ can only appear in the expansion"
   1941 		 " of a C++20 variadic macro");
   1942     }
   1943 }
   1944 
   1945 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
   1946 static cpp_hashnode *
   1947 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
   1948 {
   1949   cpp_hashnode *result;
   1950   const uchar *cur;
   1951   unsigned int len;
   1952   unsigned int hash = HT_HASHSTEP (0, *base);
   1953 
   1954   cur = base + 1;
   1955   while (ISIDNUM (*cur))
   1956     {
   1957       hash = HT_HASHSTEP (hash, *cur);
   1958       cur++;
   1959     }
   1960   len = cur - base;
   1961   hash = HT_HASHFINISH (hash, len);
   1962   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
   1963 					      base, len, hash, HT_ALLOC));
   1964 
   1965   /* Rarely, identifiers require diagnostics when lexed.  */
   1966   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
   1967 			&& !pfile->state.skipping, 0))
   1968     {
   1969       /* It is allowed to poison the same identifier twice.  */
   1970       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
   1971 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
   1972 		   NODE_NAME (result));
   1973 
   1974       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
   1975 	 replacement list of a variadic macro.  */
   1976       if (result == pfile->spec_nodes.n__VA_ARGS__
   1977 	  && !pfile->state.va_args_ok)
   1978 	{
   1979 	  if (CPP_OPTION (pfile, cplusplus))
   1980 	    cpp_error (pfile, CPP_DL_PEDWARN,
   1981 		       "__VA_ARGS__ can only appear in the expansion"
   1982 		       " of a C++11 variadic macro");
   1983 	  else
   1984 	    cpp_error (pfile, CPP_DL_PEDWARN,
   1985 		       "__VA_ARGS__ can only appear in the expansion"
   1986 		       " of a C99 variadic macro");
   1987 	}
   1988 
   1989       if (result == pfile->spec_nodes.n__VA_OPT__)
   1990 	maybe_va_opt_error (pfile);
   1991 
   1992       /* For -Wc++-compat, warn about use of C++ named operators.  */
   1993       if (result->flags & NODE_WARN_OPERATOR)
   1994 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
   1995 		     "identifier \"%s\" is a special operator name in C++",
   1996 		     NODE_NAME (result));
   1997     }
   1998 
   1999   return result;
   2000 }
   2001 
   2002 /* Get the cpp_hashnode of an identifier specified by NAME in
   2003    the current cpp_reader object.  If none is found, NULL is returned.  */
   2004 cpp_hashnode *
   2005 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
   2006 {
   2007   cpp_hashnode *result;
   2008   result = lex_identifier_intern (pfile, (uchar *) name);
   2009   return result;
   2010 }
   2011 
   2012 /* Lex an identifier starting at BUFFER->CUR - 1.  */
   2013 static cpp_hashnode *
   2014 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
   2015 		struct normalize_state *nst, cpp_hashnode **spelling)
   2016 {
   2017   cpp_hashnode *result;
   2018   const uchar *cur;
   2019   unsigned int len;
   2020   unsigned int hash = HT_HASHSTEP (0, *base);
   2021   const bool warn_bidi_p = pfile->warn_bidi_p ();
   2022 
   2023   cur = pfile->buffer->cur;
   2024   if (! starts_ucn)
   2025     {
   2026       while (ISIDNUM (*cur))
   2027 	{
   2028 	  hash = HT_HASHSTEP (hash, *cur);
   2029 	  cur++;
   2030 	}
   2031       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
   2032     }
   2033   pfile->buffer->cur = cur;
   2034   if (starts_ucn || forms_identifier_p (pfile, false, nst))
   2035     {
   2036       /* Slower version for identifiers containing UCNs
   2037 	 or extended chars (including $).  */
   2038       do {
   2039 	while (ISIDNUM (*pfile->buffer->cur))
   2040 	  {
   2041 	    NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
   2042 	    pfile->buffer->cur++;
   2043 	  }
   2044       } while (forms_identifier_p (pfile, false, nst));
   2045       if (warn_bidi_p)
   2046 	maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
   2047       result = _cpp_interpret_identifier (pfile, base,
   2048 					  pfile->buffer->cur - base);
   2049       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
   2050     }
   2051   else
   2052     {
   2053       len = cur - base;
   2054       hash = HT_HASHFINISH (hash, len);
   2055 
   2056       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
   2057 						  base, len, hash, HT_ALLOC));
   2058       *spelling = result;
   2059     }
   2060 
   2061   /* Rarely, identifiers require diagnostics when lexed.  */
   2062   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
   2063 			&& !pfile->state.skipping, 0))
   2064     {
   2065       /* It is allowed to poison the same identifier twice.  */
   2066       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
   2067 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
   2068 		   NODE_NAME (result));
   2069 
   2070       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
   2071 	 replacement list of a variadic macro.  */
   2072       if (result == pfile->spec_nodes.n__VA_ARGS__
   2073 	  && !pfile->state.va_args_ok)
   2074 	{
   2075 	  if (CPP_OPTION (pfile, cplusplus))
   2076 	    cpp_error (pfile, CPP_DL_PEDWARN,
   2077 		       "__VA_ARGS__ can only appear in the expansion"
   2078 		       " of a C++11 variadic macro");
   2079 	  else
   2080 	    cpp_error (pfile, CPP_DL_PEDWARN,
   2081 		       "__VA_ARGS__ can only appear in the expansion"
   2082 		       " of a C99 variadic macro");
   2083 	}
   2084 
   2085       /* __VA_OPT__ should only appear in the replacement list of a
   2086 	 variadic macro.  */
   2087       if (result == pfile->spec_nodes.n__VA_OPT__)
   2088 	maybe_va_opt_error (pfile);
   2089 
   2090       /* For -Wc++-compat, warn about use of C++ named operators.  */
   2091       if (result->flags & NODE_WARN_OPERATOR)
   2092 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
   2093 		     "identifier \"%s\" is a special operator name in C++",
   2094 		     NODE_NAME (result));
   2095     }
   2096 
   2097   return result;
   2098 }
   2099 
   2100 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
   2101 static void
   2102 lex_number (cpp_reader *pfile, cpp_string *number,
   2103 	    struct normalize_state *nst)
   2104 {
   2105   const uchar *cur;
   2106   const uchar *base;
   2107   uchar *dest;
   2108 
   2109   base = pfile->buffer->cur - 1;
   2110   do
   2111     {
   2112       const uchar *adj_digit_sep = NULL;
   2113       cur = pfile->buffer->cur;
   2114 
   2115       /* N.B. ISIDNUM does not include $.  */
   2116       while (ISIDNUM (*cur)
   2117 	     || (*cur == '.' && !DIGIT_SEP (cur[-1]))
   2118 	     || DIGIT_SEP (*cur)
   2119 	     || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
   2120 	{
   2121 	  NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
   2122 	  /* Adjacent digit separators do not form part of the pp-number syntax.
   2123 	     However, they can safely be diagnosed here as an error, since '' is
   2124 	     not a valid preprocessing token.  */
   2125 	  if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
   2126 	    adj_digit_sep = cur;
   2127 	  cur++;
   2128 	}
   2129       /* A number can't end with a digit separator.  */
   2130       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
   2131 	--cur;
   2132       if (adj_digit_sep && adj_digit_sep < cur)
   2133 	cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
   2134 
   2135       pfile->buffer->cur = cur;
   2136     }
   2137   while (forms_identifier_p (pfile, false, nst));
   2138 
   2139   number->len = cur - base;
   2140   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
   2141   memcpy (dest, base, number->len);
   2142   dest[number->len] = '\0';
   2143   number->text = dest;
   2144 }
   2145 
   2146 /* Create a token of type TYPE with a literal spelling.  */
   2147 static void
   2148 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
   2149 		unsigned int len, enum cpp_ttype type)
   2150 {
   2151   token->type = type;
   2152   token->val.str.len = len;
   2153   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
   2154 }
   2155 
   2156 const uchar *
   2157 cpp_alloc_token_string (cpp_reader *pfile,
   2158 			const unsigned char *ptr, unsigned len)
   2159 {
   2160   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
   2161 
   2162   dest[len] = 0;
   2163   memcpy (dest, ptr, len);
   2164   return dest;
   2165 }
   2166 
   2167 /* A pair of raw buffer pointers.  The currently open one is [1], the
   2168    first one is [0].  Used for string literal lexing.  */
   2169 struct lit_accum {
   2170   _cpp_buff *first;
   2171   _cpp_buff *last;
   2172   const uchar *rpos;
   2173   size_t accum;
   2174 
   2175   lit_accum ()
   2176     : first (NULL), last (NULL), rpos (0), accum (0)
   2177   {
   2178   }
   2179 
   2180   void append (cpp_reader *, const uchar *, size_t);
   2181 
   2182   void read_begin (cpp_reader *);
   2183   bool reading_p () const
   2184   {
   2185     return rpos != NULL;
   2186   }
   2187   char read_char ()
   2188   {
   2189     char c = *rpos++;
   2190     if (rpos == BUFF_FRONT (last))
   2191       rpos = NULL;
   2192     return c;
   2193   }
   2194 };
   2195 
   2196 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
   2197    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
   2198 
   2199 void
   2200 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
   2201 {
   2202   if (!last)
   2203     /* Starting.  */
   2204     first = last = _cpp_get_buff (pfile, len);
   2205   else if (len > BUFF_ROOM (last))
   2206     {
   2207       /* There is insufficient room in the buffer.  Copy what we can,
   2208 	 and then either extend or create a new one.  */
   2209       size_t room = BUFF_ROOM (last);
   2210       memcpy (BUFF_FRONT (last), base, room);
   2211       BUFF_FRONT (last) += room;
   2212       base += room;
   2213       len -= room;
   2214       accum += room;
   2215 
   2216       gcc_checking_assert (!rpos);
   2217 
   2218       last = _cpp_append_extend_buff (pfile, last, len);
   2219     }
   2220 
   2221   memcpy (BUFF_FRONT (last), base, len);
   2222   BUFF_FRONT (last) += len;
   2223   accum += len;
   2224 }
   2225 
   2226 void
   2227 lit_accum::read_begin (cpp_reader *pfile)
   2228 {
   2229   /* We never accumulate more than 4 chars to read.  */
   2230   if (BUFF_ROOM (last) < 4)
   2231 
   2232     last = _cpp_append_extend_buff (pfile, last, 4);
   2233   rpos = BUFF_FRONT (last);
   2234 }
   2235 
   2236 /* Returns true if a macro has been defined.
   2237    This might not work if compile with -save-temps,
   2238    or preprocess separately from compilation.  */
   2239 
   2240 static bool
   2241 is_macro(cpp_reader *pfile, const uchar *base)
   2242 {
   2243   const uchar *cur = base;
   2244   if (! ISIDST (*cur))
   2245     return false;
   2246   unsigned int hash = HT_HASHSTEP (0, *cur);
   2247   ++cur;
   2248   while (ISIDNUM (*cur))
   2249     {
   2250       hash = HT_HASHSTEP (hash, *cur);
   2251       ++cur;
   2252     }
   2253   hash = HT_HASHFINISH (hash, cur - base);
   2254 
   2255   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
   2256 					base, cur - base, hash, HT_NO_INSERT));
   2257 
   2258   return result && cpp_macro_p (result);
   2259 }
   2260 
   2261 /* Returns true if a literal suffix does not have the expected form
   2262    and is defined as a macro.  */
   2263 
   2264 static bool
   2265 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
   2266 {
   2267   /* User-defined literals outside of namespace std must start with a single
   2268      underscore, so assume anything of that form really is a UDL suffix.
   2269      We don't need to worry about UDLs defined inside namespace std because
   2270      their names are reserved, so cannot be used as macro names in valid
   2271      programs.  */
   2272   if (base[0] == '_' && base[1] != '_')
   2273     return false;
   2274   return is_macro (pfile, base);
   2275 }
   2276 
   2277 /* Lexes a raw string.  The stored string contains the spelling,
   2278    including double quotes, delimiter string, '(' and ')', any leading
   2279    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
   2280    the type of the literal, or CPP_OTHER if it was not properly
   2281    terminated.
   2282 
   2283    BASE is the start of the token.  Updates pfile->buffer->cur to just
   2284    after the lexed string.
   2285 
   2286    The spelling is NUL-terminated, but it is not guaranteed that this
   2287    is the first NUL since embedded NULs are preserved.  */
        /* If no further line can be fetched while still inside the
           literal, TOKEN is instead turned into CPP_EOF, an
           "unterminated raw string" error is issued, the buffer is
           popped and nothing is stored (see the _cpp_get_fresh_line
           failure path below).  */
   2288 
   2289 static void
   2290 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
   2291 {
   2292   const uchar *pos = base;
   2293   const bool warn_bidi_p = pfile->warn_bidi_p ();
   2294 
   2295   /* 'tis a pity this information isn't passed down from the lexer's
   2296      initial categorization of the token.  */
   2297   enum cpp_ttype type = CPP_STRING;
   2298 
   2299   if (*pos == 'L')
   2300     {
   2301       type = CPP_WSTRING;
   2302       pos++;
   2303     }
   2304   else if (*pos == 'U')
   2305     {
   2306       type = CPP_STRING32;
   2307       pos++;
   2308     }
   2309   else if (*pos == 'u')
   2310     {
   2311       if (pos[1] == '8')
   2312 	{
   2313 	  type = CPP_UTF8STRING;
   2314 	  pos++;
   2315 	}
   2316       else
   2317 	type = CPP_STRING16;
   2318       pos++;
   2319     }
   2320 
          /* POS is now past any encoding prefix; step over the R and the
             opening quote as well.  */
   2321   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
   2322   pos += 2;
   2323 
   2324   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
   2325 
   2326   /* Skip notes before the ".  */
   2327   while (note->pos < pos)
   2328     ++note;
   2329 
          /* Collects spelling that cannot be taken straight from the line
             buffer: restored line notes and already completed lines.  */
   2330   lit_accum accum;
   2331 
          /* The delimiter (at most 16 chars, see the prefix_len < 16 test
             below) followed by the '"' that is stored when '(' is seen.
             The text after a ')' is matched against this.  */
   2332   uchar prefix[17];
   2333   unsigned prefix_len = 0;
          /* PHASE_PREFIX: still reading the delimiter.  PHASE_NONE: not
             matching anything.  Values from PHASE_SUFFIX upwards are the
             index into PREFIX of the next char expected after a ')'.  */
   2334   enum Phase
   2335   {
   2336    PHASE_PREFIX = -2,
   2337    PHASE_NONE = -1,
   2338    PHASE_SUFFIX = 0
   2339   } phase = PHASE_PREFIX;
   2340 
   2341   for (;;)
   2342     {
   2343       gcc_checking_assert (note->pos >= pos);
   2344 
   2345       /* Undo any escaped newlines and trigraphs.  */
   2346       if (!accum.reading_p () && note->pos == pos)
   2347 	switch (note->type)
   2348 	  {
   2349 	  case '\\':
   2350 	  case ' ':
   2351 	    /* Restore backslash followed by newline.  */
   2352 	    accum.append (pfile, base, pos - base);
   2353 	    base = pos;
        	    /* Everything appended after read_begin is read back
        	       through accum.read_char () below.  */
   2354 	    accum.read_begin (pfile);
   2355 	    accum.append (pfile, UC"\\", 1);
   2356 
   2357 	  after_backslash:
   2358 	    if (note->type == ' ')
   2359 	      /* GNU backslash whitespace newline extension.  FIXME
   2360 		 could be any sequence of non-vertical space.  When we
   2361 		 can properly restore any such sequence, we should
   2362 		 mark this note as handled so _cpp_process_line_notes
   2363 		 doesn't warn.  */
   2364 	      accum.append (pfile, UC" ", 1);
   2365 
   2366 	    accum.append (pfile, UC"\n", 1);
   2367 	    note++;
   2368 	    break;
   2369 
   2370 	  case '\n':
   2371 	    /* This can happen for ??/<NEWLINE> when trigraphs are not
   2372 	       being interpreted.  */
   2373 	    gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
   2374 	    note->type = 0;
   2375 	    note++;
   2376 	    break;
   2377 
   2378 	  default:
   2379 	    gcc_checking_assert (_cpp_trigraph_map[note->type]);
   2380 
   2381 	    /* Don't warn about this trigraph in
   2382 	       _cpp_process_line_notes, since trigraphs show up as
   2383 	       trigraphs in raw strings.  */
   2384 	    uchar type = note->type;
   2385 	    note->type = 0;
   2386 
   2387 	    if (CPP_OPTION (pfile, trigraphs))
   2388 	      {
   2389 		accum.append (pfile, base, pos - base);
   2390 		base = pos;
   2391 		accum.read_begin (pfile);
   2392 		accum.append (pfile, UC"??", 2);
   2393 		accum.append (pfile, &type, 1);
   2394 
   2395 		/* ??/ followed by newline gets two line notes, one for
   2396 		   the trigraph and one for the backslash/newline.  */
   2397 		if (type == '/' && note[1].pos == pos)
   2398 		  {
   2399 		    note++;
   2400 		    gcc_assert (note->type == '\\' || note->type == ' ');
   2401 		    goto after_backslash;
   2402 		  }
   2403 		/* Skip the replacement character.  */
   2404 		base = ++pos;
   2405 	      }
   2406 
   2407 	    note++;
   2408 	    break;
   2409 	  }
   2410 
   2411       /* Now get a char to process.  Either from an expanded note, or
   2412 	 from the line buffer.  */
   2413       bool read_note = accum.reading_p ();
   2414       char c = read_note ? accum.read_char () : *pos++;
   2415 
   2416       if (phase == PHASE_PREFIX)
   2417 	{
   2418 	  if (c == '(')
   2419 	    {
   2420 	      /* Done.  */
   2421 	      phase = PHASE_NONE;
   2422 	      prefix[prefix_len++] = '"';
   2423 	    }
   2424 	  else if (prefix_len < 16
   2425 		   /* Prefix chars are any of the basic character set,
   2426 		      [lex.charset] except for '
   2427 		      ()\\\t\v\f\n'. Optimized for a contiguous
   2428 		      alphabet.  */
   2429 		   /* Unlike a switch, this collapses down to one or
   2430 		      two shift and bitmask operations on an ASCII
   2431 		      system, with an outlier or two.   */
   2432 		   && (('Z' - 'A' == 25
   2433 			? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
   2434 			: ISIDST (c))
   2435 		       || (c >= '0' && c <= '9')
   2436 		       || c == '_' || c == '{' || c == '}'
   2437 		       || c == '[' || c == ']' || c == '#'
   2438 		       || c == '<' || c == '>' || c == '%'
   2439 		       || c == ':' || c == ';' || c == '.' || c == '?'
   2440 		       || c == '*' || c == '+' || c == '-' || c == '/'
   2441 		       || c == '^' || c == '&' || c == '|' || c == '~'
   2442 		       || c == '!' || c == '=' || c == ','
   2443 		       || c == '"' || c == '\''))
   2444 	    prefix[prefix_len++] = c;
   2445 	  else
   2446 	    {
   2447 	      /* Something is wrong.  */
   2448 	      int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
   2449 	      if (prefix_len == 16)
   2450 		cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
   2451 				     col, "raw string delimiter longer "
   2452 				     "than 16 characters");
   2453 	      else if (c == '\n')
   2454 		cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
   2455 				     col, "invalid new-line in raw "
   2456 				     "string delimiter");
   2457 	      else
   2458 		cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
   2459 				     col, "invalid character '%c' in "
   2460 				     "raw string delimiter", c);
   2461 	      type = CPP_OTHER;
   2462 	      phase = PHASE_NONE;
   2463 	      /* Continue until we get a close quote, that's probably
   2464 		 the best failure mode.  */
   2465 	      prefix_len = 0;
   2466 	    }
        	  /* A delimiter character needs no further processing; only
        	     a newline goes on to the code below.  */
   2467 	  if (c != '\n')
   2468 	    continue;
   2469 	}
   2470 
              /* Matching the closing delimiter: compare C with the next
        	 expected character.  A mismatch drops back to PHASE_NONE; a
        	 match of the last stored character ends the literal.  */
   2471       if (phase != PHASE_NONE)
   2472 	{
   2473 	  if (prefix[phase] != c)
   2474 	    phase = PHASE_NONE;
   2475 	  else if (unsigned (phase + 1) == prefix_len)
   2476 	    break;
   2477 	  else
   2478 	    {
   2479 	      phase = Phase (phase + 1);
   2480 	      continue;
   2481 	    }
   2482 	}
   2483 
              /* prefix_len is zero here only after the delimiter error
        	 above, since a well-formed delimiter always stores at least
        	 the '"'.  */
   2484       if (!prefix_len && c == '"')
   2485 	/* Failure mode lexing.  */
   2486 	goto out;
   2487       else if (prefix_len && c == ')')
   2488 	phase = PHASE_SUFFIX;
   2489       else if (!read_note && c == '\n')
   2490 	{
        	  /* End of the current line inside the literal.  Leave
        	     buffer->cur on the newline.  */
   2491 	  pos--;
   2492 	  pfile->buffer->cur = pos;
   2493 	  if (pfile->state.in_directive
   2494 	      || (pfile->state.parsing_args
   2495 		  && pfile->buffer->next_line >= pfile->buffer->rlimit))
   2496 	    {
   2497 	      cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
   2498 				   "unterminated raw string");
   2499 	      type = CPP_OTHER;
   2500 	      goto out;
   2501 	    }
   2502 
        	  /* Save this line, including its newline, before asking
        	     for the next one.  */
   2503 	  accum.append (pfile, base, pos - base + 1);
   2504 	  _cpp_process_line_notes (pfile, false);
   2505 
   2506 	  if (pfile->buffer->next_line < pfile->buffer->rlimit)
   2507 	    CPP_INCREMENT_LINE (pfile, 0);
   2508 	  pfile->buffer->need_line = true;
   2509 
   2510 	  if (!_cpp_get_fresh_line (pfile))
   2511 	    {
   2512 	      /* We ran out of file and failed to get a line.  */
   2513 	      location_t src_loc = token->src_loc;
   2514 	      token->type = CPP_EOF;
   2515 	      /* Tell the compiler the line number of the EOF token.  */
   2516 	      token->src_loc = pfile->line_table->highest_line;
   2517 	      token->flags = BOL;
   2518 	      if (accum.first)
   2519 		_cpp_release_buff (pfile, accum.first);
   2520 	      cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
   2521 				   "unterminated raw string");
   2522 	      /* Now pop the buffer that _cpp_get_fresh_line did not.  */
   2523 	      _cpp_pop_buffer (pfile);
   2524 	      return;
   2525 	    }
   2526 
        	  /* Carry on at the start of the new line.  */
   2527 	  pos = base = pfile->buffer->cur;
   2528 	  note = &pfile->buffer->notes[pfile->buffer->cur_note];
   2529 	}
   2530       else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
   2531 	       && warn_bidi_p)
   2532 	{
   2533 	  location_t loc;
   2534 	  bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
   2535 	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   2536 	}
   2537     }
   2538 
   2539   if (warn_bidi_p)
   2540     maybe_warn_bidi_on_close (pfile, pos);
   2541 
   2542   if (CPP_OPTION (pfile, user_literals))
   2543     {
   2544       /* If a string format macro, say from inttypes.h, is placed touching
   2545 	 a string literal it could be parsed as a C++11 user-defined string
   2546 	 literal thus breaking the program.  */
   2547       if (is_macro_not_literal_suffix (pfile, pos))
   2548 	{
   2549 	  /* Raise a warning, but do not consume subsequent tokens.  */
   2550 	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
   2551 	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
   2552 				   token->src_loc, 0,
   2553 				   "invalid suffix on literal; C++11 requires "
   2554 				   "a space between literal and string macro");
   2555 	}
   2556       /* Grab user defined literal suffix.  */
   2557       else if (ISIDST (*pos))
   2558 	{
   2559 	  type = cpp_userdef_string_add_type (type);
   2560 	  ++pos;
   2561 
   2562 	  while (ISIDNUM (*pos))
   2563 	    ++pos;
   2564 	}
   2565     }
   2566 
   2567  out:
   2568   pfile->buffer->cur = pos;
          /* If nothing was accumulated, the spelling is the contiguous
             range [BASE, POS).  Otherwise it is the accumulated buffers
             followed by that range.  */
   2569   if (!accum.accum)
   2570     create_literal (pfile, token, base, pos - base, type);
   2571   else
   2572     {
   2573       size_t extra_len = pos - base;
   2574       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
   2575 
   2576       token->type = type;
   2577       token->val.str.len = accum.accum + extra_len;
   2578       token->val.str.text = dest;
   2579       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
   2580 	{
   2581 	  size_t len = BUFF_FRONT (buf) - buf->base;
   2582 	  memcpy (dest, buf->base, len);
   2583 	  dest += len;
   2584 	}
   2585       _cpp_release_buff (pfile, accum.first);
   2586       memcpy (dest, base, extra_len);
   2587       dest[extra_len] = '\0';
   2588     }
   2589 }
   2590 
/* Lexes a string, character constant, or angle-bracketed header file
   name whose spelling starts at BASE.  The stored string contains the
   spelling, including opening quote and any leading 'L', 'u', 'U' or
   'u8' and optional 'R' modifier.

   Nothing is returned; the outcome is reported through TOKEN.  The
   literal's type (or CPP_OTHER if it was not properly terminated) is
   handed to create_literal together with the spelling.  For an
   unterminated header name TOKEN's type is set to CPP_LESS and the
   function returns early, without creating a literal or advancing
   the buffer position, so that the text is relexed as normal tokens.
   Raw strings are delegated entirely to lex_raw_string.

   The spelling is NUL-terminated, but it is not guaranteed that this
   is the first NUL since embedded NULs are preserved.  */
static void
lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
{
  bool saw_NUL = false;
  const uchar *cur;
  cppchar_t terminator;
  enum cpp_ttype type;

  /* Step over any encoding prefix ('L', 'U', 'u' or 'u8') to find the
     character that opens the literal.  */
  cur = base;
  terminator = *cur++;
  if (terminator == 'L' || terminator == 'U')
    terminator = *cur++;
  else if (terminator == 'u')
    {
      terminator = *cur++;
      if (terminator == '8')
	terminator = *cur++;
    }
  /* An 'R' at this point marks a raw string, which has its own lexer.  */
  if (terminator == 'R')
    {
      lex_raw_string (pfile, token, base);
      return;
    }
  /* Pick the token type from the delimiter and the prefix at BASE.  */
  if (terminator == '"')
    type = (*base == 'L' ? CPP_WSTRING :
	    *base == 'U' ? CPP_STRING32 :
	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
			 : CPP_STRING);
  else if (terminator == '\'')
    type = (*base == 'L' ? CPP_WCHAR :
	    *base == 'U' ? CPP_CHAR32 :
	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
			 : CPP_CHAR);
  else
    /* Neither kind of quote: treat this as a header name ending at '>'.  */
    terminator = '>', type = CPP_HEADER_NAME;

  const bool warn_bidi_p = pfile->warn_bidi_p ();
  for (;;)
    {
      cppchar_t c = *cur++;

      /* In #include-style directives, terminators are not escapable.  */
      if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
	{
	  if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
	    {
	      location_t loc;
	      bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
					      &loc);
	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
	    }
	  /* Skip the escaped character so that it cannot be taken for
	     the terminator.  */
	  cur++;
	}
      else if (c == terminator)
	{
	  if (warn_bidi_p)
	    maybe_warn_bidi_on_close (pfile, cur - 1);
	  break;
	}
      else if (c == '\n')
	{
	  /* Do not consume the newline.  */
	  cur--;
	  /* Unmatched quotes always yield undefined behavior, but
	     greedy lexing means that what appears to be an unterminated
	     header name may actually be a legitimate sequence of tokens.  */
	  if (terminator == '>')
	    {
	      token->type = CPP_LESS;
	      return;
	    }
	  type = CPP_OTHER;
	  break;
	}
      else if (c == '\0')
	saw_NUL = true;
      else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
	{
	  location_t loc;
	  bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
	}
    }

  if (saw_NUL && !pfile->state.skipping)
    cpp_error (pfile, CPP_DL_WARNING,
	       "null character(s) preserved in literal");

  /* An unterminated literal is diagnosed except when lexing assembly.  */
  if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
    cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
	       (int) terminator);

  if (CPP_OPTION (pfile, user_literals))
    {
      /* If a string format macro, say from inttypes.h, is placed touching
	 a string literal it could be parsed as a C++11 user-defined string
	 literal thus breaking the program.  */
      if (is_macro_not_literal_suffix (pfile, cur))
	{
	  /* Raise a warning, but do not consume subsequent tokens.  */
	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
				   token->src_loc, 0,
				   "invalid suffix on literal; C++11 requires "
				   "a space between literal and string macro");
	}
      /* Grab user defined literal suffix.  */
      else if (ISIDST (*cur))
	{
	  /* Both conversions are applied in turn to the same TYPE.
	     NOTE(review): presumably each leaves types of the other
	     family unchanged -- confirm against their definitions.  */
	  type = cpp_userdef_char_add_type (type);
	  type = cpp_userdef_string_add_type (type);
          ++cur;

	  /* The suffix extends over the rest of the identifier.  */
	  while (ISIDNUM (*cur))
	    ++cur;
	}
    }
  else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
	   && is_macro (pfile, cur)
	   && !pfile->state.skipping)
    cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
			   token->src_loc, 0, "C++11 requires a space "
			   "between string literal and macro");

  /* Commit the consumed text and build the literal token from it.  */
  pfile->buffer->cur = cur;
  create_literal (pfile, token, base, cur - base, type);
}
   2726 
   2727 /* Return the comment table. The client may not make any assumption
   2728    about the ordering of the table.  */
   2729 cpp_comment_table *
   2730 cpp_get_comments (cpp_reader *pfile)
   2731 {
   2732   return &pfile->comments;
   2733 }
   2734 
   2735 /* Append a comment to the end of the comment table. */
   2736 static void
   2737 store_comment (cpp_reader *pfile, cpp_token *token)
   2738 {
   2739   int len;
   2740 
   2741   if (pfile->comments.allocated == 0)
   2742     {
   2743       pfile->comments.allocated = 256;
   2744       pfile->comments.entries = (cpp_comment *) xmalloc
   2745 	(pfile->comments.allocated * sizeof (cpp_comment));
   2746     }
   2747 
   2748   if (pfile->comments.count == pfile->comments.allocated)
   2749     {
   2750       pfile->comments.allocated *= 2;
   2751       pfile->comments.entries = (cpp_comment *) xrealloc
   2752 	(pfile->comments.entries,
   2753 	 pfile->comments.allocated * sizeof (cpp_comment));
   2754     }
   2755 
   2756   len = token->val.str.len;
   2757 
   2758   /* Copy comment. Note, token may not be NULL terminated. */
   2759   pfile->comments.entries[pfile->comments.count].comment =
   2760     (char *) xmalloc (sizeof (char) * (len + 1));
   2761   memcpy (pfile->comments.entries[pfile->comments.count].comment,
   2762 	  token->val.str.text, len);
   2763   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
   2764 
   2765   /* Set source location. */
   2766   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
   2767 
   2768   /* Increment the count of entries in the comment table. */
   2769   pfile->comments.count++;
   2770 }
   2771 
   2772 /* The stored comment includes the comment start and any terminator.  */
   2773 static void
   2774 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
   2775 	      cppchar_t type)
   2776 {
   2777   unsigned char *buffer;
   2778   unsigned int len, clen, i;
   2779   int convert_to_c = (pfile->state.in_directive || pfile->state.parsing_args)
   2780     && type == '/';
   2781 
   2782   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
   2783 
   2784   /* C++ comments probably (not definitely) have moved past a new
   2785      line, which we don't want to save in the comment.  */
   2786   if (is_vspace (pfile->buffer->cur[-1]))
   2787     len--;
   2788 
   2789   /* If we are currently in a directive or in argument parsing, then
   2790      we need to store all C++ comments as C comments internally, and
   2791      so we need to allocate a little extra space in that case.
   2792 
   2793      Note that the only time we encounter a directive here is
   2794      when we are saving comments in a "#define".  */
   2795   clen = convert_to_c ? len + 2 : len;
   2796 
   2797   buffer = _cpp_unaligned_alloc (pfile, clen);
   2798 
   2799   token->type = CPP_COMMENT;
   2800   token->val.str.len = clen;
   2801   token->val.str.text = buffer;
   2802 
   2803   buffer[0] = '/';
   2804   memcpy (buffer + 1, from, len - 1);
   2805 
   2806   /* Finish conversion to a C comment, if necessary.  */
   2807   if (convert_to_c)
   2808     {
   2809       buffer[1] = '*';
   2810       buffer[clen - 2] = '*';
   2811       buffer[clen - 1] = '/';
   2812       /* As there can be in a C++ comments illegal sequences for C comments
   2813          we need to filter them out.  */
   2814       for (i = 2; i < (clen - 2); i++)
   2815         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
   2816           buffer[i] = '|';
   2817     }
   2818 
   2819   /* Finally store this comment for use by clients of libcpp. */
   2820   store_comment (pfile, token);
   2821 }
   2822 
/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
   comment.

   COMMENT_START points at the second character of the comment opener:
   '*' is taken to mean a C block comment, anything else a C++ line
   comment.  Matching starts at the character after it and uses
   PFILE's current buffer position as the upper bound for the length
   checks.  Which spellings are accepted depends on the value of the
   cpp_warn_implicit_fallthrough option (0 to 5), as described at each
   case below.  */

static bool
fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
{
  const unsigned char *from = comment_start + 1;

  switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
    {
      /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
	 don't recognize any comments.  The latter only checks attributes,
	 the former doesn't warn.  */
    case 0:
    default:
      return false;
      /* -Wimplicit-fallthrough=1 considers any comment, no matter what
	 content it has.  */
    case 1:
      return true;
    case 2:
      /* -Wimplicit-fallthrough=2 looks for (case insensitive)
	 .*falls?[ \t-]*thr(u|ough).* regex.  */
      /* Try every start position that still leaves room for the
	 shortest spelling, "fallthru", before the buffer position.  */
      for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
	   from++)
	{
	  /* Is there anything like strpbrk with upper boundary, or
	     memchr looking for 2 characters rather than just one?  */
	  if (from[0] != 'f' && from[0] != 'F')
	    continue;
	  if (from[1] != 'a' && from[1] != 'A')
	    continue;
	  if (from[2] != 'l' && from[2] != 'L')
	    continue;
	  if (from[3] != 'l' && from[3] != 'L')
	    continue;
	  from += sizeof "fall" - 1;
	  /* Optional 's', then any run of blanks, tabs and dashes.  */
	  if (from[0] == 's' || from[0] == 'S')
	    from++;
	  while (*from == ' ' || *from == '\t' || *from == '-')
	    from++;
	  if (from[0] != 't' && from[0] != 'T')
	    continue;
	  if (from[1] != 'h' && from[1] != 'H')
	    continue;
	  if (from[2] != 'r' && from[2] != 'R')
	    continue;
	  if (from[3] == 'u' || from[3] == 'U')
	    return true;
	  if (from[3] != 'o' && from[3] != 'O')
	    continue;
	  if (from[4] != 'u' && from[4] != 'U')
	    continue;
	  if (from[5] != 'g' && from[5] != 'G')
	    continue;
	  if (from[6] != 'h' && from[6] != 'H')
	    continue;
	  return true;
	}
      return false;
    case 3:
    case 4:
      /* These levels require the whole comment to match one of the
	 forms handled after the switch.  */
      break;
    }

  /* Whole comment contents:
     -fallthrough
     @fallthrough@
   */
  if (*from == '-' || *from == '@')
    {
      size_t len = sizeof "fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
	return false;
      if (memcmp (from + 1, "fallthrough", len))
	return false;
      if (*from == '@')
	{
	  /* The '@' form needs a matching closing '@'.  */
	  if (from[len + 1] != '@')
	    return false;
	  len++;
	}
      from += 1 + len;
    }
  /* Whole comment contents (regex):
     lint -fallthrough[ \t]*
   */
  else if (*from == 'l')
    {
      size_t len = sizeof "int -fallthrough" - 1;
      if ((size_t) (pfile->buffer->cur - from - 1) < len)
	return false;
      if (memcmp (from + 1, "int -fallthrough", len))
	return false;
      from += 1 + len;
      while (*from == ' ' || *from == '\t')
	from++;
    }
  /* Whole comment contents (regex):
     [ \t]*FALLTHR(U|OUGH)[ \t]*
   */
  else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
    {
      while (*from == ' ' || *from == '\t')
	from++;
      if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
	return false;
      if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
	return false;
      from += sizeof "FALLTHR" - 1;
      if (*from == 'U')
	from++;
      else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
	return false;
      else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
	return false;
      else
	from += sizeof "OUGH" - 1;
      while (*from == ' ' || *from == '\t')
	from++;
    }
  /* Whole comment contents (regex):
     [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
     [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
   */
  else
    {
      /* Only level 3 gets here: level 4 was handled by the previous
	 branch and the other levels returned from the switch.  */
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
	from++;
      /* F is the first letter of the word being matched; ALL_UPPER
	 records that an all-capitals spelling has been seen, which
	 the remaining words must then follow.  */
      unsigned char f = *from;
      bool all_upper = false;
      if (f == 'E' || f == 'e')
	{
	  if ((size_t) (pfile->buffer->cur - from)
	      < sizeof "else fallthru" - 1)
	    return false;
	  if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
	    all_upper = true;
	  else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
	    return false;
	  from += sizeof "else" - 1;
	  if (*from == ',')
	    from++;
	  if (*from != ' ')
	    return false;
	  from++;
	  /* Reject mixed capitalization between the two words.  */
	  if (all_upper && *from == 'f')
	    return false;
	  if (f == 'e' && *from == 'F')
	    return false;
	  f = *from;
	}
      else if (f == 'I' || f == 'i')
	{
	  if ((size_t) (pfile->buffer->cur - from)
	      < sizeof "intentional fallthru" - 1)
	    return false;
	  if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
				  sizeof "NTENTIONAL" - 1) == 0)
	    all_upper = true;
	  else if (memcmp (from + 1, "ntentional",
			   sizeof "ntentional" - 1))
	    return false;
	  from += sizeof "intentional" - 1;
	  if (*from == ' ')
	    {
	      from++;
	      if (all_upper && *from == 'f')
		return false;
	    }
	  else if (all_upper)
	    {
	      /* "LY " must be followed by a capital 'F'; only the
		 "LY " part is consumed here.  */
	      if (memcmp (from, "LY F", sizeof "LY F" - 1))
		return false;
	      from += sizeof "LY " - 1;
	    }
	  else
	    {
	      if (memcmp (from, "ly ", sizeof "ly " - 1))
		return false;
	      from += sizeof "ly " - 1;
	    }
	  if (f == 'i' && *from == 'F')
	    return false;
	  f = *from;
	}
      if (f != 'F' && f != 'f')
	return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
	return false;
      if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
	all_upper = true;
      else if (all_upper)
	return false;
      else if (memcmp (from + 1, "all", sizeof "all" - 1))
	return false;
      from += sizeof "fall" - 1;
      /* Optional separator between "fall" and "thr...": "s ", a
	 blank or a dash.  */
      if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
	from += 2;
      else if (*from == ' ' || *from == '-')
	from++;
      else if (*from != (all_upper ? 'T' : 't'))
	return false;
      /* The next letter must be 't', or 'T' where the regexes above
	 allow a capital.  */
      if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
	return false;
      if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
	return false;
      if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
	{
	  if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
	    return false;
	  if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
		      sizeof "hrough" - 1))
	    return false;
	  from += sizeof "through" - 1;
	}
      else
	from += sizeof "thru" - 1;
      while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
	from++;
      /* A dash introduces free-form trailing text, which is skipped
	 up to the end of the line or of the block comment.  */
      if (*from == '-')
	{
	  from++;
	  if (*comment_start == '*')
	    {
	      do
		{
		  while (*from && *from != '*'
			 && *from != '\n' && *from != '\r')
		    from++;
		  /* Stop unless this is a '*' that does not close
		     the comment.  */
		  if (*from != '*' || from[1] == '/')
		    break;
		  from++;
		}
	      while (1);
	    }
	  else
	    while (*from && *from != '\n' && *from != '\r')
	      from++;
	}
    }
  /* Whatever form matched, only the comment terminator may follow.  */
  /* C block comment.  */
  if (*comment_start == '*')
    {
      if (*from != '*' || from[1] != '/')
	return false;
    }
  /* C++ line comment.  */
  else if (*from != '\n')
    return false;

  return true;
}
   3077 
   3078 /* Allocate COUNT tokens for RUN.  */
   3079 void
   3080 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
   3081 {
   3082   run->base = XNEWVEC (cpp_token, count);
   3083   run->limit = run->base + count;
   3084   run->next = NULL;
   3085 }
   3086 
   3087 /* Returns the next tokenrun, or creates one if there is none.  */
   3088 static tokenrun *
   3089 next_tokenrun (tokenrun *run)
   3090 {
   3091   if (run->next == NULL)
   3092     {
   3093       run->next = XNEW (tokenrun);
   3094       run->next->prev = run;
   3095       _cpp_init_tokenrun (run->next, 250);
   3096     }
   3097 
   3098   return run->next;
   3099 }
   3100 
   3101 /* Return the number of not yet processed token in a given
   3102    context.  */
   3103 int
   3104 _cpp_remaining_tokens_num_in_context (cpp_context *context)
   3105 {
   3106   if (context->tokens_kind == TOKENS_KIND_DIRECT)
   3107     return (LAST (context).token - FIRST (context).token);
   3108   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
   3109 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
   3110     return (LAST (context).ptoken - FIRST (context).ptoken);
   3111   else
   3112       abort ();
   3113 }
   3114 
   3115 /* Returns the token present at index INDEX in a given context.  If
   3116    INDEX is zero, the next token to be processed is returned.  */
   3117 static const cpp_token*
   3118 _cpp_token_from_context_at (cpp_context *context, int index)
   3119 {
   3120   if (context->tokens_kind == TOKENS_KIND_DIRECT)
   3121     return &(FIRST (context).token[index]);
   3122   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
   3123 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
   3124     return FIRST (context).ptoken[index];
   3125  else
   3126    abort ();
   3127 }
   3128 
/* Look ahead in the input stream.  Returns the token INDEX positions
   ahead of the next one to be processed (so an INDEX of zero is the
   next token) without consuming it.  Tokens still pending in macro
   contexts are counted first; after that new tokens are lexed and
   then backed up again.  Lexing stops early at CPP_EOF or CPP_PRAGMA,
   in which case that token is returned instead.  */
const cpp_token *
cpp_peek_token (cpp_reader *pfile, int index)
{
  cpp_context *context = pfile->context;
  const cpp_token *peektok;
  int count;

  /* First, scan through any pending cpp_context objects.  */
  while (context->prev)
    {
      ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);

      if (index < (int) sz)
        return _cpp_token_from_context_at (context, index);
      /* Not in this context: discount its tokens and move outwards.  */
      index -= (int) sz;
      context = context->prev;
    }

  /* We will have to read some new tokens after all (and do so
     without invalidating preceding tokens).  */
  count = index;
  pfile->keep_tokens++;

  /* For peeked tokens temporarily disable line_change reporting,
     until the tokens are parsed for real.  */
  void (*line_change) (cpp_reader *, const cpp_token *, int)
    = pfile->cb.line_change;
  pfile->cb.line_change = NULL;

  do
    {
      peektok = _cpp_lex_token (pfile);
      if (peektok->type == CPP_EOF)
	{
	  /* Mirror the decrement the loop condition would have made,
	     so that COUNT - INDEX below is the number of tokens
	     lexed.  */
	  index--;
	  break;
	}
      else if (peektok->type == CPP_PRAGMA)
	{
	  /* Don't peek past a pragma.  */
	  if (peektok == &pfile->directive_result)
	    /* Save the pragma in the buffer.  */
	    *pfile->cur_token++ = *peektok;
	  index--;
	  break;
	}
    }
  while (index--);

  /* Push back everything that was lexed and restore the state that
     was changed above.  */
  _cpp_backup_tokens_direct (pfile, count - index);
  pfile->keep_tokens--;
  pfile->cb.line_change = line_change;

  return peektok;
}
   3185 
/* Allocate a single token that is invalidated at the same time as the
   rest of the tokens on the line.  Has its line and col set to the
   same as the last lexed token, so that diagnostics appear in the
   right place.

   The token is taken from the slot at pfile->cur_token, which is then
   advanced.  If lookahead tokens are pending they are first moved up
   by one slot, spilling into the next token run when necessary, so
   that the new token does not overwrite them.  */
cpp_token *
_cpp_temp_token (cpp_reader *pfile)
{
  cpp_token *old, *result;
  /* SZ is the number of slots from cur_token to the end of the
     current run; LA is the number of pending lookahead tokens.  */
  ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
  ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;

  /* The token before cur_token supplies the source location.  */
  old = pfile->cur_token - 1;
  /* Any pre-existing lookaheads must not be clobbered.  */
  if (la)
    {
      if (sz <= la)
        {
          /* Moving up by one slot pushes a token past the end of the
             current run, so the next run is needed.  */
          tokenrun *next = next_tokenrun (pfile->cur_run);

          /* Shift the LA - SZ tokens at the start of the next run
             up by one.  */
          if (sz < la)
            memmove (next->base + 1, next->base,
                     (la - sz) * sizeof (cpp_token));

          /* The last token of the current run moves to the start of
             the next run.  */
          next->base[0] = pfile->cur_run->limit[-1];
        }

      /* Shift the tokens that stay within the current run.  */
      if (sz > 1)
        memmove (pfile->cur_token + 1, pfile->cur_token,
                 MIN (la, sz - 1) * sizeof (cpp_token));
    }

  /* No room left in the current run: continue in the next one.  */
  if (!sz && pfile->cur_token == pfile->cur_run->limit)
    {
      pfile->cur_run = next_tokenrun (pfile->cur_run);
      pfile->cur_token = pfile->cur_run->base;
    }

  result = pfile->cur_token++;
  result->src_loc = old->src_loc;
  return result;
}
   3227 
/* We're at the beginning of a logical line (so not in
  directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
  if we should enter deferred_pragma mode to tokenize the rest of the
  line as a module control-line.

  Up to two further tokens are lexed to decide.  If the line is a
  module control-line, the keyword tokens are marked NO_EXPAND and
  mapped to their underbar variants and deferred_pragma mode stays
  on; otherwise the lexer state changed here is restored.  In both
  cases the peeked tokens are backed up again.  */

static void
cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
{
  unsigned backup = 0; /* Tokens we peeked.  */
  cpp_hashnode *node = result->val.node.node;
  cpp_token *peek = result;
  cpp_token *keyword = peek;
  cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
  /* Nonzero once the keyword is known to be import or __import.
     NOTE(review): the value (BACKUP + 2, optionally + 16) is later
     stored in directive_file_token; how it is decoded is not visible
     here -- confirm against the consumer.  */
  int header_count = 0;

  /* Make sure the incoming state is as we expect it.  This way we
     can restore it using constants.  */
  gcc_checking_assert (!pfile->state.in_deferred_pragma
		       && !pfile->state.skipping
		       && !pfile->state.parsing_args
		       && !pfile->state.angled_headers
		       && (pfile->state.save_comments
			   == !CPP_OPTION (pfile, discard_comments)));

  /* Enter directives mode sufficiently for peeking.  We don't have
     to actually set in_directive.  */
  pfile->state.in_deferred_pragma = true;

  /* These two fields are needed to process tokenization in deferred
     pragma mode.  They are not used outside deferred pragma mode or
     directives mode.  */
  pfile->state.pragma_allow_expansion = true;
  pfile->directive_line = result->src_loc;

  /* Saving comments is incompatible with directives mode.   */
  pfile->state.save_comments = 0;

  /* After "export" the real keyword is the following token, which
     must itself be a module-related name.  */
  if (node == n_modules[spec_nodes::M_EXPORT][0])
    {
      peek = _cpp_lex_direct (pfile);
      keyword = peek;
      backup++;
      if (keyword->type != CPP_NAME)
	goto not_module;
      node = keyword->val.node.node;
      if (!(node->flags & NODE_MODULE))
	goto not_module;
    }

  if (node == n_modules[spec_nodes::M__IMPORT][0])
    /* __import  */
    header_count = backup + 2 + 16;
  else if (node == n_modules[spec_nodes::M_IMPORT][0])
    /* import  */
    header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
  else if (node == n_modules[spec_nodes::M_MODULE][0])
    ; /* module  */
  else
    goto not_module;

  /* We've seen [export] {module|import|__import}.  Check the next token.  */
  if (header_count)
    /* After '{,__}import' a header name may appear.  */
    pfile->state.angled_headers = true;
  peek = _cpp_lex_direct (pfile);
  backup++;

  /* ... import followed by identifier, ':', '<' or
     header-name preprocessing tokens, or module
     followed by cpp-identifier, ':' or ';' preprocessing
     tokens.  C++ keywords are not yet relevant.  */
  if (peek->type == CPP_NAME
      || peek->type == CPP_COLON
      ||  (header_count
	   ? (peek->type == CPP_LESS
	      || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
	      || peek->type == CPP_HEADER_NAME)
	   : peek->type == CPP_SEMICOLON))
    {
      /* Macro expansion on the rest of the line is allowed only when
	 the input is not already preprocessed.  */
      pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
      if (!pfile->state.pragma_allow_expansion)
	pfile->state.prevent_expansion++;

      if (!header_count && linemap_included_from
	  (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
	cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
			     "module control-line cannot be in included file");

      /* The first one or two tokens cannot be macro names.  */
      for (int ix = backup; ix--;)
	{
	  /* The last iteration (IX of zero) always looks at RESULT.  */
	  cpp_token *tok = ix ? keyword : result;
	  cpp_hashnode *node = tok->val.node.node;

	  /* Don't attempt to expand the token.  */
	  tok->flags |= NO_EXPAND;
	  if (_cpp_defined_macro_p (node)
	      && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
	      && !cpp_fun_like_macro_p (node))
	    cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
				 "module control-line \"%s\" cannot be"
				 " an object-like macro",
				 NODE_NAME (node));
	}

      /* Map to underbar variants.  */
      keyword->val.node.node = n_modules[header_count
					 ? spec_nodes::M_IMPORT
					 : spec_nodes::M_MODULE][1];
      /* More than one token was peeked only if "export" came first.  */
      if (backup != 1)
	result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];

      /* Maybe tell the tokenizer we expect a header-name down the
	 road.  */
      pfile->state.directive_file_token = header_count;
    }
  else
    {
    not_module:
      /* Drop out of directive mode.  */
      /* We asserted save_comments had this value upon entry.  */
      pfile->state.save_comments
	= !CPP_OPTION (pfile, discard_comments);
      pfile->state.in_deferred_pragma = false;
      /* Do not let this remain on.  */
      pfile->state.angled_headers = false;
    }

  /* In either case we want to backup the peeked tokens.  */
  if (backup)
    {
      /* If we saw EOL, we should drop it, because this isn't a module
	 control-line after all.  */
      bool eol = peek->type == CPP_PRAGMA_EOL;
      if (!eol || backup > 1)
	{
	  /* Put the peeked tokens back.  */
	  _cpp_backup_tokens_direct (pfile, backup);
	  /* But if the last one was an EOL, forget it.  */
	  if (eol)
	    pfile->lookaheads--;
	}
    }
}
   3372 
/* Lex a token into RESULT (external interface).  Takes care of issues
   like directive handling, token lookahead, multiple include
   optimization and skipping.

   Returns the next token: a pending lookahead if there is one,
   otherwise a freshly lexed token, or PFILE's directive_result when
   a directive or deferred pragma produced one.  While
   pfile->state.skipping is set, tokens other than CPP_EOF are
   discarded unless a directive or deferred pragma is being
   processed.  */
const cpp_token *
_cpp_lex_token (cpp_reader *pfile)
{
  cpp_token *result;

  for (;;)
    {
      /* The current token run is exhausted; continue in the next.  */
      if (pfile->cur_token == pfile->cur_run->limit)
	{
	  pfile->cur_run = next_tokenrun (pfile->cur_run);
	  pfile->cur_token = pfile->cur_run->base;
	}
      /* We assume that the current token is somewhere in the current
	 run.  */
      if (pfile->cur_token < pfile->cur_run->base
	  || pfile->cur_token >= pfile->cur_run->limit)
	abort ();

      /* Reuse a token that was backed up before lexing a new one.  */
      if (pfile->lookaheads)
	{
	  pfile->lookaheads--;
	  result = pfile->cur_token++;
	}
      else
	result = _cpp_lex_direct (pfile);

      /* The token is flagged as the first on its line.  */
      if (result->flags & BOL)
	{
	  /* Is this a directive.  If _cpp_handle_directive returns
	     false, it is an assembler #.  */
	  if (result->type == CPP_HASH
	      /* 6.10.3 p 11: Directives in a list of macro arguments
		 gives undefined behavior.  This implementation
		 handles the directive as normal.  */
	      && pfile->state.parsing_args != 1)
	    {
	      if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
		{
		  /* The directive produced no token; lex another.  */
		  if (pfile->directive_result.type == CPP_PADDING)
		    continue;
		  result = &pfile->directive_result;
		}
	    }
	  else if (pfile->state.in_deferred_pragma)
	    result = &pfile->directive_result;
	  else if (result->type == CPP_NAME
		   && (result->val.node.node->flags & NODE_MODULE)
		   && !pfile->state.skipping
		   /* Unlike regular directives, we do not deal with
		      tokenizing module directives as macro arguments.
		      That's not permitted.  */
		   && !pfile->state.parsing_args)
	    {
	      /* P1857.  Before macro expansion, At start of logical
		 line ... */
	      /* We don't have to consider lookaheads at this point.  */
	      gcc_checking_assert (!pfile->lookaheads);

	      cpp_maybe_module_directive (pfile, result);
	    }

	  /* Tell the client about the new line, unless skipping.  */
	  if (pfile->cb.line_change && !pfile->state.skipping)
	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
	}

      /* We don't skip tokens in directives.  */
      if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
	break;

      /* Outside a directive, invalidate controlling macros.  At file
	 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
	 get here and MI optimization works.  */
      pfile->mi_valid = false;

      /* While skipping, only CPP_EOF is handed back to the caller.  */
      if (!pfile->state.skipping || result->type == CPP_EOF)
	break;
    }

  return result;
}
   3456 
   3457 /* Returns true if a fresh line has been loaded.  */
   3458 bool
   3459 _cpp_get_fresh_line (cpp_reader *pfile)
   3460 {
   3461   /* We can't get a new line until we leave the current directive.  */
   3462   if (pfile->state.in_directive)
   3463     return false;
   3464 
   3465   for (;;)
   3466     {
   3467       cpp_buffer *buffer = pfile->buffer;
   3468 
   3469       if (!buffer->need_line)
   3470 	return true;
   3471 
   3472       if (buffer->next_line < buffer->rlimit)
   3473 	{
   3474 	  _cpp_clean_line (pfile);
   3475 	  return true;
   3476 	}
   3477 
   3478       /* First, get out of parsing arguments state.  */
   3479       if (pfile->state.parsing_args)
   3480 	return false;
   3481 
   3482       /* End of buffer.  Non-empty files should end in a newline.  */
   3483       if (buffer->buf != buffer->rlimit
   3484 	  && buffer->next_line > buffer->rlimit
   3485 	  && !buffer->from_stage3)
   3486 	{
   3487 	  /* Clip to buffer size.  */
   3488 	  buffer->next_line = buffer->rlimit;
   3489 	}
   3490 
   3491       if (buffer->prev && !buffer->return_at_eof)
   3492 	_cpp_pop_buffer (pfile);
   3493       else
   3494 	{
   3495 	  /* End of translation.  Do not pop the buffer yet. Increment
   3496 	     line number so that the EOF token is on a line of its own
   3497 	     (_cpp_lex_direct doesn't increment in that case, because
   3498 	     it's hard for it to distinguish this special case). */
   3499 	  CPP_INCREMENT_LINE (pfile, 0);
   3500 	  return false;
   3501 	}
   3502     }
   3503 }
   3504 
/* Choose between two token types by peeking at the next character.
   RESULT->type is first set to ELSE_TYPE; if the character at
   BUFFER->cur equals CHAR, that character is consumed and
   RESULT->type becomes THEN_TYPE instead.

   This macro is not self-contained: it expects variables named
   `buffer' and `result' to be in scope where it is expanded.  The
   arguments are parenthesized so that an expression argument (for
   instance a conditional expression) cannot re-associate with the
   surrounding `==' or `='.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
  do							\
    {							\
      result->type = (ELSE_TYPE);			\
      if (*buffer->cur == (CHAR))			\
	buffer->cur++, result->type = (THEN_TYPE);	\
    }							\
  while (0)
   3513 
   3514 /* Lex a token into pfile->cur_token, which is also incremented, to
   3515    get diagnostics pointing to the correct location.
   3516 
   3517    Does not handle issues such as token lookahead, multiple-include
   3518    optimization, directives, skipping etc.  This function is only
   3519    suitable for use by _cpp_lex_token, and in special cases like
   3520    lex_expansion_token which doesn't care for any of these issues.
   3521 
   3522    When meeting a newline, returns CPP_EOF if parsing a directive,
   3523    otherwise returns to the start of the token buffer if permissible.
   3524    Returns a pointer to the lexed token.  */
   3525 cpp_token *
   3526 _cpp_lex_direct (cpp_reader *pfile)
   3527 {
   3528   cppchar_t c;
   3529   cpp_buffer *buffer;
   3530   const unsigned char *comment_start;
   3531   bool fallthrough_comment = false;
          /* Take the next token slot; cur_token is advanced past it.  */
   3532   cpp_token *result = pfile->cur_token++;
   3533 
   3534  fresh_line:
   3535   result->flags = 0;
   3536   buffer = pfile->buffer;
          /* The current line is used up: either finish a deferred pragma
             or try to load the next line.  */
   3537   if (buffer->need_line)
   3538     {
   3539       if (pfile->state.in_deferred_pragma)
   3540 	{
   3541 	  /* This can happen in cases like:
   3542 	     #define loop(x) whatever
   3543 	     #pragma omp loop
   3544 	     where when trying to expand loop we need to peek
   3545 	     next token after loop, but aren't still in_deferred_pragma
   3546 	     mode but are in in_directive mode, so buffer->need_line
   3547 	     is set, a CPP_EOF is peeked.  */
        	  /* Emit the end-of-pragma token and leave deferred-pragma
        	     mode, decrementing prevent_expansion unless
        	     pragma_allow_expansion is set.  */
   3548 	  result->type = CPP_PRAGMA_EOL;
   3549 	  pfile->state.in_deferred_pragma = false;
   3550 	  if (!pfile->state.pragma_allow_expansion)
   3551 	    pfile->state.prevent_expansion--;
   3552 	  return result;
   3553 	}
   3554       if (!_cpp_get_fresh_line (pfile))
   3555 	{
   3556 	  result->type = CPP_EOF;
   3557 	  /* Not a real EOF in a directive or arg parsing -- we refuse
   3558   	     to advance to the next file now, and will once we're out
   3559   	     of those modes.  */
   3560 	  if (!pfile->state.in_directive && !pfile->state.parsing_args)
   3561 	    {
   3562 	      /* Tell the compiler the line number of the EOF token.  */
   3563 	      result->src_loc = pfile->line_table->highest_line;
   3564 	      result->flags = BOL;
   3565 	      /* Now pop the buffer that _cpp_get_fresh_line did not.  */
   3566 	      _cpp_pop_buffer (pfile);
   3567 	    }
   3568 	  return result;
   3569 	}
              /* _cpp_get_fresh_line moved to a different buffer, so forget
                 any fallthrough comment seen in the previous one.  */
   3570       if (buffer != pfile->buffer)
   3571 	fallthrough_comment = false;
              /* Tokens need not be kept: rewind to the first slot of the
                 base run and reuse it for this token.  */
   3572       if (!pfile->keep_tokens)
   3573 	{
   3574 	  pfile->cur_run = &pfile->base_run;
   3575 	  result = pfile->base_run.base;
   3576 	  pfile->cur_token = result + 1;
   3577 	}
   3578       result->flags = BOL;
   3579       if (pfile->state.parsing_args == 2)
   3580 	result->flags |= PREV_WHITE;
   3581     }
   3582   buffer = pfile->buffer;
   3583  update_tokens_line:
   3584   result->src_loc = pfile->line_table->highest_line;
   3585 
   3586  skipped_white:
          /* Handle any line notes at or before the cursor (never while the
             buffer is overlaid), then read the next character.  */
   3587   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
   3588       && !pfile->overlaid_buffer)
   3589     {
   3590       _cpp_process_line_notes (pfile, false);
   3591       result->src_loc = pfile->line_table->highest_line;
   3592     }
   3593   c = *buffer->cur++;
   3594 
          /* A forced location overrides the computed one.  Otherwise the
             column is derived from buffer->cur, which has already been
             advanced past C.  */
   3595   if (pfile->forced_token_location)
   3596     result->src_loc = pfile->forced_token_location;
   3597   else
   3598     result->src_loc = linemap_position_for_column (pfile->line_table,
   3599 					  CPP_BUF_COLUMN (buffer, buffer->cur));
   3600 
   3601   switch (c)
   3602     {
   3603     case ' ': case '\t': case '\f': case '\v': case '\0':
   3604       result->flags |= PREV_WHITE;
   3605       skip_whitespace (pfile, c);
   3606       goto skipped_white;
   3607 
   3608     case '\n':
   3609       /* Increment the line, unless this is the last line ...  */
   3610       if (buffer->cur < buffer->rlimit
   3611 	  /* ... or this is a #include, (where _cpp_stack_file needs to
   3612 	     unwind by one line) ...  */
   3613 	  || (pfile->state.in_directive > 1
   3614 	      /* ... except traditional-cpp increments this elsewhere.  */
   3615 	      && !CPP_OPTION (pfile, traditional)))
   3616 	CPP_INCREMENT_LINE (pfile, 0);
   3617       buffer->need_line = true;
   3618       if (pfile->state.in_deferred_pragma)
   3619 	{
   3620 	  /* Produce the PRAGMA_EOL on this line.  File reading
   3621 	     ensures there is always a \n at end of the buffer, thus
   3622 	     in a deferred pragma we always see CPP_PRAGMA_EOL before
   3623 	     any CPP_EOF.  */
   3624 	  result->type = CPP_PRAGMA_EOL;
   3625 	  result->flags &= ~PREV_WHITE;
   3626 	  pfile->state.in_deferred_pragma = false;
   3627 	  if (!pfile->state.pragma_allow_expansion)
   3628 	    pfile->state.prevent_expansion--;
   3629 	  return result;
   3630 	}
   3631       goto fresh_line;
   3632 
   3633     case '0': case '1': case '2': case '3': case '4':
   3634     case '5': case '6': case '7': case '8': case '9':
   3635       {
   3636 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3637 	result->type = CPP_NUMBER;
   3638 	lex_number (pfile, &result->val.str, &nst);
   3639 	warn_about_normalization (pfile, result, &nst);
   3640 	break;
   3641       }
   3642 
   3643     case 'L':
   3644     case 'u':
   3645     case 'U':
   3646     case 'R':
   3647       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
   3648 	 wide strings or raw strings.  */
   3649       if (c == 'L' || CPP_OPTION (pfile, rliterals)
   3650 	  || (c != 'R' && CPP_OPTION (pfile, uliterals)))
   3651 	{
   3652 	  if ((*buffer->cur == '\'' && c != 'R')
   3653 	      || *buffer->cur == '"'
   3654 	      || (*buffer->cur == 'R'
   3655 		  && c != 'R'
   3656 		  && buffer->cur[1] == '"'
   3657 		  && CPP_OPTION (pfile, rliterals))
   3658 	      || (*buffer->cur == '8'
   3659 		  && c == 'u'
   3660 		  && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
   3661 				&& CPP_OPTION (pfile, utf8_char_literals)))
   3662 		      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
   3663 			  && CPP_OPTION (pfile, rliterals)))))
   3664 	    {
   3665 	      lex_string (pfile, result, buffer->cur - 1);
   3666 	      break;
   3667 	    }
   3668 	}
   3669       /* Fall through.  */
   3670 
   3671     case '_':
   3672     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
   3673     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
   3674     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
   3675     case 's': case 't':           case 'v': case 'w': case 'x':
   3676     case 'y': case 'z':
   3677     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
   3678     case 'G': case 'H': case 'I': case 'J': case 'K':
   3679     case 'M': case 'N': case 'O': case 'P': case 'Q':
   3680     case 'S': case 'T':           case 'V': case 'W': case 'X':
   3681     case 'Y': case 'Z':
              /* 'L', 'u', 'U' and 'R' are absent from the lists above
                 because they are the case labels of the previous group,
                 which falls through to here.  */
   3682       result->type = CPP_NAME;
   3683       {
   3684 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3685 	result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
   3686 						&nst,
   3687 						&result->val.node.spelling);
   3688 	warn_about_normalization (pfile, result, &nst);
   3689       }
   3690 
   3691       /* Convert named operators to their proper types.  */
   3692       if (result->val.node.node->flags & NODE_OPERATOR)
   3693 	{
   3694 	  result->flags |= NAMED_OP;
   3695 	  result->type = (enum cpp_ttype) result->val.node.node->directive_index;
   3696 	}
   3697 
   3698       /* Signal FALLTHROUGH comment followed by another token.  */
   3699       if (fallthrough_comment)
   3700 	result->flags |= PREV_FALLTHROUGH;
   3701       break;
   3702 
   3703     case '\'':
   3704     case '"':
   3705       lex_string (pfile, result, buffer->cur - 1);
   3706       break;
   3707 
   3708     case '/':
   3709       /* A potential block or line comment.  */
   3710       comment_start = buffer->cur;
   3711       c = *buffer->cur;
   3712 
   3713       if (c == '*')
   3714 	{
   3715 	  if (_cpp_skip_block_comment (pfile))
   3716 	    cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
   3717 	}
   3718       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
   3719 	{
   3720 	  /* Don't warn for system headers.  */
   3721 	  if (_cpp_in_system_header (pfile))
   3722 	    ;
   3723 	  /* Warn about comments if pedantically GNUC89, and not
   3724 	     in system headers.  */
   3725 	  else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
   3726 		   && CPP_PEDANTIC (pfile)
   3727 		   && ! buffer->warned_cplusplus_comments)
   3728 	    {
   3729 	      if (cpp_error (pfile, CPP_DL_PEDWARN,
   3730 			     "C++ style comments are not allowed in ISO C90"))
   3731 		cpp_error (pfile, CPP_DL_NOTE,
   3732 			   "(this will be reported only once per input file)");
   3733 	      buffer->warned_cplusplus_comments = 1;
   3734 	    }
   3735 	  /* Or if specifically desired via -Wc90-c99-compat.  */
   3736 	  else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
   3737 		   && ! CPP_OPTION (pfile, cplusplus)
   3738 		   && ! buffer->warned_cplusplus_comments)
   3739 	    {
   3740 	      if (cpp_error (pfile, CPP_DL_WARNING,
   3741 			     "C++ style comments are incompatible with C90"))
   3742 		cpp_error (pfile, CPP_DL_NOTE,
   3743 			   "(this will be reported only once per input file)");
   3744 	      buffer->warned_cplusplus_comments = 1;
   3745 	    }
   3746 	  /* In C89/C94, C++ style comments are forbidden.  */
   3747 	  else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
   3748 		    || CPP_OPTION (pfile, lang) == CLK_STDC94))
   3749 	    {
   3750 	      /* But don't be confused about valid code such as
   3751 	         - // immediately followed by *,
   3752 		 - // in a preprocessing directive,
   3753 		 - // in an #if 0 block.  */
   3754 	      if (buffer->cur[1] == '*'
   3755 		  || pfile->state.in_directive
   3756 		  || pfile->state.skipping)
   3757 		{
   3758 		  result->type = CPP_DIV;
   3759 		  break;
   3760 		}
   3761 	      else if (! buffer->warned_cplusplus_comments)
   3762 		{
   3763 		  if (cpp_error (pfile, CPP_DL_ERROR,
   3764 				 "C++ style comments are not allowed in "
   3765 				 "ISO C90"))
   3766 		    cpp_error (pfile, CPP_DL_NOTE,
   3767 			       "(this will be reported only once per input "
   3768 			       "file)");
   3769 		  buffer->warned_cplusplus_comments = 1;
   3770 		}
   3771 	    }
   3772 	  if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
   3773 	    cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
   3774 	}
   3775       else if (c == '=')
   3776 	{
   3777 	  buffer->cur++;
   3778 	  result->type = CPP_DIV_EQ;
   3779 	  break;
   3780 	}
   3781       else
   3782 	{
   3783 	  result->type = CPP_DIV;
   3784 	  break;
   3785 	}
   3786 
              /* Only paths that skipped a comment reach this point; every
                 CPP_DIV and CPP_DIV_EQ path has already left the switch.  */
   3787       if (fallthrough_comment_p (pfile, comment_start))
   3788 	fallthrough_comment = true;
   3789 
              /* COMMENT_START points just after the leading '/', hence the
                 adjustments by one below.  */
   3790       if (pfile->cb.comment)
   3791 	{
   3792 	  size_t len = pfile->buffer->cur - comment_start;
   3793 	  pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
   3794 			     len + 1);
   3795 	}
   3796 
              /* Comments are not being saved: treat this one as whitespace
                 and lex the next token into the same slot.  */
   3797       if (!pfile->state.save_comments)
   3798 	{
   3799 	  result->flags |= PREV_WHITE;
   3800 	  goto update_tokens_line;
   3801 	}
   3802 
   3803       if (fallthrough_comment)
   3804 	result->flags |= PREV_FALLTHROUGH;
   3805 
   3806       /* Save the comment as a token in its own right.  */
   3807       save_comment (pfile, result, comment_start, c);
   3808       break;
   3809 
   3810     case '<':
              /* When angled headers are expected, let lex_string try
                 first; any result other than CPP_LESS is final.  */
   3811       if (pfile->state.angled_headers)
   3812 	{
   3813 	  lex_string (pfile, result, buffer->cur - 1);
   3814 	  if (result->type != CPP_LESS)
   3815 	    break;
   3816 	}
   3817 
   3818       result->type = CPP_LESS;
   3819       if (*buffer->cur == '=')
   3820 	{
   3821 	  buffer->cur++, result->type = CPP_LESS_EQ;
   3822 	  if (*buffer->cur == '>'
   3823 	      && CPP_OPTION (pfile, cplusplus)
   3824 	      && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
   3825 	    buffer->cur++, result->type = CPP_SPACESHIP;
   3826 	}
   3827       else if (*buffer->cur == '<')
   3828 	{
   3829 	  buffer->cur++;
   3830 	  IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
   3831 	}
   3832       else if (CPP_OPTION (pfile, digraphs))
   3833 	{
   3834 	  if (*buffer->cur == ':')
   3835 	    {
   3836 	      /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
   3837 		 three characters are <:: and the subsequent character
   3838 		 is neither : nor >, the < is treated as a preprocessor
   3839 		 token by itself".  */
   3840 	      if (CPP_OPTION (pfile, cplusplus)
   3841 		  && CPP_OPTION (pfile, lang) != CLK_CXX98
   3842 		  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
   3843 		  && buffer->cur[1] == ':'
   3844 		  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
   3845 		break;
   3846 
   3847 	      buffer->cur++;
   3848 	      result->flags |= DIGRAPH;
   3849 	      result->type = CPP_OPEN_SQUARE;
   3850 	    }
   3851 	  else if (*buffer->cur == '%')
   3852 	    {
   3853 	      buffer->cur++;
   3854 	      result->flags |= DIGRAPH;
   3855 	      result->type = CPP_OPEN_BRACE;
   3856 	    }
   3857 	}
   3858       break;
   3859 
   3860     case '>':
   3861       result->type = CPP_GREATER;
   3862       if (*buffer->cur == '=')
   3863 	buffer->cur++, result->type = CPP_GREATER_EQ;
   3864       else if (*buffer->cur == '>')
   3865 	{
   3866 	  buffer->cur++;
   3867 	  IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
   3868 	}
   3869       break;
   3870 
   3871     case '%':
   3872       result->type = CPP_MOD;
   3873       if (*buffer->cur == '=')
   3874 	buffer->cur++, result->type = CPP_MOD_EQ;
   3875       else if (CPP_OPTION (pfile, digraphs))
   3876 	{
   3877 	  if (*buffer->cur == ':')
   3878 	    {
   3879 	      buffer->cur++;
   3880 	      result->flags |= DIGRAPH;
   3881 	      result->type = CPP_HASH;
   3882 	      if (*buffer->cur == '%' && buffer->cur[1] == ':')
   3883 		buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
   3884 	    }
   3885 	  else if (*buffer->cur == '>')
   3886 	    {
   3887 	      buffer->cur++;
   3888 	      result->flags |= DIGRAPH;
   3889 	      result->type = CPP_CLOSE_BRACE;
   3890 	    }
   3891 	}
   3892       break;
   3893 
   3894     case '.':
   3895       result->type = CPP_DOT;
   3896       if (ISDIGIT (*buffer->cur))
   3897 	{
   3898 	  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3899 	  result->type = CPP_NUMBER;
   3900 	  lex_number (pfile, &result->val.str, &nst);
   3901 	  warn_about_normalization (pfile, result, &nst);
   3902 	}
   3903       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
   3904 	buffer->cur += 2, result->type = CPP_ELLIPSIS;
   3905       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
   3906 	buffer->cur++, result->type = CPP_DOT_STAR;
   3907       break;
   3908 
   3909     case '+':
   3910       result->type = CPP_PLUS;
   3911       if (*buffer->cur == '+')
   3912 	buffer->cur++, result->type = CPP_PLUS_PLUS;
   3913       else if (*buffer->cur == '=')
   3914 	buffer->cur++, result->type = CPP_PLUS_EQ;
   3915       break;
   3916 
   3917     case '-':
   3918       result->type = CPP_MINUS;
   3919       if (*buffer->cur == '>')
   3920 	{
   3921 	  buffer->cur++;
   3922 	  result->type = CPP_DEREF;
   3923 	  if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
   3924 	    buffer->cur++, result->type = CPP_DEREF_STAR;
   3925 	}
   3926       else if (*buffer->cur == '-')
   3927 	buffer->cur++, result->type = CPP_MINUS_MINUS;
   3928       else if (*buffer->cur == '=')
   3929 	buffer->cur++, result->type = CPP_MINUS_EQ;
   3930       break;
   3931 
   3932     case '&':
   3933       result->type = CPP_AND;
   3934       if (*buffer->cur == '&')
   3935 	buffer->cur++, result->type = CPP_AND_AND;
   3936       else if (*buffer->cur == '=')
   3937 	buffer->cur++, result->type = CPP_AND_EQ;
   3938       break;
   3939 
   3940     case '|':
   3941       result->type = CPP_OR;
   3942       if (*buffer->cur == '|')
   3943 	buffer->cur++, result->type = CPP_OR_OR;
   3944       else if (*buffer->cur == '=')
   3945 	buffer->cur++, result->type = CPP_OR_EQ;
   3946       break;
   3947 
   3948     case ':':
   3949       result->type = CPP_COLON;
   3950       if (*buffer->cur == ':')
   3951 	{
        	  /* With the scope option "::" is one CPP_SCOPE token;
        	     otherwise the second ':' is left in the buffer and this
        	     CPP_COLON is only flagged COLON_SCOPE.  */
   3952 	  if (CPP_OPTION (pfile, scope))
   3953 	    buffer->cur++, result->type = CPP_SCOPE;
   3954 	  else
   3955 	    result->flags |= COLON_SCOPE;
   3956 	}
   3957       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
   3958 	{
   3959 	  buffer->cur++;
   3960 	  result->flags |= DIGRAPH;
   3961 	  result->type = CPP_CLOSE_SQUARE;
   3962 	}
   3963       break;
   3964 
   3965     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
   3966     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
   3967     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
   3968     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
   3969     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
   3970 
   3971     case '?': result->type = CPP_QUERY; break;
   3972     case '~': result->type = CPP_COMPL; break;
   3973     case ',': result->type = CPP_COMMA; break;
   3974     case '(': result->type = CPP_OPEN_PAREN; break;
   3975     case ')': result->type = CPP_CLOSE_PAREN; break;
   3976     case '[': result->type = CPP_OPEN_SQUARE; break;
   3977     case ']': result->type = CPP_CLOSE_SQUARE; break;
   3978     case '{': result->type = CPP_OPEN_BRACE; break;
   3979     case '}': result->type = CPP_CLOSE_BRACE; break;
   3980     case ';': result->type = CPP_SEMICOLON; break;
   3981 
   3982       /* @ is a punctuator in Objective-C.  */
   3983     case '@': result->type = CPP_ATSIGN; break;
   3984 
   3985     default:
   3986       {
        	/* Step back so that BASE and buffer->cur both point at C.  */
   3987 	const uchar *base = --buffer->cur;
   3988 
   3989 	/* Check for an extended identifier ($ or UCN or UTF-8).  */
   3990 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3991 	if (forms_identifier_p (pfile, true, &nst))
   3992 	  {
   3993 	    result->type = CPP_NAME;
   3994 	    result->val.node.node = lex_identifier (pfile, base, true, &nst,
   3995 						    &result->val.node.spelling);
   3996 	    warn_about_normalization (pfile, result, &nst);
   3997 	    break;
   3998 	  }
   3999 
   4000 	/* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
   4001 	   single token.  */
   4002 	buffer->cur++;
   4003 	if (c >= utf8_signifier)
   4004 	  {
   4005 	    const uchar *pstr = base;
   4006 	    cppchar_t s;
   4007 	    if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
   4008 	      buffer->cur = pstr;
   4009 	  }
   4010 	create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
   4011 	break;
   4012       }
   4013 
   4014     }
   4015 
   4016   /* Potentially convert the location of the token to a range.  */
   4017   if (result->src_loc >= RESERVED_LOCATION_COUNT
   4018       && result->type != CPP_EOF)
   4019     {
   4020       /* Ensure that any line notes are processed, so that we have the
   4021 	 correct physical line/column for the end-point of the token even
   4022 	 when a logical line is split via one or more backslashes.  */
   4023       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
   4024 	  && !pfile->overlaid_buffer)
   4025 	_cpp_process_line_notes (pfile, false);
   4026 
   4027       source_range tok_range;
   4028       tok_range.m_start = result->src_loc;
   4029       tok_range.m_finish
   4030 	= linemap_position_for_column (pfile->line_table,
   4031 				       CPP_BUF_COLUMN (buffer, buffer->cur));
   4032 
   4033       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
   4034 					       result->src_loc,
   4035 					       tok_range, NULL);
   4036     }
   4037 
   4038   return result;
   4039 }
   4040 
   4041 /* An upper bound on the number of bytes needed to spell TOKEN.
   4042    Does not include preceding whitespace.  */
   4043 unsigned int
   4044 cpp_token_len (const cpp_token *token)
   4045 {
   4046   unsigned int len;
   4047 
   4048   switch (TOKEN_SPELL (token))
   4049     {
   4050     default:		len = 6;				break;
   4051     case SPELL_LITERAL:	len = token->val.str.len;		break;
   4052     case SPELL_IDENT:	len = NODE_LEN (token->val.node.node) * 10;	break;
   4053     }
   4054 
   4055   return len;
   4056 }
   4057 
/* Decode the single UTF-8 sequence starting at NAME and store it in
   BUFFER as a "\UXXXXXXXX" universal character name, using lower-case
   hexadecimal digits.  Exactly 10 bytes are written to BUFFER and no
   terminating NUL is added.  Returns the number of bytes of NAME that
   were consumed.

   NAME is expected to point at a byte with its high bit set.
   Ill-formed UTF-8 -- a stray continuation byte in lead position, a
   lead byte announcing more than six bytes, or a continuation byte
   that is not of the form 10xxxxxx -- aborts.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  int j;
  int ucn_len = 0;
  int ucn_len_c;
  unsigned t;
  unsigned long utf32;

  /* The length of the sequence is the number of leading one bits in
     its first byte.  */
  for (t = *name; t & 0x80; t <<= 1)
    ucn_len++;

  /* Ill-formed UTF-8: exactly one leading one bit is a continuation
     byte where a lead byte was expected, and 0xFE / 0xFF (seven or
     eight leading one bits) never occur in UTF-8.  */
  if (ucn_len == 1 || ucn_len > 6)
    abort ();

  /* The payload bits of the lead byte are those below the length
     marker.  */
  utf32 = *name & (0x7F >> ucn_len);
  for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
    {
      /* Each continuation byte contributes six more bits.  */
      utf32 = (utf32 << 6) | (*++name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
	abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  /* Eight hex digits, most significant nibble first.  */
  for (j = 7; j >= 0; j--)
    *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
  return ucn_len;
}
   4091 
   4092 /* Given a token TYPE corresponding to a digraph, return a pointer to
   4093    the spelling of the digraph.  */
   4094 static const unsigned char *
   4095 cpp_digraph2name (enum cpp_ttype type)
   4096 {
   4097   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
   4098 }
   4099 
   4100 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
   4101    The buffer must already contain the enough space to hold the
   4102    token's spelling.  Returns a pointer to the character after the
   4103    last character written.  */
   4104 unsigned char *
   4105 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
   4106 {
   4107   size_t i;
   4108   const unsigned char *name = NODE_NAME (ident);
   4109 
   4110   for (i = 0; i < NODE_LEN (ident); i++)
   4111     if (name[i] & ~0x7F)
   4112       {
   4113 	i += utf8_to_ucn (buffer, name + i) - 1;
   4114 	buffer += 10;
   4115       }
   4116     else
   4117       *buffer++ = name[i];
   4118 
   4119   return buffer;
   4120 }
   4121 
   4122 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
   4123    already contain the enough space to hold the token's spelling.
   4124    Returns a pointer to the character after the last character written.
   4125    FORSTRING is true if this is to be the spelling after translation
   4126    phase 1 (with the original spelling of extended identifiers), false
   4127    if extended identifiers should always be written using UCNs (there is
   4128    no option for always writing them in the internal UTF-8 form).
   4129    FIXME: Would be nice if we didn't need the PFILE argument.  */
   4130 unsigned char *
   4131 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
   4132 		 unsigned char *buffer, bool forstring)
   4133 {
   4134   switch (TOKEN_SPELL (token))
   4135     {
   4136     case SPELL_OPERATOR:
   4137       {
   4138 	const unsigned char *spelling;
   4139 	unsigned char c;
   4140 
   4141 	if (token->flags & DIGRAPH)
   4142 	  spelling = cpp_digraph2name (token->type);
   4143 	else if (token->flags & NAMED_OP)
   4144 	  goto spell_ident;
   4145 	else
   4146 	  spelling = TOKEN_NAME (token);
   4147 
   4148 	while ((c = *spelling++) != '\0')
   4149 	  *buffer++ = c;
   4150       }
   4151       break;
   4152 
   4153     spell_ident:
   4154     case SPELL_IDENT:
   4155       if (forstring)
   4156 	{
   4157 	  memcpy (buffer, NODE_NAME (token->val.node.spelling),
   4158 		  NODE_LEN (token->val.node.spelling));
   4159 	  buffer += NODE_LEN (token->val.node.spelling);
   4160 	}
   4161       else
   4162 	buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
   4163       break;
   4164 
   4165     case SPELL_LITERAL:
   4166       memcpy (buffer, token->val.str.text, token->val.str.len);
   4167       buffer += token->val.str.len;
   4168       break;
   4169 
   4170     case SPELL_NONE:
   4171       cpp_error (pfile, CPP_DL_ICE,
   4172 		 "unspellable token %s", TOKEN_NAME (token));
   4173       break;
   4174     }
   4175 
   4176   return buffer;
   4177 }
   4178 
   4179 /* Returns TOKEN spelt as a null-terminated string.  The string is
   4180    freed when the reader is destroyed.  Useful for diagnostics.  */
   4181 unsigned char *
   4182 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
   4183 {
   4184   unsigned int len = cpp_token_len (token) + 1;
   4185   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
   4186 
   4187   end = cpp_spell_token (pfile, token, start, false);
   4188   end[0] = '\0';
   4189 
   4190   return start;
   4191 }
   4192 
   4193 /* Returns a pointer to a string which spells the token defined by
   4194    TYPE and FLAGS.  Used by C front ends, which really should move to
   4195    using cpp_token_as_text.  */
   4196 const char *
   4197 cpp_type2name (enum cpp_ttype type, unsigned char flags)
   4198 {
   4199   if (flags & DIGRAPH)
   4200     return (const char *) cpp_digraph2name (type);
   4201   else if (flags & NAMED_OP)
   4202     return cpp_named_operator2name (type);
   4203 
   4204   return (const char *) token_spellings[type].name;
   4205 }
   4206 
   4207 /* Writes the spelling of token to FP, without any preceding space.
   4208    Separated from cpp_spell_token for efficiency - to avoid stdio
   4209    double-buffering.  */
   4210 void
   4211 cpp_output_token (const cpp_token *token, FILE *fp)
   4212 {
   4213   switch (TOKEN_SPELL (token))
   4214     {
   4215     case SPELL_OPERATOR:
   4216       {
   4217 	const unsigned char *spelling;
   4218 	int c;
   4219 
   4220 	if (token->flags & DIGRAPH)
   4221 	  spelling = cpp_digraph2name (token->type);
   4222 	else if (token->flags & NAMED_OP)
   4223 	  goto spell_ident;
   4224 	else
   4225 	  spelling = TOKEN_NAME (token);
   4226 
   4227 	c = *spelling;
   4228 	do
   4229 	  putc (c, fp);
   4230 	while ((c = *++spelling) != '\0');
   4231       }
   4232       break;
   4233 
   4234     spell_ident:
   4235     case SPELL_IDENT:
   4236       {
   4237 	size_t i;
   4238 	const unsigned char * name = NODE_NAME (token->val.node.node);
   4239 
   4240 	for (i = 0; i < NODE_LEN (token->val.node.node); i++)
   4241 	  if (name[i] & ~0x7F)
   4242 	    {
   4243 	      unsigned char buffer[10];
   4244 	      i += utf8_to_ucn (buffer, name + i) - 1;
   4245 	      fwrite (buffer, 1, 10, fp);
   4246 	    }
   4247 	  else
   4248 	    fputc (NODE_NAME (token->val.node.node)[i], fp);
   4249       }
   4250       break;
   4251 
   4252     case SPELL_LITERAL:
   4253       if (token->type == CPP_HEADER_NAME)
   4254 	fputc ('"', fp);
   4255       fwrite (token->val.str.text, 1, token->val.str.len, fp);
   4256       if (token->type == CPP_HEADER_NAME)
   4257 	fputc ('"', fp);
   4258       break;
   4259 
   4260     case SPELL_NONE:
   4261       /* An error, most probably.  */
   4262       break;
   4263     }
   4264 }
   4265 
   4266 /* Compare two tokens.  */
   4267 int
   4268 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
   4269 {
   4270   if (a->type == b->type && a->flags == b->flags)
   4271     switch (TOKEN_SPELL (a))
   4272       {
   4273       default:			/* Keep compiler happy.  */
   4274       case SPELL_OPERATOR:
   4275 	/* token_no is used to track where multiple consecutive ##
   4276 	   tokens were originally located.  */
   4277 	return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
   4278       case SPELL_NONE:
   4279 	return (a->type != CPP_MACRO_ARG
   4280 		|| (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
   4281 		    && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
   4282       case SPELL_IDENT:
   4283 	return (a->val.node.node == b->val.node.node
   4284 		&& a->val.node.spelling == b->val.node.spelling);
   4285       case SPELL_LITERAL:
   4286 	return (a->val.str.len == b->val.str.len
   4287 		&& !memcmp (a->val.str.text, b->val.str.text,
   4288 			    a->val.str.len));
   4289       }
   4290 
   4291   return 0;
   4292 }
   4293 
   4294 /* Returns nonzero if a space should be inserted to avoid an
   4295    accidental token paste for output.  For simplicity, it is
   4296    conservative, and occasionally advises a space where one is not
   4297    needed, e.g. "." and ".2".  */
   4298 int
   4299 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
   4300 		 const cpp_token *token2)
   4301 {
   4302   enum cpp_ttype a = token1->type, b = token2->type;
   4303   cppchar_t c;
   4304 
   4305   if (token1->flags & NAMED_OP)
   4306     a = CPP_NAME;
   4307   if (token2->flags & NAMED_OP)
   4308     b = CPP_NAME;
   4309 
   4310   c = EOF;
   4311   if (token2->flags & DIGRAPH)
   4312     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
   4313   else if (token_spellings[b].category == SPELL_OPERATOR)
   4314     c = token_spellings[b].name[0];
   4315 
   4316   /* Quickly get everything that can paste with an '='.  */
   4317   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
   4318     return 1;
   4319 
   4320   switch (a)
   4321     {
   4322     case CPP_GREATER:	return c == '>';
   4323     case CPP_LESS:	return c == '<' || c == '%' || c == ':';
   4324     case CPP_PLUS:	return c == '+';
   4325     case CPP_MINUS:	return c == '-' || c == '>';
   4326     case CPP_DIV:	return c == '/' || c == '*'; /* Comments.  */
   4327     case CPP_MOD:	return c == ':' || c == '>';
   4328     case CPP_AND:	return c == '&';
   4329     case CPP_OR:	return c == '|';
   4330     case CPP_COLON:	return c == ':' || c == '>';
   4331     case CPP_DEREF:	return c == '*';
   4332     case CPP_DOT:	return c == '.' || c == '%' || b == CPP_NUMBER;
   4333     case CPP_HASH:	return c == '#' || c == '%'; /* Digraph form.  */
   4334     case CPP_PRAGMA:
   4335     case CPP_NAME:	return ((b == CPP_NUMBER
   4336 				 && name_p (pfile, &token2->val.str))
   4337 				|| b == CPP_NAME
   4338 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
   4339     case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
   4340 				|| b == CPP_CHAR
   4341 				|| c == '.' || c == '+' || c == '-');
   4342 				      /* UCNs */
   4343     case CPP_OTHER:	return ((token1->val.str.text[0] == '\\'
   4344 				 && b == CPP_NAME)
   4345 				|| (CPP_OPTION (pfile, objc)
   4346 				    && token1->val.str.text[0] == '@'
   4347 				    && (b == CPP_NAME || b == CPP_STRING)));
   4348     case CPP_LESS_EQ:	return c == '>';
   4349     case CPP_STRING:
   4350     case CPP_WSTRING:
   4351     case CPP_UTF8STRING:
   4352     case CPP_STRING16:
   4353     case CPP_STRING32:	return (CPP_OPTION (pfile, user_literals)
   4354 				&& (b == CPP_NAME
   4355 				    || (TOKEN_SPELL (token2) == SPELL_LITERAL
   4356 					&& ISIDST (token2->val.str.text[0]))));
   4357 
   4358     default:		break;
   4359     }
   4360 
   4361   return 0;
   4362 }
   4363 
   4364 /* Output all the remaining tokens on the current line, and a newline
   4365    character, to FP.  Leading whitespace is removed.  If there are
   4366    macros, special token padding is not performed.  */
   4367 void
   4368 cpp_output_line (cpp_reader *pfile, FILE *fp)
   4369 {
   4370   const cpp_token *token;
   4371 
   4372   token = cpp_get_token (pfile);
   4373   while (token->type != CPP_EOF)
   4374     {
   4375       cpp_output_token (token, fp);
   4376       token = cpp_get_token (pfile);
   4377       if (token->flags & PREV_WHITE)
   4378 	putc (' ', fp);
   4379     }
   4380 
   4381   putc ('\n', fp);
   4382 }
   4383 
   4384 /* Return a string representation of all the remaining tokens on the
   4385    current line.  The result is allocated using xmalloc and must be
   4386    freed by the caller.  */
   4387 unsigned char *
   4388 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
   4389 {
   4390   const cpp_token *token;
   4391   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
   4392   unsigned int alloced = 120 + out;
   4393   unsigned char *result = (unsigned char *) xmalloc (alloced);
   4394 
   4395   /* If DIR_NAME is empty, there are no initial contents.  */
   4396   if (dir_name)
   4397     {
   4398       sprintf ((char *) result, "#%s ", dir_name);
   4399       out += 2;
   4400     }
   4401 
   4402   token = cpp_get_token (pfile);
   4403   while (token->type != CPP_EOF)
   4404     {
   4405       unsigned char *last;
   4406       /* Include room for a possible space and the terminating nul.  */
   4407       unsigned int len = cpp_token_len (token) + 2;
   4408 
   4409       if (out + len > alloced)
   4410 	{
   4411 	  alloced *= 2;
   4412 	  if (out + len > alloced)
   4413 	    alloced = out + len;
   4414 	  result = (unsigned char *) xrealloc (result, alloced);
   4415 	}
   4416 
   4417       last = cpp_spell_token (pfile, token, &result[out], 0);
   4418       out = last - result;
   4419 
   4420       token = cpp_get_token (pfile);
   4421       if (token->flags & PREV_WHITE)
   4422 	result[out++] = ' ';
   4423     }
   4424 
   4425   result[out] = '\0';
   4426   return result;
   4427 }
   4428 
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice BUFF's remaining room.  Every
   macro argument is parenthesized so that compound expressions expand
   with the intended precedence.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
   4443 
   4444 /* Create a new allocation buffer.  Place the control block at the end
   4445    of the buffer, so that buffer overflows will cause immediate chaos.  */
   4446 static _cpp_buff *
   4447 new_buff (size_t len)
   4448 {
   4449   _cpp_buff *result;
   4450   unsigned char *base;
   4451 
   4452   if (len < MIN_BUFF_SIZE)
   4453     len = MIN_BUFF_SIZE;
   4454   len = CPP_ALIGN (len);
   4455 
   4456 #ifdef ENABLE_VALGRIND_ANNOTATIONS
   4457   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
   4458      struct first.  */
   4459   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
   4460   base = XNEWVEC (unsigned char, len + slen);
   4461   result = (_cpp_buff *) base;
   4462   base += slen;
   4463 #else
   4464   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
   4465   result = (_cpp_buff *) (base + len);
   4466 #endif
   4467   result->base = base;
   4468   result->cur = base;
   4469   result->limit = base + len;
   4470   result->next = NULL;
   4471   return result;
   4472 }
   4473 
   4474 /* Place a chain of unwanted allocation buffers on the free list.  */
   4475 void
   4476 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
   4477 {
   4478   _cpp_buff *end = buff;
   4479 
   4480   while (end->next)
   4481     end = end->next;
   4482   end->next = pfile->free_buffs;
   4483   pfile->free_buffs = buff;
   4484 }
   4485 
   4486 /* Return a free buffer of size at least MIN_SIZE.  */
   4487 _cpp_buff *
   4488 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
   4489 {
   4490   _cpp_buff *result, **p;
   4491 
   4492   for (p = &pfile->free_buffs;; p = &(*p)->next)
   4493     {
   4494       size_t size;
   4495 
   4496       if (*p == NULL)
   4497 	return new_buff (min_size);
   4498       result = *p;
   4499       size = result->limit - result->base;
   4500       /* Return a buffer that's big enough, but don't waste one that's
   4501          way too big.  */
   4502       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
   4503 	break;
   4504     }
   4505 
   4506   *p = result->next;
   4507   result->next = NULL;
   4508   result->cur = result->base;
   4509   return result;
   4510 }
   4511 
   4512 /* Creates a new buffer with enough space to hold the uncommitted
   4513    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
   4514    the excess bytes to the new buffer.  Chains the new buffer after
   4515    BUFF, and returns the new buffer.  */
   4516 _cpp_buff *
   4517 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
   4518 {
   4519   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
   4520   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
   4521 
   4522   buff->next = new_buff;
   4523   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
   4524   return new_buff;
   4525 }
   4526 
   4527 /* Creates a new buffer with enough space to hold the uncommitted
   4528    remaining bytes of the buffer pointed to by BUFF, and at least
   4529    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
   4530    Chains the new buffer before the buffer pointed to by BUFF, and
   4531    updates the pointer to point to the new buffer.  */
   4532 void
   4533 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
   4534 {
   4535   _cpp_buff *new_buff, *old_buff = *pbuff;
   4536   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
   4537 
   4538   new_buff = _cpp_get_buff (pfile, size);
   4539   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
   4540   new_buff->next = old_buff;
   4541   *pbuff = new_buff;
   4542 }
   4543 
   4544 /* Free a chain of buffers starting at BUFF.  */
   4545 void
   4546 _cpp_free_buff (_cpp_buff *buff)
   4547 {
   4548   _cpp_buff *next;
   4549 
   4550   for (; buff; buff = next)
   4551     {
   4552       next = buff->next;
   4553 #ifdef ENABLE_VALGRIND_ANNOTATIONS
   4554       free (buff);
   4555 #else
   4556       free (buff->base);
   4557 #endif
   4558     }
   4559 }
   4560 
   4561 /* Allocate permanent, unaligned storage of length LEN.  */
   4562 unsigned char *
   4563 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
   4564 {
   4565   _cpp_buff *buff = pfile->u_buff;
   4566   unsigned char *result = buff->cur;
   4567 
   4568   if (len > (size_t) (buff->limit - result))
   4569     {
   4570       buff = _cpp_get_buff (pfile, len);
   4571       buff->next = pfile->u_buff;
   4572       pfile->u_buff = buff;
   4573       result = buff->cur;
   4574     }
   4575 
   4576   buff->cur = result + len;
   4577   return result;
   4578 }
   4579 
   4580 /* Allocate permanent, unaligned storage of length LEN from a_buff.
   4581    That buffer is used for growing allocations when saving macro
   4582    replacement lists in a #define, and when parsing an answer to an
   4583    assertion in #assert, #unassert or #if (and therefore possibly
   4584    whilst expanding macros).  It therefore must not be used by any
   4585    code that they might call: specifically the lexer and the guts of
   4586    the macro expander.
   4587 
   4588    All existing other uses clearly fit this restriction: storing
   4589    registered pragmas during initialization.  */
   4590 unsigned char *
   4591 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
   4592 {
   4593   _cpp_buff *buff = pfile->a_buff;
   4594   unsigned char *result = buff->cur;
   4595 
   4596   if (len > (size_t) (buff->limit - result))
   4597     {
   4598       buff = _cpp_get_buff (pfile, len);
   4599       buff->next = pfile->a_buff;
   4600       pfile->a_buff = buff;
   4601       result = buff->cur;
   4602     }
   4603 
   4604   buff->cur = result + len;
   4605   return result;
   4606 }
   4607 
   4608 /* Commit or allocate storage from a buffer.  */
   4609 
   4610 void *
   4611 _cpp_commit_buff (cpp_reader *pfile, size_t size)
   4612 {
   4613   void *ptr = BUFF_FRONT (pfile->a_buff);
   4614 
   4615   if (pfile->hash_table->alloc_subobject)
   4616     {
   4617       void *copy = pfile->hash_table->alloc_subobject (size);
   4618       memcpy (copy, ptr, size);
   4619       ptr = copy;
   4620     }
   4621   else
   4622     BUFF_FRONT (pfile->a_buff) += size;
   4623 
   4624   return ptr;
   4625 }
   4626 
   4627 /* Say which field of TOK is in use.  */
   4628 
   4629 enum cpp_token_fld_kind
   4630 cpp_token_val_index (const cpp_token *tok)
   4631 {
   4632   switch (TOKEN_SPELL (tok))
   4633     {
   4634     case SPELL_IDENT:
   4635       return CPP_TOKEN_FLD_NODE;
   4636     case SPELL_LITERAL:
   4637       return CPP_TOKEN_FLD_STR;
   4638     case SPELL_OPERATOR:
   4639       /* Operands which were originally spelled as ident keep around
   4640          the node for the exact spelling.  */
   4641       if (tok->flags & NAMED_OP)
   4642 	return CPP_TOKEN_FLD_NODE;
   4643       else if (tok->type == CPP_PASTE)
   4644 	return CPP_TOKEN_FLD_TOKEN_NO;
   4645       else
   4646 	return CPP_TOKEN_FLD_NONE;
   4647     case SPELL_NONE:
   4648       if (tok->type == CPP_MACRO_ARG)
   4649 	return CPP_TOKEN_FLD_ARG_NO;
   4650       else if (tok->type == CPP_PADDING)
   4651 	return CPP_TOKEN_FLD_SOURCE;
   4652       else if (tok->type == CPP_PRAGMA)
   4653 	return CPP_TOKEN_FLD_PRAGMA;
   4654       /* fall through */
   4655     default:
   4656       return CPP_TOKEN_FLD_NONE;
   4657     }
   4658 }
   4659 
   4660 /* All tokens lexed in R after calling this function will be forced to
   4661    have their location_t to be P, until
   4662    cpp_stop_forcing_token_locations is called for R.  */
   4663 
   4664 void
   4665 cpp_force_token_locations (cpp_reader *r, location_t loc)
   4666 {
   4667   r->forced_token_location = loc;
   4668 }
   4669 
   4670 /* Go back to assigning locations naturally for lexed tokens.  */
   4671 
   4672 void
   4673 cpp_stop_forcing_token_locations (cpp_reader *r)
   4674 {
   4675   r->forced_token_location = 0;
   4676 }
   4677 
/* PEEK points at a backslash.  While that backslash escapes an end of
   line ("\n", "\r" or "\r\n"), step past the escaped line ending and
   look again.  Never advance to LIMIT or beyond: if the character
   after the line ending is not below LIMIT, stay on the backslash.
   Returns the resulting position.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
	after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
	/* A lone CR, or a CR LF pair.  */
	after = peek + (peek[2] == '\n' ? 3 : 2);
      else
	/* Not a line continuation at all.  */
	return peek;

      if (!__builtin_expect (after < limit, true))
	return peek;

      peek = after;
      /* The user might be perverse and chain continuations.  */
      if (*peek != '\\')
	return peek;
    }
}
   4707 
   4708 static const unsigned char *
   4709 do_peek_next (const unsigned char *peek, const unsigned char *limit)
   4710 {
   4711   if (__builtin_expect (*peek == '\\', false))
   4712     peek = do_peek_backslash (peek, limit);
   4713   return peek;
   4714 }
   4715 
/* Step back one character from PEEK without going below BOUND.
   Returns NULL if PEEK is already at BOUND.  Otherwise returns the
   position of the previous character; when that character is a line
   ending ("\n", "\r" or "\r\n") immediately preceded by a backslash,
   the escaped line ending is skipped and the search continues from
   the backslash.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Test for a carriage return ('\r'), not the letter 'r': an escaped
     lone CR must be skipped and a plain 'r' after a backslash must
     not be.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
	return peek;
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
	{
	  /* A CR LF pair: the backslash, if any, is one further back.  */
	  if (peek + ix == bound)
	    return peek;
	  ix--;
	}

      if (peek[ix] == '\\')
	return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
   4744 
   4745 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
   4746    space.  Otherwise return NULL.  */
   4747 
   4748 static const unsigned char *
   4749 do_peek_ident (const char *match, const unsigned char *peek,
   4750 	       const unsigned char *limit)
   4751 {
   4752   for (; *++match; peek++)
   4753     if (*peek != *match)
   4754       {
   4755 	peek = do_peek_next (peek, limit);
   4756 	if (*peek != *match)
   4757 	  return NULL;
   4758       }
   4759 
   4760   /* Must now not be looking at an identifier char.  */
   4761   peek = do_peek_next (peek, limit);
   4762   if (ISIDNUM (*peek))
   4763     return NULL;
   4764 
   4765   /* Skip control-line whitespace.  */
   4766  ws:
   4767   while (*peek == ' ' || *peek == '\t')
   4768     peek++;
   4769   if (__builtin_expect (*peek == '\\', false))
   4770     {
   4771       peek = do_peek_backslash (peek, limit);
   4772       if (*peek != '\\')
   4773 	goto ws;
   4774     }
   4775 
   4776   return peek;
   4777 }
   4778 
   4779 /* Are we looking at a module control line starting as PEEK - 1?  */
   4780 
   4781 static bool
   4782 do_peek_module (cpp_reader *pfile, unsigned char c,
   4783 		const unsigned char *peek, const unsigned char *limit)
   4784 {
   4785   bool import = false;
   4786 
   4787   if (__builtin_expect (c == 'e', false))
   4788     {
   4789       if (!((peek[0] == 'x' || peek[0] == '\\')
   4790 	    && (peek = do_peek_ident ("export", peek, limit))))
   4791 	return false;
   4792 
   4793       /* export, peek for import or module.  No need to peek __import
   4794 	 here.  */
   4795       if (peek[0] == 'i')
   4796 	{
   4797 	  if (!((peek[1] == 'm' || peek[1] == '\\')
   4798 		&& (peek = do_peek_ident ("import", peek + 1, limit))))
   4799 	    return false;
   4800 	  import = true;
   4801 	}
   4802       else if (peek[0] == 'm')
   4803 	{
   4804 	  if (!((peek[1] == 'o' || peek[1] == '\\')
   4805 		&& (peek = do_peek_ident ("module", peek + 1, limit))))
   4806 	    return false;
   4807 	}
   4808       else
   4809 	return false;
   4810     }
   4811   else if (__builtin_expect (c == 'i', false))
   4812     {
   4813       if (!((peek[0] == 'm' || peek[0] == '\\')
   4814 	    && (peek = do_peek_ident ("import", peek, limit))))
   4815 	return false;
   4816       import = true;
   4817     }
   4818   else if (__builtin_expect (c == '_', false))
   4819     {
   4820       /* Needed for translated includes.   */
   4821       if (!((peek[0] == '_' || peek[0] == '\\')
   4822 	    && (peek = do_peek_ident ("__import", peek, limit))))
   4823 	return false;
   4824       import = true;
   4825     }
   4826   else if (__builtin_expect (c == 'm', false))
   4827     {
   4828       if (!((peek[0] == 'o' || peek[0] == '\\')
   4829 	    && (peek = do_peek_ident ("module", peek, limit))))
   4830 	return false;
   4831     }
   4832   else
   4833     return false;
   4834 
   4835   /* Peek the next character to see if it's good enough.  We'll be at
   4836      the first non-whitespace char, including skipping an escaped
   4837      newline.  */
   4838   /* ... import followed by identifier, ':', '<' or header-name
   4839      preprocessing tokens, or module followed by identifier, ':' or
   4840      ';' preprocessing tokens.  */
   4841   unsigned char p = *peek++;
   4842 
   4843   /* A character literal is ... single quotes, ... optionally preceded
   4844      by u8, u, U, or L */
   4845   /* A string-literal is a ... double quotes, optionally prefixed by
   4846      R, u8, u8R, u, uR, U, UR, L, or LR */
   4847   if (p == 'u')
   4848     {
   4849       peek = do_peek_next (peek, limit);
   4850       if (*peek == '8')
   4851 	{
   4852 	  peek++;
   4853 	  goto peek_u8;
   4854 	}
   4855       goto peek_u;
   4856     }
   4857   else if (p == 'U' || p == 'L')
   4858     {
   4859     peek_u8:
   4860       peek = do_peek_next (peek, limit);
   4861     peek_u:
   4862       if (*peek == '\"' || *peek == '\'')
   4863 	return false;
   4864 
   4865       if (*peek == 'R')
   4866 	goto peek_R;
   4867       /* Identifier. Ok.  */
   4868     }
   4869   else if (p == 'R')
   4870     {
   4871     peek_R:
   4872       if (CPP_OPTION (pfile, rliterals))
   4873 	{
   4874 	  peek = do_peek_next (peek, limit);
   4875 	  if (*peek == '\"')
   4876 	    return false;
   4877 	}
   4878       /* Identifier. Ok.  */
   4879     }
   4880   else if ('Z' - 'A' == 25
   4881 	   ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
   4882 	   : ISIDST (p))
   4883     {
   4884       /* Identifier.  Ok. */
   4885     }
   4886   else if (p == '<')
   4887     {
   4888       /* Maybe angle header, ok for import.  Reject
   4889 	 '<=', '<<' digraph:'<:'.  */
   4890       if (!import)
   4891 	return false;
   4892       peek = do_peek_next (peek, limit);
   4893       if (*peek == '=' || *peek == '<'
   4894 	  || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
   4895 	return false;
   4896     }
   4897   else if (p == ';')
   4898     {
   4899       /* SEMICOLON, ok for module.  */
   4900       if (import)
   4901 	return false;
   4902     }
   4903   else if (p == '"')
   4904     {
   4905       /* STRING, ok for import.  */
   4906       if (!import)
   4907 	return false;
   4908     }
   4909   else if (p == ':')
   4910     {
   4911       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
   4912       peek = do_peek_next (peek, limit);
   4913       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
   4914 	return false;
   4915     }
   4916   else
   4917     /* FIXME: Detect a unicode character, excluding those not
   4918        permitted as the initial character. [lex.name]/1.  I presume
   4919        we need to check the \[uU] spellings, and directly using
   4920        Unicode in say UTF8 form?  Or perhaps we do the phase-1
   4921        conversion of UTF8 to universal-character-names?  */
   4922     return false;
   4923 
   4924   return true;
   4925 }
   4926 
   4927 /* Directives-only scanning.  Somewhat more relaxed than correct
   4928    parsing -- some ill-formed programs will not be rejected.  */
   4929 
   4930 void
   4931 cpp_directive_only_process (cpp_reader *pfile,
   4932 			    void *data,
   4933 			    void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
   4934 {
   4935   bool module_p = CPP_OPTION (pfile, module_directives);
   4936 
   4937   do
   4938     {
   4939     restart:
   4940       /* Buffer initialization, but no line cleaning. */
   4941       cpp_buffer *buffer = pfile->buffer;
   4942       buffer->cur_note = buffer->notes_used = 0;
   4943       buffer->cur = buffer->line_base = buffer->next_line;
   4944       buffer->need_line = false;
   4945       /* Files always end in a newline or carriage return.  We rely on this for
   4946 	 character peeking safety.  */
   4947       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
   4948 
   4949       const unsigned char *base = buffer->cur;
   4950       unsigned line_count = 0;
   4951       const unsigned char *line_start = base;
   4952 
   4953       bool bol = true;
   4954       bool raw = false;
   4955 
   4956       const unsigned char *lwm = base;
   4957       for (const unsigned char *pos = base, *limit = buffer->rlimit;
   4958 	   pos < limit;)
   4959 	{
   4960 	  unsigned char c = *pos++;
   4961 	  /* This matches the switch in _cpp_lex_direct.  */
   4962 	  switch (c)
   4963 	    {
   4964 	    case ' ': case '\t': case '\f': case '\v':
   4965 	      /* Whitespace, do nothing.  */
   4966 	      break;
   4967 
   4968 	    case '\r': /* MAC line ending, or Windows \r\n  */
   4969 	      if (*pos == '\n')
   4970 		pos++;
   4971 	      /* FALLTHROUGH */
   4972 
   4973 	    case '\n':
   4974 	      bol = true;
   4975 
   4976 	    next_line:
   4977 	      CPP_INCREMENT_LINE (pfile, 0);
   4978 	      line_count++;
   4979 	      line_start = pos;
   4980 	      break;
   4981 
   4982 	    case '\\':
   4983 	      /* <backslash><newline> is removed, and doesn't undo any
   4984 		 preceding escape or whatnot.  */
   4985 	      if (*pos == '\n')
   4986 		{
   4987 		  pos++;
   4988 		  goto next_line;
   4989 		}
   4990 	      else if (*pos == '\r')
   4991 		{
   4992 		  if (pos[1] == '\n')
   4993 		    pos++;
   4994 		  pos++;
   4995 		  goto next_line;
   4996 		}
   4997 	      goto dflt;
   4998 
   4999 	    case '#':
   5000 	      if (bol)
   5001 		{
   5002 		  /* Line directive.  */
   5003 		  if (pos - 1 > base && !pfile->state.skipping)
   5004 		    cb (pfile, CPP_DO_print, data,
   5005 			line_count, base, pos - 1 - base);
   5006 
   5007 		  /* Prep things for directive handling. */
   5008 		  buffer->next_line = pos;
   5009 		  buffer->need_line = true;
   5010 		  bool ok = _cpp_get_fresh_line (pfile);
   5011 		  gcc_checking_assert (ok);
   5012 
   5013 		  /* Ensure proper column numbering for generated
   5014 		     error messages. */
   5015 		  buffer->line_base -= pos - line_start;
   5016 
   5017 		  _cpp_handle_directive (pfile, line_start + 1 != pos);
   5018 
   5019 		  /* Sanitize the line settings.  Duplicate #include's can
   5020 		     mess things up. */
   5021 		  // FIXME: Necessary?
   5022 		  pfile->line_table->highest_location
   5023 		    = pfile->line_table->highest_line;
   5024 
   5025 		  if (!pfile->state.skipping
   5026 		      && pfile->buffer->next_line < pfile->buffer->rlimit)
   5027 		    cb (pfile, CPP_DO_location, data,
   5028 			pfile->line_table->highest_line);
   5029 
   5030 		  goto restart;
   5031 		}
   5032 	      goto dflt;
   5033 
   5034 	    case '/':
   5035 	      {
   5036 		const unsigned char *peek = do_peek_next (pos, limit);
   5037 		if (!(*peek == '/' || *peek == '*'))
   5038 		  goto dflt;
   5039 
   5040 		/* Line or block comment  */
   5041 		bool is_block = *peek == '*';
   5042 		bool star = false;
   5043 		bool esc = false;
   5044 		location_t sloc
   5045 		  = linemap_position_for_column (pfile->line_table,
   5046 						 pos - line_start);
   5047 
   5048 		while (pos < limit)
   5049 		  {
   5050 		    char c = *pos++;
   5051 		    switch (c)
   5052 		      {
   5053 		      case '\\':
   5054 			esc = true;
   5055 			break;
   5056 
   5057 		      case '\r':
   5058 			if (*pos == '\n')
   5059 			  pos++;
   5060 			/* FALLTHROUGH  */
   5061 
   5062 		      case '\n':
   5063 			{
   5064 			  CPP_INCREMENT_LINE (pfile, 0);
   5065 			  line_count++;
   5066 			  line_start = pos;
   5067 			  if (!esc && !is_block)
   5068 			    {
   5069 			      bol = true;
   5070 			      goto done_comment;
   5071 			    }
   5072 			}
   5073 			if (!esc)
   5074 			  star = false;
   5075 			esc = false;
   5076 			break;
   5077 
   5078 		      case '*':
   5079 			if (pos > peek)
   5080 			  star = is_block;
   5081 			esc = false;
   5082 			break;
   5083 
   5084 		      case '/':
   5085 			if (star)
   5086 			  goto done_comment;
   5087 			/* FALLTHROUGH  */
   5088 
   5089 		      default:
   5090 			star = false;
   5091 			esc = false;
   5092 			break;
   5093 		      }
   5094 		  }
   5095 		if (pos < limit || is_block)
   5096 		  cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
   5097 				       "unterminated comment");
   5098 	      done_comment:
   5099 		lwm = pos;
   5100 		break;
   5101 	      }
   5102 
   5103 	    case '\'':
   5104 	      if (!CPP_OPTION (pfile, digit_separators))
   5105 		goto delimited_string;
   5106 
   5107 	      /* Possibly a number punctuator.  */
   5108 	      if (!ISIDNUM (*do_peek_next (pos, limit)))
   5109 		goto delimited_string;
   5110 
   5111 	      goto quote_peek;
   5112 
   5113 	    case '\"':
   5114 	      if (!CPP_OPTION (pfile, rliterals))
   5115 		goto delimited_string;
   5116 
   5117 	    quote_peek:
   5118 	      {
   5119 		/* For ' see if it's a number punctuator
   5120 		   \.?<digit>(<digit>|<identifier-nondigit>
   5121 		   |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
   5122 		/* For " see if it's a raw string
   5123 		   {U,L,u,u8}R.  This includes CPP_NUMBER detection,
   5124 		   because that could be 0e+R.  */
   5125 		const unsigned char *peek = pos - 1;
   5126 		bool quote_first = c == '"';
   5127 		bool quote_eight = false;
   5128 		bool maybe_number_start = false;
   5129 		bool want_number = false;
   5130 
   5131 		while ((peek = do_peek_prev (peek, lwm)))
   5132 		  {
   5133 		    unsigned char p = *peek;
   5134 		    if (quote_first)
   5135 		      {
   5136 			if (!raw)
   5137 			  {
   5138 			    if (p != 'R')
   5139 			      break;
   5140 			    raw = true;
   5141 			    continue;
   5142 			  }
   5143 
   5144 			quote_first = false;
   5145 			if (p == 'L' || p == 'U' || p == 'u')
   5146 			  ;
   5147 			else if (p == '8')
   5148 			  quote_eight = true;
   5149 			else
   5150 			  goto second_raw;
   5151 		      }
   5152 		    else if (quote_eight)
   5153 		      {
   5154 			if (p != 'u')
   5155 			  {
   5156 			    raw = false;
   5157 			    break;
   5158 			  }
   5159 			quote_eight = false;
   5160 		      }
   5161 		    else if (c == '"')
   5162 		      {
   5163 		      second_raw:;
   5164 			if (!want_number && ISIDNUM (p))
   5165 			  {
   5166 			    raw = false;
   5167 			    break;
   5168 			  }
   5169 		      }
   5170 
   5171 		    if (ISDIGIT (p))
   5172 		      maybe_number_start = true;
   5173 		    else if (p == '.')
   5174 		      want_number = true;
   5175 		    else if (ISIDNUM (p))
   5176 		      maybe_number_start = false;
   5177 		    else if (p == '+' || p == '-')
   5178 		      {
   5179 			if (const unsigned char *peek_prev
   5180 			    = do_peek_prev (peek, lwm))
   5181 			  {
   5182 			    p = *peek_prev;
   5183 			    if (p == 'e' || p == 'E'
   5184 				|| p == 'p' || p == 'P')
   5185 			      {
   5186 				want_number = true;
   5187 				maybe_number_start = false;
   5188 			      }
   5189 			    else
   5190 			      break;
   5191 			  }
   5192 			else
   5193 			  break;
   5194 		      }
   5195 		    else if (p == '\'' || p == '\"')
   5196 		      {
   5197 			/* If this is lwm, this must be the end of a
   5198 			   previous string.  So this is a trailing
   5199 			   literal type, (a) if those are allowed,
   5200 			     and (b) maybe_start is false.  Otherwise
   5201 			     this must be a CPP_NUMBER because we've
   5202 			     met another ', and we'd have checked that
   5203 			     in its own right.  */
   5204 			if (peek == lwm && CPP_OPTION (pfile, uliterals))
   5205 			  {
   5206 			    if  (!maybe_number_start && !want_number)
   5207 			      /* Must be a literal type.  */
   5208 			      raw = false;
   5209 			  }
   5210 			else if (p == '\''
   5211 				 && CPP_OPTION (pfile, digit_separators))
   5212 			  maybe_number_start = true;
   5213 			break;
   5214 		      }
   5215 		    else if (c == '\'')
   5216 		      break;
   5217 		    else if (!quote_first && !quote_eight)
   5218 		      break;
   5219 		  }
   5220 
   5221 		if (maybe_number_start)
   5222 		  {
   5223 		    if (c == '\'')
   5224 		      /* A CPP NUMBER.  */
   5225 		      goto dflt;
   5226 		    raw = false;
   5227 		  }
   5228 
   5229 		goto delimited_string;
   5230 	      }
   5231 
   5232 	    delimited_string:
   5233 	      {
   5234 		/* (Possibly raw) string or char literal.  */
   5235 		unsigned char end = c;
   5236 		int delim_len = -1;
   5237 		const unsigned char *delim = NULL;
   5238 		location_t sloc = linemap_position_for_column (pfile->line_table,
   5239 							       pos - line_start);
   5240 		int esc = 0;
   5241 
   5242 		if (raw)
   5243 		  {
   5244 		    /* There can be no line breaks in the delimiter.  */
   5245 		    delim = pos;
   5246 		    for (delim_len = 0; (c = *pos++) != '('; delim_len++)
   5247 		      {
   5248 			if (delim_len == 16)
   5249 			  {
   5250 			    cpp_error_with_line (pfile, CPP_DL_ERROR,
   5251 						 sloc, 0,
   5252 						 "raw string delimiter"
   5253 						 " longer than %d"
   5254 						 " characters",
   5255 						 delim_len);
   5256 			    raw = false;
   5257 			    pos = delim;
   5258 			    break;
   5259 			  }
   5260 			if (strchr (") \\\t\v\f\n", c))
   5261 			  {
   5262 			    cpp_error_with_line (pfile, CPP_DL_ERROR,
   5263 						 sloc, 0,
   5264 						 "invalid character '%c'"
   5265 						 " in raw string"
   5266 						 " delimiter", c);
   5267 			    raw = false;
   5268 			    pos = delim;
   5269 			    break;
   5270 			  }
   5271 			if (pos >= limit)
   5272 			  goto bad_string;
   5273 		      }
   5274 		  }
   5275 
   5276 		while (pos < limit)
   5277 		  {
   5278 		    char c = *pos++;
   5279 		    switch (c)
   5280 		      {
   5281 		      case '\\':
   5282 			if (!raw)
   5283 			  esc++;
   5284 			break;
   5285 
   5286 		      case '\r':
   5287 			if (*pos == '\n')
   5288 			  pos++;
   5289 			/* FALLTHROUGH  */
   5290 
   5291 		      case '\n':
   5292 			{
   5293 			  CPP_INCREMENT_LINE (pfile, 0);
   5294 			  line_count++;
   5295 			  line_start = pos;
   5296 			}
   5297 			if (esc)
   5298 			  esc--;
   5299 			break;
   5300 
   5301 		      case ')':
   5302 			if (raw
   5303 			    && pos + delim_len + 1 < limit
   5304 			    && pos[delim_len] == end
   5305 			    && !memcmp (delim, pos, delim_len))
   5306 			  {
   5307 			    pos += delim_len + 1;
   5308 			    raw = false;
   5309 			    goto done_string;
   5310 			  }
   5311 			break;
   5312 
   5313 		      default:
   5314 			if (!raw && !(esc & 1) && c == end)
   5315 			  goto done_string;
   5316 			esc = 0;
   5317 			break;
   5318 		      }
   5319 		  }
   5320 	      bad_string:
   5321 		cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
   5322 				     "unterminated literal");
   5323 
   5324 	      done_string:
   5325 		raw = false;
   5326 		lwm = pos - 1;
   5327 	      }
   5328 	      goto dflt;
   5329 
   5330 	    case '_':
   5331 	    case 'e':
   5332 	    case 'i':
   5333 	    case 'm':
   5334 	      if (bol && module_p && !pfile->state.skipping
   5335 		  && do_peek_module (pfile, c, pos, limit))
   5336 		{
   5337 		  /* We've seen the start of a module control line.
   5338 		     Start up the tokenizer.  */
   5339 		  pos--; /* Backup over the first character.  */
   5340 
   5341 		  /* Backup over whitespace to start of line.  */
   5342 		  while (pos > line_start
   5343 			 && (pos[-1] == ' ' || pos[-1] == '\t'))
   5344 		    pos--;
   5345 
   5346 		  if (pos > base)
   5347 		    cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
   5348 
   5349 		  /* Prep things for directive handling. */
   5350 		  buffer->next_line = pos;
   5351 		  buffer->need_line = true;
   5352 
   5353 		  /* Now get tokens until the PRAGMA_EOL.  */
   5354 		  do
   5355 		    {
   5356 		      location_t spelling;
   5357 		      const cpp_token *tok
   5358 			= cpp_get_token_with_location (pfile, &spelling);
   5359 
   5360 		      gcc_assert (pfile->state.in_deferred_pragma
   5361 				  || tok->type == CPP_PRAGMA_EOL);
   5362 		      cb (pfile, CPP_DO_token, data, tok, spelling);
   5363 		    }
   5364 		  while (pfile->state.in_deferred_pragma);
   5365 
   5366 		  if (pfile->buffer->next_line < pfile->buffer->rlimit)
   5367 		    cb (pfile, CPP_DO_location, data,
   5368 			pfile->line_table->highest_line);
   5369 
   5370 		  pfile->mi_valid = false;
   5371 		  goto restart;
   5372 		}
   5373 	      goto dflt;
   5374 
   5375 	    default:
   5376 	    dflt:
   5377 	      bol = false;
   5378 	      pfile->mi_valid = false;
   5379 	      break;
   5380 	    }
   5381 	}
   5382 
   5383       if (buffer->rlimit > base && !pfile->state.skipping)
   5384 	{
   5385 	  const unsigned char *limit = buffer->rlimit;
   5386 	  /* If the file was not newline terminated, add rlimit, which is
   5387 	     guaranteed to point to a newline, to the end of our range.  */
   5388 	  if (limit[-1] != '\n')
   5389 	    {
   5390 	      limit++;
   5391 	      CPP_INCREMENT_LINE (pfile, 0);
   5392 	      line_count++;
   5393 	    }
   5394 	  cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
   5395 	}
   5396 
   5397       _cpp_pop_buffer (pfile);
   5398     }
   5399   while (pfile->buffer);
   5400 }
   5401