Home | History | Annotate | Line # | Download | only in libcpp
lex.cc revision 1.1.1.1
      1 /* CPP Library - lexical analysis.
      2    Copyright (C) 2000-2022 Free Software Foundation, Inc.
      3    Contributed by Per Bothner, 1994-95.
      4    Based on CCCP program by Paul Rubin, June 1986
      5    Adapted to ANSI C, Richard Stallman, Jan 1987
      6    Broken out to separate file, Zack Weinberg, Mar 2000
      7 
      8 This program is free software; you can redistribute it and/or modify it
      9 under the terms of the GNU General Public License as published by the
     10 Free Software Foundation; either version 3, or (at your option) any
     11 later version.
     12 
     13 This program is distributed in the hope that it will be useful,
     14 but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     16 GNU General Public License for more details.
     17 
     18 You should have received a copy of the GNU General Public License
     19 along with this program; see the file COPYING3.  If not see
     20 <http://www.gnu.org/licenses/>.  */
     21 
     22 #include "config.h"
     23 #include "system.h"
     24 #include "cpplib.h"
     25 #include "internal.h"
     26 
/* Spelling class of a token type, stored in the `category' field of
   struct token_spelling.  The OP() table macro always produces
   SPELL_OPERATOR; the TK() table macro pastes its second argument onto
   "SPELL_" to select one of the enumerators.  */
enum spell_type
{
  SPELL_OPERATOR,	/* First enumerator, hence value 0.  */
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
     34 
     35 struct token_spelling
     36 {
     37   enum spell_type category;
     38   const unsigned char *name;
     39 };
     40 
     41 static const unsigned char *const digraph_spellings[] =
     42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
     43 
     44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
     45 #define TK(e, s) { SPELL_ ## s,    UC #e },
     46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
     47 #undef OP
     48 #undef TK
     49 
     50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
     51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
     52 
     53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
     54 static int skip_line_comment (cpp_reader *);
     55 static void skip_whitespace (cpp_reader *, cppchar_t);
     56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
     57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
     58 static void store_comment (cpp_reader *, cpp_token *);
     59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
     60 			    unsigned int, enum cpp_ttype);
     61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
     62 static int name_p (cpp_reader *, const cpp_string *);
     63 static tokenrun *next_tokenrun (tokenrun *);
     64 
     65 static _cpp_buff *new_buff (size_t);
     66 
     67 
     68 /* Utility routine:
     69 
     70    Compares, the token TOKEN to the NUL-terminated string STRING.
     71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
     72 int
     73 cpp_ideq (const cpp_token *token, const char *string)
     74 {
     75   if (token->type != CPP_NAME)
     76     return 0;
     77 
     78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
     79 }
     80 
     81 /* Record a note TYPE at byte POS into the current cleaned logical
     82    line.  */
     83 static void
     84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
     85 {
     86   if (buffer->notes_used == buffer->notes_cap)
     87     {
     88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
     89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
     90                                   buffer->notes_cap);
     91     }
     92 
     93   buffer->notes[buffer->notes_used].pos = pos;
     94   buffer->notes[buffer->notes_used].type = type;
     95   buffer->notes_used++;
     96 }
     97 
     98 
     99 /* Fast path to find line special characters using optimized character
    101    scanning algorithms.  Anything complicated falls back to the slow
    102    path below.  Since this loop is very hot it's worth doing these kinds
    103    of optimizations.
    104 
    105    One of the paths through the ifdefs should provide
    106 
    107      const uchar *search_line_fast (const uchar *s, const uchar *end);
    108 
    109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
    110    the found character.
    111 
    112    Note that the last character of the buffer is *always* a newline,
    113    as forced by _cpp_convert_input.  This fact can be used to avoid
    114    explicitly looking for the end of the buffer.  */
    115 
/* Configure only gives us an ifdef test; make sure the macro always
   has a value so it can be used in ordinary `if' conditions.  */
#if !defined (WORDS_BIGENDIAN)
# define WORDS_BIGENDIAN 0
#endif

/* We want the largest integer that fits into a register, and nothing
   in <stdint.h> provides that.  On most hosts it is unsigned long, but
   the LLP64 model chosen by MS breaks that assumption.  When building
   with GCC we can ask for the "real" word size directly.  */
#if defined (__GNUC__)
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below only copes with 4- or 8-byte words.  The array bound
   becomes negative, and compilation fails, for any other size.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 4 || sizeof (word_type) == 8) ? 1 : -1];
    135 
    136 /* Return X with the first N bytes forced to values that won't match one
    137    of the interesting characters.  Note that NUL is not interesting.  */
    138 
    139 static inline word_type
    140 acc_char_mask_misalign (word_type val, unsigned int n)
    141 {
    142   word_type mask = -1;
    143   if (WORDS_BIGENDIAN)
    144     mask >>= n * 8;
    145   else
    146     mask <<= n * 8;
    147   return val & mask;
    148 }
    149 
    150 /* Return X replicated to all byte positions within WORD_TYPE.  */
    151 
    152 static inline word_type
    153 acc_char_replicate (uchar x)
    154 {
    155   word_type ret;
    156 
    157   ret = (x << 24) | (x << 16) | (x << 8) | x;
    158   if (sizeof(word_type) == 8)
    159     ret = (ret << 16 << 16) | ret;
    160   return ret;
    161 }
    162 
    163 /* Return non-zero if some byte of VAL is (probably) C.  */
    164 
    165 static inline word_type
    166 acc_char_cmp (word_type val, word_type c)
    167 {
    168 #if defined(__GNUC__) && defined(__alpha__)
    169   /* We can get exact results using a compare-bytes instruction.
    170      Get (val == c) via (0 >= (val ^ c)).  */
    171   return __builtin_alpha_cmpbge (0, val ^ c);
    172 #else
    173   word_type magic = 0x7efefefeU;
    174   if (sizeof(word_type) == 8)
    175     magic = (magic << 16 << 16) | 0xfefefefeU;
    176   magic |= 1;
    177 
    178   val ^= c;
    179   return ((val + magic) ^ ~val) & ~magic;
    180 #endif
    181 }
    182 
    183 /* Given the result of acc_char_cmp is non-zero, return the index of
    184    the found character.  If this was a false positive, return -1.  */
    185 
    186 static inline int
    187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
    188 		word_type val ATTRIBUTE_UNUSED)
    189 {
    190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
    191   /* The cmpbge instruction sets *bits* of the result corresponding to
    192      matches in the bytes with no false positives.  */
    193   return __builtin_ctzl (cmp);
    194 #else
    195   unsigned int i;
    196 
    197   /* ??? It would be nice to force unrolling here,
    198      and have all of these constants folded.  */
    199   for (i = 0; i < sizeof(word_type); ++i)
    200     {
    201       uchar c;
    202       if (WORDS_BIGENDIAN)
    203 	c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
    204       else
    205 	c = (val >> i * 8) & 0xff;
    206 
    207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
    208 	return i;
    209     }
    210 
    211   return -1;
    212 #endif
    213 }
    214 
    215 /* A version of the fast scanner using bit fiddling techniques.
    216 
    217    For 32-bit words, one would normally perform 16 comparisons and
    218    16 branches.  With this algorithm one performs 24 arithmetic
    219    operations and one branch.  Whether this is faster with a 32-bit
    220    word size is going to be somewhat system dependent.
    221 
    222    For 64-bit words, we eliminate twice the number of comparisons
    223    and branches without increasing the number of arithmetic operations.
    224    It's almost certainly going to be a win with 64-bit word size.  */
    225 
    226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
    227   ATTRIBUTE_UNUSED;
    228 
    229 static const uchar *
    230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    231 {
    232   const word_type repl_nl = acc_char_replicate ('\n');
    233   const word_type repl_cr = acc_char_replicate ('\r');
    234   const word_type repl_bs = acc_char_replicate ('\\');
    235   const word_type repl_qm = acc_char_replicate ('?');
    236 
    237   unsigned int misalign;
    238   const word_type *p;
    239   word_type val, t;
    240 
    241   /* Align the buffer.  Mask out any bytes from before the beginning.  */
    242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
    243   val = *p;
    244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
    245   if (misalign)
    246     val = acc_char_mask_misalign (val, misalign);
    247 
    248   /* Main loop.  */
    249   while (1)
    250     {
    251       t  = acc_char_cmp (val, repl_nl);
    252       t |= acc_char_cmp (val, repl_cr);
    253       t |= acc_char_cmp (val, repl_bs);
    254       t |= acc_char_cmp (val, repl_qm);
    255 
    256       if (__builtin_expect (t != 0, 0))
    257 	{
    258 	  int i = acc_char_index (t, val);
    259 	  if (i >= 0)
    260 	    return (const uchar *)p + i;
    261 	}
    262 
    263       val = *++p;
    264     }
    265 }
    266 
    267 /* Disable on Solaris 2/x86 until the following problem can be properly
    268    autoconfed:
    269 
    270    The Solaris 10+ assembler tags objects with the instruction set
    271    extensions used, so SSE4.2 executables cannot run on machines that
    272    don't support that extension.  */
    273 
    274 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
    275 
/* Replicated character data to be shared between implementations:
   row 0 is sixteen newlines, row 1 carriage returns, row 2 backslashes
   and row 3 question marks.  Recall that outside of a context with
   vector support we can't define compatible vector types, therefore
   these are all defined in terms of raw characters.  */
#define REPLICATE_16(c) \
  { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPLICATE_16 ('\n'),
  REPLICATE_16 ('\r'),
  REPLICATE_16 ('\\'),
  REPLICATE_16 ('?'),
};
#undef REPLICATE_16
    290 
    291 /* A version of the fast scanner using MMX vectorized byte compare insns.
    292 
    293    This uses the PMOVMSKB instruction which was introduced with "MMX2",
    294    which was packaged into SSE1; it is also present in the AMD MMX
    295    extension.  Mark the function as using "sse" so that we emit a real
    296    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
    297 
    298 static const uchar *
    299 #ifndef __SSE__
    300 __attribute__((__target__("sse")))
    301 #endif
    302 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    303 {
    304   typedef char v8qi __attribute__ ((__vector_size__ (8)));
    305   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
    306 
    307   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
    308   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
    309   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
    310   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
    311 
    312   unsigned int misalign, found, mask;
    313   const v8qi *p;
    314   v8qi data, t, c;
    315 
    316   /* Align the source pointer.  While MMX doesn't generate unaligned data
    317      faults, this allows us to safely scan to the end of the buffer without
    318      reading beyond the end of the last page.  */
    319   misalign = (uintptr_t)s & 7;
    320   p = (const v8qi *)((uintptr_t)s & -8);
    321   data = *p;
    322 
    323   /* Create a mask for the bytes that are valid within the first
    324      16-byte block.  The Idea here is that the AND with the mask
    325      within the loop is "free", since we need some AND or TEST
    326      insn in order to set the flags for the branch anyway.  */
    327   mask = -1u << misalign;
    328 
    329   /* Main loop processing 8 bytes at a time.  */
    330   goto start;
    331   do
    332     {
    333       data = *++p;
    334       mask = -1;
    335 
    336     start:
    337       t = __builtin_ia32_pcmpeqb(data, repl_nl);
    338       c = __builtin_ia32_pcmpeqb(data, repl_cr);
    339       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
    340       c = __builtin_ia32_pcmpeqb(data, repl_bs);
    341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
    342       c = __builtin_ia32_pcmpeqb(data, repl_qm);
    343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
    344       found = __builtin_ia32_pmovmskb (t);
    345       found &= mask;
    346     }
    347   while (!found);
    348 
    349   __builtin_ia32_emms ();
    350 
    351   /* FOUND contains 1 in bits for which we matched a relevant
    352      character.  Conversion to the byte index is trivial.  */
    353   found = __builtin_ctz(found);
    354   return (const uchar *)p + found;
    355 }
    356 
    357 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
    358 
    359 static const uchar *
    360 #ifndef __SSE2__
    361 __attribute__((__target__("sse2")))
    362 #endif
    363 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    364 {
    365   typedef char v16qi __attribute__ ((__vector_size__ (16)));
    366 
    367   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
    368   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
    369   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
    370   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
    371 
    372   unsigned int misalign, found, mask;
    373   const v16qi *p;
    374   v16qi data, t;
    375 
    376   /* Align the source pointer.  */
    377   misalign = (uintptr_t)s & 15;
    378   p = (const v16qi *)((uintptr_t)s & -16);
    379   data = *p;
    380 
    381   /* Create a mask for the bytes that are valid within the first
    382      16-byte block.  The Idea here is that the AND with the mask
    383      within the loop is "free", since we need some AND or TEST
    384      insn in order to set the flags for the branch anyway.  */
    385   mask = -1u << misalign;
    386 
    387   /* Main loop processing 16 bytes at a time.  */
    388   goto start;
    389   do
    390     {
    391       data = *++p;
    392       mask = -1;
    393 
    394     start:
    395       t  = data == repl_nl;
    396       t |= data == repl_cr;
    397       t |= data == repl_bs;
    398       t |= data == repl_qm;
    399       found = __builtin_ia32_pmovmskb128 (t);
    400       found &= mask;
    401     }
    402   while (!found);
    403 
    404   /* FOUND contains 1 in bits for which we matched a relevant
    405      character.  Conversion to the byte index is trivial.  */
    406   found = __builtin_ctz(found);
    407   return (const uchar *)p + found;
    408 }
    409 
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is only consulted to decide whether an unaligned 16-byte read at S
   is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  /* The four characters to look for; PCMPESTRI is told that only the
     first four elements are significant.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  const uintptr_t addr = (uintptr_t) s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if ((addr & 15) != 0)
    {
      v16qi head;

      /* There are less than 16 bytes left in the buffer, and less
         than 16 bytes left on the page.  Reading 16 bytes at this
         point might generate a spurious page fault.  Defer to the
         SSE2 implementation, which already handles alignment.  */
      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((addr & 0xfff) > 0xff0, 0))
        return search_line_sse2 (s, end);

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      head = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, head, 16, 0);

      /* An index below 16 is a hit within the unaligned block.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *) ((addr + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  for (;;)
    {
      char carry;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(carry)
             : "m"(*s), "x"(search), "a"(4), "d"(16));

      /* The carry flag output ends the scan; INDEX then holds the
         offset of the hit within the block.  */
      if (carry)
        break;

      s += 16;
    }
#else
  /* The assembly loop advances S before each compare, so step back
     one block first.  */
  s -= 16;

  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
        "0:	add $16, %1\n"
        "	%vpcmpestri\t$0, (%1), %2\n"
        "	jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
    490 
    491 /* Check the CPU capabilities.  */
    492 
    493 #include "../gcc/config/i386/cpuid.h"
    494 
    495 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
    496 static search_line_fast_type search_line_fast;
    497 
    498 #define HAVE_init_vectorized_lexer 1
    499 static inline void
    500 init_vectorized_lexer (void)
    501 {
    502   unsigned dummy, ecx = 0, edx = 0;
    503   search_line_fast_type impl = search_line_acc_char;
    504   int minimum = 0;
    505 
    506 #if defined(__SSE4_2__)
    507   minimum = 3;
    508 #elif defined(__SSE2__)
    509   minimum = 2;
    510 #elif defined(__SSE__)
    511   minimum = 1;
    512 #endif
    513 
    514   if (minimum == 3)
    515     impl = search_line_sse42;
    516   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
    517     {
    518       if (minimum == 3 || (ecx & bit_SSE4_2))
    519         impl = search_line_sse42;
    520       else if (minimum == 2 || (edx & bit_SSE2))
    521 	impl = search_line_sse2;
    522       else if (minimum == 1 || (edx & bit_SSE))
    523 	impl = search_line_mmx;
    524     }
    525   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
    526     {
    527       if (minimum == 1
    528 	  || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
    529 	impl = search_line_mmx;
    530     }
    531 
    532   search_line_fast = impl;
    533 }
    534 
    535 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
    536 
    537 /* A vection of the fast scanner using AltiVec vectorized byte compares
    538    and VSX unaligned loads (when VSX is available).  This is otherwise
    539    the same as the AltiVec version.  */
    540 
    541 ATTRIBUTE_NO_SANITIZE_UNDEFINED
    542 static const uchar *
    543 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    544 {
    545   typedef __attribute__((altivec(vector))) unsigned char vc;
    546 
    547   const vc repl_nl = {
    548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    549     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
    550   };
    551   const vc repl_cr = {
    552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    553     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
    554   };
    555   const vc repl_bs = {
    556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    557     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
    558   };
    559   const vc repl_qm = {
    560     '?', '?', '?', '?', '?', '?', '?', '?',
    561     '?', '?', '?', '?', '?', '?', '?', '?',
    562   };
    563   const vc zero = { 0 };
    564 
    565   vc data, t;
    566 
    567   /* Main loop processing 16 bytes at a time.  */
    568   do
    569     {
    570       vc m_nl, m_cr, m_bs, m_qm;
    571 
    572       data = __builtin_vec_vsx_ld (0, s);
    573       s += 16;
    574 
    575       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
    576       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
    577       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
    578       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
    579       t = (m_nl | m_cr) | (m_bs | m_qm);
    580 
    581       /* T now contains 0xff in bytes for which we matched one of the relevant
    582 	 characters.  We want to exit the loop if any byte in T is non-zero.
    583 	 Below is the expansion of vec_any_ne(t, zero).  */
    584     }
    585   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
    586 
    587   /* Restore s to to point to the 16 bytes we just processed.  */
    588   s -= 16;
    589 
    590   {
    591 #define N  (sizeof(vc) / sizeof(long))
    592 
    593     union {
    594       vc v;
    595       /* Statically assert that N is 2 or 4.  */
    596       unsigned long l[(N == 2 || N == 4) ? N : -1];
    597     } u;
    598     unsigned long l, i = 0;
    599 
    600     u.v = t;
    601 
    602     /* Find the first word of T that is non-zero.  */
    603     switch (N)
    604       {
    605       case 4:
    606 	l = u.l[i++];
    607 	if (l != 0)
    608 	  break;
    609 	s += sizeof(unsigned long);
    610 	l = u.l[i++];
    611 	if (l != 0)
    612 	  break;
    613 	s += sizeof(unsigned long);
    614 	/* FALLTHRU */
    615       case 2:
    616 	l = u.l[i++];
    617 	if (l != 0)
    618 	  break;
    619 	s += sizeof(unsigned long);
    620 	l = u.l[i];
    621       }
    622 
    623     /* L now contains 0xff in bytes for which we matched one of the
    624        relevant characters.  We can find the byte index by finding
    625        its bit index and dividing by 8.  */
    626 #ifdef __BIG_ENDIAN__
    627     l = __builtin_clzl(l) >> 3;
    628 #else
    629     l = __builtin_ctzl(l) >> 3;
    630 #endif
    631     return s + l;
    632 
    633 #undef N
    634   }
    635 }
    636 
    637 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
    638 
    639 /* A vection of the fast scanner using AltiVec vectorized byte compares.
    640    This cannot be used for little endian because vec_lvsl/lvsr are
    641    deprecated for little endian and the code won't work properly.  */
    642 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
    643    so we can't compile this function without -maltivec on the command line
    644    (or implied by some other switch).  */
    645 
    646 static const uchar *
    647 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    648 {
    649   typedef __attribute__((altivec(vector))) unsigned char vc;
    650 
    651   const vc repl_nl = {
    652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    653     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
    654   };
    655   const vc repl_cr = {
    656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    657     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
    658   };
    659   const vc repl_bs = {
    660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    661     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
    662   };
    663   const vc repl_qm = {
    664     '?', '?', '?', '?', '?', '?', '?', '?',
    665     '?', '?', '?', '?', '?', '?', '?', '?',
    666   };
    667   const vc ones = {
    668     -1, -1, -1, -1, -1, -1, -1, -1,
    669     -1, -1, -1, -1, -1, -1, -1, -1,
    670   };
    671   const vc zero = { 0 };
    672 
    673   vc data, mask, t;
    674 
    675   /* Altivec loads automatically mask addresses with -16.  This lets us
    676      issue the first load as early as possible.  */
    677   data = __builtin_vec_ld(0, (const vc *)s);
    678 
    679   /* Discard bytes before the beginning of the buffer.  Do this by
    680      beginning with all ones and shifting in zeros according to the
    681      mis-alignment.  The LVSR instruction pulls the exact shift we
    682      want from the address.  */
    683   mask = __builtin_vec_lvsr(0, s);
    684   mask = __builtin_vec_perm(zero, ones, mask);
    685   data &= mask;
    686 
    687   /* While altivec loads mask addresses, we still need to align S so
    688      that the offset we compute at the end is correct.  */
    689   s = (const uchar *)((uintptr_t)s & -16);
    690 
    691   /* Main loop processing 16 bytes at a time.  */
    692   goto start;
    693   do
    694     {
    695       vc m_nl, m_cr, m_bs, m_qm;
    696 
    697       s += 16;
    698       data = __builtin_vec_ld(0, (const vc *)s);
    699 
    700     start:
    701       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
    702       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
    703       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
    704       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
    705       t = (m_nl | m_cr) | (m_bs | m_qm);
    706 
    707       /* T now contains 0xff in bytes for which we matched one of the relevant
    708 	 characters.  We want to exit the loop if any byte in T is non-zero.
    709 	 Below is the expansion of vec_any_ne(t, zero).  */
    710     }
    711   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
    712 
    713   {
    714 #define N  (sizeof(vc) / sizeof(long))
    715 
    716     union {
    717       vc v;
    718       /* Statically assert that N is 2 or 4.  */
    719       unsigned long l[(N == 2 || N == 4) ? N : -1];
    720     } u;
    721     unsigned long l, i = 0;
    722 
    723     u.v = t;
    724 
    725     /* Find the first word of T that is non-zero.  */
    726     switch (N)
    727       {
    728       case 4:
    729 	l = u.l[i++];
    730 	if (l != 0)
    731 	  break;
    732 	s += sizeof(unsigned long);
    733 	l = u.l[i++];
    734 	if (l != 0)
    735 	  break;
    736 	s += sizeof(unsigned long);
    737 	/* FALLTHROUGH */
    738       case 2:
    739 	l = u.l[i++];
    740 	if (l != 0)
    741 	  break;
    742 	s += sizeof(unsigned long);
    743 	l = u.l[i];
    744       }
    745 
    746     /* L now contains 0xff in bytes for which we matched one of the
    747        relevant characters.  We can find the byte index by finding
    748        its bit index and dividing by 8.  */
    749     l = __builtin_clzl(l) >> 3;
    750     return s + l;
    751 
    752 #undef N
    753   }
    754 }
    755 
    756 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
    757 #include "arm_neon.h"
    758 
    759 /* This doesn't have to be the exact page size, but no system may use
    760    a size smaller than this.  ARMv8 requires a minimum page size of
    761    4k.  The impact of being conservative here is a small number of
    762    cases will take the slightly slower entry path into the main
    763    loop.  */
    764 
    765 #define AARCH64_MIN_PAGE_SIZE 4096
    766 
    767 static const uchar *
    768 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    769 {
    770   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
    771   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
    772   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
    773   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
    774   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
    775 
    776 #ifdef __ARM_BIG_ENDIAN
    777   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
    778 #else
    779   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
    780 #endif
    781 
    782   unsigned int found;
    783   const uint8_t *p;
    784   uint8x16_t data;
    785   uint8x16_t t;
    786   uint16x8_t m;
    787   uint8x16_t u, v, w;
    788 
    789   /* Align the source pointer.  */
    790   p = (const uint8_t *)((uintptr_t)s & -16);
    791 
    792   /* Assuming random string start positions, with a 4k page size we'll take
    793      the slow path about 0.37% of the time.  */
    794   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
    795 			 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
    796 			< 16, 0))
    797     {
    798       /* Slow path: the string starts near a possible page boundary.  */
    799       uint32_t misalign, mask;
    800 
    801       misalign = (uintptr_t)s & 15;
    802       mask = (-1u << misalign) & 0xffff;
    803       data = vld1q_u8 (p);
    804       t = vceqq_u8 (data, repl_nl);
    805       u = vceqq_u8 (data, repl_cr);
    806       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
    807       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
    808       t = vorrq_u8 (v, w);
    809       t = vandq_u8 (t, xmask);
    810       m = vpaddlq_u8 (t);
    811       m = vshlq_u16 (m, shift);
    812       found = vaddvq_u16 (m);
    813       found &= mask;
    814       if (found)
    815 	return (const uchar*)p + __builtin_ctz (found);
    816     }
    817   else
    818     {
    819       data = vld1q_u8 ((const uint8_t *) s);
    820       t = vceqq_u8 (data, repl_nl);
    821       u = vceqq_u8 (data, repl_cr);
    822       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
    823       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
    824       t = vorrq_u8 (v, w);
    825       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
    826 	goto done;
    827     }
    828 
    829   do
    830     {
    831       p += 16;
    832       data = vld1q_u8 (p);
    833       t = vceqq_u8 (data, repl_nl);
    834       u = vceqq_u8 (data, repl_cr);
    835       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
    836       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
    837       t = vorrq_u8 (v, w);
    838     } while (!vpaddd_u64 ((uint64x2_t)t));
    839 
    840 done:
    841   /* Now that we've found the terminating substring, work out precisely where
    842      we need to stop.  */
    843   t = vandq_u8 (t, xmask);
    844   m = vpaddlq_u8 (t);
    845   m = vshlq_u16 (m, shift);
    846   found = vaddvq_u16 (m);
    847   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
    848 	  + __builtin_ctz (found));
    849 }
    850 
    851 #elif defined (__ARM_NEON)
    852 #include "arm_neon.h"
    853 
        /* ARM NEON variant of the line scanner.  Starting at S, return a pointer
           to the first byte that is one of '\n', '\r', '\\' or '?'.

           The input is examined 16 bytes at a time through 16-byte-aligned loads.
           The first load is from S rounded down to a 16-byte boundary, so it may
           cover bytes that precede S; those are discarded with MASK.  END is not
           consulted: the loop only stops once one of the four bytes is seen.
           NOTE(review): this presumably relies on the caller guaranteeing that
           such a byte exists in the buffer -- confirm against the buffer setup.  */
    854 static const uchar *
    855 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    856 {
        /* One vector per searched-for byte, replicated into all 16 lanes.  */
    857   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
    858   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
    859   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
    860   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
        /* Byte I of each 8-byte half holds the single bit 1 << I.  ANDing a
           compare result (0x00 or 0xff per byte) with this leaves a distinct
           bit per matching byte, so the pairwise adds below cannot carry.  */
    861   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
    862 
    863   unsigned int misalign, found, mask;
    864   const uint8_t *p;
    865   uint8x16_t data;
    866 
    867   /* Align the source pointer.  */
    868   misalign = (uintptr_t)s & 15;
    869   p = (const uint8_t *)((uintptr_t)s & -16);
    870   data = vld1q_u8 (p);
    871 
    872   /* Create a mask for the bytes that are valid within the first
    873      16-byte block.  The Idea here is that the AND with the mask
    874      within the loop is "free", since we need some AND or TEST
    875      insn in order to set the flags for the branch anyway.  */
    876   mask = (-1u << misalign) & 0xffff;
    877 
    878   /* Main loop, processing 16 bytes at a time.  */
        /* Enter the loop body at START so that the block loaded above is
           tested with the partial MASK; later iterations load the next block
           and use the full 16-bit mask.  */
    879   goto start;
    880 
    881   do
    882     {
    883       uint8x8_t l;
    884       uint16x4_t m;
    885       uint32x2_t n;
    886       uint8x16_t t, u, v, w;
    887 
    888       p += 16;
    889       data = vld1q_u8 (p);
    890       mask = 0xffff;
    891 
    892     start:
              /* Compare against all four bytes and OR the results together.  */
    893       t = vceqq_u8 (data, repl_nl);
    894       u = vceqq_u8 (data, repl_cr);
    895       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
    896       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
    897       t = vandq_u8 (vorrq_u8 (v, w), xmask);
              /* Reduce by pairwise addition: lane 0 of N ends up holding the
                 8-bit match mask of bytes 0-7 and lane 1 that of bytes 8-15.  */
    898       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
    899       m = vpaddl_u8 (l);
    900       n = vpaddl_u16 (m);
    901 
              /* Viewing N as one 64-bit value, the shift by 24 moves lane 1 to
                 bits 8-15; ORing with the unshifted value and taking the low
                 32 bits gives a mask in which bit I is set iff byte I matched.  */
    902       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
    903 	      vshr_n_u64 ((uint64x1_t) n, 24)), 0);
    904       found &= mask;
    905     }
    906   while (!found);
    907 
    908   /* FOUND contains 1 in bits for which we matched a relevant
    909      character.  Conversion to the byte index is trivial.  */
    910   found = __builtin_ctz (found);
    911   return (const uchar *)p + found;
    912 }
    913 
    914 #else
    915 
    916 /* We only have one accelerated alternative.  Use a direct call so that
    917    we encourage inlining.  */
    918 
    919 #define search_line_fast  search_line_acc_char
    920 
    921 #endif
    922 
/* One-time lexer initialisation.  When the configuration defines
   HAVE_init_vectorized_lexer this calls init_vectorized_lexer ();
   otherwise the function does nothing.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
    932 
    933 /* Returns with a logical line that contains no escaped newlines or
    934    trigraphs.  This is a time-critical inner loop.

           The line to clean starts at buffer->next_line.  On entry the note
           list is reset and buffer->cur / buffer->line_base are pointed at the
           start of the line.  On exit the cleaned line is terminated by a
           '\n' stored at D, a sentinel note is appended, and
           buffer->next_line is set to the byte after the last one consumed.

           Unless buffer->from_stage3 is set, escaped newlines are spliced out
           in place and trigraphs are noted (and replaced when the trigraphs
           option is on).  Nothing is written to the buffer until the first
           such construct is met.  */
    935 void
    936 _cpp_clean_line (cpp_reader *pfile)
    937 {
    938   cpp_buffer *buffer;
          /* S is the read cursor, D the write cursor, P a scratch pointer used
             when scanning backwards from a newline; C is the byte at S.  */
    939   const uchar *s;
    940   uchar c, *d, *p;
    941 
    942   buffer = pfile->buffer;
    943   buffer->cur_note = buffer->notes_used = 0;
    944   buffer->cur = buffer->line_base = buffer->next_line;
    945   buffer->need_line = false;
    946   s = buffer->next_line;
    947 
    948   if (!buffer->from_stage3)
    949     {
              /* Position of the most recent backslash seen on the fast path,
                 or NULL if there has been none.  */
    950       const uchar *pbackslash = NULL;
    951 
    952       /* Fast path.  This is the common case of an un-escaped line with
    953 	 no trigraphs.  The primary win here is by not writing any
    954 	 data back to memory until we have to.  */
    955       while (1)
    956 	{
    957 	  /* Perform an optimized search for \n, \r, \\, ?.  */
    958 	  s = search_line_fast (s, buffer->rlimit);
    959 
    960 	  c = *s;
    961 	  if (c == '\\')
    962 	    {
    963 	      /* Record the location of the backslash and continue.  */
    964 	      pbackslash = s++;
    965 	    }
    966 	  else if (__builtin_expect (c == '?', 0))
    967 	    {
    968 	      if (__builtin_expect (s[1] == '?', false)
    969 		   && _cpp_trigraph_map[s[2]])
    970 		{
    971 		  /* Have a trigraph.  We may or may not have to convert
    972 		     it.  Add a line note regardless, for -Wtrigraphs.  */
    973 		  add_line_note (buffer, s, s[2]);
    974 		  if (CPP_OPTION (pfile, trigraphs))
    975 		    {
    976 		      /* We do, and that means we have to switch to the
    977 		         slow path.  */
                              /* Overwrite the first '?' with the replacement
                                 character and leave S on the last byte of the
                                 trigraph; the slow path pre-increments both
                                 cursors before copying.  */
    978 		      d = (uchar *) s;
    979 		      *d = _cpp_trigraph_map[s[2]];
    980 		      s += 2;
    981 		      goto slow_path;
    982 		    }
    983 		}
    984 	      /* Not a trigraph.  Continue on fast-path.  */
    985 	      s++;
    986 	    }
    987 	  else
    988 	    break;
    989 	}
    990 
    991       /* This must be \r or \n.  We're either done, or we'll be forced
    992 	 to write back to the buffer and continue on the slow path.  */
    993       d = (uchar *) s;
    994 
              /* The terminator is at buffer->rlimit: finish without looking
                 at s[1].  */
    995       if (__builtin_expect (s == buffer->rlimit, false))
    996 	goto done;
    997 
    998       /* DOS line ending? */
    999       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
   1000 	{
   1001 	  s++;
   1002 	  if (s == buffer->rlimit)
   1003 	    goto done;
   1004 	}
   1005 
              /* No backslash anywhere on this line, so the newline cannot be
                 escaped.  */
   1006       if (__builtin_expect (pbackslash == NULL, true))
   1007 	goto done;
   1008 
   1009       /* Check for escaped newline.  */
              /* Walk back over non-vertical whitespace before the terminator.
                 There is no lower bound test here, unlike in the slow path;
                 PBACKSLASH is non-NULL, so a backslash lies earlier on this
                 line (presumably not classified by is_nvspace -- confirm),
                 which stops the scan.  */
   1010       p = d;
   1011       while (is_nvspace (p[-1]))
   1012 	p--;
   1013       if (p - 1 != pbackslash)
   1014 	goto done;
   1015 
   1016       /* Have an escaped newline; process it and proceed to
   1017 	 the slow path.  */
              /* The note type is ' ' when whitespace separated the backslash
                 from the newline, '\\' otherwise.  */
   1018       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
              /* D is placed so that the next "*++d" overwrites the backslash.
                 buffer->next_line is reused as the lower bound for the
                 backward scans in the slow path; it receives its final value
                 at DONE.  */
   1019       d = p - 2;
   1020       buffer->next_line = p - 1;
   1021 
   1022     slow_path:
              /* Slow path: copy bytes from S down to D, splicing out escaped
                 newlines and handling trigraphs as they are copied.  */
   1023       while (1)
   1024 	{
   1025 	  c = *++s;
   1026 	  *++d = c;
   1027 
   1028 	  if (c == '\n' || c == '\r')
   1029 	    {
   1030 	      /* Handle DOS line endings.  */
   1031 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
   1032 		s++;
   1033 	      if (s == buffer->rlimit)
   1034 		break;
   1035 
   1036 	      /* Escaped?  */
   1037 	      p = d;
   1038 	      while (p != buffer->next_line && is_nvspace (p[-1]))
   1039 		p--;
   1040 	      if (p == buffer->next_line || p[-1] != '\\')
   1041 		break;
   1042 
   1043 	      add_line_note (buffer, p - 1, p != d ? ' ': '\\');
   1044 	      d = p - 2;
   1045 	      buffer->next_line = p - 1;
   1046 	    }
   1047 	  else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
   1048 	    {
   1049 	      /* Add a note regardless, for the benefit of -Wtrigraphs.  */
   1050 	      add_line_note (buffer, d, s[2]);
   1051 	      if (CPP_OPTION (pfile, trigraphs))
   1052 		{
                          /* Replace the '?' just copied and skip the other two
                             bytes of the trigraph.  */
   1053 		  *d = _cpp_trigraph_map[s[2]];
   1054 		  s += 2;
   1055 		}
   1056 	    }
   1057 	}
   1058     }
   1059   else
   1060     {
              /* from_stage3 buffer: no trigraph or escaped-newline handling is
                 done; just locate the end of the line.  */
   1061       while (*s != '\n' && *s != '\r')
   1062 	s++;
   1063       d = (uchar *) s;
   1064 
   1065       /* Handle DOS line endings.  */
   1066       if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
   1067 	s++;
   1068     }
   1069 
   1070  done:
          /* Terminate the cleaned line with a plain newline.  */
   1071   *d = '\n';
   1072   /* A sentinel note that should never be processed.  */
   1073   add_line_note (buffer, d + 1, '\n');
   1074   buffer->next_line = s + 1;
   1075 }
   1076 
   1077 /* Return true if the trigraph indicated by NOTE should be warned
   1078    about in a comment.  */
   1079 static bool
   1080 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
   1081 {
   1082   const uchar *p;
   1083 
   1084   /* Within comments we don't warn about trigraphs, unless the
   1085      trigraph forms an escaped newline, as that may change
   1086      behavior.  */
   1087   if (note->type != '/')
   1088     return false;
   1089 
   1090   /* If -trigraphs, then this was an escaped newline iff the next note
   1091      is coincident.  */
   1092   if (CPP_OPTION (pfile, trigraphs))
   1093     return note[1].pos == note->pos;
   1094 
   1095   /* Otherwise, see if this forms an escaped newline.  */
   1096   p = note->pos + 3;
   1097   while (is_nvspace (*p))
   1098     p++;
   1099 
   1100   /* There might have been escaped newlines between the trigraph and the
   1101      newline we found.  Hence the position test.  */
   1102   return (*p == '\n' && p < note[1].pos);
   1103 }
   1104 
   1105 /* Process the notes created by add_line_note as far as the current
   1106    location.  */
   1107 void
   1108 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
   1109 {
   1110   cpp_buffer *buffer = pfile->buffer;
   1111 
   1112   for (;;)
   1113     {
   1114       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
   1115       unsigned int col;
   1116 
   1117       if (note->pos > buffer->cur)
   1118 	break;
   1119 
   1120       buffer->cur_note++;
   1121       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
   1122 
   1123       if (note->type == '\\' || note->type == ' ')
   1124 	{
   1125 	  if (note->type == ' ' && !in_comment)
   1126 	    cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
   1127 				 "backslash and newline separated by space");
   1128 
   1129 	  if (buffer->next_line > buffer->rlimit)
   1130 	    {
   1131 	      cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
   1132 				   "backslash-newline at end of file");
   1133 	      /* Prevent "no newline at end of file" warning.  */
   1134 	      buffer->next_line = buffer->rlimit;
   1135 	    }
   1136 
   1137 	  buffer->line_base = note->pos;
   1138 	  CPP_INCREMENT_LINE (pfile, 0);
   1139 	}
   1140       else if (_cpp_trigraph_map[note->type])
   1141 	{
   1142 	  if (CPP_OPTION (pfile, warn_trigraphs)
   1143 	      && (!in_comment || warn_in_comment (pfile, note)))
   1144 	    {
   1145 	      if (CPP_OPTION (pfile, trigraphs))
   1146 		cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
   1147                                        pfile->line_table->highest_line, col,
   1148 				       "trigraph ??%c converted to %c",
   1149 				       note->type,
   1150 				       (int) _cpp_trigraph_map[note->type]);
   1151 	      else
   1152 		{
   1153 		  cpp_warning_with_line
   1154 		    (pfile, CPP_W_TRIGRAPHS,
   1155                      pfile->line_table->highest_line, col,
   1156 		     "trigraph ??%c ignored, use -trigraphs to enable",
   1157 		     note->type);
   1158 		}
   1159 	    }
   1160 	}
   1161       else if (note->type == 0)
   1162 	/* Already processed in lex_raw_string.  */;
   1163       else
   1164 	abort ();
   1165     }
   1166 }
   1167 
   1168 namespace bidi {
   1169   enum class kind {
   1170     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
   1171   };
   1172 
   1173   /* All the UTF-8 encodings of bidi characters start with E2.  */
   1174   constexpr uchar utf8_start = 0xe2;
   1175 
   1176   struct context
   1177   {
   1178     context () {}
   1179     context (location_t loc, kind k, bool pdf, bool ucn)
   1180     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
   1181     {
   1182     }
   1183 
   1184     kind get_pop_kind () const
   1185     {
   1186       return m_pdf ? kind::PDF : kind::PDI;
   1187     }
   1188     bool ucn_p () const
   1189     {
   1190       return m_ucn;
   1191     }
   1192 
   1193     location_t m_loc;
   1194     kind m_kind;
   1195     unsigned m_pdf : 1;
   1196     unsigned m_ucn : 1;
   1197   };
   1198 
   1199   /* A vector holding currently open bidi contexts.  We use a char for
   1200      each context, its LSB is 1 if it represents a PDF context, 0 if it
   1201      represents a PDI context.  The next bit is 1 if this context was open
   1202      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
   1203   semi_embedded_vec <context, 16> vec;
   1204 
   1205   /* Close the whole comment/identifier/string literal/character constant
   1206      context.  */
   1207   void on_close ()
   1208   {
   1209     vec.truncate (0);
   1210   }
   1211 
   1212   /* Pop the last element in the vector.  */
   1213   void pop ()
   1214   {
   1215     unsigned int len = vec.count ();
   1216     gcc_checking_assert (len > 0);
   1217     vec.truncate (len - 1);
   1218   }
   1219 
   1220   /* Return the pop kind of the context of the Ith element.  */
   1221   kind pop_kind_at (unsigned int i)
   1222   {
   1223     return vec[i].get_pop_kind ();
   1224   }
   1225 
   1226   /* Return the pop kind of the context that is currently opened.  */
   1227   kind current_ctx ()
   1228   {
   1229     unsigned int len = vec.count ();
   1230     if (len == 0)
   1231       return kind::NONE;
   1232     return vec[len - 1].get_pop_kind ();
   1233   }
   1234 
   1235   /* Return true if the current context comes from a UCN origin, that is,
   1236      the bidi char which started this bidi context was written as a UCN.  */
   1237   bool current_ctx_ucn_p ()
   1238   {
   1239     unsigned int len = vec.count ();
   1240     gcc_checking_assert (len > 0);
   1241     return vec[len - 1].m_ucn;
   1242   }
   1243 
   1244   location_t current_ctx_loc ()
   1245   {
   1246     unsigned int len = vec.count ();
   1247     gcc_checking_assert (len > 0);
   1248     return vec[len - 1].m_loc;
   1249   }
   1250 
   1251   /* We've read a bidi char, update the current vector as necessary.
   1252      LOC is only valid when K is not kind::NONE.  */
   1253   void on_char (kind k, bool ucn_p, location_t loc)
   1254   {
   1255     switch (k)
   1256       {
   1257       case kind::LRE:
   1258       case kind::RLE:
   1259       case kind::LRO:
   1260       case kind::RLO:
   1261 	vec.push (context (loc, k, true, ucn_p));
   1262 	break;
   1263       case kind::LRI:
   1264       case kind::RLI:
   1265       case kind::FSI:
   1266 	vec.push (context (loc, k, false, ucn_p));
   1267 	break;
   1268       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
   1269 	 whose scope has not yet been terminated.  */
   1270       case kind::PDF:
   1271 	if (current_ctx () == kind::PDF)
   1272 	  pop ();
   1273 	break;
   1274       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
   1275 	 scope has not yet been terminated, as well as the scopes of
   1276 	 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
   1277 	 yet been terminated.  */
   1278       case kind::PDI:
   1279 	for (int i = vec.count () - 1; i >= 0; --i)
   1280 	  if (pop_kind_at (i) == kind::PDI)
   1281 	    {
   1282 	      vec.truncate (i);
   1283 	      break;
   1284 	    }
   1285 	break;
   1286       case kind::LTR:
   1287       case kind::RTL:
   1288 	/* These aren't popped by a PDF/PDI.  */
   1289 	break;
   1290       ATTR_LIKELY case kind::NONE:
   1291 	break;
   1292       default:
   1293 	abort ();
   1294       }
   1295   }
   1296 
   1297   /* Return a descriptive string for K.  */
   1298   const char *to_str (kind k)
   1299   {
   1300     switch (k)
   1301       {
   1302       case kind::LRE:
   1303 	return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
   1304       case kind::RLE:
   1305 	return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
   1306       case kind::LRO:
   1307 	return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
   1308       case kind::RLO:
   1309 	return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
   1310       case kind::LRI:
   1311 	return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
   1312       case kind::RLI:
   1313 	return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
   1314       case kind::FSI:
   1315 	return "U+2068 (FIRST STRONG ISOLATE)";
   1316       case kind::PDF:
   1317 	return "U+202C (POP DIRECTIONAL FORMATTING)";
   1318       case kind::PDI:
   1319 	return "U+2069 (POP DIRECTIONAL ISOLATE)";
   1320       case kind::LTR:
   1321 	return "U+200E (LEFT-TO-RIGHT MARK)";
   1322       case kind::RTL:
   1323 	return "U+200F (RIGHT-TO-LEFT MARK)";
   1324       default:
   1325 	abort ();
   1326       }
   1327   }
   1328 }
   1329 
   1330 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
   1331    within the current line in FILE, with the caret at START.  */
   1332 
   1333 static location_t
   1334 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
   1335 					 const unsigned char *const start,
   1336 					 size_t num_bytes)
   1337 {
   1338   gcc_checking_assert (num_bytes > 0);
   1339 
   1340   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
   1341      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
   1342      whereas linemap_position_for_column is 1-based.  */
   1343 
   1344   /* Get 0-based offsets within the line.  */
   1345   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
   1346   size_t end_offset = start_offset + num_bytes - 1;
   1347 
   1348   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
   1349   location_t start_loc = linemap_position_for_column (pfile->line_table,
   1350 						      start_offset + 1);
   1351   location_t end_loc = linemap_position_for_column (pfile->line_table,
   1352 						     end_offset + 1);
   1353 
   1354   if (start_loc == end_loc)
   1355     return start_loc;
   1356 
   1357   source_range src_range;
   1358   src_range.m_start = start_loc;
   1359   src_range.m_finish = end_loc;
   1360   location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
   1361 						   start_loc,
   1362 						   src_range,
   1363 						   NULL);
   1364   return combined_loc;
   1365 }
   1366 
   1367 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
   1368 
   1369 static bidi::kind
   1370 get_bidi_utf8_1 (const unsigned char *const p)
   1371 {
   1372   gcc_checking_assert (p[0] == bidi::utf8_start);
   1373 
   1374   if (p[1] == 0x80)
   1375     switch (p[2])
   1376       {
   1377       case 0xaa:
   1378 	return bidi::kind::LRE;
   1379       case 0xab:
   1380 	return bidi::kind::RLE;
   1381       case 0xac:
   1382 	return bidi::kind::PDF;
   1383       case 0xad:
   1384 	return bidi::kind::LRO;
   1385       case 0xae:
   1386 	return bidi::kind::RLO;
   1387       case 0x8e:
   1388 	return bidi::kind::LTR;
   1389       case 0x8f:
   1390 	return bidi::kind::RTL;
   1391       default:
   1392 	break;
   1393       }
   1394   else if (p[1] == 0x81)
   1395     switch (p[2])
   1396       {
   1397       case 0xa6:
   1398 	return bidi::kind::LRI;
   1399       case 0xa7:
   1400 	return bidi::kind::RLI;
   1401       case 0xa8:
   1402 	return bidi::kind::FSI;
   1403       case 0xa9:
   1404 	return bidi::kind::PDI;
   1405       default:
   1406 	break;
   1407       }
   1408 
   1409   return bidi::kind::NONE;
   1410 }
   1411 
   1412 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
   1413    If the kind is not NONE, write the location to *OUT.*/
   1414 
   1415 static bidi::kind
   1416 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
   1417 {
   1418   bidi::kind result = get_bidi_utf8_1 (p);
   1419   if (result != bidi::kind::NONE)
   1420     {
   1421       /* We have a sequence of 3 bytes starting at P.  */
   1422       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
   1423     }
   1424   return result;
   1425 }
   1426 
   1427 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
   1428 
   1429 static bidi::kind
   1430 get_bidi_ucn_1 (const unsigned char *p, bool is_U)
   1431 {
   1432   /* 6.4.3 Universal Character Names
   1433       \u hex-quad
   1434       \U hex-quad hex-quad
   1435      where \unnnn means \U0000nnnn.  */
   1436 
   1437   if (is_U)
   1438     {
   1439       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
   1440 	return bidi::kind::NONE;
   1441       /* Skip 4B so we can treat \u and \U the same below.  */
   1442       p += 4;
   1443     }
   1444 
   1445   /* All code points we are looking for start with 20xx.  */
   1446   if (p[0] != '2' || p[1] != '0')
   1447     return bidi::kind::NONE;
   1448   else if (p[2] == '2')
   1449     switch (p[3])
   1450       {
   1451       case 'a':
   1452       case 'A':
   1453 	return bidi::kind::LRE;
   1454       case 'b':
   1455       case 'B':
   1456 	return bidi::kind::RLE;
   1457       case 'c':
   1458       case 'C':
   1459 	return bidi::kind::PDF;
   1460       case 'd':
   1461       case 'D':
   1462 	return bidi::kind::LRO;
   1463       case 'e':
   1464       case 'E':
   1465 	return bidi::kind::RLO;
   1466       default:
   1467 	break;
   1468       }
   1469   else if (p[2] == '6')
   1470     switch (p[3])
   1471       {
   1472       case '6':
   1473 	return bidi::kind::LRI;
   1474       case '7':
   1475 	return bidi::kind::RLI;
   1476       case '8':
   1477 	return bidi::kind::FSI;
   1478       case '9':
   1479 	return bidi::kind::PDI;
   1480       default:
   1481 	break;
   1482       }
   1483   else if (p[2] == '0')
   1484     switch (p[3])
   1485       {
   1486       case 'e':
   1487       case 'E':
   1488 	return bidi::kind::LTR;
   1489       case 'f':
   1490       case 'F':
   1491 	return bidi::kind::RTL;
   1492       default:
   1493 	break;
   1494       }
   1495 
   1496   return bidi::kind::NONE;
   1497 }
   1498 
   1499 /* Parse a UCN where P points just past \u or \U and return its bidi code.
   1500    If the kind is not NONE, write the location to *OUT.*/
   1501 
   1502 static bidi::kind
   1503 get_bidi_ucn (cpp_reader *pfile,  const unsigned char *p, bool is_U,
   1504 	      location_t *out)
   1505 {
   1506   bidi::kind result = get_bidi_ucn_1 (p, is_U);
   1507   if (result != bidi::kind::NONE)
   1508     {
   1509       const unsigned char *start = p - 2;
   1510       size_t num_bytes = 2 + (is_U ? 8 : 4);
   1511       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
   1512     }
   1513   return result;
   1514 }
   1515 
   1516 /* Subclass of rich_location for reporting on unpaired UTF-8
   1517    bidirectional control character(s).
   1518    Escape the source lines on output, and show all unclosed
   1519    bidi context, labelling everything.  */
   1520 
   1521 class unpaired_bidi_rich_location : public rich_location
   1522 {
   1523  public:
   1524   class custom_range_label : public range_label
   1525   {
   1526    public:
   1527      label_text get_text (unsigned range_idx) const FINAL OVERRIDE
   1528      {
   1529        /* range 0 is the primary location; each subsequent range i + 1
   1530 	  is for bidi::vec[i].  */
   1531        if (range_idx > 0)
   1532 	 {
   1533 	   const bidi::context &ctxt (bidi::vec[range_idx - 1]);
   1534 	   return label_text::borrow (bidi::to_str (ctxt.m_kind));
   1535 	 }
   1536        else
   1537 	 return label_text::borrow (_("end of bidirectional context"));
   1538      }
   1539   };
   1540 
   1541   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
   1542   : rich_location (pfile->line_table, loc, &m_custom_label)
   1543   {
   1544     set_escape_on_output (true);
   1545     for (unsigned i = 0; i < bidi::vec.count (); i++)
   1546       add_range (bidi::vec[i].m_loc,
   1547 		 SHOW_RANGE_WITHOUT_CARET,
   1548 		 &m_custom_label);
   1549   }
   1550 
   1551  private:
   1552    custom_range_label m_custom_label;
   1553 };
   1554 
   1555 /* We're closing a bidi context, that is, we've encountered a newline,
   1556    are closing a C-style comment, or are at the end of a string literal,
   1557    character constant, or identifier.  Warn if this context was not
   1558    properly terminated by a PDI or PDF.  P points to the last character
   1559    in this context.  */
   1560 
   1561 static void
   1562 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
   1563 {
   1564   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
   1565   if (bidi::vec.count () > 0
   1566       && (warn_bidi & bidirectional_unpaired
   1567 	  && (!bidi::current_ctx_ucn_p ()
   1568 	      || (warn_bidi & bidirectional_ucn))))
   1569     {
   1570       const location_t loc
   1571 	= linemap_position_for_column (pfile->line_table,
   1572 				       CPP_BUF_COLUMN (pfile->buffer, p));
   1573       unpaired_bidi_rich_location rich_loc (pfile, loc);
   1574       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
   1575 	 forms of a diagnostic, so fake it for now.  */
   1576       if (bidi::vec.count () > 1)
   1577 	cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1578 			"unpaired UTF-8 bidirectional control characters "
   1579 			"detected");
   1580       else
   1581 	cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1582 			"unpaired UTF-8 bidirectional control character "
   1583 			"detected");
   1584     }
   1585   /* We're done with this context.  */
   1586   bidi::on_close ();
   1587 }
   1588 
   1589 /* We're at the beginning or in the middle of an identifier/comment/string
   1590    literal/character constant.  Warn if we've encountered a bidi character.
   1591    KIND says which bidi control character it was; UCN_P is true iff this bidi
   1592    control character was written as a UCN.  LOC is the location of the
   1593    character, but is only valid if KIND != bidi::kind::NONE.  */
   1594 
   1595 static void
   1596 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
   1597 			 bool ucn_p, location_t loc)
   1598 {
   1599   if (__builtin_expect (kind == bidi::kind::NONE, 1))
   1600     return;
   1601 
   1602   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
   1603 
   1604   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
   1605     {
   1606       rich_location rich_loc (pfile->line_table, loc);
   1607       rich_loc.set_escape_on_output (true);
   1608 
   1609       /* It seems excessive to warn about a PDI/PDF that is closing
   1610 	 an opened context because we've already warned about the
   1611 	 opening character.  Except warn when we have a UCN x UTF-8
   1612 	 mismatch, if UCN checking is enabled.  */
   1613       if (kind == bidi::current_ctx ())
   1614 	{
   1615 	  if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
   1616 	      && bidi::current_ctx_ucn_p () != ucn_p)
   1617 	    {
   1618 	      rich_loc.add_range (bidi::current_ctx_loc ());
   1619 	      cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1620 			      "UTF-8 vs UCN mismatch when closing "
   1621 			      "a context by \"%s\"", bidi::to_str (kind));
   1622 	    }
   1623 	}
   1624       else if (warn_bidi & bidirectional_any
   1625 	       && (!ucn_p || (warn_bidi & bidirectional_ucn)))
   1626 	{
   1627 	  if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
   1628 	    cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1629 			    "\"%s\" is closing an unopened context",
   1630 			    bidi::to_str (kind));
   1631 	  else
   1632 	    cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1633 			    "found problematic Unicode character \"%s\"",
   1634 			    bidi::to_str (kind));
   1635 	}
   1636     }
   1637   /* We're done with this context.  */
   1638   bidi::on_char (kind, ucn_p, loc);
   1639 }
   1640 
   1641 /* Skip a C-style block comment.  We find the end of the comment by
   1642    seeing if an asterisk is before every '/' we encounter.  Returns
   1643    nonzero if comment terminated by EOF, zero otherwise.
   1644 
   1645    Buffer->cur points to the initial asterisk of the comment.  */
   1646 bool
   1647 _cpp_skip_block_comment (cpp_reader *pfile)
   1648 {
   1649   cpp_buffer *buffer = pfile->buffer;
   1650   const uchar *cur = buffer->cur;
   1651   uchar c;
   1652   const bool warn_bidi_p = pfile->warn_bidi_p ();
   1653 
   1654   cur++;
   1655   if (*cur == '/')
   1656     cur++;
   1657 
   1658   for (;;)
   1659     {
   1660       /* People like decorating comments with '*', so check for '/'
   1661 	 instead for efficiency.  */
   1662       c = *cur++;
   1663 
   1664       if (c == '/')
   1665 	{
   1666 	  if (cur[-2] == '*')
   1667 	    {
   1668 	      if (warn_bidi_p)
   1669 		maybe_warn_bidi_on_close (pfile, cur);
   1670 	      break;
   1671 	    }
   1672 
   1673 	  /* Warn about potential nested comments, but not if the '/'
   1674 	     comes immediately before the true comment delimiter.
   1675 	     Don't bother to get it right across escaped newlines.  */
   1676 	  if (CPP_OPTION (pfile, warn_comments)
   1677 	      && cur[0] == '*' && cur[1] != '/')
   1678 	    {
   1679 	      buffer->cur = cur;
   1680 	      cpp_warning_with_line (pfile, CPP_W_COMMENTS,
   1681 				     pfile->line_table->highest_line,
   1682 				     CPP_BUF_COL (buffer),
   1683 				     "\"/*\" within comment");
   1684 	    }
   1685 	}
   1686       else if (c == '\n')
   1687 	{
   1688 	  unsigned int cols;
   1689 	  buffer->cur = cur - 1;
   1690 	  if (warn_bidi_p)
   1691 	    maybe_warn_bidi_on_close (pfile, cur);
   1692 	  _cpp_process_line_notes (pfile, true);
   1693 	  if (buffer->next_line >= buffer->rlimit)
   1694 	    return true;
   1695 	  _cpp_clean_line (pfile);
   1696 
   1697 	  cols = buffer->next_line - buffer->line_base;
   1698 	  CPP_INCREMENT_LINE (pfile, cols);
   1699 
   1700 	  cur = buffer->cur;
   1701 	}
   1702       /* If this is a beginning of a UTF-8 encoding, it might be
   1703 	 a bidirectional control character.  */
   1704       else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
   1705 	{
   1706 	  location_t loc;
   1707 	  bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
   1708 	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   1709 	}
   1710     }
   1711 
   1712   buffer->cur = cur;
   1713   _cpp_process_line_notes (pfile, true);
   1714   return false;
   1715 }
   1716 
   1717 /* Skip a C++ line comment, leaving buffer->cur pointing to the
   1718    terminating newline.  Handles escaped newlines.  Returns nonzero
   1719    if a multiline comment.  */
   1720 static int
   1721 skip_line_comment (cpp_reader *pfile)
   1722 {
   1723   cpp_buffer *buffer = pfile->buffer;
   1724   location_t orig_line = pfile->line_table->highest_line;
   1725   const bool warn_bidi_p = pfile->warn_bidi_p ();
   1726 
   1727   if (!warn_bidi_p)
   1728     while (*buffer->cur != '\n')
   1729       buffer->cur++;
   1730   else
   1731     {
   1732       while (*buffer->cur != '\n'
   1733 	     && *buffer->cur != bidi::utf8_start)
   1734 	buffer->cur++;
   1735       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
   1736 	{
   1737 	  while (*buffer->cur != '\n')
   1738 	    {
   1739 	      if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
   1740 		{
   1741 		  location_t loc;
   1742 		  bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
   1743 		  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   1744 		}
   1745 	      buffer->cur++;
   1746 	    }
   1747 	  maybe_warn_bidi_on_close (pfile, buffer->cur);
   1748 	}
   1749     }
   1750 
   1751   _cpp_process_line_notes (pfile, true);
   1752   return orig_line != pfile->line_table->highest_line;
   1753 }
   1754 
   1755 /* Skips whitespace, saving the next non-whitespace character.

           C is the character the caller has already read; it is classified
           without being re-tested, so it is presumably known to be
           non-vertical space.  Further characters are then read from
           buffer->cur.  On return buffer->cur points at the first character
           for which is_nvspace is false.  A single warning is issued if any
           NUL was skipped.  */
   1756 static void
   1757 skip_whitespace (cpp_reader *pfile, cppchar_t c)
   1758 {
   1759   cpp_buffer *buffer = pfile->buffer;
   1760   bool saw_NUL = false;
   1761 
   1762   do
   1763     {
   1764       /* Horizontal space always OK.  */
   1765       if (c == ' ' || c == '\t')
   1766 	;
   1767       /* Just \f \v or \0 left.  */
   1768       else if (c == '\0')
   1769 	saw_NUL = true;
              /* C is \f or \v here; that is only diagnosed (as a pedwarn)
        	 inside a directive when CPP_PEDANTIC holds.  */
   1770       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
   1771 	cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
   1772 			     CPP_BUF_COL (buffer),
   1773 			     "%s in preprocessing directive",
   1774 			     c == '\f' ? "form feed" : "vertical tab");
   1775 
   1776       c = *buffer->cur++;
   1777     }
   1778   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
   1779   while (is_nvspace (c));
   1780 
          /* Report skipped NULs once, however many there were.  */
   1781   if (saw_NUL)
   1782     {
   1783       encoding_rich_location rich_loc (pfile);
   1784       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
   1785 		    "null character(s) ignored");
   1786     }
   1787 
          /* The loop consumed the first non-space character; give it back.  */
   1788   buffer->cur--;
   1789 }
   1790 
   1791 /* See if the characters of a number token are valid in a name (no
   1792    '.', '+' or '-').

           Returns 1 if each of the STRING->len bytes of STRING->text
           satisfies is_idchar and 0 as soon as one does not; a zero-length
           STRING therefore yields 1.  PFILE is not used.  */
   1793 static int
   1794 name_p (cpp_reader *pfile, const cpp_string *string)
   1795 {
   1796   unsigned int i;
   1797 
   1798   for (i = 0; i < string->len; i++)
   1799     if (!is_idchar (string->text[i]))
   1800       return 0;
   1801 
   1802   return 1;
   1803 }
   1804 
   1805 /* After parsing an identifier or other sequence, produce a warning about
   1806    sequences not in NFC/NFKC.

           TOKEN is the token just lexed and S the normalization state
           gathered while lexing it.  Nothing is reported unless
           NORMALIZE_STATE_RESULT (S) exceeds the configured warn_normalize
           level and pfile->state.skipping is false.  The token is respelled
           into a temporary heap buffer, which is freed before returning.  */
   1807 static void
   1808 warn_about_normalization (cpp_reader *pfile,
   1809 			  const cpp_token *token,
   1810 			  const struct normalize_state *s)
   1811 {
   1812   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
   1813       && !pfile->state.skipping)
   1814     {
   1815       location_t loc = token->src_loc;
   1816 
   1817       /* If possible, create a location range for the token.  */
              /* The end of the range is computed from buffer->cur, so it is
        	 only built when the next pending line note lies beyond
        	 buffer->cur or pfile->overlaid_buffer is set.  */
   1818       if (loc >= RESERVED_LOCATION_COUNT
   1819 	  && token->type != CPP_EOF
   1820 	  /* There must be no line notes to process.  */
   1821 	  && (!(pfile->buffer->cur
   1822 		>= pfile->buffer->notes[pfile->buffer->cur_note].pos
   1823 		&& !pfile->overlaid_buffer)))
   1824 	{
   1825 	  source_range tok_range;
   1826 	  tok_range.m_start = loc;
   1827 	  tok_range.m_finish
   1828 	    = linemap_position_for_column (pfile->line_table,
   1829 					   CPP_BUF_COLUMN (pfile->buffer,
   1830 							   pfile->buffer->cur));
   1831 	  loc = COMBINE_LOCATION_DATA (pfile->line_table,
   1832 				       loc, tok_range, NULL);
   1833 	}
   1834 
   1835       encoding_rich_location rich_loc (pfile, loc);
   1836 
   1837       /* Make sure that the token is printed using UCNs, even
   1838 	 if we'd otherwise happily print UTF-8.  */
   1839       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
   1840       size_t sz;
   1841 
              /* cpp_spell_token returns the end of what it wrote, so the
        	 difference is the spelling's length; it is printed with %.*s
        	 and needs no terminator.  */
   1842       sz = cpp_spell_token (pfile, token, buf, false) - buf;
              /* A normalized_C result gets the NFKC message; anything else
        	 gets the NFC message, as a pedwarn for C++ and as a plain
        	 warning otherwise.  */
   1843       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
   1844 	cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
   1845 			"`%.*s' is not in NFKC", (int) sz, buf);
   1846       else if (CPP_OPTION (pfile, cplusplus))
   1847 	cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
   1848 				  "`%.*s' is not in NFC", (int) sz, buf);
   1849       else
   1850 	cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
   1851 			"`%.*s' is not in NFC", (int) sz, buf);
   1852       free (buf);
   1853     }
   1854 }
   1855 
   1856 static const cppchar_t utf8_signifier = 0xC0;
        /* forms_identifier_p treats any byte >= utf8_signifier as the possible
           start of a multi-byte UTF-8 character.  */
   1857 
   1858 /* Returns TRUE if the sequence starting at buffer->cur is valid in
   1859    an identifier.  FIRST is TRUE if this starts an identifier.

           Three forms are recognized: '$' (only when dollars_in_ident is
           set) and, when extended_identifiers is set, a UTF-8 character or
           a \u / \U universal character name.  On success buffer->cur has
           been advanced past the accepted sequence.  STATE is passed on to
           the UTF-8 / UCN validators.  When pfile->warn_bidi_p () is true,
           candidate sequences are also checked for bidirectional control
           characters.  */
   1860 
   1861 static bool
   1862 forms_identifier_p (cpp_reader *pfile, int first,
   1863 		    struct normalize_state *state)
   1864 {
   1865   cpp_buffer *buffer = pfile->buffer;
   1866   const bool warn_bidi_p = pfile->warn_bidi_p ();
   1867 
   1868   if (*buffer->cur == '$')
   1869     {
   1870       if (!CPP_OPTION (pfile, dollars_in_ident))
   1871 	return false;
   1872 
   1873       buffer->cur++;
   1874       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
   1875 	{
        	  /* Clearing the option first means this pedwarn is issued
        	     at most once.  */
   1876 	  CPP_OPTION (pfile, warn_dollars) = 0;
   1877 	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
   1878 	}
   1879 
   1880       return true;
   1881     }
   1882 
   1883   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
   1884   if (CPP_OPTION (pfile, extended_identifiers))
   1885     {
   1886       cppchar_t s;
   1887       if (*buffer->cur >= utf8_signifier)
   1888 	{
   1889 	  if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
   1890 	      && warn_bidi_p)
   1891 	    {
   1892 	      location_t loc;
   1893 	      bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
   1894 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   1895 	    }
        	  /* The fourth argument is 1 when FIRST is true and 2
        	     otherwise.  NOTE(review): on failure this path relies on
        	     _cpp_valid_utf8 leaving buffer->cur untouched -- confirm.  */
   1896 	  if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
   1897 			       state, &s))
   1898 	    return true;
   1899 	}
   1900       else if (*buffer->cur == '\\'
   1901 	       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
   1902 	{
        	  /* Step over the backslash and the 'u'/'U'; buffer->cur[-1]
        	     is then the letter that says which form this is.  */
   1903 	  buffer->cur += 2;
   1904 	  if (warn_bidi_p)
   1905 	    {
   1906 	      location_t loc;
   1907 	      bidi::kind kind = get_bidi_ucn (pfile,
   1908 					      buffer->cur,
   1909 					      buffer->cur[-1] == 'U',
   1910 					      &loc);
   1911 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
   1912 	    }
   1913 	  if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
   1914 			      state, &s, NULL, NULL))
   1915 	    return true;
        	  /* Not a valid UCN: undo the two-character step above.  */
   1916 	  buffer->cur -= 2;
   1917 	}
   1918     }
   1919 
   1920   return false;
   1921 }
   1922 
   1923 /* Helper function to issue error about improper __VA_OPT__ use.

           At most one pedwarn is issued.  Because the two checks are joined
           by "else if", the va_args_ok check is not made at all when
           CPP_PEDANTIC holds and va_opt is off, even if the first check
           stays silent because we are in a system header.  */
   1924 static void
   1925 maybe_va_opt_error (cpp_reader *pfile)
   1926 {
   1927   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
   1928     {
   1929       /* __VA_OPT__ should not be accepted at all, but allow it in
   1930 	 system headers.  */
   1931       if (!_cpp_in_system_header (pfile))
   1932 	cpp_error (pfile, CPP_DL_PEDWARN,
   1933 		   "__VA_OPT__ is not available until C++20");
   1934     }
   1935   else if (!pfile->state.va_args_ok)
   1936     {
   1937       /* __VA_OPT__ should only appear in the replacement list of a
   1938 	 variadic macro.  */
   1939       cpp_error (pfile, CPP_DL_PEDWARN,
   1940 		 "__VA_OPT__ can only appear in the expansion"
   1941 		 " of a C++20 variadic macro");
   1942     }
   1943 }
   1944 
   1945 /* Helper function to get the cpp_hashnode of the identifier BASE.

           Unlike lex_identifier this reads from BASE alone and does not touch
           pfile->buffer.  The first character is hashed without being
           tested; the identifier ends at the first following character that
           is not ISIDNUM.  The node is looked up with HT_ALLOC, and the same
           lex-time diagnostics as in lex_identifier are issued for it.  */
   1946 static cpp_hashnode *
   1947 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
   1948 {
   1949   cpp_hashnode *result;
   1950   const uchar *cur;
   1951   unsigned int len;
   1952   unsigned int hash = HT_HASHSTEP (0, *base);
   1953 
          /* Hash the remaining identifier characters while measuring the
             length.  */
   1954   cur = base + 1;
   1955   while (ISIDNUM (*cur))
   1956     {
   1957       hash = HT_HASHSTEP (hash, *cur);
   1958       cur++;
   1959     }
   1960   len = cur - base;
   1961   hash = HT_HASHFINISH (hash, len);
   1962   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
   1963 					      base, len, hash, HT_ALLOC));
   1964 
   1965   /* Rarely, identifiers require diagnostics when lexed.  */
   1966   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
   1967 			&& !pfile->state.skipping, 0))
   1968     {
   1969       /* It is allowed to poison the same identifier twice.  */
   1970       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
   1971 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
   1972 		   NODE_NAME (result));
   1973 
   1974       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
   1975 	 replacement list of a variadic macro.  */
   1976       if (result == pfile->spec_nodes.n__VA_ARGS__
   1977 	  && !pfile->state.va_args_ok)
   1978 	{
   1979 	  if (CPP_OPTION (pfile, cplusplus))
   1980 	    cpp_error (pfile, CPP_DL_PEDWARN,
   1981 		       "__VA_ARGS__ can only appear in the expansion"
   1982 		       " of a C++11 variadic macro");
   1983 	  else
   1984 	    cpp_error (pfile, CPP_DL_PEDWARN,
   1985 		       "__VA_ARGS__ can only appear in the expansion"
   1986 		       " of a C99 variadic macro");
   1987 	}
   1988 
              /* __VA_OPT__ has its own set of checks.  */
   1989       if (result == pfile->spec_nodes.n__VA_OPT__)
   1990 	maybe_va_opt_error (pfile);
   1991 
   1992       /* For -Wc++-compat, warn about use of C++ named operators.  */
   1993       if (result->flags & NODE_WARN_OPERATOR)
   1994 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
   1995 		     "identifier \"%s\" is a special operator name in C++",
   1996 		     NODE_NAME (result));
   1997     }
   1998 
   1999   return result;
   2000 }
   2001 
   2002 /* Get the cpp_hashnode of an identifier specified by NAME in
   2003    the current cpp_reader object.  If none is found, NULL is returned.

           NOTE(review): this simply forwards to lex_identifier_intern, which
           looks the name up with HT_ALLOC and dereferences the result
           unconditionally, so the NULL case described above looks stale --
           confirm against ht_lookup_with_hash.  */
   2004 cpp_hashnode *
   2005 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
   2006 {
   2007   cpp_hashnode *result;
   2008   result = lex_identifier_intern (pfile, (uchar *) name);
   2009   return result;
   2010 }
   2011 
   2012 /* Lex an identifier starting at BUFFER->CUR - 1.

           BASE points at the identifier's first character.  When STARTS_UCN
           is true the fast scan is skipped and the slower path is taken
           straight away.  NST is updated as characters are scanned.  On
           return pfile->buffer->cur points just past the identifier.

           Returns the node for the identifier and stores in *SPELLING the
           node for its spelling.  On the fast path these are the same node;
           on the slow path the result comes from _cpp_interpret_identifier
           and *SPELLING from cpp_lookup on the raw bytes.  */
   2013 static cpp_hashnode *
   2014 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
   2015 		struct normalize_state *nst, cpp_hashnode **spelling)
   2016 {
   2017   cpp_hashnode *result;
   2018   const uchar *cur;
   2019   unsigned int len;
   2020   unsigned int hash = HT_HASHSTEP (0, *base);
   2021   const bool warn_bidi_p = pfile->warn_bidi_p ();
   2022 
   2023   cur = pfile->buffer->cur;
   2024   if (! starts_ucn)
   2025     {
              /* Fast path: scan plain identifier characters, hashing as we
        	 go.  */
   2026       while (ISIDNUM (*cur))
   2027 	{
   2028 	  hash = HT_HASHSTEP (hash, *cur);
   2029 	  cur++;
   2030 	}
              /* Only the last character scanned is fed to the normalization
        	 state here.  */
   2031       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
   2032     }
   2033   pfile->buffer->cur = cur;
   2034   if (starts_ucn || forms_identifier_p (pfile, false, nst))
   2035     {
   2036       /* Slower version for identifiers containing UCNs
   2037 	 or extended chars (including $).  */
              /* The hash accumulated above is not used on this path.  */
   2038       do {
   2039 	while (ISIDNUM (*pfile->buffer->cur))
   2040 	  {
   2041 	    NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
   2042 	    pfile->buffer->cur++;
   2043 	  }
   2044       } while (forms_identifier_p (pfile, false, nst));
   2045       if (warn_bidi_p)
   2046 	maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
   2047       result = _cpp_interpret_identifier (pfile, base,
   2048 					  pfile->buffer->cur - base);
   2049       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
   2050     }
   2051   else
   2052     {
   2053       len = cur - base;
   2054       hash = HT_HASHFINISH (hash, len);
   2055 
   2056       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
   2057 						  base, len, hash, HT_ALLOC));
   2058       *spelling = result;
   2059     }
   2060 
   2061   /* Rarely, identifiers require diagnostics when lexed.  */
   2062   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
   2063 			&& !pfile->state.skipping, 0))
   2064     {
   2065       /* It is allowed to poison the same identifier twice.  */
   2066       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
   2067 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
   2068 		   NODE_NAME (result));
   2069 
   2070       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
   2071 	 replacement list of a variadic macro.  */
   2072       if (result == pfile->spec_nodes.n__VA_ARGS__
   2073 	  && !pfile->state.va_args_ok)
   2074 	{
   2075 	  if (CPP_OPTION (pfile, cplusplus))
   2076 	    cpp_error (pfile, CPP_DL_PEDWARN,
   2077 		       "__VA_ARGS__ can only appear in the expansion"
   2078 		       " of a C++11 variadic macro");
   2079 	  else
   2080 	    cpp_error (pfile, CPP_DL_PEDWARN,
   2081 		       "__VA_ARGS__ can only appear in the expansion"
   2082 		       " of a C99 variadic macro");
   2083 	}
   2084 
   2085       /* __VA_OPT__ should only appear in the replacement list of a
   2086 	 variadic macro.  */
   2087       if (result == pfile->spec_nodes.n__VA_OPT__)
   2088 	maybe_va_opt_error (pfile);
   2089 
   2090       /* For -Wc++-compat, warn about use of C++ named operators.  */
   2091       if (result->flags & NODE_WARN_OPERATOR)
   2092 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
   2093 		     "identifier \"%s\" is a special operator name in C++",
   2094 		     NODE_NAME (result));
   2095     }
   2096 
   2097   return result;
   2098 }
   2099 
   2100 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.

           The spelling is copied into memory obtained from
           _cpp_unaligned_alloc and NUL-terminated; NUMBER->len does not
           count the terminator.  NST is updated for every character
           accepted.  On return pfile->buffer->cur points just past the
           number.  */
   2101 static void
   2102 lex_number (cpp_reader *pfile, cpp_string *number,
   2103 	    struct normalize_state *nst)
   2104 {
   2105   const uchar *cur;
   2106   const uchar *base;
   2107   uchar *dest;
   2108 
   2109   base = pfile->buffer->cur - 1;
          /* Each round scans a run of ordinary pp-number characters; another
             round follows whenever forms_identifier_p accepts what comes
             next.  */
   2110   do
   2111     {
   2112       const uchar *adj_digit_sep = NULL;
   2113       cur = pfile->buffer->cur;
   2114 
   2115       /* N.B. ISIDNUM does not include $.  */
   2116       while (ISIDNUM (*cur)
   2117 	     || (*cur == '.' && !DIGIT_SEP (cur[-1]))
   2118 	     || DIGIT_SEP (*cur)
   2119 	     || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
   2120 	{
   2121 	  NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
   2122 	  /* Adjacent digit separators do not form part of the pp-number syntax.
   2123 	     However, they can safely be diagnosed here as an error, since '' is
   2124 	     not a valid preprocessing token.  */
   2125 	  if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
   2126 	    adj_digit_sep = cur;
   2127 	  cur++;
   2128 	}
   2129       /* A number can't end with a digit separator.  */
              /* Back off over trailing separators, but never past the point
        	 where this round started.  */
   2130       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
   2131 	--cur;
              /* No error if the first adjacent pair was among the trailing
        	 separators just given back.  */
   2132       if (adj_digit_sep && adj_digit_sep < cur)
   2133 	cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
   2134 
   2135       pfile->buffer->cur = cur;
   2136     }
   2137   while (forms_identifier_p (pfile, false, nst));
   2138 
          /* NOTE(review): using CUR here relies on forms_identifier_p leaving
             pfile->buffer->cur where it was when it returns false.  */
   2139   number->len = cur - base;
   2140   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
   2141   memcpy (dest, base, number->len);
   2142   dest[number->len] = '\0';
   2143   number->text = dest;
   2144 }
   2145 
   2146 /* Create a token of type TYPE with a literal spelling.

           The LEN bytes at BASE are copied with cpp_alloc_token_string, so
           TOKEN does not keep a pointer to BASE itself.  Only the type and
           val.str members of TOKEN are written.  */
   2147 static void
   2148 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
   2149 		unsigned int len, enum cpp_ttype type)
   2150 {
   2151   token->type = type;
   2152   token->val.str.len = len;
   2153   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
   2154 }
   2155 
        /* Return a copy of the LEN bytes at PTR in memory obtained from
           _cpp_unaligned_alloc, followed by a terminating NUL that is not
           counted in LEN.  */
   2156 const uchar *
   2157 cpp_alloc_token_string (cpp_reader *pfile,
   2158 			const unsigned char *ptr, unsigned len)
   2159 {
   2160   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
   2161 
   2162   dest[len] = 0;
   2163   memcpy (dest, ptr, len);
   2164   return dest;
   2165 }
   2166 
   2167 /* A chain of raw buffers used for string literal lexing.  FIRST is the
   2168    head of the chain and LAST is the buffer currently being appended to.

           The accumulator also has a small read-back mode: after read_begin,
           bytes appended to LAST can be fetched again one at a time with
           read_char until the end of LAST is reached.  */
   2169 struct lit_accum {
          /* Head of the buffer chain; NULL until the first append.  */
   2170   _cpp_buff *first;
          /* Buffer that append writes into; NULL until the first append.  */
   2171   _cpp_buff *last;
          /* Read-back position inside LAST, or NULL when not reading.  */
   2172   const uchar *rpos;
          /* Total number of bytes appended so far.  */
   2173   size_t accum;
   2174 
   2175   lit_accum ()
   2176     : first (NULL), last (NULL), rpos (0), accum (0)
   2177   {
   2178   }
   2179 
   2180   void append (cpp_reader *, const uchar *, size_t);
   2181 
   2182   void read_begin (cpp_reader *);
          /* True while appended bytes remain to be read back.  */
   2183   bool reading_p () const
   2184   {
   2185     return rpos != NULL;
   2186   }
          /* Return the byte at RPOS and advance; leave read-back mode once
             RPOS reaches the front of LAST.  Only valid while reading_p ().  */
   2187   char read_char ()
   2188   {
   2189     char c = *rpos++;
   2190     if (rpos == BUFF_FRONT (last))
   2191       rpos = NULL;
   2192     return c;
   2193   }
   2194 };
   2195 
   2196 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
   2197    sequence running from FIRST to LAST, obtaining or extending a buffer
           when LAST has too little room.  ACCUM is increased by LEN in
           total.  */
   2198 
   2199 void
   2200 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
   2201 {
   2202   if (!last)
   2203     /* Starting.  */
   2204     first = last = _cpp_get_buff (pfile, len);
   2205   else if (len > BUFF_ROOM (last))
   2206     {
   2207       /* There is insufficient room in the buffer.  Copy what we can,
   2208 	 and then either extend or create a new one.  */
   2209       size_t room = BUFF_ROOM (last);
   2210       memcpy (BUFF_FRONT (last), base, room);
   2211       BUFF_FRONT (last) += room;
   2212       base += room;
   2213       len -= room;
   2214       accum += room;
   2215 
              /* RPOS points into LAST (see read_begin), so LAST must not be
        	 replaced while a read-back is in progress.  */
   2216       gcc_checking_assert (!rpos);
   2217 
   2218       last = _cpp_append_extend_buff (pfile, last, len);
   2219     }
   2220 
          /* Common tail: whatever is left of the data now fits in LAST.  */
   2221   memcpy (BUFF_FRONT (last), base, len);
   2222   BUFF_FRONT (last) += len;
   2223   accum += len;
   2224 }
   2225 
        /* Enter read-back mode: remember the current front of LAST in RPOS so
           that bytes appended from now on can be fetched with read_char.
           LAST must already exist, i.e. append must have been called.  */
   2226 void
   2227 lit_accum::read_begin (cpp_reader *pfile)
   2228 {
   2229   /* We never accumulate more than 4 chars to read.  */
          /* Making room for them up front keeps the following appends out of
             the buffer-switching path in append, which asserts !rpos.  The
             blank line below does not end the "if"; the assignment to LAST is
             its body.  */
   2230   if (BUFF_ROOM (last) < 4)
   2231 
   2232     last = _cpp_append_extend_buff (pfile, last, 4);
   2233   rpos = BUFF_FRONT (last);
   2234 }
   2235 
   2236 /* Returns true if a macro has been defined.
   2237    This might not work if compiling with -save-temps,
   2238    or preprocessing separately from compilation.

           BASE points at the candidate name: it must begin with an ISIDST
           character (otherwise false is returned) and extends over the
           following ISIDNUM characters.  The name is looked up with
           HT_NO_INSERT, so the hash table is not modified.  */
   2239 
   2240 static bool
   2241 is_macro(cpp_reader *pfile, const uchar *base)
   2242 {
   2243   const uchar *cur = base;
   2244   if (! ISIDST (*cur))
   2245     return false;
   2246   unsigned int hash = HT_HASHSTEP (0, *cur);
   2247   ++cur;
   2248   while (ISIDNUM (*cur))
   2249     {
   2250       hash = HT_HASHSTEP (hash, *cur);
   2251       ++cur;
   2252     }
   2253   hash = HT_HASHFINISH (hash, cur - base);
   2254 
   2255   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
   2256 					base, cur - base, hash, HT_NO_INSERT));
   2257 
          /* RESULT is tested for NULL first, since nothing was inserted.  */
   2258   return result && cpp_macro_p (result);
   2259 }
   2260 
   2261 /* Returns true if a literal suffix does not have the expected form
   2262    and is defined as a macro.

           BASE points at the text immediately following the literal.  A
           suffix that starts with exactly one underscore is never reported;
           anything else is passed to is_macro.  */
   2263 
   2264 static bool
   2265 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
   2266 {
   2267   /* User-defined literals outside of namespace std must start with a single
   2268      underscore, so assume anything of that form really is a UDL suffix.
   2269      We don't need to worry about UDLs defined inside namespace std because
   2270      their names are reserved, so cannot be used as macro names in valid
   2271      programs.  */
   2272   if (base[0] == '_' && base[1] != '_')
   2273     return false;
   2274   return is_macro (pfile, base);
   2275 }
   2276 
   2277 /* Lexes a raw string.  The stored string contains the spelling,
   2278    including double quotes, delimiter string, '(' and ')', any leading
   2279    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
   2280    the type of the literal, or CPP_OTHER if it was not properly
   2281    terminated.
   2282 
   2283    BASE is the start of the token.  Updates pfile->buffer->cur to just
   2284    after the lexed string.
   2285 
   2286    The spelling is NUL-terminated, but it is not guaranteed that this
   2287    is the first NUL since embedded NULs are preserved.

           Implementation notes.  PREFIX collects the delimiter found between
           R" and '(' and is completed with a '"', so it holds exactly the
           bytes that must follow a ')' to end the literal.  PHASE is
           PHASE_PREFIX while the delimiter is being read, PHASE_NONE inside
           the body, and otherwise the index into PREFIX of the next closing
           character expected.  PREFIX_LEN == 0 once the prefix phase is over
           marks failure mode, in which lexing stops at the next '"'.

           Text that cannot be taken directly from the line buffer (line
           notes that have to be undone, and earlier physical lines of a
           multi-line literal) is gathered in a lit_accum and joined with the
           tail of the current line at OUT.  If the file ends inside the
           literal, TOKEN is turned into CPP_EOF instead.  */
   2288 
   2289 static void
   2290 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
   2291 {
   2292   const uchar *pos = base;
   2293   const bool warn_bidi_p = pfile->warn_bidi_p ();
   2294 
   2295   /* 'tis a pity this information isn't passed down from the lexer's
   2296      initial categorization of the token.  */
   2297   enum cpp_ttype type = CPP_STRING;
   2298 
          /* Step over the encoding prefix, if any, noting the string type.  */
   2299   if (*pos == 'L')
   2300     {
   2301       type = CPP_WSTRING;
   2302       pos++;
   2303     }
   2304   else if (*pos == 'U')
   2305     {
   2306       type = CPP_STRING32;
   2307       pos++;
   2308     }
   2309   else if (*pos == 'u')
   2310     {
   2311       if (pos[1] == '8')
   2312 	{
   2313 	  type = CPP_UTF8STRING;
   2314 	  pos++;
   2315 	}
   2316       else
   2317 	type = CPP_STRING16;
   2318       pos++;
   2319     }
   2320 
   2321   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
   2322   pos += 2;
   2323 
   2324   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
   2325 
   2326   /* Skip notes before the ".  */
   2327   while (note->pos < pos)
   2328     ++note;
   2329 
   2330   lit_accum accum;
   2331 
          /* Up to 16 delimiter characters plus the closing '"'.  */
   2332   uchar prefix[17];
   2333   unsigned prefix_len = 0;
   2334   enum Phase
   2335   {
   2336    PHASE_PREFIX = -2,
   2337    PHASE_NONE = -1,
   2338    PHASE_SUFFIX = 0
   2339   } phase = PHASE_PREFIX;
   2340 
   2341   for (;;)
   2342     {
   2343       gcc_checking_assert (note->pos >= pos);
   2344 
   2345       /* Undo any escaped newlines and trigraphs.  */
              /* Each undone note is re-created in ACCUM and then read back
        	 one character at a time through the read-back mode; no further
        	 note is undone while such a read-back is in progress.  */
   2346       if (!accum.reading_p () && note->pos == pos)
   2347 	switch (note->type)
   2348 	  {
   2349 	  case '\\':
   2350 	  case ' ':
   2351 	    /* Restore backslash followed by newline.  */
   2352 	    accum.append (pfile, base, pos - base);
   2353 	    base = pos;
   2354 	    accum.read_begin (pfile);
   2355 	    accum.append (pfile, UC"\\", 1);
   2356 
   2357 	  after_backslash:
   2358 	    if (note->type == ' ')
   2359 	      /* GNU backslash whitespace newline extension.  FIXME
   2360 		 could be any sequence of non-vertical space.  When we
   2361 		 can properly restore any such sequence, we should
   2362 		 mark this note as handled so _cpp_process_line_notes
   2363 		 doesn't warn.  */
   2364 	      accum.append (pfile, UC" ", 1);
   2365 
   2366 	    accum.append (pfile, UC"\n", 1);
   2367 	    note++;
   2368 	    break;
   2369 
   2370 	  case '\n':
   2371 	    /* This can happen for ??/<NEWLINE> when trigraphs are not
   2372 	       being interpreted.  */
   2373 	    gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
   2374 	    note->type = 0;
   2375 	    note++;
   2376 	    break;
   2377 
   2378 	  default:
   2379 	    gcc_checking_assert (_cpp_trigraph_map[note->type]);
   2380 
   2381 	    /* Don't warn about this trigraph in
   2382 	       _cpp_process_line_notes, since trigraphs show up as
   2383 	       trigraphs in raw strings.  */
        	    /* NB: this TYPE shadows the function-level literal TYPE
        	       for the rest of the switch.  */
   2384 	    uchar type = note->type;
   2385 	    note->type = 0;
   2386 
   2387 	    if (CPP_OPTION (pfile, trigraphs))
   2388 	      {
   2389 		accum.append (pfile, base, pos - base);
   2390 		base = pos;
   2391 		accum.read_begin (pfile);
   2392 		accum.append (pfile, UC"??", 2);
   2393 		accum.append (pfile, &type, 1);
   2394 
   2395 		/* ??/ followed by newline gets two line notes, one for
   2396 		   the trigraph and one for the backslash/newline.  */
   2397 		if (type == '/' && note[1].pos == pos)
   2398 		  {
   2399 		    note++;
   2400 		    gcc_assert (note->type == '\\' || note->type == ' ');
   2401 		    goto after_backslash;
   2402 		  }
   2403 		/* Skip the replacement character.  */
   2404 		base = ++pos;
   2405 	      }
   2406 
   2407 	    note++;
   2408 	    break;
   2409 	  }
   2410 
   2411       /* Now get a char to process.  Either from an expanded note, or
   2412 	 from the line buffer.  */
   2413       bool read_note = accum.reading_p ();
   2414       char c = read_note ? accum.read_char () : *pos++;
   2415 
   2416       if (phase == PHASE_PREFIX)
   2417 	{
   2418 	  if (c == '(')
   2419 	    {
   2420 	      /* Done.  */
        	      /* Appending the quote makes PREFIX the full closing
        		 sequence and guarantees PREFIX_LEN is nonzero.  */
   2421 	      phase = PHASE_NONE;
   2422 	      prefix[prefix_len++] = '"';
   2423 	    }
   2424 	  else if (prefix_len < 16
   2425 		   /* Prefix chars are any of the basic character set,
   2426 		      [lex.charset] except for '
   2427 		      ()\\\t\v\f\n'. Optimized for a contiguous
   2428 		      alphabet.  */
   2429 		   /* Unlike a switch, this collapses down to one or
   2430 		      two shift and bitmask operations on an ASCII
   2431 		      system, with an outlier or two.   */
   2432 		   && (('Z' - 'A' == 25
   2433 			? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
   2434 			: ISIDST (c))
   2435 		       || (c >= '0' && c <= '9')
   2436 		       || c == '_' || c == '{' || c == '}'
   2437 		       || c == '[' || c == ']' || c == '#'
   2438 		       || c == '<' || c == '>' || c == '%'
   2439 		       || c == ':' || c == ';' || c == '.' || c == '?'
   2440 		       || c == '*' || c == '+' || c == '-' || c == '/'
   2441 		       || c == '^' || c == '&' || c == '|' || c == '~'
   2442 		       || c == '!' || c == '=' || c == ','
   2443 		       || c == '"' || c == '\''))
   2444 	    prefix[prefix_len++] = c;
   2445 	  else
   2446 	    {
   2447 	      /* Something is wrong.  */
        	      /* READ_NOTE (0 or 1) is added to the reported column.  */
   2448 	      int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
   2449 	      if (prefix_len == 16)
   2450 		cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
   2451 				     col, "raw string delimiter longer "
   2452 				     "than 16 characters");
   2453 	      else if (c == '\n')
   2454 		cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
   2455 				     col, "invalid new-line in raw "
   2456 				     "string delimiter");
   2457 	      else
   2458 		cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
   2459 				     col, "invalid character '%c' in "
   2460 				     "raw string delimiter", c);
   2461 	      type = CPP_OTHER;
   2462 	      phase = PHASE_NONE;
   2463 	      /* Continue until we get a close quote, that's probably
   2464 		 the best failure mode.  */
   2465 	      prefix_len = 0;
   2466 	    }
        	  /* A newline, which is never accepted into the delimiter,
        	     still needs the end-of-line handling further down.  */
   2467 	  if (c != '\n')
   2468 	    continue;
   2469 	}
   2470 
              /* We are part-way through a candidate closing sequence.  On a
        	 mismatch C is examined again below, as it may itself be a
        	 ')' or a newline.  */
   2471       if (phase != PHASE_NONE)
   2472 	{
   2473 	  if (prefix[phase] != c)
   2474 	    phase = PHASE_NONE;
   2475 	  else if (unsigned (phase + 1) == prefix_len)
   2476 	    break;
   2477 	  else
   2478 	    {
   2479 	      phase = Phase (phase + 1);
   2480 	      continue;
   2481 	    }
   2482 	}
   2483 
              /* Note that the jump to OUT skips the bidi close check and the
        	 literal-suffix handling that follow the loop.  */
   2484       if (!prefix_len && c == '"')
   2485 	/* Failure mode lexing.  */
   2486 	goto out;
   2487       else if (prefix_len && c == ')')
   2488 	phase = PHASE_SUFFIX;
              /* A newline taken from the line buffer itself: the end of a
        	 physical line inside the literal.  */
   2489       else if (!read_note && c == '\n')
   2490 	{
   2491 	  pos--;
   2492 	  pfile->buffer->cur = pos;
   2493 	  if (pfile->state.in_directive
   2494 	      || (pfile->state.parsing_args
   2495 		  && pfile->buffer->next_line >= pfile->buffer->rlimit))
   2496 	    {
   2497 	      cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
   2498 				   "unterminated raw string");
   2499 	      type = CPP_OTHER;
   2500 	      goto out;
   2501 	    }
   2502 
        	  /* Bank the rest of this line, newline included, before the
        	     line buffer is refilled.  */
   2503 	  accum.append (pfile, base, pos - base + 1);
   2504 	  _cpp_process_line_notes (pfile, false);
   2505 
   2506 	  if (pfile->buffer->next_line < pfile->buffer->rlimit)
   2507 	    CPP_INCREMENT_LINE (pfile, 0);
   2508 	  pfile->buffer->need_line = true;
   2509 
   2510 	  if (!_cpp_get_fresh_line (pfile))
   2511 	    {
   2512 	      /* We ran out of file and failed to get a line.  */
   2513 	      location_t src_loc = token->src_loc;
   2514 	      token->type = CPP_EOF;
   2515 	      /* Tell the compiler the line number of the EOF token.  */
   2516 	      token->src_loc = pfile->line_table->highest_line;
   2517 	      token->flags = BOL;
   2518 	      if (accum.first)
   2519 		_cpp_release_buff (pfile, accum.first);
   2520 	      cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
   2521 				   "unterminated raw string");
   2522 	      /* Now pop the buffer that _cpp_get_fresh_line did not.  */
   2523 	      _cpp_pop_buffer (pfile);
   2524 	      return;
   2525 	    }
   2526 
        	  /* Restart scanning, and note tracking, on the new line.  */
   2527 	  pos = base = pfile->buffer->cur;
   2528 	  note = &pfile->buffer->notes[pfile->buffer->cur_note];
   2529 	}
   2530       else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
   2531 	       && warn_bidi_p)
   2532 	{
   2533 	  location_t loc;
   2534 	  bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
   2535 	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   2536 	}
   2537     }
   2538 
          /* Reached only via the "break" above, i.e. after the complete
             closing sequence has been matched.  */
   2539   if (warn_bidi_p)
   2540     maybe_warn_bidi_on_close (pfile, pos);
   2541 
   2542   if (CPP_OPTION (pfile, user_literals))
   2543     {
   2544       /* If a string format macro, say from inttypes.h, is placed touching
   2545 	 a string literal it could be parsed as a C++11 user-defined string
   2546 	 literal thus breaking the program.  */
   2547       if (is_macro_not_literal_suffix (pfile, pos))
   2548 	{
   2549 	  /* Raise a warning, but do not consume subsequent tokens.  */
   2550 	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
   2551 	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
   2552 				   token->src_loc, 0,
   2553 				   "invalid suffix on literal; C++11 requires "
   2554 				   "a space between literal and string macro");
   2555 	}
   2556       /* Grab user defined literal suffix.  */
   2557       else if (ISIDST (*pos))
   2558 	{
   2559 	  type = cpp_userdef_string_add_type (type);
   2560 	  ++pos;
   2561 
   2562 	  while (ISIDNUM (*pos))
   2563 	    ++pos;
   2564 	}
   2565     }
   2566 
   2567  out:
   2568   pfile->buffer->cur = pos;
          /* Nothing was banked: the whole spelling is the contiguous range
             [BASE, POS) of the line buffer.  */
   2569   if (!accum.accum)
   2570     create_literal (pfile, token, base, pos - base, type);
   2571   else
   2572     {
              /* Join the chained buffers, in order, with the not yet banked
        	 tail [BASE, POS), then release the chain.  */
   2573       size_t extra_len = pos - base;
   2574       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
   2575 
   2576       token->type = type;
   2577       token->val.str.len = accum.accum + extra_len;
   2578       token->val.str.text = dest;
   2579       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
   2580 	{
   2581 	  size_t len = BUFF_FRONT (buf) - buf->base;
   2582 	  memcpy (dest, buf->base, len);
   2583 	  dest += len;
   2584 	}
   2585       _cpp_release_buff (pfile, accum.first);
   2586       memcpy (dest, base, extra_len);
   2587       dest[extra_len] = '\0';
   2588     }
   2589 }
   2590 
   2591 /* Lexes a string, character constant, or angle-bracketed header file
   2592    name starting at BASE.  The stored string contains the spelling,
   2593    including opening quote and any leading 'L', 'u', 'U' or 'u8' and
   2594    optional 'R' modifier (raw strings are handed to lex_raw_string).
   2595    The literal's type is passed on to create_literal, as CPP_OTHER if
   2596    it was not properly terminated; an unterminated header name instead
   2597    gets TOKEN->type set to CPP_LESS, as it must be relexed as normal
   2598    tokens.  The spelling is NUL-terminated, but it is not guaranteed that
   2599    this is the first NUL since embedded NULs are preserved.  */
   2600 static void
   2601 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
   2602 {
   2603   bool saw_NUL = false;
   2604   const uchar *cur;
   2605   cppchar_t terminator;
   2606   enum cpp_ttype type;
   2607 
   2608   cur = base;
          /* Step over any encoding prefix ('L', 'U', 'u' or 'u8') so that
             TERMINATOR ends up holding the opening delimiter, or 'R'.  */
   2609   terminator = *cur++;
   2610   if (terminator == 'L' || terminator == 'U')
   2611     terminator = *cur++;
   2612   else if (terminator == 'u')
   2613     {
   2614       terminator = *cur++;
   2615       if (terminator == '8')
   2616 	terminator = *cur++;
   2617     }
   2618   if (terminator == 'R')
   2619     {
   2620       lex_raw_string (pfile, token, base);
   2621       return;
   2622     }
          /* The prefix, read back from BASE, selects the literal's type.  */
   2623   if (terminator == '"')
   2624     type = (*base == 'L' ? CPP_WSTRING :
   2625 	    *base == 'U' ? CPP_STRING32 :
   2626 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
   2627 			 : CPP_STRING);
   2628   else if (terminator == '\'')
   2629     type = (*base == 'L' ? CPP_WCHAR :
   2630 	    *base == 'U' ? CPP_CHAR32 :
   2631 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
   2632 			 : CPP_CHAR);
   2633   else
            /* Neither quote: scan for a closing '>' as a header name.  */
   2634     terminator = '>', type = CPP_HEADER_NAME;
   2635 
   2636   const bool warn_bidi_p = pfile->warn_bidi_p ();
   2637   for (;;)
   2638     {
   2639       cppchar_t c = *cur++;
   2640 
   2641       /* In #include-style directives, terminators are not escapable.  */
   2642       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
   2643 	{
   2644 	  if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
   2645 	    {
   2646 	      location_t loc;
   2647 	      bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
   2648 					      &loc);
   2649 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
   2650 	    }
                  /* Step over the escaped character so that it cannot be
                     taken for the terminator.  */
   2651 	  cur++;
   2652 	}
   2653       else if (c == terminator)
   2654 	{
   2655 	  if (warn_bidi_p)
   2656 	    maybe_warn_bidi_on_close (pfile, cur - 1);
   2657 	  break;
   2658 	}
   2659       else if (c == '\n')
   2660 	{
                  /* Step back so the newline is not part of the literal.  */
   2661 	  cur--;
   2662 	  /* Unmatched quotes always yield undefined behavior, but
   2663 	     greedy lexing means that what appears to be an unterminated
   2664 	     header name may actually be a legitimate sequence of tokens.  */
   2665 	  if (terminator == '>')
   2666 	    {
   2667 	      token->type = CPP_LESS;
   2668 	      return;
   2669 	    }
   2670 	  type = CPP_OTHER;
   2671 	  break;
   2672 	}
   2673       else if (c == '\0')
   2674 	saw_NUL = true;
   2675       else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
   2676 	{
   2677 	  location_t loc;
   2678 	  bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
   2679 	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   2680 	}
   2681     }
   2682 
   2683   if (saw_NUL && !pfile->state.skipping)
   2684     cpp_error (pfile, CPP_DL_WARNING,
   2685 	       "null character(s) preserved in literal");
   2686 
   2687   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
   2688     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
   2689 	       (int) terminator);
   2690 
   2691   if (CPP_OPTION (pfile, user_literals))
   2692     {
   2693       /* If a string format macro, say from inttypes.h, is placed touching
   2694 	 a string literal it could be parsed as a C++11 user-defined string
   2695 	 literal thus breaking the program.  */
   2696       if (is_macro_not_literal_suffix (pfile, cur))
   2697 	{
   2698 	  /* Raise a warning, but do not consume subsequent tokens.  */
   2699 	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
   2700 	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
   2701 				   token->src_loc, 0,
   2702 				   "invalid suffix on literal; C++11 requires "
   2703 				   "a space between literal and string macro");
   2704 	}
   2705       /* Grab user defined literal suffix.  */
   2706       else if (ISIDST (*cur))
   2707 	{
   2708 	  type = cpp_userdef_char_add_type (type);
   2709 	  type = cpp_userdef_string_add_type (type);
   2710           ++cur;
   2711 
   2712 	  while (ISIDNUM (*cur))
   2713 	    ++cur;
   2714 	}
   2715     }
   2716   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
   2717 	   && is_macro (pfile, cur)
   2718 	   && !pfile->state.skipping)
   2719     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
   2720 			   token->src_loc, 0, "C++11 requires a space "
   2721 			   "between string literal and macro");
   2722 
          /* Commit the scan position, then build the literal from the
             CUR - BASE characters starting at BASE.  */
   2723   pfile->buffer->cur = cur;
   2724   create_literal (pfile, token, base, cur - base, type);
   2725 }
   2726 
   2727 /* Return the comment table. The client may not make any assumption
   2728    about the ordering of the table.  */
   2729 cpp_comment_table *
   2730 cpp_get_comments (cpp_reader *pfile)
   2731 {
   2732   return &pfile->comments;
   2733 }
   2734 
   2735 /* Append a comment to the end of the comment table. */
   2736 static void
   2737 store_comment (cpp_reader *pfile, cpp_token *token)
   2738 {
   2739   int len;
   2740 
   2741   if (pfile->comments.allocated == 0)
   2742     {
   2743       pfile->comments.allocated = 256;
   2744       pfile->comments.entries = (cpp_comment *) xmalloc
   2745 	(pfile->comments.allocated * sizeof (cpp_comment));
   2746     }
   2747 
   2748   if (pfile->comments.count == pfile->comments.allocated)
   2749     {
   2750       pfile->comments.allocated *= 2;
   2751       pfile->comments.entries = (cpp_comment *) xrealloc
   2752 	(pfile->comments.entries,
   2753 	 pfile->comments.allocated * sizeof (cpp_comment));
   2754     }
   2755 
   2756   len = token->val.str.len;
   2757 
   2758   /* Copy comment. Note, token may not be NULL terminated. */
   2759   pfile->comments.entries[pfile->comments.count].comment =
   2760     (char *) xmalloc (sizeof (char) * (len + 1));
   2761   memcpy (pfile->comments.entries[pfile->comments.count].comment,
   2762 	  token->val.str.text, len);
   2763   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
   2764 
   2765   /* Set source location. */
   2766   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
   2767 
   2768   /* Increment the count of entries in the comment table. */
   2769   pfile->comments.count++;
   2770 }
   2771 
   2772 /* The stored comment includes the comment start and any terminator.  */
   2773 static void
   2774 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
   2775 	      cppchar_t type)
   2776 {
   2777   unsigned char *buffer;
   2778   unsigned int len, clen, i;
   2779 
   2780   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
   2781 
   2782   /* C++ comments probably (not definitely) have moved past a new
   2783      line, which we don't want to save in the comment.  */
   2784   if (is_vspace (pfile->buffer->cur[-1]))
   2785     len--;
   2786 
   2787   /* If we are currently in a directive or in argument parsing, then
   2788      we need to store all C++ comments as C comments internally, and
   2789      so we need to allocate a little extra space in that case.
   2790 
   2791      Note that the only time we encounter a directive here is
   2792      when we are saving comments in a "#define".  */
   2793   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
   2794 	  && type == '/') ? len + 2 : len;
   2795 
   2796   buffer = _cpp_unaligned_alloc (pfile, clen);
   2797 
   2798   token->type = CPP_COMMENT;
   2799   token->val.str.len = clen;
   2800   token->val.str.text = buffer;
   2801 
   2802   buffer[0] = '/';
   2803   memcpy (buffer + 1, from, len - 1);
   2804 
   2805   /* Finish conversion to a C comment, if necessary.  */
   2806   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
   2807     {
   2808       buffer[1] = '*';
   2809       buffer[clen - 2] = '*';
   2810       buffer[clen - 1] = '/';
   2811       /* As there can be in a C++ comments illegal sequences for C comments
   2812          we need to filter them out.  */
   2813       for (i = 2; i < (clen - 2); i++)
   2814         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
   2815           buffer[i] = '|';
   2816     }
   2817 
   2818   /* Finally store this comment for use by clients of libcpp. */
   2819   store_comment (pfile, token);
   2820 }
   2821 
   2822 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
   2823    comment.  A '*' at COMMENT_START selects the C block comment rules;
           any other character there is treated as a C++ line comment.  The
           text examined starts at COMMENT_START + 1, and length checks are
           made against pfile->buffer->cur.  Which spellings are accepted
           depends on the cpp_warn_implicit_fallthrough level, as described
           case by case below.  */
   2824 
   2825 static bool
   2826 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
   2827 {
   2828   const unsigned char *from = comment_start + 1;
   2829 
   2830   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
   2831     {
   2832       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
   2833 	 don't recognize any comments.  The latter only checks attributes,
   2834 	 the former doesn't warn.  */
   2835     case 0:
   2836     default:
   2837       return false;
   2838       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
   2839 	 content it has.  */
   2840     case 1:
   2841       return true;
   2842     case 2:
   2843       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
   2844 	 .*falls?[ \t-]*thr(u|ough).* regex.  */
   2845       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
   2846 	   from++)
   2847 	{
   2848 	  /* Is there anything like strpbrk with upper boundary, or
   2849 	     memchr looking for 2 characters rather than just one?  */
   2850 	  if (from[0] != 'f' && from[0] != 'F')
   2851 	    continue;
   2852 	  if (from[1] != 'a' && from[1] != 'A')
   2853 	    continue;
   2854 	  if (from[2] != 'l' && from[2] != 'L')
   2855 	    continue;
   2856 	  if (from[3] != 'l' && from[3] != 'L')
   2857 	    continue;
                  /* "fall" matched.  FROM is advanced past it, so a failed
                     match below resumes the search after this point.  */
   2858 	  from += sizeof "fall" - 1;
   2859 	  if (from[0] == 's' || from[0] == 'S')
   2860 	    from++;
   2861 	  while (*from == ' ' || *from == '\t' || *from == '-')
   2862 	    from++;
   2863 	  if (from[0] != 't' && from[0] != 'T')
   2864 	    continue;
   2865 	  if (from[1] != 'h' && from[1] != 'H')
   2866 	    continue;
   2867 	  if (from[2] != 'r' && from[2] != 'R')
   2868 	    continue;
   2869 	  if (from[3] == 'u' || from[3] == 'U')
   2870 	    return true;
   2871 	  if (from[3] != 'o' && from[3] != 'O')
   2872 	    continue;
   2873 	  if (from[4] != 'u' && from[4] != 'U')
   2874 	    continue;
   2875 	  if (from[5] != 'g' && from[5] != 'G')
   2876 	    continue;
   2877 	  if (from[6] != 'h' && from[6] != 'H')
   2878 	    continue;
   2879 	  return true;
   2880 	}
   2881       return false;
              /* Levels 3 and 4 go on to the whole-comment patterns below.  */
   2882     case 3:
   2883     case 4:
   2884       break;
   2885     }
   2886 
   2887   /* Whole comment contents:
   2888      -fallthrough
   2889      @fallthrough@
   2890    */
   2891   if (*from == '-' || *from == '@')
   2892     {
   2893       size_t len = sizeof "fallthrough" - 1;
   2894       if ((size_t) (pfile->buffer->cur - from - 1) < len)
   2895 	return false;
   2896       if (memcmp (from + 1, "fallthrough", len))
   2897 	return false;
   2898       if (*from == '@')
   2899 	{
   2900 	  if (from[len + 1] != '@')
   2901 	    return false;
   2902 	  len++;
   2903 	}
   2904       from += 1 + len;
   2905     }
   2906   /* Whole comment contents (regex):
   2907      lint -fallthrough[ \t]*
   2908    */
   2909   else if (*from == 'l')
   2910     {
   2911       size_t len = sizeof "int -fallthrough" - 1;
   2912       if ((size_t) (pfile->buffer->cur - from - 1) < len)
   2913 	return false;
   2914       if (memcmp (from + 1, "int -fallthrough", len))
   2915 	return false;
   2916       from += 1 + len;
   2917       while (*from == ' ' || *from == '\t')
   2918 	from++;
   2919     }
   2920   /* Whole comment contents (regex):
   2921      [ \t]*FALLTHR(U|OUGH)[ \t]*
   2922    */
   2923   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
   2924     {
   2925       while (*from == ' ' || *from == '\t')
   2926 	from++;
   2927       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
   2928 	return false;
   2929       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
   2930 	return false;
   2931       from += sizeof "FALLTHR" - 1;
   2932       if (*from == 'U')
   2933 	from++;
   2934       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
   2935 	return false;
   2936       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
   2937 	return false;
   2938       else
   2939 	from += sizeof "OUGH" - 1;
   2940       while (*from == ' ' || *from == '\t')
   2941 	from++;
   2942     }
   2943   /* Whole comment contents (regex):
   2944      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
   2945      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
   2946      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
   2947    */
   2948   else
   2949     {
   2950       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
   2951 	from++;
              /* F holds the first letter of the word currently being matched.
                 ALL_UPPER is set once a fully upper-case word has been seen;
                 the checks below then reject lower-case spellings.  */
   2952       unsigned char f = *from;
   2953       bool all_upper = false;
   2954       if (f == 'E' || f == 'e')
   2955 	{
   2956 	  if ((size_t) (pfile->buffer->cur - from)
   2957 	      < sizeof "else fallthru" - 1)
   2958 	    return false;
   2959 	  if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
   2960 	    all_upper = true;
   2961 	  else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
   2962 	    return false;
   2963 	  from += sizeof "else" - 1;
   2964 	  if (*from == ',')
   2965 	    from++;
   2966 	  if (*from != ' ')
   2967 	    return false;
   2968 	  from++;
   2969 	  if (all_upper && *from == 'f')
   2970 	    return false;
   2971 	  if (f == 'e' && *from == 'F')
   2972 	    return false;
   2973 	  f = *from;
   2974 	}
   2975       else if (f == 'I' || f == 'i')
   2976 	{
   2977 	  if ((size_t) (pfile->buffer->cur - from)
   2978 	      < sizeof "intentional fallthru" - 1)
   2979 	    return false;
   2980 	  if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
   2981 				  sizeof "NTENTIONAL" - 1) == 0)
   2982 	    all_upper = true;
   2983 	  else if (memcmp (from + 1, "ntentional",
   2984 			   sizeof "ntentional" - 1))
   2985 	    return false;
   2986 	  from += sizeof "intentional" - 1;
   2987 	  if (*from == ' ')
   2988 	    {
   2989 	      from++;
   2990 	      if (all_upper && *from == 'f')
   2991 		return false;
   2992 	    }
   2993 	  else if (all_upper)
   2994 	    {
   2995 	      if (memcmp (from, "LY F", sizeof "LY F" - 1))
   2996 		return false;
   2997 	      from += sizeof "LY " - 1;
   2998 	    }
   2999 	  else
   3000 	    {
   3001 	      if (memcmp (from, "ly ", sizeof "ly " - 1))
   3002 		return false;
   3003 	      from += sizeof "ly " - 1;
   3004 	    }
   3005 	  if (f == 'i' && *from == 'F')
   3006 	    return false;
   3007 	  f = *from;
   3008 	}
   3009       if (f != 'F' && f != 'f')
   3010 	return false;
   3011       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
   3012 	return false;
   3013       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
   3014 	all_upper = true;
   3015       else if (all_upper)
   3016 	return false;
   3017       else if (memcmp (from + 1, "all", sizeof "all" - 1))
   3018 	return false;
   3019       from += sizeof "fall" - 1;
   3020       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
   3021 	from += 2;
   3022       else if (*from == ' ' || *from == '-')
   3023 	from++;
   3024       else if (*from != (all_upper ? 'T' : 't'))
   3025 	return false;
   3026       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
   3027 	return false;
   3028       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
   3029 	return false;
   3030       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
   3031 	{
   3032 	  if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
   3033 	    return false;
   3034 	  if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
   3035 		      sizeof "hrough" - 1))
   3036 	    return false;
   3037 	  from += sizeof "through" - 1;
   3038 	}
   3039       else
   3040 	from += sizeof "thru" - 1;
   3041       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
   3042 	from++;
              /* Optional trailing "-..." part: skip it up to the end of the
                 line, or up to the closing delimiter of a block comment.  */
   3043       if (*from == '-')
   3044 	{
   3045 	  from++;
   3046 	  if (*comment_start == '*')
   3047 	    {
   3048 	      do
   3049 		{
   3050 		  while (*from && *from != '*'
   3051 			 && *from != '\n' && *from != '\r')
   3052 		    from++;
   3053 		  if (*from != '*' || from[1] == '/')
   3054 		    break;
   3055 		  from++;
   3056 		}
   3057 	      while (1);
   3058 	    }
   3059 	  else
   3060 	    while (*from && *from != '\n' && *from != '\r')
   3061 	      from++;
   3062 	}
   3063     }
          /* Whatever matched above must be followed directly by the end of
             the comment.  */
   3064   /* C block comment.  */
   3065   if (*comment_start == '*')
   3066     {
   3067       if (*from != '*' || from[1] != '/')
   3068 	return false;
   3069     }
   3070   /* C++ line comment.  */
   3071   else if (*from != '\n')
   3072     return false;
   3073 
   3074   return true;
   3075 }
   3076 
   3077 /* Allocate COUNT tokens for RUN.  */
   3078 void
   3079 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
   3080 {
   3081   run->base = XNEWVEC (cpp_token, count);
   3082   run->limit = run->base + count;
   3083   run->next = NULL;
   3084 }
   3085 
   3086 /* Returns the next tokenrun, or creates one if there is none.  */
   3087 static tokenrun *
   3088 next_tokenrun (tokenrun *run)
   3089 {
   3090   if (run->next == NULL)
   3091     {
   3092       run->next = XNEW (tokenrun);
   3093       run->next->prev = run;
   3094       _cpp_init_tokenrun (run->next, 250);
   3095     }
   3096 
   3097   return run->next;
   3098 }
   3099 
   3100 /* Return the number of not yet processed token in a given
   3101    context.  */
   3102 int
   3103 _cpp_remaining_tokens_num_in_context (cpp_context *context)
   3104 {
   3105   if (context->tokens_kind == TOKENS_KIND_DIRECT)
   3106     return (LAST (context).token - FIRST (context).token);
   3107   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
   3108 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
   3109     return (LAST (context).ptoken - FIRST (context).ptoken);
   3110   else
   3111       abort ();
   3112 }
   3113 
   3114 /* Returns the token present at index INDEX in a given context.  If
   3115    INDEX is zero, the next token to be processed is returned.  */
   3116 static const cpp_token*
   3117 _cpp_token_from_context_at (cpp_context *context, int index)
   3118 {
   3119   if (context->tokens_kind == TOKENS_KIND_DIRECT)
   3120     return &(FIRST (context).token[index]);
   3121   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
   3122 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
   3123     return FIRST (context).ptoken[index];
   3124  else
   3125    abort ();
   3126 }
   3127 
   3128 /* Look ahead in the input stream.  */
   3129 const cpp_token *
   3130 cpp_peek_token (cpp_reader *pfile, int index)
   3131 {
   3132   cpp_context *context = pfile->context;
   3133   const cpp_token *peektok;
   3134   int count;
   3135 
   3136   /* First, scan through any pending cpp_context objects.  */
   3137   while (context->prev)
   3138     {
   3139       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
   3140 
   3141       if (index < (int) sz)
   3142         return _cpp_token_from_context_at (context, index);
   3143       index -= (int) sz;
   3144       context = context->prev;
   3145     }
   3146 
   3147   /* We will have to read some new tokens after all (and do so
   3148      without invalidating preceding tokens).  */
   3149   count = index;
   3150   pfile->keep_tokens++;
   3151 
   3152   /* For peeked tokens temporarily disable line_change reporting,
   3153      until the tokens are parsed for real.  */
   3154   void (*line_change) (cpp_reader *, const cpp_token *, int)
   3155     = pfile->cb.line_change;
   3156   pfile->cb.line_change = NULL;
   3157 
   3158   do
   3159     {
   3160       peektok = _cpp_lex_token (pfile);
   3161       if (peektok->type == CPP_EOF)
   3162 	{
   3163 	  index--;
   3164 	  break;
   3165 	}
   3166       else if (peektok->type == CPP_PRAGMA)
   3167 	{
   3168 	  /* Don't peek past a pragma.  */
   3169 	  if (peektok == &pfile->directive_result)
   3170 	    /* Save the pragma in the buffer.  */
   3171 	    *pfile->cur_token++ = *peektok;
   3172 	  index--;
   3173 	  break;
   3174 	}
   3175     }
   3176   while (index--);
   3177 
   3178   _cpp_backup_tokens_direct (pfile, count - index);
   3179   pfile->keep_tokens--;
   3180   pfile->cb.line_change = line_change;
   3181 
   3182   return peektok;
   3183 }
   3184 
   3185 /* Allocate a single token that is invalidated at the same time as the
   3186    rest of the tokens on the line.  Has its line and col set to the
   3187    same as the last lexed token, so that diagnostics appear in the
   3188    right place.  */
   3189 cpp_token *
   3190 _cpp_temp_token (cpp_reader *pfile)
   3191 {
   3192   cpp_token *old, *result;
          /* SZ is the number of token slots from cur_token to the end of the
             current run; LA is the number of pending lookahead tokens.  */
   3193   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
   3194   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
   3195 
          /* The token just before cur_token supplies the new token's
             location.  */
   3196   old = pfile->cur_token - 1;
   3197   /* Any pre-existing lookaheads must not be clobbered.  */
   3198   if (la)
   3199     {
              /* The lookaheads reach (or pass) the end of this run, so moving
                 them up by one slot involves the next run.  First shift the
                 LA - SZ tokens at its start, then copy this run's final
                 token into its first slot.  */
   3200       if (sz <= la)
   3201         {
   3202           tokenrun *next = next_tokenrun (pfile->cur_run);
   3203 
   3204           if (sz < la)
   3205             memmove (next->base + 1, next->base,
   3206                      (la - sz) * sizeof (cpp_token));
   3207 
   3208           next->base[0] = pfile->cur_run->limit[-1];
   3209         }
   3210 
              /* Shift the tokens that stay within this run up by one slot.  */
   3211       if (sz > 1)
   3212         memmove (pfile->cur_token + 1, pfile->cur_token,
   3213                  MIN (la, sz - 1) * sizeof (cpp_token));
   3214     }
   3215 
          /* No slot left in this run: continue in the next one.  */
   3216   if (!sz && pfile->cur_token == pfile->cur_run->limit)
   3217     {
   3218       pfile->cur_run = next_tokenrun (pfile->cur_run);
   3219       pfile->cur_token = pfile->cur_run->base;
   3220     }
   3221 
          /* The new token takes the slot at cur_token.  Only its location is
             initialized here.  */
   3222   result = pfile->cur_token++;
   3223   result->src_loc = old->src_loc;
   3224   return result;
   3225 }
   3226 
   3227 /* We're at the beginning of a logical line (so not in
   3228   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
   3229   if we should enter deferred_pragma mode to tokenize the rest of the
   3230   line as a module control-line.  Up to two further tokens are peeked
          with _cpp_lex_direct and then backed up again.  If the line is not
          a module control-line, the lexer state changed here is restored.  */
   3231 
   3232 static void
   3233 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
   3234 {
   3235   unsigned backup = 0; /* Tokens we peeked.  */
   3236   cpp_hashnode *node = result->val.node.node;
   3237   cpp_token *peek = result;
   3238   cpp_token *keyword = peek;
   3239   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
          /* Nonzero only for '{,__}import' lines.
             NOTE(review): the 2 and 16 added below are only stored into
             directive_file_token here; their meaning is defined by whatever
             reads that field -- confirm.  */
   3240   int header_count = 0;
   3241 
   3242   /* Make sure the incoming state is as we expect it.  This way we
   3243      can restore it using constants.  */
   3244   gcc_checking_assert (!pfile->state.in_deferred_pragma
   3245 		       && !pfile->state.skipping
   3246 		       && !pfile->state.parsing_args
   3247 		       && !pfile->state.angled_headers
   3248 		       && (pfile->state.save_comments
   3249 			   == !CPP_OPTION (pfile, discard_comments)));
   3250 
   3251   /* Enter directives mode sufficiently for peeking.  We don't have
   3252      to actually set in_directive.  */
   3253   pfile->state.in_deferred_pragma = true;
   3254 
   3255   /* These two fields are needed to process tokenization in deferred
   3256      pragma mode.  They are not used outside deferred pragma mode or
   3257      directives mode.  */
   3258   pfile->state.pragma_allow_expansion = true;
   3259   pfile->directive_line = result->src_loc;
   3260 
   3261   /* Saving comments is incompatible with directives mode.   */
   3262   pfile->state.save_comments = 0;
   3263 
          /* A leading 'export' must be followed by another module keyword,
             which then becomes KEYWORD.  */
   3264   if (node == n_modules[spec_nodes::M_EXPORT][0])
   3265     {
   3266       peek = _cpp_lex_direct (pfile);
   3267       keyword = peek;
   3268       backup++;
   3269       if (keyword->type != CPP_NAME)
   3270 	goto not_module;
   3271       node = keyword->val.node.node;
   3272       if (!(node->flags & NODE_MODULE))
   3273 	goto not_module;
   3274     }
   3275 
   3276   if (node == n_modules[spec_nodes::M__IMPORT][0])
   3277     /* __import  */
   3278     header_count = backup + 2 + 16;
   3279   else if (node == n_modules[spec_nodes::M_IMPORT][0])
   3280     /* import  */
   3281     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
   3282   else if (node == n_modules[spec_nodes::M_MODULE][0])
   3283     ; /* module  */
   3284   else
   3285     goto not_module;
   3286 
   3287   /* We've seen [export] {module|import|__import}.  Check the next token.  */
   3288   if (header_count)
   3289     /* After '{,__}import' a header name may appear.  */
   3290     pfile->state.angled_headers = true;
   3291   peek = _cpp_lex_direct (pfile);
   3292   backup++;
   3293 
   3294   /* ... import followed by identifier, ':', '<' or
   3295      header-name preprocessing tokens, or module
   3296      followed by cpp-identifier, ':' or ';' preprocessing
   3297      tokens.  C++ keywords are not yet relevant.  */
   3298   if (peek->type == CPP_NAME
   3299       || peek->type == CPP_COLON
   3300       ||  (header_count
   3301 	   ? (peek->type == CPP_LESS
   3302 	      || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
   3303 	      || peek->type == CPP_HEADER_NAME)
   3304 	   : peek->type == CPP_SEMICOLON))
   3305     {
   3306       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
   3307       if (!pfile->state.pragma_allow_expansion)
   3308 	pfile->state.prevent_expansion++;
   3309 
   3310       if (!header_count && linemap_included_from
   3311 	  (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
   3312 	cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
   3313 			     "module control-line cannot be in included file");
   3314 
   3315       /* The first one or two tokens cannot be macro names.  */
   3316       for (int ix = backup; ix--;)
   3317 	{
   3318 	  cpp_token *tok = ix ? keyword : result;
                  /* This NODE shadows the function-level NODE.  */
   3319 	  cpp_hashnode *node = tok->val.node.node;
   3320 
   3321 	  /* Don't attempt to expand the token.  */
   3322 	  tok->flags |= NO_EXPAND;
   3323 	  if (_cpp_defined_macro_p (node)
   3324 	      && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
   3325 	      && !cpp_fun_like_macro_p (node))
   3326 	    cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
   3327 				 "module control-line \"%s\" cannot be"
   3328 				 " an object-like macro",
   3329 				 NODE_NAME (node));
   3330 	}
   3331 
   3332       /* Map to underbar variants.  */
   3333       keyword->val.node.node = n_modules[header_count
   3334 					 ? spec_nodes::M_IMPORT
   3335 					 : spec_nodes::M_MODULE][1];
   3336       if (backup != 1)
   3337 	result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
   3338 
   3339       /* Maybe tell the tokenizer we expect a header-name down the
   3340 	 road.  */
   3341       pfile->state.directive_file_token = header_count;
   3342     }
   3343   else
   3344     {
   3345     not_module:
   3346       /* Drop out of directive mode.  */
   3347       /* We asserted save_comments had this value upon entry.  */
   3348       pfile->state.save_comments
   3349 	= !CPP_OPTION (pfile, discard_comments);
   3350       pfile->state.in_deferred_pragma = false;
   3351       /* Do not let this remain on.  */
   3352       pfile->state.angled_headers = false;
   3353     }
   3354 
   3355   /* In either case we want to backup the peeked tokens.  */
   3356   if (backup)
   3357     {
   3358       /* If we saw EOL, we should drop it, because this isn't a module
   3359 	 control-line after all.  */
   3360       bool eol = peek->type == CPP_PRAGMA_EOL;
   3361       if (!eol || backup > 1)
   3362 	{
   3363 	  /* Put the peeked tokens back.  */
   3364 	  _cpp_backup_tokens_direct (pfile, backup);
   3365 	  /* But if the last one was an EOL, forget it.  */
   3366 	  if (eol)
   3367 	    pfile->lookaheads--;
   3368 	}
   3369     }
   3370 }
   3371 
   3372 /* Lex a token into RESULT (external interface).  Takes care of issues
   3373    like directive handling, token lookahead, multiple include
   3374    optimization and skipping.  Returns the token; while
           pfile->state.skipping is set outside a directive, tokens other
           than CPP_EOF are discarded rather than returned.  */
   3375 const cpp_token *
   3376 _cpp_lex_token (cpp_reader *pfile)
   3377 {
   3378   cpp_token *result;
   3379 
   3380   for (;;)
   3381     {
              /* Move on to the next token run when this one is used up.  */
   3382       if (pfile->cur_token == pfile->cur_run->limit)
   3383 	{
   3384 	  pfile->cur_run = next_tokenrun (pfile->cur_run);
   3385 	  pfile->cur_token = pfile->cur_run->base;
   3386 	}
   3387       /* We assume that the current token is somewhere in the current
   3388 	 run.  */
   3389       if (pfile->cur_token < pfile->cur_run->base
   3390 	  || pfile->cur_token >= pfile->cur_run->limit)
   3391 	abort ();
   3392 
              /* Hand out an already-lexed lookahead token if there is one,
                 otherwise lex a new token.  */
   3393       if (pfile->lookaheads)
   3394 	{
   3395 	  pfile->lookaheads--;
   3396 	  result = pfile->cur_token++;
   3397 	}
   3398       else
   3399 	result = _cpp_lex_direct (pfile);
   3400 
   3401       if (result->flags & BOL)
   3402 	{
   3403 	  /* Is this a directive.  If _cpp_handle_directive returns
   3404 	     false, it is an assembler #.  */
   3405 	  if (result->type == CPP_HASH
   3406 	      /* 6.10.3 p 11: Directives in a list of macro arguments
   3407 		 gives undefined behavior.  This implementation
   3408 		 handles the directive as normal.  */
   3409 	      && pfile->state.parsing_args != 1)
   3410 	    {
   3411 	      if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
   3412 		{
                          /* A CPP_PADDING result means the directive left
                             no token to return, so lex the next one.  */
   3413 		  if (pfile->directive_result.type == CPP_PADDING)
   3414 		    continue;
   3415 		  result = &pfile->directive_result;
   3416 		}
   3417 	    }
   3418 	  else if (pfile->state.in_deferred_pragma)
   3419 	    result = &pfile->directive_result;
   3420 	  else if (result->type == CPP_NAME
   3421 		   && (result->val.node.node->flags & NODE_MODULE)
   3422 		   && !pfile->state.skipping
   3423 		   /* Unlike regular directives, we do not deal with
   3424 		      tokenizing module directives as macro arguments.
   3425 		      That's not permitted.  */
   3426 		   && !pfile->state.parsing_args)
   3427 	    {
   3428 	      /* P1857.  Before macro expansion, At start of logical
   3429 		 line ... */
   3430 	      /* We don't have to consider lookaheads at this point.  */
   3431 	      gcc_checking_assert (!pfile->lookaheads);
   3432 
   3433 	      cpp_maybe_module_directive (pfile, result);
   3434 	    }
   3435 
   3436 	  if (pfile->cb.line_change && !pfile->state.skipping)
   3437 	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
   3438 	}
   3439 
   3440       /* We don't skip tokens in directives.  */
   3441       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
   3442 	break;
   3443 
   3444       /* Outside a directive, invalidate controlling macros.  At file
   3445 	 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
   3446 	 get here and MI optimization works.  */
   3447       pfile->mi_valid = false;
   3448 
              /* While skipping, only CPP_EOF ends the loop; any other token
                 is dropped and the next one is lexed.  */
   3449       if (!pfile->state.skipping || result->type == CPP_EOF)
   3450 	break;
   3451     }
   3452 
   3453   return result;
   3454 }
   3455 
   3456 /* Returns true if a fresh line has been loaded.  */
   3457 bool
   3458 _cpp_get_fresh_line (cpp_reader *pfile)
   3459 {
   3460   /* We can't get a new line until we leave the current directive.  */
   3461   if (pfile->state.in_directive)
   3462     return false;
   3463 
   3464   for (;;)
   3465     {
   3466       cpp_buffer *buffer = pfile->buffer;
   3467 
   3468       if (!buffer->need_line)
   3469 	return true;
   3470 
   3471       if (buffer->next_line < buffer->rlimit)
   3472 	{
   3473 	  _cpp_clean_line (pfile);
   3474 	  return true;
   3475 	}
   3476 
   3477       /* First, get out of parsing arguments state.  */
   3478       if (pfile->state.parsing_args)
   3479 	return false;
   3480 
   3481       /* End of buffer.  Non-empty files should end in a newline.  */
   3482       if (buffer->buf != buffer->rlimit
   3483 	  && buffer->next_line > buffer->rlimit
   3484 	  && !buffer->from_stage3)
   3485 	{
   3486 	  /* Clip to buffer size.  */
   3487 	  buffer->next_line = buffer->rlimit;
   3488 	}
   3489 
   3490       if (buffer->prev && !buffer->return_at_eof)
   3491 	_cpp_pop_buffer (pfile);
   3492       else
   3493 	{
   3494 	  /* End of translation.  Do not pop the buffer yet. Increment
   3495 	     line number so that the EOF token is on a line of its own
   3496 	     (_cpp_lex_direct doesn't increment in that case, because
   3497 	     it's hard for it to distinguish this special case). */
   3498 	  CPP_INCREMENT_LINE (pfile, 0);
   3499 	  return false;
   3500 	}
   3501     }
   3502 }
   3503 
   3504 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
   3505   do							\
   3506     {							\
   3507       result->type = ELSE_TYPE;				\
   3508       if (*buffer->cur == CHAR)				\
   3509 	buffer->cur++, result->type = THEN_TYPE;	\
   3510     }							\
   3511   while (0)
   3512 
   3513 /* Lex a token into pfile->cur_token, which is also incremented, to
   3514    get diagnostics pointing to the correct location.
   3515 
   3516    Does not handle issues such as token lookahead, multiple-include
   3517    optimization, directives, skipping etc.  This function is only
   3518    suitable for use by _cpp_lex_token, and in special cases like
   3519    lex_expansion_token which doesn't care for any of these issues.
   3520 
   3521    When meeting a newline, returns CPP_EOF if parsing a directive,
   3522    otherwise returns to the start of the token buffer if permissible.
   3523    Returns a pointer to the lexed token, with its type, flags and
           src_loc filled in (the function returns the token itself, not a
           location).  */
   3524 cpp_token *
   3525 _cpp_lex_direct (cpp_reader *pfile)
   3526 {
   3527   cppchar_t c;
   3528   cpp_buffer *buffer;
   3529   const unsigned char *comment_start;
   3530   bool fallthrough_comment = false;
   3531   cpp_token *result = pfile->cur_token++;
   3532 
          /* Entered at the start and again after every newline: clear the
             token flags and, if the buffer wants a new line, try to get
             one before looking at any character.  */
   3533  fresh_line:
   3534   result->flags = 0;
   3535   buffer = pfile->buffer;
   3536   if (buffer->need_line)
   3537     {
   3538       if (pfile->state.in_deferred_pragma)
   3539 	{
   3540 	  /* This can happen in cases like:
   3541 	     #define loop(x) whatever
   3542 	     #pragma omp loop
   3543 	     where when trying to expand loop we need to peek
   3544 	     next token after loop, but aren't still in_deferred_pragma
   3545 	     mode but are in in_directive mode, so buffer->need_line
   3546 	     is set, a CPP_EOF is peeked.  */
   3547 	  result->type = CPP_PRAGMA_EOL;
   3548 	  pfile->state.in_deferred_pragma = false;
   3549 	  if (!pfile->state.pragma_allow_expansion)
   3550 	    pfile->state.prevent_expansion--;
   3551 	  return result;
   3552 	}
   3553       if (!_cpp_get_fresh_line (pfile))
   3554 	{
   3555 	  result->type = CPP_EOF;
   3556 	  /* Not a real EOF in a directive or arg parsing -- we refuse
   3557   	     to advance to the next file now, and will once we're out
   3558   	     of those modes.  */
   3559 	  if (!pfile->state.in_directive && !pfile->state.parsing_args)
   3560 	    {
   3561 	      /* Tell the compiler the line number of the EOF token.  */
   3562 	      result->src_loc = pfile->line_table->highest_line;
   3563 	      result->flags = BOL;
   3564 	      /* Now pop the buffer that _cpp_get_fresh_line did not.  */
   3565 	      _cpp_pop_buffer (pfile);
   3566 	    }
   3567 	  return result;
   3568 	}
              /* _cpp_get_fresh_line may have popped to a different buffer;
                 a FALLTHROUGH comment seen in the old one is forgotten.  */
   3569       if (buffer != pfile->buffer)
   3570 	fallthrough_comment = false;
              /* When tokens need not be kept, rewind to the first slot of
                 the base token run and lex into that instead.  */
   3571       if (!pfile->keep_tokens)
   3572 	{
   3573 	  pfile->cur_run = &pfile->base_run;
   3574 	  result = pfile->base_run.base;
   3575 	  pfile->cur_token = result + 1;
   3576 	}
   3577       result->flags = BOL;
   3578       if (pfile->state.parsing_args == 2)
   3579 	result->flags |= PREV_WHITE;
   3580     }
   3581   buffer = pfile->buffer;
          /* Re-entered after a skipped comment.  */
   3582  update_tokens_line:
   3583   result->src_loc = pfile->line_table->highest_line;
   3584 
          /* Re-entered after skipping horizontal whitespace.  */
   3585  skipped_white:
   3586   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
   3587       && !pfile->overlaid_buffer)
   3588     {
   3589       _cpp_process_line_notes (pfile, false);
   3590       result->src_loc = pfile->line_table->highest_line;
   3591     }
   3592   c = *buffer->cur++;
   3593 
   3594   if (pfile->forced_token_location)
   3595     result->src_loc = pfile->forced_token_location;
   3596   else
   3597     result->src_loc = linemap_position_for_column (pfile->line_table,
   3598 					  CPP_BUF_COLUMN (buffer, buffer->cur));
   3599 
          /* Dispatch on the first character.  Cases that produce a token
             `break' out to the range computation at the bottom; whitespace,
             newlines and discarded comments jump back to a label above.  */
   3600   switch (c)
   3601     {
   3602     case ' ': case '\t': case '\f': case '\v': case '\0':
   3603       result->flags |= PREV_WHITE;
   3604       skip_whitespace (pfile, c);
   3605       goto skipped_white;
   3606 
   3607     case '\n':
   3608       /* Increment the line, unless this is the last line ...  */
   3609       if (buffer->cur < buffer->rlimit
   3610 	  /* ... or this is a #include, (where _cpp_stack_file needs to
   3611 	     unwind by one line) ...  */
   3612 	  || (pfile->state.in_directive > 1
   3613 	      /* ... except traditional-cpp increments this elsewhere.  */
   3614 	      && !CPP_OPTION (pfile, traditional)))
   3615 	CPP_INCREMENT_LINE (pfile, 0);
   3616       buffer->need_line = true;
   3617       if (pfile->state.in_deferred_pragma)
   3618 	{
   3619 	  /* Produce the PRAGMA_EOL on this line.  File reading
   3620 	     ensures there is always a \n at end of the buffer, thus
   3621 	     in a deferred pragma we always see CPP_PRAGMA_EOL before
   3622 	     any CPP_EOF.  */
   3623 	  result->type = CPP_PRAGMA_EOL;
   3624 	  result->flags &= ~PREV_WHITE;
   3625 	  pfile->state.in_deferred_pragma = false;
   3626 	  if (!pfile->state.pragma_allow_expansion)
   3627 	    pfile->state.prevent_expansion--;
   3628 	  return result;
   3629 	}
   3630       goto fresh_line;
   3631 
   3632     case '0': case '1': case '2': case '3': case '4':
   3633     case '5': case '6': case '7': case '8': case '9':
   3634       {
   3635 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3636 	result->type = CPP_NUMBER;
   3637 	lex_number (pfile, &result->val.str, &nst);
   3638 	warn_about_normalization (pfile, result, &nst);
   3639 	break;
   3640       }
   3641 
   3642     case 'L':
   3643     case 'u':
   3644     case 'U':
   3645     case 'R':
   3646       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
   3647 	 wide strings or raw strings.  */
   3648       if (c == 'L' || CPP_OPTION (pfile, rliterals)
   3649 	  || (c != 'R' && CPP_OPTION (pfile, uliterals)))
   3650 	{
   3651 	  if ((*buffer->cur == '\'' && c != 'R')
   3652 	      || *buffer->cur == '"'
   3653 	      || (*buffer->cur == 'R'
   3654 		  && c != 'R'
   3655 		  && buffer->cur[1] == '"'
   3656 		  && CPP_OPTION (pfile, rliterals))
   3657 	      || (*buffer->cur == '8'
   3658 		  && c == 'u'
   3659 		  && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
   3660 				&& CPP_OPTION (pfile, utf8_char_literals)))
   3661 		      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
   3662 			  && CPP_OPTION (pfile, rliterals)))))
   3663 	    {
   3664 	      lex_string (pfile, result, buffer->cur - 1);
   3665 	      break;
   3666 	    }
   3667 	}
   3668       /* Fall through.  */
   3669 
   3670     case '_':
   3671     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
   3672     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
   3673     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
   3674     case 's': case 't':           case 'v': case 'w': case 'x':
   3675     case 'y': case 'z':
   3676     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
   3677     case 'G': case 'H': case 'I': case 'J': case 'K':
   3678     case 'M': case 'N': case 'O': case 'P': case 'Q':
   3679     case 'S': case 'T':           case 'V': case 'W': case 'X':
   3680     case 'Y': case 'Z':
   3681       result->type = CPP_NAME;
   3682       {
   3683 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3684 	result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
   3685 						&nst,
   3686 						&result->val.node.spelling);
   3687 	warn_about_normalization (pfile, result, &nst);
   3688       }
   3689 
   3690       /* Convert named operators to their proper types.  */
   3691       if (result->val.node.node->flags & NODE_OPERATOR)
   3692 	{
   3693 	  result->flags |= NAMED_OP;
   3694 	  result->type = (enum cpp_ttype) result->val.node.node->directive_index;
   3695 	}
   3696 
   3697       /* Signal FALLTHROUGH comment followed by another token.  */
   3698       if (fallthrough_comment)
   3699 	result->flags |= PREV_FALLTHROUGH;
   3700       break;
   3701 
   3702     case '\'':
   3703     case '"':
   3704       lex_string (pfile, result, buffer->cur - 1);
   3705       break;
   3706 
   3707     case '/':
   3708       /* A potential block or line comment.  */
   3709       comment_start = buffer->cur;
   3710       c = *buffer->cur;
   3711 
   3712       if (c == '*')
   3713 	{
   3714 	  if (_cpp_skip_block_comment (pfile))
   3715 	    cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
   3716 	}
   3717       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
   3718 	{
   3719 	  /* Don't warn for system headers.  */
   3720 	  if (_cpp_in_system_header (pfile))
   3721 	    ;
   3722 	  /* Warn about comments if pedantically GNUC89, and not
   3723 	     in system headers.  */
   3724 	  else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
   3725 		   && CPP_PEDANTIC (pfile)
   3726 		   && ! buffer->warned_cplusplus_comments)
   3727 	    {
   3728 	      if (cpp_error (pfile, CPP_DL_PEDWARN,
   3729 			     "C++ style comments are not allowed in ISO C90"))
   3730 		cpp_error (pfile, CPP_DL_NOTE,
   3731 			   "(this will be reported only once per input file)");
   3732 	      buffer->warned_cplusplus_comments = 1;
   3733 	    }
   3734 	  /* Or if specifically desired via -Wc90-c99-compat.  */
   3735 	  else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
   3736 		   && ! CPP_OPTION (pfile, cplusplus)
   3737 		   && ! buffer->warned_cplusplus_comments)
   3738 	    {
   3739 	      if (cpp_error (pfile, CPP_DL_WARNING,
   3740 			     "C++ style comments are incompatible with C90"))
   3741 		cpp_error (pfile, CPP_DL_NOTE,
   3742 			   "(this will be reported only once per input file)");
   3743 	      buffer->warned_cplusplus_comments = 1;
   3744 	    }
   3745 	  /* In C89/C94, C++ style comments are forbidden.  */
   3746 	  else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
   3747 		    || CPP_OPTION (pfile, lang) == CLK_STDC94))
   3748 	    {
   3749 	      /* But don't be confused about valid code such as
   3750 	         - // immediately followed by *,
   3751 		 - // in a preprocessing directive,
   3752 		 - // in an #if 0 block.  */
   3753 	      if (buffer->cur[1] == '*'
   3754 		  || pfile->state.in_directive
   3755 		  || pfile->state.skipping)
   3756 		{
   3757 		  result->type = CPP_DIV;
   3758 		  break;
   3759 		}
   3760 	      else if (! buffer->warned_cplusplus_comments)
   3761 		{
   3762 		  if (cpp_error (pfile, CPP_DL_ERROR,
   3763 				 "C++ style comments are not allowed in "
   3764 				 "ISO C90"))
   3765 		    cpp_error (pfile, CPP_DL_NOTE,
   3766 			       "(this will be reported only once per input "
   3767 			       "file)");
   3768 		  buffer->warned_cplusplus_comments = 1;
   3769 		}
   3770 	    }
   3771 	  if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
   3772 	    cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
   3773 	}
   3774       else if (c == '=')
   3775 	{
   3776 	  buffer->cur++;
   3777 	  result->type = CPP_DIV_EQ;
   3778 	  break;
   3779 	}
   3780       else
   3781 	{
   3782 	  result->type = CPP_DIV;
   3783 	  break;
   3784 	}
   3785 
              /* Only reached once a block or line comment has been skipped;
                 every operator branch above has already left the switch.  */
   3786       if (fallthrough_comment_p (pfile, comment_start))
   3787 	fallthrough_comment = true;
   3788 
   3789       if (pfile->cb.comment)
   3790 	{
   3791 	  size_t len = pfile->buffer->cur - comment_start;
   3792 	  pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
   3793 			     len + 1);
   3794 	}
   3795 
              /* A comment that is not being saved counts as whitespace in
                 front of whatever token comes next.  */
   3796       if (!pfile->state.save_comments)
   3797 	{
   3798 	  result->flags |= PREV_WHITE;
   3799 	  goto update_tokens_line;
   3800 	}
   3801 
   3802       if (fallthrough_comment)
   3803 	result->flags |= PREV_FALLTHROUGH;
   3804 
   3805       /* Save the comment as a token in its own right.  C still holds
                 the character that followed the opening '/', i.e. '*' or
                 '/'.  */
   3806       save_comment (pfile, result, comment_start, c);
   3807       break;
   3808 
   3809     case '<':
              /* NOTE(review): lex_string appears to leave the type as
                 CPP_LESS when no header-name could be formed, in which
                 case '<' is lexed as an operator below -- confirm against
                 lex_string.  */
   3810       if (pfile->state.angled_headers)
   3811 	{
   3812 	  lex_string (pfile, result, buffer->cur - 1);
   3813 	  if (result->type != CPP_LESS)
   3814 	    break;
   3815 	}
   3816 
   3817       result->type = CPP_LESS;
   3818       if (*buffer->cur == '=')
   3819 	{
   3820 	  buffer->cur++, result->type = CPP_LESS_EQ;
   3821 	  if (*buffer->cur == '>'
   3822 	      && CPP_OPTION (pfile, cplusplus)
   3823 	      && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
   3824 	    buffer->cur++, result->type = CPP_SPACESHIP;
   3825 	}
   3826       else if (*buffer->cur == '<')
   3827 	{
   3828 	  buffer->cur++;
   3829 	  IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
   3830 	}
   3831       else if (CPP_OPTION (pfile, digraphs))
   3832 	{
   3833 	  if (*buffer->cur == ':')
   3834 	    {
   3835 	      /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
   3836 		 three characters are <:: and the subsequent character
   3837 		 is neither : nor >, the < is treated as a preprocessor
   3838 		 token by itself".  */
   3839 	      if (CPP_OPTION (pfile, cplusplus)
   3840 		  && CPP_OPTION (pfile, lang) != CLK_CXX98
   3841 		  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
   3842 		  && buffer->cur[1] == ':'
   3843 		  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
   3844 		break;
   3845 
   3846 	      buffer->cur++;
   3847 	      result->flags |= DIGRAPH;
   3848 	      result->type = CPP_OPEN_SQUARE;
   3849 	    }
   3850 	  else if (*buffer->cur == '%')
   3851 	    {
   3852 	      buffer->cur++;
   3853 	      result->flags |= DIGRAPH;
   3854 	      result->type = CPP_OPEN_BRACE;
   3855 	    }
   3856 	}
   3857       break;
   3858 
   3859     case '>':
   3860       result->type = CPP_GREATER;
   3861       if (*buffer->cur == '=')
   3862 	buffer->cur++, result->type = CPP_GREATER_EQ;
   3863       else if (*buffer->cur == '>')
   3864 	{
   3865 	  buffer->cur++;
   3866 	  IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
   3867 	}
   3868       break;
   3869 
   3870     case '%':
   3871       result->type = CPP_MOD;
   3872       if (*buffer->cur == '=')
   3873 	buffer->cur++, result->type = CPP_MOD_EQ;
   3874       else if (CPP_OPTION (pfile, digraphs))
   3875 	{
   3876 	  if (*buffer->cur == ':')
   3877 	    {
   3878 	      buffer->cur++;
   3879 	      result->flags |= DIGRAPH;
   3880 	      result->type = CPP_HASH;
   3881 	      if (*buffer->cur == '%' && buffer->cur[1] == ':')
   3882 		buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
   3883 	    }
   3884 	  else if (*buffer->cur == '>')
   3885 	    {
   3886 	      buffer->cur++;
   3887 	      result->flags |= DIGRAPH;
   3888 	      result->type = CPP_CLOSE_BRACE;
   3889 	    }
   3890 	}
   3891       break;
   3892 
   3893     case '.':
   3894       result->type = CPP_DOT;
   3895       if (ISDIGIT (*buffer->cur))
   3896 	{
   3897 	  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3898 	  result->type = CPP_NUMBER;
   3899 	  lex_number (pfile, &result->val.str, &nst);
   3900 	  warn_about_normalization (pfile, result, &nst);
   3901 	}
   3902       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
   3903 	buffer->cur += 2, result->type = CPP_ELLIPSIS;
   3904       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
   3905 	buffer->cur++, result->type = CPP_DOT_STAR;
   3906       break;
   3907 
   3908     case '+':
   3909       result->type = CPP_PLUS;
   3910       if (*buffer->cur == '+')
   3911 	buffer->cur++, result->type = CPP_PLUS_PLUS;
   3912       else if (*buffer->cur == '=')
   3913 	buffer->cur++, result->type = CPP_PLUS_EQ;
   3914       break;
   3915 
   3916     case '-':
   3917       result->type = CPP_MINUS;
   3918       if (*buffer->cur == '>')
   3919 	{
   3920 	  buffer->cur++;
   3921 	  result->type = CPP_DEREF;
   3922 	  if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
   3923 	    buffer->cur++, result->type = CPP_DEREF_STAR;
   3924 	}
   3925       else if (*buffer->cur == '-')
   3926 	buffer->cur++, result->type = CPP_MINUS_MINUS;
   3927       else if (*buffer->cur == '=')
   3928 	buffer->cur++, result->type = CPP_MINUS_EQ;
   3929       break;
   3930 
   3931     case '&':
   3932       result->type = CPP_AND;
   3933       if (*buffer->cur == '&')
   3934 	buffer->cur++, result->type = CPP_AND_AND;
   3935       else if (*buffer->cur == '=')
   3936 	buffer->cur++, result->type = CPP_AND_EQ;
   3937       break;
   3938 
   3939     case '|':
   3940       result->type = CPP_OR;
   3941       if (*buffer->cur == '|')
   3942 	buffer->cur++, result->type = CPP_OR_OR;
   3943       else if (*buffer->cur == '=')
   3944 	buffer->cur++, result->type = CPP_OR_EQ;
   3945       break;
   3946 
   3947     case ':':
   3948       result->type = CPP_COLON;
   3949       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
   3950 	buffer->cur++, result->type = CPP_SCOPE;
   3951       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
   3952 	{
   3953 	  buffer->cur++;
   3954 	  result->flags |= DIGRAPH;
   3955 	  result->type = CPP_CLOSE_SQUARE;
   3956 	}
   3957       break;
   3958 
   3959     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
   3960     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
   3961     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
   3962     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
   3963     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
   3964 
   3965     case '?': result->type = CPP_QUERY; break;
   3966     case '~': result->type = CPP_COMPL; break;
   3967     case ',': result->type = CPP_COMMA; break;
   3968     case '(': result->type = CPP_OPEN_PAREN; break;
   3969     case ')': result->type = CPP_CLOSE_PAREN; break;
   3970     case '[': result->type = CPP_OPEN_SQUARE; break;
   3971     case ']': result->type = CPP_CLOSE_SQUARE; break;
   3972     case '{': result->type = CPP_OPEN_BRACE; break;
   3973     case '}': result->type = CPP_CLOSE_BRACE; break;
   3974     case ';': result->type = CPP_SEMICOLON; break;
   3975 
   3976       /* @ is a punctuator in Objective-C.  */
   3977     case '@': result->type = CPP_ATSIGN; break;
   3978 
   3979     default:
   3980       {
        	/* Step back so that BASE points at the character just read.  */
   3981 	const uchar *base = --buffer->cur;
   3982 
   3983 	/* Check for an extended identifier ($ or UCN or UTF-8).  */
   3984 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3985 	if (forms_identifier_p (pfile, true, &nst))
   3986 	  {
   3987 	    result->type = CPP_NAME;
   3988 	    result->val.node.node = lex_identifier (pfile, base, true, &nst,
   3989 						    &result->val.node.spelling);
   3990 	    warn_about_normalization (pfile, result, &nst);
   3991 	    break;
   3992 	  }
   3993 
   3994 	/* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
   3995 	   single token.  */
   3996 	buffer->cur++;
   3997 	if (c >= utf8_signifier)
   3998 	  {
   3999 	    const uchar *pstr = base;
   4000 	    cppchar_t s;
   4001 	    if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
   4002 	      buffer->cur = pstr;
   4003 	  }
   4004 	create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
   4005 	break;
   4006       }
   4007 
   4008     }
   4009 
   4010   /* Potentially convert the location of the token to a range.  */
   4011   if (result->src_loc >= RESERVED_LOCATION_COUNT
   4012       && result->type != CPP_EOF)
   4013     {
   4014       /* Ensure that any line notes are processed, so that we have the
   4015 	 correct physical line/column for the end-point of the token even
   4016 	 when a logical line is split via one or more backslashes.  */
   4017       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
   4018 	  && !pfile->overlaid_buffer)
   4019 	_cpp_process_line_notes (pfile, false);
   4020 
              /* The range starts at the location recorded when the token's
                 first character was read and ends at the current buffer
                 position.  */
   4021       source_range tok_range;
   4022       tok_range.m_start = result->src_loc;
   4023       tok_range.m_finish
   4024 	= linemap_position_for_column (pfile->line_table,
   4025 				       CPP_BUF_COLUMN (buffer, buffer->cur));
   4026 
   4027       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
   4028 					       result->src_loc,
   4029 					       tok_range, NULL);
   4030     }
   4031 
   4032   return result;
   4033 }
   4034 
   4035 /* An upper bound on the number of bytes needed to spell TOKEN.
   4036    Does not include preceding whitespace.  */
   4037 unsigned int
   4038 cpp_token_len (const cpp_token *token)
   4039 {
   4040   unsigned int len;
   4041 
   4042   switch (TOKEN_SPELL (token))
   4043     {
   4044     default:		len = 6;				break;
   4045     case SPELL_LITERAL:	len = token->val.str.len;		break;
   4046     case SPELL_IDENT:	len = NODE_LEN (token->val.node.node) * 10;	break;
   4047     }
   4048 
   4049   return len;
   4050 }
   4051 
   4052 /* Parse UTF-8 out of NAMEP and place a \U escape in BUFFER.
   4053    Return the number of bytes read out of NAME.  (There are always
   4054    10 bytes written to BUFFER.)  */
   4055 
   4056 static size_t
   4057 utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
   4058 {
   4059   int j;
   4060   int ucn_len = 0;
   4061   int ucn_len_c;
   4062   unsigned t;
   4063   unsigned long utf32;
   4064 
   4065   /* Compute the length of the UTF-8 sequence.  */
   4066   for (t = *name; t & 0x80; t <<= 1)
   4067     ucn_len++;
   4068 
   4069   utf32 = *name & (0x7F >> ucn_len);
   4070   for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
   4071     {
   4072       utf32 = (utf32 << 6) | (*++name & 0x3F);
   4073 
   4074       /* Ill-formed UTF-8.  */
   4075       if ((*name & ~0x3F) != 0x80)
   4076 	abort ();
   4077     }
   4078 
   4079   *buffer++ = '\\';
   4080   *buffer++ = 'U';
   4081   for (j = 7; j >= 0; j--)
   4082     *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
   4083   return ucn_len;
   4084 }
   4085 
   4086 /* Given a token TYPE corresponding to a digraph, return a pointer to
   4087    the spelling of the digraph.  */
   4088 static const unsigned char *
   4089 cpp_digraph2name (enum cpp_ttype type)
   4090 {
   4091   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
   4092 }
   4093 
   4094 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
   4095    The buffer must already contain the enough space to hold the
   4096    token's spelling.  Returns a pointer to the character after the
   4097    last character written.  */
   4098 unsigned char *
   4099 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
   4100 {
   4101   size_t i;
   4102   const unsigned char *name = NODE_NAME (ident);
   4103 
   4104   for (i = 0; i < NODE_LEN (ident); i++)
   4105     if (name[i] & ~0x7F)
   4106       {
   4107 	i += utf8_to_ucn (buffer, name + i) - 1;
   4108 	buffer += 10;
   4109       }
   4110     else
   4111       *buffer++ = name[i];
   4112 
   4113   return buffer;
   4114 }
   4115 
   4116 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
   4117    already contain the enough space to hold the token's spelling.
   4118    Returns a pointer to the character after the last character written.
   4119    FORSTRING is true if this is to be the spelling after translation
   4120    phase 1 (with the original spelling of extended identifiers), false
   4121    if extended identifiers should always be written using UCNs (there is
   4122    no option for always writing them in the internal UTF-8 form).
   4123    FIXME: Would be nice if we didn't need the PFILE argument.  */
   4124 unsigned char *
   4125 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
   4126 		 unsigned char *buffer, bool forstring)
   4127 {
   4128   switch (TOKEN_SPELL (token))
   4129     {
   4130     case SPELL_OPERATOR:
   4131       {
   4132 	const unsigned char *spelling;
   4133 	unsigned char c;
   4134 
   4135 	if (token->flags & DIGRAPH)
   4136 	  spelling = cpp_digraph2name (token->type);
   4137 	else if (token->flags & NAMED_OP)
   4138 	  goto spell_ident;
   4139 	else
   4140 	  spelling = TOKEN_NAME (token);
   4141 
   4142 	while ((c = *spelling++) != '\0')
   4143 	  *buffer++ = c;
   4144       }
   4145       break;
   4146 
   4147     spell_ident:
   4148     case SPELL_IDENT:
   4149       if (forstring)
   4150 	{
   4151 	  memcpy (buffer, NODE_NAME (token->val.node.spelling),
   4152 		  NODE_LEN (token->val.node.spelling));
   4153 	  buffer += NODE_LEN (token->val.node.spelling);
   4154 	}
   4155       else
   4156 	buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
   4157       break;
   4158 
   4159     case SPELL_LITERAL:
   4160       memcpy (buffer, token->val.str.text, token->val.str.len);
   4161       buffer += token->val.str.len;
   4162       break;
   4163 
   4164     case SPELL_NONE:
   4165       cpp_error (pfile, CPP_DL_ICE,
   4166 		 "unspellable token %s", TOKEN_NAME (token));
   4167       break;
   4168     }
   4169 
   4170   return buffer;
   4171 }
   4172 
   4173 /* Returns TOKEN spelt as a null-terminated string.  The string is
   4174    freed when the reader is destroyed.  Useful for diagnostics.  */
   4175 unsigned char *
   4176 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
   4177 {
   4178   unsigned int len = cpp_token_len (token) + 1;
   4179   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
   4180 
   4181   end = cpp_spell_token (pfile, token, start, false);
   4182   end[0] = '\0';
   4183 
   4184   return start;
   4185 }
   4186 
   4187 /* Returns a pointer to a string which spells the token defined by
   4188    TYPE and FLAGS.  Used by C front ends, which really should move to
   4189    using cpp_token_as_text.  */
   4190 const char *
   4191 cpp_type2name (enum cpp_ttype type, unsigned char flags)
   4192 {
   4193   if (flags & DIGRAPH)
   4194     return (const char *) cpp_digraph2name (type);
   4195   else if (flags & NAMED_OP)
   4196     return cpp_named_operator2name (type);
   4197 
   4198   return (const char *) token_spellings[type].name;
   4199 }
   4200 
   4201 /* Writes the spelling of token to FP, without any preceding space.
   4202    Separated from cpp_spell_token for efficiency - to avoid stdio
   4203    double-buffering.  */
   4204 void
   4205 cpp_output_token (const cpp_token *token, FILE *fp)
   4206 {
   4207   switch (TOKEN_SPELL (token))
   4208     {
   4209     case SPELL_OPERATOR:
   4210       {
   4211 	const unsigned char *spelling;
   4212 	int c;
   4213 
   4214 	if (token->flags & DIGRAPH)
   4215 	  spelling = cpp_digraph2name (token->type);
   4216 	else if (token->flags & NAMED_OP)
   4217 	  goto spell_ident;
   4218 	else
   4219 	  spelling = TOKEN_NAME (token);
   4220 
   4221 	c = *spelling;
   4222 	do
   4223 	  putc (c, fp);
   4224 	while ((c = *++spelling) != '\0');
   4225       }
   4226       break;
   4227 
   4228     spell_ident:
   4229     case SPELL_IDENT:
   4230       {
   4231 	size_t i;
   4232 	const unsigned char * name = NODE_NAME (token->val.node.node);
   4233 
   4234 	for (i = 0; i < NODE_LEN (token->val.node.node); i++)
   4235 	  if (name[i] & ~0x7F)
   4236 	    {
   4237 	      unsigned char buffer[10];
   4238 	      i += utf8_to_ucn (buffer, name + i) - 1;
   4239 	      fwrite (buffer, 1, 10, fp);
   4240 	    }
   4241 	  else
   4242 	    fputc (NODE_NAME (token->val.node.node)[i], fp);
   4243       }
   4244       break;
   4245 
   4246     case SPELL_LITERAL:
   4247       if (token->type == CPP_HEADER_NAME)
   4248 	fputc ('"', fp);
   4249       fwrite (token->val.str.text, 1, token->val.str.len, fp);
   4250       if (token->type == CPP_HEADER_NAME)
   4251 	fputc ('"', fp);
   4252       break;
   4253 
   4254     case SPELL_NONE:
   4255       /* An error, most probably.  */
   4256       break;
   4257     }
   4258 }
   4259 
   4260 /* Compare two tokens.  */
   4261 int
   4262 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
   4263 {
   4264   if (a->type == b->type && a->flags == b->flags)
   4265     switch (TOKEN_SPELL (a))
   4266       {
   4267       default:			/* Keep compiler happy.  */
   4268       case SPELL_OPERATOR:
   4269 	/* token_no is used to track where multiple consecutive ##
   4270 	   tokens were originally located.  */
   4271 	return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
   4272       case SPELL_NONE:
   4273 	return (a->type != CPP_MACRO_ARG
   4274 		|| (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
   4275 		    && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
   4276       case SPELL_IDENT:
   4277 	return (a->val.node.node == b->val.node.node
   4278 		&& a->val.node.spelling == b->val.node.spelling);
   4279       case SPELL_LITERAL:
   4280 	return (a->val.str.len == b->val.str.len
   4281 		&& !memcmp (a->val.str.text, b->val.str.text,
   4282 			    a->val.str.len));
   4283       }
   4284 
   4285   return 0;
   4286 }
   4287 
   4288 /* Returns nonzero if a space should be inserted to avoid an
   4289    accidental token paste for output.  For simplicity, it is
   4290    conservative, and occasionally advises a space where one is not
   4291    needed, e.g. "." and ".2".
        
           TOKEN1 is the token already output and TOKEN2 the one about to
           follow it.  PFILE is consulted for language options and is
           passed to name_p.  */
   4292 int
   4293 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
   4294 		 const cpp_token *token2)
   4295 {
   4296   enum cpp_ttype a = token1->type, b = token2->type;
   4297   cppchar_t c;
   4298 
          /* Named operators are treated as names in the tests below.  */
   4299   if (token1->flags & NAMED_OP)
   4300     a = CPP_NAME;
   4301   if (token2->flags & NAMED_OP)
   4302     b = CPP_NAME;
   4303 
          /* C becomes the first character of TOKEN2's spelling when TOKEN2
             is a digraph or an operator.  Otherwise it keeps the EOF
             sentinel, which equals none of the characters tested below.  */
   4304   c = EOF;
   4305   if (token2->flags & DIGRAPH)
   4306     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
   4307   else if (token_spellings[b].category == SPELL_OPERATOR)
   4308     c = token_spellings[b].name[0];
   4309 
   4310   /* Quickly get everything that can paste with an '='.  This relies
             on the ordering of the token types up to CPP_LAST_EQ, which is
             defined with the token table rather than here.  */
   4311   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
   4312     return 1;
   4313 
          /* For each kind of first token, list what a following token may
             start with (C) or be (B) for the pair to lex differently when
             written without a space.  */
   4314   switch (a)
   4315     {
   4316     case CPP_GREATER:	return c == '>';
   4317     case CPP_LESS:	return c == '<' || c == '%' || c == ':';
   4318     case CPP_PLUS:	return c == '+';
   4319     case CPP_MINUS:	return c == '-' || c == '>';
   4320     case CPP_DIV:	return c == '/' || c == '*'; /* Comments.  */
   4321     case CPP_MOD:	return c == ':' || c == '>';
   4322     case CPP_AND:	return c == '&';
   4323     case CPP_OR:	return c == '|';
   4324     case CPP_COLON:	return c == ':' || c == '>';
   4325     case CPP_DEREF:	return c == '*';
   4326     case CPP_DOT:	return c == '.' || c == '%' || b == CPP_NUMBER;
   4327     case CPP_HASH:	return c == '#' || c == '%'; /* Digraph form.  */
   4328     case CPP_PRAGMA:
   4329     case CPP_NAME:	return ((b == CPP_NUMBER
   4330 				 && name_p (pfile, &token2->val.str))
   4331 				|| b == CPP_NAME
   4332 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
   4333     case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
   4334 				|| b == CPP_CHAR
   4335 				|| c == '.' || c == '+' || c == '-');
   4336 				      /* UCNs */
   4337     case CPP_OTHER:	return ((token1->val.str.text[0] == '\\'
   4338 				 && b == CPP_NAME)
   4339 				|| (CPP_OPTION (pfile, objc)
   4340 				    && token1->val.str.text[0] == '@'
   4341 				    && (b == CPP_NAME || b == CPP_STRING)));
   4342     case CPP_LESS_EQ:	return c == '>';
   4343     case CPP_STRING:
   4344     case CPP_WSTRING:
   4345     case CPP_UTF8STRING:
   4346     case CPP_STRING16:
   4347     case CPP_STRING32:	return (CPP_OPTION (pfile, user_literals)
   4348 				&& (b == CPP_NAME
   4349 				    || (TOKEN_SPELL (token2) == SPELL_LITERAL
   4350 					&& ISIDST (token2->val.str.text[0]))));
   4351 
   4352     default:		break;
   4353     }
   4354 
   4355   return 0;
   4356 }
   4357 
   4358 /* Output all the remaining tokens on the current line, and a newline
   4359    character, to FP.  Leading whitespace is removed.  If there are
   4360    macros, special token padding is not performed.  */
   4361 void
   4362 cpp_output_line (cpp_reader *pfile, FILE *fp)
   4363 {
   4364   const cpp_token *token;
   4365 
   4366   token = cpp_get_token (pfile);
   4367   while (token->type != CPP_EOF)
   4368     {
   4369       cpp_output_token (token, fp);
   4370       token = cpp_get_token (pfile);
   4371       if (token->flags & PREV_WHITE)
   4372 	putc (' ', fp);
   4373     }
   4374 
   4375   putc ('\n', fp);
   4376 }
   4377 
   4378 /* Return a string representation of all the remaining tokens on the
   4379    current line.  The result is allocated using xmalloc and must be
   4380    freed by the caller.  */
   4381 unsigned char *
   4382 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
   4383 {
   4384   const cpp_token *token;
   4385   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
   4386   unsigned int alloced = 120 + out;
   4387   unsigned char *result = (unsigned char *) xmalloc (alloced);
   4388 
   4389   /* If DIR_NAME is empty, there are no initial contents.  */
   4390   if (dir_name)
   4391     {
   4392       sprintf ((char *) result, "#%s ", dir_name);
   4393       out += 2;
   4394     }
   4395 
   4396   token = cpp_get_token (pfile);
   4397   while (token->type != CPP_EOF)
   4398     {
   4399       unsigned char *last;
   4400       /* Include room for a possible space and the terminating nul.  */
   4401       unsigned int len = cpp_token_len (token) + 2;
   4402 
   4403       if (out + len > alloced)
   4404 	{
   4405 	  alloced *= 2;
   4406 	  if (out + len > alloced)
   4407 	    alloced = out + len;
   4408 	  result = (unsigned char *) xrealloc (result, alloced);
   4409 	}
   4410 
   4411       last = cpp_spell_token (pfile, token, &result[out], 0);
   4412       out = last - result;
   4413 
   4414       token = cpp_get_token (pfile);
   4415       if (token->flags & PREV_WHITE)
   4416 	result[out++] = ' ';
   4417     }
   4418 
   4419   result[out] = '\0';
   4420   return result;
   4421 }
   4422 
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free-list buffer
   that _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance between BUFF->cur
   and BUFF->limit.  Every macro argument is parenthesized so that
   arbitrary expressions can be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
   4437 
   4438 /* Create a new allocation buffer.  Place the control block at the end
   4439    of the buffer, so that buffer overflows will cause immediate chaos.  */
   4440 static _cpp_buff *
   4441 new_buff (size_t len)
   4442 {
   4443   _cpp_buff *result;
   4444   unsigned char *base;
   4445 
   4446   if (len < MIN_BUFF_SIZE)
   4447     len = MIN_BUFF_SIZE;
   4448   len = CPP_ALIGN (len);
   4449 
   4450 #ifdef ENABLE_VALGRIND_ANNOTATIONS
   4451   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
   4452      struct first.  */
   4453   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
   4454   base = XNEWVEC (unsigned char, len + slen);
   4455   result = (_cpp_buff *) base;
   4456   base += slen;
   4457 #else
   4458   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
   4459   result = (_cpp_buff *) (base + len);
   4460 #endif
   4461   result->base = base;
   4462   result->cur = base;
   4463   result->limit = base + len;
   4464   result->next = NULL;
   4465   return result;
   4466 }
   4467 
   4468 /* Place a chain of unwanted allocation buffers on the free list.  */
   4469 void
   4470 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
   4471 {
   4472   _cpp_buff *end = buff;
   4473 
   4474   while (end->next)
   4475     end = end->next;
   4476   end->next = pfile->free_buffs;
   4477   pfile->free_buffs = buff;
   4478 }
   4479 
   4480 /* Return a free buffer of size at least MIN_SIZE.  */
   4481 _cpp_buff *
   4482 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
   4483 {
   4484   _cpp_buff *result, **p;
   4485 
   4486   for (p = &pfile->free_buffs;; p = &(*p)->next)
   4487     {
   4488       size_t size;
   4489 
   4490       if (*p == NULL)
   4491 	return new_buff (min_size);
   4492       result = *p;
   4493       size = result->limit - result->base;
   4494       /* Return a buffer that's big enough, but don't waste one that's
   4495          way too big.  */
   4496       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
   4497 	break;
   4498     }
   4499 
   4500   *p = result->next;
   4501   result->next = NULL;
   4502   result->cur = result->base;
   4503   return result;
   4504 }
   4505 
   4506 /* Creates a new buffer with enough space to hold the uncommitted
   4507    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
   4508    the excess bytes to the new buffer.  Chains the new buffer after
   4509    BUFF, and returns the new buffer.  */
   4510 _cpp_buff *
   4511 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
   4512 {
   4513   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
   4514   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
   4515 
   4516   buff->next = new_buff;
   4517   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
   4518   return new_buff;
   4519 }
   4520 
   4521 /* Creates a new buffer with enough space to hold the uncommitted
   4522    remaining bytes of the buffer pointed to by BUFF, and at least
   4523    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
   4524    Chains the new buffer before the buffer pointed to by BUFF, and
   4525    updates the pointer to point to the new buffer.  */
   4526 void
   4527 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
   4528 {
   4529   _cpp_buff *new_buff, *old_buff = *pbuff;
   4530   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
   4531 
   4532   new_buff = _cpp_get_buff (pfile, size);
   4533   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
   4534   new_buff->next = old_buff;
   4535   *pbuff = new_buff;
   4536 }
   4537 
   4538 /* Free a chain of buffers starting at BUFF.  */
   4539 void
   4540 _cpp_free_buff (_cpp_buff *buff)
   4541 {
   4542   _cpp_buff *next;
   4543 
   4544   for (; buff; buff = next)
   4545     {
   4546       next = buff->next;
   4547 #ifdef ENABLE_VALGRIND_ANNOTATIONS
   4548       free (buff);
   4549 #else
   4550       free (buff->base);
   4551 #endif
   4552     }
   4553 }
   4554 
   4555 /* Allocate permanent, unaligned storage of length LEN.  */
   4556 unsigned char *
   4557 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
   4558 {
   4559   _cpp_buff *buff = pfile->u_buff;
   4560   unsigned char *result = buff->cur;
   4561 
   4562   if (len > (size_t) (buff->limit - result))
   4563     {
   4564       buff = _cpp_get_buff (pfile, len);
   4565       buff->next = pfile->u_buff;
   4566       pfile->u_buff = buff;
   4567       result = buff->cur;
   4568     }
   4569 
   4570   buff->cur = result + len;
   4571   return result;
   4572 }
   4573 
   4574 /* Allocate permanent, unaligned storage of length LEN from a_buff.
   4575    That buffer is used for growing allocations when saving macro
   4576    replacement lists in a #define, and when parsing an answer to an
   4577    assertion in #assert, #unassert or #if (and therefore possibly
   4578    whilst expanding macros).  It therefore must not be used by any
   4579    code that they might call: specifically the lexer and the guts of
   4580    the macro expander.
   4581 
   4582    All existing other uses clearly fit this restriction: storing
   4583    registered pragmas during initialization.  */
   4584 unsigned char *
   4585 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
   4586 {
   4587   _cpp_buff *buff = pfile->a_buff;
   4588   unsigned char *result = buff->cur;
   4589 
   4590   if (len > (size_t) (buff->limit - result))
   4591     {
   4592       buff = _cpp_get_buff (pfile, len);
   4593       buff->next = pfile->a_buff;
   4594       pfile->a_buff = buff;
   4595       result = buff->cur;
   4596     }
   4597 
   4598   buff->cur = result + len;
   4599   return result;
   4600 }
   4601 
   4602 /* Commit or allocate storage from a buffer.  */
   4603 
   4604 void *
   4605 _cpp_commit_buff (cpp_reader *pfile, size_t size)
   4606 {
   4607   void *ptr = BUFF_FRONT (pfile->a_buff);
   4608 
   4609   if (pfile->hash_table->alloc_subobject)
   4610     {
   4611       void *copy = pfile->hash_table->alloc_subobject (size);
   4612       memcpy (copy, ptr, size);
   4613       ptr = copy;
   4614     }
   4615   else
   4616     BUFF_FRONT (pfile->a_buff) += size;
   4617 
   4618   return ptr;
   4619 }
   4620 
   4621 /* Say which field of TOK is in use.  */
   4622 
   4623 enum cpp_token_fld_kind
   4624 cpp_token_val_index (const cpp_token *tok)
   4625 {
   4626   switch (TOKEN_SPELL (tok))
   4627     {
   4628     case SPELL_IDENT:
   4629       return CPP_TOKEN_FLD_NODE;
   4630     case SPELL_LITERAL:
   4631       return CPP_TOKEN_FLD_STR;
   4632     case SPELL_OPERATOR:
   4633       /* Operands which were originally spelled as ident keep around
   4634          the node for the exact spelling.  */
   4635       if (tok->flags & NAMED_OP)
   4636 	return CPP_TOKEN_FLD_NODE;
   4637       else if (tok->type == CPP_PASTE)
   4638 	return CPP_TOKEN_FLD_TOKEN_NO;
   4639       else
   4640 	return CPP_TOKEN_FLD_NONE;
   4641     case SPELL_NONE:
   4642       if (tok->type == CPP_MACRO_ARG)
   4643 	return CPP_TOKEN_FLD_ARG_NO;
   4644       else if (tok->type == CPP_PADDING)
   4645 	return CPP_TOKEN_FLD_SOURCE;
   4646       else if (tok->type == CPP_PRAGMA)
   4647 	return CPP_TOKEN_FLD_PRAGMA;
   4648       /* fall through */
   4649     default:
   4650       return CPP_TOKEN_FLD_NONE;
   4651     }
   4652 }
   4653 
   4654 /* All tokens lexed in R after calling this function will be forced to
   4655    have their location_t to be P, until
   4656    cpp_stop_forcing_token_locations is called for R.  */
   4657 
   4658 void
   4659 cpp_force_token_locations (cpp_reader *r, location_t loc)
   4660 {
   4661   r->forced_token_location = loc;
   4662 }
   4663 
   4664 /* Go back to assigning locations naturally for lexed tokens.  */
   4665 
   4666 void
   4667 cpp_stop_forcing_token_locations (cpp_reader *r)
   4668 {
   4669   r->forced_token_location = 0;
   4670 }
   4671 
/* PEEK points at a backslash.  If that backslash escapes a line ending
   ("\n", "\r" or "\r\n"), return the position just past the line
   ending, repeating for as long as that position holds another
   backslash.  If stepping past the line ending would reach or pass
   LIMIT, or the backslash does not escape a line ending, the current
   position is returned unchanged.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      int skip;

      if (__builtin_expect (peek[1] == '\n', true))
        skip = 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A CR optionally followed by LF counts as one line ending.  */
        skip = peek[2] == '\n' ? 3 : 2;
      else
        return peek;

      const unsigned char *after = peek + skip;
      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and stack escaped newlines.  */
      if (*peek != '\\')
        return peek;
    }
}
   4701 
   4702 static const unsigned char *
   4703 do_peek_next (const unsigned char *peek, const unsigned char *limit)
   4704 {
   4705   if (__builtin_expect (*peek == '\\', false))
   4706     peek = do_peek_backslash (peek, limit);
   4707   return peek;
   4708 }
   4709 
/* Step one character backwards from PEEK, never going before BOUND.
   Returns NULL if PEEK is already at BOUND.  Otherwise returns the
   position of the preceding character, except that an escaped line
   ending (a backslash followed by "\n", "\r" or "\r\n") is skipped:
   in that case the search continues backwards from the backslash, and
   the result is whatever that continued search yields (NULL if the
   backslash sits at BOUND).  A line ending that is not preceded by a
   backslash, or that lies at BOUND, is returned as-is.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Line endings are '\n' and '\r'; a plain letter 'r' is an ordinary
     character and must not be mistaken for a carriage return.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset of the character that could be the escaping
         backslash.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n": look one further back, unless the CR is at
             BOUND.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
   4738 
   4739 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
   4740    space.  Otherwise return NULL.  */
   4741 
   4742 static const unsigned char *
   4743 do_peek_ident (const char *match, const unsigned char *peek,
   4744 	       const unsigned char *limit)
   4745 {
   4746   for (; *++match; peek++)
   4747     if (*peek != *match)
   4748       {
   4749 	peek = do_peek_next (peek, limit);
   4750 	if (*peek != *match)
   4751 	  return NULL;
   4752       }
   4753 
   4754   /* Must now not be looking at an identifier char.  */
   4755   peek = do_peek_next (peek, limit);
   4756   if (ISIDNUM (*peek))
   4757     return NULL;
   4758 
   4759   /* Skip control-line whitespace.  */
   4760  ws:
   4761   while (*peek == ' ' || *peek == '\t')
   4762     peek++;
   4763   if (__builtin_expect (*peek == '\\', false))
   4764     {
   4765       peek = do_peek_backslash (peek, limit);
   4766       if (*peek != '\\')
   4767 	goto ws;
   4768     }
   4769 
   4770   return peek;
   4771 }
   4772 
   4773 /* Are we looking at a module control line starting as PEEK - 1?  */
   4774 
   4775 static bool
   4776 do_peek_module (cpp_reader *pfile, unsigned char c,
   4777 		const unsigned char *peek, const unsigned char *limit)
   4778 {
   4779   bool import = false;
   4780 
   4781   if (__builtin_expect (c == 'e', false))
   4782     {
   4783       if (!((peek[0] == 'x' || peek[0] == '\\')
   4784 	    && (peek = do_peek_ident ("export", peek, limit))))
   4785 	return false;
   4786 
   4787       /* export, peek for import or module.  No need to peek __import
   4788 	 here.  */
   4789       if (peek[0] == 'i')
   4790 	{
   4791 	  if (!((peek[1] == 'm' || peek[1] == '\\')
   4792 		&& (peek = do_peek_ident ("import", peek + 1, limit))))
   4793 	    return false;
   4794 	  import = true;
   4795 	}
   4796       else if (peek[0] == 'm')
   4797 	{
   4798 	  if (!((peek[1] == 'o' || peek[1] == '\\')
   4799 		&& (peek = do_peek_ident ("module", peek + 1, limit))))
   4800 	    return false;
   4801 	}
   4802       else
   4803 	return false;
   4804     }
   4805   else if (__builtin_expect (c == 'i', false))
   4806     {
   4807       if (!((peek[0] == 'm' || peek[0] == '\\')
   4808 	    && (peek = do_peek_ident ("import", peek, limit))))
   4809 	return false;
   4810       import = true;
   4811     }
   4812   else if (__builtin_expect (c == '_', false))
   4813     {
   4814       /* Needed for translated includes.   */
   4815       if (!((peek[0] == '_' || peek[0] == '\\')
   4816 	    && (peek = do_peek_ident ("__import", peek, limit))))
   4817 	return false;
   4818       import = true;
   4819     }
   4820   else if (__builtin_expect (c == 'm', false))
   4821     {
   4822       if (!((peek[0] == 'o' || peek[0] == '\\')
   4823 	    && (peek = do_peek_ident ("module", peek, limit))))
   4824 	return false;
   4825     }
   4826   else
   4827     return false;
   4828 
   4829   /* Peek the next character to see if it's good enough.  We'll be at
   4830      the first non-whitespace char, including skipping an escaped
   4831      newline.  */
   4832   /* ... import followed by identifier, ':', '<' or header-name
   4833      preprocessing tokens, or module followed by identifier, ':' or
   4834      ';' preprocessing tokens.  */
   4835   unsigned char p = *peek++;
   4836 
   4837   /* A character literal is ... single quotes, ... optionally preceded
   4838      by u8, u, U, or L */
   4839   /* A string-literal is a ... double quotes, optionally prefixed by
   4840      R, u8, u8R, u, uR, U, UR, L, or LR */
   4841   if (p == 'u')
   4842     {
   4843       peek = do_peek_next (peek, limit);
   4844       if (*peek == '8')
   4845 	{
   4846 	  peek++;
   4847 	  goto peek_u8;
   4848 	}
   4849       goto peek_u;
   4850     }
   4851   else if (p == 'U' || p == 'L')
   4852     {
   4853     peek_u8:
   4854       peek = do_peek_next (peek, limit);
   4855     peek_u:
   4856       if (*peek == '\"' || *peek == '\'')
   4857 	return false;
   4858 
   4859       if (*peek == 'R')
   4860 	goto peek_R;
   4861       /* Identifier. Ok.  */
   4862     }
   4863   else if (p == 'R')
   4864     {
   4865     peek_R:
   4866       if (CPP_OPTION (pfile, rliterals))
   4867 	{
   4868 	  peek = do_peek_next (peek, limit);
   4869 	  if (*peek == '\"')
   4870 	    return false;
   4871 	}
   4872       /* Identifier. Ok.  */
   4873     }
   4874   else if ('Z' - 'A' == 25
   4875 	   ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
   4876 	   : ISIDST (p))
   4877     {
   4878       /* Identifier.  Ok. */
   4879     }
   4880   else if (p == '<')
   4881     {
   4882       /* Maybe angle header, ok for import.  Reject
   4883 	 '<=', '<<' digraph:'<:'.  */
   4884       if (!import)
   4885 	return false;
   4886       peek = do_peek_next (peek, limit);
   4887       if (*peek == '=' || *peek == '<'
   4888 	  || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
   4889 	return false;
   4890     }
   4891   else if (p == ';')
   4892     {
   4893       /* SEMICOLON, ok for module.  */
   4894       if (import)
   4895 	return false;
   4896     }
   4897   else if (p == '"')
   4898     {
   4899       /* STRING, ok for import.  */
   4900       if (!import)
   4901 	return false;
   4902     }
   4903   else if (p == ':')
   4904     {
   4905       /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
   4906       peek = do_peek_next (peek, limit);
   4907       if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
   4908 	return false;
   4909     }
   4910   else
   4911     /* FIXME: Detect a unicode character, excluding those not
   4912        permitted as the initial character. [lex.name]/1.  I presume
   4913        we need to check the \[uU] spellings, and directly using
   4914        Unicode in say UTF8 form?  Or perhaps we do the phase-1
   4915        conversion of UTF8 to universal-character-names?  */
   4916     return false;
   4917 
   4918   return true;
   4919 }
   4920 
   4921 /* Directives-only scanning.  Somewhat more relaxed than correct
   4922    parsing -- some ill-formed programs will not be rejected.  */
   4923 
   4924 void
   4925 cpp_directive_only_process (cpp_reader *pfile,
   4926 			    void *data,
   4927 			    void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
   4928 {
   4929   bool module_p = CPP_OPTION (pfile, module_directives);
   4930 
   4931   do
   4932     {
   4933     restart:
   4934       /* Buffer initialization, but no line cleaning. */
   4935       cpp_buffer *buffer = pfile->buffer;
   4936       buffer->cur_note = buffer->notes_used = 0;
   4937       buffer->cur = buffer->line_base = buffer->next_line;
   4938       buffer->need_line = false;
   4939       /* Files always end in a newline or carriage return.  We rely on this for
   4940 	 character peeking safety.  */
   4941       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
   4942 
   4943       const unsigned char *base = buffer->cur;
   4944       unsigned line_count = 0;
   4945       const unsigned char *line_start = base;
   4946 
   4947       bool bol = true;
   4948       bool raw = false;
   4949 
   4950       const unsigned char *lwm = base;
   4951       for (const unsigned char *pos = base, *limit = buffer->rlimit;
   4952 	   pos < limit;)
   4953 	{
   4954 	  unsigned char c = *pos++;
   4955 	  /* This matches the switch in _cpp_lex_direct.  */
   4956 	  switch (c)
   4957 	    {
   4958 	    case ' ': case '\t': case '\f': case '\v':
   4959 	      /* Whitespace, do nothing.  */
   4960 	      break;
   4961 
   4962 	    case '\r': /* MAC line ending, or Windows \r\n  */
   4963 	      if (*pos == '\n')
   4964 		pos++;
   4965 	      /* FALLTHROUGH */
   4966 
   4967 	    case '\n':
   4968 	      bol = true;
   4969 
   4970 	    next_line:
   4971 	      CPP_INCREMENT_LINE (pfile, 0);
   4972 	      line_count++;
   4973 	      line_start = pos;
   4974 	      break;
   4975 
   4976 	    case '\\':
   4977 	      /* <backslash><newline> is removed, and doesn't undo any
   4978 		 preceeding escape or whatnot.  */
   4979 	      if (*pos == '\n')
   4980 		{
   4981 		  pos++;
   4982 		  goto next_line;
   4983 		}
   4984 	      else if (*pos == '\r')
   4985 		{
   4986 		  if (pos[1] == '\n')
   4987 		    pos++;
   4988 		  pos++;
   4989 		  goto next_line;
   4990 		}
   4991 	      goto dflt;
   4992 
   4993 	    case '#':
   4994 	      if (bol)
   4995 		{
   4996 		  /* Line directive.  */
   4997 		  if (pos - 1 > base && !pfile->state.skipping)
   4998 		    cb (pfile, CPP_DO_print, data,
   4999 			line_count, base, pos - 1 - base);
   5000 
   5001 		  /* Prep things for directive handling. */
   5002 		  buffer->next_line = pos;
   5003 		  buffer->need_line = true;
   5004 		  bool ok = _cpp_get_fresh_line (pfile);
   5005 		  gcc_checking_assert (ok);
   5006 
   5007 		  /* Ensure proper column numbering for generated
   5008 		     error messages. */
   5009 		  buffer->line_base -= pos - line_start;
   5010 
   5011 		  _cpp_handle_directive (pfile, line_start + 1 != pos);
   5012 
   5013 		  /* Sanitize the line settings.  Duplicate #include's can
   5014 		     mess things up. */
   5015 		  // FIXME: Necessary?
   5016 		  pfile->line_table->highest_location
   5017 		    = pfile->line_table->highest_line;
   5018 
   5019 		  if (!pfile->state.skipping
   5020 		      && pfile->buffer->next_line < pfile->buffer->rlimit)
   5021 		    cb (pfile, CPP_DO_location, data,
   5022 			pfile->line_table->highest_line);
   5023 
   5024 		  goto restart;
   5025 		}
   5026 	      goto dflt;
   5027 
   5028 	    case '/':
   5029 	      {
   5030 		const unsigned char *peek = do_peek_next (pos, limit);
   5031 		if (!(*peek == '/' || *peek == '*'))
   5032 		  goto dflt;
   5033 
   5034 		/* Line or block comment  */
   5035 		bool is_block = *peek == '*';
   5036 		bool star = false;
   5037 		bool esc = false;
   5038 		location_t sloc
   5039 		  = linemap_position_for_column (pfile->line_table,
   5040 						 pos - line_start);
   5041 
   5042 		while (pos < limit)
   5043 		  {
   5044 		    char c = *pos++;
   5045 		    switch (c)
   5046 		      {
   5047 		      case '\\':
   5048 			esc = true;
   5049 			break;
   5050 
   5051 		      case '\r':
   5052 			if (*pos == '\n')
   5053 			  pos++;
   5054 			/* FALLTHROUGH  */
   5055 
   5056 		      case '\n':
   5057 			{
   5058 			  CPP_INCREMENT_LINE (pfile, 0);
   5059 			  line_count++;
   5060 			  line_start = pos;
   5061 			  if (!esc && !is_block)
   5062 			    {
   5063 			      bol = true;
   5064 			      goto done_comment;
   5065 			    }
   5066 			}
   5067 			if (!esc)
   5068 			  star = false;
   5069 			esc = false;
   5070 			break;
   5071 
   5072 		      case '*':
   5073 			if (pos > peek)
   5074 			  star = is_block;
   5075 			esc = false;
   5076 			break;
   5077 
   5078 		      case '/':
   5079 			if (star)
   5080 			  goto done_comment;
   5081 			/* FALLTHROUGH  */
   5082 
   5083 		      default:
   5084 			star = false;
   5085 			esc = false;
   5086 			break;
   5087 		      }
   5088 		  }
   5089 		if (pos < limit || is_block)
   5090 		  cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
   5091 				       "unterminated comment");
   5092 	      done_comment:
   5093 		lwm = pos;
   5094 		break;
   5095 	      }
   5096 
   5097 	    case '\'':
   5098 	      if (!CPP_OPTION (pfile, digit_separators))
   5099 		goto delimited_string;
   5100 
   5101 	      /* Possibly a number punctuator.  */
   5102 	      if (!ISIDNUM (*do_peek_next (pos, limit)))
   5103 		goto delimited_string;
   5104 
   5105 	      goto quote_peek;
   5106 
   5107 	    case '\"':
   5108 	      if (!CPP_OPTION (pfile, rliterals))
   5109 		goto delimited_string;
   5110 
   5111 	    quote_peek:
   5112 	      {
   5113 		/* For ' see if it's a number punctuator
   5114 		   \.?<digit>(<digit>|<identifier-nondigit>
   5115 		   |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
   5116 		/* For " see if it's a raw string
   5117 		   {U,L,u,u8}R.  This includes CPP_NUMBER detection,
   5118 		   because that could be 0e+R.  */
   5119 		const unsigned char *peek = pos - 1;
   5120 		bool quote_first = c == '"';
   5121 		bool quote_eight = false;
   5122 		bool maybe_number_start = false;
   5123 		bool want_number = false;
   5124 
   5125 		while ((peek = do_peek_prev (peek, lwm)))
   5126 		  {
   5127 		    unsigned char p = *peek;
   5128 		    if (quote_first)
   5129 		      {
   5130 			if (!raw)
   5131 			  {
   5132 			    if (p != 'R')
   5133 			      break;
   5134 			    raw = true;
   5135 			    continue;
   5136 			  }
   5137 
   5138 			quote_first = false;
   5139 			if (p == 'L' || p == 'U' || p == 'u')
   5140 			  ;
   5141 			else if (p == '8')
   5142 			  quote_eight = true;
   5143 			else
   5144 			  goto second_raw;
   5145 		      }
   5146 		    else if (quote_eight)
   5147 		      {
   5148 			if (p != 'u')
   5149 			  {
   5150 			    raw = false;
   5151 			    break;
   5152 			  }
   5153 			quote_eight = false;
   5154 		      }
   5155 		    else if (c == '"')
   5156 		      {
   5157 		      second_raw:;
   5158 			if (!want_number && ISIDNUM (p))
   5159 			  {
   5160 			    raw = false;
   5161 			    break;
   5162 			  }
   5163 		      }
   5164 
   5165 		    if (ISDIGIT (p))
   5166 		      maybe_number_start = true;
   5167 		    else if (p == '.')
   5168 		      want_number = true;
   5169 		    else if (ISIDNUM (p))
   5170 		      maybe_number_start = false;
   5171 		    else if (p == '+' || p == '-')
   5172 		      {
   5173 			if (const unsigned char *peek_prev
   5174 			    = do_peek_prev (peek, lwm))
   5175 			  {
   5176 			    p = *peek_prev;
   5177 			    if (p == 'e' || p == 'E'
   5178 				|| p == 'p' || p == 'P')
   5179 			      {
   5180 				want_number = true;
   5181 				maybe_number_start = false;
   5182 			      }
   5183 			    else
   5184 			      break;
   5185 			  }
   5186 			else
   5187 			  break;
   5188 		      }
   5189 		    else if (p == '\'' || p == '\"')
   5190 		      {
   5191 			/* If this is lwm, this must be the end of a
   5192 			   previous string.  So this is a trailing
   5193 			   literal type, (a) if those are allowed,
   5194 			     and (b) maybe_start is false.  Otherwise
   5195 			     this must be a CPP_NUMBER because we've
   5196 			     met another ', and we'd have checked that
   5197 			     in its own right.  */
   5198 			if (peek == lwm && CPP_OPTION (pfile, uliterals))
   5199 			  {
   5200 			    if  (!maybe_number_start && !want_number)
   5201 			      /* Must be a literal type.  */
   5202 			      raw = false;
   5203 			  }
   5204 			else if (p == '\''
   5205 				 && CPP_OPTION (pfile, digit_separators))
   5206 			  maybe_number_start = true;
   5207 			break;
   5208 		      }
   5209 		    else if (c == '\'')
   5210 		      break;
   5211 		    else if (!quote_first && !quote_eight)
   5212 		      break;
   5213 		  }
   5214 
   5215 		if (maybe_number_start)
   5216 		  {
   5217 		    if (c == '\'')
   5218 		      /* A CPP NUMBER.  */
   5219 		      goto dflt;
   5220 		    raw = false;
   5221 		  }
   5222 
   5223 		goto delimited_string;
   5224 	      }
   5225 
   5226 	    delimited_string:
   5227 	      {
   5228 		/* (Possibly raw) string or char literal.  */
   5229 		unsigned char end = c;
   5230 		int delim_len = -1;
   5231 		const unsigned char *delim = NULL;
   5232 		location_t sloc = linemap_position_for_column (pfile->line_table,
   5233 							       pos - line_start);
   5234 		int esc = 0;
   5235 
   5236 		if (raw)
   5237 		  {
   5238 		    /* There can be no line breaks in the delimiter.  */
   5239 		    delim = pos;
   5240 		    for (delim_len = 0; (c = *pos++) != '('; delim_len++)
   5241 		      {
   5242 			if (delim_len == 16)
   5243 			  {
   5244 			    cpp_error_with_line (pfile, CPP_DL_ERROR,
   5245 						 sloc, 0,
   5246 						 "raw string delimiter"
   5247 						 " longer than %d"
   5248 						 " characters",
   5249 						 delim_len);
   5250 			    raw = false;
   5251 			    pos = delim;
   5252 			    break;
   5253 			  }
   5254 			if (strchr (") \\\t\v\f\n", c))
   5255 			  {
   5256 			    cpp_error_with_line (pfile, CPP_DL_ERROR,
   5257 						 sloc, 0,
   5258 						 "invalid character '%c'"
   5259 						 " in raw string"
   5260 						 " delimiter", c);
   5261 			    raw = false;
   5262 			    pos = delim;
   5263 			    break;
   5264 			  }
   5265 			if (pos >= limit)
   5266 			  goto bad_string;
   5267 		      }
   5268 		  }
   5269 
   5270 		while (pos < limit)
   5271 		  {
   5272 		    char c = *pos++;
   5273 		    switch (c)
   5274 		      {
   5275 		      case '\\':
   5276 			if (!raw)
   5277 			  esc++;
   5278 			break;
   5279 
   5280 		      case '\r':
   5281 			if (*pos == '\n')
   5282 			  pos++;
   5283 			/* FALLTHROUGH  */
   5284 
   5285 		      case '\n':
   5286 			{
   5287 			  CPP_INCREMENT_LINE (pfile, 0);
   5288 			  line_count++;
   5289 			  line_start = pos;
   5290 			}
   5291 			if (esc)
   5292 			  esc--;
   5293 			break;
   5294 
   5295 		      case ')':
   5296 			if (raw
   5297 			    && pos + delim_len + 1 < limit
   5298 			    && pos[delim_len] == end
   5299 			    && !memcmp (delim, pos, delim_len))
   5300 			  {
   5301 			    pos += delim_len + 1;
   5302 			    raw = false;
   5303 			    goto done_string;
   5304 			  }
   5305 			break;
   5306 
   5307 		      default:
   5308 			if (!raw && !(esc & 1) && c == end)
   5309 			  goto done_string;
   5310 			esc = 0;
   5311 			break;
   5312 		      }
   5313 		  }
   5314 	      bad_string:
   5315 		cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
   5316 				     "unterminated literal");
   5317 
   5318 	      done_string:
   5319 		raw = false;
   5320 		lwm = pos - 1;
   5321 	      }
   5322 	      goto dflt;
   5323 
   5324 	    case '_':
   5325 	    case 'e':
   5326 	    case 'i':
   5327 	    case 'm':
   5328 	      if (bol && module_p && !pfile->state.skipping
   5329 		  && do_peek_module (pfile, c, pos, limit))
   5330 		{
   5331 		  /* We've seen the start of a module control line.
   5332 		     Start up the tokenizer.  */
   5333 		  pos--; /* Backup over the first character.  */
   5334 
   5335 		  /* Backup over whitespace to start of line.  */
   5336 		  while (pos > line_start
   5337 			 && (pos[-1] == ' ' || pos[-1] == '\t'))
   5338 		    pos--;
   5339 
   5340 		  if (pos > base)
   5341 		    cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
   5342 
   5343 		  /* Prep things for directive handling. */
   5344 		  buffer->next_line = pos;
   5345 		  buffer->need_line = true;
   5346 
   5347 		  /* Now get tokens until the PRAGMA_EOL.  */
   5348 		  do
   5349 		    {
   5350 		      location_t spelling;
   5351 		      const cpp_token *tok
   5352 			= cpp_get_token_with_location (pfile, &spelling);
   5353 
   5354 		      gcc_assert (pfile->state.in_deferred_pragma
   5355 				  || tok->type == CPP_PRAGMA_EOL);
   5356 		      cb (pfile, CPP_DO_token, data, tok, spelling);
   5357 		    }
   5358 		  while (pfile->state.in_deferred_pragma);
   5359 
   5360 		  if (pfile->buffer->next_line < pfile->buffer->rlimit)
   5361 		    cb (pfile, CPP_DO_location, data,
   5362 			pfile->line_table->highest_line);
   5363 
   5364 		  pfile->mi_valid = false;
   5365 		  goto restart;
   5366 		}
   5367 	      goto dflt;
   5368 
   5369 	    default:
   5370 	    dflt:
   5371 	      bol = false;
   5372 	      pfile->mi_valid = false;
   5373 	      break;
   5374 	    }
   5375 	}
   5376 
   5377       if (buffer->rlimit > base && !pfile->state.skipping)
   5378 	{
   5379 	  const unsigned char *limit = buffer->rlimit;
   5380 	  /* If the file was not newline terminated, add rlimit, which is
   5381 	     guaranteed to point to a newline, to the end of our range.  */
   5382 	  if (limit[-1] != '\n')
   5383 	    {
   5384 	      limit++;
   5385 	      CPP_INCREMENT_LINE (pfile, 0);
   5386 	      line_count++;
   5387 	    }
   5388 	  cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
   5389 	}
   5390 
   5391       _cpp_pop_buffer (pfile);
   5392     }
   5393   while (pfile->buffer);
   5394 }
   5395