Home | History | Annotate | Line # | Download | only in libcpp
lex.cc revision 1.2
      1 /* CPP Library - lexical analysis.
      2    Copyright (C) 2000-2022 Free Software Foundation, Inc.
      3    Contributed by Per Bothner, 1994-95.
      4    Based on CCCP program by Paul Rubin, June 1986
      5    Adapted to ANSI C, Richard Stallman, Jan 1987
      6    Broken out to separate file, Zack Weinberg, Mar 2000
      7 
      8 This program is free software; you can redistribute it and/or modify it
      9 under the terms of the GNU General Public License as published by the
     10 Free Software Foundation; either version 3, or (at your option) any
     11 later version.
     12 
     13 This program is distributed in the hope that it will be useful,
     14 but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     16 GNU General Public License for more details.
     17 
     18 You should have received a copy of the GNU General Public License
     19 along with this program; see the file COPYING3.  If not see
     20 <http://www.gnu.org/licenses/>.  */
     21 
     22 #include "config.h"
     23 #include "system.h"
     24 #include "cpplib.h"
     25 #include "internal.h"
     26 
     27 enum spell_type
     28 {
          /* Category given to every OP() entry of TTYPE_TABLE; such entries
             carry their own spelling string.  */
     29   SPELL_OPERATOR = 0,
          /* The categories below are chosen by the second argument of each TK()
             entry, which is pasted onto "SPELL_".  NOTE(review): presumably
             identifier-, literal- and no-spelling tokens respectively; confirm
             against the code that switches on TOKEN_SPELL.  */
     30   SPELL_IDENT,
     31   SPELL_LITERAL,
     32   SPELL_NONE
     33 };
     34 
     35 struct token_spelling
     36 {
          /* How the spelling of this token type is obtained.  */
     37   enum spell_type category;
          /* The operator text for OP() table entries; the stringified
             enumerator name for TK() table entries.  */
     38   const unsigned char *name;
     39 };
     40 
     41 static const unsigned char *const digraph_spellings[] =
        /* Source spellings of the six digraph forms.  NOTE(review): the
           indexing convention is set by the users of this table, which are
           outside this part of the file.  */
     42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
     43 
     44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
     45 #define TK(e, s) { SPELL_ ## s,    UC #e },
        /* One entry per token type, produced by expanding TTYPE_TABLE with the
           OP and TK macros above: OP entries record their operator string,
           TK entries record the stringified enumerator name and take their
           category from the table.  The macros are only needed for this
           expansion and are undefined again right after it.  */
     46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
     47 #undef OP
     48 #undef TK
     49 
     50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
        /* TOKEN_SPELL (above) yields the spell_type category recorded for
           TOKEN's type; TOKEN_NAME (below) yields the spelling string recorded
           for it.  Both index token_spellings by TOKEN->type.  */
     51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
     52 
     53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
     54 static int skip_line_comment (cpp_reader *);
     55 static void skip_whitespace (cpp_reader *, cppchar_t);
     56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
     57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
     58 static void store_comment (cpp_reader *, cpp_token *);
     59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
     60 			    unsigned int, enum cpp_ttype);
     61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
     62 static int name_p (cpp_reader *, const cpp_string *);
     63 static tokenrun *next_tokenrun (tokenrun *);
     64 
     65 static _cpp_buff *new_buff (size_t);
     66 
     67 
     68 /* Utility routine:
     69 
     70    Compares, the token TOKEN to the NUL-terminated string STRING.
     71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
     72 int
     73 cpp_ideq (const cpp_token *token, const char *string)
     74 {
     75   if (token->type != CPP_NAME)
     76     return 0;
     77 
     78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
     79 }
     80 
     81 /* Record a note TYPE at byte POS into the current cleaned logical
     82    line.  */
     83 static void
     84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
     85 {
     86   if (buffer->notes_used == buffer->notes_cap)
     87     {
     88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
     89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
     90                                   buffer->notes_cap);
     91     }
     92 
     93   buffer->notes[buffer->notes_used].pos = pos;
     94   buffer->notes[buffer->notes_used].type = type;
     95   buffer->notes_used++;
     96 }
     97 
     98 
     99 /* Fast path to find line special characters using optimized character
    101    scanning algorithms.  Anything complicated falls back to the slow
    102    path below.  Since this loop is very hot it's worth doing these kinds
    103    of optimizations.
    104 
    105    One of the paths through the ifdefs should provide
    106 
    107      const uchar *search_line_fast (const uchar *s, const uchar *end);
    108 
    109    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
    110    the found character.
    111 
    112    Note that the last character of the buffer is *always* a newline,
    113    as forced by _cpp_convert_input.  This fact can be used to avoid
    114    explicitly looking for the end of the buffer.  */
    115 
    116 /* Configure gives us an ifdef test.  Give the macro a value of 0 when
           it is not defined, so the code below can use WORDS_BIGENDIAN in
           ordinary `if' conditions.  */
    117 #ifndef WORDS_BIGENDIAN
    118 #define WORDS_BIGENDIAN 0
    119 #endif
    120 
    121 /* We'd like the largest integer that fits into a register.  There's nothing
    122    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
    123    but MS decided on an LLP64 model.  Thankfully when building with GCC we
    124    can get the "real" word size.  */
    125 #ifdef __GNUC__
    126 typedef unsigned int word_type __attribute__((__mode__(__word__)));
    127 #else
    128 typedef unsigned long word_type;
    129 #endif
    130 
    131 /* The code below is only expecting sizes 4 or 8.
    132    Die at compile-time if this expectation is violated: the array bound
           below evaluates to 1 when the size is 4 or 8, and to -1 (an invalid
           bound) otherwise.  */
    133 typedef char check_word_type_size
    134   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
    135 
    136 /* Return X with the first N bytes forced to values that won't match one
    137    of the interesting characters.  Note that NUL is not interesting.  */
    138 
    139 static inline word_type
    140 acc_char_mask_misalign (word_type val, unsigned int n)
    141 {
    142   word_type mask = -1;
    143   if (WORDS_BIGENDIAN)
    144     mask >>= n * 8;
    145   else
    146     mask <<= n * 8;
    147   return val & mask;
    148 }
    149 
    150 /* Return X replicated to all byte positions within WORD_TYPE.  */
    151 
    152 static inline word_type
    153 acc_char_replicate (uchar x)
    154 {
    155   word_type ret;
    156 
    157   ret = (x << 24) | (x << 16) | (x << 8) | x;
    158   if (sizeof(word_type) == 8)
    159     ret = (ret << 16 << 16) | ret;
    160   return ret;
    161 }
    162 
    163 /* Return non-zero if some byte of VAL is (probably) C.  C is expected to
           hold the wanted character replicated into every byte, as built by
           acc_char_replicate.  Outside the Alpha path the test is only
           approximate, so callers confirm a hit with acc_char_index.  */
    164 
    165 static inline word_type
    166 acc_char_cmp (word_type val, word_type c)
    167 {
    168 #if defined(__GNUC__) && defined(__alpha__)
    169   /* We can get exact results using a compare-bytes instruction.
    170      Get (val == c) via (0 >= (val ^ c)).  */
    171   return __builtin_alpha_cmpbge (0, val ^ c);
    172 #else
          /* MAGIC ends up as 0x7efefeff, or 0x7efefefefefefeff for 8-byte
             words; ~MAGIC therefore selects the bit just above each of the
             lower bytes plus the top bit of the word.  */
    173   word_type magic = 0x7efefefeU;
    174   if (sizeof(word_type) == 8)
    175     magic = (magic << 16 << 16) | 0xfefefefeU;
    176   magic |= 1;
    177 
          /* After the XOR every byte of VAL equal to the wanted character is
             zero.  Adding MAGIC carries out of each non-zero byte; the final
             expression is non-zero when one of the bits selected by ~MAGIC
             shows that such a carry did not arrive.  */
    178   val ^= c;
    179   return ((val + magic) ^ ~val) & ~magic;
    180 #endif
    181 }
    182 
    183 /* Given the result of acc_char_cmp is non-zero, return the index of
    184    the found character.  If this was a false positive, return -1.  */
    185 
    186 static inline int
    187 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
    188 		word_type val ATTRIBUTE_UNUSED)
    189 {
    190 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
    191   /* The cmpbge instruction sets *bits* of the result corresponding to
    192      matches in the bytes with no false positives.  */
    193   return __builtin_ctzl (cmp);
    194 #else
    195   unsigned int i;
    196 
    197   /* ??? It would be nice to force unrolling here,
    198      and have all of these constants folded.  */
    199   for (i = 0; i < sizeof(word_type); ++i)
    200     {
    201       uchar c;
    202       if (WORDS_BIGENDIAN)
    203 	c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
    204       else
    205 	c = (val >> i * 8) & 0xff;
    206 
    207       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
    208 	return i;
    209     }
    210 
    211   return -1;
    212 #endif
    213 }
    214 
    215 /* A version of the fast scanner using bit fiddling techniques.
    216 
    217    For 32-bit words, one would normally perform 16 comparisons and
    218    16 branches.  With this algorithm one performs 24 arithmetic
    219    operations and one branch.  Whether this is faster with a 32-bit
    220    word size is going to be somewhat system dependent.
    221 
    222    For 64-bit words, we eliminate twice the number of comparisons
    223    and branches without increasing the number of arithmetic operations.
    224    It's almost certainly going to be a win with 64-bit word size.

           Starting at S, return a pointer to the first \n, \r, \\ or ? byte.
           END is unused: the loop has no bounds check and relies on the
           buffer always ending in a newline.  */
    225 
    226 static const uchar * search_line_acc_char (const uchar *, const uchar *)
    227   ATTRIBUTE_UNUSED;
    228 
    229 static const uchar *
    230 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    231 {
    232   const word_type repl_nl = acc_char_replicate ('\n');
    233   const word_type repl_cr = acc_char_replicate ('\r');
    234   const word_type repl_bs = acc_char_replicate ('\\');
    235   const word_type repl_qm = acc_char_replicate ('?');
    236 
    237   unsigned int misalign;
    238   const word_type *p;
    239   word_type val, t;
    240 
    241   /* Align the buffer.  Mask out any bytes from before the beginning.
             P is S rounded down to a word boundary, so the first word may
             contain bytes that precede S; those are cleared so that they
             cannot match.  */
    242   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
    243   val = *p;
    244   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
    245   if (misalign)
    246     val = acc_char_mask_misalign (val, misalign);
    247 
    248   /* Main loop.  */
    249   while (1)
    250     {
              /* T is non-zero when some byte of VAL may be one of the four
                 characters.  */
    251       t  = acc_char_cmp (val, repl_nl);
    252       t |= acc_char_cmp (val, repl_cr);
    253       t |= acc_char_cmp (val, repl_bs);
    254       t |= acc_char_cmp (val, repl_qm);
    255 
    256       if (__builtin_expect (t != 0, 0))
    257 	{
    	  /* A negative index means the hit was a false positive; keep
    	     scanning in that case.  */
    258 	  int i = acc_char_index (t, val);
    259 	  if (i >= 0)
    260 	    return (const uchar *)p + i;
    261 	}
    262 
    263       val = *++p;
    264     }
    265 }
    266 
    267 /* Disable on Solaris 2/x86 until the following problem can be properly
    268    autoconfed:
    269 
    270    The Solaris 10+ assembler tags objects with the instruction set
    271    extensions used, so SSE4.2 executables cannot run on machines that
    272    don't support that extension.  */
    273 
    274 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
    275 
    276 /* Replicated character data to be shared between implementations.
    277    Recall that outside of a context with vector support we can't
    278    define compatible vector types, therefore these are all defined
    279    in terms of raw characters.

           Row 0 is \n, row 1 is \r, row 2 is \\ and row 3 is ?.  Each row is
           16 bytes long and the table is 16-byte aligned; the MMX scanner
           reads the first 8 bytes of a row, the SSE2 scanner all 16.  */
    280 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
    281   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    282     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
    283   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    284     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
    285   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    286     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
    287   { '?', '?', '?', '?', '?', '?', '?', '?',
    288     '?', '?', '?', '?', '?', '?', '?', '?' },
    289 };
    290 
    291 /* A version of the fast scanner using MMX vectorized byte compare insns.
    292 
    293    This uses the PMOVMSKB instruction which was introduced with "MMX2",
    294    which was packaged into SSE1; it is also present in the AMD MMX
    295    extension.  Mark the function as using "sse" so that we emit a real
    296    "emms" instruction, rather than the 3dNOW "femms" instruction.

           Starting at S, return a pointer to the first \n, \r, \\ or ? byte.
           END is unused; the loop relies on such a byte being present.  */
    297 
    298 static const uchar *
    299 #ifndef __SSE__
    300 __attribute__((__target__("sse")))
    301 #endif
    302 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    303 {
    304   typedef char v8qi __attribute__ ((__vector_size__ (8)));
    305   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
    306 
          /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
    307   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
    308   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
    309   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
    310   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
    311 
    312   unsigned int misalign, found, mask;
    313   const v8qi *p;
    314   v8qi data, t, c;
    315 
    316   /* Align the source pointer.  While MMX doesn't generate unaligned data
    317      faults, this allows us to safely scan to the end of the buffer without
    318      reading beyond the end of the last page.  */
    319   misalign = (uintptr_t)s & 7;
    320   p = (const v8qi *)((uintptr_t)s & -8);
    321   data = *p;
    322 
    323   /* Create a mask for the bytes that are valid within the first
    324      8-byte block.  The Idea here is that the AND with the mask
    325      within the loop is "free", since we need some AND or TEST
    326      insn in order to set the flags for the branch anyway.  The low
             MISALIGN bits, which stand for the bytes before S, are cleared.  */
    327   mask = -1u << misalign;
    328 
    329   /* Main loop processing 8 bytes at a time.  Enter it at START so that
             the block already loaded above is tested before P advances.  */
    330   goto start;
    331   do
    332     {
    333       data = *++p;
              /* Every byte of the later blocks is valid.  */
    334       mask = -1;
    335 
    336     start:
    337       t = __builtin_ia32_pcmpeqb(data, repl_nl);
    338       c = __builtin_ia32_pcmpeqb(data, repl_cr);
    339       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
    340       c = __builtin_ia32_pcmpeqb(data, repl_bs);
    341       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
    342       c = __builtin_ia32_pcmpeqb(data, repl_qm);
    343       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
    344       found = __builtin_ia32_pmovmskb (t);
    345       found &= mask;
    346     }
    347   while (!found);
    348 
    349   __builtin_ia32_emms ();
    350 
    351   /* FOUND contains 1 in bits for which we matched a relevant
    352      character.  Conversion to the byte index is trivial.  */
    353   found = __builtin_ctz(found);
    354   return (const uchar *)p + found;
    355 }
    356 
    357 /* A version of the fast scanner using SSE2 vectorized byte compare insns.

           Starting at S, return a pointer to the first \n, \r, \\ or ? byte.
           END is unused; the loop relies on such a byte being present.  */
    358 
    359 static const uchar *
    360 #ifndef __SSE2__
    361 __attribute__((__target__("sse2")))
    362 #endif
    363 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    364 {
    365   typedef char v16qi __attribute__ ((__vector_size__ (16)));
    366 
    367   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
    368   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
    369   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
    370   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
    371 
    372   unsigned int misalign, found, mask;
    373   const v16qi *p;
    374   v16qi data, t;
    375 
    376   /* Align the source pointer.  P is S rounded down to a 16-byte
             boundary, so the first block may start before S.  */
    377   misalign = (uintptr_t)s & 15;
    378   p = (const v16qi *)((uintptr_t)s & -16);
    379   data = *p;
    380 
    381   /* Create a mask for the bytes that are valid within the first
    382      16-byte block.  The Idea here is that the AND with the mask
    383      within the loop is "free", since we need some AND or TEST
    384      insn in order to set the flags for the branch anyway.  The low
             MISALIGN bits, which stand for the bytes before S, are cleared.  */
    385   mask = -1u << misalign;
    386 
    387   /* Main loop processing 16 bytes at a time.  Enter it at START so that
             the block already loaded above is tested before P advances.  */
    388   goto start;
    389   do
    390     {
    391       data = *++p;
              /* Every byte of the later blocks is valid.  */
    392       mask = -1;
    393 
    394     start:
    395       t  = data == repl_nl;
    396       t |= data == repl_cr;
    397       t |= data == repl_bs;
    398       t |= data == repl_qm;
    399       found = __builtin_ia32_pmovmskb128 (t);
    400       found &= mask;
    401     }
    402   while (!found);
    403 
    404   /* FOUND contains 1 in bits for which we matched a relevant
    405      character.  Conversion to the byte index is trivial.  */
    406   found = __builtin_ctz(found);
    407   return (const uchar *)p + found;
    408 }
    409 
    410 #ifdef HAVE_SSE4
    411 /* A version of the fast scanner using SSE 4.2 vectorized string insns.

           Starting at S, return a pointer to the first \n, \r, \\ or ? byte.
           END is only used to decide whether an unaligned 16-byte read at S
           is safe.  */
    412 
    413 static const uchar *
    414 #ifndef __SSE4_2__
    415 __attribute__((__target__("sse4.2")))
    416 #endif
    417 search_line_sse42 (const uchar *s, const uchar *end)
    418 {
    419   typedef char v16qi __attribute__ ((__vector_size__ (16)));
          /* The four characters to look for; PCMPESTRI is always told that
             this set has length 4.  */
    420   static const v16qi search = { '\n', '\r', '?', '\\' };
    421 
    422   uintptr_t si = (uintptr_t)s;
    423   uintptr_t index;
    424 
    425   /* Check for unaligned input.  */
    426   if (si & 15)
    427     {
    428       v16qi sv;
    429 
    430       if (__builtin_expect (end - s < 16, 0)
    431 	  && __builtin_expect ((si & 0xfff) > 0xff0, 0))
    432 	{
    433 	  /* There are less than 16 bytes left in the buffer, and less
    434 	     than 16 bytes left on the page.  Reading 16 bytes at this
    435 	     point might generate a spurious page fault.  Defer to the
    436 	     SSE2 implementation, which already handles alignment.  */
    437 	  return search_line_sse2 (s, end);
    438 	}
    439 
    440       /* ??? The builtin doesn't understand that the PCMPESTRI read from
    441 	 memory need not be aligned.  */
    442       sv = __builtin_ia32_loaddqu ((const char *) s);
    443       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
    444 
              /* An index below 16 is the offset of a match within the 16
                 bytes just examined.  */
    445       if (__builtin_expect (index < 16, 0))
    446 	goto found;
    447 
    448       /* Advance the pointer to an aligned address.  We will re-scan a
    449 	 few bytes, but we no longer need care for reading past the
    450 	 end of a page, since we're guaranteed a match.  */
    451       s = (const uchar *)((si + 15) & -16);
    452     }
    453 
    454   /* Main loop, processing 16 bytes at a time.  S is 16-byte aligned
             here.  */
    455 #ifdef __GCC_ASM_FLAG_OUTPUTS__
    456   while (1)
    457     {
    458       char f;
    459 
    460       /* By using inline assembly instead of the builtin,
    461 	 we can use the result, as well as the flags set.  F receives
    	 the carry flag; the loop stops as soon as it is set.  */
    462       __asm ("%vpcmpestri\t$0, %2, %3"
    463 	     : "=c"(index), "=@ccc"(f)
    464 	     : "m"(*s), "x"(search), "a"(4), "d"(16));
    465       if (f)
    466 	break;
    467 
    468       s += 16;
    469     }
    470 #else
          /* The assembly loop adds 16 to S before each compare, so step back
             one block first.  */
    471   s -= 16;
    472   /* By doing the whole loop in inline assembly,
    473      we can make proper use of the flags set.  The loop repeats for as
             long as the compare leaves the carry flag clear.  */
    474   __asm (      ".balign 16\n"
    475 	"0:	add $16, %1\n"
    476 	"	%vpcmpestri\t$0, (%1), %2\n"
    477 	"	jnc 0b"
    478 	: "=&c"(index), "+r"(s)
    479 	: "x"(search), "a"(4), "d"(16));
    480 #endif
    481 
    482  found:
    483   return s + index;
    484 }
    485 
    486 #else
    487 /* Work around out-dated assemblers without sse4 support.  */
    488 #define search_line_sse42 search_line_sse2
    489 #endif
    490 
    491 /* Check the CPU capabilities.  */
    492 
    493 #include "../gcc/config/i386/cpuid.h"
    494 
        /* Common signature of the search_line_* scanners, and the pointer
           through which the one chosen by init_vectorized_lexer is called.  */
    495 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
    496 static search_line_fast_type search_line_fast;
    497 
        /* Point search_line_fast at the best scanner available.  MINIMUM is
           the level already guaranteed by the compilation flags (0 none,
           1 SSE, 2 SSE2, 3 SSE4.2); anything above it is only selected when
           CPUID reports it.  The default is search_line_acc_char.  */
    498 #define HAVE_init_vectorized_lexer 1
    499 static inline void
    500 init_vectorized_lexer (void)
    501 {
    502   unsigned dummy, ecx = 0, edx = 0;
    503   search_line_fast_type impl = search_line_acc_char;
    504   int minimum = 0;
    505 
    506 #if defined(__SSE4_2__)
    507   minimum = 3;
    508 #elif defined(__SSE2__)
    509   minimum = 2;
    510 #elif defined(__SSE__)
    511   minimum = 1;
    512 #endif
    513 
    514   if (minimum == 3)
    515     impl = search_line_sse42;
          /* Leaf 1 feature bits.  The branch is also taken when the query
             fails but SSE2 is guaranteed at compile time; ECX and EDX are then
             still zero, so the SSE2 scanner is chosen.  */
    516   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
    517     {
              /* MINIMUM cannot be 3 here (handled above), so SSE4.2 is
                 selected on the CPUID bit alone.  */
    518       if (minimum == 3 || (ecx & bit_SSE4_2))
    519         impl = search_line_sse42;
    520       else if (minimum == 2 || (edx & bit_SSE2))
    521 	impl = search_line_sse2;
    522       else if (minimum == 1 || (edx & bit_SSE))
    523 	impl = search_line_mmx;
    524     }
          /* Leaf 1 was not available: try the extended leaf, where the MMX
             scanner is used when both bit_MMXEXT and bit_CMOV are set.  */
    525   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
    526     {
    527       if (minimum == 1
    528 	  || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
    529 	impl = search_line_mmx;
    530     }
    531 
    532   search_line_fast = impl;
    533 }
    534 
    535 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
    536 
    537 /* A version of the fast scanner using AltiVec vectorized byte compares
    538    and VSX unaligned loads (when VSX is available).  This is otherwise
    539    the same as the AltiVec version.

           Starting at S, return a pointer to the first \n, \r, \\ or ? byte.
           END is unused; the loop relies on such a byte being present.  */
    540 
    541 ATTRIBUTE_NO_SANITIZE_UNDEFINED
    542 static const uchar *
    543 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    544 {
    545   typedef __attribute__((altivec(vector))) unsigned char vc;
    546 
    547   const vc repl_nl = {
    548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    549     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
    550   };
    551   const vc repl_cr = {
    552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    553     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
    554   };
    555   const vc repl_bs = {
    556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    557     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
    558   };
    559   const vc repl_qm = {
    560     '?', '?', '?', '?', '?', '?', '?', '?',
    561     '?', '?', '?', '?', '?', '?', '?', '?',
    562   };
    563   const vc zero = { 0 };
    564 
    565   vc data, t;
    566 
    567   /* Main loop processing 16 bytes at a time.  The load is taken
             directly from S, so no bytes from before S are ever examined and
             no start-up mask is needed.  */
    568   do
    569     {
    570       vc m_nl, m_cr, m_bs, m_qm;
    571 
    572       data = __builtin_vec_vsx_ld (0, s);
    573       s += 16;
    574 
    575       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
    576       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
    577       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
    578       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
    579       t = (m_nl | m_cr) | (m_bs | m_qm);
    580 
    581       /* T now contains 0xff in bytes for which we matched one of the relevant
    582 	 characters.  We want to exit the loop if any byte in T is non-zero.
    583 	 Below is the expansion of vec_any_ne(t, zero).  */
    584     }
    585   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
    586 
    587   /* Restore s to point to the 16 bytes we just processed.  */
    588   s -= 16;
    589 
    590   {
        /* Number of unsigned longs that make up one vector.  */
    591 #define N  (sizeof(vc) / sizeof(long))
    592 
    593     union {
    594       vc v;
    595       /* Statically assert that N is 2 or 4.  */
    596       unsigned long l[(N == 2 || N == 4) ? N : -1];
    597     } u;
    598     unsigned long l, i = 0;
    599 
    600     u.v = t;
    601 
    602     /* Find the first word of T that is non-zero.  S is advanced past
               every all-zero word, so it ends up at the start of the word
               left in L.  */
    603     switch (N)
    604       {
    605       case 4:
    606 	l = u.l[i++];
    607 	if (l != 0)
    608 	  break;
    609 	s += sizeof(unsigned long);
    610 	l = u.l[i++];
    611 	if (l != 0)
    612 	  break;
    613 	s += sizeof(unsigned long);
    614 	/* FALLTHRU */
    615       case 2:
    616 	l = u.l[i++];
    617 	if (l != 0)
    618 	  break;
    619 	s += sizeof(unsigned long);
    620 	l = u.l[i];
    621       }
    622 
    623     /* L now contains 0xff in bytes for which we matched one of the
    624        relevant characters.  We can find the byte index by finding
    625        its bit index and dividing by 8.  The bit count is taken from
               the most significant end on big-endian hosts and from the least
               significant end otherwise.  */
    626 #ifdef __BIG_ENDIAN__
    627     l = __builtin_clzl(l) >> 3;
    628 #else
    629     l = __builtin_ctzl(l) >> 3;
    630 #endif
    631     return s + l;
    632 
    633 #undef N
    634   }
    635 }
    636 
    637 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
    638 
    639 /* A version of the fast scanner using AltiVec vectorized byte compares.
    640    This cannot be used for little endian because vec_lvsl/lvsr are
    641    deprecated for little endian and the code won't work properly.

           Starting at S, return a pointer to the first \n, \r, \\ or ? byte.
           END is unused; the loop relies on such a byte being present.  */
    642 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
    643    so we can't compile this function without -maltivec on the command line
    644    (or implied by some other switch).  */
    645 
    646 static const uchar *
    647 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    648 {
    649   typedef __attribute__((altivec(vector))) unsigned char vc;
    650 
    651   const vc repl_nl = {
    652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    653     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
    654   };
    655   const vc repl_cr = {
    656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    657     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
    658   };
    659   const vc repl_bs = {
    660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    661     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
    662   };
    663   const vc repl_qm = {
    664     '?', '?', '?', '?', '?', '?', '?', '?',
    665     '?', '?', '?', '?', '?', '?', '?', '?',
    666   };
    667   const vc ones = {
    668     -1, -1, -1, -1, -1, -1, -1, -1,
    669     -1, -1, -1, -1, -1, -1, -1, -1,
    670   };
    671   const vc zero = { 0 };
    672 
    673   vc data, mask, t;
    674 
    675   /* Altivec loads automatically mask addresses with -16.  This lets us
    676      issue the first load as early as possible.  */
    677   data = __builtin_vec_ld(0, (const vc *)s);
    678 
    679   /* Discard bytes before the beginning of the buffer.  Do this by
    680      beginning with all ones and shifting in zeros according to the
    681      mis-alignment.  The LVSR instruction pulls the exact shift we
    682      want from the address.  */
    683   mask = __builtin_vec_lvsr(0, s);
    684   mask = __builtin_vec_perm(zero, ones, mask);
    685   data &= mask;
    686 
    687   /* While altivec loads mask addresses, we still need to align S so
    688      that the offset we compute at the end is correct.  */
    689   s = (const uchar *)((uintptr_t)s & -16);
    690 
    691   /* Main loop processing 16 bytes at a time.  Enter it at START so that
             the masked block loaded above is tested before S advances.  */
    692   goto start;
    693   do
    694     {
    695       vc m_nl, m_cr, m_bs, m_qm;
    696 
    697       s += 16;
    698       data = __builtin_vec_ld(0, (const vc *)s);
    699 
    700     start:
    701       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
    702       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
    703       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
    704       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
    705       t = (m_nl | m_cr) | (m_bs | m_qm);
    706 
    707       /* T now contains 0xff in bytes for which we matched one of the relevant
    708 	 characters.  We want to exit the loop if any byte in T is non-zero.
    709 	 Below is the expansion of vec_any_ne(t, zero).  */
    710     }
    711   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
    712 
    713   {
        /* Number of unsigned longs that make up one vector.  */
    714 #define N  (sizeof(vc) / sizeof(long))
    715 
    716     union {
    717       vc v;
    718       /* Statically assert that N is 2 or 4.  */
    719       unsigned long l[(N == 2 || N == 4) ? N : -1];
    720     } u;
    721     unsigned long l, i = 0;
    722 
    723     u.v = t;
    724 
    725     /* Find the first word of T that is non-zero.  S is advanced past
               every all-zero word, so it ends up at the start of the word
               left in L.  */
    726     switch (N)
    727       {
    728       case 4:
    729 	l = u.l[i++];
    730 	if (l != 0)
    731 	  break;
    732 	s += sizeof(unsigned long);
    733 	l = u.l[i++];
    734 	if (l != 0)
    735 	  break;
    736 	s += sizeof(unsigned long);
    737 	/* FALLTHROUGH */
    738       case 2:
    739 	l = u.l[i++];
    740 	if (l != 0)
    741 	  break;
    742 	s += sizeof(unsigned long);
    743 	l = u.l[i];
    744       }
    745 
    746     /* L now contains 0xff in bytes for which we matched one of the
    747        relevant characters.  We can find the byte index by finding
    748        its bit index and dividing by 8.  */
    749     l = __builtin_clzl(l) >> 3;
    750     return s + l;
    751 
    752 #undef N
    753   }
    754 }
    755 
    756 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
    757 #include "arm_neon.h"
    758 
    759 /* This doesn't have to be the exact page size, but no system may use
    760    a size smaller than this.  ARMv8 requires a minimum page size of
    761    4k.  The impact of being conservative here is a small number of
    762    cases will take the slightly slower entry path into the main
    763    loop.  */
    764 
    765 #define AARCH64_MIN_PAGE_SIZE 4096
    766 
        /* A version of the fast scanner using AArch64 NEON byte compares.
           Starting at S, return a pointer to the first \n, \r, \\ or ? byte.
           END is unused; the loop relies on such a byte being present.  */
    767 static const uchar *
    768 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    769 {
    770   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
    771   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
    772   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
    773   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
          /* Gives each byte within an 8-byte half its own bit (0x01 .. 0x80),
             so that the per-byte compare results can be folded into a
             bitmask.  */
    774   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
    775 
          /* Shift applied to the four 16-bit lanes of one half so that the two
             halves land in different bytes of the 16-bit mask.  */
    776 #ifdef __ARM_BIG_ENDIAN
    777   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
    778 #else
    779   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
    780 #endif
    781 
    782   unsigned int found;
    783   const uint8_t *p;
    784   uint8x16_t data;
    785   uint8x16_t t;
    786   uint16x8_t m;
    787   uint8x16_t u, v, w;
    788 
    789   /* Align the source pointer.  */
    790   p = (const uint8_t *)((uintptr_t)s & -16);
    791 
    792   /* Assuming random string start positions, with a 4k page size we'll take
    793      the slow path about 0.37% of the time.  */
    794   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
    795 			 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
    796 			< 16, 0))
    797     {
    798       /* Slow path: the string starts near a possible page boundary.
                 Load the aligned block containing S and ignore, via MASK, the
                 bytes that precede S.  */
    799       uint32_t misalign, mask;
    800 
    801       misalign = (uintptr_t)s & 15;
    802       mask = (-1u << misalign) & 0xffff;
    803       data = vld1q_u8 (p);
    804       t = vceqq_u8 (data, repl_nl);
    805       u = vceqq_u8 (data, repl_cr);
    806       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
    807       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
    808       t = vorrq_u8 (v, w);
    809       t = vandq_u8 (t, xmask);
    810       m = vpaddlq_u8 (t);
    811       m = vshlq_u16 (m, shift);
    812       found = vaddvq_u16 (m);
    813       found &= mask;
    814       if (found)
    815 	return (const uchar*)p + __builtin_ctz (found);
    816     }
    817   else
    818     {
              /* Fast path: at least 16 bytes remain before the next possible
                 page boundary, so load 16 bytes directly from S.  */
    819       data = vld1q_u8 ((const uint8_t *) s);
    820       t = vceqq_u8 (data, repl_nl);
    821       u = vceqq_u8 (data, repl_cr);
    822       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
    823       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
    824       t = vorrq_u8 (v, w);
    825       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
    826 	goto done;
    827     }
    828 
          /* Main loop over aligned 16-byte blocks, starting with the block
             after the one P currently points at.  */
    829   do
    830     {
    831       p += 16;
    832       data = vld1q_u8 (p);
    833       t = vceqq_u8 (data, repl_nl);
    834       u = vceqq_u8 (data, repl_cr);
    835       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
    836       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
    837       t = vorrq_u8 (v, w);
    838     } while (!vpaddd_u64 ((uint64x2_t)t));
    839 
    840 done:
    841   /* Now that we've found the terminating substring, work out precisely where
    842      we need to stop.  When we arrive here from the fast path P has not
             moved past S and the match offset is relative to S; after the main
             loop it is relative to P.  */
    843   t = vandq_u8 (t, xmask);
    844   m = vpaddlq_u8 (t);
    845   m = vshlq_u16 (m, shift);
    846   found = vaddvq_u16 (m);
    847   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
    848 	  + __builtin_ctz (found));
    849 }
    850 
    851 #elif defined (__ARM_NEON)
    852 #include "arm_neon.h"
    853 
    854 static const uchar *
    855 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
    856 {
    857   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
    858   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
    859   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
    860   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
    861   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
    862 
    863   unsigned int misalign, found, mask;
    864   const uint8_t *p;
    865   uint8x16_t data;
    866 
    867   /* Align the source pointer.  */
    868   misalign = (uintptr_t)s & 15;
    869   p = (const uint8_t *)((uintptr_t)s & -16);
    870   data = vld1q_u8 (p);
    871 
    872   /* Create a mask for the bytes that are valid within the first
    873      16-byte block.  The Idea here is that the AND with the mask
    874      within the loop is "free", since we need some AND or TEST
    875      insn in order to set the flags for the branch anyway.  */
    876   mask = (-1u << misalign) & 0xffff;
    877 
    878   /* Main loop, processing 16 bytes at a time.  */
    879   goto start;
    880 
    881   do
    882     {
    883       uint8x8_t l;
    884       uint16x4_t m;
    885       uint32x2_t n;
    886       uint8x16_t t, u, v, w;
    887 
    888       p += 16;
    889       data = vld1q_u8 (p);
    890       mask = 0xffff;
    891 
    892     start:
    893       t = vceqq_u8 (data, repl_nl);
    894       u = vceqq_u8 (data, repl_cr);
    895       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
    896       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
    897       t = vandq_u8 (vorrq_u8 (v, w), xmask);
    898       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
    899       m = vpaddl_u8 (l);
    900       n = vpaddl_u16 (m);
    901 
    902       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
    903 	      vshr_n_u64 ((uint64x1_t) n, 24)), 0);
    904       found &= mask;
    905     }
    906   while (!found);
    907 
    908   /* FOUND contains 1 in bits for which we matched a relevant
    909      character.  Conversion to the byte index is trivial.  */
    910   found = __builtin_ctz (found);
    911   return (const uchar *)p + found;
    912 }
    913 
    914 #else
    915 
    916 /* We only have one accelerated alternative.  Use a direct call so that
    917    we encourage inlining.  */
    918 
    919 #define search_line_fast  search_line_acc_char
    920 
    921 #endif
    922 
/* Initialize the lexer if needed.  The only work done here is the call
   to init_vectorized_lexer, and only in builds that define
   HAVE_init_vectorized_lexer.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
    932 
    933 /* Returns with a logical line that contains no escaped newlines or
    934    trigraphs.  This is a time-critical inner loop.  */
    935 void
    936 _cpp_clean_line (cpp_reader *pfile)
    937 {
    938   cpp_buffer *buffer;
    939   const uchar *s;
    940   uchar c, *d, *p;
    941 
    942   buffer = pfile->buffer;
    943   buffer->cur_note = buffer->notes_used = 0;
    944   buffer->cur = buffer->line_base = buffer->next_line;
    945   buffer->need_line = false;
    946   s = buffer->next_line;
    947 
    948   if (!buffer->from_stage3)
    949     {
    950       const uchar *pbackslash = NULL;
    951 
    952       /* Fast path.  This is the common case of an un-escaped line with
    953 	 no trigraphs.  The primary win here is by not writing any
    954 	 data back to memory until we have to.  */
    955       while (1)
    956 	{
    957 	  /* Perform an optimized search for \n, \r, \\, ?.  */
    958 	  s = search_line_fast (s, buffer->rlimit);
    959 
    960 	  c = *s;
    961 	  if (c == '\\')
    962 	    {
    963 	      /* Record the location of the backslash and continue.  */
    964 	      pbackslash = s++;
    965 	    }
    966 	  else if (__builtin_expect (c == '?', 0))
    967 	    {
    968 	      if (__builtin_expect (s[1] == '?', false)
    969 		   && _cpp_trigraph_map[s[2]])
    970 		{
    971 		  /* Have a trigraph.  We may or may not have to convert
    972 		     it.  Add a line note regardless, for -Wtrigraphs.  */
    973 		  add_line_note (buffer, s, s[2]);
    974 		  if (CPP_OPTION (pfile, trigraphs))
    975 		    {
    976 		      /* We do, and that means we have to switch to the
    977 		         slow path.  */
    978 		      d = (uchar *) s;
    979 		      *d = _cpp_trigraph_map[s[2]];
    980 		      s += 2;
    981 		      goto slow_path;
    982 		    }
    983 		}
    984 	      /* Not a trigraph.  Continue on fast-path.  */
    985 	      s++;
    986 	    }
    987 	  else
    988 	    break;
    989 	}
    990 
    991       /* This must be \r or \n.  We're either done, or we'll be forced
    992 	 to write back to the buffer and continue on the slow path.  */
    993       d = (uchar *) s;
    994 
    995       if (__builtin_expect (s == buffer->rlimit, false))
    996 	goto done;
    997 
    998       /* DOS line ending? */
    999       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
   1000 	{
   1001 	  s++;
   1002 	  if (s == buffer->rlimit)
   1003 	    goto done;
   1004 	}
   1005 
   1006       if (__builtin_expect (pbackslash == NULL, true))
   1007 	goto done;
   1008 
   1009       /* Check for escaped newline.  */
   1010       p = d;
   1011       while (is_nvspace (p[-1]))
   1012 	p--;
   1013       if (p - 1 != pbackslash)
   1014 	goto done;
   1015 
   1016       /* Have an escaped newline; process it and proceed to
   1017 	 the slow path.  */
   1018       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
   1019       d = p - 2;
   1020       buffer->next_line = p - 1;
   1021 
   1022     slow_path:
   1023       while (1)
   1024 	{
   1025 	  c = *++s;
   1026 	  *++d = c;
   1027 
   1028 	  if (c == '\n' || c == '\r')
   1029 	    {
   1030 	      /* Handle DOS line endings.  */
   1031 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
   1032 		s++;
   1033 	      if (s == buffer->rlimit)
   1034 		break;
   1035 
   1036 	      /* Escaped?  */
   1037 	      p = d;
   1038 	      while (p != buffer->next_line && is_nvspace (p[-1]))
   1039 		p--;
   1040 	      if (p == buffer->next_line || p[-1] != '\\')
   1041 		break;
   1042 
   1043 	      add_line_note (buffer, p - 1, p != d ? ' ': '\\');
   1044 	      d = p - 2;
   1045 	      buffer->next_line = p - 1;
   1046 	    }
   1047 	  else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
   1048 	    {
   1049 	      /* Add a note regardless, for the benefit of -Wtrigraphs.  */
   1050 	      add_line_note (buffer, d, s[2]);
   1051 	      if (CPP_OPTION (pfile, trigraphs))
   1052 		{
   1053 		  *d = _cpp_trigraph_map[s[2]];
   1054 		  s += 2;
   1055 		}
   1056 	    }
   1057 	}
   1058     }
   1059   else
   1060     {
   1061       while (*s != '\n' && *s != '\r')
   1062 	s++;
   1063       d = (uchar *) s;
   1064 
   1065       /* Handle DOS line endings.  */
   1066       if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
   1067 	s++;
   1068     }
   1069 
   1070  done:
   1071   *d = '\n';
   1072   /* A sentinel note that should never be processed.  */
   1073   add_line_note (buffer, d + 1, '\n');
   1074   buffer->next_line = s + 1;
   1075 }
   1076 
   1077 /* Return true if the trigraph indicated by NOTE should be warned
   1078    about in a comment.  */
   1079 static bool
   1080 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
   1081 {
   1082   const uchar *p;
   1083 
   1084   /* Within comments we don't warn about trigraphs, unless the
   1085      trigraph forms an escaped newline, as that may change
   1086      behavior.  */
   1087   if (note->type != '/')
   1088     return false;
   1089 
   1090   /* If -trigraphs, then this was an escaped newline iff the next note
   1091      is coincident.  */
   1092   if (CPP_OPTION (pfile, trigraphs))
   1093     return note[1].pos == note->pos;
   1094 
   1095   /* Otherwise, see if this forms an escaped newline.  */
   1096   p = note->pos + 3;
   1097   while (is_nvspace (*p))
   1098     p++;
   1099 
   1100   /* There might have been escaped newlines between the trigraph and the
   1101      newline we found.  Hence the position test.  */
   1102   return (*p == '\n' && p < note[1].pos);
   1103 }
   1104 
   1105 /* Process the notes created by add_line_note as far as the current
   1106    location.  */
   1107 void
   1108 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
   1109 {
   1110   cpp_buffer *buffer = pfile->buffer;
   1111 
   1112   for (;;)
   1113     {
   1114       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
   1115       unsigned int col;
   1116 
   1117       if (note->pos > buffer->cur)
   1118 	break;
   1119 
   1120       buffer->cur_note++;
   1121       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
   1122 
   1123       if (note->type == '\\' || note->type == ' ')
   1124 	{
   1125 	  if (note->type == ' ' && !in_comment)
   1126 	    cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
   1127 				 "backslash and newline separated by space");
   1128 
   1129 	  if (buffer->next_line > buffer->rlimit)
   1130 	    {
   1131 	      cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
   1132 				   "backslash-newline at end of file");
   1133 	      /* Prevent "no newline at end of file" warning.  */
   1134 	      buffer->next_line = buffer->rlimit;
   1135 	    }
   1136 
   1137 	  buffer->line_base = note->pos;
   1138 	  CPP_INCREMENT_LINE (pfile, 0);
   1139 	}
   1140       else if (_cpp_trigraph_map[note->type])
   1141 	{
   1142 	  if (CPP_OPTION (pfile, warn_trigraphs)
   1143 	      && (!in_comment || warn_in_comment (pfile, note)))
   1144 	    {
   1145 	      if (CPP_OPTION (pfile, trigraphs))
   1146 		cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
   1147                                        pfile->line_table->highest_line, col,
   1148 				       "trigraph ??%c converted to %c",
   1149 				       note->type,
   1150 				       (int) _cpp_trigraph_map[note->type]);
   1151 	      else
   1152 		{
   1153 		  cpp_warning_with_line
   1154 		    (pfile, CPP_W_TRIGRAPHS,
   1155                      pfile->line_table->highest_line, col,
   1156 		     "trigraph ??%c ignored, use -trigraphs to enable",
   1157 		     note->type);
   1158 		}
   1159 	    }
   1160 	}
   1161       else if (note->type == 0)
   1162 	/* Already processed in lex_raw_string.  */;
   1163       else
   1164 	abort ();
   1165     }
   1166 }
   1167 
   1168 namespace bidi {
   1169   enum class kind {
   1170     NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
   1171   };
   1172 
   1173   /* All the UTF-8 encodings of bidi characters start with E2.  */
   1174   constexpr uchar utf8_start = 0xe2;
   1175 
   1176   struct context
   1177   {
   1178     context () {}
   1179     context (location_t loc, kind k, bool pdf, bool ucn)
   1180     : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
   1181     {
   1182     }
   1183 
   1184     kind get_pop_kind () const
   1185     {
   1186       return m_pdf ? kind::PDF : kind::PDI;
   1187     }
   1188     bool ucn_p () const
   1189     {
   1190       return m_ucn;
   1191     }
   1192 
   1193     location_t m_loc;
   1194     kind m_kind;
   1195     unsigned m_pdf : 1;
   1196     unsigned m_ucn : 1;
   1197   };
   1198 
   1199   /* A vector holding currently open bidi contexts.  We use a char for
   1200      each context, its LSB is 1 if it represents a PDF context, 0 if it
   1201      represents a PDI context.  The next bit is 1 if this context was open
   1202      by a bidi character written as a UCN, and 0 when it was UTF-8.  */
   1203   semi_embedded_vec <context, 16> vec;
   1204 
   1205   /* Close the whole comment/identifier/string literal/character constant
   1206      context.  */
   1207   void on_close ()
   1208   {
   1209     vec.truncate (0);
   1210   }
   1211 
   1212   /* Pop the last element in the vector.  */
   1213   void pop ()
   1214   {
   1215     unsigned int len = vec.count ();
   1216     gcc_checking_assert (len > 0);
   1217     vec.truncate (len - 1);
   1218   }
   1219 
   1220   /* Return the pop kind of the context of the Ith element.  */
   1221   kind pop_kind_at (unsigned int i)
   1222   {
   1223     return vec[i].get_pop_kind ();
   1224   }
   1225 
   1226   /* Return the pop kind of the context that is currently opened.  */
   1227   kind current_ctx ()
   1228   {
   1229     unsigned int len = vec.count ();
   1230     if (len == 0)
   1231       return kind::NONE;
   1232     return vec[len - 1].get_pop_kind ();
   1233   }
   1234 
   1235   /* Return true if the current context comes from a UCN origin, that is,
   1236      the bidi char which started this bidi context was written as a UCN.  */
   1237   bool current_ctx_ucn_p ()
   1238   {
   1239     unsigned int len = vec.count ();
   1240     gcc_checking_assert (len > 0);
   1241     return vec[len - 1].m_ucn;
   1242   }
   1243 
   1244   location_t current_ctx_loc ()
   1245   {
   1246     unsigned int len = vec.count ();
   1247     gcc_checking_assert (len > 0);
   1248     return vec[len - 1].m_loc;
   1249   }
   1250 
   1251   /* We've read a bidi char, update the current vector as necessary.
   1252      LOC is only valid when K is not kind::NONE.  */
   1253   void on_char (kind k, bool ucn_p, location_t loc)
   1254   {
   1255     switch (k)
   1256       {
   1257       case kind::LRE:
   1258       case kind::RLE:
   1259       case kind::LRO:
   1260       case kind::RLO:
   1261 	vec.push (context (loc, k, true, ucn_p));
   1262 	break;
   1263       case kind::LRI:
   1264       case kind::RLI:
   1265       case kind::FSI:
   1266 	vec.push (context (loc, k, false, ucn_p));
   1267 	break;
   1268       /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
   1269 	 whose scope has not yet been terminated.  */
   1270       case kind::PDF:
   1271 	if (current_ctx () == kind::PDF)
   1272 	  pop ();
   1273 	break;
   1274       /* PDI terminates the scope of the last LRI, RLI, or FSI whose
   1275 	 scope has not yet been terminated, as well as the scopes of
   1276 	 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
   1277 	 yet been terminated.  */
   1278       case kind::PDI:
   1279 	for (int i = vec.count () - 1; i >= 0; --i)
   1280 	  if (pop_kind_at (i) == kind::PDI)
   1281 	    {
   1282 	      vec.truncate (i);
   1283 	      break;
   1284 	    }
   1285 	break;
   1286       case kind::LTR:
   1287       case kind::RTL:
   1288 	/* These aren't popped by a PDF/PDI.  */
   1289 	break;
   1290       ATTR_LIKELY case kind::NONE:
   1291 	break;
   1292       default:
   1293 	abort ();
   1294       }
   1295   }
   1296 
   1297   /* Return a descriptive string for K.  */
   1298   const char *to_str (kind k)
   1299   {
   1300     switch (k)
   1301       {
   1302       case kind::LRE:
   1303 	return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
   1304       case kind::RLE:
   1305 	return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
   1306       case kind::LRO:
   1307 	return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
   1308       case kind::RLO:
   1309 	return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
   1310       case kind::LRI:
   1311 	return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
   1312       case kind::RLI:
   1313 	return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
   1314       case kind::FSI:
   1315 	return "U+2068 (FIRST STRONG ISOLATE)";
   1316       case kind::PDF:
   1317 	return "U+202C (POP DIRECTIONAL FORMATTING)";
   1318       case kind::PDI:
   1319 	return "U+2069 (POP DIRECTIONAL ISOLATE)";
   1320       case kind::LTR:
   1321 	return "U+200E (LEFT-TO-RIGHT MARK)";
   1322       case kind::RTL:
   1323 	return "U+200F (RIGHT-TO-LEFT MARK)";
   1324       default:
   1325 	abort ();
   1326       }
   1327   }
   1328 }
   1329 
   1330 /* Get location_t for the range of bytes [START, START + NUM_BYTES)
   1331    within the current line in FILE, with the caret at START.  */
   1332 
   1333 static location_t
   1334 get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
   1335 					 const unsigned char *const start,
   1336 					 size_t num_bytes)
   1337 {
   1338   gcc_checking_assert (num_bytes > 0);
   1339 
   1340   /* CPP_BUF_COLUMN and linemap_position_for_column both refer
   1341      to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
   1342      whereas linemap_position_for_column is 1-based.  */
   1343 
   1344   /* Get 0-based offsets within the line.  */
   1345   size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
   1346   size_t end_offset = start_offset + num_bytes - 1;
   1347 
   1348   /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
   1349   location_t start_loc = linemap_position_for_column (pfile->line_table,
   1350 						      start_offset + 1);
   1351   location_t end_loc = linemap_position_for_column (pfile->line_table,
   1352 						     end_offset + 1);
   1353 
   1354   if (start_loc == end_loc)
   1355     return start_loc;
   1356 
   1357   source_range src_range;
   1358   src_range.m_start = start_loc;
   1359   src_range.m_finish = end_loc;
   1360   location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
   1361 						   start_loc,
   1362 						   src_range,
   1363 						   NULL);
   1364   return combined_loc;
   1365 }
   1366 
   1367 /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
   1368 
   1369 static bidi::kind
   1370 get_bidi_utf8_1 (const unsigned char *const p)
   1371 {
   1372   gcc_checking_assert (p[0] == bidi::utf8_start);
   1373 
   1374   if (p[1] == 0x80)
   1375     switch (p[2])
   1376       {
   1377       case 0xaa:
   1378 	return bidi::kind::LRE;
   1379       case 0xab:
   1380 	return bidi::kind::RLE;
   1381       case 0xac:
   1382 	return bidi::kind::PDF;
   1383       case 0xad:
   1384 	return bidi::kind::LRO;
   1385       case 0xae:
   1386 	return bidi::kind::RLO;
   1387       case 0x8e:
   1388 	return bidi::kind::LTR;
   1389       case 0x8f:
   1390 	return bidi::kind::RTL;
   1391       default:
   1392 	break;
   1393       }
   1394   else if (p[1] == 0x81)
   1395     switch (p[2])
   1396       {
   1397       case 0xa6:
   1398 	return bidi::kind::LRI;
   1399       case 0xa7:
   1400 	return bidi::kind::RLI;
   1401       case 0xa8:
   1402 	return bidi::kind::FSI;
   1403       case 0xa9:
   1404 	return bidi::kind::PDI;
   1405       default:
   1406 	break;
   1407       }
   1408 
   1409   return bidi::kind::NONE;
   1410 }
   1411 
   1412 /* Parse a sequence of 3 bytes starting with P and return its bidi code.
   1413    If the kind is not NONE, write the location to *OUT.*/
   1414 
   1415 static bidi::kind
   1416 get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
   1417 {
   1418   bidi::kind result = get_bidi_utf8_1 (p);
   1419   if (result != bidi::kind::NONE)
   1420     {
   1421       /* We have a sequence of 3 bytes starting at P.  */
   1422       *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
   1423     }
   1424   return result;
   1425 }
   1426 
   1427 /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
   1428 
   1429 static bidi::kind
   1430 get_bidi_ucn_1 (const unsigned char *p, bool is_U)
   1431 {
   1432   /* 6.4.3 Universal Character Names
   1433       \u hex-quad
   1434       \U hex-quad hex-quad
   1435      where \unnnn means \U0000nnnn.  */
   1436 
   1437   if (is_U)
   1438     {
   1439       if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
   1440 	return bidi::kind::NONE;
   1441       /* Skip 4B so we can treat \u and \U the same below.  */
   1442       p += 4;
   1443     }
   1444 
   1445   /* All code points we are looking for start with 20xx.  */
   1446   if (p[0] != '2' || p[1] != '0')
   1447     return bidi::kind::NONE;
   1448   else if (p[2] == '2')
   1449     switch (p[3])
   1450       {
   1451       case 'a':
   1452       case 'A':
   1453 	return bidi::kind::LRE;
   1454       case 'b':
   1455       case 'B':
   1456 	return bidi::kind::RLE;
   1457       case 'c':
   1458       case 'C':
   1459 	return bidi::kind::PDF;
   1460       case 'd':
   1461       case 'D':
   1462 	return bidi::kind::LRO;
   1463       case 'e':
   1464       case 'E':
   1465 	return bidi::kind::RLO;
   1466       default:
   1467 	break;
   1468       }
   1469   else if (p[2] == '6')
   1470     switch (p[3])
   1471       {
   1472       case '6':
   1473 	return bidi::kind::LRI;
   1474       case '7':
   1475 	return bidi::kind::RLI;
   1476       case '8':
   1477 	return bidi::kind::FSI;
   1478       case '9':
   1479 	return bidi::kind::PDI;
   1480       default:
   1481 	break;
   1482       }
   1483   else if (p[2] == '0')
   1484     switch (p[3])
   1485       {
   1486       case 'e':
   1487       case 'E':
   1488 	return bidi::kind::LTR;
   1489       case 'f':
   1490       case 'F':
   1491 	return bidi::kind::RTL;
   1492       default:
   1493 	break;
   1494       }
   1495 
   1496   return bidi::kind::NONE;
   1497 }
   1498 
   1499 /* Parse a UCN where P points just past \u or \U and return its bidi code.
   1500    If the kind is not NONE, write the location to *OUT.*/
   1501 
   1502 static bidi::kind
   1503 get_bidi_ucn (cpp_reader *pfile,  const unsigned char *p, bool is_U,
   1504 	      location_t *out)
   1505 {
   1506   bidi::kind result = get_bidi_ucn_1 (p, is_U);
   1507   if (result != bidi::kind::NONE)
   1508     {
   1509       const unsigned char *start = p - 2;
   1510       size_t num_bytes = 2 + (is_U ? 8 : 4);
   1511       *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
   1512     }
   1513   return result;
   1514 }
   1515 
   1516 /* Subclass of rich_location for reporting on unpaired UTF-8
   1517    bidirectional control character(s).
   1518    Escape the source lines on output, and show all unclosed
   1519    bidi context, labelling everything.  */
   1520 
   1521 class unpaired_bidi_rich_location : public rich_location
   1522 {
   1523  public:
   1524   class custom_range_label : public range_label
   1525   {
   1526    public:
   1527      label_text get_text (unsigned range_idx) const FINAL OVERRIDE
   1528      {
   1529        /* range 0 is the primary location; each subsequent range i + 1
   1530 	  is for bidi::vec[i].  */
   1531        if (range_idx > 0)
   1532 	 {
   1533 	   const bidi::context &ctxt (bidi::vec[range_idx - 1]);
   1534 	   return label_text::borrow (bidi::to_str (ctxt.m_kind));
   1535 	 }
   1536        else
   1537 	 return label_text::borrow (_("end of bidirectional context"));
   1538      }
   1539   };
   1540 
   1541   unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
   1542   : rich_location (pfile->line_table, loc, &m_custom_label)
   1543   {
   1544     set_escape_on_output (true);
   1545     for (unsigned i = 0; i < bidi::vec.count (); i++)
   1546       add_range (bidi::vec[i].m_loc,
   1547 		 SHOW_RANGE_WITHOUT_CARET,
   1548 		 &m_custom_label);
   1549   }
   1550 
   1551  private:
   1552    custom_range_label m_custom_label;
   1553 };
   1554 
   1555 /* We're closing a bidi context, that is, we've encountered a newline,
   1556    are closing a C-style comment, or are at the end of a string literal,
   1557    character constant, or identifier.  Warn if this context was not
   1558    properly terminated by a PDI or PDF.  P points to the last character
   1559    in this context.  */
   1560 
   1561 static void
   1562 maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
   1563 {
   1564   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
   1565   if (bidi::vec.count () > 0
   1566       && (warn_bidi & bidirectional_unpaired
   1567 	  && (!bidi::current_ctx_ucn_p ()
   1568 	      || (warn_bidi & bidirectional_ucn))))
   1569     {
   1570       const location_t loc
   1571 	= linemap_position_for_column (pfile->line_table,
   1572 				       CPP_BUF_COLUMN (pfile->buffer, p));
   1573       unpaired_bidi_rich_location rich_loc (pfile, loc);
   1574       /* cpp_callbacks doesn't yet have a way to handle singular vs plural
   1575 	 forms of a diagnostic, so fake it for now.  */
   1576       if (bidi::vec.count () > 1)
   1577 	cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1578 			"unpaired UTF-8 bidirectional control characters "
   1579 			"detected");
   1580       else
   1581 	cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1582 			"unpaired UTF-8 bidirectional control character "
   1583 			"detected");
   1584     }
   1585   /* We're done with this context.  */
   1586   bidi::on_close ();
   1587 }
   1588 
   1589 /* We're at the beginning or in the middle of an identifier/comment/string
   1590    literal/character constant.  Warn if we've encountered a bidi character.
   1591    KIND says which bidi control character it was; UCN_P is true iff this bidi
   1592    control character was written as a UCN.  LOC is the location of the
   1593    character, but is only valid if KIND != bidi::kind::NONE.  */
   1594 
   1595 static void
   1596 maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
   1597 			 bool ucn_p, location_t loc)
   1598 {
   1599   if (__builtin_expect (kind == bidi::kind::NONE, 1))
   1600     return;
   1601 
   1602   const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
   1603 
   1604   if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
   1605     {
   1606       rich_location rich_loc (pfile->line_table, loc);
   1607       rich_loc.set_escape_on_output (true);
   1608 
   1609       /* It seems excessive to warn about a PDI/PDF that is closing
   1610 	 an opened context because we've already warned about the
   1611 	 opening character.  Except warn when we have a UCN x UTF-8
   1612 	 mismatch, if UCN checking is enabled.  */
   1613       if (kind == bidi::current_ctx ())
   1614 	{
   1615 	  if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
   1616 	      && bidi::current_ctx_ucn_p () != ucn_p)
   1617 	    {
   1618 	      rich_loc.add_range (bidi::current_ctx_loc ());
   1619 	      cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1620 			      "UTF-8 vs UCN mismatch when closing "
   1621 			      "a context by \"%s\"", bidi::to_str (kind));
   1622 	    }
   1623 	}
   1624       else if (warn_bidi & bidirectional_any
   1625 	       && (!ucn_p || (warn_bidi & bidirectional_ucn)))
   1626 	{
   1627 	  if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
   1628 	    cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1629 			    "\"%s\" is closing an unopened context",
   1630 			    bidi::to_str (kind));
   1631 	  else
   1632 	    cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
   1633 			    "found problematic Unicode character \"%s\"",
   1634 			    bidi::to_str (kind));
   1635 	}
   1636     }
   1637   /* We're done with this context.  */
   1638   bidi::on_char (kind, ucn_p, loc);
   1639 }
   1640 
   1641 /* Skip a C-style block comment.  We find the end of the comment by
   1642    seeing if an asterisk is before every '/' we encounter.  Returns
   1643    nonzero if comment terminated by EOF, zero otherwise.
   1644 
   1645    Buffer->cur points to the initial asterisk of the comment.  */
   1646 bool
   1647 _cpp_skip_block_comment (cpp_reader *pfile)
   1648 {
   1649   cpp_buffer *buffer = pfile->buffer;
   1650   const uchar *cur = buffer->cur;
   1651   uchar c;
   1652   const bool warn_bidi_p = pfile->warn_bidi_p ();
   1653 
   1654   cur++;
   1655   if (*cur == '/')
   1656     cur++;
   1657 
   1658   for (;;)
   1659     {
   1660       /* People like decorating comments with '*', so check for '/'
   1661 	 instead for efficiency.  */
   1662       c = *cur++;
   1663 
   1664       if (c == '/')
   1665 	{
   1666 	  if (cur[-2] == '*')
   1667 	    {
   1668 	      if (warn_bidi_p)
   1669 		maybe_warn_bidi_on_close (pfile, cur);
   1670 	      break;
   1671 	    }
   1672 
   1673 	  /* Warn about potential nested comments, but not if the '/'
   1674 	     comes immediately before the true comment delimiter.
   1675 	     Don't bother to get it right across escaped newlines.  */
   1676 	  if (CPP_OPTION (pfile, warn_comments)
   1677 	      && cur[0] == '*' && cur[1] != '/')
   1678 	    {
   1679 	      buffer->cur = cur;
   1680 	      cpp_warning_with_line (pfile, CPP_W_COMMENTS,
   1681 				     pfile->line_table->highest_line,
   1682 				     CPP_BUF_COL (buffer),
   1683 				     "\"/*\" within comment");
   1684 	    }
   1685 	}
   1686       else if (c == '\n')
   1687 	{
   1688 	  unsigned int cols;
   1689 	  buffer->cur = cur - 1;
   1690 	  if (warn_bidi_p)
   1691 	    maybe_warn_bidi_on_close (pfile, cur);
   1692 	  _cpp_process_line_notes (pfile, true);
   1693 	  if (buffer->next_line >= buffer->rlimit)
   1694 	    return true;
   1695 	  _cpp_clean_line (pfile);
   1696 
   1697 	  cols = buffer->next_line - buffer->line_base;
   1698 	  CPP_INCREMENT_LINE (pfile, cols);
   1699 
   1700 	  cur = buffer->cur;
   1701 	}
   1702       /* If this is a beginning of a UTF-8 encoding, it might be
   1703 	 a bidirectional control character.  */
   1704       else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
   1705 	{
   1706 	  location_t loc;
   1707 	  bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
   1708 	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   1709 	}
   1710     }
   1711 
   1712   buffer->cur = cur;
   1713   _cpp_process_line_notes (pfile, true);
   1714   return false;
   1715 }
   1716 
   1717 /* Skip a C++ line comment, leaving buffer->cur pointing to the
   1718    terminating newline.  Handles escaped newlines.  Returns nonzero
   1719    if a multiline comment.  */
   1720 static int
   1721 skip_line_comment (cpp_reader *pfile)
   1722 {
   1723   cpp_buffer *buffer = pfile->buffer;
   1724   location_t orig_line = pfile->line_table->highest_line;
   1725   const bool warn_bidi_p = pfile->warn_bidi_p ();
   1726 
   1727   if (!warn_bidi_p)
   1728     while (*buffer->cur != '\n')
   1729       buffer->cur++;
   1730   else
   1731     {
   1732       while (*buffer->cur != '\n'
   1733 	     && *buffer->cur != bidi::utf8_start)
   1734 	buffer->cur++;
   1735       if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
   1736 	{
   1737 	  while (*buffer->cur != '\n')
   1738 	    {
   1739 	      if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
   1740 		{
   1741 		  location_t loc;
   1742 		  bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
   1743 		  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   1744 		}
   1745 	      buffer->cur++;
   1746 	    }
   1747 	  maybe_warn_bidi_on_close (pfile, buffer->cur);
   1748 	}
   1749     }
   1750 
   1751   _cpp_process_line_notes (pfile, true);
   1752   return orig_line != pfile->line_table->highest_line;
   1753 }
   1754 
   1755 /* Skips whitespace, saving the next non-whitespace character.  */
   1756 static void
   1757 skip_whitespace (cpp_reader *pfile, cppchar_t c)
   1758 {
   1759   cpp_buffer *buffer = pfile->buffer;
   1760   bool saw_NUL = false;
   1761 
   1762   do
   1763     {
   1764       /* Horizontal space always OK.  */
   1765       if (c == ' ' || c == '\t')
   1766 	;
   1767       /* Just \f \v or \0 left.  */
   1768       else if (c == '\0')
   1769 	saw_NUL = true;
   1770       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
   1771 	cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
   1772 			     CPP_BUF_COL (buffer),
   1773 			     "%s in preprocessing directive",
   1774 			     c == '\f' ? "form feed" : "vertical tab");
   1775 
   1776       c = *buffer->cur++;
   1777     }
   1778   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
   1779   while (is_nvspace (c));
   1780 
   1781   if (saw_NUL)
   1782     {
   1783       encoding_rich_location rich_loc (pfile);
   1784       cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
   1785 		    "null character(s) ignored");
   1786     }
   1787 
   1788   buffer->cur--;
   1789 }
   1790 
   1791 /* See if the characters of a number token are valid in a name (no
   1792    '.', '+' or '-').  */
   1793 static int
   1794 name_p (cpp_reader *pfile, const cpp_string *string)
   1795 {
   1796   unsigned int i;
   1797 
   1798   for (i = 0; i < string->len; i++)
   1799     if (!is_idchar (string->text[i]))
   1800       return 0;
   1801 
   1802   return 1;
   1803 }
   1804 
   1805 /* After parsing an identifier or other sequence, produce a warning about
   1806    sequences not in NFC/NFKC.  */
   1807 static void
   1808 warn_about_normalization (cpp_reader *pfile,
   1809 			  const cpp_token *token,
   1810 			  const struct normalize_state *s)
   1811 {
   1812   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
   1813       && !pfile->state.skipping)
   1814     {
   1815       location_t loc = token->src_loc;
   1816 
   1817       /* If possible, create a location range for the token.  */
   1818       if (loc >= RESERVED_LOCATION_COUNT
   1819 	  && token->type != CPP_EOF
   1820 	  /* There must be no line notes to process.  */
   1821 	  && (!(pfile->buffer->cur
   1822 		>= pfile->buffer->notes[pfile->buffer->cur_note].pos
   1823 		&& !pfile->overlaid_buffer)))
   1824 	{
   1825 	  source_range tok_range;
   1826 	  tok_range.m_start = loc;
   1827 	  tok_range.m_finish
   1828 	    = linemap_position_for_column (pfile->line_table,
   1829 					   CPP_BUF_COLUMN (pfile->buffer,
   1830 							   pfile->buffer->cur));
   1831 	  loc = COMBINE_LOCATION_DATA (pfile->line_table,
   1832 				       loc, tok_range, NULL);
   1833 	}
   1834 
   1835       encoding_rich_location rich_loc (pfile, loc);
   1836 
   1837       /* Make sure that the token is printed using UCNs, even
   1838 	 if we'd otherwise happily print UTF-8.  */
   1839       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
   1840       size_t sz;
   1841 
   1842       sz = cpp_spell_token (pfile, token, buf, false) - buf;
   1843       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
   1844 	cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
   1845 			"`%.*s' is not in NFKC", (int) sz, buf);
   1846       else if (CPP_OPTION (pfile, cplusplus))
   1847 	cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
   1848 				  "`%.*s' is not in NFC", (int) sz, buf);
   1849       else
   1850 	cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
   1851 			"`%.*s' is not in NFC", (int) sz, buf);
   1852       free (buf);
   1853     }
   1854 }
   1855 
   1856 static const cppchar_t utf8_signifier = 0xC0;
   1857 
   1858 /* Returns TRUE if the sequence starting at buffer->cur is valid in
   1859    an identifier.  FIRST is TRUE if this starts an identifier.  */
   1860 
   1861 static bool
   1862 forms_identifier_p (cpp_reader *pfile, int first,
   1863 		    struct normalize_state *state)
   1864 {
   1865   cpp_buffer *buffer = pfile->buffer;
   1866   const bool warn_bidi_p = pfile->warn_bidi_p ();
   1867 
   1868   if (*buffer->cur == '$')
   1869     {
   1870       if (!CPP_OPTION (pfile, dollars_in_ident))
   1871 	return false;
   1872 
   1873       buffer->cur++;
   1874       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
   1875 	{
   1876 	  CPP_OPTION (pfile, warn_dollars) = 0;
   1877 	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
   1878 	}
   1879 
   1880       return true;
   1881     }
   1882 
   1883   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
   1884   if (CPP_OPTION (pfile, extended_identifiers))
   1885     {
   1886       cppchar_t s;
   1887       if (*buffer->cur >= utf8_signifier)
   1888 	{
   1889 	  if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
   1890 	      && warn_bidi_p)
   1891 	    {
   1892 	      location_t loc;
   1893 	      bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
   1894 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   1895 	    }
   1896 	  if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
   1897 			       state, &s))
   1898 	    return true;
   1899 	}
   1900       else if (*buffer->cur == '\\'
   1901 	       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
   1902 	{
   1903 	  buffer->cur += 2;
   1904 	  if (warn_bidi_p)
   1905 	    {
   1906 	      location_t loc;
   1907 	      bidi::kind kind = get_bidi_ucn (pfile,
   1908 					      buffer->cur,
   1909 					      buffer->cur[-1] == 'U',
   1910 					      &loc);
   1911 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
   1912 	    }
   1913 	  if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
   1914 			      state, &s, NULL, NULL))
   1915 	    return true;
   1916 	  buffer->cur -= 2;
   1917 	}
   1918     }
   1919 
   1920   return false;
   1921 }
   1922 
   1923 /* Helper function to issue error about improper __VA_OPT__ use.  */
   1924 static void
   1925 maybe_va_opt_error (cpp_reader *pfile)
   1926 {
   1927   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
   1928     {
   1929       /* __VA_OPT__ should not be accepted at all, but allow it in
   1930 	 system headers.  */
   1931       if (!_cpp_in_system_header (pfile))
   1932 	cpp_error (pfile, CPP_DL_PEDWARN,
   1933 		   "__VA_OPT__ is not available until C++20");
   1934     }
   1935   else if (!pfile->state.va_args_ok)
   1936     {
   1937       /* __VA_OPT__ should only appear in the replacement list of a
   1938 	 variadic macro.  */
   1939       cpp_error (pfile, CPP_DL_PEDWARN,
   1940 		 "__VA_OPT__ can only appear in the expansion"
   1941 		 " of a C++20 variadic macro");
   1942     }
   1943 }
   1944 
   1945 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
   1946 static cpp_hashnode *
   1947 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
   1948 {
   1949   cpp_hashnode *result;
   1950   const uchar *cur;
   1951   unsigned int len;
   1952   unsigned int hash = HT_HASHSTEP (0, *base);
   1953 
   1954   cur = base + 1;
   1955   while (ISIDNUM (*cur))
   1956     {
   1957       hash = HT_HASHSTEP (hash, *cur);
   1958       cur++;
   1959     }
   1960   len = cur - base;
   1961   hash = HT_HASHFINISH (hash, len);
   1962   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
   1963 					      base, len, hash, HT_ALLOC));
   1964 
   1965   /* Rarely, identifiers require diagnostics when lexed.  */
   1966   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
   1967 			&& !pfile->state.skipping, 0))
   1968     {
   1969       /* It is allowed to poison the same identifier twice.  */
   1970       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
   1971 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
   1972 		   NODE_NAME (result));
   1973 
   1974       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
   1975 	 replacement list of a variadic macro.  */
   1976       if (result == pfile->spec_nodes.n__VA_ARGS__
   1977 	  && !pfile->state.va_args_ok)
   1978 	{
   1979 	  if (CPP_OPTION (pfile, cplusplus))
   1980 	    cpp_error (pfile, CPP_DL_PEDWARN,
   1981 		       "__VA_ARGS__ can only appear in the expansion"
   1982 		       " of a C++11 variadic macro");
   1983 	  else
   1984 	    cpp_error (pfile, CPP_DL_PEDWARN,
   1985 		       "__VA_ARGS__ can only appear in the expansion"
   1986 		       " of a C99 variadic macro");
   1987 	}
   1988 
   1989       if (result == pfile->spec_nodes.n__VA_OPT__)
   1990 	maybe_va_opt_error (pfile);
   1991 
   1992       /* For -Wc++-compat, warn about use of C++ named operators.  */
   1993       if (result->flags & NODE_WARN_OPERATOR)
   1994 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
   1995 		     "identifier \"%s\" is a special operator name in C++",
   1996 		     NODE_NAME (result));
   1997     }
   1998 
   1999   return result;
   2000 }
   2001 
   2002 /* Get the cpp_hashnode of an identifier specified by NAME in
   2003    the current cpp_reader object.  If none is found, NULL is returned.  */
   2004 cpp_hashnode *
   2005 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
   2006 {
   2007   cpp_hashnode *result;
   2008   result = lex_identifier_intern (pfile, (uchar *) name);
   2009   return result;
   2010 }
   2011 
   2012 /* Lex an identifier starting at BUFFER->CUR - 1.  */
   2013 static cpp_hashnode *
   2014 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
   2015 		struct normalize_state *nst, cpp_hashnode **spelling)
   2016 {
   2017   cpp_hashnode *result;
   2018   const uchar *cur;
   2019   unsigned int len;
   2020   unsigned int hash = HT_HASHSTEP (0, *base);
   2021   const bool warn_bidi_p = pfile->warn_bidi_p ();
   2022 
   2023   cur = pfile->buffer->cur;
   2024   if (! starts_ucn)
   2025     {
   2026       while (ISIDNUM (*cur))
   2027 	{
   2028 	  hash = HT_HASHSTEP (hash, *cur);
   2029 	  cur++;
   2030 	}
   2031       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
   2032     }
   2033   pfile->buffer->cur = cur;
   2034   if (starts_ucn || forms_identifier_p (pfile, false, nst))
   2035     {
   2036       /* Slower version for identifiers containing UCNs
   2037 	 or extended chars (including $).  */
   2038       do {
   2039 	while (ISIDNUM (*pfile->buffer->cur))
   2040 	  {
   2041 	    NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
   2042 	    pfile->buffer->cur++;
   2043 	  }
   2044       } while (forms_identifier_p (pfile, false, nst));
   2045       if (warn_bidi_p)
   2046 	maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
   2047       result = _cpp_interpret_identifier (pfile, base,
   2048 					  pfile->buffer->cur - base);
   2049       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
   2050     }
   2051   else
   2052     {
   2053       len = cur - base;
   2054       hash = HT_HASHFINISH (hash, len);
   2055 
   2056       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
   2057 						  base, len, hash, HT_ALLOC));
   2058       *spelling = result;
   2059     }
   2060 
   2061   /* Rarely, identifiers require diagnostics when lexed.  */
   2062   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
   2063 			&& !pfile->state.skipping, 0))
   2064     {
   2065       /* It is allowed to poison the same identifier twice.  */
   2066       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
   2067 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
   2068 		   NODE_NAME (result));
   2069 
   2070       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
   2071 	 replacement list of a variadic macro.  */
   2072       if (result == pfile->spec_nodes.n__VA_ARGS__
   2073 	  && !pfile->state.va_args_ok)
   2074 	{
   2075 	  if (CPP_OPTION (pfile, cplusplus))
   2076 	    cpp_error (pfile, CPP_DL_PEDWARN,
   2077 		       "__VA_ARGS__ can only appear in the expansion"
   2078 		       " of a C++11 variadic macro");
   2079 	  else
   2080 	    cpp_error (pfile, CPP_DL_PEDWARN,
   2081 		       "__VA_ARGS__ can only appear in the expansion"
   2082 		       " of a C99 variadic macro");
   2083 	}
   2084 
   2085       /* __VA_OPT__ should only appear in the replacement list of a
   2086 	 variadic macro.  */
   2087       if (result == pfile->spec_nodes.n__VA_OPT__)
   2088 	maybe_va_opt_error (pfile);
   2089 
   2090       /* For -Wc++-compat, warn about use of C++ named operators.  */
   2091       if (result->flags & NODE_WARN_OPERATOR)
   2092 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
   2093 		     "identifier \"%s\" is a special operator name in C++",
   2094 		     NODE_NAME (result));
   2095     }
   2096 
   2097   return result;
   2098 }
   2099 
   2100 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
   2101 static void
   2102 lex_number (cpp_reader *pfile, cpp_string *number,
   2103 	    struct normalize_state *nst)
   2104 {
   2105   const uchar *cur;
   2106   const uchar *base;
   2107   uchar *dest;
   2108 
   2109   base = pfile->buffer->cur - 1;
   2110   do
   2111     {
   2112       const uchar *adj_digit_sep = NULL;
   2113       cur = pfile->buffer->cur;
   2114 
   2115       /* N.B. ISIDNUM does not include $.  */
   2116       while (ISIDNUM (*cur)
   2117 	     || (*cur == '.' && !DIGIT_SEP (cur[-1]))
   2118 	     || DIGIT_SEP (*cur)
   2119 	     || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
   2120 	{
   2121 	  NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
   2122 	  /* Adjacent digit separators do not form part of the pp-number syntax.
   2123 	     However, they can safely be diagnosed here as an error, since '' is
   2124 	     not a valid preprocessing token.  */
   2125 	  if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
   2126 	    adj_digit_sep = cur;
   2127 	  cur++;
   2128 	}
   2129       /* A number can't end with a digit separator.  */
   2130       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
   2131 	--cur;
   2132       if (adj_digit_sep && adj_digit_sep < cur)
   2133 	cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
   2134 
   2135       pfile->buffer->cur = cur;
   2136     }
   2137   while (forms_identifier_p (pfile, false, nst));
   2138 
   2139   number->len = cur - base;
   2140   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
   2141   memcpy (dest, base, number->len);
   2142   dest[number->len] = '\0';
   2143   number->text = dest;
   2144 }
   2145 
   2146 /* Create a token of type TYPE with a literal spelling.  */
   2147 static void
   2148 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
   2149 		unsigned int len, enum cpp_ttype type)
   2150 {
   2151   token->type = type;
   2152   token->val.str.len = len;
   2153   token->val.str.text = cpp_alloc_token_string (pfile, base, len);
   2154 }
   2155 
   2156 const uchar *
   2157 cpp_alloc_token_string (cpp_reader *pfile,
   2158 			const unsigned char *ptr, unsigned len)
   2159 {
   2160   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
   2161 
   2162   dest[len] = 0;
   2163   memcpy (dest, ptr, len);
   2164   return dest;
   2165 }
   2166 
   2167 /* A pair of raw buffer pointers.  The currently open one is [1], the
   2168    first one is [0].  Used for string literal lexing.  */
   2169 struct lit_accum {
   2170   _cpp_buff *first;
   2171   _cpp_buff *last;
   2172   const uchar *rpos;
   2173   size_t accum;
   2174 
   2175   lit_accum ()
   2176     : first (NULL), last (NULL), rpos (0), accum (0)
   2177   {
   2178   }
   2179 
   2180   void append (cpp_reader *, const uchar *, size_t);
   2181 
   2182   void read_begin (cpp_reader *);
   2183   bool reading_p () const
   2184   {
   2185     return rpos != NULL;
   2186   }
   2187   char read_char ()
   2188   {
   2189     char c = *rpos++;
   2190     if (rpos == BUFF_FRONT (last))
   2191       rpos = NULL;
   2192     return c;
   2193   }
   2194 };
   2195 
   2196 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
   2197    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
   2198 
   2199 void
   2200 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
   2201 {
   2202   if (!last)
   2203     /* Starting.  */
   2204     first = last = _cpp_get_buff (pfile, len);
   2205   else if (len > BUFF_ROOM (last))
   2206     {
   2207       /* There is insufficient room in the buffer.  Copy what we can,
   2208 	 and then either extend or create a new one.  */
   2209       size_t room = BUFF_ROOM (last);
   2210       memcpy (BUFF_FRONT (last), base, room);
   2211       BUFF_FRONT (last) += room;
   2212       base += room;
   2213       len -= room;
   2214       accum += room;
   2215 
   2216       gcc_checking_assert (!rpos);
   2217 
   2218       last = _cpp_append_extend_buff (pfile, last, len);
   2219     }
   2220 
   2221   memcpy (BUFF_FRONT (last), base, len);
   2222   BUFF_FRONT (last) += len;
   2223   accum += len;
   2224 }
   2225 
   2226 void
   2227 lit_accum::read_begin (cpp_reader *pfile)
   2228 {
   2229   /* We never accumulate more than 4 chars to read.  */
   2230   if (BUFF_ROOM (last) < 4)
   2231 
   2232     last = _cpp_append_extend_buff (pfile, last, 4);
   2233   rpos = BUFF_FRONT (last);
   2234 }
   2235 
   2236 /* Returns true if a macro has been defined.
   2237    This might not work if compile with -save-temps,
   2238    or preprocess separately from compilation.  */
   2239 
   2240 static bool
   2241 is_macro(cpp_reader *pfile, const uchar *base)
   2242 {
   2243   const uchar *cur = base;
   2244   if (! ISIDST (*cur))
   2245     return false;
   2246   unsigned int hash = HT_HASHSTEP (0, *cur);
   2247   ++cur;
   2248   while (ISIDNUM (*cur))
   2249     {
   2250       hash = HT_HASHSTEP (hash, *cur);
   2251       ++cur;
   2252     }
   2253   hash = HT_HASHFINISH (hash, cur - base);
   2254 
   2255   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
   2256 					base, cur - base, hash, HT_NO_INSERT));
   2257 
   2258   return result && cpp_macro_p (result);
   2259 }
   2260 
   2261 /* Returns true if a literal suffix does not have the expected form
   2262    and is defined as a macro.  */
   2263 
   2264 static bool
   2265 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
   2266 {
   2267   /* User-defined literals outside of namespace std must start with a single
   2268      underscore, so assume anything of that form really is a UDL suffix.
   2269      We don't need to worry about UDLs defined inside namespace std because
   2270      their names are reserved, so cannot be used as macro names in valid
   2271      programs.  */
   2272   if (base[0] == '_' && base[1] != '_')
   2273     return false;
   2274   return is_macro (pfile, base);
   2275 }
   2276 
   2277 /* Lexes a raw string.  The stored string contains the spelling,
   2278    including double quotes, delimiter string, '(' and ')', any leading
   2279    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
   2280    the type of the literal, or CPP_OTHER if it was not properly
   2281    terminated.
   2282 
   2283    BASE is the start of the token.  Updates pfile->buffer->cur to just
   2284    after the lexed string.
   2285 
           If the input runs out before the string is closed, TOKEN is
           instead turned into a CPP_EOF token, an "unterminated raw string"
           error is reported and the current buffer is popped.

   2286    The spelling is NUL-terminated, but it is not guaranteed that this
   2287    is the first NUL since embedded NULs are preserved.  */
   2288 
   2289 static void
   2290 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
   2291 {
   2292   const uchar *pos = base;
   2293   const bool warn_bidi_p = pfile->warn_bidi_p ();
   2294 
   2295   /* 'tis a pity this information isn't passed down from the lexer's
   2296      initial categorization of the token.  */
   2297   enum cpp_ttype type = CPP_STRING;
   2298 
   2299   if (*pos == 'L')
   2300     {
   2301       type = CPP_WSTRING;
   2302       pos++;
   2303     }
   2304   else if (*pos == 'U')
   2305     {
   2306       type = CPP_STRING32;
   2307       pos++;
   2308     }
   2309   else if (*pos == 'u')
   2310     {
   2311       if (pos[1] == '8')
   2312 	{
   2313 	  type = CPP_UTF8STRING;
   2314 	  pos++;
   2315 	}
   2316       else
   2317 	type = CPP_STRING16;
   2318       pos++;
   2319     }
   2320 
   2321   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
   2322   pos += 2;
   2323 
   2324   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
   2325 
   2326   /* Skip notes before the ".  */
   2327   while (note->pos < pos)
   2328     ++note;
   2329 
   2330   lit_accum accum;
   2331 
          /* PREFIX collects the delimiter (at most 16 characters) and then
             the '"' that is stored after it once '(' is seen.  */
   2332   uchar prefix[17];
   2333   unsigned prefix_len = 0;
          /* PHASE_PREFIX: still reading the delimiter.  PHASE_NONE: in the
             body of the string.  PHASE_SUFFIX and above: index into PREFIX
             of the next character expected while matching the closing
             delimiter that follows a ')'.  */
   2334   enum Phase
   2335   {
   2336    PHASE_PREFIX = -2,
   2337    PHASE_NONE = -1,
   2338    PHASE_SUFFIX = 0
   2339   } phase = PHASE_PREFIX;
   2340 
   2341   for (;;)
   2342     {
   2343       gcc_checking_assert (note->pos >= pos);
   2344 
   2345       /* Undo any escaped newlines and trigraphs.  */
   2346       if (!accum.reading_p () && note->pos == pos)
   2347 	switch (note->type)
   2348 	  {
   2349 	  case '\\':
   2350 	  case ' ':
   2351 	    /* Restore backslash followed by newline.  */
   2352 	    accum.append (pfile, base, pos - base);
   2353 	    base = pos;
   2354 	    accum.read_begin (pfile);
   2355 	    accum.append (pfile, UC"\\", 1);
   2356 
   2357 	  after_backslash:
   2358 	    if (note->type == ' ')
   2359 	      /* GNU backslash whitespace newline extension.  FIXME
   2360 		 could be any sequence of non-vertical space.  When we
   2361 		 can properly restore any such sequence, we should
   2362 		 mark this note as handled so _cpp_process_line_notes
   2363 		 doesn't warn.  */
   2364 	      accum.append (pfile, UC" ", 1);
   2365 
   2366 	    accum.append (pfile, UC"\n", 1);
   2367 	    note++;
   2368 	    break;
   2369 
   2370 	  case '\n':
   2371 	    /* This can happen for ??/<NEWLINE> when trigraphs are not
   2372 	       being interpreted.  */
   2373 	    gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
   2374 	    note->type = 0;
   2375 	    note++;
   2376 	    break;
   2377 
   2378 	  default:
   2379 	    gcc_checking_assert (_cpp_trigraph_map[note->type]);
   2380 
   2381 	    /* Don't warn about this trigraph in
   2382 	       _cpp_process_line_notes, since trigraphs show up as
   2383 	       trigraphs in raw strings.  */
   2384 	    uchar type = note->type;
   2385 	    note->type = 0;
   2386 
   2387 	    if (CPP_OPTION (pfile, trigraphs))
   2388 	      {
   2389 		accum.append (pfile, base, pos - base);
   2390 		base = pos;
   2391 		accum.read_begin (pfile);
   2392 		accum.append (pfile, UC"??", 2);
   2393 		accum.append (pfile, &type, 1);
   2394 
   2395 		/* ??/ followed by newline gets two line notes, one for
   2396 		   the trigraph and one for the backslash/newline.  */
   2397 		if (type == '/' && note[1].pos == pos)
   2398 		  {
   2399 		    note++;
   2400 		    gcc_assert (note->type == '\\' || note->type == ' ');
   2401 		    goto after_backslash;
   2402 		  }
   2403 		/* Skip the replacement character.  */
   2404 		base = ++pos;
   2405 	      }
   2406 
   2407 	    note++;
   2408 	    break;
   2409 	  }
   2410 
   2411       /* Now get a char to process.  Either from an expanded note, or
   2412 	 from the line buffer.  */
   2413       bool read_note = accum.reading_p ();
   2414       char c = read_note ? accum.read_char () : *pos++;
   2415 
   2416       if (phase == PHASE_PREFIX)
   2417 	{
   2418 	  if (c == '(')
   2419 	    {
   2420 	      /* Done.  */
   2421 	      phase = PHASE_NONE;
   2422 	      prefix[prefix_len++] = '"';
   2423 	    }
   2424 	  else if (prefix_len < 16
   2425 		   /* Prefix chars are any of the basic character set,
   2426 		      [lex.charset] except for '
   2427 		      ()\\\t\v\f\n'. Optimized for a contiguous
   2428 		      alphabet.  */
   2429 		   /* Unlike a switch, this collapses down to one or
   2430 		      two shift and bitmask operations on an ASCII
   2431 		      system, with an outlier or two.   */
   2432 		   && (('Z' - 'A' == 25
   2433 			? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
   2434 			: ISIDST (c))
   2435 		       || (c >= '0' && c <= '9')
   2436 		       || c == '_' || c == '{' || c == '}'
   2437 		       || c == '[' || c == ']' || c == '#'
   2438 		       || c == '<' || c == '>' || c == '%'
   2439 		       || c == ':' || c == ';' || c == '.' || c == '?'
   2440 		       || c == '*' || c == '+' || c == '-' || c == '/'
   2441 		       || c == '^' || c == '&' || c == '|' || c == '~'
   2442 		       || c == '!' || c == '=' || c == ','
   2443 		       || c == '"' || c == '\''))
   2444 	    prefix[prefix_len++] = c;
   2445 	  else
   2446 	    {
   2447 	      /* Something is wrong.  */
   2448 	      int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
   2449 	      if (prefix_len == 16)
   2450 		cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
   2451 				     col, "raw string delimiter longer "
   2452 				     "than 16 characters");
   2453 	      else if (c == '\n')
   2454 		cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
   2455 				     col, "invalid new-line in raw "
   2456 				     "string delimiter");
   2457 	      else
   2458 		cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
   2459 				     col, "invalid character '%c' in "
   2460 				     "raw string delimiter", c);
   2461 	      type = CPP_OTHER;
   2462 	      phase = PHASE_NONE;
   2463 	      /* Continue until we get a close quote, that's probably
   2464 		 the best failure mode.  */
   2465 	      prefix_len = 0;
   2466 	    }
        	  /* A newline is not consumed here; it falls through so that
        	     the end-of-line handling below sees it.  */
   2467 	  if (c != '\n')
   2468 	    continue;
   2469 	}
   2470 
              /* A ')' was seen earlier: compare C with the next expected
        	 character of PREFIX.  The loop ends once all of PREFIX,
        	 including its trailing '"', has been matched; on a mismatch
        	 C is handled as ordinary string content below.  */
   2471       if (phase != PHASE_NONE)
   2472 	{
   2473 	  if (prefix[phase] != c)
   2474 	    phase = PHASE_NONE;
   2475 	  else if (unsigned (phase + 1) == prefix_len)
   2476 	    break;
   2477 	  else
   2478 	    {
   2479 	      phase = Phase (phase + 1);
   2480 	      continue;
   2481 	    }
   2482 	}
   2483 
   2484       if (!prefix_len && c == '"')
   2485 	/* Failure mode lexing.  */
   2486 	goto out;
   2487       else if (prefix_len && c == ')')
   2488 	phase = PHASE_SUFFIX;
              /* A newline taken from the line buffer itself rather than
        	 read back from an expanded note: the string carries on
        	 onto the next line, if we are allowed to fetch one.  */
   2489       else if (!read_note && c == '\n')
   2490 	{
   2491 	  pos--;
   2492 	  pfile->buffer->cur = pos;
   2493 	  if (pfile->state.in_directive
   2494 	      || (pfile->state.parsing_args
   2495 		  && pfile->buffer->next_line >= pfile->buffer->rlimit))
   2496 	    {
   2497 	      cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
   2498 				   "unterminated raw string");
   2499 	      type = CPP_OTHER;
   2500 	      goto out;
   2501 	    }
   2502 
        	  /* Save everything lexed on this line, newline included,
        	     before asking for a fresh line.  */
   2503 	  accum.append (pfile, base, pos - base + 1);
   2504 	  _cpp_process_line_notes (pfile, false);
   2505 
   2506 	  if (pfile->buffer->next_line < pfile->buffer->rlimit)
   2507 	    CPP_INCREMENT_LINE (pfile, 0);
   2508 	  pfile->buffer->need_line = true;
   2509 
   2510 	  if (!_cpp_get_fresh_line (pfile))
   2511 	    {
   2512 	      /* We ran out of file and failed to get a line.  */
   2513 	      location_t src_loc = token->src_loc;
   2514 	      token->type = CPP_EOF;
   2515 	      /* Tell the compiler the line number of the EOF token.  */
   2516 	      token->src_loc = pfile->line_table->highest_line;
   2517 	      token->flags = BOL;
   2518 	      if (accum.first)
   2519 		_cpp_release_buff (pfile, accum.first);
   2520 	      cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
   2521 				   "unterminated raw string");
   2522 	      /* Now pop the buffer that _cpp_get_fresh_line did not.  */
   2523 	      _cpp_pop_buffer (pfile);
   2524 	      return;
   2525 	    }
   2526 
        	  /* Restart scanning, and the line-note cursor, on the new
        	     line.  */
   2527 	  pos = base = pfile->buffer->cur;
   2528 	  note = &pfile->buffer->notes[pfile->buffer->cur_note];
   2529 	}
   2530       else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
   2531 	       && warn_bidi_p)
   2532 	{
   2533 	  location_t loc;
   2534 	  bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
   2535 	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   2536 	}
   2537     }
   2538 
   2539   if (warn_bidi_p)
   2540     maybe_warn_bidi_on_close (pfile, pos);
   2541 
   2542   if (CPP_OPTION (pfile, user_literals))
   2543     {
   2544       /* If a string format macro, say from inttypes.h, is placed touching
   2545 	 a string literal it could be parsed as a C++11 user-defined string
   2546 	 literal thus breaking the program.  */
   2547       if (is_macro_not_literal_suffix (pfile, pos))
   2548 	{
   2549 	  /* Raise a warning, but do not consume subsequent tokens.  */
   2550 	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
   2551 	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
   2552 				   token->src_loc, 0,
   2553 				   "invalid suffix on literal; C++11 requires "
   2554 				   "a space between literal and string macro");
   2555 	}
   2556       /* Grab user defined literal suffix.  */
   2557       else if (ISIDST (*pos))
   2558 	{
   2559 	  type = cpp_userdef_string_add_type (type);
   2560 	  ++pos;
   2561 
   2562 	  while (ISIDNUM (*pos))
   2563 	    ++pos;
   2564 	}
   2565     }
   2566 
         /* Build the token's spelling.  If nothing was accumulated it is
            copied straight from [BASE, POS); otherwise the accumulated
            buffers are concatenated and followed by the text in
            [BASE, POS).  */
   2567  out:
   2568   pfile->buffer->cur = pos;
   2569   if (!accum.accum)
   2570     create_literal (pfile, token, base, pos - base, type);
   2571   else
   2572     {
   2573       size_t extra_len = pos - base;
   2574       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
   2575 
   2576       token->type = type;
   2577       token->val.str.len = accum.accum + extra_len;
   2578       token->val.str.text = dest;
   2579       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
   2580 	{
   2581 	  size_t len = BUFF_FRONT (buf) - buf->base;
   2582 	  memcpy (dest, buf->base, len);
   2583 	  dest += len;
   2584 	}
   2585       _cpp_release_buff (pfile, accum.first);
   2586       memcpy (dest, base, extra_len);
   2587       dest[extra_len] = '\0';
   2588     }
   2589 }
   2590 
   2591 /* Lexes a string, character constant, or angle-bracketed header file
   2592    name.  The stored string contains the spelling, including opening
   2593    quote and any leading 'L', 'u', 'U' or 'u8' and optional
   2594    'R' modifier.  It returns the type of the literal, or CPP_OTHER
   2595    if it was not properly terminated, or CPP_LESS for an unterminated
   2596    header name which must be relexed as normal tokens.
   2597 
   2598    The spelling is NUL-terminated, but it is not guaranteed that this
   2599    is the first NUL since embedded NULs are preserved.  */
   2600 static void
   2601 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
   2602 {
   2603   bool saw_NUL = false;
   2604   const uchar *cur;
   2605   cppchar_t terminator;
   2606   enum cpp_ttype type;
   2607 
   2608   cur = base;
   2609   terminator = *cur++;
   2610   if (terminator == 'L' || terminator == 'U')
   2611     terminator = *cur++;
   2612   else if (terminator == 'u')
   2613     {
   2614       terminator = *cur++;
   2615       if (terminator == '8')
   2616 	terminator = *cur++;
   2617     }
   2618   if (terminator == 'R')
   2619     {
   2620       lex_raw_string (pfile, token, base);
   2621       return;
   2622     }
   2623   if (terminator == '"')
   2624     type = (*base == 'L' ? CPP_WSTRING :
   2625 	    *base == 'U' ? CPP_STRING32 :
   2626 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
   2627 			 : CPP_STRING);
   2628   else if (terminator == '\'')
   2629     type = (*base == 'L' ? CPP_WCHAR :
   2630 	    *base == 'U' ? CPP_CHAR32 :
   2631 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
   2632 			 : CPP_CHAR);
   2633   else
   2634     terminator = '>', type = CPP_HEADER_NAME;
   2635 
   2636   const bool warn_bidi_p = pfile->warn_bidi_p ();
   2637   for (;;)
   2638     {
   2639       cppchar_t c = *cur++;
   2640 
   2641       /* In #include-style directives, terminators are not escapable.  */
   2642       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
   2643 	{
   2644 	  if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
   2645 	    {
   2646 	      location_t loc;
   2647 	      bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
   2648 					      &loc);
   2649 	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
   2650 	    }
   2651 	  cur++;
   2652 	}
   2653       else if (c == terminator)
   2654 	{
   2655 	  if (warn_bidi_p)
   2656 	    maybe_warn_bidi_on_close (pfile, cur - 1);
   2657 	  break;
   2658 	}
   2659       else if (c == '\n')
   2660 	{
   2661 	  cur--;
   2662 	  /* Unmatched quotes always yield undefined behavior, but
   2663 	     greedy lexing means that what appears to be an unterminated
   2664 	     header name may actually be a legitimate sequence of tokens.  */
   2665 	  if (terminator == '>')
   2666 	    {
   2667 	      token->type = CPP_LESS;
   2668 	      return;
   2669 	    }
   2670 	  type = CPP_OTHER;
   2671 	  break;
   2672 	}
   2673       else if (c == '\0')
   2674 	saw_NUL = true;
   2675       else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
   2676 	{
   2677 	  location_t loc;
   2678 	  bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
   2679 	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
   2680 	}
   2681     }
   2682 
   2683   if (saw_NUL && !pfile->state.skipping)
   2684     cpp_error (pfile, CPP_DL_WARNING,
   2685 	       "null character(s) preserved in literal");
   2686 
   2687   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
   2688     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
   2689 	       (int) terminator);
   2690 
   2691   if (CPP_OPTION (pfile, user_literals))
   2692     {
   2693       /* If a string format macro, say from inttypes.h, is placed touching
   2694 	 a string literal it could be parsed as a C++11 user-defined string
   2695 	 literal thus breaking the program.  */
   2696       if (is_macro_not_literal_suffix (pfile, cur))
   2697 	{
   2698 	  /* Raise a warning, but do not consume subsequent tokens.  */
   2699 	  if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
   2700 	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
   2701 				   token->src_loc, 0,
   2702 				   "invalid suffix on literal; C++11 requires "
   2703 				   "a space between literal and string macro");
   2704 	}
   2705       /* Grab user defined literal suffix.  */
   2706       else if (ISIDST (*cur))
   2707 	{
   2708 	  type = cpp_userdef_char_add_type (type);
   2709 	  type = cpp_userdef_string_add_type (type);
   2710           ++cur;
   2711 
   2712 	  while (ISIDNUM (*cur))
   2713 	    ++cur;
   2714 	}
   2715     }
   2716   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
   2717 	   && is_macro (pfile, cur)
   2718 	   && !pfile->state.skipping)
   2719     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
   2720 			   token->src_loc, 0, "C++11 requires a space "
   2721 			   "between string literal and macro");
   2722 
   2723   pfile->buffer->cur = cur;
   2724   create_literal (pfile, token, base, cur - base, type);
   2725 }
   2726 
   2727 /* Return the comment table. The client may not make any assumption
   2728    about the ordering of the table.  */
   2729 cpp_comment_table *
   2730 cpp_get_comments (cpp_reader *pfile)
   2731 {
   2732   return &pfile->comments;
   2733 }
   2734 
   2735 /* Append a comment to the end of the comment table. */
   2736 static void
   2737 store_comment (cpp_reader *pfile, cpp_token *token)
   2738 {
   2739   int len;
   2740 
   2741   if (pfile->comments.allocated == 0)
   2742     {
   2743       pfile->comments.allocated = 256;
   2744       pfile->comments.entries = (cpp_comment *) xmalloc
   2745 	(pfile->comments.allocated * sizeof (cpp_comment));
   2746     }
   2747 
   2748   if (pfile->comments.count == pfile->comments.allocated)
   2749     {
   2750       pfile->comments.allocated *= 2;
   2751       pfile->comments.entries = (cpp_comment *) xrealloc
   2752 	(pfile->comments.entries,
   2753 	 pfile->comments.allocated * sizeof (cpp_comment));
   2754     }
   2755 
   2756   len = token->val.str.len;
   2757 
   2758   /* Copy comment. Note, token may not be NULL terminated. */
   2759   pfile->comments.entries[pfile->comments.count].comment =
   2760     (char *) xmalloc (sizeof (char) * (len + 1));
   2761   memcpy (pfile->comments.entries[pfile->comments.count].comment,
   2762 	  token->val.str.text, len);
   2763   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
   2764 
   2765   /* Set source location. */
   2766   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
   2767 
   2768   /* Increment the count of entries in the comment table. */
   2769   pfile->comments.count++;
   2770 }
   2771 
   2772 /* The stored comment includes the comment start and any terminator.  */
   2773 static void
   2774 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
   2775 	      cppchar_t type)
   2776 {
   2777   unsigned char *buffer;
   2778   unsigned int len, clen, i;
   2779   int convert_to_c = (pfile->state.in_directive || pfile->state.parsing_args)
   2780     && type == '/';
   2781 
   2782   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
   2783 
   2784   /* C++ comments probably (not definitely) have moved past a new
   2785      line, which we don't want to save in the comment.  */
   2786   if (is_vspace (pfile->buffer->cur[-1]))
   2787     len--;
   2788 
   2789   /* If we are currently in a directive or in argument parsing, then
   2790      we need to store all C++ comments as C comments internally, and
   2791      so we need to allocate a little extra space in that case.
   2792 
   2793      Note that the only time we encounter a directive here is
   2794      when we are saving comments in a "#define".  */
   2795   clen = convert_to_c ? len + 2 : len;
   2796 
   2797   buffer = _cpp_unaligned_alloc (pfile, clen);
   2798 
   2799   token->type = CPP_COMMENT;
   2800   token->val.str.len = clen;
   2801   token->val.str.text = buffer;
   2802 
   2803   buffer[0] = '/';
   2804   memcpy (buffer + 1, from, len - 1);
   2805 
   2806   /* Finish conversion to a C comment, if necessary.  */
   2807   if (convert_to_c)
   2808     {
   2809       buffer[1] = '*';
   2810       buffer[clen - 2] = '*';
   2811       buffer[clen - 1] = '/';
   2812       /* As there can be in a C++ comments illegal sequences for C comments
   2813          we need to filter them out.  */
   2814       for (i = 2; i < (clen - 2); i++)
   2815         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
   2816           buffer[i] = '|';
   2817     }
   2818 
   2819   /* Finally store this comment for use by clients of libcpp. */
   2820   store_comment (pfile, token);
   2821 }
   2822 
   2823 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
   2824    comment.

           COMMENT_START points at the second character of the comment
           opener; a '*' there means a C block comment, anything else is
           handled as a C++ line comment.  The comment body is examined
           from COMMENT_START + 1, and PFILE->buffer->cur is used as the
           end bound in every length check.

           The value of the cpp_warn_implicit_fallthrough option selects
           how strict the match is: 1 accepts every comment, 2 searches the
           body for a loose "falls through" pattern, 3 and 4 require the
           whole body to be one of the spellings listed below, and any
           other value accepts nothing.  */
   2825 
   2826 static bool
   2827 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
   2828 {
          /* FROM is the scan cursor; it starts on the first character of
             the comment body.  */
   2829   const unsigned char *from = comment_start + 1;
   2830 
   2831   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
   2832     {
   2833       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
   2834 	 don't recognize any comments.  The latter only checks attributes,
   2835 	 the former doesn't warn.  */
   2836     case 0:
   2837     default:
   2838       return false;
   2839       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
   2840 	 content it has.  */
   2841     case 1:
   2842       return true;
   2843     case 2:
   2844       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
   2845 	 .*falls?[ \t-]*thr(u|ough).* regex.  */
              /* Try each start position in turn, stopping once fewer than
                 strlen ("fallthru") characters remain before buffer->cur.
                 A failed attempt resumes from wherever FROM was left, not
                 from the position after the attempt's start.  */
   2846       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
   2847 	   from++)
   2848 	{
   2849 	  /* Is there anything like strpbrk with upper boundary, or
   2850 	     memchr looking for 2 characters rather than just one?  */
   2851 	  if (from[0] != 'f' && from[0] != 'F')
   2852 	    continue;
   2853 	  if (from[1] != 'a' && from[1] != 'A')
   2854 	    continue;
   2855 	  if (from[2] != 'l' && from[2] != 'L')
   2856 	    continue;
   2857 	  if (from[3] != 'l' && from[3] != 'L')
   2858 	    continue;
   2859 	  from += sizeof "fall" - 1;
   2860 	  if (from[0] == 's' || from[0] == 'S')
   2861 	    from++;
   2862 	  while (*from == ' ' || *from == '\t' || *from == '-')
   2863 	    from++;
   2864 	  if (from[0] != 't' && from[0] != 'T')
   2865 	    continue;
   2866 	  if (from[1] != 'h' && from[1] != 'H')
   2867 	    continue;
   2868 	  if (from[2] != 'r' && from[2] != 'R')
   2869 	    continue;
   2870 	  if (from[3] == 'u' || from[3] == 'U')
   2871 	    return true;
   2872 	  if (from[3] != 'o' && from[3] != 'O')
   2873 	    continue;
   2874 	  if (from[4] != 'u' && from[4] != 'U')
   2875 	    continue;
   2876 	  if (from[5] != 'g' && from[5] != 'G')
   2877 	    continue;
   2878 	  if (from[6] != 'h' && from[6] != 'H')
   2879 	    continue;
   2880 	  return true;
   2881 	}
   2882       return false;
              /* Levels 3 and 4 share the whole-comment matching below.  */
   2883     case 3:
   2884     case 4:
   2885       break;
   2886     }
   2887 
   2888   /* Whole comment contents:
   2889      -fallthrough
   2890      @fallthrough@
   2891    */
   2892   if (*from == '-' || *from == '@')
   2893     {
   2894       size_t len = sizeof "fallthrough" - 1;
   2895       if ((size_t) (pfile->buffer->cur - from - 1) < len)
   2896 	return false;
   2897       if (memcmp (from + 1, "fallthrough", len))
   2898 	return false;
   2899       if (*from == '@')
   2900 	{
   2901 	  if (from[len + 1] != '@')
   2902 	    return false;
   2903 	  len++;
   2904 	}
   2905       from += 1 + len;
   2906     }
   2907   /* Whole comment contents (regex):
   2908      lint -fallthrough[ \t]*
   2909    */
   2910   else if (*from == 'l')
   2911     {
   2912       size_t len = sizeof "int -fallthrough" - 1;
   2913       if ((size_t) (pfile->buffer->cur - from - 1) < len)
   2914 	return false;
   2915       if (memcmp (from + 1, "int -fallthrough", len))
   2916 	return false;
   2917       from += 1 + len;
   2918       while (*from == ' ' || *from == '\t')
   2919 	from++;
   2920     }
   2921   /* Whole comment contents (regex):
   2922      [ \t]*FALLTHR(U|OUGH)[ \t]*
   2923    */
   2924   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
   2925     {
   2926       while (*from == ' ' || *from == '\t')
   2927 	from++;
   2928       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
   2929 	return false;
   2930       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
   2931 	return false;
   2932       from += sizeof "FALLTHR" - 1;
   2933       if (*from == 'U')
   2934 	from++;
   2935       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
   2936 	return false;
   2937       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
   2938 	return false;
   2939       else
   2940 	from += sizeof "OUGH" - 1;
   2941       while (*from == ' ' || *from == '\t')
   2942 	from++;
   2943     }
   2944   /* Whole comment contents (regex):
   2945      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
   2946      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
   2947      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
   2948    */
   2949   else
   2950     {
   2951       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
   2952 	from++;
              /* F holds the first letter of the word currently being
                 matched.  ALL_UPPER is set once a fully upper-case word
                 has been seen; from then on lower-case continuations are
                 rejected.  */
   2953       unsigned char f = *from;
   2954       bool all_upper = false;
   2955       if (f == 'E' || f == 'e')
   2956 	{
   2957 	  if ((size_t) (pfile->buffer->cur - from)
   2958 	      < sizeof "else fallthru" - 1)
   2959 	    return false;
   2960 	  if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
   2961 	    all_upper = true;
   2962 	  else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
   2963 	    return false;
   2964 	  from += sizeof "else" - 1;
   2965 	  if (*from == ',')
   2966 	    from++;
   2967 	  if (*from != ' ')
   2968 	    return false;
   2969 	  from++;
   2970 	  if (all_upper && *from == 'f')
   2971 	    return false;
   2972 	  if (f == 'e' && *from == 'F')
   2973 	    return false;
   2974 	  f = *from;
   2975 	}
   2976       else if (f == 'I' || f == 'i')
   2977 	{
   2978 	  if ((size_t) (pfile->buffer->cur - from)
   2979 	      < sizeof "intentional fallthru" - 1)
   2980 	    return false;
   2981 	  if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
   2982 				  sizeof "NTENTIONAL" - 1) == 0)
   2983 	    all_upper = true;
   2984 	  else if (memcmp (from + 1, "ntentional",
   2985 			   sizeof "ntentional" - 1))
   2986 	    return false;
   2987 	  from += sizeof "intentional" - 1;
   2988 	  if (*from == ' ')
   2989 	    {
   2990 	      from++;
   2991 	      if (all_upper && *from == 'f')
   2992 		return false;
   2993 	    }
   2994 	  else if (all_upper)
   2995 	    {
   2996 	      if (memcmp (from, "LY F", sizeof "LY F" - 1))
   2997 		return false;
   2998 	      from += sizeof "LY " - 1;
   2999 	    }
   3000 	  else
   3001 	    {
   3002 	      if (memcmp (from, "ly ", sizeof "ly " - 1))
   3003 		return false;
   3004 	      from += sizeof "ly " - 1;
   3005 	    }
   3006 	  if (f == 'i' && *from == 'F')
   3007 	    return false;
   3008 	  f = *from;
   3009 	}
              /* The optional lead-in word has been consumed; what follows
                 must be the "fall ... thr(u|ough)" part itself.  */
   3010       if (f != 'F' && f != 'f')
   3011 	return false;
   3012       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
   3013 	return false;
   3014       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
   3015 	all_upper = true;
   3016       else if (all_upper)
   3017 	return false;
   3018       else if (memcmp (from + 1, "all", sizeof "all" - 1))
   3019 	return false;
   3020       from += sizeof "fall" - 1;
   3021       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
   3022 	from += 2;
   3023       else if (*from == ' ' || *from == '-')
   3024 	from++;
   3025       else if (*from != (all_upper ? 'T' : 't'))
   3026 	return false;
   3027       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
   3028 	return false;
   3029       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
   3030 	return false;
   3031       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
   3032 	{
   3033 	  if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
   3034 	    return false;
   3035 	  if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
   3036 		      sizeof "hrough" - 1))
   3037 	    return false;
   3038 	  from += sizeof "through" - 1;
   3039 	}
   3040       else
   3041 	from += sizeof "thru" - 1;
   3042       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
   3043 	from++;
              /* Optional trailer: after a '-', skip free text up to the end
                 of the line or, in a block comment, up to the closing
                 star-slash (a '*' not followed by '/' is skipped over).  */
   3044       if (*from == '-')
   3045 	{
   3046 	  from++;
   3047 	  if (*comment_start == '*')
   3048 	    {
   3049 	      do
   3050 		{
   3051 		  while (*from && *from != '*'
   3052 			 && *from != '\n' && *from != '\r')
   3053 		    from++;
   3054 		  if (*from != '*' || from[1] == '/')
   3055 		    break;
   3056 		  from++;
   3057 		}
   3058 	      while (1);
   3059 	    }
   3060 	  else
   3061 	    while (*from && *from != '\n' && *from != '\r')
   3062 	      from++;
   3063 	}
   3064     }
          /* FROM now points just past the recognized text.  The comment only
             counts if it ends right here: with the closing star-slash for a
             block comment, or with a newline for a line comment.  */
   3065   /* C block comment.  */
   3066   if (*comment_start == '*')
   3067     {
   3068       if (*from != '*' || from[1] != '/')
   3069 	return false;
   3070     }
   3071   /* C++ line comment.  */
   3072   else if (*from != '\n')
   3073     return false;
   3074 
   3075   return true;
   3076 }
   3077 
   3078 /* Allocate COUNT tokens for RUN.  */
   3079 void
   3080 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
   3081 {
   3082   run->base = XNEWVEC (cpp_token, count);
   3083   run->limit = run->base + count;
   3084   run->next = NULL;
   3085 }
   3086 
   3087 /* Returns the next tokenrun, or creates one if there is none.  */
   3088 static tokenrun *
   3089 next_tokenrun (tokenrun *run)
   3090 {
   3091   if (run->next == NULL)
   3092     {
   3093       run->next = XNEW (tokenrun);
   3094       run->next->prev = run;
   3095       _cpp_init_tokenrun (run->next, 250);
   3096     }
   3097 
   3098   return run->next;
   3099 }
   3100 
   3101 /* Return the number of not yet processed token in a given
   3102    context.  */
   3103 int
   3104 _cpp_remaining_tokens_num_in_context (cpp_context *context)
   3105 {
   3106   if (context->tokens_kind == TOKENS_KIND_DIRECT)
   3107     return (LAST (context).token - FIRST (context).token);
   3108   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
   3109 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
   3110     return (LAST (context).ptoken - FIRST (context).ptoken);
   3111   else
   3112       abort ();
   3113 }
   3114 
   3115 /* Returns the token present at index INDEX in a given context.  If
   3116    INDEX is zero, the next token to be processed is returned.  */
   3117 static const cpp_token*
   3118 _cpp_token_from_context_at (cpp_context *context, int index)
   3119 {
   3120   if (context->tokens_kind == TOKENS_KIND_DIRECT)
   3121     return &(FIRST (context).token[index]);
   3122   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
   3123 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
   3124     return FIRST (context).ptoken[index];
   3125  else
   3126    abort ();
   3127 }
   3128 
   3129 /* Look ahead in the input stream.  */
   3130 const cpp_token *
   3131 cpp_peek_token (cpp_reader *pfile, int index)
   3132 {
   3133   cpp_context *context = pfile->context;
   3134   const cpp_token *peektok;
   3135   int count;
   3136 
   3137   /* First, scan through any pending cpp_context objects.  */
   3138   while (context->prev)
   3139     {
   3140       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
   3141 
   3142       if (index < (int) sz)
   3143         return _cpp_token_from_context_at (context, index);
   3144       index -= (int) sz;
   3145       context = context->prev;
   3146     }
   3147 
   3148   /* We will have to read some new tokens after all (and do so
   3149      without invalidating preceding tokens).  */
   3150   count = index;
   3151   pfile->keep_tokens++;
   3152 
   3153   /* For peeked tokens temporarily disable line_change reporting,
   3154      until the tokens are parsed for real.  */
   3155   void (*line_change) (cpp_reader *, const cpp_token *, int)
   3156     = pfile->cb.line_change;
   3157   pfile->cb.line_change = NULL;
   3158 
   3159   do
   3160     {
   3161       peektok = _cpp_lex_token (pfile);
   3162       if (peektok->type == CPP_EOF)
   3163 	{
   3164 	  index--;
   3165 	  break;
   3166 	}
   3167       else if (peektok->type == CPP_PRAGMA)
   3168 	{
   3169 	  /* Don't peek past a pragma.  */
   3170 	  if (peektok == &pfile->directive_result)
   3171 	    /* Save the pragma in the buffer.  */
   3172 	    *pfile->cur_token++ = *peektok;
   3173 	  index--;
   3174 	  break;
   3175 	}
   3176     }
   3177   while (index--);
   3178 
   3179   _cpp_backup_tokens_direct (pfile, count - index);
   3180   pfile->keep_tokens--;
   3181   pfile->cb.line_change = line_change;
   3182 
   3183   return peektok;
   3184 }
   3185 
   3186 /* Allocate a single token that is invalidated at the same time as the
   3187    rest of the tokens on the line.  Has its line and col set to the
   3188    same as the last lexed token, so that diagnostics appear in the
   3189    right place.  */
   3190 cpp_token *
   3191 _cpp_temp_token (cpp_reader *pfile)
   3192 {
   3193   cpp_token *old, *result;
   3194   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
   3195   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
   3196 
   3197   old = pfile->cur_token - 1;
   3198   /* Any pre-existing lookaheads must not be clobbered.  */
   3199   if (la)
   3200     {
   3201       if (sz <= la)
   3202         {
   3203           tokenrun *next = next_tokenrun (pfile->cur_run);
   3204 
   3205           if (sz < la)
   3206             memmove (next->base + 1, next->base,
   3207                      (la - sz) * sizeof (cpp_token));
   3208 
   3209           next->base[0] = pfile->cur_run->limit[-1];
   3210         }
   3211 
   3212       if (sz > 1)
   3213         memmove (pfile->cur_token + 1, pfile->cur_token,
   3214                  MIN (la, sz - 1) * sizeof (cpp_token));
   3215     }
   3216 
   3217   if (!sz && pfile->cur_token == pfile->cur_run->limit)
   3218     {
   3219       pfile->cur_run = next_tokenrun (pfile->cur_run);
   3220       pfile->cur_token = pfile->cur_run->base;
   3221     }
   3222 
   3223   result = pfile->cur_token++;
   3224   result->src_loc = old->src_loc;
   3225   return result;
   3226 }
   3227 
   3228 /* We're at the beginning of a logical line (so not in
   3229   directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set.  See
   3230   if we should enter deferred_pragma mode to tokenize the rest of the
   3231   line as a module control-line.

          Up to two further tokens are peeked with _cpp_lex_direct: the
          keyword following `export', if RESULT is `export', and the token
          after the keyword.  They are pushed back before returning, except
          that a peeked CPP_PRAGMA_EOL is discarded rather than replayed.

          If the line is accepted, PFILE is left in deferred-pragma mode,
          the leading keyword tokens are flagged NO_EXPAND and remapped to
          the second entry of their n_modules pair, and
          state.directive_file_token is set to HEADER_COUNT.  Otherwise
          save_comments, in_deferred_pragma and angled_headers are put back
          to their values on entry.  */
   3232 
   3233 static void
   3234 cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
   3235 {
   3236   unsigned backup = 0; /* Tokens we peeked.  */
   3237   cpp_hashnode *node = result->val.node.node;
   3238   cpp_token *peek = result;
   3239   cpp_token *keyword = peek;
   3240   cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
          /* Zero for `module'; nonzero for the import forms, where a
             header-name may follow.
             NOTE(review): the value is BACKUP + 2, plus 16 in some cases;
             its encoding is interpreted by whatever reads
             state.directive_file_token, outside this function -- confirm
             there before changing it.  */
   3241   int header_count = 0;
   3242 
   3243   /* Make sure the incoming state is as we expect it.  This way we
   3244      can restore it using constants.  */
   3245   gcc_checking_assert (!pfile->state.in_deferred_pragma
   3246 		       && !pfile->state.skipping
   3247 		       && !pfile->state.parsing_args
   3248 		       && !pfile->state.angled_headers
   3249 		       && (pfile->state.save_comments
   3250 			   == !CPP_OPTION (pfile, discard_comments)));
   3251 
   3252   /* Enter directives mode sufficiently for peeking.  We don't have
   3253      to actually set in_directive.  */
   3254   pfile->state.in_deferred_pragma = true;
   3255 
   3256   /* These two fields are needed to process tokenization in deferred
   3257      pragma mode.  They are not used outside deferred pragma mode or
   3258      directives mode.  */
   3259   pfile->state.pragma_allow_expansion = true;
   3260   pfile->directive_line = result->src_loc;
   3261 
   3262   /* Saving comments is incompatible with directives mode.   */
   3263   pfile->state.save_comments = 0;
   3264 
          /* `export' must be followed by another module keyword, which then
             becomes KEYWORD.  */
   3265   if (node == n_modules[spec_nodes::M_EXPORT][0])
   3266     {
   3267       peek = _cpp_lex_direct (pfile);
   3268       keyword = peek;
   3269       backup++;
   3270       if (keyword->type != CPP_NAME)
   3271 	goto not_module;
   3272       node = keyword->val.node.node;
   3273       if (!(node->flags & NODE_MODULE))
   3274 	goto not_module;
   3275     }
   3276 
   3277   if (node == n_modules[spec_nodes::M__IMPORT][0])
   3278     /* __import  */
   3279     header_count = backup + 2 + 16;
   3280   else if (node == n_modules[spec_nodes::M_IMPORT][0])
   3281     /* import  */
   3282     header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
   3283   else if (node == n_modules[spec_nodes::M_MODULE][0])
   3284     ; /* module  */
   3285   else
   3286     goto not_module;
   3287 
   3288   /* We've seen [export] {module|import|__import}.  Check the next token.  */
   3289   if (header_count)
   3290     /* After '{,__}import' a header name may appear.  */
   3291     pfile->state.angled_headers = true;
   3292   peek = _cpp_lex_direct (pfile);
   3293   backup++;
   3294 
   3295   /* ... import followed by identifier, ':', '<' or
   3296      header-name preprocessing tokens, or module
   3297      followed by cpp-identifier, ':' or ';' preprocessing
   3298      tokens.  C++ keywords are not yet relevant.  */
   3299   if (peek->type == CPP_NAME
   3300       || peek->type == CPP_COLON
   3301       ||  (header_count
   3302 	   ? (peek->type == CPP_LESS
   3303 	      || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
   3304 	      || peek->type == CPP_HEADER_NAME)
   3305 	   : peek->type == CPP_SEMICOLON))
   3306     {
              /* Macro expansion on the rest of the line is allowed only when
                 the input is not already preprocessed.  */
   3307       pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
   3308       if (!pfile->state.pragma_allow_expansion)
   3309 	pfile->state.prevent_expansion++;
   3310 
   3311       if (!header_count && linemap_included_from
   3312 	  (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
   3313 	cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
   3314 			     "module control-line cannot be in included file");
   3315 
   3316       /* The first one or two tokens cannot be macro names.  */
              /* IX counts down from BACKUP - 1 to 0: a nonzero IX selects
                 KEYWORD, zero selects RESULT.  */
   3317       for (int ix = backup; ix--;)
   3318 	{
   3319 	  cpp_token *tok = ix ? keyword : result;
   3320 	  cpp_hashnode *node = tok->val.node.node;
   3321 
   3322 	  /* Don't attempt to expand the token.  */
   3323 	  tok->flags |= NO_EXPAND;
   3324 	  if (_cpp_defined_macro_p (node)
   3325 	      && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
   3326 	      && !cpp_fun_like_macro_p (node))
   3327 	    cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
   3328 				 "module control-line \"%s\" cannot be"
   3329 				 " an object-like macro",
   3330 				 NODE_NAME (node));
   3331 	}
   3332 
   3333       /* Map to underbar variants.  */
   3334       keyword->val.node.node = n_modules[header_count
   3335 					 ? spec_nodes::M_IMPORT
   3336 					 : spec_nodes::M_MODULE][1];
   3337       if (backup != 1)
   3338 	result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
   3339 
   3340       /* Maybe tell the tokenizer we expect a header-name down the
   3341 	 road.  */
   3342       pfile->state.directive_file_token = header_count;
   3343     }
   3344   else
   3345     {
            /* Also reached by the gotos above, when the keyword sequence
               itself is not a module control-line.  */
   3346     not_module:
   3347       /* Drop out of directive mode.  */
   3348       /* We asserted save_comments had this value upon entry.  */
   3349       pfile->state.save_comments
   3350 	= !CPP_OPTION (pfile, discard_comments);
   3351       pfile->state.in_deferred_pragma = false;
   3352       /* Do not let this remain on.  */
   3353       pfile->state.angled_headers = false;
   3354     }
   3355 
   3356   /* In either case we want to backup the peeked tokens.  */
   3357   if (backup)
   3358     {
   3359       /* If we saw EOL, we should drop it, because this isn't a module
   3360 	 control-line after all.  */
   3361       bool eol = peek->type == CPP_PRAGMA_EOL;
   3362       if (!eol || backup > 1)
   3363 	{
   3364 	  /* Put the peeked tokens back.  */
   3365 	  _cpp_backup_tokens_direct (pfile, backup);
   3366 	  /* But if the last one was an EOL, forget it.  */
   3367 	  if (eol)
   3368 	    pfile->lookaheads--;
   3369 	}
   3370     }
   3371 }
   3372 
   3373 /* Lex a token into RESULT (external interface).  Takes care of issues
   3374    like directive handling, token lookahead, multiple include
   3375    optimization and skipping.

           Returns the token obtained: a replayed lookahead, a token freshly
           lexed by _cpp_lex_direct, or &PFILE->directive_result when a
           directive or deferred pragma supplied one.  While
           PFILE->state.skipping is set and no directive or deferred pragma
           is being processed, tokens other than CPP_EOF are discarded and
           lexing continues.  */
   3376 const cpp_token *
   3377 _cpp_lex_token (cpp_reader *pfile)
   3378 {
   3379   cpp_token *result;
   3380 
   3381   for (;;)
   3382     {
              /* Current run exhausted: continue in the next one.  */
   3383       if (pfile->cur_token == pfile->cur_run->limit)
   3384 	{
   3385 	  pfile->cur_run = next_tokenrun (pfile->cur_run);
   3386 	  pfile->cur_token = pfile->cur_run->base;
   3387 	}
   3388       /* We assume that the current token is somewhere in the current
   3389 	 run.  */
   3390       if (pfile->cur_token < pfile->cur_run->base
   3391 	  || pfile->cur_token >= pfile->cur_run->limit)
   3392 	abort ();
   3393 
              /* Replay a backed-up token when one is pending; otherwise lex
                 a new one.  */
   3394       if (pfile->lookaheads)
   3395 	{
   3396 	  pfile->lookaheads--;
   3397 	  result = pfile->cur_token++;
   3398 	}
   3399       else
   3400 	result = _cpp_lex_direct (pfile);
   3401 
              /* Tokens flagged BOL may start a directive or a module
                 control-line.  */
   3402       if (result->flags & BOL)
   3403 	{
   3404 	  /* Is this a directive.  If _cpp_handle_directive returns
   3405 	     false, it is an assembler #.  */
   3406 	  if (result->type == CPP_HASH
   3407 	      /* 6.10.3 p 11: Directives in a list of macro arguments
   3408 		 gives undefined behavior.  This implementation
   3409 		 handles the directive as normal.  */
   3410 	      && pfile->state.parsing_args != 1)
   3411 	    {
   3412 	      if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
   3413 		{
        		  /* A CPP_PADDING result means the directive yielded
        		     no token: go round again for the next one.  */
   3414 		  if (pfile->directive_result.type == CPP_PADDING)
   3415 		    continue;
   3416 		  result = &pfile->directive_result;
   3417 		}
   3418 	    }
   3419 	  else if (pfile->state.in_deferred_pragma)
   3420 	    result = &pfile->directive_result;
   3421 	  else if (result->type == CPP_NAME
   3422 		   && (result->val.node.node->flags & NODE_MODULE)
   3423 		   && !pfile->state.skipping
   3424 		   /* Unlike regular directives, we do not deal with
   3425 		      tokenizing module directives as macro arguments.
   3426 		      That's not permitted.  */
   3427 		   && !pfile->state.parsing_args)
   3428 	    {
   3429 	      /* P1857.  Before macro expansion, At start of logical
   3430 		 line ... */
   3431 	      /* We don't have to consider lookaheads at this point.  */
   3432 	      gcc_checking_assert (!pfile->lookaheads);
   3433 
   3434 	      cpp_maybe_module_directive (pfile, result);
   3435 	    }
   3436 
        	  /* Tell the client about the new line, unless we are skipping
        	     or the callback is unset.  */
   3437 	  if (pfile->cb.line_change && !pfile->state.skipping)
   3438 	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
   3439 	}
   3440 
   3441       /* We don't skip tokens in directives.  */
   3442       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
   3443 	break;
   3444 
   3445       /* Outside a directive, invalidate controlling macros.  At file
   3446 	 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
   3447 	 get here and MI optimization works.  */
   3448       pfile->mi_valid = false;
   3449 
              /* When skipping, only CPP_EOF is handed back; any other token
                 is dropped by looping again.  */
   3450       if (!pfile->state.skipping || result->type == CPP_EOF)
   3451 	break;
   3452     }
   3453 
   3454   return result;
   3455 }
   3456 
   3457 /* Returns true if a fresh line has been loaded.  */
   3458 bool
   3459 _cpp_get_fresh_line (cpp_reader *pfile)
   3460 {
   3461   /* We can't get a new line until we leave the current directive.  */
   3462   if (pfile->state.in_directive)
   3463     return false;
   3464 
   3465   for (;;)
   3466     {
   3467       cpp_buffer *buffer = pfile->buffer;
   3468 
   3469       if (!buffer->need_line)
   3470 	return true;
   3471 
   3472       if (buffer->next_line < buffer->rlimit)
   3473 	{
   3474 	  _cpp_clean_line (pfile);
   3475 	  return true;
   3476 	}
   3477 
   3478       /* First, get out of parsing arguments state.  */
   3479       if (pfile->state.parsing_args)
   3480 	return false;
   3481 
   3482       /* End of buffer.  Non-empty files should end in a newline.  */
   3483       if (buffer->buf != buffer->rlimit
   3484 	  && buffer->next_line > buffer->rlimit
   3485 	  && !buffer->from_stage3)
   3486 	{
   3487 	  /* Clip to buffer size.  */
   3488 	  buffer->next_line = buffer->rlimit;
   3489 	}
   3490 
   3491       if (buffer->prev && !buffer->return_at_eof)
   3492 	_cpp_pop_buffer (pfile);
   3493       else
   3494 	{
   3495 	  /* End of translation.  Do not pop the buffer yet. Increment
   3496 	     line number so that the EOF token is on a line of its own
   3497 	     (_cpp_lex_direct doesn't increment in that case, because
   3498 	     it's hard for it to distinguish this special case). */
   3499 	  CPP_INCREMENT_LINE (pfile, 0);
   3500 	  return false;
   3501 	}
   3502     }
   3503 }
   3504 
/* Set result->type to THEN_TYPE and consume one character if the next
   character in the buffer is CHAR; otherwise set it to ELSE_TYPE and
   consume nothing.  Expands to code that uses variables named
   `buffer' and `result', which must be in scope at the point of use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
  do							\
    {							\
      result->type = ELSE_TYPE;				\
      if (*buffer->cur == CHAR)				\
	buffer->cur++, result->type = THEN_TYPE;	\
    }							\
  while (0)

   3514 /* Lex a token into pfile->cur_token, which is also incremented, to
   3515    get diagnostics pointing to the correct location.
   3516 
   3517    Does not handle issues such as token lookahead, multiple-include
   3518    optimization, directives, skipping etc.  This function is only
   3519    suitable for use by _cpp_lex_token, and in special cases like
   3520    lex_expansion_token which doesn't care for any of these issues.
   3521 
   3522    When meeting a newline, returns CPP_EOF if parsing a directive,
   3523    otherwise returns to the start of the token buffer if permissible.
   3524    Returns the location of the lexed token.  */
   3525 cpp_token *
   3526 _cpp_lex_direct (cpp_reader *pfile)
   3527 {
   3528   cppchar_t c;
   3529   cpp_buffer *buffer;
   3530   const unsigned char *comment_start;
   3531   bool fallthrough_comment = false;
   3532   cpp_token *result = pfile->cur_token++;
   3533 
   3534  fresh_line:
   3535   result->flags = 0;
   3536   buffer = pfile->buffer;
   3537   if (buffer->need_line)
   3538     {
   3539       if (pfile->state.in_deferred_pragma)
   3540 	{
   3541 	  /* This can happen in cases like:
   3542 	     #define loop(x) whatever
   3543 	     #pragma omp loop
   3544 	     where when trying to expand loop we need to peek
   3545 	     next token after loop, but aren't still in_deferred_pragma
   3546 	     mode but are in in_directive mode, so buffer->need_line
   3547 	     is set, a CPP_EOF is peeked.  */
   3548 	  result->type = CPP_PRAGMA_EOL;
   3549 	  pfile->state.in_deferred_pragma = false;
   3550 	  if (!pfile->state.pragma_allow_expansion)
   3551 	    pfile->state.prevent_expansion--;
   3552 	  return result;
   3553 	}
   3554       if (!_cpp_get_fresh_line (pfile))
   3555 	{
   3556 	  result->type = CPP_EOF;
   3557 	  /* Not a real EOF in a directive or arg parsing -- we refuse
   3558   	     to advance to the next file now, and will once we're out
   3559   	     of those modes.  */
   3560 	  if (!pfile->state.in_directive && !pfile->state.parsing_args)
   3561 	    {
   3562 	      /* Tell the compiler the line number of the EOF token.  */
   3563 	      result->src_loc = pfile->line_table->highest_line;
   3564 	      result->flags = BOL;
   3565 	      /* Now pop the buffer that _cpp_get_fresh_line did not.  */
   3566 	      _cpp_pop_buffer (pfile);
   3567 	    }
   3568 	  return result;
   3569 	}
   3570       if (buffer != pfile->buffer)
   3571 	fallthrough_comment = false;
   3572       if (!pfile->keep_tokens)
   3573 	{
   3574 	  pfile->cur_run = &pfile->base_run;
   3575 	  result = pfile->base_run.base;
   3576 	  pfile->cur_token = result + 1;
   3577 	}
   3578       result->flags = BOL;
   3579       if (pfile->state.parsing_args == 2)
   3580 	result->flags |= PREV_WHITE;
   3581     }
   3582   buffer = pfile->buffer;
   3583  update_tokens_line:
   3584   result->src_loc = pfile->line_table->highest_line;
   3585 
   3586  skipped_white:
   3587   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
   3588       && !pfile->overlaid_buffer)
   3589     {
   3590       _cpp_process_line_notes (pfile, false);
   3591       result->src_loc = pfile->line_table->highest_line;
   3592     }
   3593   c = *buffer->cur++;
   3594 
   3595   if (pfile->forced_token_location)
   3596     result->src_loc = pfile->forced_token_location;
   3597   else
   3598     result->src_loc = linemap_position_for_column (pfile->line_table,
   3599 					  CPP_BUF_COLUMN (buffer, buffer->cur));
   3600 
   3601   switch (c)
   3602     {
   3603     case ' ': case '\t': case '\f': case '\v': case '\0':
   3604       result->flags |= PREV_WHITE;
   3605       skip_whitespace (pfile, c);
   3606       goto skipped_white;
   3607 
   3608     case '\n':
   3609       /* Increment the line, unless this is the last line ...  */
   3610       if (buffer->cur < buffer->rlimit
   3611 	  /* ... or this is a #include, (where _cpp_stack_file needs to
   3612 	     unwind by one line) ...  */
   3613 	  || (pfile->state.in_directive > 1
   3614 	      /* ... except traditional-cpp increments this elsewhere.  */
   3615 	      && !CPP_OPTION (pfile, traditional)))
   3616 	CPP_INCREMENT_LINE (pfile, 0);
   3617       buffer->need_line = true;
   3618       if (pfile->state.in_deferred_pragma)
   3619 	{
   3620 	  /* Produce the PRAGMA_EOL on this line.  File reading
   3621 	     ensures there is always a \n at end of the buffer, thus
   3622 	     in a deferred pragma we always see CPP_PRAGMA_EOL before
   3623 	     any CPP_EOF.  */
   3624 	  result->type = CPP_PRAGMA_EOL;
   3625 	  result->flags &= ~PREV_WHITE;
   3626 	  pfile->state.in_deferred_pragma = false;
   3627 	  if (!pfile->state.pragma_allow_expansion)
   3628 	    pfile->state.prevent_expansion--;
   3629 	  return result;
   3630 	}
   3631       goto fresh_line;
   3632 
   3633     case '0': case '1': case '2': case '3': case '4':
   3634     case '5': case '6': case '7': case '8': case '9':
   3635       {
   3636 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3637 	result->type = CPP_NUMBER;
   3638 	lex_number (pfile, &result->val.str, &nst);
   3639 	warn_about_normalization (pfile, result, &nst);
   3640 	break;
   3641       }
   3642 
   3643     case 'L':
   3644     case 'u':
   3645     case 'U':
   3646     case 'R':
   3647       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
   3648 	 wide strings or raw strings.  */
   3649       if (c == 'L' || CPP_OPTION (pfile, rliterals)
   3650 	  || (c != 'R' && CPP_OPTION (pfile, uliterals)))
   3651 	{
   3652 	  if ((*buffer->cur == '\'' && c != 'R')
   3653 	      || *buffer->cur == '"'
   3654 	      || (*buffer->cur == 'R'
   3655 		  && c != 'R'
   3656 		  && buffer->cur[1] == '"'
   3657 		  && CPP_OPTION (pfile, rliterals))
   3658 	      || (*buffer->cur == '8'
   3659 		  && c == 'u'
   3660 		  && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
   3661 				&& CPP_OPTION (pfile, utf8_char_literals)))
   3662 		      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
   3663 			  && CPP_OPTION (pfile, rliterals)))))
   3664 	    {
   3665 	      lex_string (pfile, result, buffer->cur - 1);
   3666 	      break;
   3667 	    }
   3668 	}
   3669       /* Fall through.  */
   3670 
   3671     case '_':
   3672     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
   3673     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
   3674     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
   3675     case 's': case 't':           case 'v': case 'w': case 'x':
   3676     case 'y': case 'z':
   3677     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
   3678     case 'G': case 'H': case 'I': case 'J': case 'K':
   3679     case 'M': case 'N': case 'O': case 'P': case 'Q':
   3680     case 'S': case 'T':           case 'V': case 'W': case 'X':
   3681     case 'Y': case 'Z':
   3682       result->type = CPP_NAME;
   3683       {
   3684 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3685 	result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
   3686 						&nst,
   3687 						&result->val.node.spelling);
   3688 	warn_about_normalization (pfile, result, &nst);
   3689       }
   3690 
   3691       /* Convert named operators to their proper types.  */
   3692       if (result->val.node.node->flags & NODE_OPERATOR)
   3693 	{
   3694 	  result->flags |= NAMED_OP;
   3695 	  result->type = (enum cpp_ttype) result->val.node.node->directive_index;
   3696 	}
   3697 
   3698       /* Signal FALLTHROUGH comment followed by another token.  */
   3699       if (fallthrough_comment)
   3700 	result->flags |= PREV_FALLTHROUGH;
   3701       break;
   3702 
   3703     case '\'':
   3704     case '"':
   3705       lex_string (pfile, result, buffer->cur - 1);
   3706       break;
   3707 
   3708     case '/':
   3709       /* A potential block or line comment.  */
   3710       comment_start = buffer->cur;
   3711       c = *buffer->cur;
   3712 
   3713       if (c == '*')
   3714 	{
   3715 	  if (_cpp_skip_block_comment (pfile))
   3716 	    cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
   3717 	}
   3718       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
   3719 	{
   3720 	  /* Don't warn for system headers.  */
   3721 	  if (_cpp_in_system_header (pfile))
   3722 	    ;
   3723 	  /* Warn about comments if pedantically GNUC89, and not
   3724 	     in system headers.  */
   3725 	  else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
   3726 		   && CPP_PEDANTIC (pfile)
   3727 		   && ! buffer->warned_cplusplus_comments)
   3728 	    {
   3729 	      if (cpp_error (pfile, CPP_DL_PEDWARN,
   3730 			     "C++ style comments are not allowed in ISO C90"))
   3731 		cpp_error (pfile, CPP_DL_NOTE,
   3732 			   "(this will be reported only once per input file)");
   3733 	      buffer->warned_cplusplus_comments = 1;
   3734 	    }
   3735 	  /* Or if specifically desired via -Wc90-c99-compat.  */
   3736 	  else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
   3737 		   && ! CPP_OPTION (pfile, cplusplus)
   3738 		   && ! buffer->warned_cplusplus_comments)
   3739 	    {
   3740 	      if (cpp_error (pfile, CPP_DL_WARNING,
   3741 			     "C++ style comments are incompatible with C90"))
   3742 		cpp_error (pfile, CPP_DL_NOTE,
   3743 			   "(this will be reported only once per input file)");
   3744 	      buffer->warned_cplusplus_comments = 1;
   3745 	    }
   3746 	  /* In C89/C94, C++ style comments are forbidden.  */
   3747 	  else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
   3748 		    || CPP_OPTION (pfile, lang) == CLK_STDC94))
   3749 	    {
   3750 	      /* But don't be confused about valid code such as
   3751 	         - // immediately followed by *,
   3752 		 - // in a preprocessing directive,
   3753 		 - // in an #if 0 block.  */
   3754 	      if (buffer->cur[1] == '*'
   3755 		  || pfile->state.in_directive
   3756 		  || pfile->state.skipping)
   3757 		{
   3758 		  result->type = CPP_DIV;
   3759 		  break;
   3760 		}
   3761 	      else if (! buffer->warned_cplusplus_comments)
   3762 		{
   3763 		  if (cpp_error (pfile, CPP_DL_ERROR,
   3764 				 "C++ style comments are not allowed in "
   3765 				 "ISO C90"))
   3766 		    cpp_error (pfile, CPP_DL_NOTE,
   3767 			       "(this will be reported only once per input "
   3768 			       "file)");
   3769 		  buffer->warned_cplusplus_comments = 1;
   3770 		}
   3771 	    }
   3772 	  if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
   3773 	    cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
   3774 	}
   3775       else if (c == '=')
   3776 	{
   3777 	  buffer->cur++;
   3778 	  result->type = CPP_DIV_EQ;
   3779 	  break;
   3780 	}
   3781       else
   3782 	{
   3783 	  result->type = CPP_DIV;
   3784 	  break;
   3785 	}
   3786 
   3787       if (fallthrough_comment_p (pfile, comment_start))
   3788 	fallthrough_comment = true;
   3789 
   3790       if (pfile->cb.comment)
   3791 	{
   3792 	  size_t len = pfile->buffer->cur - comment_start;
   3793 	  pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
   3794 			     len + 1);
   3795 	}
   3796 
   3797       if (!pfile->state.save_comments)
   3798 	{
   3799 	  result->flags |= PREV_WHITE;
   3800 	  goto update_tokens_line;
   3801 	}
   3802 
   3803       if (fallthrough_comment)
   3804 	result->flags |= PREV_FALLTHROUGH;
   3805 
   3806       /* Save the comment as a token in its own right.  */
   3807       save_comment (pfile, result, comment_start, c);
   3808       break;
   3809 
   3810     case '<':
   3811       if (pfile->state.angled_headers)
   3812 	{
   3813 	  lex_string (pfile, result, buffer->cur - 1);
   3814 	  if (result->type != CPP_LESS)
   3815 	    break;
   3816 	}
   3817 
   3818       result->type = CPP_LESS;
   3819       if (*buffer->cur == '=')
   3820 	{
   3821 	  buffer->cur++, result->type = CPP_LESS_EQ;
   3822 	  if (*buffer->cur == '>'
   3823 	      && CPP_OPTION (pfile, cplusplus)
   3824 	      && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
   3825 	    buffer->cur++, result->type = CPP_SPACESHIP;
   3826 	}
   3827       else if (*buffer->cur == '<')
   3828 	{
   3829 	  buffer->cur++;
   3830 	  IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
   3831 	}
   3832       else if (CPP_OPTION (pfile, digraphs))
   3833 	{
   3834 	  if (*buffer->cur == ':')
   3835 	    {
   3836 	      /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
   3837 		 three characters are <:: and the subsequent character
   3838 		 is neither : nor >, the < is treated as a preprocessor
   3839 		 token by itself".  */
   3840 	      if (CPP_OPTION (pfile, cplusplus)
   3841 		  && CPP_OPTION (pfile, lang) != CLK_CXX98
   3842 		  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
   3843 		  && buffer->cur[1] == ':'
   3844 		  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
   3845 		break;
   3846 
   3847 	      buffer->cur++;
   3848 	      result->flags |= DIGRAPH;
   3849 	      result->type = CPP_OPEN_SQUARE;
   3850 	    }
   3851 	  else if (*buffer->cur == '%')
   3852 	    {
   3853 	      buffer->cur++;
   3854 	      result->flags |= DIGRAPH;
   3855 	      result->type = CPP_OPEN_BRACE;
   3856 	    }
   3857 	}
   3858       break;
   3859 
   3860     case '>':
   3861       result->type = CPP_GREATER;
   3862       if (*buffer->cur == '=')
   3863 	buffer->cur++, result->type = CPP_GREATER_EQ;
   3864       else if (*buffer->cur == '>')
   3865 	{
   3866 	  buffer->cur++;
   3867 	  IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
   3868 	}
   3869       break;
   3870 
   3871     case '%':
   3872       result->type = CPP_MOD;
   3873       if (*buffer->cur == '=')
   3874 	buffer->cur++, result->type = CPP_MOD_EQ;
   3875       else if (CPP_OPTION (pfile, digraphs))
   3876 	{
   3877 	  if (*buffer->cur == ':')
   3878 	    {
   3879 	      buffer->cur++;
   3880 	      result->flags |= DIGRAPH;
   3881 	      result->type = CPP_HASH;
   3882 	      if (*buffer->cur == '%' && buffer->cur[1] == ':')
   3883 		buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
   3884 	    }
   3885 	  else if (*buffer->cur == '>')
   3886 	    {
   3887 	      buffer->cur++;
   3888 	      result->flags |= DIGRAPH;
   3889 	      result->type = CPP_CLOSE_BRACE;
   3890 	    }
   3891 	}
   3892       break;
   3893 
   3894     case '.':
   3895       result->type = CPP_DOT;
   3896       if (ISDIGIT (*buffer->cur))
   3897 	{
   3898 	  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3899 	  result->type = CPP_NUMBER;
   3900 	  lex_number (pfile, &result->val.str, &nst);
   3901 	  warn_about_normalization (pfile, result, &nst);
   3902 	}
   3903       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
   3904 	buffer->cur += 2, result->type = CPP_ELLIPSIS;
   3905       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
   3906 	buffer->cur++, result->type = CPP_DOT_STAR;
   3907       break;
   3908 
   3909     case '+':
   3910       result->type = CPP_PLUS;
   3911       if (*buffer->cur == '+')
   3912 	buffer->cur++, result->type = CPP_PLUS_PLUS;
   3913       else if (*buffer->cur == '=')
   3914 	buffer->cur++, result->type = CPP_PLUS_EQ;
   3915       break;
   3916 
   3917     case '-':
   3918       result->type = CPP_MINUS;
   3919       if (*buffer->cur == '>')
   3920 	{
   3921 	  buffer->cur++;
   3922 	  result->type = CPP_DEREF;
   3923 	  if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
   3924 	    buffer->cur++, result->type = CPP_DEREF_STAR;
   3925 	}
   3926       else if (*buffer->cur == '-')
   3927 	buffer->cur++, result->type = CPP_MINUS_MINUS;
   3928       else if (*buffer->cur == '=')
   3929 	buffer->cur++, result->type = CPP_MINUS_EQ;
   3930       break;
   3931 
   3932     case '&':
   3933       result->type = CPP_AND;
   3934       if (*buffer->cur == '&')
   3935 	buffer->cur++, result->type = CPP_AND_AND;
   3936       else if (*buffer->cur == '=')
   3937 	buffer->cur++, result->type = CPP_AND_EQ;
   3938       break;
   3939 
   3940     case '|':
   3941       result->type = CPP_OR;
   3942       if (*buffer->cur == '|')
   3943 	buffer->cur++, result->type = CPP_OR_OR;
   3944       else if (*buffer->cur == '=')
   3945 	buffer->cur++, result->type = CPP_OR_EQ;
   3946       break;
   3947 
   3948     case ':':
   3949       result->type = CPP_COLON;
   3950       if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
   3951 	buffer->cur++, result->type = CPP_SCOPE;
   3952       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
   3953 	{
   3954 	  buffer->cur++;
   3955 	  result->flags |= DIGRAPH;
   3956 	  result->type = CPP_CLOSE_SQUARE;
   3957 	}
   3958       break;
   3959 
   3960     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
   3961     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
   3962     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
   3963     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
   3964     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
   3965 
   3966     case '?': result->type = CPP_QUERY; break;
   3967     case '~': result->type = CPP_COMPL; break;
   3968     case ',': result->type = CPP_COMMA; break;
   3969     case '(': result->type = CPP_OPEN_PAREN; break;
   3970     case ')': result->type = CPP_CLOSE_PAREN; break;
   3971     case '[': result->type = CPP_OPEN_SQUARE; break;
   3972     case ']': result->type = CPP_CLOSE_SQUARE; break;
   3973     case '{': result->type = CPP_OPEN_BRACE; break;
   3974     case '}': result->type = CPP_CLOSE_BRACE; break;
   3975     case ';': result->type = CPP_SEMICOLON; break;
   3976 
   3977       /* @ is a punctuator in Objective-C.  */
   3978     case '@': result->type = CPP_ATSIGN; break;
   3979 
   3980     default:
   3981       {
   3982 	const uchar *base = --buffer->cur;
   3983 
   3984 	/* Check for an extended identifier ($ or UCN or UTF-8).  */
   3985 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
   3986 	if (forms_identifier_p (pfile, true, &nst))
   3987 	  {
   3988 	    result->type = CPP_NAME;
   3989 	    result->val.node.node = lex_identifier (pfile, base, true, &nst,
   3990 						    &result->val.node.spelling);
   3991 	    warn_about_normalization (pfile, result, &nst);
   3992 	    break;
   3993 	  }
   3994 
   3995 	/* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
   3996 	   single token.  */
   3997 	buffer->cur++;
   3998 	if (c >= utf8_signifier)
   3999 	  {
   4000 	    const uchar *pstr = base;
   4001 	    cppchar_t s;
   4002 	    if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
   4003 	      buffer->cur = pstr;
   4004 	  }
   4005 	create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
   4006 	break;
   4007       }
   4008 
   4009     }
   4010 
   4011   /* Potentially convert the location of the token to a range.  */
   4012   if (result->src_loc >= RESERVED_LOCATION_COUNT
   4013       && result->type != CPP_EOF)
   4014     {
   4015       /* Ensure that any line notes are processed, so that we have the
   4016 	 correct physical line/column for the end-point of the token even
   4017 	 when a logical line is split via one or more backslashes.  */
   4018       if (buffer->cur >= buffer->notes[buffer->cur_note].pos
   4019 	  && !pfile->overlaid_buffer)
   4020 	_cpp_process_line_notes (pfile, false);
   4021 
   4022       source_range tok_range;
   4023       tok_range.m_start = result->src_loc;
   4024       tok_range.m_finish
   4025 	= linemap_position_for_column (pfile->line_table,
   4026 				       CPP_BUF_COLUMN (buffer, buffer->cur));
   4027 
   4028       result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
   4029 					       result->src_loc,
   4030 					       tok_range, NULL);
   4031     }
   4032 
   4033   return result;
   4034 }
   4035 
   4036 /* An upper bound on the number of bytes needed to spell TOKEN.
   4037    Does not include preceding whitespace.  */
   4038 unsigned int
   4039 cpp_token_len (const cpp_token *token)
   4040 {
   4041   unsigned int len;
   4042 
   4043   switch (TOKEN_SPELL (token))
   4044     {
   4045     default:		len = 6;				break;
   4046     case SPELL_LITERAL:	len = token->val.str.len;		break;
   4047     case SPELL_IDENT:	len = NODE_LEN (token->val.node.node) * 10;	break;
   4048     }
   4049 
   4050   return len;
   4051 }
   4052 
/* Parse one UTF-8 sequence starting at NAME and place the equivalent
   \UXXXXXXXX escape (lower-case hex digits) in BUFFER.  Return the
   number of bytes read out of NAME.  There are always exactly 10 bytes
   written to BUFFER, and no terminating NUL is added.  Aborts if a
   continuation byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  int j;
  int ucn_len = 0;
  int ucn_len_c;
  unsigned t;
  unsigned long utf32;

  /* Compute the length of the UTF-8 sequence: one byte per leading
     1 bit of the first byte.  */
  for (t = *name; t & 0x80; t <<= 1)
    ucn_len++;

  /* The payload bits of the lead byte are those below the length
     marker.  */
  utf32 = *name & (0x7F >> ucn_len);
  for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
    {
      /* Each continuation byte contributes its low six bits.  */
      utf32 = (utf32 << 6) | (*++name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
	abort ();
    }

  *buffer++ = '\\';
  *buffer++ = 'U';
  /* Eight hex digits, most significant nibble first.  */
  for (j = 7; j >= 0; j--)
    *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
  return ucn_len;
}

   4087 /* Given a token TYPE corresponding to a digraph, return a pointer to
   4088    the spelling of the digraph.  */
   4089 static const unsigned char *
   4090 cpp_digraph2name (enum cpp_ttype type)
   4091 {
   4092   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
   4093 }
   4094 
   4095 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
   4096    The buffer must already contain the enough space to hold the
   4097    token's spelling.  Returns a pointer to the character after the
   4098    last character written.  */
   4099 unsigned char *
   4100 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
   4101 {
   4102   size_t i;
   4103   const unsigned char *name = NODE_NAME (ident);
   4104 
   4105   for (i = 0; i < NODE_LEN (ident); i++)
   4106     if (name[i] & ~0x7F)
   4107       {
   4108 	i += utf8_to_ucn (buffer, name + i) - 1;
   4109 	buffer += 10;
   4110       }
   4111     else
   4112       *buffer++ = name[i];
   4113 
   4114   return buffer;
   4115 }
   4116 
   4117 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
   4118    already contain the enough space to hold the token's spelling.
   4119    Returns a pointer to the character after the last character written.
   4120    FORSTRING is true if this is to be the spelling after translation
   4121    phase 1 (with the original spelling of extended identifiers), false
   4122    if extended identifiers should always be written using UCNs (there is
   4123    no option for always writing them in the internal UTF-8 form).
   4124    FIXME: Would be nice if we didn't need the PFILE argument.  */
   4125 unsigned char *
   4126 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
   4127 		 unsigned char *buffer, bool forstring)
   4128 {
   4129   switch (TOKEN_SPELL (token))
   4130     {
   4131     case SPELL_OPERATOR:
   4132       {
   4133 	const unsigned char *spelling;
   4134 	unsigned char c;
   4135 
   4136 	if (token->flags & DIGRAPH)
   4137 	  spelling = cpp_digraph2name (token->type);
   4138 	else if (token->flags & NAMED_OP)
   4139 	  goto spell_ident;
   4140 	else
   4141 	  spelling = TOKEN_NAME (token);
   4142 
   4143 	while ((c = *spelling++) != '\0')
   4144 	  *buffer++ = c;
   4145       }
   4146       break;
   4147 
   4148     spell_ident:
   4149     case SPELL_IDENT:
   4150       if (forstring)
   4151 	{
   4152 	  memcpy (buffer, NODE_NAME (token->val.node.spelling),
   4153 		  NODE_LEN (token->val.node.spelling));
   4154 	  buffer += NODE_LEN (token->val.node.spelling);
   4155 	}
   4156       else
   4157 	buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
   4158       break;
   4159 
   4160     case SPELL_LITERAL:
   4161       memcpy (buffer, token->val.str.text, token->val.str.len);
   4162       buffer += token->val.str.len;
   4163       break;
   4164 
   4165     case SPELL_NONE:
   4166       cpp_error (pfile, CPP_DL_ICE,
   4167 		 "unspellable token %s", TOKEN_NAME (token));
   4168       break;
   4169     }
   4170 
   4171   return buffer;
   4172 }
   4173 
   4174 /* Returns TOKEN spelt as a null-terminated string.  The string is
   4175    freed when the reader is destroyed.  Useful for diagnostics.  */
   4176 unsigned char *
   4177 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
   4178 {
   4179   unsigned int len = cpp_token_len (token) + 1;
   4180   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
   4181 
   4182   end = cpp_spell_token (pfile, token, start, false);
   4183   end[0] = '\0';
   4184 
   4185   return start;
   4186 }
   4187 
   4188 /* Returns a pointer to a string which spells the token defined by
   4189    TYPE and FLAGS.  Used by C front ends, which really should move to
   4190    using cpp_token_as_text.  */
   4191 const char *
   4192 cpp_type2name (enum cpp_ttype type, unsigned char flags)
   4193 {
   4194   if (flags & DIGRAPH)
   4195     return (const char *) cpp_digraph2name (type);
   4196   else if (flags & NAMED_OP)
   4197     return cpp_named_operator2name (type);
   4198 
   4199   return (const char *) token_spellings[type].name;
   4200 }
   4201 
   4202 /* Writes the spelling of token to FP, without any preceding space.
   4203    Separated from cpp_spell_token for efficiency - to avoid stdio
   4204    double-buffering.  */
   4205 void
   4206 cpp_output_token (const cpp_token *token, FILE *fp)
   4207 {
   4208   switch (TOKEN_SPELL (token))
   4209     {
   4210     case SPELL_OPERATOR:
   4211       {
   4212 	const unsigned char *spelling;
   4213 	int c;
   4214 
   4215 	if (token->flags & DIGRAPH)
   4216 	  spelling = cpp_digraph2name (token->type);
   4217 	else if (token->flags & NAMED_OP)
   4218 	  goto spell_ident;
   4219 	else
   4220 	  spelling = TOKEN_NAME (token);
   4221 
   4222 	c = *spelling;
   4223 	do
   4224 	  putc (c, fp);
   4225 	while ((c = *++spelling) != '\0');
   4226       }
   4227       break;
   4228 
   4229     spell_ident:
   4230     case SPELL_IDENT:
   4231       {
   4232 	size_t i;
   4233 	const unsigned char * name = NODE_NAME (token->val.node.node);
   4234 
   4235 	for (i = 0; i < NODE_LEN (token->val.node.node); i++)
   4236 	  if (name[i] & ~0x7F)
   4237 	    {
   4238 	      unsigned char buffer[10];
   4239 	      i += utf8_to_ucn (buffer, name + i) - 1;
   4240 	      fwrite (buffer, 1, 10, fp);
   4241 	    }
   4242 	  else
   4243 	    fputc (NODE_NAME (token->val.node.node)[i], fp);
   4244       }
   4245       break;
   4246 
   4247     case SPELL_LITERAL:
   4248       if (token->type == CPP_HEADER_NAME)
   4249 	fputc ('"', fp);
   4250       fwrite (token->val.str.text, 1, token->val.str.len, fp);
   4251       if (token->type == CPP_HEADER_NAME)
   4252 	fputc ('"', fp);
   4253       break;
   4254 
   4255     case SPELL_NONE:
   4256       /* An error, most probably.  */
   4257       break;
   4258     }
   4259 }
   4260 
   4261 /* Compare two tokens.  */
   4262 int
   4263 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
   4264 {
   4265   if (a->type == b->type && a->flags == b->flags)
   4266     switch (TOKEN_SPELL (a))
   4267       {
   4268       default:			/* Keep compiler happy.  */
   4269       case SPELL_OPERATOR:
   4270 	/* token_no is used to track where multiple consecutive ##
   4271 	   tokens were originally located.  */
   4272 	return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
   4273       case SPELL_NONE:
   4274 	return (a->type != CPP_MACRO_ARG
   4275 		|| (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
   4276 		    && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
   4277       case SPELL_IDENT:
   4278 	return (a->val.node.node == b->val.node.node
   4279 		&& a->val.node.spelling == b->val.node.spelling);
   4280       case SPELL_LITERAL:
   4281 	return (a->val.str.len == b->val.str.len
   4282 		&& !memcmp (a->val.str.text, b->val.str.text,
   4283 			    a->val.str.len));
   4284       }
   4285 
   4286   return 0;
   4287 }
   4288 
   4289 /* Returns nonzero if a space should be inserted to avoid an
   4290    accidental token paste for output.  For simplicity, it is
   4291    conservative, and occasionally advises a space where one is not
   4292    needed, e.g. "." and ".2".  */
   4293 int
   4294 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
   4295 		 const cpp_token *token2)
   4296 {
   4297   enum cpp_ttype a = token1->type, b = token2->type;
   4298   cppchar_t c;
   4299 
   4300   if (token1->flags & NAMED_OP)
   4301     a = CPP_NAME;
   4302   if (token2->flags & NAMED_OP)
   4303     b = CPP_NAME;
   4304 
   4305   c = EOF;
   4306   if (token2->flags & DIGRAPH)
   4307     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
   4308   else if (token_spellings[b].category == SPELL_OPERATOR)
   4309     c = token_spellings[b].name[0];
   4310 
   4311   /* Quickly get everything that can paste with an '='.  */
   4312   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
   4313     return 1;
   4314 
   4315   switch (a)
   4316     {
   4317     case CPP_GREATER:	return c == '>';
   4318     case CPP_LESS:	return c == '<' || c == '%' || c == ':';
   4319     case CPP_PLUS:	return c == '+';
   4320     case CPP_MINUS:	return c == '-' || c == '>';
   4321     case CPP_DIV:	return c == '/' || c == '*'; /* Comments.  */
   4322     case CPP_MOD:	return c == ':' || c == '>';
   4323     case CPP_AND:	return c == '&';
   4324     case CPP_OR:	return c == '|';
   4325     case CPP_COLON:	return c == ':' || c == '>';
   4326     case CPP_DEREF:	return c == '*';
   4327     case CPP_DOT:	return c == '.' || c == '%' || b == CPP_NUMBER;
   4328     case CPP_HASH:	return c == '#' || c == '%'; /* Digraph form.  */
   4329     case CPP_PRAGMA:
   4330     case CPP_NAME:	return ((b == CPP_NUMBER
   4331 				 && name_p (pfile, &token2->val.str))
   4332 				|| b == CPP_NAME
   4333 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
   4334     case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
   4335 				|| b == CPP_CHAR
   4336 				|| c == '.' || c == '+' || c == '-');
   4337 				      /* UCNs */
   4338     case CPP_OTHER:	return ((token1->val.str.text[0] == '\\'
   4339 				 && b == CPP_NAME)
   4340 				|| (CPP_OPTION (pfile, objc)
   4341 				    && token1->val.str.text[0] == '@'
   4342 				    && (b == CPP_NAME || b == CPP_STRING)));
   4343     case CPP_LESS_EQ:	return c == '>';
   4344     case CPP_STRING:
   4345     case CPP_WSTRING:
   4346     case CPP_UTF8STRING:
   4347     case CPP_STRING16:
   4348     case CPP_STRING32:	return (CPP_OPTION (pfile, user_literals)
   4349 				&& (b == CPP_NAME
   4350 				    || (TOKEN_SPELL (token2) == SPELL_LITERAL
   4351 					&& ISIDST (token2->val.str.text[0]))));
   4352 
   4353     default:		break;
   4354     }
   4355 
   4356   return 0;
   4357 }
   4358 
   4359 /* Output all the remaining tokens on the current line, and a newline
   4360    character, to FP.  Leading whitespace is removed.  If there are
   4361    macros, special token padding is not performed.  */
   4362 void
   4363 cpp_output_line (cpp_reader *pfile, FILE *fp)
   4364 {
   4365   const cpp_token *token;
   4366 
   4367   token = cpp_get_token (pfile);
   4368   while (token->type != CPP_EOF)
   4369     {
   4370       cpp_output_token (token, fp);
   4371       token = cpp_get_token (pfile);
   4372       if (token->flags & PREV_WHITE)
   4373 	putc (' ', fp);
   4374     }
   4375 
   4376   putc ('\n', fp);
   4377 }
   4378 
   4379 /* Return a string representation of all the remaining tokens on the
   4380    current line.  The result is allocated using xmalloc and must be
   4381    freed by the caller.  */
   4382 unsigned char *
   4383 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
   4384 {
   4385   const cpp_token *token;
   4386   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
   4387   unsigned int alloced = 120 + out;
   4388   unsigned char *result = (unsigned char *) xmalloc (alloced);
   4389 
   4390   /* If DIR_NAME is empty, there are no initial contents.  */
   4391   if (dir_name)
   4392     {
   4393       sprintf ((char *) result, "#%s ", dir_name);
   4394       out += 2;
   4395     }
   4396 
   4397   token = cpp_get_token (pfile);
   4398   while (token->type != CPP_EOF)
   4399     {
   4400       unsigned char *last;
   4401       /* Include room for a possible space and the terminating nul.  */
   4402       unsigned int len = cpp_token_len (token) + 2;
   4403 
   4404       if (out + len > alloced)
   4405 	{
   4406 	  alloced *= 2;
   4407 	  if (out + len > alloced)
   4408 	    alloced = out + len;
   4409 	  result = (unsigned char *) xrealloc (result, alloced);
   4410 	}
   4411 
   4412       last = cpp_spell_token (pfile, token, &result[out], 0);
   4413       out = last - result;
   4414 
   4415       token = cpp_get_token (pfile);
   4416       if (token->flags & PREV_WHITE)
   4417 	result[out++] = ' ';
   4418     }
   4419 
   4420   result[out] = '\0';
   4421   return result;
   4422 }
   4423 
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest data area new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will reuse for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Both
   macro arguments are parenthesized so that any expression may be
   passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
   4438 
   4439 /* Create a new allocation buffer.  Place the control block at the end
   4440    of the buffer, so that buffer overflows will cause immediate chaos.  */
   4441 static _cpp_buff *
   4442 new_buff (size_t len)
   4443 {
   4444   _cpp_buff *result;
   4445   unsigned char *base;
   4446 
   4447   if (len < MIN_BUFF_SIZE)
   4448     len = MIN_BUFF_SIZE;
   4449   len = CPP_ALIGN (len);
   4450 
   4451 #ifdef ENABLE_VALGRIND_ANNOTATIONS
   4452   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
   4453      struct first.  */
   4454   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
   4455   base = XNEWVEC (unsigned char, len + slen);
   4456   result = (_cpp_buff *) base;
   4457   base += slen;
   4458 #else
   4459   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
   4460   result = (_cpp_buff *) (base + len);
   4461 #endif
   4462   result->base = base;
   4463   result->cur = base;
   4464   result->limit = base + len;
   4465   result->next = NULL;
   4466   return result;
   4467 }
   4468 
   4469 /* Place a chain of unwanted allocation buffers on the free list.  */
   4470 void
   4471 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
   4472 {
   4473   _cpp_buff *end = buff;
   4474 
   4475   while (end->next)
   4476     end = end->next;
   4477   end->next = pfile->free_buffs;
   4478   pfile->free_buffs = buff;
   4479 }
   4480 
   4481 /* Return a free buffer of size at least MIN_SIZE.  */
   4482 _cpp_buff *
   4483 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
   4484 {
   4485   _cpp_buff *result, **p;
   4486 
   4487   for (p = &pfile->free_buffs;; p = &(*p)->next)
   4488     {
   4489       size_t size;
   4490 
   4491       if (*p == NULL)
   4492 	return new_buff (min_size);
   4493       result = *p;
   4494       size = result->limit - result->base;
   4495       /* Return a buffer that's big enough, but don't waste one that's
   4496          way too big.  */
   4497       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
   4498 	break;
   4499     }
   4500 
   4501   *p = result->next;
   4502   result->next = NULL;
   4503   result->cur = result->base;
   4504   return result;
   4505 }
   4506 
   4507 /* Creates a new buffer with enough space to hold the uncommitted
   4508    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
   4509    the excess bytes to the new buffer.  Chains the new buffer after
   4510    BUFF, and returns the new buffer.  */
   4511 _cpp_buff *
   4512 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
   4513 {
   4514   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
   4515   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
   4516 
   4517   buff->next = new_buff;
   4518   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
   4519   return new_buff;
   4520 }
   4521 
   4522 /* Creates a new buffer with enough space to hold the uncommitted
   4523    remaining bytes of the buffer pointed to by BUFF, and at least
   4524    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
   4525    Chains the new buffer before the buffer pointed to by BUFF, and
   4526    updates the pointer to point to the new buffer.  */
   4527 void
   4528 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
   4529 {
   4530   _cpp_buff *new_buff, *old_buff = *pbuff;
   4531   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
   4532 
   4533   new_buff = _cpp_get_buff (pfile, size);
   4534   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
   4535   new_buff->next = old_buff;
   4536   *pbuff = new_buff;
   4537 }
   4538 
   4539 /* Free a chain of buffers starting at BUFF.  */
   4540 void
   4541 _cpp_free_buff (_cpp_buff *buff)
   4542 {
   4543   _cpp_buff *next;
   4544 
   4545   for (; buff; buff = next)
   4546     {
   4547       next = buff->next;
   4548 #ifdef ENABLE_VALGRIND_ANNOTATIONS
   4549       free (buff);
   4550 #else
   4551       free (buff->base);
   4552 #endif
   4553     }
   4554 }
   4555 
   4556 /* Allocate permanent, unaligned storage of length LEN.  */
   4557 unsigned char *
   4558 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
   4559 {
   4560   _cpp_buff *buff = pfile->u_buff;
   4561   unsigned char *result = buff->cur;
   4562 
   4563   if (len > (size_t) (buff->limit - result))
   4564     {
   4565       buff = _cpp_get_buff (pfile, len);
   4566       buff->next = pfile->u_buff;
   4567       pfile->u_buff = buff;
   4568       result = buff->cur;
   4569     }
   4570 
   4571   buff->cur = result + len;
   4572   return result;
   4573 }
   4574 
   4575 /* Allocate permanent, unaligned storage of length LEN from a_buff.
   4576    That buffer is used for growing allocations when saving macro
   4577    replacement lists in a #define, and when parsing an answer to an
   4578    assertion in #assert, #unassert or #if (and therefore possibly
   4579    whilst expanding macros).  It therefore must not be used by any
   4580    code that they might call: specifically the lexer and the guts of
   4581    the macro expander.
   4582 
   4583    All existing other uses clearly fit this restriction: storing
   4584    registered pragmas during initialization.  */
   4585 unsigned char *
   4586 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
   4587 {
   4588   _cpp_buff *buff = pfile->a_buff;
   4589   unsigned char *result = buff->cur;
   4590 
   4591   if (len > (size_t) (buff->limit - result))
   4592     {
   4593       buff = _cpp_get_buff (pfile, len);
   4594       buff->next = pfile->a_buff;
   4595       pfile->a_buff = buff;
   4596       result = buff->cur;
   4597     }
   4598 
   4599   buff->cur = result + len;
   4600   return result;
   4601 }
   4602 
   4603 /* Commit or allocate storage from a buffer.  */
   4604 
   4605 void *
   4606 _cpp_commit_buff (cpp_reader *pfile, size_t size)
   4607 {
   4608   void *ptr = BUFF_FRONT (pfile->a_buff);
   4609 
   4610   if (pfile->hash_table->alloc_subobject)
   4611     {
   4612       void *copy = pfile->hash_table->alloc_subobject (size);
   4613       memcpy (copy, ptr, size);
   4614       ptr = copy;
   4615     }
   4616   else
   4617     BUFF_FRONT (pfile->a_buff) += size;
   4618 
   4619   return ptr;
   4620 }
   4621 
   4622 /* Say which field of TOK is in use.  */
   4623 
   4624 enum cpp_token_fld_kind
   4625 cpp_token_val_index (const cpp_token *tok)
   4626 {
   4627   switch (TOKEN_SPELL (tok))
   4628     {
   4629     case SPELL_IDENT:
   4630       return CPP_TOKEN_FLD_NODE;
   4631     case SPELL_LITERAL:
   4632       return CPP_TOKEN_FLD_STR;
   4633     case SPELL_OPERATOR:
   4634       /* Operands which were originally spelled as ident keep around
   4635          the node for the exact spelling.  */
   4636       if (tok->flags & NAMED_OP)
   4637 	return CPP_TOKEN_FLD_NODE;
   4638       else if (tok->type == CPP_PASTE)
   4639 	return CPP_TOKEN_FLD_TOKEN_NO;
   4640       else
   4641 	return CPP_TOKEN_FLD_NONE;
   4642     case SPELL_NONE:
   4643       if (tok->type == CPP_MACRO_ARG)
   4644 	return CPP_TOKEN_FLD_ARG_NO;
   4645       else if (tok->type == CPP_PADDING)
   4646 	return CPP_TOKEN_FLD_SOURCE;
   4647       else if (tok->type == CPP_PRAGMA)
   4648 	return CPP_TOKEN_FLD_PRAGMA;
   4649       /* fall through */
   4650     default:
   4651       return CPP_TOKEN_FLD_NONE;
   4652     }
   4653 }
   4654 
/* All tokens lexed in R after calling this function will be forced to
   have their location_t to be LOC, until
   cpp_stop_forcing_token_locations is called for R.  This function
   itself only records LOC in R->forced_token_location.  */

void
cpp_force_token_locations (cpp_reader *r, location_t loc)
{
  r->forced_token_location = loc;
}
   4664 
/* Go back to assigning locations naturally for lexed tokens, by
   resetting R->forced_token_location to zero.  */

void
cpp_stop_forcing_token_locations (cpp_reader *r)
{
  r->forced_token_location = 0;
}
   4672 
/* PEEK addresses a backslash.  If it escapes a line ending ("\n",
   "\r" or "\r\n"), return the address of the first character after
   that line ending, and keep going while that character is itself a
   backslash escaping a line ending.  The position is only advanced
   when the new position is strictly below LIMIT; otherwise, or when
   the backslash escapes nothing, PEEK is returned as it stands.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      /* Length of the backslash plus the line ending it escapes, or
	 zero when it does not escape a line ending.  */
      unsigned int skip = 0;

      if (__builtin_expect (peek[1] == '\n', true))
	skip = 2;
      else if (__builtin_expect (peek[1] == '\r', false))
	skip = peek[2] == '\n' ? 3 : 2;

      if (skip == 0)
	return peek;

      const unsigned char *after = peek + skip;
      if (!__builtin_expect (after < limit, true))
	return peek;

      peek = after;

      /* The user might be perverse and escape several line endings
	 in a row.  */
      if (*peek != '\\')
	return peek;
    }
}
   4702 
   4703 static const unsigned char *
   4704 do_peek_next (const unsigned char *peek, const unsigned char *limit)
   4705 {
   4706   if (__builtin_expect (*peek == '\\', false))
   4707     peek = do_peek_backslash (peek, limit);
   4708   return peek;
   4709 }
   4710 
/* Step one character backwards from PEEK, never going below BOUND,
   and return the address of that previous character.  Returns NULL if
   PEEK is already at BOUND.

   If the previous character is a line ending ('\n', a lone '\r', or
   the '\n' of a "\r\n" pair) that is immediately preceded by a
   backslash, the escaped line ending is stepped over and the search
   continues from the backslash.  A line ending that is not escaped,
   or whose would-be backslash lies below BOUND, is returned as is.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  /* Only a line ending can be the tail of an escaped newline.  The
     carriage return test must use '\r': comparing against the letter
     'r' would treat a backslash followed by 'r' as a line
     continuation and miss a backslash followed by a lone CR.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
	return peek;
      /* Offset from PEEK of the character that would have to be the
	 escaping backslash.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
	{
	  /* "\r\n": the backslash, if any, is one further back.  */
	  if (peek + ix == bound)
	    return peek;
	  ix--;
	}

      if (peek[ix] == '\\')
	return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
   4739 
   4740 /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
   4741    space.  Otherwise return NULL.  */
   4742 
   4743 static const unsigned char *
   4744 do_peek_ident (const char *match, const unsigned char *peek,
   4745 	       const unsigned char *limit)
   4746 {
   4747   for (; *++match; peek++)
   4748     if (*peek != *match)
   4749       {
   4750 	peek = do_peek_next (peek, limit);
   4751 	if (*peek != *match)
   4752 	  return NULL;
   4753       }
   4754 
   4755   /* Must now not be looking at an identifier char.  */
   4756   peek = do_peek_next (peek, limit);
   4757   if (ISIDNUM (*peek))
   4758     return NULL;
   4759 
   4760   /* Skip control-line whitespace.  */
   4761  ws:
   4762   while (*peek == ' ' || *peek == '\t')
   4763     peek++;
   4764   if (__builtin_expect (*peek == '\\', false))
   4765     {
   4766       peek = do_peek_backslash (peek, limit);
   4767       if (*peek != '\\')
   4768 	goto ws;
   4769     }
   4770 
   4771   return peek;
   4772 }
   4773 
/* Are we looking at a module control line starting at PEEK - 1?

   C is the character at PEEK[-1], which the caller has already
   consumed; only 'e', 'i', '_' and 'm' can begin a match.  LIMIT is
   handed on to the do_peek_* helpers.  PFILE supplies the rliterals
   and digraphs options.

   The keywords looked for are "export" followed by "import" or
   "module", or "import", "__import" or "module" on their own.  After
   the keyword(s) and any white space, the next character must be an
   acceptable start of what may follow: an identifier or ':' in either
   case, '<' or '"' for an import only, and ';' for a module only.
   Returns true if all of that holds, false otherwise.  */

static bool
do_peek_module (cpp_reader *pfile, unsigned char c,
		const unsigned char *peek, const unsigned char *limit)
{
  /* Set once one of the import spellings has been matched.  */
  bool import = false;

  /* In each branch below the second character is tested first (a
     backslash being allowed because of do_peek_next), before
     do_peek_ident is called to match the whole keyword.  */
  if (__builtin_expect (c == 'e', false))
    {
      if (!((peek[0] == 'x' || peek[0] == '\\')
	    && (peek = do_peek_ident ("export", peek, limit))))
	return false;

      /* export, peek for import or module.  No need to peek __import
	 here.  */
      if (peek[0] == 'i')
	{
	  if (!((peek[1] == 'm' || peek[1] == '\\')
		&& (peek = do_peek_ident ("import", peek + 1, limit))))
	    return false;
	  import = true;
	}
      else if (peek[0] == 'm')
	{
	  if (!((peek[1] == 'o' || peek[1] == '\\')
		&& (peek = do_peek_ident ("module", peek + 1, limit))))
	    return false;
	}
      else
	return false;
    }
  else if (__builtin_expect (c == 'i', false))
    {
      if (!((peek[0] == 'm' || peek[0] == '\\')
	    && (peek = do_peek_ident ("import", peek, limit))))
	return false;
      import = true;
    }
  else if (__builtin_expect (c == '_', false))
    {
      /* Needed for translated includes.   */
      if (!((peek[0] == '_' || peek[0] == '\\')
	    && (peek = do_peek_ident ("__import", peek, limit))))
	return false;
      import = true;
    }
  else if (__builtin_expect (c == 'm', false))
    {
      if (!((peek[0] == 'o' || peek[0] == '\\')
	    && (peek = do_peek_ident ("module", peek, limit))))
	return false;
    }
  else
    return false;

  /* Peek the next character to see if it's good enough.  We'll be at
     the first non-whitespace char, including skipping an escaped
     newline.  */
  /* ... import followed by identifier, ':', '<' or header-name
     preprocessing tokens, or module followed by identifier, ':' or
     ';' preprocessing tokens.  */
  unsigned char p = *peek++;

  /* A character literal is ... single quotes, ... optionally preceded
     by u8, u, U, or L */
  /* A string-literal is a ... double quotes, optionally prefixed by
     R, u8, u8R, u, uR, U, UR, L, or LR */
  /* The branches for 'u', 'U'/'L' and 'R' reject a literal prefix
     directly followed by a quote; anything else starting with those
     letters is taken to be an identifier.  */
  if (p == 'u')
    {
      peek = do_peek_next (peek, limit);
      if (*peek == '8')
	{
	  /* Possibly a u8 prefix: step over the '8'.  */
	  peek++;
	  goto peek_u8;
	}
      goto peek_u;
    }
  else if (p == 'U' || p == 'L')
    {
    peek_u8:
      peek = do_peek_next (peek, limit);
    peek_u:
      if (*peek == '\"' || *peek == '\'')
	return false;

      /* NOTE(review): PEEK still addresses the 'R' when jumping from
	 here, whereas the p == 'R' path below arrives with PEEK
	 already past it -- confirm the '"' test at peek_R behaves as
	 intended on this path.  */
      if (*peek == 'R')
	goto peek_R;
      /* Identifier. Ok.  */
    }
  else if (p == 'R')
    {
    peek_R:
      /* Raw string prefixes only matter when rliterals is enabled.  */
      if (CPP_OPTION (pfile, rliterals))
	{
	  peek = do_peek_next (peek, limit);
	  if (*peek == '\"')
	    return false;
	}
      /* Identifier. Ok.  */
    }
  /* When the upper-case letters are contiguous in the execution
     character set, test the ranges directly; otherwise use ISIDST.  */
  else if ('Z' - 'A' == 25
	   ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
	   : ISIDST (p))
    {
      /* Identifier.  Ok. */
    }
  else if (p == '<')
    {
      /* Maybe angle header, ok for import.  Reject
	 '<=', '<<' digraph:'<:'.  */
      if (!import)
	return false;
      peek = do_peek_next (peek, limit);
      if (*peek == '=' || *peek == '<'
	  || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
	return false;
    }
  else if (p == ';')
    {
      /* SEMICOLON, ok for module.  */
      if (import)
	return false;
    }
  else if (p == '"')
    {
      /* STRING, ok for import.  */
      if (!import)
	return false;
    }
  else if (p == ':')
    {
      /* Maybe COLON, ok.  Reject '::', digraph:':>'.  */
      peek = do_peek_next (peek, limit);
      if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
	return false;
    }
  else
    /* FIXME: Detect a unicode character, excluding those not
       permitted as the initial character. [lex.name]/1.  I presume
       we need to check the \[uU] spellings, and directly using
       Unicode in say UTF8 form?  Or perhaps we do the phase-1
       conversion of UTF8 to universal-character-names?  */
    return false;

  return true;
}
   4921 
   4922 /* Directives-only scanning.  Somewhat more relaxed than correct
   4923    parsing -- some ill-formed programs will not be rejected.  */
   4924 
   4925 void
   4926 cpp_directive_only_process (cpp_reader *pfile,
   4927 			    void *data,
   4928 			    void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
   4929 {
   4930   bool module_p = CPP_OPTION (pfile, module_directives);
   4931 
   4932   do
   4933     {
   4934     restart:
   4935       /* Buffer initialization, but no line cleaning. */
   4936       cpp_buffer *buffer = pfile->buffer;
   4937       buffer->cur_note = buffer->notes_used = 0;
   4938       buffer->cur = buffer->line_base = buffer->next_line;
   4939       buffer->need_line = false;
   4940       /* Files always end in a newline or carriage return.  We rely on this for
   4941 	 character peeking safety.  */
   4942       gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
   4943 
   4944       const unsigned char *base = buffer->cur;
   4945       unsigned line_count = 0;
   4946       const unsigned char *line_start = base;
   4947 
   4948       bool bol = true;
   4949       bool raw = false;
   4950 
   4951       const unsigned char *lwm = base;
   4952       for (const unsigned char *pos = base, *limit = buffer->rlimit;
   4953 	   pos < limit;)
   4954 	{
   4955 	  unsigned char c = *pos++;
   4956 	  /* This matches the switch in _cpp_lex_direct.  */
   4957 	  switch (c)
   4958 	    {
   4959 	    case ' ': case '\t': case '\f': case '\v':
   4960 	      /* Whitespace, do nothing.  */
   4961 	      break;
   4962 
   4963 	    case '\r': /* MAC line ending, or Windows \r\n  */
   4964 	      if (*pos == '\n')
   4965 		pos++;
   4966 	      /* FALLTHROUGH */
   4967 
   4968 	    case '\n':
   4969 	      bol = true;
   4970 
   4971 	    next_line:
   4972 	      CPP_INCREMENT_LINE (pfile, 0);
   4973 	      line_count++;
   4974 	      line_start = pos;
   4975 	      break;
   4976 
   4977 	    case '\\':
   4978 	      /* <backslash><newline> is removed, and doesn't undo any
   4979 		 preceeding escape or whatnot.  */
   4980 	      if (*pos == '\n')
   4981 		{
   4982 		  pos++;
   4983 		  goto next_line;
   4984 		}
   4985 	      else if (*pos == '\r')
   4986 		{
   4987 		  if (pos[1] == '\n')
   4988 		    pos++;
   4989 		  pos++;
   4990 		  goto next_line;
   4991 		}
   4992 	      goto dflt;
   4993 
   4994 	    case '#':
   4995 	      if (bol)
   4996 		{
   4997 		  /* Line directive.  */
   4998 		  if (pos - 1 > base && !pfile->state.skipping)
   4999 		    cb (pfile, CPP_DO_print, data,
   5000 			line_count, base, pos - 1 - base);
   5001 
   5002 		  /* Prep things for directive handling. */
   5003 		  buffer->next_line = pos;
   5004 		  buffer->need_line = true;
   5005 		  bool ok = _cpp_get_fresh_line (pfile);
   5006 		  gcc_checking_assert (ok);
   5007 
   5008 		  /* Ensure proper column numbering for generated
   5009 		     error messages. */
   5010 		  buffer->line_base -= pos - line_start;
   5011 
   5012 		  _cpp_handle_directive (pfile, line_start + 1 != pos);
   5013 
   5014 		  /* Sanitize the line settings.  Duplicate #include's can
   5015 		     mess things up. */
   5016 		  // FIXME: Necessary?
   5017 		  pfile->line_table->highest_location
   5018 		    = pfile->line_table->highest_line;
   5019 
   5020 		  if (!pfile->state.skipping
   5021 		      && pfile->buffer->next_line < pfile->buffer->rlimit)
   5022 		    cb (pfile, CPP_DO_location, data,
   5023 			pfile->line_table->highest_line);
   5024 
   5025 		  goto restart;
   5026 		}
   5027 	      goto dflt;
   5028 
   5029 	    case '/':
   5030 	      {
   5031 		const unsigned char *peek = do_peek_next (pos, limit);
   5032 		if (!(*peek == '/' || *peek == '*'))
   5033 		  goto dflt;
   5034 
   5035 		/* Line or block comment  */
   5036 		bool is_block = *peek == '*';
   5037 		bool star = false;
   5038 		bool esc = false;
   5039 		location_t sloc
   5040 		  = linemap_position_for_column (pfile->line_table,
   5041 						 pos - line_start);
   5042 
   5043 		while (pos < limit)
   5044 		  {
   5045 		    char c = *pos++;
   5046 		    switch (c)
   5047 		      {
   5048 		      case '\\':
   5049 			esc = true;
   5050 			break;
   5051 
   5052 		      case '\r':
   5053 			if (*pos == '\n')
   5054 			  pos++;
   5055 			/* FALLTHROUGH  */
   5056 
   5057 		      case '\n':
   5058 			{
   5059 			  CPP_INCREMENT_LINE (pfile, 0);
   5060 			  line_count++;
   5061 			  line_start = pos;
   5062 			  if (!esc && !is_block)
   5063 			    {
   5064 			      bol = true;
   5065 			      goto done_comment;
   5066 			    }
   5067 			}
   5068 			if (!esc)
   5069 			  star = false;
   5070 			esc = false;
   5071 			break;
   5072 
   5073 		      case '*':
   5074 			if (pos > peek)
   5075 			  star = is_block;
   5076 			esc = false;
   5077 			break;
   5078 
   5079 		      case '/':
   5080 			if (star)
   5081 			  goto done_comment;
   5082 			/* FALLTHROUGH  */
   5083 
   5084 		      default:
   5085 			star = false;
   5086 			esc = false;
   5087 			break;
   5088 		      }
   5089 		  }
   5090 		if (pos < limit || is_block)
   5091 		  cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
   5092 				       "unterminated comment");
   5093 	      done_comment:
   5094 		lwm = pos;
   5095 		break;
   5096 	      }
   5097 
   5098 	    case '\'':
   5099 	      if (!CPP_OPTION (pfile, digit_separators))
   5100 		goto delimited_string;
   5101 
   5102 	      /* Possibly a number punctuator.  */
   5103 	      if (!ISIDNUM (*do_peek_next (pos, limit)))
   5104 		goto delimited_string;
   5105 
   5106 	      goto quote_peek;
   5107 
   5108 	    case '\"':
   5109 	      if (!CPP_OPTION (pfile, rliterals))
   5110 		goto delimited_string;
   5111 
   5112 	    quote_peek:
   5113 	      {
   5114 		/* For ' see if it's a number punctuator
   5115 		   \.?<digit>(<digit>|<identifier-nondigit>
   5116 		   |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
   5117 		/* For " see if it's a raw string
   5118 		   {U,L,u,u8}R.  This includes CPP_NUMBER detection,
   5119 		   because that could be 0e+R.  */
   5120 		const unsigned char *peek = pos - 1;
   5121 		bool quote_first = c == '"';
   5122 		bool quote_eight = false;
   5123 		bool maybe_number_start = false;
   5124 		bool want_number = false;
   5125 
   5126 		while ((peek = do_peek_prev (peek, lwm)))
   5127 		  {
   5128 		    unsigned char p = *peek;
   5129 		    if (quote_first)
   5130 		      {
   5131 			if (!raw)
   5132 			  {
   5133 			    if (p != 'R')
   5134 			      break;
   5135 			    raw = true;
   5136 			    continue;
   5137 			  }
   5138 
   5139 			quote_first = false;
   5140 			if (p == 'L' || p == 'U' || p == 'u')
   5141 			  ;
   5142 			else if (p == '8')
   5143 			  quote_eight = true;
   5144 			else
   5145 			  goto second_raw;
   5146 		      }
   5147 		    else if (quote_eight)
   5148 		      {
   5149 			if (p != 'u')
   5150 			  {
   5151 			    raw = false;
   5152 			    break;
   5153 			  }
   5154 			quote_eight = false;
   5155 		      }
   5156 		    else if (c == '"')
   5157 		      {
   5158 		      second_raw:;
   5159 			if (!want_number && ISIDNUM (p))
   5160 			  {
   5161 			    raw = false;
   5162 			    break;
   5163 			  }
   5164 		      }
   5165 
   5166 		    if (ISDIGIT (p))
   5167 		      maybe_number_start = true;
   5168 		    else if (p == '.')
   5169 		      want_number = true;
   5170 		    else if (ISIDNUM (p))
   5171 		      maybe_number_start = false;
   5172 		    else if (p == '+' || p == '-')
   5173 		      {
   5174 			if (const unsigned char *peek_prev
   5175 			    = do_peek_prev (peek, lwm))
   5176 			  {
   5177 			    p = *peek_prev;
   5178 			    if (p == 'e' || p == 'E'
   5179 				|| p == 'p' || p == 'P')
   5180 			      {
   5181 				want_number = true;
   5182 				maybe_number_start = false;
   5183 			      }
   5184 			    else
   5185 			      break;
   5186 			  }
   5187 			else
   5188 			  break;
   5189 		      }
   5190 		    else if (p == '\'' || p == '\"')
   5191 		      {
   5192 			/* If this is lwm, this must be the end of a
   5193 			   previous string.  So this is a trailing
   5194 			   literal type, (a) if those are allowed,
   5195 			     and (b) maybe_start is false.  Otherwise
   5196 			     this must be a CPP_NUMBER because we've
   5197 			     met another ', and we'd have checked that
   5198 			     in its own right.  */
   5199 			if (peek == lwm && CPP_OPTION (pfile, uliterals))
   5200 			  {
   5201 			    if  (!maybe_number_start && !want_number)
   5202 			      /* Must be a literal type.  */
   5203 			      raw = false;
   5204 			  }
   5205 			else if (p == '\''
   5206 				 && CPP_OPTION (pfile, digit_separators))
   5207 			  maybe_number_start = true;
   5208 			break;
   5209 		      }
   5210 		    else if (c == '\'')
   5211 		      break;
   5212 		    else if (!quote_first && !quote_eight)
   5213 		      break;
   5214 		  }
   5215 
   5216 		if (maybe_number_start)
   5217 		  {
   5218 		    if (c == '\'')
   5219 		      /* A CPP NUMBER.  */
   5220 		      goto dflt;
   5221 		    raw = false;
   5222 		  }
   5223 
   5224 		goto delimited_string;
   5225 	      }
   5226 
   5227 	    delimited_string:
   5228 	      {
   5229 		/* (Possibly raw) string or char literal.  */
   5230 		unsigned char end = c;
   5231 		int delim_len = -1;
   5232 		const unsigned char *delim = NULL;
   5233 		location_t sloc = linemap_position_for_column (pfile->line_table,
   5234 							       pos - line_start);
   5235 		int esc = 0;
   5236 
   5237 		if (raw)
   5238 		  {
   5239 		    /* There can be no line breaks in the delimiter.  */
   5240 		    delim = pos;
   5241 		    for (delim_len = 0; (c = *pos++) != '('; delim_len++)
   5242 		      {
   5243 			if (delim_len == 16)
   5244 			  {
   5245 			    cpp_error_with_line (pfile, CPP_DL_ERROR,
   5246 						 sloc, 0,
   5247 						 "raw string delimiter"
   5248 						 " longer than %d"
   5249 						 " characters",
   5250 						 delim_len);
   5251 			    raw = false;
   5252 			    pos = delim;
   5253 			    break;
   5254 			  }
   5255 			if (strchr (") \\\t\v\f\n", c))
   5256 			  {
   5257 			    cpp_error_with_line (pfile, CPP_DL_ERROR,
   5258 						 sloc, 0,
   5259 						 "invalid character '%c'"
   5260 						 " in raw string"
   5261 						 " delimiter", c);
   5262 			    raw = false;
   5263 			    pos = delim;
   5264 			    break;
   5265 			  }
   5266 			if (pos >= limit)
   5267 			  goto bad_string;
   5268 		      }
   5269 		  }
   5270 
   5271 		while (pos < limit)
   5272 		  {
   5273 		    char c = *pos++;
   5274 		    switch (c)
   5275 		      {
   5276 		      case '\\':
   5277 			if (!raw)
   5278 			  esc++;
   5279 			break;
   5280 
   5281 		      case '\r':
   5282 			if (*pos == '\n')
   5283 			  pos++;
   5284 			/* FALLTHROUGH  */
   5285 
   5286 		      case '\n':
   5287 			{
   5288 			  CPP_INCREMENT_LINE (pfile, 0);
   5289 			  line_count++;
   5290 			  line_start = pos;
   5291 			}
   5292 			if (esc)
   5293 			  esc--;
   5294 			break;
   5295 
   5296 		      case ')':
   5297 			if (raw
   5298 			    && pos + delim_len + 1 < limit
   5299 			    && pos[delim_len] == end
   5300 			    && !memcmp (delim, pos, delim_len))
   5301 			  {
   5302 			    pos += delim_len + 1;
   5303 			    raw = false;
   5304 			    goto done_string;
   5305 			  }
   5306 			break;
   5307 
   5308 		      default:
   5309 			if (!raw && !(esc & 1) && c == end)
   5310 			  goto done_string;
   5311 			esc = 0;
   5312 			break;
   5313 		      }
   5314 		  }
   5315 	      bad_string:
   5316 		cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
   5317 				     "unterminated literal");
   5318 
   5319 	      done_string:
   5320 		raw = false;
   5321 		lwm = pos - 1;
   5322 	      }
   5323 	      goto dflt;
   5324 
   5325 	    case '_':
   5326 	    case 'e':
   5327 	    case 'i':
   5328 	    case 'm':
   5329 	      if (bol && module_p && !pfile->state.skipping
   5330 		  && do_peek_module (pfile, c, pos, limit))
   5331 		{
   5332 		  /* We've seen the start of a module control line.
   5333 		     Start up the tokenizer.  */
   5334 		  pos--; /* Backup over the first character.  */
   5335 
   5336 		  /* Backup over whitespace to start of line.  */
   5337 		  while (pos > line_start
   5338 			 && (pos[-1] == ' ' || pos[-1] == '\t'))
   5339 		    pos--;
   5340 
   5341 		  if (pos > base)
   5342 		    cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
   5343 
   5344 		  /* Prep things for directive handling. */
   5345 		  buffer->next_line = pos;
   5346 		  buffer->need_line = true;
   5347 
   5348 		  /* Now get tokens until the PRAGMA_EOL.  */
   5349 		  do
   5350 		    {
   5351 		      location_t spelling;
   5352 		      const cpp_token *tok
   5353 			= cpp_get_token_with_location (pfile, &spelling);
   5354 
   5355 		      gcc_assert (pfile->state.in_deferred_pragma
   5356 				  || tok->type == CPP_PRAGMA_EOL);
   5357 		      cb (pfile, CPP_DO_token, data, tok, spelling);
   5358 		    }
   5359 		  while (pfile->state.in_deferred_pragma);
   5360 
   5361 		  if (pfile->buffer->next_line < pfile->buffer->rlimit)
   5362 		    cb (pfile, CPP_DO_location, data,
   5363 			pfile->line_table->highest_line);
   5364 
   5365 		  pfile->mi_valid = false;
   5366 		  goto restart;
   5367 		}
   5368 	      goto dflt;
   5369 
   5370 	    default:
   5371 	    dflt:
   5372 	      bol = false;
   5373 	      pfile->mi_valid = false;
   5374 	      break;
   5375 	    }
   5376 	}
   5377 
   5378       if (buffer->rlimit > base && !pfile->state.skipping)
   5379 	{
   5380 	  const unsigned char *limit = buffer->rlimit;
   5381 	  /* If the file was not newline terminated, add rlimit, which is
   5382 	     guaranteed to point to a newline, to the end of our range.  */
   5383 	  if (limit[-1] != '\n')
   5384 	    {
   5385 	      limit++;
   5386 	      CPP_INCREMENT_LINE (pfile, 0);
   5387 	      line_count++;
   5388 	    }
   5389 	  cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
   5390 	}
   5391 
   5392       _cpp_pop_buffer (pfile);
   5393     }
   5394   while (pfile->buffer);
   5395 }
   5396