lex.cc revision 1.3 1 1.1 mrg /* CPP Library - lexical analysis.
2 1.1 mrg Copyright (C) 2000-2022 Free Software Foundation, Inc.
3 1.1 mrg Contributed by Per Bothner, 1994-95.
4 1.1 mrg Based on CCCP program by Paul Rubin, June 1986
5 1.1 mrg Adapted to ANSI C, Richard Stallman, Jan 1987
6 1.1 mrg Broken out to separate file, Zack Weinberg, Mar 2000
7 1.1 mrg
8 1.1 mrg This program is free software; you can redistribute it and/or modify it
9 1.1 mrg under the terms of the GNU General Public License as published by the
10 1.1 mrg Free Software Foundation; either version 3, or (at your option) any
11 1.1 mrg later version.
12 1.1 mrg
13 1.1 mrg This program is distributed in the hope that it will be useful,
14 1.1 mrg but WITHOUT ANY WARRANTY; without even the implied warranty of
15 1.1 mrg MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 1.1 mrg GNU General Public License for more details.
17 1.1 mrg
18 1.1 mrg You should have received a copy of the GNU General Public License
19 1.1 mrg along with this program; see the file COPYING3. If not see
20 1.1 mrg <http://www.gnu.org/licenses/>. */
21 1.1 mrg
22 1.1 mrg #include "config.h"
23 1.1 mrg #include "system.h"
24 1.1 mrg #include "cpplib.h"
25 1.1 mrg #include "internal.h"
26 1.1 mrg
/* Category recorded for each token type in the spelling table.  Every
   OP entry of the table is a SPELL_OPERATOR; a TK entry names one of
   these categories explicitly.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT = 1,
  SPELL_LITERAL = 2,
  SPELL_NONE = 3
};

/* One row of the token spelling table: the spelling category of a
   token type together with the text recorded for it.  */
struct token_spelling
{
  enum spell_type category;
  const unsigned char *name;
};
40 1.1 mrg
41 1.1 mrg static const unsigned char *const digraph_spellings[] =
42 1.1 mrg { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
43 1.1 mrg
44 1.1 mrg #define OP(e, s) { SPELL_OPERATOR, UC s },
45 1.1 mrg #define TK(e, s) { SPELL_ ## s, UC #e },
46 1.1 mrg static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 1.1 mrg #undef OP
48 1.1 mrg #undef TK
49 1.1 mrg
50 1.1 mrg #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 1.1 mrg #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52 1.1 mrg
53 1.1 mrg static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 1.1 mrg static int skip_line_comment (cpp_reader *);
55 1.1 mrg static void skip_whitespace (cpp_reader *, cppchar_t);
56 1.1 mrg static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 1.1 mrg static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 1.1 mrg static void store_comment (cpp_reader *, cpp_token *);
59 1.1 mrg static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60 1.1 mrg unsigned int, enum cpp_ttype);
61 1.1 mrg static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62 1.1 mrg static int name_p (cpp_reader *, const cpp_string *);
63 1.1 mrg static tokenrun *next_tokenrun (tokenrun *);
64 1.1 mrg
65 1.1 mrg static _cpp_buff *new_buff (size_t);
66 1.1 mrg
67 1.1 mrg
68 1.1 mrg /* Utility routine:
69 1.1 mrg
70 1.1 mrg Compares, the token TOKEN to the NUL-terminated string STRING.
71 1.1 mrg TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
72 1.1 mrg int
73 1.1 mrg cpp_ideq (const cpp_token *token, const char *string)
74 1.1 mrg {
75 1.1 mrg if (token->type != CPP_NAME)
76 1.1 mrg return 0;
77 1.1 mrg
78 1.1 mrg return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
79 1.1 mrg }
80 1.1 mrg
81 1.1 mrg /* Record a note TYPE at byte POS into the current cleaned logical
82 1.1 mrg line. */
83 1.1 mrg static void
84 1.1 mrg add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85 1.1 mrg {
86 1.1 mrg if (buffer->notes_used == buffer->notes_cap)
87 1.1 mrg {
88 1.1 mrg buffer->notes_cap = buffer->notes_cap * 2 + 200;
89 1.1 mrg buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90 1.1 mrg buffer->notes_cap);
91 1.1 mrg }
92 1.1 mrg
93 1.1 mrg buffer->notes[buffer->notes_used].pos = pos;
94 1.1 mrg buffer->notes[buffer->notes_used].type = type;
95 1.1 mrg buffer->notes_used++;
96 1.1 mrg }
97 1.1 mrg
98 1.1 mrg
99 1.1 mrg /* Fast path to find line special characters using optimized character
101 1.1 mrg scanning algorithms. Anything complicated falls back to the slow
102 1.1 mrg path below. Since this loop is very hot it's worth doing these kinds
103 1.1 mrg of optimizations.
104 1.1 mrg
105 1.1 mrg One of the paths through the ifdefs should provide
106 1.1 mrg
107 1.1 mrg const uchar *search_line_fast (const uchar *s, const uchar *end);
108 1.1 mrg
109 1.1 mrg Between S and END, search for \n, \r, \\, ?. Return a pointer to
110 1.1 mrg the found character.
111 1.1 mrg
112 1.1 mrg Note that the last character of the buffer is *always* a newline,
113 1.1 mrg as forced by _cpp_convert_input. This fact can be used to avoid
114 1.1 mrg explicitly looking for the end of the buffer. */
115 1.1 mrg
/* Configure only gives us an ifdef test.  Supply a zero default so the
   macro can be used as an ordinary expression below.  */
#if !defined (WORDS_BIGENDIAN)
# define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's
   nothing in <stdint.h> that gives us that.  For most hosts this is
   unsigned long, but MS decided on an LLP64 model.  Thankfully when
   building with GCC we can ask for the "real" word size via the mode
   attribute.  */
#if defined (__GNUC__)
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below only expects word sizes of 4 or 8 bytes.  The array
   bound is 1 when that holds and -1 otherwise, so any other size is
   rejected at compile time.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 4 || sizeof (word_type) == 8) ? 1 : -1];
135 1.1 mrg
136 1.1 mrg /* Return X with the first N bytes forced to values that won't match one
137 1.1 mrg of the interesting characters. Note that NUL is not interesting. */
138 1.1 mrg
139 1.1 mrg static inline word_type
140 1.1 mrg acc_char_mask_misalign (word_type val, unsigned int n)
141 1.1 mrg {
142 1.1 mrg word_type mask = -1;
143 1.1 mrg if (WORDS_BIGENDIAN)
144 1.1 mrg mask >>= n * 8;
145 1.1 mrg else
146 1.1 mrg mask <<= n * 8;
147 1.1 mrg return val & mask;
148 1.1 mrg }
149 1.1 mrg
150 1.1 mrg /* Return X replicated to all byte positions within WORD_TYPE. */
151 1.1 mrg
152 1.1 mrg static inline word_type
153 1.1 mrg acc_char_replicate (uchar x)
154 1.1 mrg {
155 1.1 mrg word_type ret;
156 1.1 mrg
157 1.1 mrg ret = (x << 24) | (x << 16) | (x << 8) | x;
158 1.1 mrg if (sizeof(word_type) == 8)
159 1.1 mrg ret = (ret << 16 << 16) | ret;
160 1.1 mrg return ret;
161 1.1 mrg }
162 1.1 mrg
163 1.1 mrg /* Return non-zero if some byte of VAL is (probably) C. */
164 1.1 mrg
165 1.1 mrg static inline word_type
166 1.1 mrg acc_char_cmp (word_type val, word_type c)
167 1.1 mrg {
168 1.1 mrg #if defined(__GNUC__) && defined(__alpha__)
169 1.1 mrg /* We can get exact results using a compare-bytes instruction.
170 1.1 mrg Get (val == c) via (0 >= (val ^ c)). */
171 1.1 mrg return __builtin_alpha_cmpbge (0, val ^ c);
172 1.1 mrg #else
173 1.1 mrg word_type magic = 0x7efefefeU;
174 1.1 mrg if (sizeof(word_type) == 8)
175 1.1 mrg magic = (magic << 16 << 16) | 0xfefefefeU;
176 1.1 mrg magic |= 1;
177 1.1 mrg
178 1.1 mrg val ^= c;
179 1.1 mrg return ((val + magic) ^ ~val) & ~magic;
180 1.1 mrg #endif
181 1.1 mrg }
182 1.1 mrg
183 1.1 mrg /* Given the result of acc_char_cmp is non-zero, return the index of
184 1.1 mrg the found character. If this was a false positive, return -1. */
185 1.1 mrg
186 1.1 mrg static inline int
187 1.1 mrg acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
188 1.1 mrg word_type val ATTRIBUTE_UNUSED)
189 1.1 mrg {
190 1.1 mrg #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
191 1.1 mrg /* The cmpbge instruction sets *bits* of the result corresponding to
192 1.1 mrg matches in the bytes with no false positives. */
193 1.1 mrg return __builtin_ctzl (cmp);
194 1.1 mrg #else
195 1.1 mrg unsigned int i;
196 1.1 mrg
197 1.1 mrg /* ??? It would be nice to force unrolling here,
198 1.1 mrg and have all of these constants folded. */
199 1.1 mrg for (i = 0; i < sizeof(word_type); ++i)
200 1.1 mrg {
201 1.1 mrg uchar c;
202 1.1 mrg if (WORDS_BIGENDIAN)
203 1.1 mrg c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
204 1.1 mrg else
205 1.1 mrg c = (val >> i * 8) & 0xff;
206 1.1 mrg
207 1.1 mrg if (c == '\n' || c == '\r' || c == '\\' || c == '?')
208 1.1 mrg return i;
209 1.1 mrg }
210 1.1 mrg
211 1.1 mrg return -1;
212 1.1 mrg #endif
213 1.1 mrg }
214 1.1 mrg
215 1.1 mrg /* A version of the fast scanner using bit fiddling techniques.
216 1.1 mrg
217 1.1 mrg For 32-bit words, one would normally perform 16 comparisons and
218 1.1 mrg 16 branches. With this algorithm one performs 24 arithmetic
219 1.1 mrg operations and one branch. Whether this is faster with a 32-bit
220 1.1 mrg word size is going to be somewhat system dependent.
221 1.1 mrg
222 1.1 mrg For 64-bit words, we eliminate twice the number of comparisons
223 1.1 mrg and branches without increasing the number of arithmetic operations.
224 1.1 mrg It's almost certainly going to be a win with 64-bit word size. */
225 1.1 mrg
226 1.1 mrg static const uchar * search_line_acc_char (const uchar *, const uchar *)
227 1.1 mrg ATTRIBUTE_UNUSED;
228 1.1 mrg
229 1.1 mrg static const uchar *
230 1.1 mrg search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
231 1.1 mrg {
232 1.1 mrg const word_type repl_nl = acc_char_replicate ('\n');
233 1.1 mrg const word_type repl_cr = acc_char_replicate ('\r');
234 1.1 mrg const word_type repl_bs = acc_char_replicate ('\\');
235 1.1 mrg const word_type repl_qm = acc_char_replicate ('?');
236 1.1 mrg
237 1.1 mrg unsigned int misalign;
238 1.1 mrg const word_type *p;
239 1.1 mrg word_type val, t;
240 1.1 mrg
241 1.1 mrg /* Align the buffer. Mask out any bytes from before the beginning. */
242 1.1 mrg p = (word_type *)((uintptr_t)s & -sizeof(word_type));
243 1.1 mrg val = *p;
244 1.1 mrg misalign = (uintptr_t)s & (sizeof(word_type) - 1);
245 1.1 mrg if (misalign)
246 1.1 mrg val = acc_char_mask_misalign (val, misalign);
247 1.1 mrg
248 1.1 mrg /* Main loop. */
249 1.1 mrg while (1)
250 1.1 mrg {
251 1.1 mrg t = acc_char_cmp (val, repl_nl);
252 1.1 mrg t |= acc_char_cmp (val, repl_cr);
253 1.1 mrg t |= acc_char_cmp (val, repl_bs);
254 1.1 mrg t |= acc_char_cmp (val, repl_qm);
255 1.1 mrg
256 1.1 mrg if (__builtin_expect (t != 0, 0))
257 1.1 mrg {
258 1.1 mrg int i = acc_char_index (t, val);
259 1.1 mrg if (i >= 0)
260 1.1 mrg return (const uchar *)p + i;
261 1.1 mrg }
262 1.1 mrg
263 1.1 mrg val = *++p;
264 1.1 mrg }
265 1.1 mrg }
266 1.1 mrg
267 1.1 mrg /* Disable on Solaris 2/x86 until the following problem can be properly
268 1.1 mrg autoconfed:
269 1.1 mrg
270 1.1 mrg The Solaris 10+ assembler tags objects with the instruction set
271 1.1 mrg extensions used, so SSE4.2 executables cannot run on machines that
272 1.1 mrg don't support that extension. */
273 1.1 mrg
274 1.1 mrg #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
275 1.1 mrg
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Row 0 is \n, row 1 is \r, row 2 is \\
   and row 3 is ?, each repeated 16 times.  */
#define REPL_ROW(c) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL_ROW ('\n'),
  REPL_ROW ('\r'),
  REPL_ROW ('\\'),
  REPL_ROW ('?'),
};
#undef REPL_ROW
290 1.1 mrg
291 1.1 mrg /* A version of the fast scanner using MMX vectorized byte compare insns.
292 1.1 mrg
293 1.1 mrg This uses the PMOVMSKB instruction which was introduced with "MMX2",
294 1.1 mrg which was packaged into SSE1; it is also present in the AMD MMX
295 1.1 mrg extension. Mark the function as using "sse" so that we emit a real
296 1.1 mrg "emms" instruction, rather than the 3dNOW "femms" instruction. */
297 1.1 mrg
298 1.1 mrg static const uchar *
299 1.1 mrg #ifndef __SSE__
300 1.1 mrg __attribute__((__target__("sse")))
301 1.1 mrg #endif
302 1.1 mrg search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
303 1.1 mrg {
304 1.1 mrg typedef char v8qi __attribute__ ((__vector_size__ (8)));
305 1.1 mrg typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
306 1.1 mrg
307 1.1 mrg const v8qi repl_nl = *(const v8qi *)repl_chars[0];
308 1.1 mrg const v8qi repl_cr = *(const v8qi *)repl_chars[1];
309 1.1 mrg const v8qi repl_bs = *(const v8qi *)repl_chars[2];
310 1.1 mrg const v8qi repl_qm = *(const v8qi *)repl_chars[3];
311 1.1 mrg
312 1.1 mrg unsigned int misalign, found, mask;
313 1.1 mrg const v8qi *p;
314 1.1 mrg v8qi data, t, c;
315 1.1 mrg
316 1.1 mrg /* Align the source pointer. While MMX doesn't generate unaligned data
317 1.1 mrg faults, this allows us to safely scan to the end of the buffer without
318 1.1 mrg reading beyond the end of the last page. */
319 1.1 mrg misalign = (uintptr_t)s & 7;
320 1.1 mrg p = (const v8qi *)((uintptr_t)s & -8);
321 1.1 mrg data = *p;
322 1.1 mrg
323 1.1 mrg /* Create a mask for the bytes that are valid within the first
324 1.1 mrg 16-byte block. The Idea here is that the AND with the mask
325 1.1 mrg within the loop is "free", since we need some AND or TEST
326 1.1 mrg insn in order to set the flags for the branch anyway. */
327 1.1 mrg mask = -1u << misalign;
328 1.1 mrg
329 1.1 mrg /* Main loop processing 8 bytes at a time. */
330 1.1 mrg goto start;
331 1.1 mrg do
332 1.1 mrg {
333 1.1 mrg data = *++p;
334 1.1 mrg mask = -1;
335 1.1 mrg
336 1.1 mrg start:
337 1.1 mrg t = __builtin_ia32_pcmpeqb(data, repl_nl);
338 1.1 mrg c = __builtin_ia32_pcmpeqb(data, repl_cr);
339 1.1 mrg t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
340 1.1 mrg c = __builtin_ia32_pcmpeqb(data, repl_bs);
341 1.1 mrg t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
342 1.1 mrg c = __builtin_ia32_pcmpeqb(data, repl_qm);
343 1.1 mrg t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
344 1.1 mrg found = __builtin_ia32_pmovmskb (t);
345 1.1 mrg found &= mask;
346 1.1 mrg }
347 1.1 mrg while (!found);
348 1.1 mrg
349 1.1 mrg __builtin_ia32_emms ();
350 1.1 mrg
351 1.1 mrg /* FOUND contains 1 in bits for which we matched a relevant
352 1.1 mrg character. Conversion to the byte index is trivial. */
353 1.1 mrg found = __builtin_ctz(found);
354 1.1 mrg return (const uchar *)p + found;
355 1.1 mrg }
356 1.1 mrg
357 1.1 mrg /* A version of the fast scanner using SSE2 vectorized byte compare insns. */
358 1.1 mrg
359 1.1 mrg static const uchar *
360 1.1 mrg #ifndef __SSE2__
361 1.1 mrg __attribute__((__target__("sse2")))
362 1.1 mrg #endif
363 1.1 mrg search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
364 1.1 mrg {
365 1.1 mrg typedef char v16qi __attribute__ ((__vector_size__ (16)));
366 1.1 mrg
367 1.1 mrg const v16qi repl_nl = *(const v16qi *)repl_chars[0];
368 1.1 mrg const v16qi repl_cr = *(const v16qi *)repl_chars[1];
369 1.1 mrg const v16qi repl_bs = *(const v16qi *)repl_chars[2];
370 1.1 mrg const v16qi repl_qm = *(const v16qi *)repl_chars[3];
371 1.1 mrg
372 1.1 mrg unsigned int misalign, found, mask;
373 1.1 mrg const v16qi *p;
374 1.1 mrg v16qi data, t;
375 1.1 mrg
376 1.1 mrg /* Align the source pointer. */
377 1.1 mrg misalign = (uintptr_t)s & 15;
378 1.1 mrg p = (const v16qi *)((uintptr_t)s & -16);
379 1.1 mrg data = *p;
380 1.1 mrg
381 1.1 mrg /* Create a mask for the bytes that are valid within the first
382 1.1 mrg 16-byte block. The Idea here is that the AND with the mask
383 1.1 mrg within the loop is "free", since we need some AND or TEST
384 1.1 mrg insn in order to set the flags for the branch anyway. */
385 1.1 mrg mask = -1u << misalign;
386 1.1 mrg
387 1.1 mrg /* Main loop processing 16 bytes at a time. */
388 1.1 mrg goto start;
389 1.1 mrg do
390 1.1 mrg {
391 1.1 mrg data = *++p;
392 1.1 mrg mask = -1;
393 1.1 mrg
394 1.1 mrg start:
395 1.1 mrg t = data == repl_nl;
396 1.1 mrg t |= data == repl_cr;
397 1.1 mrg t |= data == repl_bs;
398 1.1 mrg t |= data == repl_qm;
399 1.1 mrg found = __builtin_ia32_pmovmskb128 (t);
400 1.1 mrg found &= mask;
401 1.1 mrg }
402 1.1 mrg while (!found);
403 1.1 mrg
404 1.1 mrg /* FOUND contains 1 in bits for which we matched a relevant
405 1.1 mrg character. Conversion to the byte index is trivial. */
406 1.1 mrg found = __builtin_ctz(found);
407 1.1 mrg return (const uchar *)p + found;
408 1.1 mrg }
409 1.1 mrg
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Returns a pointer to the first \n, \r, ? or \\ at or after S.  END
   is only used to decide whether an unaligned 16-byte read at S could
   run past the end of the buffer.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  static const v16qi search = { '\n', '\r', '?', '\\' };

  const uintptr_t addr = (uintptr_t)s;
  uintptr_t index;

  /* Unaligned input gets one unaligned probe before the main loop.  */
  if (addr & 15)
    {
      v16qi head;

      /* There are less than 16 bytes left in the buffer, and less
         than 16 bytes left on the page.  Reading 16 bytes at this
         point might generate a spurious page fault.  Defer to the
         SSE2 implementation, which already handles alignment.  */
      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((addr & 0xfff) > 0xff0, 0))
        return search_line_sse2 (s, end);

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      head = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, head, 16, 0);

      if (__builtin_expect (index < 16, 0))
        return s + index;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((addr + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  for (;; s += 16)
    {
      char carry;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  The loop
         ends as soon as the carry-flag output is non-zero.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(carry)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      if (carry)
        break;
    }
#else
  /* The assembly loop adds 16 before each probe, so step back first.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm ( ".balign 16\n"
          "0: add $16, %1\n"
          " %vpcmpestri\t$0, (%1), %2\n"
          " jnc 0b"
          : "=&c"(index), "+r"(s)
          : "x"(search), "a"(4), "d"(16));
#endif

  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
490 1.1 mrg
491 1.1 mrg /* Check the CPU capabilities. */
492 1.1 mrg
493 1.1 mrg #include "../gcc/config/i386/cpuid.h"
494 1.1 mrg
495 1.1 mrg typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
496 1.1 mrg static search_line_fast_type search_line_fast;
497 1.1 mrg
498 1.1 mrg #define HAVE_init_vectorized_lexer 1
499 1.1 mrg static inline void
500 1.1 mrg init_vectorized_lexer (void)
501 1.1 mrg {
502 1.1 mrg unsigned dummy, ecx = 0, edx = 0;
503 1.1 mrg search_line_fast_type impl = search_line_acc_char;
504 1.1 mrg int minimum = 0;
505 1.1 mrg
506 1.1 mrg #if defined(__SSE4_2__)
507 1.1 mrg minimum = 3;
508 1.1 mrg #elif defined(__SSE2__)
509 1.1 mrg minimum = 2;
510 1.1 mrg #elif defined(__SSE__)
511 1.1 mrg minimum = 1;
512 1.1 mrg #endif
513 1.1 mrg
514 1.1 mrg if (minimum == 3)
515 1.1 mrg impl = search_line_sse42;
516 1.1 mrg else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
517 1.1 mrg {
518 1.1 mrg if (minimum == 3 || (ecx & bit_SSE4_2))
519 1.1 mrg impl = search_line_sse42;
520 1.1 mrg else if (minimum == 2 || (edx & bit_SSE2))
521 1.1 mrg impl = search_line_sse2;
522 1.1 mrg else if (minimum == 1 || (edx & bit_SSE))
523 1.1 mrg impl = search_line_mmx;
524 1.1 mrg }
525 1.1 mrg else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
526 1.1 mrg {
527 1.1 mrg if (minimum == 1
528 1.1 mrg || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
529 1.1 mrg impl = search_line_mmx;
530 1.1 mrg }
531 1.1 mrg
532 1.1 mrg search_line_fast = impl;
533 1.1 mrg }
534 1.1 mrg
535 1.1 mrg #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
536 1.1 mrg
537 1.1 mrg /* A vection of the fast scanner using AltiVec vectorized byte compares
538 1.1 mrg and VSX unaligned loads (when VSX is available). This is otherwise
539 1.1 mrg the same as the AltiVec version. */
540 1.1 mrg
541 1.1 mrg ATTRIBUTE_NO_SANITIZE_UNDEFINED
542 1.1 mrg static const uchar *
543 1.1 mrg search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
544 1.1 mrg {
545 1.1 mrg typedef __attribute__((altivec(vector))) unsigned char vc;
546 1.1 mrg
547 1.1 mrg const vc repl_nl = {
548 1.1 mrg '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
549 1.1 mrg '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
550 1.1 mrg };
551 1.1 mrg const vc repl_cr = {
552 1.1 mrg '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
553 1.1 mrg '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
554 1.1 mrg };
555 1.1 mrg const vc repl_bs = {
556 1.1 mrg '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
557 1.1 mrg '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
558 1.1 mrg };
559 1.1 mrg const vc repl_qm = {
560 1.1 mrg '?', '?', '?', '?', '?', '?', '?', '?',
561 1.1 mrg '?', '?', '?', '?', '?', '?', '?', '?',
562 1.1 mrg };
563 1.1 mrg const vc zero = { 0 };
564 1.1 mrg
565 1.1 mrg vc data, t;
566 1.1 mrg
567 1.1 mrg /* Main loop processing 16 bytes at a time. */
568 1.1 mrg do
569 1.1 mrg {
570 1.1 mrg vc m_nl, m_cr, m_bs, m_qm;
571 1.1 mrg
572 1.1 mrg data = __builtin_vec_vsx_ld (0, s);
573 1.1 mrg s += 16;
574 1.1 mrg
575 1.1 mrg m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
576 1.1 mrg m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
577 1.1 mrg m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
578 1.1 mrg m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
579 1.1 mrg t = (m_nl | m_cr) | (m_bs | m_qm);
580 1.1 mrg
581 1.1 mrg /* T now contains 0xff in bytes for which we matched one of the relevant
582 1.1 mrg characters. We want to exit the loop if any byte in T is non-zero.
583 1.1 mrg Below is the expansion of vec_any_ne(t, zero). */
584 1.1 mrg }
585 1.1 mrg while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
586 1.1 mrg
587 1.1 mrg /* Restore s to to point to the 16 bytes we just processed. */
588 1.1 mrg s -= 16;
589 1.1 mrg
590 1.1 mrg {
591 1.1 mrg #define N (sizeof(vc) / sizeof(long))
592 1.1 mrg
593 1.1 mrg union {
594 1.1 mrg vc v;
595 1.1 mrg /* Statically assert that N is 2 or 4. */
596 1.1 mrg unsigned long l[(N == 2 || N == 4) ? N : -1];
597 1.1 mrg } u;
598 1.1 mrg unsigned long l, i = 0;
599 1.1 mrg
600 1.1 mrg u.v = t;
601 1.1 mrg
602 1.1 mrg /* Find the first word of T that is non-zero. */
603 1.1 mrg switch (N)
604 1.1 mrg {
605 1.1 mrg case 4:
606 1.1 mrg l = u.l[i++];
607 1.1 mrg if (l != 0)
608 1.1 mrg break;
609 1.1 mrg s += sizeof(unsigned long);
610 1.1 mrg l = u.l[i++];
611 1.1 mrg if (l != 0)
612 1.1 mrg break;
613 1.1 mrg s += sizeof(unsigned long);
614 1.1 mrg /* FALLTHRU */
615 1.1 mrg case 2:
616 1.1 mrg l = u.l[i++];
617 1.1 mrg if (l != 0)
618 1.1 mrg break;
619 1.1 mrg s += sizeof(unsigned long);
620 1.1 mrg l = u.l[i];
621 1.1 mrg }
622 1.1 mrg
623 1.1 mrg /* L now contains 0xff in bytes for which we matched one of the
624 1.1 mrg relevant characters. We can find the byte index by finding
625 1.1 mrg its bit index and dividing by 8. */
626 1.1 mrg #ifdef __BIG_ENDIAN__
627 1.1 mrg l = __builtin_clzl(l) >> 3;
628 1.1 mrg #else
629 1.1 mrg l = __builtin_ctzl(l) >> 3;
630 1.1 mrg #endif
631 1.1 mrg return s + l;
632 1.1 mrg
633 1.1 mrg #undef N
634 1.1 mrg }
635 1.1 mrg }
636 1.1 mrg
637 1.1 mrg #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
638 1.1 mrg
639 1.1 mrg /* A vection of the fast scanner using AltiVec vectorized byte compares.
640 1.1 mrg This cannot be used for little endian because vec_lvsl/lvsr are
641 1.1 mrg deprecated for little endian and the code won't work properly. */
642 1.1 mrg /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
643 1.1 mrg so we can't compile this function without -maltivec on the command line
644 1.1 mrg (or implied by some other switch). */
645 1.1 mrg
646 1.1 mrg static const uchar *
647 1.1 mrg search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
648 1.1 mrg {
649 1.1 mrg typedef __attribute__((altivec(vector))) unsigned char vc;
650 1.1 mrg
651 1.1 mrg const vc repl_nl = {
652 1.1 mrg '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
653 1.1 mrg '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
654 1.1 mrg };
655 1.1 mrg const vc repl_cr = {
656 1.1 mrg '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
657 1.1 mrg '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
658 1.1 mrg };
659 1.1 mrg const vc repl_bs = {
660 1.1 mrg '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
661 1.1 mrg '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
662 1.1 mrg };
663 1.1 mrg const vc repl_qm = {
664 1.1 mrg '?', '?', '?', '?', '?', '?', '?', '?',
665 1.1 mrg '?', '?', '?', '?', '?', '?', '?', '?',
666 1.1 mrg };
667 1.1 mrg const vc ones = {
668 1.1 mrg -1, -1, -1, -1, -1, -1, -1, -1,
669 1.1 mrg -1, -1, -1, -1, -1, -1, -1, -1,
670 1.1 mrg };
671 1.1 mrg const vc zero = { 0 };
672 1.1 mrg
673 1.1 mrg vc data, mask, t;
674 1.1 mrg
675 1.1 mrg /* Altivec loads automatically mask addresses with -16. This lets us
676 1.1 mrg issue the first load as early as possible. */
677 1.1 mrg data = __builtin_vec_ld(0, (const vc *)s);
678 1.1 mrg
679 1.1 mrg /* Discard bytes before the beginning of the buffer. Do this by
680 1.1 mrg beginning with all ones and shifting in zeros according to the
681 1.1 mrg mis-alignment. The LVSR instruction pulls the exact shift we
682 1.1 mrg want from the address. */
683 1.1 mrg mask = __builtin_vec_lvsr(0, s);
684 1.1 mrg mask = __builtin_vec_perm(zero, ones, mask);
685 1.1 mrg data &= mask;
686 1.1 mrg
687 1.1 mrg /* While altivec loads mask addresses, we still need to align S so
688 1.1 mrg that the offset we compute at the end is correct. */
689 1.1 mrg s = (const uchar *)((uintptr_t)s & -16);
690 1.1 mrg
691 1.1 mrg /* Main loop processing 16 bytes at a time. */
692 1.1 mrg goto start;
693 1.1 mrg do
694 1.1 mrg {
695 1.1 mrg vc m_nl, m_cr, m_bs, m_qm;
696 1.1 mrg
697 1.1 mrg s += 16;
698 1.1 mrg data = __builtin_vec_ld(0, (const vc *)s);
699 1.1 mrg
700 1.1 mrg start:
701 1.1 mrg m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
702 1.1 mrg m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
703 1.1 mrg m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
704 1.1 mrg m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
705 1.1 mrg t = (m_nl | m_cr) | (m_bs | m_qm);
706 1.1 mrg
707 1.1 mrg /* T now contains 0xff in bytes for which we matched one of the relevant
708 1.1 mrg characters. We want to exit the loop if any byte in T is non-zero.
709 1.1 mrg Below is the expansion of vec_any_ne(t, zero). */
710 1.1 mrg }
711 1.1 mrg while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
712 1.1 mrg
713 1.1 mrg {
714 1.1 mrg #define N (sizeof(vc) / sizeof(long))
715 1.1 mrg
716 1.1 mrg union {
717 1.1 mrg vc v;
718 1.1 mrg /* Statically assert that N is 2 or 4. */
719 1.1 mrg unsigned long l[(N == 2 || N == 4) ? N : -1];
720 1.1 mrg } u;
721 1.1 mrg unsigned long l, i = 0;
722 1.1 mrg
723 1.1 mrg u.v = t;
724 1.1 mrg
725 1.1 mrg /* Find the first word of T that is non-zero. */
726 1.1 mrg switch (N)
727 1.1 mrg {
728 1.1 mrg case 4:
729 1.1 mrg l = u.l[i++];
730 1.1 mrg if (l != 0)
731 1.1 mrg break;
732 1.1 mrg s += sizeof(unsigned long);
733 1.1 mrg l = u.l[i++];
734 1.1 mrg if (l != 0)
735 1.1 mrg break;
736 1.1 mrg s += sizeof(unsigned long);
737 1.1 mrg /* FALLTHROUGH */
738 1.1 mrg case 2:
739 1.1 mrg l = u.l[i++];
740 1.1 mrg if (l != 0)
741 1.1 mrg break;
742 1.1 mrg s += sizeof(unsigned long);
743 1.1 mrg l = u.l[i];
744 1.1 mrg }
745 1.1 mrg
746 1.1 mrg /* L now contains 0xff in bytes for which we matched one of the
747 1.1 mrg relevant characters. We can find the byte index by finding
748 1.1 mrg its bit index and dividing by 8. */
749 1.1 mrg l = __builtin_clzl(l) >> 3;
750 1.1 mrg return s + l;
751 1.1 mrg
752 1.1 mrg #undef N
753 1.1 mrg }
754 1.1 mrg }
755 1.1 mrg
756 1.1 mrg #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
757 1.1 mrg #include "arm_neon.h"
758 1.1 mrg
759 1.1 mrg /* This doesn't have to be the exact page size, but no system may use
760 1.1 mrg a size smaller than this. ARMv8 requires a minimum page size of
761 1.1 mrg 4k. The impact of being conservative here is a small number of
762 1.1 mrg cases will take the slightly slower entry path into the main
763 1.1 mrg loop. */
764 1.1 mrg
765 1.1 mrg #define AARCH64_MIN_PAGE_SIZE 4096
766 1.1 mrg
/* AArch64 Advanced SIMD implementation of the line search.  Starting at S,
   examine 16 bytes per step and return a pointer to the first byte that is
   a newline, carriage return, backslash or question mark.  END is unused:
   the loops below stop only when one of those four bytes is found.
   NOTE(review): this presumably relies on the caller guaranteeing that such
   a byte is present before the end of readable memory -- confirm against
   the buffer setup code.  */
767 1.1 mrg static const uchar *
768 1.1 mrg search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
769 1.1 mrg {
770 1.1 mrg const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
771 1.1 mrg const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
772 1.1 mrg const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
773 1.1 mrg const uint8x16_t repl_qm = vdupq_n_u8 ('?');
/* Bit weights 0x01 .. 0x80, repeated in each 64-bit half.  ANDing the
   all-ones/all-zeros compare result with this gives every matching byte a
   distinct bit within its half.  */
774 1.1 mrg const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
775 1.1 mrg
/* Per-lane shift applied after the pairwise widening add: the 16-bit sums
   belonging to one half are moved up by 8 bits so that the final horizontal
   add yields a 16-bit mask.  Which half is shifted depends on endianness.  */
776 1.1 mrg #ifdef __ARM_BIG_ENDIAN
777 1.1 mrg const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
778 1.1 mrg #else
779 1.1 mrg const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
780 1.1 mrg #endif
781 1.1 mrg
782 1.1 mrg unsigned int found;
783 1.1 mrg const uint8_t *p;
784 1.1 mrg uint8x16_t data;
785 1.1 mrg uint8x16_t t;
786 1.1 mrg uint16x8_t m;
787 1.1 mrg uint8x16_t u, v, w;
788 1.1 mrg
789 1.1 mrg /* Align the source pointer. */
790 1.1 mrg p = (const uint8_t *)((uintptr_t)s & -16);
791 1.1 mrg
792 1.1 mrg /* Assuming random string start positions, with a 4k page size we'll take
793 1.1 mrg the slow path about 0.37% of the time. */
794 1.1 mrg if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
795 1.1 mrg - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
796 1.1 mrg < 16, 0))
797 1.1 mrg {
798 1.1 mrg /* Slow path: the string starts near a possible page boundary. */
799 1.1 mrg uint32_t misalign, mask;
800 1.1 mrg
801 1.1 mrg misalign = (uintptr_t)s & 15;
/* Load the aligned block that contains S rather than loading from S itself;
   MASK has the low MISALIGN bits cleared so that matches in bytes located
   before S are discarded from FOUND.  */
802 1.1 mrg mask = (-1u << misalign) & 0xffff;
803 1.1 mrg data = vld1q_u8 (p);
804 1.1 mrg t = vceqq_u8 (data, repl_nl);
805 1.1 mrg u = vceqq_u8 (data, repl_cr);
806 1.1 mrg v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
807 1.1 mrg w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
808 1.1 mrg t = vorrq_u8 (v, w);
809 1.1 mrg t = vandq_u8 (t, xmask);
810 1.1 mrg m = vpaddlq_u8 (t);
811 1.1 mrg m = vshlq_u16 (m, shift);
812 1.1 mrg found = vaddvq_u16 (m);
813 1.1 mrg found &= mask;
814 1.1 mrg if (found)
815 1.1 mrg return (const uchar*)p + __builtin_ctz (found);
816 1.1 mrg }
817 1.1 mrg else
818 1.1 mrg {
/* Common case: load 16 bytes directly from S (possibly unaligned) and test
   whether any compare lane is set.  P still holds the aligned-down address
   here, which the return expression after DONE relies on.  */
819 1.1 mrg data = vld1q_u8 ((const uint8_t *) s);
820 1.1 mrg t = vceqq_u8 (data, repl_nl);
821 1.1 mrg u = vceqq_u8 (data, repl_cr);
822 1.1 mrg v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
823 1.1 mrg w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
824 1.1 mrg t = vorrq_u8 (v, w);
825 1.1 mrg if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
826 1.1 mrg goto done;
827 1.1 mrg }
828 1.1 mrg
/* Main loop: step P to the next 16-byte aligned block and repeat the four
   compares until some lane matches.  */
829 1.1 mrg do
830 1.1 mrg {
831 1.1 mrg p += 16;
832 1.1 mrg data = vld1q_u8 (p);
833 1.1 mrg t = vceqq_u8 (data, repl_nl);
834 1.1 mrg u = vceqq_u8 (data, repl_cr);
835 1.1 mrg v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
836 1.1 mrg w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
837 1.1 mrg t = vorrq_u8 (v, w);
838 1.1 mrg } while (!vpaddd_u64 ((uint64x2_t)t));
839 1.1 mrg
840 1.1 mrg done:
841 1.1 mrg /* Now that we've found the terminating substring, work out precisely where
842 1.1 mrg we need to stop. */
843 1.1 mrg t = vandq_u8 (t, xmask);
844 1.1 mrg m = vpaddlq_u8 (t);
845 1.1 mrg m = vshlq_u16 (m, shift);
846 1.1 mrg found = vaddvq_u16 (m);
/* If P is still below S, the match came from the initial load at S, so the
   bit index is an offset from S; otherwise it is an offset from the aligned
   block at P.  */
847 1.1 mrg return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
848 1.1 mrg + __builtin_ctz (found));
849 1.1 mrg }
850 1.1 mrg
851 1.1 mrg #elif defined (__ARM_NEON)
852 1.1 mrg #include "arm_neon.h"
853 1.1 mrg
/* 32-bit ARM NEON implementation of the line search.  Starting at S,
   examine one 16-byte aligned block per iteration and return a pointer to
   the first byte that is a newline, carriage return, backslash or question
   mark.  END is unused: the loop stops only when such a byte is found.  */
854 1.1 mrg static const uchar *
855 1.1 mrg search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
856 1.1 mrg {
857 1.1 mrg const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
858 1.1 mrg const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
859 1.1 mrg const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
860 1.1 mrg const uint8x16_t repl_qm = vdupq_n_u8 ('?');
/* Bit weights 0x01 .. 0x80, repeated in each 64-bit half; ANDed with the
   compare result so each matching byte contributes a distinct bit.  */
861 1.1 mrg const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
862 1.1 mrg
863 1.1 mrg unsigned int misalign, found, mask;
864 1.1 mrg const uint8_t *p;
865 1.1 mrg uint8x16_t data;
866 1.1 mrg
867 1.1 mrg /* Align the source pointer. */
868 1.1 mrg misalign = (uintptr_t)s & 15;
869 1.1 mrg p = (const uint8_t *)((uintptr_t)s & -16);
870 1.1 mrg data = vld1q_u8 (p);
871 1.1 mrg
872 1.1 mrg /* Create a mask for the bytes that are valid within the first
873 1.1 mrg 16-byte block. The Idea here is that the AND with the mask
874 1.1 mrg within the loop is "free", since we need some AND or TEST
875 1.1 mrg insn in order to set the flags for the branch anyway. */
876 1.1 mrg mask = (-1u << misalign) & 0xffff;
877 1.1 mrg
878 1.1 mrg /* Main loop, processing 16 bytes at a time. */
/* Enter the loop body at START so that the first block, already loaded
   above, is tested with the partial MASK and without being reloaded.  */
879 1.1 mrg goto start;
880 1.1 mrg
881 1.1 mrg do
882 1.1 mrg {
883 1.1 mrg uint8x8_t l;
884 1.1 mrg uint16x4_t m;
885 1.1 mrg uint32x2_t n;
886 1.1 mrg uint8x16_t t, u, v, w;
887 1.1 mrg
888 1.1 mrg p += 16;
889 1.1 mrg data = vld1q_u8 (p);
/* Every byte of the second and later blocks lies at or after S, so all 16
   bits of FOUND are kept.  */
890 1.1 mrg mask = 0xffff;
891 1.1 mrg
892 1.1 mrg start:
893 1.1 mrg t = vceqq_u8 (data, repl_nl);
894 1.1 mrg u = vceqq_u8 (data, repl_cr);
895 1.1 mrg v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
896 1.1 mrg w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
897 1.1 mrg t = vandq_u8 (vorrq_u8 (v, w), xmask);
/* Reduce the per-byte bits to one value per 8-byte half using three
   pairwise adds, leaving each half's sum in its own 32-bit lane of N.  */
898 1.1 mrg l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
899 1.1 mrg m = vpaddl_u8 (l);
900 1.1 mrg n = vpaddl_u16 (m);
901 1.1 mrg
/* Viewing N as a single 64-bit value, OR in a copy shifted right by 24 and
   take lane 0: this merges the two per-half sums into one value, which is
   then restricted by MASK.  */
902 1.1 mrg found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
903 1.1 mrg vshr_n_u64 ((uint64x1_t) n, 24)), 0);
904 1.1 mrg found &= mask;
905 1.1 mrg }
906 1.1 mrg while (!found);
907 1.1 mrg
908 1.1 mrg /* FOUND contains 1 in bits for which we matched a relevant
909 1.1 mrg character. Conversion to the byte index is trivial. */
910 1.1 mrg found = __builtin_ctz (found);
911 1.1 mrg return (const uchar *)p + found;
912 1.1 mrg }
913 1.1 mrg
914 1.1 mrg #else
915 1.1 mrg
916 1.1 mrg /* We only have one accelerated alternative. Use a direct call so that
917 1.1 mrg we encourage inlining. */
918 1.1 mrg
/* In this fallback branch of the conditional, search_line_fast is not a
   function of its own but a macro alias for search_line_acc_char, so every
   use of search_line_fast below expands to a call of that function.  */
919 1.1 mrg #define search_line_fast search_line_acc_char
920 1.1 mrg
921 1.1 mrg #endif
922 1.1 mrg
/* Lexer start-up hook.  When the build defines
   HAVE_init_vectorized_lexer, run init_vectorized_lexer; otherwise
   there is nothing to do.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
932 1.1 mrg
933 1.1 mrg /* Returns with a logical line that contains no escaped newlines or
934 1.1 mrg trigraphs. This is a time-critical inner loop. */
/* The line is cleaned in place inside the buffer of PFILE.  On return the
   byte at the end of the cleaned line has been overwritten with a newline,
   a sentinel line note has been appended, and buffer->next_line points one
   past the last source byte consumed.  Escaped newlines and trigraphs are
   recorded as line notes; when they have to be removed, the remainder of
   the line is copied down over them (the slow path).  */
935 1.1 mrg void
936 1.1 mrg _cpp_clean_line (cpp_reader *pfile)
937 1.1 mrg {
938 1.1 mrg cpp_buffer *buffer;
939 1.1 mrg const uchar *s;
940 1.1 mrg uchar c, *d, *p;
941 1.1 mrg
/* Reset the note list and start the new line where the previous one
   ended.  */
942 1.1 mrg buffer = pfile->buffer;
943 1.1 mrg buffer->cur_note = buffer->notes_used = 0;
944 1.1 mrg buffer->cur = buffer->line_base = buffer->next_line;
945 1.1 mrg buffer->need_line = false;
946 1.1 mrg s = buffer->next_line;
947 1.1 mrg
948 1.1 mrg if (!buffer->from_stage3)
949 1.1 mrg {
950 1.1 mrg const uchar *pbackslash = NULL;
951 1.1 mrg
952 1.1 mrg /* Fast path. This is the common case of an un-escaped line with
953 1.1 mrg no trigraphs. The primary win here is by not writing any
954 1.1 mrg data back to memory until we have to. */
955 1.1 mrg while (1)
956 1.1 mrg {
957 1.1 mrg /* Perform an optimized search for \n, \r, \\, ?. */
958 1.1 mrg s = search_line_fast (s, buffer->rlimit);
959 1.1 mrg
960 1.1 mrg c = *s;
961 1.1 mrg if (c == '\\')
962 1.1 mrg {
963 1.1 mrg /* Record the location of the backslash and continue. */
964 1.1 mrg pbackslash = s++;
965 1.1 mrg }
966 1.1 mrg else if (__builtin_expect (c == '?', 0))
967 1.1 mrg {
968 1.1 mrg if (__builtin_expect (s[1] == '?', false)
969 1.1 mrg && _cpp_trigraph_map[s[2]])
970 1.1 mrg {
971 1.1 mrg /* Have a trigraph. We may or may not have to convert
972 1.1 mrg it. Add a line note regardless, for -Wtrigraphs. */
973 1.1 mrg add_line_note (buffer, s, s[2]);
974 1.1 mrg if (CPP_OPTION (pfile, trigraphs))
975 1.1 mrg {
976 1.1 mrg /* We do, and that means we have to switch to the
977 1.1 mrg slow path. */
/* The first byte of the trigraph is overwritten with its replacement and S
   is left on the last byte of the trigraph, so that the pre-increments in
   the slow path resume copying right after it.  */
978 1.1 mrg d = (uchar *) s;
979 1.1 mrg *d = _cpp_trigraph_map[s[2]];
980 1.1 mrg s += 2;
981 1.1 mrg goto slow_path;
982 1.1 mrg }
983 1.1 mrg }
984 1.1 mrg /* Not a trigraph. Continue on fast-path. */
985 1.1 mrg s++;
986 1.1 mrg }
987 1.1 mrg else
988 1.1 mrg break;
989 1.1 mrg }
990 1.1 mrg
991 1.1 mrg /* This must be \r or \n. We're either done, or we'll be forced
992 1.1 mrg to write back to the buffer and continue on the slow path. */
993 1.1 mrg d = (uchar *) s;
994 1.1 mrg
/* A terminator located exactly at rlimit ends the line with no further
   checks.  NOTE(review): presumably the buffer carries a terminator byte
   at rlimit -- confirm against the code that sets up the buffer.  */
995 1.1 mrg if (__builtin_expect (s == buffer->rlimit, false))
996 1.1 mrg goto done;
997 1.1 mrg
998 1.1 mrg /* DOS line ending? */
999 1.1 mrg if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1000 1.1 mrg {
1001 1.1 mrg s++;
1002 1.1 mrg if (s == buffer->rlimit)
1003 1.1 mrg goto done;
1004 1.1 mrg }
1005 1.1 mrg
1006 1.1 mrg if (__builtin_expect (pbackslash == NULL, true))
1007 1.1 mrg goto done;
1008 1.1 mrg
1009 1.1 mrg /* Check for escaped newline. */
/* Walk back over whitespace before the terminator; the newline is escaped
   only if the byte reached that way is the last backslash recorded on this
   line.  */
1010 1.1 mrg p = d;
1011 1.1 mrg while (is_nvspace (p[-1]))
1012 1.1 mrg p--;
1013 1.1 mrg if (p - 1 != pbackslash)
1014 1.1 mrg goto done;
1015 1.1 mrg
1016 1.1 mrg /* Have an escaped newline; process it and proceed to
1017 1.1 mrg the slow path. */
/* The note type is a space when whitespace separated the backslash from
   the newline, and a backslash otherwise.  D is set two bytes before P so
   that the pre-incremented store in the slow path lands on the backslash
   itself.  next_line is set to the backslash position and is used by the
   slow path as the lower bound of its backward whitespace scan.  */
1018 1.1 mrg add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1019 1.1 mrg d = p - 2;
1020 1.1 mrg buffer->next_line = p - 1;
1021 1.1 mrg
1022 1.1 mrg slow_path:
/* Copy the rest of the line byte by byte from S down to D, dropping further
   escaped newlines and, when enabled, replacing trigraphs.  */
1023 1.1 mrg while (1)
1024 1.1 mrg {
1025 1.1 mrg c = *++s;
1026 1.1 mrg *++d = c;
1027 1.1 mrg
1028 1.1 mrg if (c == '\n' || c == '\r')
1029 1.1 mrg {
1030 1.1 mrg /* Handle DOS line endings. */
1031 1.1 mrg if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1032 1.1 mrg s++;
1033 1.1 mrg if (s == buffer->rlimit)
1034 1.1 mrg break;
1035 1.1 mrg
1036 1.1 mrg /* Escaped? */
1037 1.1 mrg p = d;
1038 1.1 mrg while (p != buffer->next_line && is_nvspace (p[-1]))
1039 1.1 mrg p--;
1040 1.1 mrg if (p == buffer->next_line || p[-1] != '\\')
1041 1.1 mrg break;
1042 1.1 mrg
1043 1.1 mrg add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1044 1.1 mrg d = p - 2;
1045 1.1 mrg buffer->next_line = p - 1;
1046 1.1 mrg }
1047 1.1 mrg else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1048 1.1 mrg {
1049 1.1 mrg /* Add a note regardless, for the benefit of -Wtrigraphs. */
1050 1.1 mrg add_line_note (buffer, d, s[2]);
1051 1.1 mrg if (CPP_OPTION (pfile, trigraphs))
1052 1.1 mrg {
1053 1.1 mrg *d = _cpp_trigraph_map[s[2]];
1054 1.1 mrg s += 2;
1055 1.1 mrg }
1056 1.1 mrg }
1057 1.1 mrg }
1058 1.1 mrg }
1059 1.1 mrg else
1060 1.1 mrg {
/* from_stage3 buffers: only locate the line terminator; this branch does no
   trigraph or escaped-newline processing.  */
1061 1.1 mrg while (*s != '\n' && *s != '\r')
1062 1.1 mrg s++;
1063 1.1 mrg d = (uchar *) s;
1064 1.1 mrg
1065 1.1 mrg /* Handle DOS line endings. */
1066 1.1 mrg if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1067 1.1 mrg s++;
1068 1.1 mrg }
1069 1.1 mrg
1070 1.1 mrg done:
/* Whatever terminator was found, the cleaned line ends in a newline at D;
   the next line starts one past the last source byte consumed.  */
1071 1.1 mrg *d = '\n';
1072 1.1 mrg /* A sentinel note that should never be processed. */
1073 1.1 mrg add_line_note (buffer, d + 1, '\n');
1074 1.1 mrg buffer->next_line = s + 1;
1075 1.1 mrg }
1076 1.1 mrg
1077 1.1 mrg /* Return true if the trigraph indicated by NOTE should be warned
1078 1.1 mrg about in a comment. */
1079 1.1 mrg static bool
1080 1.1 mrg warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1081 1.1 mrg {
1082 1.1 mrg const uchar *p;
1083 1.1 mrg
1084 1.1 mrg /* Within comments we don't warn about trigraphs, unless the
1085 1.1 mrg trigraph forms an escaped newline, as that may change
1086 1.1 mrg behavior. */
1087 1.1 mrg if (note->type != '/')
1088 1.1 mrg return false;
1089 1.1 mrg
1090 1.1 mrg /* If -trigraphs, then this was an escaped newline iff the next note
1091 1.1 mrg is coincident. */
1092 1.1 mrg if (CPP_OPTION (pfile, trigraphs))
1093 1.1 mrg return note[1].pos == note->pos;
1094 1.1 mrg
1095 1.1 mrg /* Otherwise, see if this forms an escaped newline. */
1096 1.1 mrg p = note->pos + 3;
1097 1.1 mrg while (is_nvspace (*p))
1098 1.1 mrg p++;
1099 1.1 mrg
1100 1.1 mrg /* There might have been escaped newlines between the trigraph and the
1101 1.1 mrg newline we found. Hence the position test. */
1102 1.1 mrg return (*p == '\n' && p < note[1].pos);
1103 1.1 mrg }
1104 1.1 mrg
1105 1.1 mrg /* Process the notes created by add_line_note as far as the current
1106 1.1 mrg location. */
1107 1.1 mrg void
1108 1.1 mrg _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1109 1.1 mrg {
1110 1.1 mrg cpp_buffer *buffer = pfile->buffer;
1111 1.1 mrg
1112 1.1 mrg for (;;)
1113 1.1 mrg {
1114 1.1 mrg _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1115 1.1 mrg unsigned int col;
1116 1.1 mrg
1117 1.1 mrg if (note->pos > buffer->cur)
1118 1.1 mrg break;
1119 1.1 mrg
1120 1.1 mrg buffer->cur_note++;
1121 1.1 mrg col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1122 1.1 mrg
1123 1.1 mrg if (note->type == '\\' || note->type == ' ')
1124 1.1 mrg {
1125 1.1 mrg if (note->type == ' ' && !in_comment)
1126 1.1 mrg cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1127 1.1 mrg "backslash and newline separated by space");
1128 1.1 mrg
1129 1.1 mrg if (buffer->next_line > buffer->rlimit)
1130 1.1 mrg {
1131 1.1 mrg cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1132 1.1 mrg "backslash-newline at end of file");
1133 1.1 mrg /* Prevent "no newline at end of file" warning. */
1134 1.1 mrg buffer->next_line = buffer->rlimit;
1135 1.1 mrg }
1136 1.1 mrg
1137 1.1 mrg buffer->line_base = note->pos;
1138 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
1139 1.1 mrg }
1140 1.1 mrg else if (_cpp_trigraph_map[note->type])
1141 1.1 mrg {
1142 1.1 mrg if (CPP_OPTION (pfile, warn_trigraphs)
1143 1.1 mrg && (!in_comment || warn_in_comment (pfile, note)))
1144 1.1 mrg {
1145 1.1 mrg if (CPP_OPTION (pfile, trigraphs))
1146 1.1 mrg cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1147 1.1 mrg pfile->line_table->highest_line, col,
1148 1.1 mrg "trigraph ??%c converted to %c",
1149 1.1 mrg note->type,
1150 1.1 mrg (int) _cpp_trigraph_map[note->type]);
1151 1.1 mrg else
1152 1.1 mrg {
1153 1.1 mrg cpp_warning_with_line
1154 1.1 mrg (pfile, CPP_W_TRIGRAPHS,
1155 1.1 mrg pfile->line_table->highest_line, col,
1156 1.1 mrg "trigraph ??%c ignored, use -trigraphs to enable",
1157 1.1 mrg note->type);
1158 1.1 mrg }
1159 1.1 mrg }
1160 1.1 mrg }
1161 1.1 mrg else if (note->type == 0)
1162 1.1 mrg /* Already processed in lex_raw_string. */;
1163 1.1 mrg else
1164 1.1 mrg abort ();
1165 1.1 mrg }
1166 1.1 mrg }
1167 1.1 mrg
1168 1.1 mrg namespace bidi {
1169 1.1 mrg enum class kind {
1170 1.1 mrg NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1171 1.1 mrg };
1172 1.1 mrg
1173 1.1 mrg /* All the UTF-8 encodings of bidi characters start with E2. */
1174 1.1 mrg constexpr uchar utf8_start = 0xe2;
1175 1.1 mrg
1176 1.1 mrg struct context
1177 1.1 mrg {
1178 1.1 mrg context () {}
1179 1.1 mrg context (location_t loc, kind k, bool pdf, bool ucn)
1180 1.1 mrg : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1181 1.1 mrg {
1182 1.1 mrg }
1183 1.1 mrg
1184 1.1 mrg kind get_pop_kind () const
1185 1.1 mrg {
1186 1.1 mrg return m_pdf ? kind::PDF : kind::PDI;
1187 1.1 mrg }
1188 1.1 mrg bool ucn_p () const
1189 1.1 mrg {
1190 1.1 mrg return m_ucn;
1191 1.1 mrg }
1192 1.1 mrg
1193 1.1 mrg location_t m_loc;
1194 1.1 mrg kind m_kind;
1195 1.1 mrg unsigned m_pdf : 1;
1196 1.1 mrg unsigned m_ucn : 1;
1197 1.1 mrg };
1198 1.1 mrg
1199 1.1 mrg /* A vector holding currently open bidi contexts. We use a char for
1200 1.1 mrg each context, its LSB is 1 if it represents a PDF context, 0 if it
1201 1.1 mrg represents a PDI context. The next bit is 1 if this context was open
1202 1.1 mrg by a bidi character written as a UCN, and 0 when it was UTF-8. */
1203 1.1 mrg semi_embedded_vec <context, 16> vec;
1204 1.1 mrg
1205 1.1 mrg /* Close the whole comment/identifier/string literal/character constant
1206 1.1 mrg context. */
1207 1.1 mrg void on_close ()
1208 1.1 mrg {
1209 1.1 mrg vec.truncate (0);
1210 1.1 mrg }
1211 1.1 mrg
1212 1.1 mrg /* Pop the last element in the vector. */
1213 1.1 mrg void pop ()
1214 1.1 mrg {
1215 1.1 mrg unsigned int len = vec.count ();
1216 1.1 mrg gcc_checking_assert (len > 0);
1217 1.1 mrg vec.truncate (len - 1);
1218 1.1 mrg }
1219 1.1 mrg
1220 1.1 mrg /* Return the pop kind of the context of the Ith element. */
1221 1.1 mrg kind pop_kind_at (unsigned int i)
1222 1.1 mrg {
1223 1.1 mrg return vec[i].get_pop_kind ();
1224 1.1 mrg }
1225 1.1 mrg
1226 1.1 mrg /* Return the pop kind of the context that is currently opened. */
1227 1.1 mrg kind current_ctx ()
1228 1.1 mrg {
1229 1.1 mrg unsigned int len = vec.count ();
1230 1.1 mrg if (len == 0)
1231 1.1 mrg return kind::NONE;
1232 1.1 mrg return vec[len - 1].get_pop_kind ();
1233 1.1 mrg }
1234 1.1 mrg
1235 1.1 mrg /* Return true if the current context comes from a UCN origin, that is,
1236 1.1 mrg the bidi char which started this bidi context was written as a UCN. */
1237 1.1 mrg bool current_ctx_ucn_p ()
1238 1.1 mrg {
1239 1.1 mrg unsigned int len = vec.count ();
1240 1.1 mrg gcc_checking_assert (len > 0);
1241 1.1 mrg return vec[len - 1].m_ucn;
1242 1.1 mrg }
1243 1.1 mrg
1244 1.1 mrg location_t current_ctx_loc ()
1245 1.1 mrg {
1246 1.1 mrg unsigned int len = vec.count ();
1247 1.1 mrg gcc_checking_assert (len > 0);
1248 1.1 mrg return vec[len - 1].m_loc;
1249 1.1 mrg }
1250 1.1 mrg
1251 1.1 mrg /* We've read a bidi char, update the current vector as necessary.
1252 1.1 mrg LOC is only valid when K is not kind::NONE. */
1253 1.1 mrg void on_char (kind k, bool ucn_p, location_t loc)
1254 1.1 mrg {
1255 1.1 mrg switch (k)
1256 1.1 mrg {
1257 1.1 mrg case kind::LRE:
1258 1.1 mrg case kind::RLE:
1259 1.1 mrg case kind::LRO:
1260 1.1 mrg case kind::RLO:
1261 1.1 mrg vec.push (context (loc, k, true, ucn_p));
1262 1.1 mrg break;
1263 1.1 mrg case kind::LRI:
1264 1.1 mrg case kind::RLI:
1265 1.1 mrg case kind::FSI:
1266 1.1 mrg vec.push (context (loc, k, false, ucn_p));
1267 1.1 mrg break;
1268 1.1 mrg /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1269 1.1 mrg whose scope has not yet been terminated. */
1270 1.1 mrg case kind::PDF:
1271 1.1 mrg if (current_ctx () == kind::PDF)
1272 1.1 mrg pop ();
1273 1.1 mrg break;
1274 1.1 mrg /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1275 1.1 mrg scope has not yet been terminated, as well as the scopes of
1276 1.1 mrg any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1277 1.1 mrg yet been terminated. */
1278 1.1 mrg case kind::PDI:
1279 1.1 mrg for (int i = vec.count () - 1; i >= 0; --i)
1280 1.1 mrg if (pop_kind_at (i) == kind::PDI)
1281 1.1 mrg {
1282 1.1 mrg vec.truncate (i);
1283 1.1 mrg break;
1284 1.1 mrg }
1285 1.1 mrg break;
1286 1.1 mrg case kind::LTR:
1287 1.1 mrg case kind::RTL:
1288 1.1 mrg /* These aren't popped by a PDF/PDI. */
1289 1.1 mrg break;
1290 1.1 mrg ATTR_LIKELY case kind::NONE:
1291 1.1 mrg break;
1292 1.1 mrg default:
1293 1.1 mrg abort ();
1294 1.1 mrg }
1295 1.1 mrg }
1296 1.1 mrg
1297 1.1 mrg /* Return a descriptive string for K. */
1298 1.1 mrg const char *to_str (kind k)
1299 1.1 mrg {
1300 1.1 mrg switch (k)
1301 1.1 mrg {
1302 1.1 mrg case kind::LRE:
1303 1.1 mrg return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1304 1.1 mrg case kind::RLE:
1305 1.1 mrg return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1306 1.1 mrg case kind::LRO:
1307 1.1 mrg return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1308 1.1 mrg case kind::RLO:
1309 1.1 mrg return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1310 1.1 mrg case kind::LRI:
1311 1.1 mrg return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1312 1.1 mrg case kind::RLI:
1313 1.1 mrg return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1314 1.1 mrg case kind::FSI:
1315 1.1 mrg return "U+2068 (FIRST STRONG ISOLATE)";
1316 1.1 mrg case kind::PDF:
1317 1.1 mrg return "U+202C (POP DIRECTIONAL FORMATTING)";
1318 1.1 mrg case kind::PDI:
1319 1.1 mrg return "U+2069 (POP DIRECTIONAL ISOLATE)";
1320 1.1 mrg case kind::LTR:
1321 1.1 mrg return "U+200E (LEFT-TO-RIGHT MARK)";
1322 1.1 mrg case kind::RTL:
1323 1.1 mrg return "U+200F (RIGHT-TO-LEFT MARK)";
1324 1.1 mrg default:
1325 1.1 mrg abort ();
1326 1.1 mrg }
1327 1.1 mrg }
1328 1.1 mrg }
1329 1.1 mrg
1330 1.1 mrg /* Get location_t for the range of bytes [START, START + NUM_BYTES)
1331 1.1 mrg within the current line in FILE, with the caret at START. */
1332 1.1 mrg
1333 1.1 mrg static location_t
1334 1.1 mrg get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1335 1.1 mrg const unsigned char *const start,
1336 1.1 mrg size_t num_bytes)
1337 1.1 mrg {
1338 1.1 mrg gcc_checking_assert (num_bytes > 0);
1339 1.1 mrg
1340 1.1 mrg /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1341 1.1 mrg to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1342 1.1 mrg whereas linemap_position_for_column is 1-based. */
1343 1.1 mrg
1344 1.1 mrg /* Get 0-based offsets within the line. */
1345 1.1 mrg size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1346 1.1 mrg size_t end_offset = start_offset + num_bytes - 1;
1347 1.1 mrg
1348 1.1 mrg /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1349 1.1 mrg location_t start_loc = linemap_position_for_column (pfile->line_table,
1350 1.1 mrg start_offset + 1);
1351 1.1 mrg location_t end_loc = linemap_position_for_column (pfile->line_table,
1352 1.1 mrg end_offset + 1);
1353 1.1 mrg
1354 1.1 mrg if (start_loc == end_loc)
1355 1.1 mrg return start_loc;
1356 1.1 mrg
1357 1.1 mrg source_range src_range;
1358 1.1 mrg src_range.m_start = start_loc;
1359 1.1 mrg src_range.m_finish = end_loc;
1360 1.1 mrg location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1361 1.1 mrg start_loc,
1362 1.1 mrg src_range,
1363 1.1 mrg NULL);
1364 1.1 mrg return combined_loc;
1365 1.1 mrg }
1366 1.1 mrg
1367 1.1 mrg /* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1368 1.1 mrg
1369 1.1 mrg static bidi::kind
1370 1.1 mrg get_bidi_utf8_1 (const unsigned char *const p)
1371 1.1 mrg {
1372 1.1 mrg gcc_checking_assert (p[0] == bidi::utf8_start);
1373 1.1 mrg
1374 1.1 mrg if (p[1] == 0x80)
1375 1.1 mrg switch (p[2])
1376 1.1 mrg {
1377 1.1 mrg case 0xaa:
1378 1.1 mrg return bidi::kind::LRE;
1379 1.1 mrg case 0xab:
1380 1.1 mrg return bidi::kind::RLE;
1381 1.1 mrg case 0xac:
1382 1.1 mrg return bidi::kind::PDF;
1383 1.1 mrg case 0xad:
1384 1.1 mrg return bidi::kind::LRO;
1385 1.1 mrg case 0xae:
1386 1.1 mrg return bidi::kind::RLO;
1387 1.1 mrg case 0x8e:
1388 1.1 mrg return bidi::kind::LTR;
1389 1.1 mrg case 0x8f:
1390 1.1 mrg return bidi::kind::RTL;
1391 1.1 mrg default:
1392 1.1 mrg break;
1393 1.1 mrg }
1394 1.1 mrg else if (p[1] == 0x81)
1395 1.1 mrg switch (p[2])
1396 1.1 mrg {
1397 1.1 mrg case 0xa6:
1398 1.1 mrg return bidi::kind::LRI;
1399 1.1 mrg case 0xa7:
1400 1.1 mrg return bidi::kind::RLI;
1401 1.1 mrg case 0xa8:
1402 1.1 mrg return bidi::kind::FSI;
1403 1.1 mrg case 0xa9:
1404 1.1 mrg return bidi::kind::PDI;
1405 1.1 mrg default:
1406 1.1 mrg break;
1407 1.1 mrg }
1408 1.1 mrg
1409 1.1 mrg return bidi::kind::NONE;
1410 1.1 mrg }
1411 1.1 mrg
1412 1.1 mrg /* Parse a sequence of 3 bytes starting with P and return its bidi code.
1413 1.1 mrg If the kind is not NONE, write the location to *OUT.*/
1414 1.1 mrg
1415 1.1 mrg static bidi::kind
1416 1.1 mrg get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1417 1.1 mrg {
1418 1.1 mrg bidi::kind result = get_bidi_utf8_1 (p);
1419 1.1 mrg if (result != bidi::kind::NONE)
1420 1.1 mrg {
1421 1.1 mrg /* We have a sequence of 3 bytes starting at P. */
1422 1.1 mrg *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1423 1.1 mrg }
1424 1.1 mrg return result;
1425 1.1 mrg }
1426 1.1 mrg
1427 1.1 mrg /* Parse a UCN where P points just past \u or \U and return its bidi code. */
1428 1.1 mrg
1429 1.1 mrg static bidi::kind
1430 1.1 mrg get_bidi_ucn_1 (const unsigned char *p, bool is_U)
1431 1.1 mrg {
1432 1.1 mrg /* 6.4.3 Universal Character Names
1433 1.1 mrg \u hex-quad
1434 1.1 mrg \U hex-quad hex-quad
1435 1.1 mrg where \unnnn means \U0000nnnn. */
1436 1.1 mrg
1437 1.1 mrg if (is_U)
1438 1.1 mrg {
1439 1.1 mrg if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1440 1.1 mrg return bidi::kind::NONE;
1441 1.1 mrg /* Skip 4B so we can treat \u and \U the same below. */
1442 1.1 mrg p += 4;
1443 1.1 mrg }
1444 1.1 mrg
1445 1.1 mrg /* All code points we are looking for start with 20xx. */
1446 1.1 mrg if (p[0] != '2' || p[1] != '0')
1447 1.1 mrg return bidi::kind::NONE;
1448 1.1 mrg else if (p[2] == '2')
1449 1.1 mrg switch (p[3])
1450 1.1 mrg {
1451 1.1 mrg case 'a':
1452 1.1 mrg case 'A':
1453 1.1 mrg return bidi::kind::LRE;
1454 1.1 mrg case 'b':
1455 1.1 mrg case 'B':
1456 1.1 mrg return bidi::kind::RLE;
1457 1.1 mrg case 'c':
1458 1.1 mrg case 'C':
1459 1.1 mrg return bidi::kind::PDF;
1460 1.1 mrg case 'd':
1461 1.1 mrg case 'D':
1462 1.1 mrg return bidi::kind::LRO;
1463 1.1 mrg case 'e':
1464 1.1 mrg case 'E':
1465 1.1 mrg return bidi::kind::RLO;
1466 1.1 mrg default:
1467 1.1 mrg break;
1468 1.1 mrg }
1469 1.1 mrg else if (p[2] == '6')
1470 1.1 mrg switch (p[3])
1471 1.1 mrg {
1472 1.1 mrg case '6':
1473 1.1 mrg return bidi::kind::LRI;
1474 1.1 mrg case '7':
1475 1.1 mrg return bidi::kind::RLI;
1476 1.1 mrg case '8':
1477 1.1 mrg return bidi::kind::FSI;
1478 1.1 mrg case '9':
1479 1.1 mrg return bidi::kind::PDI;
1480 1.1 mrg default:
1481 1.1 mrg break;
1482 1.1 mrg }
1483 1.1 mrg else if (p[2] == '0')
1484 1.1 mrg switch (p[3])
1485 1.1 mrg {
1486 1.1 mrg case 'e':
1487 1.1 mrg case 'E':
1488 1.1 mrg return bidi::kind::LTR;
1489 1.1 mrg case 'f':
1490 1.1 mrg case 'F':
1491 1.1 mrg return bidi::kind::RTL;
1492 1.1 mrg default:
1493 1.1 mrg break;
1494 1.1 mrg }
1495 1.1 mrg
1496 1.1 mrg return bidi::kind::NONE;
1497 1.1 mrg }
1498 1.1 mrg
1499 1.1 mrg /* Parse a UCN where P points just past \u or \U and return its bidi code.
1500 1.1 mrg If the kind is not NONE, write the location to *OUT.*/
1501 1.1 mrg
1502 1.1 mrg static bidi::kind
1503 1.1 mrg get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1504 1.1 mrg location_t *out)
1505 1.1 mrg {
1506 1.1 mrg bidi::kind result = get_bidi_ucn_1 (p, is_U);
1507 1.1 mrg if (result != bidi::kind::NONE)
1508 1.1 mrg {
1509 1.1 mrg const unsigned char *start = p - 2;
1510 1.1 mrg size_t num_bytes = 2 + (is_U ? 8 : 4);
1511 1.1 mrg *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1512 1.1 mrg }
1513 1.1 mrg return result;
1514 1.1 mrg }
1515 1.1 mrg
1516 1.1 mrg /* Subclass of rich_location for reporting on unpaired UTF-8
1517 1.1 mrg bidirectional control character(s).
1518 1.1 mrg Escape the source lines on output, and show all unclosed
1519 1.1 mrg bidi context, labelling everything. */
1520 1.1 mrg
1521 1.1 mrg class unpaired_bidi_rich_location : public rich_location
1522 1.1 mrg {
1523 1.1 mrg public:
1524 1.1 mrg class custom_range_label : public range_label
1525 1.1 mrg {
1526 1.1 mrg public:
1527 1.1 mrg label_text get_text (unsigned range_idx) const FINAL OVERRIDE
1528 1.1 mrg {
1529 1.1 mrg /* range 0 is the primary location; each subsequent range i + 1
1530 1.1 mrg is for bidi::vec[i]. */
1531 1.1 mrg if (range_idx > 0)
1532 1.1 mrg {
1533 1.1 mrg const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1534 1.1 mrg return label_text::borrow (bidi::to_str (ctxt.m_kind));
1535 1.1 mrg }
1536 1.1 mrg else
1537 1.1 mrg return label_text::borrow (_("end of bidirectional context"));
1538 1.1 mrg }
1539 1.1 mrg };
1540 1.1 mrg
1541 1.1 mrg unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1542 1.1 mrg : rich_location (pfile->line_table, loc, &m_custom_label)
1543 1.1 mrg {
1544 1.1 mrg set_escape_on_output (true);
1545 1.1 mrg for (unsigned i = 0; i < bidi::vec.count (); i++)
1546 1.1 mrg add_range (bidi::vec[i].m_loc,
1547 1.1 mrg SHOW_RANGE_WITHOUT_CARET,
1548 1.1 mrg &m_custom_label);
1549 1.1 mrg }
1550 1.1 mrg
1551 1.1 mrg private:
1552 1.1 mrg custom_range_label m_custom_label;
1553 1.1 mrg };
1554 1.1 mrg
1555 1.1 mrg /* We're closing a bidi context, that is, we've encountered a newline,
1556 1.1 mrg are closing a C-style comment, or are at the end of a string literal,
1557 1.1 mrg character constant, or identifier. Warn if this context was not
1558 1.1 mrg properly terminated by a PDI or PDF. P points to the last character
1559 1.1 mrg in this context. */
1560 1.1 mrg
1561 1.1 mrg static void
1562 1.1 mrg maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1563 1.1 mrg {
1564 1.1 mrg const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1565 1.1 mrg if (bidi::vec.count () > 0
1566 1.1 mrg && (warn_bidi & bidirectional_unpaired
1567 1.1 mrg && (!bidi::current_ctx_ucn_p ()
1568 1.1 mrg || (warn_bidi & bidirectional_ucn))))
1569 1.1 mrg {
1570 1.1 mrg const location_t loc
1571 1.1 mrg = linemap_position_for_column (pfile->line_table,
1572 1.1 mrg CPP_BUF_COLUMN (pfile->buffer, p));
1573 1.1 mrg unpaired_bidi_rich_location rich_loc (pfile, loc);
1574 1.1 mrg /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1575 1.1 mrg forms of a diagnostic, so fake it for now. */
1576 1.1 mrg if (bidi::vec.count () > 1)
1577 1.1 mrg cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1578 1.1 mrg "unpaired UTF-8 bidirectional control characters "
1579 1.1 mrg "detected");
1580 1.1 mrg else
1581 1.1 mrg cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1582 1.1 mrg "unpaired UTF-8 bidirectional control character "
1583 1.1 mrg "detected");
1584 1.1 mrg }
1585 1.1 mrg /* We're done with this context. */
1586 1.1 mrg bidi::on_close ();
1587 1.1 mrg }
1588 1.1 mrg
1589 1.1 mrg /* We're at the beginning or in the middle of an identifier/comment/string
1590 1.1 mrg literal/character constant. Warn if we've encountered a bidi character.
1591 1.1 mrg KIND says which bidi control character it was; UCN_P is true iff this bidi
1592 1.1 mrg control character was written as a UCN. LOC is the location of the
1593 1.1 mrg character, but is only valid if KIND != bidi::kind::NONE. */
1594 1.1 mrg
1595 1.1 mrg static void
1596 1.1 mrg maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1597 1.1 mrg bool ucn_p, location_t loc)
1598 1.1 mrg {
1599 1.1 mrg if (__builtin_expect (kind == bidi::kind::NONE, 1))
1600 1.1 mrg return;
1601 1.1 mrg
1602 1.1 mrg const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1603 1.1 mrg
1604 1.1 mrg if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1605 1.1 mrg {
1606 1.1 mrg rich_location rich_loc (pfile->line_table, loc);
1607 1.1 mrg rich_loc.set_escape_on_output (true);
1608 1.1 mrg
1609 1.1 mrg /* It seems excessive to warn about a PDI/PDF that is closing
1610 1.1 mrg an opened context because we've already warned about the
1611 1.1 mrg opening character. Except warn when we have a UCN x UTF-8
1612 1.1 mrg mismatch, if UCN checking is enabled. */
1613 1.1 mrg if (kind == bidi::current_ctx ())
1614 1.1 mrg {
1615 1.1 mrg if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1616 1.1 mrg && bidi::current_ctx_ucn_p () != ucn_p)
1617 1.1 mrg {
1618 1.1 mrg rich_loc.add_range (bidi::current_ctx_loc ());
1619 1.1 mrg cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1620 1.1 mrg "UTF-8 vs UCN mismatch when closing "
1621 1.1 mrg "a context by \"%s\"", bidi::to_str (kind));
1622 1.1 mrg }
1623 1.1 mrg }
1624 1.1 mrg else if (warn_bidi & bidirectional_any
1625 1.1 mrg && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1626 1.1 mrg {
1627 1.1 mrg if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1628 1.1 mrg cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1629 1.1 mrg "\"%s\" is closing an unopened context",
1630 1.1 mrg bidi::to_str (kind));
1631 1.1 mrg else
1632 1.1 mrg cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1633 1.1 mrg "found problematic Unicode character \"%s\"",
1634 1.1 mrg bidi::to_str (kind));
1635 1.1 mrg }
1636 1.1 mrg }
1637 1.1 mrg /* We're done with this context. */
1638 1.1 mrg bidi::on_char (kind, ucn_p, loc);
1639 1.1 mrg }
1640 1.1 mrg
1641 1.1 mrg /* Skip a C-style block comment. We find the end of the comment by
1642 1.1 mrg seeing if an asterisk is before every '/' we encounter. Returns
1643 1.1 mrg nonzero if comment terminated by EOF, zero otherwise.
1644 1.1 mrg
1645 1.1 mrg Buffer->cur points to the initial asterisk of the comment. */
1646 1.1 mrg bool
1647 1.1 mrg _cpp_skip_block_comment (cpp_reader *pfile)
1648 1.1 mrg {
1649 1.1 mrg cpp_buffer *buffer = pfile->buffer;
1650 1.1 mrg const uchar *cur = buffer->cur;
1651 1.1 mrg uchar c;
1652 1.1 mrg const bool warn_bidi_p = pfile->warn_bidi_p ();
1653 1.1 mrg
1654 1.1 mrg cur++;
1655 1.1 mrg if (*cur == '/')
1656 1.1 mrg cur++;
1657 1.1 mrg
1658 1.1 mrg for (;;)
1659 1.1 mrg {
1660 1.1 mrg /* People like decorating comments with '*', so check for '/'
1661 1.1 mrg instead for efficiency. */
1662 1.1 mrg c = *cur++;
1663 1.1 mrg
1664 1.1 mrg if (c == '/')
1665 1.1 mrg {
1666 1.1 mrg if (cur[-2] == '*')
1667 1.1 mrg {
1668 1.1 mrg if (warn_bidi_p)
1669 1.1 mrg maybe_warn_bidi_on_close (pfile, cur);
1670 1.1 mrg break;
1671 1.1 mrg }
1672 1.1 mrg
1673 1.1 mrg /* Warn about potential nested comments, but not if the '/'
1674 1.1 mrg comes immediately before the true comment delimiter.
1675 1.1 mrg Don't bother to get it right across escaped newlines. */
1676 1.1 mrg if (CPP_OPTION (pfile, warn_comments)
1677 1.1 mrg && cur[0] == '*' && cur[1] != '/')
1678 1.1 mrg {
1679 1.1 mrg buffer->cur = cur;
1680 1.1 mrg cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1681 1.1 mrg pfile->line_table->highest_line,
1682 1.1 mrg CPP_BUF_COL (buffer),
1683 1.1 mrg "\"/*\" within comment");
1684 1.1 mrg }
1685 1.1 mrg }
1686 1.1 mrg else if (c == '\n')
1687 1.1 mrg {
1688 1.1 mrg unsigned int cols;
1689 1.1 mrg buffer->cur = cur - 1;
1690 1.1 mrg if (warn_bidi_p)
1691 1.1 mrg maybe_warn_bidi_on_close (pfile, cur);
1692 1.1 mrg _cpp_process_line_notes (pfile, true);
1693 1.1 mrg if (buffer->next_line >= buffer->rlimit)
1694 1.1 mrg return true;
1695 1.1 mrg _cpp_clean_line (pfile);
1696 1.1 mrg
1697 1.1 mrg cols = buffer->next_line - buffer->line_base;
1698 1.1 mrg CPP_INCREMENT_LINE (pfile, cols);
1699 1.1 mrg
1700 1.1 mrg cur = buffer->cur;
1701 1.1 mrg }
1702 1.1 mrg /* If this is a beginning of a UTF-8 encoding, it might be
1703 1.1 mrg a bidirectional control character. */
1704 1.1 mrg else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
1705 1.1 mrg {
1706 1.1 mrg location_t loc;
1707 1.1 mrg bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1708 1.1 mrg maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1709 1.1 mrg }
1710 1.1 mrg }
1711 1.1 mrg
1712 1.1 mrg buffer->cur = cur;
1713 1.1 mrg _cpp_process_line_notes (pfile, true);
1714 1.1 mrg return false;
1715 1.1 mrg }
1716 1.1 mrg
1717 1.1 mrg /* Skip a C++ line comment, leaving buffer->cur pointing to the
1718 1.1 mrg terminating newline. Handles escaped newlines. Returns nonzero
1719 1.1 mrg if a multiline comment. */
1720 1.1 mrg static int
1721 1.1 mrg skip_line_comment (cpp_reader *pfile)
1722 1.1 mrg {
1723 1.1 mrg cpp_buffer *buffer = pfile->buffer;
1724 1.1 mrg location_t orig_line = pfile->line_table->highest_line;
1725 1.1 mrg const bool warn_bidi_p = pfile->warn_bidi_p ();
1726 1.1 mrg
1727 1.1 mrg if (!warn_bidi_p)
1728 1.1 mrg while (*buffer->cur != '\n')
1729 1.1 mrg buffer->cur++;
1730 1.1 mrg else
1731 1.1 mrg {
1732 1.1 mrg while (*buffer->cur != '\n'
1733 1.1 mrg && *buffer->cur != bidi::utf8_start)
1734 1.1 mrg buffer->cur++;
1735 1.1 mrg if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1736 1.1 mrg {
1737 1.1 mrg while (*buffer->cur != '\n')
1738 1.1 mrg {
1739 1.1 mrg if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1740 1.1 mrg {
1741 1.1 mrg location_t loc;
1742 1.1 mrg bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1743 1.1 mrg maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1744 1.1 mrg }
1745 1.1 mrg buffer->cur++;
1746 1.1 mrg }
1747 1.1 mrg maybe_warn_bidi_on_close (pfile, buffer->cur);
1748 1.1 mrg }
1749 1.1 mrg }
1750 1.1 mrg
1751 1.1 mrg _cpp_process_line_notes (pfile, true);
1752 1.1 mrg return orig_line != pfile->line_table->highest_line;
1753 1.1 mrg }
1754 1.1 mrg
1755 1.1 mrg /* Skips whitespace, saving the next non-whitespace character. */
1756 1.1 mrg static void
1757 1.1 mrg skip_whitespace (cpp_reader *pfile, cppchar_t c)
1758 1.1 mrg {
1759 1.1 mrg cpp_buffer *buffer = pfile->buffer;
1760 1.1 mrg bool saw_NUL = false;
1761 1.1 mrg
1762 1.1 mrg do
1763 1.1 mrg {
1764 1.1 mrg /* Horizontal space always OK. */
1765 1.1 mrg if (c == ' ' || c == '\t')
1766 1.1 mrg ;
1767 1.1 mrg /* Just \f \v or \0 left. */
1768 1.1 mrg else if (c == '\0')
1769 1.1 mrg saw_NUL = true;
1770 1.1 mrg else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1771 1.1 mrg cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1772 1.1 mrg CPP_BUF_COL (buffer),
1773 1.1 mrg "%s in preprocessing directive",
1774 1.1 mrg c == '\f' ? "form feed" : "vertical tab");
1775 1.1 mrg
1776 1.1 mrg c = *buffer->cur++;
1777 1.1 mrg }
1778 1.1 mrg /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1779 1.1 mrg while (is_nvspace (c));
1780 1.1 mrg
1781 1.1 mrg if (saw_NUL)
1782 1.1 mrg {
1783 1.1 mrg encoding_rich_location rich_loc (pfile);
1784 1.1 mrg cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1785 1.1 mrg "null character(s) ignored");
1786 1.1 mrg }
1787 1.1 mrg
1788 1.1 mrg buffer->cur--;
1789 1.1 mrg }
1790 1.1 mrg
1791 1.1 mrg /* See if the characters of a number token are valid in a name (no
1792 1.1 mrg '.', '+' or '-'). */
1793 1.1 mrg static int
1794 1.1 mrg name_p (cpp_reader *pfile, const cpp_string *string)
1795 1.1 mrg {
1796 1.1 mrg unsigned int i;
1797 1.1 mrg
1798 1.1 mrg for (i = 0; i < string->len; i++)
1799 1.1 mrg if (!is_idchar (string->text[i]))
1800 1.1 mrg return 0;
1801 1.1 mrg
1802 1.1 mrg return 1;
1803 1.1 mrg }
1804 1.1 mrg
1805 1.1 mrg /* After parsing an identifier or other sequence, produce a warning about
1806 1.1 mrg sequences not in NFC/NFKC. */
1807 1.1 mrg static void
1808 1.1 mrg warn_about_normalization (cpp_reader *pfile,
1809 1.1 mrg const cpp_token *token,
1810 1.1 mrg const struct normalize_state *s)
1811 1.1 mrg {
1812 1.1 mrg if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1813 1.1 mrg && !pfile->state.skipping)
1814 1.1 mrg {
1815 1.1 mrg location_t loc = token->src_loc;
1816 1.1 mrg
1817 1.1 mrg /* If possible, create a location range for the token. */
1818 1.1 mrg if (loc >= RESERVED_LOCATION_COUNT
1819 1.1 mrg && token->type != CPP_EOF
1820 1.1 mrg /* There must be no line notes to process. */
1821 1.1 mrg && (!(pfile->buffer->cur
1822 1.1 mrg >= pfile->buffer->notes[pfile->buffer->cur_note].pos
1823 1.1 mrg && !pfile->overlaid_buffer)))
1824 1.1 mrg {
1825 1.1 mrg source_range tok_range;
1826 1.1 mrg tok_range.m_start = loc;
1827 1.1 mrg tok_range.m_finish
1828 1.1 mrg = linemap_position_for_column (pfile->line_table,
1829 1.1 mrg CPP_BUF_COLUMN (pfile->buffer,
1830 1.1 mrg pfile->buffer->cur));
1831 1.1 mrg loc = COMBINE_LOCATION_DATA (pfile->line_table,
1832 1.1 mrg loc, tok_range, NULL);
1833 1.1 mrg }
1834 1.1 mrg
1835 1.1 mrg encoding_rich_location rich_loc (pfile, loc);
1836 1.1 mrg
1837 1.1 mrg /* Make sure that the token is printed using UCNs, even
1838 1.1 mrg if we'd otherwise happily print UTF-8. */
1839 1.1 mrg unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1840 1.1 mrg size_t sz;
1841 1.1 mrg
1842 1.1 mrg sz = cpp_spell_token (pfile, token, buf, false) - buf;
1843 1.1 mrg if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1844 1.1 mrg cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1845 1.1 mrg "`%.*s' is not in NFKC", (int) sz, buf);
1846 1.1 mrg else if (CPP_OPTION (pfile, cplusplus))
1847 1.1 mrg cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1848 1.1 mrg "`%.*s' is not in NFC", (int) sz, buf);
1849 1.1 mrg else
1850 1.1 mrg cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1851 1.1 mrg "`%.*s' is not in NFC", (int) sz, buf);
1852 1.1 mrg free (buf);
1853 1.1 mrg }
1854 1.1 mrg }
1855 1.1 mrg
1856 1.1 mrg static const cppchar_t utf8_signifier = 0xC0;
1857 1.1 mrg
1858 1.1 mrg /* Returns TRUE if the sequence starting at buffer->cur is valid in
1859 1.1 mrg an identifier. FIRST is TRUE if this starts an identifier. */
1860 1.1 mrg
1861 1.1 mrg static bool
1862 1.1 mrg forms_identifier_p (cpp_reader *pfile, int first,
1863 1.1 mrg struct normalize_state *state)
1864 1.1 mrg {
1865 1.1 mrg cpp_buffer *buffer = pfile->buffer;
1866 1.1 mrg const bool warn_bidi_p = pfile->warn_bidi_p ();
1867 1.1 mrg
1868 1.1 mrg if (*buffer->cur == '$')
1869 1.1 mrg {
1870 1.1 mrg if (!CPP_OPTION (pfile, dollars_in_ident))
1871 1.1 mrg return false;
1872 1.1 mrg
1873 1.1 mrg buffer->cur++;
1874 1.1 mrg if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1875 1.1 mrg {
1876 1.1 mrg CPP_OPTION (pfile, warn_dollars) = 0;
1877 1.1 mrg cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1878 1.1 mrg }
1879 1.1 mrg
1880 1.1 mrg return true;
1881 1.1 mrg }
1882 1.1 mrg
1883 1.1 mrg /* Is this a syntactically valid UCN or a valid UTF-8 char? */
1884 1.1 mrg if (CPP_OPTION (pfile, extended_identifiers))
1885 1.1 mrg {
1886 1.1 mrg cppchar_t s;
1887 1.1 mrg if (*buffer->cur >= utf8_signifier)
1888 1.1 mrg {
1889 1.1 mrg if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
1890 1.1 mrg && warn_bidi_p)
1891 1.1 mrg {
1892 1.1 mrg location_t loc;
1893 1.1 mrg bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1894 1.1 mrg maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1895 1.1 mrg }
1896 1.1 mrg if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1897 1.1 mrg state, &s))
1898 1.1 mrg return true;
1899 1.1 mrg }
1900 1.1 mrg else if (*buffer->cur == '\\'
1901 1.1 mrg && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1902 1.1 mrg {
1903 1.1 mrg buffer->cur += 2;
1904 1.1 mrg if (warn_bidi_p)
1905 1.1 mrg {
1906 1.1 mrg location_t loc;
1907 1.1 mrg bidi::kind kind = get_bidi_ucn (pfile,
1908 1.1 mrg buffer->cur,
1909 1.1 mrg buffer->cur[-1] == 'U',
1910 1.1 mrg &loc);
1911 1.1 mrg maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
1912 1.1 mrg }
1913 1.1 mrg if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1914 1.1 mrg state, &s, NULL, NULL))
1915 1.1 mrg return true;
1916 1.1 mrg buffer->cur -= 2;
1917 1.1 mrg }
1918 1.1 mrg }
1919 1.1 mrg
1920 1.1 mrg return false;
1921 1.1 mrg }
1922 1.1 mrg
1923 1.1 mrg /* Helper function to issue error about improper __VA_OPT__ use. */
1924 1.1 mrg static void
1925 1.1 mrg maybe_va_opt_error (cpp_reader *pfile)
1926 1.1 mrg {
1927 1.1 mrg if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1928 1.1 mrg {
1929 1.1 mrg /* __VA_OPT__ should not be accepted at all, but allow it in
1930 1.1 mrg system headers. */
1931 1.1 mrg if (!_cpp_in_system_header (pfile))
1932 1.1 mrg cpp_error (pfile, CPP_DL_PEDWARN,
1933 1.1 mrg "__VA_OPT__ is not available until C++20");
1934 1.1 mrg }
1935 1.1 mrg else if (!pfile->state.va_args_ok)
1936 1.1 mrg {
1937 1.1 mrg /* __VA_OPT__ should only appear in the replacement list of a
1938 1.1 mrg variadic macro. */
1939 1.1 mrg cpp_error (pfile, CPP_DL_PEDWARN,
1940 1.1 mrg "__VA_OPT__ can only appear in the expansion"
1941 1.1 mrg " of a C++20 variadic macro");
1942 1.1 mrg }
1943 1.1 mrg }
1944 1.1 mrg
1945 1.1 mrg /* Helper function to get the cpp_hashnode of the identifier BASE. */
1946 1.1 mrg static cpp_hashnode *
1947 1.1 mrg lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1948 1.1 mrg {
1949 1.1 mrg cpp_hashnode *result;
1950 1.1 mrg const uchar *cur;
1951 1.1 mrg unsigned int len;
1952 1.1 mrg unsigned int hash = HT_HASHSTEP (0, *base);
1953 1.1 mrg
1954 1.1 mrg cur = base + 1;
1955 1.1 mrg while (ISIDNUM (*cur))
1956 1.1 mrg {
1957 1.1 mrg hash = HT_HASHSTEP (hash, *cur);
1958 1.1 mrg cur++;
1959 1.1 mrg }
1960 1.1 mrg len = cur - base;
1961 1.1 mrg hash = HT_HASHFINISH (hash, len);
1962 1.1 mrg result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1963 1.1 mrg base, len, hash, HT_ALLOC));
1964 1.1 mrg
1965 1.1 mrg /* Rarely, identifiers require diagnostics when lexed. */
1966 1.1 mrg if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1967 1.1 mrg && !pfile->state.skipping, 0))
1968 1.1 mrg {
1969 1.1 mrg /* It is allowed to poison the same identifier twice. */
1970 1.1 mrg if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1971 1.1 mrg cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1972 1.1 mrg NODE_NAME (result));
1973 1.1 mrg
1974 1.1 mrg /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1975 1.1 mrg replacement list of a variadic macro. */
1976 1.1 mrg if (result == pfile->spec_nodes.n__VA_ARGS__
1977 1.1 mrg && !pfile->state.va_args_ok)
1978 1.1 mrg {
1979 1.1 mrg if (CPP_OPTION (pfile, cplusplus))
1980 1.1 mrg cpp_error (pfile, CPP_DL_PEDWARN,
1981 1.1 mrg "__VA_ARGS__ can only appear in the expansion"
1982 1.1 mrg " of a C++11 variadic macro");
1983 1.1 mrg else
1984 1.1 mrg cpp_error (pfile, CPP_DL_PEDWARN,
1985 1.1 mrg "__VA_ARGS__ can only appear in the expansion"
1986 1.1 mrg " of a C99 variadic macro");
1987 1.1 mrg }
1988 1.1 mrg
1989 1.1 mrg if (result == pfile->spec_nodes.n__VA_OPT__)
1990 1.1 mrg maybe_va_opt_error (pfile);
1991 1.1 mrg
1992 1.1 mrg /* For -Wc++-compat, warn about use of C++ named operators. */
1993 1.1 mrg if (result->flags & NODE_WARN_OPERATOR)
1994 1.1 mrg cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1995 1.1 mrg "identifier \"%s\" is a special operator name in C++",
1996 1.1 mrg NODE_NAME (result));
1997 1.1 mrg }
1998 1.1 mrg
1999 1.1 mrg return result;
2000 1.1 mrg }
2001 1.1 mrg
2002 1.1 mrg /* Get the cpp_hashnode of an identifier specified by NAME in
2003 1.1 mrg the current cpp_reader object. If none is found, NULL is returned. */
2004 1.1 mrg cpp_hashnode *
2005 1.1 mrg _cpp_lex_identifier (cpp_reader *pfile, const char *name)
2006 1.1 mrg {
2007 1.1 mrg cpp_hashnode *result;
2008 1.1 mrg result = lex_identifier_intern (pfile, (uchar *) name);
2009 1.1 mrg return result;
2010 1.1 mrg }
2011 1.1 mrg
2012 1.1 mrg /* Lex an identifier starting at BUFFER->CUR - 1. */
2013 1.1 mrg static cpp_hashnode *
2014 1.1 mrg lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2015 1.1 mrg struct normalize_state *nst, cpp_hashnode **spelling)
2016 1.1 mrg {
2017 1.1 mrg cpp_hashnode *result;
2018 1.1 mrg const uchar *cur;
2019 1.1 mrg unsigned int len;
2020 1.1 mrg unsigned int hash = HT_HASHSTEP (0, *base);
2021 1.1 mrg const bool warn_bidi_p = pfile->warn_bidi_p ();
2022 1.1 mrg
2023 1.1 mrg cur = pfile->buffer->cur;
2024 1.1 mrg if (! starts_ucn)
2025 1.1 mrg {
2026 1.1 mrg while (ISIDNUM (*cur))
2027 1.1 mrg {
2028 1.1 mrg hash = HT_HASHSTEP (hash, *cur);
2029 1.1 mrg cur++;
2030 1.1 mrg }
2031 1.1 mrg NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2032 1.1 mrg }
2033 1.1 mrg pfile->buffer->cur = cur;
2034 1.1 mrg if (starts_ucn || forms_identifier_p (pfile, false, nst))
2035 1.1 mrg {
2036 1.1 mrg /* Slower version for identifiers containing UCNs
2037 1.1 mrg or extended chars (including $). */
2038 1.1 mrg do {
2039 1.1 mrg while (ISIDNUM (*pfile->buffer->cur))
2040 1.1 mrg {
2041 1.1 mrg NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2042 1.1 mrg pfile->buffer->cur++;
2043 1.1 mrg }
2044 1.1 mrg } while (forms_identifier_p (pfile, false, nst));
2045 1.1 mrg if (warn_bidi_p)
2046 1.1 mrg maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
2047 1.1 mrg result = _cpp_interpret_identifier (pfile, base,
2048 1.1 mrg pfile->buffer->cur - base);
2049 1.1 mrg *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2050 1.1 mrg }
2051 1.1 mrg else
2052 1.1 mrg {
2053 1.1 mrg len = cur - base;
2054 1.1 mrg hash = HT_HASHFINISH (hash, len);
2055 1.1 mrg
2056 1.1 mrg result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2057 1.1 mrg base, len, hash, HT_ALLOC));
2058 1.1 mrg *spelling = result;
2059 1.1 mrg }
2060 1.1 mrg
2061 1.1 mrg /* Rarely, identifiers require diagnostics when lexed. */
2062 1.1 mrg if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2063 1.1 mrg && !pfile->state.skipping, 0))
2064 1.1 mrg {
2065 1.1 mrg /* It is allowed to poison the same identifier twice. */
2066 1.1 mrg if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2067 1.1 mrg cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2068 1.1 mrg NODE_NAME (result));
2069 1.1 mrg
2070 1.1 mrg /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2071 1.1 mrg replacement list of a variadic macro. */
2072 1.1 mrg if (result == pfile->spec_nodes.n__VA_ARGS__
2073 1.1 mrg && !pfile->state.va_args_ok)
2074 1.1 mrg {
2075 1.1 mrg if (CPP_OPTION (pfile, cplusplus))
2076 1.1 mrg cpp_error (pfile, CPP_DL_PEDWARN,
2077 1.1 mrg "__VA_ARGS__ can only appear in the expansion"
2078 1.1 mrg " of a C++11 variadic macro");
2079 1.1 mrg else
2080 1.1 mrg cpp_error (pfile, CPP_DL_PEDWARN,
2081 1.1 mrg "__VA_ARGS__ can only appear in the expansion"
2082 1.1 mrg " of a C99 variadic macro");
2083 1.1 mrg }
2084 1.1 mrg
2085 1.1 mrg /* __VA_OPT__ should only appear in the replacement list of a
2086 1.1 mrg variadic macro. */
2087 1.1 mrg if (result == pfile->spec_nodes.n__VA_OPT__)
2088 1.1 mrg maybe_va_opt_error (pfile);
2089 1.1 mrg
2090 1.1 mrg /* For -Wc++-compat, warn about use of C++ named operators. */
2091 1.1 mrg if (result->flags & NODE_WARN_OPERATOR)
2092 1.1 mrg cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2093 1.1 mrg "identifier \"%s\" is a special operator name in C++",
2094 1.1 mrg NODE_NAME (result));
2095 1.1 mrg }
2096 1.1 mrg
2097 1.1 mrg return result;
2098 1.1 mrg }
2099 1.1 mrg
2100 1.1 mrg /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2101 1.1 mrg static void
2102 1.1 mrg lex_number (cpp_reader *pfile, cpp_string *number,
2103 1.1 mrg struct normalize_state *nst)
2104 1.1 mrg {
2105 1.1 mrg const uchar *cur;
2106 1.1 mrg const uchar *base;
2107 1.1 mrg uchar *dest;
2108 1.1 mrg
2109 1.1 mrg base = pfile->buffer->cur - 1;
2110 1.1 mrg do
2111 1.1 mrg {
2112 1.1 mrg const uchar *adj_digit_sep = NULL;
2113 1.1 mrg cur = pfile->buffer->cur;
2114 1.1 mrg
2115 1.1 mrg /* N.B. ISIDNUM does not include $. */
2116 1.1 mrg while (ISIDNUM (*cur)
2117 1.1 mrg || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2118 1.1 mrg || DIGIT_SEP (*cur)
2119 1.1 mrg || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2120 1.1 mrg {
2121 1.1 mrg NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2122 1.1 mrg /* Adjacent digit separators do not form part of the pp-number syntax.
2123 1.1 mrg However, they can safely be diagnosed here as an error, since '' is
2124 1.1 mrg not a valid preprocessing token. */
2125 1.1 mrg if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2126 1.1 mrg adj_digit_sep = cur;
2127 1.1 mrg cur++;
2128 1.1 mrg }
2129 1.1 mrg /* A number can't end with a digit separator. */
2130 1.1 mrg while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2131 1.1 mrg --cur;
2132 1.1 mrg if (adj_digit_sep && adj_digit_sep < cur)
2133 1.1 mrg cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
2134 1.1 mrg
2135 1.1 mrg pfile->buffer->cur = cur;
2136 1.1 mrg }
2137 1.1 mrg while (forms_identifier_p (pfile, false, nst));
2138 1.1 mrg
2139 1.1 mrg number->len = cur - base;
2140 1.1 mrg dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2141 1.1 mrg memcpy (dest, base, number->len);
2142 1.1 mrg dest[number->len] = '\0';
2143 1.1 mrg number->text = dest;
2144 1.1 mrg }
2145 1.1 mrg
2146 1.1 mrg /* Create a token of type TYPE with a literal spelling. */
2147 1.1 mrg static void
2148 1.1 mrg create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2149 1.1 mrg unsigned int len, enum cpp_ttype type)
2150 1.1 mrg {
2151 1.1 mrg token->type = type;
2152 1.1 mrg token->val.str.len = len;
2153 1.1 mrg token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2154 1.1 mrg }
2155 1.1 mrg
2156 1.1 mrg const uchar *
2157 1.1 mrg cpp_alloc_token_string (cpp_reader *pfile,
2158 1.1 mrg const unsigned char *ptr, unsigned len)
2159 1.1 mrg {
2160 1.1 mrg uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2161 1.1 mrg
2162 1.1 mrg dest[len] = 0;
2163 1.1 mrg memcpy (dest, ptr, len);
2164 1.1 mrg return dest;
2165 1.1 mrg }
2166 1.1 mrg
2167 1.1 mrg /* A pair of raw buffer pointers. The currently open one is [1], the
2168 1.1 mrg first one is [0]. Used for string literal lexing. */
2169 1.1 mrg struct lit_accum {
2170 1.1 mrg _cpp_buff *first;
2171 1.1 mrg _cpp_buff *last;
2172 1.1 mrg const uchar *rpos;
2173 1.1 mrg size_t accum;
2174 1.1 mrg
2175 1.1 mrg lit_accum ()
2176 1.1 mrg : first (NULL), last (NULL), rpos (0), accum (0)
2177 1.1 mrg {
2178 1.1 mrg }
2179 1.1 mrg
2180 1.1 mrg void append (cpp_reader *, const uchar *, size_t);
2181 1.1 mrg
2182 1.1 mrg void read_begin (cpp_reader *);
2183 1.1 mrg bool reading_p () const
2184 1.1 mrg {
2185 1.1 mrg return rpos != NULL;
2186 1.1 mrg }
2187 1.1 mrg char read_char ()
2188 1.1 mrg {
2189 1.1 mrg char c = *rpos++;
2190 1.1 mrg if (rpos == BUFF_FRONT (last))
2191 1.1 mrg rpos = NULL;
2192 1.1 mrg return c;
2193 1.1 mrg }
2194 1.1 mrg };
2195 1.1 mrg
2196 1.1 mrg /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2197 1.1 mrg sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2198 1.1 mrg
2199 1.1 mrg void
2200 1.1 mrg lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2201 1.1 mrg {
2202 1.1 mrg if (!last)
2203 1.1 mrg /* Starting. */
2204 1.1 mrg first = last = _cpp_get_buff (pfile, len);
2205 1.1 mrg else if (len > BUFF_ROOM (last))
2206 1.1 mrg {
2207 1.1 mrg /* There is insufficient room in the buffer. Copy what we can,
2208 1.1 mrg and then either extend or create a new one. */
2209 1.1 mrg size_t room = BUFF_ROOM (last);
2210 1.1 mrg memcpy (BUFF_FRONT (last), base, room);
2211 1.1 mrg BUFF_FRONT (last) += room;
2212 1.1 mrg base += room;
2213 1.1 mrg len -= room;
2214 1.1 mrg accum += room;
2215 1.1 mrg
2216 1.1 mrg gcc_checking_assert (!rpos);
2217 1.1 mrg
2218 1.1 mrg last = _cpp_append_extend_buff (pfile, last, len);
2219 1.1 mrg }
2220 1.1 mrg
2221 1.1 mrg memcpy (BUFF_FRONT (last), base, len);
2222 1.1 mrg BUFF_FRONT (last) += len;
2223 1.1 mrg accum += len;
2224 1.1 mrg }
2225 1.1 mrg
2226 1.1 mrg void
2227 1.1 mrg lit_accum::read_begin (cpp_reader *pfile)
2228 1.1 mrg {
2229 1.1 mrg /* We never accumulate more than 4 chars to read. */
2230 1.1 mrg if (BUFF_ROOM (last) < 4)
2231 1.1 mrg
2232 1.1 mrg last = _cpp_append_extend_buff (pfile, last, 4);
2233 1.1 mrg rpos = BUFF_FRONT (last);
2234 1.1 mrg }
2235 1.1 mrg
2236 1.1 mrg /* Returns true if a macro has been defined.
2237 1.1 mrg This might not work if compile with -save-temps,
2238 1.1 mrg or preprocess separately from compilation. */
2239 1.1 mrg
2240 1.1 mrg static bool
2241 1.1 mrg is_macro(cpp_reader *pfile, const uchar *base)
2242 1.1 mrg {
2243 1.1 mrg const uchar *cur = base;
2244 1.1 mrg if (! ISIDST (*cur))
2245 1.1 mrg return false;
2246 1.1 mrg unsigned int hash = HT_HASHSTEP (0, *cur);
2247 1.1 mrg ++cur;
2248 1.1 mrg while (ISIDNUM (*cur))
2249 1.1 mrg {
2250 1.1 mrg hash = HT_HASHSTEP (hash, *cur);
2251 1.1 mrg ++cur;
2252 1.1 mrg }
2253 1.1 mrg hash = HT_HASHFINISH (hash, cur - base);
2254 1.1 mrg
2255 1.1 mrg cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2256 1.1 mrg base, cur - base, hash, HT_NO_INSERT));
2257 1.1 mrg
2258 1.1 mrg return result && cpp_macro_p (result);
2259 1.1 mrg }
2260 1.1 mrg
2261 1.1 mrg /* Returns true if a literal suffix does not have the expected form
2262 1.1 mrg and is defined as a macro. */
2263 1.1 mrg
2264 1.1 mrg static bool
2265 1.1 mrg is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2266 1.1 mrg {
2267 1.1 mrg /* User-defined literals outside of namespace std must start with a single
2268 1.1 mrg underscore, so assume anything of that form really is a UDL suffix.
2269 1.1 mrg We don't need to worry about UDLs defined inside namespace std because
2270 1.1 mrg their names are reserved, so cannot be used as macro names in valid
2271 1.1 mrg programs. */
2272 1.1 mrg if (base[0] == '_' && base[1] != '_')
2273 1.1 mrg return false;
2274 1.1 mrg return is_macro (pfile, base);
2275 1.1 mrg }
2276 1.1 mrg
2277 1.1 mrg /* Lexes a raw string. The stored string contains the spelling,
2278 1.1 mrg including double quotes, delimiter string, '(' and ')', any leading
2279 1.1 mrg 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2280 1.1 mrg the type of the literal, or CPP_OTHER if it was not properly
2281 1.1 mrg terminated.
2282 1.1 mrg
2283 1.1 mrg BASE is the start of the token. Updates pfile->buffer->cur to just
2284 1.1 mrg after the lexed string.
2285 1.1 mrg
2286 1.1 mrg The spelling is NUL-terminated, but it is not guaranteed that this
2287 1.1 mrg is the first NUL since embedded NULs are preserved. */
2288 1.1 mrg
2289 1.1 mrg static void
2290 1.1 mrg lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2291 1.1 mrg {
2292 1.1 mrg const uchar *pos = base;
2293 1.1 mrg const bool warn_bidi_p = pfile->warn_bidi_p ();
2294 1.1 mrg
2295 1.1 mrg /* 'tis a pity this information isn't passed down from the lexer's
2296 1.1 mrg initial categorization of the token. */
2297 1.1 mrg enum cpp_ttype type = CPP_STRING;
2298 1.1 mrg
2299 1.1 mrg if (*pos == 'L')
2300 1.1 mrg {
2301 1.1 mrg type = CPP_WSTRING;
2302 1.1 mrg pos++;
2303 1.1 mrg }
2304 1.1 mrg else if (*pos == 'U')
2305 1.1 mrg {
2306 1.1 mrg type = CPP_STRING32;
2307 1.1 mrg pos++;
2308 1.1 mrg }
2309 1.1 mrg else if (*pos == 'u')
2310 1.1 mrg {
2311 1.1 mrg if (pos[1] == '8')
2312 1.1 mrg {
2313 1.1 mrg type = CPP_UTF8STRING;
2314 1.1 mrg pos++;
2315 1.1 mrg }
2316 1.1 mrg else
2317 1.1 mrg type = CPP_STRING16;
2318 1.1 mrg pos++;
2319 1.1 mrg }
2320 1.1 mrg
2321 1.1 mrg gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2322 1.1 mrg pos += 2;
2323 1.1 mrg
2324 1.1 mrg _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2325 1.1 mrg
2326 1.1 mrg /* Skip notes before the ". */
2327 1.1 mrg while (note->pos < pos)
2328 1.1 mrg ++note;
2329 1.1 mrg
2330 1.1 mrg lit_accum accum;
2331 1.1 mrg
2332 1.1 mrg uchar prefix[17];
2333 1.1 mrg unsigned prefix_len = 0;
2334 1.1 mrg enum Phase
2335 1.1 mrg {
2336 1.1 mrg PHASE_PREFIX = -2,
2337 1.1 mrg PHASE_NONE = -1,
2338 1.1 mrg PHASE_SUFFIX = 0
2339 1.1 mrg } phase = PHASE_PREFIX;
2340 1.1 mrg
2341 1.1 mrg for (;;)
2342 1.1 mrg {
2343 1.1 mrg gcc_checking_assert (note->pos >= pos);
2344 1.1 mrg
2345 1.1 mrg /* Undo any escaped newlines and trigraphs. */
2346 1.1 mrg if (!accum.reading_p () && note->pos == pos)
2347 1.1 mrg switch (note->type)
2348 1.1 mrg {
2349 1.1 mrg case '\\':
2350 1.1 mrg case ' ':
2351 1.1 mrg /* Restore backslash followed by newline. */
2352 1.1 mrg accum.append (pfile, base, pos - base);
2353 1.1 mrg base = pos;
2354 1.1 mrg accum.read_begin (pfile);
2355 1.1 mrg accum.append (pfile, UC"\\", 1);
2356 1.1 mrg
2357 1.1 mrg after_backslash:
2358 1.1 mrg if (note->type == ' ')
2359 1.1 mrg /* GNU backslash whitespace newline extension. FIXME
2360 1.1 mrg could be any sequence of non-vertical space. When we
2361 1.1 mrg can properly restore any such sequence, we should
2362 1.1 mrg mark this note as handled so _cpp_process_line_notes
2363 1.1 mrg doesn't warn. */
2364 1.1 mrg accum.append (pfile, UC" ", 1);
2365 1.1 mrg
2366 1.1 mrg accum.append (pfile, UC"\n", 1);
2367 1.1 mrg note++;
2368 1.1 mrg break;
2369 1.1 mrg
2370 1.1 mrg case '\n':
2371 1.1 mrg /* This can happen for ??/<NEWLINE> when trigraphs are not
2372 1.1 mrg being interpreted. */
2373 1.1 mrg gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2374 1.1 mrg note->type = 0;
2375 1.1 mrg note++;
2376 1.1 mrg break;
2377 1.1 mrg
2378 1.1 mrg default:
2379 1.1 mrg gcc_checking_assert (_cpp_trigraph_map[note->type]);
2380 1.1 mrg
2381 1.1 mrg /* Don't warn about this trigraph in
2382 1.1 mrg _cpp_process_line_notes, since trigraphs show up as
2383 1.1 mrg trigraphs in raw strings. */
2384 1.1 mrg uchar type = note->type;
2385 1.1 mrg note->type = 0;
2386 1.1 mrg
2387 1.1 mrg if (CPP_OPTION (pfile, trigraphs))
2388 1.1 mrg {
2389 1.1 mrg accum.append (pfile, base, pos - base);
2390 1.1 mrg base = pos;
2391 1.1 mrg accum.read_begin (pfile);
2392 1.1 mrg accum.append (pfile, UC"??", 2);
2393 1.1 mrg accum.append (pfile, &type, 1);
2394 1.1 mrg
2395 1.1 mrg /* ??/ followed by newline gets two line notes, one for
2396 1.1 mrg the trigraph and one for the backslash/newline. */
2397 1.1 mrg if (type == '/' && note[1].pos == pos)
2398 1.1 mrg {
2399 1.1 mrg note++;
2400 1.1 mrg gcc_assert (note->type == '\\' || note->type == ' ');
2401 1.1 mrg goto after_backslash;
2402 1.1 mrg }
2403 1.1 mrg /* Skip the replacement character. */
2404 1.1 mrg base = ++pos;
2405 1.1 mrg }
2406 1.1 mrg
2407 1.1 mrg note++;
2408 1.1 mrg break;
2409 1.1 mrg }
2410 1.1 mrg
2411 1.1 mrg /* Now get a char to process. Either from an expanded note, or
2412 1.1 mrg from the line buffer. */
2413 1.1 mrg bool read_note = accum.reading_p ();
2414 1.1 mrg char c = read_note ? accum.read_char () : *pos++;
2415 1.1 mrg
2416 1.1 mrg if (phase == PHASE_PREFIX)
2417 1.1 mrg {
2418 1.1 mrg if (c == '(')
2419 1.1 mrg {
2420 1.1 mrg /* Done. */
2421 1.1 mrg phase = PHASE_NONE;
2422 1.1 mrg prefix[prefix_len++] = '"';
2423 1.1 mrg }
2424 1.1 mrg else if (prefix_len < 16
2425 1.1 mrg /* Prefix chars are any of the basic character set,
2426 1.1 mrg [lex.charset] except for '
2427 1.1 mrg ()\\\t\v\f\n'. Optimized for a contiguous
2428 1.1 mrg alphabet. */
2429 1.1 mrg /* Unlike a switch, this collapses down to one or
2430 1.1 mrg two shift and bitmask operations on an ASCII
2431 1.1 mrg system, with an outlier or two. */
2432 1.1 mrg && (('Z' - 'A' == 25
2433 1.1 mrg ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2434 1.1 mrg : ISIDST (c))
2435 1.1 mrg || (c >= '0' && c <= '9')
2436 1.1 mrg || c == '_' || c == '{' || c == '}'
2437 1.1 mrg || c == '[' || c == ']' || c == '#'
2438 1.1 mrg || c == '<' || c == '>' || c == '%'
2439 1.1 mrg || c == ':' || c == ';' || c == '.' || c == '?'
2440 1.1 mrg || c == '*' || c == '+' || c == '-' || c == '/'
2441 1.1 mrg || c == '^' || c == '&' || c == '|' || c == '~'
2442 1.1 mrg || c == '!' || c == '=' || c == ','
2443 1.1 mrg || c == '"' || c == '\''))
2444 1.1 mrg prefix[prefix_len++] = c;
2445 1.1 mrg else
2446 1.1 mrg {
2447 1.1 mrg /* Something is wrong. */
2448 1.1 mrg int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2449 1.1 mrg if (prefix_len == 16)
2450 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2451 1.1 mrg col, "raw string delimiter longer "
2452 1.1 mrg "than 16 characters");
2453 1.1 mrg else if (c == '\n')
2454 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2455 1.1 mrg col, "invalid new-line in raw "
2456 1.1 mrg "string delimiter");
2457 1.1 mrg else
2458 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2459 1.1 mrg col, "invalid character '%c' in "
2460 1.1 mrg "raw string delimiter", c);
2461 1.1 mrg type = CPP_OTHER;
2462 1.1 mrg phase = PHASE_NONE;
2463 1.1 mrg /* Continue until we get a close quote, that's probably
2464 1.1 mrg the best failure mode. */
2465 1.1 mrg prefix_len = 0;
2466 1.1 mrg }
2467 1.1 mrg if (c != '\n')
2468 1.1 mrg continue;
2469 1.1 mrg }
2470 1.1 mrg
2471 1.1 mrg if (phase != PHASE_NONE)
2472 1.1 mrg {
2473 1.1 mrg if (prefix[phase] != c)
2474 1.1 mrg phase = PHASE_NONE;
2475 1.1 mrg else if (unsigned (phase + 1) == prefix_len)
2476 1.1 mrg break;
2477 1.1 mrg else
2478 1.1 mrg {
2479 1.1 mrg phase = Phase (phase + 1);
2480 1.1 mrg continue;
2481 1.1 mrg }
2482 1.1 mrg }
2483 1.1 mrg
2484 1.1 mrg if (!prefix_len && c == '"')
2485 1.1 mrg /* Failure mode lexing. */
2486 1.1 mrg goto out;
2487 1.1 mrg else if (prefix_len && c == ')')
2488 1.1 mrg phase = PHASE_SUFFIX;
2489 1.1 mrg else if (!read_note && c == '\n')
2490 1.1 mrg {
2491 1.1 mrg pos--;
2492 1.1 mrg pfile->buffer->cur = pos;
2493 1.1 mrg if (pfile->state.in_directive
2494 1.1 mrg || (pfile->state.parsing_args
2495 1.1 mrg && pfile->buffer->next_line >= pfile->buffer->rlimit))
2496 1.1 mrg {
2497 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2498 1.1 mrg "unterminated raw string");
2499 1.1 mrg type = CPP_OTHER;
2500 1.1 mrg goto out;
2501 1.1 mrg }
2502 1.1 mrg
2503 1.1 mrg accum.append (pfile, base, pos - base + 1);
2504 1.1 mrg _cpp_process_line_notes (pfile, false);
2505 1.1 mrg
2506 1.1 mrg if (pfile->buffer->next_line < pfile->buffer->rlimit)
2507 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
2508 1.1 mrg pfile->buffer->need_line = true;
2509 1.1 mrg
2510 1.1 mrg if (!_cpp_get_fresh_line (pfile))
2511 1.1 mrg {
2512 1.1 mrg /* We ran out of file and failed to get a line. */
2513 1.1 mrg location_t src_loc = token->src_loc;
2514 1.1 mrg token->type = CPP_EOF;
2515 1.1 mrg /* Tell the compiler the line number of the EOF token. */
2516 1.1 mrg token->src_loc = pfile->line_table->highest_line;
2517 1.1 mrg token->flags = BOL;
2518 1.1 mrg if (accum.first)
2519 1.1 mrg _cpp_release_buff (pfile, accum.first);
2520 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2521 1.1 mrg "unterminated raw string");
2522 1.1 mrg /* Now pop the buffer that _cpp_get_fresh_line did not. */
2523 1.1 mrg _cpp_pop_buffer (pfile);
2524 1.1 mrg return;
2525 1.1 mrg }
2526 1.1 mrg
2527 1.1 mrg pos = base = pfile->buffer->cur;
2528 1.1 mrg note = &pfile->buffer->notes[pfile->buffer->cur_note];
2529 1.1 mrg }
2530 1.1 mrg else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
2531 1.1 mrg && warn_bidi_p)
2532 1.1 mrg {
2533 1.1 mrg location_t loc;
2534 1.1 mrg bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
2535 1.1 mrg maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2536 1.1 mrg }
2537 1.1 mrg }
2538 1.1 mrg
2539 1.1 mrg if (warn_bidi_p)
2540 1.1 mrg maybe_warn_bidi_on_close (pfile, pos);
2541 1.1 mrg
2542 1.1 mrg if (CPP_OPTION (pfile, user_literals))
2543 1.1 mrg {
2544 1.1 mrg /* If a string format macro, say from inttypes.h, is placed touching
2545 1.1 mrg a string literal it could be parsed as a C++11 user-defined string
2546 1.1 mrg literal thus breaking the program. */
2547 1.1 mrg if (is_macro_not_literal_suffix (pfile, pos))
2548 1.1 mrg {
2549 1.1 mrg /* Raise a warning, but do not consume subsequent tokens. */
2550 1.1 mrg if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2551 1.1 mrg cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2552 1.1 mrg token->src_loc, 0,
2553 1.1 mrg "invalid suffix on literal; C++11 requires "
2554 1.1 mrg "a space between literal and string macro");
2555 1.1 mrg }
2556 1.1 mrg /* Grab user defined literal suffix. */
2557 1.1 mrg else if (ISIDST (*pos))
2558 1.1 mrg {
2559 1.1 mrg type = cpp_userdef_string_add_type (type);
2560 1.1 mrg ++pos;
2561 1.1 mrg
2562 1.1 mrg while (ISIDNUM (*pos))
2563 1.1 mrg ++pos;
2564 1.1 mrg }
2565 1.1 mrg }
2566 1.1 mrg
2567 1.1 mrg out:
2568 1.1 mrg pfile->buffer->cur = pos;
2569 1.1 mrg if (!accum.accum)
2570 1.1 mrg create_literal (pfile, token, base, pos - base, type);
2571 1.1 mrg else
2572 1.1 mrg {
2573 1.1 mrg size_t extra_len = pos - base;
2574 1.1 mrg uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2575 1.1 mrg
2576 1.1 mrg token->type = type;
2577 1.1 mrg token->val.str.len = accum.accum + extra_len;
2578 1.1 mrg token->val.str.text = dest;
2579 1.1 mrg for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2580 1.1 mrg {
2581 1.1 mrg size_t len = BUFF_FRONT (buf) - buf->base;
2582 1.1 mrg memcpy (dest, buf->base, len);
2583 1.1 mrg dest += len;
2584 1.1 mrg }
2585 1.1 mrg _cpp_release_buff (pfile, accum.first);
2586 1.1 mrg memcpy (dest, base, extra_len);
2587 1.1 mrg dest[extra_len] = '\0';
2588 1.1 mrg }
2589 1.1 mrg }
2590 1.1 mrg
2591 1.1 mrg /* Lexes a string, character constant, or angle-bracketed header file
2592 1.1 mrg name whose spelling starts at BASE. The stored string contains the
2593 1.1 mrg spelling, including opening quote and any leading 'L', 'u', 'U' or 'u8';
2594 1.1 mrg an 'R' modifier hands the literal to lex_raw_string. Nothing is returned:
2595 1.1 mrg TOKEN receives the literal's type, or CPP_OTHER if it was not properly
2596 1.1 mrg terminated, or CPP_LESS for an unterminated header name which must be relexed as normal tokens.
2597 1.1 mrg
2598 1.1 mrg The spelling is NUL-terminated, but it is not guaranteed that this
2599 1.1 mrg is the first NUL since embedded NULs are preserved. */
2600 1.1 mrg static void
2601 1.1 mrg lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2602 1.1 mrg {
2603 1.1 mrg bool saw_NUL = false;
2604 1.1 mrg const uchar *cur;
2605 1.1 mrg cppchar_t terminator;
2606 1.1 mrg enum cpp_ttype type;
2607 1.1 mrg
/* Step over any encoding prefix (L, U, u or u8) so that TERMINATOR ends
   up holding the character that follows it. */
2608 1.1 mrg cur = base;
2609 1.1 mrg terminator = *cur++;
2610 1.1 mrg if (terminator == 'L' || terminator == 'U')
2611 1.1 mrg terminator = *cur++;
2612 1.1 mrg else if (terminator == 'u')
2613 1.1 mrg {
2614 1.1 mrg terminator = *cur++;
2615 1.1 mrg if (terminator == '8')
2616 1.1 mrg terminator = *cur++;
2617 1.1 mrg }
/* An 'R' at this point marks a raw string, which has its own lexer. */
2618 1.1 mrg if (terminator == 'R')
2619 1.1 mrg {
2620 1.1 mrg lex_raw_string (pfile, token, base);
2621 1.1 mrg return;
2622 1.1 mrg }
/* Pick the token type from the delimiter and the first characters of
   the prefix at BASE. */
2623 1.1 mrg if (terminator == '"')
2624 1.1 mrg type = (*base == 'L' ? CPP_WSTRING :
2625 1.1 mrg *base == 'U' ? CPP_STRING32 :
2626 1.1 mrg *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2627 1.1 mrg : CPP_STRING);
2628 1.1 mrg else if (terminator == '\'')
2629 1.1 mrg type = (*base == 'L' ? CPP_WCHAR :
2630 1.1 mrg *base == 'U' ? CPP_CHAR32 :
2631 1.1 mrg *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2632 1.1 mrg : CPP_CHAR);
2633 1.1 mrg else
/* Any other delimiter is treated as a header name closed by '>'. */
2634 1.1 mrg terminator = '>', type = CPP_HEADER_NAME;
2635 1.1 mrg
2636 1.1 mrg const bool warn_bidi_p = pfile->warn_bidi_p ();
/* Scan forward one character at a time until the terminator or the end
   of the line is reached. */
2637 1.1 mrg for (;;)
2638 1.1 mrg {
2639 1.1 mrg cppchar_t c = *cur++;
2640 1.1 mrg
2641 1.1 mrg /* In #include-style directives, terminators are not escapable. */
2642 1.1 mrg if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2643 1.1 mrg {
2644 1.1 mrg if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
2645 1.1 mrg {
2646 1.1 mrg location_t loc;
2647 1.1 mrg bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
2648 1.1 mrg &loc);
2649 1.1 mrg maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2650 1.1 mrg }
/* Skip the escaped character so it cannot be taken for the terminator. */
2651 1.1 mrg cur++;
2652 1.1 mrg }
2653 1.1 mrg else if (c == terminator)
2654 1.1 mrg {
2655 1.1 mrg if (warn_bidi_p)
2656 1.1 mrg maybe_warn_bidi_on_close (pfile, cur - 1);
2657 1.1 mrg break;
2658 1.1 mrg }
2659 1.1 mrg else if (c == '\n')
2660 1.1 mrg {
/* Back up so the newline itself is not part of the literal. */
2661 1.1 mrg cur--;
2662 1.1 mrg /* Unmatched quotes always yield undefined behavior, but
2663 1.1 mrg greedy lexing means that what appears to be an unterminated
2664 1.1 mrg header name may actually be a legitimate sequence of tokens. */
2665 1.1 mrg if (terminator == '>')
2666 1.1 mrg {
/* Return without advancing pfile->buffer->cur and without creating a
   literal; only the token type is set. */
2667 1.1 mrg token->type = CPP_LESS;
2668 1.1 mrg return;
2669 1.1 mrg }
2670 1.1 mrg type = CPP_OTHER;
2671 1.1 mrg break;
2672 1.1 mrg }
2673 1.1 mrg else if (c == '\0')
2674 1.1 mrg saw_NUL = true;
2675 1.1 mrg else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
2676 1.1 mrg {
2677 1.1 mrg location_t loc;
2678 1.1 mrg bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
2679 1.1 mrg maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2680 1.1 mrg }
2681 1.1 mrg }
2682 1.1 mrg
2683 1.1 mrg if (saw_NUL && !pfile->state.skipping)
2684 1.1 mrg cpp_error (pfile, CPP_DL_WARNING,
2685 1.1 mrg "null character(s) preserved in literal");
2686 1.1 mrg
/* The missing-terminator diagnostic is not issued when the language is
   CLK_ASM. */
2687 1.1 mrg if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2688 1.1 mrg cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2689 1.1 mrg (int) terminator);
2690 1.1 mrg
2691 1.1 mrg if (CPP_OPTION (pfile, user_literals))
2692 1.1 mrg {
2693 1.1 mrg /* If a string format macro, say from inttypes.h, is placed touching
2694 1.1 mrg a string literal it could be parsed as a C++11 user-defined string
2695 1.1 mrg literal thus breaking the program. */
2696 1.1 mrg if (is_macro_not_literal_suffix (pfile, cur))
2697 1.1 mrg {
2698 1.1 mrg /* Raise a warning, but do not consume subsequent tokens. */
2699 1.1 mrg if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2700 1.1 mrg cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2701 1.1 mrg token->src_loc, 0,
2702 1.1 mrg "invalid suffix on literal; C++11 requires "
2703 1.1 mrg "a space between literal and string macro");
2704 1.1 mrg }
2705 1.1 mrg /* Grab user defined literal suffix. */
2706 1.1 mrg else if (ISIDST (*cur))
2707 1.1 mrg {
/* NOTE(review): both conversions are applied in sequence; presumably
   each leaves types outside its own family unchanged -- confirm. */
2708 1.1 mrg type = cpp_userdef_char_add_type (type);
2709 1.1 mrg type = cpp_userdef_string_add_type (type);
2710 1.1 mrg ++cur;
2711 1.1 mrg
/* Extend the literal over the rest of the identifier-like suffix. */
2712 1.1 mrg while (ISIDNUM (*cur))
2713 1.1 mrg ++cur;
2714 1.1 mrg }
2715 1.1 mrg }
2716 1.1 mrg else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2717 1.1 mrg && is_macro (pfile, cur)
2718 1.1 mrg && !pfile->state.skipping)
2719 1.1 mrg cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2720 1.1 mrg token->src_loc, 0, "C++11 requires a space "
2721 1.1 mrg "between string literal and macro");
2722 1.1 mrg
/* Commit the scan position and hand the spelling [BASE, CUR) over to
   create_literal. */
2723 1.1 mrg pfile->buffer->cur = cur;
2724 1.1 mrg create_literal (pfile, token, base, cur - base, type);
2725 1.1 mrg }
2726 1.1 mrg
2727 1.1 mrg /* Return the comment table. The client may not make any assumption
2728 1.1 mrg about the ordering of the table. */
2729 1.1 mrg cpp_comment_table *
2730 1.1 mrg cpp_get_comments (cpp_reader *pfile)
2731 1.1 mrg {
2732 1.1 mrg return &pfile->comments;
2733 1.1 mrg }
2734 1.1 mrg
2735 1.1 mrg /* Append a comment to the end of the comment table. */
2736 1.1 mrg static void
2737 1.1 mrg store_comment (cpp_reader *pfile, cpp_token *token)
2738 1.1 mrg {
2739 1.1 mrg int len;
2740 1.1 mrg
2741 1.1 mrg if (pfile->comments.allocated == 0)
2742 1.1 mrg {
2743 1.1 mrg pfile->comments.allocated = 256;
2744 1.1 mrg pfile->comments.entries = (cpp_comment *) xmalloc
2745 1.1 mrg (pfile->comments.allocated * sizeof (cpp_comment));
2746 1.1 mrg }
2747 1.1 mrg
2748 1.1 mrg if (pfile->comments.count == pfile->comments.allocated)
2749 1.1 mrg {
2750 1.1 mrg pfile->comments.allocated *= 2;
2751 1.1 mrg pfile->comments.entries = (cpp_comment *) xrealloc
2752 1.1 mrg (pfile->comments.entries,
2753 1.1 mrg pfile->comments.allocated * sizeof (cpp_comment));
2754 1.1 mrg }
2755 1.1 mrg
2756 1.1 mrg len = token->val.str.len;
2757 1.1 mrg
2758 1.1 mrg /* Copy comment. Note, token may not be NULL terminated. */
2759 1.1 mrg pfile->comments.entries[pfile->comments.count].comment =
2760 1.1 mrg (char *) xmalloc (sizeof (char) * (len + 1));
2761 1.1 mrg memcpy (pfile->comments.entries[pfile->comments.count].comment,
2762 1.1 mrg token->val.str.text, len);
2763 1.1 mrg pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2764 1.1 mrg
2765 1.1 mrg /* Set source location. */
2766 1.1 mrg pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2767 1.1 mrg
2768 1.1 mrg /* Increment the count of entries in the comment table. */
2769 1.1 mrg pfile->comments.count++;
2770 1.1 mrg }
2771 1.1 mrg
2772 1.1 mrg /* The stored comment includes the comment start and any terminator. */
2773 1.1 mrg static void
2774 1.1 mrg save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2775 1.1 mrg cppchar_t type)
2776 1.1 mrg {
2777 1.1 mrg unsigned char *buffer;
2778 1.2 mrg unsigned int len, clen, i;
2779 1.2 mrg int convert_to_c = (pfile->state.in_directive || pfile->state.parsing_args)
2780 1.1 mrg && type == '/';
2781 1.1 mrg
2782 1.1 mrg len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
2783 1.1 mrg
2784 1.1 mrg /* C++ comments probably (not definitely) have moved past a new
2785 1.1 mrg line, which we don't want to save in the comment. */
2786 1.1 mrg if (is_vspace (pfile->buffer->cur[-1]))
2787 1.1 mrg len--;
2788 1.1 mrg
2789 1.1 mrg /* If we are currently in a directive or in argument parsing, then
2790 1.1 mrg we need to store all C++ comments as C comments internally, and
2791 1.1 mrg so we need to allocate a little extra space in that case.
2792 1.1 mrg
2793 1.1 mrg Note that the only time we encounter a directive here is
2794 1.2 mrg when we are saving comments in a "#define". */
2795 1.1 mrg clen = convert_to_c ? len + 2 : len;
2796 1.1 mrg
2797 1.1 mrg buffer = _cpp_unaligned_alloc (pfile, clen);
2798 1.1 mrg
2799 1.1 mrg token->type = CPP_COMMENT;
2800 1.1 mrg token->val.str.len = clen;
2801 1.1 mrg token->val.str.text = buffer;
2802 1.1 mrg
2803 1.1 mrg buffer[0] = '/';
2804 1.1 mrg memcpy (buffer + 1, from, len - 1);
2805 1.1 mrg
2806 1.2 mrg /* Finish conversion to a C comment, if necessary. */
2807 1.1 mrg if (convert_to_c)
2808 1.1 mrg {
2809 1.1 mrg buffer[1] = '*';
2810 1.1 mrg buffer[clen - 2] = '*';
2811 1.1 mrg buffer[clen - 1] = '/';
2812 1.1 mrg /* As there can be in a C++ comments illegal sequences for C comments
2813 1.1 mrg we need to filter them out. */
2814 1.1 mrg for (i = 2; i < (clen - 2); i++)
2815 1.1 mrg if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2816 1.1 mrg buffer[i] = '|';
2817 1.1 mrg }
2818 1.1 mrg
2819 1.1 mrg /* Finally store this comment for use by clients of libcpp. */
2820 1.1 mrg store_comment (pfile, token);
2821 1.1 mrg }
2822 1.1 mrg
2823 1.1 mrg /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2824 1.1 mrg comment. *COMMENT_START is '*' for a C block comment and anything else for a C++ line comment; matching begins one character later and is bounded by pfile->buffer->cur. How much of the text must match depends on the cpp_warn_implicit_fallthrough option (see the cases below). */
2825 1.1 mrg
2826 1.1 mrg static bool
2827 1.1 mrg fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2828 1.1 mrg {
2829 1.1 mrg const unsigned char *from = comment_start + 1;
2830 1.1 mrg
2831 1.1 mrg switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2832 1.1 mrg {
2833 1.1 mrg /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2834 1.1 mrg don't recognize any comments. The latter only checks attributes,
2835 1.1 mrg the former doesn't warn. */
2836 1.1 mrg case 0:
2837 1.1 mrg default:
2838 1.1 mrg return false;
2839 1.1 mrg /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2840 1.1 mrg content it has. */
2841 1.1 mrg case 1:
2842 1.1 mrg return true;
2843 1.1 mrg case 2:
2844 1.1 mrg /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2845 1.1 mrg .*falls?[ \t-]*thr(u|ough).* regex. */
/* Try every start position that still leaves room for the shortest
   spelling, "fallthru", before pfile->buffer->cur. */
2846 1.1 mrg for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2847 1.1 mrg from++)
2848 1.1 mrg {
2849 1.1 mrg /* Is there anything like strpbrk with upper boundary, or
2850 1.1 mrg memchr looking for 2 characters rather than just one? */
2851 1.1 mrg if (from[0] != 'f' && from[0] != 'F')
2852 1.1 mrg continue;
2853 1.1 mrg if (from[1] != 'a' && from[1] != 'A')
2854 1.1 mrg continue;
2855 1.1 mrg if (from[2] != 'l' && from[2] != 'L')
2856 1.1 mrg continue;
2857 1.1 mrg if (from[3] != 'l' && from[3] != 'L')
2858 1.1 mrg continue;
/* "fall" matched: from here on FROM is advanced past what has been
   consumed, so a later `continue' resumes the search after it. */
2859 1.1 mrg from += sizeof "fall" - 1;
2860 1.1 mrg if (from[0] == 's' || from[0] == 'S')
2861 1.1 mrg from++;
2862 1.1 mrg while (*from == ' ' || *from == '\t' || *from == '-')
2863 1.1 mrg from++;
2864 1.1 mrg if (from[0] != 't' && from[0] != 'T')
2865 1.1 mrg continue;
2866 1.1 mrg if (from[1] != 'h' && from[1] != 'H')
2867 1.1 mrg continue;
2868 1.1 mrg if (from[2] != 'r' && from[2] != 'R')
2869 1.1 mrg continue;
2870 1.1 mrg if (from[3] == 'u' || from[3] == 'U')
2871 1.1 mrg return true;
2872 1.1 mrg if (from[3] != 'o' && from[3] != 'O')
2873 1.1 mrg continue;
2874 1.1 mrg if (from[4] != 'u' && from[4] != 'U')
2875 1.1 mrg continue;
2876 1.1 mrg if (from[5] != 'g' && from[5] != 'G')
2877 1.1 mrg continue;
2878 1.1 mrg if (from[6] != 'h' && from[6] != 'H')
2879 1.1 mrg continue;
2880 1.1 mrg return true;
2881 1.1 mrg }
2882 1.1 mrg return false;
/* Levels 3 and 4 are handled by the whole-comment patterns that follow
   the switch. */
2883 1.1 mrg case 3:
2884 1.1 mrg case 4:
2885 1.1 mrg break;
2886 1.1 mrg }
2887 1.1 mrg
2888 1.1 mrg /* Whole comment contents:
2889 1.1 mrg -fallthrough
2890 1.1 mrg @fallthrough@
2891 1.1 mrg */
2892 1.1 mrg if (*from == '-' || *from == '@')
2893 1.1 mrg {
2894 1.1 mrg size_t len = sizeof "fallthrough" - 1;
2895 1.1 mrg if ((size_t) (pfile->buffer->cur - from - 1) < len)
2896 1.1 mrg return false;
2897 1.1 mrg if (memcmp (from + 1, "fallthrough", len))
2898 1.1 mrg return false;
2899 1.1 mrg if (*from == '@')
2900 1.1 mrg {
/* The '@' form needs a closing '@' as well; count it in LEN so it is
   skipped below. */
2901 1.1 mrg if (from[len + 1] != '@')
2902 1.1 mrg return false;
2903 1.1 mrg len++;
2904 1.1 mrg }
2905 1.1 mrg from += 1 + len;
2906 1.1 mrg }
2907 1.1 mrg /* Whole comment contents (regex):
2908 1.1 mrg lint -fallthrough[ \t]*
2909 1.1 mrg */
2910 1.1 mrg else if (*from == 'l')
2911 1.1 mrg {
2912 1.1 mrg size_t len = sizeof "int -fallthrough" - 1;
2913 1.1 mrg if ((size_t) (pfile->buffer->cur - from - 1) < len)
2914 1.1 mrg return false;
2915 1.1 mrg if (memcmp (from + 1, "int -fallthrough", len))
2916 1.1 mrg return false;
2917 1.1 mrg from += 1 + len;
2918 1.1 mrg while (*from == ' ' || *from == '\t')
2919 1.1 mrg from++;
2920 1.1 mrg }
2921 1.1 mrg /* Whole comment contents (regex):
2922 1.1 mrg [ \t]*FALLTHR(U|OUGH)[ \t]*
2923 1.1 mrg */
2924 1.1 mrg else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2925 1.1 mrg {
2926 1.1 mrg while (*from == ' ' || *from == '\t')
2927 1.1 mrg from++;
2928 1.1 mrg if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
2929 1.1 mrg return false;
2930 1.1 mrg if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2931 1.1 mrg return false;
2932 1.1 mrg from += sizeof "FALLTHR" - 1;
2933 1.1 mrg if (*from == 'U')
2934 1.1 mrg from++;
2935 1.1 mrg else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
2936 1.1 mrg return false;
2937 1.1 mrg else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2938 1.1 mrg return false;
2939 1.1 mrg else
2940 1.1 mrg from += sizeof "OUGH" - 1;
2941 1.1 mrg while (*from == ' ' || *from == '\t')
2942 1.1 mrg from++;
2943 1.1 mrg }
2944 1.1 mrg /* Whole comment contents (regex):
2945 1.1 mrg [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2946 1.1 mrg [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2947 1.1 mrg [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2948 1.1 mrg */
2949 1.1 mrg else
2950 1.1 mrg {
2951 1.1 mrg while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2952 1.1 mrg from++;
/* F remembers the first letter of the word being matched; ALL_UPPER is
   set once a word has been seen entirely in capitals, after which the
   remaining words must be in capitals too. */
2953 1.1 mrg unsigned char f = *from;
2954 1.1 mrg bool all_upper = false;
2955 1.1 mrg if (f == 'E' || f == 'e')
2956 1.1 mrg {
2957 1.1 mrg if ((size_t) (pfile->buffer->cur - from)
2958 1.1 mrg < sizeof "else fallthru" - 1)
2959 1.1 mrg return false;
2960 1.1 mrg if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2961 1.1 mrg all_upper = true;
2962 1.1 mrg else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2963 1.1 mrg return false;
2964 1.1 mrg from += sizeof "else" - 1;
2965 1.1 mrg if (*from == ',')
2966 1.1 mrg from++;
2967 1.1 mrg if (*from != ' ')
2968 1.1 mrg return false;
2969 1.1 mrg from++;
/* Reject mixed capitalisation between the leading word and "fall". */
2970 1.1 mrg if (all_upper && *from == 'f')
2971 1.1 mrg return false;
2972 1.1 mrg if (f == 'e' && *from == 'F')
2973 1.1 mrg return false;
2974 1.1 mrg f = *from;
2975 1.1 mrg }
2976 1.1 mrg else if (f == 'I' || f == 'i')
2977 1.1 mrg {
2978 1.1 mrg if ((size_t) (pfile->buffer->cur - from)
2979 1.1 mrg < sizeof "intentional fallthru" - 1)
2980 1.1 mrg return false;
2981 1.1 mrg if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2982 1.1 mrg sizeof "NTENTIONAL" - 1) == 0)
2983 1.1 mrg all_upper = true;
2984 1.1 mrg else if (memcmp (from + 1, "ntentional",
2985 1.1 mrg sizeof "ntentional" - 1))
2986 1.1 mrg return false;
2987 1.1 mrg from += sizeof "intentional" - 1;
2988 1.1 mrg if (*from == ' ')
2989 1.1 mrg {
2990 1.1 mrg from++;
2991 1.1 mrg if (all_upper && *from == 'f')
2992 1.1 mrg return false;
2993 1.1 mrg }
2994 1.1 mrg else if (all_upper)
2995 1.1 mrg {
/* "LY F" is compared in one go, but only "LY " is consumed, leaving
   FROM on the 'F'. */
2996 1.1 mrg if (memcmp (from, "LY F", sizeof "LY F" - 1))
2997 1.1 mrg return false;
2998 1.1 mrg from += sizeof "LY " - 1;
2999 1.1 mrg }
3000 1.1 mrg else
3001 1.1 mrg {
3002 1.1 mrg if (memcmp (from, "ly ", sizeof "ly " - 1))
3003 1.1 mrg return false;
3004 1.1 mrg from += sizeof "ly " - 1;
3005 1.1 mrg }
3006 1.1 mrg if (f == 'i' && *from == 'F')
3007 1.1 mrg return false;
3008 1.1 mrg f = *from;
3009 1.1 mrg }
/* FROM now points at the start of the "fall..." word itself. */
3010 1.1 mrg if (f != 'F' && f != 'f')
3011 1.1 mrg return false;
3012 1.1 mrg if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3013 1.1 mrg return false;
3014 1.1 mrg if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3015 1.1 mrg all_upper = true;
3016 1.1 mrg else if (all_upper)
3017 1.1 mrg return false;
3018 1.1 mrg else if (memcmp (from + 1, "all", sizeof "all" - 1))
3019 1.1 mrg return false;
3020 1.1 mrg from += sizeof "fall" - 1;
/* Optional separator between "fall" and "thr...": "s ", " " or "-". */
3021 1.1 mrg if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3022 1.1 mrg from += 2;
3023 1.1 mrg else if (*from == ' ' || *from == '-')
3024 1.1 mrg from++;
3025 1.1 mrg else if (*from != (all_upper ? 'T' : 't'))
3026 1.1 mrg return false;
3027 1.1 mrg if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3028 1.1 mrg return false;
3029 1.1 mrg if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3030 1.1 mrg return false;
/* Try "thru" first and fall back to "through" if that does not match. */
3031 1.1 mrg if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3032 1.1 mrg {
3033 1.1 mrg if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3034 1.1 mrg return false;
3035 1.1 mrg if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3036 1.1 mrg sizeof "hrough" - 1))
3037 1.1 mrg return false;
3038 1.1 mrg from += sizeof "through" - 1;
3039 1.1 mrg }
3040 1.1 mrg else
3041 1.1 mrg from += sizeof "thru" - 1;
3042 1.1 mrg while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3043 1.1 mrg from++;
/* A '-' introduces free-form trailing text, which is skipped up to the
   end of the line or, in a block comment, up to the closing delimiter. */
3044 1.1 mrg if (*from == '-')
3045 1.1 mrg {
3046 1.1 mrg from++;
3047 1.1 mrg if (*comment_start == '*')
3048 1.1 mrg {
3049 1.1 mrg do
3050 1.1 mrg {
3051 1.1 mrg while (*from && *from != '*'
3052 1.1 mrg && *from != '\n' && *from != '\r')
3053 1.1 mrg from++;
/* Stop on NUL or a line end, or on a '*' that is followed by '/';
   any other '*' is part of the text and is stepped over. */
3054 1.1 mrg if (*from != '*' || from[1] == '/')
3055 1.1 mrg break;
3056 1.1 mrg from++;
3057 1.1 mrg }
3058 1.1 mrg while (1);
3059 1.1 mrg }
3060 1.1 mrg else
3061 1.1 mrg while (*from && *from != '\n' && *from != '\r')
3062 1.1 mrg from++;
3063 1.1 mrg }
3064 1.1 mrg }
/* Whichever pattern matched, the comment must end right at FROM. */
3065 1.1 mrg /* C block comment. */
3066 1.1 mrg if (*comment_start == '*')
3067 1.1 mrg {
3068 1.1 mrg if (*from != '*' || from[1] != '/')
3069 1.1 mrg return false;
3070 1.1 mrg }
3071 1.1 mrg /* C++ line comment. */
3072 1.1 mrg else if (*from != '\n')
3073 1.1 mrg return false;
3074 1.1 mrg
3075 1.1 mrg return true;
3076 1.1 mrg }
3077 1.1 mrg
3078 1.1 mrg /* Allocate COUNT tokens for RUN. */
3079 1.1 mrg void
3080 1.1 mrg _cpp_init_tokenrun (tokenrun *run, unsigned int count)
3081 1.1 mrg {
3082 1.1 mrg run->base = XNEWVEC (cpp_token, count);
3083 1.1 mrg run->limit = run->base + count;
3084 1.1 mrg run->next = NULL;
3085 1.1 mrg }
3086 1.1 mrg
3087 1.1 mrg /* Returns the next tokenrun, or creates one if there is none. */
3088 1.1 mrg static tokenrun *
3089 1.1 mrg next_tokenrun (tokenrun *run)
3090 1.1 mrg {
3091 1.1 mrg if (run->next == NULL)
3092 1.1 mrg {
3093 1.1 mrg run->next = XNEW (tokenrun);
3094 1.1 mrg run->next->prev = run;
3095 1.1 mrg _cpp_init_tokenrun (run->next, 250);
3096 1.1 mrg }
3097 1.1 mrg
3098 1.1 mrg return run->next;
3099 1.1 mrg }
3100 1.1 mrg
3101 1.1 mrg /* Return the number of not yet processed token in a given
3102 1.1 mrg context. */
3103 1.1 mrg int
3104 1.1 mrg _cpp_remaining_tokens_num_in_context (cpp_context *context)
3105 1.1 mrg {
3106 1.1 mrg if (context->tokens_kind == TOKENS_KIND_DIRECT)
3107 1.1 mrg return (LAST (context).token - FIRST (context).token);
3108 1.1 mrg else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3109 1.1 mrg || context->tokens_kind == TOKENS_KIND_EXTENDED)
3110 1.1 mrg return (LAST (context).ptoken - FIRST (context).ptoken);
3111 1.1 mrg else
3112 1.1 mrg abort ();
3113 1.1 mrg }
3114 1.1 mrg
3115 1.1 mrg /* Returns the token present at index INDEX in a given context. If
3116 1.1 mrg INDEX is zero, the next token to be processed is returned. */
3117 1.1 mrg static const cpp_token*
3118 1.1 mrg _cpp_token_from_context_at (cpp_context *context, int index)
3119 1.1 mrg {
3120 1.1 mrg if (context->tokens_kind == TOKENS_KIND_DIRECT)
3121 1.1 mrg return &(FIRST (context).token[index]);
3122 1.1 mrg else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3123 1.1 mrg || context->tokens_kind == TOKENS_KIND_EXTENDED)
3124 1.1 mrg return FIRST (context).ptoken[index];
3125 1.1 mrg else
3126 1.1 mrg abort ();
3127 1.1 mrg }
3128 1.1 mrg
3129 1.1 mrg /* Look ahead in the input stream. */
3130 1.1 mrg const cpp_token *
3131 1.1 mrg cpp_peek_token (cpp_reader *pfile, int index)
3132 1.1 mrg {
3133 1.1 mrg cpp_context *context = pfile->context;
3134 1.1 mrg const cpp_token *peektok;
3135 1.1 mrg int count;
3136 1.1 mrg
3137 1.1 mrg /* First, scan through any pending cpp_context objects. */
3138 1.1 mrg while (context->prev)
3139 1.1 mrg {
3140 1.1 mrg ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3141 1.1 mrg
3142 1.1 mrg if (index < (int) sz)
3143 1.1 mrg return _cpp_token_from_context_at (context, index);
3144 1.1 mrg index -= (int) sz;
3145 1.1 mrg context = context->prev;
3146 1.1 mrg }
3147 1.1 mrg
3148 1.1 mrg /* We will have to read some new tokens after all (and do so
3149 1.1 mrg without invalidating preceding tokens). */
3150 1.1 mrg count = index;
3151 1.1 mrg pfile->keep_tokens++;
3152 1.1 mrg
3153 1.1 mrg /* For peeked tokens temporarily disable line_change reporting,
3154 1.1 mrg until the tokens are parsed for real. */
3155 1.1 mrg void (*line_change) (cpp_reader *, const cpp_token *, int)
3156 1.1 mrg = pfile->cb.line_change;
3157 1.1 mrg pfile->cb.line_change = NULL;
3158 1.1 mrg
3159 1.1 mrg do
3160 1.1 mrg {
3161 1.1 mrg peektok = _cpp_lex_token (pfile);
3162 1.1 mrg if (peektok->type == CPP_EOF)
3163 1.1 mrg {
3164 1.1 mrg index--;
3165 1.1 mrg break;
3166 1.1 mrg }
3167 1.1 mrg else if (peektok->type == CPP_PRAGMA)
3168 1.1 mrg {
3169 1.1 mrg /* Don't peek past a pragma. */
3170 1.1 mrg if (peektok == &pfile->directive_result)
3171 1.1 mrg /* Save the pragma in the buffer. */
3172 1.1 mrg *pfile->cur_token++ = *peektok;
3173 1.1 mrg index--;
3174 1.1 mrg break;
3175 1.1 mrg }
3176 1.1 mrg }
3177 1.1 mrg while (index--);
3178 1.1 mrg
3179 1.1 mrg _cpp_backup_tokens_direct (pfile, count - index);
3180 1.1 mrg pfile->keep_tokens--;
3181 1.1 mrg pfile->cb.line_change = line_change;
3182 1.1 mrg
3183 1.1 mrg return peektok;
3184 1.1 mrg }
3185 1.1 mrg
3186 1.1 mrg /* Allocate a single token that is invalidated at the same time as the
3187 1.1 mrg rest of the tokens on the line. Has its line and col set to the
3188 1.1 mrg same as the last lexed token, so that diagnostics appear in the
3189 1.1 mrg right place. Only the src_loc field of the returned token is initialized here. */
3190 1.1 mrg cpp_token *
3191 1.1 mrg _cpp_temp_token (cpp_reader *pfile)
3192 1.1 mrg {
3193 1.1 mrg cpp_token *old, *result;
/* SZ is the number of slots from cur_token to the end of the current
   run; LA is the number of lookahead tokens currently pending. */
3194 1.1 mrg ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3195 1.1 mrg ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3196 1.1 mrg
/* The token just before cur_token supplies the source location. */
3197 1.1 mrg old = pfile->cur_token - 1;
3198 1.1 mrg /* Any pre-existing lookaheads must not be clobbered. */
3199 1.1 mrg if (la)
3200 1.1 mrg {
/* Everything from cur_token on is shifted up by one slot.  When the
   lookaheads reach (or pass) the end of this run, the shift spills
   into the next run. */
3201 1.1 mrg if (sz <= la)
3202 1.1 mrg {
3203 1.1 mrg tokenrun *next = next_tokenrun (pfile->cur_run);
3204 1.1 mrg
/* Lookaheads already stored in the next run move up first ... */
3205 1.1 mrg if (sz < la)
3206 1.1 mrg memmove (next->base + 1, next->base,
3207 1.1 mrg (la - sz) * sizeof (cpp_token));
3208 1.1 mrg
/* ... then the last token of this run becomes the first of the next. */
3209 1.1 mrg next->base[0] = pfile->cur_run->limit[-1];
3210 1.1 mrg }
3211 1.1 mrg
/* Finally shift the lookaheads that stay within the current run. */
3212 1.1 mrg if (sz > 1)
3213 1.1 mrg memmove (pfile->cur_token + 1, pfile->cur_token,
3214 1.1 mrg MIN (la, sz - 1) * sizeof (cpp_token));
3215 1.1 mrg }
3216 1.1 mrg
/* No room left in this run: continue at the start of the next one. */
3217 1.1 mrg if (!sz && pfile->cur_token == pfile->cur_run->limit)
3218 1.1 mrg {
3219 1.1 mrg pfile->cur_run = next_tokenrun (pfile->cur_run);
3220 1.1 mrg pfile->cur_token = pfile->cur_run->base;
3221 1.1 mrg }
3222 1.1 mrg
3223 1.1 mrg result = pfile->cur_token++;
3224 1.1 mrg result->src_loc = old->src_loc;
3225 1.1 mrg return result;
3226 1.1 mrg }
3227 1.1 mrg
3228 1.1 mrg /* We're at the beginning of a logical line (so not in
3229 1.1 mrg directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3230 1.1 mrg if we should enter deferred_pragma mode to tokenize the rest of the
3231 1.1 mrg line as a module control-line. */
3232 1.1 mrg
3233 1.1 mrg static void
3234 1.1 mrg cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3235 1.1 mrg {
3236 1.1 mrg unsigned backup = 0; /* Tokens we peeked. */
3237 1.1 mrg cpp_hashnode *node = result->val.node.node;
3238 1.1 mrg cpp_token *peek = result;
3239 1.1 mrg cpp_token *keyword = peek;
3240 1.1 mrg cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3241 1.1 mrg int header_count = 0;
3242 1.1 mrg
3243 1.1 mrg /* Make sure the incoming state is as we expect it. This way we
3244 1.1 mrg can restore it using constants. */
3245 1.1 mrg gcc_checking_assert (!pfile->state.in_deferred_pragma
3246 1.1 mrg && !pfile->state.skipping
3247 1.1 mrg && !pfile->state.parsing_args
3248 1.1 mrg && !pfile->state.angled_headers
3249 1.1 mrg && (pfile->state.save_comments
3250 1.1 mrg == !CPP_OPTION (pfile, discard_comments)));
3251 1.1 mrg
3252 1.1 mrg /* Enter directives mode sufficiently for peeking. We don't have
3253 1.1 mrg to actually set in_directive. */
3254 1.1 mrg pfile->state.in_deferred_pragma = true;
3255 1.1 mrg
3256 1.1 mrg /* These two fields are needed to process tokenization in deferred
3257 1.1 mrg pragma mode. They are not used outside deferred pragma mode or
3258 1.1 mrg directives mode. */
3259 1.1 mrg pfile->state.pragma_allow_expansion = true;
3260 1.1 mrg pfile->directive_line = result->src_loc;
3261 1.1 mrg
3262 1.1 mrg /* Saving comments is incompatible with directives mode. */
3263 1.1 mrg pfile->state.save_comments = 0;
3264 1.1 mrg
3265 1.1 mrg if (node == n_modules[spec_nodes::M_EXPORT][0])
3266 1.1 mrg {
3267 1.1 mrg peek = _cpp_lex_direct (pfile);
3268 1.1 mrg keyword = peek;
3269 1.1 mrg backup++;
3270 1.1 mrg if (keyword->type != CPP_NAME)
3271 1.1 mrg goto not_module;
3272 1.1 mrg node = keyword->val.node.node;
3273 1.1 mrg if (!(node->flags & NODE_MODULE))
3274 1.1 mrg goto not_module;
3275 1.1 mrg }
3276 1.1 mrg
3277 1.1 mrg if (node == n_modules[spec_nodes::M__IMPORT][0])
3278 1.1 mrg /* __import */
3279 1.1 mrg header_count = backup + 2 + 16;
3280 1.1 mrg else if (node == n_modules[spec_nodes::M_IMPORT][0])
3281 1.1 mrg /* import */
3282 1.1 mrg header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3283 1.1 mrg else if (node == n_modules[spec_nodes::M_MODULE][0])
3284 1.1 mrg ; /* module */
3285 1.1 mrg else
3286 1.1 mrg goto not_module;
3287 1.1 mrg
3288 1.1 mrg /* We've seen [export] {module|import|__import}. Check the next token. */
3289 1.1 mrg if (header_count)
3290 1.1 mrg /* After '{,__}import' a header name may appear. */
3291 1.1 mrg pfile->state.angled_headers = true;
3292 1.1 mrg peek = _cpp_lex_direct (pfile);
3293 1.1 mrg backup++;
3294 1.1 mrg
3295 1.1 mrg /* ... import followed by identifier, ':', '<' or
3296 1.1 mrg header-name preprocessing tokens, or module
3297 1.1 mrg followed by cpp-identifier, ':' or ';' preprocessing
3298 1.1 mrg tokens. C++ keywords are not yet relevant. */
3299 1.1 mrg if (peek->type == CPP_NAME
3300 1.1 mrg || peek->type == CPP_COLON
3301 1.1 mrg || (header_count
3302 1.1 mrg ? (peek->type == CPP_LESS
3303 1.1 mrg || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3304 1.1 mrg || peek->type == CPP_HEADER_NAME)
3305 1.1 mrg : peek->type == CPP_SEMICOLON))
3306 1.1 mrg {
3307 1.1 mrg pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3308 1.1 mrg if (!pfile->state.pragma_allow_expansion)
3309 1.1 mrg pfile->state.prevent_expansion++;
3310 1.1 mrg
3311 1.1 mrg if (!header_count && linemap_included_from
3312 1.1 mrg (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3313 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3314 1.1 mrg "module control-line cannot be in included file");
3315 1.1 mrg
3316 1.1 mrg /* The first one or two tokens cannot be macro names. */
3317 1.1 mrg for (int ix = backup; ix--;)
3318 1.1 mrg {
3319 1.1 mrg cpp_token *tok = ix ? keyword : result;
3320 1.1 mrg cpp_hashnode *node = tok->val.node.node;
3321 1.1 mrg
3322 1.1 mrg /* Don't attempt to expand the token. */
3323 1.1 mrg tok->flags |= NO_EXPAND;
3324 1.1 mrg if (_cpp_defined_macro_p (node)
3325 1.1 mrg && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
3326 1.1 mrg && !cpp_fun_like_macro_p (node))
3327 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3328 1.1 mrg "module control-line \"%s\" cannot be"
3329 1.1 mrg " an object-like macro",
3330 1.1 mrg NODE_NAME (node));
3331 1.1 mrg }
3332 1.1 mrg
3333 1.1 mrg /* Map to underbar variants. */
3334 1.1 mrg keyword->val.node.node = n_modules[header_count
3335 1.1 mrg ? spec_nodes::M_IMPORT
3336 1.1 mrg : spec_nodes::M_MODULE][1];
3337 1.1 mrg if (backup != 1)
3338 1.1 mrg result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3339 1.1 mrg
3340 1.1 mrg /* Maybe tell the tokenizer we expect a header-name down the
3341 1.1 mrg road. */
3342 1.1 mrg pfile->state.directive_file_token = header_count;
3343 1.1 mrg }
3344 1.1 mrg else
3345 1.1 mrg {
3346 1.1 mrg not_module:
3347 1.1 mrg /* Drop out of directive mode. */
3348 1.1 mrg /* We aaserted save_comments had this value upon entry. */
3349 1.1 mrg pfile->state.save_comments
3350 1.1 mrg = !CPP_OPTION (pfile, discard_comments);
3351 1.1 mrg pfile->state.in_deferred_pragma = false;
3352 1.1 mrg /* Do not let this remain on. */
3353 1.1 mrg pfile->state.angled_headers = false;
3354 1.1 mrg }
3355 1.1 mrg
3356 1.1 mrg /* In either case we want to backup the peeked tokens. */
3357 1.1 mrg if (backup)
3358 1.1 mrg {
3359 1.1 mrg /* If we saw EOL, we should drop it, because this isn't a module
3360 1.1 mrg control-line after all. */
3361 1.1 mrg bool eol = peek->type == CPP_PRAGMA_EOL;
3362 1.1 mrg if (!eol || backup > 1)
3363 1.1 mrg {
3364 1.1 mrg /* Put the peeked tokens back. */
3365 1.1 mrg _cpp_backup_tokens_direct (pfile, backup);
3366 1.1 mrg /* But if the last one was an EOL, forget it. */
3367 1.1 mrg if (eol)
3368 1.1 mrg pfile->lookaheads--;
3369 1.1 mrg }
3370 1.1 mrg }
3371 1.1 mrg }
3372 1.1 mrg
3373 1.1 mrg /* Lex a token into RESULT (external interface). Takes care of issues
3374 1.1 mrg like directive handling, token lookahead, multiple include
3375 1.1 mrg optimization and skipping. */
3376 1.1 mrg const cpp_token *
3377 1.1 mrg _cpp_lex_token (cpp_reader *pfile)
3378 1.1 mrg {
3379 1.1 mrg cpp_token *result;
3380 1.1 mrg
3381 1.1 mrg for (;;)
3382 1.1 mrg {
3383 1.1 mrg if (pfile->cur_token == pfile->cur_run->limit)
3384 1.1 mrg {
3385 1.1 mrg pfile->cur_run = next_tokenrun (pfile->cur_run);
3386 1.1 mrg pfile->cur_token = pfile->cur_run->base;
3387 1.1 mrg }
3388 1.1 mrg /* We assume that the current token is somewhere in the current
3389 1.1 mrg run. */
3390 1.1 mrg if (pfile->cur_token < pfile->cur_run->base
3391 1.1 mrg || pfile->cur_token >= pfile->cur_run->limit)
3392 1.1 mrg abort ();
3393 1.1 mrg
3394 1.1 mrg if (pfile->lookaheads)
3395 1.1 mrg {
3396 1.1 mrg pfile->lookaheads--;
3397 1.1 mrg result = pfile->cur_token++;
3398 1.1 mrg }
3399 1.1 mrg else
3400 1.1 mrg result = _cpp_lex_direct (pfile);
3401 1.1 mrg
3402 1.1 mrg if (result->flags & BOL)
3403 1.1 mrg {
3404 1.1 mrg /* Is this a directive. If _cpp_handle_directive returns
3405 1.1 mrg false, it is an assembler #. */
3406 1.1 mrg if (result->type == CPP_HASH
3407 1.1 mrg /* 6.10.3 p 11: Directives in a list of macro arguments
3408 1.1 mrg gives undefined behavior. This implementation
3409 1.1 mrg handles the directive as normal. */
3410 1.1 mrg && pfile->state.parsing_args != 1)
3411 1.1 mrg {
3412 1.1 mrg if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3413 1.1 mrg {
3414 1.1 mrg if (pfile->directive_result.type == CPP_PADDING)
3415 1.1 mrg continue;
3416 1.1 mrg result = &pfile->directive_result;
3417 1.1 mrg }
3418 1.1 mrg }
3419 1.1 mrg else if (pfile->state.in_deferred_pragma)
3420 1.1 mrg result = &pfile->directive_result;
3421 1.1 mrg else if (result->type == CPP_NAME
3422 1.1 mrg && (result->val.node.node->flags & NODE_MODULE)
3423 1.1 mrg && !pfile->state.skipping
3424 1.1 mrg /* Unlike regular directives, we do not deal with
3425 1.1 mrg tokenizing module directives as macro arguments.
3426 1.1 mrg That's not permitted. */
3427 1.1 mrg && !pfile->state.parsing_args)
3428 1.1 mrg {
3429 1.1 mrg /* P1857. Before macro expansion, At start of logical
3430 1.1 mrg line ... */
3431 1.1 mrg /* We don't have to consider lookaheads at this point. */
3432 1.1 mrg gcc_checking_assert (!pfile->lookaheads);
3433 1.1 mrg
3434 1.1 mrg cpp_maybe_module_directive (pfile, result);
3435 1.1 mrg }
3436 1.1 mrg
3437 1.1 mrg if (pfile->cb.line_change && !pfile->state.skipping)
3438 1.1 mrg pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3439 1.1 mrg }
3440 1.1 mrg
3441 1.1 mrg /* We don't skip tokens in directives. */
3442 1.1 mrg if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3443 1.1 mrg break;
3444 1.1 mrg
3445 1.1 mrg /* Outside a directive, invalidate controlling macros. At file
3446 1.1 mrg EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3447 1.1 mrg get here and MI optimization works. */
3448 1.1 mrg pfile->mi_valid = false;
3449 1.1 mrg
3450 1.1 mrg if (!pfile->state.skipping || result->type == CPP_EOF)
3451 1.1 mrg break;
3452 1.1 mrg }
3453 1.1 mrg
3454 1.1 mrg return result;
3455 1.1 mrg }
3456 1.1 mrg
3457 1.1 mrg /* Returns true if a fresh line has been loaded. */
3458 1.1 mrg bool
3459 1.1 mrg _cpp_get_fresh_line (cpp_reader *pfile)
3460 1.1 mrg {
3461 1.1 mrg /* We can't get a new line until we leave the current directive. */
3462 1.1 mrg if (pfile->state.in_directive)
3463 1.1 mrg return false;
3464 1.1 mrg
3465 1.1 mrg for (;;)
3466 1.1 mrg {
3467 1.1 mrg cpp_buffer *buffer = pfile->buffer;
3468 1.1 mrg
3469 1.1 mrg if (!buffer->need_line)
3470 1.1 mrg return true;
3471 1.1 mrg
3472 1.1 mrg if (buffer->next_line < buffer->rlimit)
3473 1.1 mrg {
3474 1.1 mrg _cpp_clean_line (pfile);
3475 1.1 mrg return true;
3476 1.1 mrg }
3477 1.1 mrg
3478 1.1 mrg /* First, get out of parsing arguments state. */
3479 1.1 mrg if (pfile->state.parsing_args)
3480 1.1 mrg return false;
3481 1.1 mrg
3482 1.1 mrg /* End of buffer. Non-empty files should end in a newline. */
3483 1.1 mrg if (buffer->buf != buffer->rlimit
3484 1.1 mrg && buffer->next_line > buffer->rlimit
3485 1.1 mrg && !buffer->from_stage3)
3486 1.1 mrg {
3487 1.1 mrg /* Clip to buffer size. */
3488 1.1 mrg buffer->next_line = buffer->rlimit;
3489 1.1 mrg }
3490 1.1 mrg
3491 1.1 mrg if (buffer->prev && !buffer->return_at_eof)
3492 1.1 mrg _cpp_pop_buffer (pfile);
3493 1.1 mrg else
3494 1.1 mrg {
3495 1.1 mrg /* End of translation. Do not pop the buffer yet. Increment
3496 1.1 mrg line number so that the EOF token is on a line of its own
3497 1.1 mrg (_cpp_lex_direct doesn't increment in that case, because
3498 1.1 mrg it's hard for it to distinguish this special case). */
3499 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
3500 1.1 mrg return false;
3501 1.1 mrg }
3502 1.1 mrg }
3503 1.1 mrg }
3504 1.1 mrg
/* Helper for two-character operators.  Sets result->type to ELSE_TYPE;
   then, if the next unread character (*buffer->cur) equals CHAR,
   consumes it and sets result->type to THEN_TYPE instead.  Uses the
   variables `result' and `buffer' that are in scope at the point of
   use.  Every macro argument is parenthesized so that an expression
   argument (for instance a conditional expression) keeps its intended
   grouping.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do                                            \
    {                                           \
      result->type = (ELSE_TYPE);               \
      if (*buffer->cur == (CHAR))               \
        {                                       \
          buffer->cur++;                        \
          result->type = (THEN_TYPE);           \
        }                                       \
    }                                           \
  while (0)
3513 1.1 mrg
3514 1.1 mrg /* Lex a token into pfile->cur_token, which is also incremented, to
3515 1.1 mrg get diagnostics pointing to the correct location.
3516 1.1 mrg
3517 1.1 mrg Does not handle issues such as token lookahead, multiple-include
3518 1.1 mrg optimization, directives, skipping etc. This function is only
3519 1.1 mrg suitable for use by _cpp_lex_token, and in special cases like
3520 1.1 mrg lex_expansion_token which doesn't care for any of these issues.
3521 1.1 mrg
3522 1.1 mrg When meeting a newline, returns CPP_EOF if parsing a directive,
3523 1.1 mrg otherwise returns to the start of the token buffer if permissible.
3524 1.1 mrg Returns a pointer to the lexed token. */
3525 1.1 mrg cpp_token *
3526 1.1 mrg _cpp_lex_direct (cpp_reader *pfile)
3527 1.1 mrg {
3528 1.1 mrg cppchar_t c;
3529 1.1 mrg cpp_buffer *buffer;
3530 1.1 mrg const unsigned char *comment_start;
3531 1.1 mrg bool fallthrough_comment = false;
3532 1.1 mrg cpp_token *result = pfile->cur_token++;
3533 1.1 mrg
/* Entry point, and re-entered from the '\n' case below whenever the
   newline does not terminate a deferred pragma. */
3534 1.1 mrg fresh_line:
3535 1.1 mrg result->flags = 0;
3536 1.1 mrg buffer = pfile->buffer;
3537 1.1 mrg if (buffer->need_line)
3538 1.1 mrg {
3539 1.1 mrg if (pfile->state.in_deferred_pragma)
3540 1.1 mrg {
3541 1.1 mrg /* This can happen in cases like:
3542 1.1 mrg #define loop(x) whatever
3543 1.1 mrg #pragma omp loop
3544 1.1 mrg where when trying to expand loop we need to peek
3545 1.1 mrg next token after loop, but aren't still in_deferred_pragma
3546 1.1 mrg mode but are in in_directive mode, so buffer->need_line
3547 1.1 mrg is set, a CPP_EOF is peeked. */
3548 1.1 mrg result->type = CPP_PRAGMA_EOL;
3549 1.1 mrg pfile->state.in_deferred_pragma = false;
3550 1.1 mrg if (!pfile->state.pragma_allow_expansion)
3551 1.1 mrg pfile->state.prevent_expansion--;
3552 1.1 mrg return result;
3553 1.1 mrg }
3554 1.1 mrg if (!_cpp_get_fresh_line (pfile))
3555 1.1 mrg {
3556 1.1 mrg result->type = CPP_EOF;
3557 1.1 mrg /* Not a real EOF in a directive or arg parsing -- we refuse
3558 1.1 mrg to advance to the next file now, and will once we're out
3559 1.1 mrg of those modes. */
3560 1.1 mrg if (!pfile->state.in_directive && !pfile->state.parsing_args)
3561 1.1 mrg {
3562 1.1 mrg /* Tell the compiler the line number of the EOF token. */
3563 1.1 mrg result->src_loc = pfile->line_table->highest_line;
3564 1.1 mrg result->flags = BOL;
3565 1.1 mrg /* Now pop the buffer that _cpp_get_fresh_line did not. */
3566 1.1 mrg _cpp_pop_buffer (pfile);
3567 1.1 mrg }
3568 1.1 mrg return result;
3569 1.1 mrg }
/* A pending fallthrough comment does not carry over when
   _cpp_get_fresh_line switched to a different buffer. */
3570 1.1 mrg if (buffer != pfile->buffer)
3571 1.1 mrg fallthrough_comment = false;
/* Unless tokens are being kept, restart at the base of the first
   token run for the new line. */
3572 1.1 mrg if (!pfile->keep_tokens)
3573 1.1 mrg {
3574 1.1 mrg pfile->cur_run = &pfile->base_run;
3575 1.1 mrg result = pfile->base_run.base;
3576 1.1 mrg pfile->cur_token = result + 1;
3577 1.1 mrg }
3578 1.1 mrg result->flags = BOL;
3579 1.1 mrg if (pfile->state.parsing_args == 2)
3580 1.1 mrg result->flags |= PREV_WHITE;
3581 1.1 mrg }
3582 1.1 mrg buffer = pfile->buffer;
/* Re-entered after a comment that is not saved as a token. */
3583 1.1 mrg update_tokens_line:
3584 1.1 mrg result->src_loc = pfile->line_table->highest_line;
3585 1.1 mrg
/* Re-entered after the whitespace case has called skip_whitespace. */
3586 1.1 mrg skipped_white:
3587 1.1 mrg if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3588 1.1 mrg && !pfile->overlaid_buffer)
3589 1.1 mrg {
3590 1.1 mrg _cpp_process_line_notes (pfile, false);
3591 1.1 mrg result->src_loc = pfile->line_table->highest_line;
3592 1.1 mrg }
3593 1.1 mrg c = *buffer->cur++;
3594 1.1 mrg
3595 1.1 mrg if (pfile->forced_token_location)
3596 1.1 mrg result->src_loc = pfile->forced_token_location;
3597 1.1 mrg else
3598 1.1 mrg result->src_loc = linemap_position_for_column (pfile->line_table,
3599 1.1 mrg CPP_BUF_COLUMN (buffer, buffer->cur));
3600 1.1 mrg
3601 1.1 mrg switch (c)
3602 1.1 mrg {
3603 1.1 mrg case ' ': case '\t': case '\f': case '\v': case '\0':
3604 1.1 mrg result->flags |= PREV_WHITE;
3605 1.1 mrg skip_whitespace (pfile, c);
3606 1.1 mrg goto skipped_white;
3607 1.1 mrg
3608 1.1 mrg case '\n':
3609 1.1 mrg /* Increment the line, unless this is the last line ... */
3610 1.1 mrg if (buffer->cur < buffer->rlimit
3611 1.1 mrg /* ... or this is a #include, (where _cpp_stack_file needs to
3612 1.1 mrg unwind by one line) ... */
3613 1.1 mrg || (pfile->state.in_directive > 1
3614 1.1 mrg /* ... except traditional-cpp increments this elsewhere. */
3615 1.1 mrg && !CPP_OPTION (pfile, traditional)))
3616 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
3617 1.1 mrg buffer->need_line = true;
3618 1.1 mrg if (pfile->state.in_deferred_pragma)
3619 1.1 mrg {
3620 1.1 mrg /* Produce the PRAGMA_EOL on this line. File reading
3621 1.1 mrg ensures there is always a \n at end of the buffer, thus
3622 1.1 mrg in a deferred pragma we always see CPP_PRAGMA_EOL before
3623 1.1 mrg any CPP_EOF. */
3624 1.1 mrg result->type = CPP_PRAGMA_EOL;
3625 1.1 mrg result->flags &= ~PREV_WHITE;
3626 1.1 mrg pfile->state.in_deferred_pragma = false;
3627 1.1 mrg if (!pfile->state.pragma_allow_expansion)
3628 1.1 mrg pfile->state.prevent_expansion--;
3629 1.1 mrg return result;
3630 1.1 mrg }
3631 1.1 mrg goto fresh_line;
3632 1.1 mrg
3633 1.1 mrg case '0': case '1': case '2': case '3': case '4':
3634 1.1 mrg case '5': case '6': case '7': case '8': case '9':
3635 1.1 mrg {
3636 1.1 mrg struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3637 1.1 mrg result->type = CPP_NUMBER;
3638 1.1 mrg lex_number (pfile, &result->val.str, &nst);
3639 1.1 mrg warn_about_normalization (pfile, result, &nst);
3640 1.1 mrg break;
3641 1.1 mrg }
3642 1.1 mrg
3643 1.1 mrg case 'L':
3644 1.1 mrg case 'u':
3645 1.1 mrg case 'U':
3646 1.1 mrg case 'R':
3647 1.1 mrg /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3648 1.1 mrg wide strings or raw strings. */
3649 1.1 mrg if (c == 'L' || CPP_OPTION (pfile, rliterals)
3650 1.1 mrg || (c != 'R' && CPP_OPTION (pfile, uliterals)))
3651 1.1 mrg {
3652 1.1 mrg if ((*buffer->cur == '\'' && c != 'R')
3653 1.1 mrg || *buffer->cur == '"'
3654 1.1 mrg || (*buffer->cur == 'R'
3655 1.1 mrg && c != 'R'
3656 1.1 mrg && buffer->cur[1] == '"'
3657 1.1 mrg && CPP_OPTION (pfile, rliterals))
3658 1.1 mrg || (*buffer->cur == '8'
3659 1.1 mrg && c == 'u'
3660 1.1 mrg && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3661 1.1 mrg && CPP_OPTION (pfile, utf8_char_literals)))
3662 1.1 mrg || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3663 1.1 mrg && CPP_OPTION (pfile, rliterals)))))
3664 1.1 mrg {
3665 1.1 mrg lex_string (pfile, result, buffer->cur - 1);
3666 1.1 mrg break;
3667 1.1 mrg }
3668 1.1 mrg }
3669 1.1 mrg /* Fall through. */
3670 1.1 mrg
/* Identifier start characters. 'L', 'u', 'U' and 'R' are missing from
   the lists below because the preceding case handles them and falls
   through to here when no string or character literal follows. */
3671 1.1 mrg case '_':
3672 1.1 mrg case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3673 1.1 mrg case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3674 1.1 mrg case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
3675 1.1 mrg case 's': case 't': case 'v': case 'w': case 'x':
3676 1.1 mrg case 'y': case 'z':
3677 1.1 mrg case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
3678 1.1 mrg case 'G': case 'H': case 'I': case 'J': case 'K':
3679 1.1 mrg case 'M': case 'N': case 'O': case 'P': case 'Q':
3680 1.1 mrg case 'S': case 'T': case 'V': case 'W': case 'X':
3681 1.1 mrg case 'Y': case 'Z':
3682 1.1 mrg result->type = CPP_NAME;
3683 1.1 mrg {
3684 1.1 mrg struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3685 1.1 mrg result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
3686 1.1 mrg &nst,
3687 1.1 mrg &result->val.node.spelling);
3688 1.1 mrg warn_about_normalization (pfile, result, &nst);
3689 1.1 mrg }
3690 1.1 mrg
3691 1.1 mrg /* Convert named operators to their proper types. */
3692 1.1 mrg if (result->val.node.node->flags & NODE_OPERATOR)
3693 1.1 mrg {
3694 1.1 mrg result->flags |= NAMED_OP;
3695 1.1 mrg result->type = (enum cpp_ttype) result->val.node.node->directive_index;
3696 1.1 mrg }
3697 1.1 mrg
3698 1.1 mrg /* Signal FALLTHROUGH comment followed by another token. */
3699 1.1 mrg if (fallthrough_comment)
3700 1.1 mrg result->flags |= PREV_FALLTHROUGH;
3701 1.1 mrg break;
3702 1.1 mrg
3703 1.1 mrg case '\'':
3704 1.1 mrg case '"':
3705 1.1 mrg lex_string (pfile, result, buffer->cur - 1);
3706 1.1 mrg break;
3707 1.1 mrg
3708 1.1 mrg case '/':
3709 1.1 mrg /* A potential block or line comment. */
3710 1.1 mrg comment_start = buffer->cur;
3711 1.1 mrg c = *buffer->cur;
3712 1.1 mrg
3713 1.1 mrg if (c == '*')
3714 1.1 mrg {
3715 1.1 mrg if (_cpp_skip_block_comment (pfile))
3716 1.1 mrg cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
3717 1.1 mrg }
3718 1.1 mrg else if (c == '/' && ! CPP_OPTION (pfile, traditional))
3719 1.1 mrg {
3720 1.1 mrg /* Don't warn for system headers. */
3721 1.1 mrg if (_cpp_in_system_header (pfile))
3722 1.1 mrg ;
3723 1.1 mrg /* Warn about comments if pedantically GNUC89, and not
3724 1.1 mrg in system headers. */
3725 1.1 mrg else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3726 1.1 mrg && CPP_PEDANTIC (pfile)
3727 1.1 mrg && ! buffer->warned_cplusplus_comments)
3728 1.1 mrg {
3729 1.1 mrg if (cpp_error (pfile, CPP_DL_PEDWARN,
3730 1.1 mrg "C++ style comments are not allowed in ISO C90"))
3731 1.1 mrg cpp_error (pfile, CPP_DL_NOTE,
3732 1.1 mrg "(this will be reported only once per input file)");
3733 1.1 mrg buffer->warned_cplusplus_comments = 1;
3734 1.1 mrg }
3735 1.1 mrg /* Or if specifically desired via -Wc90-c99-compat. */
3736 1.1 mrg else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
3737 1.1 mrg && ! CPP_OPTION (pfile, cplusplus)
3738 1.1 mrg && ! buffer->warned_cplusplus_comments)
3739 1.1 mrg {
3740 1.1 mrg if (cpp_error (pfile, CPP_DL_WARNING,
3741 1.1 mrg "C++ style comments are incompatible with C90"))
3742 1.1 mrg cpp_error (pfile, CPP_DL_NOTE,
3743 1.1 mrg "(this will be reported only once per input file)");
3744 1.1 mrg buffer->warned_cplusplus_comments = 1;
3745 1.1 mrg }
3746 1.1 mrg /* In C89/C94, C++ style comments are forbidden. */
3747 1.1 mrg else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3748 1.1 mrg || CPP_OPTION (pfile, lang) == CLK_STDC94))
3749 1.1 mrg {
3750 1.1 mrg /* But don't be confused about valid code such as
3751 1.1 mrg - // immediately followed by *,
3752 1.1 mrg - // in a preprocessing directive,
3753 1.1 mrg - // in an #if 0 block. */
3754 1.1 mrg if (buffer->cur[1] == '*'
3755 1.1 mrg || pfile->state.in_directive
3756 1.1 mrg || pfile->state.skipping)
3757 1.1 mrg {
3758 1.1 mrg result->type = CPP_DIV;
3759 1.1 mrg break;
3760 1.1 mrg }
3761 1.1 mrg else if (! buffer->warned_cplusplus_comments)
3762 1.1 mrg {
3763 1.1 mrg if (cpp_error (pfile, CPP_DL_ERROR,
3764 1.1 mrg "C++ style comments are not allowed in "
3765 1.1 mrg "ISO C90"))
3766 1.1 mrg cpp_error (pfile, CPP_DL_NOTE,
3767 1.1 mrg "(this will be reported only once per input "
3768 1.1 mrg "file)");
3769 1.1 mrg buffer->warned_cplusplus_comments = 1;
3770 1.1 mrg }
3771 1.1 mrg }
3772 1.1 mrg if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
3773 1.1 mrg cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
3774 1.1 mrg }
3775 1.1 mrg else if (c == '=')
3776 1.1 mrg {
3777 1.1 mrg buffer->cur++;
3778 1.1 mrg result->type = CPP_DIV_EQ;
3779 1.1 mrg break;
3780 1.1 mrg }
3781 1.1 mrg else
3782 1.1 mrg {
3783 1.1 mrg result->type = CPP_DIV;
3784 1.1 mrg break;
3785 1.1 mrg }
3786 1.1 mrg
/* Only the two comment branches reach this point; every operator
   branch above has already left the switch with break. */
3787 1.1 mrg if (fallthrough_comment_p (pfile, comment_start))
3788 1.1 mrg fallthrough_comment = true;
3789 1.1 mrg
3790 1.1 mrg if (pfile->cb.comment)
3791 1.1 mrg {
3792 1.1 mrg size_t len = pfile->buffer->cur - comment_start;
3793 1.1 mrg pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3794 1.1 mrg len + 1);
3795 1.1 mrg }
3796 1.1 mrg
/* When comments are not saved, the comment counts as whitespace and
   lexing continues with whatever follows it. */
3797 1.1 mrg if (!pfile->state.save_comments)
3798 1.1 mrg {
3799 1.1 mrg result->flags |= PREV_WHITE;
3800 1.1 mrg goto update_tokens_line;
3801 1.1 mrg }
3802 1.1 mrg
3803 1.1 mrg if (fallthrough_comment)
3804 1.1 mrg result->flags |= PREV_FALLTHROUGH;
3805 1.1 mrg
3806 1.1 mrg /* Save the comment as a token in its own right. */
3807 1.1 mrg save_comment (pfile, result, comment_start, c);
3808 1.1 mrg break;
3809 1.1 mrg
3810 1.1 mrg case '<':
3811 1.1 mrg if (pfile->state.angled_headers)
3812 1.1 mrg {
3813 1.1 mrg lex_string (pfile, result, buffer->cur - 1);
/* If lex_string left the token typed CPP_LESS, lex '<' as an ordinary
   operator below. */
3814 1.1 mrg if (result->type != CPP_LESS)
3815 1.1 mrg break;
3816 1.1 mrg }
3817 1.1 mrg
3818 1.1 mrg result->type = CPP_LESS;
3819 1.1 mrg if (*buffer->cur == '=')
3820 1.1 mrg {
3821 1.1 mrg buffer->cur++, result->type = CPP_LESS_EQ;
3822 1.1 mrg if (*buffer->cur == '>'
3823 1.1 mrg && CPP_OPTION (pfile, cplusplus)
3824 1.1 mrg && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
3825 1.1 mrg buffer->cur++, result->type = CPP_SPACESHIP;
3826 1.1 mrg }
3827 1.1 mrg else if (*buffer->cur == '<')
3828 1.1 mrg {
3829 1.1 mrg buffer->cur++;
3830 1.1 mrg IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
3831 1.1 mrg }
3832 1.1 mrg else if (CPP_OPTION (pfile, digraphs))
3833 1.1 mrg {
3834 1.1 mrg if (*buffer->cur == ':')
3835 1.1 mrg {
3836 1.1 mrg /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3837 1.1 mrg three characters are <:: and the subsequent character
3838 1.1 mrg is neither : nor >, the < is treated as a preprocessor
3839 1.1 mrg token by itself". */
3840 1.1 mrg if (CPP_OPTION (pfile, cplusplus)
3841 1.1 mrg && CPP_OPTION (pfile, lang) != CLK_CXX98
3842 1.1 mrg && CPP_OPTION (pfile, lang) != CLK_GNUCXX
3843 1.1 mrg && buffer->cur[1] == ':'
3844 1.1 mrg && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3845 1.1 mrg break;
3846 1.1 mrg
3847 1.1 mrg buffer->cur++;
3848 1.1 mrg result->flags |= DIGRAPH;
3849 1.1 mrg result->type = CPP_OPEN_SQUARE;
3850 1.1 mrg }
3851 1.1 mrg else if (*buffer->cur == '%')
3852 1.1 mrg {
3853 1.1 mrg buffer->cur++;
3854 1.1 mrg result->flags |= DIGRAPH;
3855 1.1 mrg result->type = CPP_OPEN_BRACE;
3856 1.1 mrg }
3857 1.1 mrg }
3858 1.1 mrg break;
3859 1.1 mrg
3860 1.1 mrg case '>':
3861 1.1 mrg result->type = CPP_GREATER;
3862 1.1 mrg if (*buffer->cur == '=')
3863 1.1 mrg buffer->cur++, result->type = CPP_GREATER_EQ;
3864 1.1 mrg else if (*buffer->cur == '>')
3865 1.1 mrg {
3866 1.1 mrg buffer->cur++;
3867 1.1 mrg IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3868 1.1 mrg }
3869 1.1 mrg break;
3870 1.1 mrg
3871 1.1 mrg case '%':
3872 1.1 mrg result->type = CPP_MOD;
3873 1.1 mrg if (*buffer->cur == '=')
3874 1.1 mrg buffer->cur++, result->type = CPP_MOD_EQ;
3875 1.1 mrg else if (CPP_OPTION (pfile, digraphs))
3876 1.1 mrg {
3877 1.1 mrg if (*buffer->cur == ':')
3878 1.1 mrg {
3879 1.1 mrg buffer->cur++;
3880 1.1 mrg result->flags |= DIGRAPH;
3881 1.1 mrg result->type = CPP_HASH;
3882 1.1 mrg if (*buffer->cur == '%' && buffer->cur[1] == ':')
3883 1.1 mrg buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
3884 1.1 mrg }
3885 1.1 mrg else if (*buffer->cur == '>')
3886 1.1 mrg {
3887 1.1 mrg buffer->cur++;
3888 1.1 mrg result->flags |= DIGRAPH;
3889 1.1 mrg result->type = CPP_CLOSE_BRACE;
3890 1.1 mrg }
3891 1.1 mrg }
3892 1.1 mrg break;
3893 1.1 mrg
3894 1.1 mrg case '.':
3895 1.1 mrg result->type = CPP_DOT;
3896 1.1 mrg if (ISDIGIT (*buffer->cur))
3897 1.1 mrg {
3898 1.1 mrg struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3899 1.1 mrg result->type = CPP_NUMBER;
3900 1.1 mrg lex_number (pfile, &result->val.str, &nst);
3901 1.1 mrg warn_about_normalization (pfile, result, &nst);
3902 1.1 mrg }
3903 1.1 mrg else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3904 1.1 mrg buffer->cur += 2, result->type = CPP_ELLIPSIS;
3905 1.1 mrg else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3906 1.1 mrg buffer->cur++, result->type = CPP_DOT_STAR;
3907 1.1 mrg break;
3908 1.1 mrg
3909 1.1 mrg case '+':
3910 1.1 mrg result->type = CPP_PLUS;
3911 1.1 mrg if (*buffer->cur == '+')
3912 1.1 mrg buffer->cur++, result->type = CPP_PLUS_PLUS;
3913 1.1 mrg else if (*buffer->cur == '=')
3914 1.1 mrg buffer->cur++, result->type = CPP_PLUS_EQ;
3915 1.1 mrg break;
3916 1.1 mrg
3917 1.1 mrg case '-':
3918 1.1 mrg result->type = CPP_MINUS;
3919 1.1 mrg if (*buffer->cur == '>')
3920 1.1 mrg {
3921 1.1 mrg buffer->cur++;
3922 1.1 mrg result->type = CPP_DEREF;
3923 1.1 mrg if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3924 1.1 mrg buffer->cur++, result->type = CPP_DEREF_STAR;
3925 1.1 mrg }
3926 1.1 mrg else if (*buffer->cur == '-')
3927 1.1 mrg buffer->cur++, result->type = CPP_MINUS_MINUS;
3928 1.1 mrg else if (*buffer->cur == '=')
3929 1.1 mrg buffer->cur++, result->type = CPP_MINUS_EQ;
3930 1.1 mrg break;
3931 1.1 mrg
3932 1.1 mrg case '&':
3933 1.1 mrg result->type = CPP_AND;
3934 1.1 mrg if (*buffer->cur == '&')
3935 1.1 mrg buffer->cur++, result->type = CPP_AND_AND;
3936 1.1 mrg else if (*buffer->cur == '=')
3937 1.1 mrg buffer->cur++, result->type = CPP_AND_EQ;
3938 1.1 mrg break;
3939 1.1 mrg
3940 1.1 mrg case '|':
3941 1.1 mrg result->type = CPP_OR;
3942 1.1 mrg if (*buffer->cur == '|')
3943 1.1 mrg buffer->cur++, result->type = CPP_OR_OR;
3944 1.1 mrg else if (*buffer->cur == '=')
3945 1.1 mrg buffer->cur++, result->type = CPP_OR_EQ;
3946 1.1 mrg break;
3947 1.1 mrg
3948 1.1 mrg case ':':
3949 1.3 mrg result->type = CPP_COLON;
3950 1.3 mrg if (*buffer->cur == ':')
3951 1.3 mrg {
/* With the scope option "::" becomes one CPP_SCOPE token; otherwise
   only the first ':' is consumed and it is flagged COLON_SCOPE. */
3952 1.3 mrg if (CPP_OPTION (pfile, scope))
3953 1.3 mrg buffer->cur++, result->type = CPP_SCOPE;
3954 1.3 mrg else
3955 1.3 mrg result->flags |= COLON_SCOPE;
3956 1.1 mrg }
3957 1.1 mrg else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
3958 1.1 mrg {
3959 1.1 mrg buffer->cur++;
3960 1.1 mrg result->flags |= DIGRAPH;
3961 1.1 mrg result->type = CPP_CLOSE_SQUARE;
3962 1.1 mrg }
3963 1.1 mrg break;
3964 1.1 mrg
3965 1.1 mrg case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3966 1.1 mrg case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3967 1.1 mrg case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3968 1.1 mrg case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
3969 1.1 mrg case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
3970 1.1 mrg
3971 1.1 mrg case '?': result->type = CPP_QUERY; break;
3972 1.1 mrg case '~': result->type = CPP_COMPL; break;
3973 1.1 mrg case ',': result->type = CPP_COMMA; break;
3974 1.1 mrg case '(': result->type = CPP_OPEN_PAREN; break;
3975 1.1 mrg case ')': result->type = CPP_CLOSE_PAREN; break;
3976 1.1 mrg case '[': result->type = CPP_OPEN_SQUARE; break;
3977 1.1 mrg case ']': result->type = CPP_CLOSE_SQUARE; break;
3978 1.1 mrg case '{': result->type = CPP_OPEN_BRACE; break;
3979 1.1 mrg case '}': result->type = CPP_CLOSE_BRACE; break;
3980 1.1 mrg case ';': result->type = CPP_SEMICOLON; break;
3981 1.1 mrg
3982 1.1 mrg /* @ is a punctuator in Objective-C. */
3983 1.1 mrg case '@': result->type = CPP_ATSIGN; break;
3984 1.1 mrg
3985 1.1 mrg default:
3986 1.1 mrg {
/* Step back onto the character just read so BASE points at it. */
3987 1.1 mrg const uchar *base = --buffer->cur;
3988 1.1 mrg
3989 1.1 mrg /* Check for an extended identifier ($ or UCN or UTF-8). */
3990 1.1 mrg struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3991 1.1 mrg if (forms_identifier_p (pfile, true, &nst))
3992 1.1 mrg {
3993 1.1 mrg result->type = CPP_NAME;
3994 1.1 mrg result->val.node.node = lex_identifier (pfile, base, true, &nst,
3995 1.1 mrg &result->val.node.spelling);
3996 1.1 mrg warn_about_normalization (pfile, result, &nst);
3997 1.1 mrg break;
3998 1.1 mrg }
3999 1.1 mrg
4000 1.1 mrg /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4001 1.1 mrg single token. */
4002 1.1 mrg buffer->cur++;
4003 1.1 mrg if (c >= utf8_signifier)
4004 1.1 mrg {
4005 1.1 mrg const uchar *pstr = base;
4006 1.1 mrg cppchar_t s;
4007 1.1 mrg if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
4008 1.1 mrg buffer->cur = pstr;
4009 1.1 mrg }
4010 1.1 mrg create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4011 1.1 mrg break;
4012 1.1 mrg }
4013 1.1 mrg
4014 1.1 mrg }
4015 1.1 mrg
4016 1.1 mrg /* Potentially convert the location of the token to a range. */
4017 1.1 mrg if (result->src_loc >= RESERVED_LOCATION_COUNT
4018 1.1 mrg && result->type != CPP_EOF)
4019 1.1 mrg {
4020 1.1 mrg /* Ensure that any line notes are processed, so that we have the
4021 1.1 mrg correct physical line/column for the end-point of the token even
4022 1.1 mrg when a logical line is split via one or more backslashes. */
4023 1.1 mrg if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4024 1.1 mrg && !pfile->overlaid_buffer)
4025 1.1 mrg _cpp_process_line_notes (pfile, false);
4026 1.1 mrg
4027 1.1 mrg source_range tok_range;
4028 1.1 mrg tok_range.m_start = result->src_loc;
4029 1.1 mrg tok_range.m_finish
4030 1.1 mrg = linemap_position_for_column (pfile->line_table,
4031 1.1 mrg CPP_BUF_COLUMN (buffer, buffer->cur));
4032 1.1 mrg
4033 1.1 mrg result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4034 1.1 mrg result->src_loc,
4035 1.1 mrg tok_range, NULL);
4036 1.1 mrg }
4037 1.1 mrg
4038 1.1 mrg return result;
4039 1.1 mrg }
4040 1.1 mrg
4041 1.1 mrg /* An upper bound on the number of bytes needed to spell TOKEN.
4042 1.1 mrg Does not include preceding whitespace. */
4043 1.1 mrg unsigned int
4044 1.1 mrg cpp_token_len (const cpp_token *token)
4045 1.1 mrg {
4046 1.1 mrg unsigned int len;
4047 1.1 mrg
4048 1.1 mrg switch (TOKEN_SPELL (token))
4049 1.1 mrg {
4050 1.1 mrg default: len = 6; break;
4051 1.1 mrg case SPELL_LITERAL: len = token->val.str.len; break;
4052 1.1 mrg case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4053 1.1 mrg }
4054 1.1 mrg
4055 1.1 mrg return len;
4056 1.1 mrg }
4057 1.1 mrg
/* Decode the single UTF-8 sequence that starts at NAME and write it to
   BUFFER as a "\UXXXXXXXX" escape with lower-case hex digits.  Exactly
   10 bytes are written to BUFFER and no terminator is added.  Returns
   the number of bytes consumed from NAME.  Aborts if a continuation
   byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  int k;
  unsigned lead;
  unsigned long code_point;

  /* The count of leading one bits in the first byte is the length of
     the whole sequence.  */
  lead = *name;
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Payload bits of the lead byte, then six more per continuation
     byte.  */
  code_point = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & 0xC0) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
4091 1.1 mrg
4092 1.1 mrg /* Given a token TYPE corresponding to a digraph, return a pointer to
4093 1.1 mrg the spelling of the digraph. */
4094 1.1 mrg static const unsigned char *
4095 1.1 mrg cpp_digraph2name (enum cpp_ttype type)
4096 1.1 mrg {
4097 1.1 mrg return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4098 1.1 mrg }
4099 1.1 mrg
4100 1.1 mrg /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4101 1.1 mrg The buffer must already contain the enough space to hold the
4102 1.1 mrg token's spelling. Returns a pointer to the character after the
4103 1.1 mrg last character written. */
4104 1.1 mrg unsigned char *
4105 1.1 mrg _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4106 1.1 mrg {
4107 1.1 mrg size_t i;
4108 1.1 mrg const unsigned char *name = NODE_NAME (ident);
4109 1.1 mrg
4110 1.1 mrg for (i = 0; i < NODE_LEN (ident); i++)
4111 1.1 mrg if (name[i] & ~0x7F)
4112 1.1 mrg {
4113 1.1 mrg i += utf8_to_ucn (buffer, name + i) - 1;
4114 1.1 mrg buffer += 10;
4115 1.1 mrg }
4116 1.1 mrg else
4117 1.1 mrg *buffer++ = name[i];
4118 1.1 mrg
4119 1.1 mrg return buffer;
4120 1.1 mrg }
4121 1.1 mrg
4122 1.1 mrg /* Write the spelling of a token TOKEN to BUFFER. The buffer must
4123 1.1 mrg already contain the enough space to hold the token's spelling.
4124 1.1 mrg Returns a pointer to the character after the last character written.
4125 1.1 mrg FORSTRING is true if this is to be the spelling after translation
4126 1.1 mrg phase 1 (with the original spelling of extended identifiers), false
4127 1.1 mrg if extended identifiers should always be written using UCNs (there is
4128 1.1 mrg no option for always writing them in the internal UTF-8 form).
4129 1.1 mrg FIXME: Would be nice if we didn't need the PFILE argument. */
4130 1.1 mrg unsigned char *
4131 1.1 mrg cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4132 1.1 mrg unsigned char *buffer, bool forstring)
4133 1.1 mrg {
4134 1.1 mrg switch (TOKEN_SPELL (token))
4135 1.1 mrg {
4136 1.1 mrg case SPELL_OPERATOR:
4137 1.1 mrg {
4138 1.1 mrg const unsigned char *spelling;
4139 1.1 mrg unsigned char c;
4140 1.1 mrg
4141 1.1 mrg if (token->flags & DIGRAPH)
4142 1.1 mrg spelling = cpp_digraph2name (token->type);
4143 1.1 mrg else if (token->flags & NAMED_OP)
4144 1.1 mrg goto spell_ident;
4145 1.1 mrg else
4146 1.1 mrg spelling = TOKEN_NAME (token);
4147 1.1 mrg
4148 1.1 mrg while ((c = *spelling++) != '\0')
4149 1.1 mrg *buffer++ = c;
4150 1.1 mrg }
4151 1.1 mrg break;
4152 1.1 mrg
4153 1.1 mrg spell_ident:
4154 1.1 mrg case SPELL_IDENT:
4155 1.1 mrg if (forstring)
4156 1.1 mrg {
4157 1.1 mrg memcpy (buffer, NODE_NAME (token->val.node.spelling),
4158 1.1 mrg NODE_LEN (token->val.node.spelling));
4159 1.1 mrg buffer += NODE_LEN (token->val.node.spelling);
4160 1.1 mrg }
4161 1.1 mrg else
4162 1.1 mrg buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
4163 1.1 mrg break;
4164 1.1 mrg
4165 1.1 mrg case SPELL_LITERAL:
4166 1.1 mrg memcpy (buffer, token->val.str.text, token->val.str.len);
4167 1.1 mrg buffer += token->val.str.len;
4168 1.1 mrg break;
4169 1.1 mrg
4170 1.1 mrg case SPELL_NONE:
4171 1.1 mrg cpp_error (pfile, CPP_DL_ICE,
4172 1.1 mrg "unspellable token %s", TOKEN_NAME (token));
4173 1.1 mrg break;
4174 1.1 mrg }
4175 1.1 mrg
4176 1.1 mrg return buffer;
4177 1.1 mrg }
4178 1.1 mrg
4179 1.1 mrg /* Returns TOKEN spelt as a null-terminated string. The string is
4180 1.1 mrg freed when the reader is destroyed. Useful for diagnostics. */
4181 1.1 mrg unsigned char *
4182 1.1 mrg cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4183 1.1 mrg {
4184 1.1 mrg unsigned int len = cpp_token_len (token) + 1;
4185 1.1 mrg unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4186 1.1 mrg
4187 1.1 mrg end = cpp_spell_token (pfile, token, start, false);
4188 1.1 mrg end[0] = '\0';
4189 1.1 mrg
4190 1.1 mrg return start;
4191 1.1 mrg }
4192 1.1 mrg
4193 1.1 mrg /* Returns a pointer to a string which spells the token defined by
4194 1.1 mrg TYPE and FLAGS. Used by C front ends, which really should move to
4195 1.1 mrg using cpp_token_as_text. */
4196 1.1 mrg const char *
4197 1.1 mrg cpp_type2name (enum cpp_ttype type, unsigned char flags)
4198 1.1 mrg {
4199 1.1 mrg if (flags & DIGRAPH)
4200 1.1 mrg return (const char *) cpp_digraph2name (type);
4201 1.1 mrg else if (flags & NAMED_OP)
4202 1.1 mrg return cpp_named_operator2name (type);
4203 1.1 mrg
4204 1.1 mrg return (const char *) token_spellings[type].name;
4205 1.1 mrg }
4206 1.1 mrg
4207 1.1 mrg /* Writes the spelling of token to FP, without any preceding space.
4208 1.1 mrg Separated from cpp_spell_token for efficiency - to avoid stdio
4209 1.1 mrg double-buffering. */
4210 1.1 mrg void
4211 1.1 mrg cpp_output_token (const cpp_token *token, FILE *fp)
4212 1.1 mrg {
4213 1.1 mrg switch (TOKEN_SPELL (token))
4214 1.1 mrg {
4215 1.1 mrg case SPELL_OPERATOR:
4216 1.1 mrg {
4217 1.1 mrg const unsigned char *spelling;
4218 1.1 mrg int c;
4219 1.1 mrg
4220 1.1 mrg if (token->flags & DIGRAPH)
4221 1.1 mrg spelling = cpp_digraph2name (token->type);
4222 1.1 mrg else if (token->flags & NAMED_OP)
4223 1.1 mrg goto spell_ident;
4224 1.1 mrg else
4225 1.1 mrg spelling = TOKEN_NAME (token);
4226 1.1 mrg
4227 1.1 mrg c = *spelling;
4228 1.1 mrg do
4229 1.1 mrg putc (c, fp);
4230 1.1 mrg while ((c = *++spelling) != '\0');
4231 1.1 mrg }
4232 1.1 mrg break;
4233 1.1 mrg
4234 1.1 mrg spell_ident:
4235 1.1 mrg case SPELL_IDENT:
4236 1.1 mrg {
4237 1.1 mrg size_t i;
4238 1.1 mrg const unsigned char * name = NODE_NAME (token->val.node.node);
4239 1.1 mrg
4240 1.1 mrg for (i = 0; i < NODE_LEN (token->val.node.node); i++)
4241 1.1 mrg if (name[i] & ~0x7F)
4242 1.1 mrg {
4243 1.1 mrg unsigned char buffer[10];
4244 1.1 mrg i += utf8_to_ucn (buffer, name + i) - 1;
4245 1.1 mrg fwrite (buffer, 1, 10, fp);
4246 1.1 mrg }
4247 1.1 mrg else
4248 1.1 mrg fputc (NODE_NAME (token->val.node.node)[i], fp);
4249 1.1 mrg }
4250 1.1 mrg break;
4251 1.1 mrg
4252 1.1 mrg case SPELL_LITERAL:
4253 1.1 mrg if (token->type == CPP_HEADER_NAME)
4254 1.1 mrg fputc ('"', fp);
4255 1.1 mrg fwrite (token->val.str.text, 1, token->val.str.len, fp);
4256 1.1 mrg if (token->type == CPP_HEADER_NAME)
4257 1.1 mrg fputc ('"', fp);
4258 1.1 mrg break;
4259 1.1 mrg
4260 1.1 mrg case SPELL_NONE:
4261 1.1 mrg /* An error, most probably. */
4262 1.1 mrg break;
4263 1.1 mrg }
4264 1.1 mrg }
4265 1.1 mrg
4266 1.1 mrg /* Compare two tokens. */
4267 1.1 mrg int
4268 1.1 mrg _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4269 1.1 mrg {
4270 1.1 mrg if (a->type == b->type && a->flags == b->flags)
4271 1.1 mrg switch (TOKEN_SPELL (a))
4272 1.1 mrg {
4273 1.1 mrg default: /* Keep compiler happy. */
4274 1.1 mrg case SPELL_OPERATOR:
4275 1.1 mrg /* token_no is used to track where multiple consecutive ##
4276 1.1 mrg tokens were originally located. */
4277 1.1 mrg return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4278 1.1 mrg case SPELL_NONE:
4279 1.1 mrg return (a->type != CPP_MACRO_ARG
4280 1.1 mrg || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4281 1.1 mrg && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4282 1.1 mrg case SPELL_IDENT:
4283 1.1 mrg return (a->val.node.node == b->val.node.node
4284 1.1 mrg && a->val.node.spelling == b->val.node.spelling);
4285 1.1 mrg case SPELL_LITERAL:
4286 1.1 mrg return (a->val.str.len == b->val.str.len
4287 1.1 mrg && !memcmp (a->val.str.text, b->val.str.text,
4288 1.1 mrg a->val.str.len));
4289 1.1 mrg }
4290 1.1 mrg
4291 1.1 mrg return 0;
4292 1.1 mrg }
4293 1.1 mrg
4294 1.1 mrg /* Returns nonzero if a space should be inserted to avoid an
4295 1.1 mrg accidental token paste for output. For simplicity, it is
4296 1.1 mrg conservative, and occasionally advises a space where one is not
4297 1.1 mrg needed, e.g. "." and ".2". */
4298 1.1 mrg int
4299 1.1 mrg cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4300 1.1 mrg const cpp_token *token2)
4301 1.1 mrg {
4302 1.1 mrg enum cpp_ttype a = token1->type, b = token2->type;
4303 1.1 mrg cppchar_t c;
4304 1.1 mrg
4305 1.1 mrg if (token1->flags & NAMED_OP)
4306 1.1 mrg a = CPP_NAME;
4307 1.1 mrg if (token2->flags & NAMED_OP)
4308 1.1 mrg b = CPP_NAME;
4309 1.1 mrg
4310 1.1 mrg c = EOF;
4311 1.1 mrg if (token2->flags & DIGRAPH)
4312 1.1 mrg c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4313 1.1 mrg else if (token_spellings[b].category == SPELL_OPERATOR)
4314 1.1 mrg c = token_spellings[b].name[0];
4315 1.1 mrg
4316 1.1 mrg /* Quickly get everything that can paste with an '='. */
4317 1.1 mrg if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4318 1.1 mrg return 1;
4319 1.1 mrg
4320 1.1 mrg switch (a)
4321 1.1 mrg {
4322 1.1 mrg case CPP_GREATER: return c == '>';
4323 1.1 mrg case CPP_LESS: return c == '<' || c == '%' || c == ':';
4324 1.1 mrg case CPP_PLUS: return c == '+';
4325 1.1 mrg case CPP_MINUS: return c == '-' || c == '>';
4326 1.1 mrg case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4327 1.1 mrg case CPP_MOD: return c == ':' || c == '>';
4328 1.1 mrg case CPP_AND: return c == '&';
4329 1.1 mrg case CPP_OR: return c == '|';
4330 1.1 mrg case CPP_COLON: return c == ':' || c == '>';
4331 1.1 mrg case CPP_DEREF: return c == '*';
4332 1.1 mrg case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4333 1.1 mrg case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4334 1.1 mrg case CPP_PRAGMA:
4335 1.1 mrg case CPP_NAME: return ((b == CPP_NUMBER
4336 1.1 mrg && name_p (pfile, &token2->val.str))
4337 1.1 mrg || b == CPP_NAME
4338 1.1 mrg || b == CPP_CHAR || b == CPP_STRING); /* L */
4339 1.1 mrg case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4340 1.1 mrg || b == CPP_CHAR
4341 1.1 mrg || c == '.' || c == '+' || c == '-');
4342 1.1 mrg /* UCNs */
4343 1.1 mrg case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4344 1.1 mrg && b == CPP_NAME)
4345 1.1 mrg || (CPP_OPTION (pfile, objc)
4346 1.1 mrg && token1->val.str.text[0] == '@'
4347 1.1 mrg && (b == CPP_NAME || b == CPP_STRING)));
4348 1.1 mrg case CPP_LESS_EQ: return c == '>';
4349 1.1 mrg case CPP_STRING:
4350 1.1 mrg case CPP_WSTRING:
4351 1.1 mrg case CPP_UTF8STRING:
4352 1.1 mrg case CPP_STRING16:
4353 1.1 mrg case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4354 1.1 mrg && (b == CPP_NAME
4355 1.1 mrg || (TOKEN_SPELL (token2) == SPELL_LITERAL
4356 1.1 mrg && ISIDST (token2->val.str.text[0]))));
4357 1.1 mrg
4358 1.1 mrg default: break;
4359 1.1 mrg }
4360 1.1 mrg
4361 1.1 mrg return 0;
4362 1.1 mrg }
4363 1.1 mrg
4364 1.1 mrg /* Output all the remaining tokens on the current line, and a newline
4365 1.1 mrg character, to FP. Leading whitespace is removed. If there are
4366 1.1 mrg macros, special token padding is not performed. */
4367 1.1 mrg void
4368 1.1 mrg cpp_output_line (cpp_reader *pfile, FILE *fp)
4369 1.1 mrg {
4370 1.1 mrg const cpp_token *token;
4371 1.1 mrg
4372 1.1 mrg token = cpp_get_token (pfile);
4373 1.1 mrg while (token->type != CPP_EOF)
4374 1.1 mrg {
4375 1.1 mrg cpp_output_token (token, fp);
4376 1.1 mrg token = cpp_get_token (pfile);
4377 1.1 mrg if (token->flags & PREV_WHITE)
4378 1.1 mrg putc (' ', fp);
4379 1.1 mrg }
4380 1.1 mrg
4381 1.1 mrg putc ('\n', fp);
4382 1.1 mrg }
4383 1.1 mrg
4384 1.1 mrg /* Return a string representation of all the remaining tokens on the
4385 1.1 mrg current line. The result is allocated using xmalloc and must be
4386 1.1 mrg freed by the caller. */
4387 1.1 mrg unsigned char *
4388 1.1 mrg cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4389 1.1 mrg {
4390 1.1 mrg const cpp_token *token;
4391 1.1 mrg unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4392 1.1 mrg unsigned int alloced = 120 + out;
4393 1.1 mrg unsigned char *result = (unsigned char *) xmalloc (alloced);
4394 1.1 mrg
4395 1.1 mrg /* If DIR_NAME is empty, there are no initial contents. */
4396 1.1 mrg if (dir_name)
4397 1.1 mrg {
4398 1.1 mrg sprintf ((char *) result, "#%s ", dir_name);
4399 1.1 mrg out += 2;
4400 1.1 mrg }
4401 1.1 mrg
4402 1.1 mrg token = cpp_get_token (pfile);
4403 1.1 mrg while (token->type != CPP_EOF)
4404 1.1 mrg {
4405 1.1 mrg unsigned char *last;
4406 1.1 mrg /* Include room for a possible space and the terminating nul. */
4407 1.1 mrg unsigned int len = cpp_token_len (token) + 2;
4408 1.1 mrg
4409 1.1 mrg if (out + len > alloced)
4410 1.1 mrg {
4411 1.1 mrg alloced *= 2;
4412 1.1 mrg if (out + len > alloced)
4413 1.1 mrg alloced = out + len;
4414 1.1 mrg result = (unsigned char *) xrealloc (result, alloced);
4415 1.1 mrg }
4416 1.1 mrg
4417 1.1 mrg last = cpp_spell_token (pfile, token, &result[out], 0);
4418 1.1 mrg out = last - result;
4419 1.1 mrg
4420 1.1 mrg token = cpp_get_token (pfile);
4421 1.1 mrg if (token->flags & PREV_WHITE)
4422 1.1 mrg result[out++] = ' ';
4423 1.1 mrg }
4424 1.1 mrg
4425 1.1 mrg result[out] = '\0';
4426 1.1 mrg return result;
4427 1.1 mrg }
4428 1.1 mrg
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload new_buff will allocate.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be reused for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that an arbitrary expression can
   be passed safely.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
  ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
#error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4443 1.1 mrg
4444 1.1 mrg /* Create a new allocation buffer. Place the control block at the end
4445 1.1 mrg of the buffer, so that buffer overflows will cause immediate chaos. */
4446 1.1 mrg static _cpp_buff *
4447 1.1 mrg new_buff (size_t len)
4448 1.1 mrg {
4449 1.1 mrg _cpp_buff *result;
4450 1.1 mrg unsigned char *base;
4451 1.1 mrg
4452 1.1 mrg if (len < MIN_BUFF_SIZE)
4453 1.1 mrg len = MIN_BUFF_SIZE;
4454 1.1 mrg len = CPP_ALIGN (len);
4455 1.1 mrg
4456 1.1 mrg #ifdef ENABLE_VALGRIND_ANNOTATIONS
4457 1.1 mrg /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4458 1.1 mrg struct first. */
4459 1.1 mrg size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4460 1.1 mrg base = XNEWVEC (unsigned char, len + slen);
4461 1.1 mrg result = (_cpp_buff *) base;
4462 1.1 mrg base += slen;
4463 1.1 mrg #else
4464 1.1 mrg base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4465 1.1 mrg result = (_cpp_buff *) (base + len);
4466 1.1 mrg #endif
4467 1.1 mrg result->base = base;
4468 1.1 mrg result->cur = base;
4469 1.1 mrg result->limit = base + len;
4470 1.1 mrg result->next = NULL;
4471 1.1 mrg return result;
4472 1.1 mrg }
4473 1.1 mrg
4474 1.1 mrg /* Place a chain of unwanted allocation buffers on the free list. */
4475 1.1 mrg void
4476 1.1 mrg _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4477 1.1 mrg {
4478 1.1 mrg _cpp_buff *end = buff;
4479 1.1 mrg
4480 1.1 mrg while (end->next)
4481 1.1 mrg end = end->next;
4482 1.1 mrg end->next = pfile->free_buffs;
4483 1.1 mrg pfile->free_buffs = buff;
4484 1.1 mrg }
4485 1.1 mrg
4486 1.1 mrg /* Return a free buffer of size at least MIN_SIZE. */
4487 1.1 mrg _cpp_buff *
4488 1.1 mrg _cpp_get_buff (cpp_reader *pfile, size_t min_size)
4489 1.1 mrg {
4490 1.1 mrg _cpp_buff *result, **p;
4491 1.1 mrg
4492 1.1 mrg for (p = &pfile->free_buffs;; p = &(*p)->next)
4493 1.1 mrg {
4494 1.1 mrg size_t size;
4495 1.1 mrg
4496 1.1 mrg if (*p == NULL)
4497 1.1 mrg return new_buff (min_size);
4498 1.1 mrg result = *p;
4499 1.1 mrg size = result->limit - result->base;
4500 1.1 mrg /* Return a buffer that's big enough, but don't waste one that's
4501 1.1 mrg way too big. */
4502 1.1 mrg if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4503 1.1 mrg break;
4504 1.1 mrg }
4505 1.1 mrg
4506 1.1 mrg *p = result->next;
4507 1.1 mrg result->next = NULL;
4508 1.1 mrg result->cur = result->base;
4509 1.1 mrg return result;
4510 1.1 mrg }
4511 1.1 mrg
4512 1.1 mrg /* Creates a new buffer with enough space to hold the uncommitted
4513 1.1 mrg remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4514 1.1 mrg the excess bytes to the new buffer. Chains the new buffer after
4515 1.1 mrg BUFF, and returns the new buffer. */
4516 1.1 mrg _cpp_buff *
4517 1.1 mrg _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4518 1.1 mrg {
4519 1.1 mrg size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4520 1.1 mrg _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
4521 1.1 mrg
4522 1.1 mrg buff->next = new_buff;
4523 1.1 mrg memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4524 1.1 mrg return new_buff;
4525 1.1 mrg }
4526 1.1 mrg
4527 1.1 mrg /* Creates a new buffer with enough space to hold the uncommitted
4528 1.1 mrg remaining bytes of the buffer pointed to by BUFF, and at least
4529 1.1 mrg MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4530 1.1 mrg Chains the new buffer before the buffer pointed to by BUFF, and
4531 1.1 mrg updates the pointer to point to the new buffer. */
4532 1.1 mrg void
4533 1.1 mrg _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4534 1.1 mrg {
4535 1.1 mrg _cpp_buff *new_buff, *old_buff = *pbuff;
4536 1.1 mrg size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4537 1.1 mrg
4538 1.1 mrg new_buff = _cpp_get_buff (pfile, size);
4539 1.1 mrg memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4540 1.1 mrg new_buff->next = old_buff;
4541 1.1 mrg *pbuff = new_buff;
4542 1.1 mrg }
4543 1.1 mrg
4544 1.1 mrg /* Free a chain of buffers starting at BUFF. */
4545 1.1 mrg void
4546 1.1 mrg _cpp_free_buff (_cpp_buff *buff)
4547 1.1 mrg {
4548 1.1 mrg _cpp_buff *next;
4549 1.1 mrg
4550 1.1 mrg for (; buff; buff = next)
4551 1.1 mrg {
4552 1.1 mrg next = buff->next;
4553 1.1 mrg #ifdef ENABLE_VALGRIND_ANNOTATIONS
4554 1.1 mrg free (buff);
4555 1.1 mrg #else
4556 1.1 mrg free (buff->base);
4557 1.1 mrg #endif
4558 1.1 mrg }
4559 1.1 mrg }
4560 1.1 mrg
4561 1.1 mrg /* Allocate permanent, unaligned storage of length LEN. */
4562 1.1 mrg unsigned char *
4563 1.1 mrg _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4564 1.1 mrg {
4565 1.1 mrg _cpp_buff *buff = pfile->u_buff;
4566 1.1 mrg unsigned char *result = buff->cur;
4567 1.1 mrg
4568 1.1 mrg if (len > (size_t) (buff->limit - result))
4569 1.1 mrg {
4570 1.1 mrg buff = _cpp_get_buff (pfile, len);
4571 1.1 mrg buff->next = pfile->u_buff;
4572 1.1 mrg pfile->u_buff = buff;
4573 1.1 mrg result = buff->cur;
4574 1.1 mrg }
4575 1.1 mrg
4576 1.1 mrg buff->cur = result + len;
4577 1.1 mrg return result;
4578 1.1 mrg }
4579 1.1 mrg
4580 1.1 mrg /* Allocate permanent, unaligned storage of length LEN from a_buff.
4581 1.1 mrg That buffer is used for growing allocations when saving macro
4582 1.1 mrg replacement lists in a #define, and when parsing an answer to an
4583 1.1 mrg assertion in #assert, #unassert or #if (and therefore possibly
4584 1.1 mrg whilst expanding macros). It therefore must not be used by any
4585 1.1 mrg code that they might call: specifically the lexer and the guts of
4586 1.1 mrg the macro expander.
4587 1.1 mrg
4588 1.1 mrg All existing other uses clearly fit this restriction: storing
4589 1.1 mrg registered pragmas during initialization. */
4590 1.1 mrg unsigned char *
4591 1.1 mrg _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4592 1.1 mrg {
4593 1.1 mrg _cpp_buff *buff = pfile->a_buff;
4594 1.1 mrg unsigned char *result = buff->cur;
4595 1.1 mrg
4596 1.1 mrg if (len > (size_t) (buff->limit - result))
4597 1.1 mrg {
4598 1.1 mrg buff = _cpp_get_buff (pfile, len);
4599 1.1 mrg buff->next = pfile->a_buff;
4600 1.1 mrg pfile->a_buff = buff;
4601 1.1 mrg result = buff->cur;
4602 1.1 mrg }
4603 1.1 mrg
4604 1.1 mrg buff->cur = result + len;
4605 1.1 mrg return result;
4606 1.1 mrg }
4607 1.1 mrg
4608 1.1 mrg /* Commit or allocate storage from a buffer. */
4609 1.1 mrg
4610 1.1 mrg void *
4611 1.1 mrg _cpp_commit_buff (cpp_reader *pfile, size_t size)
4612 1.1 mrg {
4613 1.1 mrg void *ptr = BUFF_FRONT (pfile->a_buff);
4614 1.1 mrg
4615 1.1 mrg if (pfile->hash_table->alloc_subobject)
4616 1.1 mrg {
4617 1.1 mrg void *copy = pfile->hash_table->alloc_subobject (size);
4618 1.1 mrg memcpy (copy, ptr, size);
4619 1.1 mrg ptr = copy;
4620 1.1 mrg }
4621 1.1 mrg else
4622 1.1 mrg BUFF_FRONT (pfile->a_buff) += size;
4623 1.1 mrg
4624 1.1 mrg return ptr;
4625 1.1 mrg }
4626 1.1 mrg
4627 1.1 mrg /* Say which field of TOK is in use. */
4628 1.1 mrg
4629 1.1 mrg enum cpp_token_fld_kind
4630 1.1 mrg cpp_token_val_index (const cpp_token *tok)
4631 1.1 mrg {
4632 1.1 mrg switch (TOKEN_SPELL (tok))
4633 1.1 mrg {
4634 1.1 mrg case SPELL_IDENT:
4635 1.1 mrg return CPP_TOKEN_FLD_NODE;
4636 1.1 mrg case SPELL_LITERAL:
4637 1.1 mrg return CPP_TOKEN_FLD_STR;
4638 1.1 mrg case SPELL_OPERATOR:
4639 1.1 mrg /* Operands which were originally spelled as ident keep around
4640 1.1 mrg the node for the exact spelling. */
4641 1.1 mrg if (tok->flags & NAMED_OP)
4642 1.1 mrg return CPP_TOKEN_FLD_NODE;
4643 1.1 mrg else if (tok->type == CPP_PASTE)
4644 1.1 mrg return CPP_TOKEN_FLD_TOKEN_NO;
4645 1.1 mrg else
4646 1.1 mrg return CPP_TOKEN_FLD_NONE;
4647 1.1 mrg case SPELL_NONE:
4648 1.1 mrg if (tok->type == CPP_MACRO_ARG)
4649 1.1 mrg return CPP_TOKEN_FLD_ARG_NO;
4650 1.1 mrg else if (tok->type == CPP_PADDING)
4651 1.1 mrg return CPP_TOKEN_FLD_SOURCE;
4652 1.1 mrg else if (tok->type == CPP_PRAGMA)
4653 1.1 mrg return CPP_TOKEN_FLD_PRAGMA;
4654 1.1 mrg /* fall through */
4655 1.1 mrg default:
4656 1.1 mrg return CPP_TOKEN_FLD_NONE;
4657 1.1 mrg }
4658 1.1 mrg }
4659 1.1 mrg
4660 1.1 mrg /* All tokens lexed in R after calling this function will be forced to
4661 1.1 mrg have their location_t to be P, until
4662 1.1 mrg cpp_stop_forcing_token_locations is called for R. */
4663 1.1 mrg
4664 1.1 mrg void
4665 1.1 mrg cpp_force_token_locations (cpp_reader *r, location_t loc)
4666 1.1 mrg {
4667 1.1 mrg r->forced_token_location = loc;
4668 1.1 mrg }
4669 1.1 mrg
4670 1.1 mrg /* Go back to assigning locations naturally for lexed tokens. */
4671 1.1 mrg
4672 1.1 mrg void
4673 1.1 mrg cpp_stop_forcing_token_locations (cpp_reader *r)
4674 1.1 mrg {
4675 1.1 mrg r->forced_token_location = 0;
4676 1.1 mrg }
4677 1.1 mrg
/* PEEK points at a backslash.  If that backslash escapes an end of
   line ("\n", "\r" or "\r\n"), return the position just past the
   line ending, repeating for as long as another escaped end of line
   follows immediately.  If the backslash does not escape an end of
   line, or stepping over it would reach LIMIT, the position is not
   advanced.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A "\r\n" pair counts as a single line ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse: keep going while continuation
         lines follow each other.  */
      if (*peek != '\\')
        return peek;
    }
}
4707 1.1 mrg
4708 1.1 mrg static const unsigned char *
4709 1.1 mrg do_peek_next (const unsigned char *peek, const unsigned char *limit)
4710 1.1 mrg {
4711 1.1 mrg if (__builtin_expect (*peek == '\\', false))
4712 1.1 mrg peek = do_peek_backslash (peek, limit);
4713 1.1 mrg return peek;
4714 1.1 mrg }
4715 1.1 mrg
/* Step backwards from PEEK to the previous character, never going
   below BOUND.  Returns NULL when PEEK is already at BOUND.  If the
   previous character is a line ending ("\n", "\r" or "\r\n") that is
   itself preceded by a backslash, the escaped line ending is skipped
   and the search continues from the backslash.  Otherwise a pointer
   to the previous character is returned.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* Line endings are '\n' and '\r' (carriage return); the letter 'r'
     is an ordinary character.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset from PEEK of the character in front of the line
         ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n": look in front of the '\r' as well, unless it is
             the first character available.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        /* Escaped line ending: carry on from the backslash.  */
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
4744 1.1 mrg
4745 1.1 mrg /* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4746 1.1 mrg space. Otherwise return NULL. */
4747 1.1 mrg
4748 1.1 mrg static const unsigned char *
4749 1.1 mrg do_peek_ident (const char *match, const unsigned char *peek,
4750 1.1 mrg const unsigned char *limit)
4751 1.1 mrg {
4752 1.1 mrg for (; *++match; peek++)
4753 1.1 mrg if (*peek != *match)
4754 1.1 mrg {
4755 1.1 mrg peek = do_peek_next (peek, limit);
4756 1.1 mrg if (*peek != *match)
4757 1.1 mrg return NULL;
4758 1.1 mrg }
4759 1.1 mrg
4760 1.1 mrg /* Must now not be looking at an identifier char. */
4761 1.1 mrg peek = do_peek_next (peek, limit);
4762 1.1 mrg if (ISIDNUM (*peek))
4763 1.1 mrg return NULL;
4764 1.1 mrg
4765 1.1 mrg /* Skip control-line whitespace. */
4766 1.1 mrg ws:
4767 1.1 mrg while (*peek == ' ' || *peek == '\t')
4768 1.1 mrg peek++;
4769 1.1 mrg if (__builtin_expect (*peek == '\\', false))
4770 1.1 mrg {
4771 1.1 mrg peek = do_peek_backslash (peek, limit);
4772 1.1 mrg if (*peek != '\\')
4773 1.1 mrg goto ws;
4774 1.1 mrg }
4775 1.1 mrg
4776 1.1 mrg return peek;
4777 1.1 mrg }
4778 1.1 mrg
4779 1.1 mrg /* Are we looking at a module control line starting as PEEK - 1? */
4780 1.1 mrg
4781 1.1 mrg static bool
4782 1.1 mrg do_peek_module (cpp_reader *pfile, unsigned char c,
4783 1.1 mrg const unsigned char *peek, const unsigned char *limit)
4784 1.1 mrg {
4785 1.1 mrg bool import = false;
4786 1.1 mrg
4787 1.1 mrg if (__builtin_expect (c == 'e', false))
4788 1.1 mrg {
4789 1.1 mrg if (!((peek[0] == 'x' || peek[0] == '\\')
4790 1.1 mrg && (peek = do_peek_ident ("export", peek, limit))))
4791 1.1 mrg return false;
4792 1.1 mrg
4793 1.1 mrg /* export, peek for import or module. No need to peek __import
4794 1.1 mrg here. */
4795 1.1 mrg if (peek[0] == 'i')
4796 1.1 mrg {
4797 1.1 mrg if (!((peek[1] == 'm' || peek[1] == '\\')
4798 1.1 mrg && (peek = do_peek_ident ("import", peek + 1, limit))))
4799 1.1 mrg return false;
4800 1.1 mrg import = true;
4801 1.1 mrg }
4802 1.1 mrg else if (peek[0] == 'm')
4803 1.1 mrg {
4804 1.1 mrg if (!((peek[1] == 'o' || peek[1] == '\\')
4805 1.1 mrg && (peek = do_peek_ident ("module", peek + 1, limit))))
4806 1.1 mrg return false;
4807 1.1 mrg }
4808 1.1 mrg else
4809 1.1 mrg return false;
4810 1.1 mrg }
4811 1.1 mrg else if (__builtin_expect (c == 'i', false))
4812 1.1 mrg {
4813 1.1 mrg if (!((peek[0] == 'm' || peek[0] == '\\')
4814 1.1 mrg && (peek = do_peek_ident ("import", peek, limit))))
4815 1.1 mrg return false;
4816 1.1 mrg import = true;
4817 1.1 mrg }
4818 1.1 mrg else if (__builtin_expect (c == '_', false))
4819 1.1 mrg {
4820 1.1 mrg /* Needed for translated includes. */
4821 1.1 mrg if (!((peek[0] == '_' || peek[0] == '\\')
4822 1.1 mrg && (peek = do_peek_ident ("__import", peek, limit))))
4823 1.1 mrg return false;
4824 1.1 mrg import = true;
4825 1.1 mrg }
4826 1.1 mrg else if (__builtin_expect (c == 'm', false))
4827 1.1 mrg {
4828 1.1 mrg if (!((peek[0] == 'o' || peek[0] == '\\')
4829 1.1 mrg && (peek = do_peek_ident ("module", peek, limit))))
4830 1.1 mrg return false;
4831 1.1 mrg }
4832 1.1 mrg else
4833 1.1 mrg return false;
4834 1.1 mrg
4835 1.1 mrg /* Peek the next character to see if it's good enough. We'll be at
4836 1.1 mrg the first non-whitespace char, including skipping an escaped
4837 1.1 mrg newline. */
4838 1.1 mrg /* ... import followed by identifier, ':', '<' or header-name
4839 1.1 mrg preprocessing tokens, or module followed by identifier, ':' or
4840 1.1 mrg ';' preprocessing tokens. */
4841 1.1 mrg unsigned char p = *peek++;
4842 1.1 mrg
4843 1.1 mrg /* A character literal is ... single quotes, ... optionally preceded
4844 1.1 mrg by u8, u, U, or L */
4845 1.1 mrg /* A string-literal is a ... double quotes, optionally prefixed by
4846 1.1 mrg R, u8, u8R, u, uR, U, UR, L, or LR */
4847 1.1 mrg if (p == 'u')
4848 1.1 mrg {
4849 1.1 mrg peek = do_peek_next (peek, limit);
4850 1.1 mrg if (*peek == '8')
4851 1.1 mrg {
4852 1.1 mrg peek++;
4853 1.1 mrg goto peek_u8;
4854 1.1 mrg }
4855 1.1 mrg goto peek_u;
4856 1.1 mrg }
4857 1.1 mrg else if (p == 'U' || p == 'L')
4858 1.1 mrg {
4859 1.1 mrg peek_u8:
4860 1.1 mrg peek = do_peek_next (peek, limit);
4861 1.1 mrg peek_u:
4862 1.1 mrg if (*peek == '\"' || *peek == '\'')
4863 1.1 mrg return false;
4864 1.1 mrg
4865 1.1 mrg if (*peek == 'R')
4866 1.1 mrg goto peek_R;
4867 1.1 mrg /* Identifier. Ok. */
4868 1.1 mrg }
4869 1.1 mrg else if (p == 'R')
4870 1.1 mrg {
4871 1.1 mrg peek_R:
4872 1.1 mrg if (CPP_OPTION (pfile, rliterals))
4873 1.1 mrg {
4874 1.1 mrg peek = do_peek_next (peek, limit);
4875 1.1 mrg if (*peek == '\"')
4876 1.1 mrg return false;
4877 1.1 mrg }
4878 1.1 mrg /* Identifier. Ok. */
4879 1.1 mrg }
4880 1.1 mrg else if ('Z' - 'A' == 25
4881 1.1 mrg ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4882 1.1 mrg : ISIDST (p))
4883 1.1 mrg {
4884 1.1 mrg /* Identifier. Ok. */
4885 1.1 mrg }
4886 1.1 mrg else if (p == '<')
4887 1.1 mrg {
4888 1.1 mrg /* Maybe angle header, ok for import. Reject
4889 1.1 mrg '<=', '<<' digraph:'<:'. */
4890 1.1 mrg if (!import)
4891 1.1 mrg return false;
4892 1.1 mrg peek = do_peek_next (peek, limit);
4893 1.1 mrg if (*peek == '=' || *peek == '<'
4894 1.1 mrg || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4895 1.1 mrg return false;
4896 1.1 mrg }
4897 1.1 mrg else if (p == ';')
4898 1.1 mrg {
4899 1.1 mrg /* SEMICOLON, ok for module. */
4900 1.1 mrg if (import)
4901 1.1 mrg return false;
4902 1.1 mrg }
4903 1.1 mrg else if (p == '"')
4904 1.1 mrg {
4905 1.1 mrg /* STRING, ok for import. */
4906 1.1 mrg if (!import)
4907 1.1 mrg return false;
4908 1.1 mrg }
4909 1.1 mrg else if (p == ':')
4910 1.1 mrg {
4911 1.1 mrg /* Maybe COLON, ok. Reject '::', digraph:':>'. */
4912 1.1 mrg peek = do_peek_next (peek, limit);
4913 1.1 mrg if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4914 1.1 mrg return false;
4915 1.1 mrg }
4916 1.1 mrg else
4917 1.1 mrg /* FIXME: Detect a unicode character, excluding those not
4918 1.1 mrg permitted as the initial character. [lex.name]/1. I presume
4919 1.1 mrg we need to check the \[uU] spellings, and directly using
4920 1.1 mrg Unicode in say UTF8 form? Or perhaps we do the phase-1
4921 1.1 mrg conversion of UTF8 to universal-character-names? */
4922 1.1 mrg return false;
4923 1.1 mrg
4924 1.1 mrg return true;
4925 1.1 mrg }
4926 1.1 mrg
4927 1.1 mrg /* Directives-only scanning. Somewhat more relaxed than correct
4928 1.1 mrg parsing -- some ill-formed programs will not be rejected. */
4929 1.1 mrg
4930 1.1 mrg void
4931 1.1 mrg cpp_directive_only_process (cpp_reader *pfile,
4932 1.1 mrg void *data,
4933 1.1 mrg void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4934 1.1 mrg {
4935 1.1 mrg bool module_p = CPP_OPTION (pfile, module_directives);
4936 1.1 mrg
4937 1.1 mrg do
4938 1.1 mrg {
4939 1.1 mrg restart:
4940 1.1 mrg /* Buffer initialization, but no line cleaning. */
4941 1.1 mrg cpp_buffer *buffer = pfile->buffer;
4942 1.1 mrg buffer->cur_note = buffer->notes_used = 0;
4943 1.1 mrg buffer->cur = buffer->line_base = buffer->next_line;
4944 1.1 mrg buffer->need_line = false;
4945 1.1 mrg /* Files always end in a newline or carriage return. We rely on this for
4946 1.1 mrg character peeking safety. */
4947 1.1 mrg gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
4948 1.1 mrg
4949 1.1 mrg const unsigned char *base = buffer->cur;
4950 1.1 mrg unsigned line_count = 0;
4951 1.1 mrg const unsigned char *line_start = base;
4952 1.1 mrg
4953 1.1 mrg bool bol = true;
4954 1.1 mrg bool raw = false;
4955 1.1 mrg
4956 1.1 mrg const unsigned char *lwm = base;
4957 1.1 mrg for (const unsigned char *pos = base, *limit = buffer->rlimit;
4958 1.1 mrg pos < limit;)
4959 1.1 mrg {
4960 1.1 mrg unsigned char c = *pos++;
4961 1.1 mrg /* This matches the switch in _cpp_lex_direct. */
4962 1.1 mrg switch (c)
4963 1.1 mrg {
4964 1.1 mrg case ' ': case '\t': case '\f': case '\v':
4965 1.1 mrg /* Whitespace, do nothing. */
4966 1.1 mrg break;
4967 1.1 mrg
4968 1.1 mrg case '\r': /* MAC line ending, or Windows \r\n */
4969 1.1 mrg if (*pos == '\n')
4970 1.1 mrg pos++;
4971 1.1 mrg /* FALLTHROUGH */
4972 1.1 mrg
4973 1.1 mrg case '\n':
4974 1.1 mrg bol = true;
4975 1.1 mrg
4976 1.1 mrg next_line:
4977 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
4978 1.1 mrg line_count++;
4979 1.1 mrg line_start = pos;
4980 1.1 mrg break;
4981 1.1 mrg
4982 1.1 mrg case '\\':
4983 1.1 mrg /* <backslash><newline> is removed, and doesn't undo any
4984 1.1 mrg preceeding escape or whatnot. */
4985 1.1 mrg if (*pos == '\n')
4986 1.1 mrg {
4987 1.1 mrg pos++;
4988 1.1 mrg goto next_line;
4989 1.1 mrg }
4990 1.1 mrg else if (*pos == '\r')
4991 1.1 mrg {
4992 1.1 mrg if (pos[1] == '\n')
4993 1.1 mrg pos++;
4994 1.1 mrg pos++;
4995 1.1 mrg goto next_line;
4996 1.1 mrg }
4997 1.1 mrg goto dflt;
4998 1.1 mrg
4999 1.1 mrg case '#':
5000 1.1 mrg if (bol)
5001 1.1 mrg {
5002 1.1 mrg /* Line directive. */
5003 1.1 mrg if (pos - 1 > base && !pfile->state.skipping)
5004 1.1 mrg cb (pfile, CPP_DO_print, data,
5005 1.1 mrg line_count, base, pos - 1 - base);
5006 1.1 mrg
5007 1.1 mrg /* Prep things for directive handling. */
5008 1.1 mrg buffer->next_line = pos;
5009 1.1 mrg buffer->need_line = true;
5010 1.1 mrg bool ok = _cpp_get_fresh_line (pfile);
5011 1.1 mrg gcc_checking_assert (ok);
5012 1.1 mrg
5013 1.1 mrg /* Ensure proper column numbering for generated
5014 1.1 mrg error messages. */
5015 1.1 mrg buffer->line_base -= pos - line_start;
5016 1.1 mrg
5017 1.1 mrg _cpp_handle_directive (pfile, line_start + 1 != pos);
5018 1.1 mrg
5019 1.1 mrg /* Sanitize the line settings. Duplicate #include's can
5020 1.1 mrg mess things up. */
5021 1.1 mrg // FIXME: Necessary?
5022 1.1 mrg pfile->line_table->highest_location
5023 1.1 mrg = pfile->line_table->highest_line;
5024 1.1 mrg
5025 1.1 mrg if (!pfile->state.skipping
5026 1.1 mrg && pfile->buffer->next_line < pfile->buffer->rlimit)
5027 1.1 mrg cb (pfile, CPP_DO_location, data,
5028 1.1 mrg pfile->line_table->highest_line);
5029 1.1 mrg
5030 1.1 mrg goto restart;
5031 1.1 mrg }
5032 1.1 mrg goto dflt;
5033 1.1 mrg
5034 1.1 mrg case '/':
5035 1.1 mrg {
5036 1.1 mrg const unsigned char *peek = do_peek_next (pos, limit);
5037 1.1 mrg if (!(*peek == '/' || *peek == '*'))
5038 1.1 mrg goto dflt;
5039 1.1 mrg
5040 1.1 mrg /* Line or block comment */
5041 1.1 mrg bool is_block = *peek == '*';
5042 1.1 mrg bool star = false;
5043 1.1 mrg bool esc = false;
5044 1.1 mrg location_t sloc
5045 1.1 mrg = linemap_position_for_column (pfile->line_table,
5046 1.1 mrg pos - line_start);
5047 1.1 mrg
5048 1.1 mrg while (pos < limit)
5049 1.1 mrg {
5050 1.1 mrg char c = *pos++;
5051 1.1 mrg switch (c)
5052 1.1 mrg {
5053 1.1 mrg case '\\':
5054 1.1 mrg esc = true;
5055 1.1 mrg break;
5056 1.1 mrg
5057 1.1 mrg case '\r':
5058 1.1 mrg if (*pos == '\n')
5059 1.1 mrg pos++;
5060 1.1 mrg /* FALLTHROUGH */
5061 1.1 mrg
5062 1.1 mrg case '\n':
5063 1.1 mrg {
5064 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
5065 1.1 mrg line_count++;
5066 1.1 mrg line_start = pos;
5067 1.1 mrg if (!esc && !is_block)
5068 1.1 mrg {
5069 1.1 mrg bol = true;
5070 1.1 mrg goto done_comment;
5071 1.1 mrg }
5072 1.1 mrg }
5073 1.1 mrg if (!esc)
5074 1.1 mrg star = false;
5075 1.1 mrg esc = false;
5076 1.1 mrg break;
5077 1.1 mrg
5078 1.1 mrg case '*':
5079 1.1 mrg if (pos > peek)
5080 1.1 mrg star = is_block;
5081 1.1 mrg esc = false;
5082 1.1 mrg break;
5083 1.1 mrg
5084 1.1 mrg case '/':
5085 1.1 mrg if (star)
5086 1.1 mrg goto done_comment;
5087 1.1 mrg /* FALLTHROUGH */
5088 1.1 mrg
5089 1.1 mrg default:
5090 1.1 mrg star = false;
5091 1.1 mrg esc = false;
5092 1.1 mrg break;
5093 1.1 mrg }
5094 1.1 mrg }
5095 1.1 mrg if (pos < limit || is_block)
5096 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5097 1.1 mrg "unterminated comment");
5098 1.1 mrg done_comment:
5099 1.1 mrg lwm = pos;
5100 1.1 mrg break;
5101 1.1 mrg }
5102 1.1 mrg
5103 1.1 mrg case '\'':
5104 1.1 mrg if (!CPP_OPTION (pfile, digit_separators))
5105 1.1 mrg goto delimited_string;
5106 1.1 mrg
5107 1.1 mrg /* Possibly a number punctuator. */
5108 1.1 mrg if (!ISIDNUM (*do_peek_next (pos, limit)))
5109 1.1 mrg goto delimited_string;
5110 1.1 mrg
5111 1.1 mrg goto quote_peek;
5112 1.1 mrg
5113 1.1 mrg case '\"':
5114 1.1 mrg if (!CPP_OPTION (pfile, rliterals))
5115 1.1 mrg goto delimited_string;
5116 1.1 mrg
5117 1.1 mrg quote_peek:
5118 1.1 mrg {
5119 1.1 mrg /* For ' see if it's a number punctuator
5120 1.1 mrg \.?<digit>(<digit>|<identifier-nondigit>
5121 1.1 mrg |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5122 1.1 mrg /* For " see if it's a raw string
5123 1.1 mrg {U,L,u,u8}R. This includes CPP_NUMBER detection,
5124 1.1 mrg because that could be 0e+R. */
5125 1.1 mrg const unsigned char *peek = pos - 1;
5126 1.1 mrg bool quote_first = c == '"';
5127 1.1 mrg bool quote_eight = false;
5128 1.1 mrg bool maybe_number_start = false;
5129 1.1 mrg bool want_number = false;
5130 1.1 mrg
5131 1.1 mrg while ((peek = do_peek_prev (peek, lwm)))
5132 1.1 mrg {
5133 1.1 mrg unsigned char p = *peek;
5134 1.1 mrg if (quote_first)
5135 1.1 mrg {
5136 1.1 mrg if (!raw)
5137 1.1 mrg {
5138 1.1 mrg if (p != 'R')
5139 1.1 mrg break;
5140 1.1 mrg raw = true;
5141 1.1 mrg continue;
5142 1.1 mrg }
5143 1.1 mrg
5144 1.1 mrg quote_first = false;
5145 1.1 mrg if (p == 'L' || p == 'U' || p == 'u')
5146 1.1 mrg ;
5147 1.1 mrg else if (p == '8')
5148 1.1 mrg quote_eight = true;
5149 1.1 mrg else
5150 1.1 mrg goto second_raw;
5151 1.1 mrg }
5152 1.1 mrg else if (quote_eight)
5153 1.1 mrg {
5154 1.1 mrg if (p != 'u')
5155 1.1 mrg {
5156 1.1 mrg raw = false;
5157 1.1 mrg break;
5158 1.1 mrg }
5159 1.1 mrg quote_eight = false;
5160 1.1 mrg }
5161 1.1 mrg else if (c == '"')
5162 1.1 mrg {
5163 1.1 mrg second_raw:;
5164 1.1 mrg if (!want_number && ISIDNUM (p))
5165 1.1 mrg {
5166 1.1 mrg raw = false;
5167 1.1 mrg break;
5168 1.1 mrg }
5169 1.1 mrg }
5170 1.1 mrg
5171 1.1 mrg if (ISDIGIT (p))
5172 1.1 mrg maybe_number_start = true;
5173 1.1 mrg else if (p == '.')
5174 1.1 mrg want_number = true;
5175 1.1 mrg else if (ISIDNUM (p))
5176 1.1 mrg maybe_number_start = false;
5177 1.1 mrg else if (p == '+' || p == '-')
5178 1.1 mrg {
5179 1.1 mrg if (const unsigned char *peek_prev
5180 1.1 mrg = do_peek_prev (peek, lwm))
5181 1.1 mrg {
5182 1.1 mrg p = *peek_prev;
5183 1.1 mrg if (p == 'e' || p == 'E'
5184 1.1 mrg || p == 'p' || p == 'P')
5185 1.1 mrg {
5186 1.1 mrg want_number = true;
5187 1.1 mrg maybe_number_start = false;
5188 1.1 mrg }
5189 1.1 mrg else
5190 1.1 mrg break;
5191 1.1 mrg }
5192 1.1 mrg else
5193 1.1 mrg break;
5194 1.1 mrg }
5195 1.1 mrg else if (p == '\'' || p == '\"')
5196 1.1 mrg {
5197 1.1 mrg /* If this is lwm, this must be the end of a
5198 1.1 mrg previous string. So this is a trailing
5199 1.1 mrg literal type, (a) if those are allowed,
5200 1.1 mrg and (b) maybe_start is false. Otherwise
5201 1.1 mrg this must be a CPP_NUMBER because we've
5202 1.1 mrg met another ', and we'd have checked that
5203 1.1 mrg in its own right. */
5204 1.1 mrg if (peek == lwm && CPP_OPTION (pfile, uliterals))
5205 1.1 mrg {
5206 1.1 mrg if (!maybe_number_start && !want_number)
5207 1.1 mrg /* Must be a literal type. */
5208 1.1 mrg raw = false;
5209 1.1 mrg }
5210 1.1 mrg else if (p == '\''
5211 1.1 mrg && CPP_OPTION (pfile, digit_separators))
5212 1.1 mrg maybe_number_start = true;
5213 1.1 mrg break;
5214 1.1 mrg }
5215 1.1 mrg else if (c == '\'')
5216 1.1 mrg break;
5217 1.1 mrg else if (!quote_first && !quote_eight)
5218 1.1 mrg break;
5219 1.1 mrg }
5220 1.1 mrg
5221 1.1 mrg if (maybe_number_start)
5222 1.1 mrg {
5223 1.1 mrg if (c == '\'')
5224 1.1 mrg /* A CPP NUMBER. */
5225 1.1 mrg goto dflt;
5226 1.1 mrg raw = false;
5227 1.1 mrg }
5228 1.1 mrg
5229 1.1 mrg goto delimited_string;
5230 1.1 mrg }
5231 1.1 mrg
5232 1.1 mrg delimited_string:
5233 1.1 mrg {
5234 1.1 mrg /* (Possibly raw) string or char literal. */
5235 1.1 mrg unsigned char end = c;
5236 1.1 mrg int delim_len = -1;
5237 1.1 mrg const unsigned char *delim = NULL;
5238 1.1 mrg location_t sloc = linemap_position_for_column (pfile->line_table,
5239 1.1 mrg pos - line_start);
5240 1.1 mrg int esc = 0;
5241 1.1 mrg
5242 1.1 mrg if (raw)
5243 1.1 mrg {
5244 1.1 mrg /* There can be no line breaks in the delimiter. */
5245 1.1 mrg delim = pos;
5246 1.1 mrg for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5247 1.1 mrg {
5248 1.1 mrg if (delim_len == 16)
5249 1.1 mrg {
5250 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR,
5251 1.1 mrg sloc, 0,
5252 1.1 mrg "raw string delimiter"
5253 1.1 mrg " longer than %d"
5254 1.1 mrg " characters",
5255 1.1 mrg delim_len);
5256 1.1 mrg raw = false;
5257 1.1 mrg pos = delim;
5258 1.1 mrg break;
5259 1.1 mrg }
5260 1.1 mrg if (strchr (") \\\t\v\f\n", c))
5261 1.1 mrg {
5262 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR,
5263 1.1 mrg sloc, 0,
5264 1.1 mrg "invalid character '%c'"
5265 1.1 mrg " in raw string"
5266 1.1 mrg " delimiter", c);
5267 1.1 mrg raw = false;
5268 1.1 mrg pos = delim;
5269 1.1 mrg break;
5270 1.1 mrg }
5271 1.1 mrg if (pos >= limit)
5272 1.1 mrg goto bad_string;
5273 1.1 mrg }
5274 1.1 mrg }
5275 1.1 mrg
5276 1.1 mrg while (pos < limit)
5277 1.1 mrg {
5278 1.1 mrg char c = *pos++;
5279 1.1 mrg switch (c)
5280 1.1 mrg {
5281 1.1 mrg case '\\':
5282 1.1 mrg if (!raw)
5283 1.1 mrg esc++;
5284 1.1 mrg break;
5285 1.1 mrg
5286 1.1 mrg case '\r':
5287 1.1 mrg if (*pos == '\n')
5288 1.1 mrg pos++;
5289 1.1 mrg /* FALLTHROUGH */
5290 1.1 mrg
5291 1.1 mrg case '\n':
5292 1.1 mrg {
5293 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
5294 1.1 mrg line_count++;
5295 1.1 mrg line_start = pos;
5296 1.1 mrg }
5297 1.1 mrg if (esc)
5298 1.1 mrg esc--;
5299 1.1 mrg break;
5300 1.1 mrg
5301 1.1 mrg case ')':
5302 1.1 mrg if (raw
5303 1.1 mrg && pos + delim_len + 1 < limit
5304 1.1 mrg && pos[delim_len] == end
5305 1.1 mrg && !memcmp (delim, pos, delim_len))
5306 1.1 mrg {
5307 1.1 mrg pos += delim_len + 1;
5308 1.1 mrg raw = false;
5309 1.1 mrg goto done_string;
5310 1.1 mrg }
5311 1.1 mrg break;
5312 1.1 mrg
5313 1.1 mrg default:
5314 1.1 mrg if (!raw && !(esc & 1) && c == end)
5315 1.1 mrg goto done_string;
5316 1.1 mrg esc = 0;
5317 1.1 mrg break;
5318 1.1 mrg }
5319 1.1 mrg }
5320 1.1 mrg bad_string:
5321 1.1 mrg cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5322 1.1 mrg "unterminated literal");
5323 1.1 mrg
5324 1.1 mrg done_string:
5325 1.1 mrg raw = false;
5326 1.1 mrg lwm = pos - 1;
5327 1.1 mrg }
5328 1.1 mrg goto dflt;
5329 1.1 mrg
5330 1.1 mrg case '_':
5331 1.1 mrg case 'e':
5332 1.1 mrg case 'i':
5333 1.1 mrg case 'm':
5334 1.1 mrg if (bol && module_p && !pfile->state.skipping
5335 1.1 mrg && do_peek_module (pfile, c, pos, limit))
5336 1.1 mrg {
5337 1.1 mrg /* We've seen the start of a module control line.
5338 1.1 mrg Start up the tokenizer. */
5339 1.1 mrg pos--; /* Backup over the first character. */
5340 1.1 mrg
5341 1.1 mrg /* Backup over whitespace to start of line. */
5342 1.1 mrg while (pos > line_start
5343 1.1 mrg && (pos[-1] == ' ' || pos[-1] == '\t'))
5344 1.1 mrg pos--;
5345 1.1 mrg
5346 1.1 mrg if (pos > base)
5347 1.1 mrg cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5348 1.1 mrg
5349 1.1 mrg /* Prep things for directive handling. */
5350 1.1 mrg buffer->next_line = pos;
5351 1.1 mrg buffer->need_line = true;
5352 1.1 mrg
5353 1.1 mrg /* Now get tokens until the PRAGMA_EOL. */
5354 1.1 mrg do
5355 1.1 mrg {
5356 1.1 mrg location_t spelling;
5357 1.1 mrg const cpp_token *tok
5358 1.1 mrg = cpp_get_token_with_location (pfile, &spelling);
5359 1.1 mrg
5360 1.1 mrg gcc_assert (pfile->state.in_deferred_pragma
5361 1.1 mrg || tok->type == CPP_PRAGMA_EOL);
5362 1.1 mrg cb (pfile, CPP_DO_token, data, tok, spelling);
5363 1.1 mrg }
5364 1.1 mrg while (pfile->state.in_deferred_pragma);
5365 1.1 mrg
5366 1.1 mrg if (pfile->buffer->next_line < pfile->buffer->rlimit)
5367 1.1 mrg cb (pfile, CPP_DO_location, data,
5368 1.1 mrg pfile->line_table->highest_line);
5369 1.1 mrg
5370 1.1 mrg pfile->mi_valid = false;
5371 1.1 mrg goto restart;
5372 1.1 mrg }
5373 1.1 mrg goto dflt;
5374 1.1 mrg
5375 1.1 mrg default:
5376 1.1 mrg dflt:
5377 1.1 mrg bol = false;
5378 1.1 mrg pfile->mi_valid = false;
5379 1.1 mrg break;
5380 1.1 mrg }
5381 1.1 mrg }
5382 1.1 mrg
5383 1.1 mrg if (buffer->rlimit > base && !pfile->state.skipping)
5384 1.1 mrg {
5385 1.1 mrg const unsigned char *limit = buffer->rlimit;
5386 1.1 mrg /* If the file was not newline terminated, add rlimit, which is
5387 1.1 mrg guaranteed to point to a newline, to the end of our range. */
5388 1.1 mrg if (limit[-1] != '\n')
5389 1.1 mrg {
5390 1.1 mrg limit++;
5391 1.1 mrg CPP_INCREMENT_LINE (pfile, 0);
5392 1.1 mrg line_count++;
5393 1.1 mrg }
5394 1.1 mrg cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5395 1.1 mrg }
5396 1.1 mrg
5397 1.1 mrg _cpp_pop_buffer (pfile);
5398 1.1 mrg }
5399 1.1 mrg while (pfile->buffer);
5400 }
5401