Home | History | Annotate | Line # | Download | only in src
      1 /* xgettext PHP backend.
      2    Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
      3 
      4    This file was written by Bruno Haible <bruno (at) clisp.org>, 2002.
      5 
      6    This program is free software; you can redistribute it and/or modify
      7    it under the terms of the GNU General Public License as published by
      8    the Free Software Foundation; either version 2, or (at your option)
      9    any later version.
     10 
     11    This program is distributed in the hope that it will be useful,
     12    but WITHOUT ANY WARRANTY; without even the implied warranty of
     13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14    GNU General Public License for more details.
     15 
     16    You should have received a copy of the GNU General Public License
     17    along with this program; if not, write to the Free Software Foundation,
     18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
     19 
     20 #ifdef HAVE_CONFIG_H
     21 # include "config.h"
     22 #endif
     23 
     24 #include <errno.h>
     25 #include <stdbool.h>
     26 #include <stdio.h>
     27 #include <stdlib.h>
     28 
     29 #include "message.h"
     30 #include "xgettext.h"
     31 #include "x-php.h"
     32 #include "error.h"
     33 #include "xalloc.h"
     34 #include "exit.h"
     35 #include "gettext.h"
     36 
     37 #define _(s) gettext(s)
     38 
     39 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
     40 
     41 
     42 /* The PHP syntax is defined in phpdoc/manual/langref.html.
     43    See also php-4.1.0/Zend/zend_language_scanner.l.
     44    Note that variable and function names can contain bytes in the range
     45    0x7f..0xff; see
     46      http://www.php.net/manual/en/language.variables.php
     47      http://www.php.net/manual/en/language.functions.php  */
     48 
     49 
     50 /* ====================== Keyword set customization.  ====================== */
     51 
/* If true extract all strings.  Set by x_php_extract_all ().  */
static bool extract_all = false;

/* Table of keywords.  It is initialized lazily with hash_init () and
   filled through insert_keyword_callshape () by x_php_keyword ().  */
static hash_table keywords;
/* True as long as init_keywords () still has to register the built-in
   keywords.  Cleared by x_php_keyword (NULL) and by init_keywords ()
   once the built-in keywords have been registered.  */
static bool default_keywords = true;
     57 
     58 
     59 void
     60 x_php_extract_all ()
     61 {
     62   extract_all = true;
     63 }
     64 
     65 
     66 void
     67 x_php_keyword (const char *name)
     68 {
     69   if (name == NULL)
     70     default_keywords = false;
     71   else
     72     {
     73       const char *end;
     74       struct callshape shape;
     75       const char *colon;
     76 
     77       if (keywords.table == NULL)
     78 	hash_init (&keywords, 100);
     79 
     80       split_keywordspec (name, &end, &shape);
     81 
     82       /* The characters between name and end should form a valid C identifier.
     83 	 A colon means an invalid parse in split_keywordspec().  */
     84       colon = strchr (name, ':');
     85       if (colon == NULL || colon >= end)
     86 	insert_keyword_callshape (&keywords, name, end - name, &shape);
     87     }
     88 }
     89 
     90 /* Finish initializing the keywords hash table.
     91    Called after argument processing, before each file is processed.  */
     92 static void
     93 init_keywords ()
     94 {
     95   if (default_keywords)
     96     {
     97       /* When adding new keywords here, also update the documentation in
     98 	 xgettext.texi!  */
     99       x_php_keyword ("_");
    100       x_php_keyword ("gettext");
    101       x_php_keyword ("dgettext:2");
    102       x_php_keyword ("dcgettext:2");
    103       /* The following were added in PHP 4.2.0.  */
    104       x_php_keyword ("ngettext:1,2");
    105       x_php_keyword ("dngettext:2,3");
    106       x_php_keyword ("dcngettext:2,3");
    107       default_keywords = false;
    108     }
    109 }
    110 
/* Register the flag specifications of the PHP backend.
   Every string of the table is handed unchanged, and in table order, to
   xgettext_record_flag ().  NOTE(review): the strings look like
   FUNCTION:ARGNUM:FLAG triples; confirm the exact syntax against
   xgettext_record_flag ().  */
void
init_flag_table_php (void)
{
  static const char *const flag_specs[] =
    {
      "_:1:pass-php-format",
      "gettext:1:pass-php-format",
      "dgettext:2:pass-php-format",
      "dcgettext:2:pass-php-format",
      "ngettext:1:pass-php-format",
      "ngettext:2:pass-php-format",
      "dngettext:2:pass-php-format",
      "dngettext:3:pass-php-format",
      "dcngettext:2:pass-php-format",
      "dcngettext:3:pass-php-format",
      "sprintf:1:php-format",
      "printf:1:php-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
    127 
    128 
    129 /* ======================== Reading of characters.  ======================== */
    130 
    131 
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number is incremented by phase1_getc () for every newline it
   returns and decremented by phase1_ungetc () when a newline is pushed
   back.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  Read with getc () by phase1_getc ().  */
static FILE *fp;


/* 1. line_number handling.  */

/* Stack of characters pushed back by phase1_ungetc ().  phase1_getc ()
   pops from the top, so the character pushed last is returned first.
   phase1_pushback_length is the number of characters currently stored.  */
static unsigned char phase1_pushback[2];
static int phase1_pushback_length;
    147 
    148 static int
    149 phase1_getc ()
    150 {
    151   int c;
    152 
    153   if (phase1_pushback_length)
    154     c = phase1_pushback[--phase1_pushback_length];
    155   else
    156     {
    157       c = getc (fp);
    158 
    159       if (c == EOF)
    160 	{
    161 	  if (ferror (fp))
    162 	    error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
    163 		   real_file_name);
    164 	  return EOF;
    165 	}
    166     }
    167 
    168   if (c == '\n')
    169     line_number++;
    170 
    171   return c;
    172 }
    173 
    174 /* Supports 2 characters of pushback.  */
    175 static void
    176 phase1_ungetc (int c)
    177 {
    178   if (c != EOF)
    179     {
    180       if (c == '\n')
    181 	--line_number;
    182 
    183       if (phase1_pushback_length == SIZEOF (phase1_pushback))
    184 	abort ();
    185       phase1_pushback[phase1_pushback_length++] = c;
    186     }
    187 }
    188 
    189 
/* 2. Ignore HTML sections.  They are equivalent to PHP echo commands and
   therefore don't contain translatable strings.  */

/* Return true if C is one of the whitespace characters that may separate
   the parts of a < script language = php > tag.  */
static inline bool
skip_html_is_space (int c)
{
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

/* Starting with the already read character C, consume whitespace and
   return the first character that is not whitespace (possibly EOF).  */
static int
skip_html_skip_spaces (int c)
{
  while (skip_html_is_space (c))
    c = phase1_getc ();
  return c;
}

/* Match a word against the input.  C is the already read character that
   must match the first letter; the following letters are read with
   phase1_getc ().  LOWER and UPPER are two spellings of the word, of equal
   and nonzero length; at each position either spelling is accepted.
   Return true if the whole word matched; nothing beyond its last letter
   has been read then.  On the first mismatch, push the offending
   character back and return false.  */
static bool
skip_html_match (int c, const char *lower, const char *upper)
{
  size_t i;

  for (i = 0; ; i++)
    {
      if (c != (unsigned char) lower[i] && c != (unsigned char) upper[i])
        {
          phase1_ungetc (c);
          return false;
        }
      if (lower[i + 1] == '\0')
        return true;
      c = phase1_getc ();
    }
}

/* Try to recognize the remainder of one of the tags
     < script language = php >
     < script language = "php" >
     < script language = 'php' >
   after the '<' has been consumed.  C is the character that followed the
   '<'.  "script" and "language" are matched case-insensitively, "php"
   only in lower case.
   Return true if the complete tag, including '>', was consumed.
   Otherwise push back the first character that did not fit (characters
   consumed before it are dropped, they are HTML text anyway) and return
   false.  */
static bool
skip_html_script_tag (int c)
{
  c = skip_html_skip_spaces (c);
  if (!skip_html_match (c, "script", "SCRIPT"))
    return false;

  /* At least one whitespace character is required after "script".  */
  c = phase1_getc ();
  if (!skip_html_is_space (c))
    {
      phase1_ungetc (c);
      return false;
    }
  c = skip_html_skip_spaces (phase1_getc ());
  if (!skip_html_match (c, "language", "LANGUAGE"))
    return false;

  c = skip_html_skip_spaces (phase1_getc ());
  if (c != '=')
    {
      phase1_ungetc (c);
      return false;
    }

  c = skip_html_skip_spaces (phase1_getc ());
  if (c == '"' || c == '\'')
    {
      /* Quoted value: the closing quote must be the same as the opening
         one and must follow "php" immediately.  */
      int quote = c;

      if (!skip_html_match (phase1_getc (), "php", "php"))
        return false;
      c = phase1_getc ();
      if (c != quote)
        {
          phase1_ungetc (c);
          return false;
        }
    }
  else if (!skip_html_match (c, "php", "php"))
    return false;

  c = skip_html_skip_spaces (phase1_getc ());
  if (c != '>')
    {
      phase1_ungetc (c);
      return false;
    }
  return true;
}

/* Skip input up to and including the next tag that enters PHP mode, or up
   to EOF if there is none.  Recognized tags:
     <?  <?=   (this also covers <?php: the "php" is left in the input)
     <%  <%=
     < script language = php >, with optional quotes around php.  */
static void
skip_html (void)
{
  for (;;)
    {
      int c = phase1_getc ();

      if (c == EOF)
        return;
      if (c != '<')
        continue;

      c = phase1_getc ();
      if (c == EOF)
        return;

      if (c == '?' || c == '%')
        {
          /* <?php is the normal way to enter PHP mode.  <? and <?= as well
             as <% and <%= are recognized by PHP depending on a
             configuration setting.  An '=' directly after the tag belongs
             to it; any other character is PHP code already.  */
          int next = phase1_getc ();

          if (next != '=')
            phase1_ungetc (next);
          return;
        }

      if (c == '<')
        {
          /* The second '<' may itself start a tag.  */
          phase1_ungetc (c);
          continue;
        }

      /* The script tag forms are always recognized.  */
      if (skip_html_script_tag (c))
        return;
    }
}
    434 
    435 #if 0
    436 
/* One character of pushback for phase2_getc (), filled by
   phase2_ungetc ().  Like the rest of phase 2, these definitions sit
   inside the surrounding #if 0 and are therefore not compiled.  */
static unsigned char phase2_pushback[1];
static int phase2_pushback_length;

/* Return the next character of the PHP code, treating an HTML section as
   a single space: when one of the terminators  ?>  %>  < / script >  is
   seen, skip_html () is called and ' ' is returned.
   A pushed back character, if any, is returned first.
   Otherwise the character read by phase1_getc () is returned.  */
static int
phase2_getc ()
{
  int c;

  if (phase2_pushback_length)
    return phase2_pushback[--phase2_pushback_length];

  c = phase1_getc ();
  switch (c)
    {
    case '?':
    case '%':
      {
	int c2 = phase1_getc ();
	if (c2 == '>')
	  {
	    /* ?> and %> terminate PHP mode and switch back to HTML mode.  */
	    skip_html ();
	    return ' ';
	  }
	/* Not a terminator: put the lookahead back and return C itself.  */
	phase1_ungetc (c2);
      }
      break;

    case '<':
      {
	int c2 = phase1_getc ();

	/* < / script > terminates PHP mode and switches back to HTML mode.  */
	while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
	  c2 = phase1_getc ();
	if (c2 == '/')
	  {
	    do
	      c2 = phase1_getc ();
	    while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
	    /* "script" is matched letter by letter, in either case.  */
	    if (c2 == 's' || c2 == 'S')
	      {
		c2 = phase1_getc ();
		if (c2 == 'c' || c2 == 'C')
		  {
		    c2 = phase1_getc ();
		    if (c2 == 'r' || c2 == 'R')
		      {
			c2 = phase1_getc ();
			if (c2 == 'i' || c2 == 'I')
			  {
			    c2 = phase1_getc ();
			    if (c2 == 'p' || c2 == 'P')
			      {
				c2 = phase1_getc ();
				if (c2 == 't' || c2 == 'T')
				  {
				    do
				      c2 = phase1_getc ();
				    while (c2 == ' ' || c2 == '\t'
					   || c2 == '\n' || c2 == '\r');
				    if (c2 == '>')
				      {
					skip_html ();
					return ' ';
				      }
				  }
			      }
			  }
		      }
		  }
	      }
	  }
	/* NOTE(review): only the last character read is pushed back here.
	   Whitespace, '/' and letters consumed by a partial match are not
	   restored, so they never reach the caller.  */
	phase1_ungetc (c2);
      }
      break;
    }

  return c;
}
    517 
    518 static void
    519 phase2_ungetc (int c)
    520 {
    521   if (c != EOF)
    522     {
    523       if (phase2_pushback_length == SIZEOF (phase2_pushback))
    524 	abort ();
    525       phase2_pushback[phase2_pushback_length++] = c;
    526     }
    527 }
    528 
    529 #endif
    530 
    531 
/* Accumulating comments.  */

/* Text of the comment line being accumulated.  The storage is grown with
   xrealloc () by comment_add () and comment_line_end ().  */
static char *buffer;
/* Number of bytes allocated for buffer.  */
static size_t bufmax;
/* Number of characters currently stored in buffer; reset to 0 by
   comment_start ().  */
static size_t buflen;
    537 
    538 static inline void
    539 comment_start ()
    540 {
    541   buflen = 0;
    542 }
    543 
    544 static inline void
    545 comment_add (int c)
    546 {
    547   if (buflen >= bufmax)
    548     {
    549       bufmax = 2 * bufmax + 10;
    550       buffer = xrealloc (buffer, bufmax);
    551     }
    552   buffer[buflen++] = c;
    553 }
    554 
    555 static inline void
    556 comment_line_end (size_t chars_to_remove)
    557 {
    558   buflen -= chars_to_remove;
    559   while (buflen >= 1
    560 	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
    561     --buflen;
    562   if (chars_to_remove == 0 && buflen >= bufmax)
    563     {
    564       bufmax = 2 * bufmax + 10;
    565       buffer = xrealloc (buffer, bufmax);
    566     }
    567   buffer[buflen] = '\0';
    568   savable_comment_add (buffer);
    569 }
    570 
    571 
    572 /* 3. Replace each comment that is not inside a string literal with a
    573    space character.  We need to remember the comment for later, because
    574    it may be attached to a keyword string.  */
    575 
/* These are for tracking whether comments count as immediately before
   keyword.
   last_comment_line is set by phase3_getc () after each comment it
   reads.  last_non_comment_line is set by x_php_lex () to the line of
   each non-whitespace character it processes; at a newline x_php_lex ()
   calls savable_comment_reset () when
   last_non_comment_line > last_comment_line.  */
static int last_comment_line;
static int last_non_comment_line;

/* One character of pushback, returned first by phase3_getc ().  It is
   filled by phase3_ungetc (), which is only compiled when the macro
   'unused' is defined.  */
static unsigned char phase3_pushback[1];
static int phase3_pushback_length;
    583 
    584 static int
    585 phase3_getc ()
    586 {
    587   int lineno;
    588   int c;
    589 
    590   if (phase3_pushback_length)
    591     return phase3_pushback[--phase3_pushback_length];
    592 
    593   c = phase1_getc ();
    594 
    595   if (c == '#')
    596     {
    597       /* sh comment.  */
    598       bool last_was_qmark = false;
    599 
    600       comment_start ();
    601       lineno = line_number;
    602       for (;;)
    603 	{
    604 	  c = phase1_getc ();
    605 	  if (c == '\n' || c == EOF)
    606 	    {
    607 	      comment_line_end (0);
    608 	      break;
    609 	    }
    610 	  if (last_was_qmark && c == '>')
    611 	    {
    612 	      comment_line_end (1);
    613 	      skip_html ();
    614 	      break;
    615 	    }
    616 	  /* We skip all leading white space, but not EOLs.  */
    617 	  if (!(buflen == 0 && (c == ' ' || c == '\t')))
    618 	    comment_add (c);
    619 	  last_was_qmark = (c == '?' || c == '%');
    620 	}
    621       last_comment_line = lineno;
    622       return '\n';
    623     }
    624   else if (c == '/')
    625     {
    626       c = phase1_getc ();
    627 
    628       switch (c)
    629 	{
    630 	default:
    631 	  phase1_ungetc (c);
    632 	  return '/';
    633 
    634 	case '*':
    635 	  {
    636 	    /* C comment.  */
    637 	    bool last_was_star;
    638 
    639 	    comment_start ();
    640 	    lineno = line_number;
    641 	    last_was_star = false;
    642 	    for (;;)
    643 	      {
    644 		c = phase1_getc ();
    645 		if (c == EOF)
    646 		  break;
    647 		/* We skip all leading white space, but not EOLs.  */
    648 		if (buflen == 0 && (c == ' ' || c == '\t'))
    649 		  continue;
    650 		comment_add (c);
    651 		switch (c)
    652 		  {
    653 		  case '\n':
    654 		    comment_line_end (1);
    655 		    comment_start ();
    656 		    lineno = line_number;
    657 		    last_was_star = false;
    658 		    continue;
    659 
    660 		  case '*':
    661 		    last_was_star = true;
    662 		    continue;
    663 
    664 		  case '/':
    665 		    if (last_was_star)
    666 		      {
    667 			comment_line_end (2);
    668 			break;
    669 		      }
    670 		    /* FALLTHROUGH */
    671 
    672 		  default:
    673 		    last_was_star = false;
    674 		    continue;
    675 		  }
    676 		break;
    677 	      }
    678 	    last_comment_line = lineno;
    679 	    return ' ';
    680 	  }
    681 
    682 	case '/':
    683 	  {
    684 	    /* C++ comment.  */
    685 	    bool last_was_qmark = false;
    686 
    687 	    comment_start ();
    688 	    lineno = line_number;
    689 	    for (;;)
    690 	      {
    691 		c = phase1_getc ();
    692 		if (c == '\n' || c == EOF)
    693 		  {
    694 		    comment_line_end (0);
    695 		    break;
    696 		  }
    697 		if (last_was_qmark && c == '>')
    698 		  {
    699 		    comment_line_end (1);
    700 		    skip_html ();
    701 		    break;
    702 		  }
    703 		/* We skip all leading white space, but not EOLs.  */
    704 		if (!(buflen == 0 && (c == ' ' || c == '\t')))
    705 		  comment_add (c);
    706 		last_was_qmark = (c == '?' || c == '%');
    707 	      }
    708 	    last_comment_line = lineno;
    709 	    return '\n';
    710 	  }
    711 	}
    712     }
    713   else
    714     return c;
    715 }
    716 
    717 #ifdef unused
    718 static void
    719 phase3_ungetc (int c)
    720 {
    721   if (c != EOF)
    722     {
    723       if (phase3_pushback_length == SIZEOF (phase3_pushback))
    724 	abort ();
    725       phase3_pushback[phase3_pushback_length++] = c;
    726     }
    727 }
    728 #endif
    729 
    730 
    731 /* ========================== Reading of tokens.  ========================== */
    732 
    733 
/* The kinds of tokens produced by x_php_lex ().  */
typedef enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_string_literal,    /* "abc" */
  token_type_symbol,            /* symbol, number */
  token_type_other              /* misc. operator */
}
token_type_ty;

/* A token: its kind, its text where applicable, and the line number
   recorded for it by x_php_lex ().  */
typedef struct token_ty
{
  token_type_ty type;
  char *string;         /* for token_type_string_literal, token_type_symbol */
  int line_number;
}
token_ty;
    753 
    754 
    755 /* Free the memory pointed to by a 'struct token_ty'.  */
    756 static inline void
    757 free_token (token_ty *tp)
    758 {
    759   if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
    760     free (tp->string);
    761 }
    762 
    763 
    764 /* 4. Combine characters into tokens.  Discard whitespace.  */
    765 
    766 static void
    767 x_php_lex (token_ty *tp)
    768 {
    769   static char *buffer;
    770   static int bufmax;
    771   int bufpos;
    772   int c;
    773 
    774   tp->string = NULL;
    775 
    776   for (;;)
    777     {
    778       tp->line_number = line_number;
    779       c = phase3_getc ();
    780       switch (c)
    781 	{
    782 	case EOF:
    783 	  tp->type = token_type_eof;
    784 	  return;
    785 
    786 	case '\n':
    787 	  if (last_non_comment_line > last_comment_line)
    788 	    savable_comment_reset ();
    789 	  /* FALLTHROUGH */
    790 	case ' ':
    791 	case '\t':
    792 	case '\r':
    793 	  /* Ignore whitespace.  */
    794 	  continue;
    795 	}
    796 
    797       last_non_comment_line = tp->line_number;
    798 
    799       switch (c)
    800 	{
    801 	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
    802 	case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
    803 	case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
    804 	case 'V': case 'W': case 'X': case 'Y': case 'Z':
    805 	case '_':
    806 	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
    807 	case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
    808 	case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
    809 	case 'v': case 'w': case 'x': case 'y': case 'z':
    810 	case 127: case 128: case 129: case 130: case 131: case 132: case 133:
    811 	case 134: case 135: case 136: case 137: case 138: case 139: case 140:
    812 	case 141: case 142: case 143: case 144: case 145: case 146: case 147:
    813 	case 148: case 149: case 150: case 151: case 152: case 153: case 154:
    814 	case 155: case 156: case 157: case 158: case 159: case 160: case 161:
    815 	case 162: case 163: case 164: case 165: case 166: case 167: case 168:
    816 	case 169: case 170: case 171: case 172: case 173: case 174: case 175:
    817 	case 176: case 177: case 178: case 179: case 180: case 181: case 182:
    818 	case 183: case 184: case 185: case 186: case 187: case 188: case 189:
    819 	case 190: case 191: case 192: case 193: case 194: case 195: case 196:
    820 	case 197: case 198: case 199: case 200: case 201: case 202: case 203:
    821 	case 204: case 205: case 206: case 207: case 208: case 209: case 210:
    822 	case 211: case 212: case 213: case 214: case 215: case 216: case 217:
    823 	case 218: case 219: case 220: case 221: case 222: case 223: case 224:
    824 	case 225: case 226: case 227: case 228: case 229: case 230: case 231:
    825 	case 232: case 233: case 234: case 235: case 236: case 237: case 238:
    826 	case 239: case 240: case 241: case 242: case 243: case 244: case 245:
    827 	case 246: case 247: case 248: case 249: case 250: case 251: case 252:
    828 	case 253: case 254: case 255:
    829 	  bufpos = 0;
    830 	  for (;;)
    831 	    {
    832 	      if (bufpos >= bufmax)
    833 		{
    834 		  bufmax = 2 * bufmax + 10;
    835 		  buffer = xrealloc (buffer, bufmax);
    836 		}
    837 	      buffer[bufpos++] = c;
    838 	      c = phase1_getc ();
    839 	      switch (c)
    840 		{
    841 		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    842 		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    843 		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    844 		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    845 		case 'Y': case 'Z':
    846 		case '_':
    847 		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    848 		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    849 		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    850 		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    851 		case 'y': case 'z':
    852 		case '0': case '1': case '2': case '3': case '4':
    853 		case '5': case '6': case '7': case '8': case '9':
    854 		case 127: case 128: case 129: case 130: case 131: case 132:
    855 		case 133: case 134: case 135: case 136: case 137: case 138:
    856 		case 139: case 140: case 141: case 142: case 143: case 144:
    857 		case 145: case 146: case 147: case 148: case 149: case 150:
    858 		case 151: case 152: case 153: case 154: case 155: case 156:
    859 		case 157: case 158: case 159: case 160: case 161: case 162:
    860 		case 163: case 164: case 165: case 166: case 167: case 168:
    861 		case 169: case 170: case 171: case 172: case 173: case 174:
    862 		case 175: case 176: case 177: case 178: case 179: case 180:
    863 		case 181: case 182: case 183: case 184: case 185: case 186:
    864 		case 187: case 188: case 189: case 190: case 191: case 192:
    865 		case 193: case 194: case 195: case 196: case 197: case 198:
    866 		case 199: case 200: case 201: case 202: case 203: case 204:
    867 		case 205: case 206: case 207: case 208: case 209: case 210:
    868 		case 211: case 212: case 213: case 214: case 215: case 216:
    869 		case 217: case 218: case 219: case 220: case 221: case 222:
    870 		case 223: case 224: case 225: case 226: case 227: case 228:
    871 		case 229: case 230: case 231: case 232: case 233: case 234:
    872 		case 235: case 236: case 237: case 238: case 239: case 240:
    873 		case 241: case 242: case 243: case 244: case 245: case 246:
    874 		case 247: case 248: case 249: case 250: case 251: case 252:
    875 		case 253: case 254: case 255:
    876 		  continue;
    877 
    878 		default:
    879 		  phase1_ungetc (c);
    880 		  break;
    881 		}
    882 	      break;
    883 	    }
    884 	  if (bufpos >= bufmax)
    885 	    {
    886 	      bufmax = 2 * bufmax + 10;
    887 	      buffer = xrealloc (buffer, bufmax);
    888 	    }
    889 	  buffer[bufpos] = 0;
    890 	  tp->string = xstrdup (buffer);
    891 	  tp->type = token_type_symbol;
    892 	  return;
    893 
    894 	case '\'':
    895 	  /* Single-quoted string literal.  */
    896 	  bufpos = 0;
    897 	  for (;;)
    898 	    {
    899 	      c = phase1_getc ();
    900 	      if (c == EOF || c == '\'')
    901 		break;
    902 	      if (c == '\\')
    903 		{
    904 		  c = phase1_getc ();
    905 		  if (c != '\\' && c != '\'')
    906 		    {
    907 		      phase1_ungetc (c);
    908 		      c = '\\';
    909 		    }
    910 		}
    911 	      if (bufpos >= bufmax)
    912 		{
    913 		  bufmax = 2 * bufmax + 10;
    914 		  buffer = xrealloc (buffer, bufmax);
    915 		}
    916 	      buffer[bufpos++] = c;
    917 	    }
    918 	  if (bufpos >= bufmax)
    919 	    {
    920 	      bufmax = 2 * bufmax + 10;
    921 	      buffer = xrealloc (buffer, bufmax);
    922 	    }
    923 	  buffer[bufpos] = 0;
    924 	  tp->type = token_type_string_literal;
    925 	  tp->string = xstrdup (buffer);
    926 	  return;
    927 
    928 	case '"':
    929 	  /* Double-quoted string literal.  */
    930 	  tp->type = token_type_string_literal;
    931 	  bufpos = 0;
    932 	  for (;;)
    933 	    {
    934 	      c = phase1_getc ();
    935 	      if (c == EOF || c == '"')
    936 		break;
    937 	      if (c == '$')
    938 		{
    939 		  c = phase1_getc ();
    940 		  if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
    941 		      || c == '_' || c == '{' || c >= 0x7f)
    942 		    {
    943 		      /* String with variables.  */
    944 		      tp->type = token_type_other;
    945 		      continue;
    946 		    }
    947 		  phase1_ungetc (c);
    948 		  c = '$';
    949 		}
    950 	      if (c == '{')
    951 		{
    952 		  c = phase1_getc ();
    953 		  if (c == '$')
    954 		    {
    955 		      /* String with expressions.  */
    956 		      tp->type = token_type_other;
    957 		      continue;
    958 		    }
    959 		  phase1_ungetc (c);
    960 		  c = '{';
    961 		}
    962 	      if (c == '\\')
    963 		{
    964 		  int n, j;
    965 
    966 		  c = phase1_getc ();
    967 		  switch (c)
    968 		    {
    969 		    case '"':
    970 		    case '\\':
    971 		    case '$':
    972 		      break;
    973 
    974 		    case '0': case '1': case '2': case '3':
    975 		    case '4': case '5': case '6': case '7':
    976 		      n = 0;
    977 		      for (j = 0; j < 3; ++j)
    978 			{
    979 			  n = n * 8 + c - '0';
    980 			  c = phase1_getc ();
    981 			  switch (c)
    982 			    {
    983 			    default:
    984 			      break;
    985 
    986 			    case '0': case '1': case '2': case '3':
    987 			    case '4': case '5': case '6': case '7':
    988 			      continue;
    989 			    }
    990 			  break;
    991 			}
    992 		      phase1_ungetc (c);
    993 		      c = n;
    994 		      break;
    995 
    996 		    case 'x':
    997 		      n = 0;
    998 		      for (j = 0; j < 2; ++j)
    999 			{
   1000 			  c = phase1_getc ();
   1001 			  switch (c)
   1002 			    {
   1003 			    case '0': case '1': case '2': case '3': case '4':
   1004 			    case '5': case '6': case '7': case '8': case '9':
   1005 			      n = n * 16 + c - '0';
   1006 			      break;
   1007 			    case 'A': case 'B': case 'C': case 'D': case 'E':
   1008 			    case 'F':
   1009 			      n = n * 16 + 10 + c - 'A';
   1010 			      break;
   1011 			    case 'a': case 'b': case 'c': case 'd': case 'e':
   1012 			    case 'f':
   1013 			      n = n * 16 + 10 + c - 'a';
   1014 			      break;
   1015 			    default:
   1016 			      phase1_ungetc (c);
   1017 			      c = 0;
   1018 			      break;
   1019 			    }
   1020 			  if (c == 0)
   1021 			    break;
   1022 			}
   1023 		      if (j == 0)
   1024 			{
   1025 			  phase1_ungetc ('x');
   1026 			  c = '\\';
   1027 			}
   1028 		      else
   1029 			c = n;
   1030 		      break;
   1031 
   1032 		    case 'n':
   1033 		      c = '\n';
   1034 		      break;
   1035 		    case 't':
   1036 		      c = '\t';
   1037 		      break;
   1038 		    case 'r':
   1039 		      c = '\r';
   1040 		      break;
   1041 
   1042 		    default:
   1043 		      phase1_ungetc (c);
   1044 		      c = '\\';
   1045 		      break;
   1046 		    }
   1047 		}
   1048 	      if (bufpos >= bufmax)
   1049 		{
   1050 		  bufmax = 2 * bufmax + 10;
   1051 		  buffer = xrealloc (buffer, bufmax);
   1052 		}
   1053 	      buffer[bufpos++] = c;
   1054 	    }
   1055 	  if (bufpos >= bufmax)
   1056 	    {
   1057 	      bufmax = 2 * bufmax + 10;
   1058 	      buffer = xrealloc (buffer, bufmax);
   1059 	    }
   1060 	  buffer[bufpos] = 0;
   1061 	  if (tp->type == token_type_string_literal)
   1062 	    tp->string = xstrdup (buffer);
   1063 	  return;
   1064 
   1065 	case '?':
   1066 	case '%':
   1067 	  {
   1068 	    int c2 = phase1_getc ();
   1069 	    if (c2 == '>')
   1070 	      {
   1071 		/* ?> and %> terminate PHP mode and switch back to HTML
   1072 		   mode.  */
   1073 		skip_html ();
   1074 	      }
   1075 	    else
   1076 	      phase1_ungetc (c2);
   1077 	    tp->type = token_type_other;
   1078 	    return;
   1079 	  }
   1080 
   1081 	case '(':
   1082 	  tp->type = token_type_lparen;
   1083 	  return;
   1084 
   1085 	case ')':
   1086 	  tp->type = token_type_rparen;
   1087 	  return;
   1088 
   1089 	case ',':
   1090 	  tp->type = token_type_comma;
   1091 	  return;
   1092 
   1093 	case '<':
   1094 	  {
   1095 	    int c2 = phase1_getc ();
   1096 	    if (c2 == '<')
   1097 	      {
   1098 		int c3 = phase1_getc ();
   1099 		if (c3 == '<')
   1100 		  {
   1101 		    /* Start of here document.
   1102 		       Parse whitespace, then label, then newline.  */
   1103 		    do
   1104 		      c = phase3_getc ();
   1105 		    while (c == ' ' || c == '\t' || c == '\n' || c == '\r');
   1106 
   1107 		    bufpos = 0;
   1108 		    do
   1109 		      {
   1110 			if (bufpos >= bufmax)
   1111 			  {
   1112 			    bufmax = 2 * bufmax + 10;
   1113 			    buffer = xrealloc (buffer, bufmax);
   1114 			  }
   1115 			buffer[bufpos++] = c;
   1116 			c = phase3_getc ();
   1117 		      }
   1118 		    while (c != EOF && c != '\n' && c != '\r');
   1119 		    /* buffer[0..bufpos-1] now contains the label.  */
   1120 
   1121 		    /* Now skip the here document.  */
   1122 		    for (;;)
   1123 		      {
   1124 			c = phase1_getc ();
   1125 			if (c == EOF)
   1126 			  break;
   1127 			if (c == '\n' || c == '\r')
   1128 			  {
   1129 			    int bufidx = 0;
   1130 
   1131 			    while (bufidx < bufpos)
   1132 			      {
   1133 				c = phase1_getc ();
   1134 				if (c == EOF)
   1135 				  break;
   1136 				if (c != buffer[bufidx])
   1137 				  {
   1138 				    phase1_ungetc (c);
   1139 				    break;
   1140 				  }
   1141 				bufidx++;
   1142 			      }
   1143 			    if (bufidx == bufpos)
   1144 			      {
   1145 				c = phase1_getc ();
   1146 				if (c != ';')
   1147 				  phase1_ungetc (c);
   1148 				c = phase1_getc ();
   1149 				if (c == '\n' || c == '\r')
   1150 				  break;
   1151 			      }
   1152 			  }
   1153 		      }
   1154 
   1155 		    /* FIXME: Ideally we should turn the here document into a
   1156 		       string literal if it didn't contain $ substitution.  And
   1157 		       we should also respect backslash escape sequences like
   1158 		       in double-quoted strings.  */
   1159 		    tp->type = token_type_other;
   1160 		    return;
   1161 		  }
   1162 		phase1_ungetc (c3);
   1163 	      }
   1164 
   1165 	    /* < / script > terminates PHP mode and switches back to HTML
   1166 	       mode.  */
   1167 	    while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
   1168 	      c2 = phase1_getc ();
   1169 	    if (c2 == '/')
   1170 	      {
   1171 		do
   1172 		  c2 = phase1_getc ();
   1173 		while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
   1174 		if (c2 == 's' || c2 == 'S')
   1175 		  {
   1176 		    c2 = phase1_getc ();
   1177 		    if (c2 == 'c' || c2 == 'C')
   1178 		      {
   1179 			c2 = phase1_getc ();
   1180 			if (c2 == 'r' || c2 == 'R')
   1181 			  {
   1182 			    c2 = phase1_getc ();
   1183 			    if (c2 == 'i' || c2 == 'I')
   1184 			      {
   1185 				c2 = phase1_getc ();
   1186 				if (c2 == 'p' || c2 == 'P')
   1187 				  {
   1188 				    c2 = phase1_getc ();
   1189 				    if (c2 == 't' || c2 == 'T')
   1190 				      {
   1191 					do
   1192 					  c2 = phase1_getc ();
   1193 					while (c2 == ' ' || c2 == '\t'
   1194 					       || c2 == '\n' || c2 == '\r');
   1195 					if (c2 == '>')
   1196 					  {
   1197 					    skip_html ();
   1198 					  }
   1199 					else
   1200 					  phase1_ungetc (c2);
   1201 				      }
   1202 				    else
   1203 				      phase1_ungetc (c2);
   1204 				  }
   1205 				else
   1206 				  phase1_ungetc (c2);
   1207 			      }
   1208 			    else
   1209 			      phase1_ungetc (c2);
   1210 			  }
   1211 			else
   1212 			  phase1_ungetc (c2);
   1213 		      }
   1214 		    else
   1215 		      phase1_ungetc (c2);
   1216 		  }
   1217 		else
   1218 		  phase1_ungetc (c2);
   1219 	      }
   1220 	    else
   1221 	      phase1_ungetc (c2);
   1222 
   1223 	    tp->type = token_type_other;
   1224 	    return;
   1225 	  }
   1226 
   1227 	case '`':
   1228 	  /* Execution operator.  */
   1229 	default:
   1230 	  /* We could carefully recognize each of the 2 and 3 character
   1231 	     operators, but it is not necessary, as we only need to recognize
   1232 	     gettext invocations.  Don't bother.  */
   1233 	  tp->type = token_type_other;
   1234 	  return;
   1235 	}
   1236     }
   1237 }
   1238 
   1239 
   1240 /* ========================= Extracting strings.  ========================== */
   1241 
   1242 
   1243 /* Context lookup table.  Set by extract_php from its flag_table argument; consulted by extract_parenthesized for each symbol token.  */
   1244 static flag_context_list_table_ty *flag_context_list_table;
   1245 
   1246 
   1247 /* The file is broken into tokens.  Scan the token stream, looking for
   1248    a keyword, followed by a left paren, followed by a string.  When we
   1249    see this sequence, we have something to remember.  We assume we are
   1250    looking at a valid PHP program, and leave the complaints about
   1251    the grammar to the PHP interpreter.
   1252 
   1253      Normal handling: Look for
   1254        keyword ( ... msgid ... )
   1255      Plural handling: Look for
   1256        keyword ( ... msgid ... msgid_plural ... )
   1257 
   1258    We use recursion because the arguments before msgid or between msgid
   1259    and msgid_plural can contain subexpressions of the same form.  */
   1260 
   1261 
   1262 /* Extract messages until the next balanced closing parenthesis.
   1263    Extracted messages are added to MLP.
   1264    Return true upon eof, false upon closing parenthesis.  */
   1265 static bool
   1266 extract_parenthesized (message_list_ty *mlp,
   1267 		       flag_context_ty outer_context,
   1268 		       flag_context_list_iterator_ty context_iter,
   1269 		       struct arglist_parser *argparser)
   1270 {
   1271   /* Current argument number.  */
   1272   int arg = 1;
   1273   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
   1274   int state;
   1275   /* Parameters of the keyword just seen.  Defined only in state 1.  */
   1276   const struct callshapes *next_shapes = NULL;
   1277   /* Context iterator that will be used if the next token is a '('.  */
   1278   flag_context_list_iterator_ty next_context_iter =
   1279     passthrough_context_list_iterator;
   1280   /* Current context.  */
   1281   flag_context_ty inner_context =
   1282     inherited_context (outer_context,
   1283 		       flag_context_list_iterator_advance (&context_iter));
   1284 
   1285   /* Start state is 0.  */
   1286   state = 0;
   1287 
   1288   for (;;)
   1289     {
   1290       token_ty token;
   1291 
   1292       x_php_lex (&token);
   1293       switch (token.type)
   1294 	{
   1295 	case token_type_symbol:
   1296 	  {
   1297 	    void *keyword_value;
   1298 
   1299 	    if (hash_find_entry (&keywords, token.string, strlen (token.string),
   1300 				 &keyword_value)
   1301 		== 0)
   1302 	      {
   1303 		next_shapes = (const struct callshapes *) keyword_value;
   1304 		state = 1;
   1305 	      }
   1306 	    else
   1307 	      state = 0;
   1308 	  }
   1309 	  next_context_iter =
   1310 	    flag_context_list_iterator (
   1311 	      flag_context_list_table_lookup (
   1312 		flag_context_list_table,
   1313 		token.string, strlen (token.string)));
   1314 	  free (token.string);
   1315 	  continue;
   1316 
   1317 	case token_type_lparen:
   1318 	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
   1319 				     arglist_parser_alloc (mlp,
   1320 							   state ? next_shapes : NULL)))
   1321 	    {
   1322 	      arglist_parser_done (argparser, arg);
   1323 	      return true;
   1324 	    }
   1325 	  next_context_iter = null_context_list_iterator;
   1326 	  state = 0;
   1327 	  continue;
   1328 
   1329 	case token_type_rparen:
   1330 	  arglist_parser_done (argparser, arg);
   1331 	  return false;
   1332 
   1333 	case token_type_comma:
   1334 	  arg++;
   1335 	  inner_context =
   1336 	    inherited_context (outer_context,
   1337 			       flag_context_list_iterator_advance (
   1338 				 &context_iter));
   1339 	  next_context_iter = passthrough_context_list_iterator;
   1340 	  state = 0;
   1341 	  continue;
   1342 
   1343 	case token_type_string_literal:
   1344 	  {
   1345 	    lex_pos_ty pos;
   1346 	    pos.file_name = logical_file_name;
   1347 	    pos.line_number = token.line_number;
   1348 
   1349 	    if (extract_all)
   1350 	      remember_a_message (mlp, NULL, token.string, inner_context,
   1351 				  &pos, savable_comment);
   1352 	    else
   1353 	      arglist_parser_remember (argparser, arg, token.string,
   1354 				       inner_context,
   1355 				       pos.file_name, pos.line_number,
   1356 				       savable_comment);
   1357 	  }
   1358 	  next_context_iter = null_context_list_iterator;
   1359 	  state = 0;
   1360 	  continue;
   1361 
   1362 	case token_type_other:
   1363 	  next_context_iter = null_context_list_iterator;
   1364 	  state = 0;
   1365 	  continue;
   1366 
   1367 	case token_type_eof:
   1368 	  arglist_parser_done (argparser, arg);
   1369 	  return true;
   1370 
   1371 	default:
   1372 	  abort ();
   1373 	}
   1374     }
   1375 }
   1376 
   1377 
   1378 void
   1379 extract_php (FILE *f,
   1380 	     const char *real_filename, const char *logical_filename,
   1381 	     flag_context_list_table_ty *flag_table,
   1382 	     msgdomain_list_ty *mdlp)
   1383 {
   1384   message_list_ty *mlp = mdlp->item[0]->messages;
   1385 
   1386   fp = f;
   1387   real_file_name = real_filename;
   1388   logical_file_name = xstrdup (logical_filename);
   1389   line_number = 1;
   1390 
   1391   last_comment_line = -1;
   1392   last_non_comment_line = -1;
   1393 
   1394   flag_context_list_table = flag_table;
   1395 
   1396   init_keywords ();
   1397 
   1398   /* Initial mode is HTML mode, not PHP mode.  */
   1399   skip_html ();
   1400 
   1401   /* Eat tokens until eof is seen.  When extract_parenthesized returns
   1402      due to an unbalanced closing parenthesis, just restart it.  */
   1403   while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
   1404 				 arglist_parser_alloc (mlp, NULL)))
   1405     ;
   1406 
   1407   /* Close scanner.  */
   1408   fp = NULL;
   1409   real_file_name = NULL;
   1410   logical_file_name = NULL;
   1411   line_number = 0;
   1412 }
   1413