Home | History | Annotate | Line # | Download | only in src
      1 /* xgettext YCP backend.
      2    Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
      3 
      4    This file was written by Bruno Haible <haible (at) clisp.cons.org>, 2001.
      5 
      6    This program is free software; you can redistribute it and/or modify
      7    it under the terms of the GNU General Public License as published by
      8    the Free Software Foundation; either version 2, or (at your option)
      9    any later version.
     10 
     11    This program is distributed in the hope that it will be useful,
     12    but WITHOUT ANY WARRANTY; without even the implied warranty of
     13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14    GNU General Public License for more details.
     15 
     16    You should have received a copy of the GNU General Public License
     17    along with this program; if not, write to the Free Software Foundation,
     18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
     19 
     20 #ifdef HAVE_CONFIG_H
     21 # include "config.h"
     22 #endif
     23 
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
     29 
     30 #include "message.h"
     31 #include "xgettext.h"
     32 #include "x-ycp.h"
     33 #include "error.h"
     34 #include "xalloc.h"
     35 #include "exit.h"
     36 #include "gettext.h"
     37 
     38 #define _(s) gettext(s)
     39 
     40 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
     41 
     42 
     43 /* The YCP syntax is defined in libycp/doc/syntax.html.
     44    See also libycp/src/scanner.ll.
     45    Both are part of the yast2-core package in SuSE Linux distributions.  */
     46 
     47 
/* Pass each built-in flag specification to xgettext_record_flag, in the
   order listed.  Every entry has the shape NAME:1:ycp-format; presumably
   this marks argument 1 of the named YCP function as a YCP format string
   (see the xgettext --flag syntax to confirm).  */
void
init_flag_table_ycp ()
{
  static const char *const builtin_flag_specs[] =
    {
      "sformat:1:ycp-format",
      "y2debug:1:ycp-format",
      "y2milestone:1:ycp-format",
      "y2warning:1:ycp-format",
      "y2error:1:ycp-format",
      "y2security:1:ycp-format",
      "y2internal:1:ycp-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (builtin_flag_specs); i++)
    xgettext_record_flag (builtin_flag_specs[i]);
}
     59 
     60 
     61 /* ======================== Reading of characters.  ======================== */
     62 
     63 
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.  */
static char *logical_file_name;
static int line_number;
/* Count of characters read on the current line.  phase1_getc resets it to 0
   after reading a newline and increments it for every other character;
   phase2_getc compares it with 0 to decide whether it is at the start of a
   line.  phase1_ungetc sets it to INT_MAX when a newline is pushed back,
   which keeps it nonzero until that newline is read again.  */
static int char_in_line;

/* The input file stream.  */
static FILE *fp;

/* These are for tracking whether comments count as immediately before
   keyword.  last_comment_line is written by phase2_getc when it finishes a
   comment; last_non_comment_line is written by phase5_get when it sees a
   non-whitespace character.  phase5_get compares the two at each newline to
   decide whether to call savable_comment_reset.  */
static int last_comment_line;
static int last_non_comment_line;
     79 
     80 
     81 /* 1. line_number handling.  */
     82 
     83 static int
     84 phase1_getc ()
     85 {
     86   int c = getc (fp);
     87 
     88   if (c == EOF)
     89     {
     90       if (ferror (fp))
     91 	error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
     92 	       real_file_name);
     93       return EOF;
     94     }
     95 
     96   if (c == '\n')
     97     {
     98       line_number++;
     99       char_in_line = 0;
    100     }
    101   else
    102     char_in_line++;
    103 
    104   return c;
    105 }
    106 
    107 /* Supports only one pushback character.  */
    108 static void
    109 phase1_ungetc (int c)
    110 {
    111   if (c != EOF)
    112     {
    113       if (c == '\n')
    114 	{
    115 	  --line_number;
    116 	  char_in_line = INT_MAX;
    117 	}
    118       else
    119 	--char_in_line;
    120 
    121       ungetc (c, fp);
    122     }
    123 }
    124 
    125 
    126 /* 2. Replace each comment that is not inside a character constant or
    127    string literal with a space character.  We need to remember the
    128    comment for later, because it may be attached to a keyword string.
    129    YCP comments can be in C comment syntax, C++ comment syntax or sh
    130    comment syntax.  */
    131 
    132 static unsigned char phase2_pushback[1];
    133 static int phase2_pushback_length;
    134 
    135 static int
    136 phase2_getc ()
    137 {
    138   static char *buffer;
    139   static size_t bufmax;
    140   size_t buflen;
    141   int lineno;
    142   int c;
    143   bool last_was_star;
    144 
    145   if (phase2_pushback_length)
    146     return phase2_pushback[--phase2_pushback_length];
    147 
    148   if (char_in_line == 0)
    149     {
    150       /* Eat whitespace, to recognize ^[\t ]*# pattern.  */
    151       do
    152 	c = phase1_getc ();
    153       while (c == '\t' || c == ' ');
    154 
    155       if (c == '#')
    156 	{
    157 	  /* sh comment.  */
    158 	  buflen = 0;
    159 	  lineno = line_number;
    160 	  for (;;)
    161 	    {
    162 	      c = phase1_getc ();
    163 	      if (c == '\n' || c == EOF)
    164 		break;
    165 	      /* We skip all leading white space, but not EOLs.  */
    166 	      if (!(buflen == 0 && (c == ' ' || c == '\t')))
    167 		{
    168 		  if (buflen >= bufmax)
    169 		    {
    170 		      bufmax = 2 * bufmax + 10;
    171 		      buffer = xrealloc (buffer, bufmax);
    172 		    }
    173 		  buffer[buflen++] = c;
    174 		}
    175 	    }
    176 	  if (buflen >= bufmax)
    177 	    {
    178 	      bufmax = 2 * bufmax + 10;
    179 	      buffer = xrealloc (buffer, bufmax);
    180 	    }
    181 	  buffer[buflen] = '\0';
    182 	  savable_comment_add (buffer);
    183 	  last_comment_line = lineno;
    184 	  return '\n';
    185 	}
    186     }
    187   else
    188     c = phase1_getc ();
    189 
    190   if (c == '/')
    191     {
    192       c = phase1_getc ();
    193 
    194       switch (c)
    195 	{
    196 	default:
    197 	  phase1_ungetc (c);
    198 	  return '/';
    199 
    200 	case '*':
    201 	  /* C comment.  */
    202 	  buflen = 0;
    203 	  lineno = line_number;
    204 	  last_was_star = false;
    205 	  for (;;)
    206 	    {
    207 	      c = phase1_getc ();
    208 	      if (c == EOF)
    209 		break;
    210 	      /* We skip all leading white space, but not EOLs.  */
    211 	      if (buflen == 0 && (c == ' ' || c == '\t'))
    212 		continue;
    213 	      if (buflen >= bufmax)
    214 		{
    215 		  bufmax = 2 * bufmax + 10;
    216 		  buffer = xrealloc (buffer, bufmax);
    217 	        }
    218 	      buffer[buflen++] = c;
    219 	      switch (c)
    220 		{
    221 		case '\n':
    222 		  --buflen;
    223 		  while (buflen >= 1
    224 			 && (buffer[buflen - 1] == ' '
    225 			     || buffer[buflen - 1] == '\t'))
    226 		    --buflen;
    227 		  buffer[buflen] = '\0';
    228 		  savable_comment_add (buffer);
    229 		  buflen = 0;
    230 		  lineno = line_number;
    231 		  last_was_star = false;
    232 		  continue;
    233 
    234 		case '*':
    235 		  last_was_star = true;
    236 		  continue;
    237 
    238 		case '/':
    239 		  if (last_was_star)
    240 		    {
    241 		      buflen -= 2;
    242 		      while (buflen >= 1
    243 			     && (buffer[buflen - 1] == ' '
    244 				 || buffer[buflen - 1] == '\t'))
    245 			--buflen;
    246 		      buffer[buflen] = '\0';
    247 		      savable_comment_add (buffer);
    248 		      break;
    249 		    }
    250 		  /* FALLTHROUGH */
    251 
    252 		default:
    253 		  last_was_star = false;
    254 		  continue;
    255 		}
    256 	      break;
    257 	    }
    258 	  last_comment_line = lineno;
    259 	  return ' ';
    260 
    261 	case '/':
    262 	  /* C++ comment.  */
    263 	  buflen = 0;
    264 	  lineno = line_number;
    265 	  for (;;)
    266 	    {
    267 	      c = phase1_getc ();
    268 	      if (c == '\n' || c == EOF)
    269 		break;
    270 	      /* We skip all leading white space, but not EOLs.  */
    271 	      if (!(buflen == 0 && (c == ' ' || c == '\t')))
    272 		{
    273 		  if (buflen >= bufmax)
    274 		    {
    275 		      bufmax = 2 * bufmax + 10;
    276 		      buffer = xrealloc (buffer, bufmax);
    277 		    }
    278 		  buffer[buflen++] = c;
    279 		}
    280 	    }
    281 	  if (buflen >= bufmax)
    282 	    {
    283 	      bufmax = 2 * bufmax + 10;
    284 	      buffer = xrealloc (buffer, bufmax);
    285 	    }
    286 	  buffer[buflen] = '\0';
    287 	  savable_comment_add (buffer);
    288 	  last_comment_line = lineno;
    289 	  return '\n';
    290 	}
    291     }
    292   else
    293     return c;
    294 }
    295 
    296 /* Supports only one pushback character.  */
    297 static void
    298 phase2_ungetc (int c)
    299 {
    300   if (c != EOF)
    301     {
    302       if (phase2_pushback_length == SIZEOF (phase2_pushback))
    303 	abort ();
    304       phase2_pushback[phase2_pushback_length++] = c;
    305     }
    306 }
    307 
    308 
    309 /* ========================== Reading of tokens.  ========================== */
    310 
    311 
/* Kinds of token produced by phase5_get.  */
typedef enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_i18n,              /* _( */
  token_type_string_literal,    /* "abc" */
  token_type_symbol,            /* symbol, number */
  token_type_other              /* misc. operator */
} token_type_ty;

/* One token of the input.  */
typedef struct token_ty
{
  token_type_ty type;
  /* Heap-allocated text; set only for token_type_string_literal and
     token_type_symbol.  */
  char *string;
  /* Value of the reader's line_number taken just before the token's first
     character was read.  */
  int line_number;
} token_ty;
    332 
    333 
    334 /* 7. Replace escape sequences within character strings with their
    335    single character equivalents.  */
    336 
/* Out-of-band value returned for an unescaped double quote, so the caller
   can tell it apart from an escaped one.  */
#define P7_QUOTES (1000 + '"')

/* Read the next character of a string literal body, decoding backslash
   escapes.  Returns P7_QUOTES for an unescaped '"', EOF at end of input,
   and otherwise the (possibly decoded) character.  A backslash followed by
   a newline is swallowed entirely.  An unrecognized escape yields the
   character after the backslash unchanged.  */
static int
phase7_getc ()
{
  for (;;)
    {
      /* Use phase 1, because phase 2 elides comments.  */
      int ch = phase1_getc ();

      if (ch == '"')
        return P7_QUOTES;
      if (ch != '\\')
        return ch;

      /* Escape sequence: examine the character after the backslash.  */
      ch = phase1_getc ();
      if (ch == '\n')
        /* Backslash-newline: discard both and read on.  */
        continue;

      switch (ch)
        {
        case 'b':
          return '\b';
        case 'f':
          return '\f';
        case 'n':
          return '\n';
        case 'r':
          return '\r';
        case 't':
          return '\t';

        /* FIXME: What is the octal escape syntax?
           syntax.html says: [0] [0-7]+
           scanner.ll says:  [0-7] [0-7] [0-7]
           Until that is settled, octal escapes are not decoded.  */
#if 0
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
          {
            int n, j;

            n = 0;
            for (j = 0; j < 3; ++j)
              {
                n = n * 8 + ch - '0';
                ch = phase1_getc ();
                switch (ch)
                  {
                  default:
                    break;

                  case '0': case '1': case '2': case '3':
                  case '4': case '5': case '6': case '7':
                    continue;
                  }
                break;
              }
            phase1_ungetc (ch);
            return n;
          }
#endif

        default:
          return ch;
        }
    }
}
    404 
    405 
    406 /* Combine characters into tokens.  Discard whitespace.  */
    407 
    408 static token_ty phase5_pushback[1];
    409 static int phase5_pushback_length;
    410 
    411 static void
    412 phase5_get (token_ty *tp)
    413 {
    414   static char *buffer;
    415   static int bufmax;
    416   int bufpos;
    417   int c;
    418 
    419   if (phase5_pushback_length)
    420     {
    421       *tp = phase5_pushback[--phase5_pushback_length];
    422       return;
    423     }
    424   for (;;)
    425     {
    426       tp->line_number = line_number;
    427       c = phase2_getc ();
    428 
    429       switch (c)
    430 	{
    431 	case EOF:
    432 	  tp->type = token_type_eof;
    433 	  return;
    434 
    435 	case '\n':
    436 	  if (last_non_comment_line > last_comment_line)
    437 	    savable_comment_reset ();
    438 	  /* FALLTHROUGH */
    439 	case '\r':
    440 	case '\t':
    441 	case ' ':
    442 	  /* Ignore whitespace and comments.  */
    443 	  continue;
    444 	}
    445 
    446       last_non_comment_line = tp->line_number;
    447 
    448       switch (c)
    449 	{
    450 	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    451 	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    452 	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    453 	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    454 	case 'Y': case 'Z':
    455 	case '_':
    456 	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    457 	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    458 	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    459 	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    460 	case 'y': case 'z':
    461 	case '0': case '1': case '2': case '3': case '4':
    462 	case '5': case '6': case '7': case '8': case '9':
    463 	  /* Symbol, or part of a number.  */
    464 	  bufpos = 0;
    465 	  for (;;)
    466 	    {
    467 	      if (bufpos >= bufmax)
    468 		{
    469 		  bufmax = 2 * bufmax + 10;
    470 		  buffer = xrealloc (buffer, bufmax);
    471 		}
    472 	      buffer[bufpos++] = c;
    473 	      c = phase2_getc ();
    474 	      switch (c)
    475 		{
    476 		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    477 		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
    478 		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    479 		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    480 		case 'Y': case 'Z':
    481 		case '_':
    482 		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    483 		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    484 		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    485 		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    486 		case 'y': case 'z':
    487 		case '0': case '1': case '2': case '3': case '4':
    488 		case '5': case '6': case '7': case '8': case '9':
    489 		  continue;
    490 		default:
    491 		  if (bufpos == 1 && buffer[0] == '_' && c == '(')
    492 		    {
    493 		      tp->type = token_type_i18n;
    494 		      return;
    495 		    }
    496 		  phase2_ungetc (c);
    497 		  break;
    498 		}
    499 	      break;
    500 	    }
    501 	  if (bufpos >= bufmax)
    502 	    {
    503 	      bufmax = 2 * bufmax + 10;
    504 	      buffer = xrealloc (buffer, bufmax);
    505 	    }
    506 	  buffer[bufpos] = '\0';
    507 	  tp->string = xstrdup (buffer);
    508 	  tp->type = token_type_symbol;
    509 	  return;
    510 
    511 	case '"':
    512 	  bufpos = 0;
    513 	  for (;;)
    514 	    {
    515 	      c = phase7_getc ();
    516 	      if (c == EOF || c == P7_QUOTES)
    517 		break;
    518 	      if (bufpos >= bufmax)
    519 		{
    520 		  bufmax = 2 * bufmax + 10;
    521 		  buffer = xrealloc (buffer, bufmax);
    522 		}
    523 	      buffer[bufpos++] = c;
    524 	    }
    525 	  if (bufpos >= bufmax)
    526 	    {
    527 	      bufmax = 2 * bufmax + 10;
    528 	      buffer = xrealloc (buffer, bufmax);
    529 	    }
    530 	  buffer[bufpos] = '\0';
    531 	  tp->string = xstrdup (buffer);
    532 	  tp->type = token_type_string_literal;
    533 	  return;
    534 
    535 	case '(':
    536 	  tp->type = token_type_lparen;
    537 	  return;
    538 
    539 	case ')':
    540 	  tp->type = token_type_rparen;
    541 	  return;
    542 
    543 	case ',':
    544 	  tp->type = token_type_comma;
    545 	  return;
    546 
    547 	default:
    548 	  /* We could carefully recognize each of the 2 and 3 character
    549 	     operators, but it is not necessary, as we only need to recognize
    550 	     gettext invocations.  Don't bother.  */
    551 	  tp->type = token_type_other;
    552 	  return;
    553 	}
    554     }
    555 }
    556 
    557 /* Supports only one pushback token.  */
    558 static void
    559 phase5_unget (token_ty *tp)
    560 {
    561   if (tp->type != token_type_eof)
    562     {
    563       if (phase5_pushback_length == SIZEOF (phase5_pushback))
    564 	abort ();
    565       phase5_pushback[phase5_pushback_length++] = *tp;
    566     }
    567 }
    568 
    569 
    570 /* Concatenate adjacent string literals to form single string literals.
    571    (See libycp/src/parser.yy, rule 'string' vs. terminal 'STRING'.)  */
    572 
    573 static void
    574 phase8_get (token_ty *tp)
    575 {
    576   phase5_get (tp);
    577   if (tp->type != token_type_string_literal)
    578     return;
    579   for (;;)
    580     {
    581       token_ty tmp;
    582       size_t len;
    583 
    584       phase5_get (&tmp);
    585       if (tmp.type != token_type_string_literal)
    586 	{
    587 	  phase5_unget (&tmp);
    588 	  return;
    589 	}
    590       len = strlen (tp->string);
    591       tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
    592       strcpy (tp->string + len, tmp.string);
    593       free (tmp.string);
    594     }
    595 }
    596 
    597 
    598 /* ========================= Extracting strings.  ========================== */
    599 
    600 
    601 /* Context lookup table.  */
    602 static flag_context_list_table_ty *flag_context_list_table;
    603 
    604 
    605 /* The file is broken into tokens.
    606 
    607      Normal handling: Look for
    608        [A] _( [B] msgid ... )
    609      Plural handling: Look for
    610        [A] _( [B] msgid [C] , [D] msgid_plural ... )
    611      At point [A]: state == 0.
    612      At point [B]: state == 1, plural_mp == NULL.
    613      At point [C]: state == 2, plural_mp != NULL.
    614      At point [D]: state == 1, plural_mp != NULL.
    615 
    616    We use recursion because we have to set the context according to the given
    617    flags.  */
    618 
    619 
    620 /* Extract messages until the next balanced closing parenthesis.
    621    Extracted messages are added to MLP.
    622    Return true upon eof, false upon closing parenthesis.  */
    623 static bool
    624 extract_parenthesized (message_list_ty *mlp,
    625 		       flag_context_ty outer_context,
    626 		       flag_context_list_iterator_ty context_iter,
    627 		       bool in_i18n)
    628 {
    629   int state; /* 1 or 2 inside _( ... ), otherwise 0 */
    630   message_ty *plural_mp = NULL;	/* defined only when in states 1 and 2 */
    631   /* Context iterator that will be used if the next token is a '('.  */
    632   flag_context_list_iterator_ty next_context_iter =
    633     passthrough_context_list_iterator;
    634   /* Current context.  */
    635   flag_context_ty inner_context =
    636     inherited_context (outer_context,
    637 		       flag_context_list_iterator_advance (&context_iter));
    638 
    639   /* Start state is 0 or 1.  */
    640   state = (in_i18n ? 1 : 0);
    641 
    642   for (;;)
    643     {
    644       token_ty token;
    645 
    646       if (in_i18n)
    647 	phase8_get (&token);
    648       else
    649 	phase5_get (&token);
    650 
    651       switch (token.type)
    652 	{
    653 	case token_type_i18n:
    654 	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
    655 				     true))
    656 	    return true;
    657 	  next_context_iter = null_context_list_iterator;
    658 	  state = 0;
    659 	  continue;
    660 
    661 	case token_type_string_literal:
    662 	  if (state == 1)
    663 	    {
    664 	      lex_pos_ty pos;
    665 	      pos.file_name = logical_file_name;
    666 	      pos.line_number = token.line_number;
    667 
    668 	      if (plural_mp == NULL)
    669 		{
    670 		  /* Seen an msgid.  */
    671 		  plural_mp = remember_a_message (mlp, NULL, token.string,
    672 						  inner_context, &pos,
    673 						  savable_comment);
    674 		  state = 2;
    675 		}
    676 	      else
    677 		{
    678 		  /* Seen an msgid_plural.  */
    679 		  remember_a_message_plural (plural_mp, token.string,
    680 					     inner_context, &pos,
    681 					     savable_comment);
    682 		  state = 0;
    683 		}
    684 	    }
    685 	  else
    686 	    {
    687 	      free (token.string);
    688 	      state = 0;
    689 	    }
    690 	  next_context_iter = null_context_list_iterator;
    691 	  continue;
    692 
    693 	case token_type_symbol:
    694 	  next_context_iter =
    695 	    flag_context_list_iterator (
    696 	      flag_context_list_table_lookup (
    697 		flag_context_list_table,
    698 		token.string, strlen (token.string)));
    699 	  free (token.string);
    700 	  state = 0;
    701 	  continue;
    702 
    703 	case token_type_lparen:
    704 	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
    705 				     false))
    706 	    return true;
    707 	  next_context_iter = null_context_list_iterator;
    708 	  state = 0;
    709 	  continue;
    710 
    711 	case token_type_rparen:
    712 	  return false;
    713 
    714 	case token_type_comma:
    715 	  if (state == 2)
    716 	    state = 1;
    717 	  else
    718 	    state = 0;
    719 	  inner_context =
    720 	    inherited_context (outer_context,
    721 			       flag_context_list_iterator_advance (
    722 				 &context_iter));
    723 	  next_context_iter = passthrough_context_list_iterator;
    724 	  continue;
    725 
    726 	case token_type_other:
    727 	  next_context_iter = null_context_list_iterator;
    728 	  state = 0;
    729 	  continue;
    730 
    731 	case token_type_eof:
    732 	  return true;
    733 
    734 	default:
    735 	  abort ();
    736 	}
    737     }
    738 }
    739 
    740 
    741 void
    742 extract_ycp (FILE *f,
    743 	     const char *real_filename, const char *logical_filename,
    744 	     flag_context_list_table_ty *flag_table,
    745 	     msgdomain_list_ty *mdlp)
    746 {
    747   message_list_ty *mlp = mdlp->item[0]->messages;
    748 
    749   fp = f;
    750   real_file_name = real_filename;
    751   logical_file_name = xstrdup (logical_filename);
    752   line_number = 1;
    753   char_in_line = 0;
    754 
    755   last_comment_line = -1;
    756   last_non_comment_line = -1;
    757 
    758   flag_context_list_table = flag_table;
    759 
    760   /* Eat tokens until eof is seen.  When extract_parenthesized returns
    761      due to an unbalanced closing parenthesis, just restart it.  */
    762   while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
    763 				 false))
    764     ;
    765 
    766   fp = NULL;
    767   real_file_name = NULL;
    768   logical_file_name = NULL;
    769   line_number = 0;
    770   char_in_line = 0;
    771 }
    772