Home | History | Annotate | Line # | Download | only in binutils
strings.c revision 1.1.1.6
      1 /* strings -- print the strings of printable characters in files
      2    Copyright (C) 1993-2022 Free Software Foundation, Inc.
      3 
      4    This program is free software; you can redistribute it and/or modify
      5    it under the terms of the GNU General Public License as published by
      6    the Free Software Foundation; either version 3, or (at your option)
      7    any later version.
      8 
      9    This program is distributed in the hope that it will be useful,
     10    but WITHOUT ANY WARRANTY; without even the implied warranty of
     11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     12    GNU General Public License for more details.
     13 
     14    You should have received a copy of the GNU General Public License
     15    along with this program; if not, write to the Free Software
     16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
     17    02110-1301, USA.  */
     18 
     19 /* Usage: strings [options] file...
     21 
     22    Options:
     23    --all
     24    -a
     25    -		Scan each file in its entirety.
     26 
     27    --data
     28    -d		Scan only the initialized data section(s) of object files.
     29 
     30    --print-file-name
     31    -f		Print the name of the file before each string.
     32 
     33    --bytes=min-len
     34    -n min-len
     35    -min-len	Print graphic char sequences, MIN-LEN or more bytes long,
     36 		that are followed by a NUL or a non-displayable character.
     37 		Default is 4.
     38 
     39    --radix={o,x,d}
     40    -t {o,x,d}	Print the offset within the file before each string,
     41 		in octal/hex/decimal.
     42 
     43   --include-all-whitespace
     44   -w		By default tab and space are the only whitespace included in graphic
     45 		char sequences.  This option considers all of isspace() valid.
     46 
     47    -o		Like -to.  (Some other implementations have -o like -to,
     48 		others like -td.  We chose one arbitrarily.)
     49 
     50    --encoding={s,S,b,l,B,L}
     51    -e {s,S,b,l,B,L}
     52 		Select character encoding: 7-bit-character, 8-bit-character,
     53 		bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
     54 		littleendian 32-bit.
     55 
     56    --target=BFDNAME
     57    -T {bfdname}
     58 		Specify a non-default object file format.
     59 
     60   --unicode={default|locale|invalid|hex|escape|highlight}
     61   -U {d|l|i|x|e|h}
     62 		Determine how to handle UTF-8 unicode characters.  The default
     63 		is no special treatment.  All other versions of this option
     64 		only apply if the encoding is valid and enabling the option
     65 		implies --encoding=S.
     66 		The 'locale' option displays the characters according to the
     67 		current locale.  The 'invalid' option treats them as
     68 		non-string characters.  The 'hex' option displays them as hex
     69 		byte sequences.  The 'escape' option displays them as escape
     70 		sequences and the 'highlight' option displays them as
     71 		coloured escape sequences.
     72 
     73   --output-separator=sep_string
     74   -s sep_string	String used to separate parsed strings in output.
     75 		Default is newline.
     76 
     77    --help
     78    -h		Print the usage message on the standard output.
     79 
     80    --version
     81    -V
     82    -v		Print the program version number.
     83 
     84    Written by Richard Stallman <rms (at) gnu.ai.mit.edu>
     85    and David MacKenzie <djm (at) gnu.ai.mit.edu>.  */
     86 
     87 #include "sysdep.h"
     88 #include "bfd.h"
     89 #include "getopt.h"
     90 #include "libiberty.h"
     91 #include "safe-ctype.h"
     92 #include "bucomm.h"
     93 
/* Convenience predicate: non-zero when the C strings A and B compare
   equal.  Only defined here when nothing earlier has supplied it.  */
#if !defined (streq)
# define streq(a, b) (!strcmp ((a), (b)))
#endif
     97 
/* The ways in which UTF-8 encoded characters found in the input can be
   treated.  unicode_default (zero) is the initial setting.  */
enum unicode_display_type
{
  unicode_default   = 0,
  unicode_locale    = 1,
  unicode_escape    = 2,
  unicode_hex       = 3,
  unicode_highlight = 4,
  unicode_invalid   = 5
};

typedef enum unicode_display_type unicode_display_type;

/* The treatment currently in force.  */
static unicode_display_type unicode_display = unicode_default;
    109 
/* Non-zero when C may form part of a displayed string.  C must lie in
   the range 0..255 and be one of: a tab, a printable character, a byte
   above 127 while the encoding is 'S', or any whitespace character when
   include_all_whitespace is set.  C is evaluated more than once.  */
#define STRING_ISGRAPHIC(c)                                     \
  (!((c) < 0 || (c) > 255)                                      \
   && ((c) == '\t'                                              \
       || ISPRINT (c)                                           \
       || (encoding == 'S' && (c) > 127)                        \
       || (include_all_whitespace && ISSPACE (c))))
    116 
/* Fallback declaration for systems where errno is not a macro.  */
#if !defined (errno)
extern int errno;
#endif

/* A section is treated as initialized data only when it carries all of
   these BFD section flags.  */
#define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)

/* Base used when printing string offsets; one of 8, 10 or 16.  */
static int address_radix;

/* Shortest run of graphic characters that is reported.  */
static unsigned int string_min;

/* When TRUE every whitespace character counts as graphic.  */
static bool include_all_whitespace;

/* When TRUE each string is preceded by its offset in the file.  */
static bool print_addresses;

/* When TRUE each string is preceded by the name of its file.  */
static bool print_filenames;

/* When TRUE only the data sections of object files are scanned.  */
static bool datasection_only;

/* Name of the BFD object file format to use.  */
static char *target;

/* Selected character encoding, and the byte width of one character
   in that encoding.  */
static char encoding;
static int encoding_bytes;

/* Text emitted after each string that is printed.  */
static char *output_separator;
    151 
/* Long option table handed to getopt_long.  Each long name maps onto
   the short option letter given as its last field.  */
#define STRINGS_FLAG_OPT(NAME, LETTER) \
  { (NAME), no_argument, NULL, (LETTER) }
#define STRINGS_ARG_OPT(NAME, LETTER) \
  { (NAME), required_argument, NULL, (LETTER) }

static struct option long_options[] =
{
  STRINGS_FLAG_OPT ("all", 'a'),
  STRINGS_ARG_OPT ("bytes", 'n'),
  STRINGS_FLAG_OPT ("data", 'd'),
  STRINGS_ARG_OPT ("encoding", 'e'),
  STRINGS_FLAG_OPT ("help", 'h'),
  STRINGS_FLAG_OPT ("include-all-whitespace", 'w'),
  STRINGS_ARG_OPT ("output-separator", 's'),
  STRINGS_FLAG_OPT ("print-file-name", 'f'),
  STRINGS_ARG_OPT ("radix", 't'),
  STRINGS_ARG_OPT ("target", 'T'),
  STRINGS_ARG_OPT ("unicode", 'U'),
  STRINGS_FLAG_OPT ("version", 'v'),
  { NULL, 0, NULL, 0 }		/* End of table.  */
};

#undef STRINGS_FLAG_OPT
#undef STRINGS_ARG_OPT
    168 
    169 static bool strings_file (char *);
    170 static void print_strings (const char *, FILE *, file_ptr, int, char *);
    171 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
    172 
    173 int main (int, char **);
    175 
    176 int
    177 main (int argc, char **argv)
    178 {
    179   int optc;
    180   int exit_status = 0;
    181   bool files_given = false;
    182   char *s;
    183   int numeric_opt = 0;
    184 
    185   setlocale (LC_ALL, "");
    186   bindtextdomain (PACKAGE, LOCALEDIR);
    187   textdomain (PACKAGE);
    188 
    189   program_name = argv[0];
    190   xmalloc_set_program_name (program_name);
    191   bfd_set_error_program_name (program_name);
    192 
    193   expandargv (&argc, &argv);
    194 
    195   string_min = 4;
    196   include_all_whitespace = false;
    197   print_addresses = false;
    198   print_filenames = false;
    199   if (DEFAULT_STRINGS_ALL)
    200     datasection_only = false;
    201   else
    202     datasection_only = true;
    203   target = NULL;
    204   encoding = 's';
    205   output_separator = NULL;
    206 
    207   while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
    208 			      long_options, (int *) 0)) != EOF)
    209     {
    210       switch (optc)
    211 	{
    212 	case 'a':
    213 	  datasection_only = false;
    214 	  break;
    215 
    216 	case 'd':
    217 	  datasection_only = true;
    218 	  break;
    219 
    220 	case 'f':
    221 	  print_filenames = true;
    222 	  break;
    223 
    224 	case 'H':
    225 	case 'h':
    226 	  usage (stdout, 0);
    227 
    228 	case 'n':
    229 	  string_min = (int) strtoul (optarg, &s, 0);
    230 	  if (s != NULL && *s != 0)
    231 	    fatal (_("invalid integer argument %s"), optarg);
    232 	  break;
    233 
    234 	case 'w':
    235 	  include_all_whitespace = true;
    236 	  break;
    237 
    238 	case 'o':
    239 	  print_addresses = true;
    240 	  address_radix = 8;
    241 	  break;
    242 
    243 	case 't':
    244 	  print_addresses = true;
    245 	  if (optarg[1] != '\0')
    246 	    usage (stderr, 1);
    247 	  switch (optarg[0])
    248 	    {
    249 	    case 'o':
    250 	      address_radix = 8;
    251 	      break;
    252 
    253 	    case 'd':
    254 	      address_radix = 10;
    255 	      break;
    256 
    257 	    case 'x':
    258 	      address_radix = 16;
    259 	      break;
    260 
    261 	    default:
    262 	      usage (stderr, 1);
    263 	    }
    264 	  break;
    265 
    266 	case 'T':
    267 	  target = optarg;
    268 	  break;
    269 
    270 	case 'e':
    271 	  if (optarg[1] != '\0')
    272 	    usage (stderr, 1);
    273 	  encoding = optarg[0];
    274 	  break;
    275 
    276 	case 's':
    277 	  output_separator = optarg;
    278 	  break;
    279 
    280 	case 'U':
    281 	  if (streq (optarg, "default") || streq (optarg, "d"))
    282 	    unicode_display = unicode_default;
    283 	  else if (streq (optarg, "locale") || streq (optarg, "l"))
    284 	    unicode_display = unicode_locale;
    285 	  else if (streq (optarg, "escape") || streq (optarg, "e"))
    286 	    unicode_display = unicode_escape;
    287 	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
    288 	    unicode_display = unicode_invalid;
    289 	  else if (streq (optarg, "hex") || streq (optarg, "x"))
    290 	    unicode_display = unicode_hex;
    291 	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
    292 	    unicode_display = unicode_highlight;
    293 	  else
    294 	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
    295 	  break;
    296 
    297 	case 'V':
    298 	case 'v':
    299 	  print_version ("strings");
    300 	  break;
    301 
    302 	case '?':
    303 	  usage (stderr, 1);
    304 
    305 	default:
    306 	  numeric_opt = optind;
    307 	  break;
    308 	}
    309     }
    310 
    311   if (unicode_display != unicode_default)
    312     encoding = 'S';
    313 
    314   if (numeric_opt != 0)
    315     {
    316       string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
    317       if (s != NULL && *s != 0)
    318 	fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
    319     }
    320   if (string_min < 1)
    321     fatal (_("invalid minimum string length %d"), string_min);
    322 
    323   switch (encoding)
    324     {
    325     case 'S':
    326     case 's':
    327       encoding_bytes = 1;
    328       break;
    329     case 'b':
    330     case 'l':
    331       encoding_bytes = 2;
    332       break;
    333     case 'B':
    334     case 'L':
    335       encoding_bytes = 4;
    336       break;
    337     default:
    338       usage (stderr, 1);
    339     }
    340 
    341   if (bfd_init () != BFD_INIT_MAGIC)
    342     fatal (_("fatal error: libbfd ABI mismatch"));
    343   set_default_bfd_target ();
    344 
    345   if (optind >= argc)
    346     {
    347       datasection_only = false;
    348       SET_BINARY (fileno (stdin));
    349       print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
    350       files_given = true;
    351     }
    352   else
    353     {
    354       for (; optind < argc; ++optind)
    355 	{
    356 	  if (streq (argv[optind], "-"))
    357 	    datasection_only = false;
    358 	  else
    359 	    {
    360 	      files_given = true;
    361 	      exit_status |= !strings_file (argv[optind]);
    362 	    }
    363 	}
    364     }
    365 
    366   if (!files_given)
    367     usage (stderr, 1);
    368 
    369   return (exit_status);
    370 }
    371 
    372 /* Scan section SECT of the file ABFD, whose printable name is
    374    FILENAME.  If it contains initialized data set GOT_A_SECTION and
    375    print the strings in it.  */
    376 
    377 static void
    378 strings_a_section (bfd *abfd, asection *sect, const char *filename,
    379 		   bool *got_a_section)
    380 {
    381   bfd_size_type sectsize;
    382   bfd_byte *mem;
    383 
    384   if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
    385     return;
    386 
    387   sectsize = bfd_section_size (sect);
    388   if (sectsize == 0)
    389     return;
    390 
    391   if (!bfd_malloc_and_get_section (abfd, sect, &mem))
    392     {
    393       non_fatal (_("%s: Reading section %s failed: %s"),
    394 		 filename, sect->name, bfd_errmsg (bfd_get_error ()));
    395       return;
    396     }
    397 
    398   *got_a_section = true;
    399   print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
    400   free (mem);
    401 }
    402 
    403 /* Scan all of the sections in FILE, and print the strings
    404    in the initialized data section(s).
    405 
    406    Return TRUE if successful,
    407    FALSE if not (such as if FILE is not an object file).  */
    408 
    409 static bool
    410 strings_object_file (const char *file)
    411 {
    412   bfd *abfd;
    413   asection *s;
    414   bool got_a_section;
    415 
    416   abfd = bfd_openr (file, target);
    417 
    418   if (abfd == NULL)
    419     /* Treat the file as a non-object file.  */
    420     return false;
    421 
    422   /* This call is mainly for its side effect of reading in the sections.
    423      We follow the traditional behavior of `strings' in that we don't
    424      complain if we don't recognize a file to be an object file.  */
    425   if (!bfd_check_format (abfd, bfd_object))
    426     {
    427       bfd_close (abfd);
    428       return false;
    429     }
    430 
    431   got_a_section = false;
    432   for (s = abfd->sections; s != NULL; s = s->next)
    433     strings_a_section (abfd, s, file, &got_a_section);
    434 
    435   if (!bfd_close (abfd))
    436     {
    437       bfd_nonfatal (file);
    438       return false;
    439     }
    440 
    441   return got_a_section;
    442 }
    443 
    444 /* Print the strings in FILE.  Return TRUE if ok, FALSE if an error occurs.  */
    445 
    446 static bool
    447 strings_file (char *file)
    448 {
    449   struct stat st;
    450 
    451   /* get_file_size does not support non-S_ISREG files.  */
    452 
    453   if (stat (file, &st) < 0)
    454     {
    455       if (errno == ENOENT)
    456 	non_fatal (_("'%s': No such file"), file);
    457       else
    458 	non_fatal (_("Warning: could not locate '%s'.  reason: %s"),
    459 		   file, strerror (errno));
    460       return false;
    461     }
    462   else if (S_ISDIR (st.st_mode))
    463     {
    464       non_fatal (_("Warning: '%s' is a directory"), file);
    465       return false;
    466     }
    467 
    468   /* If we weren't told to scan the whole file,
    469      try to open it as an object file and only look at
    470      initialized data sections.  If that fails, fall back to the
    471      whole file.  */
    472   if (!datasection_only || !strings_object_file (file))
    473     {
    474       FILE *stream;
    475 
    476       stream = fopen (file, FOPEN_RB);
    477       if (stream == NULL)
    478 	{
    479 	  fprintf (stderr, "%s: ", program_name);
    480 	  perror (file);
    481 	  return false;
    482 	}
    483 
    484       print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
    485 
    486       if (fclose (stream) == EOF)
    487 	{
    488 	  fprintf (stderr, "%s: ", program_name);
    489 	  perror (file);
    490 	  return false;
    491 	}
    492     }
    493 
    494   return true;
    495 }
    496 
    497 /* Read the next character, return EOF if none available.
    499    Assume that STREAM is positioned so that the next byte read
    500    is at address ADDRESS in the file.
    501 
    502    If STREAM is NULL, do not read from it.
    503    The caller can supply a buffer of characters
    504    to be processed before the data in STREAM.
    505    MAGIC is the address of the buffer and
    506    MAGICCOUNT is how many characters are in it.  */
    507 
    508 static long
    509 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
    510 {
    511   int c, i;
    512   long r = 0;
    513 
    514   for (i = 0; i < encoding_bytes; i++)
    515     {
    516       if (*magiccount)
    517 	{
    518 	  (*magiccount)--;
    519 	  c = *(*magic)++;
    520 	}
    521       else
    522 	{
    523 	  if (stream == NULL)
    524 	    return EOF;
    525 
    526 	  /* Only use getc_unlocked if we found a declaration for it.
    527 	     Otherwise, libc is not thread safe by default, and we
    528 	     should not use it.  */
    529 
    530 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
    531 	  c = getc_unlocked (stream);
    532 #else
    533 	  c = getc (stream);
    534 #endif
    535 	  if (c == EOF)
    536 	    return EOF;
    537 	}
    538 
    539       (*address)++;
    540       r = (r << 8) | (c & 0xff);
    541     }
    542 
    543   switch (encoding)
    544     {
    545     default:
    546       break;
    547     case 'l':
    548       r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
    549       break;
    550     case 'L':
    551       r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
    552 	   | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
    553       break;
    554     }
    555 
    556   return r;
    557 }
    558 
    559 /* Throw away one byte of a (possibly) multi-byte char C, updating
    560    address and buffer to suit.  */
    561 
    562 static void
    563 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
    564 {
    565   static char tmp[4];
    566 
    567   if (encoding_bytes > 1)
    568     {
    569       *address -= encoding_bytes - 1;
    570 
    571       if (*magiccount == 0)
    572 	{
    573 	  /* If no magic buffer exists, use temp buffer.  */
    574 	  switch (encoding)
    575 	    {
    576 	    default:
    577 	      break;
    578 	    case 'b':
    579 	      tmp[0] = c & 0xff;
    580 	      *magiccount = 1;
    581 	      break;
    582 	    case 'l':
    583 	      tmp[0] = (c >> 8) & 0xff;
    584 	      *magiccount = 1;
    585 	      break;
    586 	    case 'B':
    587 	      tmp[0] = (c >> 16) & 0xff;
    588 	      tmp[1] = (c >> 8) & 0xff;
    589 	      tmp[2] = c & 0xff;
    590 	      *magiccount = 3;
    591 	      break;
    592 	    case 'L':
    593 	      tmp[0] = (c >> 8) & 0xff;
    594 	      tmp[1] = (c >> 16) & 0xff;
    595 	      tmp[2] = (c >> 24) & 0xff;
    596 	      *magiccount = 3;
    597 	      break;
    598 	    }
    599 	  *magic = tmp;
    600 	}
    601       else
    602 	{
    603 	  /* If magic buffer exists, rewind.  */
    604 	  *magic -= encoding_bytes - 1;
    605 	  *magiccount += encoding_bytes - 1;
    606 	}
    607     }
    608 }
    609 
    610 static void
    611 print_filename_and_address (const char * filename, file_ptr address)
    612 {
    613   if (print_filenames)
    614     printf ("%s: ", filename);
    615 
    616   if (! print_addresses)
    617     return;
    618 
    619   switch (address_radix)
    620     {
    621     case 8:
    622       if (sizeof (address) > sizeof (long))
    623 	{
    624 #ifndef __MSVCRT__
    625 	  printf ("%7llo ", (unsigned long long) address);
    626 #else
    627 	  printf ("%7I64o ", (unsigned long long) address);
    628 #endif
    629 	}
    630       else
    631 	printf ("%7lo ", (unsigned long) address);
    632       break;
    633 
    634     case 10:
    635       if (sizeof (address) > sizeof (long))
    636 	{
    637 #ifndef __MSVCRT__
    638 	  printf ("%7llu ", (unsigned long long) address);
    639 #else
    640 	  printf ("%7I64d ", (unsigned long long) address);
    641 #endif
    642 	}
    643       else
    644 	printf ("%7ld ", (long) address);
    645       break;
    646 
    647     case 16:
    648       if (sizeof (address) > sizeof (long))
    649 	{
    650 #ifndef __MSVCRT__
    651 	  printf ("%7llx ", (unsigned long long) address);
    652 #else
    653 	  printf ("%7I64x ", (unsigned long long) address);
    654 #endif
    655 	}
    656       else
    657 	printf ("%7lx ", (unsigned long) address);
    658       break;
    659     }
    660 }
    661 
/* Return non-zero if the bytes starting at BUFFER form a valid
   multi-byte UTF-8 encoding.  If the encoding is valid then returns
   the number of bytes it uses (2, 3 or 4).

   BUFLEN is the number of bytes available at BUFFER.  Zero is returned
   for single-byte (ASCII) characters, for sequences that do not fit in
   BUFLEN bytes, and for anything that is not well-formed UTF-8:
   stray continuation bytes, overlong forms, surrogates (U+D800 to
   U+DFFF) and values above U+10FFFF.  */

static unsigned int
is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
{
  unsigned int len;
  unsigned int k;
  unsigned char lead;
  /* Range allowed for the second byte.  Narrowed below for the lead
     bytes where the full continuation range would admit an overlong
     form, a surrogate or an out-of-range code point.  */
  unsigned char second_min = 0x80;
  unsigned char second_max = 0xbf;

  if (buflen < 1)
    return 0;

  lead = buffer[0];

  if (lead < 0xc2)
    /* ASCII, a continuation byte, or an overlong 2-byte lead.  */
    return 0;
  else if (lead <= 0xdf)
    len = 2;
  else if (lead <= 0xef)
    {
      len = 3;
      if (lead == 0xe0)
        second_min = 0xa0;	/* Below this is an overlong form.  */
      else if (lead == 0xed)
        second_max = 0x9f;	/* Above this is a surrogate.  */
    }
  else if (lead <= 0xf4)
    {
      len = 4;
      if (lead == 0xf0)
        second_min = 0x90;	/* Below this is an overlong form.  */
      else if (lead == 0xf4)
        second_max = 0x8f;	/* Above this exceeds U+10FFFF.  */
    }
  else
    /* 0xf5 .. 0xff never start a UTF-8 sequence.  */
    return 0;

  if (buflen < len)
    return 0;

  if (buffer[1] < second_min || buffer[1] > second_max)
    return 0;

  for (k = 2; k < len; k++)
    if ((buffer[k] & 0xc0) != 0x80)
      return 0;

  return len;
}
    697 
    698 /* Display a UTF-8 encoded character in BUFFER according to the setting
    699    of unicode_display.  The character is known to be valid.
    700    Returns the number of bytes consumed.  */
    701 
    702 static unsigned int
    703 display_utf8_char (const unsigned char * buffer)
    704 {
    705   unsigned int j;
    706   unsigned int utf8_len;
    707 
    708   switch (buffer[0] & 0x30)
    709     {
    710     case 0x00:
    711     case 0x10:
    712       utf8_len = 2;
    713       break;
    714     case 0x20:
    715       utf8_len = 3;
    716       break;
    717     default:
    718       utf8_len = 4;
    719     }
    720 
    721   switch (unicode_display)
    722     {
    723     default:
    724       fprintf (stderr, "ICE: unexpected unicode display type\n");
    725       break;
    726 
    727     case unicode_escape:
    728     case unicode_highlight:
    729       if (unicode_display == unicode_highlight && isatty (1))
    730 	printf ("\x1B[31;47m"); /* Red.  */
    731 
    732       switch (utf8_len)
    733 	{
    734 	case 2:
    735 	  printf ("\\u%02x%02x",
    736 		  ((buffer[0] & 0x1c) >> 2),
    737 		  ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
    738 	  break;
    739 
    740 	case 3:
    741 	  printf ("\\u%02x%02x",
    742 		  ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
    743 		  ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
    744 	  break;
    745 
    746 	case 4:
    747 	  printf ("\\u%02x%02x%02x",
    748 		  ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
    749 		  ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
    750 		  ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
    751 	  break;
    752 	default:
    753 	  /* URG.  */
    754 	  break;
    755 	}
    756 
    757       if (unicode_display == unicode_highlight && isatty (1))
    758 	printf ("\033[0m"); /* Default colour.  */
    759       break;
    760 
    761     case unicode_hex:
    762       putchar ('<');
    763       printf ("0x");
    764       for (j = 0; j < utf8_len; j++)
    765 	printf ("%02x", buffer [j]);
    766       putchar ('>');
    767       break;
    768 
    769     case unicode_locale:
    770       printf ("%.1s", buffer);
    771       break;
    772     }
    773 
    774   return utf8_len;
    775 }
    776 
    777 /* Display strings in BUFFER.  Treat any UTF-8 encoded characters encountered
    778    according to the setting of the unicode_display variable.  The buffer
    779    contains BUFLEN bytes.
    780 
    781    Display the characters as if they started at ADDRESS and are contained in
    782    FILENAME.  */
    783 
    784 static void
    785 print_unicode_buffer (const char *            filename,
    786 		      file_ptr                address,
    787 		      const unsigned char *   buffer,
    788 		      unsigned long           buflen)
    789 {
    790   /* Paranoia checks...  */
    791   if (filename == NULL
    792       || buffer == NULL
    793       || unicode_display == unicode_default
    794       || encoding != 'S'
    795       || encoding_bytes != 1)
    796     {
    797       fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
    798       return;
    799     }
    800 
    801   if (buflen == 0)
    802     return;
    803 
    804   /* We must only display strings that are at least string_min *characters*
    805      long.  So we scan the buffer in two stages.  First we locate the start
    806      of a potential string.  Then we walk along it until we have found
    807      string_min characters.  Then we go back to the start point and start
    808      displaying characters according to the unicode_display setting.  */
    809 
    810   unsigned long start_point = 0;
    811   unsigned long i = 0;
    812   unsigned int char_len = 1;
    813   unsigned int num_found = 0;
    814 
    815   for (i = 0; i < buflen; i += char_len)
    816     {
    817       int c = buffer[i];
    818 
    819       char_len = 1;
    820 
    821       /* Find the first potential character of a string.  */
    822       if (! STRING_ISGRAPHIC (c))
    823 	{
    824 	  num_found = 0;
    825 	  continue;
    826 	}
    827 
    828       if (c > 126)
    829 	{
    830 	  if (c < 0xc0)
    831 	    {
    832 	      num_found = 0;
    833 	      continue;
    834 	    }
    835 
    836 	  if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
    837 	    {
    838 	      char_len = 1;
    839 	      num_found = 0;
    840 	      continue;
    841 	    }
    842 
    843 	  if (unicode_display == unicode_invalid)
    844 	    {
    845 	      /* We have found a valid UTF-8 character, but we treat it as non-graphic.  */
    846 	      num_found = 0;
    847 	      continue;
    848 	    }
    849 	}
    850 
    851       if (num_found == 0)
    852 	/* We have found a potential starting point for a string.  */
    853 	start_point = i;
    854 
    855       ++ num_found;
    856 
    857       if (num_found >= string_min)
    858 	break;
    859     }
    860 
    861   if (num_found < string_min)
    862     return;
    863 
    864   print_filename_and_address (filename, address + start_point);
    865 
    866   /* We have found string_min characters.  Display them and any
    867      more that follow.  */
    868   for (i = start_point; i < buflen; i += char_len)
    869     {
    870       int c = buffer[i];
    871 
    872       char_len = 1;
    873 
    874       if (! STRING_ISGRAPHIC (c))
    875 	break;
    876       else if (c < 127)
    877 	putchar (c);
    878       else if (! is_valid_utf8 (buffer + i, buflen - i))
    879 	break;
    880       else if (unicode_display == unicode_invalid)
    881 	break;
    882       else
    883 	char_len = display_utf8_char (buffer + i);
    884     }
    885 
    886   if (output_separator)
    887     fputs (output_separator, stdout);
    888   else
    889     putchar ('\n');
    890 
    891   /* FIXME: Using tail recursion here is lazy programming...  */
    892   print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
    893 }
    894 
/* Return the next input byte for the unicode stream scanner.

   While *NUM_PUTBACK is non-zero the byte comes from PUTBACK, taking
   the highest-indexed pending entry first, and *NUM_PUTBACK is
   decremented.  Otherwise *NUM_READ is incremented and the result of
   reading one character from STREAM is returned, which is EOF once
   the stream is exhausted.  */

static int
get_unicode_byte (FILE *          stream,
                  unsigned char * putback,
                  unsigned int *  num_putback,
                  unsigned int *  num_read)
{
  unsigned int pending = *num_putback;

  if (pending != 0)
    {
      --pending;
      *num_putback = pending;
      return putback[pending];
    }

  ++*num_read;

#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
  return getc_unlocked (stream);
#else
  return getc (stream);
#endif
}
    915 
    916 /* Helper function for print_unicode_stream.  */
    917 
    918 static void
    919 print_unicode_stream_body (const char *     filename,
    920 			   file_ptr         address,
    921 			   FILE *           stream,
    922 			   unsigned char *  putback_buf,
    923 			   unsigned int     num_putback,
    924 			   unsigned char *  print_buf)
    925 {
    926   /* It would be nice if we could just read the stream into a buffer
    927      and then process if with print_unicode_buffer.  But the input
    928      might be huge or it might time-locked (eg stdin).  So instead
    929      we go one byte at a time...  */
    930 
    931   file_ptr start_point = 0;
    932   unsigned int num_read = 0;
    933   unsigned int num_chars = 0;
    934   unsigned int num_print = 0;
    935   int c = 0;
    936 
    937   /* Find a series of string_min characters.  Put them into print_buf.  */
    938   do
    939     {
    940       if (num_chars >= string_min)
    941 	break;
    942 
    943       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
    944       if (c == EOF)
    945 	break;
    946 
    947       if (! STRING_ISGRAPHIC (c))
    948 	{
    949 	  num_chars = num_print = 0;
    950 	  continue;
    951 	}
    952 
    953       if (num_chars == 0)
    954 	start_point = num_read - 1;
    955 
    956       if (c < 127)
    957 	{
    958 	  print_buf[num_print] = c;
    959 	  num_chars ++;
    960 	  num_print ++;
    961 	  continue;
    962 	}
    963 
    964       if (c < 0xc0)
    965 	{
    966 	  num_chars = num_print = 0;
    967 	  continue;
    968 	}
    969 
    970       /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
    971       char utf8[4];
    972 
    973       utf8[0] = c;
    974       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
    975       if (c == EOF)
    976 	break;
    977       utf8[1] = c;
    978 
    979       if ((utf8[1] & 0xc0) != 0x80)
    980 	{
    981 	  /* Invalid UTF-8.  */
    982 	  putback_buf[num_putback++] = utf8[1];
    983 	  num_chars = num_print = 0;
    984 	  continue;
    985 	}
    986       else if ((utf8[0] & 0x20) == 0)
    987 	{
    988 	  /* A valid 2-byte UTF-8 encoding.  */
    989 	  if (unicode_display == unicode_invalid)
    990 	    {
    991 	      putback_buf[num_putback++] = utf8[1];
    992 	      num_chars = num_print = 0;
    993 	    }
    994 	  else
    995 	    {
    996 	      print_buf[num_print ++] = utf8[0];
    997 	      print_buf[num_print ++] = utf8[1];
    998 	      num_chars ++;
    999 	    }
   1000 	  continue;
   1001 	}
   1002 
   1003       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
   1004       if (c == EOF)
   1005 	break;
   1006       utf8[2] = c;
   1007 
   1008       if ((utf8[2] & 0xc0) != 0x80)
   1009 	{
   1010 	  /* Invalid UTF-8.  */
   1011 	  putback_buf[num_putback++] = utf8[2];
   1012 	  putback_buf[num_putback++] = utf8[1];
   1013 	  num_chars = num_print = 0;
   1014 	  continue;
   1015 	}
   1016       else if ((utf8[0] & 0x10) == 0)
   1017 	{
   1018 	  /* A valid 3-byte UTF-8 encoding.  */
   1019 	  if (unicode_display == unicode_invalid)
   1020 	    {
   1021 	      putback_buf[num_putback++] = utf8[2];
   1022 	      putback_buf[num_putback++] = utf8[1];
   1023 	      num_chars = num_print = 0;
   1024 	    }
   1025 	  else
   1026 	    {
   1027 	      print_buf[num_print ++] = utf8[0];
   1028 	      print_buf[num_print ++] = utf8[1];
   1029 	      print_buf[num_print ++] = utf8[2];
   1030 	      num_chars ++;
   1031 	    }
   1032 	  continue;
   1033 	}
   1034 
   1035       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
   1036       if (c == EOF)
   1037 	break;
   1038       utf8[3] = c;
   1039 
   1040       if ((utf8[3] & 0xc0) != 0x80)
   1041 	{
   1042 	  /* Invalid UTF-8.  */
   1043 	  putback_buf[num_putback++] = utf8[3];
   1044 	  putback_buf[num_putback++] = utf8[2];
   1045 	  putback_buf[num_putback++] = utf8[1];
   1046 	  num_chars = num_print = 0;
   1047 	}
   1048       /* We have a valid 4-byte UTF-8 encoding.  */
   1049       else if (unicode_display == unicode_invalid)
   1050 	{
   1051 	  putback_buf[num_putback++] = utf8[3];
   1052 	  putback_buf[num_putback++] = utf8[1];
   1053 	  putback_buf[num_putback++] = utf8[2];
   1054 	  num_chars = num_print = 0;
   1055 	}
   1056       else
   1057 	{
   1058 	  print_buf[num_print ++] = utf8[0];
   1059 	  print_buf[num_print ++] = utf8[1];
   1060 	  print_buf[num_print ++] = utf8[2];
   1061 	  print_buf[num_print ++] = utf8[3];
   1062 	  num_chars ++;
   1063 	}
   1064     }
   1065   while (1);
   1066 
   1067   if (num_chars >= string_min)
   1068     {
   1069       /* We know that we have string_min valid characters in print_buf,
   1070 	 and there may be more to come in the stream.  Start displaying
   1071 	 them.  */
   1072 
   1073       print_filename_and_address (filename, address + start_point);
   1074 
   1075       unsigned int i;
   1076       for (i = 0; i < num_print;)
   1077 	{
   1078 	  if (print_buf[i] < 127)
   1079 	    putchar (print_buf[i++]);
   1080 	  else
   1081 	    i += display_utf8_char (print_buf + i);
   1082 	}
   1083 
   1084       /* OK so now we have to start read unchecked bytes.  */
   1085 
   1086       /* Find a series of string_min characters.  Put them into print_buf.  */
   1087       do
   1088 	{
   1089 	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
   1090 	  if (c == EOF)
   1091 	    break;
   1092 
   1093 	  if (! STRING_ISGRAPHIC (c))
   1094 	    break;
   1095 
   1096 	  if (c < 127)
   1097 	    {
   1098 	      putchar (c);
   1099 	      continue;
   1100 	    }
   1101 
   1102 	  if (c < 0xc0)
   1103 	    break;
   1104 
   1105 	  /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
   1106 	  unsigned char utf8[4];
   1107 
   1108 	  utf8[0] = c;
   1109 	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
   1110 	  if (c == EOF)
   1111 	    break;
   1112 	  utf8[1] = c;
   1113 
   1114 	  if ((utf8[1] & 0xc0) != 0x80)
   1115 	    {
   1116 	      /* Invalid UTF-8.  */
   1117 	      putback_buf[num_putback++] = utf8[1];
   1118 	      break;
   1119 	    }
   1120 	  else if ((utf8[0] & 0x20) == 0)
   1121 	    {
   1122 	      /* Valid 2-byte UTF-8.  */
   1123 	      if (unicode_display == unicode_invalid)
   1124 		{
   1125 		  putback_buf[num_putback++] = utf8[1];
   1126 		  break;
   1127 		}
   1128 	      else
   1129 		{
   1130 		  (void) display_utf8_char (utf8);
   1131 		  continue;
   1132 		}
   1133 	    }
   1134 
   1135 	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
   1136 	  if (c == EOF)
   1137 	    break;
   1138 	  utf8[2] = c;
   1139 
   1140 	  if ((utf8[2] & 0xc0) != 0x80)
   1141 	    {
   1142 	      /* Invalid UTF-8.  */
   1143 	      putback_buf[num_putback++] = utf8[2];
   1144 	      putback_buf[num_putback++] = utf8[1];
   1145 	      break;
   1146 	    }
   1147 	  else if ((utf8[0] & 0x10) == 0)
   1148 	    {
   1149 	      /* Valid 3-byte UTF-8.  */
   1150 	      if (unicode_display == unicode_invalid)
   1151 		{
   1152 		  putback_buf[num_putback++] = utf8[2];
   1153 		  putback_buf[num_putback++] = utf8[1];
   1154 		  break;
   1155 		}
   1156 	      else
   1157 		{
   1158 		  (void) display_utf8_char (utf8);
   1159 		  continue;
   1160 		}
   1161 	    }
   1162 
   1163 	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
   1164 	  if (c == EOF)
   1165 	    break;
   1166 	  utf8[3] = c;
   1167 
   1168 	  if ((utf8[3] & 0xc0) != 0x80)
   1169 	    {
   1170 	      /* Invalid UTF-8.  */
   1171 	      putback_buf[num_putback++] = utf8[3];
   1172 	      putback_buf[num_putback++] = utf8[2];
   1173 	      putback_buf[num_putback++] = utf8[1];
   1174 	      break;
   1175 	    }
   1176 	  else if (unicode_display == unicode_invalid)
   1177 	    {
   1178 	      putback_buf[num_putback++] = utf8[3];
   1179 	      putback_buf[num_putback++] = utf8[2];
   1180 	      putback_buf[num_putback++] = utf8[1];
   1181 	      break;
   1182 	    }
   1183 	  else
   1184 	    /* A valid 4-byte UTF-8 encoding.  */
   1185 	    (void) display_utf8_char (utf8);
   1186 	}
   1187       while (1);
   1188 
   1189       if (output_separator)
   1190 	fputs (output_separator, stdout);
   1191       else
   1192 	putchar ('\n');
   1193     }
   1194 
   1195   if (c != EOF)
   1196     /* FIXME: Using tail recursion here is lazy, but it works.  */
   1197     print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
   1198 }
   1199 
   1200 /* Display strings read in from STREAM.  Treat any UTF-8 encoded characters
   1201    encountered according to the setting of the unicode_display variable.
   1202    The stream is positioned at ADDRESS and is attached to FILENAME.  */
   1203 
   1204 static void
   1205 print_unicode_stream (const char * filename,
   1206 		      file_ptr     address,
   1207 		      FILE *       stream)
   1208 {
   1209   /* Paranoia checks...  */
   1210   if (filename == NULL
   1211       || stream == NULL
   1212       || unicode_display == unicode_default
   1213       || encoding != 'S'
   1214       || encoding_bytes != 1)
   1215     {
   1216       fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
   1217       return;
   1218     }
   1219 
   1220   /* Allocate space for string_min 4-byte utf-8 characters.  */
   1221   unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
   1222   /* We should never have to put back more than 4 bytes.  */
   1223   unsigned char putback_buf[5];
   1224   unsigned int num_putback = 0;
   1225 
   1226   print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
   1227   free (print_buf);
   1228 }
   1229 
   1230 /* Find the strings in file FILENAME, read from STREAM.
   1232    Assume that STREAM is positioned so that the next byte read
   1233    is at address ADDRESS in the file.
   1234 
   1235    If STREAM is NULL, do not read from it.
   1236    The caller can supply a buffer of characters
   1237    to be processed before the data in STREAM.
   1238    MAGIC is the address of the buffer and
   1239    MAGICCOUNT is how many characters are in it.
   1240    Those characters come at address ADDRESS and the data in STREAM follow.  */
   1241 
   1242 static void
   1243 print_strings (const char *filename, FILE *stream, file_ptr address,
   1244 	       int magiccount, char *magic)
   1245 {
   1246   if (unicode_display != unicode_default)
   1247     {
   1248       if (magic != NULL)
   1249 	print_unicode_buffer (filename, address,
   1250 			      (const unsigned char *) magic, magiccount);
   1251 
   1252       if (stream != NULL)
   1253 	print_unicode_stream (filename, address, stream);
   1254       return;
   1255     }
   1256 
   1257   char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
   1258 
   1259   while (1)
   1260     {
   1261       file_ptr start;
   1262       unsigned int i;
   1263       long c;
   1264 
   1265       /* See if the next `string_min' chars are all graphic chars.  */
   1266     tryline:
   1267       start = address;
   1268       for (i = 0; i < string_min; i++)
   1269 	{
   1270 	  c = get_char (stream, &address, &magiccount, &magic);
   1271 	  if (c == EOF)
   1272 	    {
   1273 	      free (buf);
   1274 	      return;
   1275 	    }
   1276 
   1277 	  if (! STRING_ISGRAPHIC (c))
   1278 	    {
   1279 	      /* Found a non-graphic.  Try again starting with next byte.  */
   1280 	      unget_part_char (c, &address, &magiccount, &magic);
   1281 	      goto tryline;
   1282 	    }
   1283 	  buf[i] = c;
   1284 	}
   1285 
   1286       /* We found a run of `string_min' graphic characters.  Print up
   1287 	 to the next non-graphic character.  */
   1288       print_filename_and_address (filename, start);
   1289 
   1290       buf[i] = '\0';
   1291       fputs (buf, stdout);
   1292 
   1293       while (1)
   1294 	{
   1295 	  c = get_char (stream, &address, &magiccount, &magic);
   1296 	  if (c == EOF)
   1297 	    break;
   1298 	  if (! STRING_ISGRAPHIC (c))
   1299 	    {
   1300 	      unget_part_char (c, &address, &magiccount, &magic);
   1301 	      break;
   1302 	    }
   1303 	  putchar (c);
   1304 	}
   1305 
   1306       if (output_separator)
   1307 	fputs (output_separator, stdout);
   1308       else
   1309 	putchar ('\n');
   1310     }
   1311   free (buf);
   1312 }
   1313 
   1314 static void
   1316 usage (FILE *stream, int status)
   1317 {
   1318   fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
   1319   fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
   1320   fprintf (stream, _(" The options are:\n"));
   1321 
   1322   if (DEFAULT_STRINGS_ALL)
   1323     fprintf (stream, _("\
   1324   -a - --all                Scan the entire file, not just the data section [default]\n\
   1325   -d --data                 Only scan the data sections in the file\n"));
   1326   else
   1327     fprintf (stream, _("\
   1328   -a - --all                Scan the entire file, not just the data section\n\
   1329   -d --data                 Only scan the data sections in the file [default]\n"));
   1330 
   1331   fprintf (stream, _("\
   1332   -f --print-file-name      Print the name of the file before each string\n\
   1333   -n <number>               Locate & print any sequence of at least <number>\n\
   1334     --bytes=<number>         displayable characters.  (The default is 4).\n\
   1335   -t --radix={o,d,x}        Print the location of the string in base 8, 10 or 16\n\
   1336   -w --include-all-whitespace Include all whitespace as valid string characters\n\
   1337   -o                        An alias for --radix=o\n\
   1338   -T --target=<BFDNAME>     Specify the binary file format\n\
   1339   -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
   1340                             s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
   1341   --unicode={default|show|invalid|hex|escape|highlight}\n\
   1342   -U {d|s|i|x|e|h}          Specify how to treat UTF-8 encoded unicode characters\n\
   1343   -s --output-separator=<string> String used to separate strings in output.\n\
   1344   @<file>                   Read options from <file>\n\
   1345   -h --help                 Display this information\n\
   1346   -v -V --version           Print the program's version number\n"));
   1347   list_supported_targets (program_name, stream);
   1348   if (REPORT_BUGS_TO[0] && status == 0)
   1349     fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
   1350   exit (status);
   1351 }
   1352