strings.c revision 1.1.1.6 1 /* strings -- print the strings of printable characters in files
2 Copyright (C) 1993-2022 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18
19 /* Usage: strings [options] file...
21
22 Options:
23 --all
24 -a
25 - Scan each file in its entirety.
26
27 --data
28 -d Scan only the initialized data section(s) of object files.
29
30 --print-file-name
31 -f Print the name of the file before each string.
32
33 --bytes=min-len
34 -n min-len
35 -min-len Print graphic char sequences, MIN-LEN or more bytes long,
36 that are followed by a NUL or a non-displayable character.
37 Default is 4.
38
39 --radix={o,x,d}
40 -t {o,x,d} Print the offset within the file before each string,
41 in octal/hex/decimal.
42
43 --include-all-whitespace
43 -w By default tab and space are the only whitespace included in graphic
45 char sequences. This option considers all of isspace() valid.
46
47 -o Like -to. (Some other implementations have -o like -to,
48 others like -td. We chose one arbitrarily.)
49
50 --encoding={s,S,b,l,B,L}
51 -e {s,S,b,l,B,L}
52 Select character encoding: 7-bit-character, 8-bit-character,
53 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
54 littleendian 32-bit.
55
56 --target=BFDNAME
57 -T {bfdname}
58 Specify a non-default object file format.
59
60 --unicode={default|locale|invalid|hex|escape|highlight}
61 -U {d|l|i|x|e|h}
62 Determine how to handle UTF-8 unicode characters. The default
63 is no special treatment. All other versions of this option
64 only apply if the encoding is valid and enabling the option
65 implies --encoding=S.
66 The 'locale' option displays the characters according to the
67 current locale. The 'invalid' option treats them as
68 non-string characters. The 'hex' option displays them as hex
69 byte sequences. The 'escape' option displays them as escape
70 sequences and the 'highlight' option displays them as
71 coloured escape sequences.
72
73 --output-separator=sep_string
74 -s sep_string String used to separate parsed strings in output.
75 Default is newline.
76
77 --help
78 -h Print the usage message on the standard output.
79
80 --version
81 -V
82 -v Print the program version number.
83
84 Written by Richard Stallman <rms (at) gnu.ai.mit.edu>
85 and David MacKenzie <djm (at) gnu.ai.mit.edu>. */
86
87 #include "sysdep.h"
88 #include "bfd.h"
89 #include "getopt.h"
90 #include "libiberty.h"
91 #include "safe-ctype.h"
92 #include "bucomm.h"
93
/* Convenience predicate: non-zero when the strings A and B compare equal.
   Left untouched if something earlier has already defined it.  */
#if !defined (streq)
# define streq(a, b) (strcmp ((a), (b)) == 0)
#endif
97
/* The ways in which UTF-8 sequences found in the input can be treated.
   The value is chosen with the -U/--unicode command line option.  */
typedef enum unicode_display_type
{
  unicode_default   = 0,  /* No special treatment.  */
  unicode_locale    = 1,  /* Emit the sequence itself.  */
  unicode_escape    = 2,  /* Emit a \u escape sequence.  */
  unicode_hex       = 3,  /* Emit the bytes as <0x...>.  */
  unicode_highlight = 4,  /* As unicode_escape, coloured on a terminal.  */
  unicode_invalid   = 5   /* Treat the sequence as non-string bytes.  */
} unicode_display_type;

/* The treatment currently in force.  */
static unicode_display_type unicode_display = unicode_default;
109
/* Non-zero when C may appear inside a reported string.  C has to lie in
   the range 0..255 and additionally be one of: a tab, a character that
   ISPRINT accepts, any value above 127 while the encoding is 'S', or a
   character that ISSPACE accepts while include_all_whitespace is set.
   Note that C is evaluated several times.  */
#define STRING_ISGRAPHIC(c)                                     \
  (0 <= (c)                                                     \
   && (c) <= 255                                                \
   && ((c) == '\t'                                              \
       || ISPRINT (c)                                           \
       || (encoding == 'S' && (c) > 127)                        \
       || (include_all_whitespace && ISSPACE (c))))
116
/* Supply a plain declaration of errno when no header has defined it
   as a macro.  */
#if !defined (errno)
extern int errno;
#endif
120
/* The BFD section flags which must all be present for a section to be
   scanned as initialized data.  */
#define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)

/* Shortest run of graphic characters that gets reported.  */
static unsigned int string_min;

/* Base in which string offsets are printed; must be 8, 10 or 16.  */
static int address_radix;

/* True when every ISSPACE character counts as a graphic character.  */
static bool include_all_whitespace;

/* True when each string is preceded by its offset within the file.  */
static bool print_addresses;

/* True when each string is preceded by the name of its file.  */
static bool print_filenames;

/* True when only the data section(s) of object files are scanned.  */
static bool datasection_only;

/* Name of the BFD object file format to use.  */
static char *target;

/* Character encoding selector and the number of bytes that make up
   one character in that encoding.  */
static char encoding;
static int encoding_bytes;

/* Text written after each reported string.  */
static char *output_separator;
151
/* Long option table handed to getopt_long.  Every long option maps onto
   the character that main's option switch handles.  The all-zero entry
   terminates the table.  */
static struct option long_options[] =
{
  { .name = "all",                    .has_arg = no_argument,       .flag = NULL, .val = 'a' },
  { .name = "bytes",                  .has_arg = required_argument, .flag = NULL, .val = 'n' },
  { .name = "data",                   .has_arg = no_argument,       .flag = NULL, .val = 'd' },
  { .name = "encoding",               .has_arg = required_argument, .flag = NULL, .val = 'e' },
  { .name = "help",                   .has_arg = no_argument,       .flag = NULL, .val = 'h' },
  { .name = "include-all-whitespace", .has_arg = no_argument,       .flag = NULL, .val = 'w' },
  { .name = "output-separator",       .has_arg = required_argument, .flag = NULL, .val = 's' },
  { .name = "print-file-name",        .has_arg = no_argument,       .flag = NULL, .val = 'f' },
  { .name = "radix",                  .has_arg = required_argument, .flag = NULL, .val = 't' },
  { .name = "target",                 .has_arg = required_argument, .flag = NULL, .val = 'T' },
  { .name = "unicode",                .has_arg = required_argument, .flag = NULL, .val = 'U' },
  { .name = "version",                .has_arg = no_argument,       .flag = NULL, .val = 'v' },
  { .name = NULL,                     .has_arg = 0,                 .flag = NULL, .val = 0 }
};
168
169 static bool strings_file (char *);
170 static void print_strings (const char *, FILE *, file_ptr, int, char *);
171 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
172
173 int main (int, char **);
175
176 int
177 main (int argc, char **argv)
178 {
179 int optc;
180 int exit_status = 0;
181 bool files_given = false;
182 char *s;
183 int numeric_opt = 0;
184
185 setlocale (LC_ALL, "");
186 bindtextdomain (PACKAGE, LOCALEDIR);
187 textdomain (PACKAGE);
188
189 program_name = argv[0];
190 xmalloc_set_program_name (program_name);
191 bfd_set_error_program_name (program_name);
192
193 expandargv (&argc, &argv);
194
195 string_min = 4;
196 include_all_whitespace = false;
197 print_addresses = false;
198 print_filenames = false;
199 if (DEFAULT_STRINGS_ALL)
200 datasection_only = false;
201 else
202 datasection_only = true;
203 target = NULL;
204 encoding = 's';
205 output_separator = NULL;
206
207 while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
208 long_options, (int *) 0)) != EOF)
209 {
210 switch (optc)
211 {
212 case 'a':
213 datasection_only = false;
214 break;
215
216 case 'd':
217 datasection_only = true;
218 break;
219
220 case 'f':
221 print_filenames = true;
222 break;
223
224 case 'H':
225 case 'h':
226 usage (stdout, 0);
227
228 case 'n':
229 string_min = (int) strtoul (optarg, &s, 0);
230 if (s != NULL && *s != 0)
231 fatal (_("invalid integer argument %s"), optarg);
232 break;
233
234 case 'w':
235 include_all_whitespace = true;
236 break;
237
238 case 'o':
239 print_addresses = true;
240 address_radix = 8;
241 break;
242
243 case 't':
244 print_addresses = true;
245 if (optarg[1] != '\0')
246 usage (stderr, 1);
247 switch (optarg[0])
248 {
249 case 'o':
250 address_radix = 8;
251 break;
252
253 case 'd':
254 address_radix = 10;
255 break;
256
257 case 'x':
258 address_radix = 16;
259 break;
260
261 default:
262 usage (stderr, 1);
263 }
264 break;
265
266 case 'T':
267 target = optarg;
268 break;
269
270 case 'e':
271 if (optarg[1] != '\0')
272 usage (stderr, 1);
273 encoding = optarg[0];
274 break;
275
276 case 's':
277 output_separator = optarg;
278 break;
279
280 case 'U':
281 if (streq (optarg, "default") || streq (optarg, "d"))
282 unicode_display = unicode_default;
283 else if (streq (optarg, "locale") || streq (optarg, "l"))
284 unicode_display = unicode_locale;
285 else if (streq (optarg, "escape") || streq (optarg, "e"))
286 unicode_display = unicode_escape;
287 else if (streq (optarg, "invalid") || streq (optarg, "i"))
288 unicode_display = unicode_invalid;
289 else if (streq (optarg, "hex") || streq (optarg, "x"))
290 unicode_display = unicode_hex;
291 else if (streq (optarg, "highlight") || streq (optarg, "h"))
292 unicode_display = unicode_highlight;
293 else
294 fatal (_("invalid argument to -U/--unicode: %s"), optarg);
295 break;
296
297 case 'V':
298 case 'v':
299 print_version ("strings");
300 break;
301
302 case '?':
303 usage (stderr, 1);
304
305 default:
306 numeric_opt = optind;
307 break;
308 }
309 }
310
311 if (unicode_display != unicode_default)
312 encoding = 'S';
313
314 if (numeric_opt != 0)
315 {
316 string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
317 if (s != NULL && *s != 0)
318 fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
319 }
320 if (string_min < 1)
321 fatal (_("invalid minimum string length %d"), string_min);
322
323 switch (encoding)
324 {
325 case 'S':
326 case 's':
327 encoding_bytes = 1;
328 break;
329 case 'b':
330 case 'l':
331 encoding_bytes = 2;
332 break;
333 case 'B':
334 case 'L':
335 encoding_bytes = 4;
336 break;
337 default:
338 usage (stderr, 1);
339 }
340
341 if (bfd_init () != BFD_INIT_MAGIC)
342 fatal (_("fatal error: libbfd ABI mismatch"));
343 set_default_bfd_target ();
344
345 if (optind >= argc)
346 {
347 datasection_only = false;
348 SET_BINARY (fileno (stdin));
349 print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
350 files_given = true;
351 }
352 else
353 {
354 for (; optind < argc; ++optind)
355 {
356 if (streq (argv[optind], "-"))
357 datasection_only = false;
358 else
359 {
360 files_given = true;
361 exit_status |= !strings_file (argv[optind]);
362 }
363 }
364 }
365
366 if (!files_given)
367 usage (stderr, 1);
368
369 return (exit_status);
370 }
371
372 /* Scan section SECT of the file ABFD, whose printable name is
374 FILENAME. If it contains initialized data set GOT_A_SECTION and
375 print the strings in it. */
376
377 static void
378 strings_a_section (bfd *abfd, asection *sect, const char *filename,
379 bool *got_a_section)
380 {
381 bfd_size_type sectsize;
382 bfd_byte *mem;
383
384 if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
385 return;
386
387 sectsize = bfd_section_size (sect);
388 if (sectsize == 0)
389 return;
390
391 if (!bfd_malloc_and_get_section (abfd, sect, &mem))
392 {
393 non_fatal (_("%s: Reading section %s failed: %s"),
394 filename, sect->name, bfd_errmsg (bfd_get_error ()));
395 return;
396 }
397
398 *got_a_section = true;
399 print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
400 free (mem);
401 }
402
403 /* Scan all of the sections in FILE, and print the strings
404 in the initialized data section(s).
405
406 Return TRUE if successful,
407 FALSE if not (such as if FILE is not an object file). */
408
409 static bool
410 strings_object_file (const char *file)
411 {
412 bfd *abfd;
413 asection *s;
414 bool got_a_section;
415
416 abfd = bfd_openr (file, target);
417
418 if (abfd == NULL)
419 /* Treat the file as a non-object file. */
420 return false;
421
422 /* This call is mainly for its side effect of reading in the sections.
423 We follow the traditional behavior of `strings' in that we don't
424 complain if we don't recognize a file to be an object file. */
425 if (!bfd_check_format (abfd, bfd_object))
426 {
427 bfd_close (abfd);
428 return false;
429 }
430
431 got_a_section = false;
432 for (s = abfd->sections; s != NULL; s = s->next)
433 strings_a_section (abfd, s, file, &got_a_section);
434
435 if (!bfd_close (abfd))
436 {
437 bfd_nonfatal (file);
438 return false;
439 }
440
441 return got_a_section;
442 }
443
444 /* Print the strings in FILE. Return TRUE if ok, FALSE if an error occurs. */
445
446 static bool
447 strings_file (char *file)
448 {
449 struct stat st;
450
451 /* get_file_size does not support non-S_ISREG files. */
452
453 if (stat (file, &st) < 0)
454 {
455 if (errno == ENOENT)
456 non_fatal (_("'%s': No such file"), file);
457 else
458 non_fatal (_("Warning: could not locate '%s'. reason: %s"),
459 file, strerror (errno));
460 return false;
461 }
462 else if (S_ISDIR (st.st_mode))
463 {
464 non_fatal (_("Warning: '%s' is a directory"), file);
465 return false;
466 }
467
468 /* If we weren't told to scan the whole file,
469 try to open it as an object file and only look at
470 initialized data sections. If that fails, fall back to the
471 whole file. */
472 if (!datasection_only || !strings_object_file (file))
473 {
474 FILE *stream;
475
476 stream = fopen (file, FOPEN_RB);
477 if (stream == NULL)
478 {
479 fprintf (stderr, "%s: ", program_name);
480 perror (file);
481 return false;
482 }
483
484 print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
485
486 if (fclose (stream) == EOF)
487 {
488 fprintf (stderr, "%s: ", program_name);
489 perror (file);
490 return false;
491 }
492 }
493
494 return true;
495 }
496
497 /* Read the next character, return EOF if none available.
499 Assume that STREAM is positioned so that the next byte read
500 is at address ADDRESS in the file.
501
502 If STREAM is NULL, do not read from it.
503 The caller can supply a buffer of characters
504 to be processed before the data in STREAM.
505 MAGIC is the address of the buffer and
506 MAGICCOUNT is how many characters are in it. */
507
508 static long
509 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
510 {
511 int c, i;
512 long r = 0;
513
514 for (i = 0; i < encoding_bytes; i++)
515 {
516 if (*magiccount)
517 {
518 (*magiccount)--;
519 c = *(*magic)++;
520 }
521 else
522 {
523 if (stream == NULL)
524 return EOF;
525
526 /* Only use getc_unlocked if we found a declaration for it.
527 Otherwise, libc is not thread safe by default, and we
528 should not use it. */
529
530 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
531 c = getc_unlocked (stream);
532 #else
533 c = getc (stream);
534 #endif
535 if (c == EOF)
536 return EOF;
537 }
538
539 (*address)++;
540 r = (r << 8) | (c & 0xff);
541 }
542
543 switch (encoding)
544 {
545 default:
546 break;
547 case 'l':
548 r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
549 break;
550 case 'L':
551 r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
552 | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
553 break;
554 }
555
556 return r;
557 }
558
559 /* Throw away one byte of a (possibly) multi-byte char C, updating
560 address and buffer to suit. */
561
562 static void
563 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
564 {
565 static char tmp[4];
566
567 if (encoding_bytes > 1)
568 {
569 *address -= encoding_bytes - 1;
570
571 if (*magiccount == 0)
572 {
573 /* If no magic buffer exists, use temp buffer. */
574 switch (encoding)
575 {
576 default:
577 break;
578 case 'b':
579 tmp[0] = c & 0xff;
580 *magiccount = 1;
581 break;
582 case 'l':
583 tmp[0] = (c >> 8) & 0xff;
584 *magiccount = 1;
585 break;
586 case 'B':
587 tmp[0] = (c >> 16) & 0xff;
588 tmp[1] = (c >> 8) & 0xff;
589 tmp[2] = c & 0xff;
590 *magiccount = 3;
591 break;
592 case 'L':
593 tmp[0] = (c >> 8) & 0xff;
594 tmp[1] = (c >> 16) & 0xff;
595 tmp[2] = (c >> 24) & 0xff;
596 *magiccount = 3;
597 break;
598 }
599 *magic = tmp;
600 }
601 else
602 {
603 /* If magic buffer exists, rewind. */
604 *magic -= encoding_bytes - 1;
605 *magiccount += encoding_bytes - 1;
606 }
607 }
608 }
609
610 static void
611 print_filename_and_address (const char * filename, file_ptr address)
612 {
613 if (print_filenames)
614 printf ("%s: ", filename);
615
616 if (! print_addresses)
617 return;
618
619 switch (address_radix)
620 {
621 case 8:
622 if (sizeof (address) > sizeof (long))
623 {
624 #ifndef __MSVCRT__
625 printf ("%7llo ", (unsigned long long) address);
626 #else
627 printf ("%7I64o ", (unsigned long long) address);
628 #endif
629 }
630 else
631 printf ("%7lo ", (unsigned long) address);
632 break;
633
634 case 10:
635 if (sizeof (address) > sizeof (long))
636 {
637 #ifndef __MSVCRT__
638 printf ("%7llu ", (unsigned long long) address);
639 #else
640 printf ("%7I64d ", (unsigned long long) address);
641 #endif
642 }
643 else
644 printf ("%7ld ", (long) address);
645 break;
646
647 case 16:
648 if (sizeof (address) > sizeof (long))
649 {
650 #ifndef __MSVCRT__
651 printf ("%7llx ", (unsigned long long) address);
652 #else
653 printf ("%7I64x ", (unsigned long long) address);
654 #endif
655 }
656 else
657 printf ("%7lx ", (unsigned long) address);
658 break;
659 }
660 }
661
/* Return non-zero if the bytes starting at BUFFER form a UTF-8 encoded
   multi-byte character, in which case the value returned is the number
   of bytes (2, 3 or 4) that the character occupies.  BUFLEN is the number
   of bytes available at BUFFER; an empty or truncated sequence yields 0.

   The first byte must be a lead byte in the range 0xc0..0xf7.  Plain
   ASCII, continuation bytes and the values 0xf8..0xff, which never start
   a UTF-8 sequence, all yield 0.  Every following byte must be a
   continuation byte of the form 10xxxxxx.  Overlong encodings and code
   points outside the Unicode range are not diagnosed.  */

static unsigned int
is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
{
  unsigned int len;
  unsigned int k;

  if (buflen == 0)
    return 0;

  if (buffer[0] < 0xc0 || buffer[0] >= 0xf8)
    return 0;

  /* The lead byte announces the length: 110xxxxx, 1110xxxx or 11110xxx.  */
  if ((buffer[0] & 0x20) == 0)
    len = 2;
  else if ((buffer[0] & 0x10) == 0)
    len = 3;
  else
    len = 4;

  if (buflen < len)
    return 0;

  for (k = 1; k < len; k++)
    if ((buffer[k] & 0xc0) != 0x80)
      return 0;

  return len;
}
697
698 /* Display a UTF-8 encoded character in BUFFER according to the setting
699 of unicode_display. The character is known to be valid.
700 Returns the number of bytes consumed. */
701
702 static unsigned int
703 display_utf8_char (const unsigned char * buffer)
704 {
705 unsigned int j;
706 unsigned int utf8_len;
707
708 switch (buffer[0] & 0x30)
709 {
710 case 0x00:
711 case 0x10:
712 utf8_len = 2;
713 break;
714 case 0x20:
715 utf8_len = 3;
716 break;
717 default:
718 utf8_len = 4;
719 }
720
721 switch (unicode_display)
722 {
723 default:
724 fprintf (stderr, "ICE: unexpected unicode display type\n");
725 break;
726
727 case unicode_escape:
728 case unicode_highlight:
729 if (unicode_display == unicode_highlight && isatty (1))
730 printf ("\x1B[31;47m"); /* Red. */
731
732 switch (utf8_len)
733 {
734 case 2:
735 printf ("\\u%02x%02x",
736 ((buffer[0] & 0x1c) >> 2),
737 ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
738 break;
739
740 case 3:
741 printf ("\\u%02x%02x",
742 ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
743 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
744 break;
745
746 case 4:
747 printf ("\\u%02x%02x%02x",
748 ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
749 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
750 ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
751 break;
752 default:
753 /* URG. */
754 break;
755 }
756
757 if (unicode_display == unicode_highlight && isatty (1))
758 printf ("\033[0m"); /* Default colour. */
759 break;
760
761 case unicode_hex:
762 putchar ('<');
763 printf ("0x");
764 for (j = 0; j < utf8_len; j++)
765 printf ("%02x", buffer [j]);
766 putchar ('>');
767 break;
768
769 case unicode_locale:
770 printf ("%.1s", buffer);
771 break;
772 }
773
774 return utf8_len;
775 }
776
777 /* Display strings in BUFFER. Treat any UTF-8 encoded characters encountered
778 according to the setting of the unicode_display variable. The buffer
779 contains BUFLEN bytes.
780
781 Display the characters as if they started at ADDRESS and are contained in
782 FILENAME. */
783
784 static void
785 print_unicode_buffer (const char * filename,
786 file_ptr address,
787 const unsigned char * buffer,
788 unsigned long buflen)
789 {
790 /* Paranoia checks... */
791 if (filename == NULL
792 || buffer == NULL
793 || unicode_display == unicode_default
794 || encoding != 'S'
795 || encoding_bytes != 1)
796 {
797 fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
798 return;
799 }
800
801 if (buflen == 0)
802 return;
803
804 /* We must only display strings that are at least string_min *characters*
805 long. So we scan the buffer in two stages. First we locate the start
806 of a potential string. Then we walk along it until we have found
807 string_min characters. Then we go back to the start point and start
808 displaying characters according to the unicode_display setting. */
809
810 unsigned long start_point = 0;
811 unsigned long i = 0;
812 unsigned int char_len = 1;
813 unsigned int num_found = 0;
814
815 for (i = 0; i < buflen; i += char_len)
816 {
817 int c = buffer[i];
818
819 char_len = 1;
820
821 /* Find the first potential character of a string. */
822 if (! STRING_ISGRAPHIC (c))
823 {
824 num_found = 0;
825 continue;
826 }
827
828 if (c > 126)
829 {
830 if (c < 0xc0)
831 {
832 num_found = 0;
833 continue;
834 }
835
836 if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
837 {
838 char_len = 1;
839 num_found = 0;
840 continue;
841 }
842
843 if (unicode_display == unicode_invalid)
844 {
845 /* We have found a valid UTF-8 character, but we treat it as non-graphic. */
846 num_found = 0;
847 continue;
848 }
849 }
850
851 if (num_found == 0)
852 /* We have found a potential starting point for a string. */
853 start_point = i;
854
855 ++ num_found;
856
857 if (num_found >= string_min)
858 break;
859 }
860
861 if (num_found < string_min)
862 return;
863
864 print_filename_and_address (filename, address + start_point);
865
866 /* We have found string_min characters. Display them and any
867 more that follow. */
868 for (i = start_point; i < buflen; i += char_len)
869 {
870 int c = buffer[i];
871
872 char_len = 1;
873
874 if (! STRING_ISGRAPHIC (c))
875 break;
876 else if (c < 127)
877 putchar (c);
878 else if (! is_valid_utf8 (buffer + i, buflen - i))
879 break;
880 else if (unicode_display == unicode_invalid)
881 break;
882 else
883 char_len = display_utf8_char (buffer + i);
884 }
885
886 if (output_separator)
887 fputs (output_separator, stdout);
888 else
889 putchar ('\n');
890
891 /* FIXME: Using tail recursion here is lazy programming... */
892 print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
893 }
894
/* Return the next byte to be examined by the Unicode stream scanner.

   When *NUM_PUTBACK is non-zero the most recently stored entry of
   PUTBACK is removed and returned, and STREAM is not touched.  Otherwise
   *NUM_READ is incremented and a byte is read from STREAM; the result is
   then whatever getc returned, which is EOF at end of input.  */

static int
get_unicode_byte (FILE * stream,
                  unsigned char * putback,
                  unsigned int * num_putback,
                  unsigned int * num_read)
{
  int byte;

  if (* num_putback != 0)
    {
      unsigned int top = * num_putback - 1;

      * num_putback = top;
      return putback [top];
    }

  ++ * num_read;

#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
  byte = getc_unlocked (stream);
#else
  byte = getc (stream);
#endif
  return byte;
}
915
916 /* Helper function for print_unicode_stream. */
917
918 static void
919 print_unicode_stream_body (const char * filename,
920 file_ptr address,
921 FILE * stream,
922 unsigned char * putback_buf,
923 unsigned int num_putback,
924 unsigned char * print_buf)
925 {
926 /* It would be nice if we could just read the stream into a buffer
927 and then process if with print_unicode_buffer. But the input
928 might be huge or it might time-locked (eg stdin). So instead
929 we go one byte at a time... */
930
931 file_ptr start_point = 0;
932 unsigned int num_read = 0;
933 unsigned int num_chars = 0;
934 unsigned int num_print = 0;
935 int c = 0;
936
937 /* Find a series of string_min characters. Put them into print_buf. */
938 do
939 {
940 if (num_chars >= string_min)
941 break;
942
943 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
944 if (c == EOF)
945 break;
946
947 if (! STRING_ISGRAPHIC (c))
948 {
949 num_chars = num_print = 0;
950 continue;
951 }
952
953 if (num_chars == 0)
954 start_point = num_read - 1;
955
956 if (c < 127)
957 {
958 print_buf[num_print] = c;
959 num_chars ++;
960 num_print ++;
961 continue;
962 }
963
964 if (c < 0xc0)
965 {
966 num_chars = num_print = 0;
967 continue;
968 }
969
970 /* We *might* have a UTF-8 sequence. Time to start peeking. */
971 char utf8[4];
972
973 utf8[0] = c;
974 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
975 if (c == EOF)
976 break;
977 utf8[1] = c;
978
979 if ((utf8[1] & 0xc0) != 0x80)
980 {
981 /* Invalid UTF-8. */
982 putback_buf[num_putback++] = utf8[1];
983 num_chars = num_print = 0;
984 continue;
985 }
986 else if ((utf8[0] & 0x20) == 0)
987 {
988 /* A valid 2-byte UTF-8 encoding. */
989 if (unicode_display == unicode_invalid)
990 {
991 putback_buf[num_putback++] = utf8[1];
992 num_chars = num_print = 0;
993 }
994 else
995 {
996 print_buf[num_print ++] = utf8[0];
997 print_buf[num_print ++] = utf8[1];
998 num_chars ++;
999 }
1000 continue;
1001 }
1002
1003 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1004 if (c == EOF)
1005 break;
1006 utf8[2] = c;
1007
1008 if ((utf8[2] & 0xc0) != 0x80)
1009 {
1010 /* Invalid UTF-8. */
1011 putback_buf[num_putback++] = utf8[2];
1012 putback_buf[num_putback++] = utf8[1];
1013 num_chars = num_print = 0;
1014 continue;
1015 }
1016 else if ((utf8[0] & 0x10) == 0)
1017 {
1018 /* A valid 3-byte UTF-8 encoding. */
1019 if (unicode_display == unicode_invalid)
1020 {
1021 putback_buf[num_putback++] = utf8[2];
1022 putback_buf[num_putback++] = utf8[1];
1023 num_chars = num_print = 0;
1024 }
1025 else
1026 {
1027 print_buf[num_print ++] = utf8[0];
1028 print_buf[num_print ++] = utf8[1];
1029 print_buf[num_print ++] = utf8[2];
1030 num_chars ++;
1031 }
1032 continue;
1033 }
1034
1035 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1036 if (c == EOF)
1037 break;
1038 utf8[3] = c;
1039
1040 if ((utf8[3] & 0xc0) != 0x80)
1041 {
1042 /* Invalid UTF-8. */
1043 putback_buf[num_putback++] = utf8[3];
1044 putback_buf[num_putback++] = utf8[2];
1045 putback_buf[num_putback++] = utf8[1];
1046 num_chars = num_print = 0;
1047 }
1048 /* We have a valid 4-byte UTF-8 encoding. */
1049 else if (unicode_display == unicode_invalid)
1050 {
1051 putback_buf[num_putback++] = utf8[3];
1052 putback_buf[num_putback++] = utf8[1];
1053 putback_buf[num_putback++] = utf8[2];
1054 num_chars = num_print = 0;
1055 }
1056 else
1057 {
1058 print_buf[num_print ++] = utf8[0];
1059 print_buf[num_print ++] = utf8[1];
1060 print_buf[num_print ++] = utf8[2];
1061 print_buf[num_print ++] = utf8[3];
1062 num_chars ++;
1063 }
1064 }
1065 while (1);
1066
1067 if (num_chars >= string_min)
1068 {
1069 /* We know that we have string_min valid characters in print_buf,
1070 and there may be more to come in the stream. Start displaying
1071 them. */
1072
1073 print_filename_and_address (filename, address + start_point);
1074
1075 unsigned int i;
1076 for (i = 0; i < num_print;)
1077 {
1078 if (print_buf[i] < 127)
1079 putchar (print_buf[i++]);
1080 else
1081 i += display_utf8_char (print_buf + i);
1082 }
1083
1084 /* OK so now we have to start read unchecked bytes. */
1085
1086 /* Find a series of string_min characters. Put them into print_buf. */
1087 do
1088 {
1089 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1090 if (c == EOF)
1091 break;
1092
1093 if (! STRING_ISGRAPHIC (c))
1094 break;
1095
1096 if (c < 127)
1097 {
1098 putchar (c);
1099 continue;
1100 }
1101
1102 if (c < 0xc0)
1103 break;
1104
1105 /* We *might* have a UTF-8 sequence. Time to start peeking. */
1106 unsigned char utf8[4];
1107
1108 utf8[0] = c;
1109 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1110 if (c == EOF)
1111 break;
1112 utf8[1] = c;
1113
1114 if ((utf8[1] & 0xc0) != 0x80)
1115 {
1116 /* Invalid UTF-8. */
1117 putback_buf[num_putback++] = utf8[1];
1118 break;
1119 }
1120 else if ((utf8[0] & 0x20) == 0)
1121 {
1122 /* Valid 2-byte UTF-8. */
1123 if (unicode_display == unicode_invalid)
1124 {
1125 putback_buf[num_putback++] = utf8[1];
1126 break;
1127 }
1128 else
1129 {
1130 (void) display_utf8_char (utf8);
1131 continue;
1132 }
1133 }
1134
1135 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1136 if (c == EOF)
1137 break;
1138 utf8[2] = c;
1139
1140 if ((utf8[2] & 0xc0) != 0x80)
1141 {
1142 /* Invalid UTF-8. */
1143 putback_buf[num_putback++] = utf8[2];
1144 putback_buf[num_putback++] = utf8[1];
1145 break;
1146 }
1147 else if ((utf8[0] & 0x10) == 0)
1148 {
1149 /* Valid 3-byte UTF-8. */
1150 if (unicode_display == unicode_invalid)
1151 {
1152 putback_buf[num_putback++] = utf8[2];
1153 putback_buf[num_putback++] = utf8[1];
1154 break;
1155 }
1156 else
1157 {
1158 (void) display_utf8_char (utf8);
1159 continue;
1160 }
1161 }
1162
1163 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1164 if (c == EOF)
1165 break;
1166 utf8[3] = c;
1167
1168 if ((utf8[3] & 0xc0) != 0x80)
1169 {
1170 /* Invalid UTF-8. */
1171 putback_buf[num_putback++] = utf8[3];
1172 putback_buf[num_putback++] = utf8[2];
1173 putback_buf[num_putback++] = utf8[1];
1174 break;
1175 }
1176 else if (unicode_display == unicode_invalid)
1177 {
1178 putback_buf[num_putback++] = utf8[3];
1179 putback_buf[num_putback++] = utf8[2];
1180 putback_buf[num_putback++] = utf8[1];
1181 break;
1182 }
1183 else
1184 /* A valid 4-byte UTF-8 encoding. */
1185 (void) display_utf8_char (utf8);
1186 }
1187 while (1);
1188
1189 if (output_separator)
1190 fputs (output_separator, stdout);
1191 else
1192 putchar ('\n');
1193 }
1194
1195 if (c != EOF)
1196 /* FIXME: Using tail recursion here is lazy, but it works. */
1197 print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1198 }
1199
1200 /* Display strings read in from STREAM. Treat any UTF-8 encoded characters
1201 encountered according to the setting of the unicode_display variable.
1202 The stream is positioned at ADDRESS and is attached to FILENAME. */
1203
1204 static void
1205 print_unicode_stream (const char * filename,
1206 file_ptr address,
1207 FILE * stream)
1208 {
1209 /* Paranoia checks... */
1210 if (filename == NULL
1211 || stream == NULL
1212 || unicode_display == unicode_default
1213 || encoding != 'S'
1214 || encoding_bytes != 1)
1215 {
1216 fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1217 return;
1218 }
1219
1220 /* Allocate space for string_min 4-byte utf-8 characters. */
1221 unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1222 /* We should never have to put back more than 4 bytes. */
1223 unsigned char putback_buf[5];
1224 unsigned int num_putback = 0;
1225
1226 print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1227 free (print_buf);
1228 }
1229
1230 /* Find the strings in file FILENAME, read from STREAM.
1232 Assume that STREAM is positioned so that the next byte read
1233 is at address ADDRESS in the file.
1234
1235 If STREAM is NULL, do not read from it.
1236 The caller can supply a buffer of characters
1237 to be processed before the data in STREAM.
1238 MAGIC is the address of the buffer and
1239 MAGICCOUNT is how many characters are in it.
1240 Those characters come at address ADDRESS and the data in STREAM follow. */
1241
1242 static void
1243 print_strings (const char *filename, FILE *stream, file_ptr address,
1244 int magiccount, char *magic)
1245 {
1246 if (unicode_display != unicode_default)
1247 {
1248 if (magic != NULL)
1249 print_unicode_buffer (filename, address,
1250 (const unsigned char *) magic, magiccount);
1251
1252 if (stream != NULL)
1253 print_unicode_stream (filename, address, stream);
1254 return;
1255 }
1256
1257 char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1258
1259 while (1)
1260 {
1261 file_ptr start;
1262 unsigned int i;
1263 long c;
1264
1265 /* See if the next `string_min' chars are all graphic chars. */
1266 tryline:
1267 start = address;
1268 for (i = 0; i < string_min; i++)
1269 {
1270 c = get_char (stream, &address, &magiccount, &magic);
1271 if (c == EOF)
1272 {
1273 free (buf);
1274 return;
1275 }
1276
1277 if (! STRING_ISGRAPHIC (c))
1278 {
1279 /* Found a non-graphic. Try again starting with next byte. */
1280 unget_part_char (c, &address, &magiccount, &magic);
1281 goto tryline;
1282 }
1283 buf[i] = c;
1284 }
1285
1286 /* We found a run of `string_min' graphic characters. Print up
1287 to the next non-graphic character. */
1288 print_filename_and_address (filename, start);
1289
1290 buf[i] = '\0';
1291 fputs (buf, stdout);
1292
1293 while (1)
1294 {
1295 c = get_char (stream, &address, &magiccount, &magic);
1296 if (c == EOF)
1297 break;
1298 if (! STRING_ISGRAPHIC (c))
1299 {
1300 unget_part_char (c, &address, &magiccount, &magic);
1301 break;
1302 }
1303 putchar (c);
1304 }
1305
1306 if (output_separator)
1307 fputs (output_separator, stdout);
1308 else
1309 putchar ('\n');
1310 }
1311 free (buf);
1312 }
1313
1314 static void
1316 usage (FILE *stream, int status)
1317 {
1318 fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1319 fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1320 fprintf (stream, _(" The options are:\n"));
1321
1322 if (DEFAULT_STRINGS_ALL)
1323 fprintf (stream, _("\
1324 -a - --all Scan the entire file, not just the data section [default]\n\
1325 -d --data Only scan the data sections in the file\n"));
1326 else
1327 fprintf (stream, _("\
1328 -a - --all Scan the entire file, not just the data section\n\
1329 -d --data Only scan the data sections in the file [default]\n"));
1330
1331 fprintf (stream, _("\
1332 -f --print-file-name Print the name of the file before each string\n\
1333 -n <number> Locate & print any sequence of at least <number>\n\
1334 --bytes=<number> displayable characters. (The default is 4).\n\
1335 -t --radix={o,d,x} Print the location of the string in base 8, 10 or 16\n\
1336 -w --include-all-whitespace Include all whitespace as valid string characters\n\
1337 -o An alias for --radix=o\n\
1338 -T --target=<BFDNAME> Specify the binary file format\n\
1339 -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1340 s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1341 --unicode={default|show|invalid|hex|escape|highlight}\n\
1342 -U {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\
1343 -s --output-separator=<string> String used to separate strings in output.\n\
1344 @<file> Read options from <file>\n\
1345 -h --help Display this information\n\
1346 -v -V --version Print the program's version number\n"));
1347 list_supported_targets (program_name, stream);
1348 if (REPORT_BUGS_TO[0] && status == 0)
1349 fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1350 exit (status);
1351 }
1352