texindex.c revision 1.1.1.1 1 /* $NetBSD: texindex.c,v 1.1.1.1 2016/01/14 00:11:29 christos Exp $ */
2
3 /* texindex -- sort TeX index dribble output into an actual index.
4 Id: texindex.c,v 1.11 2004/04/11 17:56:47 karl Exp
5
6 Copyright (C) 1987, 1991, 1992, 1996, 1997, 1998, 1999, 2000, 2001,
7 2002, 2003, 2004 Free Software Foundation, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2, or (at your option)
12 any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307. */
22
23 #include "system.h"
24 #include <getopt.h>
25
26 static char *program_name = "texindex";
27
28 #if defined (emacs)
29 # include "../src/config.h"
30 /* Some s/os.h files redefine these. */
31 # undef read
32 # undef close
33 # undef write
34 # undef open
35 #endif
36
37 #if !defined (HAVE_MEMSET)
38 #undef memset
39 #define memset(ptr, ignore, count) bzero (ptr, count)
40 #endif
41
42 char *mktemp (char *);
43
44 #if !defined (SEEK_SET)
45 # define SEEK_SET 0
46 # define SEEK_CUR 1
47 # define SEEK_END 2
48 #endif /* !SEEK_SET */
49
50 struct linebuffer;
51
52 /* When sorting in core, this structure describes one line
53 and the position and length of its first keyfield. */
54 struct lineinfo
55 {
56 char *text; /* The actual text of the line. */
57 union {
58 char *text; /* The start of the key (for textual comparison). */
59 long number; /* The numeric value (for numeric comparison). */
60 } key;
61 long keylen; /* Length of KEY field. */
62 };
63
64 /* This structure describes a field to use as a sort key. */
65 struct keyfield
66 {
67 int startwords; /* Number of words to skip. */
68 int startchars; /* Number of additional chars to skip. */
69 int endwords; /* Number of words to ignore at end. */
70 int endchars; /* Ditto for characters of last word. */
71 char ignore_blanks; /* Non-zero means ignore spaces and tabs. */
72 char fold_case; /* Non-zero means case doesn't matter. */
73 char reverse; /* Non-zero means compare in reverse order. */
74 char numeric; /* Non-zeros means field is ASCII numeric. */
75 char positional; /* Sort according to file position. */
76 char braced; /* Count balanced-braced groupings as fields. */
77 };
78
79 /* Vector of keyfields to use. */
80 struct keyfield keyfields[3];
81
82 /* Number of keyfields stored in that vector. */
83 int num_keyfields = 3;
84
85 /* Vector of input file names, terminated with a null pointer. */
86 char **infiles;
87
88 /* Vector of corresponding output file names, or NULL, meaning default it
89 (add an `s' to the end). */
90 char **outfiles;
91
92 /* Length of `infiles'. */
93 int num_infiles;
94
95 /* Pointer to the array of pointers to lines being sorted. */
96 char **linearray;
97
98 /* The allocated length of `linearray'. */
99 long nlines;
100
101 /* Directory to use for temporary files. On Unix, it ends with a slash. */
102 char *tempdir;
103
104 /* Number of last temporary file. */
105 int tempcount;
106
107 /* Number of last temporary file already deleted.
108 Temporary files are deleted by `flush_tempfiles' in order of creation. */
109 int last_deleted_tempcount;
110
111 /* During in-core sort, this points to the base of the data block
112 which contains all the lines of data. */
113 char *text_base;
114
115 /* Initially 0; changed to 1 if we want initials in this index. */
116 int need_initials;
117
118 /* Remembers the first initial letter seen in this index, so we can
119 determine whether we need initials in the sorted form. */
120 char first_initial;
121
122 /* Additional command switches .*/
123
124 /* Nonzero means do not delete tempfiles -- for debugging. */
125 int keep_tempfiles;
126
127 /* Forward declarations of functions in this file. */
128 void decode_command (int argc, char **argv);
129 void sort_in_core (char *infile, int total, char *outfile);
130 void sort_offline (char *infile, off_t total, char *outfile);
131 char **parsefile (char *filename, char **nextline, char *data, long int size);
132 char *find_field (struct keyfield *keyfield, char *str, long int *lengthptr);
133 char *find_pos (char *str, int words, int chars, int ignore_blanks);
134 long find_value (char *start, long int length);
135 char *find_braced_pos (char *str, int words, int chars, int ignore_blanks);
136 char *find_braced_end (char *str);
137 void writelines (char **linearray, int nlines, FILE *ostream);
138 int compare_field (struct keyfield *keyfield, char *start1,
139 long int length1, long int pos1, char *start2,
140 long int length2, long int pos2);
141 int compare_full (const void *, const void *);
142 long readline (struct linebuffer *linebuffer, FILE *stream);
143 int merge_files (char **infiles, int nfiles, char *outfile);
144 int merge_direct (char **infiles, int nfiles, char *outfile);
145 void pfatal_with_name (const char *name);
146 void fatal (const char *format, const char *arg);
147 void error (const char *format, const char *arg);
148 void *xmalloc (), *xrealloc ();
149 char *concat (char *s1, char *s2);
150 void flush_tempfiles (int to_count);
151
152 #define MAX_IN_CORE_SORT 500000
154
155 int
156 main (int argc, char **argv)
157 {
158 int i;
159
160 tempcount = 0;
161 last_deleted_tempcount = 0;
162
163 #ifdef HAVE_SETLOCALE
164 /* Set locale via LC_ALL. */
165 setlocale (LC_ALL, "");
166 #endif
167
168 /* Set the text message domain. */
169 bindtextdomain (PACKAGE, LOCALEDIR);
170 textdomain (PACKAGE);
171
172 /* In case we write to a redirected stdout that fails. */
173 /* not ready atexit (close_stdout); */
174
175 /* Describe the kind of sorting to do. */
176 /* The first keyfield uses the first braced field and folds case. */
177 keyfields[0].braced = 1;
178 keyfields[0].fold_case = 1;
179 keyfields[0].endwords = -1;
180 keyfields[0].endchars = -1;
181
182 /* The second keyfield uses the second braced field, numerically. */
183 keyfields[1].braced = 1;
184 keyfields[1].numeric = 1;
185 keyfields[1].startwords = 1;
186 keyfields[1].endwords = -1;
187 keyfields[1].endchars = -1;
188
189 /* The third keyfield (which is ignored while discarding duplicates)
190 compares the whole line. */
191 keyfields[2].endwords = -1;
192 keyfields[2].endchars = -1;
193
194 decode_command (argc, argv);
195
196 /* Process input files completely, one by one. */
197
198 for (i = 0; i < num_infiles; i++)
199 {
200 int desc;
201 off_t ptr;
202 char *outfile;
203 struct stat instat;
204
205 desc = open (infiles[i], O_RDONLY, 0);
206 if (desc < 0)
207 pfatal_with_name (infiles[i]);
208
209 if (stat (infiles[i], &instat))
210 pfatal_with_name (infiles[i]);
211 if (S_ISDIR (instat.st_mode))
212 {
213 #ifdef EISDIR
214 errno = EISDIR;
215 #endif
216 pfatal_with_name (infiles[i]);
217 }
218
219 lseek (desc, (off_t) 0, SEEK_END);
220 ptr = (off_t) lseek (desc, (off_t) 0, SEEK_CUR);
221
222 close (desc);
223
224 outfile = outfiles[i];
225 if (!outfile)
226 outfile = concat (infiles[i], "s");
227
228 need_initials = 0;
229 first_initial = '\0';
230
231 if (ptr < MAX_IN_CORE_SORT)
232 /* Sort a small amount of data. */
233 sort_in_core (infiles[i], (int)ptr, outfile);
234 else
235 sort_offline (infiles[i], ptr, outfile);
236 }
237
238 flush_tempfiles (tempcount);
239 xexit (0);
240 return 0; /* Avoid bogus warnings. */
241 }
242
243 typedef struct
245 {
246 char *long_name;
247 char *short_name;
248 int *variable_ref;
249 int variable_value;
250 char *arg_name;
251 char *doc_string;
252 } TEXINDEX_OPTION;
253
254 TEXINDEX_OPTION texindex_options[] = {
255 { "--help", "-h", (int *)NULL, 0, (char *)NULL,
256 N_("display this help and exit") },
257 { "--keep", "-k", &keep_tempfiles, 1, (char *)NULL,
258 N_("keep temporary files around after processing") },
259 { "--no-keep", 0, &keep_tempfiles, 0, (char *)NULL,
260 N_("do not keep temporary files around after processing (default)") },
261 { "--output", "-o", (int *)NULL, 0, "FILE",
262 N_("send output to FILE") },
263 { "--version", (char *)NULL, (int *)NULL, 0, (char *)NULL,
264 N_("display version information and exit") },
265 { (char *)NULL, (char *)NULL, (int *)NULL, 0, (char *)NULL }
266 };
267
268 void
269 usage (int result_value)
270 {
271 register int i;
272 FILE *f = result_value ? stderr : stdout;
273
274 fprintf (f, _("Usage: %s [OPTION]... FILE...\n"), program_name);
275 fprintf (f, _("Generate a sorted index for each TeX output FILE.\n"));
276 /* Avoid trigraph nonsense. */
277 fprintf (f,
278 _("Usually FILE... is specified as `foo.%c%c\' for a document `foo.texi'.\n"),
279 '?', '?'); /* avoid trigraph in cat-id-tbl.c */
280 fprintf (f, _("\nOptions:\n"));
281
282 for (i = 0; texindex_options[i].long_name; i++)
283 {
284 putc (' ', f);
285
286 if (texindex_options[i].short_name)
287 fprintf (f, "%s, ", texindex_options[i].short_name);
288
289 fprintf (f, "%s %s",
290 texindex_options[i].long_name,
291 texindex_options[i].arg_name
292 ? texindex_options[i].arg_name : "");
293
294 fprintf (f, "\t%s\n", _(texindex_options[i].doc_string));
295 }
296 fputs (_("\n\
297 Email bug reports to bug-texinfo (at) gnu.org,\n\
298 general questions and discussion to help-texinfo (at) gnu.org.\n\
299 Texinfo home page: http://www.gnu.org/software/texinfo/"), f);
300 fputs ("\n", f);
301
302 xexit (result_value);
303 }
304
305 /* Decode the command line arguments to set the parameter variables
306 and set up the vector of keyfields and the vector of input files. */
307
308 void
309 decode_command (int argc, char **argv)
310 {
311 int arg_index = 1;
312 char **ip;
313 char **op;
314
315 /* Store default values into parameter variables. */
316
317 tempdir = getenv ("TMPDIR");
318 if (tempdir == NULL)
319 tempdir = getenv ("TEMP");
320 if (tempdir == NULL)
321 tempdir = getenv ("TMP");
322 if (tempdir == NULL)
323 tempdir = DEFAULT_TMPDIR;
324 else
325 tempdir = concat (tempdir, "/");
326
327 keep_tempfiles = 0;
328
329 /* Allocate ARGC input files, which must be enough. */
330
331 infiles = (char **) xmalloc (argc * sizeof (char *));
332 outfiles = (char **) xmalloc (argc * sizeof (char *));
333 ip = infiles;
334 op = outfiles;
335
336 while (arg_index < argc)
337 {
338 char *arg = argv[arg_index++];
339
340 if (*arg == '-')
341 {
342 if (strcmp (arg, "--version") == 0)
343 {
344 printf ("texindex (GNU %s) %s\n", PACKAGE, VERSION);
345 puts ("");
346 puts ("Copyright (C) 2004 Free Software Foundation, Inc.");
347 printf (_("There is NO warranty. You may redistribute this software\n\
348 under the terms of the GNU General Public License.\n\
349 For more information about these matters, see the files named COPYING.\n"));
350 xexit (0);
351 }
352 else if ((strcmp (arg, "--keep") == 0) ||
353 (strcmp (arg, "-k") == 0))
354 {
355 keep_tempfiles = 1;
356 }
357 else if ((strcmp (arg, "--help") == 0) ||
358 (strcmp (arg, "-h") == 0))
359 {
360 usage (0);
361 }
362 else if ((strcmp (arg, "--output") == 0) ||
363 (strcmp (arg, "-o") == 0))
364 {
365 if (argv[arg_index] != (char *)NULL)
366 {
367 arg_index++;
368 if (op > outfiles)
369 *(op - 1) = argv[arg_index];
370 }
371 else
372 usage (1);
373 }
374 else
375 usage (1);
376 }
377 else
378 {
379 *ip++ = arg;
380 *op++ = (char *)NULL;
381 }
382 }
383
384 /* Record number of keyfields and terminate list of filenames. */
385 num_infiles = ip - infiles;
386 *ip = (char *)NULL;
387 if (num_infiles == 0)
388 usage (1);
389 }
390
391 /* Return a name for temporary file COUNT. */
393
394 static char *
395 maketempname (int count)
396 {
397 static char *tempbase = NULL;
398 char tempsuffix[10];
399
400 if (!tempbase)
401 {
402 int fd;
403 tempbase = concat (tempdir, "txidxXXXXXX");
404
405 fd = mkstemp (tempbase);
406 if (fd == -1)
407 pfatal_with_name (tempbase);
408 }
409
410 sprintf (tempsuffix, ".%d", count);
411 return concat (tempbase, tempsuffix);
412 }
413
414
415 /* Delete all temporary files up to TO_COUNT. */
416
417 void
418 flush_tempfiles (int to_count)
419 {
420 if (keep_tempfiles)
421 return;
422 while (last_deleted_tempcount < to_count)
423 unlink (maketempname (++last_deleted_tempcount));
424 }
425
426
427 /* Compare LINE1 and LINE2 according to the specified set of keyfields. */
429
430 int
431 compare_full (const void *p1, const void *p2)
432 {
433 char **line1 = (char **) p1;
434 char **line2 = (char **) p2;
435 int i;
436
437 /* Compare using the first keyfield;
438 if that does not distinguish the lines, try the second keyfield;
439 and so on. */
440
441 for (i = 0; i < num_keyfields; i++)
442 {
443 long length1, length2;
444 char *start1 = find_field (&keyfields[i], *line1, &length1);
445 char *start2 = find_field (&keyfields[i], *line2, &length2);
446 int tem = compare_field (&keyfields[i], start1, length1,
447 *line1 - text_base,
448 start2, length2, *line2 - text_base);
449 if (tem)
450 {
451 if (keyfields[i].reverse)
452 return -tem;
453 return tem;
454 }
455 }
456
457 return 0; /* Lines match exactly. */
458 }
459
460 /* Compare LINE1 and LINE2, described by structures
461 in which the first keyfield is identified in advance.
462 For positional sorting, assumes that the order of the lines in core
463 reflects their nominal order. */
464 int
465 compare_prepared (const void *p1, const void *p2)
466 {
467 struct lineinfo *line1 = (struct lineinfo *) p1;
468 struct lineinfo *line2 = (struct lineinfo *) p2;
469 int i;
470 int tem;
471 char *text1, *text2;
472
473 /* Compare using the first keyfield, which has been found for us already. */
474 if (keyfields->positional)
475 {
476 if (line1->text - text_base > line2->text - text_base)
477 tem = 1;
478 else
479 tem = -1;
480 }
481 else if (keyfields->numeric)
482 tem = line1->key.number - line2->key.number;
483 else
484 tem = compare_field (keyfields, line1->key.text, line1->keylen, 0,
485 line2->key.text, line2->keylen, 0);
486 if (tem)
487 {
488 if (keyfields->reverse)
489 return -tem;
490 return tem;
491 }
492
493 text1 = line1->text;
494 text2 = line2->text;
495
496 /* Compare using the second keyfield;
497 if that does not distinguish the lines, try the third keyfield;
498 and so on. */
499
500 for (i = 1; i < num_keyfields; i++)
501 {
502 long length1, length2;
503 char *start1 = find_field (&keyfields[i], text1, &length1);
504 char *start2 = find_field (&keyfields[i], text2, &length2);
505 int tem = compare_field (&keyfields[i], start1, length1,
506 text1 - text_base,
507 start2, length2, text2 - text_base);
508 if (tem)
509 {
510 if (keyfields[i].reverse)
511 return -tem;
512 return tem;
513 }
514 }
515
516 return 0; /* Lines match exactly. */
517 }
518
519 /* Like compare_full but more general.
520 You can pass any strings, and you can say how many keyfields to use.
521 POS1 and POS2 should indicate the nominal positional ordering of
522 the two lines in the input. */
523
524 int
525 compare_general (char *str1, char *str2, long int pos1, long int pos2, int use_keyfields)
526 {
527 int i;
528
529 /* Compare using the first keyfield;
530 if that does not distinguish the lines, try the second keyfield;
531 and so on. */
532
533 for (i = 0; i < use_keyfields; i++)
534 {
535 long length1, length2;
536 char *start1 = find_field (&keyfields[i], str1, &length1);
537 char *start2 = find_field (&keyfields[i], str2, &length2);
538 int tem = compare_field (&keyfields[i], start1, length1, pos1,
539 start2, length2, pos2);
540 if (tem)
541 {
542 if (keyfields[i].reverse)
543 return -tem;
544 return tem;
545 }
546 }
547
548 return 0; /* Lines match exactly. */
549 }
550
551 /* Find the start and length of a field in STR according to KEYFIELD.
552 A pointer to the starting character is returned, and the length
553 is stored into the int that LENGTHPTR points to. */
554
555 char *
556 find_field (struct keyfield *keyfield, char *str, long int *lengthptr)
557 {
558 char *start;
559 char *end;
560 char *(*fun) ();
561
562 if (keyfield->braced)
563 fun = find_braced_pos;
564 else
565 fun = find_pos;
566
567 start = (*fun) (str, keyfield->startwords, keyfield->startchars,
568 keyfield->ignore_blanks);
569 if (keyfield->endwords < 0)
570 {
571 if (keyfield->braced)
572 end = find_braced_end (start);
573 else
574 {
575 end = start;
576 while (*end && *end != '\n')
577 end++;
578 }
579 }
580 else
581 {
582 end = (*fun) (str, keyfield->endwords, keyfield->endchars, 0);
583 if (end - str < start - str)
584 end = start;
585 }
586 *lengthptr = end - start;
587 return start;
588 }
589
590 /* Return a pointer to a specified place within STR,
591 skipping (from the beginning) WORDS words and then CHARS chars.
592 If IGNORE_BLANKS is nonzero, we skip all blanks
593 after finding the specified word. */
594
595 char *
596 find_pos (char *str, int words, int chars, int ignore_blanks)
597 {
598 int i;
599 char *p = str;
600
601 for (i = 0; i < words; i++)
602 {
603 char c;
604 /* Find next bunch of nonblanks and skip them. */
605 while ((c = *p) == ' ' || c == '\t')
606 p++;
607 while ((c = *p) && c != '\n' && !(c == ' ' || c == '\t'))
608 p++;
609 if (!*p || *p == '\n')
610 return p;
611 }
612
613 while (*p == ' ' || *p == '\t')
614 p++;
615
616 for (i = 0; i < chars; i++)
617 {
618 if (!*p || *p == '\n')
619 break;
620 p++;
621 }
622 return p;
623 }
624
625 /* Like find_pos but assumes that each field is surrounded by braces
626 and that braces within fields are balanced. */
627
628 char *
629 find_braced_pos (char *str, int words, int chars, int ignore_blanks)
630 {
631 int i;
632 int bracelevel;
633 char *p = str;
634 char c;
635
636 for (i = 0; i < words; i++)
637 {
638 bracelevel = 1;
639 while ((c = *p++) != '{' && c != '\n' && c)
640 /* Do nothing. */ ;
641 if (c != '{')
642 return p - 1;
643 while (bracelevel)
644 {
645 c = *p++;
646 if (c == '{')
647 bracelevel++;
648 if (c == '}')
649 bracelevel--;
650 if (c == 0 || c == '\n')
651 return p - 1;
652 }
653 }
654
655 while ((c = *p++) != '{' && c != '\n' && c)
656 /* Do nothing. */ ;
657
658 if (c != '{')
659 return p - 1;
660
661 if (ignore_blanks)
662 while ((c = *p) == ' ' || c == '\t')
663 p++;
664
665 for (i = 0; i < chars; i++)
666 {
667 if (!*p || *p == '\n')
668 break;
669 p++;
670 }
671 return p;
672 }
673
674 /* Find the end of the balanced-brace field which starts at STR.
675 The position returned is just before the closing brace. */
676
677 char *
678 find_braced_end (char *str)
679 {
680 int bracelevel;
681 char *p = str;
682 char c;
683
684 bracelevel = 1;
685 while (bracelevel)
686 {
687 c = *p++;
688 if (c == '{')
689 bracelevel++;
690 if (c == '}')
691 bracelevel--;
692 if (c == 0 || c == '\n')
693 return p - 1;
694 }
695 return p - 1;
696 }
697
698 long
699 find_value (char *start, long int length)
700 {
701 while (length != 0L)
702 {
703 if (isdigit (*start))
704 return atol (start);
705 length--;
706 start++;
707 }
708 return 0l;
709 }
710
711 /* Vector used to translate characters for comparison.
712 This is how we make all alphanumerics follow all else,
713 and ignore case in the first sorting. */
714 int char_order[256];
715
716 void
717 init_char_order (void)
718 {
719 int i;
720 for (i = 1; i < 256; i++)
721 char_order[i] = i;
722
723 for (i = '0'; i <= '9'; i++)
724 char_order[i] += 512;
725
726 for (i = 'a'; i <= 'z'; i++)
727 {
728 char_order[i] = 512 + i;
729 char_order[i + 'A' - 'a'] = 512 + i;
730 }
731 }
732
733 /* Compare two fields (each specified as a start pointer and a character count)
734 according to KEYFIELD.
735 The sign of the value reports the relation between the fields. */
736
737 int
738 compare_field (struct keyfield *keyfield, char *start1, long int length1,
739 long int pos1, char *start2, long int length2, long int pos2)
740 {
741 if (keyfields->positional)
742 {
743 if (pos1 > pos2)
744 return 1;
745 else
746 return -1;
747 }
748 if (keyfield->numeric)
749 {
750 long value = find_value (start1, length1) - find_value (start2, length2);
751 if (value > 0)
752 return 1;
753 if (value < 0)
754 return -1;
755 return 0;
756 }
757 else
758 {
759 char *p1 = start1;
760 char *p2 = start2;
761 char *e1 = start1 + length1;
762 char *e2 = start2 + length2;
763
764 while (1)
765 {
766 int c1, c2;
767
768 if (p1 == e1)
769 c1 = 0;
770 else
771 c1 = *p1++;
772 if (p2 == e2)
773 c2 = 0;
774 else
775 c2 = *p2++;
776
777 if (char_order[c1] != char_order[c2])
778 return char_order[c1] - char_order[c2];
779 if (!c1)
780 break;
781 }
782
783 /* Strings are equal except possibly for case. */
784 p1 = start1;
785 p2 = start2;
786 while (1)
787 {
788 int c1, c2;
789
790 if (p1 == e1)
791 c1 = 0;
792 else
793 c1 = *p1++;
794 if (p2 == e2)
795 c2 = 0;
796 else
797 c2 = *p2++;
798
799 if (c1 != c2)
800 /* Reverse sign here so upper case comes out last. */
801 return c2 - c1;
802 if (!c1)
803 break;
804 }
805
806 return 0;
807 }
808 }
809
810 /* A `struct linebuffer' is a structure which holds a line of text.
812 `readline' reads a line from a stream into a linebuffer
813 and works regardless of the length of the line. */
814
815 struct linebuffer
816 {
817 long size;
818 char *buffer;
819 };
820
821 /* Initialize LINEBUFFER for use. */
822
823 void
824 initbuffer (struct linebuffer *linebuffer)
825 {
826 linebuffer->size = 200;
827 linebuffer->buffer = (char *) xmalloc (200);
828 }
829
830 /* Read a line of text from STREAM into LINEBUFFER.
831 Return the length of the line. */
832
833 long
834 readline (struct linebuffer *linebuffer, FILE *stream)
835 {
836 char *buffer = linebuffer->buffer;
837 char *p = linebuffer->buffer;
838 char *end = p + linebuffer->size;
839
840 while (1)
841 {
842 int c = getc (stream);
843 if (p == end)
844 {
845 buffer = (char *) xrealloc (buffer, linebuffer->size *= 2);
846 p += buffer - linebuffer->buffer;
847 end += buffer - linebuffer->buffer;
848 linebuffer->buffer = buffer;
849 }
850 if (c < 0 || c == '\n')
851 {
852 *p = 0;
853 break;
854 }
855 *p++ = c;
856 }
857
858 return p - buffer;
859 }
860
861 /* Sort an input file too big to sort in core. */
863
864 void
865 sort_offline (char *infile, off_t total, char *outfile)
866 {
867 /* More than enough. */
868 int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT;
869 char **tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
870 FILE *istream = fopen (infile, "r");
871 int i;
872 struct linebuffer lb;
873 long linelength;
874 int failure = 0;
875
876 initbuffer (&lb);
877
878 /* Read in one line of input data. */
879
880 linelength = readline (&lb, istream);
881
882 if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
883 {
884 error (_("%s: not a texinfo index file"), infile);
885 return;
886 }
887
888 /* Split up the input into `ntemps' temporary files, or maybe fewer,
889 and put the new files' names into `tempfiles' */
890
891 for (i = 0; i < ntemps; i++)
892 {
893 char *outname = maketempname (++tempcount);
894 FILE *ostream = fopen (outname, "w");
895 long tempsize = 0;
896
897 if (!ostream)
898 pfatal_with_name (outname);
899 tempfiles[i] = outname;
900
901 /* Copy lines into this temp file as long as it does not make file
902 "too big" or until there are no more lines. */
903
904 while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT)
905 {
906 tempsize += linelength + 1;
907 fputs (lb.buffer, ostream);
908 putc ('\n', ostream);
909
910 /* Read another line of input data. */
911
912 linelength = readline (&lb, istream);
913 if (!linelength && feof (istream))
914 break;
915
916 if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
917 {
918 error (_("%s: not a texinfo index file"), infile);
919 failure = 1;
920 goto fail;
921 }
922 }
923 fclose (ostream);
924 if (feof (istream))
925 break;
926 }
927
928 free (lb.buffer);
929
930 fail:
931 /* Record number of temp files we actually needed. */
932
933 ntemps = i;
934
935 /* Sort each tempfile into another tempfile.
936 Delete the first set of tempfiles and put the names of the second
937 into `tempfiles'. */
938
939 for (i = 0; i < ntemps; i++)
940 {
941 char *newtemp = maketempname (++tempcount);
942 sort_in_core (tempfiles[i], MAX_IN_CORE_SORT, newtemp);
943 if (!keep_tempfiles)
944 unlink (tempfiles[i]);
945 tempfiles[i] = newtemp;
946 }
947
948 if (failure)
949 return;
950
951 /* Merge the tempfiles together and indexify. */
952
953 merge_files (tempfiles, ntemps, outfile);
954 }
955
956 /* Sort INFILE, whose size is TOTAL,
958 assuming that is small enough to be done in-core,
959 then indexify it and send the output to OUTFILE (or to stdout). */
960
961 void
962 sort_in_core (char *infile, int total, char *outfile)
963 {
964 char **nextline;
965 char *data = (char *) xmalloc (total + 1);
966 char *file_data;
967 long file_size;
968 int i;
969 FILE *ostream = stdout;
970 struct lineinfo *lineinfo;
971
972 /* Read the contents of the file into the moby array `data'. */
973
974 int desc = open (infile, O_RDONLY, 0);
975
976 if (desc < 0)
977 fatal (_("failure reopening %s"), infile);
978 for (file_size = 0;;)
979 {
980 i = read (desc, data + file_size, total - file_size);
981 if (i <= 0)
982 break;
983 file_size += i;
984 }
985 file_data = data;
986 data[file_size] = 0;
987
988 close (desc);
989
990 if (file_size > 0 && data[0] != '\\' && data[0] != '@')
991 {
992 error (_("%s: not a texinfo index file"), infile);
993 return;
994 }
995
996 init_char_order ();
997
998 /* Sort routines want to know this address. */
999
1000 text_base = data;
1001
1002 /* Create the array of pointers to lines, with a default size
1003 frequently enough. */
1004
1005 nlines = total / 50;
1006 if (!nlines)
1007 nlines = 2;
1008 linearray = (char **) xmalloc (nlines * sizeof (char *));
1009
1010 /* `nextline' points to the next free slot in this array.
1011 `nlines' is the allocated size. */
1012
1013 nextline = linearray;
1014
1015 /* Parse the input file's data, and make entries for the lines. */
1016
1017 nextline = parsefile (infile, nextline, file_data, file_size);
1018 if (nextline == 0)
1019 {
1020 error (_("%s: not a texinfo index file"), infile);
1021 return;
1022 }
1023
1024 /* Sort the lines. */
1025
1026 /* If we have enough space, find the first keyfield of each line in advance.
1027 Make a `struct lineinfo' for each line, which records the keyfield
1028 as well as the line, and sort them. */
1029
1030 lineinfo = malloc ((nextline - linearray) * sizeof (struct lineinfo));
1031
1032 if (lineinfo)
1033 {
1034 struct lineinfo *lp;
1035 char **p;
1036
1037 for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1038 {
1039 lp->text = *p;
1040 lp->key.text = find_field (keyfields, *p, &lp->keylen);
1041 if (keyfields->numeric)
1042 lp->key.number = find_value (lp->key.text, lp->keylen);
1043 }
1044
1045 qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo),
1046 compare_prepared);
1047
1048 for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1049 *p = lp->text;
1050
1051 free (lineinfo);
1052 }
1053 else
1054 qsort (linearray, nextline - linearray, sizeof (char *), compare_full);
1055
1056 /* Open the output file. */
1057
1058 if (outfile)
1059 {
1060 ostream = fopen (outfile, "w");
1061 if (!ostream)
1062 pfatal_with_name (outfile);
1063 }
1064
1065 writelines (linearray, nextline - linearray, ostream);
1066 if (outfile)
1067 fclose (ostream);
1068
1069 free (linearray);
1070 free (data);
1071 }
1072
1073 /* Parse an input string in core into lines.
1075 DATA is the input string, and SIZE is its length.
1076 Data goes in LINEARRAY starting at NEXTLINE.
1077 The value returned is the first entry in LINEARRAY still unused.
1078 Value 0 means input file contents are invalid. */
1079
1080 char **
1081 parsefile (char *filename, char **nextline, char *data, long int size)
1082 {
1083 char *p, *end;
1084 char **line = nextline;
1085
1086 p = data;
1087 end = p + size;
1088 *end = 0;
1089
1090 while (p != end)
1091 {
1092 if (p[0] != '\\' && p[0] != '@')
1093 return 0;
1094
1095 *line = p;
1096
1097 /* Find the first letter of the first field of this line. If it
1098 is different from the first letter of the first field of the
1099 first line, we need initial headers in the output index. */
1100 while (*p && *p != '{')
1101 p++;
1102 if (p == end)
1103 return 0;
1104 p++;
1105 if (first_initial)
1106 {
1107 if (first_initial != toupper (*p))
1108 need_initials = 1;
1109 }
1110 else
1111 first_initial = toupper (*p);
1112
1113 while (*p && *p != '\n')
1114 p++;
1115 if (p != end)
1116 p++;
1117
1118 line++;
1119 if (line == linearray + nlines)
1120 {
1121 char **old = linearray;
1122 linearray = xrealloc (linearray, sizeof (char *) * (nlines *= 4));
1123 line += linearray - old;
1124 }
1125 }
1126
1127 return line;
1128 }
1129
1130 /* Indexification is a filter applied to the sorted lines
1132 as they are being written to the output file.
1133 Multiple entries for the same name, with different page numbers,
1134 get combined into a single entry with multiple page numbers.
1135 The first braced field, which is used for sorting, is discarded.
1136 However, its first character is examined, folded to lower case,
1137 and if it is different from that in the previous line fed to us
1138 a \initial line is written with one argument, the new initial.
1139
1140 If an entry has four braced fields, then the second and third
1141 constitute primary and secondary names.
1142 In this case, each change of primary name
1143 generates a \primary line which contains only the primary name,
1144 and in between these are \secondary lines which contain
1145 just a secondary name and page numbers. */
1146
1147 /* The last primary name we wrote a \primary entry for.
1148 If only one level of indexing is being done, this is the last name seen. */
1149 char *lastprimary;
1150 /* Length of storage allocated for lastprimary. */
1151 int lastprimarylength;
1152
1153 /* Similar, for the secondary name. */
1154 char *lastsecondary;
1155 int lastsecondarylength;
1156
1157 /* Zero if we are not in the middle of writing an entry.
1158 One if we have written the beginning of an entry but have not
1159 yet written any page numbers into it.
1160 Greater than one if we have written the beginning of an entry
1161 plus at least one page number. */
1162 int pending;
1163
1164 /* The initial (for sorting purposes) of the last primary entry written.
1165 When this changes, a \initial {c} line is written */
1166
1167 char *lastinitial;
1168
1169 int lastinitiallength;
1170
1171 /* When we need a string of length 1 for the value of lastinitial,
1172 store it here. */
1173
1174 char lastinitial1[2];
1175
1176 /* Initialize static storage for writing an index. */
1177
1178 void
1179 init_index (void)
1180 {
1181 pending = 0;
1182 lastinitial = lastinitial1;
1183 lastinitial1[0] = 0;
1184 lastinitial1[1] = 0;
1185 lastinitiallength = 0;
1186 lastprimarylength = 100;
1187 lastprimary = (char *) xmalloc (lastprimarylength + 1);
1188 memset (lastprimary, '\0', lastprimarylength + 1);
1189 lastsecondarylength = 100;
1190 lastsecondary = (char *) xmalloc (lastsecondarylength + 1);
1191 memset (lastsecondary, '\0', lastsecondarylength + 1);
1192 }
1193
1194 /* Indexify. Merge entries for the same name,
1195 insert headers for each initial character, etc. */
1196
1197 void
1198 indexify (char *line, FILE *ostream)
1199 {
1200 char *primary, *secondary, *pagenumber;
1201 int primarylength, secondarylength = 0, pagelength;
1202 int nosecondary;
1203 int initiallength;
1204 char *initial;
1205 char initial1[2];
1206 register char *p;
1207
1208 /* First, analyze the parts of the entry fed to us this time. */
1209
1210 p = find_braced_pos (line, 0, 0, 0);
1211 if (*p == '{')
1212 {
1213 initial = p;
1214 /* Get length of inner pair of braces starting at `p',
1215 including that inner pair of braces. */
1216 initiallength = find_braced_end (p + 1) + 1 - p;
1217 }
1218 else
1219 {
1220 initial = initial1;
1221 initial1[0] = toupper (*p);
1222 initial1[1] = 0;
1223 initiallength = 1;
1224 }
1225
1226 pagenumber = find_braced_pos (line, 1, 0, 0);
1227 pagelength = find_braced_end (pagenumber) - pagenumber;
1228 if (pagelength == 0)
1229 fatal (_("No page number in %s"), line);
1230
1231 primary = find_braced_pos (line, 2, 0, 0);
1232 primarylength = find_braced_end (primary) - primary;
1233
1234 secondary = find_braced_pos (line, 3, 0, 0);
1235 nosecondary = !*secondary;
1236 if (!nosecondary)
1237 secondarylength = find_braced_end (secondary) - secondary;
1238
1239 /* If the primary is different from before, make a new primary entry. */
1240 if (strncmp (primary, lastprimary, primarylength))
1241 {
1242 /* Close off current secondary entry first, if one is open. */
1243 if (pending)
1244 {
1245 fputs ("}\n", ostream);
1246 pending = 0;
1247 }
1248
1249 /* If this primary has a different initial, include an entry for
1250 the initial. */
1251 if (need_initials &&
1252 (initiallength != lastinitiallength ||
1253 strncmp (initial, lastinitial, initiallength)))
1254 {
1255 fprintf (ostream, "\\initial {");
1256 fwrite (initial, 1, initiallength, ostream);
1257 fputs ("}\n", ostream);
1258 if (initial == initial1)
1259 {
1260 lastinitial = lastinitial1;
1261 *lastinitial1 = *initial1;
1262 }
1263 else
1264 {
1265 lastinitial = initial;
1266 }
1267 lastinitiallength = initiallength;
1268 }
1269
1270 /* Make the entry for the primary. */
1271 if (nosecondary)
1272 fputs ("\\entry {", ostream);
1273 else
1274 fputs ("\\primary {", ostream);
1275 fwrite (primary, primarylength, 1, ostream);
1276 if (nosecondary)
1277 {
1278 fputs ("}{", ostream);
1279 pending = 1;
1280 }
1281 else
1282 fputs ("}\n", ostream);
1283
1284 /* Record name of most recent primary. */
1285 if (lastprimarylength < primarylength)
1286 {
1287 lastprimarylength = primarylength + 100;
1288 lastprimary = (char *) xrealloc (lastprimary,
1289 1 + lastprimarylength);
1290 }
1291 strncpy (lastprimary, primary, primarylength);
1292 lastprimary[primarylength] = 0;
1293
1294 /* There is no current secondary within this primary, now. */
1295 lastsecondary[0] = 0;
1296 }
1297
1298 /* Should not have an entry with no subtopic following one with a
1299 subtopic. */
1300
1301 if (nosecondary && *lastsecondary)
1302 error (_("entry %s follows an entry with a secondary name"), line);
1303
1304 /* Start a new secondary entry if necessary. */
1305 if (!nosecondary && strncmp (secondary, lastsecondary, secondarylength))
1306 {
1307 if (pending)
1308 {
1309 fputs ("}\n", ostream);
1310 pending = 0;
1311 }
1312
1313 /* Write the entry for the secondary. */
1314 fputs ("\\secondary {", ostream);
1315 fwrite (secondary, secondarylength, 1, ostream);
1316 fputs ("}{", ostream);
1317 pending = 1;
1318
1319 /* Record name of most recent secondary. */
1320 if (lastsecondarylength < secondarylength)
1321 {
1322 lastsecondarylength = secondarylength + 100;
1323 lastsecondary = (char *) xrealloc (lastsecondary,
1324 1 + lastsecondarylength);
1325 }
1326 strncpy (lastsecondary, secondary, secondarylength);
1327 lastsecondary[secondarylength] = 0;
1328 }
1329
1330 /* Here to add one more page number to the current entry. */
1331 if (pending++ != 1)
1332 fputs (", ", ostream); /* Punctuate first, if this is not the first. */
1333 fwrite (pagenumber, pagelength, 1, ostream);
1334 }
1335
1336 /* Close out any unfinished output entry. */
1337
1338 void
1339 finish_index (FILE *ostream)
1340 {
1341 if (pending)
1342 fputs ("}\n", ostream);
1343 free (lastprimary);
1344 free (lastsecondary);
1345 }
1346
1347 /* Copy the lines in the sorted order.
1349 Each line is copied out of the input file it was found in. */
1350
1351 void
1352 writelines (char **linearray, int nlines, FILE *ostream)
1353 {
1354 char **stop_line = linearray + nlines;
1355 char **next_line;
1356
1357 init_index ();
1358
1359 /* Output the text of the lines, and free the buffer space. */
1360
1361 for (next_line = linearray; next_line != stop_line; next_line++)
1362 {
1363 /* If -u was specified, output the line only if distinct from
1364 previous one. */
1365 if (next_line == linearray
1366 /* Compare previous line with this one, using only the
1367 explicitly specd keyfields. */
1368 || compare_general (*(next_line - 1), *next_line, 0L, 0L,
1369 num_keyfields - 1))
1370 {
1371 char *p = *next_line;
1372 char c;
1373
1374 while ((c = *p++) && c != '\n')
1375 /* Do nothing. */ ;
1376 *(p - 1) = 0;
1377 indexify (*next_line, ostream);
1378 }
1379 }
1380
1381 finish_index (ostream);
1382 }
1383
1384 /* Assume (and optionally verify) that each input file is sorted;
1386 merge them and output the result.
1387 Returns nonzero if any input file fails to be sorted.
1388
1389 This is the high-level interface that can handle an unlimited
1390 number of files. */
1391
1392 #define MAX_DIRECT_MERGE 10
1393
1394 int
1395 merge_files (char **infiles, int nfiles, char *outfile)
1396 {
1397 char **tempfiles;
1398 int ntemps;
1399 int i;
1400 int value = 0;
1401 int start_tempcount = tempcount;
1402
1403 if (nfiles <= MAX_DIRECT_MERGE)
1404 return merge_direct (infiles, nfiles, outfile);
1405
1406 /* Merge groups of MAX_DIRECT_MERGE input files at a time,
1407 making a temporary file to hold each group's result. */
1408
1409 ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE;
1410 tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
1411 for (i = 0; i < ntemps; i++)
1412 {
1413 int nf = MAX_DIRECT_MERGE;
1414 if (i + 1 == ntemps)
1415 nf = nfiles - i * MAX_DIRECT_MERGE;
1416 tempfiles[i] = maketempname (++tempcount);
1417 value |= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]);
1418 }
1419
1420 /* All temporary files that existed before are no longer needed
1421 since their contents have been merged into our new tempfiles.
1422 So delete them. */
1423 flush_tempfiles (start_tempcount);
1424
1425 /* Now merge the temporary files we created. */
1426
1427 merge_files (tempfiles, ntemps, outfile);
1428
1429 free (tempfiles);
1430
1431 return value;
1432 }
1433
1434 /* Assume (and optionally verify) that each input file is sorted;
1436 merge them and output the result.
1437 Returns nonzero if any input file fails to be sorted.
1438
1439 This version of merging will not work if the number of
1440 input files gets too high. Higher level functions
1441 use it only with a bounded number of input files. */
1442
1443 int
1444 merge_direct (char **infiles, int nfiles, char *outfile)
1445 {
1446 struct linebuffer *lb1, *lb2;
1447 struct linebuffer **thisline, **prevline;
1448 FILE **streams;
1449 int i;
1450 int nleft;
1451 int lossage = 0;
1452 int *file_lossage;
1453 struct linebuffer *prev_out = 0;
1454 FILE *ostream = stdout;
1455
1456 if (outfile)
1457 {
1458 ostream = fopen (outfile, "w");
1459 }
1460 if (!ostream)
1461 pfatal_with_name (outfile);
1462
1463 init_index ();
1464
1465 if (nfiles == 0)
1466 {
1467 if (outfile)
1468 fclose (ostream);
1469 return 0;
1470 }
1471
1472 /* For each file, make two line buffers. Also, for each file, there
1473 is an element of `thisline' which points at any time to one of the
1474 file's two buffers, and an element of `prevline' which points to
1475 the other buffer. `thisline' is supposed to point to the next
1476 available line from the file, while `prevline' holds the last file
1477 line used, which is remembered so that we can verify that the file
1478 is properly sorted. */
1479
1480 /* lb1 and lb2 contain one buffer each per file. */
1481 lb1 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1482 lb2 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1483
1484 /* thisline[i] points to the linebuffer holding the next available
1485 line in file i, or is zero if there are no lines left in that file. */
1486 thisline = (struct linebuffer **)
1487 xmalloc (nfiles * sizeof (struct linebuffer *));
1488 /* prevline[i] points to the linebuffer holding the last used line
1489 from file i. This is just for verifying that file i is properly
1490 sorted. */
1491 prevline = (struct linebuffer **)
1492 xmalloc (nfiles * sizeof (struct linebuffer *));
1493 /* streams[i] holds the input stream for file i. */
1494 streams = (FILE **) xmalloc (nfiles * sizeof (FILE *));
1495 /* file_lossage[i] is nonzero if we already know file i is not
1496 properly sorted. */
1497 file_lossage = (int *) xmalloc (nfiles * sizeof (int));
1498
1499 /* Allocate and initialize all that storage. */
1500
1501 for (i = 0; i < nfiles; i++)
1502 {
1503 initbuffer (&lb1[i]);
1504 initbuffer (&lb2[i]);
1505 thisline[i] = &lb1[i];
1506 prevline[i] = &lb2[i];
1507 file_lossage[i] = 0;
1508 streams[i] = fopen (infiles[i], "r");
1509 if (!streams[i])
1510 pfatal_with_name (infiles[i]);
1511
1512 readline (thisline[i], streams[i]);
1513 }
1514
1515 /* Keep count of number of files not at eof. */
1516 nleft = nfiles;
1517
1518 while (nleft)
1519 {
1520 struct linebuffer *best = 0;
1521 struct linebuffer *exch;
1522 int bestfile = -1;
1523 int i;
1524
1525 /* Look at the next avail line of each file; choose the least one. */
1526
1527 for (i = 0; i < nfiles; i++)
1528 {
1529 if (thisline[i] &&
1530 (!best ||
1531 0 < compare_general (best->buffer, thisline[i]->buffer,
1532 (long) bestfile, (long) i, num_keyfields)))
1533 {
1534 best = thisline[i];
1535 bestfile = i;
1536 }
1537 }
1538
1539 /* Output that line, unless it matches the previous one and we
1540 don't want duplicates. */
1541
1542 if (!(prev_out &&
1543 !compare_general (prev_out->buffer,
1544 best->buffer, 0L, 1L, num_keyfields - 1)))
1545 indexify (best->buffer, ostream);
1546 prev_out = best;
1547
1548 /* Now make the line the previous of its file, and fetch a new
1549 line from that file. */
1550
1551 exch = prevline[bestfile];
1552 prevline[bestfile] = thisline[bestfile];
1553 thisline[bestfile] = exch;
1554
1555 while (1)
1556 {
1557 /* If the file has no more, mark it empty. */
1558
1559 if (feof (streams[bestfile]))
1560 {
1561 thisline[bestfile] = 0;
1562 /* Update the number of files still not empty. */
1563 nleft--;
1564 break;
1565 }
1566 readline (thisline[bestfile], streams[bestfile]);
1567 if (thisline[bestfile]->buffer[0] || !feof (streams[bestfile]))
1568 break;
1569 }
1570 }
1571
1572 finish_index (ostream);
1573
1574 /* Free all storage and close all input streams. */
1575
1576 for (i = 0; i < nfiles; i++)
1577 {
1578 fclose (streams[i]);
1579 free (lb1[i].buffer);
1580 free (lb2[i].buffer);
1581 }
1582 free (file_lossage);
1583 free (lb1);
1584 free (lb2);
1585 free (thisline);
1586 free (prevline);
1587 free (streams);
1588
1589 if (outfile)
1590 fclose (ostream);
1591
1592 return lossage;
1593 }
1594
1595 /* Print error message and exit. */
1597
1598 void
1599 fatal (const char *format, const char *arg)
1600 {
1601 error (format, arg);
1602 xexit (1);
1603 }
1604
1605 /* Print error message. FORMAT is printf control string, ARG is arg for it. */
1606 void
1607 error (const char *format, const char *arg)
1608 {
1609 printf ("%s: ", program_name);
1610 printf (format, arg);
1611 if (format[strlen (format) -1] != '\n')
1612 printf ("\n");
1613 }
1614
1615 void
1616 perror_with_name (const char *name)
1617 {
1618 fprintf (stderr, "%s: ", program_name);
1619 perror (name);
1620 }
1621
1622 void
1623 pfatal_with_name (const char *name)
1624 {
1625 perror_with_name (name);
1626 xexit (1);
1627 }
1628
1629
1630 /* Return a newly-allocated string concatenating S1 and S2. */
1632
1633 char *
1634 concat (char *s1, char *s2)
1635 {
1636 int len1 = strlen (s1), len2 = strlen (s2);
1637 char *result = (char *) xmalloc (len1 + len2 + 1);
1638
1639 strcpy (result, s1);
1640 strcpy (result + len1, s2);
1641 *(result + len1 + len2) = 0;
1642
1643 return result;
1644 }
1645