join.c revision 1.8 1 /* $NetBSD: join.c,v 1.8 1997/01/09 13:29:58 tls Exp $ */
2
3 /*-
4 * Copyright (c) 1991, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Steve Hayman of Indiana University, Michiro Hikida and David
9 * Goodenough.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 */
39
40 #ifndef lint
41 char copyright[] =
42 "@(#) Copyright (c) 1991, 1993, 1994\n\
43 The Regents of the University of California. All rights reserved.\n";
44 #endif /* not lint */
45
46 #ifndef lint
47 /*static char sccsid[] = "from: @(#)join.c 8.6 (Berkeley) 5/4/95";*/
48 static char rcsid[] = "$Id: join.c,v 1.8 1997/01/09 13:29:58 tls Exp $";
49 #endif /* not lint */
50
51 #include <sys/param.h>
52
53 #include <ctype.h>
54 #include <sys/types.h>
55 #include <errno.h>
56 #include <stdio.h>
57 #include <stdlib.h>
58 #include <string.h>
59 #include <unistd.h>
60
61 /*
62 * There's a structure per input file which encapsulates the state of the
63 * file. We repeatedly read lines from each file until we've read in all
64 * the consecutive lines from the file with a common join field. Then we
65 * compare the set of lines with an equivalent set from the other file.
66 */
67 typedef struct {
68 char *line; /* line */
69 u_long linealloc; /* line allocated count */
70 char **fields; /* line field(s) */
71 u_long fieldcnt; /* line field(s) count */
72 u_long fieldalloc; /* line field(s) allocated count */
73 } LINE;
74
75 typedef struct {
76 FILE *fp; /* file descriptor */
77 u_long joinf; /* join field (-1, -2, -j) */
78 int unpair; /* output unpairable lines (-a) */
79 int number; /* 1 for file 1, 2 for file 2 */
80
81 LINE *set; /* set of lines with same field */
82 int pushbool; /* if pushback is set */
83 u_long pushback; /* line on the stack */
84 u_long setcnt; /* set count */
85 u_long setalloc; /* set allocated count */
86 } INPUT;
87 INPUT input1 = { NULL, 0, 0, 1, NULL, -1, 0, 0, 0 },
88 input2 = { NULL, 0, 0, 1, NULL, -1, 0, 0, 0 };
89
90 typedef struct {
91 u_long filenum; /* file number */
92 u_long fieldno; /* field number */
93 } OLIST;
94 OLIST *olist; /* output field list */
95 u_long olistcnt; /* output field list count */
96 u_long olistalloc; /* output field allocated count */
97
98 int joinout = 1; /* show lines with matched join fields (-v) */
99 int needsep; /* need separator character */
100 int spans = 1; /* span multiple delimiters (-t) */
101 char *empty; /* empty field replacement string (-e) */
102 char *tabchar = " \t"; /* delimiter characters (-t) */
103
104 int cmp __P((LINE *, u_long, LINE *, u_long));
105 void fieldarg __P((char *));
106 void joinlines __P((INPUT *, INPUT *));
107 void obsolete __P((char **));
108 void outfield __P((LINE *, u_long, int));
109 void outoneline __P((INPUT *, LINE *));
110 void outtwoline __P((INPUT *, LINE *, INPUT *, LINE *));
111 void slurp __P((INPUT *));
112 void usage __P((void));
113
114 int
115 main(argc, argv)
116 int argc;
117 char *argv[];
118 {
119 INPUT *F1, *F2;
120 int aflag, ch, cval, vflag;
121 char *end;
122
123 F1 = &input1;
124 F2 = &input2;
125
126 aflag = vflag = 0;
127 obsolete(argv);
128 while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != EOF) {
129 switch (ch) {
130 case '\01': /* See comment in obsolete(). */
131 aflag = 1;
132 F1->unpair = F2->unpair = 1;
133 break;
134 case '1':
135 if ((F1->joinf = strtol(optarg, &end, 10)) < 1)
136 errx(1, "-1 option field number less than 1");
137 if (*end)
138 errx(1, "illegal field number -- %s", optarg);
139 --F1->joinf;
140 break;
141 case '2':
142 if ((F2->joinf = strtol(optarg, &end, 10)) < 1)
143 errx(1, "-2 option field number less than 1");
144 if (*end)
145 errx(1, "illegal field number -- %s", optarg);
146 --F2->joinf;
147 break;
148 case 'a':
149 aflag = 1;
150 switch(strtol(optarg, &end, 10)) {
151 case 1:
152 F1->unpair = 1;
153 break;
154 case 2:
155 F2->unpair = 1;
156 break;
157 default:
158 errx(1, "-a option file number not 1 or 2");
159 break;
160 }
161 if (*end)
162 errx(1, "illegal file number -- %s", optarg);
163 break;
164 case 'e':
165 empty = optarg;
166 break;
167 case 'j':
168 if ((F1->joinf = F2->joinf =
169 strtol(optarg, &end, 10)) < 1)
170 errx(1, "-j option field number less than 1");
171 if (*end)
172 errx(1, "illegal field number -- %s", optarg);
173 --F1->joinf;
174 --F2->joinf;
175 break;
176 case 'o':
177 fieldarg(optarg);
178 break;
179 case 't':
180 spans = 0;
181 if (strlen(tabchar = optarg) != 1)
182 errx(1, "illegal tab character specification");
183 break;
184 case 'v':
185 vflag = 1;
186 joinout = 0;
187 switch (strtol(optarg, &end, 10)) {
188 case 1:
189 F1->unpair = 1;
190 break;
191 case 2:
192 F2->unpair = 1;
193 break;
194 default:
195 errx(1, "-v option file number not 1 or 2");
196 break;
197 }
198 if (*end)
199 errx(1, "illegal file number -- %s", optarg);
200 break;
201 case '?':
202 default:
203 usage();
204 }
205 }
206 argc -= optind;
207 argv += optind;
208
209 if (aflag && vflag)
210 errx(1, "the -a and -v options are mutually exclusive");
211
212 if (argc != 2)
213 usage();
214
215 /* Open the files; "-" means stdin. */
216 if (!strcmp(*argv, "-"))
217 F1->fp = stdin;
218 else if ((F1->fp = fopen(*argv, "r")) == NULL)
219 err(1, "%s", *argv);
220 ++argv;
221 if (!strcmp(*argv, "-"))
222 F2->fp = stdin;
223 else if ((F2->fp = fopen(*argv, "r")) == NULL)
224 err(1, "%s", *argv);
225 if (F1->fp == stdin && F2->fp == stdin)
226 errx(1, "only one input file may be stdin");
227
228 slurp(F1);
229 slurp(F2);
230 while (F1->setcnt && F2->setcnt) {
231 cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
232 if (cval == 0) {
233 /* Oh joy, oh rapture, oh beauty divine! */
234 if (joinout)
235 joinlines(F1, F2);
236 slurp(F1);
237 slurp(F2);
238 } else if (cval < 0) {
239 /* File 1 takes the lead... */
240 if (F1->unpair)
241 joinlines(F1, NULL);
242 slurp(F1);
243 } else {
244 /* File 2 takes the lead... */
245 if (F2->unpair)
246 joinlines(F2, NULL);
247 slurp(F2);
248 }
249 }
250
251 /*
252 * Now that one of the files is used up, optionally output any
253 * remaining lines from the other file.
254 */
255 if (F1->unpair)
256 while (F1->setcnt) {
257 joinlines(F1, NULL);
258 slurp(F1);
259 }
260 if (F2->unpair)
261 while (F2->setcnt) {
262 joinlines(F2, NULL);
263 slurp(F2);
264 }
265 exit(0);
266 }
267
268 void
269 slurp(F)
270 INPUT *F;
271 {
272 LINE *lp, *lastlp, tmp;
273 size_t len;
274 int cnt;
275 char *bp, *fieldp;
276
277 /*
278 * Read all of the lines from an input file that have the same
279 * join field.
280 */
281 F->setcnt = 0;
282 for (lastlp = NULL;; ++F->setcnt, lastlp = lp) {
283 /*
284 * If we're out of space to hold line structures, allocate
285 * more. Initialize the structure so that we know that this
286 * is new space.
287 */
288 if (F->setcnt == F->setalloc) {
289 cnt = F->setalloc;
290 F->setalloc += 50;
291 if ((F->set = realloc(F->set,
292 F->setalloc * sizeof(LINE))) == NULL)
293 err(1, NULL);
294 memset(F->set + cnt, 0, 50 * sizeof(LINE));
295
296 /* re-set lastlp in case it moved */
297 if (lastlp != NULL)
298 lastlp = &F->set[F->setcnt - 1];
299 }
300
301 /*
302 * Get any pushed back line, else get the next line. Allocate
303 * space as necessary. If taking the line from the stack swap
304 * the two structures so that we don't lose space allocated to
305 * either structure. This could be avoided by doing another
306 * indirection, but it's probably okay as is.
307 */
308 lp = &F->set[F->setcnt];
309 if (F->pushbool) {
310 tmp = F->set[F->setcnt];
311 F->set[F->setcnt] = F->set[F->pushback];
312 F->set[F->pushback] = tmp;
313 F->pushbool = 0;
314 continue;
315 }
316 if ((bp = fgetln(F->fp, &len)) == NULL)
317 return;
318 if (lp->linealloc <= len + 1) {
319 if (lp->linealloc == 0)
320 lp->linealloc = 128;
321 while (lp->linealloc <= len + 1)
322 lp->linealloc *= 2;
323
324 if ((lp->line = realloc(lp->line,
325 lp->linealloc * sizeof(char))) == NULL)
326 err(1, NULL);
327 }
328 memmove(lp->line, bp, len+1);
329
330 /* Replace trailing newline, if it exists. */
331 if (bp[len - 1] == '\n')
332 lp->line[len - 1] = '\0';
333 else
334 lp->line[len] = '\0';
335 bp = lp->line;
336
337 /* Split the line into fields, allocate space as necessary. */
338 lp->fieldcnt = 0;
339 while ((fieldp = strsep(&bp, tabchar)) != NULL) {
340 if (spans && *fieldp == '\0')
341 continue;
342 if (lp->fieldcnt == lp->fieldalloc) {
343 lp->fieldalloc += 50;
344 if ((lp->fields = realloc(lp->fields,
345 lp->fieldalloc * sizeof(char *))) == NULL)
346 err(1, NULL);
347 }
348 lp->fields[lp->fieldcnt++] = fieldp;
349 }
350
351 /* See if the join field value has changed. */
352 if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) {
353 F->pushback = F->setcnt;
354 break;
355 }
356 }
357 }
358
359 int
360 cmp(lp1, fieldno1, lp2, fieldno2)
361 LINE *lp1, *lp2;
362 u_long fieldno1, fieldno2;
363 {
364
365 if (lp1->fieldcnt <= fieldno1)
366 return (lp2->fieldcnt < fieldno2 ? 0 : 1);
367 if (lp2->fieldcnt <= fieldno2)
368 return (-1);
369 return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2]));
370 }
371
372 void
373 joinlines(F1, F2)
374 INPUT *F1, *F2;
375 {
376 int cnt1, cnt2;
377
378 /*
379 * Output the results of a join comparison. The output may be from
380 * either file 1 or file 2 (in which case the first argument is the
381 * file from which to output) or from both.
382 */
383 if (F2 == NULL) {
384 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
385 outoneline(F1, &F1->set[cnt1]);
386 return;
387 }
388 for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
389 for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
390 outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
391 }
392
393 void
394 outoneline(F, lp)
395 INPUT *F;
396 LINE *lp;
397 {
398 int cnt;
399
400 /*
401 * Output a single line from one of the files, according to the
402 * join rules. This happens when we are writing unmatched single
403 * lines. Output empty fields in the right places.
404 */
405 if (olist)
406 for (cnt = 0; cnt < olistcnt; ++cnt) {
407 if (olist[cnt].filenum == F->number)
408 outfield(lp, olist[cnt].fieldno, 0);
409 else
410 outfield(lp, 0, 1);
411 }
412 else
413 for (cnt = 0; cnt < lp->fieldcnt; ++cnt)
414 outfield(lp, cnt, 0);
415 (void)printf("\n");
416 if (ferror(stdout))
417 err(1, "stdout");
418 needsep = 0;
419 }
420
421 void
422 outtwoline(F1, lp1, F2, lp2)
423 INPUT *F1, *F2;
424 LINE *lp1, *lp2;
425 {
426 int cnt;
427
428 /* Output a pair of lines according to the join list (if any). */
429 if (olist)
430 for (cnt = 0; cnt < olistcnt; ++cnt)
431 if (olist[cnt].filenum == 1)
432 outfield(lp1, olist[cnt].fieldno, 0);
433 else /* if (olist[cnt].filenum == 2) */
434 outfield(lp2, olist[cnt].fieldno, 0);
435 else {
436 /*
437 * Output the join field, then the remaining fields from F1
438 * and F2.
439 */
440 outfield(lp1, F1->joinf, 0);
441 for (cnt = 0; cnt < lp1->fieldcnt; ++cnt)
442 if (F1->joinf != cnt)
443 outfield(lp1, cnt, 0);
444 for (cnt = 0; cnt < lp2->fieldcnt; ++cnt)
445 if (F2->joinf != cnt)
446 outfield(lp2, cnt, 0);
447 }
448 (void)printf("\n");
449 if (ferror(stdout))
450 err(1, "stdout");
451 needsep = 0;
452 }
453
454 void
455 outfield(lp, fieldno, out_empty)
456 LINE *lp;
457 u_long fieldno;
458 int out_empty;
459 {
460 if (needsep++)
461 (void)printf("%c", *tabchar);
462 if (!ferror(stdout))
463 if (lp->fieldcnt < fieldno || out_empty) {
464 if (empty != NULL)
465 (void)printf("%s", empty);
466 } else {
467 if (*lp->fields[fieldno] == '\0')
468 return;
469 (void)printf("%s", lp->fields[fieldno]);
470 }
471 if (ferror(stdout))
472 err(1, "stdout");
473 }
474
475 /*
476 * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
477 * fields.
478 */
479 void
480 fieldarg(option)
481 char *option;
482 {
483 u_long fieldno;
484 char *end, *token;
485
486 while ((token = strsep(&option, ", \t")) != NULL) {
487 if (*token == '\0')
488 continue;
489 if (token[0] != '1' && token[0] != '2' || token[1] != '.')
490 errx(1, "malformed -o option field");
491 fieldno = strtol(token + 2, &end, 10);
492 if (*end)
493 errx(1, "malformed -o option field");
494 if (fieldno == 0)
495 errx(1, "field numbers are 1 based");
496 if (olistcnt == olistalloc) {
497 olistalloc += 50;
498 if ((olist = realloc(olist,
499 olistalloc * sizeof(OLIST))) == NULL)
500 err(1, NULL);
501 }
502 olist[olistcnt].filenum = token[0] - '0';
503 olist[olistcnt].fieldno = fieldno - 1;
504 ++olistcnt;
505 }
506 }
507
508 void
509 obsolete(argv)
510 char **argv;
511 {
512 int len;
513 char **p, *ap, *t;
514
515 while ((ap = *++argv) != NULL) {
516 /* Return if "--". */
517 if (ap[0] == '-' && ap[1] == '-')
518 return;
519 switch (ap[1]) {
520 case 'a':
521 /*
522 * The original join allowed "-a", which meant the
523 * same as -a1 plus -a2. POSIX 1003.2, Draft 11.2
524 * only specifies this as "-a 1" and "a -2", so we
525 * have to use another option flag, one that is
526 * unlikely to ever be used or accidentally entered
527 * on the command line. (Well, we could reallocate
528 * the argv array, but that hardly seems worthwhile.)
529 */
530 if (ap[2] == '\0')
531 ap[1] = '\01';
532 break;
533 case 'j':
534 /*
535 * The original join allowed "-j[12] arg" and "-j arg".
536 * Convert the former to "-[12] arg". Don't convert
537 * the latter since getopt(3) can handle it.
538 */
539 switch(ap[2]) {
540 case '1':
541 if (ap[3] != '\0')
542 goto jbad;
543 ap[1] = '1';
544 ap[2] = '\0';
545 break;
546 case '2':
547 if (ap[3] != '\0')
548 goto jbad;
549 ap[1] = '2';
550 ap[2] = '\0';
551 break;
552 case '\0':
553 break;
554 default:
555 jbad: errx(1, "illegal option -- %s", ap);
556 usage();
557 }
558 break;
559 case 'o':
560 /*
561 * The original join allowed "-o arg arg".
562 * Convert to "-o arg -o arg".
563 */
564 if (ap[2] != '\0')
565 break;
566 for (p = argv + 2; *p; ++p) {
567 if (p[0][0] != '1' &&
568 p[0][0] != '2' || p[0][1] != '.')
569 break;
570 len = strlen(*p);
571 if (len - 2 != strspn(*p + 2, "0123456789"))
572 break;
573 if ((t = malloc(len + 3)) == NULL)
574 err(1, NULL);
575 t[0] = '-';
576 t[1] = 'o';
577 memmove(t + 2, *p, len + 1);
578 *p = t;
579 }
580 argv = p - 1;
581 break;
582 }
583 }
584 }
585
586 void
587 usage()
588 {
589 (void)fprintf(stderr, "%s%s\n",
590 "usage: join [-a fileno | -v fileno ] [-e string] [-1 field] ",
591 "[-2 field]\n [-o list] [-t char] file1 file2");
592 exit(1);
593 }
594