Home | History | Annotate | Line # | Download | only in join
join.c revision 1.10
      1 /*	$NetBSD: join.c,v 1.10 1997/01/13 20:57:48 tls Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 1991 The Regents of the University of California.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to Berkeley by
      8  * Steve Hayman of Indiana University, Michiro Hikida and David
      9  * Goodenough.
     10  *
     11  * Redistribution and use in source and binary forms, with or without
     12  * modification, are permitted provided that the following conditions
     13  * are met:
     14  * 1. Redistributions of source code must retain the above copyright
     15  *    notice, this list of conditions and the following disclaimer.
     16  * 2. Redistributions in binary form must reproduce the above copyright
     17  *    notice, this list of conditions and the following disclaimer in the
     18  *    documentation and/or other materials provided with the distribution.
     19  * 3. All advertising materials mentioning features or use of this software
     20  *    must display the following acknowledgement:
     21  *	This product includes software developed by the University of
     22  *	California, Berkeley and its contributors.
     23  * 4. Neither the name of the University nor the names of its contributors
     24  *    may be used to endorse or promote products derived from this software
     25  *    without specific prior written permission.
     26  *
     27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     37  * SUCH DAMAGE.
     38  */
     39 
     40 #ifndef lint
     41 char copyright[] =
     42 "@(#) Copyright (c) 1991 The Regents of the University of California.\n\
     43  All rights reserved.\n";
     44 #endif /* not lint */
     45 
     46 #ifndef lint
     47 /*static char sccsid[] = "from: @(#)join.c	5.1 (Berkeley) 11/18/91";*/
     48 static char rcsid[] = "$Id: join.c,v 1.10 1997/01/13 20:57:48 tls Exp $";
     49 #endif /* not lint */
     50 
     51 #include <sys/types.h>
     52 #include <stdio.h>
     53 #include <stdlib.h>
     54 #include <string.h>
     55 #include <ctype.h>
     56 #include <errno.h>
     57 
     58 /*
     59  * There's a structure per input file which encapsulates the state of the
     60  * file.  We repeatedly read lines from each file until we've read in all
     61  * the consecutive lines from the file with a common join field.  Then we
     62  * compare the set of lines with an equivalent set from the other file.
     63  */
     64 typedef struct {
     65 	char *line;		/* line */
     66 	u_long linealloc;	/* line allocated count */
     67 	char **fields;		/* line field(s) */
     68 	u_long fieldcnt;	/* line field(s) count */
     69 	u_long fieldalloc;	/* line field(s) allocated count */
     70 } LINE;
     71 
     72 typedef struct {
     73 	FILE *fp;		/* file descriptor */
     74 	u_long joinf;		/* join field (-1, -2, -j) */
     75 	int unpair;		/* output unpairable lines (-a) */
     76 	int number;		/* 1 for file 1, 2 for file 2 */
     77 
     78 	LINE *set;		/* set of lines with same field */
     79 	u_long pushback;	/* line on the stack */
     80 	u_long setcnt;		/* set count */
     81 	u_long setalloc;	/* set allocated count */
     82 } INPUT;
     83 INPUT input1 = { NULL, 0, 0, 1, NULL, -1, 0, 0, },
     84       input2 = { NULL, 0, 0, 1, NULL, -1, 0, 0, };
     85 
     86 typedef struct {
     87 	u_long	fileno;		/* file number */
     88 	u_long	fieldno;	/* field number */
     89 } OLIST;
     90 OLIST *olist;			/* output field list */
     91 u_long olistcnt;		/* output field list count */
     92 u_long olistalloc;		/* output field allocated count */
     93 
     94 int joinout = 1;		/* show lines with matched join fields (-v) */
     95 int needsep;			/* need separator character */
     96 int showusage = 1;		/* show usage for usage err() calls */
     97 int spans = 1;			/* span multiple delimiters (-t) */
     98 char *empty;			/* empty field replacement string (-e) */
     99 char *tabchar = " \t";		/* delimiter characters (-t) */
    100 
    101 int  cmp __P((LINE *, u_long, LINE *, u_long));
    102 void enomem __P((void));
    103 void err __P((const char *, ...));
    104 void fieldarg __P((char *));
    105 void joinlines __P((INPUT *, INPUT *));
    106 void obsolete __P((char **));
    107 void outfield __P((LINE *, u_long));
    108 void outoneline __P((INPUT *, LINE *));
    109 void outtwoline __P((INPUT *, LINE *, INPUT *, LINE *));
    110 void slurp __P((INPUT *));
    111 void usage __P((void));
    112 
    113 int
    114 main(argc, argv)
    115 	int argc;
    116 	char *argv[];
    117 {
    118 	register INPUT *F1, *F2;
    119 	int aflag, ch, cval, vflag;
    120 	char *end;
    121 
    122 	F1 = &input1;
    123 	F2 = &input2;
    124 
    125 	aflag = vflag = 0;
    126 	obsolete(argv);
    127 	while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != EOF) {
    128 		switch (ch) {
    129 		case '\01':
    130 			aflag = 1;
    131 			F1->unpair = F2->unpair = 1;
    132 			break;
    133 		case '1':
    134 			if ((F1->joinf = strtol(optarg, &end, 10)) < 1)
    135 				err("-1 option field number less than 1");
    136 			if (*end)
    137 				err("illegal field number -- %s", optarg);
    138 			--F1->joinf;
    139 			break;
    140 		case '2':
    141 			if ((F2->joinf = strtol(optarg, &end, 10)) < 1)
    142 				err("-2 option field number less than 1");
    143 			if (*end)
    144 				err("illegal field number -- %s", optarg);
    145 			--F2->joinf;
    146 			break;
    147 		case 'a':
    148 			aflag = 1;
    149 			switch(strtol(optarg, &end, 10)) {
    150 			case 1:
    151 				F1->unpair = 1;
    152 				break;
    153 			case 2:
    154 				F2->unpair = 1;
    155 				break;
    156 			default:
    157 				err("-a option file number not 1 or 2");
    158 				break;
    159 			}
    160 			if (*end)
    161 				err("illegal file number -- %s", optarg);
    162 			break;
    163 		case 'e':
    164 			empty = optarg;
    165 			break;
    166 		case 'j':
    167 			if ((F1->joinf = F2->joinf =
    168 			    strtol(optarg, &end, 10)) < 1)
    169 				err("-j option field number less than 1");
    170 			if (*end)
    171 				err("illegal field number -- %s", optarg);
    172 			--F1->joinf;
    173 			--F2->joinf;
    174 			break;
    175 		case 'o':
    176 			fieldarg(optarg);
    177 			break;
    178 		case 't':
    179 			spans = 0;
    180 			if (strlen(tabchar = optarg) != 1)
    181 				err("illegal tab character specification");
    182 			break;
    183 		case 'v':
    184 			vflag = 1;
    185 			joinout = 0;
    186 			switch(strtol(optarg, &end, 10)) {
    187 			case 1:
    188 				F1->unpair = 1;
    189 				break;
    190 			case 2:
    191 				F2->unpair = 1;
    192 				break;
    193 			default:
    194 				err("-v option file number not 1 or 2");
    195 				break;
    196 			}
    197 			if (*end)
    198 				err("illegal file number -- %s", optarg);
    199 			break;
    200 		case '?':
    201 		default:
    202 			usage();
    203 		}
    204 	}
    205 	argc -= optind;
    206 	argv += optind;
    207 
    208 	if (aflag && vflag)
    209 		err("-a and -v options mutually exclusive");
    210 
    211 	if (argc != 2)
    212 		usage();
    213 	showusage = 0;
    214 
    215 	/* Open the files; "-" means stdin. */
    216 	if (!strcmp(*argv, "-"))
    217 		F1->fp = stdin;
    218 	else if ((F1->fp = fopen(*argv, "r")) == NULL)
    219 		err("%s: %s", *argv, strerror(errno));
    220 	++argv;
    221 	if (!strcmp(*argv, "-"))
    222 		F2->fp = stdin;
    223 	else if ((F2->fp = fopen(*argv, "r")) == NULL)
    224 		err("%s: %s", *argv, strerror(errno));
    225 	if (F1->fp == stdin && F2->fp == stdin)
    226 		err("only one input file may be stdin");
    227 
    228 	slurp(F1);
    229 	slurp(F2);
    230 	while (F1->setcnt && F2->setcnt) {
    231 		cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
    232 		if (cval == 0) {
    233 			/* Oh joy, oh rapture, oh beauty divine! */
    234 			if (joinout)
    235 				joinlines(F1, F2);
    236 			slurp(F1);
    237 			slurp(F2);
    238 		} else if (cval < 0) {
    239 			/* File 1 takes the lead... */
    240 			if (F1->unpair)
    241 				joinlines(F1, NULL);
    242 			slurp(F1);
    243 		} else {
    244 			/* File 2 takes the lead... */
    245 			if (F2->unpair)
    246 				joinlines(F2, NULL);
    247 			slurp(F2);
    248 		}
    249 	}
    250 
    251 	/*
    252 	 * Now that one of the files is used up, optionally output any
    253 	 * remaining lines from the other file.
    254 	 */
    255 	if (F1->unpair)
    256 		while (F1->setcnt) {
    257 			joinlines(F1, NULL);
    258 			slurp(F1);
    259 		}
    260 	if (F2->unpair)
    261 		while (F2->setcnt) {
    262 			joinlines(F2, NULL);
    263 			slurp(F2);
    264 		}
    265 	exit(0);
    266 }
    267 
    268 void
    269 slurp(F)
    270 	INPUT *F;
    271 {
    272 	register LINE *lp, *lastlp;
    273 	LINE tmp;
    274 	size_t len;
    275 	int cnt;
    276 	char *bp, *fieldp;
    277 
    278 	/*
    279 	 * Read all of the lines from an input file that have the same
    280 	 * join field.
    281 	 */
    282 	F->setcnt = 0;
    283 	for (lastlp = NULL;; ++F->setcnt, lastlp = lp) {
    284 		/*
    285 		 * If we're out of space to hold line structures, allocate
    286 		 * more.  Initialize the structure so that we know that this
    287 		 * is new space.
    288 		 */
    289 		if (F->setcnt == F->setalloc) {
    290 			cnt = F->setalloc;
    291 			F->setalloc += 100;
    292 			if ((F->set = realloc(F->set,
    293 			    F->setalloc * sizeof(LINE))) == NULL)
    294 				enomem();
    295 			bzero(F->set + cnt, 100 * sizeof(LINE *));
    296 		}
    297 
    298 		/*
    299 		 * Get any pushed back line, else get the next line.  Allocate
    300 		 * space as necessary.  If taking the line from the stack swap
    301 		 * the two structures so that we don't lose the allocated space.
    302 		 * This could be avoided by doing another level of indirection,
    303 		 * but it's probably okay as is.
    304 		 */
    305 		lp = &F->set[F->setcnt];
    306 		if (F->pushback != -1) {
    307 			tmp = F->set[F->setcnt];
    308 			F->set[F->setcnt] = F->set[F->pushback];
    309 			F->set[F->pushback] = tmp;
    310 			F->pushback = -1;
    311 			continue;
    312 		}
    313 		if ((bp = fgetln(F->fp, &len)) == NULL)
    314 			return;
    315 		if (lp->linealloc <= len + 1) {
    316 			if (lp->linealloc == 0)
    317 				lp->linealloc = 128;
    318 			while (lp->linealloc <= len + 1)
    319 				lp->linealloc *= 2;
    320 
    321 			if ((lp->line = realloc(lp->line,
    322 			    lp->linealloc * sizeof(char))) == NULL)
    323 				enomem();
    324 		}
    325 		bcopy(bp, lp->line, len+1);
    326 
    327 		/* Replace trailing newline, if it exists. */
    328 		if (bp[len - 1] == '\n')
    329 			lp->line[len - 1] = '\0';
    330 		else
    331 			lp->line[len] = '\0';
    332 		bp = lp->line;
    333 
    334 		/* Split the line into fields, allocate space as necessary. */
    335 		lp->fieldcnt = 0;
    336 		while ((fieldp = strsep(&bp, tabchar)) != NULL) {
    337 			if (spans && *fieldp == '\0')
    338 				continue;
    339 			if (lp->fieldcnt == lp->fieldalloc) {
    340 				lp->fieldalloc += 100;
    341 				if ((lp->fields = realloc(lp->fields,
    342 				    lp->fieldalloc * sizeof(char *))) == NULL)
    343 					enomem();
    344 			}
    345 			lp->fields[lp->fieldcnt++] = fieldp;
    346 		}
    347 
    348 		/* See if the join field value has changed. */
    349 		if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) {
    350 			F->pushback = F->setcnt;
    351 			break;
    352 		}
    353 	}
    354 }
    355 
    356 int
    357 cmp(lp1, fieldno1, lp2, fieldno2)
    358 	LINE *lp1, *lp2;
    359 	u_long fieldno1, fieldno2;
    360 {
    361 
    362 	if (lp1->fieldcnt <= fieldno1)
    363 		return (lp2->fieldcnt < fieldno2 ? 0 : 1);
    364 	if (lp2->fieldcnt <= fieldno2)
    365 		return (-1);
    366 	return (strcmp(lp1->fields[fieldno1], lp2->fields[fieldno2]));
    367 }
    368 
    369 void
    370 joinlines(F1, F2)
    371 	register INPUT *F1, *F2;
    372 {
    373 	register int cnt1, cnt2;
    374 
    375 	/*
    376 	 * Output the results of a join comparison.  The output may be from
    377 	 * either file 1 or file 2 (in which case the first argument is the
    378 	 * file from which to output) or from both.
    379 	 */
    380 	if (F2 == NULL) {
    381 		for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
    382 			outoneline(F1, &F1->set[cnt1]);
    383 		return;
    384 	}
    385 	for (cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
    386 		for (cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
    387 			outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
    388 }
    389 
    390 void
    391 outoneline(F, lp)
    392 	INPUT *F;
    393 	register LINE *lp;
    394 {
    395 	register int cnt;
    396 
    397 	/*
    398 	 * Output a single line from one of the files, according to the
    399 	 * join rules.  This happens when we are writing unmatched single
    400 	 * lines.  Output empty fields in the right places.
    401 	 */
    402 	if (olist)
    403 		for (cnt = 0; cnt < olistcnt; ++cnt) {
    404 			if (olist[cnt].fileno == F->number)
    405 				outfield(lp, olist[cnt].fieldno);
    406 		}
    407 	else
    408 		for (cnt = 0; cnt < lp->fieldcnt; ++cnt)
    409 			outfield(lp, cnt);
    410 	(void)printf("\n");
    411 	if (ferror(stdout))
    412 		err("stdout: %s", strerror(errno));
    413 	needsep = 0;
    414 }
    415 
    416 void
    417 outtwoline(F1, lp1, F2, lp2)
    418 	register INPUT *F1, *F2;
    419 	register LINE *lp1, *lp2;
    420 {
    421 	register int cnt;
    422 
    423 	/* Output a pair of lines according to the join list (if any). */
    424 	if (olist)
    425 		for (cnt = 0; cnt < olistcnt; ++cnt)
    426 			if (olist[cnt].fileno == 1)
    427 				outfield(lp1, olist[cnt].fieldno);
    428 			else /* if (olist[cnt].fileno == 2) */
    429 				outfield(lp2, olist[cnt].fieldno);
    430 	else {
    431 		/*
    432 		 * Output the join field, then the remaining fields from F1
    433 		 * and F2.
    434 		 */
    435 		outfield(lp1, F1->joinf);
    436 		for (cnt = 0; cnt < lp1->fieldcnt; ++cnt)
    437 			if (F1->joinf != cnt)
    438 				outfield(lp1, cnt);
    439 		for (cnt = 0; cnt < lp2->fieldcnt; ++cnt)
    440 			if (F2->joinf != cnt)
    441 				outfield(lp2, cnt);
    442 	}
    443 	(void)printf("\n");
    444 	if (ferror(stdout))
    445 		err("stdout: %s", strerror(errno));
    446 	needsep = 0;
    447 }
    448 
    449 void
    450 outfield(lp, fieldno)
    451 	LINE *lp;
    452 	u_long fieldno;
    453 {
    454 	if (needsep++)
    455 		(void)printf("%c", *tabchar);
    456 	if (!ferror(stdout))
    457 		if (lp->fieldcnt < fieldno) {
    458 			if (empty != NULL)
    459 				(void)printf("%s", empty);
    460 		} else {
    461 			if (*lp->fields[fieldno] == '\0')
    462 				return;
    463 			(void)printf("%s", lp->fields[fieldno]);
    464 		}
    465 	if (ferror(stdout))
    466 		err("stdout: %s", strerror(errno));
    467 }
    468 
    469 /*
    470  * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
    471  * fields.
    472  */
    473 void
    474 fieldarg(option)
    475 	char *option;
    476 {
    477 	u_long fieldno;
    478 	char *end, *token;
    479 
    480 	while ((token = strsep(&option, " \t")) != NULL) {
    481 		if (*token == '\0')
    482 			continue;
    483 		if (token[0] != '1' && token[0] != '2' || token[1] != '.')
    484 			err("malformed -o option field");
    485 		fieldno = strtol(token + 2, &end, 10);
    486 		if (*end)
    487 			err("malformed -o option field");
    488 		if (fieldno == 0)
    489 			err("field numbers are 1 based");
    490 		if (olistcnt == olistalloc) {
    491 			olistalloc += 50;
    492 			if ((olist = realloc(olist,
    493 			    olistalloc * sizeof(OLIST))) == NULL)
    494 				enomem();
    495 		}
    496 		olist[olistcnt].fileno = token[0] - '0';
    497 		olist[olistcnt].fieldno = fieldno - 1;
    498 		++olistcnt;
    499 	}
    500 }
    501 
    502 void
    503 obsolete(argv)
    504 	char **argv;
    505 {
    506 	int len;
    507 	char **p, *ap, *t;
    508 
    509 	while (ap = *++argv) {
    510 		/* Return if "--". */
    511 		if (ap[0] == '-' && ap[1] == '-')
    512 			return;
    513 		switch (ap[1]) {
    514 		case 'a':
    515 			/*
    516 			 * The original join allowed "-a", which meant the
    517 			 * same as -a1 plus -a2.  POSIX 1003.2, Draft 11.2
    518 			 * only specifies this as "-a 1" and "a -2", so we
    519 			 * have to use another option flag, one that is
    520 			 * unlikely to ever be used or accidentally entered
    521 			 * on the command line.  (Well, we could reallocate
    522 			 * the argv array, but that hardly seems worthwhile.)
    523 			 */
    524 			if (ap[2] == '\0')
    525 				ap[1] = '\01';
    526 			break;
    527 		case 'j':
    528 			/*
    529 			 * The original join allowed "-j[12] arg" and "-j arg".
    530 			 * Convert the former to "-[12] arg".  Don't convert
    531 			 * the latter since getopt(3) can handle it.
    532 			 */
    533 			switch(ap[2]) {
    534 			case '1':
    535 				if (ap[3] != '\0')
    536 					goto jbad;
    537 				ap[1] = '1';
    538 				ap[2] = '\0';
    539 				break;
    540 			case '2':
    541 				if (ap[3] != '\0')
    542 					goto jbad;
    543 				ap[1] = '2';
    544 				ap[2] = '\0';
    545 				break;
    546 			case '\0':
    547 				break;
    548 			default:
    549 jbad:				err("illegal option -- %s", ap);
    550 				usage();
    551 			}
    552 			break;
    553 		case 'o':
    554 			/*
    555 			 * The original join allowed "-o arg arg".  Convert to
    556 			 * "-o arg -o arg".
    557 			 */
    558 			if (ap[2] != '\0')
    559 				break;
    560 			for (p = argv + 2; *p; ++p) {
    561 				if (p[0][0] != '1' && p[0][0] != '2' ||
    562 				    p[0][1] != '.')
    563 					break;
    564 				len = strlen(*p);
    565 				if (len - 2 != strspn(*p + 2, "0123456789"))
    566 					break;
    567 				if ((t = malloc(len + 3)) == NULL)
    568 					enomem();
    569 				t[0] = '-';
    570 				t[1] = 'o';
    571 				bcopy(*p, t + 2, len + 1);
    572 				*p = t;
    573 			}
    574 			argv = p - 1;
    575 			break;
    576 		}
    577 	}
    578 }
    579 
    580 void
    581 enomem()
    582 {
    583 	showusage = 0;
    584 	err("%s", strerror(errno));
    585 }
    586 
    587 void
    588 usage()
    589 {
    590 	(void)fprintf(stderr, "%s%s\n",
    591 	    "usage: join [-a fileno | -v fileno ] [-e string] [-1 field] ",
    592 	    "[-2 field]\n            [-o list] [-t char] file1 file2");
    593 	exit(1);
    594 }
    595 
    596 #if __STDC__
    597 #include <stdarg.h>
    598 #else
    599 #include <varargs.h>
    600 #endif
    601 
    602 void
    603 #if __STDC__
    604 err(const char *fmt, ...)
    605 #else
    606 err(fmt, va_alist)
    607 	char *fmt;
    608         va_dcl
    609 #endif
    610 {
    611 	va_list ap;
    612 #if __STDC__
    613 	va_start(ap, fmt);
    614 #else
    615 	va_start(ap);
    616 #endif
    617 	(void)fprintf(stderr, "join: ");
    618 	(void)vfprintf(stderr, fmt, ap);
    619 	va_end(ap);
    620 	(void)fprintf(stderr, "\n");
    621 	if (showusage)
    622 		usage();
    623 	exit(1);
    624 	/* NOTREACHED */
    625 }
    626