Home | History | Annotate | Line # | Download | only in sort
msort.c revision 1.27
      1  1.27       dsl /*	$NetBSD: msort.c,v 1.27 2009/09/26 21:16:55 dsl Exp $	*/
      2  1.14  jdolecek 
      3  1.14  jdolecek /*-
      4  1.14  jdolecek  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
      5  1.14  jdolecek  * All rights reserved.
      6  1.14  jdolecek  *
      7  1.14  jdolecek  * This code is derived from software contributed to The NetBSD Foundation
      8  1.14  jdolecek  * by Ben Harris and Jaromir Dolecek.
      9  1.14  jdolecek  *
     10  1.14  jdolecek  * Redistribution and use in source and binary forms, with or without
     11  1.14  jdolecek  * modification, are permitted provided that the following conditions
     12  1.14  jdolecek  * are met:
     13  1.14  jdolecek  * 1. Redistributions of source code must retain the above copyright
     14  1.14  jdolecek  *    notice, this list of conditions and the following disclaimer.
     15  1.14  jdolecek  * 2. Redistributions in binary form must reproduce the above copyright
     16  1.14  jdolecek  *    notice, this list of conditions and the following disclaimer in the
     17  1.14  jdolecek  *    documentation and/or other materials provided with the distribution.
     18  1.14  jdolecek  *
     19  1.14  jdolecek  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  1.14  jdolecek  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  1.14  jdolecek  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  1.14  jdolecek  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  1.14  jdolecek  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  1.14  jdolecek  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  1.14  jdolecek  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  1.14  jdolecek  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  1.14  jdolecek  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  1.14  jdolecek  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  1.14  jdolecek  * POSSIBILITY OF SUCH DAMAGE.
     30  1.14  jdolecek  */
     31   1.2     bjh21 
     32   1.1     bjh21 /*-
     33   1.1     bjh21  * Copyright (c) 1993
     34   1.1     bjh21  *	The Regents of the University of California.  All rights reserved.
     35   1.1     bjh21  *
     36   1.1     bjh21  * This code is derived from software contributed to Berkeley by
     37   1.1     bjh21  * Peter McIlroy.
     38   1.1     bjh21  *
     39   1.1     bjh21  * Redistribution and use in source and binary forms, with or without
     40   1.1     bjh21  * modification, are permitted provided that the following conditions
     41   1.1     bjh21  * are met:
     42   1.1     bjh21  * 1. Redistributions of source code must retain the above copyright
     43   1.1     bjh21  *    notice, this list of conditions and the following disclaimer.
     44   1.1     bjh21  * 2. Redistributions in binary form must reproduce the above copyright
     45   1.1     bjh21  *    notice, this list of conditions and the following disclaimer in the
     46   1.1     bjh21  *    documentation and/or other materials provided with the distribution.
     47  1.13       agc  * 3. Neither the name of the University nor the names of its contributors
     48   1.1     bjh21  *    may be used to endorse or promote products derived from this software
     49   1.1     bjh21  *    without specific prior written permission.
     50   1.1     bjh21  *
     51   1.1     bjh21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     52   1.1     bjh21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     53   1.1     bjh21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     54   1.1     bjh21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     55   1.1     bjh21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     56   1.1     bjh21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     57   1.1     bjh21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     58   1.1     bjh21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     59   1.1     bjh21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     60   1.1     bjh21  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     61   1.1     bjh21  * SUCH DAMAGE.
     62   1.1     bjh21  */
     63   1.1     bjh21 
     64   1.2     bjh21 #include "sort.h"
     65   1.2     bjh21 #include "fsort.h"
     66   1.2     bjh21 
     67   1.1     bjh21 #ifndef lint
     68  1.27       dsl __RCSID("$NetBSD: msort.c,v 1.27 2009/09/26 21:16:55 dsl Exp $");
     69   1.2     bjh21 __SCCSID("@(#)msort.c	8.1 (Berkeley) 6/6/93");
     70   1.1     bjh21 #endif /* not lint */
     71   1.1     bjh21 
     72   1.1     bjh21 #include <stdlib.h>
     73   1.1     bjh21 #include <string.h>
     74   1.1     bjh21 #include <unistd.h>
     75  1.27       dsl #include <util.h>
     76   1.1     bjh21 
     77   1.1     bjh21 /* Subroutines using comparisons: merge sort and check order */
     78   1.1     bjh21 #define DELETE (1)
     79   1.8  jdolecek 
     80   1.1     bjh21 typedef struct mfile {
     81  1.27       dsl 	FILE         *fp;
     82  1.27       dsl 	get_func_t   get;
     83  1.27       dsl 	RECHEADER    *rec;
     84  1.27       dsl 	u_char       *end;
     85   1.1     bjh21 } MFILE;
     86   1.8  jdolecek 
     87  1.19       dsl static int cmp(RECHEADER *, RECHEADER *);
     88  1.27       dsl static int insert(struct mfile **, struct mfile *, int, int);
     89  1.27       dsl 
     90  1.27       dsl /*
     91  1.27       dsl  * Number of files merge() can merge in one pass.
     92  1.27       dsl  * This should be power of two so that it's possible to use this value
     93  1.27       dsl  * for rouding.
     94  1.27       dsl  */
     95  1.27       dsl #define MERGE_FNUM      16
     96  1.27       dsl 
     97  1.27       dsl static struct mfile fstack[MERGE_FNUM];
     98  1.27       dsl static int fstack_count;
     99   1.1     bjh21 
    100   1.1     bjh21 void
    101  1.27       dsl save_for_merge(FILE *fp, get_func_t get, struct field *ftbl)
    102   1.1     bjh21 {
    103  1.27       dsl 	FILE *mfp;
    104  1.27       dsl 
    105  1.27       dsl 	if (fstack_count == MERGE_FNUM) {
    106  1.27       dsl 		/* Must reduce the number of temporary files */
    107  1.27       dsl 		mfp = ftmp();
    108  1.27       dsl 		merge_sort(mfp, putrec, ftbl);
    109  1.27       dsl 		fstack[0].fp = mfp;
    110  1.27       dsl 		fstack[0].get = geteasy;
    111  1.27       dsl 		fstack_count = 1;
    112   1.1     bjh21 	}
    113  1.27       dsl 
    114  1.27       dsl 	fstack[fstack_count].fp = fp;
    115  1.27       dsl 	fstack[fstack_count++].get = get;
    116   1.1     bjh21 }
    117   1.1     bjh21 
    118  1.27       dsl void
    119  1.27       dsl fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
    120   1.1     bjh21 {
    121  1.27       dsl 	get_func_t get = SINGL_FLD ? makeline : makekey;
    122  1.27       dsl 	FILE *fp;
    123  1.27       dsl 	int i;
    124  1.27       dsl 
    125  1.27       dsl 	for (i = 0; i < nfiles; i++) {
    126  1.27       dsl 		fp = fopen(filelist->names[i], "r");
    127  1.27       dsl 		if (fp == NULL)
    128  1.27       dsl 			err(2, "%s", filelist->names[i]);
    129  1.27       dsl 		save_for_merge(fp, get, ftbl);
    130  1.27       dsl 	}
    131   1.9  jdolecek 
    132  1.27       dsl 	merge_sort(outfp, putline, ftbl);
    133  1.27       dsl }
    134   1.9  jdolecek 
    135  1.27       dsl void
    136  1.27       dsl merge_sort(FILE *outfp, put_func_t put, struct field *ftbl)
    137  1.27       dsl {
    138  1.27       dsl 	struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
    139  1.27       dsl 	RECHEADER *new_rec;
    140  1.27       dsl 	u_char *new_end;
    141  1.27       dsl 	void *tmp;
    142  1.27       dsl 	int c, i, nfiles;
    143  1.27       dsl 	size_t sz;
    144   1.8  jdolecek 
    145  1.23       dsl 	/* Read one record from each file (read again if a duplicate) */
    146  1.27       dsl 	for (nfiles = i = 0; i < fstack_count; i++) {
    147  1.27       dsl 		cfile = &fstack[i];
    148  1.27       dsl 		if (cfile->rec == NULL) {
    149  1.27       dsl 			cfile->rec = emalloc(DEFLLEN);
    150  1.27       dsl 			cfile->end = (u_char *)cfile->rec + DEFLLEN;
    151  1.27       dsl 		}
    152  1.27       dsl 		rewind(cfile->fp);
    153  1.27       dsl 
    154  1.27       dsl 		for (;;) {
    155  1.27       dsl 			c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl);
    156  1.27       dsl 			if (c == EOF)
    157   1.1     bjh21 				break;
    158   1.9  jdolecek 
    159   1.9  jdolecek 			if (c == BUFFEND) {
    160  1.27       dsl 				/* Double buffer size */
    161  1.27       dsl 				sz = (cfile->end - (u_char *)cfile->rec) * 2;
    162  1.27       dsl 				cfile->rec = erealloc(cfile->rec, sz);
    163  1.27       dsl 				cfile->end = (u_char *)cfile->rec + sz;
    164   1.9  jdolecek 				continue;
    165   1.9  jdolecek 			}
    166   1.9  jdolecek 
    167  1.27       dsl 			if (nfiles != 0) {
    168  1.27       dsl 				if (insert(flist, cfile, nfiles, !DELETE))
    169  1.27       dsl 					/* Duplicate removed */
    170  1.27       dsl 					continue;
    171  1.27       dsl 			} else
    172   1.1     bjh21 				flist[0] = cfile;
    173  1.27       dsl 			nfiles++;
    174  1.27       dsl 			break;
    175   1.1     bjh21 		}
    176   1.1     bjh21 	}
    177   1.9  jdolecek 
    178  1.27       dsl 	if (nfiles == 0)
    179  1.27       dsl 		return;
    180  1.27       dsl 
    181  1.24       dsl 	/*
    182  1.24       dsl 	 * We now loop reading a new record from the file with the
    183  1.24       dsl 	 * 'sorted first' existing record.
    184  1.24       dsl 	 * As each record is added, the 'first' record is written to the
    185  1.24       dsl 	 * output file - maintaining one record from each file in the sorted
    186  1.24       dsl 	 * list.
    187  1.24       dsl 	 */
    188  1.27       dsl 	new_rec = emalloc(DEFLLEN);
    189  1.27       dsl 	new_end = (u_char *)new_rec + DEFLLEN;
    190  1.24       dsl 	for (;;) {
    191  1.27       dsl 		cfile = flist[0];
    192  1.27       dsl 		c = cfile->get(cfile->fp, new_rec, new_end, ftbl);
    193  1.24       dsl 		if (c == EOF) {
    194  1.24       dsl 			/* Write out last record from now-empty input */
    195  1.27       dsl 			put(cfile->rec, outfp);
    196  1.24       dsl 			if (--nfiles == 0)
    197   1.1     bjh21 				break;
    198  1.24       dsl 			/* Replace from file with now-first sorted record. */
    199  1.24       dsl 			/* (Moving base 'flist' saves copying everything!) */
    200  1.24       dsl 			flist++;
    201  1.24       dsl 			continue;
    202  1.24       dsl 		}
    203  1.24       dsl 		if (c == BUFFEND) {
    204  1.24       dsl 			/* Buffer not large enough - double in size */
    205  1.27       dsl 			sz = (new_end - (u_char *)new_rec) * 2;
    206  1.27       dsl 			new_rec = erealloc(new_rec, sz);
    207  1.27       dsl 			new_end = (u_char *)new_rec +sz;
    208  1.24       dsl 			continue;
    209   1.1     bjh21 		}
    210  1.27       dsl 
    211  1.27       dsl 		/* Swap in new buffer, saving old */
    212  1.27       dsl 		tmp = cfile->rec;
    213  1.27       dsl 		cfile->rec = new_rec;
    214  1.27       dsl 		new_rec = tmp;
    215  1.27       dsl 		tmp = cfile->end;
    216  1.27       dsl 		cfile->end = new_end;
    217  1.27       dsl 		new_end = tmp;
    218  1.27       dsl 
    219  1.24       dsl 		/* Add into sort, removing the original first entry */
    220  1.27       dsl 		c = insert(flist, cfile, nfiles, DELETE);
    221  1.27       dsl 		if (c != 0 || (UNIQUE && cfile == flist[0]
    222  1.27       dsl 			    && cmp(new_rec, cfile->rec) == 0)) {
    223  1.27       dsl 			/* Was an unwanted duplicate, restore buffer */
    224  1.27       dsl 			tmp = cfile->rec;
    225  1.27       dsl 			cfile->rec = new_rec;
    226  1.27       dsl 			new_rec = tmp;
    227  1.27       dsl 			tmp = cfile->end;
    228  1.27       dsl 			cfile->end = new_end;
    229  1.27       dsl 			new_end = tmp;
    230  1.27       dsl 			continue;
    231  1.27       dsl 		}
    232  1.24       dsl 
    233  1.27       dsl 		/* Write out 'old' record */
    234  1.27       dsl 		put(new_rec, outfp);
    235  1.24       dsl 	}
    236  1.27       dsl 
    237  1.27       dsl 	free(new_rec);
    238   1.1     bjh21 }
    239   1.1     bjh21 
    240   1.1     bjh21 /*
    241  1.27       dsl  * if delete: inserts rec in flist, deletes flist[0];
    242   1.1     bjh21  * otherwise just inserts *rec in flist.
    243  1.27       dsl  * Returns 1 if record is a duplicate to be ignored.
    244  1.10  jdolecek  */
    245   1.1     bjh21 static int
    246  1.27       dsl insert(struct mfile **flist, struct mfile *rec, int ttop, int delete)
    247   1.1     bjh21 {
    248   1.8  jdolecek 	int mid, top = ttop, bot = 0, cmpv = 1;
    249   1.8  jdolecek 
    250  1.16    itojun 	for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
    251  1.27       dsl 		cmpv = cmp(rec->rec, flist[mid]->rec);
    252  1.24       dsl 		if (cmpv == 0 ) {
    253   1.8  jdolecek 			if (UNIQUE)
    254  1.24       dsl 				/* Duplicate key, read another record */
    255  1.27       dsl 				/* NB: This doesn't guarantee to keep any
    256  1.27       dsl 				 * particular record. */
    257  1.24       dsl 				return 1;
    258  1.23       dsl 			/*
    259  1.27       dsl 			 * Apply sort by input file order.
    260  1.24       dsl 			 * We could truncate the sort is the fileno are
    261  1.24       dsl 			 * adjacent - but that is all too hard!
    262  1.24       dsl 			 * The fileno cannot be equal, since we only have one
    263  1.24       dsl 			 * record from each file (+ flist[0] which never
    264  1.24       dsl 			 * comes here).
    265  1.23       dsl 			 */
    266  1.27       dsl 			cmpv = rec < flist[mid] ? -1 : 1;
    267  1.26       dsl 			if (REVERSE)
    268  1.26       dsl 				cmpv = -cmpv;
    269   1.1     bjh21 		}
    270  1.24       dsl 		if (cmpv < 0)
    271  1.24       dsl 			top = mid;
    272  1.24       dsl 		else
    273  1.24       dsl 			bot = mid;
    274   1.1     bjh21 	}
    275   1.8  jdolecek 
    276  1.24       dsl 	/* At this point we haven't yet compared against flist[0] */
    277  1.24       dsl 
    278   1.1     bjh21 	if (delete) {
    279  1.27       dsl 		/* flist[0] is ourselves, only the caller knows the old data */
    280  1.27       dsl 		if (bot != 0) {
    281  1.27       dsl 			memmove(flist, flist + 1, bot * sizeof(MFILE *));
    282  1.27       dsl 			flist[bot] = rec;
    283  1.27       dsl 		}
    284  1.24       dsl 		return 0;
    285  1.24       dsl 	}
    286  1.24       dsl 
    287  1.24       dsl 	/* Inserting original set of records */
    288  1.24       dsl 
    289  1.24       dsl 	if (bot == 0 && cmpv != 0) {
    290  1.24       dsl 		/* Doesn't match flist[1], must compare with flist[0] */
    291  1.27       dsl 		cmpv = cmp(rec->rec, flist[0]->rec);
    292  1.24       dsl 		if (cmpv == 0 && UNIQUE)
    293  1.24       dsl 			return 1;
    294  1.24       dsl 		/* Add matching keys in file order (ie new is later) */
    295  1.24       dsl 		if (cmpv < 0)
    296  1.24       dsl 			bot = -1;
    297   1.1     bjh21 	}
    298  1.24       dsl 	bot++;
    299  1.27       dsl 	memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *));
    300  1.27       dsl 	flist[bot] = rec;
    301  1.24       dsl 	return 0;
    302   1.1     bjh21 }
    303   1.1     bjh21 
    304   1.1     bjh21 /*
    305   1.1     bjh21  * check order on one file
    306   1.1     bjh21  */
    307   1.1     bjh21 void
    308  1.27       dsl order(struct filelist *filelist, struct field *ftbl)
    309   1.1     bjh21 {
    310  1.27       dsl 	get_func_t get = SINGL_FLD ? makeline : makekey;
    311  1.25       dsl 	RECHEADER *crec, *prec, *trec;
    312   1.6  jdolecek 	u_char *crec_end, *prec_end, *trec_end;
    313  1.27       dsl 	FILE *fp;
    314   1.1     bjh21 	int c;
    315   1.1     bjh21 
    316  1.27       dsl 	fp = fopen(filelist->names[0], "r");
    317  1.27       dsl 	if (fp == NULL)
    318  1.27       dsl 		err(2, "%s", filelist->names[0]);
    319  1.27       dsl 
    320  1.25       dsl 	crec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
    321  1.25       dsl 	crec_end = crec->data + DEFLLEN;
    322  1.25       dsl 	prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
    323  1.25       dsl 	prec_end = prec->data + DEFLLEN;
    324  1.23       dsl 
    325  1.23       dsl 	/* XXX this does exit(0) for overlong lines */
    326  1.27       dsl 	if (get(fp, prec, prec_end, ftbl) != 0)
    327  1.23       dsl 		exit(0);
    328  1.27       dsl 	while (get(fp, crec, crec_end, ftbl) == 0) {
    329   1.1     bjh21 		if (0 < (c = cmp(prec, crec))) {
    330   1.1     bjh21 			crec->data[crec->length-1] = 0;
    331   1.1     bjh21 			errx(1, "found disorder: %s", crec->data+crec->offset);
    332   1.1     bjh21 		}
    333   1.1     bjh21 		if (UNIQUE && !c) {
    334   1.1     bjh21 			crec->data[crec->length-1] = 0;
    335   1.1     bjh21 			errx(1, "found non-uniqueness: %s",
    336   1.1     bjh21 			    crec->data+crec->offset);
    337   1.1     bjh21 		}
    338   1.6  jdolecek 		/*
    339   1.6  jdolecek 		 * Swap pointers so that this record is on place pointed
    340   1.6  jdolecek 		 * to by prec and new record is read to place pointed to by
    341   1.6  jdolecek 		 * crec.
    342   1.6  jdolecek 		 */
    343   1.1     bjh21 		trec = prec;
    344   1.1     bjh21 		prec = crec;
    345   1.1     bjh21 		crec = trec;
    346   1.6  jdolecek 		trec_end = prec_end;
    347   1.6  jdolecek 		prec_end = crec_end;
    348   1.6  jdolecek 		crec_end = trec_end;
    349   1.1     bjh21 	}
    350   1.1     bjh21 	exit(0);
    351   1.1     bjh21 }
    352   1.1     bjh21 
    353   1.1     bjh21 static int
    354  1.19       dsl cmp(RECHEADER *rec1, RECHEADER *rec2)
    355   1.1     bjh21 {
    356  1.26       dsl 	int len;
    357   1.4  jdolecek 	int r;
    358  1.23       dsl 
    359  1.26       dsl 	/* key is weights */
    360  1.26       dsl 	len = min(rec1->keylen, rec2->keylen);
    361  1.26       dsl 	r = memcmp(rec1->data, rec2->data, len);
    362  1.26       dsl 	if (r == 0)
    363  1.26       dsl 		r = rec1->keylen - rec2->keylen;
    364  1.26       dsl 	if (REVERSE)
    365  1.26       dsl 		r = -r;
    366  1.26       dsl 	return r;
    367   1.1     bjh21 }
    368