Home | History | Annotate | Line # | Download | only in sort
msort.c revision 1.18.6.2
      1 /*	$NetBSD: msort.c,v 1.18.6.2 2010/06/29 18:01:11 riz Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Ben Harris and Jaromir Dolecek.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*-
     33  * Copyright (c) 1993
     34  *	The Regents of the University of California.  All rights reserved.
     35  *
     36  * This code is derived from software contributed to Berkeley by
     37  * Peter McIlroy.
     38  *
     39  * Redistribution and use in source and binary forms, with or without
     40  * modification, are permitted provided that the following conditions
     41  * are met:
     42  * 1. Redistributions of source code must retain the above copyright
     43  *    notice, this list of conditions and the following disclaimer.
     44  * 2. Redistributions in binary form must reproduce the above copyright
     45  *    notice, this list of conditions and the following disclaimer in the
     46  *    documentation and/or other materials provided with the distribution.
     47  * 3. Neither the name of the University nor the names of its contributors
     48  *    may be used to endorse or promote products derived from this software
     49  *    without specific prior written permission.
     50  *
     51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     61  * SUCH DAMAGE.
     62  */
     63 
     64 #include "sort.h"
     65 #include "fsort.h"
     66 
     67 #ifndef lint
     68 __RCSID("$NetBSD: msort.c,v 1.18.6.2 2010/06/29 18:01:11 riz Exp $");
     69 __SCCSID("@(#)msort.c	8.1 (Berkeley) 6/6/93");
     70 #endif /* not lint */
     71 
     72 #include <stdlib.h>
     73 #include <string.h>
     74 #include <unistd.h>
     75 #include <util.h>
     76 
     77 /* Subroutines using comparisons: merge sort and check order */
     78 #define DELETE (1)
     79 
     80 typedef struct mfile {
     81 	FILE         *fp;
     82 	get_func_t   get;
     83 	RECHEADER    *rec;
     84 	u_char       *end;
     85 } MFILE;
     86 
     87 static int cmp(RECHEADER *, RECHEADER *);
     88 static int insert(struct mfile **, struct mfile *, int, int);
     89 static void merge_sort_fstack(FILE *, put_func_t, struct field *);
     90 
     91 /*
     92  * Number of files merge() can merge in one pass.
     93  */
     94 #define MERGE_FNUM      16
     95 
     96 static struct mfile fstack[MERGE_FNUM];
     97 static struct mfile fstack_1[MERGE_FNUM];
     98 static struct mfile fstack_2[MERGE_FNUM];
     99 static int fstack_count, fstack_1_count, fstack_2_count;
    100 
    101 void
    102 save_for_merge(FILE *fp, get_func_t get, struct field *ftbl)
    103 {
    104 	FILE *mfp, *mfp1, *mfp2;
    105 
    106 	if (fstack_count == MERGE_FNUM) {
    107 		/* Must reduce the number of temporary files */
    108 		mfp = ftmp();
    109 		merge_sort_fstack(mfp, putrec, ftbl);
    110 		/* Save output in next layer */
    111 		if (fstack_1_count == MERGE_FNUM) {
    112 			mfp1 = ftmp();
    113 			memcpy(fstack, fstack_1, sizeof fstack);
    114 			merge_sort_fstack(mfp1, putrec, ftbl);
    115 			if (fstack_2_count == MERGE_FNUM) {
    116 				/* More than 4096 files! */
    117 				mfp2 = ftmp();
    118 				memcpy(fstack, fstack_2, sizeof fstack);
    119 				merge_sort_fstack(mfp2, putrec, ftbl);
    120 				fstack_2[0].fp = mfp2;
    121 				fstack_2_count = 1;
    122 			}
    123 			fstack_2[fstack_2_count].fp = mfp1;
    124 			fstack_2[fstack_2_count].get = geteasy;
    125 			fstack_2_count++;
    126 			fstack_1_count = 0;
    127 		}
    128 		fstack_1[fstack_1_count].fp = mfp;
    129 		fstack_1[fstack_1_count].get = geteasy;
    130 		fstack_1_count++;
    131 		fstack_count = 0;
    132 	}
    133 
    134 	fstack[fstack_count].fp = fp;
    135 	fstack[fstack_count++].get = get;
    136 }
    137 
    138 void
    139 fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
    140 {
    141 	get_func_t get = SINGL_FLD ? makeline : makekey;
    142 	FILE *fp;
    143 	int i;
    144 
    145 	for (i = 0; i < nfiles; i++) {
    146 		fp = fopen(filelist->names[i], "r");
    147 		if (fp == NULL)
    148 			err(2, "%s", filelist->names[i]);
    149 		save_for_merge(fp, get, ftbl);
    150 	}
    151 
    152 	merge_sort(outfp, putline, ftbl);
    153 }
    154 
    155 void
    156 merge_sort(FILE *outfp, put_func_t put, struct field *ftbl)
    157 {
    158 	int count = fstack_1_count + fstack_2_count;
    159 	FILE *mfp;
    160 	int i;
    161 
    162 	if (count == 0) {
    163 		/* All files in initial array */
    164 		merge_sort_fstack(outfp, put, ftbl);
    165 		return;
    166 	}
    167 
    168 	count += fstack_count;
    169 
    170 	/* Too many files for one merge sort */
    171 	for (;;) {
    172 		/* Sort latest 16 files */
    173 		i = count;
    174 		if (i > MERGE_FNUM)
    175 			i = MERGE_FNUM;
    176 		while (fstack_count > 0)
    177 			fstack[--i] = fstack[--fstack_count];
    178 		while (i > 0 && fstack_1_count > 0)
    179 			fstack[--i] = fstack_1[--fstack_1_count];
    180 		while (i > 0)
    181 			fstack[--i] = fstack_2[--fstack_2_count];
    182 		if (count <= MERGE_FNUM) {
    183 			/* Got all the data */
    184 			fstack_count = count;
    185 			merge_sort_fstack(outfp, put, ftbl);
    186 			return;
    187 		}
    188 		mfp = ftmp();
    189 		fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count;
    190 		merge_sort_fstack(mfp, putrec, ftbl);
    191 		fstack[0].fp = mfp;
    192 		fstack[0].get = geteasy;
    193 		fstack_count = 1;
    194 		count -= MERGE_FNUM - 1;
    195 	}
    196 }
    197 
    198 static void
    199 merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl)
    200 {
    201 	struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
    202 	RECHEADER *new_rec;
    203 	u_char *new_end;
    204 	void *tmp;
    205 	int c, i, nfiles;
    206 	size_t sz;
    207 
    208 	/* Read one record from each file (read again if a duplicate) */
    209 	for (nfiles = i = 0; i < fstack_count; i++) {
    210 		cfile = &fstack[i];
    211 		if (cfile->rec == NULL) {
    212 			cfile->rec = allocrec(NULL, DEFLLEN);
    213 			cfile->end = (u_char *)cfile->rec + DEFLLEN;
    214 		}
    215 		rewind(cfile->fp);
    216 
    217 		for (;;) {
    218 			c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl);
    219 			if (c == EOF)
    220 				break;
    221 
    222 			if (c == BUFFEND) {
    223 				/* Double buffer size */
    224 				sz = (cfile->end - (u_char *)cfile->rec) * 2;
    225 				cfile->rec = allocrec(cfile->rec, sz);
    226 				cfile->end = (u_char *)cfile->rec + sz;
    227 				continue;
    228 			}
    229 
    230 			if (nfiles != 0) {
    231 				if (insert(flist, cfile, nfiles, !DELETE))
    232 					/* Duplicate removed */
    233 					continue;
    234 			} else
    235 				flist[0] = cfile;
    236 			nfiles++;
    237 			break;
    238 		}
    239 	}
    240 
    241 	if (nfiles == 0)
    242 		return;
    243 
    244 	/*
    245 	 * We now loop reading a new record from the file with the
    246 	 * 'sorted first' existing record.
    247 	 * As each record is added, the 'first' record is written to the
    248 	 * output file - maintaining one record from each file in the sorted
    249 	 * list.
    250 	 */
    251 	new_rec = allocrec(NULL, DEFLLEN);
    252 	new_end = (u_char *)new_rec + DEFLLEN;
    253 	for (;;) {
    254 		cfile = flist[0];
    255 		c = cfile->get(cfile->fp, new_rec, new_end, ftbl);
    256 		if (c == EOF) {
    257 			/* Write out last record from now-empty input */
    258 			put(cfile->rec, outfp);
    259 			if (--nfiles == 0)
    260 				break;
    261 			/* Replace from file with now-first sorted record. */
    262 			/* (Moving base 'flist' saves copying everything!) */
    263 			flist++;
    264 			continue;
    265 		}
    266 		if (c == BUFFEND) {
    267 			/* Buffer not large enough - double in size */
    268 			sz = (new_end - (u_char *)new_rec) * 2;
    269 			new_rec = allocrec(new_rec, sz);
    270 			new_end = (u_char *)new_rec +sz;
    271 			continue;
    272 		}
    273 
    274 		/* Swap in new buffer, saving old */
    275 		tmp = cfile->rec;
    276 		cfile->rec = new_rec;
    277 		new_rec = tmp;
    278 		tmp = cfile->end;
    279 		cfile->end = new_end;
    280 		new_end = tmp;
    281 
    282 		/* Add into sort, removing the original first entry */
    283 		c = insert(flist, cfile, nfiles, DELETE);
    284 		if (c != 0 || (UNIQUE && cfile == flist[0]
    285 			    && cmp(new_rec, cfile->rec) == 0)) {
    286 			/* Was an unwanted duplicate, restore buffer */
    287 			tmp = cfile->rec;
    288 			cfile->rec = new_rec;
    289 			new_rec = tmp;
    290 			tmp = cfile->end;
    291 			cfile->end = new_end;
    292 			new_end = tmp;
    293 			continue;
    294 		}
    295 
    296 		/* Write out 'old' record */
    297 		put(new_rec, outfp);
    298 	}
    299 
    300 	free(new_rec);
    301 }
    302 
    303 /*
    304  * if delete: inserts rec in flist, deletes flist[0];
    305  * otherwise just inserts *rec in flist.
    306  * Returns 1 if record is a duplicate to be ignored.
    307  */
    308 static int
    309 insert(struct mfile **flist, struct mfile *rec, int ttop, int delete)
    310 {
    311 	int mid, top = ttop, bot = 0, cmpv = 1;
    312 
    313 	for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
    314 		cmpv = cmp(rec->rec, flist[mid]->rec);
    315 		if (cmpv == 0 ) {
    316 			if (UNIQUE)
    317 				/* Duplicate key, read another record */
    318 				/* NB: This doesn't guarantee to keep any
    319 				 * particular record. */
    320 				return 1;
    321 			/*
    322 			 * Apply sort by input file order.
    323 			 * We could truncate the sort is the fileno are
    324 			 * adjacent - but that is all too hard!
    325 			 * The fileno cannot be equal, since we only have one
    326 			 * record from each file (+ flist[0] which never
    327 			 * comes here).
    328 			 */
    329 			cmpv = rec < flist[mid] ? -1 : 1;
    330 			if (REVERSE)
    331 				cmpv = -cmpv;
    332 		}
    333 		if (cmpv < 0)
    334 			top = mid;
    335 		else
    336 			bot = mid;
    337 	}
    338 
    339 	/* At this point we haven't yet compared against flist[0] */
    340 
    341 	if (delete) {
    342 		/* flist[0] is ourselves, only the caller knows the old data */
    343 		if (bot != 0) {
    344 			memmove(flist, flist + 1, bot * sizeof(MFILE *));
    345 			flist[bot] = rec;
    346 		}
    347 		return 0;
    348 	}
    349 
    350 	/* Inserting original set of records */
    351 
    352 	if (bot == 0 && cmpv != 0) {
    353 		/* Doesn't match flist[1], must compare with flist[0] */
    354 		cmpv = cmp(rec->rec, flist[0]->rec);
    355 		if (cmpv == 0 && UNIQUE)
    356 			return 1;
    357 		/* Add matching keys in file order (ie new is later) */
    358 		if (cmpv < 0)
    359 			bot = -1;
    360 	}
    361 	bot++;
    362 	memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *));
    363 	flist[bot] = rec;
    364 	return 0;
    365 }
    366 
    367 /*
    368  * check order on one file
    369  */
    370 void
    371 order(struct filelist *filelist, struct field *ftbl)
    372 {
    373 	get_func_t get = SINGL_FLD ? makeline : makekey;
    374 	RECHEADER *crec, *prec, *trec;
    375 	u_char *crec_end, *prec_end, *trec_end;
    376 	FILE *fp;
    377 	int c;
    378 
    379 	fp = fopen(filelist->names[0], "r");
    380 	if (fp == NULL)
    381 		err(2, "%s", filelist->names[0]);
    382 
    383 	crec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
    384 	crec_end = crec->data + DEFLLEN;
    385 	prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
    386 	prec_end = prec->data + DEFLLEN;
    387 
    388 	/* XXX this does exit(0) for overlong lines */
    389 	if (get(fp, prec, prec_end, ftbl) != 0)
    390 		exit(0);
    391 	while (get(fp, crec, crec_end, ftbl) == 0) {
    392 		if (0 < (c = cmp(prec, crec))) {
    393 			crec->data[crec->length-1] = 0;
    394 			errx(1, "found disorder: %s", crec->data+crec->offset);
    395 		}
    396 		if (UNIQUE && !c) {
    397 			crec->data[crec->length-1] = 0;
    398 			errx(1, "found non-uniqueness: %s",
    399 			    crec->data+crec->offset);
    400 		}
    401 		/*
    402 		 * Swap pointers so that this record is on place pointed
    403 		 * to by prec and new record is read to place pointed to by
    404 		 * crec.
    405 		 */
    406 		trec = prec;
    407 		prec = crec;
    408 		crec = trec;
    409 		trec_end = prec_end;
    410 		prec_end = crec_end;
    411 		crec_end = trec_end;
    412 	}
    413 	exit(0);
    414 }
    415 
    416 static int
    417 cmp(RECHEADER *rec1, RECHEADER *rec2)
    418 {
    419 	int len;
    420 	int r;
    421 
    422 	/* key is weights */
    423 	len = min(rec1->keylen, rec2->keylen);
    424 	r = memcmp(rec1->data, rec2->data, len);
    425 	if (r == 0)
    426 		r = rec1->keylen - rec2->keylen;
    427 	if (REVERSE)
    428 		r = -r;
    429 	return r;
    430 }
    431