Home | History | Annotate | Line # | Download | only in sort
      1 /*	$NetBSD: msort.c,v 1.31 2016/06/01 02:37:55 kre Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Ben Harris and Jaromir Dolecek.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*-
     33  * Copyright (c) 1993
     34  *	The Regents of the University of California.  All rights reserved.
     35  *
     36  * This code is derived from software contributed to Berkeley by
     37  * Peter McIlroy.
     38  *
     39  * Redistribution and use in source and binary forms, with or without
     40  * modification, are permitted provided that the following conditions
     41  * are met:
     42  * 1. Redistributions of source code must retain the above copyright
     43  *    notice, this list of conditions and the following disclaimer.
     44  * 2. Redistributions in binary form must reproduce the above copyright
     45  *    notice, this list of conditions and the following disclaimer in the
     46  *    documentation and/or other materials provided with the distribution.
     47  * 3. Neither the name of the University nor the names of its contributors
     48  *    may be used to endorse or promote products derived from this software
     49  *    without specific prior written permission.
     50  *
     51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     61  * SUCH DAMAGE.
     62  */
     63 
     64 #include "sort.h"
     65 #include "fsort.h"
     66 
     67 __RCSID("$NetBSD: msort.c,v 1.31 2016/06/01 02:37:55 kre Exp $");
     68 
     69 #include <stdlib.h>
     70 #include <string.h>
     71 #include <unistd.h>
     72 #include <util.h>
     73 
     74 /* Subroutines using comparisons: merge sort and check order */
     75 #define DELETE (1)
     76 
     77 typedef struct mfile {
     78 	FILE         *fp;
     79 	get_func_t   get;
     80 	RECHEADER    *rec;
     81 	u_char       *end;
     82 } MFILE;
     83 
     84 static int cmp(RECHEADER *, RECHEADER *);
     85 static int insert(struct mfile **, struct mfile *, int, int);
     86 static void merge_sort_fstack(FILE *, put_func_t, struct field *);
     87 
     88 /*
     89  * Number of files merge() can merge in one pass.
     90  */
     91 #define MERGE_FNUM      16
     92 
     93 static struct mfile fstack[MERGE_FNUM];
     94 static struct mfile fstack_1[MERGE_FNUM];
     95 static struct mfile fstack_2[MERGE_FNUM];
     96 static int fstack_count, fstack_1_count, fstack_2_count;
     97 
     98 void
     99 save_for_merge(FILE *fp, get_func_t get, struct field *ftbl)
    100 {
    101 	FILE *mfp, *mfp1, *mfp2;
    102 
    103 	if (fstack_count == MERGE_FNUM) {
    104 		/* Must reduce the number of temporary files */
    105 		mfp = ftmp();
    106 		merge_sort_fstack(mfp, putrec, ftbl);
    107 		/* Save output in next layer */
    108 		if (fstack_1_count == MERGE_FNUM) {
    109 			mfp1 = ftmp();
    110 			memcpy(fstack, fstack_1, sizeof fstack);
    111 			merge_sort_fstack(mfp1, putrec, ftbl);
    112 			if (fstack_2_count == MERGE_FNUM) {
    113 				/* More than 4096 files! */
    114 				mfp2 = ftmp();
    115 				memcpy(fstack, fstack_2, sizeof fstack);
    116 				merge_sort_fstack(mfp2, putrec, ftbl);
    117 				fstack_2[0].fp = mfp2;
    118 				fstack_2_count = 1;
    119 			}
    120 			fstack_2[fstack_2_count].fp = mfp1;
    121 			fstack_2[fstack_2_count].get = geteasy;
    122 			fstack_2_count++;
    123 			fstack_1_count = 0;
    124 		}
    125 		fstack_1[fstack_1_count].fp = mfp;
    126 		fstack_1[fstack_1_count].get = geteasy;
    127 		fstack_1_count++;
    128 		fstack_count = 0;
    129 	}
    130 
    131 	fstack[fstack_count].fp = fp;
    132 	fstack[fstack_count++].get = get;
    133 }
    134 
    135 void
    136 fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
    137 {
    138 	get_func_t get = SINGL_FLD ? makeline : makekey;
    139 	FILE *fp;
    140 	int i;
    141 
    142 	for (i = 0; i < nfiles; i++) {
    143 		fp = fopen(filelist->names[i], "r");
    144 		if (fp == NULL)
    145 			err(2, "%s", filelist->names[i]);
    146 		save_for_merge(fp, get, ftbl);
    147 	}
    148 
    149 	merge_sort(outfp, putline, ftbl);
    150 }
    151 
    152 void
    153 merge_sort(FILE *outfp, put_func_t put, struct field *ftbl)
    154 {
    155 	int count = fstack_1_count + fstack_2_count;
    156 	FILE *mfp;
    157 	int i;
    158 
    159 	if (count == 0) {
    160 		/* All files in initial array */
    161 		merge_sort_fstack(outfp, put, ftbl);
    162 		return;
    163 	}
    164 
    165 	count += fstack_count;
    166 
    167 	/* Too many files for one merge sort */
    168 	for (;;) {
    169 		/* Sort latest 16 files */
    170 		i = count;
    171 		if (i > MERGE_FNUM)
    172 			i = MERGE_FNUM;
    173 		while (fstack_count > 0)
    174 			fstack[--i] = fstack[--fstack_count];
    175 		while (i > 0 && fstack_1_count > 0)
    176 			fstack[--i] = fstack_1[--fstack_1_count];
    177 		while (i > 0)
    178 			fstack[--i] = fstack_2[--fstack_2_count];
    179 		if (count <= MERGE_FNUM) {
    180 			/* Got all the data */
    181 			fstack_count = count;
    182 			merge_sort_fstack(outfp, put, ftbl);
    183 			return;
    184 		}
    185 		mfp = ftmp();
    186 		fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count;
    187 		merge_sort_fstack(mfp, putrec, ftbl);
    188 		fstack[0].fp = mfp;
    189 		fstack[0].get = geteasy;
    190 		fstack_count = 1;
    191 		count -= MERGE_FNUM - 1;
    192 	}
    193 }
    194 
    195 static void
    196 merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl)
    197 {
    198 	struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
    199 	RECHEADER *new_rec;
    200 	u_char *new_end;
    201 	void *tmp;
    202 	int c, i, nfiles;
    203 	size_t sz;
    204 
    205 	/* Read one record from each file (read again if a duplicate) */
    206 	for (nfiles = i = 0; i < fstack_count; i++) {
    207 		cfile = &fstack[i];
    208 		if (cfile->rec == NULL) {
    209 			cfile->rec = allocrec(NULL, DEFLLEN);
    210 			cfile->end = (u_char *)cfile->rec + DEFLLEN;
    211 		}
    212 		rewind(cfile->fp);
    213 
    214 		for (;;) {
    215 			c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl);
    216 			if (c == EOF)
    217 				break;
    218 
    219 			if (c == BUFFEND) {
    220 				/* Double buffer size */
    221 				sz = (cfile->end - (u_char *)cfile->rec) * 2;
    222 				cfile->rec = allocrec(cfile->rec, sz);
    223 				cfile->end = (u_char *)cfile->rec + sz;
    224 				continue;
    225 			}
    226 
    227 			if (nfiles != 0) {
    228 				if (insert(flist, cfile, nfiles, !DELETE))
    229 					/* Duplicate removed */
    230 					continue;
    231 			} else
    232 				flist[0] = cfile;
    233 			nfiles++;
    234 			break;
    235 		}
    236 	}
    237 
    238 	if (nfiles == 0)
    239 		return;
    240 
    241 	/*
    242 	 * We now loop reading a new record from the file with the
    243 	 * 'sorted first' existing record.
    244 	 * As each record is added, the 'first' record is written to the
    245 	 * output file - maintaining one record from each file in the sorted
    246 	 * list.
    247 	 */
    248 	new_rec = allocrec(NULL, DEFLLEN);
    249 	new_end = (u_char *)new_rec + DEFLLEN;
    250 	for (;;) {
    251 		cfile = flist[0];
    252 		c = cfile->get(cfile->fp, new_rec, new_end, ftbl);
    253 		if (c == EOF) {
    254 			/* Write out last record from now-empty input */
    255 			put(cfile->rec, outfp);
    256 			if (--nfiles == 0)
    257 				break;
    258 			/* Replace from file with now-first sorted record. */
    259 			/* (Moving base 'flist' saves copying everything!) */
    260 			flist++;
    261 			continue;
    262 		}
    263 		if (c == BUFFEND) {
    264 			/* Buffer not large enough - double in size */
    265 			sz = (new_end - (u_char *)new_rec) * 2;
    266 			new_rec = allocrec(new_rec, sz);
    267 			new_end = (u_char *)new_rec +sz;
    268 			continue;
    269 		}
    270 
    271 		/* Swap in new buffer, saving old */
    272 		tmp = cfile->rec;
    273 		cfile->rec = new_rec;
    274 		new_rec = tmp;
    275 		tmp = cfile->end;
    276 		cfile->end = new_end;
    277 		new_end = tmp;
    278 
    279 		/* Add into sort, removing the original first entry */
    280 		c = insert(flist, cfile, nfiles, DELETE);
    281 		if (c != 0 || (UNIQUE && cfile == flist[0]
    282 			    && cmp(new_rec, cfile->rec) == 0)) {
    283 			/* Was an unwanted duplicate, restore buffer */
    284 			tmp = cfile->rec;
    285 			cfile->rec = new_rec;
    286 			new_rec = tmp;
    287 			tmp = cfile->end;
    288 			cfile->end = new_end;
    289 			new_end = tmp;
    290 			continue;
    291 		}
    292 
    293 		/* Write out 'old' record */
    294 		put(new_rec, outfp);
    295 	}
    296 
    297 	free(new_rec);
    298 }
    299 
    300 /*
    301  * if delete: inserts rec in flist, deletes flist[0];
    302  * otherwise just inserts *rec in flist.
    303  * Returns 1 if record is a duplicate to be ignored.
    304  */
    305 static int
    306 insert(struct mfile **flist, struct mfile *rec, int ttop, int delete)
    307 {
    308 	int mid, top = ttop, bot = 0, cmpv = 1;
    309 
    310 	for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
    311 		cmpv = cmp(rec->rec, flist[mid]->rec);
    312 		if (cmpv == 0 ) {
    313 			if (UNIQUE)
    314 				/* Duplicate key, read another record */
    315 				/* NB: This doesn't guarantee to keep any
    316 				 * particular record. */
    317 				return 1;
    318 			/*
    319 			 * Apply sort by input file order.
    320 			 * We could truncate the sort is the fileno are
    321 			 * adjacent - but that is all too hard!
    322 			 * The fileno cannot be equal, since we only have one
    323 			 * record from each file (+ flist[0] which never
    324 			 * comes here).
    325 			 */
    326 			cmpv = rec < flist[mid] ? -1 : 1;
    327 			if (REVERSE)
    328 				cmpv = -cmpv;
    329 		}
    330 		if (cmpv < 0)
    331 			top = mid;
    332 		else
    333 			bot = mid;
    334 	}
    335 
    336 	/* At this point we haven't yet compared against flist[0] */
    337 
    338 	if (delete) {
    339 		/* flist[0] is ourselves, only the caller knows the old data */
    340 		if (bot != 0) {
    341 			memmove(flist, flist + 1, bot * sizeof(MFILE *));
    342 			flist[bot] = rec;
    343 		}
    344 		return 0;
    345 	}
    346 
    347 	/* Inserting original set of records */
    348 
    349 	if (bot == 0 && cmpv != 0) {
    350 		/* Doesn't match flist[1], must compare with flist[0] */
    351 		cmpv = cmp(rec->rec, flist[0]->rec);
    352 		if (cmpv == 0 && UNIQUE)
    353 			return 1;
    354 		/* Add matching keys in file order (ie new is later) */
    355 		if (cmpv < 0)
    356 			bot = -1;
    357 	}
    358 	bot++;
    359 	memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *));
    360 	flist[bot] = rec;
    361 	return 0;
    362 }
    363 
    364 /*
    365  * check order on one file
    366  */
    367 void
    368 order(struct filelist *filelist, struct field *ftbl, int quiet)
    369 {
    370 	get_func_t get = SINGL_FLD ? makeline : makekey;
    371 	RECHEADER *crec, *prec, *trec;
    372 	u_char *crec_end, *prec_end, *trec_end;
    373 	FILE *fp;
    374 	int c;
    375 
    376 	fp = fopen(filelist->names[0], "r");
    377 	if (fp == NULL)
    378 		err(2, "%s", filelist->names[0]);
    379 
    380 	crec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
    381 	crec_end = crec->data + DEFLLEN;
    382 	prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
    383 	prec_end = prec->data + DEFLLEN;
    384 
    385 	/* XXX this does exit(0) for overlong lines */
    386 	if (get(fp, prec, prec_end, ftbl) != 0)
    387 		exit(0);
    388 	while (get(fp, crec, crec_end, ftbl) == 0) {
    389 		if (0 < (c = cmp(prec, crec))) {
    390 			if (quiet)
    391 				exit(1);
    392 			crec->data[crec->length-1] = 0;
    393 			errx(1, "found disorder: %s", crec->data+crec->offset);
    394 		}
    395 		if (UNIQUE && !c) {
    396 			if (quiet)
    397 				exit(1);
    398 			crec->data[crec->length-1] = 0;
    399 			errx(1, "found non-uniqueness: %s",
    400 			    crec->data+crec->offset);
    401 		}
    402 		/*
    403 		 * Swap pointers so that this record is on place pointed
    404 		 * to by prec and new record is read to place pointed to by
    405 		 * crec.
    406 		 */
    407 		trec = prec;
    408 		prec = crec;
    409 		crec = trec;
    410 		trec_end = prec_end;
    411 		prec_end = crec_end;
    412 		crec_end = trec_end;
    413 	}
    414 	exit(0);
    415 }
    416 
    417 static int
    418 cmp(RECHEADER *rec1, RECHEADER *rec2)
    419 {
    420 	int len;
    421 	int r;
    422 
    423 	/* key is weights */
    424 	len = min(rec1->keylen, rec2->keylen);
    425 	r = memcmp(rec1->data, rec2->data, len);
    426 	if (r == 0)
    427 		r = rec1->keylen - rec2->keylen;
    428 	if (REVERSE)
    429 		r = -r;
    430 	return r;
    431 }
    432