msort.c revision 1.18.6.2 1 /* $NetBSD: msort.c,v 1.18.6.2 2010/06/29 18:01:11 riz Exp $ */
2
3 /*-
4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Ben Harris and Jaromir Dolecek.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*-
33 * Copyright (c) 1993
34 * The Regents of the University of California. All rights reserved.
35 *
36 * This code is derived from software contributed to Berkeley by
37 * Peter McIlroy.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 */
63
64 #include "sort.h"
65 #include "fsort.h"
66
67 #ifndef lint
68 __RCSID("$NetBSD: msort.c,v 1.18.6.2 2010/06/29 18:01:11 riz Exp $");
69 __SCCSID("@(#)msort.c 8.1 (Berkeley) 6/6/93");
70 #endif /* not lint */
71
72 #include <stdlib.h>
73 #include <string.h>
74 #include <unistd.h>
75 #include <util.h>
76
77 /* Subroutines using comparisons: merge sort and check order */
78 #define DELETE (1)
79
80 typedef struct mfile {
81 FILE *fp;
82 get_func_t get;
83 RECHEADER *rec;
84 u_char *end;
85 } MFILE;
86
87 static int cmp(RECHEADER *, RECHEADER *);
88 static int insert(struct mfile **, struct mfile *, int, int);
89 static void merge_sort_fstack(FILE *, put_func_t, struct field *);
90
91 /*
92 * Number of files merge() can merge in one pass.
93 */
94 #define MERGE_FNUM 16
95
96 static struct mfile fstack[MERGE_FNUM];
97 static struct mfile fstack_1[MERGE_FNUM];
98 static struct mfile fstack_2[MERGE_FNUM];
99 static int fstack_count, fstack_1_count, fstack_2_count;
100
101 void
102 save_for_merge(FILE *fp, get_func_t get, struct field *ftbl)
103 {
104 FILE *mfp, *mfp1, *mfp2;
105
106 if (fstack_count == MERGE_FNUM) {
107 /* Must reduce the number of temporary files */
108 mfp = ftmp();
109 merge_sort_fstack(mfp, putrec, ftbl);
110 /* Save output in next layer */
111 if (fstack_1_count == MERGE_FNUM) {
112 mfp1 = ftmp();
113 memcpy(fstack, fstack_1, sizeof fstack);
114 merge_sort_fstack(mfp1, putrec, ftbl);
115 if (fstack_2_count == MERGE_FNUM) {
116 /* More than 4096 files! */
117 mfp2 = ftmp();
118 memcpy(fstack, fstack_2, sizeof fstack);
119 merge_sort_fstack(mfp2, putrec, ftbl);
120 fstack_2[0].fp = mfp2;
121 fstack_2_count = 1;
122 }
123 fstack_2[fstack_2_count].fp = mfp1;
124 fstack_2[fstack_2_count].get = geteasy;
125 fstack_2_count++;
126 fstack_1_count = 0;
127 }
128 fstack_1[fstack_1_count].fp = mfp;
129 fstack_1[fstack_1_count].get = geteasy;
130 fstack_1_count++;
131 fstack_count = 0;
132 }
133
134 fstack[fstack_count].fp = fp;
135 fstack[fstack_count++].get = get;
136 }
137
138 void
139 fmerge(struct filelist *filelist, int nfiles, FILE *outfp, struct field *ftbl)
140 {
141 get_func_t get = SINGL_FLD ? makeline : makekey;
142 FILE *fp;
143 int i;
144
145 for (i = 0; i < nfiles; i++) {
146 fp = fopen(filelist->names[i], "r");
147 if (fp == NULL)
148 err(2, "%s", filelist->names[i]);
149 save_for_merge(fp, get, ftbl);
150 }
151
152 merge_sort(outfp, putline, ftbl);
153 }
154
155 void
156 merge_sort(FILE *outfp, put_func_t put, struct field *ftbl)
157 {
158 int count = fstack_1_count + fstack_2_count;
159 FILE *mfp;
160 int i;
161
162 if (count == 0) {
163 /* All files in initial array */
164 merge_sort_fstack(outfp, put, ftbl);
165 return;
166 }
167
168 count += fstack_count;
169
170 /* Too many files for one merge sort */
171 for (;;) {
172 /* Sort latest 16 files */
173 i = count;
174 if (i > MERGE_FNUM)
175 i = MERGE_FNUM;
176 while (fstack_count > 0)
177 fstack[--i] = fstack[--fstack_count];
178 while (i > 0 && fstack_1_count > 0)
179 fstack[--i] = fstack_1[--fstack_1_count];
180 while (i > 0)
181 fstack[--i] = fstack_2[--fstack_2_count];
182 if (count <= MERGE_FNUM) {
183 /* Got all the data */
184 fstack_count = count;
185 merge_sort_fstack(outfp, put, ftbl);
186 return;
187 }
188 mfp = ftmp();
189 fstack_count = count > MERGE_FNUM ? MERGE_FNUM : count;
190 merge_sort_fstack(mfp, putrec, ftbl);
191 fstack[0].fp = mfp;
192 fstack[0].get = geteasy;
193 fstack_count = 1;
194 count -= MERGE_FNUM - 1;
195 }
196 }
197
198 static void
199 merge_sort_fstack(FILE *outfp, put_func_t put, struct field *ftbl)
200 {
201 struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
202 RECHEADER *new_rec;
203 u_char *new_end;
204 void *tmp;
205 int c, i, nfiles;
206 size_t sz;
207
208 /* Read one record from each file (read again if a duplicate) */
209 for (nfiles = i = 0; i < fstack_count; i++) {
210 cfile = &fstack[i];
211 if (cfile->rec == NULL) {
212 cfile->rec = allocrec(NULL, DEFLLEN);
213 cfile->end = (u_char *)cfile->rec + DEFLLEN;
214 }
215 rewind(cfile->fp);
216
217 for (;;) {
218 c = cfile->get(cfile->fp, cfile->rec, cfile->end, ftbl);
219 if (c == EOF)
220 break;
221
222 if (c == BUFFEND) {
223 /* Double buffer size */
224 sz = (cfile->end - (u_char *)cfile->rec) * 2;
225 cfile->rec = allocrec(cfile->rec, sz);
226 cfile->end = (u_char *)cfile->rec + sz;
227 continue;
228 }
229
230 if (nfiles != 0) {
231 if (insert(flist, cfile, nfiles, !DELETE))
232 /* Duplicate removed */
233 continue;
234 } else
235 flist[0] = cfile;
236 nfiles++;
237 break;
238 }
239 }
240
241 if (nfiles == 0)
242 return;
243
244 /*
245 * We now loop reading a new record from the file with the
246 * 'sorted first' existing record.
247 * As each record is added, the 'first' record is written to the
248 * output file - maintaining one record from each file in the sorted
249 * list.
250 */
251 new_rec = allocrec(NULL, DEFLLEN);
252 new_end = (u_char *)new_rec + DEFLLEN;
253 for (;;) {
254 cfile = flist[0];
255 c = cfile->get(cfile->fp, new_rec, new_end, ftbl);
256 if (c == EOF) {
257 /* Write out last record from now-empty input */
258 put(cfile->rec, outfp);
259 if (--nfiles == 0)
260 break;
261 /* Replace from file with now-first sorted record. */
262 /* (Moving base 'flist' saves copying everything!) */
263 flist++;
264 continue;
265 }
266 if (c == BUFFEND) {
267 /* Buffer not large enough - double in size */
268 sz = (new_end - (u_char *)new_rec) * 2;
269 new_rec = allocrec(new_rec, sz);
270 new_end = (u_char *)new_rec +sz;
271 continue;
272 }
273
274 /* Swap in new buffer, saving old */
275 tmp = cfile->rec;
276 cfile->rec = new_rec;
277 new_rec = tmp;
278 tmp = cfile->end;
279 cfile->end = new_end;
280 new_end = tmp;
281
282 /* Add into sort, removing the original first entry */
283 c = insert(flist, cfile, nfiles, DELETE);
284 if (c != 0 || (UNIQUE && cfile == flist[0]
285 && cmp(new_rec, cfile->rec) == 0)) {
286 /* Was an unwanted duplicate, restore buffer */
287 tmp = cfile->rec;
288 cfile->rec = new_rec;
289 new_rec = tmp;
290 tmp = cfile->end;
291 cfile->end = new_end;
292 new_end = tmp;
293 continue;
294 }
295
296 /* Write out 'old' record */
297 put(new_rec, outfp);
298 }
299
300 free(new_rec);
301 }
302
303 /*
304 * if delete: inserts rec in flist, deletes flist[0];
305 * otherwise just inserts *rec in flist.
306 * Returns 1 if record is a duplicate to be ignored.
307 */
308 static int
309 insert(struct mfile **flist, struct mfile *rec, int ttop, int delete)
310 {
311 int mid, top = ttop, bot = 0, cmpv = 1;
312
313 for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
314 cmpv = cmp(rec->rec, flist[mid]->rec);
315 if (cmpv == 0 ) {
316 if (UNIQUE)
317 /* Duplicate key, read another record */
318 /* NB: This doesn't guarantee to keep any
319 * particular record. */
320 return 1;
321 /*
322 * Apply sort by input file order.
323 * We could truncate the sort is the fileno are
324 * adjacent - but that is all too hard!
325 * The fileno cannot be equal, since we only have one
326 * record from each file (+ flist[0] which never
327 * comes here).
328 */
329 cmpv = rec < flist[mid] ? -1 : 1;
330 if (REVERSE)
331 cmpv = -cmpv;
332 }
333 if (cmpv < 0)
334 top = mid;
335 else
336 bot = mid;
337 }
338
339 /* At this point we haven't yet compared against flist[0] */
340
341 if (delete) {
342 /* flist[0] is ourselves, only the caller knows the old data */
343 if (bot != 0) {
344 memmove(flist, flist + 1, bot * sizeof(MFILE *));
345 flist[bot] = rec;
346 }
347 return 0;
348 }
349
350 /* Inserting original set of records */
351
352 if (bot == 0 && cmpv != 0) {
353 /* Doesn't match flist[1], must compare with flist[0] */
354 cmpv = cmp(rec->rec, flist[0]->rec);
355 if (cmpv == 0 && UNIQUE)
356 return 1;
357 /* Add matching keys in file order (ie new is later) */
358 if (cmpv < 0)
359 bot = -1;
360 }
361 bot++;
362 memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE *));
363 flist[bot] = rec;
364 return 0;
365 }
366
367 /*
368 * check order on one file
369 */
370 void
371 order(struct filelist *filelist, struct field *ftbl)
372 {
373 get_func_t get = SINGL_FLD ? makeline : makekey;
374 RECHEADER *crec, *prec, *trec;
375 u_char *crec_end, *prec_end, *trec_end;
376 FILE *fp;
377 int c;
378
379 fp = fopen(filelist->names[0], "r");
380 if (fp == NULL)
381 err(2, "%s", filelist->names[0]);
382
383 crec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
384 crec_end = crec->data + DEFLLEN;
385 prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
386 prec_end = prec->data + DEFLLEN;
387
388 /* XXX this does exit(0) for overlong lines */
389 if (get(fp, prec, prec_end, ftbl) != 0)
390 exit(0);
391 while (get(fp, crec, crec_end, ftbl) == 0) {
392 if (0 < (c = cmp(prec, crec))) {
393 crec->data[crec->length-1] = 0;
394 errx(1, "found disorder: %s", crec->data+crec->offset);
395 }
396 if (UNIQUE && !c) {
397 crec->data[crec->length-1] = 0;
398 errx(1, "found non-uniqueness: %s",
399 crec->data+crec->offset);
400 }
401 /*
402 * Swap pointers so that this record is on place pointed
403 * to by prec and new record is read to place pointed to by
404 * crec.
405 */
406 trec = prec;
407 prec = crec;
408 crec = trec;
409 trec_end = prec_end;
410 prec_end = crec_end;
411 crec_end = trec_end;
412 }
413 exit(0);
414 }
415
416 static int
417 cmp(RECHEADER *rec1, RECHEADER *rec2)
418 {
419 int len;
420 int r;
421
422 /* key is weights */
423 len = min(rec1->keylen, rec2->keylen);
424 r = memcmp(rec1->data, rec2->data, len);
425 if (r == 0)
426 r = rec1->keylen - rec2->keylen;
427 if (REVERSE)
428 r = -r;
429 return r;
430 }
431