msort.c revision 1.26 1 /* $NetBSD: msort.c,v 1.26 2009/09/10 22:02:40 dsl Exp $ */
2
3 /*-
4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Ben Harris and Jaromir Dolecek.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*-
33 * Copyright (c) 1993
34 * The Regents of the University of California. All rights reserved.
35 *
36 * This code is derived from software contributed to Berkeley by
37 * Peter McIlroy.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. Neither the name of the University nor the names of its contributors
48 * may be used to endorse or promote products derived from this software
49 * without specific prior written permission.
50 *
51 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * SUCH DAMAGE.
62 */
63
64 #include "sort.h"
65 #include "fsort.h"
66
67 #ifndef lint
68 __RCSID("$NetBSD: msort.c,v 1.26 2009/09/10 22:02:40 dsl Exp $");
69 __SCCSID("@(#)msort.c 8.1 (Berkeley) 6/6/93");
70 #endif /* not lint */
71
72 #include <stdlib.h>
73 #include <string.h>
74 #include <unistd.h>
75
76 /* Subroutines using comparisons: merge sort and check order */
77 #define DELETE (1)
78
79 typedef struct mfile {
80 u_char *end;
81 int flno;
82 RECHEADER rec[1];
83 } MFILE;
84
85 static int cmp(RECHEADER *, RECHEADER *);
86 static int insert(struct mfile **, struct mfile **, int, int);
87 static void merge(int, int, get_func_t, FILE *, put_func_t, struct field *);
88
89 void
90 fmerge(int binno, struct filelist *filelist, int nfiles,
91 get_func_t get, FILE *outfp, put_func_t fput, struct field *ftbl)
92 {
93 FILE *tout;
94 int i, j, last;
95 put_func_t put;
96
97 while (nfiles) {
98 put = putrec;
99 for (j = 0; j < nfiles; j += MERGE_FNUM) {
100 if (nfiles <= MERGE_FNUM) {
101 tout = outfp;
102 put = fput;
103 }
104 else
105 tout = ftmp();
106 last = min(MERGE_FNUM, nfiles - j);
107 if (binno < 0) {
108 for (i = 0; i < last; i++)
109 if (!(fstack[i+MAXFCT-1-MERGE_FNUM].fp =
110 fopen(filelist->names[j+i], "r")))
111 err(2, "%s",
112 filelist->names[j+i]);
113 merge(MAXFCT-1-MERGE_FNUM, last, get, tout, put, ftbl);
114 } else {
115 for (i = 0; i< last; i++)
116 rewind(fstack[i+j].fp);
117 merge(j, last, get, tout, put, ftbl);
118 }
119 if (nfiles > MERGE_FNUM)
120 fstack[j/MERGE_FNUM].fp = tout;
121 }
122 nfiles = (nfiles + (MERGE_FNUM - 1)) / MERGE_FNUM;
123 if (nfiles == 1)
124 nfiles = 0;
125 if (binno < 0) {
126 binno = 0;
127 get = geteasy;
128 }
129 }
130 }
131
132 static void
133 merge(int infl0, int nfiles, get_func_t get, FILE *outfp, put_func_t put,
134 struct field *ftbl)
135 {
136 int c, i, j, nf = nfiles;
137 struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
138 size_t availsz;
139 static void *bufs[MERGE_FNUM + 1];
140 static size_t bufs_sz[MERGE_FNUM + 1];
141
142 /*
143 * We need nfiles + 1 buffers.
144 */
145 for (i = 0; i < nfiles + 1; i++) {
146 if (bufs[i])
147 continue;
148
149 bufs[i] = malloc(DEFLLEN);
150 if (!bufs[i])
151 err(2, "merge: malloc");
152 memset(bufs[i], 0, DEFLLEN);
153 bufs_sz[i] = DEFLLEN;
154 }
155
156 /* Read one record from each file (read again if a duplicate) */
157 for (i = j = 0; i < nfiles; i++, j++) {
158 cfile = (struct mfile *) bufs[j];
159 cfile->flno = infl0 + j;
160 cfile->end = (u_char *) bufs[j] + bufs_sz[j];
161 for (c = 1; c == 1;) {
162 c = get(cfile->flno, 0, NULL, nfiles, cfile->rec,
163 cfile->end, ftbl);
164 if (c == EOF) {
165 --i;
166 --nfiles;
167 break;
168 }
169
170 if (c == BUFFEND) {
171 bufs_sz[j] *= 2;
172 cfile = realloc(bufs[j], bufs_sz[j]);
173 if (!cfile)
174 err(2, "merge: realloc");
175
176 bufs[j] = (void *) cfile;
177 cfile->end = (u_char *)cfile + bufs_sz[j];
178
179 c = 1;
180 continue;
181 }
182
183 if (i)
184 c = insert(flist, &cfile, i, !DELETE);
185 else
186 flist[0] = cfile;
187 }
188 }
189
190 /*
191 * We now loop reading a new record from the file with the
192 * 'sorted first' existing record.
193 * As each record is added, the 'first' record is written to the
194 * output file - maintaining one record from each file in the sorted
195 * list.
196 */
197 cfile = (struct mfile *) bufs[nf];
198 cfile->end = (u_char *) cfile + bufs_sz[nf];
199 cfile->flno = flist[0]->flno;
200 for (;;) {
201 c = get(cfile->flno, 0, NULL, nfiles, cfile->rec,
202 cfile->end, ftbl);
203 if (c == EOF) {
204 /* Write out last record from now-empty input */
205 put(flist[0]->rec, outfp);
206 if (--nfiles == 0)
207 break;
208 /* Replace from file with now-first sorted record. */
209 /* (Moving base 'flist' saves copying everything!) */
210 flist++;
211 cfile->flno = flist[0]->flno;
212 continue;
213 }
214 if (c == BUFFEND) {
215 /* Buffer not large enough - double in size */
216 char *oldbuf = (char *) cfile;
217 availsz = (char *) cfile->end - oldbuf;
218 availsz *= 2;
219 cfile = realloc(oldbuf, availsz);
220 if (!cfile)
221 err(2, "merge: realloc");
222 cfile->end = (u_char *)cfile + availsz;
223
224 /* Update pointers we'll use for next merge */
225 for (i = 0; i < nf + 1; i++) {
226 if (bufs[i] == oldbuf) {
227 bufs[i] = (char *)cfile;
228 bufs_sz[i] = availsz;
229 break;
230 }
231 }
232 /* Read again from same file into same buffer */
233 continue;
234 }
235
236 /* Add into sort, removing the original first entry */
237 c = insert(flist, &cfile, nfiles, DELETE);
238
239 /*
240 * 'cfile' is now the buffer from the old record from the
241 * file we just read, but with the file number of the
242 * current 'first record.
243 * (Unless we are rejecting a duplicate, when c == 1 and
244 * it is unchanged!)
245 */
246 if (c == 0)
247 put(cfile->rec, outfp);
248 }
249 }
250
251 /*
252 * if delete: inserts *rec in flist, deletes flist[0], and leaves it in *rec;
253 * otherwise just inserts *rec in flist.
254 */
255 static int
256 insert(struct mfile **flist, struct mfile **rec, int ttop, int delete)
257 {
258 struct mfile *tmprec = *rec;
259 int mid, top = ttop, bot = 0, cmpv = 1;
260
261 for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
262 cmpv = cmp(tmprec->rec, flist[mid]->rec);
263 if (cmpv == 0 ) {
264 if (UNIQUE)
265 /* Duplicate key, read another record */
266 return 1;
267 /*
268 * Apply sort by fileno, to give priority to earlier
269 * specified files, hence providing a stable sort.
270 * We could truncate the sort is the fileno are
271 * adjacent - but that is all too hard!
272 * The fileno cannot be equal, since we only have one
273 * record from each file (+ flist[0] which never
274 * comes here).
275 */
276 cmpv = tmprec->flno - flist[mid]->flno;
277 if (REVERSE)
278 cmpv = -cmpv;
279 }
280 if (cmpv < 0)
281 top = mid;
282 else
283 bot = mid;
284 }
285
286 /* At this point we haven't yet compared against flist[0] */
287
288 if (delete) {
289 /* flist[0] came from the same file, it cannot be earlier. */
290 if (UNIQUE && bot == 0 && cmp(tmprec->rec, flist[0]->rec) == 0)
291 /* Duplicate record (key) in original file */
292 return 1;
293 tmprec = flist[0];
294 if (bot)
295 memmove(flist, flist + 1, bot * sizeof(MFILE **));
296 flist[bot] = *rec;
297 tmprec->flno = flist[0]->flno;
298 *rec = tmprec;
299 return 0;
300 }
301
302 /* Inserting original set of records */
303
304 if (bot == 0 && cmpv != 0) {
305 /* Doesn't match flist[1], must compare with flist[0] */
306 cmpv = cmp(tmprec->rec, flist[0]->rec);
307 if (cmpv == 0 && UNIQUE)
308 return 1;
309 /* Add matching keys in file order (ie new is later) */
310 if (cmpv < 0)
311 bot = -1;
312 }
313 bot++;
314 memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE **));
315 flist[bot] = *rec;
316 return 0;
317 }
318
319 /*
320 * check order on one file
321 */
322 void
323 order(struct filelist *filelist, get_func_t get, struct field *ftbl)
324 {
325 RECHEADER *crec, *prec, *trec;
326 u_char *crec_end, *prec_end, *trec_end;
327 int c;
328
329 crec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
330 crec_end = crec->data + DEFLLEN;
331 prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
332 prec_end = prec->data + DEFLLEN;
333
334 /* XXX this does exit(0) for overlong lines */
335 if (get(-1, 0, filelist, 1, prec, prec_end, ftbl) != 0)
336 exit(0);
337 while (get(-1, 0, filelist, 1, crec, crec_end, ftbl) == 0) {
338 if (0 < (c = cmp(prec, crec))) {
339 crec->data[crec->length-1] = 0;
340 errx(1, "found disorder: %s", crec->data+crec->offset);
341 }
342 if (UNIQUE && !c) {
343 crec->data[crec->length-1] = 0;
344 errx(1, "found non-uniqueness: %s",
345 crec->data+crec->offset);
346 }
347 /*
348 * Swap pointers so that this record is on place pointed
349 * to by prec and new record is read to place pointed to by
350 * crec.
351 */
352 trec = prec;
353 prec = crec;
354 crec = trec;
355 trec_end = prec_end;
356 prec_end = crec_end;
357 crec_end = trec_end;
358 }
359 exit(0);
360 }
361
362 static int
363 cmp(RECHEADER *rec1, RECHEADER *rec2)
364 {
365 int len;
366 int r;
367
368 /* key is weights */
369 len = min(rec1->keylen, rec2->keylen);
370 r = memcmp(rec1->data, rec2->data, len);
371 if (r == 0)
372 r = rec1->keylen - rec2->keylen;
373 if (REVERSE)
374 r = -r;
375 return r;
376 }
377