msort.c revision 1.25 1 1.25 dsl /* $NetBSD: msort.c,v 1.25 2009/09/05 12:00:25 dsl Exp $ */
2 1.14 jdolecek
3 1.14 jdolecek /*-
4 1.14 jdolecek * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5 1.14 jdolecek * All rights reserved.
6 1.14 jdolecek *
7 1.14 jdolecek * This code is derived from software contributed to The NetBSD Foundation
8 1.14 jdolecek * by Ben Harris and Jaromir Dolecek.
9 1.14 jdolecek *
10 1.14 jdolecek * Redistribution and use in source and binary forms, with or without
11 1.14 jdolecek * modification, are permitted provided that the following conditions
12 1.14 jdolecek * are met:
13 1.14 jdolecek * 1. Redistributions of source code must retain the above copyright
14 1.14 jdolecek * notice, this list of conditions and the following disclaimer.
15 1.14 jdolecek * 2. Redistributions in binary form must reproduce the above copyright
16 1.14 jdolecek * notice, this list of conditions and the following disclaimer in the
17 1.14 jdolecek * documentation and/or other materials provided with the distribution.
18 1.14 jdolecek *
19 1.14 jdolecek * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 1.14 jdolecek * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 1.14 jdolecek * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 1.14 jdolecek * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 1.14 jdolecek * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 1.14 jdolecek * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 1.14 jdolecek * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 1.14 jdolecek * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 1.14 jdolecek * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 1.14 jdolecek * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 1.14 jdolecek * POSSIBILITY OF SUCH DAMAGE.
30 1.14 jdolecek */
31 1.2 bjh21
32 1.1 bjh21 /*-
33 1.1 bjh21 * Copyright (c) 1993
34 1.1 bjh21 * The Regents of the University of California. All rights reserved.
35 1.1 bjh21 *
36 1.1 bjh21 * This code is derived from software contributed to Berkeley by
37 1.1 bjh21 * Peter McIlroy.
38 1.1 bjh21 *
39 1.1 bjh21 * Redistribution and use in source and binary forms, with or without
40 1.1 bjh21 * modification, are permitted provided that the following conditions
41 1.1 bjh21 * are met:
42 1.1 bjh21 * 1. Redistributions of source code must retain the above copyright
43 1.1 bjh21 * notice, this list of conditions and the following disclaimer.
44 1.1 bjh21 * 2. Redistributions in binary form must reproduce the above copyright
45 1.1 bjh21 * notice, this list of conditions and the following disclaimer in the
46 1.1 bjh21 * documentation and/or other materials provided with the distribution.
47 1.13 agc * 3. Neither the name of the University nor the names of its contributors
48 1.1 bjh21 * may be used to endorse or promote products derived from this software
49 1.1 bjh21 * without specific prior written permission.
50 1.1 bjh21 *
51 1.1 bjh21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
52 1.1 bjh21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
53 1.1 bjh21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
54 1.1 bjh21 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
55 1.1 bjh21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
56 1.1 bjh21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
57 1.1 bjh21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
58 1.1 bjh21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
59 1.1 bjh21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
60 1.1 bjh21 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 1.1 bjh21 * SUCH DAMAGE.
62 1.1 bjh21 */
63 1.1 bjh21
64 1.2 bjh21 #include "sort.h"
65 1.2 bjh21 #include "fsort.h"
66 1.2 bjh21
67 1.1 bjh21 #ifndef lint
68 1.25 dsl __RCSID("$NetBSD: msort.c,v 1.25 2009/09/05 12:00:25 dsl Exp $");
69 1.2 bjh21 __SCCSID("@(#)msort.c 8.1 (Berkeley) 6/6/93");
70 1.1 bjh21 #endif /* not lint */
71 1.1 bjh21
72 1.1 bjh21 #include <stdlib.h>
73 1.1 bjh21 #include <string.h>
74 1.1 bjh21 #include <unistd.h>
75 1.1 bjh21
76 1.1 bjh21 /* Subroutines using comparisons: merge sort and check order */
77 1.1 bjh21 #define DELETE (1)
78 1.8 jdolecek
79 1.1 bjh21 typedef struct mfile {
80 1.1 bjh21 u_char *end;
81 1.1 bjh21 short flno;
82 1.25 dsl RECHEADER rec[1];
83 1.1 bjh21 } MFILE;
84 1.8 jdolecek
85 1.23 dsl static u_char *wts;
86 1.1 bjh21
87 1.19 dsl static int cmp(RECHEADER *, RECHEADER *);
88 1.19 dsl static int insert(struct mfile **, struct mfile **, int, int);
89 1.11 jdolecek static void merge(int, int, get_func_t, FILE *, put_func_t, struct field *);
90 1.1 bjh21
91 1.1 bjh21 void
92 1.22 dsl fmerge(int binno, struct filelist *filelist, int nfiles,
93 1.19 dsl get_func_t get, FILE *outfp, put_func_t fput, struct field *ftbl)
94 1.1 bjh21 {
95 1.1 bjh21 FILE *tout;
96 1.1 bjh21 int i, j, last;
97 1.7 jdolecek put_func_t put;
98 1.1 bjh21
99 1.1 bjh21 wts = ftbl->weights;
100 1.1 bjh21
101 1.1 bjh21 while (nfiles) {
102 1.1 bjh21 put = putrec;
103 1.8 jdolecek for (j = 0; j < nfiles; j += MERGE_FNUM) {
104 1.8 jdolecek if (nfiles <= MERGE_FNUM) {
105 1.3 bjh21 tout = outfp;
106 1.1 bjh21 put = fput;
107 1.1 bjh21 }
108 1.1 bjh21 else
109 1.1 bjh21 tout = ftmp();
110 1.8 jdolecek last = min(MERGE_FNUM, nfiles - j);
111 1.1 bjh21 if (binno < 0) {
112 1.10 jdolecek for (i = 0; i < last; i++)
113 1.22 dsl if (!(fstack[i+MAXFCT-1-MERGE_FNUM].fp =
114 1.10 jdolecek fopen(filelist->names[j+i], "r")))
115 1.7 jdolecek err(2, "%s",
116 1.7 jdolecek filelist->names[j+i]);
117 1.8 jdolecek merge(MAXFCT-1-MERGE_FNUM, last, get, tout, put, ftbl);
118 1.10 jdolecek } else {
119 1.1 bjh21 for (i = 0; i< last; i++)
120 1.22 dsl rewind(fstack[i+j].fp);
121 1.22 dsl merge(j, last, get, tout, put, ftbl);
122 1.1 bjh21 }
123 1.8 jdolecek if (nfiles > MERGE_FNUM)
124 1.22 dsl fstack[j/MERGE_FNUM].fp = tout;
125 1.1 bjh21 }
126 1.8 jdolecek nfiles = (nfiles + (MERGE_FNUM - 1)) / MERGE_FNUM;
127 1.1 bjh21 if (nfiles == 1)
128 1.1 bjh21 nfiles = 0;
129 1.1 bjh21 if (binno < 0) {
130 1.1 bjh21 binno = 0;
131 1.1 bjh21 get = geteasy;
132 1.1 bjh21 }
133 1.1 bjh21 }
134 1.1 bjh21 }
135 1.1 bjh21
136 1.11 jdolecek static void
137 1.19 dsl merge(int infl0, int nfiles, get_func_t get, FILE *outfp, put_func_t put,
138 1.19 dsl struct field *ftbl)
139 1.1 bjh21 {
140 1.9 jdolecek int c, i, j, nf = nfiles;
141 1.12 jdolecek struct mfile *flistb[MERGE_FNUM], **flist = flistb, *cfile;
142 1.24 dsl size_t availsz;
143 1.16 itojun static void *bufs[MERGE_FNUM + 1];
144 1.16 itojun static size_t bufs_sz[MERGE_FNUM + 1];
145 1.9 jdolecek
146 1.9 jdolecek /*
147 1.23 dsl * We need nfiles + 1 buffers.
148 1.9 jdolecek */
149 1.23 dsl for (i = 0; i < nfiles + 1; i++) {
150 1.9 jdolecek if (bufs[i])
151 1.9 jdolecek continue;
152 1.9 jdolecek
153 1.9 jdolecek bufs[i] = malloc(DEFLLEN);
154 1.9 jdolecek if (!bufs[i])
155 1.12 jdolecek err(2, "merge: malloc");
156 1.17 jdolecek memset(bufs[i], 0, DEFLLEN);
157 1.9 jdolecek bufs_sz[i] = DEFLLEN;
158 1.9 jdolecek }
159 1.8 jdolecek
160 1.23 dsl /* Read one record from each file (read again if a duplicate) */
161 1.11 jdolecek for (i = j = 0; i < nfiles; i++, j++) {
162 1.9 jdolecek cfile = (struct mfile *) bufs[j];
163 1.9 jdolecek cfile->flno = infl0 + j;
164 1.9 jdolecek cfile->end = (u_char *) bufs[j] + bufs_sz[j];
165 1.1 bjh21 for (c = 1; c == 1;) {
166 1.23 dsl c = get(cfile->flno, 0, NULL, nfiles, cfile->rec,
167 1.23 dsl cfile->end, ftbl);
168 1.23 dsl if (c == EOF) {
169 1.1 bjh21 --i;
170 1.1 bjh21 --nfiles;
171 1.1 bjh21 break;
172 1.1 bjh21 }
173 1.9 jdolecek
174 1.9 jdolecek if (c == BUFFEND) {
175 1.23 dsl bufs_sz[j] *= 2;
176 1.15 itojun cfile = realloc(bufs[j], bufs_sz[j]);
177 1.9 jdolecek if (!cfile)
178 1.12 jdolecek err(2, "merge: realloc");
179 1.9 jdolecek
180 1.12 jdolecek bufs[j] = (void *) cfile;
181 1.9 jdolecek cfile->end = (u_char *)cfile + bufs_sz[j];
182 1.9 jdolecek
183 1.9 jdolecek c = 1;
184 1.9 jdolecek continue;
185 1.9 jdolecek }
186 1.9 jdolecek
187 1.1 bjh21 if (i)
188 1.1 bjh21 c = insert(flist, &cfile, i, !DELETE);
189 1.1 bjh21 else
190 1.1 bjh21 flist[0] = cfile;
191 1.1 bjh21 }
192 1.1 bjh21 }
193 1.9 jdolecek
194 1.24 dsl /*
195 1.24 dsl * We now loop reading a new record from the file with the
196 1.24 dsl * 'sorted first' existing record.
197 1.24 dsl * As each record is added, the 'first' record is written to the
198 1.24 dsl * output file - maintaining one record from each file in the sorted
199 1.24 dsl * list.
200 1.24 dsl */
201 1.9 jdolecek cfile = (struct mfile *) bufs[nf];
202 1.24 dsl cfile->end = (u_char *) cfile + bufs_sz[nf];
203 1.1 bjh21 cfile->flno = flist[0]->flno;
204 1.24 dsl for (;;) {
205 1.24 dsl c = get(cfile->flno, 0, NULL, nfiles, cfile->rec,
206 1.24 dsl cfile->end, ftbl);
207 1.24 dsl if (c == EOF) {
208 1.24 dsl /* Write out last record from now-empty input */
209 1.24 dsl put(flist[0]->rec, outfp);
210 1.24 dsl if (--nfiles == 0)
211 1.1 bjh21 break;
212 1.24 dsl /* Replace from file with now-first sorted record. */
213 1.24 dsl /* (Moving base 'flist' saves copying everything!) */
214 1.24 dsl flist++;
215 1.24 dsl cfile->flno = flist[0]->flno;
216 1.24 dsl continue;
217 1.24 dsl }
218 1.24 dsl if (c == BUFFEND) {
219 1.24 dsl /* Buffer not large enough - double in size */
220 1.24 dsl char *oldbuf = (char *) cfile;
221 1.24 dsl availsz = (char *) cfile->end - oldbuf;
222 1.24 dsl availsz *= 2;
223 1.24 dsl cfile = realloc(oldbuf, availsz);
224 1.24 dsl if (!cfile)
225 1.24 dsl err(2, "merge: realloc");
226 1.24 dsl cfile->end = (u_char *)cfile + availsz;
227 1.24 dsl
228 1.24 dsl /* Update pointers we'll use for next merge */
229 1.24 dsl for (i = 0; i < nf + 1; i++) {
230 1.24 dsl if (bufs[i] == oldbuf) {
231 1.24 dsl bufs[i] = (char *)cfile;
232 1.24 dsl bufs_sz[i] = availsz;
233 1.24 dsl break;
234 1.9 jdolecek }
235 1.9 jdolecek }
236 1.24 dsl /* Read again from same file into same buffer */
237 1.24 dsl continue;
238 1.1 bjh21 }
239 1.24 dsl
240 1.24 dsl /* Add into sort, removing the original first entry */
241 1.24 dsl c = insert(flist, &cfile, nfiles, DELETE);
242 1.24 dsl
243 1.24 dsl /*
244 1.24 dsl * 'cfile' is now the buffer from the old record from the
245 1.24 dsl * file we just read, but with the file number of the
246 1.24 dsl * current 'first record.
247 1.24 dsl * (Unless we are rejecting a duplicate, when c == 1 and
248 1.24 dsl * it is unchanged!)
249 1.24 dsl */
250 1.24 dsl if (c == 0)
251 1.24 dsl put(cfile->rec, outfp);
252 1.24 dsl }
253 1.1 bjh21 }
254 1.1 bjh21
255 1.1 bjh21 /*
256 1.1 bjh21 * if delete: inserts *rec in flist, deletes flist[0], and leaves it in *rec;
257 1.1 bjh21 * otherwise just inserts *rec in flist.
258 1.10 jdolecek */
259 1.1 bjh21 static int
260 1.19 dsl insert(struct mfile **flist, struct mfile **rec, int ttop, int delete)
261 1.1 bjh21 {
262 1.8 jdolecek struct mfile *tmprec = *rec;
263 1.8 jdolecek int mid, top = ttop, bot = 0, cmpv = 1;
264 1.8 jdolecek
265 1.16 itojun for (mid = top / 2; bot + 1 != top; mid = (bot + top) / 2) {
266 1.1 bjh21 cmpv = cmp(tmprec->rec, flist[mid]->rec);
267 1.24 dsl if (cmpv == 0 ) {
268 1.8 jdolecek if (UNIQUE)
269 1.24 dsl /* Duplicate key, read another record */
270 1.24 dsl return 1;
271 1.23 dsl /*
272 1.24 dsl * Apply sort by fileno, to give priority to earlier
273 1.24 dsl * specified files, hence providing a stable sort.
274 1.24 dsl * We could truncate the sort is the fileno are
275 1.24 dsl * adjacent - but that is all too hard!
276 1.24 dsl * The fileno cannot be equal, since we only have one
277 1.24 dsl * record from each file (+ flist[0] which never
278 1.24 dsl * comes here).
279 1.23 dsl */
280 1.23 dsl cmpv = tmprec->flno - flist[mid]->flno;
281 1.1 bjh21 }
282 1.24 dsl if (cmpv < 0)
283 1.24 dsl top = mid;
284 1.24 dsl else
285 1.24 dsl bot = mid;
286 1.1 bjh21 }
287 1.8 jdolecek
288 1.24 dsl /* At this point we haven't yet compared against flist[0] */
289 1.24 dsl
290 1.1 bjh21 if (delete) {
291 1.24 dsl /* flist[0] came from the same file, it cannot be earlier. */
292 1.24 dsl if (UNIQUE && bot == 0 && cmp(tmprec->rec, flist[0]->rec) == 0)
293 1.24 dsl /* Duplicate record (key) in original file */
294 1.24 dsl return 1;
295 1.1 bjh21 tmprec = flist[0];
296 1.1 bjh21 if (bot)
297 1.16 itojun memmove(flist, flist + 1, bot * sizeof(MFILE **));
298 1.1 bjh21 flist[bot] = *rec;
299 1.24 dsl tmprec->flno = flist[0]->flno;
300 1.1 bjh21 *rec = tmprec;
301 1.24 dsl return 0;
302 1.24 dsl }
303 1.24 dsl
304 1.24 dsl /* Inserting original set of records */
305 1.24 dsl
306 1.24 dsl if (bot == 0 && cmpv != 0) {
307 1.24 dsl /* Doesn't match flist[1], must compare with flist[0] */
308 1.24 dsl cmpv = cmp(tmprec->rec, flist[0]->rec);
309 1.24 dsl if (cmpv == 0 && UNIQUE)
310 1.24 dsl return 1;
311 1.24 dsl /* Add matching keys in file order (ie new is later) */
312 1.24 dsl if (cmpv < 0)
313 1.24 dsl bot = -1;
314 1.1 bjh21 }
315 1.24 dsl bot++;
316 1.24 dsl memmove(flist + bot + 1, flist + bot, (ttop - bot) * sizeof(MFILE **));
317 1.24 dsl flist[bot] = *rec;
318 1.24 dsl return 0;
319 1.1 bjh21 }
320 1.1 bjh21
321 1.1 bjh21 /*
322 1.1 bjh21 * check order on one file
323 1.1 bjh21 */
324 1.1 bjh21 void
325 1.19 dsl order(struct filelist *filelist, get_func_t get, struct field *ftbl)
326 1.1 bjh21 {
327 1.25 dsl RECHEADER *crec, *prec, *trec;
328 1.6 jdolecek u_char *crec_end, *prec_end, *trec_end;
329 1.1 bjh21 int c;
330 1.1 bjh21
331 1.25 dsl crec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
332 1.25 dsl crec_end = crec->data + DEFLLEN;
333 1.25 dsl prec = malloc(offsetof(RECHEADER, data[DEFLLEN]));
334 1.25 dsl prec_end = prec->data + DEFLLEN;
335 1.1 bjh21 wts = ftbl->weights;
336 1.23 dsl
337 1.23 dsl /* XXX this does exit(0) for overlong lines */
338 1.23 dsl if (get(-1, 0, filelist, 1, prec, prec_end, ftbl) != 0)
339 1.23 dsl exit(0);
340 1.23 dsl while (get(-1, 0, filelist, 1, crec, crec_end, ftbl) == 0) {
341 1.1 bjh21 if (0 < (c = cmp(prec, crec))) {
342 1.1 bjh21 crec->data[crec->length-1] = 0;
343 1.1 bjh21 errx(1, "found disorder: %s", crec->data+crec->offset);
344 1.1 bjh21 }
345 1.1 bjh21 if (UNIQUE && !c) {
346 1.1 bjh21 crec->data[crec->length-1] = 0;
347 1.1 bjh21 errx(1, "found non-uniqueness: %s",
348 1.1 bjh21 crec->data+crec->offset);
349 1.1 bjh21 }
350 1.6 jdolecek /*
351 1.6 jdolecek * Swap pointers so that this record is on place pointed
352 1.6 jdolecek * to by prec and new record is read to place pointed to by
353 1.6 jdolecek * crec.
354 1.6 jdolecek */
355 1.1 bjh21 trec = prec;
356 1.1 bjh21 prec = crec;
357 1.1 bjh21 crec = trec;
358 1.6 jdolecek trec_end = prec_end;
359 1.6 jdolecek prec_end = crec_end;
360 1.6 jdolecek crec_end = trec_end;
361 1.1 bjh21 }
362 1.1 bjh21 exit(0);
363 1.1 bjh21 }
364 1.1 bjh21
365 1.1 bjh21 static int
366 1.19 dsl cmp(RECHEADER *rec1, RECHEADER *rec2)
367 1.1 bjh21 {
368 1.4 jdolecek int r;
369 1.23 dsl size_t len, i;
370 1.23 dsl u_char *pos1, *pos2;
371 1.4 jdolecek u_char *cwts;
372 1.23 dsl
373 1.23 dsl if (!SINGL_FLD)
374 1.23 dsl /* key is weights, and is 0x00 terminated */
375 1.23 dsl return memcmp(rec1->data, rec2->data, rec1->offset);
376 1.23 dsl
377 1.23 dsl /* We have to apply the weights ourselves */
378 1.23 dsl cwts = wts;
379 1.23 dsl
380 1.23 dsl pos1 = rec1->data;
381 1.23 dsl pos2 = rec2->data;
382 1.23 dsl len = rec1->length;
383 1.23 dsl
384 1.23 dsl for (i = 0; i < len; i++) {
385 1.23 dsl r = cwts[pos1[i]] - cwts[pos2[i]];
386 1.23 dsl if (r)
387 1.23 dsl return r;
388 1.1 bjh21 }
389 1.23 dsl
390 1.1 bjh21 return (0);
391 1.1 bjh21 }
392