fsort.c revision 1.30 1 /* $NetBSD: fsort.c,v 1.30 2004/02/15 11:54:17 jdolecek Exp $ */
2
3 /*-
4 * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Ben Harris and Jaromir Dolecek.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 /*-
40 * Copyright (c) 1993
41 * The Regents of the University of California. All rights reserved.
42 *
43 * This code is derived from software contributed to Berkeley by
44 * Peter McIlroy.
45 *
46 * Redistribution and use in source and binary forms, with or without
47 * modification, are permitted provided that the following conditions
48 * are met:
49 * 1. Redistributions of source code must retain the above copyright
50 * notice, this list of conditions and the following disclaimer.
51 * 2. Redistributions in binary form must reproduce the above copyright
52 * notice, this list of conditions and the following disclaimer in the
53 * documentation and/or other materials provided with the distribution.
54 * 3. Neither the name of the University nor the names of its contributors
55 * may be used to endorse or promote products derived from this software
56 * without specific prior written permission.
57 *
58 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
59 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
60 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
61 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
62 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
63 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
64 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
65 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
66 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
67 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
68 * SUCH DAMAGE.
69 */
70
71 /*
72 * Read in the next bin. If it fits in one segment sort it;
73 * otherwise refine it by segment deeper by one character,
74 * and try again on smaller bins. Sort the final bin at this level
75 * of recursion to keep the head of fstack at 0.
76 * After PANIC passes, abort to merge sort.
77 */
78 #include "sort.h"
79 #include "fsort.h"
80
81 #ifndef lint
82 __RCSID("$NetBSD: fsort.c,v 1.30 2004/02/15 11:54:17 jdolecek Exp $");
83 __SCCSID("@(#)fsort.c 8.1 (Berkeley) 6/6/93");
84 #endif /* not lint */
85
86 #include <stdlib.h>
87 #include <string.h>
88
89 static const u_char **keylist = 0;
90 u_char *buffer = 0, *linebuf = 0;
91 size_t bufsize = DEFBUFSIZE;
92 size_t linebuf_size;
93 #define FSORTMAX 4
94 int PANIC = FSORTMAX;
95
96 struct tempfile fstack[MAXFCT];
97 #define MSTART (MAXFCT - MERGE_FNUM)
98 #define CHECKFSTACK(n) \
99 if (n >= MAXFCT) \
100 errx(2, "fstack: too many temporary files; use -H or sort in pieces")
101
102 #define SALIGN(n) ((n+sizeof(length_t)-1) & ~(sizeof(length_t)-1))
103
104 void
105 fsort(binno, depth, top, filelist, nfiles, outfp, ftbl)
106 int binno, depth, top;
107 struct filelist *filelist;
108 int nfiles;
109 FILE *outfp;
110 struct field *ftbl;
111 {
112 const u_char **keypos;
113 u_char *bufend;
114 u_char *weights;
115 int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
116 int c, nelem, base;
117 long sizes [NBINS + 1];
118 get_func_t get;
119 struct recheader *crec;
120 struct field tfield[2];
121 FILE *prevfp, *tailfp[FSORTMAX + 1];
122 u_char *nbuffer;
123
124 memset(tailfp, 0, sizeof(tailfp));
125 prevfp = outfp;
126 memset(tfield, 0, sizeof(tfield));
127 if (ftbl[0].flags & R)
128 tfield[0].weights = Rascii;
129 else
130 tfield[0].weights = ascii;
131 tfield[0].icol.num = 1;
132 weights = ftbl[0].weights;
133 if (!buffer) {
134 buffer = malloc(bufsize);
135 keylist = malloc(MAXNUM * sizeof(u_char *));
136 memset(keylist, 0, MAXNUM * sizeof(u_char *));
137 if (!SINGL_FLD) {
138 linebuf_size = DEFLLEN;
139 if ((linebuf = malloc(linebuf_size)) == NULL)
140 errx(2, "cannot allocate memory");
141 }
142 }
143 bufend = buffer + bufsize;
144 if (binno >= 0) {
145 base = top + nfiles;
146 get = getnext;
147 } else {
148 base = 0;
149 if (SINGL_FLD)
150 get = makeline;
151 else
152 get = makekey;
153 }
154 for (;;) {
155 memset(sizes, 0, sizeof(sizes));
156 c = ntfiles = 0;
157 if (binno == weights[REC_D] &&
158 !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */
159 rd_append(weights[REC_D], top,
160 nfiles, prevfp, buffer, bufend);
161 break;
162 } else if (binno == weights[REC_D]) {
163 depth = 0; /* start over on flat weights */
164 ftbl = tfield;
165 weights = ftbl[0].weights;
166 }
167 while (c != EOF) {
168 keypos = keylist;
169 nelem = 0;
170 crec = (RECHEADER *) buffer;
171
172 do_read:
173 while ((c = get(binno, top, filelist, nfiles, crec,
174 bufend, ftbl)) == 0) {
175 *keypos++ = crec->data + depth;
176 if (++nelem == MAXNUM) {
177 c = BUFFEND;
178 break;
179 }
180 crec =(RECHEADER *)((char *) crec +
181 SALIGN(crec->length) + sizeof(TRECHEADER));
182 }
183
184 if (c == BUFFEND && nelem < MAXNUM
185 && bufsize < MAXBUFSIZE) {
186 const u_char **keyp;
187 u_char *oldb = buffer;
188
189 /* buffer was too small for data, allocate
190 * bigger buffer */
191 nbuffer = realloc(buffer, bufsize * 2);
192 if (!nbuffer) {
193 err(2, "failed to realloc buffer to %lu bytes",
194 (unsigned long) bufsize * 2);
195 }
196 buffer = nbuffer;
197 bufsize *= 2;
198 bufend = buffer + bufsize;
199
200 /* patch up keylist[] */
201 for (keyp = &keypos[-1]; keyp >= keylist; keyp--)
202 *keyp = buffer + (*keyp - oldb);
203
204 crec = (RECHEADER *) (buffer + ((u_char *)crec - oldb));
205 goto do_read;
206 }
207
208 if (c != BUFFEND && !ntfiles && !mfct) {
209 /* do not push */
210 continue;
211 }
212
213 /* push */
214 if (panic >= PANIC) {
215 fstack[MSTART + mfct].fp = ftmp();
216 if ((stable_sort)
217 ? sradixsort(keylist, nelem,
218 weights, REC_D)
219 : radixsort(keylist, nelem,
220 weights, REC_D) )
221 err(2, NULL);
222 append(keylist, nelem, depth,
223 fstack[MSTART + mfct].fp, putrec,
224 ftbl);
225 mfct++;
226 /* reduce number of open files */
227 if (mfct == MERGE_FNUM ||(c == EOF && ntfiles)) {
228 /*
229 * Only copy extra incomplete crec
230 * data if there are any.
231 */
232 int nodata = (bufend >= (u_char *)crec
233 && bufend <= crec->data);
234 size_t sz=0;
235 u_char *tmpbuf=NULL;
236
237 if (!nodata) {
238 sz = bufend - crec->data;
239 tmpbuf = malloc(sz);
240 memmove(tmpbuf, crec->data, sz);
241 }
242
243 CHECKFSTACK(base + ntfiles);
244 fstack[base + ntfiles].fp = ftmp();
245 fmerge(0, MSTART, filelist,
246 mfct, geteasy,
247 fstack[base + ntfiles].fp,
248 putrec, ftbl);
249 ntfiles++;
250 mfct = 0;
251
252 if (!nodata) {
253 memmove(crec->data, tmpbuf, sz);
254 free(tmpbuf);
255 }
256 }
257 } else {
258 CHECKFSTACK(base + ntfiles);
259 fstack[base + ntfiles].fp = ftmp();
260 onepass(keylist, depth, nelem, sizes,
261 weights, fstack[base + ntfiles].fp);
262 ntfiles++;
263 }
264 }
265 if (!ntfiles && !mfct) { /* everything in memory--pop */
266 if (nelem > 1
267 && ((stable_sort)
268 ? sradixsort(keylist, nelem, weights, REC_D)
269 : radixsort(keylist, nelem, weights, REC_D) ))
270 err(2, NULL);
271 if (nelem > 0)
272 append(keylist, nelem, depth, outfp, putline, ftbl);
273 break; /* pop */
274 }
275 if (panic >= PANIC) {
276 if (!ntfiles)
277 fmerge(0, MSTART, filelist, mfct, geteasy,
278 outfp, putline, ftbl);
279 else
280 fmerge(0, base, filelist, ntfiles, geteasy,
281 outfp, putline, ftbl);
282 break;
283
284 }
285 total = maxb = lastb = 0; /* find if one bin dominates */
286 for (i = 0; i < NBINS; i++)
287 if (sizes[i]) {
288 if (sizes[i] > sizes[maxb])
289 maxb = i;
290 lastb = i;
291 total += sizes[i];
292 }
293 if (sizes[maxb] < max((total / 2) , BUFSIZE))
294 maxb = lastb; /* otherwise pop after last bin */
295 fstack[base].lastb = lastb;
296 fstack[base].maxb = maxb;
297
298 /* start refining next level. */
299 getnext(-1, base, NULL, ntfiles, crec, bufend, 0); /* rewind */
300 for (i = 0; i < maxb; i++) {
301 if (!sizes[i]) /* bin empty; step ahead file offset */
302 getnext(i, base, NULL,ntfiles, crec, bufend, 0);
303 else
304 fsort(i, depth + 1, base, filelist, ntfiles,
305 outfp, ftbl);
306 }
307
308 get = getnext;
309
310 if (lastb != maxb) {
311 if (prevfp != outfp)
312 tailfp[panic] = prevfp;
313 prevfp = ftmp();
314 for (i = maxb + 1; i <= lastb; i++)
315 if (!sizes[i])
316 getnext(i, base, NULL, ntfiles, crec,
317 bufend,0);
318 else
319 fsort(i, depth + 1, base, filelist,
320 ntfiles, prevfp, ftbl);
321 }
322
323 /* sort biggest (or last) bin at this level */
324 depth++;
325 panic++;
326 binno = maxb;
327 top = base;
328 nfiles = ntfiles; /* so overwrite them */
329 }
330 if (prevfp != outfp) {
331 concat(outfp, prevfp);
332 fclose(prevfp);
333 }
334 for (i = panic; i >= 0; --i)
335 if (tailfp[i]) {
336 concat(outfp, tailfp[i]);
337 fclose(tailfp[i]);
338 }
339
340 /* If on top level, free our structures */
341 if (depth == 0) {
342 free(keylist), keylist = NULL;
343 free(buffer), buffer = NULL;
344 }
345 }
346
347 /*
348 * This is one pass of radix exchange, dumping the bins to disk.
349 */
350 #define swap(a, b, t) t = a, a = b, b = t
351 void
352 onepass(a, depth, n, sizes, tr, fp)
353 const u_char **a;
354 int depth;
355 long n, sizes[];
356 u_char *tr;
357 FILE *fp;
358 {
359 size_t tsizes[NBINS + 1];
360 const u_char **bin[257], ***bp, ***bpmax, **top[256], ***tp;
361 static int histo[256];
362 int *hp;
363 int c;
364 const u_char **an, *t, **aj;
365 const u_char **ak, *r;
366
367 memset(tsizes, 0, sizeof(tsizes));
368 depth += sizeof(TRECHEADER);
369 an = &a[n];
370 for (ak = a; ak < an; ak++) {
371 histo[c = tr[**ak]]++;
372 tsizes[c] += ((const RECHEADER *) (*ak -= depth))->length;
373 }
374
375 bin[0] = a;
376 bpmax = bin + 256;
377 tp = top, hp = histo;
378 for (bp = bin; bp < bpmax; bp++) {
379 *tp++ = *(bp + 1) = *bp + (c = *hp);
380 *hp++ = 0;
381 if (c <= 1)
382 continue;
383 }
384 for (aj = a; aj < an; *aj = r, aj = bin[c + 1])
385 for (r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;)
386 swap(*ak, r, t);
387
388 for (ak = a, c = 0; c < 256; c++) {
389 an = bin[c + 1];
390 n = an - ak;
391 tsizes[c] += n * sizeof(TRECHEADER);
392 /* tell getnext how many elements in this bin, this segment. */
393 EWRITE(&tsizes[c], sizeof(size_t), 1, fp);
394 sizes[c] += tsizes[c];
395 for (; ak < an; ++ak)
396 putrec((const RECHEADER *) *ak, fp);
397 }
398 }
399