Home | History | Annotate | Line # | Download | only in sort
      1 /*	$NetBSD: fields.c,v 1.33 2013/01/20 10:12:58 apb Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Ben Harris and Jaromir Dolecek.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*-
     33  * Copyright (c) 1993
     34  *	The Regents of the University of California.  All rights reserved.
     35  *
     36  * This code is derived from software contributed to Berkeley by
     37  * Peter McIlroy.
     38  *
     39  * Redistribution and use in source and binary forms, with or without
     40  * modification, are permitted provided that the following conditions
     41  * are met:
     42  * 1. Redistributions of source code must retain the above copyright
     43  *    notice, this list of conditions and the following disclaimer.
     44  * 2. Redistributions in binary form must reproduce the above copyright
     45  *    notice, this list of conditions and the following disclaimer in the
     46  *    documentation and/or other materials provided with the distribution.
     47  * 3. Neither the name of the University nor the names of its contributors
     48  *    may be used to endorse or promote products derived from this software
     49  *    without specific prior written permission.
     50  *
     51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     61  * SUCH DAMAGE.
     62  */
     63 
     64 /* Subroutines to generate sort keys. */
     65 
     66 #include "sort.h"
     67 
     68 __RCSID("$NetBSD: fields.c,v 1.33 2013/01/20 10:12:58 apb Exp $");
     69 
     70 #define SKIP_BLANKS(ptr) {					\
     71 	if (BLANK & d_mask[*(ptr)])				\
     72 		while (BLANK & d_mask[*(++(ptr))]);		\
     73 }
     74 
     75 #define NEXTCOL(pos) {						\
     76 	if (!SEP_FLAG)						\
     77 		while (BLANK & l_d_mask[*(++pos)]);		\
     78 	while ((*(pos+1) != '\0') && !((FLD_D | REC_D_F) & l_d_mask[*++pos]));\
     79 }
     80 
     81 static u_char *enterfield(u_char *, const u_char *, struct field *, int);
     82 static u_char *number(u_char *, const u_char *, u_char *, u_char *, int);
     83 static u_char *length(u_char *, const u_char *, u_char *, u_char *, int);
     84 
     85 #define DECIMAL_POINT '.'
     86 
     87 /*
     88  * constructs sort key with leading recheader, followed by the key,
     89  * followed by the original line.
     90  */
     91 length_t
     92 enterkey(RECHEADER *keybuf, const u_char *keybuf_end, u_char *line_data,
     93     size_t line_size, struct field fieldtable[])
     94 	/* keybuf:	 pointer to start of key */
     95 {
     96 	int i;
     97 	u_char *l_d_mask;
     98 	u_char *lineend, *pos;
     99 	const u_char *endkey;
    100 	u_char *keypos;
    101 	struct coldesc *clpos;
    102 	int col = 1;
    103 	struct field *ftpos;
    104 
    105 	l_d_mask = d_mask;
    106 	pos = line_data - 1;
    107 	lineend = line_data + line_size-1;
    108 				/* don't include rec_delimiter */
    109 
    110 	for (i = 0; i < ncols; i++) {
    111 		clpos = clist + i;
    112 		for (; (col < clpos->num) && (pos < lineend); col++) {
    113 			NEXTCOL(pos);
    114 		}
    115 		if (pos >= lineend)
    116 			break;
    117 		clpos->start = SEP_FLAG ? pos + 1 : pos;
    118 		NEXTCOL(pos);
    119 		clpos->end = pos;
    120 		col++;
    121 		if (pos >= lineend) {
    122 			clpos->end = lineend;
    123 			i++;
    124 			break;
    125 		}
    126 	}
    127 	for (; i <= ncols; i++)
    128 		clist[i].start = clist[i].end = lineend;
    129 	if (clist[0].start < line_data)
    130 		clist[0].start++;
    131 
    132 	/*
    133 	 * We write the sort keys (concatenated) followed by the
    134 	 * original line data (for output) as the 'keybuf' data.
    135 	 * keybuf->length is the number of key bytes + data bytes.
    136 	 * keybuf->offset is the number of key bytes.
    137 	 * We add a record separator weight after the key in case
    138 	 * (as is usual) we need to preserve the order of equal lines,
    139 	 * and for 'sort -u'.
    140 	 * The key itself will have had the correct weight applied.
    141 	 */
    142 	keypos = keybuf->data;
    143 	endkey = keybuf_end - line_size - 1;
    144 	if (endkey <= keypos)
    145 		/* No room for any key bytes */
    146 		return 1;
    147 
    148 	for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++) {
    149 		if ((keypos = enterfield(keypos, endkey, ftpos,
    150 		    fieldtable->flags)) == NULL)
    151 			return (1);
    152 	}
    153 
    154 	keybuf->offset = keypos - keybuf->data;
    155 	keybuf->length = keybuf->offset + line_size;
    156 
    157 	/*
    158 	 * Posix requires that equal keys be further sorted by the
    159 	 * entire original record.
    160 	 * NetBSD has (at least for some time) kept equal keys in
    161 	 * their original order.
    162 	 * For 'sort -u' posix_sort is unset.
    163 	 */
    164 	keybuf->keylen = posix_sort ? keybuf->length : keybuf->offset;
    165 
    166 	memcpy(keypos, line_data, line_size);
    167 	return (0);
    168 }
    169 
    170 /*
    171  * constructs a field (as defined by -k) within a key
    172  */
    173 static u_char *
    174 enterfield(u_char *tablepos, const u_char *endkey, struct field *cur_fld,
    175     int gflags)
    176 {
    177 	u_char *start, *end, *lineend, *mask, *lweight;
    178 	struct column icol, tcol;
    179 	u_int flags;
    180 
    181 	icol = cur_fld->icol;
    182 	tcol = cur_fld->tcol;
    183 	flags = cur_fld->flags;
    184 	start = icol.p->start;
    185 	lineend = clist[ncols].end;
    186 	if (flags & BI)
    187 		SKIP_BLANKS(start);
    188 	start += icol.indent;
    189 	start = min(start, lineend);
    190 
    191 	if (!tcol.num)
    192 		end = lineend;
    193 	else {
    194 		if (tcol.indent) {
    195 			end = tcol.p->start;
    196 			if (flags & BT)
    197 				SKIP_BLANKS(end);
    198 			end += tcol.indent;
    199 			end = min(end, lineend);
    200 		} else
    201 			end = tcol.p->end;
    202 	}
    203 
    204 	if (flags & L)
    205 		return length(tablepos, endkey, start, end, flags);
    206 	if (flags & N)
    207 		return number(tablepos, endkey, start, end, flags);
    208 
    209 	/* Bound check space - assuming nothing is skipped */
    210 	if (tablepos + (end - start) + 1 >= endkey)
    211 		return NULL;
    212 
    213 	mask = cur_fld->mask;
    214 	lweight = cur_fld->weights;
    215 	for (; start < end; start++) {
    216 		if (!mask || mask[*start]) {
    217 			*tablepos++ = lweight[*start];
    218 		}
    219 	}
    220 	/* Add extra byte (absent from lweight) to sort short keys correctly */
    221 	*tablepos++ = lweight[REC_D];
    222 	return tablepos;
    223 }
    224 
    225 /*
    226  * Numbers are converted to a floating point format (exponent & mantissa)
    227  * so that they compare correctly as sequence of unsigned bytes.
    228  * Bytes 0x00 and 0xff are used to terminate positive and negative numbers
    229  * to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12.
    230  *
    231  * The first byte contain the overall sign, exponent sign and some of the
    232  * exponent. These have to be ordered (-ve value, decreasing exponent),
    233  * zero, (+ve value, increasing exponent).
    234  *
    235  * The first byte is 0x80 for zero, 0xc0 for +ve with exponent 0.
    236  * -ve values are the 1's compliments (so 0x7f isn't used!).
    237  *
    238  * This only leaves 63 byte values for +ve exponents - which isn't enough.
    239  * The largest 4 exponent values are used to hold a byte count of the
    240  * number of following bytes that contain 8 exponent bits per byte,
    241  * This lets us sort exponents from -2^31 to +2^31.
    242  *
    243  * The mantissa is stored 2 digits per byte offset by 0x40, for negative
    244  * numbers the order must be reversed (they are bit inverted).
    245  *
    246  * Reverse sorts are done by inverting the sign of the number.
    247  */
    248 #define MAX_EXP_ENC  ((int)sizeof(int))
    249 
    250 static u_char *
    251 number(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend,
    252     int reverse)
    253 {
    254 	int exponent = -1;
    255 	int had_dp = 0;
    256 	u_char *tline;
    257 	char ch;
    258 	unsigned int val;
    259 	u_char *last_nz_pos;
    260 	u_char negate;
    261 
    262 	if (reverse & R)
    263 		negate = 0xff;
    264 	else
    265 		negate = 0;
    266 
    267 	/* Give ourselves space for the key terminator */
    268 	bufend--;
    269 
    270 	/* Ensure we have enough space for the exponent */
    271 	if (pos + 1 + MAX_EXP_ENC > bufend)
    272 		return (NULL);
    273 
    274 	SKIP_BLANKS(line);
    275 	if (*line == '-') {	/* set the sign */
    276 		negate ^= 0xff;
    277 		line++;
    278 	} else if (*line == '+') {
    279 		line++;
    280 	}
    281 
    282 	/* eat initial zeroes */
    283 	for (; *line == '0' && line < lineend; line++)
    284 		continue;
    285 
    286 	/* calculate exponents */
    287 	if (*line == DECIMAL_POINT) {
    288 		/* Decimal fraction */
    289 		had_dp = 1;
    290 		while (*++line == '0' && line < lineend)
    291 			exponent--;
    292 	} else {
    293 		/* Large (absolute) value, count digits */
    294 		for (tline = line; *tline >= '0' &&
    295 		    *tline <= '9' && tline < lineend; tline++)
    296 			exponent++;
    297 	}
    298 
    299 	/* If the first/next character isn't a digit, value is zero */
    300 	if (*line < '1' || *line > '9' || line >= lineend) {
    301 		/* This may be "0", "0.00", "000" or "fubar" but sorts as 0 */
    302 		/* XXX what about NaN, NAN, inf and INF */
    303 		*pos++ = 0x80;
    304 		return pos;
    305 	}
    306 
    307 	/* Maybe here we should allow for e+12 (etc) */
    308 
    309 	if (exponent < 0x40 - MAX_EXP_ENC && -exponent < 0x40 - MAX_EXP_ENC) {
    310 		/* Value ok for simple encoding */
    311 		/* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
    312 		exponent += 0xc0;
    313 		*pos++ = negate ^ exponent;
    314 	} else {
    315 		/* Out or range for a single byte */
    316 		int c, t;
    317 		t = exponent > 0 ? exponent : -exponent;
    318 		/* Count how many 8-bit bytes are needed */
    319 		for (c = 0; ; c++) {
    320 			t >>= 8;
    321 			if (t == 0)
    322 				break;
    323 		}
    324 		/* 'c' better be 0..3 here - but probably 0..1 */
    325 		/* Offset just outside valid range */
    326 		t = c + 0x40 - MAX_EXP_ENC;
    327 		if (exponent < 0)
    328 			t = -t;
    329 		*pos++ = negate ^ (t + 0xc0);
    330 		/* now add each byte, most significant first */
    331 		for (; c >= 0; c--)
    332 			*pos++ = negate ^ (exponent >> (c * 8));
    333 	}
    334 
    335 	/* Finally add mantissa, 2 digits per byte */
    336 	for (last_nz_pos = pos; line < lineend; ) {
    337 		if (pos >= bufend)
    338 			return NULL;
    339 		ch = *line++;
    340 		val = (ch - '0') * 10;
    341 		if (val > 90) {
    342 			if (ch == DECIMAL_POINT && !had_dp) {
    343 				had_dp = 1;
    344 				continue;
    345 			}
    346 			break;
    347 		}
    348 		while (line < lineend) {
    349 			ch = *line++;
    350 			if (ch == DECIMAL_POINT && !had_dp) {
    351 				had_dp = 1;
    352 				continue;
    353 			}
    354 			if (ch < '0' || ch > '9')
    355 				line = lineend;
    356 			else
    357 				val += ch - '0';
    358 			break;
    359 		}
    360 		*pos++ = negate ^ (val + 0x40);
    361 		if (val != 0)
    362 			last_nz_pos = pos;
    363 	}
    364 
    365 	/* Add key terminator, deleting any trailing "00" */
    366 	*last_nz_pos++ = negate;
    367 
    368 	return (last_nz_pos);
    369 }
    370 
    371 static u_char *
    372 length(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend,
    373     int flag)
    374 {
    375 	u_char buf[32];
    376 	int l;
    377 	SKIP_BLANKS(line);
    378 	l = snprintf((char *)buf, sizeof(buf), "%td", lineend - line);
    379 	return number(pos, bufend, buf, buf + l, flag);
    380 }
    381