uniq.c revision 1.5 1 /* $NetBSD: uniq.c,v 1.5 2010/04/25 00:54:46 joerg Exp $ */
2
3 /*-
4 * Copyright (c) 2007 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Christos Zoulas.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31 #include <sys/cdefs.h>
32 __RCSID("$NetBSD: uniq.c,v 1.5 2010/04/25 00:54:46 joerg Exp $");
33
34 #include <stdio.h>
35 #include <string.h>
36 #include <stdlib.h>
37 #include <db.h>
38 #include <err.h>
39 #include <util.h>
40 #include <ctype.h>
41 #include <fcntl.h>
42
43 #include "extern.h"
44
45 static const HASHINFO hinfo = {
46 .bsize = 256,
47 .ffactor = 4,
48 .nelem = 32768,
49 .cachesize = 1024,
50 .hash = NULL,
51 .lorder = 0
52 };
53
54 static int comp(const char *, char **, size_t *);
55
56 /*
57 * Preserve only unique content lines in a file. Input lines that have
58 * content [alphanumeric characters before a comment] are white-space
59 * normalized and have their comments removed. Then they are placed
60 * in a hash table, and only the first instance of them is printed.
61 * Comment lines without any alphanumeric content are always printed
62 * since they are there to make the file "pretty". Comment lines with
63 * alphanumeric content are also placed into the hash table and only
64 * printed once.
65 */
66 void
67 uniq(const char *fname)
68 {
69 DB *db;
70 DBT key;
71 static const DBT data = { NULL, 0 };
72 FILE *fp;
73 char *line;
74 size_t len;
75
76 if ((db = dbopen(NULL, O_RDWR, 0, DB_HASH, &hinfo)) == NULL)
77 err(1, "Cannot create in memory database");
78
79 fp = efopen(fname, "r");
80 while ((line = fgetln(fp, &len)) != NULL) {
81 size_t complen = len;
82 char *compline;
83 if (!comp(line, &compline, &complen)) {
84 (void)fprintf(stdout, "%*.*s", (int)len, (int)len,
85 line);
86 continue;
87 }
88 key.data = compline;
89 key.size = complen;
90 switch ((db->put)(db, &key, &data, R_NOOVERWRITE)) {
91 case 0:
92 (void)fprintf(stdout, "%*.*s", (int)len, (int)len,
93 line);
94 break;
95 case 1:
96 break;
97 case -1:
98 err(1, "put");
99 default:
100 abort();
101 break;
102 }
103 }
104 (void)fflush(stdout);
105 exit(0);
106 }
107
/*
 * Normalize whitespace in the original line and place a new string
 * with each run of whitespace converted to a single space in compline.
 * Leading and trailing whitespace is dropped.  If the line contains just
 * a comment, the comment is preserved.  If it contains data followed by
 * a comment, the comment is removed.
 *
 * origline	input line; need not be NUL-terminated, at most *len bytes
 *		are examined (scanning also stops at a NUL byte).
 * compline	out: on a return of 1, a NUL-terminated buffer obtained from
 *		emalloc() that the caller must free(); on a return of 0 it
 *		is set to NULL and nothing needs to be freed.
 * len		in: number of bytes available in origline;
 *		out: length of *compline (0 when 0 is returned).
 *
 * Return 1 if the line had actual contents (at least one alphanumeric
 * character before any data-terminating comment), or 0 if it was blank
 * or just a comment without alphanumeric characters.
 */
static int
comp(const char *origline, char **compline, size_t *len)
{
	const unsigned char *p;
	unsigned char *q;
	char *cline;
	size_t l = *len, complen;
	int hasalnum, iscomment;

	/* Eat leading space */
	for (p = (const unsigned char *)origline; l && *p && isspace(*p);
	    p++, l--)
		continue;

	/* Blank line: decide before allocating so nothing can leak. */
	if (l == 0 || *p == '\0') {
		*compline = NULL;
		*len = 0;
		return 0;
	}

	/* The normalized text is never longer than the remaining input. */
	cline = emalloc(l + 1);

	complen = 0;
	hasalnum = 0;
	iscomment = 0;

	for (q = (unsigned char *)cline; l && *p; p++, l--) {
		if (isspace(*p)) {
			/* Collapse a run of whitespace into one space. */
			if (complen && isspace(q[-1]))
				continue;
			*q++ = ' ';
			complen++;
		} else {
			if (!iscomment && *p == '#') {
				/* A comment after data ends the key. */
				if (hasalnum)
					break;
				iscomment = 1;
			} else if (isalnum(*p))
				hasalnum = 1;
			*q++ = *p;
			complen++;
		}
	}

	/* Comment without alphanumeric content: no key is handed out. */
	if (!hasalnum) {
		free(cline);
		*compline = NULL;
		*len = 0;
		return 0;
	}

	/* Eat trailing space */
	while (complen && isspace(q[-1])) {
		--q;
		--complen;
	}
	*q = '\0';
	*compline = cline;
	*len = complen;
	return 1;
}
166