wc.c revision 1.36 1 /* $NetBSD: wc.c,v 1.36 2024/01/14 17:16:10 christos Exp $ */
2
3 /*
4 * Copyright (c) 1980, 1987, 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 #ifndef lint
34 __COPYRIGHT("@(#) Copyright (c) 1980, 1987, 1991, 1993\
35 The Regents of the University of California. All rights reserved.");
36 #endif /* not lint */
37
38 #ifndef lint
39 #if 0
40 static char sccsid[] = "@(#)wc.c 8.2 (Berkeley) 5/2/95";
41 #else
42 __RCSID("$NetBSD: wc.c,v 1.36 2024/01/14 17:16:10 christos Exp $");
43 #endif
44 #endif /* not lint */
45
46 /* wc line, word, char count and optionally longest line. */
47
48 #include <sys/param.h>
49 #include <sys/file.h>
50 #include <sys/stat.h>
51
52 #include <ctype.h>
53 #include <fcntl.h>
54 #include <err.h>
55 #include <errno.h>
56 #include <locale.h>
57 #include <stdbool.h>
58 #include <stdio.h>
59 #include <stdlib.h>
60 #include <string.h>
61 #include <unistd.h>
62 #include <wchar.h>
63 #include <wctype.h>
64
65 #ifdef NO_QUAD
66 typedef u_long wc_count_t;
67 # define WCFMT " %7lu"
68 # define WCCAST unsigned long
69 #else
70 typedef u_quad_t wc_count_t;
71 # define WCFMT " %7llu"
72 # define WCCAST unsigned long long
73 #endif
74
75 static wc_count_t tlinect, twordct, tcharct, tlongest;
76 static bool doline, doword, dobyte, dochar, dolongest;
77 static int rval = 0;
78
79 static void cnt(const char *);
80 static void print_counts(wc_count_t, wc_count_t, wc_count_t, wc_count_t,
81 const char *);
82 __dead static void usage(void);
83 static size_t do_mb(wchar_t *, const char *, size_t, mbstate_t *,
84 size_t *, const char *);
85
86 int
87 main(int argc, char *argv[])
88 {
89 int ch;
90
91 setlocale(LC_ALL, "");
92
93 while ((ch = getopt(argc, argv, "lwcmL")) != -1)
94 switch (ch) {
95 case 'l':
96 doline = true;
97 break;
98 case 'w':
99 doword = true;
100 break;
101 case 'm':
102 dochar = true;
103 dobyte = 0;
104 break;
105 case 'c':
106 dochar = 0;
107 dobyte = true;
108 break;
109 case 'L':
110 dolongest = true;
111 break;
112 case '?':
113 default:
114 usage();
115 }
116 argv += optind;
117 argc -= optind;
118
119 /* Wc's flags are on by default. */
120 if (!(doline || doword || dobyte || dochar || dolongest))
121 doline = doword = dobyte = true;
122
123 if (*argv == NULL) {
124 cnt(NULL);
125 } else {
126 bool dototal = (argc > 1);
127
128 do {
129 cnt(*argv);
130 } while(*++argv);
131
132 if (dototal) {
133 print_counts(tlinect, twordct, tcharct, tlongest,
134 "total");
135 }
136 }
137
138 exit(rval);
139 }
140
141 static size_t
142 do_mb(wchar_t *wc, const char *p, size_t len, mbstate_t *st,
143 size_t *retcnt, const char *file)
144 {
145 size_t r;
146 size_t c = 0;
147
148 do {
149 r = mbrtowc(wc, p, len, st);
150 if (r == (size_t)-1) {
151 warnx("%s: invalid byte sequence", file);
152 rval = 1;
153
154 /* XXX skip 1 byte */
155 len--;
156 p++;
157 memset(st, 0, sizeof(*st));
158 continue;
159 } else if (r == (size_t)-2)
160 break;
161 else if (r == 0)
162 r = 1;
163 c++;
164 if (wc)
165 wc++;
166 len -= r;
167 p += r;
168 } while (len > 0);
169
170 *retcnt = c;
171
172 return (r);
173 }
174
175 static void
176 cnt(const char *file)
177 {
178 u_char buf[MAXBSIZE];
179 wchar_t wbuf[MAXBSIZE];
180 struct stat sb;
181 wc_count_t charct, linect, wordct, longest;
182 mbstate_t st;
183 u_char *C;
184 wchar_t *WC;
185 const char *name; /* filename or <stdin> */
186 size_t r = 0;
187 int fd, len = 0;
188
189 linect = wordct = charct = longest = 0;
190 if (file != NULL) {
191 if ((fd = open(file, O_RDONLY, 0)) < 0) {
192 warn("%s", file);
193 rval = 1;
194 return;
195 }
196 name = file;
197 } else {
198 fd = STDIN_FILENO;
199 name = "<stdin>";
200 }
201
202 if (dochar || doword || dolongest)
203 (void)memset(&st, 0, sizeof(st));
204
205 if (!(doword || dolongest)) {
206 /*
207 * line counting is split out because it's a lot
208 * faster to get lines than to get words, since
209 * the word count requires some logic.
210 */
211 if (doline || dochar) {
212 while ((len = read(fd, buf, MAXBSIZE)) > 0) {
213 if (dochar) {
214 size_t wlen;
215
216 r = do_mb(0, (char *)buf, (size_t)len,
217 &st, &wlen, name);
218 charct += wlen;
219 } else if (dobyte)
220 charct += len;
221 if (doline) {
222 for (C = buf; len--; ++C) {
223 if (*C == '\n')
224 ++linect;
225 }
226 }
227 }
228 }
229
230 /*
231 * if all we need is the number of characters and
232 * it's a directory or a regular or linked file, just
233 * stat the puppy. We avoid testing for it not being
234 * a special device in case someone adds a new type
235 * of inode.
236 */
237 else if (dobyte) {
238 if (fstat(fd, &sb)) {
239 warn("%s", name);
240 rval = 1;
241 } else {
242 /* st_dev == -1 for kernfs/procfs files */
243 if (sb.st_dev != (dev_t)-1 &&
244 (S_ISREG(sb.st_mode) ||
245 S_ISLNK(sb.st_mode) ||
246 S_ISDIR(sb.st_mode))) {
247 charct = sb.st_size;
248 } else {
249 while ((len =
250 read(fd, buf, MAXBSIZE)) > 0)
251 charct += len;
252 }
253 }
254 }
255 } else {
256 /* do it the hard way... */
257 wc_count_t linelen;
258 bool gotsp;
259
260 linelen = 0;
261 gotsp = true;
262 while ((len = read(fd, buf, MAXBSIZE)) > 0) {
263 size_t wlen;
264
265 r = do_mb(wbuf, (char *)buf, (size_t)len, &st, &wlen,
266 name);
267 if (dochar) {
268 charct += wlen;
269 } else if (dobyte) {
270 charct += len;
271 }
272 for (WC = wbuf; wlen--; ++WC) {
273 if (iswspace(*WC)) {
274 gotsp = true;
275 if (*WC == L'\n') {
276 ++linect;
277 if (linelen > longest)
278 longest = linelen;
279 linelen = 0;
280 } else {
281 linelen++;
282 }
283 } else {
284 /*
285 * This line implements the POSIX
286 * spec, i.e. a word is a "maximal
287 * string of characters delimited by
288 * whitespace." Notice nothing was
289 * said about a character being
290 * printing or non-printing.
291 */
292 if (gotsp) {
293 gotsp = false;
294 ++wordct;
295 }
296
297 linelen++;
298 }
299 }
300 }
301 }
302
303 if (len == -1) {
304 warn("%s", name);
305 rval = 1;
306 }
307 if (dochar && r == (size_t)-2) {
308 warnx("%s: incomplete multibyte character", name);
309 rval = 1;
310 }
311
312 print_counts(linect, wordct, charct, longest, file);
313
314 /*
315 * don't bother checkint doline, doword, or dobyte --- speeds
316 * up the common case
317 */
318 tlinect += linect;
319 twordct += wordct;
320 tcharct += charct;
321 if (dolongest && longest > tlongest)
322 tlongest = longest;
323
324 if (close(fd)) {
325 warn("%s", name);
326 rval = 1;
327 }
328 }
329
330 static void
331 print_counts(wc_count_t lines, wc_count_t words, wc_count_t chars,
332 wc_count_t longest, const char *name)
333 {
334
335 if (doline)
336 (void)printf(WCFMT, (WCCAST)lines);
337 if (doword)
338 (void)printf(WCFMT, (WCCAST)words);
339 if (dobyte || dochar)
340 (void)printf(WCFMT, (WCCAST)chars);
341 if (dolongest)
342 (void)printf(WCFMT, (WCCAST)longest);
343
344 if (name != NULL)
345 (void)printf(" %s\n", name);
346 else
347 (void)putchar('\n');
348 }
349
/*
 * usage --
 *	Print the synopsis on standard error and exit with status 1.
 */
static void
usage(void)
{
	static const char synopsis[] =
	    "usage: wc [-c | -m] [-Llw] [file ...]\n";

	(void)fputs(synopsis, stderr);
	exit(1);
}
357