cut.c revision 1.32 1 /* $NetBSD: cut.c,v 1.32 2025/03/09 05:04:54 gutteridge Exp $ */
2
3 /*
4 * Copyright (c) 1989, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Adam S. Moskowitz of Menlo Consulting and Marciano Pitargue.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include <sys/cdefs.h>
36 #ifndef lint
37 __COPYRIGHT("@(#) Copyright (c) 1989, 1993\
38 The Regents of the University of California. All rights reserved.");
39 #endif /* not lint */
40
41 #ifndef lint
42 #if 0
43 static char sccsid[] = "@(#)cut.c 8.3 (Berkeley) 5/4/95";
44 #endif
45 __RCSID("$NetBSD: cut.c,v 1.32 2025/03/09 05:04:54 gutteridge Exp $");
46 #endif /* not lint */
47
48 #include <ctype.h>
49 #include <err.h>
50 #include <errno.h>
51 #include <limits.h>
52 #include <locale.h>
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <string.h>
56 #include <unistd.h>
57 #include <util.h>
58 #include <wchar.h>
59 #include <sys/param.h>
60
61 static int bflag;
62 static int cflag;
63 static char dchar;
64 static int dflag;
65 static int fflag;
66 static int nflag;
67 static int sflag;
68
69 static void b_cut(FILE *, const char *);
70 static void b_n_cut(FILE *, const char *);
71 static void c_cut(FILE *, const char *);
72 static void f_cut(FILE *, const char *);
73 static void get_list(char *);
74 static void usage(void) __dead;
75
76 int
77 main(int argc, char *argv[])
78 {
79 FILE *fp;
80 void (*fcn)(FILE *, const char *);
81 int ch, rval;
82
83 fcn = NULL;
84 (void)setlocale(LC_ALL, "");
85
86 dchar = '\t'; /* default delimiter is \t */
87
88 while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1)
89 switch(ch) {
90 case 'b':
91 fcn = b_cut;
92 get_list(optarg);
93 bflag = 1;
94 break;
95 case 'c':
96 fcn = c_cut;
97 get_list(optarg);
98 cflag = 1;
99 break;
100 case 'd':
101 dchar = *optarg;
102 dflag = 1;
103 break;
104 case 'f':
105 get_list(optarg);
106 fcn = f_cut;
107 fflag = 1;
108 break;
109 case 's':
110 sflag = 1;
111 break;
112 case 'n':
113 nflag = 1;
114 break;
115 case '?':
116 default:
117 usage();
118 }
119 argc -= optind;
120 argv += optind;
121
122 if (bflag + cflag + fflag != 1 ||
123 (nflag && !bflag) ||
124 ((dflag || sflag) && !fflag))
125 usage();
126
127 if (nflag)
128 fcn = b_n_cut;
129
130 rval = 0;
131 if (*argv)
132 for (; *argv; ++argv) {
133 if (strcmp(*argv, "-") == 0)
134 fcn(stdin, "stdin");
135 else {
136 if ((fp = fopen(*argv, "r"))) {
137 fcn(fp, *argv);
138 (void)fclose(fp);
139 } else {
140 rval = 1;
141 warn("%s", *argv);
142 }
143 }
144 }
145 else
146 fcn(stdin, "stdin");
147 return(rval);
148 }
149
150 static size_t autostart, autostop, maxval;
151
152 static char *positions = NULL;
153 static size_t numpositions = 0;
154 #define ALLOC_CHUNK _POSIX2_LINE_MAX /* malloc granularity */
155
156 static void
157 get_list(char *list)
158 {
159 size_t setautostart, start, stop;
160 char *pos;
161 char *p;
162
163 if (positions == NULL) {
164 numpositions = ALLOC_CHUNK;
165 positions = ecalloc(numpositions, sizeof(*positions));
166 }
167
168 /*
169 * Set a byte in the positions array to indicate if a field or
170 * column is to be selected; use +1, it's 1-based, not 0-based.
171 * Numbers and number ranges may be overlapping, repeated, and in
172 * any order. We handle "-3-5" although there's no real reason to.
173 */
174 for (; (p = strtok(list, ", \t")) != NULL; list = NULL) {
175 setautostart = start = stop = 0;
176 if (*p == '-') {
177 ++p;
178 setautostart = 1;
179 }
180 if (isdigit((unsigned char)*p)) {
181 start = stop = strtol(p, &p, 10);
182 if (setautostart && start > autostart)
183 autostart = start;
184 }
185 if (*p == '-') {
186 if (isdigit((unsigned char)p[1]))
187 stop = strtol(p + 1, &p, 10);
188 if (*p == '-') {
189 ++p;
190 if (!autostop || autostop > stop)
191 autostop = stop;
192 }
193 }
194 if (*p)
195 errx(1, "[-bcf] list: illegal list value");
196 if (!stop || !start)
197 errx(1, "[-bcf] list: values may not include zero");
198 if (stop + 1 > numpositions) {
199 size_t newsize;
200 newsize = roundup(stop + 1, ALLOC_CHUNK);
201 positions = erealloc(positions, newsize);
202 (void)memset(positions + numpositions, 0,
203 newsize - numpositions);
204 numpositions = newsize;
205 }
206 if (maxval < stop)
207 maxval = stop;
208 for (pos = positions + start; start++ <= stop; pos++)
209 *pos = 1;
210 }
211
212 /* overlapping ranges */
213 if (autostop && maxval > autostop)
214 maxval = autostop;
215
216 /* set autostart */
217 if (autostart)
218 (void)memset(positions + 1, '1', autostart);
219 }
220
221 /*
222 * Cut based on byte positions, taking care not to split multibyte characters.
223 * Although this function also handles the case where -n is not specified,
224 * b_cut() ought to be much faster.
225 */
226 static void
227 b_n_cut(FILE *fp, const char *fname)
228 {
229 size_t col, i, lbuflen;
230 char *lbuf;
231 int canwrite, clen, warned;
232 mbstate_t mbs;
233
234 memset(&mbs, 0, sizeof(mbs));
235 warned = 0;
236 while ((lbuf = fgetln(fp, &lbuflen)) != NULL) {
237 for (col = 0; lbuflen > 0; col += clen) {
238 if ((clen = mbrlen(lbuf, lbuflen, &mbs)) < 0) {
239 if (!warned) {
240 warn("%s", fname);
241 warned = 1;
242 }
243 memset(&mbs, 0, sizeof(mbs));
244 clen = 1;
245 }
246 if (clen == 0 || *lbuf == '\n')
247 break;
248 if (col < maxval && !positions[1 + col]) {
249 /*
250 * Print the character if (1) after an initial
251 * segment of un-selected bytes, the rest of
252 * it is selected, and (2) the last byte is
253 * selected.
254 */
255 i = col;
256 while (i < col + clen && i < maxval &&
257 !positions[1 + i])
258 i++;
259 canwrite = i < col + clen;
260 for (; i < col + clen && i < maxval; i++)
261 canwrite &= positions[1 + i];
262 if (canwrite)
263 fwrite(lbuf, 1, clen, stdout);
264 } else {
265 /*
266 * Print the character if all of it has
267 * been selected.
268 */
269 canwrite = 1;
270 for (i = col; i < col + clen; i++)
271 if ((i >= maxval && !autostop) ||
272 (i < maxval && !positions[1 + i])) {
273 canwrite = 0;
274 break;
275 }
276 if (canwrite)
277 fwrite(lbuf, 1, clen, stdout);
278 }
279 lbuf += clen;
280 lbuflen -= clen;
281 }
282 if (lbuflen > 0)
283 putchar('\n');
284 }
285 }
286
287 static void
288 /*ARGSUSED*/
289 f_cut(FILE *fp, const char *fname __unused)
290 {
291 int ch, field, isdelim;
292 char *pos, *p, sep;
293 int output;
294 size_t len;
295 char *lbuf, *tbuf;
296
297 for (sep = dchar, tbuf = NULL; (lbuf = fgetln(fp, &len)) != NULL;) {
298 output = 0;
299 if (lbuf[len - 1] != '\n') {
300 /* no newline at the end of the last line so add one */
301 if ((tbuf = (char *)malloc(len + 1)) == NULL)
302 err(1, NULL);
303 (void)memcpy(tbuf, lbuf, len);
304 tbuf[len++] = '\n';
305 lbuf = tbuf;
306 }
307 for (isdelim = 0, p = lbuf;; ++p) {
308 ch = *p;
309 /* this should work if newline is delimiter */
310 if (ch == sep)
311 isdelim = 1;
312 if (ch == '\n') {
313 if (!isdelim && !sflag)
314 (void)fwrite(lbuf, len, 1, stdout);
315 break;
316 }
317 }
318 if (!isdelim)
319 continue;
320
321 pos = positions + 1;
322 for (field = maxval, p = lbuf; field; --field, ++pos) {
323 if (*pos) {
324 if (output++)
325 (void)putchar(sep);
326 while ((ch = *p++) != '\n' && ch != sep)
327 (void)putchar(ch);
328 } else {
329 while ((ch = *p++) != '\n' && ch != sep)
330 continue;
331 }
332 if (ch == '\n')
333 break;
334 }
335 if (ch != '\n') {
336 if (autostop) {
337 if (output)
338 (void)putchar(sep);
339 for (; (ch = *p) != '\n'; ++p)
340 (void)putchar(ch);
341 } else
342 for (; (ch = *p) != '\n'; ++p);
343 }
344 (void)putchar('\n');
345 if (tbuf) {
346 free(tbuf);
347 tbuf = NULL;
348 }
349 }
350 if (tbuf)
351 free(tbuf);
352 }
353
/*
 * Write the synopsis of the three invocation forms to standard error
 * and exit with status 1.  Does not return.
 */
static void
usage(void)
{
	static const char synopsis[] =
	    "usage:\tcut -b list [-n] [file ...]\n"
	    "\tcut -c list [file ...]\n"
	    "\tcut -f list [-d string] [-s] [file ...]\n";

	(void)fputs(synopsis, stderr);
	exit(1);
}
362
363 /* make b_cut(): */
364 #define CUT_BYTE 1
365 #include "x_cut.c"
366 #undef CUT_BYTE
367
368 /* make c_cut(): */
369 #define CUT_BYTE 0
370 #include "x_cut.c"
371 #undef CUT_BYTE
372