/*	$NetBSD: npf_bpf_comp.c,v 1.9 2016/12/26 23:05:05 christos Exp $	*/

/*-
 * Copyright (c) 2010-2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This material is based upon work partially supported by The
 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * BPF byte-code generation for NPF rules.
 */

#include <sys/cdefs.h>
__RCSID("$NetBSD: npf_bpf_comp.c,v 1.9 2016/12/26 23:05:05 christos Exp $");

#include <stdlib.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <inttypes.h>
#include <err.h>
#include <assert.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#define __FAVOR_BSD
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/udp.h>
#include <netinet/tcp.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp6.h>

#include <net/bpf.h>

#include "npfctl.h"

/*
 * Note: clear X_EQ_L4OFF when register X is invalidated i.e. it stores
 * something other than L4 header offset.  Generally, when BPF_LDX is used.
 */
#define	FETCHED_L3		0x01
#define	CHECKED_L4		0x02
#define	X_EQ_L4OFF		0x04
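
/*
 * As used below: FETCHED_L3 means the L3 (IP version) check fragment has
 * been emitted, CHECKED_L4 means the L4 protocol has been verified, and
 * X_EQ_L4OFF means register X currently holds the L4 header offset.
 */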

struct npf_bpf {
	/*
	 * BPF program code, the allocated length (in bytes), the number
	 * of logical blocks and the flags.
	 */
	struct bpf_program	prog;
	size_t			alen;
	u_int			nblocks;
	sa_family_t		af;
	uint32_t		flags;

	/* The current group offset and block number. */
	bool			ingroup;
	u_int			goff;
	u_int			gblock;

	/* BPF marks, allocated length and the real length. */
	uint32_t *		marks;
	size_t			malen;
	size_t			mlen;
};

/*
 * NPF success and failure values to be returned from BPF.
 */
#define	NPF_BPF_SUCCESS		((u_int)-1)
#define	NPF_BPF_FAILURE		0

/*
 * Magic value to indicate the failure path, which is fixed up on completion.
 * Note: this is the longest jump offset in BPF, since the offset is one byte.
 */
#define	JUMP_MAGIC		0xff

/* Reduce re-allocations by expanding in 64 byte blocks. */
#define	ALLOC_MASK		(64 - 1)
#define	ALLOC_ROUND(x)		(((x) + ALLOC_MASK) & ~ALLOC_MASK)
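/* e.g. ALLOC_ROUND(1) == 64, ALLOC_ROUND(64) == 64, ALLOC_ROUND(65) == 128. */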

#ifndef IPV6_VERSION
#define	IPV6_VERSION	0x60
#endif

npf_bpf_t *
npfctl_bpf_create(void)
{
	return ecalloc(1, sizeof(npf_bpf_t));
}

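/*
 * fixup_jumps: resolve jump targets within the given instruction range.
 * Any jt/jf branch still holding the JUMP_MAGIC placeholder is rewritten
 * to the relative offset of the end of the range, i.e. the failure path.
 * If 'swap' is true, the true/false branches are exchanged first (used
 * when closing a group, where a successful compare must jump out of it).
 */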
static void
fixup_jumps(npf_bpf_t *ctx, u_int start, u_int end, bool swap)
{
	struct bpf_program *bp = &ctx->prog;

	for (u_int i = start; i < end; i++) {
		struct bpf_insn *insn = &bp->bf_insns[i];
		const u_int fail_off = end - i;

		if (fail_off >= JUMP_MAGIC) {
			errx(EXIT_FAILURE, "BPF generation error: "
			    "the number of instructions is over the limit");
		}
		if (BPF_CLASS(insn->code) != BPF_JMP) {
			continue;
		}
		if (swap) {
			uint8_t jt = insn->jt;
			insn->jt = insn->jf;
			insn->jf = jt;
		}
		if (insn->jt == JUMP_MAGIC)
			insn->jt = fail_off;
		if (insn->jf == JUMP_MAGIC)
			insn->jf = fail_off;
	}
}

static void
add_insns(npf_bpf_t *ctx, struct bpf_insn *insns, size_t count)
{
	struct bpf_program *bp = &ctx->prog;
	size_t offset, len, reqlen;

	/* Note: bf_len is the count of instructions. */
	offset = bp->bf_len * sizeof(struct bpf_insn);
	len = count * sizeof(struct bpf_insn);

	/* Ensure the memory buffer for the program. */
	reqlen = ALLOC_ROUND(offset + len);
	if (reqlen > ctx->alen) {
		bp->bf_insns = erealloc(bp->bf_insns, reqlen);
		ctx->alen = reqlen;
	}

	/* Add the code block. */
	memcpy((uint8_t *)bp->bf_insns + offset, insns, len);
	bp->bf_len += count;
}

static void
done_raw_block(npf_bpf_t *ctx, const uint32_t *m, size_t len)
{
	size_t reqlen, nargs = m[1];

	if ((len / sizeof(uint32_t) - 2) != nargs) {
		errx(EXIT_FAILURE, "invalid BPF block description");
	}
	reqlen = ALLOC_ROUND(ctx->mlen + len);
	if (reqlen > ctx->malen) {
		ctx->marks = erealloc(ctx->marks, reqlen);
		ctx->malen = reqlen;
	}
	memcpy((uint8_t *)ctx->marks + ctx->mlen, m, len);
	ctx->mlen += len;
}

static void
done_block(npf_bpf_t *ctx, const uint32_t *m, size_t len)
{
	done_raw_block(ctx, m, len);
	ctx->nblocks++;
}

struct bpf_program *
npfctl_bpf_complete(npf_bpf_t *ctx)
{
	struct bpf_program *bp = &ctx->prog;
	const u_int retoff = bp->bf_len;

	/* No instructions (optimised out). */
	if (!bp->bf_len)
		return NULL;

	/* Add the return fragment (success and failure paths). */
	struct bpf_insn insns_ret[] = {
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_SUCCESS),
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
	};
	add_insns(ctx, insns_ret, __arraycount(insns_ret));

	/* Fixup all jumps to the main failure path. */
	fixup_jumps(ctx, 0, retoff, false);

	return &ctx->prog;
}

const void *
npfctl_bpf_bmarks(npf_bpf_t *ctx, size_t *len)
{
	*len = ctx->mlen;
	return ctx->marks;
}

void
npfctl_bpf_destroy(npf_bpf_t *ctx)
{
	free(ctx->prog.bf_insns);
	free(ctx->marks);
	free(ctx);
}
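
/*
 * The construction interface is roughly: npfctl_bpf_create() allocates the
 * context, the npfctl_bpf_*() routines below append code blocks and their
 * byte-code marks, npfctl_bpf_complete() appends the return fragment and
 * resolves the failure jumps, and npfctl_bpf_bmarks() exposes the marks.
 * A minimal caller sketch (illustrative only; MATCH_DST and the port value
 * are arbitrary) might look like:
 *
 *	npf_bpf_t *bc = npfctl_bpf_create();
 *	npfctl_bpf_proto(bc, AF_INET, IPPROTO_TCP);
 *	npfctl_bpf_ports(bc, MATCH_DST, htons(80), htons(80));
 *	struct bpf_program *bp = npfctl_bpf_complete(bc);
 *	size_t mlen;
 *	const void *marks = npfctl_bpf_bmarks(bc, &mlen);
 *	... pass bp and the marks on to the kernel ...
 *	npfctl_bpf_destroy(bc);
 */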

/*
 * npfctl_bpf_group: begin a logical group.  It merely uses logical
 * disjunction (OR) for compares within the group.
 */
void
npfctl_bpf_group(npf_bpf_t *ctx)
{
	struct bpf_program *bp = &ctx->prog;

	assert(ctx->goff == 0);
	assert(ctx->gblock == 0);

	ctx->goff = bp->bf_len;
	ctx->gblock = ctx->nblocks;
	ctx->ingroup = true;
}

void
npfctl_bpf_endgroup(npf_bpf_t *ctx)
{
	struct bpf_program *bp = &ctx->prog;
	const size_t curoff = bp->bf_len;

	/* If there are no blocks or only one - nothing to do. */
	if ((ctx->nblocks - ctx->gblock) <= 1) {
		ctx->goff = ctx->gblock = 0;
		return;
	}

	/*
	 * Append a failure return as a fall-through i.e. if there is
	 * no match within the group.
	 */
	struct bpf_insn insns_ret[] = {
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
	};
	add_insns(ctx, insns_ret, __arraycount(insns_ret));

	/*
	 * Adjust jump offsets: on match - jump outside the group i.e.
	 * to the current offset.  Otherwise, jump to the next instruction
	 * which would lead to the fall-through code above if none matches.
	 */
	fixup_jumps(ctx, ctx->goff, curoff, true);
	ctx->goff = ctx->gblock = 0;
}

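/*
 * fetch_l3: emit, unless already present, a fragment that checks the IP
 * version stored in the BPF_MW_IPVER memory word against the given address
 * family (or merely for being non-zero if AF_UNSPEC), and optionally load
 * the L4 header offset into register X when X_EQ_L4OFF is requested.
 */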
static void
fetch_l3(npf_bpf_t *ctx, sa_family_t af, u_int flags)
{
	u_int ver;

	switch (af) {
	case AF_INET:
		ver = IPVERSION;
		break;
	case AF_INET6:
		ver = IPV6_VERSION >> 4;
		break;
	case AF_UNSPEC:
		ver = 0;
		break;
	default:
		abort();
	}

	/*
	 * The memory store is populated with:
	 * - BPF_MW_IPVER: IP version (4 or 6).
	 * - BPF_MW_L4OFF: L4 header offset.
	 * - BPF_MW_L4PROTO: L4 protocol.
	 */
	if ((ctx->flags & FETCHED_L3) == 0 || (af && ctx->af == 0)) {
		const uint8_t jt = ver ? 0 : JUMP_MAGIC;
		const uint8_t jf = ver ? JUMP_MAGIC : 0;
		bool ingroup = ctx->ingroup;

		/*
		 * L3 block cannot be inserted in the middle of a group.
		 * In fact, it never is.  Check and start the group after.
		 */
		if (ingroup) {
			assert(ctx->nblocks == ctx->gblock);
			npfctl_bpf_endgroup(ctx);
		}

		/*
		 * A <- IP version; A == expected-version?
		 * If no particular version specified, check for non-zero.
		 */
		struct bpf_insn insns_af[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_IPVER),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ver, jt, jf),
		};
		add_insns(ctx, insns_af, __arraycount(insns_af));
		ctx->flags |= FETCHED_L3;
		ctx->af = af;

		if (af) {
			uint32_t mwords[] = { BM_IPVER, 1, af };
			done_raw_block(ctx, mwords, sizeof(mwords));
		}
		if (ingroup) {
			npfctl_bpf_group(ctx);
		}

	} else if (af && af != ctx->af) {
		errx(EXIT_FAILURE, "address family mismatch");
	}

	if ((flags & X_EQ_L4OFF) != 0 && (ctx->flags & X_EQ_L4OFF) == 0) {
		/* X <- IP header length */
		struct bpf_insn insns_hlen[] = {
			BPF_STMT(BPF_LDX+BPF_MEM, BPF_MW_L4OFF),
		};
		add_insns(ctx, insns_hlen, __arraycount(insns_hlen));
		ctx->flags |= X_EQ_L4OFF;
	}
}

/*
 * npfctl_bpf_proto: code block to match IP version and L4 protocol.
 */
void
npfctl_bpf_proto(npf_bpf_t *ctx, sa_family_t af, int proto)
{
	assert(af != AF_UNSPEC || proto != -1);

	/* Note: fails if IP version does not match. */
	fetch_l3(ctx, af, 0);
	if (proto == -1) {
		return;
	}

	struct bpf_insn insns_proto[] = {
		/* A <- L4 protocol; A == expected-protocol? */
		BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, proto, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_proto, __arraycount(insns_proto));

	uint32_t mwords[] = { BM_PROTO, 1, proto };
	done_block(ctx, mwords, sizeof(mwords));
	ctx->flags |= CHECKED_L4;
}

/*
 * npfctl_bpf_cidr: code block to match IPv4 or IPv6 CIDR.
 *
 * => IP address shall be in the network byte order.
 */
void
npfctl_bpf_cidr(npf_bpf_t *ctx, u_int opts, sa_family_t af,
    const npf_addr_t *addr, const npf_netmask_t mask)
{
	const uint32_t *awords = (const uint32_t *)addr;
	u_int nwords, length, maxmask, off;

	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
	assert((mask && mask <= NPF_MAX_NETMASK) || mask == NPF_NO_NETMASK);

	switch (af) {
	case AF_INET:
		maxmask = 32;
		off = (opts & MATCH_SRC) ?
		    offsetof(struct ip, ip_src) :
		    offsetof(struct ip, ip_dst);
		nwords = sizeof(struct in_addr) / sizeof(uint32_t);
		break;
	case AF_INET6:
		maxmask = 128;
		off = (opts & MATCH_SRC) ?
		    offsetof(struct ip6_hdr, ip6_src) :
		    offsetof(struct ip6_hdr, ip6_dst);
		nwords = sizeof(struct in6_addr) / sizeof(uint32_t);
		break;
	default:
		abort();
	}

	/* Ensure address family. */
	fetch_l3(ctx, af, 0);

	length = (mask == NPF_NO_NETMASK) ? maxmask : mask;

	/* CAUTION: BPF operates in host byte-order. */
	for (u_int i = 0; i < nwords; i++) {
		const u_int woff = i * sizeof(uint32_t);
		uint32_t word = ntohl(awords[i]);
		uint32_t wordmask;

		if (length >= 32) {
			/* The mask is a full word - do not apply it. */
			wordmask = 0;
			length -= 32;
		} else if (length) {
			wordmask = 0xffffffff << (32 - length);
			length = 0;
		} else {
			/* The mask became zero - skip the rest. */
			break;
		}

		/* A <- IP address (or one word of it) */
		struct bpf_insn insns_ip[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off + woff),
		};
		add_insns(ctx, insns_ip, __arraycount(insns_ip));

		/* A <- (A & MASK) */
		if (wordmask) {
			struct bpf_insn insns_mask[] = {
				BPF_STMT(BPF_ALU+BPF_AND+BPF_K, wordmask),
			};
			add_insns(ctx, insns_mask, __arraycount(insns_mask));
		}

		/* A == expected-IP-word ? */
		struct bpf_insn insns_cmp[] = {
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, word, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_cmp, __arraycount(insns_cmp));
	}
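
	/*
	 * For example, with an IPv6 /56 prefix the loop above compares
	 * word 0 unmasked (56 - 32 = 24 bits remain), masks word 1 with
	 * 0xffffff00 (0xffffffff << (32 - 24)) and skips words 2 and 3
	 * once the remaining prefix length reaches zero.
	 */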

	uint32_t mwords[] = {
		(opts & MATCH_SRC) ? BM_SRC_CIDR : BM_DST_CIDR, 6,
		af, mask, awords[0], awords[1], awords[2], awords[3],
	};
	done_block(ctx, mwords, sizeof(mwords));
}

/*
 * npfctl_bpf_ports: code block to match TCP/UDP port range.
 *
 * => Port numbers shall be in the network byte order.
 */
void
npfctl_bpf_ports(npf_bpf_t *ctx, u_int opts, in_port_t from, in_port_t to)
{
	const u_int sport_off = offsetof(struct udphdr, uh_sport);
	const u_int dport_off = offsetof(struct udphdr, uh_dport);
	u_int off;

	/* TCP and UDP port offsets are the same. */
	assert(sport_off == offsetof(struct tcphdr, th_sport));
	assert(dport_off == offsetof(struct tcphdr, th_dport));
	assert(ctx->flags & CHECKED_L4);

	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
	off = (opts & MATCH_SRC) ? sport_off : dport_off;

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	struct bpf_insn insns_fetch[] = {
		/* A <- port */
		BPF_STMT(BPF_LD+BPF_H+BPF_IND, off),
	};
	add_insns(ctx, insns_fetch, __arraycount(insns_fetch));

	/* CAUTION: BPF operates in host byte-order. */
	from = ntohs(from);
	to = ntohs(to);

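	/*
	 * Emit either a single equality check or, for a range, a pair of
	 * jumps which accept the packet only when from <= port <= to;
	 * any other value falls through to the failure path.
	 */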
	if (from == to) {
		/* Single port case. */
		struct bpf_insn insns_port[] = {
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, from, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_port, __arraycount(insns_port));
	} else {
		/* Port range case. */
		struct bpf_insn insns_range[] = {
			BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, from, 0, JUMP_MAGIC),
			BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, to, JUMP_MAGIC, 0),
		};
		add_insns(ctx, insns_range, __arraycount(insns_range));
	}

	uint32_t mwords[] = {
		opts & MATCH_SRC ? BM_SRC_PORTS : BM_DST_PORTS, 2, from, to
	};
	done_block(ctx, mwords, sizeof(mwords));
}

/*
 * npfctl_bpf_tcpfl: code block to match TCP flags.
 */
void
npfctl_bpf_tcpfl(npf_bpf_t *ctx, uint8_t tf, uint8_t tf_mask, bool checktcp)
{
	const u_int tcpfl_off = offsetof(struct tcphdr, th_flags);
	const bool usingmask = tf_mask != tf;

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);
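
	/*
	 * When checktcp is set, first verify that the L4 protocol is TCP;
	 * a non-TCP packet jumps over the remaining flag-test instructions
	 * (the load, the optional mask and the compare) rather than to the
	 * failure path.
	 */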
	if (checktcp) {
		const u_int jf = usingmask ? 3 : 2;
		assert(ctx->ingroup == false);

		/* A <- L4 protocol; A == TCP?  If not, jump out. */
		struct bpf_insn insns_tcp[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_TCP, 0, jf),
		};
		add_insns(ctx, insns_tcp, __arraycount(insns_tcp));
	} else {
		assert(ctx->flags & CHECKED_L4);
	}

	struct bpf_insn insns_tf[] = {
		/* A <- TCP flags */
		BPF_STMT(BPF_LD+BPF_B+BPF_IND, tcpfl_off),
	};
	add_insns(ctx, insns_tf, __arraycount(insns_tf));

	if (usingmask) {
		/* A <- (A & mask) */
		struct bpf_insn insns_mask[] = {
			BPF_STMT(BPF_ALU+BPF_AND+BPF_K, tf_mask),
		};
		add_insns(ctx, insns_mask, __arraycount(insns_mask));
	}

	struct bpf_insn insns_cmp[] = {
		/* A == expected-TCP-flags? */
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, tf, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_cmp, __arraycount(insns_cmp));

	if (!checktcp) {
		uint32_t mwords[] = { BM_TCPFL, 2, tf, tf_mask };
		done_block(ctx, mwords, sizeof(mwords));
	}
}

/*
 * npfctl_bpf_icmp: code block to match ICMP type and/or code.
 * Note: suitable for both ICMPv4 and ICMPv6.
 */
void
npfctl_bpf_icmp(npf_bpf_t *ctx, int type, int code)
{
	const u_int type_off = offsetof(struct icmp, icmp_type);
	const u_int code_off = offsetof(struct icmp, icmp_code);

	assert(ctx->flags & CHECKED_L4);
	assert(offsetof(struct icmp6_hdr, icmp6_type) == type_off);
	assert(offsetof(struct icmp6_hdr, icmp6_code) == code_off);
	assert(type != -1 || code != -1);

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	if (type != -1) {
		struct bpf_insn insns_type[] = {
			BPF_STMT(BPF_LD+BPF_B+BPF_IND, type_off),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, type, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_type, __arraycount(insns_type));

		uint32_t mwords[] = { BM_ICMP_TYPE, 1, type };
		done_block(ctx, mwords, sizeof(mwords));
	}

	if (code != -1) {
		struct bpf_insn insns_code[] = {
			BPF_STMT(BPF_LD+BPF_B+BPF_IND, code_off),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, code, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_code, __arraycount(insns_code));

		uint32_t mwords[] = { BM_ICMP_CODE, 1, code };
		done_block(ctx, mwords, sizeof(mwords));
	}
}

#define	SRC_FLAG_BIT	(1U << 31)
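/*
 * The table ID is handed to the NPF_COP_TABLE coprocessor call in the
 * accumulator, with this high bit selecting a source (rather than
 * destination) address lookup.
 */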

/*
 * npfctl_bpf_table: code block to match source/destination IP address
 * against NPF table specified by ID.
 */
void
npfctl_bpf_table(npf_bpf_t *ctx, u_int opts, u_int tid)
{
	const bool src = (opts & MATCH_SRC) != 0;

	struct bpf_insn insns_table[] = {
		BPF_STMT(BPF_LD+BPF_IMM, (src ? SRC_FLAG_BIT : 0) | tid),
		BPF_STMT(BPF_MISC+BPF_COP, NPF_COP_TABLE),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, JUMP_MAGIC, 0),
	};
	add_insns(ctx, insns_table, __arraycount(insns_table));

	uint32_t mwords[] = { src ? BM_SRC_TABLE : BM_DST_TABLE, 1, tid };
	done_block(ctx, mwords, sizeof(mwords));
}