/*-
 * Copyright (c) 2010-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This material is based upon work partially supported by The
 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * BPF byte-code generation for NPF rules.
 *
 * Overview
 *
 *	Each NPF rule is compiled into a BPF micro-program.  There is a
 *	BPF byte-code fragment for each higher-level filtering construct,
 *	e.g. to match the L4 protocol, an IP address/mask, etc.  The
 *	generation process combines multiple byte-code fragments into one
 *	program.
 *
 * Basic case
 *
 *	Consider the basic case where all filters must match.  They are
 *	expressed as a logical conjunction, e.g.:
 *
 *		A and B and C and D
 *
 *	Each test (filter) criterion evaluates to either true (match) or
 *	false (no match) and the logic is as follows:
 *
 *	- If the value is true, then jump to the "next" test (offset 0).
 *
 *	- If the value is false, then jump to the JUMP_MAGIC value (0xff).
 *	  This "magic" value indicates a jump that will have to be patched
 *	  at a later stage.
 *
 *	Once all byte-code fragments are combined into one program, two
 *	additional steps are taken:
 *
 *	- Two instructions are appended at the end of the program: "return
 *	  success" followed by "return failure".
 *
 *	- All jumps with the JUMP_MAGIC value are patched to point to the
 *	  "return failure" instruction.
 *
 *	Therefore, if all filter criteria match, the "return success"
 *	instruction is reached, indicating a successful match of the rule.
 *	Otherwise, if any criterion does not match, the failure path is
 *	taken and the rule does not match.
 *
 * Grouping
 *
 *	Filters can have groups, which have the effect of a logical
 *	disjunction, e.g.:
 *
 *		A and B and (C or D)
 *
 *	In such a case, the logic inside the group has to be inverted,
 *	i.e. the jump values swapped.  If the test value is true, then
 *	jump out of the group; if false, then jump "next".  At the end of
 *	the group, an additional failure path is appended and the
 *	JUMP_MAGIC uses within the group are patched to jump past that
 *	path.
 *
 *	For multi-word comparisons (IPv6 addresses), there is another
 *	layer of grouping:
 *
 *		A and B and ((C and D) or (E and F))
 *
 *	This strains the simple-minded JUMP_MAGIC logic, so for now,
 *	when generating the jump-if-false targets for (C and D), we
 *	simply count the number of instructions left to skip over.
 *
 *	A better architecture might be to create asm-type labels for
 *	the jt and jf continuations in the first pass, and then, once
 *	their offsets are determined, go back and fill them in in the
 *	second pass.  This would simplify the logic (no need to compute
 *	exactly how many instructions we are about to generate in a
 *	chain of conditionals) and eliminate the redundant RET #0
 *	instructions which are currently generated after some groups.
 */
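
/*
 * Example (illustrative only; not part of the compiler).  A single
 * "match the L4 protocol" fragment followed by npfctl_bpf_complete()
 * yields roughly the following program.  The JEQ initially carries
 * jf = JUMP_MAGIC (0xff), which the final fixup pass rewrites to 1 so
 * that a mismatch lands on the "return failure" instruction:
 *
 *	BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
 *	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_TCP, 0, 1),
 *	BPF_STMT(BPF_RET+BPF_K, NPF_BPF_SUCCESS),
 *	BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
 */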

#include <sys/cdefs.h>
__RCSID("$NetBSD: npf_bpf_comp.c,v 1.18 2025/07/01 19:55:15 joe Exp $");

#include <stdlib.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <inttypes.h>
#include <err.h>
#include <assert.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#define __FAVOR_BSD
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/udp.h>
#include <netinet/tcp.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp6.h>

#include <net/bpf.h>

#include "npfctl.h"

/*
 * Note: clear X_EQ_L4OFF when register X is invalidated, i.e. when it
 * stores something other than the L4 header offset.  Generally, this is
 * the case whenever BPF_LDX is used.
 */
#define	FETCHED_L3		0x01
#define	CHECKED_L4_PROTO	0x02
#define	X_EQ_L4OFF		0x04
#define	FETCHED_L2		0x08

struct npf_bpf {
	/*
	 * BPF program code, the allocated length (in bytes), the number
	 * of logical blocks and the flags.
	 */
	struct bpf_program	prog;
	size_t			alen;
	unsigned		nblocks;
	sa_family_t		af;
	uint32_t		flags;
	uint8_t			eth_type;

	/*
	 * Indicators of whether we are inside a group and whether this
	 * group is implementing inverted logic.
	 *
	 * The current group offset (counted in BPF instructions)
	 * and the block number at the start of the group.
	 */
	unsigned		ingroup;
	bool			invert;
	bool			multiword;
	unsigned		goff;
	unsigned		gblock;

	/* Track inversion (excl. mark). */
	uint32_t		invflags;

	/* BPF marks, allocated length and the real length. */
	uint32_t *		marks;
	size_t			malen;
	size_t			mlen;
};

/*
 * NPF success and failure values to be returned from BPF.
 */
#define	NPF_BPF_SUCCESS		((u_int)-1)
#define	NPF_BPF_FAILURE		0

/*
 * Magic value to indicate the failure path, which is fixed up on completion.
 * Note: this is the longest jump offset in BPF, since the offset is one byte.
 */
#define	JUMP_MAGIC		0xff

/* Reduce re-allocations by expanding in 64 byte blocks. */
#define	ALLOC_MASK		(64 - 1)
#define	ALLOC_ROUND(x)		(((x) + ALLOC_MASK) & ~ALLOC_MASK)
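/* For example (illustrative): ALLOC_ROUND(1) == 64, ALLOC_ROUND(65) == 128. */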

#ifndef IPV6_VERSION
#define	IPV6_VERSION	0x60
#endif

npf_bpf_t *
npfctl_bpf_create(void)
{
	return ecalloc(1, sizeof(npf_bpf_t));
}

static void
fixup_jumps(npf_bpf_t *ctx, u_int start, u_int end, bool swap)
{
	struct bpf_program *bp = &ctx->prog;

	for (u_int i = start; i < end; i++) {
		struct bpf_insn *insn = &bp->bf_insns[i];
		const u_int fail_off = end - i;
		bool seen_magic = false;

		if (fail_off >= JUMP_MAGIC) {
			errx(EXIT_FAILURE, "BPF generation error: "
			    "the number of instructions is over the limit");
		}
		if (BPF_CLASS(insn->code) != BPF_JMP) {
			continue;
		}
		if (BPF_OP(insn->code) == BPF_JA) {
			/*
			 * BPF_JA can be used to jump to the failure path.
			 * If we are swapping, i.e. inside a group, then
			 * jump "next"; groups have a failure path appended
			 * at their end.
			 */
			if (insn->k == JUMP_MAGIC) {
				insn->k = swap ? 0 : fail_off;
			}
			continue;
		}

		/*
		 * Fixup the "magic" value.  Swap only the "magic" jumps.
		 */

		if (insn->jt == JUMP_MAGIC) {
			insn->jt = fail_off;
			seen_magic = true;
		}
		if (insn->jf == JUMP_MAGIC) {
			insn->jf = fail_off;
			seen_magic = true;
		}

		if (seen_magic && swap) {
			uint8_t jt = insn->jt;
			insn->jt = insn->jf;
			insn->jf = jt;
		}
	}
}

static void
add_insns(npf_bpf_t *ctx, struct bpf_insn *insns, size_t count)
{
	struct bpf_program *bp = &ctx->prog;
	size_t offset, len, reqlen;

	/* Note: bf_len is the count of instructions. */
	offset = bp->bf_len * sizeof(struct bpf_insn);
	len = count * sizeof(struct bpf_insn);

	/* Ensure the memory buffer is large enough for the program. */
	reqlen = ALLOC_ROUND(offset + len);
	if (reqlen > ctx->alen) {
		bp->bf_insns = erealloc(bp->bf_insns, reqlen);
		ctx->alen = reqlen;
	}

	/* Add the code block. */
	memcpy((uint8_t *)bp->bf_insns + offset, insns, len);
	bp->bf_len += count;
}

static void
add_bmarks(npf_bpf_t *ctx, const uint32_t *m, size_t len)
{
	size_t reqlen, nargs = m[1];

	if ((len / sizeof(uint32_t) - 2) != nargs) {
		errx(EXIT_FAILURE, "invalid BPF block description");
	}
	reqlen = ALLOC_ROUND(ctx->mlen + len);
	if (reqlen > ctx->malen) {
		ctx->marks = erealloc(ctx->marks, reqlen);
		ctx->malen = reqlen;
	}
	memcpy((uint8_t *)ctx->marks + ctx->mlen, m, len);
	ctx->mlen += len;
}

static void
done_block(npf_bpf_t *ctx, const uint32_t *m, size_t len)
{
	add_bmarks(ctx, m, len);
	ctx->nblocks++;
}

struct bpf_program *
npfctl_bpf_complete(npf_bpf_t *ctx)
{
	struct bpf_program *bp = &ctx->prog;
	const u_int retoff = bp->bf_len;

	/* No instructions (optimised out). */
	if (!bp->bf_len)
		return NULL;

	/* Add the return fragment (success and failure paths). */
	struct bpf_insn insns_ret[] = {
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_SUCCESS),
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
	};
	add_insns(ctx, insns_ret, __arraycount(insns_ret));

	/* Fixup all jumps to the main failure path. */
	fixup_jumps(ctx, 0, retoff, false);

	return &ctx->prog;
}

const void *
npfctl_bpf_bmarks(npf_bpf_t *ctx, size_t *len)
{
	*len = ctx->mlen;
	return ctx->marks;
}

void
npfctl_bpf_destroy(npf_bpf_t *ctx)
{
	free(ctx->prog.bf_insns);
	free(ctx->marks);
	free(ctx);
}
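
/*
 * Example usage of this interface (a minimal, illustrative sketch; the
 * rule content is hypothetical and error handling is omitted):
 *
 *	npf_bpf_t *bc = npfctl_bpf_create();
 *
 *	npfctl_bpf_proto(bc, IPPROTO_TCP);
 *	npfctl_bpf_ports(bc, MATCH_DST, htons(80), htons(80));
 *
 *	struct bpf_program *bp = npfctl_bpf_complete(bc);
 *	if (bp != NULL) {
 *		size_t len;
 *		const void *marks = npfctl_bpf_bmarks(bc, &len);
 *		... pass bp->bf_insns / bp->bf_len and the marks along ...
 *	}
 *	npfctl_bpf_destroy(bc);
 */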

/*
 * npfctl_bpf_group_enter: begin a logical group.  It merely uses logical
 * disjunction (OR) for comparisons within the group.
 */
void
npfctl_bpf_group_enter(npf_bpf_t *ctx, bool invert)
{
	struct bpf_program *bp = &ctx->prog;

	assert(ctx->goff == 0);
	assert(ctx->gblock == 0);

	ctx->goff = bp->bf_len;
	ctx->gblock = ctx->nblocks;
	ctx->invert = invert;
	ctx->multiword = false;
	ctx->ingroup++;
}

void
npfctl_bpf_group_exit(npf_bpf_t *ctx)
{
	struct bpf_program *bp = &ctx->prog;
	const size_t curoff = bp->bf_len;

	assert(ctx->ingroup);
	ctx->ingroup--;

	/*
	 * If we are not inverting, the group contained at most one option
	 * and the last comparison was not a multi-word comparison requiring
	 * a fall-through failure -- there is nothing to do.
	 */
	if (!ctx->invert &&
	    (ctx->nblocks - ctx->gblock) <= 1 &&
	    !ctx->multiword) {
		ctx->goff = ctx->gblock = 0;
		return;
	}

	/*
	 * If inverting, then prepend a jump over the failure return below.
	 * On a match, the (swapped) jump skips past it straight to the
	 * failure path; if nothing matches, this jump skips the failure
	 * return and execution continues past the group.
	 */
	if (ctx->invert) {
		struct bpf_insn insns_ret[] = {
			BPF_STMT(BPF_JMP+BPF_JA, 1),
		};
		add_insns(ctx, insns_ret, __arraycount(insns_ret));
	}

	/*
	 * Append a failure return as a fall-through, i.e. taken if there
	 * is no match within the group.
	 */
	struct bpf_insn insns_ret[] = {
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
	};
	add_insns(ctx, insns_ret, __arraycount(insns_ret));

	/*
	 * Adjust the jump offsets: on a match, jump outside the group,
	 * i.e. to the current offset.  Otherwise, jump to the next
	 * instruction, which leads to the fall-through code above if
	 * nothing matches.
	 */
	fixup_jumps(ctx, ctx->goff, curoff, true);
	ctx->goff = ctx->gblock = 0;
}
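
/*
 * Example (illustrative only).  Two single-protocol comparisons emitted
 * between npfctl_bpf_group_enter(ctx, false) and npfctl_bpf_group_exit()
 * end up, after the swap/fixup above, roughly as:
 *
 *	BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
 *	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_TCP, 3, 0),
 *	BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
 *	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_UDP, 1, 0),
 *	BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
 *
 * A match on either protocol jumps past the trailing failure return,
 * i.e. out of the group; if neither matches, the failure return is hit.
 */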

static void
fetch_l3(npf_bpf_t *ctx, sa_family_t af, unsigned flags)
{
	unsigned ver;

	switch (af) {
	case AF_INET:
		ver = IPVERSION;
		break;
	case AF_INET6:
		ver = IPV6_VERSION >> 4;
		break;
	case AF_UNSPEC:
		ver = 0;
		break;
	default:
		abort();
	}

	/*
	 * The memory store is populated with:
	 * - BPF_MW_IPVER: IP version (4 or 6).
	 * - BPF_MW_L4OFF: L4 header offset.
	 * - BPF_MW_L4PROTO: L4 protocol.
	 */
	if ((ctx->flags & FETCHED_L3) == 0 || (af && ctx->af == 0)) {
		const uint8_t jt = ver ? 0 : JUMP_MAGIC;
		const uint8_t jf = ver ? JUMP_MAGIC : 0;
		const bool ingroup = ctx->ingroup != 0;
		const bool invert = ctx->invert;

		/*
		 * The L3 block cannot be inserted in the middle of a group.
		 * In fact, it never is.  Check and restart the group after.
		 */
		if (ingroup) {
			assert(ctx->nblocks == ctx->gblock);
			npfctl_bpf_group_exit(ctx);
		}

		/*
		 * A <- IP version; A == expected-version?
		 * If no particular version is specified, check for non-zero.
		 */
		struct bpf_insn insns_af[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_IPVER),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ver, jt, jf),
		};
		add_insns(ctx, insns_af, __arraycount(insns_af));
		ctx->flags |= FETCHED_L3;
		ctx->af = af;

		if (af) {
			uint32_t mwords[] = { BM_IPVER, 1, af };
			add_bmarks(ctx, mwords, sizeof(mwords));
		}
		if (ingroup) {
			npfctl_bpf_group_enter(ctx, invert);
		}

	} else if (af && af != ctx->af) {
		errx(EXIT_FAILURE, "address family mismatch");
	}

	if ((flags & X_EQ_L4OFF) != 0 && (ctx->flags & X_EQ_L4OFF) == 0) {
		/* X <- IP header length */
		struct bpf_insn insns_hlen[] = {
			BPF_STMT(BPF_LDX+BPF_MEM, BPF_MW_L4OFF),
		};
		add_insns(ctx, insns_hlen, __arraycount(insns_hlen));
		ctx->flags |= X_EQ_L4OFF;
	}
}

void
fetch_ether_type(npf_bpf_t *ctx, uint16_t type)
{
	if ((ctx->flags & FETCHED_L2) != 0 || (type && ctx->eth_type != 0))
		return;

	const uint8_t jt = type ? 0 : JUMP_MAGIC;
	const uint8_t jf = type ? JUMP_MAGIC : 0;
	const bool ingroup = ctx->ingroup != 0;
	const bool invert = ctx->invert;
	unsigned off = offsetof(struct ether_header, ether_type);

	/*
	 * The L2 block cannot be inserted in the middle of a group.
	 * Check and restart the group after.
	 */
	if (ingroup) {
		assert(ctx->nblocks == ctx->gblock);
		npfctl_bpf_group_exit(ctx);
	}

	type = ntohs(type);

	struct bpf_insn insns_et[] = {
		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, off),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, type, jt, jf),
	};
	add_insns(ctx, insns_et, __arraycount(insns_et));
	ctx->flags |= FETCHED_L2;
	ctx->eth_type = type;

	if (type) {	/* bookmark the ether type */
		uint32_t mwords[] = { BM_ETHER_TYPE, 1, htons(type) };
		add_bmarks(ctx, mwords, sizeof(mwords));
	}
	if (ingroup) {
		npfctl_bpf_group_enter(ctx, invert);
	}
}

static void
bm_invert_checkpoint(npf_bpf_t *ctx, const unsigned opts, uint32_t layer)
{
	uint32_t bm = 0;

	if (ctx->ingroup && ctx->invert) {
		const unsigned seen = ctx->invflags;

		if ((opts & MATCH_SRC) != 0 && (seen & MATCH_SRC) == 0) {
			bm = (layer & NPF_RULE_LAYER_3) ? BM_SRC_NEG : BM_SRC_ENEG;
		}
		if ((opts & MATCH_DST) != 0 && (seen & MATCH_DST) == 0) {
			bm = (layer & NPF_RULE_LAYER_3) ? BM_DST_NEG : BM_DST_ENEG;
		}
		ctx->invflags |= opts & (MATCH_SRC | MATCH_DST);
	}
	if (bm) {
		uint32_t mwords[] = { bm, 0 };
		add_bmarks(ctx, mwords, sizeof(mwords));
	}
}

/*
 * npfctl_bpf_ipver: code block to match the IP version.
 */
void
npfctl_bpf_ipver(npf_bpf_t *ctx, sa_family_t af)
{
	fetch_l3(ctx, af, 0);
}

/*
 * npfctl_bpf_proto: code block to match the L4 protocol.
 */
void
npfctl_bpf_proto(npf_bpf_t *ctx, unsigned proto)
{
	struct bpf_insn insns_proto[] = {
		/* A <- L4 protocol; A == expected-protocol? */
		BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, proto, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_proto, __arraycount(insns_proto));

	uint32_t mwords[] = { BM_PROTO, 1, proto };
	done_block(ctx, mwords, sizeof(mwords));
	ctx->flags |= CHECKED_L4_PROTO;
}

/*
 * npfctl_bpf_cidr: code block to match an IPv4 or IPv6 CIDR.
 *
 * => The IP address shall be in the network byte order.
 */
void
npfctl_bpf_cidr(npf_bpf_t *ctx, unsigned opts, sa_family_t af,
    const npf_addr_t *addr, const npf_netmask_t mask)
{
	const uint32_t *awords = (const uint32_t *)addr;
	unsigned nwords, origlength, length, maxmask, off;

	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
	assert((mask && mask <= NPF_MAX_NETMASK) || mask == NPF_NO_NETMASK);

	switch (af) {
	case AF_INET:
		maxmask = 32;
		off = (opts & MATCH_SRC) ?
		    offsetof(struct ip, ip_src) :
		    offsetof(struct ip, ip_dst);
		nwords = sizeof(struct in_addr) / sizeof(uint32_t);
		break;
	case AF_INET6:
		maxmask = 128;
		off = (opts & MATCH_SRC) ?
		    offsetof(struct ip6_hdr, ip6_src) :
		    offsetof(struct ip6_hdr, ip6_dst);
		nwords = sizeof(struct in6_addr) / sizeof(uint32_t);
		break;
	default:
		abort();
	}

	/* Ensure the address family. */
	fetch_l3(ctx, af, 0);

	length = origlength = (mask == NPF_NO_NETMASK) ? maxmask : mask;

	/* CAUTION: BPF operates in host byte-order. */
	for (unsigned i = 0; i < nwords; i++) {
		const unsigned woff = i * sizeof(uint32_t);
		uint32_t word = ntohl(awords[i]);
		uint32_t wordmask;

		if (length >= 32) {
			/* The mask is a full word - do not apply it. */
			wordmask = 0;
			length -= 32;
		} else if (length) {
			wordmask = 0xffffffff << (32 - length);
			length = 0;
		} else {
			/* The mask became zero - skip the rest. */
			break;
		}

		/* A <- IP address (or one word of it) */
		struct bpf_insn insns_ip[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off + woff),
		};
		add_insns(ctx, insns_ip, __arraycount(insns_ip));

		/* A <- (A & MASK) */
		if (wordmask) {
			struct bpf_insn insns_mask[] = {
				BPF_STMT(BPF_ALU+BPF_AND+BPF_K, wordmask),
			};
			add_insns(ctx, insns_mask, __arraycount(insns_mask));
		}

		/*
		 * Determine how many instructions we have to jump
		 * ahead if the match fails.
		 *
		 * - If this is the last word, we jump to the final
		 *   failure, JUMP_MAGIC.
		 *
		 * - If this is not the last word, we jump past the
		 *   remaining instructions to match this sequence.
		 *   Each 32-bit word in the sequence takes two
		 *   instructions (BPF_LD and BPF_JMP).  If there is a
		 *   partial-word mask ahead, there will be one
		 *   additional instruction (BPF_ALU).
		 */
		uint8_t jf;
		if (i + 1 == (origlength + 31)/32) {
			jf = JUMP_MAGIC;
		} else {
			jf = 2*((origlength + 31)/32 - i - 1);
			if (origlength % 32 != 0 && wordmask == 0)
				jf += 1;
		}

		/* A == expected-IP-word ? */
		struct bpf_insn insns_cmp[] = {
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, word, 0, jf),
		};
		add_insns(ctx, insns_cmp, __arraycount(insns_cmp));
	}

	/*
	 * If we checked a chain of words in sequence, mark this as a
	 * multi-word comparison so that, if this is inside a group,
	 * a fall-through failure case is appended.
	 *
	 * XXX This is a little silly; the compiler should really just
	 * record holes where conditional jumps need success/failure
	 * continuations, and go back to fill in the holes when the
	 * locations of the continuations are determined later.  But
	 * that requires restructuring this code a little more.
	 */
	ctx->multiword = (origlength + 31)/32 > 1;

	uint32_t mwords[] = {
		(opts & MATCH_SRC) ? BM_SRC_CIDR : BM_DST_CIDR, 6,
		af, mask, awords[0], awords[1], awords[2], awords[3],
	};
	bm_invert_checkpoint(ctx, opts, NPF_RULE_LAYER_3);
	done_block(ctx, mwords, sizeof(mwords));
}
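
/*
 * Worked example for the jump-ahead computation in npfctl_bpf_cidr()
 * (illustrative only).  For an IPv6 address with a /48 mask, two words
 * are compared: the first is a full word, the second is masked down to
 * its upper 16 bits.  Writing word0 and word1 for ntohl(awords[0]) and
 * ntohl(awords[1]), the emitted sequence is roughly:
 *
 *	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off + 0),
 *	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, word0, 0, 3),
 *	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off + 4),
 *	BPF_STMT(BPF_ALU+BPF_AND+BPF_K, 0xffff0000),
 *	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, word1, 0, JUMP_MAGIC),
 *
 * The first jump-if-false value of 3 skips the remaining LD/AND/JEQ of
 * the sequence; the last one is the usual JUMP_MAGIC failure jump.
 */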

/*
 * npfctl_bpf_ether: code block to match an Ethernet address (6 octets),
 * fetched and compared directly as a 32-bit word followed by a 16-bit
 * halfword.
 */
void
npfctl_bpf_ether(npf_bpf_t *ctx, unsigned opts, struct ether_addr *ether_addr)
{
	uint32_t mac_word;
	uint16_t mac_hword;
	unsigned off;

	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));

	off = (opts & MATCH_SRC) ? offsetof(struct ether_header, ether_shost) :
	    offsetof(struct ether_header, ether_dhost);

	memcpy(&mac_word, ether_addr, sizeof(mac_word));
	mac_word = ntohl(mac_word);

	/* Copy the last two bytes of the 6-byte Ethernet address. */
	memcpy(&mac_hword, (uint8_t *)ether_addr + sizeof(mac_word),
	    sizeof(mac_hword));
	mac_hword = ntohs(mac_hword);

	/* Load and compare the first word, then the trailing halfword. */
	struct bpf_insn insns_ether_w[] = {
		BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, mac_word, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_ether_w, __arraycount(insns_ether_w));

	struct bpf_insn insns_ether_h[] = {
		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, off + sizeof(mac_word)),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, mac_hword, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_ether_h, __arraycount(insns_ether_h));

	uint32_t mwords[] = {
		(opts & MATCH_SRC) ? BM_SRC_ETHER : BM_DST_ETHER, 2,
		htonl(mac_word), htons(mac_hword)
	};

	bm_invert_checkpoint(ctx, opts, NPF_RULE_LAYER_2);
	done_block(ctx, mwords, sizeof(mwords));
}

/*
 * npfctl_bpf_ports: code block to match a TCP/UDP port range.
 *
 * => Port numbers shall be in the network byte order.
 */
void
npfctl_bpf_ports(npf_bpf_t *ctx, unsigned opts, in_port_t from, in_port_t to)
{
	const unsigned sport_off = offsetof(struct udphdr, uh_sport);
	const unsigned dport_off = offsetof(struct udphdr, uh_dport);
	unsigned off;

	/* TCP and UDP port offsets are the same. */
	assert(sport_off == offsetof(struct tcphdr, th_sport));
	assert(dport_off == offsetof(struct tcphdr, th_dport));
	assert(ctx->flags & CHECKED_L4_PROTO);

	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
	off = (opts & MATCH_SRC) ? sport_off : dport_off;

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	struct bpf_insn insns_fetch[] = {
		/* A <- port */
		BPF_STMT(BPF_LD+BPF_H+BPF_IND, off),
	};
	add_insns(ctx, insns_fetch, __arraycount(insns_fetch));

	/* CAUTION: BPF operates in host byte-order. */
	from = ntohs(from);
	to = ntohs(to);

	if (from == to) {
		/* Single port case. */
		struct bpf_insn insns_port[] = {
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, from, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_port, __arraycount(insns_port));
	} else {
		/* Port range case. */
		struct bpf_insn insns_range[] = {
			BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, from, 0, 1),
			BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, to, 0, 1),
			BPF_STMT(BPF_JMP+BPF_JA, JUMP_MAGIC),
		};
		add_insns(ctx, insns_range, __arraycount(insns_range));
	}

	uint32_t mwords[] = {
		(opts & MATCH_SRC) ? BM_SRC_PORTS : BM_DST_PORTS, 2, from, to
	};
	done_block(ctx, mwords, sizeof(mwords));
}
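
/*
 * Illustrative note on the port range case above: for a range such as
 * 1000-2000 (host byte order after the ntohs() conversion), the three
 * instructions behave as follows:
 *
 *	BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, 1000, 0, 1),	A < 1000  -> JA (fail)
 *	BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, 2000, 0, 1),	A > 2000  -> JA (fail)
 *	BPF_STMT(BPF_JMP+BPF_JA, JUMP_MAGIC),		in range  -> skipped
 *
 * i.e. only a port within the range skips over the JUMP_MAGIC jump to
 * the failure path.
 */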

/*
 * npfctl_bpf_tcpfl: code block to match the TCP flags.
 */
void
npfctl_bpf_tcpfl(npf_bpf_t *ctx, uint8_t tf, uint8_t tf_mask)
{
	const unsigned tcpfl_off = offsetof(struct tcphdr, th_flags);
	const bool usingmask = tf_mask != tf;

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	if ((ctx->flags & CHECKED_L4_PROTO) == 0) {
		const unsigned jf = usingmask ? 3 : 2;
		assert(ctx->ingroup == 0);

		/*
		 * A <- L4 protocol; A == TCP?  If not, jump out.
		 *
		 * Note: the TCP flag matching may appear without 'proto tcp'
		 * when a plain 'stateful' rule is used.  In such a case the
		 * rule also handles other protocols, hence no strict TCP
		 * check here.
		 */
		struct bpf_insn insns_tcp[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_TCP, 0, jf),
		};
		add_insns(ctx, insns_tcp, __arraycount(insns_tcp));
	}

	struct bpf_insn insns_tf[] = {
		/* A <- TCP flags */
		BPF_STMT(BPF_LD+BPF_B+BPF_IND, tcpfl_off),
	};
	add_insns(ctx, insns_tf, __arraycount(insns_tf));

	if (usingmask) {
		/* A <- (A & mask) */
		struct bpf_insn insns_mask[] = {
			BPF_STMT(BPF_ALU+BPF_AND+BPF_K, tf_mask),
		};
		add_insns(ctx, insns_mask, __arraycount(insns_mask));
	}

	struct bpf_insn insns_cmp[] = {
		/* A == expected-TCP-flags? */
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, tf, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_cmp, __arraycount(insns_cmp));

	uint32_t mwords[] = { BM_TCPFL, 2, tf, tf_mask };
	done_block(ctx, mwords, sizeof(mwords));
}

/*
 * npfctl_bpf_icmp: code block to match the ICMP type and/or code.
 * Note: suitable for both ICMPv4 and ICMPv6.
 */
void
npfctl_bpf_icmp(npf_bpf_t *ctx, int type, int code)
{
	const u_int type_off = offsetof(struct icmp, icmp_type);
	const u_int code_off = offsetof(struct icmp, icmp_code);

	assert(ctx->flags & CHECKED_L4_PROTO);
	assert(offsetof(struct icmp6_hdr, icmp6_type) == type_off);
	assert(offsetof(struct icmp6_hdr, icmp6_code) == code_off);
	assert(type != -1 || code != -1);

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	if (type != -1) {
		struct bpf_insn insns_type[] = {
			BPF_STMT(BPF_LD+BPF_B+BPF_IND, type_off),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, type, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_type, __arraycount(insns_type));

		uint32_t mwords[] = { BM_ICMP_TYPE, 1, type };
		done_block(ctx, mwords, sizeof(mwords));
	}

	if (code != -1) {
		struct bpf_insn insns_code[] = {
			BPF_STMT(BPF_LD+BPF_B+BPF_IND, code_off),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, code, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_code, __arraycount(insns_code));

		uint32_t mwords[] = { BM_ICMP_CODE, 1, code };
		done_block(ctx, mwords, sizeof(mwords));
	}
}

#define	SRC_FLAG_BIT	(1U << 31)

/*
 * npfctl_bpf_table: code block to match the source/destination IP address
 * against the NPF table specified by ID.
 */
void
npfctl_bpf_table(npf_bpf_t *ctx, unsigned opts, unsigned tid)
{
	const bool src = (opts & MATCH_SRC) != 0;

	struct bpf_insn insns_table[] = {
		BPF_STMT(BPF_LD+BPF_IMM, (src ? SRC_FLAG_BIT : 0) | tid),
		BPF_STMT(BPF_MISC+BPF_COP, NPF_COP_TABLE),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, JUMP_MAGIC, 0),
	};
	add_insns(ctx, insns_table, __arraycount(insns_table));

	uint32_t mwords[] = { src ? BM_SRC_TABLE : BM_DST_TABLE, 1, tid };
	bm_invert_checkpoint(ctx, opts, NPF_RULE_LAYER_3);
	done_block(ctx, mwords, sizeof(mwords));
}