/*-
 * Copyright (c) 2010-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This material is based upon work partially supported by The
 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * BPF byte-code generation for NPF rules.
 *
 * Overview
 *
 *	Each NPF rule is compiled into a BPF micro-program.  There is a
 *	BPF byte-code fragment for each higher-level filtering criterion,
 *	e.g. to match the L4 protocol, IP/mask, etc.  The generation
 *	process combines multiple BPF byte-code fragments into one program.
 *
 * Basic case
 *
 *	Consider the basic case where all filters should match.  They
 *	are expressed as a logical conjunction, e.g.:
 *
 *		A and B and C and D
 *
 *	Each test (filter) criterion can be evaluated to true (match) or
 *	false (no match) and the logic is as follows:
 *
 *	- If the value is true, then jump to the "next" test (offset 0).
 *
 *	- If the value is false, then jump to the JUMP_MAGIC value (0xff).
 *	  This "magic" value indicates a jump which will have to be
 *	  patched at a later stage.
 *
 *	Once all byte-code fragments are combined into one program, there
 *	are two additional steps:
 *
 *	- Two instructions are appended at the end of the program: "return
 *	  success" followed by "return failure".
 *
 *	- All jumps with the JUMP_MAGIC value are patched to point to the
 *	  "return failure" instruction.
 *
 *	Therefore, if all filter criteria match, the "return success"
 *	instruction is reached, indicating a successful match of the rule.
 *	Otherwise, if any criterion does not match, the failure path is
 *	taken and the rule does not match.
 *
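 *	As an illustrative sketch (not verbatim output of this file), a
 *	rule with two such criteria - say IPv4 (version 4) and TCP
 *	(protocol 6) - produces a program of this shape once the
 *	JUMP_MAGIC placeholders have been patched:
 *
 *		(0) ld  M[IPVER]
 *		(1) jeq #4	jt 0 (-> 2)	jf 0xff -> 3 (-> 5)
 *		(2) ld  M[L4PROTO]
 *		(3) jeq #6	jt 0 (-> 4)	jf 0xff -> 1 (-> 5)
 *		(4) ret #NPF_BPF_SUCCESS
 *		(5) ret #NPF_BPF_FAILURE
 *
 *	BPF jump offsets are relative (target = pc + 1 + offset), which is
 *	why the patched jf values differ while landing on the same
 *	"return failure" instruction.
 *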
 * Grouping
 *
 *	Filters can have groups, which have the effect of a logical
 *	disjunction, e.g.:
 *
 *		A and B and (C or D)
 *
 *	In such a case, the logic inside the group has to be inverted,
 *	i.e. the jump values swapped.  If the test value is true, then
 *	jump out of the group; if false, then jump "next".  At the end of
 *	the group, an additional failure path is appended and the
 *	JUMP_MAGIC uses within the group are patched to jump past the
 *	said path.
 *
 *	For multi-word comparisons (IPv6 addresses), there is another
 *	layer of grouping:
 *
 *		A and B and ((C and D) or (E and F))
 *
 *	This strains the simple-minded JUMP_MAGIC logic, so for now,
 *	when generating the jump-if-false targets for (C and D), we
 *	simply count the number of instructions left to skip over.
 *
 *	A better architecture might be to create asm-type labels for
 *	the jt and jf continuations in the first pass, and then, once
 *	their offsets are determined, go back and fill them in in the
 *	second pass.  This would simplify the logic (no need to compute
 *	exactly how many instructions we're about to generate in a
 *	chain of conditionals) and eliminate the redundant RET #0
 *	instructions which are currently generated after some groups.
 */

#include <sys/cdefs.h>
__RCSID("$NetBSD: npf_bpf_comp.c,v 1.19 2025/07/10 11:44:12 joe Exp $");

#include <stdlib.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <inttypes.h>
#include <err.h>
#include <assert.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#define __FAVOR_BSD
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/udp.h>
#include <netinet/tcp.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp6.h>

#include <net/bpf.h>

#include "npfctl.h"

/*
 * Note: clear X_EQ_L4OFF when register X is invalidated, i.e. when it
 * stores something other than the L4 header offset.  Generally, this is
 * the case when BPF_LDX is used.
 */
#define	FETCHED_L3		0x01
#define	CHECKED_L4_PROTO	0x02
#define	X_EQ_L4OFF		0x04
#define	FETCHED_L2		0x08

struct npf_bpf {
	/*
	 * BPF program code, the allocated length (in bytes), the number
	 * of logical blocks and the flags.
	 */
	struct bpf_program	prog;
	size_t			alen;
	unsigned		nblocks;
	sa_family_t		af;
	uint32_t		flags;
	uint8_t			eth_type;

	/*
	 * Indicators whether we are inside a group and whether this
	 * group is implementing inverted logic.
	 *
	 * The current group offset (counted in BPF instructions)
	 * and the block number at the start of the group.
	 */
	unsigned		ingroup;
	bool			invert;
	bool			multiword;
	unsigned		goff;
	unsigned		gblock;

	/* Track inversion (excl. mark). */
	uint32_t		invflags;

	/* BPF marks, the allocated length and the real length. */
	uint32_t *		marks;
	size_t			malen;
	size_t			mlen;
};

/*
 * NPF success and failure values to be returned from BPF.
 */
#define	NPF_BPF_SUCCESS		((u_int)-1)
#define	NPF_BPF_FAILURE		0

/*
 * Magic value to indicate the failure path, which is fixed up on completion.
 * Note: this is the longest jump offset in BPF, since the offset is one byte.
 */
#define	JUMP_MAGIC		0xff

/* Reduce re-allocations by expanding in 64 byte blocks. */
#define	ALLOC_MASK		(64 - 1)
#define	ALLOC_ROUND(x)		(((x) + ALLOC_MASK) & ~ALLOC_MASK)

#ifndef IPV6_VERSION
#define	IPV6_VERSION		0x60
#endif

npf_bpf_t *
npfctl_bpf_create(void)
{
	return ecalloc(1, sizeof(npf_bpf_t));
}

static void
fixup_jumps(npf_bpf_t *ctx, u_int start, u_int end, bool swap)
{
	struct bpf_program *bp = &ctx->prog;

	for (u_int i = start; i < end; i++) {
		struct bpf_insn *insn = &bp->bf_insns[i];
		const u_int fail_off = end - i;
		bool seen_magic = false;

		if (fail_off >= JUMP_MAGIC) {
			errx(EXIT_FAILURE, "BPF generation error: "
			    "the number of instructions is over the limit");
		}
		if (BPF_CLASS(insn->code) != BPF_JMP) {
			continue;
		}
		if (BPF_OP(insn->code) == BPF_JA) {
			/*
			 * BPF_JA can be used to jump to the failure path.
			 * If we are swapping, i.e. inside a group, then
			 * jump "next"; groups have a failure path appended
			 * at their end.
			 */
			if (insn->k == JUMP_MAGIC) {
				insn->k = swap ? 0 : fail_off;
			}
			continue;
		}

		/*
		 * Fixup the "magic" value.  Swap only the "magic" jumps.
		 */

		if (insn->jt == JUMP_MAGIC) {
			insn->jt = fail_off;
			seen_magic = true;
		}
		if (insn->jf == JUMP_MAGIC) {
			insn->jf = fail_off;
			seen_magic = true;
		}

		if (seen_magic && swap) {
			uint8_t jt = insn->jt;
			insn->jt = insn->jf;
			insn->jf = jt;
		}
	}
}

static void
add_insns(npf_bpf_t *ctx, struct bpf_insn *insns, size_t count)
{
	struct bpf_program *bp = &ctx->prog;
	size_t offset, len, reqlen;

	/* Note: bf_len is the count of instructions. */
	offset = bp->bf_len * sizeof(struct bpf_insn);
	len = count * sizeof(struct bpf_insn);

	/* Ensure the memory buffer is large enough for the program. */
	reqlen = ALLOC_ROUND(offset + len);
	if (reqlen > ctx->alen) {
		bp->bf_insns = erealloc(bp->bf_insns, reqlen);
		ctx->alen = reqlen;
	}

	/* Add the code block. */
	memcpy((uint8_t *)bp->bf_insns + offset, insns, len);
	bp->bf_len += count;
}

static void
add_bmarks(npf_bpf_t *ctx, const uint32_t *m, size_t len)
{
	size_t reqlen, nargs = m[1];

	if ((len / sizeof(uint32_t) - 2) != nargs) {
		errx(EXIT_FAILURE, "invalid BPF block description");
	}
	reqlen = ALLOC_ROUND(ctx->mlen + len);
	if (reqlen > ctx->malen) {
		ctx->marks = erealloc(ctx->marks, reqlen);
		ctx->malen = reqlen;
	}
	memcpy((uint8_t *)ctx->marks + ctx->mlen, m, len);
	ctx->mlen += len;
}

static void
done_block(npf_bpf_t *ctx, const uint32_t *m, size_t len)
{
	add_bmarks(ctx, m, len);
	ctx->nblocks++;
}
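
/*
 * Illustrative note on the mark ("bmark") records validated above: each
 * record is a key, an argument count and that many argument words.  For
 * instance, the record emitted by npfctl_bpf_proto() for TCP is,
 * conceptually:
 *
 *	uint32_t mwords[] = { BM_PROTO, 1, IPPROTO_TCP };
 *
 * add_bmarks() checks exactly this shape (len / sizeof(uint32_t) - 2 ==
 * nargs) before appending the record to the ctx->marks array.
 */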

struct bpf_program *
npfctl_bpf_complete(npf_bpf_t *ctx)
{
	struct bpf_program *bp = &ctx->prog;
	const u_int retoff = bp->bf_len;

	/* No instructions (optimised out). */
	if (!bp->bf_len)
		return NULL;

	/* Add the return fragment (success and failure paths). */
	struct bpf_insn insns_ret[] = {
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_SUCCESS),
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
	};
	add_insns(ctx, insns_ret, __arraycount(insns_ret));

	/* Fixup all jumps to the main failure path. */
	fixup_jumps(ctx, 0, retoff, false);

	return &ctx->prog;
}

const void *
npfctl_bpf_bmarks(npf_bpf_t *ctx, size_t *len)
{
	*len = ctx->mlen;
	return ctx->marks;
}

void
npfctl_bpf_destroy(npf_bpf_t *ctx)
{
	free(ctx->prog.bf_insns);
	free(ctx->marks);
	free(ctx);
}

/*
 * npfctl_bpf_group_enter: begin a logical group.  It merely uses logical
 * disjunction (OR) for comparisons within the group.
 */
void
npfctl_bpf_group_enter(npf_bpf_t *ctx, bool invert)
{
	struct bpf_program *bp = &ctx->prog;

	assert(ctx->goff == 0);
	assert(ctx->gblock == 0);

	ctx->goff = bp->bf_len;
	ctx->gblock = ctx->nblocks;
	ctx->invert = invert;
	ctx->multiword = false;
	ctx->ingroup++;
}

void
npfctl_bpf_group_exit(npf_bpf_t *ctx)
{
	struct bpf_program *bp = &ctx->prog;
	const size_t curoff = bp->bf_len;

	assert(ctx->ingroup);
	ctx->ingroup--;

	/*
	 * If we're not inverting, there were only zero or one options,
	 * and the last comparison was not a multi-word comparison
	 * requiring a fallthrough failure -- nothing to do.
	 */
	if (!ctx->invert &&
	    (ctx->nblocks - ctx->gblock) <= 1 &&
	    !ctx->multiword) {
		ctx->goff = ctx->gblock = 0;
		return;
	}

	/*
	 * If inverting, then prepend a jump over the statement below.
	 * On match, it will skip-through and the fail path will be taken.
	 */
	if (ctx->invert) {
		struct bpf_insn insns_ret[] = {
			BPF_STMT(BPF_JMP+BPF_JA, 1),
		};
		add_insns(ctx, insns_ret, __arraycount(insns_ret));
	}

	/*
	 * Append a failure return as a fall-through i.e. if there is
	 * no match within the group.
	 */
	struct bpf_insn insns_ret[] = {
		BPF_STMT(BPF_RET+BPF_K, NPF_BPF_FAILURE),
	};
	add_insns(ctx, insns_ret, __arraycount(insns_ret));

	/*
	 * Adjust jump offsets: on match - jump outside the group i.e.
	 * to the current offset.  Otherwise, jump to the next instruction
	 * which would lead to the fall-through code above if none matches.
	 */
	fixup_jumps(ctx, ctx->goff, curoff, true);
	ctx->goff = ctx->gblock = 0;
}
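
/*
 * Illustrative sketch (not verbatim output) of a non-inverted group with
 * two single-word comparisons, after npfctl_bpf_group_exit() and its
 * fixup_jumps() pass, with the group starting at instruction g:
 *
 *	(g+0) ld  <first option>
 *	(g+1) jeq #val1		jt 3 (-> g+5)	jf 0 (-> g+2)
 *	(g+2) ld  <second option>
 *	(g+3) jeq #val2		jt 1 (-> g+5)	jf 0 (-> g+4)
 *	(g+4) ret #NPF_BPF_FAILURE	; fall-through: no option matched
 *	(g+5) ...			; first instruction after the group
 *
 * The comparisons originally carry jt 0 / jf JUMP_MAGIC; the fixup patches
 * the magic value and swaps jt/jf, so a match escapes the group while a
 * mismatch falls through to the next option or to the failure return.
 */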

static void
fetch_l3(npf_bpf_t *ctx, sa_family_t af, unsigned flags)
{
	unsigned ver;

	switch (af) {
	case AF_INET:
		ver = IPVERSION;
		break;
	case AF_INET6:
		ver = IPV6_VERSION >> 4;
		break;
	case AF_UNSPEC:
		ver = 0;
		break;
	default:
		abort();
	}

	/*
	 * The memory store is populated with:
	 * - BPF_MW_IPVER: IP version (4 or 6).
	 * - BPF_MW_L4OFF: L4 header offset.
	 * - BPF_MW_L4PROTO: L4 protocol.
	 */
	if ((ctx->flags & FETCHED_L3) == 0 || (af && ctx->af == 0)) {
		const uint8_t jt = ver ? 0 : JUMP_MAGIC;
		const uint8_t jf = ver ? JUMP_MAGIC : 0;
		const bool ingroup = ctx->ingroup != 0;
		const bool invert = ctx->invert;

		/*
		 * L3 block cannot be inserted in the middle of a group.
		 * In fact, it never is.  Check and start the group after.
		 */
		if (ingroup) {
			assert(ctx->nblocks == ctx->gblock);
			npfctl_bpf_group_exit(ctx);
		}

		/*
		 * A <- IP version; A == expected-version?
		 * If no particular version specified, check for non-zero.
		 */
		struct bpf_insn insns_af[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_IPVER),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ver, jt, jf),
		};
		add_insns(ctx, insns_af, __arraycount(insns_af));
		ctx->flags |= FETCHED_L3;
		ctx->af = af;

		if (af) {
			uint32_t mwords[] = { BM_IPVER, 1, af };
			add_bmarks(ctx, mwords, sizeof(mwords));
		}
		if (ingroup) {
			npfctl_bpf_group_enter(ctx, invert);
		}

	} else if (af && af != ctx->af) {
		errx(EXIT_FAILURE, "address family mismatch");
	}

	if ((flags & X_EQ_L4OFF) != 0 && (ctx->flags & X_EQ_L4OFF) == 0) {
		/* X <- IP header length */
		struct bpf_insn insns_hlen[] = {
			BPF_STMT(BPF_LDX+BPF_MEM, BPF_MW_L4OFF),
		};
		add_insns(ctx, insns_hlen, __arraycount(insns_hlen));
		ctx->flags |= X_EQ_L4OFF;
	}
}

void
fetch_ether_type(npf_bpf_t *ctx, uint16_t type)
{
	if ((ctx->flags & FETCHED_L2) != 0 || (type && ctx->eth_type != 0))
		return;

	const uint8_t jt = type ? 0 : JUMP_MAGIC;
	const uint8_t jf = type ? JUMP_MAGIC : 0;
	const bool ingroup = ctx->ingroup != 0;
	const bool invert = ctx->invert;
	unsigned off = offsetof(struct ether_header, ether_type);

	/*
	 * L2 block cannot be inserted in the middle of a group.
	 * Check and start the group after.
	 */
	if (ingroup) {
		assert(ctx->nblocks == ctx->gblock);
		npfctl_bpf_group_exit(ctx);
	}

	type = ntohs(type);

	struct bpf_insn insns_et[] = {
		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, off),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, type, jt, jf),
	};
	add_insns(ctx, insns_et, __arraycount(insns_et));
	ctx->flags |= FETCHED_L2;
	ctx->eth_type = type;

	if (type) {	/* bookmark ether type */
		uint32_t mwords[] = { BM_ETHER_TYPE, 1, htons(type) };
		add_bmarks(ctx, mwords, sizeof(mwords));
	}
	if (ingroup) {
		npfctl_bpf_group_enter(ctx, invert);
	}
}

static void
bm_invert_checkpoint(npf_bpf_t *ctx, const unsigned opts, uint32_t layer)
{
	uint32_t bm = 0;

	if (ctx->ingroup && ctx->invert) {
		const unsigned seen = ctx->invflags;

		if ((opts & MATCH_SRC) != 0 && (seen & MATCH_SRC) == 0) {
			bm = (layer & NPF_RULE_LAYER_3) ? BM_SRC_NEG : BM_SRC_ENEG;
		}
		if ((opts & MATCH_DST) != 0 && (seen & MATCH_DST) == 0) {
			bm = (layer & NPF_RULE_LAYER_3) ? BM_DST_NEG : BM_DST_ENEG;
		}
		ctx->invflags |= opts & (MATCH_SRC | MATCH_DST);
	}
	if (bm) {
		uint32_t mwords[] = { bm, 0 };
		add_bmarks(ctx, mwords, sizeof(mwords));
	}
}

/*
 * npfctl_bpf_ipver: match the IP version.
 */
void
npfctl_bpf_ipver(npf_bpf_t *ctx, sa_family_t af)
{
	fetch_l3(ctx, af, 0);
}

/*
 * npfctl_bpf_proto: code block to match IP version and L4 protocol.
 */
void
npfctl_bpf_proto(npf_bpf_t *ctx, unsigned proto)
{
	struct bpf_insn insns_proto[] = {
		/* A <- L4 protocol; A == expected-protocol? */
		BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, proto, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_proto, __arraycount(insns_proto));

	uint32_t mwords[] = { BM_PROTO, 1, proto };
	done_block(ctx, mwords, sizeof(mwords));
	ctx->flags |= CHECKED_L4_PROTO;
}

/*
 * npfctl_bpf_cidr: code block to match IPv4 or IPv6 CIDR.
 *
 * => IP address shall be in the network byte order.
 */
void
npfctl_bpf_cidr(npf_bpf_t *ctx, unsigned opts, sa_family_t af,
    const npf_addr_t *addr, const npf_netmask_t mask)
{
	const uint32_t *awords = (const uint32_t *)addr;
	unsigned nwords, origlength, length, maxmask, off;

	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
	assert((mask && mask <= NPF_MAX_NETMASK) || mask == NPF_NO_NETMASK);

	switch (af) {
	case AF_INET:
		maxmask = 32;
		off = (opts & MATCH_SRC) ?
		    offsetof(struct ip, ip_src) :
		    offsetof(struct ip, ip_dst);
		nwords = sizeof(struct in_addr) / sizeof(uint32_t);
		break;
	case AF_INET6:
		maxmask = 128;
		off = (opts & MATCH_SRC) ?
		    offsetof(struct ip6_hdr, ip6_src) :
		    offsetof(struct ip6_hdr, ip6_dst);
		nwords = sizeof(struct in6_addr) / sizeof(uint32_t);
		break;
	default:
		abort();
	}

	/* Ensure address family. */
	fetch_l3(ctx, af, 0);

	length = origlength = (mask == NPF_NO_NETMASK) ? maxmask : mask;

	/* CAUTION: BPF operates in host byte-order. */
	for (unsigned i = 0; i < nwords; i++) {
		const unsigned woff = i * sizeof(uint32_t);
		uint32_t word = ntohl(awords[i]);
		uint32_t wordmask;

		if (length >= 32) {
			/* The mask is a full word - do not apply it. */
			wordmask = 0;
			length -= 32;
		} else if (length) {
			wordmask = 0xffffffff << (32 - length);
			length = 0;
		} else {
			/* The mask became zero - skip the rest. */
			break;
		}

		/* A <- IP address (or one word of it) */
		struct bpf_insn insns_ip[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off + woff),
		};
		add_insns(ctx, insns_ip, __arraycount(insns_ip));

		/* A <- (A & MASK) */
		if (wordmask) {
			struct bpf_insn insns_mask[] = {
				BPF_STMT(BPF_ALU+BPF_AND+BPF_K, wordmask),
			};
			add_insns(ctx, insns_mask, __arraycount(insns_mask));
		}

		/*
		 * Determine how many instructions we have to jump
		 * ahead if the match fails.
		 *
		 * - If this is the last word, we jump to the final
		 *   failure, JUMP_MAGIC.
		 *
		 * - If this is not the last word, we jump past the
		 *   remaining instructions to match this sequence.
		 *   Each 32-bit word in the sequence takes two
		 *   instructions (BPF_LD and BPF_JMP).  If there is a
		 *   partial-word mask ahead, there will be one
		 *   additional instruction (BPF_ALU).
		 */
		uint8_t jf;
		if (i + 1 == (origlength + 31)/32) {
			jf = JUMP_MAGIC;
		} else {
			jf = 2*((origlength + 31)/32 - i - 1);
			if (origlength % 32 != 0 && wordmask == 0)
				jf += 1;
		}

		/* A == expected-IP-word ? */
		struct bpf_insn insns_cmp[] = {
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, word, 0, jf),
		};
		add_insns(ctx, insns_cmp, __arraycount(insns_cmp));
	}
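
	/*
	 * Worked example (illustrative): for an IPv6 /50 the loop above
	 * emits two comparisons.  Word 0 is a full word, so it is loaded
	 * and compared with jf = 3, skipping word 1's three instructions
	 * (BPF_LD, BPF_ALU, BPF_JMP).  Word 1 is masked with 0xffffc000
	 * and compared with jf = JUMP_MAGIC, deferring to the common
	 * failure path.
	 */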

	/*
	 * If we checked a chain of words in sequence, mark this as a
	 * multi-word comparison so if this is in a group there will be
	 * a fallthrough case.
	 *
	 * XXX This is a little silly; the compiler should really just
	 * record holes where conditional jumps need success/failure
	 * continuations, and go back to fill in the holes when the
	 * locations of the continuations are determined later.  But
	 * that requires restructuring this code a little more.
	 */
	ctx->multiword = (origlength + 31)/32 > 1;

	uint32_t mwords[] = {
		(opts & MATCH_SRC) ? BM_SRC_CIDR: BM_DST_CIDR, 6,
		af, mask, awords[0], awords[1], awords[2], awords[3],
	};
	bm_invert_checkpoint(ctx, opts, NPF_RULE_LAYER_3);
	done_block(ctx, mwords, sizeof(mwords));
}

/*
 * For an Ethernet address (6 octets, i.e. a word and a halfword),
 * just fetch and compare it directly using a word fetch followed by
 * a halfword fetch.
 */
void
npfctl_bpf_ether(npf_bpf_t *ctx, unsigned opts, struct ether_addr *ether_addr)
{
	uint32_t mac_word;
	uint16_t mac_hword;
	unsigned off;

	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));

	off = (opts & MATCH_SRC) ? offsetof(struct ether_header, ether_shost) :
	    offsetof(struct ether_header, ether_dhost);

	memcpy(&mac_word, ether_addr, sizeof(mac_word));
	mac_word = ntohl(mac_word);

	/* Copy the last two bytes of the 6-byte ether address. */
	memcpy(&mac_hword, (uint8_t *)ether_addr + sizeof(mac_word), sizeof(mac_hword));
	mac_hword = ntohs(mac_hword);

	/* Load and compare the first word, then do the same for the last halfword. */
	struct bpf_insn insns_ether_w[] = {
		BPF_STMT(BPF_LD+BPF_W+BPF_ABS, off),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, mac_word, 0, 2),
	};
	add_insns(ctx, insns_ether_w, __arraycount(insns_ether_w));

	struct bpf_insn insns_ether_h[] = {
		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, off + sizeof(mac_word)),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, mac_hword, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_ether_h, __arraycount(insns_ether_h));

	ctx->multiword = true;

	uint32_t mwords[] = {
		(opts & MATCH_SRC) ? BM_SRC_ETHER: BM_DST_ETHER, 2,
		htonl(mac_word), htons(mac_hword)
	};

	bm_invert_checkpoint(ctx, opts, NPF_RULE_LAYER_2);
	done_block(ctx, mwords, sizeof(mwords));
}

/*
 * npfctl_bpf_ports: code block to match TCP/UDP port range.
 *
 * => Port numbers shall be in the network byte order.
 */
void
npfctl_bpf_ports(npf_bpf_t *ctx, unsigned opts, in_port_t from, in_port_t to)
{
	const unsigned sport_off = offsetof(struct udphdr, uh_sport);
	const unsigned dport_off = offsetof(struct udphdr, uh_dport);
	unsigned off;

	/* TCP and UDP port offsets are the same. */
	assert(sport_off == offsetof(struct tcphdr, th_sport));
	assert(dport_off == offsetof(struct tcphdr, th_dport));
	assert(ctx->flags & CHECKED_L4_PROTO);

	assert(((opts & MATCH_SRC) != 0) ^ ((opts & MATCH_DST) != 0));
	off = (opts & MATCH_SRC) ? sport_off : dport_off;

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	struct bpf_insn insns_fetch[] = {
		/* A <- port */
		BPF_STMT(BPF_LD+BPF_H+BPF_IND, off),
	};
	add_insns(ctx, insns_fetch, __arraycount(insns_fetch));

	/* CAUTION: BPF operates in host byte-order. */
	from = ntohs(from);
	to = ntohs(to);

	if (from == to) {
		/* Single port case. */
		struct bpf_insn insns_port[] = {
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, from, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_port, __arraycount(insns_port));
	} else {
		/* Port range case. */
		struct bpf_insn insns_range[] = {
			BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, from, 0, 1),
			BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, to, 0, 1),
			BPF_STMT(BPF_JMP+BPF_JA, JUMP_MAGIC),
		};
		add_insns(ctx, insns_range, __arraycount(insns_range));
	}

	uint32_t mwords[] = {
		(opts & MATCH_SRC) ? BM_SRC_PORTS : BM_DST_PORTS, 2, from, to
	};
	done_block(ctx, mwords, sizeof(mwords));
}
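
/*
 * Illustrative trace of the port-range fragment above, for a range of
 * 6000-6010 (values already converted to host byte-order):
 *
 *	jge #6000	jt 0, jf 1	; A <  6000: skip to the "ja" below
 *	jgt #6010	jt 0, jf 1	; A >  6010: fall into the "ja"
 *	ja  JUMP_MAGIC			; out of range; patched to the failure path
 *	...				; in range: execution resumes here
 */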

/*
 * npfctl_bpf_tcpfl: code block to match TCP flags.
 */
void
npfctl_bpf_tcpfl(npf_bpf_t *ctx, uint8_t tf, uint8_t tf_mask)
{
	const unsigned tcpfl_off = offsetof(struct tcphdr, th_flags);
	const bool usingmask = tf_mask != tf;

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	if ((ctx->flags & CHECKED_L4_PROTO) == 0) {
		const unsigned jf = usingmask ? 3 : 2;
		assert(ctx->ingroup == 0);

		/*
		 * A <- L4 protocol; A == TCP?  If not, jump out.
		 *
		 * Note: the TCP flag matching might be without 'proto tcp'
		 * when using a plain 'stateful' rule.  In such case it also
		 * handles other protocols, thus no strict TCP check.
		 */
		struct bpf_insn insns_tcp[] = {
			BPF_STMT(BPF_LD+BPF_W+BPF_MEM, BPF_MW_L4PROTO),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, IPPROTO_TCP, 0, jf),
		};
		add_insns(ctx, insns_tcp, __arraycount(insns_tcp));
	}

	struct bpf_insn insns_tf[] = {
		/* A <- TCP flags */
		BPF_STMT(BPF_LD+BPF_B+BPF_IND, tcpfl_off),
	};
	add_insns(ctx, insns_tf, __arraycount(insns_tf));

	if (usingmask) {
		/* A <- (A & mask) */
		struct bpf_insn insns_mask[] = {
			BPF_STMT(BPF_ALU+BPF_AND+BPF_K, tf_mask),
		};
		add_insns(ctx, insns_mask, __arraycount(insns_mask));
	}

	struct bpf_insn insns_cmp[] = {
		/* A == expected-TCP-flags? */
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, tf, 0, JUMP_MAGIC),
	};
	add_insns(ctx, insns_cmp, __arraycount(insns_cmp));

	uint32_t mwords[] = { BM_TCPFL, 2, tf, tf_mask };
	done_block(ctx, mwords, sizeof(mwords));
}
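
/*
 * Illustrative example for npfctl_bpf_tcpfl() above: matching SYN-only
 * segments, i.e. tf = TH_SYN with tf_mask = TH_SYN|TH_ACK, makes
 * usingmask true, so the emitted fragment is roughly:
 *
 *	ld  M[L4PROTO]			; only if the protocol was not
 *	jeq #IPPROTO_TCP  jt 0, jf 3	; checked yet; jf skips the three
 *					; instructions below
 *	ldb [x + th_flags]
 *	and #(TH_SYN|TH_ACK)
 *	jeq #TH_SYN	  jt 0, jf JUMP_MAGIC
 *
 * Without a mask (tf_mask == tf) the "and" is omitted and the guard
 * uses jf = 2 instead.
 */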

/*
 * npfctl_bpf_icmp: code block to match ICMP type and/or code.
 * Note: suitable for both ICMPv4 and ICMPv6.
 */
void
npfctl_bpf_icmp(npf_bpf_t *ctx, int type, int code)
{
	const u_int type_off = offsetof(struct icmp, icmp_type);
	const u_int code_off = offsetof(struct icmp, icmp_code);

	assert(ctx->flags & CHECKED_L4_PROTO);
	assert(offsetof(struct icmp6_hdr, icmp6_type) == type_off);
	assert(offsetof(struct icmp6_hdr, icmp6_code) == code_off);
	assert(type != -1 || code != -1);

	/* X <- IP header length */
	fetch_l3(ctx, AF_UNSPEC, X_EQ_L4OFF);

	if (type != -1) {
		struct bpf_insn insns_type[] = {
			BPF_STMT(BPF_LD+BPF_B+BPF_IND, type_off),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, type, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_type, __arraycount(insns_type));

		uint32_t mwords[] = { BM_ICMP_TYPE, 1, type };
		done_block(ctx, mwords, sizeof(mwords));
	}

	if (code != -1) {
		struct bpf_insn insns_code[] = {
			BPF_STMT(BPF_LD+BPF_B+BPF_IND, code_off),
			BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, code, 0, JUMP_MAGIC),
		};
		add_insns(ctx, insns_code, __arraycount(insns_code));

		uint32_t mwords[] = { BM_ICMP_CODE, 1, code };
		done_block(ctx, mwords, sizeof(mwords));
	}
}

#define	SRC_FLAG_BIT	(1U << 31)

/*
 * npfctl_bpf_table: code block to match the source/destination IP address
 * against an NPF table specified by ID.
 */
void
npfctl_bpf_table(npf_bpf_t *ctx, unsigned opts, unsigned tid)
{
	const bool src = (opts & MATCH_SRC) != 0;

	struct bpf_insn insns_table[] = {
		BPF_STMT(BPF_LD+BPF_IMM, (src ? SRC_FLAG_BIT : 0) | tid),
		BPF_STMT(BPF_MISC+BPF_COP, NPF_COP_TABLE),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, JUMP_MAGIC, 0),
	};
	add_insns(ctx, insns_table, __arraycount(insns_table));

	uint32_t mwords[] = { src ? BM_SRC_TABLE: BM_DST_TABLE, 1, tid };
	bm_invert_checkpoint(ctx, opts, NPF_RULE_LAYER_3);
	done_block(ctx, mwords, sizeof(mwords));
}
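
/*
 * Illustrative only: a minimal sketch of how a caller might drive this
 * module to build the byte-code for a rule matching TCP with destination
 * port 80.  The real driver is the npfctl rule-building code; error
 * handling and attaching the result to the rule are omitted here.
 *
 *	npf_bpf_t *bc = npfctl_bpf_create();
 *
 *	npfctl_bpf_proto(bc, IPPROTO_TCP);
 *	npfctl_bpf_ports(bc, MATCH_DST, htons(80), htons(80));
 *
 *	struct bpf_program *bp = npfctl_bpf_complete(bc);
 *	if (bp != NULL) {
 *		size_t len;
 *		const void *marks = npfctl_bpf_bmarks(bc, &len);
 *		... copy bp->bf_insns (bp->bf_len instructions) and the
 *		    marks before destroying the context ...
 *	}
 *	npfctl_bpf_destroy(bc);
 */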