1 /* 2 * xdp-server.c -- integration of AF_XDP into nsd 3 * 4 * Copyright (c) 2024, NLnet Labs. All rights reserved. 5 * 6 * See LICENSE for the license. 7 * 8 */ 9 10 /* 11 * Parts inspired by https://github.com/xdp-project/xdp-tutorial 12 */ 13 14 #include "config.h" 15 16 #ifdef USE_XDP 17 18 #include <assert.h> 19 #include <errno.h> 20 #include <netinet/in.h> 21 #include <stdint.h> 22 #include <stdio.h> 23 #include <stdlib.h> 24 #include <string.h> 25 #include <sys/types.h> 26 #include <unistd.h> 27 #include <linux/limits.h> 28 #include <sys/mman.h> 29 30 #include <sys/poll.h> 31 #include <sys/resource.h> 32 33 /* #include <bpf/bpf.h> */ 34 #include <xdp/xsk.h> 35 #include <xdp/libxdp.h> 36 #include <bpf/libbpf.h> 37 38 #include <arpa/inet.h> 39 #include <linux/icmpv6.h> 40 #include <linux/if_ether.h> 41 #include <linux/ipv6.h> 42 #include <linux/ip.h> 43 #include <linux/udp.h> 44 #include <net/if.h> 45 46 #include <arpa/inet.h> 47 #include <netdb.h> 48 #include <ifaddrs.h> 49 #include <linux/if_link.h> 50 51 #include "query.h" 52 #include "dns.h" 53 #include "util.h" 54 #include "xdp-server.h" 55 #include "xdp-util.h" 56 #include "nsd.h" 57 58 // TODO: make configurable 59 #define DNS_PORT 53 60 61 struct xdp_config { 62 __u32 xdp_flags; 63 __u32 libxdp_flags; 64 __u16 xsk_bind_flags; 65 }; 66 67 struct umem_ptr { 68 uint64_t addr; 69 uint32_t len; 70 }; 71 72 static struct umem_ptr umem_ptrs[XDP_RX_BATCH_SIZE]; 73 74 /* 75 * Allocate memory for UMEM and setup rings 76 */ 77 static int 78 xsk_configure_umem(struct xsk_umem_info *umem_info, uint64_t size); 79 80 /* 81 * Retrieve a UMEM frame address for allocation 82 * 83 * Returns XDP_INVALID_UMEM_FRAME when there are no free frames available. 84 */ 85 static uint64_t xsk_alloc_umem_frame(struct xsk_socket_info *xsk); 86 87 /* 88 * Bind AF_XDP socket and setup rings 89 */ 90 static int xsk_configure_socket(struct xdp_server *xdp, 91 struct xsk_socket_info *xsk_info, 92 struct xsk_umem_info *umem, 93 uint32_t queue_index); 94 95 /* 96 * Get number of free frames in UMEM 97 */ 98 static uint64_t xsk_umem_free_frames(struct xsk_socket_info *xsk); 99 100 /* 101 * Free a frame in UMEM 102 */ 103 static void xsk_free_umem_frame(struct xsk_socket_info *xsk, uint64_t frame); 104 105 /* 106 * Fill fill ring with as many frames as possible 107 */ 108 static void fill_fq(struct xsk_socket_info *xsk); 109 110 /* 111 * Load eBPF program to forward traffic to our socket 112 */ 113 static int load_xdp_program_and_map(struct xdp_server *xdp); 114 115 /* 116 * Unload eBPF/XDP program 117 */ 118 static void unload_xdp_program(struct xdp_server *xdp); 119 120 /* 121 * Figure out IP addresses to listen to. 122 */ 123 static int figure_ip_addresses(struct xdp_server *xdp); 124 125 /* 126 * Add IP address to allowed destination addresses for incoming packets 127 */ 128 static void add_ip_address(struct xdp_server *xdp, 129 struct sockaddr_storage *addr); 130 131 /* 132 * Check whether destination IPv4 is in allowed IPs list 133 */ 134 static int dest_ip_allowed4(struct xdp_server *xdp, struct iphdr *ipv4); 135 136 /* 137 * Check whether destination IPv6 is in allowed IPs list 138 */ 139 static int dest_ip_allowed6(struct xdp_server *xdp, struct ipv6hdr *ipv6); 140 141 /* 142 * Setup XDP sockets 143 */ 144 static int xdp_sockets_init(struct xdp_server *xdp); 145 146 /* 147 * Cleanup XDP sockets and memory 148 */ 149 static void xdp_sockets_cleanup(struct xdp_server *xdp); 150 151 /* 152 * Allocate a block of shared memory 153 */ 154 static void *alloc_shared_mem(size_t len); 155 156 /* 157 * Collect free frames from completion queue 158 */ 159 static void drain_cq(struct xsk_socket_info *xsk); 160 161 /* 162 * Send outstanding packets and recollect completed frame addresses 163 */ 164 static void handle_tx(struct xsk_socket_info *xsk); 165 166 /* 167 * Process packet and indicate if it should be dropped 168 * return 0 or less => drop 169 * return greater than 0 => use for tx 170 */ 171 static int 172 process_packet(struct xdp_server *xdp, 173 uint8_t *pkt, 174 uint32_t *len, 175 struct query *query); 176 177 static inline void swap_eth(struct ethhdr *eth); 178 static inline void swap_udp(struct udphdr *udp); 179 static inline void swap_ipv6(struct ipv6hdr *ipv6); 180 static inline void swap_ipv4(struct iphdr *ipv4); 181 static inline void *parse_udp(struct udphdr *udp); 182 static inline void *parse_ipv6(struct ipv6hdr *ipv6); 183 static inline void *parse_ipv4(struct iphdr *ipv4); 184 185 /* 186 * Parse dns message and return new length of dns message 187 */ 188 static uint32_t parse_dns(struct nsd* nsd, 189 uint32_t dnslen, 190 struct query *q, 191 sa_family_t ai_family); 192 193 /* *************** */ 194 /* Implementations */ 195 /* *************** */ 196 197 static uint64_t xsk_alloc_umem_frame(struct xsk_socket_info *xsk) { 198 uint64_t frame; 199 if (xsk->umem->umem_frame_free == 0) { 200 return XDP_INVALID_UMEM_FRAME; 201 } 202 203 frame = xsk->umem->umem_frame_addr[--xsk->umem->umem_frame_free]; 204 xsk->umem->umem_frame_addr[xsk->umem->umem_frame_free] = 205 XDP_INVALID_UMEM_FRAME; 206 return frame; 207 } 208 209 static uint64_t xsk_umem_free_frames(struct xsk_socket_info *xsk) { 210 return xsk->umem->umem_frame_free; 211 } 212 213 static void xsk_free_umem_frame(struct xsk_socket_info *xsk, uint64_t frame) { 214 assert(xsk->umem->umem_frame_free < XDP_NUM_FRAMES); 215 xsk->umem->umem_frame_addr[xsk->umem->umem_frame_free++] = frame; 216 } 217 218 static void fill_fq(struct xsk_socket_info *xsk) { 219 uint32_t stock_frames; 220 uint32_t idx_fq = 0; 221 /* fill the fill ring with as many frames as are available */ 222 /* get number of spots available in fq */ 223 stock_frames = xsk_prod_nb_free(&xsk->umem->fq, 224 (uint32_t) xsk_umem_free_frames(xsk)); 225 if (stock_frames > 0) { 226 /* ignoring prod__reserve return value, because we got stock_frames 227 * from xsk_prod_nb_free(), which are therefore available */ 228 xsk_ring_prod__reserve(&xsk->umem->fq, stock_frames, &idx_fq); 229 230 for (uint32_t i = 0; i < stock_frames; ++i) { 231 /* TODO: handle lack of available frames? 232 * Is not necessary when the total amount of frames exceeds the 233 * total slots available across all queues combined */ 234 /* uint64_t frame = xsk_alloc_umem_frame(xsk); */ 235 /* if (frame == XDP_INVALID_UMEM_FRAME) */ 236 /* printf("xdp: trying to fill_addr INVALID UMEM FRAME"); */ 237 *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = 238 xsk_alloc_umem_frame(xsk); 239 } 240 241 xsk_ring_prod__submit(&xsk->umem->fq, stock_frames); 242 } 243 } 244 245 static int load_xdp_program_and_map(struct xdp_server *xdp) { 246 struct bpf_map *map; 247 char errmsg[512]; 248 int err, ret; 249 /* UNSPEC => let libxdp decide */ 250 // TODO: put this into a config option as well? 251 enum xdp_attach_mode attach_mode = XDP_MODE_UNSPEC; 252 253 DECLARE_LIBXDP_OPTS(bpf_object_open_opts, opts); 254 if (xdp->bpf_bpffs_path) 255 opts.pin_root_path = xdp->bpf_bpffs_path; 256 257 /* for now our xdp program should contain just one program section */ 258 // TODO: look at xdp_program__create because it can take a pinned prog 259 xdp->bpf_prog = xdp_program__open_file(xdp->bpf_prog_filename, NULL, &opts); 260 261 // conversion should be fine, libxdp errors shouldn't exceed (int), 262 // also libxdp_strerr takes int anyway... 263 err = (int) libxdp_get_error(xdp->bpf_prog); 264 if (err) { 265 libxdp_strerror(err, errmsg, sizeof(errmsg)); 266 log_msg(LOG_ERR, "xdp: could not open xdp program: %s\n", errmsg); 267 return err; 268 } 269 270 if (xdp->bpf_prog_should_load) { 271 /* TODO: I find setting environment variables from within a program 272 * not a good thing to do, but for the meantime this helps... */ 273 /* This is done to allow unloading the XDP program we load without 274 * needing the SYS_ADMIN capability, and libxdp doesn't allow skipping 275 * the dispatcher through different means. */ 276 putenv("LIBXDP_SKIP_DISPATCHER=1"); 277 err = xdp_program__attach(xdp->bpf_prog, (int) xdp->interface_index, attach_mode, 0); 278 /* err = xdp_program__attach_single(xdp->bpf_prog, xdp->interface_index, attach_mode); */ 279 if (err) { 280 libxdp_strerror(err, errmsg, sizeof(errmsg)); 281 log_msg(LOG_ERR, "xdp: could not attach xdp program to interface '%s' : %s\n", 282 xdp->interface_name, errmsg); 283 return err; 284 } 285 286 xdp->bpf_prog_fd = xdp_program__fd(xdp->bpf_prog); 287 xdp->bpf_prog_id = xdp_program__id(xdp->bpf_prog); 288 289 /* We also need to get the file descriptor to the xsks_map */ 290 map = bpf_object__find_map_by_name(xdp_program__bpf_obj(xdp->bpf_prog), "xsks_map"); 291 ret = bpf_map__fd(map); 292 if (ret < 0) { 293 log_msg(LOG_ERR, "xdp: no xsks map found in xdp program: %s\n", strerror(ret)); 294 return ret; 295 } 296 xdp->xsk_map_fd = ret; 297 xdp->xsk_map = map; 298 } else { 299 char map_path[PATH_MAX]; 300 int fd; 301 302 snprintf(map_path, PATH_MAX, "%s/%s", xdp->bpf_bpffs_path, "xsks_map"); 303 304 fd = bpf_obj_get(map_path); 305 if (fd < 0) { 306 log_msg(LOG_ERR, "xdp: could not retrieve xsks_map pin from %s: %s", map_path, strerror(errno)); 307 return fd; 308 } 309 310 map = bpf_object__find_map_by_name(xdp_program__bpf_obj(xdp->bpf_prog), "xsks_map"); 311 if ((ret = bpf_map__reuse_fd(map, fd))) { 312 log_msg(LOG_ERR, "xdp: could not re-use xsks_map: %s\n", strerror(errno)); 313 return ret; 314 } 315 316 xdp->xsk_map_fd = fd; 317 xdp->xsk_map = map; 318 } 319 320 return 0; 321 } 322 323 static int 324 xsk_configure_umem(struct xsk_umem_info *umem_info, uint64_t size) { 325 int ret; 326 struct xsk_umem_config umem_config = { 327 .fill_size = XSK_RING_PROD__NUM_DESCS, 328 .comp_size = XSK_RING_CONS__NUM_DESCS, 329 .frame_size = XDP_FRAME_SIZE, 330 .frame_headroom = XSK_UMEM_FRAME_HEADROOM, 331 .flags = XSK_UMEM_FLAGS, 332 }; 333 334 ret = xsk_umem__create(&umem_info->umem, umem_info->buffer, size, &umem_info->fq, &umem_info->cq, &umem_config); 335 if (ret) { 336 errno = -ret; 337 return ret; 338 } 339 340 return 0; 341 } 342 343 static int 344 xsk_configure_socket(struct xdp_server *xdp, struct xsk_socket_info *xsk_info, 345 struct xsk_umem_info *umem, uint32_t queue_index) { 346 struct xsk_socket_config xsk_cfg; 347 uint32_t idx, reserved; 348 int ret; 349 350 struct xdp_config cfg = { 351 .xdp_flags = 0, 352 .xsk_bind_flags = 0, 353 .libxdp_flags = XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD, 354 }; 355 356 uint16_t xsk_bind_flags = XDP_USE_NEED_WAKEUP; 357 if (xdp->force_copy) { 358 xsk_bind_flags |= XDP_COPY; 359 } 360 cfg.xsk_bind_flags = xsk_bind_flags; 361 362 xsk_info->umem = umem; 363 xsk_cfg.rx_size = XSK_RING_CONS__NUM_DESCS; 364 xsk_cfg.tx_size = XSK_RING_PROD__NUM_DESCS; 365 xsk_cfg.xdp_flags = cfg.xdp_flags; 366 xsk_cfg.bind_flags = cfg.xsk_bind_flags; 367 xsk_cfg.libxdp_flags = cfg.libxdp_flags; 368 369 ret = xsk_socket__create(&xsk_info->xsk, 370 xdp->interface_name, 371 queue_index, 372 umem->umem, 373 &xsk_info->rx, 374 &xsk_info->tx, 375 &xsk_cfg); 376 if (ret) { 377 log_msg(LOG_ERR, "xdp: failed to create xsk_socket"); 378 goto error_exit; 379 } 380 381 ret = xsk_socket__update_xskmap(xsk_info->xsk, xdp->xsk_map_fd); 382 if (ret) { 383 log_msg(LOG_ERR, "xdp: failed to update xskmap"); 384 goto error_exit; 385 } 386 387 /* Initialize umem frame allocation */ 388 for (uint32_t i = 0; i < XDP_NUM_FRAMES; ++i) { 389 xsk_info->umem->umem_frame_addr[i] = i * XDP_FRAME_SIZE; 390 } 391 392 xsk_info->umem->umem_frame_free = XDP_NUM_FRAMES; 393 394 reserved = xsk_ring_prod__reserve(&xsk_info->umem->fq, 395 XSK_RING_PROD__NUM_DESCS, 396 &idx); 397 398 if (reserved != XSK_RING_PROD__NUM_DESCS) { 399 log_msg(LOG_ERR, 400 "xdp: amount of reserved addr not as expected (is %d)", reserved); 401 // "ENOMEM 12 Cannot allocate memory" is the closest to the 402 // error that not as much memory was reserved as expected 403 ret = -12; 404 goto error_exit; 405 } 406 407 for (uint32_t i = 0; i < XSK_RING_PROD__NUM_DESCS; ++i) { 408 *xsk_ring_prod__fill_addr(&xsk_info->umem->fq, idx++) = 409 xsk_alloc_umem_frame(xsk_info); 410 } 411 412 xsk_ring_prod__submit(&xsk_info->umem->fq, XSK_RING_PROD__NUM_DESCS); 413 414 return 0; 415 416 error_exit: 417 errno = -ret; 418 return ret; 419 } 420 421 static void *alloc_shared_mem(size_t len) { 422 /* MAP_ANONYMOUS memory is initialized with zero */ 423 return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); 424 } 425 426 static int xdp_sockets_init(struct xdp_server *xdp) { 427 size_t umems_len = sizeof(struct xsk_umem_info) * xdp->queue_count; 428 size_t xsks_len = sizeof(struct xsk_socket_info) * xdp->queue_count; 429 430 xdp->umems = (struct xsk_umem_info *) alloc_shared_mem(umems_len); 431 if (xdp->umems == MAP_FAILED) { 432 log_msg(LOG_ERR, 433 "xdp: failed to allocate shared memory for umem info: %s", 434 strerror(errno)); 435 return -1; 436 } 437 438 xdp->xsks = (struct xsk_socket_info *) alloc_shared_mem(xsks_len); 439 if (xdp->xsks == MAP_FAILED) { 440 log_msg(LOG_ERR, 441 "xdp: failed to allocate shared memory for xsk info: %s", 442 strerror(errno)); 443 return -1; 444 } 445 446 for (uint32_t q_idx = 0; q_idx < xdp->queue_count; ++q_idx) { 447 /* mmap is supposedly page-aligned, so should be fine */ 448 xdp->umems[q_idx].buffer = alloc_shared_mem(XDP_BUFFER_SIZE); 449 450 if (xsk_configure_umem(&xdp->umems[q_idx], XDP_BUFFER_SIZE)) { 451 log_msg(LOG_ERR, "xdp: cannot create umem: %s", strerror(errno)); 452 goto out_err_umem; 453 } 454 455 if (xsk_configure_socket(xdp, &xdp->xsks[q_idx], &xdp->umems[q_idx], 456 q_idx)) { 457 log_msg(LOG_ERR, 458 "xdp: cannot create AF_XDP socket: %s", 459 strerror(errno)); 460 goto out_err_xsk; 461 } 462 } 463 464 return 0; 465 466 out_err_xsk: 467 for (uint32_t i = 0; i < xdp->queue_count; ++i) 468 xsk_umem__delete(xdp->umems[i].umem); 469 470 out_err_umem: 471 return -1; 472 } 473 474 static void xdp_sockets_cleanup(struct xdp_server *xdp) { 475 for (uint32_t i = 0; i < xdp->queue_count; ++i) { 476 xsk_socket__delete(xdp->xsks[i].xsk); 477 xsk_umem__delete(xdp->umems[i].umem); 478 } 479 } 480 481 int xdp_server_init(struct xdp_server *xdp) { 482 struct rlimit rlim = {RLIM_INFINITY, RLIM_INFINITY}; 483 484 /* check if interface name exists */ 485 xdp->interface_index = if_nametoindex(xdp->interface_name); 486 if (xdp->interface_index == 0) { 487 log_msg(LOG_ERR, "xdp: configured xdp-interface (%s) is unknown: %s", 488 xdp->interface_name, strerror(errno)); 489 return -1; 490 } 491 492 /* (optionally) load xdp program and (definitely) set xsks_map_fd */ 493 if (load_xdp_program_and_map(xdp)) { 494 log_msg(LOG_ERR, "xdp: failed to load/pin xdp program/map"); 495 return -1; 496 } 497 498 /* if we don't do set rlimit, libbpf does it */ 499 /* this either has to be done before privilege drop or 500 * requires CAP_SYS_RESOURCE */ 501 if (setrlimit(RLIMIT_MEMLOCK, &rlim)) { 502 log_msg(LOG_ERR, "xdp: cannot adjust rlimit (RLIMIT_MEMLOCK): \"%s\"\n", 503 strerror(errno)); 504 return -1; 505 } 506 507 if (xdp_sockets_init(xdp)) 508 return -1; 509 510 for (int i = 0; i < XDP_RX_BATCH_SIZE; ++i) { 511 umem_ptrs[i].addr = XDP_INVALID_UMEM_FRAME; 512 umem_ptrs[i].len = 0; 513 } 514 515 if (!xdp->ip_addresses) 516 figure_ip_addresses(xdp); 517 518 return 0; 519 } 520 521 void xdp_server_cleanup(struct xdp_server *xdp) { 522 xdp_sockets_cleanup(xdp); 523 524 /* only unpin if we loaded the program */ 525 if (xdp->bpf_prog_should_load) { 526 if (xdp->xsk_map && bpf_map__is_pinned(xdp->xsk_map)) { 527 if (bpf_map__unpin(xdp->xsk_map, NULL)) { 528 /* We currently ship an XDP program that doesn't pin the map. So 529 * if this error happens, it is because the user specified their 530 * custom XDP program to load by NSD. Therefore they should know 531 * about the pinned map and be able to unlink it themselves. 532 */ 533 log_msg(LOG_ERR, "xdp: failed to unpin bpf map during cleanup: \"%s\". " 534 "This is usually ok, but you need to unpin the map yourself. " 535 "This can usually be fixed by executing chmod o+wx %s\n", 536 strerror(errno), xdp->bpf_bpffs_path); 537 } 538 } 539 540 unload_xdp_program(xdp); 541 } 542 } 543 544 static void unload_xdp_program(struct xdp_server *xdp) { 545 DECLARE_LIBBPF_OPTS(bpf_xdp_attach_opts, bpf_opts, 546 .old_prog_fd = xdp->bpf_prog_fd); 547 548 log_msg(LOG_INFO, "xdp: detaching xdp program %u from %s\n", 549 xdp->bpf_prog_id, xdp->interface_name); 550 551 if (bpf_xdp_detach((int) xdp->interface_index, 0, &bpf_opts)) 552 log_msg(LOG_ERR, "xdp: failed to detach xdp program: %s\n", 553 strerror(errno)); 554 } 555 556 static int dest_ip_allowed6(struct xdp_server *xdp, struct ipv6hdr *ipv6) { 557 struct xdp_ip_address *ip = xdp->ip_addresses; 558 if (!ip) 559 // no IPs available, allowing all 560 return 1; 561 562 while (ip) { 563 if (ip->addr.ss_family == AF_INET6 && 564 !memcmp(&(((struct sockaddr_in6 *) &ip->addr)->sin6_addr), 565 &ipv6->daddr, 566 sizeof(struct in6_addr))) 567 return 1; 568 ip = ip->next; 569 } 570 571 return 0; 572 } 573 574 static int dest_ip_allowed4(struct xdp_server *xdp, struct iphdr *ipv4) { 575 struct xdp_ip_address *ip = xdp->ip_addresses; 576 if (!ip) 577 // no IPs available, allowing all 578 return 1; 579 580 while (ip) { 581 if (ip->addr.ss_family == AF_INET && 582 ipv4->daddr == ((struct sockaddr_in *) &ip->addr)->sin_addr.s_addr) 583 return 1; 584 ip = ip->next; 585 } 586 587 return 0; 588 } 589 590 static void 591 add_ip_address(struct xdp_server *xdp, struct sockaddr_storage *addr) { 592 struct xdp_ip_address *ip = xdp->ip_addresses; 593 if (!ip) { 594 xdp->ip_addresses = region_alloc_zero(xdp->region, 595 sizeof(struct xdp_ip_address)); 596 ip = xdp->ip_addresses; 597 } else { 598 while (ip->next) 599 ip = ip->next; 600 601 ip->next = region_alloc_zero(xdp->region, 602 sizeof(struct xdp_ip_address)); 603 ip = ip->next; 604 } 605 606 memcpy(&ip->addr, addr, sizeof(struct sockaddr_storage)); 607 } 608 609 static int figure_ip_addresses(struct xdp_server *xdp) { 610 // TODO: if using VLANs, also find appropriate IP addresses? 611 struct ifaddrs *ifaddr; 612 int family, ret = 0; 613 614 if (getifaddrs(&ifaddr) == -1) { 615 log_msg(LOG_ERR, "xdp: couldn't determine local IP addresses. " 616 "Serving all IP addresses now"); 617 return -1; 618 } 619 620 for (struct ifaddrs *ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { 621 if (ifa->ifa_addr == NULL) 622 continue; 623 624 if (strcmp(ifa->ifa_name, xdp->interface_name)) 625 continue; 626 627 family = ifa->ifa_addr->sa_family; 628 629 switch (family) { 630 default: 631 continue; 632 case AF_INET: 633 case AF_INET6: 634 add_ip_address(xdp, (struct sockaddr_storage *) ifa->ifa_addr); 635 } 636 } 637 638 freeifaddrs(ifaddr); 639 return ret; 640 } 641 642 static inline void swap_eth(struct ethhdr *eth) { 643 uint8_t tmp_mac[ETH_ALEN]; 644 memcpy(tmp_mac, eth->h_dest, ETH_ALEN); 645 memcpy(eth->h_dest, eth->h_source, ETH_ALEN); 646 memcpy(eth->h_source, tmp_mac, ETH_ALEN); 647 } 648 649 static inline void swap_udp(struct udphdr *udp) { 650 uint16_t tmp_port; /* not touching endianness */ 651 tmp_port = udp->source; 652 udp->source = udp->dest; 653 udp->dest = tmp_port; 654 } 655 656 static inline void swap_ipv6(struct ipv6hdr *ipv6) { 657 struct in6_addr tmp_ip; 658 memcpy(&tmp_ip, &ipv6->saddr, sizeof(tmp_ip)); 659 memcpy(&ipv6->saddr, &ipv6->daddr, sizeof(tmp_ip)); 660 memcpy(&ipv6->daddr, &tmp_ip, sizeof(tmp_ip)); 661 } 662 663 static inline void swap_ipv4(struct iphdr *ipv4) { 664 struct in_addr tmp_ip; 665 memcpy(&tmp_ip, &ipv4->saddr, sizeof(tmp_ip)); 666 memcpy(&ipv4->saddr, &ipv4->daddr, sizeof(tmp_ip)); 667 memcpy(&ipv4->daddr, &tmp_ip, sizeof(tmp_ip)); 668 } 669 670 static inline void *parse_udp(struct udphdr *udp) { 671 if (ntohs(udp->dest) != DNS_PORT) 672 return NULL; 673 674 return (void *)(udp + 1); 675 } 676 677 static inline void *parse_ipv6(struct ipv6hdr *ipv6) { 678 if (ipv6->nexthdr != IPPROTO_UDP) 679 return NULL; 680 681 return (void *)(ipv6 + 1); 682 } 683 684 static inline void *parse_ipv4(struct iphdr *ipv4) { 685 if (ipv4->protocol != IPPROTO_UDP) 686 return NULL; 687 688 return (void *)(ipv4 + 1); 689 } 690 691 static uint32_t parse_dns(struct nsd* nsd, uint32_t dnslen, 692 struct query *q, sa_family_t ai_family) { 693 /* TODO: implement DNSTAP, PROXYv2, ...? */ 694 uint32_t now = 0; 695 696 /* set the size of the dns message and move position to start */ 697 buffer_skip(q->packet, dnslen); 698 buffer_flip(q->packet); 699 700 if (query_process(q, nsd, &now) != QUERY_DISCARDED) { 701 if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) { 702 STATUP(nsd, nona); 703 ZTATUP(nsd, q->zone, nona); 704 } 705 706 #ifdef USE_ZONE_STATS 707 if (ai_family == AF_INET) { 708 ZTATUP(nsd, q->zone, qudp); 709 } else if (ai_family == AF_INET6) { 710 ZTATUP(nsd, q->zone, qudp6); 711 } 712 #endif /* USE_ZONE_STATS */ 713 714 query_add_optional(q, nsd, &now); 715 716 buffer_flip(q->packet); 717 718 #ifdef BIND8_STATS 719 /* Account the rcode & TC... */ 720 STATUP2(nsd, rcode, RCODE(q->packet)); 721 ZTATUP2(nsd, q->zone, rcode, RCODE(q->packet)); 722 if (TC(q->packet)) { 723 STATUP(nsd, truncated); 724 ZTATUP(nsd, q->zone, truncated); 725 } 726 #endif /* BIND8_STATS */ 727 728 /* return new dns message length */ 729 return (uint32_t) buffer_remaining(q->packet); 730 } else { 731 query_reset(q, UDP_MAX_MESSAGE_LEN, 0); 732 STATUP(nsd, dropped); 733 ZTATUP(nsd, q->zone, dropped); 734 return 0; 735 } 736 } 737 738 static int 739 process_packet(struct xdp_server *xdp, uint8_t *pkt, 740 uint32_t *len, struct query *query) { 741 /* log_msg(LOG_INFO, "xdp: received packet with len %d", *len); */ 742 743 uint32_t dnslen = *len; 744 uint32_t data_before_dnshdr_len = 0; 745 746 struct ethhdr *eth = (struct ethhdr *)pkt; 747 struct ipv6hdr *ipv6 = NULL; 748 struct iphdr *ipv4 = NULL; 749 struct udphdr *udp = NULL; 750 void *dnshdr = NULL; 751 752 /* doing the check here, so that the packet/frame is large enough to contain 753 * at least an ethernet header, an ipv4 header (ipv6 header is larger), and 754 * a udp header. 755 */ 756 if (*len < (sizeof(*eth) + sizeof(struct iphdr) + sizeof(*udp))) 757 return -1; 758 759 data_before_dnshdr_len = sizeof(*eth) + sizeof(*udp); 760 761 switch (ntohs(eth->h_proto)) { 762 case ETH_P_IPV6: { 763 ipv6 = (struct ipv6hdr *)(eth + 1); 764 765 if (*len < (sizeof(*eth) + sizeof(*ipv6) + sizeof(*udp))) 766 return -2; 767 if (!(udp = parse_ipv6(ipv6))) 768 return -3; 769 770 dnslen -= (uint32_t) (sizeof(*eth) + sizeof(*ipv6) + sizeof(*udp)); 771 data_before_dnshdr_len += sizeof(*ipv6); 772 773 if (!dest_ip_allowed6(xdp, ipv6)) 774 return -4; 775 776 break; 777 } case ETH_P_IP: { 778 ipv4 = (struct iphdr *)(eth + 1); 779 780 if (!(udp = parse_ipv4(ipv4))) 781 return -5; 782 783 dnslen -= (uint32_t) (sizeof(*eth) + sizeof(*ipv4) + sizeof(*udp)); 784 data_before_dnshdr_len += sizeof(*ipv4); 785 786 if (!dest_ip_allowed4(xdp, ipv4)) 787 return -6; 788 789 break; 790 } 791 792 /* TODO: vlan? */ 793 /* case ETH_P_8021AD: case ETH_P_8021Q: */ 794 /* if (*len < (sizeof(*eth) + sizeof(*vlan))) */ 795 /* break; */ 796 default: 797 return -7; 798 } 799 800 if (!(dnshdr = parse_udp(udp))) 801 return -8; 802 803 query_set_buffer_data(query, dnshdr, XDP_FRAME_SIZE - data_before_dnshdr_len); 804 805 if(ipv6) { 806 #ifdef INET6 807 struct sockaddr_in6* sock6 = (struct sockaddr_in6*)&query->remote_addr; 808 sock6->sin6_family = AF_INET6; 809 sock6->sin6_port = udp->dest; 810 sock6->sin6_flowinfo = 0; 811 sock6->sin6_scope_id = 0; 812 memcpy(&sock6->sin6_addr, &ipv6->saddr, sizeof(ipv6->saddr)); 813 query->remote_addrlen = (socklen_t)sizeof(struct sockaddr_in6); 814 #else 815 return 0; /* no inet6 no network */ 816 #endif /* INET6 */ 817 #ifdef BIND8_STATS 818 STATUP(xdp->nsd, qudp6); 819 #endif /* BIND8_STATS */ 820 } else { 821 struct sockaddr_in* sock4 = (struct sockaddr_in*)&query->remote_addr; 822 sock4->sin_family = AF_INET; 823 sock4->sin_port = udp->dest; 824 sock4->sin_addr.s_addr = ipv4->saddr; 825 query->remote_addrlen = (socklen_t)sizeof(struct sockaddr_in); 826 #ifdef BIND8_STATS 827 STATUP(xdp->nsd, qudp); 828 #endif /* BIND8_STATS */ 829 } 830 831 query->client_addr = query->remote_addr; 832 query->client_addrlen = query->remote_addrlen; 833 query->is_proxied = 0; 834 835 dnslen = parse_dns(xdp->nsd, dnslen, query, query->remote_addr.ss_family); 836 if (!dnslen) { 837 return -9; 838 } 839 840 // Not verifying the packet length (that it fits in an IP packet), as 841 // parse_dns truncates too long response messages. 842 udp->len = htons((uint16_t) (sizeof(*udp) + dnslen)); 843 844 swap_eth(eth); 845 swap_udp(udp); 846 847 if (ipv4) { 848 __be16 ipv4_old_len = ipv4->tot_len; 849 swap_ipv4(ipv4); 850 ipv4->tot_len = htons(sizeof(*ipv4)) + udp->len; 851 csum16_replace(&ipv4->check, ipv4_old_len, ipv4->tot_len); 852 udp->check = calc_csum_udp4(udp, ipv4); 853 } else if (ipv6) { 854 swap_ipv6(ipv6); 855 ipv6->payload_len = udp->len; 856 udp->check = calc_csum_udp6(udp, ipv6); 857 } else { 858 log_msg(LOG_ERR, "xdp: we forgot to implement something... oops"); 859 return 0; 860 } 861 862 /* log_msg(LOG_INFO, "xdp: done with processing the packet"); */ 863 864 *len = data_before_dnshdr_len + dnslen; 865 return 1; 866 } 867 868 void xdp_handle_recv_and_send(struct xdp_server *xdp) { 869 struct xsk_socket_info *xsk = &xdp->xsks[xdp->queue_index]; 870 unsigned int recvd, i, reserved, to_send = 0; 871 uint32_t idx_rx = 0; 872 uint32_t tx_idx = 0; 873 int ret; 874 875 recvd = xsk_ring_cons__peek(&xsk->rx, XDP_RX_BATCH_SIZE, &idx_rx); 876 if (!recvd) { 877 /* no data available */ 878 return; 879 } 880 881 fill_fq(xsk); 882 883 /* Process received packets */ 884 for (i = 0; i < recvd; ++i) { 885 uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; 886 uint32_t len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; 887 888 uint8_t *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); 889 if ((ret = process_packet(xdp, pkt, &len, xdp->queries[i])) <= 0) { 890 /* drop packet */ 891 xsk_free_umem_frame(xsk, addr); 892 } else { 893 umem_ptrs[to_send].addr = addr; 894 umem_ptrs[to_send].len = len; 895 ++to_send; 896 } 897 /* we can reset the query directly after each packet processing, 898 * because the reset does not delete the underlying buffer/data. 899 * However, if we, in future, need to access data from the query 900 * struct when sending the answer, this needs to change. 901 * This also means, that currently a single query instance (and 902 * not an array) would suffice for this implementation. */ 903 query_reset(xdp->queries[i], UDP_MAX_MESSAGE_LEN, 0); 904 905 /* xsk->stats.rx_bytes += len; */ 906 } 907 908 xsk_ring_cons__release(&xsk->rx, recvd); 909 /* xsk->stats.rx_packets += rcvd; */ 910 911 /* Process sending packets */ 912 913 /* TODO: at least send as many packets as slots are available */ 914 reserved = xsk_ring_prod__reserve(&xsk->tx, to_send, &tx_idx); 915 // if we can't reserve to_send frames, we'll get 0 frames, so 916 // no need to "un-reserve" 917 if (reserved != to_send) { 918 // not enough tx slots available, drop packets 919 log_msg(LOG_ERR, "xdp: not enough TX frames available, dropping " 920 "whole batch"); 921 for (i = 0; i < to_send; ++i) { 922 xsk_free_umem_frame(xsk, umem_ptrs[i].addr); 923 umem_ptrs[i].addr = XDP_INVALID_UMEM_FRAME; 924 umem_ptrs[i].len = 0; 925 } 926 #ifdef BIND8_STATS 927 xdp->nsd->st->txerr += to_send; 928 #endif /* BIND8_STATS */ 929 to_send = 0; 930 } 931 932 for (i = 0; i < to_send; ++i) { 933 uint64_t addr = umem_ptrs[i].addr; 934 uint32_t len = umem_ptrs[i].len; 935 xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = addr; 936 xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = len; 937 tx_idx++; 938 xsk->outstanding_tx++; 939 umem_ptrs[i].addr = XDP_INVALID_UMEM_FRAME; 940 umem_ptrs[i].len = 0; 941 } 942 943 xsk_ring_prod__submit(&xsk->tx, to_send); 944 945 /* wake up kernel for tx if needed and collect completed tx buffers */ 946 handle_tx(xsk); 947 /* TODO: maybe call fill_fq(xsk) here as well? */ 948 } 949 950 static void drain_cq(struct xsk_socket_info *xsk) { 951 uint32_t completed, idx_cq; 952 953 /* free completed TX buffers */ 954 completed = xsk_ring_cons__peek(&xsk->umem->cq, 955 XSK_RING_CONS__NUM_DESCS, 956 &idx_cq); 957 958 if (completed > 0) { 959 for (uint32_t i = 0; i < completed; i++) { 960 xsk_free_umem_frame(xsk, *xsk_ring_cons__comp_addr(&xsk->umem->cq, 961 idx_cq++)); 962 } 963 964 xsk_ring_cons__release(&xsk->umem->cq, completed); 965 xsk->outstanding_tx -= completed < xsk->outstanding_tx ? 966 completed : xsk->outstanding_tx; 967 } 968 } 969 970 static void handle_tx(struct xsk_socket_info *xsk) { 971 if (!xsk->outstanding_tx) 972 return; 973 974 if (xsk_ring_prod__needs_wakeup(&xsk->tx)) 975 sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); 976 977 drain_cq(xsk); 978 979 // Update TX-queue pointers 980 // This is not needed, because prod__reserve calls this function too, 981 // and therefore, if not enough frames are free on the cached pointers, 982 // it will update the real pointers. 983 /* xsk_prod_nb_free(&xsk->tx, XSK_RING_PROD__NUM_DESCS/4); */ 984 } 985 986 #endif /* USE_XDP */ 987