/* Home | History | Annotate | Line # | Download | only in dist */
      1 /*
      2  * xdp-server.c -- integration of AF_XDP into nsd
      3  *
      4  * Copyright (c) 2024, NLnet Labs. All rights reserved.
      5  *
      6  * See LICENSE for the license.
      7  *
      8  */
      9 
     10 /*
     11  * Parts inspired by https://github.com/xdp-project/xdp-tutorial
     12  */
     13 
     14 #include "config.h"
     15 
     16 #ifdef USE_XDP
     17 
     18 #include <assert.h>
     19 #include <errno.h>
     20 #include <netinet/in.h>
     21 #include <stdint.h>
     22 #include <stdio.h>
     23 #include <stdlib.h>
     24 #include <string.h>
     25 #include <sys/types.h>
     26 #include <unistd.h>
     27 #include <linux/limits.h>
     28 #include <sys/mman.h>
     29 
     30 #include <sys/poll.h>
     31 #include <sys/resource.h>
     32 
     33 /* #include <bpf/bpf.h> */
     34 #include <xdp/xsk.h>
     35 #include <xdp/libxdp.h>
     36 #include <bpf/libbpf.h>
     37 
     38 #include <arpa/inet.h>
     39 #include <linux/icmpv6.h>
     40 #include <linux/if_ether.h>
     41 #include <linux/ipv6.h>
     42 #include <linux/ip.h>
     43 #include <linux/udp.h>
     44 #include <net/if.h>
     45 
     46 #include <arpa/inet.h>
     47 #include <netdb.h>
     48 #include <ifaddrs.h>
     49 #include <linux/if_link.h>
     50 
     51 #include "query.h"
     52 #include "dns.h"
     53 #include "util.h"
     54 #include "xdp-server.h"
     55 #include "xdp-util.h"
     56 #include "nsd.h"
     57 
// TODO: make configurable
#define DNS_PORT 53

/* Flag sets handed to libxdp/libbpf when creating the AF_XDP socket. */
struct xdp_config {
	__u32 xdp_flags;
	__u32 libxdp_flags;
	__u16 xsk_bind_flags;
};

/* Address and length of one packet inside the UMEM. */
struct umem_ptr {
	uint64_t addr;
	uint32_t len;
};

/* Scratch list of frames queued for TX within one RX batch; reset in
 * xdp_server_init() and used by xdp_handle_recv_and_send(). */
static struct umem_ptr umem_ptrs[XDP_RX_BATCH_SIZE];
     73 
     74 /*
     75  * Allocate memory for UMEM and setup rings
     76  */
     77 static int
     78 xsk_configure_umem(struct xsk_umem_info *umem_info, uint64_t size);
     79 
     80 /*
     81  * Retrieve a UMEM frame address for allocation
     82  *
     83  * Returns XDP_INVALID_UMEM_FRAME when there are no free frames available.
     84  */
     85 static uint64_t xsk_alloc_umem_frame(struct xsk_socket_info *xsk);
     86 
     87 /*
     88  * Bind AF_XDP socket and setup rings
     89  */
     90 static int xsk_configure_socket(struct xdp_server *xdp,
     91                                 struct xsk_socket_info *xsk_info,
     92                                 struct xsk_umem_info *umem,
     93                                 uint32_t queue_index);
     94 
     95 /*
     96  * Get number of free frames in UMEM
     97  */
     98 static uint64_t xsk_umem_free_frames(struct xsk_socket_info *xsk);
     99 
    100 /*
    101  * Free a frame in UMEM
    102  */
    103 static void xsk_free_umem_frame(struct xsk_socket_info *xsk, uint64_t frame);
    104 
    105 /*
    106  * Fill fill ring with as many frames as possible
    107  */
    108 static void fill_fq(struct xsk_socket_info *xsk);
    109 
    110 /*
    111  * Load eBPF program to forward traffic to our socket
    112  */
    113 static int load_xdp_program_and_map(struct xdp_server *xdp);
    114 
    115 /*
    116  * Unload eBPF/XDP program
    117  */
    118 static void unload_xdp_program(struct xdp_server *xdp);
    119 
    120 /*
    121  * Figure out IP addresses to listen to.
    122  */
    123 static int figure_ip_addresses(struct xdp_server *xdp);
    124 
    125 /*
    126  * Add IP address to allowed destination addresses for incoming packets
    127  */
    128 static void add_ip_address(struct xdp_server *xdp,
    129                            struct sockaddr_storage *addr);
    130 
    131 /*
    132  * Check whether destination IPv4 is in allowed IPs list
    133  */
    134 static int dest_ip_allowed4(struct xdp_server *xdp, struct iphdr *ipv4);
    135 
    136 /*
    137  * Check whether destination IPv6 is in allowed IPs list
    138  */
    139 static int dest_ip_allowed6(struct xdp_server *xdp, struct ipv6hdr *ipv6);
    140 
    141 /*
    142  * Setup XDP sockets
    143  */
    144 static int xdp_sockets_init(struct xdp_server *xdp);
    145 
    146 /*
    147  * Cleanup XDP sockets and memory
    148  */
    149 static void xdp_sockets_cleanup(struct xdp_server *xdp);
    150 
    151 /*
    152  * Allocate a block of shared memory
    153  */
    154 static void *alloc_shared_mem(size_t len);
    155 
    156 /*
    157  * Collect free frames from completion queue
    158  */
    159 static void drain_cq(struct xsk_socket_info *xsk);
    160 
    161 /*
    162  * Send outstanding packets and recollect completed frame addresses
    163  */
    164 static void handle_tx(struct xsk_socket_info *xsk);
    165 
    166 /*
    167  * Process packet and indicate if it should be dropped
    168  * return 0 or less => drop
    169  * return greater than 0 => use for tx
    170  */
    171 static int
    172 process_packet(struct xdp_server *xdp,
    173                uint8_t *pkt,
    174                uint32_t *len,
    175                struct query *query);
    176 
    177 static inline void swap_eth(struct ethhdr *eth);
    178 static inline void swap_udp(struct udphdr *udp);
    179 static inline void swap_ipv6(struct ipv6hdr *ipv6);
    180 static inline void swap_ipv4(struct iphdr *ipv4);
    181 static inline void *parse_udp(struct udphdr *udp);
    182 static inline void *parse_ipv6(struct ipv6hdr *ipv6);
    183 static inline void *parse_ipv4(struct iphdr *ipv4);
    184 
    185 /*
    186  * Parse dns message and return new length of dns message
    187  */
    188 static uint32_t parse_dns(struct nsd* nsd,
    189                           uint32_t dnslen,
    190                           struct query *q,
    191                           sa_family_t ai_family);
    192 
    193 /* *************** */
    194 /* Implementations */
    195 /* *************** */
    196 
    197 static uint64_t xsk_alloc_umem_frame(struct xsk_socket_info *xsk) {
    198 	uint64_t frame;
    199 	if (xsk->umem->umem_frame_free == 0) {
    200 		return XDP_INVALID_UMEM_FRAME;
    201 	}
    202 
    203 	frame = xsk->umem->umem_frame_addr[--xsk->umem->umem_frame_free];
    204 	xsk->umem->umem_frame_addr[xsk->umem->umem_frame_free] =
    205 		XDP_INVALID_UMEM_FRAME;
    206 	return frame;
    207 }
    208 
/* Number of currently unused UMEM frames for this socket's umem. */
static uint64_t xsk_umem_free_frames(struct xsk_socket_info *xsk) {
	return xsk->umem->umem_frame_free;
}
    212 
    213 static void xsk_free_umem_frame(struct xsk_socket_info *xsk, uint64_t frame) {
    214 	assert(xsk->umem->umem_frame_free < XDP_NUM_FRAMES);
    215 	xsk->umem->umem_frame_addr[xsk->umem->umem_frame_free++] = frame;
    216 }
    217 
    218 static void fill_fq(struct xsk_socket_info *xsk) {
    219 	uint32_t stock_frames;
    220 	uint32_t idx_fq = 0;
    221 	/* fill the fill ring with as many frames as are available */
    222 	/* get number of spots available in fq */
    223 	stock_frames = xsk_prod_nb_free(&xsk->umem->fq,
    224 	                                (uint32_t) xsk_umem_free_frames(xsk));
    225 	if (stock_frames > 0) {
    226 		/* ignoring prod__reserve return value, because we got stock_frames
    227 		 * from xsk_prod_nb_free(), which are therefore available */
    228 		xsk_ring_prod__reserve(&xsk->umem->fq, stock_frames, &idx_fq);
    229 
    230 		for (uint32_t i = 0; i < stock_frames; ++i) {
    231 			/* TODO: handle lack of available frames?
    232 			 * Is not necessary when the total amount of frames exceeds the
    233 			 * total slots available across all queues combined */
    234 			/* uint64_t frame = xsk_alloc_umem_frame(xsk); */
    235 			/* if (frame == XDP_INVALID_UMEM_FRAME) */
    236 			/*     printf("xdp: trying to fill_addr INVALID UMEM FRAME"); */
    237 			*xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) =
    238 				xsk_alloc_umem_frame(xsk);
    239 		}
    240 
    241 		xsk_ring_prod__submit(&xsk->umem->fq, stock_frames);
    242 	}
    243 }
    244 
    245 static int load_xdp_program_and_map(struct xdp_server *xdp) {
    246 	struct bpf_map *map;
    247 	char errmsg[512];
    248 	int err, ret;
    249 	/* UNSPEC => let libxdp decide */
    250 	// TODO: put this into a config option as well?
    251 	enum xdp_attach_mode attach_mode = XDP_MODE_UNSPEC;
    252 
    253 	DECLARE_LIBXDP_OPTS(bpf_object_open_opts, opts);
    254 	if (xdp->bpf_bpffs_path)
    255 		opts.pin_root_path = xdp->bpf_bpffs_path;
    256 
    257 	/* for now our xdp program should contain just one program section */
    258 	// TODO: look at xdp_program__create because it can take a pinned prog
    259 	xdp->bpf_prog = xdp_program__open_file(xdp->bpf_prog_filename, NULL, &opts);
    260 
    261 	// conversion should be fine, libxdp errors shouldn't exceed (int),
    262 	// also libxdp_strerr takes int anyway...
    263 	err = (int) libxdp_get_error(xdp->bpf_prog);
    264 	if (err) {
    265 		libxdp_strerror(err, errmsg, sizeof(errmsg));
    266 		log_msg(LOG_ERR, "xdp: could not open xdp program: %s\n", errmsg);
    267 		return err;
    268 	}
    269 
    270 	if (xdp->bpf_prog_should_load) {
    271 		/* TODO: I find setting environment variables from within a program
    272 		 * not a good thing to do, but for the meantime this helps... */
    273 		/* This is done to allow unloading the XDP program we load without
    274 		 * needing the SYS_ADMIN capability, and libxdp doesn't allow skipping
    275 		 * the dispatcher through different means. */
    276 		putenv("LIBXDP_SKIP_DISPATCHER=1");
    277 		err = xdp_program__attach(xdp->bpf_prog, (int) xdp->interface_index, attach_mode, 0);
    278 		/* err = xdp_program__attach_single(xdp->bpf_prog, xdp->interface_index, attach_mode); */
    279 		if (err) {
    280 			libxdp_strerror(err, errmsg, sizeof(errmsg));
    281 			log_msg(LOG_ERR, "xdp: could not attach xdp program to interface '%s' : %s\n",
    282 					xdp->interface_name, errmsg);
    283 			return err;
    284 		}
    285 
    286 		xdp->bpf_prog_fd = xdp_program__fd(xdp->bpf_prog);
    287 		xdp->bpf_prog_id = xdp_program__id(xdp->bpf_prog);
    288 
    289 		/* We also need to get the file descriptor to the xsks_map */
    290 		map = bpf_object__find_map_by_name(xdp_program__bpf_obj(xdp->bpf_prog), "xsks_map");
    291 		ret = bpf_map__fd(map);
    292 		if (ret < 0) {
    293 			log_msg(LOG_ERR, "xdp: no xsks map found in xdp program: %s\n", strerror(ret));
    294 			return ret;
    295 		}
    296 		xdp->xsk_map_fd = ret;
    297 		xdp->xsk_map = map;
    298 	} else {
    299 		char map_path[PATH_MAX];
    300 		int fd;
    301 
    302 		snprintf(map_path, PATH_MAX, "%s/%s", xdp->bpf_bpffs_path, "xsks_map");
    303 
    304 		fd = bpf_obj_get(map_path);
    305 		if (fd < 0) {
    306 			log_msg(LOG_ERR, "xdp: could not retrieve xsks_map pin from %s: %s", map_path, strerror(errno));
    307 			return fd;
    308 		}
    309 
    310 		map = bpf_object__find_map_by_name(xdp_program__bpf_obj(xdp->bpf_prog), "xsks_map");
    311 		if ((ret = bpf_map__reuse_fd(map, fd))) {
    312 			log_msg(LOG_ERR, "xdp: could not re-use xsks_map: %s\n", strerror(errno));
    313 			return ret;
    314 		}
    315 
    316 		xdp->xsk_map_fd = fd;
    317 		xdp->xsk_map = map;
    318 	}
    319 
    320 	return 0;
    321 }
    322 
    323 static int
    324 xsk_configure_umem(struct xsk_umem_info *umem_info, uint64_t size) {
    325 	int ret;
    326 	struct xsk_umem_config umem_config = {
    327 		.fill_size = XSK_RING_PROD__NUM_DESCS,
    328 		.comp_size = XSK_RING_CONS__NUM_DESCS,
    329 		.frame_size = XDP_FRAME_SIZE,
    330 		.frame_headroom = XSK_UMEM_FRAME_HEADROOM,
    331 		.flags = XSK_UMEM_FLAGS,
    332 	};
    333 
    334 	ret = xsk_umem__create(&umem_info->umem, umem_info->buffer, size, &umem_info->fq, &umem_info->cq, &umem_config);
    335 	if (ret) {
    336 		errno = -ret;
    337 		return ret;
    338 	}
    339 
    340 	return 0;
    341 }
    342 
    343 static int
    344 xsk_configure_socket(struct xdp_server *xdp, struct xsk_socket_info *xsk_info,
    345                      struct xsk_umem_info *umem, uint32_t queue_index) {
    346 	struct xsk_socket_config xsk_cfg;
    347 	uint32_t idx, reserved;
    348 	int ret;
    349 
    350 	struct xdp_config cfg = {
    351 		.xdp_flags = 0,
    352 		.xsk_bind_flags = 0,
    353 		.libxdp_flags = XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD,
    354 	};
    355 
    356 	uint16_t xsk_bind_flags = XDP_USE_NEED_WAKEUP;
    357 	if (xdp->force_copy) {
    358 		xsk_bind_flags |= XDP_COPY;
    359 	}
    360 	cfg.xsk_bind_flags = xsk_bind_flags;
    361 
    362 	xsk_info->umem = umem;
    363 	xsk_cfg.rx_size = XSK_RING_CONS__NUM_DESCS;
    364 	xsk_cfg.tx_size = XSK_RING_PROD__NUM_DESCS;
    365 	xsk_cfg.xdp_flags = cfg.xdp_flags;
    366 	xsk_cfg.bind_flags = cfg.xsk_bind_flags;
    367 	xsk_cfg.libxdp_flags = cfg.libxdp_flags;
    368 
    369 	ret = xsk_socket__create(&xsk_info->xsk,
    370 	                         xdp->interface_name,
    371 	                         queue_index,
    372 	                         umem->umem,
    373 	                         &xsk_info->rx,
    374 	                         &xsk_info->tx,
    375 	                         &xsk_cfg);
    376 	if (ret) {
    377 		log_msg(LOG_ERR, "xdp: failed to create xsk_socket");
    378 		goto error_exit;
    379 	}
    380 
    381 	ret = xsk_socket__update_xskmap(xsk_info->xsk, xdp->xsk_map_fd);
    382 	if (ret) {
    383 		log_msg(LOG_ERR, "xdp: failed to update xskmap");
    384 		goto error_exit;
    385 	}
    386 
    387 	/* Initialize umem frame allocation */
    388 	for (uint32_t i = 0; i < XDP_NUM_FRAMES; ++i) {
    389 		xsk_info->umem->umem_frame_addr[i] = i * XDP_FRAME_SIZE;
    390 	}
    391 
    392 	xsk_info->umem->umem_frame_free = XDP_NUM_FRAMES;
    393 
    394 	reserved = xsk_ring_prod__reserve(&xsk_info->umem->fq,
    395 	                             XSK_RING_PROD__NUM_DESCS,
    396 	                             &idx);
    397 
    398 	if (reserved != XSK_RING_PROD__NUM_DESCS) {
    399 		log_msg(LOG_ERR,
    400 		        "xdp: amount of reserved addr not as expected (is %d)", reserved);
    401 		// "ENOMEM 12 Cannot allocate memory" is the closest to the
    402 		// error that not as much memory was reserved as expected
    403 		ret = -12;
    404 		goto error_exit;
    405 	}
    406 
    407 	for (uint32_t i = 0; i < XSK_RING_PROD__NUM_DESCS; ++i) {
    408 		*xsk_ring_prod__fill_addr(&xsk_info->umem->fq, idx++) =
    409 			xsk_alloc_umem_frame(xsk_info);
    410 	}
    411 
    412 	xsk_ring_prod__submit(&xsk_info->umem->fq, XSK_RING_PROD__NUM_DESCS);
    413 
    414 	return 0;
    415 
    416 error_exit:
    417 	errno = -ret;
    418 	return ret;
    419 }
    420 
    421 static void *alloc_shared_mem(size_t len) {
    422 	/* MAP_ANONYMOUS memory is initialized with zero */
    423 	return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    424 }
    425 
    426 static int xdp_sockets_init(struct xdp_server *xdp) {
    427 	size_t umems_len = sizeof(struct xsk_umem_info) * xdp->queue_count;
    428 	size_t xsks_len = sizeof(struct xsk_socket_info) * xdp->queue_count;
    429 
    430 	xdp->umems = (struct xsk_umem_info *) alloc_shared_mem(umems_len);
    431 	if (xdp->umems == MAP_FAILED) {
    432 		log_msg(LOG_ERR,
    433 		        "xdp: failed to allocate shared memory for umem info: %s",
    434 		        strerror(errno));
    435 		return -1;
    436 	}
    437 
    438 	xdp->xsks = (struct xsk_socket_info *) alloc_shared_mem(xsks_len);
    439 	if (xdp->xsks == MAP_FAILED) {
    440 		log_msg(LOG_ERR,
    441 		        "xdp: failed to allocate shared memory for xsk info: %s",
    442 		        strerror(errno));
    443 		return -1;
    444 	}
    445 
    446 	for (uint32_t q_idx = 0; q_idx < xdp->queue_count; ++q_idx) {
    447 		/* mmap is supposedly page-aligned, so should be fine */
    448 		xdp->umems[q_idx].buffer = alloc_shared_mem(XDP_BUFFER_SIZE);
    449 
    450 		if (xsk_configure_umem(&xdp->umems[q_idx], XDP_BUFFER_SIZE)) {
    451 			log_msg(LOG_ERR, "xdp: cannot create umem: %s", strerror(errno));
    452 			goto out_err_umem;
    453 		}
    454 
    455 		if (xsk_configure_socket(xdp, &xdp->xsks[q_idx], &xdp->umems[q_idx],
    456 		                         q_idx)) {
    457 			log_msg(LOG_ERR,
    458 			        "xdp: cannot create AF_XDP socket: %s",
    459 			        strerror(errno));
    460 			goto out_err_xsk;
    461 		}
    462 	}
    463 
    464 	return 0;
    465 
    466 out_err_xsk:
    467 	for (uint32_t i = 0; i < xdp->queue_count; ++i)
    468 		xsk_umem__delete(xdp->umems[i].umem);
    469 
    470 out_err_umem:
    471 	return -1;
    472 }
    473 
    474 static void xdp_sockets_cleanup(struct xdp_server *xdp) {
    475 	for (uint32_t i = 0; i < xdp->queue_count; ++i) {
    476 		xsk_socket__delete(xdp->xsks[i].xsk);
    477 		xsk_umem__delete(xdp->umems[i].umem);
    478 	}
    479 }
    480 
/*
 * One-time XDP setup: resolve the configured interface, load/locate the
 * BPF program and xsks_map, raise RLIMIT_MEMLOCK, create the per-queue
 * AF_XDP sockets and reset the TX scratch list.
 * Returns 0 on success, -1 on failure.
 */
int xdp_server_init(struct xdp_server *xdp) {
	struct rlimit rlim = {RLIM_INFINITY, RLIM_INFINITY};

	/* check if interface name exists */
	xdp->interface_index = if_nametoindex(xdp->interface_name);
	if (xdp->interface_index == 0) {
		log_msg(LOG_ERR, "xdp: configured xdp-interface (%s) is unknown: %s",
		        xdp->interface_name, strerror(errno));
		return -1;
	}

	/* (optionally) load xdp program and (definitely) set xsks_map_fd */
	if (load_xdp_program_and_map(xdp)) {
		log_msg(LOG_ERR, "xdp: failed to load/pin xdp program/map");
		return -1;
	}

	/* if we don't do set rlimit, libbpf does it */
	/* this either has to be done before privilege drop or
	 * requires CAP_SYS_RESOURCE */
	if (setrlimit(RLIMIT_MEMLOCK, &rlim)) {
		log_msg(LOG_ERR, "xdp: cannot adjust rlimit (RLIMIT_MEMLOCK): \"%s\"\n",
			strerror(errno));
		return -1;
	}

	if (xdp_sockets_init(xdp))
		return -1;

	/* mark every TX scratch slot as unused */
	for (int i = 0; i < XDP_RX_BATCH_SIZE; ++i) {
		umem_ptrs[i].addr = XDP_INVALID_UMEM_FRAME;
		umem_ptrs[i].len = 0;
	}

	/* fall back to the interface's own addresses unless the config
	 * already supplied a list */
	if (!xdp->ip_addresses)
		figure_ip_addresses(xdp);

	return 0;
}
    520 
    521 void xdp_server_cleanup(struct xdp_server *xdp) {
    522 	xdp_sockets_cleanup(xdp);
    523 
    524 	/* only unpin if we loaded the program */
    525 	if (xdp->bpf_prog_should_load) {
    526 		if (xdp->xsk_map && bpf_map__is_pinned(xdp->xsk_map)) {
    527 			if (bpf_map__unpin(xdp->xsk_map, NULL)) {
    528 				/* We currently ship an XDP program that doesn't pin the map. So
    529 				 * if this error happens, it is because the user specified their
    530 				 * custom XDP program to load by NSD. Therefore they should know
    531 				 * about the pinned map and be able to unlink it themselves.
    532 				 */
    533 				log_msg(LOG_ERR, "xdp: failed to unpin bpf map during cleanup: \"%s\". "
    534 				        "This is usually ok, but you need to unpin the map yourself. "
    535 				        "This can usually be fixed by executing chmod o+wx %s\n",
    536 				        strerror(errno), xdp->bpf_bpffs_path);
    537 			}
    538 		}
    539 
    540 		unload_xdp_program(xdp);
    541 	}
    542 }
    543 
/*
 * Detach our XDP program from the interface.  Failure is only logged;
 * there is nothing more we can do at shutdown.
 */
static void unload_xdp_program(struct xdp_server *xdp) {
	/* passing old_prog_fd asks the kernel to detach only if the
	 * currently attached program is still ours */
	DECLARE_LIBBPF_OPTS(bpf_xdp_attach_opts, bpf_opts,
	                    .old_prog_fd = xdp->bpf_prog_fd);

	log_msg(LOG_INFO, "xdp: detaching xdp program %u from %s\n",
			xdp->bpf_prog_id, xdp->interface_name);

	if (bpf_xdp_detach((int) xdp->interface_index, 0, &bpf_opts))
		log_msg(LOG_ERR, "xdp: failed to detach xdp program: %s\n",
		        strerror(errno));
}
    555 
    556 static int dest_ip_allowed6(struct xdp_server *xdp, struct ipv6hdr *ipv6) {
    557 	struct xdp_ip_address *ip = xdp->ip_addresses;
    558 	if (!ip)
    559 		// no IPs available, allowing all
    560 		return 1;
    561 
    562 	while (ip) {
    563 		if (ip->addr.ss_family == AF_INET6 &&
    564 		    !memcmp(&(((struct sockaddr_in6 *) &ip->addr)->sin6_addr),
    565 		            &ipv6->daddr,
    566 		            sizeof(struct in6_addr)))
    567 			return 1;
    568 		ip = ip->next;
    569 	}
    570 
    571 	return 0;
    572 }
    573 
    574 static int dest_ip_allowed4(struct xdp_server *xdp, struct iphdr *ipv4) {
    575 	struct xdp_ip_address *ip = xdp->ip_addresses;
    576 	if (!ip)
    577 		// no IPs available, allowing all
    578 		return 1;
    579 
    580 	while (ip) {
    581 		if (ip->addr.ss_family == AF_INET &&
    582 		    ipv4->daddr == ((struct sockaddr_in *) &ip->addr)->sin_addr.s_addr)
    583 			return 1;
    584 		ip = ip->next;
    585 	}
    586 
    587 	return 0;
    588 }
    589 
    590 static void
    591 add_ip_address(struct xdp_server *xdp, struct sockaddr_storage *addr) {
    592 	struct xdp_ip_address *ip = xdp->ip_addresses;
    593 	if (!ip) {
    594 		xdp->ip_addresses = region_alloc_zero(xdp->region,
    595 		                                      sizeof(struct xdp_ip_address));
    596 		ip = xdp->ip_addresses;
    597 	} else {
    598 		while (ip->next)
    599 			ip = ip->next;
    600 
    601 		ip->next = region_alloc_zero(xdp->region,
    602 		                             sizeof(struct xdp_ip_address));
    603 		ip = ip->next;
    604 	}
    605 
    606 	memcpy(&ip->addr, addr, sizeof(struct sockaddr_storage));
    607 }
    608 
    609 static int figure_ip_addresses(struct xdp_server *xdp) {
    610 	// TODO: if using VLANs, also find appropriate IP addresses?
    611 	struct ifaddrs *ifaddr;
    612 	int family, ret = 0;
    613 
    614 	if (getifaddrs(&ifaddr) == -1) {
    615 		log_msg(LOG_ERR, "xdp: couldn't determine local IP addresses. "
    616 		                 "Serving all IP addresses now");
    617 		return -1;
    618 	}
    619 
    620 	for (struct ifaddrs *ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) {
    621 		if (ifa->ifa_addr == NULL)
    622 			continue;
    623 
    624 		if (strcmp(ifa->ifa_name, xdp->interface_name))
    625 			continue;
    626 
    627 		family = ifa->ifa_addr->sa_family;
    628 
    629 		switch (family) {
    630 			default:
    631 				continue;
    632 			case AF_INET:
    633 			case AF_INET6:
    634 				add_ip_address(xdp, (struct sockaddr_storage *) ifa->ifa_addr);
    635 		}
    636 	}
    637 
    638 	freeifaddrs(ifaddr);
    639 	return ret;
    640 }
    641 
/* Exchange source and destination MAC addresses in place. */
static inline void swap_eth(struct ethhdr *eth) {
	uint8_t mac[ETH_ALEN];

	memcpy(mac, eth->h_source, ETH_ALEN);
	memcpy(eth->h_source, eth->h_dest, ETH_ALEN);
	memcpy(eth->h_dest, mac, ETH_ALEN);
}
    648 
/* Exchange UDP source and destination ports (kept in network order). */
static inline void swap_udp(struct udphdr *udp) {
	uint16_t src = udp->source;

	udp->source = udp->dest;
	udp->dest = src;
}
    655 
/* Exchange IPv6 source and destination addresses in place. */
static inline void swap_ipv6(struct ipv6hdr *ipv6) {
	struct in6_addr stash = ipv6->saddr;

	ipv6->saddr = ipv6->daddr;
	ipv6->daddr = stash;
}
    662 
/* Exchange IPv4 source and destination addresses in place. */
static inline void swap_ipv4(struct iphdr *ipv4) {
	__be32 stash = ipv4->saddr;

	ipv4->saddr = ipv4->daddr;
	ipv4->daddr = stash;
}
    669 
    670 static inline void *parse_udp(struct udphdr *udp) {
    671 	if (ntohs(udp->dest) != DNS_PORT)
    672 		return NULL;
    673 
    674 	return (void *)(udp + 1);
    675 }
    676 
/*
 * Accept only IPv6 packets whose next header is UDP directly (no
 * extension-header walking); returns a pointer just past the fixed
 * header, or NULL to reject.
 */
static inline void *parse_ipv6(struct ipv6hdr *ipv6) {
	return ipv6->nexthdr == IPPROTO_UDP ? (void *)(ipv6 + 1) : NULL;
}
    683 
/*
 * Accept only IPv4 packets carrying UDP; returns a pointer just past the
 * fixed header, or NULL to reject.
 * NOTE(review): assumes no IPv4 options (ihl == 5) — the UDP header is
 * taken to start right after the fixed 20-byte header; confirm upstream.
 */
static inline void *parse_ipv4(struct iphdr *ipv4) {
	return ipv4->protocol == IPPROTO_UDP ? (void *)(ipv4 + 1) : NULL;
}
    690 
/*
 * Run the NSD query processor on the DNS message in q->packet.
 *
 * dnslen is the length of the received DNS message (buffer position must
 * be at its start).  Returns the length of the response message, or 0 if
 * the query was discarded (caller drops the frame).
 */
static uint32_t parse_dns(struct nsd* nsd, uint32_t dnslen,
                          struct query *q, sa_family_t ai_family) {
	/* TODO: implement DNSTAP, PROXYv2, ...? */
	uint32_t now = 0;

	/* set the size of the dns message and move position to start */
	buffer_skip(q->packet, dnslen);
	buffer_flip(q->packet);

	if (query_process(q, nsd, &now) != QUERY_DISCARDED) {
		/* count non-authoritative answers for statistics */
		if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
			STATUP(nsd, nona);
			ZTATUP(nsd, q->zone, nona);
		}

#ifdef USE_ZONE_STATS
		if (ai_family == AF_INET) {
			ZTATUP(nsd, q->zone, qudp);
		} else if (ai_family == AF_INET6) {
			ZTATUP(nsd, q->zone, qudp6);
		}
#endif /* USE_ZONE_STATS */

		query_add_optional(q, nsd, &now);

		/* flip again so the buffer now covers the response we wrote */
		buffer_flip(q->packet);

#ifdef BIND8_STATS
			/* Account the rcode & TC... */
			STATUP2(nsd, rcode, RCODE(q->packet));
			ZTATUP2(nsd, q->zone, rcode, RCODE(q->packet));
			if (TC(q->packet)) {
				STATUP(nsd, truncated);
				ZTATUP(nsd, q->zone, truncated);
			}
#endif /* BIND8_STATS */

		/* return new dns message length */
		return (uint32_t) buffer_remaining(q->packet);
	} else {
		query_reset(q, UDP_MAX_MESSAGE_LEN, 0);
		STATUP(nsd, dropped);
		ZTATUP(nsd, q->zone, dropped);
		return 0;
	}
}
    737 
    738 static int
    739 process_packet(struct xdp_server *xdp, uint8_t *pkt,
    740                uint32_t *len, struct query *query) {
    741 	/* log_msg(LOG_INFO, "xdp: received packet with len %d", *len); */
    742 
    743 	uint32_t dnslen = *len;
    744 	uint32_t data_before_dnshdr_len = 0;
    745 
    746 	struct ethhdr *eth = (struct ethhdr *)pkt;
    747 	struct ipv6hdr *ipv6 = NULL;
    748 	struct iphdr *ipv4 = NULL;
    749 	struct udphdr *udp = NULL;
    750 	void *dnshdr = NULL;
    751 
    752 	/* doing the check here, so that the packet/frame is large enough to contain
    753 	 * at least an ethernet header, an ipv4 header (ipv6 header is larger), and
    754 	 * a udp header.
    755 	 */
    756 	if (*len < (sizeof(*eth) + sizeof(struct iphdr) + sizeof(*udp)))
    757 		return -1;
    758 
    759 	data_before_dnshdr_len = sizeof(*eth) + sizeof(*udp);
    760 
    761 	switch (ntohs(eth->h_proto)) {
    762 	case ETH_P_IPV6: {
    763 		ipv6 = (struct ipv6hdr *)(eth + 1);
    764 
    765 		if (*len < (sizeof(*eth) + sizeof(*ipv6) + sizeof(*udp)))
    766 			return -2;
    767 		if (!(udp = parse_ipv6(ipv6)))
    768 			return -3;
    769 
    770 		dnslen -= (uint32_t) (sizeof(*eth) + sizeof(*ipv6) + sizeof(*udp));
    771 		data_before_dnshdr_len += sizeof(*ipv6);
    772 
    773 		if (!dest_ip_allowed6(xdp, ipv6))
    774 			return -4;
    775 
    776 		break;
    777 	} case ETH_P_IP: {
    778 		ipv4 = (struct iphdr *)(eth + 1);
    779 
    780 		if (!(udp = parse_ipv4(ipv4)))
    781 			return -5;
    782 
    783 		dnslen -= (uint32_t) (sizeof(*eth) + sizeof(*ipv4) + sizeof(*udp));
    784 		data_before_dnshdr_len += sizeof(*ipv4);
    785 
    786 		if (!dest_ip_allowed4(xdp, ipv4))
    787 			return -6;
    788 
    789 		break;
    790 	}
    791 
    792 	/* TODO: vlan? */
    793 	/* case ETH_P_8021AD: case ETH_P_8021Q: */
    794 	/*     if (*len < (sizeof(*eth) + sizeof(*vlan))) */
    795 	/*         break; */
    796 	default:
    797 		return -7;
    798 	}
    799 
    800 	if (!(dnshdr = parse_udp(udp)))
    801 		return -8;
    802 
    803 	query_set_buffer_data(query, dnshdr, XDP_FRAME_SIZE - data_before_dnshdr_len);
    804 
    805 	if(ipv6) {
    806 #ifdef INET6
    807 		struct sockaddr_in6* sock6 = (struct sockaddr_in6*)&query->remote_addr;
    808 		sock6->sin6_family = AF_INET6;
    809 		sock6->sin6_port = udp->dest;
    810 		sock6->sin6_flowinfo = 0;
    811 		sock6->sin6_scope_id = 0;
    812 		memcpy(&sock6->sin6_addr, &ipv6->saddr, sizeof(ipv6->saddr));
    813 		query->remote_addrlen = (socklen_t)sizeof(struct sockaddr_in6);
    814 #else
    815 		return 0; /* no inet6 no network */
    816 #endif /* INET6 */
    817 #ifdef BIND8_STATS
    818 		STATUP(xdp->nsd, qudp6);
    819 #endif /* BIND8_STATS */
    820 	} else {
    821 		struct sockaddr_in* sock4 = (struct sockaddr_in*)&query->remote_addr;
    822 		sock4->sin_family = AF_INET;
    823 		sock4->sin_port = udp->dest;
    824 		sock4->sin_addr.s_addr = ipv4->saddr;
    825 		query->remote_addrlen = (socklen_t)sizeof(struct sockaddr_in);
    826 #ifdef BIND8_STATS
    827 		STATUP(xdp->nsd, qudp);
    828 #endif /* BIND8_STATS */
    829 	}
    830 
    831 	query->client_addr    = query->remote_addr;
    832 	query->client_addrlen = query->remote_addrlen;
    833 	query->is_proxied = 0;
    834 
    835 	dnslen = parse_dns(xdp->nsd, dnslen, query, query->remote_addr.ss_family);
    836 	if (!dnslen) {
    837 		return -9;
    838 	}
    839 
    840 	// Not verifying the packet length (that it fits in an IP packet), as
    841 	// parse_dns truncates too long response messages.
    842 	udp->len = htons((uint16_t) (sizeof(*udp) + dnslen));
    843 
    844 	swap_eth(eth);
    845 	swap_udp(udp);
    846 
    847 	if (ipv4) {
    848 		__be16 ipv4_old_len = ipv4->tot_len;
    849 		swap_ipv4(ipv4);
    850 		ipv4->tot_len = htons(sizeof(*ipv4)) + udp->len;
    851 		csum16_replace(&ipv4->check, ipv4_old_len, ipv4->tot_len);
    852 		udp->check = calc_csum_udp4(udp, ipv4);
    853 	} else if (ipv6) {
    854 		swap_ipv6(ipv6);
    855 		ipv6->payload_len = udp->len;
    856 		udp->check = calc_csum_udp6(udp, ipv6);
    857 	} else {
    858 		log_msg(LOG_ERR, "xdp: we forgot to implement something... oops");
    859 		return 0;
    860 	}
    861 
    862 	/* log_msg(LOG_INFO, "xdp: done with processing the packet"); */
    863 
    864 	*len = data_before_dnshdr_len + dnslen;
    865 	return 1;
    866 }
    867 
/*
 * Drain up to XDP_RX_BATCH_SIZE packets from this worker's RX ring,
 * rewrite each valid query into its response in place, then queue the
 * responses on the TX ring and wake the kernel if needed.
 */
void xdp_handle_recv_and_send(struct xdp_server *xdp) {
	struct xsk_socket_info *xsk = &xdp->xsks[xdp->queue_index];
	unsigned int recvd, i, reserved, to_send = 0;
	uint32_t idx_rx = 0;
	uint32_t tx_idx = 0;
	int ret;

	recvd = xsk_ring_cons__peek(&xsk->rx, XDP_RX_BATCH_SIZE, &idx_rx);
	if (!recvd) {
		/* no data available */
		return;
	}

	/* restock the fill ring before processing so the NIC can keep
	 * receiving while we work */
	fill_fq(xsk);

	/* Process received packets */
	for (i = 0; i < recvd; ++i) {
		uint64_t addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
		uint32_t len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;

		uint8_t *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
		if ((ret = process_packet(xdp, pkt, &len, xdp->queries[i])) <= 0) {
			/* drop packet */
			xsk_free_umem_frame(xsk, addr);
		} else {
			/* remember frame address and response length for TX */
			umem_ptrs[to_send].addr = addr;
			umem_ptrs[to_send].len = len;
			++to_send;
		}
		/* we can reset the query directly after each packet processing,
		 * because the reset does not delete the underlying buffer/data.
		 * However, if we, in future, need to access data from the query
		 * struct when sending the answer, this needs to change.
		 * This also means, that currently a single query instance (and
		 * not an array) would suffice for this implementation. */
		query_reset(xdp->queries[i], UDP_MAX_MESSAGE_LEN, 0);

		/* xsk->stats.rx_bytes += len; */
	}

	xsk_ring_cons__release(&xsk->rx, recvd);
	/* xsk->stats.rx_packets += rcvd; */

	/* Process sending packets */

	/* TODO: at least send as many packets as slots are available */
	reserved = xsk_ring_prod__reserve(&xsk->tx, to_send, &tx_idx);
	// if we can't reserve to_send frames, we'll get 0 frames, so
	// no need to "un-reserve"
	if (reserved != to_send) {
		// not enough tx slots available, drop packets
		log_msg(LOG_ERR, "xdp: not enough TX frames available, dropping "
		        "whole batch");
		for (i = 0; i < to_send; ++i) {
			xsk_free_umem_frame(xsk, umem_ptrs[i].addr);
			umem_ptrs[i].addr = XDP_INVALID_UMEM_FRAME;
			umem_ptrs[i].len = 0;
		}
#ifdef BIND8_STATS
		xdp->nsd->st->txerr += to_send;
#endif /* BIND8_STATS */
		to_send = 0;
	}

	/* hand the queued frames to the TX ring and clear the scratch list */
	for (i = 0; i < to_send; ++i) {
		uint64_t addr = umem_ptrs[i].addr;
		uint32_t len = umem_ptrs[i].len;
		xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->addr = addr;
		xsk_ring_prod__tx_desc(&xsk->tx, tx_idx)->len = len;
		tx_idx++;
		xsk->outstanding_tx++;
		umem_ptrs[i].addr = XDP_INVALID_UMEM_FRAME;
		umem_ptrs[i].len = 0;
	}

	xsk_ring_prod__submit(&xsk->tx, to_send);

	/* wake up kernel for tx if needed and collect completed tx buffers */
	handle_tx(xsk);
	/* TODO: maybe call fill_fq(xsk) here as well? */
}
    949 
    950 static void drain_cq(struct xsk_socket_info *xsk) {
    951 	uint32_t completed, idx_cq;
    952 
    953 	/* free completed TX buffers */
    954 	completed = xsk_ring_cons__peek(&xsk->umem->cq,
    955 	                                XSK_RING_CONS__NUM_DESCS,
    956 	                                &idx_cq);
    957 
    958 	if (completed > 0) {
    959 		for (uint32_t i = 0; i < completed; i++) {
    960 			xsk_free_umem_frame(xsk, *xsk_ring_cons__comp_addr(&xsk->umem->cq,
    961 			                                                   idx_cq++));
    962 		}
    963 
    964 		xsk_ring_cons__release(&xsk->umem->cq, completed);
    965 		xsk->outstanding_tx -= completed < xsk->outstanding_tx ?
    966 		                       completed : xsk->outstanding_tx;
    967 	}
    968 }
    969 
/*
 * Kick the kernel to transmit queued descriptors (only when the ring
 * requests a wakeup) and reclaim completed TX frames.  No-op when
 * nothing is in flight.
 */
static void handle_tx(struct xsk_socket_info *xsk) {
	if (!xsk->outstanding_tx)
		return;

	/* with XDP_USE_NEED_WAKEUP set (see xsk_configure_socket) the
	 * kernel only processes the TX ring after an explicit poke */
	if (xsk_ring_prod__needs_wakeup(&xsk->tx))
		sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);

	drain_cq(xsk);

	// Update TX-queue pointers
	// This is not needed, because prod__reserve calls this function too,
	// and therefore, if not enough frames are free on the cached pointers,
	// it will update the real pointers.
	/* xsk_prod_nb_free(&xsk->tx, XSK_RING_PROD__NUM_DESCS/4); */
}
    985 
    986 #endif /* USE_XDP */
    987