/*	$NetBSD: ip_flow.c,v 1.86 2024/06/29 12:59:08 riastradh Exp $	*/

/*-
 * Copyright (c) 1998 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by the 3am Software Foundry ("3am").  It was developed by Matt Thomas.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_flow.c,v 1.86 2024/06/29 12:59:08 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_net_mpsafe.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>
#include <sys/atomic.h>

#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/pfil.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_private.h>

/*
 * Similar code is very well commented in netinet6/ip6_flow.c
 */

#define	IPFLOW_HASHBITS		6	/* should not be a multiple of 8 */

static struct pool ipflow_pool;

TAILQ_HEAD(ipflowhead, ipflow);

#define	IPFLOW_TIMER		(5 * PR_SLOWHZ)
#define	IPFLOW_DEFAULT_HASHSIZE	(1 << IPFLOW_HASHBITS)

/*
 * ip_flow.c internal lock.
 * Using softnet_lock here would cause a lock recursion.
 *
 * This is a tentative workaround.
 * We should make it scalable somehow in the future.
 */
static kmutex_t ipflow_lock;
static struct ipflowhead *ipflowtable = NULL;
static struct ipflowhead ipflowlist;
static int ipflow_inuse;

#define	IPFLOW_INSERT(hashidx, ipf) \
do { \
	(ipf)->ipf_hashidx = (hashidx); \
	TAILQ_INSERT_HEAD(&ipflowtable[(hashidx)], (ipf), ipf_hash); \
	TAILQ_INSERT_HEAD(&ipflowlist, (ipf), ipf_list); \
} while (/*CONSTCOND*/ 0)

#define	IPFLOW_REMOVE(hashidx, ipf) \
do { \
	TAILQ_REMOVE(&ipflowtable[(hashidx)], (ipf), ipf_hash); \
	TAILQ_REMOVE(&ipflowlist, (ipf), ipf_list); \
} while (/*CONSTCOND*/ 0)

#ifndef IPFLOW_MAX
#define	IPFLOW_MAX		256
#endif
static int ip_maxflows = IPFLOW_MAX;
static int ip_hashsize = IPFLOW_DEFAULT_HASHSIZE;

static struct ipflow *ipflow_reap(bool);
static void ipflow_sysctl_init(struct sysctllog **);

static void ipflow_slowtimo_work(struct work *, void *);
static struct workqueue *ipflow_slowtimo_wq;
static struct work ipflow_slowtimo_wk;
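/*
 * A flow's bucket is computed by folding the source and destination
 * addresses against each other in IPFLOW_HASHBITS-bit strides, mixing
 * in the TOS byte, and masking the sum down to the table size.
 * Keeping IPFLOW_HASHBITS off a multiple of 8 makes successive folds
 * straddle byte boundaries, so byte-aligned address patterns still
 * spread across buckets.
 */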
static size_t
ipflow_hash(const struct ip *ip)
{
	size_t hash = ip->ip_tos;
	size_t idx;

	for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS) {
		hash += (ip->ip_dst.s_addr >> (32 - idx)) +
		    (ip->ip_src.s_addr >> idx);
	}

	return hash & (ip_hashsize - 1);
}
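/*
 * A flow is identified by the (source, destination, TOS) triple alone;
 * fast forwarding operates at the IP layer, so transport ports are not
 * part of the key.
 */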
static struct ipflow *
ipflow_lookup(const struct ip *ip)
{
	size_t hash;
	struct ipflow *ipf;

	KASSERT(mutex_owned(&ipflow_lock));

	hash = ipflow_hash(ip);

	TAILQ_FOREACH(ipf, &ipflowtable[hash], ipf_hash) {
		if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr
		    && ip->ip_src.s_addr == ipf->ipf_src.s_addr
		    && ip->ip_tos == ipf->ipf_tos)
			break;
	}
	return ipf;
}

void
ipflow_poolinit(void)
{

	pool_init(&ipflow_pool, sizeof(struct ipflow), 0, 0, 0, "ipflowpl",
	    NULL, IPL_NET);
}

static int
ipflow_reinit(int table_size)
{
	struct ipflowhead *new_table;
	size_t i;

	KASSERT(mutex_owned(&ipflow_lock));

	new_table = (struct ipflowhead *)malloc(sizeof(struct ipflowhead) *
	    table_size, M_RTABLE, M_NOWAIT);

	if (new_table == NULL)
		return 1;

	if (ipflowtable != NULL)
		free(ipflowtable, M_RTABLE);

	ipflowtable = new_table;
	ip_hashsize = table_size;

	TAILQ_INIT(&ipflowlist);
	for (i = 0; i < ip_hashsize; i++)
		TAILQ_INIT(&ipflowtable[i]);

	return 0;
}

void
ipflow_init(void)
{
	int error;

	error = workqueue_create(&ipflow_slowtimo_wq, "ipflow_slowtimo",
	    ipflow_slowtimo_work, NULL, PRI_SOFTNET, IPL_SOFTNET, WQ_MPSAFE);
	if (error != 0)
		panic("%s: workqueue_create failed (%d)\n", __func__, error);

	mutex_init(&ipflow_lock, MUTEX_DEFAULT, IPL_NONE);

	mutex_enter(&ipflow_lock);
	(void)ipflow_reinit(ip_hashsize);
	mutex_exit(&ipflow_lock);
	ipflow_sysctl_init(NULL);
}
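/*
 * Try to forward a packet using a cached flow, bypassing the full
 * ip_input()/ip_forward() path.  Returns 1 if the packet was consumed
 * (forwarded, or dropped with the drop charged to the flow's counters)
 * and 0 if the caller should fall back to the ordinary slow path.
 */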
int
ipflow_fastforward(struct mbuf *m)
{
	struct ip *ip;
	struct ip ip_store;
	struct ipflow *ipf;
	struct rtentry *rt = NULL;
	const struct sockaddr *dst;
	int error;
	int iplen;
	struct ifnet *ifp;
	int s;
	int ret = 0;

	mutex_enter(&ipflow_lock);
	/*
	 * Are we forwarding packets?  Big enough for an IP packet?
	 */
	if (!ipforwarding || ipflow_inuse == 0 || m->m_len < sizeof(struct ip))
		goto out;

	/*
	 * Was the packet received as a link-level multicast or broadcast?
	 * If so, don't try to fast forward.
	 */
	if ((m->m_flags & (M_BCAST|M_MCAST)) != 0)
		goto out;

	/*
	 * IP header with no option and valid version and length
	 */
	ip = mtod(m, struct ip *);
	if (!ACCESSIBLE_POINTER(ip, struct ip)) {
		memcpy(&ip_store, mtod(m, const void *), sizeof(ip_store));
		ip = &ip_store;
	}
	iplen = ntohs(ip->ip_len);
	if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2) ||
	    iplen < sizeof(struct ip) || iplen > m->m_pkthdr.len)
		goto out;
	/*
	 * Find a flow.
	 */
	if ((ipf = ipflow_lookup(ip)) == NULL)
		goto out;

	ifp = m_get_rcvif(m, &s);
	if (__predict_false(ifp == NULL))
		goto out_unref;
	/*
	 * Verify the IP header checksum.
	 */
	switch (m->m_pkthdr.csum_flags &
	    ((ifp->if_csum_flags_rx & M_CSUM_IPv4) |
	     M_CSUM_IPv4_BAD)) {
	case M_CSUM_IPv4|M_CSUM_IPv4_BAD:
		m_put_rcvif(ifp, &s);
		goto out_unref;

	case M_CSUM_IPv4:
		/* Checksum was okay. */
		break;

	default:
		/* Must compute it ourselves. */
		if (in_cksum(m, sizeof(struct ip)) != 0) {
			m_put_rcvif(ifp, &s);
			goto out_unref;
		}
		break;
	}
	m_put_rcvif(ifp, &s);

	/*
	 * Route and interface still up?
	 */
	rt = rtcache_validate(&ipf->ipf_ro);
	if (rt == NULL || (rt->rt_ifp->if_flags & IFF_UP) == 0 ||
	    (rt->rt_flags & (RTF_BLACKHOLE | RTF_BROADCAST)) != 0)
		goto out_unref;

	/*
	 * Packet size OK?  TTL?
	 */
	if (m->m_pkthdr.len > rt->rt_ifp->if_mtu || ip->ip_ttl <= IPTTLDEC)
		goto out_unref;

	/*
	 * Clear any in-bound checksum flags for this packet.
	 */
	m->m_pkthdr.csum_flags = 0;

	/*
	 * Everything checks out and so we can forward this packet.
	 * Modify the TTL and incrementally change the checksum.
	 *
	 * This method of adding the checksum works on either endian CPU.
	 * If htons() is inlined, all the arithmetic is folded; otherwise
	 * the htons()s are combined by CSE due to the const attribute.
	 *
	 * Don't bother using HW checksumming here -- the incremental
	 * update is pretty fast.
	 */
	ip->ip_ttl -= IPTTLDEC;
	if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8))
		ip->ip_sum -= ~htons(IPTTLDEC << 8);
	else
		ip->ip_sum += htons(IPTTLDEC << 8);
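	/*
	 * Worked example: with IPTTLDEC == 1, the 16-bit header word
	 * holding the TTL (its high byte) decreases by 0x0100, so the
	 * one's-complement checksum must increase by htons(0x0100).
	 * The comparison above implements the end-around carry: when a
	 * plain add would wrap past 0xffff, subtracting the complement
	 * adds the carry back in.
	 */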

	/*
	 * Done modifying the header; copy it back, if necessary.
	 *
	 * XXX Use m_copyback_cow(9) here? --dyoung
	 */
	if (!ACCESSIBLE_POINTER(mtod(m, void *), struct ip))
		memcpy(mtod(m, void *), &ip_store, sizeof(ip_store));

	/*
	 * Trim the packet in case it's too long.
	 */
	if (m->m_pkthdr.len > iplen) {
		if (m->m_len == m->m_pkthdr.len) {
			m->m_len = iplen;
			m->m_pkthdr.len = iplen;
		} else
			m_adj(m, iplen - m->m_pkthdr.len);
	}

	/*
	 * Send the packet on its way.  All we can get back is ENOBUFS.
	 */
	ipf->ipf_uses++;

#if 0
	/*
	 * Sorting the list is too heavy for the fast path (the packet
	 * processing path); it degrades performance by about 10%.  So we
	 * do not sort ipflowtable, and use FIFO cache replacement instead
	 * of LRU.
	 */
	/* Move to the head of ipflowlist (LRU); ipflowtable does not
	 * maintain LRU order. */
	TAILQ_REMOVE(&ipflowlist, ipf, ipf_list);
	TAILQ_INSERT_HEAD(&ipflowlist, ipf, ipf_list);
#endif

	PRT_SLOW_ARM(ipf->ipf_timer, IPFLOW_TIMER);

	if (rt->rt_flags & RTF_GATEWAY)
		dst = rt->rt_gateway;
	else
		dst = rtcache_getdst(&ipf->ipf_ro);

	if ((error = if_output_lock(rt->rt_ifp, rt->rt_ifp, m, dst, rt)) != 0) {
		if (error == ENOBUFS)
			ipf->ipf_dropped++;
		else
			ipf->ipf_errors++;
	}
	ret = 1;
out_unref:
	rtcache_unref(rt, &ipf->ipf_ro);
out:
	mutex_exit(&ipflow_lock);
	return ret;
}

static void
ipflow_addstats(struct ipflow *ipf)
{
	struct rtentry *rt;
	net_stat_ref_t ips;

	rt = rtcache_validate(&ipf->ipf_ro);
	if (rt != NULL) {
		rt->rt_use += ipf->ipf_uses;
		rtcache_unref(rt, &ipf->ipf_ro);
	}

	ips = IP_STAT_GETREF();
	_NET_STATADD_REF(ips, IP_STAT_CANTFORWARD,
	    ipf->ipf_errors + ipf->ipf_dropped);
	_NET_STATADD_REF(ips, IP_STAT_TOTAL, ipf->ipf_uses);
	_NET_STATADD_REF(ips, IP_STAT_FORWARD, ipf->ipf_uses);
	_NET_STATADD_REF(ips, IP_STAT_FASTFORWARD, ipf->ipf_uses);
	IP_STAT_PUTREF();
}

static void
ipflow_free(struct ipflow *ipf)
{

	KASSERT(mutex_owned(&ipflow_lock));

	/*
	 * Remove the flow from the hash table (at elevated IPL).
	 * Once it's off the list, we can deal with it at normal
	 * network IPL.
	 */
	IPFLOW_REMOVE(ipf->ipf_hashidx, ipf);

	ipflow_addstats(ipf);
	rtcache_free(&ipf->ipf_ro);
	ipflow_inuse--;
	pool_put(&ipflow_pool, ipf);
}
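/*
 * Evict flow cache entries.  The fast path calls this with just_one set
 * and gets exactly one entry back, taken FIFO from the tail of the list
 * (the cheapest choice); the sysctl path calls it with just_one clear to
 * prune the table until ipflow_inuse <= ip_maxflows, preferring entries
 * whose cached route has gone stale.
 */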
static struct ipflow *
ipflow_reap(bool just_one)
{
	struct ipflow *ipf;

	KASSERT(mutex_owned(&ipflow_lock));

	/*
	 * This case must remove one ipflow.  Furthermore, this case is
	 * used in the fast path (the packet processing path), so simply
	 * remove the TAILQ_LAST one.
	 */
	if (just_one) {
		ipf = TAILQ_LAST(&ipflowlist, ipflowhead);
		KASSERT(ipf != NULL);

		IPFLOW_REMOVE(ipf->ipf_hashidx, ipf);

		ipflow_addstats(ipf);
		rtcache_free(&ipf->ipf_ro);
		return ipf;
	}

	/*
	 * This case is used in the slow path (sysctl).
	 * First remove any ipflow whose rtcache is no longer valid;
	 * otherwise remove the TAILQ_LAST ipflow, unless comparing
	 * last_uses shows that another entry is less recently used.
	 */
	while (ipflow_inuse > ip_maxflows) {
		struct ipflow *maybe_ipf = TAILQ_LAST(&ipflowlist, ipflowhead);

		TAILQ_FOREACH(ipf, &ipflowlist, ipf_list) {
			struct rtentry *rt;
			/*
			 * If this no longer points to a valid route
			 * reclaim it.
			 */
			rt = rtcache_validate(&ipf->ipf_ro);
			if (rt == NULL)
				goto done;
			rtcache_unref(rt, &ipf->ipf_ro);
			/*
			 * choose the one that's been least recently
			 * used or has had the least uses in the
			 * last 1.5 intervals.
			 */
			if (ipf->ipf_timer < maybe_ipf->ipf_timer
			    || ((ipf->ipf_timer == maybe_ipf->ipf_timer)
				&& (ipf->ipf_last_uses + ipf->ipf_uses
				    < maybe_ipf->ipf_last_uses + maybe_ipf->ipf_uses)))
				maybe_ipf = ipf;
		}
		ipf = maybe_ipf;
	done:
		/*
		 * Remove the entry from the flow table.
		 */
		IPFLOW_REMOVE(ipf->ipf_hashidx, ipf);

		ipflow_addstats(ipf);
		rtcache_free(&ipf->ipf_ro);
		pool_put(&ipflow_pool, ipf);
		ipflow_inuse--;
	}
	return NULL;
}

static unsigned int ipflow_work_enqueued = 0;

static void
ipflow_slowtimo_work(struct work *wk, void *arg)
{
	struct rtentry *rt;
	struct ipflow *ipf, *next_ipf;
	net_stat_ref_t ips;

	/* We can allow enqueuing another work at this point */
	atomic_swap_uint(&ipflow_work_enqueued, 0);

	SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE();
	mutex_enter(&ipflow_lock);
	for (ipf = TAILQ_FIRST(&ipflowlist); ipf != NULL; ipf = next_ipf) {
		next_ipf = TAILQ_NEXT(ipf, ipf_list);
		if (PRT_SLOW_ISEXPIRED(ipf->ipf_timer) ||
		    (rt = rtcache_validate(&ipf->ipf_ro)) == NULL) {
			ipflow_free(ipf);
		} else {
			ipf->ipf_last_uses = ipf->ipf_uses;
			rt->rt_use += ipf->ipf_uses;
			rtcache_unref(rt, &ipf->ipf_ro);
			ips = IP_STAT_GETREF();
			_NET_STATADD_REF(ips, IP_STAT_TOTAL, ipf->ipf_uses);
			_NET_STATADD_REF(ips, IP_STAT_FORWARD, ipf->ipf_uses);
			_NET_STATADD_REF(ips, IP_STAT_FASTFORWARD,
			    ipf->ipf_uses);
			IP_STAT_PUTREF();
			ipf->ipf_uses = 0;
		}
	}
	mutex_exit(&ipflow_lock);
	SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
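/*
 * Kick the worker above from the protocol slow timeout.  The
 * ipflow_work_enqueued flag coalesces ticks: the first tick swaps
 * 0 -> 1 and enqueues the work, later ticks see 1 and return, and the
 * worker resets the flag before scanning so a tick that arrives during
 * the scan schedules a fresh pass.
 */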
void
ipflow_slowtimo(void)
{

	/* Avoid enqueuing another work when one is already enqueued */
	if (atomic_swap_uint(&ipflow_work_enqueued, 1) == 1)
		return;

	workqueue_enqueue(ipflow_slowtimo_wq, &ipflow_slowtimo_wk, NULL);
}

void
ipflow_create(struct route *ro, struct mbuf *m)
{
	const struct ip *const ip = mtod(m, const struct ip *);
	struct ipflow *ipf;
	size_t hash;

	KERNEL_LOCK_UNLESS_NET_MPSAFE();
	mutex_enter(&ipflow_lock);

	/*
	 * Don't create cache entries for ICMP messages.
	 */
	if (ip_maxflows == 0 || ip->ip_p == IPPROTO_ICMP)
		goto out;

	/*
	 * See if an existing flow struct exists.  If so remove it from its
	 * list and free the old route.  If not, try to malloc a new one
	 * (if we aren't at our limit).
	 */
	ipf = ipflow_lookup(ip);
	if (ipf == NULL) {
		if (ipflow_inuse >= ip_maxflows) {
			ipf = ipflow_reap(true);
		} else {
			ipf = pool_get(&ipflow_pool, PR_NOWAIT);
			if (ipf == NULL)
				goto out;
			ipflow_inuse++;
		}
		memset(ipf, 0, sizeof(*ipf));
	} else {
		IPFLOW_REMOVE(ipf->ipf_hashidx, ipf);

		ipflow_addstats(ipf);
		rtcache_free(&ipf->ipf_ro);
		ipf->ipf_uses = ipf->ipf_last_uses = 0;
		ipf->ipf_errors = ipf->ipf_dropped = 0;
	}

	/*
	 * Fill in the updated information.
	 */
	rtcache_copy(&ipf->ipf_ro, ro);
	ipf->ipf_dst = ip->ip_dst;
	ipf->ipf_src = ip->ip_src;
	ipf->ipf_tos = ip->ip_tos;
	PRT_SLOW_ARM(ipf->ipf_timer, IPFLOW_TIMER);

	/*
	 * Insert into the appropriate bucket of the flow table.
	 */
	hash = ipflow_hash(ip);
	IPFLOW_INSERT(hash, ipf);

out:
	mutex_exit(&ipflow_lock);
	KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}

int
ipflow_invalidate_all(int new_size)
{
	struct ipflow *ipf, *next_ipf;
	int error;

	error = 0;

	mutex_enter(&ipflow_lock);

	for (ipf = TAILQ_FIRST(&ipflowlist); ipf != NULL; ipf = next_ipf) {
		next_ipf = TAILQ_NEXT(ipf, ipf_list);
		ipflow_free(ipf);
	}

	if (new_size)
		error = ipflow_reinit(new_size);

	mutex_exit(&ipflow_lock);

	return error;
}

/*
 * sysctl helper routine for net.inet.ip.maxflows.
 */
static int
sysctl_net_inet_ip_maxflows(SYSCTLFN_ARGS)
{
	int error;

	error = sysctl_lookup(SYSCTLFN_CALL(rnode));
	if (error || newp == NULL)
		return (error);

	SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE();
	mutex_enter(&ipflow_lock);

	ipflow_reap(false);

	mutex_exit(&ipflow_lock);
	SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE();

	return (0);
}

static int
sysctl_net_inet_ip_hashsize(SYSCTLFN_ARGS)
{
	int error, tmp;
	struct sysctlnode node;

	node = *rnode;
	tmp = ip_hashsize;
	node.sysctl_data = &tmp;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

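	/*
	 * tmp & (tmp - 1) clears the lowest set bit, so it is zero
	 * exactly when tmp is a power of two; the tmp != 0 test rejects
	 * zero, which would otherwise pass.
	 */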
	if ((tmp & (tmp - 1)) == 0 && tmp != 0) {
		/*
		 * Can only fail due to malloc()
		 */
		SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE();
		error = ipflow_invalidate_all(tmp);
		SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
	} else {
		/*
		 * EINVAL if not a power of 2
		 */
		error = EINVAL;
	}

	return error;
}

static void
ipflow_sysctl_init(struct sysctllog **clog)
{
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "inet",
		       SYSCTL_DESCR("PF_INET related settings"),
		       NULL, 0, NULL, 0,
		       CTL_NET, PF_INET, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "ip",
		       SYSCTL_DESCR("IPv4 related settings"),
		       NULL, 0, NULL, 0,
		       CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL);

	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "maxflows",
		       SYSCTL_DESCR("Number of flows for fast forwarding"),
		       sysctl_net_inet_ip_maxflows, 0, &ip_maxflows, 0,
		       CTL_NET, PF_INET, IPPROTO_IP,
		       IPCTL_MAXFLOWS, CTL_EOL);
	sysctl_createv(clog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "hashsize",
		       SYSCTL_DESCR("Size of hash table for fast forwarding (IPv4)"),
		       sysctl_net_inet_ip_hashsize, 0, &ip_hashsize, 0,
		       CTL_NET, PF_INET, IPPROTO_IP,
		       CTL_CREATE, CTL_EOL);
}