/*	$NetBSD: ip_flow.c,v 1.60.10.1 2013/07/17 03:16:31 rmind Exp $	*/

/*-
 * Copyright (c) 1998, 2013 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_flow.c,v 1.60.10.1 2013/07/17 03:16:31 rmind Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpu.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kmem.h>
#include <sys/pool.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/pfil.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_private.h>

/*
 * Similar code is very well commented in netinet6/ip6_flow.c
 */

struct ipflow {
	kmutex_t ipf_lock;		/* lock protecting the fields */
	LIST_ENTRY(ipflow) ipf_list;	/* next in active list */
	LIST_ENTRY(ipflow) ipf_hash;	/* next ipflow in bucket */
	struct in_addr ipf_dst;		/* destination address */
	struct in_addr ipf_src;		/* source address */
	uint8_t ipf_tos;		/* type-of-service */
	struct route ipf_ro;		/* associated route entry */
	u_long ipf_uses;		/* number of uses in this period */
	u_long ipf_last_uses;		/* number of uses in last period */
	u_long ipf_dropped;		/* ENOBUFS returned by if_output */
	u_long ipf_errors;		/* other errors returned by if_output */
	u_int ipf_timer;		/* lifetime timer */
};

#define	IPFLOW_HASHBITS		6	/* should not be a multiple of 8 */

static pool_cache_t	ipflow_cache;
static struct sysctllog *ipflow_sysctl_log;

LIST_HEAD(ipflowhead, ipflow);

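/*
 * A flow entry expires after IPFLOW_TIMER slow-timeout ticks (roughly
 * five seconds) unless ipflow_fastforward() keeps re-arming its timer.
 */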
#define	IPFLOW_TIMER		(5 * PR_SLOWHZ)
#define	IPFLOW_DEFAULT_HASHSIZE	(1 << IPFLOW_HASHBITS)

/*
 * IP flow hash table, a list, and the number of entries.
 * All are protected by ipflow_lock.  TODO: Consider RW-lock.
 */
static struct ipflowhead *ipflowtable = NULL;
static struct ipflowhead ipflowlist;
static kmutex_t		ipflow_lock;
static u_int		ipflow_inuse;

#ifndef IPFLOW_MAX
#define	IPFLOW_MAX		256
#endif

static int ip_maxflows = IPFLOW_MAX;
static int ip_hashsize = IPFLOW_DEFAULT_HASHSIZE;

static void ipflow_sysctl_init(void);

void
ipflow_poolinit(void)
{
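	/*
	 * coherency_unit is the cache coherency line size; aligning the
	 * entries to it keeps separate flows from sharing a cache line.
	 */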
	ipflow_cache = pool_cache_init(sizeof(struct ipflow), coherency_unit,
	    0, 0, "ipflow", NULL, IPL_SOFTNET, NULL, NULL, NULL);
}

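/*
 * ipflow_reinit: allocate a hash table of the given size and make it
 * current.  Any existing flow entries are moved onto the caller-supplied
 * "gclist" for the caller to free; the old table itself is freed here.
 * Returns false if the allocation fails.
 */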
static bool
ipflow_reinit(int table_size, bool waitok, struct ipflowhead *gclist)
{
	struct ipflowhead *new_table, *old_table;
	size_t old_size, i;

	new_table = kmem_alloc(sizeof(struct ipflowhead) * table_size,
	    waitok ? KM_SLEEP : KM_NOSLEEP);
	if (new_table == NULL) {
		return false;
	}
	for (i = 0; i < table_size; i++) {
		LIST_INIT(&new_table[i]);
	}

	mutex_enter(&ipflow_lock);
	old_table = ipflowtable;
	old_size = ip_hashsize;

	ipflowtable = new_table;
	ip_hashsize = table_size;

	if (!gclist) {
		KASSERT(old_table == NULL);
		LIST_INIT(&ipflowlist);
	} else {
		LIST_CONCAT(gclist, &ipflowlist);
	}
	mutex_exit(&ipflow_lock);

	if (old_table) {
		kmem_free(old_table, sizeof(struct ipflowhead) * old_size);
	}
	return true;
}

void
ipflow_init(void)
{
	mutex_init(&ipflow_lock, MUTEX_DEFAULT, IPL_SOFTNET);
	(void)ipflow_reinit(ip_hashsize, true, NULL);
	ipflow_sysctl_init();
}

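/*
 * ipflow_hash: fold the TOS byte and the source/destination addresses into
 * a bucket index.  The loop steps idx through 0, 6, 12, 18, 24, 30 (for
 * IPFLOW_HASHBITS == 6) and adds a differently shifted slice of each
 * address on every round.  The sum is masked with (ip_hashsize - 1), which
 * is why the table size must be a power of two.
 */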
static size_t
ipflow_hash(const struct ip *ip)
{
	size_t idx, hash = ip->ip_tos;

	for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS) {
		hash += (ip->ip_dst.s_addr >> (32 - idx)) +
		    (ip->ip_src.s_addr >> idx);
	}
	return hash & (ip_hashsize - 1);
}

/*
 * ipflow_lookup: search for a flow entry in the hash table.
 *
 * => Acquires the flow lock, if entry is found.
 */
static struct ipflow *
ipflow_lookup(const struct ip *ip)
{
	size_t hash = ipflow_hash(ip);
	struct ipflow *ipf;

	mutex_enter(&ipflow_lock);
	LIST_FOREACH(ipf, &ipflowtable[hash], ipf_hash) {
		if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr &&
		    ip->ip_src.s_addr == ipf->ipf_src.s_addr &&
		    ip->ip_tos == ipf->ipf_tos) {
			mutex_enter(&ipf->ipf_lock);
			break;
		}
	}
	mutex_exit(&ipflow_lock);
	return ipf;
}

/*
 * Main routine performing fast-forward.
 *
 * => Returns true if the packet was forwarded and false otherwise.
 */
bool
ipflow_fastforward(struct mbuf *m)
{
	struct ip *ip, ip_store;
	struct ipflow *ipf;
	struct rtentry *rt;
	const struct sockaddr *dst;
	int error, iplen;

	/* Are we forwarding packets?  Pre-check without lock held. */
	if (!ipforwarding || ipflow_inuse == 0)
		return false;

	/* Big enough for an IP packet? */
	if (m->m_len < sizeof(struct ip))
		return false;

	/*
	 * Was packet received as a link-level multicast or broadcast?
	 * If so, don't try to fast forward..
	 */
	if ((m->m_flags & (M_BCAST|M_MCAST)) != 0) {
		return false;
	}

	/*
	 * Check for IP header with no option, invalid version or length.
	 */
	if (IP_HDR_ALIGNED_P(mtod(m, const void *))) {
		ip = mtod(m, struct ip *);
	} else {
		memcpy(&ip_store, mtod(m, const void *), sizeof(ip_store));
		ip = &ip_store;
	}
	iplen = ntohs(ip->ip_len);
	if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2) ||
	    iplen < sizeof(struct ip) || iplen > m->m_pkthdr.len) {
		return false;
	}

	/*
	 * Verify the IP header checksum.
	 */
	switch (m->m_pkthdr.csum_flags &
	    ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_IPv4) |
	    M_CSUM_IPv4_BAD)) {
	case M_CSUM_IPv4|M_CSUM_IPv4_BAD:
		return false;

	case M_CSUM_IPv4:
		/* Checksum was okay. */
		break;

	default:
		/* Must compute it ourselves. */
		if (in_cksum(m, sizeof(struct ip))) {
			return false;
		}
		break;
	}

	/*
	 * Find a flow (acquires the lock if found).
	 */
	if ((ipf = ipflow_lookup(ip)) == NULL) {
		return false;
	}
	KASSERT(mutex_owned(&ipf->ipf_lock));

	/*
	 * Route and interface still up?
	 */
	if ((rt = rtcache_validate(&ipf->ipf_ro)) == NULL ||
	    (rt->rt_ifp->if_flags & IFF_UP) == 0) {
		mutex_exit(&ipf->ipf_lock);
		return false;
	}

	/*
	 * Packet size OK?  TTL?
	 */
	if (m->m_pkthdr.len > rt->rt_ifp->if_mtu || ip->ip_ttl <= IPTTLDEC) {
		mutex_exit(&ipf->ipf_lock);
		return false;
	}

	ipf->ipf_uses++;
	PRT_SLOW_ARM(ipf->ipf_timer, IPFLOW_TIMER);

	if (rt->rt_flags & RTF_GATEWAY) {
		dst = rt->rt_gateway;
	} else {
		dst = rtcache_getdst(&ipf->ipf_ro);
	}
	mutex_exit(&ipf->ipf_lock);

	/*
	 * Clear any in-bound checksum flags for this packet.
	 */
	m->m_pkthdr.csum_flags = 0;

	/*
	 * Everything checks out and so we can forward this packet.
	 * Modify the TTL and incrementally change the checksum.
	 *
	 * This method of adding the checksum works on either endian CPU.
	 * If htons() is inlined, all the arithmetic is folded; otherwise
	 * the htons()s are combined by CSE due to the const attribute.
	 *
	 * Don't bother using HW checksumming here -- the incremental
	 * update is pretty fast.
	 */
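	/*
	 * Illustration: the TTL shares its 16-bit checksum word with the
	 * protocol field, so lowering the TTL by IPTTLDEC lowers that word
	 * by (IPTTLDEC << 8).  In one's-complement arithmetic the header
	 * checksum must then grow by the same amount, htons(IPTTLDEC << 8);
	 * the comparison below supplies the end-around carry when the
	 * addition would wrap.
	 */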
	ip->ip_ttl -= IPTTLDEC;
	if (ip->ip_sum >= (uint16_t)~htons(IPTTLDEC << 8)) {
		ip->ip_sum -= ~htons(IPTTLDEC << 8);
	} else {
		ip->ip_sum += htons(IPTTLDEC << 8);
	}

	/*
	 * Done modifying the header; copy it back, if necessary.
	 *
	 * XXX Use m_copyback_cow(9) here? --dyoung
	 */
	if (IP_HDR_ALIGNED_P(mtod(m, void *)) == 0) {
		memcpy(mtod(m, void *), &ip_store, sizeof(ip_store));
	}

	/*
	 * Trim the packet in case it's too long..
	 */
	if (m->m_pkthdr.len > iplen) {
		if (m->m_len == m->m_pkthdr.len) {
			m->m_len = iplen;
			m->m_pkthdr.len = iplen;
		} else
			m_adj(m, iplen - m->m_pkthdr.len);
	}

	/*
	 * Send the packet on its way.  All we can get back is ENOBUFS.
	 */
	KERNEL_LOCK(1, NULL);
	if ((error = (*rt->rt_ifp->if_output)(rt->rt_ifp, m, dst, rt)) != 0) {
		/* FIXME rmind */
		if (error == ENOBUFS)
			ipf->ipf_dropped++;
		else
			ipf->ipf_errors++;
	}
	KERNEL_UNLOCK_ONE(NULL);
	return true;
}

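/*
 * ipflow_addstats: on removal of a flow, credit its accumulated use count
 * to the cached route and fold its counters into the global IP statistics.
 */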
static void
ipflow_addstats(struct ipflow *ipf)
{
	struct rtentry *rt;
	uint64_t *ips;

	if ((rt = rtcache_validate(&ipf->ipf_ro)) != NULL)
		rt->rt_use += ipf->ipf_uses;

	ips = IP_STAT_GETREF();
	ips[IP_STAT_CANTFORWARD] += ipf->ipf_errors + ipf->ipf_dropped;
	ips[IP_STAT_TOTAL] += ipf->ipf_uses;
	ips[IP_STAT_FORWARD] += ipf->ipf_uses;
	ips[IP_STAT_FASTFORWARD] += ipf->ipf_uses;
	IP_STAT_PUTREF();
}

static void
ipflow_insert(size_t hash, struct ipflow *ipf)
{
	KASSERT(mutex_owned(&ipflow_lock));
	LIST_INSERT_HEAD(&ipflowtable[hash], ipf, ipf_hash);
	LIST_INSERT_HEAD(&ipflowlist, ipf, ipf_list);
	ipflow_inuse++;
}

static void
ipflow_remove(struct ipflow *ipf)
{
	KASSERT(mutex_owned(&ipflow_lock));
	LIST_REMOVE(ipf, ipf_hash);
	LIST_REMOVE(ipf, ipf_list);
	ipflow_inuse--;
	ipflow_addstats(ipf);
}

static void
ipflow_free(struct ipflow *ipf)
{
	rtcache_free(&ipf->ipf_ro);
	pool_cache_put(ipflow_cache, ipf);
}

static struct ipflow *
ipflow_reap(bool just_one)
{
	struct ipflowhead ipf_gclist;
	struct ipflow *ipf;

	LIST_INIT(&ipf_gclist);
	mutex_enter(&ipflow_lock);
	while (just_one || ipflow_inuse > ip_maxflows) {
		struct ipflow *maybe_ipf = NULL;

		ipf = LIST_FIRST(&ipflowlist);
		while (ipf != NULL) {
			/*
			 * If this no longer points to a valid route
			 * reclaim it.
			 */
			if (rtcache_validate(&ipf->ipf_ro) == NULL) {
				goto found;
			}

			/*
			 * Choose the one that has been least recently
			 * used or has had the least uses in the
			 * last 1.5 intervals.
			 */
			if (maybe_ipf == NULL ||
			    ipf->ipf_timer < maybe_ipf->ipf_timer ||
			    (ipf->ipf_timer == maybe_ipf->ipf_timer &&
			    ipf->ipf_last_uses + ipf->ipf_uses <
			    maybe_ipf->ipf_last_uses +
			    maybe_ipf->ipf_uses))
				maybe_ipf = ipf;
			ipf = LIST_NEXT(ipf, ipf_list);
		}
		ipf = maybe_ipf;
 found:
		/*
		 * Remove the entry from the flow table.
		 */
		ipflow_remove(ipf);
		if (just_one) {
			/* Unlock, free the route cache and return. */
			mutex_exit(&ipflow_lock);
			rtcache_free(&ipf->ipf_ro);
			return ipf;
		}
		LIST_INSERT_HEAD(&ipf_gclist, ipf, ipf_list);
	}
	mutex_exit(&ipflow_lock);

	while ((ipf = LIST_FIRST(&ipf_gclist)) != NULL) {
		LIST_REMOVE(ipf, ipf_list);
		ipflow_free(ipf);
	}
	return NULL;
}

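/*
 * ipflow_slowtimo: periodic (PR_SLOWHZ) timer that destroys expired flows
 * and flows whose cached route is no longer valid, and flushes the per-flow
 * use counts into the route and the global IP statistics.
 */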
void
ipflow_slowtimo(void)
{
	struct ipflowhead ipf_gclist;
	struct ipflow *ipf, *next_ipf;

	if (!ipflow_inuse) {
		return;
	}
	LIST_INIT(&ipf_gclist);

	mutex_enter(&ipflow_lock);
	for (ipf = LIST_FIRST(&ipflowlist); ipf != NULL; ipf = next_ipf) {
		struct rtentry *rt;
		uint64_t *ips;

		/*
		 * Destroy if entry has expired or its route no longer valid.
		 */
		next_ipf = LIST_NEXT(ipf, ipf_list);
		if (PRT_SLOW_ISEXPIRED(ipf->ipf_timer) ||
		    (rt = rtcache_validate(&ipf->ipf_ro)) == NULL) {
			/* Move to destruction list. */
			ipflow_remove(ipf);
			LIST_INSERT_HEAD(&ipf_gclist, ipf, ipf_list);
			continue;
		}

		/* Lockless access - for statistics only. */
		ipf->ipf_last_uses = ipf->ipf_uses;
		rt->rt_use += ipf->ipf_uses;
		ips = IP_STAT_GETREF();
		ips[IP_STAT_TOTAL] += ipf->ipf_uses;
		ips[IP_STAT_FORWARD] += ipf->ipf_uses;
		ips[IP_STAT_FASTFORWARD] += ipf->ipf_uses;
		IP_STAT_PUTREF();
		ipf->ipf_uses = 0;
	}
	mutex_exit(&ipflow_lock);

	while ((ipf = LIST_FIRST(&ipf_gclist)) != NULL) {
		LIST_REMOVE(ipf, ipf_list);
		ipflow_free(ipf);
	}
}

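/*
 * ipflow_create: create or refresh a fast-forward flow for the given route
 * and packet; typically called from the slow forwarding path once a packet
 * has been forwarded successfully.
 */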
void
ipflow_create(const struct route *ro, struct mbuf *m)
{
	const struct ip *const ip = mtod(m, const struct ip *);
	struct ipflow *ipf;
	size_t hash;

	/* Do not create cache entries for ICMP messages. */
	if (ip_maxflows == 0 || ip->ip_p == IPPROTO_ICMP) {
		return;
	}

	/*
	 * See if there is an existing flow.  If so, remove it from the list
	 * and free the old route.  If not, try to allocate a new one,
	 * unless we hit our limit.
	 *
	 * Note: if found, flow lock is acquired.
	 */
	ipf = ipflow_lookup(ip);
	if (ipf == NULL) {
		if (ipflow_inuse < ip_maxflows) {
			ipf = pool_cache_get(ipflow_cache, PR_NOWAIT);
			if (ipf == NULL)
				return;
		} else {
			ipf = ipflow_reap(true);
		}
		memset(ipf, 0, sizeof(*ipf));
	} else {
		KASSERT(mutex_owned(&ipf->ipf_lock));
		mutex_enter(&ipflow_lock);
		ipflow_remove(ipf);
		mutex_exit(&ipflow_lock);

		rtcache_free(&ipf->ipf_ro);
		ipf->ipf_uses = ipf->ipf_last_uses = 0;
		ipf->ipf_errors = ipf->ipf_dropped = 0;
	}

	/*
	 * Fill in the updated information.
	 */
	rtcache_copy(&ipf->ipf_ro, ro);
	ipf->ipf_dst = ip->ip_dst;
	ipf->ipf_src = ip->ip_src;
	ipf->ipf_tos = ip->ip_tos;
	PRT_SLOW_ARM(ipf->ipf_timer, IPFLOW_TIMER);

	/*
	 * Insert into the appropriate bucket of the flow table.
	 */
	hash = ipflow_hash(ip);
	mutex_enter(&ipflow_lock);
	ipflow_insert(hash, ipf);
	mutex_exit(&ipflow_lock);
}

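/*
 * ipflow_invalidate_all: throw away all flow entries and, if new_size is
 * non-zero, resize the hash table.  Used by the hashsize sysctl handler.
 */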
int
ipflow_invalidate_all(int new_size)
{
	struct ipflowhead ipf_gclist;
	struct ipflow *ipf;
	int error = 0;

	LIST_INIT(&ipf_gclist);

	if (new_size) {
		if (!ipflow_reinit(new_size, false, &ipf_gclist)) {
			return ENOMEM;
		}
	} else {
		mutex_enter(&ipflow_lock);
		LIST_CONCAT(&ipf_gclist, &ipflowlist);
		mutex_exit(&ipflow_lock);
	}

	while ((ipf = LIST_FIRST(&ipf_gclist)) != NULL) {
		LIST_REMOVE(ipf, ipf_list);
		ipflow_free(ipf);
	}
	return error;
}

/*
 * sysctl helper routine for net.inet.ip.maxflows.
 */
static int
sysctl_net_inet_ip_maxflows(SYSCTLFN_ARGS)
{
	int error;

	error = sysctl_lookup(SYSCTLFN_CALL(rnode));
	if (error || newp == NULL) {
		return error;
	}

	/* Note: ipflow_reap() takes ipflow_lock itself. */
	(void)ipflow_reap(false);
	return 0;
}

static int
sysctl_net_inet_ip_hashsize(SYSCTLFN_ARGS)
{
	int error, tmp;
	struct sysctlnode node;

	node = *rnode;
	tmp = ip_hashsize;
	node.sysctl_data = &tmp;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL) {
		return error;
	}

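	/*
	 * The table size must be a non-zero power of two, since
	 * ipflow_hash() masks the hash with (ip_hashsize - 1).
	 */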
	if (tmp && powerof2(tmp)) {
		error = ipflow_invalidate_all(tmp);
	} else {
		error = EINVAL;
	}
	return error;
}

static void
ipflow_sysctl_init(void)
{
	ipflow_sysctl_log = NULL;

	sysctl_createv(&ipflow_sysctl_log, 0, NULL, NULL, CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "net", NULL, NULL, 0, NULL, 0, CTL_NET, CTL_EOL);
	sysctl_createv(&ipflow_sysctl_log, 0, NULL, NULL, CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "inet", SYSCTL_DESCR("PF_INET related settings"),
	    NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_EOL);
	sysctl_createv(&ipflow_sysctl_log, 0, NULL, NULL, CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "ip", SYSCTL_DESCR("IPv4 related settings"), NULL,
	    0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL);

	sysctl_createv(&ipflow_sysctl_log, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "maxflows",
	    SYSCTL_DESCR("Number of flows for fast forwarding"),
	    sysctl_net_inet_ip_maxflows, 0, &ip_maxflows, 0,
	    CTL_NET, PF_INET, IPPROTO_IP, IPCTL_MAXFLOWS, CTL_EOL);
	sysctl_createv(&ipflow_sysctl_log, 0, NULL, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "hashsize",
	    SYSCTL_DESCR("Size of hash table for fast forwarding (IPv4)"),
	    sysctl_net_inet_ip_hashsize, 0, &ip_hashsize, 0,
	    CTL_NET, PF_INET, IPPROTO_IP, CTL_CREATE, CTL_EOL);
}