/*	$NetBSD: ip_reass.c,v 1.1 2010/07/13 22:16:10 rmind Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
 */

/*
 * IP reassembly.
 *
 * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for IP
 * reassembly queue buffer management.
 *
 * We keep a count of total IP fragments (NB: not fragmented packets)
 * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments.
 * If ip_nfrags exceeds ip_maxfrags, we drop half the total fragments in
 * the reassembly queues.  This AIMD policy avoids repeatedly deleting
 * single packets under heavy fragmentation load (e.g., from lossy
 * NFS peers).
 */
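
/*
 * Illustrative example (hypothetical numbers): if ip_maxfrags is 256 and
 * a burst of fragments pushes ip_nfrags to that limit, ip_reass_drophalf()
 * evicts the fragments whose TTL is at or below the current median
 * (roughly 128 of them) in one multiplicative-decrease step, instead of
 * evicting a single packet for every newly arrived fragment.
 */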

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.1 2010/07/13 22:16:10 rmind Exp $");

#include <sys/param.h>
#include <sys/systm.h>

#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/pool.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_proto.h>
#include <netinet/ip_private.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>

/*
 * IP datagram reassembly hashed queues, pool, lock and counters.
 */
#define	IPREASS_HASH_SHIFT	6
#define	IPREASS_HASH_SIZE	(1 << IPREASS_HASH_SHIFT)
#define	IPREASS_HASH_MASK	(IPREASS_HASH_SIZE - 1)
#define	IPREASS_HASH(x, y) \
	(((((x) & 0xf) | ((((x) >> 8) & 0xf) << 4)) ^ (y)) & IPREASS_HASH_MASK)
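
/*
 * Illustrative use (not a new interface): IPREASS_HASH() folds four bits
 * from each of the two low-order bytes of the raw s_addr value (which
 * octets of the dotted quad these are depends on host byte order), XORs
 * in the IP id as received, and masks the result to one of the
 * IPREASS_HASH_SIZE (64) buckets:
 *
 *	u_int hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
 *	struct ipqhead *head = &ipq[hash];
 */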

struct ipqhead ipq[IPREASS_HASH_SIZE];
struct pool ipqent_pool;
static int ipq_locked;

static int ip_nfragpackets;	/* packets in reass queue */
static int ip_nfrags;		/* total fragments in reass queues */

static int ip_maxfragpackets;	/* limit on packets. XXX sysctl */
static int ip_maxfrags;		/* limit on fragments. XXX sysctl */

/*
 * Cached copy of nmbclusters.  If nmbclusters is different,
 * recalculate IP parameters derived from nmbclusters.
 */
static int ip_nmbclusters;	/* copy of nmbclusters */

/*
 * IP reassembly TTL machinery for multiplicative drop.
 */
static u_int fragttl_histo[IPFRAGTTL + 1];

void sysctl_ip_reass_setup(void);
static void ip_nmbclusters_changed(void);
static u_int ip_reass_ttl_decr(u_int ticks);
static void ip_reass_drophalf(void);

/*
 * ip_reass_init:
 *
 *	Initialization of IP reassembly mechanism.
 */
void
ip_reass_init(void)
{
	int i;

	pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
	    NULL, IPL_VM);

	for (i = 0; i < IPREASS_HASH_SIZE; i++) {
		LIST_INIT(&ipq[i]);
	}
	ip_maxfragpackets = 200;
	ip_maxfrags = 0;
	ip_nmbclusters_changed();

	sysctl_ip_reass_setup();
}

static struct sysctllog *ip_reass_sysctllog;

void
sysctl_ip_reass_setup(void)
{

	sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "net", NULL,
		NULL, 0, NULL, 0,
		CTL_NET, CTL_EOL);
	sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "inet",
		SYSCTL_DESCR("PF_INET related settings"),
		NULL, 0, NULL, 0,
		CTL_NET, PF_INET, CTL_EOL);
	sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "ip",
		SYSCTL_DESCR("IPv4 related settings"),
		NULL, 0, NULL, 0,
		CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL);

	sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
		CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxfragpackets",
		SYSCTL_DESCR("Maximum number of fragments to retain for "
			     "possible reassembly"),
		NULL, 0, &ip_maxfragpackets, 0,
		CTL_NET, PF_INET, IPPROTO_IP, IPCTL_MAXFRAGPACKETS, CTL_EOL);
}

#define	CHECK_NMBCLUSTER_PARAMS()				\
do {								\
	if (__predict_false(ip_nmbclusters != nmbclusters))	\
		ip_nmbclusters_changed();			\
} while (/*CONSTCOND*/0)

/*
 * Compute IP limits derived from the value of nmbclusters.
 */
static void
ip_nmbclusters_changed(void)
{
	ip_maxfrags = nmbclusters / 4;
	ip_nmbclusters = nmbclusters;
}
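
/*
 * For illustration only: on a machine where nmbclusters is 1024, this
 * yields ip_maxfrags = 256.  CHECK_NMBCLUSTER_PARAMS() re-derives the
 * limit lazily whenever nmbclusters has been retuned at run time.
 */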

static inline int ipq_lock_try(void);
static inline void ipq_unlock(void);

static inline int
ipq_lock_try(void)
{
	int s;

	/*
	 * Use splvm() -- we're blocking things that would cause
	 * mbuf allocation.
	 */
	s = splvm();
	if (ipq_locked) {
		splx(s);
		return (0);
	}
	ipq_locked = 1;
	splx(s);
	return (1);
}

static inline void
ipq_unlock(void)
{
	int s;

	s = splvm();
	ipq_locked = 0;
	splx(s);
}

#ifdef DIAGNOSTIC
#define	IPQ_LOCK()							\
do {									\
	if (ipq_lock_try() == 0) {					\
		printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \
		panic("ipq_lock");					\
	}								\
} while (/*CONSTCOND*/ 0)
#define	IPQ_LOCK_CHECK()						\
do {									\
	if (ipq_locked == 0) {						\
		printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \
		panic("ipq lock check");				\
	}								\
} while (/*CONSTCOND*/ 0)
#else
#define	IPQ_LOCK()		(void) ipq_lock_try()
#define	IPQ_LOCK_CHECK()	/* nothing */
#endif

#define	IPQ_UNLOCK()		ipq_unlock()
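
/*
 * Locking pattern sketch (based on the callers in this file): ordinary
 * code paths take the lock unconditionally, panicking under DIAGNOSTIC
 * if it is already held, while interrupt-context paths such as
 * ip_reass_drain() use ipq_lock_try() and back off if the lock is busy:
 *
 *	IPQ_LOCK();
 *	... walk or modify ipq[] and the fragment counters ...
 *	IPQ_UNLOCK();
 */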

/*
 * ip_reass_lookup:
 *
 *	Look for queue of fragments of this datagram.
 */
struct ipq *
ip_reass_lookup(struct ip *ip, u_int *hashp)
{
	struct ipq *fp;
	u_int hash;

	IPQ_LOCK();
	hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
	LIST_FOREACH(fp, &ipq[hash], ipq_q) {
		if (ip->ip_id != fp->ipq_id)
			continue;
		if (!in_hosteq(ip->ip_src, fp->ipq_src))
			continue;
		if (!in_hosteq(ip->ip_dst, fp->ipq_dst))
			continue;
		if (ip->ip_p != fp->ipq_p)
			continue;
		break;
	}
	*hashp = hash;
	return fp;
}

void
ip_reass_unlock(void)
{

	IPQ_UNLOCK();
}

struct ipqent *
ip_reass_getent(void)
{
	struct ipqent *ipqe;
	int s;

	IP_STATINC(IP_STAT_FRAGMENTS);
	s = splvm();
	ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
	splx(s);

	return ipqe;
}
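
/*
 * Rough caller sketch (illustrative; the real caller lives in ip_input.c
 * and performs additional checks and bookkeeping).  ip_reass_lookup()
 * returns with the queue lock held; the caller either hands the fragment
 * to ip_reass(), which unlocks, or drops the lock via ip_reass_unlock().
 * Here 'mff' stands for the more-fragments flag taken from the header,
 * and ip_reass() returns NULL until the datagram is complete:
 *
 *	u_int hash;
 *	struct ipq *fp = ip_reass_lookup(ip, &hash);
 *	struct ipqent *ipqe = ip_reass_getent();
 *
 *	if (ipqe == NULL) {
 *		ip_reass_unlock();
 *		return;
 *	}
 *	ipqe->ipqe_mff = mff;
 *	ipqe->ipqe_m = m;
 *	ipqe->ipqe_ip = ip;
 *	m = ip_reass(ipqe, fp, hash);
 */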

/*
 * ip_reass:
 *
 *	Take incoming datagram fragment and try to reassemble it into whole
 *	datagram.  If a chain for reassembly of this datagram already exists,
 *	then it is given as 'fp'; otherwise we have to make a chain.
 */
struct mbuf *
ip_reass(struct ipqent *ipqe, struct ipq *fp, u_int hash)
{
	struct ipqhead *ipqhead = &ipq[hash];
	const int hlen = ipqe->ipqe_ip->ip_hl << 2;
	struct mbuf *m = ipqe->ipqe_m, *t;
	struct ipqent *nq, *p, *q;
	struct ip *ip;
	int i, next, s;

	IPQ_LOCK_CHECK();

	/*
	 * Presence of header sizes in mbufs would confuse code below.
	 */
	m->m_data += hlen;
	m->m_len -= hlen;

#ifdef notyet
	/* Make sure fragment limit is up-to-date. */
	CHECK_NMBCLUSTER_PARAMS();

	/* If we have too many fragments, drop the older half. */
	if (ip_nfrags >= ip_maxfrags) {
		ip_reass_drophalf();
	}
#endif

	/*
	 * We are about to add a fragment; increment frag count.
	 */
	ip_nfrags++;

	/*
	 * If first fragment to arrive, create a reassembly queue.
	 */
	if (fp == NULL) {
		/*
		 * Enforce upper bound on number of fragmented packets
		 * for which we attempt reassembly: a) if ip_maxfragpackets
		 * is 0, never accept fragments; b) if it is -1, accept
		 * all fragments without limitation.
		 */
		if (ip_maxfragpackets < 0)
			;
		else if (ip_nfragpackets >= ip_maxfragpackets) {
			goto dropfrag;
		}
		ip_nfragpackets++;
		fp = malloc(sizeof(struct ipq), M_FTABLE, M_NOWAIT);
		if (fp == NULL) {
			goto dropfrag;
		}
		LIST_INSERT_HEAD(ipqhead, fp, ipq_q);
		fp->ipq_nfrags = 1;
		fp->ipq_ttl = IPFRAGTTL;
		fp->ipq_p = ipqe->ipqe_ip->ip_p;
		fp->ipq_id = ipqe->ipqe_ip->ip_id;
		fp->ipq_tos = ipqe->ipqe_ip->ip_tos;
		TAILQ_INIT(&fp->ipq_fragq);
		fp->ipq_src = ipqe->ipqe_ip->ip_src;
		fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
		p = NULL;
		goto insert;
	} else {
		fp->ipq_nfrags++;
	}

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, ipqe_q))
		if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
			break;

	/*
	 * If there is a preceding segment, it may provide some of our
	 * data already.  If so, drop the data from the incoming segment.
	 * If it provides all of our data, drop us.
	 */
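	/*
	 * Worked example (hypothetical numbers; at this point ip_off holds
	 * the fragment's byte offset and ip_len its payload length, as set
	 * up by the caller): if the preceding fragment covers bytes 0-23
	 * (off 0, len 24) and the new fragment arrives with off 16, len 24,
	 * then i = 0 + 24 - 16 = 8, so the first 8 bytes of the new fragment
	 * are trimmed with m_adj() and its header is rewritten to off 24,
	 * len 16.
	 */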
	if (p != NULL) {
		i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
		    ntohs(ipqe->ipqe_ip->ip_off);
		if (i > 0) {
			if (i >= ntohs(ipqe->ipqe_ip->ip_len)) {
				goto dropfrag;
			}
			m_adj(ipqe->ipqe_m, i);
			ipqe->ipqe_ip->ip_off =
			    htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
			ipqe->ipqe_ip->ip_len =
			    htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
		}
	}

	/*
	 * While we overlap succeeding segments trim them or, if they are
	 * completely covered, dequeue them.
	 */
	for (; q != NULL &&
	    ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
	    ntohs(q->ipqe_ip->ip_off); q = nq) {
		i = (ntohs(ipqe->ipqe_ip->ip_off) +
		    ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
		if (i < ntohs(q->ipqe_ip->ip_len)) {
			q->ipqe_ip->ip_len =
			    htons(ntohs(q->ipqe_ip->ip_len) - i);
			q->ipqe_ip->ip_off =
			    htons(ntohs(q->ipqe_ip->ip_off) + i);
			m_adj(q->ipqe_m, i);
			break;
		}
		nq = TAILQ_NEXT(q, ipqe_q);
		m_freem(q->ipqe_m);
		TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
		s = splvm();
		pool_put(&ipqent_pool, q);
		splx(s);
		fp->ipq_nfrags--;
		ip_nfrags--;
	}

insert:
	/*
	 * Stick new segment in its place; check for complete reassembly.
	 */
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
	} else {
		TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q);
	}
	next = 0;
	for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
	    p = q, q = TAILQ_NEXT(q, ipqe_q)) {
		if (ntohs(q->ipqe_ip->ip_off) != next) {
			IPQ_UNLOCK();
			return NULL;
		}
		next += ntohs(q->ipqe_ip->ip_len);
	}
	if (p->ipqe_mff) {
		IPQ_UNLOCK();
		return NULL;
	}
	/*
	 * Reassembly is complete.  Check for a bogus message size and
	 * concatenate fragments.
	 */
	q = TAILQ_FIRST(&fp->ipq_fragq);
	ip = q->ipqe_ip;
	if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
		IP_STATINC(IP_STAT_TOOLONG);
		ip_freef(fp);
		IPQ_UNLOCK();
		return NULL;
	}
	m = q->ipqe_m;
	t = m->m_next;
	m->m_next = NULL;
	m_cat(m, t);
	nq = TAILQ_NEXT(q, ipqe_q);
	s = splvm();
	pool_put(&ipqent_pool, q);
	splx(s);
	for (q = nq; q != NULL; q = nq) {
		t = q->ipqe_m;
		nq = TAILQ_NEXT(q, ipqe_q);
		s = splvm();
		pool_put(&ipqent_pool, q);
		splx(s);
		m_cat(m, t);
	}
	ip_nfrags -= fp->ipq_nfrags;

	/*
	 * Create header for new packet by modifying header of first
	 * packet.  Dequeue and discard fragment reassembly header.  Make
	 * header visible.
	 */
	ip->ip_len = htons(next);
	ip->ip_src = fp->ipq_src;
	ip->ip_dst = fp->ipq_dst;
	LIST_REMOVE(fp, ipq_q);
	free(fp, M_FTABLE);
	ip_nfragpackets--;
	m->m_len += (ip->ip_hl << 2);
	m->m_data -= (ip->ip_hl << 2);
	/* some debugging cruft by sklower, below, will go away soon */
	if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
		int plen = 0;
		for (t = m; t; t = t->m_next) {
			plen += t->m_len;
		}
		m->m_pkthdr.len = plen;
		m->m_pkthdr.csum_flags = 0;
	}
	IPQ_UNLOCK();
	return m;

dropfrag:
	if (fp != NULL) {
		fp->ipq_nfrags--;
	}
	ip_nfrags--;
	IP_STATINC(IP_STAT_FRAGDROPPED);
	m_freem(m);
	s = splvm();
	pool_put(&ipqent_pool, ipqe);
	splx(s);
	IPQ_UNLOCK();
	return NULL;
}

/*
 * ip_freef:
 *
 *	Free a fragment reassembly header and all associated datagrams.
 */
void
ip_freef(struct ipq *fp)
{
	struct ipqent *q, *p;
	u_int nfrags = 0;
	int s;

	IPQ_LOCK_CHECK();

	for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) {
		p = TAILQ_NEXT(q, ipqe_q);
		m_freem(q->ipqe_m);
		nfrags++;
		TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
		s = splvm();
		pool_put(&ipqent_pool, q);
		splx(s);
	}

	if (nfrags != fp->ipq_nfrags) {
		printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags);
	}
	ip_nfrags -= nfrags;
	LIST_REMOVE(fp, ipq_q);
	free(fp, M_FTABLE);
	ip_nfragpackets--;
}

/*
 * ip_reass_ttl_decr:
 *
 *	Decrement TTL of all reassembly queue entries by `ticks'.  Count
 *	number of distinct fragments (as opposed to partial, fragmented
 *	datagrams) in the reassembly queue.  While we traverse the entire
 *	reassembly queue, compute and return the median TTL over all
 *	fragments.
 */
static u_int
ip_reass_ttl_decr(u_int ticks)
{
	u_int nfrags, median, dropfraction, keepfraction;
	struct ipq *fp, *nfp;
	int i;

	nfrags = 0;
	memset(fragttl_histo, 0, sizeof(fragttl_histo));

	for (i = 0; i < IPREASS_HASH_SIZE; i++) {
		for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
			fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
			    0 : fp->ipq_ttl - ticks);
			nfp = LIST_NEXT(fp, ipq_q);
			if (fp->ipq_ttl == 0) {
				IP_STATINC(IP_STAT_FRAGTIMEOUT);
				ip_freef(fp);
			} else {
				nfrags += fp->ipq_nfrags;
				fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
			}
		}
	}

	KASSERT(ip_nfrags == nfrags);

	/* Find median (or other drop fraction) in histogram. */
	dropfraction = (ip_nfrags / 2);
	keepfraction = ip_nfrags - dropfraction;
	for (i = IPFRAGTTL, median = 0; i >= 0; i--) {
		median += fragttl_histo[i];
		if (median >= keepfraction)
			break;
	}

	/* Return TTL of median (or other fraction). */
	return (u_int)i;
}

static void
ip_reass_drophalf(void)
{
	u_int median_ticks;

	/*
	 * Compute median TTL of all fragments, and count frags
	 * with that TTL or lower (roughly half of all fragments).
	 */
	median_ticks = ip_reass_ttl_decr(0);

	/* Drop half. */
	median_ticks = ip_reass_ttl_decr(median_ticks);
}
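
/*
 * Worked example (hypothetical histogram): suppose fragttl_histo records
 * 3 fragments at TTL 50, 2 at TTL 35 and 3 at TTL 10, so ip_nfrags = 8
 * and keepfraction = 4.  The downward scan in ip_reass_ttl_decr(0) reaches
 * a running total of 4 at TTL 35 and returns 35; the second call then ages
 * every queue by 35 ticks, expiring the TTL-35 and TTL-10 queues (5 of the
 * 8 fragments) and keeping only the 3 freshest.
 */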

/*
 * ip_reass_drain: drain off all datagram fragments.  Do not acquire
 * softnet_lock as this can be called from hardware interrupt context.
 */
void
ip_reass_drain(void)
{

	/*
	 * We may be called from a device's interrupt context.  If
	 * the ipq is already busy, just bail out now.
	 */
	if (ipq_lock_try() != 0) {
		/*
		 * Drop half the total fragments now.  If more mbufs are
		 * needed, we will be called again soon.
		 */
		ip_reass_drophalf();
		IPQ_UNLOCK();
	}
}

/*
 * ip_reass_slowtimo:
 *
 *	If a timer expires on a reassembly queue, discard it.
 */
void
ip_reass_slowtimo(void)
{
	static u_int dropscanidx = 0;
	u_int i, median_ttl;

	IPQ_LOCK();

	/* Age TTL of all fragments by 1 tick. */
	median_ttl = ip_reass_ttl_decr(1);

	/* Make sure fragment limit is up-to-date. */
	CHECK_NMBCLUSTER_PARAMS();

	/* If we have too many fragments, drop the older half. */
	if (ip_nfrags > ip_maxfrags) {
		ip_reass_ttl_decr(median_ttl);
	}

	/*
	 * If we are over the maximum number of fragmented packets (due to
	 * the limit being lowered), drain off enough to get down to the
	 * new limit.  Start draining from the reassembly hashqueue most
	 * recently drained.
	 */
	if (ip_maxfragpackets < 0)
		;
	else {
		int wrapped = 0;

		i = dropscanidx;
		while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) {
			while (LIST_FIRST(&ipq[i]) != NULL) {
				ip_freef(LIST_FIRST(&ipq[i]));
			}
			if (++i >= IPREASS_HASH_SIZE) {
				i = 0;
			}
			/*
			 * Do not scan forever even if fragment counters are
			 * wrong: stop after scanning entire reassembly queue.
			 */
			if (i == dropscanidx) {
				wrapped = 1;
			}
		}
		dropscanidx = i;
	}
	IPQ_UNLOCK();
}