/*	$NetBSD: npf_inet.c,v 1.28 2013/12/06 01:33:37 rmind Exp $	*/

/*-
 * Copyright (c) 2009-2012 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This material is based upon work partially supported by The
 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Various protocol related helper routines.
 *
 * This layer manipulates the npf_cache_t structure, i.e. it caches the
 * requested headers and records which information was cached in the
 * info bit field (npc_info).  It is also the responsibility of this
 * layer to update or invalidate the cache on rewrites (e.g. by the
 * translation routines).
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: npf_inet.c,v 1.28 2013/12/06 01:33:37 rmind Exp $");

#include <sys/param.h>
#include <sys/types.h>

#include <net/pfil.h>
#include <net/if.h>
#include <net/ethertypes.h>
#include <net/if_ether.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/ip_icmp.h>

#include "npf_impl.h"

/*
 * npf_fixup{16,32}_cksum: incremental update of the Internet checksum.
 */

uint16_t
npf_fixup16_cksum(uint16_t cksum, uint16_t odatum, uint16_t ndatum)
{
	uint32_t sum;

	/*
	 * RFC 1624:
	 *	HC' = ~(~HC + ~m + m')
	 *
	 * Note: 1's complement sum is endian-independent (RFC 1071, page 2).
	 */
	sum = ~cksum & 0xffff;
	sum += (~odatum & 0xffff) + ndatum;
	sum = (sum >> 16) + (sum & 0xffff);
	sum += (sum >> 16);

	return ~sum & 0xffff;
}
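
/*
 * Usage sketch: a NAT-style port rewrite could keep the TCP checksum
 * consistent with a single 16-bit fixup.  Here "new_dport" is a
 * hypothetical value, assumed to be in network byte order:
 *
 *	th->th_sum = npf_fixup16_cksum(th->th_sum, th->th_dport, new_dport);
 *	th->th_dport = new_dport;
 *
 * The same pattern with npf_fixup32_cksum covers 32-bit fields such as
 * the TCP sequence number.
 */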

uint16_t
npf_fixup32_cksum(uint16_t cksum, uint32_t odatum, uint32_t ndatum)
{
	uint32_t sum;

	/*
	 * Checksum the 32-bit datum as two 16-bit halves.  Note: the
	 * first 32->16 bit reduction is not necessary.
	 */
	sum = ~cksum & 0xffff;
	sum += (~odatum & 0xffff) + (ndatum & 0xffff);

	sum += (~odatum >> 16) + (ndatum >> 16);
	sum = (sum >> 16) + (sum & 0xffff);
	sum += (sum >> 16);
	return ~sum & 0xffff;
}

/*
 * npf_addr_cksum: update the checksum to reflect an address change,
 * either IPv4 or IPv6.
 */
uint16_t
npf_addr_cksum(uint16_t cksum, int sz, const npf_addr_t *oaddr,
    const npf_addr_t *naddr)
{
	const uint32_t *oip32 = (const uint32_t *)oaddr;
	const uint32_t *nip32 = (const uint32_t *)naddr;

	KASSERT(sz % sizeof(uint32_t) == 0);
	do {
		cksum = npf_fixup32_cksum(cksum, *oip32++, *nip32++);
		sz -= sizeof(uint32_t);
	} while (sz);

	return cksum;
}
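
/*
 * Usage sketch: rewriting the IPv4 source address while keeping ip_sum
 * valid, where "newaddr" is a hypothetical npf_addr_t in network byte
 * order:
 *
 *	ip->ip_sum = npf_addr_cksum(ip->ip_sum, sizeof(struct in_addr),
 *	    (const npf_addr_t *)&ip->ip_src, newaddr);
 */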

/*
 * npf_addr_mix: mix (XOR) the IP addresses into a single 32-bit integer.
 * Note: used as a hash function.
 */
uint32_t
npf_addr_mix(const int sz, const npf_addr_t *a1, const npf_addr_t *a2)
{
	uint32_t mix = 0;

	KASSERT(sz > 0 && a1 != NULL && a2 != NULL);

	for (int i = 0; i < (sz >> 2); i++) {
		mix ^= a1->s6_addr32[i];
		mix ^= a2->s6_addr32[i];
	}
	return mix;
}
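
/*
 * Usage sketch: a connection table could derive a bucket index from the
 * cached addresses, e.g. with a hypothetical table of "hashsize" buckets:
 *
 *	const uint32_t mix = npf_addr_mix(npc->npc_alen,
 *	    npc->npc_ips[NPF_SRC], npc->npc_ips[NPF_DST]);
 *	const u_int bucket = mix % hashsize;
 */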

/*
 * npf_addr_mask: apply the mask to a given address and store the result.
 */
void
npf_addr_mask(const npf_addr_t *addr, const npf_netmask_t mask,
    const int alen, npf_addr_t *out)
{
	const int nwords = alen >> 2;
	uint_fast8_t length = mask;

	/* Note: maximum length is 32 for IPv4 and 128 for IPv6. */
	KASSERT(length <= NPF_MAX_NETMASK);

	for (int i = 0; i < nwords; i++) {
		uint32_t wordmask;

		if (length >= 32) {
			wordmask = htonl(0xffffffff);
			length -= 32;
		} else if (length) {
			wordmask = htonl(0xffffffff << (32 - length));
			length = 0;
		} else {
			wordmask = 0;
		}
		out->s6_addr32[i] = addr->s6_addr32[i] & wordmask;
	}
}
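
/*
 * Example: masking the IPv4 address 10.1.2.3 with a /24 prefix keeps
 * only the upper 24 bits, i.e. the result is 10.1.2.0 (here "addr" is
 * assumed to point to the address in network byte order):
 *
 *	npf_addr_t out;
 *	npf_addr_mask(addr, 24, sizeof(struct in_addr), &out);
 */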

/*
 * npf_addr_cmp: compare two addresses, either IPv4 or IPv6.
 *
 * => Return 0 if equal and negative/positive if less/greater accordingly.
 * => Ignore the mask, if NPF_NO_NETMASK is specified.
 */
int
npf_addr_cmp(const npf_addr_t *addr1, const npf_netmask_t mask1,
    const npf_addr_t *addr2, const npf_netmask_t mask2, const int alen)
{
	npf_addr_t realaddr1, realaddr2;

	if (mask1 != NPF_NO_NETMASK) {
		npf_addr_mask(addr1, mask1, alen, &realaddr1);
		addr1 = &realaddr1;
	}
	if (mask2 != NPF_NO_NETMASK) {
		npf_addr_mask(addr2, mask2, alen, &realaddr2);
		addr2 = &realaddr2;
	}
	return memcmp(addr1, addr2, alen);
}
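
/*
 * Usage sketch: testing whether an address falls within 10.1.2.0/24 by
 * masking both sides to the same prefix length ("net" would hold
 * 10.1.2.0 here):
 *
 *	if (npf_addr_cmp(addr, 24, &net, 24, sizeof(struct in_addr)) == 0) {
 *		...	address is within the /24
 *	}
 */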

/*
 * npf_tcpsaw: helper to fetch SEQ, ACK, WIN and return TCP data length.
 *
 * => Returns all values in host byte-order.
 */
int
npf_tcpsaw(const npf_cache_t *npc, tcp_seq *seq, tcp_seq *ack, uint32_t *win)
{
	const struct tcphdr *th = npc->npc_l4.tcp;
	u_int thlen;

	KASSERT(npf_iscached(npc, NPC_TCP));

	*seq = ntohl(th->th_seq);
	*ack = ntohl(th->th_ack);
	*win = (uint32_t)ntohs(th->th_win);
	thlen = th->th_off << 2;

	if (npf_iscached(npc, NPC_IP4)) {
		const struct ip *ip = npc->npc_ip.v4;
		return ntohs(ip->ip_len) - npc->npc_hlen - thlen;
	} else if (npf_iscached(npc, NPC_IP6)) {
		const struct ip6_hdr *ip6 = npc->npc_ip.v6;
		return ntohs(ip6->ip6_plen) - thlen;
	}
	return 0;
}
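
/*
 * Example of the length arithmetic above: for an IPv4 packet with
 * ip_len of 1500, a 20-byte IP header (npc_hlen == 20) and a TCP header
 * with th_off == 8 (i.e. 32 bytes of header including options), the
 * returned TCP data length is 1500 - 20 - 32 = 1448 bytes.
 */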

/*
 * npf_fetch_tcpopts: parse and return TCP options.
 */
bool
npf_fetch_tcpopts(npf_cache_t *npc, nbuf_t *nbuf, uint16_t *mss, int *wscale)
{
	const struct tcphdr *th = npc->npc_l4.tcp;
	int topts_len, step;
	void *nptr;
	uint8_t val;
	bool ok;

	KASSERT(npf_iscached(npc, NPC_IP46));
	KASSERT(npf_iscached(npc, NPC_TCP));

	/* Determine if there are any TCP options, get their length. */
	topts_len = (th->th_off << 2) - sizeof(struct tcphdr);
	if (topts_len <= 0) {
		/* No options. */
		return false;
	}
	KASSERT(topts_len <= MAX_TCPOPTLEN);

	/* First step: IP and TCP header up to options. */
	step = npc->npc_hlen + sizeof(struct tcphdr);
	nbuf_reset(nbuf);
next:
	if ((nptr = nbuf_advance(nbuf, step, 1)) == NULL) {
		ok = false;
		goto done;
	}
	val = *(uint8_t *)nptr;

	switch (val) {
	case TCPOPT_EOL:
		/* Done. */
		ok = true;
		goto done;
	case TCPOPT_NOP:
		topts_len--;
		step = 1;
		break;
	case TCPOPT_MAXSEG:
		if ((nptr = nbuf_advance(nbuf, 2, 2)) == NULL) {
			ok = false;
			goto done;
		}
		if (mss) {
			if (*mss) {
				memcpy(nptr, mss, sizeof(uint16_t));
			} else {
				memcpy(mss, nptr, sizeof(uint16_t));
			}
		}
		topts_len -= TCPOLEN_MAXSEG;
		step = 2;
		break;
	case TCPOPT_WINDOW:
		/* TCP Window Scaling (RFC 1323). */
		if ((nptr = nbuf_advance(nbuf, 2, 1)) == NULL) {
			ok = false;
			goto done;
		}
		val = *(uint8_t *)nptr;
		*wscale = (val > TCP_MAX_WINSHIFT) ? TCP_MAX_WINSHIFT : val;
		topts_len -= TCPOLEN_WINDOW;
		step = 1;
		break;
	default:
		if ((nptr = nbuf_advance(nbuf, 1, 1)) == NULL) {
			ok = false;
			goto done;
		}
		val = *(uint8_t *)nptr;
		if (val < 2 || val > topts_len) {
			ok = false;
			goto done;
		}
		topts_len -= val;
		step = val - 1;
	}

	/* Any options left? */
	if (__predict_true(topts_len > 0)) {
		goto next;
	}
	ok = true;
done:
	if (nbuf_flag_p(nbuf, NBUF_DATAREF_RESET)) {
		npf_recache(npc, nbuf);
	}
	return ok;
}
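
/*
 * Usage sketch: reading the MSS and window scale options from a cached
 * TCP packet.  Passing mss pointing at a zero value reads the option
 * from the packet; a non-zero value is written into the packet instead
 * (as a normalization pass might do for MSS clamping):
 *
 *	uint16_t mss = 0;
 *	int wscale = 0;
 *	if (npf_fetch_tcpopts(npc, nbuf, &mss, &wscale)) {
 *		...	mss holds the raw (network byte order) option value,
 *			wscale is clamped to TCP_MAX_WINSHIFT
 *	}
 */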

static int
npf_cache_ip(npf_cache_t *npc, nbuf_t *nbuf)
{
	const void *nptr = nbuf_dataptr(nbuf);
	const uint8_t ver = *(const uint8_t *)nptr;
	int flags = 0;

	switch (ver >> 4) {
	case IPVERSION: {
		struct ip *ip;

		ip = nbuf_ensure_contig(nbuf, sizeof(struct ip));
		if (ip == NULL) {
			return 0;
		}

		/* Check header length and fragment offset. */
		if ((u_int)(ip->ip_hl << 2) < sizeof(struct ip)) {
			return 0;
		}
		if (ip->ip_off & ~htons(IP_DF | IP_RF)) {
			/* Note fragmentation. */
			flags |= NPC_IPFRAG;
		}

		/* Cache: layer 3 - IPv4. */
		npc->npc_alen = sizeof(struct in_addr);
		npc->npc_ips[NPF_SRC] = (npf_addr_t *)&ip->ip_src;
		npc->npc_ips[NPF_DST] = (npf_addr_t *)&ip->ip_dst;
		npc->npc_hlen = ip->ip_hl << 2;
		npc->npc_proto = ip->ip_p;

		npc->npc_ip.v4 = ip;
		flags |= NPC_IP4;
		break;
	}

	case (IPV6_VERSION >> 4): {
		struct ip6_hdr *ip6;
		struct ip6_ext *ip6e;
		size_t off, hlen;

		ip6 = nbuf_ensure_contig(nbuf, sizeof(struct ip6_hdr));
		if (ip6 == NULL) {
			return 0;
		}

		/* Set initial next-protocol value. */
		hlen = sizeof(struct ip6_hdr);
		npc->npc_proto = ip6->ip6_nxt;
		npc->npc_hlen = hlen;

		/*
		 * Advance by the length of the current header.
		 */
		off = nbuf_offset(nbuf);
		while (nbuf_advance(nbuf, hlen, 0) != NULL) {
			ip6e = nbuf_ensure_contig(nbuf, sizeof(*ip6e));
			if (ip6e == NULL) {
				return 0;
			}

			/*
			 * Determine whether we are going to continue.
			 */
			switch (npc->npc_proto) {
			case IPPROTO_HOPOPTS:
			case IPPROTO_DSTOPTS:
			case IPPROTO_ROUTING:
				hlen = (ip6e->ip6e_len + 1) << 3;
				break;
			case IPPROTO_FRAGMENT:
				hlen = sizeof(struct ip6_frag);
				flags |= NPC_IPFRAG;
				break;
			case IPPROTO_AH:
				hlen = (ip6e->ip6e_len + 2) << 2;
				break;
			default:
				hlen = 0;
				break;
			}

			if (!hlen) {
				break;
			}
			npc->npc_proto = ip6e->ip6e_nxt;
			npc->npc_hlen += hlen;
		}

		/*
		 * Re-fetch the header pointers (nbufs might have been
		 * reallocated).  Restore the original offset (if any).
		 */
		nbuf_reset(nbuf);
		ip6 = nbuf_dataptr(nbuf);
		if (off) {
			nbuf_advance(nbuf, off, 0);
		}

		/* Cache: layer 3 - IPv6. */
		npc->npc_alen = sizeof(struct in6_addr);
		npc->npc_ips[NPF_SRC] = (npf_addr_t *)&ip6->ip6_src;
		npc->npc_ips[NPF_DST] = (npf_addr_t *)&ip6->ip6_dst;

		npc->npc_ip.v6 = ip6;
		flags |= NPC_IP6;
		break;
	}
	default:
		break;
	}
	return flags;
}
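
/*
 * Worked example of the extension header arithmetic above: a routing
 * header with ip6e_len == 2 occupies (2 + 1) << 3 == 24 bytes, while an
 * AH header with ip6e_len == 4 occupies (4 + 2) << 2 == 24 bytes, since
 * ip6e_len counts 8-byte units (excluding the first) for most extension
 * headers, but 4-byte units (excluding the first two) for AH.
 */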

/*
 * npf_cache_all: general routine to cache all relevant IP (v4 or v6)
 * and TCP, UDP or ICMP headers.
 *
 * => nbuf offset shall be set accordingly.
 */
int
npf_cache_all(npf_cache_t *npc, nbuf_t *nbuf)
{
	int flags, l4flags;
	u_int hlen;

	/*
	 * This routine is the main point where the references are cached,
	 * therefore clear the flag as we reset.
	 */
again:
	nbuf_unset_flag(nbuf, NBUF_DATAREF_RESET);

	/*
	 * First, cache the L3 header (IPv4 or IPv6).  If the IP packet is
	 * fragmented, then we cannot look into L4.
	 */
	flags = npf_cache_ip(npc, nbuf);
	if ((flags & NPC_IP46) == 0 || (flags & NPC_IPFRAG) != 0) {
		nbuf_unset_flag(nbuf, NBUF_DATAREF_RESET);
		npc->npc_info |= flags;
		return flags;
	}
	hlen = npc->npc_hlen;

	switch (npc->npc_proto) {
	case IPPROTO_TCP:
		/* Cache: layer 4 - TCP. */
		npc->npc_l4.tcp = nbuf_advance(nbuf, hlen,
		    sizeof(struct tcphdr));
		l4flags = NPC_LAYER4 | NPC_TCP;
		break;
	case IPPROTO_UDP:
		/* Cache: layer 4 - UDP. */
		npc->npc_l4.udp = nbuf_advance(nbuf, hlen,
		    sizeof(struct udphdr));
		l4flags = NPC_LAYER4 | NPC_UDP;
		break;
	case IPPROTO_ICMP:
		/* Cache: layer 4 - ICMPv4. */
		npc->npc_l4.icmp = nbuf_advance(nbuf, hlen,
		    offsetof(struct icmp, icmp_void));
		l4flags = NPC_LAYER4 | NPC_ICMP;
		break;
	case IPPROTO_ICMPV6:
		/* Cache: layer 4 - ICMPv6. */
		npc->npc_l4.icmp6 = nbuf_advance(nbuf, hlen,
		    offsetof(struct icmp6_hdr, icmp6_data32));
		l4flags = NPC_LAYER4 | NPC_ICMP;
		break;
	default:
		l4flags = 0;
		break;
	}

	if (nbuf_flag_p(nbuf, NBUF_DATAREF_RESET)) {
		goto again;
	}

	/* Add the L4 flags if nbuf_advance() succeeded. */
	if (l4flags && npc->npc_l4.hdr) {
		flags |= l4flags;
	}
	npc->npc_info |= flags;
	return flags;
}
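
/*
 * Usage sketch: a packet inspection path would typically cache the
 * headers once and then test what was actually cached:
 *
 *	const int flags = npf_cache_all(npc, nbuf);
 *	if ((flags & NPC_IPFRAG) == 0 && npf_iscached(npc, NPC_TCP)) {
 *		const struct tcphdr *th = npc->npc_l4.tcp;
 *		...
 *	}
 */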

void
npf_recache(npf_cache_t *npc, nbuf_t *nbuf)
{
	const int mflags __diagused = npc->npc_info & (NPC_IP46 | NPC_LAYER4);
	int flags __diagused;

	nbuf_reset(nbuf);
	npc->npc_info = 0;
	flags = npf_cache_all(npc, nbuf);
	KASSERT((flags & mflags) == mflags);
	KASSERT(nbuf_flag_p(nbuf, NBUF_DATAREF_RESET) == 0);
}

/*
 * npf_rwrip: rewrite required IP address.
 */
bool
npf_rwrip(const npf_cache_t *npc, u_int which, const npf_addr_t *addr)
{
	KASSERT(npf_iscached(npc, NPC_IP46));
	KASSERT(which == NPF_SRC || which == NPF_DST);

	memcpy(npc->npc_ips[which], addr, npc->npc_alen);
	return true;
}

/*
 * npf_rwrport: rewrite required TCP/UDP port.
 */
bool
npf_rwrport(const npf_cache_t *npc, u_int which, const in_port_t port)
{
	const int proto = npc->npc_proto;
	in_port_t *oport;

	KASSERT(npf_iscached(npc, NPC_TCP) || npf_iscached(npc, NPC_UDP));
	KASSERT(proto == IPPROTO_TCP || proto == IPPROTO_UDP);
	KASSERT(which == NPF_SRC || which == NPF_DST);

	/* Get the offset and store the port in it. */
	if (proto == IPPROTO_TCP) {
		struct tcphdr *th = npc->npc_l4.tcp;
		oport = (which == NPF_SRC) ? &th->th_sport : &th->th_dport;
	} else {
		struct udphdr *uh = npc->npc_l4.udp;
		oport = (which == NPF_SRC) ? &uh->uh_sport : &uh->uh_dport;
	}
	memcpy(oport, &port, sizeof(in_port_t));
	return true;
}
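
/*
 * Usage sketch: a translation rule rewriting the destination to a
 * hypothetical port 8080 (the port is passed in network byte order):
 *
 *	npf_rwrport(npc, NPF_DST, htons(8080));
 */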

/*
 * npf_rwrcksum: rewrite IPv4 and/or TCP/UDP checksum.
 */
bool
npf_rwrcksum(const npf_cache_t *npc, u_int which,
    const npf_addr_t *addr, const in_port_t port)
{
	const npf_addr_t *oaddr = npc->npc_ips[which];
	const int proto = npc->npc_proto;
	const int alen = npc->npc_alen;
	uint16_t *ocksum;
	in_port_t oport;

	KASSERT(npf_iscached(npc, NPC_LAYER4));
	KASSERT(which == NPF_SRC || which == NPF_DST);

	if (npf_iscached(npc, NPC_IP4)) {
		struct ip *ip = npc->npc_ip.v4;
		uint16_t ipsum = ip->ip_sum;

		/* Recalculate IPv4 checksum and rewrite. */
		ip->ip_sum = npf_addr_cksum(ipsum, alen, oaddr, addr);
	} else {
		/* No checksum for IPv6. */
		KASSERT(npf_iscached(npc, NPC_IP6));
	}

	/* Nothing else to do for ICMP. */
	if (proto == IPPROTO_ICMP) {
		return true;
	}
	KASSERT(npf_iscached(npc, NPC_TCP) || npf_iscached(npc, NPC_UDP));

	/*
	 * Calculate TCP/UDP checksum:
	 * - Skip if UDP and the current checksum is zero.
	 * - Fixup the IP address change.
	 * - Fixup the port change, if required (non-zero).
	 */
	if (proto == IPPROTO_TCP) {
		struct tcphdr *th = npc->npc_l4.tcp;

		ocksum = &th->th_sum;
		oport = (which == NPF_SRC) ? th->th_sport : th->th_dport;
	} else {
		struct udphdr *uh = npc->npc_l4.udp;

		KASSERT(proto == IPPROTO_UDP);
		ocksum = &uh->uh_sum;
		if (*ocksum == 0) {
			/* No need to update. */
			return true;
		}
		oport = (which == NPF_SRC) ? uh->uh_sport : uh->uh_dport;
	}

	uint16_t cksum = npf_addr_cksum(*ocksum, alen, oaddr, addr);
	if (port) {
		cksum = npf_fixup16_cksum(cksum, oport, port);
	}

	/* Rewrite TCP/UDP checksum. */
	memcpy(ocksum, &cksum, sizeof(uint16_t));
	return true;
}
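
/*
 * Usage sketch: when translating, the checksum fixup has to happen while
 * the cache still refers to the old address, i.e. before the actual
 * rewrite ("newaddr" and "newport" are hypothetical, in network byte
 * order):
 *
 *	npf_rwrcksum(npc, NPF_SRC, newaddr, newport);
 *	npf_rwrip(npc, NPF_SRC, newaddr);
 *	npf_rwrport(npc, NPF_SRC, newport);
 */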

#if defined(DDB) || defined(_NPF_TESTING)

void
npf_addr_dump(const npf_addr_t *addr)
{
	printf("IP[%x:%x:%x:%x]\n",
	    addr->s6_addr32[0], addr->s6_addr32[1],
	    addr->s6_addr32[2], addr->s6_addr32[3]);
}

#endif