Home | History | Annotate | Line # | Download | only in npf
npf_state.c revision 1.3
      1 /*	$NetBSD: npf_state.c,v 1.3 2011/01/18 20:33:46 rmind Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2010 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This material is based upon work partially supported by The
      8  * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 /*
     33  * NPF state engine to track connections.
     34  */
     35 
     36 #include <sys/cdefs.h>
     37 __KERNEL_RCSID(0, "$NetBSD: npf_state.c,v 1.3 2011/01/18 20:33:46 rmind Exp $");
     38 
     39 #include <sys/param.h>
     40 #include <sys/systm.h>
     41 
     42 #include <sys/mutex.h>
     43 #include <netinet/in.h>
     44 #include <netinet/tcp.h>
     45 #include <netinet/tcp_seq.h>
     46 #include <netinet/tcp_fsm.h>
     47 
     48 #include "npf_impl.h"
     49 
     50 /* TCP session expiration table. */
     51 static const u_int tcp_expire_table[ ] __read_mostly = {
     52 	/* Initial synchronisation.  Timeout: 30 sec and 1 minute. */
     53 	[TCPS_SYN_SENT]		= 30,
     54 	[TCPS_SYN_RECEIVED]	= 60,
     55 	/* Established (synchronised).  Timeout: 24 hours. */
     56 	[TCPS_ESTABLISHED]	= 60 * 60 * 24,
     57 	[TCPS_FIN_WAIT_1]	= 60 * 60 * 24,
     58 	[TCPS_FIN_WAIT_2]	= 60 * 60 * 24,
     59 	/* UNUSED [TCPS_CLOSE_WAIT]	= 60 * 60 * 24, */
     60 	/* Closure.  Timeout: 4 minutes (2 * MSL). */
     61 	[TCPS_CLOSING]		= 60 * 4,
     62 	[TCPS_LAST_ACK]		= 60 * 4,
     63 	[TCPS_TIME_WAIT]	= 60 * 4,
     64 	/* Fully closed.  Timeout immediately. */
     65 	[TCPS_CLOSED]		= 0
     66 };
     67 
     68 /* Session expiration table. */
     69 static const u_int expire_table[ ] __read_mostly = {
     70 	[IPPROTO_UDP]		= 60,		/* 1 min */
     71 	[IPPROTO_ICMP]		= 30		/* 30 sec */
     72 };
     73 
     74 #define	MAXACKWINDOW		66000
     75 
     76 static bool
     77 npf_tcp_inwindow(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst,
     78     const bool forw)
     79 {
     80 	const struct tcphdr * const th = &npc->npc_l4.tcp;
     81 	const int tcpfl = th->th_flags;
     82 	npf_tcpstate_t *fstate, *tstate;
     83 	int tcpdlen, wscale, ackskew;
     84 	tcp_seq seq, ack, end;
     85 	uint32_t win;
     86 
     87 	KASSERT(npf_iscached(npc, NPC_TCP));
     88 	tcpdlen = npf_tcpsaw(__UNCONST(npc), &seq, &ack, &win);
     89 	end = seq + tcpdlen;
     90 	if (tcpfl & TH_SYN) {
     91 		end++;
     92 	}
     93 	if (tcpfl & TH_FIN) {
     94 		end++;
     95 	}
     96 
     97 	/*
     98 	 * Perform SEQ/ACK numbers check against boundaries.  Reference:
     99 	 *
    100 	 *	Rooij G., "Real stateful TCP packet filtering in IP Filter",
    101 	 *	10th USENIX Security Symposium invited talk, Aug. 2001.
    102 	 */
    103 
    104 	fstate = &nst->nst_tcpst[forw ? 0 : 1];
    105 	tstate = &nst->nst_tcpst[forw ? 1 : 0];
    106 	win = win ? (win << fstate->nst_wscale) : 1;
    107 
    108 	if (tcpfl == TH_SYN) {
    109 		/*
    110 		 * First SYN or re-transmission of SYN.  Initialize all
    111 		 * values.  State of other side will get set with a SYN-ACK
    112 		 * reply (see below).
    113 		 */
    114 		fstate->nst_seqend = end;
    115 		fstate->nst_ackend = end;
    116 		fstate->nst_maxwin = win;
    117 		tstate->nst_ackend = 0;
    118 		tstate->nst_ackend = 0;
    119 		tstate->nst_maxwin = 0;
    120 		/*
    121 		 * Handle TCP Window Scaling (RFC 1323).  Both sides may
    122 		 * send this option in their SYN packets.
    123 		 */
    124 		if (npf_fetch_tcpopts(npc, nbuf, NULL, &wscale)) {
    125 			fstate->nst_wscale = wscale;
    126 		} else {
    127 			fstate->nst_wscale = 0;
    128 		}
    129 		tstate->nst_wscale = 0;
    130 		/* Done. */
    131 		return true;
    132 	}
    133 	if (fstate->nst_seqend == 0) {
    134 		/*
    135 		 * Should be a SYN-ACK reply to SYN.  If SYN is not set,
    136 		 * then we are in the middle connection and lost tracking.
    137 		 */
    138 		fstate->nst_seqend = end;
    139 		fstate->nst_ackend = end + 1;
    140 		fstate->nst_maxwin = 1;
    141 
    142 		/* Handle TCP Window Scaling (must be ignored if no SYN). */
    143 		if (tcpfl & TH_SYN) {
    144 			fstate->nst_wscale =
    145 			    npf_fetch_tcpopts(npc, nbuf, NULL, &wscale) ?
    146 			    wscale : 0;
    147 		}
    148 	}
    149 	if ((tcpfl & TH_ACK) == 0) {
    150 		/* Pretend that an ACK was sent. */
    151 		ack = tstate->nst_seqend;
    152 	} else if ((tcpfl & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST) && ack == 0) {
    153 		/* Workaround for some TCP stacks. */
    154 		ack = tstate->nst_seqend;
    155 	}
    156 	if (seq == end) {
    157 		/* If packet contains no data - assume it is valid. */
    158 		end = fstate->nst_seqend;
    159 		seq = end;
    160 	}
    161 
    162 	/*
    163 	 * Determine whether the data is within previously noted window,
    164 	 * that is, upper boundary for valid data (I).
    165 	 */
    166 	if (!SEQ_GEQ(fstate->nst_ackend, end)) {
    167 		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP1);
    168 		return false;
    169 	}
    170 	/* Lower boundary (II), which is no more than one window back. */
    171 	if (!SEQ_GEQ(seq, fstate->nst_seqend - tstate->nst_maxwin)) {
    172 		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP2);
    173 		return false;
    174 	}
    175 	/*
    176 	 * Boundaries for valid acknowledgments (III, IV) - on predicted
    177 	 * window up or down, since packets may be fragmented.
    178 	 */
    179 	ackskew = tstate->nst_seqend - ack;
    180 	if (ackskew < -MAXACKWINDOW || ackskew > MAXACKWINDOW) {
    181 		npf_stats_inc(NPF_STAT_INVALID_STATE_TCP3);
    182 		return false;
    183 	}
    184 
    185 	/*
    186 	 * Packet is passed now.
    187 	 *
    188 	 * Negative ackskew might be due to fragmented packets.  Since the
    189 	 * total length of the packet is unknown - bump the boundary.
    190 	 */
    191 	if (ackskew < 0) {
    192 		tstate->nst_seqend = end;
    193 	}
    194 	/* Keep track of the maximum window seen. */
    195 	if (fstate->nst_maxwin < win) {
    196 		fstate->nst_maxwin = win;
    197 	}
    198 	if (SEQ_GT(end, fstate->nst_seqend)) {
    199 		fstate->nst_seqend = end;
    200 	}
    201 	/* Note the window for upper boundary. */
    202 	if (SEQ_GEQ(ack + win, tstate->nst_ackend)) {
    203 		tstate->nst_ackend = ack + win;
    204 	}
    205 	return true;
    206 }
    207 
    208 static inline bool
    209 npf_state_tcp(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst,
    210     const bool forw)
    211 {
    212 	const struct tcphdr * const th = &npc->npc_l4.tcp;
    213 	const int tcpfl = th->th_flags, state = nst->nst_state;
    214 #if 0
    215 	/* Determine whether TCP packet really belongs to this connection. */
    216 	if (!npf_tcp_inwindow(npc, nbuf, nst, forw)) {
    217 		return false;
    218 	}
    219 #endif
    220 	/*
    221 	 * Handle 3-way handshake (SYN -> SYN,ACK -> ACK), connection
    222 	 * reset (RST), half-open connections, connection closure, etc.
    223 	 */
    224 	if (__predict_false(tcpfl & TH_RST)) {
    225 		nst->nst_state = TCPS_CLOSED;
    226 		return true;
    227 	}
    228 	switch (state) {
    229 	case TCPS_ESTABLISHED:
    230 	case TCPS_FIN_WAIT_2:
    231 		/* Common case - connection is established. */
    232 		if ((tcpfl & (TH_SYN | TH_ACK | TH_FIN)) == TH_ACK) {
    233 			return true;
    234 		}
    235 		/* Otherwise, can only be a FIN. */
    236 		if ((tcpfl & TH_FIN) == 0) {
    237 			break;
    238 		}
    239 		/* XXX see below TCPS_CLOSE_WAIT */
    240 		if (state != TCPS_FIN_WAIT_2) {
    241 			/* First FIN: closure of one end. */
    242 			nst->nst_state = TCPS_FIN_WAIT_1;
    243 		} else {
    244 			/* Second FIN: connection closure, wait for ACK. */
    245 			nst->nst_state = TCPS_LAST_ACK;
    246 		}
    247 		return true;
    248 	case TCPS_SYN_SENT:
    249 		/* After SYN expecting SYN-ACK. */
    250 		if (tcpfl == (TH_SYN | TH_ACK) && !forw) {
    251 			/* Received backwards SYN-ACK. */
    252 			nst->nst_state = TCPS_SYN_RECEIVED;
    253 			return true;
    254 		}
    255 		if (tcpfl == TH_SYN && forw) {
    256 			/* Re-transmission of SYN. */
    257 			return true;
    258 		}
    259 		break;
    260 	case TCPS_SYN_RECEIVED:
    261 		/* SYN-ACK was seen, expecting ACK. */
    262 		if ((tcpfl & (TH_SYN | TH_ACK | TH_FIN)) == TH_ACK) {
    263 			/* ACK - establish connection. */
    264 			nst->nst_state = TCPS_ESTABLISHED;
    265 			return true;
    266 		}
    267 		if (tcpfl == (TH_SYN | TH_ACK)) {
    268 			/* Re-transmission of SYN-ACK. */
    269 			return true;
    270 		}
    271 		break;
    272 	case TCPS_CLOSE_WAIT:
    273 		/* UNUSED */
    274 	case TCPS_FIN_WAIT_1:
    275 		/*
    276 		 * XXX: FIN re-transmission is not handled, use TCPS_CLOSE_WAIT.
    277 		 */
    278 		/*
    279 		 * First FIN was seen, expecting ACK.  However, we may receive
    280 		 * a simultaneous FIN or exchange of FINs with FIN-ACK.
    281 		 */
    282 		if ((tcpfl & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) {
    283 			/* Exchange of FINs with ACK.  Wait for last ACK. */
    284 			nst->nst_state = TCPS_LAST_ACK;
    285 			return true;
    286 		} else if (tcpfl & TH_ACK) {
    287 			/* ACK of first FIN. */
    288 			nst->nst_state = TCPS_FIN_WAIT_2;
    289 			return true;
    290 		} else if (tcpfl & TH_FIN) {
    291 			/* Simultaneous FIN.  Need to wait for ACKs. */
    292 			nst->nst_state = TCPS_CLOSING;
    293 			return true;
    294 		}
    295 		break;
    296 	case TCPS_CLOSING:
    297 	case TCPS_LAST_ACK:
    298 	case TCPS_TIME_WAIT:
    299 		/* Expecting only ACK. */
    300 		if ((tcpfl & (TH_SYN | TH_ACK | TH_FIN)) != TH_ACK) {
    301 			return false;
    302 		}
    303 		switch (state) {
    304 		case TCPS_CLOSING:
    305 			/* One ACK noted, wait for last one. */
    306 			nst->nst_state = TCPS_LAST_ACK;
    307 			break;
    308 		case TCPS_LAST_ACK:
    309 			/* Last ACK received, quiet wait now. */
    310 			nst->nst_state = TCPS_TIME_WAIT;
    311 			break;
    312 		}
    313 		return true;
    314 	case TCPS_CLOSED:
    315 		/* XXX: Drop or pass? */
    316 		break;
    317 	default:
    318 		npf_state_dump(nst);
    319 		KASSERT(false);
    320 	}
    321 	return false;
    322 }
    323 
    324 bool
    325 npf_state_init(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst)
    326 {
    327 	const int proto = npf_cache_ipproto(npc);
    328 
    329 	KASSERT(npf_iscached(npc, NPC_IP46 | NPC_LAYER4));
    330 
    331 	mutex_init(&nst->nst_lock, MUTEX_DEFAULT, IPL_SOFTNET);
    332 
    333 	if (proto == IPPROTO_TCP) {
    334 		const struct tcphdr *th = &npc->npc_l4.tcp;
    335 
    336 		/* TCP case: must be SYN. */
    337 		KASSERT(npf_iscached(npc, NPC_TCP));
    338 		if (th->th_flags != TH_SYN) {
    339 			npf_stats_inc(NPF_STAT_INVALID_STATE);
    340 			return false;
    341 		}
    342 		/* Initial values for TCP window and sequence tracking. */
    343 		if (!npf_tcp_inwindow(npc, nbuf, nst, true)) {
    344 			npf_stats_inc(NPF_STAT_INVALID_STATE);
    345 			return false;
    346 		}
    347 	}
    348 
    349 	/*
    350 	 * Initial state: SYN sent, waiting for response from the other side.
    351 	 * Note: for UDP or ICMP, reuse SYN-sent flag to note response.
    352 	 */
    353 	nst->nst_state = TCPS_SYN_SENT;
    354 	return true;
    355 }
    356 
    357 void
    358 npf_state_destroy(npf_state_t *nst)
    359 {
    360 
    361 	mutex_destroy(&nst->nst_lock);
    362 }
    363 
    364 bool
    365 npf_state_inspect(const npf_cache_t *npc, nbuf_t *nbuf,
    366     npf_state_t *nst, const bool forw)
    367 {
    368 	const int proto = npf_cache_ipproto(npc);
    369 	bool ret;
    370 
    371 	mutex_enter(&nst->nst_lock);
    372 	switch (proto) {
    373 	case IPPROTO_TCP:
    374 		/* Handle TCP. */
    375 		ret = npf_state_tcp(npc, nbuf, nst, forw);
    376 		break;
    377 	default:
    378 		/*
    379 		 * Handle UDP or ICMP response for opening session.
    380 		 */
    381 		if (nst->nst_state == TCPS_SYN_SENT && !forw) {
    382 			nst->nst_state= TCPS_ESTABLISHED;
    383 		}
    384 		ret = true;
    385 	}
    386 	mutex_exit(&nst->nst_lock);
    387 	if (__predict_false(!ret)) {
    388 		npf_stats_inc(NPF_STAT_INVALID_STATE);
    389 	}
    390 	return ret;
    391 }
    392 
    393 /*
    394  * npf_state_etime: return session expiration time according to the state.
    395  */
    396 int
    397 npf_state_etime(const npf_state_t *nst, const int proto)
    398 {
    399 	const int state = nst->nst_state;
    400 
    401 	if (__predict_true(proto == IPPROTO_TCP)) {
    402 		return tcp_expire_table[state];
    403 	}
    404 	return expire_table[proto];
    405 }
    406 
#if defined(DDB) || defined(_NPF_TESTING)

/*
 * npf_state_dump: print the state value and the TCP tracking data of both
 * directions ("F" is nst_tcpst[0], "T" is nst_tcpst[1]).
 *
 * Only built with DDB or _NPF_TESTING.
 */
void
npf_state_dump(npf_state_t *nst)
{
	npf_tcpstate_t *forw_st = &nst->nst_tcpst[0];
	npf_tcpstate_t *back_st = &nst->nst_tcpst[1];

	printf("\tstate (%p) %d:\n\t\t"
	    "F { seqend %u ackend %u mwin %u wscale %u }\n\t\t"
	    "T { seqend %u ackend %u mwin %u wscale %u }\n",
	    nst, nst->nst_state,
	    forw_st->nst_seqend, forw_st->nst_ackend,
	    forw_st->nst_maxwin, forw_st->nst_wscale,
	    back_st->nst_seqend, back_st->nst_ackend,
	    back_st->nst_maxwin, back_st->nst_wscale
	);
}

#endif
    424