npf_state_tcp.c revision 1.1 1 /* $NetBSD: npf_state_tcp.c,v 1.1 2011/11/29 20:05:30 rmind Exp $ */
2
3 /*-
4 * Copyright (c) 2010-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This material is based upon work partially supported by The
8 * NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * NPF TCP state engine for connection tracking.
34 */
35
36 #include <sys/cdefs.h>
37 __KERNEL_RCSID(0, "$NetBSD: npf_state_tcp.c,v 1.1 2011/11/29 20:05:30 rmind Exp $");
38
39 #include <sys/param.h>
40 #include <sys/types.h>
41
42 #ifndef _KERNEL
43 #include <stdio.h>
44 #include <stdbool.h>
45 #include <inttypes.h>
46 #endif
47 #include <netinet/in.h>
48 #include <netinet/tcp.h>
49 #include <netinet/tcp_seq.h>
50
51 #include "npf_impl.h"
52
/*
 * Test hook: when built with _NPF_TESTING, every packet which reaches the
 * boundary checks is reported through npf_state_sample().  Otherwise the
 * hook expands to nothing.
 */
#ifdef _NPF_TESTING
void	npf_state_sample(npf_state_t *);
#define	NPF_TCP_STATE_SAMPLE(nst)	npf_state_sample(nst)
#else
#define	NPF_TCP_STATE_SAMPLE(nst)
#endif
59
/*
 * NPF TCP states.  Note: these states are different from the TCP FSM
 * states of RFC 793.  Mind that packet filter is a man-in-the-middle.
 *
 * NPF_TCPS_OK is a pseudo-state used only as a transition table value:
 * it means "packet accepted, keep the current state" and is never stored.
 * The remaining values index the transition and timeout tables.
 */
#define	NPF_TCPS_OK		(-1)
#define	NPF_TCPS_CLOSED		0
#define	NPF_TCPS_SYN_SENT	1
#define	NPF_TCPS_SIMSYN_SENT	2
#define	NPF_TCPS_SYN_RECEIVED	3
#define	NPF_TCPS_ESTABLISHED	4
#define	NPF_TCPS_FIN_SEEN	5
#define	NPF_TCPS_CLOSE_WAIT	6
#define	NPF_TCPS_FIN_WAIT	7
#define	NPF_TCPS_CLOSING	8
#define	NPF_TCPS_LAST_ACK	9
#define	NPF_TCPS_TIME_WAIT	10

/* Number of real states, i.e. NPF_TCPS_CLOSED .. NPF_TCPS_TIME_WAIT. */
#define	NPF_TCP_NSTATES		11
78
79 /*
80 * TCP connection timeout table (in seconds).
81 */
82 static const u_int npf_tcp_timeouts[] __read_mostly = {
83 /* Closed, timeout nearly immediately. */
84 [NPF_TCPS_CLOSED] = 10,
85 /* Unsynchronised states. */
86 [NPF_TCPS_SYN_SENT] = 30,
87 [NPF_TCPS_SIMSYN_SENT] = 30,
88 [NPF_TCPS_SYN_RECEIVED] = 60,
89 /* Established, timeout: 24 hours. */
90 [NPF_TCPS_ESTABLISHED] = 60 * 60 * 24,
91 /* Closure cases, timeout: 4 minutes (2 * MSL). */
92 [NPF_TCPS_FIN_SEEN] = 60 * 2 * 2,
93 [NPF_TCPS_CLOSE_WAIT] = 60 * 2 * 2,
94 [NPF_TCPS_FIN_WAIT] = 60 * 2 * 2,
95 [NPF_TCPS_CLOSING] = 30,
96 [NPF_TCPS_LAST_ACK] = 30,
97 [NPF_TCPS_TIME_WAIT] = 60 * 2 * 2,
98 };
99
/*
 * Bound on the distance between an ACK number and the recorded sequence
 * end of the acknowledged side, used by the acknowledgment boundary check.
 */
#define	NPF_TCP_MAXACKWIN	66000

/* TCP flag combinations used to index the transition table. */
#define	TH_SYNACK		(TH_SYN | TH_ACK)
#define	TH_FINACK		(TH_FIN | TH_ACK)
#define	TH_STATE_MASK		(TH_SYNACK | TH_FIN)
105
106 /*
107 * NPF transition table of a tracked TCP connection.
108 *
109 * There is a single state, which is changed in the following way:
110 *
111 * new_state = npf_tcp_fsm[old_state][direction][tcp_flags & TH_STATE_MASK];
112 *
113 * Note that this state is different from the state in each end (host).
114 */
115
116 static const int npf_tcp_fsm[NPF_TCP_NSTATES][2][TH_STATE_MASK + 1]
117 __read_mostly = {
118 [NPF_TCPS_CLOSED] = {
119 [NPF_FLOW_FORW] = {
120 /* Handshake (1): initial SYN. */
121 [TH_SYN] = NPF_TCPS_SYN_SENT,
122 },
123 },
124 [NPF_TCPS_SYN_SENT] = {
125 [NPF_FLOW_FORW] = {
126 /* SYN may be retransmitted. */
127 [TH_SYN] = NPF_TCPS_OK,
128 },
129 [NPF_FLOW_BACK] = {
130 /* Handshake (2): SYN-ACK is expected. */
131 [TH_SYNACK] = NPF_TCPS_SYN_RECEIVED,
132 /* Simultaneous initiation - SYN. */
133 [TH_SYN] = NPF_TCPS_SIMSYN_SENT,
134 },
135 },
136 [NPF_TCPS_SIMSYN_SENT] = {
137 [NPF_FLOW_FORW] = {
138 /* Original SYN re-transmission. */
139 [TH_SYN] = NPF_TCPS_OK,
140 /* SYN-ACK response to simultaneous SYN. */
141 [TH_SYNACK] = NPF_TCPS_SYN_RECEIVED,
142 },
143 [NPF_FLOW_BACK] = {
144 /* Simultaneous SYN re-transmission.*/
145 [TH_SYN] = NPF_TCPS_OK,
146 /* SYN-ACK response to original SYN. */
147 [TH_SYNACK] = NPF_TCPS_SYN_RECEIVED,
148 /* FIN may be sent at this point. */
149 [TH_FIN] = NPF_TCPS_FIN_SEEN,
150 [TH_FINACK] = NPF_TCPS_FIN_SEEN,
151 },
152 },
153 [NPF_TCPS_SYN_RECEIVED] = {
154 [NPF_FLOW_FORW] = {
155 /* Handshake (3): ACK is expected. */
156 [TH_ACK] = NPF_TCPS_ESTABLISHED,
157 [TH_FIN] = NPF_TCPS_CLOSING,
158 [TH_FINACK] = NPF_TCPS_CLOSING,
159 },
160 [NPF_FLOW_BACK] = {
161 /* SYN-ACK may be retransmitted. */
162 [TH_SYNACK] = NPF_TCPS_OK,
163 /* XXX: ACK of late SYN in simultaneous case? */
164 [TH_ACK] = NPF_TCPS_OK,
165 /* XXX: Can this happen?
166 [TH_FIN] = NPF_TCPS_CLOSING, */
167 },
168 },
169 [NPF_TCPS_ESTABLISHED] = {
170 /*
171 * Regular ACKs (data exchange) or FIN.
172 * FIN packets may have ACK set.
173 */
174 [NPF_FLOW_FORW] = {
175 [TH_ACK] = NPF_TCPS_OK,
176 /* FIN by the sender. */
177 [TH_FIN] = NPF_TCPS_FIN_SEEN,
178 [TH_FINACK] = NPF_TCPS_FIN_SEEN,
179 },
180 [NPF_FLOW_BACK] = {
181 [TH_ACK] = NPF_TCPS_OK,
182 /* FIN by the receiver. */
183 [TH_FIN] = NPF_TCPS_FIN_SEEN,
184 [TH_FINACK] = NPF_TCPS_FIN_SEEN,
185 },
186 },
187 [NPF_TCPS_FIN_SEEN] = {
188 /*
189 * FIN was seen. If ACK only, connection is half-closed now,
190 * need to determine which end is closed (sender or receiver).
191 * However, both FIN and FIN-ACK may race here - in which
192 * case we are closing immediately.
193 */
194 [NPF_FLOW_FORW] = {
195 [TH_ACK] = NPF_TCPS_CLOSE_WAIT,
196 [TH_FIN] = NPF_TCPS_CLOSING,
197 [TH_FINACK] = NPF_TCPS_CLOSING,
198 },
199 [NPF_FLOW_BACK] = {
200 [TH_ACK] = NPF_TCPS_FIN_WAIT,
201 [TH_FIN] = NPF_TCPS_CLOSING,
202 [TH_FINACK] = NPF_TCPS_CLOSING,
203 },
204 },
205 [NPF_TCPS_CLOSE_WAIT] = {
206 /* Sender has sent the FIN and closed its end. */
207 [NPF_FLOW_FORW] = {
208 [TH_ACK] = NPF_TCPS_OK,
209 [TH_FIN] = NPF_TCPS_LAST_ACK,
210 [TH_FINACK] = NPF_TCPS_LAST_ACK,
211 },
212 [NPF_FLOW_BACK] = {
213 [TH_ACK] = NPF_TCPS_OK,
214 [TH_FIN] = NPF_TCPS_LAST_ACK,
215 [TH_FINACK] = NPF_TCPS_LAST_ACK,
216 },
217 },
218 [NPF_TCPS_FIN_WAIT] = {
219 /* Receiver has closed its end. */
220 [NPF_FLOW_FORW] = {
221 [TH_ACK] = NPF_TCPS_OK,
222 [TH_FIN] = NPF_TCPS_LAST_ACK,
223 [TH_FINACK] = NPF_TCPS_LAST_ACK,
224 },
225 [NPF_FLOW_BACK] = {
226 [TH_ACK] = NPF_TCPS_OK,
227 [TH_FIN] = NPF_TCPS_LAST_ACK,
228 [TH_FINACK] = NPF_TCPS_LAST_ACK,
229 },
230 },
231 [NPF_TCPS_CLOSING] = {
232 /* Race of FINs - expecting ACK. */
233 [NPF_FLOW_FORW] = {
234 [TH_ACK] = NPF_TCPS_LAST_ACK,
235 },
236 [NPF_FLOW_BACK] = {
237 [TH_ACK] = NPF_TCPS_LAST_ACK,
238 },
239 },
240 [NPF_TCPS_LAST_ACK] = {
241 /* FINs exchanged - expecting last ACK. */
242 [NPF_FLOW_FORW] = {
243 [TH_ACK] = NPF_TCPS_TIME_WAIT,
244 },
245 [NPF_FLOW_BACK] = {
246 [TH_ACK] = NPF_TCPS_TIME_WAIT,
247 },
248 },
249 [NPF_TCPS_TIME_WAIT] = {
250 /* May re-open the connection as per RFC 1122. */
251 [NPF_FLOW_FORW] = {
252 [TH_SYN] = NPF_TCPS_SYN_SENT,
253 },
254 },
255 };
256
257 /*
258 * npf_tcp_inwindow: determine whether the packet is in the TCP window
259 * and thus part of the connection we are tracking.
260 */
261 static bool
262 npf_tcp_inwindow(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst,
263 const int di)
264 {
265 const struct tcphdr * const th = &npc->npc_l4.tcp;
266 const int tcpfl = th->th_flags;
267 npf_tcpstate_t *fstate, *tstate;
268 int tcpdlen, wscale, ackskew;
269 tcp_seq seq, ack, end;
270 uint32_t win;
271
272 KASSERT(npf_iscached(npc, NPC_TCP));
273 KASSERT(di == NPF_FLOW_FORW || di == NPF_FLOW_BACK);
274
275 /*
276 * Perform SEQ/ACK numbers check against boundaries. Reference:
277 *
278 * Rooij G., "Real stateful TCP packet filtering in IP Filter",
279 * 10th USENIX Security Symposium invited talk, Aug. 2001.
280 *
281 * There four boundaries are defined as following:
282 * I) SEQ + LEN <= MAX { SND.ACK + MAX(SND.WIN, 1) }
283 * II) SEQ >= MAX { SND.SEQ + SND.LEN }
284 * III) ACK <= MAX { RCV.SEQ + RCV.LEN }
285 * IV) ACK >= MAX { RCV.SEQ + RCV.LEN } - MAXACKWIN
286 *
287 * Let these members of npf_tcpstate_t be the maximum seen values of:
288 * nst_end - SEQ + LEN
289 * nst_maxend - ACK + MAX(WIN, 1)
290 * nst_maxwin - MAX(WIN, 1)
291 */
292
293 tcpdlen = npf_tcpsaw(__UNCONST(npc), &seq, &ack, &win);
294 end = seq + tcpdlen;
295 if (tcpfl & TH_SYN) {
296 end++;
297 }
298 if (tcpfl & TH_FIN) {
299 end++;
300 }
301
302 fstate = &nst->nst_tcpst[di];
303 tstate = &nst->nst_tcpst[!di];
304 win = win ? (win << fstate->nst_wscale) : 1;
305
306 /*
307 * Initialise if the first packet.
308 * Note: only case when nst_maxwin is zero.
309 */
310 if (__predict_false(fstate->nst_maxwin == 0)) {
311 /*
312 * Should be first SYN or re-transmission of SYN. State of
313 * other side will get set with a SYN-ACK reply (see below).
314 */
315 fstate->nst_end = end;
316 fstate->nst_maxend = end;
317 fstate->nst_maxwin = win;
318 tstate->nst_end = 0;
319 tstate->nst_maxend = 0;
320 tstate->nst_maxwin = 1;
321
322 /*
323 * Handle TCP Window Scaling (RFC 1323). Both sides may
324 * send this option in their SYN packets.
325 */
326 if (npf_fetch_tcpopts(npc, nbuf, NULL, &wscale)) {
327 fstate->nst_wscale = wscale;
328 } else {
329 fstate->nst_wscale = 0;
330 }
331 tstate->nst_wscale = 0;
332
333 /* Done. */
334 return true;
335 }
336 if (fstate->nst_end == 0) {
337 /*
338 * Should be a SYN-ACK reply to SYN. If SYN is not set,
339 * then we are in the middle of connection and lost tracking.
340 */
341 fstate->nst_end = end;
342 fstate->nst_maxend = end + 1;
343 fstate->nst_maxwin = win;
344
345 /* Handle TCP Window Scaling (must be ignored if no SYN). */
346 if (tcpfl & TH_SYN) {
347 fstate->nst_wscale =
348 npf_fetch_tcpopts(npc, nbuf, NULL, &wscale) ?
349 wscale : 0;
350 }
351 }
352 if ((tcpfl & TH_ACK) == 0) {
353 /* Pretend that an ACK was sent. */
354 ack = tstate->nst_end;
355 } else if ((tcpfl & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST) && ack == 0) {
356 /* Workaround for some TCP stacks. */
357 ack = tstate->nst_end;
358 }
359 if (seq == end) {
360 /* If packet contains no data - assume it is valid. */
361 end = fstate->nst_end;
362 seq = end;
363 }
364
365 NPF_TCP_STATE_SAMPLE(nst);
366 #if 0
367 /* Strict in-order sequence for RST packets. */
368 if (((tcpfl & TH_RST) != 0) && (fstate->nst_end - seq) > 1) {
369 return false;
370 }
371 #endif
372 /*
373 * Determine whether the data is within previously noted window,
374 * that is, upper boundary for valid data (I).
375 */
376 if (!SEQ_LEQ(end, fstate->nst_maxend)) {
377 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP1);
378 return false;
379 }
380
381 /* Lower boundary (II), which is no more than one window back. */
382 if (!SEQ_GEQ(seq, fstate->nst_end - tstate->nst_maxwin)) {
383 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP2);
384 return false;
385 }
386
387 /*
388 * Boundaries for valid acknowledgments (III, IV) - on predicted
389 * window up or down, since packets may be fragmented.
390 */
391 ackskew = tstate->nst_end - ack;
392 if (ackskew < -NPF_TCP_MAXACKWIN ||
393 ackskew > (NPF_TCP_MAXACKWIN << fstate->nst_wscale)) {
394 npf_stats_inc(NPF_STAT_INVALID_STATE_TCP3);
395 return false;
396 }
397
398 /*
399 * Packet has been passed.
400 *
401 * Negative ackskew might be due to fragmented packets. Since the
402 * total length of the packet is unknown - bump the boundary.
403 */
404 if (ackskew < 0) {
405 tstate->nst_end = end;
406 }
407 /* Keep track of the maximum window seen. */
408 if (fstate->nst_maxwin < win) {
409 fstate->nst_maxwin = win;
410 }
411 if (SEQ_GT(end, fstate->nst_end)) {
412 fstate->nst_end = end;
413 }
414 /* Note the window for upper boundary. */
415 if (SEQ_GEQ(ack + win, tstate->nst_maxend)) {
416 tstate->nst_maxend = ack + win;
417 }
418 return true;
419 }
420
421 bool
422 npf_state_tcp(const npf_cache_t *npc, nbuf_t *nbuf, npf_state_t *nst, int di)
423 {
424 const struct tcphdr * const th = &npc->npc_l4.tcp;
425 const int tcpfl = th->th_flags, state = nst->nst_state;
426 int nstate;
427
428 /* Look for a transition to a new state. */
429 if (__predict_true((tcpfl & TH_RST) == 0)) {
430 nstate = npf_tcp_fsm[state][di][tcpfl & TH_STATE_MASK];
431 } else if (state == NPF_TCPS_TIME_WAIT) {
432 /* Prevent TIME-WAIT assassination (RFC 1337). */
433 nstate = NPF_TCPS_OK;
434 } else {
435 nstate = NPF_TCPS_CLOSED;
436 }
437 /* Determine whether TCP packet really belongs to this connection. */
438 if (!npf_tcp_inwindow(npc, nbuf, nst, di)) {
439 return false;
440 }
441 if (__predict_true(nstate == NPF_TCPS_OK)) {
442 return true;
443 }
444 nst->nst_state = nstate;
445 return true;
446 }
447
448 int
449 npf_state_tcp_timeout(const npf_state_t *nst)
450 {
451 const u_int state = nst->nst_state;
452
453 KASSERT(state < NPF_TCP_NSTATES);
454 return npf_tcp_timeouts[state];
455 }
456