tcp_usrreq.c revision 1.25 1 /* $NetBSD: tcp_usrreq.c,v 1.25 1997/07/28 22:18:48 thorpej Exp $ */
2
3 /*
4 * Copyright (c) 1982, 1986, 1988, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 * must display the following acknowledgement:
17 * This product includes software developed by the University of
18 * California, Berkeley and its contributors.
19 * 4. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
36 */
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/protosw.h>
46 #include <sys/errno.h>
47 #include <sys/stat.h>
48 #include <sys/proc.h>
49 #include <sys/ucred.h>
50
51 #include <vm/vm.h>
52 #include <sys/sysctl.h>
53
54 #include <net/if.h>
55 #include <net/route.h>
56
57 #include <netinet/in.h>
58 #include <netinet/in_systm.h>
59 #include <netinet/in_var.h>
60 #include <netinet/ip.h>
61 #include <netinet/in_pcb.h>
62 #include <netinet/ip_var.h>
63 #include <netinet/tcp.h>
64 #include <netinet/tcp_fsm.h>
65 #include <netinet/tcp_seq.h>
66 #include <netinet/tcp_timer.h>
67 #include <netinet/tcp_var.h>
68 #include <netinet/tcpip.h>
69 #include <netinet/tcp_debug.h>
70
71 /*
72 * TCP protocol interface to socket abstraction.
73 */
74 extern char *tcpstates[];
75
76 /*
77 * Process a TCP user request for TCP tb. If this is a send request
78 * then m is the mbuf chain of send data. If this is a timer expiration
79 * (called from the software clock routine), then timertype tells which timer.
80 */
81 /*ARGSUSED*/
82 int
83 tcp_usrreq(so, req, m, nam, control, p)
84 struct socket *so;
85 int req;
86 struct mbuf *m, *nam, *control;
87 struct proc *p;
88 {
89 register struct inpcb *inp;
90 register struct tcpcb *tp = NULL;
91 int s;
92 int error = 0;
93 int ostate;
94
95 if (req == PRU_CONTROL)
96 return (in_control(so, (long)m, (caddr_t)nam,
97 (struct ifnet *)control, p));
98
99 s = splsoftnet();
100 inp = sotoinpcb(so);
101 #ifdef DIAGNOSTIC
102 if (req != PRU_SEND && req != PRU_SENDOOB && control)
103 panic("tcp_usrreq: unexpected control mbuf");
104 #endif
105 /*
106 * When a TCP is attached to a socket, then there will be
107 * a (struct inpcb) pointed at by the socket, and this
108 * structure will point at a subsidary (struct tcpcb).
109 */
110 if (inp == 0 && req != PRU_ATTACH) {
111 error = EINVAL;
112 goto release;
113 }
114 if (inp) {
115 tp = intotcpcb(inp);
116 /* WHAT IF TP IS 0? */
117 #ifdef KPROF
118 tcp_acounts[tp->t_state][req]++;
119 #endif
120 ostate = tp->t_state;
121 } else
122 ostate = 0;
123
124 switch (req) {
125
126 /*
127 * TCP attaches to socket via PRU_ATTACH, reserving space,
128 * and an internet control block.
129 */
130 case PRU_ATTACH:
131 if (inp != 0) {
132 error = EISCONN;
133 break;
134 }
135 error = tcp_attach(so);
136 if (error)
137 break;
138 if ((so->so_options & SO_LINGER) && so->so_linger == 0)
139 so->so_linger = TCP_LINGERTIME * hz;
140 tp = sototcpcb(so);
141 break;
142
143 /*
144 * PRU_DETACH detaches the TCP protocol from the socket.
145 */
146 case PRU_DETACH:
147 tp = tcp_disconnect(tp);
148 break;
149
150 /*
151 * Give the socket an address.
152 */
153 case PRU_BIND:
154 error = in_pcbbind(inp, nam, p);
155 break;
156
157 /*
158 * Prepare to accept connections.
159 */
160 case PRU_LISTEN:
161 if (inp->inp_lport == 0) {
162 error = in_pcbbind(inp, (struct mbuf *)0,
163 (struct proc *)0);
164 if (error)
165 break;
166 }
167 tp->t_state = TCPS_LISTEN;
168 break;
169
170 /*
171 * Initiate connection to peer.
172 * Create a template for use in transmissions on this connection.
173 * Enter SYN_SENT state, and mark socket as connecting.
174 * Start keep-alive timer, and seed output sequence space.
175 * Send initial segment on connection.
176 */
177 case PRU_CONNECT:
178 if (inp->inp_lport == 0) {
179 error = in_pcbbind(inp, (struct mbuf *)0,
180 (struct proc *)0);
181 if (error)
182 break;
183 }
184 error = in_pcbconnect(inp, nam);
185 if (error)
186 break;
187 tp->t_template = tcp_template(tp);
188 if (tp->t_template == 0) {
189 in_pcbdisconnect(inp);
190 error = ENOBUFS;
191 break;
192 }
193 /* Compute window scaling to request. */
194 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
195 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
196 tp->request_r_scale++;
197 soisconnecting(so);
198 tcpstat.tcps_connattempt++;
199 tp->t_state = TCPS_SYN_SENT;
200 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
201 tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
202 tcp_sendseqinit(tp);
203 error = tcp_output(tp);
204 break;
205
206 /*
207 * Create a TCP connection between two sockets.
208 */
209 case PRU_CONNECT2:
210 error = EOPNOTSUPP;
211 break;
212
213 /*
214 * Initiate disconnect from peer.
215 * If connection never passed embryonic stage, just drop;
216 * else if don't need to let data drain, then can just drop anyways,
217 * else have to begin TCP shutdown process: mark socket disconnecting,
218 * drain unread data, state switch to reflect user close, and
219 * send segment (e.g. FIN) to peer. Socket will be really disconnected
220 * when peer sends FIN and acks ours.
221 *
222 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
223 */
224 case PRU_DISCONNECT:
225 tp = tcp_disconnect(tp);
226 break;
227
228 /*
229 * Accept a connection. Essentially all the work is
230 * done at higher levels; just return the address
231 * of the peer, storing through addr.
232 */
233 case PRU_ACCEPT:
234 in_setpeeraddr(inp, nam);
235 break;
236
237 /*
238 * Mark the connection as being incapable of further output.
239 */
240 case PRU_SHUTDOWN:
241 socantsendmore(so);
242 tp = tcp_usrclosed(tp);
243 if (tp)
244 error = tcp_output(tp);
245 break;
246
247 /*
248 * After a receive, possibly send window update to peer.
249 */
250 case PRU_RCVD:
251 (void) tcp_output(tp);
252 break;
253
254 /*
255 * Do a send by putting data in output queue and updating urgent
256 * marker if URG set. Possibly send more data.
257 */
258 case PRU_SEND:
259 if (control && control->m_len) {
260 m_freem(control);
261 m_freem(m);
262 error = EINVAL;
263 break;
264 }
265 sbappend(&so->so_snd, m);
266 error = tcp_output(tp);
267 break;
268
269 /*
270 * Abort the TCP.
271 */
272 case PRU_ABORT:
273 tp = tcp_drop(tp, ECONNABORTED);
274 break;
275
276 case PRU_SENSE:
277 /*
278 * stat: don't bother with a blocksize.
279 */
280 splx(s);
281 return (0);
282
283 case PRU_RCVOOB:
284 if (control && control->m_len) {
285 m_freem(control);
286 m_freem(m);
287 error = EINVAL;
288 break;
289 }
290 if ((so->so_oobmark == 0 &&
291 (so->so_state & SS_RCVATMARK) == 0) ||
292 so->so_options & SO_OOBINLINE ||
293 tp->t_oobflags & TCPOOB_HADDATA) {
294 error = EINVAL;
295 break;
296 }
297 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
298 error = EWOULDBLOCK;
299 break;
300 }
301 m->m_len = 1;
302 *mtod(m, caddr_t) = tp->t_iobc;
303 if (((long)nam & MSG_PEEK) == 0)
304 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
305 break;
306
307 case PRU_SENDOOB:
308 if (sbspace(&so->so_snd) < -512) {
309 m_freem(m);
310 error = ENOBUFS;
311 break;
312 }
313 /*
314 * According to RFC961 (Assigned Protocols),
315 * the urgent pointer points to the last octet
316 * of urgent data. We continue, however,
317 * to consider it to indicate the first octet
318 * of data past the urgent section.
319 * Otherwise, snd_up should be one lower.
320 */
321 sbappend(&so->so_snd, m);
322 tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
323 tp->t_force = 1;
324 error = tcp_output(tp);
325 tp->t_force = 0;
326 break;
327
328 case PRU_SOCKADDR:
329 in_setsockaddr(inp, nam);
330 break;
331
332 case PRU_PEERADDR:
333 in_setpeeraddr(inp, nam);
334 break;
335
336 /*
337 * TCP slow timer went off; going through this
338 * routine for tracing's sake.
339 */
340 case PRU_SLOWTIMO:
341 tp = tcp_timers(tp, (long)nam);
342 req |= (long)nam << 8; /* for debug's sake */
343 break;
344
345 default:
346 panic("tcp_usrreq");
347 }
348 if (tp && (so->so_options & SO_DEBUG))
349 tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req);
350
351 release:
352 splx(s);
353 return (error);
354 }
355
356 int
357 tcp_ctloutput(op, so, level, optname, mp)
358 int op;
359 struct socket *so;
360 int level, optname;
361 struct mbuf **mp;
362 {
363 int error = 0, s;
364 struct inpcb *inp;
365 register struct tcpcb *tp;
366 register struct mbuf *m;
367 register int i;
368
369 s = splsoftnet();
370 inp = sotoinpcb(so);
371 if (inp == NULL) {
372 splx(s);
373 if (op == PRCO_SETOPT && *mp)
374 (void) m_free(*mp);
375 return (ECONNRESET);
376 }
377 if (level != IPPROTO_TCP) {
378 error = ip_ctloutput(op, so, level, optname, mp);
379 splx(s);
380 return (error);
381 }
382 tp = intotcpcb(inp);
383
384 switch (op) {
385
386 case PRCO_SETOPT:
387 m = *mp;
388 switch (optname) {
389
390 case TCP_NODELAY:
391 if (m == NULL || m->m_len < sizeof (int))
392 error = EINVAL;
393 else if (*mtod(m, int *))
394 tp->t_flags |= TF_NODELAY;
395 else
396 tp->t_flags &= ~TF_NODELAY;
397 break;
398
399 case TCP_MAXSEG:
400 if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg)
401 tp->t_maxseg = i;
402 else
403 error = EINVAL;
404 break;
405
406 default:
407 error = ENOPROTOOPT;
408 break;
409 }
410 if (m)
411 (void) m_free(m);
412 break;
413
414 case PRCO_GETOPT:
415 *mp = m = m_get(M_WAIT, MT_SOOPTS);
416 m->m_len = sizeof(int);
417
418 switch (optname) {
419 case TCP_NODELAY:
420 *mtod(m, int *) = tp->t_flags & TF_NODELAY;
421 break;
422 case TCP_MAXSEG:
423 *mtod(m, int *) = tp->t_maxseg;
424 break;
425 default:
426 error = ENOPROTOOPT;
427 break;
428 }
429 break;
430 }
431 splx(s);
432 return (error);
433 }
434
/*
 * Default send and receive buffer sizes, in bytes.  tcp_attach() passes
 * them to soreserve() and tcp_sysctl() exposes them for tuning.  Either
 * default may be overridden at build time by predefining the macro.
 *
 * The macro bodies are parenthesized and carry no trailing semicolon so
 * they expand safely inside any expression.
 */
#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	(1024*16)
#endif
int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	(1024*16)
#endif
int	tcp_recvspace = TCP_RECVSPACE;
443
444 /*
445 * Attach TCP protocol to socket, allocating
446 * internet protocol control block, tcp control block,
447 * bufer space, and entering LISTEN state if to accept connections.
448 */
449 int
450 tcp_attach(so)
451 struct socket *so;
452 {
453 register struct tcpcb *tp;
454 struct inpcb *inp;
455 int error;
456
457 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
458 error = soreserve(so, tcp_sendspace, tcp_recvspace);
459 if (error)
460 return (error);
461 }
462 error = in_pcballoc(so, &tcbtable);
463 if (error)
464 return (error);
465 inp = sotoinpcb(so);
466 tp = tcp_newtcpcb(inp);
467 if (tp == 0) {
468 int nofd = so->so_state & SS_NOFDREF; /* XXX */
469
470 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */
471 in_pcbdetach(inp);
472 so->so_state |= nofd;
473 return (ENOBUFS);
474 }
475 tp->t_state = TCPS_CLOSED;
476 return (0);
477 }
478
479 /*
480 * Initiate (or continue) disconnect.
481 * If embryonic state, just send reset (once).
482 * If in ``let data drain'' option and linger null, just drop.
483 * Otherwise (hard), mark socket disconnecting and drop
484 * current input data; switch states based on user close, and
485 * send segment to peer (with FIN).
486 */
487 struct tcpcb *
488 tcp_disconnect(tp)
489 register struct tcpcb *tp;
490 {
491 struct socket *so = tp->t_inpcb->inp_socket;
492
493 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
494 tp = tcp_close(tp);
495 else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
496 tp = tcp_drop(tp, 0);
497 else {
498 soisdisconnecting(so);
499 sbflush(&so->so_rcv);
500 tp = tcp_usrclosed(tp);
501 if (tp)
502 (void) tcp_output(tp);
503 }
504 return (tp);
505 }
506
507 /*
508 * User issued close, and wish to trail through shutdown states:
509 * if never received SYN, just forget it. If got a SYN from peer,
510 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
511 * If already got a FIN from peer, then almost done; go to LAST_ACK
512 * state. In all other cases, have already sent FIN to peer (e.g.
513 * after PRU_SHUTDOWN), and just have to play tedious game waiting
514 * for peer to send FIN or not respond to keep-alives, etc.
515 * We can let the user exit from the close as soon as the FIN is acked.
516 */
517 struct tcpcb *
518 tcp_usrclosed(tp)
519 register struct tcpcb *tp;
520 {
521
522 switch (tp->t_state) {
523
524 case TCPS_CLOSED:
525 case TCPS_LISTEN:
526 case TCPS_SYN_SENT:
527 tp->t_state = TCPS_CLOSED;
528 tp = tcp_close(tp);
529 break;
530
531 case TCPS_SYN_RECEIVED:
532 case TCPS_ESTABLISHED:
533 tp->t_state = TCPS_FIN_WAIT_1;
534 break;
535
536 case TCPS_CLOSE_WAIT:
537 tp->t_state = TCPS_LAST_ACK;
538 break;
539 }
540 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
541 soisdisconnected(tp->t_inpcb->inp_socket);
542 /*
543 * If we are in FIN_WAIT_2, we arrived here because the
544 * application did a shutdown of the send side. Like the
545 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
546 * a full close, we start a timer to make sure sockets are
547 * not left in FIN_WAIT_2 forever.
548 */
549 if (tp->t_state == TCPS_FIN_WAIT_2)
550 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
551 }
552 return (tp);
553 }
554
555 /*
556 * Sysctl for tcp variables.
557 */
558 int
559 tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
560 int *name;
561 u_int namelen;
562 void *oldp;
563 size_t *oldlenp;
564 void *newp;
565 size_t newlen;
566 {
567
568 /* All sysctl names at this level are terminal. */
569 if (namelen != 1)
570 return (ENOTDIR);
571
572 switch (name[0]) {
573 case TCPCTL_RFC1323:
574 return (sysctl_int(oldp, oldlenp, newp, newlen,
575 &tcp_do_rfc1323));
576 case TCPCTL_SENDSPACE:
577 return (sysctl_int(oldp, oldlenp, newp, newlen,
578 &tcp_sendspace));
579 case TCPCTL_RECVSPACE:
580 return (sysctl_int(oldp, oldlenp, newp, newlen,
581 &tcp_recvspace));
582 case TCPCTL_MSSDFLT:
583 return (sysctl_int(oldp, oldlenp, newp, newlen,
584 &tcp_mssdflt));
585 case TCPCTL_SYN_CACHE_LIMIT:
586 return (sysctl_int(oldp, oldlenp, newp, newlen,
587 &tcp_syn_cache_limit));
588 case TCPCTL_SYN_BUCKET_LIMIT:
589 return (sysctl_int(oldp, oldlenp, newp, newlen,
590 &tcp_syn_bucket_limit));
591 case TCPCTL_SYN_CACHE_INTER:
592 return (sysctl_int(oldp, oldlenp, newp, newlen,
593 &tcp_syn_cache_interval));
594 default:
595 return (ENOPROTOOPT);
596 }
597 /* NOTREACHED */
598 }
599