tcp_usrreq.c revision 1.22 1 /* $NetBSD: tcp_usrreq.c,v 1.22 1996/05/23 16:13:19 mycroft Exp $ */
2
3 /*
4 * Copyright (c) 1982, 1986, 1988, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 * must display the following acknowledgement:
17 * This product includes software developed by the University of
18 * California, Berkeley and its contributors.
19 * 4. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
36 */
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/protosw.h>
46 #include <sys/errno.h>
47 #include <sys/stat.h>
48 #include <sys/proc.h>
49 #include <sys/ucred.h>
50
51 #include <vm/vm.h>
52 #include <sys/sysctl.h>
53
54 #include <net/if.h>
55 #include <net/route.h>
56
57 #include <netinet/in.h>
58 #include <netinet/in_systm.h>
59 #include <netinet/in_var.h>
60 #include <netinet/ip.h>
61 #include <netinet/in_pcb.h>
62 #include <netinet/ip_var.h>
63 #include <netinet/tcp.h>
64 #include <netinet/tcp_fsm.h>
65 #include <netinet/tcp_seq.h>
66 #include <netinet/tcp_timer.h>
67 #include <netinet/tcp_var.h>
68 #include <netinet/tcpip.h>
69 #include <netinet/tcp_debug.h>
70
71 /*
72 * TCP protocol interface to socket abstraction.
73 */
74 extern char *tcpstates[];
75
76 /*
77 * Process a TCP user request for TCP tb. If this is a send request
78 * then m is the mbuf chain of send data. If this is a timer expiration
79 * (called from the software clock routine), then timertype tells which timer.
80 */
81 /*ARGSUSED*/
82 int
83 tcp_usrreq(so, req, m, nam, control, p)
84 struct socket *so;
85 int req;
86 struct mbuf *m, *nam, *control;
87 struct proc *p;
88 {
89 register struct inpcb *inp;
90 register struct tcpcb *tp = NULL;
91 int s;
92 int error = 0;
93 int ostate;
94
95 if (req == PRU_CONTROL)
96 return (in_control(so, (long)m, (caddr_t)nam,
97 (struct ifnet *)control, p));
98
99 s = splsoftnet();
100 inp = sotoinpcb(so);
101 if (control && control->m_len) {
102 m_freem(control);
103 error = EINVAL;
104 goto release;
105 }
106 /*
107 * When a TCP is attached to a socket, then there will be
108 * a (struct inpcb) pointed at by the socket, and this
109 * structure will point at a subsidary (struct tcpcb).
110 */
111 if (inp == 0 && req != PRU_ATTACH) {
112 error = EINVAL;
113 goto release;
114 }
115 if (inp) {
116 tp = intotcpcb(inp);
117 /* WHAT IF TP IS 0? */
118 #ifdef KPROF
119 tcp_acounts[tp->t_state][req]++;
120 #endif
121 ostate = tp->t_state;
122 } else
123 ostate = 0;
124
125 switch (req) {
126
127 /*
128 * TCP attaches to socket via PRU_ATTACH, reserving space,
129 * and an internet control block.
130 */
131 case PRU_ATTACH:
132 if (inp != 0) {
133 error = EISCONN;
134 break;
135 }
136 error = tcp_attach(so);
137 if (error)
138 break;
139 if ((so->so_options & SO_LINGER) && so->so_linger == 0)
140 so->so_linger = TCP_LINGERTIME * hz;
141 tp = sototcpcb(so);
142 break;
143
144 /*
145 * PRU_DETACH detaches the TCP protocol from the socket.
146 * If the protocol state is non-embryonic, then can't
147 * do this directly: have to initiate a PRU_DISCONNECT,
148 * which may finish later; embryonic TCB's can just
149 * be discarded here.
150 */
151 case PRU_DETACH:
152 if (tp->t_state > TCPS_LISTEN)
153 tp = tcp_disconnect(tp);
154 else
155 tp = tcp_close(tp);
156 break;
157
158 /*
159 * Give the socket an address.
160 */
161 case PRU_BIND:
162 error = in_pcbbind(inp, nam, p);
163 break;
164
165 /*
166 * Prepare to accept connections.
167 */
168 case PRU_LISTEN:
169 if (inp->inp_lport == 0) {
170 error = in_pcbbind(inp, (struct mbuf *)0,
171 (struct proc *)0);
172 if (error)
173 break;
174 }
175 tp->t_state = TCPS_LISTEN;
176 break;
177
178 /*
179 * Initiate connection to peer.
180 * Create a template for use in transmissions on this connection.
181 * Enter SYN_SENT state, and mark socket as connecting.
182 * Start keep-alive timer, and seed output sequence space.
183 * Send initial segment on connection.
184 */
185 case PRU_CONNECT:
186 if (inp->inp_lport == 0) {
187 error = in_pcbbind(inp, (struct mbuf *)0,
188 (struct proc *)0);
189 if (error)
190 break;
191 }
192 error = in_pcbconnect(inp, nam);
193 if (error)
194 break;
195 tp->t_template = tcp_template(tp);
196 if (tp->t_template == 0) {
197 in_pcbdisconnect(inp);
198 error = ENOBUFS;
199 break;
200 }
201 /* Compute window scaling to request. */
202 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
203 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
204 tp->request_r_scale++;
205 soisconnecting(so);
206 tcpstat.tcps_connattempt++;
207 tp->t_state = TCPS_SYN_SENT;
208 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
209 tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
210 tcp_sendseqinit(tp);
211 error = tcp_output(tp);
212 break;
213
214 /*
215 * Create a TCP connection between two sockets.
216 */
217 case PRU_CONNECT2:
218 error = EOPNOTSUPP;
219 break;
220
221 /*
222 * Initiate disconnect from peer.
223 * If connection never passed embryonic stage, just drop;
224 * else if don't need to let data drain, then can just drop anyways,
225 * else have to begin TCP shutdown process: mark socket disconnecting,
226 * drain unread data, state switch to reflect user close, and
227 * send segment (e.g. FIN) to peer. Socket will be really disconnected
228 * when peer sends FIN and acks ours.
229 *
230 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
231 */
232 case PRU_DISCONNECT:
233 tp = tcp_disconnect(tp);
234 break;
235
236 /*
237 * Accept a connection. Essentially all the work is
238 * done at higher levels; just return the address
239 * of the peer, storing through addr.
240 */
241 case PRU_ACCEPT:
242 in_setpeeraddr(inp, nam);
243 break;
244
245 /*
246 * Mark the connection as being incapable of further output.
247 */
248 case PRU_SHUTDOWN:
249 socantsendmore(so);
250 tp = tcp_usrclosed(tp);
251 if (tp)
252 error = tcp_output(tp);
253 break;
254
255 /*
256 * After a receive, possibly send window update to peer.
257 */
258 case PRU_RCVD:
259 (void) tcp_output(tp);
260 break;
261
262 /*
263 * Do a send by putting data in output queue and updating urgent
264 * marker if URG set. Possibly send more data.
265 */
266 case PRU_SEND:
267 sbappend(&so->so_snd, m);
268 error = tcp_output(tp);
269 break;
270
271 /*
272 * Abort the TCP.
273 */
274 case PRU_ABORT:
275 tp = tcp_drop(tp, ECONNABORTED);
276 break;
277
278 case PRU_SENSE:
279 /*
280 * stat: don't bother with a blocksize.
281 */
282 splx(s);
283 return (0);
284
285 case PRU_RCVOOB:
286 if ((so->so_oobmark == 0 &&
287 (so->so_state & SS_RCVATMARK) == 0) ||
288 so->so_options & SO_OOBINLINE ||
289 tp->t_oobflags & TCPOOB_HADDATA) {
290 error = EINVAL;
291 break;
292 }
293 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
294 error = EWOULDBLOCK;
295 break;
296 }
297 m->m_len = 1;
298 *mtod(m, caddr_t) = tp->t_iobc;
299 if (((long)nam & MSG_PEEK) == 0)
300 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
301 break;
302
303 case PRU_SENDOOB:
304 if (sbspace(&so->so_snd) < -512) {
305 m_freem(m);
306 error = ENOBUFS;
307 break;
308 }
309 /*
310 * According to RFC961 (Assigned Protocols),
311 * the urgent pointer points to the last octet
312 * of urgent data. We continue, however,
313 * to consider it to indicate the first octet
314 * of data past the urgent section.
315 * Otherwise, snd_up should be one lower.
316 */
317 sbappend(&so->so_snd, m);
318 tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
319 tp->t_force = 1;
320 error = tcp_output(tp);
321 tp->t_force = 0;
322 break;
323
324 case PRU_SOCKADDR:
325 in_setsockaddr(inp, nam);
326 break;
327
328 case PRU_PEERADDR:
329 in_setpeeraddr(inp, nam);
330 break;
331
332 /*
333 * TCP slow timer went off; going through this
334 * routine for tracing's sake.
335 */
336 case PRU_SLOWTIMO:
337 tp = tcp_timers(tp, (long)nam);
338 req |= (long)nam << 8; /* for debug's sake */
339 break;
340
341 default:
342 panic("tcp_usrreq");
343 }
344 if (tp && (so->so_options & SO_DEBUG))
345 tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req);
346
347 release:
348 splx(s);
349 return (error);
350 }
351
352 int
353 tcp_ctloutput(op, so, level, optname, mp)
354 int op;
355 struct socket *so;
356 int level, optname;
357 struct mbuf **mp;
358 {
359 int error = 0, s;
360 struct inpcb *inp;
361 register struct tcpcb *tp;
362 register struct mbuf *m;
363 register int i;
364
365 s = splsoftnet();
366 inp = sotoinpcb(so);
367 if (inp == NULL) {
368 splx(s);
369 if (op == PRCO_SETOPT && *mp)
370 (void) m_free(*mp);
371 return (ECONNRESET);
372 }
373 if (level != IPPROTO_TCP) {
374 error = ip_ctloutput(op, so, level, optname, mp);
375 splx(s);
376 return (error);
377 }
378 tp = intotcpcb(inp);
379
380 switch (op) {
381
382 case PRCO_SETOPT:
383 m = *mp;
384 switch (optname) {
385
386 case TCP_NODELAY:
387 if (m == NULL || m->m_len < sizeof (int))
388 error = EINVAL;
389 else if (*mtod(m, int *))
390 tp->t_flags |= TF_NODELAY;
391 else
392 tp->t_flags &= ~TF_NODELAY;
393 break;
394
395 case TCP_MAXSEG:
396 if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg)
397 tp->t_maxseg = i;
398 else
399 error = EINVAL;
400 break;
401
402 default:
403 error = ENOPROTOOPT;
404 break;
405 }
406 if (m)
407 (void) m_free(m);
408 break;
409
410 case PRCO_GETOPT:
411 *mp = m = m_get(M_WAIT, MT_SOOPTS);
412 m->m_len = sizeof(int);
413
414 switch (optname) {
415 case TCP_NODELAY:
416 *mtod(m, int *) = tp->t_flags & TF_NODELAY;
417 break;
418 case TCP_MAXSEG:
419 *mtod(m, int *) = tp->t_maxseg;
420 break;
421 default:
422 error = ENOPROTOOPT;
423 break;
424 }
425 break;
426 }
427 splx(s);
428 return (error);
429 }
430
431 #ifndef TCP_SENDSPACE
432 #define TCP_SENDSPACE 1024*16;
433 #endif
434 u_long tcp_sendspace = TCP_SENDSPACE;
435 #ifndef TCP_RECVSPACE
436 #define TCP_RECVSPACE 1024*16;
437 #endif
438 u_long tcp_recvspace = TCP_RECVSPACE;
439
440 /*
441 * Attach TCP protocol to socket, allocating
442 * internet protocol control block, tcp control block,
443 * bufer space, and entering LISTEN state if to accept connections.
444 */
445 int
446 tcp_attach(so)
447 struct socket *so;
448 {
449 register struct tcpcb *tp;
450 struct inpcb *inp;
451 int error;
452
453 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
454 error = soreserve(so, tcp_sendspace, tcp_recvspace);
455 if (error)
456 return (error);
457 }
458 error = in_pcballoc(so, &tcbtable);
459 if (error)
460 return (error);
461 inp = sotoinpcb(so);
462 tp = tcp_newtcpcb(inp);
463 if (tp == 0) {
464 int nofd = so->so_state & SS_NOFDREF; /* XXX */
465
466 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */
467 in_pcbdetach(inp);
468 so->so_state |= nofd;
469 return (ENOBUFS);
470 }
471 tp->t_state = TCPS_CLOSED;
472 return (0);
473 }
474
475 /*
476 * Initiate (or continue) disconnect.
477 * If embryonic state, just send reset (once).
478 * If in ``let data drain'' option and linger null, just drop.
479 * Otherwise (hard), mark socket disconnecting and drop
480 * current input data; switch states based on user close, and
481 * send segment to peer (with FIN).
482 */
483 struct tcpcb *
484 tcp_disconnect(tp)
485 register struct tcpcb *tp;
486 {
487 struct socket *so = tp->t_inpcb->inp_socket;
488
489 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
490 tp = tcp_close(tp);
491 else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
492 tp = tcp_drop(tp, 0);
493 else {
494 soisdisconnecting(so);
495 sbflush(&so->so_rcv);
496 tp = tcp_usrclosed(tp);
497 if (tp)
498 (void) tcp_output(tp);
499 }
500 return (tp);
501 }
502
503 /*
504 * User issued close, and wish to trail through shutdown states:
505 * if never received SYN, just forget it. If got a SYN from peer,
506 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
507 * If already got a FIN from peer, then almost done; go to LAST_ACK
508 * state. In all other cases, have already sent FIN to peer (e.g.
509 * after PRU_SHUTDOWN), and just have to play tedious game waiting
510 * for peer to send FIN or not respond to keep-alives, etc.
511 * We can let the user exit from the close as soon as the FIN is acked.
512 */
513 struct tcpcb *
514 tcp_usrclosed(tp)
515 register struct tcpcb *tp;
516 {
517
518 switch (tp->t_state) {
519
520 case TCPS_CLOSED:
521 case TCPS_LISTEN:
522 case TCPS_SYN_SENT:
523 tp->t_state = TCPS_CLOSED;
524 tp = tcp_close(tp);
525 break;
526
527 case TCPS_SYN_RECEIVED:
528 case TCPS_ESTABLISHED:
529 tp->t_state = TCPS_FIN_WAIT_1;
530 break;
531
532 case TCPS_CLOSE_WAIT:
533 tp->t_state = TCPS_LAST_ACK;
534 break;
535 }
536 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
537 soisdisconnected(tp->t_inpcb->inp_socket);
538 /*
539 * If we are in FIN_WAIT_2, we arrived here because the
540 * application did a shutdown of the send side. Like the
541 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
542 * a full close, we start a timer to make sure sockets are
543 * not left in FIN_WAIT_2 forever.
544 */
545 if (tp->t_state == TCPS_FIN_WAIT_2)
546 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
547 }
548 return (tp);
549 }
550
551 /*
552 * Sysctl for tcp variables.
553 */
554 int
555 tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
556 int *name;
557 u_int namelen;
558 void *oldp;
559 size_t *oldlenp;
560 void *newp;
561 size_t newlen;
562 {
563
564 /* All sysctl names at this level are terminal. */
565 if (namelen != 1)
566 return (ENOTDIR);
567
568 switch (name[0]) {
569 case TCPCTL_RFC1323:
570 return (sysctl_int(oldp, oldlenp, newp, newlen,
571 &tcp_do_rfc1323));
572
573 default:
574 return (ENOPROTOOPT);
575 }
576 /* NOTREACHED */
577 }
578