/*	$NetBSD: tcp_usrreq.c,v 1.17.2.1 1996/02/02 06:12:58 mycroft Exp $	*/
2
3 /*
4 * Copyright (c) 1982, 1986, 1988, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 * must display the following acknowledgement:
17 * This product includes software developed by the University of
18 * California, Berkeley and its contributors.
19 * 4. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
36 */
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/protosw.h>
46 #include <sys/errno.h>
47 #include <sys/stat.h>
48
49 #include <net/if.h>
50 #include <net/route.h>
51
52 #include <netinet/in.h>
53 #include <netinet/in_systm.h>
54 #include <netinet/in_var.h>
55 #include <netinet/ip.h>
56 #include <netinet/in_pcb.h>
57 #include <netinet/ip_var.h>
58 #include <netinet/tcp.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_timer.h>
62 #include <netinet/tcp_var.h>
63 #include <netinet/tcpip.h>
64 #include <netinet/tcp_debug.h>
65
66 /*
67 * TCP protocol interface to socket abstraction.
68 */
69 extern char *tcpstates[];
70
71 /*
72 * Process a TCP user request for TCP tb. If this is a send request
73 * then m is the mbuf chain of send data. If this is a timer expiration
74 * (called from the software clock routine), then timertype tells which timer.
75 */
76 /*ARGSUSED*/
77 int
78 tcp_usrreq(so, req, m, nam, control)
79 struct socket *so;
80 int req;
81 struct mbuf *m, *nam, *control;
82 {
83 register struct inpcb *inp;
84 register struct tcpcb *tp;
85 int s;
86 int error = 0;
87 int ostate;
88
89 if (req == PRU_CONTROL)
90 return (in_control(so, (long)m, (caddr_t)nam,
91 (struct ifnet *)control));
92 if (control && control->m_len) {
93 m_freem(control);
94 if (m)
95 m_freem(m);
96 return (EINVAL);
97 }
98
99 s = splsoftnet();
100 inp = sotoinpcb(so);
101 /*
102 * When a TCP is attached to a socket, then there will be
103 * a (struct inpcb) pointed at by the socket, and this
104 * structure will point at a subsidary (struct tcpcb).
105 */
106 if (inp == 0 && req != PRU_ATTACH) {
107 splx(s);
108 return (EINVAL); /* XXX */
109 }
110 if (inp) {
111 tp = intotcpcb(inp);
112 /* WHAT IF TP IS 0? */
113 #ifdef KPROF
114 tcp_acounts[tp->t_state][req]++;
115 #endif
116 ostate = tp->t_state;
117 } else
118 ostate = 0;
119 switch (req) {
120
121 /*
122 * TCP attaches to socket via PRU_ATTACH, reserving space,
123 * and an internet control block.
124 */
125 case PRU_ATTACH:
126 if (inp) {
127 error = EISCONN;
128 break;
129 }
130 error = tcp_attach(so);
131 if (error)
132 break;
133 if ((so->so_options & SO_LINGER) && so->so_linger == 0)
134 so->so_linger = TCP_LINGERTIME * hz;
135 tp = sototcpcb(so);
136 break;
137
138 /*
139 * PRU_DETACH detaches the TCP protocol from the socket.
140 * If the protocol state is non-embryonic, then can't
141 * do this directly: have to initiate a PRU_DISCONNECT,
142 * which may finish later; embryonic TCB's can just
143 * be discarded here.
144 */
145 case PRU_DETACH:
146 if (tp->t_state > TCPS_LISTEN)
147 tp = tcp_disconnect(tp);
148 else
149 tp = tcp_close(tp);
150 break;
151
152 /*
153 * Give the socket an address.
154 */
155 case PRU_BIND:
156 error = in_pcbbind(inp, nam);
157 if (error)
158 break;
159 break;
160
161 /*
162 * Prepare to accept connections.
163 */
164 case PRU_LISTEN:
165 if (inp->inp_lport == 0)
166 error = in_pcbbind(inp, (struct mbuf *)0);
167 if (error == 0)
168 tp->t_state = TCPS_LISTEN;
169 break;
170
171 /*
172 * Initiate connection to peer.
173 * Create a template for use in transmissions on this connection.
174 * Enter SYN_SENT state, and mark socket as connecting.
175 * Start keep-alive timer, and seed output sequence space.
176 * Send initial segment on connection.
177 */
178 case PRU_CONNECT:
179 if (inp->inp_lport == 0) {
180 error = in_pcbbind(inp, (struct mbuf *)0);
181 if (error)
182 break;
183 }
184 error = in_pcbconnect(inp, nam);
185 if (error)
186 break;
187 tp->t_template = tcp_template(tp);
188 if (tp->t_template == 0) {
189 in_pcbdisconnect(inp);
190 error = ENOBUFS;
191 break;
192 }
193 /* Compute window scaling to request. */
194 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
195 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
196 tp->request_r_scale++;
197 soisconnecting(so);
198 tcpstat.tcps_connattempt++;
199 tp->t_state = TCPS_SYN_SENT;
200 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT;
201 tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2;
202 tcp_sendseqinit(tp);
203 error = tcp_output(tp);
204 break;
205
206 /*
207 * Create a TCP connection between two sockets.
208 */
209 case PRU_CONNECT2:
210 error = EOPNOTSUPP;
211 break;
212
213 /*
214 * Initiate disconnect from peer.
215 * If connection never passed embryonic stage, just drop;
216 * else if don't need to let data drain, then can just drop anyways,
217 * else have to begin TCP shutdown process: mark socket disconnecting,
218 * drain unread data, state switch to reflect user close, and
219 * send segment (e.g. FIN) to peer. Socket will be really disconnected
220 * when peer sends FIN and acks ours.
221 *
222 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
223 */
224 case PRU_DISCONNECT:
225 tp = tcp_disconnect(tp);
226 break;
227
228 /*
229 * Accept a connection. Essentially all the work is
230 * done at higher levels; just return the address
231 * of the peer, storing through addr.
232 */
233 case PRU_ACCEPT:
234 in_setpeeraddr(inp, nam);
235 break;
236
237 /*
238 * Mark the connection as being incapable of further output.
239 */
240 case PRU_SHUTDOWN:
241 socantsendmore(so);
242 tp = tcp_usrclosed(tp);
243 if (tp)
244 error = tcp_output(tp);
245 break;
246
247 /*
248 * After a receive, possibly send window update to peer.
249 */
250 case PRU_RCVD:
251 (void) tcp_output(tp);
252 break;
253
254 /*
255 * Do a send by putting data in output queue and updating urgent
256 * marker if URG set. Possibly send more data.
257 */
258 case PRU_SEND:
259 sbappend(&so->so_snd, m);
260 error = tcp_output(tp);
261 break;
262
263 /*
264 * Abort the TCP.
265 */
266 case PRU_ABORT:
267 tp = tcp_drop(tp, ECONNABORTED);
268 break;
269
270 case PRU_SENSE:
271 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
272 (void) splx(s);
273 return (0);
274
275 case PRU_RCVOOB:
276 if ((so->so_oobmark == 0 &&
277 (so->so_state & SS_RCVATMARK) == 0) ||
278 so->so_options & SO_OOBINLINE ||
279 tp->t_oobflags & TCPOOB_HADDATA) {
280 error = EINVAL;
281 break;
282 }
283 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
284 error = EWOULDBLOCK;
285 break;
286 }
287 m->m_len = 1;
288 *mtod(m, caddr_t) = tp->t_iobc;
289 if (((long)nam & MSG_PEEK) == 0)
290 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
291 break;
292
293 case PRU_SENDOOB:
294 if (sbspace(&so->so_snd) < -512) {
295 m_freem(m);
296 error = ENOBUFS;
297 break;
298 }
299 /*
300 * According to RFC961 (Assigned Protocols),
301 * the urgent pointer points to the last octet
302 * of urgent data. We continue, however,
303 * to consider it to indicate the first octet
304 * of data past the urgent section.
305 * Otherwise, snd_up should be one lower.
306 */
307 sbappend(&so->so_snd, m);
308 tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
309 tp->t_force = 1;
310 error = tcp_output(tp);
311 tp->t_force = 0;
312 break;
313
314 case PRU_SOCKADDR:
315 in_setsockaddr(inp, nam);
316 break;
317
318 case PRU_PEERADDR:
319 in_setpeeraddr(inp, nam);
320 break;
321
322 /*
323 * TCP slow timer went off; going through this
324 * routine for tracing's sake.
325 */
326 case PRU_SLOWTIMO:
327 tp = tcp_timers(tp, (long)nam);
328 req |= (long)nam << 8; /* for debug's sake */
329 break;
330
331 default:
332 panic("tcp_usrreq");
333 }
334 if (tp && (so->so_options & SO_DEBUG))
335 tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req);
336 splx(s);
337 return (error);
338 }
339
340 int
341 tcp_ctloutput(op, so, level, optname, mp)
342 int op;
343 struct socket *so;
344 int level, optname;
345 struct mbuf **mp;
346 {
347 int error = 0, s;
348 struct inpcb *inp;
349 register struct tcpcb *tp;
350 register struct mbuf *m;
351 register int i;
352
353 s = splsoftnet();
354 inp = sotoinpcb(so);
355 if (inp == NULL) {
356 splx(s);
357 if (op == PRCO_SETOPT && *mp)
358 (void) m_free(*mp);
359 return (ECONNRESET);
360 }
361 if (level != IPPROTO_TCP) {
362 error = ip_ctloutput(op, so, level, optname, mp);
363 splx(s);
364 return (error);
365 }
366 tp = intotcpcb(inp);
367
368 switch (op) {
369
370 case PRCO_SETOPT:
371 m = *mp;
372 switch (optname) {
373
374 case TCP_NODELAY:
375 if (m == NULL || m->m_len < sizeof (int))
376 error = EINVAL;
377 else if (*mtod(m, int *))
378 tp->t_flags |= TF_NODELAY;
379 else
380 tp->t_flags &= ~TF_NODELAY;
381 break;
382
383 case TCP_MAXSEG:
384 if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg)
385 tp->t_maxseg = i;
386 else
387 error = EINVAL;
388 break;
389
390 default:
391 error = ENOPROTOOPT;
392 break;
393 }
394 if (m)
395 (void) m_free(m);
396 break;
397
398 case PRCO_GETOPT:
399 *mp = m = m_get(M_WAIT, MT_SOOPTS);
400 m->m_len = sizeof(int);
401
402 switch (optname) {
403 case TCP_NODELAY:
404 *mtod(m, int *) = tp->t_flags & TF_NODELAY;
405 break;
406 case TCP_MAXSEG:
407 *mtod(m, int *) = tp->t_maxseg;
408 break;
409 default:
410 error = ENOPROTOOPT;
411 break;
412 }
413 break;
414 }
415 splx(s);
416 return (error);
417 }
418
/*
 * Default send and receive buffer reservations, in bytes, handed to
 * soreserve() by tcp_attach().  The defaults can be overridden at build
 * time by predefining TCP_SENDSPACE / TCP_RECVSPACE.  The macros are
 * parenthesized and carry no trailing semicolon so they are usable
 * inside any expression.
 */
#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	(1024*16)
#endif
u_long	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	(1024*16)
#endif
u_long	tcp_recvspace = TCP_RECVSPACE;
427
428 /*
429 * Attach TCP protocol to socket, allocating
430 * internet protocol control block, tcp control block,
431 * bufer space, and entering LISTEN state if to accept connections.
432 */
433 int
434 tcp_attach(so)
435 struct socket *so;
436 {
437 register struct tcpcb *tp;
438 struct inpcb *inp;
439 int error;
440
441 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
442 error = soreserve(so, tcp_sendspace, tcp_recvspace);
443 if (error)
444 return (error);
445 }
446 error = in_pcballoc(so, &tcbtable);
447 if (error)
448 return (error);
449 inp = sotoinpcb(so);
450 tp = tcp_newtcpcb(inp);
451 if (tp == 0) {
452 int nofd = so->so_state & SS_NOFDREF; /* XXX */
453
454 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */
455 in_pcbdetach(inp);
456 so->so_state |= nofd;
457 return (ENOBUFS);
458 }
459 tp->t_state = TCPS_CLOSED;
460 return (0);
461 }
462
463 /*
464 * Initiate (or continue) disconnect.
465 * If embryonic state, just send reset (once).
466 * If in ``let data drain'' option and linger null, just drop.
467 * Otherwise (hard), mark socket disconnecting and drop
468 * current input data; switch states based on user close, and
469 * send segment to peer (with FIN).
470 */
471 struct tcpcb *
472 tcp_disconnect(tp)
473 register struct tcpcb *tp;
474 {
475 struct socket *so = tp->t_inpcb->inp_socket;
476
477 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
478 tp = tcp_close(tp);
479 else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
480 tp = tcp_drop(tp, 0);
481 else {
482 soisdisconnecting(so);
483 sbflush(&so->so_rcv);
484 tp = tcp_usrclosed(tp);
485 if (tp)
486 (void) tcp_output(tp);
487 }
488 return (tp);
489 }
490
491 /*
492 * User issued close, and wish to trail through shutdown states:
493 * if never received SYN, just forget it. If got a SYN from peer,
494 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
495 * If already got a FIN from peer, then almost done; go to LAST_ACK
496 * state. In all other cases, have already sent FIN to peer (e.g.
497 * after PRU_SHUTDOWN), and just have to play tedious game waiting
498 * for peer to send FIN or not respond to keep-alives, etc.
499 * We can let the user exit from the close as soon as the FIN is acked.
500 */
501 struct tcpcb *
502 tcp_usrclosed(tp)
503 register struct tcpcb *tp;
504 {
505
506 switch (tp->t_state) {
507
508 case TCPS_CLOSED:
509 case TCPS_LISTEN:
510 case TCPS_SYN_SENT:
511 tp->t_state = TCPS_CLOSED;
512 tp = tcp_close(tp);
513 break;
514
515 case TCPS_SYN_RECEIVED:
516 case TCPS_ESTABLISHED:
517 tp->t_state = TCPS_FIN_WAIT_1;
518 break;
519
520 case TCPS_CLOSE_WAIT:
521 tp->t_state = TCPS_LAST_ACK;
522 break;
523 }
524 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
525 soisdisconnected(tp->t_inpcb->inp_socket);
526 /*
527 * If we are in FIN_WAIT_2, we arrived here because the
528 * application did a shutdown of the send side. Like the
529 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
530 * a full close, we start a timer to make sure sockets are
531 * not left in FIN_WAIT_2 forever.
532 */
533 if (tp->t_state == TCPS_FIN_WAIT_2)
534 tp->t_timer[TCPT_2MSL] = tcp_maxidle;
535 }
536 return (tp);
537 }
538
539 /*
540 * Sysctl for tcp variables.
541 */
542 int
543 tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
544 int *name;
545 u_int namelen;
546 void *oldp;
547 size_t *oldlenp;
548 void *newp;
549 size_t newlen;
550 {
551
552 /* All sysctl names at this level are terminal. */
553 if (namelen != 1)
554 return (ENOTDIR);
555
556 switch (name[0]) {
557 case TCPCTL_RFC1323:
558 return (sysctl_int(oldp, oldlenp, newp, newlen,
559 &tcp_do_rfc1323));
560
561 default:
562 return (ENOPROTOOPT);
563 }
564 /* NOTREACHED */
565 }
566