tcp_subr.c revision 1.27 1 /* $NetBSD: tcp_subr.c,v 1.27 1997/07/23 21:26:51 thorpej Exp $ */
2
3 /*
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 * must display the following acknowledgement:
17 * This product includes software developed by the University of
18 * California, Berkeley and its contributors.
19 * 4. Neither the name of the University nor the names of its contributors
20 * may be used to endorse or promote products derived from this software
21 * without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 * @(#)tcp_subr.c 8.1 (Berkeley) 6/10/93
36 */
37
38 #include <sys/param.h>
39 #include <sys/proc.h>
40 #include <sys/systm.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/protosw.h>
46 #include <sys/errno.h>
47 #include <sys/kernel.h>
48
49 #include <net/route.h>
50 #include <net/if.h>
51
52 #include <netinet/in.h>
53 #include <netinet/in_systm.h>
54 #include <netinet/ip.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/ip_var.h>
57 #include <netinet/ip_icmp.h>
58 #include <netinet/tcp.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_timer.h>
62 #include <netinet/tcp_var.h>
63 #include <netinet/tcpip.h>
64
65 /* patchable/settable parameters for tcp */
66 int tcp_mssdflt = TCP_MSS;
67 int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
68 int tcp_do_rfc1323 = 1;
69
70 #ifndef TCBHASHSIZE
71 #define TCBHASHSIZE 128
72 #endif
73 int tcbhashsize = TCBHASHSIZE;
74
75 /*
76 * Tcp initialization
77 */
78 void
79 tcp_init()
80 {
81
82 tcp_iss = 1; /* XXX wrong */
83 in_pcbinit(&tcbtable, tcbhashsize, tcbhashsize);
84 if (max_protohdr < sizeof(struct tcpiphdr))
85 max_protohdr = sizeof(struct tcpiphdr);
86 if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN)
87 panic("tcp_init");
88 }
89
90 /*
91 * Create template to be used to send tcp packets on a connection.
92 * Call after host entry created, allocates an mbuf and fills
93 * in a skeletal tcp/ip header, minimizing the amount of work
94 * necessary when the connection is used.
95 */
96 struct tcpiphdr *
97 tcp_template(tp)
98 struct tcpcb *tp;
99 {
100 register struct inpcb *inp = tp->t_inpcb;
101 register struct tcpiphdr *n;
102
103 if ((n = tp->t_template) == 0) {
104 MALLOC(n, struct tcpiphdr *, sizeof (struct tcpiphdr),
105 M_MBUF, M_NOWAIT);
106 if (n == NULL)
107 return (0);
108 }
109 bzero(n->ti_x1, sizeof n->ti_x1);
110 n->ti_pr = IPPROTO_TCP;
111 n->ti_len = htons(sizeof (struct tcpiphdr) - sizeof (struct ip));
112 n->ti_src = inp->inp_laddr;
113 n->ti_dst = inp->inp_faddr;
114 n->ti_sport = inp->inp_lport;
115 n->ti_dport = inp->inp_fport;
116 n->ti_seq = 0;
117 n->ti_ack = 0;
118 n->ti_x2 = 0;
119 n->ti_off = 5;
120 n->ti_flags = 0;
121 n->ti_win = 0;
122 n->ti_sum = 0;
123 n->ti_urp = 0;
124 return (n);
125 }
126
127 /*
128 * Send a single message to the TCP at address specified by
129 * the given TCP/IP header. If m == 0, then we make a copy
130 * of the tcpiphdr at ti and send directly to the addressed host.
131 * This is used to force keep alive messages out using the TCP
132 * template for a connection tp->t_template. If flags are given
133 * then we send a message back to the TCP which originated the
134 * segment ti, and discard the mbuf containing it and any other
135 * attached mbufs.
136 *
137 * In any case the ack and sequence number of the transmitted
138 * segment are as specified by the parameters.
139 */
140 int
141 tcp_respond(tp, ti, m, ack, seq, flags)
142 struct tcpcb *tp;
143 register struct tcpiphdr *ti;
144 register struct mbuf *m;
145 tcp_seq ack, seq;
146 int flags;
147 {
148 register int tlen;
149 int win = 0;
150 struct route *ro = 0;
151
152 if (tp) {
153 win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
154 ro = &tp->t_inpcb->inp_route;
155 }
156 if (m == 0) {
157 m = m_gethdr(M_DONTWAIT, MT_HEADER);
158 if (m == NULL)
159 return (ENOBUFS);
160 #ifdef TCP_COMPAT_42
161 tlen = 1;
162 #else
163 tlen = 0;
164 #endif
165 m->m_data += max_linkhdr;
166 *mtod(m, struct tcpiphdr *) = *ti;
167 ti = mtod(m, struct tcpiphdr *);
168 flags = TH_ACK;
169 } else {
170 m_freem(m->m_next);
171 m->m_next = 0;
172 m->m_data = (caddr_t)ti;
173 m->m_len = sizeof (struct tcpiphdr);
174 tlen = 0;
175 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
176 xchg(ti->ti_dst.s_addr, ti->ti_src.s_addr, u_int32_t);
177 xchg(ti->ti_dport, ti->ti_sport, u_int16_t);
178 #undef xchg
179 }
180 bzero(ti->ti_x1, sizeof ti->ti_x1);
181 ti->ti_seq = htonl(seq);
182 ti->ti_ack = htonl(ack);
183 ti->ti_x2 = 0;
184 if ((flags & TH_SYN) == 0) {
185 if (tp)
186 ti->ti_win = htons((u_int16_t) (win >> tp->rcv_scale));
187 else
188 ti->ti_win = htons((u_int16_t)win);
189 ti->ti_off = sizeof (struct tcphdr) >> 2;
190 tlen += sizeof (struct tcphdr);
191 } else
192 tlen += ti->ti_off << 2;
193 ti->ti_len = htons((u_int16_t)tlen);
194 tlen += sizeof (struct ip);
195 m->m_len = tlen;
196 m->m_pkthdr.len = tlen;
197 m->m_pkthdr.rcvif = (struct ifnet *) 0;
198 ti->ti_flags = flags;
199 ti->ti_urp = 0;
200 ti->ti_sum = 0;
201 ti->ti_sum = in_cksum(m, tlen);
202 ((struct ip *)ti)->ip_len = tlen;
203 ((struct ip *)ti)->ip_ttl = ip_defttl;
204 return ip_output(m, NULL, ro, 0, NULL);
205 }
206
207 /*
208 * Create a new TCP control block, making an
209 * empty reassembly queue and hooking it to the argument
210 * protocol control block.
211 */
212 struct tcpcb *
213 tcp_newtcpcb(inp)
214 struct inpcb *inp;
215 {
216 register struct tcpcb *tp;
217
218 tp = malloc(sizeof(*tp), M_PCB, M_NOWAIT);
219 if (tp == NULL)
220 return ((struct tcpcb *)0);
221 bzero((caddr_t)tp, sizeof(struct tcpcb));
222 LIST_INIT(&tp->segq);
223 tp->t_maxseg = tcp_mssdflt;
224
225 tp->t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
226 tp->t_inpcb = inp;
227 /*
228 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
229 * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
230 * reasonable initial retransmit time.
231 */
232 tp->t_srtt = TCPTV_SRTTBASE;
233 tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1);
234 tp->t_rttmin = TCPTV_MIN;
235 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
236 TCPTV_MIN, TCPTV_REXMTMAX);
237 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
238 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
239 inp->inp_ip.ip_ttl = ip_defttl;
240 inp->inp_ppcb = (caddr_t)tp;
241 return (tp);
242 }
243
244 /*
245 * Drop a TCP connection, reporting
246 * the specified error. If connection is synchronized,
247 * then send a RST to peer.
248 */
249 struct tcpcb *
250 tcp_drop(tp, errno)
251 register struct tcpcb *tp;
252 int errno;
253 {
254 struct socket *so = tp->t_inpcb->inp_socket;
255
256 if (TCPS_HAVERCVDSYN(tp->t_state)) {
257 tp->t_state = TCPS_CLOSED;
258 (void) tcp_output(tp);
259 tcpstat.tcps_drops++;
260 } else
261 tcpstat.tcps_conndrops++;
262 if (errno == ETIMEDOUT && tp->t_softerror)
263 errno = tp->t_softerror;
264 so->so_error = errno;
265 return (tcp_close(tp));
266 }
267
268 /*
269 * Close a TCP control block:
270 * discard all space held by the tcp
271 * discard internet protocol block
272 * wake up any sleepers
273 */
274 struct tcpcb *
275 tcp_close(tp)
276 register struct tcpcb *tp;
277 {
278 register struct ipqent *qe;
279 struct inpcb *inp = tp->t_inpcb;
280 struct socket *so = inp->inp_socket;
281 #ifdef RTV_RTT
282 register struct rtentry *rt;
283
284 /*
285 * If we sent enough data to get some meaningful characteristics,
286 * save them in the routing entry. 'Enough' is arbitrarily
287 * defined as the sendpipesize (default 4K) * 16. This would
288 * give us 16 rtt samples assuming we only get one sample per
289 * window (the usual case on a long haul net). 16 samples is
290 * enough for the srtt filter to converge to within 5% of the correct
291 * value; fewer samples and we could save a very bogus rtt.
292 *
293 * Don't update the default route's characteristics and don't
294 * update anything that the user "locked".
295 */
296 if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) &&
297 (rt = inp->inp_route.ro_rt) &&
298 !in_nullhost(satosin(rt_key(rt))->sin_addr)) {
299 register u_long i = 0;
300
301 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
302 i = tp->t_srtt *
303 ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
304 if (rt->rt_rmx.rmx_rtt && i)
305 /*
306 * filter this update to half the old & half
307 * the new values, converting scale.
308 * See route.h and tcp_var.h for a
309 * description of the scaling constants.
310 */
311 rt->rt_rmx.rmx_rtt =
312 (rt->rt_rmx.rmx_rtt + i) / 2;
313 else
314 rt->rt_rmx.rmx_rtt = i;
315 }
316 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
317 i = tp->t_rttvar *
318 ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2));
319 if (rt->rt_rmx.rmx_rttvar && i)
320 rt->rt_rmx.rmx_rttvar =
321 (rt->rt_rmx.rmx_rttvar + i) / 2;
322 else
323 rt->rt_rmx.rmx_rttvar = i;
324 }
325 /*
326 * update the pipelimit (ssthresh) if it has been updated
327 * already or if a pipesize was specified & the threshhold
328 * got below half the pipesize. I.e., wait for bad news
329 * before we start updating, then update on both good
330 * and bad news.
331 */
332 if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
333 (i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh) ||
334 i < (rt->rt_rmx.rmx_sendpipe / 2)) {
335 /*
336 * convert the limit from user data bytes to
337 * packets then to packet data bytes.
338 */
339 i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
340 if (i < 2)
341 i = 2;
342 i *= (u_long)(tp->t_maxseg + sizeof (struct tcpiphdr));
343 if (rt->rt_rmx.rmx_ssthresh)
344 rt->rt_rmx.rmx_ssthresh =
345 (rt->rt_rmx.rmx_ssthresh + i) / 2;
346 else
347 rt->rt_rmx.rmx_ssthresh = i;
348 }
349 }
350 #endif /* RTV_RTT */
351 /* free the reassembly queue, if any */
352 while ((qe = tp->segq.lh_first) != NULL) {
353 LIST_REMOVE(qe, ipqe_q);
354 m_freem(qe->ipqe_m);
355 FREE(qe, M_IPQ);
356 }
357 if (tp->t_template)
358 FREE(tp->t_template, M_MBUF);
359 free(tp, M_PCB);
360 inp->inp_ppcb = 0;
361 soisdisconnected(so);
362 in_pcbdetach(inp);
363 tcpstat.tcps_closed++;
364 return ((struct tcpcb *)0);
365 }
366
/*
 * TCP drain routine.  Currently does nothing.
 */
void
tcp_drain()
{
	/* nothing to release */
}
372
373 /*
374 * Notify a tcp user of an asynchronous error;
375 * store error as soft error, but wake up user
376 * (for now, won't do anything until can select for soft error).
377 */
378 void
379 tcp_notify(inp, error)
380 struct inpcb *inp;
381 int error;
382 {
383 register struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
384 register struct socket *so = inp->inp_socket;
385
386 /*
387 * Ignore some errors if we are hooked up.
388 * If connection hasn't completed, has retransmitted several times,
389 * and receives a second error, give up now. This is better
390 * than waiting a long time to establish a connection that
391 * can never complete.
392 */
393 if (tp->t_state == TCPS_ESTABLISHED &&
394 (error == EHOSTUNREACH || error == ENETUNREACH ||
395 error == EHOSTDOWN)) {
396 return;
397 } else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
398 tp->t_rxtshift > 3 && tp->t_softerror)
399 so->so_error = error;
400 else
401 tp->t_softerror = error;
402 wakeup((caddr_t) &so->so_timeo);
403 sorwakeup(so);
404 sowwakeup(so);
405 }
406
407 void *
408 tcp_ctlinput(cmd, sa, v)
409 int cmd;
410 struct sockaddr *sa;
411 register void *v;
412 {
413 register struct ip *ip = v;
414 register struct tcphdr *th;
415 extern int inetctlerrmap[];
416 void (*notify) __P((struct inpcb *, int)) = tcp_notify;
417 int errno;
418 int nmatch;
419
420 if ((unsigned)cmd >= PRC_NCMDS)
421 return NULL;
422 errno = inetctlerrmap[cmd];
423 if (cmd == PRC_QUENCH)
424 notify = tcp_quench;
425 else if (PRC_IS_REDIRECT(cmd))
426 notify = in_rtchange, ip = 0;
427 else if (cmd == PRC_HOSTDEAD)
428 ip = 0;
429 else if (errno == 0)
430 return NULL;
431 if (ip) {
432 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
433 nmatch = in_pcbnotify(&tcbtable, satosin(sa)->sin_addr,
434 th->th_dport, ip->ip_src, th->th_sport, errno, notify);
435 if (nmatch == 0 && syn_cache_count &&
436 (inetctlerrmap[cmd] == EHOSTUNREACH ||
437 inetctlerrmap[cmd] == ENETUNREACH ||
438 inetctlerrmap[cmd] == EHOSTDOWN))
439 syn_cache_unreach(ip, th);
440 } else
441 (void)in_pcbnotifyall(&tcbtable, satosin(sa)->sin_addr, errno,
442 notify);
443 return NULL;
444 }
445
446 /*
447 * When a source quench is received, close congestion window
448 * to one segment. We will gradually open it again as we proceed.
449 */
450 void
451 tcp_quench(inp, errno)
452 struct inpcb *inp;
453 int errno;
454 {
455 struct tcpcb *tp = intotcpcb(inp);
456
457 if (tp)
458 tp->snd_cwnd = tp->t_maxseg;
459 }
460