/*	$NetBSD: uipc_socket.c,v 1.56.2.3 2002/01/10 20:00:14 thorpej Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.6 (Berkeley) 5/2/95
 */
37
38 #include <sys/cdefs.h>
39 __KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.56.2.3 2002/01/10 20:00:14 thorpej Exp $");
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/proc.h>
44 #include <sys/file.h>
45 #include <sys/malloc.h>
46 #include <sys/mbuf.h>
47 #include <sys/domain.h>
48 #include <sys/kernel.h>
49 #include <sys/protosw.h>
50 #include <sys/socket.h>
51 #include <sys/socketvar.h>
52 #include <sys/signalvar.h>
53 #include <sys/resourcevar.h>
54 #include <sys/pool.h>
55 #include <sys/event.h>
56
57 static void filt_sordetach(struct knote *kn);
58 static int filt_soread(struct knote *kn, long hint);
59 static void filt_sowdetach(struct knote *kn);
60 static int filt_sowrite(struct knote *kn, long hint);
61 static int filt_solisten(struct knote *kn, long hint);
62
63 static const struct filterops solisten_filtops =
64 { 1, NULL, filt_sordetach, filt_solisten };
65 const struct filterops soread_filtops =
66 { 1, NULL, filt_sordetach, filt_soread };
67 const struct filterops sowrite_filtops =
68 { 1, NULL, filt_sowdetach, filt_sowrite };
69
70 struct pool socket_pool;
71
72 extern int somaxconn; /* patchable (XXX sysctl) */
73 int somaxconn = SOMAXCONN;
74
75 void
76 soinit(void)
77 {
78
79 pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0,
80 "sockpl", 0, NULL, NULL, M_SOCKET);
81 }
82
83 /*
84 * Socket operation routines.
85 * These routines are called by the routines in
86 * sys_socket.c or from a system process, and
87 * implement the semantics of socket operations by
88 * switching out to the protocol specific routines.
89 */
90 /*ARGSUSED*/
91 int
92 socreate(int dom, struct socket **aso, int type, int proto)
93 {
94 struct proc *p;
95 struct protosw *prp;
96 struct socket *so;
97 int error, s;
98
99 p = curproc; /* XXX */
100 if (proto)
101 prp = pffindproto(dom, proto, type);
102 else
103 prp = pffindtype(dom, type);
104 if (prp == 0 || prp->pr_usrreq == 0)
105 return (EPROTONOSUPPORT);
106 if (prp->pr_type != type)
107 return (EPROTOTYPE);
108 s = splsoftnet();
109 so = pool_get(&socket_pool, PR_WAITOK);
110 memset((caddr_t)so, 0, sizeof(*so));
111 TAILQ_INIT(&so->so_q0);
112 TAILQ_INIT(&so->so_q);
113 so->so_type = type;
114 so->so_proto = prp;
115 so->so_send = sosend;
116 so->so_receive = soreceive;
117 if (p != 0)
118 so->so_uid = p->p_ucred->cr_uid;
119 error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
120 (struct mbuf *)(long)proto, (struct mbuf *)0, p);
121 if (error) {
122 so->so_state |= SS_NOFDREF;
123 sofree(so);
124 splx(s);
125 return (error);
126 }
127 splx(s);
128 *aso = so;
129 return (0);
130 }
131
132 int
133 sobind(struct socket *so, struct mbuf *nam, struct proc *p)
134 {
135 int s, error;
136
137 s = splsoftnet();
138 error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0,
139 nam, (struct mbuf *)0, p);
140 splx(s);
141 return (error);
142 }
143
144 int
145 solisten(struct socket *so, int backlog)
146 {
147 int s, error;
148
149 s = splsoftnet();
150 error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0,
151 (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
152 if (error) {
153 splx(s);
154 return (error);
155 }
156 if (so->so_q.tqh_first == NULL)
157 so->so_options |= SO_ACCEPTCONN;
158 if (backlog < 0)
159 backlog = 0;
160 so->so_qlimit = min(backlog, somaxconn);
161 splx(s);
162 return (0);
163 }
164
165 void
166 sofree(struct socket *so)
167 {
168
169 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
170 return;
171 if (so->so_head) {
172 /*
173 * We must not decommission a socket that's on the accept(2)
174 * queue. If we do, then accept(2) may hang after select(2)
175 * indicated that the listening socket was ready.
176 */
177 if (!soqremque(so, 0))
178 return;
179 }
180 sbrelease(&so->so_snd);
181 sorflush(so);
182 pool_put(&socket_pool, so);
183 }
184
185 /*
186 * Close a socket on last file table reference removal.
187 * Initiate disconnect if connected.
188 * Free socket when disconnect complete.
189 */
190 int
191 soclose(struct socket *so)
192 {
193 struct socket *so2;
194 int s, error;
195
196 error = 0;
197 s = splsoftnet(); /* conservative */
198 if (so->so_options & SO_ACCEPTCONN) {
199 while ((so2 = so->so_q0.tqh_first) != 0) {
200 (void) soqremque(so2, 0);
201 (void) soabort(so2);
202 }
203 while ((so2 = so->so_q.tqh_first) != 0) {
204 (void) soqremque(so2, 1);
205 (void) soabort(so2);
206 }
207 }
208 if (so->so_pcb == 0)
209 goto discard;
210 if (so->so_state & SS_ISCONNECTED) {
211 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
212 error = sodisconnect(so);
213 if (error)
214 goto drop;
215 }
216 if (so->so_options & SO_LINGER) {
217 if ((so->so_state & SS_ISDISCONNECTING) &&
218 (so->so_state & SS_NBIO))
219 goto drop;
220 while (so->so_state & SS_ISCONNECTED) {
221 error = tsleep((caddr_t)&so->so_timeo,
222 PSOCK | PCATCH, netcls,
223 so->so_linger * hz);
224 if (error)
225 break;
226 }
227 }
228 }
229 drop:
230 if (so->so_pcb) {
231 int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
232 (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
233 (struct proc *)0);
234 if (error == 0)
235 error = error2;
236 }
237 discard:
238 if (so->so_state & SS_NOFDREF)
239 panic("soclose: NOFDREF");
240 so->so_state |= SS_NOFDREF;
241 sofree(so);
242 splx(s);
243 return (error);
244 }
245
246 /*
247 * Must be called at splsoftnet...
248 */
249 int
250 soabort(struct socket *so)
251 {
252
253 return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0,
254 (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
255 }
256
257 int
258 soaccept(struct socket *so, struct mbuf *nam)
259 {
260 int s, error;
261
262 error = 0;
263 s = splsoftnet();
264 if ((so->so_state & SS_NOFDREF) == 0)
265 panic("soaccept: !NOFDREF");
266 so->so_state &= ~SS_NOFDREF;
267 if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
268 (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
269 error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
270 (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0);
271 else
272 error = ECONNABORTED;
273
274 splx(s);
275 return (error);
276 }
277
278 int
279 soconnect(struct socket *so, struct mbuf *nam)
280 {
281 struct proc *p;
282 int s, error;
283
284 p = curproc; /* XXX */
285 if (so->so_options & SO_ACCEPTCONN)
286 return (EOPNOTSUPP);
287 s = splsoftnet();
288 /*
289 * If protocol is connection-based, can only connect once.
290 * Otherwise, if connected, try to disconnect first.
291 * This allows user to disconnect by connecting to, e.g.,
292 * a null address.
293 */
294 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
295 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
296 (error = sodisconnect(so))))
297 error = EISCONN;
298 else
299 error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
300 (struct mbuf *)0, nam, (struct mbuf *)0, p);
301 splx(s);
302 return (error);
303 }
304
305 int
306 soconnect2(struct socket *so1, struct socket *so2)
307 {
308 int s, error;
309
310 s = splsoftnet();
311 error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
312 (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0,
313 (struct proc *)0);
314 splx(s);
315 return (error);
316 }
317
318 int
319 sodisconnect(struct socket *so)
320 {
321 int s, error;
322
323 s = splsoftnet();
324 if ((so->so_state & SS_ISCONNECTED) == 0) {
325 error = ENOTCONN;
326 goto bad;
327 }
328 if (so->so_state & SS_ISDISCONNECTING) {
329 error = EALREADY;
330 goto bad;
331 }
332 error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
333 (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
334 (struct proc *)0);
335 bad:
336 splx(s);
337 return (error);
338 }
339
340 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
341 /*
342 * Send on a socket.
343 * If send must go all at once and message is larger than
344 * send buffering, then hard error.
345 * Lock against other senders.
346 * If must go all at once and not enough room now, then
347 * inform user that this would block and do nothing.
348 * Otherwise, if nonblocking, send as much as possible.
349 * The data to be sent is described by "uio" if nonzero,
350 * otherwise by the mbuf chain "top" (which must be null
351 * if uio is not). Data provided in mbuf chain must be small
352 * enough to send all at once.
353 *
354 * Returns nonzero on error, timeout or signal; callers
355 * must check for short counts if EINTR/ERESTART are returned.
356 * Data and control buffers are freed on return.
357 */
358 int
359 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
360 struct mbuf *control, int flags)
361 {
362 struct proc *p;
363 struct mbuf **mp, *m;
364 long space, len, resid, clen, mlen;
365 int error, s, dontroute, atomic;
366
367 p = curproc; /* XXX */
368 clen = 0;
369 atomic = sosendallatonce(so) || top;
370 if (uio)
371 resid = uio->uio_resid;
372 else
373 resid = top->m_pkthdr.len;
374 /*
375 * In theory resid should be unsigned.
376 * However, space must be signed, as it might be less than 0
377 * if we over-committed, and we must use a signed comparison
378 * of space and resid. On the other hand, a negative resid
379 * causes us to loop sending 0-length segments to the protocol.
380 */
381 if (resid < 0) {
382 error = EINVAL;
383 goto out;
384 }
385 dontroute =
386 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
387 (so->so_proto->pr_flags & PR_ATOMIC);
388 p->p_stats->p_ru.ru_msgsnd++;
389 if (control)
390 clen = control->m_len;
391 #define snderr(errno) { error = errno; splx(s); goto release; }
392
393 restart:
394 if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
395 goto out;
396 do {
397 s = splsoftnet();
398 if (so->so_state & SS_CANTSENDMORE)
399 snderr(EPIPE);
400 if (so->so_error) {
401 error = so->so_error;
402 so->so_error = 0;
403 splx(s);
404 goto release;
405 }
406 if ((so->so_state & SS_ISCONNECTED) == 0) {
407 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
408 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
409 !(resid == 0 && clen != 0))
410 snderr(ENOTCONN);
411 } else if (addr == 0)
412 snderr(EDESTADDRREQ);
413 }
414 space = sbspace(&so->so_snd);
415 if (flags & MSG_OOB)
416 space += 1024;
417 if ((atomic && resid > so->so_snd.sb_hiwat) ||
418 clen > so->so_snd.sb_hiwat)
419 snderr(EMSGSIZE);
420 if (space < resid + clen && uio &&
421 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
422 if (so->so_state & SS_NBIO)
423 snderr(EWOULDBLOCK);
424 sbunlock(&so->so_snd);
425 error = sbwait(&so->so_snd);
426 splx(s);
427 if (error)
428 goto out;
429 goto restart;
430 }
431 splx(s);
432 mp = ⊤
433 space -= clen;
434 do {
435 if (uio == NULL) {
436 /*
437 * Data is prepackaged in "top".
438 */
439 resid = 0;
440 if (flags & MSG_EOR)
441 top->m_flags |= M_EOR;
442 } else do {
443 if (top == 0) {
444 MGETHDR(m, M_WAIT, MT_DATA);
445 mlen = MHLEN;
446 m->m_pkthdr.len = 0;
447 m->m_pkthdr.rcvif = (struct ifnet *)0;
448 } else {
449 MGET(m, M_WAIT, MT_DATA);
450 mlen = MLEN;
451 }
452 if (resid >= MINCLSIZE && space >= MCLBYTES) {
453 MCLGET(m, M_WAIT);
454 if ((m->m_flags & M_EXT) == 0)
455 goto nopages;
456 mlen = MCLBYTES;
457 #ifdef MAPPED_MBUFS
458 len = lmin(MCLBYTES, resid);
459 #else
460 if (atomic && top == 0) {
461 len = lmin(MCLBYTES - max_hdr,
462 resid);
463 m->m_data += max_hdr;
464 } else
465 len = lmin(MCLBYTES, resid);
466 #endif
467 space -= len;
468 } else {
469 nopages:
470 len = lmin(lmin(mlen, resid), space);
471 space -= len;
472 /*
473 * For datagram protocols, leave room
474 * for protocol headers in first mbuf.
475 */
476 if (atomic && top == 0 && len < mlen)
477 MH_ALIGN(m, len);
478 }
479 error = uiomove(mtod(m, caddr_t), (int)len,
480 uio);
481 resid = uio->uio_resid;
482 m->m_len = len;
483 *mp = m;
484 top->m_pkthdr.len += len;
485 if (error)
486 goto release;
487 mp = &m->m_next;
488 if (resid <= 0) {
489 if (flags & MSG_EOR)
490 top->m_flags |= M_EOR;
491 break;
492 }
493 } while (space > 0 && atomic);
494
495 s = splsoftnet();
496
497 if (so->so_state & SS_CANTSENDMORE)
498 snderr(EPIPE);
499
500 if (dontroute)
501 so->so_options |= SO_DONTROUTE;
502 if (resid > 0)
503 so->so_state |= SS_MORETOCOME;
504 error = (*so->so_proto->pr_usrreq)(so,
505 (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
506 top, addr, control, p);
507 if (dontroute)
508 so->so_options &= ~SO_DONTROUTE;
509 if (resid > 0)
510 so->so_state &= ~SS_MORETOCOME;
511 splx(s);
512
513 clen = 0;
514 control = 0;
515 top = 0;
516 mp = ⊤
517 if (error)
518 goto release;
519 } while (resid && space > 0);
520 } while (resid);
521
522 release:
523 sbunlock(&so->so_snd);
524 out:
525 if (top)
526 m_freem(top);
527 if (control)
528 m_freem(control);
529 return (error);
530 }
531
532 /*
533 * Implement receive operations on a socket.
534 * We depend on the way that records are added to the sockbuf
535 * by sbappend*. In particular, each record (mbufs linked through m_next)
536 * must begin with an address if the protocol so specifies,
537 * followed by an optional mbuf or mbufs containing ancillary data,
538 * and then zero or more mbufs of data.
539 * In order to avoid blocking network interrupts for the entire time here,
540 * we splx() while doing the actual copy to user space.
541 * Although the sockbuf is locked, new data may still be appended,
542 * and thus we must maintain consistency of the sockbuf during that time.
543 *
544 * The caller may receive the data as a single mbuf chain by supplying
545 * an mbuf **mp0 for use in returning the chain. The uio is then used
546 * only for the count in uio_resid.
547 */
548 int
549 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
550 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
551 {
552 struct mbuf *m, **mp;
553 int flags, len, error, s, offset, moff, type, orig_resid;
554 struct protosw *pr;
555 struct mbuf *nextrecord;
556
557 pr = so->so_proto;
558 mp = mp0;
559 type = 0;
560 orig_resid = uio->uio_resid;
561 if (paddr)
562 *paddr = 0;
563 if (controlp)
564 *controlp = 0;
565 if (flagsp)
566 flags = *flagsp &~ MSG_EOR;
567 else
568 flags = 0;
569 if (flags & MSG_OOB) {
570 m = m_get(M_WAIT, MT_DATA);
571 error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
572 (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0,
573 (struct proc *)0);
574 if (error)
575 goto bad;
576 do {
577 error = uiomove(mtod(m, caddr_t),
578 (int) min(uio->uio_resid, m->m_len), uio);
579 m = m_free(m);
580 } while (uio->uio_resid && error == 0 && m);
581 bad:
582 if (m)
583 m_freem(m);
584 return (error);
585 }
586 if (mp)
587 *mp = (struct mbuf *)0;
588 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
589 (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
590 (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
591
592 restart:
593 if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
594 return (error);
595 s = splsoftnet();
596
597 m = so->so_rcv.sb_mb;
598 /*
599 * If we have less data than requested, block awaiting more
600 * (subject to any timeout) if:
601 * 1. the current count is less than the low water mark,
602 * 2. MSG_WAITALL is set, and it is possible to do the entire
603 * receive operation at once if we block (resid <= hiwat), or
604 * 3. MSG_DONTWAIT is not set.
605 * If MSG_WAITALL is set but resid is larger than the receive buffer,
606 * we have to do the receive in sections, and thus risk returning
607 * a short count if a timeout or signal occurs after we start.
608 */
609 if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
610 so->so_rcv.sb_cc < uio->uio_resid) &&
611 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
612 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
613 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
614 #ifdef DIAGNOSTIC
615 if (m == 0 && so->so_rcv.sb_cc)
616 panic("receive 1");
617 #endif
618 if (so->so_error) {
619 if (m)
620 goto dontblock;
621 error = so->so_error;
622 if ((flags & MSG_PEEK) == 0)
623 so->so_error = 0;
624 goto release;
625 }
626 if (so->so_state & SS_CANTRCVMORE) {
627 if (m)
628 goto dontblock;
629 else
630 goto release;
631 }
632 for (; m; m = m->m_next)
633 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
634 m = so->so_rcv.sb_mb;
635 goto dontblock;
636 }
637 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
638 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
639 error = ENOTCONN;
640 goto release;
641 }
642 if (uio->uio_resid == 0)
643 goto release;
644 if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
645 error = EWOULDBLOCK;
646 goto release;
647 }
648 sbunlock(&so->so_rcv);
649 error = sbwait(&so->so_rcv);
650 splx(s);
651 if (error)
652 return (error);
653 goto restart;
654 }
655 dontblock:
656 #ifdef notyet /* XXXX */
657 if (uio->uio_procp)
658 uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
659 #endif
660 nextrecord = m->m_nextpkt;
661 if (pr->pr_flags & PR_ADDR) {
662 #ifdef DIAGNOSTIC
663 if (m->m_type != MT_SONAME)
664 panic("receive 1a");
665 #endif
666 orig_resid = 0;
667 if (flags & MSG_PEEK) {
668 if (paddr)
669 *paddr = m_copy(m, 0, m->m_len);
670 m = m->m_next;
671 } else {
672 sbfree(&so->so_rcv, m);
673 if (paddr) {
674 *paddr = m;
675 so->so_rcv.sb_mb = m->m_next;
676 m->m_next = 0;
677 m = so->so_rcv.sb_mb;
678 } else {
679 MFREE(m, so->so_rcv.sb_mb);
680 m = so->so_rcv.sb_mb;
681 }
682 }
683 }
684 while (m && m->m_type == MT_CONTROL && error == 0) {
685 if (flags & MSG_PEEK) {
686 if (controlp)
687 *controlp = m_copy(m, 0, m->m_len);
688 m = m->m_next;
689 } else {
690 sbfree(&so->so_rcv, m);
691 if (controlp) {
692 if (pr->pr_domain->dom_externalize &&
693 mtod(m, struct cmsghdr *)->cmsg_type ==
694 SCM_RIGHTS)
695 error = (*pr->pr_domain->dom_externalize)(m);
696 *controlp = m;
697 so->so_rcv.sb_mb = m->m_next;
698 m->m_next = 0;
699 m = so->so_rcv.sb_mb;
700 } else {
701 MFREE(m, so->so_rcv.sb_mb);
702 m = so->so_rcv.sb_mb;
703 }
704 }
705 if (controlp) {
706 orig_resid = 0;
707 controlp = &(*controlp)->m_next;
708 }
709 }
710 if (m) {
711 if ((flags & MSG_PEEK) == 0)
712 m->m_nextpkt = nextrecord;
713 type = m->m_type;
714 if (type == MT_OOBDATA)
715 flags |= MSG_OOB;
716 }
717 moff = 0;
718 offset = 0;
719 while (m && uio->uio_resid > 0 && error == 0) {
720 if (m->m_type == MT_OOBDATA) {
721 if (type != MT_OOBDATA)
722 break;
723 } else if (type == MT_OOBDATA)
724 break;
725 #ifdef DIAGNOSTIC
726 else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
727 panic("receive 3");
728 #endif
729 so->so_state &= ~SS_RCVATMARK;
730 len = uio->uio_resid;
731 if (so->so_oobmark && len > so->so_oobmark - offset)
732 len = so->so_oobmark - offset;
733 if (len > m->m_len - moff)
734 len = m->m_len - moff;
735 /*
736 * If mp is set, just pass back the mbufs.
737 * Otherwise copy them out via the uio, then free.
738 * Sockbuf must be consistent here (points to current mbuf,
739 * it points to next record) when we drop priority;
740 * we must note any additions to the sockbuf when we
741 * block interrupts again.
742 */
743 if (mp == 0) {
744 splx(s);
745 error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
746 s = splsoftnet();
747 if (error)
748 goto release;
749 } else
750 uio->uio_resid -= len;
751 if (len == m->m_len - moff) {
752 if (m->m_flags & M_EOR)
753 flags |= MSG_EOR;
754 if (flags & MSG_PEEK) {
755 m = m->m_next;
756 moff = 0;
757 } else {
758 nextrecord = m->m_nextpkt;
759 sbfree(&so->so_rcv, m);
760 if (mp) {
761 *mp = m;
762 mp = &m->m_next;
763 so->so_rcv.sb_mb = m = m->m_next;
764 *mp = (struct mbuf *)0;
765 } else {
766 MFREE(m, so->so_rcv.sb_mb);
767 m = so->so_rcv.sb_mb;
768 }
769 if (m)
770 m->m_nextpkt = nextrecord;
771 }
772 } else {
773 if (flags & MSG_PEEK)
774 moff += len;
775 else {
776 if (mp)
777 *mp = m_copym(m, 0, len, M_WAIT);
778 m->m_data += len;
779 m->m_len -= len;
780 so->so_rcv.sb_cc -= len;
781 }
782 }
783 if (so->so_oobmark) {
784 if ((flags & MSG_PEEK) == 0) {
785 so->so_oobmark -= len;
786 if (so->so_oobmark == 0) {
787 so->so_state |= SS_RCVATMARK;
788 break;
789 }
790 } else {
791 offset += len;
792 if (offset == so->so_oobmark)
793 break;
794 }
795 }
796 if (flags & MSG_EOR)
797 break;
798 /*
799 * If the MSG_WAITALL flag is set (for non-atomic socket),
800 * we must not quit until "uio->uio_resid == 0" or an error
801 * termination. If a signal/timeout occurs, return
802 * with a short count but without error.
803 * Keep sockbuf locked against other readers.
804 */
805 while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
806 !sosendallatonce(so) && !nextrecord) {
807 if (so->so_error || so->so_state & SS_CANTRCVMORE)
808 break;
809 error = sbwait(&so->so_rcv);
810 if (error) {
811 sbunlock(&so->so_rcv);
812 splx(s);
813 return (0);
814 }
815 if ((m = so->so_rcv.sb_mb) != NULL)
816 nextrecord = m->m_nextpkt;
817 }
818 }
819
820 if (m && pr->pr_flags & PR_ATOMIC) {
821 flags |= MSG_TRUNC;
822 if ((flags & MSG_PEEK) == 0)
823 (void) sbdroprecord(&so->so_rcv);
824 }
825 if ((flags & MSG_PEEK) == 0) {
826 if (m == 0)
827 so->so_rcv.sb_mb = nextrecord;
828 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
829 (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
830 (struct mbuf *)(long)flags, (struct mbuf *)0,
831 (struct proc *)0);
832 }
833 if (orig_resid == uio->uio_resid && orig_resid &&
834 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
835 sbunlock(&so->so_rcv);
836 splx(s);
837 goto restart;
838 }
839
840 if (flagsp)
841 *flagsp |= flags;
842 release:
843 sbunlock(&so->so_rcv);
844 splx(s);
845 return (error);
846 }
847
848 int
849 soshutdown(struct socket *so, int how)
850 {
851 struct protosw *pr;
852
853 pr = so->so_proto;
854 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
855 return (EINVAL);
856
857 if (how == SHUT_RD || how == SHUT_RDWR)
858 sorflush(so);
859 if (how == SHUT_WR || how == SHUT_RDWR)
860 return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0,
861 (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
862 return (0);
863 }
864
865 void
866 sorflush(struct socket *so)
867 {
868 struct sockbuf *sb, asb;
869 struct protosw *pr;
870 int s;
871
872 sb = &so->so_rcv;
873 pr = so->so_proto;
874 sb->sb_flags |= SB_NOINTR;
875 (void) sblock(sb, M_WAITOK);
876 s = splnet();
877 socantrcvmore(so);
878 sbunlock(sb);
879 asb = *sb;
880 memset((caddr_t)sb, 0, sizeof(*sb));
881 splx(s);
882 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
883 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
884 sbrelease(&asb);
885 }
886
887 int
888 sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
889 {
890 int error;
891 struct mbuf *m;
892
893 error = 0;
894 m = m0;
895 if (level != SOL_SOCKET) {
896 if (so->so_proto && so->so_proto->pr_ctloutput)
897 return ((*so->so_proto->pr_ctloutput)
898 (PRCO_SETOPT, so, level, optname, &m0));
899 error = ENOPROTOOPT;
900 } else {
901 switch (optname) {
902
903 case SO_LINGER:
904 if (m == NULL || m->m_len != sizeof(struct linger)) {
905 error = EINVAL;
906 goto bad;
907 }
908 so->so_linger = mtod(m, struct linger *)->l_linger;
909 /* fall thru... */
910
911 case SO_DEBUG:
912 case SO_KEEPALIVE:
913 case SO_DONTROUTE:
914 case SO_USELOOPBACK:
915 case SO_BROADCAST:
916 case SO_REUSEADDR:
917 case SO_REUSEPORT:
918 case SO_OOBINLINE:
919 case SO_TIMESTAMP:
920 if (m == NULL || m->m_len < sizeof(int)) {
921 error = EINVAL;
922 goto bad;
923 }
924 if (*mtod(m, int *))
925 so->so_options |= optname;
926 else
927 so->so_options &= ~optname;
928 break;
929
930 case SO_SNDBUF:
931 case SO_RCVBUF:
932 case SO_SNDLOWAT:
933 case SO_RCVLOWAT:
934 {
935 int optval;
936
937 if (m == NULL || m->m_len < sizeof(int)) {
938 error = EINVAL;
939 goto bad;
940 }
941
942 /*
943 * Values < 1 make no sense for any of these
944 * options, so disallow them.
945 */
946 optval = *mtod(m, int *);
947 if (optval < 1) {
948 error = EINVAL;
949 goto bad;
950 }
951
952 switch (optname) {
953
954 case SO_SNDBUF:
955 case SO_RCVBUF:
956 if (sbreserve(optname == SO_SNDBUF ?
957 &so->so_snd : &so->so_rcv,
958 (u_long) optval) == 0) {
959 error = ENOBUFS;
960 goto bad;
961 }
962 break;
963
964 /*
965 * Make sure the low-water is never greater than
966 * the high-water.
967 */
968 case SO_SNDLOWAT:
969 so->so_snd.sb_lowat =
970 (optval > so->so_snd.sb_hiwat) ?
971 so->so_snd.sb_hiwat : optval;
972 break;
973 case SO_RCVLOWAT:
974 so->so_rcv.sb_lowat =
975 (optval > so->so_rcv.sb_hiwat) ?
976 so->so_rcv.sb_hiwat : optval;
977 break;
978 }
979 break;
980 }
981
982 case SO_SNDTIMEO:
983 case SO_RCVTIMEO:
984 {
985 struct timeval *tv;
986 short val;
987
988 if (m == NULL || m->m_len < sizeof(*tv)) {
989 error = EINVAL;
990 goto bad;
991 }
992 tv = mtod(m, struct timeval *);
993 if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) {
994 error = EDOM;
995 goto bad;
996 }
997 val = tv->tv_sec * hz + tv->tv_usec / tick;
998
999 switch (optname) {
1000
1001 case SO_SNDTIMEO:
1002 so->so_snd.sb_timeo = val;
1003 break;
1004 case SO_RCVTIMEO:
1005 so->so_rcv.sb_timeo = val;
1006 break;
1007 }
1008 break;
1009 }
1010
1011 default:
1012 error = ENOPROTOOPT;
1013 break;
1014 }
1015 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
1016 (void) ((*so->so_proto->pr_ctloutput)
1017 (PRCO_SETOPT, so, level, optname, &m0));
1018 m = NULL; /* freed by protocol */
1019 }
1020 }
1021 bad:
1022 if (m)
1023 (void) m_free(m);
1024 return (error);
1025 }
1026
1027 int
1028 sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
1029 {
1030 struct mbuf *m;
1031
1032 if (level != SOL_SOCKET) {
1033 if (so->so_proto && so->so_proto->pr_ctloutput) {
1034 return ((*so->so_proto->pr_ctloutput)
1035 (PRCO_GETOPT, so, level, optname, mp));
1036 } else
1037 return (ENOPROTOOPT);
1038 } else {
1039 m = m_get(M_WAIT, MT_SOOPTS);
1040 m->m_len = sizeof(int);
1041
1042 switch (optname) {
1043
1044 case SO_LINGER:
1045 m->m_len = sizeof(struct linger);
1046 mtod(m, struct linger *)->l_onoff =
1047 so->so_options & SO_LINGER;
1048 mtod(m, struct linger *)->l_linger = so->so_linger;
1049 break;
1050
1051 case SO_USELOOPBACK:
1052 case SO_DONTROUTE:
1053 case SO_DEBUG:
1054 case SO_KEEPALIVE:
1055 case SO_REUSEADDR:
1056 case SO_REUSEPORT:
1057 case SO_BROADCAST:
1058 case SO_OOBINLINE:
1059 case SO_TIMESTAMP:
1060 *mtod(m, int *) = so->so_options & optname;
1061 break;
1062
1063 case SO_TYPE:
1064 *mtod(m, int *) = so->so_type;
1065 break;
1066
1067 case SO_ERROR:
1068 *mtod(m, int *) = so->so_error;
1069 so->so_error = 0;
1070 break;
1071
1072 case SO_SNDBUF:
1073 *mtod(m, int *) = so->so_snd.sb_hiwat;
1074 break;
1075
1076 case SO_RCVBUF:
1077 *mtod(m, int *) = so->so_rcv.sb_hiwat;
1078 break;
1079
1080 case SO_SNDLOWAT:
1081 *mtod(m, int *) = so->so_snd.sb_lowat;
1082 break;
1083
1084 case SO_RCVLOWAT:
1085 *mtod(m, int *) = so->so_rcv.sb_lowat;
1086 break;
1087
1088 case SO_SNDTIMEO:
1089 case SO_RCVTIMEO:
1090 {
1091 int val = (optname == SO_SNDTIMEO ?
1092 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1093
1094 m->m_len = sizeof(struct timeval);
1095 mtod(m, struct timeval *)->tv_sec = val / hz;
1096 mtod(m, struct timeval *)->tv_usec =
1097 (val % hz) * tick;
1098 break;
1099 }
1100
1101 default:
1102 (void)m_free(m);
1103 return (ENOPROTOOPT);
1104 }
1105 *mp = m;
1106 return (0);
1107 }
1108 }
1109
1110 void
1111 sohasoutofband(struct socket *so)
1112 {
1113 struct proc *p;
1114
1115 if (so->so_pgid < 0)
1116 gsignal(-so->so_pgid, SIGURG);
1117 else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1118 psignal(p, SIGURG);
1119 selwakeup(&so->so_rcv.sb_sel);
1120 }
1121
1122
1123 int
1124 soo_kqfilter(struct file *fp, struct knote *kn)
1125 {
1126 struct socket *so;
1127 struct sockbuf *sb;
1128 int s;
1129
1130 so = (struct socket *)kn->kn_fp->f_data;
1131 switch (kn->kn_filter) {
1132 case EVFILT_READ:
1133 if (so->so_options & SO_ACCEPTCONN)
1134 kn->kn_fop = &solisten_filtops;
1135 else
1136 kn->kn_fop = &soread_filtops;
1137 sb = &so->so_rcv;
1138 break;
1139 case EVFILT_WRITE:
1140 kn->kn_fop = &sowrite_filtops;
1141 sb = &so->so_snd;
1142 break;
1143 default:
1144 return (1);
1145 }
1146 s = splnet(); /* XXXLUKEM: maybe splsoftnet() ? */
1147 SLIST_INSERT_HEAD(&sb->sb_sel.si_klist, kn, kn_selnext);
1148 sb->sb_flags |= SB_KNOTE;
1149 splx(s);
1150 return (0);
1151 }
1152
1153 static void
1154 filt_sordetach(struct knote *kn)
1155 {
1156 struct socket *so;
1157 int s;
1158
1159 so = (struct socket *)kn->kn_fp->f_data;
1160 s = splnet(); /* XXXLUKEM: maybe splsoftnet() ? */
1161 SLIST_REMOVE(&so->so_rcv.sb_sel.si_klist, kn, knote, kn_selnext);
1162 if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_klist))
1163 so->so_rcv.sb_flags &= ~SB_KNOTE;
1164 splx(s);
1165 }
1166
1167 /*ARGSUSED*/
1168 static int
1169 filt_soread(struct knote *kn, long hint)
1170 {
1171 struct socket *so;
1172
1173 so = (struct socket *)kn->kn_fp->f_data;
1174 kn->kn_data = so->so_rcv.sb_cc;
1175 if (so->so_state & SS_CANTRCVMORE) {
1176 kn->kn_flags |= EV_EOF;
1177 kn->kn_fflags = so->so_error;
1178 return (1);
1179 }
1180 if (so->so_error) /* temporary udp error */
1181 return (1);
1182 if (kn->kn_sfflags & NOTE_LOWAT)
1183 return (kn->kn_data >= kn->kn_sdata);
1184 return (kn->kn_data >= so->so_rcv.sb_lowat);
1185 }
1186
1187 static void
1188 filt_sowdetach(struct knote *kn)
1189 {
1190 struct socket *so;
1191 int s;
1192
1193 so = (struct socket *)kn->kn_fp->f_data;
1194 s = splnet(); /* XXXLUKEM: maybe splsoftnet() ? */
1195 SLIST_REMOVE(&so->so_snd.sb_sel.si_klist, kn, knote, kn_selnext);
1196 if (SLIST_EMPTY(&so->so_snd.sb_sel.si_klist))
1197 so->so_snd.sb_flags &= ~SB_KNOTE;
1198 splx(s);
1199 }
1200
1201 /*ARGSUSED*/
1202 static int
1203 filt_sowrite(struct knote *kn, long hint)
1204 {
1205 struct socket *so;
1206
1207 so = (struct socket *)kn->kn_fp->f_data;
1208 kn->kn_data = sbspace(&so->so_snd);
1209 if (so->so_state & SS_CANTSENDMORE) {
1210 kn->kn_flags |= EV_EOF;
1211 kn->kn_fflags = so->so_error;
1212 return (1);
1213 }
1214 if (so->so_error) /* temporary udp error */
1215 return (1);
1216 if (((so->so_state & SS_ISCONNECTED) == 0) &&
1217 (so->so_proto->pr_flags & PR_CONNREQUIRED))
1218 return (0);
1219 if (kn->kn_sfflags & NOTE_LOWAT)
1220 return (kn->kn_data >= kn->kn_sdata);
1221 return (kn->kn_data >= so->so_snd.sb_lowat);
1222 }
1223
1224 /*ARGSUSED*/
1225 static int
1226 filt_solisten(struct knote *kn, long hint)
1227 {
1228 struct socket *so;
1229
1230 so = (struct socket *)kn->kn_fp->f_data;
1231 #if 0
1232 /*
1233 * XXXLUKEM: this was freebsd's code. it appears that they
1234 * XXXLUKEM: modified the socket code to store the count
1235 * XXXLUKEM: of all connections in so_qlen, and separately
1236 * XXXLUKEM: track the number of incompletes in so_incqlen.
1237 * XXXLUKEM: as we appear to keep only completed connections
1238 * XXXLUKEM: on so_qlen we can just return that.
1239 * XXXLUKEM: that said, a socket guru should double check for me :)
1240 */
1241 kn->kn_data = so->so_qlen - so->so_incqlen;
1242 return (! TAILQ_EMPTY(&so->so_comp));
1243 #else
1244 kn->kn_data = so->so_qlen;
1245 return (kn->kn_data > 0);
1246 #endif
1247 }
1248