/*	$NetBSD: uipc_socket.c,v 1.56.2.1 2001/07/10 13:48:47 lukem Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.6 (Berkeley) 5/2/95
 */

#include "opt_compat_sunos.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/pool.h>
#include <sys/event.h>

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

struct pool	socket_pool;

extern int	somaxconn;			/* patchable (XXX sysctl) */
int	somaxconn = SOMAXCONN;

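/*
 * Initialise the socket memory pool.  Intended to be called once,
 * before any sockets are created.
 */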
void
soinit(void)
{

	pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0,
	    "sockpl", 0, NULL, NULL, M_SOCKET);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
/*ARGSUSED*/
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc	*p;
	struct protosw	*prp;
	struct socket	*so;
	int		error, s;

	p = curproc;		/* XXX */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	s = splsoftnet();
	so = pool_get(&socket_pool, PR_WAITOK);
	memset((caddr_t)so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	so->so_proto = prp;
	so->so_send = sosend;
	so->so_receive = soreceive;
	if (p != 0)
		so->so_uid = p->p_ucred->cr_uid;
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
	    (struct mbuf *)(long)proto, (struct mbuf *)0, p);
	if (error) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		splx(s);
		return (error);
	}
#ifdef COMPAT_SUNOS
	{
		extern struct emul emul_sunos;
		if (p->p_emul == &emul_sunos && type == SOCK_DGRAM)
			so->so_options |= SO_BROADCAST;
	}
#endif
	splx(s);
	*aso = so;
	return (0);
}

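/*
 * Bind a socket to a local address via the protocol's PRU_BIND request.
 */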
int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int	s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0,
	    nam, (struct mbuf *)0, p);
	splx(s);
	return (error);
}

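/*
 * Mark a socket as ready to accept connections and record the
 * backlog limit, clamped to the range [0, somaxconn].
 */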
int
solisten(struct socket *so, int backlog)
{
	int	s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
	if (error) {
		splx(s);
		return (error);
	}
	if (so->so_q.tqh_first == NULL)
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0)
		backlog = 0;
	so->so_qlimit = min(backlog, somaxconn);
	splx(s);
	return (0);
}

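/*
 * Release a dead socket's resources.  Does nothing while the socket
 * still has a PCB or an outstanding file descriptor reference.
 */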
void
sofree(struct socket *so)
{

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0))
			return;
	}
	sbrelease(&so->so_snd);
	sorflush(so);
	pool_put(&socket_pool, so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	struct socket	*so2;
	int		s, error;

	error = 0;
	s = splsoftnet();		/* conservative */
	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = so->so_q0.tqh_first) != 0) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = so->so_q.tqh_first) != 0) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, netcls,
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
 drop:
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
		    (struct proc *)0);
		if (error == 0)
			error = error2;
	}
 discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}

/*
 * Must be called at splsoftnet...
 */
int
soabort(struct socket *so)
{

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
}

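/*
 * Accept a queued connection, returning the peer's address in nam.
 * The caller must hold the sole reference: SS_NOFDREF is expected
 * to be set on entry and is cleared here.
 */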
int
soaccept(struct socket *so, struct mbuf *nam)
{
	int	s, error;

	error = 0;
	s = splsoftnet();
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0);
	else
		error = ECONNABORTED;

	splx(s);
	return (error);
}

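/*
 * Initiate a connection to the address described by nam.
 */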
int
soconnect(struct socket *so, struct mbuf *nam)
{
	struct proc	*p;
	int		s, error;

	p = curproc;		/* XXX */
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splsoftnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, p);
	splx(s);
	return (error);
}

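/*
 * Connect a pair of sockets directly to each other, without any
 * address lookup (e.g. for socketpair(2)).
 */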
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int	s, error;

	s = splsoftnet();
	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0,
	    (struct proc *)0);
	splx(s);
	return (error);
}

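/*
 * Begin disconnecting a connected socket via PRU_DISCONNECT.
 */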
int
sodisconnect(struct socket *so)
{
	int	s, error;

	s = splsoftnet();
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
	    (struct proc *)0);
 bad:
	splx(s);
	return (error);
}

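/*
 * Choose the sleep behaviour for locking a socket buffer: callers
 * passing MSG_DONTWAIT must not block waiting for the lock.
 */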
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	struct proc	*p;
	struct mbuf	**mp, *m;
	long		space, len, resid;
	int		clen, error, s, dontroute, mlen, atomic;

	p = curproc;		/* XXX */
	clen = 0;
	atomic = sosendallatonce(so) || top;
	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

 restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (top == 0) {
					MGETHDR(m, M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					MCLGET(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
#ifdef	MAPPED_MBUFS
					len = min(MCLBYTES, resid);
#else
					if (atomic && top == 0) {
						len = min(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = min(MCLBYTES, resid);
#endif
					space -= len;
				} else {
 nopages:
					len = min(min(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, caddr_t), (int)len,
				    uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);

			s = splsoftnet();

			if (so->so_state & SS_CANTSENDMORE)
				snderr(EPIPE);

			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (resid > 0)
				so->so_state |= SS_MORETOCOME;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, p);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			if (resid > 0)
				so->so_state &= ~SS_MORETOCOME;
			splx(s);

			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

 release:
	sbunlock(&so->so_snd);
 out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf	*m, **mp;
	int		flags, len, error, s, offset, moff, type, orig_resid;
	struct protosw	*pr;
	struct mbuf	*nextrecord;

	pr = so->so_proto;
	mp = mp0;
	type = 0;
	orig_resid = uio->uio_resid;
	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0,
		    (struct proc *)0);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
 bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);

 restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
 dontblock:
#ifdef notyet /* XXXX */
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
#endif
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (sb_mb points to the
		 * current mbuf, whose m_nextpkt points to the next record)
		 * when we drop priority; we must note any additions to
		 * the sockbuf when we block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splsoftnet();
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
			    (struct mbuf *)(long)flags, (struct mbuf *)0,
			    (struct proc *)0);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}

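/*
 * Shut down one or both halves of a connection: discard unread data
 * for SHUT_RD, issue PRU_SHUTDOWN for SHUT_WR, or do both for
 * SHUT_RDWR.
 */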
int
soshutdown(struct socket *so, int how)
{
	struct protosw	*pr;

	pr = so->so_proto;
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how == SHUT_RD || how == SHUT_RDWR)
		sorflush(so);
	if (how == SHUT_WR || how == SHUT_RDWR)
		return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
	return (0);
}

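/*
 * Throw away a socket's unread receive data: mark the socket unable
 * to receive more, empty the receive buffer, and dispose of any
 * access rights still queued in it.
 */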
void
sorflush(struct socket *so)
{
	struct sockbuf	*sb, asb;
	struct protosw	*pr;
	int		s;

	sb = &so->so_rcv;
	pr = so->so_proto;
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splnet();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	memset((caddr_t)sb, 0, sizeof(*sb));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}

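/*
 * Set a socket option.  Levels other than SOL_SOCKET are passed
 * through to the protocol's ctloutput routine; on success the
 * protocol also gets a chance to act on SOL_SOCKET options.
 */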
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
{
	int		error;
	struct mbuf	*m;

	error = 0;
	m = m0;
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
			    (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof(struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof(*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
			    (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
 bad:
	if (m)
		(void) m_free(m);
	return (error);
}

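/*
 * Get a socket option.  SOL_SOCKET options are answered here, in an
 * mbuf allocated for the caller; other levels are passed through to
 * the protocol's ctloutput routine.
 */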
int
sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
{
	struct mbuf	*m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
			    (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof(int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof(struct linger);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			int val = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}

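/*
 * Notify a socket's owner that out-of-band data has arrived:
 * deliver SIGURG to the owning process or process group and wake
 * up any select(2)ers on the receive side.
 */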
void
sohasoutofband(struct socket *so)
{
	struct proc *p;

	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
		psignal(p, SIGURG);
	selwakeup(&so->so_rcv.sb_sel);
}

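/*
 * Attach a knote to a socket, selecting the filter implementation
 * from the event type and, for reads, whether the socket is
 * listening.
 */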
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket	*so;
	struct sockbuf	*sb;
	int		s;

	so = (struct socket *)kn->kn_fp->f_data;
	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (1);
	}
	s = splnet();		/* XXXLUKEM: maybe splsoftnet() ? */
	SLIST_INSERT_HEAD(&sb->sb_sel.si_klist, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	splx(s);
	return (0);
}

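/*
 * Detach a read knote from the receive buffer's klist.
 */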
static void
filt_sordetach(struct knote *kn)
{
	struct socket	*so;
	int		s;

	so = (struct socket *)kn->kn_fp->f_data;
	s = splnet();		/* XXXLUKEM: maybe splsoftnet() ? */
	SLIST_REMOVE(&so->so_rcv.sb_sel.si_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_klist))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	splx(s);
}

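/*
 * Read filter: true when data is buffered, EOF has been reached,
 * or an error (possibly transient, e.g. from UDP) is pending.
 */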
/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	kn->kn_data = so->so_rcv.sb_cc;
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_rcv.sb_lowat);
}

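/*
 * Detach a write knote from the send buffer's klist.
 */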
static void
filt_sowdetach(struct knote *kn)
{
	struct socket	*so;
	int		s;

	so = (struct socket *)kn->kn_fp->f_data;
	s = splnet();		/* XXXLUKEM: maybe splsoftnet() ? */
	SLIST_REMOVE(&so->so_snd.sb_sel.si_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_klist))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	splx(s);
}

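/*
 * Write filter: true when enough buffer space is available for a
 * write to proceed (per sb_lowat or the NOTE_LOWAT threshold), or
 * when the send side has been shut down or an error is pending.
 */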
/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.sb_lowat);
}

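/*
 * Listen filter: true when at least one completed connection is
 * queued waiting to be accepted.
 */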
/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket	*so;

	so = (struct socket *)kn->kn_fp->f_data;
#if 0
	/*
	 * XXXLUKEM: this was freebsd's code.  it appears that they
	 * XXXLUKEM: modified the socket code to store the count
	 * XXXLUKEM: of all connections in so_qlen, and separately
	 * XXXLUKEM: track the number of incompletes in so_incqlen.
	 * XXXLUKEM: as we appear to keep only completed connections
	 * XXXLUKEM: on so_qlen we can just return that.
	 * XXXLUKEM: that said, a socket guru should double check for me :)
	 */
	kn->kn_data = so->so_qlen - so->so_incqlen;
	return (! TAILQ_EMPTY(&so->so_comp));
#else
	kn->kn_data = so->so_qlen;
	return (kn->kn_data > 0);
#endif
}