/*	$NetBSD: uipc_socket.c,v 1.90 2003/09/22 12:59:58 christos Exp $	*/

/*-
 * Copyright (c) 2002 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.6 (Berkeley) 5/2/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.90 2003/09/22 12:59:58 christos Exp $");

#include "opt_sock_counters.h"
#include "opt_sosend_loan.h"
#include "opt_mbuftrace.h"
#include "opt_somaxkva.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/pool.h>
#include <sys/event.h>
#include <sys/poll.h>

#include <uvm/uvm.h>

struct pool	socket_pool;

MALLOC_DEFINE(M_SOOPTS, "soopts", "socket options");
MALLOC_DEFINE(M_SONAME, "soname", "socket name");

extern int	somaxconn;		/* patchable (XXX sysctl) */
int		somaxconn = SOMAXCONN;

#ifdef SOSEND_COUNTERS
#include <sys/device.h>

struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "loan big");
struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy big");
struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "copy small");
struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "sosend", "kva limit");

#define	SOSEND_COUNTER_INCR(ev)		(ev)->ev_count++

#else

#define	SOSEND_COUNTER_INCR(ev)		/* nothing */

#endif /* SOSEND_COUNTERS */

void
soinit(void)
{

	pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0,
	    "sockpl", NULL);

#ifdef SOSEND_COUNTERS
	evcnt_attach_static(&sosend_loan_big);
	evcnt_attach_static(&sosend_copy_big);
	evcnt_attach_static(&sosend_copy_small);
	evcnt_attach_static(&sosend_kvalimit);
#endif /* SOSEND_COUNTERS */
}

#ifdef SOSEND_NO_LOAN
int use_sosend_loan = 0;
#else
int use_sosend_loan = 1;
#endif

struct mbuf *so_pendfree;

#ifndef SOMAXKVA
#define	SOMAXKVA (16 * 1024 * 1024)
#endif
int somaxkva = SOMAXKVA;
int socurkva;
int sokvawaiters;

#define	SOCK_LOAN_THRESH	4096
#define	SOCK_LOAN_CHUNK		65536

static size_t sodopendfree(struct socket *);

vaddr_t
sokvaalloc(vsize_t len, struct socket *so)
{
	vaddr_t lva;
	int s;

	while (socurkva + len > somaxkva) {
		if (sodopendfree(so))
			continue;
		SOSEND_COUNTER_INCR(&sosend_kvalimit);
		s = splvm();
		sokvawaiters++;
		(void) tsleep(&socurkva, PVM, "sokva", 0);
		sokvawaiters--;
		splx(s);
	}

	lva = uvm_km_valloc_wait(kernel_map, len);
	if (lva == 0)
		return (0);
	socurkva += len;

	return lva;
}

void
sokvafree(vaddr_t sva, vsize_t len)
{

	uvm_km_free(kernel_map, sva, len);
	socurkva -= len;
	if (sokvawaiters)
		wakeup(&socurkva);
}

static void
sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size)
{
	vaddr_t va, sva, eva;
	vsize_t len;
	paddr_t pa;
	int i, npgs;

	eva = round_page((vaddr_t) buf + size);
	sva = trunc_page((vaddr_t) buf);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	if (__predict_false(pgs == NULL)) {
		pgs = alloca(npgs * sizeof(*pgs));

		for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) {
			if (pmap_extract(pmap_kernel(), va, &pa) == FALSE)
				panic("sodoloanfree: va 0x%lx not mapped", va);
			pgs[i] = PHYS_TO_VM_PAGE(pa);
		}
	}

	pmap_kremove(sva, len);
	pmap_update(pmap_kernel());
	uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
	sokvafree(sva, len);
}

static size_t
sodopendfree(struct socket *so)
{
	struct mbuf *m;
	size_t rv = 0;
	int s;

	s = splvm();

	for (;;) {
		m = so_pendfree;
		if (m == NULL)
			break;
		so_pendfree = m->m_next;
		splx(s);

		rv += m->m_ext.ext_size;
		sodoloanfree((m->m_flags & M_EXT_PAGES) ?
		    m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf,
		    m->m_ext.ext_size);
		s = splvm();
		pool_cache_put(&mbpool_cache, m);
	}

	for (;;) {
		m = so->so_pendfree;
		if (m == NULL)
			break;
		so->so_pendfree = m->m_next;
		splx(s);

		rv += m->m_ext.ext_size;
		sodoloanfree((m->m_flags & M_EXT_PAGES) ?
		    m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf,
		    m->m_ext.ext_size);
		s = splvm();
		pool_cache_put(&mbpool_cache, m);
	}

	splx(s);
	return (rv);
}

void
soloanfree(struct mbuf *m, caddr_t buf, size_t size, void *arg)
{
	struct socket *so = arg;
	int s;

	if (m == NULL) {
		sodoloanfree(NULL, buf, size);
		return;
	}

	s = splvm();
	m->m_next = so->so_pendfree;
	so->so_pendfree = m;
	splx(s);
	if (sokvawaiters)
		wakeup(&socurkva);
}

static long
sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
{
	struct iovec *iov = uio->uio_iov;
	vaddr_t sva, eva;
	vsize_t len;
	vaddr_t lva, va;
	int npgs, i, error;

	if (uio->uio_segflg != UIO_USERSPACE)
		return (0);

	if (iov->iov_len < (size_t) space)
		space = iov->iov_len;
	if (space > SOCK_LOAN_CHUNK)
		space = SOCK_LOAN_CHUNK;

	eva = round_page((vaddr_t) iov->iov_base + space);
	sva = trunc_page((vaddr_t) iov->iov_base);
	len = eva - sva;
	npgs = len >> PAGE_SHIFT;

	/* XXX KDASSERT */
	KASSERT(npgs <= M_EXT_MAXPAGES);

	lva = sokvaalloc(len, so);
	if (lva == 0)
		return 0;

	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len,
	    m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
	if (error) {
		sokvafree(lva, len);
		return (0);
	}

	for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
		    VM_PROT_READ);
	pmap_update(pmap_kernel());

	lva += (vaddr_t) iov->iov_base & PAGE_MASK;

	MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so);
	m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;

	uio->uio_resid -= space;
	/* uio_offset not updated, not set/used for write(2) */
	uio->uio_iov->iov_base = (caddr_t) uio->uio_iov->iov_base + space;
	uio->uio_iov->iov_len -= space;
	if (uio->uio_iov->iov_len == 0) {
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	return (space);
}

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */
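
/*
 * That dispatch always has the same shape: a PRU_* request constant
 * selects the operation and the mbuf arguments carry the operands.
 * A sketch of the pattern (illustrative only; compare sobind() below):
 */
#if 0
	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0,
	    nam, (struct mbuf *)0, p);
#endif
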
/*ARGSUSED*/
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p;
	struct protosw *prp;
	struct socket *so;
	int error, s;

	p = curproc;		/* XXX */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreq == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	s = splsoftnet();
	so = pool_get(&socket_pool, PR_WAITOK);
	memset((caddr_t)so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	so->so_type = type;
	so->so_proto = prp;
	so->so_send = sosend;
	so->so_receive = soreceive;
#ifdef MBUFTRACE
	so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
	so->so_mowner = &prp->pr_domain->dom_mowner;
#endif
	if (p != 0)
		so->so_uid = p->p_ucred->cr_uid;
	error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
	    (struct mbuf *)(long)proto, (struct mbuf *)0, p);
	if (error) {
		so->so_state |= SS_NOFDREF;
		sofree(so);
		splx(s);
		return (error);
	}
	splx(s);
	*aso = so;
	return (0);
}

int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, (struct mbuf *)0,
	    nam, (struct mbuf *)0, p);
	splx(s);
	return (error);
}

int
solisten(struct socket *so, int backlog)
{
	int s, error;

	s = splsoftnet();
	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
	if (error) {
		splx(s);
		return (error);
	}
	if (TAILQ_EMPTY(&so->so_q))
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0)
		backlog = 0;
	so->so_qlimit = min(backlog, somaxconn);
	splx(s);
	return (0);
}
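
/*
 * For example, with the default somaxconn of SOMAXCONN (128), a caller
 * passing solisten(so, 1000) ends up with so->so_qlimit == 128; a
 * negative backlog is first forced to 0.
 */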

void
sofree(struct socket *so)
{
	struct mbuf *m;

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (so->so_head) {
		/*
		 * We must not decommission a socket that's on the accept(2)
		 * queue.  If we do, then accept(2) may hang after select(2)
		 * indicated that the listening socket was ready.
		 */
		if (!soqremque(so, 0))
			return;
	}
	sbrelease(&so->so_snd);
	sorflush(so);
	while ((m = so->so_pendfree) != NULL) {
		so->so_pendfree = m->m_next;
		m->m_next = so_pendfree;
		so_pendfree = m;
	}
	pool_put(&socket_pool, so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(struct socket *so)
{
	struct socket *so2;
	int s, error;

	error = 0;
	s = splsoftnet();		/* conservative */
	if (so->so_options & SO_ACCEPTCONN) {
		while ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
			(void) soqremque(so2, 0);
			(void) soabort(so2);
		}
		while ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
			(void) soqremque(so2, 1);
			(void) soabort(so2);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, netcls,
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
 drop:
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
		    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
		    (struct proc *)0);
		if (error == 0)
			error = error2;
	}
 discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}

/*
 * Must be called at splsoftnet...
 */
int
soabort(struct socket *so)
{

	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, (struct mbuf *)0,
	    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
}

int
soaccept(struct socket *so, struct mbuf *nam)
{
	int s, error;

	error = 0;
	s = splsoftnet();
	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
	    (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
		error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, (struct proc *)0);
	else
		error = ECONNABORTED;

	splx(s);
	return (error);
}

int
soconnect(struct socket *so, struct mbuf *nam)
{
	struct proc *p;
	int s, error;

	p = curproc;		/* XXX */
	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splsoftnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
		    (struct mbuf *)0, nam, (struct mbuf *)0, p);
	splx(s);
	return (error);
}
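
/*
 * User-side sketch of the "null address" disconnect idiom described in
 * soconnect() above (illustrative only; the errno returned for the
 * final PRU_CONNECT depends on the protocol):
 */
#if 0
	struct sockaddr sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_family = AF_UNSPEC;
	(void) connect(s, &sa, sizeof(sa));	/* dissolves a datagram association */
#endif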

int
soconnect2(struct socket *so1, struct socket *so2)
{
	int s, error;

	s = splsoftnet();
	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
	    (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0,
	    (struct proc *)0);
	splx(s);
	return (error);
}

int
sodisconnect(struct socket *so)
{
	int s, error;

	s = splsoftnet();
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
	    (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0,
	    (struct proc *)0);
 bad:
	splx(s);
	sodopendfree(so);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags)
{
	struct proc *p;
	struct mbuf **mp, *m;
	long space, len, resid, clen, mlen;
	int error, s, dontroute, atomic;

	sodopendfree(so);

	p = curproc;		/* XXX */
	clen = 0;
	atomic = sosendallatonce(so) || top;
	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

 restart:
	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
		goto out;
	do {
		s = splsoftnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (top == 0) {
					m = m_gethdr(M_WAIT, MT_DATA);
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					m = m_get(M_WAIT, MT_DATA);
					mlen = MLEN;
				}
				MCLAIM(m, so->so_snd.sb_mowner);
				if (use_sosend_loan &&
				    uio->uio_iov->iov_len >= SOCK_LOAN_THRESH &&
				    space >= SOCK_LOAN_THRESH &&
				    (len = sosend_loan(so, uio, m,
				     space)) != 0) {
					SOSEND_COUNTER_INCR(&sosend_loan_big);
					space -= len;
					goto have_data;
				}
				if (resid >= MINCLSIZE && space >= MCLBYTES) {
					SOSEND_COUNTER_INCR(&sosend_copy_big);
					m_clget(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
					if (atomic && top == 0) {
						len = lmin(MCLBYTES - max_hdr,
						    resid);
						m->m_data += max_hdr;
					} else
						len = lmin(MCLBYTES, resid);
					space -= len;
				} else {
 nopages:
					SOSEND_COUNTER_INCR(&sosend_copy_small);
					len = lmin(lmin(mlen, resid), space);
					space -= len;
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				error = uiomove(mtod(m, caddr_t), (int)len,
				    uio);
 have_data:
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);

			s = splsoftnet();

			if (so->so_state & SS_CANTSENDMORE)
				snderr(EPIPE);

			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			if (resid > 0)
				so->so_state |= SS_MORETOCOME;
			error = (*so->so_proto->pr_usrreq)(so,
			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
			    top, addr, control, p);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			if (resid > 0)
				so->so_state &= ~SS_MORETOCOME;
			splx(s);

			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

 release:
	sbunlock(&so->so_snd);
 out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
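
/*
 * Sketch of an in-kernel caller of sosend(): describe the data with a
 * uio and let sosend() segment it, loaning or copying pages as decided
 * above.  Illustrative only; "data" and "len" are assumed to describe
 * a kernel-space buffer.
 */
#if 0
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = data;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_resid = len;
	auio.uio_procp = NULL;
	error = sosend(so, NULL, &auio, NULL, NULL, 0);
#endif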

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
	struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp;
	int flags, len, error, s, offset, moff, type, orig_resid;
	struct protosw *pr;
	struct mbuf *nextrecord;
	int mbuf_removed = 0;

	pr = so->so_proto;
	mp = mp0;
	type = 0;
	orig_resid = uio->uio_resid;
	if (paddr)
		*paddr = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if ((flags & MSG_DONTWAIT) == 0)
		sodopendfree(so);

	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
		    (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0,
		    (struct proc *)0);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
 bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);

 restart:
	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
		return (error);
	s = splsoftnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
		if (m == 0 && so->so_rcv.sb_cc)
			panic("receive 1");
#endif
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
 dontblock:
	/*
	 * On entry here, m points to the first record of the socket buffer.
	 * While we process the initial mbufs containing address and control
	 * info, we save a copy of m->m_nextpkt into nextrecord.
	 */
#ifdef notyet /* XXXX */
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
#endif
	KASSERT(m == so->so_rcv.sb_mb);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("receive 1a");
#endif
		orig_resid = 0;
		if (flags & MSG_PEEK) {
			if (paddr)
				*paddr = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			mbuf_removed = 1;
			if (paddr) {
				*paddr = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			mbuf_removed = 1;
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}

	/*
	 * If m is non-NULL, we have some data to read.  From now on,
	 * make sure to keep sb_lastrecord consistent when working on
	 * the last packet on the chain (nextrecord == NULL) and we
	 * change m->m_nextpkt.
	 */
	if (m) {
		if ((flags & MSG_PEEK) == 0) {
			m->m_nextpkt = nextrecord;
			/*
			 * If nextrecord == NULL (this is a single chain),
			 * then sb_lastrecord may not be valid here if m
			 * was changed earlier.
			 */
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m);
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == m);
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
#ifdef DIAGNOSTIC
		else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
			panic("receive 3");
#endif
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * The sockbuf must be consistent here (pointing to the
		 * current mbuf and to the next record) when we drop
		 * priority; we must note any additions to the sockbuf
		 * when we block interrupts again.
		 */
		if (mp == 0) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splsoftnet();
			if (error) {
				/*
				 * If any part of the record has been removed
				 * (such as the MT_SONAME mbuf, which will
				 * happen when PR_ADDR, and thus also
				 * PR_ATOMIC, is set), then drop the entire
				 * record to maintain the atomicity of the
				 * receive operation.
				 *
				 * This avoids a later panic("receive 1a")
				 * when compiled with DIAGNOSTIC.
				 */
				if (m && mbuf_removed
				    && (pr->pr_flags & PR_ATOMIC))
					(void) sbdroprecord(&so->so_rcv);

				goto release;
			}
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				/*
				 * If m != NULL, we also know that
				 * so->so_rcv.sb_mb != NULL.
				 */
				KASSERT(so->so_rcv.sb_mb == m);
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an
		 * error termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * If we are peeking and the socket receive buffer is
			 * full, stop since we can't get more data to peek at.
			 */
			if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
				break;
			/*
			 * If we've drained the socket buffer, tell the
			 * protocol in case it needs to do something to
			 * get it filled again.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
				(*pr->pr_usrreq)(so, PRU_RCVD,
				    (struct mbuf *)0,
				    (struct mbuf *)(long)flags,
				    (struct mbuf *)0,
				    (struct proc *)0);
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			if ((m = so->so_rcv.sb_mb) != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
			    (struct mbuf *)(long)flags, (struct mbuf *)0,
			    (struct proc *)0);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
 release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
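
/*
 * The record layout soreceive() walks for a PR_ADDR protocol such as
 * UDP looks like this (records chained through m_nextpkt, mbufs within
 * a record through m_next):
 *
 *	MT_SONAME -> [ MT_CONTROL ... ] -> MT_DATA -> MT_DATA ...
 *
 * A minimal in-kernel receive that also collects the sender's address
 * might look like the following (illustrative only; "auio" is set up
 * as in the sosend() sketch above, but with UIO_READ):
 */
#if 0
	struct mbuf *from = NULL;
	int flags = 0;
	int error;

	error = soreceive(so, &from, &auio, NULL, NULL, &flags);
	if (error == 0 && from != NULL)
		m_freem(from);		/* caller owns the address mbuf */
#endif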

int
soshutdown(struct socket *so, int how)
{
	struct protosw *pr;

	pr = so->so_proto;
	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how == SHUT_RD || how == SHUT_RDWR)
		sorflush(so);
	if (how == SHUT_WR || how == SHUT_RDWR)
		return (*pr->pr_usrreq)(so, PRU_SHUTDOWN, (struct mbuf *)0,
		    (struct mbuf *)0, (struct mbuf *)0, (struct proc *)0);
	return (0);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb, asb;
	struct protosw *pr;
	int s;

	sb = &so->so_rcv;
	pr = so->so_proto;
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splnet();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	/*
	 * Clear most of the sockbuf structure, but leave some of the
	 * fields valid.
	 */
	memset(&sb->sb_startzero, 0,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb);
}

int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
{
	int error;
	struct mbuf *m;

	error = 0;
	m = m0;
	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
			    (PRCO_SETOPT, so, level, optname, &m0));
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			if (m == NULL || m->m_len != sizeof(struct linger)) {
				error = EINVAL;
				goto bad;
			}
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/* fall thru... */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			int optval;

			if (m == NULL || m->m_len < sizeof(int)) {
				error = EINVAL;
				goto bad;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			optval = *mtod(m, int *);
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (optname) {

			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(optname == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_long) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct timeval *tv;
			short val;

			if (m == NULL || m->m_len < sizeof(*tv)) {
				error = EINVAL;
				goto bad;
			}
			tv = mtod(m, struct timeval *);
			if (tv->tv_sec > (SHRT_MAX - tv->tv_usec / tick) / hz) {
				error = EDOM;
				goto bad;
			}
			val = tv->tv_sec * hz + tv->tv_usec / tick;
			if (val == 0 && tv->tv_usec != 0)
				val = 1;

			switch (optname) {

			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		    }

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
			    (PRCO_SETOPT, so, level, optname, &m0));
			m = NULL;	/* freed by protocol */
		}
	}
 bad:
	if (m)
		(void) m_free(m);
	return (error);
}
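
/*
 * Options reach sosetopt() in an mbuf, so an in-kernel caller builds
 * one first (illustrative sketch; sosetopt() consumes the mbuf in all
 * cases).  For SO_SNDTIMEO and SO_RCVTIMEO the timeval is converted to
 * ticks above: with hz = 100 (tick = 10000), { 1, 500000 } becomes
 * 1 * 100 + 500000 / 10000 = 150 ticks.
 */
#if 0
	struct mbuf *m;
	int error;

	m = m_get(M_WAIT, MT_SOOPTS);
	m->m_len = sizeof(int);
	*mtod(m, int *) = 1;
	error = sosetopt(so, SOL_SOCKET, SO_KEEPALIVE, m);
#endif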

int
sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
{
	struct mbuf *m;

	if (level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
			    (PRCO_GETOPT, so, level, optname, mp));
		} else
			return (ENOPROTOOPT);
	} else {
		m = m_get(M_WAIT, MT_SOOPTS);
		m->m_len = sizeof(int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof(struct linger);
			mtod(m, struct linger *)->l_onoff =
			    so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			int val = (optname == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			m->m_len = sizeof(struct timeval);
			mtod(m, struct timeval *)->tv_sec = val / hz;
			mtod(m, struct timeval *)->tv_usec =
			    (val % hz) * tick;
			break;
		    }

		default:
			(void)m_free(m);
			return (ENOPROTOOPT);
		}
		*mp = m;
		return (0);
	}
}

void
sohasoutofband(struct socket *so)
{
	fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
	selwakeup(&so->so_rcv.sb_sel);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so;

	so = (struct socket *)kn->kn_fp->f_data;
	SLIST_REMOVE(&so->so_rcv.sb_sel.sel_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.sel_klist))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so;

	so = (struct socket *)kn->kn_fp->f_data;
	kn->kn_data = so->so_rcv.sb_cc;
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_rcv.sb_lowat);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so;

	so = (struct socket *)kn->kn_fp->f_data;
	SLIST_REMOVE(&so->so_snd.sb_sel.sel_klist, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.sel_klist))
		so->so_snd.sb_flags &= ~SB_KNOTE;
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so;

	so = (struct socket *)kn->kn_fp->f_data;
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		return (0);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.sb_lowat);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so;

	so = (struct socket *)kn->kn_fp->f_data;

	/*
	 * Set kn_data to number of incoming connections, not
	 * counting partial (incomplete) connections.
	 */
	kn->kn_data = so->so_qlen;
	return (kn->kn_data > 0);
}

static const struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static const struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static const struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so;
	struct sockbuf *sb;

	so = (struct socket *)kn->kn_fp->f_data;
	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (1);
	}
	SLIST_INSERT_HEAD(&sb->sb_sel.sel_klist, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	return (0);
}
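
/*
 * User-side sketch of attaching these filters (illustrative): a
 * listening socket registered with EVFILT_READ is handled by
 * filt_solisten(), which reports pending connections in kn_data.
 */
#if 0
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
#endif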