/*	$NetBSD: uipc_usrreq.c,v 1.118 2008/08/06 15:01:23 plunky Exp $	*/

/*-
 * Copyright (c) 1998, 2000, 2004, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_usrreq.c	8.9 (Berkeley) 5/14/95
 */

/*
 * Copyright (c) 1997 Christopher G. Demetriou.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_usrreq.c	8.9 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.118 2008/08/06 15:01:23 plunky Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/unpcb.h>
#include <sys/un.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/mbuf.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/atomic.h>

/*
 * Unix communications domain.
 *
 * TODO:
 *	SEQPACKET, RDM
 *	rethink name space problems
 *	need a proper out-of-band
 *
 * Notes on locking:
 *
 * The generic rules noted in uipc_socket2.c apply.  In addition:
 *
 * o We have a global lock, uipc_lock.
 *
 * o All datagram sockets are locked by uipc_lock.
 *
 * o For stream socketpairs, the two endpoints are created sharing the same
 *   independent lock.  Sockets presented to PRU_CONNECT2 must already have
 *   matching locks.
 *
 * o Stream sockets created via socket() start life with their own
 *   independent lock.
 *
 * o Stream connections to a named endpoint are slightly more complicated.
 *   Sockets that have called listen() have their lock pointer mutated to
 *   the global uipc_lock.  When establishing a connection, the connecting
 *   socket also has its lock mutated to uipc_lock, which matches the head
 *   (listening socket).  We create a new socket for accept() to return, and
 *   that also shares the head's lock.  Until the connection is completely
 *   done on both ends, all three sockets are locked by uipc_lock.  Once the
 *   connection is complete, the association with the head's lock is broken.
 *   The connecting socket and the socket returned from accept() have their
 *   lock pointers mutated away from uipc_lock, and back to the connecting
 *   socket's original, independent lock.  The head continues to be locked
 *   by uipc_lock.
 *
 * o If uipc_lock is determined to be a significant source of contention,
 *   it could easily be hashed out.  It is difficult to simply make it an
 *   independent lock because of visibility / garbage collection issues:
 *   if a socket has been associated with a lock at any point, that lock
 *   must remain valid until the socket is no longer visible in the system.
 *   The lock must not be freed or otherwise destroyed until any sockets
 *   that had referenced it have also been destroyed.
 */
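
/*
 * Illustrative sketch only (never compiled): the locking rules above,
 * restated as assertions over a fully established stream connection.
 * The helper name unp_assert_locks_settled() is hypothetical and exists
 * purely to put the rules in code form.
 */
#ifdef notdef
static void
unp_assert_locks_settled(struct socket *head, struct socket *so,
	struct socket *so2)
{

	/* A listening socket is always locked by the domain-wide lock. */
	KASSERT(head->so_lock == uipc_lock);

	/*
	 * Once the connection is complete, the connected pair shares
	 * one independent lock and no longer references uipc_lock.
	 */
	KASSERT(so->so_lock == so2->so_lock);
	KASSERT(so->so_lock != uipc_lock);
}
#endif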
const struct sockaddr_un sun_noname = {
	.sun_len = sizeof(sun_noname),
	.sun_family = AF_LOCAL,
};
ino_t	unp_ino;			/* prototype for fake inode numbers */

struct mbuf *unp_addsockcred(struct lwp *, struct mbuf *);
static kmutex_t *uipc_lock;

/*
 * Initialize Unix protocols.
 */
void
uipc_init(void)
{

	uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
}

/*
 * A connection succeeded: disassociate both endpoints from the head's
 * lock, and make them share their own lock.  There is a race here: for
 * a very brief time one endpoint will be locked by a different lock
 * than the other end.  However, since the current thread holds the old
 * lock (the listening socket's lock, the head) access can still only be
 * made to one side of the connection.
 */
static void
unp_setpeerlocks(struct socket *so, struct socket *so2)
{
	struct unpcb *unp;
	kmutex_t *lock;

	KASSERT(solocked2(so, so2));

	/*
	 * Bail out if either end of the socket is not yet fully
	 * connected or accepted.  We only break the lock association
	 * with the head when the pair of sockets stand completely
	 * on their own.
	 */
	if (so->so_head != NULL || so2->so_head != NULL)
		return;

	/*
	 * Drop references to old lock.  A third reference (from the
	 * queue head) must be held as we still hold its lock.  Bonus:
	 * we don't need to worry about garbage collecting the lock.
	 */
	lock = so->so_lock;
	KASSERT(lock == uipc_lock);
	mutex_obj_free(lock);
	mutex_obj_free(lock);

	/*
	 * Grab stream lock from the initiator and share between the two
	 * endpoints.  Issue memory barrier to ensure all modifications
	 * become globally visible before the lock change.  so2 is
	 * assumed not to have a stream lock, because it was created
	 * purely for the server side to accept this connection and
	 * started out life using the domain-wide lock.
	 */
	unp = sotounpcb(so);
	KASSERT(unp->unp_streamlock != NULL);
	KASSERT(sotounpcb(so2)->unp_streamlock == NULL);
	lock = unp->unp_streamlock;
	unp->unp_streamlock = NULL;
	mutex_obj_hold(lock);
	membar_exit();
	solockreset(so, lock);
	solockreset(so2, lock);
}

/*
 * Reset a socket's lock back to the domain-wide lock.
 */
static void
unp_resetlock(struct socket *so)
{
	kmutex_t *olock, *nlock;
	struct unpcb *unp;

	KASSERT(solocked(so));

	olock = so->so_lock;
	nlock = uipc_lock;
	if (olock == nlock)
		return;
	unp = sotounpcb(so);
	KASSERT(unp->unp_streamlock == NULL);
	unp->unp_streamlock = olock;
	mutex_obj_hold(nlock);
	mutex_enter(nlock);
	solockreset(so, nlock);
	mutex_exit(olock);
}

static void
unp_free(struct unpcb *unp)
{

	if (unp->unp_addr)
		free(unp->unp_addr, M_SONAME);
	if (unp->unp_streamlock != NULL)
		mutex_obj_free(unp->unp_streamlock);
	free(unp, M_PCB);
}

int
unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp,
	struct lwp *l)
{
	struct socket *so2;
	const struct sockaddr_un *sun;

	so2 = unp->unp_conn->unp_socket;

	KASSERT(solocked(so2));

	if (unp->unp_addr)
		sun = unp->unp_addr;
	else
		sun = &sun_noname;
	if (unp->unp_conn->unp_flags & UNP_WANTCRED)
		control = unp_addsockcred(l, control);
	if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m,
	    control) == 0) {
		so2->so_rcv.sb_overflowed++;
		sounlock(so2);
		unp_dispose(control);
		m_freem(control);
		m_freem(m);
		solock(so2);
		return (ENOBUFS);
	} else {
		sorwakeup(so2);
		return (0);
	}
}

void
unp_setaddr(struct socket *so, struct mbuf *nam, bool peeraddr)
{
	const struct sockaddr_un *sun;
	struct unpcb *unp;
	bool ext;

	unp = sotounpcb(so);
	ext = false;

	for (;;) {
		sun = NULL;
		if (peeraddr) {
			if (unp->unp_conn && unp->unp_conn->unp_addr)
				sun = unp->unp_conn->unp_addr;
		} else {
			if (unp->unp_addr)
				sun = unp->unp_addr;
		}
		if (sun == NULL)
			sun = &sun_noname;
		nam->m_len = sun->sun_len;
		if (nam->m_len > MLEN && !ext) {
			sounlock(so);
			MEXTMALLOC(nam, MAXPATHLEN * 2, M_WAITOK);
			solock(so);
			ext = true;
		} else {
			KASSERT(nam->m_len <= MAXPATHLEN * 2);
			memcpy(mtod(nam, void *), sun, (size_t)nam->m_len);
			break;
		}
	}
}

/*ARGSUSED*/
int
uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
	struct mbuf *control, struct lwp *l)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;
	struct proc *p;
	u_int newhiwat;
	int error = 0;

	if (req == PRU_CONTROL)
		return (EOPNOTSUPP);

#ifdef DIAGNOSTIC
	if (req != PRU_SEND && req != PRU_SENDOOB && control)
		panic("uipc_usrreq: unexpected control mbuf");
#endif
	p = l ? l->l_proc : NULL;
	if (req != PRU_ATTACH) {
		if (unp == 0) {
			error = EINVAL;
			goto release;
		}
		KASSERT(solocked(so));
	}

	switch (req) {

	case PRU_ATTACH:
		if (unp != 0) {
			error = EISCONN;
			break;
		}
		error = unp_attach(so);
		break;

	case PRU_DETACH:
		unp_detach(unp);
		break;

	case PRU_BIND:
		KASSERT(l != NULL);
		error = unp_bind(so, nam, l);
		break;

	case PRU_LISTEN:
		/*
		 * If the socket can accept a connection, it must be
		 * locked by uipc_lock.
		 */
		unp_resetlock(so);
		if (unp->unp_vnode == 0)
			error = EINVAL;
		break;

	case PRU_CONNECT:
		KASSERT(l != NULL);
		error = unp_connect(so, nam, l);
		break;

	case PRU_CONNECT2:
		error = unp_connect2(so, (struct socket *)nam, PRU_CONNECT2);
		break;

	case PRU_DISCONNECT:
		unp_disconnect(unp);
		break;

	case PRU_ACCEPT:
		KASSERT(so->so_lock == uipc_lock);
		/*
		 * Mark the initiating STREAM socket as connected *ONLY*
		 * after it's been accepted.  This prevents a client from
		 * overrunning a server and receiving ECONNREFUSED.
		 */
		if (unp->unp_conn == NULL)
			break;
		so2 = unp->unp_conn->unp_socket;
		if (so2->so_state & SS_ISCONNECTING) {
			KASSERT(solocked2(so, so->so_head));
			KASSERT(solocked2(so2, so->so_head));
			soisconnected(so2);
		}
		/*
		 * If the connection is fully established, break the
		 * association with uipc_lock and give the connected
		 * pair a separate lock to share.
		 */
		unp_setpeerlocks(so2, so);
		/*
		 * Only now return peer's address, as we may need to
		 * block in order to allocate memory.
		 *
		 * XXX Minor race: connection can be broken while
		 * lock is dropped in unp_setaddr().  We will return
		 * error == 0 and sun_noname as the peer address.
		 */
		unp_setaddr(so, nam, true);
		break;

	case PRU_SHUTDOWN:
		socantsendmore(so);
		unp_shutdown(unp);
		break;

	case PRU_RCVD:
		switch (so->so_type) {

		case SOCK_DGRAM:
			panic("uipc 1");
			/*NOTREACHED*/

		case SOCK_STREAM:
#define	rcv (&so->so_rcv)
#define	snd (&so2->so_snd)
			if (unp->unp_conn == 0)
				break;
			so2 = unp->unp_conn->unp_socket;
			KASSERT(solocked2(so, so2));
			/*
			 * Adjust backpressure on sender
			 * and wakeup any waiting to write.
			 */
			snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
			unp->unp_mbcnt = rcv->sb_mbcnt;
			newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc;
			(void)chgsbsize(so2->so_uidinfo,
			    &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
			unp->unp_cc = rcv->sb_cc;
			sowwakeup(so2);
#undef snd
#undef rcv
			break;

		default:
			panic("uipc 2");
		}
		break;

	case PRU_SEND:
		/*
		 * Note: unp_internalize() rejects any control message
		 * other than SCM_RIGHTS, and only allows one.  This
		 * has the side-effect of preventing a caller from
		 * forging SCM_CREDS.
		 */
		if (control) {
			sounlock(so);
			error = unp_internalize(&control);
			solock(so);
			if (error != 0) {
				m_freem(control);
				m_freem(m);
				break;
			}
		}
		switch (so->so_type) {

		case SOCK_DGRAM: {
			KASSERT(so->so_lock == uipc_lock);
			if (nam) {
				if ((so->so_state & SS_ISCONNECTED) != 0)
					error = EISCONN;
				else {
					/*
					 * Note: once connected, the
					 * socket's lock must not be
					 * dropped until we have sent
					 * the message and disconnected.
					 * This is necessary to prevent
					 * intervening control ops, like
					 * another connection.
					 */
					error = unp_connect(so, nam, l);
				}
			} else {
				if ((so->so_state & SS_ISCONNECTED) == 0)
					error = ENOTCONN;
			}
			if (error) {
				sounlock(so);
				unp_dispose(control);
				m_freem(control);
				m_freem(m);
				solock(so);
				break;
			}
			KASSERT(p != NULL);
			error = unp_output(m, control, unp, l);
			if (nam)
				unp_disconnect(unp);
			break;
		}

		case SOCK_STREAM:
#define	rcv (&so2->so_rcv)
#define	snd (&so->so_snd)
			if (unp->unp_conn == NULL) {
				error = ENOTCONN;
				break;
			}
			so2 = unp->unp_conn->unp_socket;
			KASSERT(solocked2(so, so2));
			if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
				/*
				 * Credentials are passed only once on
				 * SOCK_STREAM.
				 */
				unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
				control = unp_addsockcred(l, control);
			}
			/*
			 * Send to paired receive port, and then reduce
			 * send buffer hiwater marks to maintain backpressure.
			 * Wake up readers.
			 */
			if (control) {
				if (sbappendcontrol(rcv, m, control) != 0)
					control = NULL;
			} else
				sbappend(rcv, m);
			snd->sb_mbmax -=
			    rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
			unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
			newhiwat = snd->sb_hiwat -
			    (rcv->sb_cc - unp->unp_conn->unp_cc);
			(void)chgsbsize(so->so_uidinfo,
			    &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
			unp->unp_conn->unp_cc = rcv->sb_cc;
			sorwakeup(so2);
#undef snd
#undef rcv
			if (control != NULL) {
				sounlock(so);
				unp_dispose(control);
				m_freem(control);
				solock(so);
			}
			break;

		default:
			panic("uipc 4");
		}
		break;

	case PRU_ABORT:
		(void)unp_drop(unp, ECONNABORTED);

		KASSERT(so->so_head == NULL);
#ifdef DIAGNOSTIC
		if (so->so_pcb == 0)
			panic("uipc 5: drop killed pcb");
#endif
		unp_detach(unp);
		break;

	case PRU_SENSE:
		((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
		if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
			so2 = unp->unp_conn->unp_socket;
			KASSERT(solocked2(so, so2));
			((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc;
		}
		((struct stat *) m)->st_dev = NODEV;
		if (unp->unp_ino == 0)
			unp->unp_ino = unp_ino++;
		((struct stat *) m)->st_atimespec =
		    ((struct stat *) m)->st_mtimespec =
		    ((struct stat *) m)->st_ctimespec = unp->unp_ctime;
		((struct stat *) m)->st_ino = unp->unp_ino;
		return (0);

	case PRU_RCVOOB:
		error = EOPNOTSUPP;
		break;

	case PRU_SENDOOB:
		m_freem(control);
		m_freem(m);
		error = EOPNOTSUPP;
		break;

	case PRU_SOCKADDR:
		unp_setaddr(so, nam, false);
		break;

	case PRU_PEERADDR:
		unp_setaddr(so, nam, true);
		break;

	default:
		panic("uipc_usrreq");
	}

 release:
	return (error);
}

/*
 * Unix domain socket option processing.
 */
int
uipc_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
	struct unpcb *unp = sotounpcb(so);
	int optval = 0, error = 0;

	KASSERT(solocked(so));

	if (sopt->sopt_level != 0) {
		error = ENOPROTOOPT;
	} else switch (op) {

	case PRCO_SETOPT:
		switch (sopt->sopt_name) {
		case LOCAL_CREDS:
		case LOCAL_CONNWAIT:
			error = sockopt_getint(sopt, &optval);
			if (error)
				break;
			switch (sopt->sopt_name) {
#define	OPTSET(bit) \
			if (optval) \
				unp->unp_flags |= (bit); \
			else \
				unp->unp_flags &= ~(bit);

			case LOCAL_CREDS:
				OPTSET(UNP_WANTCRED);
				break;
			case LOCAL_CONNWAIT:
				OPTSET(UNP_CONNWAIT);
				break;
			}
			break;
#undef OPTSET

		default:
			error = ENOPROTOOPT;
			break;
		}
		break;

	case PRCO_GETOPT:
		sounlock(so);
		switch (sopt->sopt_name) {
		case LOCAL_PEEREID:
			if (unp->unp_flags & UNP_EIDSVALID) {
				error = sockopt_set(sopt,
				    &unp->unp_connid, sizeof(unp->unp_connid));
			} else {
				error = EINVAL;
			}
			break;
		case LOCAL_CREDS:
#define	OPTBIT(bit)	(unp->unp_flags & (bit) ? 1 : 0)

			optval = OPTBIT(UNP_WANTCRED);
			error = sockopt_setint(sopt, optval);
			break;
#undef OPTBIT

		default:
			error = ENOPROTOOPT;
			break;
		}
		solock(so);
		break;
	}
	return (error);
}
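
/*
 * Illustrative userland sketch (never compiled): querying the
 * LOCAL_PEEREID option handled above.  Note the option level of 0,
 * since uipc_ctloutput() rejects any other level.  The struct unpcbid
 * layout is assumed to mirror the unp_connid member used above.
 */
#ifdef notdef
#include <sys/socket.h>
#include <sys/un.h>
#include <stdio.h>

static int
print_peereid(int s)
{
	struct unpcbid cid;
	socklen_t len = sizeof(cid);

	/* Valid only once the connection has been established. */
	if (getsockopt(s, 0, LOCAL_PEEREID, &cid, &len) == -1)
		return -1;
	printf("peer: pid %ld euid %ld egid %ld\n",
	    (long)cid.unp_pid, (long)cid.unp_euid, (long)cid.unp_egid);
	return 0;
}
#endif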

/*
 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
 * for stream sockets, although the total for sender and receiver is
 * actually only PIPSIZ.
 * Datagram sockets really use the sendspace as the maximum datagram size,
 * and don't really want to reserve the sendspace.  Their recvspace should
 * be large enough for at least one max-size datagram plus address.
 */
#define	PIPSIZ	4096
u_long	unpst_sendspace = PIPSIZ;
u_long	unpst_recvspace = PIPSIZ;
u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
u_long	unpdg_recvspace = 4*1024;

u_int	unp_rights;			/* file descriptors in flight */
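
/*
 * Illustrative userland sketch (never compiled): because unpdg_sendspace
 * doubles as the maximum datagram size, a send() of anything larger
 * fails immediately with EMSGSIZE instead of blocking for buffer space.
 */
#ifdef notdef
#include <sys/socket.h>
#include <assert.h>
#include <errno.h>
#include <string.h>

static void
dgram_size_demo(int s)
{
	char big[3 * 1024];	/* larger than unpdg_sendspace (2k) */

	/* s is a connected SOCK_DGRAM AF_LOCAL socket. */
	memset(big, 0, sizeof(big));
	assert(send(s, big, sizeof(big), 0) == -1);
	assert(errno == EMSGSIZE);
}
#endif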

int
unp_attach(struct socket *so)
{
	struct unpcb *unp;
	int error;

	switch (so->so_type) {
	case SOCK_STREAM:
		if (so->so_lock == NULL) {
			/*
			 * XXX Assuming that no socket locks are held,
			 * as this call may sleep.
			 */
			so->so_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
			solock(so);
		}
		if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
			error = soreserve(so, unpst_sendspace, unpst_recvspace);
			if (error != 0)
				return (error);
		}
		break;

	case SOCK_DGRAM:
		if (so->so_lock == NULL) {
			mutex_obj_hold(uipc_lock);
			so->so_lock = uipc_lock;
			solock(so);
		}
		if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
			if (error != 0)
				return (error);
		}
		break;

	default:
		panic("unp_attach");
	}
	KASSERT(solocked(so));
	unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT);
	if (unp == NULL)
		return (ENOBUFS);
	memset((void *)unp, 0, sizeof(*unp));
	unp->unp_socket = so;
	so->so_pcb = unp;
	nanotime(&unp->unp_ctime);
	return (0);
}

void
unp_detach(struct unpcb *unp)
{
	struct socket *so;
	vnode_t *vp;

	so = unp->unp_socket;

 retry:
	if ((vp = unp->unp_vnode) != NULL) {
		sounlock(so);
		/* Acquire v_interlock to protect against unp_connect(). */
		/* XXXAD racy */
		mutex_enter(&vp->v_interlock);
		vp->v_socket = NULL;
		vrelel(vp, 0);
		solock(so);
		unp->unp_vnode = NULL;
	}
	if (unp->unp_conn)
		unp_disconnect(unp);
	while (unp->unp_refs) {
		KASSERT(solocked2(so, unp->unp_refs->unp_socket));
		if (unp_drop(unp->unp_refs, ECONNRESET)) {
			solock(so);
			goto retry;
		}
	}
	soisdisconnected(so);
	so->so_pcb = NULL;
	if (unp_rights) {
		/*
		 * Normally the receive buffer is flushed later,
		 * in sofree, but if our receive buffer holds references
		 * to descriptors that are now garbage, we will dispose
		 * of those descriptor references after the garbage collector
		 * gets them (resulting in a "panic: closef: count < 0").
		 */
		sorflush(so);
		unp_free(unp);
		sounlock(so);
		unp_gc();
		solock(so);
	} else
		unp_free(unp);
}

int
unp_bind(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	struct sockaddr_un *sun;
	struct unpcb *unp;
	vnode_t *vp;
	struct vattr vattr;
	size_t addrlen;
	int error;
	struct nameidata nd;
	proc_t *p;

	unp = sotounpcb(so);
	if (unp->unp_vnode != NULL)
		return (EINVAL);
	if ((unp->unp_flags & UNP_BUSY) != 0) {
		/*
		 * EALREADY may not be strictly accurate, but since this
		 * is a major application error it's hardly a big deal.
		 */
		return (EALREADY);
	}
	unp->unp_flags |= UNP_BUSY;
	sounlock(so);

	/*
	 * Allocate the new sockaddr.  We have to allocate one
	 * extra byte so that we can ensure that the pathname
	 * is nul-terminated.
	 */
	p = l->l_proc;
	addrlen = nam->m_len + 1;
	sun = malloc(addrlen, M_SONAME, M_WAITOK);
	m_copydata(nam, 0, nam->m_len, (void *)sun);
	*(((char *)sun) + nam->m_len) = '\0';

	NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT | TRYEMULROOT, UIO_SYSSPACE,
	    sun->sun_path);

	/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	if ((error = namei(&nd)) != 0)
		goto bad;
	vp = nd.ni_vp;
	if (vp != NULL) {
		VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		error = EADDRINUSE;
		goto bad;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	vattr.va_mode = ACCESSPERMS & ~(p->p_cwdi->cwdi_cmask);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	if (error)
		goto bad;
	vp = nd.ni_vp;
	solock(so);
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_addrlen = addrlen;
	unp->unp_addr = sun;
	unp->unp_connid.unp_pid = p->p_pid;
	unp->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
	unp->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
	unp->unp_flags |= UNP_EIDSBIND;
	VOP_UNLOCK(vp, 0);
	unp->unp_flags &= ~UNP_BUSY;
	return (0);

 bad:
	free(sun, M_SONAME);
	solock(so);
	unp->unp_flags &= ~UNP_BUSY;
	return (error);
}
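
/*
 * Illustrative userland sketch (never compiled): binding creates a
 * fresh VSOCK vnode via namei()/VOP_CREATE() above, so bind() fails
 * with EADDRINUSE if the name already exists; callers conventionally
 * unlink() any stale socket file first.
 */
#ifdef notdef
#include <sys/socket.h>
#include <sys/un.h>
#include <string.h>
#include <unistd.h>

static int
bind_local(int s, const char *path)
{
	struct sockaddr_un sun;

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_LOCAL;
	strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
	sun.sun_len = SUN_LEN(&sun);

	(void)unlink(path);		/* remove any stale socket file */
	return bind(s, (struct sockaddr *)&sun, SUN_LEN(&sun));
}
#endif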

int
unp_connect(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	struct sockaddr_un *sun;
	vnode_t *vp;
	struct socket *so2, *so3;
	struct unpcb *unp, *unp2, *unp3;
	size_t addrlen;
	int error;
	struct nameidata nd;

	unp = sotounpcb(so);
	if ((unp->unp_flags & UNP_BUSY) != 0) {
		/*
		 * EALREADY may not be strictly accurate, but since this
		 * is a major application error it's hardly a big deal.
		 */
		return (EALREADY);
	}
	unp->unp_flags |= UNP_BUSY;
	sounlock(so);

	/*
	 * Allocate a temporary sockaddr.  We have to allocate one extra
	 * byte so that we can ensure that the pathname is nul-terminated.
	 * When we establish the connection, we copy the other PCB's
	 * sockaddr to our own.
	 */
	addrlen = nam->m_len + 1;
	sun = malloc(addrlen, M_SONAME, M_WAITOK);
	m_copydata(nam, 0, nam->m_len, (void *)sun);
	*(((char *)sun) + nam->m_len) = '\0';

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, UIO_SYSSPACE,
	    sun->sun_path);

	if ((error = namei(&nd)) != 0)
		goto bad2;
	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	if ((error = VOP_ACCESS(vp, VWRITE, l->l_cred)) != 0)
		goto bad;
	/* Acquire v_interlock to protect against unp_detach(). */
	mutex_enter(&vp->v_interlock);
	so2 = vp->v_socket;
	if (so2 == NULL) {
		mutex_exit(&vp->v_interlock);
		error = ECONNREFUSED;
		goto bad;
	}
	if (so->so_type != so2->so_type) {
		mutex_exit(&vp->v_interlock);
		error = EPROTOTYPE;
		goto bad;
	}
	solock(so);
	unp_resetlock(so);
	mutex_exit(&vp->v_interlock);
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
		/*
		 * This may seem somewhat fragile but is OK: if we can
		 * see SO_ACCEPTCONN set on the endpoint, then it must
		 * be locked by the domain-wide uipc_lock.
		 */
		KASSERT((so->so_options & SO_ACCEPTCONN) == 0 ||
		    so2->so_lock == uipc_lock);
		if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    (so3 = sonewconn(so2, 0)) == 0) {
			error = ECONNREFUSED;
			sounlock(so);
			goto bad;
		}
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		if (unp2->unp_addr) {
			unp3->unp_addr = malloc(unp2->unp_addrlen,
			    M_SONAME, M_WAITOK);
			memcpy(unp3->unp_addr, unp2->unp_addr,
			    unp2->unp_addrlen);
			unp3->unp_addrlen = unp2->unp_addrlen;
		}
		unp3->unp_flags = unp2->unp_flags;
		unp3->unp_connid.unp_pid = l->l_proc->p_pid;
		unp3->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
		unp3->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
		unp3->unp_flags |= UNP_EIDSVALID;
		if (unp2->unp_flags & UNP_EIDSBIND) {
			unp->unp_connid = unp2->unp_connid;
			unp->unp_flags |= UNP_EIDSVALID;
		}
		so2 = so3;
	}
	error = unp_connect2(so, so2, PRU_CONNECT);
	sounlock(so);
 bad:
	vput(vp);
 bad2:
	free(sun, M_SONAME);
	solock(so);
	unp->unp_flags &= ~UNP_BUSY;
	return (error);
}

int
unp_connect2(struct socket *so, struct socket *so2, int req)
{
	struct unpcb *unp = sotounpcb(so);
	struct unpcb *unp2;

	if (so2->so_type != so->so_type)
		return (EPROTOTYPE);

	/*
	 * All three sockets involved must be locked by same lock:
	 *
	 * local endpoint (so)
	 * remote endpoint (so2)
	 * queue head (so->so_head, only if PR_CONNREQUIRED)
	 */
	KASSERT(solocked2(so, so2));
	if (so->so_head != NULL) {
		KASSERT(so->so_lock == uipc_lock);
		KASSERT(solocked2(so, so->so_head));
	}

	unp2 = sotounpcb(so2);
	unp->unp_conn = unp2;
	switch (so->so_type) {

	case SOCK_DGRAM:
		unp->unp_nextref = unp2->unp_refs;
		unp2->unp_refs = unp;
		soisconnected(so);
		break;

	case SOCK_STREAM:
		unp2->unp_conn = unp;
		if (req == PRU_CONNECT &&
		    ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
			soisconnecting(so);
		else
			soisconnected(so);
		soisconnected(so2);
		/*
		 * If the connection is fully established, break the
		 * association with uipc_lock and give the connected
		 * pair a separate lock to share.  For CONNECT2, we
		 * require that the locks already match (the sockets
		 * are created that way).
		 */
		if (req == PRU_CONNECT)
			unp_setpeerlocks(so, so2);
		break;

	default:
		panic("unp_connect2");
	}
	return (0);
}
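
/*
 * Illustrative userland sketch (never compiled): socketpair() arrives
 * here as PRU_CONNECT2 with both endpoints already sharing one lock,
 * which is why unp_connect2() skips the lock juggling on that path.
 */
#ifdef notdef
#include <sys/socket.h>
#include <unistd.h>

static int
stream_pair_demo(void)
{
	int sv[2];
	char c;

	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sv) == -1)
		return -1;
	(void)write(sv[0], "x", 1);	/* immediately readable on sv[1] */
	(void)read(sv[1], &c, 1);
	close(sv[0]);
	close(sv[1]);
	return 0;
}
#endif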

void
unp_disconnect(struct unpcb *unp)
{
	struct unpcb *unp2 = unp->unp_conn;
	struct socket *so;

	if (unp2 == 0)
		return;
	unp->unp_conn = 0;
	so = unp->unp_socket;
	switch (so->so_type) {
	case SOCK_DGRAM:
		if (unp2->unp_refs == unp)
			unp2->unp_refs = unp->unp_nextref;
		else {
			unp2 = unp2->unp_refs;
			for (;;) {
				KASSERT(solocked2(so, unp2->unp_socket));
				if (unp2 == 0)
					panic("unp_disconnect");
				if (unp2->unp_nextref == unp)
					break;
				unp2 = unp2->unp_nextref;
			}
			unp2->unp_nextref = unp->unp_nextref;
		}
		unp->unp_nextref = 0;
		so->so_state &= ~SS_ISCONNECTED;
		break;

	case SOCK_STREAM:
		KASSERT(solocked2(so, unp2->unp_socket));
		soisdisconnected(so);
		unp2->unp_conn = 0;
		soisdisconnected(unp2->unp_socket);
		break;
	}
}

#ifdef notdef
unp_abort(struct unpcb *unp)
{
	unp_detach(unp);
}
#endif

void
unp_shutdown(struct unpcb *unp)
{
	struct socket *so;

	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
	    (so = unp->unp_conn->unp_socket))
		socantrcvmore(so);
}

bool
unp_drop(struct unpcb *unp, int errno)
{
	struct socket *so = unp->unp_socket;

	KASSERT(solocked(so));

	so->so_error = errno;
	unp_disconnect(unp);
	if (so->so_head) {
		so->so_pcb = NULL;
		/* sofree() drops the socket lock */
		sofree(so);
		unp_free(unp);
		return true;
	}
	return false;
}

#ifdef notdef
unp_drain(void)
{

}
#endif

int
unp_externalize(struct mbuf *rights, struct lwp *l)
{
	struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
	struct proc *p = l->l_proc;
	int i, *fdp;
	file_t **rp;
	file_t *fp;
	int nfds, error = 0;

	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
	    sizeof(file_t *);
	rp = (file_t **)CMSG_DATA(cm);

	fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK);
	rw_enter(&p->p_cwdi->cwdi_lock, RW_READER);
	/* Make sure the recipient is allowed to see the descriptors. */
	if (p->p_cwdi->cwdi_rdir != NULL) {
		rp = (file_t **)CMSG_DATA(cm);
		for (i = 0; i < nfds; i++) {
			fp = *rp++;
			/*
			 * If we are in a chroot'ed directory, and
			 * someone wants to pass us a directory, make
			 * sure it's inside the subtree we're allowed
			 * to access.
			 */
			if (fp->f_type == DTYPE_VNODE) {
				vnode_t *vp = (vnode_t *)fp->f_data;
				if ((vp->v_type == VDIR) &&
				    !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) {
					error = EPERM;
					break;
				}
			}
		}
	}

 restart:
	rp = (file_t **)CMSG_DATA(cm);
	if (error != 0) {
		for (i = 0; i < nfds; i++) {
			fp = *rp;
			/*
			 * Zero the pointer before calling unp_discard(),
			 * since it may end up in unp_gc().
			 */
			*rp++ = 0;
			unp_discard(fp);
		}
		goto out;
	}

	/*
	 * First loop -- allocate file descriptor table slots for the
	 * new descriptors.
	 */
	for (i = 0; i < nfds; i++) {
		fp = *rp++;
		if ((error = fd_alloc(p, 0, &fdp[i])) != 0) {
			/*
			 * Back out what we've done so far.
			 */
			for (--i; i >= 0; i--) {
				fd_abort(p, NULL, fdp[i]);
			}
			if (error == ENOSPC) {
				fd_tryexpand(p);
				error = 0;
			} else {
				/*
				 * This is the error that has historically
				 * been returned, and some callers may
				 * expect it.
				 */
				error = EMSGSIZE;
			}
			goto restart;
		}
	}

	/*
	 * Now that adding them has succeeded, update all of the
	 * descriptor passing state.
	 */
	rp = (file_t **)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		fp = *rp++;
		atomic_dec_uint(&unp_rights);
		fd_affix(p, fp, fdp[i]);
		mutex_enter(&fp->f_lock);
		fp->f_msgcount--;
		mutex_exit(&fp->f_lock);
		/*
		 * Note that fd_affix() adds a reference to the file.
		 * The file may already have been closed by another
		 * LWP in the process, so we must drop the reference
		 * added by unp_internalize() with closef().
		 */
		closef(fp);
	}

	/*
	 * Copy temporary array to message and adjust length, in case of
	 * transition from large file_t pointers to ints.
	 */
	memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int));
	cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
	rights->m_len = CMSG_SPACE(nfds * sizeof(int));
 out:
	rw_exit(&p->p_cwdi->cwdi_lock);
	free(fdp, M_TEMP);
	return (error);
}
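
/*
 * Illustrative userland sketch (never compiled): the receive side of
 * descriptor passing.  By the time recvmsg() returns, unp_externalize()
 * has rewritten the SCM_RIGHTS payload from in-kernel file_t pointers
 * into freshly allocated descriptor numbers.
 */
#ifdef notdef
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
recv_fd(int s)
{
	struct msghdr msg;
	struct cmsghdr *cm;
	struct iovec iov;
	union {
		struct cmsghdr hdr;	/* for alignment */
		char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;
	char data;
	int fd = -1;

	iov.iov_base = &data;
	iov.iov_len = 1;
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);

	if (recvmsg(s, &msg, 0) == -1)
		return -1;
	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
	    cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_RIGHTS)
			memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
	}
	return fd;
}
#endif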

int
unp_internalize(struct mbuf **controlp)
{
	struct filedesc *fdescp = curlwp->l_fd;
	struct mbuf *control = *controlp;
	struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
	file_t **rp, **files;
	file_t *fp;
	int i, fd, *fdp;
	int nfds, error;

	error = 0;
	newcm = NULL;

	/* Sanity check the control message header. */
	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    cm->cmsg_len > control->m_len ||
	    cm->cmsg_len < CMSG_ALIGN(sizeof(*cm)))
		return (EINVAL);

	/*
	 * Verify that the file descriptors are valid, and acquire
	 * a reference to each.
	 */
	nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
	fdp = (int *)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		fd = *fdp++;
		if ((fp = fd_getfile(fd)) == NULL) {
			nfds = i + 1;
			error = EBADF;
			goto out;
		}
	}

	/* Allocate new space and copy header into it. */
	newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK);
	if (newcm == NULL) {
		error = E2BIG;
		goto out;
	}
	memcpy(newcm, cm, sizeof(struct cmsghdr));
	files = (file_t **)CMSG_DATA(newcm);

	/*
	 * Transform the file descriptors into file_t pointers, in
	 * reverse order so that if pointers are bigger than ints, the
	 * int won't get trampled until we're done.  No need to lock, as we have
	 * already validated the descriptors with fd_getfile().
	 */
	fdp = (int *)CMSG_DATA(cm) + nfds;
	rp = files + nfds;
	for (i = 0; i < nfds; i++) {
		fp = fdescp->fd_ofiles[*--fdp]->ff_file;
		KASSERT(fp != NULL);
		mutex_enter(&fp->f_lock);
		*--rp = fp;
		fp->f_count++;
		fp->f_msgcount++;
		mutex_exit(&fp->f_lock);
		atomic_inc_uint(&unp_rights);
	}

 out:
	/* Release descriptor references. */
	fdp = (int *)CMSG_DATA(cm);
	for (i = 0; i < nfds; i++) {
		fd_putfile(*fdp++);
	}

	if (error == 0) {
		if (control->m_flags & M_EXT) {
			m_freem(control);
			*controlp = control = m_get(M_WAIT, MT_CONTROL);
		}
		MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)),
		    M_MBUF, NULL, NULL);
		cm = newcm;
		/*
		 * Adjust message & mbuf to note amount of space
		 * actually used.
		 */
		cm->cmsg_len = CMSG_LEN(nfds * sizeof(file_t *));
		control->m_len = CMSG_SPACE(nfds * sizeof(file_t *));
	}

	return error;
}
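
/*
 * Illustrative userland sketch (never compiled): the send side of
 * descriptor passing.  unp_internalize() above accepts exactly this
 * shape of control message: a single SCM_RIGHTS header at SOL_SOCKET
 * carrying an array of ints.  One byte of ordinary data travels along
 * with the rights.
 */
#ifdef notdef
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
send_fd(int s, int fd)
{
	struct msghdr msg;
	struct cmsghdr *cm;
	struct iovec iov;
	union {
		struct cmsghdr hdr;	/* for alignment */
		char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;
	char data = 'x';

	iov.iov_base = &data;
	iov.iov_len = 1;
	memset(&cmsgbuf, 0, sizeof(cmsgbuf));
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);

	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_len = CMSG_LEN(sizeof(int));
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_RIGHTS;
	memcpy(CMSG_DATA(cm), &fd, sizeof(fd));

	return (sendmsg(s, &msg, 0) == -1) ? -1 : 0;
}
#endif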

struct mbuf *
unp_addsockcred(struct lwp *l, struct mbuf *control)
{
	struct cmsghdr *cmp;
	struct sockcred *sc;
	struct mbuf *m, *n;
	int len, space, i;

	len = CMSG_LEN(SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)));
	space = CMSG_SPACE(SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)));

	m = m_get(M_WAIT, MT_CONTROL);
	if (space > MLEN) {
		if (space > MCLBYTES)
			MEXTMALLOC(m, space, M_WAITOK);
		else
			m_clget(m, M_WAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (control);
		}
	}

	m->m_len = space;
	m->m_next = NULL;
	cmp = mtod(m, struct cmsghdr *);
	sc = (struct sockcred *)CMSG_DATA(cmp);
	cmp->cmsg_len = len;
	cmp->cmsg_level = SOL_SOCKET;
	cmp->cmsg_type = SCM_CREDS;
	sc->sc_uid = kauth_cred_getuid(l->l_cred);
	sc->sc_euid = kauth_cred_geteuid(l->l_cred);
	sc->sc_gid = kauth_cred_getgid(l->l_cred);
	sc->sc_egid = kauth_cred_getegid(l->l_cred);
	sc->sc_ngroups = kauth_cred_ngroups(l->l_cred);
	for (i = 0; i < sc->sc_ngroups; i++)
		sc->sc_groups[i] = kauth_cred_group(l->l_cred, i);

	/*
	 * If a control message already exists, append us to the end.
	 */
	if (control != NULL) {
		for (n = control; n->m_next != NULL; n = n->m_next)
			;
		n->m_next = m;
	} else
		control = m;

	return (control);
}
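
/*
 * Illustrative userland sketch (never compiled): consuming the
 * SCM_CREDS message built above.  The receiver enables LOCAL_CREDS
 * (option level 0, see uipc_ctloutput()); the kernel then attaches
 * the sender's struct sockcred, sized with SOCKCREDSIZE() to cover
 * the supplementary group list.  Per the comment in uipc_usrreq(),
 * SOCK_STREAM sockets carry the credentials only once.
 */
#ifdef notdef
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/uio.h>
#include <limits.h>
#include <string.h>

static int
recv_creds(int s)
{
	struct msghdr msg;
	struct cmsghdr *cm;
	struct sockcred *sc;
	struct iovec iov;
	union {
		struct cmsghdr hdr;	/* for alignment */
		char buf[CMSG_SPACE(SOCKCREDSIZE(NGROUPS_MAX))];
	} cmsgbuf;
	char data;
	int on = 1;

	/* Ask for the peer's credentials on received messages. */
	if (setsockopt(s, 0, LOCAL_CREDS, &on, sizeof(on)) == -1)
		return -1;

	iov.iov_base = &data;
	iov.iov_len = 1;
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);
	if (recvmsg(s, &msg, 0) == -1)
		return -1;

	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
	    cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_CREDS) {
			sc = (struct sockcred *)CMSG_DATA(cm);
			return (int)sc->sc_euid;	/* peer's euid */
		}
	}
	return -1;
}
#endif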

int	unp_defer, unp_gcing;
extern	struct domain unixdomain;

/*
 * Comment added long after the fact explaining what's going on here.
 * Do a mark-sweep GC of file descriptors on the system, to free up
 * any which are caught in flight to an about-to-be-closed socket.
 *
 * Traditional mark-sweep gc's start at the "root", and mark
 * everything reachable from the root (which, in our case would be the
 * process table).  The mark bits are cleared during the sweep.
 *
 * XXX For some inexplicable reason (perhaps because the file
 * descriptor tables used to live in the u area which could be swapped
 * out and thus hard to reach), we do multiple scans over the set of
 * descriptors, using *two* mark bits per object (DEFER and MARK).
 * Whenever we find a descriptor which references other descriptors,
 * the ones it references are marked with both bits, and we iterate
 * over the whole file table until there are no more DEFER bits set.
 * We also make an extra pass *before* the GC to clear the mark bits,
 * which could have been cleared at almost no cost during the previous
 * sweep.
 */
void
unp_gc(void)
{
	file_t *fp, *nextfp;
	struct socket *so, *so1;
	file_t **extra_ref, **fpp;
	int nunref, nslots, i;

	if (atomic_swap_uint(&unp_gcing, 1) == 1)
		return;

 restart:
	nslots = nfiles * 2;
	extra_ref = kmem_alloc(nslots * sizeof(file_t *), KM_SLEEP);

	mutex_enter(&filelist_lock);
	unp_defer = 0;

	/* Clear mark bits */
	LIST_FOREACH(fp, &filehead, f_list) {
		atomic_and_uint(&fp->f_flag, ~(FMARK|FDEFER));
	}

	/*
	 * Iterate over the set of descriptors, marking ones believed
	 * (based on refcount) to be referenced from a process, and
	 * marking for rescan descriptors which are queued on a socket.
	 */
	do {
		LIST_FOREACH(fp, &filehead, f_list) {
			mutex_enter(&fp->f_lock);
			if (fp->f_flag & FDEFER) {
				atomic_and_uint(&fp->f_flag, ~FDEFER);
				unp_defer--;
				KASSERT(fp->f_count != 0);
			} else {
				if (fp->f_count == 0 ||
				    (fp->f_flag & FMARK) ||
				    fp->f_count == fp->f_msgcount) {
					mutex_exit(&fp->f_lock);
					continue;
				}
			}
			atomic_or_uint(&fp->f_flag, FMARK);

			if (fp->f_type != DTYPE_SOCKET ||
			    (so = fp->f_data) == NULL ||
			    so->so_proto->pr_domain != &unixdomain ||
			    (so->so_proto->pr_flags&PR_RIGHTS) == 0) {
				mutex_exit(&fp->f_lock);
				continue;
			}
#ifdef notdef
			if (so->so_rcv.sb_flags & SB_LOCK) {
				mutex_exit(&fp->f_lock);
				mutex_exit(&filelist_lock);
				kmem_free(extra_ref, nslots * sizeof(file_t *));
				/*
				 * This is problematical; it's not clear
				 * we need to wait for the sockbuf to be
				 * unlocked (on a uniprocessor, at least),
				 * and it's also not clear what to do
				 * if sbwait returns an error due to receipt
				 * of a signal.  If sbwait does return
				 * an error, we'll go into an infinite
				 * loop.  Delete all of this for now.
				 */
				(void) sbwait(&so->so_rcv);
				goto restart;
			}
#endif
			mutex_exit(&fp->f_lock);

			/*
			 * XXX Locking a socket with filelist_lock held
			 * is ugly.  filelist_lock can be taken by the
			 * pagedaemon when reclaiming items from file_cache.
			 * Socket activity could delay the pagedaemon.
			 */
			solock(so);
			unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
			/*
			 * Mark descriptors referenced from sockets queued
			 * on the accept queue as well.
			 */
			if (so->so_options & SO_ACCEPTCONN) {
				TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
					unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
				}
				TAILQ_FOREACH(so1, &so->so_q, so_qe) {
					unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
				}
			}
			sounlock(so);
		}
	} while (unp_defer);

	/*
	 * Sweep pass.  Find unmarked descriptors, and free them.
	 *
	 * We grab an extra reference to each of the file table entries
	 * that are not otherwise accessible and then free the rights
	 * that are stored in messages on them.
	 *
	 * The bug in the original code is a little tricky, so I'll describe
	 * what's wrong with it here.
	 *
	 * It is incorrect to simply unp_discard each entry for f_msgcount
	 * times -- consider the case of sockets A and B that contain
	 * references to each other.  On a last close of some other socket,
	 * we trigger a gc since the number of outstanding rights (unp_rights)
	 * is non-zero.  If during the sweep phase the gc code unp_discards,
	 * we end up doing a (full) closef on the descriptor.  A closef on A
	 * results in the following chain.  Closef calls soo_close, which
	 * calls soclose.  Soclose calls first (through the switch
	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
	 * returns because the previous instance had set unp_gcing, and
	 * we return all the way back to soclose, which marks the socket
	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
	 * to free up the rights that are queued in messages on the socket A,
	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
	 * switch unp_dispose, which unp_scans with unp_discard.  This second
	 * instance of unp_discard just calls closef on B.
	 *
	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
	 * which results in another closef on A.  Unfortunately, A is already
	 * being closed, and the descriptor has already been marked with
	 * SS_NOFDREF, and soclose panics at this point.
	 *
	 * Here, we first take an extra reference to each inaccessible
	 * descriptor.  Then, if the inaccessible descriptor is a
	 * socket, we call sorflush in case it is a Unix domain
	 * socket.  After we destroy all the rights carried in
	 * messages, we do a last closef to get rid of our extra
	 * reference.  This is the last close, and the unp_detach etc
	 * will shut down the socket.
	 *
	 * 91/09/19, bsy@cs.cmu.edu
	 */
	if (nslots < nfiles) {
		mutex_exit(&filelist_lock);
		kmem_free(extra_ref, nslots * sizeof(file_t *));
		goto restart;
	}
	for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0;
	    fp = nextfp) {
		nextfp = LIST_NEXT(fp, f_list);
		mutex_enter(&fp->f_lock);
		if (fp->f_count != 0 &&
		    fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
			*fpp++ = fp;
			nunref++;
			fp->f_count++;
		}
		mutex_exit(&fp->f_lock);
	}
	mutex_exit(&filelist_lock);

	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
		fp = *fpp;
		if (fp->f_type == DTYPE_SOCKET) {
			so = fp->f_data;
			solock(so);
			sorflush(fp->f_data);
			sounlock(so);
		}
	}
	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
		closef(*fpp);
	}
	kmem_free(extra_ref, nslots * sizeof(file_t *));
	atomic_swap_uint(&unp_gcing, 0);
}

void
unp_dispose(struct mbuf *m)
{

	if (m)
		unp_scan(m, unp_discard, 1);
}

void
unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard)
{
	struct mbuf *m;
	file_t **rp;
	struct cmsghdr *cm;
	int i;
	int qfds;

	while (m0) {
		for (m = m0; m; m = m->m_next) {
			if (m->m_type == MT_CONTROL &&
			    m->m_len >= sizeof(*cm)) {
				cm = mtod(m, struct cmsghdr *);
				if (cm->cmsg_level != SOL_SOCKET ||
				    cm->cmsg_type != SCM_RIGHTS)
					continue;
				qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
				    / sizeof(file_t *);
				rp = (file_t **)CMSG_DATA(cm);
				for (i = 0; i < qfds; i++) {
					file_t *fp = *rp;
					if (discard)
						*rp = 0;
					(*op)(fp);
					rp++;
				}
				break;		/* XXX, but saves time */
			}
		}
		m0 = m0->m_nextpkt;
	}
}

void
unp_mark(file_t *fp)
{

	if (fp == NULL)
		return;

	/* If we're already deferred, don't screw up the defer count */
	mutex_enter(&fp->f_lock);
	if (fp->f_flag & (FMARK | FDEFER)) {
		mutex_exit(&fp->f_lock);
		return;
	}

	/*
	 * Minimize the number of deferrals...  Sockets are the only
	 * type of descriptor which can hold references to another
	 * descriptor, so just mark other descriptors, and defer
	 * unmarked sockets for the next pass.
	 */
	if (fp->f_type == DTYPE_SOCKET) {
		unp_defer++;
		KASSERT(fp->f_count != 0);
		atomic_or_uint(&fp->f_flag, FDEFER);
	} else {
		atomic_or_uint(&fp->f_flag, FMARK);
	}
	mutex_exit(&fp->f_lock);
	return;
}

void
unp_discard(file_t *fp)
{

	if (fp == NULL)
		return;

	mutex_enter(&fp->f_lock);
	KASSERT(fp->f_count > 0);
	fp->f_msgcount--;
	mutex_exit(&fp->f_lock);
	atomic_dec_uint(&unp_rights);
	(void)closef(fp);
}