uipc_usrreq.c revision 1.80 1 /* $NetBSD: uipc_usrreq.c,v 1.80 2005/02/26 21:34:56 perry Exp $ */
2
3 /*-
4 * Copyright (c) 1998, 2000, 2004 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the NetBSD
22 * Foundation, Inc. and its contributors.
23 * 4. Neither the name of The NetBSD Foundation nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 * POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*
41 * Copyright (c) 1982, 1986, 1989, 1991, 1993
42 * The Regents of the University of California. All rights reserved.
43 *
44 * Redistribution and use in source and binary forms, with or without
45 * modification, are permitted provided that the following conditions
46 * are met:
47 * 1. Redistributions of source code must retain the above copyright
48 * notice, this list of conditions and the following disclaimer.
49 * 2. Redistributions in binary form must reproduce the above copyright
50 * notice, this list of conditions and the following disclaimer in the
51 * documentation and/or other materials provided with the distribution.
52 * 3. Neither the name of the University nor the names of its contributors
53 * may be used to endorse or promote products derived from this software
54 * without specific prior written permission.
55 *
56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
66 * SUCH DAMAGE.
67 *
68 * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
69 */
70
71 /*
72 * Copyright (c) 1997 Christopher G. Demetriou. All rights reserved.
73 *
74 * Redistribution and use in source and binary forms, with or without
75 * modification, are permitted provided that the following conditions
76 * are met:
77 * 1. Redistributions of source code must retain the above copyright
78 * notice, this list of conditions and the following disclaimer.
79 * 2. Redistributions in binary form must reproduce the above copyright
80 * notice, this list of conditions and the following disclaimer in the
81 * documentation and/or other materials provided with the distribution.
82 * 3. All advertising materials mentioning features or use of this software
83 * must display the following acknowledgement:
84 * This product includes software developed by the University of
85 * California, Berkeley and its contributors.
86 * 4. Neither the name of the University nor the names of its contributors
87 * may be used to endorse or promote products derived from this software
88 * without specific prior written permission.
89 *
90 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
91 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
92 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
93 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
94 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
95 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
96 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
97 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
98 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
99 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
100 * SUCH DAMAGE.
101 *
102 * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
103 */
104
105 #include <sys/cdefs.h>
106 __KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.80 2005/02/26 21:34:56 perry Exp $");
107
108 #include <sys/param.h>
109 #include <sys/systm.h>
110 #include <sys/proc.h>
111 #include <sys/filedesc.h>
112 #include <sys/domain.h>
113 #include <sys/protosw.h>
114 #include <sys/socket.h>
115 #include <sys/socketvar.h>
116 #include <sys/unpcb.h>
117 #include <sys/un.h>
118 #include <sys/namei.h>
119 #include <sys/vnode.h>
120 #include <sys/file.h>
121 #include <sys/stat.h>
122 #include <sys/mbuf.h>
123
124 /*
125 * Unix communications domain.
126 *
127 * TODO:
128 * SEQPACKET, RDM
129 * rethink name space problems
130 * need a proper out-of-band
131 */
132 const struct sockaddr_un sun_noname = { sizeof(sun_noname), AF_LOCAL };
133 ino_t unp_ino; /* prototype for fake inode numbers */
134
135 struct mbuf *unp_addsockcred(struct proc *, struct mbuf *);
136
137 int
138 unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp,
139 struct proc *p)
140 {
141 struct socket *so2;
142 const struct sockaddr_un *sun;
143
144 so2 = unp->unp_conn->unp_socket;
145 if (unp->unp_addr)
146 sun = unp->unp_addr;
147 else
148 sun = &sun_noname;
149 if (unp->unp_conn->unp_flags & UNP_WANTCRED)
150 control = unp_addsockcred(p, control);
151 if (sbappendaddr(&so2->so_rcv, (struct sockaddr *)sun, m,
152 control) == 0) {
153 m_freem(control);
154 m_freem(m);
155 so2->so_rcv.sb_overflowed++;
156 return (ENOBUFS);
157 } else {
158 sorwakeup(so2);
159 return (0);
160 }
161 }
162
163 void
164 unp_setsockaddr(struct unpcb *unp, struct mbuf *nam)
165 {
166 const struct sockaddr_un *sun;
167
168 if (unp->unp_addr)
169 sun = unp->unp_addr;
170 else
171 sun = &sun_noname;
172 nam->m_len = sun->sun_len;
173 if (nam->m_len > MLEN)
174 MEXTMALLOC(nam, nam->m_len, M_WAITOK);
175 memcpy(mtod(nam, caddr_t), sun, (size_t)nam->m_len);
176 }
177
178 void
179 unp_setpeeraddr(struct unpcb *unp, struct mbuf *nam)
180 {
181 const struct sockaddr_un *sun;
182
183 if (unp->unp_conn && unp->unp_conn->unp_addr)
184 sun = unp->unp_conn->unp_addr;
185 else
186 sun = &sun_noname;
187 nam->m_len = sun->sun_len;
188 if (nam->m_len > MLEN)
189 MEXTMALLOC(nam, nam->m_len, M_WAITOK);
190 memcpy(mtod(nam, caddr_t), sun, (size_t)nam->m_len);
191 }
192
193 /*ARGSUSED*/
194 int
195 uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
196 struct mbuf *control, struct proc *p)
197 {
198 struct unpcb *unp = sotounpcb(so);
199 struct socket *so2;
200 u_int newhiwat;
201 int error = 0;
202
203 if (req == PRU_CONTROL)
204 return (EOPNOTSUPP);
205
206 #ifdef DIAGNOSTIC
207 if (req != PRU_SEND && req != PRU_SENDOOB && control)
208 panic("uipc_usrreq: unexpected control mbuf");
209 #endif
210 if (unp == 0 && req != PRU_ATTACH) {
211 error = EINVAL;
212 goto release;
213 }
214
215 switch (req) {
216
217 case PRU_ATTACH:
218 if (unp != 0) {
219 error = EISCONN;
220 break;
221 }
222 error = unp_attach(so);
223 break;
224
225 case PRU_DETACH:
226 unp_detach(unp);
227 break;
228
229 case PRU_BIND:
230 error = unp_bind(unp, nam, p);
231 break;
232
233 case PRU_LISTEN:
234 if (unp->unp_vnode == 0)
235 error = EINVAL;
236 break;
237
238 case PRU_CONNECT:
239 error = unp_connect(so, nam, p);
240 break;
241
242 case PRU_CONNECT2:
243 error = unp_connect2(so, (struct socket *)nam, PRU_CONNECT2);
244 break;
245
246 case PRU_DISCONNECT:
247 unp_disconnect(unp);
248 break;
249
250 case PRU_ACCEPT:
251 unp_setpeeraddr(unp, nam);
252 /*
253 * Mark the initiating STREAM socket as connected *ONLY*
254 * after it's been accepted. This prevents a client from
255 * overrunning a server and receiving ECONNREFUSED.
256 */
257 if (unp->unp_conn != NULL &&
258 (unp->unp_conn->unp_socket->so_state & SS_ISCONNECTING))
259 soisconnected(unp->unp_conn->unp_socket);
260 break;
261
262 case PRU_SHUTDOWN:
263 socantsendmore(so);
264 unp_shutdown(unp);
265 break;
266
267 case PRU_RCVD:
268 switch (so->so_type) {
269
270 case SOCK_DGRAM:
271 panic("uipc 1");
272 /*NOTREACHED*/
273
274 case SOCK_STREAM:
275 #define rcv (&so->so_rcv)
276 #define snd (&so2->so_snd)
277 if (unp->unp_conn == 0)
278 break;
279 so2 = unp->unp_conn->unp_socket;
280 /*
281 * Adjust backpressure on sender
282 * and wakeup any waiting to write.
283 */
284 snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
285 unp->unp_mbcnt = rcv->sb_mbcnt;
286 newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc;
287 (void)chgsbsize(so2->so_uid,
288 &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
289 unp->unp_cc = rcv->sb_cc;
290 sowwakeup(so2);
291 #undef snd
292 #undef rcv
293 break;
294
295 default:
296 panic("uipc 2");
297 }
298 break;
299
300 case PRU_SEND:
301 /*
302 * Note: unp_internalize() rejects any control message
303 * other than SCM_RIGHTS, and only allows one. This
304 * has the side-effect of preventing a caller from
305 * forging SCM_CREDS.
306 */
307 if (control && (error = unp_internalize(control, p)))
308 break;
309 switch (so->so_type) {
310
311 case SOCK_DGRAM: {
312 if (nam) {
313 if ((so->so_state & SS_ISCONNECTED) != 0) {
314 error = EISCONN;
315 goto die;
316 }
317 error = unp_connect(so, nam, p);
318 if (error) {
319 die:
320 m_freem(control);
321 m_freem(m);
322 break;
323 }
324 } else {
325 if ((so->so_state & SS_ISCONNECTED) == 0) {
326 error = ENOTCONN;
327 goto die;
328 }
329 }
330 error = unp_output(m, control, unp, p);
331 if (nam)
332 unp_disconnect(unp);
333 break;
334 }
335
336 case SOCK_STREAM:
337 #define rcv (&so2->so_rcv)
338 #define snd (&so->so_snd)
339 if (unp->unp_conn == 0)
340 panic("uipc 3");
341 so2 = unp->unp_conn->unp_socket;
342 if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
343 /*
344 * Credentials are passed only once on
345 * SOCK_STREAM.
346 */
347 unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
348 control = unp_addsockcred(p, control);
349 }
350 /*
351 * Send to paired receive port, and then reduce
352 * send buffer hiwater marks to maintain backpressure.
353 * Wake up readers.
354 */
355 if (control) {
356 if (sbappendcontrol(rcv, m, control) == 0)
357 m_freem(control);
358 } else
359 sbappend(rcv, m);
360 snd->sb_mbmax -=
361 rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
362 unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
363 newhiwat = snd->sb_hiwat -
364 (rcv->sb_cc - unp->unp_conn->unp_cc);
365 (void)chgsbsize(so->so_uid,
366 &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
367 unp->unp_conn->unp_cc = rcv->sb_cc;
368 sorwakeup(so2);
369 #undef snd
370 #undef rcv
371 break;
372
373 default:
374 panic("uipc 4");
375 }
376 break;
377
378 case PRU_ABORT:
379 unp_drop(unp, ECONNABORTED);
380
381 #ifdef DIAGNOSTIC
382 if (so->so_pcb == 0)
383 panic("uipc 5: drop killed pcb");
384 #endif
385 unp_detach(unp);
386 break;
387
388 case PRU_SENSE:
389 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
390 if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
391 so2 = unp->unp_conn->unp_socket;
392 ((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc;
393 }
394 ((struct stat *) m)->st_dev = NODEV;
395 if (unp->unp_ino == 0)
396 unp->unp_ino = unp_ino++;
397 ((struct stat *) m)->st_atimespec =
398 ((struct stat *) m)->st_mtimespec =
399 ((struct stat *) m)->st_ctimespec = unp->unp_ctime;
400 ((struct stat *) m)->st_ino = unp->unp_ino;
401 return (0);
402
403 case PRU_RCVOOB:
404 error = EOPNOTSUPP;
405 break;
406
407 case PRU_SENDOOB:
408 m_freem(control);
409 m_freem(m);
410 error = EOPNOTSUPP;
411 break;
412
413 case PRU_SOCKADDR:
414 unp_setsockaddr(unp, nam);
415 break;
416
417 case PRU_PEERADDR:
418 unp_setpeeraddr(unp, nam);
419 break;
420
421 default:
422 panic("piusrreq");
423 }
424
425 release:
426 return (error);
427 }
428
429 /*
430 * Unix domain socket option processing.
431 */
432 int
433 uipc_ctloutput(int op, struct socket *so, int level, int optname,
434 struct mbuf **mp)
435 {
436 struct unpcb *unp = sotounpcb(so);
437 struct mbuf *m = *mp;
438 int optval = 0, error = 0;
439
440 if (level != 0) {
441 error = EINVAL;
442 if (op == PRCO_SETOPT && m)
443 (void) m_free(m);
444 } else switch (op) {
445
446 case PRCO_SETOPT:
447 switch (optname) {
448 case LOCAL_CREDS:
449 case LOCAL_CONNWAIT:
450 if (m == NULL || m->m_len != sizeof(int))
451 error = EINVAL;
452 else {
453 optval = *mtod(m, int *);
454 switch (optname) {
455 #define OPTSET(bit) \
456 if (optval) \
457 unp->unp_flags |= (bit); \
458 else \
459 unp->unp_flags &= ~(bit);
460
461 case LOCAL_CREDS:
462 OPTSET(UNP_WANTCRED);
463 break;
464 case LOCAL_CONNWAIT:
465 OPTSET(UNP_CONNWAIT);
466 break;
467 }
468 }
469 break;
470 #undef OPTSET
471
472 default:
473 error = ENOPROTOOPT;
474 break;
475 }
476 if (m)
477 (void) m_free(m);
478 break;
479
480 case PRCO_GETOPT:
481 switch (optname) {
482 case LOCAL_CREDS:
483 *mp = m = m_get(M_WAIT, MT_SOOPTS);
484 m->m_len = sizeof(int);
485 switch (optname) {
486
487 #define OPTBIT(bit) (unp->unp_flags & (bit) ? 1 : 0)
488
489 case LOCAL_CREDS:
490 optval = OPTBIT(UNP_WANTCRED);
491 break;
492 }
493 *mtod(m, int *) = optval;
494 break;
495 #undef OPTBIT
496
497 default:
498 error = ENOPROTOOPT;
499 break;
500 }
501 break;
502 }
503 return (error);
504 }
505
506 /*
507 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
508 * for stream sockets, although the total for sender and receiver is
509 * actually only PIPSIZ.
510 * Datagram sockets really use the sendspace as the maximum datagram size,
511 * and don't really want to reserve the sendspace. Their recvspace should
512 * be large enough for at least one max-size datagram plus address.
513 */
514 #define PIPSIZ 4096
515 u_long unpst_sendspace = PIPSIZ;
516 u_long unpst_recvspace = PIPSIZ;
517 u_long unpdg_sendspace = 2*1024; /* really max datagram size */
518 u_long unpdg_recvspace = 4*1024;
519
520 int unp_rights; /* file descriptors in flight */
521
522 int
523 unp_attach(struct socket *so)
524 {
525 struct unpcb *unp;
526 struct timeval tv;
527 int error;
528
529 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
530 switch (so->so_type) {
531
532 case SOCK_STREAM:
533 error = soreserve(so, unpst_sendspace, unpst_recvspace);
534 break;
535
536 case SOCK_DGRAM:
537 error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
538 break;
539
540 default:
541 panic("unp_attach");
542 }
543 if (error)
544 return (error);
545 }
546 unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT);
547 if (unp == NULL)
548 return (ENOBUFS);
549 memset((caddr_t)unp, 0, sizeof(*unp));
550 unp->unp_socket = so;
551 so->so_pcb = unp;
552 microtime(&tv);
553 TIMEVAL_TO_TIMESPEC(&tv, &unp->unp_ctime);
554 return (0);
555 }
556
557 void
558 unp_detach(struct unpcb *unp)
559 {
560
561 if (unp->unp_vnode) {
562 unp->unp_vnode->v_socket = 0;
563 vrele(unp->unp_vnode);
564 unp->unp_vnode = 0;
565 }
566 if (unp->unp_conn)
567 unp_disconnect(unp);
568 while (unp->unp_refs)
569 unp_drop(unp->unp_refs, ECONNRESET);
570 soisdisconnected(unp->unp_socket);
571 unp->unp_socket->so_pcb = 0;
572 if (unp->unp_addr)
573 free(unp->unp_addr, M_SONAME);
574 if (unp_rights) {
575 /*
576 * Normally the receive buffer is flushed later,
577 * in sofree, but if our receive buffer holds references
578 * to descriptors that are now garbage, we will dispose
579 * of those descriptor references after the garbage collector
580 * gets them (resulting in a "panic: closef: count < 0").
581 */
582 sorflush(unp->unp_socket);
583 free(unp, M_PCB);
584 unp_gc();
585 } else
586 free(unp, M_PCB);
587 }
588
589 int
590 unp_bind(struct unpcb *unp, struct mbuf *nam, struct proc *p)
591 {
592 struct sockaddr_un *sun;
593 struct vnode *vp;
594 struct mount *mp;
595 struct vattr vattr;
596 size_t addrlen;
597 int error;
598 struct nameidata nd;
599
600 if (unp->unp_vnode != 0)
601 return (EINVAL);
602
603 /*
604 * Allocate the new sockaddr. We have to allocate one
605 * extra byte so that we can ensure that the pathname
606 * is nul-terminated.
607 */
608 addrlen = nam->m_len + 1;
609 sun = malloc(addrlen, M_SONAME, M_WAITOK);
610 m_copydata(nam, 0, nam->m_len, (caddr_t)sun);
611 *(((char *)sun) + nam->m_len) = '\0';
612
613 restart:
614 NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE,
615 sun->sun_path, p);
616
617 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
618 if ((error = namei(&nd)) != 0)
619 goto bad;
620 vp = nd.ni_vp;
621 if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
622 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
623 if (nd.ni_dvp == vp)
624 vrele(nd.ni_dvp);
625 else
626 vput(nd.ni_dvp);
627 vrele(vp);
628 if (vp != NULL) {
629 error = EADDRINUSE;
630 goto bad;
631 }
632 error = vn_start_write(NULL, &mp,
633 V_WAIT | V_SLEEPONLY | V_PCATCH);
634 if (error)
635 goto bad;
636 goto restart;
637 }
638 VATTR_NULL(&vattr);
639 vattr.va_type = VSOCK;
640 vattr.va_mode = ACCESSPERMS;
641 VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
642 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
643 vn_finished_write(mp, 0);
644 if (error)
645 goto bad;
646 vp = nd.ni_vp;
647 vp->v_socket = unp->unp_socket;
648 unp->unp_vnode = vp;
649 unp->unp_addrlen = addrlen;
650 unp->unp_addr = sun;
651 VOP_UNLOCK(vp, 0);
652 return (0);
653
654 bad:
655 free(sun, M_SONAME);
656 return (error);
657 }
658
/*
 * Connect `so' to the socket bound at the pathname carried in `nam'.
 *
 * The path is looked up on behalf of `p', must name a VSOCK node that
 * `p' may write to, and must have a socket of the same type attached.
 * For connection-oriented protocols the target must be accepting
 * connections; a new socket is spawned from it, inherits a copy of its
 * address and its flags, and becomes the actual peer.
 *
 * Returns 0 or an errno value (ENOTSOCK, ECONNREFUSED, EPROTOTYPE, or
 * whatever namei(), VOP_ACCESS() or unp_connect2() report).
 */
int
unp_connect(struct socket *so, struct mbuf *nam, struct proc *p)
{
	struct nameidata nd;
	struct sockaddr_un *path_sa;
	struct vnode *vp;
	struct socket *peer, *child;
	struct unpcb *lunp, *cunp;
	size_t salen;
	int error;

	/*
	 * Allocate a temporary sockaddr.  We have to allocate one extra
	 * byte so that we can ensure that the pathname is nul-terminated.
	 * When we establish the connection, we copy the other PCB's
	 * sockaddr to our own.
	 */
	salen = nam->m_len + 1;
	path_sa = malloc(salen, M_SONAME, M_WAITOK);
	m_copydata(nam, 0, nam->m_len, (caddr_t)path_sa);
	((char *)path_sa)[nam->m_len] = '\0';

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
	    path_sa->sun_path, p);

	error = namei(&nd);
	if (error != 0)
		goto bad2;

	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto bad;
	}
	error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p);
	if (error != 0)
		goto bad;

	peer = vp->v_socket;
	if (peer == NULL) {
		error = ECONNREFUSED;
		goto bad;
	}
	if (so->so_type != peer->so_type) {
		error = EPROTOTYPE;
		goto bad;
	}

	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
		if ((peer->so_options & SO_ACCEPTCONN) == 0) {
			error = ECONNREFUSED;
			goto bad;
		}
		child = sonewconn(peer, 0);
		if (child == NULL) {
			error = ECONNREFUSED;
			goto bad;
		}
		lunp = sotounpcb(peer);
		cunp = sotounpcb(child);
		/* The spawned socket inherits the listener's name. */
		if (lunp->unp_addr != NULL) {
			cunp->unp_addr = malloc(lunp->unp_addrlen,
			    M_SONAME, M_WAITOK);
			memcpy(cunp->unp_addr, lunp->unp_addr,
			    lunp->unp_addrlen);
			cunp->unp_addrlen = lunp->unp_addrlen;
		}
		cunp->unp_flags = lunp->unp_flags;
		peer = child;
	}
	error = unp_connect2(so, peer, PRU_CONNECT);

bad:
	vput(vp);
bad2:
	free(path_sa, M_SONAME);
	return (error);
}
726
727 int
728 unp_connect2(struct socket *so, struct socket *so2, int req)
729 {
730 struct unpcb *unp = sotounpcb(so);
731 struct unpcb *unp2;
732
733 if (so2->so_type != so->so_type)
734 return (EPROTOTYPE);
735 unp2 = sotounpcb(so2);
736 unp->unp_conn = unp2;
737 switch (so->so_type) {
738
739 case SOCK_DGRAM:
740 unp->unp_nextref = unp2->unp_refs;
741 unp2->unp_refs = unp;
742 soisconnected(so);
743 break;
744
745 case SOCK_STREAM:
746 unp2->unp_conn = unp;
747 if (req == PRU_CONNECT &&
748 ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
749 soisconnecting(so);
750 else
751 soisconnected(so);
752 soisconnected(so2);
753 break;
754
755 default:
756 panic("unp_connect2");
757 }
758 return (0);
759 }
760
761 void
762 unp_disconnect(struct unpcb *unp)
763 {
764 struct unpcb *unp2 = unp->unp_conn;
765
766 if (unp2 == 0)
767 return;
768 unp->unp_conn = 0;
769 switch (unp->unp_socket->so_type) {
770
771 case SOCK_DGRAM:
772 if (unp2->unp_refs == unp)
773 unp2->unp_refs = unp->unp_nextref;
774 else {
775 unp2 = unp2->unp_refs;
776 for (;;) {
777 if (unp2 == 0)
778 panic("unp_disconnect");
779 if (unp2->unp_nextref == unp)
780 break;
781 unp2 = unp2->unp_nextref;
782 }
783 unp2->unp_nextref = unp->unp_nextref;
784 }
785 unp->unp_nextref = 0;
786 unp->unp_socket->so_state &= ~SS_ISCONNECTED;
787 break;
788
789 case SOCK_STREAM:
790 soisdisconnected(unp->unp_socket);
791 unp2->unp_conn = 0;
792 soisdisconnected(unp2->unp_socket);
793 break;
794 }
795 }
796
#ifdef notdef
/*
 * Compiled out unless `notdef' is defined.  Aborting a PCB amounts to
 * detaching it.  The return type is spelled out because implicit int
 * is not valid in C99 and later.
 */
void
unp_abort(struct unpcb *unp)
{

	unp_detach(unp);
}
#endif
803
804 void
805 unp_shutdown(struct unpcb *unp)
806 {
807 struct socket *so;
808
809 if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
810 (so = unp->unp_conn->unp_socket))
811 socantrcvmore(so);
812 }
813
814 void
815 unp_drop(struct unpcb *unp, int errno)
816 {
817 struct socket *so = unp->unp_socket;
818
819 so->so_error = errno;
820 unp_disconnect(unp);
821 if (so->so_head) {
822 so->so_pcb = 0;
823 sofree(so);
824 if (unp->unp_addr)
825 free(unp->unp_addr, M_SONAME);
826 free(unp, M_PCB);
827 }
828 }
829
#ifdef notdef
/*
 * Compiled out unless `notdef' is defined.  Empty placeholder; the
 * return type is spelled out because implicit int is not valid in C99
 * and later.
 */
void
unp_drain(void)
{

}
#endif
836
837 int
838 unp_externalize(struct mbuf *rights, struct proc *p)
839 {
840 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
841 int i, *fdp;
842 struct file **rp;
843 struct file *fp;
844 int nfds, error = 0;
845
846 nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
847 sizeof(struct file *);
848 rp = (struct file **)CMSG_DATA(cm);
849
850 fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK);
851
852 /* Make sure the recipient should be able to see the descriptors.. */
853 if (p->p_cwdi->cwdi_rdir != NULL) {
854 rp = (struct file **)CMSG_DATA(cm);
855 for (i = 0; i < nfds; i++) {
856 fp = *rp++;
857 /*
858 * If we are in a chroot'ed directory, and
859 * someone wants to pass us a directory, make
860 * sure it's inside the subtree we're allowed
861 * to access.
862 */
863 if (fp->f_type == DTYPE_VNODE) {
864 struct vnode *vp = (struct vnode *)fp->f_data;
865 if ((vp->v_type == VDIR) &&
866 !vn_isunder(vp, p->p_cwdi->cwdi_rdir, p)) {
867 error = EPERM;
868 break;
869 }
870 }
871 }
872 }
873
874 restart:
875 rp = (struct file **)CMSG_DATA(cm);
876 if (error != 0) {
877 for (i = 0; i < nfds; i++) {
878 fp = *rp;
879 /*
880 * zero the pointer before calling unp_discard,
881 * since it may end up in unp_gc()..
882 */
883 *rp++ = 0;
884 unp_discard(fp);
885 }
886 goto out;
887 }
888
889 /*
890 * First loop -- allocate file descriptor table slots for the
891 * new descriptors.
892 */
893 for (i = 0; i < nfds; i++) {
894 fp = *rp++;
895 if ((error = fdalloc(p, 0, &fdp[i])) != 0) {
896 /*
897 * Back out what we've done so far.
898 */
899 for (--i; i >= 0; i--)
900 fdremove(p->p_fd, fdp[i]);
901
902 if (error == ENOSPC) {
903 fdexpand(p);
904 error = 0;
905 } else {
906 /*
907 * This is the error that has historically
908 * been returned, and some callers may
909 * expect it.
910 */
911 error = EMSGSIZE;
912 }
913 goto restart;
914 }
915
916 /*
917 * Make the slot reference the descriptor so that
918 * fdalloc() works properly.. We finalize it all
919 * in the loop below.
920 */
921 p->p_fd->fd_ofiles[fdp[i]] = fp;
922 }
923
924 /*
925 * Now that adding them has succeeded, update all of the
926 * descriptor passing state.
927 */
928 rp = (struct file **)CMSG_DATA(cm);
929 for (i = 0; i < nfds; i++) {
930 fp = *rp++;
931 fp->f_msgcount--;
932 unp_rights--;
933 }
934
935 /*
936 * Copy temporary array to message and adjust length, in case of
937 * transition from large struct file pointers to ints.
938 */
939 memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int));
940 cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
941 rights->m_len = CMSG_SPACE(nfds * sizeof(int));
942 out:
943 free(fdp, M_TEMP);
944 return (error);
945 }
946
947 int
948 unp_internalize(struct mbuf *control, struct proc *p)
949 {
950 struct filedesc *fdescp = p->p_fd;
951 struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
952 struct file **rp, **files;
953 struct file *fp;
954 int i, fd, *fdp;
955 int nfds;
956 u_int neededspace;
957
958 /* Sanity check the control message header */
959 if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
960 cm->cmsg_len != control->m_len)
961 return (EINVAL);
962
963 /* Verify that the file descriptors are valid */
964 nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
965 fdp = (int *)CMSG_DATA(cm);
966 for (i = 0; i < nfds; i++) {
967 fd = *fdp++;
968 if ((fp = fd_getfile(fdescp, fd)) == NULL)
969 return (EBADF);
970 simple_unlock(&fp->f_slock);
971 }
972
973 /* Make sure we have room for the struct file pointers */
974 neededspace = CMSG_SPACE(nfds * sizeof(struct file *)) -
975 control->m_len;
976 if (neededspace > M_TRAILINGSPACE(control)) {
977
978 /* allocate new space and copy header into it */
979 newcm = malloc(
980 CMSG_SPACE(nfds * sizeof(struct file *)),
981 M_MBUF, M_WAITOK);
982 if (newcm == NULL)
983 return (E2BIG);
984 memcpy(newcm, cm, sizeof(struct cmsghdr));
985 files = (struct file **)CMSG_DATA(newcm);
986 } else {
987 /* we can convert in-place */
988 newcm = NULL;
989 files = (struct file **)CMSG_DATA(cm);
990 }
991
992 /*
993 * Transform the file descriptors into struct file pointers, in
994 * reverse order so that if pointers are bigger than ints, the
995 * int won't get until we're done.
996 */
997 fdp = (int *)CMSG_DATA(cm) + nfds - 1;
998 rp = files + nfds - 1;
999 for (i = 0; i < nfds; i++) {
1000 fp = fdescp->fd_ofiles[*fdp--];
1001 simple_lock(&fp->f_slock);
1002 #ifdef DIAGNOSTIC
1003 if (fp->f_iflags & FIF_WANTCLOSE)
1004 panic("unp_internalize: file already closed");
1005 #endif
1006 *rp-- = fp;
1007 fp->f_count++;
1008 fp->f_msgcount++;
1009 simple_unlock(&fp->f_slock);
1010 unp_rights++;
1011 }
1012
1013 if (newcm) {
1014 if (control->m_flags & M_EXT)
1015 MEXTREMOVE(control);
1016 MEXTADD(control, newcm,
1017 CMSG_SPACE(nfds * sizeof(struct file *)),
1018 M_MBUF, NULL, NULL);
1019 cm = newcm;
1020 }
1021
1022 /* adjust message & mbuf to note amount of space actually used. */
1023 cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct file *));
1024 control->m_len = CMSG_SPACE(nfds * sizeof(struct file *));
1025
1026 return (0);
1027 }
1028
1029 struct mbuf *
1030 unp_addsockcred(struct proc *p, struct mbuf *control)
1031 {
1032 struct cmsghdr *cmp;
1033 struct sockcred *sc;
1034 struct mbuf *m, *n;
1035 int len, space, i;
1036
1037 len = CMSG_LEN(SOCKCREDSIZE(p->p_ucred->cr_ngroups));
1038 space = CMSG_SPACE(SOCKCREDSIZE(p->p_ucred->cr_ngroups));
1039
1040 m = m_get(M_WAIT, MT_CONTROL);
1041 if (space > MLEN) {
1042 if (space > MCLBYTES)
1043 MEXTMALLOC(m, space, M_WAITOK);
1044 else
1045 m_clget(m, M_WAIT);
1046 if ((m->m_flags & M_EXT) == 0) {
1047 m_free(m);
1048 return (control);
1049 }
1050 }
1051
1052 m->m_len = space;
1053 m->m_next = NULL;
1054 cmp = mtod(m, struct cmsghdr *);
1055 sc = (struct sockcred *)CMSG_DATA(cmp);
1056 cmp->cmsg_len = len;
1057 cmp->cmsg_level = SOL_SOCKET;
1058 cmp->cmsg_type = SCM_CREDS;
1059 sc->sc_uid = p->p_cred->p_ruid;
1060 sc->sc_euid = p->p_ucred->cr_uid;
1061 sc->sc_gid = p->p_cred->p_rgid;
1062 sc->sc_egid = p->p_ucred->cr_gid;
1063 sc->sc_ngroups = p->p_ucred->cr_ngroups;
1064 for (i = 0; i < sc->sc_ngroups; i++)
1065 sc->sc_groups[i] = p->p_ucred->cr_groups[i];
1066
1067 /*
1068 * If a control message already exists, append us to the end.
1069 */
1070 if (control != NULL) {
1071 for (n = control; n->m_next != NULL; n = n->m_next)
1072 ;
1073 n->m_next = m;
1074 } else
1075 control = m;
1076
1077 return (control);
1078 }
1079
/*
 * State for unp_gc(): the count of descriptors flagged for another scan,
 * and the flag that keeps the collector from being re-entered.
 */
int	unp_defer;
int	unp_gcing;

extern struct domain unixdomain;
1082
1083 /*
1084 * Comment added long after the fact explaining what's going on here.
1085 * Do a mark-sweep GC of file descriptors on the system, to free up
1086 * any which are caught in flight to an about-to-be-closed socket.
1087 *
1088 * Traditional mark-sweep gc's start at the "root", and mark
1089 * everything reachable from the root (which, in our case would be the
1090 * process table). The mark bits are cleared during the sweep.
1091 *
1092 * XXX For some inexplicable reason (perhaps because the file
1093 * descriptor tables used to live in the u area which could be swapped
1094 * out and thus hard to reach), we do multiple scans over the set of
1095 * descriptors, using *two* mark bits per object (DEFER and MARK).
1096 * Whenever we find a descriptor which references other descriptors,
1097 * the ones it references are marked with both bits, and we iterate
1098 * over the whole file table until there are no more DEFER bits set.
1099 * We also make an extra pass *before* the GC to clear the mark bits,
1100 * which could have been cleared at almost no cost during the previous
1101 * sweep.
1102 *
1103 * XXX MP: this needs to run with locks such that no other thread of
1104 * control can create or destroy references to file descriptors. it
1105 * may be necessary to defer the GC until later (when the locking
1106 * situation is more hospitable); it may be necessary to push this
1107 * into a separate thread.
1108 */
1109 void
1110 unp_gc(void)
1111 {
1112 struct file *fp, *nextfp;
1113 struct socket *so, *so1;
1114 struct file **extra_ref, **fpp;
1115 int nunref, i;
1116
1117 if (unp_gcing)
1118 return;
1119 unp_gcing = 1;
1120 unp_defer = 0;
1121
1122 /* Clear mark bits */
1123 LIST_FOREACH(fp, &filehead, f_list)
1124 fp->f_flag &= ~(FMARK|FDEFER);
1125
1126 /*
1127 * Iterate over the set of descriptors, marking ones believed
1128 * (based on refcount) to be referenced from a process, and
1129 * marking for rescan descriptors which are queued on a socket.
1130 */
1131 do {
1132 LIST_FOREACH(fp, &filehead, f_list) {
1133 if (fp->f_flag & FDEFER) {
1134 fp->f_flag &= ~FDEFER;
1135 unp_defer--;
1136 #ifdef DIAGNOSTIC
1137 if (fp->f_count == 0)
1138 panic("unp_gc: deferred unreferenced socket");
1139 #endif
1140 } else {
1141 if (fp->f_count == 0)
1142 continue;
1143 if (fp->f_flag & FMARK)
1144 continue;
1145 if (fp->f_count == fp->f_msgcount)
1146 continue;
1147 }
1148 fp->f_flag |= FMARK;
1149
1150 if (fp->f_type != DTYPE_SOCKET ||
1151 (so = (struct socket *)fp->f_data) == 0)
1152 continue;
1153 if (so->so_proto->pr_domain != &unixdomain ||
1154 (so->so_proto->pr_flags&PR_RIGHTS) == 0)
1155 continue;
1156 #ifdef notdef
1157 if (so->so_rcv.sb_flags & SB_LOCK) {
1158 /*
1159 * This is problematical; it's not clear
1160 * we need to wait for the sockbuf to be
1161 * unlocked (on a uniprocessor, at least),
1162 * and it's also not clear what to do
1163 * if sbwait returns an error due to receipt
1164 * of a signal. If sbwait does return
1165 * an error, we'll go into an infinite
1166 * loop. Delete all of this for now.
1167 */
1168 (void) sbwait(&so->so_rcv);
1169 goto restart;
1170 }
1171 #endif
1172 unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
1173 /*
1174 * mark descriptors referenced from sockets queued on the accept queue as well.
1175 */
1176 if (so->so_options & SO_ACCEPTCONN) {
1177 TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
1178 unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
1179 }
1180 TAILQ_FOREACH(so1, &so->so_q, so_qe) {
1181 unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
1182 }
1183 }
1184
1185 }
1186 } while (unp_defer);
1187 /*
1188 * Sweep pass. Find unmarked descriptors, and free them.
1189 *
1190 * We grab an extra reference to each of the file table entries
1191 * that are not otherwise accessible and then free the rights
1192 * that are stored in messages on them.
1193 *
1194 * The bug in the original code is a little tricky, so I'll describe
1195 * what's wrong with it here.
1196 *
1197 * It is incorrect to simply unp_discard each entry for f_msgcount
1198 * times -- consider the case of sockets A and B that contain
1199 * references to each other. On a last close of some other socket,
1200 * we trigger a gc since the number of outstanding rights (unp_rights)
1201 * is non-zero. If during the sweep phase the gc code un_discards,
1202 * we end up doing a (full) closef on the descriptor. A closef on A
1203 * results in the following chain. Closef calls soo_close, which
1204 * calls soclose. Soclose calls first (through the switch
1205 * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply
1206 * returns because the previous instance had set unp_gcing, and
1207 * we return all the way back to soclose, which marks the socket
1208 * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush
1209 * to free up the rights that are queued in messages on the socket A,
1210 * i.e., the reference on B. The sorflush calls via the dom_dispose
1211 * switch unp_dispose, which unp_scans with unp_discard. This second
1212 * instance of unp_discard just calls closef on B.
1213 *
1214 * Well, a similar chain occurs on B, resulting in a sorflush on B,
1215 * which results in another closef on A. Unfortunately, A is already
1216 * being closed, and the descriptor has already been marked with
1217 * SS_NOFDREF, and soclose panics at this point.
1218 *
1219 * Here, we first take an extra reference to each inaccessible
1220 * descriptor. Then, if the inaccessible descriptor is a
1221 * socket, we call sorflush in case it is a Unix domain
1222 * socket. After we destroy all the rights carried in
1223 * messages, we do a last closef to get rid of our extra
1224 * reference. This is the last close, and the unp_detach etc
1225 * will shut down the socket.
1226 *
1227 * 91/09/19, bsy (at) cs.cmu.edu
1228 */
1229 extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK);
1230 for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0;
1231 fp = nextfp) {
1232 nextfp = LIST_NEXT(fp, f_list);
1233 simple_lock(&fp->f_slock);
1234 if (fp->f_count != 0 &&
1235 fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
1236 *fpp++ = fp;
1237 nunref++;
1238 fp->f_count++;
1239 }
1240 simple_unlock(&fp->f_slock);
1241 }
1242 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
1243 fp = *fpp;
1244 simple_lock(&fp->f_slock);
1245 FILE_USE(fp);
1246 if (fp->f_type == DTYPE_SOCKET)
1247 sorflush((struct socket *)fp->f_data);
1248 FILE_UNUSE(fp, NULL);
1249 }
1250 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
1251 fp = *fpp;
1252 simple_lock(&fp->f_slock);
1253 FILE_USE(fp);
1254 (void) closef(fp, (struct proc *)0);
1255 }
1256 free((caddr_t)extra_ref, M_FILE);
1257 unp_gcing = 0;
1258 }
1259
1260 void
1261 unp_dispose(struct mbuf *m)
1262 {
1263
1264 if (m)
1265 unp_scan(m, unp_discard, 1);
1266 }
1267
1268 void
1269 unp_scan(struct mbuf *m0, void (*op)(struct file *), int discard)
1270 {
1271 struct mbuf *m;
1272 struct file **rp;
1273 struct cmsghdr *cm;
1274 int i;
1275 int qfds;
1276
1277 while (m0) {
1278 for (m = m0; m; m = m->m_next) {
1279 if (m->m_type == MT_CONTROL &&
1280 m->m_len >= sizeof(*cm)) {
1281 cm = mtod(m, struct cmsghdr *);
1282 if (cm->cmsg_level != SOL_SOCKET ||
1283 cm->cmsg_type != SCM_RIGHTS)
1284 continue;
1285 qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
1286 / sizeof(struct file *);
1287 rp = (struct file **)CMSG_DATA(cm);
1288 for (i = 0; i < qfds; i++) {
1289 struct file *fp = *rp;
1290 if (discard)
1291 *rp = 0;
1292 (*op)(fp);
1293 rp++;
1294 }
1295 break; /* XXX, but saves time */
1296 }
1297 }
1298 m0 = m0->m_nextpkt;
1299 }
1300 }
1301
1302 void
1303 unp_mark(struct file *fp)
1304 {
1305 if (fp == NULL)
1306 return;
1307
1308 if (fp->f_flag & FMARK)
1309 return;
1310
1311 /* If we're already deferred, don't screw up the defer count */
1312 if (fp->f_flag & FDEFER)
1313 return;
1314
1315 /*
1316 * Minimize the number of deferrals... Sockets are the only
1317 * type of descriptor which can hold references to another
1318 * descriptor, so just mark other descriptors, and defer
1319 * unmarked sockets for the next pass.
1320 */
1321 if (fp->f_type == DTYPE_SOCKET) {
1322 unp_defer++;
1323 if (fp->f_count == 0)
1324 panic("unp_mark: queued unref");
1325 fp->f_flag |= FDEFER;
1326 } else {
1327 fp->f_flag |= FMARK;
1328 }
1329 return;
1330 }
1331
1332 void
1333 unp_discard(struct file *fp)
1334 {
1335 if (fp == NULL)
1336 return;
1337 simple_lock(&fp->f_slock);
1338 fp->f_usecount++; /* i.e. FILE_USE(fp) sans locking */
1339 fp->f_msgcount--;
1340 simple_unlock(&fp->f_slock);
1341 unp_rights--;
1342 (void) closef(fp, (struct proc *)0);
1343 }
1344