uipc_usrreq.c revision 1.86 1 /* $NetBSD: uipc_usrreq.c,v 1.86 2005/12/11 12:24:30 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1998, 2000, 2004 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the NetBSD
22 * Foundation, Inc. and its contributors.
23 * 4. Neither the name of The NetBSD Foundation nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 * POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 /*
41 * Copyright (c) 1982, 1986, 1989, 1991, 1993
42 * The Regents of the University of California. All rights reserved.
43 *
44 * Redistribution and use in source and binary forms, with or without
45 * modification, are permitted provided that the following conditions
46 * are met:
47 * 1. Redistributions of source code must retain the above copyright
48 * notice, this list of conditions and the following disclaimer.
49 * 2. Redistributions in binary form must reproduce the above copyright
50 * notice, this list of conditions and the following disclaimer in the
51 * documentation and/or other materials provided with the distribution.
52 * 3. Neither the name of the University nor the names of its contributors
53 * may be used to endorse or promote products derived from this software
54 * without specific prior written permission.
55 *
56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
66 * SUCH DAMAGE.
67 *
68 * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
69 */
70
71 /*
72 * Copyright (c) 1997 Christopher G. Demetriou. All rights reserved.
73 *
74 * Redistribution and use in source and binary forms, with or without
75 * modification, are permitted provided that the following conditions
76 * are met:
77 * 1. Redistributions of source code must retain the above copyright
78 * notice, this list of conditions and the following disclaimer.
79 * 2. Redistributions in binary form must reproduce the above copyright
80 * notice, this list of conditions and the following disclaimer in the
81 * documentation and/or other materials provided with the distribution.
82 * 3. All advertising materials mentioning features or use of this software
83 * must display the following acknowledgement:
84 * This product includes software developed by the University of
85 * California, Berkeley and its contributors.
86 * 4. Neither the name of the University nor the names of its contributors
87 * may be used to endorse or promote products derived from this software
88 * without specific prior written permission.
89 *
90 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
91 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
92 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
93 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
94 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
95 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
96 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
97 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
98 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
99 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
100 * SUCH DAMAGE.
101 *
102 * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
103 */
104
105 #include <sys/cdefs.h>
106 __KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.86 2005/12/11 12:24:30 christos Exp $");
107
108 #include <sys/param.h>
109 #include <sys/systm.h>
110 #include <sys/proc.h>
111 #include <sys/filedesc.h>
112 #include <sys/domain.h>
113 #include <sys/protosw.h>
114 #include <sys/socket.h>
115 #include <sys/socketvar.h>
116 #include <sys/unpcb.h>
117 #include <sys/un.h>
118 #include <sys/namei.h>
119 #include <sys/vnode.h>
120 #include <sys/file.h>
121 #include <sys/stat.h>
122 #include <sys/mbuf.h>
123
124 /*
125 * Unix communications domain.
126 *
127 * TODO:
128 * SEQPACKET, RDM
129 * rethink name space problems
130 * need a proper out-of-band
131 */
132 const struct sockaddr_un sun_noname = { sizeof(sun_noname), AF_LOCAL };
133 ino_t unp_ino; /* prototype for fake inode numbers */
134
135 struct mbuf *unp_addsockcred(struct proc *, struct mbuf *);
136
137 int
138 unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp,
139 struct proc *p)
140 {
141 struct socket *so2;
142 const struct sockaddr_un *sun;
143
144 so2 = unp->unp_conn->unp_socket;
145 if (unp->unp_addr)
146 sun = unp->unp_addr;
147 else
148 sun = &sun_noname;
149 if (unp->unp_conn->unp_flags & UNP_WANTCRED)
150 control = unp_addsockcred(p, control);
151 if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m,
152 control) == 0) {
153 m_freem(control);
154 m_freem(m);
155 so2->so_rcv.sb_overflowed++;
156 return (ENOBUFS);
157 } else {
158 sorwakeup(so2);
159 return (0);
160 }
161 }
162
163 void
164 unp_setsockaddr(struct unpcb *unp, struct mbuf *nam)
165 {
166 const struct sockaddr_un *sun;
167
168 if (unp->unp_addr)
169 sun = unp->unp_addr;
170 else
171 sun = &sun_noname;
172 nam->m_len = sun->sun_len;
173 if (nam->m_len > MLEN)
174 MEXTMALLOC(nam, nam->m_len, M_WAITOK);
175 memcpy(mtod(nam, caddr_t), sun, (size_t)nam->m_len);
176 }
177
178 void
179 unp_setpeeraddr(struct unpcb *unp, struct mbuf *nam)
180 {
181 const struct sockaddr_un *sun;
182
183 if (unp->unp_conn && unp->unp_conn->unp_addr)
184 sun = unp->unp_conn->unp_addr;
185 else
186 sun = &sun_noname;
187 nam->m_len = sun->sun_len;
188 if (nam->m_len > MLEN)
189 MEXTMALLOC(nam, nam->m_len, M_WAITOK);
190 memcpy(mtod(nam, caddr_t), sun, (size_t)nam->m_len);
191 }
192
193 /*ARGSUSED*/
194 int
195 uipc_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
196 struct mbuf *control, struct lwp *l)
197 {
198 struct unpcb *unp = sotounpcb(so);
199 struct socket *so2;
200 struct proc *p;
201 u_int newhiwat;
202 int error = 0;
203
204 if (req == PRU_CONTROL)
205 return (EOPNOTSUPP);
206
207 #ifdef DIAGNOSTIC
208 if (req != PRU_SEND && req != PRU_SENDOOB && control)
209 panic("uipc_usrreq: unexpected control mbuf");
210 #endif
211 p = l ? l->l_proc : NULL;
212 if (unp == 0 && req != PRU_ATTACH) {
213 error = EINVAL;
214 goto release;
215 }
216
217 switch (req) {
218
219 case PRU_ATTACH:
220 if (unp != 0) {
221 error = EISCONN;
222 break;
223 }
224 error = unp_attach(so);
225 break;
226
227 case PRU_DETACH:
228 unp_detach(unp);
229 break;
230
231 case PRU_BIND:
232 error = unp_bind(unp, nam, l);
233 break;
234
235 case PRU_LISTEN:
236 if (unp->unp_vnode == 0)
237 error = EINVAL;
238 break;
239
240 case PRU_CONNECT:
241 error = unp_connect(so, nam, l);
242 break;
243
244 case PRU_CONNECT2:
245 error = unp_connect2(so, (struct socket *)nam, PRU_CONNECT2);
246 break;
247
248 case PRU_DISCONNECT:
249 unp_disconnect(unp);
250 break;
251
252 case PRU_ACCEPT:
253 unp_setpeeraddr(unp, nam);
254 /*
255 * Mark the initiating STREAM socket as connected *ONLY*
256 * after it's been accepted. This prevents a client from
257 * overrunning a server and receiving ECONNREFUSED.
258 */
259 if (unp->unp_conn != NULL &&
260 (unp->unp_conn->unp_socket->so_state & SS_ISCONNECTING))
261 soisconnected(unp->unp_conn->unp_socket);
262 break;
263
264 case PRU_SHUTDOWN:
265 socantsendmore(so);
266 unp_shutdown(unp);
267 break;
268
269 case PRU_RCVD:
270 switch (so->so_type) {
271
272 case SOCK_DGRAM:
273 panic("uipc 1");
274 /*NOTREACHED*/
275
276 case SOCK_STREAM:
277 #define rcv (&so->so_rcv)
278 #define snd (&so2->so_snd)
279 if (unp->unp_conn == 0)
280 break;
281 so2 = unp->unp_conn->unp_socket;
282 /*
283 * Adjust backpressure on sender
284 * and wakeup any waiting to write.
285 */
286 snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
287 unp->unp_mbcnt = rcv->sb_mbcnt;
288 newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc;
289 (void)chgsbsize(so2->so_uidinfo,
290 &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
291 unp->unp_cc = rcv->sb_cc;
292 sowwakeup(so2);
293 #undef snd
294 #undef rcv
295 break;
296
297 default:
298 panic("uipc 2");
299 }
300 break;
301
302 case PRU_SEND:
303 /*
304 * Note: unp_internalize() rejects any control message
305 * other than SCM_RIGHTS, and only allows one. This
306 * has the side-effect of preventing a caller from
307 * forging SCM_CREDS.
308 */
309 if (control && (error = unp_internalize(control, l))) {
310 goto die;
311 }
312 switch (so->so_type) {
313
314 case SOCK_DGRAM: {
315 if (nam) {
316 if ((so->so_state & SS_ISCONNECTED) != 0) {
317 error = EISCONN;
318 goto die;
319 }
320 error = unp_connect(so, nam, l);
321 if (error) {
322 die:
323 m_freem(control);
324 m_freem(m);
325 break;
326 }
327 } else {
328 if ((so->so_state & SS_ISCONNECTED) == 0) {
329 error = ENOTCONN;
330 goto die;
331 }
332 }
333 error = unp_output(m, control, unp, p);
334 if (nam)
335 unp_disconnect(unp);
336 break;
337 }
338
339 case SOCK_STREAM:
340 #define rcv (&so2->so_rcv)
341 #define snd (&so->so_snd)
342 if (unp->unp_conn == 0)
343 panic("uipc 3");
344 so2 = unp->unp_conn->unp_socket;
345 if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
346 /*
347 * Credentials are passed only once on
348 * SOCK_STREAM.
349 */
350 unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
351 control = unp_addsockcred(p, control);
352 }
353 /*
354 * Send to paired receive port, and then reduce
355 * send buffer hiwater marks to maintain backpressure.
356 * Wake up readers.
357 */
358 if (control) {
359 if (sbappendcontrol(rcv, m, control) == 0)
360 m_freem(control);
361 } else
362 sbappend(rcv, m);
363 snd->sb_mbmax -=
364 rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
365 unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
366 newhiwat = snd->sb_hiwat -
367 (rcv->sb_cc - unp->unp_conn->unp_cc);
368 (void)chgsbsize(so->so_uidinfo,
369 &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
370 unp->unp_conn->unp_cc = rcv->sb_cc;
371 sorwakeup(so2);
372 #undef snd
373 #undef rcv
374 break;
375
376 default:
377 panic("uipc 4");
378 }
379 break;
380
381 case PRU_ABORT:
382 unp_drop(unp, ECONNABORTED);
383
384 #ifdef DIAGNOSTIC
385 if (so->so_pcb == 0)
386 panic("uipc 5: drop killed pcb");
387 #endif
388 unp_detach(unp);
389 break;
390
391 case PRU_SENSE:
392 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat;
393 if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
394 so2 = unp->unp_conn->unp_socket;
395 ((struct stat *) m)->st_blksize += so2->so_rcv.sb_cc;
396 }
397 ((struct stat *) m)->st_dev = NODEV;
398 if (unp->unp_ino == 0)
399 unp->unp_ino = unp_ino++;
400 ((struct stat *) m)->st_atimespec =
401 ((struct stat *) m)->st_mtimespec =
402 ((struct stat *) m)->st_ctimespec = unp->unp_ctime;
403 ((struct stat *) m)->st_ino = unp->unp_ino;
404 return (0);
405
406 case PRU_RCVOOB:
407 error = EOPNOTSUPP;
408 break;
409
410 case PRU_SENDOOB:
411 m_freem(control);
412 m_freem(m);
413 error = EOPNOTSUPP;
414 break;
415
416 case PRU_SOCKADDR:
417 unp_setsockaddr(unp, nam);
418 break;
419
420 case PRU_PEERADDR:
421 unp_setpeeraddr(unp, nam);
422 break;
423
424 default:
425 panic("piusrreq");
426 }
427
428 release:
429 return (error);
430 }
431
432 /*
433 * Unix domain socket option processing.
434 */
435 int
436 uipc_ctloutput(int op, struct socket *so, int level, int optname,
437 struct mbuf **mp)
438 {
439 struct unpcb *unp = sotounpcb(so);
440 struct mbuf *m = *mp;
441 int optval = 0, error = 0;
442
443 if (level != 0) {
444 error = EINVAL;
445 if (op == PRCO_SETOPT && m)
446 (void) m_free(m);
447 } else switch (op) {
448
449 case PRCO_SETOPT:
450 switch (optname) {
451 case LOCAL_CREDS:
452 case LOCAL_CONNWAIT:
453 if (m == NULL || m->m_len != sizeof(int))
454 error = EINVAL;
455 else {
456 optval = *mtod(m, int *);
457 switch (optname) {
458 #define OPTSET(bit) \
459 if (optval) \
460 unp->unp_flags |= (bit); \
461 else \
462 unp->unp_flags &= ~(bit);
463
464 case LOCAL_CREDS:
465 OPTSET(UNP_WANTCRED);
466 break;
467 case LOCAL_CONNWAIT:
468 OPTSET(UNP_CONNWAIT);
469 break;
470 }
471 }
472 break;
473 #undef OPTSET
474
475 default:
476 error = ENOPROTOOPT;
477 break;
478 }
479 if (m)
480 (void) m_free(m);
481 break;
482
483 case PRCO_GETOPT:
484 switch (optname) {
485 case LOCAL_CREDS:
486 *mp = m = m_get(M_WAIT, MT_SOOPTS);
487 m->m_len = sizeof(int);
488 switch (optname) {
489
490 #define OPTBIT(bit) (unp->unp_flags & (bit) ? 1 : 0)
491
492 case LOCAL_CREDS:
493 optval = OPTBIT(UNP_WANTCRED);
494 break;
495 }
496 *mtod(m, int *) = optval;
497 break;
498 #undef OPTBIT
499
500 default:
501 error = ENOPROTOOPT;
502 break;
503 }
504 break;
505 }
506 return (error);
507 }
508
509 /*
510 * Both send and receive buffers are allocated PIPSIZ bytes of buffering
511 * for stream sockets, although the total for sender and receiver is
512 * actually only PIPSIZ.
513 * Datagram sockets really use the sendspace as the maximum datagram size,
514 * and don't really want to reserve the sendspace. Their recvspace should
515 * be large enough for at least one max-size datagram plus address.
516 */
517 #define PIPSIZ 4096
518 u_long unpst_sendspace = PIPSIZ;
519 u_long unpst_recvspace = PIPSIZ;
520 u_long unpdg_sendspace = 2*1024; /* really max datagram size */
521 u_long unpdg_recvspace = 4*1024;
522
523 int unp_rights; /* file descriptors in flight */
524
525 int
526 unp_attach(struct socket *so)
527 {
528 struct unpcb *unp;
529 int error;
530
531 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
532 switch (so->so_type) {
533
534 case SOCK_STREAM:
535 error = soreserve(so, unpst_sendspace, unpst_recvspace);
536 break;
537
538 case SOCK_DGRAM:
539 error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
540 break;
541
542 default:
543 panic("unp_attach");
544 }
545 if (error)
546 return (error);
547 }
548 unp = malloc(sizeof(*unp), M_PCB, M_NOWAIT);
549 if (unp == NULL)
550 return (ENOBUFS);
551 memset((caddr_t)unp, 0, sizeof(*unp));
552 unp->unp_socket = so;
553 so->so_pcb = unp;
554 nanotime(&unp->unp_ctime);
555 return (0);
556 }
557
558 void
559 unp_detach(struct unpcb *unp)
560 {
561
562 if (unp->unp_vnode) {
563 unp->unp_vnode->v_socket = 0;
564 vrele(unp->unp_vnode);
565 unp->unp_vnode = 0;
566 }
567 if (unp->unp_conn)
568 unp_disconnect(unp);
569 while (unp->unp_refs)
570 unp_drop(unp->unp_refs, ECONNRESET);
571 soisdisconnected(unp->unp_socket);
572 unp->unp_socket->so_pcb = 0;
573 if (unp->unp_addr)
574 free(unp->unp_addr, M_SONAME);
575 if (unp_rights) {
576 /*
577 * Normally the receive buffer is flushed later,
578 * in sofree, but if our receive buffer holds references
579 * to descriptors that are now garbage, we will dispose
580 * of those descriptor references after the garbage collector
581 * gets them (resulting in a "panic: closef: count < 0").
582 */
583 sorflush(unp->unp_socket);
584 free(unp, M_PCB);
585 unp_gc();
586 } else
587 free(unp, M_PCB);
588 }
589
590 int
591 unp_bind(struct unpcb *unp, struct mbuf *nam, struct lwp *l)
592 {
593 struct sockaddr_un *sun;
594 struct vnode *vp;
595 struct mount *mp;
596 struct vattr vattr;
597 size_t addrlen;
598 struct proc *p;
599 int error;
600 struct nameidata nd;
601
602 if (unp->unp_vnode != 0)
603 return (EINVAL);
604
605 p = l->l_proc;
606 /*
607 * Allocate the new sockaddr. We have to allocate one
608 * extra byte so that we can ensure that the pathname
609 * is nul-terminated.
610 */
611 addrlen = nam->m_len + 1;
612 sun = malloc(addrlen, M_SONAME, M_WAITOK);
613 m_copydata(nam, 0, nam->m_len, (caddr_t)sun);
614 *(((char *)sun) + nam->m_len) = '\0';
615
616 restart:
617 NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE,
618 sun->sun_path, l);
619
620 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
621 if ((error = namei(&nd)) != 0)
622 goto bad;
623 vp = nd.ni_vp;
624 if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
625 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
626 if (nd.ni_dvp == vp)
627 vrele(nd.ni_dvp);
628 else
629 vput(nd.ni_dvp);
630 vrele(vp);
631 if (vp != NULL) {
632 error = EADDRINUSE;
633 goto bad;
634 }
635 error = vn_start_write(NULL, &mp,
636 V_WAIT | V_SLEEPONLY | V_PCATCH);
637 if (error)
638 goto bad;
639 goto restart;
640 }
641 VATTR_NULL(&vattr);
642 vattr.va_type = VSOCK;
643 vattr.va_mode = ACCESSPERMS & ~(p->p_cwdi->cwdi_cmask);
644 VOP_LEASE(nd.ni_dvp, l, p->p_ucred, LEASE_WRITE);
645 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
646 vn_finished_write(mp, 0);
647 if (error)
648 goto bad;
649 vp = nd.ni_vp;
650 vp->v_socket = unp->unp_socket;
651 unp->unp_vnode = vp;
652 unp->unp_addrlen = addrlen;
653 unp->unp_addr = sun;
654 VOP_UNLOCK(vp, 0);
655 return (0);
656
657 bad:
658 free(sun, M_SONAME);
659 return (error);
660 }
661
/*
 * Connect socket so to the local-domain socket named by nam.
 *
 * The pathname is looked up and must resolve to a VSOCK vnode that the
 * caller may write to and that still has a socket attached.  For
 * protocols flagged PR_CONNREQUIRED, the target must be accepting
 * connections; a new socket is spawned from it with sonewconn(), given a
 * copy of the listener's address and flags, and becomes the actual peer.
 *
 * Returns the result of unp_connect2(), or the namei()/VOP_ACCESS()
 * error, ENOTSOCK, ECONNREFUSED or EPROTOTYPE.
 */
int
unp_connect(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	struct sockaddr_un *sun;
	struct vnode *vp;
	struct socket *so2, *so3;
	struct unpcb *unp2, *unp3;
	size_t addrlen;
	int error;
	struct nameidata nd;

	/*
	 * Work on a temporary copy of the sockaddr, one byte longer than
	 * what the caller supplied so that the pathname is guaranteed to
	 * be nul-terminated.  The copy is always freed before returning.
	 */
	addrlen = nam->m_len + 1;
	sun = malloc(addrlen, M_SONAME, M_WAITOK);
	m_copydata(nam, 0, nam->m_len, (caddr_t)sun);
	*(((char *)sun) + nam->m_len) = '\0';

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, sun->sun_path, l);

	error = namei(&nd);
	if (error != 0)
		goto free_addr;

	vp = nd.ni_vp;
	if (vp->v_type != VSOCK) {
		error = ENOTSOCK;
		goto put_vnode;
	}

	error = VOP_ACCESS(vp, VWRITE, l->l_proc->p_ucred, l);
	if (error != 0)
		goto put_vnode;

	so2 = vp->v_socket;
	if (so2 == 0) {
		error = ECONNREFUSED;
		goto put_vnode;
	}
	if (so->so_type != so2->so_type) {
		error = EPROTOTYPE;
		goto put_vnode;
	}

	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
		if ((so2->so_options & SO_ACCEPTCONN) == 0) {
			error = ECONNREFUSED;
			goto put_vnode;
		}
		so3 = sonewconn(so2, 0);
		if (so3 == 0) {
			error = ECONNREFUSED;
			goto put_vnode;
		}

		/* The spawned socket inherits the listener's name and flags. */
		unp2 = sotounpcb(so2);
		unp3 = sotounpcb(so3);
		if (unp2->unp_addr) {
			unp3->unp_addr = malloc(unp2->unp_addrlen,
			    M_SONAME, M_WAITOK);
			memcpy(unp3->unp_addr, unp2->unp_addr,
			    unp2->unp_addrlen);
			unp3->unp_addrlen = unp2->unp_addrlen;
		}
		unp3->unp_flags = unp2->unp_flags;
		so2 = so3;
	}

	error = unp_connect2(so, so2, PRU_CONNECT);

put_vnode:
	vput(vp);
free_addr:
	free(sun, M_SONAME);
	return (error);
}
729
730 int
731 unp_connect2(struct socket *so, struct socket *so2, int req)
732 {
733 struct unpcb *unp = sotounpcb(so);
734 struct unpcb *unp2;
735
736 if (so2->so_type != so->so_type)
737 return (EPROTOTYPE);
738 unp2 = sotounpcb(so2);
739 unp->unp_conn = unp2;
740 switch (so->so_type) {
741
742 case SOCK_DGRAM:
743 unp->unp_nextref = unp2->unp_refs;
744 unp2->unp_refs = unp;
745 soisconnected(so);
746 break;
747
748 case SOCK_STREAM:
749 unp2->unp_conn = unp;
750 if (req == PRU_CONNECT &&
751 ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
752 soisconnecting(so);
753 else
754 soisconnected(so);
755 soisconnected(so2);
756 break;
757
758 default:
759 panic("unp_connect2");
760 }
761 return (0);
762 }
763
764 void
765 unp_disconnect(struct unpcb *unp)
766 {
767 struct unpcb *unp2 = unp->unp_conn;
768
769 if (unp2 == 0)
770 return;
771 unp->unp_conn = 0;
772 switch (unp->unp_socket->so_type) {
773
774 case SOCK_DGRAM:
775 if (unp2->unp_refs == unp)
776 unp2->unp_refs = unp->unp_nextref;
777 else {
778 unp2 = unp2->unp_refs;
779 for (;;) {
780 if (unp2 == 0)
781 panic("unp_disconnect");
782 if (unp2->unp_nextref == unp)
783 break;
784 unp2 = unp2->unp_nextref;
785 }
786 unp2->unp_nextref = unp->unp_nextref;
787 }
788 unp->unp_nextref = 0;
789 unp->unp_socket->so_state &= ~SS_ISCONNECTED;
790 break;
791
792 case SOCK_STREAM:
793 soisdisconnected(unp->unp_socket);
794 unp2->unp_conn = 0;
795 soisdisconnected(unp2->unp_socket);
796 break;
797 }
798 }
799
#ifdef notdef
/* Compiled out: abort was a plain detach. */
unp_abort(struct unpcb *unp)
{

	unp_detach(unp);
}
#endif /* notdef */
806
807 void
808 unp_shutdown(struct unpcb *unp)
809 {
810 struct socket *so;
811
812 if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
813 (so = unp->unp_conn->unp_socket))
814 socantrcvmore(so);
815 }
816
817 void
818 unp_drop(struct unpcb *unp, int errno)
819 {
820 struct socket *so = unp->unp_socket;
821
822 so->so_error = errno;
823 unp_disconnect(unp);
824 if (so->so_head) {
825 so->so_pcb = 0;
826 sofree(so);
827 if (unp->unp_addr)
828 free(unp->unp_addr, M_SONAME);
829 free(unp, M_PCB);
830 }
831 }
832
#ifdef notdef
/* Compiled out: drain hook with nothing to do. */
unp_drain(void)
{

}
#endif /* notdef */
839
840 int
841 unp_externalize(struct mbuf *rights, struct lwp *l)
842 {
843 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
844 struct proc *p = l->l_proc;
845 int i, *fdp;
846 struct file **rp;
847 struct file *fp;
848 int nfds, error = 0;
849
850 nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
851 sizeof(struct file *);
852 rp = (struct file **)CMSG_DATA(cm);
853
854 fdp = malloc(nfds * sizeof(int), M_TEMP, M_WAITOK);
855
856 /* Make sure the recipient should be able to see the descriptors.. */
857 if (p->p_cwdi->cwdi_rdir != NULL) {
858 rp = (struct file **)CMSG_DATA(cm);
859 for (i = 0; i < nfds; i++) {
860 fp = *rp++;
861 /*
862 * If we are in a chroot'ed directory, and
863 * someone wants to pass us a directory, make
864 * sure it's inside the subtree we're allowed
865 * to access.
866 */
867 if (fp->f_type == DTYPE_VNODE) {
868 struct vnode *vp = (struct vnode *)fp->f_data;
869 if ((vp->v_type == VDIR) &&
870 !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) {
871 error = EPERM;
872 break;
873 }
874 }
875 }
876 }
877
878 restart:
879 rp = (struct file **)CMSG_DATA(cm);
880 if (error != 0) {
881 for (i = 0; i < nfds; i++) {
882 fp = *rp;
883 /*
884 * zero the pointer before calling unp_discard,
885 * since it may end up in unp_gc()..
886 */
887 *rp++ = 0;
888 unp_discard(fp);
889 }
890 goto out;
891 }
892
893 /*
894 * First loop -- allocate file descriptor table slots for the
895 * new descriptors.
896 */
897 for (i = 0; i < nfds; i++) {
898 fp = *rp++;
899 if ((error = fdalloc(p, 0, &fdp[i])) != 0) {
900 /*
901 * Back out what we've done so far.
902 */
903 for (--i; i >= 0; i--)
904 fdremove(p->p_fd, fdp[i]);
905
906 if (error == ENOSPC) {
907 fdexpand(p);
908 error = 0;
909 } else {
910 /*
911 * This is the error that has historically
912 * been returned, and some callers may
913 * expect it.
914 */
915 error = EMSGSIZE;
916 }
917 goto restart;
918 }
919
920 /*
921 * Make the slot reference the descriptor so that
922 * fdalloc() works properly.. We finalize it all
923 * in the loop below.
924 */
925 p->p_fd->fd_ofiles[fdp[i]] = fp;
926 }
927
928 /*
929 * Now that adding them has succeeded, update all of the
930 * descriptor passing state.
931 */
932 rp = (struct file **)CMSG_DATA(cm);
933 for (i = 0; i < nfds; i++) {
934 fp = *rp++;
935 fp->f_msgcount--;
936 unp_rights--;
937 }
938
939 /*
940 * Copy temporary array to message and adjust length, in case of
941 * transition from large struct file pointers to ints.
942 */
943 memcpy(CMSG_DATA(cm), fdp, nfds * sizeof(int));
944 cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
945 rights->m_len = CMSG_SPACE(nfds * sizeof(int));
946 out:
947 free(fdp, M_TEMP);
948 return (error);
949 }
950
951 int
952 unp_internalize(struct mbuf *control, struct lwp *l)
953 {
954 struct proc *p = l->l_proc;
955 struct filedesc *fdescp = p->p_fd;
956 struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
957 struct file **rp, **files;
958 struct file *fp;
959 int i, fd, *fdp;
960 int nfds;
961 u_int neededspace;
962
963 /* Sanity check the control message header */
964 if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
965 cm->cmsg_len != control->m_len)
966 return (EINVAL);
967
968 /* Verify that the file descriptors are valid */
969 nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
970 fdp = (int *)CMSG_DATA(cm);
971 for (i = 0; i < nfds; i++) {
972 fd = *fdp++;
973 if ((fp = fd_getfile(fdescp, fd)) == NULL)
974 return (EBADF);
975 simple_unlock(&fp->f_slock);
976 }
977
978 /* Make sure we have room for the struct file pointers */
979 neededspace = CMSG_SPACE(nfds * sizeof(struct file *)) -
980 control->m_len;
981 if (neededspace > M_TRAILINGSPACE(control)) {
982
983 /* allocate new space and copy header into it */
984 newcm = malloc(
985 CMSG_SPACE(nfds * sizeof(struct file *)),
986 M_MBUF, M_WAITOK);
987 if (newcm == NULL)
988 return (E2BIG);
989 memcpy(newcm, cm, sizeof(struct cmsghdr));
990 files = (struct file **)CMSG_DATA(newcm);
991 } else {
992 /* we can convert in-place */
993 newcm = NULL;
994 files = (struct file **)CMSG_DATA(cm);
995 }
996
997 /*
998 * Transform the file descriptors into struct file pointers, in
999 * reverse order so that if pointers are bigger than ints, the
1000 * int won't get until we're done.
1001 */
1002 fdp = (int *)CMSG_DATA(cm) + nfds - 1;
1003 rp = files + nfds - 1;
1004 for (i = 0; i < nfds; i++) {
1005 fp = fdescp->fd_ofiles[*fdp--];
1006 simple_lock(&fp->f_slock);
1007 #ifdef DIAGNOSTIC
1008 if (fp->f_iflags & FIF_WANTCLOSE)
1009 panic("unp_internalize: file already closed");
1010 #endif
1011 *rp-- = fp;
1012 fp->f_count++;
1013 fp->f_msgcount++;
1014 simple_unlock(&fp->f_slock);
1015 unp_rights++;
1016 }
1017
1018 if (newcm) {
1019 if (control->m_flags & M_EXT)
1020 MEXTREMOVE(control);
1021 MEXTADD(control, newcm,
1022 CMSG_SPACE(nfds * sizeof(struct file *)),
1023 M_MBUF, NULL, NULL);
1024 cm = newcm;
1025 }
1026
1027 /* adjust message & mbuf to note amount of space actually used. */
1028 cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct file *));
1029 control->m_len = CMSG_SPACE(nfds * sizeof(struct file *));
1030
1031 return (0);
1032 }
1033
1034 struct mbuf *
1035 unp_addsockcred(struct proc *p, struct mbuf *control)
1036 {
1037 struct cmsghdr *cmp;
1038 struct sockcred *sc;
1039 struct mbuf *m, *n;
1040 int len, space, i;
1041
1042 len = CMSG_LEN(SOCKCREDSIZE(p->p_ucred->cr_ngroups));
1043 space = CMSG_SPACE(SOCKCREDSIZE(p->p_ucred->cr_ngroups));
1044
1045 m = m_get(M_WAIT, MT_CONTROL);
1046 if (space > MLEN) {
1047 if (space > MCLBYTES)
1048 MEXTMALLOC(m, space, M_WAITOK);
1049 else
1050 m_clget(m, M_WAIT);
1051 if ((m->m_flags & M_EXT) == 0) {
1052 m_free(m);
1053 return (control);
1054 }
1055 }
1056
1057 m->m_len = space;
1058 m->m_next = NULL;
1059 cmp = mtod(m, struct cmsghdr *);
1060 sc = (struct sockcred *)CMSG_DATA(cmp);
1061 cmp->cmsg_len = len;
1062 cmp->cmsg_level = SOL_SOCKET;
1063 cmp->cmsg_type = SCM_CREDS;
1064 sc->sc_uid = p->p_cred->p_ruid;
1065 sc->sc_euid = p->p_ucred->cr_uid;
1066 sc->sc_gid = p->p_cred->p_rgid;
1067 sc->sc_egid = p->p_ucred->cr_gid;
1068 sc->sc_ngroups = p->p_ucred->cr_ngroups;
1069 for (i = 0; i < sc->sc_ngroups; i++)
1070 sc->sc_groups[i] = p->p_ucred->cr_groups[i];
1071
1072 /*
1073 * If a control message already exists, append us to the end.
1074 */
1075 if (control != NULL) {
1076 for (n = control; n->m_next != NULL; n = n->m_next)
1077 ;
1078 n->m_next = m;
1079 } else
1080 control = m;
1081
1082 return (control);
1083 }
1084
/* State of the descriptor garbage collector, see unp_gc(). */
int	unp_defer;	/* count of files flagged FDEFER */
int	unp_gcing;	/* non-zero while unp_gc() is running */

extern struct domain unixdomain;
1087
1088 /*
1089 * Comment added long after the fact explaining what's going on here.
1090 * Do a mark-sweep GC of file descriptors on the system, to free up
1091 * any which are caught in flight to an about-to-be-closed socket.
1092 *
1093 * Traditional mark-sweep gc's start at the "root", and mark
1094 * everything reachable from the root (which, in our case would be the
1095 * process table). The mark bits are cleared during the sweep.
1096 *
1097 * XXX For some inexplicable reason (perhaps because the file
1098 * descriptor tables used to live in the u area which could be swapped
1099 * out and thus hard to reach), we do multiple scans over the set of
1100 * descriptors, using *two* mark bits per object (DEFER and MARK).
1101 * Whenever we find a descriptor which references other descriptors,
1102 * the ones it references are marked with both bits, and we iterate
1103 * over the whole file table until there are no more DEFER bits set.
1104 * We also make an extra pass *before* the GC to clear the mark bits,
1105 * which could have been cleared at almost no cost during the previous
1106 * sweep.
1107 *
1108 * XXX MP: this needs to run with locks such that no other thread of
1109 * control can create or destroy references to file descriptors. it
1110 * may be necessary to defer the GC until later (when the locking
1111 * situation is more hospitable); it may be necessary to push this
1112 * into a separate thread.
1113 */
1114 void
1115 unp_gc(void)
1116 {
1117 struct file *fp, *nextfp;
1118 struct socket *so, *so1;
1119 struct file **extra_ref, **fpp;
1120 int nunref, i;
1121
1122 if (unp_gcing)
1123 return;
1124 unp_gcing = 1;
1125 unp_defer = 0;
1126
1127 /* Clear mark bits */
1128 LIST_FOREACH(fp, &filehead, f_list)
1129 fp->f_flag &= ~(FMARK|FDEFER);
1130
1131 /*
1132 * Iterate over the set of descriptors, marking ones believed
1133 * (based on refcount) to be referenced from a process, and
1134 * marking for rescan descriptors which are queued on a socket.
1135 */
1136 do {
1137 LIST_FOREACH(fp, &filehead, f_list) {
1138 if (fp->f_flag & FDEFER) {
1139 fp->f_flag &= ~FDEFER;
1140 unp_defer--;
1141 #ifdef DIAGNOSTIC
1142 if (fp->f_count == 0)
1143 panic("unp_gc: deferred unreferenced socket");
1144 #endif
1145 } else {
1146 if (fp->f_count == 0)
1147 continue;
1148 if (fp->f_flag & FMARK)
1149 continue;
1150 if (fp->f_count == fp->f_msgcount)
1151 continue;
1152 }
1153 fp->f_flag |= FMARK;
1154
1155 if (fp->f_type != DTYPE_SOCKET ||
1156 (so = (struct socket *)fp->f_data) == 0)
1157 continue;
1158 if (so->so_proto->pr_domain != &unixdomain ||
1159 (so->so_proto->pr_flags&PR_RIGHTS) == 0)
1160 continue;
1161 #ifdef notdef
1162 if (so->so_rcv.sb_flags & SB_LOCK) {
1163 /*
1164 * This is problematical; it's not clear
1165 * we need to wait for the sockbuf to be
1166 * unlocked (on a uniprocessor, at least),
1167 * and it's also not clear what to do
1168 * if sbwait returns an error due to receipt
1169 * of a signal. If sbwait does return
1170 * an error, we'll go into an infinite
1171 * loop. Delete all of this for now.
1172 */
1173 (void) sbwait(&so->so_rcv);
1174 goto restart;
1175 }
1176 #endif
1177 unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
1178 /*
1179 * mark descriptors referenced from sockets queued on the accept queue as well.
1180 */
1181 if (so->so_options & SO_ACCEPTCONN) {
1182 TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
1183 unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
1184 }
1185 TAILQ_FOREACH(so1, &so->so_q, so_qe) {
1186 unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
1187 }
1188 }
1189
1190 }
1191 } while (unp_defer);
1192 /*
1193 * Sweep pass. Find unmarked descriptors, and free them.
1194 *
1195 * We grab an extra reference to each of the file table entries
1196 * that are not otherwise accessible and then free the rights
1197 * that are stored in messages on them.
1198 *
1199 * The bug in the original code is a little tricky, so I'll describe
1200 * what's wrong with it here.
1201 *
1202 * It is incorrect to simply unp_discard each entry for f_msgcount
1203 * times -- consider the case of sockets A and B that contain
1204 * references to each other. On a last close of some other socket,
1205 * we trigger a gc since the number of outstanding rights (unp_rights)
1206 * is non-zero. If during the sweep phase the gc code un_discards,
1207 * we end up doing a (full) closef on the descriptor. A closef on A
1208 * results in the following chain. Closef calls soo_close, which
1209 * calls soclose. Soclose calls first (through the switch
1210 * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply
1211 * returns because the previous instance had set unp_gcing, and
1212 * we return all the way back to soclose, which marks the socket
1213 * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush
1214 * to free up the rights that are queued in messages on the socket A,
1215 * i.e., the reference on B. The sorflush calls via the dom_dispose
1216 * switch unp_dispose, which unp_scans with unp_discard. This second
1217 * instance of unp_discard just calls closef on B.
1218 *
1219 * Well, a similar chain occurs on B, resulting in a sorflush on B,
1220 * which results in another closef on A. Unfortunately, A is already
1221 * being closed, and the descriptor has already been marked with
1222 * SS_NOFDREF, and soclose panics at this point.
1223 *
1224 * Here, we first take an extra reference to each inaccessible
1225 * descriptor. Then, if the inaccessible descriptor is a
1226 * socket, we call sorflush in case it is a Unix domain
1227 * socket. After we destroy all the rights carried in
1228 * messages, we do a last closef to get rid of our extra
1229 * reference. This is the last close, and the unp_detach etc
1230 * will shut down the socket.
1231 *
1232 * 91/09/19, bsy (at) cs.cmu.edu
1233 */
1234 extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK);
1235 for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0;
1236 fp = nextfp) {
1237 nextfp = LIST_NEXT(fp, f_list);
1238 simple_lock(&fp->f_slock);
1239 if (fp->f_count != 0 &&
1240 fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
1241 *fpp++ = fp;
1242 nunref++;
1243 fp->f_count++;
1244 }
1245 simple_unlock(&fp->f_slock);
1246 }
1247 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
1248 fp = *fpp;
1249 simple_lock(&fp->f_slock);
1250 FILE_USE(fp);
1251 if (fp->f_type == DTYPE_SOCKET)
1252 sorflush((struct socket *)fp->f_data);
1253 FILE_UNUSE(fp, NULL);
1254 }
1255 for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
1256 fp = *fpp;
1257 simple_lock(&fp->f_slock);
1258 FILE_USE(fp);
1259 (void) closef(fp, (struct lwp *)0);
1260 }
1261 free((caddr_t)extra_ref, M_FILE);
1262 unp_gcing = 0;
1263 }
1264
1265 void
1266 unp_dispose(struct mbuf *m)
1267 {
1268
1269 if (m)
1270 unp_scan(m, unp_discard, 1);
1271 }
1272
1273 void
1274 unp_scan(struct mbuf *m0, void (*op)(struct file *), int discard)
1275 {
1276 struct mbuf *m;
1277 struct file **rp;
1278 struct cmsghdr *cm;
1279 int i;
1280 int qfds;
1281
1282 while (m0) {
1283 for (m = m0; m; m = m->m_next) {
1284 if (m->m_type == MT_CONTROL &&
1285 m->m_len >= sizeof(*cm)) {
1286 cm = mtod(m, struct cmsghdr *);
1287 if (cm->cmsg_level != SOL_SOCKET ||
1288 cm->cmsg_type != SCM_RIGHTS)
1289 continue;
1290 qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
1291 / sizeof(struct file *);
1292 rp = (struct file **)CMSG_DATA(cm);
1293 for (i = 0; i < qfds; i++) {
1294 struct file *fp = *rp;
1295 if (discard)
1296 *rp = 0;
1297 (*op)(fp);
1298 rp++;
1299 }
1300 break; /* XXX, but saves time */
1301 }
1302 }
1303 m0 = m0->m_nextpkt;
1304 }
1305 }
1306
1307 void
1308 unp_mark(struct file *fp)
1309 {
1310 if (fp == NULL)
1311 return;
1312
1313 if (fp->f_flag & FMARK)
1314 return;
1315
1316 /* If we're already deferred, don't screw up the defer count */
1317 if (fp->f_flag & FDEFER)
1318 return;
1319
1320 /*
1321 * Minimize the number of deferrals... Sockets are the only
1322 * type of descriptor which can hold references to another
1323 * descriptor, so just mark other descriptors, and defer
1324 * unmarked sockets for the next pass.
1325 */
1326 if (fp->f_type == DTYPE_SOCKET) {
1327 unp_defer++;
1328 if (fp->f_count == 0)
1329 panic("unp_mark: queued unref");
1330 fp->f_flag |= FDEFER;
1331 } else {
1332 fp->f_flag |= FMARK;
1333 }
1334 return;
1335 }
1336
1337 void
1338 unp_discard(struct file *fp)
1339 {
1340 if (fp == NULL)
1341 return;
1342 simple_lock(&fp->f_slock);
1343 fp->f_usecount++; /* i.e. FILE_USE(fp) sans locking */
1344 fp->f_msgcount--;
1345 simple_unlock(&fp->f_slock);
1346 unp_rights--;
1347 (void) closef(fp, (struct lwp *)0);
1348 }
1349