    1 /*	$NetBSD: uipc_socket2.c,v 1.91.2.4 2009/09/16 13:38:01 yamt Exp $	*/
2
3 /*-
4 * Copyright (c) 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. Neither the name of the University nor the names of its contributors
42 * may be used to endorse or promote products derived from this software
43 * without specific prior written permission.
44 *
45 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 *
57 * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95
58 */
59
60 #include <sys/cdefs.h>
61 __KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.91.2.4 2009/09/16 13:38:01 yamt Exp $");
62
63 #include "opt_mbuftrace.h"
64 #include "opt_sb_max.h"
65
66 #include <sys/param.h>
67 #include <sys/systm.h>
68 #include <sys/proc.h>
69 #include <sys/file.h>
70 #include <sys/buf.h>
71 #include <sys/malloc.h>
72 #include <sys/mbuf.h>
73 #include <sys/protosw.h>
74 #include <sys/domain.h>
75 #include <sys/poll.h>
76 #include <sys/socket.h>
77 #include <sys/socketvar.h>
78 #include <sys/signalvar.h>
79 #include <sys/kauth.h>
80 #include <sys/pool.h>
81 #include <sys/uidinfo.h>
82
83 /*
84 * Primitive routines for operating on sockets and socket buffers.
85 *
86 * Locking rules and assumptions:
87 *
88 * o socket::so_lock can change on the fly. The low level routines used
89 * to lock sockets are aware of this. When so_lock is acquired, the
90 * routine locking must check to see if so_lock still points to the
91 * lock that was acquired. If so_lock has changed in the meantime, the
     92  *    now irrelevant lock that was acquired must be dropped and the lock
93 * operation retried. Although not proven here, this is completely safe
94 * on a multiprocessor system, even with relaxed memory ordering, given
95 * the next two rules:
96 *
97 * o In order to mutate so_lock, the lock pointed to by the current value
98 * of so_lock must be held: i.e., the socket must be held locked by the
99 * changing thread. The thread must issue membar_exit() to prevent
100 * memory accesses being reordered, and can set so_lock to the desired
101 * value. If the lock pointed to by the new value of so_lock is not
102 * held by the changing thread, the socket must then be considered
103 * unlocked.
104 *
105 * o If so_lock is mutated, and the previous lock referred to by so_lock
106 * could still be visible to other threads in the system (e.g. via file
107 * descriptor or protocol-internal reference), then the old lock must
108 * remain valid until the socket and/or protocol control block has been
109 * torn down.
110 *
111 * o If a socket has a non-NULL so_head value (i.e. is in the process of
112 * connecting), then locking the socket must also lock the socket pointed
113 * to by so_head: their lock pointers must match.
114 *
115 * o If a socket has connections in progress (so_q, so_q0 not empty) then
116 * locking the socket must also lock the sockets attached to both queues.
117 * Again, their lock pointers must match.
118 *
    119  * o Beyond the initial lock assignment in socreate(), assigning locks to
120 * sockets is the responsibility of the individual protocols / protocol
121 * domains.
122 */
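/*
 * Illustrative sketch (not part of the original source): the
 * acquire/recheck dance described above, written out with the
 * kmutex(9) primitives.  The function name is hypothetical; compare
 * solockretry() later in this file, which resolves the same race
 * after a sleep.  Kept out of the build with #if 0.
 */
#if 0
static void
example_lock_socket(struct socket *so)
{
	kmutex_t *lock;

	for (;;) {
		lock = so->so_lock;
		mutex_enter(lock);
		/*
		 * Another thread may have changed so_lock while we were
		 * blocked on the old lock; if so, drop it and retry.
		 */
		if (__predict_true(lock == so->so_lock))
			break;
		mutex_exit(lock);
	}
}
#endif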
123
124 static pool_cache_t socket_cache;
125
126 u_long sb_max = SB_MAX; /* maximum socket buffer size */
127 static u_long sb_max_adj; /* adjusted sb_max */
128
129 /*
130 * Procedures to manipulate state flags of socket
131 * and do appropriate wakeups. Normal sequence from the
132 * active (originating) side is that soisconnecting() is
    133  * called during processing of a connect() call,
134 * resulting in an eventual call to soisconnected() if/when the
135 * connection is established. When the connection is torn down
    136  * soisdisconnecting() is called during processing of a disconnect() call,
137 * and soisdisconnected() is called when the connection to the peer
138 * is totally severed. The semantics of these routines are such that
139 * connectionless protocols can call soisconnected() and soisdisconnected()
140 * only, bypassing the in-progress calls when setting up a ``connection''
141 * takes no time.
142 *
143 * From the passive side, a socket is created with
144 * two queues of sockets: so_q0 for connections in progress
145 * and so_q for connections already made and awaiting user acceptance.
146 * As a protocol is preparing incoming connections, it creates a socket
147 * structure queued on so_q0 by calling sonewconn(). When the connection
148 * is established, soisconnected() is called, and transfers the
149 * socket structure to so_q, making it available to accept().
150 *
151 * If a socket is closed with sockets on either
152 * so_q0 or so_q, these sockets are dropped.
153 *
154 * If higher level protocols are implemented in
155 * the kernel, the wakeups done here will sometimes
156 * cause software-interrupt process scheduling.
157 */
158
159 void
160 soisconnecting(struct socket *so)
161 {
162
163 KASSERT(solocked(so));
164
165 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
166 so->so_state |= SS_ISCONNECTING;
167 }
168
169 void
170 soisconnected(struct socket *so)
171 {
172 struct socket *head;
173
174 head = so->so_head;
175
176 KASSERT(solocked(so));
177 KASSERT(head == NULL || solocked2(so, head));
178
179 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
180 so->so_state |= SS_ISCONNECTED;
181 if (head && so->so_onq == &head->so_q0) {
182 if ((so->so_options & SO_ACCEPTFILTER) == 0) {
183 soqremque(so, 0);
184 soqinsque(head, so, 1);
185 sorwakeup(head);
186 cv_broadcast(&head->so_cv);
187 } else {
188 so->so_upcall =
189 head->so_accf->so_accept_filter->accf_callback;
190 so->so_upcallarg = head->so_accf->so_accept_filter_arg;
191 so->so_rcv.sb_flags |= SB_UPCALL;
192 so->so_options &= ~SO_ACCEPTFILTER;
193 (*so->so_upcall)(so, so->so_upcallarg,
194 POLLIN|POLLRDNORM, M_DONTWAIT);
195 }
196 } else {
197 cv_broadcast(&so->so_cv);
198 sorwakeup(so);
199 sowwakeup(so);
200 }
201 }
202
203 void
204 soisdisconnecting(struct socket *so)
205 {
206
207 KASSERT(solocked(so));
208
209 so->so_state &= ~SS_ISCONNECTING;
210 so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
211 cv_broadcast(&so->so_cv);
212 sowwakeup(so);
213 sorwakeup(so);
214 }
215
216 void
217 soisdisconnected(struct socket *so)
218 {
219
220 KASSERT(solocked(so));
221
222 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
223 so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
224 cv_broadcast(&so->so_cv);
225 sowwakeup(so);
226 sorwakeup(so);
227 }
228
229 void
230 soinit2(void)
231 {
232
233 socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
234 "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
235 }
236
237 /*
238 * When an attempt at a new connection is noted on a socket
239 * which accepts connections, sonewconn is called. If the
240 * connection is possible (subject to space constraints, etc.)
    241  * then we allocate a new structure, properly linked into the
    242  * data structure of the original socket, and return it.
243 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
244 */
245 struct socket *
246 sonewconn(struct socket *head, int connstatus)
247 {
248 struct socket *so;
249 int soqueue, error;
250
251 KASSERT(connstatus == 0 || connstatus == SS_ISCONFIRMING ||
252 connstatus == SS_ISCONNECTED);
253 KASSERT(solocked(head));
254
255 if ((head->so_options & SO_ACCEPTFILTER) != 0)
256 connstatus = 0;
257 soqueue = connstatus ? 1 : 0;
258 if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
259 return NULL;
260 so = soget(false);
261 if (so == NULL)
262 return NULL;
263 mutex_obj_hold(head->so_lock);
264 so->so_lock = head->so_lock;
265 so->so_type = head->so_type;
266 so->so_options = head->so_options &~ SO_ACCEPTCONN;
267 so->so_linger = head->so_linger;
268 so->so_state = head->so_state | SS_NOFDREF;
269 so->so_nbio = head->so_nbio;
270 so->so_proto = head->so_proto;
271 so->so_timeo = head->so_timeo;
272 so->so_pgid = head->so_pgid;
273 so->so_send = head->so_send;
274 so->so_receive = head->so_receive;
275 so->so_uidinfo = head->so_uidinfo;
276 so->so_egid = head->so_egid;
277 so->so_cpid = head->so_cpid;
278 #ifdef MBUFTRACE
279 so->so_mowner = head->so_mowner;
280 so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
281 so->so_snd.sb_mowner = head->so_snd.sb_mowner;
282 #endif
283 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) != 0)
284 goto out;
285 so->so_snd.sb_lowat = head->so_snd.sb_lowat;
286 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
287 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
288 so->so_snd.sb_timeo = head->so_snd.sb_timeo;
289 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
290 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
291 soqinsque(head, so, soqueue);
292 error = (*so->so_proto->pr_usrreq)(so, PRU_ATTACH, NULL, NULL,
293 NULL, NULL);
294 KASSERT(solocked(so));
295 if (error != 0) {
296 (void) soqremque(so, soqueue);
297 out:
298 /*
    299 		 * Remove accept filter if one is present.
300 * XXX Is this really needed?
301 */
302 if (so->so_accf != NULL)
303 (void)accept_filt_clear(so);
304 soput(so);
305 return NULL;
306 }
307 if (connstatus) {
308 sorwakeup(head);
309 cv_broadcast(&head->so_cv);
310 so->so_state |= connstatus;
311 }
312 return so;
313 }
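/*
 * Illustrative sketch (not part of the original source): how a
 * connection-oriented protocol's input path might use sonewconn()
 * on receipt of a connection request.  The function name is
 * hypothetical and the handshake details are elided.
 */
#if 0
static void
example_passive_open(struct socket *head)
{
	struct socket *so;

	KASSERT(solocked(head));

	/* Queue an embryonic socket on head's so_q0. */
	so = sonewconn(head, 0);
	if (so == NULL)
		return;		/* listen queue full or out of resources */

	/*
	 * ... the protocol handshake proceeds; when it completes,
	 * soisconnected(so) moves the socket to so_q, where accept()
	 * will find it.
	 */
}
#endif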
314
315 struct socket *
316 soget(bool waitok)
317 {
318 struct socket *so;
319
320 so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
321 if (__predict_false(so == NULL))
322 return (NULL);
323 memset(so, 0, sizeof(*so));
324 TAILQ_INIT(&so->so_q0);
325 TAILQ_INIT(&so->so_q);
326 cv_init(&so->so_cv, "socket");
327 cv_init(&so->so_rcv.sb_cv, "netio");
328 cv_init(&so->so_snd.sb_cv, "netio");
329 selinit(&so->so_rcv.sb_sel);
330 selinit(&so->so_snd.sb_sel);
331 so->so_rcv.sb_so = so;
332 so->so_snd.sb_so = so;
333 return so;
334 }
335
336 void
337 soput(struct socket *so)
338 {
339
340 KASSERT(!cv_has_waiters(&so->so_cv));
341 KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
342 KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
343 seldestroy(&so->so_rcv.sb_sel);
344 seldestroy(&so->so_snd.sb_sel);
345 mutex_obj_free(so->so_lock);
346 cv_destroy(&so->so_cv);
347 cv_destroy(&so->so_rcv.sb_cv);
348 cv_destroy(&so->so_snd.sb_cv);
349 pool_cache_put(socket_cache, so);
350 }
351
352 void
353 soqinsque(struct socket *head, struct socket *so, int q)
354 {
355
356 KASSERT(solocked2(head, so));
357
358 #ifdef DIAGNOSTIC
359 if (so->so_onq != NULL)
360 panic("soqinsque");
361 #endif
362
363 so->so_head = head;
364 if (q == 0) {
365 head->so_q0len++;
366 so->so_onq = &head->so_q0;
367 } else {
368 head->so_qlen++;
369 so->so_onq = &head->so_q;
370 }
371 TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
372 }
373
374 int
375 soqremque(struct socket *so, int q)
376 {
377 struct socket *head;
378
379 head = so->so_head;
380
381 KASSERT(solocked(so));
382 if (q == 0) {
383 if (so->so_onq != &head->so_q0)
384 return (0);
385 head->so_q0len--;
386 } else {
387 if (so->so_onq != &head->so_q)
388 return (0);
389 head->so_qlen--;
390 }
391 KASSERT(solocked2(so, head));
392 TAILQ_REMOVE(so->so_onq, so, so_qe);
393 so->so_onq = NULL;
394 so->so_head = NULL;
395 return (1);
396 }
397
398 /*
399 * Socantsendmore indicates that no more data will be sent on the
    400  * socket; it is normally applied to a socket by the protocol code
    401  * when the user informs the system that no more data is to be sent
    402  * (the PRU_SHUTDOWN case).  Socantrcvmore indicates that no more data
403 * will be received, and will normally be applied to the socket by a
404 * protocol when it detects that the peer will send no more data.
405 * Data queued for reading in the socket may yet be read.
406 */
407
408 void
409 socantsendmore(struct socket *so)
410 {
411
412 KASSERT(solocked(so));
413
414 so->so_state |= SS_CANTSENDMORE;
415 sowwakeup(so);
416 }
417
418 void
419 socantrcvmore(struct socket *so)
420 {
421
422 KASSERT(solocked(so));
423
424 so->so_state |= SS_CANTRCVMORE;
425 sorwakeup(so);
426 }
427
428 /*
429 * Wait for data to arrive at/drain from a socket buffer.
430 */
431 int
432 sbwait(struct sockbuf *sb)
433 {
434 struct socket *so;
435 kmutex_t *lock;
436 int error;
437
438 so = sb->sb_so;
439
440 KASSERT(solocked(so));
441
442 sb->sb_flags |= SB_NOTIFY;
443 lock = so->so_lock;
444 if ((sb->sb_flags & SB_NOINTR) != 0)
445 error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
446 else
447 error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
448 if (__predict_false(lock != so->so_lock))
449 solockretry(so, lock);
450 return error;
451 }
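/*
 * Illustrative sketch (not part of the original source): the usual
 * pattern for waiting on a receive buffer with sbwait(), as used by
 * soreceive()-style code.  The function name is hypothetical and
 * error handling is abbreviated.
 */
#if 0
static int
example_wait_for_data(struct socket *so)
{
	int error;

	KASSERT(solocked(so));

	while (so->so_rcv.sb_cc == 0) {
		if (so->so_state & SS_CANTRCVMORE)
			return 0;	/* EOF: peer will send no more data */
		error = sbwait(&so->so_rcv);
		if (error != 0)
			return error;	/* interrupted or timed out */
	}
	return 0;
}
#endif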
452
453 /*
454 * Wakeup processes waiting on a socket buffer.
455 * Do asynchronous notification via SIGIO
456 * if the socket buffer has the SB_ASYNC flag set.
457 */
458 void
459 sowakeup(struct socket *so, struct sockbuf *sb, int code)
460 {
461 int band;
462
463 KASSERT(solocked(so));
464 KASSERT(sb->sb_so == so);
465
466 if (code == POLL_IN)
467 band = POLLIN|POLLRDNORM;
468 else
469 band = POLLOUT|POLLWRNORM;
470 sb->sb_flags &= ~SB_NOTIFY;
471 selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
472 cv_broadcast(&sb->sb_cv);
473 if (sb->sb_flags & SB_ASYNC)
474 fownsignal(so->so_pgid, SIGIO, code, band, so);
475 if (sb->sb_flags & SB_UPCALL)
476 (*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT);
477 }
478
479 /*
480 * Reset a socket's lock pointer. Wake all threads waiting on the
481 * socket's condition variables so that they can restart their waits
482 * using the new lock. The existing lock must be held.
483 */
484 void
485 solockreset(struct socket *so, kmutex_t *lock)
486 {
487
488 KASSERT(solocked(so));
489
490 so->so_lock = lock;
491 cv_broadcast(&so->so_snd.sb_cv);
492 cv_broadcast(&so->so_rcv.sb_cv);
493 cv_broadcast(&so->so_cv);
494 }
495
496 /*
497 * Socket buffer (struct sockbuf) utility routines.
498 *
499 * Each socket contains two socket buffers: one for sending data and
500 * one for receiving data. Each buffer contains a queue of mbufs,
501 * information about the number of mbufs and amount of data in the
502 * queue, and other fields allowing poll() statements and notification
503 * on data availability to be implemented.
504 *
505 * Data stored in a socket buffer is maintained as a list of records.
506 * Each record is a list of mbufs chained together with the m_next
507 * field. Records are chained together with the m_nextpkt field. The upper
508 * level routine soreceive() expects the following conventions to be
509 * observed when placing information in the receive buffer:
510 *
511 * 1. If the protocol requires each message be preceded by the sender's
512 * name, then a record containing that name must be present before
513 * any associated data (mbuf's must be of type MT_SONAME).
514 * 2. If the protocol supports the exchange of ``access rights'' (really
515 * just additional data associated with the message), and there are
516 * ``rights'' to be received, then a record containing this data
517 * should be present (mbuf's must be of type MT_CONTROL).
518 * 3. If a name or rights record exists, then it must be followed by
519 * a data record, perhaps of zero length.
520 *
521 * Before using a new socket structure it is first necessary to reserve
522 * buffer space to the socket, by calling sbreserve(). This should commit
523 * some of the available buffer space in the system buffer pool for the
524 * socket (currently, it does nothing but enforce limits). The space
525 * should be released by calling sbrelease() when the socket is destroyed.
526 */
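/*
 * Illustrative sketch (not part of the original source): a datagram
 * protocol delivering one record to a receive buffer following the
 * conventions above.  sbappendaddr() builds the MT_SONAME mbuf and
 * links the optional control and data mbufs behind it; the function
 * name and parameters are placeholders for the example.
 */
#if 0
static void
example_deliver_datagram(struct socket *so, const struct sockaddr *from,
    struct mbuf *data, struct mbuf *control)
{
	KASSERT(solocked(so));

	if (sbappendaddr(&so->so_rcv, from, data, control) == 0) {
		/* No buffer space or no mbufs: drop the datagram. */
		m_freem(data);
		if (control != NULL)
			m_freem(control);
		return;
	}
	sorwakeup(so);	/* wake readers, poll(2) and SIGIO as needed */
}
#endif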
527
528 int
529 sb_max_set(u_long new_sbmax)
530 {
531 int s;
532
533 if (new_sbmax < (16 * 1024))
534 return (EINVAL);
535
536 s = splsoftnet();
537 sb_max = new_sbmax;
538 sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
539 splx(s);
540
541 return (0);
542 }
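/*
 * Worked example (not part of the original source), assuming
 * MSIZE == 256 and MCLBYTES == 2048: with sb_max set to 256 KiB,
 * sb_max_adj = 262144 * 2048 / (256 + 2048) = 233016 bytes, i.e.
 * roughly 8/9 of sb_max, accounting for the mbuf-header overhead of a
 * buffer built entirely from clusters.
 */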
543
544 int
545 soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
546 {
547
548 KASSERT(so->so_lock == NULL || solocked(so));
549
550 /*
    551 	 * There is at least one application (the configure script of
    552 	 * screen) which expects a fifo to be writable even if it already
    553 	 * has "some" bytes in its buffer.
    554 	 * So we want to make sure (hiwat - lowat) >= (some bytes).
    555 	 *
    556 	 * PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
    557 	 * We expect it to be large enough for such applications.
558 */
559 u_long lowat = MAX(sock_loan_thresh, MCLBYTES);
560 u_long hiwat = lowat + PIPE_BUF;
561
562 if (sndcc < hiwat)
563 sndcc = hiwat;
564 if (sbreserve(&so->so_snd, sndcc, so) == 0)
565 goto bad;
566 if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
567 goto bad2;
568 if (so->so_rcv.sb_lowat == 0)
569 so->so_rcv.sb_lowat = 1;
570 if (so->so_snd.sb_lowat == 0)
571 so->so_snd.sb_lowat = lowat;
572 if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
573 so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
574 return (0);
575 bad2:
576 sbrelease(&so->so_snd, so);
577 bad:
578 return (ENOBUFS);
579 }
580
581 /*
582 * Allot mbufs to a sockbuf.
583 * Attempt to scale mbmax so that mbcnt doesn't become limiting
584 * if buffering efficiency is near the normal case.
585 */
586 int
587 sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
588 {
589 struct lwp *l = curlwp; /* XXX */
590 rlim_t maxcc;
591 struct uidinfo *uidinfo;
592
593 KASSERT(so->so_lock == NULL || solocked(so));
594 KASSERT(sb->sb_so == so);
595 KASSERT(sb_max_adj != 0);
596
597 if (cc == 0 || cc > sb_max_adj)
598 return (0);
599
600 if (kauth_cred_geteuid(l->l_cred) == so->so_uidinfo->ui_uid)
601 maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
602 else
603 maxcc = RLIM_INFINITY;
604
605 uidinfo = so->so_uidinfo;
606 if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
607 return 0;
608 sb->sb_mbmax = min(cc * 2, sb_max);
609 if (sb->sb_lowat > sb->sb_hiwat)
610 sb->sb_lowat = sb->sb_hiwat;
611 return (1);
612 }
613
614 /*
615 * Free mbufs held by a socket, and reserved mbuf space. We do not assert
616 * that the socket is held locked here: see sorflush().
617 */
618 void
619 sbrelease(struct sockbuf *sb, struct socket *so)
620 {
621
622 KASSERT(sb->sb_so == so);
623
624 sbflush(sb);
625 (void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
626 sb->sb_mbmax = 0;
627 }
628
629 /*
630 * Routines to add and remove
631 * data from an mbuf queue.
632 *
633 * The routines sbappend() or sbappendrecord() are normally called to
634 * append new mbufs to a socket buffer, after checking that adequate
635 * space is available, comparing the function sbspace() with the amount
636 * of data to be added. sbappendrecord() differs from sbappend() in
637 * that data supplied is treated as the beginning of a new record.
638 * To place a sender's address, optional access rights, and data in a
639 * socket receive buffer, sbappendaddr() should be used. To place
640 * access rights and data in a socket receive buffer, sbappendrights()
641 * should be used. In either case, the new data begins a new record.
642 * Note that unlike sbappend() and sbappendrecord(), these routines check
643 * for the caller that there will be enough space to store the data.
644 * Each fails if there is not enough space, or if it cannot find mbufs
645 * to store additional information in.
646 *
647 * Reliable protocols may use the socket send buffer to hold data
648 * awaiting acknowledgement. Data is normally copied from a socket
649 * send buffer in a protocol with m_copy for output to a peer,
650 * and then removing the data from the socket buffer with sbdrop()
651 * or sbdroprecord() when the data is acknowledged by the peer.
652 */
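/*
 * Illustrative sketch (not part of the original source): the
 * send-buffer pattern described above for a reliable stream protocol.
 * New data is queued only if sbspace() allows, and data acknowledged
 * by the peer is later released with sbdrop().  The function name,
 * and the assumption that 'm' carries a packet header, are specific
 * to this example.
 */
#if 0
static void
example_stream_send(struct socket *so, struct mbuf *m, int acked)
{
	KASSERT(solocked(so));

	/* Queue new data if the send buffer has room for it. */
	if (sbspace(&so->so_snd) >= m->m_pkthdr.len)
		sbappendstream(&so->so_snd, m);
	else
		m_freem(m);

	/* Later, when the peer acknowledges 'acked' bytes: */
	sbdrop(&so->so_snd, acked);
	sowwakeup(so);		/* waiting writers may now have space */
}
#endif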
653
654 #ifdef SOCKBUF_DEBUG
655 void
656 sblastrecordchk(struct sockbuf *sb, const char *where)
657 {
658 struct mbuf *m = sb->sb_mb;
659
660 KASSERT(solocked(sb->sb_so));
661
662 while (m && m->m_nextpkt)
663 m = m->m_nextpkt;
664
665 if (m != sb->sb_lastrecord) {
666 printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
667 sb->sb_mb, sb->sb_lastrecord, m);
668 printf("packet chain:\n");
669 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
670 printf("\t%p\n", m);
671 panic("sblastrecordchk from %s", where);
672 }
673 }
674
675 void
676 sblastmbufchk(struct sockbuf *sb, const char *where)
677 {
678 struct mbuf *m = sb->sb_mb;
679 struct mbuf *n;
680
681 KASSERT(solocked(sb->sb_so));
682
683 while (m && m->m_nextpkt)
684 m = m->m_nextpkt;
685
686 while (m && m->m_next)
687 m = m->m_next;
688
689 if (m != sb->sb_mbtail) {
690 printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
691 sb->sb_mb, sb->sb_mbtail, m);
692 printf("packet tree:\n");
693 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
694 printf("\t");
695 for (n = m; n != NULL; n = n->m_next)
696 printf("%p ", n);
697 printf("\n");
698 }
699 panic("sblastmbufchk from %s", where);
700 }
701 }
702 #endif /* SOCKBUF_DEBUG */
703
704 /*
705 * Link a chain of records onto a socket buffer
706 */
707 #define SBLINKRECORDCHAIN(sb, m0, mlast) \
708 do { \
709 if ((sb)->sb_lastrecord != NULL) \
710 (sb)->sb_lastrecord->m_nextpkt = (m0); \
711 else \
712 (sb)->sb_mb = (m0); \
713 (sb)->sb_lastrecord = (mlast); \
714 } while (/*CONSTCOND*/0)
715
716
717 #define SBLINKRECORD(sb, m0) \
718 SBLINKRECORDCHAIN(sb, m0, m0)
719
720 /*
721 * Append mbuf chain m to the last record in the
    722  * socket buffer sb.  The additional space associated with
    723  * the mbuf chain is recorded in sb.  Empty mbufs are
724 * discarded and mbufs are compacted where possible.
725 */
726 void
727 sbappend(struct sockbuf *sb, struct mbuf *m)
728 {
729 struct mbuf *n;
730
731 KASSERT(solocked(sb->sb_so));
732
733 if (m == 0)
734 return;
735
736 #ifdef MBUFTRACE
737 m_claimm(m, sb->sb_mowner);
738 #endif
739
740 SBLASTRECORDCHK(sb, "sbappend 1");
741
742 if ((n = sb->sb_lastrecord) != NULL) {
743 /*
744 * XXX Would like to simply use sb_mbtail here, but
745 * XXX I need to verify that I won't miss an EOR that
746 * XXX way.
747 */
748 do {
749 if (n->m_flags & M_EOR) {
750 sbappendrecord(sb, m); /* XXXXXX!!!! */
751 return;
752 }
753 } while (n->m_next && (n = n->m_next));
754 } else {
755 /*
756 * If this is the first record in the socket buffer, it's
757 * also the last record.
758 */
759 sb->sb_lastrecord = m;
760 }
761 sbcompress(sb, m, n);
762 SBLASTRECORDCHK(sb, "sbappend 2");
763 }
764
765 /*
766 * This version of sbappend() should only be used when the caller
767 * absolutely knows that there will never be more than one record
768 * in the socket buffer, that is, a stream protocol (such as TCP).
769 */
770 void
771 sbappendstream(struct sockbuf *sb, struct mbuf *m)
772 {
773
774 KASSERT(solocked(sb->sb_so));
775 KDASSERT(m->m_nextpkt == NULL);
776 KASSERT(sb->sb_mb == sb->sb_lastrecord);
777
778 SBLASTMBUFCHK(sb, __func__);
779
780 #ifdef MBUFTRACE
781 m_claimm(m, sb->sb_mowner);
782 #endif
783
784 sbcompress(sb, m, sb->sb_mbtail);
785
786 sb->sb_lastrecord = sb->sb_mb;
787 SBLASTRECORDCHK(sb, __func__);
788 }
789
790 #ifdef SOCKBUF_DEBUG
791 void
792 sbcheck(struct sockbuf *sb)
793 {
794 struct mbuf *m, *m2;
795 u_long len, mbcnt;
796
797 KASSERT(solocked(sb->sb_so));
798
799 len = 0;
800 mbcnt = 0;
801 for (m = sb->sb_mb; m; m = m->m_nextpkt) {
802 for (m2 = m; m2 != NULL; m2 = m2->m_next) {
803 len += m2->m_len;
804 mbcnt += MSIZE;
805 if (m2->m_flags & M_EXT)
806 mbcnt += m2->m_ext.ext_size;
807 if (m2->m_nextpkt != NULL)
808 panic("sbcheck nextpkt");
809 }
810 }
811 if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
812 printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
813 mbcnt, sb->sb_mbcnt);
814 panic("sbcheck");
815 }
816 }
817 #endif
818
819 /*
820 * As above, except the mbuf chain
821 * begins a new record.
822 */
823 void
824 sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
825 {
826 struct mbuf *m;
827
828 KASSERT(solocked(sb->sb_so));
829
830 if (m0 == 0)
831 return;
832
833 #ifdef MBUFTRACE
834 m_claimm(m0, sb->sb_mowner);
835 #endif
836 /*
837 * Put the first mbuf on the queue.
838 * Note this permits zero length records.
839 */
840 sballoc(sb, m0);
841 SBLASTRECORDCHK(sb, "sbappendrecord 1");
842 SBLINKRECORD(sb, m0);
843 m = m0->m_next;
844 m0->m_next = 0;
845 if (m && (m0->m_flags & M_EOR)) {
846 m0->m_flags &= ~M_EOR;
847 m->m_flags |= M_EOR;
848 }
849 sbcompress(sb, m, m0);
850 SBLASTRECORDCHK(sb, "sbappendrecord 2");
851 }
852
853 /*
854 * As above except that OOB data
855 * is inserted at the beginning of the sockbuf,
856 * but after any other OOB data.
857 */
858 void
859 sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
860 {
861 struct mbuf *m, **mp;
862
863 KASSERT(solocked(sb->sb_so));
864
865 if (m0 == 0)
866 return;
867
868 SBLASTRECORDCHK(sb, "sbinsertoob 1");
869
870 for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
871 again:
872 switch (m->m_type) {
873
874 case MT_OOBDATA:
875 continue; /* WANT next train */
876
877 case MT_CONTROL:
878 if ((m = m->m_next) != NULL)
879 goto again; /* inspect THIS train further */
880 }
881 break;
882 }
883 /*
884 * Put the first mbuf on the queue.
885 * Note this permits zero length records.
886 */
887 sballoc(sb, m0);
888 m0->m_nextpkt = *mp;
889 if (*mp == NULL) {
890 /* m0 is actually the new tail */
891 sb->sb_lastrecord = m0;
892 }
893 *mp = m0;
894 m = m0->m_next;
895 m0->m_next = 0;
896 if (m && (m0->m_flags & M_EOR)) {
897 m0->m_flags &= ~M_EOR;
898 m->m_flags |= M_EOR;
899 }
900 sbcompress(sb, m, m0);
901 SBLASTRECORDCHK(sb, "sbinsertoob 2");
902 }
903
904 /*
905 * Append address and data, and optionally, control (ancillary) data
906 * to the receive queue of a socket. If present,
907 * m0 must include a packet header with total length.
908 * Returns 0 if no space in sockbuf or insufficient mbufs.
909 */
910 int
911 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
912 struct mbuf *control)
913 {
914 struct mbuf *m, *n, *nlast;
915 int space, len;
916
917 KASSERT(solocked(sb->sb_so));
918
919 space = asa->sa_len;
920
921 if (m0 != NULL) {
922 if ((m0->m_flags & M_PKTHDR) == 0)
923 panic("sbappendaddr");
924 space += m0->m_pkthdr.len;
925 #ifdef MBUFTRACE
926 m_claimm(m0, sb->sb_mowner);
927 #endif
928 }
929 for (n = control; n; n = n->m_next) {
930 space += n->m_len;
931 MCLAIM(n, sb->sb_mowner);
932 if (n->m_next == 0) /* keep pointer to last control buf */
933 break;
934 }
935 if (space > sbspace(sb))
936 return (0);
937 MGET(m, M_DONTWAIT, MT_SONAME);
938 if (m == 0)
939 return (0);
940 MCLAIM(m, sb->sb_mowner);
941 /*
942 * XXX avoid 'comparison always true' warning which isn't easily
943 * avoided.
944 */
945 len = asa->sa_len;
946 if (len > MLEN) {
947 MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
948 if ((m->m_flags & M_EXT) == 0) {
949 m_free(m);
950 return (0);
951 }
952 }
953 m->m_len = asa->sa_len;
954 memcpy(mtod(m, void *), asa, asa->sa_len);
955 if (n)
956 n->m_next = m0; /* concatenate data to control */
957 else
958 control = m0;
959 m->m_next = control;
960
961 SBLASTRECORDCHK(sb, "sbappendaddr 1");
962
963 for (n = m; n->m_next != NULL; n = n->m_next)
964 sballoc(sb, n);
965 sballoc(sb, n);
966 nlast = n;
967 SBLINKRECORD(sb, m);
968
969 sb->sb_mbtail = nlast;
970 SBLASTMBUFCHK(sb, "sbappendaddr");
971 SBLASTRECORDCHK(sb, "sbappendaddr 2");
972
973 return (1);
974 }
975
976 /*
977 * Helper for sbappendchainaddr: prepend a struct sockaddr* to
978 * an mbuf chain.
979 */
980 static inline struct mbuf *
981 m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
982 const struct sockaddr *asa)
983 {
984 struct mbuf *m;
985 const int salen = asa->sa_len;
986
987 KASSERT(solocked(sb->sb_so));
988
989 /* only the first in each chain need be a pkthdr */
990 MGETHDR(m, M_DONTWAIT, MT_SONAME);
991 if (m == 0)
992 return (0);
993 MCLAIM(m, sb->sb_mowner);
994 #ifdef notyet
995 if (salen > MHLEN) {
996 MEXTMALLOC(m, salen, M_NOWAIT);
997 if ((m->m_flags & M_EXT) == 0) {
998 m_free(m);
999 return (0);
1000 }
1001 }
1002 #else
1003 KASSERT(salen <= MHLEN);
1004 #endif
1005 m->m_len = salen;
1006 memcpy(mtod(m, void *), asa, salen);
1007 m->m_next = m0;
1008 m->m_pkthdr.len = salen + m0->m_pkthdr.len;
1009
1010 return m;
1011 }
1012
1013 int
1014 sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
1015 struct mbuf *m0, int sbprio)
1016 {
1017 int space;
1018 struct mbuf *m, *n, *n0, *nlast;
1019 int error;
1020
1021 KASSERT(solocked(sb->sb_so));
1022
1023 /*
   1024 	 * XXX sbprio reserved for encoding priority of this request:
1025 * SB_PRIO_NONE --> honour normal sb limits
1026 * SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
1027 * take whole chain. Intended for large requests
1028 * that should be delivered atomically (all, or none).
1029 * SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
1030 * over normal socket limits, for messages indicating
1031 * buffer overflow in earlier normal/lower-priority messages
1032 * SB_PRIO_BESTEFFORT --> ignore limits entirely.
1033 * Intended for kernel-generated messages only.
1034 * Up to generator to avoid total mbuf resource exhaustion.
1035 */
1036 (void)sbprio;
1037
1038 if (m0 && (m0->m_flags & M_PKTHDR) == 0)
1039 panic("sbappendaddrchain");
1040
1041 space = sbspace(sb);
1042
1043 #ifdef notyet
1044 /*
1045 * Enforce SB_PRIO_* limits as described above.
1046 */
1047 #endif
1048
1049 n0 = NULL;
1050 nlast = NULL;
1051 for (m = m0; m; m = m->m_nextpkt) {
1052 struct mbuf *np;
1053
1054 #ifdef MBUFTRACE
1055 m_claimm(m, sb->sb_mowner);
1056 #endif
1057
1058 /* Prepend sockaddr to this record (m) of input chain m0 */
1059 n = m_prepend_sockaddr(sb, m, asa);
1060 if (n == NULL) {
1061 error = ENOBUFS;
1062 goto bad;
1063 }
1064
1065 /* Append record (asa+m) to end of new chain n0 */
1066 if (n0 == NULL) {
1067 n0 = n;
1068 } else {
1069 nlast->m_nextpkt = n;
1070 }
1071 /* Keep track of last record on new chain */
1072 nlast = n;
1073
1074 for (np = n; np; np = np->m_next)
1075 sballoc(sb, np);
1076 }
1077
1078 SBLASTRECORDCHK(sb, "sbappendaddrchain 1");
1079
1080 /* Drop the entire chain of (asa+m) records onto the socket */
1081 SBLINKRECORDCHAIN(sb, n0, nlast);
1082
1083 SBLASTRECORDCHK(sb, "sbappendaddrchain 2");
1084
1085 for (m = nlast; m->m_next; m = m->m_next)
1086 ;
1087 sb->sb_mbtail = m;
1088 SBLASTMBUFCHK(sb, "sbappendaddrchain");
1089
1090 return (1);
1091
1092 bad:
1093 /*
   1094 	 * On error, free the prepended addresses.  For consistency
1095 * with sbappendaddr(), leave it to our caller to free
1096 * the input record chain passed to us as m0.
1097 */
1098 while ((n = n0) != NULL) {
1099 struct mbuf *np;
1100
1101 /* Undo the sballoc() of this record */
1102 for (np = n; np; np = np->m_next)
1103 sbfree(sb, np);
1104
1105 n0 = n->m_nextpkt; /* iterate at next prepended address */
1106 MFREE(n, np); /* free prepended address (not data) */
1107 }
1108 return 0;
1109 }
1110
1111
1112 int
1113 sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
1114 {
1115 struct mbuf *m, *mlast, *n;
1116 int space;
1117
1118 KASSERT(solocked(sb->sb_so));
1119
1120 space = 0;
1121 if (control == 0)
1122 panic("sbappendcontrol");
1123 for (m = control; ; m = m->m_next) {
1124 space += m->m_len;
1125 MCLAIM(m, sb->sb_mowner);
1126 if (m->m_next == 0)
1127 break;
1128 }
1129 n = m; /* save pointer to last control buffer */
1130 for (m = m0; m; m = m->m_next) {
1131 MCLAIM(m, sb->sb_mowner);
1132 space += m->m_len;
1133 }
1134 if (space > sbspace(sb))
1135 return (0);
1136 n->m_next = m0; /* concatenate data to control */
1137
1138 SBLASTRECORDCHK(sb, "sbappendcontrol 1");
1139
1140 for (m = control; m->m_next != NULL; m = m->m_next)
1141 sballoc(sb, m);
1142 sballoc(sb, m);
1143 mlast = m;
1144 SBLINKRECORD(sb, control);
1145
1146 sb->sb_mbtail = mlast;
1147 SBLASTMBUFCHK(sb, "sbappendcontrol");
1148 SBLASTRECORDCHK(sb, "sbappendcontrol 2");
1149
1150 return (1);
1151 }
1152
1153 /*
1154 * Compress mbuf chain m into the socket
1155 * buffer sb following mbuf n. If n
1156 * is null, the buffer is presumed empty.
1157 */
1158 void
1159 sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
1160 {
1161 int eor;
1162 struct mbuf *o;
1163
1164 KASSERT(solocked(sb->sb_so));
1165
1166 eor = 0;
1167 while (m) {
1168 eor |= m->m_flags & M_EOR;
1169 if (m->m_len == 0 &&
1170 (eor == 0 ||
1171 (((o = m->m_next) || (o = n)) &&
1172 o->m_type == m->m_type))) {
1173 if (sb->sb_lastrecord == m)
1174 sb->sb_lastrecord = m->m_next;
1175 m = m_free(m);
1176 continue;
1177 }
1178 if (n && (n->m_flags & M_EOR) == 0 &&
1179 /* M_TRAILINGSPACE() checks buffer writeability */
1180 m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
1181 m->m_len <= M_TRAILINGSPACE(n) &&
1182 n->m_type == m->m_type) {
1183 memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
1184 (unsigned)m->m_len);
1185 n->m_len += m->m_len;
1186 sb->sb_cc += m->m_len;
1187 m = m_free(m);
1188 continue;
1189 }
1190 if (n)
1191 n->m_next = m;
1192 else
1193 sb->sb_mb = m;
1194 sb->sb_mbtail = m;
1195 sballoc(sb, m);
1196 n = m;
1197 m->m_flags &= ~M_EOR;
1198 m = m->m_next;
1199 n->m_next = 0;
1200 }
1201 if (eor) {
1202 if (n)
1203 n->m_flags |= eor;
1204 else
1205 printf("semi-panic: sbcompress\n");
1206 }
1207 SBLASTMBUFCHK(sb, __func__);
1208 }
1209
1210 /*
1211 * Free all mbufs in a sockbuf.
1212 * Check that all resources are reclaimed.
1213 */
1214 void
1215 sbflush(struct sockbuf *sb)
1216 {
1217
1218 KASSERT(solocked(sb->sb_so));
1219 KASSERT((sb->sb_flags & SB_LOCK) == 0);
1220
1221 while (sb->sb_mbcnt)
1222 sbdrop(sb, (int)sb->sb_cc);
1223
1224 KASSERT(sb->sb_cc == 0);
1225 KASSERT(sb->sb_mb == NULL);
1226 KASSERT(sb->sb_mbtail == NULL);
1227 KASSERT(sb->sb_lastrecord == NULL);
1228 }
1229
1230 /*
1231 * Drop data from (the front of) a sockbuf.
1232 */
1233 void
1234 sbdrop(struct sockbuf *sb, int len)
1235 {
1236 struct mbuf *m, *mn, *next;
1237
1238 KASSERT(solocked(sb->sb_so));
1239
1240 next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
1241 while (len > 0) {
1242 if (m == 0) {
1243 if (next == 0)
1244 panic("sbdrop");
1245 m = next;
1246 next = m->m_nextpkt;
1247 continue;
1248 }
1249 if (m->m_len > len) {
1250 m->m_len -= len;
1251 m->m_data += len;
1252 sb->sb_cc -= len;
1253 break;
1254 }
1255 len -= m->m_len;
1256 sbfree(sb, m);
1257 MFREE(m, mn);
1258 m = mn;
1259 }
1260 while (m && m->m_len == 0) {
1261 sbfree(sb, m);
1262 MFREE(m, mn);
1263 m = mn;
1264 }
1265 if (m) {
1266 sb->sb_mb = m;
1267 m->m_nextpkt = next;
1268 } else
1269 sb->sb_mb = next;
1270 /*
1271 * First part is an inline SB_EMPTY_FIXUP(). Second part
1272 * makes sure sb_lastrecord is up-to-date if we dropped
1273 * part of the last record.
1274 */
1275 m = sb->sb_mb;
1276 if (m == NULL) {
1277 sb->sb_mbtail = NULL;
1278 sb->sb_lastrecord = NULL;
1279 } else if (m->m_nextpkt == NULL)
1280 sb->sb_lastrecord = m;
1281 }
1282
1283 /*
1284 * Drop a record off the front of a sockbuf
1285 * and move the next record to the front.
1286 */
1287 void
1288 sbdroprecord(struct sockbuf *sb)
1289 {
1290 struct mbuf *m, *mn;
1291
1292 KASSERT(solocked(sb->sb_so));
1293
1294 m = sb->sb_mb;
1295 if (m) {
1296 sb->sb_mb = m->m_nextpkt;
1297 do {
1298 sbfree(sb, m);
1299 MFREE(m, mn);
1300 } while ((m = mn) != NULL);
1301 }
1302 SB_EMPTY_FIXUP(sb);
1303 }
1304
1305 /*
1306 * Create a "control" mbuf containing the specified data
1307 * with the specified type for presentation on a socket buffer.
1308 */
1309 struct mbuf *
1310 sbcreatecontrol(void *p, int size, int type, int level)
1311 {
1312 struct cmsghdr *cp;
1313 struct mbuf *m;
1314
1315 if (CMSG_SPACE(size) > MCLBYTES) {
1316 printf("sbcreatecontrol: message too large %d\n", size);
1317 return NULL;
1318 }
1319
1320 if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
1321 return ((struct mbuf *) NULL);
1322 if (CMSG_SPACE(size) > MLEN) {
1323 MCLGET(m, M_DONTWAIT);
1324 if ((m->m_flags & M_EXT) == 0) {
1325 m_free(m);
1326 return NULL;
1327 }
1328 }
1329 cp = mtod(m, struct cmsghdr *);
1330 memcpy(CMSG_DATA(cp), p, size);
1331 m->m_len = CMSG_SPACE(size);
1332 cp->cmsg_len = CMSG_LEN(size);
1333 cp->cmsg_level = level;
1334 cp->cmsg_type = type;
1335 return (m);
1336 }
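/*
 * Illustrative sketch (not part of the original source): building a
 * control mbuf with sbcreatecontrol() and delivering it together with
 * data and the sender's address via sbappendaddr().  The function name
 * and its parameters (the option payload, cmsg type and level) are
 * placeholders for the example.
 */
#if 0
static int
example_deliver_with_cmsg(struct socket *so, const struct sockaddr *from,
    struct mbuf *data, void *opt, int optlen, int type, int level)
{
	struct mbuf *control;

	KASSERT(solocked(so));

	/* May be NULL if no mbuf or cluster was available. */
	control = sbcreatecontrol(opt, optlen, type, level);

	if (sbappendaddr(&so->so_rcv, from, data, control) == 0) {
		m_freem(data);
		if (control != NULL)
			m_freem(control);
		return ENOBUFS;
	}
	sorwakeup(so);
	return 0;
}
#endif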
1337
1338 void
1339 solockretry(struct socket *so, kmutex_t *lock)
1340 {
1341
1342 while (lock != so->so_lock) {
1343 mutex_exit(lock);
1344 lock = so->so_lock;
1345 mutex_enter(lock);
1346 }
1347 }
1348
1349 bool
1350 solocked(struct socket *so)
1351 {
1352
1353 return mutex_owned(so->so_lock);
1354 }
1355
1356 bool
1357 solocked2(struct socket *so1, struct socket *so2)
1358 {
1359 kmutex_t *lock;
1360
1361 lock = so1->so_lock;
1362 if (lock != so2->so_lock)
1363 return false;
1364 return mutex_owned(lock);
1365 }
1366
1367 /*
   1368  * Assign a default lock to a new socket.  Called during PRU_ATTACH by
   1369  * protocols that do not have special locking requirements.
1370 */
1371 void
1372 sosetlock(struct socket *so)
1373 {
1374 kmutex_t *lock;
1375
1376 if (so->so_lock == NULL) {
1377 lock = softnet_lock;
1378 so->so_lock = lock;
1379 mutex_obj_hold(lock);
1380 mutex_enter(lock);
1381 }
1382
1383 /* In all cases, lock must be held on return from PRU_ATTACH. */
1384 KASSERT(solocked(so));
1385 }
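/*
 * Illustrative sketch (not part of the original source): a simple
 * PRU_ATTACH handler using sosetlock() and soreserve().  The function
 * name and the buffer sizes are assumptions for the example; a real
 * protocol would also allocate and link its control block here.
 */
#if 0
static int
example_pru_attach(struct socket *so)
{
	int error;

	sosetlock(so);		/* assigns softnet_lock if none is set yet */
	error = soreserve(so, 8192, 8192);
	if (error != 0)
		return error;
	/* ... allocate the protocol control block ... */
	return 0;
}
#endif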
1386
1387 /*
1388 * Set lock on sockbuf sb; sleep if lock is already held.
1389 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
1390 * Returns error without lock if sleep is interrupted.
1391 */
1392 int
1393 sblock(struct sockbuf *sb, int wf)
1394 {
1395 struct socket *so;
1396 kmutex_t *lock;
1397 int error;
1398
1399 KASSERT(solocked(sb->sb_so));
1400
1401 for (;;) {
1402 if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
1403 sb->sb_flags |= SB_LOCK;
1404 return 0;
1405 }
1406 if (wf != M_WAITOK)
1407 return EWOULDBLOCK;
1408 so = sb->sb_so;
1409 lock = so->so_lock;
1410 if ((sb->sb_flags & SB_NOINTR) != 0) {
1411 cv_wait(&so->so_cv, lock);
1412 error = 0;
1413 } else
1414 error = cv_wait_sig(&so->so_cv, lock);
1415 if (__predict_false(lock != so->so_lock))
1416 solockretry(so, lock);
1417 if (error != 0)
1418 return error;
1419 }
1420 }
1421
1422 void
1423 sbunlock(struct sockbuf *sb)
1424 {
1425 struct socket *so;
1426
1427 so = sb->sb_so;
1428
1429 KASSERT(solocked(so));
1430 KASSERT((sb->sb_flags & SB_LOCK) != 0);
1431
1432 sb->sb_flags &= ~SB_LOCK;
1433 cv_broadcast(&so->so_cv);
1434 }
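/*
 * Illustrative sketch (not part of the original source): the
 * sblock()/sbunlock() pattern that serializes sosend()/soreceive()
 * style operations on a socket buffer.  It assumes the solock()/
 * sounlock() entry points from <sys/socketvar.h>; the function name
 * is hypothetical.
 */
#if 0
static int
example_exclusive_sb_op(struct socket *so)
{
	int error;

	solock(so);
	error = sblock(&so->so_rcv, M_WAITOK);
	if (error != 0) {
		sounlock(so);
		return error;
	}

	/* ... manipulate so->so_rcv, possibly sleeping in sbwait() ... */

	sbunlock(&so->so_rcv);
	sounlock(so);
	return 0;
}
#endif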
1435
1436 int
1437 sowait(struct socket *so, bool catch, int timo)
1438 {
1439 kmutex_t *lock;
1440 int error;
1441
1442 KASSERT(solocked(so));
1443 KASSERT(catch || timo != 0);
1444
1445 lock = so->so_lock;
1446 if (catch)
1447 error = cv_timedwait_sig(&so->so_cv, lock, timo);
1448 else
1449 error = cv_timedwait(&so->so_cv, lock, timo);
1450 if (__predict_false(lock != so->so_lock))
1451 solockretry(so, lock);
1452 return error;
1453 }
1454