uipc_socket2.c revision 1.138.2.1

/*	$NetBSD: uipc_socket2.c,v 1.138.2.1 2021/04/03 22:29:00 thorpej Exp $	*/

/*-
 * Copyright (c) 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.2 (Berkeley) 2/14/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.138.2.1 2021/04/03 22:29:00 thorpej Exp $");

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_mbuftrace.h"
#include "opt_sb_max.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/buf.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/poll.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/kauth.h>
#include <sys/pool.h>
#include <sys/uidinfo.h>

#ifdef DDB
#include <sys/filedesc.h>
#endif

/*
 * Primitive routines for operating on sockets and socket buffers.
 *
 * Connection life-cycle:
 *
 * Normal sequence from the active (originating) side:
 *
 * - soisconnecting() is called during processing of connect() call,
 * - resulting in an eventual call to soisconnected() if/when the
 *   connection is established.
 *
 * When the connection is torn down during processing of disconnect():
 *
 * - soisdisconnecting() is called and,
 * - soisdisconnected() is called when the connection to the peer
 *   is totally severed.
 *
 * The semantics of these routines are such that connectionless protocols
 * can call soisconnected() and soisdisconnected() only, bypassing the
 * in-progress calls when setting up a ``connection'' takes no time.
 *
 * From the passive side, a socket is created with two queues of sockets:
 *
 * - so_q0 (0) for partial connections (i.e. connections in progress)
 * - so_q (1) for connections already made and awaiting user acceptance.
 *
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_q0 by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_q, making it available to accept().
 *
 * If a socket is closed with sockets on either so_q0 or so_q, these
 * sockets are dropped.
 *
 * Locking rules and assumptions:
 *
 * o socket::so_lock can change on the fly.  The low level routines used
 *   to lock sockets are aware of this.  When so_lock is acquired, the
 *   routine locking must check to see if so_lock still points to the
 *   lock that was acquired.  If so_lock has changed in the meantime, the
 *   now irrelevant lock that was acquired must be dropped and the lock
 *   operation retried.  Although not proven here, this is completely safe
 *   on a multiprocessor system, even with relaxed memory ordering, given
 *   the next two rules:
 *
 * o In order to mutate so_lock, the lock pointed to by the current value
 *   of so_lock must be held: i.e., the socket must be held locked by the
 *   changing thread.  The thread must issue membar_exit() to prevent
 *   memory accesses being reordered, and can set so_lock to the desired
 *   value.  If the lock pointed to by the new value of so_lock is not
 *   held by the changing thread, the socket must then be considered
 *   unlocked.
 *
 * o If so_lock is mutated, and the previous lock referred to by so_lock
 *   could still be visible to other threads in the system (e.g. via file
 *   descriptor or protocol-internal reference), then the old lock must
 *   remain valid until the socket and/or protocol control block has been
 *   torn down.
 *
 * o If a socket has a non-NULL so_head value (i.e. is in the process of
 *   connecting), then locking the socket must also lock the socket pointed
 *   to by so_head: their lock pointers must match.
 *
 * o If a socket has connections in progress (so_q, so_q0 not empty) then
 *   locking the socket must also lock the sockets attached to both queues.
 *   Again, their lock pointers must match.
 *
 * o Beyond the initial lock assignment in socreate(), assigning locks to
 *   sockets is the responsibility of the individual protocols / protocol
 *   domains.
 */
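
/*
 * Illustrative sketch (not compiled): the so_lock hand-off described by
 * the rules above, as a hypothetical protocol might perform it.  The
 * function name is an assumption for illustration; only so_lock,
 * mutex_obj_hold() and membar_exit() come from the rules themselves.
 */
#if 0
static void
example_lock_handoff(struct socket *so, kmutex_t *newlock)
{

	KASSERT(solocked(so));		/* must hold the current lock */
	mutex_obj_hold(newlock);	/* retain a reference to the new lock */
	membar_exit();			/* order prior stores before the switch */
	so->so_lock = newlock;		/* lockers will re-check and retry */
	/*
	 * If newlock is not held by this thread, the socket is now
	 * considered unlocked; the old lock must remain valid until
	 * the socket/PCB is torn down (see the rules above).
	 */
}
#endif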

static pool_cache_t	socket_cache;
u_long	sb_max = SB_MAX;	/* maximum socket buffer size */
static u_long	sb_max_adj;	/* adjusted sb_max */

void
soisconnecting(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;
}

void
soisconnected(struct socket *so)
{
	struct socket	*head;

	head = so->so_head;

	KASSERT(solocked(so));
	KASSERT(head == NULL || solocked2(so, head));

	so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTED;
	if (head && so->so_onq == &head->so_q0) {
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			/*
			 * Re-enqueue and wake up any waiters, e.g.
			 * processes blocking on accept().
			 */
			soqremque(so, 0);
			soqinsque(head, so, 1);
			sorwakeup(head);
			cv_broadcast(&head->so_cv);
		} else {
			so->so_upcall =
			    head->so_accf->so_accept_filter->accf_callback;
			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
			so->so_rcv.sb_flags |= SB_UPCALL;
			so->so_options &= ~SO_ACCEPTFILTER;
			(*so->so_upcall)(so, so->so_upcallarg,
			    POLLIN|POLLRDNORM, M_DONTWAIT);
		}
	} else {
		cv_broadcast(&so->so_cv);
		sorwakeup(so);
		sowwakeup(so);
	}
}

void
soisdisconnecting(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	cv_broadcast(&so->so_cv);
	sowwakeup(so);
	sorwakeup(so);
}

void
soisdisconnected(struct socket *so)
{

	KASSERT(solocked(so));

	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	cv_broadcast(&so->so_cv);
	sowwakeup(so);
	sorwakeup(so);
}

void
soinit2(void)
{

	socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
	    "socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
}

/*
 * sonewconn: accept a new connection.
 *
 * When an attempt at a new connection is noted on a socket which accepts
 * connections, sonewconn(9) is called.  If the connection is possible
 * (subject to space constraints, etc) then we allocate a new structure,
 * properly linked into the data structure of the original socket.
 *
 * => If 'soready' is true, then socket will become ready for accept() i.e.
 *    inserted into the so_q queue, SS_ISCONNECTED set and waiters awoken.
 * => May be called from soft-interrupt context.
 * => Listening socket should be locked.
 * => Returns the new socket locked.
 */
struct socket *
sonewconn(struct socket *head, bool soready)
{
	struct socket	*so;
	int		soqueue, error;

	KASSERT(solocked(head));
	if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) {
		/*
		 * Listen queue overflow.  If there is an accept filter
		 * active, pass through the oldest connection it's handling.
		 */
		if (head->so_accf == NULL) {
			return NULL;
		} else {
			struct socket *so2, *next;

			/*
			 * Pass the oldest connection waiting in the
			 * accept filter.
			 */
			for (so2 = TAILQ_FIRST(&head->so_q0);
			     so2 != NULL; so2 = next) {
				next = TAILQ_NEXT(so2, so_qe);
				if (so2->so_upcall == NULL) {
					continue;
				}
				so2->so_upcall = NULL;
				so2->so_upcallarg = NULL;
				so2->so_options &= ~SO_ACCEPTFILTER;
				so2->so_rcv.sb_flags &= ~SB_UPCALL;
				soisconnected(so2);
				break;
			}

			/*
			 * If nothing was nudged out of the accept filter,
			 * bail out; otherwise proceed allocating the socket.
			 */
			if (so2 == NULL) {
				return NULL;
			}
		}
	}
	if ((head->so_options & SO_ACCEPTFILTER) != 0) {
		soready = false;
	}
	soqueue = soready ? 1 : 0;

	if ((so = soget(false)) == NULL) {
		return NULL;
	}
	so->so_type = head->so_type;
	so->so_options = head->so_options & ~SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_send = head->so_send;
	so->so_receive = head->so_receive;
	so->so_uidinfo = head->so_uidinfo;
	so->so_egid = head->so_egid;
	so->so_cpid = head->so_cpid;

	/*
	 * Share the lock with the listening-socket, it may get unshared
	 * once the connection is complete.
	 */
	mutex_obj_hold(head->so_lock);
	so->so_lock = head->so_lock;

	/*
	 * Reserve the space for socket buffers.
	 */
#ifdef MBUFTRACE
	so->so_mowner = head->so_mowner;
	so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
	so->so_snd.sb_mowner = head->so_snd.sb_mowner;
#endif
	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		goto out;
	}
	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
	so->so_snd.sb_flags |= head->so_snd.sb_flags & (SB_AUTOSIZE | SB_ASYNC);

	/*
	 * Finally, perform the protocol attach.  Note: a new socket
	 * lock may be assigned at this point (if so, it will be held).
	 */
	error = (*so->so_proto->pr_usrreqs->pr_attach)(so, 0);
	if (error) {
 out:
		KASSERT(solocked(so));
		KASSERT(so->so_accf == NULL);
		soput(so);

		/* Note: the listening socket shall stay locked. */
		KASSERT(solocked(head));
		return NULL;
	}
	KASSERT(solocked2(head, so));

	/*
	 * Insert into the queue.  If ready, update the connection status
	 * and wake up any waiters, e.g. processes blocking on accept().
	 */
	soqinsque(head, so, soqueue);
	if (soready) {
		so->so_state |= SS_ISCONNECTED;
		sorwakeup(head);
		cv_broadcast(&head->so_cv);
	}
	return so;
}
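
/*
 * Illustrative sketch (not compiled): how a protocol's input path might
 * use sonewconn() when a connection request arrives on a listening
 * socket.  The function name and the elided PCB setup are assumptions
 * for illustration; the call sequence follows the contract above.
 */
#if 0
static struct socket *
example_incoming(struct socket *head)
{
	struct socket *so;

	KASSERT(solocked(head));

	/* Not ready yet: queue on so_q0 until the handshake completes. */
	so = sonewconn(head, false);
	if (so == NULL)
		return NULL;	/* queue overflow or out of resources */

	/* ... protocol-specific setup of the new socket ... */

	/* Later, once the handshake completes: */
	soisconnected(so);	/* moves it to so_q, wakes accept() */
	return so;
}
#endif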

struct socket *
soget(bool waitok)
{
	struct socket *so;

	so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
	if (__predict_false(so == NULL))
		return (NULL);
	memset(so, 0, sizeof(*so));
	TAILQ_INIT(&so->so_q0);
	TAILQ_INIT(&so->so_q);
	cv_init(&so->so_cv, "socket");
	cv_init(&so->so_rcv.sb_cv, "netio");
	cv_init(&so->so_snd.sb_cv, "netio");
	selinit(&so->so_rcv.sb_sel);
	selinit(&so->so_snd.sb_sel);
	so->so_rcv.sb_so = so;
	so->so_snd.sb_so = so;
	return so;
}

void
soput(struct socket *so)
{

	KASSERT(!cv_has_waiters(&so->so_cv));
	KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
	KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
	seldestroy(&so->so_rcv.sb_sel);
	seldestroy(&so->so_snd.sb_sel);
	mutex_obj_free(so->so_lock);
	cv_destroy(&so->so_cv);
	cv_destroy(&so->so_rcv.sb_cv);
	cv_destroy(&so->so_snd.sb_cv);
	pool_cache_put(socket_cache, so);
}

/*
 * soqinsque: insert socket of a new connection into the specified
 * accept queue of the listening socket (head).
 *
 *	q = 0: queue of partial connections
 *	q = 1: queue of incoming connections
 */
void
soqinsque(struct socket *head, struct socket *so, int q)
{
	KASSERT(q == 0 || q == 1);
	KASSERT(solocked2(head, so));
	KASSERT(so->so_onq == NULL);
	KASSERT(so->so_head == NULL);

	so->so_head = head;
	if (q == 0) {
		head->so_q0len++;
		so->so_onq = &head->so_q0;
	} else {
		head->so_qlen++;
		so->so_onq = &head->so_q;
	}
	TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}
/*
 * soqremque: remove socket from the specified queue.
 *
 * => Returns true if socket was removed from the specified queue.
 * => False if socket was not removed (because it was in the other queue).
 */
bool
soqremque(struct socket *so, int q)
{
	struct socket *head = so->so_head;

	KASSERT(q == 0 || q == 1);
	KASSERT(solocked(so));
	KASSERT(so->so_onq != NULL);
	KASSERT(head != NULL);

	if (q == 0) {
		if (so->so_onq != &head->so_q0)
			return false;
		head->so_q0len--;
	} else {
		if (so->so_onq != &head->so_q)
			return false;
		head->so_qlen--;
	}
	KASSERT(solocked2(so, head));
	TAILQ_REMOVE(so->so_onq, so, so_qe);
	so->so_onq = NULL;
	so->so_head = NULL;
	return true;
}
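
/*
 * Illustrative sketch (not compiled): roughly how the accept() path
 * consumes the completed-connection queue using soqremque().  Error
 * handling and state checks of the real accept code are omitted; the
 * function name is an assumption for illustration.
 */
#if 0
static struct socket *
example_dequeue(struct socket *head)
{
	struct socket *so;

	KASSERT(solocked(head));
	if ((so = TAILQ_FIRST(&head->so_q)) == NULL)
		return NULL;		/* nothing ready; caller waits */
	if (!soqremque(so, 1))
		panic("example_dequeue");	/* must have been on so_q */
	return so;	/* shares head's lock (see the rules above) */
}
#endif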

/*
 * socantsendmore: indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, or by the
 * protocol code (e.g. in pr_shutdown()).
 */
void
socantsendmore(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_state |= SS_CANTSENDMORE;
	sowwakeup(so);
}

/*
 * socantrcvmore(): indicates that no more data will be received and
 * will normally be applied to the socket by a protocol when it detects
 * that the peer will send no more data.  Data queued for reading in
 * the socket may yet be read.
 */
void
socantrcvmore(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_state |= SS_CANTRCVMORE;
	sorwakeup(so);
}

/*
 * soroverflow(): indicates that data was attempted to be sent
 * but the receiving buffer overflowed.
 */
void
soroverflow(struct socket *so)
{
	KASSERT(solocked(so));

	so->so_rcv.sb_overflowed++;
	if (so->so_options & SO_RERROR) {
		so->so_rerror = ENOBUFS;
		sorwakeup(so);
	}
}

/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(struct sockbuf *sb)
{
	struct socket *so;
	kmutex_t *lock;
	int error;

	so = sb->sb_so;

	KASSERT(solocked(so));

	sb->sb_flags |= SB_NOTIFY;
	lock = so->so_lock;
	if ((sb->sb_flags & SB_NOINTR) != 0)
		error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
	else
		error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
	if (__predict_false(lock != so->so_lock))
		solockretry(so, lock);
	return error;
}

/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket buffer has the SB_ASYNC flag set.
 */
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
	int band;

	KASSERT(solocked(so));
	KASSERT(sb->sb_so == so);

	if (code == POLL_IN)
		band = POLLIN|POLLRDNORM;
	else
		band = POLLOUT|POLLWRNORM;
	sb->sb_flags &= ~SB_NOTIFY;
	selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
	cv_broadcast(&sb->sb_cv);
	if (sb->sb_flags & SB_ASYNC)
		fownsignal(so->so_pgid, SIGIO, code, band, so);
	if (sb->sb_flags & SB_UPCALL)
		(*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT);
}

/*
 * Reset a socket's lock pointer.  Wake all threads waiting on the
 * socket's condition variables so that they can restart their waits
 * using the new lock.  The existing lock must be held.
 */
void
solockreset(struct socket *so, kmutex_t *lock)
{

	KASSERT(solocked(so));

	so->so_lock = lock;
	cv_broadcast(&so->so_snd.sb_cv);
	cv_broadcast(&so->so_rcv.sb_cv);
	cv_broadcast(&so->so_cv);
}

/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing poll() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The
 * upper level routine soreceive() expects the following conventions to
 * be observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_CONTROL).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
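
/*
 * Illustrative sketch (not compiled): the record conventions above as a
 * datagram protocol might follow them, using sbappendaddr() to prepend
 * the sender's name (an MT_SONAME record) to the data.  The function
 * name and its arguments are assumptions for illustration.
 */
#if 0
static void
example_deliver(struct socket *so, const struct sockaddr *from,
    struct mbuf *m)
{

	KASSERT(solocked(so));
	if (sbappendaddr(&so->so_rcv, from, m, NULL) == 0) {
		m_freem(m);		/* no space: caller frees the data */
		soroverflow(so);	/* account for the drop */
	} else
		sorwakeup(so);		/* notify readers */
}
#endif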

int
sb_max_set(u_long new_sbmax)
{
	int s;

	if (new_sbmax < (16 * 1024))
		return (EINVAL);

	s = splsoftnet();
	sb_max = new_sbmax;
	sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
	splx(s);

	return (0);
}
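
/*
 * Worked example of the adjustment above, assuming the common values
 * MSIZE = 256 and MCLBYTES = 2048 (both vary by architecture): with
 * sb_max = 256 KB, sb_max_adj = 262144 * 2048 / 2304 = 233016 bytes.
 * The scaling accounts for per-cluster mbuf header overhead, so that
 * the mbcnt limit does not become the effective cap when buffering
 * efficiency is near the normal case.
 */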

int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
	KASSERT(so->so_pcb == NULL || solocked(so));

	/*
	 * There's at least one application (a configure script of screen)
	 * which expects a fifo to be writable even if it has "some" bytes
	 * in its buffer, so we want to make sure (hiwat - lowat) >= (some
	 * bytes).
	 *
	 * PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
	 * We expect it's large enough for such applications.
	 */
	u_long lowat = MAX(sock_loan_thresh, MCLBYTES);
	u_long hiwat = lowat + PIPE_BUF;

	if (sndcc < hiwat)
		sndcc = hiwat;
	if (sbreserve(&so->so_snd, sndcc, so) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = lowat;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
 bad2:
	sbrelease(&so->so_snd, so);
 bad:
	return (ENOBUFS);
}

/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
	struct lwp *l = curlwp; /* XXX */
	rlim_t maxcc;
	struct uidinfo *uidinfo;

	KASSERT(so->so_pcb == NULL || solocked(so));
	KASSERT(sb->sb_so == so);
	KASSERT(sb_max_adj != 0);

	if (cc == 0 || cc > sb_max_adj)
		return (0);

	maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;

	uidinfo = so->so_uidinfo;
	if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
		return 0;
	sb->sb_mbmax = uimin(cc * 2, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;

	return (1);
}

/*
 * Free mbufs held by a socket, and reserved mbuf space.  We do not assert
 * that the socket is held locked here: see sorflush().
 */
void
sbrelease(struct sockbuf *sb, struct socket *so)
{

	KASSERT(sb->sb_so == so);

	sbflush(sb);
	(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
	sb->sb_mbmax = 0;
}

/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copym for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */
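
/*
 * Illustrative sketch (not compiled): the usual pattern described above,
 * checking sbspace() before appending to a stream socket's receive
 * buffer.  The function name and the pkthdr-based length accounting are
 * assumptions for illustration.
 */
#if 0
static int
example_enqueue(struct socket *so, struct mbuf *m)
{
	struct sockbuf *sb = &so->so_rcv;

	KASSERT(solocked(so));
	if (m->m_pkthdr.len > sbspace(sb))
		return ENOBUFS;		/* caller keeps/frees m */
	sbappend(sb, m);		/* extends the last record */
	sorwakeup(so);
	return 0;
}
#endif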

#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;

	KASSERT(solocked(sb->sb_so));

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	if (m != sb->sb_lastrecord) {
		printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
		    sb->sb_mb, sb->sb_lastrecord, m);
		printf("packet chain:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
			printf("\t%p\n", m);
		panic("sblastrecordchk from %s", where);
	}
}

void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
	struct mbuf *m = sb->sb_mb;
	struct mbuf *n;

	KASSERT(solocked(sb->sb_so));

	while (m && m->m_nextpkt)
		m = m->m_nextpkt;

	while (m && m->m_next)
		m = m->m_next;

	if (m != sb->sb_mbtail) {
		printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
		    sb->sb_mb, sb->sb_mbtail, m);
		printf("packet tree:\n");
		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
			printf("\t");
			for (n = m; n != NULL; n = n->m_next)
				printf("%p ", n);
			printf("\n");
		}
		panic("sblastmbufchk from %s", where);
	}
}
#endif /* SOCKBUF_DEBUG */

/*
 * Link a chain of records onto a socket buffer
 */
#define	SBLINKRECORDCHAIN(sb, m0, mlast)				\
do {									\
	if ((sb)->sb_lastrecord != NULL)				\
		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
	else								\
		(sb)->sb_mb = (m0);					\
	(sb)->sb_lastrecord = (mlast);					\
} while (/*CONSTCOND*/0)


#define	SBLINKRECORD(sb, m0)						\
    SBLINKRECORDCHAIN(sb, m0, m0)

/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with the
 * mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
	struct mbuf	*n;

	KASSERT(solocked(sb->sb_so));

	if (m == NULL)
		return;

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	SBLASTRECORDCHK(sb, "sbappend 1");

	if ((n = sb->sb_lastrecord) != NULL) {
		/*
		 * XXX Would like to simply use sb_mbtail here, but
		 * XXX I need to verify that I won't miss an EOR that
		 * XXX way.
		 */
		do {
			if (n->m_flags & M_EOR) {
				sbappendrecord(sb, m); /* XXXXXX!!!! */
				return;
			}
		} while (n->m_next && (n = n->m_next));
	} else {
		/*
		 * If this is the first record in the socket buffer, it's
		 * also the last record.
		 */
		sb->sb_lastrecord = m;
	}
	sbcompress(sb, m, n);
	SBLASTRECORDCHK(sb, "sbappend 2");
}

/*
 * This version of sbappend() should only be used when the caller
 * absolutely knows that there will never be more than one record
 * in the socket buffer, that is, a stream protocol (such as TCP).
 */
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{

	KASSERT(solocked(sb->sb_so));
	KDASSERT(m->m_nextpkt == NULL);
	KASSERT(sb->sb_mb == sb->sb_lastrecord);

	SBLASTMBUFCHK(sb, __func__);

#ifdef MBUFTRACE
	m_claimm(m, sb->sb_mowner);
#endif

	sbcompress(sb, m, sb->sb_mbtail);

	sb->sb_lastrecord = sb->sb_mb;
	SBLASTRECORDCHK(sb, __func__);
}

#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
	struct mbuf	*m, *m2;
	u_long		len, mbcnt;

	KASSERT(solocked(sb->sb_so));

	len = 0;
	mbcnt = 0;
	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
		for (m2 = m; m2 != NULL; m2 = m2->m_next) {
			len += m2->m_len;
			mbcnt += MSIZE;
			if (m2->m_flags & M_EXT)
				mbcnt += m2->m_ext.ext_size;
			if (m2->m_nextpkt != NULL)
				panic("sbcheck nextpkt");
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
		panic("sbcheck");
	}
}
#endif

/*
 * As above, except the mbuf chain
 * begins a new record.
 */
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf	*m;

	KASSERT(solocked(sb->sb_so));

	if (m0 == NULL)
		return;

#ifdef MBUFTRACE
	m_claimm(m0, sb->sb_mowner);
#endif
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 1");
	SBLINKRECORD(sb, m0);
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbappendrecord 2");
}

/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
	struct mbuf	*m, **mp;

	KASSERT(solocked(sb->sb_so));

	if (m0 == NULL)
		return;

	SBLASTRECORDCHK(sb, "sbinsertoob 1");

	for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			if ((m = m->m_next) != NULL)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	if (*mp == NULL) {
		/* m0 is actually the new tail */
		sb->sb_lastrecord = m0;
	}
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	sbcompress(sb, m, m0);
	SBLASTRECORDCHK(sb, "sbinsertoob 2");
}

/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
	struct mbuf *control)
{
	struct mbuf	*m, *n, *nlast;
	int		space, len;

	KASSERT(solocked(sb->sb_so));

	space = asa->sa_len;

	if (m0 != NULL) {
		if ((m0->m_flags & M_PKTHDR) == 0)
			panic("sbappendaddr");
		space += m0->m_pkthdr.len;
#ifdef MBUFTRACE
		m_claimm(m0, sb->sb_mowner);
#endif
	}
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		MCLAIM(n, sb->sb_mowner);
		if (n->m_next == NULL)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	m = m_get(M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return (0);
	MCLAIM(m, sb->sb_mowner);
	/*
	 * XXX avoid 'comparison always true' warning which isn't easily
	 * avoided.
	 */
	len = asa->sa_len;
	if (len > MLEN) {
		MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (0);
		}
	}
	m->m_len = asa->sa_len;
	memcpy(mtod(m, void *), asa, asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;

	SBLASTRECORDCHK(sb, "sbappendaddr 1");

	for (n = m; n->m_next != NULL; n = n->m_next)
		sballoc(sb, n);
	sballoc(sb, n);
	nlast = n;
	SBLINKRECORD(sb, m);

	sb->sb_mbtail = nlast;
	SBLASTMBUFCHK(sb, "sbappendaddr");
	SBLASTRECORDCHK(sb, "sbappendaddr 2");

	return (1);
}

/*
 * Helper for sbappendaddrchain: prepend a struct sockaddr* to
 * an mbuf chain.
 */
static inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
	const struct sockaddr *asa)
{
	struct mbuf *m;
	const int salen = asa->sa_len;

	KASSERT(solocked(sb->sb_so));

	/* only the first in each chain need be a pkthdr */
	m = m_gethdr(M_DONTWAIT, MT_SONAME);
	if (m == NULL)
		return NULL;
	MCLAIM(m, sb->sb_mowner);
#ifdef notyet
	if (salen > MHLEN) {
		MEXTMALLOC(m, salen, M_NOWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
#else
	KASSERT(salen <= MHLEN);
#endif
	m->m_len = salen;
	memcpy(mtod(m, void *), asa, salen);
	m->m_next = m0;
	m->m_pkthdr.len = salen + m0->m_pkthdr.len;

	return m;
}

int
sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
	struct mbuf *m0, int sbprio)
{
	struct mbuf *m, *n, *n0, *nlast;
	int error;

	KASSERT(solocked(sb->sb_so));

	/*
	 * XXX sbprio reserved for encoding priority of this request:
	 *  SB_PRIO_NONE --> honour normal sb limits
	 *  SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
	 *	take whole chain.  Intended for large requests
	 *	that should be delivered atomically (all, or none).
	 *  SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
	 *	over normal socket limits, for messages indicating
	 *	buffer overflow in earlier normal/lower-priority messages
	 *  SB_PRIO_BESTEFFORT --> ignore limits entirely.
	 *	Intended for kernel-generated messages only.
	 *	Up to generator to avoid total mbuf resource exhaustion.
	 */
	(void)sbprio;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrchain");

#ifdef notyet
	space = sbspace(sb);

	/*
	 * Enforce SB_PRIO_* limits as described above.
	 */
#endif

	n0 = NULL;
	nlast = NULL;
	for (m = m0; m; m = m->m_nextpkt) {
		struct mbuf *np;

#ifdef MBUFTRACE
		m_claimm(m, sb->sb_mowner);
#endif

		/* Prepend sockaddr to this record (m) of input chain m0 */
		n = m_prepend_sockaddr(sb, m, asa);
		if (n == NULL) {
			error = ENOBUFS;
			goto bad;
		}

		/* Append record (asa+m) to end of new chain n0 */
		if (n0 == NULL) {
			n0 = n;
		} else {
			nlast->m_nextpkt = n;
		}
		/* Keep track of last record on new chain */
		nlast = n;

		for (np = n; np; np = np->m_next)
			sballoc(sb, np);
	}

	SBLASTRECORDCHK(sb, "sbappendaddrchain 1");

	/* Drop the entire chain of (asa+m) records onto the socket */
	SBLINKRECORDCHAIN(sb, n0, nlast);

	SBLASTRECORDCHK(sb, "sbappendaddrchain 2");

	for (m = nlast; m->m_next; m = m->m_next)
		;
	sb->sb_mbtail = m;
	SBLASTMBUFCHK(sb, "sbappendaddrchain");

	return (1);

 bad:
	/*
	 * On error, free the prepended addresses.  For consistency
	 * with sbappendaddr(), leave it to our caller to free
	 * the input record chain passed to us as m0.
	 */
	while ((n = n0) != NULL) {
		struct mbuf *np;

		/* Undo the sballoc() of this record */
		for (np = n; np; np = np->m_next)
			sbfree(sb, np);

		n0 = n->m_nextpkt;	/* iterate at next prepended address */
		np = m_free(n);		/* free prepended address (not data) */
	}
	return error;
}


int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
	struct mbuf	*m, *mlast, *n;
	int		space;

	KASSERT(solocked(sb->sb_so));

	space = 0;
	if (control == NULL)
		panic("sbappendcontrol");
	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		MCLAIM(m, sb->sb_mowner);
		if (m->m_next == NULL)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next) {
		MCLAIM(m, sb->sb_mowner);
		space += m->m_len;
	}
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */

	SBLASTRECORDCHK(sb, "sbappendcontrol 1");

	for (m = control; m->m_next != NULL; m = m->m_next)
		sballoc(sb, m);
	sballoc(sb, m);
	mlast = m;
	SBLINKRECORD(sb, control);

	sb->sb_mbtail = mlast;
	SBLASTMBUFCHK(sb, "sbappendcontrol");
	SBLASTRECORDCHK(sb, "sbappendcontrol 2");

	return (1);
}

/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
	int		eor;
	struct mbuf	*o;

	KASSERT(solocked(sb->sb_so));

	eor = 0;
	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			if (sb->sb_lastrecord == m)
				sb->sb_lastrecord = m->m_next;
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    /* M_TRAILINGSPACE() checks buffer writeability */
		    m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sb->sb_mbtail = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	SBLASTMBUFCHK(sb, __func__);
}

/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(struct sockbuf *sb)
{

	KASSERT(solocked(sb->sb_so));
	KASSERT((sb->sb_flags & SB_LOCK) == 0);

	while (sb->sb_mbcnt)
		sbdrop(sb, (int)sb->sb_cc);

	KASSERT(sb->sb_cc == 0);
	KASSERT(sb->sb_mb == NULL);
	KASSERT(sb->sb_mbtail == NULL);
	KASSERT(sb->sb_lastrecord == NULL);
}

/*
 * Drop data from (the front of) a sockbuf.
 */
void
sbdrop(struct sockbuf *sb, int len)
{
	struct mbuf	*m, *next;

	KASSERT(solocked(sb->sb_so));

	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
	while (len > 0) {
		if (m == NULL) {
			if (next == NULL)
				panic("sbdrop(%p,%d): cc=%lu",
				    sb, len, sb->sb_cc);
			m = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		m = m_free(m);
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		m = m_free(m);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;
	/*
	 * First part is an inline SB_EMPTY_FIXUP().  Second part
	 * makes sure sb_lastrecord is up-to-date if we dropped
	 * part of the last record.
	 */
	m = sb->sb_mb;
	if (m == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (m->m_nextpkt == NULL)
		sb->sb_lastrecord = m;
}

/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(struct sockbuf *sb)
{
	struct mbuf	*m, *mn;

	KASSERT(solocked(sb->sb_so));

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			mn = m_free(m);
		} while ((m = mn) != NULL);
	}
	SB_EMPTY_FIXUP(sb);
}

/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol1(void **p, int size, int type, int level, int flags)
{
	struct cmsghdr	*cp;
	struct mbuf	*m;
	int space = CMSG_SPACE(size);

	if ((flags & M_DONTWAIT) && space > MCLBYTES) {
		printf("%s: message too large %d\n", __func__, space);
		return NULL;
	}

	if ((m = m_get(flags, MT_CONTROL)) == NULL)
		return NULL;
	if (space > MLEN) {
		if (space > MCLBYTES)
			MEXTMALLOC(m, space, M_WAITOK);
		else
			MCLGET(m, flags);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return NULL;
		}
	}
	cp = mtod(m, struct cmsghdr *);
	*p = CMSG_DATA(cp);
	m->m_len = space;
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;

	memset(cp + 1, 0, CMSG_LEN(0) - sizeof(*cp));
	memset((uint8_t *)*p + size, 0, CMSG_ALIGN(size) - size);

	return m;
}

struct mbuf *
sbcreatecontrol(void *p, int size, int type, int level)
{
	struct mbuf *m;
	void *v;

	m = sbcreatecontrol1(&v, size, type, level, M_DONTWAIT);
	if (m == NULL)
		return NULL;
	memcpy(v, p, size);
	return m;
}
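
/*
 * Illustrative sketch (not compiled): building a control message with
 * sbcreatecontrol(), roughly as a protocol would when passing ancillary
 * data (here a timestamp) toward the receive queue.  The function name
 * is an assumption for illustration.
 */
#if 0
static struct mbuf *
example_timestamp_control(void)
{
	struct timeval tv;

	microtime(&tv);
	/* Wrap tv in a cmsghdr of type SCM_TIMESTAMP at level SOL_SOCKET. */
	return sbcreatecontrol(&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET);
}
#endif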

void
solockretry(struct socket *so, kmutex_t *lock)
{

	while (lock != so->so_lock) {
		mutex_exit(lock);
		lock = so->so_lock;
		mutex_enter(lock);
	}
}

bool
solocked(const struct socket *so)
{

	return mutex_owned(so->so_lock);
}

bool
solocked2(const struct socket *so1, const struct socket *so2)
{
	const kmutex_t *lock;

	lock = so1->so_lock;
	if (lock != so2->so_lock)
		return false;
	return mutex_owned(lock);
}

/*
 * sosetlock: assign a default lock to a new socket.
 */
void
sosetlock(struct socket *so)
{
	if (so->so_lock == NULL) {
		kmutex_t *lock = softnet_lock;

		so->so_lock = lock;
		mutex_obj_hold(lock);
		mutex_enter(lock);
	}
	KASSERT(solocked(so));
}

/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	struct socket *so;
	kmutex_t *lock;
	int error;

	KASSERT(solocked(sb->sb_so));

	for (;;) {
		if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
			sb->sb_flags |= SB_LOCK;
			return 0;
		}
		if (wf != M_WAITOK)
			return EWOULDBLOCK;
		so = sb->sb_so;
		lock = so->so_lock;
		if ((sb->sb_flags & SB_NOINTR) != 0) {
			cv_wait(&so->so_cv, lock);
			error = 0;
		} else
			error = cv_wait_sig(&so->so_cv, lock);
		if (__predict_false(lock != so->so_lock))
			solockretry(so, lock);
		if (error != 0)
			return error;
	}
}

void
sbunlock(struct sockbuf *sb)
{
	struct socket *so;

	so = sb->sb_so;

	KASSERT(solocked(so));
	KASSERT((sb->sb_flags & SB_LOCK) != 0);

	sb->sb_flags &= ~SB_LOCK;
	cv_broadcast(&so->so_cv);
}
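
/*
 * Illustrative sketch (not compiled): the sblock()/sbunlock() pairing
 * around an operation that must see a stable receive buffer, e.g. a
 * hypothetical drain pass.  M_WAITOK requests a sleeping acquisition;
 * the function name is an assumption for illustration.
 */
#if 0
static int
example_drain(struct socket *so)
{
	int error;

	solock(so);
	error = sblock(&so->so_rcv, M_WAITOK);
	if (error == 0) {
		/* ... operate on so->so_rcv while SB_LOCK is held ... */
		sbunlock(&so->so_rcv);
	}
	sounlock(so);
	return error;
}
#endif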

int
sowait(struct socket *so, bool catch_p, int timo)
{
	kmutex_t *lock;
	int error;

	KASSERT(solocked(so));
	KASSERT(catch_p || timo != 0);

	lock = so->so_lock;
	if (catch_p)
		error = cv_timedwait_sig(&so->so_cv, lock, timo);
	else
		error = cv_timedwait(&so->so_cv, lock, timo);
	if (__predict_false(lock != so->so_lock))
		solockretry(so, lock);
	return error;
}

#ifdef DDB

/*
 * Currently, sofindproc() is used only from DDB.  It could be used
 * from elsewhere by way of db_mutex_enter().
 */

static inline int
db_mutex_enter(kmutex_t *mtx)
{
	extern int db_active;
	int rv;

	if (!db_active) {
		mutex_enter(mtx);
		rv = 1;
	} else
		rv = mutex_tryenter(mtx);

	return rv;
}

int
sofindproc(struct socket *so, int all, void (*pr)(const char *, ...))
{
	proc_t *p;
	filedesc_t *fdp;
	fdtab_t *dt;
	fdfile_t *ff;
	file_t *fp = NULL;
	int found = 0;
	int i, t;

	if (so == NULL)
		return 0;

	t = db_mutex_enter(&proc_lock);
	if (!t) {
		pr("could not acquire proc_lock mutex\n");
		return 0;
	}
	PROCLIST_FOREACH(p, &allproc) {
		if (p->p_stat == SIDL)
			continue;
		fdp = p->p_fd;
		t = db_mutex_enter(&fdp->fd_lock);
		if (!t) {
			pr("could not acquire fd_lock mutex\n");
			continue;
		}
		dt = atomic_load_consume(&fdp->fd_dt);
		for (i = 0; i < dt->dt_nfiles; i++) {
			ff = dt->dt_ff[i];
			if (ff == NULL)
				continue;

			fp = atomic_load_consume(&ff->ff_file);
			if (fp == NULL)
				continue;

			t = db_mutex_enter(&fp->f_lock);
			if (!t) {
				pr("could not acquire f_lock mutex\n");
				continue;
			}
			if ((struct socket *)fp->f_data != so) {
				mutex_exit(&fp->f_lock);
				continue;
			}
			found++;
			if (pr)
				pr("socket %p: owner %s(pid=%d)\n",
				    so, p->p_comm, p->p_pid);
			mutex_exit(&fp->f_lock);
			if (all == 0)
				break;
		}
		mutex_exit(&fdp->fd_lock);
		if (all == 0 && found != 0)
			break;
	}
	mutex_exit(&proc_lock);

	return found;
}

void
socket_print(const char *modif, void (*pr)(const char *, ...))
{
	file_t *fp;
	struct socket *so;
	struct sockbuf *sb_snd, *sb_rcv;
	struct mbuf *m_rec, *m;
	bool opt_v = false;
	bool opt_m = false;
	bool opt_a = false;
	bool opt_p = false;
	int nrecs, nmbufs;
	char ch;
	const char *family;

	while ((ch = *(modif++)) != '\0') {
		switch (ch) {
		case 'v':
			opt_v = true;
			break;
		case 'm':
			opt_m = true;
			break;
		case 'a':
			opt_a = true;
			break;
		case 'p':
			opt_p = true;
			break;
		}
	}
	if (opt_v == false && pr)
		(pr)("Ignore empty sockets. use /v to print all.\n");
	if (opt_p == true && pr)
		(pr)("Don't search owner process.\n");

	LIST_FOREACH(fp, &filehead, f_list) {
		if (fp->f_type != DTYPE_SOCKET)
			continue;
		so = (struct socket *)fp->f_data;
		if (so == NULL)
			continue;

		if (so->so_proto->pr_domain->dom_family == AF_INET)
			family = "INET";
#ifdef INET6
		else if (so->so_proto->pr_domain->dom_family == AF_INET6)
			family = "INET6";
#endif
		else if (so->so_proto->pr_domain->dom_family == pseudo_AF_KEY)
			family = "KEY";
		else if (so->so_proto->pr_domain->dom_family == AF_ROUTE)
			family = "ROUTE";
		else
			continue;

		sb_snd = &so->so_snd;
		sb_rcv = &so->so_rcv;

		if (opt_v != true &&
		    sb_snd->sb_cc == 0 && sb_rcv->sb_cc == 0)
			continue;

		pr("---SOCKET %p: type %s\n", so, family);
		if (opt_p != true)
			sofindproc(so, opt_a == true ? 1 : 0, pr);
		pr("Send Buffer Bytes: %d [bytes]\n", sb_snd->sb_cc);
		pr("Send Buffer mbufs:\n");
		m_rec = m = sb_snd->sb_mb;
		nrecs = 0;
		nmbufs = 0;
		while (m_rec) {
			nrecs++;
			if (opt_m == true)
				pr(" mbuf chain %p\n", m_rec);
			while (m) {
				nmbufs++;
				m = m->m_next;
			}
			m_rec = m = m_rec->m_nextpkt;
		}
		pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs);

		pr("Recv Buffer Usage: %d [bytes]\n", sb_rcv->sb_cc);
		pr("Recv Buffer mbufs:\n");
		m_rec = m = sb_rcv->sb_mb;
		nrecs = 0;
		nmbufs = 0;
		while (m_rec) {
			nrecs++;
			if (opt_m == true)
				pr(" mbuf chain %p\n", m_rec);
			while (m) {
				nmbufs++;
				m = m->m_next;
			}
			m_rec = m = m_rec->m_nextpkt;
		}
		pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs);
	}
}
#endif /* DDB */