Home | History | Annotate | Line # | Download | only in sys
      1 /*	$NetBSD: mbuf.h,v 1.240 2024/05/12 10:34:56 rillig Exp $	*/
      2 
      3 /*
      4  * Copyright (c) 1996, 1997, 1999, 2001, 2007 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
      9  * NASA Ames Research Center and Matt Thomas of 3am Software Foundry.
     10  *
     11  * Redistribution and use in source and binary forms, with or without
     12  * modification, are permitted provided that the following conditions
     13  * are met:
     14  * 1. Redistributions of source code must retain the above copyright
     15  *    notice, this list of conditions and the following disclaimer.
     16  * 2. Redistributions in binary form must reproduce the above copyright
     17  *    notice, this list of conditions and the following disclaimer in the
     18  *    documentation and/or other materials provided with the distribution.
     19  *
     20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     30  * POSSIBILITY OF SUCH DAMAGE.
     31  */
     32 
     33 /*
     34  * Copyright (c) 1982, 1986, 1988, 1993
     35  *	The Regents of the University of California.  All rights reserved.
     36  *
     37  * Redistribution and use in source and binary forms, with or without
     38  * modification, are permitted provided that the following conditions
     39  * are met:
     40  * 1. Redistributions of source code must retain the above copyright
     41  *    notice, this list of conditions and the following disclaimer.
     42  * 2. Redistributions in binary form must reproduce the above copyright
     43  *    notice, this list of conditions and the following disclaimer in the
     44  *    documentation and/or other materials provided with the distribution.
     45  * 3. Neither the name of the University nor the names of its contributors
     46  *    may be used to endorse or promote products derived from this software
     47  *    without specific prior written permission.
     48  *
     49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     59  * SUCH DAMAGE.
     60  *
     61  *	@(#)mbuf.h	8.5 (Berkeley) 2/19/95
     62  */
     63 
     64 #ifndef _SYS_MBUF_H_
     65 #define _SYS_MBUF_H_
     66 
     67 #ifdef _KERNEL_OPT
     68 #include "opt_mbuftrace.h"
     69 #endif
     70 
     71 #ifndef M_WAITOK
     72 #include <sys/malloc.h>
     73 #endif
     74 #include <sys/pool.h>
     75 #include <sys/queue.h>
     76 #if defined(_KERNEL)
     77 #include <sys/percpu_types.h>
     78 #include <sys/socket.h>	/* for AF_UNSPEC */
     79 #include <sys/psref.h>
     80 #endif /* defined(_KERNEL) */
     81 
     82 /* For offsetof() */
     83 #if defined(_KERNEL) || defined(_STANDALONE)
     84 #include <sys/systm.h>
     85 #else
     86 #include <stddef.h>
     87 #endif
     88 
     89 #include <uvm/uvm_param.h>	/* for MIN_PAGE_SIZE */
     90 
     91 #include <net/if.h>
     92 
     93 /*
     94  * Mbufs are of a single size, MSIZE (machine/param.h), which
     95  * includes overhead.  An mbuf may add a single "mbuf cluster" of size
     96  * MCLBYTES (also in machine/param.h), which has no additional overhead
     97  * and is used instead of the internal data area; this is done when
     98  * at least MINCLSIZE of data must be stored.
     99  */
    100 
/*
 * Packet tag structure.  Tags are per-packet metadata chained off
 * struct pkthdr via an SLIST; m_tag_id identifies the tag type (see
 * the PACKET_TAG_* constants below).
 */
struct m_tag {
	SLIST_ENTRY(m_tag)	m_tag_link;	/* List of packet tags */
	uint16_t		m_tag_id;	/* Tag ID (PACKET_TAG_*) */
	uint16_t		m_tag_len;	/* Length of data */
};
    107 
/*
 * mbuf ownership structure, used by the MBUFTRACE allocation-tracing
 * facility (see the mowner_* routines below) to attribute mbufs to
 * the subsystem that owns them.
 */
struct mowner {
	char mo_name[16];		/* owner name (fxp0) */
	char mo_descr[16];		/* owner description (input) */
	LIST_ENTRY(mowner) mo_link;	/* link on the global owner list */
	struct percpu *mo_counters;	/* per-CPU counters; see struct mowner_counter */
};

/* Static initializer for a struct mowner: name and description only. */
#define MOWNER_INIT(x, y) { .mo_name = x, .mo_descr = y }

/* Indices into a per-owner counter array. */
enum mowner_counter_index {
	MOWNER_COUNTER_CLAIMS,		/* # of small mbuf claimed */
	MOWNER_COUNTER_RELEASES,	/* # of small mbuf released */
	MOWNER_COUNTER_CLUSTER_CLAIMS,	/* # of cluster mbuf claimed */
	MOWNER_COUNTER_CLUSTER_RELEASES,/* # of cluster mbuf released */
	MOWNER_COUNTER_EXT_CLAIMS,	/* # of M_EXT mbuf claimed */
	MOWNER_COUNTER_EXT_RELEASES,	/* # of M_EXT mbuf released */

	MOWNER_COUNTER_NCOUNTERS,	/* number of counters per owner */
};
    128 
#if defined(_KERNEL)
/* One per-CPU counter block for a single mbuf owner. */
struct mowner_counter {
	u_long mc_counter[MOWNER_COUNTER_NCOUNTERS];
};
#endif

/* userland-exported version of struct mowner */
struct mowner_user {
	char mo_name[16];		/* owner name (fxp0) */
	char mo_descr[16];		/* owner description (input) */
	LIST_ENTRY(mowner) mo_link;	/* unused padding; for compatibility */
	u_long mo_counter[MOWNER_COUNTER_NCOUNTERS]; /* counters */
};

/*
 * Macros for type conversion
 * mtod(m,t) -	convert mbuf pointer to data pointer of correct type
 */
#define mtod(m, t)	((t)((m)->m_data))
    148 
/*
 * Header at the beginning of each mbuf.  Callers normally use the
 * m_next/m_len/... accessor macros defined below rather than naming
 * these fields directly.
 */
struct m_hdr {
	struct	mbuf *mh_next;		/* next buffer in chain */
	struct	mbuf *mh_nextpkt;	/* next chain in queue/record */
	char	*mh_data;		/* location of data */
	struct	mowner *mh_owner;	/* mbuf owner (MBUFTRACE) */
	int	mh_len;			/* amount of data in this mbuf */
	int	mh_flags;		/* flags; see below */
	paddr_t	mh_paddr;		/* physical address of mbuf */
	short	mh_type;		/* type of data in this mbuf (MT_*) */
};
    160 
    161 /*
    162  * record/packet header in first mbuf of chain; valid if M_PKTHDR set
    163  *
    164  * A note about csum_data:
    165  *
    166  *  o For the out-bound direction, the low 16 bits indicates the offset after
    167  *    the L4 header where the final L4 checksum value is to be stored and the
    168  *    high 16 bits is the length of the L3 header (the start of the data to
    169  *    be checksummed).
    170  *
    171  *  o For the in-bound direction, it is only valid if the M_CSUM_DATA flag is
    172  *    set. In this case, an L4 checksum has been calculated by hardware and
    173  *    is stored in csum_data, but it is up to software to perform final
    174  *    verification.
    175  *
    176  * Note for in-bound TCP/UDP checksums: we expect the csum_data to NOT
    177  * be bit-wise inverted (the final step in the calculation of an IP
    178  * checksum) -- this is so we can accumulate the checksum for fragmented
    179  * packets during reassembly.
    180  *
    181  * Size ILP32: 40
    182  *       LP64: 56
    183  */
    184 struct pkthdr {
    185 	union {
    186 		void		*ctx;		/* for M_GETCTX/M_SETCTX */
    187 		if_index_t	index;		/* rcv interface index */
    188 	} _rcvif;
    189 #define rcvif_index		_rcvif.index
    190 	SLIST_HEAD(packet_tags, m_tag) tags;	/* list of packet tags */
    191 	int		len;			/* total packet length */
    192 	int		csum_flags;		/* checksum flags */
    193 	uint32_t	csum_data;		/* checksum data */
    194 	u_int		segsz;			/* segment size */
    195 	uint16_t	ether_vtag;		/* ethernet 802.1p+q vlan tag */
    196 	uint16_t	pkthdr_flags;		/* flags for pkthdr, see blow */
    197 #define PKTHDR_FLAG_IPSEC_SKIP_PFIL	0x0001	/* skip pfil_run_hooks() after ipsec decrypt */
    198 
    199 	/*
    200 	 * Following three fields are open-coded struct altq_pktattr
    201 	 * to rearrange struct pkthdr fields flexibly.
    202 	 */
    203 	int	pattr_af;		/* ALTQ: address family */
    204 	void	*pattr_class;		/* ALTQ: sched class set by classifier */
    205 	void	*pattr_hdr;		/* ALTQ: saved header position in mbuf */
    206 };
    207 
/* Checksumming flags (csum_flags). */
#define M_CSUM_TCPv4		0x00000001	/* TCP header/payload */
#define M_CSUM_UDPv4		0x00000002	/* UDP header/payload */
#define M_CSUM_TCP_UDP_BAD	0x00000004	/* TCP/UDP checksum bad */
#define M_CSUM_DATA		0x00000008	/* consult csum_data */
#define M_CSUM_TCPv6		0x00000010	/* IPv6 TCP header/payload */
#define M_CSUM_UDPv6		0x00000020	/* IPv6 UDP header/payload */
#define M_CSUM_IPv4		0x00000040	/* IPv4 header */
#define M_CSUM_IPv4_BAD		0x00000080	/* IPv4 header checksum bad */
#define M_CSUM_TSOv4		0x00000100	/* TCPv4 segmentation offload */
#define M_CSUM_TSOv6		0x00000200	/* TCPv6 segmentation offload */

/* Checksum-assist quirks: keep separate from jump-table bits. */
#define M_CSUM_BLANK		0x40000000	/* csum is missing */
#define M_CSUM_NO_PSEUDOHDR	0x80000000	/* Rx csum_data does not include
						 * the UDP/TCP pseudo-hdr, and
						 * is not yet 1s-complemented.
						 */

/* Bit-name string for snprintb(3)-style decoding of csum_flags. */
#define M_CSUM_BITS \
    "\20\1TCPv4\2UDPv4\3TCP_UDP_BAD\4DATA\5TCPv6\6UDPv6\7IPv4\10IPv4_BAD" \
    "\11TSOv4\12TSOv6\37BLANK\40NO_PSEUDOHDR"
    230 
    231 /*
    232  * Macros for manipulating csum_data on outgoing packets. These are
    233  * used to pass information down from the L4/L3 to the L2.
    234  *
    235  *   _IPHL:   Length of the IPv{4/6} header, plus the options; in other
    236  *            words the offset of the UDP/TCP header in the packet.
    237  *   _OFFSET: Offset of the checksum field in the UDP/TCP header.
    238  */
    239 #define M_CSUM_DATA_IPv4_IPHL(x)	((x) >> 16)
    240 #define M_CSUM_DATA_IPv4_OFFSET(x)	((x) & 0xffff)
    241 #define M_CSUM_DATA_IPv6_IPHL(x)	((x) >> 16)
    242 #define M_CSUM_DATA_IPv6_OFFSET(x)	((x) & 0xffff)
    243 #define M_CSUM_DATA_IPv6_SET(x, v)	(x) = ((x) & 0xffff) | ((v) << 16)
    244 
    245 /*
    246  * Max # of pages we can attach to m_ext.  This is carefully chosen
    247  * to be able to handle SOSEND_LOAN_CHUNK with our minimum sized page.
    248  */
    249 #ifdef MIN_PAGE_SIZE
    250 #define M_EXT_MAXPAGES		((65536 / MIN_PAGE_SIZE) + 1)
    251 #endif
    252 
    253 /*
    254  * Description of external storage mapped into mbuf, valid if M_EXT set.
    255  */
    256 struct _m_ext_storage {
    257 	unsigned int ext_refcnt;
    258 	char *ext_buf;			/* start of buffer */
    259 	void (*ext_free)		/* free routine if not the usual */
    260 		(struct mbuf *, void *, size_t, void *);
    261 	void *ext_arg;			/* argument for ext_free */
    262 	size_t ext_size;		/* size of buffer, for ext_free */
    263 
    264 	union {
    265 		/* M_EXT_CLUSTER: physical address */
    266 		paddr_t extun_paddr;
    267 #ifdef M_EXT_MAXPAGES
    268 		/* M_EXT_PAGES: pages */
    269 		struct vm_page *extun_pgs[M_EXT_MAXPAGES];
    270 #endif
    271 	} ext_un;
    272 #define ext_paddr	ext_un.extun_paddr
    273 #define ext_pgs		ext_un.extun_pgs
    274 };
    275 
    276 struct _m_ext {
    277 	struct mbuf *ext_ref;
    278 	struct _m_ext_storage ext_storage;
    279 };
    280 
    281 #define M_PADDR_INVALID		POOL_PADDR_INVALID
    282 
    283 /*
    284  * Definition of "struct mbuf".
    285  * Don't change this without understanding how MHLEN/MLEN are defined.
    286  */
    287 #define MBUF_DEFINE(name, mhlen, mlen)					\
    288 	struct name {							\
    289 		struct m_hdr m_hdr;					\
    290 		union {							\
    291 			struct {					\
    292 				struct pkthdr MH_pkthdr;		\
    293 				union {					\
    294 					struct _m_ext MH_ext;		\
    295 					char MH_databuf[(mhlen)];	\
    296 				} MH_dat;				\
    297 			} MH;						\
    298 			char M_databuf[(mlen)];				\
    299 		} M_dat;						\
    300 	}
    301 #define m_next		m_hdr.mh_next
    302 #define m_len		m_hdr.mh_len
    303 #define m_data		m_hdr.mh_data
    304 #define m_owner		m_hdr.mh_owner
    305 #define m_type		m_hdr.mh_type
    306 #define m_flags		m_hdr.mh_flags
    307 #define m_nextpkt	m_hdr.mh_nextpkt
    308 #define m_paddr		m_hdr.mh_paddr
    309 #define m_pkthdr	M_dat.MH.MH_pkthdr
    310 #define m_ext_storage	M_dat.MH.MH_dat.MH_ext.ext_storage
    311 #define m_ext_ref	M_dat.MH.MH_dat.MH_ext.ext_ref
    312 #define m_ext		m_ext_ref->m_ext_storage
    313 #define m_pktdat	M_dat.MH.MH_dat.MH_databuf
    314 #define m_dat		M_dat.M_databuf
    315 
    316 /*
    317  * Dummy mbuf structure to calculate the right values for MLEN/MHLEN, taking
    318  * into account inter-structure padding.
    319  */
    320 MBUF_DEFINE(_mbuf_dummy, 1, 1);
    321 
    322 /* normal data len */
    323 #define MLEN		((int)(MSIZE - offsetof(struct _mbuf_dummy, m_dat)))
    324 /* data len w/pkthdr */
    325 #define MHLEN		((int)(MSIZE - offsetof(struct _mbuf_dummy, m_pktdat)))
    326 
    327 #define MINCLSIZE	(MHLEN+MLEN+1)	/* smallest amount to put in cluster */
    328 
    329 /*
    330  * The *real* struct mbuf
    331  */
    332 MBUF_DEFINE(mbuf, MHLEN, MLEN);
    333 
/* mbuf flags */
#define M_EXT		0x00000001	/* has associated external storage */
#define M_PKTHDR	0x00000002	/* start of record */
#define M_EOR		0x00000004	/* end of record */
#define M_PROTO1	0x00000008	/* protocol-specific */

/* mbuf pkthdr flags, also in m_flags */
#define M_AUTHIPHDR	0x00000010	/* authenticated (IPsec) */
#define M_DECRYPTED	0x00000020	/* decrypted (IPsec) */
#define M_LOOP		0x00000040	/* received on loopback */
/* NB: 0x00000080 is currently unassigned ("NONE" in M_FLAGS_BITS). */
#define M_BCAST		0x00000100	/* send/received as L2 broadcast */
#define M_MCAST		0x00000200	/* send/received as L2 multicast */
#define M_CANFASTFWD	0x00000400	/* packet can be fast-forwarded */
#define M_ANYCAST6	0x00000800	/* received as IPv6 anycast */

#define M_LINK0		0x00001000	/* link layer specific flag */
#define M_LINK1		0x00002000	/* link layer specific flag */
#define M_LINK2		0x00004000	/* link layer specific flag */
#define M_LINK3		0x00008000	/* link layer specific flag */
#define M_LINK4		0x00010000	/* link layer specific flag */
#define M_LINK5		0x00020000	/* link layer specific flag */
#define M_LINK6		0x00040000	/* link layer specific flag */
#define M_LINK7		0x00080000	/* link layer specific flag */

#define M_VLANTAG	0x00100000	/* ether_vtag is valid */

/* additional flags for M_EXT mbufs */
#define M_EXT_FLAGS	0xff000000
#define M_EXT_CLUSTER	0x01000000	/* ext is a cluster */
#define M_EXT_PAGES	0x02000000	/* ext_pgs is valid */
#define M_EXT_ROMAP	0x04000000	/* ext mapping is r-o at MMU */
#define M_EXT_RW	0x08000000	/* ext storage is writable */

/* for source-level compatibility */
#define M_NOTIFICATION	M_PROTO1

/* Bit-name string for snprintb(3)-style decoding of m_flags. */
#define M_FLAGS_BITS \
    "\20\1EXT\2PKTHDR\3EOR\4PROTO1\5AUTHIPHDR\6DECRYPTED\7LOOP\10NONE" \
    "\11BCAST\12MCAST\13CANFASTFWD\14ANYCAST6\15LINK0\16LINK1\17LINK2\20LINK3" \
    "\21LINK4\22LINK5\23LINK6\24LINK7" \
    "\25VLANTAG" \
    "\31EXT_CLUSTER\32EXT_PAGES\33EXT_ROMAP\34EXT_RW"

/* flags copied when copying m_pkthdr */
#define M_COPYFLAGS	(M_PKTHDR|M_EOR|M_BCAST|M_MCAST|M_CANFASTFWD| \
    M_ANYCAST6|M_LINK0|M_LINK1|M_LINK2|M_AUTHIPHDR|M_DECRYPTED|M_LOOP| \
    M_VLANTAG)

/* flag copied when shallow-copying external storage */
#define M_EXTCOPYFLAGS	(M_EXT|M_EXT_FLAGS)
    384 
/* mbuf types (m_type) */
#define MT_FREE		0	/* should be on free list */
#define MT_DATA		1	/* dynamic (data) allocation */
#define MT_HEADER	2	/* packet header */
#define MT_SONAME	3	/* socket name */
#define MT_SOOPTS	4	/* socket options */
#define MT_FTABLE	5	/* fragment reassembly header */
#define MT_CONTROL	6	/* extra-data protocol message */
#define MT_OOBDATA	7	/* expedited data  */

/*
 * Human-readable names for the MT_* values above; indexed by type.
 * Exactly one translation unit defines MBUFTYPES to instantiate the
 * table, all others get the extern declaration.
 */
#ifdef MBUFTYPES
const char * const mbuftypes[] = {
	"mbfree",
	"mbdata",
	"mbheader",
	"mbsoname",
	"mbsopts",
	"mbftable",
	"mbcontrol",
	"mboobdata",
};
#else
extern const char * const mbuftypes[];
#endif

/* flags to m_get/MGET */
#define M_DONTWAIT	M_NOWAIT
#define M_WAIT		M_WAITOK
    413 
#ifdef MBUFTRACE
/* Mbuf allocation tracing. */
void mowner_init_owner(struct mowner *, const char *, const char *);
void mowner_init(struct mbuf *, int);
void mowner_ref(struct mbuf *, int);
void m_claim(struct mbuf *, struct mowner *);
void mowner_revoke(struct mbuf *, bool, int);
void mowner_attach(struct mowner *);
void mowner_detach(struct mowner *);
void m_claimm(struct mbuf *, struct mowner *);
#else
/* Without MBUFTRACE the tracing hooks compile away to nothing. */
#define mowner_init_owner(mo, n, d)	__nothing
#define mowner_init(m, type)		__nothing
#define mowner_ref(m, flags)		__nothing
#define mowner_revoke(m, all, flags)	__nothing
#define m_claim(m, mowner)		__nothing
#define mowner_attach(mo)		__nothing
#define mowner_detach(mo)		__nothing
#define m_claimm(m, mo)			__nothing
#endif

#define MCLAIM(m, mo)		m_claim((m), (mo))
#define MOWNER_ATTACH(mo)	mowner_attach(mo)
#define MOWNER_DETACH(mo)	mowner_detach(mo)

/*
 * mbuf allocation/deallocation macros:
 *
 *	MGET(struct mbuf *m, int how, int type)
 * allocates an mbuf and initializes it to contain internal data.
 *
 *	MGETHDR(struct mbuf *m, int how, int type)
 * allocates an mbuf and initializes it to contain a packet header
 * and internal data.
 *
 * If 'how' is M_WAIT, these macros (and the corresponding functions)
 * are guaranteed to return successfully.
 */
#define MGET(m, how, type)	m = m_get((how), (type))
#define MGETHDR(m, how, type)	m = m_gethdr((how), (type))
    454 
#if defined(_KERNEL)

/*
 * Initialize the external-storage reference: the mbuf becomes the sole
 * owner of its own (not yet attached) ext storage.  The mbuf must not
 * already have M_EXT set.
 */
#define MCLINITREFERENCE(m)						\
do {									\
	KASSERT(((m)->m_flags & M_EXT) == 0);				\
	(m)->m_ext_ref = (m);						\
	(m)->m_ext.ext_refcnt = 1;					\
} while (0)

/*
 * Macros for mbuf external storage.
 *
 * MCLGET allocates and adds an mbuf cluster to a normal mbuf;
 * the flag M_EXT is set upon success.
 *
 * MEXTMALLOC allocates external storage and adds it to
 * a normal mbuf; the flag M_EXT is set upon success.
 *
 * MEXTADD adds pre-allocated external storage to
 * a normal mbuf; the flag M_EXT is set upon success.
 */

#define MCLGET(m, how)	m_clget((m), (how))

#define MEXTMALLOC(m, size, how)					\
do {									\
	(m)->m_ext_storage.ext_buf = malloc((size), 0, (how));		\
	if ((m)->m_ext_storage.ext_buf != NULL) {			\
		MCLINITREFERENCE(m);					\
		(m)->m_data = (m)->m_ext.ext_buf;			\
		(m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) |	\
				M_EXT|M_EXT_RW;				\
		(m)->m_ext.ext_size = (size);				\
		(m)->m_ext.ext_free = NULL;				\
		(m)->m_ext.ext_arg = NULL;				\
		mowner_ref((m), M_EXT);					\
	}								\
} while (0)

#define MEXTADD(m, buf, size, type, free, arg)				\
do {									\
	MCLINITREFERENCE(m);						\
	(m)->m_data = (m)->m_ext.ext_buf = (char *)(buf);		\
	(m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) | M_EXT;	\
	(m)->m_ext.ext_size = (size);					\
	(m)->m_ext.ext_free = (free);					\
	(m)->m_ext.ext_arg = (arg);					\
	mowner_ref((m), M_EXT);						\
} while (0)
    504 
/* Start of the mbuf's data buffer: external, pkthdr-internal, or plain. */
#define M_BUFADDR(m)							\
	(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf :			\
	    ((m)->m_flags & M_PKTHDR) ? (m)->m_pktdat : (m)->m_dat)

/* Total capacity of the mbuf's data buffer, matching M_BUFADDR. */
#define M_BUFSIZE(m)							\
	(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size :			\
	    ((m)->m_flags & M_PKTHDR) ? MHLEN : MLEN)

/* Rewind m_data to the beginning of the data buffer. */
#define MRESETDATA(m)	(m)->m_data = M_BUFADDR(m)

/*
 * Compute the offset of the beginning of the data buffer of a non-ext
 * mbuf.
 */
#define M_BUFOFFSET(m)							\
	(((m)->m_flags & M_PKTHDR) ?					\
	 offsetof(struct mbuf, m_pktdat) : offsetof(struct mbuf, m_dat))

/*
 * Determine if an mbuf's data area is read-only.  This is true
 * if external storage is read-only mapped, or not marked as R/W,
 * or referenced by more than one mbuf.
 */
#define M_READONLY(m)							\
	(((m)->m_flags & M_EXT) != 0 &&					\
	  (((m)->m_flags & (M_EXT_ROMAP|M_EXT_RW)) != M_EXT_RW ||	\
	  (m)->m_ext.ext_refcnt > 1))

/* True if the first __len bytes of __m cannot be modified in place. */
#define M_UNWRITABLE(__m, __len)					\
	((__m)->m_len < (__len) || M_READONLY((__m)))

/*
 * Determine if an mbuf's data area is read-only at the MMU.
 */
#define M_ROMAP(m)							\
	(((m)->m_flags & (M_EXT|M_EXT_ROMAP)) == (M_EXT|M_EXT_ROMAP))

/*
 * Compute the amount of space available before the current start of
 * data in an mbuf.  Zero if the data area is read-only.
 */
#define M_LEADINGSPACE(m)						\
	(M_READONLY((m)) ? 0 : ((m)->m_data - M_BUFADDR(m)))

/*
 * Compute the amount of space available
 * after the end of data in an mbuf.  The _M_ variant does not check
 * for read-only storage; M_TRAILINGSPACE does.
 */
#define _M_TRAILINGSPACE(m)						\
	((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf + (m)->m_ext.ext_size - \
	 ((m)->m_data + (m)->m_len) :					\
	 &(m)->m_dat[MLEN] - ((m)->m_data + (m)->m_len))

#define M_TRAILINGSPACE(m)						\
	(M_READONLY((m)) ? 0 : _M_TRAILINGSPACE((m)))
    560 
    561 /*
    562  * Arrange to prepend space of size plen to mbuf m.
    563  * If a new mbuf must be allocated, how specifies whether to wait.
    564  * If how is M_DONTWAIT and allocation fails, the original mbuf chain
    565  * is freed and m is set to NULL.
    566  */
    567 #define M_PREPEND(m, plen, how)						\
    568 do {									\
    569 	if (M_LEADINGSPACE(m) >= (plen)) {				\
    570 		(m)->m_data -= (plen);					\
    571 		(m)->m_len += (plen);					\
    572 	} else								\
    573 		(m) = m_prepend((m), (plen), (how));			\
    574 	if ((m) && (m)->m_flags & M_PKTHDR)				\
    575 		(m)->m_pkthdr.len += (plen);				\
    576 } while (0)
    577 
    578 /* change mbuf to new type */
    579 #define MCHTYPE(m, t)							\
    580 do {									\
    581 	KASSERT((t) != MT_FREE);					\
    582 	mbstat_type_add((m)->m_type, -1);				\
    583 	mbstat_type_add(t, 1);						\
    584 	(m)->m_type = t;						\
    585 } while (0)
    586 
    587 #ifdef DIAGNOSTIC
    588 #define M_VERIFY_PACKET(m)	m_verify_packet(m)
    589 #else
    590 #define M_VERIFY_PACKET(m)	__nothing
    591 #endif
    592 
    593 /* The "copy all" special length. */
    594 #define M_COPYALL	-1
    595 
    596 /*
    597  * Allow drivers and/or protocols to store private context information.
    598  */
    599 #define M_GETCTX(m, t)		((t)(m)->m_pkthdr._rcvif.ctx)
    600 #define M_SETCTX(m, c)		((void)((m)->m_pkthdr._rcvif.ctx = (void *)(c)))
    601 #define M_CLEARCTX(m)		M_SETCTX((m), NULL)
    602 
    603 /*
    604  * M_REGION_GET ensures that the "len"-sized region of type "typ" starting
    605  * from "off" within "m" is located in a single mbuf, contiguously.
    606  *
    607  * The pointer to the region will be returned to pointer variable "val".
    608  */
    609 #define M_REGION_GET(val, typ, m, off, len) \
    610 do {									\
    611 	struct mbuf *_t;						\
    612 	int _tmp;							\
    613 	if ((m)->m_len >= (off) + (len))				\
    614 		(val) = (typ)(mtod((m), char *) + (off));		\
    615 	else {								\
    616 		_t = m_pulldown((m), (off), (len), &_tmp);		\
    617 		if (_t) {						\
    618 			if (_t->m_len < _tmp + (len))			\
    619 				panic("m_pulldown malfunction");	\
    620 			(val) = (typ)(mtod(_t, char *) + _tmp);	\
    621 		} else {						\
    622 			(val) = (typ)NULL;				\
    623 			(m) = NULL;					\
    624 		}							\
    625 	}								\
    626 } while (0)
    627 
    628 #endif /* defined(_KERNEL) */
    629 
    630 /*
    631  * Simple mbuf queueing system
    632  *
    633  * this is basically a SIMPLEQ adapted to mbuf use (ie using
    634  * m_nextpkt instead of field.sqe_next).
    635  *
    636  * m_next is ignored, so queueing chains of mbufs is possible
    637  */
    638 #define MBUFQ_HEAD(name)					\
    639 struct name {							\
    640 	struct mbuf *mq_first;					\
    641 	struct mbuf **mq_last;					\
    642 }
    643 
    644 #define MBUFQ_INIT(q)		do {				\
    645 	(q)->mq_first = NULL;					\
    646 	(q)->mq_last = &(q)->mq_first;				\
    647 } while (0)
    648 
    649 #define MBUFQ_ENQUEUE(q, m)	do {				\
    650 	(m)->m_nextpkt = NULL;					\
    651 	*(q)->mq_last = (m);					\
    652 	(q)->mq_last = &(m)->m_nextpkt;				\
    653 } while (0)
    654 
    655 #define MBUFQ_PREPEND(q, m)	do {				\
    656 	if (((m)->m_nextpkt = (q)->mq_first) == NULL)		\
    657 		(q)->mq_last = &(m)->m_nextpkt;			\
    658 	(q)->mq_first = (m);					\
    659 } while (0)
    660 
    661 #define MBUFQ_DEQUEUE(q, m)	do {				\
    662 	if (((m) = (q)->mq_first) != NULL) {			\
    663 		if (((q)->mq_first = (m)->m_nextpkt) == NULL)	\
    664 			(q)->mq_last = &(q)->mq_first;		\
    665 		else						\
    666 			(m)->m_nextpkt = NULL;			\
    667 	}							\
    668 } while (0)
    669 
    670 #define MBUFQ_DRAIN(q)		do {				\
    671 	struct mbuf *__m0;					\
    672 	while ((__m0 = (q)->mq_first) != NULL) {		\
    673 		(q)->mq_first = __m0->m_nextpkt;		\
    674 		m_freem(__m0);					\
    675 	}							\
    676 	(q)->mq_last = &(q)->mq_first;				\
    677 } while (0)
    678 
    679 #define MBUFQ_FIRST(q)		((q)->mq_first)
    680 #define MBUFQ_NEXT(m)		((m)->m_nextpkt)
    681 #define MBUFQ_LAST(q)		(*(q)->mq_last)
    682 
    683 /*
    684  * Mbuf statistics.
    685  * For statistics related to mbuf and cluster allocations, see also the
    686  * pool headers (mb_cache and mcl_cache).
    687  */
    688 struct mbstat {
    689 	u_long	_m_spare;	/* formerly m_mbufs */
    690 	u_long	_m_spare1;	/* formerly m_clusters */
    691 	u_long	_m_spare2;	/* spare field */
    692 	u_long	_m_spare3;	/* formely m_clfree - free clusters */
    693 	u_long	m_drops;	/* times failed to find space */
    694 	u_long	m_wait;		/* times waited for space */
    695 	u_long	m_drain;	/* times drained protocols for space */
    696 	u_short	m_mtypes[256];	/* type specific mbuf allocations */
    697 };
    698 
    699 struct mbstat_cpu {
    700 	u_int	m_mtypes[256];	/* type specific mbuf allocations */
    701 };
    702 
    703 /*
    704  * Mbuf sysctl variables.
    705  */
    706 #define MBUF_MSIZE		1	/* int: mbuf base size */
    707 #define MBUF_MCLBYTES		2	/* int: mbuf cluster size */
    708 #define MBUF_NMBCLUSTERS	3	/* int: limit on the # of clusters */
    709 #define MBUF_MBLOWAT		4	/* int: mbuf low water mark */
    710 #define MBUF_MCLLOWAT		5	/* int: mbuf cluster low water mark */
    711 #define MBUF_STATS		6	/* struct: mbstat */
    712 #define MBUF_MOWNERS		7	/* struct: m_owner[] */
    713 #define MBUF_NMBCLUSTERS_LIMIT	8	/* int: limit of nmbclusters */
    714 
#ifdef _KERNEL
extern struct mbstat mbstat;
extern int nmbclusters;		/* limit on the # of clusters */
extern int mblowat;		/* mbuf low water mark */
extern int mcllowat;		/* mbuf cluster low water mark */
extern int max_linkhdr;		/* largest link-level header */
extern int max_protohdr;		/* largest protocol header */
extern int max_hdr;		/* largest link+protocol header */
extern int max_datalen;		/* MHLEN - max_hdr */
extern const int msize;			/* mbuf base size */
extern const int mclbytes;		/* mbuf cluster size */
extern pool_cache_t mb_cache;
#ifdef MBUFTRACE
LIST_HEAD(mownerhead, mowner);
extern struct mownerhead mowners;	/* global list of registered owners */
extern struct mowner unknown_mowners[];
extern struct mowner revoked_mowner;
#endif

MALLOC_DECLARE(M_MBUF);
MALLOC_DECLARE(M_SONAME);

/* mbuf allocation, copying and chain-manipulation routines */
struct	mbuf *m_copym(struct mbuf *, int, int, int);
struct	mbuf *m_copypacket(struct mbuf *, int);
struct	mbuf *m_devget(char *, int, int, struct ifnet *);
struct	mbuf *m_dup(struct mbuf *, int, int, int);
struct	mbuf *m_get(int, int);
struct	mbuf *m_gethdr(int, int);
struct	mbuf *m_get_n(int, int, size_t, size_t);
struct	mbuf *m_gethdr_n(int, int, size_t, size_t);
struct	mbuf *m_prepend(struct mbuf *,int, int);
struct	mbuf *m_pulldown(struct mbuf *, int, int, int *);
struct	mbuf *m_pullup(struct mbuf *, int);
struct	mbuf *m_copyup(struct mbuf *, int, int);
struct	mbuf *m_split(struct mbuf *,int, int);
struct	mbuf *m_getptr(struct mbuf *, int, int *);
void	m_adj(struct mbuf *, int);
struct	mbuf *m_defrag(struct mbuf *, int);
int	m_apply(struct mbuf *, int, int,
    int (*)(void *, void *, unsigned int), void *);
void	m_cat(struct mbuf *,struct mbuf *);
void	m_clget(struct mbuf *, int);
void	m_copyback(struct mbuf *, int, int, const void *);
struct	mbuf *m_copyback_cow(struct mbuf *, int, int, const void *, int);
int	m_makewritable(struct mbuf **, int, int, int);
struct	mbuf *m_getcl(int, int, int);
void	m_copydata(struct mbuf *, int, int, void *);
void	m_verify_packet(struct mbuf *);
struct	mbuf *m_free(struct mbuf *);
void	m_freem(struct mbuf *);
void	mbinit(void);
void	m_remove_pkthdr(struct mbuf *);
void	m_copy_pkthdr(struct mbuf *, struct mbuf *);
void	m_move_pkthdr(struct mbuf *, struct mbuf *);
void	m_align(struct mbuf *, int);

bool	m_ensure_contig(struct mbuf **, int);
struct mbuf *m_add(struct mbuf *, struct mbuf *);

/* Inline routines. */
static __inline u_int m_length(const struct mbuf *) __unused;

/* Statistics */
void mbstat_type_add(int, int);

/* Packet tag routines */
struct	m_tag *m_tag_get(int, int, int);
void	m_tag_free(struct m_tag *);
void	m_tag_prepend(struct mbuf *, struct m_tag *);
void	m_tag_unlink(struct mbuf *, struct m_tag *);
void	m_tag_delete(struct mbuf *, struct m_tag *);
void	m_tag_delete_chain(struct mbuf *);
struct	m_tag *m_tag_find(const struct mbuf *, int);
struct	m_tag *m_tag_copy(struct m_tag *);
int	m_tag_copy_chain(struct mbuf *, struct mbuf *);
    789 int	m_tag_copy_chain(struct mbuf *, struct mbuf *);
    790 
    791 /* Packet tag types */
    792 #define PACKET_TAG_NONE			0  /* Nothing */
    793 #define PACKET_TAG_SO			4  /* sending socket pointer */
    794 #define PACKET_TAG_NPF			10 /* packet filter */
    795 #define PACKET_TAG_PF			11 /* packet filter */
    796 #define PACKET_TAG_ALTQ_QID		12 /* ALTQ queue id */
    797 #define PACKET_TAG_IPSEC_OUT_DONE	18
    798 #define PACKET_TAG_IPSEC_NAT_T_PORTS	25 /* two uint16_t */
    799 #define PACKET_TAG_INET6		26 /* IPv6 info */
    800 #define PACKET_TAG_TUNNEL_INFO		28 /* tunnel identification and
    801 					    * protocol callback, for loop
    802 					    * detection/recovery
    803 					    */
    804 #define PACKET_TAG_MPLS			29 /* Indicate it's for MPLS */
    805 #define PACKET_TAG_SRCROUTE		30 /* IPv4 source routing */
    806 #define PACKET_TAG_ETHERNET_SRC		31 /* Ethernet source address */
    807 
    808 /*
    809  * Return the number of bytes in the mbuf chain, m.
    810  */
    811 static __inline u_int
    812 m_length(const struct mbuf *m)
    813 {
    814 	const struct mbuf *m0;
    815 	u_int pktlen;
    816 
    817 	if ((m->m_flags & M_PKTHDR) != 0)
    818 		return m->m_pkthdr.len;
    819 
    820 	pktlen = 0;
    821 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
    822 		pktlen += m0->m_len;
    823 	return pktlen;
    824 }
    825 
/*
 * Set the receive interface of a packet mbuf: only the interface's
 * index is recorded in the packet header, not the pointer itself.
 */
static __inline void
m_set_rcvif(struct mbuf *m, const struct ifnet *ifp)
{
	KASSERT(m->m_flags & M_PKTHDR);
	m->m_pkthdr.rcvif_index = ifp->if_index;
}
    832 
/*
 * Clear the receive interface of a packet mbuf.
 */
static __inline void
m_reset_rcvif(struct mbuf *m)
{
	KASSERT(m->m_flags & M_PKTHDR);
	/*
	 * A caller may expect the whole _rcvif union to be zeroed, so
	 * clear the pointer-valued member rather than only the index:
	 */
	/* m->m_pkthdr.rcvif_index = 0; */
	m->m_pkthdr._rcvif.ctx = NULL;
}
    841 
/*
 * Copy the receive interface (by index) from packet mbuf n to
 * packet mbuf m.  Both mbufs must carry a packet header.
 */
static __inline void
m_copy_rcvif(struct mbuf *m, const struct mbuf *n)
{
	KASSERT(m->m_flags & M_PKTHDR);
	KASSERT(n->m_flags & M_PKTHDR);
	m->m_pkthdr.rcvif_index = n->m_pkthdr.rcvif_index;
}
    849 
/*
 * Ensure the first sizeof(type) bytes of *m are contiguous and, on
 * strict-alignment machines, aligned for 'type'.  Evaluates to
 * non-zero on failure, in which case *m has been set to NULL.
 */
#define M_GET_ALIGNED_HDR(m, type, linkhdr) \
    m_get_aligned_hdr((m), __alignof(type) - 1, sizeof(type), (linkhdr))

/*
 * Helper for M_GET_ALIGNED_HDR: make the first hlen bytes of *m
 * contiguous and (where alignment matters) aligned to mask+1 bytes.
 * 'mask' must be an alignment minus one.  If 'linkhdr' is true,
 * m_copyup() leaves room in front of the data for a link-layer
 * header, rounded up to the requested alignment.
 *
 * Returns non-zero on failure; *m is then NULL.
 */
static __inline int
m_get_aligned_hdr(struct mbuf **m, int mask, size_t hlen, bool linkhdr)
{
#ifndef __NO_STRICT_ALIGNMENT
	/* Misaligned data: copy the header into a fresh, aligned mbuf. */
	if (((uintptr_t)mtod(*m, void *) & mask) != 0)
		*m = m_copyup(*m, hlen,
		      linkhdr ? (max_linkhdr + mask) & ~mask : 0);
	else
#endif
	/* Aligned (or alignment irrelevant) but too short: pull up. */
	if (__predict_false((size_t)(*m)->m_len < hlen))
		*m = m_pullup(*m, hlen);

	return *m == NULL;
}
    867 
    868 void m_print(const struct mbuf *, const char *, void (*)(const char *, ...)
    869     __printflike(1, 2));
    870 
    871 /* from uipc_mbufdebug.c */
    872 void	m_examine(const struct mbuf *, int, const char *,
    873     void (*)(const char *, ...) __printflike(1, 2));
    874 
    875 /* parsers for m_examine() */
    876 void m_examine_ether(const struct mbuf *, int, const char *,
    877     void (*)(const char *, ...) __printflike(1, 2));
    878 void m_examine_pppoe(const struct mbuf *, int, const char *,
    879     void (*)(const char *, ...) __printflike(1, 2));
    880 void m_examine_ppp(const struct mbuf *, int, const char *,
    881     void (*)(const char *, ...) __printflike(1, 2));
    882 void m_examine_arp(const struct mbuf *, int, const char *,
    883     void (*)(const char *, ...) __printflike(1, 2));
    884 void m_examine_ip(const struct mbuf *, int, const char *,
    885     void (*)(const char *, ...) __printflike(1, 2));
    886 void m_examine_icmp(const struct mbuf *, int, const char *,
    887     void (*)(const char *, ...) __printflike(1, 2));
    888 void m_examine_ip6(const struct mbuf *, int, const char *,
    889     void (*)(const char *, ...) __printflike(1, 2));
    890 void m_examine_icmp6(const struct mbuf *, int, const char *,
    891     void (*)(const char *, ...) __printflike(1, 2));
    892 void m_examine_tcp(const struct mbuf *, int, const char *,
    893     void (*)(const char *, ...) __printflike(1, 2));
    894 void m_examine_udp(const struct mbuf *, int, const char *,
    895     void (*)(const char *, ...) __printflike(1, 2));
    896 void m_examine_hex(const struct mbuf *, int, const char *,
    897     void (*)(const char *, ...) __printflike(1, 2));
    898 
    899 /*
    900  * Get rcvif of a mbuf.
    901  *
    902  * The caller must call m_put_rcvif after using rcvif if the returned rcvif
    903  * isn't NULL. If the returned rcvif is NULL, the caller doesn't need to call
    904  * m_put_rcvif (although calling it is safe).
    905  *
    906  * The caller must not block or sleep while using rcvif. The API ensures a
    907  * returned rcvif isn't freed until m_put_rcvif is called.
    908  */
static __inline struct ifnet *
m_get_rcvif(const struct mbuf *m, int *s)
{
	struct ifnet *ifp;

	KASSERT(m->m_flags & M_PKTHDR);
	/* Enter a pserialize read section; *s carries its state for exit. */
	*s = pserialize_read_enter();
	ifp = if_byindex(m->m_pkthdr.rcvif_index);
	if (__predict_false(ifp == NULL))
		/* Lookup failed: leave the read section on the caller's
		 * behalf, so m_put_rcvif() is optional in this case. */
		pserialize_read_exit(*s);

	return ifp;
}
    922 
    923 static __inline void
    924 m_put_rcvif(struct ifnet *ifp, int *s)
    925 {
    926 
    927 	if (ifp == NULL)
    928 		return;
    929 	pserialize_read_exit(*s);
    930 }
    931 
    932 /*
    933  * Get rcvif of a mbuf.
    934  *
 * The caller must call m_put_rcvif_psref after using rcvif.  The API
 * ensures the returned rcvif isn't freed until m_put_rcvif_psref is called.
    937  */
static __inline struct ifnet *
m_get_rcvif_psref(const struct mbuf *m, struct psref *psref)
{
	KASSERT(m->m_flags & M_PKTHDR);
	/* Look up the interface by index, acquiring a psref on success. */
	return if_get_byindex(m->m_pkthdr.rcvif_index, psref);
}
    944 
    945 static __inline void
    946 m_put_rcvif_psref(struct ifnet *ifp, struct psref *psref)
    947 {
    948 
    949 	if (ifp == NULL)
    950 		return;
    951 	if_put(ifp, psref);
    952 }
    953 
    954 /*
    955  * Get rcvif of a mbuf.
    956  *
 * This is NOT an MP-safe API and shouldn't be used where MP-safety is
 * required.
    958  */
static __inline struct ifnet *
m_get_rcvif_NOMPSAFE(const struct mbuf *m)
{
	KASSERT(m->m_flags & M_PKTHDR);
	/* Raw index lookup with no pserialize/psref protection. */
	return if_byindex(m->m_pkthdr.rcvif_index);
}
    965 
    966 #endif /* _KERNEL */
    967 #endif /* !_SYS_MBUF_H_ */
    968