/*	$NetBSD: pktqueue.c,v 1.21 2022/09/04 17:34:43 thorpej Exp $	*/

/*-
 * Copyright (c) 2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * The packet queue (pktqueue) interface is a lockless IP input queue
 * which also abstracts and handles network ISR scheduling.  It provides
 * a mechanism to enable receiver-side packet steering (RPS).
 */

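/*
 * Typical usage (an illustrative sketch; "hash_func" stands for a
 * pktq_rps_hash_func_t variable, typically initialized from
 * pktq_rps_hash_default):
 *
 *	Initialization:
 *		pq = pktq_create(maxlen, intrh, sc);
 *
 *	Enqueue side (interrupt path, preemption disabled):
 *		hash = pktq_rps_hash(&hash_func, m);
 *		if (!pktq_enqueue(pq, m, hash))
 *			m_freem(m);
 *
 *	Dequeue side (in the softint handler 'intrh'):
 *		while ((m = pktq_dequeue(pq)) != NULL)
 *			(process the packet)
 */
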
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: pktqueue.c,v 1.21 2022/09/04 17:34:43 thorpej Exp $");

#ifdef _KERNEL_OPT
#include "opt_net_mpsafe.h"
#endif

#include <sys/param.h>
#include <sys/types.h>

#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/pcq.h>
#include <sys/intr.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/percpu.h>
#include <sys/xcall.h>
#include <sys/once.h>
#include <sys/queue.h>
#include <sys/rwlock.h>

#include <net/pktqueue.h>
#include <net/rss_config.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>

struct pktqueue {
	/*
	 * The lock used for a barrier mechanism.  The barrier and drop
	 * counters, however, are managed atomically.
	 * Ensure this group is in a separate cache line.
	 */
	union {
		struct {
			kmutex_t	pq_lock;
			volatile u_int	pq_barrier;
		};
		uint8_t	 _pad[COHERENCY_UNIT];
	};

	/* The size of the queue, counters and the interrupt handler. */
	u_int		pq_maxlen;
	percpu_t *	pq_counters;
	void *		pq_sih;

	/* The per-CPU queues. */
	struct percpu *	pq_pcq;	/* struct pcq * */

	/* The linkage on the list of all pktqueues. */
	LIST_ENTRY(pktqueue) pq_list;
};

/* The counters of the packet queue. */
#define	PQCNT_ENQUEUE	0
#define	PQCNT_DEQUEUE	1
#define	PQCNT_DROP	2
#define	PQCNT_NCOUNTERS	3

typedef struct {
	uint64_t	count[PQCNT_NCOUNTERS];
} pktq_counters_t;

/* Special marker value used by pktq_barrier() mechanism. */
#define	PKTQ_MARKER	((void *)(~0ULL))

/*
 * This is a list of all pktqueues.  This list is used by
 * pktq_ifdetach() to issue a barrier on every pktqueue.
 *
 * The r/w lock is acquired for writing in pktq_create() and
 * pktq_destroy(), and for reading in pktq_ifdetach().
 *
 * This list is not performance critical, and will seldom be
 * accessed.
 */
static LIST_HEAD(, pktqueue) pktqueue_list	__read_mostly;
static krwlock_t pktqueue_list_lock		__read_mostly;
static once_t pktqueue_list_init_once		__read_mostly;

static int
pktqueue_list_init(void)
{
	LIST_INIT(&pktqueue_list);
	rw_init(&pktqueue_list_lock);
	return 0;
}

static void
pktq_init_cpu(void *vqp, void *vpq, struct cpu_info *ci)
{
	struct pcq **qp = vqp;
	struct pktqueue *pq = vpq;

	*qp = pcq_create(pq->pq_maxlen, KM_SLEEP);
}

static void
pktq_fini_cpu(void *vqp, void *vpq, struct cpu_info *ci)
{
	struct pcq **qp = vqp, *q = *qp;

	KASSERT(pcq_peek(q) == NULL);
	pcq_destroy(q);
	*qp = NULL;		/* paranoia */
}

static struct pcq *
pktq_pcq(struct pktqueue *pq, struct cpu_info *ci)
{
	struct pcq **qp, *q;

	/*
	 * As long as preemption is disabled, the xcall to swap percpu
	 * buffers can't complete, so it is safe to read the pointer.
	 */
	KASSERT(kpreempt_disabled());

	qp = percpu_getptr_remote(pq->pq_pcq, ci);
	q = *qp;

	return q;
}

pktqueue_t *
pktq_create(size_t maxlen, void (*intrh)(void *), void *sc)
{
	const u_int sflags = SOFTINT_NET | SOFTINT_MPSAFE | SOFTINT_RCPU;
	pktqueue_t *pq;
	percpu_t *pc;
	void *sih;

	RUN_ONCE(&pktqueue_list_init_once, pktqueue_list_init);

	pc = percpu_alloc(sizeof(pktq_counters_t));
	if ((sih = softint_establish(sflags, intrh, sc)) == NULL) {
		percpu_free(pc, sizeof(pktq_counters_t));
		return NULL;
	}

	pq = kmem_zalloc(sizeof(*pq), KM_SLEEP);
	mutex_init(&pq->pq_lock, MUTEX_DEFAULT, IPL_NONE);
	pq->pq_maxlen = maxlen;
	pq->pq_counters = pc;
	pq->pq_sih = sih;
	pq->pq_pcq = percpu_create(sizeof(struct pcq *),
	    pktq_init_cpu, pktq_fini_cpu, pq);

	rw_enter(&pktqueue_list_lock, RW_WRITER);
	LIST_INSERT_HEAD(&pktqueue_list, pq, pq_list);
	rw_exit(&pktqueue_list_lock);

	return pq;
}

void
pktq_destroy(pktqueue_t *pq)
{

	KASSERT(pktqueue_list_init_once.o_status == ONCE_DONE);

	rw_enter(&pktqueue_list_lock, RW_WRITER);
	LIST_REMOVE(pq, pq_list);
	rw_exit(&pktqueue_list_lock);

	percpu_free(pq->pq_pcq, sizeof(struct pcq *));
	percpu_free(pq->pq_counters, sizeof(pktq_counters_t));
	softint_disestablish(pq->pq_sih);
	mutex_destroy(&pq->pq_lock);
	kmem_free(pq, sizeof(*pq));
}

/*
 * - pktq_inc_count: increment the counter given an ID.
 * - pktq_collect_counts: handler to sum up the counts from each CPU.
 * - pktq_get_count: return the effective count given an ID.
 */

static inline void
pktq_inc_count(pktqueue_t *pq, u_int i)
{
	percpu_t *pc = pq->pq_counters;
	pktq_counters_t *c;

	c = percpu_getref(pc);
	c->count[i]++;
	percpu_putref(pc);
}

static void
pktq_collect_counts(void *mem, void *arg, struct cpu_info *ci)
{
	const pktq_counters_t *c = mem;
	pktq_counters_t *sum = arg;

	int s = splnet();

	for (u_int i = 0; i < PQCNT_NCOUNTERS; i++) {
		sum->count[i] += c->count[i];
	}

	splx(s);
}

static uint64_t
pktq_get_count(pktqueue_t *pq, pktq_count_t c)
{
	pktq_counters_t sum;

	if (c != PKTQ_MAXLEN) {
		memset(&sum, 0, sizeof(sum));
		percpu_foreach_xcall(pq->pq_counters,
		    XC_HIGHPRI_IPL(IPL_SOFTNET), pktq_collect_counts, &sum);
	}
	switch (c) {
	case PKTQ_NITEMS:
		return sum.count[PQCNT_ENQUEUE] - sum.count[PQCNT_DEQUEUE];
	case PKTQ_DROPS:
		return sum.count[PQCNT_DROP];
	case PKTQ_MAXLEN:
		return pq->pq_maxlen;
	}
	return 0;
}

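/*
 * pktq_rps_hash: compute the RPS hash for a packet using the currently
 * installed hash function.  The function pointer may be replaced at any
 * time via sysctl (atomic_store_relaxed), hence the relaxed atomic load.
 */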
uint32_t
pktq_rps_hash(const pktq_rps_hash_func_t *funcp, const struct mbuf *m)
{
	pktq_rps_hash_func_t func = atomic_load_relaxed(funcp);

	KASSERT(func != NULL);

	return (*func)(m);
}

static uint32_t
pktq_rps_hash_zero(const struct mbuf *m __unused)
{

	return 0;
}

static uint32_t
pktq_rps_hash_curcpu(const struct mbuf *m __unused)
{

	return cpu_index(curcpu());
}

static uint32_t
pktq_rps_hash_toeplitz(const struct mbuf *m)
{
	struct ip *ip;
	/*
	 * Don't use the UDP ports in the hash: IP fragments aren't
	 * currently being handled, so we would end up with a mix of
	 * 2-tuple and 4-tuple traffic.
	 */
	const u_int flag = RSS_TOEPLITZ_USE_TCP_PORT;

	/* Glance at the IP version. */
	if ((m->m_flags & M_PKTHDR) == 0)
		return 0;

	ip = mtod(m, struct ip *);
	if (ip->ip_v == IPVERSION) {
		if (__predict_false(m->m_len < sizeof(struct ip)))
			return 0;
		return rss_toeplitz_hash_from_mbuf_ipv4(m, flag);
	} else if (ip->ip_v == 6) {
		if (__predict_false(m->m_len < sizeof(struct ip6_hdr)))
			return 0;
		return rss_toeplitz_hash_from_mbuf_ipv6(m, flag);
	}

	return 0;
}

/*
 * Toeplitz hash, excluding the current CPU.
 * Generally, this performs better than plain toeplitz.
 */
static uint32_t
pktq_rps_hash_toeplitz_othercpus(const struct mbuf *m)
{
	uint32_t hash;

	if (ncpu == 1)
		return 0;

	hash = pktq_rps_hash_toeplitz(m);
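	/*
	 * Map the hash onto the ncpu - 1 CPU indices other than the
	 * current one: values at or above the current CPU's index are
	 * shifted up by one, so the current CPU is never selected.
	 */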
	hash %= ncpu - 1;
	if (hash >= cpu_index(curcpu()))
		return hash + 1;
	else
		return hash;
}

static struct pktq_rps_hash_table {
	const char* prh_type;
	pktq_rps_hash_func_t prh_func;
} const pktq_rps_hash_tab[] = {
	{ "zero", pktq_rps_hash_zero },
	{ "curcpu", pktq_rps_hash_curcpu },
	{ "toeplitz", pktq_rps_hash_toeplitz },
	{ "toeplitz-othercpus", pktq_rps_hash_toeplitz_othercpus },
};
const pktq_rps_hash_func_t pktq_rps_hash_default =
#ifdef NET_MPSAFE
	pktq_rps_hash_curcpu;
#else
	pktq_rps_hash_zero;
#endif

static const char *
pktq_get_rps_hash_type(pktq_rps_hash_func_t func)
{

	for (int i = 0; i < __arraycount(pktq_rps_hash_tab); i++) {
		if (func == pktq_rps_hash_tab[i].prh_func) {
			return pktq_rps_hash_tab[i].prh_type;
		}
	}

	return NULL;
}

static int
pktq_set_rps_hash_type(pktq_rps_hash_func_t *func, const char *type)
{

	if (strcmp(type, pktq_get_rps_hash_type(*func)) == 0)
		return 0;

	for (int i = 0; i < __arraycount(pktq_rps_hash_tab); i++) {
		if (strcmp(type, pktq_rps_hash_tab[i].prh_type) == 0) {
			atomic_store_relaxed(func, pktq_rps_hash_tab[i].prh_func);
			return 0;
		}
	}

	return ENOENT;
}

int
sysctl_pktq_rps_hash_handler(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	pktq_rps_hash_func_t *func;
	int error;
	char type[PKTQ_RPS_HASH_NAME_LEN];

	node = *rnode;
	func = node.sysctl_data;

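	/*
	 * Export the name of the current hash function; on a write,
	 * sysctl_lookup() copies the new name into 'type', which is
	 * then resolved and installed below.
	 */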
	strlcpy(type, pktq_get_rps_hash_type(*func), PKTQ_RPS_HASH_NAME_LEN);

	node.sysctl_data = &type;
	node.sysctl_size = sizeof(type);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	error = pktq_set_rps_hash_type(func, type);

	return error;
}

/*
 * pktq_enqueue: inject the packet into the end of the queue.
 *
 * => Must be called from interrupt context or with preemption disabled.
 * => Consumes the packet and returns true on success.
 * => Returns false on failure; the caller must free the packet.
 */
bool
pktq_enqueue(pktqueue_t *pq, struct mbuf *m, const u_int hash __unused)
{
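	/*
	 * Pick the target CPU from the RPS hash; rump kernels always
	 * use the current CPU.
	 */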
#if defined(_RUMPKERNEL) || defined(_RUMP_NATIVE_ABI)
	struct cpu_info *ci = curcpu();
#else
	struct cpu_info *ci = cpu_lookup(hash % ncpu);
#endif

	KASSERT(kpreempt_disabled());

	if (__predict_false(!pcq_put(pktq_pcq(pq, ci), m))) {
		pktq_inc_count(pq, PQCNT_DROP);
		return false;
	}
	softint_schedule_cpu(pq->pq_sih, ci);
	pktq_inc_count(pq, PQCNT_ENQUEUE);
	return true;
}

/*
 * pktq_dequeue: take a packet from the queue.
 *
 * => Must be called with preemption disabled.
 * => The caller must ensure there are no concurrent dequeue calls.
 */
struct mbuf *
pktq_dequeue(pktqueue_t *pq)
{
	struct cpu_info *ci = curcpu();
	struct mbuf *m;

	KASSERT(kpreempt_disabled());

	m = pcq_get(pktq_pcq(pq, ci));
	if (__predict_false(m == PKTQ_MARKER)) {
		/* Note the marker entry. */
		atomic_inc_uint(&pq->pq_barrier);

		/* Get the next queue entry. */
		m = pcq_get(pktq_pcq(pq, ci));

		/*
		 * There can only be one barrier operation pending
		 * on a pktqueue at any given time, so we can assert
		 * that the next item is not a marker.
		 */
		KASSERT(m != PKTQ_MARKER);
	}
	if (__predict_true(m != NULL)) {
		pktq_inc_count(pq, PQCNT_DEQUEUE);
	}
	return m;
}

/*
 * pktq_barrier: waits until all packets enqueued at the moment of
 * calling this routine have been processed.  This is used to ensure
 * that e.g. packets referencing some interface have been drained.
 */
void
pktq_barrier(pktqueue_t *pq)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	u_int pending = 0;

	mutex_enter(&pq->pq_lock);
	KASSERT(pq->pq_barrier == 0);

	for (CPU_INFO_FOREACH(cii, ci)) {
		struct pcq *q;

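		/*
		 * pktq_pcq() requires preemption to be disabled; the
		 * pointer remains valid afterwards because queue
		 * replacement (pktq_set_maxlen) also runs under
		 * pq_lock, which we hold.
		 */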
		kpreempt_disable();
		q = pktq_pcq(pq, ci);
		kpreempt_enable();

		/* If the queue is empty - nothing to do. */
		if (pcq_peek(q) == NULL) {
			continue;
		}
		/* Otherwise, insert the marker entry. */
		while (!pcq_put(q, PKTQ_MARKER)) {
			kpause("pktqsync", false, 1, NULL);
		}
		kpreempt_disable();
		softint_schedule_cpu(pq->pq_sih, ci);
		kpreempt_enable();
		pending++;
	}

	/* Wait for each queue to process the markers. */
	while (pq->pq_barrier != pending) {
		kpause("pktqsync", false, 1, NULL);
	}
	pq->pq_barrier = 0;
	mutex_exit(&pq->pq_lock);
}

/*
 * pktq_ifdetach: issue a barrier on all pktqueues when a network
 * interface is detached.
 */
void
pktq_ifdetach(void)
{
	pktqueue_t *pq;

	/* Just in case no pktqueues have been created yet... */
	RUN_ONCE(&pktqueue_list_init_once, pktqueue_list_init);

	rw_enter(&pktqueue_list_lock, RW_READER);
	LIST_FOREACH(pq, &pktqueue_list, pq_list) {
		pktq_barrier(pq);
	}
	rw_exit(&pktqueue_list_lock);
}

/*
 * pktq_flush: free mbufs in all queues.
 *
 * => The caller must ensure there are no concurrent writers or flush calls.
 */
void
pktq_flush(pktqueue_t *pq)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct mbuf *m, *m0 = NULL;

	ASSERT_SLEEPABLE();

	/*
	 * Run a dummy softint at IPL_SOFTNET on all CPUs to ensure that any
	 * already running handler for this pktqueue is no longer running.
	 */
	xc_barrier(XC_HIGHPRI_IPL(IPL_SOFTNET));

	/*
	 * Acquire the barrier lock.  While the caller ensures that
	 * no explicit pktq_barrier() calls will be issued, this holds
	 * off any implicit pktq_barrier() calls that would happen
	 * as the result of pktq_ifdetach().
	 */
	mutex_enter(&pq->pq_lock);

	for (CPU_INFO_FOREACH(cii, ci)) {
		struct pcq *q;

		kpreempt_disable();
		q = pktq_pcq(pq, ci);
		kpreempt_enable();

		/*
		 * Pull the packets off the pcq and chain them into
		 * a list to be freed later.
		 */
		while ((m = pcq_get(q)) != NULL) {
			pktq_inc_count(pq, PQCNT_DEQUEUE);
			m->m_nextpkt = m0;
			m0 = m;
		}
	}

	mutex_exit(&pq->pq_lock);

	/* Free the packets now that the critical section is over. */
	while ((m = m0) != NULL) {
		m0 = m->m_nextpkt;
		m_freem(m);
	}
}

static void
pktq_set_maxlen_cpu(void *vpq, void *vqs)
{
	struct pktqueue *pq = vpq;
	struct pcq **qp, *q, **qs = vqs;
	unsigned i = cpu_index(curcpu());
	int s;

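	/*
	 * Swap this CPU's queue pointer with the preallocated
	 * replacement; the old queue is handed back to
	 * pktq_set_maxlen() through the qs array for draining.
	 */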
	s = splnet();
	qp = percpu_getref(pq->pq_pcq);
	q = *qp;
	*qp = qs[i];
	qs[i] = q;
	percpu_putref(pq->pq_pcq);
	splx(s);
}

/*
 * pktq_set_maxlen: create per-CPU queues using a new size and replace
 * the existing queues without losing any packets.
 *
 * XXX ncpu must remain stable throughout.
 */
int
pktq_set_maxlen(pktqueue_t *pq, size_t maxlen)
{
	const u_int slotbytes = ncpu * sizeof(pcq_t *);
	pcq_t **qs;

	if (!maxlen || maxlen > PCQ_MAXLEN)
		return EINVAL;
	if (pq->pq_maxlen == maxlen)
		return 0;

	/* First, allocate the new queues. */
	qs = kmem_zalloc(slotbytes, KM_SLEEP);
	for (u_int i = 0; i < ncpu; i++) {
		qs[i] = pcq_create(maxlen, KM_SLEEP);
	}

	/*
	 * Issue an xcall to replace the queue pointers on each CPU.
	 * This implies all the necessary memory barriers.
	 */
	mutex_enter(&pq->pq_lock);
	xc_wait(xc_broadcast(XC_HIGHPRI, pktq_set_maxlen_cpu, pq, qs));
	pq->pq_maxlen = maxlen;
	mutex_exit(&pq->pq_lock);

	/*
	 * At this point, the new packets are flowing into the new
	 * queues.  However, the old queues may have some packets
	 * present which are no longer being processed.  We are going
	 * to re-enqueue them.  This may change the order of packet
	 * arrival, but it is not considered an issue.
	 *
	 * There may be in-flight interrupts calling pktq_dequeue()
	 * which reference the old queues.  Issue a barrier to ensure
	 * that we are going to be the only pcq_get() callers on the
	 * old queues.
	 */
	pktq_barrier(pq);

	for (u_int i = 0; i < ncpu; i++) {
		struct pcq *q;
		struct mbuf *m;

		kpreempt_disable();
		q = pktq_pcq(pq, cpu_lookup(i));
		kpreempt_enable();

		while ((m = pcq_get(qs[i])) != NULL) {
			while (!pcq_put(q, m)) {
				kpause("pktqrenq", false, 1, NULL);
			}
		}
		pcq_destroy(qs[i]);
	}

	/* Well, that was fun. */
	kmem_free(qs, slotbytes);
	return 0;
}

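/*
 * sysctl handler for the "maxlen" node: report the current limit and,
 * on write, resize the per-CPU queues via pktq_set_maxlen().
 */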
static int
sysctl_pktq_maxlen(SYSCTLFN_ARGS)
{
	struct sysctlnode node = *rnode;
	pktqueue_t * const pq = node.sysctl_data;
	u_int nmaxlen = pktq_get_count(pq, PKTQ_MAXLEN);
	int error;

	node.sysctl_data = &nmaxlen;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;
	return pktq_set_maxlen(pq, nmaxlen);
}

static int
sysctl_pktq_count(SYSCTLFN_ARGS, u_int count_id)
{
	struct sysctlnode node = *rnode;
	pktqueue_t * const pq = node.sysctl_data;
	uint64_t count = pktq_get_count(pq, count_id);

	node.sysctl_data = &count;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_pktq_nitems(SYSCTLFN_ARGS)
{
	return sysctl_pktq_count(SYSCTLFN_CALL(rnode), PKTQ_NITEMS);
}

static int
sysctl_pktq_drops(SYSCTLFN_ARGS)
{
	return sysctl_pktq_count(SYSCTLFN_CALL(rnode), PKTQ_DROPS);
}

/*
 * pktq_sysctl_setup: set up the sysctl nodes for a pktqueue
 * using standardized names at the specified parent node and
 * node ID (or CTL_CREATE).
 */
void
pktq_sysctl_setup(pktqueue_t * const pq, struct sysctllog ** const clog,
		  const struct sysctlnode * const parent_node, const int qid)
{
	const struct sysctlnode *rnode = parent_node, *cnode;

	KASSERT(pq != NULL);
	KASSERT(parent_node != NULL);
	KASSERT(qid == CTL_CREATE || qid >= 0);

	/* Create the "ifq" node below the parent node. */
	sysctl_createv(clog, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "ifq",
		       SYSCTL_DESCR("Protocol input queue controls"),
		       NULL, 0, NULL, 0,
		       qid, CTL_EOL);

	/* Now create the standard child nodes below "ifq". */
	rnode = cnode;

	sysctl_createv(clog, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_QUAD, "len",
		       SYSCTL_DESCR("Current input queue length"),
		       sysctl_pktq_nitems, 0, (void *)pq, 0,
		       IFQCTL_LEN, CTL_EOL);
	sysctl_createv(clog, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		       CTLTYPE_INT, "maxlen",
		       SYSCTL_DESCR("Maximum allowed input queue length"),
		       sysctl_pktq_maxlen, 0, (void *)pq, 0,
		       IFQCTL_MAXLEN, CTL_EOL);
	sysctl_createv(clog, 0, &rnode, &cnode,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_QUAD, "drops",
		       SYSCTL_DESCR("Packets dropped due to full input queue"),
		       sysctl_pktq_drops, 0, (void *)pq, 0,
		       IFQCTL_DROPS, CTL_EOL);
}