Home | History | Annotate | Line # | Download | only in net
pktqueue.c revision 1.8.8.1
      1  1.8.8.1  skrll /*	$NetBSD: pktqueue.c,v 1.8.8.1 2017/08/28 17:53:11 skrll Exp $	*/
      2      1.1  rmind 
      3      1.1  rmind /*-
      4      1.1  rmind  * Copyright (c) 2014 The NetBSD Foundation, Inc.
      5      1.1  rmind  * All rights reserved.
      6      1.1  rmind  *
      7      1.1  rmind  * This code is derived from software contributed to The NetBSD Foundation
      8      1.1  rmind  * by Mindaugas Rasiukevicius.
      9      1.1  rmind  *
     10      1.1  rmind  * Redistribution and use in source and binary forms, with or without
     11      1.1  rmind  * modification, are permitted provided that the following conditions
     12      1.1  rmind  * are met:
     13      1.1  rmind  * 1. Redistributions of source code must retain the above copyright
     14      1.1  rmind  *    notice, this list of conditions and the following disclaimer.
     15      1.1  rmind  * 2. Redistributions in binary form must reproduce the above copyright
     16      1.1  rmind  *    notice, this list of conditions and the following disclaimer in the
     17      1.1  rmind  *    documentation and/or other materials provided with the distribution.
     18      1.1  rmind  *
     19      1.1  rmind  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20      1.1  rmind  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21      1.1  rmind  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22      1.1  rmind  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23      1.1  rmind  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24      1.1  rmind  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25      1.1  rmind  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26      1.1  rmind  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27      1.1  rmind  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28      1.1  rmind  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29      1.1  rmind  * POSSIBILITY OF SUCH DAMAGE.
     30      1.1  rmind  */
     31      1.1  rmind 
     32      1.4  rmind /*
     33      1.4  rmind  * The packet queue (pktqueue) interface is a lockless IP input queue
     34      1.4  rmind  * which also abstracts and handles network ISR scheduling.  It provides
     35      1.4  rmind  * a mechanism to enable receiver-side packet steering (RPS).
     36      1.4  rmind  */
     37      1.4  rmind 
     38      1.1  rmind #include <sys/cdefs.h>
     39  1.8.8.1  skrll __KERNEL_RCSID(0, "$NetBSD: pktqueue.c,v 1.8.8.1 2017/08/28 17:53:11 skrll Exp $");
     40      1.1  rmind 
     41      1.1  rmind #include <sys/param.h>
     42      1.1  rmind #include <sys/types.h>
     43      1.1  rmind 
     44      1.1  rmind #include <sys/atomic.h>
     45      1.1  rmind #include <sys/cpu.h>
     46      1.1  rmind #include <sys/pcq.h>
     47      1.1  rmind #include <sys/intr.h>
     48      1.1  rmind #include <sys/mbuf.h>
     49      1.1  rmind #include <sys/proc.h>
     50      1.1  rmind #include <sys/percpu.h>
     51      1.1  rmind 
     52      1.1  rmind #include <net/pktqueue.h>
     53      1.1  rmind 
/*
 * Size of the padding which follows the lock/barrier group at the start
 * of struct pktqueue.
 *
 * A whole coherency unit is used, so the fields after the padding start
 * at least one cache line past the barrier counter no matter how large
 * the lock is.  The previous expression,
 * MAX(COHERENCY_UNIT, COHERENCY_UNIT - sizeof(kmutex_t) - sizeof(u_int)),
 * always evaluated to COHERENCY_UNIT as well, except that its unsigned
 * subtraction would wrap to an enormous array length if the lock group
 * ever grew beyond a coherency unit.
 */
#define	PKTQ_CLPAD	COHERENCY_UNIT
     59      1.1  rmind 
     60      1.1  rmind struct pktqueue {
     61      1.1  rmind 	/*
     62      1.1  rmind 	 * The lock used for a barrier mechanism.  The barrier counter,
     63      1.1  rmind 	 * as well as the drop counter, are managed atomically though.
     64      1.1  rmind 	 * Ensure this group is in a separate cache line.
     65      1.1  rmind 	 */
     66      1.1  rmind 	kmutex_t	pq_lock;
     67      1.1  rmind 	volatile u_int	pq_barrier;
     68      1.1  rmind 	uint8_t		_pad[PKTQ_CLPAD];
     69      1.1  rmind 
     70      1.1  rmind 	/* The size of the queue, counters and the interrupt handler. */
     71      1.1  rmind 	u_int		pq_maxlen;
     72      1.1  rmind 	percpu_t *	pq_counters;
     73      1.1  rmind 	void *		pq_sih;
     74      1.1  rmind 
     75      1.1  rmind 	/* Finally, per-CPU queues. */
     76      1.1  rmind 	pcq_t *		pq_queue[];
     77      1.1  rmind };
     78      1.1  rmind 
/*
 * Indices into the per-CPU statistics array.  PQCNT_NCOUNTERS is not a
 * counter itself; it is the number of slots in the array.
 */
enum {
	PQCNT_ENQUEUE = 0,	/* packets accepted by pktq_enqueue() */
	PQCNT_DEQUEUE,		/* packets taken out of a queue */
	PQCNT_DROP,		/* packets refused because the queue was full */
	PQCNT_NCOUNTERS
};

/* Per-CPU statistics block; one instance per CPU lives in pq_counters. */
typedef struct {
	uint64_t	count[PQCNT_NCOUNTERS];
} pktq_counters_t;
     88      1.1  rmind 
/*
 * Special marker value used by the pktq_barrier() mechanism: an all-ones
 * pointer which pktq_dequeue() recognises and never hands to the caller.
 * It is built from uintptr_t so that the integer being cast has exactly
 * the width of a pointer on every platform.
 */
#define	PKTQ_MARKER	((void *)(~(uintptr_t)0))
     91      1.1  rmind 
/*
 * Allocation size of a pktqueue_t holding `nq' per-CPU queue pointers:
 * the offset just past the last pq_queue[] slot, rounded up to a
 * multiple of coherency_unit.
 */
#define	PKTQUEUE_STRUCT_LEN(nq)	\
    roundup2(offsetof(pktqueue_t, pq_queue[(nq)]), coherency_unit)
     97      1.1  rmind 
     98      1.1  rmind pktqueue_t *
     99      1.5  ozaki pktq_create(size_t maxlen, void (*intrh)(void *), void *sc)
    100      1.1  rmind {
    101      1.1  rmind 	const u_int sflags = SOFTINT_NET | SOFTINT_MPSAFE | SOFTINT_RCPU;
    102      1.1  rmind 	const size_t len = PKTQUEUE_STRUCT_LEN(ncpu);
    103      1.1  rmind 	pktqueue_t *pq;
    104      1.1  rmind 	percpu_t *pc;
    105      1.1  rmind 	void *sih;
    106      1.1  rmind 
    107  1.8.8.1  skrll 	pc = percpu_alloc(sizeof(pktq_counters_t));
    108      1.5  ozaki 	if ((sih = softint_establish(sflags, intrh, sc)) == NULL) {
    109      1.1  rmind 		percpu_free(pc, sizeof(pktq_counters_t));
    110      1.1  rmind 		return NULL;
    111      1.1  rmind 	}
    112      1.1  rmind 
    113      1.1  rmind 	pq = kmem_zalloc(len, KM_SLEEP);
    114      1.1  rmind 	for (u_int i = 0; i < ncpu; i++) {
    115      1.1  rmind 		pq->pq_queue[i] = pcq_create(maxlen, KM_SLEEP);
    116      1.1  rmind 	}
    117      1.1  rmind 	mutex_init(&pq->pq_lock, MUTEX_DEFAULT, IPL_NONE);
    118      1.1  rmind 	pq->pq_maxlen = maxlen;
    119      1.1  rmind 	pq->pq_counters = pc;
    120      1.1  rmind 	pq->pq_sih = sih;
    121      1.1  rmind 
    122      1.1  rmind 	return pq;
    123      1.1  rmind }
    124      1.1  rmind 
    125      1.1  rmind void
    126      1.1  rmind pktq_destroy(pktqueue_t *pq)
    127      1.1  rmind {
    128      1.1  rmind 	const size_t len = PKTQUEUE_STRUCT_LEN(ncpu);
    129      1.1  rmind 
    130      1.1  rmind 	for (u_int i = 0; i < ncpu; i++) {
    131      1.1  rmind 		pcq_t *q = pq->pq_queue[i];
    132      1.1  rmind 		KASSERT(pcq_peek(q) == NULL);
    133      1.1  rmind 		pcq_destroy(q);
    134      1.1  rmind 	}
    135      1.1  rmind 	percpu_free(pq->pq_counters, sizeof(pktq_counters_t));
    136      1.1  rmind 	softint_disestablish(pq->pq_sih);
    137      1.1  rmind 	mutex_destroy(&pq->pq_lock);
    138      1.1  rmind 	kmem_free(pq, len);
    139      1.1  rmind }
    140      1.1  rmind 
/*
 * - pktq_inc_count: increment the counter given an ID.
 * - pktq_collect_counts: handler to sum up the counts from each CPU.
 * - pktq_get_count: return the effective count given an ID.
 */
    146      1.1  rmind 
    147      1.1  rmind static inline void
    148      1.1  rmind pktq_inc_count(pktqueue_t *pq, u_int i)
    149      1.1  rmind {
    150      1.1  rmind 	percpu_t *pc = pq->pq_counters;
    151      1.1  rmind 	pktq_counters_t *c;
    152      1.1  rmind 
    153      1.1  rmind 	c = percpu_getref(pc);
    154      1.1  rmind 	c->count[i]++;
    155      1.1  rmind 	percpu_putref(pc);
    156      1.1  rmind }
    157      1.1  rmind 
    158      1.1  rmind static void
    159      1.1  rmind pktq_collect_counts(void *mem, void *arg, struct cpu_info *ci)
    160      1.1  rmind {
    161      1.1  rmind 	const pktq_counters_t *c = mem;
    162      1.1  rmind 	pktq_counters_t *sum = arg;
    163      1.1  rmind 
    164      1.1  rmind 	for (u_int i = 0; i < PQCNT_NCOUNTERS; i++) {
    165      1.1  rmind 		sum->count[i] += c->count[i];
    166      1.1  rmind 	}
    167      1.1  rmind }
    168      1.1  rmind 
    169      1.1  rmind uint64_t
    170      1.1  rmind pktq_get_count(pktqueue_t *pq, pktq_count_t c)
    171      1.1  rmind {
    172      1.1  rmind 	pktq_counters_t sum;
    173      1.1  rmind 
    174      1.1  rmind 	if (c != PKTQ_MAXLEN) {
    175      1.1  rmind 		memset(&sum, 0, sizeof(sum));
    176      1.1  rmind 		percpu_foreach(pq->pq_counters, pktq_collect_counts, &sum);
    177      1.1  rmind 	}
    178      1.1  rmind 	switch (c) {
    179      1.1  rmind 	case PKTQ_NITEMS:
    180      1.1  rmind 		return sum.count[PQCNT_ENQUEUE] - sum.count[PQCNT_DEQUEUE];
    181      1.1  rmind 	case PKTQ_DROPS:
    182      1.1  rmind 		return sum.count[PQCNT_DROP];
    183      1.1  rmind 	case PKTQ_MAXLEN:
    184      1.1  rmind 		return pq->pq_maxlen;
    185      1.1  rmind 	}
    186      1.1  rmind 	return 0;
    187      1.1  rmind }
    188      1.1  rmind 
    189      1.1  rmind uint32_t
    190      1.1  rmind pktq_rps_hash(const struct mbuf *m __unused)
    191      1.1  rmind {
    192      1.1  rmind 	/*
    193      1.1  rmind 	 * XXX: No distribution yet; the softnet_lock contention
    194      1.1  rmind 	 * XXX: must be eliminated first.
    195      1.1  rmind 	 */
    196      1.1  rmind 	return 0;
    197      1.1  rmind }
    198      1.1  rmind 
    199      1.1  rmind /*
    200      1.1  rmind  * pktq_enqueue: inject the packet into the end of the queue.
    201      1.1  rmind  *
    202      1.1  rmind  * => Must be called from the interrupt or with the preemption disabled.
    203      1.1  rmind  * => Consumes the packet and returns true on success.
    204      1.1  rmind  * => Returns false on failure; caller is responsible to free the packet.
    205      1.1  rmind  */
    206      1.1  rmind bool
    207      1.3  rmind pktq_enqueue(pktqueue_t *pq, struct mbuf *m, const u_int hash __unused)
    208      1.1  rmind {
    209      1.8  ozaki #if defined(_RUMPKERNEL) || defined(_RUMP_NATIVE_ABI)
    210      1.7  ozaki 	const unsigned cpuid = curcpu()->ci_index;
    211      1.7  ozaki #else
    212      1.7  ozaki 	const unsigned cpuid = hash % ncpu;
    213      1.7  ozaki #endif
    214      1.1  rmind 
    215      1.1  rmind 	KASSERT(kpreempt_disabled());
    216      1.1  rmind 
    217      1.1  rmind 	if (__predict_false(!pcq_put(pq->pq_queue[cpuid], m))) {
    218      1.1  rmind 		pktq_inc_count(pq, PQCNT_DROP);
    219      1.1  rmind 		return false;
    220      1.1  rmind 	}
    221      1.1  rmind 	softint_schedule_cpu(pq->pq_sih, cpu_lookup(cpuid));
    222      1.1  rmind 	pktq_inc_count(pq, PQCNT_ENQUEUE);
    223      1.1  rmind 	return true;
    224      1.1  rmind }
    225      1.1  rmind 
    226      1.1  rmind /*
    227      1.1  rmind  * pktq_dequeue: take a packet from the queue.
    228      1.1  rmind  *
    229      1.1  rmind  * => Must be called with preemption disabled.
    230      1.1  rmind  * => Must ensure there are not concurrent dequeue calls.
    231      1.1  rmind  */
    232      1.1  rmind struct mbuf *
    233      1.1  rmind pktq_dequeue(pktqueue_t *pq)
    234      1.1  rmind {
    235      1.1  rmind 	const struct cpu_info *ci = curcpu();
    236      1.1  rmind 	const unsigned cpuid = cpu_index(ci);
    237      1.1  rmind 	struct mbuf *m;
    238      1.1  rmind 
    239      1.1  rmind 	m = pcq_get(pq->pq_queue[cpuid]);
    240      1.1  rmind 	if (__predict_false(m == PKTQ_MARKER)) {
    241      1.1  rmind 		/* Note the marker entry. */
    242      1.1  rmind 		atomic_inc_uint(&pq->pq_barrier);
    243      1.1  rmind 		return NULL;
    244      1.1  rmind 	}
    245      1.1  rmind 	if (__predict_true(m != NULL)) {
    246      1.1  rmind 		pktq_inc_count(pq, PQCNT_DEQUEUE);
    247      1.1  rmind 	}
    248      1.1  rmind 	return m;
    249      1.1  rmind }
    250      1.1  rmind 
    251      1.1  rmind /*
    252      1.1  rmind  * pktq_barrier: waits for a grace period when all packets enqueued at
    253      1.1  rmind  * the moment of calling this routine will be processed.  This is used
    254      1.1  rmind  * to ensure that e.g. packets referencing some interface were drained.
    255      1.1  rmind  */
    256      1.1  rmind void
    257      1.1  rmind pktq_barrier(pktqueue_t *pq)
    258      1.1  rmind {
    259      1.1  rmind 	u_int pending = 0;
    260      1.1  rmind 
    261      1.1  rmind 	mutex_enter(&pq->pq_lock);
    262      1.1  rmind 	KASSERT(pq->pq_barrier == 0);
    263      1.1  rmind 
    264      1.1  rmind 	for (u_int i = 0; i < ncpu; i++) {
    265      1.1  rmind 		pcq_t *q = pq->pq_queue[i];
    266      1.1  rmind 
    267      1.1  rmind 		/* If the queue is empty - nothing to do. */
    268      1.1  rmind 		if (pcq_peek(q) == NULL) {
    269      1.1  rmind 			continue;
    270      1.1  rmind 		}
    271      1.1  rmind 		/* Otherwise, put the marker and entry. */
    272      1.1  rmind 		while (!pcq_put(q, PKTQ_MARKER)) {
    273      1.1  rmind 			kpause("pktqsync", false, 1, NULL);
    274      1.1  rmind 		}
    275      1.1  rmind 		kpreempt_disable();
    276      1.1  rmind 		softint_schedule_cpu(pq->pq_sih, cpu_lookup(i));
    277      1.1  rmind 		kpreempt_enable();
    278      1.1  rmind 		pending++;
    279      1.1  rmind 	}
    280      1.1  rmind 
    281      1.1  rmind 	/* Wait for each queue to process the markers. */
    282      1.1  rmind 	while (pq->pq_barrier != pending) {
    283      1.1  rmind 		kpause("pktqsync", false, 1, NULL);
    284      1.1  rmind 	}
    285      1.1  rmind 	pq->pq_barrier = 0;
    286      1.1  rmind 	mutex_exit(&pq->pq_lock);
    287      1.1  rmind }
    288      1.1  rmind 
    289      1.1  rmind /*
    290      1.1  rmind  * pktq_flush: free mbufs in all queues.
    291      1.1  rmind  *
    292      1.4  rmind  * => The caller must ensure there are no concurrent writers or flush calls.
    293      1.1  rmind  */
    294      1.1  rmind void
    295      1.1  rmind pktq_flush(pktqueue_t *pq)
    296      1.1  rmind {
    297      1.1  rmind 	struct mbuf *m;
    298      1.1  rmind 
    299      1.1  rmind 	for (u_int i = 0; i < ncpu; i++) {
    300      1.1  rmind 		while ((m = pcq_get(pq->pq_queue[i])) != NULL) {
    301      1.1  rmind 			pktq_inc_count(pq, PQCNT_DEQUEUE);
    302      1.1  rmind 			m_freem(m);
    303      1.1  rmind 		}
    304      1.1  rmind 	}
    305      1.1  rmind }
    306      1.2  rmind 
    307      1.2  rmind /*
    308      1.2  rmind  * pktq_set_maxlen: create per-CPU queues using a new size and replace
    309      1.2  rmind  * the existing queues without losing any packets.
    310      1.2  rmind  */
    311      1.2  rmind int
    312      1.2  rmind pktq_set_maxlen(pktqueue_t *pq, size_t maxlen)
    313      1.2  rmind {
    314      1.2  rmind 	const u_int slotbytes = ncpu * sizeof(pcq_t *);
    315      1.2  rmind 	pcq_t **qs;
    316      1.2  rmind 
    317      1.2  rmind 	if (!maxlen || maxlen > PCQ_MAXLEN)
    318      1.2  rmind 		return EINVAL;
    319      1.2  rmind 	if (pq->pq_maxlen == maxlen)
    320      1.2  rmind 		return 0;
    321      1.2  rmind 
    322      1.2  rmind 	/* First, allocate the new queues and replace them. */
    323      1.2  rmind 	qs = kmem_zalloc(slotbytes, KM_SLEEP);
    324      1.2  rmind 	for (u_int i = 0; i < ncpu; i++) {
    325      1.2  rmind 		qs[i] = pcq_create(maxlen, KM_SLEEP);
    326      1.2  rmind 	}
    327      1.2  rmind 	mutex_enter(&pq->pq_lock);
    328      1.2  rmind 	for (u_int i = 0; i < ncpu; i++) {
    329      1.2  rmind 		/* Swap: store of a word is atomic. */
    330      1.2  rmind 		pcq_t *q = pq->pq_queue[i];
    331      1.2  rmind 		pq->pq_queue[i] = qs[i];
    332      1.2  rmind 		qs[i] = q;
    333      1.2  rmind 	}
    334      1.2  rmind 	pq->pq_maxlen = maxlen;
    335      1.2  rmind 	mutex_exit(&pq->pq_lock);
    336      1.2  rmind 
    337      1.2  rmind 	/*
    338      1.2  rmind 	 * At this point, the new packets are flowing into the new
    339      1.4  rmind 	 * queues.  However, the old queues may have some packets
    340      1.4  rmind 	 * present which are no longer being processed.  We are going
    341      1.2  rmind 	 * to re-enqueue them.  This may change the order of packet
    342      1.2  rmind 	 * arrival, but it is not considered an issue.
    343      1.2  rmind 	 *
    344      1.4  rmind 	 * There may be in-flight interrupts calling pktq_dequeue()
    345      1.2  rmind 	 * which reference the old queues.  Issue a barrier to ensure
    346      1.2  rmind 	 * that we are going to be the only pcq_get() callers on the
    347      1.2  rmind 	 * old queues.
    348      1.2  rmind 	 */
    349      1.2  rmind 	pktq_barrier(pq);
    350      1.2  rmind 
    351      1.2  rmind 	for (u_int i = 0; i < ncpu; i++) {
    352      1.2  rmind 		struct mbuf *m;
    353      1.2  rmind 
    354      1.2  rmind 		while ((m = pcq_get(qs[i])) != NULL) {
    355      1.2  rmind 			while (!pcq_put(pq->pq_queue[i], m)) {
    356      1.2  rmind 				kpause("pktqrenq", false, 1, NULL);
    357      1.2  rmind 			}
    358      1.2  rmind 		}
    359      1.2  rmind 		pcq_destroy(qs[i]);
    360      1.2  rmind 	}
    361      1.2  rmind 
    362      1.2  rmind 	/* Well, that was fun. */
    363      1.2  rmind 	kmem_free(qs, slotbytes);
    364      1.2  rmind 	return 0;
    365      1.2  rmind }
    366      1.6  ozaki 
    367      1.6  ozaki int
    368      1.6  ozaki sysctl_pktq_maxlen(SYSCTLFN_ARGS, pktqueue_t *pq)
    369      1.6  ozaki {
    370      1.6  ozaki 	u_int nmaxlen = pktq_get_count(pq, PKTQ_MAXLEN);
    371      1.6  ozaki 	struct sysctlnode node = *rnode;
    372      1.6  ozaki 	int error;
    373      1.6  ozaki 
    374      1.6  ozaki 	node.sysctl_data = &nmaxlen;
    375      1.6  ozaki 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
    376      1.6  ozaki 	if (error || newp == NULL)
    377      1.6  ozaki 		return error;
    378      1.6  ozaki 	return pktq_set_maxlen(pq, nmaxlen);
    379      1.6  ozaki }
    380      1.6  ozaki 
    381      1.6  ozaki int
    382      1.6  ozaki sysctl_pktq_count(SYSCTLFN_ARGS, pktqueue_t *pq, u_int count_id)
    383      1.6  ozaki {
    384      1.6  ozaki 	int count = pktq_get_count(pq, count_id);
    385      1.6  ozaki 	struct sysctlnode node = *rnode;
    386      1.6  ozaki 	node.sysctl_data = &count;
    387      1.6  ozaki 	return sysctl_lookup(SYSCTLFN_CALL(&node));
    388      1.6  ozaki }
    389