/*	$NetBSD: subr_percpu.c,v 1.21 2020/02/01 12:49:02 riastradh Exp $	*/

/*-
 * Copyright (c)2007,2008 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * per-cpu storage.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.21 2020/02/01 12:49:02 riastradh Exp $");

#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/rwlock.h>
#include <sys/vmem.h>
#include <sys/xcall.h>

#define	PERCPU_QUANTUM_SIZE	(ALIGNBYTES + 1)
#define	PERCPU_QCACHE_MAX	0
#define	PERCPU_IMPORT_SIZE	2048

struct percpu {
	unsigned		pc_offset;
	size_t			pc_size;
	percpu_callback_t	pc_dtor;
	void			*pc_cookie;
};

static krwlock_t	percpu_swap_lock	__cacheline_aligned;
static kmutex_t		percpu_allocation_lock	__cacheline_aligned;
static vmem_t *		percpu_offset_arena	__cacheline_aligned;
static unsigned int	percpu_nextoff		__cacheline_aligned;

static percpu_cpu_t *
cpu_percpu(struct cpu_info *ci)
{

	return &ci->ci_data.cpu_percpu;
}

static unsigned int
percpu_offset(percpu_t *pc)
{
	const unsigned int off = pc->pc_offset;

	KASSERT(off < percpu_nextoff);
	return off;
}

/*
 * percpu_cpu_swap: crosscall handler for percpu_cpu_enlarge
 */
__noubsan
static void
percpu_cpu_swap(void *p1, void *p2)
{
	struct cpu_info * const ci = p1;
	percpu_cpu_t * const newpcc = p2;
	percpu_cpu_t * const pcc = cpu_percpu(ci);

	KASSERT(ci == curcpu() || !mp_online);

	/*
	 * swap *pcc and *newpcc unless someone has beaten us to it.
	 */
	rw_enter(&percpu_swap_lock, RW_WRITER);
	if (newpcc->pcc_size > pcc->pcc_size) {
		percpu_cpu_t tmp;
		int s;

		tmp = *pcc;

		/*
		 * block interrupts so that we don't lose their modifications.
		 */

		s = splhigh();

		/*
		 * copy data to new storage.
		 */

		memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size);

		/*
		 * this assignment needs to be atomic for percpu_getptr_remote.
		 */

		pcc->pcc_data = newpcc->pcc_data;

		splx(s);

		pcc->pcc_size = newpcc->pcc_size;
		*newpcc = tmp;
	}
	rw_exit(&percpu_swap_lock);
}

/*
 * percpu_cpu_enlarge: ensure that each cpu's percpu_cpu_t has enough space
 */

static void
percpu_cpu_enlarge(size_t size)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	for (CPU_INFO_FOREACH(cii, ci)) {
		percpu_cpu_t pcc;

		pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */
		pcc.pcc_size = size;
		if (!mp_online) {
			percpu_cpu_swap(ci, &pcc);
		} else {
			uint64_t where;

			where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci);
			xc_wait(where);
		}
		KASSERT(pcc.pcc_size <= size);
		if (pcc.pcc_data != NULL) {
			kmem_free(pcc.pcc_data, pcc.pcc_size);
		}
	}
}

/*
 * percpu_backend_alloc: vmem import callback for percpu_offset_arena
 */

static int
percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize,
    vm_flag_t vmflags, vmem_addr_t *addrp)
{
	unsigned int offset;
	unsigned int nextoff;

	ASSERT_SLEEPABLE();
	KASSERT(dummy == NULL);

	if ((vmflags & VM_NOSLEEP) != 0)
		return ENOMEM;

	size = roundup(size, PERCPU_IMPORT_SIZE);
	mutex_enter(&percpu_allocation_lock);
	offset = percpu_nextoff;
	percpu_nextoff = nextoff = percpu_nextoff + size;
	mutex_exit(&percpu_allocation_lock);

	percpu_cpu_enlarge(nextoff);

	*resultsize = size;
	*addrp = (vmem_addr_t)offset;
	return 0;
}

static void
percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci)
{
	size_t sz = (uintptr_t)vp2;

	memset(vp, 0, sz);
}

/*
 * percpu_zero: initialize percpu storage with zeros.
 */

static void
percpu_zero(percpu_t *pc, size_t sz)
{

	percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz);
}

/*
 * percpu_init: subsystem initialization
 */

void
percpu_init(void)
{

	ASSERT_SLEEPABLE();
	rw_init(&percpu_swap_lock);
	mutex_init(&percpu_allocation_lock, MUTEX_DEFAULT, IPL_NONE);
	percpu_nextoff = PERCPU_QUANTUM_SIZE;

	percpu_offset_arena = vmem_xcreate("percpu", 0, 0, PERCPU_QUANTUM_SIZE,
	    percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP,
	    IPL_NONE);
}

/*
 * percpu_init_cpu: cpu initialization
 *
 * => should be called before the cpu appears on the list for CPU_INFO_FOREACH.
 */

void
percpu_init_cpu(struct cpu_info *ci)
{
	percpu_cpu_t * const pcc = cpu_percpu(ci);
	size_t size = percpu_nextoff; /* XXX racy */

	ASSERT_SLEEPABLE();
	pcc->pcc_size = size;
	if (size) {
		pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP);
	}
}

/*
 * percpu_alloc: allocate percpu storage
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 * => allocated storage is initialized with zeros.
 */

percpu_t *
percpu_alloc(size_t size)
{

	return percpu_create(size, NULL, NULL, NULL);
}
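
/*
 * Example (a hypothetical sketch, not part of this file): a subsystem
 * keeping one uint64_t event counter per cpu.  "foo_count_percpu" is an
 * invented name; the storage comes back zeroed and must eventually be
 * freed with the same size.
 *
 *	static percpu_t *foo_count_percpu;
 *
 *	foo_count_percpu = percpu_alloc(sizeof(uint64_t));
 *	...
 *	percpu_free(foo_count_percpu, sizeof(uint64_t));
 */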

/*
 * percpu_create: allocate percpu storage and associate ctor/dtor with it
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 * => allocated storage is initialized by ctor, or with zeros if ctor is NULL
 * => percpu_free will call dtor first, if dtor is non-NULL
 * => ctor or dtor may sleep, even on allocation
 */

percpu_t *
percpu_create(size_t size, percpu_callback_t ctor, percpu_callback_t dtor,
    void *cookie)
{
	vmem_addr_t offset;
	percpu_t *pc;

	ASSERT_SLEEPABLE();
	(void)vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT,
	    &offset);

	pc = kmem_alloc(sizeof(*pc), KM_SLEEP);
	pc->pc_offset = offset;
	pc->pc_size = size;
	pc->pc_dtor = dtor;
	pc->pc_cookie = cookie;

	if (ctor) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		void *buf;

		buf = kmem_alloc(size, KM_SLEEP);
		for (CPU_INFO_FOREACH(cii, ci)) {
			memset(buf, 0, size);
			(*ctor)(buf, cookie, ci);
			percpu_traverse_enter();
			memcpy(percpu_getptr_remote(pc, ci), buf, size);
			percpu_traverse_exit();
		}
		explicit_memset(buf, 0, size);
		kmem_free(buf, size);
	} else {
		percpu_zero(pc, size);
	}

	return pc;
}
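
/*
 * Example (a hypothetical sketch, not part of this file): giving each
 * cpu a private scratch buffer, allocated by a ctor and released by a
 * dtor when percpu_free is called.  "struct foo_pcpu", "foo_pcpu_ctor"
 * and "foo_pcpu_dtor" are invented names.  Note that the ctor sleeps
 * (KM_SLEEP), which is explicitly permitted.
 *
 *	struct foo_pcpu {
 *		char	*fp_buf;	// per-cpu scratch buffer
 *	};
 *
 *	static void
 *	foo_pcpu_ctor(void *ptr, void *cookie, struct cpu_info *ci)
 *	{
 *		struct foo_pcpu *fp = ptr;
 *
 *		fp->fp_buf = kmem_zalloc(PAGE_SIZE, KM_SLEEP);
 *	}
 *
 *	static void
 *	foo_pcpu_dtor(void *ptr, void *cookie, struct cpu_info *ci)
 *	{
 *		struct foo_pcpu *fp = ptr;
 *
 *		kmem_free(fp->fp_buf, PAGE_SIZE);
 *	}
 *
 *	pc = percpu_create(sizeof(struct foo_pcpu),
 *	    foo_pcpu_ctor, foo_pcpu_dtor, NULL);
 *	...
 *	percpu_free(pc, sizeof(struct foo_pcpu));
 */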

/*
 * percpu_free: free percpu storage
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 */

void
percpu_free(percpu_t *pc, size_t size)
{

	ASSERT_SLEEPABLE();
	KASSERT(size == pc->pc_size);

	if (pc->pc_dtor) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		void *buf;

		buf = kmem_alloc(size, KM_SLEEP);
		for (CPU_INFO_FOREACH(cii, ci)) {
			percpu_traverse_enter();
			memcpy(buf, percpu_getptr_remote(pc, ci), size);
			explicit_memset(percpu_getptr_remote(pc, ci), 0, size);
			percpu_traverse_exit();
			(*pc->pc_dtor)(buf, pc->pc_cookie, ci);
		}
		explicit_memset(buf, 0, size);
		kmem_free(buf, size);
	}

	vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size);
	kmem_free(pc, sizeof(*pc));
}

/*
 * percpu_getref:
 *
 * => safe to be used in either thread or interrupt context
 * => disables preemption; must be bracketed with a percpu_putref()
 */

void *
percpu_getref(percpu_t *pc)
{

	kpreempt_disable();
	return percpu_getptr_remote(pc, curcpu());
}

/*
 * percpu_putref:
 *
 * => drops the preemption-disabled count after caller is done with per-cpu
 *    data
 */

void
percpu_putref(percpu_t *pc)
{

	kpreempt_enable();
}
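
/*
 * Example (hypothetical sketch): bumping the per-cpu counter from the
 * percpu_alloc example above on the local cpu.  Preemption stays
 * disabled between the two calls, so the pointer remains valid and
 * cpu-local; no further locking is needed for cpu-private updates.
 *
 *	uint64_t *p;
 *
 *	p = percpu_getref(foo_count_percpu);
 *	(*p)++;
 *	percpu_putref(foo_count_percpu);
 */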

/*
 * percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote:
 * helpers to access a remote cpu's percpu data.
 *
 * => called in thread context.
 * => percpu_traverse_enter can block low-priority xcalls.
 * => typical usage would be:
 *
 *	sum = 0;
 *	percpu_traverse_enter();
 *	for (CPU_INFO_FOREACH(cii, ci)) {
 *		unsigned int *p = percpu_getptr_remote(pc, ci);
 *		sum += *p;
 *	}
 *	percpu_traverse_exit();
 */

void
percpu_traverse_enter(void)
{

	ASSERT_SLEEPABLE();
	rw_enter(&percpu_swap_lock, RW_READER);
}

void
percpu_traverse_exit(void)
{

	rw_exit(&percpu_swap_lock);
}

void *
percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci)
{

	return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)];
}

/*
 * percpu_foreach: call the specified callback function for each cpu.
 *
 * => called in thread context.
 * => caller should not rely on the cpu iteration order.
 * => the callback function should be minimal because it is executed while
 *    holding a global lock, which can block low-priority xcalls;
 *    e.g. it is illegal for a callback function to sleep for memory allocation.
 */
void
percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	percpu_traverse_enter();
	for (CPU_INFO_FOREACH(cii, ci)) {
		(*cb)(percpu_getptr_remote(pc, ci), arg, ci);
	}
	percpu_traverse_exit();
}
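
/*
 * Example (hypothetical sketch): summing the per-cpu counters from the
 * percpu_alloc example with percpu_foreach; "foo_count_sum_cb" is an
 * invented callback.  Note that the callback does not sleep.
 *
 *	static void
 *	foo_count_sum_cb(void *ptr, void *cookie, struct cpu_info *ci)
 *	{
 *		const uint64_t *cnt = ptr;
 *		uint64_t *sum = cookie;
 *
 *		*sum += *cnt;
 *	}
 *
 *	uint64_t sum = 0;
 *
 *	percpu_foreach(foo_count_percpu, foo_count_sum_cb, &sum);
 */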