/*	$NetBSD: nfs_fha.c,v 1.1 2013/09/30 07:19:33 dholland Exp $	*/
/*-
 * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
/* __FBSDID("FreeBSD: head/sys/nfs/nfs_fha.c 249596 2013-04-17 22:42:43Z ken "); */
__RCSID("$NetBSD: nfs_fha.c,v 1.1 2013/09/30 07:19:33 dholland Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/sbuf.h>

#include <rpc/rpc.h>
#include <nfs/nfs_fha.h>

static MALLOC_DEFINE(M_NFS_FHA, "NFS FHA", "NFS FHA");

/*
 * XXX need to commonize definitions between old and new NFS code.  Define
 * this here so we don't include one nfsproto.h over the other.
 */
#define	NFS_PROG	100003

void
fha_init(struct fha_params *softc)
{
	char tmpstr[128];

	/*
	 * A small hash table to map filehandles to fha_hash_entry
	 * structures.
	 */
	softc->g_fha.hashtable = hashinit(256, M_NFS_FHA,
	    &softc->g_fha.hashmask);

	/*
	 * Set the default tuning parameters.
	 */
	softc->ctls.enable = FHA_DEF_ENABLE;
	softc->ctls.bin_shift = FHA_DEF_BIN_SHIFT;
	softc->ctls.max_nfsds_per_fh = FHA_DEF_MAX_NFSDS_PER_FH;
	softc->ctls.max_reqs_per_nfsd = FHA_DEF_MAX_REQS_PER_NFSD;

	/*
	 * Allow the user to override the defaults at boot time with
	 * tunables.
	 */
	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.enable",
	    softc->server_name);
	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.enable);
	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.bin_shift",
	    softc->server_name);
	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.bin_shift);
	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.max_nfsds_per_fh",
	    softc->server_name);
	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.max_nfsds_per_fh);
	snprintf(tmpstr, sizeof(tmpstr), "vfs.%s.fha.max_reqs_per_nfsd",
	    softc->server_name);
	TUNABLE_INT_FETCH(tmpstr, &softc->ctls.max_reqs_per_nfsd);
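
	/*
	 * Example (illustrative only; the actual MIB prefix depends on the
	 * server_name the caller stored in softc, e.g. "nfsd"): on FreeBSD,
	 * where this code originates, a loader.conf(5) line such as
	 *
	 *	vfs.nfsd.fha.enable=0
	 *
	 * would disable FHA before the server threads are started.
	 */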

	/*
	 * Add sysctls so the user can change the tuning parameters at
	 * runtime.
	 */
	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
	    OID_AUTO, "enable", CTLFLAG_RW,
	    &softc->ctls.enable, 0, "Enable NFS File Handle Affinity (FHA)");

	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
	    OID_AUTO, "bin_shift", CTLFLAG_RW,
	    &softc->ctls.bin_shift, 0, "For FHA reads, no two requests will "
	    "contend if they're 2^(bin_shift) bytes apart");

	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
	    OID_AUTO, "max_nfsds_per_fh", CTLFLAG_RW,
	    &softc->ctls.max_nfsds_per_fh, 0, "Maximum nfsd threads that "
	    "should be working on requests for the same file handle");

	SYSCTL_ADD_UINT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
	    OID_AUTO, "max_reqs_per_nfsd", CTLFLAG_RW,
	    &softc->ctls.max_reqs_per_nfsd, 0, "Maximum requests that a "
	    "single nfsd thread should be working on at any time");

	SYSCTL_ADD_OID(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
	    OID_AUTO, "fhe_stats", CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
	    softc->callbacks.fhe_stats_sysctl, "A", "");
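
	/*
	 * Once these OIDs exist, the same knobs can be changed at runtime
	 * via sysctl(8), and fhe_stats gives a read-only dump of the
	 * current file handle entries (see fhe_stats_sysctl() below).
	 * This assumes the caller rooted sysctl_tree under
	 * vfs.<server_name>.fha, so the OIDs match the tunable names
	 * fetched above.
	 */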

}

void
fha_uninit(struct fha_params *softc)
{
	sysctl_ctx_free(&softc->sysctl_ctx);
	hashdestroy(softc->g_fha.hashtable, M_NFS_FHA, softc->g_fha.hashmask);
}

/*
 * This just specifies that offsets should obey affinity when within
 * the same 2^bin_shift byte chunk of the file (1 Mbyte when bin_shift
 * is 20); the offset is extracted for both reads and writes.
 */
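/*
 * Worked example (illustrative numbers only): with bin_shift set to 20,
 * two reads of the same file at offsets 0x40000 and 0xc0000 are 0x80000
 * bytes apart, which is less than 1 << 20, so the second read prefers
 * the nfsd thread already handling the first.  A read at offset
 * 0x140000 is a full 1 << 20 bytes away from 0x40000 and so may be
 * placed on a different thread.
 */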
static void
fha_extract_info(struct svc_req *req, struct fha_info *i,
    struct fha_callbacks *cb)
{
	struct mbuf *md;
	fhandle_t fh;
	caddr_t dpos;
	static u_int64_t random_fh = 0;
	int error;
	int v3 = (req->rq_vers == 3);
	rpcproc_t procnum;

	/*
	 * We start off with a random fh.  If we get a reasonable
	 * procnum, we set the fh.  If there's a concept of offset
	 * that we're interested in, we set that.
	 */
	i->fh = ++random_fh;
	i->offset = 0;
	i->locktype = LK_EXCLUSIVE;

	/*
	 * Extract the procnum and convert to v3 form if necessary,
	 * taking care to deal with out-of-range procnums.  Caller will
	 * ensure that rq_vers is either 2 or 3.
	 */
	procnum = req->rq_proc;
	if (!v3) {
		rpcproc_t tmp_procnum;

		tmp_procnum = cb->get_procnum(procnum);
		if (tmp_procnum == -1)
			goto out;
		procnum = tmp_procnum;
	}

	/*
	 * We do affinity for most operations.  However, we divide a realm
	 * of affinity by file offset so as to allow for concurrent random
	 * access.  We only do this for reads today, but this may change
	 * when IFS supports efficient concurrent writes.
	 */
	if (cb->no_offset(procnum))
		goto out;

	error = cb->realign(&req->rq_args, M_NOWAIT);
	if (error)
		goto out;
	md = req->rq_args;
	dpos = mtod(md, caddr_t);

	/* Grab the filehandle. */
	error = cb->get_fh(&fh, v3, &md, &dpos);
	if (error)
		goto out;

	bcopy(fh.fh_fid.fid_data, &i->fh, sizeof(i->fh));

	/* Content ourselves with zero offset for all but reads and writes. */
	if (cb->is_read(procnum) || cb->is_write(procnum))
		cb->get_offset(&md, &dpos, v3, i);

out:
	cb->set_locktype(procnum, i);
}

static struct fha_hash_entry *
fha_hash_entry_new(u_int64_t fh)
{
	struct fha_hash_entry *e;

	e = malloc(sizeof(*e), M_NFS_FHA, M_WAITOK);
	e->fh = fh;
	e->num_rw = 0;
	e->num_exclusive = 0;
	e->num_threads = 0;
	LIST_INIT(&e->threads);

	return (e);
}

static void
fha_hash_entry_destroy(struct fha_hash_entry *e)
{

	if (e->num_rw + e->num_exclusive)
		panic("nonempty fhe");
	free(e, M_NFS_FHA);
}

static void
fha_hash_entry_remove(struct fha_hash_entry *e)
{

	LIST_REMOVE(e, link);
	fha_hash_entry_destroy(e);
}

static struct fha_hash_entry *
fha_hash_entry_lookup(struct fha_params *softc, u_int64_t fh)
{
	SVCPOOL *pool;
	struct fha_hash_entry *fhe, *new_fhe;

	pool = *softc->pool;

	LIST_FOREACH(fhe, &softc->g_fha.hashtable[fh % softc->g_fha.hashmask],
	    link)
		if (fhe->fh == fh)
			break;

	if (!fhe) {
		/* Allocate a new entry. */
		mtx_unlock(&pool->sp_lock);
		new_fhe = fha_hash_entry_new(fh);
		mtx_lock(&pool->sp_lock);

		/* Double-check to make sure we still need the new entry. */
		LIST_FOREACH(fhe,
		    &softc->g_fha.hashtable[fh % softc->g_fha.hashmask], link)
			if (fhe->fh == fh)
				break;
		if (!fhe) {
			fhe = new_fhe;
			LIST_INSERT_HEAD(
			    &softc->g_fha.hashtable[fh % softc->g_fha.hashmask],
			    fhe, link);
		} else
			fha_hash_entry_destroy(new_fhe);
	}

	return (fhe);
}

static void
fha_hash_entry_add_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
{

	LIST_INSERT_HEAD(&fhe->threads, thread, st_alink);
	fhe->num_threads++;
}

static void
fha_hash_entry_remove_thread(struct fha_hash_entry *fhe, SVCTHREAD *thread)
{

	LIST_REMOVE(thread, st_alink);
	fhe->num_threads--;
}

/*
 * Account for an ongoing operation associated with this file.
 */
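/*
 * Each +1 accounted here by fha_assign() is matched by a -1 from
 * fha_nd_complete() when the operation finishes; once both counts
 * drain to zero the entry can be freed (see fha_nd_complete()).
 */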
static void
fha_hash_entry_add_op(struct fha_hash_entry *fhe, int locktype, int count)
{

	if (LK_EXCLUSIVE == locktype)
		fhe->num_exclusive += count;
	else
		fhe->num_rw += count;
}

static SVCTHREAD *
get_idle_thread(SVCPOOL *pool)
{
	SVCTHREAD *st;

	LIST_FOREACH(st, &pool->sp_idlethreads, st_ilink) {
		if (st->st_xprt == NULL && STAILQ_EMPTY(&st->st_reqs))
			return (st);
	}
	return (NULL);
}

/*
 * Get the service thread currently associated with the fhe that is
 * appropriate to handle this operation.
 */
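/*
 * In outline, the selection policy implemented below is:
 *
 * 1. If the file has exclusive (write) operations in flight, use the
 *    first thread already bound to it.
 * 2. Otherwise prefer a bound thread whose current request offset is
 *    within 2^bin_shift bytes of this one, provided it is under the
 *    max_reqs_per_nfsd load limit (0 meaning no limit).
 * 3. Failing that, if fewer than max_nfsds_per_fh threads are bound
 *    (again, 0 meaning no limit), bind a new one: this_thread if it has
 *    no queued requests, else an idle pool thread, else this_thread
 *    anyway.
 * 4. Otherwise fall back to the least loaded thread already bound to
 *    the file.
 */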
SVCTHREAD *
fha_hash_entry_choose_thread(struct fha_params *softc,
    struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread);

SVCTHREAD *
fha_hash_entry_choose_thread(struct fha_params *softc,
    struct fha_hash_entry *fhe, struct fha_info *i, SVCTHREAD *this_thread)
{
	SVCTHREAD *thread, *min_thread = NULL;
	SVCPOOL *pool;
	int req_count, min_count = 0;
	off_t offset1, offset2;

	pool = *softc->pool;

	LIST_FOREACH(thread, &fhe->threads, st_alink) {
		req_count = thread->st_reqcount;

		/* If there are any writes in progress, use the first thread. */
		if (fhe->num_exclusive) {
#if 0
			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
			    "fha: %p(%d)w", thread, req_count);
#endif
			return (thread);
		}

		/*
		 * Check for read locality, making sure that we won't
		 * exceed our per-thread load limit in the process.
		 */
		offset1 = i->offset;
		offset2 = STAILQ_FIRST(&thread->st_reqs)->rq_p3;

		if (((offset1 >= offset2)
		  && ((offset1 - offset2) < (1 << softc->ctls.bin_shift)))
		 || ((offset2 > offset1)
		  && ((offset2 - offset1) < (1 << softc->ctls.bin_shift)))) {
			if ((softc->ctls.max_reqs_per_nfsd == 0) ||
			    (req_count < softc->ctls.max_reqs_per_nfsd)) {
#if 0
				ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
				    "fha: %p(%d)r", thread, req_count);
#endif
				return (thread);
			}
		}

		/*
		 * We don't have a locality match, so skip this thread,
		 * but keep track of the most attractive thread in case
		 * we need to come back to it later.
		 */
#if 0
		ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
		    "fha: %p(%d)s off1 %llu off2 %llu", thread,
		    req_count, offset1, offset2);
#endif
		if ((min_thread == NULL) || (req_count < min_count)) {
			min_count = req_count;
			min_thread = thread;
		}
	}

	/*
	 * We didn't find a good match yet.  See if we can add
	 * a new thread to this file handle entry's thread list.
	 */
	if ((softc->ctls.max_nfsds_per_fh == 0) ||
	    (fhe->num_threads < softc->ctls.max_nfsds_per_fh)) {
		/*
		 * We can add a new thread, so try for an idle thread
		 * first, and fall back to this_thread if none are idle.
		 */
		if (STAILQ_EMPTY(&this_thread->st_reqs)) {
			thread = this_thread;
#if 0
			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
			    "fha: %p(%d)t", thread, thread->st_reqcount);
#endif
		} else if ((thread = get_idle_thread(pool))) {
#if 0
			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
			    "fha: %p(%d)i", thread, thread->st_reqcount);
#endif
		} else {
			thread = this_thread;
#if 0
			ITRACE_CURPROC(ITRACE_NFS, ITRACE_INFO,
			    "fha: %p(%d)b", thread, thread->st_reqcount);
#endif
		}
		fha_hash_entry_add_thread(fhe, thread);
	} else {
		/*
		 * We don't want to use any more threads for this file, so
		 * go back to the most attractive nfsd we're already using.
		 */
		thread = min_thread;
	}

	return (thread);
}

/*
 * After getting a request, try to assign it to some thread.  Usually we
 * handle it ourselves.
 */
SVCTHREAD *
fha_assign(SVCTHREAD *this_thread, struct svc_req *req,
    struct fha_params *softc)
{
	SVCPOOL *pool;
	SVCTHREAD *thread;
	struct fha_info i;
	struct fha_hash_entry *fhe;
	struct fha_callbacks *cb;

	cb = &softc->callbacks;

	/* Check to see whether we're enabled. */
	if (softc->ctls.enable == 0)
		return (this_thread);

	/*
	 * Only do placement if this is an NFS request.
	 */
	if (req->rq_prog != NFS_PROG)
		return (this_thread);

	if (req->rq_vers != 2 && req->rq_vers != 3)
		return (this_thread);

	pool = req->rq_xprt->xp_pool;
	fha_extract_info(req, &i, cb);

	/*
	 * We save the offset associated with this request for later
	 * nfsd matching.
	 */
	fhe = fha_hash_entry_lookup(softc, i.fh);
	req->rq_p1 = fhe;
	req->rq_p2 = i.locktype;
	req->rq_p3 = i.offset;

	/*
	 * Choose a thread, taking into consideration locality, thread load,
	 * and the number of threads already working on this file.
	 */
	thread = fha_hash_entry_choose_thread(softc, fhe, &i, this_thread);
	KASSERT(thread, ("fha_assign: NULL thread!"));
	fha_hash_entry_add_op(fhe, i.locktype, 1);

	return (thread);
}

/*
 * Called when we're done with an operation.  The request has already
 * been de-queued.
 */
void
fha_nd_complete(SVCTHREAD *thread, struct svc_req *req)
{
	struct fha_hash_entry *fhe = req->rq_p1;

	/*
	 * This may be called for reqs that didn't go through
	 * fha_assign (e.g. extra NULL ops used for RPCSEC_GSS).
	 */
	if (!fhe)
		return;

	fha_hash_entry_add_op(fhe, req->rq_p2, -1);

	if (thread->st_reqcount == 0) {
		fha_hash_entry_remove_thread(fhe, thread);
		if (0 == fhe->num_rw + fhe->num_exclusive)
			fha_hash_entry_remove(fhe);
	}
}

int
fhe_stats_sysctl(SYSCTL_HANDLER_ARGS, struct fha_params *softc)
{
	int error, count, i;
	struct sbuf sb;
	struct fha_hash_entry *fhe;
	bool_t first = TRUE;
	SVCTHREAD *thread;
	SVCPOOL *pool;

	sbuf_new(&sb, NULL, 4096, SBUF_FIXEDLEN);

	pool = NULL;

	if (!*softc->pool) {
		sbuf_printf(&sb, "NFSD not running\n");
		goto out;
	}
	pool = *softc->pool;

	mtx_lock(&pool->sp_lock);
	count = 0;
	for (i = 0; i <= softc->g_fha.hashmask; i++)
		if (!LIST_EMPTY(&softc->g_fha.hashtable[i]))
			count++;

	if (count == 0) {
		sbuf_printf(&sb, "No file handle entries.\n");
		goto out;
	}

	for (i = 0; i <= softc->g_fha.hashmask; i++) {
		LIST_FOREACH(fhe, &softc->g_fha.hashtable[i], link) {
			sbuf_printf(&sb, "%sfhe %p: {\n", first ? "" : ", ", fhe);

			sbuf_printf(&sb, " fh: %ju\n", (uintmax_t) fhe->fh);
			sbuf_printf(&sb, " num_rw: %d\n", fhe->num_rw);
			sbuf_printf(&sb, " num_exclusive: %d\n", fhe->num_exclusive);
			sbuf_printf(&sb, " num_threads: %d\n", fhe->num_threads);

			LIST_FOREACH(thread, &fhe->threads, st_alink) {
				sbuf_printf(&sb, " thread %p offset %ju "
				    "(count %d)\n", thread,
				    STAILQ_FIRST(&thread->st_reqs)->rq_p3,
				    thread->st_reqcount);
			}

			sbuf_printf(&sb, "}");
			first = FALSE;

			/* Limit the output. */
			if (++count > 128) {
				sbuf_printf(&sb, "...");
				break;
			}
		}
	}

out:
	if (pool)
		mtx_unlock(&pool->sp_lock);
	sbuf_trim(&sb);
	sbuf_finish(&sb);
	error = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
	sbuf_delete(&sb);
	return (error);
}