/*	$NetBSD: vfs_lockf.c,v 1.69.4.1 2009/07/01 22:45:03 snj Exp $	*/

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.4 (Berkeley) 10/26/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_lockf.c,v 1.69.4.1 2009/07/01 22:45:03 snj Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/pool.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/atomic.h>
#include <sys/kauth.h>
#include <sys/uidinfo.h>

/*
 * The lockf structure is a kernel structure which contains the information
 * associated with a byte range lock.  The lockf structures are linked into
 * the vnode structure.  Locks are sorted by the starting byte of the lock for
 * efficiency.
 *
 * lf_next is used for two purposes, depending on whether the lock is
 * being held, or is in conflict with an existing lock.  If this lock
 * is held, it indicates the next lock on the same vnode.
 * For pending locks, if lock->lf_next is non-NULL, then lock->lf_block
 * must be queued on the lf_blkhd TAILQ of lock->lf_next.
 */

TAILQ_HEAD(locklist, lockf);

struct lockf {
	kcondvar_t lf_cv;	/* Signalling */
	short	lf_flags;	/* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */
	short	lf_type;	/* Lock type: F_RDLCK, F_WRLCK */
	off_t	lf_start;	/* The byte # of the start of the lock */
	off_t	lf_end;		/* The byte # of the end of the lock (-1=EOF)*/
	void	*lf_id;		/* process or file description holding lock */
	struct	lockf **lf_head; /* Back pointer to the head of lockf list */
	struct	lockf *lf_next;	/* Next lock on this vnode, or blocking lock */
	struct	locklist lf_blkhd; /* List of requests blocked on this lock */
	TAILQ_ENTRY(lockf) lf_block; /* A request waiting for a lock */
	uid_t	lf_uid;		/* User ID responsible */
};

/* Maximum length of sleep chains to traverse to try and detect deadlock. */
#define MAXDEPTH 50

static pool_cache_t lockf_cache;
static kmutex_t *lockf_lock;
static char lockstr[] = "lockf";

/*
 * This variable controls the maximum number of processes that will
 * be checked in doing deadlock detection.
 */
int maxlockdepth = MAXDEPTH;

#ifdef LOCKF_DEBUG
int lockf_debug = 0;
#endif

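/*
 * Flags for lf_findoverlap(): SELF restricts the search to locks held by
 * the requesting owner (same lf_id), OTHERS to locks held by anyone else.
 */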
#define SELF	0x1
#define OTHERS	0x2

/*
 * XXX TODO
 * Misc cleanups: "void *id" should be visible in the API as a
 * "struct proc *".
 * (This requires rototilling all VFS's which support advisory locking).
 */

/*
 * If there's a lot of lock contention on a single vnode, locking
 * schemes which allow for more parallelism would be needed.  Given how
 * infrequently byte-range locks are actually used in typical BSD
 * code, a more complex approach probably isn't worth it.
 */

/*
 * We enforce a limit on locks by uid, so that a single user cannot
 * run the kernel out of memory.  For now, the limit is pretty coarse.
 * There is no limit on root.
 *
 * Splitting a lock will always succeed, regardless of current allocations.
 * If you're slightly above the limit, we still have to permit an allocation
 * so that the unlock can succeed.  If the unlocking causes too many splits,
 * however, you're totally cut off.
 */
int maxlocksperuid = 1024;
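/*
 * With the default limit of 1024, lock requests (allowfail == 1 in
 * lf_alloc() below) start failing once a non-root uid holds more than
 * 1024 lockf structures; unlock requests (allowfail == 2) are allowed up
 * to twice that, and the spare lock used for splitting (allowfail == 0)
 * is always granted.  uid 0 is never refused.
 */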

#ifdef LOCKF_DEBUG
/*
 * Print out a lock.
 */
static void
lf_print(const char *tag, struct lockf *lock)
{

	printf("%s: lock %p for ", tag, lock);
	if (lock->lf_flags & F_POSIX)
		printf("proc %d", ((struct proc *)lock->lf_id)->p_pid);
	else
		printf("file %p", (struct file *)lock->lf_id);
	printf(" %s, start %qx, end %qx",
	    lock->lf_type == F_RDLCK ? "shared" :
	    lock->lf_type == F_WRLCK ? "exclusive" :
	    lock->lf_type == F_UNLCK ? "unlock" :
	    "unknown", lock->lf_start, lock->lf_end);
	if (TAILQ_FIRST(&lock->lf_blkhd))
		printf(" block %p\n", TAILQ_FIRST(&lock->lf_blkhd));
	else
		printf("\n");
}

static void
lf_printlist(const char *tag, struct lockf *lock)
{
	struct lockf *lf, *blk;

	printf("%s: Lock list:\n", tag);
	for (lf = *lock->lf_head; lf; lf = lf->lf_next) {
		printf("\tlock %p for ", lf);
		if (lf->lf_flags & F_POSIX)
			printf("proc %d", ((struct proc *)lf->lf_id)->p_pid);
		else
			printf("file %p", (struct file *)lf->lf_id);
		printf(", %s, start %qx, end %qx",
		    lf->lf_type == F_RDLCK ? "shared" :
		    lf->lf_type == F_WRLCK ? "exclusive" :
		    lf->lf_type == F_UNLCK ? "unlock" :
		    "unknown", lf->lf_start, lf->lf_end);
		TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
			if (blk->lf_flags & F_POSIX)
				printf("; proc %d",
				    ((struct proc *)blk->lf_id)->p_pid);
			else
				printf("; file %p", (struct file *)blk->lf_id);
			printf(", %s, start %qx, end %qx",
			    blk->lf_type == F_RDLCK ? "shared" :
			    blk->lf_type == F_WRLCK ? "exclusive" :
			    blk->lf_type == F_UNLCK ? "unlock" :
			    "unknown", blk->lf_start, blk->lf_end);
			if (TAILQ_FIRST(&blk->lf_blkhd))
				panic("lf_printlist: bad list");
		}
		printf("\n");
	}
}
#endif /* LOCKF_DEBUG */

/*
 * 3 options for allowfail.
 * 0 - always allocate.  1 - cutoff at limit.  2 - cutoff at double limit.
 */
static struct lockf *
lf_alloc(uid_t uid, int allowfail)
{
	struct uidinfo *uip;
	struct lockf *lock;
	u_long lcnt;

	uip = uid_find(uid);
	lcnt = atomic_inc_ulong_nv(&uip->ui_lockcnt);
	if (uid && allowfail && lcnt >
	    (allowfail == 1 ? maxlocksperuid : (maxlocksperuid * 2))) {
		atomic_dec_ulong(&uip->ui_lockcnt);
		return NULL;
	}

	lock = pool_cache_get(lockf_cache, PR_WAITOK);
	lock->lf_uid = uid;
	return lock;
}

static void
lf_free(struct lockf *lock)
{
	struct uidinfo *uip;

	uip = uid_find(lock->lf_uid);
	atomic_dec_ulong(&uip->ui_lockcnt);
	pool_cache_put(lockf_cache, lock);
}

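/*
 * Pool cache constructor: initialize the condition variable embedded in
 * each lockf structure when the cache first creates it.
 */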
static int
lf_ctor(void *arg, void *obj, int flag)
{
	struct lockf *lock;

	lock = obj;
	cv_init(&lock->lf_cv, lockstr);

	return 0;
}

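/*
 * Pool cache destructor: tear down the condition variable before the
 * lockf structure is released back to the system.
 */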
static void
lf_dtor(void *arg, void *obj)
{
	struct lockf *lock;

	lock = obj;
	cv_destroy(&lock->lf_cv);
}

/*
 * Walk the list of locks for an inode to
 * find an overlapping lock (if any).
 *
 * NOTE: this returns only the FIRST overlapping lock.  There
 *	 may be more than one.
 */
static int
lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
    struct lockf ***prev, struct lockf **overlap)
{
	off_t start, end;

	*overlap = lf;
	if (lf == NULL)
		return 0;
#ifdef LOCKF_DEBUG
	if (lockf_debug & 2)
		lf_print("lf_findoverlap: looking for overlap in", lock);
#endif /* LOCKF_DEBUG */
	start = lock->lf_start;
	end = lock->lf_end;
	while (lf != NULL) {
		if (((type == SELF) && lf->lf_id != lock->lf_id) ||
		    ((type == OTHERS) && lf->lf_id == lock->lf_id)) {
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
#ifdef LOCKF_DEBUG
		if (lockf_debug & 2)
			lf_print("\tchecking", lf);
#endif /* LOCKF_DEBUG */
		/*
		 * OK, check for overlap
		 *
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
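		/*
		 * A rough picture of the non-trivial cases (lf = existing
		 * lock, lock = this request; EOF to the right):
		 *
		 *   1) lf:   |=====|        2) lf:   |=========|
		 *      lock: |-----|           lock:   |-----|
		 *   3) lf:     |=====|      4) lf:   |======|
		 *      lock: |---------|       lock:     |------|
		 *   5) lf:       |======|
		 *      lock: |------|
		 */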
		if ((lf->lf_end != -1 && start > lf->lf_end) ||
		    (end != -1 && lf->lf_start > end)) {
			/* Case 0 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("no overlap\n");
#endif /* LOCKF_DEBUG */
			if ((type & SELF) && end != -1 && lf->lf_start > end)
				return 0;
			*prev = &lf->lf_next;
			*overlap = lf = lf->lf_next;
			continue;
		}
		if ((lf->lf_start == start) && (lf->lf_end == end)) {
			/* Case 1 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap == lock\n");
#endif /* LOCKF_DEBUG */
			return 1;
		}
		if ((lf->lf_start <= start) &&
		    (end != -1) &&
		    ((lf->lf_end >= end) || (lf->lf_end == -1))) {
			/* Case 2 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap contains lock\n");
#endif /* LOCKF_DEBUG */
			return 2;
		}
		if (start <= lf->lf_start &&
		    (end == -1 ||
		    (lf->lf_end != -1 && end >= lf->lf_end))) {
			/* Case 3 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("lock contains overlap\n");
#endif /* LOCKF_DEBUG */
			return 3;
		}
		if ((lf->lf_start < start) &&
		    ((lf->lf_end >= start) || (lf->lf_end == -1))) {
			/* Case 4 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap starts before lock\n");
#endif /* LOCKF_DEBUG */
			return 4;
		}
		if ((lf->lf_start > start) &&
		    (end != -1) &&
		    ((lf->lf_end > end) || (lf->lf_end == -1))) {
			/* Case 5 */
#ifdef LOCKF_DEBUG
			if (lockf_debug & 2)
				printf("overlap ends after lock\n");
#endif /* LOCKF_DEBUG */
			return 5;
		}
		panic("lf_findoverlap: default");
	}
	return 0;
}

/*
 * Split a lock and a contained region into
 * two or three locks as necessary.
 */
static void
lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **sparelock)
{
	struct lockf *splitlock;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 2) {
		lf_print("lf_split", lock1);
		lf_print("splitting from", lock2);
	}
#endif /* LOCKF_DEBUG */
	/*
	 * Check to see if splitting into only two pieces.
	 */
	if (lock1->lf_start == lock2->lf_start) {
		lock1->lf_start = lock2->lf_end + 1;
		lock2->lf_next = lock1;
		return;
	}
	if (lock1->lf_end == lock2->lf_end) {
		lock1->lf_end = lock2->lf_start - 1;
		lock2->lf_next = lock1->lf_next;
		lock1->lf_next = lock2;
		return;
	}
	/*
	 * Make a new lock consisting of the last part of
	 * the encompassing lock
	 */
	splitlock = *sparelock;
	*sparelock = NULL;
	cv_destroy(&splitlock->lf_cv);
	memcpy(splitlock, lock1, sizeof(*splitlock));
	cv_init(&splitlock->lf_cv, lockstr);

	splitlock->lf_start = lock2->lf_end + 1;
	TAILQ_INIT(&splitlock->lf_blkhd);
	lock1->lf_end = lock2->lf_start - 1;
	/*
	 * OK, now link it in
	 */
	splitlock->lf_next = lock1->lf_next;
	lock2->lf_next = splitlock;
	lock1->lf_next = lock2;
}

/*
 * Wakeup a blocklist
 */
static void
lf_wakelock(struct lockf *listhead)
{
	struct lockf *wakelock;

	while ((wakelock = TAILQ_FIRST(&listhead->lf_blkhd))) {
		KASSERT(wakelock->lf_next == listhead);
		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
		wakelock->lf_next = NULL;
#ifdef LOCKF_DEBUG
		if (lockf_debug & 2)
			lf_print("lf_wakelock: awakening", wakelock);
#endif
		cv_broadcast(&wakelock->lf_cv);
	}
}

/*
 * Remove a byte-range lock on an inode.
 *
 * Generally, find the lock (or an overlap to that lock)
 * and remove it (or shrink it), then wakeup anyone we can.
 */
static int
lf_clearlock(struct lockf *unlock, struct lockf **sparelock)
{
	struct lockf **head = unlock->lf_head;
	struct lockf *lf = *head;
	struct lockf *overlap, **prev;
	int ovcase;

	if (lf == NULL)
		return 0;
#ifdef LOCKF_DEBUG
	if (unlock->lf_type != F_UNLCK)
		panic("lf_clearlock: bad type");
	if (lockf_debug & 1)
		lf_print("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
	prev = head;
	while ((ovcase = lf_findoverlap(lf, unlock, SELF,
	    &prev, &overlap)) != 0) {
		/*
		 * Wakeup the list of locks to be retried.
		 */
		lf_wakelock(overlap);

		switch (ovcase) {

		case 1: /* overlap == lock */
			*prev = overlap->lf_next;
			lf_free(overlap);
			break;

		case 2: /* overlap contains lock: split it */
			if (overlap->lf_start == unlock->lf_start) {
				overlap->lf_start = unlock->lf_end + 1;
				break;
			}
			lf_split(overlap, unlock, sparelock);
			overlap->lf_next = unlock->lf_next;
			break;

		case 3: /* lock contains overlap */
			*prev = overlap->lf_next;
			lf = overlap->lf_next;
			lf_free(overlap);
			continue;

		case 4: /* overlap starts before lock */
			overlap->lf_end = unlock->lf_start - 1;
			prev = &overlap->lf_next;
			lf = overlap->lf_next;
			continue;

		case 5: /* overlap ends after lock */
			overlap->lf_start = unlock->lf_end + 1;
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_printlist("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
	return 0;
}

/*
 * Walk the list of locks for an inode and
 * return the first blocking lock.
 */
static struct lockf *
lf_getblock(struct lockf *lock)
{
	struct lockf **prev, *overlap, *lf = *(lock->lf_head);

	prev = lock->lf_head;
	while (lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != 0) {
		/*
		 * We've found an overlap, see if it blocks us
		 */
		if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
			return overlap;
		/*
		 * Nope, point to the next one on the list and
		 * see if it blocks us
		 */
		lf = overlap->lf_next;
	}
	return NULL;
}

/*
 * Set a byte-range lock.
 */
static int
lf_setlock(struct lockf *lock, struct lockf **sparelock,
    kmutex_t *interlock)
{
	struct lockf *block;
	struct lockf **head = lock->lf_head;
	struct lockf **prev, *overlap, *ltmp;
	int ovcase, needtolink, error;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_print("lf_setlock", lock);
#endif /* LOCKF_DEBUG */

	/*
	 * Scan lock list for this file looking for locks that would block us.
	 */
	while ((block = lf_getblock(lock)) != NULL) {
		/*
		 * Free the structure and return if nonblocking.
		 */
		if ((lock->lf_flags & F_WAIT) == 0) {
			lf_free(lock);
			return EAGAIN;
		}
		/*
		 * We are blocked. Since flock style locks cover
		 * the whole file, there is no chance for deadlock.
		 * For byte-range locks we must check for deadlock.
		 *
		 * Deadlock detection is done by looking through the
		 * wait channels to see if there are any cycles that
		 * involve us. MAXDEPTH is set just to make sure we
		 * do not go off into neverneverland.
		 */
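		/*
		 * Note that the chain walk below relies on the wait channel
		 * of an LWP blocked in lf_setlock() being the address of its
		 * lock's lf_cv, which is the first member of struct lockf,
		 * so the channel can be cast back to the blocking lock.
		 */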
		if ((lock->lf_flags & F_POSIX) &&
		    (block->lf_flags & F_POSIX)) {
			struct lwp *wlwp;
			volatile const struct lockf *waitblock;
			int i = 0;
			struct proc *p;

			p = (struct proc *)block->lf_id;
			KASSERT(p != NULL);
			while (i++ < maxlockdepth) {
				mutex_enter(p->p_lock);
				if (p->p_nlwps > 1) {
					mutex_exit(p->p_lock);
					break;
				}
				wlwp = LIST_FIRST(&p->p_lwps);
				lwp_lock(wlwp);
				if (wlwp->l_wchan == NULL ||
				    wlwp->l_wmesg != lockstr) {
					lwp_unlock(wlwp);
					mutex_exit(p->p_lock);
					break;
				}
				waitblock = wlwp->l_wchan;
				lwp_unlock(wlwp);
				mutex_exit(p->p_lock);
				/* Get the owner of the blocking lock */
				waitblock = waitblock->lf_next;
				if ((waitblock->lf_flags & F_POSIX) == 0)
					break;
				p = (struct proc *)waitblock->lf_id;
				if (p == curproc) {
					lf_free(lock);
					return EDEADLK;
				}
			}
			/*
			 * If we're still following a dependency chain
			 * after maxlockdepth iterations, assume we're in
			 * a cycle to be safe.
			 */
			if (i >= maxlockdepth) {
				lf_free(lock);
				return EDEADLK;
			}
		}
		/*
		 * For flock type locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((lock->lf_flags & F_FLOCK) &&
		    lock->lf_type == F_WRLCK) {
			lock->lf_type = F_UNLCK;
			(void) lf_clearlock(lock, NULL);
			lock->lf_type = F_WRLCK;
		}
		/*
		 * Add our lock to the blocked list and sleep until we're free.
		 * Remember who blocked us (for deadlock detection).
		 */
		lock->lf_next = block;
		TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
#ifdef LOCKF_DEBUG
		if (lockf_debug & 1) {
			lf_print("lf_setlock: blocking on", block);
			lf_printlist("lf_setlock", block);
		}
#endif /* LOCKF_DEBUG */
		error = cv_wait_sig(&lock->lf_cv, interlock);

		/*
		 * We may have been awoken by a signal (in
		 * which case we must remove ourselves from the
		 * blocked list) and/or by another process
		 * releasing a lock (in which case we have already
		 * been removed from the blocked list and our
		 * lf_next field set to NULL).
		 */
		if (lock->lf_next != NULL) {
			TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block);
			lock->lf_next = NULL;
		}
		if (error) {
			lf_free(lock);
			return error;
		}
	}
	/*
	 * No blocks!!  Add the lock.  Note that we will
	 * downgrade or upgrade any overlapping locks this
	 * process already owns.
	 *
	 * Skip over locks owned by other processes.
	 * Handle any locks that overlap and are owned by ourselves.
	 */
	prev = head;
	block = *head;
	needtolink = 1;
	for (;;) {
		ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
		if (ovcase)
			block = overlap->lf_next;
		/*
		 * Six cases:
		 *	0) no overlap
		 *	1) overlap == lock
		 *	2) overlap contains lock
		 *	3) lock contains overlap
		 *	4) overlap starts before lock
		 *	5) overlap ends after lock
		 */
		switch (ovcase) {
		case 0: /* no overlap */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			break;

		case 1: /* overlap == lock */
			/*
			 * If downgrading lock, others may be
			 * able to acquire it.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK)
				lf_wakelock(overlap);
			overlap->lf_type = lock->lf_type;
			lf_free(lock);
			lock = overlap; /* for debug output below */
			break;

		case 2: /* overlap contains lock */
			/*
			 * Check for common starting point and different types.
			 */
			if (overlap->lf_type == lock->lf_type) {
				lf_free(lock);
				lock = overlap; /* for debug output below */
				break;
			}
			if (overlap->lf_start == lock->lf_start) {
				*prev = lock;
				lock->lf_next = overlap;
				overlap->lf_start = lock->lf_end + 1;
			} else
				lf_split(overlap, lock, sparelock);
			lf_wakelock(overlap);
			break;

		case 3: /* lock contains overlap */
			/*
			 * If downgrading lock, others may be able to
			 * acquire it, otherwise take the list.
			 */
			if (lock->lf_type == F_RDLCK &&
			    overlap->lf_type == F_WRLCK) {
				lf_wakelock(overlap);
			} else {
				while ((ltmp = TAILQ_FIRST(&overlap->lf_blkhd))) {
					KASSERT(ltmp->lf_next == overlap);
					TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
					    lf_block);
					ltmp->lf_next = lock;
					TAILQ_INSERT_TAIL(&lock->lf_blkhd,
					    ltmp, lf_block);
				}
			}
			/*
			 * Add the new lock if necessary and delete the overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap->lf_next;
				prev = &lock->lf_next;
				needtolink = 0;
			} else
				*prev = overlap->lf_next;
			lf_free(overlap);
			continue;

		case 4: /* overlap starts before lock */
			/*
			 * Add lock after overlap on the list.
			 */
			lock->lf_next = overlap->lf_next;
			overlap->lf_next = lock;
			overlap->lf_end = lock->lf_start - 1;
			prev = &lock->lf_next;
			lf_wakelock(overlap);
			needtolink = 0;
			continue;

		case 5: /* overlap ends after lock */
			/*
			 * Add the new lock before overlap.
			 */
			if (needtolink) {
				*prev = lock;
				lock->lf_next = overlap;
			}
			overlap->lf_start = lock->lf_end + 1;
			lf_wakelock(overlap);
			break;
		}
		break;
	}
#ifdef LOCKF_DEBUG
	if (lockf_debug & 1) {
		lf_print("lf_setlock: got the lock", lock);
		lf_printlist("lf_setlock", lock);
	}
#endif /* LOCKF_DEBUG */
	return 0;
}

/*
 * Check whether there is a blocking lock,
 * and if so return its process identifier.
 */
static int
lf_getlock(struct lockf *lock, struct flock *fl)
{
	struct lockf *block;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_print("lf_getlock", lock);
#endif /* LOCKF_DEBUG */

	if ((block = lf_getblock(lock)) != NULL) {
		fl->l_type = block->lf_type;
		fl->l_whence = SEEK_SET;
		fl->l_start = block->lf_start;
		if (block->lf_end == -1)
			fl->l_len = 0;
		else
			fl->l_len = block->lf_end - block->lf_start + 1;
		if (block->lf_flags & F_POSIX)
			fl->l_pid = ((struct proc *)block->lf_id)->p_pid;
		else
			fl->l_pid = -1;
	} else {
		fl->l_type = F_UNLCK;
	}
	return 0;
}

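/*
 * lf_advlock() is the common back end for VOP_ADVLOCK().  A file system
 * typically forwards its advisory-lock operation here, passing the head
 * of its per-node lock list and the current file size, roughly along
 * these lines (illustrative sketch only; the exact field names depend on
 * the file system, ufs-style names shown):
 *
 *	int
 *	ufs_advlock(void *v)
 *	{
 *		struct vop_advlock_args *ap = v;
 *		struct inode *ip = VTOI(ap->a_vp);
 *
 *		return lf_advlock(ap, &ip->i_lockf, ip->i_size);
 *	}
 */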
/*
 * Do an advisory lock operation.
 */
int
lf_advlock(struct vop_advlock_args *ap, struct lockf **head, off_t size)
{
	struct lwp *l = curlwp;
	struct flock *fl = ap->a_fl;
	struct lockf *lock = NULL;
	struct lockf *sparelock;
	kmutex_t *interlock = lockf_lock;
	off_t start, end;
	int error = 0;

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		start = size + fl->l_start;
		break;

	default:
		return EINVAL;
	}
	if (start < 0)
		return EINVAL;

	/*
	 * Allocate locks before acquiring the interlock.  We need two
	 * locks in the worst case.
	 */
	switch (ap->a_op) {
	case F_SETLK:
	case F_UNLCK:
		/*
		 * XXX For F_UNLCK case, we can re-use the lock.
		 */
		if ((ap->a_flags & F_FLOCK) == 0) {
			/*
			 * Byte-range lock might need one more lock.
			 */
			sparelock = lf_alloc(kauth_cred_geteuid(l->l_cred), 0);
			if (sparelock == NULL) {
				error = ENOMEM;
				goto quit;
			}
			break;
		}
		/* FALLTHROUGH */

	case F_GETLK:
		sparelock = NULL;
		break;

	default:
		return EINVAL;
	}

	lock = lf_alloc(kauth_cred_geteuid(l->l_cred),
	    ap->a_op != F_UNLCK ? 1 : 2);
	if (lock == NULL) {
		error = ENOMEM;
		goto quit;
	}

	mutex_enter(interlock);

	/*
	 * Avoid the common case of unlocking when inode has no locks.
	 */
	if (*head == (struct lockf *)0) {
		if (ap->a_op != F_SETLK) {
			fl->l_type = F_UNLCK;
			error = 0;
			goto quit_unlock;
		}
	}

	if (fl->l_len == 0)
		end = -1;
	else
		end = start + fl->l_len - 1;
	/*
	 * Create the lockf structure.
	 */
	lock->lf_start = start;
	lock->lf_end = end;
	lock->lf_head = head;
	lock->lf_type = fl->l_type;
	lock->lf_next = (struct lockf *)0;
	TAILQ_INIT(&lock->lf_blkhd);
	lock->lf_flags = ap->a_flags;
	if (lock->lf_flags & F_POSIX) {
		KASSERT(curproc == (struct proc *)ap->a_id);
	}
	lock->lf_id = (struct proc *)ap->a_id;

	/*
	 * Do the requested operation.
	 */
	switch (ap->a_op) {

	case F_SETLK:
		error = lf_setlock(lock, &sparelock, interlock);
		lock = NULL; /* lf_setlock freed it */
		break;

	case F_UNLCK:
		error = lf_clearlock(lock, &sparelock);
		break;

	case F_GETLK:
		error = lf_getlock(lock, fl);
		break;

	default:
		break;
		/* NOTREACHED */
	}

quit_unlock:
	mutex_exit(interlock);
quit:
	if (lock)
		lf_free(lock);
	if (sparelock)
		lf_free(sparelock);

	return error;
}

/*
 * Initialize subsystem.   XXX We use a global lock.  This could be the
 * vnode interlock, but the deadlock detection code may need to inspect
 * locks belonging to other files.
 */
void
lf_init(void)
{

	lockf_cache = pool_cache_init(sizeof(struct lockf), 0, 0, 0, "lockf",
	    NULL, IPL_NONE, lf_ctor, lf_dtor, NULL);
	lockf_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
}