1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 
25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28 /* Portions Copyright 2007 Jeremy Teo */ 29 /* Portions Copyright 2010 Robert Milkowski */ 30 31 #include <sys/types.h> 32 #include <sys/param.h> 33 #include <sys/time.h> 34 #include <sys/systm.h> 35 #include <sys/sysmacros.h> 36 #include <sys/resource.h> 37 #include <sys/vfs.h> 38 #include <sys/vm.h> 39 #include <sys/vnode.h> 40 #include <sys/file.h> 41 #include <sys/stat.h> 42 #include <sys/kmem.h> 43 #include <sys/taskq.h> 44 #include <sys/uio.h> 45 #include <sys/atomic.h> 46 #include <sys/namei.h> 47 #include <sys/mman.h> 48 #include <sys/cmn_err.h> 49 #include <sys/errno.h> 50 #include <sys/unistd.h> 51 #include <sys/zfs_dir.h> 52 #include <sys/zfs_ioctl.h> 53 #include <sys/fs/zfs.h> 54 #include <sys/dmu.h> 55 #include <sys/dmu_objset.h> 56 #include <sys/spa.h> 57 #include <sys/txg.h> 58 #include <sys/dbuf.h> 59 #include <sys/zap.h> 60 #include <sys/sa.h> 61 #include <sys/dirent.h> 62 #include <sys/policy.h> 63 #include <sys/sunddi.h> 64 #include <sys/filio.h> 65 #include <sys/sid.h> 66 #include <sys/zfs_ctldir.h> 67 #include <sys/zfs_fuid.h> 68 #include <sys/zfs_sa.h> 69 #include <sys/dnlc.h> 70 #include <sys/zfs_rlock.h> 71 #include <sys/buf.h> 72 #include <sys/sched.h> 73 #include <sys/acl.h> 74 #include <sys/extdirent.h> 75 76 #ifdef __FreeBSD__ 77 #include <sys/kidmap.h> 78 #include <sys/bio.h> 79 #include <vm/vm_param.h> 80 #endif 81 82 #ifdef __NetBSD__ 83 #include <dev/mm.h> 84 #include <miscfs/fifofs/fifo.h> 85 #include <miscfs/genfs/genfs.h> 86 #include <miscfs/genfs/genfs_node.h> 87 #include <uvm/uvm_extern.h> 88 #include <sys/fstrans.h> 89 #include <sys/malloc.h> 90 91 uint_t zfs_putpage_key; 92 #endif 93 94 /* 95 * Programming rules. 96 * 97 * Each vnode op performs some logical unit of work. 
To do this, the ZPL must 98 * properly lock its in-core state, create a DMU transaction, do the work, 99 * record this work in the intent log (ZIL), commit the DMU transaction, 100 * and wait for the intent log to commit if it is a synchronous operation. 101 * Moreover, the vnode ops must work in both normal and log replay context. 102 * The ordering of events is important to avoid deadlocks and references 103 * to freed memory. The example below illustrates the following Big Rules: 104 * 105 * (1) A check must be made in each zfs thread for a mounted file system. 106 * This is done avoiding races using ZFS_ENTER(zfsvfs). 107 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes 108 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros 109 * can return EIO from the calling function. 110 * 111 * (2) VN_RELE() should always be the last thing except for zil_commit() 112 * (if necessary) and ZFS_EXIT(). This is for 3 reasons: 113 * First, if it's the last reference, the vnode/znode 114 * can be freed, so the zp may point to freed memory. Second, the last 115 * reference will call zfs_zinactive(), which may induce a lot of work -- 116 * pushing cached pages (which acquires range locks) and syncing out 117 * cached atime changes. Third, zfs_zinactive() may require a new tx, 118 * which could deadlock the system if you were already holding one. 119 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). 120 * 121 * (3) All range locks must be grabbed before calling dmu_tx_assign(), 122 * as they can span dmu_tx_assign() calls. 123 * 124 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to 125 * dmu_tx_assign(). This is critical because we don't want to block 126 * while holding locks. 127 * 128 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. 
This 129 * reduces lock contention and CPU usage when we must wait (note that if 130 * throughput is constrained by the storage, nearly every transaction 131 * must wait). 132 * 133 * Note, in particular, that if a lock is sometimes acquired before 134 * the tx assigns, and sometimes after (e.g. z_lock), then failing 135 * to use a non-blocking assign can deadlock the system. The scenario: 136 * 137 * Thread A has grabbed a lock before calling dmu_tx_assign(). 138 * Thread B is in an already-assigned tx, and blocks for this lock. 139 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() 140 * forever, because the previous txg can't quiesce until B's tx commits. 141 * 142 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, 143 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent 144 * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, 145 * to indicate that this operation has already called dmu_tx_wait(). 146 * This will ensure that we don't retry forever, waiting a short bit 147 * each time. 148 * 149 * (5) If the operation succeeded, generate the intent log entry for it 150 * before dropping locks. This ensures that the ordering of events 151 * in the intent log matches the order in which they actually occurred. 152 * During ZIL replay the zfs_log_* functions will update the sequence 153 * number to indicate the zil transaction has replayed. 154 * 155 * (6) At the end of each vnode op, the DMU tx must always commit, 156 * regardless of whether there were any errors. 157 * 158 * (7) After dropping all locks, invoke zil_commit(zilog, foid) 159 * to ensure that synchronous semantics are provided when necessary. 160 * 161 * In general, this is how things should be ordered in each vnode op: 162 * 163 * ZFS_ENTER(zfsvfs); // exit if unmounted 164 * top: 165 * zfs_dirent_lookup(&dl, ...) 
// lock directory entry (may VN_HOLD()) 166 * rw_enter(...); // grab any other locks you need 167 * tx = dmu_tx_create(...); // get DMU tx 168 * dmu_tx_hold_*(); // hold each object you might modify 169 * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); 170 * if (error) { 171 * rw_exit(...); // drop locks 172 * zfs_dirent_unlock(dl); // unlock directory entry 173 * VN_RELE(...); // release held vnodes 174 * if (error == ERESTART) { 175 * waited = B_TRUE; 176 * dmu_tx_wait(tx); 177 * dmu_tx_abort(tx); 178 * goto top; 179 * } 180 * dmu_tx_abort(tx); // abort DMU tx 181 * ZFS_EXIT(zfsvfs); // finished in zfs 182 * return (error); // really out of space 183 * } 184 * error = do_real_work(); // do whatever this VOP does 185 * if (error == 0) 186 * zfs_log_*(...); // on success, make ZIL entry 187 * dmu_tx_commit(tx); // commit DMU tx -- error or not 188 * rw_exit(...); // drop locks 189 * zfs_dirent_unlock(dl); // unlock directory entry 190 * VN_RELE(...); // release held vnodes 191 * zil_commit(zilog, foid); // synchronous when necessary 192 * ZFS_EXIT(zfsvfs); // finished in zfs 193 * return (error); // done, report error 194 */ 195 196 /* ARGSUSED */ 197 static int 198 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 199 { 200 znode_t *zp = VTOZ(*vpp); 201 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 202 203 ZFS_ENTER(zfsvfs); 204 ZFS_VERIFY_ZP(zp); 205 206 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && 207 ((flag & FAPPEND) == 0)) { 208 ZFS_EXIT(zfsvfs); 209 return (SET_ERROR(EPERM)); 210 } 211 212 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 213 ZTOV(zp)->v_type == VREG && 214 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { 215 if (fs_vscan(*vpp, cr, 0) != 0) { 216 ZFS_EXIT(zfsvfs); 217 return (SET_ERROR(EACCES)); 218 } 219 } 220 221 /* 222 * Keep a count of the synchronous opens in the znode. On first 223 * synchronous open we must convert all previous async transactions 224 * into sync to keep correct ordering. 
225 */ 226 if (flag & (FSYNC | FDSYNC)) { 227 if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1) 228 zil_async_to_sync(zfsvfs->z_log, zp->z_id); 229 } 230 231 ZFS_EXIT(zfsvfs); 232 return (0); 233 } 234 235 /* ARGSUSED */ 236 static int 237 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 238 caller_context_t *ct) 239 { 240 znode_t *zp = VTOZ(vp); 241 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 242 243 /* 244 * Clean up any locks held by this process on the vp. 245 */ 246 cleanlocks(vp, ddi_get_pid(), 0); 247 cleanshares(vp, ddi_get_pid()); 248 249 ZFS_ENTER(zfsvfs); 250 ZFS_VERIFY_ZP(zp); 251 252 /* Decrement the synchronous opens in the znode */ 253 if ((flag & (FSYNC | FDSYNC)) && (count == 1)) 254 atomic_dec_32(&zp->z_sync_cnt); 255 256 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && 257 ZTOV(zp)->v_type == VREG && 258 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) 259 VERIFY(fs_vscan(vp, cr, 1) == 0); 260 261 ZFS_EXIT(zfsvfs); 262 return (0); 263 } 264 265 /* 266 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and 267 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. 268 */ 269 static int 270 zfs_holey(vnode_t *vp, u_long cmd, offset_t *off) 271 { 272 znode_t *zp = VTOZ(vp); 273 uint64_t noff = (uint64_t)*off; /* new offset */ 274 uint64_t file_sz; 275 int error; 276 boolean_t hole; 277 278 file_sz = zp->z_size; 279 if (noff >= file_sz) { 280 return (SET_ERROR(ENXIO)); 281 } 282 283 if (cmd == _FIO_SEEK_HOLE) 284 hole = B_TRUE; 285 else 286 hole = B_FALSE; 287 288 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); 289 290 if (error == ESRCH) 291 return (SET_ERROR(ENXIO)); 292 293 /* 294 * We could find a hole that begins after the logical end-of-file, 295 * because dmu_offset_next() only works on whole blocks. If the 296 * EOF falls mid-block, then indicate that the "virtual hole" 297 * at the end of the file begins at the logical EOF, rather than 298 * at the end of the last block. 
299 */ 300 if (noff > file_sz) { 301 ASSERT(hole); 302 noff = file_sz; 303 } 304 305 if (noff < *off) 306 return (error); 307 *off = noff; 308 return (error); 309 } 310 311 /* ARGSUSED */ 312 static int 313 zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, 314 int *rvalp, caller_context_t *ct) 315 { 316 offset_t off; 317 offset_t ndata; 318 dmu_object_info_t doi; 319 int error; 320 zfsvfs_t *zfsvfs; 321 znode_t *zp; 322 323 switch (com) { 324 case _FIOFFS: 325 { 326 return (0); 327 328 /* 329 * The following two ioctls are used by bfu. Faking out, 330 * necessary to avoid bfu errors. 331 */ 332 } 333 case _FIOGDIO: 334 case _FIOSDIO: 335 { 336 return (0); 337 } 338 339 case _FIO_SEEK_DATA: 340 case _FIO_SEEK_HOLE: 341 { 342 #ifdef illumos 343 if (ddi_copyin((void *)data, &off, sizeof (off), flag)) 344 return (SET_ERROR(EFAULT)); 345 #else 346 off = *(offset_t *)data; 347 #endif 348 zp = VTOZ(vp); 349 zfsvfs = zp->z_zfsvfs; 350 ZFS_ENTER(zfsvfs); 351 ZFS_VERIFY_ZP(zp); 352 353 /* offset parameter is in/out */ 354 error = zfs_holey(vp, com, &off); 355 ZFS_EXIT(zfsvfs); 356 if (error) 357 return (error); 358 #ifdef illumos 359 if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) 360 return (SET_ERROR(EFAULT)); 361 #else 362 *(offset_t *)data = off; 363 #endif 364 return (0); 365 } 366 #ifdef illumos 367 case _FIO_COUNT_FILLED: 368 { 369 /* 370 * _FIO_COUNT_FILLED adds a new ioctl command which 371 * exposes the number of filled blocks in a 372 * ZFS object. 373 */ 374 zp = VTOZ(vp); 375 zfsvfs = zp->z_zfsvfs; 376 ZFS_ENTER(zfsvfs); 377 ZFS_VERIFY_ZP(zp); 378 379 /* 380 * Wait for all dirty blocks for this object 381 * to get synced out to disk, and the DMU info 382 * updated. 383 */ 384 error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); 385 if (error) { 386 ZFS_EXIT(zfsvfs); 387 return (error); 388 } 389 390 /* 391 * Retrieve fill count from DMU object. 
392 */ 393 error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); 394 if (error) { 395 ZFS_EXIT(zfsvfs); 396 return (error); 397 } 398 399 ndata = doi.doi_fill_count; 400 401 ZFS_EXIT(zfsvfs); 402 if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) 403 return (SET_ERROR(EFAULT)); 404 return (0); 405 } 406 #endif 407 } 408 return (SET_ERROR(ENOTTY)); 409 } 410 411 #ifdef __FreeBSD__ 412 static vm_page_t 413 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) 414 { 415 vm_object_t obj; 416 vm_page_t pp; 417 int64_t end; 418 419 /* 420 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE 421 * aligned boundaries, if the range is not aligned. As a result a 422 * DEV_BSIZE subrange with partially dirty data may get marked as clean. 423 * It may happen that all DEV_BSIZE subranges are marked clean and thus 424 * the whole page would be considred clean despite have some dirty data. 425 * For this reason we should shrink the range to DEV_BSIZE aligned 426 * boundaries before calling vm_page_clear_dirty. 427 */ 428 end = rounddown2(off + nbytes, DEV_BSIZE); 429 off = roundup2(off, DEV_BSIZE); 430 nbytes = end - off; 431 432 obj = vp->v_object; 433 zfs_vmobject_assert_wlocked(obj); 434 435 for (;;) { 436 if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && 437 pp->valid) { 438 if (vm_page_xbusied(pp)) { 439 /* 440 * Reference the page before unlocking and 441 * sleeping so that the page daemon is less 442 * likely to reclaim it. 
443 */ 444 vm_page_reference(pp); 445 vm_page_lock(pp); 446 zfs_vmobject_wunlock(obj); 447 vm_page_busy_sleep(pp, "zfsmwb", true); 448 zfs_vmobject_wlock(obj); 449 continue; 450 } 451 vm_page_sbusy(pp); 452 } else if (pp != NULL) { 453 ASSERT(!pp->valid); 454 pp = NULL; 455 } 456 457 if (pp != NULL) { 458 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); 459 vm_object_pip_add(obj, 1); 460 pmap_remove_write(pp); 461 if (nbytes != 0) 462 vm_page_clear_dirty(pp, off, nbytes); 463 } 464 break; 465 } 466 return (pp); 467 } 468 469 static void 470 page_unbusy(vm_page_t pp) 471 { 472 473 vm_page_sunbusy(pp); 474 vm_object_pip_subtract(pp->object, 1); 475 } 476 477 static vm_page_t 478 page_hold(vnode_t *vp, int64_t start) 479 { 480 vm_object_t obj; 481 vm_page_t pp; 482 483 obj = vp->v_object; 484 zfs_vmobject_assert_wlocked(obj); 485 486 for (;;) { 487 if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && 488 pp->valid) { 489 if (vm_page_xbusied(pp)) { 490 /* 491 * Reference the page before unlocking and 492 * sleeping so that the page daemon is less 493 * likely to reclaim it. 494 */ 495 vm_page_reference(pp); 496 vm_page_lock(pp); 497 zfs_vmobject_wunlock(obj); 498 vm_page_busy_sleep(pp, "zfsmwb", true); 499 zfs_vmobject_wlock(obj); 500 continue; 501 } 502 503 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); 504 vm_page_lock(pp); 505 vm_page_hold(pp); 506 vm_page_unlock(pp); 507 508 } else 509 pp = NULL; 510 break; 511 } 512 return (pp); 513 } 514 515 static void 516 page_unhold(vm_page_t pp) 517 { 518 519 vm_page_lock(pp); 520 vm_page_unhold(pp); 521 vm_page_unlock(pp); 522 } 523 524 /* 525 * When a file is memory mapped, we must keep the IO data synchronized 526 * between the DMU cache and the memory mapped pages. What this means: 527 * 528 * On Write: If we find a memory mapped page, we write to *both* 529 * the page and the dmu buffer. 
530 */ 531 static void 532 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, 533 int segflg, dmu_tx_t *tx) 534 { 535 vm_object_t obj; 536 struct sf_buf *sf; 537 caddr_t va; 538 int off; 539 540 ASSERT(segflg != UIO_NOCOPY); 541 ASSERT(vp->v_mount != NULL); 542 obj = vp->v_object; 543 ASSERT(obj != NULL); 544 545 off = start & PAGEOFFSET; 546 zfs_vmobject_wlock(obj); 547 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 548 vm_page_t pp; 549 int nbytes = imin(PAGESIZE - off, len); 550 551 if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { 552 zfs_vmobject_wunlock(obj); 553 554 va = zfs_map_page(pp, &sf); 555 (void) dmu_read(os, oid, start+off, nbytes, 556 va+off, DMU_READ_PREFETCH);; 557 zfs_unmap_page(sf); 558 559 zfs_vmobject_wlock(obj); 560 page_unbusy(pp); 561 } 562 len -= nbytes; 563 off = 0; 564 } 565 vm_object_pip_wakeupn(obj, 0); 566 zfs_vmobject_wunlock(obj); 567 } 568 569 /* 570 * Read with UIO_NOCOPY flag means that sendfile(2) requests 571 * ZFS to populate a range of page cache pages with data. 572 * 573 * NOTE: this function could be optimized to pre-allocate 574 * all pages in advance, drain exclusive busy on all of them, 575 * map them into contiguous KVA region and populate them 576 * in one single dmu_read() call. 
577 */ 578 static int 579 mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) 580 { 581 znode_t *zp = VTOZ(vp); 582 objset_t *os = zp->z_zfsvfs->z_os; 583 struct sf_buf *sf; 584 vm_object_t obj; 585 vm_page_t pp; 586 int64_t start; 587 caddr_t va; 588 int len = nbytes; 589 int off; 590 int error = 0; 591 592 ASSERT(uio->uio_segflg == UIO_NOCOPY); 593 ASSERT(vp->v_mount != NULL); 594 obj = vp->v_object; 595 ASSERT(obj != NULL); 596 ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); 597 598 zfs_vmobject_wlock(obj); 599 for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { 600 int bytes = MIN(PAGESIZE, len); 601 602 pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | 603 VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); 604 if (pp->valid == 0) { 605 zfs_vmobject_wunlock(obj); 606 va = zfs_map_page(pp, &sf); 607 error = dmu_read(os, zp->z_id, start, bytes, va, 608 DMU_READ_PREFETCH); 609 if (bytes != PAGESIZE && error == 0) 610 bzero(va + bytes, PAGESIZE - bytes); 611 zfs_unmap_page(sf); 612 zfs_vmobject_wlock(obj); 613 vm_page_sunbusy(pp); 614 vm_page_lock(pp); 615 if (error) { 616 if (pp->wire_count == 0 && pp->valid == 0 && 617 !vm_page_busied(pp)) 618 vm_page_free(pp); 619 } else { 620 pp->valid = VM_PAGE_BITS_ALL; 621 vm_page_activate(pp); 622 } 623 vm_page_unlock(pp); 624 } else { 625 ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); 626 vm_page_sunbusy(pp); 627 } 628 if (error) 629 break; 630 uio->uio_resid -= bytes; 631 uio->uio_offset += bytes; 632 len -= bytes; 633 } 634 zfs_vmobject_wunlock(obj); 635 return (error); 636 } 637 638 /* 639 * When a file is memory mapped, we must keep the IO data synchronized 640 * between the DMU cache and the memory mapped pages. What this means: 641 * 642 * On Read: We "read" preferentially from memory mapped pages, 643 * else we default from the dmu buffer. 644 * 645 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when 646 * the file is memory mapped. 
647 */ 648 static int 649 mappedread(vnode_t *vp, int nbytes, uio_t *uio) 650 { 651 znode_t *zp = VTOZ(vp); 652 vm_object_t obj; 653 int64_t start; 654 caddr_t va; 655 int len = nbytes; 656 int off; 657 int error = 0; 658 659 ASSERT(vp->v_mount != NULL); 660 obj = vp->v_object; 661 ASSERT(obj != NULL); 662 663 start = uio->uio_loffset; 664 off = start & PAGEOFFSET; 665 zfs_vmobject_wlock(obj); 666 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 667 vm_page_t pp; 668 uint64_t bytes = MIN(PAGESIZE - off, len); 669 670 if (pp = page_hold(vp, start)) { 671 struct sf_buf *sf; 672 caddr_t va; 673 674 zfs_vmobject_wunlock(obj); 675 va = zfs_map_page(pp, &sf); 676 #ifdef illumos 677 error = uiomove(va + off, bytes, UIO_READ, uio); 678 #else 679 error = vn_io_fault_uiomove(va + off, bytes, uio); 680 #endif 681 zfs_unmap_page(sf); 682 zfs_vmobject_wlock(obj); 683 page_unhold(pp); 684 } else { 685 zfs_vmobject_wunlock(obj); 686 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 687 uio, bytes); 688 zfs_vmobject_wlock(obj); 689 } 690 len -= bytes; 691 off = 0; 692 if (error) 693 break; 694 } 695 zfs_vmobject_wunlock(obj); 696 return (error); 697 } 698 #endif /* __FreeBSD__ */ 699 700 #ifdef __NetBSD__ 701 702 caddr_t 703 zfs_map_page(page_t *pp, enum seg_rw rw) 704 { 705 vaddr_t va; 706 int flags; 707 708 #ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS 709 if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va)) 710 return (caddr_t)va; 711 #endif 712 713 flags = UVMPAGER_MAPIN_WAITOK | 714 (rw == S_READ ? 
UVMPAGER_MAPIN_WRITE : UVMPAGER_MAPIN_READ); 715 va = uvm_pagermapin(&pp, 1, flags); 716 return (caddr_t)va; 717 } 718 719 void 720 zfs_unmap_page(page_t *pp, caddr_t addr) 721 { 722 723 #ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS 724 vaddr_t va; 725 726 if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va)) 727 return; 728 #endif 729 uvm_pagermapout((vaddr_t)addr, 1); 730 } 731 732 static int 733 mappedread(vnode_t *vp, int nbytes, uio_t *uio) 734 { 735 znode_t *zp = VTOZ(vp); 736 struct uvm_object *uobj = &vp->v_uobj; 737 krwlock_t *rw = uobj->vmobjlock; 738 int64_t start; 739 caddr_t va; 740 size_t len = nbytes; 741 int off; 742 int error = 0; 743 int npages, found; 744 void *buf = NULL; 745 746 start = uio->uio_loffset; 747 off = start & PAGEOFFSET; 748 749 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 750 page_t *pp; 751 uint64_t bytes = MIN(PAGESIZE - off, len); 752 retry: 753 pp = NULL; 754 npages = 1; 755 rw_enter(rw, RW_WRITER); 756 found = uvn_findpages(uobj, start, &npages, &pp, NULL, 757 UFP_NOALLOC); 758 rw_exit(rw); 759 760 if (found) { 761 if (buf != NULL) { 762 va = zfs_map_page(pp, S_READ); 763 memcpy(buf, va + off, bytes); 764 zfs_unmap_page(pp, va); 765 } 766 rw_enter(rw, RW_WRITER); 767 uvm_page_unbusy(&pp, 1); 768 rw_exit(rw); 769 if (buf == NULL) { 770 buf = kmem_alloc(PAGESIZE, KM_SLEEP); 771 goto retry; 772 } 773 error = uiomove(buf, bytes, UIO_READ, uio); 774 } else { 775 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 776 uio, bytes); 777 } 778 779 len -= bytes; 780 off = 0; 781 if (error) 782 break; 783 } 784 if (buf != NULL) { 785 kmem_free(buf, PAGESIZE); 786 } 787 return (error); 788 } 789 790 static void 791 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, 792 int segflg, dmu_tx_t *tx) 793 { 794 struct uvm_object *uobj = &vp->v_uobj; 795 krwlock_t *rw = uobj->vmobjlock; 796 caddr_t va; 797 int off, status; 798 799 ASSERT(vp->v_mount != NULL); 800 801 rw_enter(rw, RW_WRITER); 802 803 off = start & 
PAGEOFFSET; 804 for (start &= PAGEMASK; len > 0; start += PAGESIZE) { 805 page_t *pp; 806 int nbytes = MIN(PAGESIZE - off, len); 807 int npages, found; 808 809 pp = NULL; 810 npages = 1; 811 found = uvn_findpages(uobj, start, &npages, &pp, NULL, 812 UFP_NOALLOC); 813 if (found) { 814 if (nbytes == PAGESIZE) { 815 /* 816 * We're about to zap the page's contents 817 * and don't care about any existing 818 * modifications. We must keep track of 819 * any new modifications past this point. 820 * Clear the modified bit in the pmap, and 821 * if the page is marked dirty revert to 822 * tracking the modified bit. 823 */ 824 switch (uvm_pagegetdirty(pp)) { 825 case UVM_PAGE_STATUS_DIRTY: 826 /* Does pmap_clear_modify(). */ 827 uvm_pagemarkdirty(pp, UVM_PAGE_STATUS_UNKNOWN); 828 break; 829 case UVM_PAGE_STATUS_UNKNOWN: 830 pmap_clear_modify(pp); 831 break; 832 case UVM_PAGE_STATUS_CLEAN: 833 /* Nothing to do. */ 834 break; 835 } 836 } 837 rw_exit(rw); 838 839 va = zfs_map_page(pp, S_WRITE); 840 (void) dmu_read(os, oid, start + off, nbytes, 841 va + off, DMU_READ_PREFETCH); 842 zfs_unmap_page(pp, va); 843 844 rw_enter(rw, RW_WRITER); 845 uvm_page_unbusy(&pp, 1); 846 } 847 len -= nbytes; 848 off = 0; 849 } 850 rw_exit(rw); 851 } 852 #endif /* __NetBSD__ */ 853 854 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ 855 856 /* 857 * Read bytes from specified file into supplied buffer. 858 * 859 * IN: vp - vnode of file to be read from. 860 * uio - structure supplying read location, range info, 861 * and return buffer. 862 * ioflag - SYNC flags; used to provide FRSYNC semantics. 863 * cr - credentials of caller. 864 * ct - caller context 865 * 866 * OUT: uio - updated offset and range, buffer filled. 867 * 868 * RETURN: 0 on success, error code on failure. 
869 * 870 * Side Effects: 871 * vp - atime updated if byte count > 0 872 */ 873 /* ARGSUSED */ 874 static int 875 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 876 { 877 znode_t *zp = VTOZ(vp); 878 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 879 ssize_t n, nbytes; 880 int error = 0; 881 rl_t *rl; 882 xuio_t *xuio = NULL; 883 884 ZFS_ENTER(zfsvfs); 885 ZFS_VERIFY_ZP(zp); 886 887 if (zp->z_pflags & ZFS_AV_QUARANTINED) { 888 ZFS_EXIT(zfsvfs); 889 return (SET_ERROR(EACCES)); 890 } 891 892 /* 893 * Validate file offset 894 */ 895 if (uio->uio_loffset < (offset_t)0) { 896 ZFS_EXIT(zfsvfs); 897 return (SET_ERROR(EINVAL)); 898 } 899 900 /* 901 * Fasttrack empty reads 902 */ 903 if (uio->uio_resid == 0) { 904 ZFS_EXIT(zfsvfs); 905 return (0); 906 } 907 908 /* 909 * Check for mandatory locks 910 */ 911 if (MANDMODE(zp->z_mode)) { 912 if (error = chklock(vp, FREAD, 913 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { 914 ZFS_EXIT(zfsvfs); 915 return (error); 916 } 917 } 918 919 /* 920 * If we're in FRSYNC mode, sync out this znode before reading it. 921 */ 922 if (zfsvfs->z_log && 923 (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) 924 zil_commit(zfsvfs->z_log, zp->z_id); 925 926 /* 927 * Lock the range against changes. 928 */ 929 rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER); 930 931 /* 932 * If we are reading past end-of-file we can skip 933 * to the end; but we might still need to set atime. 
934 */ 935 if (uio->uio_loffset >= zp->z_size) { 936 error = 0; 937 goto out; 938 } 939 940 ASSERT(uio->uio_loffset < zp->z_size); 941 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); 942 943 #ifdef illumos 944 if ((uio->uio_extflg == UIO_XUIO) && 945 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { 946 int nblk; 947 int blksz = zp->z_blksz; 948 uint64_t offset = uio->uio_loffset; 949 950 xuio = (xuio_t *)uio; 951 if ((ISP2(blksz))) { 952 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, 953 blksz)) / blksz; 954 } else { 955 ASSERT(offset + n <= blksz); 956 nblk = 1; 957 } 958 (void) dmu_xuio_init(xuio, nblk); 959 960 if (vn_has_cached_data(vp)) { 961 /* 962 * For simplicity, we always allocate a full buffer 963 * even if we only expect to read a portion of a block. 964 */ 965 while (--nblk >= 0) { 966 (void) dmu_xuio_add(xuio, 967 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 968 blksz), 0, blksz); 969 } 970 } 971 } 972 #endif /* illumos */ 973 974 while (n > 0) { 975 nbytes = MIN(n, zfs_read_chunk_size - 976 P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); 977 978 #ifdef __FreeBSD__ 979 if (uio->uio_segflg == UIO_NOCOPY) 980 error = mappedread_sf(vp, nbytes, uio); 981 else 982 #endif /* __FreeBSD__ */ 983 if (vn_has_cached_data(vp)) { 984 error = mappedread(vp, nbytes, uio); 985 } else { 986 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 987 uio, nbytes); 988 } 989 if (error) { 990 /* convert checksum errors into IO errors */ 991 if (error == ECKSUM) 992 error = SET_ERROR(EIO); 993 break; 994 } 995 996 n -= nbytes; 997 } 998 out: 999 zfs_range_unlock(rl); 1000 1001 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 1002 ZFS_EXIT(zfsvfs); 1003 return (error); 1004 } 1005 1006 /* 1007 * Write the bytes to a file. 1008 * 1009 * IN: vp - vnode of file to be written to. 1010 * uio - structure supplying write location, range info, 1011 * and data buffer. 1012 * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is 1013 * set if in append mode. 
1014 * cr - credentials of caller. 1015 * ct - caller context (NFS/CIFS fem monitor only) 1016 * 1017 * OUT: uio - updated offset and range. 1018 * 1019 * RETURN: 0 on success, error code on failure. 1020 * 1021 * Timestamps: 1022 * vp - ctime|mtime updated if byte count > 0 1023 */ 1024 1025 /* ARGSUSED */ 1026 static int 1027 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) 1028 { 1029 znode_t *zp = VTOZ(vp); 1030 rlim64_t limit = MAXOFFSET_T; 1031 ssize_t start_resid = uio->uio_resid; 1032 ssize_t tx_bytes; 1033 uint64_t end_size; 1034 dmu_tx_t *tx; 1035 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1036 zilog_t *zilog; 1037 offset_t woff; 1038 ssize_t n, nbytes; 1039 rl_t *rl; 1040 int max_blksz = zfsvfs->z_max_blksz; 1041 int error = 0; 1042 arc_buf_t *abuf; 1043 iovec_t *aiov = NULL; 1044 xuio_t *xuio = NULL; 1045 int i_iov = 0; 1046 int iovcnt = uio->uio_iovcnt; 1047 iovec_t *iovp = uio->uio_iov; 1048 int write_eof; 1049 int count = 0; 1050 sa_bulk_attr_t bulk[4]; 1051 uint64_t mtime[2], ctime[2]; 1052 int segflg; 1053 boolean_t commit; 1054 1055 #ifdef __NetBSD__ 1056 segflg = VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ? 1057 UIO_SYSSPACE : UIO_USERSPACE; 1058 #else 1059 segflg = uio->uio_segflg; 1060 #endif 1061 1062 /* 1063 * Fasttrack empty write 1064 */ 1065 n = start_resid; 1066 if (n == 0) 1067 return (0); 1068 1069 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 1070 limit = MAXOFFSET_T; 1071 1072 ZFS_ENTER(zfsvfs); 1073 ZFS_VERIFY_ZP(zp); 1074 1075 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 1076 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 1077 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 1078 &zp->z_size, 8); 1079 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 1080 &zp->z_pflags, 8); 1081 1082 /* 1083 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. 
snapshots) our 1084 * callers might not be able to detect properly that we are read-only, 1085 * so check it explicitly here. 1086 */ 1087 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 1088 ZFS_EXIT(zfsvfs); 1089 return (SET_ERROR(EROFS)); 1090 } 1091 1092 /* 1093 * If immutable or not appending then return EPERM 1094 */ 1095 if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || 1096 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && 1097 (uio->uio_loffset < zp->z_size))) { 1098 ZFS_EXIT(zfsvfs); 1099 return (SET_ERROR(EPERM)); 1100 } 1101 1102 zilog = zfsvfs->z_log; 1103 1104 /* 1105 * Validate file offset 1106 */ 1107 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; 1108 if (woff < 0) { 1109 ZFS_EXIT(zfsvfs); 1110 return (SET_ERROR(EINVAL)); 1111 } 1112 1113 /* 1114 * Check for mandatory locks before calling zfs_range_lock() 1115 * in order to prevent a deadlock with locks set via fcntl(). 1116 */ 1117 if (MANDMODE((mode_t)zp->z_mode) && 1118 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { 1119 ZFS_EXIT(zfsvfs); 1120 return (error); 1121 } 1122 1123 #ifdef illumos 1124 /* 1125 * Pre-fault the pages to ensure slow (eg NFS) pages 1126 * don't hold up txg. 1127 * Skip this if uio contains loaned arc_buf. 1128 */ 1129 if ((uio->uio_extflg == UIO_XUIO) && 1130 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) 1131 xuio = (xuio_t *)uio; 1132 else 1133 uio_prefaultpages(MIN(n, max_blksz), uio); 1134 #endif 1135 1136 /* 1137 * If in append mode, set the io offset pointer to eof. 1138 */ 1139 if (ioflag & FAPPEND) { 1140 /* 1141 * Obtain an appending range lock to guarantee file append 1142 * semantics. We reset the write offset once we have the lock. 1143 */ 1144 rl = zfs_range_lock(zp, 0, n, RL_APPEND); 1145 woff = rl->r_off; 1146 if (rl->r_len == UINT64_MAX) { 1147 /* 1148 * We overlocked the file because this write will cause 1149 * the file block size to increase. 1150 * Note that zp_size cannot change with this lock held. 
1151 */ 1152 woff = zp->z_size; 1153 } 1154 uio->uio_loffset = woff; 1155 } else { 1156 /* 1157 * Note that if the file block size will change as a result of 1158 * this write, then this range lock will lock the entire file 1159 * so that we can re-write the block safely. 1160 */ 1161 rl = zfs_range_lock(zp, woff, n, RL_WRITER); 1162 } 1163 1164 #ifdef illumos 1165 if (woff >= limit) { 1166 zfs_range_unlock(rl); 1167 ZFS_EXIT(zfsvfs); 1168 return (SET_ERROR(EFBIG)); 1169 } 1170 1171 #endif 1172 #ifdef __FreeBSD__ 1173 if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { 1174 zfs_range_unlock(rl); 1175 ZFS_EXIT(zfsvfs); 1176 return (SET_ERROR(EFBIG)); 1177 } 1178 #endif 1179 #ifdef __NetBSD__ 1180 /* XXXNETBSD we might need vn_rlimit_fsize() too here eventually */ 1181 #endif 1182 1183 if ((woff + n) > limit || woff > (limit - n)) 1184 n = limit - woff; 1185 1186 /* Will this write extend the file length? */ 1187 write_eof = (woff + n > zp->z_size); 1188 1189 end_size = MAX(zp->z_size, woff + n); 1190 1191 commit = ((ioflag & (FSYNC | FDSYNC)) != 0 || 1192 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS); 1193 1194 /* 1195 * Write the file in reasonable size chunks. Each chunk is written 1196 * in a separate transaction; this keeps the intent log records small 1197 * and allows us to do more fine-grained space accounting. 
1198 */ 1199 while (n > 0) { 1200 abuf = NULL; 1201 woff = uio->uio_loffset; 1202 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 1203 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 1204 if (abuf != NULL) 1205 dmu_return_arcbuf(abuf); 1206 error = SET_ERROR(EDQUOT); 1207 break; 1208 } 1209 1210 if (xuio && abuf == NULL) { 1211 ASSERT(i_iov < iovcnt); 1212 aiov = &iovp[i_iov]; 1213 abuf = dmu_xuio_arcbuf(xuio, i_iov); 1214 dmu_xuio_clear(xuio, i_iov); 1215 DTRACE_PROBE3(zfs_cp_write, int, i_iov, 1216 iovec_t *, aiov, arc_buf_t *, abuf); 1217 ASSERT((aiov->iov_base == abuf->b_data) || 1218 ((char *)aiov->iov_base - (char *)abuf->b_data + 1219 aiov->iov_len == arc_buf_size(abuf))); 1220 i_iov++; 1221 } else if (abuf == NULL && n >= max_blksz && 1222 woff >= zp->z_size && 1223 P2PHASE(woff, max_blksz) == 0 && 1224 zp->z_blksz == max_blksz) { 1225 /* 1226 * This write covers a full block. "Borrow" a buffer 1227 * from the dmu so that we can fill it before we enter 1228 * a transaction. This avoids the possibility of 1229 * holding up the transaction if the data copy hangs 1230 * up on a pagefault (e.g., from an NFS server mapping). 1231 */ 1232 #if defined(illumos) || defined(__NetBSD__) 1233 size_t cbytes; 1234 #endif 1235 1236 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 1237 max_blksz); 1238 ASSERT(abuf != NULL); 1239 ASSERT(arc_buf_size(abuf) == max_blksz); 1240 #if defined(illumos) || defined(__NetBSD__) 1241 if (error = uiocopy(abuf->b_data, max_blksz, 1242 UIO_WRITE, uio, &cbytes)) { 1243 dmu_return_arcbuf(abuf); 1244 break; 1245 } 1246 ASSERT(cbytes == max_blksz); 1247 #endif 1248 #ifdef __FreeBSD__ 1249 ssize_t resid = uio->uio_resid; 1250 1251 error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio); 1252 if (error != 0) { 1253 uio->uio_offset -= resid - uio->uio_resid; 1254 uio->uio_resid = resid; 1255 dmu_return_arcbuf(abuf); 1256 break; 1257 } 1258 #endif 1259 } 1260 1261 /* 1262 * Start a transaction. 
1263 */ 1264 tx = dmu_tx_create(zfsvfs->z_os); 1265 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1266 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); 1267 zfs_sa_upgrade_txholds(tx, zp); 1268 error = dmu_tx_assign(tx, TXG_WAIT); 1269 if (error) { 1270 dmu_tx_abort(tx); 1271 if (abuf != NULL) 1272 dmu_return_arcbuf(abuf); 1273 break; 1274 } 1275 1276 /* 1277 * If zfs_range_lock() over-locked we grow the blocksize 1278 * and then reduce the lock range. This will only happen 1279 * on the first iteration since zfs_range_reduce() will 1280 * shrink down r_len to the appropriate size. 1281 */ 1282 if (rl->r_len == UINT64_MAX) { 1283 uint64_t new_blksz; 1284 1285 if (zp->z_blksz > max_blksz) { 1286 /* 1287 * File's blocksize is already larger than the 1288 * "recordsize" property. Only let it grow to 1289 * the next power of 2. 1290 */ 1291 ASSERT(!ISP2(zp->z_blksz)); 1292 new_blksz = MIN(end_size, 1293 1 << highbit64(zp->z_blksz)); 1294 } else { 1295 new_blksz = MIN(end_size, max_blksz); 1296 } 1297 zfs_grow_blocksize(zp, new_blksz, tx); 1298 zfs_range_reduce(rl, woff, n); 1299 } 1300 1301 /* 1302 * XXX - should we really limit each write to z_max_blksz? 1303 * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 1304 */ 1305 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); 1306 1307 if (woff + nbytes > zp->z_size) 1308 vnode_pager_setsize(vp, woff + nbytes); 1309 1310 if (abuf == NULL) { 1311 tx_bytes = uio->uio_resid; 1312 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), 1313 uio, nbytes, tx); 1314 tx_bytes -= uio->uio_resid; 1315 } else { 1316 tx_bytes = nbytes; 1317 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); 1318 /* 1319 * If this is not a full block write, but we are 1320 * extending the file past EOF and this data starts 1321 * block-aligned, use assign_arcbuf(). Otherwise, 1322 * write via dmu_write(). 
1323 */ 1324 if (tx_bytes < max_blksz && (!write_eof || 1325 aiov->iov_base != abuf->b_data)) { 1326 ASSERT(xuio); 1327 dmu_write(zfsvfs->z_os, zp->z_id, woff, 1328 aiov->iov_len, aiov->iov_base, tx); 1329 dmu_return_arcbuf(abuf); 1330 xuio_stat_wbuf_copied(); 1331 } else { 1332 ASSERT(xuio || tx_bytes == max_blksz); 1333 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), 1334 woff, abuf, tx); 1335 } 1336 #if defined(illumos) || defined(__NetBSD__) 1337 ASSERT(tx_bytes <= uio->uio_resid); 1338 uioskip(uio, tx_bytes); 1339 #endif 1340 } 1341 if (tx_bytes && vn_has_cached_data(vp)) { 1342 update_pages(vp, woff, tx_bytes, zfsvfs->z_os, 1343 zp->z_id, segflg, tx); 1344 } 1345 1346 /* 1347 * If we made no progress, we're done. If we made even 1348 * partial progress, update the znode and ZIL accordingly. 1349 */ 1350 if (tx_bytes == 0) { 1351 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 1352 (void *)&zp->z_size, sizeof (uint64_t), tx); 1353 dmu_tx_commit(tx); 1354 ASSERT(error != 0); 1355 break; 1356 } 1357 1358 /* 1359 * Clear Set-UID/Set-GID bits on successful write if not 1360 * privileged and at least one of the excute bits is set. 1361 * 1362 * It would be nice to to this after all writes have 1363 * been done, but that would still expose the ISUID/ISGID 1364 * to another app after the partial write is committed. 1365 * 1366 * Note: we don't call zfs_fuid_map_id() here because 1367 * user 0 is not an ephemeral uid. 
1368 */ 1369 mutex_enter(&zp->z_acl_lock); 1370 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | 1371 (S_IXUSR >> 6))) != 0 && 1372 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && 1373 secpolicy_vnode_setid_retain(vp, cr, 1374 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { 1375 uint64_t newmode; 1376 zp->z_mode &= ~(S_ISUID | S_ISGID); 1377 newmode = zp->z_mode; 1378 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), 1379 (void *)&newmode, sizeof (uint64_t), tx); 1380 #ifdef __NetBSD__ 1381 if (zfsvfs->z_use_namecache) 1382 cache_enter_id(vp, zp->z_mode, zp->z_uid, 1383 zp->z_gid, true); 1384 #endif 1385 } 1386 mutex_exit(&zp->z_acl_lock); 1387 1388 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 1389 B_TRUE); 1390 1391 /* 1392 * Update the file size (zp_size) if it has changed; 1393 * account for possible concurrent updates. 1394 */ 1395 while ((end_size = zp->z_size) < uio->uio_loffset) { 1396 (void) atomic_cas_64(&zp->z_size, end_size, 1397 uio->uio_loffset); 1398 #ifdef illumos 1399 ASSERT(error == 0); 1400 #else 1401 ASSERT(error == 0 || error == EFAULT); 1402 #endif 1403 } 1404 /* 1405 * If we are replaying and eof is non zero then force 1406 * the file size to the specified eof. Note, there's no 1407 * concurrency during replay. 1408 */ 1409 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) 1410 zp->z_size = zfsvfs->z_replay_eof; 1411 1412 if (error == 0) 1413 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1414 else 1415 (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1416 1417 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit); 1418 dmu_tx_commit(tx); 1419 1420 if (error != 0) 1421 break; 1422 ASSERT(tx_bytes == nbytes); 1423 n -= nbytes; 1424 1425 #ifdef illumos 1426 if (!xuio && n > 0) 1427 uio_prefaultpages(MIN(n, max_blksz), uio); 1428 #endif 1429 } 1430 1431 zfs_range_unlock(rl); 1432 1433 /* 1434 * If we're in replay mode, or we made no progress, return error. 
1435 * Otherwise, it's at least a partial write, so it's successful. 1436 */ 1437 if (zfsvfs->z_replay || uio->uio_resid == start_resid) { 1438 ZFS_EXIT(zfsvfs); 1439 return (error); 1440 } 1441 1442 #ifdef __FreeBSD__ 1443 /* 1444 * EFAULT means that at least one page of the source buffer was not 1445 * available. VFS will re-try remaining I/O upon this error. 1446 */ 1447 if (error == EFAULT) { 1448 ZFS_EXIT(zfsvfs); 1449 return (error); 1450 } 1451 #endif 1452 1453 if (commit) 1454 zil_commit(zilog, zp->z_id); 1455 1456 ZFS_EXIT(zfsvfs); 1457 return (0); 1458 } 1459 1460 void 1461 zfs_get_done(zgd_t *zgd, int error) 1462 { 1463 znode_t *zp = zgd->zgd_private; 1464 objset_t *os = zp->z_zfsvfs->z_os; 1465 1466 if (zgd->zgd_db) 1467 dmu_buf_rele(zgd->zgd_db, zgd); 1468 1469 zfs_range_unlock(zgd->zgd_rl); 1470 1471 /* 1472 * Release the vnode asynchronously as we currently have the 1473 * txg stopped from syncing. 1474 */ 1475 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 1476 1477 if (error == 0 && zgd->zgd_bp) 1478 zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); 1479 1480 kmem_free(zgd, sizeof (zgd_t)); 1481 } 1482 1483 #ifdef DEBUG 1484 static int zil_fault_io = 0; 1485 #endif 1486 1487 /* 1488 * Get data to generate a TX_WRITE intent log record. 
1489 */ 1490 int 1491 zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 1492 { 1493 zfsvfs_t *zfsvfs = arg; 1494 objset_t *os = zfsvfs->z_os; 1495 znode_t *zp; 1496 uint64_t object = lr->lr_foid; 1497 uint64_t offset = lr->lr_offset; 1498 uint64_t size = lr->lr_length; 1499 blkptr_t *bp = &lr->lr_blkptr; 1500 dmu_buf_t *db; 1501 zgd_t *zgd; 1502 int error = 0; 1503 1504 ASSERT(zio != NULL); 1505 ASSERT(size != 0); 1506 1507 /* 1508 * Nothing to do if the file has been removed 1509 */ 1510 if (zfs_zget(zfsvfs, object, &zp) != 0) 1511 return (SET_ERROR(ENOENT)); 1512 if (zp->z_unlinked) { 1513 /* 1514 * Release the vnode asynchronously as we currently have the 1515 * txg stopped from syncing. 1516 */ 1517 VN_RELE_ASYNC(ZTOV(zp), 1518 dsl_pool_vnrele_taskq(dmu_objset_pool(os))); 1519 return (SET_ERROR(ENOENT)); 1520 } 1521 1522 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); 1523 zgd->zgd_zilog = zfsvfs->z_log; 1524 zgd->zgd_private = zp; 1525 1526 /* 1527 * Write records come in two flavors: immediate and indirect. 1528 * For small writes it's cheaper to store the data with the 1529 * log record (immediate); for large writes it's cheaper to 1530 * sync the data and get a pointer to it (indirect) so that 1531 * we don't have to write the data twice. 1532 */ 1533 if (buf != NULL) { /* immediate write */ 1534 zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); 1535 /* test for truncation needs to be done while range locked */ 1536 if (offset >= zp->z_size) { 1537 error = SET_ERROR(ENOENT); 1538 } else { 1539 error = dmu_read(os, object, offset, size, buf, 1540 DMU_READ_NO_PREFETCH); 1541 } 1542 ASSERT(error == 0 || error == ENOENT); 1543 } else { /* indirect write */ 1544 /* 1545 * Have to lock the whole block to ensure when it's 1546 * written out and it's checksum is being calculated 1547 * that no one can change the data. We need to re-check 1548 * blocksize after we get the lock in case it's changed! 
1549 */ 1550 for (;;) { 1551 uint64_t blkoff; 1552 size = zp->z_blksz; 1553 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; 1554 offset -= blkoff; 1555 zgd->zgd_rl = zfs_range_lock(zp, offset, size, 1556 RL_READER); 1557 if (zp->z_blksz == size) 1558 break; 1559 offset += blkoff; 1560 zfs_range_unlock(zgd->zgd_rl); 1561 } 1562 /* test for truncation needs to be done while range locked */ 1563 if (lr->lr_offset >= zp->z_size) 1564 error = SET_ERROR(ENOENT); 1565 #ifdef DEBUG 1566 if (zil_fault_io) { 1567 error = SET_ERROR(EIO); 1568 zil_fault_io = 0; 1569 } 1570 #endif 1571 if (error == 0) 1572 error = dmu_buf_hold(os, object, offset, zgd, &db, 1573 DMU_READ_NO_PREFETCH); 1574 1575 if (error == 0) { 1576 blkptr_t *obp = dmu_buf_get_blkptr(db); 1577 if (obp) { 1578 ASSERT(BP_IS_HOLE(bp)); 1579 *bp = *obp; 1580 } 1581 1582 zgd->zgd_db = db; 1583 zgd->zgd_bp = bp; 1584 1585 ASSERT(db->db_offset == offset); 1586 ASSERT(db->db_size == size); 1587 1588 error = dmu_sync(zio, lr->lr_common.lrc_txg, 1589 zfs_get_done, zgd); 1590 ASSERT(error || lr->lr_length <= zp->z_blksz); 1591 1592 /* 1593 * On success, we need to wait for the write I/O 1594 * initiated by dmu_sync() to complete before we can 1595 * release this dbuf. We will finish everything up 1596 * in the zfs_get_done() callback. 
1597 */ 1598 if (error == 0) 1599 return (0); 1600 1601 if (error == EALREADY) { 1602 lr->lr_common.lrc_txtype = TX_WRITE2; 1603 error = 0; 1604 } 1605 } 1606 } 1607 1608 zfs_get_done(zgd, error); 1609 1610 return (error); 1611 } 1612 1613 /*ARGSUSED*/ 1614 static int 1615 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, 1616 caller_context_t *ct) 1617 { 1618 znode_t *zp = VTOZ(vp); 1619 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1620 int error; 1621 1622 ZFS_ENTER(zfsvfs); 1623 ZFS_VERIFY_ZP(zp); 1624 1625 if (flag & V_ACE_MASK) 1626 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); 1627 else 1628 error = zfs_zaccess_rwx(zp, mode, flag, cr); 1629 1630 ZFS_EXIT(zfsvfs); 1631 return (error); 1632 } 1633 1634 #ifdef __FreeBSD__ 1635 static int 1636 zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) 1637 { 1638 int error; 1639 1640 *vpp = arg; 1641 error = vn_lock(*vpp, lkflags); 1642 if (error != 0) 1643 vrele(*vpp); 1644 return (error); 1645 } 1646 1647 static int 1648 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) 1649 { 1650 znode_t *zdp = VTOZ(dvp); 1651 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1652 int error; 1653 int ltype; 1654 1655 ASSERT_VOP_LOCKED(dvp, __func__); 1656 #ifdef DIAGNOSTIC 1657 if ((zdp->z_pflags & ZFS_XATTR) == 0) 1658 VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); 1659 #endif 1660 1661 if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { 1662 ASSERT3P(dvp, ==, vp); 1663 vref(dvp); 1664 ltype = lkflags & LK_TYPE_MASK; 1665 if (ltype != VOP_ISLOCKED(dvp)) { 1666 if (ltype == LK_EXCLUSIVE) 1667 vn_lock(dvp, LK_UPGRADE | LK_RETRY); 1668 else /* if (ltype == LK_SHARED) */ 1669 vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); 1670 1671 /* 1672 * Relock for the "." case could leave us with 1673 * reclaimed vnode. 1674 */ 1675 if (dvp->v_iflag & VI_DOOMED) { 1676 vrele(dvp); 1677 return (SET_ERROR(ENOENT)); 1678 } 1679 } 1680 return (0); 1681 } else if (name[0] == '.' && name[1] == '.' 
&& name[2] == 0) { 1682 /* 1683 * Note that in this case, dvp is the child vnode, and we 1684 * are looking up the parent vnode - exactly reverse from 1685 * normal operation. Unlocking dvp requires some rather 1686 * tricky unlock/relock dance to prevent mp from being freed; 1687 * use vn_vget_ino_gen() which takes care of all that. 1688 * 1689 * XXX Note that there is a time window when both vnodes are 1690 * unlocked. It is possible, although highly unlikely, that 1691 * during that window the parent-child relationship between 1692 * the vnodes may change, for example, get reversed. 1693 * In that case we would have a wrong lock order for the vnodes. 1694 * All other filesystems seem to ignore this problem, so we 1695 * do the same here. 1696 * A potential solution could be implemented as follows: 1697 * - using LK_NOWAIT when locking the second vnode and retrying 1698 * if necessary 1699 * - checking that the parent-child relationship still holds 1700 * after locking both vnodes and retrying if it doesn't 1701 */ 1702 error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); 1703 return (error); 1704 } else { 1705 error = vn_lock(vp, lkflags); 1706 if (error != 0) 1707 vrele(vp); 1708 return (error); 1709 } 1710 } 1711 1712 /* 1713 * Lookup an entry in a directory, or an extended attribute directory. 1714 * If it exists, return a held vnode reference for it. 1715 * 1716 * IN: dvp - vnode of directory to search. 1717 * nm - name of entry to lookup. 1718 * pnp - full pathname to lookup [UNUSED]. 1719 * flags - LOOKUP_XATTR set if looking for an attribute. 1720 * rdir - root directory vnode [UNUSED]. 1721 * cr - credentials of caller. 1722 * ct - caller context 1723 * 1724 * OUT: vpp - vnode of located entry, NULL if not found. 1725 * 1726 * RETURN: 0 on success, error code on failure. 
1727 * 1728 * Timestamps: 1729 * NA 1730 */ 1731 /* ARGSUSED */ 1732 static int 1733 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, 1734 int nameiop, cred_t *cr, kthread_t *td, int flags) 1735 { 1736 znode_t *zdp = VTOZ(dvp); 1737 znode_t *zp; 1738 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1739 int error = 0; 1740 1741 /* fast path (should be redundant with vfs namecache) */ 1742 if (!(flags & LOOKUP_XATTR)) { 1743 if (dvp->v_type != VDIR) { 1744 return (SET_ERROR(ENOTDIR)); 1745 } else if (zdp->z_sa_hdl == NULL) { 1746 return (SET_ERROR(EIO)); 1747 } 1748 } 1749 1750 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); 1751 1752 ZFS_ENTER(zfsvfs); 1753 ZFS_VERIFY_ZP(zdp); 1754 1755 *vpp = NULL; 1756 1757 if (flags & LOOKUP_XATTR) { 1758 #ifdef TODO 1759 /* 1760 * If the xattr property is off, refuse the lookup request. 1761 */ 1762 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 1763 ZFS_EXIT(zfsvfs); 1764 return (SET_ERROR(EINVAL)); 1765 } 1766 #endif 1767 1768 /* 1769 * We don't allow recursive attributes.. 1770 * Maybe someday we will. 1771 */ 1772 if (zdp->z_pflags & ZFS_XATTR) { 1773 ZFS_EXIT(zfsvfs); 1774 return (SET_ERROR(EINVAL)); 1775 } 1776 1777 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 1778 ZFS_EXIT(zfsvfs); 1779 return (error); 1780 } 1781 1782 /* 1783 * Do we have permission to get into attribute directory? 1784 */ 1785 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 1786 B_FALSE, cr)) { 1787 vrele(*vpp); 1788 *vpp = NULL; 1789 } 1790 1791 ZFS_EXIT(zfsvfs); 1792 return (error); 1793 } 1794 1795 /* 1796 * Check accessibility of directory. 1797 */ 1798 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 1799 ZFS_EXIT(zfsvfs); 1800 return (error); 1801 } 1802 1803 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 1804 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 1805 ZFS_EXIT(zfsvfs); 1806 return (SET_ERROR(EILSEQ)); 1807 } 1808 1809 1810 /* 1811 * First handle the special cases. 
1812 */ 1813 if ((cnp->cn_flags & ISDOTDOT) != 0) { 1814 /* 1815 * If we are a snapshot mounted under .zfs, return 1816 * the vp for the snapshot directory. 1817 */ 1818 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { 1819 struct componentname cn; 1820 vnode_t *zfsctl_vp; 1821 int ltype; 1822 1823 ZFS_EXIT(zfsvfs); 1824 ltype = VOP_ISLOCKED(dvp); 1825 VOP_UNLOCK(dvp, 0); 1826 error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, 1827 &zfsctl_vp); 1828 if (error == 0) { 1829 cn.cn_nameptr = "snapshot"; 1830 cn.cn_namelen = strlen(cn.cn_nameptr); 1831 cn.cn_nameiop = cnp->cn_nameiop; 1832 cn.cn_flags = cnp->cn_flags; 1833 cn.cn_lkflags = cnp->cn_lkflags; 1834 error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); 1835 vput(zfsctl_vp); 1836 } 1837 vn_lock(dvp, ltype | LK_RETRY); 1838 return (error); 1839 } 1840 } 1841 if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { 1842 ZFS_EXIT(zfsvfs); 1843 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) 1844 return (SET_ERROR(ENOTSUP)); 1845 error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); 1846 return (error); 1847 } 1848 1849 /* 1850 * The loop is retry the lookup if the parent-child relationship 1851 * changes during the dot-dot locking complexities. 1852 */ 1853 for (;;) { 1854 uint64_t parent; 1855 1856 error = zfs_dirlook(zdp, nm, &zp); 1857 if (error == 0) 1858 *vpp = ZTOV(zp); 1859 1860 ZFS_EXIT(zfsvfs); 1861 if (error != 0) 1862 break; 1863 1864 error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); 1865 if (error != 0) { 1866 /* 1867 * If we've got a locking error, then the vnode 1868 * got reclaimed because of a force unmount. 1869 * We never enter doomed vnodes into the name cache. 
1870 */ 1871 *vpp = NULL; 1872 return (error); 1873 } 1874 1875 if ((cnp->cn_flags & ISDOTDOT) == 0) 1876 break; 1877 1878 ZFS_ENTER(zfsvfs); 1879 if (zdp->z_sa_hdl == NULL) { 1880 error = SET_ERROR(EIO); 1881 } else { 1882 error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 1883 &parent, sizeof (parent)); 1884 } 1885 if (error != 0) { 1886 ZFS_EXIT(zfsvfs); 1887 vput(ZTOV(zp)); 1888 break; 1889 } 1890 if (zp->z_id == parent) { 1891 ZFS_EXIT(zfsvfs); 1892 break; 1893 } 1894 vput(ZTOV(zp)); 1895 } 1896 1897 out: 1898 if (error != 0) 1899 *vpp = NULL; 1900 1901 /* Translate errors and add SAVENAME when needed. */ 1902 if (cnp->cn_flags & ISLASTCN) { 1903 switch (nameiop) { 1904 case CREATE: 1905 case RENAME: 1906 if (error == ENOENT) { 1907 error = EJUSTRETURN; 1908 cnp->cn_flags |= SAVENAME; 1909 break; 1910 } 1911 /* FALLTHROUGH */ 1912 case DELETE: 1913 if (error == 0) 1914 cnp->cn_flags |= SAVENAME; 1915 break; 1916 } 1917 } 1918 1919 /* Insert name into cache (as non-existent) if appropriate. */ 1920 if (zfsvfs->z_use_namecache && 1921 error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) 1922 cache_enter(dvp, NULL, cnp); 1923 1924 /* Insert name into cache if appropriate. */ 1925 if (zfsvfs->z_use_namecache && 1926 error == 0 && (cnp->cn_flags & MAKEENTRY)) { 1927 if (!(cnp->cn_flags & ISLASTCN) || 1928 (nameiop != DELETE && nameiop != RENAME)) { 1929 cache_enter(dvp, *vpp, cnp); 1930 } 1931 } 1932 1933 return (error); 1934 } 1935 #endif /* __FreeBSD__ */ 1936 1937 #ifdef __NetBSD__ 1938 /* 1939 * If vnode is for a device return a specfs vnode instead. 
1940 */ 1941 static int 1942 specvp_check(vnode_t **vpp, cred_t *cr) 1943 { 1944 int error = 0; 1945 1946 if (IS_DEVVP(*vpp)) { 1947 struct vnode *svp; 1948 1949 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); 1950 VN_RELE(*vpp); 1951 if (svp == NULL) 1952 error = ENOSYS; 1953 *vpp = svp; 1954 } 1955 return (error); 1956 } 1957 1958 /* 1959 * Lookup an entry in a directory, or an extended attribute directory. 1960 * If it exists, return a held vnode reference for it. 1961 * 1962 * IN: dvp - vnode of directory to search. 1963 * nm - name of entry to lookup. 1964 * pnp - full pathname to lookup [UNUSED]. 1965 * flags - LOOKUP_XATTR set if looking for an attribute. 1966 * rdir - root directory vnode [UNUSED]. 1967 * cr - credentials of caller. 1968 * ct - caller context 1969 * direntflags - directory lookup flags 1970 * realpnp - returned pathname. 1971 * 1972 * OUT: vpp - vnode of located entry, NULL if not found. 1973 * 1974 * RETURN: 0 if success 1975 * error code if failure 1976 * 1977 * Timestamps: 1978 * NA 1979 */ 1980 /* ARGSUSED */ 1981 static int 1982 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, 1983 struct componentname *cnp, int nameiop, cred_t *cr) 1984 { 1985 znode_t *zdp = VTOZ(dvp); 1986 znode_t *zp; 1987 zfsvfs_t *zfsvfs = zdp->z_zfsvfs; 1988 int error = 0; 1989 1990 /* fast path */ 1991 if (!(flags & LOOKUP_XATTR)) { 1992 if (dvp->v_type != VDIR) { 1993 return (ENOTDIR); 1994 } else if (zdp->z_sa_hdl == NULL) { 1995 return (SET_ERROR(EIO)); 1996 } 1997 1998 if (nm[0] == 0 || (nm[0] == '.' 
&& nm[1] == '\0')) { 1999 error = zfs_fastaccesschk_execute(zdp, cr); 2000 if (!error) { 2001 *vpp = dvp; 2002 VN_HOLD(*vpp); 2003 return (0); 2004 } 2005 return (error); 2006 } else { 2007 vnode_t *tvp = dnlc_lookup(dvp, nm); 2008 2009 if (tvp) { 2010 error = zfs_fastaccesschk_execute(zdp, cr); 2011 if (error) { 2012 VN_RELE(tvp); 2013 return (error); 2014 } 2015 if (tvp == DNLC_NO_VNODE) { 2016 VN_RELE(tvp); 2017 return (ENOENT); 2018 } else { 2019 *vpp = tvp; 2020 return (specvp_check(vpp, cr)); 2021 } 2022 } 2023 } 2024 } 2025 2026 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); 2027 2028 ZFS_ENTER(zfsvfs); 2029 ZFS_VERIFY_ZP(zdp); 2030 2031 *vpp = NULL; 2032 2033 if (flags & LOOKUP_XATTR) { 2034 #ifdef TODO 2035 /* 2036 * If the xattr property is off, refuse the lookup request. 2037 */ 2038 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { 2039 ZFS_EXIT(zfsvfs); 2040 return (EINVAL); 2041 } 2042 #endif 2043 2044 /* 2045 * We don't allow recursive attributes.. 2046 * Maybe someday we will. 2047 */ 2048 if (zdp->z_pflags & ZFS_XATTR) { 2049 ZFS_EXIT(zfsvfs); 2050 return (EINVAL); 2051 } 2052 2053 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { 2054 ZFS_EXIT(zfsvfs); 2055 return (error); 2056 } 2057 2058 /* 2059 * Do we have permission to get into attribute directory? 2060 */ 2061 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, 2062 B_FALSE, cr)) { 2063 VN_RELE(*vpp); 2064 *vpp = NULL; 2065 } 2066 2067 ZFS_EXIT(zfsvfs); 2068 return (error); 2069 } 2070 2071 if (dvp->v_type != VDIR) { 2072 ZFS_EXIT(zfsvfs); 2073 return (ENOTDIR); 2074 } 2075 2076 /* 2077 * Check accessibility of directory. 2078 */ 2079 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { 2080 ZFS_EXIT(zfsvfs); 2081 return (error); 2082 } 2083 2084 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), 2085 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2086 ZFS_EXIT(zfsvfs); 2087 return (EILSEQ); 2088 } 2089 2090 /* 2091 * First handle the special cases. 
2092 */ 2093 if ((cnp->cn_flags & ISDOTDOT) != 0) { 2094 /* 2095 * If we are a snapshot mounted under .zfs, return 2096 * the vp for the snapshot directory. 2097 */ 2098 if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { 2099 ZFS_EXIT(zfsvfs); 2100 error = zfsctl_snapshot(zfsvfs->z_parent, vpp); 2101 2102 return (error); 2103 } 2104 } 2105 if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { 2106 ZFS_EXIT(zfsvfs); 2107 if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) 2108 return (SET_ERROR(ENOTSUP)); 2109 error = zfsctl_root(zfsvfs, vpp); 2110 return (error); 2111 } 2112 2113 error = zfs_dirlook(zdp, nm, &zp); 2114 if (error == 0) { 2115 *vpp = ZTOV(zp); 2116 error = specvp_check(vpp, cr); 2117 } 2118 2119 ZFS_EXIT(zfsvfs); 2120 return (error); 2121 } 2122 #endif 2123 2124 /* 2125 * Attempt to create a new entry in a directory. If the entry 2126 * already exists, truncate the file if permissible, else return 2127 * an error. Return the vp of the created or trunc'd file. 2128 * 2129 * IN: dvp - vnode of directory to put new file entry in. 2130 * name - name of new file entry. 2131 * vap - attributes of new file. 2132 * excl - flag indicating exclusive or non-exclusive mode. 2133 * mode - mode to open file with. 2134 * cr - credentials of caller. 2135 * flag - large file flag [UNUSED]. 2136 * ct - caller context 2137 * vsecp - ACL to be set 2138 * 2139 * OUT: vpp - vnode of created or trunc'd entry. 2140 * 2141 * RETURN: 0 on success, error code on failure. 
2142 * 2143 * Timestamps: 2144 * dvp - ctime|mtime updated if new entry created 2145 * vp - ctime|mtime always, atime if new 2146 */ 2147 2148 /* ARGSUSED */ 2149 static int 2150 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, 2151 vnode_t **vpp, cred_t *cr, kthread_t *td) 2152 { 2153 znode_t *zp, *dzp = VTOZ(dvp); 2154 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2155 zilog_t *zilog; 2156 objset_t *os; 2157 dmu_tx_t *tx; 2158 int error; 2159 ksid_t *ksid; 2160 uid_t uid; 2161 gid_t gid = crgetgid(cr); 2162 zfs_acl_ids_t acl_ids; 2163 boolean_t fuid_dirtied; 2164 void *vsecp = NULL; 2165 int flag = 0; 2166 uint64_t txtype; 2167 2168 /* 2169 * If we have an ephemeral id, ACL, or XVATTR then 2170 * make sure file system is at proper version 2171 */ 2172 2173 ksid = crgetsid(cr, KSID_OWNER); 2174 if (ksid) 2175 uid = ksid_getid(ksid); 2176 else 2177 uid = crgetuid(cr); 2178 2179 if (zfsvfs->z_use_fuids == B_FALSE && 2180 (vsecp || (vap->va_mask & AT_XVATTR) || 2181 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 2182 return (SET_ERROR(EINVAL)); 2183 2184 ZFS_ENTER(zfsvfs); 2185 ZFS_VERIFY_ZP(dzp); 2186 os = zfsvfs->z_os; 2187 zilog = zfsvfs->z_log; 2188 2189 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 2190 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2191 ZFS_EXIT(zfsvfs); 2192 return (SET_ERROR(EILSEQ)); 2193 } 2194 2195 if (vap->va_mask & AT_XVATTR) { 2196 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, 2197 crgetuid(cr), cr, vap->va_type)) != 0) { 2198 ZFS_EXIT(zfsvfs); 2199 return (error); 2200 } 2201 } 2202 2203 *vpp = NULL; 2204 2205 if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) 2206 vap->va_mode &= ~S_ISVTX; 2207 2208 error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); 2209 if (error) { 2210 ZFS_EXIT(zfsvfs); 2211 return (error); 2212 } 2213 ASSERT3P(zp, ==, NULL); 2214 2215 /* 2216 * Create a new file object and update the directory 2217 * to reference it. 
2218 */ 2219 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 2220 goto out; 2221 } 2222 2223 /* 2224 * We only support the creation of regular files in 2225 * extended attribute directories. 2226 */ 2227 2228 if ((dzp->z_pflags & ZFS_XATTR) && 2229 (vap->va_type != VREG)) { 2230 error = SET_ERROR(EINVAL); 2231 goto out; 2232 } 2233 2234 if ((error = zfs_acl_ids_create(dzp, 0, vap, 2235 cr, vsecp, &acl_ids)) != 0) 2236 goto out; 2237 2238 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 2239 zfs_acl_ids_free(&acl_ids); 2240 error = SET_ERROR(EDQUOT); 2241 goto out; 2242 } 2243 2244 getnewvnode_reserve(1); 2245 2246 tx = dmu_tx_create(os); 2247 2248 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 2249 ZFS_SA_BASE_ATTR_SIZE); 2250 2251 fuid_dirtied = zfsvfs->z_fuid_dirty; 2252 if (fuid_dirtied) 2253 zfs_fuid_txhold(zfsvfs, tx); 2254 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 2255 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 2256 if (!zfsvfs->z_use_sa && 2257 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2258 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 2259 0, acl_ids.z_aclp->z_acl_bytes); 2260 } 2261 error = dmu_tx_assign(tx, TXG_WAIT); 2262 if (error) { 2263 zfs_acl_ids_free(&acl_ids); 2264 dmu_tx_abort(tx); 2265 getnewvnode_drop_reserve(); 2266 ZFS_EXIT(zfsvfs); 2267 return (error); 2268 } 2269 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 2270 2271 if (fuid_dirtied) 2272 zfs_fuid_sync(zfsvfs, tx); 2273 2274 (void) zfs_link_create(dzp, name, zp, tx, ZNEW); 2275 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); 2276 zfs_log_create(zilog, tx, txtype, dzp, zp, name, 2277 vsecp, acl_ids.z_fuidp, vap); 2278 zfs_acl_ids_free(&acl_ids); 2279 dmu_tx_commit(tx); 2280 2281 getnewvnode_drop_reserve(); 2282 2283 out: 2284 if (error == 0) { 2285 *vpp = ZTOV(zp); 2286 } 2287 2288 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2289 zil_commit(zilog, 0); 2290 2291 ZFS_EXIT(zfsvfs); 2292 return (error); 2293 } 2294 2295 /* 2296 * Remove an entry from a directory. 
2297 * 2298 * IN: dvp - vnode of directory to remove entry from. 2299 * name - name of entry to remove. 2300 * cr - credentials of caller. 2301 * ct - caller context 2302 * flags - case flags 2303 * 2304 * RETURN: 0 on success, error code on failure. 2305 * 2306 * Timestamps: 2307 * dvp - ctime|mtime 2308 * vp - ctime (if nlink > 0) 2309 */ 2310 2311 /*ARGSUSED*/ 2312 static int 2313 zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) 2314 { 2315 znode_t *dzp = VTOZ(dvp); 2316 znode_t *zp = VTOZ(vp); 2317 znode_t *xzp; 2318 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2319 zilog_t *zilog; 2320 uint64_t acl_obj, xattr_obj; 2321 uint64_t obj = 0; 2322 dmu_tx_t *tx; 2323 boolean_t unlinked, toobig = FALSE; 2324 uint64_t txtype; 2325 int error; 2326 2327 ZFS_ENTER(zfsvfs); 2328 ZFS_VERIFY_ZP(dzp); 2329 ZFS_VERIFY_ZP(zp); 2330 zilog = zfsvfs->z_log; 2331 zp = VTOZ(vp); 2332 2333 xattr_obj = 0; 2334 xzp = NULL; 2335 2336 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 2337 goto out; 2338 } 2339 2340 /* 2341 * Need to use rmdir for removing directories. 2342 */ 2343 if (vp->v_type == VDIR) { 2344 error = SET_ERROR(EPERM); 2345 goto out; 2346 } 2347 2348 vnevent_remove(vp, dvp, name, ct); 2349 2350 obj = zp->z_id; 2351 2352 /* are there any extended attributes? */ 2353 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 2354 &xattr_obj, sizeof (xattr_obj)); 2355 if (error == 0 && xattr_obj) { 2356 error = zfs_zget(zfsvfs, xattr_obj, &xzp); 2357 ASSERT0(error); 2358 } 2359 2360 /* 2361 * We may delete the znode now, or we may put it in the unlinked set; 2362 * it depends on whether we're the last link, and on whether there are 2363 * other holds on the vnode. So we dmu_tx_hold() the right things to 2364 * allow for either case. 
2365 */ 2366 tx = dmu_tx_create(zfsvfs->z_os); 2367 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 2368 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2369 zfs_sa_upgrade_txholds(tx, zp); 2370 zfs_sa_upgrade_txholds(tx, dzp); 2371 2372 if (xzp) { 2373 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 2374 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); 2375 } 2376 2377 /* charge as an update -- would be nice not to charge at all */ 2378 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 2379 2380 /* 2381 * Mark this transaction as typically resulting in a net free of space 2382 */ 2383 dmu_tx_mark_netfree(tx); 2384 2385 error = dmu_tx_assign(tx, TXG_WAIT); 2386 if (error) { 2387 dmu_tx_abort(tx); 2388 ZFS_EXIT(zfsvfs); 2389 return (error); 2390 } 2391 2392 /* 2393 * Remove the directory entry. 2394 */ 2395 error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); 2396 2397 if (error) { 2398 dmu_tx_commit(tx); 2399 goto out; 2400 } 2401 2402 if (unlinked) { 2403 zfs_unlinked_add(zp, tx); 2404 vp->v_vflag |= VV_NOSYNC; 2405 } 2406 2407 txtype = TX_REMOVE; 2408 zfs_log_remove(zilog, tx, txtype, dzp, name, obj); 2409 2410 dmu_tx_commit(tx); 2411 out: 2412 2413 if (xzp) 2414 vrele(ZTOV(xzp)); 2415 2416 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2417 zil_commit(zilog, 0); 2418 2419 ZFS_EXIT(zfsvfs); 2420 return (error); 2421 } 2422 2423 /* 2424 * Create a new directory and insert it into dvp using the name 2425 * provided. Return a pointer to the inserted directory. 2426 * 2427 * IN: dvp - vnode of directory to add subdir to. 2428 * dirname - name of new directory. 2429 * vap - attributes of new directory. 2430 * cr - credentials of caller. 2431 * ct - caller context 2432 * flags - case flags 2433 * vsecp - ACL to be set 2434 * 2435 * OUT: vpp - vnode of created directory. 2436 * 2437 * RETURN: 0 on success, error code on failure. 
2438 * 2439 * Timestamps: 2440 * dvp - ctime|mtime updated 2441 * vp - ctime|mtime|atime updated 2442 */ 2443 /*ARGSUSED*/ 2444 static int 2445 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) 2446 { 2447 znode_t *zp, *dzp = VTOZ(dvp); 2448 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2449 zilog_t *zilog; 2450 uint64_t txtype; 2451 dmu_tx_t *tx; 2452 int error; 2453 ksid_t *ksid; 2454 uid_t uid; 2455 gid_t gid = crgetgid(cr); 2456 zfs_acl_ids_t acl_ids; 2457 boolean_t fuid_dirtied; 2458 2459 ASSERT(vap->va_type == VDIR); 2460 2461 /* 2462 * If we have an ephemeral id, ACL, or XVATTR then 2463 * make sure file system is at proper version 2464 */ 2465 2466 ksid = crgetsid(cr, KSID_OWNER); 2467 if (ksid) 2468 uid = ksid_getid(ksid); 2469 else 2470 uid = crgetuid(cr); 2471 if (zfsvfs->z_use_fuids == B_FALSE && 2472 ((vap->va_mask & AT_XVATTR) || 2473 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) 2474 return (SET_ERROR(EINVAL)); 2475 2476 ZFS_ENTER(zfsvfs); 2477 ZFS_VERIFY_ZP(dzp); 2478 zilog = zfsvfs->z_log; 2479 2480 if (dzp->z_pflags & ZFS_XATTR) { 2481 ZFS_EXIT(zfsvfs); 2482 return (SET_ERROR(EINVAL)); 2483 } 2484 2485 if (zfsvfs->z_utf8 && u8_validate(dirname, 2486 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 2487 ZFS_EXIT(zfsvfs); 2488 return (SET_ERROR(EILSEQ)); 2489 } 2490 2491 if (vap->va_mask & AT_XVATTR) { 2492 if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, 2493 crgetuid(cr), cr, vap->va_type)) != 0) { 2494 ZFS_EXIT(zfsvfs); 2495 return (error); 2496 } 2497 } 2498 2499 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, 2500 NULL, &acl_ids)) != 0) { 2501 ZFS_EXIT(zfsvfs); 2502 return (error); 2503 } 2504 2505 /* 2506 * First make sure the new directory doesn't exist. 2507 * 2508 * Existence is checked first to make sure we don't return 2509 * EACCES instead of EEXIST which can cause some applications 2510 * to fail. 
2511 */ 2512 *vpp = NULL; 2513 2514 if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { 2515 zfs_acl_ids_free(&acl_ids); 2516 ZFS_EXIT(zfsvfs); 2517 return (error); 2518 } 2519 ASSERT3P(zp, ==, NULL); 2520 2521 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { 2522 zfs_acl_ids_free(&acl_ids); 2523 ZFS_EXIT(zfsvfs); 2524 return (error); 2525 } 2526 2527 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 2528 zfs_acl_ids_free(&acl_ids); 2529 ZFS_EXIT(zfsvfs); 2530 return (SET_ERROR(EDQUOT)); 2531 } 2532 2533 /* 2534 * Add a new entry to the directory. 2535 */ 2536 getnewvnode_reserve(1); 2537 tx = dmu_tx_create(zfsvfs->z_os); 2538 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); 2539 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); 2540 fuid_dirtied = zfsvfs->z_fuid_dirty; 2541 if (fuid_dirtied) 2542 zfs_fuid_txhold(zfsvfs, tx); 2543 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 2544 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 2545 acl_ids.z_aclp->z_acl_bytes); 2546 } 2547 2548 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 2549 ZFS_SA_BASE_ATTR_SIZE); 2550 2551 error = dmu_tx_assign(tx, TXG_WAIT); 2552 if (error) { 2553 zfs_acl_ids_free(&acl_ids); 2554 dmu_tx_abort(tx); 2555 getnewvnode_drop_reserve(); 2556 ZFS_EXIT(zfsvfs); 2557 return (error); 2558 } 2559 2560 /* 2561 * Create new node. 2562 */ 2563 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 2564 2565 if (fuid_dirtied) 2566 zfs_fuid_sync(zfsvfs, tx); 2567 2568 /* 2569 * Now put new name in parent dir. 
2570 */ 2571 (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); 2572 2573 *vpp = ZTOV(zp); 2574 2575 txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); 2576 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, 2577 acl_ids.z_fuidp, vap); 2578 2579 zfs_acl_ids_free(&acl_ids); 2580 2581 dmu_tx_commit(tx); 2582 2583 getnewvnode_drop_reserve(); 2584 2585 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2586 zil_commit(zilog, 0); 2587 2588 ZFS_EXIT(zfsvfs); 2589 return (0); 2590 } 2591 2592 /* 2593 * Remove a directory subdir entry. If the current working 2594 * directory is the same as the subdir to be removed, the 2595 * remove will fail. 2596 * 2597 * IN: dvp - vnode of directory to remove from. 2598 * name - name of directory to be removed. 2599 * cwd - vnode of current working directory. 2600 * cr - credentials of caller. 2601 * ct - caller context 2602 * flags - case flags 2603 * 2604 * RETURN: 0 on success, error code on failure. 2605 * 2606 * Timestamps: 2607 * dvp - ctime|mtime updated 2608 */ 2609 /*ARGSUSED*/ 2610 static int 2611 zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) 2612 { 2613 znode_t *dzp = VTOZ(dvp); 2614 znode_t *zp = VTOZ(vp); 2615 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 2616 zilog_t *zilog; 2617 dmu_tx_t *tx; 2618 int error; 2619 2620 ZFS_ENTER(zfsvfs); 2621 ZFS_VERIFY_ZP(dzp); 2622 ZFS_VERIFY_ZP(zp); 2623 zilog = zfsvfs->z_log; 2624 2625 2626 if (error = zfs_zaccess_delete(dzp, zp, cr)) { 2627 goto out; 2628 } 2629 2630 if (vp->v_type != VDIR) { 2631 error = SET_ERROR(ENOTDIR); 2632 goto out; 2633 } 2634 2635 vnevent_rmdir(vp, dvp, name, ct); 2636 2637 tx = dmu_tx_create(zfsvfs->z_os); 2638 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); 2639 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2640 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 2641 zfs_sa_upgrade_txholds(tx, zp); 2642 zfs_sa_upgrade_txholds(tx, dzp); 2643 dmu_tx_mark_netfree(tx); 2644 error = dmu_tx_assign(tx, TXG_WAIT); 2645 if (error) { 2646 dmu_tx_abort(tx); 2647 
ZFS_EXIT(zfsvfs); 2648 return (error); 2649 } 2650 2651 cache_purge(dvp); 2652 2653 error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); 2654 2655 if (error == 0) { 2656 uint64_t txtype = TX_RMDIR; 2657 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); 2658 } 2659 2660 dmu_tx_commit(tx); 2661 2662 cache_purge(vp); 2663 out: 2664 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 2665 zil_commit(zilog, 0); 2666 2667 ZFS_EXIT(zfsvfs); 2668 return (error); 2669 } 2670 2671 /* 2672 * Read as many directory entries as will fit into the provided 2673 * buffer from the given directory cursor position (specified in 2674 * the uio structure). 2675 * 2676 * IN: vp - vnode of directory to read. 2677 * uio - structure supplying read location, range info, 2678 * and return buffer. 2679 * cr - credentials of caller. 2680 * ct - caller context 2681 * flags - case flags 2682 * 2683 * OUT: uio - updated offset and range, buffer filled. 2684 * eofp - set to true if end-of-file detected. 2685 * 2686 * RETURN: 0 on success, error code on failure. 2687 * 2688 * Timestamps: 2689 * vp - atime updated 2690 * 2691 * Note that the low 4 bits of the cookie returned by zap is always zero. 2692 * This allows us to use the low range for "special" directory entries: 2693 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, 2694 * we use the offset 2 for the '.zfs' directory. 
2695 */ 2696 /* ARGSUSED */ 2697 static int 2698 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, off_t **cookies) 2699 { 2700 znode_t *zp = VTOZ(vp); 2701 iovec_t *iovp; 2702 edirent_t *eodp; 2703 dirent64_t *odp; 2704 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 2705 objset_t *os; 2706 caddr_t outbuf; 2707 size_t bufsize; 2708 zap_cursor_t zc; 2709 zap_attribute_t zap; 2710 uint_t bytes_wanted; 2711 uint64_t offset; /* must be unsigned; checks for < 1 */ 2712 uint64_t parent; 2713 int local_eof; 2714 int outcount; 2715 int error; 2716 uint8_t prefetch; 2717 boolean_t check_sysattrs; 2718 uint8_t type; 2719 int ncooks = 0; 2720 off_t *cooks = NULL; 2721 int flags = 0; 2722 #ifdef __FreeBSD__ 2723 boolean_t user = uio->uio_segflg != UIO_SYSSPACE; 2724 #endif 2725 #ifdef __NetBSD__ 2726 boolean_t user = !VMSPACE_IS_KERNEL_P(uio->uio_vmspace); 2727 #endif 2728 2729 ZFS_ENTER(zfsvfs); 2730 ZFS_VERIFY_ZP(zp); 2731 2732 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 2733 &parent, sizeof (parent))) != 0) { 2734 ZFS_EXIT(zfsvfs); 2735 return (error); 2736 } 2737 2738 /* 2739 * If we are not given an eof variable, 2740 * use a local one. 2741 */ 2742 if (eofp == NULL) 2743 eofp = &local_eof; 2744 2745 /* 2746 * Check for valid iov_len. 2747 */ 2748 if (uio->uio_iov->iov_len <= 0) { 2749 ZFS_EXIT(zfsvfs); 2750 return (SET_ERROR(EINVAL)); 2751 } 2752 2753 /* 2754 * Quit if directory has been removed (posix) 2755 */ 2756 if ((*eofp = zp->z_unlinked) != 0) { 2757 ZFS_EXIT(zfsvfs); 2758 return (0); 2759 } 2760 2761 error = 0; 2762 os = zfsvfs->z_os; 2763 offset = uio->uio_loffset; 2764 prefetch = zp->z_zn_prefetch; 2765 2766 /* 2767 * Initialize the iterator cursor. 2768 */ 2769 if (offset <= 3) { 2770 /* 2771 * Start iteration from the beginning of the directory. 2772 */ 2773 zap_cursor_init(&zc, os, zp->z_id); 2774 } else { 2775 /* 2776 * The offset is a serialized cursor. 
2777 */ 2778 zap_cursor_init_serialized(&zc, os, zp->z_id, offset); 2779 } 2780 2781 /* 2782 * Get space to change directory entries into fs independent format. 2783 */ 2784 iovp = uio->uio_iov; 2785 bytes_wanted = iovp->iov_len; 2786 if (user || uio->uio_iovcnt != 1) { 2787 bufsize = bytes_wanted; 2788 outbuf = kmem_alloc(bufsize, KM_SLEEP); 2789 odp = (struct dirent64 *)outbuf; 2790 } else { 2791 bufsize = bytes_wanted; 2792 outbuf = NULL; 2793 odp = (struct dirent64 *)iovp->iov_base; 2794 } 2795 eodp = (struct edirent *)odp; 2796 2797 if (ncookies != NULL) { 2798 /* 2799 * Minimum entry size is dirent size and 1 byte for a file name. 2800 */ 2801 #ifdef __FreeBSD__ 2802 ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); 2803 cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); 2804 #endif 2805 #ifdef __NetBSD__ 2806 ncooks = uio->uio_resid / _DIRENT_MINSIZE(odp); 2807 cooks = malloc(ncooks * sizeof(off_t), M_TEMP, M_WAITOK); 2808 #endif 2809 *cookies = cooks; 2810 *ncookies = ncooks; 2811 } 2812 2813 /* 2814 * If this VFS supports the system attribute view interface; and 2815 * we're looking at an extended attribute directory; and we care 2816 * about normalization conflicts on this vfs; then we must check 2817 * for normalization conflicts with the sysattr name space. 2818 */ 2819 #ifdef TODO 2820 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 2821 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && 2822 (flags & V_RDDIR_ENTFLAGS); 2823 #else 2824 check_sysattrs = 0; 2825 #endif 2826 2827 /* 2828 * Transform to file-system independent format 2829 */ 2830 outcount = 0; 2831 while (outcount < bytes_wanted) { 2832 ino64_t objnum; 2833 ushort_t reclen; 2834 off64_t *next = NULL; 2835 2836 /* 2837 * Special case `.', `..', and `.zfs'. 
2838 */ 2839 if (offset == 0) { 2840 (void) strcpy(zap.za_name, "."); 2841 zap.za_normalization_conflict = 0; 2842 objnum = zp->z_id; 2843 type = DT_DIR; 2844 } else if (offset == 1) { 2845 (void) strcpy(zap.za_name, ".."); 2846 zap.za_normalization_conflict = 0; 2847 objnum = parent; 2848 type = DT_DIR; 2849 } else if (offset == 2 && zfs_show_ctldir(zp)) { 2850 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); 2851 zap.za_normalization_conflict = 0; 2852 objnum = ZFSCTL_INO_ROOT; 2853 type = DT_DIR; 2854 } else { 2855 /* 2856 * Grab next entry. 2857 */ 2858 if (error = zap_cursor_retrieve(&zc, &zap)) { 2859 if ((*eofp = (error == ENOENT)) != 0) 2860 break; 2861 else 2862 goto update; 2863 } 2864 2865 if (zap.za_integer_length != 8 || 2866 zap.za_num_integers != 1) { 2867 cmn_err(CE_WARN, "zap_readdir: bad directory " 2868 "entry, obj = %lld, offset = %lld\n", 2869 (u_longlong_t)zp->z_id, 2870 (u_longlong_t)offset); 2871 error = SET_ERROR(ENXIO); 2872 goto update; 2873 } 2874 2875 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); 2876 /* 2877 * MacOS X can extract the object type here such as: 2878 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2879 */ 2880 type = ZFS_DIRENT_TYPE(zap.za_first_integer); 2881 2882 if (check_sysattrs && !zap.za_normalization_conflict) { 2883 #ifdef TODO 2884 zap.za_normalization_conflict = 2885 xattr_sysattr_casechk(zap.za_name); 2886 #else 2887 panic("%s:%u: TODO", __func__, __LINE__); 2888 #endif 2889 } 2890 } 2891 2892 if (flags & V_RDDIR_ACCFILTER) { 2893 /* 2894 * If we have no access at all, don't include 2895 * this entry in the returned information 2896 */ 2897 znode_t *ezp; 2898 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) 2899 goto skip_entry; 2900 if (!zfs_has_access(ezp, cr)) { 2901 vrele(ZTOV(ezp)); 2902 goto skip_entry; 2903 } 2904 vrele(ZTOV(ezp)); 2905 } 2906 2907 if (flags & V_RDDIR_ENTFLAGS) 2908 reclen = EDIRENT_RECLEN(strlen(zap.za_name)); 2909 else 2910 reclen = DIRENT64_RECLEN(strlen(zap.za_name)); 2911 2912 
/* 2913 * Will this entry fit in the buffer? 2914 */ 2915 if (outcount + reclen > bufsize) { 2916 /* 2917 * Did we manage to fit anything in the buffer? 2918 */ 2919 if (!outcount) { 2920 error = SET_ERROR(EINVAL); 2921 goto update; 2922 } 2923 break; 2924 } 2925 if (flags & V_RDDIR_ENTFLAGS) { 2926 /* 2927 * Add extended flag entry: 2928 */ 2929 eodp->ed_ino = objnum; 2930 eodp->ed_reclen = reclen; 2931 /* NOTE: ed_off is the offset for the *next* entry */ 2932 next = &(eodp->ed_off); 2933 eodp->ed_eflags = zap.za_normalization_conflict ? 2934 ED_CASE_CONFLICT : 0; 2935 (void) strncpy(eodp->ed_name, zap.za_name, 2936 EDIRENT_NAMELEN(reclen)); 2937 eodp = (edirent_t *)((intptr_t)eodp + reclen); 2938 } else { 2939 /* 2940 * Add normal entry: 2941 */ 2942 odp->d_ino = objnum; 2943 odp->d_reclen = reclen; 2944 odp->d_namlen = strlen(zap.za_name); 2945 (void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); 2946 odp->d_type = type; 2947 odp = (dirent64_t *)((intptr_t)odp + reclen); 2948 } 2949 outcount += reclen; 2950 2951 ASSERT(outcount <= bufsize); 2952 2953 /* Prefetch znode */ 2954 if (prefetch) 2955 dmu_prefetch(os, objnum, 0, 0, 0, 2956 ZIO_PRIORITY_SYNC_READ); 2957 2958 skip_entry: 2959 /* 2960 * Move to the next entry, fill in the previous offset. 
2961 */ 2962 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { 2963 zap_cursor_advance(&zc); 2964 offset = zap_cursor_serialize(&zc); 2965 } else { 2966 offset += 1; 2967 } 2968 2969 if (cooks != NULL) { 2970 *cooks++ = offset; 2971 ncooks--; 2972 #ifdef __FreeBSD__ 2973 KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); 2974 #endif 2975 #ifdef __NetBSD__ 2976 KASSERTMSG(ncooks >= 0, "ncooks=%d", ncooks); 2977 #endif 2978 } 2979 } 2980 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ 2981 2982 /* Subtract unused cookies */ 2983 if (ncookies != NULL) 2984 *ncookies -= ncooks; 2985 2986 if (!user && uio->uio_iovcnt == 1) { 2987 iovp->iov_base += outcount; 2988 iovp->iov_len -= outcount; 2989 uio->uio_resid -= outcount; 2990 } else if (error = uiomove(outbuf, (size_t)outcount, UIO_READ, uio)) { 2991 /* 2992 * Reset the pointer. 2993 */ 2994 offset = uio->uio_loffset; 2995 } 2996 2997 update: 2998 zap_cursor_fini(&zc); 2999 if (user || uio->uio_iovcnt != 1) 3000 kmem_free(outbuf, bufsize); 3001 3002 if (error == ENOENT) 3003 error = 0; 3004 3005 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 3006 3007 uio->uio_loffset = offset; 3008 ZFS_EXIT(zfsvfs); 3009 if (error != 0 && cookies != NULL) { 3010 #ifdef __FreeBSD__ 3011 free(*cookies, M_TEMP); 3012 #endif 3013 #ifdef __NetBSD__ 3014 kmem_free(*cookies, ncooks * sizeof(off_t)); 3015 #endif 3016 *cookies = NULL; 3017 *ncookies = 0; 3018 } 3019 return (error); 3020 } 3021 3022 static int 3023 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 3024 { 3025 znode_t *zp = VTOZ(vp); 3026 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3027 3028 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 3029 ZFS_ENTER(zfsvfs); 3030 ZFS_VERIFY_ZP(zp); 3031 3032 #ifdef __NetBSD__ 3033 if (!zp->z_unlinked) 3034 #endif 3035 zil_commit(zfsvfs->z_log, zp->z_id); 3036 ZFS_EXIT(zfsvfs); 3037 } 3038 return (0); 3039 } 3040 3041 3042 /* 3043 * Get the requested file attributes and place them in the provided 3044 * vattr 
structure. 3045 * 3046 * IN: vp - vnode of file. 3047 * vap - va_mask identifies requested attributes. 3048 * If AT_XVATTR set, then optional attrs are requested 3049 * flags - ATTR_NOACLCHECK (CIFS server context) 3050 * cr - credentials of caller. 3051 * ct - caller context 3052 * 3053 * OUT: vap - attribute values. 3054 * 3055 * RETURN: 0 (always succeeds). 3056 */ 3057 /* ARGSUSED */ 3058 static int 3059 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 3060 caller_context_t *ct) 3061 { 3062 znode_t *zp = VTOZ(vp); 3063 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3064 int error = 0; 3065 uint32_t blksize; 3066 u_longlong_t nblocks; 3067 uint64_t links; 3068 uint64_t mtime[2], ctime[2], crtime[2], rdev; 3069 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 3070 xoptattr_t *xoap = NULL; 3071 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 3072 sa_bulk_attr_t bulk[4]; 3073 int count = 0; 3074 3075 ZFS_ENTER(zfsvfs); 3076 ZFS_VERIFY_ZP(zp); 3077 3078 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); 3079 3080 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 3081 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 3082 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); 3083 if (vp->v_type == VBLK || vp->v_type == VCHR) 3084 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, 3085 &rdev, 8); 3086 3087 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { 3088 ZFS_EXIT(zfsvfs); 3089 return (error); 3090 } 3091 3092 /* 3093 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. 3094 * Also, if we are the owner don't bother, since owner should 3095 * always be allowed to read basic attributes of file. 
3096 */ 3097 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && 3098 (vap->va_uid != crgetuid(cr))) { 3099 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, 3100 skipaclchk, cr)) { 3101 ZFS_EXIT(zfsvfs); 3102 return (error); 3103 } 3104 } 3105 3106 /* 3107 * Return all attributes. It's cheaper to provide the answer 3108 * than to determine whether we were asked the question. 3109 */ 3110 3111 vap->va_type = IFTOVT(zp->z_mode); 3112 vap->va_mode = zp->z_mode & ~S_IFMT; 3113 #ifdef illumos 3114 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; 3115 #endif 3116 #ifdef __FreeBSD__ 3117 vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; 3118 vap->va_nodeid = zp->z_id; 3119 #endif 3120 #ifdef __NetBSD__ 3121 /* 3122 * note: f_fsid is a signed long. 3123 * we don't want sign extension here. 3124 */ 3125 vap->va_fsid = (uint32_t)vp->v_mount->mnt_stat.f_fsid; 3126 vap->va_nodeid = zp->z_id; 3127 /* 3128 * If we are a snapshot mounted under .zfs, return 3129 * the object id of the snapshot to make getcwd happy. 3130 */ 3131 if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { 3132 vnode_t *cvp = vp->v_mount->mnt_vnodecovered; 3133 3134 if (cvp && zfsctl_is_node(cvp)) 3135 vap->va_nodeid = dmu_objset_id(zfsvfs->z_os); 3136 } 3137 #endif 3138 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) 3139 links = zp->z_links + 1; 3140 else 3141 links = zp->z_links; 3142 /* XXX NetBSD: use LINK_MAX when that value matches 32-bit nlink_t */ 3143 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ 3144 vap->va_size = zp->z_size; 3145 #ifdef illumos 3146 vap->va_rdev = vp->v_rdev; 3147 #else 3148 if (vp->v_type == VBLK || vp->v_type == VCHR) 3149 vap->va_rdev = zfs_cmpldev(rdev); 3150 #endif 3151 vap->va_seq = zp->z_seq; 3152 vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ 3153 vap->va_filerev = zp->z_seq; 3154 3155 /* 3156 * Add in any requested optional attributes and the create time. 3157 * Also set the corresponding bits in the returned attribute bitmap. 
3158 */ 3159 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { 3160 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 3161 xoap->xoa_archive = 3162 ((zp->z_pflags & ZFS_ARCHIVE) != 0); 3163 XVA_SET_RTN(xvap, XAT_ARCHIVE); 3164 } 3165 3166 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 3167 xoap->xoa_readonly = 3168 ((zp->z_pflags & ZFS_READONLY) != 0); 3169 XVA_SET_RTN(xvap, XAT_READONLY); 3170 } 3171 3172 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 3173 xoap->xoa_system = 3174 ((zp->z_pflags & ZFS_SYSTEM) != 0); 3175 XVA_SET_RTN(xvap, XAT_SYSTEM); 3176 } 3177 3178 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 3179 xoap->xoa_hidden = 3180 ((zp->z_pflags & ZFS_HIDDEN) != 0); 3181 XVA_SET_RTN(xvap, XAT_HIDDEN); 3182 } 3183 3184 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 3185 xoap->xoa_nounlink = 3186 ((zp->z_pflags & ZFS_NOUNLINK) != 0); 3187 XVA_SET_RTN(xvap, XAT_NOUNLINK); 3188 } 3189 3190 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 3191 xoap->xoa_immutable = 3192 ((zp->z_pflags & ZFS_IMMUTABLE) != 0); 3193 XVA_SET_RTN(xvap, XAT_IMMUTABLE); 3194 } 3195 3196 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 3197 xoap->xoa_appendonly = 3198 ((zp->z_pflags & ZFS_APPENDONLY) != 0); 3199 XVA_SET_RTN(xvap, XAT_APPENDONLY); 3200 } 3201 3202 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 3203 xoap->xoa_nodump = 3204 ((zp->z_pflags & ZFS_NODUMP) != 0); 3205 XVA_SET_RTN(xvap, XAT_NODUMP); 3206 } 3207 3208 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 3209 xoap->xoa_opaque = 3210 ((zp->z_pflags & ZFS_OPAQUE) != 0); 3211 XVA_SET_RTN(xvap, XAT_OPAQUE); 3212 } 3213 3214 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 3215 xoap->xoa_av_quarantined = 3216 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); 3217 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 3218 } 3219 3220 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 3221 xoap->xoa_av_modified = 3222 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); 3223 XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 3224 } 3225 3226 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && 3227 vp->v_type == VREG) { 3228 
zfs_sa_get_scanstamp(zp, xvap); 3229 } 3230 3231 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 3232 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); 3233 XVA_SET_RTN(xvap, XAT_REPARSE); 3234 } 3235 if (XVA_ISSET_REQ(xvap, XAT_GEN)) { 3236 xoap->xoa_generation = zp->z_gen; 3237 XVA_SET_RTN(xvap, XAT_GEN); 3238 } 3239 3240 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { 3241 xoap->xoa_offline = 3242 ((zp->z_pflags & ZFS_OFFLINE) != 0); 3243 XVA_SET_RTN(xvap, XAT_OFFLINE); 3244 } 3245 3246 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { 3247 xoap->xoa_sparse = 3248 ((zp->z_pflags & ZFS_SPARSE) != 0); 3249 XVA_SET_RTN(xvap, XAT_SPARSE); 3250 } 3251 } 3252 3253 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); 3254 ZFS_TIME_DECODE(&vap->va_mtime, mtime); 3255 ZFS_TIME_DECODE(&vap->va_ctime, ctime); 3256 ZFS_TIME_DECODE(&vap->va_birthtime, crtime); 3257 3258 3259 sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); 3260 vap->va_blksize = blksize; 3261 vap->va_bytes = nblocks << 9; /* nblocks * 512 */ 3262 3263 if (zp->z_blksz == 0) { 3264 /* 3265 * Block size hasn't been set; suggest maximal I/O transfers. 3266 */ 3267 vap->va_blksize = zfsvfs->z_max_blksz; 3268 } 3269 3270 ZFS_EXIT(zfsvfs); 3271 return (0); 3272 } 3273 3274 /* 3275 * Set the file attributes to the values contained in the 3276 * vattr structure. 3277 * 3278 * IN: vp - vnode of file to be modified. 3279 * vap - new attribute values. 3280 * If AT_XVATTR set, then optional attrs are being set 3281 * flags - ATTR_UTIME set if non-default time values provided. 3282 * - ATTR_NOACLCHECK (CIFS context only). 3283 * cr - credentials of caller. 3284 * ct - caller context 3285 * 3286 * RETURN: 0 on success, error code on failure. 3287 * 3288 * Timestamps: 3289 * vp - ctime updated, mtime updated if size changed. 
3290 */ 3291 /* ARGSUSED */ 3292 static int 3293 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, 3294 caller_context_t *ct) 3295 { 3296 znode_t *zp = VTOZ(vp); 3297 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 3298 zilog_t *zilog; 3299 dmu_tx_t *tx; 3300 vattr_t oldva; 3301 xvattr_t tmpxvattr; 3302 uint_t mask = vap->va_mask; 3303 uint_t saved_mask = 0; 3304 uint64_t saved_mode; 3305 int trim_mask = 0; 3306 uint64_t new_mode; 3307 uint64_t new_uid, new_gid; 3308 uint64_t xattr_obj; 3309 uint64_t mtime[2], ctime[2]; 3310 znode_t *attrzp; 3311 int need_policy = FALSE; 3312 int err, err2; 3313 zfs_fuid_info_t *fuidp = NULL; 3314 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ 3315 xoptattr_t *xoap; 3316 zfs_acl_t *aclp; 3317 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 3318 boolean_t fuid_dirtied = B_FALSE; 3319 sa_bulk_attr_t bulk[7], xattr_bulk[7]; 3320 int count = 0, xattr_count = 0; 3321 3322 if (mask == 0) 3323 return (0); 3324 3325 if (mask & AT_NOSET) 3326 return (SET_ERROR(EINVAL)); 3327 3328 ZFS_ENTER(zfsvfs); 3329 ZFS_VERIFY_ZP(zp); 3330 3331 zilog = zfsvfs->z_log; 3332 3333 /* 3334 * Make sure that if we have ephemeral uid/gid or xvattr specified 3335 * that file system is at proper version level 3336 */ 3337 3338 if (zfsvfs->z_use_fuids == B_FALSE && 3339 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || 3340 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || 3341 (mask & AT_XVATTR))) { 3342 ZFS_EXIT(zfsvfs); 3343 return (SET_ERROR(EINVAL)); 3344 } 3345 3346 if (mask & AT_SIZE && vp->v_type == VDIR) { 3347 ZFS_EXIT(zfsvfs); 3348 return (SET_ERROR(EISDIR)); 3349 } 3350 3351 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { 3352 ZFS_EXIT(zfsvfs); 3353 return (SET_ERROR(EINVAL)); 3354 } 3355 3356 /* 3357 * If this is an xvattr_t, then get a pointer to the structure of 3358 * optional attributes. If this is NULL, then we have a vattr_t. 
3359 */ 3360 xoap = xva_getxoptattr(xvap); 3361 3362 xva_init(&tmpxvattr); 3363 3364 /* 3365 * Immutable files can only alter immutable bit and atime 3366 */ 3367 if ((zp->z_pflags & ZFS_IMMUTABLE) && 3368 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || 3369 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { 3370 ZFS_EXIT(zfsvfs); 3371 return (SET_ERROR(EPERM)); 3372 } 3373 3374 if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { 3375 ZFS_EXIT(zfsvfs); 3376 return (SET_ERROR(EPERM)); 3377 } 3378 3379 /* 3380 * Verify timestamps doesn't overflow 32 bits. 3381 * ZFS can handle large timestamps, but 32bit syscalls can't 3382 * handle times greater than 2039. This check should be removed 3383 * once large timestamps are fully supported. 3384 */ 3385 if (mask & (AT_ATIME | AT_MTIME)) { 3386 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 3387 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 3388 ZFS_EXIT(zfsvfs); 3389 return (SET_ERROR(EOVERFLOW)); 3390 } 3391 } 3392 if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && 3393 TIMESPEC_OVERFLOW(&vap->va_birthtime)) { 3394 ZFS_EXIT(zfsvfs); 3395 return (SET_ERROR(EOVERFLOW)); 3396 } 3397 3398 attrzp = NULL; 3399 aclp = NULL; 3400 3401 /* Can this be moved to before the top label? */ 3402 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { 3403 ZFS_EXIT(zfsvfs); 3404 return (SET_ERROR(EROFS)); 3405 } 3406 3407 /* 3408 * First validate permissions 3409 */ 3410 3411 if (mask & AT_SIZE) { 3412 /* 3413 * XXX - Note, we are not providing any open 3414 * mode flags here (like FNDELAY), so we may 3415 * block if there are locks present... this 3416 * should be addressed in openat(). 3417 */ 3418 /* XXX - would it be OK to generate a log record here? 
*/ 3419 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); 3420 if (err) { 3421 ZFS_EXIT(zfsvfs); 3422 return (err); 3423 } 3424 } 3425 3426 if (mask & (AT_ATIME|AT_MTIME) || 3427 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || 3428 XVA_ISSET_REQ(xvap, XAT_READONLY) || 3429 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || 3430 XVA_ISSET_REQ(xvap, XAT_OFFLINE) || 3431 XVA_ISSET_REQ(xvap, XAT_SPARSE) || 3432 XVA_ISSET_REQ(xvap, XAT_CREATETIME) || 3433 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { 3434 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, 3435 skipaclchk, cr); 3436 } 3437 3438 if (mask & (AT_UID|AT_GID)) { 3439 int idmask = (mask & (AT_UID|AT_GID)); 3440 int take_owner; 3441 int take_group; 3442 3443 /* 3444 * NOTE: even if a new mode is being set, 3445 * we may clear S_ISUID/S_ISGID bits. 3446 */ 3447 3448 if (!(mask & AT_MODE)) 3449 vap->va_mode = zp->z_mode; 3450 3451 /* 3452 * Take ownership or chgrp to group we are a member of 3453 */ 3454 3455 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); 3456 take_group = (mask & AT_GID) && 3457 zfs_groupmember(zfsvfs, vap->va_gid, cr); 3458 3459 /* 3460 * If both AT_UID and AT_GID are set then take_owner and 3461 * take_group must both be set in order to allow taking 3462 * ownership. 
3463 * 3464 * Otherwise, send the check through secpolicy_vnode_setattr() 3465 * 3466 */ 3467 3468 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || 3469 ((idmask == AT_UID) && take_owner) || 3470 ((idmask == AT_GID) && take_group)) { 3471 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, 3472 skipaclchk, cr) == 0) { 3473 /* 3474 * Remove setuid/setgid for non-privileged users 3475 */ 3476 secpolicy_setid_clear(vap, vp, cr); 3477 trim_mask = (mask & (AT_UID|AT_GID)); 3478 } else { 3479 need_policy = TRUE; 3480 } 3481 } else { 3482 need_policy = TRUE; 3483 } 3484 } 3485 3486 oldva.va_mode = zp->z_mode; 3487 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); 3488 if (mask & AT_XVATTR) { 3489 /* 3490 * Update xvattr mask to include only those attributes 3491 * that are actually changing. 3492 * 3493 * the bits will be restored prior to actually setting 3494 * the attributes so the caller thinks they were set. 3495 */ 3496 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 3497 if (xoap->xoa_appendonly != 3498 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { 3499 need_policy = TRUE; 3500 } else { 3501 XVA_CLR_REQ(xvap, XAT_APPENDONLY); 3502 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); 3503 } 3504 } 3505 3506 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 3507 if (xoap->xoa_nounlink != 3508 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { 3509 need_policy = TRUE; 3510 } else { 3511 XVA_CLR_REQ(xvap, XAT_NOUNLINK); 3512 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); 3513 } 3514 } 3515 3516 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 3517 if (xoap->xoa_immutable != 3518 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { 3519 need_policy = TRUE; 3520 } else { 3521 XVA_CLR_REQ(xvap, XAT_IMMUTABLE); 3522 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); 3523 } 3524 } 3525 3526 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 3527 if (xoap->xoa_nodump != 3528 ((zp->z_pflags & ZFS_NODUMP) != 0)) { 3529 #if 0 3530 /* 3531 * XXXSB - zfs_netbsd_setattr() 3532 * has already checked if this 3533 * request is authorised, and our 3534 * 
secpolicy_xvattr() doesn't check 3535 * kauth chflags. Fix this when we 3536 * migrate to openzfs. 3537 */ 3538 need_policy = TRUE; 3539 #endif 3540 } else { 3541 XVA_CLR_REQ(xvap, XAT_NODUMP); 3542 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); 3543 } 3544 } 3545 3546 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 3547 if (xoap->xoa_av_modified != 3548 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { 3549 need_policy = TRUE; 3550 } else { 3551 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); 3552 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); 3553 } 3554 } 3555 3556 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 3557 if ((vp->v_type != VREG && 3558 xoap->xoa_av_quarantined) || 3559 xoap->xoa_av_quarantined != 3560 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { 3561 need_policy = TRUE; 3562 } else { 3563 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); 3564 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); 3565 } 3566 } 3567 3568 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { 3569 ZFS_EXIT(zfsvfs); 3570 return (SET_ERROR(EPERM)); 3571 } 3572 3573 if (need_policy == FALSE && 3574 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || 3575 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { 3576 need_policy = TRUE; 3577 } 3578 } 3579 3580 if (mask & AT_MODE) { 3581 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { 3582 err = secpolicy_setid_setsticky_clear(vp, vap, 3583 &oldva, cr); 3584 if (err) { 3585 ZFS_EXIT(zfsvfs); 3586 return (err); 3587 } 3588 trim_mask |= AT_MODE; 3589 } else { 3590 need_policy = TRUE; 3591 } 3592 } 3593 3594 if (need_policy) { 3595 /* 3596 * If trim_mask is set then take ownership 3597 * has been granted or write_acl is present and user 3598 * has the ability to modify mode. In that case remove 3599 * UID|GID and or MODE from mask so that 3600 * secpolicy_vnode_setattr() doesn't revoke it. 
3601 */ 3602 3603 if (trim_mask) { 3604 saved_mask = vap->va_mask; 3605 vap->va_mask &= ~trim_mask; 3606 if (trim_mask & AT_MODE) { 3607 /* 3608 * Save the mode, as secpolicy_vnode_setattr() 3609 * will overwrite it with ova.va_mode. 3610 */ 3611 saved_mode = vap->va_mode; 3612 } 3613 } 3614 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, 3615 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); 3616 if (err) { 3617 ZFS_EXIT(zfsvfs); 3618 return (err); 3619 } 3620 3621 if (trim_mask) { 3622 vap->va_mask |= saved_mask; 3623 if (trim_mask & AT_MODE) { 3624 /* 3625 * Recover the mode after 3626 * secpolicy_vnode_setattr(). 3627 */ 3628 vap->va_mode = saved_mode; 3629 } 3630 } 3631 } 3632 3633 /* 3634 * secpolicy_vnode_setattr, or take ownership may have 3635 * changed va_mask 3636 */ 3637 mask = vap->va_mask; 3638 3639 if ((mask & (AT_UID | AT_GID))) { 3640 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), 3641 &xattr_obj, sizeof (xattr_obj)); 3642 3643 if (err == 0 && xattr_obj) { 3644 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); 3645 if (err == 0) { 3646 err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); 3647 if (err != 0) 3648 vrele(ZTOV(attrzp)); 3649 } 3650 if (err) 3651 goto out2; 3652 } 3653 if (mask & AT_UID) { 3654 new_uid = zfs_fuid_create(zfsvfs, 3655 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); 3656 if (new_uid != zp->z_uid && 3657 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { 3658 if (attrzp) 3659 vput(ZTOV(attrzp)); 3660 err = SET_ERROR(EDQUOT); 3661 goto out2; 3662 } 3663 } 3664 3665 if (mask & AT_GID) { 3666 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, 3667 cr, ZFS_GROUP, &fuidp); 3668 if (new_gid != zp->z_gid && 3669 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { 3670 if (attrzp) 3671 vput(ZTOV(attrzp)); 3672 err = SET_ERROR(EDQUOT); 3673 goto out2; 3674 } 3675 } 3676 } 3677 tx = dmu_tx_create(zfsvfs->z_os); 3678 3679 if (mask & AT_MODE) { 3680 uint64_t pmode = zp->z_mode; 3681 uint64_t acl_obj; 3682 new_mode = (pmode & 
S_IFMT) | (vap->va_mode & ~S_IFMT); 3683 3684 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && 3685 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { 3686 err = SET_ERROR(EPERM); 3687 goto out; 3688 } 3689 3690 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) 3691 goto out; 3692 3693 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { 3694 /* 3695 * Are we upgrading ACL from old V0 format 3696 * to V1 format? 3697 */ 3698 if (zfsvfs->z_version >= ZPL_VERSION_FUID && 3699 zfs_znode_acl_version(zp) == 3700 ZFS_ACL_VERSION_INITIAL) { 3701 dmu_tx_hold_free(tx, acl_obj, 0, 3702 DMU_OBJECT_END); 3703 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3704 0, aclp->z_acl_bytes); 3705 } else { 3706 dmu_tx_hold_write(tx, acl_obj, 0, 3707 aclp->z_acl_bytes); 3708 } 3709 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { 3710 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 3711 0, aclp->z_acl_bytes); 3712 } 3713 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3714 } else { 3715 if ((mask & AT_XVATTR) && 3716 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3717 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); 3718 else 3719 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 3720 } 3721 3722 if (attrzp) { 3723 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); 3724 } 3725 3726 fuid_dirtied = zfsvfs->z_fuid_dirty; 3727 if (fuid_dirtied) 3728 zfs_fuid_txhold(zfsvfs, tx); 3729 3730 zfs_sa_upgrade_txholds(tx, zp); 3731 3732 err = dmu_tx_assign(tx, TXG_WAIT); 3733 if (err) 3734 goto out; 3735 3736 count = 0; 3737 /* 3738 * Set each attribute requested. 3739 * We group settings according to the locks they need to acquire. 3740 * 3741 * Note: you cannot set ctime directly, although it will be 3742 * updated as a side-effect of calling this function. 
3743 */ 3744 3745 if (mask & (AT_UID|AT_GID|AT_MODE)) 3746 mutex_enter(&zp->z_acl_lock); 3747 3748 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 3749 &zp->z_pflags, sizeof (zp->z_pflags)); 3750 3751 if (attrzp) { 3752 if (mask & (AT_UID|AT_GID|AT_MODE)) 3753 mutex_enter(&attrzp->z_acl_lock); 3754 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3755 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, 3756 sizeof (attrzp->z_pflags)); 3757 } 3758 3759 if (mask & (AT_UID|AT_GID)) { 3760 3761 if (mask & AT_UID) { 3762 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, 3763 &new_uid, sizeof (new_uid)); 3764 zp->z_uid = new_uid; 3765 if (attrzp) { 3766 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3767 SA_ZPL_UID(zfsvfs), NULL, &new_uid, 3768 sizeof (new_uid)); 3769 attrzp->z_uid = new_uid; 3770 } 3771 } 3772 3773 if (mask & AT_GID) { 3774 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), 3775 NULL, &new_gid, sizeof (new_gid)); 3776 zp->z_gid = new_gid; 3777 if (attrzp) { 3778 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3779 SA_ZPL_GID(zfsvfs), NULL, &new_gid, 3780 sizeof (new_gid)); 3781 attrzp->z_gid = new_gid; 3782 } 3783 } 3784 if (!(mask & AT_MODE)) { 3785 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), 3786 NULL, &new_mode, sizeof (new_mode)); 3787 new_mode = zp->z_mode; 3788 } 3789 err = zfs_acl_chown_setattr(zp); 3790 ASSERT(err == 0); 3791 if (attrzp) { 3792 err = zfs_acl_chown_setattr(attrzp); 3793 ASSERT(err == 0); 3794 } 3795 } 3796 3797 if (mask & AT_MODE) { 3798 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, 3799 &new_mode, sizeof (new_mode)); 3800 zp->z_mode = new_mode; 3801 ASSERT3U((uintptr_t)aclp, !=, 0); 3802 err = zfs_aclset_common(zp, aclp, cr, tx); 3803 ASSERT0(err); 3804 if (zp->z_acl_cached) 3805 zfs_acl_free(zp->z_acl_cached); 3806 zp->z_acl_cached = aclp; 3807 aclp = NULL; 3808 } 3809 3810 3811 if (mask & AT_ATIME) { 3812 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); 3813 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, 3814 
&zp->z_atime, sizeof (zp->z_atime)); 3815 } 3816 3817 if (mask & AT_MTIME) { 3818 ZFS_TIME_ENCODE(&vap->va_mtime, mtime); 3819 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 3820 mtime, sizeof (mtime)); 3821 } 3822 3823 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ 3824 if (mask & AT_SIZE && !(mask & AT_MTIME)) { 3825 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), 3826 NULL, mtime, sizeof (mtime)); 3827 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3828 &ctime, sizeof (ctime)); 3829 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 3830 B_TRUE); 3831 } else if (mask != 0) { 3832 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 3833 &ctime, sizeof (ctime)); 3834 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, 3835 B_TRUE); 3836 if (attrzp) { 3837 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, 3838 SA_ZPL_CTIME(zfsvfs), NULL, 3839 &ctime, sizeof (ctime)); 3840 zfs_tstamp_update_setup(attrzp, STATE_CHANGED, 3841 mtime, ctime, B_TRUE); 3842 } 3843 } 3844 /* 3845 * Do this after setting timestamps to prevent timestamp 3846 * update from toggling bit 3847 */ 3848 3849 if (xoap && (mask & AT_XVATTR)) { 3850 3851 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) 3852 xoap->xoa_createtime = vap->va_birthtime; 3853 /* 3854 * restore trimmed off masks 3855 * so that return masks can be set for caller. 
3856 */ 3857 3858 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { 3859 XVA_SET_REQ(xvap, XAT_APPENDONLY); 3860 } 3861 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { 3862 XVA_SET_REQ(xvap, XAT_NOUNLINK); 3863 } 3864 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { 3865 XVA_SET_REQ(xvap, XAT_IMMUTABLE); 3866 } 3867 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { 3868 XVA_SET_REQ(xvap, XAT_NODUMP); 3869 } 3870 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { 3871 XVA_SET_REQ(xvap, XAT_AV_MODIFIED); 3872 } 3873 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { 3874 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); 3875 } 3876 3877 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) 3878 ASSERT(vp->v_type == VREG); 3879 3880 zfs_xvattr_set(zp, xvap, tx); 3881 } 3882 3883 if (fuid_dirtied) 3884 zfs_fuid_sync(zfsvfs, tx); 3885 3886 if (mask != 0) 3887 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); 3888 3889 if (mask & (AT_UID|AT_GID|AT_MODE)) 3890 mutex_exit(&zp->z_acl_lock); 3891 3892 if (attrzp) { 3893 if (mask & (AT_UID|AT_GID|AT_MODE)) 3894 mutex_exit(&attrzp->z_acl_lock); 3895 } 3896 out: 3897 if (err == 0 && attrzp) { 3898 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, 3899 xattr_count, tx); 3900 ASSERT(err2 == 0); 3901 } 3902 3903 if (attrzp) 3904 vput(ZTOV(attrzp)); 3905 3906 if (aclp) 3907 zfs_acl_free(aclp); 3908 3909 if (fuidp) { 3910 zfs_fuid_info_free(fuidp); 3911 fuidp = NULL; 3912 } 3913 3914 if (err) { 3915 dmu_tx_abort(tx); 3916 } else { 3917 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 3918 dmu_tx_commit(tx); 3919 } 3920 3921 out2: 3922 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 3923 zil_commit(zilog, 0); 3924 3925 ZFS_EXIT(zfsvfs); 3926 return (err); 3927 } 3928 3929 /* 3930 * We acquire all but fdvp locks using non-blocking acquisitions. If we 3931 * fail to acquire any lock in the path we will drop all held locks, 3932 * acquire the new lock in a blocking fashion, and then release it and 3933 * restart the rename. 
This acquire/release step ensures that we do not 3934 * spin on a lock waiting for release. On error release all vnode locks 3935 * and decrement references the way tmpfs_rename() would do. 3936 */ 3937 static int 3938 zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, 3939 struct vnode *tdvp, struct vnode **tvpp, 3940 const struct componentname *scnp, const struct componentname *tcnp) 3941 { 3942 zfsvfs_t *zfsvfs; 3943 struct vnode *nvp, *svp, *tvp; 3944 znode_t *sdzp, *tdzp, *szp, *tzp; 3945 #ifdef __FreeBSD__ 3946 const char *snm = scnp->cn_nameptr; 3947 const char *tnm = tcnp->cn_nameptr; 3948 #endif 3949 #ifdef __NetBSD__ 3950 char *snm, *tnm; 3951 #endif 3952 int error; 3953 3954 #ifdef __FreeBSD__ 3955 VOP_UNLOCK(tdvp, 0); 3956 if (*tvpp != NULL && *tvpp != tdvp) 3957 VOP_UNLOCK(*tvpp, 0); 3958 #endif 3959 3960 relock: 3961 error = vn_lock(sdvp, LK_EXCLUSIVE); 3962 if (error) 3963 goto out; 3964 sdzp = VTOZ(sdvp); 3965 3966 #ifdef __NetBSD__ 3967 if (tdvp == sdvp) { 3968 } else { 3969 #endif 3970 error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); 3971 if (error != 0) { 3972 VOP_UNLOCK(sdvp, 0); 3973 if (error != EBUSY) 3974 goto out; 3975 error = vn_lock(tdvp, LK_EXCLUSIVE); 3976 if (error) 3977 goto out; 3978 VOP_UNLOCK(tdvp, 0); 3979 goto relock; 3980 } 3981 #ifdef __NetBSD__ 3982 } /* end if (tdvp == sdvp) */ 3983 #endif 3984 3985 tdzp = VTOZ(tdvp); 3986 3987 /* 3988 * Before using sdzp and tdzp we must ensure that they are live. 3989 * As a porting legacy from illumos we have two things to worry 3990 * about. One is typical for FreeBSD and it is that the vnode is 3991 * not reclaimed (doomed). The other is that the znode is live. 3992 * The current code can invalidate the znode without acquiring the 3993 * corresponding vnode lock if the object represented by the znode 3994 * and vnode is no longer valid after a rollback or receive operation. 
3995 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock 3996 * that protects the znodes from the invalidation. 3997 */ 3998 zfsvfs = sdzp->z_zfsvfs; 3999 ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); 4000 ZFS_ENTER(zfsvfs); 4001 4002 /* 4003 * We can not use ZFS_VERIFY_ZP() here because it could directly return 4004 * bypassing the cleanup code in the case of an error. 4005 */ 4006 if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { 4007 ZFS_EXIT(zfsvfs); 4008 VOP_UNLOCK(sdvp, 0); 4009 #ifdef __NetBSD__ 4010 if (tdvp != sdvp) 4011 #endif 4012 VOP_UNLOCK(tdvp, 0); 4013 error = SET_ERROR(EIO); 4014 goto out; 4015 } 4016 4017 /* 4018 * Re-resolve svp to be certain it still exists and fetch the 4019 * correct vnode. 4020 */ 4021 #ifdef __NetBSD__ 4022 /* ZFS wants a null-terminated name. */ 4023 snm = PNBUF_GET(); 4024 strlcpy(snm, scnp->cn_nameptr, scnp->cn_namelen + 1); 4025 #endif 4026 error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); 4027 #ifdef __NetBSD__ 4028 PNBUF_PUT(snm); 4029 #endif 4030 if (error != 0) { 4031 /* Source entry invalid or not there. */ 4032 ZFS_EXIT(zfsvfs); 4033 VOP_UNLOCK(sdvp, 0); 4034 #ifdef __NetBSD__ 4035 if (tdvp != sdvp) 4036 #endif 4037 VOP_UNLOCK(tdvp, 0); 4038 if ((scnp->cn_flags & ISDOTDOT) != 0 || 4039 (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) 4040 error = SET_ERROR(EINVAL); 4041 goto out; 4042 } 4043 svp = ZTOV(szp); 4044 4045 /* 4046 * Re-resolve tvp, if it disappeared we just carry on. 4047 */ 4048 #ifdef __NetBSD__ 4049 /* ZFS wants a null-terminated name. 
*/ 4050 tnm = PNBUF_GET(); 4051 strlcpy(tnm, tcnp->cn_nameptr, tcnp->cn_namelen + 1); 4052 #endif 4053 error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); 4054 #ifdef __NetBSD__ 4055 PNBUF_PUT(tnm); 4056 #endif 4057 if (error != 0) { 4058 ZFS_EXIT(zfsvfs); 4059 VOP_UNLOCK(sdvp, 0); 4060 #ifdef __NetBSD__ 4061 if (tdvp != sdvp) 4062 #endif 4063 VOP_UNLOCK(tdvp, 0); 4064 vrele(svp); 4065 if ((tcnp->cn_flags & ISDOTDOT) != 0) 4066 error = SET_ERROR(EINVAL); 4067 goto out; 4068 } 4069 if (tzp != NULL) 4070 tvp = ZTOV(tzp); 4071 else 4072 tvp = NULL; 4073 4074 /* 4075 * At present the vnode locks must be acquired before z_teardown_lock, 4076 * although it would be more logical to use the opposite order. 4077 */ 4078 ZFS_EXIT(zfsvfs); 4079 4080 /* 4081 * Now try acquire locks on svp and tvp. 4082 */ 4083 nvp = svp; 4084 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); 4085 if (error != 0) { 4086 VOP_UNLOCK(sdvp, 0); 4087 #ifdef __NetBSD__ 4088 if (tdvp != sdvp) 4089 #endif 4090 VOP_UNLOCK(tdvp, 0); 4091 if (tvp != NULL) 4092 vrele(tvp); 4093 if (error != EBUSY) { 4094 vrele(nvp); 4095 goto out; 4096 } 4097 error = vn_lock(nvp, LK_EXCLUSIVE); 4098 if (error != 0) { 4099 vrele(nvp); 4100 goto out; 4101 } 4102 VOP_UNLOCK(nvp, 0); 4103 /* 4104 * Concurrent rename race. 4105 * XXX ? 
4106 */ 4107 if (nvp == tdvp) { 4108 vrele(nvp); 4109 error = SET_ERROR(EINVAL); 4110 goto out; 4111 } 4112 #ifdef __NetBSD__ 4113 if (*svpp != NULL) 4114 #endif 4115 vrele(*svpp); 4116 *svpp = nvp; 4117 goto relock; 4118 } 4119 #ifdef __NetBSD__ 4120 if (*svpp != NULL) 4121 #endif 4122 vrele(*svpp); 4123 *svpp = nvp; 4124 4125 if (*tvpp != NULL) 4126 vrele(*tvpp); 4127 *tvpp = NULL; 4128 if (tvp != NULL) { 4129 nvp = tvp; 4130 4131 #ifdef __NetBSD__ 4132 if (tvp == svp || tvp == sdvp) { 4133 } else { 4134 #endif 4135 error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); 4136 if (error != 0) { 4137 VOP_UNLOCK(sdvp, 0); 4138 #ifdef __NetBSD__ 4139 if (tdvp != sdvp) 4140 #endif 4141 VOP_UNLOCK(tdvp, 0); 4142 #ifdef __NetBSD__ 4143 if (*svpp != tdvp) 4144 #endif 4145 VOP_UNLOCK(*svpp, 0); 4146 if (error != EBUSY) { 4147 vrele(nvp); 4148 goto out; 4149 } 4150 error = vn_lock(nvp, LK_EXCLUSIVE); 4151 if (error != 0) { 4152 vrele(nvp); 4153 goto out; 4154 } 4155 vput(nvp); 4156 goto relock; 4157 } 4158 #ifdef __NetBSD__ 4159 } /* end if (tvp == svp || tvp == sdvp) */ 4160 #endif 4161 4162 *tvpp = nvp; 4163 } 4164 4165 KASSERT(VOP_ISLOCKED(sdvp) == LK_EXCLUSIVE); 4166 KASSERT(VOP_ISLOCKED(*svpp) == LK_EXCLUSIVE); 4167 KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); 4168 KASSERT(*tvpp == NULL || VOP_ISLOCKED(*tvpp) == LK_EXCLUSIVE); 4169 4170 return (0); 4171 4172 out: 4173 return (error); 4174 } 4175 4176 /* 4177 * Note that we must use VRELE_ASYNC in this function as it walks 4178 * up the directory tree and vrele may need to acquire an exclusive 4179 * lock if a last reference to a vnode is dropped. 
4180 */ 4181 static int 4182 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) 4183 { 4184 zfsvfs_t *zfsvfs; 4185 znode_t *zp, *zp1; 4186 uint64_t parent; 4187 int error; 4188 4189 zfsvfs = tdzp->z_zfsvfs; 4190 if (tdzp == szp) 4191 return (SET_ERROR(EINVAL)); 4192 if (tdzp == sdzp) 4193 return (0); 4194 if (tdzp->z_id == zfsvfs->z_root) 4195 return (0); 4196 zp = tdzp; 4197 for (;;) { 4198 ASSERT(!zp->z_unlinked); 4199 if ((error = sa_lookup(zp->z_sa_hdl, 4200 SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) 4201 break; 4202 4203 if (parent == szp->z_id) { 4204 error = SET_ERROR(EINVAL); 4205 break; 4206 } 4207 if (parent == zfsvfs->z_root) 4208 break; 4209 if (parent == sdzp->z_id) 4210 break; 4211 4212 error = zfs_zget(zfsvfs, parent, &zp1); 4213 if (error != 0) 4214 break; 4215 4216 if (zp != tdzp) 4217 VN_RELE_ASYNC(ZTOV(zp), 4218 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); 4219 zp = zp1; 4220 } 4221 4222 if (error == ENOTDIR) 4223 panic("checkpath: .. not a directory\n"); 4224 if (zp != tdzp) 4225 VN_RELE_ASYNC(ZTOV(zp), 4226 dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); 4227 return (error); 4228 } 4229 4230 /* 4231 * Move an entry from the provided source directory to the target 4232 * directory. Change the entry name as indicated. 4233 * 4234 * IN: sdvp - Source directory containing the "old entry". 4235 * snm - Old entry name. 4236 * tdvp - Target directory to contain the "new entry". 4237 * tnm - New entry name. 4238 * cr - credentials of caller. 4239 * ct - caller context 4240 * flags - case flags 4241 * 4242 * RETURN: 0 on success, error code on failure. 
4243 * 4244 * Timestamps: 4245 * sdvp,tdvp - ctime|mtime updated 4246 */ 4247 /*ARGSUSED*/ 4248 static int 4249 zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, 4250 vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, 4251 cred_t *cr) 4252 { 4253 zfsvfs_t *zfsvfs; 4254 znode_t *sdzp, *tdzp, *szp, *tzp; 4255 zilog_t *zilog = NULL; 4256 dmu_tx_t *tx; 4257 #ifdef __FreeBSD__ 4258 char *snm = __UNCONST(scnp->cn_nameptr); 4259 char *tnm = __UNCONST(tcnp->cn_nameptr); 4260 #endif 4261 #ifdef __NetBSD__ 4262 char *snm, *tnm; 4263 #endif 4264 int error = 0; 4265 4266 /* Reject renames across filesystems. */ 4267 if (((*svpp) != NULL && (*svpp)->v_mount != tdvp->v_mount) || 4268 ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { 4269 error = SET_ERROR(EXDEV); 4270 goto out; 4271 } 4272 4273 if (zfsctl_is_node(tdvp)) { 4274 error = SET_ERROR(EXDEV); 4275 goto out; 4276 } 4277 4278 /* 4279 * Lock all four vnodes to ensure safety and semantics of renaming. 4280 */ 4281 error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); 4282 if (error != 0) { 4283 /* no vnodes are locked in the case of error here */ 4284 return (error); 4285 } 4286 4287 tdzp = VTOZ(tdvp); 4288 sdzp = VTOZ(sdvp); 4289 zfsvfs = tdzp->z_zfsvfs; 4290 zilog = zfsvfs->z_log; 4291 #ifdef __NetBSD__ 4292 /* ZFS wants a null-terminated name. */ 4293 snm = PNBUF_GET(); 4294 strlcpy(snm, scnp->cn_nameptr, scnp->cn_namelen + 1); 4295 tnm = PNBUF_GET(); 4296 strlcpy(tnm, tcnp->cn_nameptr, tcnp->cn_namelen + 1); 4297 #endif 4298 4299 /* 4300 * After we re-enter ZFS_ENTER() we will have to revalidate all 4301 * znodes involved. 4302 */ 4303 ZFS_ENTER(zfsvfs); 4304 4305 if (zfsvfs->z_utf8 && u8_validate(tnm, 4306 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4307 error = SET_ERROR(EILSEQ); 4308 goto unlockout; 4309 } 4310 4311 #ifndef __NetBSD__ 4312 /* If source and target are the same file, there is nothing to do. 
*/ 4313 if ((*svpp) == (*tvpp)) { 4314 error = 0; 4315 goto unlockout; 4316 } 4317 #endif 4318 4319 if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || 4320 ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && 4321 (*tvpp)->v_mountedhere != NULL)) { 4322 error = SET_ERROR(EXDEV); 4323 goto unlockout; 4324 } 4325 4326 /* 4327 * We can not use ZFS_VERIFY_ZP() here because it could directly return 4328 * bypassing the cleanup code in the case of an error. 4329 */ 4330 if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { 4331 error = SET_ERROR(EIO); 4332 goto unlockout; 4333 } 4334 4335 szp = VTOZ(*svpp); 4336 tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); 4337 if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { 4338 error = SET_ERROR(EIO); 4339 goto unlockout; 4340 } 4341 4342 /* 4343 * This is to prevent the creation of links into attribute space 4344 * by renaming a linked file into/outof an attribute directory. 4345 * See the comment in zfs_link() for why this is considered bad. 4346 */ 4347 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { 4348 error = SET_ERROR(EINVAL); 4349 goto unlockout; 4350 } 4351 4352 /* 4353 * Must have write access at the source to remove the old entry 4354 * and write access at the target to create the new entry. 4355 * Note that if target and source are the same, this can be 4356 * done in a single check. 4357 */ 4358 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) 4359 goto unlockout; 4360 4361 if ((*svpp)->v_type == VDIR) { 4362 /* 4363 * Avoid ".", "..", and aliases of "." for obvious reasons. 4364 */ 4365 if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || 4366 sdzp == szp || 4367 (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { 4368 error = SET_ERROR(EINVAL); 4369 goto unlockout; 4370 } 4371 4372 /* 4373 * Check to make sure rename is valid. 
4374 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d 4375 */ 4376 if (error = zfs_rename_check(szp, sdzp, tdzp)) 4377 goto unlockout; 4378 } 4379 4380 /* 4381 * Does target exist? 4382 */ 4383 if (tzp) { 4384 /* 4385 * Source and target must be the same type. 4386 */ 4387 if ((*svpp)->v_type == VDIR) { 4388 if ((*tvpp)->v_type != VDIR) { 4389 error = SET_ERROR(ENOTDIR); 4390 goto unlockout; 4391 } else { 4392 cache_purge(tdvp); 4393 if (sdvp != tdvp) 4394 cache_purge(sdvp); 4395 } 4396 } else { 4397 if ((*tvpp)->v_type == VDIR) { 4398 error = SET_ERROR(EISDIR); 4399 goto unlockout; 4400 } 4401 } 4402 4403 /* 4404 * POSIX dictates that when the source and target 4405 * entries refer to the same file object, rename 4406 * must do nothing and exit without error. 4407 */ 4408 #ifndef __NetBSD__ 4409 /* 4410 * But on NetBSD we have a different system call to do 4411 * this, posix_rename, which sorta kinda handles this 4412 * case (modulo races), and our tests expect BSD 4413 * semantics for rename, so we'll do that until we can 4414 * push the choice between BSD and POSIX semantics into 4415 * the VOP_RENAME protocol as a flag. 4416 */ 4417 if (szp->z_id == tzp->z_id) { 4418 error = 0; 4419 goto unlockout; 4420 } 4421 #endif 4422 } 4423 4424 vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); 4425 if (tzp) 4426 vnevent_rename_dest(*tvpp, tdvp, tnm, ct); 4427 4428 /* 4429 * notify the target directory if it is not the same 4430 * as source directory. 
4431 */ 4432 if (tdvp != sdvp) { 4433 vnevent_rename_dest_dir(tdvp, ct); 4434 } 4435 4436 tx = dmu_tx_create(zfsvfs->z_os); 4437 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4438 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); 4439 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); 4440 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); 4441 if (sdzp != tdzp) { 4442 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); 4443 zfs_sa_upgrade_txholds(tx, tdzp); 4444 } 4445 if (tzp) { 4446 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); 4447 zfs_sa_upgrade_txholds(tx, tzp); 4448 } 4449 4450 zfs_sa_upgrade_txholds(tx, szp); 4451 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); 4452 error = dmu_tx_assign(tx, TXG_WAIT); 4453 if (error) { 4454 dmu_tx_abort(tx); 4455 goto unlockout; 4456 } 4457 4458 4459 if (tzp && (tzp->z_id != szp->z_id)) 4460 /* Attempt to remove the existing target */ 4461 error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); 4462 4463 if (error == 0) { 4464 if (!tzp || (tzp->z_id != szp->z_id)) 4465 error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); 4466 if (error == 0) { 4467 szp->z_pflags |= ZFS_AV_MODIFIED; 4468 4469 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), 4470 (void *)&szp->z_pflags, sizeof (uint64_t), tx); 4471 ASSERT0(error); 4472 4473 error = zfs_link_destroy(sdzp, snm, szp, tx, 4474 /* Kludge for BSD rename semantics. */ 4475 tzp && tzp->z_id == szp->z_id ? 0: ZRENAMING, NULL); 4476 if (error == 0) { 4477 zfs_log_rename(zilog, tx, TX_RENAME, sdzp, 4478 snm, tdzp, tnm, szp); 4479 4480 /* 4481 * Update path information for the target vnode 4482 */ 4483 vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); 4484 } else { 4485 /* 4486 * At this point, we have successfully created 4487 * the target name, but have failed to remove 4488 * the source name. Since the create was done 4489 * with the ZRENAMING flag, there are 4490 * complications; for one, the link count is 4491 * wrong. 
The easiest way to deal with this 4492 * is to remove the newly created target, and 4493 * return the original error. This must 4494 * succeed; fortunately, it is very unlikely to 4495 * fail, since we just created it. 4496 */ 4497 VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, 4498 ZRENAMING, NULL), ==, 0); 4499 } 4500 } 4501 if (error == 0) { 4502 cache_purge(*svpp); 4503 if (*tvpp != NULL) 4504 cache_purge(*tvpp); 4505 cache_purge_negative(tdvp); 4506 #ifdef __NetBSD__ 4507 if (*svpp == *tvpp) { 4508 VN_KNOTE(sdvp, NOTE_WRITE); 4509 VN_KNOTE(*svpp, (szp->z_links == 0 ? 4510 NOTE_DELETE : NOTE_LINK)); 4511 } else { 4512 genfs_rename_knote(sdvp, *svpp, tdvp, *tvpp, 4513 tzp != NULL ? tzp->z_links : 0); 4514 } 4515 #endif 4516 } 4517 } 4518 4519 dmu_tx_commit(tx); 4520 4521 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4522 zil_commit(zilog, 0); 4523 4524 unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ 4525 ZFS_EXIT(zfsvfs); 4526 4527 VOP_UNLOCK(*svpp, 0); 4528 VOP_UNLOCK(sdvp, 0); 4529 #ifdef __NetBSD__ 4530 PNBUF_PUT(snm); 4531 PNBUF_PUT(tnm); 4532 #endif 4533 4534 if (*tvpp != sdvp && *tvpp != *svpp) 4535 if (*tvpp != NULL) 4536 VOP_UNLOCK(*tvpp, 0); 4537 if (tdvp != sdvp && tdvp != *svpp) 4538 if (tdvp != *tvpp) 4539 VOP_UNLOCK(tdvp, 0); 4540 4541 out: 4542 return (error); 4543 } 4544 4545 /* 4546 * Insert the indicated symbolic reference entry into the directory. 4547 * 4548 * IN: dvp - Directory to contain new symbolic link. 4549 * link - Name for new symlink entry. 4550 * vap - Attributes of new entry. 4551 * cr - credentials of caller. 4552 * ct - caller context 4553 * flags - case flags 4554 * 4555 * RETURN: 0 on success, error code on failure. 
4556 * 4557 * Timestamps: 4558 * dvp - ctime|mtime updated 4559 */ 4560 /*ARGSUSED*/ 4561 static int 4562 zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, 4563 cred_t *cr, kthread_t *td) 4564 { 4565 znode_t *zp, *dzp = VTOZ(dvp); 4566 dmu_tx_t *tx; 4567 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4568 zilog_t *zilog; 4569 uint64_t len = strlen(link); 4570 int error; 4571 zfs_acl_ids_t acl_ids; 4572 boolean_t fuid_dirtied; 4573 uint64_t txtype = TX_SYMLINK; 4574 int flags = 0; 4575 4576 ASSERT(vap->va_type == VLNK); 4577 4578 ZFS_ENTER(zfsvfs); 4579 ZFS_VERIFY_ZP(dzp); 4580 zilog = zfsvfs->z_log; 4581 4582 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), 4583 NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4584 ZFS_EXIT(zfsvfs); 4585 return (SET_ERROR(EILSEQ)); 4586 } 4587 4588 if (len > MAXPATHLEN) { 4589 ZFS_EXIT(zfsvfs); 4590 return (SET_ERROR(ENAMETOOLONG)); 4591 } 4592 4593 if ((error = zfs_acl_ids_create(dzp, 0, 4594 vap, cr, NULL, &acl_ids)) != 0) { 4595 ZFS_EXIT(zfsvfs); 4596 return (error); 4597 } 4598 4599 /* 4600 * Attempt to lock directory; fail if entry already exists. 
4601 */ 4602 error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); 4603 if (error) { 4604 zfs_acl_ids_free(&acl_ids); 4605 ZFS_EXIT(zfsvfs); 4606 return (error); 4607 } 4608 4609 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4610 zfs_acl_ids_free(&acl_ids); 4611 ZFS_EXIT(zfsvfs); 4612 return (error); 4613 } 4614 4615 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { 4616 zfs_acl_ids_free(&acl_ids); 4617 ZFS_EXIT(zfsvfs); 4618 return (SET_ERROR(EDQUOT)); 4619 } 4620 4621 getnewvnode_reserve(1); 4622 tx = dmu_tx_create(zfsvfs->z_os); 4623 fuid_dirtied = zfsvfs->z_fuid_dirty; 4624 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); 4625 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4626 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + 4627 ZFS_SA_BASE_ATTR_SIZE + len); 4628 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); 4629 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { 4630 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 4631 acl_ids.z_aclp->z_acl_bytes); 4632 } 4633 if (fuid_dirtied) 4634 zfs_fuid_txhold(zfsvfs, tx); 4635 error = dmu_tx_assign(tx, TXG_WAIT); 4636 if (error) { 4637 zfs_acl_ids_free(&acl_ids); 4638 dmu_tx_abort(tx); 4639 getnewvnode_drop_reserve(); 4640 ZFS_EXIT(zfsvfs); 4641 return (error); 4642 } 4643 4644 /* 4645 * Create a new object for the symlink. 4646 * for version 4 ZPL datsets the symlink will be an SA attribute 4647 */ 4648 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); 4649 4650 if (fuid_dirtied) 4651 zfs_fuid_sync(zfsvfs, tx); 4652 4653 if (zp->z_is_sa) 4654 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), 4655 link, len, tx); 4656 else 4657 zfs_sa_symlink(zp, link, len, tx); 4658 4659 zp->z_size = len; 4660 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 4661 &zp->z_size, sizeof (zp->z_size), tx); 4662 /* 4663 * Insert the new object into the directory. 
4664 */ 4665 (void) zfs_link_create(dzp, name, zp, tx, ZNEW); 4666 4667 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); 4668 *vpp = ZTOV(zp); 4669 4670 zfs_acl_ids_free(&acl_ids); 4671 4672 dmu_tx_commit(tx); 4673 4674 getnewvnode_drop_reserve(); 4675 4676 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4677 zil_commit(zilog, 0); 4678 4679 ZFS_EXIT(zfsvfs); 4680 return (error); 4681 } 4682 4683 /* 4684 * Return, in the buffer contained in the provided uio structure, 4685 * the symbolic path referred to by vp. 4686 * 4687 * IN: vp - vnode of symbolic link. 4688 * uio - structure to contain the link path. 4689 * cr - credentials of caller. 4690 * ct - caller context 4691 * 4692 * OUT: uio - structure containing the link path. 4693 * 4694 * RETURN: 0 on success, error code on failure. 4695 * 4696 * Timestamps: 4697 * vp - atime updated 4698 */ 4699 /* ARGSUSED */ 4700 static int 4701 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) 4702 { 4703 znode_t *zp = VTOZ(vp); 4704 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4705 int error; 4706 4707 ZFS_ENTER(zfsvfs); 4708 ZFS_VERIFY_ZP(zp); 4709 4710 if (zp->z_is_sa) 4711 error = sa_lookup_uio(zp->z_sa_hdl, 4712 SA_ZPL_SYMLINK(zfsvfs), uio); 4713 else 4714 error = zfs_sa_readlink(zp, uio); 4715 4716 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 4717 4718 ZFS_EXIT(zfsvfs); 4719 return (error); 4720 } 4721 4722 /* 4723 * Insert a new entry into directory tdvp referencing svp. 4724 * 4725 * IN: tdvp - Directory to contain new entry. 4726 * svp - vnode of new entry. 4727 * name - name of new entry. 4728 * cr - credentials of caller. 4729 * ct - caller context 4730 * 4731 * RETURN: 0 on success, error code on failure. 
4732 * 4733 * Timestamps: 4734 * tdvp - ctime|mtime updated 4735 * svp - ctime updated 4736 */ 4737 /* ARGSUSED */ 4738 static int 4739 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, 4740 caller_context_t *ct, int flags) 4741 { 4742 znode_t *dzp = VTOZ(tdvp); 4743 znode_t *tzp, *szp; 4744 zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 4745 zilog_t *zilog; 4746 dmu_tx_t *tx; 4747 int error; 4748 uint64_t parent; 4749 uid_t owner; 4750 4751 ASSERT(tdvp->v_type == VDIR); 4752 4753 ZFS_ENTER(zfsvfs); 4754 ZFS_VERIFY_ZP(dzp); 4755 zilog = zfsvfs->z_log; 4756 4757 /* 4758 * POSIX dictates that we return EPERM here. 4759 * Better choices include ENOTSUP or EISDIR. 4760 */ 4761 if (svp->v_type == VDIR) { 4762 ZFS_EXIT(zfsvfs); 4763 return (SET_ERROR(EPERM)); 4764 } 4765 4766 szp = VTOZ(svp); 4767 ZFS_VERIFY_ZP(szp); 4768 4769 if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { 4770 ZFS_EXIT(zfsvfs); 4771 return (SET_ERROR(EPERM)); 4772 } 4773 4774 /* Prevent links to .zfs/shares files */ 4775 4776 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), 4777 &parent, sizeof (uint64_t))) != 0) { 4778 ZFS_EXIT(zfsvfs); 4779 return (error); 4780 } 4781 if (parent == zfsvfs->z_shares_dir) { 4782 ZFS_EXIT(zfsvfs); 4783 return (SET_ERROR(EPERM)); 4784 } 4785 4786 if (zfsvfs->z_utf8 && u8_validate(name, 4787 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { 4788 ZFS_EXIT(zfsvfs); 4789 return (SET_ERROR(EILSEQ)); 4790 } 4791 4792 /* 4793 * We do not support links between attributes and non-attributes 4794 * because of the potential security risk of creating links 4795 * into "normal" file space in order to circumvent restrictions 4796 * imposed in attribute space. 
4797 */ 4798 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { 4799 ZFS_EXIT(zfsvfs); 4800 return (SET_ERROR(EINVAL)); 4801 } 4802 4803 4804 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); 4805 if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { 4806 ZFS_EXIT(zfsvfs); 4807 return (SET_ERROR(EPERM)); 4808 } 4809 4810 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { 4811 ZFS_EXIT(zfsvfs); 4812 return (error); 4813 } 4814 4815 /* 4816 * Attempt to lock directory; fail if entry already exists. 4817 */ 4818 error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); 4819 if (error) { 4820 ZFS_EXIT(zfsvfs); 4821 return (error); 4822 } 4823 4824 tx = dmu_tx_create(zfsvfs->z_os); 4825 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); 4826 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); 4827 zfs_sa_upgrade_txholds(tx, szp); 4828 zfs_sa_upgrade_txholds(tx, dzp); 4829 error = dmu_tx_assign(tx, TXG_WAIT); 4830 if (error) { 4831 dmu_tx_abort(tx); 4832 ZFS_EXIT(zfsvfs); 4833 return (error); 4834 } 4835 4836 error = zfs_link_create(dzp, name, szp, tx, 0); 4837 4838 if (error == 0) { 4839 uint64_t txtype = TX_LINK; 4840 zfs_log_link(zilog, tx, txtype, dzp, szp, name); 4841 } 4842 4843 dmu_tx_commit(tx); 4844 4845 if (error == 0) { 4846 vnevent_link(svp, ct); 4847 } 4848 4849 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 4850 zil_commit(zilog, 0); 4851 4852 ZFS_EXIT(zfsvfs); 4853 return (error); 4854 } 4855 4856 4857 #if !defined(__NetBSD__) 4858 /*ARGSUSED*/ 4859 void 4860 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4861 { 4862 znode_t *zp = VTOZ(vp); 4863 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4864 int error; 4865 4866 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 4867 if (zp->z_sa_hdl == NULL) { 4868 /* 4869 * The fs has been unmounted, or we did a 4870 * suspend/resume and this file no longer exists. 
4871 */ 4872 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4873 vrecycle(vp); 4874 return; 4875 } 4876 4877 if (zp->z_unlinked) { 4878 /* 4879 * Fast path to recycle a vnode of a removed file. 4880 */ 4881 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4882 vrecycle(vp); 4883 return; 4884 } 4885 4886 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 4887 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 4888 4889 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 4890 zfs_sa_upgrade_txholds(tx, zp); 4891 error = dmu_tx_assign(tx, TXG_WAIT); 4892 if (error) { 4893 dmu_tx_abort(tx); 4894 } else { 4895 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 4896 (void *)&zp->z_atime, sizeof (zp->z_atime), tx); 4897 zp->z_atime_dirty = 0; 4898 dmu_tx_commit(tx); 4899 } 4900 } 4901 rw_exit(&zfsvfs->z_teardown_inactive_lock); 4902 } 4903 #endif /* !defined(__NetBSD__) */ 4904 4905 4906 #ifdef __FreeBSD__ 4907 CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); 4908 CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); 4909 #endif 4910 4911 /*ARGSUSED*/ 4912 static int 4913 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 4914 { 4915 znode_t *zp = VTOZ(vp); 4916 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 4917 uint32_t gen; 4918 uint64_t gen64; 4919 uint64_t object = zp->z_id; 4920 zfid_short_t *zfid; 4921 int size, i, error; 4922 4923 ZFS_ENTER(zfsvfs); 4924 ZFS_VERIFY_ZP(zp); 4925 4926 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), 4927 &gen64, sizeof (uint64_t))) != 0) { 4928 ZFS_EXIT(zfsvfs); 4929 return (error); 4930 } 4931 4932 gen = (uint32_t)gen64; 4933 4934 size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; 4935 4936 #ifdef illumos 4937 if (fidp->fid_len < size) { 4938 fidp->fid_len = size; 4939 ZFS_EXIT(zfsvfs); 4940 return (SET_ERROR(ENOSPC)); 4941 } 4942 #else 4943 fidp->fid_len = size; 4944 #endif 4945 4946 zfid = (zfid_short_t *)fidp; 4947 4948 zfid->zf_len = size; 4949 4950 for (i = 0; i < sizeof (zfid->zf_object); i++) 4951 zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); 4952 4953 /* Must have a non-zero generation number to distinguish from .zfs */ 4954 if (gen == 0) 4955 gen = 1; 4956 for (i = 0; i < sizeof (zfid->zf_gen); i++) 4957 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); 4958 4959 if (size == LONG_FID_LEN) { 4960 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); 4961 zfid_long_t *zlfid; 4962 4963 zlfid = (zfid_long_t *)fidp; 4964 4965 for (i = 0; i < sizeof (zlfid->zf_setid); i++) 4966 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); 4967 4968 /* XXX - this should be the generation number for the objset */ 4969 for (i = 0; i < sizeof (zlfid->zf_setgen); i++) 4970 zlfid->zf_setgen[i] = 0; 4971 } 4972 4973 ZFS_EXIT(zfsvfs); 4974 return (0); 4975 } 4976 4977 static int 4978 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 4979 caller_context_t *ct) 4980 { 4981 znode_t *zp, *xzp; 4982 zfsvfs_t *zfsvfs; 4983 int error; 4984 4985 switch (cmd) { 4986 case _PC_LINK_MAX: 4987 *valp = INT_MAX; 4988 return (0); 4989 4990 case _PC_FILESIZEBITS: 4991 *valp = 64; 4992 return (0); 4993 #ifdef illumos 4994 case _PC_XATTR_EXISTS: 4995 zp = VTOZ(vp); 4996 zfsvfs = zp->z_zfsvfs; 4997 ZFS_ENTER(zfsvfs); 4998 ZFS_VERIFY_ZP(zp); 4999 *valp = 0; 5000 error = zfs_dirent_lookup(zp, "", &xzp, 5001 ZXATTR | ZEXISTS | ZSHARED); 5002 if (error == 0) { 5003 if (!zfs_dirempty(xzp)) 5004 *valp = 1; 5005 vrele(ZTOV(xzp)); 5006 } else if (error == ENOENT) { 5007 /* 5008 * If there aren't extended attributes, it's the 5009 * same as having zero of them. 
5010 */ 5011 error = 0; 5012 } 5013 ZFS_EXIT(zfsvfs); 5014 return (error); 5015 5016 case _PC_SATTR_ENABLED: 5017 case _PC_SATTR_EXISTS: 5018 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && 5019 (vp->v_type == VREG || vp->v_type == VDIR); 5020 return (0); 5021 5022 case _PC_ACCESS_FILTERING: 5023 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && 5024 vp->v_type == VDIR; 5025 return (0); 5026 5027 case _PC_ACL_ENABLED: 5028 *valp = _ACL_ACE_ENABLED; 5029 return (0); 5030 #endif /* illumos */ 5031 case _PC_MIN_HOLE_SIZE: 5032 *valp = (int)SPA_MINBLOCKSIZE; 5033 return (0); 5034 #ifdef illumos 5035 case _PC_TIMESTAMP_RESOLUTION: 5036 /* nanosecond timestamp resolution */ 5037 *valp = 1L; 5038 return (0); 5039 #endif 5040 case _PC_ACL_EXTENDED: 5041 *valp = 0; 5042 return (0); 5043 5044 #ifndef __NetBSD__ 5045 case _PC_ACL_NFS4: 5046 *valp = 1; 5047 return (0); 5048 5049 case _PC_ACL_PATH_MAX: 5050 *valp = ACL_MAX_ENTRIES; 5051 return (0); 5052 #endif 5053 5054 default: 5055 return (EOPNOTSUPP); 5056 } 5057 } 5058 5059 /*ARGSUSED*/ 5060 static int 5061 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 5062 caller_context_t *ct) 5063 { 5064 znode_t *zp = VTOZ(vp); 5065 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5066 int error; 5067 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 5068 5069 ZFS_ENTER(zfsvfs); 5070 ZFS_VERIFY_ZP(zp); 5071 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 5072 ZFS_EXIT(zfsvfs); 5073 5074 return (error); 5075 } 5076 5077 /*ARGSUSED*/ 5078 int 5079 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, 5080 caller_context_t *ct) 5081 { 5082 znode_t *zp = VTOZ(vp); 5083 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5084 int error; 5085 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; 5086 zilog_t *zilog = zfsvfs->z_log; 5087 5088 ZFS_ENTER(zfsvfs); 5089 ZFS_VERIFY_ZP(zp); 5090 5091 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 5092 5093 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 5094 zil_commit(zilog, 0); 5095 5096 ZFS_EXIT(zfsvfs); 5097 return (error); 5098 } 5099 5100 static int 5101 ioflags(int ioflags) 5102 { 5103 int flags = 0; 5104 5105 if (ioflags & IO_APPEND) 5106 flags |= FAPPEND; 5107 if (ioflags & IO_NDELAY) 5108 flags |= FNONBLOCK; 5109 if (ioflags & IO_SYNC) 5110 flags |= (FSYNC | FDSYNC | FRSYNC); 5111 5112 return (flags); 5113 } 5114 5115 #ifdef __NetBSD__ 5116 5117 static void zfs_netbsd_update_mctime(vnode_t *vp); 5118 5119 static int 5120 zfs_netbsd_open(void *v) 5121 { 5122 struct vop_open_args *ap = v; 5123 5124 return (zfs_open(&ap->a_vp, ap->a_mode, ap->a_cred, NULL)); 5125 } 5126 5127 static int 5128 zfs_netbsd_close(void *v) 5129 { 5130 struct vop_close_args *ap = v; 5131 5132 return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL)); 5133 } 5134 5135 static int 5136 zfs_netbsd_ioctl(void *v) 5137 { 5138 struct vop_ioctl_args *ap = v; 5139 5140 return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, 5141 ap->a_fflag, ap->a_cred, NULL, NULL)); 5142 } 5143 5144 5145 static int 5146 zfs_netbsd_read(void *v) 5147 { 5148 struct vop_read_args *ap = v; 5149 vnode_t *vp = ap->a_vp; 5150 znode_t *zp = VTOZ(vp); 5151 5152 switch (vp->v_type) { 5153 case VBLK: 5154 case VCHR: 5155 ZFS_ACCESSTIME_STAMP(zp->z_zfsvfs, zp); 5156 return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap)); 5157 case VFIFO: 5158 ZFS_ACCESSTIME_STAMP(zp->z_zfsvfs, zp); 5159 return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap)); 5160 case VREG: 5161 break; 5162 case VDIR: 5163 /* 5164 * Note: this is normal on NetBSD because it historically 5165 * allows read() on a directory. 
5166 * We simply reject it here though because it doesn't make 5167 * sense to allow read() unless we implement a conversion 5168 * to the historical version of the UFS dirent structure, 5169 * which i (yamt) don't think is worth the effort. 5170 */ 5171 return EISDIR; 5172 default: 5173 return EINVAL; 5174 } 5175 5176 return (zfs_read(vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL)); 5177 } 5178 5179 static int 5180 zfs_netbsd_write(void *v) 5181 { 5182 struct vop_write_args *ap = v; 5183 vnode_t *vp = ap->a_vp; 5184 znode_t *zp = VTOZ(vp); 5185 struct uio *uio = ap->a_uio; 5186 off_t osize = zp->z_size; 5187 int error, resid; 5188 5189 switch (vp->v_type) { 5190 case VBLK: 5191 case VCHR: 5192 zfs_netbsd_update_mctime(vp); 5193 return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap)); 5194 case VFIFO: 5195 zfs_netbsd_update_mctime(vp); 5196 return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap)); 5197 case VREG: 5198 break; 5199 case VDIR: 5200 /* 5201 * Note: this shouldn't happen as NetBSD's vn_openchk 5202 * rejects FWRITE on VDIR. 5203 */ 5204 return EIO; 5205 default: 5206 return EINVAL; 5207 } 5208 5209 resid = uio->uio_resid; 5210 error = zfs_write(vp, uio, ioflags(ap->a_ioflag), ap->a_cred, NULL); 5211 5212 return error; 5213 } 5214 5215 static int 5216 zfs_netbsd_access(void *v) 5217 { 5218 struct vop_access_args /* { 5219 struct vnode *a_vp; 5220 accmode_t a_accmode; 5221 kauth_cred_t a_cred; 5222 } */ *ap = v; 5223 vnode_t *vp = ap->a_vp; 5224 znode_t *zp = VTOZ(vp); 5225 accmode_t accmode; 5226 kauth_cred_t cred = ap->a_cred; 5227 int error = 0; 5228 5229 /* 5230 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, 5231 */ 5232 accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); 5233 if (accmode != 0) 5234 error = zfs_access(vp, accmode, 0, cred, NULL); 5235 5236 /* 5237 * VADMIN has to be handled by kauth_authorize_vnode(). 
/*
 * VOP_LOOKUP entry point.
 *
 * Looks up the component described by a_cnp in directory a_dvp and stores
 * the resulting vnode in *a_vpp (NULL on failure).  The directory must be
 * exclusively locked on entry and is still exclusively locked on return.
 *
 * Returns 0 on success, EJUSTRETURN when the last component is missing
 * but a CREATE/RENAME may proceed, or another error code.
 */
static int
zfs_netbsd_lookup(void *v)
{
	struct vop_lookup_v2_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap = v;
	struct vnode *dvp = ap->a_dvp;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	/* short_nm avoids a PNBUF allocation for names up to 30 bytes. */
	char *nm, short_nm[31];
	int error;

	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);

	*vpp = NULL;

	/*
	 * Do an access check before the cache lookup.  zfs_lookup does
	 * an access check too, but it's too scary to contemplate
	 * injecting our namecache stuff into zfs internals.
	 *
	 * XXX Is this the correct access check?
	 */
	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred)) != 0)
		goto out;

	/*
	 * Check the namecache before entering zfs_lookup.
	 * cache_lookup does the locking dance for us.
	 */
	if (zfsvfs->z_use_namecache) {
		if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
		    cnp->cn_nameiop, cnp->cn_flags, NULL, vpp)) {
			/* Cache hit: a NULL vnode is reported as ENOENT. */
			return *vpp == NULL ? ENOENT : 0;
		}
	}

	/*
	 * zfs_lookup wants a null-terminated component name, but namei
	 * gives us a pointer into the full pathname.
	 */
	ASSERT(cnp->cn_namelen < PATH_MAX - 1);
	if (cnp->cn_namelen + 1 > sizeof(short_nm))
		nm = PNBUF_GET();
	else
		nm = short_nm;
	/* The size argument limits the copy to this component only. */
	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);

	error = zfs_lookup(dvp, nm, vpp, 0, cnp, cnp->cn_nameiop, cnp->cn_cred);

	if (nm != short_nm)
		PNBUF_PUT(nm);

	/*
	 * Translate errors to match our namei insanity.  Also, if the
	 * caller wants to create an entry here, it's apparently our
	 * responsibility as lookup to make sure that's permissible.
	 * Go figure.
	 */
	if (cnp->cn_flags & ISLASTCN) {
		switch (cnp->cn_nameiop) {
		case CREATE:
		case RENAME:
			/*
			 * Missing target: EJUSTRETURN if the directory is
			 * writable, otherwise the access error.
			 */
			if (error == ENOENT) {
				error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred);
				if (error)
					break;
				error = EJUSTRETURN;
				break;
			}
			break;
		case DELETE:
			/*
			 * Existing target: deletion needs write access to
			 * the directory; drop the vnode if it is denied.
			 */
			if (error == 0) {
				error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred);
				if (error) {
					VN_RELE(*vpp);
					*vpp = NULL;
				}
			}
			break;
		}
	}

	if (error) {
		KASSERT(*vpp == NULL);
		goto out;
	}
	KASSERT(*vpp != NULL);

	/* Sanity-check "." and ".." results against the ISDOTDOT flag. */
	if ((cnp->cn_namelen == 1) && (cnp->cn_nameptr[0] == '.')) {
		KASSERT(!(cnp->cn_flags & ISDOTDOT));
		KASSERT(dvp == *vpp);
	} else if ((cnp->cn_namelen == 2) &&
	    (cnp->cn_nameptr[0] == '.') &&
	    (cnp->cn_nameptr[1] == '.')) {
		KASSERT(cnp->cn_flags & ISDOTDOT);
	} else {
		KASSERT(!(cnp->cn_flags & ISDOTDOT));
	}

out:
	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);

	/*
	 * Insert name into cache if appropriate.
	 */

	/*
	 * Successful lookups are cached, as are ENOENT results (with a NULL
	 * vnode) unless the operation is CREATE.
	 */
	if (zfsvfs->z_use_namecache) {
		if (error == 0 ||
		    (error == ENOENT && cnp->cn_nameiop != CREATE))
			cache_enter(dvp, *vpp, cnp->cn_nameptr,
			    cnp->cn_namelen, cnp->cn_flags);
	}

	return (error);
}
*/ 5453 error = zfs_create(dvp, nm, vap, !EXCL, mode, vpp, cnp->cn_cred, NULL); 5454 5455 PNBUF_PUT(nm); 5456 5457 KASSERT((error == 0) == (*vpp != NULL)); 5458 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); 5459 if (*vpp != NULL) 5460 VOP_UNLOCK(*vpp, 0); 5461 5462 return (error); 5463 } 5464 5465 static int 5466 zfs_netbsd_remove(void *v) 5467 { 5468 struct vop_remove_v3_args /* { 5469 struct vnode *a_dvp; 5470 struct vnode *a_vp; 5471 struct componentname *a_cnp; 5472 nlink_t ctx_vp_new_nlink; 5473 } */ *ap = v; 5474 struct vnode *dvp = ap->a_dvp; 5475 struct vnode *vp = ap->a_vp; 5476 struct componentname *cnp = ap->a_cnp; 5477 char *nm; 5478 int error; 5479 5480 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); 5481 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 5482 5483 /* ZFS wants a null-terminated name. */ 5484 nm = PNBUF_GET(); 5485 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1); 5486 5487 error = zfs_remove(dvp, vp, nm, cnp->cn_cred); 5488 5489 /* 5490 * XXX Should update ctx_vp_new_nlink, but for now the 5491 * XXX the kevent sent on "vp" matches historical behavior. 5492 */ 5493 5494 PNBUF_PUT(nm); 5495 vput(vp); 5496 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); 5497 return (error); 5498 } 5499 5500 static int 5501 zfs_netbsd_mkdir(void *v) 5502 { 5503 struct vop_mkdir_v3_args /* { 5504 struct vnode *a_dvp; 5505 struct vnode **a_vpp; 5506 struct componentname *a_cnp; 5507 struct vattr *a_vap; 5508 } */ *ap = v; 5509 struct vnode *dvp = ap->a_dvp; 5510 struct vnode **vpp = ap->a_vpp; 5511 struct componentname *cnp = ap->a_cnp; 5512 struct vattr *vap = ap->a_vap; 5513 char *nm; 5514 int error; 5515 5516 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); 5517 5518 vattr_init_mask(vap); 5519 5520 /* ZFS wants a null-terminated name. 
*/ 5521 nm = PNBUF_GET(); 5522 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1); 5523 5524 error = zfs_mkdir(dvp, nm, vap, vpp, cnp->cn_cred); 5525 5526 PNBUF_PUT(nm); 5527 5528 KASSERT((error == 0) == (*vpp != NULL)); 5529 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); 5530 if (*vpp != NULL) 5531 VOP_UNLOCK(*vpp, 0); 5532 5533 return (error); 5534 } 5535 5536 static int 5537 zfs_netbsd_rmdir(void *v) 5538 { 5539 struct vop_rmdir_v2_args /* { 5540 struct vnode *a_dvp; 5541 struct vnode *a_vp; 5542 struct componentname *a_cnp; 5543 } */ *ap = v; 5544 struct vnode *dvp = ap->a_dvp; 5545 struct vnode *vp = ap->a_vp; 5546 struct componentname *cnp = ap->a_cnp; 5547 char *nm; 5548 int error; 5549 5550 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); 5551 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE); 5552 5553 /* ZFS wants a null-terminated name. */ 5554 nm = PNBUF_GET(); 5555 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1); 5556 5557 error = zfs_rmdir(dvp, vp, nm, cnp->cn_cred); 5558 5559 PNBUF_PUT(nm); 5560 vput(vp); 5561 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); 5562 return error; 5563 } 5564 5565 static int 5566 zfs_netbsd_readdir(void *v) 5567 { 5568 struct vop_readdir_args *ap = v; 5569 5570 return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, 5571 ap->a_ncookies, ap->a_cookies)); 5572 } 5573 5574 static int 5575 zfs_netbsd_fsync(void *v) 5576 { 5577 struct vop_fsync_args *ap = v; 5578 struct vnode *vp = ap->a_vp; 5579 int flags = ap->a_flags; 5580 int error; 5581 5582 /* 5583 * Regardless of whether this is required for standards conformance, 5584 * this is the logical behavior when fsync() is called on a file with 5585 * dirty pages. We use async putpages since the ZIL transactions are 5586 * already going to be pushed out as part of the zil_commit(). 
5587 */ 5588 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); 5589 error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo), 5590 round_page(ap->a_offhi), PGO_CLEANIT); 5591 if (error != 0) { 5592 return error; 5593 } 5594 5595 /* 5596 * it isn't safe or necessary to call zil_commit when reclaiming 5597 * a vnode. 5598 * 5599 * - it can deadlock by attempting vcache_get on itself. 5600 * (zfs_get_data) 5601 * 5602 * - for the purpose of vnode reclaim, we only need to push the 5603 * data to the txg. no need to log the intent. 5604 * 5605 * no need to commit the zil for ioflush either. (FSYNC_LAZY) 5606 */ 5607 if ((flags & (FSYNC_RECLAIM|FSYNC_LAZY)) != 0) { 5608 return (0); 5609 } 5610 5611 return (zfs_fsync(vp, flags, ap->a_cred, NULL)); 5612 } 5613 5614 static int 5615 zfs_spec_fsync(void *v) 5616 { 5617 struct vop_fsync_args *ap = v; 5618 int error; 5619 5620 error = spec_fsync(v); 5621 if (error) 5622 return error; 5623 5624 return (zfs_fsync(ap->a_vp, ap->a_flags, ap->a_cred, NULL)); 5625 } 5626 5627 static int 5628 zfs_netbsd_getattr(void *v) 5629 { 5630 struct vop_getattr_args *ap = v; 5631 vattr_t *vap = ap->a_vap; 5632 xvattr_t xvap; 5633 u_long fflags = 0; 5634 int error; 5635 5636 xva_init(&xvap); 5637 xvap.xva_vattr = *vap; 5638 xvap.xva_vattr.va_mask |= AT_XVATTR; 5639 5640 /* Convert chflags into ZFS-type flags. */ 5641 /* XXX: what about SF_SETTABLE?. */ 5642 XVA_SET_REQ(&xvap, XAT_IMMUTABLE); 5643 XVA_SET_REQ(&xvap, XAT_APPENDONLY); 5644 XVA_SET_REQ(&xvap, XAT_NOUNLINK); 5645 XVA_SET_REQ(&xvap, XAT_NODUMP); 5646 error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL); 5647 if (error != 0) 5648 return (error); 5649 5650 /* Convert ZFS xattr into chflags. 
*/ 5651 #define FLAG_CHECK(fflag, xflag, xfield) do { \ 5652 if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ 5653 fflags |= (fflag); \ 5654 } while (0) 5655 FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, 5656 xvap.xva_xoptattrs.xoa_immutable); 5657 FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, 5658 xvap.xva_xoptattrs.xoa_appendonly); 5659 FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, 5660 xvap.xva_xoptattrs.xoa_nounlink); 5661 FLAG_CHECK(UF_NODUMP, XAT_NODUMP, 5662 xvap.xva_xoptattrs.xoa_nodump); 5663 #undef FLAG_CHECK 5664 *vap = xvap.xva_vattr; 5665 vap->va_flags = fflags; 5666 return (0); 5667 } 5668 5669 static int 5670 zfs_netbsd_setattr(void *v) 5671 { 5672 struct vop_setattr_args *ap = v; 5673 vnode_t *vp = ap->a_vp; 5674 vattr_t *vap = ap->a_vap; 5675 cred_t *cred = ap->a_cred; 5676 znode_t *zp = VTOZ(vp); 5677 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5678 xvattr_t xvap; 5679 kauth_action_t action; 5680 u_long fflags, sfflags = 0; 5681 uint64_t zflags; 5682 int error, flags = 0; 5683 bool changing_sysflags; 5684 5685 vattr_init_mask(vap); 5686 vap->va_mask &= ~AT_NOSET; 5687 if (ISSET(vap->va_vaflags, VA_UTIMES_NULL)) 5688 flags |= ATTR_UTIME; 5689 5690 xva_init(&xvap); 5691 xvap.xva_vattr = *vap; 5692 5693 zflags = VTOZ(vp)->z_pflags; 5694 5695 /* Ignore size changes on device nodes. */ 5696 if (vp->v_type == VBLK || vp->v_type == VCHR) 5697 xvap.xva_vattr.va_mask &= ~AT_SIZE; 5698 if (vap->va_flags != VNOVAL) { 5699 int error; 5700 5701 fflags = vap->va_flags; 5702 if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0) 5703 return (EOPNOTSUPP); 5704 5705 #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ 5706 if (((fflags & (fflag)) && !(zflags & (zflag))) || \ 5707 ((zflags & (zflag)) && !(fflags & (fflag)))) { \ 5708 XVA_SET_REQ(&xvap, (xflag)); \ 5709 (xfield) = ((fflags & (fflag)) != 0); \ 5710 if (((fflag) & SF_SETTABLE) != 0) \ 5711 sfflags |= (fflag); \ 5712 } \ 5713 } while (0) 5714 /* Convert chflags into ZFS-type flags. 
*/ 5715 /* XXX: what about SF_SETTABLE?. */ 5716 FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, 5717 xvap.xva_xoptattrs.xoa_immutable); 5718 FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, 5719 xvap.xva_xoptattrs.xoa_appendonly); 5720 FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, 5721 xvap.xva_xoptattrs.xoa_nounlink); 5722 FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, 5723 xvap.xva_xoptattrs.xoa_nodump); 5724 #undef FLAG_CHANGE 5725 5726 action = KAUTH_VNODE_WRITE_FLAGS; 5727 changing_sysflags = false; 5728 5729 if (zflags & (ZFS_IMMUTABLE|ZFS_APPENDONLY|ZFS_NOUNLINK)) { 5730 action |= KAUTH_VNODE_HAS_SYSFLAGS; 5731 } 5732 if (sfflags != 0) { 5733 action |= KAUTH_VNODE_WRITE_SYSFLAGS; 5734 changing_sysflags = true; 5735 } 5736 5737 error = kauth_authorize_vnode(cred, action, vp, NULL, 5738 genfs_can_chflags(vp, cred, zp->z_uid, changing_sysflags)); 5739 if (error) 5740 return error; 5741 } 5742 5743 if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || 5744 vap->va_birthtime.tv_sec != VNOVAL) { 5745 error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp, 5746 NULL, genfs_can_chtimes(vp, cred, zp->z_uid, 5747 vap->va_vaflags)); 5748 if (error) 5749 return error; 5750 } 5751 5752 error = zfs_setattr(vp, (vattr_t *)&xvap, flags, cred, NULL); 5753 if (error) 5754 return error; 5755 5756 if (zfsvfs->z_use_namecache) 5757 cache_enter_id(vp, zp->z_mode, zp->z_uid, zp->z_gid, true); 5758 5759 return error; 5760 } 5761 5762 static int 5763 zfs_netbsd_rename(void *v) 5764 { 5765 struct vop_rename_args /* { 5766 struct vnode *a_fdvp; 5767 struct vnode *a_fvp; 5768 struct componentname *a_fcnp; 5769 struct vnode *a_tdvp; 5770 struct vnode *a_tvp; 5771 struct componentname *a_tcnp; 5772 } */ *ap = v; 5773 vnode_t *fdvp = ap->a_fdvp; 5774 vnode_t *fvp = ap->a_fvp; 5775 struct componentname *fcnp = ap->a_fcnp; 5776 vnode_t *tdvp = ap->a_tdvp; 5777 vnode_t *tvp = ap->a_tvp; 5778 struct componentname *tcnp = ap->a_tcnp; 5779 
kauth_cred_t cred; 5780 int error; 5781 5782 KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); 5783 KASSERT(tvp == NULL || VOP_ISLOCKED(tvp) == LK_EXCLUSIVE); 5784 KASSERT(fdvp->v_type == VDIR); 5785 KASSERT(tdvp->v_type == VDIR); 5786 5787 cred = fcnp->cn_cred; 5788 5789 /* 5790 * XXX Want a better equality test. `tcnp->cn_cred == cred' 5791 * hoses p2k because puffs transmits the creds separately and 5792 * allocates distinct but equivalent structures for them. 5793 */ 5794 KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred)); 5795 5796 /* 5797 * Drop the insane locks. 5798 */ 5799 VOP_UNLOCK(tdvp, 0); 5800 if (tvp != NULL && tvp != tdvp) 5801 VOP_UNLOCK(tvp, 0); 5802 5803 /* 5804 * Release the source and target nodes; zfs_rename will look 5805 * them up again once the locking situation is sane. 5806 */ 5807 VN_RELE(fvp); 5808 if (tvp != NULL) 5809 VN_RELE(tvp); 5810 fvp = NULL; 5811 tvp = NULL; 5812 5813 /* 5814 * Do the rename ZFSly. 5815 */ 5816 error = zfs_rename(fdvp, &fvp, fcnp, tdvp, &tvp, tcnp, cred); 5817 5818 /* 5819 * Release the directories now too, because the VOP_RENAME 5820 * protocol is insane. 5821 */ 5822 5823 VN_RELE(fdvp); 5824 VN_RELE(tdvp); 5825 if (fvp != NULL) 5826 VN_RELE(fvp); 5827 if (tvp != NULL) 5828 VN_RELE(tvp); 5829 5830 return (error); 5831 } 5832 5833 static int 5834 zfs_netbsd_symlink(void *v) 5835 { 5836 struct vop_symlink_v3_args /* { 5837 struct vnode *a_dvp; 5838 struct vnode **a_vpp; 5839 struct componentname *a_cnp; 5840 struct vattr *a_vap; 5841 char *a_target; 5842 } */ *ap = v; 5843 struct vnode *dvp = ap->a_dvp; 5844 struct vnode **vpp = ap->a_vpp; 5845 struct componentname *cnp = ap->a_cnp; 5846 struct vattr *vap = ap->a_vap; 5847 char *target = ap->a_target; 5848 char *nm; 5849 int error; 5850 5851 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); 5852 5853 vap->va_type = VLNK; /* Netbsd: Syscall only sets va_mode. */ 5854 vattr_init_mask(vap); 5855 5856 /* ZFS wants a null-terminated name. 
*/ 5857 nm = PNBUF_GET(); 5858 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1); 5859 5860 error = zfs_symlink(dvp, vpp, nm, vap, target, cnp->cn_cred, 0); 5861 5862 PNBUF_PUT(nm); 5863 KASSERT((error == 0) == (*vpp != NULL)); 5864 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); 5865 if (*vpp != NULL) 5866 VOP_UNLOCK(*vpp, 0); 5867 5868 return (error); 5869 } 5870 5871 static int 5872 zfs_netbsd_readlink(void *v) 5873 { 5874 struct vop_readlink_args *ap = v; 5875 5876 return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); 5877 } 5878 5879 static int 5880 zfs_netbsd_link(void *v) 5881 { 5882 struct vop_link_v2_args /* { 5883 struct vnode *a_dvp; 5884 struct vnode *a_vp; 5885 struct componentname *a_cnp; 5886 } */ *ap = v; 5887 struct vnode *dvp = ap->a_dvp; 5888 struct vnode *vp = ap->a_vp; 5889 struct componentname *cnp = ap->a_cnp; 5890 char *nm; 5891 int error; 5892 5893 KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE); 5894 5895 /* ZFS wants a null-terminated name. */ 5896 nm = PNBUF_GET(); 5897 (void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1); 5898 5899 if ((error = vn_lock(vp, LK_EXCLUSIVE)) != 0) { 5900 /* XXX: No ABORTOP? */ 5901 PNBUF_PUT(nm); 5902 return error; 5903 } 5904 error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_ADD_LINK, vp, 5905 dvp, 0); 5906 if (error) 5907 goto out; 5908 error = zfs_link(dvp, vp, nm, cnp->cn_cred, 5909 NULL, 0); 5910 5911 out: 5912 PNBUF_PUT(nm); 5913 VOP_UNLOCK(vp, 0); 5914 return error; 5915 } 5916 5917 static int 5918 zfs_netbsd_inactive(void *v) 5919 { 5920 struct vop_inactive_v2_args *ap = v; 5921 vnode_t *vp = ap->a_vp; 5922 znode_t *zp = VTOZ(vp); 5923 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 5924 int error; 5925 5926 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 5927 if (zp->z_sa_hdl == NULL) { 5928 /* 5929 * The fs has been unmounted, or we did a 5930 * suspend/resume and this file no longer exists. 
5931 */ 5932 rw_exit(&zfsvfs->z_teardown_inactive_lock); 5933 *ap->a_recycle = true; 5934 return (0); 5935 } 5936 5937 if (zp->z_unlinked) { 5938 /* 5939 * Fast path to recycle a vnode of a removed file. 5940 */ 5941 rw_exit(&zfsvfs->z_teardown_inactive_lock); 5942 *ap->a_recycle = true; 5943 return (0); 5944 } 5945 5946 if (zp->z_atime_dirty && zp->z_unlinked == 0) { 5947 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 5948 5949 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 5950 zfs_sa_upgrade_txholds(tx, zp); 5951 error = dmu_tx_assign(tx, TXG_WAIT); 5952 if (error) { 5953 dmu_tx_abort(tx); 5954 } else { 5955 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), 5956 (void *)&zp->z_atime, sizeof (zp->z_atime), tx); 5957 zp->z_atime_dirty = 0; 5958 dmu_tx_commit(tx); 5959 } 5960 } 5961 rw_exit(&zfsvfs->z_teardown_inactive_lock); 5962 5963 *ap->a_recycle = false; 5964 return (0); 5965 } 5966 5967 static int 5968 zfs_netbsd_reclaim(void *v) 5969 { 5970 struct vop_reclaim_v2_args /* { 5971 struct vnode *a_vp; 5972 } */ *ap = v; 5973 struct vnode *vp = ap->a_vp; 5974 znode_t *zp; 5975 zfsvfs_t *zfsvfs; 5976 int error; 5977 5978 VOP_UNLOCK(vp, 0); 5979 zp = VTOZ(vp); 5980 zfsvfs = zp->z_zfsvfs; 5981 5982 KASSERTMSG(!vn_has_cached_data(vp), "vp %p", vp); 5983 5984 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); 5985 if (zp->z_sa_hdl == NULL) 5986 zfs_znode_free(zp); 5987 else 5988 zfs_zinactive(zp); 5989 rw_exit(&zfsvfs->z_teardown_inactive_lock); 5990 return 0; 5991 } 5992 5993 static int 5994 zfs_netbsd_fid(void *v) 5995 { 5996 struct vop_fid_args *ap = v; 5997 5998 return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); 5999 } 6000 6001 static int 6002 zfs_netbsd_pathconf(void *v) 6003 { 6004 struct vop_pathconf_args *ap = v; 6005 ulong_t val; 6006 int error; 6007 6008 error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->l_cred, NULL); 6009 if (error == 0) 6010 *ap->a_retval = val; 6011 else if (error == EOPNOTSUPP) { 6012 switch (ap->a_name) { 6013 case 
_PC_NAME_MAX: 6014 *ap->a_retval = NAME_MAX; 6015 return (0); 6016 case _PC_PATH_MAX: 6017 *ap->a_retval = PATH_MAX; 6018 return (0); 6019 case _PC_LINK_MAX: 6020 *ap->a_retval = LINK_MAX; 6021 return (0); 6022 case _PC_MAX_CANON: 6023 *ap->a_retval = MAX_CANON; 6024 return (0); 6025 case _PC_MAX_INPUT: 6026 *ap->a_retval = MAX_INPUT; 6027 return (0); 6028 case _PC_PIPE_BUF: 6029 *ap->a_retval = PIPE_BUF; 6030 return (0); 6031 case _PC_CHOWN_RESTRICTED: 6032 *ap->a_retval = 1; 6033 return (0); 6034 case _PC_NO_TRUNC: 6035 *ap->a_retval = 1; 6036 return (0); 6037 case _PC_VDISABLE: 6038 *ap->a_retval = _POSIX_VDISABLE; 6039 return (0); 6040 default: 6041 return (EINVAL); 6042 } 6043 /* NOTREACHED */ 6044 } 6045 return (error); 6046 } 6047 6048 static int 6049 zfs_netbsd_advlock(void *v) 6050 { 6051 struct vop_advlock_args /* { 6052 struct vnode *a_vp; 6053 void *a_id; 6054 int a_op; 6055 struct flock *a_fl; 6056 int a_flags; 6057 } */ *ap = v; 6058 struct vnode *vp; 6059 struct znode *zp; 6060 struct zfsvfs *zfsvfs; 6061 int error; 6062 6063 vp = ap->a_vp; 6064 zp = VTOZ(vp); 6065 zfsvfs = zp->z_zfsvfs; 6066 6067 ZFS_ENTER(zfsvfs); 6068 ZFS_VERIFY_ZP(zp); 6069 error = lf_advlock(ap, &zp->z_lockf, zp->z_size); 6070 ZFS_EXIT(zfsvfs); 6071 6072 return error; 6073 } 6074 6075 static int 6076 zfs_netbsd_getpages(void *v) 6077 { 6078 struct vop_getpages_args /* { 6079 struct vnode *a_vp; 6080 voff_t a_offset; 6081 struct vm_page **a_m; 6082 int *a_count; 6083 int a_centeridx; 6084 vm_prot_t a_access_type; 6085 int a_advice; 6086 int a_flags; 6087 } */ * const ap = v; 6088 6089 vnode_t *const vp = ap->a_vp; 6090 const int flags = ap->a_flags; 6091 const bool async = (flags & PGO_SYNCIO) == 0; 6092 const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0; 6093 6094 struct uvm_object * const uobj = &vp->v_uobj; 6095 krwlock_t * const rw = uobj->vmobjlock; 6096 znode_t *zp = VTOZ(vp); 6097 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 6098 vfs_t *mp; 6099 struct vm_page *pg; 6100 
caddr_t va; 6101 int npages = *ap->a_count, found, err = 0; 6102 6103 if (flags & PGO_LOCKED) { 6104 uvn_findpages(uobj, ap->a_offset, &npages, ap->a_m, NULL, 6105 UFP_NOWAIT | UFP_NOALLOC | UFP_NOBUSY | 6106 (memwrite ? UFP_NORDONLY : 0)); 6107 KASSERT(npages == *ap->a_count); 6108 if (memwrite) { 6109 KASSERT(rw_write_held(uobj->vmobjlock)); 6110 for (int i = 0; i < npages; i++) { 6111 pg = ap->a_m[i]; 6112 if (pg == NULL || pg == PGO_DONTCARE) { 6113 continue; 6114 } 6115 if (uvm_pagegetdirty(pg) == 6116 UVM_PAGE_STATUS_CLEAN) { 6117 uvm_pagemarkdirty(pg, 6118 UVM_PAGE_STATUS_UNKNOWN); 6119 } 6120 } 6121 } 6122 return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0; 6123 } 6124 rw_exit(rw); 6125 6126 if (async) { 6127 return 0; 6128 } 6129 6130 mp = vp->v_mount; 6131 fstrans_start(mp); 6132 if (vp->v_mount != mp) { 6133 fstrans_done(mp); 6134 return ENOENT; 6135 } 6136 ZFS_ENTER(zfsvfs); 6137 ZFS_VERIFY_ZP(zp); 6138 6139 rw_enter(rw, RW_WRITER); 6140 if (ap->a_offset + (npages << PAGE_SHIFT) > round_page(vp->v_size)) { 6141 rw_exit(rw); 6142 ZFS_EXIT(zfsvfs); 6143 fstrans_done(mp); 6144 return EINVAL; 6145 } 6146 uvn_findpages(uobj, ap->a_offset, &npages, ap->a_m, NULL, UFP_ALL); 6147 KASSERT(npages == *ap->a_count); 6148 6149 for (int i = 0; i < npages; i++) { 6150 pg = ap->a_m[i]; 6151 if (pg->flags & PG_FAKE) { 6152 voff_t offset = pg->offset; 6153 KASSERT(pg->offset == ap->a_offset + (i << PAGE_SHIFT)); 6154 rw_exit(rw); 6155 6156 va = zfs_map_page(pg, S_WRITE); 6157 err = dmu_read(zfsvfs->z_os, zp->z_id, offset, 6158 PAGE_SIZE, va, DMU_READ_PREFETCH); 6159 zfs_unmap_page(pg, va); 6160 6161 if (err != 0) { 6162 uvm_aio_aiodone_pages(ap->a_m, npages, false, err); 6163 memset(ap->a_m, 0, sizeof(ap->a_m[0]) * 6164 npages); 6165 break; 6166 } 6167 rw_enter(rw, RW_WRITER); 6168 pg->flags &= ~(PG_FAKE); 6169 } 6170 6171 if (memwrite && uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) { 6172 /* For write faults, start dirtiness tracking. 
*/ 6173 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN); 6174 } 6175 } 6176 rw_exit(rw); 6177 6178 ZFS_EXIT(zfsvfs); 6179 fstrans_done(mp); 6180 6181 return (err); 6182 } 6183 6184 static int 6185 zfs_putapage(vnode_t *vp, page_t **pp, int count, int flags) 6186 { 6187 znode_t *zp = VTOZ(vp); 6188 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 6189 dmu_tx_t *tx; 6190 voff_t off, koff; 6191 voff_t len, klen; 6192 int err; 6193 6194 bool *cleanedp; 6195 struct uvm_object *uobj = &vp->v_uobj; 6196 krwlock_t *rw = uobj->vmobjlock; 6197 6198 if (zp->z_sa_hdl == NULL) { 6199 err = 0; 6200 goto out; 6201 } 6202 6203 /* 6204 * writing to zfs needs memory allocation, locks, etc, 6205 * which are not safe for the page daemon. 6206 * ENOMEM to signal a transient error to uvm. 6207 * hopefully it can find other pages to free. 6208 */ 6209 6210 if (uvm_lwp_is_pagedaemon(curlwp)) { 6211 err = SET_ERROR(ENOMEM); 6212 goto out; 6213 } 6214 6215 /* 6216 * Calculate the length and assert that no whole pages are past EOF. 6217 * This check is equivalent to "off + len <= round_page(zp->z_size)", 6218 * with gyrations to avoid signed integer overflow. 6219 */ 6220 6221 off = pp[0]->offset; 6222 len = count * PAGESIZE; 6223 KASSERT(off <= zp->z_size); 6224 KASSERT(len <= round_page(zp->z_size)); 6225 KASSERT(off <= round_page(zp->z_size) - len); 6226 6227 /* 6228 * If EOF is within the last page, reduce len to avoid writing past 6229 * the file size in the ZFS buffer. Assert that 6230 * "off + len <= zp->z_size", again avoiding signed integer overflow. 
6231 */ 6232 6233 if (len > zp->z_size - off) { 6234 len = zp->z_size - off; 6235 } 6236 KASSERT(len <= zp->z_size); 6237 KASSERT(off <= zp->z_size - len); 6238 6239 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || 6240 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { 6241 err = SET_ERROR(EDQUOT); 6242 goto out; 6243 } 6244 tx = dmu_tx_create(zfsvfs->z_os); 6245 dmu_tx_hold_write(tx, zp->z_id, off, len); 6246 6247 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 6248 zfs_sa_upgrade_txholds(tx, zp); 6249 err = dmu_tx_assign(tx, TXG_WAIT); 6250 if (err != 0) { 6251 dmu_tx_abort(tx); 6252 goto out; 6253 } 6254 6255 if (zp->z_blksz <= PAGESIZE) { 6256 KASSERTMSG(count == 1, "vp %p pp %p count %d", vp, pp, count); 6257 caddr_t va = zfs_map_page(*pp, S_READ); 6258 ASSERT3U(len, <=, PAGESIZE); 6259 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); 6260 zfs_unmap_page(*pp, va); 6261 } else { 6262 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); 6263 } 6264 cleanedp = tsd_get(zfs_putpage_key); 6265 *cleanedp = true; 6266 6267 if (err == 0) { 6268 uint64_t mtime[2], ctime[2]; 6269 sa_bulk_attr_t bulk[3]; 6270 int count = 0; 6271 6272 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, 6273 &mtime, 16); 6274 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, 6275 &ctime, 16); 6276 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 6277 &zp->z_pflags, 8); 6278 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, 6279 B_TRUE); 6280 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 6281 ASSERT0(err); 6282 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 6283 B_FALSE); 6284 } 6285 dmu_tx_commit(tx); 6286 6287 out: 6288 uvm_aio_aiodone_pages(pp, count, true, err); 6289 return (err); 6290 } 6291 6292 static void 6293 zfs_netbsd_update_mctime(vnode_t *vp) 6294 { 6295 znode_t *zp = VTOZ(vp); 6296 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 6297 dmu_tx_t *tx; 6298 sa_bulk_attr_t bulk[2]; 6299 uint64_t mtime[2], ctime[2]; 6300 int count = 0, err; 6301 
6302 tx = dmu_tx_create(zfsvfs->z_os); 6303 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 6304 zfs_sa_upgrade_txholds(tx, zp); 6305 err = dmu_tx_assign(tx, TXG_WAIT); 6306 if (err != 0) { 6307 dmu_tx_abort(tx); 6308 return; 6309 } 6310 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 6311 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 6312 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); 6313 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 6314 dmu_tx_commit(tx); 6315 if (err != 0) { 6316 printf("%s: sa_bulk_update failed with %d\n", __func__, err); 6317 } 6318 } 6319 6320 static int 6321 zfs_netbsd_putpages(void *v) 6322 { 6323 struct vop_putpages_args /* { 6324 struct vnode *a_vp; 6325 voff_t a_offlo; 6326 voff_t a_offhi; 6327 int a_flags; 6328 } */ * const ap = v; 6329 6330 struct vnode *vp = ap->a_vp; 6331 voff_t offlo = ap->a_offlo; 6332 voff_t offhi = ap->a_offhi; 6333 int flags = ap->a_flags; 6334 6335 znode_t *zp = VTOZ(vp); 6336 zfsvfs_t *zfsvfs = zp->z_zfsvfs; 6337 rl_t *rl = NULL; 6338 uint64_t len; 6339 int error; 6340 bool cleaned = false; 6341 bool cleaning = (flags & PGO_CLEANIT) != 0; 6342 6343 if (cleaning) { 6344 bool pagedaemon = uvm_lwp_is_pagedaemon(curlwp); 6345 6346 ASSERT((offlo & PAGE_MASK) == 0 && (offhi & PAGE_MASK) == 0); 6347 ASSERT(offlo < offhi || offhi == 0); 6348 if (offhi == 0) 6349 len = UINT64_MAX; 6350 else 6351 len = offhi - offlo; 6352 rw_exit(vp->v_uobj.vmobjlock); 6353 if (pagedaemon) { 6354 error = fstrans_start_nowait(vp->v_mount); 6355 if (error) 6356 return error; 6357 } else { 6358 vfs_t *mp = vp->v_mount; 6359 fstrans_start(mp); 6360 if (vp->v_mount != mp) { 6361 fstrans_done(mp); 6362 ASSERT(!vn_has_cached_data(vp)); 6363 return 0; 6364 } 6365 } 6366 /* 6367 * Cannot use ZFS_ENTER() here as it returns with error 6368 * if z_unmounted. The next statement is equivalent. 
6369 */ 6370 rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); 6371 6372 if (pagedaemon) { 6373 rl = zfs_range_lock_try(zp, offlo, len, RL_WRITER); 6374 if (rl == NULL) { 6375 error = EBUSY; 6376 goto fail; 6377 } 6378 } else { 6379 rl = zfs_range_lock(zp, offlo, len, RL_WRITER); 6380 } 6381 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER); 6382 tsd_set(zfs_putpage_key, &cleaned); 6383 } 6384 error = genfs_putpages(v); 6385 if (cleaning) { 6386 tsd_set(zfs_putpage_key, NULL); 6387 zfs_range_unlock(rl); 6388 6389 /* 6390 * Only zil_commit() if we cleaned something. This avoids 6391 * deadlock if we're called from zfs_netbsd_setsize(). 6392 * 6393 * Also, it isn't safe or nessesary to call it for vnode 6394 * reclaim. See the comment in zfs_netbsd_fsync. 6395 */ 6396 6397 if (cleaned && (flags & PGO_RECLAIM) == 0) { 6398 if ((flags & PGO_SYNCIO) != 0 6399 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 6400 zil_commit(zfsvfs->z_log, zp->z_id); 6401 } 6402 fail: 6403 ZFS_EXIT(zfsvfs); 6404 fstrans_done(vp->v_mount); 6405 } 6406 return error; 6407 } 6408 6409 /* 6410 * Restrict the putpages range to the ZFS block containing the offset. 6411 */ 6412 static void 6413 zfs_netbsd_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip) 6414 { 6415 znode_t *zp = VTOZ(vp); 6416 6417 *lop = trunc_page(rounddown2(off, zp->z_blksz)); 6418 *hip = round_page(*lop + zp->z_blksz); 6419 } 6420 6421 void 6422 zfs_netbsd_setsize(vnode_t *vp, off_t size) 6423 { 6424 struct uvm_object *uobj = &vp->v_uobj; 6425 krwlock_t *rw = uobj->vmobjlock; 6426 page_t *pg; 6427 int count, pgoff; 6428 caddr_t va; 6429 off_t tsize; 6430 6431 uvm_vnp_setsize(vp, size); 6432 if (!vn_has_cached_data(vp)) 6433 return; 6434 6435 tsize = trunc_page(size); 6436 if (tsize == size) 6437 return; 6438 6439 /* 6440 * If there's a partial page, we need to zero the tail. 
6441 */ 6442 6443 rw_enter(rw, RW_WRITER); 6444 count = 1; 6445 pg = NULL; 6446 if (uvn_findpages(uobj, tsize, &count, &pg, NULL, UFP_NOALLOC)) { 6447 va = zfs_map_page(pg, S_WRITE); 6448 pgoff = size - tsize; 6449 memset(va + pgoff, 0, PAGESIZE - pgoff); 6450 zfs_unmap_page(pg, va); 6451 uvm_page_unbusy(&pg, 1); 6452 } 6453 6454 rw_exit(rw); 6455 } 6456 6457 static int 6458 zfs_netbsd_print(void *v) 6459 { 6460 struct vop_print_args /* { 6461 struct vnode *a_vp; 6462 } */ *ap = v; 6463 vnode_t *vp; 6464 znode_t *zp; 6465 6466 vp = ap->a_vp; 6467 zp = VTOZ(vp); 6468 6469 printf("\tino %" PRIu64 " size %" PRIu64 "\n", 6470 zp->z_id, zp->z_size); 6471 return 0; 6472 } 6473 6474 const struct genfs_ops zfs_genfsops = { 6475 .gop_write = zfs_putapage, 6476 .gop_putrange = zfs_netbsd_gop_putrange, 6477 }; 6478 6479 int (**zfs_vnodeop_p)(void *); 6480 const struct vnodeopv_entry_desc zfs_vnodeop_entries[] = { 6481 { &vop_default_desc, vn_default_error }, 6482 { &vop_parsepath_desc, genfs_parsepath }, 6483 { &vop_lookup_desc, zfs_netbsd_lookup }, 6484 { &vop_create_desc, zfs_netbsd_create }, 6485 { &vop_mknod_desc, zfs_netbsd_mknod }, 6486 { &vop_open_desc, zfs_netbsd_open }, 6487 { &vop_close_desc, zfs_netbsd_close }, 6488 { &vop_access_desc, zfs_netbsd_access }, 6489 { &vop_accessx_desc, genfs_accessx }, 6490 { &vop_getattr_desc, zfs_netbsd_getattr }, 6491 { &vop_setattr_desc, zfs_netbsd_setattr }, 6492 { &vop_read_desc, zfs_netbsd_read }, 6493 { &vop_write_desc, zfs_netbsd_write }, 6494 { &vop_ioctl_desc, zfs_netbsd_ioctl }, 6495 { &vop_poll_desc, genfs_poll }, 6496 { &vop_kqfilter_desc, genfs_kqfilter }, 6497 { &vop_revoke_desc, genfs_revoke }, 6498 { &vop_fsync_desc, zfs_netbsd_fsync }, 6499 { &vop_remove_desc, zfs_netbsd_remove }, 6500 { &vop_link_desc, zfs_netbsd_link }, 6501 { &vop_lock_desc, genfs_lock }, 6502 { &vop_unlock_desc, genfs_unlock }, 6503 { &vop_rename_desc, zfs_netbsd_rename }, 6504 { &vop_mkdir_desc, zfs_netbsd_mkdir }, 6505 { &vop_rmdir_desc, 
zfs_netbsd_rmdir }, 6506 { &vop_symlink_desc, zfs_netbsd_symlink }, 6507 { &vop_readdir_desc, zfs_netbsd_readdir }, 6508 { &vop_readlink_desc, zfs_netbsd_readlink }, 6509 { &vop_inactive_desc, zfs_netbsd_inactive }, 6510 { &vop_reclaim_desc, zfs_netbsd_reclaim }, 6511 { &vop_pathconf_desc, zfs_netbsd_pathconf }, 6512 { &vop_seek_desc, genfs_seek }, 6513 { &vop_getpages_desc, zfs_netbsd_getpages }, 6514 { &vop_putpages_desc, zfs_netbsd_putpages }, 6515 { &vop_mmap_desc, genfs_mmap }, 6516 { &vop_islocked_desc, genfs_islocked }, 6517 { &vop_advlock_desc, zfs_netbsd_advlock }, 6518 { &vop_print_desc, zfs_netbsd_print }, 6519 { &vop_fcntl_desc, genfs_fcntl }, 6520 { NULL, NULL } 6521 }; 6522 6523 const struct vnodeopv_desc zfs_vnodeop_opv_desc = 6524 { &zfs_vnodeop_p, zfs_vnodeop_entries }; 6525 6526 int (**zfs_specop_p)(void *); 6527 const struct vnodeopv_entry_desc zfs_specop_entries[] = { 6528 { &vop_default_desc, vn_default_error }, 6529 GENFS_SPECOP_ENTRIES, 6530 { &vop_close_desc, spec_close }, 6531 { &vop_access_desc, zfs_netbsd_access }, 6532 { &vop_accessx_desc, genfs_accessx }, 6533 { &vop_getattr_desc, zfs_netbsd_getattr }, 6534 { &vop_setattr_desc, zfs_netbsd_setattr }, 6535 { &vop_read_desc, zfs_netbsd_read }, 6536 { &vop_write_desc, zfs_netbsd_write }, 6537 { &vop_fsync_desc, zfs_spec_fsync }, 6538 { &vop_lock_desc, genfs_lock }, 6539 { &vop_unlock_desc, genfs_unlock }, 6540 { &vop_inactive_desc, zfs_netbsd_inactive }, 6541 { &vop_reclaim_desc, zfs_netbsd_reclaim }, 6542 { &vop_islocked_desc, genfs_islocked }, 6543 { &vop_bwrite_desc, vn_bwrite }, 6544 { &vop_print_desc, zfs_netbsd_print }, 6545 { &vop_fcntl_desc, genfs_fcntl }, 6546 { NULL, NULL } 6547 }; 6548 6549 const struct vnodeopv_desc zfs_specop_opv_desc = 6550 { &zfs_specop_p, zfs_specop_entries }; 6551 6552 int (**zfs_fifoop_p)(void *); 6553 const struct vnodeopv_entry_desc zfs_fifoop_entries[] = { 6554 { &vop_default_desc, vn_default_error }, 6555 GENFS_FIFOOP_ENTRIES, 6556 { &vop_close_desc, 
vn_fifo_bypass }, 6557 { &vop_access_desc, zfs_netbsd_access }, 6558 { &vop_accessx_desc, genfs_accessx }, 6559 { &vop_getattr_desc, zfs_netbsd_getattr }, 6560 { &vop_setattr_desc, zfs_netbsd_setattr }, 6561 { &vop_read_desc, zfs_netbsd_read }, 6562 { &vop_write_desc, zfs_netbsd_write }, 6563 { &vop_fsync_desc, zfs_netbsd_fsync }, 6564 { &vop_lock_desc, genfs_lock }, 6565 { &vop_unlock_desc, genfs_unlock }, 6566 { &vop_inactive_desc, zfs_netbsd_inactive }, 6567 { &vop_reclaim_desc, zfs_netbsd_reclaim }, 6568 { &vop_islocked_desc, genfs_islocked }, 6569 { &vop_bwrite_desc, vn_bwrite }, 6570 { &vop_strategy_desc, vn_fifo_bypass }, 6571 { &vop_print_desc, zfs_netbsd_print }, 6572 { &vop_fcntl_desc, genfs_fcntl }, 6573 { NULL, NULL } 6574 }; 6575 6576 const struct vnodeopv_desc zfs_fifoop_opv_desc = 6577 { &zfs_fifoop_p, zfs_fifoop_entries }; 6578 6579 #endif /* __NetBSD__ */ 6580