tmpfs_subr.c revision 1.105.2.2 1 /* $NetBSD: tmpfs_subr.c,v 1.105.2.2 2020/01/24 16:48:58 ad Exp $ */
2
3 /*
4 * Copyright (c) 2005-2013 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
9 * 2005 program, and by Mindaugas Rasiukevicius.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /*
34 * Efficient memory file system: interfaces for inode and directory entry
35 * construction, destruction and manipulation.
36 *
37 * Reference counting
38 *
39 * The link count of inode (tmpfs_node_t::tn_links) is used as a
40 * reference counter. However, it has slightly different semantics.
41 *
42 * For directories - link count represents directory entries, which
43 * refer to the directories. In other words, it represents the count
44 * of sub-directories. It also takes into account the virtual '.'
45 * entry (which has no real entry in the list). For files - link count
46 * represents the hard links. Since only empty directories can be
47 * removed - link count aligns the reference counting requirements
48 * enough. Note: to check whether directory is not empty, the inode
49 * size (tmpfs_node_t::tn_size) can be used.
50 *
51 * The inode itself, as an object, gathers its first reference when
52 * directory entry is attached via tmpfs_dir_attach(9). For instance,
53 * after regular tmpfs_create(), a file would have a link count of 1,
54 * while directory after tmpfs_mkdir() would have 2 (due to '.').
55 *
56 * Reclamation
57 *
58 * It should be noted that tmpfs inodes rely on a combination of vnode
59 * reference counting and link counting. That is, an inode can only be
60 * destroyed if its associated vnode is inactive. The destruction is
61 * done on vnode reclamation i.e. tmpfs_reclaim(). It should be noted
62 * that tmpfs_node_t::tn_links being 0 is a destruction criterion.
63 *
64 * If an inode has references within the file system (tn_links > 0) and
65 * its inactive vnode gets reclaimed/recycled - then the association is
66 * broken in tmpfs_reclaim(). In such case, an inode will always pass
67 * tmpfs_lookup() and thus vcache_get() to associate a new vnode.
68 *
69 * Lock order
70 *
71 * vnode_t::v_vlock ->
72 * vnode_t::v_interlock
73 */
74
75 #include <sys/cdefs.h>
76 __KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.105.2.2 2020/01/24 16:48:58 ad Exp $");
77
78 #include <sys/param.h>
79 #include <sys/cprng.h>
80 #include <sys/dirent.h>
81 #include <sys/event.h>
82 #include <sys/kmem.h>
83 #include <sys/mount.h>
84 #include <sys/namei.h>
85 #include <sys/time.h>
86 #include <sys/stat.h>
87 #include <sys/systm.h>
88 #include <sys/vnode.h>
89 #include <sys/kauth.h>
90 #include <sys/atomic.h>
91
92 #include <uvm/uvm.h>
93
94 #include <miscfs/specfs/specdev.h>
95 #include <miscfs/genfs/genfs.h>
96 #include <fs/tmpfs/tmpfs.h>
97 #include <fs/tmpfs/tmpfs_fifoops.h>
98 #include <fs/tmpfs/tmpfs_specops.h>
99 #include <fs/tmpfs/tmpfs_vnops.h>
100
101 static void tmpfs_dir_putseq(tmpfs_node_t *, tmpfs_dirent_t *);
102
103 /*
104 * Initialize vnode with tmpfs node.
105 */
106 static void
107 tmpfs_init_vnode(struct vnode *vp, tmpfs_node_t *node)
108 {
109 kmutex_t *slock;
110
111 KASSERT(node->tn_vnode == NULL);
112
113 /* Share the interlock with the node. */
114 if (node->tn_type == VREG) {
115 slock = node->tn_spec.tn_reg.tn_aobj->vmobjlock;
116 mutex_obj_hold(slock);
117 uvm_obj_setlock(&vp->v_uobj, slock);
118 }
119
120 vp->v_tag = VT_TMPFS;
121 vp->v_type = node->tn_type;
122
123 /* Type-specific initialization. */
124 switch (vp->v_type) {
125 case VBLK:
126 case VCHR:
127 vp->v_op = tmpfs_specop_p;
128 spec_node_init(vp, node->tn_spec.tn_dev.tn_rdev);
129 break;
130 case VFIFO:
131 vp->v_op = tmpfs_fifoop_p;
132 break;
133 case VDIR:
134 if (node->tn_spec.tn_dir.tn_parent == node)
135 vp->v_vflag |= VV_ROOT;
136 /* FALLTHROUGH */
137 case VLNK:
138 case VREG:
139 case VSOCK:
140 vp->v_op = tmpfs_vnodeop_p;
141 break;
142 default:
143 panic("bad node type %d", vp->v_type);
144 break;
145 }
146
147 vp->v_data = node;
148 node->tn_vnode = vp;
149 uvm_vnp_setsize(vp, node->tn_size);
150 KASSERT(node->tn_mode != VNOVAL);
151 cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid);
152 }
153
154 /*
155 * tmpfs_loadvnode: initialise a vnode for a specified inode.
156 */
157 int
158 tmpfs_loadvnode(struct mount *mp, struct vnode *vp,
159 const void *key, size_t key_len, const void **new_key)
160 {
161 tmpfs_node_t *node;
162
163 KASSERT(key_len == sizeof(node));
164 memcpy(&node, key, key_len);
165
166 if (node->tn_links == 0)
167 return ENOENT;
168
169 tmpfs_init_vnode(vp, node);
170
171 *new_key = &vp->v_data;
172
173 return 0;
174 }
175
176 /*
177 * tmpfs_newvnode: allocate a new inode of a specified type and
178 * attach the vonode.
179 */
180 int
181 tmpfs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp,
182 struct vattr *vap, kauth_cred_t cred, void *extra,
183 size_t *key_len, const void **new_key)
184 {
185 tmpfs_mount_t *tmp = VFS_TO_TMPFS(mp);
186 tmpfs_node_t *node, *dnode;
187
188 if (dvp != NULL) {
189 KASSERT(VOP_ISLOCKED(dvp));
190 dnode = VP_TO_TMPFS_DIR(dvp);
191 if (dnode->tn_links == 0)
192 return ENOENT;
193 if (vap->va_type == VDIR) {
194 /* Check for maximum links limit. */
195 if (dnode->tn_links == LINK_MAX)
196 return EMLINK;
197 KASSERT(dnode->tn_links < LINK_MAX);
198 }
199 } else
200 dnode = NULL;
201
202 node = tmpfs_node_get(tmp);
203 if (node == NULL)
204 return ENOSPC;
205
206 /* Initially, no references and no associations. */
207 node->tn_links = 0;
208 node->tn_vnode = NULL;
209 node->tn_holdcount = 0;
210 node->tn_dirent_hint = NULL;
211
212 /*
213 * XXX Where the pool is backed by a map larger than (4GB *
214 * sizeof(*node)), this may produce duplicate inode numbers
215 * for applications that do not understand 64-bit ino_t.
216 */
217 node->tn_id = (ino_t)((uintptr_t)node / sizeof(*node));
218 /*
219 * Make sure the generation number is not zero.
220 * tmpfs_inactive() uses generation zero to mark dead nodes.
221 */
222 do {
223 node->tn_gen = TMPFS_NODE_GEN_MASK & cprng_fast32();
224 } while (node->tn_gen == 0);
225
226 /* Generic initialization. */
227 KASSERT((int)vap->va_type != VNOVAL);
228 node->tn_type = vap->va_type;
229 node->tn_size = 0;
230 node->tn_flags = 0;
231 node->tn_lockf = NULL;
232
233 vfs_timestamp(&node->tn_atime);
234 node->tn_birthtime = node->tn_atime;
235 node->tn_ctime = node->tn_atime;
236 node->tn_mtime = node->tn_atime;
237
238 if (dvp == NULL) {
239 KASSERT(vap->va_uid != VNOVAL && vap->va_gid != VNOVAL);
240 node->tn_uid = vap->va_uid;
241 node->tn_gid = vap->va_gid;
242 vp->v_vflag |= VV_ROOT;
243 } else {
244 KASSERT(dnode != NULL);
245 node->tn_uid = kauth_cred_geteuid(cred);
246 node->tn_gid = dnode->tn_gid;
247 }
248 KASSERT(vap->va_mode != VNOVAL);
249 node->tn_mode = vap->va_mode;
250
251 /* Type-specific initialization. */
252 switch (node->tn_type) {
253 case VBLK:
254 case VCHR:
255 /* Character/block special device. */
256 KASSERT(vap->va_rdev != VNOVAL);
257 node->tn_spec.tn_dev.tn_rdev = vap->va_rdev;
258 break;
259 case VDIR:
260 /* Directory. */
261 TAILQ_INIT(&node->tn_spec.tn_dir.tn_dir);
262 node->tn_spec.tn_dir.tn_parent = NULL;
263 node->tn_spec.tn_dir.tn_seq_arena = NULL;
264 node->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
265 node->tn_spec.tn_dir.tn_readdir_lastp = NULL;
266
267 /* Extra link count for the virtual '.' entry. */
268 node->tn_links++;
269 break;
270 case VFIFO:
271 case VSOCK:
272 break;
273 case VLNK:
274 node->tn_size = 0;
275 node->tn_spec.tn_lnk.tn_link = NULL;
276 break;
277 case VREG:
278 /* Regular file. Create an underlying UVM object. */
279 node->tn_spec.tn_reg.tn_aobj =
280 uao_create(INT64_MAX - PAGE_SIZE, 0);
281 node->tn_spec.tn_reg.tn_aobj_pages = 0;
282 break;
283 default:
284 panic("bad node type %d", vp->v_type);
285 break;
286 }
287
288 tmpfs_init_vnode(vp, node);
289
290 mutex_enter(&tmp->tm_lock);
291 LIST_INSERT_HEAD(&tmp->tm_nodes, node, tn_entries);
292 mutex_exit(&tmp->tm_lock);
293
294 *key_len = sizeof(vp->v_data);
295 *new_key = &vp->v_data;
296
297 return 0;
298 }
299
300 /*
301 * tmpfs_free_node: remove the inode from a list in the mount point and
302 * destroy the inode structures.
303 */
304 void
305 tmpfs_free_node(tmpfs_mount_t *tmp, tmpfs_node_t *node)
306 {
307 size_t objsz;
308 uint32_t hold;
309
310 mutex_enter(&tmp->tm_lock);
311 hold = atomic_or_32_nv(&node->tn_holdcount, TMPFS_NODE_RECLAIMED);
312 /* Defer destruction to last thread holding this node. */
313 if (hold != TMPFS_NODE_RECLAIMED) {
314 mutex_exit(&tmp->tm_lock);
315 return;
316 }
317 LIST_REMOVE(node, tn_entries);
318 mutex_exit(&tmp->tm_lock);
319
320 switch (node->tn_type) {
321 case VLNK:
322 if (node->tn_size > 0) {
323 tmpfs_strname_free(tmp, node->tn_spec.tn_lnk.tn_link,
324 node->tn_size);
325 }
326 break;
327 case VREG:
328 /*
329 * Calculate the size of inode data, decrease the used-memory
330 * counter, and destroy the unerlying UVM object (if any).
331 */
332 objsz = PAGE_SIZE * node->tn_spec.tn_reg.tn_aobj_pages;
333 if (objsz != 0) {
334 tmpfs_mem_decr(tmp, objsz);
335 }
336 if (node->tn_spec.tn_reg.tn_aobj != NULL) {
337 uao_detach(node->tn_spec.tn_reg.tn_aobj);
338 }
339 break;
340 case VDIR:
341 KASSERT(node->tn_size == 0);
342 KASSERT(node->tn_spec.tn_dir.tn_seq_arena == NULL);
343 KASSERT(TAILQ_EMPTY(&node->tn_spec.tn_dir.tn_dir));
344 KASSERT(node->tn_spec.tn_dir.tn_parent == NULL ||
345 node == tmp->tm_root);
346 break;
347 default:
348 break;
349 }
350 KASSERT(node->tn_vnode == NULL);
351 KASSERT(node->tn_links == 0);
352
353 tmpfs_node_put(tmp, node);
354 }
355
356 /*
357 * tmpfs_construct_node: allocate a new file of specified type and adds it
358 * into the parent directory.
359 *
360 * => Credentials of the caller are used.
361 */
362 int
363 tmpfs_construct_node(vnode_t *dvp, vnode_t **vpp, struct vattr *vap,
364 struct componentname *cnp, char *target)
365 {
366 tmpfs_mount_t *tmp = VFS_TO_TMPFS(dvp->v_mount);
367 tmpfs_node_t *dnode = VP_TO_TMPFS_DIR(dvp), *node;
368 tmpfs_dirent_t *de, *wde;
369 char *slink = NULL;
370 int ssize = 0;
371 int error;
372
373 /* Allocate symlink target. */
374 if (target != NULL) {
375 KASSERT(vap->va_type == VLNK);
376 ssize = strlen(target);
377 KASSERT(ssize < MAXPATHLEN);
378 if (ssize > 0) {
379 slink = tmpfs_strname_alloc(tmp, ssize);
380 if (slink == NULL)
381 return ENOSPC;
382 memcpy(slink, target, ssize);
383 }
384 }
385
386 /* Allocate a directory entry that points to the new file. */
387 error = tmpfs_alloc_dirent(tmp, cnp->cn_nameptr, cnp->cn_namelen, &de);
388 if (error) {
389 if (slink != NULL)
390 tmpfs_strname_free(tmp, slink, ssize);
391 return error;
392 }
393
394 /* Allocate a vnode that represents the new file. */
395 error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, vpp);
396 if (error) {
397 if (slink != NULL)
398 tmpfs_strname_free(tmp, slink, ssize);
399 tmpfs_free_dirent(tmp, de);
400 return error;
401 }
402 error = vn_lock(*vpp, LK_EXCLUSIVE);
403 if (error) {
404 vrele(*vpp);
405 *vpp = NULL;
406 if (slink != NULL)
407 tmpfs_strname_free(tmp, slink, ssize);
408 tmpfs_free_dirent(tmp, de);
409 return error;
410 }
411
412 node = VP_TO_TMPFS_NODE(*vpp);
413
414 if (slink != NULL) {
415 node->tn_spec.tn_lnk.tn_link = slink;
416 node->tn_size = ssize;
417 }
418
419 /* Remove whiteout before adding the new entry. */
420 if (cnp->cn_flags & ISWHITEOUT) {
421 wde = tmpfs_dir_lookup(dnode, cnp);
422 KASSERT(wde != NULL && wde->td_node == TMPFS_NODE_WHITEOUT);
423 tmpfs_dir_detach(dnode, wde);
424 tmpfs_free_dirent(tmp, wde);
425 }
426
427 /* Associate inode and attach the entry into the directory. */
428 tmpfs_dir_attach(dnode, de, node);
429
430 /* Make node opaque if requested. */
431 if (cnp->cn_flags & ISWHITEOUT)
432 node->tn_flags |= UF_OPAQUE;
433
434 /* Update the parent's timestamps. */
435 tmpfs_update(dvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME);
436
437 VOP_UNLOCK(*vpp);
438
439 cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags);
440 return 0;
441 }
442
443 /*
444 * tmpfs_alloc_dirent: allocates a new directory entry for the inode.
445 * The directory entry contains a path name component.
446 */
447 int
448 tmpfs_alloc_dirent(tmpfs_mount_t *tmp, const char *name, uint16_t len,
449 tmpfs_dirent_t **de)
450 {
451 tmpfs_dirent_t *nde;
452
453 nde = tmpfs_dirent_get(tmp);
454 if (nde == NULL)
455 return ENOSPC;
456
457 nde->td_name = tmpfs_strname_alloc(tmp, len);
458 if (nde->td_name == NULL) {
459 tmpfs_dirent_put(tmp, nde);
460 return ENOSPC;
461 }
462 nde->td_namelen = len;
463 memcpy(nde->td_name, name, len);
464 nde->td_seq = TMPFS_DIRSEQ_NONE;
465 nde->td_node = NULL; /* for asserts */
466
467 *de = nde;
468 return 0;
469 }
470
471 /*
472 * tmpfs_free_dirent: free a directory entry.
473 */
474 void
475 tmpfs_free_dirent(tmpfs_mount_t *tmp, tmpfs_dirent_t *de)
476 {
477 KASSERT(de->td_node == NULL);
478 KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
479 tmpfs_strname_free(tmp, de->td_name, de->td_namelen);
480 tmpfs_dirent_put(tmp, de);
481 }
482
483 /*
484 * tmpfs_dir_attach: associate directory entry with a specified inode,
485 * and attach the entry into the directory, specified by vnode.
486 *
487 * => Increases link count on the associated node.
488 * => Increases link count on directory node if our node is VDIR.
489 * => It is caller's responsibility to check for the LINK_MAX limit.
490 * => Triggers kqueue events here.
491 */
492 void
493 tmpfs_dir_attach(tmpfs_node_t *dnode, tmpfs_dirent_t *de, tmpfs_node_t *node)
494 {
495 vnode_t *dvp = dnode->tn_vnode;
496 int events = NOTE_WRITE;
497
498 KASSERT(dvp != NULL);
499 KASSERT(VOP_ISLOCKED(dvp));
500
501 /* Get a new sequence number. */
502 KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
503 de->td_seq = tmpfs_dir_getseq(dnode, de);
504
505 /* Associate directory entry and the inode. */
506 de->td_node = node;
507 if (node != TMPFS_NODE_WHITEOUT) {
508 KASSERT(node->tn_links < LINK_MAX);
509 node->tn_links++;
510
511 /* Save the hint (might overwrite). */
512 node->tn_dirent_hint = de;
513 } else if ((dnode->tn_gen & TMPFS_WHITEOUT_BIT) == 0) {
514 /* Flag that there are whiteout entries. */
515 atomic_or_32(&dnode->tn_gen, TMPFS_WHITEOUT_BIT);
516 }
517
518 /* Insert the entry to the directory (parent of inode). */
519 TAILQ_INSERT_TAIL(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
520 dnode->tn_size += sizeof(tmpfs_dirent_t);
521 uvm_vnp_setsize(dvp, dnode->tn_size);
522
523 if (node != TMPFS_NODE_WHITEOUT && node->tn_type == VDIR) {
524 /* Set parent. */
525 KASSERT(node->tn_spec.tn_dir.tn_parent == NULL);
526 node->tn_spec.tn_dir.tn_parent = dnode;
527
528 /* Increase the link count of parent. */
529 KASSERT(dnode->tn_links < LINK_MAX);
530 dnode->tn_links++;
531 events |= NOTE_LINK;
532
533 TMPFS_VALIDATE_DIR(node);
534 }
535 VN_KNOTE(dvp, events);
536 }
537
538 /*
539 * tmpfs_dir_detach: disassociate directory entry and its inode,
540 * and detach the entry from the directory, specified by vnode.
541 *
542 * => Decreases link count on the associated node.
543 * => Decreases the link count on directory node, if our node is VDIR.
544 * => Triggers kqueue events here.
545 *
546 * => Note: dvp and vp may be NULL only if called by tmpfs_unmount().
547 */
548 void
549 tmpfs_dir_detach(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
550 {
551 tmpfs_node_t *node = de->td_node;
552 vnode_t *vp, *dvp = dnode->tn_vnode;
553 int events = NOTE_WRITE;
554
555 KASSERT(dvp == NULL || VOP_ISLOCKED(dvp));
556
557 if (__predict_true(node != TMPFS_NODE_WHITEOUT)) {
558 /* Deassociate the inode and entry. */
559 node->tn_dirent_hint = NULL;
560
561 KASSERT(node->tn_links > 0);
562 node->tn_links--;
563
564 if ((vp = node->tn_vnode) != NULL) {
565 KASSERT(VOP_ISLOCKED(vp));
566 VN_KNOTE(vp, node->tn_links ? NOTE_LINK : NOTE_DELETE);
567 }
568
569 /* If directory - decrease the link count of parent. */
570 if (node->tn_type == VDIR) {
571 KASSERT(node->tn_spec.tn_dir.tn_parent == dnode);
572 node->tn_spec.tn_dir.tn_parent = NULL;
573
574 KASSERT(dnode->tn_links > 0);
575 dnode->tn_links--;
576 events |= NOTE_LINK;
577 }
578 }
579 de->td_node = NULL;
580
581 /* Remove the entry from the directory. */
582 if (dnode->tn_spec.tn_dir.tn_readdir_lastp == de) {
583 dnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;
584 }
585 TAILQ_REMOVE(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
586 dnode->tn_size -= sizeof(tmpfs_dirent_t);
587 tmpfs_dir_putseq(dnode, de);
588
589 if (dvp) {
590 uvm_vnp_setsize(dvp, dnode->tn_size);
591 VN_KNOTE(dvp, events);
592 }
593 }
594
595 /*
596 * tmpfs_dir_lookup: find a directory entry in the specified inode.
597 *
598 * Note that the . and .. components are not allowed as they do not
599 * physically exist within directories.
600 */
601 tmpfs_dirent_t *
602 tmpfs_dir_lookup(tmpfs_node_t *node, struct componentname *cnp)
603 {
604 const char *name = cnp->cn_nameptr;
605 const uint16_t nlen = cnp->cn_namelen;
606 tmpfs_dirent_t *de;
607
608 KASSERT(VOP_ISLOCKED(node->tn_vnode));
609 KASSERT(nlen != 1 || !(name[0] == '.'));
610 KASSERT(nlen != 2 || !(name[0] == '.' && name[1] == '.'));
611 TMPFS_VALIDATE_DIR(node);
612
613 TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
614 if (de->td_namelen != nlen)
615 continue;
616 if (memcmp(de->td_name, name, nlen) != 0)
617 continue;
618 break;
619 }
620 return de;
621 }
622
623 /*
624 * tmpfs_dir_cached: get a cached directory entry if it is valid. Used to
625 * avoid unnecessary tmpfs_dir_lookup().
626 *
627 * => The vnode must be locked.
628 */
629 tmpfs_dirent_t *
630 tmpfs_dir_cached(tmpfs_node_t *node)
631 {
632 tmpfs_dirent_t *de = node->tn_dirent_hint;
633
634 KASSERT(VOP_ISLOCKED(node->tn_vnode));
635
636 if (de == NULL) {
637 return NULL;
638 }
639 KASSERT(de->td_node == node);
640
641 /*
642 * Directories always have a valid hint. For files, check if there
643 * are any hard links. If there are - hint might be invalid.
644 */
645 return (node->tn_type != VDIR && node->tn_links > 1) ? NULL : de;
646 }
647
648 /*
649 * tmpfs_dir_getseq: get a per-directory sequence number for the entry.
650 *
651 * => Shall not be larger than 2^31 for linux32 compatibility.
652 */
653 uint32_t
654 tmpfs_dir_getseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
655 {
656 uint32_t seq = de->td_seq;
657 vmem_t *seq_arena;
658 vmem_addr_t off;
659 int error __diagused;
660
661 TMPFS_VALIDATE_DIR(dnode);
662
663 if (__predict_true(seq != TMPFS_DIRSEQ_NONE)) {
664 /* Already set. */
665 KASSERT(seq >= TMPFS_DIRSEQ_START);
666 return seq;
667 }
668
669 /*
670 * The "." and ".." and the end-of-directory have reserved numbers.
671 * The other sequence numbers are allocated as following:
672 *
673 * - The first half of the 2^31 is assigned incrementally.
674 *
675 * - If that range is exceeded, then the second half of 2^31
676 * is used, but managed by vmem(9).
677 */
678
679 seq = dnode->tn_spec.tn_dir.tn_next_seq;
680 KASSERT(seq >= TMPFS_DIRSEQ_START);
681
682 if (__predict_true(seq < TMPFS_DIRSEQ_END)) {
683 /* First half: just increment and return. */
684 dnode->tn_spec.tn_dir.tn_next_seq++;
685 return seq;
686 }
687
688 /*
689 * First half exceeded, use the second half. May need to create
690 * vmem(9) arena for the directory first.
691 */
692 if ((seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena) == NULL) {
693 seq_arena = vmem_create("tmpfscoo", 0,
694 TMPFS_DIRSEQ_END - 1, 1, NULL, NULL, NULL, 0,
695 VM_SLEEP, IPL_NONE);
696 dnode->tn_spec.tn_dir.tn_seq_arena = seq_arena;
697 KASSERT(seq_arena != NULL);
698 }
699 error = vmem_alloc(seq_arena, 1, VM_SLEEP | VM_BESTFIT, &off);
700 KASSERT(error == 0);
701
702 KASSERT(off < TMPFS_DIRSEQ_END);
703 seq = off | TMPFS_DIRSEQ_END;
704 return seq;
705 }
706
707 static void
708 tmpfs_dir_putseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
709 {
710 vmem_t *seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena;
711 uint32_t seq = de->td_seq;
712
713 TMPFS_VALIDATE_DIR(dnode);
714
715 if (seq == TMPFS_DIRSEQ_NONE || seq < TMPFS_DIRSEQ_END) {
716 /* First half (or no sequence number set yet). */
717 KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
718 } else {
719 /* Second half. */
720 KASSERT(seq_arena != NULL);
721 KASSERT(seq >= TMPFS_DIRSEQ_END);
722 seq &= ~TMPFS_DIRSEQ_END;
723 vmem_free(seq_arena, seq, 1);
724 }
725 de->td_seq = TMPFS_DIRSEQ_NONE;
726
727 /* Empty? We can reset. */
728 if (seq_arena && dnode->tn_size == 0) {
729 dnode->tn_spec.tn_dir.tn_seq_arena = NULL;
730 dnode->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
731 vmem_destroy(seq_arena);
732 }
733 }
734
735 /*
736 * tmpfs_dir_lookupbyseq: lookup a directory entry by the sequence number.
737 */
738 tmpfs_dirent_t *
739 tmpfs_dir_lookupbyseq(tmpfs_node_t *node, off_t seq)
740 {
741 tmpfs_dirent_t *de = node->tn_spec.tn_dir.tn_readdir_lastp;
742
743 TMPFS_VALIDATE_DIR(node);
744
745 /*
746 * First, check the cache. If does not match - perform a lookup.
747 */
748 if (de && de->td_seq == seq) {
749 KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
750 KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
751 return de;
752 }
753 TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
754 KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
755 KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
756 if (de->td_seq == seq)
757 return de;
758 }
759 return NULL;
760 }
761
762 /*
763 * tmpfs_dir_getdotents: helper function for tmpfs_readdir() to get the
764 * dot meta entries, that is, "." or "..". Copy it to the UIO space.
765 */
766 static int
767 tmpfs_dir_getdotents(tmpfs_node_t *node, struct dirent *dp, struct uio *uio)
768 {
769 tmpfs_dirent_t *de;
770 off_t next = 0;
771 int error;
772
773 switch (uio->uio_offset) {
774 case TMPFS_DIRSEQ_DOT:
775 dp->d_fileno = node->tn_id;
776 strlcpy(dp->d_name, ".", sizeof(dp->d_name));
777 next = TMPFS_DIRSEQ_DOTDOT;
778 break;
779 case TMPFS_DIRSEQ_DOTDOT:
780 dp->d_fileno = node->tn_spec.tn_dir.tn_parent->tn_id;
781 strlcpy(dp->d_name, "..", sizeof(dp->d_name));
782 de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir);
783 next = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
784 break;
785 default:
786 KASSERT(false);
787 }
788 dp->d_type = DT_DIR;
789 dp->d_namlen = strlen(dp->d_name);
790 dp->d_reclen = _DIRENT_SIZE(dp);
791
792 if (dp->d_reclen > uio->uio_resid) {
793 return EJUSTRETURN;
794 }
795 if ((error = uiomove(dp, dp->d_reclen, uio)) != 0) {
796 return error;
797 }
798
799 uio->uio_offset = next;
800 return error;
801 }
802
803 /*
804 * tmpfs_dir_getdents: helper function for tmpfs_readdir.
805 *
806 * => Returns as much directory entries as can fit in the uio space.
807 * => The read starts at uio->uio_offset.
808 */
809 int
810 tmpfs_dir_getdents(tmpfs_node_t *node, struct uio *uio, off_t *cntp)
811 {
812 tmpfs_dirent_t *de;
813 struct dirent dent;
814 int error = 0;
815
816 KASSERT(VOP_ISLOCKED(node->tn_vnode));
817 TMPFS_VALIDATE_DIR(node);
818
819 /*
820 * First check for the "." and ".." cases.
821 * Note: tmpfs_dir_getdotents() will "seek" for us.
822 */
823 memset(&dent, 0, sizeof(dent));
824
825 if (uio->uio_offset == TMPFS_DIRSEQ_DOT) {
826 if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
827 goto done;
828 }
829 (*cntp)++;
830 }
831 if (uio->uio_offset == TMPFS_DIRSEQ_DOTDOT) {
832 if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
833 goto done;
834 }
835 (*cntp)++;
836 }
837
838 /* Done if we reached the end. */
839 if (uio->uio_offset == TMPFS_DIRSEQ_EOF) {
840 goto done;
841 }
842
843 /* Locate the directory entry given by the given sequence number. */
844 de = tmpfs_dir_lookupbyseq(node, uio->uio_offset);
845 if (de == NULL) {
846 error = EINVAL;
847 goto done;
848 }
849
850 /*
851 * Read as many entries as possible; i.e., until we reach the end
852 * of the directory or we exhaust UIO space.
853 */
854 do {
855 if (de->td_node == TMPFS_NODE_WHITEOUT) {
856 dent.d_fileno = 1;
857 dent.d_type = DT_WHT;
858 } else {
859 dent.d_fileno = de->td_node->tn_id;
860 dent.d_type = vtype2dt(de->td_node->tn_type);
861 }
862 dent.d_namlen = de->td_namelen;
863 KASSERT(de->td_namelen < sizeof(dent.d_name));
864 memcpy(dent.d_name, de->td_name, de->td_namelen);
865 dent.d_name[de->td_namelen] = '\0';
866 dent.d_reclen = _DIRENT_SIZE(&dent);
867
868 if (dent.d_reclen > uio->uio_resid) {
869 /* Exhausted UIO space. */
870 error = EJUSTRETURN;
871 break;
872 }
873
874 /* Copy out the directory entry and continue. */
875 error = uiomove(&dent, dent.d_reclen, uio);
876 if (error) {
877 break;
878 }
879 (*cntp)++;
880 de = TAILQ_NEXT(de, td_entries);
881
882 } while (uio->uio_resid > 0 && de);
883
884 /* Cache the last entry or clear and mark EOF. */
885 uio->uio_offset = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
886 node->tn_spec.tn_dir.tn_readdir_lastp = de;
887 done:
888 tmpfs_update(node->tn_vnode, TMPFS_UPDATE_ATIME);
889
890 if (error == EJUSTRETURN) {
891 /* Exhausted UIO space - just return. */
892 error = 0;
893 }
894 KASSERT(error >= 0);
895 return error;
896 }
897
898 /*
899 * tmpfs_reg_resize: resize the underlying UVM object associated with the
900 * specified regular file.
901 */
902 int
903 tmpfs_reg_resize(struct vnode *vp, off_t newsize)
904 {
905 tmpfs_mount_t *tmp = VFS_TO_TMPFS(vp->v_mount);
906 tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
907 struct uvm_object *uobj = node->tn_spec.tn_reg.tn_aobj;
908 size_t newpages, oldpages;
909 off_t oldsize;
910
911 KASSERT(vp->v_type == VREG);
912 KASSERT(newsize >= 0);
913
914 oldsize = node->tn_size;
915 oldpages = round_page(oldsize) >> PAGE_SHIFT;
916 newpages = round_page(newsize) >> PAGE_SHIFT;
917 KASSERT(oldpages == node->tn_spec.tn_reg.tn_aobj_pages);
918
919 if (newpages > oldpages) {
920 /* Increase the used-memory counter if getting extra pages. */
921 if (!tmpfs_mem_incr(tmp, (newpages - oldpages) << PAGE_SHIFT)) {
922 return ENOSPC;
923 }
924 } else if (newsize < oldsize) {
925 size_t zerolen;
926
927 zerolen = MIN(round_page(newsize), node->tn_size) - newsize;
928 ubc_zerorange(uobj, newsize, zerolen, UBC_UNMAP_FLAG(vp));
929 }
930
931 node->tn_spec.tn_reg.tn_aobj_pages = newpages;
932 node->tn_size = newsize;
933 uvm_vnp_setsize(vp, newsize);
934
935 /*
936 * Free "backing store".
937 */
938 if (newpages < oldpages) {
939 KASSERT(uobj->vmobjlock == vp->v_interlock);
940
941 mutex_enter(uobj->vmobjlock);
942 uao_dropswap_range(uobj, newpages, oldpages);
943 mutex_exit(uobj->vmobjlock);
944
945 /* Decrease the used-memory counter. */
946 tmpfs_mem_decr(tmp, (oldpages - newpages) << PAGE_SHIFT);
947 }
948 if (newsize > oldsize) {
949 VN_KNOTE(vp, NOTE_EXTEND);
950 }
951 return 0;
952 }
953
954 /*
955 * tmpfs_chflags: change flags of the given vnode.
956 */
957 int
958 tmpfs_chflags(vnode_t *vp, int flags, kauth_cred_t cred, lwp_t *l)
959 {
960 tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
961 kauth_action_t action = KAUTH_VNODE_WRITE_FLAGS;
962 int error;
963 bool changing_sysflags = false;
964
965 KASSERT(VOP_ISLOCKED(vp));
966
967 /* Disallow this operation if the file system is mounted read-only. */
968 if (vp->v_mount->mnt_flag & MNT_RDONLY)
969 return EROFS;
970
971 /*
972 * If the new flags have non-user flags that are different than
973 * those on the node, we need special permission to change them.
974 */
975 if ((flags & SF_SETTABLE) != (node->tn_flags & SF_SETTABLE)) {
976 action |= KAUTH_VNODE_WRITE_SYSFLAGS;
977 changing_sysflags = true;
978 }
979
980 /*
981 * Indicate that this node's flags have system attributes in them if
982 * that's the case.
983 */
984 if (node->tn_flags & (SF_IMMUTABLE | SF_APPEND)) {
985 action |= KAUTH_VNODE_HAS_SYSFLAGS;
986 }
987
988 error = kauth_authorize_vnode(cred, action, vp, NULL,
989 genfs_can_chflags(cred, vp->v_type, node->tn_uid,
990 changing_sysflags));
991 if (error)
992 return error;
993
994 /*
995 * Set the flags. If we're not setting non-user flags, be careful not
996 * to overwrite them.
997 *
998 * XXX: Can't we always assign here? if the system flags are different,
999 * the code above should catch attempts to change them without
1000 * proper permissions, and if we're here it means it's okay to
1001 * change them...
1002 */
1003 if (!changing_sysflags) {
1004 /* Clear all user-settable flags and re-set them. */
1005 node->tn_flags &= SF_SETTABLE;
1006 node->tn_flags |= (flags & UF_SETTABLE);
1007 } else {
1008 node->tn_flags = flags;
1009 }
1010 tmpfs_update(vp, TMPFS_UPDATE_CTIME);
1011 VN_KNOTE(vp, NOTE_ATTRIB);
1012 return 0;
1013 }
1014
1015 /*
1016 * tmpfs_chmod: change access mode on the given vnode.
1017 */
1018 int
1019 tmpfs_chmod(vnode_t *vp, mode_t mode, kauth_cred_t cred, lwp_t *l)
1020 {
1021 tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
1022 int error;
1023
1024 KASSERT(VOP_ISLOCKED(vp));
1025
1026 /* Disallow this operation if the file system is mounted read-only. */
1027 if (vp->v_mount->mnt_flag & MNT_RDONLY)
1028 return EROFS;
1029
1030 /* Immutable or append-only files cannot be modified, either. */
1031 if (node->tn_flags & (IMMUTABLE | APPEND))
1032 return EPERM;
1033
1034 error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY, vp,
1035 NULL, genfs_can_chmod(vp->v_type, cred, node->tn_uid, node->tn_gid, mode));
1036 if (error) {
1037 return error;
1038 }
1039 node->tn_mode = (mode & ALLPERMS);
1040 tmpfs_update(vp, TMPFS_UPDATE_CTIME);
1041 VN_KNOTE(vp, NOTE_ATTRIB);
1042 cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid);
1043 return 0;
1044 }
1045
1046 /*
1047 * tmpfs_chown: change ownership of the given vnode.
1048 *
1049 * => At least one of uid or gid must be different than VNOVAL.
1050 * => Attribute is unchanged for VNOVAL case.
1051 */
1052 int
1053 tmpfs_chown(vnode_t *vp, uid_t uid, gid_t gid, kauth_cred_t cred, lwp_t *l)
1054 {
1055 tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
1056 int error;
1057
1058 KASSERT(VOP_ISLOCKED(vp));
1059
1060 /* Assign default values if they are unknown. */
1061 KASSERT(uid != VNOVAL || gid != VNOVAL);
1062 if (uid == VNOVAL) {
1063 uid = node->tn_uid;
1064 }
1065 if (gid == VNOVAL) {
1066 gid = node->tn_gid;
1067 }
1068
1069 /* Disallow this operation if the file system is mounted read-only. */
1070 if (vp->v_mount->mnt_flag & MNT_RDONLY)
1071 return EROFS;
1072
1073 /* Immutable or append-only files cannot be modified, either. */
1074 if (node->tn_flags & (IMMUTABLE | APPEND))
1075 return EPERM;
1076
1077 error = kauth_authorize_vnode(cred, KAUTH_VNODE_CHANGE_OWNERSHIP, vp,
1078 NULL, genfs_can_chown(cred, node->tn_uid, node->tn_gid, uid,
1079 gid));
1080 if (error) {
1081 return error;
1082 }
1083 node->tn_uid = uid;
1084 node->tn_gid = gid;
1085 tmpfs_update(vp, TMPFS_UPDATE_CTIME);
1086 VN_KNOTE(vp, NOTE_ATTRIB);
1087 cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid);
1088 return 0;
1089 }
1090
1091 /*
1092 * tmpfs_chsize: change size of the given vnode.
1093 */
1094 int
1095 tmpfs_chsize(vnode_t *vp, u_quad_t size, kauth_cred_t cred, lwp_t *l)
1096 {
1097 tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
1098 const off_t length = size;
1099 int error;
1100
1101 KASSERT(VOP_ISLOCKED(vp));
1102
1103 /* Decide whether this is a valid operation based on the file type. */
1104 switch (vp->v_type) {
1105 case VDIR:
1106 return EISDIR;
1107 case VREG:
1108 if (vp->v_mount->mnt_flag & MNT_RDONLY) {
1109 return EROFS;
1110 }
1111 break;
1112 case VBLK:
1113 case VCHR:
1114 case VFIFO:
1115 /*
1116 * Allow modifications of special files even if in the file
1117 * system is mounted read-only (we are not modifying the
1118 * files themselves, but the objects they represent).
1119 */
1120 return 0;
1121 default:
1122 return EOPNOTSUPP;
1123 }
1124
1125 /* Immutable or append-only files cannot be modified, either. */
1126 if (node->tn_flags & (IMMUTABLE | APPEND)) {
1127 return EPERM;
1128 }
1129
1130 if (length < 0) {
1131 return EINVAL;
1132 }
1133
1134 /* Note: tmpfs_reg_resize() will raise NOTE_EXTEND and NOTE_ATTRIB. */
1135 if (node->tn_size != length &&
1136 (error = tmpfs_reg_resize(vp, length)) != 0) {
1137 return error;
1138 }
1139 tmpfs_update(vp, TMPFS_UPDATE_CTIME | TMPFS_UPDATE_MTIME);
1140 return 0;
1141 }
1142
1143 /*
1144 * tmpfs_chtimes: change access and modification times for vnode.
1145 */
1146 int
1147 tmpfs_chtimes(vnode_t *vp, const struct timespec *atime,
1148 const struct timespec *mtime, const struct timespec *btime,
1149 int vaflags, kauth_cred_t cred, lwp_t *l)
1150 {
1151 tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
1152 int error;
1153
1154 KASSERT(VOP_ISLOCKED(vp));
1155
1156 /* Disallow this operation if the file system is mounted read-only. */
1157 if (vp->v_mount->mnt_flag & MNT_RDONLY)
1158 return EROFS;
1159
1160 /* Immutable or append-only files cannot be modified, either. */
1161 if (node->tn_flags & (IMMUTABLE | APPEND))
1162 return EPERM;
1163
1164 error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp, NULL,
1165 genfs_can_chtimes(vp, vaflags, node->tn_uid, cred));
1166 if (error)
1167 return error;
1168
1169 if (atime->tv_sec != VNOVAL) {
1170 node->tn_atime = *atime;
1171 }
1172 if (mtime->tv_sec != VNOVAL) {
1173 node->tn_mtime = *mtime;
1174 }
1175 if (btime->tv_sec != VNOVAL) {
1176 node->tn_birthtime = *btime;
1177 }
1178 VN_KNOTE(vp, NOTE_ATTRIB);
1179 return 0;
1180 }
1181
1182 /*
1183 * tmpfs_update: update the timestamps as indicated by the flags.
1184 */
1185 void
1186 tmpfs_update(vnode_t *vp, unsigned tflags)
1187 {
1188 tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
1189 struct timespec nowtm;
1190
1191 if (tflags == 0) {
1192 return;
1193 }
1194 vfs_timestamp(&nowtm);
1195
1196 if (tflags & TMPFS_UPDATE_ATIME) {
1197 node->tn_atime = nowtm;
1198 }
1199 if (tflags & TMPFS_UPDATE_MTIME) {
1200 node->tn_mtime = nowtm;
1201 }
1202 if (tflags & TMPFS_UPDATE_CTIME) {
1203 node->tn_ctime = nowtm;
1204 }
1205 }
1206