Home | History | Annotate | Line # | Download | only in kern
      1 /*	$NetBSD: sys_memfd.c,v 1.13 2025/11/15 19:02:26 gutteridge Exp $	*/
      2 
      3 /*-
      4  * Copyright (c) 2023 The NetBSD Foundation, Inc.
      5  * All rights reserved.
      6  *
      7  * This code is derived from software contributed to The NetBSD Foundation
      8  * by Theodore Preduta.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  * POSSIBILITY OF SUCH DAMAGE.
     30  */
     31 
     32 #include <sys/cdefs.h>
     33 __KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.13 2025/11/15 19:02:26 gutteridge Exp $");
     34 
     35 #include <sys/param.h>
     36 #include <sys/types.h>
     37 
     38 #include <sys/fcntl.h>
     39 #include <sys/file.h>
     40 #include <sys/filedesc.h>
     41 #include <sys/memfd.h>
     42 #include <sys/mman.h>
     43 #include <sys/syscallargs.h>
     44 
     45 #include <uvm/uvm_extern.h>
     46 #include <uvm/uvm_object.h>
     47 
     48 #define F_SEAL_ANY_WRITE	(F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
     49 #define MFD_KNOWN_SEALS		(F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \
     50 				|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
     51 
     52 static const char memfd_prefix[] = "memfd:";
     53 
     54 static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
     55 static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
     56 static int memfd_ioctl(file_t *, u_long, void *);
     57 static int memfd_fcntl(file_t *, u_int, void *);
     58 static int memfd_stat(file_t *, struct stat *);
     59 static int memfd_close(file_t *);
     60 static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *,
     61     struct uvm_object **, int *);
     62 static int memfd_seek(file_t *, off_t, int, off_t *, int);
     63 static int memfd_truncate_locked(file_t *, off_t);
     64 static int memfd_truncate(file_t *, off_t);
     65 
/*
 * File operations vector installed on every descriptor returned by
 * memfd_create(2).  poll/kqfilter/restart use the fnullop_* stubs;
 * fpathconf and posix_fadvise fail with EOPNOTSUPP.
 */
static const struct fileops memfd_fileops = {
	.fo_name = "memfd",
	.fo_read = memfd_read,
	.fo_write = memfd_write,
	.fo_ioctl = memfd_ioctl,
	.fo_fcntl = memfd_fcntl,
	.fo_poll = fnullop_poll,
	.fo_stat = memfd_stat,
	.fo_close = memfd_close,
	.fo_kqfilter = fnullop_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = memfd_mmap,
	.fo_seek = memfd_seek,
	.fo_fpathconf = (void *)eopnotsupp,
	.fo_posix_fadvise = (void *)eopnotsupp,
	.fo_truncate = memfd_truncate,
};
     83 
     84 /*
     85  * memfd_create(2).  Create a file descriptor associated with anonymous
     86  * memory.
     87  */
int
sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const char *) name;
		syscallarg(unsigned int) flags;
	} */
	int error, fd;
	file_t *fp;
	struct memfd *mfd;
	struct proc *p = l->l_proc;
	const unsigned int flags = SCARG(uap, flags);

	/* Only close-on-exec/close-on-fork and sealing flags are known. */
	if (flags & ~(MFD_CLOEXEC|MFD_CLOFORK|MFD_ALLOW_SEALING))
		return EINVAL;

	mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP);
	mfd->mfd_size = 0;
	mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */

	/*
	 * Build the display name "memfd:<user-supplied name>" used by
	 * F_GETPATH.  The bounded copyinstr rejects over-long names.
	 */
	CTASSERT(sizeof(memfd_prefix) < NAME_MAX); /* sanity check */
	strcpy(mfd->mfd_name, memfd_prefix);
	error = copyinstr(SCARG(uap, name),
	    &mfd->mfd_name[sizeof(memfd_prefix) - 1],
	    sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL);
	if (error != 0)
		goto leave;

	getnanotime(&mfd->mfd_btime);

	/* Without MFD_ALLOW_SEALING no seals may ever be added. */
	if ((flags & MFD_ALLOW_SEALING) == 0)
		mfd->mfd_seals |= F_SEAL_SEAL;

	error = fd_allocfile(&fp, &fd);
	if (error != 0)
		goto leave;

	fp->f_flag = FREAD|FWRITE;
	fp->f_type = DTYPE_MEMFD;
	fp->f_ops = &memfd_fileops;
	fp->f_memfd = mfd;
	/* Set per-fd flags before fd_affix() publishes the descriptor. */
	fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0);
	fd_set_foclose(l, fd, (flags & MFD_CLOFORK) != 0);
	fd_affix(p, fp, fd);

	*retval = fd;
	return 0;

leave:
	/* Error path: release the backing object and the memfd record. */
	uao_detach(mfd->mfd_uobj);
	kmem_free(mfd, sizeof(*mfd));
	return error;
}
    142 
    143 static int
    144 memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
    145     int flags)
    146 {
    147 	int error;
    148 	vsize_t todo;
    149 	struct memfd *mfd = fp->f_memfd;
    150 
    151 	mutex_enter(&fp->f_lock);
    152 
    153 	if (*offp < 0) {
    154 		error = EINVAL;
    155 		goto leave;
    156 	}
    157 
    158 	/* Trying to read past the end does nothing. */
    159 	if (*offp >= mfd->mfd_size) {
    160 		error = 0;
    161 		goto leave;
    162 	}
    163 
    164 	uio->uio_offset = *offp;
    165 	todo = MIN(uio->uio_resid, mfd->mfd_size - *offp);
    166 	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
    167 	    UBC_READ|UBC_PARTIALOK);
    168 	if (flags & FOF_UPDATE_OFFSET)
    169 		*offp = uio->uio_offset;
    170 
    171 leave:
    172 	getnanotime(&mfd->mfd_atime);
    173 
    174 
    175 	mutex_exit(&fp->f_lock);
    176 
    177 	return error;
    178 }
    179 
    180 static int
    181 memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
    182     int flags)
    183 {
    184 	int error;
    185 	vsize_t todo;
    186 	struct memfd *mfd = fp->f_memfd;
    187 
    188 	mutex_enter(&fp->f_lock);
    189 
    190 	if (mfd->mfd_seals & F_SEAL_ANY_WRITE) {
    191 		error = EPERM;
    192 		goto leave;
    193 	}
    194 
    195 	if (*offp < 0) {
    196 		error = EINVAL;
    197 		goto leave;
    198 	}
    199 
    200 	uio->uio_offset = *offp;
    201 	todo = uio->uio_resid;
    202 
    203 	if (mfd->mfd_seals & F_SEAL_GROW) {
    204 		if (*offp >= mfd->mfd_size) {
    205 			error = EPERM;
    206 			goto leave;
    207 		}
    208 
    209 		/* Truncate the write to fit in mfd_size */
    210 		if (*offp + uio->uio_resid >= mfd->mfd_size)
    211 			todo = mfd->mfd_size - *offp;
    212 	} else if (*offp + uio->uio_resid >= mfd->mfd_size) {
    213 		/* Grow to accommodate the write request. */
    214 		error = memfd_truncate_locked(fp, *offp + uio->uio_resid);
    215 		if (error != 0)
    216 			goto leave;
    217 	}
    218 
    219 	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
    220 	    UBC_WRITE|UBC_PARTIALOK);
    221 	if (flags & FOF_UPDATE_OFFSET)
    222 		*offp = uio->uio_offset;
    223 
    224 	getnanotime(&mfd->mfd_mtime);
    225 
    226 leave:
    227 	mutex_exit(&fp->f_lock);
    228 
    229 	return error;
    230 }
    231 
    232 static int
    233 memfd_ioctl(file_t *fp, u_long cmd, void *data)
    234 {
    235 
    236 	return EINVAL;
    237 }
    238 
    239 static int
    240 memfd_fcntl(file_t *fp, u_int cmd, void *data)
    241 {
    242 	struct memfd *mfd = fp->f_memfd;
    243 	int error = 0;
    244 
    245 	switch (cmd) {
    246 	case F_GETPATH:
    247 		strncpy(data, mfd->mfd_name, MAXPATHLEN);
    248 		return 0;
    249 
    250 	case F_ADD_SEALS:
    251 		mutex_enter(&fp->f_lock);
    252 
    253 		if (mfd->mfd_seals & F_SEAL_SEAL) {
    254 		        error = EPERM;
    255 			goto leave_add_seals;
    256 		}
    257 
    258 		if (*(int *)data & ~MFD_KNOWN_SEALS) {
    259 		        error = EINVAL;
    260 			goto leave_add_seals;
    261 		}
    262 
    263 		/*
    264 		 * Can only add F_SEAL_WRITE if there are no currently
    265 		 * open mmaps.
    266 		 *
    267 		 * XXX should only disallow if there are no currently
    268 		 * open mmaps with PROT_WRITE.
    269 		 */
    270 		if ((mfd->mfd_seals & F_SEAL_WRITE) == 0 &&
    271 		    (*(int *)data & F_SEAL_WRITE) != 0 &&
    272 		    mfd->mfd_uobj->uo_refs > 1)
    273 		{
    274 			error = EBUSY;
    275 			goto leave_add_seals;
    276 		}
    277 
    278 		mfd->mfd_seals |= *(int *)data;
    279 
    280 	leave_add_seals:
    281 		mutex_exit(&fp->f_lock);
    282 		return error;
    283 
    284 	case F_GET_SEALS:
    285 		mutex_enter(&fp->f_lock);
    286 		*(int *)data = mfd->mfd_seals;
    287 		mutex_exit(&fp->f_lock);
    288 		return 0;
    289 
    290 	default:
    291 		return EINVAL;
    292 	}
    293 }
    294 
    295 static int
    296 memfd_stat(file_t *fp, struct stat *st)
    297 {
    298 	struct memfd *mfd = fp->f_memfd;
    299 
    300 	mutex_enter(&fp->f_lock);
    301 
    302 	memset(st, 0, sizeof(*st));
    303 	st->st_uid = kauth_cred_geteuid(fp->f_cred);
    304 	st->st_gid = kauth_cred_getegid(fp->f_cred);
    305 	st->st_size = mfd->mfd_size;
    306 
    307 	st->st_mode = S_IREAD;
    308 	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) == 0)
    309 		st->st_mode |= S_IWRITE;
    310 
    311 	st->st_birthtimespec = mfd->mfd_btime;
    312 	st->st_ctimespec = mfd->mfd_mtime;
    313 	st->st_atimespec = mfd->mfd_atime;
    314 	st->st_mtimespec = mfd->mfd_mtime;
    315 
    316 	mutex_exit(&fp->f_lock);
    317 
    318 	return 0;
    319 }
    320 
    321 static int
    322 memfd_close(file_t *fp)
    323 {
    324 	struct memfd *mfd = fp->f_memfd;
    325 
    326 	uao_detach(mfd->mfd_uobj);
    327 
    328 	kmem_free(mfd, sizeof(*mfd));
    329 	fp->f_memfd = NULL;
    330 
    331 	return 0;
    332 }
    333 
    334 static int
    335 memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
    336     int *advicep, struct uvm_object **uobjp, int *maxprotp)
    337 {
    338 	struct memfd *mfd = fp->f_memfd;
    339 	int error = 0;
    340 
    341 	/* uvm_mmap guarantees page-aligned offset and size.  */
    342 	KASSERT(*offp == round_page(*offp));
    343 	KASSERT(size == round_page(size));
    344 	KASSERT(size > 0);
    345 
    346 	mutex_enter(&fp->f_lock);
    347 
    348 	if (*offp < 0) {
    349 		error = EINVAL;
    350 		goto leave;
    351 	}
    352 	if (*offp + size > mfd->mfd_size) {
    353 		error = EINVAL;
    354 		goto leave;
    355 	}
    356 
    357 	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) &&
    358 	    (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0) {
    359 		error = EPERM;
    360 		goto leave;
    361 	}
    362 
    363 	uao_reference(fp->f_memfd->mfd_uobj);
    364 	*uobjp = fp->f_memfd->mfd_uobj;
    365 
    366 	*maxprotp = prot;
    367 	*advicep = UVM_ADV_RANDOM;
    368 
    369 leave:
    370 	mutex_exit(&fp->f_lock);
    371 
    372 	return error;
    373 }
    374 
    375 static int
    376 memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp,
    377     int flags)
    378 {
    379 	off_t newoff;
    380 	int error = 0;
    381 
    382 	mutex_enter(&fp->f_lock);
    383 
    384 	switch (whence) {
    385 	case SEEK_CUR:
    386 		newoff = fp->f_offset + delta;
    387 		break;
    388 
    389 	case SEEK_END:
    390 		newoff = fp->f_memfd->mfd_size + delta;
    391 		break;
    392 
    393 	case SEEK_SET:
    394 		newoff = delta;
    395 		break;
    396 
    397 	default:
    398 		error = EINVAL;
    399 		goto leave;
    400 	}
    401 
    402 	if (newoffp)
    403 		*newoffp = newoff;
    404 	if (flags & FOF_UPDATE_OFFSET)
    405 		fp->f_offset = newoff;
    406 
    407 leave:
    408 	mutex_exit(&fp->f_lock);
    409 
    410 	return error;
    411 }
    412 
/*
 * Resize the memfd to length with fp->f_lock held, honouring the
 * F_SEAL_SHRINK/F_SEAL_GROW seals.  Growth zero-fills the new range;
 * shrinking frees whole pages beyond the new end.  Updates mfd_mtime
 * and mfd_size (even if freeing pages reported an error).
 */
static int
memfd_truncate_locked(file_t *fp, off_t length)
{
	struct memfd *mfd = fp->f_memfd;
	voff_t start, end;
	int error = 0;

	KASSERT(mutex_owned(&fp->f_lock));

	if (length < 0)
		return EINVAL;
	if (length == mfd->mfd_size)
		return 0;	/* no size change */

	if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size)
		return EPERM;
	if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size)
		return EPERM;

	/* Growing: zero-fill the newly exposed range. */
	if (length > mfd->mfd_size)
		ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size,
		    length - mfd->mfd_size, 0);
	else {
		/* length < mfd->mfd_size, so try to get rid of excess pages */
		start = round_page(length);
		end = round_page(mfd->mfd_size);

		if (start < end) { /* we actually have pages to remove */
			rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER);
			error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj,
			    start, end, PGO_FREE);
			/* pgo_put drops vmobjlock */
		}
	}

	getnanotime(&mfd->mfd_mtime);
	mfd->mfd_size = length;

	return error;
}
    453 
    454 static int
    455 memfd_truncate(file_t *fp, off_t length)
    456 {
    457 	int error;
    458 
    459 	mutex_enter(&fp->f_lock);
    460 	error = memfd_truncate_locked(fp, length);
    461 	mutex_exit(&fp->f_lock);
    462 	return error;
    463 }
    464