/*	$NetBSD: sys_memfd.c,v 1.1 2023/07/10 02:31:55 christos Exp $	*/

/*-
 * Copyright (c) 2023 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Theodore Preduta.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.1 2023/07/10 02:31:55 christos Exp $");

#include <sys/param.h>
#include <sys/types.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/memfd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/syscallargs.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>

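/*
 * F_SEAL_ANY_WRITE groups the two seals that forbid modifying the file's
 * contents; MFD_KNOWN_SEALS is the full set accepted by F_ADD_SEALS.
 */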
#define F_SEAL_ANY_WRITE	(F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
#define MFD_KNOWN_SEALS		(F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \
				|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)

static const char memfd_prefix[] = "memfd:";

static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int memfd_ioctl(file_t *, u_long, void *);
static int memfd_fcntl(file_t *, u_int, void *);
static int memfd_stat(file_t *, struct stat *);
static int memfd_close(file_t *);
static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *,
    struct uvm_object **, int *);
static int memfd_seek(file_t *, off_t, int, off_t *, int);
static int memfd_truncate(file_t *, off_t);

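/*
 * File operations for DTYPE_MEMFD descriptors.  Operations that do not
 * apply are stubbed out with fnullop_*()/eopnotsupp rather than left NULL.
 */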
static const struct fileops memfd_fileops = {
	.fo_name = "memfd",
	.fo_read = memfd_read,
	.fo_write = memfd_write,
	.fo_ioctl = memfd_ioctl,
	.fo_fcntl = memfd_fcntl,
	.fo_poll = fnullop_poll,
	.fo_stat = memfd_stat,
	.fo_close = memfd_close,
	.fo_kqfilter = fnullop_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = memfd_mmap,
	.fo_seek = memfd_seek,
	.fo_fpathconf = (void *)eopnotsupp,
	.fo_posix_fadvise = (void *)eopnotsupp,
	.fo_truncate = memfd_truncate,
};

/*
 * memfd_create(2).  Create a file descriptor associated with anonymous
 * memory.
 */
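/*
 * Illustrative userland usage (a sketch, assuming the memfd_create(2)
 * prototype exported through <sys/mman.h>):
 *
 *	int fd = memfd_create("buf", MFD_CLOEXEC|MFD_ALLOW_SEALING);
 *	if (fd == -1)
 *		err(EXIT_FAILURE, "memfd_create");
 *	if (ftruncate(fd, 4096) == -1)
 *		err(EXIT_FAILURE, "ftruncate");
 *	void *p = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
 *	if (fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK|F_SEAL_GROW) == -1)
 *		err(EXIT_FAILURE, "F_ADD_SEALS");
 */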
int
sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const char *) name;
		syscallarg(unsigned int) flags;
	} */
	int error, fd;
	file_t *fp;
	struct memfd *mfd;
	struct proc *p = l->l_proc;
	const unsigned int flags = SCARG(uap, flags);

	KASSERT(NAME_MAX - sizeof(memfd_prefix) > 0); /* sanity check */

	if (flags & ~(MFD_CLOEXEC|MFD_ALLOW_SEALING))
		return EINVAL;

	mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP);
	mfd->mfd_size = 0;
	mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */
	mutex_init(&mfd->mfd_lock, MUTEX_DEFAULT, IPL_NONE);

	strcpy(mfd->mfd_name, memfd_prefix);
	error = copyinstr(SCARG(uap, name),
	    &mfd->mfd_name[sizeof(memfd_prefix) - 1],
	    sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL);
	if (error != 0)
		goto leave;

	getnanotime(&mfd->mfd_btime);

	if ((flags & MFD_ALLOW_SEALING) == 0)
		mfd->mfd_seals |= F_SEAL_SEAL;

	error = fd_allocfile(&fp, &fd);
	if (error != 0)
		goto leave;

	fp->f_flag = FREAD|FWRITE;
	fp->f_type = DTYPE_MEMFD;
	fp->f_ops = &memfd_fileops;
	fp->f_memfd = mfd;
	fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0);
	fd_affix(p, fp, fd);

	*retval = fd;
	return 0;

leave:
	uao_detach(mfd->mfd_uobj);
	kmem_free(mfd, sizeof(*mfd));
	return error;
}

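/*
 * Read from the anonymous memory backing the descriptor.  An offp equal to
 * &fp->f_offset means the request came through read(2), so updates to the
 * shared file offset are serialised with f_lock; a positional read such as
 * pread(2) supplies its own offset instead.
 */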
static int
memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	int error;
	vsize_t todo;
	struct memfd *mfd = fp->f_memfd;

	if (offp == &fp->f_offset)
		mutex_enter(&fp->f_lock);

	if (*offp < 0) {
		error = EINVAL;
		goto leave;
	}

	/* Trying to read past the end does nothing. */
	if (*offp >= mfd->mfd_size) {
		error = 0;
		goto leave;
	}

	uio->uio_offset = *offp;
	todo = MIN(uio->uio_resid, mfd->mfd_size - *offp);
	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
	    UBC_READ|UBC_PARTIALOK);

leave:
	if (offp == &fp->f_offset)
		mutex_exit(&fp->f_lock);

	getnanotime(&mfd->mfd_atime);

	return error;
}

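/*
 * Write to the backing memory.  Writes are refused outright once a write
 * seal is set; with F_SEAL_GROW the write is clamped to the current size,
 * otherwise the object is grown to fit the request.
 */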
static int
memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	int error;
	vsize_t todo;
	struct memfd *mfd = fp->f_memfd;

	if (mfd->mfd_seals & F_SEAL_ANY_WRITE)
		return EPERM;

	if (offp == &fp->f_offset)
		mutex_enter(&fp->f_lock);

	if (*offp < 0) {
		error = EINVAL;
		goto leave;
	}

	uio->uio_offset = *offp;
	todo = uio->uio_resid;

	if (mfd->mfd_seals & F_SEAL_GROW) {
		if (*offp >= mfd->mfd_size) {
			error = EPERM;
			goto leave;
		}

		/* Truncate the write to fit in mfd_size */
		if (*offp + uio->uio_resid >= mfd->mfd_size)
			todo = mfd->mfd_size - *offp;
	} else if (*offp + uio->uio_resid >= mfd->mfd_size) {
		/* Grow to accommodate the write request. */
		error = memfd_truncate(fp, *offp + uio->uio_resid);
		if (error != 0)
			goto leave;
	}

	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
	    UBC_WRITE|UBC_PARTIALOK);

	getnanotime(&mfd->mfd_mtime);

leave:
	if (offp == &fp->f_offset)
		mutex_exit(&fp->f_lock);

	return error;
}

static int
memfd_ioctl(file_t *fp, u_long cmd, void *data)
{

	return EINVAL;
}

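/*
 * Handle the seal-related fcntl(2) commands, F_ADD_SEALS and F_GET_SEALS.
 * All other commands are rejected with EINVAL.
 */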
static int
memfd_fcntl(file_t *fp, u_int cmd, void *data)
{
	struct memfd *mfd = fp->f_memfd;

	switch (cmd) {
	case F_ADD_SEALS:
		if (mfd->mfd_seals & F_SEAL_SEAL)
			return EPERM;

		if (*(int *)data & ~MFD_KNOWN_SEALS)
			return EINVAL;

		/*
		 * Can only add F_SEAL_WRITE if there are no currently
		 * open mmaps.
		 *
		 * XXX should only disallow if there are no currently
		 * open mmaps with PROT_WRITE.
		 */
		if ((mfd->mfd_seals & F_SEAL_WRITE) == 0 &&
		    (*(int *)data & F_SEAL_WRITE) != 0 &&
		    mfd->mfd_uobj->uo_refs > 1)
			return EBUSY;

		mfd->mfd_seals |= *(int *)data;
		return 0;

	case F_GET_SEALS:
		*(int *)data = mfd->mfd_seals;
		return 0;

	default:
		return EINVAL;
	}
}

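/*
 * Fake up just enough of struct stat for fstat(2): owner, size, timestamps,
 * and a mode whose write bit reflects the write seals.
 */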
static int
memfd_stat(file_t *fp, struct stat *st)
{
	struct memfd *mfd = fp->f_memfd;

	memset(st, 0, sizeof(*st));
	st->st_uid = kauth_cred_geteuid(fp->f_cred);
	st->st_gid = kauth_cred_getegid(fp->f_cred);
	st->st_size = mfd->mfd_size;

	st->st_mode = S_IREAD;
	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) == 0)
		st->st_mode |= S_IWRITE;

	st->st_birthtimespec = mfd->mfd_btime;
	st->st_ctimespec = mfd->mfd_mtime;
	st->st_atimespec = mfd->mfd_atime;
	st->st_mtimespec = mfd->mfd_mtime;

	return 0;
}

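/*
 * Release the backing UVM anonymous object and the memfd state when the
 * last reference to the descriptor goes away.
 */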
static int
memfd_close(file_t *fp)
{
	struct memfd *mfd = fp->f_memfd;

	uao_detach(mfd->mfd_uobj);
	mutex_destroy(&mfd->mfd_lock);

	kmem_free(mfd, sizeof(*mfd));
	fp->f_memfd = NULL;

	return 0;
}

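/*
 * Hand the backing uvm_object to uvm_mmap().  Shared, writable mappings are
 * refused once a write seal is in place, and the mapping must lie entirely
 * within the current size.
 */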
static int
memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct memfd *mfd = fp->f_memfd;

	/* uvm_mmap guarantees page-aligned offset and size.  */
	KASSERT(*offp == round_page(*offp));
	KASSERT(size == round_page(size));
	KASSERT(size > 0);

	if (*offp < 0)
		return EINVAL;
	if (*offp + size > mfd->mfd_size)
		return EINVAL;

	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) &&
	    (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0)
		return EPERM;

	uao_reference(fp->f_memfd->mfd_uobj);
	*uobjp = fp->f_memfd->mfd_uobj;

	*maxprotp = prot;
	*advicep = UVM_ADV_RANDOM;

	return 0;
}

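/*
 * Implement lseek(2) for memfds; SEEK_END is taken relative to the current
 * mfd_size.
 */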
static int
memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp,
    int flags)
{
	off_t newoff;
	int error;

	switch (whence) {
	case SEEK_CUR:
		newoff = fp->f_offset + delta;
		break;

	case SEEK_END:
		newoff = fp->f_memfd->mfd_size + delta;
		break;

	case SEEK_SET:
		newoff = delta;
		break;

	default:
		error = EINVAL;
		return error;
	}

	if (newoffp)
		*newoffp = newoff;
	if (flags & FOF_UPDATE_OFFSET)
		fp->f_offset = newoff;

	return 0;
}

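/*
 * Grow or shrink the object, subject to the F_SEAL_GROW/F_SEAL_SHRINK
 * seals.  Growing zero-fills the new range; shrinking frees any whole
 * pages past the new length.
 */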
static int
memfd_truncate(file_t *fp, off_t length)
{
	struct memfd *mfd = fp->f_memfd;
	int error = 0;
	voff_t start, end;

	if (length < 0)
		return EINVAL;
	if (length == mfd->mfd_size)
		return 0;

	if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size)
		return EPERM;
	if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size)
		return EPERM;

	mutex_enter(&mfd->mfd_lock);

	if (length > mfd->mfd_size)
		ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size,
		    length - mfd->mfd_size, 0);
	else {
		/* length < mfd->mfd_size, so try to get rid of excess pages */
		start = round_page(length);
		end = round_page(mfd->mfd_size);

		if (start < end) { /* we actually have pages to remove */
			rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER);
			error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj,
			    start, end, PGO_FREE);
			/* pgo_put drops vmobjlock */
		}
	}

	getnanotime(&mfd->mfd_mtime);
	mfd->mfd_size = length;
	mutex_exit(&mfd->mfd_lock);
	return error;
}