Home | History | Annotate | Line # | Download | only in kern
sys_memfd.c revision 1.2
      1  1.2  christos /*	$NetBSD: sys_memfd.c,v 1.2 2023/07/10 15:49:18 christos Exp $	*/
      2  1.1  christos 
      3  1.1  christos /*-
      4  1.1  christos  * Copyright (c) 2023 The NetBSD Foundation, Inc.
      5  1.1  christos  * All rights reserved.
      6  1.1  christos  *
      7  1.1  christos  * This code is derived from software contributed to The NetBSD Foundation
      8  1.1  christos  * by Theodore Preduta.
      9  1.1  christos  *
     10  1.1  christos  * Redistribution and use in source and binary forms, with or without
     11  1.1  christos  * modification, are permitted provided that the following conditions
     12  1.1  christos  * are met:
     13  1.1  christos  * 1. Redistributions of source code must retain the above copyright
     14  1.1  christos  *    notice, this list of conditions and the following disclaimer.
     15  1.1  christos  * 2. Redistributions in binary form must reproduce the above copyright
     16  1.1  christos  *    notice, this list of conditions and the following disclaimer in the
     17  1.1  christos  *    documentation and/or other materials provided with the distribution.
     18  1.1  christos  *
     19  1.1  christos  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
     20  1.1  christos  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     21  1.1  christos  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     22  1.1  christos  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
     23  1.1  christos  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     24  1.1  christos  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     25  1.1  christos  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     26  1.1  christos  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     27  1.1  christos  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     28  1.1  christos  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     29  1.1  christos  * POSSIBILITY OF SUCH DAMAGE.
     30  1.1  christos  */
     31  1.1  christos 
     32  1.1  christos #include <sys/cdefs.h>
     33  1.2  christos __KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.2 2023/07/10 15:49:18 christos Exp $");
     34  1.1  christos 
     35  1.1  christos #include <sys/param.h>
     36  1.1  christos #include <sys/types.h>
     37  1.1  christos #include <sys/fcntl.h>
     38  1.1  christos #include <sys/file.h>
     39  1.1  christos #include <sys/filedesc.h>
     40  1.1  christos #include <sys/mman.h>
     41  1.2  christos #include <sys/miscfd.h>
     42  1.1  christos #include <sys/syscallargs.h>
     43  1.1  christos 
     44  1.1  christos #include <uvm/uvm_extern.h>
     45  1.1  christos #include <uvm/uvm_object.h>
     46  1.1  christos 
     47  1.1  christos #define F_SEAL_ANY_WRITE	(F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
     48  1.1  christos #define MFD_KNOWN_SEALS		(F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \
     49  1.1  christos 				|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
     50  1.1  christos 
     51  1.1  christos static const char memfd_prefix[] = "memfd:";
     52  1.1  christos 
     53  1.1  christos static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
     54  1.1  christos static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
     55  1.1  christos static int memfd_ioctl(file_t *, u_long, void *);
     56  1.1  christos static int memfd_fcntl(file_t *, u_int, void *);
     57  1.1  christos static int memfd_stat(file_t *, struct stat *);
     58  1.1  christos static int memfd_close(file_t *);
     59  1.1  christos static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *,
     60  1.1  christos     struct uvm_object **, int *);
     61  1.1  christos static int memfd_seek(file_t *, off_t, int, off_t *, int);
     62  1.1  christos static int memfd_truncate(file_t *, off_t);
     63  1.1  christos 
     64  1.1  christos static const struct fileops memfd_fileops = {
     65  1.1  christos 	.fo_name = "memfd",
     66  1.1  christos 	.fo_read = memfd_read,
     67  1.1  christos 	.fo_write = memfd_write,
     68  1.1  christos 	.fo_ioctl = memfd_ioctl,
     69  1.1  christos 	.fo_fcntl = memfd_fcntl,
     70  1.1  christos 	.fo_poll = fnullop_poll,
     71  1.1  christos 	.fo_stat = memfd_stat,
     72  1.1  christos 	.fo_close = memfd_close,
     73  1.1  christos 	.fo_kqfilter = fnullop_kqfilter,
     74  1.1  christos 	.fo_restart = fnullop_restart,
     75  1.1  christos 	.fo_mmap = memfd_mmap,
     76  1.1  christos 	.fo_seek = memfd_seek,
     77  1.1  christos 	.fo_fpathconf = (void *)eopnotsupp,
     78  1.1  christos 	.fo_posix_fadvise = (void *)eopnotsupp,
     79  1.1  christos 	.fo_truncate = memfd_truncate,
     80  1.1  christos };
     81  1.1  christos 
     82  1.1  christos /*
     83  1.1  christos  * memfd_create(2).  Creat a file descriptor associated with anonymous
     84  1.1  christos  * memory.
     85  1.1  christos  */
     86  1.1  christos int
     87  1.1  christos sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap,
     88  1.1  christos     register_t *retval)
     89  1.1  christos {
     90  1.1  christos 	/* {
     91  1.1  christos 		syscallarg(const char *) name;
     92  1.1  christos 		syscallarg(unsigned int) flags;
     93  1.1  christos 	} */
     94  1.1  christos 	int error, fd;
     95  1.1  christos 	file_t *fp;
     96  1.1  christos 	struct memfd *mfd;
     97  1.1  christos 	struct proc *p = l->l_proc;
     98  1.1  christos 	const unsigned int flags = SCARG(uap, flags);
     99  1.1  christos 
    100  1.1  christos 	KASSERT(NAME_MAX - sizeof(memfd_prefix) > 0); /* sanity check */
    101  1.1  christos 
    102  1.1  christos 	if (flags & ~(MFD_CLOEXEC|MFD_ALLOW_SEALING))
    103  1.1  christos 		return EINVAL;
    104  1.1  christos 
    105  1.1  christos 	mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP);
    106  1.1  christos 	mfd->mfd_size = 0;
    107  1.1  christos 	mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */
    108  1.1  christos 	mutex_init(&mfd->mfd_lock, MUTEX_DEFAULT, IPL_NONE);
    109  1.1  christos 
    110  1.1  christos 	strcpy(mfd->mfd_name, memfd_prefix);
    111  1.1  christos 	error = copyinstr(SCARG(uap, name),
    112  1.1  christos 	    &mfd->mfd_name[sizeof(memfd_prefix) - 1],
    113  1.1  christos 	    sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL);
    114  1.1  christos 	if (error != 0)
    115  1.1  christos  		goto leave;
    116  1.1  christos 
    117  1.1  christos 	getnanotime(&mfd->mfd_btime);
    118  1.1  christos 
    119  1.1  christos 	if ((flags & MFD_ALLOW_SEALING) == 0)
    120  1.1  christos 		mfd->mfd_seals |= F_SEAL_SEAL;
    121  1.1  christos 
    122  1.1  christos 	error = fd_allocfile(&fp, &fd);
    123  1.1  christos 	if (error != 0)
    124  1.1  christos 		goto leave;
    125  1.1  christos 
    126  1.1  christos 	fp->f_flag = FREAD|FWRITE;
    127  1.1  christos 	fp->f_type = DTYPE_MEMFD;
    128  1.1  christos 	fp->f_ops = &memfd_fileops;
    129  1.1  christos 	fp->f_memfd = mfd;
    130  1.1  christos 	fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0);
    131  1.1  christos 	fd_affix(p, fp, fd);
    132  1.1  christos 
    133  1.1  christos 	*retval = fd;
    134  1.1  christos 	return 0;
    135  1.1  christos 
    136  1.1  christos leave:
    137  1.1  christos 	uao_detach(mfd->mfd_uobj);
    138  1.1  christos 	kmem_free(mfd, sizeof(*mfd));
    139  1.1  christos 	return error;
    140  1.1  christos }
    141  1.1  christos 
    142  1.1  christos static int
    143  1.1  christos memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
    144  1.1  christos     int flags)
    145  1.1  christos {
    146  1.1  christos 	int error;
    147  1.1  christos 	vsize_t todo;
    148  1.1  christos 	struct memfd *mfd = fp->f_memfd;
    149  1.1  christos 
    150  1.1  christos 	if (offp == &fp->f_offset)
    151  1.1  christos 		mutex_enter(&fp->f_lock);
    152  1.1  christos 
    153  1.1  christos 	if (*offp < 0) {
    154  1.1  christos 		error = EINVAL;
    155  1.1  christos 		goto leave;
    156  1.1  christos 	}
    157  1.1  christos 
    158  1.1  christos 	/* Trying to read past the end does nothing. */
    159  1.1  christos 	if (*offp >= mfd->mfd_size) {
    160  1.1  christos 		error = 0;
    161  1.1  christos 		goto leave;
    162  1.1  christos 	}
    163  1.1  christos 
    164  1.1  christos 	uio->uio_offset = *offp;
    165  1.1  christos 	todo = MIN(uio->uio_resid, mfd->mfd_size - *offp);
    166  1.1  christos 	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
    167  1.1  christos 	    UBC_READ|UBC_PARTIALOK);
    168  1.1  christos 
    169  1.1  christos leave:
    170  1.1  christos 	if (offp == &fp->f_offset)
    171  1.1  christos 		mutex_exit(&fp->f_lock);
    172  1.1  christos 
    173  1.1  christos 	getnanotime(&mfd->mfd_atime);
    174  1.1  christos 
    175  1.1  christos 	return error;
    176  1.1  christos }
    177  1.1  christos 
    178  1.1  christos static int
    179  1.1  christos memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
    180  1.1  christos     int flags)
    181  1.1  christos {
    182  1.1  christos 	int error;
    183  1.1  christos 	vsize_t todo;
    184  1.1  christos 	struct memfd *mfd = fp->f_memfd;
    185  1.1  christos 
    186  1.1  christos 	if (mfd->mfd_seals & F_SEAL_ANY_WRITE)
    187  1.1  christos 		return EPERM;
    188  1.1  christos 
    189  1.1  christos 	if (offp == &fp->f_offset)
    190  1.1  christos 		mutex_enter(&fp->f_lock);
    191  1.1  christos 
    192  1.1  christos 	if (*offp < 0) {
    193  1.1  christos 		error = EINVAL;
    194  1.1  christos 		goto leave;
    195  1.1  christos 	}
    196  1.1  christos 
    197  1.1  christos 	uio->uio_offset = *offp;
    198  1.1  christos 	todo = uio->uio_resid;
    199  1.1  christos 
    200  1.1  christos 	if (mfd->mfd_seals & F_SEAL_GROW) {
    201  1.1  christos 		if (*offp >= mfd->mfd_size) {
    202  1.1  christos 			error = EPERM;
    203  1.1  christos 			goto leave;
    204  1.1  christos 		}
    205  1.1  christos 
    206  1.1  christos 		/* Truncate the write to fit in mfd_size */
    207  1.1  christos 		if (*offp + uio->uio_resid >= mfd->mfd_size)
    208  1.1  christos 			todo = mfd->mfd_size - *offp;
    209  1.1  christos 	} else if (*offp + uio->uio_resid >= mfd->mfd_size) {
    210  1.1  christos 		/* Grow to accommodate the write request. */
    211  1.1  christos 		error = memfd_truncate(fp, *offp + uio->uio_resid);
    212  1.1  christos 		if (error != 0)
    213  1.1  christos 			goto leave;
    214  1.1  christos 	}
    215  1.1  christos 
    216  1.1  christos 	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
    217  1.1  christos 	    UBC_WRITE|UBC_PARTIALOK);
    218  1.1  christos 
    219  1.1  christos 	getnanotime(&mfd->mfd_mtime);
    220  1.1  christos 
    221  1.1  christos leave:
    222  1.1  christos 	if (offp == &fp->f_offset)
    223  1.1  christos 		mutex_exit(&fp->f_lock);
    224  1.1  christos 
    225  1.1  christos 	return error;
    226  1.1  christos }
    227  1.1  christos 
    228  1.1  christos static int
    229  1.1  christos memfd_ioctl(file_t *fp, u_long cmd, void *data)
    230  1.1  christos {
    231  1.1  christos 
    232  1.1  christos 	return EINVAL;
    233  1.1  christos }
    234  1.1  christos 
    235  1.1  christos static int
    236  1.1  christos memfd_fcntl(file_t *fp, u_int cmd, void *data)
    237  1.1  christos {
    238  1.1  christos 	struct memfd *mfd = fp->f_memfd;
    239  1.1  christos 
    240  1.1  christos 	switch (cmd) {
    241  1.1  christos 	case F_ADD_SEALS:
    242  1.1  christos 		if (mfd->mfd_seals & F_SEAL_SEAL)
    243  1.1  christos 			return EPERM;
    244  1.1  christos 
    245  1.1  christos 		if (*(int *)data & ~MFD_KNOWN_SEALS)
    246  1.1  christos 		        return EINVAL;
    247  1.1  christos 
    248  1.1  christos 		/*
    249  1.1  christos 		 * Can only add F_SEAL_WRITE if there are no currently
    250  1.1  christos 		 * open mmaps.
    251  1.1  christos 		 *
    252  1.1  christos 		 * XXX should only disallow if there are no currently
    253  1.1  christos 		 * open mmaps with PROT_WRITE.
    254  1.1  christos 		 */
    255  1.1  christos 		if ((mfd->mfd_seals & F_SEAL_WRITE) == 0 &&
    256  1.1  christos 		    (*(int *)data & F_SEAL_WRITE) != 0 &&
    257  1.1  christos 		    mfd->mfd_uobj->uo_refs > 1)
    258  1.1  christos 			return EBUSY;
    259  1.1  christos 
    260  1.1  christos 		mfd->mfd_seals |= *(int *)data;
    261  1.1  christos 		return 0;
    262  1.1  christos 
    263  1.1  christos 	case F_GET_SEALS:
    264  1.1  christos 		*(int *)data = mfd->mfd_seals;
    265  1.1  christos 		return 0;
    266  1.1  christos 
    267  1.1  christos 	default:
    268  1.1  christos 		return EINVAL;
    269  1.1  christos 	}
    270  1.1  christos }
    271  1.1  christos 
    272  1.1  christos static int
    273  1.1  christos memfd_stat(file_t *fp, struct stat *st)
    274  1.1  christos {
    275  1.1  christos 	struct memfd *mfd = fp->f_memfd;
    276  1.1  christos 
    277  1.1  christos 	memset(st, 0, sizeof(*st));
    278  1.1  christos 	st->st_uid = kauth_cred_geteuid(fp->f_cred);
    279  1.1  christos 	st->st_gid = kauth_cred_getegid(fp->f_cred);
    280  1.1  christos 	st->st_size = mfd->mfd_size;
    281  1.1  christos 
    282  1.1  christos 	st->st_mode = S_IREAD;
    283  1.1  christos 	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) == 0)
    284  1.1  christos 		st->st_mode |= S_IWRITE;
    285  1.1  christos 
    286  1.1  christos 	st->st_birthtimespec = mfd->mfd_btime;
    287  1.1  christos 	st->st_ctimespec = mfd->mfd_mtime;
    288  1.1  christos 	st->st_atimespec = mfd->mfd_atime;
    289  1.1  christos 	st->st_mtimespec = mfd->mfd_mtime;
    290  1.1  christos 
    291  1.1  christos 	return 0;
    292  1.1  christos }
    293  1.1  christos 
    294  1.1  christos static int
    295  1.1  christos memfd_close(file_t *fp)
    296  1.1  christos {
    297  1.1  christos 	struct memfd *mfd = fp->f_memfd;
    298  1.1  christos 
    299  1.1  christos 	uao_detach(mfd->mfd_uobj);
    300  1.1  christos 	mutex_destroy(&mfd->mfd_lock);
    301  1.1  christos 
    302  1.1  christos 	kmem_free(mfd, sizeof(*mfd));
    303  1.1  christos 	fp->f_memfd = NULL;
    304  1.1  christos 
    305  1.1  christos 	return 0;
    306  1.1  christos }
    307  1.1  christos 
    308  1.1  christos static int
    309  1.1  christos memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
    310  1.1  christos     int *advicep, struct uvm_object **uobjp, int *maxprotp)
    311  1.1  christos {
    312  1.1  christos 	struct memfd *mfd = fp->f_memfd;
    313  1.1  christos 
    314  1.1  christos 	/* uvm_mmap guarantees page-aligned offset and size.  */
    315  1.1  christos 	KASSERT(*offp == round_page(*offp));
    316  1.1  christos 	KASSERT(size == round_page(size));
    317  1.1  christos 	KASSERT(size > 0);
    318  1.1  christos 
    319  1.1  christos 	if (*offp < 0)
    320  1.1  christos 		return EINVAL;
    321  1.1  christos 	if (*offp + size > mfd->mfd_size)
    322  1.1  christos 		return EINVAL;
    323  1.1  christos 
    324  1.1  christos 	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) &&
    325  1.1  christos 	    (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0)
    326  1.1  christos 		return EPERM;
    327  1.1  christos 
    328  1.1  christos 	uao_reference(fp->f_memfd->mfd_uobj);
    329  1.1  christos 	*uobjp = fp->f_memfd->mfd_uobj;
    330  1.1  christos 
    331  1.1  christos 	*maxprotp = prot;
    332  1.1  christos 	*advicep = UVM_ADV_RANDOM;
    333  1.1  christos 
    334  1.1  christos 	return 0;
    335  1.1  christos }
    336  1.1  christos 
    337  1.1  christos static int
    338  1.1  christos memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp,
    339  1.1  christos     int flags)
    340  1.1  christos {
    341  1.1  christos 	off_t newoff;
    342  1.1  christos 	int error;
    343  1.1  christos 
    344  1.1  christos 	switch (whence) {
    345  1.1  christos 	case SEEK_CUR:
    346  1.1  christos 		newoff = fp->f_offset + delta;
    347  1.1  christos 		break;
    348  1.1  christos 
    349  1.1  christos 	case SEEK_END:
    350  1.1  christos 		newoff = fp->f_memfd->mfd_size + delta;
    351  1.1  christos 		break;
    352  1.1  christos 
    353  1.1  christos 	case SEEK_SET:
    354  1.1  christos 		newoff = delta;
    355  1.1  christos 		break;
    356  1.1  christos 
    357  1.1  christos 	default:
    358  1.1  christos 		error = EINVAL;
    359  1.1  christos 		return error;
    360  1.1  christos 	}
    361  1.1  christos 
    362  1.1  christos 	if (newoffp)
    363  1.1  christos 		*newoffp = newoff;
    364  1.1  christos 	if (flags & FOF_UPDATE_OFFSET)
    365  1.1  christos 		fp->f_offset = newoff;
    366  1.1  christos 
    367  1.1  christos 	return 0;
    368  1.1  christos }
    369  1.1  christos 
    370  1.1  christos static int
    371  1.1  christos memfd_truncate(file_t *fp, off_t length)
    372  1.1  christos {
    373  1.1  christos 	struct memfd *mfd = fp->f_memfd;
    374  1.1  christos 	int error = 0;
    375  1.1  christos 	voff_t start, end;
    376  1.1  christos 
    377  1.1  christos 	if (length < 0)
    378  1.1  christos 		return EINVAL;
    379  1.1  christos 	if (length == mfd->mfd_size)
    380  1.1  christos 		return 0;
    381  1.1  christos 
    382  1.1  christos 	if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size)
    383  1.1  christos 		return EPERM;
    384  1.1  christos 	if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size)
    385  1.1  christos 		return EPERM;
    386  1.1  christos 
    387  1.1  christos 	mutex_enter(&mfd->mfd_lock);
    388  1.1  christos 
    389  1.1  christos 	if (length > mfd->mfd_size)
    390  1.1  christos 		ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size,
    391  1.1  christos 		    length - mfd->mfd_size, 0);
    392  1.1  christos 	else {
    393  1.1  christos 		/* length < mfd->mfd_size, so try to get rid of excess pages */
    394  1.1  christos 		start = round_page(length);
    395  1.1  christos 		end = round_page(mfd->mfd_size);
    396  1.1  christos 
    397  1.1  christos 		if (start < end) { /* we actually have pages to remove */
    398  1.1  christos 			rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER);
    399  1.1  christos 			error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj,
    400  1.1  christos 			    start, end, PGO_FREE);
    401  1.1  christos 			/* pgo_put drops vmobjlock */
    402  1.1  christos 		}
    403  1.1  christos 	}
    404  1.1  christos 
    405  1.1  christos 	getnanotime(&mfd->mfd_mtime);
    406  1.1  christos 	mfd->mfd_size = length;
    407  1.1  christos 	mutex_exit(&mfd->mfd_lock);
    408  1.1  christos 	return error;
    409  1.1  christos }
    410