1 1.12 gutterid /* $NetBSD: sys_memfd.c,v 1.12 2025/08/02 15:46:04 gutteridge Exp $ */ 2 1.1 christos 3 1.1 christos /*- 4 1.1 christos * Copyright (c) 2023 The NetBSD Foundation, Inc. 5 1.1 christos * All rights reserved. 6 1.1 christos * 7 1.1 christos * This code is derived from software contributed to The NetBSD Foundation 8 1.1 christos * by Theodore Preduta. 9 1.1 christos * 10 1.1 christos * Redistribution and use in source and binary forms, with or without 11 1.1 christos * modification, are permitted provided that the following conditions 12 1.1 christos * are met: 13 1.1 christos * 1. Redistributions of source code must retain the above copyright 14 1.1 christos * notice, this list of conditions and the following disclaimer. 15 1.1 christos * 2. Redistributions in binary form must reproduce the above copyright 16 1.1 christos * notice, this list of conditions and the following disclaimer in the 17 1.1 christos * documentation and/or other materials provided with the distribution. 18 1.1 christos * 19 1.1 christos * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 20 1.1 christos * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 21 1.1 christos * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 22 1.1 christos * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 23 1.1 christos * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 1.1 christos * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 1.1 christos * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 1.1 christos * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 1.1 christos * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 1.1 christos * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 1.1 christos * POSSIBILITY OF SUCH DAMAGE. 30 1.1 christos */ 31 1.1 christos 32 1.1 christos #include <sys/cdefs.h> 33 1.12 gutterid __KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.12 2025/08/02 15:46:04 gutteridge Exp $"); 34 1.1 christos 35 1.1 christos #include <sys/param.h> 36 1.1 christos #include <sys/types.h> 37 1.4 riastrad 38 1.1 christos #include <sys/fcntl.h> 39 1.1 christos #include <sys/file.h> 40 1.1 christos #include <sys/filedesc.h> 41 1.4 riastrad #include <sys/memfd.h> 42 1.1 christos #include <sys/mman.h> 43 1.1 christos #include <sys/syscallargs.h> 44 1.1 christos 45 1.1 christos #include <uvm/uvm_extern.h> 46 1.1 christos #include <uvm/uvm_object.h> 47 1.1 christos 48 1.1 christos #define F_SEAL_ANY_WRITE (F_SEAL_WRITE|F_SEAL_FUTURE_WRITE) 49 1.1 christos #define MFD_KNOWN_SEALS (F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \ 50 1.1 christos |F_SEAL_WRITE|F_SEAL_FUTURE_WRITE) 51 1.1 christos 52 1.1 christos static const char memfd_prefix[] = "memfd:"; 53 1.1 christos 54 1.1 christos static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int); 55 1.1 christos static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int); 56 1.1 christos static int memfd_ioctl(file_t *, u_long, void *); 57 1.1 christos static int memfd_fcntl(file_t *, u_int, void *); 58 1.1 christos static int memfd_stat(file_t *, struct stat *); 59 1.1 christos static int memfd_close(file_t *); 60 1.1 christos static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *, 61 1.1 christos struct uvm_object **, int *); 62 1.1 christos static int memfd_seek(file_t *, off_t, int, off_t *, int); 63 1.8 rin static int memfd_truncate_locked(file_t *, off_t); 64 1.1 christos static int memfd_truncate(file_t *, off_t); 65 1.1 christos 66 1.1 christos static const struct fileops memfd_fileops = { 67 1.1 christos .fo_name = "memfd", 68 1.1 christos .fo_read = memfd_read, 69 1.1 christos .fo_write = memfd_write, 70 1.1 christos .fo_ioctl = memfd_ioctl, 71 1.1 christos .fo_fcntl = memfd_fcntl, 72 1.1 christos .fo_poll = fnullop_poll, 73 1.1 christos .fo_stat = memfd_stat, 74 1.1 christos .fo_close = memfd_close, 75 1.1 christos .fo_kqfilter = fnullop_kqfilter, 76 1.1 christos .fo_restart = fnullop_restart, 77 1.1 christos .fo_mmap = memfd_mmap, 78 1.1 christos .fo_seek = memfd_seek, 79 1.1 christos .fo_fpathconf = (void *)eopnotsupp, 80 1.1 christos .fo_posix_fadvise = (void *)eopnotsupp, 81 1.1 christos .fo_truncate = memfd_truncate, 82 1.1 christos }; 83 1.1 christos 84 1.1 christos /* 85 1.12 gutterid * memfd_create(2). Create a file descriptor associated with anonymous 86 1.1 christos * memory. 87 1.1 christos */ 88 1.1 christos int 89 1.1 christos sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap, 90 1.1 christos register_t *retval) 91 1.1 christos { 92 1.1 christos /* { 93 1.1 christos syscallarg(const char *) name; 94 1.1 christos syscallarg(unsigned int) flags; 95 1.1 christos } */ 96 1.1 christos int error, fd; 97 1.1 christos file_t *fp; 98 1.1 christos struct memfd *mfd; 99 1.1 christos struct proc *p = l->l_proc; 100 1.1 christos const unsigned int flags = SCARG(uap, flags); 101 1.1 christos 102 1.1 christos if (flags & ~(MFD_CLOEXEC|MFD_ALLOW_SEALING)) 103 1.1 christos return EINVAL; 104 1.1 christos 105 1.1 christos mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP); 106 1.1 christos mfd->mfd_size = 0; 107 1.1 christos mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */ 108 1.1 christos 109 1.3 riastrad CTASSERT(sizeof(memfd_prefix) < NAME_MAX); /* sanity check */ 110 1.1 christos strcpy(mfd->mfd_name, memfd_prefix); 111 1.1 christos error = copyinstr(SCARG(uap, name), 112 1.1 christos &mfd->mfd_name[sizeof(memfd_prefix) - 1], 113 1.1 christos sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL); 114 1.1 christos if (error != 0) 115 1.1 christos goto leave; 116 1.1 christos 117 1.1 christos getnanotime(&mfd->mfd_btime); 118 1.1 christos 119 1.1 christos if ((flags & MFD_ALLOW_SEALING) == 0) 120 1.1 christos mfd->mfd_seals |= F_SEAL_SEAL; 121 1.1 christos 122 1.1 christos error = fd_allocfile(&fp, &fd); 123 1.1 christos if (error != 0) 124 1.1 christos goto leave; 125 1.1 christos 126 1.1 christos fp->f_flag = FREAD|FWRITE; 127 1.1 christos fp->f_type = DTYPE_MEMFD; 128 1.1 christos fp->f_ops = &memfd_fileops; 129 1.1 christos fp->f_memfd = mfd; 130 1.1 christos fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0); 131 1.1 christos fd_affix(p, fp, fd); 132 1.1 christos 133 1.1 christos *retval = fd; 134 1.1 christos return 0; 135 1.1 christos 136 1.1 christos leave: 137 1.1 christos uao_detach(mfd->mfd_uobj); 138 1.1 christos kmem_free(mfd, sizeof(*mfd)); 139 1.1 christos return error; 140 1.1 christos } 141 1.1 christos 142 1.1 christos static int 143 1.1 christos memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, 144 1.1 christos int flags) 145 1.1 christos { 146 1.1 christos int error; 147 1.1 christos vsize_t todo; 148 1.1 christos struct memfd *mfd = fp->f_memfd; 149 1.1 christos 150 1.6 christos mutex_enter(&fp->f_lock); 151 1.1 christos 152 1.1 christos if (*offp < 0) { 153 1.1 christos error = EINVAL; 154 1.1 christos goto leave; 155 1.1 christos } 156 1.1 christos 157 1.1 christos /* Trying to read past the end does nothing. */ 158 1.1 christos if (*offp >= mfd->mfd_size) { 159 1.1 christos error = 0; 160 1.1 christos goto leave; 161 1.1 christos } 162 1.1 christos 163 1.7 rin uio->uio_offset = *offp; 164 1.1 christos todo = MIN(uio->uio_resid, mfd->mfd_size - *offp); 165 1.1 christos error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL, 166 1.1 christos UBC_READ|UBC_PARTIALOK); 167 1.7 rin if (flags & FOF_UPDATE_OFFSET) 168 1.7 rin *offp = uio->uio_offset; 169 1.1 christos 170 1.1 christos leave: 171 1.6 christos getnanotime(&mfd->mfd_atime); 172 1.6 christos 173 1.1 christos 174 1.6 christos mutex_exit(&fp->f_lock); 175 1.1 christos 176 1.1 christos return error; 177 1.1 christos } 178 1.1 christos 179 1.1 christos static int 180 1.1 christos memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred, 181 1.1 christos int flags) 182 1.1 christos { 183 1.1 christos int error; 184 1.1 christos vsize_t todo; 185 1.1 christos struct memfd *mfd = fp->f_memfd; 186 1.1 christos 187 1.6 christos mutex_enter(&fp->f_lock); 188 1.1 christos 189 1.6 christos if (mfd->mfd_seals & F_SEAL_ANY_WRITE) { 190 1.6 christos error = EPERM; 191 1.6 christos goto leave; 192 1.6 christos } 193 1.1 christos 194 1.1 christos if (*offp < 0) { 195 1.1 christos error = EINVAL; 196 1.1 christos goto leave; 197 1.1 christos } 198 1.1 christos 199 1.1 christos uio->uio_offset = *offp; 200 1.1 christos todo = uio->uio_resid; 201 1.1 christos 202 1.1 christos if (mfd->mfd_seals & F_SEAL_GROW) { 203 1.1 christos if (*offp >= mfd->mfd_size) { 204 1.1 christos error = EPERM; 205 1.1 christos goto leave; 206 1.1 christos } 207 1.1 christos 208 1.1 christos /* Truncate the write to fit in mfd_size */ 209 1.1 christos if (*offp + uio->uio_resid >= mfd->mfd_size) 210 1.1 christos todo = mfd->mfd_size - *offp; 211 1.1 christos } else if (*offp + uio->uio_resid >= mfd->mfd_size) { 212 1.1 christos /* Grow to accommodate the write request. */ 213 1.8 rin error = memfd_truncate_locked(fp, *offp + uio->uio_resid); 214 1.1 christos if (error != 0) 215 1.1 christos goto leave; 216 1.1 christos } 217 1.1 christos 218 1.1 christos error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL, 219 1.1 christos UBC_WRITE|UBC_PARTIALOK); 220 1.6 christos if (flags & FOF_UPDATE_OFFSET) 221 1.6 christos *offp = uio->uio_offset; 222 1.1 christos 223 1.1 christos getnanotime(&mfd->mfd_mtime); 224 1.1 christos 225 1.1 christos leave: 226 1.6 christos mutex_exit(&fp->f_lock); 227 1.1 christos 228 1.1 christos return error; 229 1.1 christos } 230 1.1 christos 231 1.1 christos static int 232 1.1 christos memfd_ioctl(file_t *fp, u_long cmd, void *data) 233 1.1 christos { 234 1.1 christos 235 1.1 christos return EINVAL; 236 1.1 christos } 237 1.1 christos 238 1.1 christos static int 239 1.1 christos memfd_fcntl(file_t *fp, u_int cmd, void *data) 240 1.1 christos { 241 1.1 christos struct memfd *mfd = fp->f_memfd; 242 1.6 christos int error = 0; 243 1.1 christos 244 1.1 christos switch (cmd) { 245 1.9 christos case F_GETPATH: 246 1.11 christos strncpy(data, mfd->mfd_name, MAXPATHLEN); 247 1.9 christos return 0; 248 1.9 christos 249 1.1 christos case F_ADD_SEALS: 250 1.6 christos mutex_enter(&fp->f_lock); 251 1.1 christos 252 1.6 christos if (mfd->mfd_seals & F_SEAL_SEAL) { 253 1.6 christos error = EPERM; 254 1.6 christos goto leave_add_seals; 255 1.6 christos } 256 1.6 christos 257 1.6 christos if (*(int *)data & ~MFD_KNOWN_SEALS) { 258 1.6 christos error = EINVAL; 259 1.6 christos goto leave_add_seals; 260 1.6 christos } 261 1.1 christos 262 1.1 christos /* 263 1.1 christos * Can only add F_SEAL_WRITE if there are no currently 264 1.1 christos * open mmaps. 265 1.1 christos * 266 1.1 christos * XXX should only disallow if there are no currently 267 1.1 christos * open mmaps with PROT_WRITE. 268 1.1 christos */ 269 1.1 christos if ((mfd->mfd_seals & F_SEAL_WRITE) == 0 && 270 1.1 christos (*(int *)data & F_SEAL_WRITE) != 0 && 271 1.1 christos mfd->mfd_uobj->uo_refs > 1) 272 1.6 christos { 273 1.6 christos error = EBUSY; 274 1.6 christos goto leave_add_seals; 275 1.6 christos } 276 1.1 christos 277 1.1 christos mfd->mfd_seals |= *(int *)data; 278 1.6 christos 279 1.6 christos leave_add_seals: 280 1.6 christos mutex_exit(&fp->f_lock); 281 1.6 christos return error; 282 1.1 christos 283 1.1 christos case F_GET_SEALS: 284 1.6 christos mutex_enter(&fp->f_lock); 285 1.1 christos *(int *)data = mfd->mfd_seals; 286 1.6 christos mutex_exit(&fp->f_lock); 287 1.1 christos return 0; 288 1.1 christos 289 1.1 christos default: 290 1.1 christos return EINVAL; 291 1.1 christos } 292 1.1 christos } 293 1.1 christos 294 1.1 christos static int 295 1.1 christos memfd_stat(file_t *fp, struct stat *st) 296 1.1 christos { 297 1.1 christos struct memfd *mfd = fp->f_memfd; 298 1.1 christos 299 1.6 christos mutex_enter(&fp->f_lock); 300 1.6 christos 301 1.1 christos memset(st, 0, sizeof(*st)); 302 1.1 christos st->st_uid = kauth_cred_geteuid(fp->f_cred); 303 1.1 christos st->st_gid = kauth_cred_getegid(fp->f_cred); 304 1.1 christos st->st_size = mfd->mfd_size; 305 1.1 christos 306 1.1 christos st->st_mode = S_IREAD; 307 1.1 christos if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) == 0) 308 1.1 christos st->st_mode |= S_IWRITE; 309 1.1 christos 310 1.1 christos st->st_birthtimespec = mfd->mfd_btime; 311 1.1 christos st->st_ctimespec = mfd->mfd_mtime; 312 1.1 christos st->st_atimespec = mfd->mfd_atime; 313 1.1 christos st->st_mtimespec = mfd->mfd_mtime; 314 1.1 christos 315 1.6 christos mutex_exit(&fp->f_lock); 316 1.6 christos 317 1.1 christos return 0; 318 1.1 christos } 319 1.1 christos 320 1.1 christos static int 321 1.1 christos memfd_close(file_t *fp) 322 1.1 christos { 323 1.1 christos struct memfd *mfd = fp->f_memfd; 324 1.1 christos 325 1.1 christos uao_detach(mfd->mfd_uobj); 326 1.1 christos 327 1.1 christos kmem_free(mfd, sizeof(*mfd)); 328 1.1 christos fp->f_memfd = NULL; 329 1.1 christos 330 1.1 christos return 0; 331 1.1 christos } 332 1.1 christos 333 1.1 christos static int 334 1.1 christos memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp, 335 1.1 christos int *advicep, struct uvm_object **uobjp, int *maxprotp) 336 1.1 christos { 337 1.1 christos struct memfd *mfd = fp->f_memfd; 338 1.6 christos int error = 0; 339 1.1 christos 340 1.1 christos /* uvm_mmap guarantees page-aligned offset and size. */ 341 1.1 christos KASSERT(*offp == round_page(*offp)); 342 1.1 christos KASSERT(size == round_page(size)); 343 1.1 christos KASSERT(size > 0); 344 1.1 christos 345 1.6 christos mutex_enter(&fp->f_lock); 346 1.6 christos 347 1.6 christos if (*offp < 0) { 348 1.6 christos error = EINVAL; 349 1.6 christos goto leave; 350 1.6 christos } 351 1.6 christos if (*offp + size > mfd->mfd_size) { 352 1.6 christos error = EINVAL; 353 1.6 christos goto leave; 354 1.6 christos } 355 1.1 christos 356 1.1 christos if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) && 357 1.6 christos (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0) { 358 1.6 christos error = EPERM; 359 1.6 christos goto leave; 360 1.6 christos } 361 1.1 christos 362 1.1 christos uao_reference(fp->f_memfd->mfd_uobj); 363 1.1 christos *uobjp = fp->f_memfd->mfd_uobj; 364 1.1 christos 365 1.1 christos *maxprotp = prot; 366 1.1 christos *advicep = UVM_ADV_RANDOM; 367 1.1 christos 368 1.6 christos leave: 369 1.6 christos mutex_exit(&fp->f_lock); 370 1.6 christos 371 1.6 christos return error; 372 1.1 christos } 373 1.1 christos 374 1.1 christos static int 375 1.1 christos memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp, 376 1.1 christos int flags) 377 1.1 christos { 378 1.1 christos off_t newoff; 379 1.6 christos int error = 0; 380 1.6 christos 381 1.6 christos mutex_enter(&fp->f_lock); 382 1.1 christos 383 1.1 christos switch (whence) { 384 1.1 christos case SEEK_CUR: 385 1.1 christos newoff = fp->f_offset + delta; 386 1.1 christos break; 387 1.1 christos 388 1.1 christos case SEEK_END: 389 1.1 christos newoff = fp->f_memfd->mfd_size + delta; 390 1.1 christos break; 391 1.1 christos 392 1.1 christos case SEEK_SET: 393 1.1 christos newoff = delta; 394 1.1 christos break; 395 1.1 christos 396 1.1 christos default: 397 1.1 christos error = EINVAL; 398 1.6 christos goto leave; 399 1.1 christos } 400 1.1 christos 401 1.1 christos if (newoffp) 402 1.1 christos *newoffp = newoff; 403 1.1 christos if (flags & FOF_UPDATE_OFFSET) 404 1.1 christos fp->f_offset = newoff; 405 1.1 christos 406 1.6 christos leave: 407 1.6 christos mutex_exit(&fp->f_lock); 408 1.6 christos 409 1.6 christos return error; 410 1.1 christos } 411 1.1 christos 412 1.1 christos static int 413 1.8 rin memfd_truncate_locked(file_t *fp, off_t length) 414 1.1 christos { 415 1.1 christos struct memfd *mfd = fp->f_memfd; 416 1.6 christos voff_t start, end; 417 1.1 christos int error = 0; 418 1.6 christos 419 1.6 christos KASSERT(mutex_owned(&fp->f_lock)); 420 1.1 christos 421 1.1 christos if (length < 0) 422 1.1 christos return EINVAL; 423 1.1 christos if (length == mfd->mfd_size) 424 1.1 christos return 0; 425 1.1 christos 426 1.1 christos if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size) 427 1.1 christos return EPERM; 428 1.1 christos if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size) 429 1.1 christos return EPERM; 430 1.1 christos 431 1.1 christos if (length > mfd->mfd_size) 432 1.1 christos ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size, 433 1.1 christos length - mfd->mfd_size, 0); 434 1.1 christos else { 435 1.1 christos /* length < mfd->mfd_size, so try to get rid of excess pages */ 436 1.1 christos start = round_page(length); 437 1.1 christos end = round_page(mfd->mfd_size); 438 1.1 christos 439 1.1 christos if (start < end) { /* we actually have pages to remove */ 440 1.1 christos rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER); 441 1.1 christos error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj, 442 1.1 christos start, end, PGO_FREE); 443 1.1 christos /* pgo_put drops vmobjlock */ 444 1.1 christos } 445 1.1 christos } 446 1.1 christos 447 1.1 christos getnanotime(&mfd->mfd_mtime); 448 1.1 christos mfd->mfd_size = length; 449 1.6 christos 450 1.6 christos return error; 451 1.6 christos } 452 1.6 christos 453 1.6 christos static int 454 1.6 christos memfd_truncate(file_t *fp, off_t length) 455 1.6 christos { 456 1.6 christos int error; 457 1.6 christos 458 1.6 christos mutex_enter(&fp->f_lock); 459 1.8 rin error = memfd_truncate_locked(fp, length); 460 1.6 christos mutex_exit(&fp->f_lock); 461 1.1 christos return error; 462 1.1 christos } 463