/* $NetBSD: sys_memfd.c,v 1.13 2025/11/15 19:02:26 gutteridge Exp $ */

/*-
 * Copyright (c) 2023 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Theodore Preduta.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.13 2025/11/15 19:02:26 gutteridge Exp $");

#include <sys/param.h>
#include <sys/types.h>

#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/memfd.h>
#include <sys/mman.h>
#include <sys/syscallargs.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>

/* Seals that forbid any write access through this descriptor. */
#define F_SEAL_ANY_WRITE	(F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
/* Every seal bit this implementation understands; others are EINVAL. */
#define MFD_KNOWN_SEALS		(F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \
				|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)

/* Prefix prepended to the user-supplied name, visible via F_GETPATH. */
static const char memfd_prefix[] = "memfd:";

static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int memfd_ioctl(file_t *, u_long, void *);
static int memfd_fcntl(file_t *, u_int, void *);
static int memfd_stat(file_t *, struct stat *);
static int memfd_close(file_t *);
static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *,
    struct uvm_object **, int *);
static int memfd_seek(file_t *, off_t, int, off_t *, int);
static int memfd_truncate_locked(file_t *, off_t);
static int memfd_truncate(file_t *, off_t);

/* File operations vector installed on every memfd descriptor. */
static const struct fileops memfd_fileops = {
	.fo_name = "memfd",
	.fo_read = memfd_read,
	.fo_write = memfd_write,
	.fo_ioctl = memfd_ioctl,
	.fo_fcntl = memfd_fcntl,
	.fo_poll = fnullop_poll,
	.fo_stat = memfd_stat,
	.fo_close = memfd_close,
	.fo_kqfilter = fnullop_kqfilter,
	.fo_restart = fnullop_restart,
	.fo_mmap = memfd_mmap,
	.fo_seek = memfd_seek,
	.fo_fpathconf = (void *)eopnotsupp,
	.fo_posix_fadvise = (void *)eopnotsupp,
	.fo_truncate = memfd_truncate,
};

/*
 * memfd_create(2).  Create a file descriptor associated with anonymous
 * memory.
 */
int
sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const char *) name;
		syscallarg(unsigned int) flags;
	} */
	int error, fd;
	file_t *fp;
	struct memfd *mfd;
	struct proc *p = l->l_proc;
	const unsigned int flags = SCARG(uap, flags);

	/* Reject any flag bits we do not implement. */
	if (flags & ~(MFD_CLOEXEC|MFD_CLOFORK|MFD_ALLOW_SEALING))
		return EINVAL;

	mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP);
	mfd->mfd_size = 0;
	mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */

	/*
	 * The stored name is "memfd:" followed by the user string; the
	 * copyinstr bound keeps prefix + name + NUL within mfd_name.
	 */
	CTASSERT(sizeof(memfd_prefix) < NAME_MAX); /* sanity check */
	strcpy(mfd->mfd_name, memfd_prefix);
	error = copyinstr(SCARG(uap, name),
	    &mfd->mfd_name[sizeof(memfd_prefix) - 1],
	    sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL);
	if (error != 0)
		goto leave;

	getnanotime(&mfd->mfd_btime);

	/* Unless sealing was requested, permanently forbid adding seals. */
	if ((flags & MFD_ALLOW_SEALING) == 0)
		mfd->mfd_seals |= F_SEAL_SEAL;

	error = fd_allocfile(&fp, &fd);
	if (error != 0)
		goto leave;

	fp->f_flag = FREAD|FWRITE;
	fp->f_type = DTYPE_MEMFD;
	fp->f_ops = &memfd_fileops;
	fp->f_memfd = mfd;
	fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0);
	fd_set_foclose(l, fd, (flags & MFD_CLOFORK) != 0);
	fd_affix(p, fp, fd);

	*retval = fd;
	return 0;

leave:
	/* Error path: release the UVM object and the memfd itself. */
	uao_detach(mfd->mfd_uobj);
	kmem_free(mfd, sizeof(*mfd));
	return error;
}

/*
 * fo_read: copy out up to uio_resid bytes from the backing object,
 * serialized by fp->f_lock.
 */
static int
memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	int error;
	vsize_t todo;
	struct memfd *mfd = fp->f_memfd;

	mutex_enter(&fp->f_lock);

	if (*offp < 0) {
		error = EINVAL;
		goto leave;
	}

	/* Trying to read past the end does nothing.
	 */
	if (*offp >= mfd->mfd_size) {
		error = 0;
		goto leave;
	}

	uio->uio_offset = *offp;
	/* Clamp the transfer to the bytes actually present. */
	todo = MIN(uio->uio_resid, mfd->mfd_size - *offp);
	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
	    UBC_READ|UBC_PARTIALOK);
	if (flags & FOF_UPDATE_OFFSET)
		*offp = uio->uio_offset;

leave:
	/* atime is touched on every attempt, including error returns. */
	getnanotime(&mfd->mfd_atime);

	mutex_exit(&fp->f_lock);

	return error;
}

/*
 * fo_write: copy in data at *offp, growing the memfd when needed and
 * permitted by the seals.  Serialized by fp->f_lock.
 */
static int
memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
    int flags)
{
	int error;
	vsize_t todo;
	struct memfd *mfd = fp->f_memfd;

	mutex_enter(&fp->f_lock);

	/* Either write seal forbids writing through the descriptor. */
	if (mfd->mfd_seals & F_SEAL_ANY_WRITE) {
		error = EPERM;
		goto leave;
	}

	if (*offp < 0) {
		error = EINVAL;
		goto leave;
	}

	uio->uio_offset = *offp;
	todo = uio->uio_resid;

	if (mfd->mfd_seals & F_SEAL_GROW) {
		/* Sealed against growth: writes may not start at/past EOF. */
		if (*offp >= mfd->mfd_size) {
			error = EPERM;
			goto leave;
		}

		/* Truncate the write to fit in mfd_size */
		if (*offp + uio->uio_resid >= mfd->mfd_size)
			todo = mfd->mfd_size - *offp;
	} else if (*offp + uio->uio_resid >= mfd->mfd_size) {
		/* Grow to accommodate the write request.
		 */
		error = memfd_truncate_locked(fp, *offp + uio->uio_resid);
		if (error != 0)
			goto leave;
	}

	error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
	    UBC_WRITE|UBC_PARTIALOK);
	if (flags & FOF_UPDATE_OFFSET)
		*offp = uio->uio_offset;

	getnanotime(&mfd->mfd_mtime);

leave:
	mutex_exit(&fp->f_lock);

	return error;
}

/*
 * fo_ioctl: no ioctls are supported on memfd descriptors.
 */
static int
memfd_ioctl(file_t *fp, u_long cmd, void *data)
{

	return EINVAL;
}

/*
 * fo_fcntl: F_GETPATH plus the sealing operations F_ADD_SEALS and
 * F_GET_SEALS.
 */
static int
memfd_fcntl(file_t *fp, u_int cmd, void *data)
{
	struct memfd *mfd = fp->f_memfd;
	int error = 0;

	switch (cmd) {
	case F_GETPATH:
		strncpy(data, mfd->mfd_name, MAXPATHLEN);
		return 0;

	case F_ADD_SEALS:
		mutex_enter(&fp->f_lock);

		/* Once F_SEAL_SEAL is set, the seal set is immutable. */
		if (mfd->mfd_seals & F_SEAL_SEAL) {
			error = EPERM;
			goto leave_add_seals;
		}

		if (*(int *)data & ~MFD_KNOWN_SEALS) {
			error = EINVAL;
			goto leave_add_seals;
		}

		/*
		 * Can only add F_SEAL_WRITE if there are no currently
		 * open mmaps.
		 *
		 * XXX should only disallow if there are no currently
		 * open mmaps with PROT_WRITE.
		 */
		if ((mfd->mfd_seals & F_SEAL_WRITE) == 0 &&
		    (*(int *)data & F_SEAL_WRITE) != 0 &&
		    mfd->mfd_uobj->uo_refs > 1)
		{
			/* uo_refs > 1 means a mapping still holds a ref. */
			error = EBUSY;
			goto leave_add_seals;
		}

		mfd->mfd_seals |= *(int *)data;

leave_add_seals:
		mutex_exit(&fp->f_lock);
		return error;

	case F_GET_SEALS:
		mutex_enter(&fp->f_lock);
		*(int *)data = mfd->mfd_seals;
		mutex_exit(&fp->f_lock);
		return 0;

	default:
		return EINVAL;
	}
}

/*
 * fo_stat: minimal stat(2) — owner, size, mode and timestamps; every
 * other field in *st is zeroed.
 */
static int
memfd_stat(file_t *fp, struct stat *st)
{
	struct memfd *mfd = fp->f_memfd;

	mutex_enter(&fp->f_lock);

	memset(st, 0, sizeof(*st));
	st->st_uid = kauth_cred_geteuid(fp->f_cred);
	st->st_gid = kauth_cred_getegid(fp->f_cred);
	st->st_size = mfd->mfd_size;

	/* Report write permission only while no write seal is set. */
	st->st_mode = S_IREAD;
	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) == 0)
		st->st_mode |= S_IWRITE;

	st->st_birthtimespec = mfd->mfd_btime;
	st->st_ctimespec = mfd->mfd_mtime;	/* no separate ctime is kept */
	st->st_atimespec = mfd->mfd_atime;
	st->st_mtimespec = mfd->mfd_mtime;

	mutex_exit(&fp->f_lock);

	return 0;
}

/*
 * fo_close: drop our reference on the anonymous UVM object and free
 * the memfd.  Runs once, on final close of the descriptor.
 */
static int
memfd_close(file_t *fp)
{
	struct memfd *mfd = fp->f_memfd;

	uao_detach(mfd->mfd_uobj);

	kmem_free(mfd, sizeof(*mfd));
	fp->f_memfd = NULL;

	return 0;
}

/*
 * fo_mmap: hand out a referenced uvm_object for the requested range,
 * enforcing the write seals for shared writable mappings.
 */
static int
memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
    int *advicep, struct uvm_object **uobjp, int *maxprotp)
{
	struct memfd *mfd = fp->f_memfd;
	int error = 0;

	/* uvm_mmap guarantees page-aligned offset and size.
	 */
	KASSERT(*offp == round_page(*offp));
	KASSERT(size == round_page(size));
	KASSERT(size > 0);

	mutex_enter(&fp->f_lock);

	if (*offp < 0) {
		error = EINVAL;
		goto leave;
	}
	/* The mapping must lie entirely within the current size. */
	if (*offp + size > mfd->mfd_size) {
		error = EINVAL;
		goto leave;
	}

	/*
	 * A write seal forbids new shared writable mappings; private
	 * (copy-on-write) mappings remain allowed.
	 */
	if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) &&
	    (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0) {
		error = EPERM;
		goto leave;
	}

	/* Caller receives its own reference on the backing object. */
	uao_reference(fp->f_memfd->mfd_uobj);
	*uobjp = fp->f_memfd->mfd_uobj;

	*maxprotp = prot;
	*advicep = UVM_ADV_RANDOM;

leave:
	mutex_exit(&fp->f_lock);

	return error;
}

/*
 * fo_seek: compute (and optionally commit) a new file offset.
 */
static int
memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp,
    int flags)
{
	off_t newoff;
	int error = 0;

	mutex_enter(&fp->f_lock);

	switch (whence) {
	case SEEK_CUR:
		newoff = fp->f_offset + delta;
		break;

	case SEEK_END:
		newoff = fp->f_memfd->mfd_size + delta;
		break;

	case SEEK_SET:
		newoff = delta;
		break;

	default:
		error = EINVAL;
		goto leave;
	}

	/*
	 * NOTE(review): newoff is not rejected when negative here;
	 * memfd_read/memfd_write return EINVAL for negative offsets
	 * later — confirm this is the intended division of checks.
	 */
	if (newoffp)
		*newoffp = newoff;
	if (flags & FOF_UPDATE_OFFSET)
		fp->f_offset = newoff;

leave:
	mutex_exit(&fp->f_lock);

	return error;
}

/*
 * Resize the memfd to length, honouring the shrink/grow seals.  Grows
 * are zero-filled; shrinks free the now-excess pages.  Caller must
 * hold fp->f_lock.
 */
static int
memfd_truncate_locked(file_t *fp, off_t length)
{
	struct memfd *mfd = fp->f_memfd;
	voff_t start, end;
	int error = 0;

	KASSERT(mutex_owned(&fp->f_lock));

	if (length < 0)
		return EINVAL;
	if (length == mfd->mfd_size)
		return 0;

	if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size)
		return EPERM;
	if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size)
		return EPERM;

	if (length > mfd->mfd_size)
		ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size,
		    length - mfd->mfd_size, 0);
	else {
		/* length < mfd->mfd_size, so try to get rid of excess pages */
		start = round_page(length);
		end =
		    round_page(mfd->mfd_size);

		if (start < end) { /* we actually have pages to remove */
			rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER);
			error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj,
			    start, end, PGO_FREE);
			/* pgo_put drops vmobjlock */
		}
	}

	/*
	 * NOTE(review): mtime and mfd_size are updated even when pgo_put
	 * returned an error above — confirm this is intended.
	 */
	getnanotime(&mfd->mfd_mtime);
	mfd->mfd_size = length;

	return error;
}

/*
 * fo_truncate: ftruncate(2) entry point; takes fp->f_lock around the
 * locked helper.
 */
static int
memfd_truncate(file_t *fp, off_t length)
{
	int error;

	mutex_enter(&fp->f_lock);
	error = memfd_truncate_locked(fp, length);
	mutex_exit(&fp->f_lock);
	return error;
}