sys_memfd.c revision 1.2 1 /* $NetBSD: sys_memfd.c,v 1.2 2023/07/10 15:49:18 christos Exp $ */
2
3 /*-
4 * Copyright (c) 2023 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Theodore Preduta.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.2 2023/07/10 15:49:18 christos Exp $");
34
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/fcntl.h>
38 #include <sys/file.h>
39 #include <sys/filedesc.h>
40 #include <sys/mman.h>
41 #include <sys/miscfd.h>
42 #include <sys/syscallargs.h>
43
44 #include <uvm/uvm_extern.h>
45 #include <uvm/uvm_object.h>
46
/* Either of the seals that makes memfd_write() and shared writable mmaps fail. */
#define	F_SEAL_ANY_WRITE	(F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)

/* Every seal bit accepted by F_ADD_SEALS; any other bit yields EINVAL. */
#define	MFD_KNOWN_SEALS		(F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \
				    |F_SEAL_ANY_WRITE)

/* Fixed prefix stored in mfd_name ahead of the user-supplied name. */
static const char memfd_prefix[] = "memfd:";
52
53 static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
54 static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
55 static int memfd_ioctl(file_t *, u_long, void *);
56 static int memfd_fcntl(file_t *, u_int, void *);
57 static int memfd_stat(file_t *, struct stat *);
58 static int memfd_close(file_t *);
59 static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *,
60 struct uvm_object **, int *);
61 static int memfd_seek(file_t *, off_t, int, off_t *, int);
62 static int memfd_truncate(file_t *, off_t);
63
64 static const struct fileops memfd_fileops = {
65 .fo_name = "memfd",
66 .fo_read = memfd_read,
67 .fo_write = memfd_write,
68 .fo_ioctl = memfd_ioctl,
69 .fo_fcntl = memfd_fcntl,
70 .fo_poll = fnullop_poll,
71 .fo_stat = memfd_stat,
72 .fo_close = memfd_close,
73 .fo_kqfilter = fnullop_kqfilter,
74 .fo_restart = fnullop_restart,
75 .fo_mmap = memfd_mmap,
76 .fo_seek = memfd_seek,
77 .fo_fpathconf = (void *)eopnotsupp,
78 .fo_posix_fadvise = (void *)eopnotsupp,
79 .fo_truncate = memfd_truncate,
80 };
81
82 /*
83 * memfd_create(2). Creat a file descriptor associated with anonymous
84 * memory.
85 */
86 int
87 sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap,
88 register_t *retval)
89 {
90 /* {
91 syscallarg(const char *) name;
92 syscallarg(unsigned int) flags;
93 } */
94 int error, fd;
95 file_t *fp;
96 struct memfd *mfd;
97 struct proc *p = l->l_proc;
98 const unsigned int flags = SCARG(uap, flags);
99
100 KASSERT(NAME_MAX - sizeof(memfd_prefix) > 0); /* sanity check */
101
102 if (flags & ~(MFD_CLOEXEC|MFD_ALLOW_SEALING))
103 return EINVAL;
104
105 mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP);
106 mfd->mfd_size = 0;
107 mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */
108 mutex_init(&mfd->mfd_lock, MUTEX_DEFAULT, IPL_NONE);
109
110 strcpy(mfd->mfd_name, memfd_prefix);
111 error = copyinstr(SCARG(uap, name),
112 &mfd->mfd_name[sizeof(memfd_prefix) - 1],
113 sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL);
114 if (error != 0)
115 goto leave;
116
117 getnanotime(&mfd->mfd_btime);
118
119 if ((flags & MFD_ALLOW_SEALING) == 0)
120 mfd->mfd_seals |= F_SEAL_SEAL;
121
122 error = fd_allocfile(&fp, &fd);
123 if (error != 0)
124 goto leave;
125
126 fp->f_flag = FREAD|FWRITE;
127 fp->f_type = DTYPE_MEMFD;
128 fp->f_ops = &memfd_fileops;
129 fp->f_memfd = mfd;
130 fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0);
131 fd_affix(p, fp, fd);
132
133 *retval = fd;
134 return 0;
135
136 leave:
137 uao_detach(mfd->mfd_uobj);
138 kmem_free(mfd, sizeof(*mfd));
139 return error;
140 }
141
142 static int
143 memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
144 int flags)
145 {
146 int error;
147 vsize_t todo;
148 struct memfd *mfd = fp->f_memfd;
149
150 if (offp == &fp->f_offset)
151 mutex_enter(&fp->f_lock);
152
153 if (*offp < 0) {
154 error = EINVAL;
155 goto leave;
156 }
157
158 /* Trying to read past the end does nothing. */
159 if (*offp >= mfd->mfd_size) {
160 error = 0;
161 goto leave;
162 }
163
164 uio->uio_offset = *offp;
165 todo = MIN(uio->uio_resid, mfd->mfd_size - *offp);
166 error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
167 UBC_READ|UBC_PARTIALOK);
168
169 leave:
170 if (offp == &fp->f_offset)
171 mutex_exit(&fp->f_lock);
172
173 getnanotime(&mfd->mfd_atime);
174
175 return error;
176 }
177
178 static int
179 memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
180 int flags)
181 {
182 int error;
183 vsize_t todo;
184 struct memfd *mfd = fp->f_memfd;
185
186 if (mfd->mfd_seals & F_SEAL_ANY_WRITE)
187 return EPERM;
188
189 if (offp == &fp->f_offset)
190 mutex_enter(&fp->f_lock);
191
192 if (*offp < 0) {
193 error = EINVAL;
194 goto leave;
195 }
196
197 uio->uio_offset = *offp;
198 todo = uio->uio_resid;
199
200 if (mfd->mfd_seals & F_SEAL_GROW) {
201 if (*offp >= mfd->mfd_size) {
202 error = EPERM;
203 goto leave;
204 }
205
206 /* Truncate the write to fit in mfd_size */
207 if (*offp + uio->uio_resid >= mfd->mfd_size)
208 todo = mfd->mfd_size - *offp;
209 } else if (*offp + uio->uio_resid >= mfd->mfd_size) {
210 /* Grow to accommodate the write request. */
211 error = memfd_truncate(fp, *offp + uio->uio_resid);
212 if (error != 0)
213 goto leave;
214 }
215
216 error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
217 UBC_WRITE|UBC_PARTIALOK);
218
219 getnanotime(&mfd->mfd_mtime);
220
221 leave:
222 if (offp == &fp->f_offset)
223 mutex_exit(&fp->f_lock);
224
225 return error;
226 }
227
228 static int
229 memfd_ioctl(file_t *fp, u_long cmd, void *data)
230 {
231
232 return EINVAL;
233 }
234
235 static int
236 memfd_fcntl(file_t *fp, u_int cmd, void *data)
237 {
238 struct memfd *mfd = fp->f_memfd;
239
240 switch (cmd) {
241 case F_ADD_SEALS:
242 if (mfd->mfd_seals & F_SEAL_SEAL)
243 return EPERM;
244
245 if (*(int *)data & ~MFD_KNOWN_SEALS)
246 return EINVAL;
247
248 /*
249 * Can only add F_SEAL_WRITE if there are no currently
250 * open mmaps.
251 *
252 * XXX should only disallow if there are no currently
253 * open mmaps with PROT_WRITE.
254 */
255 if ((mfd->mfd_seals & F_SEAL_WRITE) == 0 &&
256 (*(int *)data & F_SEAL_WRITE) != 0 &&
257 mfd->mfd_uobj->uo_refs > 1)
258 return EBUSY;
259
260 mfd->mfd_seals |= *(int *)data;
261 return 0;
262
263 case F_GET_SEALS:
264 *(int *)data = mfd->mfd_seals;
265 return 0;
266
267 default:
268 return EINVAL;
269 }
270 }
271
272 static int
273 memfd_stat(file_t *fp, struct stat *st)
274 {
275 struct memfd *mfd = fp->f_memfd;
276
277 memset(st, 0, sizeof(*st));
278 st->st_uid = kauth_cred_geteuid(fp->f_cred);
279 st->st_gid = kauth_cred_getegid(fp->f_cred);
280 st->st_size = mfd->mfd_size;
281
282 st->st_mode = S_IREAD;
283 if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) == 0)
284 st->st_mode |= S_IWRITE;
285
286 st->st_birthtimespec = mfd->mfd_btime;
287 st->st_ctimespec = mfd->mfd_mtime;
288 st->st_atimespec = mfd->mfd_atime;
289 st->st_mtimespec = mfd->mfd_mtime;
290
291 return 0;
292 }
293
294 static int
295 memfd_close(file_t *fp)
296 {
297 struct memfd *mfd = fp->f_memfd;
298
299 uao_detach(mfd->mfd_uobj);
300 mutex_destroy(&mfd->mfd_lock);
301
302 kmem_free(mfd, sizeof(*mfd));
303 fp->f_memfd = NULL;
304
305 return 0;
306 }
307
308 static int
309 memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
310 int *advicep, struct uvm_object **uobjp, int *maxprotp)
311 {
312 struct memfd *mfd = fp->f_memfd;
313
314 /* uvm_mmap guarantees page-aligned offset and size. */
315 KASSERT(*offp == round_page(*offp));
316 KASSERT(size == round_page(size));
317 KASSERT(size > 0);
318
319 if (*offp < 0)
320 return EINVAL;
321 if (*offp + size > mfd->mfd_size)
322 return EINVAL;
323
324 if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) &&
325 (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0)
326 return EPERM;
327
328 uao_reference(fp->f_memfd->mfd_uobj);
329 *uobjp = fp->f_memfd->mfd_uobj;
330
331 *maxprotp = prot;
332 *advicep = UVM_ADV_RANDOM;
333
334 return 0;
335 }
336
337 static int
338 memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp,
339 int flags)
340 {
341 off_t newoff;
342 int error;
343
344 switch (whence) {
345 case SEEK_CUR:
346 newoff = fp->f_offset + delta;
347 break;
348
349 case SEEK_END:
350 newoff = fp->f_memfd->mfd_size + delta;
351 break;
352
353 case SEEK_SET:
354 newoff = delta;
355 break;
356
357 default:
358 error = EINVAL;
359 return error;
360 }
361
362 if (newoffp)
363 *newoffp = newoff;
364 if (flags & FOF_UPDATE_OFFSET)
365 fp->f_offset = newoff;
366
367 return 0;
368 }
369
370 static int
371 memfd_truncate(file_t *fp, off_t length)
372 {
373 struct memfd *mfd = fp->f_memfd;
374 int error = 0;
375 voff_t start, end;
376
377 if (length < 0)
378 return EINVAL;
379 if (length == mfd->mfd_size)
380 return 0;
381
382 if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size)
383 return EPERM;
384 if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size)
385 return EPERM;
386
387 mutex_enter(&mfd->mfd_lock);
388
389 if (length > mfd->mfd_size)
390 ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size,
391 length - mfd->mfd_size, 0);
392 else {
393 /* length < mfd->mfd_size, so try to get rid of excess pages */
394 start = round_page(length);
395 end = round_page(mfd->mfd_size);
396
397 if (start < end) { /* we actually have pages to remove */
398 rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER);
399 error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj,
400 start, end, PGO_FREE);
401 /* pgo_put drops vmobjlock */
402 }
403 }
404
405 getnanotime(&mfd->mfd_mtime);
406 mfd->mfd_size = length;
407 mutex_exit(&mfd->mfd_lock);
408 return error;
409 }
410