/* $NetBSD: sys_memfd.c,v 1.1 2023/07/10 02:31:55 christos Exp $ */
2
3 /*-
4 * Copyright (c) 2023 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Theodore Preduta.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_memfd.c,v 1.1 2023/07/10 02:31:55 christos Exp $");
34
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/fcntl.h>
38 #include <sys/file.h>
39 #include <sys/filedesc.h>
40 #include <sys/mman.h>
41 #include <sys/syscallargs.h>
42
43 #include <uvm/uvm_extern.h>
44 #include <uvm/uvm_object.h>
45
46 #define F_SEAL_ANY_WRITE (F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
47 #define MFD_KNOWN_SEALS (F_SEAL_SEAL|F_SEAL_SHRINK|F_SEAL_GROW \
48 |F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
49
50 static const char memfd_prefix[] = "memfd:";
51
52 static int memfd_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
53 static int memfd_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
54 static int memfd_ioctl(file_t *, u_long, void *);
55 static int memfd_fcntl(file_t *, u_int, void *);
56 static int memfd_stat(file_t *, struct stat *);
57 static int memfd_close(file_t *);
58 static int memfd_mmap(file_t *, off_t *, size_t, int, int *, int *,
59 struct uvm_object **, int *);
60 static int memfd_seek(file_t *, off_t, int, off_t *, int);
61 static int memfd_truncate(file_t *, off_t);
62
63 static const struct fileops memfd_fileops = {
64 .fo_name = "memfd",
65 .fo_read = memfd_read,
66 .fo_write = memfd_write,
67 .fo_ioctl = memfd_ioctl,
68 .fo_fcntl = memfd_fcntl,
69 .fo_poll = fnullop_poll,
70 .fo_stat = memfd_stat,
71 .fo_close = memfd_close,
72 .fo_kqfilter = fnullop_kqfilter,
73 .fo_restart = fnullop_restart,
74 .fo_mmap = memfd_mmap,
75 .fo_seek = memfd_seek,
76 .fo_fpathconf = (void *)eopnotsupp,
77 .fo_posix_fadvise = (void *)eopnotsupp,
78 .fo_truncate = memfd_truncate,
79 };
80
/*
 * memfd_create(2).  Create a file descriptor associated with anonymous
 * memory.
 */
85 int
86 sys_memfd_create(struct lwp *l, const struct sys_memfd_create_args *uap,
87 register_t *retval)
88 {
89 /* {
90 syscallarg(const char *) name;
91 syscallarg(unsigned int) flags;
92 } */
93 int error, fd;
94 file_t *fp;
95 struct memfd *mfd;
96 struct proc *p = l->l_proc;
97 const unsigned int flags = SCARG(uap, flags);
98
99 KASSERT(NAME_MAX - sizeof(memfd_prefix) > 0); /* sanity check */
100
101 if (flags & ~(MFD_CLOEXEC|MFD_ALLOW_SEALING))
102 return EINVAL;
103
104 mfd = kmem_zalloc(sizeof(*mfd), KM_SLEEP);
105 mfd->mfd_size = 0;
106 mfd->mfd_uobj = uao_create(INT64_MAX - PAGE_SIZE, 0); /* same as tmpfs */
107 mutex_init(&mfd->mfd_lock, MUTEX_DEFAULT, IPL_NONE);
108
109 strcpy(mfd->mfd_name, memfd_prefix);
110 error = copyinstr(SCARG(uap, name),
111 &mfd->mfd_name[sizeof(memfd_prefix) - 1],
112 sizeof(mfd->mfd_name) - sizeof(memfd_prefix), NULL);
113 if (error != 0)
114 goto leave;
115
116 getnanotime(&mfd->mfd_btime);
117
118 if ((flags & MFD_ALLOW_SEALING) == 0)
119 mfd->mfd_seals |= F_SEAL_SEAL;
120
121 error = fd_allocfile(&fp, &fd);
122 if (error != 0)
123 goto leave;
124
125 fp->f_flag = FREAD|FWRITE;
126 fp->f_type = DTYPE_MEMFD;
127 fp->f_ops = &memfd_fileops;
128 fp->f_memfd = mfd;
129 fd_set_exclose(l, fd, (flags & MFD_CLOEXEC) != 0);
130 fd_affix(p, fp, fd);
131
132 *retval = fd;
133 return 0;
134
135 leave:
136 uao_detach(mfd->mfd_uobj);
137 kmem_free(mfd, sizeof(*mfd));
138 return error;
139 }
140
141 static int
142 memfd_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
143 int flags)
144 {
145 int error;
146 vsize_t todo;
147 struct memfd *mfd = fp->f_memfd;
148
149 if (offp == &fp->f_offset)
150 mutex_enter(&fp->f_lock);
151
152 if (*offp < 0) {
153 error = EINVAL;
154 goto leave;
155 }
156
157 /* Trying to read past the end does nothing. */
158 if (*offp >= mfd->mfd_size) {
159 error = 0;
160 goto leave;
161 }
162
163 uio->uio_offset = *offp;
164 todo = MIN(uio->uio_resid, mfd->mfd_size - *offp);
165 error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
166 UBC_READ|UBC_PARTIALOK);
167
168 leave:
169 if (offp == &fp->f_offset)
170 mutex_exit(&fp->f_lock);
171
172 getnanotime(&mfd->mfd_atime);
173
174 return error;
175 }
176
177 static int
178 memfd_write(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
179 int flags)
180 {
181 int error;
182 vsize_t todo;
183 struct memfd *mfd = fp->f_memfd;
184
185 if (mfd->mfd_seals & F_SEAL_ANY_WRITE)
186 return EPERM;
187
188 if (offp == &fp->f_offset)
189 mutex_enter(&fp->f_lock);
190
191 if (*offp < 0) {
192 error = EINVAL;
193 goto leave;
194 }
195
196 uio->uio_offset = *offp;
197 todo = uio->uio_resid;
198
199 if (mfd->mfd_seals & F_SEAL_GROW) {
200 if (*offp >= mfd->mfd_size) {
201 error = EPERM;
202 goto leave;
203 }
204
205 /* Truncate the write to fit in mfd_size */
206 if (*offp + uio->uio_resid >= mfd->mfd_size)
207 todo = mfd->mfd_size - *offp;
208 } else if (*offp + uio->uio_resid >= mfd->mfd_size) {
209 /* Grow to accommodate the write request. */
210 error = memfd_truncate(fp, *offp + uio->uio_resid);
211 if (error != 0)
212 goto leave;
213 }
214
215 error = ubc_uiomove(mfd->mfd_uobj, uio, todo, UVM_ADV_SEQUENTIAL,
216 UBC_WRITE|UBC_PARTIALOK);
217
218 getnanotime(&mfd->mfd_mtime);
219
220 leave:
221 if (offp == &fp->f_offset)
222 mutex_exit(&fp->f_lock);
223
224 return error;
225 }
226
227 static int
228 memfd_ioctl(file_t *fp, u_long cmd, void *data)
229 {
230
231 return EINVAL;
232 }
233
234 static int
235 memfd_fcntl(file_t *fp, u_int cmd, void *data)
236 {
237 struct memfd *mfd = fp->f_memfd;
238
239 switch (cmd) {
240 case F_ADD_SEALS:
241 if (mfd->mfd_seals & F_SEAL_SEAL)
242 return EPERM;
243
244 if (*(int *)data & ~MFD_KNOWN_SEALS)
245 return EINVAL;
246
247 /*
248 * Can only add F_SEAL_WRITE if there are no currently
249 * open mmaps.
250 *
251 * XXX should only disallow if there are no currently
252 * open mmaps with PROT_WRITE.
253 */
254 if ((mfd->mfd_seals & F_SEAL_WRITE) == 0 &&
255 (*(int *)data & F_SEAL_WRITE) != 0 &&
256 mfd->mfd_uobj->uo_refs > 1)
257 return EBUSY;
258
259 mfd->mfd_seals |= *(int *)data;
260 return 0;
261
262 case F_GET_SEALS:
263 *(int *)data = mfd->mfd_seals;
264 return 0;
265
266 default:
267 return EINVAL;
268 }
269 }
270
271 static int
272 memfd_stat(file_t *fp, struct stat *st)
273 {
274 struct memfd *mfd = fp->f_memfd;
275
276 memset(st, 0, sizeof(*st));
277 st->st_uid = kauth_cred_geteuid(fp->f_cred);
278 st->st_gid = kauth_cred_getegid(fp->f_cred);
279 st->st_size = mfd->mfd_size;
280
281 st->st_mode = S_IREAD;
282 if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) == 0)
283 st->st_mode |= S_IWRITE;
284
285 st->st_birthtimespec = mfd->mfd_btime;
286 st->st_ctimespec = mfd->mfd_mtime;
287 st->st_atimespec = mfd->mfd_atime;
288 st->st_mtimespec = mfd->mfd_mtime;
289
290 return 0;
291 }
292
293 static int
294 memfd_close(file_t *fp)
295 {
296 struct memfd *mfd = fp->f_memfd;
297
298 uao_detach(mfd->mfd_uobj);
299 mutex_destroy(&mfd->mfd_lock);
300
301 kmem_free(mfd, sizeof(*mfd));
302 fp->f_memfd = NULL;
303
304 return 0;
305 }
306
307 static int
308 memfd_mmap(file_t *fp, off_t *offp, size_t size, int prot, int *flagsp,
309 int *advicep, struct uvm_object **uobjp, int *maxprotp)
310 {
311 struct memfd *mfd = fp->f_memfd;
312
313 /* uvm_mmap guarantees page-aligned offset and size. */
314 KASSERT(*offp == round_page(*offp));
315 KASSERT(size == round_page(size));
316 KASSERT(size > 0);
317
318 if (*offp < 0)
319 return EINVAL;
320 if (*offp + size > mfd->mfd_size)
321 return EINVAL;
322
323 if ((mfd->mfd_seals & F_SEAL_ANY_WRITE) &&
324 (prot & VM_PROT_WRITE) && (*flagsp & MAP_PRIVATE) == 0)
325 return EPERM;
326
327 uao_reference(fp->f_memfd->mfd_uobj);
328 *uobjp = fp->f_memfd->mfd_uobj;
329
330 *maxprotp = prot;
331 *advicep = UVM_ADV_RANDOM;
332
333 return 0;
334 }
335
336 static int
337 memfd_seek(file_t *fp, off_t delta, int whence, off_t *newoffp,
338 int flags)
339 {
340 off_t newoff;
341 int error;
342
343 switch (whence) {
344 case SEEK_CUR:
345 newoff = fp->f_offset + delta;
346 break;
347
348 case SEEK_END:
349 newoff = fp->f_memfd->mfd_size + delta;
350 break;
351
352 case SEEK_SET:
353 newoff = delta;
354 break;
355
356 default:
357 error = EINVAL;
358 return error;
359 }
360
361 if (newoffp)
362 *newoffp = newoff;
363 if (flags & FOF_UPDATE_OFFSET)
364 fp->f_offset = newoff;
365
366 return 0;
367 }
368
369 static int
370 memfd_truncate(file_t *fp, off_t length)
371 {
372 struct memfd *mfd = fp->f_memfd;
373 int error = 0;
374 voff_t start, end;
375
376 if (length < 0)
377 return EINVAL;
378 if (length == mfd->mfd_size)
379 return 0;
380
381 if ((mfd->mfd_seals & F_SEAL_SHRINK) && length < mfd->mfd_size)
382 return EPERM;
383 if ((mfd->mfd_seals & F_SEAL_GROW) && length > mfd->mfd_size)
384 return EPERM;
385
386 mutex_enter(&mfd->mfd_lock);
387
388 if (length > mfd->mfd_size)
389 ubc_zerorange(mfd->mfd_uobj, mfd->mfd_size,
390 length - mfd->mfd_size, 0);
391 else {
392 /* length < mfd->mfd_size, so try to get rid of excess pages */
393 start = round_page(length);
394 end = round_page(mfd->mfd_size);
395
396 if (start < end) { /* we actually have pages to remove */
397 rw_enter(mfd->mfd_uobj->vmobjlock, RW_WRITER);
398 error = (*mfd->mfd_uobj->pgops->pgo_put)(mfd->mfd_uobj,
399 start, end, PGO_FREE);
400 /* pgo_put drops vmobjlock */
401 }
402 }
403
404 getnanotime(&mfd->mfd_mtime);
405 mfd->mfd_size = length;
406 mutex_exit(&mfd->mfd_lock);
407 return error;
408 }
409