linux_file.c revision 1.125 1 /* $NetBSD: linux_file.c,v 1.125 2024/09/28 19:35:56 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1995, 1998, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Frank van der Linden and Eric Haszlakiewicz.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Functions in multiarch:
34 * linux_sys_llseek : linux_llseek.c
35 */
36
37 #include <sys/cdefs.h>
38 __KERNEL_RCSID(0, "$NetBSD: linux_file.c,v 1.125 2024/09/28 19:35:56 christos Exp $");
39
40 #include <sys/types.h>
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/namei.h>
44 #include <sys/proc.h>
45 #include <sys/file.h>
46 #include <sys/fcntl.h>
47 #include <sys/stat.h>
48 #include <sys/vfs_syscalls.h>
49 #include <sys/filedesc.h>
50 #include <sys/ioctl.h>
51 #include <sys/kernel.h>
52 #include <sys/mount.h>
53 #include <sys/namei.h>
54 #include <sys/vnode.h>
55 #include <sys/tty.h>
56 #include <sys/socketvar.h>
57 #include <sys/conf.h>
58 #include <sys/pipe.h>
59 #include <sys/fstrans.h>
60 #include <sys/syscallargs.h>
61 #include <sys/vfs_syscalls.h>
62
63 #include <compat/linux/common/linux_types.h>
64 #include <compat/linux/common/linux_signal.h>
65 #include <compat/linux/common/linux_fcntl.h>
66 #include <compat/linux/common/linux_util.h>
67 #include <compat/linux/common/linux_machdep.h>
68 #include <compat/linux/common/linux_ipc.h>
69 #include <compat/linux/common/linux_sem.h>
70
71 #include <compat/linux/linux_syscallargs.h>
72
/*
 * Debug tracing.  With DEBUG_LINUX defined, DPRINTF() forwards its format
 * and arguments to uprintf(); otherwise it expands to nothing, so its
 * arguments are not evaluated.
 */
73 #ifdef DEBUG_LINUX
74 #define DPRINTF(a, ...) uprintf(a, __VA_ARGS__)
75 #else
76 #define DPRINTF(a, ...)
77 #endif
78
/*
 * Size in bytes of the kernel bounce buffer allocated by
 * linux_sys_copy_file_range(), and the upper bound on how much it moves
 * per loop iteration.
 */
79 #define LINUX_COPY_FILE_RANGE_MAX_CHUNK 8192
80
/* Forward declarations of file-local helpers defined further down. */
81 static int bsd_to_linux_ioflags(int);
82 #if !defined(__aarch64__) && !defined(__amd64__)
83 static void bsd_to_linux_stat(struct stat *, struct linux_stat *);
84 #endif
85
/*
 * conv_linux_flock() is a macro defined outside this file.
 * NOTE(review): presumably it emits the flock conversion helpers that the
 * do_linux_getlk()/do_linux_setlk() macros in linux_sys_fcntl() rely on --
 * confirm against the macro definition.
 */
86 conv_linux_flock(linux, flock)
87
/*
 * Some file-related calls are handled here.  The usual flag conversion
 * and structure conversion is done, and alternate emul path searching.
 */
92
93 /*
94 * The next two functions convert between the Linux and NetBSD values
95 * of the flags used in open(2) and fcntl(2).
96 */
97 int
98 linux_to_bsd_ioflags(int lflags)
99 {
100 int res = 0;
101
102 res |= cvtto_bsd_mask(lflags, LINUX_O_WRONLY, O_WRONLY);
103 res |= cvtto_bsd_mask(lflags, LINUX_O_RDONLY, O_RDONLY);
104 res |= cvtto_bsd_mask(lflags, LINUX_O_RDWR, O_RDWR);
105
106 res |= cvtto_bsd_mask(lflags, LINUX_O_CREAT, O_CREAT);
107 res |= cvtto_bsd_mask(lflags, LINUX_O_EXCL, O_EXCL);
108 res |= cvtto_bsd_mask(lflags, LINUX_O_NOCTTY, O_NOCTTY);
109 res |= cvtto_bsd_mask(lflags, LINUX_O_TRUNC, O_TRUNC);
110 res |= cvtto_bsd_mask(lflags, LINUX_O_APPEND, O_APPEND);
111 res |= cvtto_bsd_mask(lflags, LINUX_O_NONBLOCK, O_NONBLOCK);
112 res |= cvtto_bsd_mask(lflags, LINUX_O_NDELAY, O_NDELAY);
113 res |= cvtto_bsd_mask(lflags, LINUX_O_SYNC, O_FSYNC);
114 res |= cvtto_bsd_mask(lflags, LINUX_FASYNC, O_ASYNC);
115 res |= cvtto_bsd_mask(lflags, LINUX_O_DIRECT, O_DIRECT);
116 res |= cvtto_bsd_mask(lflags, LINUX_O_DIRECTORY, O_DIRECTORY);
117 res |= cvtto_bsd_mask(lflags, LINUX_O_NOFOLLOW, O_NOFOLLOW);
118 res |= cvtto_bsd_mask(lflags, LINUX_O_CLOEXEC, O_CLOEXEC);
119
120 return res;
121 }
122
123 static int
124 bsd_to_linux_ioflags(int bflags)
125 {
126 int res = 0;
127
128 res |= cvtto_linux_mask(bflags, O_WRONLY, LINUX_O_WRONLY);
129 res |= cvtto_linux_mask(bflags, O_RDONLY, LINUX_O_RDONLY);
130 res |= cvtto_linux_mask(bflags, O_RDWR, LINUX_O_RDWR);
131
132 res |= cvtto_linux_mask(bflags, O_CREAT, LINUX_O_CREAT);
133 res |= cvtto_linux_mask(bflags, O_EXCL, LINUX_O_EXCL);
134 res |= cvtto_linux_mask(bflags, O_NOCTTY, LINUX_O_NOCTTY);
135 res |= cvtto_linux_mask(bflags, O_TRUNC, LINUX_O_TRUNC);
136 res |= cvtto_linux_mask(bflags, O_APPEND, LINUX_O_APPEND);
137 res |= cvtto_linux_mask(bflags, O_NONBLOCK, LINUX_O_NONBLOCK);
138 res |= cvtto_linux_mask(bflags, O_NDELAY, LINUX_O_NDELAY);
139 res |= cvtto_linux_mask(bflags, O_FSYNC, LINUX_O_SYNC);
140 res |= cvtto_linux_mask(bflags, O_ASYNC, LINUX_FASYNC);
141 res |= cvtto_linux_mask(bflags, O_DIRECT, LINUX_O_DIRECT);
142 res |= cvtto_linux_mask(bflags, O_DIRECTORY, LINUX_O_DIRECTORY);
143 res |= cvtto_linux_mask(bflags, O_NOFOLLOW, LINUX_O_NOFOLLOW);
144 res |= cvtto_linux_mask(bflags, O_CLOEXEC, LINUX_O_CLOEXEC);
145
146 return res;
147 }
148
/*
 * Combine the split (hi, lo) file offset that Linux passes to
 * preadv(2)/pwritev(2) into a single off_t.
 *
 * hi: upper half of the offset (ignored on LP64).
 * lo: lower half of the offset (the complete offset on LP64).
 */
static inline off_t
linux_hilo_to_off_t(unsigned long hi, unsigned long lo)
{
#ifdef _LP64
	/*
	 * Linux discards the "hi" portion on LP64 platforms; even though
	 * glibc puts the upper 32 bits of the offset into the "hi"
	 * argument regardless, the "lo" argument has all the bits in
	 * this case.
	 */
	(void)hi;
	return (off_t)lo;
#else
	/*
	 * Assemble the value in an unsigned 64-bit type and convert at the
	 * end: left-shifting a signed off_t so that a set bit reaches or
	 * passes the sign position is undefined behaviour, which a "hi"
	 * value with bit 31 set would trigger.
	 */
	return (off_t)(((uint64_t)hi << 32) | lo);
#endif /* _LP64 */
}
165
166 #if !defined(__aarch64__)
167 /*
168 * creat(2) is an obsolete function, but it's present as a Linux
169 * system call, so let's deal with it.
170 *
171 * Note: On the Alpha this doesn't really exist in Linux, but it's defined
172 * in syscalls.master anyway so this doesn't have to be special cased.
173 *
174 * Just call open(2) with the TRUNC, CREAT and WRONLY flags.
175 */
176 int
177 linux_sys_creat(struct lwp *l, const struct linux_sys_creat_args *uap, register_t *retval)
178 {
179 /* {
180 syscallarg(const char *) path;
181 syscallarg(linux_umode_t) mode;
182 } */
183 struct sys_open_args oa;
184
185 SCARG(&oa, path) = SCARG(uap, path);
186 SCARG(&oa, flags) = O_CREAT | O_TRUNC | O_WRONLY;
187 SCARG(&oa, mode) = SCARG(uap, mode);
188
189 return sys_open(l, &oa, retval);
190 }
191 #endif
192
193 static void
194 linux_open_ctty(struct lwp *l, int flags, int fd)
195 {
196 struct proc *p = l->l_proc;
197
198 /*
199 * this bit from sunos_misc.c (and svr4_fcntl.c).
200 * If we are a session leader, and we don't have a controlling
201 * terminal yet, and the O_NOCTTY flag is not set, try to make
202 * this the controlling terminal.
203 */
204 if (!(flags & O_NOCTTY) && SESS_LEADER(p) && !(p->p_lflag & PL_CONTROLT)) {
205 file_t *fp;
206
207 fp = fd_getfile(fd);
208
209 /* ignore any error, just give it a try */
210 if (fp != NULL) {
211 if (fp->f_type == DTYPE_VNODE) {
212 (fp->f_ops->fo_ioctl) (fp, TIOCSCTTY, NULL);
213 }
214 fd_putfile(fd);
215 }
216 }
217 }
218
219 /*
220 * open(2). Take care of the different flag values, and let the
221 * NetBSD syscall do the real work. See if this operation
222 * gives the current process a controlling terminal.
223 * (XXX is this necessary?)
224 */
225 int
226 linux_sys_open(struct lwp *l, const struct linux_sys_open_args *uap, register_t *retval)
227 {
228 /* {
229 syscallarg(const char *) path;
230 syscallarg(int) flags;
231 syscallarg(linux_umode_t) mode;
232 } */
233 int error, fl;
234 struct sys_open_args boa;
235
236 fl = linux_to_bsd_ioflags(SCARG(uap, flags));
237
238 SCARG(&boa, path) = SCARG(uap, path);
239 SCARG(&boa, flags) = fl;
240 SCARG(&boa, mode) = SCARG(uap, mode);
241
242 if ((error = sys_open(l, &boa, retval)))
243 return (error == EFTYPE) ? ELOOP : error;
244
245 linux_open_ctty(l, fl, *retval);
246 return 0;
247 }
248
249 int
250 linux_sys_openat(struct lwp *l, const struct linux_sys_openat_args *uap, register_t *retval)
251 {
252 /* {
253 syscallarg(int) fd;
254 syscallarg(const char *) path;
255 syscallarg(int) flags;
256 syscallarg(linux_umode_t) mode;
257 } */
258 int error, fl;
259 struct sys_openat_args boa;
260
261 fl = linux_to_bsd_ioflags(SCARG(uap, flags));
262
263 SCARG(&boa, fd) = SCARG(uap, fd);
264 SCARG(&boa, path) = SCARG(uap, path);
265 SCARG(&boa, oflags) = fl;
266 SCARG(&boa, mode) = SCARG(uap, mode);
267
268 if ((error = sys_openat(l, &boa, retval)))
269 return (error == EFTYPE) ? ELOOP : error;
270
271 linux_open_ctty(l, fl, *retval);
272 return 0;
273 }
274
275 /*
276 * Most actions in the fcntl() call are straightforward; simply
277 * pass control to the NetBSD system call. A few commands need
278 * conversions after the actual system call has done its work,
279 * because the flag values and lock structure are different.
280 */
/*
 * linux_sys_fcntl: entry point for the Linux fcntl(2) system call.
 *
 * Switches on the Linux command number:
 *  - F_DUPFD, F_GETFD, F_SETFD, F_DUPFD_CLOEXEC, F_ADD_SEALS and
 *    F_GET_SEALS only have the command number translated; they then reach
 *    the sys_fcntl() call at the bottom with the caller's fd and arg.
 *  - F_GETFL calls sys_fcntl() itself and converts retval[0] with
 *    bsd_to_linux_ioflags().
 *  - F_SETFL converts the flag argument with linux_to_bsd_ioflags() and
 *    treats O_ASYNC on pipes specially (see the comment in that case).
 *  - F_GETLK, F_SETLK and F_SETLKW are handed to the do_linux_getlk() and
 *    do_linux_setlk() macros, which are defined outside this file.
 *  - F_SETOWN and F_GETOWN are handled right here when the descriptor
 *    resolves to a tty, and are passed on to sys_fcntl() otherwise.
 *  - Any other command yields EOPNOTSUPP.
 *
 * Returns 0 or an errno value.
 */
281 int
282 linux_sys_fcntl(struct lwp *l, const struct linux_sys_fcntl_args *uap, register_t *retval)
283 {
284 /* {
285 syscallarg(int) fd;
286 syscallarg(int) cmd;
287 syscallarg(void *) arg;
288 } */
289 struct proc *p = l->l_proc;
290 int fd, cmd, error;
291 u_long val;
292 void *arg;
293 struct sys_fcntl_args fca;
294 file_t *fp;
295 struct vnode *vp;
296 struct vattr va;
297 long pgid;
298 struct pgrp *pgrp;
299 struct tty *tp;
300
301 fd = SCARG(uap, fd);
302 cmd = SCARG(uap, cmd);
303 arg = SCARG(uap, arg);
304
305 switch (cmd) {
306
307 case LINUX_F_DUPFD:
308 cmd = F_DUPFD;
309 break;
310
311 case LINUX_F_GETFD:
312 cmd = F_GETFD;
313 break;
314
315 case LINUX_F_SETFD:
316 cmd = F_SETFD;
317 break;
318
319 case LINUX_F_GETFL:
320 SCARG(&fca, fd) = fd;
321 SCARG(&fca, cmd) = F_GETFL;
322 SCARG(&fca, arg) = arg;
323 if ((error = sys_fcntl(l, &fca, retval)))
324 return error;
/* Report the flags using the Linux bit values. */
325 retval[0] = bsd_to_linux_ioflags(retval[0]);
326 return 0;
327
328 case LINUX_F_SETFL: {
/* Non-NULL only while a pipe's file is held for the FASYNC fixup below. */
329 file_t *fp1 = NULL;
330
331 val = linux_to_bsd_ioflags((unsigned long)SCARG(uap, arg));
332 /*
333 * Linux seems to have same semantics for sending SIGIO to the
334 * read side of socket, but slightly different semantics
335 * for SIGIO to the write side. Rather than sending the SIGIO
336 * every time it's possible to write (directly) more data, it
337 * only sends SIGIO if last write(2) failed due to insufficient
338 * memory to hold the data. This is compatible enough
339 * with NetBSD semantics to not do anything about the
340 * difference.
341 *
342 * Linux does NOT send SIGIO for pipes. Deal with socketpair
343 * ones and DTYPE_PIPE ones. For these, we don't set
344 * the underlying flags (we don't pass O_ASYNC flag down
345 * to sys_fcntl()), but set the FASYNC flag for file descriptor,
346 * so that F_GETFL would report the ASYNC i/o is on.
347 */
348 if (val & O_ASYNC) {
349 if (((fp1 = fd_getfile(fd)) == NULL))
350 return (EBADF);
351 if (((fp1->f_type == DTYPE_SOCKET) && fp1->f_data
352 && ((struct socket *)fp1->f_data)->so_state & SS_ISAPIPE)
353 || (fp1->f_type == DTYPE_PIPE))
354 val &= ~O_ASYNC;
355 else {
356 /* not a pipe, do not modify anything */
357 fd_putfile(fd);
358 fp1 = NULL;
359 }
360 }
361
362 SCARG(&fca, fd) = fd;
363 SCARG(&fca, cmd) = F_SETFL;
364 SCARG(&fca, arg) = (void *) val;
365
366 error = sys_fcntl(l, &fca, retval);
367
368 /* Now set the FASYNC flag for pipes */
369 if (fp1) {
370 if (!error) {
371 mutex_enter(&fp1->f_lock);
372 fp1->f_flag |= FASYNC;
373 mutex_exit(&fp1->f_lock);
374 }
/* Drop the reference taken by fd_getfile() above. */
375 fd_putfile(fd);
376 }
377
378 return (error);
379 }
380
/*
 * NOTE(review): no break follows the two macro invocations below;
 * presumably each expansion returns on every path -- confirm against the
 * macro definitions.
 */
381 case LINUX_F_GETLK:
382 do_linux_getlk(fd, cmd, arg, linux, flock);
383
384 case LINUX_F_SETLK:
385 case LINUX_F_SETLKW:
386 do_linux_setlk(fd, cmd, arg, linux, flock, LINUX_F_SETLK);
387
388 case LINUX_F_SETOWN:
389 case LINUX_F_GETOWN:
390 /*
391 * We need to route fcntl() for tty descriptors around normal
392 * fcntl(), since NetBSD tty TIOC{G,S}PGRP semantics is too
393 * restrictive for Linux F_{G,S}ETOWN. For non-tty descriptors,
394 * this is not a problem.
395 */
396 if ((fp = fd_getfile(fd)) == NULL)
397 return EBADF;
398
399 /* Check it's a character device vnode */
400 if (fp->f_type != DTYPE_VNODE
401 || (vp = (struct vnode *)fp->f_data) == NULL
402 || vp->v_type != VCHR) {
403 fd_putfile(fd);
404
/* Also entered by goto from below, when the device has no tty. */
405 not_tty:
406 /* Not a tty, proceed with common fcntl() */
407 cmd = cmd == LINUX_F_SETOWN ? F_SETOWN : F_GETOWN;
408 break;
409 }
410
411 vn_lock(vp, LK_SHARED | LK_RETRY);
412 error = VOP_GETATTR(vp, &va, l->l_cred);
413 VOP_UNLOCK(vp);
414
/* Only va is consulted from here on; the descriptor is released. */
415 fd_putfile(fd);
416
417 if (error)
418 return error;
419
420 if ((tp = cdev_tty(va.va_rdev)) == NULL)
421 goto not_tty;
422
423 /* set tty pg_id appropriately */
424 mutex_enter(&proc_lock);
425 if (cmd == LINUX_F_GETOWN) {
426 retval[0] = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
427 mutex_exit(&proc_lock);
428 return 0;
429 }
/*
 * F_SETOWN: an argument <= 0 names a process group (as -arg); a positive
 * argument names a process, whose process group is used.
 */
430 if ((long)arg <= 0) {
431 pgid = -(long)arg;
432 } else {
433 struct proc *p1 = proc_find((long)arg);
434 if (p1 == NULL) {
435 mutex_exit(&proc_lock);
436 return (ESRCH);
437 }
438 pgid = (long)p1->p_pgrp->pg_id;
439 }
440 pgrp = pgrp_find(pgid);
/* The group must exist and belong to the caller's session. */
441 if (pgrp == NULL || pgrp->pg_session != p->p_session) {
442 mutex_exit(&proc_lock);
443 return EPERM;
444 }
445 tp->t_pgrp = pgrp;
446 mutex_exit(&proc_lock);
447 return 0;
448
449 case LINUX_F_DUPFD_CLOEXEC:
450 cmd = F_DUPFD_CLOEXEC;
451 break;
452
453 case LINUX_F_ADD_SEALS:
454 cmd = F_ADD_SEALS;
455 break;
456
457 case LINUX_F_GET_SEALS:
458 cmd = F_GET_SEALS;
459 break;
460
461 default:
462 return EOPNOTSUPP;
463 }
464
/* Common tail: cmd now holds the NetBSD command number. */
465 SCARG(&fca, fd) = fd;
466 SCARG(&fca, cmd) = cmd;
467 SCARG(&fca, arg) = arg;
468
469 return sys_fcntl(l, &fca, retval);
470 }
471
472 #if !defined(__aarch64__) && !defined(__amd64__)
473 /*
474 * Convert a NetBSD stat structure to a Linux stat structure.
475 * Only the order of the fields and the padding in the structure
476 * is different. linux_fakedev is a machine-dependent function
477 * which optionally converts device driver major/minor numbers
478 * (XXX horrible, but what can you do against code that compares
479 * things against constant major device numbers? sigh)
480 */
481 static void
482 bsd_to_linux_stat(struct stat *bsp, struct linux_stat *lsp)
483 {
484
485 memset(lsp, 0, sizeof(*lsp));
486 lsp->lst_dev = linux_fakedev(bsp->st_dev, 0);
487 lsp->lst_ino = bsp->st_ino;
488 lsp->lst_mode = (linux_mode_t)bsp->st_mode;
489 if (bsp->st_nlink >= (1 << 15))
490 lsp->lst_nlink = (1 << 15) - 1;
491 else
492 lsp->lst_nlink = (linux_nlink_t)bsp->st_nlink;
493 lsp->lst_uid = bsp->st_uid;
494 lsp->lst_gid = bsp->st_gid;
495 lsp->lst_rdev = linux_fakedev(bsp->st_rdev, 1);
496 lsp->lst_size = bsp->st_size;
497 lsp->lst_blksize = bsp->st_blksize;
498 lsp->lst_blocks = bsp->st_blocks;
499 lsp->lst_atime = bsp->st_atime;
500 lsp->lst_mtime = bsp->st_mtime;
501 lsp->lst_ctime = bsp->st_ctime;
502 #ifdef LINUX_STAT_HAS_NSEC
503 lsp->lst_atime_nsec = bsp->st_atimensec;
504 lsp->lst_mtime_nsec = bsp->st_mtimensec;
505 lsp->lst_ctime_nsec = bsp->st_ctimensec;
506 #endif
507 }
508
509 /*
510 * The stat functions below are plain sailing. stat and lstat are handled
511 * by one function to avoid code duplication.
512 */
513 int
514 linux_sys_fstat(struct lwp *l, const struct linux_sys_fstat_args *uap, register_t *retval)
515 {
516 /* {
517 syscallarg(int) fd;
518 syscallarg(linux_stat *) sp;
519 } */
520 struct linux_stat tmplst;
521 struct stat tmpst;
522 int error;
523
524 error = do_sys_fstat(SCARG(uap, fd), &tmpst);
525 if (error != 0)
526 return error;
527 bsd_to_linux_stat(&tmpst, &tmplst);
528
529 return copyout(&tmplst, SCARG(uap, sp), sizeof tmplst);
530 }
531
532 static int
533 linux_stat1(const struct linux_sys_stat_args *uap, register_t *retval, int flags)
534 {
535 struct linux_stat tmplst;
536 struct stat tmpst;
537 int error;
538
539 error = do_sys_stat(SCARG(uap, path), flags, &tmpst);
540 if (error != 0)
541 return error;
542
543 bsd_to_linux_stat(&tmpst, &tmplst);
544
545 return copyout(&tmplst, SCARG(uap, sp), sizeof tmplst);
546 }
547
548 int
549 linux_sys_stat(struct lwp *l, const struct linux_sys_stat_args *uap, register_t *retval)
550 {
551 /* {
552 syscallarg(const char *) path;
553 syscallarg(struct linux_stat *) sp;
554 } */
555
556 return linux_stat1(uap, retval, FOLLOW);
557 }
558
559 /* Note: this is "newlstat" in the Linux sources */
560 /* (we don't bother with the old lstat currently) */
561 int
562 linux_sys_lstat(struct lwp *l, const struct linux_sys_lstat_args *uap, register_t *retval)
563 {
564 /* {
565 syscallarg(const char *) path;
566 syscallarg(struct linux_stat *) sp;
567 } */
568
569 return linux_stat1((const void *)uap, retval, NOFOLLOW);
570 }
571 #endif /* !__aarch64__ && !__amd64__ */
572
573 /*
574 * The following syscalls are mostly here because of the alternate path check.
575 */
576
577 int
578 linux_sys_linkat(struct lwp *l, const struct linux_sys_linkat_args *uap, register_t *retval)
579 {
580 /* {
581 syscallarg(int) fd1;
582 syscallarg(const char *) name1;
583 syscallarg(int) fd2;
584 syscallarg(const char *) name2;
585 syscallarg(int) flags;
586 } */
587 int fd1 = SCARG(uap, fd1);
588 const char *name1 = SCARG(uap, name1);
589 int fd2 = SCARG(uap, fd2);
590 const char *name2 = SCARG(uap, name2);
591 int follow;
592
593 follow = SCARG(uap, flags) & LINUX_AT_SYMLINK_FOLLOW;
594
595 return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
596 }
597
598 static int
599 linux_unlink_dircheck(const char *path)
600 {
601 struct nameidata nd;
602 struct pathbuf *pb;
603 int error;
604
605 /*
606 * Linux returns EISDIR if unlink(2) is called on a directory.
607 * We return EPERM in such cases. To emulate correct behaviour,
608 * check if the path points to directory and return EISDIR if this
609 * is the case.
610 *
611 * XXX this should really not copy in the path buffer twice...
612 */
613 error = pathbuf_copyin(path, &pb);
614 if (error) {
615 return error;
616 }
617 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
618 if (namei(&nd) == 0) {
619 struct stat sb;
620
621 if (vn_stat(nd.ni_vp, &sb) == 0
622 && S_ISDIR(sb.st_mode))
623 error = EISDIR;
624
625 vput(nd.ni_vp);
626 }
627 pathbuf_destroy(pb);
628 return error ? error : EPERM;
629 }
630
631 int
632 linux_sys_unlink(struct lwp *l, const struct linux_sys_unlink_args *uap, register_t *retval)
633 {
634 /* {
635 syscallarg(const char *) path;
636 } */
637 int error;
638
639 error = sys_unlink(l, (const void *)uap, retval);
640 if (error == EPERM)
641 error = linux_unlink_dircheck(SCARG(uap, path));
642
643 return error;
644 }
645
646 int
647 linux_sys_unlinkat(struct lwp *l, const struct linux_sys_unlinkat_args *uap, register_t *retval)
648 {
649 /* {
650 syscallarg(int) fd;
651 syscallarg(const char *) path;
652 syscallarg(int) flag;
653 } */
654 struct sys_unlinkat_args ua;
655 int error;
656
657 SCARG(&ua, fd) = SCARG(uap, fd);
658 SCARG(&ua, path) = SCARG(uap, path);
659 SCARG(&ua, flag) = linux_to_bsd_atflags(SCARG(uap, flag));
660
661 error = sys_unlinkat(l, &ua, retval);
662 if (error == EPERM)
663 error = linux_unlink_dircheck(SCARG(uap, path));
664
665 return error;
666 }
667
668 int
669 linux_sys_mknod(struct lwp *l, const struct linux_sys_mknod_args *uap, register_t *retval)
670 {
671 /* {
672 syscallarg(const char *) path;
673 syscallarg(linux_umode_t) mode;
674 syscallarg(unsigned) dev;
675 } */
676 struct linux_sys_mknodat_args ua;
677
678 SCARG(&ua, fd) = LINUX_AT_FDCWD;
679 SCARG(&ua, path) = SCARG(uap, path);
680 SCARG(&ua, mode) = SCARG(uap, mode);
681 SCARG(&ua, dev) = SCARG(uap, dev);
682
683 return linux_sys_mknodat(l, &ua, retval);
684 }
685
686 int
687 linux_sys_mknodat(struct lwp *l, const struct linux_sys_mknodat_args *uap, register_t *retval)
688 {
689 /* {
690 syscallarg(int) fd;
691 syscallarg(const char *) path;
692 syscallarg(linux_umode_t) mode;
693 syscallarg(unsigned) dev;
694 } */
695
696 /*
697 * BSD handles FIFOs separately
698 */
699 if (S_ISFIFO(SCARG(uap, mode))) {
700 struct sys_mkfifoat_args bma;
701
702 SCARG(&bma, fd) = SCARG(uap, fd);
703 SCARG(&bma, path) = SCARG(uap, path);
704 SCARG(&bma, mode) = SCARG(uap, mode);
705 return sys_mkfifoat(l, &bma, retval);
706 } else {
707
708 /*
709 * Linux device numbers uses 8 bits for minor and 8 bits
710 * for major. Due to how we map our major and minor,
711 * this just fits into our dev_t. Just mask off the
712 * upper 16bit to remove any random junk.
713 */
714
715 return do_sys_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
716 SCARG(uap, mode), SCARG(uap, dev) & 0xffff, UIO_USERSPACE);
717 }
718 }
719
720 int
721 linux_sys_fchmodat(struct lwp *l, const struct linux_sys_fchmodat_args *uap, register_t *retval)
722 {
723 /* {
724 syscallarg(int) fd;
725 syscallarg(const char *) path;
726 syscallarg(linux_umode_t) mode;
727 } */
728
729 return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
730 SCARG(uap, mode), AT_SYMLINK_FOLLOW);
731 }
732
733 int
734 linux_sys_fchownat(struct lwp *l, const struct linux_sys_fchownat_args *uap, register_t *retval)
735 {
736 /* {
737 syscallarg(int) fd;
738 syscallarg(const char *) path;
739 syscallarg(uid_t) owner;
740 syscallarg(gid_t) group;
741 syscallarg(int) flag;
742 } */
743 int flag;
744
745 flag = linux_to_bsd_atflags(SCARG(uap, flag));
746 return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
747 SCARG(uap, owner), SCARG(uap, group), flag);
748 }
749
750 int
751 linux_sys_faccessat(struct lwp *l, const struct linux_sys_faccessat_args *uap, register_t *retval)
752 {
753 /* {
754 syscallarg(int) fd;
755 syscallarg(const char *) path;
756 syscallarg(int) amode;
757 } */
758
759 return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
760 SCARG(uap, amode), AT_SYMLINK_FOLLOW);
761 }
762
763 /*
764 * This is just fsync() for now (just as it is in the Linux kernel)
765 * Note: this is not implemented under Linux on Alpha and Arm
766 * but should still be defined in our syscalls.master.
767 * (syscall #148 on the arm)
768 */
769 int
770 linux_sys_fdatasync(struct lwp *l, const struct linux_sys_fdatasync_args *uap, register_t *retval)
771 {
772 /* {
773 syscallarg(int) fd;
774 } */
775
776 return sys_fsync(l, (const void *)uap, retval);
777 }
778
779 /*
780 * pread(2).
781 */
782 int
783 linux_sys_pread(struct lwp *l, const struct linux_sys_pread_args *uap, register_t *retval)
784 {
785 /* {
786 syscallarg(int) fd;
787 syscallarg(void *) buf;
788 syscallarg(size_t) nbyte;
789 syscallarg(off_t) offset;
790 } */
791 struct sys_pread_args pra;
792
793 SCARG(&pra, fd) = SCARG(uap, fd);
794 SCARG(&pra, buf) = SCARG(uap, buf);
795 SCARG(&pra, nbyte) = SCARG(uap, nbyte);
796 SCARG(&pra, PAD) = 0;
797 SCARG(&pra, offset) = SCARG(uap, offset);
798
799 return sys_pread(l, &pra, retval);
800 }
801
802 /*
803 * pwrite(2).
804 */
805 int
806 linux_sys_pwrite(struct lwp *l, const struct linux_sys_pwrite_args *uap, register_t *retval)
807 {
808 /* {
809 syscallarg(int) fd;
810 syscallarg(void *) buf;
811 syscallarg(size_t) nbyte;
812 syscallarg(off_t) offset;
813 } */
814 struct sys_pwrite_args pra;
815
816 SCARG(&pra, fd) = SCARG(uap, fd);
817 SCARG(&pra, buf) = SCARG(uap, buf);
818 SCARG(&pra, nbyte) = SCARG(uap, nbyte);
819 SCARG(&pra, PAD) = 0;
820 SCARG(&pra, offset) = SCARG(uap, offset);
821
822 return sys_pwrite(l, &pra, retval);
823 }
824
825 /*
826 * preadv(2)
827 */
828 int
829 linux_sys_preadv(struct lwp *l, const struct linux_sys_preadv_args *uap,
830 register_t *retval)
831 {
832 /* {
833 syscallarg(int) fd;
834 syscallarg(const struct iovec *) iovp;
835 syscallarg(int) iovcnt;
836 syscallarg(unsigned long) off_lo;
837 syscallarg(unsigned long) off_hi;
838 } */
839 struct sys_preadv_args ua;
840
841 SCARG(&ua, fd) = SCARG(uap, fd);
842 SCARG(&ua, iovp) = SCARG(uap, iovp);
843 SCARG(&ua, iovcnt) = SCARG(uap, iovcnt);
844 SCARG(&ua, PAD) = 0;
845 SCARG(&ua, offset) = linux_hilo_to_off_t(SCARG(uap, off_hi),
846 SCARG(uap, off_lo));
847 return sys_preadv(l, &ua, retval);
848 }
849
850 /*
851 * pwritev(2)
852 */
853 int
854 linux_sys_pwritev(struct lwp *l, const struct linux_sys_pwritev_args *uap,
855 register_t *retval)
856 {
857 /* {
858 syscallarg(int) fd;
859 syscallarg(const struct iovec *) iovp;
860 syscallarg(int) iovcnt;
861 syscallarg(unsigned long) off_lo;
862 syscallarg(unsigned long) off_hi;
863 } */
864 struct sys_pwritev_args ua;
865
866 SCARG(&ua, fd) = SCARG(uap, fd);
867 SCARG(&ua, iovp) = (const void *)SCARG(uap, iovp);
868 SCARG(&ua, iovcnt) = SCARG(uap, iovcnt);
869 SCARG(&ua, PAD) = 0;
870 SCARG(&ua, offset) = linux_hilo_to_off_t(SCARG(uap, off_hi),
871 SCARG(uap, off_lo));
872 return sys_pwritev(l, &ua, retval);
873 }
874
875 int
876 linux_sys_dup3(struct lwp *l, const struct linux_sys_dup3_args *uap,
877 register_t *retval)
878 {
879 /* {
880 syscallarg(int) from;
881 syscallarg(int) to;
882 syscallarg(int) flags;
883 } */
884 int flags;
885
886 flags = linux_to_bsd_ioflags(SCARG(uap, flags));
887 if ((flags & ~O_CLOEXEC) != 0)
888 return EINVAL;
889
890 if (SCARG(uap, from) == SCARG(uap, to))
891 return EINVAL;
892
893 return dodup(l, SCARG(uap, from), SCARG(uap, to), flags, retval);
894 }
895
896
897 int
898 linux_to_bsd_atflags(int lflags)
899 {
900 int bflags = 0;
901
902 if (lflags & LINUX_AT_SYMLINK_NOFOLLOW)
903 bflags |= AT_SYMLINK_NOFOLLOW;
904 if (lflags & LINUX_AT_REMOVEDIR)
905 bflags |= AT_REMOVEDIR;
906 if (lflags & LINUX_AT_SYMLINK_FOLLOW)
907 bflags |= AT_SYMLINK_FOLLOW;
908
909 return bflags;
910 }
911
912 int
913 linux_sys_faccessat2(lwp_t *l, const struct linux_sys_faccessat2_args *uap,
914 register_t *retval)
915 {
916 /* {
917 syscallarg(int) fd;
918 syscallarg(const char *) path;
919 syscallarg(int) amode;
920 syscallarg(int) flags;
921 }*/
922 int flag = linux_to_bsd_atflags(SCARG(uap, flags));
923 int mode = SCARG(uap, amode);
924 int fd = SCARG(uap, fd);
925 const char *path = SCARG(uap, path);
926
927 return do_sys_accessat(l, fd, path, mode, flag);
928 }
929
930
931 int
932 linux_sys_sync_file_range(lwp_t *l,
933 const struct linux_sys_sync_file_range_args *uap, register_t *retval)
934 {
935 /* {
936 syscallarg(int) fd;
937 syscallarg(off_t) offset;
938 syscallarg(off_t) nbytes;
939 syscallarg(unsigned int) flags;
940 } */
941
942 struct sys_fsync_range_args ua;
943
944 if (SCARG(uap, offset) < 0 || SCARG(uap, nbytes) < 0 ||
945 ((SCARG(uap, flags) & ~LINUX_SYNC_FILE_RANGE_ALL) != 0))
946 return EINVAL;
947
948 /* Fill ua with uap */
949 SCARG(&ua, fd) = SCARG(uap, fd);
950 SCARG(&ua, flags) = SCARG(uap, flags);
951
952 /* Round down offset to page boundary */
953 SCARG(&ua, start) = rounddown(SCARG(uap, offset), PAGE_SIZE);
954 SCARG(&ua, length) = SCARG(uap, nbytes);
955 if (SCARG(&ua, length) != 0) {
956 /* Round up length to nbytes+offset to page boundary */
957 SCARG(&ua, length) = roundup(SCARG(uap, nbytes)
958 + SCARG(uap, offset) - SCARG(&ua, start), PAGE_SIZE);
959 }
960
961 return sys_fsync_range(l, &ua, retval);
962 }
963
964 int
965 linux_sys_syncfs(lwp_t *l, const struct linux_sys_syncfs_args *uap,
966 register_t *retval)
967 {
968 /* {
969 syscallarg(int) fd;
970 } */
971
972 struct mount *mp;
973 struct vnode *vp;
974 file_t *fp;
975 int error, fd;
976 fd = SCARG(uap, fd);
977
978 /* Get file pointer */
979 if ((error = fd_getvnode(fd, &fp)) != 0)
980 return error;
981
982 /* Get vnode and mount point */
983 vp = fp->f_vnode;
984 mp = vp->v_mount;
985
986 mutex_enter(mp->mnt_updating);
987 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
988 int asyncflag = mp->mnt_flag & MNT_ASYNC;
989 mp->mnt_flag &= ~MNT_ASYNC;
990 VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
991 if (asyncflag)
992 mp->mnt_flag |= MNT_ASYNC;
993 }
994 mutex_exit(mp->mnt_updating);
995
996 /* Cleanup vnode and file pointer */
997 vrele(vp);
998 fd_putfile(fd);
999 return 0;
1000
1001 }
1002
1003 int
1004 linux_sys_renameat2(struct lwp *l, const struct linux_sys_renameat2_args *uap,
1005 register_t *retval)
1006 {
1007 /* {
1008 syscallarg(int) fromfd;
1009 syscallarg(const char *) from;
1010 syscallarg(int) tofd;
1011 syscallarg(const char *) to;
1012 syscallarg(unsigned int) flags;
1013 } */
1014
1015 struct sys_renameat_args ua;
1016 SCARG(&ua, fromfd) = SCARG(uap, fromfd);
1017 SCARG(&ua, from) = SCARG(uap, from);
1018 SCARG(&ua, tofd) = SCARG(uap, tofd);
1019 SCARG(&ua, to) = SCARG(uap, to);
1020
1021 unsigned int flags = SCARG(uap, flags);
1022 int error;
1023
1024 if (flags != 0) {
1025 if (flags & ~LINUX_RENAME_ALL)
1026 return EINVAL;
1027 if ((flags & LINUX_RENAME_EXCHANGE) != 0 &&
1028 (flags & (LINUX_RENAME_NOREPLACE | LINUX_RENAME_WHITEOUT))
1029 != 0)
1030 return EINVAL;
1031 /*
1032 * Suppoting renameat2 flags without support from file systems
1033 * becomes a messy affair cause of locks and how VOP_RENAME
1034 * protocol is implemented. So, return EOPNOTSUPP for now.
1035 */
1036 return EOPNOTSUPP;
1037 }
1038
1039 error = sys_renameat(l, &ua, retval);
1040 return error;
1041 }
1042
1043 int linux_sys_copy_file_range(lwp_t *l,
1044 const struct linux_sys_copy_file_range_args *uap, register_t *retval)
1045 {
1046 /* {
1047 syscallarg(int) fd_in;
1048 syscallarg(unsigned long) off_in;
1049 syscallarg(int) fd_out;
1050 syscallarg(unsigned long) off_out;
1051 syscallarg(size_t) len;
1052 syscallarg(unsigned int) flags;
1053 } */
1054
1055 int fd_in, fd_out;
1056 file_t *fp_in, *fp_out;
1057 struct vnode *invp, *outvp;
1058 off_t off_in = 0, off_out = 0;
1059 struct vattr vattr_in, vattr_out;
1060 ssize_t total_copied = 0;
1061 size_t bytes_left, to_copy;
1062 bool have_off_in = false, have_off_out = false;
1063 int error = 0;
1064 size_t len = SCARG(uap, len);
1065 unsigned int flags = SCARG(uap, flags);
1066 // Structures for actual copy
1067 char *buffer = NULL;
1068 struct uio auio;
1069 struct iovec aiov;
1070
1071
1072 if (len > SSIZE_MAX) {
1073 DPRINTF("%s: len is greater than SSIZE_MAX\n",
1074 __func__);
1075 return EOVERFLOW;
1076 }
1077
1078 if(flags != 0) {
1079 DPRINTF("%s: unsupported flags %#x\n", __func__, flags);
1080 return EINVAL;
1081 }
1082
1083 fd_in = SCARG(uap, fd_in);
1084 fd_out = SCARG(uap, fd_out);
1085 error = fd_getvnode(fd_in, &fp_in);
1086 if (error) {
1087 return error;
1088 }
1089
1090 error = fd_getvnode(fd_out, &fp_out);
1091 if (error) {
1092 fd_putfile(fd_in);
1093 return error;
1094 }
1095
1096 invp = fp_in->f_vnode;
1097 outvp = fp_out->f_vnode;
1098
1099 /* Get attributes of input and output files */
1100 VOP_GETATTR(invp, &vattr_in, l->l_cred);
1101 VOP_GETATTR(outvp, &vattr_out, l->l_cred);
1102
1103 /* Check if input and output files are regular files */
1104 if (vattr_in.va_type == VDIR || vattr_out.va_type == VDIR) {
1105 error = EISDIR;
1106 DPRINTF("%s: Input or output is a directory\n", __func__);
1107 goto out;
1108 }
1109 if ((SCARG(uap, off_in) != NULL && *SCARG(uap, off_in) < 0) ||
1110 (SCARG(uap, off_out) != NULL && *SCARG(uap, off_out) < 0) ||
1111 vattr_in.va_type != VREG || vattr_out.va_type != VREG)
1112 {
1113 error = EINVAL;
1114 DPRINTF("%s: Invalid offset or file type\n", __func__);
1115 goto out;
1116 }
1117
1118 if ((fp_in->f_flag & FREAD) == 0 ||
1119 (fp_out->f_flag & FWRITE) == 0 || (fp_out->f_flag & FAPPEND) != 0) {
1120 DPRINTF("%s: input file can't be read or output file "
1121 "can't be written\n", __func__);
1122 error = EBADF;
1123 goto out;
1124 }
1125 /* Retrieve and validate offsets if provided */
1126 if (SCARG(uap, off_in) != NULL) {
1127 error = copyin(SCARG(uap, off_in), &off_in, sizeof(off_in));
1128 if (error) {
1129 goto out;
1130 }
1131 have_off_in = true;
1132 }
1133
1134 if (SCARG(uap, off_out) != NULL) {
1135 error = copyin(SCARG(uap, off_out), &off_out, sizeof(off_out));
1136 if (error) {
1137 goto out;
1138 }
1139 have_off_out = true;
1140 }
1141
1142 off_t new_size = off_out + len;
1143 if (new_size < 0) {
1144 DPRINTF("%s: New size is greater than OFF_MAX\n", __func__);
1145 error = EFBIG;
1146 goto out;
1147 }
1148
1149 /* Identify overlapping ranges */
1150 if ((invp == outvp) &&
1151 ((off_in <= off_out && off_in + (off_t)len > off_out) ||
1152 (off_in > off_out && off_out + (off_t)len > off_in))) {
1153 DPRINTF("%s: Ranges overlap\n", __func__);
1154 error = EINVAL;
1155 goto out;
1156 }
1157
1158 buffer = kmem_alloc(LINUX_COPY_FILE_RANGE_MAX_CHUNK, KM_SLEEP);
1159 /* Allocation cannot fail, so no need for error handling? */
1160 if (buffer == NULL) {
1161 error = ENOMEM;
1162 goto out;
1163 }
1164
1165 bytes_left = len;
1166
1167 while (bytes_left > 0) {
1168 to_copy = MIN(bytes_left, LINUX_COPY_FILE_RANGE_MAX_CHUNK);
1169
1170 /* Lock the input vnode for reading */
1171 vn_lock(fp_in->f_vnode, LK_SHARED | LK_RETRY);
1172 /* Set up iovec and uio for reading */
1173 aiov.iov_base = buffer;
1174 aiov.iov_len = to_copy;
1175 auio.uio_iov = &aiov;
1176 auio.uio_iovcnt = 1;
1177 auio.uio_offset = have_off_in ? off_in : fp_in->f_offset;
1178 auio.uio_resid = to_copy;
1179 auio.uio_rw = UIO_READ;
1180 auio.uio_vmspace = l->l_proc->p_vmspace;
1181 UIO_SETUP_SYSSPACE(&auio);
1182
1183 /* Perform the read with VOP_READ */
1184 error = VOP_READ(fp_in->f_vnode, &auio, 0, l->l_cred);
1185 VOP_UNLOCK(fp_in->f_vnode);
1186 if (error) {
1187 DPRINTF("%s: Read error %d\n", __func__, error);
1188 break;
1189 }
1190
1191 size_t read_bytes = to_copy - auio.uio_resid;
1192 if (read_bytes == 0) {
1193 /* EOF reached */
1194 break;
1195 }
1196
1197 /* Lock the output vnode for writing */
1198 vn_lock(fp_out->f_vnode, LK_EXCLUSIVE | LK_RETRY);
1199 /* Set up iovec and uio for writing */
1200 aiov.iov_base = buffer;
1201 aiov.iov_len = read_bytes;
1202 auio.uio_iov = &aiov;
1203 auio.uio_iovcnt = 1;
1204 auio.uio_offset = have_off_out ? off_out : fp_out->f_offset;
1205 auio.uio_resid = read_bytes;
1206 auio.uio_rw = UIO_WRITE;
1207 auio.uio_vmspace = l->l_proc->p_vmspace;
1208 UIO_SETUP_SYSSPACE(&auio);
1209
1210 /* Perform the write */
1211 error = VOP_WRITE(fp_out->f_vnode, &auio, 0, l->l_cred);
1212 VOP_UNLOCK(fp_out->f_vnode);
1213 if (error) {
1214 DPRINTF("%s: Write error %d\n", __func__, error);
1215 break;
1216 }
1217 size_t written_bytes = read_bytes - auio.uio_resid;
1218 total_copied += written_bytes;
1219 bytes_left -= written_bytes;
1220
1221 /* Update offsets if provided */
1222 if (have_off_in) {
1223 off_in += written_bytes;
1224 } else {
1225 fp_in->f_offset += written_bytes;
1226 }
1227 if (have_off_out) {
1228 off_out += written_bytes;
1229 } else {
1230 fp_out->f_offset += written_bytes;
1231 }
1232 }
1233
1234 if (have_off_in) {
1235 /* Adjust user space offset */
1236 error = copyout(&off_in, SCARG(uap, off_in), sizeof(off_t));
1237 if (error) {
1238 DPRINTF("%s: Error adjusting user space offset\n",
1239 __func__);
1240 }
1241 goto out;
1242 }
1243
1244 if (have_off_out) {
1245 /* Adjust user space offset */
1246 error = copyout(&off_out, SCARG(uap, off_out), sizeof(off_t));
1247 if (error) {
1248 DPRINTF("%s: Error adjusting user space offset\n",
1249 __func__);
1250 }
1251 }
1252
1253 *retval = total_copied;
1254 out:
1255 if (buffer) {
1256 kmem_free(buffer, LINUX_COPY_FILE_RANGE_MAX_CHUNK);
1257 }
1258 if (fp_out) {
1259 fd_putfile(fd_out);
1260 }
1261 if (fp_in) {
1262 fd_putfile(fd_in);
1263 }
1264 return error;
1265 }
1266
1267
/*
 * LINUX_NOT_SUPPORTED(fun):
 *	Define the system call handler `fun' as a stub.  The generated
 *	function takes the usual (l, uap, retval) handler arguments, uses
 *	none of them, leaves *retval untouched and always returns
 *	EOPNOTSUPP.
 *
 *	The type of `uap' is built from the handler name by token pasting
 *	(struct fun##_args), so `fun' must be a name for which that
 *	argument structure is declared.
 */
1268 #define LINUX_NOT_SUPPORTED(fun) \
1269 int \
1270 fun(struct lwp *l, const struct fun##_args *uap, register_t *retval) \
1271 { \
1272 return EOPNOTSUPP; \
1273 }
1274
/*
 * Extended attribute system calls.  Each line below uses
 * LINUX_NOT_SUPPORTED() to define the named handler as a stub that
 * returns EOPNOTSUPP.  The stubs are grouped by operation (set, get,
 * list, remove); every group covers the plain, "l"-prefixed and
 * "f"-prefixed form of the call.
 */
1275 LINUX_NOT_SUPPORTED(linux_sys_setxattr)
1276 LINUX_NOT_SUPPORTED(linux_sys_lsetxattr)
1277 LINUX_NOT_SUPPORTED(linux_sys_fsetxattr)
1278
1279 LINUX_NOT_SUPPORTED(linux_sys_getxattr)
1280 LINUX_NOT_SUPPORTED(linux_sys_lgetxattr)
1281 LINUX_NOT_SUPPORTED(linux_sys_fgetxattr)
1282
1283 LINUX_NOT_SUPPORTED(linux_sys_listxattr)
1284 LINUX_NOT_SUPPORTED(linux_sys_llistxattr)
1285 LINUX_NOT_SUPPORTED(linux_sys_flistxattr)
1286
1287 LINUX_NOT_SUPPORTED(linux_sys_removexattr)
1288 LINUX_NOT_SUPPORTED(linux_sys_lremovexattr)
1289 LINUX_NOT_SUPPORTED(linux_sys_fremovexattr)
1290
1291 /*
1292 * For now just return EOPNOTSUPP; this makes glibc's posix_fallocate()
1293 * fall back to emulation.
1294 * XXX Right now no filesystem actually implements fallocate support,
1295 * so there is no need for mapping.
1296 */
1297 LINUX_NOT_SUPPORTED(linux_sys_fallocate)
1298