sys_pipe.c revision 1.4.2.11 1 /* $NetBSD: sys_pipe.c,v 1.4.2.11 2002/04/01 07:47:57 nathanw Exp $ */
2
3 /*
4 * Copyright (c) 1996 John S. Dyson
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice immediately at the beginning of the file, without modification,
12 * this list of conditions, and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Absolutely no warranty of function or purpose is made by the author
17 * John S. Dyson.
18 * 4. Modifications may be freely made to this file if the above conditions
19 * are met.
20 *
21 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.95 2002/03/09 22:06:31 alfred Exp $
22 */
23
24 /*
25 * This file contains a high-performance replacement for the socket-based
26 * pipes scheme originally used in FreeBSD/4.4Lite. It does not support
27 * all features of sockets, but does do everything that pipes normally
28 * do.
29 *
 * Adaptation for NetBSD UVM, including uvm_loan() based direct write, was
31 * written by Jaromir Dolecek.
32 */
33
34 /*
35 * This code has two modes of operation, a small write mode and a large
36 * write mode. The small write mode acts like conventional pipes with
37 * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
38 * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
39 * and PIPE_SIZE in size, it is fully mapped into the kernel (on FreeBSD,
40 * those pages are also wired), and the receiving process can copy it directly
41 * from the pages in the sending process.
42 *
43 * If the sending process receives a signal, it is possible that it will
44 * go away, and certainly its address space can change, because control
45 * is returned back to the user-mode side. In that case, the pipe code
46 * arranges to copy the buffer supplied by the user process on FreeBSD, to
47 * a pageable kernel buffer, and the receiving process will grab the data
48 * from the pageable kernel buffer. Since signals don't happen all that often,
49 * the copy operation is normally eliminated.
 * For NetBSD, the pages are mapped read-only, COW for kernel by uvm_loan(),
 * so no explicit handling needs to be done; everything is handled by the
 * standard VM facilities.
53 *
54 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
55 * happen for small transfers so that the system will not spend all of
56 * its time context switching. PIPE_SIZE is constrained by the
57 * amount of kernel virtual memory.
58 */
59
60 #include <sys/cdefs.h>
61 __KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.4.2.11 2002/04/01 07:47:57 nathanw Exp $");
62
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/proc.h>
66 #include <sys/fcntl.h>
67 #include <sys/file.h>
68 #include <sys/filedesc.h>
69 #include <sys/filio.h>
70 #include <sys/kernel.h>
71 #include <sys/lock.h>
72 #include <sys/ttycom.h>
73 #include <sys/stat.h>
74 #include <sys/malloc.h>
75 #include <sys/poll.h>
76 #include <sys/signalvar.h>
77 #include <sys/vnode.h>
78 #include <sys/uio.h>
79 #include <sys/lock.h>
80 #ifdef __FreeBSD__
81 #include <sys/mutex.h>
82 #endif
83 #ifdef __NetBSD__
84 #include <sys/select.h>
85 #include <sys/mount.h>
86 #include <sys/syscallargs.h>
87 #include <uvm/uvm.h>
88 #include <sys/sysctl.h>
89 #include <sys/kernel.h>
90 #endif /* NetBSD, FreeBSD */
91
92 #include <sys/pipe.h>
93
94 #ifdef __NetBSD__
95 /*
96 * Avoid microtime(9), it's slow. We don't guard the read from time(9)
97 * with splclock(9) since we don't actually need to be THAT sure the access
98 * is atomic.
99 */
100 #define vfs_timestamp(tv) (*(tv) = time)
101 #endif
102
103 /*
104 * Use this define if you want to disable *fancy* VM things. Expect an
105 * approx 30% decrease in transfer rate. This could be useful for
106 * OpenBSD.
107 */
108 /* #define PIPE_NODIRECT */
109
110 /*
111 * interfaces to the outside world
112 */
113 #ifdef __FreeBSD__
114 static int pipe_read(struct file *fp, struct uio *uio,
115 struct ucred *cred, int flags, struct thread *td);
116 static int pipe_write(struct file *fp, struct uio *uio,
117 struct ucred *cred, int flags, struct thread *td);
118 static int pipe_close(struct file *fp, struct thread *td);
119 static int pipe_poll(struct file *fp, int events, struct ucred *cred,
120 struct thread *td);
121 static int pipe_kqfilter(struct file *fp, struct knote *kn);
122 static int pipe_stat(struct file *fp, struct stat *sb, struct thread *td);
123 static int pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct thread *td);
124
125 static struct fileops pipeops = {
126 pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
127 pipe_stat, pipe_close
128 };
129
130 static void filt_pipedetach(struct knote *kn);
131 static int filt_piperead(struct knote *kn, long hint);
132 static int filt_pipewrite(struct knote *kn, long hint);
133
134 static struct filterops pipe_rfiltops =
135 { 1, NULL, filt_pipedetach, filt_piperead };
136 static struct filterops pipe_wfiltops =
137 { 1, NULL, filt_pipedetach, filt_pipewrite };
138
/*
 * Bracket code that must run under Giant: drop the pipe mutex, take
 * Giant, and the reverse on the way out.
 *
 * Fix: the macros previously expanded to `wpipe' regardless of the
 * argument passed; they only worked because every caller happened to
 * pass wpipe.  They now use their actual parameter.
 */
#define PIPE_GET_GIANT(pipe)						\
	do {								\
		PIPE_UNLOCK(pipe);					\
		mtx_lock(&Giant);					\
	} while (0)

#define PIPE_DROP_GIANT(pipe)						\
	do {								\
		mtx_unlock(&Giant);					\
		PIPE_LOCK(pipe);					\
	} while (0)
150
151 #endif /* FreeBSD */
152
153 #ifdef __NetBSD__
154 static int pipe_read(struct file *fp, off_t *offset, struct uio *uio,
155 struct ucred *cred, int flags);
156 static int pipe_write(struct file *fp, off_t *offset, struct uio *uio,
157 struct ucred *cred, int flags);
158 static int pipe_close(struct file *fp, struct proc *p);
159 static int pipe_poll(struct file *fp, int events, struct proc *p);
160 static int pipe_fcntl(struct file *fp, u_int com, caddr_t data,
161 struct proc *p);
162 static int pipe_stat(struct file *fp, struct stat *sb, struct proc *p);
163 static int pipe_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p);
164
165 static struct fileops pipeops =
166 { pipe_read, pipe_write, pipe_ioctl, pipe_fcntl, pipe_poll,
167 pipe_stat, pipe_close };
168
169 /* XXXSMP perhaps use spinlocks & KERNEL_PROC_(UN)LOCK() ? just clear now */
170 #define PIPE_GET_GIANT(pipe)
171 #define PIPE_DROP_GIANT(pipe)
172 #define GIANT_REQUIRED
173
174 #endif /* NetBSD */
175
176 /*
177 * Default pipe buffer size(s), this can be kind-of large now because pipe
178 * space is pageable. The pipe code will try to maintain locality of
179 * reference for performance reasons, so small amounts of outstanding I/O
180 * will not wipe the cache.
181 */
182 #define MINPIPESIZE (PIPE_SIZE/3)
183 #define MAXPIPESIZE (2*PIPE_SIZE/3)
184
185 /*
186 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
187 * is there so that on large systems, we don't exhaust it.
188 */
189 #define MAXPIPEKVA (8*1024*1024)
190 static int maxpipekva = MAXPIPEKVA;
191
192 /*
193 * Limit for direct transfers, we cannot, of course limit
194 * the amount of kva for pipes in general though.
195 */
196 #define LIMITPIPEKVA (16*1024*1024)
197 static int limitpipekva = LIMITPIPEKVA;
198
199 /*
200 * Limit the number of "big" pipes
201 */
202 #define LIMITBIGPIPES 32
203 static int maxbigpipes = LIMITBIGPIPES;
204 static int nbigpipe = 0;
205
206 /*
207 * Amount of KVA consumed by pipe buffers.
208 */
209 static int amountpipekva = 0;
210
211 static void pipeclose(struct pipe *cpipe);
212 static void pipe_free_kmem(struct pipe *cpipe);
213 static int pipe_create(struct pipe **cpipep, int allockva);
214 static __inline int pipelock(struct pipe *cpipe, int catch);
215 static __inline void pipeunlock(struct pipe *cpipe);
216 static __inline void pipeselwakeup(struct pipe *cpipe, struct pipe *sigp);
217 #ifndef PIPE_NODIRECT
218 static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
219 #endif
220 static int pipespace(struct pipe *cpipe, int size);
221
222 #ifdef __NetBSD__
223 #ifndef PIPE_NODIRECT
224 static int pipe_loan_alloc(struct pipe *, int);
225 static void pipe_loan_free(struct pipe *);
226 #endif /* PIPE_NODIRECT */
227
228 static struct pool pipe_pool;
229 #endif /* NetBSD */
230
231 #ifdef __FreeBSD__
232 static vm_zone_t pipe_zone;
233
234 static void pipeinit(void *dummy __unused);
235 #ifndef PIPE_NODIRECT
236 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
237 static void pipe_destroy_write_buffer(struct pipe *wpipe);
238 static void pipe_clone_write_buffer(struct pipe *wpipe);
239 #endif
240
241 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
242
/*
 * Boot-time initialization hook (FreeBSD): create the zone from which
 * struct pipe is allocated.  Invoked once via the SYSINIT above.
 */
static void
pipeinit(void *dummy __unused)
{

	pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);
}
249 #endif /* FreeBSD */
250
251 /*
252 * The pipe system call for the DTYPE_PIPE type of pipes
253 */
254
255 /* ARGSUSED */
256 #ifdef __FreeBSD__
257 int
258 pipe(td, uap)
259 struct thread *td;
260 struct pipe_args /* {
261 int dummy;
262 } */ *uap;
263 #elif defined(__NetBSD__)
264 int
265 sys_pipe(l, v, retval)
266 struct lwp *l;
267 void *v;
268 register_t *retval;
269 #endif
270 {
271 struct file *rf, *wf;
272 struct pipe *rpipe, *wpipe;
273 int fd, error;
274 struct proc *p;
275 #ifdef __FreeBSD__
276 struct mtx *pmtx;
277
278 KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
279
280 pmtx = malloc(sizeof(*pmtx), M_TEMP, M_WAITOK | M_ZERO);
281
282 rpipe = wpipe = NULL;
283 if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 1)) {
284 pipeclose(rpipe);
285 pipeclose(wpipe);
286 free(pmtx, M_TEMP);
287 return (ENFILE);
288 }
289
290 error = falloc(td, &rf, &fd);
291 if (error) {
292 pipeclose(rpipe);
293 pipeclose(wpipe);
294 free(pmtx, M_TEMP);
295 return (error);
296 }
297 fhold(rf);
298 td->td_retval[0] = fd;
299
300 /*
301 * Warning: once we've gotten past allocation of the fd for the
302 * read-side, we can only drop the read side via fdrop() in order
303 * to avoid races against processes which manage to dup() the read
304 * side while we are blocked trying to allocate the write side.
305 */
306 FILE_LOCK(rf);
307 rf->f_flag = FREAD | FWRITE;
308 rf->f_type = DTYPE_PIPE;
309 rf->f_data = (caddr_t)rpipe;
310 rf->f_ops = &pipeops;
311 FILE_UNLOCK(rf);
312 error = falloc(td, &wf, &fd);
313 if (error) {
314 struct filedesc *fdp = td->td_proc->p_fd;
315 FILEDESC_LOCK(fdp);
316 if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
317 fdp->fd_ofiles[td->td_retval[0]] = NULL;
318 FILEDESC_UNLOCK(fdp);
319 fdrop(rf, td);
320 } else
321 FILEDESC_UNLOCK(fdp);
322 fdrop(rf, td);
323 /* rpipe has been closed by fdrop(). */
324 pipeclose(wpipe);
325 free(pmtx, M_TEMP);
326 return (error);
327 }
328 FILE_LOCK(wf);
329 wf->f_flag = FREAD | FWRITE;
330 wf->f_type = DTYPE_PIPE;
331 wf->f_data = (caddr_t)wpipe;
332 wf->f_ops = &pipeops;
333 p->p_retval[1] = fd;
334 rpipe->pipe_peer = wpipe;
335 wpipe->pipe_peer = rpipe;
336 mtx_init(pmtx, "pipe mutex", MTX_DEF);
337 rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;
338 fdrop(rf, td);
339 #endif /* FreeBSD */
340
341 #ifdef __NetBSD__
342 p = l->l_proc;
343 rpipe = wpipe = NULL;
344 if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 0)) {
345 pipeclose(rpipe);
346 pipeclose(wpipe);
347 return (ENFILE);
348 }
349
350 /*
351 * Note: the file structure returned from falloc() is marked
352 * as 'larval' initially. Unless we mark it as 'mature' by
353 * FILE_SET_MATURE(), any attempt to do anything with it would
354 * return EBADF, including e.g. dup(2) or close(2). This avoids
355 * file descriptor races if we block in the second falloc().
356 */
357
358 error = falloc(p, &rf, &fd);
359 if (error)
360 goto free2;
361 retval[0] = fd;
362 rf->f_flag = FREAD;
363 rf->f_type = DTYPE_PIPE;
364 rf->f_data = (caddr_t)rpipe;
365 rf->f_ops = &pipeops;
366
367 error = falloc(p, &wf, &fd);
368 if (error)
369 goto free3;
370 retval[1] = fd;
371 wf->f_flag = FWRITE;
372 wf->f_type = DTYPE_PIPE;
373 wf->f_data = (caddr_t)wpipe;
374 wf->f_ops = &pipeops;
375
376 rpipe->pipe_peer = wpipe;
377 wpipe->pipe_peer = rpipe;
378
379 FILE_SET_MATURE(rf);
380 FILE_SET_MATURE(wf);
381 FILE_UNUSE(rf, p);
382 FILE_UNUSE(wf, p);
383 return (0);
384 free3:
385 FILE_UNUSE(rf, p);
386 ffree(rf);
387 fdremove(p->p_fd, retval[0]);
388 free2:
389 pipeclose(wpipe);
390 pipeclose(rpipe);
391 #endif /* NetBSD */
392
393 return (error);
394 }
395
/*
 * Allocate kva for pipe circular buffer, the space is pageable.
 * This routine will 'realloc' the size of a pipe safely, if it fails
 * it will retain the old buffer.
 * If it fails it will return ENOMEM.
 */
static int
pipespace(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	caddr_t buffer;
#ifdef __FreeBSD__
	struct vm_object *object;
	int npages, error;

	GIANT_REQUIRED;
	/* The pipe mutex must not be held: the VM calls below may sleep. */
	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
	       ("pipespace: pipe mutex locked"));

	npages = round_page(size)/PAGE_SIZE;
	/*
	 * Create an object, I don't like the idea of paging to/from
	 * kernel_object.
	 */
	object = vm_object_allocate(OBJT_DEFAULT, npages);
	buffer = (caddr_t) vm_map_min(kernel_map);

	/*
	 * Insert the object into the kernel map, and allocate kva for it.
	 * The map entry is, by default, pageable.
	 */
	error = vm_map_find(kernel_map, object, 0,
		(vm_offset_t *) &buffer, size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);

	if (error != KERN_SUCCESS) {
		vm_object_deallocate(object);
		return (ENOMEM);
	}
#endif /* FreeBSD */

#ifdef __NetBSD__
	/*
	 * Allocate pageable virtual address space. Physical memory is
	 * allocated on demand.
	 */
	buffer = (caddr_t) uvm_km_valloc(kernel_map, round_page(size));
	if (buffer == NULL)
		return (ENOMEM);
#endif /* NetBSD */

	/* free old resources if we're resizing */
	pipe_free_kmem(cpipe);
#ifdef __FreeBSD__
	cpipe->pipe_buffer.object = object;
#endif
	/* Install the new buffer and reset the ring to empty. */
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	amountpipekva += cpipe->pipe_buffer.size;
	return (0);
}
461
/*
 * initialize and allocate VM and memory for pipe
 *
 * cpipep   - filled in with the newly allocated pipe
 * allockva - if non-zero, allocate the kernel buffer now via pipespace();
 *            otherwise the pipe starts without a buffer
 *
 * Returns 0 on success, ENOMEM if allocation fails.  On pipespace()
 * failure *cpipep is left set; callers recover via pipeclose().
 */
static int
pipe_create(cpipep, allockva)
	struct pipe **cpipep;
	int allockva;
{
	struct pipe *cpipe;
	int error;

#ifdef __FreeBSD__
	*cpipep = zalloc(pipe_zone);
#endif
#ifdef __NetBSD__
	*cpipep = pool_get(&pipe_pool, M_WAITOK);
#endif
	if (*cpipep == NULL)
		return (ENOMEM);

	cpipe = *cpipep;

	/* Initialize */
	memset(cpipe, 0, sizeof(*cpipe));
	/* see the PIPE_SIGNALR handling at the end of pipe_read() */
	cpipe->pipe_state = PIPE_SIGNALR;

#ifdef __FreeBSD__
	cpipe->pipe_mtxp = NULL;	/* avoid pipespace assertion */
#endif
	if (allockva && (error = pipespace(cpipe, PIPE_SIZE)))
		return (error);

	/* ctime, atime and mtime all start equal at creation time. */
	vfs_timestamp(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;
#ifdef __NetBSD__
	cpipe->pipe_pgid = NO_PID;
	/* PCATCH here makes pipelock() sleeps signal-interruptible */
	lockinit(&cpipe->pipe_lock, PRIBIO | PCATCH, "pipelk", 0, 0);
#endif

	return (0);
}
504
505
/*
 * lock a pipe for I/O, blocking other access
 *
 * If `catch' is set, a signal interrupts the wait and the resulting
 * error (e.g. EINTR/ERESTART) is returned to the caller; if clear, the
 * lock attempt is retried (NetBSD) or the sleep made uninterruptible
 * (FreeBSD) until the lock is obtained.
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

#ifdef __FreeBSD__
	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	while (cpipe->pipe_state & PIPE_LOCKFL) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = msleep(cpipe, PIPE_MTX(cpipe),
			 catch ? (PRIBIO | PCATCH) : PRIBIO,
			 "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCKFL;
	return (0);
#endif

#ifdef __NetBSD__
	/*
	 * pipe_lock was initialized with PCATCH (see pipe_create()), so
	 * lockmgr() can fail with EINTR/ERESTART; retry when !catch.
	 */
	do {
		error = lockmgr(&cpipe->pipe_lock, LK_EXCLUSIVE, NULL);
	} while (!catch && (error == EINTR || error == ERESTART));
	return (error);
#endif
}
537
/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{

#ifdef __FreeBSD__
	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
	cpipe->pipe_state &= ~PIPE_LOCKFL;
	/* Wake anyone who blocked in pipelock() waiting for the lock. */
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
#endif

#ifdef __NetBSD__
	lockmgr(&cpipe->pipe_lock, LK_RELEASE, NULL);
#endif
}
559
/*
 * Select/poll wakeup. This also sends SIGIO to peer connected to
 * 'sigp' side of pipe.
 *
 * selp - pipe whose select/poll waiters are woken
 * sigp - pipe whose async-I/O owner (if any) receives SIGIO; may be NULL
 */
static __inline void
pipeselwakeup(selp, sigp)
	struct pipe *selp, *sigp;
{
	if (selp->pipe_state & PIPE_SEL) {
		selp->pipe_state &= ~PIPE_SEL;
		selwakeup(&selp->pipe_sel);
	}
#ifdef __FreeBSD__
	if (sigp && (sigp->pipe_state & PIPE_ASYNC) && sigp->pipe_sigio)
		pgsigio(sigp->pipe_sigio, SIGIO, 0);
	KNOTE(&selp->pipe_sel.si_note, 0);
#endif

#ifdef __NetBSD__
	/* pipe_pgid < 0 names a process group, > 0 a single process */
	if (sigp && (sigp->pipe_state & PIPE_ASYNC)
	    && sigp->pipe_pgid != NO_PID){
		struct proc *p;

		if (sigp->pipe_pgid < 0)
			gsignal(-sigp->pipe_pgid, SIGIO);
		else if (sigp->pipe_pgid > 0 && (p = pfind(sigp->pipe_pgid)) != 0)
			psignal(p, SIGIO);
	}
#endif /* NetBSD */
}
590
591 /* ARGSUSED */
592 #ifdef __FreeBSD__
593 static int
594 pipe_read(fp, uio, cred, flags, td)
595 struct file *fp;
596 struct uio *uio;
597 struct ucred *cred;
598 struct thread *td;
599 int flags;
600 struct proc *p;
601 #elif defined(__NetBSD__)
602 static int
603 pipe_read(fp, offset, uio, cred, flags)
604 struct file *fp;
605 off_t *offset;
606 struct uio *uio;
607 struct ucred *cred;
608 int flags;
609 #endif
610 {
611 struct pipe *rpipe = (struct pipe *) fp->f_data;
612 int error;
613 size_t nread = 0;
614 size_t size;
615 size_t ocnt;
616
617 PIPE_LOCK(rpipe);
618 ++rpipe->pipe_busy;
619 error = pipelock(rpipe, 1);
620 if (error)
621 goto unlocked_error;
622
623 ocnt = rpipe->pipe_buffer.cnt;
624
625 while (uio->uio_resid) {
626 /*
627 * normal pipe buffer receive
628 */
629 if (rpipe->pipe_buffer.cnt > 0) {
630 size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
631 if (size > rpipe->pipe_buffer.cnt)
632 size = rpipe->pipe_buffer.cnt;
633 if (size > uio->uio_resid)
634 size = uio->uio_resid;
635
636 PIPE_UNLOCK(rpipe);
637 error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
638 size, uio);
639 PIPE_LOCK(rpipe);
640 if (error)
641 break;
642
643 rpipe->pipe_buffer.out += size;
644 if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
645 rpipe->pipe_buffer.out = 0;
646
647 rpipe->pipe_buffer.cnt -= size;
648
649 /*
650 * If there is no more to read in the pipe, reset
651 * its pointers to the beginning. This improves
652 * cache hit stats.
653 */
654 if (rpipe->pipe_buffer.cnt == 0) {
655 rpipe->pipe_buffer.in = 0;
656 rpipe->pipe_buffer.out = 0;
657 }
658 nread += size;
659 #ifndef PIPE_NODIRECT
660 /*
661 * Direct copy, bypassing a kernel buffer.
662 */
663 } else if ((size = rpipe->pipe_map.cnt) &&
664 (rpipe->pipe_state & PIPE_DIRECTW)) {
665 caddr_t va;
666 if (size > uio->uio_resid)
667 size = uio->uio_resid;
668
669 va = (caddr_t) rpipe->pipe_map.kva +
670 rpipe->pipe_map.pos;
671 PIPE_UNLOCK(rpipe);
672 error = uiomove(va, size, uio);
673 PIPE_LOCK(rpipe);
674 if (error)
675 break;
676 nread += size;
677 rpipe->pipe_map.pos += size;
678 rpipe->pipe_map.cnt -= size;
679 if (rpipe->pipe_map.cnt == 0) {
680 rpipe->pipe_state &= ~PIPE_DIRECTW;
681 wakeup(rpipe);
682 }
683 #endif
684 } else {
685 /*
686 * detect EOF condition
687 * read returns 0 on EOF, no need to set error
688 */
689 if (rpipe->pipe_state & PIPE_EOF)
690 break;
691
692 /*
693 * If the "write-side" has been blocked, wake it up now.
694 */
695 if (rpipe->pipe_state & PIPE_WANTW) {
696 rpipe->pipe_state &= ~PIPE_WANTW;
697 wakeup(rpipe);
698 }
699
700 /*
701 * Break if some data was read.
702 */
703 if (nread > 0)
704 break;
705
706 /*
707 * don't block on non-blocking I/O
708 */
709 if (fp->f_flag & FNONBLOCK) {
710 error = EAGAIN;
711 break;
712 }
713
714 /*
715 * Unlock the pipe buffer for our remaining processing.
716 * We will either break out with an error or we will
717 * sleep and relock to loop.
718 */
719 pipeunlock(rpipe);
720
721 /*
722 * We want to read more, wake up select/poll.
723 */
724 pipeselwakeup(rpipe, rpipe->pipe_peer);
725
726 rpipe->pipe_state |= PIPE_WANTR;
727 #ifdef __FreeBSD__
728 error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
729 "piperd", 0);
730 #else
731 error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0);
732 #endif
733 if (error != 0 || (error = pipelock(rpipe, 1)))
734 goto unlocked_error;
735 }
736 }
737 pipeunlock(rpipe);
738
739 /* XXX: should probably do this before getting any locks. */
740 if (error == 0)
741 vfs_timestamp(&rpipe->pipe_atime);
742 unlocked_error:
743 --rpipe->pipe_busy;
744
745 /*
746 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0.
747 */
748 if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) {
749 rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW);
750 wakeup(rpipe);
751 } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
752 /*
753 * Handle write blocking hysteresis.
754 */
755 if (rpipe->pipe_state & PIPE_WANTW) {
756 rpipe->pipe_state &= ~PIPE_WANTW;
757 wakeup(rpipe);
758 }
759 }
760
761 /*
762 * If anything was read off the buffer, signal to the writer it's
763 * possible to write more data. Also send signal if we are here for the
764 * first time after last write.
765 */
766 if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF
767 && (ocnt != rpipe->pipe_buffer.cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
768 pipeselwakeup(rpipe, rpipe->pipe_peer);
769 rpipe->pipe_state &= ~PIPE_SIGNALR;
770 }
771
772 PIPE_UNLOCK(rpipe);
773 return (error);
774 }
775
776 #ifdef __FreeBSD__
777 #ifndef PIPE_NODIRECT
/*
 * Map the sending processes' buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 *
 * Returns 0 on success or EFAULT if a source page cannot be faulted in,
 * in which case any pages already wired are unwired again.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	size_t size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	GIANT_REQUIRED;
	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);

	/* Never map more than one pipe buffer's worth at a time. */
	size = uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
		vm_page_t m;

		/*
		 * Fault the page in (it may not be resident) and get its
		 * physical address; on failure, unwire everything wired
		 * so far and bail.
		 */
		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
		    (paddr = pmap_kextract(addr)) == 0) {
			int j;

			for (j = 0; j < i; j++)
				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
			return (EFAULT);
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and map the buffer
	 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
			wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
		wpipe->pipe_map.npages);

	/*
	 * and update the uio data
	 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}
852
/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	GIANT_REQUIRED;
	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);

	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		/*
		 * Give the kva back only when pipe kva usage exceeds the
		 * soft limit; otherwise keep it cached for the next
		 * direct write on this pipe.
		 */
		if (amountpipekva > maxpipekva) {
			vm_offset_t kva = wpipe->pipe_map.kva;
			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
		}
	}
	for (i = 0; i < wpipe->pipe_map.npages; i++)
		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
	wpipe->pipe_map.npages = 0;
}
880
/*
 * In the case of a signal, the writing process might go away. This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int size;
	int pos;

	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;
	memcpy((caddr_t) wpipe->pipe_buffer.buffer,
	    (caddr_t) wpipe->pipe_map.kva + pos, size);

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	/* No longer a direct write: the data now lives in the ring buffer. */
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	PIPE_GET_GIANT(wpipe);
	pipe_destroy_write_buffer(wpipe);
	PIPE_DROP_GIANT(wpipe);
}
908
/*
 * This implements the pipe buffer write mechanism. Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer. Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
	/* Wait for any previous direct write to be fully consumed. */
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	/* Wait until the reader has drained the regular buffer. */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = msleep(wpipe, PIPE_MTX(wpipe),
		    PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	/* Building the wired buffer requires Giant; drop the pipe mutex. */
	PIPE_GET_GIANT(wpipe);
	error = pipe_build_write_buffer(wpipe, uio);
	PIPE_DROP_GIANT(wpipe);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	/* Sleep until the reader consumes the mapped data (or EOF/signal). */
	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			PIPE_GET_GIANT(wpipe);
			pipe_destroy_write_buffer(wpipe);
			PIPE_DROP_GIANT(wpipe);
			pipeunlock(wpipe);
			pipeselwakeup(wpipe, wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe, wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
		    "pipdwt", 0);
	}

	pipelock(wpipe,0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		PIPE_GET_GIANT(wpipe);
		pipe_destroy_write_buffer(wpipe);
		PIPE_DROP_GIANT(wpipe);
	}
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
1009 #endif /* !PIPE_NODIRECT */
1010 #endif /* FreeBSD */
1011
1012 #ifdef __NetBSD__
1013 #ifndef PIPE_NODIRECT
1014 /*
1015 * Allocate structure for loan transfer.
1016 */
1017 static int
1018 pipe_loan_alloc(wpipe, npages)
1019 struct pipe *wpipe;
1020 int npages;
1021 {
1022 vsize_t len;
1023
1024 len = (vsize_t)npages << PAGE_SHIFT;
1025 wpipe->pipe_map.kva = uvm_km_valloc_wait(kernel_map, len);
1026 if (wpipe->pipe_map.kva == 0)
1027 return (ENOMEM);
1028
1029 amountpipekva += len;
1030 wpipe->pipe_map.npages = npages;
1031 wpipe->pipe_map.pgs = malloc(npages * sizeof(struct vm_page *), M_PIPE,
1032 M_WAITOK);
1033 return (0);
1034 }
1035
1036 /*
1037 * Free resources allocated for loan transfer.
1038 */
1039 static void
1040 pipe_loan_free(wpipe)
1041 struct pipe *wpipe;
1042 {
1043 vsize_t len;
1044
1045 len = (vsize_t)wpipe->pipe_map.npages << PAGE_SHIFT;
1046 uvm_km_free(kernel_map, wpipe->pipe_map.kva, len);
1047 wpipe->pipe_map.kva = 0;
1048 amountpipekva -= len;
1049 free(wpipe->pipe_map.pgs, M_PIPE);
1050 wpipe->pipe_map.pgs = NULL;
1051 }
1052
/*
 * NetBSD direct write, using uvm_loan() mechanism.
 * This implements the pipe buffer write mechanism. Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer. Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error, npages, j;
	struct vm_page **pgs;
	vaddr_t bbase, kva, base, bend;
	vsize_t blen, bcnt;
	voff_t bpos;

retry:
	/* Wait for any previous direct write to be fully consumed. */
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	/* Wait until the reader has drained the regular buffer. */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error;
		}
		goto retry;
	}

	/*
	 * Handle first PIPE_CHUNK_SIZE bytes of buffer. Deal with buffers
	 * not aligned to PAGE_SIZE.
	 */
	bbase = (vaddr_t)uio->uio_iov->iov_base;
	base = trunc_page(bbase);
	bend = round_page(bbase + uio->uio_iov->iov_len);
	blen = bend - base;		/* page-aligned span to loan */
	bpos = bbase - base;		/* offset of data in first page */

	if (blen > PIPE_DIRECT_CHUNK) {
		blen = PIPE_DIRECT_CHUNK;
		bend = base + blen;
		bcnt = PIPE_DIRECT_CHUNK - bpos;
	} else {
		bcnt = uio->uio_iov->iov_len;
	}
	npages = blen >> PAGE_SHIFT;

	wpipe->pipe_map.pos = bpos;
	wpipe->pipe_map.cnt = bcnt;

	/*
	 * Free the old kva if we need more pages than we have
	 * allocated.
	 */
	if (wpipe->pipe_map.kva && npages > wpipe->pipe_map.npages)
		pipe_loan_free(wpipe);

	/* Allocate new kva. */
	if (wpipe->pipe_map.kva == 0) {
		error = pipe_loan_alloc(wpipe, npages);
		if (error) {
			goto error;
		}
	}

	/* Loan the write buffer memory from writer process */
	pgs = wpipe->pipe_map.pgs;
	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, base, blen,
	    pgs, UVM_LOAN_TOPAGE);
	if (error) {
		/* pgs == NULL tells the cleanup path the loan never happened */
		pgs = NULL;
		goto cleanup;
	}

	/* Enter the loaned pages to kva */
	kva = wpipe->pipe_map.kva;
	for (j = 0; j < npages; j++, kva += PAGE_SIZE) {
		pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ);
	}
	pmap_update(pmap_kernel());

	/* Now wait for the reader to consume the mapped data. */
	wpipe->pipe_state |= PIPE_DIRECTW;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe, wpipe);
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
	}

	if (error)
		wpipe->pipe_state &= ~PIPE_DIRECTW;

cleanup:
	pipelock(wpipe, 0);
	if (pgs != NULL) {
		pmap_kremove(wpipe->pipe_map.kva, blen);
		uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE);
	}
	if (error || amountpipekva > maxpipekva)
		pipe_loan_free(wpipe);
	pipeunlock(wpipe);

	if (error) {
		pipeselwakeup(wpipe, wpipe);

		/*
		 * If nothing was read from what we offered, return error
		 * straight on. Otherwise update uio resid first. Caller
		 * will deal with the error condition, returning short
		 * write, error, or restarting the write(2) as appropriate.
		 *
		 * NOTE: the `error' label below is deliberately placed
		 * inside this conditional -- the early `goto error' jumps
		 * (taken before any data could have been consumed) land
		 * here and return without touching the uio.
		 */
		if (wpipe->pipe_map.cnt == bcnt) {
 error:
			wakeup(wpipe);
			return (error);
		}

		/* Partial consumption: account only what was read. */
		bcnt -= wpipe->pipe_map.cnt;
	}

	uio->uio_resid -= bcnt;
	/* uio_offset not updated, not set/used for write(2) */
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + bcnt;
	uio->uio_iov->iov_len -= bcnt;
	if (uio->uio_iov->iov_len == 0) {
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	return (error);
}
1214 #endif /* !PIPE_NODIRECT */
1215 #endif /* NetBSD */
1216
1217 #ifdef __FreeBSD__
1218 static int
1219 pipe_write(fp, uio, cred, flags, td)
1220 struct file *fp;
1221 off_t *offset;
1222 struct uio *uio;
1223 struct ucred *cred;
1224 int flags;
1225 struct thread *td;
1226 #elif defined(__NetBSD__)
1227 static int
1228 pipe_write(fp, offset, uio, cred, flags)
1229 struct file *fp;
1230 off_t *offset;
1231 struct uio *uio;
1232 struct ucred *cred;
1233 int flags;
1234 #endif
1235 {
1236 int error = 0;
1237 struct pipe *wpipe, *rpipe;
1238
1239 rpipe = (struct pipe *) fp->f_data;
1240 wpipe = rpipe->pipe_peer;
1241
1242 PIPE_LOCK(rpipe);
1243 /*
1244 * detect loss of pipe read side, issue SIGPIPE if lost.
1245 */
1246 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1247 PIPE_UNLOCK(rpipe);
1248 return (EPIPE);
1249 }
1250
1251 ++wpipe->pipe_busy;
1252
1253 /*
1254 * If it is advantageous to resize the pipe buffer, do
1255 * so.
1256 */
1257 if ((uio->uio_resid > PIPE_SIZE) &&
1258 (nbigpipe < maxbigpipes) &&
1259 #ifndef PIPE_NODIRECT
1260 (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
1261 #endif
1262 (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
1263 (wpipe->pipe_buffer.cnt == 0)) {
1264
1265 if ((error = pipelock(wpipe,1)) == 0) {
1266 PIPE_GET_GIANT(rpipe);
1267 if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
1268 nbigpipe++;
1269 PIPE_DROP_GIANT(rpipe);
1270 pipeunlock(wpipe);
1271 } else {
1272 /*
1273 * If an error occurred, unbusy and return, waking up
1274 * any waiting readers.
1275 */
1276 --wpipe->pipe_busy;
1277 if (wpipe->pipe_busy == 0
1278 && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1279 wpipe->pipe_state &=
1280 ~(PIPE_WANTCLOSE | PIPE_WANTR);
1281 wakeup(wpipe);
1282 }
1283
1284 return (error);
1285 }
1286 }
1287
1288 #ifdef __FreeBSD__
1289 /*
1290 * If an early error occured unbusy and return, waking up any pending
1291 * readers.
1292 */
1293 if (error) {
1294 --wpipe->pipe_busy;
1295 if ((wpipe->pipe_busy == 0) &&
1296 (wpipe->pipe_state & PIPE_WANT)) {
1297 wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1298 wakeup(wpipe);
1299 }
1300 PIPE_UNLOCK(rpipe);
1301 return(error);
1302 }
1303
1304 KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
1305 #endif
1306
1307 while (uio->uio_resid) {
1308 int space;
1309
1310 #ifndef PIPE_NODIRECT
1311 /*
1312 * If the transfer is large, we can gain performance if
1313 * we do process-to-process copies directly.
1314 * If the write is non-blocking, we don't use the
1315 * direct write mechanism.
1316 *
1317 * The direct write mechanism will detect the reader going
1318 * away on us.
1319 */
1320 if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
1321 (fp->f_flag & FNONBLOCK) == 0 &&
1322 (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
1323 error = pipe_direct_write(wpipe, uio);
1324
1325 /*
1326 * Break out if error occured, unless it's ENOMEM.
1327 * ENOMEM means we failed to allocate some resources
1328 * for direct write, so we just fallback to ordinary
1329 * write. If the direct write was successful,
1330 * process rest of data via ordinary write.
1331 */
1332 if (!error)
1333 continue;
1334
1335 if (error != ENOMEM)
1336 break;
1337 }
1338 #endif /* PIPE_NODIRECT */
1339
1340 /*
1341 * Pipe buffered writes cannot be coincidental with
1342 * direct writes. We wait until the currently executing
1343 * direct write is completed before we start filling the
1344 * pipe buffer. We break out if a signal occurs or the
1345 * reader goes away.
1346 */
1347 retrywrite:
1348 while (wpipe->pipe_state & PIPE_DIRECTW) {
1349 if (wpipe->pipe_state & PIPE_WANTR) {
1350 wpipe->pipe_state &= ~PIPE_WANTR;
1351 wakeup(wpipe);
1352 }
1353 #ifdef __FreeBSD__
1354 error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
1355 "pipbww", 0);
1356 #else
1357 error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
1358 #endif
1359 if (wpipe->pipe_state & PIPE_EOF)
1360 break;
1361 if (error)
1362 break;
1363 }
1364 if (wpipe->pipe_state & PIPE_EOF) {
1365 error = EPIPE;
1366 break;
1367 }
1368
1369 space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1370
1371 /* Writes of size <= PIPE_BUF must be atomic. */
1372 if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF))
1373 space = 0;
1374
1375 if (space > 0) {
1376 int size; /* Transfer size */
1377 int segsize; /* first segment to transfer */
1378
1379 if ((error = pipelock(wpipe,1)) != 0)
1380 break;
1381
1382 /*
1383 * It is possible for a direct write to
1384 * slip in on us... handle it here...
1385 */
1386 if (wpipe->pipe_state & PIPE_DIRECTW) {
1387 pipeunlock(wpipe);
1388 goto retrywrite;
1389 }
1390 /*
1391 * If a process blocked in uiomove, our
1392 * value for space might be bad.
1393 *
1394 * XXX will we be ok if the reader has gone
1395 * away here?
1396 */
1397 if (space > wpipe->pipe_buffer.size -
1398 wpipe->pipe_buffer.cnt) {
1399 pipeunlock(wpipe);
1400 goto retrywrite;
1401 }
1402
1403 /*
1404 * Transfer size is minimum of uio transfer
1405 * and free space in pipe buffer.
1406 */
1407 if (space > uio->uio_resid)
1408 size = uio->uio_resid;
1409 else
1410 size = space;
1411 /*
1412 * First segment to transfer is minimum of
1413 * transfer size and contiguous space in
1414 * pipe buffer. If first segment to transfer
1415 * is less than the transfer size, we've got
1416 * a wraparound in the buffer.
1417 */
1418 segsize = wpipe->pipe_buffer.size -
1419 wpipe->pipe_buffer.in;
1420 if (segsize > size)
1421 segsize = size;
1422
1423 /* Transfer first segment */
1424
1425 PIPE_UNLOCK(rpipe);
1426 error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1427 segsize, uio);
1428 PIPE_LOCK(rpipe);
1429
1430 if (error == 0 && segsize < size) {
1431 /*
1432 * Transfer remaining part now, to
1433 * support atomic writes. Wraparound
1434 * happened.
1435 */
1436 #ifdef DEBUG
1437 if (wpipe->pipe_buffer.in + segsize !=
1438 wpipe->pipe_buffer.size)
1439 panic("Expected pipe buffer wraparound disappeared");
1440 #endif
1441
1442 PIPE_UNLOCK(rpipe);
1443 error = uiomove(&wpipe->pipe_buffer.buffer[0],
1444 size - segsize, uio);
1445 PIPE_LOCK(rpipe);
1446 }
1447 if (error == 0) {
1448 wpipe->pipe_buffer.in += size;
1449 if (wpipe->pipe_buffer.in >=
1450 wpipe->pipe_buffer.size) {
1451 #ifdef DEBUG
1452 if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
1453 panic("Expected wraparound bad");
1454 #endif
1455 wpipe->pipe_buffer.in = size - segsize;
1456 }
1457
1458 wpipe->pipe_buffer.cnt += size;
1459 #ifdef DEBUG
1460 if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
1461 panic("Pipe buffer overflow");
1462 #endif
1463 }
1464 pipeunlock(wpipe);
1465 if (error)
1466 break;
1467 } else {
1468 /*
1469 * If the "read-side" has been blocked, wake it up now.
1470 */
1471 if (wpipe->pipe_state & PIPE_WANTR) {
1472 wpipe->pipe_state &= ~PIPE_WANTR;
1473 wakeup(wpipe);
1474 }
1475
1476 /*
1477 * don't block on non-blocking I/O
1478 */
1479 if (fp->f_flag & FNONBLOCK) {
1480 error = EAGAIN;
1481 break;
1482 }
1483
1484 /*
1485 * We have no more space and have something to offer,
1486 * wake up select/poll.
1487 */
1488 pipeselwakeup(wpipe, wpipe);
1489
1490 wpipe->pipe_state |= PIPE_WANTW;
1491 #ifdef __FreeBSD__
1492 error = msleep(wpipe, PIPE_MTX(rpipe),
1493 PRIBIO | PCATCH, "pipewr", 0);
1494 #else
1495 error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0);
1496 #endif
1497 if (error != 0)
1498 break;
1499 /*
1500 * If read side wants to go away, we just issue a signal
1501 * to ourselves.
1502 */
1503 if (wpipe->pipe_state & PIPE_EOF) {
1504 error = EPIPE;
1505 break;
1506 }
1507 }
1508 }
1509
1510 --wpipe->pipe_busy;
1511 if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1512 wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
1513 wakeup(wpipe);
1514 } else if (wpipe->pipe_buffer.cnt > 0) {
1515 /*
1516 * If we have put any characters in the buffer, we wake up
1517 * the reader.
1518 */
1519 if (wpipe->pipe_state & PIPE_WANTR) {
1520 wpipe->pipe_state &= ~PIPE_WANTR;
1521 wakeup(wpipe);
1522 }
1523 }
1524
1525 /*
1526 * Don't return EPIPE if I/O was successful
1527 */
1528 if ((error == EPIPE) && (wpipe->pipe_buffer.cnt == 0)
1529 && (uio->uio_resid == 0))
1530 error = 0;
1531
1532 if (error == 0)
1533 vfs_timestamp(&wpipe->pipe_mtime);
1534
1535 /*
1536 * We have something to offer, wake up select/poll.
1537 * wpipe->pipe_map.cnt is always 0 in this point (direct write
1538 * is only done synchronously), so check only wpipe->pipe_buffer.cnt
1539 */
1540 if (wpipe->pipe_buffer.cnt)
1541 pipeselwakeup(wpipe, wpipe);
1542
1543 /*
1544 * Arrange for next read(2) to do a signal.
1545 */
1546 wpipe->pipe_state |= PIPE_SIGNALR;
1547
1548 PIPE_UNLOCK(rpipe);
1549 return (error);
1550 }
1551
1552 /*
1553 * we implement a very minimal set of ioctls for compatibility with sockets.
1554 */
1555 int
1556 #ifdef __FreeBSD__
1557 pipe_ioctl(fp, cmd, data, td)
1558 struct file *fp;
1559 u_long cmd;
1560 caddr_t data;
1561 struct thread *td;
1562 #else
1563 pipe_ioctl(fp, cmd, data, p)
1564 struct file *fp;
1565 u_long cmd;
1566 caddr_t data;
1567 struct proc *p;
1568 #endif
1569 {
1570 struct pipe *mpipe = (struct pipe *)fp->f_data;
1571
1572 switch (cmd) {
1573
1574 case FIONBIO:
1575 return (0);
1576
1577 case FIOASYNC:
1578 PIPE_LOCK(mpipe);
1579 if (*(int *)data) {
1580 mpipe->pipe_state |= PIPE_ASYNC;
1581 } else {
1582 mpipe->pipe_state &= ~PIPE_ASYNC;
1583 }
1584 PIPE_UNLOCK(mpipe);
1585 return (0);
1586
1587 case FIONREAD:
1588 PIPE_LOCK(mpipe);
1589 #ifndef PIPE_NODIRECT
1590 if (mpipe->pipe_state & PIPE_DIRECTW)
1591 *(int *)data = mpipe->pipe_map.cnt;
1592 else
1593 #endif
1594 *(int *)data = mpipe->pipe_buffer.cnt;
1595 PIPE_UNLOCK(mpipe);
1596 return (0);
1597
1598 #ifdef __FreeBSD__
1599 case FIOSETOWN:
1600 return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1601
1602 case FIOGETOWN:
1603 *(int *)data = fgetown(mpipe->pipe_sigio);
1604 return (0);
1605
1606 /* This is deprecated, FIOSETOWN should be used instead. */
1607 case TIOCSPGRP:
1608 return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1609
1610 /* This is deprecated, FIOGETOWN should be used instead. */
1611 case TIOCGPGRP:
1612 *(int *)data = -fgetown(mpipe->pipe_sigio);
1613 return (0);
1614 #endif /* FreeBSD */
1615 #ifdef __NetBSD__
1616 case TIOCSPGRP:
1617 mpipe->pipe_pgid = *(int *)data;
1618 return (0);
1619
1620 case TIOCGPGRP:
1621 *(int *)data = mpipe->pipe_pgid;
1622 return (0);
1623 #endif /* NetBSD */
1624
1625 }
1626 return (EPASSTHROUGH);
1627 }
1628
/*
 * pipe_poll: poll(2)/select(2) entry point for a pipe read endpoint.
 * Computes readiness from the read side (rpipe) and its peer write
 * side (wpipe); records the selector when nothing is ready yet.
 */
int
#ifdef __FreeBSD__
pipe_poll(fp, events, cred, td)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct thread *td;
#elif defined(__NetBSD__)
pipe_poll(fp, events, td)
	struct file *fp;
	int events;
	struct proc *td;
#endif
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	PIPE_LOCK(rpipe);
	/* Readable: buffered data, an in-progress direct write, or EOF. */
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_buffer.cnt > 0) ||
#ifndef PIPE_NODIRECT
		    (rpipe->pipe_state & PIPE_DIRECTW) ||
#endif
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	/*
	 * Writable: peer gone/EOF, or (no direct write pending and)
	 * at least PIPE_BUF bytes of buffer space free, so a small
	 * write is guaranteed not to block.
	 */
	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)
		    || (
#ifndef PIPE_NODIRECT
		     ((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
#endif
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(td, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		/*
		 * revents == 0 implies wpipe != NULL here: a NULL peer
		 * would have set POLLHUP above.
		 */
		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}
	PIPE_UNLOCK(rpipe);

	return (revents);
}
1686
1687 static int
1688 #ifdef __FreeBSD__
1689 pipe_stat(fp, ub, td)
1690 struct file *fp;
1691 struct stat *ub;
1692 struct thread *td;
1693 #else
1694 pipe_stat(fp, ub, td)
1695 struct file *fp;
1696 struct stat *ub;
1697 struct proc *td;
1698 #endif
1699 {
1700 struct pipe *pipe = (struct pipe *)fp->f_data;
1701
1702 memset((caddr_t)ub, 0, sizeof(*ub));
1703 ub->st_mode = S_IFIFO;
1704 ub->st_blksize = pipe->pipe_buffer.size;
1705 ub->st_size = pipe->pipe_buffer.cnt;
1706 ub->st_blocks = (ub->st_size) ? 1 : 0;
1707 #ifdef __FreeBSD__
1708 ub->st_atimespec = pipe->pipe_atime;
1709 ub->st_mtimespec = pipe->pipe_mtime;
1710 ub->st_ctimespec = pipe->pipe_ctime;
1711 #endif /* FreeBSD */
1712 #ifdef __NetBSD__
1713 TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec)
1714 TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
1715 TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
1716 #endif /* NetBSD */
1717 ub->st_uid = fp->f_cred->cr_uid;
1718 ub->st_gid = fp->f_cred->cr_gid;
1719 /*
1720 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1721 * XXX (st_dev, st_ino) should be unique.
1722 */
1723 return (0);
1724 }
1725
1726 /* ARGSUSED */
1727 static int
1728 #ifdef __FreeBSD__
1729 pipe_close(fp, td)
1730 struct file *fp;
1731 struct thread *td;
1732 #else
1733 pipe_close(fp, td)
1734 struct file *fp;
1735 struct proc *td;
1736 #endif
1737 {
1738 struct pipe *cpipe = (struct pipe *)fp->f_data;
1739
1740 #ifdef __FreeBSD__
1741 fp->f_ops = &badfileops;
1742 funsetown(cpipe->pipe_sigio);
1743 #endif
1744 fp->f_data = NULL;
1745 pipeclose(cpipe);
1746 return (0);
1747 }
1748
/*
 * pipe_free_kmem: release the kernel memory held by a pipe endpoint —
 * the ring buffer and (when direct writes are enabled) the direct-write
 * kva/page map — and update the global accounting (amountpipekva,
 * nbigpipe) accordingly.
 */
static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

#ifdef __FreeBSD__

	GIANT_REQUIRED;
	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
	       ("pipespace: pipe mutex locked"));
#endif

	if (cpipe->pipe_buffer.buffer != NULL) {
		/* Buffers above PIPE_SIZE count against the big-pipe quota. */
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			--nbigpipe;
		amountpipekva -= cpipe->pipe_buffer.size;
#ifdef __FreeBSD__
		kmem_free(kernel_map,
			(vm_offset_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
#elif defined(__NetBSD__)
		uvm_km_free(kernel_map,
			(vaddr_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
#endif /* NetBSD */
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (cpipe->pipe_map.kva != 0) {
#ifdef __FreeBSD__
		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
		kmem_free(kernel_map,
			cpipe->pipe_map.kva,
			cpipe->pipe_buffer.size + PAGE_SIZE);
#elif defined(__NetBSD__)
		/* pipe_loan_free() drops the kva and its accounting. */
		pipe_loan_free(cpipe);
#endif /* NetBSD */
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.kva = 0;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif /* !PIPE_NODIRECT */
}
1793
1794 /*
1795 * shutdown the pipe
1796 */
1797 static void
1798 pipeclose(cpipe)
1799 struct pipe *cpipe;
1800 {
1801 struct pipe *ppipe;
1802 #ifdef __FreeBSD__
1803 int hadpeer = 0;
1804 #endif
1805
1806 if (cpipe == NULL)
1807 return;
1808
1809 /* partially created pipes won't have a valid mutex. */
1810 if (PIPE_MTX(cpipe) != NULL)
1811 PIPE_LOCK(cpipe);
1812
1813 pipeselwakeup(cpipe, cpipe);
1814
1815 /*
1816 * If the other side is blocked, wake it up saying that
1817 * we want to close it down.
1818 */
1819 while (cpipe->pipe_busy) {
1820 wakeup(cpipe);
1821 cpipe->pipe_state |= PIPE_WANTCLOSE | PIPE_EOF;
1822 #ifdef __FreeBSD__
1823 msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1824 #else
1825 tsleep(cpipe, PRIBIO, "pipecl", 0);
1826 #endif
1827 }
1828
1829 /*
1830 * Disconnect from peer
1831 */
1832 if ((ppipe = cpipe->pipe_peer) != NULL) {
1833 #ifdef __FreeBSD__
1834 hadpeer++;
1835 #endif
1836 pipeselwakeup(ppipe, ppipe);
1837
1838 ppipe->pipe_state |= PIPE_EOF;
1839 wakeup(ppipe);
1840 #ifdef __FreeBSD__
1841 KNOTE(&ppipe->pipe_sel.si_note, 0);
1842 #endif
1843 ppipe->pipe_peer = NULL;
1844 }
1845 /*
1846 * free resources
1847 */
1848 #ifdef __FreeBSD__
1849 if (PIPE_MTX(cpipe) != NULL) {
1850 PIPE_UNLOCK(cpipe);
1851 if (!hadpeer) {
1852 mtx_destroy(PIPE_MTX(cpipe));
1853 free(PIPE_MTX(cpipe), M_TEMP);
1854 }
1855 }
1856 mtx_lock(&Giant);
1857 pipe_free_kmem(cpipe);
1858 zfree(pipe_zone, cpipe);
1859 mtx_unlock(&Giant);
1860 #endif
1861
1862 #ifdef __NetBSD__
1863 if (PIPE_MTX(cpipe) != NULL)
1864 PIPE_UNLOCK(cpipe);
1865
1866 pipe_free_kmem(cpipe);
1867 (void) lockmgr(&cpipe->pipe_lock, LK_DRAIN, NULL);
1868 pool_put(&pipe_pool, cpipe);
1869 #endif
1870 }
1871
#ifdef __FreeBSD__
/*
 * pipe_kqfilter: attach a kqueue knote to a pipe.  Read filters hang
 * off this endpoint's klist; write filters hang off the peer's.
 * Returns 1 (error) for unsupported filter types.
 */
/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe;

	cpipe = (struct pipe *)kn->kn_fp->f_data;
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		/* Write readiness is a property of the peer (write) side. */
		/* NOTE(review): pipe_peer may be NULL if the peer already
		 * closed — confirm callers cannot reach this case. */
		cpipe = cpipe->pipe_peer;
		break;
	default:
		return (1);
	}
	kn->kn_hook = (caddr_t)cpipe;

	PIPE_LOCK(cpipe);
	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
	PIPE_UNLOCK(cpipe);
	return (0);
}
1898
1899 static void
1900 filt_pipedetach(struct knote *kn)
1901 {
1902 struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1903
1904 PIPE_LOCK(cpipe);
1905 SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1906 PIPE_UNLOCK(cpipe);
1907 }
1908
1909 /*ARGSUSED*/
1910 static int
1911 filt_piperead(struct knote *kn, long hint)
1912 {
1913 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1914 struct pipe *wpipe = rpipe->pipe_peer;
1915
1916 PIPE_LOCK(rpipe);
1917 kn->kn_data = rpipe->pipe_buffer.cnt;
1918 if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1919 kn->kn_data = rpipe->pipe_map.cnt;
1920
1921 if ((rpipe->pipe_state & PIPE_EOF) ||
1922 (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1923 kn->kn_flags |= EV_EOF;
1924 PIPE_UNLOCK(rpipe);
1925 return (1);
1926 }
1927 PIPE_UNLOCK(rpipe);
1928 return (kn->kn_data > 0);
1929 }
1930
1931 /*ARGSUSED*/
1932 static int
1933 filt_pipewrite(struct knote *kn, long hint)
1934 {
1935 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1936 struct pipe *wpipe = rpipe->pipe_peer;
1937
1938 PIPE_LOCK(rpipe);
1939 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1940 kn->kn_data = 0;
1941 kn->kn_flags |= EV_EOF;
1942 PIPE_UNLOCK(rpipe);
1943 return (1);
1944 }
1945 kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1946 if (wpipe->pipe_state & PIPE_DIRECTW)
1947 kn->kn_data = 0;
1948
1949 PIPE_UNLOCK(rpipe);
1950 return (kn->kn_data >= PIPE_BUF);
1951 }
1952 #endif /* FreeBSD */
1953
1954 #ifdef __NetBSD__
1955 static int
1956 pipe_fcntl(fp, cmd, data, p)
1957 struct file *fp;
1958 u_int cmd;
1959 caddr_t data;
1960 struct proc *p;
1961 {
1962 if (cmd == F_SETFL)
1963 return (0);
1964 else
1965 return (EOPNOTSUPP);
1966 }
1967
1968 /*
1969 * Handle pipe sysctls.
1970 */
1971 int
1972 sysctl_dopipe(name, namelen, oldp, oldlenp, newp, newlen)
1973 int *name;
1974 u_int namelen;
1975 void *oldp;
1976 size_t *oldlenp;
1977 void *newp;
1978 size_t newlen;
1979 {
1980 /* All sysctl names at this level are terminal. */
1981 if (namelen != 1)
1982 return (ENOTDIR); /* overloaded */
1983
1984 switch (name[0]) {
1985 case KERN_PIPE_MAXKVASZ:
1986 return (sysctl_int(oldp, oldlenp, newp, newlen, &maxpipekva));
1987 case KERN_PIPE_LIMITKVA:
1988 return (sysctl_int(oldp, oldlenp, newp, newlen, &limitpipekva));
1989 case KERN_PIPE_MAXBIGPIPES:
1990 return (sysctl_int(oldp, oldlenp, newp, newlen, &maxbigpipes));
1991 case KERN_PIPE_NBIGPIPES:
1992 return (sysctl_rdint(oldp, oldlenp, newp, nbigpipe));
1993 case KERN_PIPE_KVASIZE:
1994 return (sysctl_rdint(oldp, oldlenp, newp, amountpipekva));
1995 default:
1996 return (EOPNOTSUPP);
1997 }
1998 /* NOTREACHED */
1999 }
2000
2001 /*
2002 * Initialize pipe structs.
2003 */
2004 void
2005 pipe_init(void)
2006 {
2007 pool_init(&pipe_pool, sizeof(struct pipe), 0, 0, 0, "pipepl", NULL);
2008 }
2009
2010 #endif /* __NetBSD __ */
2011