/* $NetBSD: sys_pipe.c,v 1.3 2001/06/21 18:46:22 jdolecek Exp $ */

/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.82 2001/06/15 20:45:01 jlemon Exp $
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 *
 * Adaptation for NetBSD UVM, including the uvm_loan() based direct write,
 * was written by Jaromir Dolecek.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped into the kernel (on FreeBSD,
 * those pages are also wired), and the receiving process can copy it directly
 * from the pages in the sending process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * on FreeBSD arranges to copy the buffer supplied by the user process
 * to a pageable kernel buffer, and the receiving process will grab the data
 * from the pageable kernel buffer.  Since signals don't happen all that
 * often, the copy operation is normally eliminated.
 * On NetBSD, the pages are loaned read-only (copy-on-write for the kernel)
 * by uvm_loan(), so no explicit handling needs to be done; everything is
 * handled by the standard VM facilities.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */
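
/*
 * Illustrative sketch (comment only, not compiled): a restatement of how
 * pipe_write() below picks between the two modes.  The authoritative
 * condition lives in pipe_write(); this is just a reader's summary.
 *
 *	if (uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
 *	    (fp->f_flag & FNONBLOCK) == 0 &&
 *	    (wpipe->pipe_map.kva || amountpipekva < limitpipekva))
 *		pipe_direct_write(wpipe, uio);	(large blocking write)
 *	else
 *		copy through wpipe->pipe_buffer	(small or non-blocking write)
 */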

#ifdef __NetBSD__
#include "opt_new_pipe.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/lock.h>
#ifdef __FreeBSD__
#include <sys/mutex.h>
#include <sys/selinfo.h>
#include <sys/sysproto.h>
#elif defined(__NetBSD__)
#include <sys/select.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <uvm/uvm.h>
#include <sys/sysctl.h>
#endif /* NetBSD, FreeBSD */

#include <sys/pipe.h>

#ifdef __NetBSD__
#define vfs_timestamp(tv)	microtime(tv)
#endif

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
#ifdef __FreeBSD__
static int pipe_read __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_write __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
		struct proc *p));
static int pipe_kqfilter __P((struct file *fp, struct knote *kn));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));

static struct fileops pipeops = {
	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
	pipe_stat, pipe_close
};

static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };
#endif /* FreeBSD */

#ifdef __NetBSD__
static int pipe_read __P((struct file *fp, off_t *offset, struct uio *uio,
		struct ucred *cred, int flags));
static int pipe_write __P((struct file *fp, off_t *offset, struct uio *uio,
		struct ucred *cred, int flags));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct proc *p));
static int pipe_fcntl __P((struct file *fp, u_int com, caddr_t data,
		struct proc *p));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));

static struct fileops pipeops =
    { pipe_read, pipe_write, pipe_ioctl, pipe_fcntl, pipe_poll,
      pipe_stat, pipe_close };
#endif /* NetBSD */

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)
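
/*
 * Worked example (assuming the usual PIPE_SIZE of 16384 from <sys/pipe.h>;
 * check that header for the value actually in effect):
 *
 *	MINPIPESIZE = 16384 / 3     = 5461 bytes
 *	MAXPIPESIZE = 2 * 16384 / 3 = 10922 bytes
 *
 * pipe_read() below uses MINPIPESIZE as the hysteresis point: a blocked
 * writer is woken up once the buffer drains below it.
 */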

/*
 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA (8*1024*1024)
static int maxpipekva = MAXPIPEKVA;

/*
 * Limit for direct transfers; we cannot, of course, limit
 * the amount of kva for pipes in general.
 */
#define LIMITPIPEKVA (16*1024*1024)
static int limitpipekva = LIMITPIPEKVA;

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
static int maxbigpipes = LIMITBIGPIPES;
static int nbigpipe = 0;

/*
 * Amount of KVA consumed by pipe buffers.
 */
static int amountpipekva = 0;

static void pipeclose __P((struct pipe *cpipe));
static void pipe_free_kmem __P((struct pipe *cpipe));
static int pipe_create __P((struct pipe **cpipep));
static __inline int pipelock __P((struct pipe *cpipe, int catch));
static __inline void pipeunlock __P((struct pipe *cpipe));
static __inline void pipeselwakeup __P((struct pipe *selp,
		struct pipe *sigp));
static int pipespace __P((struct pipe *cpipe, int size));

#ifdef __FreeBSD__
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
static void pipe_clone_write_buffer __P((struct pipe *wpipe));
#endif

static vm_zone_t pipe_zone;
#endif /* FreeBSD */

#ifdef __NetBSD__
#ifndef PIPE_NODIRECT
static __inline int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
static __inline int pipe_loan_alloc __P((struct pipe *wpipe, int npages,
		vsize_t blen));
static void pipe_loan_free __P((struct pipe *wpipe));
#endif /* PIPE_NODIRECT */

static struct pool pipe_pool;
#endif /* NetBSD */

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */
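
/*
 * Userland usage sketch, standard pipe(2) semantics, included here only
 * for orientation (fds[0] is the read end, fds[1] the write end; the two
 * descriptors are handed back through retval[0]/retval[1] below):
 *
 *	int fds[2];
 *	if (pipe(fds) == -1)
 *		err(1, "pipe");
 */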

/* ARGSUSED */
#ifdef __FreeBSD__
int
pipe(p, uap)
	struct proc *p;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
#elif defined(__NetBSD__)
int
sys_pipe(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
#endif
{
	struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd, error;

#ifdef __FreeBSD__
	if (pipe_zone == NULL)
		pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);
#endif

	rpipe = wpipe = NULL;
	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (ENFILE);
	}

#ifdef __FreeBSD__
	error = falloc(p, &rf, &fd);
	if (error) {
		pipeclose(rpipe);
		pipeclose(wpipe);
		return (error);
	}
	fhold(rf);
	p->p_retval[0] = fd;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;
	error = falloc(p, &wf, &fd);
	if (error) {
		if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
			fdp->fd_ofiles[p->p_retval[0]] = NULL;
			fdrop(rf, p);
		}
		fdrop(rf, p);
		/* rpipe has been closed by fdrop(). */
		pipeclose(wpipe);
		return (error);
	}
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;
	p->p_retval[1] = fd;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;
	fdrop(rf, p);
#endif /* FreeBSD */

#ifdef __NetBSD__
	/*
	 * Note: the file structure returned from falloc() is marked
	 * as 'larval' initially.  Unless we mark it as 'mature' by
	 * FILE_SET_MATURE(), any attempt to do anything with it would
	 * return EBADF, including e.g. dup(2) or close(2).  This avoids
	 * file descriptor races if we block in the second falloc().
	 */

	error = falloc(p, &rf, &fd);
	if (error)
		goto free2;
	retval[0] = fd;
	rf->f_flag = FREAD;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;

	error = falloc(p, &wf, &fd);
	if (error)
		goto free3;
	retval[1] = fd;
	wf->f_flag = FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;

	FILE_SET_MATURE(rf);
	FILE_SET_MATURE(wf);
	FILE_UNUSE(rf, p);
	FILE_UNUSE(wf, p);
	return (0);
free3:
	FILE_UNUSE(rf, p);
	ffree(rf);
	fdremove(fdp, retval[0]);
free2:
	pipeclose(wpipe);
	pipeclose(rpipe);
#endif /* NetBSD */

	return (error);
}

/*
 * Allocate kva for pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely; if it
 * fails, it will retain the old buffer and return ENOMEM.
 */
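/*
 * For example, pipe_write() below calls pipespace(wpipe, BIG_PIPE_SIZE)
 * to grow a pipe which sees large writes; since the old buffer is freed
 * only after the new allocation succeeds, a failed resize leaves the
 * pipe fully usable with its original buffer.
 */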
static int
pipespace(cpipe, size)
	struct pipe *cpipe;
	int size;
{
	caddr_t buffer;
#ifdef __FreeBSD__
	struct vm_object *object;
	int npages, error;

	npages = round_page(size)/PAGE_SIZE;
	/*
	 * Create an object, I don't like the idea of paging to/from
	 * kernel_object.
	 */
	mtx_lock(&vm_mtx);
	object = vm_object_allocate(OBJT_DEFAULT, npages);
	buffer = (caddr_t) vm_map_min(kernel_map);

	/*
	 * Insert the object into the kernel map, and allocate kva for it.
	 * The map entry is, by default, pageable.
	 */
	error = vm_map_find(kernel_map, object, 0,
		(vm_offset_t *) &buffer, size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);

	if (error != KERN_SUCCESS) {
		vm_object_deallocate(object);
		mtx_unlock(&vm_mtx);
		return (ENOMEM);
	}
#endif /* FreeBSD */

#ifdef __NetBSD__
	/*
	 * Allocate pageable virtual address space.  Physical memory is allocated
	 * on demand.
	 */
	buffer = (caddr_t) uvm_km_valloc(kernel_map, round_page(size));
	if (buffer == NULL)
		return (ENOMEM);
#endif /* NetBSD */

	/* free old resources if we're resizing */
	pipe_free_kmem(cpipe);
#ifdef __FreeBSD__
	mtx_unlock(&vm_mtx);
	cpipe->pipe_buffer.object = object;
#endif
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	amountpipekva += cpipe->pipe_buffer.size;
	return (0);
}

/*
 * initialize and allocate VM and memory for pipe
 */
static int
pipe_create(cpipep)
	struct pipe **cpipep;
{
	struct pipe *cpipe;
	int error;

#ifdef __FreeBSD__
	*cpipep = zalloc(pipe_zone);
#endif
#ifdef __NetBSD__
	*cpipep = pool_get(&pipe_pool, M_WAITOK);
#endif
	if (*cpipep == NULL)
		return (ENOMEM);

	cpipe = *cpipep;

#ifdef __FreeBSD__
	/* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */
	cpipe->pipe_buffer.object = NULL;
#endif /* FreeBSD */
	/*
	 * protect so pipeclose() doesn't follow a junk pointer
	 * if pipespace() fails.
	 */
	cpipe->pipe_buffer.buffer = NULL;
	bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
	cpipe->pipe_state = PIPE_SIGNALR;
	cpipe->pipe_peer = NULL;
	cpipe->pipe_busy = 0;

#ifndef PIPE_NODIRECT
	/*
	 * pipe data structure initializations to support direct pipe I/O
	 */
	cpipe->pipe_map.cnt = 0;
	cpipe->pipe_map.kva = NULL;
	cpipe->pipe_map.pos = 0;
	cpipe->pipe_map.npages = 0;
#ifdef __NetBSD__
	cpipe->pipe_map.ms = NULL;
#endif
#endif /* !PIPE_NODIRECT */

	if ((error = pipespace(cpipe, PIPE_SIZE)))
		return (error);

	vfs_timestamp(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;
#ifdef __NetBSD__
	cpipe->pipe_pgid = NO_PID;
	lockinit(&cpipe->pipe_lock, PRIBIO | PCATCH, "pipelk", 0, 0);
#endif

	return (0);
}


/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

#ifdef __FreeBSD__
	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = tsleep(cpipe, catch ? (PRIBIO | PCATCH) : PRIBIO,
		    "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
#endif

#ifdef __NetBSD__
	do {
		error = lockmgr(&cpipe->pipe_lock, LK_EXCLUSIVE, NULL);
	} while (!catch && (error == EINTR || error == ERESTART));
	return (error);
#endif
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{
#ifdef __FreeBSD__
	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
#endif

#ifdef __NetBSD__
	lockmgr(&cpipe->pipe_lock, LK_RELEASE, NULL);
#endif
}

/*
 * Select/poll wakeup.  This also sends SIGIO to the peer connected to
 * the 'sigp' side of the pipe.
 */
static __inline void
pipeselwakeup(selp, sigp)
	struct pipe *selp, *sigp;
{
	if (selp->pipe_state & PIPE_SEL) {
		selp->pipe_state &= ~PIPE_SEL;
		selwakeup(&selp->pipe_sel);
	}
#ifdef __FreeBSD__
	if (sigp && (sigp->pipe_state & PIPE_ASYNC) && sigp->pipe_sigio)
		pgsigio(sigp->pipe_sigio, SIGIO, 0);
	KNOTE(&selp->pipe_sel.si_note, 0);
#endif

#ifdef __NetBSD__
	if (sigp && (sigp->pipe_state & PIPE_ASYNC)
	    && sigp->pipe_pgid != NO_PID) {
		struct proc *p;

		if (sigp->pipe_pgid < 0)
			gsignal(-sigp->pipe_pgid, SIGIO);
		else if (sigp->pipe_pgid > 0 && (p = pfind(sigp->pipe_pgid)) != 0)
			psignal(p, SIGIO);
	}
#endif /* NetBSD */
}

/* ARGSUSED */
#ifdef __FreeBSD__
static int
pipe_read(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
	struct proc *p;
#elif defined(__NetBSD__)
static int
pipe_read(fp, offset, uio, cred, flags)
	struct file *fp;
	off_t *offset;
	struct uio *uio;
	struct ucred *cred;
	int flags;
#endif
{
	struct pipe *rpipe = (struct pipe *) fp->f_data;
	int error;
	size_t nread = 0;
	size_t size;
	size_t ocnt;

	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

	ocnt = rpipe->pipe_buffer.cnt;

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;

			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t	va;
			if (size > uio->uio_resid)
				size = uio->uio_resid;

			va = (caddr_t) rpipe->pipe_map.kva +
			    rpipe->pipe_map.pos;
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
#ifdef __NetBSD__
				if (uio->uio_resid > 0 &&
				    (rpipe->pipe_state & PIPE_MOREW))
					goto waitformore;
#endif /* NetBSD */
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

#if defined(__NetBSD__) && !defined(PIPE_NODIRECT)
waitformore:
#endif
			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * We want to read more, wake up select/poll.
			 */
			pipeselwakeup(rpipe, rpipe->pipe_peer);

			rpipe->pipe_state |= PIPE_WANTR;
			error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0);
			if (error != 0 || (error = pipelock(rpipe, 1)))
				goto unlocked_error;
		}
	}
	pipeunlock(rpipe);

	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) {
		rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	/*
	 * If anything was read off the buffer, signal the writer that it is
	 * possible to write more data.  Also send a signal if we are here
	 * for the first time after the last write.
	 */
	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF
	    && (ocnt != rpipe->pipe_buffer.cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
		pipeselwakeup(rpipe, rpipe->pipe_peer);
		rpipe->pipe_state &= ~PIPE_SIGNALR;
	}

	return (error);
}

#ifdef __FreeBSD__
#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	size_t size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	size = uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	mtx_lock(&vm_mtx);
	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
		vm_page_t m;

		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
		    (paddr = pmap_kextract(addr)) == 0) {
			int j;

			for (j = 0; j < i; j++)
				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
			mtx_unlock(&vm_mtx);
			return (EFAULT);
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and map the buffer
	 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
			wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
		wpipe->pipe_map.npages);

	mtx_unlock(&vm_mtx);
	/*
	 * and update the uio data
	 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	mtx_lock(&vm_mtx);
	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		if (amountpipekva > maxpipekva) {
			vm_offset_t kva = wpipe->pipe_map.kva;
			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
		}
	}
	for (i = 0; i < wpipe->pipe_map.npages; i++)
		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
	mtx_unlock(&vm_mtx);
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int size;
	int pos;

	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;
	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
	    (caddr_t) wpipe->pipe_buffer.buffer, size);

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeunlock(wpipe);
			pipeselwakeup(wpipe, wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe, wpipe);
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
	}

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif /* !PIPE_NODIRECT */
#endif /* FreeBSD */

#ifdef __NetBSD__
#ifndef PIPE_NODIRECT
/*
 * Allocate structure for loan transfer.
 */
static __inline int
pipe_loan_alloc(wpipe, npages, blen)
	struct pipe *wpipe;
	int npages;
	vsize_t blen;
{
	wpipe->pipe_map.kva = uvm_km_valloc(kernel_map, blen);
	if (wpipe->pipe_map.kva == NULL)
		return (ENOMEM);

	amountpipekva += blen;
	wpipe->pipe_map.npages = npages;
	wpipe->pipe_map.ms = (struct vm_page **) malloc(
		npages * sizeof(struct vm_page *), M_PIPE, M_WAITOK);

	return (0);
}

/*
 * Free resources allocated for loan transfer.
 */
static void
pipe_loan_free(wpipe)
	struct pipe *wpipe;
{
	uvm_km_free(kernel_map, wpipe->pipe_map.kva,
	    wpipe->pipe_map.npages * PAGE_SIZE);
	wpipe->pipe_map.kva = NULL;
	amountpipekva -= wpipe->pipe_map.npages * PAGE_SIZE;
	free(wpipe->pipe_map.ms, M_PIPE);
	wpipe->pipe_map.ms = NULL;
}

/*
 * NetBSD direct write, using uvm_loan() mechanism.
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
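/*
 * Worked example of the alignment arithmetic below, for a hypothetical
 * iovec with iov_base = 0x2001234 and iov_len = 0x3000, PAGE_SIZE 0x1000:
 *
 *	base = trunc_page(0x2001234)          = 0x2001000
 *	bend = round_page(0x2001234 + 0x3000) = 0x2005000
 *	blen = bend - base                    = 0x4000 (4 pages loaned)
 *	bpos = 0x2001234 % PAGE_SIZE          = 0x234  (offset in 1st page)
 *	bcnt = iov_len                        = 0x3000 (bytes for the reader)
 */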
static __inline int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error, i, npages, j;
	struct vm_page **res;
	vaddr_t bbase, kva, base, bend;
	vsize_t blen, bcnt;
	voff_t boff, bpos;
	struct vm_map *wmap = &uio->uio_procp->p_vmspace->vm_map;
retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	/*
	 * For each iovec:
	 * 1. Loan the pages to kernel.
	 * 2. Set up pipe structures.
	 * 3. Wait until consumer reads it all or exits.
	 */
	boff = 0;
	for (i = 0; i < uio->uio_iovcnt; ) {
		/*
		 * Note: need to handle buffers not aligned to PAGE_SIZE.
		 */
		bbase = (vaddr_t)uio->uio_iov[i].iov_base;
		base = trunc_page(bbase + boff);
		bend = round_page(bbase + uio->uio_iov[i].iov_len);
		blen = bend - base;

		if (boff == 0)
			bpos = bbase % PAGE_SIZE;
		else
			bpos = 0;

		if (blen > PIPE_DIRECT_CHUNK) {
			blen = PIPE_DIRECT_CHUNK;
			boff += PIPE_DIRECT_CHUNK;
			bend = base + blen;
			bcnt = PIPE_DIRECT_CHUNK - bpos;
			wpipe->pipe_state |= PIPE_MOREW;
		} else {
			if (boff == 0)
				bcnt = uio->uio_iov[i].iov_len;
			else
				bcnt = ((bbase % PAGE_SIZE) +
				    uio->uio_iov[i].iov_len) % PIPE_DIRECT_CHUNK;
			boff = 0;
			i++;
			wpipe->pipe_state &= ~PIPE_MOREW;
		}

		npages = blen / PAGE_SIZE;

		/*
		 * Free the old kva if we need more pages than we have
		 * allocated.
		 */
		if (wpipe->pipe_map.kva
		    && npages > wpipe->pipe_map.npages)
			pipe_loan_free(wpipe);

		/* Allocate new kva. */
		if (!wpipe->pipe_map.kva) {
			if ((error = pipe_loan_alloc(wpipe,
			    npages, blen)))
				goto error;
		}

		/* Loan the write buffer memory from writer process */
		res = wpipe->pipe_map.ms;
		error = uvm_loan(wmap, base, blen,
		    (void **) res, UVM_LOAN_TOPAGE);
		if (error)
			goto cleanup;

		/* Enter the loaned pages to kva */
		kva = wpipe->pipe_map.kva;
		for (j = 0; j < npages; j++, kva += PAGE_SIZE)
			pmap_enter(pmap_kernel(), kva, res[j]->phys_addr,
			    VM_PROT_READ, 0);

		wpipe->pipe_map.pos = bpos;
		wpipe->pipe_map.cnt = bcnt;
		wpipe->pipe_state |= PIPE_DIRECTW;

		error = 0;
		while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			pipeselwakeup(wpipe, wpipe);
			error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
		}

cleanup:
		pipelock(wpipe, 0);
		if (amountpipekva > maxpipekva)
			pipe_loan_free(wpipe);
		uvm_unloanpage(res, npages);
		pipeunlock(wpipe);
		if (error) {
error:
			/* XXX update uio ? */
			if (error == EPIPE)
				pipeselwakeup(wpipe, wpipe);

			wpipe->pipe_state &= ~PIPE_MOREW;
			goto error1;
		}

		uio->uio_offset += bcnt;
		uio->uio_resid -= bcnt;

	} /* for */

	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif /* !PIPE_NODIRECT */
#endif /* NetBSD */

#ifdef __FreeBSD__
static int
pipe_write(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
	struct proc *p;
#elif defined(__NetBSD__)
static int
pipe_write(fp, offset, uio, cred, flags)
	struct file *fp;
	off_t *offset;
	struct uio *uio;
	struct ucred *cred;
	int flags;
#endif
{
	int error = 0;
	int orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF))
		return (EPIPE);

	++wpipe->pipe_busy;

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
	    (nbigpipe < maxbigpipes) &&
#ifndef PIPE_NODIRECT
	    (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
#endif
	    (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
	    (wpipe->pipe_buffer.cnt == 0)) {

		if ((error = pipelock(wpipe, 1)) == 0) {
			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
				nbigpipe++;
			pipeunlock(wpipe);
		} else {
			/*
			 * If an error occurred, unbusy and return, waking up
			 * any
			 * pending readers.
			 */
			--wpipe->pipe_busy;
			if (wpipe->pipe_busy == 0
			    && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
				wpipe->pipe_state &=
				    ~(PIPE_WANTCLOSE | PIPE_WANTR);
				wakeup(wpipe);
			}

			return (error);
		}
	}

#ifdef __FreeBSD__
	KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
#endif

	orig_resid = uio->uio_resid;
	while (uio->uio_resid) {
		int space;

#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
			error = pipe_direct_write(wpipe, uio);
			if (error)
				break;
			continue;
		}
#endif /* PIPE_NODIRECT */

		/*
		 * Pipe buffered writes cannot be coincidental with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
			if (wpipe->pipe_state & PIPE_EOF)
				break;
			if (error)
				break;
		}
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
			int size;	/* Transfer size */
			int segsize;	/* first segment to transfer */

			if ((error = pipelock(wpipe, 1)) != 0)
				break;

			/*
			 * It is possible for a direct write to
			 * slip in on us... handle it here...
			 */
			if (wpipe->pipe_state & PIPE_DIRECTW) {
				pipeunlock(wpipe);
				goto retrywrite;
			}
			/*
			 * If a process blocked in uiomove, our
			 * value for space might be bad.
			 *
			 * XXX will we be ok if the reader has gone
			 * away here?
			 */
			if (space > wpipe->pipe_buffer.size -
			    wpipe->pipe_buffer.cnt) {
				pipeunlock(wpipe);
				goto retrywrite;
			}

			/*
			 * Transfer size is minimum of uio transfer
			 * and free space in pipe buffer.
			 */
			if (space > uio->uio_resid)
				size = uio->uio_resid;
			else
				size = space;
			/*
			 * First segment to transfer is minimum of
			 * transfer size and contiguous space in
			 * pipe buffer.  If first segment to transfer
			 * is less than the transfer size, we've got
			 * a wraparound in the buffer.
			 */
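			/*
			 * Example, with a hypothetical pipe_buffer.size of
			 * 1024: for in = 900 and size = 700, segsize is
			 * 1024 - 900 = 124; the first uiomove() fills bytes
			 * 900..1023, the second copies the remaining 576
			 * bytes to the start of the buffer, and .in wraps
			 * to 576.
			 */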
			segsize = wpipe->pipe_buffer.size -
				wpipe->pipe_buffer.in;
			if (segsize > size)
				segsize = size;

			/* Transfer first segment */

			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					segsize, uio);

			if (error == 0 && segsize < size) {
				/*
				 * Transfer remaining part now, to
				 * support atomic writes.  Wraparound
				 * happened.
				 */
#ifdef DEBUG
				if (wpipe->pipe_buffer.in + segsize !=
				    wpipe->pipe_buffer.size)
					panic("Expected pipe buffer wraparound disappeared");
#endif

				error = uiomove(&wpipe->pipe_buffer.buffer[0],
					size - segsize, uio);
			}
			if (error == 0) {
				wpipe->pipe_buffer.in += size;
				if (wpipe->pipe_buffer.in >=
				    wpipe->pipe_buffer.size) {
#ifdef DEBUG
					if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
						panic("Expected wraparound bad");
#endif
					wpipe->pipe_buffer.in = size - segsize;
				}

				wpipe->pipe_buffer.cnt += size;
#ifdef DEBUG
				if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
					panic("Pipe buffer overflow");
#endif

			}
			pipeunlock(wpipe);
			if (error)
				break;

		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe, wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0);
			if (error != 0)
				break;
			/*
			 * If read side wants to go away, we just issue a signal
			 * to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;
	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
		wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((error == EPIPE) && (wpipe->pipe_buffer.cnt == 0)
	    && (uio->uio_resid == 0))
		error = 0;

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer, wake up select/poll.
	 * wpipe->pipe_map.cnt is always 0 at this point (direct write
	 * is only done synchronously), so check only wpipe->pipe_buffer.cnt.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe, wpipe);

	/*
	 * Arrange for next read(2) to do a signal.
	 */
	wpipe->pipe_state |= PIPE_SIGNALR;

	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
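/*
 * E.g. userland can query the number of unread bytes with:
 *
 *	int nread;
 *	ioctl(fd, FIONREAD, &nread);
 *
 * which, per the FIONREAD case below, reports pipe_map.cnt during a
 * direct write and pipe_buffer.cnt otherwise.
 */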
int
pipe_ioctl(fp, cmd, data, p)
	struct file *fp;
	u_long cmd;
	caddr_t data;
	struct proc *p;
{
	struct pipe *mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
#ifndef PIPE_NODIRECT
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
#endif
			*(int *)data = mpipe->pipe_buffer.cnt;
		return (0);

#ifdef __FreeBSD__
	case FIOSETOWN:
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		return (0);
#endif /* FreeBSD */
#ifdef __NetBSD__
	case TIOCSPGRP:
		mpipe->pipe_pgid = *(int *)data;
		return (0);

	case TIOCGPGRP:
		*(int *)data = mpipe->pipe_pgid;
		return (0);
#endif /* NetBSD */

	}
	return (ENOTTY);
}

int
pipe_poll(fp, events, p)
	struct file *fp;
	int events;
	struct proc *p;
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_buffer.cnt > 0) ||
#ifndef PIPE_NODIRECT
		    (rpipe->pipe_state & PIPE_DIRECTW) ||
#endif
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)
		    || (
#ifndef PIPE_NODIRECT
		    ((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
#endif
		    (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}

static int
pipe_stat(fp, ub, p)
	struct file *fp;
	struct stat *ub;
	struct proc *p;
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	bzero((caddr_t)ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size) ? 1 : 0;
#ifdef __FreeBSD__
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
#endif /* FreeBSD */
#ifdef __NetBSD__
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec);
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
	TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
#endif /* NetBSD */
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

/* ARGSUSED */
static int
pipe_close(fp, p)
	struct file *fp;
	struct proc *p;
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

#ifdef __FreeBSD__
	fp->f_ops = &badfileops;
	funsetown(cpipe->pipe_sigio);
#endif
	fp->f_data = NULL;
	pipeclose(cpipe);
	return (0);
}

static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

#ifdef __FreeBSD__
	mtx_assert(&vm_mtx, MA_OWNED);
#endif
	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			--nbigpipe;
		amountpipekva -= cpipe->pipe_buffer.size;
#ifdef __FreeBSD__
		kmem_free(kernel_map,
			(vm_offset_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
#elif defined(__NetBSD__)
		uvm_km_free(kernel_map,
			(vaddr_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
#endif /* NetBSD */

		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (cpipe->pipe_map.kva != NULL) {
#ifdef __FreeBSD__
		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
		kmem_free(kernel_map,
			cpipe->pipe_map.kva,
			cpipe->pipe_buffer.size + PAGE_SIZE);
#elif defined(__NetBSD__)
		pipe_loan_free(cpipe);
#endif /* NetBSD */
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.kva = NULL;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif /* !PIPE_NODIRECT */
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipe *ppipe;

	if (!cpipe)
		return;

	pipeselwakeup(cpipe, cpipe);

	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	while (cpipe->pipe_busy) {
		wakeup(cpipe);
		cpipe->pipe_state |= PIPE_WANTCLOSE | PIPE_EOF;
		tsleep(cpipe, PRIBIO, "pipecl", 0);
	}

	/*
	 * Disconnect from peer
	 */
	if ((ppipe = cpipe->pipe_peer) != NULL) {
		pipeselwakeup(ppipe, ppipe);

		ppipe->pipe_state |= PIPE_EOF;
		wakeup(ppipe);
		ppipe->pipe_peer = NULL;
	}

	/*
	 * free resources
	 */
#ifdef __FreeBSD__
	mtx_lock(&vm_mtx);
	pipe_free_kmem(cpipe);
	/* XXX: erm, doesn't zalloc already have its own locks and
	 * not need the giant vm lock?
	 */
	zfree(pipe_zone, cpipe);
	mtx_unlock(&vm_mtx);
#endif /* FreeBSD */

#ifdef __NetBSD__
	pipe_free_kmem(cpipe);
	(void) lockmgr(&cpipe->pipe_lock, LK_DRAIN, NULL);
	pool_put(&pipe_pool, cpipe);
#endif
}

#ifdef __FreeBSD__
/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		cpipe = cpipe->pipe_peer;
		break;
	default:
		return (1);
	}
	kn->kn_hook = (caddr_t)cpipe;

	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	return (kn->kn_data >= PIPE_BUF);
}
#endif /* FreeBSD */

#ifdef __NetBSD__
static int
pipe_fcntl(fp, cmd, data, p)
	struct file *fp;
	u_int cmd;
	caddr_t data;
	struct proc *p;
{
	if (cmd == F_SETFL)
		return (0);
	else
		return (EOPNOTSUPP);
}

/*
 * Handle pipe sysctls.
 */
int
sysctl_dopipe(name, namelen, oldp, oldlenp, newp, newlen)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
{
	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);		/* overloaded */

	switch (name[0]) {
	case KERN_PIPE_MAXKVASZ:
		return (sysctl_int(oldp, oldlenp, newp, newlen, &maxpipekva));
	case KERN_PIPE_LIMITKVA:
		return (sysctl_int(oldp, oldlenp, newp, newlen, &limitpipekva));
	case KERN_PIPE_MAXBIGPIPES:
		return (sysctl_int(oldp, oldlenp, newp, newlen, &maxbigpipes));
	case KERN_PIPE_NBIGPIPES:
		return (sysctl_rdint(oldp, oldlenp, newp, nbigpipe));
	case KERN_PIPE_KVASIZE:
		return (sysctl_rdint(oldp, oldlenp, newp, amountpipekva));
	default:
		return (EOPNOTSUPP);
	}
	/* NOTREACHED */
}

/*
 * Initialize pipe structs.
 */
void
pipe_init(void)
{
	pool_init(&pipe_pool, sizeof(struct pipe), 0, 0, 0, "pipepl",
	    0, NULL, NULL, M_PIPE);
}

#endif /* __NetBSD__ */