sys_pipe.c revision 1.5.2.2 1 /* $NetBSD: sys_pipe.c,v 1.5.2.2 2001/09/07 22:14:49 thorpej Exp $ */
2
3 /*
4 * Copyright (c) 1996 John S. Dyson
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice immediately at the beginning of the file, without modification,
12 * this list of conditions, and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Absolutely no warranty of function or purpose is made by the author
17 * John S. Dyson.
18 * 4. Modifications may be freely made to this file if the above conditions
19 * are met.
20 *
21 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.82 2001/06/15 20:45:01 jlemon Exp $
22 */
23
24 /*
25 * This file contains a high-performance replacement for the socket-based
26 * pipes scheme originally used in FreeBSD/4.4Lite. It does not support
27 * all features of sockets, but does do everything that pipes normally
28 * do.
29 *
30 * Adaption for NetBSD UVM, including uvm_loan() based direct write, was
31 * written by Jaromir Dolecek.
32 */
33
34 /*
35 * This code has two modes of operation, a small write mode and a large
36 * write mode. The small write mode acts like conventional pipes with
37 * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
38 * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
39 * and PIPE_SIZE in size, it is fully mapped into the kernel (on FreeBSD,
40 * those pages are also wired), and the receiving process can copy it directly
41 * from the pages in the sending process.
42 *
43 * If the sending process receives a signal, it is possible that it will
44 * go away, and certainly its address space can change, because control
45 * is returned back to the user-mode side. In that case, the pipe code
46 * arranges to copy the buffer supplied by the user process on FreeBSD, to
47 * a pageable kernel buffer, and the receiving process will grab the data
48 * from the pageable kernel buffer. Since signals don't happen all that often,
49 * the copy operation is normally eliminated.
50 * On NetBSD, uvm_loan() maps the pages read-only (copy-on-write) for the
51 * kernel, so no explicit handling needs to be done; everything is handled
52 * by the standard VM facilities.
53 *
54 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
55 * happen for small transfers so that the system will not spend all of
56 * its time context switching. PIPE_SIZE is constrained by the
57 * amount of kernel virtual memory.
58 */
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/proc.h>
63 #include <sys/fcntl.h>
64 #include <sys/file.h>
65 #include <sys/filedesc.h>
66 #include <sys/filio.h>
67 #include <sys/ttycom.h>
68 #include <sys/stat.h>
69 #include <sys/poll.h>
70 #include <sys/signalvar.h>
71 #include <sys/vnode.h>
72 #include <sys/uio.h>
73 #include <sys/lock.h>
74 #ifdef __FreeBSD__
75 #include <sys/mutex.h>
76 #include <sys/selinfo.h>
77 #include <sys/sysproto.h>
78 #elif defined(__NetBSD__)
79 #include <sys/select.h>
80 #include <sys/malloc.h>
81 #include <sys/mount.h>
82 #include <sys/syscallargs.h>
83 #include <uvm/uvm.h>
84 #include <sys/sysctl.h>
85 #endif /* NetBSD, FreeBSD */
86
87 #include <sys/pipe.h>
88
89 #ifdef __NetBSD__
90 #define vfs_timestamp(tv) microtime(tv)
91 #endif
92
93 /*
94 * Use this define if you want to disable *fancy* VM things. Expect an
95 * approx 30% decrease in transfer rate. This could be useful for
96 * OpenBSD.
97 */
98 /* #define PIPE_NODIRECT */
99
100 /*
101 * interfaces to the outside world
102 */
103 #ifdef __FreeBSD__
104 static int pipe_read __P((struct file *fp, struct uio *uio,
105 struct ucred *cred, int flags, struct proc *p));
106 static int pipe_write __P((struct file *fp, struct uio *uio,
107 struct ucred *cred, int flags, struct proc *p));
108 static int pipe_close __P((struct file *fp, struct proc *p));
109 static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
110 struct proc *p));
111 static int pipe_kqfilter __P((struct file *fp, struct knote *kn));
112 static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
113 static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
114
115 static struct fileops pipeops = {
116 pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
117 pipe_stat, pipe_close
118 };
119 #endif /* FreeBSD */
120
121 static void filt_pipedetach(struct knote *kn);
122 static int filt_piperead(struct knote *kn, long hint);
123 static int filt_pipewrite(struct knote *kn, long hint);
124
125 static const struct filterops pipe_rfiltops =
126 { 1, NULL, filt_pipedetach, filt_piperead };
127 static const struct filterops pipe_wfiltops =
128 { 1, NULL, filt_pipedetach, filt_pipewrite };
129
130 #ifdef __NetBSD__
131 static int pipe_read __P((struct file *fp, off_t *offset, struct uio *uio,
132 struct ucred *cred, int flags));
133 static int pipe_write __P((struct file *fp, off_t *offset, struct uio *uio,
134 struct ucred *cred, int flags));
135 static int pipe_close __P((struct file *fp, struct proc *p));
136 static int pipe_poll __P((struct file *fp, int events, struct proc *p));
137 static int pipe_fcntl __P((struct file *fp, u_int com, caddr_t data,
138 struct proc *p));
139 static int pipe_kqfilter __P((struct file *fp, struct knote *kn));
140 static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
141 static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
142
143 static struct fileops pipeops =
144 { pipe_read, pipe_write, pipe_ioctl, pipe_fcntl, pipe_poll,
145 pipe_stat, pipe_close, pipe_kqfilter };
146 #endif /* NetBSD */
147
/*
 * Pipe buffer size thresholds, expressed as fractions of PIPE_SIZE.
 * Buffers can be fairly large because pipe space is pageable.  The pipe
 * code tries to maintain locality of reference for performance reasons,
 * so small amounts of outstanding I/O will not wipe the cache.
 */
#define MINPIPESIZE	(PIPE_SIZE / 3)
#define MAXPIPESIZE	(2 * PIPE_SIZE / 3)

/*
 * Soft ceiling on the kernel virtual address space used by pipes; it
 * exists so that large systems do not exhaust kva.
 */
#define MAXPIPEKVA	(8 * 1024 * 1024)
static int maxpipekva = MAXPIPEKVA;

/*
 * Ceiling that applies to direct transfers only.  The amount of kva
 * used by pipes in general cannot, of course, be limited this way.
 */
#define LIMITPIPEKVA	(16 * 1024 * 1024)
static int limitpipekva = LIMITPIPEKVA;

/*
 * Cap on the number of "big" pipes, and the number currently in use.
 */
#define LIMITBIGPIPES	32
static int maxbigpipes = LIMITBIGPIPES;
static int nbigpipe = 0;

/*
 * Running total of KVA consumed by pipe buffers.
 */
static int amountpipekva = 0;
182
183 static void pipeclose __P((struct pipe *cpipe));
184 static void pipe_free_kmem __P((struct pipe *cpipe));
185 static int pipe_create __P((struct pipe **cpipep, int allockva));
186 static __inline int pipelock __P((struct pipe *cpipe, int catch));
187 static __inline void pipeunlock __P((struct pipe *cpipe));
188 static __inline void pipeselwakeup __P((struct pipe *selp,
189 struct pipe *sigp));
190 static int pipespace __P((struct pipe *cpipe, int size));
191
192 #ifdef __FreeBSD__
193 #ifndef PIPE_NODIRECT
194 static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
195 static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
196 static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
197 static void pipe_clone_write_buffer __P((struct pipe *wpipe));
198 #endif
199
200 static vm_zone_t pipe_zone;
201 #endif /* FreeBSD */
202
203 #ifdef __NetBSD__
204 #ifndef PIPE_NODIRECT
205 static __inline int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
206 static __inline int pipe_loan_alloc __P((struct pipe *wpipe, int npages,
207 vsize_t blen));
208 static void pipe_loan_free __P((struct pipe *wpipe));
209 #endif /* PIPE_NODIRECT */
210
211 static struct pool pipe_pool;
212 #endif /* NetBSD */
213
214 /*
215 * The pipe system call for the DTYPE_PIPE type of pipes
216 */
217
218 /* ARGSUSED */
219 #ifdef __FreeBSD__
220 int
221 pipe(p, uap)
222 struct proc *p;
223 struct pipe_args /* {
224 int dummy;
225 } */ *uap;
226 #elif defined(__NetBSD__)
227 int
228 sys_pipe(p, v, retval)
229 struct proc *p;
230 void *v;
231 register_t *retval;
232 #endif
233 {
234 struct file *rf, *wf;
235 struct pipe *rpipe, *wpipe;
236 int fd, error;
237
238 #ifdef __FreeBSD__
239 if (pipe_zone == NULL)
240 pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);
241
242 rpipe = wpipe = NULL;
243 if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 1)) {
244 pipeclose(rpipe);
245 pipeclose(wpipe);
246 return (ENFILE);
247 }
248
249 error = falloc(p, &rf, &fd);
250 if (error) {
251 pipeclose(rpipe);
252 pipeclose(wpipe);
253 return (error);
254 }
255 fhold(rf);
256 p->p_retval[0] = fd;
257
258 /*
259 * Warning: once we've gotten past allocation of the fd for the
260 * read-side, we can only drop the read side via fdrop() in order
261 * to avoid races against processes which manage to dup() the read
262 * side while we are blocked trying to allocate the write side.
263 */
264 rf->f_flag = FREAD | FWRITE;
265 rf->f_type = DTYPE_PIPE;
266 rf->f_data = (caddr_t)rpipe;
267 rf->f_ops = &pipeops;
268 error = falloc(p, &wf, &fd);
269 if (error) {
270 struct filedesc *fdp = p->p_fd;
271
272 if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
273 fdp->fd_ofiles[p->p_retval[0]] = NULL;
274 fdrop(rf, p);
275 }
276 fdrop(rf, p);
277 /* rpipe has been closed by fdrop(). */
278 pipeclose(wpipe);
279 return (error);
280 }
281 wf->f_flag = FREAD | FWRITE;
282 wf->f_type = DTYPE_PIPE;
283 wf->f_data = (caddr_t)wpipe;
284 wf->f_ops = &pipeops;
285 p->p_retval[1] = fd;
286
287 rpipe->pipe_peer = wpipe;
288 wpipe->pipe_peer = rpipe;
289 fdrop(rf, p);
290 #endif /* FreeBSD */
291
292 #ifdef __NetBSD__
293 rpipe = wpipe = NULL;
294 if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 0)) {
295 pipeclose(rpipe);
296 pipeclose(wpipe);
297 return (ENFILE);
298 }
299
300 /*
301 * Note: the file structure returned from falloc() is marked
302 * as 'larval' initially. Unless we mark it as 'mature' by
303 * FILE_SET_MATURE(), any attempt to do anything with it would
304 * return EBADF, including e.g. dup(2) or close(2). This avoids
305 * file descriptor races if we block in the second falloc().
306 */
307
308 error = falloc(p, &rf, &fd);
309 if (error)
310 goto free2;
311 retval[0] = fd;
312 rf->f_flag = FREAD;
313 rf->f_type = DTYPE_PIPE;
314 rf->f_data = (caddr_t)rpipe;
315 rf->f_ops = &pipeops;
316
317 error = falloc(p, &wf, &fd);
318 if (error)
319 goto free3;
320 retval[1] = fd;
321 wf->f_flag = FWRITE;
322 wf->f_type = DTYPE_PIPE;
323 wf->f_data = (caddr_t)wpipe;
324 wf->f_ops = &pipeops;
325
326 rpipe->pipe_peer = wpipe;
327 wpipe->pipe_peer = rpipe;
328
329 FILE_SET_MATURE(rf);
330 FILE_SET_MATURE(wf);
331 FILE_UNUSE(rf, p);
332 FILE_UNUSE(wf, p);
333 return (0);
334 free3:
335 FILE_UNUSE(rf, p);
336 ffree(rf);
337 fdremove(p->p_fd, retval[0]);
338 free2:
339 pipeclose(wpipe);
340 pipeclose(rpipe);
341 #endif /* NetBSD */
342
343 return (error);
344 }
345
346 /*
347 * Allocate kva for pipe circular buffer, the space is pageable
348 * This routine will 'realloc' the size of a pipe safely, if it fails
349 * it will retain the old buffer.
350 * If it fails it will return ENOMEM.
351 */
352 static int
353 pipespace(cpipe, size)
354 struct pipe *cpipe;
355 int size;
356 {
357 caddr_t buffer;
358 #ifdef __FreeBSD__
359 struct vm_object *object;
360 int npages, error;
361
362 npages = round_page(size)/PAGE_SIZE;
363 /*
364 * Create an object, I don't like the idea of paging to/from
365 * kernel_object.
366 */
367 mtx_lock(&vm_mtx);
368 object = vm_object_allocate(OBJT_DEFAULT, npages);
369 buffer = (caddr_t) vm_map_min(kernel_map);
370
371 /*
372 * Insert the object into the kernel map, and allocate kva for it.
373 * The map entry is, by default, pageable.
374 */
375 error = vm_map_find(kernel_map, object, 0,
376 (vm_offset_t *) &buffer, size, 1,
377 VM_PROT_ALL, VM_PROT_ALL, 0);
378
379 if (error != KERN_SUCCESS) {
380 vm_object_deallocate(object);
381 mtx_unlock(&vm_mtx);
382 return (ENOMEM);
383 }
384 #endif /* FreeBSD */
385
386 #ifdef __NetBSD__
387 /*
388 * Allocate pageable virtual address space. Physical memory is allocated
389 * on demand.
390 */
391 buffer = (caddr_t) uvm_km_valloc(kernel_map, round_page(size));
392 if (buffer == NULL)
393 return (ENOMEM);
394 #endif /* NetBSD */
395
396 /* free old resources if we're resizing */
397 pipe_free_kmem(cpipe);
398 #ifdef __FreeBSD__
399 mtx_unlock(&vm_mtx);
400 cpipe->pipe_buffer.object = object;
401 #endif
402 cpipe->pipe_buffer.buffer = buffer;
403 cpipe->pipe_buffer.size = size;
404 cpipe->pipe_buffer.in = 0;
405 cpipe->pipe_buffer.out = 0;
406 cpipe->pipe_buffer.cnt = 0;
407 amountpipekva += cpipe->pipe_buffer.size;
408 return (0);
409 }
410
411 /*
412 * initialize and allocate VM and memory for pipe
413 */
414 static int
415 pipe_create(cpipep, allockva)
416 struct pipe **cpipep;
417 int allockva;
418 {
419 struct pipe *cpipe;
420 int error;
421
422 #ifdef __FreeBSD__
423 *cpipep = zalloc(pipe_zone);
424 #endif
425 #ifdef __NetBSD__
426 *cpipep = pool_get(&pipe_pool, M_WAITOK);
427 #endif
428 if (*cpipep == NULL)
429 return (ENOMEM);
430
431 cpipe = *cpipep;
432
433 /* Initialize */
434 memset(cpipe, 0, sizeof(*cpipe));
435 cpipe->pipe_state = PIPE_SIGNALR;
436
437 if (allockva && (error = pipespace(cpipe, PIPE_SIZE)))
438 return (error);
439
440 vfs_timestamp(&cpipe->pipe_ctime);
441 cpipe->pipe_atime = cpipe->pipe_ctime;
442 cpipe->pipe_mtime = cpipe->pipe_ctime;
443 #ifdef __NetBSD__
444 cpipe->pipe_pgid = NO_PID;
445 lockinit(&cpipe->pipe_lock, PRIBIO | PCATCH, "pipelk", 0, 0);
446 #endif
447
448 return (0);
449 }
450
451
/*
 * Lock a pipe for I/O, blocking other access.
 *
 * 'catch' selects whether the wait may be interrupted.  On FreeBSD it
 * adds PCATCH to the sleep priority; on NetBSD an EINTR/ERESTART result
 * from lockmgr() is retried unless 'catch' is set.
 *
 * Returns 0 once the lock is held, otherwise the error from the sleep
 * or lock primitive.
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

#ifdef __FreeBSD__
	for (;;) {
		if ((cpipe->pipe_state & PIPE_LOCK) == 0) {
			/* Nobody holds it: take it. */
			cpipe->pipe_state |= PIPE_LOCK;
			return (0);
		}

		/* Ask the holder to wake us when it unlocks. */
		cpipe->pipe_state |= PIPE_LWANT;
		error = tsleep(cpipe, catch ? (PRIBIO | PCATCH) : PRIBIO,
		    "pipelk", 0);
		if (error != 0)
			return (error);
	}
#endif

#ifdef __NetBSD__
	for (;;) {
		error = lockmgr(&cpipe->pipe_lock, LK_EXCLUSIVE, NULL);
		if (catch || (error != EINTR && error != ERESTART))
			return (error);
	}
#endif
}
481
/*
 * Release a pipe's I/O lock.
 *
 * On FreeBSD this also wakes any thread that recorded its interest in
 * the lock by setting PIPE_LWANT.
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{
#ifdef __FreeBSD__
	int wanted = (cpipe->pipe_state & PIPE_LWANT) != 0;

	cpipe->pipe_state &= ~(PIPE_LOCK | PIPE_LWANT);
	if (wanted)
		wakeup(cpipe);
#endif

#ifdef __NetBSD__
	lockmgr(&cpipe->pipe_lock, LK_RELEASE, NULL);
#endif
}
501
502 /*
503 * Select/poll wakup. This also sends SIGIO to peer connected to
504 * 'sigpipe' side of pipe.
505 */
506 static __inline void
507 pipeselwakeup(selp, sigp)
508 struct pipe *selp, *sigp;
509 {
510 if (selp->pipe_state & PIPE_SEL) {
511 selp->pipe_state &= ~PIPE_SEL;
512 selwakeup(&selp->pipe_sel);
513 }
514 #ifdef __FreeBSD__
515 if (sigp && (sigp->pipe_state & PIPE_ASYNC) && sigp->pipe_sigio)
516 pgsigio(sigp->pipe_sigio, SIGIO, 0);
517 KNOTE(&selp->pipe_sel.si_note, 0);
518 #endif
519
520 #ifdef __NetBSD__
521 if (sigp && (sigp->pipe_state & PIPE_ASYNC) &&
522 sigp->pipe_pgid != NO_PID) {
523 struct proc *p;
524
525 if (sigp->pipe_pgid < 0)
526 gsignal(-sigp->pipe_pgid, SIGIO);
527 else if (sigp->pipe_pgid > 0 &&
528 (p = pfind(sigp->pipe_pgid)) != NULL)
529 psignal(p, SIGIO);
530 }
531 KNOTE(&selp->pipe_sel.si_klist, 0);
532 #endif /* NetBSD */
533 }
534
535 /* ARGSUSED */
536 #ifdef __FreeBSD__
537 static int
538 pipe_read(fp, uio, cred, flags, p)
539 struct file *fp;
540 struct uio *uio;
541 struct ucred *cred;
542 int flags;
543 struct proc *p;
544 #elif defined(__NetBSD__)
545 static int
546 pipe_read(fp, offset, uio, cred, flags)
547 struct file *fp;
548 off_t *offset;
549 struct uio *uio;
550 struct ucred *cred;
551 int flags;
552 #endif
553 {
554 struct pipe *rpipe = (struct pipe *) fp->f_data;
555 int error;
556 size_t nread = 0;
557 size_t size;
558 size_t ocnt;
559
560 ++rpipe->pipe_busy;
561 error = pipelock(rpipe, 1);
562 if (error)
563 goto unlocked_error;
564
565 ocnt = rpipe->pipe_buffer.cnt;
566
567 while (uio->uio_resid) {
568 /*
569 * normal pipe buffer receive
570 */
571 if (rpipe->pipe_buffer.cnt > 0) {
572 size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
573 if (size > rpipe->pipe_buffer.cnt)
574 size = rpipe->pipe_buffer.cnt;
575 if (size > uio->uio_resid)
576 size = uio->uio_resid;
577
578 error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
579 size, uio);
580 if (error)
581 break;
582
583 rpipe->pipe_buffer.out += size;
584 if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
585 rpipe->pipe_buffer.out = 0;
586
587 rpipe->pipe_buffer.cnt -= size;
588
589 /*
590 * If there is no more to read in the pipe, reset
591 * its pointers to the beginning. This improves
592 * cache hit stats.
593 */
594 if (rpipe->pipe_buffer.cnt == 0) {
595 rpipe->pipe_buffer.in = 0;
596 rpipe->pipe_buffer.out = 0;
597 }
598 nread += size;
599 #ifndef PIPE_NODIRECT
600 /*
601 * Direct copy, bypassing a kernel buffer.
602 */
603 } else if ((size = rpipe->pipe_map.cnt) &&
604 (rpipe->pipe_state & PIPE_DIRECTW)) {
605 caddr_t va;
606 if (size > uio->uio_resid)
607 size = uio->uio_resid;
608
609 va = (caddr_t) rpipe->pipe_map.kva +
610 rpipe->pipe_map.pos;
611 error = uiomove(va, size, uio);
612 if (error)
613 break;
614 nread += size;
615 rpipe->pipe_map.pos += size;
616 rpipe->pipe_map.cnt -= size;
617 if (rpipe->pipe_map.cnt == 0) {
618 rpipe->pipe_state &= ~PIPE_DIRECTW;
619 wakeup(rpipe);
620 }
621 #endif
622 } else {
623 /*
624 * detect EOF condition
625 * read returns 0 on EOF, no need to set error
626 */
627 if (rpipe->pipe_state & PIPE_EOF)
628 break;
629
630 /*
631 * If the "write-side" has been blocked, wake it up now.
632 */
633 if (rpipe->pipe_state & PIPE_WANTW) {
634 rpipe->pipe_state &= ~PIPE_WANTW;
635 wakeup(rpipe);
636 }
637
638 /*
639 * Break if some data was read.
640 */
641 if (nread > 0)
642 break;
643
644 /*
645 * don't block on non-blocking I/O
646 */
647 if (fp->f_flag & FNONBLOCK) {
648 error = EAGAIN;
649 break;
650 }
651
652 /*
653 * Unlock the pipe buffer for our remaining processing.
654 * We will either break out with an error or we will
655 * sleep and relock to loop.
656 */
657 pipeunlock(rpipe);
658
659 /*
660 * We want to read more, wake up select/poll.
661 */
662 pipeselwakeup(rpipe, rpipe->pipe_peer);
663
664 rpipe->pipe_state |= PIPE_WANTR;
665 error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0);
666 if (error != 0 || (error = pipelock(rpipe, 1)))
667 goto unlocked_error;
668 }
669 }
670 pipeunlock(rpipe);
671
672 if (error == 0)
673 vfs_timestamp(&rpipe->pipe_atime);
674 unlocked_error:
675 --rpipe->pipe_busy;
676
677 /*
678 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0.
679 */
680 if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) {
681 rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW);
682 wakeup(rpipe);
683 } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
684 /*
685 * Handle write blocking hysteresis.
686 */
687 if (rpipe->pipe_state & PIPE_WANTW) {
688 rpipe->pipe_state &= ~PIPE_WANTW;
689 wakeup(rpipe);
690 }
691 }
692
693 /*
694 * If anything was read off the buffer, signal to the writer it's
695 * possible to write more data. Also send signal if we are here for the
696 * first time after last write.
697 */
698 if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF
699 && (ocnt != rpipe->pipe_buffer.cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
700 pipeselwakeup(rpipe, rpipe->pipe_peer);
701 rpipe->pipe_state &= ~PIPE_SIGNALR;
702 }
703
704 return (error);
705 }
706
707 #ifdef __FreeBSD__
708 #ifndef PIPE_NODIRECT
709 /*
710 * Map the sending processes' buffer into kernel space and wire it.
711 * This is similar to a physical write operation.
712 */
713 static int
714 pipe_build_write_buffer(wpipe, uio)
715 struct pipe *wpipe;
716 struct uio *uio;
717 {
718 size_t size;
719 int i;
720 vm_offset_t addr, endaddr, paddr;
721
722 size = uio->uio_iov->iov_len;
723 if (size > wpipe->pipe_buffer.size)
724 size = wpipe->pipe_buffer.size;
725
726 endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
727 mtx_lock(&vm_mtx);
728 addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
729 for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
730 vm_page_t m;
731
732 if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
733 (paddr = pmap_kextract(addr)) == 0) {
734 int j;
735
736 for (j = 0; j < i; j++)
737 vm_page_unwire(wpipe->pipe_map.ms[j], 1);
738 mtx_unlock(&vm_mtx);
739 return (EFAULT);
740 }
741
742 m = PHYS_TO_VM_PAGE(paddr);
743 vm_page_wire(m);
744 wpipe->pipe_map.ms[i] = m;
745 }
746
747 /*
748 * set up the control block
749 */
750 wpipe->pipe_map.npages = i;
751 wpipe->pipe_map.pos =
752 ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
753 wpipe->pipe_map.cnt = size;
754
755 /*
756 * and map the buffer
757 */
758 if (wpipe->pipe_map.kva == 0) {
759 /*
760 * We need to allocate space for an extra page because the
761 * address range might (will) span pages at times.
762 */
763 wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
764 wpipe->pipe_buffer.size + PAGE_SIZE);
765 amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
766 }
767 pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
768 wpipe->pipe_map.npages);
769
770 mtx_unlock(&vm_mtx);
771 /*
772 * and update the uio data
773 */
774
775 uio->uio_iov->iov_len -= size;
776 uio->uio_iov->iov_base += size;
777 if (uio->uio_iov->iov_len == 0)
778 uio->uio_iov++;
779 uio->uio_resid -= size;
780 uio->uio_offset += size;
781 return (0);
782 }
783
784 /*
785 * unmap and unwire the process buffer
786 */
787 static void
788 pipe_destroy_write_buffer(wpipe)
789 struct pipe *wpipe;
790 {
791 int i;
792
793 mtx_lock(&vm_mtx);
794 if (wpipe->pipe_map.kva) {
795 pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
796
797 if (amountpipekva > maxpipekva) {
798 vm_offset_t kva = wpipe->pipe_map.kva;
799 wpipe->pipe_map.kva = 0;
800 kmem_free(kernel_map, kva,
801 wpipe->pipe_buffer.size + PAGE_SIZE);
802 amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
803 }
804 }
805 for (i = 0; i < wpipe->pipe_map.npages; i++)
806 vm_page_unwire(wpipe->pipe_map.ms[i], 1);
807 mtx_unlock(&vm_mtx);
808 }
809
810 /*
811 * In the case of a signal, the writing process might go away. This
812 * code copies the data into the circular buffer so that the source
813 * pages can be freed without loss of data.
814 */
815 static void
816 pipe_clone_write_buffer(wpipe)
817 struct pipe *wpipe;
818 {
819 int size;
820 int pos;
821
822 size = wpipe->pipe_map.cnt;
823 pos = wpipe->pipe_map.pos;
824 memcpy((caddr_t) wpipe->pipe_buffer.buffer,
825 (caddr_t) wpipe->pipe_map.kva + pos, size);
826
827 wpipe->pipe_buffer.in = size;
828 wpipe->pipe_buffer.out = 0;
829 wpipe->pipe_buffer.cnt = size;
830 wpipe->pipe_state &= ~PIPE_DIRECTW;
831
832 pipe_destroy_write_buffer(wpipe);
833 }
834
835 /*
836 * This implements the pipe buffer write mechanism. Note that only
837 * a direct write OR a normal pipe write can be pending at any given time.
838 * If there are any characters in the pipe buffer, the direct write will
839 * be deferred until the receiving process grabs all of the bytes from
840 * the pipe buffer. Then the direct mapping write is set-up.
841 */
842 static int
843 pipe_direct_write(wpipe, uio)
844 struct pipe *wpipe;
845 struct uio *uio;
846 {
847 int error;
848
849 retry:
850 while (wpipe->pipe_state & PIPE_DIRECTW) {
851 if (wpipe->pipe_state & PIPE_WANTR) {
852 wpipe->pipe_state &= ~PIPE_WANTR;
853 wakeup(wpipe);
854 }
855 wpipe->pipe_state |= PIPE_WANTW;
856 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
857 if (error)
858 goto error1;
859 if (wpipe->pipe_state & PIPE_EOF) {
860 error = EPIPE;
861 goto error1;
862 }
863 }
864 wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
865 if (wpipe->pipe_buffer.cnt > 0) {
866 if (wpipe->pipe_state & PIPE_WANTR) {
867 wpipe->pipe_state &= ~PIPE_WANTR;
868 wakeup(wpipe);
869 }
870
871 wpipe->pipe_state |= PIPE_WANTW;
872 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
873 if (error)
874 goto error1;
875 if (wpipe->pipe_state & PIPE_EOF) {
876 error = EPIPE;
877 goto error1;
878 }
879 goto retry;
880 }
881
882 wpipe->pipe_state |= PIPE_DIRECTW;
883
884 error = pipe_build_write_buffer(wpipe, uio);
885 if (error) {
886 wpipe->pipe_state &= ~PIPE_DIRECTW;
887 goto error1;
888 }
889
890 error = 0;
891 while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
892 if (wpipe->pipe_state & PIPE_EOF) {
893 pipelock(wpipe, 0);
894 pipe_destroy_write_buffer(wpipe);
895 pipeunlock(wpipe);
896 pipeselwakeup(wpipe, wpipe);
897 error = EPIPE;
898 goto error1;
899 }
900 if (wpipe->pipe_state & PIPE_WANTR) {
901 wpipe->pipe_state &= ~PIPE_WANTR;
902 wakeup(wpipe);
903 }
904 pipeselwakeup(wpipe, wpipe);
905 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
906 }
907
908 pipelock(wpipe,0);
909 if (wpipe->pipe_state & PIPE_DIRECTW) {
910 /*
911 * this bit of trickery substitutes a kernel buffer for
912 * the process that might be going away.
913 */
914 pipe_clone_write_buffer(wpipe);
915 } else {
916 pipe_destroy_write_buffer(wpipe);
917 }
918 pipeunlock(wpipe);
919 return (error);
920
921 error1:
922 wakeup(wpipe);
923 return (error);
924 }
925 #endif /* !PIPE_NODIRECT */
926 #endif /* FreeBSD */
927
928 #ifdef __NetBSD__
929 #ifndef PIPE_NODIRECT
930 /*
931 * Allocate structure for loan transfer.
932 */
933 static __inline int
934 pipe_loan_alloc(wpipe, npages, blen)
935 struct pipe *wpipe;
936 int npages;
937 vsize_t blen;
938 {
939 wpipe->pipe_map.kva = uvm_km_valloc_wait(kernel_map, blen);
940 if (wpipe->pipe_map.kva == NULL)
941 return (ENOMEM);
942
943 amountpipekva += blen;
944 wpipe->pipe_map.npages = npages;
945 wpipe->pipe_map.ms = (struct vm_page **) malloc(
946 npages * sizeof(struct vm_page *), M_PIPE, M_WAITOK);
947
948 return (0);
949 }
950
951 /*
952 * Free resources allocated for loan transfer.
953 */
954 static void
955 pipe_loan_free(wpipe)
956 struct pipe *wpipe;
957 {
958 uvm_km_free(kernel_map, wpipe->pipe_map.kva,
959 wpipe->pipe_map.npages * PAGE_SIZE);
960 wpipe->pipe_map.kva = NULL;
961 amountpipekva -= wpipe->pipe_map.npages * PAGE_SIZE;
962 free(wpipe->pipe_map.ms, M_PIPE);
963 wpipe->pipe_map.ms = NULL;
964 }
965
966 /*
967 * NetBSD direct write, using uvm_loan() mechanism.
968 * This implements the pipe buffer write mechanism. Note that only
969 * a direct write OR a normal pipe write can be pending at any given time.
970 * If there are any characters in the pipe buffer, the direct write will
971 * be deferred until the receiving process grabs all of the bytes from
972 * the pipe buffer. Then the direct mapping write is set-up.
973 */
974 static __inline int
975 pipe_direct_write(wpipe, uio)
976 struct pipe *wpipe;
977 struct uio *uio;
978 {
979 int error, npages, j;
980 struct vm_page **res = NULL;
981 vaddr_t bbase, kva, base, bend;
982 vsize_t blen, bcnt;
983 voff_t bpos;
984
985 retry:
986 while (wpipe->pipe_state & PIPE_DIRECTW) {
987 if (wpipe->pipe_state & PIPE_WANTR) {
988 wpipe->pipe_state &= ~PIPE_WANTR;
989 wakeup(wpipe);
990 }
991 wpipe->pipe_state |= PIPE_WANTW;
992 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
993 if (error)
994 goto error;
995 if (wpipe->pipe_state & PIPE_EOF) {
996 error = EPIPE;
997 goto error;
998 }
999 }
1000 wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
1001 if (wpipe->pipe_buffer.cnt > 0) {
1002 if (wpipe->pipe_state & PIPE_WANTR) {
1003 wpipe->pipe_state &= ~PIPE_WANTR;
1004 wakeup(wpipe);
1005 }
1006
1007 wpipe->pipe_state |= PIPE_WANTW;
1008 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
1009 if (error)
1010 goto error;
1011 if (wpipe->pipe_state & PIPE_EOF) {
1012 error = EPIPE;
1013 goto error;
1014 }
1015 goto retry;
1016 }
1017
1018 /*
1019 * Handle first iovec, first PIPE_CHUNK_SIZE bytes. Expect caller
1020 * to deal with short write.
1021 *
1022 * Note: need to deal with buffers not aligned to PAGE_SIZE.
1023 */
1024 bbase = (vaddr_t)uio->uio_iov[0].iov_base;
1025 base = trunc_page(bbase);
1026 bend = round_page(bbase + uio->uio_iov[0].iov_len);
1027 blen = bend - base;
1028 bpos = bbase - base;
1029
1030 if (blen > PIPE_DIRECT_CHUNK) {
1031 blen = PIPE_DIRECT_CHUNK;
1032 bend = base + blen;
1033 bcnt = PIPE_DIRECT_CHUNK - bpos;
1034 } else
1035 bcnt = uio->uio_iov[0].iov_len;
1036
1037 npages = blen / PAGE_SIZE;
1038
1039 wpipe->pipe_map.pos = bpos;
1040 wpipe->pipe_map.cnt = bcnt;
1041
1042 /*
1043 * Free the old kva if we need more pages than we have
1044 * allocated.
1045 */
1046 if (wpipe->pipe_map.kva && npages > wpipe->pipe_map.npages)
1047 pipe_loan_free(wpipe);
1048
1049 /* Allocate new kva. */
1050 if (!wpipe->pipe_map.kva
1051 && (error = pipe_loan_alloc(wpipe, npages, blen)))
1052 goto error;
1053
1054 /* Loan the write buffer memory from writer process */
1055 error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, base, blen,
1056 (void **) wpipe->pipe_map.ms, UVM_LOAN_TOPAGE);
1057 if (error)
1058 goto cleanup;
1059 res = wpipe->pipe_map.ms;
1060
1061 /* Enter the loaned pages to kva */
1062 kva = wpipe->pipe_map.kva;
1063 for(j=0; j < npages; j++, kva += PAGE_SIZE)
1064 pmap_enter(pmap_kernel(), kva, res[j]->phys_addr,
1065 VM_PROT_READ, 0);
1066
1067 wpipe->pipe_state |= PIPE_DIRECTW;
1068 error = 0;
1069 while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
1070 if (wpipe->pipe_state & PIPE_EOF) {
1071 error = EPIPE;
1072 break;
1073 }
1074 if (wpipe->pipe_state & PIPE_WANTR) {
1075 wpipe->pipe_state &= ~PIPE_WANTR;
1076 wakeup(wpipe);
1077 }
1078 pipeselwakeup(wpipe, wpipe);
1079 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
1080 }
1081
1082 if (error)
1083 wpipe->pipe_state &= ~PIPE_DIRECTW;
1084
1085 cleanup:
1086 pipelock(wpipe, 0);
1087 if (error || amountpipekva > maxpipekva)
1088 pipe_loan_free(wpipe);
1089 else if (res)
1090 uvm_unloanpage(res, npages);
1091 pipeunlock(wpipe);
1092
1093 if (error == EPIPE) {
1094 pipeselwakeup(wpipe, wpipe);
1095
1096 /*
1097 * If anything was read from what we offered, return success
1098 * and short write. We return EOF on next write(2).
1099 */
1100 if (wpipe->pipe_map.cnt < bcnt) {
1101 bcnt -= wpipe->pipe_map.cnt;
1102 error = 0;
1103 }
1104 }
1105
1106 if (error) {
1107 error:
1108 wakeup(wpipe);
1109 return (error);
1110 }
1111
1112 uio->uio_resid -= bcnt;
1113 /* uio_offset not updated, not set/used for write(2) */
1114
1115 return (0);
1116 }
1117 #endif /* !PIPE_NODIRECT */
1118 #endif /* NetBSD */
1119
1120 #ifdef __FreeBSD__
1121 static int
1122 pipe_write(fp, uio, cred, flags, p)
1123 struct file *fp;
1124 off_t *offset;
1125 struct uio *uio;
1126 struct ucred *cred;
1127 int flags;
1128 struct proc *p;
1129 #elif defined(__NetBSD__)
1130 static int
1131 pipe_write(fp, offset, uio, cred, flags)
1132 struct file *fp;
1133 off_t *offset;
1134 struct uio *uio;
1135 struct ucred *cred;
1136 int flags;
1137 #endif
1138 {
1139 int error = 0;
1140 int orig_resid;
1141 struct pipe *wpipe, *rpipe;
1142
1143 rpipe = (struct pipe *) fp->f_data;
1144 wpipe = rpipe->pipe_peer;
1145
1146 /*
1147 * detect loss of pipe read side, issue SIGPIPE if lost.
1148 */
1149 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF))
1150 return (EPIPE);
1151
1152 ++wpipe->pipe_busy;
1153
1154 /*
1155 * If it is advantageous to resize the pipe buffer, do
1156 * so.
1157 */
1158 if ((uio->uio_resid > PIPE_SIZE) &&
1159 (nbigpipe < maxbigpipes) &&
1160 #ifndef PIPE_NODIRECT
1161 (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
1162 #endif
1163 (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
1164 (wpipe->pipe_buffer.cnt == 0)) {
1165
1166 if ((error = pipelock(wpipe,1)) == 0) {
1167 if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
1168 nbigpipe++;
1169 pipeunlock(wpipe);
1170 } else {
1171 /*
1172 * If an error occurred, unbusy and return, waking up
1173 * any waiting readers.
1174 */
1175 --wpipe->pipe_busy;
1176 if (wpipe->pipe_busy == 0
1177 && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1178 wpipe->pipe_state &=
1179 ~(PIPE_WANTCLOSE | PIPE_WANTR);
1180 wakeup(wpipe);
1181 }
1182
1183 return (error);
1184 }
1185 }
1186
1187 #ifdef __FreeBSD__
1188 KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
1189 #endif
1190
1191 orig_resid = uio->uio_resid;
1192 while (uio->uio_resid) {
1193 int space;
1194
1195 #ifndef PIPE_NODIRECT
1196 /*
1197 * If the transfer is large, we can gain performance if
1198 * we do process-to-process copies directly.
1199 * If the write is non-blocking, we don't use the
1200 * direct write mechanism.
1201 *
1202 * The direct write mechanism will detect the reader going
1203 * away on us.
1204 */
1205 if ((uio->uio_iov[0].iov_len >= PIPE_MINDIRECT) &&
1206 (uio->uio_resid == orig_resid) &&
1207 (fp->f_flag & FNONBLOCK) == 0 &&
1208 (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
1209 error = pipe_direct_write(wpipe, uio);
1210
1211 /*
1212 * We either errorred, wrote whole buffer, or
1213 * wrote part of buffer. If the error is ENOMEM,
1214 * we failed to allocate some resources for direct
1215 * write and fall back to ordinary write. Otherwise,
1216 * break out now.
1217 */
1218 if (error != ENOMEM)
1219 break;
1220 }
1221 #endif /* PIPE_NODIRECT */
1222
1223 /*
1224 * Pipe buffered writes cannot be coincidental with
1225 * direct writes. We wait until the currently executing
1226 * direct write is completed before we start filling the
1227 * pipe buffer. We break out if a signal occurs or the
1228 * reader goes away.
1229 */
1230 retrywrite:
1231 while (wpipe->pipe_state & PIPE_DIRECTW) {
1232 if (wpipe->pipe_state & PIPE_WANTR) {
1233 wpipe->pipe_state &= ~PIPE_WANTR;
1234 wakeup(wpipe);
1235 }
1236 error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
1237 if (wpipe->pipe_state & PIPE_EOF)
1238 break;
1239 if (error)
1240 break;
1241 }
1242 if (wpipe->pipe_state & PIPE_EOF) {
1243 error = EPIPE;
1244 break;
1245 }
1246
1247 space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1248
1249 /* Writes of size <= PIPE_BUF must be atomic. */
1250 if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1251 space = 0;
1252
1253 if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
1254 int size; /* Transfer size */
1255 int segsize; /* first segment to transfer */
1256
1257 if ((error = pipelock(wpipe,1)) != 0)
1258 break;
1259
1260 /*
1261 * It is possible for a direct write to
1262 * slip in on us... handle it here...
1263 */
1264 if (wpipe->pipe_state & PIPE_DIRECTW) {
1265 pipeunlock(wpipe);
1266 goto retrywrite;
1267 }
1268 /*
1269 * If a process blocked in uiomove, our
1270 * value for space might be bad.
1271 *
1272 * XXX will we be ok if the reader has gone
1273 * away here?
1274 */
1275 if (space > wpipe->pipe_buffer.size -
1276 wpipe->pipe_buffer.cnt) {
1277 pipeunlock(wpipe);
1278 goto retrywrite;
1279 }
1280
1281 /*
1282 * Transfer size is minimum of uio transfer
1283 * and free space in pipe buffer.
1284 */
1285 if (space > uio->uio_resid)
1286 size = uio->uio_resid;
1287 else
1288 size = space;
1289 /*
1290 * First segment to transfer is minimum of
1291 * transfer size and contiguous space in
1292 * pipe buffer. If first segment to transfer
1293 * is less than the transfer size, we've got
1294 * a wraparound in the buffer.
1295 */
1296 segsize = wpipe->pipe_buffer.size -
1297 wpipe->pipe_buffer.in;
1298 if (segsize > size)
1299 segsize = size;
1300
1301 /* Transfer first segment */
1302
1303 error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1304 segsize, uio);
1305
1306 if (error == 0 && segsize < size) {
1307 /*
1308 * Transfer remaining part now, to
1309 * support atomic writes. Wraparound
1310 * happened.
1311 */
1312 #ifdef DEBUG
1313 if (wpipe->pipe_buffer.in + segsize !=
1314 wpipe->pipe_buffer.size)
1315 panic("Expected pipe buffer wraparound disappeared");
1316 #endif
1317
1318 error = uiomove(&wpipe->pipe_buffer.buffer[0],
1319 size - segsize, uio);
1320 }
1321 if (error == 0) {
1322 wpipe->pipe_buffer.in += size;
1323 if (wpipe->pipe_buffer.in >=
1324 wpipe->pipe_buffer.size) {
1325 #ifdef DEBUG
1326 if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
1327 panic("Expected wraparound bad");
1328 #endif
1329 wpipe->pipe_buffer.in = size - segsize;
1330 }
1331
1332 wpipe->pipe_buffer.cnt += size;
1333 #ifdef DEBUG
1334 if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
1335 panic("Pipe buffer overflow");
1336 #endif
1337
1338 }
1339 pipeunlock(wpipe);
1340 if (error)
1341 break;
1342
1343 } else {
1344 /*
1345 * If the "read-side" has been blocked, wake it up now.
1346 */
1347 if (wpipe->pipe_state & PIPE_WANTR) {
1348 wpipe->pipe_state &= ~PIPE_WANTR;
1349 wakeup(wpipe);
1350 }
1351
1352 /*
1353 * don't block on non-blocking I/O
1354 */
1355 if (fp->f_flag & FNONBLOCK) {
1356 error = EAGAIN;
1357 break;
1358 }
1359
1360 /*
1361 * We have no more space and have something to offer,
1362 * wake up select/poll.
1363 */
1364 pipeselwakeup(wpipe, wpipe);
1365
1366 wpipe->pipe_state |= PIPE_WANTW;
1367 error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0);
1368 if (error != 0)
1369 break;
1370 /*
1371 * If read side wants to go away, we just issue a signal
1372 * to ourselves.
1373 */
1374 if (wpipe->pipe_state & PIPE_EOF) {
1375 error = EPIPE;
1376 break;
1377 }
1378 }
1379 }
1380
1381 --wpipe->pipe_busy;
1382 if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1383 wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
1384 wakeup(wpipe);
1385 } else if (wpipe->pipe_buffer.cnt > 0) {
1386 /*
1387 * If we have put any characters in the buffer, we wake up
1388 * the reader.
1389 */
1390 if (wpipe->pipe_state & PIPE_WANTR) {
1391 wpipe->pipe_state &= ~PIPE_WANTR;
1392 wakeup(wpipe);
1393 }
1394 }
1395
1396 /*
1397 * Don't return EPIPE if I/O was successful
1398 */
1399 if ((error == EPIPE) && (wpipe->pipe_buffer.cnt == 0)
1400 && (uio->uio_resid == 0))
1401 error = 0;
1402
1403 if (error == 0)
1404 vfs_timestamp(&wpipe->pipe_mtime);
1405
1406 /*
1407 * We have something to offer, wake up select/poll.
1408 * wpipe->pipe_map.cnt is always 0 in this point (direct write
1409 * is only done synchronously), so check wpipe->only pipe_buffer.cnt
1410 */
1411 if (wpipe->pipe_buffer.cnt)
1412 pipeselwakeup(wpipe, wpipe);
1413
1414 /*
1415 * Arrange for next read(2) to do a signal.
1416 */
1417 wpipe->pipe_state |= PIPE_SIGNALR;
1418
1419 return (error);
1420 }
1421
1422 /*
1423 * we implement a very minimal set of ioctls for compatibility with sockets.
1424 */
1425 int
1426 pipe_ioctl(fp, cmd, data, p)
1427 struct file *fp;
1428 u_long cmd;
1429 caddr_t data;
1430 struct proc *p;
1431 {
1432 struct pipe *mpipe = (struct pipe *)fp->f_data;
1433
1434 switch (cmd) {
1435
1436 case FIONBIO:
1437 return (0);
1438
1439 case FIOASYNC:
1440 if (*(int *)data) {
1441 mpipe->pipe_state |= PIPE_ASYNC;
1442 } else {
1443 mpipe->pipe_state &= ~PIPE_ASYNC;
1444 }
1445 return (0);
1446
1447 case FIONREAD:
1448 #ifndef PIPE_NODIRECT
1449 if (mpipe->pipe_state & PIPE_DIRECTW)
1450 *(int *)data = mpipe->pipe_map.cnt;
1451 else
1452 #endif
1453 *(int *)data = mpipe->pipe_buffer.cnt;
1454 return (0);
1455
1456 #ifdef __FreeBSD__
1457 case FIOSETOWN:
1458 return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1459
1460 case FIOGETOWN:
1461 *(int *)data = fgetown(mpipe->pipe_sigio);
1462 return (0);
1463
1464 /* This is deprecated, FIOSETOWN should be used instead. */
1465 case TIOCSPGRP:
1466 return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1467
1468 /* This is deprecated, FIOGETOWN should be used instead. */
1469 case TIOCGPGRP:
1470 *(int *)data = -fgetown(mpipe->pipe_sigio);
1471 return (0);
1472 #endif /* FreeBSD */
1473 #ifdef __NetBSD__
1474 case TIOCSPGRP:
1475 mpipe->pipe_pgid = *(int *)data;
1476 return (0);
1477
1478 case TIOCGPGRP:
1479 *(int *)data = mpipe->pipe_pgid;
1480 return (0);
1481 #endif /* NetBSD */
1482
1483 }
1484 return (ENOTTY);
1485 }
1486
1487 int
1488 pipe_poll(fp, events, p)
1489 struct file *fp;
1490 int events;
1491 struct proc *p;
1492 {
1493 struct pipe *rpipe = (struct pipe *)fp->f_data;
1494 struct pipe *wpipe;
1495 int revents = 0;
1496
1497 wpipe = rpipe->pipe_peer;
1498 if (events & (POLLIN | POLLRDNORM))
1499 if ((rpipe->pipe_buffer.cnt > 0) ||
1500 #ifndef PIPE_NODIRECT
1501 (rpipe->pipe_state & PIPE_DIRECTW) ||
1502 #endif
1503 (rpipe->pipe_state & PIPE_EOF))
1504 revents |= events & (POLLIN | POLLRDNORM);
1505
1506 if (events & (POLLOUT | POLLWRNORM))
1507 if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)
1508 || (
1509 #ifndef PIPE_NODIRECT
1510 ((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1511 #endif
1512 (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1513 revents |= events & (POLLOUT | POLLWRNORM);
1514
1515 if ((rpipe->pipe_state & PIPE_EOF) ||
1516 (wpipe == NULL) ||
1517 (wpipe->pipe_state & PIPE_EOF))
1518 revents |= POLLHUP;
1519
1520 if (revents == 0) {
1521 if (events & (POLLIN | POLLRDNORM)) {
1522 selrecord(p, &rpipe->pipe_sel);
1523 rpipe->pipe_state |= PIPE_SEL;
1524 }
1525
1526 if (events & (POLLOUT | POLLWRNORM)) {
1527 selrecord(p, &wpipe->pipe_sel);
1528 wpipe->pipe_state |= PIPE_SEL;
1529 }
1530 }
1531
1532 return (revents);
1533 }
1534
1535 static int
1536 pipe_stat(fp, ub, p)
1537 struct file *fp;
1538 struct stat *ub;
1539 struct proc *p;
1540 {
1541 struct pipe *pipe = (struct pipe *)fp->f_data;
1542
1543 memset((caddr_t)ub, 0, sizeof(*ub));
1544 ub->st_mode = S_IFIFO;
1545 ub->st_blksize = pipe->pipe_buffer.size;
1546 ub->st_size = pipe->pipe_buffer.cnt;
1547 ub->st_blocks = (ub->st_size) ? 1 : 0;
1548 #ifdef __FreeBSD__
1549 ub->st_atimespec = pipe->pipe_atime;
1550 ub->st_mtimespec = pipe->pipe_mtime;
1551 ub->st_ctimespec = pipe->pipe_ctime;
1552 #endif /* FreeBSD */
1553 #ifdef __NetBSD__
1554 TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec)
1555 TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
1556 TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
1557 #endif /* NetBSD */
1558 ub->st_uid = fp->f_cred->cr_uid;
1559 ub->st_gid = fp->f_cred->cr_gid;
1560 /*
1561 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1562 * XXX (st_dev, st_ino) should be unique.
1563 */
1564 return (0);
1565 }
1566
1567 /* ARGSUSED */
1568 static int
1569 pipe_close(fp, p)
1570 struct file *fp;
1571 struct proc *p;
1572 {
1573 struct pipe *cpipe = (struct pipe *)fp->f_data;
1574
1575 #ifdef __FreeBSD__
1576 fp->f_ops = &badfileops;
1577 funsetown(cpipe->pipe_sigio);
1578 #endif
1579 fp->f_data = NULL;
1580 pipeclose(cpipe);
1581 return (0);
1582 }
1583
1584 static void
1585 pipe_free_kmem(cpipe)
1586 struct pipe *cpipe;
1587 {
1588
1589 #ifdef __FreeBSD__
1590 mtx_assert(&vm_mtx, MA_OWNED);
1591 #endif
1592 if (cpipe->pipe_buffer.buffer != NULL) {
1593 if (cpipe->pipe_buffer.size > PIPE_SIZE)
1594 --nbigpipe;
1595 amountpipekva -= cpipe->pipe_buffer.size;
1596 #ifdef __FreeBSD__
1597 kmem_free(kernel_map,
1598 (vm_offset_t)cpipe->pipe_buffer.buffer,
1599 cpipe->pipe_buffer.size);
1600 #elif defined(__NetBSD__)
1601 uvm_km_free(kernel_map,
1602 (vaddr_t)cpipe->pipe_buffer.buffer,
1603 cpipe->pipe_buffer.size);
1604 #endif /* NetBSD */
1605
1606 cpipe->pipe_buffer.buffer = NULL;
1607 }
1608 #ifndef PIPE_NODIRECT
1609 if (cpipe->pipe_map.kva != NULL) {
1610 #ifdef __FreeBSD__
1611 amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
1612 kmem_free(kernel_map,
1613 cpipe->pipe_map.kva,
1614 cpipe->pipe_buffer.size + PAGE_SIZE);
1615 #elif defined(__NetBSD__)
1616 pipe_loan_free(cpipe);
1617 #endif /* NetBSD */
1618 cpipe->pipe_map.cnt = 0;
1619 cpipe->pipe_map.kva = NULL;
1620 cpipe->pipe_map.pos = 0;
1621 cpipe->pipe_map.npages = 0;
1622 }
1623 #endif /* !PIPE_NODIRECT */
1624 }
1625
1626 /*
1627 * shutdown the pipe
1628 */
1629 static void
1630 pipeclose(cpipe)
1631 struct pipe *cpipe;
1632 {
1633 struct pipe *ppipe;
1634
1635 if (!cpipe)
1636 return;
1637
1638 pipeselwakeup(cpipe, cpipe);
1639
1640 /*
1641 * If the other side is blocked, wake it up saying that
1642 * we want to close it down.
1643 */
1644 while (cpipe->pipe_busy) {
1645 wakeup(cpipe);
1646 cpipe->pipe_state |= PIPE_WANTCLOSE | PIPE_EOF;
1647 tsleep(cpipe, PRIBIO, "pipecl", 0);
1648 }
1649
1650 /*
1651 * Disconnect from peer
1652 */
1653 if ((ppipe = cpipe->pipe_peer) != NULL) {
1654 pipeselwakeup(ppipe, ppipe);
1655
1656 ppipe->pipe_state |= PIPE_EOF;
1657 wakeup(ppipe);
1658 ppipe->pipe_peer = NULL;
1659 }
1660
1661 /*
1662 * free resources
1663 */
1664 #ifdef _FreeBSD__
1665 mtx_lock(&vm_mtx);
1666 pipe_free_kmem(cpipe);
1667 /* XXX: erm, doesn't zalloc already have its own locks and
1668 * not need the giant vm lock?
1669 */
1670 zfree(pipe_zone, cpipe);
1671 mtx_unlock(&vm_mtx);
1672 #endif /* FreeBSD */
1673
1674 #ifdef __NetBSD__
1675 pipe_free_kmem(cpipe);
1676 (void) lockmgr(&cpipe->pipe_lock, LK_DRAIN, NULL);
1677 pool_put(&pipe_pool, cpipe);
1678 #endif
1679 }
1680
1681 /*ARGSUSED*/
1682 static int
1683 pipe_kqfilter(struct file *fp, struct knote *kn)
1684 {
1685 struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1686
1687 switch (kn->kn_filter) {
1688 case EVFILT_READ:
1689 kn->kn_fop = &pipe_rfiltops;
1690 break;
1691 case EVFILT_WRITE:
1692 kn->kn_fop = &pipe_wfiltops;
1693 cpipe = cpipe->pipe_peer;
1694 break;
1695 default:
1696 return (1);
1697 }
1698 kn->kn_hook = (caddr_t)cpipe;
1699
1700 #ifdef __FreeBSD__
1701 SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1702 #else
1703 SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_klist, kn, kn_selnext);
1704 #endif /* __FreeBSD__ */
1705 return (0);
1706 }
1707
1708 static void
1709 filt_pipedetach(struct knote *kn)
1710 {
1711 struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1712
1713 #ifdef __FreeBSD__
1714 SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1715 #else
1716 SLIST_REMOVE(&cpipe->pipe_sel.si_klist, kn, knote, kn_selnext);
1717 #endif /* __FreeBSD__ */
1718 }
1719
1720 /*ARGSUSED*/
1721 static int
1722 filt_piperead(struct knote *kn, long hint)
1723 {
1724 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1725 struct pipe *wpipe = rpipe->pipe_peer;
1726
1727 kn->kn_data = rpipe->pipe_buffer.cnt;
1728 if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1729 kn->kn_data = rpipe->pipe_map.cnt;
1730
1731 if ((rpipe->pipe_state & PIPE_EOF) ||
1732 (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1733 kn->kn_flags |= EV_EOF;
1734 return (1);
1735 }
1736 return (kn->kn_data > 0);
1737 }
1738
1739 /*ARGSUSED*/
1740 static int
1741 filt_pipewrite(struct knote *kn, long hint)
1742 {
1743 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1744 struct pipe *wpipe = rpipe->pipe_peer;
1745
1746 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1747 kn->kn_data = 0;
1748 kn->kn_flags |= EV_EOF;
1749 return (1);
1750 }
1751 kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1752 if (wpipe->pipe_state & PIPE_DIRECTW)
1753 kn->kn_data = 0;
1754
1755 return (kn->kn_data >= PIPE_BUF);
1756 }
1757
1758 #ifdef __NetBSD__
1759 static int
1760 pipe_fcntl(fp, cmd, data, p)
1761 struct file *fp;
1762 u_int cmd;
1763 caddr_t data;
1764 struct proc *p;
1765 {
1766 if (cmd == F_SETFL)
1767 return (0);
1768 else
1769 return (EOPNOTSUPP);
1770 }
1771
1772 /*
1773 * Handle pipe sysctls.
1774 */
1775 int
1776 sysctl_dopipe(name, namelen, oldp, oldlenp, newp, newlen)
1777 int *name;
1778 u_int namelen;
1779 void *oldp;
1780 size_t *oldlenp;
1781 void *newp;
1782 size_t newlen;
1783 {
1784 /* All sysctl names at this level are terminal. */
1785 if (namelen != 1)
1786 return (ENOTDIR); /* overloaded */
1787
1788 switch (name[0]) {
1789 case KERN_PIPE_MAXKVASZ:
1790 return (sysctl_int(oldp, oldlenp, newp, newlen, &maxpipekva));
1791 case KERN_PIPE_LIMITKVA:
1792 return (sysctl_int(oldp, oldlenp, newp, newlen, &limitpipekva));
1793 case KERN_PIPE_MAXBIGPIPES:
1794 return (sysctl_int(oldp, oldlenp, newp, newlen, &maxbigpipes));
1795 case KERN_PIPE_NBIGPIPES:
1796 return (sysctl_rdint(oldp, oldlenp, newp, nbigpipe));
1797 case KERN_PIPE_KVASIZE:
1798 return (sysctl_rdint(oldp, oldlenp, newp, amountpipekva));
1799 default:
1800 return (EOPNOTSUPP);
1801 }
1802 /* NOTREACHED */
1803 }
1804
1805 /*
1806 * Initialize pipe structs.
1807 */
1808 void
1809 pipe_init(void)
1810 {
1811 pool_init(&pipe_pool, sizeof(struct pipe), 0, 0, 0, "pipepl",
1812 0, NULL, NULL, M_PIPE);
1813 }
1814
1815 #endif /* __NetBSD __ */
1816