1 /* $NetBSD: sys_pipe.c,v 1.18 2001/11/06 07:30:15 chs Exp $ */
2
3 /*
4 * Copyright (c) 1996 John S. Dyson
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice immediately at the beginning of the file, without modification,
12 * this list of conditions, and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Absolutely no warranty of function or purpose is made by the author
17 * John S. Dyson.
18 * 4. Modifications may be freely made to this file if the above conditions
19 * are met.
20 *
21 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.82 2001/06/15 20:45:01 jlemon Exp $
22 */
23
24 /*
25 * This file contains a high-performance replacement for the socket-based
26 * pipes scheme originally used in FreeBSD/4.4Lite. It does not support
27 * all features of sockets, but does do everything that pipes normally
28 * do.
29 *
30 * Adaptation for NetBSD UVM, including uvm_loan() based direct write, was
31 * written by Jaromir Dolecek.
32 */
33
34 /*
35 * This code has two modes of operation, a small write mode and a large
36 * write mode. The small write mode acts like conventional pipes with
37 * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
38 * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
39 * and PIPE_SIZE in size, it is fully mapped into the kernel (on FreeBSD,
40 * those pages are also wired), and the receiving process can copy it directly
41 * from the pages in the sending process.
42 *
43 * If the sending process receives a signal, it is possible that it will
44 * go away, and certainly its address space can change, because control
45 * is returned back to the user-mode side. In that case, the pipe code
46 * arranges to copy the buffer supplied by the user process on FreeBSD, to
47 * a pageable kernel buffer, and the receiving process will grab the data
48 * from the pageable kernel buffer. Since signals don't happen all that often,
49 * the copy operation is normally eliminated.
50 * For NetBSD, the pages are mapped read-only, COW for kernel by uvm_loan(),
51 * so no explicit handling needs to be done; all is handled by standard VM
52 * facilities.
53 *
54 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
55 * happen for small transfers so that the system will not spend all of
56 * its time context switching. PIPE_SIZE is constrained by the
57 * amount of kernel virtual memory.
58 */
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/proc.h>
63 #include <sys/fcntl.h>
64 #include <sys/file.h>
65 #include <sys/filedesc.h>
66 #include <sys/filio.h>
67 #include <sys/ttycom.h>
68 #include <sys/stat.h>
69 #include <sys/poll.h>
70 #include <sys/signalvar.h>
71 #include <sys/vnode.h>
72 #include <sys/uio.h>
73 #include <sys/lock.h>
74 #ifdef __FreeBSD__
75 #include <sys/mutex.h>
76 #include <sys/selinfo.h>
77 #include <sys/sysproto.h>
78 #elif defined(__NetBSD__)
79 #include <sys/select.h>
80 #include <sys/malloc.h>
81 #include <sys/mount.h>
82 #include <sys/syscallargs.h>
83 #include <uvm/uvm.h>
84 #include <sys/sysctl.h>
85 #include <sys/kernel.h>
86 #endif /* NetBSD, FreeBSD */
87
88 #include <sys/pipe.h>
89
90 #ifdef __NetBSD__
91 /*
92 * Avoid microtime(9), it's slow. We don't guard the read from time(9)
93 * with splclock(9) since we don't actually need to be THAT sure the access
94 * is atomic.
95 */
96 #define vfs_timestamp(tv) (*(tv) = time)
97 #endif
98
99 /*
100 * Use this define if you want to disable *fancy* VM things. Expect an
101 * approx 30% decrease in transfer rate. This could be useful for
102 * OpenBSD.
103 */
104 /* #define PIPE_NODIRECT */
105
106 /*
107 * interfaces to the outside world
 *
 * The pipe is exposed as a DTYPE_PIPE file through the per-platform
 * fileops tables below.  Note the table layouts differ: the FreeBSD
 * variant carries a kqfilter entry, the NetBSD variant an fcntl entry,
 * and the read/write prototypes differ (NetBSD passes an off_t *).
108 */
109 #ifdef __FreeBSD__
110 static int pipe_read __P((struct file *fp, struct uio *uio,
111 struct ucred *cred, int flags, struct proc *p));
112 static int pipe_write __P((struct file *fp, struct uio *uio,
113 struct ucred *cred, int flags, struct proc *p));
114 static int pipe_close __P((struct file *fp, struct proc *p));
115 static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
116 struct proc *p));
117 static int pipe_kqfilter __P((struct file *fp, struct knote *kn));
118 static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
119 static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
120
121 static struct fileops pipeops = {
122 pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
123 pipe_stat, pipe_close
124 };
125
126 static void filt_pipedetach(struct knote *kn);
127 static int filt_piperead(struct knote *kn, long hint);
128 static int filt_pipewrite(struct knote *kn, long hint);
129
 /*
 * kqueue filter descriptors; NOTE(review): the leading 1 looks like the
 * f_isfd member (filter attaches to a descriptor) -- confirm against
 * this tree's <sys/event.h>.
 */
130 static struct filterops pipe_rfiltops =
131 { 1, NULL, filt_pipedetach, filt_piperead };
132 static struct filterops pipe_wfiltops =
133 { 1, NULL, filt_pipedetach, filt_pipewrite };
134 #endif /* FreeBSD */
135
136 #ifdef __NetBSD__
137 static int pipe_read __P((struct file *fp, off_t *offset, struct uio *uio,
138 struct ucred *cred, int flags));
139 static int pipe_write __P((struct file *fp, off_t *offset, struct uio *uio,
140 struct ucred *cred, int flags));
141 static int pipe_close __P((struct file *fp, struct proc *p));
142 static int pipe_poll __P((struct file *fp, int events, struct proc *p));
143 static int pipe_fcntl __P((struct file *fp, u_int com, caddr_t data,
144 struct proc *p));
145 static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
146 static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));
147
148 static struct fileops pipeops =
149 { pipe_read, pipe_write, pipe_ioctl, pipe_fcntl, pipe_poll,
150 pipe_stat, pipe_close };
151 #endif /* NetBSD */
152
153 /*
154 * Default pipe buffer size(s), this can be kind-of large now because pipe
155 * space is pageable. The pipe code will try to maintain locality of
156 * reference for performance reasons, so small amounts of outstanding I/O
157 * will not wipe the cache.
158 */
159 #define MINPIPESIZE (PIPE_SIZE/3)
160 #define MAXPIPESIZE (2*PIPE_SIZE/3)
161
162 /*
163 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
164 * is there so that on large systems, we don't exhaust it.
 * (Checked after the fact: allocations go through and cached kva is
 * released when amountpipekva exceeds this.)
165 */
166 #define MAXPIPEKVA (8*1024*1024)
167 static int maxpipekva = MAXPIPEKVA;
168
169 /*
170 * Limit for direct transfers, we cannot, of course limit
171 * the amount of kva for pipes in general though.
172 */
173 #define LIMITPIPEKVA (16*1024*1024)
174 static int limitpipekva = LIMITPIPEKVA;
175
176 /*
177 * Limit the number of "big" pipes
 * (pipes grown to BIG_PIPE_SIZE by the write path; nbigpipe counts them)
178 */
179 #define LIMITBIGPIPES 32
180 static int maxbigpipes = LIMITBIGPIPES;
181 static int nbigpipe = 0;
182
183 /*
184 * Amount of KVA consumed by pipe buffers.
 *
 * Credited in pipespace(), pipe_build_write_buffer() (FreeBSD) and
 * pipe_loan_alloc() (NetBSD); debited in pipe_destroy_write_buffer()
 * and pipe_loan_free().  Presumably also debited by pipe_free_kmem(),
 * whose body is not visible in this chunk -- verify there.
185 */
186 static int amountpipekva = 0;
187
188 static void pipeclose __P((struct pipe *));
189 static void pipe_free_kmem __P((struct pipe *));
190 static int pipe_create __P((struct pipe **, int));
191 static __inline int pipelock __P((struct pipe *, int));
192 static __inline void pipeunlock __P((struct pipe *));
193 static __inline void pipeselwakeup __P((struct pipe *, struct pipe *));
194 static int pipespace __P((struct pipe *, int));
195
196 #ifdef __FreeBSD__
197 #ifndef PIPE_NODIRECT
198 static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
199 static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
200 static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
201 static void pipe_clone_write_buffer __P((struct pipe *wpipe));
202 #endif
203
204 static vm_zone_t pipe_zone;
205 #endif /* FreeBSD */
206
207 #ifdef __NetBSD__
208 #ifndef PIPE_NODIRECT
209 static int pipe_direct_write __P((struct pipe *, struct uio *));
210 static int pipe_loan_alloc __P((struct pipe *, int));
211 static void pipe_loan_free __P((struct pipe *));
212 #endif /* PIPE_NODIRECT */
213
214 static struct pool pipe_pool;
215 #endif /* NetBSD */
216
217 /*
218 * The pipe system call for the DTYPE_PIPE type of pipes
 *
 * Creates the connected pipe pair: the read-side descriptor is
 * returned in retval[0] and the write-side in retval[1] (p_retval
 * on FreeBSD).  On any failure both half-created pipes are torn
 * down and no descriptors remain allocated.
219 */
220
221 /* ARGSUSED */
222 #ifdef __FreeBSD__
223 int
224 pipe(p, uap)
225 struct proc *p;
226 struct pipe_args /* {
227 int dummy;
228 } */ *uap;
229 #elif defined(__NetBSD__)
230 int
231 sys_pipe(p, v, retval)
232 struct proc *p;
233 void *v;
234 register_t *retval;
235 #endif
236 {
237 struct file *rf, *wf;
238 struct pipe *rpipe, *wpipe;
239 int fd, error;
240
241 #ifdef __FreeBSD__
 /* Lazily create the pipe zone on first use. */
242 if (pipe_zone == NULL)
243 pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);
244
 /* pipeclose() must tolerate NULL for this cleanup to be safe. */
245 rpipe = wpipe = NULL;
246 if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 1)) {
247 pipeclose(rpipe);
248 pipeclose(wpipe);
249 return (ENFILE);
250 }
251
252 error = falloc(p, &rf, &fd);
253 if (error) {
254 pipeclose(rpipe);
255 pipeclose(wpipe);
256 return (error);
257 }
258 fhold(rf);
259 p->p_retval[0] = fd;
260
261 /*
262 * Warning: once we've gotten past allocation of the fd for the
263 * read-side, we can only drop the read side via fdrop() in order
264 * to avoid races against processes which manage to dup() the read
265 * side while we are blocked trying to allocate the write side.
266 */
267 rf->f_flag = FREAD | FWRITE;
268 rf->f_type = DTYPE_PIPE;
269 rf->f_data = (caddr_t)rpipe;
270 rf->f_ops = &pipeops;
271 error = falloc(p, &wf, &fd);
272 if (error) {
273 struct filedesc *fdp = p->p_fd;
274
 /*
 * Up to two fdrop()s here: the first releases the
 * descriptor table's reference (if the slot still holds
 * rf), the second releases the fhold() taken above.
 */
275 if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
276 fdp->fd_ofiles[p->p_retval[0]] = NULL;
277 fdrop(rf, p);
278 }
279 fdrop(rf, p);
280 /* rpipe has been closed by fdrop(). */
281 pipeclose(wpipe);
282 return (error);
283 }
284 wf->f_flag = FREAD | FWRITE;
285 wf->f_type = DTYPE_PIPE;
286 wf->f_data = (caddr_t)wpipe;
287 wf->f_ops = &pipeops;
288 p->p_retval[1] = fd;
289
290 rpipe->pipe_peer = wpipe;
291 wpipe->pipe_peer = rpipe;
292 fdrop(rf, p);
293 #endif /* FreeBSD */
294
295 #ifdef __NetBSD__
 /*
 * Only the read side gets its kernel buffer allocated up front;
 * the write side is created with allockva == 0.
 */
296 rpipe = wpipe = NULL;
297 if (pipe_create(&rpipe, 1) || pipe_create(&wpipe, 0)) {
298 pipeclose(rpipe);
299 pipeclose(wpipe);
300 return (ENFILE);
301 }
302
303 /*
304 * Note: the file structure returned from falloc() is marked
305 * as 'larval' initially. Unless we mark it as 'mature' by
306 * FILE_SET_MATURE(), any attempt to do anything with it would
307 * return EBADF, including e.g. dup(2) or close(2). This avoids
308 * file descriptor races if we block in the second falloc().
309 */
310
311 error = falloc(p, &rf, &fd);
312 if (error)
313 goto free2;
314 retval[0] = fd;
315 rf->f_flag = FREAD;
316 rf->f_type = DTYPE_PIPE;
317 rf->f_data = (caddr_t)rpipe;
318 rf->f_ops = &pipeops;
319
320 error = falloc(p, &wf, &fd);
321 if (error)
322 goto free3;
323 retval[1] = fd;
324 wf->f_flag = FWRITE;
325 wf->f_type = DTYPE_PIPE;
326 wf->f_data = (caddr_t)wpipe;
327 wf->f_ops = &pipeops;
328
329 rpipe->pipe_peer = wpipe;
330 wpipe->pipe_peer = rpipe;
331
332 FILE_SET_MATURE(rf);
333 FILE_SET_MATURE(wf);
334 FILE_UNUSE(rf, p);
335 FILE_UNUSE(wf, p);
336 return (0);
 /* Unwind: second falloc() failed -- undo the read-side fd too. */
337 free3:
338 FILE_UNUSE(rf, p);
339 ffree(rf);
340 fdremove(p->p_fd, retval[0]);
341 free2:
342 pipeclose(wpipe);
343 pipeclose(rpipe);
344 #endif /* NetBSD */
345
346 return (error);
347 }
348
349 /*
350 * Allocate kva for pipe circular buffer, the space is pageable
351 * This routine will 'realloc' the size of a pipe safely, if it fails
352 * it will retain the old buffer.
353 * If it fails it will return ENOMEM.
 *
 * On success the buffer indices (in/out/cnt) are reset, so any old
 * contents are discarded, and amountpipekva is credited with the
 * new size.
354 */
355 static int
356 pipespace(cpipe, size)
357 struct pipe *cpipe;
358 int size;
359 {
360 caddr_t buffer;
361 #ifdef __FreeBSD__
362 struct vm_object *object;
363 int npages, error;
364
365 npages = round_page(size)/PAGE_SIZE;
366 /*
367 * Create an object, I don't like the idea of paging to/from
368 * kernel_object.
369 */
370 mtx_lock(&vm_mtx);
371 object = vm_object_allocate(OBJT_DEFAULT, npages);
372 buffer = (caddr_t) vm_map_min(kernel_map);
373
374 /*
375 * Insert the object into the kernel map, and allocate kva for it.
376 * The map entry is, by default, pageable.
377 */
378 error = vm_map_find(kernel_map, object, 0,
379 (vm_offset_t *) &buffer, size, 1,
380 VM_PROT_ALL, VM_PROT_ALL, 0);
381
382 if (error != KERN_SUCCESS) {
383 vm_object_deallocate(object);
384 mtx_unlock(&vm_mtx);
385 return (ENOMEM);
386 }
387 #endif /* FreeBSD */
388
389 #ifdef __NetBSD__
390 /*
391 * Allocate pageable virtual address space. Physical memory is allocated
392 * on demand.
393 */
394 buffer = (caddr_t) uvm_km_valloc(kernel_map, round_page(size));
395 if (buffer == NULL)
396 return (ENOMEM);
397 #endif /* NetBSD */
398
399 /* free old resources if we're resizing */
400 pipe_free_kmem(cpipe);
401 #ifdef __FreeBSD__
402 mtx_unlock(&vm_mtx);
403 cpipe->pipe_buffer.object = object;
404 #endif
405 cpipe->pipe_buffer.buffer = buffer;
406 cpipe->pipe_buffer.size = size;
407 cpipe->pipe_buffer.in = 0;
408 cpipe->pipe_buffer.out = 0;
409 cpipe->pipe_buffer.cnt = 0;
 /*
 * NOTE(review): presumably pipe_free_kmem() above debited
 * amountpipekva for the old buffer -- its body is not visible in
 * this chunk; verify the accounting balances.
 */
410 amountpipekva += cpipe->pipe_buffer.size;
411 return (0);
412 }
413
414 /*
415 * initialize and allocate VM and memory for pipe
 *
 * When allockva is zero the kernel buffer is not allocated here.
 * If pipespace() fails, the struct pipe remains allocated and is
 * still reachable through *cpipep; the caller disposes of it
 * (sys_pipe() does so via pipeclose()).
416 */
417 static int
418 pipe_create(cpipep, allockva)
419 struct pipe **cpipep;
420 int allockva;
421 {
422 struct pipe *cpipe;
423 int error;
424
425 #ifdef __FreeBSD__
426 *cpipep = zalloc(pipe_zone);
427 #endif
428 #ifdef __NetBSD__
429 *cpipep = pool_get(&pipe_pool, M_WAITOK);
430 #endif
431 if (*cpipep == NULL)
432 return (ENOMEM);
433
434 cpipe = *cpipep;
435
436 /* Initialize */
437 memset(cpipe, 0, sizeof(*cpipe));
 /*
 * Start with PIPE_SIGNALR set so the very first read triggers a
 * select/SIGIO wakeup of the writer (see end of pipe_read()).
 */
438 cpipe->pipe_state = PIPE_SIGNALR;
439
440 if (allockva && (error = pipespace(cpipe, PIPE_SIZE)))
441 return (error);
442
443 vfs_timestamp(&cpipe->pipe_ctime);
444 cpipe->pipe_atime = cpipe->pipe_ctime;
445 cpipe->pipe_mtime = cpipe->pipe_ctime;
446 #ifdef __NetBSD__
 /* NO_PID: no process/group registered yet for async SIGIO. */
447 cpipe->pipe_pgid = NO_PID;
448 lockinit(&cpipe->pipe_lock, PRIBIO | PCATCH, "pipelk", 0, 0);
449 #endif
450
451 return (0);
452 }
453
454
455 /*
456 * lock a pipe for I/O, blocking other access
 *
 * 'catch' selects whether a pending signal may interrupt the wait;
 * returns 0 on success or the tsleep()/lockmgr() error.
457 */
458 static __inline int
459 pipelock(cpipe, catch)
460 struct pipe *cpipe;
461 int catch;
462 {
463 int error;
464
465 #ifdef __FreeBSD__
466 while (cpipe->pipe_state & PIPE_LOCK) {
467 cpipe->pipe_state |= PIPE_LWANT;
468 error = tsleep(cpipe, catch ? (PRIBIO | PCATCH) : PRIBIO,
469 "pipelk", 0);
470 if (error != 0)
471 return (error);
472 }
473 cpipe->pipe_state |= PIPE_LOCK;
474 return (0);
475 #endif
476
477 #ifdef __NetBSD__
 /*
 * The lock was lockinit()ed with PCATCH, so lockmgr() may fail
 * with EINTR/ERESTART; when the caller did not ask for signal
 * catching, just retry until the lock is acquired.
 */
478 do {
479 error = lockmgr(&cpipe->pipe_lock, LK_EXCLUSIVE, NULL);
480 } while (!catch && (error == EINTR || error == ERESTART));
481 return (error);
482 #endif
483 }
484
485 /*
486 * unlock a pipe I/O lock
 *
 * FreeBSD: clear PIPE_LOCK and wake anyone who set PIPE_LWANT while
 * waiting in pipelock().  NetBSD: release the lockmgr lock.
487 */
488 static __inline void
489 pipeunlock(cpipe)
490 struct pipe *cpipe;
491 {
492 #ifdef __FreeBSD__
493 cpipe->pipe_state &= ~PIPE_LOCK;
494 if (cpipe->pipe_state & PIPE_LWANT) {
495 cpipe->pipe_state &= ~PIPE_LWANT;
496 wakeup(cpipe);
497 }
498 #endif
499
500 #ifdef __NetBSD__
501 lockmgr(&cpipe->pipe_lock, LK_RELEASE, NULL);
502 #endif
503 }
504
505 /*
506 * Select/poll wakeup. This also sends SIGIO to peer connected to
507 * 'sigpipe' side of pipe.
 *
 * 'selp' is the pipe whose selectors are woken; 'sigp' (may be NULL)
 * is the side whose async-I/O owner receives SIGIO when PIPE_ASYNC
 * is set.
508 */
509 static __inline void
510 pipeselwakeup(selp, sigp)
511 struct pipe *selp, *sigp;
512 {
513 if (selp->pipe_state & PIPE_SEL) {
514 selp->pipe_state &= ~PIPE_SEL;
515 selwakeup(&selp->pipe_sel);
516 }
517 #ifdef __FreeBSD__
518 if (sigp && (sigp->pipe_state & PIPE_ASYNC) && sigp->pipe_sigio)
519 pgsigio(sigp->pipe_sigio, SIGIO, 0);
520 KNOTE(&selp->pipe_sel.si_note, 0);
521 #endif
522
523 #ifdef __NetBSD__
 /* Negative pipe_pgid denotes a process group, positive a single pid. */
524 if (sigp && (sigp->pipe_state & PIPE_ASYNC)
525 && sigp->pipe_pgid != NO_PID){
526 struct proc *p;
527
528 if (sigp->pipe_pgid < 0)
529 gsignal(-sigp->pipe_pgid, SIGIO);
530 else if (sigp->pipe_pgid > 0 && (p = pfind(sigp->pipe_pgid)) != 0)
531 psignal(p, SIGIO);
532 }
533 #endif /* NetBSD */
534 }
535
536 /* ARGSUSED */
 /*
 * Pipe read(2) backend: drain the kernel circular buffer, or, when a
 * direct (page-loan/wired) write is pending, copy straight from the
 * mapped writer pages.  Returns 0 at EOF (read(2) then returns 0),
 * EAGAIN for a non-blocking read on an empty pipe, or a tsleep()/
 * uiomove() error.
 */
537 #ifdef __FreeBSD__
538 static int
539 pipe_read(fp, uio, cred, flags, p)
540 struct file *fp;
541 struct uio *uio;
542 struct ucred *cred;
543 int flags;
544 struct proc *p;
545 #elif defined(__NetBSD__)
546 static int
547 pipe_read(fp, offset, uio, cred, flags)
548 struct file *fp;
549 off_t *offset;
550 struct uio *uio;
551 struct ucred *cred;
552 int flags;
553 #endif
554 {
555 struct pipe *rpipe = (struct pipe *) fp->f_data;
556 int error;
557 size_t nread = 0;
558 size_t size;
559 size_t ocnt;
560
 /*
 * pipe_busy defers PIPE_WANTCLOSE processing (see below) while we
 * are inside this routine, possibly sleeping.
 */
561 ++rpipe->pipe_busy;
562 error = pipelock(rpipe, 1);
563 if (error)
564 goto unlocked_error;
565
 /* Remember the fill level to detect consumption for the final wakeup. */
566 ocnt = rpipe->pipe_buffer.cnt;
567
568 while (uio->uio_resid) {
569 /*
570 * normal pipe buffer receive
571 */
572 if (rpipe->pipe_buffer.cnt > 0) {
 /* Copy at most up to the wrap point of the circular buffer. */
573 size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
574 if (size > rpipe->pipe_buffer.cnt)
575 size = rpipe->pipe_buffer.cnt;
576 if (size > uio->uio_resid)
577 size = uio->uio_resid;
578
579 error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
580 size, uio);
581 if (error)
582 break;
583
584 rpipe->pipe_buffer.out += size;
585 if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
586 rpipe->pipe_buffer.out = 0;
587
588 rpipe->pipe_buffer.cnt -= size;
589
590 /*
591 * If there is no more to read in the pipe, reset
592 * its pointers to the beginning. This improves
593 * cache hit stats.
594 */
595 if (rpipe->pipe_buffer.cnt == 0) {
596 rpipe->pipe_buffer.in = 0;
597 rpipe->pipe_buffer.out = 0;
598 }
599 nread += size;
600 #ifndef PIPE_NODIRECT
601 /*
602 * Direct copy, bypassing a kernel buffer.
603 */
604 } else if ((size = rpipe->pipe_map.cnt) &&
605 (rpipe->pipe_state & PIPE_DIRECTW)) {
606 caddr_t va;
607 if (size > uio->uio_resid)
608 size = uio->uio_resid;
609
610 va = (caddr_t) rpipe->pipe_map.kva +
611 rpipe->pipe_map.pos;
612 error = uiomove(va, size, uio);
613 if (error)
614 break;
615 nread += size;
616 rpipe->pipe_map.pos += size;
617 rpipe->pipe_map.cnt -= size;
 /* Window fully consumed: release the writer blocked in
 * pipe_direct_write(). */
618 if (rpipe->pipe_map.cnt == 0) {
619 rpipe->pipe_state &= ~PIPE_DIRECTW;
620 wakeup(rpipe);
621 }
622 #endif
623 } else {
624 /*
625 * detect EOF condition
626 * read returns 0 on EOF, no need to set error
627 */
628 if (rpipe->pipe_state & PIPE_EOF)
629 break;
630
631 /*
632 * If the "write-side" has been blocked, wake it up now.
633 */
634 if (rpipe->pipe_state & PIPE_WANTW) {
635 rpipe->pipe_state &= ~PIPE_WANTW;
636 wakeup(rpipe);
637 }
638
639 /*
640 * Break if some data was read.
641 */
642 if (nread > 0)
643 break;
644
645 /*
646 * don't block on non-blocking I/O
647 */
648 if (fp->f_flag & FNONBLOCK) {
649 error = EAGAIN;
650 break;
651 }
652
653 /*
654 * Unlock the pipe buffer for our remaining processing.
655 * We will either break out with an error or we will
656 * sleep and relock to loop.
657 */
658 pipeunlock(rpipe);
659
660 /*
661 * We want to read more, wake up select/poll.
662 */
663 pipeselwakeup(rpipe, rpipe->pipe_peer);
664
665 rpipe->pipe_state |= PIPE_WANTR;
666 error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0);
667 if (error != 0 || (error = pipelock(rpipe, 1)))
668 goto unlocked_error;
669 }
670 }
671 pipeunlock(rpipe);
672
673 if (error == 0)
674 vfs_timestamp(&rpipe->pipe_atime);
675 unlocked_error:
676 --rpipe->pipe_busy;
677
678 /*
679 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0.
680 */
681 if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) {
682 rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW);
683 wakeup(rpipe);
684 } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
685 /*
686 * Handle write blocking hysteresis.
687 */
688 if (rpipe->pipe_state & PIPE_WANTW) {
689 rpipe->pipe_state &= ~PIPE_WANTW;
690 wakeup(rpipe);
691 }
692 }
693
694 /*
695 * If anything was read off the buffer, signal to the writer it's
696 * possible to write more data. Also send signal if we are here for the
697 * first time after last write.
698 */
699 if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF
700 && (ocnt != rpipe->pipe_buffer.cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
701 pipeselwakeup(rpipe, rpipe->pipe_peer);
702 rpipe->pipe_state &= ~PIPE_SIGNALR;
703 }
704
705 return (error);
706 }
707
708 #ifdef __FreeBSD__
709 #ifndef PIPE_NODIRECT
710 /*
711 * Map the sending processes' buffer into kernel space and wire it.
712 * This is similar to a physical write operation.
 *
 * On success the first iovec (and uio_resid/uio_offset) is advanced
 * past the mapped bytes.  Returns EFAULT when a source page cannot be
 * faulted in and wired, with all previously wired pages released.
713 */
714 static int
715 pipe_build_write_buffer(wpipe, uio)
716 struct pipe *wpipe;
717 struct uio *uio;
718 {
719 size_t size;
720 int i;
721 vm_offset_t addr, endaddr, paddr;
722
 /* Cap the direct window at the pipe buffer size. */
723 size = uio->uio_iov->iov_len;
724 if (size > wpipe->pipe_buffer.size)
725 size = wpipe->pipe_buffer.size;
726
727 endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
728 mtx_lock(&vm_mtx);
729 addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
730 for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
731 vm_page_t m;
732
733 if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
734 (paddr = pmap_kextract(addr)) == 0) {
735 int j;
736
 /* Back out: unwire the pages wired so far. */
737 for (j = 0; j < i; j++)
738 vm_page_unwire(wpipe->pipe_map.ms[j], 1);
739 mtx_unlock(&vm_mtx);
740 return (EFAULT);
741 }
742
743 m = PHYS_TO_VM_PAGE(paddr);
744 vm_page_wire(m);
745 wpipe->pipe_map.ms[i] = m;
746 }
747
748 /*
749 * set up the control block
750 */
751 wpipe->pipe_map.npages = i;
752 wpipe->pipe_map.pos =
753 ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
754 wpipe->pipe_map.cnt = size;
755
756 /*
757 * and map the buffer
758 */
759 if (wpipe->pipe_map.kva == 0) {
760 /*
761 * We need to allocate space for an extra page because the
762 * address range might (will) span pages at times.
763 */
764 wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
765 wpipe->pipe_buffer.size + PAGE_SIZE);
766 amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
767 }
768 pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
769 wpipe->pipe_map.npages);
770
771 mtx_unlock(&vm_mtx);
772 /*
773 * and update the uio data
774 */
775
776 uio->uio_iov->iov_len -= size;
777 uio->uio_iov->iov_base += size;
778 if (uio->uio_iov->iov_len == 0)
779 uio->uio_iov++;
780 uio->uio_resid -= size;
781 uio->uio_offset += size;
782 return (0);
783 }
784
785 /*
786 * unmap and unwire the process buffer
 *
 * The kva mapping is kept cached for reuse by later direct writes
 * unless total pipe kva usage is over the maxpipekva soft limit, in
 * which case it is freed here.
787 */
788 static void
789 pipe_destroy_write_buffer(wpipe)
790 struct pipe *wpipe;
791 {
792 int i;
793
794 mtx_lock(&vm_mtx);
795 if (wpipe->pipe_map.kva) {
796 pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
797
798 if (amountpipekva > maxpipekva) {
799 vm_offset_t kva = wpipe->pipe_map.kva;
800 wpipe->pipe_map.kva = 0;
801 kmem_free(kernel_map, kva,
802 wpipe->pipe_buffer.size + PAGE_SIZE);
803 amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
804 }
805 }
 /* Release the wiring taken in pipe_build_write_buffer(). */
806 for (i = 0; i < wpipe->pipe_map.npages; i++)
807 vm_page_unwire(wpipe->pipe_map.ms[i], 1);
808 mtx_unlock(&vm_mtx);
809 }
810
811 /*
812 * In the case of a signal, the writing process might go away. This
813 * code copies the data into the circular buffer so that the source
814 * pages can be freed without loss of data.
 *
 * Called with the pipe locked (see pipe_direct_write()); leaves the
 * remaining data in the kernel buffer and clears PIPE_DIRECTW so the
 * reader switches to the normal buffered path.
815 */
816 static void
817 pipe_clone_write_buffer(wpipe)
818 struct pipe *wpipe;
819 {
820 int size;
821 int pos;
822
823 size = wpipe->pipe_map.cnt;
824 pos = wpipe->pipe_map.pos;
825 memcpy((caddr_t) wpipe->pipe_buffer.buffer,
826 (caddr_t) wpipe->pipe_map.kva + pos, size);
827
828 wpipe->pipe_buffer.in = size;
829 wpipe->pipe_buffer.out = 0;
830 wpipe->pipe_buffer.cnt = size;
831 wpipe->pipe_state &= ~PIPE_DIRECTW;
832
833 pipe_destroy_write_buffer(wpipe);
834 }
835
836 /*
837 * This implements the pipe buffer write mechanism. Note that only
838 * a direct write OR a normal pipe write can be pending at any given time.
839 * If there are any characters in the pipe buffer, the direct write will
840 * be deferred until the receiving process grabs all of the bytes from
841 * the pipe buffer. Then the direct mapping write is set-up.
842 */
843 static int
844 pipe_direct_write(wpipe, uio)
845 struct pipe *wpipe;
846 struct uio *uio;
847 {
848 int error;
849
850 retry:
 /* Wait for any previous direct write to be fully consumed. */
851 while (wpipe->pipe_state & PIPE_DIRECTW) {
852 if (wpipe->pipe_state & PIPE_WANTR) {
853 wpipe->pipe_state &= ~PIPE_WANTR;
854 wakeup(wpipe);
855 }
856 wpipe->pipe_state |= PIPE_WANTW;
857 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
858 if (error)
859 goto error1;
860 if (wpipe->pipe_state & PIPE_EOF) {
861 error = EPIPE;
862 goto error1;
863 }
864 }
865 wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
 /* Let the reader drain the ordinary buffer before going direct. */
866 if (wpipe->pipe_buffer.cnt > 0) {
867 if (wpipe->pipe_state & PIPE_WANTR) {
868 wpipe->pipe_state &= ~PIPE_WANTR;
869 wakeup(wpipe);
870 }
871
872 wpipe->pipe_state |= PIPE_WANTW;
873 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
874 if (error)
875 goto error1;
876 if (wpipe->pipe_state & PIPE_EOF) {
877 error = EPIPE;
878 goto error1;
879 }
880 goto retry;
881 }
882
883 wpipe->pipe_state |= PIPE_DIRECTW;
884
885 error = pipe_build_write_buffer(wpipe, uio);
886 if (error) {
887 wpipe->pipe_state &= ~PIPE_DIRECTW;
888 goto error1;
889 }
890
891 error = 0;
 /* Sleep until the reader consumes the mapped window, EOF, or signal. */
892 while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
893 if (wpipe->pipe_state & PIPE_EOF) {
894 pipelock(wpipe, 0);
895 pipe_destroy_write_buffer(wpipe);
896 pipeunlock(wpipe);
897 pipeselwakeup(wpipe, wpipe);
898 error = EPIPE;
899 goto error1;
900 }
901 if (wpipe->pipe_state & PIPE_WANTR) {
902 wpipe->pipe_state &= ~PIPE_WANTR;
903 wakeup(wpipe);
904 }
905 pipeselwakeup(wpipe, wpipe);
906 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
907 }
908
909 pipelock(wpipe,0);
910 if (wpipe->pipe_state & PIPE_DIRECTW) {
911 /*
912 * this bit of trickery substitutes a kernel buffer for
913 * the process that might be going away.
914 */
915 pipe_clone_write_buffer(wpipe);
916 } else {
917 pipe_destroy_write_buffer(wpipe);
918 }
919 pipeunlock(wpipe);
920 return (error);
921
922 error1:
923 wakeup(wpipe);
924 return (error);
925 }
926 #endif /* !PIPE_NODIRECT */
927 #endif /* FreeBSD */
928
929 #ifdef __NetBSD__
930 #ifndef PIPE_NODIRECT
931 /*
932 * Allocate structure for loan transfer.
 *
 * Reserves npages worth of pageable kva for the loan window plus the
 * page-pointer array handed to uvm_loan().  Returns ENOMEM if the
 * kva reservation fails; the malloc() is M_WAITOK.
933 */
934 static int
935 pipe_loan_alloc(wpipe, npages)
936 struct pipe *wpipe;
937 int npages;
938 {
939 vsize_t len;
940
941 len = (vsize_t)npages << PAGE_SHIFT;
942 wpipe->pipe_map.kva = uvm_km_valloc_wait(kernel_map, len);
943 if (wpipe->pipe_map.kva == NULL)
944 return (ENOMEM);
945
946 amountpipekva += len;
947 wpipe->pipe_map.npages = npages;
948 wpipe->pipe_map.pgs = malloc(npages * sizeof(struct vm_page *), M_PIPE,
949 M_WAITOK);
950 return (0);
951 }
952
953 /*
954 * Free resources allocated for loan transfer.
 *
 * Undoes pipe_loan_alloc(): removes any kernel mappings over the
 * window, releases the kva and the page-pointer array, and debits
 * amountpipekva.
955 */
956 static void
957 pipe_loan_free(wpipe)
958 struct pipe *wpipe;
959 {
960 vsize_t len;
961
962 len = (vsize_t)wpipe->pipe_map.npages << PAGE_SHIFT;
963 pmap_kremove(wpipe->pipe_map.kva, len);
964 uvm_km_free(kernel_map, wpipe->pipe_map.kva, len);
965 wpipe->pipe_map.kva = NULL;
966 amountpipekva -= len;
967 free(wpipe->pipe_map.pgs, M_PIPE);
968 wpipe->pipe_map.pgs = NULL;
969 }
970
971 /*
972 * NetBSD direct write, using uvm_loan() mechanism.
973 * This implements the pipe buffer write mechanism. Note that only
974 * a direct write OR a normal pipe write can be pending at any given time.
975 * If there are any characters in the pipe buffer, the direct write will
976 * be deferred until the receiving process grabs all of the bytes from
977 * the pipe buffer. Then the direct mapping write is set-up.
978 */
979 static int
980 pipe_direct_write(wpipe, uio)
981 struct pipe *wpipe;
982 struct uio *uio;
983 {
984 int error, npages, j;
985 struct vm_page **pgs;
986 vaddr_t bbase, kva, base, bend;
987 vsize_t blen, bcnt;
988 voff_t bpos;
989
990 retry:
 /* Wait for any previous direct write to drain. */
991 while (wpipe->pipe_state & PIPE_DIRECTW) {
992 if (wpipe->pipe_state & PIPE_WANTR) {
993 wpipe->pipe_state &= ~PIPE_WANTR;
994 wakeup(wpipe);
995 }
996 wpipe->pipe_state |= PIPE_WANTW;
997 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
998 if (error)
999 goto error;
1000 if (wpipe->pipe_state & PIPE_EOF) {
1001 error = EPIPE;
1002 goto error;
1003 }
1004 }
1005 wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
 /* Let the reader empty the ordinary buffer first. */
1006 if (wpipe->pipe_buffer.cnt > 0) {
1007 if (wpipe->pipe_state & PIPE_WANTR) {
1008 wpipe->pipe_state &= ~PIPE_WANTR;
1009 wakeup(wpipe);
1010 }
1011
1012 wpipe->pipe_state |= PIPE_WANTW;
1013 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
1014 if (error)
1015 goto error;
1016 if (wpipe->pipe_state & PIPE_EOF) {
1017 error = EPIPE;
1018 goto error;
1019 }
1020 goto retry;
1021 }
1022
1023 /*
1024 * Handle first PIPE_DIRECT_CHUNK bytes of buffer. Deal with buffers
1025 * not aligned to PAGE_SIZE.
1026 */
1027 bbase = (vaddr_t)uio->uio_iov->iov_base;
1028 base = trunc_page(bbase);
1029 bend = round_page(bbase + uio->uio_iov->iov_len);
1030 blen = bend - base;
1031 bpos = bbase - base;
1032
1033 if (blen > PIPE_DIRECT_CHUNK) {
1034 blen = PIPE_DIRECT_CHUNK;
1035 bend = base + blen;
1036 bcnt = PIPE_DIRECT_CHUNK - bpos;
1037 } else {
1038 bcnt = uio->uio_iov->iov_len;
1039 }
1040 npages = blen >> PAGE_SHIFT;
1041
1042 wpipe->pipe_map.pos = bpos;
1043 wpipe->pipe_map.cnt = bcnt;
1044
1045 /*
1046 * Free the old kva if we need more pages than we have
1047 * allocated.
1048 */
1049 if (wpipe->pipe_map.kva && npages > wpipe->pipe_map.npages)
1050 pipe_loan_free(wpipe);
1051
1052 /* Allocate new kva. */
1053 if (wpipe->pipe_map.kva == NULL) {
1054 error = pipe_loan_alloc(wpipe, npages);
1055 if (error) {
1056 goto error;
1057 }
1058 }
1059
1060 /* Loan the write buffer memory from writer process */
1061 pgs = wpipe->pipe_map.pgs;
1062 error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, base, blen,
1063 pgs, UVM_LOAN_TOPAGE);
1064 if (error) {
 /* pgs == NULL tells the cleanup path there is nothing to unloan. */
1065 pgs = NULL;
1066 goto cleanup;
1067 }
1068
 /* Enter the loaned pages to kva, read-only for the kernel. */
1069 /* Enter the loaned pages to kva */
1070 kva = wpipe->pipe_map.kva;
1071 for (j = 0; j < npages; j++, kva += PAGE_SIZE) {
1072 pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ);
1073 }
1074 pmap_update(pmap_kernel());
1075
 /* Publish the window and wait for the reader to consume it. */
1076 wpipe->pipe_state |= PIPE_DIRECTW;
1077 while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
1078 if (wpipe->pipe_state & PIPE_EOF) {
1079 error = EPIPE;
1080 break;
1081 }
1082 if (wpipe->pipe_state & PIPE_WANTR) {
1083 wpipe->pipe_state &= ~PIPE_WANTR;
1084 wakeup(wpipe);
1085 }
1086 pipeselwakeup(wpipe, wpipe);
1087 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
1088 }
1089
1090 if (error)
1091 wpipe->pipe_state &= ~PIPE_DIRECTW;
1092
1093 cleanup:
1094 pipelock(wpipe, 0);
1095 if (pgs != NULL)
1096 uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE);
 /* Keep the kva cached for reuse unless over the soft limit. */
1097 if (error || amountpipekva > maxpipekva)
1098 pipe_loan_free(wpipe);
1099 pipeunlock(wpipe);
1100
1101 if (error) {
1102 pipeselwakeup(wpipe, wpipe);
1103
1104 /*
1105 * If nothing was read from what we offered, return error
1106 * straight on. Otherwise update uio resid first. Caller
1107 * will deal with the error condition, returning short
1108 * write, error, or restarting the write(2) as appropriate.
 *
 * Note the 'error' label deliberately sits inside this
 * if-block: the early gotos above (taken before any data
 * could be consumed) jump here and return without touching
 * the uio.
1109 */
1110 if (wpipe->pipe_map.cnt == bcnt) {
1111 error:
1112 wakeup(wpipe);
1113 return (error);
1114 }
1115
 /* Partial consumption: report only the bytes actually read. */
1116 bcnt -= wpipe->pipe_map.cnt;
1117 }
1118
1119 uio->uio_resid -= bcnt;
1120 /* uio_offset not updated, not set/used for write(2) */
1121 uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + bcnt;
1122 uio->uio_iov->iov_len -= bcnt;
1123 if (uio->uio_iov->iov_len == 0) {
1124 uio->uio_iov++;
1125 uio->uio_iovcnt--;
1126 }
1127
1128 return (error);
1129 }
1130 #endif /* !PIPE_NODIRECT */
1131 #endif /* NetBSD */
1132
1133 #ifdef __FreeBSD__
1134 static int
1135 pipe_write(fp, uio, cred, flags, p)
1136 struct file *fp;
1137 off_t *offset;
1138 struct uio *uio;
1139 struct ucred *cred;
1140 int flags;
1141 struct proc *p;
1142 #elif defined(__NetBSD__)
1143 static int
1144 pipe_write(fp, offset, uio, cred, flags)
1145 struct file *fp;
1146 off_t *offset;
1147 struct uio *uio;
1148 struct ucred *cred;
1149 int flags;
1150 #endif
1151 {
1152 int error = 0;
1153 struct pipe *wpipe, *rpipe;
1154
1155 rpipe = (struct pipe *) fp->f_data;
1156 wpipe = rpipe->pipe_peer;
1157
1158 /*
1159 * detect loss of pipe read side, issue SIGPIPE if lost.
1160 */
1161 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF))
1162 return (EPIPE);
1163
1164 ++wpipe->pipe_busy;
1165
1166 /*
1167 * If it is advantageous to resize the pipe buffer, do
1168 * so.
1169 */
1170 if ((uio->uio_resid > PIPE_SIZE) &&
1171 (nbigpipe < maxbigpipes) &&
1172 #ifndef PIPE_NODIRECT
1173 (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
1174 #endif
1175 (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
1176 (wpipe->pipe_buffer.cnt == 0)) {
1177
1178 if ((error = pipelock(wpipe,1)) == 0) {
1179 if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
1180 nbigpipe++;
1181 pipeunlock(wpipe);
1182 } else {
1183 /*
1184 * If an error occurred, unbusy and return, waking up
1185 * any waiting readers.
1186 */
1187 --wpipe->pipe_busy;
1188 if (wpipe->pipe_busy == 0
1189 && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1190 wpipe->pipe_state &=
1191 ~(PIPE_WANTCLOSE | PIPE_WANTR);
1192 wakeup(wpipe);
1193 }
1194
1195 return (error);
1196 }
1197 }
1198
1199 #ifdef __FreeBSD__
1200 KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
1201 #endif
1202
1203 while (uio->uio_resid) {
1204 int space;
1205
1206 #ifndef PIPE_NODIRECT
1207 /*
1208 * If the transfer is large, we can gain performance if
1209 * we do process-to-process copies directly.
1210 * If the write is non-blocking, we don't use the
1211 * direct write mechanism.
1212 *
1213 * The direct write mechanism will detect the reader going
1214 * away on us.
1215 */
1216 if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
1217 (fp->f_flag & FNONBLOCK) == 0 &&
1218 (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
1219 error = pipe_direct_write(wpipe, uio);
1220
1221 /*
1222 * Break out if error occured, unless it's ENOMEM.
1223 * ENOMEM means we failed to allocate some resources
1224 * for direct write, so we just fallback to ordinary
1225 * write. If the direct write was successful,
1226 * process rest of data via ordinary write.
1227 */
1228 if (!error)
1229 continue;
1230
1231 if (error != ENOMEM)
1232 break;
1233 }
1234 #endif /* PIPE_NODIRECT */
1235
1236 /*
1237 * Pipe buffered writes cannot be coincidental with
1238 * direct writes. We wait until the currently executing
1239 * direct write is completed before we start filling the
1240 * pipe buffer. We break out if a signal occurs or the
1241 * reader goes away.
1242 */
1243 retrywrite:
1244 while (wpipe->pipe_state & PIPE_DIRECTW) {
1245 if (wpipe->pipe_state & PIPE_WANTR) {
1246 wpipe->pipe_state &= ~PIPE_WANTR;
1247 wakeup(wpipe);
1248 }
1249 error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
1250 if (wpipe->pipe_state & PIPE_EOF)
1251 break;
1252 if (error)
1253 break;
1254 }
1255 if (wpipe->pipe_state & PIPE_EOF) {
1256 error = EPIPE;
1257 break;
1258 }
1259
1260 space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1261
1262 /* Writes of size <= PIPE_BUF must be atomic. */
1263 if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF))
1264 space = 0;
1265
1266 if (space > 0) {
1267 int size; /* Transfer size */
1268 int segsize; /* first segment to transfer */
1269
1270 if ((error = pipelock(wpipe,1)) != 0)
1271 break;
1272
1273 /*
1274 * It is possible for a direct write to
1275 * slip in on us... handle it here...
1276 */
1277 if (wpipe->pipe_state & PIPE_DIRECTW) {
1278 pipeunlock(wpipe);
1279 goto retrywrite;
1280 }
1281 /*
1282 * If a process blocked in uiomove, our
1283 * value for space might be bad.
1284 *
1285 * XXX will we be ok if the reader has gone
1286 * away here?
1287 */
1288 if (space > wpipe->pipe_buffer.size -
1289 wpipe->pipe_buffer.cnt) {
1290 pipeunlock(wpipe);
1291 goto retrywrite;
1292 }
1293
1294 /*
1295 * Transfer size is minimum of uio transfer
1296 * and free space in pipe buffer.
1297 */
1298 if (space > uio->uio_resid)
1299 size = uio->uio_resid;
1300 else
1301 size = space;
1302 /*
1303 * First segment to transfer is minimum of
1304 * transfer size and contiguous space in
1305 * pipe buffer. If first segment to transfer
1306 * is less than the transfer size, we've got
1307 * a wraparound in the buffer.
1308 */
1309 segsize = wpipe->pipe_buffer.size -
1310 wpipe->pipe_buffer.in;
1311 if (segsize > size)
1312 segsize = size;
1313
1314 /* Transfer first segment */
1315
1316 error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1317 segsize, uio);
1318
1319 if (error == 0 && segsize < size) {
1320 /*
1321 * Transfer remaining part now, to
1322 * support atomic writes. Wraparound
1323 * happened.
1324 */
1325 #ifdef DEBUG
1326 if (wpipe->pipe_buffer.in + segsize !=
1327 wpipe->pipe_buffer.size)
1328 panic("Expected pipe buffer wraparound disappeared");
1329 #endif
1330
1331 error = uiomove(&wpipe->pipe_buffer.buffer[0],
1332 size - segsize, uio);
1333 }
1334 if (error == 0) {
1335 wpipe->pipe_buffer.in += size;
1336 if (wpipe->pipe_buffer.in >=
1337 wpipe->pipe_buffer.size) {
1338 #ifdef DEBUG
1339 if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
1340 panic("Expected wraparound bad");
1341 #endif
1342 wpipe->pipe_buffer.in = size - segsize;
1343 }
1344
1345 wpipe->pipe_buffer.cnt += size;
1346 #ifdef DEBUG
1347 if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
1348 panic("Pipe buffer overflow");
1349 #endif
1350 }
1351 pipeunlock(wpipe);
1352 if (error)
1353 break;
1354 } else {
1355 /*
1356 * If the "read-side" has been blocked, wake it up now.
1357 */
1358 if (wpipe->pipe_state & PIPE_WANTR) {
1359 wpipe->pipe_state &= ~PIPE_WANTR;
1360 wakeup(wpipe);
1361 }
1362
1363 /*
1364 * don't block on non-blocking I/O
1365 */
1366 if (fp->f_flag & FNONBLOCK) {
1367 error = EAGAIN;
1368 break;
1369 }
1370
1371 /*
1372 * We have no more space and have something to offer,
1373 * wake up select/poll.
1374 */
1375 pipeselwakeup(wpipe, wpipe);
1376
1377 wpipe->pipe_state |= PIPE_WANTW;
1378 error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0);
1379 if (error != 0)
1380 break;
1381 /*
1382 * If read side wants to go away, we just issue a signal
1383 * to ourselves.
1384 */
1385 if (wpipe->pipe_state & PIPE_EOF) {
1386 error = EPIPE;
1387 break;
1388 }
1389 }
1390 }
1391
1392 --wpipe->pipe_busy;
1393 if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1394 wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
1395 wakeup(wpipe);
1396 } else if (wpipe->pipe_buffer.cnt > 0) {
1397 /*
1398 * If we have put any characters in the buffer, we wake up
1399 * the reader.
1400 */
1401 if (wpipe->pipe_state & PIPE_WANTR) {
1402 wpipe->pipe_state &= ~PIPE_WANTR;
1403 wakeup(wpipe);
1404 }
1405 }
1406
1407 /*
1408 * Don't return EPIPE if I/O was successful
1409 */
1410 if ((error == EPIPE) && (wpipe->pipe_buffer.cnt == 0)
1411 && (uio->uio_resid == 0))
1412 error = 0;
1413
1414 if (error == 0)
1415 vfs_timestamp(&wpipe->pipe_mtime);
1416
1417 /*
1418 * We have something to offer, wake up select/poll.
1419 * wpipe->pipe_map.cnt is always 0 in this point (direct write
1420 * is only done synchronously), so check only wpipe->pipe_buffer.cnt
1421 */
1422 if (wpipe->pipe_buffer.cnt)
1423 pipeselwakeup(wpipe, wpipe);
1424
1425 /*
1426 * Arrange for next read(2) to do a signal.
1427 */
1428 wpipe->pipe_state |= PIPE_SIGNALR;
1429
1430 return (error);
1431 }
1432
1433 /*
1434 * we implement a very minimal set of ioctls for compatibility with sockets.
1435 */
1436 int
1437 pipe_ioctl(fp, cmd, data, p)
1438 struct file *fp;
1439 u_long cmd;
1440 caddr_t data;
1441 struct proc *p;
1442 {
1443 struct pipe *mpipe = (struct pipe *)fp->f_data;
1444
1445 switch (cmd) {
1446
1447 case FIONBIO:
1448 return (0);
1449
1450 case FIOASYNC:
1451 if (*(int *)data) {
1452 mpipe->pipe_state |= PIPE_ASYNC;
1453 } else {
1454 mpipe->pipe_state &= ~PIPE_ASYNC;
1455 }
1456 return (0);
1457
1458 case FIONREAD:
1459 #ifndef PIPE_NODIRECT
1460 if (mpipe->pipe_state & PIPE_DIRECTW)
1461 *(int *)data = mpipe->pipe_map.cnt;
1462 else
1463 #endif
1464 *(int *)data = mpipe->pipe_buffer.cnt;
1465 return (0);
1466
1467 #ifdef __FreeBSD__
1468 case FIOSETOWN:
1469 return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1470
1471 case FIOGETOWN:
1472 *(int *)data = fgetown(mpipe->pipe_sigio);
1473 return (0);
1474
1475 /* This is deprecated, FIOSETOWN should be used instead. */
1476 case TIOCSPGRP:
1477 return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1478
1479 /* This is deprecated, FIOGETOWN should be used instead. */
1480 case TIOCGPGRP:
1481 *(int *)data = -fgetown(mpipe->pipe_sigio);
1482 return (0);
1483 #endif /* FreeBSD */
1484 #ifdef __NetBSD__
1485 case TIOCSPGRP:
1486 mpipe->pipe_pgid = *(int *)data;
1487 return (0);
1488
1489 case TIOCGPGRP:
1490 *(int *)data = mpipe->pipe_pgid;
1491 return (0);
1492 #endif /* NetBSD */
1493
1494 }
1495 return (ENOTTY);
1496 }
1497
/*
 * Poll/select backend for a pipe.  Reports readability on the read
 * side, writability on the write side, and POLLHUP when either end
 * has seen EOF or the peer is gone.
 */
int
pipe_poll(fp, events, p)
	struct file *fp;
	int events;
	struct proc *p;
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	/*
	 * Readable when data is buffered, a direct write is pending,
	 * or EOF has been seen (read will return 0 without blocking).
	 */
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_buffer.cnt > 0) ||
#ifndef PIPE_NODIRECT
		    (rpipe->pipe_state & PIPE_DIRECTW) ||
#endif
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	/*
	 * Writable when at least PIPE_BUF bytes are free (so an atomic
	 * small write cannot block) and no direct write is in progress,
	 * or when the peer is gone/EOF so a write returns EPIPE at once.
	 */
	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)
		    || (
#ifndef PIPE_NODIRECT
		     ((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
#endif
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	/*
	 * Nothing ready yet: record the selector on the relevant side.
	 * Note wpipe cannot be NULL here: a NULL peer set POLLHUP above,
	 * making revents non-zero, so this branch is not reached.
	 */
	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}
1545
1546 static int
1547 pipe_stat(fp, ub, p)
1548 struct file *fp;
1549 struct stat *ub;
1550 struct proc *p;
1551 {
1552 struct pipe *pipe = (struct pipe *)fp->f_data;
1553
1554 memset((caddr_t)ub, 0, sizeof(*ub));
1555 ub->st_mode = S_IFIFO;
1556 ub->st_blksize = pipe->pipe_buffer.size;
1557 ub->st_size = pipe->pipe_buffer.cnt;
1558 ub->st_blocks = (ub->st_size) ? 1 : 0;
1559 #ifdef __FreeBSD__
1560 ub->st_atimespec = pipe->pipe_atime;
1561 ub->st_mtimespec = pipe->pipe_mtime;
1562 ub->st_ctimespec = pipe->pipe_ctime;
1563 #endif /* FreeBSD */
1564 #ifdef __NetBSD__
1565 TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec)
1566 TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
1567 TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
1568 #endif /* NetBSD */
1569 ub->st_uid = fp->f_cred->cr_uid;
1570 ub->st_gid = fp->f_cred->cr_gid;
1571 /*
1572 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1573 * XXX (st_dev, st_ino) should be unique.
1574 */
1575 return (0);
1576 }
1577
1578 /* ARGSUSED */
1579 static int
1580 pipe_close(fp, p)
1581 struct file *fp;
1582 struct proc *p;
1583 {
1584 struct pipe *cpipe = (struct pipe *)fp->f_data;
1585
1586 #ifdef __FreeBSD__
1587 fp->f_ops = &badfileops;
1588 funsetown(cpipe->pipe_sigio);
1589 #endif
1590 fp->f_data = NULL;
1591 pipeclose(cpipe);
1592 return (0);
1593 }
1594
/*
 * Release the kernel virtual memory held by a pipe: the kernel-mapped
 * pipe buffer and, when direct writes are enabled, the direct-write
 * mapping.  Safe to call on a pipe holding neither.
 */
static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

#ifdef __FreeBSD__
	mtx_assert(&vm_mtx, MA_OWNED);
#endif
	if (cpipe->pipe_buffer.buffer != NULL) {
		/* Keep the big-pipe count and pipe KVA accounting current. */
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			--nbigpipe;
		amountpipekva -= cpipe->pipe_buffer.size;
#ifdef __FreeBSD__
		kmem_free(kernel_map,
			(vm_offset_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
#elif defined(__NetBSD__)
		uvm_km_free(kernel_map,
			(vaddr_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
#endif /* NetBSD */
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (cpipe->pipe_map.kva != NULL) {
		/*
		 * FreeBSD maps the direct-write window one page larger
		 * than the buffer; NetBSD releases loaned pages instead
		 * via pipe_loan_free().
		 */
#ifdef __FreeBSD__
		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
		kmem_free(kernel_map,
			cpipe->pipe_map.kva,
			cpipe->pipe_buffer.size + PAGE_SIZE);
#elif defined(__NetBSD__)
		pipe_loan_free(cpipe);
#endif /* NetBSD */
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.kva = NULL;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif /* !PIPE_NODIRECT */
}
1635
1636 /*
1637 * shutdown the pipe
1638 */
1639 static void
1640 pipeclose(cpipe)
1641 struct pipe *cpipe;
1642 {
1643 struct pipe *ppipe;
1644
1645 if (!cpipe)
1646 return;
1647
1648 pipeselwakeup(cpipe, cpipe);
1649
1650 /*
1651 * If the other side is blocked, wake it up saying that
1652 * we want to close it down.
1653 */
1654 while (cpipe->pipe_busy) {
1655 wakeup(cpipe);
1656 cpipe->pipe_state |= PIPE_WANTCLOSE | PIPE_EOF;
1657 tsleep(cpipe, PRIBIO, "pipecl", 0);
1658 }
1659
1660 /*
1661 * Disconnect from peer
1662 */
1663 if ((ppipe = cpipe->pipe_peer) != NULL) {
1664 pipeselwakeup(ppipe, ppipe);
1665
1666 ppipe->pipe_state |= PIPE_EOF;
1667 wakeup(ppipe);
1668 ppipe->pipe_peer = NULL;
1669 }
1670
1671 /*
1672 * free resources
1673 */
1674 #ifdef _FreeBSD__
1675 mtx_lock(&vm_mtx);
1676 pipe_free_kmem(cpipe);
1677 /* XXX: erm, doesn't zalloc already have its own locks and
1678 * not need the giant vm lock?
1679 */
1680 zfree(pipe_zone, cpipe);
1681 mtx_unlock(&vm_mtx);
1682 #endif /* FreeBSD */
1683
1684 #ifdef __NetBSD__
1685 pipe_free_kmem(cpipe);
1686 (void) lockmgr(&cpipe->pipe_lock, LK_DRAIN, NULL);
1687 pool_put(&pipe_pool, cpipe);
1688 #endif
1689 }
1690
1691 #ifdef __FreeBSD__
1692 /*ARGSUSED*/
1693 static int
1694 pipe_kqfilter(struct file *fp, struct knote *kn)
1695 {
1696 struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1697
1698 switch (kn->kn_filter) {
1699 case EVFILT_READ:
1700 kn->kn_fop = &pipe_rfiltops;
1701 break;
1702 case EVFILT_WRITE:
1703 kn->kn_fop = &pipe_wfiltops;
1704 cpipe = cpipe->pipe_peer;
1705 break;
1706 default:
1707 return (1);
1708 }
1709 kn->kn_hook = (caddr_t)cpipe;
1710 SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1711 return (0);
1712 }
1713
1714 static void
1715 filt_pipedetach(struct knote *kn)
1716 {
1717 struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1718
1719 SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1720 }
1721
1722 /*ARGSUSED*/
1723 static int
1724 filt_piperead(struct knote *kn, long hint)
1725 {
1726 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1727 struct pipe *wpipe = rpipe->pipe_peer;
1728
1729 kn->kn_data = rpipe->pipe_buffer.cnt;
1730 if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1731 kn->kn_data = rpipe->pipe_map.cnt;
1732
1733 if ((rpipe->pipe_state & PIPE_EOF) ||
1734 (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1735 kn->kn_flags |= EV_EOF;
1736 return (1);
1737 }
1738 return (kn->kn_data > 0);
1739 }
1740
1741 /*ARGSUSED*/
1742 static int
1743 filt_pipewrite(struct knote *kn, long hint)
1744 {
1745 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1746 struct pipe *wpipe = rpipe->pipe_peer;
1747
1748 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1749 kn->kn_data = 0;
1750 kn->kn_flags |= EV_EOF;
1751 return (1);
1752 }
1753 kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1754 if (wpipe->pipe_state & PIPE_DIRECTW)
1755 kn->kn_data = 0;
1756
1757 return (kn->kn_data >= PIPE_BUF);
1758 }
1759 #endif /* FreeBSD */
1760
1761 #ifdef __NetBSD__
1762 static int
1763 pipe_fcntl(fp, cmd, data, p)
1764 struct file *fp;
1765 u_int cmd;
1766 caddr_t data;
1767 struct proc *p;
1768 {
1769 if (cmd == F_SETFL)
1770 return (0);
1771 else
1772 return (EOPNOTSUPP);
1773 }
1774
1775 /*
1776 * Handle pipe sysctls.
1777 */
1778 int
1779 sysctl_dopipe(name, namelen, oldp, oldlenp, newp, newlen)
1780 int *name;
1781 u_int namelen;
1782 void *oldp;
1783 size_t *oldlenp;
1784 void *newp;
1785 size_t newlen;
1786 {
1787 /* All sysctl names at this level are terminal. */
1788 if (namelen != 1)
1789 return (ENOTDIR); /* overloaded */
1790
1791 switch (name[0]) {
1792 case KERN_PIPE_MAXKVASZ:
1793 return (sysctl_int(oldp, oldlenp, newp, newlen, &maxpipekva));
1794 case KERN_PIPE_LIMITKVA:
1795 return (sysctl_int(oldp, oldlenp, newp, newlen, &limitpipekva));
1796 case KERN_PIPE_MAXBIGPIPES:
1797 return (sysctl_int(oldp, oldlenp, newp, newlen, &maxbigpipes));
1798 case KERN_PIPE_NBIGPIPES:
1799 return (sysctl_rdint(oldp, oldlenp, newp, nbigpipe));
1800 case KERN_PIPE_KVASIZE:
1801 return (sysctl_rdint(oldp, oldlenp, newp, amountpipekva));
1802 default:
1803 return (EOPNOTSUPP);
1804 }
1805 /* NOTREACHED */
1806 }
1807
1808 /*
1809 * Initialize pipe structs.
1810 */
1811 void
1812 pipe_init(void)
1813 {
1814 pool_init(&pipe_pool, sizeof(struct pipe), 0, 0, 0, "pipepl",
1815 0, NULL, NULL, M_PIPE);
1816 }
1817
#endif /* __NetBSD__ */
1819