sys_pipe.c revision 1.5 1 /* $NetBSD: sys_pipe.c,v 1.5 2001/07/02 20:43:39 jdolecek Exp $ */
2
3 /*
4 * Copyright (c) 1996 John S. Dyson
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice immediately at the beginning of the file, without modification,
12 * this list of conditions, and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Absolutely no warranty of function or purpose is made by the author
17 * John S. Dyson.
18 * 4. Modifications may be freely made to this file if the above conditions
19 * are met.
20 *
21 * $FreeBSD: src/sys/kern/sys_pipe.c,v 1.82 2001/06/15 20:45:01 jlemon Exp $
22 */
23
24 /*
25 * This file contains a high-performance replacement for the socket-based
26 * pipes scheme originally used in FreeBSD/4.4Lite. It does not support
27 * all features of sockets, but does do everything that pipes normally
28 * do.
29 *
30 * Adaption for NetBSD UVM, including uvm_loan() based direct write, was
31 * written by Jaromir Dolecek.
32 */
33
34 /*
35 * This code has two modes of operation, a small write mode and a large
36 * write mode. The small write mode acts like conventional pipes with
37 * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
38 * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
39 * and PIPE_SIZE in size, it is fully mapped into the kernel (on FreeBSD,
40 * those pages are also wired), and the receiving process can copy it directly
41 * from the pages in the sending process.
42 *
43 * If the sending process receives a signal, it is possible that it will
44 * go away, and certainly its address space can change, because control
45 * is returned back to the user-mode side. In that case, the pipe code
46 * arranges to copy the buffer supplied by the user process on FreeBSD, to
47 * a pageable kernel buffer, and the receiving process will grab the data
48 * from the pageable kernel buffer. Since signals don't happen all that often,
49 * the copy operation is normally eliminated.
50 * For NetBSD, the pages are mapped read-only, COW for kernel by uvm_loan(),
51 * so no explicit handling needs to be done; everything is handled by the
52 * standard VM facilities.
53 *
54 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
55 * happen for small transfers so that the system will not spend all of
56 * its time context switching. PIPE_SIZE is constrained by the
57 * amount of kernel virtual memory.
58 */
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/proc.h>
63 #include <sys/fcntl.h>
64 #include <sys/file.h>
65 #include <sys/filedesc.h>
66 #include <sys/filio.h>
67 #include <sys/ttycom.h>
68 #include <sys/stat.h>
69 #include <sys/poll.h>
70 #include <sys/signalvar.h>
71 #include <sys/vnode.h>
72 #include <sys/uio.h>
73 #include <sys/lock.h>
74 #ifdef __FreeBSD__
75 #include <sys/mutex.h>
76 #include <sys/selinfo.h>
77 #include <sys/sysproto.h>
78 #elif defined(__NetBSD__)
79 #include <sys/select.h>
80 #include <sys/malloc.h>
81 #include <sys/mount.h>
82 #include <sys/syscallargs.h>
83 #include <uvm/uvm.h>
84 #include <sys/sysctl.h>
85 #endif /* NetBSD, FreeBSD */
86
87 #include <sys/pipe.h>
88
89 #ifdef __NetBSD__
90 #define vfs_timestamp(tv) microtime(tv)
91 #endif
92
93 /*
94 * Use this define if you want to disable *fancy* VM things. Expect an
95 * approx 30% decrease in transfer rate. This could be useful for
96 * OpenBSD.
97 */
98 /* #define PIPE_NODIRECT */
99
100 /*
101 * interfaces to the outside world
102 */
#ifdef __FreeBSD__
/* File-operation handlers, FreeBSD calling convention. */
static int pipe_read __P((struct file *fp, struct uio *uio,
    struct ucred *cred, int flags, struct proc *p));
static int pipe_write __P((struct file *fp, struct uio *uio,
    struct ucred *cred, int flags, struct proc *p));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
    struct proc *p));
static int pipe_kqfilter __P((struct file *fp, struct knote *kn));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data,
    struct proc *p));

/* Operations vector installed in f_ops of both pipe descriptors. */
static struct fileops pipeops = {
	pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter,
	pipe_stat, pipe_close
};

/* kqueue filter callbacks. */
static void filt_pipedetach(struct knote *kn);
static int filt_piperead(struct knote *kn, long hint);
static int filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };
#endif /* FreeBSD */

#ifdef __NetBSD__
/* File-operation handlers, NetBSD calling convention. */
static int pipe_read __P((struct file *fp, off_t *offset, struct uio *uio,
    struct ucred *cred, int flags));
static int pipe_write __P((struct file *fp, off_t *offset, struct uio *uio,
    struct ucred *cred, int flags));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct proc *p));
static int pipe_fcntl __P((struct file *fp, u_int com, caddr_t data,
    struct proc *p));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data,
    struct proc *p));

/* Operations vector installed in f_ops of both pipe descriptors. */
static struct fileops pipeops = {
	pipe_read, pipe_write, pipe_ioctl, pipe_fcntl, pipe_poll,
	pipe_stat, pipe_close
};
#endif /* NetBSD */
146
147 /*
148 * Default pipe buffer size(s), this can be kind-of large now because pipe
149 * space is pageable. The pipe code will try to maintain locality of
150 * reference for performance reasons, so small amounts of outstanding I/O
151 * will not wipe the cache.
152 */
153 #define MINPIPESIZE (PIPE_SIZE/3)
154 #define MAXPIPESIZE (2*PIPE_SIZE/3)
155
156 /*
157 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
158 * is there so that on large systems, we don't exhaust it.
159 */
160 #define MAXPIPEKVA (8*1024*1024)
161 static int maxpipekva = MAXPIPEKVA;
162
163 /*
164 * Limit for direct transfers, we cannot, of course limit
165 * the amount of kva for pipes in general though.
166 */
167 #define LIMITPIPEKVA (16*1024*1024)
168 static int limitpipekva = LIMITPIPEKVA;
169
170 /*
171 * Limit the number of "big" pipes
172 */
173 #define LIMITBIGPIPES 32
174 static int maxbigpipes = LIMITBIGPIPES;
175 static int nbigpipe = 0;
176
177 /*
178 * Amount of KVA consumed by pipe buffers.
179 */
180 static int amountpipekva = 0;
181
182 static void pipeclose __P((struct pipe *cpipe));
183 static void pipe_free_kmem __P((struct pipe *cpipe));
184 static int pipe_create __P((struct pipe **cpipep));
185 static __inline int pipelock __P((struct pipe *cpipe, int catch));
186 static __inline void pipeunlock __P((struct pipe *cpipe));
187 static __inline void pipeselwakeup __P((struct pipe *selp,
188 struct pipe *sigp));
189 static int pipespace __P((struct pipe *cpipe, int size));
190
191 #ifdef __FreeBSD__
192 #ifndef PIPE_NODIRECT
193 static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
194 static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
195 static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
196 static void pipe_clone_write_buffer __P((struct pipe *wpipe));
197 #endif
198
199 static vm_zone_t pipe_zone;
200 #endif /* FreeBSD */
201
202 #ifdef __NetBSD__
203 #ifndef PIPE_NODIRECT
204 static __inline int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
205 static __inline int pipe_loan_alloc __P((struct pipe *wpipe, int npages,
206 vsize_t blen));
207 static void pipe_loan_free __P((struct pipe *wpipe));
208 #endif /* PIPE_NODIRECT */
209
210 static struct pool pipe_pool;
211 #endif /* NetBSD */
212
213 /*
214 * The pipe system call for the DTYPE_PIPE type of pipes
215 */
216
217 /* ARGSUSED */
218 #ifdef __FreeBSD__
219 int
220 pipe(p, uap)
221 struct proc *p;
222 struct pipe_args /* {
223 int dummy;
224 } */ *uap;
225 #elif defined(__NetBSD__)
226 int
227 sys_pipe(p, v, retval)
228 struct proc *p;
229 void *v;
230 register_t *retval;
231 #endif
232 {
233 struct filedesc *fdp = p->p_fd;
234 struct file *rf, *wf;
235 struct pipe *rpipe, *wpipe;
236 int fd, error;
237
238 #ifdef __FreeBSD__
239 if (pipe_zone == NULL)
240 pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4);
241 #endif
242
243 rpipe = wpipe = NULL;
244 if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
245 pipeclose(rpipe);
246 pipeclose(wpipe);
247 return (ENFILE);
248 }
249
250 #ifdef __FreeBSD__
251 error = falloc(p, &rf, &fd);
252 if (error) {
253 pipeclose(rpipe);
254 pipeclose(wpipe);
255 return (error);
256 }
257 fhold(rf);
258 p->p_retval[0] = fd;
259
260 /*
261 * Warning: once we've gotten past allocation of the fd for the
262 * read-side, we can only drop the read side via fdrop() in order
263 * to avoid races against processes which manage to dup() the read
264 * side while we are blocked trying to allocate the write side.
265 */
266 rf->f_flag = FREAD | FWRITE;
267 rf->f_type = DTYPE_PIPE;
268 rf->f_data = (caddr_t)rpipe;
269 rf->f_ops = &pipeops;
270 error = falloc(p, &wf, &fd);
271 if (error) {
272 if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
273 fdp->fd_ofiles[p->p_retval[0]] = NULL;
274 fdrop(rf, p);
275 }
276 fdrop(rf, p);
277 /* rpipe has been closed by fdrop(). */
278 pipeclose(wpipe);
279 return (error);
280 }
281 wf->f_flag = FREAD | FWRITE;
282 wf->f_type = DTYPE_PIPE;
283 wf->f_data = (caddr_t)wpipe;
284 wf->f_ops = &pipeops;
285 p->p_retval[1] = fd;
286
287 rpipe->pipe_peer = wpipe;
288 wpipe->pipe_peer = rpipe;
289 fdrop(rf, p);
290 #endif /* FreeBSD */
291
292 #ifdef __NetBSD__
293 /*
294 * Note: the file structure returned from falloc() is marked
295 * as 'larval' initially. Unless we mark it as 'mature' by
296 * FILE_SET_MATURE(), any attempt to do anything with it would
297 * return EBADF, including e.g. dup(2) or close(2). This avoids
298 * file descriptor races if we block in the second falloc().
299 */
300
301 error = falloc(p, &rf, &fd);
302 if (error)
303 goto free2;
304 retval[0] = fd;
305 rf->f_flag = FREAD;
306 rf->f_type = DTYPE_PIPE;
307 rf->f_data = (caddr_t)rpipe;
308 rf->f_ops = &pipeops;
309
310 error = falloc(p, &wf, &fd);
311 if (error)
312 goto free3;
313 retval[1] = fd;
314 wf->f_flag = FWRITE;
315 wf->f_type = DTYPE_PIPE;
316 wf->f_data = (caddr_t)wpipe;
317 wf->f_ops = &pipeops;
318
319 rpipe->pipe_peer = wpipe;
320 wpipe->pipe_peer = rpipe;
321
322 FILE_SET_MATURE(rf);
323 FILE_SET_MATURE(wf);
324 FILE_UNUSE(rf, p);
325 FILE_UNUSE(wf, p);
326 return (0);
327 free3:
328 FILE_UNUSE(rf, p);
329 ffree(rf);
330 fdremove(fdp, retval[0]);
331 free2:
332 pipeclose(wpipe);
333 pipeclose(rpipe);
334 #endif /* NetBSD */
335
336 return (error);
337 }
338
339 /*
340 * Allocate kva for pipe circular buffer, the space is pageable
341 * This routine will 'realloc' the size of a pipe safely, if it fails
342 * it will retain the old buffer.
343 * If it fails it will return ENOMEM.
344 */
345 static int
346 pipespace(cpipe, size)
347 struct pipe *cpipe;
348 int size;
349 {
350 caddr_t buffer;
351 #ifdef __FreeBSD__
352 struct vm_object *object;
353 int npages, error;
354
355 npages = round_page(size)/PAGE_SIZE;
356 /*
357 * Create an object, I don't like the idea of paging to/from
358 * kernel_object.
359 */
360 mtx_lock(&vm_mtx);
361 object = vm_object_allocate(OBJT_DEFAULT, npages);
362 buffer = (caddr_t) vm_map_min(kernel_map);
363
364 /*
365 * Insert the object into the kernel map, and allocate kva for it.
366 * The map entry is, by default, pageable.
367 */
368 error = vm_map_find(kernel_map, object, 0,
369 (vm_offset_t *) &buffer, size, 1,
370 VM_PROT_ALL, VM_PROT_ALL, 0);
371
372 if (error != KERN_SUCCESS) {
373 vm_object_deallocate(object);
374 mtx_unlock(&vm_mtx);
375 return (ENOMEM);
376 }
377 #endif /* FreeBSD */
378
379 #ifdef __NetBSD__
380 /*
381 * Allocate pageable virtual address space. Physical memory is allocated
382 * on demand.
383 */
384 buffer = (caddr_t) uvm_km_valloc(kernel_map, round_page(size));
385 if (buffer == NULL)
386 return (ENOMEM);
387 #endif /* NetBSD */
388
389 /* free old resources if we're resizing */
390 pipe_free_kmem(cpipe);
391 #ifdef __FreeBSD__
392 mtx_unlock(&vm_mtx);
393 cpipe->pipe_buffer.object = object;
394 #endif
395 cpipe->pipe_buffer.buffer = buffer;
396 cpipe->pipe_buffer.size = size;
397 cpipe->pipe_buffer.in = 0;
398 cpipe->pipe_buffer.out = 0;
399 cpipe->pipe_buffer.cnt = 0;
400 amountpipekva += cpipe->pipe_buffer.size;
401 return (0);
402 }
403
404 /*
405 * initialize and allocate VM and memory for pipe
406 */
407 static int
408 pipe_create(cpipep)
409 struct pipe **cpipep;
410 {
411 struct pipe *cpipe;
412 int error;
413
414 #ifdef __FreeBSD__
415 *cpipep = zalloc(pipe_zone);
416 #endif
417 #ifdef __NetBSD__
418 *cpipep = pool_get(&pipe_pool, M_WAITOK);
419 #endif
420 if (*cpipep == NULL)
421 return (ENOMEM);
422
423 cpipe = *cpipep;
424
425 #ifdef __FreeBSD__
426 /* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */
427 cpipe->pipe_buffer.object = NULL;
428 #endif /* FreeBSD */
429 /*
430 * protect so pipeclose() doesn't follow a junk pointer
431 * if pipespace() fails.
432 */
433 cpipe->pipe_buffer.buffer = NULL;
434 bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
435 cpipe->pipe_state = PIPE_SIGNALR;
436 cpipe->pipe_peer = NULL;
437 cpipe->pipe_busy = 0;
438
439 #ifndef PIPE_NODIRECT
440 /*
441 * pipe data structure initializations to support direct pipe I/O
442 */
443 cpipe->pipe_map.cnt = 0;
444 cpipe->pipe_map.kva = NULL;
445 cpipe->pipe_map.pos = 0;
446 cpipe->pipe_map.npages = 0;
447 #ifdef __NetBSD__
448 cpipe->pipe_map.ms = NULL;
449 #endif
450 #endif /* !PIPE_NODIRECT */
451
452 if ((error = pipespace(cpipe, PIPE_SIZE)))
453 return (error);
454
455 vfs_timestamp(&cpipe->pipe_ctime);
456 cpipe->pipe_atime = cpipe->pipe_ctime;
457 cpipe->pipe_mtime = cpipe->pipe_ctime;
458 #ifdef __NetBSD__
459 cpipe->pipe_pgid = NO_PID;
460 lockinit(&cpipe->pipe_lock, PRIBIO | PCATCH, "pipelk", 0, 0);
461 #endif
462
463 return (0);
464 }
465
466
467 /*
468 * lock a pipe for I/O, blocking other access
469 */
/*
 * Take the pipe's I/O lock, blocking other access.
 *
 * If 'catch' is non-zero the wait may be interrupted and the resulting
 * error is returned.  Returns 0 once the lock is held.
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

#ifdef __FreeBSD__
	/* Sleep until the current holder clears PIPE_LOCK. */
	while ((cpipe->pipe_state & PIPE_LOCK) != 0) {
		cpipe->pipe_state |= PIPE_LWANT;
		error = tsleep(cpipe, catch ? (PRIBIO | PCATCH) : PRIBIO,
		    "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
#endif

#ifdef __NetBSD__
	for (;;) {
		error = lockmgr(&cpipe->pipe_lock, LK_EXCLUSIVE, NULL);
		/* Only a non-catching caller retries after an interruption. */
		if (catch || (error != EINTR && error != ERESTART))
			break;
	}
	return (error);
#endif
}
496
497 /*
498 * unlock a pipe I/O lock
499 */
/*
 * Release the pipe's I/O lock taken by pipelock().
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{
#ifdef __FreeBSD__
	cpipe->pipe_state &= ~PIPE_LOCK;

	/* Somebody is sleeping in pipelock(); let them retry. */
	if ((cpipe->pipe_state & PIPE_LWANT) != 0) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
#endif

#ifdef __NetBSD__
	lockmgr(&cpipe->pipe_lock, LK_RELEASE, NULL);
#endif
}
516
517 /*
518 * Select/poll wakeup. This also sends SIGIO to the peer connected to the
519 * 'sigp' side of the pipe.
520 */
521 static __inline void
522 pipeselwakeup(selp, sigp)
523 struct pipe *selp, *sigp;
524 {
525 if (selp->pipe_state & PIPE_SEL) {
526 selp->pipe_state &= ~PIPE_SEL;
527 selwakeup(&selp->pipe_sel);
528 }
529 #ifdef __FreeBSD__
530 if (sigp && (sigp->pipe_state & PIPE_ASYNC) && sigp->pipe_sigio)
531 pgsigio(sigp->pipe_sigio, SIGIO, 0);
532 KNOTE(&selp->pipe_sel.si_note, 0);
533 #endif
534
535 #ifdef __NetBSD__
536 if (sigp && (sigp->pipe_state & PIPE_ASYNC)
537 && sigp->pipe_pgid != NO_PID){
538 struct proc *p;
539
540 if (sigp->pipe_pgid < 0)
541 gsignal(-sigp->pipe_pgid, SIGIO);
542 else if (sigp->pipe_pgid > 0 && (p = pfind(sigp->pipe_pgid)) != 0)
543 psignal(p, SIGIO);
544 }
545 #endif /* NetBSD */
546 }
547
548 /* ARGSUSED */
549 #ifdef __FreeBSD__
550 static int
551 pipe_read(fp, uio, cred, flags, p)
552 struct file *fp;
553 struct uio *uio;
554 struct ucred *cred;
555 int flags;
556 struct proc *p;
557 #elif defined(__NetBSD__)
558 static int
559 pipe_read(fp, offset, uio, cred, flags)
560 struct file *fp;
561 off_t *offset;
562 struct uio *uio;
563 struct ucred *cred;
564 int flags;
565 #endif
566 {
567 struct pipe *rpipe = (struct pipe *) fp->f_data;
568 int error;
569 size_t nread = 0;
570 size_t size;
571 size_t ocnt;
572
573 ++rpipe->pipe_busy;
574 error = pipelock(rpipe, 1);
575 if (error)
576 goto unlocked_error;
577
578 ocnt = rpipe->pipe_buffer.cnt;
579
580 while (uio->uio_resid) {
581 /*
582 * normal pipe buffer receive
583 */
584 if (rpipe->pipe_buffer.cnt > 0) {
585 size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
586 if (size > rpipe->pipe_buffer.cnt)
587 size = rpipe->pipe_buffer.cnt;
588 if (size > uio->uio_resid)
589 size = uio->uio_resid;
590
591 error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
592 size, uio);
593 if (error)
594 break;
595
596 rpipe->pipe_buffer.out += size;
597 if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
598 rpipe->pipe_buffer.out = 0;
599
600 rpipe->pipe_buffer.cnt -= size;
601
602 /*
603 * If there is no more to read in the pipe, reset
604 * its pointers to the beginning. This improves
605 * cache hit stats.
606 */
607 if (rpipe->pipe_buffer.cnt == 0) {
608 rpipe->pipe_buffer.in = 0;
609 rpipe->pipe_buffer.out = 0;
610 }
611 nread += size;
612 #ifndef PIPE_NODIRECT
613 /*
614 * Direct copy, bypassing a kernel buffer.
615 */
616 } else if ((size = rpipe->pipe_map.cnt) &&
617 (rpipe->pipe_state & PIPE_DIRECTW)) {
618 caddr_t va;
619 if (size > uio->uio_resid)
620 size = uio->uio_resid;
621
622 va = (caddr_t) rpipe->pipe_map.kva +
623 rpipe->pipe_map.pos;
624 error = uiomove(va, size, uio);
625 if (error)
626 break;
627 nread += size;
628 rpipe->pipe_map.pos += size;
629 rpipe->pipe_map.cnt -= size;
630 if (rpipe->pipe_map.cnt == 0) {
631 rpipe->pipe_state &= ~PIPE_DIRECTW;
632 wakeup(rpipe);
633 }
634 #endif
635 } else {
636 /*
637 * detect EOF condition
638 * read returns 0 on EOF, no need to set error
639 */
640 if (rpipe->pipe_state & PIPE_EOF)
641 break;
642
643 /*
644 * If the "write-side" has been blocked, wake it up now.
645 */
646 if (rpipe->pipe_state & PIPE_WANTW) {
647 rpipe->pipe_state &= ~PIPE_WANTW;
648 wakeup(rpipe);
649 }
650
651 /*
652 * Break if some data was read.
653 */
654 if (nread > 0)
655 break;
656
657 /*
658 * don't block on non-blocking I/O
659 */
660 if (fp->f_flag & FNONBLOCK) {
661 error = EAGAIN;
662 break;
663 }
664
665 /*
666 * Unlock the pipe buffer for our remaining processing.
667 * We will either break out with an error or we will
668 * sleep and relock to loop.
669 */
670 pipeunlock(rpipe);
671
672 /*
673 * We want to read more, wake up select/poll.
674 */
675 pipeselwakeup(rpipe, rpipe->pipe_peer);
676
677 rpipe->pipe_state |= PIPE_WANTR;
678 error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0);
679 if (error != 0 || (error = pipelock(rpipe, 1)))
680 goto unlocked_error;
681 }
682 }
683 pipeunlock(rpipe);
684
685 if (error == 0)
686 vfs_timestamp(&rpipe->pipe_atime);
687 unlocked_error:
688 --rpipe->pipe_busy;
689
690 /*
691 * PIPE_WANTCLOSE processing only makes sense if pipe_busy is 0.
692 */
693 if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANTCLOSE)) {
694 rpipe->pipe_state &= ~(PIPE_WANTCLOSE|PIPE_WANTW);
695 wakeup(rpipe);
696 } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
697 /*
698 * Handle write blocking hysteresis.
699 */
700 if (rpipe->pipe_state & PIPE_WANTW) {
701 rpipe->pipe_state &= ~PIPE_WANTW;
702 wakeup(rpipe);
703 }
704 }
705
706 /*
707 * If anything was read off the buffer, signal to the writer it's
708 * possible to write more data. Also send signal if we are here for the
709 * first time after last write.
710 */
711 if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF
712 && (ocnt != rpipe->pipe_buffer.cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
713 pipeselwakeup(rpipe, rpipe->pipe_peer);
714 rpipe->pipe_state &= ~PIPE_SIGNALR;
715 }
716
717 return (error);
718 }
719
720 #ifdef __FreeBSD__
721 #ifndef PIPE_NODIRECT
722 /*
723 * Map the sending processes' buffer into kernel space and wire it.
724 * This is similar to a physical write operation.
725 */
726 static int
727 pipe_build_write_buffer(wpipe, uio)
728 struct pipe *wpipe;
729 struct uio *uio;
730 {
731 size_t size;
732 int i;
733 vm_offset_t addr, endaddr, paddr;
734
735 size = uio->uio_iov->iov_len;
736 if (size > wpipe->pipe_buffer.size)
737 size = wpipe->pipe_buffer.size;
738
739 endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
740 mtx_lock(&vm_mtx);
741 addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
742 for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
743 vm_page_t m;
744
745 if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
746 (paddr = pmap_kextract(addr)) == 0) {
747 int j;
748
749 for (j = 0; j < i; j++)
750 vm_page_unwire(wpipe->pipe_map.ms[j], 1);
751 mtx_unlock(&vm_mtx);
752 return (EFAULT);
753 }
754
755 m = PHYS_TO_VM_PAGE(paddr);
756 vm_page_wire(m);
757 wpipe->pipe_map.ms[i] = m;
758 }
759
760 /*
761 * set up the control block
762 */
763 wpipe->pipe_map.npages = i;
764 wpipe->pipe_map.pos =
765 ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
766 wpipe->pipe_map.cnt = size;
767
768 /*
769 * and map the buffer
770 */
771 if (wpipe->pipe_map.kva == 0) {
772 /*
773 * We need to allocate space for an extra page because the
774 * address range might (will) span pages at times.
775 */
776 wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
777 wpipe->pipe_buffer.size + PAGE_SIZE);
778 amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
779 }
780 pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
781 wpipe->pipe_map.npages);
782
783 mtx_unlock(&vm_mtx);
784 /*
785 * and update the uio data
786 */
787
788 uio->uio_iov->iov_len -= size;
789 uio->uio_iov->iov_base += size;
790 if (uio->uio_iov->iov_len == 0)
791 uio->uio_iov++;
792 uio->uio_resid -= size;
793 uio->uio_offset += size;
794 return (0);
795 }
796
797 /*
798 * unmap and unwire the process buffer
799 */
800 static void
801 pipe_destroy_write_buffer(wpipe)
802 struct pipe *wpipe;
803 {
804 int i;
805
806 mtx_lock(&vm_mtx);
807 if (wpipe->pipe_map.kva) {
808 pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
809
810 if (amountpipekva > maxpipekva) {
811 vm_offset_t kva = wpipe->pipe_map.kva;
812 wpipe->pipe_map.kva = 0;
813 kmem_free(kernel_map, kva,
814 wpipe->pipe_buffer.size + PAGE_SIZE);
815 amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
816 }
817 }
818 for (i = 0; i < wpipe->pipe_map.npages; i++)
819 vm_page_unwire(wpipe->pipe_map.ms[i], 1);
820 mtx_unlock(&vm_mtx);
821 }
822
823 /*
824 * In the case of a signal, the writing process might go away. This
825 * code copies the data into the circular buffer so that the source
826 * pages can be freed without loss of data.
827 */
828 static void
829 pipe_clone_write_buffer(wpipe)
830 struct pipe *wpipe;
831 {
832 int size;
833 int pos;
834
835 size = wpipe->pipe_map.cnt;
836 pos = wpipe->pipe_map.pos;
837 bcopy((caddr_t) wpipe->pipe_map.kva + pos,
838 (caddr_t) wpipe->pipe_buffer.buffer, size);
839
840 wpipe->pipe_buffer.in = size;
841 wpipe->pipe_buffer.out = 0;
842 wpipe->pipe_buffer.cnt = size;
843 wpipe->pipe_state &= ~PIPE_DIRECTW;
844
845 pipe_destroy_write_buffer(wpipe);
846 }
847
848 /*
849 * This implements the pipe buffer write mechanism. Note that only
850 * a direct write OR a normal pipe write can be pending at any given time.
851 * If there are any characters in the pipe buffer, the direct write will
852 * be deferred until the receiving process grabs all of the bytes from
853 * the pipe buffer. Then the direct mapping write is set-up.
854 */
855 static int
856 pipe_direct_write(wpipe, uio)
857 struct pipe *wpipe;
858 struct uio *uio;
859 {
860 int error;
861
862 retry:
863 while (wpipe->pipe_state & PIPE_DIRECTW) {
864 if (wpipe->pipe_state & PIPE_WANTR) {
865 wpipe->pipe_state &= ~PIPE_WANTR;
866 wakeup(wpipe);
867 }
868 wpipe->pipe_state |= PIPE_WANTW;
869 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
870 if (error)
871 goto error1;
872 if (wpipe->pipe_state & PIPE_EOF) {
873 error = EPIPE;
874 goto error1;
875 }
876 }
877 wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
878 if (wpipe->pipe_buffer.cnt > 0) {
879 if (wpipe->pipe_state & PIPE_WANTR) {
880 wpipe->pipe_state &= ~PIPE_WANTR;
881 wakeup(wpipe);
882 }
883
884 wpipe->pipe_state |= PIPE_WANTW;
885 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
886 if (error)
887 goto error1;
888 if (wpipe->pipe_state & PIPE_EOF) {
889 error = EPIPE;
890 goto error1;
891 }
892 goto retry;
893 }
894
895 wpipe->pipe_state |= PIPE_DIRECTW;
896
897 error = pipe_build_write_buffer(wpipe, uio);
898 if (error) {
899 wpipe->pipe_state &= ~PIPE_DIRECTW;
900 goto error1;
901 }
902
903 error = 0;
904 while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
905 if (wpipe->pipe_state & PIPE_EOF) {
906 pipelock(wpipe, 0);
907 pipe_destroy_write_buffer(wpipe);
908 pipeunlock(wpipe);
909 pipeselwakeup(wpipe, wpipe);
910 error = EPIPE;
911 goto error1;
912 }
913 if (wpipe->pipe_state & PIPE_WANTR) {
914 wpipe->pipe_state &= ~PIPE_WANTR;
915 wakeup(wpipe);
916 }
917 pipeselwakeup(wpipe, wpipe);
918 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
919 }
920
921 pipelock(wpipe,0);
922 if (wpipe->pipe_state & PIPE_DIRECTW) {
923 /*
924 * this bit of trickery substitutes a kernel buffer for
925 * the process that might be going away.
926 */
927 pipe_clone_write_buffer(wpipe);
928 } else {
929 pipe_destroy_write_buffer(wpipe);
930 }
931 pipeunlock(wpipe);
932 return (error);
933
934 error1:
935 wakeup(wpipe);
936 return (error);
937 }
938 #endif /* !PIPE_NODIRECT */
939 #endif /* FreeBSD */
940
941 #ifdef __NetBSD__
942 #ifndef PIPE_NODIRECT
943 /*
944 * Allocate structure for loan transfer.
945 */
946 static __inline int
947 pipe_loan_alloc(wpipe, npages, blen)
948 struct pipe *wpipe;
949 int npages;
950 vsize_t blen;
951 {
952 wpipe->pipe_map.kva = uvm_km_valloc_wait(kernel_map, blen);
953 if (wpipe->pipe_map.kva == NULL)
954 return (ENOMEM);
955
956 amountpipekva += blen;
957 wpipe->pipe_map.npages = npages;
958 wpipe->pipe_map.ms = (struct vm_page **) malloc(
959 npages * sizeof(struct vm_page *), M_PIPE, M_WAITOK);
960
961 return (0);
962 }
963
964 /*
965 * Free resources allocated for loan transfer.
966 */
967 static void
968 pipe_loan_free(wpipe)
969 struct pipe *wpipe;
970 {
971 uvm_km_free(kernel_map, wpipe->pipe_map.kva,
972 wpipe->pipe_map.npages * PAGE_SIZE);
973 wpipe->pipe_map.kva = NULL;
974 amountpipekva -= wpipe->pipe_map.npages * PAGE_SIZE;
975 free(wpipe->pipe_map.ms, M_PIPE);
976 wpipe->pipe_map.ms = NULL;
977 }
978
979 /*
980 * NetBSD direct write, using uvm_loan() mechanism.
981 * This implements the pipe buffer write mechanism. Note that only
982 * a direct write OR a normal pipe write can be pending at any given time.
983 * If there are any characters in the pipe buffer, the direct write will
984 * be deferred until the receiving process grabs all of the bytes from
985 * the pipe buffer. Then the direct mapping write is set-up.
986 */
987 static __inline int
988 pipe_direct_write(wpipe, uio)
989 struct pipe *wpipe;
990 struct uio *uio;
991 {
992 int error, npages, j;
993 struct vm_page **res = NULL;
994 vaddr_t bbase, kva, base, bend;
995 vsize_t blen, bcnt;
996 voff_t bpos;
997
998 retry:
999 while (wpipe->pipe_state & PIPE_DIRECTW) {
1000 if (wpipe->pipe_state & PIPE_WANTR) {
1001 wpipe->pipe_state &= ~PIPE_WANTR;
1002 wakeup(wpipe);
1003 }
1004 wpipe->pipe_state |= PIPE_WANTW;
1005 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0);
1006 if (error)
1007 goto error;
1008 if (wpipe->pipe_state & PIPE_EOF) {
1009 error = EPIPE;
1010 goto error;
1011 }
1012 }
1013 wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
1014 if (wpipe->pipe_buffer.cnt > 0) {
1015 if (wpipe->pipe_state & PIPE_WANTR) {
1016 wpipe->pipe_state &= ~PIPE_WANTR;
1017 wakeup(wpipe);
1018 }
1019
1020 wpipe->pipe_state |= PIPE_WANTW;
1021 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0);
1022 if (error)
1023 goto error;
1024 if (wpipe->pipe_state & PIPE_EOF) {
1025 error = EPIPE;
1026 goto error;
1027 }
1028 goto retry;
1029 }
1030
1031 /*
1032 * Handle first iovec, first PIPE_CHUNK_SIZE bytes. Expect caller
1033 * to deal with short write.
1034 *
1035 * Note: need to deal with buffers not aligned to PAGE_SIZE.
1036 */
1037 bbase = (vaddr_t)uio->uio_iov[0].iov_base;
1038 base = trunc_page(bbase);
1039 bend = round_page(bbase + uio->uio_iov[0].iov_len);
1040 blen = bend - base;
1041 bpos = bbase - base;
1042
1043 if (blen > PIPE_DIRECT_CHUNK) {
1044 blen = PIPE_DIRECT_CHUNK;
1045 bend = base + blen;
1046 bcnt = PIPE_DIRECT_CHUNK - bpos;
1047 } else
1048 bcnt = uio->uio_iov[0].iov_len;
1049
1050 npages = blen / PAGE_SIZE;
1051
1052 wpipe->pipe_map.pos = bpos;
1053 wpipe->pipe_map.cnt = bcnt;
1054
1055 /*
1056 * Free the old kva if we need more pages than we have
1057 * allocated.
1058 */
1059 if (wpipe->pipe_map.kva && npages > wpipe->pipe_map.npages)
1060 pipe_loan_free(wpipe);
1061
1062 /* Allocate new kva. */
1063 if (!wpipe->pipe_map.kva
1064 && (error = pipe_loan_alloc(wpipe, npages, blen)))
1065 goto error;
1066
1067 /* Loan the write buffer memory from writer process */
1068 error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, base, blen,
1069 (void **) wpipe->pipe_map.ms, UVM_LOAN_TOPAGE);
1070 if (error)
1071 goto cleanup;
1072 res = wpipe->pipe_map.ms;
1073
1074 /* Enter the loaned pages to kva */
1075 kva = wpipe->pipe_map.kva;
1076 for(j=0; j < npages; j++, kva += PAGE_SIZE)
1077 pmap_enter(pmap_kernel(), kva, res[j]->phys_addr,
1078 VM_PROT_READ, 0);
1079
1080 wpipe->pipe_state |= PIPE_DIRECTW;
1081 error = 0;
1082 while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
1083 if (wpipe->pipe_state & PIPE_EOF) {
1084 error = EPIPE;
1085 break;
1086 }
1087 if (wpipe->pipe_state & PIPE_WANTR) {
1088 wpipe->pipe_state &= ~PIPE_WANTR;
1089 wakeup(wpipe);
1090 }
1091 pipeselwakeup(wpipe, wpipe);
1092 error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0);
1093 }
1094
1095 if (error)
1096 wpipe->pipe_state &= ~PIPE_DIRECTW;
1097
1098 cleanup:
1099 pipelock(wpipe, 0);
1100 if (error || amountpipekva > maxpipekva)
1101 pipe_loan_free(wpipe);
1102 else if (res)
1103 uvm_unloanpage(res, npages);
1104 pipeunlock(wpipe);
1105
1106 if (error == EPIPE) {
1107 pipeselwakeup(wpipe, wpipe);
1108
1109 /*
1110 * If anything was read from what we offered, return success
1111 * and short write. We return EOF on next write(2).
1112 */
1113 if (wpipe->pipe_map.cnt < bcnt) {
1114 bcnt -= wpipe->pipe_map.cnt;
1115 error = 0;
1116 }
1117 }
1118
1119 if (error) {
1120 error:
1121 wakeup(wpipe);
1122 return (error);
1123 }
1124
1125 uio->uio_offset += bcnt;
1126 uio->uio_resid -= bcnt;
1127
1128 return (0);
1129 }
1130 #endif /* !PIPE_NODIRECT */
1131 #endif /* NetBSD */
1132
1133 #ifdef __FreeBSD__
1134 static int
1135 pipe_write(fp, uio, cred, flags, p)
1136 struct file *fp;
1137 off_t *offset;
1138 struct uio *uio;
1139 struct ucred *cred;
1140 int flags;
1141 struct proc *p;
1142 #elif defined(__NetBSD__)
1143 static int
1144 pipe_write(fp, offset, uio, cred, flags)
1145 struct file *fp;
1146 off_t *offset;
1147 struct uio *uio;
1148 struct ucred *cred;
1149 int flags;
1150 #endif
1151 {
1152 int error = 0;
1153 int orig_resid;
1154 struct pipe *wpipe, *rpipe;
1155
1156 rpipe = (struct pipe *) fp->f_data;
1157 wpipe = rpipe->pipe_peer;
1158
1159 /*
1160 * detect loss of pipe read side, issue SIGPIPE if lost.
1161 */
1162 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF))
1163 return (EPIPE);
1164
1165 ++wpipe->pipe_busy;
1166
1167 /*
1168 * If it is advantageous to resize the pipe buffer, do
1169 * so.
1170 */
1171 if ((uio->uio_resid > PIPE_SIZE) &&
1172 (nbigpipe < maxbigpipes) &&
1173 #ifndef PIPE_NODIRECT
1174 (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
1175 #endif
1176 (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
1177 (wpipe->pipe_buffer.cnt == 0)) {
1178
1179 if ((error = pipelock(wpipe,1)) == 0) {
1180 if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
1181 nbigpipe++;
1182 pipeunlock(wpipe);
1183 } else {
1184 /*
1185 * If an error occured, unbusy and return, waking up any
1186 * pending readers.
1187 */
1188 --wpipe->pipe_busy;
1189 if (wpipe->pipe_busy == 0
1190 && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1191 wpipe->pipe_state &=
1192 ~(PIPE_WANTCLOSE | PIPE_WANTR);
1193 wakeup(wpipe);
1194 }
1195
1196 return (error);
1197 }
1198 }
1199
1200 #ifdef __FreeBSD__
1201 KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone"));
1202 #endif
1203
1204 orig_resid = uio->uio_resid;
1205 while (uio->uio_resid) {
1206 int space;
1207
1208 #ifndef PIPE_NODIRECT
1209 /*
1210 * If the transfer is large, we can gain performance if
1211 * we do process-to-process copies directly.
1212 * If the write is non-blocking, we don't use the
1213 * direct write mechanism.
1214 *
1215 * The direct write mechanism will detect the reader going
1216 * away on us.
1217 */
1218 if ((uio->uio_iov[0].iov_len >= PIPE_MINDIRECT) &&
1219 (uio->uio_offset == 0) &&
1220 (fp->f_flag & FNONBLOCK) == 0 &&
1221 (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
1222 error = pipe_direct_write(wpipe, uio);
1223
1224 /*
1225 * We either errorred, wrote whole buffer, or
1226 * wrote part of buffer. If the error is ENOMEM,
1227 * we failed to allocate some resources for direct
1228 * write and fall back to ordinary write. Otherwise,
1229 * break out now.
1230 */
1231 if (error != ENOMEM)
1232 break;
1233 }
1234 #endif /* PIPE_NODIRECT */
1235
1236 /*
1237 * Pipe buffered writes cannot be coincidental with
1238 * direct writes. We wait until the currently executing
1239 * direct write is completed before we start filling the
1240 * pipe buffer. We break out if a signal occurs or the
1241 * reader goes away.
1242 */
1243 retrywrite:
1244 while (wpipe->pipe_state & PIPE_DIRECTW) {
1245 if (wpipe->pipe_state & PIPE_WANTR) {
1246 wpipe->pipe_state &= ~PIPE_WANTR;
1247 wakeup(wpipe);
1248 }
1249 error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0);
1250 if (wpipe->pipe_state & PIPE_EOF)
1251 break;
1252 if (error)
1253 break;
1254 }
1255 if (wpipe->pipe_state & PIPE_EOF) {
1256 error = EPIPE;
1257 break;
1258 }
1259
1260 space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1261
1262 /* Writes of size <= PIPE_BUF must be atomic. */
1263 if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1264 space = 0;
1265
1266 if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
1267 int size; /* Transfer size */
1268 int segsize; /* first segment to transfer */
1269
1270 if ((error = pipelock(wpipe,1)) != 0)
1271 break;
1272
1273 /*
1274 * It is possible for a direct write to
1275 * slip in on us... handle it here...
1276 */
1277 if (wpipe->pipe_state & PIPE_DIRECTW) {
1278 pipeunlock(wpipe);
1279 goto retrywrite;
1280 }
1281 /*
1282 * If a process blocked in uiomove, our
1283 * value for space might be bad.
1284 *
1285 * XXX will we be ok if the reader has gone
1286 * away here?
1287 */
1288 if (space > wpipe->pipe_buffer.size -
1289 wpipe->pipe_buffer.cnt) {
1290 pipeunlock(wpipe);
1291 goto retrywrite;
1292 }
1293
1294 /*
1295 * Transfer size is minimum of uio transfer
1296 * and free space in pipe buffer.
1297 */
1298 if (space > uio->uio_resid)
1299 size = uio->uio_resid;
1300 else
1301 size = space;
1302 /*
1303 * First segment to transfer is minimum of
1304 * transfer size and contiguous space in
1305 * pipe buffer. If first segment to transfer
1306 * is less than the transfer size, we've got
1307 * a wraparound in the buffer.
1308 */
1309 segsize = wpipe->pipe_buffer.size -
1310 wpipe->pipe_buffer.in;
1311 if (segsize > size)
1312 segsize = size;
1313
1314 /* Transfer first segment */
1315
1316 error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1317 segsize, uio);
1318
1319 if (error == 0 && segsize < size) {
1320 /*
1321 * Transfer remaining part now, to
1322 * support atomic writes. Wraparound
1323 * happened.
1324 */
1325 #ifdef DEBUG
1326 if (wpipe->pipe_buffer.in + segsize !=
1327 wpipe->pipe_buffer.size)
1328 panic("Expected pipe buffer wraparound disappeared");
1329 #endif
1330
1331 error = uiomove(&wpipe->pipe_buffer.buffer[0],
1332 size - segsize, uio);
1333 }
1334 if (error == 0) {
1335 wpipe->pipe_buffer.in += size;
1336 if (wpipe->pipe_buffer.in >=
1337 wpipe->pipe_buffer.size) {
1338 #ifdef DEBUG
1339 if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
1340 panic("Expected wraparound bad");
1341 #endif
1342 wpipe->pipe_buffer.in = size - segsize;
1343 }
1344
1345 wpipe->pipe_buffer.cnt += size;
1346 #ifdef DEBUG
1347 if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
1348 panic("Pipe buffer overflow");
1349 #endif
1350
1351 }
1352 pipeunlock(wpipe);
1353 if (error)
1354 break;
1355
1356 } else {
1357 /*
1358 * If the "read-side" has been blocked, wake it up now.
1359 */
1360 if (wpipe->pipe_state & PIPE_WANTR) {
1361 wpipe->pipe_state &= ~PIPE_WANTR;
1362 wakeup(wpipe);
1363 }
1364
1365 /*
1366 * don't block on non-blocking I/O
1367 */
1368 if (fp->f_flag & FNONBLOCK) {
1369 error = EAGAIN;
1370 break;
1371 }
1372
1373 /*
1374 * We have no more space and have something to offer,
1375 * wake up select/poll.
1376 */
1377 pipeselwakeup(wpipe, wpipe);
1378
1379 wpipe->pipe_state |= PIPE_WANTW;
1380 error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0);
1381 if (error != 0)
1382 break;
1383 /*
1384 * If read side wants to go away, we just issue a signal
1385 * to ourselves.
1386 */
1387 if (wpipe->pipe_state & PIPE_EOF) {
1388 error = EPIPE;
1389 break;
1390 }
1391 }
1392 }
1393
1394 --wpipe->pipe_busy;
1395 if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANTCLOSE)) {
1396 wpipe->pipe_state &= ~(PIPE_WANTCLOSE | PIPE_WANTR);
1397 wakeup(wpipe);
1398 } else if (wpipe->pipe_buffer.cnt > 0) {
1399 /*
1400 * If we have put any characters in the buffer, we wake up
1401 * the reader.
1402 */
1403 if (wpipe->pipe_state & PIPE_WANTR) {
1404 wpipe->pipe_state &= ~PIPE_WANTR;
1405 wakeup(wpipe);
1406 }
1407 }
1408
1409 /*
1410 * Don't return EPIPE if I/O was successful
1411 */
1412 if ((error == EPIPE) && (wpipe->pipe_buffer.cnt == 0)
1413 && (uio->uio_resid == 0))
1414 error = 0;
1415
1416 if (error == 0)
1417 vfs_timestamp(&wpipe->pipe_mtime);
1418
1419 /*
1420 * We have something to offer, wake up select/poll.
1421 * wpipe->pipe_map.cnt is always 0 in this point (direct write
1422 * is only done synchronously), so check wpipe->only pipe_buffer.cnt
1423 */
1424 if (wpipe->pipe_buffer.cnt)
1425 pipeselwakeup(wpipe, wpipe);
1426
1427 /*
1428 * Arrange for next read(2) to do a signal.
1429 */
1430 wpipe->pipe_state |= PIPE_SIGNALR;
1431
1432 return (error);
1433 }
1434
1435 /*
1436 * we implement a very minimal set of ioctls for compatibility with sockets.
1437 */
1438 int
1439 pipe_ioctl(fp, cmd, data, p)
1440 struct file *fp;
1441 u_long cmd;
1442 caddr_t data;
1443 struct proc *p;
1444 {
1445 struct pipe *mpipe = (struct pipe *)fp->f_data;
1446
1447 switch (cmd) {
1448
1449 case FIONBIO:
1450 return (0);
1451
1452 case FIOASYNC:
1453 if (*(int *)data) {
1454 mpipe->pipe_state |= PIPE_ASYNC;
1455 } else {
1456 mpipe->pipe_state &= ~PIPE_ASYNC;
1457 }
1458 return (0);
1459
1460 case FIONREAD:
1461 #ifndef PIPE_NODIRECT
1462 if (mpipe->pipe_state & PIPE_DIRECTW)
1463 *(int *)data = mpipe->pipe_map.cnt;
1464 else
1465 #endif
1466 *(int *)data = mpipe->pipe_buffer.cnt;
1467 return (0);
1468
1469 #ifdef __FreeBSD__
1470 case FIOSETOWN:
1471 return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1472
1473 case FIOGETOWN:
1474 *(int *)data = fgetown(mpipe->pipe_sigio);
1475 return (0);
1476
1477 /* This is deprecated, FIOSETOWN should be used instead. */
1478 case TIOCSPGRP:
1479 return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1480
1481 /* This is deprecated, FIOGETOWN should be used instead. */
1482 case TIOCGPGRP:
1483 *(int *)data = -fgetown(mpipe->pipe_sigio);
1484 return (0);
1485 #endif /* FreeBSD */
1486 #ifdef __NetBSD__
1487 case TIOCSPGRP:
1488 mpipe->pipe_pgid = *(int *)data;
1489 return (0);
1490
1491 case TIOCGPGRP:
1492 *(int *)data = mpipe->pipe_pgid;
1493 return (0);
1494 #endif /* NetBSD */
1495
1496 }
1497 return (ENOTTY);
1498 }
1499
1500 int
1501 pipe_poll(fp, events, p)
1502 struct file *fp;
1503 int events;
1504 struct proc *p;
1505 {
1506 struct pipe *rpipe = (struct pipe *)fp->f_data;
1507 struct pipe *wpipe;
1508 int revents = 0;
1509
1510 wpipe = rpipe->pipe_peer;
1511 if (events & (POLLIN | POLLRDNORM))
1512 if ((rpipe->pipe_buffer.cnt > 0) ||
1513 #ifndef PIPE_NODIRECT
1514 (rpipe->pipe_state & PIPE_DIRECTW) ||
1515 #endif
1516 (rpipe->pipe_state & PIPE_EOF))
1517 revents |= events & (POLLIN | POLLRDNORM);
1518
1519 if (events & (POLLOUT | POLLWRNORM))
1520 if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)
1521 || (
1522 #ifndef PIPE_NODIRECT
1523 ((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1524 #endif
1525 (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1526 revents |= events & (POLLOUT | POLLWRNORM);
1527
1528 if ((rpipe->pipe_state & PIPE_EOF) ||
1529 (wpipe == NULL) ||
1530 (wpipe->pipe_state & PIPE_EOF))
1531 revents |= POLLHUP;
1532
1533 if (revents == 0) {
1534 if (events & (POLLIN | POLLRDNORM)) {
1535 selrecord(p, &rpipe->pipe_sel);
1536 rpipe->pipe_state |= PIPE_SEL;
1537 }
1538
1539 if (events & (POLLOUT | POLLWRNORM)) {
1540 selrecord(p, &wpipe->pipe_sel);
1541 wpipe->pipe_state |= PIPE_SEL;
1542 }
1543 }
1544
1545 return (revents);
1546 }
1547
1548 static int
1549 pipe_stat(fp, ub, p)
1550 struct file *fp;
1551 struct stat *ub;
1552 struct proc *p;
1553 {
1554 struct pipe *pipe = (struct pipe *)fp->f_data;
1555
1556 bzero((caddr_t)ub, sizeof(*ub));
1557 ub->st_mode = S_IFIFO;
1558 ub->st_blksize = pipe->pipe_buffer.size;
1559 ub->st_size = pipe->pipe_buffer.cnt;
1560 ub->st_blocks = (ub->st_size) ? 1 : 0;
1561 #ifdef __FreeBSD__
1562 ub->st_atimespec = pipe->pipe_atime;
1563 ub->st_mtimespec = pipe->pipe_mtime;
1564 ub->st_ctimespec = pipe->pipe_ctime;
1565 #endif /* FreeBSD */
1566 #ifdef __NetBSD__
1567 TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec)
1568 TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
1569 TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
1570 #endif /* NetBSD */
1571 ub->st_uid = fp->f_cred->cr_uid;
1572 ub->st_gid = fp->f_cred->cr_gid;
1573 /*
1574 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1575 * XXX (st_dev, st_ino) should be unique.
1576 */
1577 return (0);
1578 }
1579
1580 /* ARGSUSED */
1581 static int
1582 pipe_close(fp, p)
1583 struct file *fp;
1584 struct proc *p;
1585 {
1586 struct pipe *cpipe = (struct pipe *)fp->f_data;
1587
1588 #ifdef __FreeBSD__
1589 fp->f_ops = &badfileops;
1590 funsetown(cpipe->pipe_sigio);
1591 #endif
1592 fp->f_data = NULL;
1593 pipeclose(cpipe);
1594 return (0);
1595 }
1596
1597 static void
1598 pipe_free_kmem(cpipe)
1599 struct pipe *cpipe;
1600 {
1601
1602 #ifdef __FreeBSD__
1603 mtx_assert(&vm_mtx, MA_OWNED);
1604 #endif
1605 if (cpipe->pipe_buffer.buffer != NULL) {
1606 if (cpipe->pipe_buffer.size > PIPE_SIZE)
1607 --nbigpipe;
1608 amountpipekva -= cpipe->pipe_buffer.size;
1609 #ifdef __FreeBSD__
1610 kmem_free(kernel_map,
1611 (vm_offset_t)cpipe->pipe_buffer.buffer,
1612 cpipe->pipe_buffer.size);
1613 #elif defined(__NetBSD__)
1614 uvm_km_free(kernel_map,
1615 (vaddr_t)cpipe->pipe_buffer.buffer,
1616 cpipe->pipe_buffer.size);
1617 #endif /* NetBSD */
1618
1619 cpipe->pipe_buffer.buffer = NULL;
1620 }
1621 #ifndef PIPE_NODIRECT
1622 if (cpipe->pipe_map.kva != NULL) {
1623 #ifdef __FreeBSD__
1624 amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
1625 kmem_free(kernel_map,
1626 cpipe->pipe_map.kva,
1627 cpipe->pipe_buffer.size + PAGE_SIZE);
1628 #elif defined(__NetBSD__)
1629 pipe_loan_free(cpipe);
1630 #endif /* NetBSD */
1631 cpipe->pipe_map.cnt = 0;
1632 cpipe->pipe_map.kva = NULL;
1633 cpipe->pipe_map.pos = 0;
1634 cpipe->pipe_map.npages = 0;
1635 }
1636 #endif /* !PIPE_NODIRECT */
1637 }
1638
1639 /*
1640 * shutdown the pipe
1641 */
1642 static void
1643 pipeclose(cpipe)
1644 struct pipe *cpipe;
1645 {
1646 struct pipe *ppipe;
1647
1648 if (!cpipe)
1649 return;
1650
1651 pipeselwakeup(cpipe, cpipe);
1652
1653 /*
1654 * If the other side is blocked, wake it up saying that
1655 * we want to close it down.
1656 */
1657 while (cpipe->pipe_busy) {
1658 wakeup(cpipe);
1659 cpipe->pipe_state |= PIPE_WANTCLOSE | PIPE_EOF;
1660 tsleep(cpipe, PRIBIO, "pipecl", 0);
1661 }
1662
1663 /*
1664 * Disconnect from peer
1665 */
1666 if ((ppipe = cpipe->pipe_peer) != NULL) {
1667 pipeselwakeup(ppipe, ppipe);
1668
1669 ppipe->pipe_state |= PIPE_EOF;
1670 wakeup(ppipe);
1671 ppipe->pipe_peer = NULL;
1672 }
1673
1674 /*
1675 * free resources
1676 */
1677 #ifdef _FreeBSD__
1678 mtx_lock(&vm_mtx);
1679 pipe_free_kmem(cpipe);
1680 /* XXX: erm, doesn't zalloc already have its own locks and
1681 * not need the giant vm lock?
1682 */
1683 zfree(pipe_zone, cpipe);
1684 mtx_unlock(&vm_mtx);
1685 #endif /* FreeBSD */
1686
1687 #ifdef __NetBSD__
1688 pipe_free_kmem(cpipe);
1689 (void) lockmgr(&cpipe->pipe_lock, LK_DRAIN, NULL);
1690 pool_put(&pipe_pool, cpipe);
1691 #endif
1692 }
1693
1694 #ifdef __FreeBSD__
1695 /*ARGSUSED*/
1696 static int
1697 pipe_kqfilter(struct file *fp, struct knote *kn)
1698 {
1699 struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1700
1701 switch (kn->kn_filter) {
1702 case EVFILT_READ:
1703 kn->kn_fop = &pipe_rfiltops;
1704 break;
1705 case EVFILT_WRITE:
1706 kn->kn_fop = &pipe_wfiltops;
1707 cpipe = cpipe->pipe_peer;
1708 break;
1709 default:
1710 return (1);
1711 }
1712 kn->kn_hook = (caddr_t)cpipe;
1713
1714 SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1715 return (0);
1716 }
1717
1718 static void
1719 filt_pipedetach(struct knote *kn)
1720 {
1721 struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1722
1723 SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1724 }
1725
1726 /*ARGSUSED*/
1727 static int
1728 filt_piperead(struct knote *kn, long hint)
1729 {
1730 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1731 struct pipe *wpipe = rpipe->pipe_peer;
1732
1733 kn->kn_data = rpipe->pipe_buffer.cnt;
1734 if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1735 kn->kn_data = rpipe->pipe_map.cnt;
1736
1737 if ((rpipe->pipe_state & PIPE_EOF) ||
1738 (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1739 kn->kn_flags |= EV_EOF;
1740 return (1);
1741 }
1742 return (kn->kn_data > 0);
1743 }
1744
1745 /*ARGSUSED*/
1746 static int
1747 filt_pipewrite(struct knote *kn, long hint)
1748 {
1749 struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1750 struct pipe *wpipe = rpipe->pipe_peer;
1751
1752 if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1753 kn->kn_data = 0;
1754 kn->kn_flags |= EV_EOF;
1755 return (1);
1756 }
1757 kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1758 if (wpipe->pipe_state & PIPE_DIRECTW)
1759 kn->kn_data = 0;
1760
1761 return (kn->kn_data >= PIPE_BUF);
1762 }
1763 #endif /* FreeBSD */
1764
1765 #ifdef __NetBSD__
1766 static int
1767 pipe_fcntl(fp, cmd, data, p)
1768 struct file *fp;
1769 u_int cmd;
1770 caddr_t data;
1771 struct proc *p;
1772 {
1773 if (cmd == F_SETFL)
1774 return (0);
1775 else
1776 return (EOPNOTSUPP);
1777 }
1778
1779 /*
1780 * Handle pipe sysctls.
1781 */
1782 int
1783 sysctl_dopipe(name, namelen, oldp, oldlenp, newp, newlen)
1784 int *name;
1785 u_int namelen;
1786 void *oldp;
1787 size_t *oldlenp;
1788 void *newp;
1789 size_t newlen;
1790 {
1791 /* All sysctl names at this level are terminal. */
1792 if (namelen != 1)
1793 return (ENOTDIR); /* overloaded */
1794
1795 switch (name[0]) {
1796 case KERN_PIPE_MAXKVASZ:
1797 return (sysctl_int(oldp, oldlenp, newp, newlen, &maxpipekva));
1798 case KERN_PIPE_LIMITKVA:
1799 return (sysctl_int(oldp, oldlenp, newp, newlen, &limitpipekva));
1800 case KERN_PIPE_MAXBIGPIPES:
1801 return (sysctl_int(oldp, oldlenp, newp, newlen, &maxbigpipes));
1802 case KERN_PIPE_NBIGPIPES:
1803 return (sysctl_rdint(oldp, oldlenp, newp, nbigpipe));
1804 case KERN_PIPE_KVASIZE:
1805 return (sysctl_rdint(oldp, oldlenp, newp, amountpipekva));
1806 default:
1807 return (EOPNOTSUPP);
1808 }
1809 /* NOTREACHED */
1810 }
1811
1812 /*
1813 * Initialize pipe structs.
1814 */
1815 void
1816 pipe_init(void)
1817 {
1818 pool_init(&pipe_pool, sizeof(struct pipe), 0, 0, 0, "pipepl",
1819 0, NULL, NULL, M_PIPE);
1820 }
1821
#endif /* __NetBSD__ */
1823