rcache.c revision 1.14 1 /* $NetBSD: rcache.c,v 1.14 2003/02/04 08:06:42 enami Exp $ */
2
3 /*-
4 * Copyright (c) 1999 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Martin J. Laubach <mjl (at) emsi.priv.at> and
9 * Manuel Bouyer <Manuel.Bouyer (at) lip6.fr>.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the NetBSD
22 * Foundation, Inc. and its contributors.
23 * 4. Neither the name of The NetBSD Foundation nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 * POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 #include <sys/cdefs.h>
41 #ifndef lint
42 __RCSID("$NetBSD: rcache.c,v 1.14 2003/02/04 08:06:42 enami Exp $");
43 #endif /* not lint */
44
45 #include <sys/types.h>
46 #include <sys/uio.h>
47 #include <sys/mman.h>
48 #include <sys/param.h>
49 #include <sys/sysctl.h>
50 #include <ufs/ufs/dinode.h>
51
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <unistd.h>
55 #include <fcntl.h>
56 #include <errno.h>
57 #include <string.h>
58
59 #include "dump.h"
60
61 /*-----------------------------------------------------------------------*/
/* Upper limit on the number of cache buffers initcache() will allocate. */
#define	MAXCACHEBUFS	512
/*
 * Divisor applied to hw.usermem when the cache is sized automatically:
 * the data buffers get at most 1/MAXMEMPART (one sixth) of user memory.
 */
#define	MAXMEMPART	6
64
65 /*-----------------------------------------------------------------------*/
66 union cdesc {
67 volatile size_t cd_count;
68 struct {
69 volatile daddr_t blkstart;
70 volatile daddr_t blkend; /* start + nblksread */
71 volatile daddr_t blocksRead;
72 volatile size_t time;
73 #ifdef DIAGNOSTICS
74 volatile pid_t owner;
75 #endif
76 } desc;
77 #define cd_blkstart desc.blkstart
78 #define cd_blkend desc.blkend
79 #define cd_blocksRead desc.blocksRead
80 #define cd_time desc.time
81 #define cd_owner desc.owner
82 };
83
static int findlru(void);

/*
 * The cache lives in one shared mapping laid out as: a header descriptor,
 * `cachebufs' buffer descriptors, then `cachebufs' data buffers of
 * nblksread * dev_bsize bytes each.
 */
static void *shareBuffer;	/* base of the mapping; bread() bypasses
				 * the cache while this is NULL */
static union cdesc *cheader;	/* header descriptor (stamp counter) */
static union cdesc *cdesc;	/* array of per-buffer descriptors */
static char *cdata;		/* first byte of the data buffers */
static int cachebufs;		/* number of cache buffers */
static int nblksread;		/* blocks held by one cache buffer */

#ifdef STATS
static int nreads;		/* calls to bread() */
static int nphysread;		/* reads issued to the disk */
static int64_t readsize;	/* bytes asked for through bread() */
static int64_t physreadsize;	/* bytes asked for from the disk */
#endif

/* Address of the data buffer that belongs to cache slot i. */
#define	CDATA(i)	(cdata + ((i) * nblksread * dev_bsize))
101
102 void
103 initcache(int cachesize, int readblksize)
104 {
105 size_t len;
106 size_t sharedSize;
107
108 nblksread = howmany(readblksize, ufsib->ufs_bsize);
109 if (cachesize == -1) { /* Compute from memory available */
110 int usermem;
111 int mib[2] = { CTL_HW, HW_USERMEM };
112
113 len = sizeof(usermem);
114 if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) {
115 msg("sysctl(hw.usermem) failed: %s\n",
116 strerror(errno));
117 return;
118 }
119 cachebufs = (usermem / MAXMEMPART) / (nblksread * dev_bsize);
120 } else { /* User specified */
121 cachebufs = cachesize;
122 }
123
124 if (cachebufs) { /* Don't allocate if zero --> no caching */
125 if (cachebufs > MAXCACHEBUFS)
126 cachebufs = MAXCACHEBUFS;
127
128 sharedSize = sizeof(union cdesc) +
129 sizeof(union cdesc) * cachebufs +
130 nblksread * cachebufs * dev_bsize;
131 #ifdef STATS
132 fprintf(stderr, "Using %d buffers (%d bytes)\n", cachebufs,
133 sharedSize);
134 #endif
135 shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE,
136 MAP_ANON | MAP_SHARED, -1, 0);
137 if (shareBuffer == MAP_FAILED) {
138 msg("can't mmap shared memory for buffer: %s\n",
139 strerror(errno));
140 return;
141 }
142 cheader = shareBuffer;
143 cdesc = (union cdesc *) (((char *) shareBuffer) +
144 sizeof(union cdesc));
145 cdata = ((char *) shareBuffer) + sizeof(union cdesc) +
146 sizeof(union cdesc) * cachebufs;
147
148 memset(shareBuffer, '\0', sharedSize);
149 }
150 }
151
152 /*
153 * Find the cache buffer descriptor that shows the minimal access time
154 */
155 static int
156 findlru(void)
157 {
158 int i;
159 size_t minTime = cdesc[0].cd_time;
160 int minIdx = 0;
161
162 for (i = 0; i < cachebufs; i++) {
163 if (cdesc[i].cd_time < minTime) {
164 minIdx = i;
165 minTime = cdesc[i].cd_time;
166 }
167 }
168
169 return minIdx;
170 }
171
172 /*
173 * Read data directly from disk, with smart error handling.
174 * Try to recover from hard errors by reading in sector sized pieces.
175 * Error recovery is attempted at most BREADEMAX times before seeking
176 * consent from the operator to continue.
177 */
178
179 static int breaderrors = 0;
180 #define BREADEMAX 32
181
182 void
183 rawread(daddr_t blkno, char *buf, int size)
184 {
185 int cnt, i;
186
187 #ifdef STATS
188 nphysread++;
189 physreadsize += size;
190 #endif
191
192 loop:
193 if (lseek(diskfd, ((off_t) blkno << dev_bshift), SEEK_SET) < 0) {
194 msg("rawread: lseek fails\n");
195 goto err;
196 }
197 if ((cnt = read(diskfd, buf, size)) == size)
198 return;
199 if (blkno + (size / dev_bsize) > ufsib->ufs_dsize) {
200 /*
201 * Trying to read the final fragment.
202 *
203 * NB - dump only works in TP_BSIZE blocks, hence
204 * rounds `dev_bsize' fragments up to TP_BSIZE pieces.
205 * It should be smarter about not actually trying to
206 * read more than it can get, but for the time being
207 * we punt and scale back the read only when it gets
208 * us into trouble. (mkm 9/25/83)
209 */
210 size -= dev_bsize;
211 goto loop;
212 }
213 if (cnt == -1)
214 msg("read error from %s: %s: [block %lld]: count=%d\n",
215 disk, strerror(errno), (long long)blkno, size);
216 else
217 msg("short read error from %s: [block %lld]: "
218 "count=%d, got=%d\n",
219 disk, (long long)blkno, size, cnt);
220 err:
221 if (++breaderrors > BREADEMAX) {
222 msg("More than %d block read errors from %s\n",
223 BREADEMAX, disk);
224 broadcast("DUMP IS AILING!\n");
225 msg("This is an unrecoverable error.\n");
226 if (!query("Do you want to attempt to continue?")) {
227 dumpabort(0);
228 /*NOTREACHED*/
229 } else
230 breaderrors = 0;
231 }
232 /*
233 * Zero buffer, then try to read each sector of buffer separately.
234 */
235 memset(buf, 0, size);
236 for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) {
237 if (lseek(diskfd, ((off_t)blkno << dev_bshift),
238 SEEK_SET) < 0) {
239 msg("rawread: lseek2 fails: %s!\n",
240 strerror(errno));
241 continue;
242 }
243 if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize)
244 continue;
245 if (cnt == -1) {
246 msg("read error from %s: %s: [sector %lld]: "
247 "count=%ld: %s\n", disk, strerror(errno),
248 (long long)blkno,
249 dev_bsize, strerror(errno));
250 continue;
251 }
252 msg("short read error from %s: [sector %lld]: "
253 "count=%ld, got=%d\n",
254 disk, (long long)blkno, dev_bsize, cnt);
255 }
256 }
257
258 void
259 bread(daddr_t blkno, char *buf, int size)
260 {
261 int osize = size;
262 daddr_t oblkno = blkno;
263 char *obuf = buf;
264 daddr_t numBlocks = howmany(size, dev_bsize);
265
266 #ifdef STATS
267 nreads++;
268 readsize += size;
269 #endif
270
271 if (!shareBuffer) {
272 rawread(blkno, buf, size);
273 return;
274 }
275
276 if (flock(diskfd, LOCK_EX)) {
277 msg("flock(LOCK_EX) failed: %s\n",
278 strerror(errno));
279 rawread(blkno, buf, size);
280 return;
281 }
282
283 retry:
284 while(size > 0) {
285 int i;
286
287 for (i = 0; i < cachebufs; i++) {
288 union cdesc *curr = &cdesc[i];
289
290 #ifdef DIAGNOSTICS
291 if (curr->cd_owner) {
292 fprintf(stderr, "Owner is set (%d, me=%d), can"
293 "not happen.\n", curr->cd_owner, getpid());
294 }
295 #endif
296
297 if (curr->cd_blkend == 0)
298 continue;
299 /*
300 * If we find a bit of the read in the buffers,
301 * now compute how many blocks we can copy,
302 * copy them out, adjust blkno, buf and size,
303 * and restart
304 */
305 if (curr->cd_blkstart <= blkno &&
306 blkno < curr->cd_blkend) {
307 /* Number of data blocks to be copied */
308 int toCopy = MIN(size,
309 (curr->cd_blkend - blkno) * dev_bsize);
310 #ifdef DIAGNOSTICS
311 if (toCopy <= 0 ||
312 toCopy > nblksread * dev_bsize) {
313 fprintf(stderr, "toCopy %d !\n",
314 toCopy);
315 dumpabort(0);
316 }
317 if (CDATA(i) + (blkno - curr->cd_blkstart) *
318 dev_bsize < CDATA(i) ||
319 CDATA(i) + (blkno - curr->cd_blkstart) *
320 dev_bsize >
321 CDATA(i) + nblksread * dev_bsize) {
322 fprintf(stderr, "%p < %p !!!\n",
323 CDATA(i) + (blkno -
324 curr->cd_blkstart) * dev_bsize,
325 CDATA(i));
326 fprintf(stderr, "cdesc[i].cd_blkstart %d "
327 "blkno %d dev_bsize %ld\n",
328 curr->cd_blkstart, blkno, dev_bsize);
329 dumpabort(0);
330 }
331 #endif
332 memcpy(buf, CDATA(i) +
333 (blkno - curr->cd_blkstart) * dev_bsize,
334 toCopy);
335
336 buf += toCopy;
337 size -= toCopy;
338 blkno += howmany(toCopy, dev_bsize);
339 numBlocks -= howmany(toCopy, dev_bsize);
340
341 curr->cd_time = cheader->cd_count++;
342
343 /*
344 * If all data of a cache block have been
345 * read, chances are good no more reads
346 * will occur, so expire the cache immediately
347 */
348
349 curr->cd_blocksRead +=
350 howmany(toCopy, dev_bsize);
351 if (curr->cd_blocksRead >= nblksread)
352 curr->cd_time = 0;
353
354 goto retry;
355 }
356 }
357
358 /* No more to do? */
359 if (size == 0)
360 break;
361
362 /*
363 * This does actually not happen if fs blocks are not greater
364 * than nblksread.
365 */
366 if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) {
367 rawread(oblkno, obuf, osize);
368 break;
369 } else {
370 int idx;
371 ssize_t rsize;
372 daddr_t blockBlkNo;
373
374 blockBlkNo = (blkno / nblksread) * nblksread;
375 idx = findlru();
376 rsize = MIN(nblksread,
377 ufsib->ufs_dsize - blockBlkNo) *
378 dev_bsize;
379
380 #ifdef DIAGNOSTICS
381 if (cdesc[idx].cd_owner)
382 fprintf(stderr, "Owner is set (%d, me=%d), can"
383 "not happen(2).\n", cdesc[idx].cd_owner,
384 getpid());
385 cdesc[idx].cd_owner = getpid();
386 #endif
387 cdesc[idx].cd_time = cheader->cd_count++;
388 cdesc[idx].cd_blkstart = blockBlkNo;
389 cdesc[idx].cd_blocksRead = 0;
390
391 if (lseek(diskfd, ((off_t) blockBlkNo << dev_bshift),
392 SEEK_SET) < 0) {
393 msg("readBlocks: lseek fails: %s\n",
394 strerror(errno));
395 rsize = -1;
396 } else {
397 rsize = read(diskfd, CDATA(idx), rsize);
398 if (rsize < 0) {
399 msg("readBlocks: read fails: %s\n",
400 strerror(errno));
401 }
402 }
403
404 /* On errors, panic, punt, try to read without
405 * cache and let raw read routine do the rest.
406 */
407
408 if (rsize <= 0) {
409 rawread(oblkno, obuf, osize);
410 #ifdef DIAGNOSTICS
411 if (cdesc[idx].cd_owner != getpid())
412 fprintf(stderr, "Owner changed from "
413 "%d to %d, can't happen\n",
414 getpid(), cdesc[idx].cd_owner);
415 cdesc[idx].cd_owner = 0;
416 #endif
417 break;
418 }
419
420 /* On short read, just note the fact and go on */
421 cdesc[idx].cd_blkend = blockBlkNo + rsize / dev_bsize;
422
423 #ifdef STATS
424 nphysread++;
425 physreadsize += rsize;
426 #endif
427 #ifdef DIAGNOSTICS
428 if (cdesc[idx].cd_owner != getpid())
429 fprintf(stderr, "Owner changed from "
430 "%d to %d, can't happen\n",
431 getpid(), cdesc[idx].cd_owner);
432 cdesc[idx].cd_owner = 0;
433 #endif
434 /*
435 * We swapped some of data in, let the loop fetch
436 * them from cache
437 */
438 }
439 }
440
441 if (flock(diskfd, LOCK_UN))
442 msg("flock(LOCK_UN) failed: %s\n",
443 strerror(errno));
444 }
445
/*
 * printcachestats --
 *	When built with STATS, print this process's read statistics on
 *	stderr: number and size of the reads requested through bread(),
 *	number and size of the reads issued to the disk, the share of
 *	requests that needed no disk read and the relative amount of extra
 *	data read.  Without STATS this does nothing.
 */
void
printcachestats(void)
{

#ifdef STATS
	/* A process that read nothing must not divide by zero. */
	int hits = nreads > 0 ? (nreads - nphysread) * 100 / nreads : 0;
	int overhead = readsize > 0 ?
	    (int) (((physreadsize - readsize) * 100) / readsize) : 0;

	/* The byte counters are 64 bits wide; print them untruncated. */
	fprintf(stderr, "Pid %d: %d reads (%lld bytes) "
	    "%d physical reads (%lld bytes) %d%% hits, %d%% overhead\n",
	    (int) getpid(), nreads, (long long) readsize, nphysread,
	    (long long) physreadsize, hits, overhead);
#endif
}
458