rcache.c revision 1.13 1 /* $NetBSD: rcache.c,v 1.13 2003/02/03 23:08:37 hannken Exp $ */
2
3 /*-
4 * Copyright (c) 1999 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Martin J. Laubach <mjl (at) emsi.priv.at> and
9 * Manuel Bouyer <Manuel.Bouyer (at) lip6.fr>.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the NetBSD
22 * Foundation, Inc. and its contributors.
23 * 4. Neither the name of The NetBSD Foundation nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 * POSSIBILITY OF SUCH DAMAGE.
38 */
39
40 #include <sys/cdefs.h>
41 #ifndef lint
42 __RCSID("$NetBSD: rcache.c,v 1.13 2003/02/03 23:08:37 hannken Exp $");
43 #endif /* not lint */
44
45 #include <sys/types.h>
46 #include <sys/uio.h>
47 #include <sys/mman.h>
48 #include <sys/param.h>
49 #include <sys/sysctl.h>
50 #include <ufs/ufs/dinode.h>
51
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <unistd.h>
55 #include <fcntl.h>
56 #include <errno.h>
57 #include <string.h>
58
59 #include "dump.h"
60
61 /*-----------------------------------------------------------------------*/
/* Hard upper bound on the number of cache buffers that get allocated. */
#define MAXCACHEBUFS	512
/*
 * Divisor applied to hw.usermem when the cache is sized automatically:
 * at most 1/MAXMEMPART (one sixth, roughly 16%) of user memory is used.
 */
#define MAXMEMPART	6
64
65 /*-----------------------------------------------------------------------*/
66 union cdesc {
67 volatile size_t cd_count;
68 struct {
69 volatile daddr_t blkstart;
70 volatile daddr_t blkend;/* start + nblksread */
71 volatile daddr_t blocksRead;
72 volatile size_t time;
73 #ifdef DIAGNOSTICS
74 volatile pid_t owner;
75 #endif
76 } desc;
77 #define cd_blkstart desc.blkstart
78 #define cd_blkend desc.blkend
79 #define cd_blocksRead desc.blocksRead
80 #define cd_time desc.time
81 #define cd_owner desc.owner
82 };
83
static int	findlru(void);

/*
 * The shared mapping and the three regions initcache() carves out of it:
 * one header slot, cachebufs descriptors, then the buffer data.
 */
static void		*shareBuffer = NULL;	/* NULL: bread() bypasses the cache */
static union cdesc	*cheader;		/* header slot (cd_count) */
static union cdesc	*cdesc;			/* descriptor array */
static char		*cdata;			/* start of the buffer data */
static int		 cachebufs;		/* number of cache buffers */
static int		 nblksread;		/* dev_bsize blocks per cache buffer */

#ifdef STATS
/* Per-process counters reported by printcachestats(). */
static int	nreads;		/* calls to bread() */
static int	nphysread;	/* reads actually issued to the disk */
static int64_t	readsize;	/* bytes requested through bread() */
static int64_t	physreadsize;	/* bytes requested from the disk */
#endif

/* Address of the data area that belongs to cache buffer i. */
#define CDATA(i)	(cdata + ((i) * nblksread * dev_bsize))
101
102 void
103 initcache(int cachesize, int readblksize)
104 {
105 size_t len;
106 size_t sharedSize;
107
108 nblksread = (readblksize + ufsib->ufs_bsize - 1) / ufsib->ufs_bsize;
109 if(cachesize == -1) { /* Compute from memory available */
110 int usermem;
111 int mib[2] = { CTL_HW, HW_USERMEM };
112
113 len = sizeof(usermem);
114 if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) {
115 msg("sysctl(hw.usermem) failed: %s\n", strerror(errno));
116 return;
117 }
118 cachebufs = (usermem / MAXMEMPART) / (nblksread * dev_bsize);
119 } else { /* User specified */
120 cachebufs = cachesize;
121 }
122
123 if(cachebufs) { /* Don't allocate if zero --> no caching */
124 if (cachebufs > MAXCACHEBUFS)
125 cachebufs = MAXCACHEBUFS;
126
127 sharedSize = sizeof(union cdesc) +
128 sizeof(union cdesc) * cachebufs +
129 nblksread * cachebufs * dev_bsize;
130 #ifdef STATS
131 fprintf(stderr, "Using %d buffers (%d bytes)\n", cachebufs,
132 sharedSize);
133 #endif
134 shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE,
135 MAP_ANON | MAP_SHARED, -1, 0);
136 if (shareBuffer == (void *)-1) {
137 msg("can't mmap shared memory for buffer: %s\n",
138 strerror(errno));
139 return;
140 }
141 cheader = shareBuffer;
142 cdesc = (union cdesc *) (((char *) shareBuffer) +
143 sizeof(union cdesc));
144 cdata = ((char *) shareBuffer) + sizeof(union cdesc) +
145 sizeof(union cdesc) * cachebufs;
146
147 memset(shareBuffer, '\0', sharedSize);
148 }
149 }
150
151 /*
152 * Find the cache buffer descriptor that shows the minimal access time
153 */
154 static int
155 findlru(void)
156 {
157 int i;
158 size_t minTime = cdesc[0].cd_time;
159 int minIdx = 0;
160
161 for (i = 0; i < cachebufs; i++) {
162 if (cdesc[i].cd_time < minTime) {
163 minIdx = i;
164 minTime = cdesc[i].cd_time;
165 }
166 }
167
168 return minIdx;
169 }
170
171 /*
172 * Read data directly from disk, with smart error handling.
173 * Try to recover from hard errors by reading in sector sized pieces.
174 * Error recovery is attempted at most BREADEMAX times before seeking
175 * consent from the operator to continue.
176 */
177
178 static int breaderrors = 0;
179 #define BREADEMAX 32
180
181 void
182 rawread(daddr_t blkno, char *buf, int size)
183 {
184 int cnt, i;
185 #ifdef STATS
186 nphysread++;
187 physreadsize += size;
188 #endif
189
190 loop:
191 if (lseek(diskfd, ((off_t) blkno << dev_bshift), 0) < 0) {
192 msg("rawread: lseek fails\n");
193 goto err;
194 }
195 if ((cnt = read(diskfd, buf, size)) == size)
196 return;
197 if (blkno + (size / dev_bsize) > ufsib->ufs_dsize) {
198 /*
199 * Trying to read the final fragment.
200 *
201 * NB - dump only works in TP_BSIZE blocks, hence
202 * rounds `dev_bsize' fragments up to TP_BSIZE pieces.
203 * It should be smarter about not actually trying to
204 * read more than it can get, but for the time being
205 * we punt and scale back the read only when it gets
206 * us into trouble. (mkm 9/25/83)
207 */
208 size -= dev_bsize;
209 goto loop;
210 }
211 if (cnt == -1)
212 msg("read error from %s: %s: [block %lld]: count=%d\n",
213 disk, strerror(errno), (long long)blkno, size);
214 else
215 msg("short read error from %s: [block %lld]: count=%d, got=%d\n",
216 disk, (long long)blkno, size, cnt);
217 err:
218 if (++breaderrors > BREADEMAX) {
219 msg("More than %d block read errors from %s\n",
220 BREADEMAX, disk);
221 broadcast("DUMP IS AILING!\n");
222 msg("This is an unrecoverable error.\n");
223 if (!query("Do you want to attempt to continue?")){
224 dumpabort(0);
225 /*NOTREACHED*/
226 } else
227 breaderrors = 0;
228 }
229 /*
230 * Zero buffer, then try to read each sector of buffer separately.
231 */
232 memset(buf, 0, size);
233 for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) {
234 if (lseek(diskfd, ((off_t)blkno << dev_bshift), 0) < 0) {
235 msg("rawread: lseek2 fails: %s!\n",
236 strerror(errno));
237 continue;
238 }
239 if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize)
240 continue;
241 if (cnt == -1) {
242 msg("read error from %s: %s: [sector %lld]: count=%ld: "
243 "%s\n", disk, strerror(errno), (long long)blkno,
244 dev_bsize, strerror(errno));
245 continue;
246 }
247 msg("short read error from %s: [sector %lld]: count=%ld, got=%d\n",
248 disk, (long long)blkno, dev_bsize, cnt);
249 }
250 }
251
252 void
253 bread(daddr_t blkno, char *buf, int size)
254 {
255 int osize = size;
256 daddr_t oblkno = blkno;
257 char *obuf = buf;
258 daddr_t numBlocks = (size + dev_bsize -1) / dev_bsize;
259
260 #ifdef STATS
261 nreads++;
262 readsize += size;
263 #endif
264
265 if (!shareBuffer) {
266 rawread(blkno, buf, size);
267 return;
268 }
269
270 if (flock(diskfd, LOCK_EX)) {
271 msg("flock(LOCK_EX) failed: %s\n",
272 strerror(errno));
273 rawread(blkno, buf, size);
274 return;
275 }
276
277 retry:
278 while(size > 0) {
279 int i;
280
281 for (i = 0; i < cachebufs; i++) {
282 union cdesc *curr = &cdesc[i];
283
284 #ifdef DIAGNOSTICS
285 if (curr->cd_owner) {
286 fprintf(stderr, "Owner is set (%d, me=%d), can"
287 "not happen.\n", curr->cd_owner, getpid());
288 }
289 #endif
290
291 if (curr->cd_blkend == 0)
292 continue;
293 /*
294 * If we find a bit of the read in the buffers,
295 * now compute how many blocks we can copy,
296 * copy them out, adjust blkno, buf and size,
297 * and restart
298 */
299 if (curr->cd_blkstart <= blkno &&
300 blkno < curr->cd_blkend) {
301 /* Number of data blocks to be copied */
302 int toCopy = MIN(size,
303 (curr->cd_blkend - blkno) * dev_bsize);
304 #ifdef DIAGNOSTICS
305 if (toCopy <= 0 ||
306 toCopy > nblksread * dev_bsize) {
307 fprintf(stderr, "toCopy %d !\n",
308 toCopy);
309 dumpabort(0);
310 }
311 if (CDATA(i) + (blkno - curr->cd_blkstart) *
312 dev_bsize < CDATA(i) ||
313 CDATA(i) + (blkno - curr->cd_blkstart) *
314 dev_bsize >
315 CDATA(i) + nblksread * dev_bsize) {
316 fprintf(stderr, "%p < %p !!!\n",
317 CDATA(i) + (blkno -
318 curr->cd_blkstart) * dev_bsize,
319 CDATA(i));
320 fprintf(stderr, "cdesc[i].cd_blkstart %d "
321 "blkno %d dev_bsize %ld\n",
322 curr->cd_blkstart, blkno, dev_bsize);
323 dumpabort(0);
324 }
325 #endif
326 memcpy(buf, CDATA(i) +
327 (blkno - curr->cd_blkstart) * dev_bsize,
328 toCopy);
329
330 buf += toCopy;
331 size -= toCopy;
332 blkno += (toCopy + dev_bsize - 1) / dev_bsize;
333 numBlocks -=
334 (toCopy + dev_bsize - 1) / dev_bsize;
335
336 curr->cd_time = cheader->cd_count++;
337
338 /*
339 * If all data of a cache block have been
340 * read, chances are good no more reads
341 * will occur, so expire the cache immediately
342 */
343
344 curr->cd_blocksRead +=
345 (toCopy + dev_bsize -1) / dev_bsize;
346 if (curr->cd_blocksRead >= nblksread)
347 curr->cd_time = 0;
348
349 goto retry;
350 }
351 }
352
353 /* No more to do? */
354 if (size == 0)
355 break;
356
357 /*
358 * This does actually not happen if fs blocks are not greater
359 * than nblksread.
360 */
361 if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) {
362 rawread(oblkno, obuf, osize);
363 break;
364 } else {
365 int idx;
366 ssize_t rsize;
367 daddr_t blockBlkNo;
368
369 blockBlkNo = (blkno / nblksread) * nblksread;
370 idx = findlru();
371 rsize = MIN(nblksread,
372 ufsib->ufs_dsize - blockBlkNo) *
373 dev_bsize;
374
375 #ifdef DIAGNOSTICS
376 if (cdesc[idx].cd_owner)
377 fprintf(stderr, "Owner is set (%d, me=%d), can"
378 "not happen(2).\n", cdesc[idx].cd_owner,
379 getpid());
380 cdesc[idx].cd_owner = getpid();
381 #endif
382 cdesc[idx].cd_time = cheader->cd_count++;
383 cdesc[idx].cd_blkstart = blockBlkNo;
384 cdesc[idx].cd_blocksRead = 0;
385
386 if (lseek(diskfd,
387 ((off_t) (blockBlkNo) << dev_bshift), 0) < 0) {
388 msg("readBlocks: lseek fails: %s\n",
389 strerror(errno));
390 rsize = -1;
391 } else {
392 rsize = read(diskfd, CDATA(idx), rsize);
393 if (rsize < 0) {
394 msg("readBlocks: read fails: %s\n",
395 strerror(errno));
396 }
397 }
398
399 /* On errors, panic, punt, try to read without
400 * cache and let raw read routine do the rest.
401 */
402
403 if (rsize <= 0) {
404 rawread(oblkno, obuf, osize);
405 #ifdef DIAGNOSTICS
406 if (cdesc[idx].cd_owner != getpid())
407 fprintf(stderr, "Owner changed from "
408 "%d to %d, can't happen\n",
409 getpid(), cdesc[idx].cd_owner);
410 cdesc[idx].cd_owner = 0;
411 #endif
412 break;
413 }
414
415 /* On short read, just note the fact and go on */
416 cdesc[idx].cd_blkend = blockBlkNo + rsize / dev_bsize;
417
418 #ifdef STATS
419 nphysread++;
420 physreadsize += rsize;
421 #endif
422 #ifdef DIAGNOSTICS
423 if (cdesc[idx].cd_owner != getpid())
424 fprintf(stderr, "Owner changed from "
425 "%d to %d, can't happen\n",
426 getpid(), cdesc[idx].cd_owner);
427 cdesc[idx].cd_owner = 0;
428 #endif
429 /*
430 * We swapped some of data in, let the loop fetch
431 * them from cache
432 */
433 }
434 }
435
436 if (flock(diskfd, LOCK_UN))
437 msg("flock(LOCK_UN) failed: %s\n",
438 strerror(errno));
439 return;
440 }
441
/*
 * Print this process' read statistics to stderr.  Does nothing unless
 * the file is built with STATS defined.
 *
 * "hits" is the share of bread() calls that needed no physical read,
 * "overhead" the amount by which the bytes read from disk exceed the
 * bytes requested, both in percent.  Either figure is reported as 0
 * when there is nothing to relate it to (no reads, no bytes requested).
 */
void
printcachestats(void)
{
#ifdef STATS
	int hits = 0;
	int overhead = 0;

	/* Guard both ratios against a division by zero. */
	if (nreads != 0)
		hits = (nreads - nphysread) * 100 / nreads;
	if (readsize != 0)
		overhead = (int)(((physreadsize - readsize) * 100) / readsize);

	/* The byte counters are 64 bit wide; print them untruncated. */
	fprintf(stderr, "Pid %d: %d reads (%lld bytes) "
	    "%d physical reads (%lld bytes) %d%% hits, %d%% overhead\n",
	    (int)getpid(), nreads, (long long)readsize, nphysread,
	    (long long)physreadsize, hits, overhead);
#endif
}
453