bt_split.c revision 1.13 1 /* $NetBSD: bt_split.c,v 1.13 2003/08/07 16:42:41 agc Exp $ */
2
3 /*-
4 * Copyright (c) 1990, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Mike Olson.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include <sys/cdefs.h>
36 #if defined(LIBC_SCCS) && !defined(lint)
37 #if 0
38 static char sccsid[] = "@(#)bt_split.c 8.9 (Berkeley) 7/26/94";
39 #else
40 __RCSID("$NetBSD: bt_split.c,v 1.13 2003/08/07 16:42:41 agc Exp $");
41 #endif
42 #endif /* LIBC_SCCS and not lint */
43
44 #include "namespace.h"
45 #include <sys/types.h>
46
47 #include <limits.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <string.h>
51
52 #include <db.h>
53 #include "btree.h"
54
55 static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
56 static PAGE *bt_page
57 __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
58 static int bt_preserve __P((BTREE *, pgno_t));
59 static PAGE *bt_psplit
60 __P((BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t));
61 static PAGE *bt_root
62 __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
63 static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
64 static recno_t rec_total __P((PAGE *));
65
66 #ifdef STATISTICS
67 u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
68 #endif
69
70 /*
71 * __BT_SPLIT -- Split the tree.
72 *
73 * Parameters:
74 * t: tree
75 * sp: page to split
76 * key: key to insert
77 * data: data to insert
78 * flags: BIGKEY/BIGDATA flags
79 * ilen: insert length
80 * skip: index to leave open
81 *
82 * Returns:
83 * RET_ERROR, RET_SUCCESS
84 */
85 int
86 __bt_split(t, sp, key, data, flags, ilen, argskip)
87 BTREE *t;
88 PAGE *sp;
89 const DBT *key, *data;
90 int flags;
91 size_t ilen;
92 u_int32_t argskip;
93 {
94 BINTERNAL *bi = NULL; /* pacify gcc */
95 BLEAF *bl = NULL, *tbl; /* pacify gcc */
96 DBT a, b;
97 EPGNO *parent;
98 PAGE *h, *l, *r, *lchild, *rchild;
99 indx_t nxtindex;
100 u_int16_t skip;
101 u_int32_t n, nbytes, nksize = 0; /* pacify gcc */
102 int parentsplit;
103 char *dest;
104
105 /*
106 * Split the page into two pages, l and r. The split routines return
107 * a pointer to the page into which the key should be inserted and with
108 * skip set to the offset which should be used. Additionally, l and r
109 * are pinned.
110 */
111 skip = argskip;
112 h = sp->pgno == P_ROOT ?
113 bt_root(t, sp, &l, &r, &skip, ilen) :
114 bt_page(t, sp, &l, &r, &skip, ilen);
115 if (h == NULL)
116 return (RET_ERROR);
117
118 /*
119 * Insert the new key/data pair into the leaf page. (Key inserts
120 * always cause a leaf page to split first.)
121 */
122 h->linp[skip] = h->upper -= ilen;
123 dest = (char *)(void *)h + h->upper;
124 if (F_ISSET(t, R_RECNO))
125 WR_RLEAF(dest, data, flags)
126 else
127 WR_BLEAF(dest, key, data, flags)
128
129 /* If the root page was split, make it look right. */
130 if (sp->pgno == P_ROOT &&
131 (F_ISSET(t, R_RECNO) ?
132 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
133 goto err2;
134
135 /*
136 * Now we walk the parent page stack -- a LIFO stack of the pages that
137 * were traversed when we searched for the page that split. Each stack
138 * entry is a page number and a page index offset. The offset is for
139 * the page traversed on the search. We've just split a page, so we
140 * have to insert a new key into the parent page.
141 *
142 * If the insert into the parent page causes it to split, may have to
143 * continue splitting all the way up the tree. We stop if the root
144 * splits or the page inserted into didn't have to split to hold the
145 * new key. Some algorithms replace the key for the old page as well
146 * as the new page. We don't, as there's no reason to believe that the
147 * first key on the old page is any better than the key we have, and,
148 * in the case of a key being placed at index 0 causing the split, the
149 * key is unavailable.
150 *
151 * There are a maximum of 5 pages pinned at any time. We keep the left
152 * and right pages pinned while working on the parent. The 5 are the
153 * two children, left parent and right parent (when the parent splits)
154 * and the root page or the overflow key page when calling bt_preserve.
155 * This code must make sure that all pins are released other than the
156 * root page or overflow page which is unlocked elsewhere.
157 */
158 while ((parent = BT_POP(t)) != NULL) {
159 lchild = l;
160 rchild = r;
161
162 /* Get the parent page. */
163 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
164 goto err2;
165
166 /*
167 * The new key goes ONE AFTER the index, because the split
168 * was to the right.
169 */
170 skip = parent->index + 1;
171
172 /*
173 * Calculate the space needed on the parent page.
174 *
175 * Prefix trees: space hack when inserting into BINTERNAL
176 * pages. Retain only what's needed to distinguish between
177 * the new entry and the LAST entry on the page to its left.
178 * If the keys compare equal, retain the entire key. Note,
179 * we don't touch overflow keys, and the entire key must be
180 * retained for the next-to-left most key on the leftmost
181 * page of each level, or the search will fail. Applicable
182 * ONLY to internal pages that have leaf pages as children.
183 * Further reduction of the key between pairs of internal
184 * pages loses too much information.
185 */
186 switch (rchild->flags & P_TYPE) {
187 case P_BINTERNAL:
188 bi = GETBINTERNAL(rchild, 0);
189 nbytes = NBINTERNAL(bi->ksize);
190 break;
191 case P_BLEAF:
192 bl = GETBLEAF(rchild, 0);
193 nbytes = NBINTERNAL(bl->ksize);
194 if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
195 (h->prevpg != P_INVALID || skip > 1)) {
196 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
197 a.size = tbl->ksize;
198 a.data = tbl->bytes;
199 b.size = bl->ksize;
200 b.data = bl->bytes;
201 nksize = t->bt_pfx(&a, &b);
202 n = NBINTERNAL(nksize);
203 if (n < nbytes) {
204 #ifdef STATISTICS
205 bt_pfxsaved += nbytes - n;
206 #endif
207 nbytes = n;
208 } else
209 nksize = 0;
210 } else
211 nksize = 0;
212 break;
213 case P_RINTERNAL:
214 case P_RLEAF:
215 nbytes = NRINTERNAL;
216 break;
217 default:
218 abort();
219 }
220
221 /* Split the parent page if necessary or shift the indices. */
222 if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
223 sp = h;
224 h = h->pgno == P_ROOT ?
225 bt_root(t, h, &l, &r, &skip, nbytes) :
226 bt_page(t, h, &l, &r, &skip, nbytes);
227 if (h == NULL)
228 goto err1;
229 parentsplit = 1;
230 } else {
231 if (skip < (nxtindex = NEXTINDEX(h)))
232 memmove(h->linp + skip + 1, h->linp + skip,
233 (nxtindex - skip) * sizeof(indx_t));
234 h->lower += sizeof(indx_t);
235 parentsplit = 0;
236 }
237
238 /* Insert the key into the parent page. */
239 switch (rchild->flags & P_TYPE) {
240 case P_BINTERNAL:
241 h->linp[skip] = h->upper -= nbytes;
242 dest = (char *)(void *)h + h->linp[skip];
243 memmove(dest, bi, nbytes);
244 ((BINTERNAL *)(void *)dest)->pgno = rchild->pgno;
245 break;
246 case P_BLEAF:
247 h->linp[skip] = h->upper -= nbytes;
248 dest = (char *)(void *)h + h->linp[skip];
249 WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
250 rchild->pgno, bl->flags & P_BIGKEY);
251 memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
252 if (bl->flags & P_BIGKEY &&
253 bt_preserve(t, *(pgno_t *)(void *)bl->bytes) ==
254 RET_ERROR)
255 goto err1;
256 break;
257 case P_RINTERNAL:
258 /*
259 * Update the left page count. If split
260 * added at index 0, fix the correct page.
261 */
262 if (skip > 0)
263 dest = (char *)(void *)h + h->linp[skip - 1];
264 else
265 dest = (char *)(void *)l + l->linp[NEXTINDEX(l) - 1];
266 ((RINTERNAL *)(void *)dest)->nrecs = rec_total(lchild);
267 ((RINTERNAL *)(void *)dest)->pgno = lchild->pgno;
268
269 /* Update the right page count. */
270 h->linp[skip] = h->upper -= nbytes;
271 dest = (char *)(void *)h + h->linp[skip];
272 ((RINTERNAL *)(void *)dest)->nrecs = rec_total(rchild);
273 ((RINTERNAL *)(void *)dest)->pgno = rchild->pgno;
274 break;
275 case P_RLEAF:
276 /*
277 * Update the left page count. If split
278 * added at index 0, fix the correct page.
279 */
280 if (skip > 0)
281 dest = (char *)(void *)h + h->linp[skip - 1];
282 else
283 dest = (char *)(void *)l + l->linp[NEXTINDEX(l) - 1];
284 ((RINTERNAL *)(void *)dest)->nrecs = NEXTINDEX(lchild);
285 ((RINTERNAL *)(void *)dest)->pgno = lchild->pgno;
286
287 /* Update the right page count. */
288 h->linp[skip] = h->upper -= nbytes;
289 dest = (char *)(void *)h + h->linp[skip];
290 ((RINTERNAL *)(void *)dest)->nrecs = NEXTINDEX(rchild);
291 ((RINTERNAL *)(void *)dest)->pgno = rchild->pgno;
292 break;
293 default:
294 abort();
295 }
296
297 /* Unpin the held pages. */
298 if (!parentsplit) {
299 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
300 break;
301 }
302
303 /* If the root page was split, make it look right. */
304 if (sp->pgno == P_ROOT &&
305 (F_ISSET(t, R_RECNO) ?
306 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
307 goto err1;
308
309 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
310 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
311 }
312
313 /* Unpin the held pages. */
314 mpool_put(t->bt_mp, l, MPOOL_DIRTY);
315 mpool_put(t->bt_mp, r, MPOOL_DIRTY);
316
317 /* Clear any pages left on the stack. */
318 return (RET_SUCCESS);
319
320 /*
321 * If something fails in the above loop we were already walking back
322 * up the tree and the tree is now inconsistent. Nothing much we can
323 * do about it but release any memory we're holding.
324 */
325 err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
326 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
327
328 err2: mpool_put(t->bt_mp, l, 0);
329 mpool_put(t->bt_mp, r, 0);
330 __dbpanic(t->bt_dbp);
331 return (RET_ERROR);
332 }
333
334 /*
335 * BT_PAGE -- Split a non-root page of a btree.
336 *
337 * Parameters:
338 * t: tree
339 * h: root page
340 * lp: pointer to left page pointer
341 * rp: pointer to right page pointer
342 * skip: pointer to index to leave open
343 * ilen: insert length
344 *
345 * Returns:
346 * Pointer to page in which to insert or NULL on error.
347 */
348 static PAGE *
349 bt_page(t, h, lp, rp, skip, ilen)
350 BTREE *t;
351 PAGE *h, **lp, **rp;
352 indx_t *skip;
353 size_t ilen;
354 {
355 PAGE *l, *r, *tp;
356 pgno_t npg;
357
358 #ifdef STATISTICS
359 ++bt_split;
360 #endif
361 /* Put the new right page for the split into place. */
362 if ((r = __bt_new(t, &npg)) == NULL)
363 return (NULL);
364 r->pgno = npg;
365 r->lower = BTDATAOFF;
366 r->upper = t->bt_psize;
367 r->nextpg = h->nextpg;
368 r->prevpg = h->pgno;
369 r->flags = h->flags & P_TYPE;
370
371 /*
372 * If we're splitting the last page on a level because we're appending
373 * a key to it (skip is NEXTINDEX()), it's likely that the data is
374 * sorted. Adding an empty page on the side of the level is less work
375 * and can push the fill factor much higher than normal. If we're
376 * wrong it's no big deal, we'll just do the split the right way next
377 * time. It may look like it's equally easy to do a similar hack for
378 * reverse sorted data, that is, split the tree left, but it's not.
379 * Don't even try.
380 */
381 if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
382 #ifdef STATISTICS
383 ++bt_sortsplit;
384 #endif
385 h->nextpg = r->pgno;
386 r->lower = BTDATAOFF + sizeof(indx_t);
387 *skip = 0;
388 *lp = h;
389 *rp = r;
390 return (r);
391 }
392
393 /* Put the new left page for the split into place. */
394 if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
395 mpool_put(t->bt_mp, r, 0);
396 return (NULL);
397 }
398 #ifdef PURIFY
399 memset(l, 0xff, t->bt_psize);
400 #endif
401 l->pgno = h->pgno;
402 l->nextpg = r->pgno;
403 l->prevpg = h->prevpg;
404 l->lower = BTDATAOFF;
405 l->upper = t->bt_psize;
406 l->flags = h->flags & P_TYPE;
407
408 /* Fix up the previous pointer of the page after the split page. */
409 if (h->nextpg != P_INVALID) {
410 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
411 free(l);
412 /* XXX mpool_free(t->bt_mp, r->pgno); */
413 return (NULL);
414 }
415 tp->prevpg = r->pgno;
416 mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
417 }
418
419 /*
420 * Split right. The key/data pairs aren't sorted in the btree page so
421 * it's simpler to copy the data from the split page onto two new pages
422 * instead of copying half the data to the right page and compacting
423 * the left page in place. Since the left page can't change, we have
424 * to swap the original and the allocated left page after the split.
425 */
426 tp = bt_psplit(t, h, l, r, skip, ilen);
427
428 /* Move the new left page onto the old left page. */
429 memmove(h, l, t->bt_psize);
430 if (tp == l)
431 tp = h;
432 free(l);
433
434 *lp = h;
435 *rp = r;
436 return (tp);
437 }
438
439 /*
440 * BT_ROOT -- Split the root page of a btree.
441 *
442 * Parameters:
443 * t: tree
444 * h: root page
445 * lp: pointer to left page pointer
446 * rp: pointer to right page pointer
447 * skip: pointer to index to leave open
448 * ilen: insert length
449 *
450 * Returns:
451 * Pointer to page in which to insert or NULL on error.
452 */
453 static PAGE *
454 bt_root(t, h, lp, rp, skip, ilen)
455 BTREE *t;
456 PAGE *h, **lp, **rp;
457 indx_t *skip;
458 size_t ilen;
459 {
460 PAGE *l, *r, *tp;
461 pgno_t lnpg, rnpg;
462
463 #ifdef STATISTICS
464 ++bt_split;
465 ++bt_rootsplit;
466 #endif
467 /* Put the new left and right pages for the split into place. */
468 if ((l = __bt_new(t, &lnpg)) == NULL ||
469 (r = __bt_new(t, &rnpg)) == NULL)
470 return (NULL);
471 l->pgno = lnpg;
472 r->pgno = rnpg;
473 l->nextpg = r->pgno;
474 r->prevpg = l->pgno;
475 l->prevpg = r->nextpg = P_INVALID;
476 l->lower = r->lower = BTDATAOFF;
477 l->upper = r->upper = t->bt_psize;
478 l->flags = r->flags = h->flags & P_TYPE;
479
480 /* Split the root page. */
481 tp = bt_psplit(t, h, l, r, skip, ilen);
482
483 *lp = l;
484 *rp = r;
485 return (tp);
486 }
487
488 /*
489 * BT_RROOT -- Fix up the recno root page after it has been split.
490 *
491 * Parameters:
492 * t: tree
493 * h: root page
494 * l: left page
495 * r: right page
496 *
497 * Returns:
498 * RET_ERROR, RET_SUCCESS
499 */
500 static int
501 bt_rroot(t, h, l, r)
502 BTREE *t;
503 PAGE *h, *l, *r;
504 {
505 char *dest;
506
507 /* Insert the left and right keys, set the header information. */
508 h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
509 dest = (char *)(void *)h + h->upper;
510 WR_RINTERNAL(dest,
511 l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
512
513 h->linp[1] = h->upper -= NRINTERNAL;
514 dest = (char *)(void *)h + h->upper;
515 WR_RINTERNAL(dest,
516 r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
517
518 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
519
520 /* Unpin the root page, set to recno internal page. */
521 h->flags &= ~P_TYPE;
522 h->flags |= P_RINTERNAL;
523 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
524
525 return (RET_SUCCESS);
526 }
527
528 /*
529 * BT_BROOT -- Fix up the btree root page after it has been split.
530 *
531 * Parameters:
532 * t: tree
533 * h: root page
534 * l: left page
535 * r: right page
536 *
537 * Returns:
538 * RET_ERROR, RET_SUCCESS
539 */
540 static int
541 bt_broot(t, h, l, r)
542 BTREE *t;
543 PAGE *h, *l, *r;
544 {
545 BINTERNAL *bi = NULL; /* pacify gcc */
546 BLEAF *bl;
547 u_int32_t nbytes;
548 char *dest;
549
550 /*
551 * If the root page was a leaf page, change it into an internal page.
552 * We copy the key we split on (but not the key's data, in the case of
553 * a leaf page) to the new root page.
554 *
555 * The btree comparison code guarantees that the left-most key on any
556 * level of the tree is never used, so it doesn't need to be filled in.
557 */
558 nbytes = NBINTERNAL(0);
559 h->linp[0] = h->upper = t->bt_psize - nbytes;
560 dest = (char *)(void *)h + h->upper;
561 WR_BINTERNAL(dest, 0, l->pgno, 0);
562
563 switch (h->flags & P_TYPE) {
564 case P_BLEAF:
565 bl = GETBLEAF(r, 0);
566 nbytes = NBINTERNAL(bl->ksize);
567 h->linp[1] = h->upper -= nbytes;
568 dest = (char *)(void *)h + h->upper;
569 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
570 memmove(dest, bl->bytes, bl->ksize);
571
572 /*
573 * If the key is on an overflow page, mark the overflow chain
574 * so it isn't deleted when the leaf copy of the key is deleted.
575 */
576 if (bl->flags & P_BIGKEY &&
577 bt_preserve(t, *(pgno_t *)(void *)bl->bytes) == RET_ERROR)
578 return (RET_ERROR);
579 break;
580 case P_BINTERNAL:
581 bi = GETBINTERNAL(r, 0);
582 nbytes = NBINTERNAL(bi->ksize);
583 h->linp[1] = h->upper -= nbytes;
584 dest = (char *)(void *)h + h->upper;
585 memmove(dest, bi, nbytes);
586 ((BINTERNAL *)(void *)dest)->pgno = r->pgno;
587 break;
588 default:
589 abort();
590 }
591
592 /* There are two keys on the page. */
593 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
594
595 /* Unpin the root page, set to btree internal page. */
596 h->flags &= ~P_TYPE;
597 h->flags |= P_BINTERNAL;
598 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
599
600 return (RET_SUCCESS);
601 }
602
603 /*
604 * BT_PSPLIT -- Do the real work of splitting the page.
605 *
606 * Parameters:
607 * t: tree
608 * h: page to be split
609 * l: page to put lower half of data
610 * r: page to put upper half of data
611 * pskip: pointer to index to leave open
612 * ilen: insert length
613 *
614 * Returns:
615 * Pointer to page in which to insert.
616 */
617 static PAGE *
618 bt_psplit(t, h, l, r, pskip, ilen)
619 BTREE *t;
620 PAGE *h, *l, *r;
621 indx_t *pskip;
622 size_t ilen;
623 {
624 BINTERNAL *bi;
625 BLEAF *bl;
626 CURSOR *c;
627 RLEAF *rl;
628 PAGE *rval;
629 void *src = NULL; /* pacify gcc */
630 indx_t full, half, nxt, off, skip, top, used;
631 u_int32_t nbytes;
632 int bigkeycnt, isbigkey;
633
634 /*
635 * Split the data to the left and right pages. Leave the skip index
636 * open. Additionally, make some effort not to split on an overflow
637 * key. This makes internal page processing faster and can save
638 * space as overflow keys used by internal pages are never deleted.
639 */
640 bigkeycnt = 0;
641 skip = *pskip;
642 full = t->bt_psize - BTDATAOFF;
643 half = full / 2;
644 used = 0;
645 for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
646 if (skip == off) {
647 nbytes = ilen;
648 isbigkey = 0; /* XXX: not really known. */
649 } else
650 switch (h->flags & P_TYPE) {
651 case P_BINTERNAL:
652 src = bi = GETBINTERNAL(h, nxt);
653 nbytes = NBINTERNAL(bi->ksize);
654 isbigkey = bi->flags & P_BIGKEY;
655 break;
656 case P_BLEAF:
657 src = bl = GETBLEAF(h, nxt);
658 nbytes = NBLEAF(bl);
659 isbigkey = bl->flags & P_BIGKEY;
660 break;
661 case P_RINTERNAL:
662 src = GETRINTERNAL(h, nxt);
663 nbytes = NRINTERNAL;
664 isbigkey = 0;
665 break;
666 case P_RLEAF:
667 src = rl = GETRLEAF(h, nxt);
668 nbytes = NRLEAF(rl);
669 isbigkey = 0;
670 break;
671 default:
672 abort();
673 }
674
675 /*
676 * If the key/data pairs are substantial fractions of the max
677 * possible size for the page, it's possible to get situations
678 * where we decide to try and copy too much onto the left page.
679 * Make sure that doesn't happen.
680 */
681 if ((skip <= off && used + nbytes + sizeof(indx_t) >= full) ||
682 nxt == top - 1) {
683 --off;
684 break;
685 }
686
687 /* Copy the key/data pair, if not the skipped index. */
688 if (skip != off) {
689 ++nxt;
690
691 l->linp[off] = l->upper -= nbytes;
692 memmove((char *)(void *)l + l->upper, src, nbytes);
693 }
694
695 used += nbytes + sizeof(indx_t);
696 if (used >= half) {
697 if (!isbigkey || bigkeycnt == 3)
698 break;
699 else
700 ++bigkeycnt;
701 }
702 }
703
704 /*
705 * Off is the last offset that's valid for the left page.
706 * Nxt is the first offset to be placed on the right page.
707 */
708 l->lower += (off + 1) * sizeof(indx_t);
709
710 /*
711 * If splitting the page that the cursor was on, the cursor has to be
712 * adjusted to point to the same record as before the split. If the
713 * cursor is at or past the skipped slot, the cursor is incremented by
714 * one. If the cursor is on the right page, it is decremented by the
715 * number of records split to the left page.
716 */
717 c = &t->bt_cursor;
718 if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
719 if (c->pg.index >= skip)
720 ++c->pg.index;
721 if (c->pg.index < nxt) /* Left page. */
722 c->pg.pgno = l->pgno;
723 else { /* Right page. */
724 c->pg.pgno = r->pgno;
725 c->pg.index -= nxt;
726 }
727 }
728
729 /*
730 * If the skipped index was on the left page, just return that page.
731 * Otherwise, adjust the skip index to reflect the new position on
732 * the right page.
733 */
734 if (skip <= off) {
735 skip = MAX_PAGE_OFFSET;
736 rval = l;
737 } else {
738 rval = r;
739 *pskip -= nxt;
740 }
741
742 for (off = 0; nxt < top; ++off) {
743 if (skip == nxt) {
744 ++off;
745 skip = MAX_PAGE_OFFSET;
746 }
747 switch (h->flags & P_TYPE) {
748 case P_BINTERNAL:
749 src = bi = GETBINTERNAL(h, nxt);
750 nbytes = NBINTERNAL(bi->ksize);
751 break;
752 case P_BLEAF:
753 src = bl = GETBLEAF(h, nxt);
754 nbytes = NBLEAF(bl);
755 break;
756 case P_RINTERNAL:
757 src = GETRINTERNAL(h, nxt);
758 nbytes = NRINTERNAL;
759 break;
760 case P_RLEAF:
761 src = rl = GETRLEAF(h, nxt);
762 nbytes = NRLEAF(rl);
763 break;
764 default:
765 abort();
766 }
767 ++nxt;
768 r->linp[off] = r->upper -= nbytes;
769 memmove((char *)(void *)r + r->upper, src, nbytes);
770 }
771 r->lower += off * sizeof(indx_t);
772
773 /* If the key is being appended to the page, adjust the index. */
774 if (skip == top)
775 r->lower += sizeof(indx_t);
776
777 return (rval);
778 }
779
780 /*
781 * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
782 *
783 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
784 * record that references them gets deleted. Chains pointed to by internal
785 * pages never get deleted. This routine marks a chain as pointed to by an
786 * internal page.
787 *
788 * Parameters:
789 * t: tree
790 * pg: page number of first page in the chain.
791 *
792 * Returns:
793 * RET_SUCCESS, RET_ERROR.
794 */
795 static int
796 bt_preserve(t, pg)
797 BTREE *t;
798 pgno_t pg;
799 {
800 PAGE *h;
801
802 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
803 return (RET_ERROR);
804 h->flags |= P_PRESERVE;
805 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
806 return (RET_SUCCESS);
807 }
808
809 /*
810 * REC_TOTAL -- Return the number of recno entries below a page.
811 *
812 * Parameters:
813 * h: page
814 *
815 * Returns:
816 * The number of recno entries below a page.
817 *
818 * XXX
819 * These values could be set by the bt_psplit routine. The problem is that the
820 * entry has to be popped off of the stack etc. or the values have to be passed
821 * all the way back to bt_split/bt_rroot and it's not very clean.
822 */
823 static recno_t
824 rec_total(h)
825 PAGE *h;
826 {
827 recno_t recs;
828 indx_t nxt, top;
829
830 for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
831 recs += GETRINTERNAL(h, nxt)->nrecs;
832 return (recs);
833 }
834