bt_split.c revision 1.7 1 /* $NetBSD: bt_split.c,v 1.7 1997/07/13 18:51:59 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1990, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Mike Olson.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 */
38
39 #include <sys/cdefs.h>
40 #if defined(LIBC_SCCS) && !defined(lint)
41 #if 0
42 static char sccsid[] = "@(#)bt_split.c 8.9 (Berkeley) 7/26/94";
43 #else
44 __RCSID("$NetBSD: bt_split.c,v 1.7 1997/07/13 18:51:59 christos Exp $");
45 #endif
46 #endif /* LIBC_SCCS and not lint */
47
48 #include <sys/types.h>
49
50 #include <limits.h>
51 #include <stdio.h>
52 #include <stdlib.h>
53 #include <string.h>
54
55 #include <db.h>
56 #include "btree.h"
57
58 static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
59 static PAGE *bt_page
60 __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
61 static int bt_preserve __P((BTREE *, pgno_t));
62 static PAGE *bt_psplit
63 __P((BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t));
64 static PAGE *bt_root
65 __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
66 static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
67 static recno_t rec_total __P((PAGE *));
68
69 #ifdef STATISTICS
70 u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
71 #endif
72
73 /*
74 * __BT_SPLIT -- Split the tree.
75 *
76 * Parameters:
77 * t: tree
78 * sp: page to split
79 * key: key to insert
80 * data: data to insert
81 * flags: BIGKEY/BIGDATA flags
82 * ilen: insert length
83 * skip: index to leave open
84 *
85 * Returns:
86 * RET_ERROR, RET_SUCCESS
87 */
88 int
89 __bt_split(t, sp, key, data, flags, ilen, argskip)
90 BTREE *t;
91 PAGE *sp;
92 const DBT *key, *data;
93 int flags;
94 size_t ilen;
95 u_int32_t argskip;
96 {
97 BINTERNAL *bi = NULL; /* pacify gcc */
98 BLEAF *bl = NULL, *tbl; /* pacify gcc */
99 DBT a, b;
100 EPGNO *parent;
101 PAGE *h, *l, *r, *lchild, *rchild;
102 indx_t nxtindex;
103 u_int16_t skip;
104 u_int32_t n, nbytes, nksize = 0; /* pacify gcc */
105 int parentsplit;
106 char *dest;
107
108 /*
109 * Split the page into two pages, l and r. The split routines return
110 * a pointer to the page into which the key should be inserted and with
111 * skip set to the offset which should be used. Additionally, l and r
112 * are pinned.
113 */
114 skip = argskip;
115 h = sp->pgno == P_ROOT ?
116 bt_root(t, sp, &l, &r, &skip, ilen) :
117 bt_page(t, sp, &l, &r, &skip, ilen);
118 if (h == NULL)
119 return (RET_ERROR);
120
121 /*
122 * Insert the new key/data pair into the leaf page. (Key inserts
123 * always cause a leaf page to split first.)
124 */
125 h->linp[skip] = h->upper -= ilen;
126 dest = (char *)h + h->upper;
127 if (F_ISSET(t, R_RECNO))
128 WR_RLEAF(dest, data, flags)
129 else
130 WR_BLEAF(dest, key, data, flags)
131
132 /* If the root page was split, make it look right. */
133 if (sp->pgno == P_ROOT &&
134 (F_ISSET(t, R_RECNO) ?
135 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
136 goto err2;
137
138 /*
139 * Now we walk the parent page stack -- a LIFO stack of the pages that
140 * were traversed when we searched for the page that split. Each stack
141 * entry is a page number and a page index offset. The offset is for
142 * the page traversed on the search. We've just split a page, so we
143 * have to insert a new key into the parent page.
144 *
145 * If the insert into the parent page causes it to split, may have to
146 * continue splitting all the way up the tree. We stop if the root
147 * splits or the page inserted into didn't have to split to hold the
148 * new key. Some algorithms replace the key for the old page as well
149 * as the new page. We don't, as there's no reason to believe that the
150 * first key on the old page is any better than the key we have, and,
151 * in the case of a key being placed at index 0 causing the split, the
152 * key is unavailable.
153 *
154 * There are a maximum of 5 pages pinned at any time. We keep the left
155 * and right pages pinned while working on the parent. The 5 are the
156 * two children, left parent and right parent (when the parent splits)
157 * and the root page or the overflow key page when calling bt_preserve.
158 * This code must make sure that all pins are released other than the
159 * root page or overflow page which is unlocked elsewhere.
160 */
161 while ((parent = BT_POP(t)) != NULL) {
162 lchild = l;
163 rchild = r;
164
165 /* Get the parent page. */
166 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
167 goto err2;
168
169 /*
170 * The new key goes ONE AFTER the index, because the split
171 * was to the right.
172 */
173 skip = parent->index + 1;
174
175 /*
176 * Calculate the space needed on the parent page.
177 *
178 * Prefix trees: space hack when inserting into BINTERNAL
179 * pages. Retain only what's needed to distinguish between
180 * the new entry and the LAST entry on the page to its left.
181 * If the keys compare equal, retain the entire key. Note,
182 * we don't touch overflow keys, and the entire key must be
183 * retained for the next-to-left most key on the leftmost
184 * page of each level, or the search will fail. Applicable
185 * ONLY to internal pages that have leaf pages as children.
186 * Further reduction of the key between pairs of internal
187 * pages loses too much information.
188 */
189 switch (rchild->flags & P_TYPE) {
190 case P_BINTERNAL:
191 bi = GETBINTERNAL(rchild, 0);
192 nbytes = NBINTERNAL(bi->ksize);
193 break;
194 case P_BLEAF:
195 bl = GETBLEAF(rchild, 0);
196 nbytes = NBINTERNAL(bl->ksize);
197 if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
198 (h->prevpg != P_INVALID || skip > 1)) {
199 tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
200 a.size = tbl->ksize;
201 a.data = tbl->bytes;
202 b.size = bl->ksize;
203 b.data = bl->bytes;
204 nksize = t->bt_pfx(&a, &b);
205 n = NBINTERNAL(nksize);
206 if (n < nbytes) {
207 #ifdef STATISTICS
208 bt_pfxsaved += nbytes - n;
209 #endif
210 nbytes = n;
211 } else
212 nksize = 0;
213 } else
214 nksize = 0;
215 break;
216 case P_RINTERNAL:
217 case P_RLEAF:
218 nbytes = NRINTERNAL;
219 break;
220 default:
221 abort();
222 }
223
224 /* Split the parent page if necessary or shift the indices. */
225 if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
226 sp = h;
227 h = h->pgno == P_ROOT ?
228 bt_root(t, h, &l, &r, &skip, nbytes) :
229 bt_page(t, h, &l, &r, &skip, nbytes);
230 if (h == NULL)
231 goto err1;
232 parentsplit = 1;
233 } else {
234 if (skip < (nxtindex = NEXTINDEX(h)))
235 memmove(h->linp + skip + 1, h->linp + skip,
236 (nxtindex - skip) * sizeof(indx_t));
237 h->lower += sizeof(indx_t);
238 parentsplit = 0;
239 }
240
241 /* Insert the key into the parent page. */
242 switch (rchild->flags & P_TYPE) {
243 case P_BINTERNAL:
244 h->linp[skip] = h->upper -= nbytes;
245 dest = (char *)h + h->linp[skip];
246 memmove(dest, bi, nbytes);
247 ((BINTERNAL *)dest)->pgno = rchild->pgno;
248 break;
249 case P_BLEAF:
250 h->linp[skip] = h->upper -= nbytes;
251 dest = (char *)h + h->linp[skip];
252 WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
253 rchild->pgno, bl->flags & P_BIGKEY);
254 memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
255 if (bl->flags & P_BIGKEY &&
256 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
257 goto err1;
258 break;
259 case P_RINTERNAL:
260 /*
261 * Update the left page count. If split
262 * added at index 0, fix the correct page.
263 */
264 if (skip > 0)
265 dest = (char *)h + h->linp[skip - 1];
266 else
267 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
268 ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
269 ((RINTERNAL *)dest)->pgno = lchild->pgno;
270
271 /* Update the right page count. */
272 h->linp[skip] = h->upper -= nbytes;
273 dest = (char *)h + h->linp[skip];
274 ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
275 ((RINTERNAL *)dest)->pgno = rchild->pgno;
276 break;
277 case P_RLEAF:
278 /*
279 * Update the left page count. If split
280 * added at index 0, fix the correct page.
281 */
282 if (skip > 0)
283 dest = (char *)h + h->linp[skip - 1];
284 else
285 dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
286 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
287 ((RINTERNAL *)dest)->pgno = lchild->pgno;
288
289 /* Update the right page count. */
290 h->linp[skip] = h->upper -= nbytes;
291 dest = (char *)h + h->linp[skip];
292 ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
293 ((RINTERNAL *)dest)->pgno = rchild->pgno;
294 break;
295 default:
296 abort();
297 }
298
299 /* Unpin the held pages. */
300 if (!parentsplit) {
301 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
302 break;
303 }
304
305 /* If the root page was split, make it look right. */
306 if (sp->pgno == P_ROOT &&
307 (F_ISSET(t, R_RECNO) ?
308 bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
309 goto err1;
310
311 mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
312 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
313 }
314
315 /* Unpin the held pages. */
316 mpool_put(t->bt_mp, l, MPOOL_DIRTY);
317 mpool_put(t->bt_mp, r, MPOOL_DIRTY);
318
319 /* Clear any pages left on the stack. */
320 return (RET_SUCCESS);
321
322 /*
323 * If something fails in the above loop we were already walking back
324 * up the tree and the tree is now inconsistent. Nothing much we can
325 * do about it but release any memory we're holding.
326 */
327 err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
328 mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
329
330 err2: mpool_put(t->bt_mp, l, 0);
331 mpool_put(t->bt_mp, r, 0);
332 __dbpanic(t->bt_dbp);
333 return (RET_ERROR);
334 }
335
336 /*
337 * BT_PAGE -- Split a non-root page of a btree.
338 *
339 * Parameters:
340 * t: tree
341 * h: root page
342 * lp: pointer to left page pointer
343 * rp: pointer to right page pointer
344 * skip: pointer to index to leave open
345 * ilen: insert length
346 *
347 * Returns:
348 * Pointer to page in which to insert or NULL on error.
349 */
350 static PAGE *
351 bt_page(t, h, lp, rp, skip, ilen)
352 BTREE *t;
353 PAGE *h, **lp, **rp;
354 indx_t *skip;
355 size_t ilen;
356 {
357 PAGE *l, *r, *tp;
358 pgno_t npg;
359
360 #ifdef STATISTICS
361 ++bt_split;
362 #endif
363 /* Put the new right page for the split into place. */
364 if ((r = __bt_new(t, &npg)) == NULL)
365 return (NULL);
366 r->pgno = npg;
367 r->lower = BTDATAOFF;
368 r->upper = t->bt_psize;
369 r->nextpg = h->nextpg;
370 r->prevpg = h->pgno;
371 r->flags = h->flags & P_TYPE;
372
373 /*
374 * If we're splitting the last page on a level because we're appending
375 * a key to it (skip is NEXTINDEX()), it's likely that the data is
376 * sorted. Adding an empty page on the side of the level is less work
377 * and can push the fill factor much higher than normal. If we're
378 * wrong it's no big deal, we'll just do the split the right way next
379 * time. It may look like it's equally easy to do a similar hack for
380 * reverse sorted data, that is, split the tree left, but it's not.
381 * Don't even try.
382 */
383 if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
384 #ifdef STATISTICS
385 ++bt_sortsplit;
386 #endif
387 h->nextpg = r->pgno;
388 r->lower = BTDATAOFF + sizeof(indx_t);
389 *skip = 0;
390 *lp = h;
391 *rp = r;
392 return (r);
393 }
394
395 /* Put the new left page for the split into place. */
396 if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
397 mpool_put(t->bt_mp, r, 0);
398 return (NULL);
399 }
400 #ifdef PURIFY
401 memset(l, 0xff, t->bt_psize);
402 #endif
403 l->pgno = h->pgno;
404 l->nextpg = r->pgno;
405 l->prevpg = h->prevpg;
406 l->lower = BTDATAOFF;
407 l->upper = t->bt_psize;
408 l->flags = h->flags & P_TYPE;
409
410 /* Fix up the previous pointer of the page after the split page. */
411 if (h->nextpg != P_INVALID) {
412 if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
413 free(l);
414 /* XXX mpool_free(t->bt_mp, r->pgno); */
415 return (NULL);
416 }
417 tp->prevpg = r->pgno;
418 mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
419 }
420
421 /*
422 * Split right. The key/data pairs aren't sorted in the btree page so
423 * it's simpler to copy the data from the split page onto two new pages
424 * instead of copying half the data to the right page and compacting
425 * the left page in place. Since the left page can't change, we have
426 * to swap the original and the allocated left page after the split.
427 */
428 tp = bt_psplit(t, h, l, r, skip, ilen);
429
430 /* Move the new left page onto the old left page. */
431 memmove(h, l, t->bt_psize);
432 if (tp == l)
433 tp = h;
434 free(l);
435
436 *lp = h;
437 *rp = r;
438 return (tp);
439 }
440
441 /*
442 * BT_ROOT -- Split the root page of a btree.
443 *
444 * Parameters:
445 * t: tree
446 * h: root page
447 * lp: pointer to left page pointer
448 * rp: pointer to right page pointer
449 * skip: pointer to index to leave open
450 * ilen: insert length
451 *
452 * Returns:
453 * Pointer to page in which to insert or NULL on error.
454 */
455 static PAGE *
456 bt_root(t, h, lp, rp, skip, ilen)
457 BTREE *t;
458 PAGE *h, **lp, **rp;
459 indx_t *skip;
460 size_t ilen;
461 {
462 PAGE *l, *r, *tp;
463 pgno_t lnpg, rnpg;
464
465 #ifdef STATISTICS
466 ++bt_split;
467 ++bt_rootsplit;
468 #endif
469 /* Put the new left and right pages for the split into place. */
470 if ((l = __bt_new(t, &lnpg)) == NULL ||
471 (r = __bt_new(t, &rnpg)) == NULL)
472 return (NULL);
473 l->pgno = lnpg;
474 r->pgno = rnpg;
475 l->nextpg = r->pgno;
476 r->prevpg = l->pgno;
477 l->prevpg = r->nextpg = P_INVALID;
478 l->lower = r->lower = BTDATAOFF;
479 l->upper = r->upper = t->bt_psize;
480 l->flags = r->flags = h->flags & P_TYPE;
481
482 /* Split the root page. */
483 tp = bt_psplit(t, h, l, r, skip, ilen);
484
485 *lp = l;
486 *rp = r;
487 return (tp);
488 }
489
490 /*
491 * BT_RROOT -- Fix up the recno root page after it has been split.
492 *
493 * Parameters:
494 * t: tree
495 * h: root page
496 * l: left page
497 * r: right page
498 *
499 * Returns:
500 * RET_ERROR, RET_SUCCESS
501 */
502 static int
503 bt_rroot(t, h, l, r)
504 BTREE *t;
505 PAGE *h, *l, *r;
506 {
507 char *dest;
508
509 /* Insert the left and right keys, set the header information. */
510 h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
511 dest = (char *)h + h->upper;
512 WR_RINTERNAL(dest,
513 l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
514
515 h->linp[1] = h->upper -= NRINTERNAL;
516 dest = (char *)h + h->upper;
517 WR_RINTERNAL(dest,
518 r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
519
520 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
521
522 /* Unpin the root page, set to recno internal page. */
523 h->flags &= ~P_TYPE;
524 h->flags |= P_RINTERNAL;
525 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
526
527 return (RET_SUCCESS);
528 }
529
530 /*
531 * BT_BROOT -- Fix up the btree root page after it has been split.
532 *
533 * Parameters:
534 * t: tree
535 * h: root page
536 * l: left page
537 * r: right page
538 *
539 * Returns:
540 * RET_ERROR, RET_SUCCESS
541 */
542 static int
543 bt_broot(t, h, l, r)
544 BTREE *t;
545 PAGE *h, *l, *r;
546 {
547 BINTERNAL *bi = NULL; /* pacify gcc */
548 BLEAF *bl;
549 u_int32_t nbytes;
550 char *dest;
551
552 /*
553 * If the root page was a leaf page, change it into an internal page.
554 * We copy the key we split on (but not the key's data, in the case of
555 * a leaf page) to the new root page.
556 *
557 * The btree comparison code guarantees that the left-most key on any
558 * level of the tree is never used, so it doesn't need to be filled in.
559 */
560 nbytes = NBINTERNAL(0);
561 h->linp[0] = h->upper = t->bt_psize - nbytes;
562 dest = (char *)h + h->upper;
563 WR_BINTERNAL(dest, 0, l->pgno, 0);
564
565 switch (h->flags & P_TYPE) {
566 case P_BLEAF:
567 bl = GETBLEAF(r, 0);
568 nbytes = NBINTERNAL(bl->ksize);
569 h->linp[1] = h->upper -= nbytes;
570 dest = (char *)h + h->upper;
571 WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
572 memmove(dest, bl->bytes, bl->ksize);
573
574 /*
575 * If the key is on an overflow page, mark the overflow chain
576 * so it isn't deleted when the leaf copy of the key is deleted.
577 */
578 if (bl->flags & P_BIGKEY &&
579 bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
580 return (RET_ERROR);
581 break;
582 case P_BINTERNAL:
583 bi = GETBINTERNAL(r, 0);
584 nbytes = NBINTERNAL(bi->ksize);
585 h->linp[1] = h->upper -= nbytes;
586 dest = (char *)h + h->upper;
587 memmove(dest, bi, nbytes);
588 ((BINTERNAL *)dest)->pgno = r->pgno;
589 break;
590 default:
591 abort();
592 }
593
594 /* There are two keys on the page. */
595 h->lower = BTDATAOFF + 2 * sizeof(indx_t);
596
597 /* Unpin the root page, set to btree internal page. */
598 h->flags &= ~P_TYPE;
599 h->flags |= P_BINTERNAL;
600 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
601
602 return (RET_SUCCESS);
603 }
604
605 /*
606 * BT_PSPLIT -- Do the real work of splitting the page.
607 *
608 * Parameters:
609 * t: tree
610 * h: page to be split
611 * l: page to put lower half of data
612 * r: page to put upper half of data
613 * pskip: pointer to index to leave open
614 * ilen: insert length
615 *
616 * Returns:
617 * Pointer to page in which to insert.
618 */
619 static PAGE *
620 bt_psplit(t, h, l, r, pskip, ilen)
621 BTREE *t;
622 PAGE *h, *l, *r;
623 indx_t *pskip;
624 size_t ilen;
625 {
626 BINTERNAL *bi;
627 BLEAF *bl;
628 CURSOR *c;
629 RLEAF *rl;
630 PAGE *rval;
631 void *src = NULL; /* pacify gcc */
632 indx_t full, half, nxt, off, skip, top, used;
633 u_int32_t nbytes;
634 int bigkeycnt, isbigkey;
635
636 /*
637 * Split the data to the left and right pages. Leave the skip index
638 * open. Additionally, make some effort not to split on an overflow
639 * key. This makes internal page processing faster and can save
640 * space as overflow keys used by internal pages are never deleted.
641 */
642 bigkeycnt = 0;
643 skip = *pskip;
644 full = t->bt_psize - BTDATAOFF;
645 half = full / 2;
646 used = 0;
647 for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
648 if (skip == off) {
649 nbytes = ilen;
650 isbigkey = 0; /* XXX: not really known. */
651 } else
652 switch (h->flags & P_TYPE) {
653 case P_BINTERNAL:
654 src = bi = GETBINTERNAL(h, nxt);
655 nbytes = NBINTERNAL(bi->ksize);
656 isbigkey = bi->flags & P_BIGKEY;
657 break;
658 case P_BLEAF:
659 src = bl = GETBLEAF(h, nxt);
660 nbytes = NBLEAF(bl);
661 isbigkey = bl->flags & P_BIGKEY;
662 break;
663 case P_RINTERNAL:
664 src = GETRINTERNAL(h, nxt);
665 nbytes = NRINTERNAL;
666 isbigkey = 0;
667 break;
668 case P_RLEAF:
669 src = rl = GETRLEAF(h, nxt);
670 nbytes = NRLEAF(rl);
671 isbigkey = 0;
672 break;
673 default:
674 abort();
675 }
676
677 /*
678 * If the key/data pairs are substantial fractions of the max
679 * possible size for the page, it's possible to get situations
680 * where we decide to try and copy too much onto the left page.
681 * Make sure that doesn't happen.
682 */
683 if (skip <= off && used + nbytes >= full) {
684 --off;
685 break;
686 }
687
688 /* Copy the key/data pair, if not the skipped index. */
689 if (skip != off) {
690 ++nxt;
691
692 l->linp[off] = l->upper -= nbytes;
693 memmove((char *)l + l->upper, src, nbytes);
694 }
695
696 used += nbytes;
697 if (used >= half) {
698 if (!isbigkey || bigkeycnt == 3)
699 break;
700 else
701 ++bigkeycnt;
702 }
703 }
704
705 /*
706 * Off is the last offset that's valid for the left page.
707 * Nxt is the first offset to be placed on the right page.
708 */
709 l->lower += (off + 1) * sizeof(indx_t);
710
711 /*
712 * If splitting the page that the cursor was on, the cursor has to be
713 * adjusted to point to the same record as before the split. If the
714 * cursor is at or past the skipped slot, the cursor is incremented by
715 * one. If the cursor is on the right page, it is decremented by the
716 * number of records split to the left page.
717 */
718 c = &t->bt_cursor;
719 if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
720 if (c->pg.index >= skip)
721 ++c->pg.index;
722 if (c->pg.index < nxt) /* Left page. */
723 c->pg.pgno = l->pgno;
724 else { /* Right page. */
725 c->pg.pgno = r->pgno;
726 c->pg.index -= nxt;
727 }
728 }
729
730 /*
731 * If the skipped index was on the left page, just return that page.
732 * Otherwise, adjust the skip index to reflect the new position on
733 * the right page.
734 */
735 if (skip <= off) {
736 skip = 0;
737 rval = l;
738 } else {
739 rval = r;
740 *pskip -= nxt;
741 }
742
743 for (off = 0; nxt < top; ++off) {
744 if (skip == nxt) {
745 ++off;
746 skip = 0;
747 }
748 switch (h->flags & P_TYPE) {
749 case P_BINTERNAL:
750 src = bi = GETBINTERNAL(h, nxt);
751 nbytes = NBINTERNAL(bi->ksize);
752 break;
753 case P_BLEAF:
754 src = bl = GETBLEAF(h, nxt);
755 nbytes = NBLEAF(bl);
756 break;
757 case P_RINTERNAL:
758 src = GETRINTERNAL(h, nxt);
759 nbytes = NRINTERNAL;
760 break;
761 case P_RLEAF:
762 src = rl = GETRLEAF(h, nxt);
763 nbytes = NRLEAF(rl);
764 break;
765 default:
766 abort();
767 }
768 ++nxt;
769 r->linp[off] = r->upper -= nbytes;
770 memmove((char *)r + r->upper, src, nbytes);
771 }
772 r->lower += off * sizeof(indx_t);
773
774 /* If the key is being appended to the page, adjust the index. */
775 if (skip == top)
776 r->lower += sizeof(indx_t);
777
778 return (rval);
779 }
780
781 /*
782 * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
783 *
784 * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
785 * record that references them gets deleted. Chains pointed to by internal
786 * pages never get deleted. This routine marks a chain as pointed to by an
787 * internal page.
788 *
789 * Parameters:
790 * t: tree
791 * pg: page number of first page in the chain.
792 *
793 * Returns:
794 * RET_SUCCESS, RET_ERROR.
795 */
796 static int
797 bt_preserve(t, pg)
798 BTREE *t;
799 pgno_t pg;
800 {
801 PAGE *h;
802
803 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
804 return (RET_ERROR);
805 h->flags |= P_PRESERVE;
806 mpool_put(t->bt_mp, h, MPOOL_DIRTY);
807 return (RET_SUCCESS);
808 }
809
810 /*
811 * REC_TOTAL -- Return the number of recno entries below a page.
812 *
813 * Parameters:
814 * h: page
815 *
816 * Returns:
817 * The number of recno entries below a page.
818 *
819 * XXX
820 * These values could be set by the bt_psplit routine. The problem is that the
821 * entry has to be popped off of the stack etc. or the values have to be passed
822 * all the way back to bt_split/bt_rroot and it's not very clean.
823 */
824 static recno_t
825 rec_total(h)
826 PAGE *h;
827 {
828 recno_t recs;
829 indx_t nxt, top;
830
831 for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
832 recs += GETRINTERNAL(h, nxt)->nrecs;
833 return (recs);
834 }
835