Home | History | Annotate | Line # | Download | only in btree
btree.h revision 1.1
      1  1.1  cgd /*-
      2  1.1  cgd  * Copyright (c) 1990 The Regents of the University of California.
      3  1.1  cgd  * All rights reserved.
      4  1.1  cgd  *
      5  1.1  cgd  * This code is derived from software contributed to Berkeley by
      6  1.1  cgd  * Mike Olson.
      7  1.1  cgd  *
      8  1.1  cgd  * Redistribution and use in source and binary forms, with or without
      9  1.1  cgd  * modification, are permitted provided that the following conditions
     10  1.1  cgd  * are met:
     11  1.1  cgd  * 1. Redistributions of source code must retain the above copyright
     12  1.1  cgd  *    notice, this list of conditions and the following disclaimer.
     13  1.1  cgd  * 2. Redistributions in binary form must reproduce the above copyright
     14  1.1  cgd  *    notice, this list of conditions and the following disclaimer in the
     15  1.1  cgd  *    documentation and/or other materials provided with the distribution.
     16  1.1  cgd  * 3. All advertising materials mentioning features or use of this software
     17  1.1  cgd  *    must display the following acknowledgement:
     18  1.1  cgd  *	This product includes software developed by the University of
     19  1.1  cgd  *	California, Berkeley and its contributors.
     20  1.1  cgd  * 4. Neither the name of the University nor the names of its contributors
     21  1.1  cgd  *    may be used to endorse or promote products derived from this software
     22  1.1  cgd  *    without specific prior written permission.
     23  1.1  cgd  *
     24  1.1  cgd  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
     25  1.1  cgd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     26  1.1  cgd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     27  1.1  cgd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
     28  1.1  cgd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     29  1.1  cgd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     30  1.1  cgd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     31  1.1  cgd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     32  1.1  cgd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     33  1.1  cgd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     34  1.1  cgd  * SUCH DAMAGE.
     35  1.1  cgd  */
     36  1.1  cgd 
     37  1.1  cgd /*
     38  1.1  cgd  *  @(#)btree.h	5.2 (Berkeley) 2/22/91
     39  1.1  cgd  */
     40  1.1  cgd 
     41  1.1  cgd typedef char	*BTREE;		/* should really be (void *) */
     42  1.1  cgd 
     43  1.1  cgd /* #define	DEBUG */
     44  1.1  cgd 
     45  1.1  cgd #define RET_ERROR	-1
     46  1.1  cgd #define RET_SUCCESS	 0
     47  1.1  cgd #define RET_SPECIAL	 1
     48  1.1  cgd 
     49  1.1  cgd #ifndef TRUE
     50  1.1  cgd #define TRUE	1
     51  1.1  cgd #define FALSE	0
     52  1.1  cgd #endif /* ndef TRUE */
     53  1.1  cgd 
     54  1.1  cgd #ifndef NULL
     55  1.1  cgd #define NULL	0
     56  1.1  cgd #endif /* ndef NULL */
     57  1.1  cgd 
     58  1.1  cgd /* these are defined in lrucache.c */
     59  1.1  cgd extern char	*lruinit();
     60  1.1  cgd extern char	*lruget();
     61  1.1  cgd extern char	*lrugetnew();
     62  1.1  cgd extern int	lrusync();
     63  1.1  cgd extern int	lruwrite();
     64  1.1  cgd extern int	lrurelease();
     65  1.1  cgd extern void	lrufree();
     66  1.1  cgd 
     67  1.1  cgd /* these are defined here */
     68  1.1  cgd extern BTREE	bt_open();
     69  1.1  cgd extern int	bt_close();
     70  1.1  cgd extern int	bt_delete();
     71  1.1  cgd extern int	bt_get();
     72  1.1  cgd extern int	bt_put();
     73  1.1  cgd extern int	bt_seq();
     74  1.1  cgd extern int	bt_sync();
     75  1.1  cgd 
     76  1.1  cgd /*
     77  1.1  cgd  *  Private types.  What you choose for these depends on how big you
     78  1.1  cgd  *  want to let files get, and how big you want to let pages get.
     79  1.1  cgd  */
     80  1.1  cgd 
     81  1.1  cgd typedef u_long	index_t;	/* so # bytes on a page fits in a long */
     82  1.1  cgd typedef u_long	pgno_t;		/* so # of pages in a btree fits in a long */
     83  1.1  cgd 
     84  1.1  cgd /*
     85  1.1  cgd  *  When we do searches, we push the parent page numbers onto a stack
     86  1.1  cgd  *  as we descend the tree.  This is so that for insertions, we can
     87  1.1  cgd  *  find our way back up to do internal page insertions and splits.
     88  1.1  cgd  */
     89  1.1  cgd 
     90  1.1  cgd typedef struct BTSTACK {
     91  1.1  cgd 	pgno_t		bts_pgno;
     92  1.1  cgd 	struct BTSTACK	*bts_next;
     93  1.1  cgd } BTSTACK;
     94  1.1  cgd 
     95  1.1  cgd /*
     96  1.1  cgd  *  Every btree page has a header that looks like this.  Flags are given
     97  1.1  cgd  *  in the #define's for the F_ flags (see below).
     98  1.1  cgd  */
     99  1.1  cgd 
    100  1.1  cgd typedef struct BTHEADER {
    101  1.1  cgd 	pgno_t h_pgno;		/* page number of this page */
    102  1.1  cgd 	pgno_t h_prevpg;	/* left sibling */
    103  1.1  cgd 	pgno_t h_nextpg;	/* right sibling */
    104  1.1  cgd 
    105  1.1  cgd #define F_LEAF		0x01	/* leaf page, contains user data */
    106  1.1  cgd #define F_CONT		0x02	/* continuation page (large items) */
    107  1.1  cgd #define F_DIRTY		0x04	/* need to write to disk */
    108  1.1  cgd #define F_PRESERVE	0x08	/* never delete this chain of pages */
    109  1.1  cgd 
    110  1.1  cgd 	u_long h_flags;		/* page state */
    111  1.1  cgd 	index_t h_lower;	/* lower bound of free space on page */
    112  1.1  cgd 	index_t h_upper;	/* upper bound of free space on page */
    113  1.1  cgd 	index_t h_linp[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
    114  1.1  cgd } BTHEADER;
    115  1.1  cgd 
    116  1.1  cgd /*
    117  1.1  cgd  *  HTBUCKETs are hash table buckets for looking up pages of in-memory
    118  1.1  cgd  *  btrees by page number.  We use this indirection, rather than direct
    119  1.1  cgd  *  pointers, so that the code for manipulating in-memory trees is the
    120  1.1  cgd  *  same as that for manipulating on-disk trees.
    121  1.1  cgd  */
    122  1.1  cgd 
    123  1.1  cgd typedef struct HTBUCKET {
    124  1.1  cgd 	pgno_t		ht_pgno;
    125  1.1  cgd 	BTHEADER	*ht_page;
    126  1.1  cgd 	struct HTBUCKET	*ht_next;
    127  1.1  cgd } HTBUCKET;
    128  1.1  cgd 
    129  1.1  cgd typedef HTBUCKET	**HTABLE;
    130  1.1  cgd 
    131  1.1  cgd /* minimum size we'll let a page be */
    132  1.1  cgd #define MINPSIZE	512
    133  1.1  cgd 
    134  1.1  cgd /* default cache size, in bytes */
    135  1.1  cgd #define DEFCACHE	(20 * 1024)
    136  1.1  cgd 
    137  1.1  cgd /* hash table size for in-memory trees */
    138  1.1  cgd #define	HTSIZE		128
    139  1.1  cgd 
    140  1.1  cgd /* generate a hash key from a page number */
    141  1.1  cgd #define HASHKEY(pgno)	((pgno - 1) % HTSIZE)
    142  1.1  cgd 
    143  1.1  cgd /*
    144  1.1  cgd  *  Disk btrees have a file descriptor, and may also have an lru buffer
    145  1.1  cgd  *  cache, if the user asked for one.
    146  1.1  cgd  */
    147  1.1  cgd 
    148  1.1  cgd typedef struct BTDISK {
    149  1.1  cgd 	int	d_fd;
    150  1.1  cgd 	char	*d_cache;
    151  1.1  cgd } BTDISK;
    152  1.1  cgd 
    153  1.1  cgd /*
    154  1.1  cgd  *  Cursors keep track of the current location in a sequential scan of
    155  1.1  cgd  *  the database.  Since btrees impose a total ordering on keys, we can
    156  1.1  cgd  *  walk forward or backward through the database from any point.  Cursors
    157  1.1  cgd  *  survive updates to the tree, and can be used to delete a particular
    158  1.1  cgd  *  record.
    159  1.1  cgd  */
    160  1.1  cgd 
    161  1.1  cgd typedef struct CURSOR {
    162  1.1  cgd 	pgno_t		c_pgno;		/* pgno of current item in scan */
    163  1.1  cgd 	index_t		c_index;	/* index of current item in scan */
    164  1.1  cgd 	char		*c_key;		/* current key, used for updates */
    165  1.1  cgd 
    166  1.1  cgd #define CRSR_BEFORE	0x01
    167  1.1  cgd 
    168  1.1  cgd 	u_char		c_flags;	/* to handle updates properly */
    169  1.1  cgd } CURSOR;
    170  1.1  cgd 
    171  1.1  cgd /*
    172  1.1  cgd  *  The private btree data structure.  The user passes a pointer to one of
    173  1.1  cgd  *  these when we are to manipulate a tree, but the BTREE type is opaque
    174  1.1  cgd  *  to him.
    175  1.1  cgd  */
    176  1.1  cgd 
    177  1.1  cgd typedef struct BTREEDATA_P {
    178  1.1  cgd 	char		*bt_fname;		/* NULL for in-memory trees */
    179  1.1  cgd 	union {
    180  1.1  cgd 		BTDISK	bt_d;			/* for on-disk btrees */
    181  1.1  cgd 		HTABLE	bt_ht;			/* hash table for mem trees */
    182  1.1  cgd 	} bt_s;
    183  1.1  cgd 	size_t		bt_psize;		/* page size for btree pages */
    184  1.1  cgd 	int		(*bt_compare)();	/* key comparison function */
    185  1.1  cgd 	pgno_t		bt_npages;		/* number of pages in tree */
    186  1.1  cgd 	BTHEADER	*bt_curpage;		/* current page contents */
    187  1.1  cgd 	pgno_t		bt_free;		/* free pg list for big data */
    188  1.1  cgd 	CURSOR		bt_cursor;		/* cursor for scans */
    189  1.1  cgd 	BTSTACK		*bt_stack;		/* parent stack for inserts */
    190  1.1  cgd 	u_long		bt_lorder;		/* byte order (endian.h) */
    191  1.1  cgd 
    192  1.1  cgd #define BTF_METAOK	0x01	/* meta-data written to start of file */
    193  1.1  cgd #define BTF_SEQINIT	0x02	/* we have called bt_seq */
    194  1.1  cgd #define BTF_ISWRITE	0x04	/* tree was opened for write */
    195  1.1  cgd #define BTF_NODUPS	0x08	/* tree created for unique keys */
    196  1.1  cgd 
    197  1.1  cgd 	u_long		bt_flags;		/* btree state */
    198  1.1  cgd } BTREEDATA_P;
    199  1.1  cgd 
    200  1.1  cgd typedef BTREEDATA_P	*BTREE_P;
    201  1.1  cgd 
    202  1.1  cgd /*
    203  1.1  cgd  *  The first thing in a btree file is a BTMETA structure.  The rest of
    204  1.1  cgd  *  the first page is empty, so that all disk operations are page-aligned.
    205  1.1  cgd  */
    206  1.1  cgd 
    207  1.1  cgd typedef struct BTMETA {
    208  1.1  cgd 	u_long	m_magic;
    209  1.1  cgd 	u_long	m_version;
    210  1.1  cgd 	size_t	m_psize;
    211  1.1  cgd 	pgno_t	m_free;
    212  1.1  cgd 	u_long	m_flags;
    213  1.1  cgd 	u_long	m_lorder;
    214  1.1  cgd } BTMETA;
    215  1.1  cgd 
    216  1.1  cgd #define P_NONE		0		/* invalid page number in tree */
    217  1.1  cgd #define P_ROOT		1		/* page number of root pg in btree */
    218  1.1  cgd 
    219  1.1  cgd #define NORELEASE	0		/* don't release a page during write */
    220  1.1  cgd #define RELEASE		1		/* release a page during write */
    221  1.1  cgd 
    222  1.1  cgd #define INSERT		0		/* doing an insert operation */
    223  1.1  cgd #define DELETE		1		/* doing a delete operation */
    224  1.1  cgd 
    225  1.1  cgd /* get the next free index on a btree page */
    226  1.1  cgd #define NEXTINDEX(p)	((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t)))
    227  1.1  cgd 
    228  1.1  cgd /* is a BTITEM actually on the btree page? */
    229  1.1  cgd #define VALIDITEM(t, i)	((i)->bti_index < NEXTINDEX((t)->bt_curpage))
    230  1.1  cgd 
    231  1.1  cgd /* guarantee longword alignment so structure refs work */
    232  1.1  cgd #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03)
    233  1.1  cgd 
    234  1.1  cgd /* get a particular datum (or idatum) off a page */
    235  1.1  cgd #define GETDATUM(h,i)	 (((char *) h) + h->h_linp[i])
    236  1.1  cgd 
    237  1.1  cgd /* is a {key,datum} too big to put on a single page? */
    238  1.1  cgd #define TOOBIG(t, sz)	(sz >= t->bt_psize / 5)
    239  1.1  cgd 
    240  1.1  cgd /* is this a disk tree or a memory tree? */
    241  1.1  cgd #define ISDISK(t)	(t->bt_fname != (char *) NULL)
    242  1.1  cgd 
    243  1.1  cgd /* does the disk tree use a cache? */
    244  1.1  cgd #define ISCACHE(t)	(t->bt_s.bt_d.d_cache != (char *) NULL)
    245  1.1  cgd 
    246  1.1  cgd /*
    247  1.1  cgd  *  DATUMs are for user data -- one appears on leaf pages for every
    248  1.1  cgd  *  tree entry.  The d_bytes[] array contains the key first, then the data.
    249  1.1  cgd  *
    250  1.1  cgd  *  If either the key or the datum is too big to store on a single page,
    251  1.1  cgd  *  a bit is set in the flags entry, and the d_bytes[] array contains a
    252  1.1  cgd  *  pgno pointing to the page at which the data is actually stored.
    253  1.1  cgd  *
    254  1.1  cgd  *  Note on alignment:  every DATUM is guaranteed to be longword aligned
    255  1.1  cgd  *  on the disk page.  In order to force longword alignment of user key
    256  1.1  cgd  *  and data values, we must guarantee that the d_bytes[] array starts
    257  1.1  cgd  *  on a longword boundary.  This is the reason that d_flags is a u_long,
    258  1.1  cgd  *  rather than a u_char (it really only needs to be two bits big).  This
    259  1.1  cgd  *  is necessary because we call the user's comparison function with a
    260  1.1  cgd  *  pointer to the start of the d_bytes array.  We don't need to force
    261  1.1  cgd  *  longword alignment of the data following the key, since that is copied
    262  1.1  cgd  *  to a longword-aligned buffer before being returned to the user.
    263  1.1  cgd  */
    264  1.1  cgd 
    265  1.1  cgd typedef struct DATUM {
    266  1.1  cgd 	size_t d_ksize;		/* size of key */
    267  1.1  cgd 	size_t d_dsize;		/* size of data */
    268  1.1  cgd 
    269  1.1  cgd #define D_BIGDATA	0x01	/* indirect datum ptr flag */
    270  1.1  cgd #define D_BIGKEY	0x02	/* indirect key ptr flag */
    271  1.1  cgd 
    272  1.1  cgd 	u_long d_flags;		/* flags (indirect bit) */
    273  1.1  cgd 	char d_bytes[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
    274  1.1  cgd } DATUM;
    275  1.1  cgd 
    276  1.1  cgd /* BTITEMs are used to return (page, index, datum) tuples from searches */
    277  1.1  cgd typedef struct BTITEM {
    278  1.1  cgd 	pgno_t bti_pgno;
    279  1.1  cgd 	index_t bti_index;
    280  1.1  cgd 	DATUM *bti_datum;
    281  1.1  cgd } BTITEM;
    282  1.1  cgd 
    283  1.1  cgd /*
    284  1.1  cgd  *  IDATUMs are for data stored on internal pages.  This is the (key, pgno)
    285  1.1  cgd  *  pair, such that key 'key' is the first entry on page 'pgno'.  If our
    286  1.1  cgd  *  internal page contains keys (a) and (b) next to each other, then all
    287  1.1  cgd  *  items >= to (a) and < (b) go on the same page as (a).  There are some
    288  1.1  cgd  *  gotchas with duplicate keys, however.  See the split code for details.
    289  1.1  cgd  *
    290  1.1  cgd  *  If a key is too big to fit on a single page, then the i_bytes[] array
    291  1.1  cgd  *  contains a pgno pointing to the start of a chain that actually stores
    292  1.1  cgd  *  the bytes.  Since items on internal pages are never deleted from the
    293  1.1  cgd  *  tree, these indirect chains are marked as special, so that they won't
    294  1.1  cgd  *  be deleted if the corresponding leaf item is deleted.
    295  1.1  cgd  *
    296  1.1  cgd  *  As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char)
    297  1.1  cgd  *  in order to guarantee that user keys are longword aligned on the disk
    298  1.1  cgd  *  page.
    299  1.1  cgd  */
    300  1.1  cgd 
    301  1.1  cgd typedef struct IDATUM {
    302  1.1  cgd 	size_t i_size;
    303  1.1  cgd 	pgno_t i_pgno;
    304  1.1  cgd 	u_long i_flags;		/* see DATUM.d_flags, above */
    305  1.1  cgd 	char i_bytes[1];	/* VARIABLE LENGTH DATA AT END OF STRUCT */
    306  1.1  cgd } IDATUM;
    307  1.1  cgd 
    308  1.1  cgd /* all private interfaces have a leading _ in their names */
    309  1.1  cgd extern BTITEM	*_bt_search();
    310  1.1  cgd extern BTITEM	*_bt_searchr();
    311  1.1  cgd extern BTHEADER	*_bt_allocpg();
    312  1.1  cgd extern index_t	_bt_binsrch();
    313  1.1  cgd extern int	_bt_isonpage();
    314  1.1  cgd extern BTITEM	*_bt_first();
    315  1.1  cgd extern int	_bt_release();
    316  1.1  cgd extern int	_bt_wrtmeta();
    317  1.1  cgd extern int	_bt_delindir();
    318  1.1  cgd extern int	_bt_pgout();
    319  1.1  cgd extern int	_bt_pgin();
    320  1.1  cgd extern int	_bt_fixscan();
    321  1.1  cgd extern int	_bt_indirect();
    322  1.1  cgd extern int	_bt_crsrdel();
    323  1.1  cgd extern int	_bt_push();
    324  1.1  cgd extern pgno_t	_bt_pop();
    325  1.1  cgd extern int	strcmp();
    326  1.1  cgd 
    327