btree.h revision 1.1 1 1.1 cgd /*-
2 1.1 cgd * Copyright (c) 1990 The Regents of the University of California.
3 1.1 cgd * All rights reserved.
4 1.1 cgd *
5 1.1 cgd * This code is derived from software contributed to Berkeley by
6 1.1 cgd * Mike Olson.
7 1.1 cgd *
8 1.1 cgd * Redistribution and use in source and binary forms, with or without
9 1.1 cgd * modification, are permitted provided that the following conditions
10 1.1 cgd * are met:
11 1.1 cgd * 1. Redistributions of source code must retain the above copyright
12 1.1 cgd * notice, this list of conditions and the following disclaimer.
13 1.1 cgd * 2. Redistributions in binary form must reproduce the above copyright
14 1.1 cgd * notice, this list of conditions and the following disclaimer in the
15 1.1 cgd * documentation and/or other materials provided with the distribution.
16 1.1 cgd * 3. All advertising materials mentioning features or use of this software
17 1.1 cgd * must display the following acknowledgement:
18 1.1 cgd * This product includes software developed by the University of
19 1.1 cgd * California, Berkeley and its contributors.
20 1.1 cgd * 4. Neither the name of the University nor the names of its contributors
21 1.1 cgd * may be used to endorse or promote products derived from this software
22 1.1 cgd * without specific prior written permission.
23 1.1 cgd *
24 1.1 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 1.1 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 1.1 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 1.1 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 1.1 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 1.1 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 1.1 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 1.1 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 1.1 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 1.1 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 1.1 cgd * SUCH DAMAGE.
35 1.1 cgd */
36 1.1 cgd
37 1.1 cgd /*
38 1.1 cgd * @(#)btree.h 5.2 (Berkeley) 2/22/91
39 1.1 cgd */
40 1.1 cgd
41 1.1 cgd typedef char *BTREE; /* should really be (void *) */
42 1.1 cgd
43 1.1 cgd /* #define DEBUG */
44 1.1 cgd
45 1.1 cgd #define RET_ERROR -1
46 1.1 cgd #define RET_SUCCESS 0
47 1.1 cgd #define RET_SPECIAL 1
48 1.1 cgd
49 1.1 cgd #ifndef TRUE
50 1.1 cgd #define TRUE 1
51 1.1 cgd #define FALSE 0
52 1.1 cgd #endif /* ndef TRUE */
53 1.1 cgd
54 1.1 cgd #ifndef NULL
55 1.1 cgd #define NULL 0
56 1.1 cgd #endif /* ndef NULL */
57 1.1 cgd
58 1.1 cgd /* these are defined in lrucache.c */
59 1.1 cgd extern char *lruinit();
60 1.1 cgd extern char *lruget();
61 1.1 cgd extern char *lrugetnew();
62 1.1 cgd extern int lrusync();
63 1.1 cgd extern int lruwrite();
64 1.1 cgd extern int lrurelease();
65 1.1 cgd extern void lrufree();
66 1.1 cgd
67 1.1 cgd /* these are defined here */
68 1.1 cgd extern BTREE bt_open();
69 1.1 cgd extern int bt_close();
70 1.1 cgd extern int bt_delete();
71 1.1 cgd extern int bt_get();
72 1.1 cgd extern int bt_put();
73 1.1 cgd extern int bt_seq();
74 1.1 cgd extern int bt_sync();
75 1.1 cgd
76 1.1 cgd /*
77 1.1 cgd * Private types. What you choose for these depends on how big you
78 1.1 cgd * want to let files get, and how big you want to let pages get.
79 1.1 cgd */
80 1.1 cgd
81 1.1 cgd typedef u_long index_t; /* so # bytes on a page fits in a long */
82 1.1 cgd typedef u_long pgno_t; /* so # of pages in a btree fits in a long */
83 1.1 cgd
84 1.1 cgd /*
85 1.1 cgd * When we do searches, we push the parent page numbers onto a stack
86 1.1 cgd * as we descend the tree. This is so that for insertions, we can
87 1.1 cgd * find our way back up to do internal page insertions and splits.
88 1.1 cgd */
89 1.1 cgd
90 1.1 cgd typedef struct BTSTACK {
91 1.1 cgd pgno_t bts_pgno;
92 1.1 cgd struct BTSTACK *bts_next;
93 1.1 cgd } BTSTACK;
94 1.1 cgd
95 1.1 cgd /*
96 1.1 cgd * Every btree page has a header that looks like this. Flags are given
97 1.1 cgd * in the #define's for the F_ flags (see below).
98 1.1 cgd */
99 1.1 cgd
100 1.1 cgd typedef struct BTHEADER {
101 1.1 cgd pgno_t h_pgno; /* page number of this page */
102 1.1 cgd pgno_t h_prevpg; /* left sibling */
103 1.1 cgd pgno_t h_nextpg; /* right sibling */
104 1.1 cgd
105 1.1 cgd #define F_LEAF 0x01 /* leaf page, contains user data */
106 1.1 cgd #define F_CONT 0x02 /* continuation page (large items) */
107 1.1 cgd #define F_DIRTY 0x04 /* need to write to disk */
108 1.1 cgd #define F_PRESERVE 0x08 /* never delete this chain of pages */
109 1.1 cgd
110 1.1 cgd u_long h_flags; /* page state */
111 1.1 cgd index_t h_lower; /* lower bound of free space on page */
112 1.1 cgd index_t h_upper; /* upper bound of free space on page */
113 1.1 cgd index_t h_linp[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */
114 1.1 cgd } BTHEADER;
115 1.1 cgd
116 1.1 cgd /*
117 1.1 cgd * HTBUCKETs are hash table buckets for looking up pages of in-memory
118 1.1 cgd * btrees by page number. We use this indirection, rather than direct
119 1.1 cgd * pointers, so that the code for manipulating in-memory trees is the
120 1.1 cgd * same as that for manipulating on-disk trees.
121 1.1 cgd */
122 1.1 cgd
123 1.1 cgd typedef struct HTBUCKET {
124 1.1 cgd pgno_t ht_pgno;
125 1.1 cgd BTHEADER *ht_page;
126 1.1 cgd struct HTBUCKET *ht_next;
127 1.1 cgd } HTBUCKET;
128 1.1 cgd
129 1.1 cgd typedef HTBUCKET **HTABLE;
130 1.1 cgd
131 1.1 cgd /* minimum size we'll let a page be */
132 1.1 cgd #define MINPSIZE 512
133 1.1 cgd
134 1.1 cgd /* default cache size, in bytes */
135 1.1 cgd #define DEFCACHE (20 * 1024)
136 1.1 cgd
137 1.1 cgd /* hash table size for in-memory trees */
138 1.1 cgd #define HTSIZE 128
139 1.1 cgd
140 1.1 cgd /* generate a hash key from a page number */
141 1.1 cgd #define HASHKEY(pgno) ((pgno - 1) % HTSIZE)
142 1.1 cgd
143 1.1 cgd /*
144 1.1 cgd * Disk btrees have a file descriptor, and may also have an lru buffer
145 1.1 cgd * cache, if the user asked for one.
146 1.1 cgd */
147 1.1 cgd
148 1.1 cgd typedef struct BTDISK {
149 1.1 cgd int d_fd;
150 1.1 cgd char *d_cache;
151 1.1 cgd } BTDISK;
152 1.1 cgd
153 1.1 cgd /*
154 1.1 cgd * Cursors keep track of the current location in a sequential scan of
155 1.1 cgd * the database. Since btrees impose a total ordering on keys, we can
156 1.1 cgd * walk forward or backward through the database from any point. Cursors
157 1.1 cgd * survive updates to the tree, and can be used to delete a particular
158 1.1 cgd * record.
159 1.1 cgd */
160 1.1 cgd
161 1.1 cgd typedef struct CURSOR {
162 1.1 cgd pgno_t c_pgno; /* pgno of current item in scan */
163 1.1 cgd index_t c_index; /* index of current item in scan */
164 1.1 cgd char *c_key; /* current key, used for updates */
165 1.1 cgd
166 1.1 cgd #define CRSR_BEFORE 0x01
167 1.1 cgd
168 1.1 cgd u_char c_flags; /* to handle updates properly */
169 1.1 cgd } CURSOR;
170 1.1 cgd
171 1.1 cgd /*
172 1.1 cgd * The private btree data structure. The user passes a pointer to one of
173 1.1 cgd * these when we are to manipulate a tree, but the BTREE type is opaque
174 1.1 cgd * to him.
175 1.1 cgd */
176 1.1 cgd
177 1.1 cgd typedef struct BTREEDATA_P {
178 1.1 cgd char *bt_fname; /* NULL for in-memory trees */
179 1.1 cgd union {
180 1.1 cgd BTDISK bt_d; /* for on-disk btrees */
181 1.1 cgd HTABLE bt_ht; /* hash table for mem trees */
182 1.1 cgd } bt_s;
183 1.1 cgd size_t bt_psize; /* page size for btree pages */
184 1.1 cgd int (*bt_compare)(); /* key comparison function */
185 1.1 cgd pgno_t bt_npages; /* number of pages in tree */
186 1.1 cgd BTHEADER *bt_curpage; /* current page contents */
187 1.1 cgd pgno_t bt_free; /* free pg list for big data */
188 1.1 cgd CURSOR bt_cursor; /* cursor for scans */
189 1.1 cgd BTSTACK *bt_stack; /* parent stack for inserts */
190 1.1 cgd u_long bt_lorder; /* byte order (endian.h) */
191 1.1 cgd
192 1.1 cgd #define BTF_METAOK 0x01 /* meta-data written to start of file */
193 1.1 cgd #define BTF_SEQINIT 0x02 /* we have called bt_seq */
194 1.1 cgd #define BTF_ISWRITE 0x04 /* tree was opened for write */
195 1.1 cgd #define BTF_NODUPS 0x08 /* tree created for unique keys */
196 1.1 cgd
197 1.1 cgd u_long bt_flags; /* btree state */
198 1.1 cgd } BTREEDATA_P;
199 1.1 cgd
200 1.1 cgd typedef BTREEDATA_P *BTREE_P;
201 1.1 cgd
202 1.1 cgd /*
203 1.1 cgd * The first thing in a btree file is a BTMETA structure. The rest of
204 1.1 cgd * the first page is empty, so that all disk operations are page-aligned.
205 1.1 cgd */
206 1.1 cgd
207 1.1 cgd typedef struct BTMETA {
208 1.1 cgd u_long m_magic;
209 1.1 cgd u_long m_version;
210 1.1 cgd size_t m_psize;
211 1.1 cgd pgno_t m_free;
212 1.1 cgd u_long m_flags;
213 1.1 cgd u_long m_lorder;
214 1.1 cgd } BTMETA;
215 1.1 cgd
216 1.1 cgd #define P_NONE 0 /* invalid page number in tree */
217 1.1 cgd #define P_ROOT 1 /* page number of root pg in btree */
218 1.1 cgd
219 1.1 cgd #define NORELEASE 0 /* don't release a page during write */
220 1.1 cgd #define RELEASE 1 /* release a page during write */
221 1.1 cgd
222 1.1 cgd #define INSERT 0 /* doing an insert operation */
223 1.1 cgd #define DELETE 1 /* doing a delete operation */
224 1.1 cgd
225 1.1 cgd /* get the next free index on a btree page */
226 1.1 cgd #define NEXTINDEX(p) ((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t)))
227 1.1 cgd
228 1.1 cgd /* is a BTITEM actually on the btree page? */
229 1.1 cgd #define VALIDITEM(t, i) ((i)->bti_index < NEXTINDEX((t)->bt_curpage))
230 1.1 cgd
231 1.1 cgd /* guarantee longword alignment so structure refs work */
232 1.1 cgd #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03)
233 1.1 cgd
234 1.1 cgd /* get a particular datum (or idatum) off a page */
235 1.1 cgd #define GETDATUM(h,i) (((char *) h) + h->h_linp[i])
236 1.1 cgd
237 1.1 cgd /* is a {key,datum} too big to put on a single page? */
238 1.1 cgd #define TOOBIG(t, sz) (sz >= t->bt_psize / 5)
239 1.1 cgd
240 1.1 cgd /* is this a disk tree or a memory tree? */
241 1.1 cgd #define ISDISK(t) (t->bt_fname != (char *) NULL)
242 1.1 cgd
243 1.1 cgd /* does the disk tree use a cache? */
244 1.1 cgd #define ISCACHE(t) (t->bt_s.bt_d.d_cache != (char *) NULL)
245 1.1 cgd
246 1.1 cgd /*
247 1.1 cgd * DATUMs are for user data -- one appears on leaf pages for every
248 1.1 cgd * tree entry. The d_bytes[] array contains the key first, then the data.
249 1.1 cgd *
250 1.1 cgd * If either the key or the datum is too big to store on a single page,
251 1.1 cgd * a bit is set in the flags entry, and the d_bytes[] array contains a
252 1.1 cgd * pgno pointing to the page at which the data is actually stored.
253 1.1 cgd *
254 1.1 cgd * Note on alignment: every DATUM is guaranteed to be longword aligned
255 1.1 cgd * on the disk page. In order to force longword alignment of user key
256 1.1 cgd * and data values, we must guarantee that the d_bytes[] array starts
257 1.1 cgd * on a longword boundary. This is the reason that d_flags is a u_long,
258 1.1 cgd * rather than a u_char (it really only needs to be two bits big). This
259 1.1 cgd * is necessary because we call the user's comparison function with a
260 1.1 cgd * pointer to the start of the d_bytes array. We don't need to force
261 1.1 cgd * longword alignment of the data following the key, since that is copied
262 1.1 cgd * to a longword-aligned buffer before being returned to the user.
263 1.1 cgd */
264 1.1 cgd
265 1.1 cgd typedef struct DATUM {
266 1.1 cgd size_t d_ksize; /* size of key */
267 1.1 cgd size_t d_dsize; /* size of data */
268 1.1 cgd
269 1.1 cgd #define D_BIGDATA 0x01 /* indirect datum ptr flag */
270 1.1 cgd #define D_BIGKEY 0x02 /* indirect key ptr flag */
271 1.1 cgd
272 1.1 cgd u_long d_flags; /* flags (indirect bit) */
273 1.1 cgd char d_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */
274 1.1 cgd } DATUM;
275 1.1 cgd
276 1.1 cgd /* BTITEMs are used to return (page, index, datum) tuples from searches */
277 1.1 cgd typedef struct BTITEM {
278 1.1 cgd pgno_t bti_pgno;
279 1.1 cgd index_t bti_index;
280 1.1 cgd DATUM *bti_datum;
281 1.1 cgd } BTITEM;
282 1.1 cgd
283 1.1 cgd /*
284 1.1 cgd * IDATUMs are for data stored on internal pages. This is the (key, pgno)
285 1.1 cgd * pair, such that key 'key' is the first entry on page 'pgno'. If our
286 1.1 cgd * internal page contains keys (a) and (b) next to each other, then all
287 1.1 cgd * items >= to (a) and < (b) go on the same page as (a). There are some
288 1.1 cgd * gotchas with duplicate keys, however. See the split code for details.
289 1.1 cgd *
290 1.1 cgd * If a key is too big to fit on a single page, then the i_bytes[] array
291 1.1 cgd * contains a pgno pointing to the start of a chain that actually stores
292 1.1 cgd * the bytes. Since items on internal pages are never deleted from the
293 1.1 cgd * tree, these indirect chains are marked as special, so that they won't
294 1.1 cgd * be deleted if the corresponding leaf item is deleted.
295 1.1 cgd *
296 1.1 cgd * As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char)
297 1.1 cgd * in order to guarantee that user keys are longword aligned on the disk
298 1.1 cgd * page.
299 1.1 cgd */
300 1.1 cgd
301 1.1 cgd typedef struct IDATUM {
302 1.1 cgd size_t i_size;
303 1.1 cgd pgno_t i_pgno;
304 1.1 cgd u_long i_flags; /* see DATUM.d_flags, above */
305 1.1 cgd char i_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */
306 1.1 cgd } IDATUM;
307 1.1 cgd
308 1.1 cgd /* all private interfaces have a leading _ in their names */
309 1.1 cgd extern BTITEM *_bt_search();
310 1.1 cgd extern BTITEM *_bt_searchr();
311 1.1 cgd extern BTHEADER *_bt_allocpg();
312 1.1 cgd extern index_t _bt_binsrch();
313 1.1 cgd extern int _bt_isonpage();
314 1.1 cgd extern BTITEM *_bt_first();
315 1.1 cgd extern int _bt_release();
316 1.1 cgd extern int _bt_wrtmeta();
317 1.1 cgd extern int _bt_delindir();
318 1.1 cgd extern int _bt_pgout();
319 1.1 cgd extern int _bt_pgin();
320 1.1 cgd extern int _bt_fixscan();
321 1.1 cgd extern int _bt_indirect();
322 1.1 cgd extern int _bt_crsrdel();
323 1.1 cgd extern int _bt_push();
324 1.1 cgd extern pgno_t _bt_pop();
325 1.1 cgd extern int strcmp();
326 1.1 cgd
327