btree.h revision 1.1 1 /*-
2 * Copyright (c) 1990 The Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Mike Olson.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37 /*
38 * @(#)btree.h 5.2 (Berkeley) 2/22/91
39 */
40
41 typedef char *BTREE; /* should really be (void *) */
42
43 /* #define DEBUG */
44
45 #define RET_ERROR -1
46 #define RET_SUCCESS 0
47 #define RET_SPECIAL 1
48
49 #ifndef TRUE
50 #define TRUE 1
51 #define FALSE 0
52 #endif /* ndef TRUE */
53
54 #ifndef NULL
55 #define NULL 0
56 #endif /* ndef NULL */
57
58 /* these are defined in lrucache.c */
59 extern char *lruinit();
60 extern char *lruget();
61 extern char *lrugetnew();
62 extern int lrusync();
63 extern int lruwrite();
64 extern int lrurelease();
65 extern void lrufree();
66
67 /* these are defined here */
68 extern BTREE bt_open();
69 extern int bt_close();
70 extern int bt_delete();
71 extern int bt_get();
72 extern int bt_put();
73 extern int bt_seq();
74 extern int bt_sync();
75
76 /*
77 * Private types. What you choose for these depends on how big you
78 * want to let files get, and how big you want to let pages get.
79 */
80
81 typedef u_long index_t; /* so # bytes on a page fits in a long */
82 typedef u_long pgno_t; /* so # of pages in a btree fits in a long */
83
84 /*
85 * When we do searches, we push the parent page numbers onto a stack
86 * as we descend the tree. This is so that for insertions, we can
87 * find our way back up to do internal page insertions and splits.
88 */
89
90 typedef struct BTSTACK {
91 pgno_t bts_pgno;
92 struct BTSTACK *bts_next;
93 } BTSTACK;
94
95 /*
96 * Every btree page has a header that looks like this. Flags are given
97 * in the #define's for the F_ flags (see below).
98 */
99
100 typedef struct BTHEADER {
101 pgno_t h_pgno; /* page number of this page */
102 pgno_t h_prevpg; /* left sibling */
103 pgno_t h_nextpg; /* right sibling */
104
105 #define F_LEAF 0x01 /* leaf page, contains user data */
106 #define F_CONT 0x02 /* continuation page (large items) */
107 #define F_DIRTY 0x04 /* need to write to disk */
108 #define F_PRESERVE 0x08 /* never delete this chain of pages */
109
110 u_long h_flags; /* page state */
111 index_t h_lower; /* lower bound of free space on page */
112 index_t h_upper; /* upper bound of free space on page */
113 index_t h_linp[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */
114 } BTHEADER;
115
116 /*
117 * HTBUCKETs are hash table buckets for looking up pages of in-memory
118 * btrees by page number. We use this indirection, rather than direct
119 * pointers, so that the code for manipulating in-memory trees is the
120 * same as that for manipulating on-disk trees.
121 */
122
123 typedef struct HTBUCKET {
124 pgno_t ht_pgno;
125 BTHEADER *ht_page;
126 struct HTBUCKET *ht_next;
127 } HTBUCKET;
128
129 typedef HTBUCKET **HTABLE;
130
131 /* minimum size we'll let a page be */
132 #define MINPSIZE 512
133
134 /* default cache size, in bytes */
135 #define DEFCACHE (20 * 1024)
136
137 /* hash table size for in-memory trees */
138 #define HTSIZE 128
139
140 /* generate a hash key from a page number */
141 #define HASHKEY(pgno) ((pgno - 1) % HTSIZE)
142
143 /*
144 * Disk btrees have a file descriptor, and may also have an lru buffer
145 * cache, if the user asked for one.
146 */
147
148 typedef struct BTDISK {
149 int d_fd;
150 char *d_cache;
151 } BTDISK;
152
153 /*
154 * Cursors keep track of the current location in a sequential scan of
155 * the database. Since btrees impose a total ordering on keys, we can
156 * walk forward or backward through the database from any point. Cursors
157 * survive updates to the tree, and can be used to delete a particular
158 * record.
159 */
160
161 typedef struct CURSOR {
162 pgno_t c_pgno; /* pgno of current item in scan */
163 index_t c_index; /* index of current item in scan */
164 char *c_key; /* current key, used for updates */
165
166 #define CRSR_BEFORE 0x01
167
168 u_char c_flags; /* to handle updates properly */
169 } CURSOR;
170
171 /*
172 * The private btree data structure. The user passes a pointer to one of
173 * these when we are to manipulate a tree, but the BTREE type is opaque
174 * to him.
175 */
176
177 typedef struct BTREEDATA_P {
178 char *bt_fname; /* NULL for in-memory trees */
179 union {
180 BTDISK bt_d; /* for on-disk btrees */
181 HTABLE bt_ht; /* hash table for mem trees */
182 } bt_s;
183 size_t bt_psize; /* page size for btree pages */
184 int (*bt_compare)(); /* key comparison function */
185 pgno_t bt_npages; /* number of pages in tree */
186 BTHEADER *bt_curpage; /* current page contents */
187 pgno_t bt_free; /* free pg list for big data */
188 CURSOR bt_cursor; /* cursor for scans */
189 BTSTACK *bt_stack; /* parent stack for inserts */
190 u_long bt_lorder; /* byte order (endian.h) */
191
192 #define BTF_METAOK 0x01 /* meta-data written to start of file */
193 #define BTF_SEQINIT 0x02 /* we have called bt_seq */
194 #define BTF_ISWRITE 0x04 /* tree was opened for write */
195 #define BTF_NODUPS 0x08 /* tree created for unique keys */
196
197 u_long bt_flags; /* btree state */
198 } BTREEDATA_P;
199
200 typedef BTREEDATA_P *BTREE_P;
201
202 /*
203 * The first thing in a btree file is a BTMETA structure. The rest of
204 * the first page is empty, so that all disk operations are page-aligned.
205 */
206
207 typedef struct BTMETA {
208 u_long m_magic;
209 u_long m_version;
210 size_t m_psize;
211 pgno_t m_free;
212 u_long m_flags;
213 u_long m_lorder;
214 } BTMETA;
215
216 #define P_NONE 0 /* invalid page number in tree */
217 #define P_ROOT 1 /* page number of root pg in btree */
218
219 #define NORELEASE 0 /* don't release a page during write */
220 #define RELEASE 1 /* release a page during write */
221
222 #define INSERT 0 /* doing an insert operation */
223 #define DELETE 1 /* doing a delete operation */
224
225 /* get the next free index on a btree page */
226 #define NEXTINDEX(p) ((((int)(p)->h_lower) - ((int)((((char *)(&(p)->h_linp[0]))) - ((char *) (p)))))/(sizeof(index_t)))
227
228 /* is a BTITEM actually on the btree page? */
229 #define VALIDITEM(t, i) ((i)->bti_index < NEXTINDEX((t)->bt_curpage))
230
231 /* guarantee longword alignment so structure refs work */
232 #define LONGALIGN(p) (((long)(p) + 3) & ~ 0x03)
233
234 /* get a particular datum (or idatum) off a page */
235 #define GETDATUM(h,i) (((char *) h) + h->h_linp[i])
236
237 /* is a {key,datum} too big to put on a single page? */
238 #define TOOBIG(t, sz) (sz >= t->bt_psize / 5)
239
240 /* is this a disk tree or a memory tree? */
241 #define ISDISK(t) (t->bt_fname != (char *) NULL)
242
243 /* does the disk tree use a cache? */
244 #define ISCACHE(t) (t->bt_s.bt_d.d_cache != (char *) NULL)
245
246 /*
247 * DATUMs are for user data -- one appears on leaf pages for every
248 * tree entry. The d_bytes[] array contains the key first, then the data.
249 *
250 * If either the key or the datum is too big to store on a single page,
251 * a bit is set in the flags entry, and the d_bytes[] array contains a
252 * pgno pointing to the page at which the data is actually stored.
253 *
254 * Note on alignment: every DATUM is guaranteed to be longword aligned
255 * on the disk page. In order to force longword alignment of user key
256 * and data values, we must guarantee that the d_bytes[] array starts
257 * on a longword boundary. This is the reason that d_flags is a u_long,
258 * rather than a u_char (it really only needs to be two bits big). This
259 * is necessary because we call the user's comparison function with a
260 * pointer to the start of the d_bytes array. We don't need to force
261 * longword alignment of the data following the key, since that is copied
262 * to a longword-aligned buffer before being returned to the user.
263 */
264
265 typedef struct DATUM {
266 size_t d_ksize; /* size of key */
267 size_t d_dsize; /* size of data */
268
269 #define D_BIGDATA 0x01 /* indirect datum ptr flag */
270 #define D_BIGKEY 0x02 /* indirect key ptr flag */
271
272 u_long d_flags; /* flags (indirect bit) */
273 char d_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */
274 } DATUM;
275
276 /* BTITEMs are used to return (page, index, datum) tuples from searches */
277 typedef struct BTITEM {
278 pgno_t bti_pgno;
279 index_t bti_index;
280 DATUM *bti_datum;
281 } BTITEM;
282
283 /*
284 * IDATUMs are for data stored on internal pages. This is the (key, pgno)
285 * pair, such that key 'key' is the first entry on page 'pgno'. If our
286 * internal page contains keys (a) and (b) next to each other, then all
287 * items >= to (a) and < (b) go on the same page as (a). There are some
288 * gotchas with duplicate keys, however. See the split code for details.
289 *
290 * If a key is too big to fit on a single page, then the i_bytes[] array
291 * contains a pgno pointing to the start of a chain that actually stores
292 * the bytes. Since items on internal pages are never deleted from the
293 * tree, these indirect chains are marked as special, so that they won't
294 * be deleted if the corresponding leaf item is deleted.
295 *
296 * As for DATUMs, IDATUMs have a u_long flag entry (rather than u_char)
297 * in order to guarantee that user keys are longword aligned on the disk
298 * page.
299 */
300
301 typedef struct IDATUM {
302 size_t i_size;
303 pgno_t i_pgno;
304 u_long i_flags; /* see DATUM.d_flags, above */
305 char i_bytes[1]; /* VARIABLE LENGTH DATA AT END OF STRUCT */
306 } IDATUM;
307
308 /* all private interfaces have a leading _ in their names */
309 extern BTITEM *_bt_search();
310 extern BTITEM *_bt_searchr();
311 extern BTHEADER *_bt_allocpg();
312 extern index_t _bt_binsrch();
313 extern int _bt_isonpage();
314 extern BTITEM *_bt_first();
315 extern int _bt_release();
316 extern int _bt_wrtmeta();
317 extern int _bt_delindir();
318 extern int _bt_pgout();
319 extern int _bt_pgin();
320 extern int _bt_fixscan();
321 extern int _bt_indirect();
322 extern int _bt_crsrdel();
323 extern int _bt_push();
324 extern pgno_t _bt_pop();
325 extern int strcmp();
326
327