1 1.29 christos /* $NetBSD: bt_open.c,v 1.29 2016/09/24 21:31:25 christos Exp $ */ 2 1.7 cgd 3 1.1 cgd /*- 4 1.6 cgd * Copyright (c) 1990, 1993, 1994 5 1.1 cgd * The Regents of the University of California. All rights reserved. 6 1.1 cgd * 7 1.1 cgd * This code is derived from software contributed to Berkeley by 8 1.1 cgd * Mike Olson. 9 1.1 cgd * 10 1.1 cgd * Redistribution and use in source and binary forms, with or without 11 1.1 cgd * modification, are permitted provided that the following conditions 12 1.1 cgd * are met: 13 1.1 cgd * 1. Redistributions of source code must retain the above copyright 14 1.1 cgd * notice, this list of conditions and the following disclaimer. 15 1.1 cgd * 2. Redistributions in binary form must reproduce the above copyright 16 1.1 cgd * notice, this list of conditions and the following disclaimer in the 17 1.1 cgd * documentation and/or other materials provided with the distribution. 18 1.17 agc * 3. Neither the name of the University nor the names of its contributors 19 1.1 cgd * may be used to endorse or promote products derived from this software 20 1.1 cgd * without specific prior written permission. 21 1.1 cgd * 22 1.1 cgd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 1.1 cgd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 1.1 cgd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 1.1 cgd * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 1.1 cgd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 1.1 cgd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 1.1 cgd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 1.1 cgd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 1.1 cgd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 1.1 cgd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 1.1 cgd * SUCH DAMAGE. 33 1.1 cgd */ 34 1.1 cgd 35 1.24 joerg #if HAVE_NBTOOL_CONFIG_H 36 1.24 joerg #include "nbtool_config.h" 37 1.24 joerg #endif 38 1.24 joerg 39 1.9 christos #include <sys/cdefs.h> 40 1.29 christos __RCSID("$NetBSD: bt_open.c,v 1.29 2016/09/24 21:31:25 christos Exp $"); 41 1.1 cgd 42 1.1 cgd /* 43 1.1 cgd * Implementation of btree access method for 4.4BSD. 44 1.1 cgd * 45 1.1 cgd * The design here was originally based on that of the btree access method 46 1.1 cgd * used in the Postgres database system at UC Berkeley. This implementation 47 1.1 cgd * is wholly independent of the Postgres code. 48 1.1 cgd */ 49 1.1 cgd 50 1.10 jtc #include "namespace.h" 51 1.1 cgd #include <sys/stat.h> 52 1.1 cgd 53 1.21 christos #include <assert.h> 54 1.1 cgd #include <errno.h> 55 1.1 cgd #include <fcntl.h> 56 1.1 cgd #include <limits.h> 57 1.1 cgd #include <signal.h> 58 1.1 cgd #include <stdio.h> 59 1.1 cgd #include <stdlib.h> 60 1.1 cgd #include <string.h> 61 1.1 cgd #include <unistd.h> 62 1.11 fair #include <paths.h> 63 1.1 cgd 64 1.1 cgd #include <db.h> 65 1.1 cgd #include "btree.h" 66 1.1 cgd 67 1.8 cgd #ifdef DEBUG 68 1.8 cgd #undef MINPSIZE 69 1.8 cgd #define MINPSIZE 128 70 1.8 cgd #endif 71 1.8 cgd 72 1.21 christos static int byteorder(void); 73 1.21 christos static int nroot(BTREE *); 74 1.1 cgd 75 1.1 cgd /* 76 1.1 cgd * __BT_OPEN -- Open a btree. 77 1.1 cgd * 78 1.1 cgd * Creates and fills a DB struct, and calls the routine that actually 79 1.1 cgd * opens the btree. 80 1.1 cgd * 81 1.1 cgd * Parameters: 82 1.1 cgd * fname: filename (NULL for in-memory trees) 83 1.1 cgd * flags: open flag bits 84 1.1 cgd * mode: open permission bits 85 1.1 cgd * b: BTREEINFO pointer 86 1.1 cgd * 87 1.1 cgd * Returns: 88 1.1 cgd * NULL on failure, pointer to DB on success. 89 1.1 cgd * 90 1.1 cgd */ 91 1.1 cgd DB * 92 1.21 christos __bt_open(const char *fname, int flags, mode_t mode, const BTREEINFO *openinfo, 93 1.21 christos int dflags) 94 1.1 cgd { 95 1.6 cgd struct stat sb; 96 1.1 cgd BTMETA m; 97 1.1 cgd BTREE *t; 98 1.1 cgd BTREEINFO b; 99 1.1 cgd DB *dbp; 100 1.1 cgd pgno_t ncache; 101 1.6 cgd ssize_t nr; 102 1.21 christos size_t temp; 103 1.6 cgd int machine_lorder; 104 1.1 cgd 105 1.1 cgd t = NULL; 106 1.1 cgd 107 1.1 cgd /* 108 1.1 cgd * Intention is to make sure all of the user's selections are okay 109 1.1 cgd * here and then use them without checking. Can't be complete, since 110 1.1 cgd * we don't know the right page size, lorder or flags until the backing 111 1.1 cgd * file is opened. Also, the file's page size can cause the cachesize 112 1.1 cgd * to change. 113 1.1 cgd */ 114 1.1 cgd machine_lorder = byteorder(); 115 1.1 cgd if (openinfo) { 116 1.1 cgd b = *openinfo; 117 1.1 cgd 118 1.1 cgd /* Flags: R_DUP. */ 119 1.1 cgd if (b.flags & ~(R_DUP)) 120 1.1 cgd goto einval; 121 1.1 cgd 122 1.1 cgd /* 123 1.1 cgd * Page size must be indx_t aligned and >= MINPSIZE. Default 124 1.1 cgd * page size is set farther on, based on the underlying file 125 1.1 cgd * transfer size. 126 1.1 cgd */ 127 1.1 cgd if (b.psize && 128 1.1 cgd (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 || 129 1.9 christos b.psize & (sizeof(indx_t) - 1))) 130 1.1 cgd goto einval; 131 1.1 cgd 132 1.1 cgd /* Minimum number of keys per page; absolute minimum is 2. */ 133 1.1 cgd if (b.minkeypage) { 134 1.1 cgd if (b.minkeypage < 2) 135 1.1 cgd goto einval; 136 1.1 cgd } else 137 1.1 cgd b.minkeypage = DEFMINKEYPAGE; 138 1.1 cgd 139 1.1 cgd /* If no comparison, use default comparison and prefix. */ 140 1.1 cgd if (b.compare == NULL) { 141 1.1 cgd b.compare = __bt_defcmp; 142 1.1 cgd if (b.prefix == NULL) 143 1.1 cgd b.prefix = __bt_defpfx; 144 1.1 cgd } 145 1.1 cgd 146 1.1 cgd if (b.lorder == 0) 147 1.1 cgd b.lorder = machine_lorder; 148 1.1 cgd } else { 149 1.1 cgd b.compare = __bt_defcmp; 150 1.1 cgd b.cachesize = 0; 151 1.1 cgd b.flags = 0; 152 1.1 cgd b.lorder = machine_lorder; 153 1.1 cgd b.minkeypage = DEFMINKEYPAGE; 154 1.1 cgd b.prefix = __bt_defpfx; 155 1.1 cgd b.psize = 0; 156 1.1 cgd } 157 1.1 cgd 158 1.1 cgd /* Check for the ubiquitous PDP-11. */ 159 1.1 cgd if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN) 160 1.1 cgd goto einval; 161 1.1 cgd 162 1.1 cgd /* Allocate and initialize DB and BTREE structures. */ 163 1.27 christos if ((t = malloc(sizeof(*t))) == NULL) 164 1.1 cgd goto err; 165 1.5 cgd memset(t, 0, sizeof(BTREE)); 166 1.1 cgd t->bt_fd = -1; /* Don't close unopened fd on error. */ 167 1.1 cgd t->bt_lorder = b.lorder; 168 1.1 cgd t->bt_order = NOT; 169 1.1 cgd t->bt_cmp = b.compare; 170 1.1 cgd t->bt_pfx = b.prefix; 171 1.5 cgd t->bt_rfd = -1; 172 1.5 cgd 173 1.27 christos if ((t->bt_dbp = dbp = malloc(sizeof(*dbp))) == NULL) 174 1.5 cgd goto err; 175 1.8 cgd memset(t->bt_dbp, 0, sizeof(DB)); 176 1.1 cgd if (t->bt_lorder != machine_lorder) 177 1.8 cgd F_SET(t, B_NEEDSWAP); 178 1.1 cgd 179 1.1 cgd dbp->type = DB_BTREE; 180 1.1 cgd dbp->internal = t; 181 1.1 cgd dbp->close = __bt_close; 182 1.1 cgd dbp->del = __bt_delete; 183 1.1 cgd dbp->fd = __bt_fd; 184 1.1 cgd dbp->get = __bt_get; 185 1.1 cgd dbp->put = __bt_put; 186 1.1 cgd dbp->seq = __bt_seq; 187 1.1 cgd dbp->sync = __bt_sync; 188 1.1 cgd 189 1.1 cgd /* 190 1.1 cgd * If no file name was supplied, this is an in-memory btree and we 191 1.1 cgd * open a backing temporary file. Otherwise, it's a disk-based tree. 192 1.1 cgd */ 193 1.1 cgd if (fname) { 194 1.8 cgd switch (flags & O_ACCMODE) { 195 1.1 cgd case O_RDONLY: 196 1.8 cgd F_SET(t, B_RDONLY); 197 1.1 cgd break; 198 1.1 cgd case O_RDWR: 199 1.1 cgd break; 200 1.1 cgd case O_WRONLY: 201 1.1 cgd default: 202 1.1 cgd goto einval; 203 1.1 cgd } 204 1.27 christos if ((t->bt_fd = __dbopen(fname, flags, mode, &sb)) == -1) 205 1.1 cgd goto err; 206 1.1 cgd } else { 207 1.1 cgd if ((flags & O_ACCMODE) != O_RDWR) 208 1.1 cgd goto einval; 209 1.27 christos if ((t->bt_fd = __dbtemp("bt.", &sb)) == -1) 210 1.1 cgd goto err; 211 1.8 cgd F_SET(t, B_INMEM); 212 1.1 cgd } 213 1.1 cgd 214 1.1 cgd 215 1.1 cgd if (sb.st_size) { 216 1.6 cgd if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0) 217 1.1 cgd goto err; 218 1.1 cgd if (nr != sizeof(BTMETA)) 219 1.1 cgd goto eftype; 220 1.1 cgd 221 1.1 cgd /* 222 1.1 cgd * Read in the meta-data. This can change the notion of what 223 1.1 cgd * the lorder, page size and flags are, and, when the page size 224 1.1 cgd * changes, the cachesize value can change too. If the user 225 1.1 cgd * specified the wrong byte order for an existing database, we 226 1.1 cgd * don't bother to return an error, we just clear the NEEDSWAP 227 1.1 cgd * bit. 228 1.1 cgd */ 229 1.8 cgd if (m.magic == BTREEMAGIC) 230 1.8 cgd F_CLR(t, B_NEEDSWAP); 231 1.1 cgd else { 232 1.8 cgd F_SET(t, B_NEEDSWAP); 233 1.8 cgd M_32_SWAP(m.magic); 234 1.8 cgd M_32_SWAP(m.version); 235 1.8 cgd M_32_SWAP(m.psize); 236 1.8 cgd M_32_SWAP(m.free); 237 1.8 cgd M_32_SWAP(m.nrecs); 238 1.8 cgd M_32_SWAP(m.flags); 239 1.1 cgd } 240 1.8 cgd if (m.magic != BTREEMAGIC || m.version != BTREEVERSION) 241 1.1 cgd goto eftype; 242 1.8 cgd if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 || 243 1.9 christos m.psize & (sizeof(indx_t) - 1)) 244 1.1 cgd goto eftype; 245 1.8 cgd if (m.flags & ~SAVEMETA) 246 1.1 cgd goto eftype; 247 1.8 cgd b.psize = m.psize; 248 1.8 cgd F_SET(t, m.flags); 249 1.8 cgd t->bt_free = m.free; 250 1.8 cgd t->bt_nrecs = m.nrecs; 251 1.1 cgd } else { 252 1.1 cgd /* 253 1.1 cgd * Set the page size to the best value for I/O to this file. 254 1.1 cgd * Don't overflow the page offset type. 255 1.1 cgd */ 256 1.1 cgd if (b.psize == 0) { 257 1.1 cgd b.psize = sb.st_blksize; 258 1.1 cgd if (b.psize < MINPSIZE) 259 1.1 cgd b.psize = MINPSIZE; 260 1.1 cgd if (b.psize > MAX_PAGE_OFFSET + 1) 261 1.1 cgd b.psize = MAX_PAGE_OFFSET + 1; 262 1.1 cgd } 263 1.1 cgd 264 1.1 cgd /* Set flag if duplicates permitted. */ 265 1.1 cgd if (!(b.flags & R_DUP)) 266 1.8 cgd F_SET(t, B_NODUPS); 267 1.1 cgd 268 1.1 cgd t->bt_free = P_INVALID; 269 1.1 cgd t->bt_nrecs = 0; 270 1.8 cgd F_SET(t, B_METADIRTY); 271 1.1 cgd } 272 1.1 cgd 273 1.1 cgd t->bt_psize = b.psize; 274 1.1 cgd 275 1.1 cgd /* Set the cache size; must be a multiple of the page size. */ 276 1.9 christos if (b.cachesize && b.cachesize & (b.psize - 1)) 277 1.9 christos b.cachesize += (~b.cachesize & (b.psize - 1)) + 1; 278 1.1 cgd if (b.cachesize < b.psize * MINCACHE) 279 1.1 cgd b.cachesize = b.psize * MINCACHE; 280 1.1 cgd 281 1.1 cgd /* Calculate number of pages to cache. */ 282 1.1 cgd ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize; 283 1.1 cgd 284 1.1 cgd /* 285 1.1 cgd * The btree data structure requires that at least two keys can fit on 286 1.1 cgd * a page, but other than that there's no fixed requirement. The user 287 1.1 cgd * specified a minimum number per page, and we translated that into the 288 1.1 cgd * number of bytes a key/data pair can use before being placed on an 289 1.1 cgd * overflow page. This calculation includes the page header, the size 290 1.1 cgd * of the index referencing the leaf item and the size of the leaf item 291 1.1 cgd * structure. Also, don't let the user specify a minkeypage such that 292 1.1 cgd * a key/data pair won't fit even if both key and data are on overflow 293 1.1 cgd * pages. 294 1.1 cgd */ 295 1.21 christos temp = (t->bt_psize - BTDATAOFF) / b.minkeypage - 296 1.1 cgd (sizeof(indx_t) + NBLEAFDBT(0, 0)); 297 1.21 christos _DBFIT(temp, indx_t); 298 1.21 christos t->bt_ovflsize = (indx_t)temp; 299 1.26 christos if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t)) { 300 1.26 christos size_t l = NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t); 301 1.26 christos _DBFIT(l, indx_t); 302 1.26 christos t->bt_ovflsize = (indx_t)l; 303 1.26 christos } 304 1.1 cgd 305 1.1 cgd /* Initialize the buffer pool. */ 306 1.1 cgd if ((t->bt_mp = 307 1.1 cgd mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL) 308 1.1 cgd goto err; 309 1.8 cgd if (!F_ISSET(t, B_INMEM)) 310 1.1 cgd mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t); 311 1.1 cgd 312 1.1 cgd /* Create a root page if new tree. */ 313 1.1 cgd if (nroot(t) == RET_ERROR) 314 1.1 cgd goto err; 315 1.1 cgd 316 1.4 cgd /* Global flags. */ 317 1.4 cgd if (dflags & DB_LOCK) 318 1.8 cgd F_SET(t, B_DB_LOCK); 319 1.4 cgd if (dflags & DB_SHMEM) 320 1.8 cgd F_SET(t, B_DB_SHMEM); 321 1.4 cgd if (dflags & DB_TXN) 322 1.8 cgd F_SET(t, B_DB_TXN); 323 1.4 cgd 324 1.1 cgd return (dbp); 325 1.1 cgd 326 1.1 cgd einval: errno = EINVAL; 327 1.1 cgd goto err; 328 1.1 cgd 329 1.1 cgd eftype: errno = EFTYPE; 330 1.1 cgd goto err; 331 1.1 cgd 332 1.1 cgd err: if (t) { 333 1.1 cgd if (t->bt_dbp) 334 1.1 cgd free(t->bt_dbp); 335 1.1 cgd if (t->bt_fd != -1) 336 1.1 cgd (void)close(t->bt_fd); 337 1.1 cgd free(t); 338 1.1 cgd } 339 1.1 cgd return (NULL); 340 1.1 cgd } 341 1.1 cgd 342 1.1 cgd /* 343 1.1 cgd * NROOT -- Create the root of a new tree. 344 1.1 cgd * 345 1.1 cgd * Parameters: 346 1.1 cgd * t: tree 347 1.1 cgd * 348 1.1 cgd * Returns: 349 1.1 cgd * RET_ERROR, RET_SUCCESS 350 1.1 cgd */ 351 1.1 cgd static int 352 1.21 christos nroot(BTREE *t) 353 1.1 cgd { 354 1.1 cgd PAGE *meta, *root; 355 1.1 cgd pgno_t npg; 356 1.1 cgd 357 1.29 christos if ((root = mpool_get(t->bt_mp, 1, 0)) != NULL) { 358 1.28 christos if (root->lower == 0 && 359 1.28 christos root->pgno == 0 && 360 1.28 christos root->linp[0] == 0) { 361 1.28 christos mpool_delete(t->bt_mp, root); 362 1.28 christos errno = EINVAL; 363 1.28 christos } else { 364 1.28 christos mpool_put(t->bt_mp, root, 0); 365 1.28 christos return RET_SUCCESS; 366 1.28 christos } 367 1.1 cgd } 368 1.6 cgd if (errno != EINVAL) /* It's OK to not exist. */ 369 1.1 cgd return (RET_ERROR); 370 1.6 cgd errno = 0; 371 1.1 cgd 372 1.28 christos if ((meta = mpool_newf(t->bt_mp, &npg, MPOOL_PAGE_NEXT)) == NULL) 373 1.1 cgd return (RET_ERROR); 374 1.1 cgd 375 1.28 christos if ((root = mpool_newf(t->bt_mp, &npg, MPOOL_PAGE_NEXT)) == NULL) 376 1.1 cgd return (RET_ERROR); 377 1.1 cgd 378 1.1 cgd if (npg != P_ROOT) 379 1.1 cgd return (RET_ERROR); 380 1.1 cgd root->pgno = npg; 381 1.1 cgd root->prevpg = root->nextpg = P_INVALID; 382 1.1 cgd root->lower = BTDATAOFF; 383 1.1 cgd root->upper = t->bt_psize; 384 1.1 cgd root->flags = P_BLEAF; 385 1.1 cgd memset(meta, 0, t->bt_psize); 386 1.1 cgd mpool_put(t->bt_mp, meta, MPOOL_DIRTY); 387 1.1 cgd mpool_put(t->bt_mp, root, MPOOL_DIRTY); 388 1.1 cgd return (RET_SUCCESS); 389 1.1 cgd } 390 1.1 cgd 391 1.1 cgd static int 392 1.21 christos byteorder(void) 393 1.1 cgd { 394 1.22 joerg uint32_t x; 395 1.22 joerg uint8_t *p; 396 1.1 cgd 397 1.1 cgd x = 0x01020304; 398 1.22 joerg p = (uint8_t *)(void *)&x; 399 1.1 cgd switch (*p) { 400 1.1 cgd case 1: 401 1.1 cgd return (BIG_ENDIAN); 402 1.1 cgd case 4: 403 1.1 cgd return (LITTLE_ENDIAN); 404 1.1 cgd default: 405 1.1 cgd return (0); 406 1.1 cgd } 407 1.1 cgd } 408 1.1 cgd 409 1.1 cgd int 410 1.21 christos __bt_fd(const DB *dbp) 411 1.1 cgd { 412 1.1 cgd BTREE *t; 413 1.1 cgd 414 1.1 cgd t = dbp->internal; 415 1.1 cgd 416 1.4 cgd /* Toss any page pinned across calls. */ 417 1.4 cgd if (t->bt_pinned != NULL) { 418 1.4 cgd mpool_put(t->bt_mp, t->bt_pinned, 0); 419 1.4 cgd t->bt_pinned = NULL; 420 1.4 cgd } 421 1.4 cgd 422 1.4 cgd /* In-memory database can't have a file descriptor. */ 423 1.8 cgd if (F_ISSET(t, B_INMEM)) { 424 1.1 cgd errno = ENOENT; 425 1.1 cgd return (-1); 426 1.1 cgd } 427 1.1 cgd return (t->bt_fd); 428 1.1 cgd } 429