1 /* $NetBSD: mdb.c,v 1.4 2025/09/05 21:16:22 christos Exp $ */ 2 3 /** @file mdb.c 4 * @brief Lightning memory-mapped database library 5 * 6 * A Btree-based database management library modeled loosely on the 7 * BerkeleyDB API, but much simplified. 8 */ 9 /* 10 * Copyright 2011-2021 Howard Chu, Symas Corp. 11 * All rights reserved. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted only as authorized by the OpenLDAP 15 * Public License. 16 * 17 * A copy of this license is available in the file LICENSE in the 18 * top-level directory of the distribution or, alternatively, at 19 * <http://www.OpenLDAP.org/license.html>. 20 * 21 * This code is derived from btree.c written by Martin Hedenfalk. 22 * 23 * Copyright (c) 2009, 2010 Martin Hedenfalk <martin (at) bzero.se> 24 * 25 * Permission to use, copy, modify, and distribute this software for any 26 * purpose with or without fee is hereby granted, provided that the above 27 * copyright notice and this permission notice appear in all copies. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 30 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 31 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 32 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 33 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 34 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 35 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 36 */ 37 #ifndef _GNU_SOURCE 38 #define _GNU_SOURCE 1 39 #endif 40 #if defined(__WIN64__) 41 #define _FILE_OFFSET_BITS 64 42 #endif 43 #ifdef _WIN32 44 #include <malloc.h> 45 #include <windows.h> 46 #include <wchar.h> /* get wcscpy() */ 47 48 /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it 49 * as int64 which is wrong. MSVC doesn't define it at all, so just 50 * don't use it. 
51 */ 52 #define MDB_PID_T int 53 #define MDB_THR_T DWORD 54 #include <sys/types.h> 55 #include <sys/stat.h> 56 #ifdef __GNUC__ 57 # include <sys/param.h> 58 #else 59 # define LITTLE_ENDIAN 1234 60 # define BIG_ENDIAN 4321 61 # define BYTE_ORDER LITTLE_ENDIAN 62 # ifndef SSIZE_MAX 63 # define SSIZE_MAX INT_MAX 64 # endif 65 #endif 66 #else 67 #include <sys/types.h> 68 #include <sys/stat.h> 69 #define MDB_PID_T pid_t 70 #define MDB_THR_T pthread_t 71 #include <sys/param.h> 72 #include <sys/uio.h> 73 #include <sys/mman.h> 74 #ifdef HAVE_SYS_FILE_H 75 #include <sys/file.h> 76 #endif 77 #include <fcntl.h> 78 #endif 79 80 #if defined(__mips) && defined(__linux) 81 /* MIPS has cache coherency issues, requires explicit cache control */ 82 #include <sys/cachectl.h> 83 #define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache) 84 #else 85 #define CACHEFLUSH(addr, bytes, cache) 86 #endif 87 88 #if defined(__linux) && !defined(MDB_FDATASYNC_WORKS) 89 /** fdatasync is broken on ext3/ext4fs on older kernels, see 90 * description in #mdb_env_open2 comments. You can safely 91 * define MDB_FDATASYNC_WORKS if this code will only be run 92 * on kernels 3.6 and newer. 
93 */ 94 #define BROKEN_FDATASYNC 95 #endif 96 97 #include <errno.h> 98 #include <limits.h> 99 #include <stddef.h> 100 #include <inttypes.h> 101 #include <stdio.h> 102 #include <stdlib.h> 103 #include <string.h> 104 #include <time.h> 105 106 #ifdef _MSC_VER 107 #include <io.h> 108 typedef SSIZE_T ssize_t; 109 #else 110 #include <unistd.h> 111 #endif 112 113 #if defined(__sun) || defined(ANDROID) 114 /* Most platforms have posix_memalign, older may only have memalign */ 115 #define HAVE_MEMALIGN 1 116 #include <malloc.h> 117 /* On Solaris, we need the POSIX sigwait function */ 118 #if defined (__sun) 119 # define _POSIX_PTHREAD_SEMANTICS 1 120 #endif 121 #endif 122 123 #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) 124 #include <netinet/in.h> 125 #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */ 126 #endif 127 128 #if defined(__FreeBSD__) && defined(__FreeBSD_version) && __FreeBSD_version >= 1100110 129 # define MDB_USE_POSIX_MUTEX 1 130 # define MDB_USE_ROBUST 1 131 #elif defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) 132 # define MDB_USE_POSIX_SEM 1 133 # define MDB_FDATASYNC fsync 134 #elif defined(ANDROID) 135 # define MDB_FDATASYNC fsync 136 #endif 137 138 #ifndef _WIN32 139 #include <pthread.h> 140 #include <signal.h> 141 #ifdef MDB_USE_POSIX_SEM 142 # define MDB_USE_HASH 1 143 #include <semaphore.h> 144 #else 145 #define MDB_USE_POSIX_MUTEX 1 146 #endif 147 #endif 148 149 #if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) \ 150 + defined(MDB_USE_POSIX_MUTEX) != 1 151 # error "Ambiguous shared-lock implementation" 152 #endif 153 154 #ifdef USE_VALGRIND 155 #include <valgrind/memcheck.h> 156 #define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z) 157 #define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s) 158 #define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a) 159 #define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h) 160 #define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s) 161 #else 162 #define 
VGMEMP_CREATE(h,r,z) 163 #define VGMEMP_ALLOC(h,a,s) 164 #define VGMEMP_FREE(h,a) 165 #define VGMEMP_DESTROY(h) 166 #define VGMEMP_DEFINED(a,s) 167 #endif 168 169 #ifndef BYTE_ORDER 170 # if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) 171 /* Solaris just defines one or the other */ 172 # define LITTLE_ENDIAN 1234 173 # define BIG_ENDIAN 4321 174 # ifdef _LITTLE_ENDIAN 175 # define BYTE_ORDER LITTLE_ENDIAN 176 # else 177 # define BYTE_ORDER BIG_ENDIAN 178 # endif 179 # else 180 # define BYTE_ORDER __BYTE_ORDER 181 # endif 182 #endif 183 184 #ifndef LITTLE_ENDIAN 185 #define LITTLE_ENDIAN __LITTLE_ENDIAN 186 #endif 187 #ifndef BIG_ENDIAN 188 #define BIG_ENDIAN __BIG_ENDIAN 189 #endif 190 191 #if defined(__i386) || defined(__x86_64) || defined(_M_IX86) 192 #define MISALIGNED_OK 1 193 #endif 194 195 #include "lmdb.h" 196 #include "midl.h" 197 198 #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) 199 # error "Unknown or unsupported endianness (BYTE_ORDER)" 200 #elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF 201 # error "Two's complement, reasonably sized integer types, please" 202 #endif 203 204 #if (((__clang_major__ << 8) | __clang_minor__) >= 0x0302) || (((__GNUC__ << 8) | __GNUC_MINOR__) >= 0x0403) 205 /** Mark infrequently used env functions as cold. 
This puts them in a separate 206 * section, and optimizes them for size */ 207 #define ESECT __attribute__ ((cold)) 208 #else 209 /* On older compilers, use a separate section */ 210 # ifdef __GNUC__ 211 # ifdef __APPLE__ 212 # define ESECT __attribute__ ((section("__TEXT,text_env"))) 213 # else 214 # define ESECT __attribute__ ((section("text_env"))) 215 # endif 216 # else 217 # define ESECT 218 # endif 219 #endif 220 221 #ifdef _WIN32 222 #define CALL_CONV WINAPI 223 #else 224 #define CALL_CONV 225 #endif 226 227 /** @defgroup internal LMDB Internals 228 * @{ 229 */ 230 /** @defgroup compat Compatibility Macros 231 * A bunch of macros to minimize the amount of platform-specific ifdefs 232 * needed throughout the rest of the code. When the features this library 233 * needs are similar enough to POSIX to be hidden in a one-or-two line 234 * replacement, this macro approach is used. 235 * @{ 236 */ 237 238 /** Features under development */ 239 #ifndef MDB_DEVEL 240 #define MDB_DEVEL 0 241 #endif 242 243 /** Wrapper around __func__, which is a C99 feature */ 244 #if __STDC_VERSION__ >= 199901L 245 # define mdb_func_ __func__ 246 #elif __GNUC__ >= 2 || _MSC_VER >= 1300 247 # define mdb_func_ __FUNCTION__ 248 #else 249 /* If a debug message says <mdb_unknown>(), update the #if statements above */ 250 # define mdb_func_ "<mdb_unknown>" 251 #endif 252 253 /* Internal error codes, not exposed outside liblmdb */ 254 #define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) 255 #ifdef _WIN32 256 #define MDB_OWNERDEAD ((int) WAIT_ABANDONED) 257 #elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD) 258 #define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */ 259 #endif 260 261 #ifdef __GLIBC__ 262 #define GLIBC_VER ((__GLIBC__ << 16 )| __GLIBC_MINOR__) 263 #endif 264 /** Some platforms define the EOWNERDEAD error code 265 * even though they don't support Robust Mutexes. 
266 * Compile with -DMDB_USE_ROBUST=0, or use some other 267 * mechanism like -DMDB_USE_POSIX_SEM instead of 268 * -DMDB_USE_POSIX_MUTEX. 269 * (Posix semaphores are not robust.) 270 */ 271 #ifndef MDB_USE_ROBUST 272 /* Android currently lacks Robust Mutex support. So does glibc < 2.4. */ 273 # if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \ 274 (defined(__GLIBC__) && GLIBC_VER < 0x020004)) 275 # define MDB_USE_ROBUST 0 276 # else 277 # define MDB_USE_ROBUST 1 278 # endif 279 #endif /* !MDB_USE_ROBUST */ 280 281 #if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST) 282 /* glibc < 2.12 only provided _np API */ 283 # if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \ 284 (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST)) 285 # define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP 286 # define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) 287 # define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) 288 # endif 289 #endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */ 290 291 #if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST) 292 #define MDB_ROBUST_SUPPORTED 1 293 #endif 294 295 #ifdef _WIN32 296 #define MDB_USE_HASH 1 297 #define MDB_PIDLOCK 0 298 #define THREAD_RET DWORD 299 #define pthread_t HANDLE 300 #define pthread_mutex_t HANDLE 301 #define pthread_cond_t HANDLE 302 typedef HANDLE mdb_mutex_t, mdb_mutexref_t; 303 #define pthread_key_t DWORD 304 #define pthread_self() GetCurrentThreadId() 305 #define pthread_key_create(x,y) \ 306 ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0) 307 #define pthread_key_delete(x) TlsFree(x) 308 #define pthread_getspecific(x) TlsGetValue(x) 309 #define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 
0 : ErrCode()) 310 #define pthread_mutex_unlock(x) ReleaseMutex(*x) 311 #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) 312 #define pthread_cond_signal(x) SetEvent(*x) 313 #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) 314 #define THREAD_CREATE(thr,start,arg) \ 315 (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode()) 316 #define THREAD_FINISH(thr) \ 317 (WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0) 318 #define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE) 319 #define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex) 320 #define mdb_mutex_consistent(mutex) 0 321 #define getpid() GetCurrentProcessId() 322 #define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) 323 #define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) 324 #define ErrCode() GetLastError() 325 #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} 326 #define close(fd) (CloseHandle(fd) ? 
0 : -1) 327 #define munmap(ptr,len) UnmapViewOfFile(ptr) 328 #ifdef PROCESS_QUERY_LIMITED_INFORMATION 329 #define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION 330 #else 331 #define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 332 #endif 333 #define Z "I" 334 #else 335 #define THREAD_RET void * 336 #define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg) 337 #define THREAD_FINISH(thr) pthread_join(thr,NULL) 338 #define Z "z" /**< printf format modifier for size_t */ 339 340 /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ 341 #define MDB_PIDLOCK 1 342 343 #ifdef MDB_USE_POSIX_SEM 344 345 typedef sem_t *mdb_mutex_t, *mdb_mutexref_t; 346 #define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex) 347 #define UNLOCK_MUTEX(mutex) sem_post(mutex) 348 349 static int 350 mdb_sem_wait(sem_t *sem) 351 { 352 int rc; 353 while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ; 354 return rc; 355 } 356 357 #else /* MDB_USE_POSIX_MUTEX: */ 358 /** Shared mutex/semaphore as the original is stored. 359 * 360 * Not for copies. Instead it can be assigned to an #mdb_mutexref_t. 361 * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, then it 362 * is array[size 1] so it can be assigned to the pointer. 363 */ 364 typedef pthread_mutex_t mdb_mutex_t[1]; 365 /** Reference to an #mdb_mutex_t */ 366 typedef pthread_mutex_t *mdb_mutexref_t; 367 /** Lock the reader or writer mutex. 368 * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX(). 369 */ 370 #define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex) 371 /** Unlock the reader or writer mutex. 372 */ 373 #define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex) 374 /** Mark mutex-protected data as repaired, after death of previous owner. 375 */ 376 #define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex) 377 #endif /* MDB_USE_POSIX_SEM */ 378 379 /** Get the error code for the last failed system function. 
380 */ 381 #define ErrCode() errno 382 383 /** An abstraction for a file handle. 384 * On POSIX systems file handles are small integers. On Windows 385 * they're opaque pointers. 386 */ 387 #define HANDLE int 388 389 /** A value for an invalid file handle. 390 * Mainly used to initialize file variables and signify that they are 391 * unused. 392 */ 393 #define INVALID_HANDLE_VALUE (-1) 394 395 /** Get the size of a memory page for the system. 396 * This is the basic size that the platform's memory manager uses, and is 397 * fundamental to the use of memory-mapped files. 398 */ 399 #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) 400 #endif 401 402 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) 403 #define MNAME_LEN 32 404 #else 405 #define MNAME_LEN (sizeof(pthread_mutex_t)) 406 #endif 407 408 /** @} */ 409 410 #ifdef MDB_ROBUST_SUPPORTED 411 /** Lock mutex, handle any error, set rc = result. 412 * Return 0 on success, nonzero (not rc) on error. 413 */ 414 #define LOCK_MUTEX(rc, env, mutex) \ 415 (((rc) = LOCK_MUTEX0(mutex)) && \ 416 ((rc) = mdb_mutex_failed(env, mutex, rc))) 417 static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc); 418 #else 419 #define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex)) 420 #define mdb_mutex_failed(env, mutex, rc) (rc) 421 #endif 422 423 #ifndef _WIN32 424 /** A flag for opening a file and requesting synchronous data writes. 425 * This is only used when writing a meta page. It's not strictly needed; 426 * we could just do a normal write and then immediately perform a flush. 427 * But if this flag is available it saves us an extra system call. 428 * 429 * @note If O_DSYNC is undefined but exists in /usr/include, 430 * preferably set some compiler flag to get the definition. 431 */ 432 #ifndef MDB_DSYNC 433 # ifdef O_DSYNC 434 # define MDB_DSYNC O_DSYNC 435 # else 436 # define MDB_DSYNC O_SYNC 437 # endif 438 #endif 439 #endif 440 441 /** Function for flushing the data of a file. 
Define this to fsync 442 * if fdatasync() is not supported. 443 */ 444 #ifndef MDB_FDATASYNC 445 # define MDB_FDATASYNC fdatasync 446 #endif 447 448 #ifndef MDB_MSYNC 449 # define MDB_MSYNC(addr,len,flags) msync(addr,len,flags) 450 #endif 451 452 #ifndef MS_SYNC 453 #define MS_SYNC 1 454 #endif 455 456 #ifndef MS_ASYNC 457 #define MS_ASYNC 0 458 #endif 459 460 /** A page number in the database. 461 * Note that 64 bit page numbers are overkill, since pages themselves 462 * already represent 12-13 bits of addressable memory, and the OS will 463 * always limit applications to a maximum of 63 bits of address space. 464 * 465 * @note In the #MDB_node structure, we only store 48 bits of this value, 466 * which thus limits us to only 60 bits of addressable data. 467 */ 468 typedef MDB_ID pgno_t; 469 470 /** A transaction ID. 471 * See struct MDB_txn.mt_txnid for details. 472 */ 473 typedef MDB_ID txnid_t; 474 475 /** @defgroup debug Debug Macros 476 * @{ 477 */ 478 #ifndef MDB_DEBUG 479 /** Enable debug output. Needs variable argument macros (a C99 feature). 480 * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs 481 * read from and written to the database (used for free space management). 482 */ 483 #define MDB_DEBUG 0 484 #endif 485 486 #define MDB_DBG_INFO 1 487 #define MDB_DBG_TRACE 2 488 489 #if MDB_DEBUG 490 static int mdb_debug = MDB_DBG_TRACE; 491 static txnid_t mdb_debug_start; 492 493 /** Print a debug message with printf formatting. 494 * Requires double parenthesis around 2 or more args. 495 */ 496 # define DPRINTF(args) ((void) ((mdb_debug & MDB_DBG_INFO) && DPRINTF0 args)) 497 # define DPRINTF0(fmt, ...) \ 498 fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__) 499 /** Trace info for replaying */ 500 # define MDB_TRACE(args) ((void) ((mdb_debug & MDB_DBG_TRACE) && DPRINTF1 args)) 501 # define DPRINTF1(fmt, ...) 
\ 502 fprintf(stderr, ">%d:%s: " fmt "\n", getpid(), mdb_func_, __VA_ARGS__) 503 #else 504 # define DPRINTF(args) ((void) 0) 505 # define MDB_TRACE(args) ((void) 0) 506 #endif 507 /** Print a debug string. 508 * The string is printed literally, with no format processing. 509 */ 510 #define DPUTS(arg) DPRINTF(("%s", arg)) 511 /** Debugging output value of a cursor DBI: Negative in a sub-cursor. */ 512 #define DDBI(mc) \ 513 (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) 514 /** @} */ 515 516 /** @brief The maximum size of a database page. 517 * 518 * It is 32k or 64k, since value-PAGEBASE must fit in 519 * #MDB_page.%mp_upper. 520 * 521 * LMDB will use database pages < OS pages if needed. 522 * That causes more I/O in write transactions: The OS must 523 * know (read) the whole page before writing a partial page. 524 * 525 * Note that we don't currently support Huge pages. On Linux, 526 * regular data files cannot use Huge pages, and in general 527 * Huge pages aren't actually pageable. We rely on the OS 528 * demand-pager to read our data and page it out when memory 529 * pressure from other processes is high. So until OSs have 530 * actual paging support for Huge pages, they're not viable. 531 */ 532 #define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) 533 534 /** The minimum number of keys required in a database page. 535 * Setting this to a larger value will place a smaller bound on the 536 * maximum size of a data item. Data items larger than this size will 537 * be pushed into overflow pages instead of being stored directly in 538 * the B-tree node. This value used to default to 4. With a page size 539 * of 4096 bytes that meant that any item larger than 1024 bytes would 540 * go into an overflow page. That also meant that on average 2-3KB of 541 * each overflow page was wasted space. The value cannot be lower than 542 * 2 because then there would no longer be a tree structure. 
With this 543 * value, items larger than 2KB will go into overflow pages, and on 544 * average only 1KB will be wasted. 545 */ 546 #define MDB_MINKEYS 2 547 548 /** A stamp that identifies a file as an LMDB file. 549 * There's nothing special about this value other than that it is easily 550 * recognizable, and it will reflect any byte order mismatches. 551 */ 552 #define MDB_MAGIC 0xBEEFC0DE 553 554 /** The version number for a database's datafile format. */ 555 #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) 556 /** The version number for a database's lockfile format. */ 557 #define MDB_LOCK_VERSION 1 558 559 /** @brief The max size of a key we can write, or 0 for computed max. 560 * 561 * This macro should normally be left alone or set to 0. 562 * Note that a database with big keys or dupsort data cannot be 563 * reliably modified by a liblmdb which uses a smaller max. 564 * The default is 511 for backwards compat, or 0 when #MDB_DEVEL. 565 * 566 * Other values are allowed, for backwards compat. However: 567 * A value bigger than the computed max can break if you do not 568 * know what you are doing, and liblmdb <= 0.9.10 can break when 569 * modifying a DB with keys/dupsort data bigger than its max. 570 * 571 * Data items in an #MDB_DUPSORT database are also limited to 572 * this size, since they're actually keys of a sub-DB. Keys and 573 * #MDB_DUPSORT data items must fit on a node in a regular page. 574 */ 575 #ifndef MDB_MAXKEYSIZE 576 #define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) 577 #endif 578 579 /** The maximum size of a key we can write to the environment. */ 580 #if MDB_MAXKEYSIZE 581 #define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) 582 #else 583 #define ENV_MAXKEY(env) ((env)->me_maxkey) 584 #endif 585 586 /** @brief The maximum size of a data item. 587 * 588 * We only store a 32 bit value for node sizes. 589 */ 590 #define MAXDATASIZE 0xffffffffUL 591 592 #if MDB_DEBUG 593 /** Key size which fits in a #DKBUF. 
594 * @ingroup debug 595 */ 596 #define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) 597 /** A key buffer. 598 * @ingroup debug 599 * This is used for printing a hex dump of a key's contents. 600 */ 601 #define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1] 602 /** A data value buffer. 603 * @ingroup debug 604 * This is used for printing a hex dump of a #MDB_DUPSORT value's contents. 605 */ 606 #define DDBUF char dbuf[DKBUF_MAXKEYSIZE*2+1+2] 607 /** Display a key in hex. 608 * @ingroup debug 609 * Invoke a function to display a key in hex. 610 */ 611 #define DKEY(x) mdb_dkey(x, kbuf) 612 #else 613 #define DKBUF 614 #define DDBUF 615 #define DKEY(x) 0 616 #endif 617 618 /** An invalid page number. 619 * Mainly used to denote an empty tree. 620 */ 621 #define P_INVALID (~(pgno_t)0) 622 623 /** Test if the flags \b f are set in a flag word \b w. */ 624 #define F_ISSET(w, f) (((w) & (f)) == (f)) 625 626 /** Round \b n up to an even number. */ 627 #define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ 628 629 /** Used for offsets within a single page. 630 * Since memory pages are typically 4 or 8KB in size, 12-13 bits, 631 * this is plenty. 632 */ 633 typedef uint16_t indx_t; 634 635 /** Default size of memory map. 636 * This is certainly too small for any actual applications. Apps should always set 637 * the size explicitly using #mdb_env_set_mapsize(). 638 */ 639 #define DEFAULT_MAPSIZE 1048576 640 641 /** @defgroup readers Reader Lock Table 642 * Readers don't acquire any locks for their data access. Instead, they 643 * simply record their transaction ID in the reader table. The reader 644 * mutex is needed just to find an empty slot in the reader table. The 645 * slot's address is saved in thread-specific data so that subsequent read 646 * transactions started by the same thread need no further locking to proceed. 647 * 648 * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. 
649 * 650 * No reader table is used if the database is on a read-only filesystem, or 651 * if #MDB_NOLOCK is set. 652 * 653 * Since the database uses multi-version concurrency control, readers don't 654 * actually need any locking. This table is used to keep track of which 655 * readers are using data from which old transactions, so that we'll know 656 * when a particular old transaction is no longer in use. Old transactions 657 * that have discarded any data pages can then have those pages reclaimed 658 * for use by a later write transaction. 659 * 660 * The lock table is constructed such that reader slots are aligned with the 661 * processor's cache line size. Any slot is only ever used by one thread. 662 * This alignment guarantees that there will be no contention or cache 663 * thrashing as threads update their own slot info, and also eliminates 664 * any need for locking when accessing a slot. 665 * 666 * A writer thread will scan every slot in the table to determine the oldest 667 * outstanding reader transaction. Any freed pages older than this will be 668 * reclaimed by the writer. The writer doesn't use any locks when scanning 669 * this table. This means that there's no guarantee that the writer will 670 * see the most up-to-date reader info, but that's not required for correct 671 * operation - all we need is to know the upper bound on the oldest reader, 672 * we don't care at all about the newest reader. So the only consequence of 673 * reading stale information here is that old pages might hang around a 674 * while longer before being reclaimed. That's actually good anyway, because 675 * the longer we delay reclaiming old pages, the more likely it is that a 676 * string of contiguous pages can be found after coalescing old pages from 677 * many old transactions together. 678 * @{ 679 */ 680 /** Number of slots in the reader table. 681 * This value was chosen somewhat arbitrarily. 
126 readers plus a 682 * couple mutexes fit exactly into 8KB on my development machine. 683 * Applications should set the table size using #mdb_env_set_maxreaders(). 684 */ 685 #define DEFAULT_READERS 126 686 687 /** The size of a CPU cache line in bytes. We want our lock structures 688 * aligned to this size to avoid false cache line sharing in the 689 * lock table. 690 * This value works for most CPUs. For Itanium this should be 128. 691 */ 692 #ifndef CACHELINE 693 #define CACHELINE 64 694 #endif 695 696 /** The information we store in a single slot of the reader table. 697 * In addition to a transaction ID, we also record the process and 698 * thread ID that owns a slot, so that we can detect stale information, 699 * e.g. threads or processes that went away without cleaning up. 700 * @note We currently don't check for stale records. We simply re-init 701 * the table when we know that we're the only process opening the 702 * lock file. 703 */ 704 typedef struct MDB_rxbody { 705 /** Current Transaction ID when this transaction began, or (txnid_t)-1. 706 * Multiple readers that start at the same time will probably have the 707 * same ID here. Again, it's not important to exclude them from 708 * anything; all we need to know is which version of the DB they 709 * started from so we can avoid overwriting any data used in that 710 * particular version. 711 */ 712 volatile txnid_t mrb_txnid; 713 /** The process ID of the process owning this reader txn. */ 714 volatile MDB_PID_T mrb_pid; 715 /** The thread ID of the thread owning this txn. */ 716 volatile MDB_THR_T mrb_tid; 717 } MDB_rxbody; 718 719 /** The actual reader record, with cacheline padding. 
*/ 720 typedef struct MDB_reader { 721 union { 722 MDB_rxbody mrx; 723 /** shorthand for mrb_txnid */ 724 #define mr_txnid mru.mrx.mrb_txnid 725 #define mr_pid mru.mrx.mrb_pid 726 #define mr_tid mru.mrx.mrb_tid 727 /** cache line alignment */ 728 char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)]; 729 } mru; 730 } MDB_reader; 731 732 /** The header for the reader table. 733 * The table resides in a memory-mapped file. (This is a different file 734 * than is used for the main database.) 735 * 736 * For POSIX the actual mutexes reside in the shared memory of this 737 * mapped file. On Windows, mutexes are named objects allocated by the 738 * kernel; we store the mutex names in this mapped file so that other 739 * processes can grab them. This same approach is also used on 740 * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support 741 * process-shared POSIX mutexes. For these cases where a named object 742 * is used, the object name is derived from a 64 bit FNV hash of the 743 * environment pathname. As such, naming collisions are extremely 744 * unlikely. If a collision occurs, the results are unpredictable. 745 */ 746 typedef struct MDB_txbody { 747 /** Stamp identifying this as an LMDB file. It must be set 748 * to #MDB_MAGIC. */ 749 uint32_t mtb_magic; 750 /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ 751 uint32_t mtb_format; 752 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) 753 char mtb_rmname[MNAME_LEN]; 754 #else 755 /** Mutex protecting access to this table. 756 * This is the reader table lock used with LOCK_MUTEX(). 757 */ 758 mdb_mutex_t mtb_rmutex; 759 #endif 760 /** The ID of the last transaction committed to the database. 761 * This is recorded here only for convenience; the value can always 762 * be determined by reading the main database meta pages. 763 */ 764 volatile txnid_t mtb_txnid; 765 /** The number of slots that have been used in the reader table. 
766 * This always records the maximum count, it is not decremented 767 * when readers release their slots. 768 */ 769 volatile unsigned mtb_numreaders; 770 } MDB_txbody; 771 772 /** The actual reader table definition. */ 773 typedef struct MDB_txninfo { 774 union { 775 MDB_txbody mtb; 776 #define mti_magic mt1.mtb.mtb_magic 777 #define mti_format mt1.mtb.mtb_format 778 #define mti_rmutex mt1.mtb.mtb_rmutex 779 #define mti_rmname mt1.mtb.mtb_rmname 780 #define mti_txnid mt1.mtb.mtb_txnid 781 #define mti_numreaders mt1.mtb.mtb_numreaders 782 char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; 783 } mt1; 784 union { 785 #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) 786 char mt2_wmname[MNAME_LEN]; 787 #define mti_wmname mt2.mt2_wmname 788 #else 789 mdb_mutex_t mt2_wmutex; 790 #define mti_wmutex mt2.mt2_wmutex 791 #endif 792 char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; 793 } mt2; 794 MDB_reader mti_readers[1]; 795 } MDB_txninfo; 796 797 /** Lockfile format signature: version, features and field layout */ 798 #define MDB_LOCK_FORMAT \ 799 ((uint32_t) \ 800 ((MDB_LOCK_VERSION) \ 801 /* Flags which describe functionality */ \ 802 + (((MDB_PIDLOCK) != 0) << 16))) 803 /** @} */ 804 805 /** Common header for all page types. The page type depends on #mp_flags. 806 * 807 * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with 808 * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages 809 * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. 810 * 811 * #P_OVERFLOW records occupy one or more contiguous pages where only the 812 * first has a page header. They hold the real data of #F_BIGDATA nodes. 813 * 814 * #P_SUBP sub-pages are small leaf "pages" with duplicate data. 815 * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. 816 * (Duplicate data can also go in sub-databases, which use normal pages.) 817 * 818 * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. 
819 * 820 * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once 821 * in the snapshot: Either used by a database or listed in a freeDB record. 822 */ 823 typedef struct MDB_page { 824 #define mp_pgno mp_p.p_pgno 825 #define mp_next mp_p.p_next 826 union { 827 pgno_t p_pgno; /**< page number */ 828 struct MDB_page *p_next; /**< for in-memory list of freed pages */ 829 } mp_p; 830 uint16_t mp_pad; /**< key size if this is a LEAF2 page */ 831 /** @defgroup mdb_page Page Flags 832 * @ingroup internal 833 * Flags for the page headers. 834 * @{ 835 */ 836 #define P_BRANCH 0x01 /**< branch page */ 837 #define P_LEAF 0x02 /**< leaf page */ 838 #define P_OVERFLOW 0x04 /**< overflow page */ 839 #define P_META 0x08 /**< meta page */ 840 #define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ 841 #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ 842 #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ 843 #define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ 844 #define P_KEEP 0x8000 /**< leave this page alone during spill */ 845 /** @} */ 846 uint16_t mp_flags; /**< @ref mdb_page */ 847 #define mp_lower mp_pb.pb.pb_lower 848 #define mp_upper mp_pb.pb.pb_upper 849 #define mp_pages mp_pb.pb_pages 850 union { 851 struct { 852 indx_t pb_lower; /**< lower bound of free space */ 853 indx_t pb_upper; /**< upper bound of free space */ 854 } pb; 855 uint32_t pb_pages; /**< number of overflow pages */ 856 } mp_pb; 857 indx_t mp_ptrs[0]; /**< dynamic size */ 858 } MDB_page; 859 860 /** Alternate page header, for 2-byte aligned access */ 861 typedef struct MDB_page2 { 862 uint16_t mp2_p[sizeof(pgno_t)/2]; 863 uint16_t mp2_pad; 864 uint16_t mp2_flags; 865 indx_t mp2_lower; 866 indx_t mp2_upper; 867 indx_t mp2_ptrs[0]; 868 } MDB_page2; 869 870 #define MP_PGNO(p) (((MDB_page2 *)(void *)(p))->mp2_p) 871 #define MP_PAD(p) (((MDB_page2 *)(void *)(p))->mp2_pad) 872 #define MP_FLAGS(p) (((MDB_page2 *)(void *)(p))->mp2_flags) 873 #define 
MP_LOWER(p) (((MDB_page2 *)(void *)(p))->mp2_lower) 874 #define MP_UPPER(p) (((MDB_page2 *)(void *)(p))->mp2_upper) 875 #define MP_PTRS(p) (((MDB_page2 *)(void *)(p))->mp2_ptrs) 876 877 /** Size of the page header, excluding dynamic data at the end */ 878 #define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs)) 879 880 /** Address of first usable data byte in a page, after the header */ 881 #define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) 882 883 /** ITS#7713, change PAGEBASE to handle 65536 byte pages */ 884 #define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) 885 886 /** Number of nodes on a page */ 887 #define NUMKEYS(p) ((MP_LOWER(p) - (PAGEHDRSZ-PAGEBASE)) >> 1) 888 889 /** The amount of space remaining in the page */ 890 #define SIZELEFT(p) (indx_t)(MP_UPPER(p) - MP_LOWER(p)) 891 892 /** The percentage of space used in the page, in tenths of a percent. */ 893 #define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ 894 ((env)->me_psize - PAGEHDRSZ)) 895 /** The minimum page fill factor, in tenths of a percent. 896 * Pages emptier than this are candidates for merging. 897 */ 898 #define FILL_THRESHOLD 250 899 900 /** Test if a page is a leaf page */ 901 #define IS_LEAF(p) F_ISSET(MP_FLAGS(p), P_LEAF) 902 /** Test if a page is a LEAF2 page */ 903 #define IS_LEAF2(p) F_ISSET(MP_FLAGS(p), P_LEAF2) 904 /** Test if a page is a branch page */ 905 #define IS_BRANCH(p) F_ISSET(MP_FLAGS(p), P_BRANCH) 906 /** Test if a page is an overflow page */ 907 #define IS_OVERFLOW(p) F_ISSET(MP_FLAGS(p), P_OVERFLOW) 908 /** Test if a page is a sub page */ 909 #define IS_SUBP(p) F_ISSET(MP_FLAGS(p), P_SUBP) 910 911 /** The number of overflow pages needed to store the given size. */ 912 #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) 913 914 /** Link in #MDB_txn.%mt_loose_pgs list. 915 * Kept outside the page header, which is needed when reusing the page. 
916 */ 917 #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) 918 919 /** Header for a single key/data pair within a page. 920 * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. 921 * We guarantee 2-byte alignment for 'MDB_node's. 922 * 923 * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child 924 * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used 925 * for pgno. (Branch nodes have no flags). Lo and hi are in host byte 926 * order in case some accesses can be optimized to 32-bit word access. 927 * 928 * Leaf node flags describe node contents. #F_BIGDATA says the node's 929 * data part is the page number of an overflow page with actual data. 930 * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in 931 * a sub-page/sub-database, and named databases (just #F_SUBDATA). 932 */ 933 typedef struct MDB_node { 934 /** part of data size or pgno 935 * @{ */ 936 #if BYTE_ORDER == LITTLE_ENDIAN 937 unsigned short mn_lo, mn_hi; 938 #else 939 unsigned short mn_hi, mn_lo; 940 #endif 941 /** @} */ 942 /** @defgroup mdb_node Node Flags 943 * @ingroup internal 944 * Flags for node headers. 945 * @{ 946 */ 947 #define F_BIGDATA 0x01 /**< data put on overflow page */ 948 #define F_SUBDATA 0x02 /**< data is a sub-database */ 949 #define F_DUPDATA 0x04 /**< data has duplicates */ 950 951 /** valid flags for #mdb_node_add() */ 952 #define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND) 953 954 /** @} */ 955 unsigned short mn_flags; /**< @ref mdb_node */ 956 unsigned short mn_ksize; /**< key size */ 957 char mn_data[1]; /**< key and data are appended here */ 958 } MDB_node; 959 960 /** Size of the node header, excluding dynamic data at the end */ 961 #define NODESIZE offsetof(MDB_node, mn_data) 962 963 /** Bit position of top word in page number, for shifting mn_flags */ 964 #define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) 965 966 /** Size of a node in a branch page with a given key. 
967 * This is just the node header plus the key, there is no data. 968 */ 969 #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) 970 971 /** Size of a node in a leaf page with a given key and data. 972 * This is node header plus key plus data size. 973 */ 974 #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) 975 976 /** Address of node \b i in page \b p */ 977 #define NODEPTR(p, i) ((MDB_node *)((char *)(p) + MP_PTRS(p)[i] + PAGEBASE)) 978 979 /** Address of the key for the node */ 980 #define NODEKEY(node) (void *)((node)->mn_data) 981 982 /** Address of the data for a node */ 983 #define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) 984 985 /** Get the page number pointed to by a branch node */ 986 #define NODEPGNO(node) \ 987 ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \ 988 (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0)) 989 /** Set the page number in a branch node */ 990 #define SETPGNO(node,pgno) do { \ 991 (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \ 992 if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0) 993 994 /** Get the size of the data in a leaf node */ 995 #define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) 996 /** Set the size of the data for a leaf node */ 997 #define SETDSZ(node,size) do { \ 998 (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0) 999 /** The size of a key in a node */ 1000 #define NODEKSZ(node) ((node)->mn_ksize) 1001 1002 /** Copy a page number from src to dst */ 1003 #ifdef MISALIGNED_OK 1004 #define COPY_PGNO(dst,src) dst = src 1005 #undef MP_PGNO 1006 #define MP_PGNO(p) ((p)->mp_pgno) 1007 #else 1008 #if SIZE_MAX > 4294967295UL 1009 #define COPY_PGNO(dst,src) do { \ 1010 unsigned short *s, *d; \ 1011 s = (unsigned short *)&(src); \ 1012 d = (unsigned short *)&(dst); \ 1013 *d++ = *s++; \ 1014 *d++ = *s++; \ 1015 *d++ = *s++; \ 1016 *d = *s; \ 1017 } while (0) 1018 #else 1019 
#define COPY_PGNO(dst,src) do { \ 1020 unsigned short *s, *d; \ 1021 s = (unsigned short *)&(src); \ 1022 d = (unsigned short *)&(dst); \ 1023 *d++ = *s++; \ 1024 *d = *s; \ 1025 } while (0) 1026 #endif 1027 #endif 1028 /** The address of a key in a LEAF2 page. 1029 * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs. 1030 * There are no node headers, keys are stored contiguously. 1031 */ 1032 #define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks))) 1033 1034 /** Set the \b node's key into \b keyptr, if requested. */ 1035 #define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \ 1036 (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } } 1037 1038 /** Set the \b node's key into \b key. */ 1039 #define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); } 1040 1041 /** Information about a single database in the environment. */ 1042 typedef struct MDB_db { 1043 uint32_t md_pad; /**< also ksize for LEAF2 pages */ 1044 uint16_t md_flags; /**< @ref mdb_dbi_open */ 1045 uint16_t md_depth; /**< depth of this tree */ 1046 pgno_t md_branch_pages; /**< number of internal pages */ 1047 pgno_t md_leaf_pages; /**< number of leaf pages */ 1048 pgno_t md_overflow_pages; /**< number of overflow pages */ 1049 size_t md_entries; /**< number of data items */ 1050 pgno_t md_root; /**< the root page of this tree */ 1051 } MDB_db; 1052 1053 #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ 1054 #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) 1055 /** #mdb_dbi_open() flags */ 1056 #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ 1057 MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) 1058 1059 /** Handle for the DB used to track free pages. */ 1060 #define FREE_DBI 0 1061 /** Handle for the default DB. 
*/ 1062 #define MAIN_DBI 1 1063 /** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ 1064 #define CORE_DBS 2 1065 1066 /** Number of meta pages - also hardcoded elsewhere */ 1067 #define NUM_METAS 2 1068 1069 /** Meta page content. 1070 * A meta page is the start point for accessing a database snapshot. 1071 * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). 1072 */ 1073 typedef struct MDB_meta { 1074 /** Stamp identifying this as an LMDB file. It must be set 1075 * to #MDB_MAGIC. */ 1076 uint32_t mm_magic; 1077 /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ 1078 uint32_t mm_version; 1079 void *mm_address; /**< address for fixed mapping */ 1080 size_t mm_mapsize; /**< size of mmap region */ 1081 MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ 1082 /** The size of pages used in this DB */ 1083 #define mm_psize mm_dbs[FREE_DBI].md_pad 1084 /** Any persistent environment flags. @ref mdb_env */ 1085 #define mm_flags mm_dbs[FREE_DBI].md_flags 1086 /** Last used page in the datafile. 1087 * Actually the file may be shorter if the freeDB lists the final pages. 1088 */ 1089 pgno_t mm_last_pg; 1090 volatile txnid_t mm_txnid; /**< txnid that committed this page */ 1091 } MDB_meta; 1092 1093 /** Buffer for a stack-allocated meta page. 1094 * The members define size and alignment, and silence type 1095 * aliasing warnings. They are not used directly; that could 1096 * mean incorrectly using several union members in parallel. 1097 */ 1098 typedef union MDB_metabuf { 1099 MDB_page mb_page; 1100 struct { 1101 char mm_pad[PAGEHDRSZ]; 1102 MDB_meta mm_meta; 1103 } mb_metabuf; 1104 } MDB_metabuf; 1105 1106 /** Auxiliary DB info. 1107 * The information here is mostly static/read-only. There is 1108 * only a single copy of this record in the environment. 
1109 */ 1110 typedef struct MDB_dbx { 1111 MDB_val md_name; /**< name of the database */ 1112 MDB_cmp_func *md_cmp; /**< function for comparing keys */ 1113 MDB_cmp_func *md_dcmp; /**< function for comparing data items */ 1114 MDB_rel_func *md_rel; /**< user relocate function */ 1115 void *md_relctx; /**< user-provided context for md_rel */ 1116 } MDB_dbx; 1117 1118 /** A database transaction. 1119 * Every operation requires a transaction handle. 1120 */ 1121 struct MDB_txn { 1122 MDB_txn *mt_parent; /**< parent of a nested txn */ 1123 /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ 1124 MDB_txn *mt_child; 1125 pgno_t mt_next_pgno; /**< next unallocated page */ 1126 /** The ID of this transaction. IDs are integers incrementing from 1. 1127 * Only committed write transactions increment the ID. If a transaction 1128 * aborts, the ID may be re-used by the next writer. 1129 */ 1130 txnid_t mt_txnid; 1131 MDB_env *mt_env; /**< the DB environment */ 1132 /** The list of pages that became unused during this transaction. 1133 */ 1134 MDB_IDL mt_free_pgs; 1135 /** The list of loose pages that became unused and may be reused 1136 * in this transaction, linked through #NEXT_LOOSE_PAGE(page). 1137 */ 1138 MDB_page *mt_loose_pgs; 1139 /** Number of loose pages (#mt_loose_pgs) */ 1140 int mt_loose_count; 1141 /** The sorted list of dirty pages we temporarily wrote to disk 1142 * because the dirty list was full. page numbers in here are 1143 * shifted left by 1, deleted slots have the LSB set. 1144 */ 1145 MDB_IDL mt_spill_pgs; 1146 union { 1147 /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ 1148 MDB_ID2L dirty_list; 1149 /** For read txns: This thread/txn's reader table slot, or NULL. */ 1150 MDB_reader *reader; 1151 } mt_u; 1152 /** Array of records for each DB known in the environment. 
*/ 1153 MDB_dbx *mt_dbxs; 1154 /** Array of MDB_db records for each known DB */ 1155 MDB_db *mt_dbs; 1156 /** Array of sequence numbers for each DB handle */ 1157 unsigned int *mt_dbiseqs; 1158 /** @defgroup mt_dbflag Transaction DB Flags 1159 * @ingroup internal 1160 * @{ 1161 */ 1162 #define DB_DIRTY 0x01 /**< DB was written in this txn */ 1163 #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ 1164 #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ 1165 #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ 1166 #define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ 1167 #define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ 1168 /** @} */ 1169 /** In write txns, array of cursors for each DB */ 1170 MDB_cursor **mt_cursors; 1171 /** Array of flags for each DB */ 1172 unsigned char *mt_dbflags; 1173 /** Number of DB records in use, or 0 when the txn is finished. 1174 * This number only ever increments until the txn finishes; we 1175 * don't decrement it when individual DB handles are closed. 
1176 */ 1177 MDB_dbi mt_numdbs; 1178 1179 /** @defgroup mdb_txn Transaction Flags 1180 * @ingroup internal 1181 * @{ 1182 */ 1183 /** #mdb_txn_begin() flags */ 1184 #define MDB_TXN_BEGIN_FLAGS MDB_RDONLY 1185 #define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ 1186 /* internal txn flags */ 1187 #define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */ 1188 #define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ 1189 #define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ 1190 #define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ 1191 #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ 1192 #define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ 1193 /** most operations on the txn are currently illegal */ 1194 #define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD) 1195 /** @} */ 1196 unsigned int mt_flags; /**< @ref mdb_txn */ 1197 /** #dirty_list room: Array size - \#dirty pages visible to this txn. 1198 * Includes ancestor txns' dirty pages not hidden by other txns' 1199 * dirty/spilled pages. Thus commit(nested txn) has room to merge 1200 * dirty_list into mt_parent after freeing hidden mt_parent pages. 1201 */ 1202 unsigned int mt_dirty_room; 1203 }; 1204 1205 /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. 1206 * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to 1207 * raise this on a 64 bit machine. 1208 */ 1209 #define CURSOR_STACK 32 1210 1211 struct MDB_xcursor; 1212 1213 /** Cursors are used for all DB operations. 1214 * A cursor holds a path of (page pointer, key index) from the DB 1215 * root to a position in the DB, plus other state. #MDB_DUPSORT 1216 * cursors include an xcursor to the current data item. Write txns 1217 * track their cursors and keep them up to date when data moves. 1218 * Exception: An xcursor's pointer to a #P_SUBP page can be stale. 
1219 * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). 1220 */ 1221 struct MDB_cursor { 1222 /** Next cursor on this DB in this txn */ 1223 MDB_cursor *mc_next; 1224 /** Backup of the original cursor if this cursor is a shadow */ 1225 MDB_cursor *mc_backup; 1226 /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ 1227 struct MDB_xcursor *mc_xcursor; 1228 /** The transaction that owns this cursor */ 1229 MDB_txn *mc_txn; 1230 /** The database handle this cursor operates on */ 1231 MDB_dbi mc_dbi; 1232 /** The database record for this cursor */ 1233 MDB_db *mc_db; 1234 /** The database auxiliary record for this cursor */ 1235 MDB_dbx *mc_dbx; 1236 /** The @ref mt_dbflag for this database */ 1237 unsigned char *mc_dbflag; 1238 unsigned short mc_snum; /**< number of pushed pages */ 1239 unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ 1240 /** @defgroup mdb_cursor Cursor Flags 1241 * @ingroup internal 1242 * Cursor state flags. 1243 * @{ 1244 */ 1245 #define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ 1246 #define C_EOF 0x02 /**< No more data */ 1247 #define C_SUB 0x04 /**< Cursor is a sub-cursor */ 1248 #define C_DEL 0x08 /**< last op was a cursor_del */ 1249 #define C_UNTRACK 0x40 /**< Un-track cursor when closing */ 1250 /** @} */ 1251 unsigned int mc_flags; /**< @ref mdb_cursor */ 1252 MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ 1253 indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ 1254 }; 1255 1256 /** Context for sorted-dup records. 1257 * We could have gone to a fully recursive design, with arbitrarily 1258 * deep nesting of sub-databases. But for now we only handle these 1259 * levels - main DB, optional sub-DB, sorted-duplicate DB. 
1260 */ 1261 typedef struct MDB_xcursor { 1262 /** A sub-cursor for traversing the Dup DB */ 1263 MDB_cursor mx_cursor; 1264 /** The database record for this Dup DB */ 1265 MDB_db mx_db; 1266 /** The auxiliary DB record for this Dup DB */ 1267 MDB_dbx mx_dbx; 1268 /** The @ref mt_dbflag for this Dup DB */ 1269 unsigned char mx_dbflag; 1270 } MDB_xcursor; 1271 1272 /** Check if there is an inited xcursor */ 1273 #define XCURSOR_INITED(mc) \ 1274 ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) 1275 1276 /** Update the xcursor's sub-page pointer, if any, in \b mc. Needed 1277 * when the node which contains the sub-page may have moved. Called 1278 * with leaf page \b mp = mc->mc_pg[\b top]. 1279 */ 1280 #define XCURSOR_REFRESH(mc, top, mp) do { \ 1281 MDB_page *xr_pg = (mp); \ 1282 MDB_node *xr_node; \ 1283 if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \ 1284 xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \ 1285 if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ 1286 (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ 1287 } while (0) 1288 1289 /** State of FreeDB old pages, stored in the MDB_env */ 1290 typedef struct MDB_pgstate { 1291 pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ 1292 txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ 1293 } MDB_pgstate; 1294 1295 /** The database environment. */ 1296 struct MDB_env { 1297 HANDLE me_fd; /**< The main data file */ 1298 HANDLE me_lfd; /**< The lock file */ 1299 HANDLE me_mfd; /**< For writing and syncing the meta pages */ 1300 /** Failed to update the meta page. Probably an I/O error. */ 1301 #define MDB_FATAL_ERROR 0x80000000U 1302 /** Some fields are initialized. 
*/ 1303 #define MDB_ENV_ACTIVE 0x20000000U 1304 /** me_txkey is set */ 1305 #define MDB_ENV_TXKEY 0x10000000U 1306 /** fdatasync is unreliable */ 1307 #define MDB_FSYNCONLY 0x08000000U 1308 uint32_t me_flags; /**< @ref mdb_env */ 1309 unsigned int me_psize; /**< DB page size, inited from me_os_psize */ 1310 unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ 1311 unsigned int me_maxreaders; /**< size of the reader table */ 1312 /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */ 1313 volatile int me_close_readers; 1314 MDB_dbi me_numdbs; /**< number of DBs opened */ 1315 MDB_dbi me_maxdbs; /**< size of the DB table */ 1316 MDB_PID_T me_pid; /**< process ID of this env */ 1317 char *me_path; /**< path to the DB files */ 1318 char *me_map; /**< the memory map of the data file */ 1319 MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ 1320 MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */ 1321 void *me_pbuf; /**< scratch area for DUPSORT put() */ 1322 MDB_txn *me_txn; /**< current write transaction */ 1323 MDB_txn *me_txn0; /**< prealloc'd write transaction */ 1324 size_t me_mapsize; /**< size of the data memory map */ 1325 off_t me_size; /**< current file size */ 1326 pgno_t me_maxpg; /**< me_mapsize / me_psize */ 1327 MDB_dbx *me_dbxs; /**< array of static DB info */ 1328 uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ 1329 unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */ 1330 pthread_key_t me_txkey; /**< thread-key for readers */ 1331 txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ 1332 MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ 1333 # define me_pglast me_pgstate.mf_pglast 1334 # define me_pghead me_pgstate.mf_pghead 1335 MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ 1336 /** IDL of pages that became unused in a write txn */ 1337 MDB_IDL me_free_pgs; 1338 /** ID2L of pages written during a write txn. 
Length MDB_IDL_UM_SIZE. */ 1339 MDB_ID2L me_dirty_list; 1340 /** Max number of freelist items that can fit in a single overflow page */ 1341 int me_maxfree_1pg; 1342 /** Max size of a node on a page */ 1343 unsigned int me_nodemax; 1344 #if !(MDB_MAXKEYSIZE) 1345 unsigned int me_maxkey; /**< max size of a key */ 1346 #endif 1347 int me_live_reader; /**< have liveness lock in reader table */ 1348 #ifdef _WIN32 1349 int me_pidquery; /**< Used in OpenProcess */ 1350 #endif 1351 #ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */ 1352 # define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */ 1353 # define me_wmutex me_txns->mti_wmutex /**< Shared writer lock */ 1354 #else 1355 mdb_mutex_t me_rmutex; 1356 mdb_mutex_t me_wmutex; 1357 #endif 1358 void *me_userctx; /**< User-settable context */ 1359 MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ 1360 }; 1361 1362 /** Nested transaction */ 1363 typedef struct MDB_ntxn { 1364 MDB_txn mnt_txn; /**< the transaction */ 1365 MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ 1366 } MDB_ntxn; 1367 1368 /** max number of pages to commit in one writev() call */ 1369 #define MDB_COMMIT_PAGES 64 1370 #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES 1371 #undef MDB_COMMIT_PAGES 1372 #define MDB_COMMIT_PAGES IOV_MAX 1373 #endif 1374 1375 /** max bytes to write in one call */ 1376 #define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4)) 1377 1378 /** Check \b txn and \b dbi arguments to a function */ 1379 #define TXN_DBI_EXIST(txn, dbi, validity) \ 1380 ((txn) && (dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) 1381 1382 /** Check for misused \b dbi handles */ 1383 #define TXN_DBI_CHANGED(txn, dbi) \ 1384 ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) 1385 1386 static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); 1387 static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); 1388 static int 
mdb_page_touch(MDB_cursor *mc); 1389 1390 #define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \ 1391 "reset-tmp", "fail-begin", "fail-beginchild"} 1392 enum { 1393 /* mdb_txn_end operation number, for logging */ 1394 MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET, 1395 MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD 1396 }; 1397 #define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */ 1398 #define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ 1399 #define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ 1400 #define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ 1401 static void mdb_txn_end(MDB_txn *txn, unsigned mode); 1402 1403 static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); 1404 static int mdb_page_search_root(MDB_cursor *mc, 1405 MDB_val *key, int modify); 1406 #define MDB_PS_MODIFY 1 1407 #define MDB_PS_ROOTONLY 2 1408 #define MDB_PS_FIRST 4 1409 #define MDB_PS_LAST 8 1410 static int mdb_page_search(MDB_cursor *mc, 1411 MDB_val *key, int flags); 1412 static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); 1413 1414 #define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ 1415 static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, 1416 pgno_t newpgno, unsigned int nflags); 1417 1418 static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); 1419 static MDB_meta *mdb_env_pick_meta(const MDB_env *env); 1420 static int mdb_env_write_meta(MDB_txn *txn); 1421 #if defined(MDB_USE_POSIX_MUTEX) && !defined(MDB_ROBUST_SUPPORTED) /* Drop unused excl arg */ 1422 # define mdb_env_close0(env, excl) mdb_env_close1(env) 1423 #endif 1424 static void mdb_env_close0(MDB_env *env, int excl); 1425 1426 static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); 1427 static int mdb_node_add(MDB_cursor *mc, indx_t indx, 1428 MDB_val *key, MDB_val *data, pgno_t pgno, unsigned 
int flags); 1429 static void mdb_node_del(MDB_cursor *mc, int ksize); 1430 static void mdb_node_shrink(MDB_page *mp, indx_t indx); 1431 static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); 1432 static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data); 1433 static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); 1434 static size_t mdb_branch_size(MDB_env *env, MDB_val *key); 1435 1436 static int mdb_rebalance(MDB_cursor *mc); 1437 static int mdb_update_key(MDB_cursor *mc, MDB_val *key); 1438 1439 static void mdb_cursor_pop(MDB_cursor *mc); 1440 static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); 1441 1442 static int _mdb_cursor_del(MDB_cursor *mc, unsigned int flags); 1443 static int _mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, unsigned int flags); 1444 1445 static int mdb_cursor_del0(MDB_cursor *mc); 1446 static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); 1447 static int mdb_cursor_sibling(MDB_cursor *mc, int move_right); 1448 static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); 1449 static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); 1450 static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, 1451 int *exactp); 1452 static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); 1453 static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); 1454 1455 static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); 1456 static void mdb_xcursor_init0(MDB_cursor *mc); 1457 static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); 1458 static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); 1459 1460 static int mdb_drop0(MDB_cursor *mc, int subs); 1461 static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); 1462 static int mdb_reader_check0(MDB_env *env, int rlocked, 
int *dead); 1463 1464 /** @cond */ 1465 static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long; 1466 /** @endcond */ 1467 1468 /** Compare two items pointing at size_t's of unknown alignment. */ 1469 #ifdef MISALIGNED_OK 1470 # define mdb_cmp_clong mdb_cmp_long 1471 #else 1472 # define mdb_cmp_clong mdb_cmp_cint 1473 #endif 1474 1475 #ifdef _WIN32 1476 static SECURITY_DESCRIPTOR mdb_null_sd; 1477 static SECURITY_ATTRIBUTES mdb_all_sa; 1478 static int mdb_sec_inited; 1479 1480 struct MDB_name; 1481 static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra); 1482 #endif 1483 1484 /** Return the library version info. */ 1485 char * ESECT 1486 mdb_version(int *major, int *minor, int *patch) 1487 { 1488 if (major) *major = MDB_VERSION_MAJOR; 1489 if (minor) *minor = MDB_VERSION_MINOR; 1490 if (patch) *patch = MDB_VERSION_PATCH; 1491 return MDB_VERSION_STRING; 1492 } 1493 1494 /** Table of descriptions for LMDB @ref errors */ 1495 static char *const mdb_errstr[] = { 1496 "MDB_KEYEXIST: Key/data pair already exists", 1497 "MDB_NOTFOUND: No matching key/data pair found", 1498 "MDB_PAGE_NOTFOUND: Requested page not found", 1499 "MDB_CORRUPTED: Located page was wrong type", 1500 "MDB_PANIC: Update of meta page failed or environment had fatal error", 1501 "MDB_VERSION_MISMATCH: Database environment version mismatch", 1502 "MDB_INVALID: File is not an LMDB file", 1503 "MDB_MAP_FULL: Environment mapsize limit reached", 1504 "MDB_DBS_FULL: Environment maxdbs limit reached", 1505 "MDB_READERS_FULL: Environment maxreaders limit reached", 1506 "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", 1507 "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", 1508 "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", 1509 "MDB_PAGE_FULL: Internal error - page has no more space", 1510 "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", 1511 "MDB_INCOMPATIBLE: 
Operation and DB incompatible, or DB flags changed", 1512 "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", 1513 "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", 1514 "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", 1515 "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", 1516 }; 1517 1518 char * 1519 mdb_strerror(int err) 1520 { 1521 #ifdef _WIN32 1522 /** HACK: pad 4KB on stack over the buf. Return system msgs in buf. 1523 * This works as long as no function between the call to mdb_strerror 1524 * and the actual use of the message uses more than 4K of stack. 1525 */ 1526 #define MSGSIZE 1024 1527 #define PADSIZE 4096 1528 char buf[MSGSIZE+PADSIZE], *ptr = buf; 1529 #endif 1530 int i; 1531 if (!err) 1532 return ("Successful return: 0"); 1533 1534 if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { 1535 i = err - MDB_KEYEXIST; 1536 return mdb_errstr[i]; 1537 } 1538 1539 #ifdef _WIN32 1540 /* These are the C-runtime error codes we use. The comment indicates 1541 * their numeric value, and the Win32 error they would correspond to 1542 * if the error actually came from a Win32 API. A major mess, we should 1543 * have used LMDB-specific error codes for everything. 
1544 */ 1545 switch(err) { 1546 case ENOENT: /* 2, FILE_NOT_FOUND */ 1547 case EIO: /* 5, ACCESS_DENIED */ 1548 case ENOMEM: /* 12, INVALID_ACCESS */ 1549 case EACCES: /* 13, INVALID_DATA */ 1550 case EBUSY: /* 16, CURRENT_DIRECTORY */ 1551 case EINVAL: /* 22, BAD_COMMAND */ 1552 case ENOSPC: /* 28, OUT_OF_PAPER */ 1553 return strerror(err); 1554 default: 1555 ; 1556 } 1557 buf[0] = 0; 1558 FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | 1559 FORMAT_MESSAGE_IGNORE_INSERTS, 1560 NULL, err, 0, ptr, MSGSIZE, NULL); 1561 return ptr; 1562 #else 1563 if (err < 0) 1564 return "Invalid error code"; 1565 return strerror(err); 1566 #endif 1567 } 1568 1569 /** assert(3) variant in cursor context */ 1570 #define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) 1571 /** assert(3) variant in transaction context */ 1572 #define mdb_tassert(txn, expr) mdb_assert0((txn)->mt_env, expr, #expr) 1573 /** assert(3) variant in environment context */ 1574 #define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) 1575 1576 #ifndef NDEBUG 1577 # define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \ 1578 mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__)) 1579 1580 static void ESECT 1581 mdb_assert_fail(MDB_env *env, const char *expr_txt, 1582 const char *func, const char *file, int line) 1583 { 1584 char buf[400]; 1585 sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()", 1586 file, line, expr_txt, func); 1587 if (env->me_assert_func) 1588 env->me_assert_func(env, buf); 1589 fprintf(stderr, "%s\n", buf); 1590 abort(); 1591 } 1592 #else 1593 # define mdb_assert0(env, expr, expr_txt) ((void) 0) 1594 #endif /* NDEBUG */ 1595 1596 #if MDB_DEBUG 1597 /** Return the page number of \b mp which may be sub-page, for debug output */ 1598 static pgno_t 1599 mdb_dbg_pgno(MDB_page *mp) 1600 { 1601 pgno_t ret; 1602 COPY_PGNO(ret, MP_PGNO(mp)); 1603 return ret; 1604 } 1605 1606 /** Display a key in hexadecimal and return the address of the result. 
1607 * @param[in] key the key to display 1608 * @param[in] buf the buffer to write into. Should always be #DKBUF. 1609 * @return The key in hexadecimal form. 1610 */ 1611 char * 1612 mdb_dkey(MDB_val *key, char *buf) 1613 { 1614 char *ptr = buf; 1615 unsigned char *c = key->mv_data; 1616 unsigned int i; 1617 1618 if (!key) 1619 return ""; 1620 1621 if (key->mv_size > DKBUF_MAXKEYSIZE) 1622 return "MDB_MAXKEYSIZE"; 1623 /* may want to make this a dynamic check: if the key is mostly 1624 * printable characters, print it as-is instead of converting to hex. 1625 */ 1626 #if 1 1627 buf[0] = '\0'; 1628 for (i=0; i<key->mv_size; i++) 1629 ptr += sprintf(ptr, "%02x", *c++); 1630 #else 1631 sprintf(buf, "%.*s", key->mv_size, key->mv_data); 1632 #endif 1633 return buf; 1634 } 1635 1636 static char * 1637 mdb_dval(MDB_txn *txn, MDB_dbi dbi, MDB_val *data, char *buf) 1638 { 1639 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { 1640 mdb_dkey(data, buf+1); 1641 *buf = '['; 1642 strcpy(buf + data->mv_size * 2 + 1, "]"); 1643 } else 1644 *buf = '\0'; 1645 return buf; 1646 } 1647 1648 static const char * 1649 mdb_leafnode_type(MDB_node *n) 1650 { 1651 static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; 1652 return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : 1653 tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; 1654 } 1655 1656 /** Display all the keys in the page. */ 1657 void 1658 mdb_page_list(MDB_page *mp) 1659 { 1660 pgno_t pgno = mdb_dbg_pgno(mp); 1661 const char *type, *state = (MP_FLAGS(mp) & P_DIRTY) ? 
", dirty" : ""; 1662 MDB_node *node; 1663 unsigned int i, nkeys, nsize, total = 0; 1664 MDB_val key; 1665 DKBUF; 1666 1667 switch (MP_FLAGS(mp) & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { 1668 case P_BRANCH: type = "Branch page"; break; 1669 case P_LEAF: type = "Leaf page"; break; 1670 case P_LEAF|P_SUBP: type = "Sub-page"; break; 1671 case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; 1672 case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; 1673 case P_OVERFLOW: 1674 fprintf(stderr, "Overflow page %"Z"u pages %u%s\n", 1675 pgno, mp->mp_pages, state); 1676 return; 1677 case P_META: 1678 fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n", 1679 pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); 1680 return; 1681 default: 1682 fprintf(stderr, "Bad page %"Z"u flags 0x%X\n", pgno, MP_FLAGS(mp)); 1683 return; 1684 } 1685 1686 nkeys = NUMKEYS(mp); 1687 fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state); 1688 1689 for (i=0; i<nkeys; i++) { 1690 if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ 1691 key.mv_size = nsize = mp->mp_pad; 1692 key.mv_data = LEAF2KEY(mp, i, nsize); 1693 total += nsize; 1694 fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); 1695 continue; 1696 } 1697 node = NODEPTR(mp, i); 1698 key.mv_size = node->mn_ksize; 1699 key.mv_data = node->mn_data; 1700 nsize = NODESIZE + key.mv_size; 1701 if (IS_BRANCH(mp)) { 1702 fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), 1703 DKEY(&key)); 1704 total += nsize; 1705 } else { 1706 if (F_ISSET(node->mn_flags, F_BIGDATA)) 1707 nsize += sizeof(pgno_t); 1708 else 1709 nsize += NODEDSZ(node); 1710 total += nsize; 1711 nsize += sizeof(indx_t); 1712 fprintf(stderr, "key %d: nsize %d, %s%s\n", 1713 i, nsize, DKEY(&key), mdb_leafnode_type(node)); 1714 } 1715 total = EVEN(total); 1716 } 1717 fprintf(stderr, "Total: header %d + contents %d + unused %d\n", 1718 IS_LEAF2(mp) ? 
PAGEHDRSZ : PAGEBASE + MP_LOWER(mp), total, SIZELEFT(mp)); 1719 } 1720 1721 void 1722 mdb_cursor_chk(MDB_cursor *mc) 1723 { 1724 unsigned int i; 1725 MDB_node *node; 1726 MDB_page *mp; 1727 1728 if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; 1729 for (i=0; i<mc->mc_top; i++) { 1730 mp = mc->mc_pg[i]; 1731 node = NODEPTR(mp, mc->mc_ki[i]); 1732 if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno) 1733 printf("oops!\n"); 1734 } 1735 if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) 1736 printf("ack!\n"); 1737 if (XCURSOR_INITED(mc)) { 1738 node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 1739 if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && 1740 mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { 1741 printf("blah!\n"); 1742 } 1743 } 1744 } 1745 #endif 1746 1747 #if (MDB_DEBUG) > 2 1748 /** Count all the pages in each DB and in the freelist 1749 * and make sure it matches the actual number of pages 1750 * being used. 1751 * All named DBs must be open for a correct count. 
1752 */ 1753 static void mdb_audit(MDB_txn *txn) 1754 { 1755 MDB_cursor mc; 1756 MDB_val key, data; 1757 MDB_ID freecount, count; 1758 MDB_dbi i; 1759 int rc; 1760 1761 freecount = 0; 1762 mdb_cursor_init(&mc, txn, FREE_DBI, NULL); 1763 while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) 1764 freecount += *(MDB_ID *)data.mv_data; 1765 mdb_tassert(txn, rc == MDB_NOTFOUND); 1766 1767 count = 0; 1768 for (i = 0; i<txn->mt_numdbs; i++) { 1769 MDB_xcursor mx; 1770 if (!(txn->mt_dbflags[i] & DB_VALID)) 1771 continue; 1772 mdb_cursor_init(&mc, txn, i, &mx); 1773 if (txn->mt_dbs[i].md_root == P_INVALID) 1774 continue; 1775 count += txn->mt_dbs[i].md_branch_pages + 1776 txn->mt_dbs[i].md_leaf_pages + 1777 txn->mt_dbs[i].md_overflow_pages; 1778 if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { 1779 rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST); 1780 for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) { 1781 unsigned j; 1782 MDB_page *mp; 1783 mp = mc.mc_pg[mc.mc_top]; 1784 for (j=0; j<NUMKEYS(mp); j++) { 1785 MDB_node *leaf = NODEPTR(mp, j); 1786 if (leaf->mn_flags & F_SUBDATA) { 1787 MDB_db db; 1788 memcpy(&db, NODEDATA(leaf), sizeof(db)); 1789 count += db.md_branch_pages + db.md_leaf_pages + 1790 db.md_overflow_pages; 1791 } 1792 } 1793 } 1794 mdb_tassert(txn, rc == MDB_NOTFOUND); 1795 } 1796 } 1797 if (freecount + count + NUM_METAS != txn->mt_next_pgno) { 1798 fprintf(stderr, "audit: %"Z"u freecount: %"Z"u count: %"Z"u total: %"Z"u next_pgno: %"Z"u\n", 1799 txn->mt_txnid, freecount, count+NUM_METAS, 1800 freecount+count+NUM_METAS, txn->mt_next_pgno); 1801 } 1802 } 1803 #endif 1804 1805 int 1806 mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) 1807 { 1808 return txn->mt_dbxs[dbi].md_cmp(a, b); 1809 } 1810 1811 int 1812 mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) 1813 { 1814 MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp; 1815 #if UINT_MAX < SIZE_MAX 1816 if (dcmp == mdb_cmp_int && a->mv_size == sizeof(size_t)) 
1817 dcmp = mdb_cmp_clong; 1818 #endif 1819 return dcmp(a, b); 1820 } 1821 1822 /** Allocate memory for a page. 1823 * Re-use old malloc'd pages first for singletons, otherwise just malloc. 1824 * Set #MDB_TXN_ERROR on failure. 1825 */ 1826 static MDB_page * 1827 mdb_page_malloc(MDB_txn *txn, unsigned num) 1828 { 1829 MDB_env *env = txn->mt_env; 1830 MDB_page *ret = env->me_dpages; 1831 size_t psize = env->me_psize, sz = psize, off; 1832 /* For ! #MDB_NOMEMINIT, psize counts how much to init. 1833 * For a single page alloc, we init everything after the page header. 1834 * For multi-page, we init the final page; if the caller needed that 1835 * many pages they will be filling in at least up to the last page. 1836 */ 1837 if (num == 1) { 1838 if (ret) { 1839 VGMEMP_ALLOC(env, ret, sz); 1840 VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); 1841 env->me_dpages = ret->mp_next; 1842 return ret; 1843 } 1844 psize -= off = PAGEHDRSZ; 1845 } else { 1846 sz *= num; 1847 off = sz - psize; 1848 } 1849 if ((ret = malloc(sz)) != NULL) { 1850 VGMEMP_ALLOC(env, ret, sz); 1851 if (!(env->me_flags & MDB_NOMEMINIT)) { 1852 memset((char *)ret + off, 0, psize); 1853 ret->mp_pad = 0; 1854 } 1855 } else { 1856 txn->mt_flags |= MDB_TXN_ERROR; 1857 } 1858 return ret; 1859 } 1860 /** Free a single page. 1861 * Saves single pages to a list, for future reuse. 1862 * (This is not used for multi-page overflow pages.) 
 * @param[in] env the environment owning the reuse list.
 * @param[in] mp the page to put on the list; its memory is not released.
 */
static void
mdb_page_free(MDB_env *env, MDB_page *mp)
{
	/* push mp onto the head of the me_dpages list */
	mp->mp_next = env->me_dpages;
	VGMEMP_FREE(env, mp);
	env->me_dpages = mp;
}

/** Free a dirty page.
 * Single pages (anything that is not an overflow page, and overflow
 * pages spanning exactly one page) go to the reuse list via
 * #mdb_page_free(); larger overflow allocations are released with free().
 * @param[in] env the environment owning the reuse list.
 * @param[in] dp the dirty page to release.
 */
static void
mdb_dpage_free(MDB_env *env, MDB_page *dp)
{
	if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
		mdb_page_free(env, dp);
	} else {
		/* large pages just get freed directly */
		VGMEMP_FREE(env, dp);
		free(dp);
	}
}

/**	Return all dirty pages to dpage list.
 * Releases every entry of the txn's dirty list through #mdb_dpage_free()
 * and resets the list length to zero.
 * @param[in] txn the transaction whose dirty list is emptied.
 */
static void
mdb_dlist_free(MDB_txn *txn)
{
	MDB_env *env = txn->mt_env;
	MDB_ID2L dl = txn->mt_u.dirty_list;
	/* dl[0].mid holds the number of entries; entries start at index 1 */
	unsigned i, n = dl[0].mid;

	for (i = 1; i <= n; i++) {
		mdb_dpage_free(env, dl[i].mptr);
	}
	dl[0].mid = 0;
}

/** Loosen or free a single page.
 * Saves single pages to a list for future reuse
 * in this same txn. It has been pulled from the freeDB
 * and already resides on the dirty list, but has been
 * deleted. Use these pages first before pulling again
 * from the freeDB.
 *
 * If the page wasn't dirtied in this txn, just add it
 * to this txn's free list.
 * @param[in] mc cursor identifying the transaction and database.
 * @param[in] mp the page being given up.
 * @return #MDB_SUCCESS, #MDB_CORRUPTED when the dirty list holds a
 * different page under mp's page number (the cursor is then
 * de-initialized and the txn flagged #MDB_TXN_ERROR), or the error
 * returned by mdb_midl_append().
 */
static int
mdb_page_loose(MDB_cursor *mc, MDB_page *mp)
{
	int loose = 0;
	pgno_t pgno = mp->mp_pgno;
	MDB_txn *txn = mc->mc_txn;

	/* Only dirty pages outside the freeDB are candidates for loosening */
	if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) {
		if (txn->mt_parent) {
			MDB_ID2 *dl = txn->mt_u.dirty_list;
			/* If txn has a parent, make sure the page is in our
			 * dirty list.
			 */
			if (dl[0].mid) {
				unsigned x = mdb_mid2l_search(dl, pgno);
				if (x <= dl[0].mid && dl[x].mid == pgno) {
					if (mp != dl[x].mptr) { /* bad cursor? */
						mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
						txn->mt_flags |= MDB_TXN_ERROR;
						return MDB_CORRUPTED;
					}
					/* ok, it's ours */
					loose = 1;
				}
			}
		} else {
			/* no parent txn, so it's just ours */
			loose = 1;
		}
	}
	if (loose) {
		DPRINTF(("loosen db %d page %"Z"u", DDBI(mc),
			mp->mp_pgno));
		/* push mp onto the head of the txn's loose-page list */
		NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs;
		txn->mt_loose_pgs = mp;
		txn->mt_loose_count++;
		mp->mp_flags |= P_LOOSE;
	} else {
		int rc = mdb_midl_append(&txn->mt_free_pgs, pgno);
		if (rc)
			return rc;
	}

	return MDB_SUCCESS;
}

/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
 * @param[in] mc A cursor handle for the current operation.
 * @param[in] pflags Flags of the pages to update:
 * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
 * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush().
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
{
	/* A page is toggled only when exactly the bits in pflags are set
	 * among these four flags.
	 */
	enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP };
	MDB_txn *txn = mc->mc_txn;
	MDB_cursor *m3, *m0 = mc;
	MDB_xcursor *mx;
	MDB_page *dp, *mp;
	MDB_node *leaf;
	unsigned i, j;
	int rc = MDB_SUCCESS, level;

	/* Mark pages seen by cursors */
	if (mc->mc_flags & C_UNTRACK)
		mc = NULL;				/* will find mc in mt_cursors */
	/* The first pass handles the chain starting at mc (if it was not
	 * cleared above); the following passes handle the tracked cursor
	 * chains mt_cursors[mt_numdbs-1] down to mt_cursors[0].
	 */
	for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) {
		for (; mc; mc=mc->mc_next) {
			if (!(mc->mc_flags & C_INITIALIZED))
				continue;
			/* m3 is the cursor itself first, then its sub-cursor(s) */
			for (m3 = mc;; m3 = &mx->mx_cursor) {
				mp = NULL;
				for (j=0; j<m3->mc_snum; j++) {
					mp = m3->mc_pg[j];
					if ((mp->mp_flags & Mask) == pflags)
						mp->mp_flags ^= P_KEEP;
				}
				/* mp is now the top page of m3's stack, or NULL if empty */
				mx = m3->mc_xcursor;
				/* Proceed to mx if it is at a sub-database */
				if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
					break;
				if (! (mp && (mp->mp_flags & P_LEAF)))
					break;
				leaf = NODEPTR(mp, m3->mc_ki[j-1]);
				if (!(leaf->mn_flags & F_SUBDATA))
					break;
			}
		}
		if (i == 0)
			break;
	}

	if (all) {
		/* Mark dirty root pages */
		for (i=0; i<txn->mt_numdbs; i++) {
			if (txn->mt_dbflags[i] & DB_DIRTY) {
				pgno_t pgno = txn->mt_dbs[i].md_root;
				if (pgno == P_INVALID)
					continue;
				if ((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS)
					break;
				/* NOTE(review): level is an output of mdb_page_get();
				 * presumably it tells in which txn's dirty list the page
				 * was found - confirm there why only level <= 1 counts.
				 */
				if ((dp->mp_flags & Mask) == pflags && level <= 1)
					dp->mp_flags ^= P_KEEP;
			}
		}
	}

	return rc;
}

static int mdb_page_flush(MDB_txn *txn, int keep);

/**	Spill pages from the dirty list back to disk.
 * This is intended to prevent running into #MDB_TXN_FULL situations,
 * but note that they may still occur in a few cases:
 * 1) our estimate of the txn size could be too small. Currently this
 *	seems unlikely, except with a large number of #MDB_MULTIPLE items.
 * 2) child txns may run out of space if their parents dirtied a
 *	lot of pages and never spilled them. TODO: we probably should do
 *	a preemptive spill during #mdb_txn_begin() of a child txn, if
 *	the parent's dirty_room is below a given threshold.
 *
 * Otherwise, if not using nested txns, it is expected that apps will
 * not run into #MDB_TXN_FULL any more. The pages are flushed to disk
 * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared.
 * If the txn never references them again, they can be left alone.
 * If the txn only reads them, they can be used without any fuss.
 * If the txn writes them again, they can be dirtied immediately without
 * going thru all of the work of #mdb_page_touch(). Such references are
 * handled by #mdb_page_unspill().
 *
 * Also note, we never spill DB root pages, nor pages of active cursors,
 * because we'll need these back again soon anyway. And in nested txns,
 * we can't spill a page in a child txn if it was already spilled in a
 * parent txn. That would alter the parent txns' data even though
 * the child hasn't committed yet, and we'd have no way to undo it if
 * the child aborted.
 *
 * On return the txn is flagged #MDB_TXN_SPILLS after a successful spill,
 * or #MDB_TXN_ERROR after a failure past the spill-list setup. The early
 * returns (sub-cursor, enough dirty room, spill list allocation failure)
 * leave the txn flags untouched.
 *
 * @param[in] m0 cursor A cursor handle identifying the transaction and
 *	database for which we are checking space.
 * @param[in] key For a put operation, the key being stored.
 * @param[in] data For a put operation, the data being stored.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
{
	MDB_txn *txn = m0->mc_txn;
	MDB_page *dp;
	MDB_ID2L dl = txn->mt_u.dirty_list;
	unsigned int i, j, need;
	int rc;

	if (m0->mc_flags & C_SUB)
		return MDB_SUCCESS;

	/* Estimate how much space this op will take */
	i = m0->mc_db->md_depth;
	/* Named DBs also dirty the main DB */
	if (m0->mc_dbi >= CORE_DBS)
		i += txn->mt_dbs[MAIN_DBI].md_depth;
	/* For puts, roughly factor in the key+data size */
	if (key)
		i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize;
	i += i;	/* double it for good measure */
	need = i;

	/* Enough room left in the dirty list: nothing to do */
	if (txn->mt_dirty_room > i)
		return MDB_SUCCESS;

	if (!txn->mt_spill_pgs) {
		txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX);
		if (!txn->mt_spill_pgs)
			return ENOMEM;
	} else {
		/* purge deleted slots */
		/* (entries with the low bit set; compact the rest in place) */
		MDB_IDL sl = txn->mt_spill_pgs;
		unsigned int num = sl[0];
		j=0;
		for (i=1; i<=num; i++) {
			if (!(sl[i] & 1))
				sl[++j] = sl[i];
		}
		sl[0] = j;
	}

	/* Preserve pages which may soon be dirtied again */
	if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS)
		goto done;

	/* Less aggressive spill - we originally spilled the entire dirty list,
	 * with a few exceptions for cursor pages and DB root pages. But this
	 * turns out to be a lot of wasted effort because in a large txn many
	 * of those pages will need to be used again. So now we spill only 1/8th
	 * of the dirty pages. Testing revealed this to be a good tradeoff,
	 * better than 1/2, 1/4, or 1/10.
	 */
	if (need < MDB_IDL_UM_MAX / 8)
		need = MDB_IDL_UM_MAX / 8;

	/* Save the page IDs of all the pages we're flushing */
	/* flush from the tail forward, this saves a lot of shifting later on. */
	for (i=dl[0].mid; i && need; i--) {
		/* Spill-list entries are page numbers shifted left by one,
		 * which frees the low bit for the "deleted" mark purged above.
		 */
		MDB_ID pn = dl[i].mid << 1;
		dp = dl[i].mptr;
		if (dp->mp_flags & (P_LOOSE|P_KEEP))
			continue;
		/* Can't spill twice, make sure it's not already in a parent's
		 * spill list.
		 */
		if (txn->mt_parent) {
			MDB_txn *tx2;
			for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
				if (tx2->mt_spill_pgs) {
					j = mdb_midl_search(tx2->mt_spill_pgs, pn);
					if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) {
						dp->mp_flags |= P_KEEP;
						break;
					}
				}
			}
			/* tx2 is non-NULL only when the search above hit */
			if (tx2)
				continue;
		}
		if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn)))
			goto done;
		need--;
	}
	mdb_midl_sort(txn->mt_spill_pgs);

	/* Flush the spilled part of dirty list */
	/* i is the dirty-list index at which the scan above stopped.
	 * NOTE(review): presumably mdb_page_flush() leaves entries 1..i in
	 * the dirty list - confirm against its definition.
	 */
	if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS)
		goto done;

	/* Reset any dirty pages we kept that page_flush didn't see */
	rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);

done:
	txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
	return rc;
}

/** Find oldest txnid still referenced. Expects txn->mt_txnid > 0.
*/ 2154 static txnid_t 2155 mdb_find_oldest(MDB_txn *txn) 2156 { 2157 int i; 2158 txnid_t mr, oldest = txn->mt_txnid - 1; 2159 if (txn->mt_env->me_txns) { 2160 MDB_reader *r = txn->mt_env->me_txns->mti_readers; 2161 for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { 2162 if (r[i].mr_pid) { 2163 mr = r[i].mr_txnid; 2164 if (oldest > mr) 2165 oldest = mr; 2166 } 2167 } 2168 } 2169 return oldest; 2170 } 2171 2172 /** Add a page to the txn's dirty list */ 2173 static void 2174 mdb_page_dirty(MDB_txn *txn, MDB_page *mp) 2175 { 2176 MDB_ID2 mid; 2177 int rc, (*insert)(MDB_ID2L, MDB_ID2 *); 2178 2179 if (txn->mt_flags & MDB_TXN_WRITEMAP) { 2180 insert = mdb_mid2l_append; 2181 } else { 2182 insert = mdb_mid2l_insert; 2183 } 2184 mid.mid = mp->mp_pgno; 2185 mid.mptr = mp; 2186 rc = insert(txn->mt_u.dirty_list, &mid); 2187 mdb_tassert(txn, rc == 0); 2188 txn->mt_dirty_room--; 2189 } 2190 2191 /** Allocate page numbers and memory for writing. Maintain me_pglast, 2192 * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. 2193 * 2194 * If there are free pages available from older transactions, they 2195 * are re-used first. Otherwise allocate a new page at mt_next_pgno. 2196 * Do not modify the freedB, just merge freeDB records into me_pghead[] 2197 * and move me_pglast to say which records were consumed. Only this 2198 * function can create me_pghead and move me_pglast/mt_next_pgno. 2199 * @param[in] mc cursor A cursor handle identifying the transaction and 2200 * database for which we are allocating. 2201 * @param[in] num the number of pages to allocate. 2202 * @param[out] mp Address of the allocated page(s). Requests for multiple pages 2203 * will always be satisfied by a single contiguous chunk of memory. 2204 * @return 0 on success, non-zero on failure. 
 */
static int
mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
{
#ifdef MDB_PARANOID	/* Seems like we can ignore this now */
	/* Get at most <Max_retries> more freeDB records once me_pghead
	 * has enough pages.  If not enough, use new pages from the map.
	 * If <Paranoid> and mc is updating the freeDB, only get new
	 * records if me_pghead is empty. Then the freelist cannot play
	 * catch-up with itself by growing while trying to save it.
	 */
	enum { Paranoid = 1, Max_retries = 500 };
#else
	enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ };
#endif
	/* NOTE(review): the retry budget below is num * 60; Max_retries is
	 * not referenced anywhere in this function.
	 */
	int rc, retry = num * 60;
	MDB_txn *txn = mc->mc_txn;
	MDB_env *env = txn->mt_env;
	/* mop is the env's reclaimed-page list, mop[0] its length;
	 * n2 is the distance between first and last page of the wanted run.
	 */
	pgno_t pgno, *mop = env->me_pghead;
	unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1;
	MDB_page *np;
	txnid_t oldest = 0, last;
	MDB_cursor_op op;
	MDB_cursor m2;
	int found_old = 0;

	/* If there are any loose pages, just use them */
	if (num == 1 && txn->mt_loose_pgs) {
		np = txn->mt_loose_pgs;
		txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np);
		txn->mt_loose_count--;
		DPRINTF(("db %d use loose page %"Z"u", DDBI(mc),
				np->mp_pgno));
		*mp = np;
		return MDB_SUCCESS;
	}

	*mp = NULL;

	/* If our dirty list is already full, we can't do anything */
	if (txn->mt_dirty_room == 0) {
		rc = MDB_TXN_FULL;
		goto fail;
	}

	/* Each pass first looks for a fitting run in mop, then pulls one
	 * more freeDB record into mop. The loop is left with "break" when
	 * no more records may be used, which falls through to allocating
	 * fresh pages at the end of the map.
	 */
	for (op = MDB_FIRST;; op = MDB_NEXT) {
		MDB_val key, data;
		MDB_node *leaf;
		pgno_t *idl;

		/* Seek a big enough contiguous page range. Prefer
		 * pages at the tail, just truncating the list.
		 */
		if (mop_len > n2) {
			i = mop_len;
			do {
				/* The list is kept in descending order (see the merge
				 * below), so a run of num consecutive pages ending at
				 * index i has mop[i-n2] == mop[i] + n2.
				 */
				pgno = mop[i];
				if (mop[i-n2] == pgno+n2)
					goto search_done;
			} while (--i > n2);
			if (--retry < 0)
				break;
		}

		if (op == MDB_FIRST) {	/* 1st iteration */
			/* Prepare to fetch more and coalesce */
			last = env->me_pglast;
			oldest = env->me_pgoldest;
			mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
			if (last) {
				op = MDB_SET_RANGE;
				key.mv_data = &last; /* will look up last+1 */
				key.mv_size = sizeof(last);
			}
			if (Paranoid && mc->mc_dbi == FREE_DBI)
				retry = -1;
		}
		if (Paranoid && retry < 0 && mop_len)
			break;

		last++;
		/* Do not fetch more if the record will be too recent */
		if (oldest <= last) {
			/* The cached me_pgoldest may be stale: recompute it once
			 * per call before giving up.
			 */
			if (!found_old) {
				oldest = mdb_find_oldest(txn);
				env->me_pgoldest = oldest;
				found_old = 1;
			}
			if (oldest <= last)
				break;
		}
		rc = mdb_cursor_get(&m2, &key, NULL, op);
		if (rc) {
			if (rc == MDB_NOTFOUND)
				break;
			goto fail;
		}
		/* The freeDB key is the txnid of the record; check it again,
		 * since the record found may be newer than last+1.
		 */
		last = *(txnid_t*)key.mv_data;
		if (oldest <= last) {
			if (!found_old) {
				oldest = mdb_find_oldest(txn);
				env->me_pgoldest = oldest;
				found_old = 1;
			}
			if (oldest <= last)
				break;
		}
		np = m2.mc_pg[m2.mc_top];
		leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
		if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS)
			goto fail;

		/* The record is an ID list: idl[0] is the entry count */
		idl = (MDB_ID *) data.mv_data;
		i = idl[0];
		if (!mop) {
			if (!(env->me_pghead = mop = mdb_midl_alloc(i))) {
				rc = ENOMEM;
				goto fail;
			}
		} else {
			if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0)
				goto fail;
			/* mdb_midl_need() is handed the address of me_pghead,
			 * so reload the possibly changed pointer.
			 */
			mop = env->me_pghead;
		}
		env->me_pglast = last;
#if (MDB_DEBUG) > 1
		DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u",
			last, txn->mt_dbs[FREE_DBI].md_root, i));
		for (j = i; j; j--)
			DPRINTF(("IDL %"Z"u", idl[j]));
#endif
		/* Merge in descending sorted order */
		mdb_midl_xmerge(mop, idl);
		mop_len = mop[0];
	}

	/* Use new pages from the map when nothing suitable in the freeDB */
	/* i == 0 tells the code after search_done that pgno is a new page */
	i = 0;
	pgno = txn->mt_next_pgno;
	if (pgno + num >= env->me_maxpg) {
			DPUTS("DB size maxed out");
			rc = MDB_MAP_FULL;
			goto fail;
	}

search_done:
	if (env->me_flags & MDB_WRITEMAP) {
		/* the page lives directly in the memory map */
		np = (MDB_page *)(env->me_map + env->me_psize * pgno);
	} else {
		if (!(np = mdb_page_malloc(txn, num))) {
			rc = ENOMEM;
			goto fail;
		}
	}
	if (i) {
		/* Pages came from mop: drop the num entries ending at index i */
		mop[0] = mop_len -= num;
		/* Move any stragglers down */
		for (j = i-num; j < mop_len; )
			mop[++j] = mop[++i];
	} else {
		txn->mt_next_pgno = pgno + num;
	}
	np->mp_pgno = pgno;
	mdb_page_dirty(txn, np);
	*mp = np;

	return MDB_SUCCESS;

fail:
	txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}

/** Copy the used portions of a non-overflow page.
 * @param[in] dst page to copy into
 * @param[in] src page to copy from
 * @param[in] psize size of a page
 */
static void
mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
{
	enum { Align = sizeof(pgno_t) };
	indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower;

	/* If page isn't full, just copy the used portion. Adjust
	 * alignment so memcpy may copy words instead of bytes.
	 */
	/* "x & -Align" rounds x down to a multiple of Align. A gap smaller
	 * than Align rounds to 0 and takes the else branch, which then
	 * copies the whole page.
	 */
	if ((unused &= -Align) && !IS_LEAF2(src)) {
		upper = (upper + PAGEBASE) & -Align;
		/* first the header and lower part, rounded up to Align ... */
		memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align);
		/* ... then everything from the rounded-down upper offset on */
		memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper),
			psize - upper);
	} else {
		memcpy(dst, src, psize - unused);
	}
}

/** Pull a page off the txn's spill list, if present.
 * If a page being referenced was spilled to disk in this txn, bring
 * it back and make it dirty/writable again.
2405 * @param[in] txn the transaction handle. 2406 * @param[in] mp the page being referenced. It must not be dirty. 2407 * @param[out] ret the writable page, if any. ret is unchanged if 2408 * mp wasn't spilled. 2409 */ 2410 static int 2411 mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) 2412 { 2413 MDB_env *env = txn->mt_env; 2414 const MDB_txn *tx2; 2415 unsigned x; 2416 pgno_t pgno = mp->mp_pgno, pn = pgno << 1; 2417 2418 for (tx2 = txn; tx2; tx2=tx2->mt_parent) { 2419 if (!tx2->mt_spill_pgs) 2420 continue; 2421 x = mdb_midl_search(tx2->mt_spill_pgs, pn); 2422 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { 2423 MDB_page *np; 2424 int num; 2425 if (txn->mt_dirty_room == 0) 2426 return MDB_TXN_FULL; 2427 if (IS_OVERFLOW(mp)) 2428 num = mp->mp_pages; 2429 else 2430 num = 1; 2431 if (env->me_flags & MDB_WRITEMAP) { 2432 np = mp; 2433 } else { 2434 np = mdb_page_malloc(txn, num); 2435 if (!np) 2436 return ENOMEM; 2437 if (num > 1) 2438 memcpy(np, mp, num * env->me_psize); 2439 else 2440 mdb_page_copy(np, mp, env->me_psize); 2441 } 2442 if (tx2 == txn) { 2443 /* If in current txn, this page is no longer spilled. 2444 * If it happens to be the last page, truncate the spill list. 2445 * Otherwise mark it as deleted by setting the LSB. 2446 */ 2447 if (x == txn->mt_spill_pgs[0]) 2448 txn->mt_spill_pgs[0]--; 2449 else 2450 txn->mt_spill_pgs[x] |= 1; 2451 } /* otherwise, if belonging to a parent txn, the 2452 * page remains spilled until child commits 2453 */ 2454 2455 mdb_page_dirty(txn, np); 2456 np->mp_flags |= P_DIRTY; 2457 *ret = np; 2458 break; 2459 } 2460 } 2461 return MDB_SUCCESS; 2462 } 2463 2464 /** Touch a page: make it dirty and re-insert into tree with updated pgno. 2465 * Set #MDB_TXN_ERROR on failure. 2466 * @param[in] mc cursor pointing to the page to be touched 2467 * @return 0 on success, non-zero on failure. 
 */
static int
mdb_page_touch(MDB_cursor *mc)
{
	MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
	MDB_txn *txn = mc->mc_txn;
	MDB_cursor *m2, *m3;
	pgno_t	pgno;
	int rc;

	if (!F_ISSET(MP_FLAGS(mp), P_DIRTY)) {
		/* Case 1: the page is clean. Either bring it back from the
		 * spill list, or copy it to a newly allocated page number.
		 */
		if (txn->mt_flags & MDB_TXN_SPILLS) {
			np = NULL;
			rc = mdb_page_unspill(txn, mp, &np);
			if (rc)
				goto fail;
			/* np is set only when mp really was spilled; the
			 * unspilled copy is complete, just fix up the cursors.
			 */
			if (np)
				goto done;
		}
		/* Reserve the free-list slot first so that the xappend
		 * below cannot fail after the new page has been allocated.
		 */
		if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) ||
			(rc = mdb_page_alloc(mc, 1, &np)))
			goto fail;
		pgno = np->mp_pgno;
		DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
			mp->mp_pgno, pgno));
		mdb_cassert(mc, mp->mp_pgno != pgno);
		/* the old page number becomes free in this txn */
		mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
		/* Update the parent page, if any, to point to the new page */
		if (mc->mc_top) {
			MDB_page *parent = mc->mc_pg[mc->mc_top-1];
			MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]);
			SETPGNO(node, pgno);
		} else {
			mc->mc_db->md_root = pgno;
		}
	} else if (txn->mt_parent && !IS_SUBP(mp)) {
		/* Case 2: the page is dirty and this is a nested txn. It may
		 * have been dirtied by an ancestor; then it needs a private
		 * copy under the same page number.
		 */
		MDB_ID2 mid, *dl = txn->mt_u.dirty_list;
		pgno = mp->mp_pgno;
		/* If txn has a parent, make sure the page is in our
		 * dirty list.
		 */
		if (dl[0].mid) {
			unsigned x = mdb_mid2l_search(dl, pgno);
			if (x <= dl[0].mid && dl[x].mid == pgno) {
				if (mp != dl[x].mptr) { /* bad cursor? */
					mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
					txn->mt_flags |= MDB_TXN_ERROR;
					return MDB_CORRUPTED;
				}
				return 0;
			}
		}
		mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX);
		/* No - copy it */
		np = mdb_page_malloc(txn, 1);
		/* This return bypasses the fail label; mdb_page_malloc()
		 * has already set MDB_TXN_ERROR when it returns NULL.
		 */
		if (!np)
			return ENOMEM;
		mid.mid = pgno;
		mid.mptr = np;
		rc = mdb_mid2l_insert(dl, &mid);
		mdb_cassert(mc, rc == 0);
	} else {
		/* Case 3: already dirty and writable as it is */
		return 0;
	}

	mdb_page_copy(np, mp, txn->mt_env->me_psize);
	np->mp_pgno = pgno;
	np->mp_flags |= P_DIRTY;

done:
	/* Adjust cursors pointing to mp */
	mc->mc_pg[mc->mc_top] = np;
	m2 = txn->mt_cursors[mc->mc_dbi];
	if (mc->mc_flags & C_SUB) {
		/* mc is a sub-cursor: compare against the sub-cursors of the
		 * tracked cursors.
		 */
		for (; m2; m2=m2->mc_next) {
			m3 = &m2->mc_xcursor->mx_cursor;
			if (m3->mc_snum < mc->mc_snum) continue;
			if (m3->mc_pg[mc->mc_top] == mp)
				m3->mc_pg[mc->mc_top] = np;
		}
	} else {
		for (; m2; m2=m2->mc_next) {
			if (m2->mc_snum < mc->mc_snum) continue;
			if (m2 == mc) continue;
			if (m2->mc_pg[mc->mc_top] == mp) {
				m2->mc_pg[mc->mc_top] = np;
				if (IS_LEAF(np))
					XCURSOR_REFRESH(m2, mc->mc_top, np);
			}
		}
	}
	return 0;

fail:
	txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}

/** Flush the environment's data to disk.
 * Nothing is flushed when #MDB_NOSYNC is set in the environment and
 * \b force is zero. With #MDB_WRITEMAP the memory map is synced through
 * MDB_MSYNC, asynchronously only if #MDB_MAPASYNC is set and \b force is
 * zero; otherwise the data file is synced through MDB_FDATASYNC (or
 * fsync() when BROKEN_FDATASYNC is defined and MDB_FSYNCONLY is set).
 * @param[in] env the environment handle.
 * @param[in] force non-zero to sync regardless of the no-sync flags.
 * @return 0 on success, EACCES for a read-only environment, or the
 * error code reported by ErrCode() when a sync call fails.
 */
int
mdb_env_sync(MDB_env *env, int force)
{
	int rc = 0;
	if (env->me_flags & MDB_RDONLY)
		return EACCES;
	if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
		if (env->me_flags & MDB_WRITEMAP) {
			int flags = ((env->me_flags & MDB_MAPASYNC) && !force)
				? MS_ASYNC : MS_SYNC;
			if (MDB_MSYNC(env->me_map, env->me_mapsize, flags))
				rc = ErrCode();
#ifdef _WIN32
			/* on Windows a synchronous flush also syncs the file handle */
			else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd))
				rc = ErrCode();
#endif
		} else {
#ifdef BROKEN_FDATASYNC
			if (env->me_flags & MDB_FSYNCONLY) {
				if (fsync(env->me_fd))
					rc = ErrCode();
			} else
#endif
			if (MDB_FDATASYNC(env->me_fd))
				rc = ErrCode();
		}
	}
	return rc;
}

/** Back up parent txn's cursors, then grab the originals for tracking.
 * Every tracked cursor of \b src gets a malloc'd backup copy (hung off
 * its mc_backup), is re-pointed at \b dst and moved onto dst's cursor
 * list for the same DB.
 * @param[in] src the parent transaction.
 * @param[in] dst the nested transaction taking over the cursors.
 * @return #MDB_SUCCESS, or ENOMEM when a backup cannot be allocated.
 * NOTE(review): on ENOMEM the cursors handled so far keep their backups
 * and stay on dst's lists; undoing that is presumably left to the
 * caller - confirm.
 */
static int
mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
{
	MDB_cursor *mc, *bk;
	MDB_xcursor *mx;
	size_t size;
	int i;

	for (i = src->mt_numdbs; --i >= 0; ) {
		if ((mc = src->mt_cursors[i]) != NULL) {
			/* One allocation holds the cursor backup and, when the
			 * first cursor of the chain has a sub-cursor, the
			 * sub-cursor backup directly behind it.
			 */
			size = sizeof(MDB_cursor);
			if (mc->mc_xcursor)
				size += sizeof(MDB_xcursor);
			/* bk->mc_next still holds the cursor's original successor */
			for (; mc; mc = bk->mc_next) {
				bk = malloc(size);
				if (!bk)
					return ENOMEM;
				*bk = *mc;
				mc->mc_backup = bk;
				mc->mc_db = &dst->mt_dbs[i];
				/* Kill pointers into src to reduce abuse: The
				 * user may not use mc until dst ends. But we need a valid
				 * txn pointer here for cursor fixups to keep working.
				 */
				mc->mc_txn    = dst;
				mc->mc_dbflag = &dst->mt_dbflags[i];
				if ((mx = mc->mc_xcursor) != NULL) {
					*(MDB_xcursor *)(bk+1) = *mx;
					mx->mx_cursor.mc_txn = dst;
				}
				/* push the original cursor onto dst's list */
				mc->mc_next = dst->mt_cursors[i];
				dst->mt_cursors[i] = mc;
			}
		}
	}
	return MDB_SUCCESS;
}

/** Close this write txn's cursors, give parent txn's cursors back to parent.
 * @param[in] txn the transaction handle.
 * @param[in] merge true to keep changes to parent cursors, false to revert.
2639 */ 2640 static void 2641 mdb_cursors_close(MDB_txn *txn, unsigned merge) 2642 { 2643 MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; 2644 MDB_xcursor *mx; 2645 int i; 2646 2647 for (i = txn->mt_numdbs; --i >= 0; ) { 2648 for (mc = cursors[i]; mc; mc = next) { 2649 next = mc->mc_next; 2650 if ((bk = mc->mc_backup) != NULL) { 2651 if (merge) { 2652 /* Commit changes to parent txn */ 2653 mc->mc_next = bk->mc_next; 2654 mc->mc_backup = bk->mc_backup; 2655 mc->mc_txn = bk->mc_txn; 2656 mc->mc_db = bk->mc_db; 2657 mc->mc_dbflag = bk->mc_dbflag; 2658 if ((mx = mc->mc_xcursor) != NULL) 2659 mx->mx_cursor.mc_txn = bk->mc_txn; 2660 } else { 2661 /* Abort nested txn */ 2662 *mc = *bk; 2663 if ((mx = mc->mc_xcursor) != NULL) 2664 *mx = *(MDB_xcursor *)(bk+1); 2665 } 2666 mc = bk; 2667 } 2668 /* Only malloced cursors are permanently tracked. */ 2669 free(mc); 2670 } 2671 cursors[i] = NULL; 2672 } 2673 } 2674 2675 #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ 2676 enum Pidlock_op { 2677 Pidset, Pidcheck 2678 }; 2679 #else 2680 enum Pidlock_op { 2681 Pidset = F_SETLK, Pidcheck = F_GETLK 2682 }; 2683 #endif 2684 2685 /** Set or check a pid lock. Set returns 0 on success. 2686 * Check returns 0 if the process is certainly dead, nonzero if it may 2687 * be alive (the lock exists or an error happened so we do not know). 2688 * 2689 * On Windows Pidset is a no-op, we merely check for the existence 2690 * of the process with the given pid. On POSIX we use a single byte 2691 * lock on the lockfile, set at an offset equal to the pid. 
 * @param[in] env the environment handle (supplies the lockfile
 * descriptor, or on Windows the process access rights to request).
 * @param[in] op #Pidset or #Pidcheck.
 * @param[in] pid the process ID to lock or to look up.
 * @return see above; on POSIX a failing fcntl() yields the ErrCode() value.
 */
static int
mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
{
#if !(MDB_PIDLOCK)		/* Currently the same as defined(_WIN32) */
	int ret = 0;
	HANDLE h;
	if (op == Pidcheck) {
		h = OpenProcess(env->me_pidquery, FALSE, pid);
		/* No documented "no such process" code, but other programs use this: */
		if (!h)
			return ErrCode() != ERROR_INVALID_PARAMETER;
		/* A process exists until all handles to it close. Has it exited? */
		/* A zero wait result means the handle is signaled, which is
		 * reported as "dead" (0); anything else as "may be alive".
		 */
		ret = WaitForSingleObject(h, 0) != 0;
		CloseHandle(h);
	}
	return ret;
#else
	/* Loop only to restart the fcntl() call after EINTR */
	for (;;) {
		int rc;
		struct flock lock_info;
		memset(&lock_info, 0, sizeof(lock_info));
		/* a write lock on the single byte at offset pid */
		lock_info.l_type = F_WRLCK;
		lock_info.l_whence = SEEK_SET;
		lock_info.l_start = pid;
		lock_info.l_len = 1;
		if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) {
			/* F_GETLK leaves l_type at F_UNLCK only when nothing
			 * would block the lock, i.e. no process holds it.
			 */
			if (op == F_GETLK && lock_info.l_type != F_UNLCK)
				rc = -1;
		} else if ((rc = ErrCode()) == EINTR) {
			continue;
		}
		return rc;
	}
#endif
}

/** Common code for #mdb_txn_begin() and #mdb_txn_renew().
 * @param[in] txn the transaction handle to initialize
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_txn_renew0(MDB_txn *txn)
{
	MDB_env *env = txn->mt_env;
	MDB_txninfo *ti = env->me_txns;
	MDB_meta *meta;
	unsigned int i, nr, flags = txn->mt_flags;
	uint16_t x;
	int rc, new_notls = 0;

	if ((flags &= MDB_TXN_RDONLY) != 0) {
		if (!ti) {
			meta = mdb_env_pick_meta(env);
			txn->mt_txnid = meta->mm_txnid;
			txn->mt_u.reader = NULL;
		} else {
			MDB_reader *r = (env->me_flags & MDB_NOTLS) ?
txn->mt_u.reader : 2750 pthread_getspecific(env->me_txkey); 2751 if (r) { 2752 if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) 2753 return MDB_BAD_RSLOT; 2754 } else { 2755 MDB_PID_T pid = env->me_pid; 2756 MDB_THR_T tid = pthread_self(); 2757 mdb_mutexref_t rmutex = env->me_rmutex; 2758 2759 if (!env->me_live_reader) { 2760 rc = mdb_reader_pid(env, Pidset, pid); 2761 if (rc) 2762 return rc; 2763 env->me_live_reader = 1; 2764 } 2765 2766 if (LOCK_MUTEX(rc, env, rmutex)) 2767 return rc; 2768 nr = ti->mti_numreaders; 2769 for (i=0; i<nr; i++) 2770 if (ti->mti_readers[i].mr_pid == 0) 2771 break; 2772 if (i == env->me_maxreaders) { 2773 UNLOCK_MUTEX(rmutex); 2774 return MDB_READERS_FULL; 2775 } 2776 r = &ti->mti_readers[i]; 2777 /* Claim the reader slot, carefully since other code 2778 * uses the reader table un-mutexed: First reset the 2779 * slot, next publish it in mti_numreaders. After 2780 * that, it is safe for mdb_env_close() to touch it. 2781 * When it will be closed, we can finally claim it. 2782 */ 2783 r->mr_pid = 0; 2784 r->mr_txnid = (txnid_t)-1; 2785 r->mr_tid = tid; 2786 if (i == nr) 2787 ti->mti_numreaders = ++nr; 2788 env->me_close_readers = nr; 2789 r->mr_pid = pid; 2790 UNLOCK_MUTEX(rmutex); 2791 2792 new_notls = (env->me_flags & MDB_NOTLS); 2793 if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { 2794 r->mr_pid = 0; 2795 return rc; 2796 } 2797 } 2798 do /* LY: Retry on a race, ITS#7970. 
*/ 2799 r->mr_txnid = ti->mti_txnid; 2800 while(r->mr_txnid != ti->mti_txnid); 2801 if (!r->mr_txnid && (env->me_flags & MDB_RDONLY)) { 2802 meta = mdb_env_pick_meta(env); 2803 r->mr_txnid = meta->mm_txnid; 2804 } else { 2805 meta = env->me_metas[r->mr_txnid & 1]; 2806 } 2807 txn->mt_txnid = r->mr_txnid; 2808 txn->mt_u.reader = r; 2809 } 2810 2811 } else { 2812 /* Not yet touching txn == env->me_txn0, it may be active */ 2813 if (ti) { 2814 if (LOCK_MUTEX(rc, env, env->me_wmutex)) 2815 return rc; 2816 txn->mt_txnid = ti->mti_txnid; 2817 meta = env->me_metas[txn->mt_txnid & 1]; 2818 } else { 2819 meta = mdb_env_pick_meta(env); 2820 txn->mt_txnid = meta->mm_txnid; 2821 } 2822 txn->mt_txnid++; 2823 #if MDB_DEBUG 2824 if (txn->mt_txnid == mdb_debug_start) 2825 mdb_debug = MDB_DBG_INFO; 2826 #endif 2827 txn->mt_child = NULL; 2828 txn->mt_loose_pgs = NULL; 2829 txn->mt_loose_count = 0; 2830 txn->mt_dirty_room = MDB_IDL_UM_MAX; 2831 txn->mt_u.dirty_list = env->me_dirty_list; 2832 txn->mt_u.dirty_list[0].mid = 0; 2833 txn->mt_free_pgs = env->me_free_pgs; 2834 txn->mt_free_pgs[0] = 0; 2835 txn->mt_spill_pgs = NULL; 2836 env->me_txn = txn; 2837 memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int)); 2838 } 2839 2840 /* Copy the DB info and flags */ 2841 memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); 2842 2843 /* Moved to here to avoid a data race in read TXNs */ 2844 txn->mt_next_pgno = meta->mm_last_pg+1; 2845 2846 txn->mt_flags = flags; 2847 2848 /* Setup db info */ 2849 txn->mt_numdbs = env->me_numdbs; 2850 for (i=CORE_DBS; i<txn->mt_numdbs; i++) { 2851 x = env->me_dbflags[i]; 2852 txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; 2853 txn->mt_dbflags[i] = (x & MDB_VALID) ? 
DB_VALID|DB_USRVALID|DB_STALE : 0; 2854 } 2855 txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID; 2856 txn->mt_dbflags[FREE_DBI] = DB_VALID; 2857 2858 if (env->me_flags & MDB_FATAL_ERROR) { 2859 DPUTS("environment had fatal error, must shutdown!"); 2860 rc = MDB_PANIC; 2861 } else if (env->me_maxpg < txn->mt_next_pgno) { 2862 rc = MDB_MAP_RESIZED; 2863 } else { 2864 return MDB_SUCCESS; 2865 } 2866 mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); 2867 return rc; 2868 } 2869 2870 int 2871 mdb_txn_renew(MDB_txn *txn) 2872 { 2873 int rc; 2874 2875 if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED)) 2876 return EINVAL; 2877 2878 rc = mdb_txn_renew0(txn); 2879 if (rc == MDB_SUCCESS) { 2880 DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", 2881 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', 2882 (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); 2883 } 2884 return rc; 2885 } 2886 2887 int 2888 mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) 2889 { 2890 MDB_txn *txn; 2891 MDB_ntxn *ntxn; 2892 int rc, size, tsize; 2893 2894 flags &= MDB_TXN_BEGIN_FLAGS; 2895 flags |= env->me_flags & MDB_WRITEMAP; 2896 2897 if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */ 2898 return EACCES; 2899 2900 if (parent) { 2901 /* Nested transactions: Max 1 child, write txns only, no writemap */ 2902 flags |= parent->mt_flags; 2903 if (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED)) { 2904 return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; 2905 } 2906 /* Child txns save MDB_pgstate and use own copy of cursors */ 2907 size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1); 2908 size += tsize = sizeof(MDB_ntxn); 2909 } else if (flags & MDB_RDONLY) { 2910 size = env->me_maxdbs * (sizeof(MDB_db)+1); 2911 size += tsize = sizeof(MDB_txn); 2912 } else { 2913 /* Reuse preallocated write txn. 
However, do not touch it until 2914 * mdb_txn_renew0() succeeds, since it currently may be active. 2915 */ 2916 txn = env->me_txn0; 2917 goto renew; 2918 } 2919 if ((txn = calloc(1, size)) == NULL) { 2920 DPRINTF(("calloc: %s", strerror(errno))); 2921 return ENOMEM; 2922 } 2923 txn->mt_dbxs = env->me_dbxs; /* static */ 2924 txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); 2925 txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; 2926 txn->mt_flags = flags; 2927 txn->mt_env = env; 2928 2929 if (parent) { 2930 unsigned int i; 2931 txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); 2932 txn->mt_dbiseqs = parent->mt_dbiseqs; 2933 txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); 2934 if (!txn->mt_u.dirty_list || 2935 !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) 2936 { 2937 free(txn->mt_u.dirty_list); 2938 free(txn); 2939 return ENOMEM; 2940 } 2941 txn->mt_txnid = parent->mt_txnid; 2942 txn->mt_dirty_room = parent->mt_dirty_room; 2943 txn->mt_u.dirty_list[0].mid = 0; 2944 txn->mt_spill_pgs = NULL; 2945 txn->mt_next_pgno = parent->mt_next_pgno; 2946 parent->mt_flags |= MDB_TXN_HAS_CHILD; 2947 parent->mt_child = txn; 2948 txn->mt_parent = parent; 2949 txn->mt_numdbs = parent->mt_numdbs; 2950 memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); 2951 /* Copy parent's mt_dbflags, but clear DB_NEW */ 2952 for (i=0; i<txn->mt_numdbs; i++) 2953 txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; 2954 rc = 0; 2955 ntxn = (MDB_ntxn *)txn; 2956 ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ 2957 if (env->me_pghead) { 2958 size = MDB_IDL_SIZEOF(env->me_pghead); 2959 env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); 2960 if (env->me_pghead) 2961 memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); 2962 else 2963 rc = ENOMEM; 2964 } 2965 if (!rc) 2966 rc = mdb_cursor_shadow(parent, txn); 2967 if (rc) 2968 mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD); 2969 } else { /* MDB_RDONLY */ 2970 
txn->mt_dbiseqs = env->me_dbiseqs; 2971 renew: 2972 rc = mdb_txn_renew0(txn); 2973 } 2974 if (rc) { 2975 if (txn != env->me_txn0) 2976 free(txn); 2977 } else { 2978 txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */ 2979 *ret = txn; 2980 DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", 2981 txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', 2982 (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); 2983 } 2984 MDB_TRACE(("%p, %p, %u = %p", env, parent, flags, txn)); 2985 2986 return rc; 2987 } 2988 2989 MDB_env * 2990 mdb_txn_env(MDB_txn *txn) 2991 { 2992 if(!txn) return NULL; 2993 return txn->mt_env; 2994 } 2995 2996 size_t 2997 mdb_txn_id(MDB_txn *txn) 2998 { 2999 if(!txn) return 0; 3000 return txn->mt_txnid; 3001 } 3002 3003 /** Export or close DBI handles opened in this txn. */ 3004 static void 3005 mdb_dbis_update(MDB_txn *txn, int keep) 3006 { 3007 int i; 3008 MDB_dbi n = txn->mt_numdbs; 3009 MDB_env *env = txn->mt_env; 3010 unsigned char *tdbflags = txn->mt_dbflags; 3011 3012 for (i = n; --i >= CORE_DBS;) { 3013 if (tdbflags[i] & DB_NEW) { 3014 if (keep) { 3015 env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; 3016 } else { 3017 char *ptr = env->me_dbxs[i].md_name.mv_data; 3018 if (ptr) { 3019 env->me_dbxs[i].md_name.mv_data = NULL; 3020 env->me_dbxs[i].md_name.mv_size = 0; 3021 env->me_dbflags[i] = 0; 3022 env->me_dbiseqs[i]++; 3023 free(ptr); 3024 } 3025 } 3026 } 3027 } 3028 if (keep && env->me_numdbs < n) 3029 env->me_numdbs = n; 3030 } 3031 3032 /** End a transaction, except successful commit of a nested transaction. 3033 * May be called twice for readonly txns: First reset it, then abort. 
3034 * @param[in] txn the transaction handle to end 3035 * @param[in] mode why and how to end the transaction 3036 */ 3037 static void 3038 mdb_txn_end(MDB_txn *txn, unsigned mode) 3039 { 3040 MDB_env *env = txn->mt_env; 3041 #if MDB_DEBUG 3042 static const char *const names[] = MDB_END_NAMES; 3043 #endif 3044 3045 /* Export or close DBI handles opened in this txn */ 3046 mdb_dbis_update(txn, mode & MDB_END_UPDATE); 3047 3048 DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", 3049 names[mode & MDB_END_OPMASK], 3050 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', 3051 (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); 3052 3053 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { 3054 if (txn->mt_u.reader) { 3055 txn->mt_u.reader->mr_txnid = (txnid_t)-1; 3056 if (!(env->me_flags & MDB_NOTLS)) { 3057 txn->mt_u.reader = NULL; /* txn does not own reader */ 3058 } else if (mode & MDB_END_SLOT) { 3059 txn->mt_u.reader->mr_pid = 0; 3060 txn->mt_u.reader = NULL; 3061 } /* else txn owns the slot until it does MDB_END_SLOT */ 3062 } 3063 txn->mt_numdbs = 0; /* prevent further DBI activity */ 3064 txn->mt_flags |= MDB_TXN_FINISHED; 3065 3066 } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { 3067 pgno_t *pghead = env->me_pghead; 3068 3069 if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ 3070 mdb_cursors_close(txn, 0); 3071 if (!(env->me_flags & MDB_WRITEMAP)) { 3072 mdb_dlist_free(txn); 3073 } 3074 3075 txn->mt_numdbs = 0; 3076 txn->mt_flags = MDB_TXN_FINISHED; 3077 3078 if (!txn->mt_parent) { 3079 mdb_midl_shrink(&txn->mt_free_pgs); 3080 env->me_free_pgs = txn->mt_free_pgs; 3081 /* me_pgstate: */ 3082 env->me_pghead = NULL; 3083 env->me_pglast = 0; 3084 3085 env->me_txn = NULL; 3086 mode = 0; /* txn == env->me_txn0, do not free() it */ 3087 3088 /* The writer mutex was locked in mdb_txn_begin. 
*/ 3089 if (env->me_txns) 3090 UNLOCK_MUTEX(env->me_wmutex); 3091 } else { 3092 txn->mt_parent->mt_child = NULL; 3093 txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; 3094 env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; 3095 mdb_midl_free(txn->mt_free_pgs); 3096 free(txn->mt_u.dirty_list); 3097 } 3098 mdb_midl_free(txn->mt_spill_pgs); 3099 3100 mdb_midl_free(pghead); 3101 } 3102 3103 if (mode & MDB_END_FREE) 3104 free(txn); 3105 } 3106 3107 void 3108 mdb_txn_reset(MDB_txn *txn) 3109 { 3110 if (txn == NULL) 3111 return; 3112 3113 /* This call is only valid for read-only txns */ 3114 if (!(txn->mt_flags & MDB_TXN_RDONLY)) 3115 return; 3116 3117 mdb_txn_end(txn, MDB_END_RESET); 3118 } 3119 3120 static void 3121 _mdb_txn_abort(MDB_txn *txn) 3122 { 3123 if (txn == NULL) 3124 return; 3125 3126 if (txn->mt_child) 3127 _mdb_txn_abort(txn->mt_child); 3128 3129 mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); 3130 } 3131 3132 void 3133 mdb_txn_abort(MDB_txn *txn) 3134 { 3135 MDB_TRACE(("%p", txn)); 3136 _mdb_txn_abort(txn); 3137 } 3138 3139 /** Save the freelist as of this transaction to the freeDB. 3140 * This changes the freelist. Keep trying until it stabilizes. 3141 */ 3142 static int 3143 mdb_freelist_save(MDB_txn *txn) 3144 { 3145 /* env->me_pghead[] can grow and shrink during this call. 3146 * env->me_pglast and txn->mt_free_pgs[] can only grow. 3147 * Page numbers cannot disappear from txn->mt_free_pgs[]. 
3148 */ 3149 MDB_cursor mc; 3150 MDB_env *env = txn->mt_env; 3151 int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; 3152 txnid_t pglast = 0, head_id = 0; 3153 pgno_t freecnt = 0, *free_pgs, *mop; 3154 ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; 3155 3156 mdb_cursor_init(&mc, txn, FREE_DBI, NULL); 3157 3158 if (env->me_pghead) { 3159 /* Make sure first page of freeDB is touched and on freelist */ 3160 rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); 3161 if (rc && rc != MDB_NOTFOUND) 3162 return rc; 3163 } 3164 3165 if (!env->me_pghead && txn->mt_loose_pgs) { 3166 /* Put loose page numbers in mt_free_pgs, since 3167 * we may be unable to return them to me_pghead. 3168 */ 3169 MDB_page *mp = txn->mt_loose_pgs; 3170 MDB_ID2 *dl = txn->mt_u.dirty_list; 3171 unsigned x; 3172 if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0) 3173 return rc; 3174 for (; mp; mp = NEXT_LOOSE_PAGE(mp)) { 3175 mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); 3176 /* must also remove from dirty list */ 3177 if (txn->mt_flags & MDB_TXN_WRITEMAP) { 3178 for (x=1; x<=dl[0].mid; x++) 3179 if (dl[x].mid == mp->mp_pgno) 3180 break; 3181 mdb_tassert(txn, x <= dl[0].mid); 3182 } else { 3183 x = mdb_mid2l_search(dl, mp->mp_pgno); 3184 mdb_tassert(txn, dl[x].mid == mp->mp_pgno); 3185 mdb_dpage_free(env, mp); 3186 } 3187 dl[x].mptr = NULL; 3188 } 3189 { 3190 /* squash freed slots out of the dirty list */ 3191 unsigned y; 3192 for (y=1; dl[y].mptr && y <= dl[0].mid; y++); 3193 if (y <= dl[0].mid) { 3194 for(x=y, y++;;) { 3195 while (!dl[y].mptr && y <= dl[0].mid) y++; 3196 if (y > dl[0].mid) break; 3197 dl[x++] = dl[y++]; 3198 } 3199 dl[0].mid = x-1; 3200 } else { 3201 /* all slots freed */ 3202 dl[0].mid = 0; 3203 } 3204 } 3205 txn->mt_loose_pgs = NULL; 3206 txn->mt_loose_count = 0; 3207 } 3208 3209 /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ 3210 clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) 3211 ? 
SSIZE_MAX : maxfree_1pg; 3212 3213 for (;;) { 3214 /* Come back here after each Put() in case freelist changed */ 3215 MDB_val key, data; 3216 pgno_t *pgs; 3217 ssize_t j; 3218 3219 /* If using records from freeDB which we have not yet 3220 * deleted, delete them and any we reserved for me_pghead. 3221 */ 3222 while (pglast < env->me_pglast) { 3223 rc = mdb_cursor_first(&mc, &key, NULL); 3224 if (rc) 3225 return rc; 3226 pglast = head_id = *(txnid_t *)key.mv_data; 3227 total_room = head_room = 0; 3228 mdb_tassert(txn, pglast <= env->me_pglast); 3229 rc = _mdb_cursor_del(&mc, 0); 3230 if (rc) 3231 return rc; 3232 } 3233 3234 /* Save the IDL of pages freed by this txn, to a single record */ 3235 if (freecnt < txn->mt_free_pgs[0]) { 3236 if (!freecnt) { 3237 /* Make sure last page of freeDB is touched and on freelist */ 3238 rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); 3239 if (rc && rc != MDB_NOTFOUND) 3240 return rc; 3241 } 3242 free_pgs = txn->mt_free_pgs; 3243 /* Write to last page of freeDB */ 3244 key.mv_size = sizeof(txn->mt_txnid); 3245 key.mv_data = &txn->mt_txnid; 3246 do { 3247 freecnt = free_pgs[0]; 3248 data.mv_size = MDB_IDL_SIZEOF(free_pgs); 3249 rc = _mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); 3250 if (rc) 3251 return rc; 3252 /* Retry if mt_free_pgs[] grew during the Put() */ 3253 free_pgs = txn->mt_free_pgs; 3254 } while (freecnt < free_pgs[0]); 3255 mdb_midl_sort(free_pgs); 3256 memcpy(data.mv_data, free_pgs, data.mv_size); 3257 #if (MDB_DEBUG) > 1 3258 { 3259 unsigned int i = free_pgs[0]; 3260 DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", 3261 txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); 3262 for (; i; i--) 3263 DPRINTF(("IDL %"Z"u", free_pgs[i])); 3264 } 3265 #endif 3266 continue; 3267 } 3268 3269 mop = env->me_pghead; 3270 mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; 3271 3272 /* Reserve records for me_pghead[]. Split it if multi-page, 3273 * to avoid searching freeDB for a page range. 
Use keys in 3274 * range [1,me_pglast]: Smaller than txnid of oldest reader. 3275 */ 3276 if (total_room >= mop_len) { 3277 if (total_room == mop_len || --more < 0) 3278 break; 3279 } else if (head_room >= maxfree_1pg && head_id > 1) { 3280 /* Keep current record (overflow page), add a new one */ 3281 head_id--; 3282 head_room = 0; 3283 } 3284 /* (Re)write {key = head_id, IDL length = head_room} */ 3285 total_room -= head_room; 3286 head_room = mop_len - total_room; 3287 if (head_room > maxfree_1pg && head_id > 1) { 3288 /* Overflow multi-page for part of me_pghead */ 3289 head_room /= head_id; /* amortize page sizes */ 3290 head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); 3291 } else if (head_room < 0) { 3292 /* Rare case, not bothering to delete this record */ 3293 head_room = 0; 3294 } 3295 key.mv_size = sizeof(head_id); 3296 key.mv_data = &head_id; 3297 data.mv_size = (head_room + 1) * sizeof(pgno_t); 3298 rc = _mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); 3299 if (rc) 3300 return rc; 3301 /* IDL is initially empty, zero out at least the length */ 3302 pgs = (pgno_t *)data.mv_data; 3303 j = head_room > clean_limit ? head_room : 0; 3304 do { 3305 pgs[j] = 0; 3306 } while (--j >= 0); 3307 total_room += head_room; 3308 } 3309 3310 /* Return loose page numbers to me_pghead, though usually none are 3311 * left at this point. The pages themselves remain in dirty_list. 
3312 */ 3313 if (txn->mt_loose_pgs) { 3314 MDB_page *mp = txn->mt_loose_pgs; 3315 unsigned count = txn->mt_loose_count; 3316 MDB_IDL loose; 3317 /* Room for loose pages + temp IDL with same */ 3318 if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0) 3319 return rc; 3320 mop = env->me_pghead; 3321 loose = mop + MDB_IDL_ALLOCLEN(mop) - count; 3322 for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) 3323 loose[ ++count ] = mp->mp_pgno; 3324 loose[0] = count; 3325 mdb_midl_sort(loose); 3326 mdb_midl_xmerge(mop, loose); 3327 txn->mt_loose_pgs = NULL; 3328 txn->mt_loose_count = 0; 3329 mop_len = mop[0]; 3330 } 3331 3332 /* Fill in the reserved me_pghead records */ 3333 rc = MDB_SUCCESS; 3334 if (mop_len) { 3335 MDB_val key, data; 3336 3337 mop += mop_len; 3338 rc = mdb_cursor_first(&mc, &key, &data); 3339 for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { 3340 txnid_t id = *(txnid_t *)key.mv_data; 3341 ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; 3342 MDB_ID save; 3343 3344 mdb_tassert(txn, len >= 0 && id <= env->me_pglast); 3345 key.mv_data = &id; 3346 if (len > mop_len) { 3347 len = mop_len; 3348 data.mv_size = (len + 1) * sizeof(MDB_ID); 3349 } 3350 data.mv_data = mop -= len; 3351 save = mop[0]; 3352 mop[0] = len; 3353 rc = _mdb_cursor_put(&mc, &key, &data, MDB_CURRENT); 3354 mop[0] = save; 3355 if (rc || !(mop_len -= len)) 3356 break; 3357 } 3358 } 3359 return rc; 3360 } 3361 3362 /** Flush (some) dirty pages to the map, after clearing their dirty flag. 3363 * @param[in] txn the transaction that's being committed 3364 * @param[in] keep number of initial pages in dirty_list to keep dirty. 3365 * @return 0 on success, non-zero on failure. 
3366 */ 3367 static int 3368 mdb_page_flush(MDB_txn *txn, int keep) 3369 { 3370 MDB_env *env = txn->mt_env; 3371 MDB_ID2L dl = txn->mt_u.dirty_list; 3372 unsigned psize = env->me_psize, j; 3373 int i, pagecount = dl[0].mid, rc; 3374 size_t size = 0, pos = 0; 3375 pgno_t pgno = 0; 3376 MDB_page *dp = NULL; 3377 #ifdef _WIN32 3378 OVERLAPPED ov; 3379 #else 3380 struct iovec iov[MDB_COMMIT_PAGES]; 3381 ssize_t wpos = 0, wsize = 0, wres; 3382 size_t next_pos = 1; /* impossible pos, so pos != next_pos */ 3383 int n = 0; 3384 #endif 3385 3386 j = i = keep; 3387 3388 if (env->me_flags & MDB_WRITEMAP) { 3389 /* Clear dirty flags */ 3390 while (++i <= pagecount) { 3391 dp = dl[i].mptr; 3392 /* Don't flush this page yet */ 3393 if (dp->mp_flags & (P_LOOSE|P_KEEP)) { 3394 dp->mp_flags &= ~P_KEEP; 3395 dl[++j] = dl[i]; 3396 continue; 3397 } 3398 dp->mp_flags &= ~P_DIRTY; 3399 } 3400 goto done; 3401 } 3402 3403 /* Write the pages */ 3404 for (;;) { 3405 if (++i <= pagecount) { 3406 dp = dl[i].mptr; 3407 /* Don't flush this page yet */ 3408 if (dp->mp_flags & (P_LOOSE|P_KEEP)) { 3409 dp->mp_flags &= ~P_KEEP; 3410 dl[i].mid = 0; 3411 continue; 3412 } 3413 pgno = dl[i].mid; 3414 /* clear dirty flag */ 3415 dp->mp_flags &= ~P_DIRTY; 3416 pos = pgno * psize; 3417 size = psize; 3418 if (IS_OVERFLOW(dp)) size *= dp->mp_pages; 3419 } 3420 #ifdef _WIN32 3421 else break; 3422 3423 /* Windows actually supports scatter/gather I/O, but only on 3424 * unbuffered file handles. Since we're relying on the OS page 3425 * cache for all our data, that's self-defeating. So we just 3426 * write pages one at a time. We use the ov structure to set 3427 * the write offset, to at least save the overhead of a Seek 3428 * system call. 
3429 */ 3430 DPRINTF(("committing page %"Z"u", pgno)); 3431 memset(&ov, 0, sizeof(ov)); 3432 ov.Offset = pos & 0xffffffff; 3433 ov.OffsetHigh = pos >> 16 >> 16; 3434 if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { 3435 rc = ErrCode(); 3436 DPRINTF(("WriteFile: %d", rc)); 3437 return rc; 3438 } 3439 #else 3440 /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ 3441 if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { 3442 if (n) { 3443 retry_write: 3444 /* Write previous page(s) */ 3445 #ifdef MDB_USE_PWRITEV 3446 wres = pwritev(env->me_fd, iov, n, wpos); 3447 #else 3448 if (n == 1) { 3449 wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); 3450 } else { 3451 retry_seek: 3452 if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { 3453 rc = ErrCode(); 3454 if (rc == EINTR) 3455 goto retry_seek; 3456 DPRINTF(("lseek: %s", strerror(rc))); 3457 return rc; 3458 } 3459 wres = writev(env->me_fd, iov, n); 3460 } 3461 #endif 3462 if (wres != wsize) { 3463 if (wres < 0) { 3464 rc = ErrCode(); 3465 if (rc == EINTR) 3466 goto retry_write; 3467 DPRINTF(("Write error: %s", strerror(rc))); 3468 } else { 3469 rc = EIO; /* TODO: Use which error code? */ 3470 DPUTS("short write, filesystem full?"); 3471 } 3472 return rc; 3473 } 3474 n = 0; 3475 } 3476 if (i > pagecount) 3477 break; 3478 wpos = pos; 3479 wsize = 0; 3480 } 3481 DPRINTF(("committing page %"Z"u", pgno)); 3482 next_pos = pos + size; 3483 iov[n].iov_len = size; 3484 iov[n].iov_base = (char *)dp; 3485 wsize += size; 3486 n++; 3487 #endif /* _WIN32 */ 3488 } 3489 3490 /* MIPS has cache coherency issues, this is a no-op everywhere else 3491 * Note: for any size >= on-chip cache size, entire on-chip cache is 3492 * flushed. 
3493 */ 3494 CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE); 3495 3496 for (i = keep; ++i <= pagecount; ) { 3497 dp = dl[i].mptr; 3498 /* This is a page we skipped above */ 3499 if (!dl[i].mid) { 3500 dl[++j] = dl[i]; 3501 dl[j].mid = dp->mp_pgno; 3502 continue; 3503 } 3504 mdb_dpage_free(env, dp); 3505 } 3506 3507 done: 3508 i--; 3509 txn->mt_dirty_room += i - j; 3510 dl[0].mid = j; 3511 return MDB_SUCCESS; 3512 } 3513 3514 static int 3515 _mdb_txn_commit(MDB_txn *txn) 3516 { 3517 int rc; 3518 unsigned int i, end_mode; 3519 MDB_env *env; 3520 3521 if (txn == NULL) 3522 return EINVAL; 3523 3524 /* mdb_txn_end() mode for a commit which writes nothing */ 3525 end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; 3526 3527 if (txn->mt_child) { 3528 rc = _mdb_txn_commit(txn->mt_child); 3529 if (rc) 3530 goto fail; 3531 } 3532 3533 env = txn->mt_env; 3534 3535 if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { 3536 goto done; 3537 } 3538 3539 if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) { 3540 DPUTS("txn has failed/finished, can't commit"); 3541 if (txn->mt_parent) 3542 txn->mt_parent->mt_flags |= MDB_TXN_ERROR; 3543 rc = MDB_BAD_TXN; 3544 goto fail; 3545 } 3546 3547 if (txn->mt_parent) { 3548 MDB_txn *parent = txn->mt_parent; 3549 MDB_page **lp; 3550 MDB_ID2L dst, src; 3551 MDB_IDL pspill; 3552 unsigned x, y, len, ps_len; 3553 3554 /* Append our free list to parent's */ 3555 rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); 3556 if (rc) 3557 goto fail; 3558 mdb_midl_free(txn->mt_free_pgs); 3559 /* Failures after this must either undo the changes 3560 * to the parent or set MDB_TXN_ERROR in the parent. 3561 */ 3562 3563 parent->mt_next_pgno = txn->mt_next_pgno; 3564 parent->mt_flags = txn->mt_flags; 3565 3566 /* Merge our cursors into parent's and close them */ 3567 mdb_cursors_close(txn, 1); 3568 3569 /* Update parent's DB table. 
*/ 3570 memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); 3571 parent->mt_numdbs = txn->mt_numdbs; 3572 parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; 3573 parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; 3574 for (i=CORE_DBS; i<txn->mt_numdbs; i++) { 3575 /* preserve parent's DB_NEW status */ 3576 x = parent->mt_dbflags[i] & DB_NEW; 3577 parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; 3578 } 3579 3580 dst = parent->mt_u.dirty_list; 3581 src = txn->mt_u.dirty_list; 3582 /* Remove anything in our dirty list from parent's spill list */ 3583 if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { 3584 x = y = ps_len; 3585 pspill[0] = (pgno_t)-1; 3586 /* Mark our dirty pages as deleted in parent spill list */ 3587 for (i=0, len=src[0].mid; ++i <= len; ) { 3588 MDB_ID pn = src[i].mid << 1; 3589 while (pn > pspill[x]) 3590 x--; 3591 if (pn == pspill[x]) { 3592 pspill[x] = 1; 3593 y = --x; 3594 } 3595 } 3596 /* Squash deleted pagenums if we deleted any */ 3597 for (x=y; ++x <= ps_len; ) 3598 if (!(pspill[x] & 1)) 3599 pspill[++y] = pspill[x]; 3600 pspill[0] = y; 3601 } 3602 3603 /* Remove anything in our spill list from parent's dirty list */ 3604 if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { 3605 for (i=1; i<=txn->mt_spill_pgs[0]; i++) { 3606 MDB_ID pn = txn->mt_spill_pgs[i]; 3607 if (pn & 1) 3608 continue; /* deleted spillpg */ 3609 pn >>= 1; 3610 y = mdb_mid2l_search(dst, pn); 3611 if (y <= dst[0].mid && dst[y].mid == pn) { 3612 free(dst[y].mptr); 3613 while (y < dst[0].mid) { 3614 dst[y] = dst[y+1]; 3615 y++; 3616 } 3617 dst[0].mid--; 3618 } 3619 } 3620 } 3621 3622 /* Find len = length of merging our dirty list with parent's */ 3623 x = dst[0].mid; 3624 dst[0].mid = 0; /* simplify loops */ 3625 if (parent->mt_parent) { 3626 len = x + src[0].mid; 3627 y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; 3628 for (i = x; y && i; y--) { 3629 pgno_t yp = src[y].mid; 3630 while (yp < dst[i].mid) 3631 i--; 3632 if (yp == 
dst[i].mid) { 3633 i--; 3634 len--; 3635 } 3636 } 3637 } else { /* Simplify the above for single-ancestor case */ 3638 len = MDB_IDL_UM_MAX - txn->mt_dirty_room; 3639 } 3640 /* Merge our dirty list with parent's */ 3641 y = src[0].mid; 3642 for (i = len; y; dst[i--] = src[y--]) { 3643 pgno_t yp = src[y].mid; 3644 while (yp < dst[x].mid) 3645 dst[i--] = dst[x--]; 3646 if (yp == dst[x].mid) 3647 free(dst[x--].mptr); 3648 } 3649 mdb_tassert(txn, i == x); 3650 dst[0].mid = len; 3651 free(txn->mt_u.dirty_list); 3652 parent->mt_dirty_room = txn->mt_dirty_room; 3653 if (txn->mt_spill_pgs) { 3654 if (parent->mt_spill_pgs) { 3655 /* TODO: Prevent failure here, so parent does not fail */ 3656 rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); 3657 if (rc) 3658 parent->mt_flags |= MDB_TXN_ERROR; 3659 mdb_midl_free(txn->mt_spill_pgs); 3660 mdb_midl_sort(parent->mt_spill_pgs); 3661 } else { 3662 parent->mt_spill_pgs = txn->mt_spill_pgs; 3663 } 3664 } 3665 3666 /* Append our loose page list to parent's */ 3667 for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) 3668 ; 3669 *lp = txn->mt_loose_pgs; 3670 parent->mt_loose_count += txn->mt_loose_count; 3671 3672 parent->mt_child = NULL; 3673 mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); 3674 free(txn); 3675 return rc; 3676 } 3677 3678 if (txn != env->me_txn) { 3679 DPUTS("attempt to commit unknown transaction"); 3680 rc = EINVAL; 3681 goto fail; 3682 } 3683 3684 mdb_cursors_close(txn, 0); 3685 3686 if (!txn->mt_u.dirty_list[0].mid && 3687 !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) 3688 goto done; 3689 3690 DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", 3691 txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); 3692 3693 /* Update DB root pointers */ 3694 if (txn->mt_numdbs > CORE_DBS) { 3695 MDB_cursor mc; 3696 MDB_dbi i; 3697 MDB_val data; 3698 data.mv_size = sizeof(MDB_db); 3699 3700 mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); 3701 for (i = 
CORE_DBS; i < txn->mt_numdbs; i++) { 3702 if (txn->mt_dbflags[i] & DB_DIRTY) { 3703 if (TXN_DBI_CHANGED(txn, i)) { 3704 rc = MDB_BAD_DBI; 3705 goto fail; 3706 } 3707 data.mv_data = &txn->mt_dbs[i]; 3708 rc = _mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 3709 F_SUBDATA); 3710 if (rc) 3711 goto fail; 3712 } 3713 } 3714 } 3715 3716 rc = mdb_freelist_save(txn); 3717 if (rc) 3718 goto fail; 3719 3720 mdb_midl_free(env->me_pghead); 3721 env->me_pghead = NULL; 3722 mdb_midl_shrink(&txn->mt_free_pgs); 3723 3724 #if (MDB_DEBUG) > 2 3725 mdb_audit(txn); 3726 #endif 3727 3728 if ((rc = mdb_page_flush(txn, 0)) || 3729 (rc = mdb_env_sync(env, 0)) || 3730 (rc = mdb_env_write_meta(txn))) 3731 goto fail; 3732 end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; 3733 3734 done: 3735 mdb_txn_end(txn, end_mode); 3736 return MDB_SUCCESS; 3737 3738 fail: 3739 _mdb_txn_abort(txn); 3740 return rc; 3741 } 3742 3743 int 3744 mdb_txn_commit(MDB_txn *txn) 3745 { 3746 MDB_TRACE(("%p", txn)); 3747 return _mdb_txn_commit(txn); 3748 } 3749 3750 /** Read the environment parameters of a DB environment before 3751 * mapping it into memory. 3752 * @param[in] env the environment handle 3753 * @param[out] meta address of where to store the meta information 3754 * @return 0 on success, non-zero on failure. 3755 */ 3756 static int ESECT 3757 mdb_env_read_header(MDB_env *env, MDB_meta *meta) 3758 { 3759 MDB_metabuf pbuf; 3760 MDB_page *p; 3761 MDB_meta *m; 3762 int i, rc, off; 3763 enum { Size = sizeof(pbuf) }; 3764 3765 /* We don't know the page size yet, so use a minimum value. 3766 * Read both meta pages so we can use the latest one. 3767 */ 3768 3769 for (i=off=0; i<NUM_METAS; i++, off += meta->mm_psize) { 3770 #ifdef _WIN32 3771 DWORD len; 3772 OVERLAPPED ov; 3773 memset(&ov, 0, sizeof(ov)); 3774 ov.Offset = off; 3775 rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? 
(int)len : -1; 3776 if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) 3777 rc = 0; 3778 #else 3779 rc = pread(env->me_fd, &pbuf, Size, off); 3780 #endif 3781 if (rc != Size) { 3782 if (rc == 0 && off == 0) 3783 return ENOENT; 3784 rc = rc < 0 ? (int) ErrCode() : MDB_INVALID; 3785 DPRINTF(("read: %s", mdb_strerror(rc))); 3786 return rc; 3787 } 3788 3789 p = (MDB_page *)&pbuf; 3790 3791 if (!F_ISSET(p->mp_flags, P_META)) { 3792 DPRINTF(("page %"Z"u not a meta page", p->mp_pgno)); 3793 return MDB_INVALID; 3794 } 3795 3796 m = METADATA(p); 3797 if (m->mm_magic != MDB_MAGIC) { 3798 DPUTS("meta has invalid magic"); 3799 return MDB_INVALID; 3800 } 3801 3802 if (m->mm_version != MDB_DATA_VERSION) { 3803 DPRINTF(("database is version %u, expected version %u", 3804 m->mm_version, MDB_DATA_VERSION)); 3805 return MDB_VERSION_MISMATCH; 3806 } 3807 3808 if (off == 0 || m->mm_txnid > meta->mm_txnid) 3809 *meta = *m; 3810 } 3811 return 0; 3812 } 3813 3814 /** Fill in most of the zeroed #MDB_meta for an empty database environment */ 3815 static void ESECT 3816 mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) 3817 { 3818 meta->mm_magic = MDB_MAGIC; 3819 meta->mm_version = MDB_DATA_VERSION; 3820 meta->mm_mapsize = env->me_mapsize; 3821 meta->mm_psize = env->me_psize; 3822 meta->mm_last_pg = NUM_METAS-1; 3823 meta->mm_flags = env->me_flags & 0xffff; 3824 meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ 3825 meta->mm_dbs[FREE_DBI].md_root = P_INVALID; 3826 meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; 3827 } 3828 3829 /** Write the environment parameters of a freshly created DB environment. 3830 * @param[in] env the environment handle 3831 * @param[in] meta the #MDB_meta to write 3832 * @return 0 on success, non-zero on failure. 
3833 */ 3834 static int ESECT 3835 mdb_env_init_meta(MDB_env *env, MDB_meta *meta) 3836 { 3837 MDB_page *p, *q; 3838 int rc; 3839 unsigned int psize; 3840 #ifdef _WIN32 3841 DWORD len; 3842 OVERLAPPED ov; 3843 memset(&ov, 0, sizeof(ov)); 3844 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ 3845 ov.Offset = pos; \ 3846 rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) 3847 #else 3848 int len; 3849 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ 3850 len = pwrite(fd, ptr, size, pos); \ 3851 if (len == -1 && ErrCode() == EINTR) continue; \ 3852 rc = (len >= 0); break; } while(1) 3853 #endif 3854 3855 DPUTS("writing new meta page"); 3856 3857 psize = env->me_psize; 3858 3859 p = calloc(NUM_METAS, psize); 3860 if (!p) 3861 return ENOMEM; 3862 3863 p->mp_pgno = 0; 3864 p->mp_flags = P_META; 3865 *(MDB_meta *)METADATA(p) = *meta; 3866 3867 q = (MDB_page *)((char *)p + psize); 3868 q->mp_pgno = 1; 3869 q->mp_flags = P_META; 3870 *(MDB_meta *)METADATA(q) = *meta; 3871 3872 DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0); 3873 if (!rc) 3874 rc = ErrCode(); 3875 else if ((unsigned) len == psize * NUM_METAS) 3876 rc = MDB_SUCCESS; 3877 else 3878 rc = ENOSPC; 3879 free(p); 3880 return rc; 3881 } 3882 3883 /** Update the environment info to commit a transaction. 3884 * @param[in] txn the transaction that's being committed 3885 * @return 0 on success, non-zero on failure. 
 */
static int
mdb_env_write_meta(MDB_txn *txn)
{
	MDB_env *env;
	MDB_meta	meta, metab, *mp;
	unsigned flags;
	size_t mapsize;
	off_t off;
	int rc, len, toggle;
	char *ptr;
	HANDLE mfd;
#ifdef _WIN32
	OVERLAPPED ov;
#else
	int r2;
#endif

	/* The low bit of the txn ID selects which of the two meta pages
	 * is overwritten by this commit.
	 */
	toggle = txn->mt_txnid & 1;
	DPRINTF(("writing meta page %d for root page %"Z"u",
		toggle, txn->mt_dbs[MAIN_DBI].md_root));

	env = txn->mt_env;
	flags = env->me_flags;
	mp = env->me_metas[toggle];
	/* Start from the mapsize recorded in the other meta page */
	mapsize = env->me_metas[toggle ^ 1]->mm_mapsize;
	/* Persist any increases of mapsize config */
	if (mapsize < env->me_mapsize)
		mapsize = env->me_mapsize;

	if (flags & MDB_WRITEMAP) {
		/* Writable map: update the meta page in place. mm_txnid is
		 * stored last, after the other fields (and after the barrier
		 * below where one is issued).
		 */
		mp->mm_mapsize = mapsize;
		mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
		mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
		mp->mm_last_pg = txn->mt_next_pgno - 1;
#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */	\
	!(defined(__i386__) || defined(__x86_64__))
		/* LY: issue a memory barrier, if not x86. ITS#7969 */
		__sync_synchronize();
#endif
		mp->mm_txnid = txn->mt_txnid;
		if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
			unsigned meta_size = env->me_psize;
			/* rc temporarily holds the msync mode flag */
			rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
			/* Back up from the MDB_meta to the start of its page */
			ptr = (char *)mp - PAGEHDRSZ;
#ifndef _WIN32	/* POSIX msync() requires ptr = start of OS page */
			r2 = (ptr - env->me_map) & (env->me_os_psize - 1);
			ptr -= r2;
			meta_size += r2;
#endif
			if (MDB_MSYNC(ptr, meta_size, rc)) {
				rc = ErrCode();
				goto fail;
			}
		}
		goto done;
	}
	/* Remember the values currently in the target meta page so they
	 * can be written back if the write below fails.
	 */
	metab.mm_txnid = mp->mm_txnid;
	metab.mm_last_pg = mp->mm_last_pg;

	meta.mm_mapsize = mapsize;
	meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
	meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
	meta.mm_last_pg = txn->mt_next_pgno - 1;
	meta.mm_txnid = txn->mt_txnid;

	/* Only the part of MDB_meta from mm_mapsize to the end is written.
	 * off becomes the file offset of that part, derived from the
	 * position of the meta page within the map.
	 */
	off = offsetof(MDB_meta, mm_mapsize);
	ptr = (char *)&meta + off;
	len = sizeof(MDB_meta) - off;
	off += (char *)mp - env->me_map;

	/* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC.
	 * (me_mfd goes to the same file as me_fd, but writing to it
	 * also syncs to disk.  Avoids a separate fdatasync() call.)
	 */
	mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd;
#ifdef _WIN32
	{
		memset(&ov, 0, sizeof(ov));
		ov.Offset = off;
		if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov))
			rc = -1;
	}
#else
retry_write:
	rc = pwrite(mfd, ptr, len, off);
#endif
	if (rc != len) {
		/* A short write is reported as EIO */
		rc = rc < 0 ? ErrCode() : EIO;
#ifndef _WIN32
		if (rc == EINTR)
			goto retry_write;
#endif
		DPUTS("write failed, disk error?");
		/* On a failure, the pagecache still contains the new data.
		 * Write some old data back, to prevent it from being used.
		 * Use the non-SYNC fd; we know it will fail anyway.
		 */
		meta.mm_last_pg = metab.mm_last_pg;
		meta.mm_txnid = metab.mm_txnid;
#ifdef _WIN32
		memset(&ov, 0, sizeof(ov));
		ov.Offset = off;
		WriteFile(env->me_fd, ptr, len, NULL, &ov);
#else
		r2 = pwrite(env->me_fd, ptr, len, off);
		(void)r2;	/* Silence warnings. We don't care about pwrite's return value */
#endif
fail:
		/* Also reached via goto from the MDB_WRITEMAP msync failure */
		env->me_flags |= MDB_FATAL_ERROR;
		return rc;
	}
	/* MIPS has cache coherency issues, this is a no-op everywhere else */
	CACHEFLUSH(env->me_map + off, len, DCACHE);
done:
	/* Memory ordering issues are irrelevant; since the entire writer
	 * is wrapped by wmutex, all of these changes will become visible
	 * after the wmutex is unlocked. Since the DB is multi-version,
	 * readers will get consistent data regardless of how fresh or
	 * how stale their view of these values is.
	 */
	if (env->me_txns)
		env->me_txns->mti_txnid = txn->mt_txnid;

	return MDB_SUCCESS;
}

/** Check both meta pages to see which one is newer.
 * @param[in] env the environment handle
 * @return newest #MDB_meta.
 */
static MDB_meta *
mdb_env_pick_meta(const MDB_env *env)
{
	MDB_meta *const *metas = env->me_metas;
	/* The comparison yields index 1 only when meta 1 has the strictly
	 * larger txn ID; otherwise meta 0 is returned.
	 */
	return metas[ metas[0]->mm_txnid < metas[1]->mm_txnid ];
}

/** Allocate a zeroed #MDB_env and fill in its defaults.
 * All three file handles start out as INVALID_HANDLE_VALUE.
 * @param[out] env receives the new environment handle
 * @return #MDB_SUCCESS, or ENOMEM if the allocation fails.
 */
int ESECT
mdb_env_create(MDB_env **env)
{
	MDB_env *e;

	e = calloc(1, sizeof(MDB_env));
	if (!e)
		return ENOMEM;

	e->me_maxreaders = DEFAULT_READERS;
	e->me_maxdbs = e->me_numdbs = CORE_DBS;
	e->me_fd = INVALID_HANDLE_VALUE;
	e->me_lfd = INVALID_HANDLE_VALUE;
	e->me_mfd = INVALID_HANDLE_VALUE;
#ifdef MDB_USE_POSIX_SEM
	e->me_rmutex = SEM_FAILED;
	e->me_wmutex = SEM_FAILED;
#endif
	e->me_pid = getpid();
	GET_PAGESIZE(e->me_os_psize);
	VGMEMP_CREATE(e,0,0);
	*env = e;
	MDB_TRACE(("%p", e));
	return MDB_SUCCESS;
}

/** Map the data file into memory and locate the two meta pages.
 * @param[in] env the environment handle; me_fd, me_mapsize, me_psize
 *	and me_flags are read, me_map and me_metas[] are set.
 * @param[in] addr requested map address, or NULL for no preference
 * @return 0 on success, EBUSY if \b addr was requested but the map
 *	landed elsewhere, otherwise an OS error code.
 */
static int ESECT
mdb_env_map(MDB_env *env, void *addr)
{
	MDB_page *p;
	unsigned int flags = env->me_flags;
#ifdef _WIN32
	int rc;
	HANDLE mh;
	LONG sizelo, sizehi;
	size_t msize;

	if (flags & MDB_RDONLY) {
		/* Don't set explicit map size, use whatever exists */
		msize = 0;
		sizelo = 0;
		sizehi = 0;
	} else {
		msize = env->me_mapsize;
		sizelo = msize & 0xffffffff;
		sizehi = msize >> 16 >> 16; /* only needed on Win64 */

		/* Windows won't create mappings for zero length files.
		 * and won't map more than the file size.
		 * Just set the maxsize right now.
		 */
		if (!(flags & MDB_WRITEMAP) && (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo
			|| !SetEndOfFile(env->me_fd)
			|| SetFilePointer(env->me_fd, 0, NULL, 0) != 0))
			return ErrCode();
	}

	mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ?
		PAGE_READWRITE : PAGE_READONLY,
		sizehi, sizelo, NULL);
	if (!mh)
		return ErrCode();
	env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ?
		FILE_MAP_WRITE : FILE_MAP_READ,
		0, 0, msize, addr);
	/* Capture the error before CloseHandle() can change it */
	rc = env->me_map ? 0 : ErrCode();
	CloseHandle(mh);
	if (rc)
		return rc;
#else
	int mmap_flags = MAP_SHARED;
	int prot = PROT_READ;
#ifdef MAP_NOSYNC	/* Used on FreeBSD */
	if (flags & MDB_NOSYNC)
		mmap_flags |= MAP_NOSYNC;
#endif
	if (flags & MDB_WRITEMAP) {
		/* A writable map sets the file length to the map size first */
		prot |= PROT_WRITE;
		if (ftruncate(env->me_fd, env->me_mapsize) < 0)
			return ErrCode();
	}
	env->me_map = mmap(addr, env->me_mapsize, prot, mmap_flags,
		env->me_fd, 0);
	if (env->me_map == MAP_FAILED) {
		env->me_map = NULL;
		return ErrCode();
	}

	if (flags & MDB_NORDAHEAD) {
		/* Turn off readahead. It's harmful when the DB is larger than RAM. */
#ifdef MADV_RANDOM
		madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
#else
#ifdef POSIX_MADV_RANDOM
		posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
#endif /* POSIX_MADV_RANDOM */
#endif /* MADV_RANDOM */
	}
#endif /* _WIN32 */

	/* Can happen because the address argument to mmap() is just a
	 * hint.  mmap() can pick another, e.g. if the range is in use.
	 * The MAP_FIXED flag would prevent that, but then mmap could
	 * instead unmap existing pages to make room for the new map.
	 */
	if (addr && env->me_map != addr)
		return EBUSY;	/* TODO: Make a new MDB_* error code? */

	/* Meta 0 sits in the first page of the map; meta 1 is located
	 * exactly me_psize bytes after it.
	 */
	p = (MDB_page *)env->me_map;
	env->me_metas[0] = METADATA(p);
	env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize);

	return MDB_SUCCESS;
}

/** Set the map size, remapping the data file if it is already mapped.
 * @param[in] env the environment handle
 * @param[in] size requested size in bytes. When the environment is
 *	mapped, 0 selects the size stored in the newest meta page and a
 *	value below the space already in use is rounded up.
 * @return #MDB_SUCCESS, EINVAL if me_txn is set on a mapped
 *	environment, or the error from remapping.
 */
int ESECT
mdb_env_set_mapsize(MDB_env *env, size_t size)
{
	/* If env is already open, caller is responsible for making
	 * sure there are no active txns.
	 */
	if (env->me_map) {
		int rc;
		MDB_meta *meta;
		void *old;
		if (env->me_txn)
			return EINVAL;
		meta = mdb_env_pick_meta(env);
		if (!size)
			size = meta->mm_mapsize;
		{
			/* Silently round up to minimum if the size is too small */
			size_t minsize = (meta->mm_last_pg + 1) * env->me_psize;
			if (size < minsize)
				size = minsize;
		}
		/* Unmap using the old size, then map again with the new one */
		munmap(env->me_map, env->me_mapsize);
		env->me_mapsize = size;
		/* With MDB_FIXEDMAP the previous address is requested again */
		old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
		rc = mdb_env_map(env, old);
		if (rc)
			return rc;
	}
	env->me_mapsize = size;
	if (env->me_psize)
		env->me_maxpg = env->me_mapsize / env->me_psize;
	MDB_TRACE(("%p, %"Yu"", env, size));
	return MDB_SUCCESS;
}

/** Set the number of named databases, in addition to the CORE_DBS.
 * @param[in] env the environment handle
 * @param[in] dbs number of additional databases
 * @return #MDB_SUCCESS, or EINVAL if the environment is already mapped.
 */
int ESECT
mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
{
	if (env->me_map)
		return EINVAL;
	env->me_maxdbs = dbs + CORE_DBS;
	MDB_TRACE(("%p, %u", env, dbs));
	return MDB_SUCCESS;
}

/** Set the maximum number of reader slots.
 * @param[in] env the environment handle
 * @param[in] readers number of slots, at least 1
 * @return #MDB_SUCCESS, or EINVAL if the environment is already mapped
 *	or \b readers is 0.
 */
int ESECT
mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
{
	if (env->me_map || readers < 1)
		return EINVAL;
	env->me_maxreaders = readers;
	MDB_TRACE(("%p, %u", env, readers));
	return MDB_SUCCESS;
}

/** Get the configured maximum number of reader slots.
 * @param[in] env the environment handle
 * @param[out] readers receives env->me_maxreaders
 * @return #MDB_SUCCESS, or EINVAL if either argument is NULL.
 */
int ESECT
mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
{
	if (!env || !readers)
		return EINVAL;
	*readers = env->me_maxreaders;
	return MDB_SUCCESS;
}

/** Get the size of an open file.
 * @param[in] fd the file handle
 * @param[out] size receives the file size in bytes
 * @return #MDB_SUCCESS, or an OS error code if the query fails.
 */
static int ESECT
mdb_fsize(HANDLE fd, size_t *size)
{
#ifdef _WIN32
	LARGE_INTEGER fsize;

	if (!GetFileSizeEx(fd, &fsize))
		return ErrCode();

	*size = fsize.QuadPart;
#else
	struct stat st;

	if (fstat(fd, &st))
		return ErrCode();

	*size = st.st_size;
#endif
	return MDB_SUCCESS;
}


#ifdef _WIN32
typedef wchar_t	mdb_nchar_t;
# define MDB_NAME(str)	L##str
# define mdb_name_cpy	wcscpy
#else
/** Character type for file names: char on Unix, wchar_t on Windows */
typedef char	mdb_nchar_t;
# define MDB_NAME(str)	str		/**< #mdb_nchar_t[] string literal */
# define mdb_name_cpy	strcpy	/**< Copy name (#mdb_nchar_t string) */
#endif

/** Filename - string of #mdb_nchar_t[] */
typedef struct MDB_name {
	int mn_len;					/**< Length  */
	int mn_alloced;				/**< True if #mn_val was malloced */
	mdb_nchar_t	*mn_val;		/**< Contents */
} MDB_name;

/**
Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR] */ 4244 static const mdb_nchar_t *const mdb_suffixes[2][2] = { 4245 { MDB_NAME("/data.mdb"), MDB_NAME("") }, 4246 { MDB_NAME("/lock.mdb"), MDB_NAME("-lock") } 4247 }; 4248 4249 #define MDB_SUFFLEN 9 /**< Max string length in #mdb_suffixes[] */ 4250 4251 /** Set up filename + scratch area for filename suffix, for opening files. 4252 * It should be freed with #mdb_fname_destroy(). 4253 * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16. 4254 * 4255 * @param[in] path Pathname for #mdb_env_open(). 4256 * @param[in] envflags Whether a subdir and/or lockfile will be used. 4257 * @param[out] fname Resulting filename, with room for a suffix if necessary. 4258 */ 4259 static int ESECT 4260 mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) 4261 { 4262 int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK); 4263 fname->mn_alloced = 0; 4264 #ifdef _WIN32 4265 return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN); 4266 #else 4267 fname->mn_len = strlen(path); 4268 if (no_suffix) 4269 fname->mn_val = (char *) path; 4270 else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) { 4271 fname->mn_alloced = 1; 4272 strcpy(fname->mn_val, path); 4273 } 4274 else 4275 return ENOMEM; 4276 return MDB_SUCCESS; 4277 #endif 4278 } 4279 4280 /** Destroy \b fname from #mdb_fname_init() */ 4281 #define mdb_fname_destroy(fname) \ 4282 do { if ((fname).mn_alloced) free((fname).mn_val); } while (0) 4283 4284 #ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */ 4285 # define MDB_CLOEXEC O_CLOEXEC 4286 #else 4287 # define MDB_CLOEXEC 0 4288 #endif 4289 4290 /** File type, access mode etc. for #mdb_fopen() */ 4291 enum mdb_fopen_type { 4292 #ifdef _WIN32 4293 MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS 4294 #else 4295 /* A comment in mdb_fopen() explains some O_* flag choices. 
*/ 4296 MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ 4297 MDB_O_RDWR = O_RDWR |O_CREAT, /**< for me_fd */ 4298 MDB_O_META = O_WRONLY|MDB_DSYNC |MDB_CLOEXEC, /**< for me_mfd */ 4299 MDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */ 4300 /** Bitmask for open() flags in enum #mdb_fopen_type. The other bits 4301 * distinguish otherwise-equal MDB_O_* constants from each other. 4302 */ 4303 MDB_O_MASK = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY, 4304 MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */ 4305 #endif 4306 }; 4307 4308 /** Open an LMDB file. 4309 * @param[in] env The LMDB environment. 4310 * @param[in,out] fname Path from from #mdb_fname_init(). A suffix is 4311 * appended if necessary to create the filename, without changing mn_len. 4312 * @param[in] which Determines file type, access mode, etc. 4313 * @param[in] mode The Unix permissions for the file, if we create it. 4314 * @param[out] res Resulting file handle. 4315 * @return 0 on success, non-zero on failure. 4316 */ 4317 static int ESECT 4318 mdb_fopen(const MDB_env *env, MDB_name *fname, 4319 enum mdb_fopen_type which, mdb_mode_t mode, 4320 HANDLE *res) 4321 { 4322 int rc = MDB_SUCCESS; 4323 HANDLE fd; 4324 #ifdef _WIN32 4325 DWORD acc, share, disp, attrs; 4326 #else 4327 int flags; 4328 #endif 4329 4330 if (fname->mn_alloced) /* modifiable copy */ 4331 mdb_name_cpy(fname->mn_val + fname->mn_len, 4332 mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]); 4333 4334 /* The directory must already exist. Usually the file need not. 4335 * MDB_O_META requires the file because we already created it using 4336 * MDB_O_RDWR. MDB_O_COPY must not overwrite an existing file. 4337 * 4338 * With MDB_O_COPY we do not want the OS to cache the writes, since 4339 * the source data is already in the OS cache. 
4340 * 4341 * The lockfile needs FD_CLOEXEC (close file descriptor on exec*()) 4342 * to avoid the flock() issues noted under Caveats in lmdb.h. 4343 * Also set it for other filehandles which the user cannot get at 4344 * and close himself, which he may need after fork(). I.e. all but 4345 * me_fd, which programs do use via mdb_env_get_fd(). 4346 */ 4347 4348 #ifdef _WIN32 4349 acc = GENERIC_READ|GENERIC_WRITE; 4350 share = FILE_SHARE_READ|FILE_SHARE_WRITE; 4351 disp = OPEN_ALWAYS; 4352 attrs = FILE_ATTRIBUTE_NORMAL; 4353 switch (which) { 4354 case MDB_O_RDONLY: /* read-only datafile */ 4355 acc = GENERIC_READ; 4356 disp = OPEN_EXISTING; 4357 break; 4358 case MDB_O_META: /* for writing metapages */ 4359 acc = GENERIC_WRITE; 4360 disp = OPEN_EXISTING; 4361 attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH; 4362 break; 4363 case MDB_O_COPY: /* mdb_env_copy() & co */ 4364 acc = GENERIC_WRITE; 4365 share = 0; 4366 disp = CREATE_NEW; 4367 attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH; 4368 break; 4369 default: break; /* silence gcc -Wswitch (not all enum values handled) */ 4370 } 4371 fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL); 4372 #else 4373 fd = open(fname->mn_val, which & MDB_O_MASK, mode); 4374 #endif 4375 4376 if (fd == INVALID_HANDLE_VALUE) 4377 rc = ErrCode(); 4378 #ifndef _WIN32 4379 else { 4380 if (which != MDB_O_RDONLY && which != MDB_O_RDWR) { 4381 /* Set CLOEXEC if we could not pass it to open() */ 4382 if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1) 4383 (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); 4384 } 4385 if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) { 4386 /* This may require buffer alignment. There is no portable 4387 * way to ask how much, so we require OS pagesize alignment. 4388 */ 4389 # ifdef F_NOCACHE /* __APPLE__ */ 4390 (void) fcntl(fd, F_NOCACHE, 1); 4391 # elif defined O_DIRECT 4392 /* open(...O_DIRECT...) would break on filesystems without 4393 * O_DIRECT support (ITS#7682). 
Try to set it here instead. 4394 */ 4395 if ((flags = fcntl(fd, F_GETFL)) != -1) 4396 (void) fcntl(fd, F_SETFL, flags | O_DIRECT); 4397 # endif 4398 } 4399 } 4400 #endif /* !_WIN32 */ 4401 4402 *res = fd; 4403 return rc; 4404 } 4405 4406 4407 #ifdef BROKEN_FDATASYNC 4408 #include <sys/utsname.h> 4409 #include <sys/vfs.h> 4410 #endif 4411 4412 /** Further setup required for opening an LMDB environment 4413 */ 4414 static int ESECT 4415 mdb_env_open2(MDB_env *env) 4416 { 4417 unsigned int flags = env->me_flags; 4418 int i, newenv = 0, rc; 4419 MDB_meta meta; 4420 4421 #ifdef _WIN32 4422 /* See if we should use QueryLimited */ 4423 rc = GetVersion(); 4424 if ((rc & 0xff) > 5) 4425 env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; 4426 else 4427 env->me_pidquery = PROCESS_QUERY_INFORMATION; 4428 #endif /* _WIN32 */ 4429 4430 #ifdef BROKEN_FDATASYNC 4431 /* ext3/ext4 fdatasync is broken on some older Linux kernels. 4432 * https://lkml.org/lkml/2012/9/3/83 4433 * Kernels after 3.6-rc6 are known good. 4434 * https://lkml.org/lkml/2012/9/10/556 4435 * See if the DB is on ext3/ext4, then check for new enough kernel 4436 * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known 4437 * to be patched. 
4438 */ 4439 { 4440 struct statfs st; 4441 fstatfs(env->me_fd, &st); 4442 while (st.f_type == 0xEF53) { 4443 struct utsname uts; 4444 int i; 4445 uname(&uts); 4446 if (uts.release[0] < '3') { 4447 if (!strncmp(uts.release, "2.6.32.", 7)) { 4448 i = atoi(uts.release+7); 4449 if (i >= 60) 4450 break; /* 2.6.32.60 and newer is OK */ 4451 } else if (!strncmp(uts.release, "2.6.34.", 7)) { 4452 i = atoi(uts.release+7); 4453 if (i >= 15) 4454 break; /* 2.6.34.15 and newer is OK */ 4455 } 4456 } else if (uts.release[0] == '3') { 4457 i = atoi(uts.release+2); 4458 if (i > 5) 4459 break; /* 3.6 and newer is OK */ 4460 if (i == 5) { 4461 i = atoi(uts.release+4); 4462 if (i >= 4) 4463 break; /* 3.5.4 and newer is OK */ 4464 } else if (i == 2) { 4465 i = atoi(uts.release+4); 4466 if (i >= 30) 4467 break; /* 3.2.30 and newer is OK */ 4468 } 4469 } else { /* 4.x and newer is OK */ 4470 break; 4471 } 4472 env->me_flags |= MDB_FSYNCONLY; 4473 break; 4474 } 4475 } 4476 #endif 4477 4478 if ((i = mdb_env_read_header(env, &meta)) != 0) { 4479 if (i != ENOENT) 4480 return i; 4481 DPUTS("new mdbenv"); 4482 newenv = 1; 4483 env->me_psize = env->me_os_psize; 4484 if (env->me_psize > MAX_PAGESIZE) 4485 env->me_psize = MAX_PAGESIZE; 4486 memset(&meta, 0, sizeof(meta)); 4487 mdb_env_init_meta0(env, &meta); 4488 meta.mm_mapsize = DEFAULT_MAPSIZE; 4489 } else { 4490 env->me_psize = meta.mm_psize; 4491 } 4492 4493 /* Was a mapsize configured? */ 4494 if (!env->me_mapsize) { 4495 env->me_mapsize = meta.mm_mapsize; 4496 } 4497 { 4498 /* Make sure mapsize >= committed data size. Even when using 4499 * mm_mapsize, which could be broken in old files (ITS#7789). 4500 */ 4501 size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; 4502 if (env->me_mapsize < minsize) 4503 env->me_mapsize = minsize; 4504 } 4505 meta.mm_mapsize = env->me_mapsize; 4506 4507 if (newenv && !(flags & MDB_FIXEDMAP)) { 4508 /* mdb_env_map() may grow the datafile. 
Write the metapages 4509 * first, so the file will be valid if initialization fails. 4510 * Except with FIXEDMAP, since we do not yet know mm_address. 4511 * We could fill in mm_address later, but then a different 4512 * program might end up doing that - one with a memory layout 4513 * and map address which does not suit the main program. 4514 */ 4515 rc = mdb_env_init_meta(env, &meta); 4516 if (rc) 4517 return rc; 4518 newenv = 0; 4519 } 4520 4521 rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL); 4522 if (rc) 4523 return rc; 4524 4525 if (newenv) { 4526 if (flags & MDB_FIXEDMAP) 4527 meta.mm_address = env->me_map; 4528 i = mdb_env_init_meta(env, &meta); 4529 if (i != MDB_SUCCESS) { 4530 return i; 4531 } 4532 } 4533 4534 env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; 4535 env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) 4536 - sizeof(indx_t); 4537 #if !(MDB_MAXKEYSIZE) 4538 env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); 4539 #endif 4540 env->me_maxpg = env->me_mapsize / env->me_psize; 4541 4542 #if MDB_DEBUG 4543 { 4544 MDB_meta *meta = mdb_env_pick_meta(env); 4545 MDB_db *db = &meta->mm_dbs[MAIN_DBI]; 4546 4547 DPRINTF(("opened database version %u, pagesize %u", 4548 meta->mm_version, env->me_psize)); 4549 DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1))); 4550 DPRINTF(("depth: %u", db->md_depth)); 4551 DPRINTF(("entries: %"Z"u", db->md_entries)); 4552 DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); 4553 DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); 4554 DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); 4555 DPRINTF(("root: %"Z"u", db->md_root)); 4556 } 4557 #endif 4558 4559 return MDB_SUCCESS; 4560 } 4561 4562 4563 /** Release a reader thread's slot in the reader lock table. 4564 * This function is called automatically when a thread exits. 4565 * @param[in] ptr This points to the slot in the reader lock table. 
 */
static void
mdb_env_reader_dest(void *ptr)
{
	MDB_reader *reader = ptr;

	/* On non-Windows builds the assignment below is the body of this
	 * if; on Windows it is unconditional.
	 */
#ifndef _WIN32
	if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */
#endif
		/* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */
		reader->mr_pid = 0;
}

#ifdef _WIN32
/** Junk for arranging thread-specific callbacks on Windows. This is
 *	necessarily platform and compiler-specific. Windows supports up
 *	to 1088 keys. Let's assume nobody opens more than 64 environments
 *	in a single process, for now. They can override this if needed.
 */
#ifndef MAX_TLS_KEYS
#define MAX_TLS_KEYS	64
#endif
static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS];
static int mdb_tls_nkeys;

/** TLS callback: on thread detach, release the reader slot stored under
 * each registered key in #mdb_tls_keys[]. The other notifications are
 * ignored.
 */
static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
{
	int i;
	switch(reason) {
	case DLL_PROCESS_ATTACH: break;
	case DLL_THREAD_ATTACH: break;
	case DLL_THREAD_DETACH:
		for (i=0; i<mdb_tls_nkeys; i++) {
			MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]);
			if (r) {
				mdb_env_reader_dest(r);
			}
		}
		break;
	case DLL_PROCESS_DETACH: break;
	}
}
/* Place a pointer to the callback in the .CRT$XLB section, using the
 * syntax each compiler provides for that.
 */
#ifdef __GNUC__
#ifdef _WIN64
const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
#else
PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
#endif
#else
#ifdef _WIN64
/* Force some symbol references.
 *	_tls_used forces the linker to create the TLS directory if not already done
 *	mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol.
 */
#pragma comment(linker, "/INCLUDE:_tls_used")
#pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
#pragma const_seg(".CRT$XLB")
extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma const_seg()
#else	/* _WIN32 */
#pragma comment(linker, "/INCLUDE:__tls_used")
#pragma comment(linker, "/INCLUDE:_mdb_tls_cbp")
#pragma data_seg(".CRT$XLB")
PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma data_seg()
#endif	/* WIN 32/64 */
#endif	/* !__GNUC__ */
#endif

/** Downgrade the exclusive lock on the region back to shared
 * Before the lock is changed, the txn ID of the newest meta page is
 * copied into the lock region.
 * @param[in] env the environment handle
 * @param[out] excl set to 0 once the shared lock is held; on the POSIX
 *	path set to -1 on failure.
 * @return 0 on success, non-zero on failure.
 */
static int ESECT
mdb_env_share_locks(MDB_env *env, int *excl)
{
	int rc = 0;
	MDB_meta *meta = mdb_env_pick_meta(env);

	env->me_txns->mti_txnid = meta->mm_txnid;

#ifdef _WIN32
	{
		OVERLAPPED ov;
		/* First acquire a shared lock. The Unlock will
		 * then release the existing exclusive lock.
		 */
		memset(&ov, 0, sizeof(ov));
		if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
			rc = ErrCode();
		} else {
			UnlockFile(env->me_lfd, 0, 0, 1, 0);
			*excl = 0;
		}
	}
#else
	{
		struct flock lock_info;
		/* The shared lock replaces the existing lock */
		memset((void *)&lock_info, 0, sizeof(lock_info));
		lock_info.l_type = F_RDLCK;
		lock_info.l_whence = SEEK_SET;
		lock_info.l_start = 0;
		lock_info.l_len = 1;
		/* Retry while fcntl() fails with EINTR; rc ends up 0 or the
		 * error code.
		 */
		while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
				(rc = ErrCode()) == EINTR) ;
		*excl = rc ? -1 : 0;	/* error may mean we lost the lock */
	}
#endif

	return rc;
}

/** Try to get exclusive lock, otherwise shared.
 *	Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive.
4679 */ 4680 static int ESECT 4681 mdb_env_excl_lock(MDB_env *env, int *excl) 4682 { 4683 int rc = 0; 4684 #ifdef _WIN32 4685 if (LockFile(env->me_lfd, 0, 0, 1, 0)) { 4686 *excl = 1; 4687 } else { 4688 OVERLAPPED ov; 4689 memset(&ov, 0, sizeof(ov)); 4690 if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { 4691 *excl = 0; 4692 } else { 4693 rc = ErrCode(); 4694 } 4695 } 4696 #else 4697 struct flock lock_info; 4698 memset((void *)&lock_info, 0, sizeof(lock_info)); 4699 lock_info.l_type = F_WRLCK; 4700 lock_info.l_whence = SEEK_SET; 4701 lock_info.l_start = 0; 4702 lock_info.l_len = 1; 4703 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && 4704 (rc = ErrCode()) == EINTR) ; 4705 if (!rc) { 4706 *excl = 1; 4707 } else 4708 # ifndef MDB_USE_POSIX_MUTEX 4709 if (*excl < 0) /* always true when MDB_USE_POSIX_MUTEX */ 4710 # endif 4711 { 4712 lock_info.l_type = F_RDLCK; 4713 while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && 4714 (rc = ErrCode()) == EINTR) ; 4715 if (rc == 0) 4716 *excl = 0; 4717 } 4718 #endif 4719 return rc; 4720 } 4721 4722 #ifdef MDB_USE_HASH 4723 /* 4724 * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code 4725 * 4726 * @(#) Revision: 5.1 4727 * @(#) Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp 4728 * @(#) Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v 4729 * 4730 * http://www.isthe.com/chongo/tech/comp/fnv/index.html 4731 * 4732 *** 4733 * 4734 * Please do not copyright this code. This code is in the public domain. 4735 * 4736 * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 4737 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO 4738 * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR 4739 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF 4740 * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 4741 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 4742 * PERFORMANCE OF THIS SOFTWARE. 
4743 * 4744 * By: 4745 * chongo <Landon Curt Noll> /\oo/\ 4746 * http://www.isthe.com/chongo/ 4747 * 4748 * Share and Enjoy! :-) 4749 */ 4750 4751 typedef unsigned long long mdb_hash_t; 4752 #define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) 4753 4754 /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer 4755 * @param[in] val value to hash 4756 * @param[in] hval initial value for hash 4757 * @return 64 bit hash 4758 * 4759 * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the 4760 * hval arg on the first call. 4761 */ 4762 static mdb_hash_t 4763 mdb_hash_val(MDB_val *val, mdb_hash_t hval) 4764 { 4765 unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ 4766 unsigned char *end = s + val->mv_size; 4767 /* 4768 * FNV-1a hash each octet of the string 4769 */ 4770 while (s < end) { 4771 /* xor the bottom with the current octet */ 4772 hval ^= (mdb_hash_t)*s++; 4773 4774 /* multiply by the 64 bit FNV magic prime mod 2^64 */ 4775 hval += (hval << 1) + (hval << 4) + (hval << 5) + 4776 (hval << 7) + (hval << 8) + (hval << 40); 4777 } 4778 /* return our new hash value */ 4779 return hval; 4780 } 4781 4782 /** Hash the string and output the encoded hash. 4783 * This uses modified RFC1924 Ascii85 encoding to accommodate systems with 4784 * very short name limits. We don't care about the encoding being reversible, 4785 * we just want to preserve as many bits of the input as possible in a 4786 * small printable string. 
4787 * @param[in] str string to hash 4788 * @param[out] encbuf an array of 11 chars to hold the hash 4789 */ 4790 static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; 4791 4792 static void ESECT 4793 mdb_pack85(unsigned long l, char *out) 4794 { 4795 int i; 4796 4797 for (i=0; i<5; i++) { 4798 *out++ = mdb_a85[l % 85]; 4799 l /= 85; 4800 } 4801 } 4802 4803 static void ESECT 4804 mdb_hash_enc(MDB_val *val, char *encbuf) 4805 { 4806 mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); 4807 4808 mdb_pack85(h, encbuf); 4809 mdb_pack85(h>>32, encbuf+5); 4810 encbuf[10] = '\0'; 4811 } 4812 #endif 4813 4814 /** Open and/or initialize the lock region for the environment. 4815 * @param[in] env The LMDB environment. 4816 * @param[in] fname Filename + scratch area, from #mdb_fname_init(). 4817 * @param[in] mode The Unix permissions for the file, if we create it. 4818 * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive 4819 * @return 0 on success, non-zero on failure. 4820 */ 4821 static int ESECT 4822 mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) 4823 { 4824 #ifdef _WIN32 4825 # define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT 4826 #else 4827 # define MDB_ERRCODE_ROFS EROFS 4828 #endif 4829 int rc; 4830 off_t size, rsize; 4831 4832 rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd); 4833 if (rc) { 4834 /* Omit lockfile if read-only env on read-only filesystem */ 4835 if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { 4836 return MDB_SUCCESS; 4837 } 4838 goto fail; 4839 } 4840 4841 if (!(env->me_flags & MDB_NOTLS)) { 4842 rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); 4843 if (rc) 4844 goto fail; 4845 env->me_flags |= MDB_ENV_TXKEY; 4846 #ifdef _WIN32 4847 /* Windows TLS callbacks need help finding their TLS info. 
*/ 4848 if (mdb_tls_nkeys >= MAX_TLS_KEYS) { 4849 rc = MDB_TLS_FULL; 4850 goto fail; 4851 } 4852 mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; 4853 #endif 4854 } 4855 4856 /* Try to get exclusive lock. If we succeed, then 4857 * nobody is using the lock region and we should initialize it. 4858 */ 4859 if ((rc = mdb_env_excl_lock(env, excl))) goto fail; 4860 4861 #ifdef _WIN32 4862 size = GetFileSize(env->me_lfd, NULL); 4863 #else 4864 size = lseek(env->me_lfd, 0, SEEK_END); 4865 if (size == -1) goto fail_errno; 4866 #endif 4867 rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); 4868 if (size < rsize && *excl > 0) { 4869 #ifdef _WIN32 4870 if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize 4871 || !SetEndOfFile(env->me_lfd)) 4872 goto fail_errno; 4873 #else 4874 if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno; 4875 #endif 4876 } else { 4877 rsize = size; 4878 size = rsize - sizeof(MDB_txninfo); 4879 env->me_maxreaders = size/sizeof(MDB_reader) + 1; 4880 } 4881 { 4882 #ifdef _WIN32 4883 HANDLE mh; 4884 mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE, 4885 0, 0, NULL); 4886 if (!mh) goto fail_errno; 4887 env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL); 4888 CloseHandle(mh); 4889 if (!env->me_txns) goto fail_errno; 4890 #else 4891 void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, 4892 env->me_lfd, 0); 4893 if (m == MAP_FAILED) goto fail_errno; 4894 env->me_txns = m; 4895 #endif 4896 } 4897 if (*excl > 0) { 4898 #ifdef _WIN32 4899 BY_HANDLE_FILE_INFORMATION stbuf; 4900 struct { 4901 DWORD volume; 4902 DWORD nhigh; 4903 DWORD nlow; 4904 } idbuf; 4905 MDB_val val; 4906 char encbuf[11]; 4907 4908 if (!mdb_sec_inited) { 4909 InitializeSecurityDescriptor(&mdb_null_sd, 4910 SECURITY_DESCRIPTOR_REVISION); 4911 SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE); 4912 mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES); 4913 mdb_all_sa.bInheritHandle = FALSE; 4914 
mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd; 4915 mdb_sec_inited = 1; 4916 } 4917 if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno; 4918 idbuf.volume = stbuf.dwVolumeSerialNumber; 4919 idbuf.nhigh = stbuf.nFileIndexHigh; 4920 idbuf.nlow = stbuf.nFileIndexLow; 4921 val.mv_data = &idbuf; 4922 val.mv_size = sizeof(idbuf); 4923 mdb_hash_enc(&val, encbuf); 4924 sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf); 4925 sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf); 4926 env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname); 4927 if (!env->me_rmutex) goto fail_errno; 4928 env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname); 4929 if (!env->me_wmutex) goto fail_errno; 4930 #elif defined(MDB_USE_POSIX_SEM) 4931 struct stat stbuf; 4932 struct { 4933 dev_t dev; 4934 ino_t ino; 4935 } idbuf; 4936 MDB_val val; 4937 char encbuf[11]; 4938 4939 #if defined(__NetBSD__) 4940 #define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */ 4941 #endif 4942 if (fstat(env->me_lfd, &stbuf)) goto fail_errno; 4943 idbuf.dev = stbuf.st_dev; 4944 idbuf.ino = stbuf.st_ino; 4945 val.mv_data = &idbuf; 4946 val.mv_size = sizeof(idbuf); 4947 mdb_hash_enc(&val, encbuf); 4948 #ifdef MDB_SHORT_SEMNAMES 4949 encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */ 4950 #endif 4951 sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf); 4952 sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf); 4953 /* Clean up after a previous run, if needed: Try to 4954 * remove both semaphores before doing anything else. 
4955 */ 4956 sem_unlink(env->me_txns->mti_rmname); 4957 sem_unlink(env->me_txns->mti_wmname); 4958 env->me_rmutex = sem_open(env->me_txns->mti_rmname, 4959 O_CREAT|O_EXCL, mode, 1); 4960 if (env->me_rmutex == SEM_FAILED) goto fail_errno; 4961 env->me_wmutex = sem_open(env->me_txns->mti_wmname, 4962 O_CREAT|O_EXCL, mode, 1); 4963 if (env->me_wmutex == SEM_FAILED) goto fail_errno; 4964 #else /* MDB_USE_POSIX_MUTEX: */ 4965 pthread_mutexattr_t mattr; 4966 4967 /* Solaris needs this before initing a robust mutex. Otherwise 4968 * it may skip the init and return EBUSY "seems someone already 4969 * inited" or EINVAL "it was inited differently". 4970 */ 4971 memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex)); 4972 memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex)); 4973 4974 if ((rc = pthread_mutexattr_init(&mattr))) 4975 goto fail; 4976 4977 rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); 4978 #ifdef MDB_ROBUST_SUPPORTED 4979 if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); 4980 #endif 4981 if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr); 4982 if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr); 4983 pthread_mutexattr_destroy(&mattr); 4984 if (rc) 4985 goto fail; 4986 #endif /* _WIN32 || MDB_USE_POSIX_SEM */ 4987 4988 env->me_txns->mti_magic = MDB_MAGIC; 4989 env->me_txns->mti_format = MDB_LOCK_FORMAT; 4990 env->me_txns->mti_txnid = 0; 4991 env->me_txns->mti_numreaders = 0; 4992 4993 } else { 4994 if (env->me_txns->mti_magic != MDB_MAGIC) { 4995 DPUTS("lock region has invalid magic"); 4996 rc = MDB_INVALID; 4997 goto fail; 4998 } 4999 if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { 5000 DPRINTF(("lock region has format+version 0x%x, expected 0x%x", 5001 env->me_txns->mti_format, MDB_LOCK_FORMAT)); 5002 rc = MDB_VERSION_MISMATCH; 5003 goto fail; 5004 } 5005 rc = ErrCode(); 5006 if (rc && rc != EACCES && rc != EAGAIN) { 5007 goto fail; 5008 } 5009 #ifdef 
_WIN32 5010 env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname); 5011 if (!env->me_rmutex) goto fail_errno; 5012 env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname); 5013 if (!env->me_wmutex) goto fail_errno; 5014 #elif defined(MDB_USE_POSIX_SEM) 5015 env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0); 5016 if (env->me_rmutex == SEM_FAILED) goto fail_errno; 5017 env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0); 5018 if (env->me_wmutex == SEM_FAILED) goto fail_errno; 5019 #endif 5020 } 5021 return MDB_SUCCESS; 5022 5023 fail_errno: 5024 rc = ErrCode(); 5025 fail: 5026 return rc; 5027 } 5028 5029 /** Only a subset of the @ref mdb_env flags can be changed 5030 * at runtime. Changing other flags requires closing the 5031 * environment and re-opening it with the new flags. 5032 */ 5033 #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) 5034 #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ 5035 MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) 5036 5037 #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) 5038 # error "Persistent DB flags & env flags overlap, but both go in mm_flags" 5039 #endif 5040 5041 int ESECT 5042 mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) 5043 { 5044 int rc, excl = -1; 5045 MDB_name fname; 5046 5047 if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) 5048 return EINVAL; 5049 5050 flags |= env->me_flags; 5051 5052 rc = mdb_fname_init(path, flags, &fname); 5053 if (rc) 5054 return rc; 5055 5056 if (flags & MDB_RDONLY) { 5057 /* silently ignore WRITEMAP when we're only getting read access */ 5058 flags &= ~MDB_WRITEMAP; 5059 } else { 5060 if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && 5061 (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) 5062 rc = ENOMEM; 5063 } 5064 env->me_flags = flags |= MDB_ENV_ACTIVE; 5065 if (rc) 5066 goto leave; 5067 5068 env->me_path = 
strdup(path); 5069 env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); 5070 env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); 5071 env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int)); 5072 if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { 5073 rc = ENOMEM; 5074 goto leave; 5075 } 5076 env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_long; /* aligned MDB_INTEGERKEY */ 5077 5078 /* For RDONLY, get lockfile after we know datafile exists */ 5079 if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { 5080 rc = mdb_env_setup_locks(env, &fname, mode, &excl); 5081 if (rc) 5082 goto leave; 5083 } 5084 5085 rc = mdb_fopen(env, &fname, 5086 (flags & MDB_RDONLY) ? MDB_O_RDONLY : MDB_O_RDWR, 5087 mode, &env->me_fd); 5088 if (rc) 5089 goto leave; 5090 5091 if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { 5092 rc = mdb_env_setup_locks(env, &fname, mode, &excl); 5093 if (rc) 5094 goto leave; 5095 } 5096 5097 if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { 5098 if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { 5099 /* Synchronous fd for meta writes. Needed even with 5100 * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. 
5101 */ 5102 rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); 5103 if (rc) 5104 goto leave; 5105 } 5106 DPRINTF(("opened dbenv %p", (void *) env)); 5107 if (excl > 0) { 5108 rc = mdb_env_share_locks(env, &excl); 5109 if (rc) 5110 goto leave; 5111 } 5112 if (!(flags & MDB_RDONLY)) { 5113 MDB_txn *txn; 5114 int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * 5115 (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1); 5116 if ((env->me_pbuf = calloc(1, env->me_psize)) && 5117 (txn = calloc(1, size))) 5118 { 5119 txn->mt_dbs = (MDB_db *)((char *)txn + tsize); 5120 txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); 5121 txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); 5122 txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); 5123 txn->mt_env = env; 5124 txn->mt_dbxs = env->me_dbxs; 5125 txn->mt_flags = MDB_TXN_FINISHED; 5126 env->me_txn0 = txn; 5127 } else { 5128 rc = ENOMEM; 5129 } 5130 } 5131 } 5132 5133 leave: 5134 MDB_TRACE(("%p, %s, %u, %04o", env, path, flags & (CHANGEABLE|CHANGELESS), mode)); 5135 if (rc) { 5136 mdb_env_close0(env, excl); 5137 } 5138 mdb_fname_destroy(fname); 5139 return rc; 5140 } 5141 5142 /** Destroy resources from mdb_env_open(), clear our readers & DBIs */ 5143 static void ESECT 5144 mdb_env_close0(MDB_env *env, int excl) 5145 { 5146 int i; 5147 5148 if (!(env->me_flags & MDB_ENV_ACTIVE)) 5149 return; 5150 5151 /* Doing this here since me_dbxs may not exist during mdb_env_close */ 5152 if (env->me_dbxs) { 5153 for (i = env->me_maxdbs; --i >= CORE_DBS; ) 5154 free(env->me_dbxs[i].md_name.mv_data); 5155 free(env->me_dbxs); 5156 } 5157 5158 free(env->me_pbuf); 5159 free(env->me_dbiseqs); 5160 free(env->me_dbflags); 5161 free(env->me_path); 5162 free(env->me_dirty_list); 5163 free(env->me_txn0); 5164 mdb_midl_free(env->me_free_pgs); 5165 5166 if (env->me_flags & MDB_ENV_TXKEY) { 5167 pthread_key_delete(env->me_txkey); 5168 #ifdef _WIN32 5169 /* Delete our key from 
the global list */ 5170 for (i=0; i<mdb_tls_nkeys; i++) 5171 if (mdb_tls_keys[i] == env->me_txkey) { 5172 mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; 5173 mdb_tls_nkeys--; 5174 break; 5175 } 5176 #endif 5177 } 5178 5179 if (env->me_map) { 5180 munmap(env->me_map, env->me_mapsize); 5181 } 5182 if (env->me_mfd != INVALID_HANDLE_VALUE) 5183 (void) close(env->me_mfd); 5184 if (env->me_fd != INVALID_HANDLE_VALUE) 5185 (void) close(env->me_fd); 5186 if (env->me_txns) { 5187 MDB_PID_T pid = getpid(); 5188 /* Clearing readers is done in this function because 5189 * me_txkey with its destructor must be disabled first. 5190 * 5191 * We skip the the reader mutex, so we touch only 5192 * data owned by this process (me_close_readers and 5193 * our readers), and clear each reader atomically. 5194 */ 5195 for (i = env->me_close_readers; --i >= 0; ) 5196 if (env->me_txns->mti_readers[i].mr_pid == pid) 5197 env->me_txns->mti_readers[i].mr_pid = 0; 5198 #ifdef _WIN32 5199 if (env->me_rmutex) { 5200 CloseHandle(env->me_rmutex); 5201 if (env->me_wmutex) CloseHandle(env->me_wmutex); 5202 } 5203 /* Windows automatically destroys the mutexes when 5204 * the last handle closes. 5205 */ 5206 #elif defined(MDB_USE_POSIX_SEM) 5207 if (env->me_rmutex != SEM_FAILED) { 5208 sem_close(env->me_rmutex); 5209 if (env->me_wmutex != SEM_FAILED) 5210 sem_close(env->me_wmutex); 5211 /* If we have the filelock: If we are the 5212 * only remaining user, clean up semaphores. 5213 */ 5214 if (excl == 0) 5215 mdb_env_excl_lock(env, &excl); 5216 if (excl > 0) { 5217 sem_unlink(env->me_txns->mti_rmname); 5218 sem_unlink(env->me_txns->mti_wmname); 5219 } 5220 } 5221 #endif 5222 munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); 5223 } 5224 if (env->me_lfd != INVALID_HANDLE_VALUE) { 5225 #ifdef _WIN32 5226 if (excl >= 0) { 5227 /* Unlock the lockfile. Windows would have unlocked it 5228 * after closing anyway, but not necessarily at once. 
5229 */ 5230 UnlockFile(env->me_lfd, 0, 0, 1, 0); 5231 } 5232 #endif 5233 (void) close(env->me_lfd); 5234 } 5235 5236 env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); 5237 } 5238 5239 void ESECT 5240 mdb_env_close(MDB_env *env) 5241 { 5242 MDB_page *dp; 5243 5244 if (env == NULL) 5245 return; 5246 5247 MDB_TRACE(("%p", env)); 5248 VGMEMP_DESTROY(env); 5249 while ((dp = env->me_dpages) != NULL) { 5250 VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); 5251 env->me_dpages = dp->mp_next; 5252 free(dp); 5253 } 5254 5255 mdb_env_close0(env, 0); 5256 free(env); 5257 } 5258 5259 /** Compare two items pointing at aligned size_t's */ 5260 static int 5261 mdb_cmp_long(const MDB_val *a, const MDB_val *b) 5262 { 5263 return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 : 5264 *(size_t *)a->mv_data > *(size_t *)b->mv_data; 5265 } 5266 5267 /** Compare two items pointing at aligned unsigned int's. 5268 * 5269 * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp, 5270 * but #mdb_cmp_clong() is called instead if the data type is size_t. 5271 */ 5272 static int 5273 mdb_cmp_int(const MDB_val *a, const MDB_val *b) 5274 { 5275 return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 : 5276 *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data; 5277 } 5278 5279 /** Compare two items pointing at unsigned ints of unknown alignment. 5280 * Nodes and keys are guaranteed to be 2-byte aligned. 
5281 */ 5282 static int 5283 mdb_cmp_cint(const MDB_val *a, const MDB_val *b) 5284 { 5285 #if BYTE_ORDER == LITTLE_ENDIAN 5286 unsigned short *u, *c; 5287 int x; 5288 5289 u = (unsigned short *) ((char *) a->mv_data + a->mv_size); 5290 c = (unsigned short *) ((char *) b->mv_data + a->mv_size); 5291 do { 5292 x = *--u - *--c; 5293 } while(!x && u > (unsigned short *)a->mv_data); 5294 return x; 5295 #else 5296 unsigned short *u, *c, *end; 5297 int x; 5298 5299 end = (unsigned short *) ((char *) a->mv_data + a->mv_size); 5300 u = (unsigned short *)a->mv_data; 5301 c = (unsigned short *)b->mv_data; 5302 do { 5303 x = *u++ - *c++; 5304 } while(!x && u < end); 5305 return x; 5306 #endif 5307 } 5308 5309 /** Compare two items lexically */ 5310 static int 5311 mdb_cmp_memn(const MDB_val *a, const MDB_val *b) 5312 { 5313 int diff; 5314 ssize_t len_diff; 5315 unsigned int len; 5316 5317 len = a->mv_size; 5318 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; 5319 if (len_diff > 0) { 5320 len = b->mv_size; 5321 len_diff = 1; 5322 } 5323 5324 diff = memcmp(a->mv_data, b->mv_data, len); 5325 return diff ? diff : len_diff<0 ? -1 : len_diff; 5326 } 5327 5328 /** Compare two items in reverse byte order */ 5329 static int 5330 mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) 5331 { 5332 const unsigned char *p1, *p2, *p1_lim; 5333 ssize_t len_diff; 5334 int diff; 5335 5336 p1_lim = (const unsigned char *)a->mv_data; 5337 p1 = (const unsigned char *)a->mv_data + a->mv_size; 5338 p2 = (const unsigned char *)b->mv_data + b->mv_size; 5339 5340 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; 5341 if (len_diff > 0) { 5342 p1_lim += len_diff; 5343 len_diff = 1; 5344 } 5345 5346 while (p1 > p1_lim) { 5347 diff = *--p1 - *--p2; 5348 if (diff) 5349 return diff; 5350 } 5351 return len_diff<0 ? -1 : len_diff; 5352 } 5353 5354 /** Search for key within a page, using binary search. 5355 * Returns the smallest entry larger or equal to the key. 
5356 * If exactp is non-null, stores whether the found entry was an exact match 5357 * in *exactp (1 or 0). 5358 * Updates the cursor index with the index of the found entry. 5359 * If no entry larger or equal to the key is found, returns NULL. 5360 */ 5361 static MDB_node * 5362 mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) 5363 { 5364 unsigned int i = 0, nkeys; 5365 int low, high; 5366 int rc = 0; 5367 MDB_page *mp = mc->mc_pg[mc->mc_top]; 5368 MDB_node *node = NULL; 5369 MDB_val nodekey; 5370 MDB_cmp_func *cmp; 5371 DKBUF; 5372 5373 nkeys = NUMKEYS(mp); 5374 5375 DPRINTF(("searching %u keys in %s %spage %"Z"u", 5376 nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", 5377 mdb_dbg_pgno(mp))); 5378 5379 low = IS_LEAF(mp) ? 0 : 1; 5380 high = nkeys - 1; 5381 cmp = mc->mc_dbx->md_cmp; 5382 5383 /* Branch pages have no data, so if using integer keys, 5384 * alignment is guaranteed. Use faster mdb_cmp_int. 5385 */ 5386 if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { 5387 if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) 5388 cmp = mdb_cmp_long; 5389 else 5390 cmp = mdb_cmp_int; 5391 } 5392 5393 if (IS_LEAF2(mp)) { 5394 nodekey.mv_size = mc->mc_db->md_pad; 5395 node = NODEPTR(mp, 0); /* fake */ 5396 while (low <= high) { 5397 i = (low + high) >> 1; 5398 nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); 5399 rc = cmp(key, &nodekey); 5400 DPRINTF(("found leaf index %u [%s], rc = %i", 5401 i, DKEY(&nodekey), rc)); 5402 if (rc == 0) 5403 break; 5404 if (rc > 0) 5405 low = i + 1; 5406 else 5407 high = i - 1; 5408 } 5409 } else { 5410 while (low <= high) { 5411 i = (low + high) >> 1; 5412 5413 node = NODEPTR(mp, i); 5414 nodekey.mv_size = NODEKSZ(node); 5415 nodekey.mv_data = NODEKEY(node); 5416 5417 rc = cmp(key, &nodekey); 5418 #if MDB_DEBUG 5419 if (IS_LEAF(mp)) 5420 DPRINTF(("found leaf index %u [%s], rc = %i", 5421 i, DKEY(&nodekey), rc)); 5422 else 5423 DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i", 5424 i, DKEY(&nodekey), 
NODEPGNO(node), rc)); 5425 #endif 5426 if (rc == 0) 5427 break; 5428 if (rc > 0) 5429 low = i + 1; 5430 else 5431 high = i - 1; 5432 } 5433 } 5434 5435 if (rc > 0) { /* Found entry is less than the key. */ 5436 i++; /* Skip to get the smallest entry larger than key. */ 5437 if (!IS_LEAF2(mp)) 5438 node = NODEPTR(mp, i); 5439 } 5440 if (exactp) 5441 *exactp = (rc == 0 && nkeys > 0); 5442 /* store the key index */ 5443 mc->mc_ki[mc->mc_top] = i; 5444 if (i >= nkeys) 5445 /* There is no entry larger or equal to the key. */ 5446 return NULL; 5447 5448 /* nodeptr is fake for LEAF2 */ 5449 return node; 5450 } 5451 5452 #if 0 5453 static void 5454 mdb_cursor_adjust(MDB_cursor *mc, func) 5455 { 5456 MDB_cursor *m2; 5457 5458 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { 5459 if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { 5460 func(mc, m2); 5461 } 5462 } 5463 } 5464 #endif 5465 5466 /** Pop a page off the top of the cursor's stack. */ 5467 static void 5468 mdb_cursor_pop(MDB_cursor *mc) 5469 { 5470 if (mc->mc_snum) { 5471 DPRINTF(("popping page %"Z"u off db %d cursor %p", 5472 mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc)); 5473 5474 mc->mc_snum--; 5475 if (mc->mc_snum) { 5476 mc->mc_top--; 5477 } else { 5478 mc->mc_flags &= ~C_INITIALIZED; 5479 } 5480 } 5481 } 5482 5483 /** Push a page onto the top of the cursor's stack. 5484 * Set #MDB_TXN_ERROR on failure. 5485 */ 5486 static int 5487 mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) 5488 { 5489 DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, 5490 DDBI(mc), (void *) mc)); 5491 5492 if (mc->mc_snum >= CURSOR_STACK) { 5493 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 5494 return MDB_CURSOR_FULL; 5495 } 5496 5497 mc->mc_top = mc->mc_snum++; 5498 mc->mc_pg[mc->mc_top] = mp; 5499 mc->mc_ki[mc->mc_top] = 0; 5500 5501 return MDB_SUCCESS; 5502 } 5503 5504 /** Find the address of the page corresponding to a given page number. 5505 * Set #MDB_TXN_ERROR on failure. 
5506 * @param[in] mc the cursor accessing the page. 5507 * @param[in] pgno the page number for the page to retrieve. 5508 * @param[out] ret address of a pointer where the page's address will be stored. 5509 * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. 5510 * @return 0 on success, non-zero on failure. 5511 */ 5512 static int 5513 mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) 5514 { 5515 MDB_txn *txn = mc->mc_txn; 5516 MDB_env *env = txn->mt_env; 5517 MDB_page *p = NULL; 5518 int level; 5519 5520 if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) { 5521 MDB_txn *tx2 = txn; 5522 level = 1; 5523 do { 5524 MDB_ID2L dl = tx2->mt_u.dirty_list; 5525 unsigned x; 5526 /* Spilled pages were dirtied in this txn and flushed 5527 * because the dirty list got full. Bring this page 5528 * back in from the map (but don't unspill it here, 5529 * leave that unless page_touch happens again). 5530 */ 5531 if (tx2->mt_spill_pgs) { 5532 MDB_ID pn = pgno << 1; 5533 x = mdb_midl_search(tx2->mt_spill_pgs, pn); 5534 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { 5535 p = (MDB_page *)(env->me_map + env->me_psize * pgno); 5536 goto done; 5537 } 5538 } 5539 if (dl[0].mid) { 5540 unsigned x = mdb_mid2l_search(dl, pgno); 5541 if (x <= dl[0].mid && dl[x].mid == pgno) { 5542 p = dl[x].mptr; 5543 goto done; 5544 } 5545 } 5546 level++; 5547 } while ((tx2 = tx2->mt_parent) != NULL); 5548 } 5549 5550 if (pgno < txn->mt_next_pgno) { 5551 level = 0; 5552 p = (MDB_page *)(env->me_map + env->me_psize * pgno); 5553 } else { 5554 DPRINTF(("page %"Z"u not found", pgno)); 5555 txn->mt_flags |= MDB_TXN_ERROR; 5556 return MDB_PAGE_NOTFOUND; 5557 } 5558 5559 done: 5560 *ret = p; 5561 if (lvl) 5562 *lvl = level; 5563 return MDB_SUCCESS; 5564 } 5565 5566 /** Finish #mdb_page_search() / #mdb_page_search_lowest(). 5567 * The cursor is at the root page, set up the rest of it. 
/** Finish #mdb_page_search() / #mdb_page_search_lowest().
 *	The cursor is at the root page, set up the rest of it.
 *
 * Starting from the page at the top of the cursor stack, follows branch
 * pages downwards until a non-branch page is reached, pushing every
 * page visited onto the stack.
 * @param[in,out] mc the cursor; its top page is the starting point.
 * @param[in] key the key to descend towards. It is only searched for
 *	when neither MDB_PS_FIRST nor MDB_PS_LAST is set
 *	(#mdb_page_search_lowest() passes NULL).
 * @param[in] flags MDB_PS_FIRST / MDB_PS_LAST select the first / last
 *	child at every level; MDB_PS_MODIFY touches each page reached.
 * @return 0 on success, #MDB_CORRUPTED if the descent ends on a page that
 *	is not a leaf, otherwise the error from #mdb_page_get(),
 *	#mdb_cursor_push() or #mdb_page_touch().
 */
static int
mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags)
{
	MDB_page	*mp = mc->mc_pg[mc->mc_top];
	int rc;
	DKBUF;

	while (IS_BRANCH(mp)) {
		MDB_node	*node;
		indx_t		i;	/* slot of the child to follow */

		DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp)));
		/* Don't assert on branch pages in the FreeDB. We can get here
		 * while in the process of rebalancing a FreeDB branch page; we must
		 * let that proceed. ITS#8336
		 */
		mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1);
		DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))));

		if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
			i = 0;
			if (flags & MDB_PS_LAST) {
				i = NUMKEYS(mp) - 1;
				/* if already init'd, see if we're already in right place */
				if (mc->mc_flags & C_INITIALIZED) {
					if (mc->mc_ki[mc->mc_top] == i) {
						/* Reuse the page pointer already stored at
						 * the next stack level instead of fetching
						 * the child again.
						 */
						mc->mc_top = mc->mc_snum++;
						mp = mc->mc_pg[mc->mc_top];
						goto ready;
					}
				}
			}
		} else {
			int	 exact;
			node = mdb_node_search(mc, key, &exact);
			if (node == NULL)
				/* key is larger than every entry: take the last child */
				i = NUMKEYS(mp) - 1;
			else {
				i = mc->mc_ki[mc->mc_top];
				if (!exact) {
					/* The entry found is larger than the key,
					 * so the key belongs under the previous slot.
					 */
					mdb_cassert(mc, i > 0);
					i--;
				}
			}
			DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
		}

		mdb_cassert(mc, i < NUMKEYS(mp));
		node = NODEPTR(mp, i);

		if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)
			return rc;

		/* Record the slot taken in the parent before descending */
		mc->mc_ki[mc->mc_top] = i;
		if ((rc = mdb_cursor_push(mc, mp)))
			return rc;

ready:
		if (flags & MDB_PS_MODIFY) {
			if ((rc = mdb_page_touch(mc)) != 0)
				return rc;
			/* Re-read the top page after the touch.
			 * NOTE(review): presumably because touching can replace
			 * the page with a copy -- confirm in mdb_page_touch().
			 */
			mp = mc->mc_pg[mc->mc_top];
		}
	}

	if (!IS_LEAF(mp)) {
		DPRINTF(("internal error, index points to a %02X page!?",
		    mp->mp_flags));
		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
		return MDB_CORRUPTED;
	}

	DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
	    key ? DKEY(key) : "null"));
	mc->mc_flags |= C_INITIALIZED;
	mc->mc_flags &= ~C_EOF;

	return MDB_SUCCESS;
}

/** Search for the lowest key under the current branch page.
 * This just bypasses a NUMKEYS check in the current page
 * before calling mdb_page_search_root(), because the callers
 * are all in situations where the current page is known to
 * be underfilled.
 * @param[in,out] mc the cursor; its top page is the branch page to
 *	descend from.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_page_search_lowest(MDB_cursor *mc)
{
	MDB_page	*mp = mc->mc_pg[mc->mc_top];
	MDB_node	*node = NODEPTR(mp, 0);
	int rc;

	/* Step into the first child by hand, then let the generic
	 * descent continue from there.
	 */
	if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)
		return rc;

	mc->mc_ki[mc->mc_top] = 0;
	if ((rc = mdb_cursor_push(mc, mp)))
		return rc;
	return mdb_page_search_root(mc, NULL, MDB_PS_FIRST);
}

/** Search for the page a given key should be in.
 * Push it and its parent pages on the cursor stack.
 * @param[in,out] mc the cursor for this operation.
 * @param[in] key the key to search for, or NULL for first/last page.
 * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB
 *   are touched (updated with new page numbers).
 *   If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf.
 *   This is used by #mdb_cursor_first() and #mdb_cursor_last().
 *   If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
 * @return 0 on success, non-zero on failure: #MDB_BAD_TXN if the txn is
 *   blocked, #MDB_BAD_DBI or #MDB_INCOMPATIBLE if a stale DB record cannot
 *   be refreshed, #MDB_NOTFOUND if the tree is empty, otherwise the error
 *   from the helper that failed.
 */
static int
mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
{
	int		 rc;
	pgno_t		 root;

	/* Make sure the txn is still viable, then find the root from
	 * the txn's db table and set it as the root of the cursor's stack.
	 */
	if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) {
		DPUTS("transaction may not be used now");
		return MDB_BAD_TXN;
	} else {
		/* Make sure we're using an up-to-date root */
		if (*mc->mc_dbflag & DB_STALE) {
				MDB_cursor mc2;
				if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
					return MDB_BAD_DBI;
				/* Look this DB's name up in the main DB */
				mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
				rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
				if (rc)
					return rc;
				{
					MDB_val data;
					int exact = 0;
					/* Note: shadows the flags parameter inside this block */
					uint16_t flags;
					MDB_node *leaf = mdb_node_search(&mc2,
						&mc->mc_dbx->md_name, &exact);
					if (!exact)
						return MDB_BAD_DBI;
					if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
						return MDB_INCOMPATIBLE; /* not a named DB */
					rc = mdb_node_read(&mc2, leaf, &data);
					if (rc)
						return rc;
					/* Copy md_flags out with memcpy rather than
					 * reading it through a cast pointer.
					 */
					memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)),
						sizeof(uint16_t));
					/* The txn may not know this DBI, or another process may
					 * have dropped and recreated the DB with other flags.
					 */
					if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)
						return MDB_INCOMPATIBLE;
					memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
				}
				*mc->mc_dbflag &= ~DB_STALE;
		}
		root = mc->mc_db->md_root;

		if (root == P_INVALID) {		/* Tree is empty. */
			DPUTS("tree is empty");
			return MDB_NOTFOUND;
		}
	}

	mdb_cassert(mc, root > 1);
	/* Fetch the root only if the cursor does not already hold it */
	if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root)
		if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)
			return rc;

	/* Reset the stack so that it holds just the root */
	mc->mc_snum = 1;
	mc->mc_top = 0;

	DPRINTF(("db %d root page %"Z"u has flags 0x%X",
		DDBI(mc), root, mc->mc_pg[0]->mp_flags));

	if (flags & MDB_PS_MODIFY) {
		if ((rc = mdb_page_touch(mc)))
			return rc;
	}

	if (flags & MDB_PS_ROOTONLY)
		return MDB_SUCCESS;

	return mdb_page_search_root(mc, key, flags);
}

/** Release the run of overflow pages that starts at \b mp.
 *
 * The run consists of mp->mp_pages consecutive page numbers beginning
 * at mp->mp_pgno. The numbers are either merged into env->me_pghead or
 * appended to txn->mt_free_pgs, and the DB's md_overflow_pages count is
 * reduced by the length of the run.
 * @param[in] mc cursor supplying the transaction and the DB record.
 * @param[in] mp the first page of the overflow run.
 * @return 0 on success, #MDB_CORRUPTED if \b mp is flagged dirty but is
 *	not on the dirty list, otherwise the error from the ID-list helper
 *	that failed.
 */
static int
mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp)
{
	MDB_txn *txn = mc->mc_txn;
	pgno_t pg = mp->mp_pgno;
	unsigned x = 0, ovpages = mp->mp_pages;
	MDB_env *env = txn->mt_env;
	MDB_IDL sl = txn->mt_spill_pgs;
	MDB_ID pn = pg << 1;	/* form in which the spill list is searched */
	int rc;

	DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages));
	/* If the page is dirty or on the spill list we just acquired it,
	 * so we should give it back to our current free list, if any.
	 * Otherwise put it onto the list of pages we freed in this txn.
	 *
	 * Won't create me_pghead: me_pglast must be inited along with it.
	 * Unsupported in nested txns: They would need to hide the page
	 * range in ancestor txns' dirty and spilled lists.
	 */
	if (env->me_pghead &&
		!txn->mt_parent &&
		((mp->mp_flags & P_DIRTY) ||
		 (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn)))
	{
		unsigned i, j;
		pgno_t *mop;
		MDB_ID2 *dl, ix, iy;
		/* Make room for the whole run before changing any list */
		rc = mdb_midl_need(&env->me_pghead, ovpages);
		if (rc)
			return rc;
		if (!(mp->mp_flags & P_DIRTY)) {
			/* This page is no longer spilled */
			if (x == sl[0])
				sl[0]--;	/* last entry: just shorten the list */
			else
				sl[x] |= 1;	/* otherwise set the entry's low bit */
			goto release;
		}
		/* Remove from dirty list */
		dl = txn->mt_u.dirty_list;
		x = dl[0].mid--;
		/* Starting at the tail, move each entry one slot down until
		 * the entry for mp is reached; this closes the gap it leaves.
		 */
		for (ix = dl[x]; ix.mptr != mp; ix = iy) {
			if (x > 1) {
				x--;
				iy = dl[x];
				dl[x] = ix;
			} else {
				/* Ran off the front without finding mp: put the
				 * entry in hand back at the tail and fail.
				 */
				mdb_cassert(mc, x > 1);
				j = ++(dl[0].mid);
				dl[j] = ix;		/* Unsorted. OK when MDB_TXN_ERROR. */
				txn->mt_flags |= MDB_TXN_ERROR;
				return MDB_CORRUPTED;
			}
		}
		txn->mt_dirty_room++;
		if (!(env->me_flags & MDB_WRITEMAP))
			mdb_dpage_free(env, mp);
release:
		/* Insert in me_pghead */
		mop = env->me_pghead;
		j = mop[0] + ovpages;
		/* Move the entries below pg up by ovpages slots... */
		for (i = mop[0]; i && mop[i] < pg; i--)
			mop[j--] = mop[i];
		/* ...and fill the gap with the run, lowest number last */
		while (j>i)
			mop[j--] = pg++;
		mop[0] += ovpages;
	} else {
		rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages);
		if (rc)
			return rc;
	}
	mc->mc_db->md_overflow_pages -= ovpages;
	return 0;
}
5839 */ 5840 static int 5841 mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) 5842 { 5843 MDB_page *omp; /* overflow page */ 5844 pgno_t pgno; 5845 int rc; 5846 5847 if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { 5848 data->mv_size = NODEDSZ(leaf); 5849 data->mv_data = NODEDATA(leaf); 5850 return MDB_SUCCESS; 5851 } 5852 5853 /* Read overflow data. 5854 */ 5855 data->mv_size = NODEDSZ(leaf); 5856 memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); 5857 if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) { 5858 DPRINTF(("read overflow page %"Z"u failed", pgno)); 5859 return rc; 5860 } 5861 data->mv_data = METADATA(omp); 5862 5863 return MDB_SUCCESS; 5864 } 5865 5866 int 5867 mdb_get(MDB_txn *txn, MDB_dbi dbi, 5868 MDB_val *key, MDB_val *data) 5869 { 5870 MDB_cursor mc; 5871 MDB_xcursor mx; 5872 int exact = 0; 5873 DKBUF; 5874 5875 DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); 5876 5877 if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 5878 return EINVAL; 5879 5880 if (txn->mt_flags & MDB_TXN_BLOCKED) 5881 return MDB_BAD_TXN; 5882 5883 mdb_cursor_init(&mc, txn, dbi, &mx); 5884 return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); 5885 } 5886 5887 /** Find a sibling for a page. 5888 * Replaces the page at the top of the cursor's stack with the 5889 * specified sibling, if one exists. 5890 * @param[in] mc The cursor for this operation. 5891 * @param[in] move_right Non-zero if the right sibling is requested, 5892 * otherwise the left sibling. 5893 * @return 0 on success, non-zero on failure. 5894 */ 5895 static int 5896 mdb_cursor_sibling(MDB_cursor *mc, int move_right) 5897 { 5898 int rc; 5899 MDB_node *indx; 5900 MDB_page *mp; 5901 5902 if (mc->mc_snum < 2) { 5903 return MDB_NOTFOUND; /* root has no siblings */ 5904 } 5905 5906 mdb_cursor_pop(mc); 5907 DPRINTF(("parent page is page %"Z"u, index %u", 5908 mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); 5909 5910 if (move_right ? 
(mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) 5911 : (mc->mc_ki[mc->mc_top] == 0)) { 5912 DPRINTF(("no more keys left, moving to %s sibling", 5913 move_right ? "right" : "left")); 5914 if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { 5915 /* undo cursor_pop before returning */ 5916 mc->mc_top++; 5917 mc->mc_snum++; 5918 return rc; 5919 } 5920 } else { 5921 if (move_right) 5922 mc->mc_ki[mc->mc_top]++; 5923 else 5924 mc->mc_ki[mc->mc_top]--; 5925 DPRINTF(("just moving to %s index key %u", 5926 move_right ? "right" : "left", mc->mc_ki[mc->mc_top])); 5927 } 5928 mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); 5929 5930 indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 5931 if ((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) { 5932 /* mc will be inconsistent if caller does mc_snum++ as above */ 5933 mc->mc_flags &= ~(C_INITIALIZED|C_EOF); 5934 return rc; 5935 } 5936 5937 mdb_cursor_push(mc, mp); 5938 if (!move_right) 5939 mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; 5940 5941 return MDB_SUCCESS; 5942 } 5943 5944 /** Move the cursor to the next data item. 
/** Move the cursor to the next data item.
 * @param[in,out] mc the cursor to advance. If it is not initialized
 *	this is the same as #mdb_cursor_first().
 * @param[out] key receives the key at the new position.
 * @param[out] data receives the data at the new position. For a node
 *	without duplicates the read is skipped when this is NULL.
 * @param[in] op #MDB_NEXT steps to the next duplicate if there is one,
 *	else to the next key; #MDB_NEXT_DUP only steps within the
 *	duplicates of the current key; any other value goes straight to
 *	the next key.
 * @return 0 on success, #MDB_NOTFOUND when there is nothing further in
 *	the requested direction, otherwise the error from a helper.
 */
static int
mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
{
	MDB_page	*mp;
	MDB_node	*leaf;
	int rc;

	if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP))
		return MDB_NOTFOUND;

	if (!(mc->mc_flags & C_INITIALIZED))
		return mdb_cursor_first(mc, key, data);

	mp = mc->mc_pg[mc->mc_top];

	if (mc->mc_flags & C_EOF) {
		/* Still on or past the last key of the page: nothing to do.
		 * Otherwise drop the flag and carry on.
		 */
		if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1)
			return MDB_NOTFOUND;
		mc->mc_flags ^= C_EOF;
	}

	if (mc->mc_db->md_flags & MDB_DUPSORT) {
		leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
			if (op == MDB_NEXT || op == MDB_NEXT_DUP) {
				/* Advance the sub-cursor; its "key" is our data */
				rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT);
				/* MDB_NEXT falls through to the next key once the
				 * duplicates are used up; everything else returns here.
				 */
				if (op != MDB_NEXT || rc != MDB_NOTFOUND) {
					if (rc == MDB_SUCCESS)
						MDB_GET_KEY(leaf, key);
					return rc;
				}
			}
		} else {
			mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
			if (op == MDB_NEXT_DUP)
				return MDB_NOTFOUND;
		}
	}

	DPRINTF(("cursor_next: top page is %"Z"u in cursor %p",
		mdb_dbg_pgno(mp), (void *) mc));
	if (mc->mc_flags & C_DEL) {
		/* Clear the flag and return the item at the current index
		 * without advancing.
		 * NOTE(review): presumably a delete already left the cursor on
		 * the following item -- confirm where C_DEL is set.
		 */
		mc->mc_flags ^= C_DEL;
		goto skip;
	}

	if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
		DPUTS("=====> move to next sibling page");
		if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) {
			mc->mc_flags |= C_EOF;
			return rc;
		}
		mp = mc->mc_pg[mc->mc_top];
		DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
	} else
		mc->mc_ki[mc->mc_top]++;

skip:
	DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u",
	    mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));

	if (IS_LEAF2(mp)) {
		/* LEAF2 pages hold only fixed-size keys; data is left untouched */
		key->mv_size = mc->mc_db->md_pad;
		key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
		return MDB_SUCCESS;
	}

	mdb_cassert(mc, IS_LEAF(mp));
	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);

	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
		/* New key with duplicates: start at its first duplicate */
		mdb_xcursor_init1(mc, leaf);
		rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
		if (rc != MDB_SUCCESS)
			return rc;
	} else if (data) {
		if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)
			return rc;
	}

	MDB_GET_KEY(leaf, key);
	return MDB_SUCCESS;
}

/** Move the cursor to the previous data item.
 * @param[in,out] mc the cursor to move back. If it is not initialized
 *	it is first placed on the last item via #mdb_cursor_last().
 * @param[out] key receives the key at the new position.
 * @param[out] data receives the data at the new position. For a node
 *	without duplicates the read is skipped when this is NULL.
 * @param[in] op #MDB_PREV steps to the previous duplicate if there is
 *	one, else to the previous key; #MDB_PREV_DUP only steps within the
 *	duplicates of the current key; any other value goes straight to
 *	the previous key.
 * @return 0 on success, #MDB_NOTFOUND when there is nothing further in
 *	the requested direction, #MDB_CORRUPTED if the cursor ends up on a
 *	page that is not a leaf, otherwise the error from a helper.
 */
static int
mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
{
	MDB_page	*mp;
	MDB_node	*leaf;
	int rc;

	if (!(mc->mc_flags & C_INITIALIZED)) {
		rc = mdb_cursor_last(mc, key, data);
		if (rc)
			return rc;
		/* Step one past the last item so that the decrement below
		 * lands on it again.
		 */
		mc->mc_ki[mc->mc_top]++;
	}

	mp = mc->mc_pg[mc->mc_top];

	/* The index can be one past the last key (see above), in which
	 * case there is no current node to inspect.
	 */
	if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
		mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) {
		leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
			if (op == MDB_PREV || op == MDB_PREV_DUP) {
				/* Move the sub-cursor back; its "key" is our data */
				rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV);
				/* MDB_PREV falls through to the previous key once the
				 * duplicates are used up; everything else returns here.
				 */
				if (op != MDB_PREV || rc != MDB_NOTFOUND) {
					if (rc == MDB_SUCCESS) {
						MDB_GET_KEY(leaf, key);
						mc->mc_flags &= ~C_EOF;
					}
					return rc;
				}
			}
		} else {
			mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
			if (op == MDB_PREV_DUP)
				return MDB_NOTFOUND;
		}
	}

	DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p",
		mdb_dbg_pgno(mp), (void *) mc));

	mc->mc_flags &= ~(C_EOF|C_DEL);

	if (mc->mc_ki[mc->mc_top] == 0) {
		DPUTS("=====> move to prev sibling page");
		if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) {
			return rc;
		}
		mp = mc->mc_pg[mc->mc_top];
		mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1;
		DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
	} else
		mc->mc_ki[mc->mc_top]--;

	DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u",
	    mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));

	if (!IS_LEAF(mp))
		return MDB_CORRUPTED;

	if (IS_LEAF2(mp)) {
		/* LEAF2 pages hold only fixed-size keys; data is left untouched */
		key->mv_size = mc->mc_db->md_pad;
		key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
		return MDB_SUCCESS;
	}

	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);

	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
		/* New key with duplicates: start at its last duplicate */
		mdb_xcursor_init1(mc, leaf);
		rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
		if (rc != MDB_SUCCESS)
			return rc;
	} else if (data) {
		if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)
			return rc;
	}

	MDB_GET_KEY(leaf, key);
	return MDB_SUCCESS;
}
6147 */ 6148 mc->mc_ki[mc->mc_top] = 0; 6149 if (exactp) 6150 *exactp = 1; 6151 goto set1; 6152 } 6153 if (rc > 0) { 6154 unsigned int i; 6155 unsigned int nkeys = NUMKEYS(mp); 6156 if (nkeys > 1) { 6157 if (MP_FLAGS(mp) & P_LEAF2) { 6158 nodekey.mv_data = LEAF2KEY(mp, 6159 nkeys-1, nodekey.mv_size); 6160 } else { 6161 leaf = NODEPTR(mp, nkeys-1); 6162 MDB_GET_KEY2(leaf, nodekey); 6163 } 6164 rc = mc->mc_dbx->md_cmp(key, &nodekey); 6165 if (rc == 0) { 6166 /* last node was the one we wanted */ 6167 mc->mc_ki[mc->mc_top] = nkeys-1; 6168 if (exactp) 6169 *exactp = 1; 6170 goto set1; 6171 } 6172 if (rc < 0) { 6173 if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { 6174 /* This is definitely the right page, skip search_page */ 6175 if (MP_FLAGS(mp) & P_LEAF2) { 6176 nodekey.mv_data = LEAF2KEY(mp, 6177 mc->mc_ki[mc->mc_top], nodekey.mv_size); 6178 } else { 6179 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 6180 MDB_GET_KEY2(leaf, nodekey); 6181 } 6182 rc = mc->mc_dbx->md_cmp(key, &nodekey); 6183 if (rc == 0) { 6184 /* current node was the one we wanted */ 6185 if (exactp) 6186 *exactp = 1; 6187 goto set1; 6188 } 6189 } 6190 rc = 0; 6191 mc->mc_flags &= ~C_EOF; 6192 goto set2; 6193 } 6194 } 6195 /* If any parents have right-sibs, search. 6196 * Otherwise, there's nothing further. 
6197 */ 6198 for (i=0; i<mc->mc_top; i++) 6199 if (mc->mc_ki[i] < 6200 NUMKEYS(mc->mc_pg[i])-1) 6201 break; 6202 if (i == mc->mc_top) { 6203 /* There are no other pages */ 6204 mc->mc_ki[mc->mc_top] = nkeys; 6205 return MDB_NOTFOUND; 6206 } 6207 } 6208 if (!mc->mc_top) { 6209 /* There are no other pages */ 6210 mc->mc_ki[mc->mc_top] = 0; 6211 if (op == MDB_SET_RANGE && !exactp) { 6212 rc = 0; 6213 goto set1; 6214 } else 6215 return MDB_NOTFOUND; 6216 } 6217 } else { 6218 mc->mc_pg[0] = 0; 6219 } 6220 6221 rc = mdb_page_search(mc, key, 0); 6222 if (rc != MDB_SUCCESS) 6223 return rc; 6224 6225 mp = mc->mc_pg[mc->mc_top]; 6226 mdb_cassert(mc, IS_LEAF(mp)); 6227 6228 set2: 6229 leaf = mdb_node_search(mc, key, exactp); 6230 if (exactp != NULL && !*exactp) { 6231 /* MDB_SET specified and not an exact match. */ 6232 return MDB_NOTFOUND; 6233 } 6234 6235 if (leaf == NULL) { 6236 DPUTS("===> inexact leaf not found, goto sibling"); 6237 if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { 6238 mc->mc_flags |= C_EOF; 6239 return rc; /* no entries matched */ 6240 } 6241 mp = mc->mc_pg[mc->mc_top]; 6242 mdb_cassert(mc, IS_LEAF(mp)); 6243 leaf = NODEPTR(mp, 0); 6244 } 6245 6246 set1: 6247 mc->mc_flags |= C_INITIALIZED; 6248 mc->mc_flags &= ~C_EOF; 6249 6250 if (IS_LEAF2(mp)) { 6251 if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { 6252 key->mv_size = mc->mc_db->md_pad; 6253 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); 6254 } 6255 return MDB_SUCCESS; 6256 } 6257 6258 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6259 mdb_xcursor_init1(mc, leaf); 6260 if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { 6261 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); 6262 } else { 6263 int ex2, *ex2p; 6264 if (op == MDB_GET_BOTH) { 6265 ex2p = &ex2; 6266 ex2 = 0; 6267 } else { 6268 ex2p = NULL; 6269 } 6270 rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); 6271 if (rc != MDB_SUCCESS) 6272 return rc; 6273 } 6274 } else 
if (data) { 6275 if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { 6276 MDB_val olddata; 6277 MDB_cmp_func *dcmp; 6278 if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) 6279 return rc; 6280 dcmp = mc->mc_dbx->md_dcmp; 6281 #if UINT_MAX < SIZE_MAX 6282 if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) 6283 dcmp = mdb_cmp_clong; 6284 #endif 6285 rc = dcmp(data, &olddata); 6286 if (rc) { 6287 if (op == MDB_GET_BOTH || rc > 0) 6288 return MDB_NOTFOUND; 6289 rc = 0; 6290 } 6291 *data = olddata; 6292 6293 } else { 6294 if (mc->mc_xcursor) 6295 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 6296 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) 6297 return rc; 6298 } 6299 } 6300 6301 /* The key already matches in all other cases */ 6302 if (op == MDB_SET_RANGE || op == MDB_SET_KEY) 6303 MDB_GET_KEY(leaf, key); 6304 DPRINTF(("==> cursor placed on key [%s]", DKEY(key))); 6305 6306 return rc; 6307 } 6308 6309 /** Move the cursor to the first item in the database. 
*/
/* Details for mdb_cursor_first():
 * @param[in,out] mc The cursor to position.
 * @param[out] key Receives the first key; the LEAF2 path skips the
 *	store when this is NULL.
 * @param[out] data Receives the first data item when non-NULL.
 * @return MDB_SUCCESS, or the error returned by mdb_page_search(),
 *	mdb_node_read() or the nested mdb_cursor_first() call.
 */
static int
mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
{
	MDB_page	*top_pg;
	MDB_node	*first;
	int		 err;

	/* Repositioning invalidates whatever the nested dup cursor held */
	if (mc->mc_xcursor != NULL)
		mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);

	/* Descend to the leftmost leaf unless an initialized cursor is
	 * already sitting on a single-level stack.
	 */
	if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
		err = mdb_page_search(mc, NULL, MDB_PS_FIRST);
		if (err != MDB_SUCCESS)
			return err;
	}
	mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));

	top_pg = mc->mc_pg[mc->mc_top];
	first = NODEPTR(top_pg, 0);

	mc->mc_flags = (mc->mc_flags | C_INITIALIZED) & ~C_EOF;
	mc->mc_ki[mc->mc_top] = 0;

	if (IS_LEAF2(top_pg)) {
		/* Fixed-size keys only; there is no node or data to read */
		if (key != NULL) {
			key->mv_size = mc->mc_db->md_pad;
			key->mv_data = LEAF2KEY(top_pg, 0, key->mv_size);
		}
		return MDB_SUCCESS;
	}

	if (!F_ISSET(first->mn_flags, F_DUPDATA)) {
		if (data != NULL) {
			err = mdb_node_read(mc, first, data);
			if (err != MDB_SUCCESS)
				return err;
		}
	} else {
		/* Duplicates: hand back the first item of the nested cursor */
		mdb_xcursor_init1(mc, first);
		err = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
		if (err)
			return err;
	}

	MDB_GET_KEY(first, key);
	return MDB_SUCCESS;
}

/** Move the cursor to the last item in the database.
*/ 6355 static int 6356 mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) 6357 { 6358 int rc; 6359 MDB_node *leaf; 6360 6361 if (mc->mc_xcursor) 6362 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 6363 6364 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { 6365 rc = mdb_page_search(mc, NULL, MDB_PS_LAST); 6366 if (rc != MDB_SUCCESS) 6367 return rc; 6368 } 6369 mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); 6370 6371 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; 6372 mc->mc_flags |= C_INITIALIZED|C_EOF; 6373 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 6374 6375 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { 6376 if (key) { 6377 key->mv_size = mc->mc_db->md_pad; 6378 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); 6379 } 6380 return MDB_SUCCESS; 6381 } 6382 6383 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6384 mdb_xcursor_init1(mc, leaf); 6385 rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); 6386 if (rc) 6387 return rc; 6388 } else if (data) { 6389 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) 6390 return rc; 6391 } 6392 6393 MDB_GET_KEY(leaf, key); 6394 return MDB_SUCCESS; 6395 } 6396 6397 int 6398 mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, 6399 MDB_cursor_op op) 6400 { 6401 int rc; 6402 int exact = 0; 6403 int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); 6404 6405 if (mc == NULL) 6406 return EINVAL; 6407 6408 if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) 6409 return MDB_BAD_TXN; 6410 6411 switch (op) { 6412 case MDB_GET_CURRENT: 6413 if (!(mc->mc_flags & C_INITIALIZED)) { 6414 rc = EINVAL; 6415 } else { 6416 MDB_page *mp = mc->mc_pg[mc->mc_top]; 6417 int nkeys = NUMKEYS(mp); 6418 if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { 6419 mc->mc_ki[mc->mc_top] = nkeys; 6420 rc = MDB_NOTFOUND; 6421 break; 6422 } 6423 rc = MDB_SUCCESS; 6424 if (IS_LEAF2(mp)) { 6425 key->mv_size = mc->mc_db->md_pad; 6426 key->mv_data = LEAF2KEY(mp, 
mc->mc_ki[mc->mc_top], key->mv_size); 6427 } else { 6428 MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); 6429 MDB_GET_KEY(leaf, key); 6430 if (data) { 6431 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6432 rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); 6433 } else { 6434 rc = mdb_node_read(mc, leaf, data); 6435 } 6436 } 6437 } 6438 } 6439 break; 6440 case MDB_GET_BOTH: 6441 case MDB_GET_BOTH_RANGE: 6442 if (data == NULL) { 6443 rc = EINVAL; 6444 break; 6445 } 6446 if (mc->mc_xcursor == NULL) { 6447 rc = MDB_INCOMPATIBLE; 6448 break; 6449 } 6450 /* FALLTHRU */ 6451 case MDB_SET: 6452 case MDB_SET_KEY: 6453 case MDB_SET_RANGE: 6454 if (key == NULL) { 6455 rc = EINVAL; 6456 } else { 6457 rc = mdb_cursor_set(mc, key, data, op, 6458 op == MDB_SET_RANGE ? NULL : &exact); 6459 } 6460 break; 6461 case MDB_GET_MULTIPLE: 6462 if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { 6463 rc = EINVAL; 6464 break; 6465 } 6466 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 6467 rc = MDB_INCOMPATIBLE; 6468 break; 6469 } 6470 rc = MDB_SUCCESS; 6471 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || 6472 (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) 6473 break; 6474 goto fetchm; 6475 case MDB_NEXT_MULTIPLE: 6476 if (data == NULL) { 6477 rc = EINVAL; 6478 break; 6479 } 6480 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 6481 rc = MDB_INCOMPATIBLE; 6482 break; 6483 } 6484 rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); 6485 if (rc == MDB_SUCCESS) { 6486 if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { 6487 MDB_cursor *mx; 6488 fetchm: 6489 mx = &mc->mc_xcursor->mx_cursor; 6490 data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * 6491 mx->mc_db->md_pad; 6492 data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); 6493 mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; 6494 } else { 6495 rc = MDB_NOTFOUND; 6496 } 6497 } 6498 break; 6499 case MDB_PREV_MULTIPLE: 6500 if (data == NULL) { 6501 rc = EINVAL; 6502 break; 6503 } 6504 if 
(!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 6505 rc = MDB_INCOMPATIBLE; 6506 break; 6507 } 6508 if (!(mc->mc_flags & C_INITIALIZED)) 6509 rc = mdb_cursor_last(mc, key, data); 6510 else 6511 rc = MDB_SUCCESS; 6512 if (rc == MDB_SUCCESS) { 6513 MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; 6514 if (mx->mc_flags & C_INITIALIZED) { 6515 rc = mdb_cursor_sibling(mx, 0); 6516 if (rc == MDB_SUCCESS) 6517 goto fetchm; 6518 } else { 6519 rc = MDB_NOTFOUND; 6520 } 6521 } 6522 break; 6523 case MDB_NEXT: 6524 case MDB_NEXT_DUP: 6525 case MDB_NEXT_NODUP: 6526 rc = mdb_cursor_next(mc, key, data, op); 6527 break; 6528 case MDB_PREV: 6529 case MDB_PREV_DUP: 6530 case MDB_PREV_NODUP: 6531 rc = mdb_cursor_prev(mc, key, data, op); 6532 break; 6533 case MDB_FIRST: 6534 rc = mdb_cursor_first(mc, key, data); 6535 break; 6536 case MDB_FIRST_DUP: 6537 mfunc = mdb_cursor_first; 6538 mmove: 6539 if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { 6540 rc = EINVAL; 6541 break; 6542 } 6543 if (mc->mc_xcursor == NULL) { 6544 rc = MDB_INCOMPATIBLE; 6545 break; 6546 } 6547 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) { 6548 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); 6549 rc = MDB_NOTFOUND; 6550 break; 6551 } 6552 mc->mc_flags &= ~C_EOF; 6553 { 6554 MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 6555 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6556 MDB_GET_KEY(leaf, key); 6557 rc = mdb_node_read(mc, leaf, data); 6558 break; 6559 } 6560 } 6561 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { 6562 rc = EINVAL; 6563 break; 6564 } 6565 rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); 6566 break; 6567 case MDB_LAST: 6568 rc = mdb_cursor_last(mc, key, data); 6569 break; 6570 case MDB_LAST_DUP: 6571 mfunc = mdb_cursor_last; 6572 goto mmove; 6573 default: 6574 DPRINTF(("unhandled/unimplemented cursor operation %u", op)); 6575 rc = EINVAL; 6576 break; 6577 } 6578 6579 if (mc->mc_flags & C_DEL) 6580 mc->mc_flags ^= C_DEL; 6581 6582 return 
rc; 6583 } 6584 6585 /** Touch all the pages in the cursor stack. Set mc_top. 6586 * Makes sure all the pages are writable, before attempting a write operation. 6587 * @param[in] mc The cursor to operate on. 6588 */ 6589 static int 6590 mdb_cursor_touch(MDB_cursor *mc) 6591 { 6592 int rc = MDB_SUCCESS; 6593 6594 if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { 6595 /* Touch DB record of named DB */ 6596 MDB_cursor mc2; 6597 MDB_xcursor mcx; 6598 if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) 6599 return MDB_BAD_DBI; 6600 mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); 6601 rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); 6602 if (rc) 6603 return rc; 6604 *mc->mc_dbflag |= DB_DIRTY; 6605 } 6606 mc->mc_top = 0; 6607 if (mc->mc_snum) { 6608 do { 6609 rc = mdb_page_touch(mc); 6610 } while (!rc && ++(mc->mc_top) < mc->mc_snum); 6611 mc->mc_top = mc->mc_snum-1; 6612 } 6613 return rc; 6614 } 6615 6616 /** Do not spill pages to disk if txn is getting full, may fail instead */ 6617 #define MDB_NOSPILL 0x8000 6618 6619 static int 6620 _mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, 6621 unsigned int flags) 6622 { 6623 MDB_env *env; 6624 MDB_node *leaf = NULL; 6625 MDB_page *fp, *mp, *sub_root = NULL; 6626 uint16_t fp_flags; 6627 MDB_val xdata, *rdata, dkey, olddata; 6628 MDB_db dummy; 6629 int do_sub = 0, insert_key, insert_data; 6630 unsigned int mcount = 0, dcount = 0, nospill; 6631 size_t nsize; 6632 int rc, rc2; 6633 unsigned int nflags; 6634 DKBUF; 6635 6636 if (mc == NULL || key == NULL) 6637 return EINVAL; 6638 6639 env = mc->mc_txn->mt_env; 6640 6641 /* Check this first so counter will always be zero on any 6642 * early failures. 
6643 */ 6644 if (flags & MDB_MULTIPLE) { 6645 dcount = data[1].mv_size; 6646 data[1].mv_size = 0; 6647 if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) 6648 return MDB_INCOMPATIBLE; 6649 } 6650 6651 nospill = flags & MDB_NOSPILL; 6652 flags &= ~MDB_NOSPILL; 6653 6654 if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) 6655 return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; 6656 6657 if (key->mv_size-1 >= ENV_MAXKEY(env)) 6658 return MDB_BAD_VALSIZE; 6659 6660 #if SIZE_MAX > MAXDATASIZE 6661 if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE)) 6662 return MDB_BAD_VALSIZE; 6663 #else 6664 if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env)) 6665 return MDB_BAD_VALSIZE; 6666 #endif 6667 6668 DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", 6669 DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size)); 6670 6671 dkey.mv_size = 0; 6672 6673 if (flags & MDB_CURRENT) { 6674 if (!(mc->mc_flags & C_INITIALIZED)) 6675 return EINVAL; 6676 rc = MDB_SUCCESS; 6677 } else if (mc->mc_db->md_root == P_INVALID) { 6678 /* new database, cursor has nothing to point to */ 6679 mc->mc_snum = 0; 6680 mc->mc_top = 0; 6681 mc->mc_flags &= ~C_INITIALIZED; 6682 rc = MDB_NO_ROOT; 6683 } else { 6684 int exact = 0; 6685 MDB_val d2; 6686 if (flags & MDB_APPEND) { 6687 MDB_val k2; 6688 rc = mdb_cursor_last(mc, &k2, &d2); 6689 if (rc == 0) { 6690 rc = mc->mc_dbx->md_cmp(key, &k2); 6691 if (rc > 0) { 6692 rc = MDB_NOTFOUND; 6693 mc->mc_ki[mc->mc_top]++; 6694 } else { 6695 /* new key is <= last key */ 6696 rc = MDB_KEYEXIST; 6697 } 6698 } 6699 } else { 6700 rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); 6701 } 6702 if ((flags & MDB_NOOVERWRITE) && rc == 0) { 6703 DPRINTF(("duplicate key [%s]", DKEY(key))); 6704 *data = d2; 6705 return MDB_KEYEXIST; 6706 } 6707 if (rc && rc != MDB_NOTFOUND) 6708 return rc; 6709 } 6710 6711 if (mc->mc_flags & C_DEL) 6712 mc->mc_flags ^= C_DEL; 6713 6714 /* 
Cursor is positioned, check for room in the dirty list */ 6715 if (!nospill) { 6716 if (flags & MDB_MULTIPLE) { 6717 rdata = &xdata; 6718 xdata.mv_size = data->mv_size * dcount; 6719 } else { 6720 rdata = data; 6721 } 6722 if ((rc2 = mdb_page_spill(mc, key, rdata))) 6723 return rc2; 6724 } 6725 6726 if (rc == MDB_NO_ROOT) { 6727 MDB_page *np; 6728 /* new database, write a root leaf page */ 6729 DPUTS("allocating new root leaf page"); 6730 if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { 6731 return rc2; 6732 } 6733 mdb_cursor_push(mc, np); 6734 mc->mc_db->md_root = np->mp_pgno; 6735 mc->mc_db->md_depth++; 6736 *mc->mc_dbflag |= DB_DIRTY; 6737 if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) 6738 == MDB_DUPFIXED) 6739 MP_FLAGS(np) |= P_LEAF2; 6740 mc->mc_flags |= C_INITIALIZED; 6741 } else { 6742 /* make sure all cursor pages are writable */ 6743 rc2 = mdb_cursor_touch(mc); 6744 if (rc2) 6745 return rc2; 6746 } 6747 6748 insert_key = insert_data = rc; 6749 if (insert_key) { 6750 /* The key does not exist */ 6751 DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top])); 6752 if ((mc->mc_db->md_flags & MDB_DUPSORT) && 6753 LEAFSIZE(key, data) > env->me_nodemax) 6754 { 6755 /* Too big for a node, insert in sub-DB. Set up an empty 6756 * "old sub-page" for prep_subDB to expand to a full page. 
6757 */ 6758 fp_flags = P_LEAF|P_DIRTY; 6759 fp = env->me_pbuf; 6760 fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */ 6761 MP_LOWER(fp) = MP_UPPER(fp) = (PAGEHDRSZ-PAGEBASE); 6762 olddata.mv_size = PAGEHDRSZ; 6763 goto prep_subDB; 6764 } 6765 } else { 6766 /* there's only a key anyway, so this is a no-op */ 6767 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { 6768 char *ptr; 6769 unsigned int ksize = mc->mc_db->md_pad; 6770 if (key->mv_size != ksize) 6771 return MDB_BAD_VALSIZE; 6772 ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); 6773 memcpy(ptr, key->mv_data, ksize); 6774 fix_parent: 6775 /* if overwriting slot 0 of leaf, need to 6776 * update branch key if there is a parent page 6777 */ 6778 if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { 6779 unsigned short dtop = 1; 6780 mc->mc_top--; 6781 /* slot 0 is always an empty key, find real slot */ 6782 while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { 6783 mc->mc_top--; 6784 dtop++; 6785 } 6786 if (mc->mc_ki[mc->mc_top]) 6787 rc2 = mdb_update_key(mc, key); 6788 else 6789 rc2 = MDB_SUCCESS; 6790 mc->mc_top += dtop; 6791 if (rc2) 6792 return rc2; 6793 } 6794 return MDB_SUCCESS; 6795 } 6796 6797 more: 6798 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 6799 olddata.mv_size = NODEDSZ(leaf); 6800 olddata.mv_data = NODEDATA(leaf); 6801 6802 /* DB has dups? */ 6803 if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { 6804 /* Prepare (sub-)page/sub-DB to accept the new item, 6805 * if needed. fp: old sub-page or a header faking 6806 * it. mp: new (sub-)page. offset: growth in page 6807 * size. xdata: node data with new page or DB. 
6808 */ 6809 unsigned i, offset = 0; 6810 mp = fp = xdata.mv_data = env->me_pbuf; 6811 mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; 6812 6813 /* Was a single item before, must convert now */ 6814 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { 6815 MDB_cmp_func *dcmp; 6816 /* Just overwrite the current item */ 6817 if (flags == MDB_CURRENT) 6818 goto current; 6819 dcmp = mc->mc_dbx->md_dcmp; 6820 #if UINT_MAX < SIZE_MAX 6821 if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) 6822 dcmp = mdb_cmp_clong; 6823 #endif 6824 /* does data match? */ 6825 if (!dcmp(data, &olddata)) { 6826 if (flags & (MDB_NODUPDATA|MDB_APPENDDUP)) 6827 return MDB_KEYEXIST; 6828 /* overwrite it */ 6829 goto current; 6830 } 6831 6832 /* Back up original data item */ 6833 dkey.mv_size = olddata.mv_size; 6834 dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); 6835 6836 /* Make sub-page header for the dup items, with dummy body */ 6837 MP_FLAGS(fp) = P_LEAF|P_DIRTY|P_SUBP; 6838 MP_LOWER(fp) = (PAGEHDRSZ-PAGEBASE); 6839 xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; 6840 if (mc->mc_db->md_flags & MDB_DUPFIXED) { 6841 MP_FLAGS(fp) |= P_LEAF2; 6842 fp->mp_pad = data->mv_size; 6843 xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ 6844 } else { 6845 xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + 6846 (dkey.mv_size & 1) + (data->mv_size & 1); 6847 } 6848 MP_UPPER(fp) = xdata.mv_size - PAGEBASE; 6849 olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ 6850 } else if (leaf->mn_flags & F_SUBDATA) { 6851 /* Data is on sub-DB, just store it */ 6852 flags |= F_DUPDATA|F_SUBDATA; 6853 goto put_sub; 6854 } else { 6855 /* Data is on sub-page */ 6856 fp = olddata.mv_data; 6857 switch (flags) { 6858 default: 6859 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { 6860 offset = EVEN(NODESIZE + sizeof(indx_t) + 6861 data->mv_size); 6862 break; 6863 } 6864 offset = fp->mp_pad; 6865 if (SIZELEFT(fp) < offset) { 6866 offset *= 4; /* space for 4 more */ 6867 break; 
6868 } 6869 /* FALLTHRU */ /* Big enough MDB_DUPFIXED sub-page */ 6870 case MDB_CURRENT: 6871 MP_FLAGS(fp) |= P_DIRTY; 6872 COPY_PGNO(MP_PGNO(fp), MP_PGNO(mp)); 6873 mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; 6874 flags |= F_DUPDATA; 6875 goto put_sub; 6876 } 6877 xdata.mv_size = olddata.mv_size + offset; 6878 } 6879 6880 fp_flags = MP_FLAGS(fp); 6881 if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { 6882 /* Too big for a sub-page, convert to sub-DB */ 6883 fp_flags &= ~P_SUBP; 6884 prep_subDB: 6885 if (mc->mc_db->md_flags & MDB_DUPFIXED) { 6886 fp_flags |= P_LEAF2; 6887 dummy.md_pad = fp->mp_pad; 6888 dummy.md_flags = MDB_DUPFIXED; 6889 if (mc->mc_db->md_flags & MDB_INTEGERDUP) 6890 dummy.md_flags |= MDB_INTEGERKEY; 6891 } else { 6892 dummy.md_pad = 0; 6893 dummy.md_flags = 0; 6894 } 6895 dummy.md_depth = 1; 6896 dummy.md_branch_pages = 0; 6897 dummy.md_leaf_pages = 1; 6898 dummy.md_overflow_pages = 0; 6899 dummy.md_entries = NUMKEYS(fp); 6900 xdata.mv_size = sizeof(MDB_db); 6901 xdata.mv_data = &dummy; 6902 if ((rc = mdb_page_alloc(mc, 1, &mp))) 6903 return rc; 6904 offset = env->me_psize - olddata.mv_size; 6905 flags |= F_DUPDATA|F_SUBDATA; 6906 dummy.md_root = mp->mp_pgno; 6907 sub_root = mp; 6908 } 6909 if (mp != fp) { 6910 MP_FLAGS(mp) = fp_flags | P_DIRTY; 6911 MP_PAD(mp) = MP_PAD(fp); 6912 MP_LOWER(mp) = MP_LOWER(fp); 6913 MP_UPPER(mp) = MP_UPPER(fp) + offset; 6914 if (fp_flags & P_LEAF2) { 6915 memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad); 6916 } else { 6917 memcpy((char *)mp + MP_UPPER(mp) + PAGEBASE, (char *)fp + MP_UPPER(fp) + PAGEBASE, 6918 olddata.mv_size - MP_UPPER(fp) - PAGEBASE); 6919 memcpy((char *)MP_PTRS(mp), (char *)MP_PTRS(fp), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0])); 6920 for (i=0; i<NUMKEYS(fp); i++) 6921 mp->mp_ptrs[i] += offset; 6922 } 6923 } 6924 6925 rdata = &xdata; 6926 flags |= F_DUPDATA; 6927 do_sub = 1; 6928 if (!insert_key) 6929 mdb_node_del(mc, 0); 6930 goto new_sub; 6931 } 6932 current: 6933 /* LMDB 
passes F_SUBDATA in 'flags' to write a DB record */ 6934 if ((leaf->mn_flags ^ flags) & F_SUBDATA) 6935 return MDB_INCOMPATIBLE; 6936 /* overflow page overwrites need special handling */ 6937 if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { 6938 MDB_page *omp; 6939 pgno_t pg; 6940 int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); 6941 6942 memcpy(&pg, olddata.mv_data, sizeof(pg)); 6943 if ((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0) 6944 return rc2; 6945 ovpages = omp->mp_pages; 6946 6947 /* Is the ov page large enough? */ 6948 if (ovpages >= dpages) { 6949 if (!(omp->mp_flags & P_DIRTY) && 6950 (level || (env->me_flags & MDB_WRITEMAP))) 6951 { 6952 rc = mdb_page_unspill(mc->mc_txn, omp, &omp); 6953 if (rc) 6954 return rc; 6955 level = 0; /* dirty in this txn or clean */ 6956 } 6957 /* Is it dirty? */ 6958 if (omp->mp_flags & P_DIRTY) { 6959 /* yes, overwrite it. Note in this case we don't 6960 * bother to try shrinking the page if the new data 6961 * is smaller than the overflow threshold. 6962 */ 6963 if (level > 1) { 6964 /* It is writable only in a parent txn */ 6965 size_t sz = (size_t) env->me_psize * ovpages, off; 6966 MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); 6967 MDB_ID2 id2; 6968 if (!np) 6969 return ENOMEM; 6970 id2.mid = pg; 6971 id2.mptr = np; 6972 /* Note - this page is already counted in parent's dirty_room */ 6973 rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); 6974 mdb_cassert(mc, rc2 == 0); 6975 /* Currently we make the page look as with put() in the 6976 * parent txn, in case the user peeks at MDB_RESERVEd 6977 * or unused parts. Some users treat ovpages specially. 6978 */ 6979 if (!(flags & MDB_RESERVE)) { 6980 /* Skip the part where LMDB will put *data. 6981 * Copy end of page, adjusting alignment so 6982 * compiler may copy words instead of bytes. 
6983 */ 6984 off = (PAGEHDRSZ + data->mv_size) & -(int)sizeof(size_t); 6985 memcpy((size_t *)((char *)np + off), 6986 (size_t *)((char *)omp + off), sz - off); 6987 sz = PAGEHDRSZ; 6988 } 6989 memcpy(np, omp, sz); /* Copy beginning of page */ 6990 omp = np; 6991 } 6992 SETDSZ(leaf, data->mv_size); 6993 if (F_ISSET(flags, MDB_RESERVE)) 6994 data->mv_data = METADATA(omp); 6995 else 6996 memcpy(METADATA(omp), data->mv_data, data->mv_size); 6997 return MDB_SUCCESS; 6998 } 6999 } 7000 if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) 7001 return rc2; 7002 } else if (data->mv_size == olddata.mv_size) { 7003 /* same size, just replace it. Note that we could 7004 * also reuse this node if the new data is smaller, 7005 * but instead we opt to shrink the node in that case. 7006 */ 7007 if (F_ISSET(flags, MDB_RESERVE)) 7008 data->mv_data = olddata.mv_data; 7009 else if (!(mc->mc_flags & C_SUB)) 7010 memcpy(olddata.mv_data, data->mv_data, data->mv_size); 7011 else { 7012 if (key->mv_size != NODEKSZ(leaf)) 7013 goto new_ksize; 7014 memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); 7015 goto fix_parent; 7016 } 7017 return MDB_SUCCESS; 7018 } 7019 new_ksize: 7020 mdb_node_del(mc, 0); 7021 } 7022 7023 rdata = data; 7024 7025 new_sub: 7026 nflags = flags & NODE_ADD_FLAGS; 7027 nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); 7028 if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { 7029 if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) 7030 nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ 7031 if (!insert_key) 7032 nflags |= MDB_SPLIT_REPLACE; 7033 rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); 7034 } else { 7035 /* There is room already in this leaf page. 
*/ 7036 rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); 7037 if (rc == 0) { 7038 /* Adjust other cursors pointing to mp */ 7039 MDB_cursor *m2, *m3; 7040 MDB_dbi dbi = mc->mc_dbi; 7041 unsigned i = mc->mc_top; 7042 MDB_page *mp = mc->mc_pg[i]; 7043 7044 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 7045 if (mc->mc_flags & C_SUB) 7046 m3 = &m2->mc_xcursor->mx_cursor; 7047 else 7048 m3 = m2; 7049 if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; 7050 if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { 7051 m3->mc_ki[i]++; 7052 } 7053 XCURSOR_REFRESH(m3, i, mp); 7054 } 7055 } 7056 } 7057 7058 if (rc == MDB_SUCCESS) { 7059 /* Now store the actual data in the child DB. Note that we're 7060 * storing the user data in the keys field, so there are strict 7061 * size limits on dupdata. The actual data fields of the child 7062 * DB are all zero size. 7063 */ 7064 if (do_sub) { 7065 int xflags, new_dupdata; 7066 size_t ecount; 7067 put_sub: 7068 xdata.mv_size = 0; 7069 xdata.mv_data = ""; 7070 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 7071 if ((flags & (MDB_CURRENT|MDB_APPENDDUP)) == MDB_CURRENT) { 7072 xflags = MDB_CURRENT|MDB_NOSPILL; 7073 } else { 7074 mdb_xcursor_init1(mc, leaf); 7075 xflags = (flags & MDB_NODUPDATA) ? 
7076 MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; 7077 } 7078 if (sub_root) 7079 mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; 7080 new_dupdata = (int)dkey.mv_size; 7081 /* converted, write the original data first */ 7082 if (dkey.mv_size) { 7083 rc = _mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); 7084 if (rc) 7085 goto bad_sub; 7086 /* we've done our job */ 7087 dkey.mv_size = 0; 7088 } 7089 if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { 7090 /* Adjust other cursors pointing to mp */ 7091 MDB_cursor *m2; 7092 MDB_xcursor *mx = mc->mc_xcursor; 7093 unsigned i = mc->mc_top; 7094 MDB_page *mp = mc->mc_pg[i]; 7095 7096 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { 7097 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; 7098 if (!(m2->mc_flags & C_INITIALIZED)) continue; 7099 if (m2->mc_pg[i] == mp) { 7100 if (m2->mc_ki[i] == mc->mc_ki[i]) { 7101 mdb_xcursor_init2(m2, mx, new_dupdata); 7102 } else if (!insert_key) { 7103 XCURSOR_REFRESH(m2, i, mp); 7104 } 7105 } 7106 } 7107 } 7108 ecount = mc->mc_xcursor->mx_db.md_entries; 7109 if (flags & MDB_APPENDDUP) 7110 xflags |= MDB_APPEND; 7111 rc = _mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); 7112 if (flags & F_SUBDATA) { 7113 void *db = NODEDATA(leaf); 7114 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); 7115 } 7116 insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; 7117 } 7118 /* Increment count unless we just replaced an existing item. */ 7119 if (insert_data) 7120 mc->mc_db->md_entries++; 7121 if (insert_key) { 7122 /* Invalidate txn if we created an empty sub-DB */ 7123 if (rc) 7124 goto bad_sub; 7125 /* If we succeeded and the key didn't exist before, 7126 * make sure the cursor is marked valid. 
/** Store a key/data pair through a cursor (public entry point).
 * All of the work is done by _mdb_cursor_put(); this wrapper only adds
 * the MDB_TRACE() record and returns the inner status unchanged.
 * @param[in] mc The cursor to operate on.
 * @param[in] key The key, passed through to _mdb_cursor_put().
 * @param[in] data The data, passed through to _mdb_cursor_put().
 * @param[in] flags Put flags, passed through to _mdb_cursor_put().
 * @return The value returned by _mdb_cursor_put().
 */
int
mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
	unsigned int flags)
{
	/* NOTE(review): DKBUF/DDBUF presumably declare the scratch buffers
	 * (e.g. dbuf) used by DKEY()/mdb_dval() below - confirm at their
	 * definitions.
	 */
	DKBUF;
	DDBUF;
	int rc = _mdb_cursor_put(mc, key, data, flags);
	/* The trace is emitted after the put; NULL key/data are rendered
	 * with size 0 and, for data, an empty string.
	 */
	MDB_TRACE(("%p, %"Z"u[%s], %"Z"u%s, %u",
		mc, key ? key->mv_size:0, DKEY(key), data ? data->mv_size:0,
		data ? mdb_dval(mc->mc_txn, mc->mc_dbi, data, dbuf):"", flags));
	return rc;
}

/** Delete the item the cursor currently points at.
 * For a node flagged #F_DUPDATA either a single duplicate is removed
 * through the sub-cursor, or (with #MDB_NODUPDATA, or when the last
 * duplicate goes away) the whole node is removed.
 * @param[in] mc The cursor positioned on the item to delete.
 * @param[in] flags Delete flags. The code below inspects #MDB_NOSPILL,
 *	#MDB_NODUPDATA and #F_SUBDATA.
 * @return The result of mdb_cursor_del0() when the node itself is removed,
 *	0 when only one duplicate was removed, otherwise an error:
 * <ul>
 *	<li>EACCES - the transaction is read-only.
 *	<li>MDB_BAD_TXN - the transaction is blocked.
 *	<li>EINVAL - the cursor is not initialized.
 *	<li>MDB_NOTFOUND - the cursor index is past the last key on the page.
 *	<li>MDB_CORRUPTED - the page at the top of the cursor is not a leaf.
 *	<li>MDB_INCOMPATIBLE - #F_SUBDATA differs between the node and \b flags.
 *	<li>any error from mdb_page_spill(), mdb_cursor_touch(), the recursive
 *	sub-cursor delete, mdb_drop0(), mdb_page_get() or mdb_ovpage_free().
 * </ul>
 * #MDB_TXN_ERROR is set only on the paths that reach the \b fail label.
 */
static int
_mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
{
	MDB_node	*leaf;
	MDB_page	*mp;
	int rc;

	/* Read-only takes precedence over blocked when both are set. */
	if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))
		return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;

	if (!(mc->mc_flags & C_INITIALIZED))
		return EINVAL;

	if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
		return MDB_NOTFOUND;

	/* The recursive call on the sub-cursor passes MDB_NOSPILL, so the
	 * spill step runs at most once, at the outermost level.
	 */
	if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
		return rc;

	rc = mdb_cursor_touch(mc);
	if (rc)
		return rc;

	/* Fetch the page only after the touch above. */
	mp = mc->mc_pg[mc->mc_top];
	if (!IS_LEAF(mp))
		return MDB_CORRUPTED;
	/* LEAF2 pages have no node headers to inspect; go straight to the
	 * key removal.
	 */
	if (IS_LEAF2(mp))
		goto del_key;
	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);

	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
		if (flags & MDB_NODUPDATA) {
			/* mdb_cursor_del0() will subtract the final entry */
			mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
			mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
		} else {
			/* Without F_SUBDATA the duplicates live inside the node
			 * itself; point the sub-cursor at that in-node page.
			 */
			if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
				mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
			}
			rc = _mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL);
			if (rc)
				return rc;
			/* If sub-DB still has entries, we're done */
			if (mc->mc_xcursor->mx_db.md_entries) {
				if (leaf->mn_flags & F_SUBDATA) {
					/* update subDB info */
					void *db = NODEDATA(leaf);
					memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
				} else {
					MDB_cursor *m2;
					/* shrink fake page */
					mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
					/* the shrink moved node data, so re-derive the pointers */
					leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
					mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
					/* fix other sub-DB cursors pointed at fake pages on this page */
					for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
						if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
						if (!(m2->mc_flags & C_INITIALIZED)) continue;
						if (m2->mc_pg[mc->mc_top] == mp) {
							XCURSOR_REFRESH(m2, mc->mc_top, mp);
						}
					}
				}
				/* exactly one duplicate was removed */
				mc->mc_db->md_entries--;
				return rc;
			} else {
				mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
			}
			/* otherwise fall thru and delete the sub-DB */
		}

		if (leaf->mn_flags & F_SUBDATA) {
			/* add all the child DB's pages to the free list */
			rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
			if (rc)
				goto fail;
		}
	}
	/* LMDB passes F_SUBDATA in 'flags' to delete a DB record */
	else if ((leaf->mn_flags ^ flags) & F_SUBDATA) {
		rc = MDB_INCOMPATIBLE;
		goto fail;
	}

	/* add overflow pages to free list */
	if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
		MDB_page *omp;
		pgno_t pg;

		/* the node data holds the overflow page number; copy it out
		 * with memcpy rather than dereferencing it in place
		 */
		memcpy(&pg, NODEDATA(leaf), sizeof(pg));
		if ((rc = mdb_page_get(mc, pg, &omp, NULL)) ||
			(rc = mdb_ovpage_free(mc, omp)))
			goto fail;
	}

del_key:
	return mdb_cursor_del0(mc);

fail:
	mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}

/** Delete the current item through a cursor (public entry point).
 * Emits an MDB_TRACE() record and forwards to _mdb_cursor_del().
 * @param[in] mc The cursor positioned on the item to delete.
 * @param[in] flags Delete flags, passed through unchanged.
 * @return The value returned by _mdb_cursor_del().
 */
int
mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
{
	MDB_TRACE(("%p, %u",
		mc, flags));
	return _mdb_cursor_del(mc, flags);
}
7283 */ 7284 static int 7285 mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) 7286 { 7287 MDB_page *np; 7288 int rc; 7289 7290 if ((rc = mdb_page_alloc(mc, num, &np))) 7291 return rc; 7292 DPRINTF(("allocated new mpage %"Z"u, page size %u", 7293 np->mp_pgno, mc->mc_txn->mt_env->me_psize)); 7294 np->mp_flags = flags | P_DIRTY; 7295 np->mp_lower = (PAGEHDRSZ-PAGEBASE); 7296 np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; 7297 7298 if (IS_BRANCH(np)) 7299 mc->mc_db->md_branch_pages++; 7300 else if (IS_LEAF(np)) 7301 mc->mc_db->md_leaf_pages++; 7302 else if (IS_OVERFLOW(np)) { 7303 mc->mc_db->md_overflow_pages += num; 7304 np->mp_pages = num; 7305 } 7306 *mp = np; 7307 7308 return 0; 7309 } 7310 7311 /** Calculate the size of a leaf node. 7312 * The size depends on the environment's page size; if a data item 7313 * is too large it will be put onto an overflow page and the node 7314 * size will only include the key and not the data. Sizes are always 7315 * rounded up to an even number of bytes, to guarantee 2-byte alignment 7316 * of the #MDB_node headers. 7317 * @param[in] env The environment handle. 7318 * @param[in] key The key for the node. 7319 * @param[in] data The data for the node. 7320 * @return The number of bytes needed to store the node. 7321 */ 7322 static size_t 7323 mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) 7324 { 7325 size_t sz; 7326 7327 sz = LEAFSIZE(key, data); 7328 if (sz > env->me_nodemax) { 7329 /* put on overflow page */ 7330 sz -= data->mv_size - sizeof(pgno_t); 7331 } 7332 7333 return EVEN(sz + sizeof(indx_t)); 7334 } 7335 7336 /** Calculate the size of a branch node. 7337 * The size should depend on the environment's page size but since 7338 * we currently don't support spilling large keys onto overflow 7339 * pages, it's simply the size of the #MDB_node header plus the 7340 * size of the key. 
/** Calculate the size of a branch node.
 * The size should depend on the environment's page size but since
 * we currently don't support spilling large keys onto overflow
 * pages, it's simply the size of the #MDB_node header plus the
 * size of the key, plus one #indx_t slot.
 * NOTE(review): unlike mdb_leaf_size(), the value returned here is not
 * passed through EVEN(), so an odd key size yields an odd result -
 * confirm that callers do not rely on an even-rounded value.
 * @param[in] env The environment handle.
 * @param[in] key The key for the node.
 * @return The number of bytes needed to store the node.
 */
static size_t
mdb_branch_size(MDB_env *env, MDB_val *key)
{
	size_t		 sz;

	sz = INDXSIZE(key);
	/* Placeholder only: the oversized-key case changes nothing today. */
	if (sz > env->me_nodemax) {
		/* put on overflow page */
		/* not implemented */
		/* sz -= key->size - sizeof(pgno_t); */
	}

	return sz + sizeof(indx_t);
}

/** Add a node to the page pointed to by the cursor.
 * Set #MDB_TXN_ERROR on failure.
 * On a LEAF2 page only the fixed-size key is inserted. On other pages a
 * node header, key, and either the data (leaf) or \b pgno (branch) are
 * written, and the data goes to a newly allocated overflow page when the
 * node would exceed me_nodemax.
 * @param[in] mc The cursor for this operation.
 * @param[in] indx The index on the page where the new node should be added.
 * @param[in] key The key for the new node.
 * @param[in] data The data for the new node, if any. With #MDB_RESERVE
 *	set in \b flags, data->mv_data is pointed at the reserved space
 *	instead of being copied from.
 * @param[in] pgno The page number, if adding a branch node.
 * @param[in] flags Flags for the node.
 * @return 0 on success, non-zero on failure. Possible errors are:
 * <ul>
 *	<li>ENOMEM - failed to allocate overflow pages for the node.
 *	<li>MDB_PAGE_FULL - there is insufficient room in the page. This error
 *	should never happen since all callers already calculate the
 *	page's free space before calling this function.
 * </ul>
 * More generally, any error from mdb_page_new() is returned as is.
 */
static int
mdb_node_add(MDB_cursor *mc, indx_t indx,
	MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags)
{
	unsigned int	 i;
	size_t		 node_size = NODESIZE;
	ssize_t		 room;
	indx_t		 ofs;
	MDB_node	*node;
	MDB_page	*mp = mc->mc_pg[mc->mc_top];
	MDB_page	*ofp = NULL;		/* overflow page */
	void		*ndata;
	DKBUF;

	mdb_cassert(mc, MP_UPPER(mp) >= MP_LOWER(mp));

	DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]",
		IS_LEAF(mp) ? "leaf" : "branch",
		IS_SUBP(mp) ? "sub-" : "",
		mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0,
		key ? key->mv_size : 0, key ? DKEY(key) : "null"));

	if (IS_LEAF2(mp)) {
		/* Move higher keys up one slot. */
		int ksize = mc->mc_db->md_pad, dif;
		char *ptr = LEAF2KEY(mp, indx, ksize);
		dif = NUMKEYS(mp) - indx;
		if (dif > 0)
			memmove(ptr+ksize, ptr, dif*ksize);
		/* insert new key */
		memcpy(ptr, key->mv_data, ksize);

		/* Just using these for counting */
		MP_LOWER(mp) += sizeof(indx_t);
		MP_UPPER(mp) -= ksize - sizeof(indx_t);
		return MDB_SUCCESS;
	}

	/* Free space left once the new pointer slot is paid for; signed so
	 * a page with less than one slot free compares correctly below.
	 */
	room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
	if (key != NULL)
		node_size += key->mv_size;
	if (IS_LEAF(mp)) {
		mdb_cassert(mc, key && data);
		if (F_ISSET(flags, F_BIGDATA)) {
			/* Data already on overflow page. */
			node_size += sizeof(pgno_t);
		} else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) {
			int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
			int rc;
			/* Put data on overflow page. */
			DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
				data->mv_size, node_size+data->mv_size));
			node_size = EVEN(node_size + sizeof(pgno_t));
			/* check the fit before allocating, so a full page
			 * does not leave an unused overflow page behind
			 */
			if ((ssize_t)node_size > room)
				goto full;
			if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
				return rc;
			DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno));
			flags |= F_BIGDATA;
			/* size is already rounded and checked; skip the
			 * generic rounding/room test
			 */
			goto update;
		} else {
			node_size += data->mv_size;
		}
	}
	node_size = EVEN(node_size);
	if ((ssize_t)node_size > room)
		goto full;

update:
	/* Move higher pointers up one slot. */
	for (i = NUMKEYS(mp); i > indx; i--)
		MP_PTRS(mp)[i] = MP_PTRS(mp)[i - 1];

	/* Adjust free space offsets. */
	ofs = MP_UPPER(mp) - node_size;
	mdb_cassert(mc, ofs >= MP_LOWER(mp) + sizeof(indx_t));
	MP_PTRS(mp)[indx] = ofs;
	MP_UPPER(mp) = ofs;
	MP_LOWER(mp) += sizeof(indx_t);

	/* Write the node data. */
	node = NODEPTR(mp, indx);
	node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
	node->mn_flags = flags;
	/* leaf nodes record the data size, branch nodes the child page */
	if (IS_LEAF(mp))
		SETDSZ(node,data->mv_size);
	else
		SETPGNO(node,pgno);

	if (key)
		memcpy(NODEKEY(node), key->mv_data, key->mv_size);

	if (IS_LEAF(mp)) {
		ndata = NODEDATA(node);
		if (ofp == NULL) {
			/* No overflow page was allocated here. */
			if (F_ISSET(flags, F_BIGDATA))
				/* caller supplied the overflow page number as data */
				memcpy(ndata, data->mv_data, sizeof(pgno_t));
			else if (F_ISSET(flags, MDB_RESERVE))
				data->mv_data = ndata;
			else
				memcpy(ndata, data->mv_data, data->mv_size);
		} else {
			/* Node holds the overflow page number; the payload
			 * goes into the overflow page itself.
			 */
			memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t));
			ndata = METADATA(ofp);
			if (F_ISSET(flags, MDB_RESERVE))
				data->mv_data = ndata;
			else
				memcpy(ndata, data->mv_data, data->mv_size);
		}
	}

	return MDB_SUCCESS;

full:
	DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
		mdb_dbg_pgno(mp), NUMKEYS(mp)));
	DPRINTF(("upper-lower = %u - %u = %"Z"d", MP_UPPER(mp),MP_LOWER(mp),room));
	DPRINTF(("node size = %"Z"u", node_size));
	mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	return MDB_PAGE_FULL;
}

/** Delete the specified node from a page.
 * The node at the cursor's current index is removed and the page is
 * compacted in place. The page counters of the database are not touched
 * here.
 * @param[in] mc Cursor pointing to the node to delete.
 * @param[in] ksize The size of a node. Only used if the page is
 * part of a #MDB_DUPFIXED database.
 */
static void
mdb_node_del(MDB_cursor *mc, int ksize)
{
	MDB_page *mp = mc->mc_pg[mc->mc_top];
	indx_t	indx = mc->mc_ki[mc->mc_top];
	unsigned int	 sz;
	indx_t		 i, j, numkeys, ptr;
	MDB_node	*node;
	char		*base;

	DPRINTF(("delete node %u on %s page %"Z"u", indx,
		IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp)));
	numkeys = NUMKEYS(mp);
	mdb_cassert(mc, indx < numkeys);

	if (IS_LEAF2(mp)) {
		/* Fixed-size keys: close the gap by sliding the x keys that
		 * follow down by one slot, then undo the counting done on
		 * insertion.
		 */
		int x = numkeys - 1 - indx;
		base = LEAF2KEY(mp, indx, ksize);
		if (x)
			memmove(base, base + ksize, x * ksize);
		MP_LOWER(mp) -= sizeof(indx_t);
		MP_UPPER(mp) += ksize - sizeof(indx_t);
		return;
	}

	/* Recompute the node's footprint the same way mdb_node_add() does. */
	node = NODEPTR(mp, indx);
	sz = NODESIZE + node->mn_ksize;
	if (IS_LEAF(mp)) {
		if (F_ISSET(node->mn_flags, F_BIGDATA))
			sz += sizeof(pgno_t);
		else
			sz += NODEDSZ(node);
	}
	sz = EVEN(sz);

	/* Drop slot indx from the pointer array. Nodes stored at a lower
	 * offset than the deleted one are about to move up by sz, so their
	 * pointers are adjusted while being copied.
	 */
	ptr = MP_PTRS(mp)[indx];
	for (i = j = 0; i < numkeys; i++) {
		if (i != indx) {
			MP_PTRS(mp)[j] = MP_PTRS(mp)[i];
			if (MP_PTRS(mp)[i] < ptr)
				MP_PTRS(mp)[j] += sz;
			j++;
		}
	}

	/* Slide everything between mp_upper and the deleted node up by sz. */
	base = (char *)mp + MP_UPPER(mp) + PAGEBASE;
	memmove(base + sz, base, ptr - MP_UPPER(mp));

	MP_LOWER(mp) -= sizeof(indx_t);
	MP_UPPER(mp) += sz;
}
/** Compact the main page after deleting a node on a subpage.
 * The unused space inside the sub-page stored in node \b indx is given
 * back to the main page: the node's data size is reduced by that amount
 * and everything from mp_upper up to the leading part of the sub-page is
 * moved toward the end of the page.
 * @param[in] mp The main page to operate on.
 * @param[in] indx The index of the subpage on the main page.
 */
static void
mdb_node_shrink(MDB_page *mp, indx_t indx)
{
	MDB_node *node;
	MDB_page *sp, *xp;
	char *base;
	indx_t delta, nsize, len, ptr;
	int i;

	node = NODEPTR(mp, indx);
	sp = (MDB_page *)NODEDATA(node);
	/* delta = bytes to reclaim, nsize = node data size afterwards */
	delta = SIZELEFT(sp);
	nsize = NODEDSZ(node) - delta;

	/* Prepare to shift upward, set len = length(subpage part to shift) */
	if (IS_LEAF2(sp)) {
		len = nsize;
		if (nsize & 1)
			return;		/* do not make the node uneven-sized */
	} else {
		xp = (MDB_page *)((char *)sp + delta); /* destination subpage */
		/* Write the rebased pointer array where it will live after
		 * the move. The loop runs from the highest index down.
		 * NOTE(review): xp sits delta bytes above sp, so the two
		 * arrays can overlap; the descending order looks required
		 * for that reason - confirm before changing it.
		 */
		for (i = NUMKEYS(sp); --i >= 0; )
			MP_PTRS(xp)[i] = MP_PTRS(sp)[i] - delta;
		len = PAGEHDRSZ;
	}
	/* The sub-page now has no free space left. */
	MP_UPPER(sp) = MP_LOWER(sp);
	COPY_PGNO(MP_PGNO(sp), mp->mp_pgno);
	SETDSZ(node, nsize);

	/* Shift <lower nodes...initial part of subpage> upward */
	base = (char *)mp + mp->mp_upper + PAGEBASE;
	memmove(base + delta, base, (char *)sp + len - base);

	/* Every node at or below this node's offset moved by delta,
	 * including the node itself.
	 */
	ptr = mp->mp_ptrs[indx];
	for (i = NUMKEYS(mp); --i >= 0; ) {
		if (mp->mp_ptrs[i] <= ptr)
			mp->mp_ptrs[i] += delta;
	}
	mp->mp_upper += delta;
}
7609 */ 7610 static void 7611 mdb_xcursor_init0(MDB_cursor *mc) 7612 { 7613 MDB_xcursor *mx = mc->mc_xcursor; 7614 7615 mx->mx_cursor.mc_xcursor = NULL; 7616 mx->mx_cursor.mc_txn = mc->mc_txn; 7617 mx->mx_cursor.mc_db = &mx->mx_db; 7618 mx->mx_cursor.mc_dbx = &mx->mx_dbx; 7619 mx->mx_cursor.mc_dbi = mc->mc_dbi; 7620 mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; 7621 mx->mx_cursor.mc_snum = 0; 7622 mx->mx_cursor.mc_top = 0; 7623 mx->mx_cursor.mc_flags = C_SUB; 7624 mx->mx_dbx.md_name.mv_size = 0; 7625 mx->mx_dbx.md_name.mv_data = NULL; 7626 mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; 7627 mx->mx_dbx.md_dcmp = NULL; 7628 mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; 7629 } 7630 7631 /** Final setup of a sorted-dups cursor. 7632 * Sets up the fields that depend on the data from the main cursor. 7633 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. 7634 * @param[in] node The data containing the #MDB_db record for the 7635 * sorted-dup database. 7636 */ 7637 static void 7638 mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) 7639 { 7640 MDB_xcursor *mx = mc->mc_xcursor; 7641 7642 if (node->mn_flags & F_SUBDATA) { 7643 memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); 7644 mx->mx_cursor.mc_pg[0] = 0; 7645 mx->mx_cursor.mc_snum = 0; 7646 mx->mx_cursor.mc_top = 0; 7647 mx->mx_cursor.mc_flags = C_SUB; 7648 } else { 7649 MDB_page *fp = NODEDATA(node); 7650 mx->mx_db.md_pad = 0; 7651 mx->mx_db.md_flags = 0; 7652 mx->mx_db.md_depth = 1; 7653 mx->mx_db.md_branch_pages = 0; 7654 mx->mx_db.md_leaf_pages = 1; 7655 mx->mx_db.md_overflow_pages = 0; 7656 mx->mx_db.md_entries = NUMKEYS(fp); 7657 COPY_PGNO(mx->mx_db.md_root, MP_PGNO(fp)); 7658 mx->mx_cursor.mc_snum = 1; 7659 mx->mx_cursor.mc_top = 0; 7660 mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; 7661 mx->mx_cursor.mc_pg[0] = fp; 7662 mx->mx_cursor.mc_ki[0] = 0; 7663 if (mc->mc_db->md_flags & MDB_DUPFIXED) { 7664 mx->mx_db.md_flags = MDB_DUPFIXED; 7665 mx->mx_db.md_pad = fp->mp_pad; 7666 if (mc->mc_db->md_flags & 
MDB_INTEGERDUP) 7667 mx->mx_db.md_flags |= MDB_INTEGERKEY; 7668 } 7669 } 7670 DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, 7671 mx->mx_db.md_root)); 7672 mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; 7673 #if UINT_MAX < SIZE_MAX 7674 if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) 7675 mx->mx_dbx.md_cmp = mdb_cmp_clong; 7676 #endif 7677 } 7678 7679 7680 /** Fixup a sorted-dups cursor due to underlying update. 7681 * Sets up some fields that depend on the data from the main cursor. 7682 * Almost the same as init1, but skips initialization steps if the 7683 * xcursor had already been used. 7684 * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. 7685 * @param[in] src_mx The xcursor of an up-to-date cursor. 7686 * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. 7687 */ 7688 static void 7689 mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) 7690 { 7691 MDB_xcursor *mx = mc->mc_xcursor; 7692 7693 if (new_dupdata) { 7694 mx->mx_cursor.mc_snum = 1; 7695 mx->mx_cursor.mc_top = 0; 7696 mx->mx_cursor.mc_flags |= C_INITIALIZED; 7697 mx->mx_cursor.mc_ki[0] = 0; 7698 mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; 7699 #if UINT_MAX < SIZE_MAX 7700 mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; 7701 #endif 7702 } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { 7703 return; 7704 } 7705 mx->mx_db = src_mx->mx_db; 7706 mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; 7707 DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, 7708 mx->mx_db.md_root)); 7709 } 7710 7711 /** Initialize a cursor for a given transaction and database. 
*/ 7712 static void 7713 mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) 7714 { 7715 mc->mc_next = NULL; 7716 mc->mc_backup = NULL; 7717 mc->mc_dbi = dbi; 7718 mc->mc_txn = txn; 7719 mc->mc_db = &txn->mt_dbs[dbi]; 7720 mc->mc_dbx = &txn->mt_dbxs[dbi]; 7721 mc->mc_dbflag = &txn->mt_dbflags[dbi]; 7722 mc->mc_snum = 0; 7723 mc->mc_top = 0; 7724 mc->mc_pg[0] = 0; 7725 mc->mc_ki[0] = 0; 7726 mc->mc_flags = 0; 7727 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { 7728 mdb_tassert(txn, mx != NULL); 7729 mc->mc_xcursor = mx; 7730 mdb_xcursor_init0(mc); 7731 } else { 7732 mc->mc_xcursor = NULL; 7733 } 7734 if (*mc->mc_dbflag & DB_STALE) { 7735 mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); 7736 } 7737 } 7738 7739 int 7740 mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) 7741 { 7742 MDB_cursor *mc; 7743 size_t size = sizeof(MDB_cursor); 7744 7745 if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) 7746 return EINVAL; 7747 7748 if (txn->mt_flags & MDB_TXN_BLOCKED) 7749 return MDB_BAD_TXN; 7750 7751 if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) 7752 return EINVAL; 7753 7754 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) 7755 size += sizeof(MDB_xcursor); 7756 7757 if ((mc = malloc(size)) != NULL) { 7758 mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); 7759 if (txn->mt_cursors) { 7760 mc->mc_next = txn->mt_cursors[dbi]; 7761 txn->mt_cursors[dbi] = mc; 7762 mc->mc_flags |= C_UNTRACK; 7763 } 7764 } else { 7765 return ENOMEM; 7766 } 7767 7768 MDB_TRACE(("%p, %u = %p", txn, dbi, mc)); 7769 *ret = mc; 7770 7771 return MDB_SUCCESS; 7772 } 7773 7774 int 7775 mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) 7776 { 7777 if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID)) 7778 return EINVAL; 7779 7780 if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) 7781 return EINVAL; 7782 7783 if (txn->mt_flags & MDB_TXN_BLOCKED) 7784 return MDB_BAD_TXN; 7785 7786 mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); 7787 return MDB_SUCCESS; 7788 } 
7789 7790 /* Return the count of duplicate data items for the current key */ 7791 int 7792 mdb_cursor_count(MDB_cursor *mc, size_t *countp) 7793 { 7794 MDB_node *leaf; 7795 7796 if (mc == NULL || countp == NULL) 7797 return EINVAL; 7798 7799 if (mc->mc_xcursor == NULL) 7800 return MDB_INCOMPATIBLE; 7801 7802 if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) 7803 return MDB_BAD_TXN; 7804 7805 if (!(mc->mc_flags & C_INITIALIZED)) 7806 return EINVAL; 7807 7808 if (!mc->mc_snum) 7809 return MDB_NOTFOUND; 7810 7811 if (mc->mc_flags & C_EOF) { 7812 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) 7813 return MDB_NOTFOUND; 7814 mc->mc_flags ^= C_EOF; 7815 } 7816 7817 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 7818 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { 7819 *countp = 1; 7820 } else { 7821 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) 7822 return EINVAL; 7823 7824 *countp = mc->mc_xcursor->mx_db.md_entries; 7825 } 7826 return MDB_SUCCESS; 7827 } 7828 7829 void 7830 mdb_cursor_close(MDB_cursor *mc) 7831 { 7832 MDB_TRACE(("%p", mc)); 7833 if (mc && !mc->mc_backup) { 7834 /* remove from txn, if tracked */ 7835 if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { 7836 MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; 7837 while (*prev && *prev != mc) prev = &(*prev)->mc_next; 7838 if (*prev == mc) 7839 *prev = mc->mc_next; 7840 } 7841 free(mc); 7842 } 7843 } 7844 7845 MDB_txn * 7846 mdb_cursor_txn(MDB_cursor *mc) 7847 { 7848 if (!mc) return NULL; 7849 return mc->mc_txn; 7850 } 7851 7852 MDB_dbi 7853 mdb_cursor_dbi(MDB_cursor *mc) 7854 { 7855 return mc->mc_dbi; 7856 } 7857 7858 /** Replace the key for a branch node with a new key. 7859 * Set #MDB_TXN_ERROR on failure. 7860 * @param[in] mc Cursor pointing to the node to operate on. 7861 * @param[in] key The new key to use. 7862 * @return 0 on success, non-zero on failure. 
7863 */ 7864 static int 7865 mdb_update_key(MDB_cursor *mc, MDB_val *key) 7866 { 7867 MDB_page *mp; 7868 MDB_node *node; 7869 char *base; 7870 size_t len; 7871 int delta, ksize, oksize; 7872 indx_t ptr, i, numkeys, indx; 7873 DKBUF; 7874 7875 indx = mc->mc_ki[mc->mc_top]; 7876 mp = mc->mc_pg[mc->mc_top]; 7877 node = NODEPTR(mp, indx); 7878 ptr = mp->mp_ptrs[indx]; 7879 #if MDB_DEBUG 7880 { 7881 MDB_val k2; 7882 char kbuf2[DKBUF_MAXKEYSIZE*2+1]; 7883 k2.mv_data = NODEKEY(node); 7884 k2.mv_size = node->mn_ksize; 7885 DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", 7886 indx, ptr, 7887 mdb_dkey(&k2, kbuf2), 7888 DKEY(key), 7889 mp->mp_pgno)); 7890 } 7891 #endif 7892 7893 /* Sizes must be 2-byte aligned. */ 7894 ksize = EVEN(key->mv_size); 7895 oksize = EVEN(node->mn_ksize); 7896 delta = ksize - oksize; 7897 7898 /* Shift node contents if EVEN(key length) changed. */ 7899 if (delta) { 7900 if (delta > 0 && SIZELEFT(mp) < delta) { 7901 pgno_t pgno; 7902 /* not enough space left, do a delete and split */ 7903 DPRINTF(("Not enough room, delta = %d, splitting...", delta)); 7904 pgno = NODEPGNO(node); 7905 mdb_node_del(mc, 0); 7906 return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); 7907 } 7908 7909 numkeys = NUMKEYS(mp); 7910 for (i = 0; i < numkeys; i++) { 7911 if (mp->mp_ptrs[i] <= ptr) 7912 mp->mp_ptrs[i] -= delta; 7913 } 7914 7915 base = (char *)mp + mp->mp_upper + PAGEBASE; 7916 len = ptr - mp->mp_upper + NODESIZE; 7917 memmove(base - delta, base, len); 7918 mp->mp_upper -= delta; 7919 7920 node = NODEPTR(mp, indx); 7921 } 7922 7923 /* But even if no shift was needed, update ksize */ 7924 if (node->mn_ksize != key->mv_size) 7925 node->mn_ksize = key->mv_size; 7926 7927 if (key->mv_size) 7928 memcpy(NODEKEY(node), key->mv_data, key->mv_size); 7929 7930 return MDB_SUCCESS; 7931 } 7932 7933 static void 7934 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); 7935 7936 /** Perform \b act while tracking temporary cursor \b mn */ 7937 #define 
/** Perform \b act while tracking temporary cursor \b mn.
 * For the duration of \b act a cursor is linked at the head of the
 * transaction's cursor list for mn's DBI, and unlinked again afterwards.
 * For a #C_SUB cursor a stack-allocated dummy main cursor is linked
 * instead, with its mc_xcursor pointing at \b mn.
 * NOTE(review): the cast of &mn to MDB_xcursor * presumably relies on the
 * cursor being the first member of MDB_xcursor - confirm at the struct
 * definition.
 * \b mn is evaluated several times and must be an lvalue of type
 * MDB_cursor.
 */
#define WITH_CURSOR_TRACKING(mn, act) do { \
	MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \
	if ((mn).mc_flags & C_SUB) { \
		dummy.mc_flags =  C_INITIALIZED; \
		dummy.mc_xcursor = (MDB_xcursor *)&(mn);	\
		tracked = &dummy; \
	} else { \
		tracked = &(mn); \
	} \
	tracked->mc_next = *tp; \
	*tp = tracked; \
	{ act; } \
	*tp = tracked->mc_next; \
} while (0)

/** Move a node from csrc to cdst.
 * The node at csrc's current index is added to cdst's page at cdst's
 * current index and deleted from the source page. Other cursors of the
 * transaction on the same DBI are adjusted, and the parent separator keys
 * of both pages are updated where the first key of a page changed.
 * @param[in] csrc Cursor pointing to the node to move.
 * @param[in] cdst Cursor pointing to the insertion position.
 * @param[in] fromleft Selects the cursor fix-up direction: non-zero runs
 *	the "adding on the left" adjustments, zero the "adding on the right"
 *	ones.
 * @return 0 on success, otherwise the error from mdb_page_touch(),
 *	mdb_page_search_lowest(), mdb_update_key() or mdb_node_add().
 */
static int
mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft)
{
	MDB_node		*srcnode;
	MDB_val		 key, data;
	pgno_t	srcpg;
	MDB_cursor mn;
	int			 rc;
	unsigned short flags;

	DKBUF;

	/* Mark src and dst as dirty. */
	if ((rc = mdb_page_touch(csrc)) ||
	    (rc = mdb_page_touch(cdst)))
		return rc;

	if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
		/* LEAF2 pages carry bare fixed-size keys: no data, page
		 * number or node flags to transfer.
		 */
		key.mv_size = csrc->mc_db->md_pad;
		key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
		data.mv_size = 0;
		data.mv_data = NULL;
		srcpg = 0;
		flags = 0;
	} else {
		srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
		mdb_cassert(csrc, !((size_t)srcnode & 1));
		srcpg = NODEPGNO(srcnode);
		flags = srcnode->mn_flags;
		if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
			unsigned int snum = csrc->mc_snum;
			MDB_node *s2;
			/* must find the lowest key below src */
			rc = mdb_page_search_lowest(csrc);
			if (rc)
				return rc;
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_size = csrc->mc_db->md_pad;
				key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
			} else {
				s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
				key.mv_size = NODEKSZ(s2);
				key.mv_data = NODEKEY(s2);
			}
			/* restore the stack depth changed by the search */
			csrc->mc_snum = snum--;
			csrc->mc_top = snum;
		} else {
			key.mv_size = NODEKSZ(srcnode);
			key.mv_data = NODEKEY(srcnode);
		}
		data.mv_size = NODEDSZ(srcnode);
		data.mv_data = NODEDATA(srcnode);
	}
	mn.mc_xcursor = NULL;
	if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
		unsigned int snum = cdst->mc_snum;
		MDB_node *s2;
		MDB_val bkey;
		/* must find the lowest key below dst */
		mdb_cursor_copy(cdst, &mn);
		rc = mdb_page_search_lowest(&mn);
		if (rc)
			return rc;
		if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
			bkey.mv_size = mn.mc_db->md_pad;
			bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size);
		} else {
			s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
			bkey.mv_size = NODEKSZ(s2);
			bkey.mv_data = NODEKEY(s2);
		}
		/* back at dst's level, give its current first node that
		 * lowest key as an explicit key
		 */
		mn.mc_snum = snum--;
		mn.mc_top = snum;
		mn.mc_ki[snum] = 0;
		rc = mdb_update_key(&mn, &bkey);
		if (rc)
			return rc;
	}

	DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u",
	    IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
	    csrc->mc_ki[csrc->mc_top],
		DKEY(&key),
	    csrc->mc_pg[csrc->mc_top]->mp_pgno,
	    cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno));

	/* Add the node to the destination page.
	 */
	rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
	if (rc != MDB_SUCCESS)
		return rc;

	/* Delete the node from the source page.
	 */
	mdb_node_del(csrc, key.mv_size);

	{
		/* Adjust other cursors pointing to mp */
		MDB_cursor *m2, *m3;
		MDB_dbi dbi = csrc->mc_dbi;
		MDB_page *mpd, *mps;

		mps = csrc->mc_pg[csrc->mc_top];
		/* If we're adding on the left, bump others up */
		if (fromleft) {
			mpd = cdst->mc_pg[csrc->mc_top];
			for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
				/* when moving within a sub-DB, fix up the
				 * tracked cursors' sub-cursors instead
				 */
				if (csrc->mc_flags & C_SUB)
					m3 = &m2->mc_xcursor->mx_cursor;
				else
					m3 = m2;
				if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
					continue;
				/* on the destination page at or after the
				 * insertion point: shift right by one
				 */
				if (m3 != cdst &&
					m3->mc_pg[csrc->mc_top] == mpd &&
					m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) {
					m3->mc_ki[csrc->mc_top]++;
				}
				/* on the moved node itself: follow it to the
				 * destination page
				 */
				if (m3 !=csrc &&
					m3->mc_pg[csrc->mc_top] == mps &&
					m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) {
					m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
					m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
					m3->mc_ki[csrc->mc_top-1]++;
				}
				if (IS_LEAF(mps))
					XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
			}
		} else
		/* Adding on the right, bump others down */
		{
			for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
				if (csrc->mc_flags & C_SUB)
					m3 = &m2->mc_xcursor->mx_cursor;
				else
					m3 = m2;
				if (m3 == csrc) continue;
				if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
					continue;
				if (m3->mc_pg[csrc->mc_top] == mps) {
					if (!m3->mc_ki[csrc->mc_top]) {
						/* was on the moved (first) node:
						 * follow it to the destination
						 */
						m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
						m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
						m3->mc_ki[csrc->mc_top-1]--;
					} else {
						m3->mc_ki[csrc->mc_top]--;
					}
					if (IS_LEAF(mps))
						XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
				}
			}
		}
	}

	/* Update the parent separators.
	 */
	if (csrc->mc_ki[csrc->mc_top] == 0) {
		if (csrc->mc_ki[csrc->mc_top-1] != 0) {
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
			} else {
				srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
				key.mv_size = NODEKSZ(srcnode);
				key.mv_data = NODEKEY(srcnode);
			}
			DPRINTF(("update separator for source page %"Z"u to [%s]",
				csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)));
			/* work on a copy positioned one level up (the parent) */
			mdb_cursor_copy(csrc, &mn);
			mn.mc_snum--;
			mn.mc_top--;
			/* We want mdb_rebalance to find mn when doing fixups */
			WITH_CURSOR_TRACKING(mn,
				rc = mdb_update_key(&mn, &key));
			if (rc)
				return rc;
		}
		if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
			/* the first node of a branch page gets an empty key */
			MDB_val	 nullkey;
			indx_t	ix = csrc->mc_ki[csrc->mc_top];
			nullkey.mv_size = 0;
			csrc->mc_ki[csrc->mc_top] = 0;
			rc = mdb_update_key(csrc, &nullkey);
			csrc->mc_ki[csrc->mc_top] = ix;
			mdb_cassert(csrc, rc == MDB_SUCCESS);
		}
	}

	if (cdst->mc_ki[cdst->mc_top] == 0) {
		if (cdst->mc_ki[cdst->mc_top-1] != 0) {
			/* NOTE(review): the LEAF2 test reads the source page
			 * while the key is taken from the destination page;
			 * presumably both pages are always of the same kind -
			 * confirm.
			 */
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
			} else {
				srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
				key.mv_size = NODEKSZ(srcnode);
				key.mv_data = NODEKEY(srcnode);
			}
			DPRINTF(("update separator for destination page %"Z"u to [%s]",
				cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)));
			mdb_cursor_copy(cdst, &mn);
			mn.mc_snum--;
			mn.mc_top--;
			/* We want mdb_rebalance to find mn when doing fixups */
			WITH_CURSOR_TRACKING(mn,
				rc = mdb_update_key(&mn, &key));
			if (rc)
				return rc;
		}
		if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
			MDB_val	 nullkey;
			indx_t	ix = cdst->mc_ki[cdst->mc_top];
			nullkey.mv_size = 0;
			cdst->mc_ki[cdst->mc_top] = 0;
			rc = mdb_update_key(cdst, &nullkey);
			cdst->mc_ki[cdst->mc_top] = ix;
			mdb_cassert(cdst, rc == MDB_SUCCESS);
		}
	}

	return MDB_SUCCESS;
}
8209 */ 8210 j = nkeys = NUMKEYS(pdst); 8211 if (IS_LEAF2(psrc)) { 8212 key.mv_size = csrc->mc_db->md_pad; 8213 key.mv_data = METADATA(psrc); 8214 for (i = 0; i < NUMKEYS(psrc); i++, j++) { 8215 rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); 8216 if (rc != MDB_SUCCESS) 8217 return rc; 8218 key.mv_data = (char *)key.mv_data + key.mv_size; 8219 } 8220 } else { 8221 for (i = 0; i < NUMKEYS(psrc); i++, j++) { 8222 srcnode = NODEPTR(psrc, i); 8223 if (i == 0 && IS_BRANCH(psrc)) { 8224 MDB_cursor mn; 8225 MDB_node *s2; 8226 mdb_cursor_copy(csrc, &mn); 8227 mn.mc_xcursor = NULL; 8228 /* must find the lowest key below src */ 8229 rc = mdb_page_search_lowest(&mn); 8230 if (rc) 8231 return rc; 8232 if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { 8233 key.mv_size = mn.mc_db->md_pad; 8234 key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); 8235 } else { 8236 s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); 8237 key.mv_size = NODEKSZ(s2); 8238 key.mv_data = NODEKEY(s2); 8239 } 8240 } else { 8241 key.mv_size = srcnode->mn_ksize; 8242 key.mv_data = NODEKEY(srcnode); 8243 } 8244 8245 data.mv_size = NODEDSZ(srcnode); 8246 data.mv_data = NODEDATA(srcnode); 8247 rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); 8248 if (rc != MDB_SUCCESS) 8249 return rc; 8250 } 8251 } 8252 8253 DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", 8254 pdst->mp_pgno, NUMKEYS(pdst), 8255 (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); 8256 8257 /* Unlink the src page from parent and add to free list. 8258 */ 8259 csrc->mc_top--; 8260 mdb_node_del(csrc, 0); 8261 if (csrc->mc_ki[csrc->mc_top] == 0) { 8262 key.mv_size = 0; 8263 rc = mdb_update_key(csrc, &key); 8264 if (rc) { 8265 csrc->mc_top++; 8266 return rc; 8267 } 8268 } 8269 csrc->mc_top++; 8270 8271 psrc = csrc->mc_pg[csrc->mc_top]; 8272 /* If not operating on FreeDB, allow this page to be reused 8273 * in this txn. Otherwise just add to free list. 
8274 */ 8275 rc = mdb_page_loose(csrc, psrc); 8276 if (rc) 8277 return rc; 8278 if (IS_LEAF(psrc)) 8279 csrc->mc_db->md_leaf_pages--; 8280 else 8281 csrc->mc_db->md_branch_pages--; 8282 { 8283 /* Adjust other cursors pointing to mp */ 8284 MDB_cursor *m2, *m3; 8285 MDB_dbi dbi = csrc->mc_dbi; 8286 unsigned int top = csrc->mc_top; 8287 8288 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8289 if (csrc->mc_flags & C_SUB) 8290 m3 = &m2->mc_xcursor->mx_cursor; 8291 else 8292 m3 = m2; 8293 if (m3 == csrc) continue; 8294 if (m3->mc_snum < csrc->mc_snum) continue; 8295 if (m3->mc_pg[top] == psrc) { 8296 m3->mc_pg[top] = pdst; 8297 m3->mc_ki[top] += nkeys; 8298 m3->mc_ki[top-1] = cdst->mc_ki[top-1]; 8299 } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] && 8300 m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { 8301 m3->mc_ki[top-1]--; 8302 } 8303 if (IS_LEAF(psrc)) 8304 XCURSOR_REFRESH(m3, top, m3->mc_pg[top]); 8305 } 8306 } 8307 { 8308 unsigned int snum = cdst->mc_snum; 8309 uint16_t depth = cdst->mc_db->md_depth; 8310 mdb_cursor_pop(cdst); 8311 rc = mdb_rebalance(cdst); 8312 /* Did the tree height change? */ 8313 if (depth != cdst->mc_db->md_depth) 8314 snum += cdst->mc_db->md_depth - depth; 8315 cdst->mc_snum = snum; 8316 cdst->mc_top = snum-1; 8317 } 8318 return rc; 8319 } 8320 8321 /** Copy the contents of a cursor. 8322 * @param[in] csrc The cursor to copy from. 8323 * @param[out] cdst The cursor to copy to. 8324 */ 8325 static void 8326 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) 8327 { 8328 unsigned int i; 8329 8330 cdst->mc_txn = csrc->mc_txn; 8331 cdst->mc_dbi = csrc->mc_dbi; 8332 cdst->mc_db = csrc->mc_db; 8333 cdst->mc_dbx = csrc->mc_dbx; 8334 cdst->mc_snum = csrc->mc_snum; 8335 cdst->mc_top = csrc->mc_top; 8336 cdst->mc_flags = csrc->mc_flags; 8337 8338 for (i=0; i<csrc->mc_snum; i++) { 8339 cdst->mc_pg[i] = csrc->mc_pg[i]; 8340 cdst->mc_ki[i] = csrc->mc_ki[i]; 8341 } 8342 } 8343 8344 /** Rebalance the tree after a delete operation. 
8345 * @param[in] mc Cursor pointing to the page where rebalancing 8346 * should begin. 8347 * @return 0 on success, non-zero on failure. 8348 */ 8349 static int 8350 mdb_rebalance(MDB_cursor *mc) 8351 { 8352 MDB_node *node; 8353 int rc, fromleft; 8354 unsigned int ptop, minkeys, thresh; 8355 MDB_cursor mn; 8356 indx_t oldki; 8357 8358 if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { 8359 minkeys = 2; 8360 thresh = 1; 8361 } else { 8362 minkeys = 1; 8363 thresh = FILL_THRESHOLD; 8364 } 8365 DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", 8366 IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", 8367 mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), 8368 (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); 8369 8370 if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && 8371 NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { 8372 DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", 8373 mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); 8374 return MDB_SUCCESS; 8375 } 8376 8377 if (mc->mc_snum < 2) { 8378 MDB_page *mp = mc->mc_pg[0]; 8379 if (IS_SUBP(mp)) { 8380 DPUTS("Can't rebalance a subpage, ignoring"); 8381 return MDB_SUCCESS; 8382 } 8383 if (NUMKEYS(mp) == 0) { 8384 DPUTS("tree is completely empty"); 8385 mc->mc_db->md_root = P_INVALID; 8386 mc->mc_db->md_depth = 0; 8387 mc->mc_db->md_leaf_pages = 0; 8388 rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); 8389 if (rc) 8390 return rc; 8391 /* Adjust cursors pointing to mp */ 8392 mc->mc_snum = 0; 8393 mc->mc_top = 0; 8394 mc->mc_flags &= ~C_INITIALIZED; 8395 { 8396 MDB_cursor *m2, *m3; 8397 MDB_dbi dbi = mc->mc_dbi; 8398 8399 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8400 if (mc->mc_flags & C_SUB) 8401 m3 = &m2->mc_xcursor->mx_cursor; 8402 else 8403 m3 = m2; 8404 if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) 8405 continue; 8406 if (m3->mc_pg[0] == mp) { 8407 m3->mc_snum = 0; 8408 m3->mc_top = 0; 8409 m3->mc_flags 
&= ~C_INITIALIZED; 8410 } 8411 } 8412 } 8413 } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { 8414 int i; 8415 DPUTS("collapsing root page!"); 8416 rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); 8417 if (rc) 8418 return rc; 8419 mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); 8420 rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); 8421 if (rc) 8422 return rc; 8423 mc->mc_db->md_depth--; 8424 mc->mc_db->md_branch_pages--; 8425 mc->mc_ki[0] = mc->mc_ki[1]; 8426 for (i = 1; i<mc->mc_db->md_depth; i++) { 8427 mc->mc_pg[i] = mc->mc_pg[i+1]; 8428 mc->mc_ki[i] = mc->mc_ki[i+1]; 8429 } 8430 { 8431 /* Adjust other cursors pointing to mp */ 8432 MDB_cursor *m2, *m3; 8433 MDB_dbi dbi = mc->mc_dbi; 8434 8435 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8436 if (mc->mc_flags & C_SUB) 8437 m3 = &m2->mc_xcursor->mx_cursor; 8438 else 8439 m3 = m2; 8440 if (m3 == mc) continue; 8441 if (!(m3->mc_flags & C_INITIALIZED)) 8442 continue; 8443 if (m3->mc_pg[0] == mp) { 8444 for (i=0; i<mc->mc_db->md_depth; i++) { 8445 m3->mc_pg[i] = m3->mc_pg[i+1]; 8446 m3->mc_ki[i] = m3->mc_ki[i+1]; 8447 } 8448 m3->mc_snum--; 8449 m3->mc_top--; 8450 } 8451 } 8452 } 8453 } else 8454 DPUTS("root page doesn't need rebalancing"); 8455 return MDB_SUCCESS; 8456 } 8457 8458 /* The parent (branch page) must have at least 2 pointers, 8459 * otherwise the tree is invalid. 8460 */ 8461 ptop = mc->mc_top-1; 8462 mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); 8463 8464 /* Leaf page fill factor is below the threshold. 8465 * Try to move keys from left or right neighbor, or 8466 * merge with a neighbor page. 8467 */ 8468 8469 /* Find neighbors. 8470 */ 8471 mdb_cursor_copy(mc, &mn); 8472 mn.mc_xcursor = NULL; 8473 8474 oldki = mc->mc_ki[mc->mc_top]; 8475 if (mc->mc_ki[ptop] == 0) { 8476 /* We're the leftmost leaf in our parent. 
8477 */ 8478 DPUTS("reading right neighbor"); 8479 mn.mc_ki[ptop]++; 8480 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); 8481 rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); 8482 if (rc) 8483 return rc; 8484 mn.mc_ki[mn.mc_top] = 0; 8485 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); 8486 fromleft = 0; 8487 } else { 8488 /* There is at least one neighbor to the left. 8489 */ 8490 DPUTS("reading left neighbor"); 8491 mn.mc_ki[ptop]--; 8492 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); 8493 rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); 8494 if (rc) 8495 return rc; 8496 mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; 8497 mc->mc_ki[mc->mc_top] = 0; 8498 fromleft = 1; 8499 } 8500 8501 DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", 8502 mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), 8503 (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); 8504 8505 /* If the neighbor page is above threshold and has enough keys, 8506 * move one key from it. Otherwise we should try to merge them. 8507 * (A branch page must never have less than 2 keys.) 8508 */ 8509 if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { 8510 rc = mdb_node_move(&mn, mc, fromleft); 8511 if (fromleft) { 8512 /* if we inserted on left, bump position up */ 8513 oldki++; 8514 } 8515 } else { 8516 if (!fromleft) { 8517 rc = mdb_page_merge(&mn, mc); 8518 } else { 8519 oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); 8520 mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; 8521 /* We want mdb_rebalance to find mn when doing fixups */ 8522 WITH_CURSOR_TRACKING(mn, 8523 rc = mdb_page_merge(mc, &mn)); 8524 mdb_cursor_copy(&mn, mc); 8525 } 8526 mc->mc_flags &= ~C_EOF; 8527 } 8528 mc->mc_ki[mc->mc_top] = oldki; 8529 return rc; 8530 } 8531 8532 /** Complete a delete operation started by #mdb_cursor_del(). 
*/ 8533 static int 8534 mdb_cursor_del0(MDB_cursor *mc) 8535 { 8536 int rc; 8537 MDB_page *mp; 8538 indx_t ki; 8539 unsigned int nkeys; 8540 MDB_cursor *m2, *m3; 8541 MDB_dbi dbi = mc->mc_dbi; 8542 8543 ki = mc->mc_ki[mc->mc_top]; 8544 mp = mc->mc_pg[mc->mc_top]; 8545 mdb_node_del(mc, mc->mc_db->md_pad); 8546 mc->mc_db->md_entries--; 8547 { 8548 /* Adjust other cursors pointing to mp */ 8549 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 8550 m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; 8551 if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 8552 continue; 8553 if (m3 == mc || m3->mc_snum < mc->mc_snum) 8554 continue; 8555 if (m3->mc_pg[mc->mc_top] == mp) { 8556 if (m3->mc_ki[mc->mc_top] == ki) { 8557 m3->mc_flags |= C_DEL; 8558 if (mc->mc_db->md_flags & MDB_DUPSORT) { 8559 /* Sub-cursor referred into dataset which is gone */ 8560 m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); 8561 } 8562 continue; 8563 } else if (m3->mc_ki[mc->mc_top] > ki) { 8564 m3->mc_ki[mc->mc_top]--; 8565 } 8566 XCURSOR_REFRESH(m3, mc->mc_top, mp); 8567 } 8568 } 8569 } 8570 rc = mdb_rebalance(mc); 8571 if (rc) 8572 goto fail; 8573 8574 /* DB is totally empty now, just bail out. 8575 * Other cursors adjustments were already done 8576 * by mdb_rebalance and aren't needed here. 8577 */ 8578 if (!mc->mc_snum) { 8579 mc->mc_flags |= C_EOF; 8580 return rc; 8581 } 8582 8583 mp = mc->mc_pg[mc->mc_top]; 8584 nkeys = NUMKEYS(mp); 8585 8586 /* Adjust other cursors pointing to mp */ 8587 for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { 8588 m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; 8589 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 8590 continue; 8591 if (m3->mc_snum < mc->mc_snum) 8592 continue; 8593 if (m3->mc_pg[mc->mc_top] == mp) { 8594 if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { 8595 /* if m3 points past last node in page, find next sibling */ 8596 if (m3->mc_ki[mc->mc_top] >= nkeys) { 8597 rc = mdb_cursor_sibling(m3, 1); 8598 if (rc == MDB_NOTFOUND) { 8599 m3->mc_flags |= C_EOF; 8600 rc = MDB_SUCCESS; 8601 continue; 8602 } 8603 if (rc) 8604 goto fail; 8605 } 8606 if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { 8607 MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); 8608 /* If this node has dupdata, it may need to be reinited 8609 * because its data has moved. 8610 * If the xcursor was not initd it must be reinited. 8611 * Else if node points to a subDB, nothing is needed. 8612 * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset. 8613 */ 8614 if (node->mn_flags & F_DUPDATA) { 8615 if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { 8616 if (!(node->mn_flags & F_SUBDATA)) 8617 m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); 8618 } else { 8619 mdb_xcursor_init1(m3, node); 8620 rc = mdb_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); 8621 if (rc) 8622 goto fail; 8623 } 8624 } 8625 m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; 8626 } 8627 } 8628 } 8629 } 8630 mc->mc_flags |= C_DEL; 8631 8632 fail: 8633 if (rc) 8634 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 8635 return rc; 8636 } 8637 8638 int 8639 mdb_del(MDB_txn *txn, MDB_dbi dbi, 8640 MDB_val *key, MDB_val *data) 8641 { 8642 DKBUF; 8643 DDBUF; 8644 if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 8645 return EINVAL; 8646 8647 if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) 8648 return (txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; 8649 8650 if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { 8651 /* must ignore any data */ 8652 data = NULL; 8653 } 8654 8655 MDB_TRACE(("%p, %u, %"Z"u[%s], %"Z"u%s", 8656 txn, dbi, key ? key->mv_size:0, DKEY(key), data ? data->mv_size:0, 8657 data ? mdb_dval(txn, dbi, data, dbuf):"")); 8658 return mdb_del0(txn, dbi, key, data, 0); 8659 } 8660 8661 static int 8662 mdb_del0(MDB_txn *txn, MDB_dbi dbi, 8663 MDB_val *key, MDB_val *data, unsigned flags) 8664 { 8665 MDB_cursor mc; 8666 MDB_xcursor mx; 8667 MDB_cursor_op op; 8668 MDB_val rdata, *xdata; 8669 int rc, exact = 0; 8670 DKBUF; 8671 8672 DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key))); 8673 8674 mdb_cursor_init(&mc, txn, dbi, &mx); 8675 8676 if (data) { 8677 op = MDB_GET_BOTH; 8678 rdata = *data; 8679 xdata = &rdata; 8680 } else { 8681 op = MDB_SET; 8682 xdata = NULL; 8683 flags |= MDB_NODUPDATA; 8684 } 8685 rc = mdb_cursor_set(&mc, key, xdata, op, &exact); 8686 if (rc == 0) { 8687 /* let mdb_page_split know about this cursor if needed: 8688 * delete will trigger a rebalance; if it needs to move 8689 * a node from one page to another, it will have to 8690 * update the parent's separator key(s). If the new sepkey 8691 * is larger than the current one, the parent page may 8692 * run out of space, triggering a split. We need this 8693 * cursor to be consistent until the end of the rebalance. 8694 */ 8695 mc.mc_flags |= C_UNTRACK; 8696 mc.mc_next = txn->mt_cursors[dbi]; 8697 txn->mt_cursors[dbi] = &mc; 8698 rc = _mdb_cursor_del(&mc, flags); 8699 txn->mt_cursors[dbi] = mc.mc_next; 8700 } 8701 return rc; 8702 } 8703 8704 /** Split a page and insert a new node. 8705 * Set #MDB_TXN_ERROR on failure. 8706 * @param[in,out] mc Cursor pointing to the page and desired insertion index. 8707 * The cursor will be updated to point to the actual page and index where 8708 * the node got inserted after the split. 8709 * @param[in] newkey The key for the newly inserted node. 
8710 * @param[in] newdata The data for the newly inserted node. 8711 * @param[in] newpgno The page number, if the new node is a branch node. 8712 * @param[in] nflags The #NODE_ADD_FLAGS for the new node. 8713 * @return 0 on success, non-zero on failure. 8714 */ 8715 static int 8716 mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, 8717 unsigned int nflags) 8718 { 8719 unsigned int flags; 8720 int rc = MDB_SUCCESS, new_root = 0, did_split = 0; 8721 indx_t newindx; 8722 pgno_t pgno = 0; 8723 int i, j, split_indx, nkeys, pmax; 8724 MDB_env *env = mc->mc_txn->mt_env; 8725 MDB_node *node; 8726 MDB_val sepkey, rkey, xdata, *rdata = &xdata; 8727 MDB_page *copy = NULL; 8728 MDB_page *mp, *rp, *pp; 8729 int ptop; 8730 MDB_cursor mn; 8731 DKBUF; 8732 8733 mp = mc->mc_pg[mc->mc_top]; 8734 newindx = mc->mc_ki[mc->mc_top]; 8735 nkeys = NUMKEYS(mp); 8736 8737 DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", 8738 IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, 8739 DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); 8740 8741 /* Create a right sibling. */ 8742 if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) 8743 return rc; 8744 rp->mp_pad = mp->mp_pad; 8745 DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); 8746 8747 /* Usually when splitting the root page, the cursor 8748 * height is 1. But when called from mdb_update_key, 8749 * the cursor height may be greater because it walks 8750 * up the stack while finding the branch slot to update. 8751 */ 8752 if (mc->mc_top < 1) { 8753 if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) 8754 goto done; 8755 /* shift current top to make room for new parent */ 8756 for (i=mc->mc_snum; i>0; i--) { 8757 mc->mc_pg[i] = mc->mc_pg[i-1]; 8758 mc->mc_ki[i] = mc->mc_ki[i-1]; 8759 } 8760 mc->mc_pg[0] = pp; 8761 mc->mc_ki[0] = 0; 8762 mc->mc_db->md_root = pp->mp_pgno; 8763 DPRINTF(("root split! 
new root = %"Z"u", pp->mp_pgno)); 8764 new_root = mc->mc_db->md_depth++; 8765 8766 /* Add left (implicit) pointer. */ 8767 if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { 8768 /* undo the pre-push */ 8769 mc->mc_pg[0] = mc->mc_pg[1]; 8770 mc->mc_ki[0] = mc->mc_ki[1]; 8771 mc->mc_db->md_root = mp->mp_pgno; 8772 mc->mc_db->md_depth--; 8773 goto done; 8774 } 8775 mc->mc_snum++; 8776 mc->mc_top++; 8777 ptop = 0; 8778 } else { 8779 ptop = mc->mc_top-1; 8780 DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); 8781 } 8782 8783 mdb_cursor_copy(mc, &mn); 8784 mn.mc_xcursor = NULL; 8785 mn.mc_pg[mn.mc_top] = rp; 8786 mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; 8787 8788 if (nflags & MDB_APPEND) { 8789 mn.mc_ki[mn.mc_top] = 0; 8790 sepkey = *newkey; 8791 split_indx = newindx; 8792 nkeys = 0; 8793 } else { 8794 8795 split_indx = (nkeys+1) / 2; 8796 8797 if (IS_LEAF2(rp)) { 8798 char *split, *ins; 8799 int x; 8800 unsigned int lsize, rsize, ksize; 8801 /* Move half of the keys to the right sibling */ 8802 x = mc->mc_ki[mc->mc_top] - split_indx; 8803 ksize = mc->mc_db->md_pad; 8804 split = LEAF2KEY(mp, split_indx, ksize); 8805 rsize = (nkeys - split_indx) * ksize; 8806 lsize = (nkeys - split_indx) * sizeof(indx_t); 8807 mp->mp_lower -= lsize; 8808 rp->mp_lower += lsize; 8809 mp->mp_upper += rsize - lsize; 8810 rp->mp_upper -= rsize - lsize; 8811 sepkey.mv_size = ksize; 8812 if (newindx == split_indx) { 8813 sepkey.mv_data = newkey->mv_data; 8814 } else { 8815 sepkey.mv_data = split; 8816 } 8817 if (x<0) { 8818 ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); 8819 memcpy(rp->mp_ptrs, split, rsize); 8820 sepkey.mv_data = rp->mp_ptrs; 8821 memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); 8822 memcpy(ins, newkey->mv_data, ksize); 8823 mp->mp_lower += sizeof(indx_t); 8824 mp->mp_upper -= ksize - sizeof(indx_t); 8825 } else { 8826 if (x) 8827 memcpy(rp->mp_ptrs, split, x * ksize); 8828 ins = LEAF2KEY(rp, x, ksize); 8829 
memcpy(ins, newkey->mv_data, ksize); 8830 memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); 8831 rp->mp_lower += sizeof(indx_t); 8832 rp->mp_upper -= ksize - sizeof(indx_t); 8833 mc->mc_ki[mc->mc_top] = x; 8834 } 8835 } else { 8836 int psize, nsize, k, keythresh; 8837 8838 /* Maximum free space in an empty page */ 8839 pmax = env->me_psize - PAGEHDRSZ; 8840 /* Threshold number of keys considered "small" */ 8841 keythresh = env->me_psize >> 7; 8842 8843 if (IS_LEAF(mp)) 8844 nsize = mdb_leaf_size(env, newkey, newdata); 8845 else 8846 nsize = mdb_branch_size(env, newkey); 8847 nsize = EVEN(nsize); 8848 8849 /* grab a page to hold a temporary copy */ 8850 copy = mdb_page_malloc(mc->mc_txn, 1); 8851 if (copy == NULL) { 8852 rc = ENOMEM; 8853 goto done; 8854 } 8855 copy->mp_pgno = mp->mp_pgno; 8856 copy->mp_flags = mp->mp_flags; 8857 copy->mp_lower = (PAGEHDRSZ-PAGEBASE); 8858 copy->mp_upper = env->me_psize - PAGEBASE; 8859 8860 /* prepare to insert */ 8861 for (i=0, j=0; i<nkeys; i++) { 8862 if (i == newindx) { 8863 copy->mp_ptrs[j++] = 0; 8864 } 8865 copy->mp_ptrs[j++] = mp->mp_ptrs[i]; 8866 } 8867 8868 /* When items are relatively large the split point needs 8869 * to be checked, because being off-by-one will make the 8870 * difference between success or failure in mdb_node_add. 8871 * 8872 * It's also relevant if a page happens to be laid out 8873 * such that one half of its nodes are all "small" and 8874 * the other half of its nodes are "large." If the new 8875 * item is also "large" and falls on the half with 8876 * "large" nodes, it also may not fit. 8877 * 8878 * As a final tweak, if the new item goes on the last 8879 * spot on the page (and thus, onto the new page), bias 8880 * the split so the new page is emptier than the old page. 8881 * This yields better packing during sequential inserts. 
8882 */ 8883 if (nkeys < keythresh || nsize > pmax/16 || newindx >= nkeys) { 8884 /* Find split point */ 8885 psize = 0; 8886 if (newindx <= split_indx || newindx >= nkeys) { 8887 i = 0; j = 1; 8888 k = newindx >= nkeys ? nkeys : split_indx+1+IS_LEAF(mp); 8889 } else { 8890 i = nkeys; j = -1; 8891 k = split_indx-1; 8892 } 8893 for (; i!=k; i+=j) { 8894 if (i == newindx) { 8895 psize += nsize; 8896 node = NULL; 8897 } else { 8898 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); 8899 psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); 8900 if (IS_LEAF(mp)) { 8901 if (F_ISSET(node->mn_flags, F_BIGDATA)) 8902 psize += sizeof(pgno_t); 8903 else 8904 psize += NODEDSZ(node); 8905 } 8906 psize = EVEN(psize); 8907 } 8908 if (psize > pmax || i == k-j) { 8909 split_indx = i + (j<0); 8910 break; 8911 } 8912 } 8913 } 8914 if (split_indx == newindx) { 8915 sepkey.mv_size = newkey->mv_size; 8916 sepkey.mv_data = newkey->mv_data; 8917 } else { 8918 node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); 8919 sepkey.mv_size = node->mn_ksize; 8920 sepkey.mv_data = NODEKEY(node); 8921 } 8922 } 8923 } 8924 8925 DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey))); 8926 8927 /* Copy separator key to the parent. 8928 */ 8929 if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { 8930 int snum = mc->mc_snum; 8931 mn.mc_snum--; 8932 mn.mc_top--; 8933 did_split = 1; 8934 /* We want other splits to find mn when doing fixups */ 8935 WITH_CURSOR_TRACKING(mn, 8936 rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); 8937 if (rc) 8938 goto done; 8939 8940 /* root split? */ 8941 if (mc->mc_snum > snum) { 8942 ptop++; 8943 } 8944 /* Right page might now have changed parent. 8945 * Check if left page also changed parent. 
8946 */ 8947 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 8948 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { 8949 for (i=0; i<ptop; i++) { 8950 mc->mc_pg[i] = mn.mc_pg[i]; 8951 mc->mc_ki[i] = mn.mc_ki[i]; 8952 } 8953 mc->mc_pg[ptop] = mn.mc_pg[ptop]; 8954 if (mn.mc_ki[ptop]) { 8955 mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; 8956 } else { 8957 /* find right page's left sibling */ 8958 mc->mc_ki[ptop] = mn.mc_ki[ptop]; 8959 mdb_cursor_sibling(mc, 0); 8960 } 8961 } 8962 } else { 8963 mn.mc_top--; 8964 rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); 8965 mn.mc_top++; 8966 } 8967 if (rc != MDB_SUCCESS) { 8968 goto done; 8969 } 8970 if (nflags & MDB_APPEND) { 8971 mc->mc_pg[mc->mc_top] = rp; 8972 mc->mc_ki[mc->mc_top] = 0; 8973 rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); 8974 if (rc) 8975 goto done; 8976 for (i=0; i<mc->mc_top; i++) 8977 mc->mc_ki[i] = mn.mc_ki[i]; 8978 } else if (!IS_LEAF2(mp)) { 8979 /* Move nodes */ 8980 mc->mc_pg[mc->mc_top] = rp; 8981 i = split_indx; 8982 j = 0; 8983 do { 8984 if (i == newindx) { 8985 rkey.mv_data = newkey->mv_data; 8986 rkey.mv_size = newkey->mv_size; 8987 if (IS_LEAF(mp)) { 8988 rdata = newdata; 8989 } else 8990 pgno = newpgno; 8991 flags = nflags; 8992 /* Update index for the new key. */ 8993 mc->mc_ki[mc->mc_top] = j; 8994 } else { 8995 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); 8996 rkey.mv_data = NODEKEY(node); 8997 rkey.mv_size = node->mn_ksize; 8998 if (IS_LEAF(mp)) { 8999 xdata.mv_data = NODEDATA(node); 9000 xdata.mv_size = NODEDSZ(node); 9001 rdata = &xdata; 9002 } else 9003 pgno = NODEPGNO(node); 9004 flags = node->mn_flags; 9005 } 9006 9007 if (!IS_LEAF(mp) && j == 0) { 9008 /* First branch index doesn't need key data. 
*/ 9009 rkey.mv_size = 0; 9010 } 9011 9012 rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); 9013 if (rc) 9014 goto done; 9015 if (i == nkeys) { 9016 i = 0; 9017 j = 0; 9018 mc->mc_pg[mc->mc_top] = copy; 9019 } else { 9020 i++; 9021 j++; 9022 } 9023 } while (i != split_indx); 9024 9025 nkeys = NUMKEYS(copy); 9026 for (i=0; i<nkeys; i++) 9027 mp->mp_ptrs[i] = copy->mp_ptrs[i]; 9028 mp->mp_lower = copy->mp_lower; 9029 mp->mp_upper = copy->mp_upper; 9030 memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), 9031 env->me_psize - copy->mp_upper - PAGEBASE); 9032 9033 /* reset back to original page */ 9034 if (newindx < split_indx) { 9035 mc->mc_pg[mc->mc_top] = mp; 9036 } else { 9037 mc->mc_pg[mc->mc_top] = rp; 9038 mc->mc_ki[ptop]++; 9039 /* Make sure mc_ki is still valid. 9040 */ 9041 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 9042 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { 9043 for (i=0; i<=ptop; i++) { 9044 mc->mc_pg[i] = mn.mc_pg[i]; 9045 mc->mc_ki[i] = mn.mc_ki[i]; 9046 } 9047 } 9048 } 9049 if (nflags & MDB_RESERVE) { 9050 node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); 9051 if (!(node->mn_flags & F_BIGDATA)) 9052 newdata->mv_data = NODEDATA(node); 9053 } 9054 } else { 9055 if (newindx >= split_indx) { 9056 mc->mc_pg[mc->mc_top] = rp; 9057 mc->mc_ki[ptop]++; 9058 /* Make sure mc_ki is still valid. 
9059 */ 9060 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && 9061 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { 9062 for (i=0; i<=ptop; i++) { 9063 mc->mc_pg[i] = mn.mc_pg[i]; 9064 mc->mc_ki[i] = mn.mc_ki[i]; 9065 } 9066 } 9067 } 9068 } 9069 9070 { 9071 /* Adjust other cursors pointing to mp */ 9072 MDB_cursor *m2, *m3; 9073 MDB_dbi dbi = mc->mc_dbi; 9074 nkeys = NUMKEYS(mp); 9075 9076 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { 9077 if (mc->mc_flags & C_SUB) 9078 m3 = &m2->mc_xcursor->mx_cursor; 9079 else 9080 m3 = m2; 9081 if (m3 == mc) 9082 continue; 9083 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) 9084 continue; 9085 if (new_root) { 9086 int k; 9087 /* sub cursors may be on different DB */ 9088 if (m3->mc_pg[0] != mp) 9089 continue; 9090 /* root split */ 9091 for (k=new_root; k>=0; k--) { 9092 m3->mc_ki[k+1] = m3->mc_ki[k]; 9093 m3->mc_pg[k+1] = m3->mc_pg[k]; 9094 } 9095 if (m3->mc_ki[0] >= nkeys) { 9096 m3->mc_ki[0] = 1; 9097 } else { 9098 m3->mc_ki[0] = 0; 9099 } 9100 m3->mc_pg[0] = mc->mc_pg[0]; 9101 m3->mc_snum++; 9102 m3->mc_top++; 9103 } 9104 if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { 9105 if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) 9106 m3->mc_ki[mc->mc_top]++; 9107 if (m3->mc_ki[mc->mc_top] >= nkeys) { 9108 m3->mc_pg[mc->mc_top] = rp; 9109 m3->mc_ki[mc->mc_top] -= nkeys; 9110 for (i=0; i<mc->mc_top; i++) { 9111 m3->mc_ki[i] = mn.mc_ki[i]; 9112 m3->mc_pg[i] = mn.mc_pg[i]; 9113 } 9114 } 9115 } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && 9116 m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { 9117 m3->mc_ki[ptop]++; 9118 } 9119 if (IS_LEAF(mp)) 9120 XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]); 9121 } 9122 } 9123 DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); 9124 9125 done: 9126 if (copy) /* tmp page */ 9127 mdb_page_free(env, copy); 9128 if (rc) 9129 mc->mc_txn->mt_flags |= MDB_TXN_ERROR; 9130 return rc; 9131 } 9132 9133 int 9134 
mdb_put(MDB_txn *txn, MDB_dbi dbi, 9135 MDB_val *key, MDB_val *data, unsigned int flags) 9136 { 9137 MDB_cursor mc; 9138 MDB_xcursor mx; 9139 int rc; 9140 DKBUF; 9141 DDBUF; 9142 9143 if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) 9144 return EINVAL; 9145 9146 if (flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) 9147 return EINVAL; 9148 9149 if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) 9150 return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; 9151 9152 MDB_TRACE(("%p, %u, %"Z"u[%s], %"Z"u%s, %u", 9153 txn, dbi, key ? key->mv_size:0, DKEY(key), data->mv_size, mdb_dval(txn, dbi, data, dbuf), flags)); 9154 mdb_cursor_init(&mc, txn, dbi, &mx); 9155 mc.mc_next = txn->mt_cursors[dbi]; 9156 txn->mt_cursors[dbi] = &mc; 9157 rc = _mdb_cursor_put(&mc, key, data, flags); 9158 txn->mt_cursors[dbi] = mc.mc_next; 9159 return rc; 9160 } 9161 9162 #ifndef MDB_WBUF 9163 #define MDB_WBUF (1024*1024) 9164 #endif 9165 #define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ 9166 9167 /** State needed for a double-buffering compacting copy. */ 9168 typedef struct mdb_copy { 9169 MDB_env *mc_env; 9170 MDB_txn *mc_txn; 9171 pthread_mutex_t mc_mutex; 9172 pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ 9173 char *mc_wbuf[2]; 9174 char *mc_over[2]; 9175 int mc_wlen[2]; 9176 int mc_olen[2]; 9177 pgno_t mc_next_pgno; 9178 HANDLE mc_fd; 9179 int mc_toggle; /**< Buffer number in provider */ 9180 int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ 9181 /** Error code. Never cleared if set. Both threads can set nonzero 9182 * to fail the copy. Not mutex-protected, LMDB expects atomic int. 9183 */ 9184 volatile int mc_error; 9185 } mdb_copy; 9186 9187 /** Dedicated writer thread for compacting copy. 
*/
static THREAD_RET ESECT CALL_CONV
mdb_env_copythr(void *arg)
{
	mdb_copy *my = arg;
	char *ptr;
	int toggle = 0, wsize, rc;
	/* DO_WRITE leaves rc nonzero on success and zero on failure, and
	 * stores the number of bytes actually written in len.
	 */
#ifdef _WIN32
	DWORD len;
#define DO_WRITE(rc, fd, ptr, w2, len)	rc = WriteFile(fd, ptr, w2, &len, NULL)
#else
	int len;
#define DO_WRITE(rc, fd, ptr, w2, len)	len = write(fd, ptr, w2); rc = (len >= 0)
#ifdef SIGPIPE
	/* Block SIGPIPE in this thread; the EPIPE branch below collects
	 * the pending signal with sigwait(). A failure to set the mask is
	 * recorded as the copy error.
	 */
	sigset_t set;
	sigemptyset(&set);
	sigaddset(&set, SIGPIPE);
	if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0)
		my->mc_error = rc;
#endif
#endif

	/* mc_mutex is held for the whole loop except while blocked in
	 * pthread_cond_wait().
	 */
	pthread_mutex_lock(&my->mc_mutex);
	for(;;) {
		/* Wait until the provider hands over a buffer and/or EOF */
		while (!my->mc_new)
			pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
		if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */
			break;
		wsize = my->mc_wlen[toggle];
		ptr = my->mc_wbuf[toggle];
again:
		rc = MDB_SUCCESS;
		/* Write out [ptr, ptr+wsize), retrying after partial writes.
		 * Nothing is written once an error has been recorded.
		 */
		while (wsize > 0 && !my->mc_error) {
			DO_WRITE(rc, my->mc_fd, ptr, wsize, len);
			if (!rc) {
				rc = ErrCode();
#if defined(SIGPIPE) && !defined(_WIN32)
				if (rc == EPIPE) {
					/* Collect the pending SIGPIPE, otherwise at least OS X
					 * gives it to the process on thread-exit (ITS#8504).
					 */
					int tmp;
					sigwait(&set, &tmp);
				}
#endif
				break;
			} else if (len > 0) {
				rc = MDB_SUCCESS;
				ptr += len;
				wsize -= len;
				continue;
			} else {
				/* The write succeeded but made no progress */
				rc = EIO;
				break;
			}
		}
		if (rc) {
			my->mc_error = rc;
		}
		/* If there's an overflow page tail, write it too */
		if (my->mc_olen[toggle]) {
			wsize = my->mc_olen[toggle];
			ptr = my->mc_over[toggle];
			my->mc_olen[toggle] = 0;
			goto again;
		}
		my->mc_wlen[toggle] = 0;
		toggle ^= 1;
		/* Return the empty buffer to provider */
		my->mc_new--;
		pthread_cond_signal(&my->mc_cond);
	}
	pthread_mutex_unlock(&my->mc_mutex);
	return (THREAD_RET)0;
#undef DO_WRITE
}

/** Give buffer and/or #MDB_EOF to writer thread, await unused buffer.
 *
 * @param[in] my control structure.
 * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending).
 * @return the current value of my->mc_error (0 if no error was recorded).
 */
static int ESECT
mdb_env_cthr_toggle(mdb_copy *my, int adjust)
{
	pthread_mutex_lock(&my->mc_mutex);
	my->mc_new += adjust;
	pthread_cond_signal(&my->mc_cond);
	while (my->mc_new & 2)		/* both buffers in use */
		pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
	pthread_mutex_unlock(&my->mc_mutex);

	/* Switch to the other buffer only if one was actually handed off */
	my->mc_toggle ^= (adjust & 1);
	/* Both threads reset mc_wlen, to be safe from threading errors */
	my->mc_wlen[my->mc_toggle] = 0;
	return my->mc_error;
}

/** Depth-first tree traversal for compacting copy.
 * @param[in] my control structure.
 * @param[in,out] pg database root. On success it is overwritten with
 *	the page number given to the root in the copy.
 * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB.
 * @return 0 on success, non-zero on failure.
 */
static int ESECT
mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
{
	MDB_cursor mc = {0};
	MDB_node *ni;
	MDB_page *mo, *mp, *leaf;
	char *buf, *ptr;
	int rc, toggle;
	unsigned int i;

	/* Empty DB, nothing to do */
	if (*pg == P_INVALID)
		return MDB_SUCCESS;

	mc.mc_snum = 1;
	mc.mc_txn = my->mc_txn;

	/* Position the cursor on the first leaf below the root */
	rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL);
	if (rc)
		return rc;
	rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST);
	if (rc)
		return rc;

	/* Make cursor pages writable */
	buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum);
	if (buf == NULL)
		return ENOMEM;

	/* Copy every page above the leaf level into buf; the last
	 * page-sized slot of buf is left for the leaf.
	 */
	for (i=0; i<mc.mc_top; i++) {
		mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
		mc.mc_pg[i] = (MDB_page *)ptr;
		ptr += my->mc_env->me_psize;
	}

	/* This is writable space for a leaf page. Usually not needed. */
	leaf = (MDB_page *)ptr;

	toggle = my->mc_toggle;
	while (mc.mc_snum > 0) {
		unsigned n;
		mp = mc.mc_pg[mc.mc_top];
		n = NUMKEYS(mp);

		if (IS_LEAF(mp)) {
			/* Only leaves that are not LEAF2 and not part of a
			 * sorted-duplicate sub-DB are scanned for overflow
			 * and sub-DB nodes.
			 */
			if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
				for (i=0; i<n; i++) {
					ni = NODEPTR(mp, i);
					if (ni->mn_flags & F_BIGDATA) {
						MDB_page *omp;
						pgno_t pg;

						/* Need writable leaf */
						if (mp != leaf) {
							mc.mc_pg[mc.mc_top] = leaf;
							mdb_page_copy(leaf, mp, my->mc_env->me_psize);
							mp = leaf;
							ni = NODEPTR(mp, i);
						}

						/* Read the old overflow page number, then point
						 * the node at the page number it gets in the copy.
						 */
						memcpy(&pg, NODEDATA(ni), sizeof(pg));
						memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t));
						rc = mdb_page_get(&mc, pg, &omp, NULL);
						if (rc)
							goto done;
						if (my->mc_wlen[toggle] >= MDB_WBUF) {
							rc = mdb_env_cthr_toggle(my, 1);
							if (rc)
								goto done;
							toggle = my->mc_toggle;
						}
						/* First overflow page goes through the write buffer */
						mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
						memcpy(mo, omp, my->mc_env->me_psize);
						mo->mp_pgno = my->mc_next_pgno;
						my->mc_next_pgno += omp->mp_pages;
						my->mc_wlen[toggle] += my->mc_env->me_psize;
						if (omp->mp_pages > 1) {
							/* The remaining pages are not copied into the
							 * buffer: the writer is given a pointer to them
							 * and the buffer is handed off immediately.
							 */
							my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
							my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
							rc = mdb_env_cthr_toggle(my, 1);
							if (rc)
								goto done;
							toggle = my->mc_toggle;
						}
					} else if (ni->mn_flags & F_SUBDATA) {
						MDB_db db;

						/* Need writable leaf */
						if (mp != leaf) {
							mc.mc_pg[mc.mc_top] = leaf;
							mdb_page_copy(leaf, mp, my->mc_env->me_psize);
							mp = leaf;
							ni = NODEPTR(mp, i);
						}

						/* Walk the sub-DB recursively, then store its
						 * record (with the updated root) back in the node.
						 */
						memcpy(&db, NODEDATA(ni), sizeof(db));
						my->mc_toggle = toggle;
						rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA);
						if (rc)
							goto done;
						toggle = my->mc_toggle;
						memcpy(NODEDATA(ni), &db, sizeof(db));
					}
				}
			}
		} else {
			/* Branch page: advance to its next child, if any */
			mc.mc_ki[mc.mc_top]++;
			if (mc.mc_ki[mc.mc_top] < n) {
				pgno_t pg;
again:
				ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
				pg = NODEPGNO(ni);
				rc = mdb_page_get(&mc, pg, &mp, NULL);
				if (rc)
					goto done;
				mc.mc_top++;
				mc.mc_snum++;
				mc.mc_ki[mc.mc_top] = 0;
				if (IS_BRANCH(mp)) {
					/* Whenever we advance to a sibling branch page,
					 * we must proceed all the way down to its first leaf.
					 */
					mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
					goto again;
				} else
					mc.mc_pg[mc.mc_top] = mp;
				continue;
			}
		}
		/* This page is finished (a leaf, or a branch whose children
		 * have all been visited): queue it with its new page number.
		 */
		if (my->mc_wlen[toggle] >= MDB_WBUF) {
			rc = mdb_env_cthr_toggle(my, 1);
			if (rc)
				goto done;
			toggle = my->mc_toggle;
		}
		mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
		mdb_page_copy(mo, mp, my->mc_env->me_psize);
		mo->mp_pgno = my->mc_next_pgno++;
		my->mc_wlen[toggle] += my->mc_env->me_psize;
		if (mc.mc_top) {
			/* Update parent if there is one */
			ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]);
			SETPGNO(ni, mo->mp_pgno);
			mdb_cursor_pop(&mc);
		} else {
			/* Otherwise we're done */
			*pg = mo->mp_pgno;
			break;
		}
	}
done:
	free(buf);
	return rc;
}

/** Copy environment with compaction.
 * Starts a writer thread running #mdb_env_copythr(), builds the two
 * meta pages in the first write buffer, then walks the main DB with
 * #mdb_env_cwalk().
 * @param[in] env the environment to copy.
 * @param[in] fd the handle the writer thread writes to.
 * @return 0 on success, non-zero on failure.
 */
static int ESECT
mdb_env_copyfd1(MDB_env *env, HANDLE fd)
{
	MDB_meta *mm;
	MDB_page *mp;
	mdb_copy my = {0};
	MDB_txn *txn = NULL;
	pthread_t thr;
	pgno_t root, new_root;
	int rc = MDB_SUCCESS;

	/* Set up the mutex/condition pair and one aligned allocation that
	 * holds both write buffers.
	 */
#ifdef _WIN32
	if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) ||
		!(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) {
		rc = ErrCode();
		goto done;
	}
	my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize);
	if (my.mc_wbuf[0] == NULL) {
		/* _aligned_malloc() sets errno, but we use Windows error codes */
		rc = ERROR_NOT_ENOUGH_MEMORY;
		goto done;
	}
#else
	if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0)
		return rc;
	if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0)
		goto done2;
#ifdef HAVE_MEMALIGN
	my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2);
	if (my.mc_wbuf[0] == NULL) {
		rc = errno;
		goto done;
	}
#else
	{
		void *p;
		if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0)
			goto done;
		my.mc_wbuf[0] = p;
	}
#endif
#endif
	memset(my.mc_wbuf[0], 0, MDB_WBUF*2);
	my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF;
	/* Data pages in the copy are numbered from just after the meta pages */
	my.mc_next_pgno = NUM_METAS;
	my.mc_env = env;
	my.mc_fd = fd;
	rc = THREAD_CREATE(thr, mdb_env_copythr, &my);
	if (rc)
		goto done;

	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
	if (rc)
		goto finish;

	/* Build meta page 0 at the start of the first write buffer */
	mp = (MDB_page *)my.mc_wbuf[0];
	memset(mp, 0, NUM_METAS * env->me_psize);
	mp->mp_pgno = 0;
	mp->mp_flags = P_META;
	mm = (MDB_meta *)METADATA(mp);
	mdb_env_init_meta0(env, mm);
	mm->mm_address = env->me_metas[0]->mm_address;

	/* Meta page 1 starts as a copy of meta page 0; mm is then pointed
	 * at page 1 so the updates below apply to it.
	 */
	mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize);
	mp->mp_pgno = 1;
	mp->mp_flags = P_META;
	*(MDB_meta *)METADATA(mp) = *mm;
	mm = (MDB_meta *)METADATA(mp);

	/* Set metapage 1 with current main DB */
	root = new_root = txn->mt_dbs[MAIN_DBI].md_root;
	if (root != P_INVALID) {
		/* Count free pages + freeDB pages.  Subtract from last_pg
		 * to find the new last_pg, which also becomes the new root.
		 */
		MDB_ID freecount = 0;
		MDB_cursor mc;
		MDB_val key, data;
		mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
		/* The first MDB_ID of each freeDB record is added as a page count */
		while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
			freecount += *(MDB_ID *)data.mv_data;
		if (rc != MDB_NOTFOUND)
			goto finish;
		freecount += txn->mt_dbs[FREE_DBI].md_branch_pages +
			txn->mt_dbs[FREE_DBI].md_leaf_pages +
			txn->mt_dbs[FREE_DBI].md_overflow_pages;

		new_root = txn->mt_next_pgno - 1 - freecount;
		mm->mm_last_pg = new_root;
		mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
		mm->mm_dbs[MAIN_DBI].md_root = new_root;
	} else {
		/* When the DB is empty, handle it specially to
		 * fix any breakage like page leaks from ITS#8174.
		 */
		mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags;
	}
	if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) {
		mm->mm_txnid = 1;		/* use metapage 1 */
	}

	/* The meta pages already occupy the start of buffer 0 */
	my.mc_wlen[0] = env->me_psize * NUM_METAS;
	my.mc_txn = txn;
	rc = mdb_env_cwalk(&my, &root, 0);
	/* The walk must end with the root at the page number computed above */
	if (rc == MDB_SUCCESS && root != new_root) {
		rc = MDB_INCOMPATIBLE;	/* page leak or corrupt DB */
	}

finish:
	/* Record any error (the writer stops writing once mc_error is set),
	 * hand over the last buffer together with EOF, and wait for the
	 * thread to finish.
	 */
	if (rc)
		my.mc_error = rc;
	mdb_env_cthr_toggle(&my, 1 | MDB_EOF);
	rc = THREAD_FINISH(thr);
	_mdb_txn_abort(txn);

done:
#ifdef _WIN32
	if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]);
	if (my.mc_cond)  CloseHandle(my.mc_cond);
	if (my.mc_mutex) CloseHandle(my.mc_mutex);
#else
	free(my.mc_wbuf[0]);
	pthread_cond_destroy(&my.mc_cond);
done2:
	pthread_mutex_destroy(&my.mc_mutex);
#endif
	/* rc may have been overwritten by THREAD_FINISH(); fall back to
	 * the error recorded in the control structure.
	 */
	return rc ? rc : my.mc_error;
}

/** Copy environment as-is.
 * Writes the meta pages first (with writers blocked when the
 * environment has a lock table), then the remaining pages up to the
 * transaction's next page number, limited to the data file size.
 * @param[in] env the environment to copy.
 * @param[in] fd the handle to write to.
 * @return 0 on success, non-zero on failure.
 */
static int ESECT
mdb_env_copyfd0(MDB_env *env, HANDLE fd)
{
	MDB_txn *txn = NULL;
	mdb_mutexref_t wmutex = NULL;
	int rc;
	size_t wsize, w3;
	char *ptr;
	/* DO_WRITE leaves rc nonzero on success and zero on failure, and
	 * stores the number of bytes actually written in len.
	 */
#ifdef _WIN32
	DWORD len, w2;
#define DO_WRITE(rc, fd, ptr, w2, len)	rc = WriteFile(fd, ptr, w2, &len, NULL)
#else
	ssize_t len;
	size_t w2;
#define DO_WRITE(rc, fd, ptr, w2, len)	len = write(fd, ptr, w2); rc = (len >= 0)
#endif

	/* Do the lock/unlock of the reader mutex before starting the
	 * write txn.  Otherwise other read txns could block writers.
	 */
	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
	if (rc)
		return rc;

	if (env->me_txns) {
		/* We must start the actual read txn after blocking writers */
		mdb_txn_end(txn, MDB_END_RESET_TMP);

		/* Temporarily block writers until we snapshot the meta pages */
		wmutex = env->me_wmutex;
		if (LOCK_MUTEX(rc, env, wmutex))
			goto leave;

		rc = mdb_txn_renew0(txn);
		if (rc) {
			UNLOCK_MUTEX(wmutex);
			goto leave;
		}
	}

	/* Write the meta pages from the start of the map */
	wsize = env->me_psize * NUM_METAS;
	ptr = env->me_map;
	w2 = wsize;
	while (w2 > 0) {
		DO_WRITE(rc, fd, ptr, w2, len);
		if (!rc) {
			rc = ErrCode();
			break;
		} else if (len > 0) {
			rc = MDB_SUCCESS;
			ptr += len;
			w2 -= len;
			continue;
		} else {
			/* Non-blocking or async handles are not supported */
			rc = EIO;
			break;
		}
	}
	if (wmutex)
		UNLOCK_MUTEX(wmutex);

	if (rc)
		goto leave;

	/* Total bytes to copy: pages used by this txn, limited to the
	 * current size of the data file.
	 */
	w3 = txn->mt_next_pgno * env->me_psize;
	{
		size_t fsize = 0;
		if ((rc = mdb_fsize(env->me_fd, &fsize)))
			goto leave;
		if (w3 > fsize)
			w3 = fsize;
	}
	/* Remaining bytes after the meta pages, written in pieces of at
	 * most MAX_WRITE; ptr already points just past the meta pages.
	 */
	wsize = w3 - wsize;
	while (wsize > 0) {
		if (wsize > MAX_WRITE)
			w2 = MAX_WRITE;
		else
			w2 = wsize;
		DO_WRITE(rc, fd, ptr, w2, len);
		if (!rc) {
			rc = ErrCode();
			break;
		} else if (len > 0) {
			rc = MDB_SUCCESS;
			ptr += len;
			wsize -= len;
			continue;
		} else {
			rc = EIO;
			break;
		}
	}

leave:
	_mdb_txn_abort(txn);
	return rc;
}

/** Copy the environment to an open handle.
 * Uses the compacting copy when #MDB_CP_COMPACT is set in \b flags,
 * otherwise the as-is copy.
 */
int ESECT
mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags)
{
	if (flags & MDB_CP_COMPACT)
		return mdb_env_copyfd1(env, fd);
	else
		return mdb_env_copyfd0(env, fd);
}

/** As #mdb_env_copyfd2() with no flags. */
int ESECT
mdb_env_copyfd(MDB_env *env, HANDLE fd)
{
	return mdb_env_copyfd2(env, fd, 0);
}

/** Copy the environment to the file named by \b path.
 * The target is opened with #mdb_fopen() using #MDB_O_COPY and mode
 * 0666, filled by #mdb_env_copyfd2(), and closed again.
 * @return 0 on success, non-zero on failure. A close() failure is
 *	reported only when the copy itself succeeded.
 */
int ESECT
mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags)
{
	int rc;
	MDB_name fname;
	HANDLE newfd = INVALID_HANDLE_VALUE;

	rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname);
	if (rc == MDB_SUCCESS) {
		rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd);
		mdb_fname_destroy(fname);
	}
	if (rc == MDB_SUCCESS) {
		rc = mdb_env_copyfd2(env, newfd, flags);
		if (close(newfd) < 0 && rc == MDB_SUCCESS)
			rc = ErrCode();
	}
	return rc;
}

/** As #mdb_env_copy2() with no flags. */
int ESECT
mdb_env_copy(MDB_env *env, const char *path)
{
	return mdb_env_copy2(env, path, 0);
}

/** Set (\b onoff nonzero) or clear (\b onoff zero) environment flags.
 * @return EINVAL if \b flag contains any bit outside CHANGEABLE,
 *	otherwise 0.
 */
int ESECT
mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
{
	/* NOTE(review): unlike the neighbouring accessors, env is not
	 * checked for NULL here -- confirm that is intended.
	 */
	if (flag & ~CHANGEABLE)
		return EINVAL;
	if (onoff)
		env->me_flags |= flag;
	else
		env->me_flags &= ~flag;
	return MDB_SUCCESS;
}

/** Return the CHANGEABLE and CHANGELESS environment flags in \b *arg.
 * @return EINVAL if \b env or \b arg is NULL, otherwise 0.
 */
int ESECT
mdb_env_get_flags(MDB_env *env, unsigned int *arg)
{
	if (!env || !arg)
		return EINVAL;

	*arg = env->me_flags & (CHANGEABLE|CHANGELESS);
	return MDB_SUCCESS;
}

/** Store an application context pointer in the environment.
 * @return EINVAL if \b env is NULL, otherwise 0.
 */
int ESECT
mdb_env_set_userctx(MDB_env *env, void *ctx)
{
	if (!env)
		return EINVAL;
	env->me_userctx = ctx;
	return MDB_SUCCESS;
}

/** Return the stored application context pointer, or NULL if \b env is NULL. */
void * ESECT
mdb_env_get_userctx(MDB_env *env)
{
	return env ? env->me_userctx : NULL;
}

/** Set the assert callback of the environment.
 * The callback is stored only when NDEBUG is not defined.
 * @return EINVAL if \b env is NULL, otherwise 0.
 */
int ESECT
mdb_env_set_assert(MDB_env *env, MDB_assert_func *func)
{
	if (!env)
		return EINVAL;
#ifndef NDEBUG
	env->me_assert_func = func;
#endif
	return MDB_SUCCESS;
}

/** Return the environment's stored path in \b *arg (the pointer itself, not a copy).
 * @return EINVAL if \b env or \b arg is NULL, otherwise 0.
 */
int ESECT
mdb_env_get_path(MDB_env *env, const char **arg)
{
	if (!env || !arg)
		return EINVAL;

	*arg = env->me_path;
	return MDB_SUCCESS;
}

/** Return the environment's data file handle in \b *arg.
 * @return EINVAL if \b env or \b arg is NULL, otherwise 0.
 */
int ESECT
mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
{
	if (!env || !arg)
		return EINVAL;

	*arg = env->me_fd;
	return MDB_SUCCESS;
}

/** Common code for #mdb_stat() and #mdb_env_stat().
 * @param[in] env the environment to operate in.
 * @param[in] db the #MDB_db record containing the stats to return.
 * @param[out] arg the address of an #MDB_stat structure to receive the stats.
 * @return 0, this function always succeeds.
 */
static int ESECT
mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
{
	arg->ms_psize = env->me_psize;
	arg->ms_depth = db->md_depth;
	arg->ms_branch_pages = db->md_branch_pages;
	arg->ms_leaf_pages = db->md_leaf_pages;
	arg->ms_overflow_pages = db->md_overflow_pages;
	arg->ms_entries = db->md_entries;

	return MDB_SUCCESS;
}

/** Return statistics for the main DB, taken from the meta page chosen
 * by #mdb_env_pick_meta().
 * @return EINVAL if \b env or \b arg is NULL, otherwise 0.
 */
int ESECT
mdb_env_stat(MDB_env *env, MDB_stat *arg)
{
	MDB_meta *meta;

	if (env == NULL || arg == NULL)
		return EINVAL;

	meta = mdb_env_pick_meta(env);

	return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg);
}

/** Fill \b *arg with information about the environment.
 * Map address, last page number and last txnid come from the meta page
 * chosen by #mdb_env_pick_meta(); the rest comes from \b env itself.
 * @return EINVAL if \b env or \b arg is NULL, otherwise 0.
 */
int ESECT
mdb_env_info(MDB_env *env, MDB_envinfo *arg)
{
	MDB_meta *meta;

	if (env == NULL || arg == NULL)
		return EINVAL;

	meta = mdb_env_pick_meta(env);
	arg->me_mapaddr = meta->mm_address;
	arg->me_last_pgno = meta->mm_last_pg;
	arg->me_last_txnid = meta->mm_txnid;

	arg->me_mapsize = env->me_mapsize;
	arg->me_maxreaders = env->me_maxreaders;
	/* No lock table: report zero readers */
	arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0;
	return MDB_SUCCESS;
}

/** Set the default comparison functions for a database.
 * Called immediately after a database is opened to set the defaults.
 * The user can then override them with #mdb_set_compare() or
 * #mdb_set_dupsort().
 * @param[in] txn A transaction handle returned by #mdb_txn_begin()
 * @param[in] dbi A database handle returned by #mdb_dbi_open()
 */
static void
mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi)
{
	uint16_t f = txn->mt_dbs[dbi].md_flags;

	/* Key compare: MDB_REVERSEKEY is tested before MDB_INTEGERKEY */
	txn->mt_dbxs[dbi].md_cmp =
		(f & MDB_REVERSEKEY) ? mdb_cmp_memnr :
		(f & MDB_INTEGERKEY) ? mdb_cmp_cint  : mdb_cmp_memn;

	/* Data compare: none (0) unless the DB has MDB_DUPSORT */
	txn->mt_dbxs[dbi].md_dcmp =
		!(f & MDB_DUPSORT) ? 0 :
		((f & MDB_INTEGERDUP)
		 ? ((f & MDB_DUPFIXED)   ? mdb_cmp_int   : mdb_cmp_cint)
		 : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn));
}

/** Open a database in the environment and return its handle in \b *dbi.
 * A NULL \b name selects the main DB. Otherwise the name is looked up
 * among the handles already open in \b txn, then in the main DB, and
 * the DB is created when it is missing and #MDB_CREATE is given.
 * @param[in] txn the transaction to operate in.
 * @param[in] name the database name, or NULL for the main DB.
 * @param[in] flags open flags; anything outside VALID_FLAGS is rejected.
 * @param[out] dbi receives the database handle.
 * @return 0 on success, non-zero on failure.
 */
int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
{
	MDB_val key, data;
	MDB_dbi i;
	MDB_cursor mc;
	MDB_db dummy;
	int rc, dbflag, exact;
	unsigned int unused = 0, seq;
	char *namedup;
	size_t len;

	if (flags & ~VALID_FLAGS)
		return EINVAL;
	if (txn->mt_flags & MDB_TXN_BLOCKED)
		return MDB_BAD_TXN;

	/* main DB? */
	if (!name) {
		*dbi = MAIN_DBI;
		if (flags & PERSISTENT_FLAGS) {
			uint16_t f2 = flags & PERSISTENT_FLAGS;
			/* make sure flag changes get committed */
			if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) {
				txn->mt_dbs[MAIN_DBI].md_flags |= f2;
				txn->mt_flags |= MDB_TXN_DIRTY;
			}
		}
		mdb_default_cmp(txn, MAIN_DBI);
		MDB_TRACE(("%p, (null), %u = %u", txn, flags, MAIN_DBI));
		return MDB_SUCCESS;
	}

	/* The lookup in the main DB below needs its compare function */
	if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
		mdb_default_cmp(txn, MAIN_DBI);
	}

	/* Is the DB already open? */
	len = strlen(name);
	for (i=CORE_DBS; i<txn->mt_numdbs; i++) {
		if (!txn->mt_dbxs[i].md_name.mv_size) {
			/* Remember this free slot */
			if (!unused) unused = i;
			continue;
		}
		if (len == txn->mt_dbxs[i].md_name.mv_size &&
			!strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
			*dbi = i;
			return MDB_SUCCESS;
		}
	}

	/* If no free slot and max hit, fail */
	if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs)
		return MDB_DBS_FULL;

	/* Cannot mix named databases with some mainDB flags */
	if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY))
		return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND;

	/* Find the DB info */
	dbflag = DB_NEW|DB_VALID|DB_USRVALID;
	exact = 0;
	key.mv_size = len;
	key.mv_data = (void *)name;
	mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
	rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact);
	if (rc == MDB_SUCCESS) {
		/* make sure this is actually a DB */
		MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
		if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
			return MDB_INCOMPATIBLE;
	} else {
		/* Only "not found, but create requested" continues, and
		 * only in a transaction that is not read-only.
		 */
		if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE))
			return rc;
		if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
			return EACCES;
	}

	/* Done here so we cannot fail after creating a new DB */
	if ((namedup = strdup(name)) == NULL)
		return ENOMEM;

	if (rc) {
		/* MDB_NOTFOUND and MDB_CREATE: Create new DB */
		data.mv_size = sizeof(MDB_db);
		data.mv_data = &dummy;
		memset(&dummy, 0, sizeof(dummy));
		dummy.md_root = P_INVALID;
		dummy.md_flags = flags & PERSISTENT_FLAGS;
		WITH_CURSOR_TRACKING(mc,
			rc = _mdb_cursor_put(&mc, &key, &data, F_SUBDATA));
		dbflag |= DB_DIRTY;
	}

	if (rc) {
		free(namedup);
	} else {
		/* Got info, register DBI in this txn */
		unsigned int slot = unused ? unused : txn->mt_numdbs;
		txn->mt_dbxs[slot].md_name.mv_data = namedup;
		txn->mt_dbxs[slot].md_name.mv_size = len;
		txn->mt_dbxs[slot].md_rel = NULL;
		txn->mt_dbflags[slot] = dbflag;
		/* txn-> and env-> are the same in read txns, use
		 * tmp variable to avoid undefined assignment
		 */
		seq = ++txn->mt_env->me_dbiseqs[slot];
		txn->mt_dbiseqs[slot] = seq;

		memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
		*dbi = slot;
		mdb_default_cmp(txn, slot);
		/* A fresh slot at the end extends the table */
		if (!unused) {
			txn->mt_numdbs++;
		}
		MDB_TRACE(("%p, %s, %u = %u", txn, name, flags, slot));
	}

	return rc;
}

/** Return statistics for a database in \b *arg.
 * @return EINVAL if \b arg is NULL or the handle fails the
 *	TXN_DBI_EXIST(DB_VALID) check, #MDB_BAD_TXN if the transaction
 *	is blocked, otherwise 0.
 */
int ESECT
mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg)
{
	if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID))
		return EINVAL;

	if (txn->mt_flags & MDB_TXN_BLOCKED)
		return MDB_BAD_TXN;

	if (txn->mt_dbflags[dbi] & DB_STALE) {
		MDB_cursor mc;
		MDB_xcursor mx;
		/* Stale, must read the DB's root. cursor_init does it for us. */
		mdb_cursor_init(&mc, txn, dbi, &mx);
	}
	return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg);
}

/** Close a database handle in the environment.
 * Handles below CORE_DBS or at/above me_maxdbs are ignored. Otherwise
 * the slot's name, flags and sequence number are reset and the name
 * string is freed.
 */
void mdb_dbi_close(MDB_env *env, MDB_dbi dbi)
{
	char *ptr;
	if (dbi < CORE_DBS || dbi >= env->me_maxdbs)
		return;
	ptr = env->me_dbxs[dbi].md_name.mv_data;
	/* If there was no name, this was already closed */
	if (ptr) {
		MDB_TRACE(("%p, %u", env, dbi));
		env->me_dbxs[dbi].md_name.mv_data = NULL;
		env->me_dbxs[dbi].md_name.mv_size = 0;
		env->me_dbflags[dbi] = 0;
		env->me_dbiseqs[dbi]++;
		free(ptr);
	}
}

/** Return the persistent flags of a database in \b *flags.
 * @return EINVAL if the handle fails the TXN_DBI_EXIST(DB_USRVALID)
 *	check, otherwise 0.
 */
int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags)
{
	/* We could return the flags for the FREE_DBI too but what's the point? */
	if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
		return EINVAL;
	*flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS;
	return MDB_SUCCESS;
}

/** Add all the DB's pages to the free list.
 * @param[in] mc Cursor on the DB to free.
 * @param[in] subs non-Zero to check for sub-DBs in this DB.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_drop0(MDB_cursor *mc, int subs)
{
	int rc;

	rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
	if (rc == MDB_SUCCESS) {
		MDB_txn *txn = mc->mc_txn;
		MDB_node *ni;
		MDB_cursor mx;
		unsigned int i;

		/* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves.
		 * This also avoids any P_LEAF2 pages, which have no nodes.
		 * Also if the DB doesn't have sub-DBs and has no overflow
		 * pages, omit scanning leaves.
		 */
		if ((mc->mc_flags & C_SUB) ||
			(!subs && !mc->mc_db->md_overflow_pages))
			mdb_cursor_pop(mc);

		/* mx keeps the original page stack; it is used to reset
		 * mc's pages after each pop below.
		 */
		mdb_cursor_copy(mc, &mx);
		while (mc->mc_snum > 0) {
			MDB_page *mp = mc->mc_pg[mc->mc_top];
			unsigned n = NUMKEYS(mp);
			if (IS_LEAF(mp)) {
				for (i=0; i<n; i++) {
					ni = NODEPTR(mp, i);
					if (ni->mn_flags & F_BIGDATA) {
						MDB_page *omp;
						pgno_t pg;
						/* Free the node's whole run of overflow pages */
						memcpy(&pg, NODEDATA(ni), sizeof(pg));
						rc = mdb_page_get(mc, pg, &omp, NULL);
						if (rc != 0)
							goto done;
						mdb_cassert(mc, IS_OVERFLOW(omp));
						rc = mdb_midl_append_range(&txn->mt_free_pgs,
							pg, omp->mp_pages);
						if (rc)
							goto done;
						mc->mc_db->md_overflow_pages -= omp->mp_pages;
						/* Nothing left to find in the leaves */
						if (!mc->mc_db->md_overflow_pages && !subs)
							break;
					} else if (subs && (ni->mn_flags & F_SUBDATA)) {
						mdb_xcursor_init1(mc, ni);
						rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
						if (rc)
							goto done;
					}
				}
				if (!subs && !mc->mc_db->md_overflow_pages)
					goto pop;
			} else {
				/* Branch page: free every child it points to */
				if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0)
					goto done;
				for (i=0; i<n; i++) {
					pgno_t pg;
					ni = NODEPTR(mp, i);
					pg = NODEPGNO(ni);
					/* free it */
					mdb_midl_xappend(txn->mt_free_pgs, pg);
				}
			}
			if (!mc->mc_top)
				break;
			mc->mc_ki[mc->mc_top] = i;
			rc = mdb_cursor_sibling(mc, 1);
			if (rc) {
				if (rc != MDB_NOTFOUND)
					goto done;
				/* no more siblings, go back to beginning
				 * of previous level.
				 */
pop:
				mdb_cursor_pop(mc);
				mc->mc_ki[0] = 0;
				for (i=1; i<mc->mc_snum; i++) {
					mc->mc_ki[i] = 0;
					mc->mc_pg[i] = mx.mc_pg[i];
				}
			}
		}
		/* free it */
		rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root);
done:
		if (rc)
			txn->mt_flags |= MDB_TXN_ERROR;
	} else if (rc == MDB_NOTFOUND) {
		/* mdb_page_search() found nothing: nothing to free */
		rc = MDB_SUCCESS;
	}
	mc->mc_flags &= ~C_INITIALIZED;
	return rc;
}

/** Empty a database, or delete it and close its handle.
 * @param[in] txn the transaction to operate in; must not be read-only.
 * @param[in] dbi the database handle.
 * @param[in] del 0 to empty the DB, 1 to also delete its record from
 *	the main DB and close the handle. The main DB itself is only
 *	emptied.
 * @return 0 on success, non-zero on failure.
 */
int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del)
{
	MDB_cursor *mc, *m2;
	int rc;

	if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
		return EINVAL;

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
		return EACCES;

	if (TXN_DBI_CHANGED(txn, dbi))
		return MDB_BAD_DBI;

	rc = mdb_cursor_open(txn, dbi, &mc);
	if (rc)
		return rc;

	MDB_TRACE(("%u, %d", dbi, del));
	rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT);
	/* Invalidate the dropped DB's cursors */
	for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next)
		m2->mc_flags &= ~(C_INITIALIZED|C_EOF);
	if (rc)
		goto leave;

	/* Can't delete the main DB */
	if (del && dbi >= CORE_DBS) {
		rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA);
		if (!rc) {
			txn->mt_dbflags[dbi] = DB_STALE;
			mdb_dbi_close(txn->mt_env, dbi);
		} else {
			txn->mt_flags |= MDB_TXN_ERROR;
		}
	} else {
		/* reset the DB record, mark it dirty */
		txn->mt_dbflags[dbi] |= DB_DIRTY;
		txn->mt_dbs[dbi].md_depth = 0;
		txn->mt_dbs[dbi].md_branch_pages = 0;
		txn->mt_dbs[dbi].md_leaf_pages = 0;
		txn->mt_dbs[dbi].md_overflow_pages = 0;
		txn->mt_dbs[dbi].md_entries = 0;
		txn->mt_dbs[dbi].md_root = P_INVALID;

		txn->mt_flags |= MDB_TXN_DIRTY;
	}
leave:
	mdb_cursor_close(mc);
	return rc;
}

/** Set the key comparison function of a database.
 * @return EINVAL if the handle fails the TXN_DBI_EXIST(DB_USRVALID)
 *	check, otherwise 0.
 */
int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
{
	if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
		return EINVAL;

	txn->mt_dbxs[dbi].md_cmp = cmp;
	return MDB_SUCCESS;
}

/** Set the data (duplicate) comparison function of a database.
 * @return EINVAL if the handle fails the TXN_DBI_EXIST(DB_USRVALID)
 *	check, otherwise 0.
 */
int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
{
	if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
		return EINVAL;

	txn->mt_dbxs[dbi].md_dcmp = cmp;
	return MDB_SUCCESS;
}

/** Set the relocation function of a database.
 * @return EINVAL if the handle fails the TXN_DBI_EXIST(DB_USRVALID)
 *	check, otherwise 0.
 */
int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel)
{
	if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
		return EINVAL;

	txn->mt_dbxs[dbi].md_rel = rel;
	return MDB_SUCCESS;
}

/** Set the context pointer stored alongside the relocation function.
 * @return EINVAL if the handle fails the TXN_DBI_EXIST(DB_USRVALID)
 *	check, otherwise 0.
 */
int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx)
{
	if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
		return EINVAL;

	txn->mt_dbxs[dbi].md_relctx = ctx;
	return MDB_SUCCESS;
}

/** Return the value of ENV_MAXKEY() for the environment. */
int ESECT
mdb_env_get_maxkeysize(MDB_env *env)
{
	return ENV_MAXKEY(env);
}

/** Report the reader lock table through a message callback.
 * One line is passed to \b func per slot with a nonzero pid, preceded
 * by a header line before the first one.
 * @param[in] env the environment handle.
 * @param[in] func callback receiving each line and \b ctx.
 * @param[in] ctx opaque pointer passed through to \b func.
 * @return -1 if \b env or \b func is NULL, otherwise the value of the
 *	last \b func call; output stops early when \b func returns
 *	a negative value.
 */
int ESECT
mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
{
	unsigned int i, rdrs;
	MDB_reader *mr;
	char buf[64];
	int rc = 0, first = 1;

	if (!env || !func)
		return -1;
	if (!env->me_txns) {
		return func("(no reader locks)\n", ctx);
	}
	rdrs = env->me_txns->mti_numreaders;
	mr = env->me_txns->mti_readers;
	for (i=0; i<rdrs; i++) {
		if (mr[i].mr_pid) {
			txnid_t	txnid = mr[i].mr_txnid;
			/* A txnid of (txnid_t)-1 is printed as "-" */
			sprintf(buf, txnid == (txnid_t)-1 ?
				"%10d %"Z"x -\n" : "%10d %"Z"x %"Z"u\n",
				(int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid);
			if (first) {
				first = 0;
				/* NOTE(review): as shown here the header is not visibly
				 * padded to the %10d column of the rows -- confirm the
				 * intended spacing.
				 */
				rc = func(" pid thread txnid\n", ctx);
				if (rc < 0)
					break;
			}
			rc = func(buf, ctx);
			if (rc < 0)
				break;
		}
	}
	if (first) {
		rc = func("(no active readers)\n", ctx);
	}
	return rc;
}

/** Insert pid into list if not already present.
 * return -1 if already present.
 * ids[0] holds the number of entries and ids[1..ids[0]] the entries,
 * which the binary search expects in ascending order. The caller must
 * provide room for one more entry.
 */
static int ESECT
mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
{
	/* binary search of pid in list */
	unsigned base = 0;
	unsigned cursor = 1;
	int val = 0;
	unsigned n = ids[0];

	while( 0 < n ) {
		unsigned pivot = n >> 1;
		cursor = base + pivot + 1;
		val = pid - ids[cursor];

		if( val < 0 ) {
			n = pivot;

		} else if ( val > 0 ) {
			base = cursor;
			n -= pivot + 1;

		} else {
			/* found, so it's a duplicate */
			return -1;
		}
	}

	/* pid is larger than the last entry examined: insert after it */
	if( val > 0 ) {
		++cursor;
	}
	ids[0]++;
	/* Shift the tail up by one to open the slot at cursor */
	for (n = ids[0]; n > cursor; n--)
		ids[n] = ids[n-1];
	ids[n] = pid;
	return 0;
}

/** Check for stale entries in the reader lock table.
 * @param[in] env the environment handle.
 * @param[out] dead if non-NULL, receives the number of cleared slots.
 * @return EINVAL if \b env is NULL, 0 if there is no lock table,
 *	otherwise the result of #mdb_reader_check0().
 */
int ESECT
mdb_reader_check(MDB_env *env, int *dead)
{
	if (!env)
		return EINVAL;
	if (dead)
		*dead = 0;
	return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS;
}

/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */
static int ESECT
mdb_reader_check0(MDB_env *env, int rlocked, int *dead)
{
	/* rmutex is NULL when the caller already holds the reader mutex */
	mdb_mutexref_t rmutex = rlocked ? NULL : env->me_rmutex;
	unsigned int i, j, rdrs;
	MDB_reader *mr;
	MDB_PID_T *pids, pid;
	int rc = MDB_SUCCESS, count = 0;

	rdrs = env->me_txns->mti_numreaders;
	/* One extra element: pids[0] is the count used by mdb_pid_insert() */
	pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
	if (!pids)
		return ENOMEM;
	pids[0] = 0;
	mr = env->me_txns->mti_readers;
	for (i=0; i<rdrs; i++) {
		pid = mr[i].mr_pid;
		if (pid && pid != env->me_pid) {
			/* Each distinct pid is checked only once */
			if (mdb_pid_insert(pids, pid) == 0) {
				if (!mdb_reader_pid(env, Pidcheck, pid)) {
					/* Stale reader found */
					j = i;
					if (rmutex) {
						if ((rc = LOCK_MUTEX0(rmutex)) != 0) {
							if ((rc = mdb_mutex_failed(env, rmutex, rc)))
								break;
							rdrs = 0; /* the above checked all readers */
						} else {
							/* Recheck, a new process may have reused pid */
							if (mdb_reader_pid(env, Pidcheck, pid))
								j = rdrs;
						}
					}
					/* Clear every remaining slot owned by this pid */
					for (; j<rdrs; j++)
							if (mr[j].mr_pid == pid) {
								DPRINTF(("clear stale reader pid %u txn %"Z"d",
									(unsigned) pid, mr[j].mr_txnid));
								mr[j].mr_pid = 0;
								count++;
							}
					if (rmutex)
						UNLOCK_MUTEX(rmutex);
				}
			}
		}
	}
	free(pids);
	if (dead)
		*dead = count;
	return rc;
}

#ifdef MDB_ROBUST_SUPPORTED
/** Handle #LOCK_MUTEX0() failure.
 * Try to repair the lock file if the mutex owner died.
 * @param[in] env	the environment handle
 * @param[in] mutex	LOCK_MUTEX0() mutex
 * @param[in] rc	LOCK_MUTEX0() error (nonzero)
 * @return 0 on success with the mutex locked, or an error code on failure.
 */
static int ESECT
mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc)
{
	int rlocked, rc2;
	MDB_meta *meta;

	if (rc == MDB_OWNERDEAD) {
		/* We own the mutex. Clean up after dead previous owner. */
		rc = MDB_SUCCESS;
		rlocked = (mutex == env->me_rmutex);
		if (!rlocked) {
			/* Keep mti_txnid updated, otherwise next writer can
			 * overwrite data which latest meta page refers to.
			 */
			meta = mdb_env_pick_meta(env);
			env->me_txns->mti_txnid = meta->mm_txnid;
			/* env is hosed if the dead thread was ours */
			if (env->me_txn) {
				env->me_flags |= MDB_FATAL_ERROR;
				env->me_txn = NULL;
				rc = MDB_PANIC;
			}
		}
		DPRINTF(("%cmutex owner died, %s", (rlocked ? 'r' : 'w'),
			(rc ? "this process' env is hosed" : "recovering")));
		/* Clear stale readers, then mark the mutex consistent again */
		rc2 = mdb_reader_check0(env, rlocked, NULL);
		if (rc2 == 0)
			rc2 = mdb_mutex_consistent(mutex);
		/* rc (MDB_PANIC) takes precedence over rc2; on any failure
		 * the mutex is released before returning.
		 */
		if (rc || (rc = rc2)) {
			DPRINTF(("LOCK_MUTEX recovery failed, %s", mdb_strerror(rc)));
			UNLOCK_MUTEX(mutex);
		}
	} else {
#ifdef _WIN32
		rc = ErrCode();
#endif
		DPRINTF(("LOCK_MUTEX failed, %s", mdb_strerror(rc)));
	}

	return rc;
}
#endif	/* MDB_ROBUST_SUPPORTED */

#if defined(_WIN32)
/** Convert \b src to new wchar_t[] string with room for \b xtra extra chars
 * @param[in] src the NUL-terminated UTF-8 string to convert.
 * @param[out] dst receives the allocated string (mn_val), its length
 *	without the terminator (mn_len), and mn_alloced = 1.
 * @param[in] xtra number of extra wchar_t elements to allocate.
 * @return 0 on success, ENOMEM if allocation fails, or the ErrCode()
 *	value if the conversion fails.
 */
static int ESECT
utf8_to_utf16(const char *src, MDB_name *dst, int xtra)
{
	int rc, need = 0;
	wchar_t *result = NULL;
	/* First pass (result == NULL, need == 0) only asks for the required
	 * length; the second pass converts into the allocated buffer.
	 */
	for (;;) {					/* malloc result, then fill it in */
		need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need);
		if (!need) {
			rc = ErrCode();
			free(result);
			return rc;
		}
		if (!result) {
			result = malloc(sizeof(wchar_t) * (need + xtra));
			if (!result)
				return ENOMEM;
			continue;
		}
		dst->mn_alloced = 1;
		dst->mn_len = need - 1;
		dst->mn_val = result;
		return MDB_SUCCESS;
	}
}
#endif /* defined(_WIN32) */
/** @} */