1/** @file mdb.c
2 * @brief Lightning memory-mapped database library
3 *
4 * A Btree-based database management library modeled loosely on the
5 * BerkeleyDB API, but much simplified.
6 */
7/*
8 * Copyright 2011-2021 Howard Chu, Symas Corp.
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted only as authorized by the OpenLDAP
13 * Public License.
14 *
15 * A copy of this license is available in the file LICENSE in the
16 * top-level directory of the distribution or, alternatively, at
17 * <http://www.OpenLDAP.org/license.html>.
18 *
19 * This code is derived from btree.c written by Martin Hedenfalk.
20 *
21 * Copyright (c) 2009, 2010 Martin Hedenfalk <[email protected]>
22 *
23 * Permission to use, copy, modify, and distribute this software for any
24 * purpose with or without fee is hereby granted, provided that the above
25 * copyright notice and this permission notice appear in all copies.
26 *
27 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
28 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
30 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
31 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
32 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
33 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
34 */
35#ifndef _GNU_SOURCE
36#define _GNU_SOURCE 1
37#endif
38#if defined(__WIN64__)
39#define _FILE_OFFSET_BITS 64
40#endif
41#ifdef _WIN32
42#include <malloc.h>
43#include <windows.h>
44#include <wchar.h> /* get wcscpy() */
45
46/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
47 * as int64 which is wrong. MSVC doesn't define it at all, so just
48 * don't use it.
49 */
50#define MDB_PID_T int
51#define MDB_THR_T DWORD
52#include <sys/types.h>
53#include <sys/stat.h>
54#ifdef __GNUC__
55# include <sys/param.h>
56#else
57# define LITTLE_ENDIAN 1234
58# define BIG_ENDIAN 4321
59# define BYTE_ORDER LITTLE_ENDIAN
60# ifndef SSIZE_MAX
61# define SSIZE_MAX INT_MAX
62# endif
63#endif
64#else
65#include <sys/types.h>
66#include <sys/stat.h>
67#define MDB_PID_T pid_t
68#define MDB_THR_T pthread_t
69#include <sys/param.h>
70#include <sys/uio.h>
71#include <sys/mman.h>
72#ifdef HAVE_SYS_FILE_H
73#include <sys/file.h>
74#endif
75#include <fcntl.h>
76#endif
77
78#if defined(__mips) && defined(__linux)
79/* MIPS has cache coherency issues, requires explicit cache control */
80#include <asm/cachectl.h>
81extern int cacheflush(char *addr, int nbytes, int cache);
82#define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache)
83#else
84#define CACHEFLUSH(addr, bytes, cache)
85#endif
86
87#if defined(__linux) && !defined(MDB_FDATASYNC_WORKS)
88/** fdatasync is broken on ext3/ext4fs on older kernels, see
89 * description in #mdb_env_open2 comments. You can safely
90 * define MDB_FDATASYNC_WORKS if this code will only be run
91 * on kernels 3.6 and newer.
92 */
93#define BROKEN_FDATASYNC
94#endif
95
96#include <errno.h>
97#include <limits.h>
98#include <stddef.h>
99#include <inttypes.h>
100#include <stdio.h>
101#include <stdlib.h>
102#include <string.h>
103#include <time.h>
104
105#ifdef _MSC_VER
106#include <io.h>
107typedef SSIZE_T ssize_t;
108#else
109#include <unistd.h>
110#endif
111
112#if defined(__sun) || defined(ANDROID)
113/* Most platforms have posix_memalign, older may only have memalign */
114#define HAVE_MEMALIGN 1
115#include <malloc.h>
116/* On Solaris, we need the POSIX sigwait function */
117#if defined (__sun)
118# define _POSIX_PTHREAD_SEMANTICS 1
119#endif
120#endif
121
122#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
123#include <netinet/in.h>
124#include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */
125#endif
126
127#if defined(__FreeBSD__) && defined(__FreeBSD_version) && __FreeBSD_version >= 1100110
128# define MDB_USE_POSIX_MUTEX 1
129# define MDB_USE_ROBUST 1
130#elif defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__)
131# define MDB_USE_POSIX_SEM 1
132# define MDB_FDATASYNC fsync
133#elif defined(ANDROID)
134# define MDB_FDATASYNC fsync
135#endif
136
137#ifndef _WIN32
138#include <pthread.h>
139#include <signal.h>
140#ifdef MDB_USE_POSIX_SEM
141# define MDB_USE_HASH 1
142#include <semaphore.h>
143#else
144#define MDB_USE_POSIX_MUTEX 1
145#endif
146#endif
147
148#if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) \
149 + defined(MDB_USE_POSIX_MUTEX) != 1
150# error "Ambiguous shared-lock implementation"
151#endif
152
153#ifdef USE_VALGRIND
154#include <valgrind/memcheck.h>
155#define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z)
156#define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s)
157#define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a)
158#define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h)
159#define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s)
160#else
161#define VGMEMP_CREATE(h,r,z)
162#define VGMEMP_ALLOC(h,a,s)
163#define VGMEMP_FREE(h,a)
164#define VGMEMP_DESTROY(h)
165#define VGMEMP_DEFINED(a,s)
166#endif
167
168#ifndef BYTE_ORDER
169# if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN))
170/* Solaris just defines one or the other */
171# define LITTLE_ENDIAN 1234
172# define BIG_ENDIAN 4321
173# ifdef _LITTLE_ENDIAN
174# define BYTE_ORDER LITTLE_ENDIAN
175# else
176# define BYTE_ORDER BIG_ENDIAN
177# endif
178# else
179# define BYTE_ORDER __BYTE_ORDER
180# endif
181#endif
182
183#ifndef LITTLE_ENDIAN
184#define LITTLE_ENDIAN __LITTLE_ENDIAN
185#endif
186#ifndef BIG_ENDIAN
187#define BIG_ENDIAN __BIG_ENDIAN
188#endif
189
190#if defined(__i386) || defined(__x86_64) || defined(_M_IX86)
191#define MISALIGNED_OK 1
192#endif
193
194#include "lmdb.h"
195#include "midl.h"
196
197#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN)
198# error "Unknown or unsupported endianness (BYTE_ORDER)"
199#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
200# error "Two's complement, reasonably sized integer types, please"
201#endif
202
203#ifdef __GNUC__
204/** Put infrequently used env functions in separate section */
205# ifdef __APPLE__
206# define ESECT __attribute__ ((section("__TEXT,text_env")))
207# else
208# define ESECT __attribute__ ((section("text_env")))
209# endif
210#else
211#define ESECT
212#endif
213
214#ifdef _WIN32
215#define CALL_CONV WINAPI
216#else
217#define CALL_CONV
218#endif
219
220/** @defgroup internal LMDB Internals
221 * @{
222 */
223/** @defgroup compat Compatibility Macros
224 * A bunch of macros to minimize the amount of platform-specific ifdefs
225 * needed throughout the rest of the code. When the features this library
226 * needs are similar enough to POSIX to be hidden in a one-or-two line
227 * replacement, this macro approach is used.
228 * @{
229 */
230
231 /** Features under development */
232#ifndef MDB_DEVEL
233#define MDB_DEVEL 0
234#endif
235
236 /** Wrapper around __func__, which is a C99 feature */
237#if __STDC_VERSION__ >= 199901L
238# define mdb_func_ __func__
239#elif __GNUC__ >= 2 || _MSC_VER >= 1300
240# define mdb_func_ __FUNCTION__
241#else
242/* If a debug message says <mdb_unknown>(), update the #if statements above */
243# define mdb_func_ "<mdb_unknown>"
244#endif
245
246/* Internal error codes, not exposed outside liblmdb */
247#define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10)
248#ifdef _WIN32
249#define MDB_OWNERDEAD ((int) WAIT_ABANDONED)
250#elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD)
251#define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */
252#endif
253
254#ifdef __GLIBC__
255#define GLIBC_VER ((__GLIBC__ << 16 )| __GLIBC_MINOR__)
256#endif
257/** Some platforms define the EOWNERDEAD error code
258 * even though they don't support Robust Mutexes.
259 * Compile with -DMDB_USE_ROBUST=0, or use some other
260 * mechanism like -DMDB_USE_POSIX_SEM instead of
261 * -DMDB_USE_POSIX_MUTEX.
262 * (Posix semaphores are not robust.)
263 */
264#ifndef MDB_USE_ROBUST
265/* Android currently lacks Robust Mutex support. So does glibc < 2.4. */
266# if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \
267 (defined(__GLIBC__) && GLIBC_VER < 0x020004))
268# define MDB_USE_ROBUST 0
269# else
270# define MDB_USE_ROBUST 1
271# endif
272#endif /* !MDB_USE_ROBUST */
273
274#if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST)
275/* glibc < 2.12 only provided _np API */
276# if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \
277 (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST))
278# define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP
279# define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag)
280# define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex)
281# endif
282#endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */
283
284#if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST)
285#define MDB_ROBUST_SUPPORTED 1
286#endif
287
288#ifdef _WIN32
289#define MDB_USE_HASH 1
290#define MDB_PIDLOCK 0
291#define THREAD_RET DWORD
292#define pthread_t HANDLE
293#define pthread_mutex_t HANDLE
294#define pthread_cond_t HANDLE
295typedef HANDLE mdb_mutex_t, mdb_mutexref_t;
296#define pthread_key_t DWORD
297#define pthread_self() GetCurrentThreadId()
298#define pthread_key_create(x,y) \
299 ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0)
300#define pthread_key_delete(x) TlsFree(x)
301#define pthread_getspecific(x) TlsGetValue(x)
302#define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode())
303#define pthread_mutex_unlock(x) ReleaseMutex(*x)
304#define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE)
305#define pthread_cond_signal(x) SetEvent(*x)
306#define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0)
307#define THREAD_CREATE(thr,start,arg) \
308 (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode())
309#define THREAD_FINISH(thr) \
310 (WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0)
311#define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE)
312#define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex)
313#define mdb_mutex_consistent(mutex) 0
314#define getpid() GetCurrentProcessId()
315#define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd))
316#define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len))
317#define ErrCode() GetLastError()
318#define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
319#define close(fd) (CloseHandle(fd) ? 0 : -1)
320#define munmap(ptr,len) UnmapViewOfFile(ptr)
321#ifdef PROCESS_QUERY_LIMITED_INFORMATION
322#define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION
323#else
324#define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000
325#endif
326#define Z "I"
327#else
328#define THREAD_RET void *
329#define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg)
330#define THREAD_FINISH(thr) pthread_join(thr,NULL)
331#define Z "z" /**< printf format modifier for size_t */
332
333 /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
334#define MDB_PIDLOCK 1
335
336#ifdef MDB_USE_POSIX_SEM
337
338typedef sem_t *mdb_mutex_t, *mdb_mutexref_t;
339#define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex)
340#define UNLOCK_MUTEX(mutex) sem_post(mutex)
341
/** Wait on a POSIX semaphore, restarting the wait whenever it is
 * cut short by a signal (EINTR).
 * @param[in] sem The semaphore to wait on.
 * @return 0 once the semaphore was acquired, otherwise the errno value
 * left behind by the failed sem_wait() call.
 */
static int
mdb_sem_wait(sem_t *sem)
{
	int err;

	do {
		/* sem_wait() reports failure through errno, not its return value */
		err = sem_wait(sem) ? errno : 0;
	} while (err == EINTR);

	return err;
}
349
350#else /* MDB_USE_POSIX_MUTEX: */
 /** The shared mutex/semaphore object itself, in the form it is stored.
 *
 * Do not copy it; assign it to an #mdb_mutexref_t to refer to it instead.
 * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, mdb_mutex_t
 * is declared as an array of size 1 so that it can be assigned to
 * the pointer.
 */
357typedef pthread_mutex_t mdb_mutex_t[1];
358 /** Reference to an #mdb_mutex_t */
359typedef pthread_mutex_t *mdb_mutexref_t;
360 /** Lock the reader or writer mutex.
361 * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX().
362 */
363#define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex)
364 /** Unlock the reader or writer mutex.
365 */
366#define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex)
367 /** Mark mutex-protected data as repaired, after death of previous owner.
368 */
369#define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex)
370#endif /* MDB_USE_POSIX_SEM */
371
372 /** Get the error code for the last failed system function.
373 */
374#define ErrCode() errno
375
376 /** An abstraction for a file handle.
377 * On POSIX systems file handles are small integers. On Windows
378 * they're opaque pointers.
379 */
380#define HANDLE int
381
382 /** A value for an invalid file handle.
383 * Mainly used to initialize file variables and signify that they are
384 * unused.
385 */
386#define INVALID_HANDLE_VALUE (-1)
387
388 /** Get the size of a memory page for the system.
389 * This is the basic size that the platform's memory manager uses, and is
390 * fundamental to the use of memory-mapped files.
391 */
392#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE))
393#endif
394
395#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
396#define MNAME_LEN 32
397#else
398#define MNAME_LEN (sizeof(pthread_mutex_t))
399#endif
400
401/** @} */
402
403#ifdef MDB_ROBUST_SUPPORTED
404 /** Lock mutex, handle any error, set rc = result.
405 * Return 0 on success, nonzero (not rc) on error.
406 */
407#define LOCK_MUTEX(rc, env, mutex) \
408 (((rc) = LOCK_MUTEX0(mutex)) && \
409 ((rc) = mdb_mutex_failed(env, mutex, rc)))
410static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc);
411#else
412#define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex))
413#define mdb_mutex_failed(env, mutex, rc) (rc)
414#endif
415
416#ifndef _WIN32
417/** A flag for opening a file and requesting synchronous data writes.
418 * This is only used when writing a meta page. It's not strictly needed;
419 * we could just do a normal write and then immediately perform a flush.
420 * But if this flag is available it saves us an extra system call.
421 *
422 * @note If O_DSYNC is undefined but exists in /usr/include,
423 * preferably set some compiler flag to get the definition.
424 */
425#ifndef MDB_DSYNC
426# ifdef O_DSYNC
427# define MDB_DSYNC O_DSYNC
428# else
429# define MDB_DSYNC O_SYNC
430# endif
431#endif
432#endif
433
434/** Function for flushing the data of a file. Define this to fsync
435 * if fdatasync() is not supported.
436 */
437#ifndef MDB_FDATASYNC
438# define MDB_FDATASYNC fdatasync
439#endif
440
441#ifndef MDB_MSYNC
442# define MDB_MSYNC(addr,len,flags) msync(addr,len,flags)
443#endif
444
445#ifndef MS_SYNC
446#define MS_SYNC 1
447#endif
448
449#ifndef MS_ASYNC
450#define MS_ASYNC 0
451#endif
452
453 /** A page number in the database.
454 * Note that 64 bit page numbers are overkill, since pages themselves
455 * already represent 12-13 bits of addressable memory, and the OS will
456 * always limit applications to a maximum of 63 bits of address space.
457 *
458 * @note In the #MDB_node structure, we only store 48 bits of this value,
459 * which thus limits us to only 60 bits of addressable data.
460 */
461typedef MDB_ID pgno_t;
462
463 /** A transaction ID.
464 * See struct MDB_txn.mt_txnid for details.
465 */
466typedef MDB_ID txnid_t;
467
468/** @defgroup debug Debug Macros
469 * @{
470 */
471#ifndef MDB_DEBUG
472 /** Enable debug output. Needs variable argument macros (a C99 feature).
473 * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs
474 * read from and written to the database (used for free space management).
475 */
476#define MDB_DEBUG 0
477#endif
478
479#if MDB_DEBUG
480static int mdb_debug;
481static txnid_t mdb_debug_start;
482
483 /** Print a debug message with printf formatting.
484 * Requires double parenthesis around 2 or more args.
485 */
486# define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args))
487# define DPRINTF0(fmt, ...) \
488 fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__)
489#else
490# define DPRINTF(args) ((void) 0)
491#endif
492 /** Print a debug string.
493 * The string is printed literally, with no format processing.
494 */
495#define DPUTS(arg) DPRINTF(("%s", arg))
496 /** Debugging output value of a cursor DBI: Negative in a sub-cursor. */
497#define DDBI(mc) \
498 (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
499/** @} */
500
501 /** @brief The maximum size of a database page.
502 *
503 * It is 32k or 64k, since value-PAGEBASE must fit in
504 * #MDB_page.%mp_upper.
505 *
506 * LMDB will use database pages < OS pages if needed.
507 * That causes more I/O in write transactions: The OS must
508 * know (read) the whole page before writing a partial page.
509 *
510 * Note that we don't currently support Huge pages. On Linux,
511 * regular data files cannot use Huge pages, and in general
512 * Huge pages aren't actually pageable. We rely on the OS
513 * demand-pager to read our data and page it out when memory
514 * pressure from other processes is high. So until OSs have
515 * actual paging support for Huge pages, they're not viable.
516 */
517#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000)
518
519 /** The minimum number of keys required in a database page.
520 * Setting this to a larger value will place a smaller bound on the
521 * maximum size of a data item. Data items larger than this size will
522 * be pushed into overflow pages instead of being stored directly in
523 * the B-tree node. This value used to default to 4. With a page size
524 * of 4096 bytes that meant that any item larger than 1024 bytes would
525 * go into an overflow page. That also meant that on average 2-3KB of
526 * each overflow page was wasted space. The value cannot be lower than
527 * 2 because then there would no longer be a tree structure. With this
528 * value, items larger than 2KB will go into overflow pages, and on
529 * average only 1KB will be wasted.
530 */
531#define MDB_MINKEYS 2
532
533 /** A stamp that identifies a file as an LMDB file.
534 * There's nothing special about this value other than that it is easily
535 * recognizable, and it will reflect any byte order mismatches.
536 */
537#define MDB_MAGIC 0xBEEFC0DE
538
539 /** The version number for a database's datafile format. */
540#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1)
541 /** The version number for a database's lockfile format. */
542#define MDB_LOCK_VERSION 1
543
544 /** @brief The max size of a key we can write, or 0 for computed max.
545 *
546 * This macro should normally be left alone or set to 0.
547 * Note that a database with big keys or dupsort data cannot be
548 * reliably modified by a liblmdb which uses a smaller max.
549 * The default is 511 for backwards compat, or 0 when #MDB_DEVEL.
550 *
551 * Other values are allowed, for backwards compat. However:
552 * A value bigger than the computed max can break if you do not
553 * know what you are doing, and liblmdb <= 0.9.10 can break when
554 * modifying a DB with keys/dupsort data bigger than its max.
555 *
556 * Data items in an #MDB_DUPSORT database are also limited to
557 * this size, since they're actually keys of a sub-DB. Keys and
558 * #MDB_DUPSORT data items must fit on a node in a regular page.
559 */
560#ifndef MDB_MAXKEYSIZE
561#define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511)
562#endif
563
564 /** The maximum size of a key we can write to the environment. */
565#if MDB_MAXKEYSIZE
566#define ENV_MAXKEY(env) (MDB_MAXKEYSIZE)
567#else
568#define ENV_MAXKEY(env) ((env)->me_maxkey)
569#endif
570
571 /** @brief The maximum size of a data item.
572 *
573 * We only store a 32 bit value for node sizes.
574 */
575#define MAXDATASIZE 0xffffffffUL
576
577#if MDB_DEBUG
578 /** Key size which fits in a #DKBUF.
579 * @ingroup debug
580 */
581#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511)
582 /** A key buffer.
583 * @ingroup debug
584 * This is used for printing a hex dump of a key's contents.
585 */
586#define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1]
587 /** Display a key in hex.
588 * @ingroup debug
589 * Invoke a function to display a key in hex.
590 */
591#define DKEY(x) mdb_dkey(x, kbuf)
592#else
593#define DKBUF
594#define DKEY(x) 0
595#endif
596
597 /** An invalid page number.
598 * Mainly used to denote an empty tree.
599 */
600#define P_INVALID (~(pgno_t)0)
601
 /** Test if the flags \b f are set in a flag word \b w.
 * True only when every bit of \b f is set in \b w, not merely some of them.
 * \b f is expanded twice, so it should be free of side effects.
 */
#define F_ISSET(w, f) (((w) & (f)) == (f))

 /** Round \b n up to an even number.
 * Adds 1 and then masks off bit 0, so an odd \b n moves up to the next
 * even value and an even \b n is left unchanged.
 */
#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */
607
608 /** Used for offsets within a single page.
609 * Since memory pages are typically 4 or 8KB in size, 12-13 bits,
610 * this is plenty.
611 */
612typedef uint16_t indx_t;
613
614 /** Default size of memory map.
615 * This is certainly too small for any actual applications. Apps should always set
616 * the size explicitly using #mdb_env_set_mapsize().
617 */
618#define DEFAULT_MAPSIZE 1048576
619
620/** @defgroup readers Reader Lock Table
621 * Readers don't acquire any locks for their data access. Instead, they
622 * simply record their transaction ID in the reader table. The reader
623 * mutex is needed just to find an empty slot in the reader table. The
624 * slot's address is saved in thread-specific data so that subsequent read
625 * transactions started by the same thread need no further locking to proceed.
626 *
627 * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data.
628 *
629 * No reader table is used if the database is on a read-only filesystem, or
630 * if #MDB_NOLOCK is set.
631 *
632 * Since the database uses multi-version concurrency control, readers don't
633 * actually need any locking. This table is used to keep track of which
634 * readers are using data from which old transactions, so that we'll know
635 * when a particular old transaction is no longer in use. Old transactions
636 * that have discarded any data pages can then have those pages reclaimed
637 * for use by a later write transaction.
638 *
639 * The lock table is constructed such that reader slots are aligned with the
640 * processor's cache line size. Any slot is only ever used by one thread.
641 * This alignment guarantees that there will be no contention or cache
642 * thrashing as threads update their own slot info, and also eliminates
643 * any need for locking when accessing a slot.
644 *
645 * A writer thread will scan every slot in the table to determine the oldest
646 * outstanding reader transaction. Any freed pages older than this will be
647 * reclaimed by the writer. The writer doesn't use any locks when scanning
648 * this table. This means that there's no guarantee that the writer will
649 * see the most up-to-date reader info, but that's not required for correct
650 * operation - all we need is to know the upper bound on the oldest reader,
651 * we don't care at all about the newest reader. So the only consequence of
652 * reading stale information here is that old pages might hang around a
653 * while longer before being reclaimed. That's actually good anyway, because
654 * the longer we delay reclaiming old pages, the more likely it is that a
655 * string of contiguous pages can be found after coalescing old pages from
656 * many old transactions together.
657 * @{
658 */
659 /** Number of slots in the reader table.
660 * This value was chosen somewhat arbitrarily. 126 readers plus a
661 * couple mutexes fit exactly into 8KB on my development machine.
662 * Applications should set the table size using #mdb_env_set_maxreaders().
663 */
664#define DEFAULT_READERS 126
665
666 /** The size of a CPU cache line in bytes. We want our lock structures
667 * aligned to this size to avoid false cache line sharing in the
668 * lock table.
669 * This value works for most CPUs. For Itanium this should be 128.
670 */
671#ifndef CACHELINE
672#define CACHELINE 64
673#endif
674
675 /** The information we store in a single slot of the reader table.
676 * In addition to a transaction ID, we also record the process and
677 * thread ID that owns a slot, so that we can detect stale information,
678 * e.g. threads or processes that went away without cleaning up.
679 * @note We currently don't check for stale records. We simply re-init
680 * the table when we know that we're the only process opening the
681 * lock file.
682 */
683typedef struct MDB_rxbody {
684 /** Current Transaction ID when this transaction began, or (txnid_t)-1.
685 * Multiple readers that start at the same time will probably have the
686 * same ID here. Again, it's not important to exclude them from
687 * anything; all we need to know is which version of the DB they
688 * started from so we can avoid overwriting any data used in that
689 * particular version.
690 */
691 volatile txnid_t mrb_txnid;
692 /** The process ID of the process owning this reader txn. */
693 volatile MDB_PID_T mrb_pid;
694 /** The thread ID of the thread owning this txn. */
695 volatile MDB_THR_T mrb_tid;
696} MDB_rxbody;
697
698 /** The actual reader record, with cacheline padding. */
699typedef struct MDB_reader {
700 union {
701 MDB_rxbody mrx;
702 /** shorthand for mrb_txnid */
703#define mr_txnid mru.mrx.mrb_txnid
704#define mr_pid mru.mrx.mrb_pid
705#define mr_tid mru.mrx.mrb_tid
706 /** cache line alignment */
707 char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
708 } mru;
709} MDB_reader;
710
711 /** The header for the reader table.
712 * The table resides in a memory-mapped file. (This is a different file
713 * than is used for the main database.)
714 *
715 * For POSIX the actual mutexes reside in the shared memory of this
716 * mapped file. On Windows, mutexes are named objects allocated by the
717 * kernel; we store the mutex names in this mapped file so that other
718 * processes can grab them. This same approach is also used on
719 * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support
720 * process-shared POSIX mutexes. For these cases where a named object
721 * is used, the object name is derived from a 64 bit FNV hash of the
722 * environment pathname. As such, naming collisions are extremely
723 * unlikely. If a collision occurs, the results are unpredictable.
724 */
725typedef struct MDB_txbody {
726 /** Stamp identifying this as an LMDB file. It must be set
727 * to #MDB_MAGIC. */
728 uint32_t mtb_magic;
729 /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */
730 uint32_t mtb_format;
731#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
732 char mtb_rmname[MNAME_LEN];
733#else
734 /** Mutex protecting access to this table.
735 * This is the reader table lock used with LOCK_MUTEX().
736 */
737 mdb_mutex_t mtb_rmutex;
738#endif
739 /** The ID of the last transaction committed to the database.
740 * This is recorded here only for convenience; the value can always
741 * be determined by reading the main database meta pages.
742 */
743 volatile txnid_t mtb_txnid;
744 /** The number of slots that have been used in the reader table.
745 * This always records the maximum count, it is not decremented
746 * when readers release their slots.
747 */
748 volatile unsigned mtb_numreaders;
749} MDB_txbody;
750
751 /** The actual reader table definition. */
752typedef struct MDB_txninfo {
753 union {
754 MDB_txbody mtb;
755#define mti_magic mt1.mtb.mtb_magic
756#define mti_format mt1.mtb.mtb_format
757#define mti_rmutex mt1.mtb.mtb_rmutex
758#define mti_rmname mt1.mtb.mtb_rmname
759#define mti_txnid mt1.mtb.mtb_txnid
760#define mti_numreaders mt1.mtb.mtb_numreaders
761 char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
762 } mt1;
763 union {
764#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
765 char mt2_wmname[MNAME_LEN];
766#define mti_wmname mt2.mt2_wmname
767#else
768 mdb_mutex_t mt2_wmutex;
769#define mti_wmutex mt2.mt2_wmutex
770#endif
771 char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)];
772 } mt2;
773 MDB_reader mti_readers[1];
774} MDB_txninfo;
775
776 /** Lockfile format signature: version, features and field layout */
#define MDB_LOCK_FORMAT \
 ((uint32_t) \
 ((MDB_LOCK_VERSION) \
 /* Flags which describe functionality: bit 16 is set when MDB_PIDLOCK is nonzero */ \
 + (((MDB_PIDLOCK) != 0) << 16)))
782/** @} */
783
784/** Common header for all page types. The page type depends on #mp_flags.
785 *
786 * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with
787 * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages
788 * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header.
789 *
790 * #P_OVERFLOW records occupy one or more contiguous pages where only the
791 * first has a page header. They hold the real data of #F_BIGDATA nodes.
792 *
793 * #P_SUBP sub-pages are small leaf "pages" with duplicate data.
794 * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page.
795 * (Duplicate data can also go in sub-databases, which use normal pages.)
796 *
797 * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot.
798 *
799 * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once
800 * in the snapshot: Either used by a database or listed in a freeDB record.
801 */
802typedef struct MDB_page {
803#define mp_pgno mp_p.p_pgno
804#define mp_next mp_p.p_next
805 union {
806 pgno_t p_pgno; /**< page number */
807 struct MDB_page *p_next; /**< for in-memory list of freed pages */
808 } mp_p;
809 uint16_t mp_pad; /**< key size if this is a LEAF2 page */
810/** @defgroup mdb_page Page Flags
811 * @ingroup internal
812 * Flags for the page headers.
813 * @{
814 */
815#define P_BRANCH 0x01 /**< branch page */
816#define P_LEAF 0x02 /**< leaf page */
817#define P_OVERFLOW 0x04 /**< overflow page */
818#define P_META 0x08 /**< meta page */
819#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
820#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
821#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
822#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */
823#define P_KEEP 0x8000 /**< leave this page alone during spill */
824/** @} */
825 uint16_t mp_flags; /**< @ref mdb_page */
826#define mp_lower mp_pb.pb.pb_lower
827#define mp_upper mp_pb.pb.pb_upper
828#define mp_pages mp_pb.pb_pages
829 union {
830 struct {
831 indx_t pb_lower; /**< lower bound of free space */
832 indx_t pb_upper; /**< upper bound of free space */
833 } pb;
834 uint32_t pb_pages; /**< number of overflow pages */
835 } mp_pb;
836 indx_t mp_ptrs[1]; /**< dynamic size */
837} MDB_page;
838
 /** Size of the page header, excluding dynamic data at the end.
 * Computed as the byte offset of #MDB_page.%mp_ptrs.
 */
#define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs))

 /** Address of first usable data byte in a page, after the header.
 * Yields a void pointer #PAGEHDRSZ bytes past \b p.
 */
#define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ))

 /** ITS#7713, change PAGEBASE to handle 65536 byte pages.
 * Evaluates to #PAGEHDRSZ when #MDB_DEVEL is nonzero and to 0 otherwise.
 */
#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0)

 /** Number of nodes on a page.
 * Takes mp_lower less the (PAGEBASE-adjusted) header size and halves it;
 * the divisor 2 matches the size of one uint16_t #indx_t entry.
 */
#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1)

 /** The amount of space remaining in the page: the gap between
 * mp_upper and mp_lower, converted to #indx_t.
 */
#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)

 /** The percentage of space used in the page, in tenths of a percent.
 * That is 1000 * (bytes in use) / (usable bytes), where the usable
 * bytes are the page size minus #PAGEHDRSZ.
 */
#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
 ((env)->me_psize - PAGEHDRSZ))
 /** The minimum page fill factor, in tenths of a percent.
 * Pages emptier than this are candidates for merging.
 * On the #PAGEFILL scale, 250 corresponds to 25.0%.
 */
#define FILL_THRESHOLD 250

 /** Test if a page is a leaf page (#P_LEAF set in mp_flags) */
#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF)
 /** Test if a page is a LEAF2 page (#P_LEAF2 set in mp_flags) */
#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2)
 /** Test if a page is a branch page (#P_BRANCH set in mp_flags) */
#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH)
 /** Test if a page is an overflow page (#P_OVERFLOW set in mp_flags) */
#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW)
 /** Test if a page is a sub page (#P_SUBP set in mp_flags) */
#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP)

 /** The number of overflow pages needed to store the given size.
 * Equivalent to rounding (PAGEHDRSZ + size) / psize up to a whole
 * number of pages; the header is counted only once.
 */
#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1)

 /** Link in #MDB_txn.%mt_loose_pgs list.
 * Kept outside the page header, which is needed when reusing the page.
 * NOTE(review): (p) + 2 is pointer arithmetic in units of p's own type,
 * so with an MDB_page pointer the link sits 2 * sizeof(MDB_page) bytes
 * into the page - confirm p is always an MDB_page pointer at call sites.
 */
#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2))
880
881 /** Header for a single key/data pair within a page.
882 * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2.
883 * We guarantee 2-byte alignment for 'MDB_node's.
884 *
885 * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child
886 * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used
887 * for pgno. (Branch nodes have no flags). Lo and hi are in host byte
888 * order in case some accesses can be optimized to 32-bit word access.
889 *
890 * Leaf node flags describe node contents. #F_BIGDATA says the node's
891 * data part is the page number of an overflow page with actual data.
892 * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in
893 * a sub-page/sub-database, and named databases (just #F_SUBDATA).
894 */
typedef struct MDB_node {
 /** part of data size or pgno
 * @{ */
 /* The declaration order of the two halves depends on BYTE_ORDER:
  * mn_lo comes first on little-endian hosts, mn_hi first otherwise.
  */
#if BYTE_ORDER == LITTLE_ENDIAN
 unsigned short mn_lo, mn_hi;
#else
 unsigned short mn_hi, mn_lo;
#endif
 /** @} */
/** @defgroup mdb_node Node Flags
 * @ingroup internal
 * Flags for node headers.
 * @{
 */
#define F_BIGDATA 0x01 /**< data put on overflow page */
#define F_SUBDATA 0x02 /**< data is a sub-database */
#define F_DUPDATA 0x04 /**< data has duplicates */

/** valid flags for #mdb_node_add().
 * Combines two of the node flags above with MDB_RESERVE and MDB_APPEND.
 */
#define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND)

/** @} */
 unsigned short mn_flags; /**< @ref mdb_node */
 unsigned short mn_ksize; /**< key size */
 /* Declared with one element only as a placeholder; #NODESIZE uses
  * offsetof() on this member, so the placeholder is not part of the
  * header size.
  */
 char mn_data[1]; /**< key and data are appended here */
} MDB_node;
921
 /** Size of the node header, excluding dynamic data at the end */
#define NODESIZE offsetof(MDB_node, mn_data)

 /** Bit position of top word in page number, for shifting mn_flags.
 * Evaluates to 32 when pgno_t can hold values above 0xffffffff,
 * otherwise 0; #NODEPGNO and #SETPGNO test it before using mn_flags.
 */
#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)

 /** Size of a node in a branch page with a given key.
 * This is just the node header plus the key, there is no data.
 * A NULL \b k counts as a zero-length key.
 */
#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))

 /** Size of a node in a leaf page with a given key and data.
 * This is node header plus key plus data size.
 * Unlike #INDXSIZE, \b k and \b d are dereferenced unconditionally.
 */
#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)

 /** Address of node \b i in page \b p.
 * The offset stored in mp_ptrs[i] is taken relative to #PAGEBASE.
 */
#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE))

 /** Address of the key for the node */
#define NODEKEY(node) (void *)((node)->mn_data)

 /** Address of the data for a node.
 * The data starts right after the mn_ksize bytes of key.
 */
#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize)

 /** Get the page number pointed to by a branch node.
 * Assembled from mn_lo (low 16 bits), mn_hi (next 16 bits) and, when
 * #PGNO_TOPWORD is nonzero, mn_flags shifted up by #PGNO_TOPWORD.
 */
#define NODEPGNO(node) \
 ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
 (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
 /** Set the page number in a branch node.
 * \b node and \b pgno are each evaluated more than once.
 */
#define SETPGNO(node,pgno) do { \
 (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
 if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)

 /** Get the size of the data in a leaf node */
#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
 /** Set the size of the data for a leaf node.
 * Only the low 32 bits of \b size are stored (16 in mn_lo, 16 in mn_hi).
 */
#define SETDSZ(node,size) do { \
 (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
 /** The size of a key in a node */
#define NODEKSZ(node) ((node)->mn_ksize)

 /** Copy a page number from src to dst.
 * With MISALIGNED_OK this is a plain assignment. Otherwise the value
 * is copied in unsigned short units: four of them when SIZE_MAX
 * exceeds 32 bits, two otherwise.
 */
#ifdef MISALIGNED_OK
#define COPY_PGNO(dst,src) dst = src
#else
#if SIZE_MAX > 4294967295UL
#define COPY_PGNO(dst,src) do { \
 unsigned short *s, *d; \
 s = (unsigned short *)&(src); \
 d = (unsigned short *)&(dst); \
 *d++ = *s++; \
 *d++ = *s++; \
 *d++ = *s++; \
 *d = *s; \
} while (0)
#else
#define COPY_PGNO(dst,src) do { \
 unsigned short *s, *d; \
 s = (unsigned short *)&(src); \
 d = (unsigned short *)&(dst); \
 *d++ = *s++; \
 *d = *s; \
} while (0)
#endif
#endif
 /** The address of a key in a LEAF2 page.
 * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs.
 * There are no node headers, keys are stored contiguously.
 * Key \b i therefore sits \b i * \b ks bytes past the page header.
 */
#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))

 /** Set the \b node's key into \b keyptr, if requested.
 * Expands to a bare brace block rather than do/while(0), so a
 * semicolon written after the invocation is an extra empty statement.
 */
#define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \
 (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } }

 /** Set the \b node's key into \b key.
 * \b key is used as an object (not a pointer) and is not parenthesized.
 */
#define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); }
1000
1001 /** Information about a single database in the environment. */
typedef struct MDB_db {
 /* In the meta page's FREE_DBI record this member is reused as the
  * page size, through the mm_psize alias.
  */
 uint32_t md_pad; /**< also ksize for LEAF2 pages */
 /* In the meta page's FREE_DBI record this member is reused for the
  * persistent environment flags, through the mm_flags alias.
  */
 uint16_t md_flags; /**< @ref mdb_dbi_open */
 uint16_t md_depth; /**< depth of this tree */
 pgno_t md_branch_pages; /**< number of internal pages */
 pgno_t md_leaf_pages; /**< number of leaf pages */
 pgno_t md_overflow_pages; /**< number of overflow pages */
 size_t md_entries; /**< number of data items */
 pgno_t md_root; /**< the root page of this tree */
} MDB_db;
1012
#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */
 /** Every bit of a 16-bit flag word except #MDB_VALID */
#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID))
 /** #mdb_dbi_open() flags */
#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
 MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)

 /** Handle for the DB used to track free pages. */
#define FREE_DBI 0
 /** Handle for the default DB. */
#define MAIN_DBI 1
 /** Number of DBs in metapage (free and main) - also hardcoded elsewhere.
 * This sizes the mm_dbs[] array in #MDB_meta.
 */
#define CORE_DBS 2

 /** Number of meta pages - also hardcoded elsewhere.
 * This sizes the me_metas[] array in #MDB_env.
 */
#define NUM_METAS 2
1028
1029 /** Meta page content.
1030 * A meta page is the start point for accessing a database snapshot.
1031 * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
1032 */
typedef struct MDB_meta {
 /** Stamp identifying this as an LMDB file. It must be set
 * to #MDB_MAGIC. */
 uint32_t mm_magic;
 /** Version number of this file. Must be set to #MDB_DATA_VERSION. */
 uint32_t mm_version;
 void *mm_address; /**< address for fixed mapping */
 size_t mm_mapsize; /**< size of mmap region */
 /* Indexed by FREE_DBI and MAIN_DBI. */
 MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */
 /** The size of pages used in this DB.
 * Alias for the otherwise unused md_pad of the free-space DB record.
 */
#define mm_psize mm_dbs[FREE_DBI].md_pad
 /** Any persistent environment flags. @ref mdb_env.
 * Alias for md_flags of the free-space DB record.
 */
#define mm_flags mm_dbs[FREE_DBI].md_flags
 /** Last used page in the datafile.
 * Actually the file may be shorter if the freeDB lists the final pages.
 */
 pgno_t mm_last_pg;
 volatile txnid_t mm_txnid; /**< txnid that committed this page */
} MDB_meta;
1052
1053 /** Buffer for a stack-allocated meta page.
1054 * The members define size and alignment, and silence type
1055 * aliasing warnings. They are not used directly; that could
1056 * mean incorrectly using several union members in parallel.
1057 */
typedef union MDB_metabuf {
 MDB_page mb_page;
 /* Mirrors the in-page layout: PAGEHDRSZ bytes of page header
  * followed by the meta record, which is where METADATA() points.
  */
 struct {
 char mm_pad[PAGEHDRSZ];
 MDB_meta mm_meta;
 } mb_metabuf;
} MDB_metabuf;
1065
1066 /** Auxiliary DB info.
1067 * The information here is mostly static/read-only. There is
1068 * only a single copy of this record in the environment.
1069 */
typedef struct MDB_dbx {
 MDB_val md_name; /**< name of the database */
 MDB_cmp_func *md_cmp; /**< function for comparing keys */
 MDB_cmp_func *md_dcmp; /**< function for comparing data items */
 MDB_rel_func *md_rel; /**< user relocate function */
 /* Opaque to this struct; only stored here alongside md_rel. */
 void *md_relctx; /**< user-provided context for md_rel */
} MDB_dbx;
1077
1078 /** A database transaction.
1079 * Every operation requires a transaction handle.
1080 */
struct MDB_txn {
 MDB_txn *mt_parent; /**< parent of a nested txn */
 /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */
 MDB_txn *mt_child;
 pgno_t mt_next_pgno; /**< next unallocated page */
 /** The ID of this transaction. IDs are integers incrementing from 1.
 * Only committed write transactions increment the ID. If a transaction
 * aborts, the ID may be re-used by the next writer.
 */
 txnid_t mt_txnid;
 MDB_env *mt_env; /**< the DB environment */
 /** The list of pages that became unused during this transaction.
 */
 MDB_IDL mt_free_pgs;
 /** The list of loose pages that became unused and may be reused
 * in this transaction, linked through #NEXT_LOOSE_PAGE(page).
 */
 MDB_page *mt_loose_pgs;
 /** Number of loose pages (#mt_loose_pgs) */
 int mt_loose_count;
 /** The sorted list of dirty pages we temporarily wrote to disk
 * because the dirty list was full. page numbers in here are
 * shifted left by 1, deleted slots have the LSB set.
 */
 MDB_IDL mt_spill_pgs;
 /* Only one member is meaningful at a time, selected by whether
  * this is a write txn or a read txn.
  */
 union {
 /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */
 MDB_ID2L dirty_list;
 /** For read txns: This thread/txn's reader table slot, or NULL. */
 MDB_reader *reader;
 } mt_u;
 /** Array of records for each DB known in the environment. */
 MDB_dbx *mt_dbxs;
 /** Array of MDB_db records for each known DB */
 MDB_db *mt_dbs;
 /** Array of sequence numbers for each DB handle.
 * #TXN_DBI_CHANGED compares these with the environment's me_dbiseqs.
 */
 unsigned int *mt_dbiseqs;
/** @defgroup mt_dbflag Transaction DB Flags
 * @ingroup internal
 * @{
 */
#define DB_DIRTY 0x01 /**< DB was written in this txn */
#define DB_STALE 0x02 /**< Named-DB record is older than txnID */
#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */
#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */
#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */
#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */
/** @} */
 /** In write txns, array of cursors for each DB */
 MDB_cursor **mt_cursors;
 /** Array of flags for each DB */
 unsigned char *mt_dbflags;
 /** Number of DB records in use, or 0 when the txn is finished.
 * This number only ever increments until the txn finishes; we
 * don't decrement it when individual DB handles are closed.
 */
 MDB_dbi mt_numdbs;

/** @defgroup mdb_txn Transaction Flags
 * @ingroup internal
 * @{
 */
 /** #mdb_txn_begin() flags */
#define MDB_TXN_BEGIN_FLAGS MDB_RDONLY
#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */
 /* internal txn flags */
#define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */
#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */
#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */
#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */
#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */
#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */
 /** most operations on the txn are currently illegal.
 * True when the txn is finished, in error, or has a child txn.
 */
#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD)
/** @} */
 unsigned int mt_flags; /**< @ref mdb_txn */
 /** #dirty_list room: Array size - \#dirty pages visible to this txn.
 * Includes ancestor txns' dirty pages not hidden by other txns'
 * dirty/spilled pages. Thus commit(nested txn) has room to merge
 * dirty_list into mt_parent after freeing hidden mt_parent pages.
 */
 unsigned int mt_dirty_room;
};
1164
/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
 * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
 * raise this on a 64 bit machine.
 * This is the length of the mc_pg[] and mc_ki[] arrays in #MDB_cursor.
 */
#define CURSOR_STACK 32

/* Forward declaration: MDB_cursor holds a pointer to this type. */
struct MDB_xcursor;
1172
1173 /** Cursors are used for all DB operations.
1174 * A cursor holds a path of (page pointer, key index) from the DB
1175 * root to a position in the DB, plus other state. #MDB_DUPSORT
1176 * cursors include an xcursor to the current data item. Write txns
1177 * track their cursors and keep them up to date when data moves.
1178 * Exception: An xcursor's pointer to a #P_SUBP page can be stale.
1179 * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
1180 */
struct MDB_cursor {
 /** Next cursor on this DB in this txn */
 MDB_cursor *mc_next;
 /** Backup of the original cursor if this cursor is a shadow */
 MDB_cursor *mc_backup;
 /** Context used for databases with #MDB_DUPSORT, otherwise NULL */
 struct MDB_xcursor *mc_xcursor;
 /** The transaction that owns this cursor */
 MDB_txn *mc_txn;
 /** The database handle this cursor operates on */
 MDB_dbi mc_dbi;
 /** The database record for this cursor */
 MDB_db *mc_db;
 /** The database auxiliary record for this cursor */
 MDB_dbx *mc_dbx;
 /** The @ref mt_dbflag for this database */
 unsigned char *mc_dbflag;
 unsigned short mc_snum; /**< number of pushed pages */
 unsigned short mc_top; /**< index of top page, normally mc_snum-1 */
/** @defgroup mdb_cursor Cursor Flags
 * @ingroup internal
 * Cursor state flags.
 * @{
 */
#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */
#define C_EOF 0x02 /**< No more data */
#define C_SUB 0x04 /**< Cursor is a sub-cursor */
#define C_DEL 0x08 /**< last op was a cursor_del */
#define C_UNTRACK 0x40 /**< Un-track cursor when closing */
/** @} */
 unsigned int mc_flags; /**< @ref mdb_cursor */
 /* mc_pg[] and mc_ki[] are parallel arrays: entry i of each
  * describes level i of the path, up to index mc_top.
  */
 MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */
 indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */
};
1215
1216 /** Context for sorted-dup records.
1217 * We could have gone to a fully recursive design, with arbitrarily
1218 * deep nesting of sub-databases. But for now we only handle these
1219 * levels - main DB, optional sub-DB, sorted-duplicate DB.
1220 */
typedef struct MDB_xcursor {
 /** A sub-cursor for traversing the Dup DB */
 MDB_cursor mx_cursor;
 /** The database record for this Dup DB */
 MDB_db mx_db;
 /** The auxiliary DB record for this Dup DB */
 MDB_dbx mx_dbx;
 /** The @ref mt_dbflag for this Dup DB.
 * Held by value here, whereas #MDB_cursor.%mc_dbflag is a pointer.
 */
 unsigned char mx_dbflag;
} MDB_xcursor;
1231
 /** Check if there is an inited xcursor.
 * True when \b mc has an xcursor whose sub-cursor has #C_INITIALIZED set.
 */
#define XCURSOR_INITED(mc) \
 ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))

 /** Update the xcursor's sub-page pointer, if any, in \b mc. Needed
 * when the node which contains the sub-page may have moved. Called
 * with leaf page \b mp = mc->mc_pg[\b top].
 * Does nothing unless the xcursor is inited, mc_ki[top] is a valid
 * index in the page, and the node has #F_DUPDATA without #F_SUBDATA.
 * \b mc and \b top are evaluated more than once.
 */
#define XCURSOR_REFRESH(mc, top, mp) do { \
 MDB_page *xr_pg = (mp); \
 MDB_node *xr_node; \
 if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \
 xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \
 if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \
 (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \
} while (0)
1248
1249 /** State of FreeDB old pages, stored in the MDB_env */
typedef struct MDB_pgstate {
 /* #MDB_env exposes these two members as me_pghead and me_pglast. */
 pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */
 txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */
} MDB_pgstate;
1254
1255 /** The database environment. */
struct MDB_env {
 HANDLE me_fd; /**< The main data file */
 HANDLE me_lfd; /**< The lock file */
 HANDLE me_mfd; /**< For writing and syncing the meta pages */
 /* The four flags below are internal bits kept in me_flags. */
 /** Failed to update the meta page. Probably an I/O error. */
#define MDB_FATAL_ERROR 0x80000000U
 /** Some fields are initialized. */
#define MDB_ENV_ACTIVE 0x20000000U
 /** me_txkey is set */
#define MDB_ENV_TXKEY 0x10000000U
 /** fdatasync is unreliable */
#define MDB_FSYNCONLY 0x08000000U
 uint32_t me_flags; /**< @ref mdb_env */
 unsigned int me_psize; /**< DB page size, inited from me_os_psize */
 unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
 unsigned int me_maxreaders; /**< size of the reader table */
 /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */
 volatile int me_close_readers;
 MDB_dbi me_numdbs; /**< number of DBs opened */
 MDB_dbi me_maxdbs; /**< size of the DB table */
 MDB_PID_T me_pid; /**< process ID of this env */
 char *me_path; /**< path to the DB files */
 char *me_map; /**< the memory map of the data file */
 MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
 MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */
 void *me_pbuf; /**< scratch area for DUPSORT put() */
 MDB_txn *me_txn; /**< current write transaction */
 MDB_txn *me_txn0; /**< prealloc'd write transaction */
 size_t me_mapsize; /**< size of the data memory map */
 off_t me_size; /**< current file size */
 pgno_t me_maxpg; /**< me_mapsize / me_psize */
 MDB_dbx *me_dbxs; /**< array of static DB info */
 uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
 unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */
 pthread_key_t me_txkey; /**< thread-key for readers */
 txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */
 MDB_pgstate me_pgstate; /**< state of old pages from freeDB */
 /* Shorthand for the two members of me_pgstate. */
# define me_pglast me_pgstate.mf_pglast
# define me_pghead me_pgstate.mf_pghead
 MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */
 /** IDL of pages that became unused in a write txn */
 MDB_IDL me_free_pgs;
 /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
 MDB_ID2L me_dirty_list;
 /** Max number of freelist items that can fit in a single overflow page */
 int me_maxfree_1pg;
 /** Max size of a node on a page */
 unsigned int me_nodemax;
 /* me_maxkey exists only when MDB_MAXKEYSIZE evaluates to zero. */
#if !(MDB_MAXKEYSIZE)
 unsigned int me_maxkey; /**< max size of a key */
#endif
 int me_live_reader; /**< have liveness lock in reader table */
#ifdef _WIN32
 int me_pidquery; /**< Used in OpenProcess */
#endif
 /* With MDB_USE_POSIX_MUTEX, me_rmutex/me_wmutex are macros that
  * resolve into me_txns; otherwise they are real members here.
  */
#ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */
# define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */
# define me_wmutex me_txns->mti_wmutex /**< Shared writer lock */
#else
 mdb_mutex_t me_rmutex;
 mdb_mutex_t me_wmutex;
#endif
 void *me_userctx; /**< User-settable context */
 MDB_assert_func *me_assert_func; /**< Callback for assertion failures */
};
1321
1322 /** Nested transaction */
typedef struct MDB_ntxn {
 /* Declared first, so the struct begins with its MDB_txn. */
 MDB_txn mnt_txn; /**< the transaction */
 MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */
} MDB_ntxn;
1327
 /** max number of pages to commit in one writev() call.
 * Defaults to 64 and is lowered to IOV_MAX when that is smaller.
 */
#define MDB_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES
#undef MDB_COMMIT_PAGES
#define MDB_COMMIT_PAGES IOV_MAX
#endif

 /** max bytes to write in one call.
 * 0x40000000, halved when ssize_t is 4 bytes wide.
 */
#define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4))

 /** Check \b txn and \b dbi arguments to a function.
 * True when \b txn is non-NULL, \b dbi is below mt_numdbs, and the
 * handle's mt_dbflags entry has at least one bit of \b validity set.
 */
#define TXN_DBI_EXIST(txn, dbi, validity) \
 ((txn) && (dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity)))

 /** Check for misused \b dbi handles.
 * True when the txn's sequence number for \b dbi differs from the
 * environment's. \b dbi is not range-checked here.
 */
#define TXN_DBI_CHANGED(txn, dbi) \
 ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
1345
/* Forward declarations of file-local functions defined later on. */
static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp);
static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp);
static int mdb_page_touch(MDB_cursor *mc);

/* Log names for the operation numbers in the enum below, in the
 * same order.
 */
#define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \
 "reset-tmp", "fail-begin", "fail-beginchild"}
enum {
 /* mdb_txn_end operation number, for logging */
 MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET,
 MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD
};
#define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */
#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */
#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */
#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */
static void mdb_txn_end(MDB_txn *txn, unsigned mode);

static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl);
static int mdb_page_search_root(MDB_cursor *mc,
 MDB_val *key, int modify);
/* Distinct single-bit values; presumably combined in the flags
 * argument of mdb_page_search() declared just below (confirm at the
 * definition).
 */
#define MDB_PS_MODIFY 1
#define MDB_PS_ROOTONLY 2
#define MDB_PS_FIRST 4
#define MDB_PS_LAST 8
static int mdb_page_search(MDB_cursor *mc,
 MDB_val *key, int flags);
static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);

/* Shares its value with the public MDB_APPENDDUP flag. */
#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */
static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
 pgno_t newpgno, unsigned int nflags);

static int mdb_env_read_header(MDB_env *env, MDB_meta *meta);
static MDB_meta *mdb_env_pick_meta(const MDB_env *env);
static int mdb_env_write_meta(MDB_txn *txn);
/* In this configuration every use of mdb_env_close0(env, excl),
 * including the prototype that follows, is rewritten by the
 * preprocessor to mdb_env_close1(env).
 */
#if defined(MDB_USE_POSIX_MUTEX) && !defined(MDB_ROBUST_SUPPORTED) /* Drop unused excl arg */
# define mdb_env_close0(env, excl) mdb_env_close1(env)
#endif
static void mdb_env_close0(MDB_env *env, int excl);

static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp);
static int mdb_node_add(MDB_cursor *mc, indx_t indx,
 MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags);
static void mdb_node_del(MDB_cursor *mc, int ksize);
static void mdb_node_shrink(MDB_page *mp, indx_t indx);
static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft);
static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data);
static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data);
static size_t mdb_branch_size(MDB_env *env, MDB_val *key);

static int mdb_rebalance(MDB_cursor *mc);
static int mdb_update_key(MDB_cursor *mc, MDB_val *key);

static void mdb_cursor_pop(MDB_cursor *mc);
static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp);

static int mdb_cursor_del0(MDB_cursor *mc);
static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags);
static int mdb_cursor_sibling(MDB_cursor *mc, int move_right);
static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op,
 int *exactp);
static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data);
static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data);

static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
static void mdb_xcursor_init0(MDB_cursor *mc);
static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node);
static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force);

static int mdb_drop0(MDB_cursor *mc, int subs);
static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi);
static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead);

/** @cond */
static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long;
/** @endcond */

/** Compare two items pointing at size_t's of unknown alignment.
 * Resolves to mdb_cmp_long when MISALIGNED_OK is defined and to
 * mdb_cmp_cint otherwise.
 */
#ifdef MISALIGNED_OK
# define mdb_cmp_clong mdb_cmp_long
#else
# define mdb_cmp_clong mdb_cmp_cint
#endif

#ifdef _WIN32
/* Windows-only file-scope state and helper declaration. */
static SECURITY_DESCRIPTOR mdb_null_sd;
static SECURITY_ATTRIBUTES mdb_all_sa;
static int mdb_sec_inited;

struct MDB_name;
static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra);
#endif
1440
1441/** Return the library version info. */
1442char * ESECT
1443mdb_version(int *major, int *minor, int *patch)
1444{
1445 if (major) *major = MDB_VERSION_MAJOR;
1446 if (minor) *minor = MDB_VERSION_MINOR;
1447 if (patch) *patch = MDB_VERSION_PATCH;
1448 return MDB_VERSION_STRING;
1449}
1450
/** Table of descriptions for LMDB @ref errors.
 * mdb_strerror() indexes this table with (err - MDB_KEYEXIST), so the
 * entries have to stay in error-code order, beginning with MDB_KEYEXIST.
 */
static char *const mdb_errstr[] = {
 "MDB_KEYEXIST: Key/data pair already exists",
 "MDB_NOTFOUND: No matching key/data pair found",
 "MDB_PAGE_NOTFOUND: Requested page not found",
 "MDB_CORRUPTED: Located page was wrong type",
 "MDB_PANIC: Update of meta page failed or environment had fatal error",
 "MDB_VERSION_MISMATCH: Database environment version mismatch",
 "MDB_INVALID: File is not an LMDB file",
 "MDB_MAP_FULL: Environment mapsize limit reached",
 "MDB_DBS_FULL: Environment maxdbs limit reached",
 "MDB_READERS_FULL: Environment maxreaders limit reached",
 "MDB_TLS_FULL: Thread-local storage keys full - too many environments open",
 "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big",
 "MDB_CURSOR_FULL: Internal error - cursor stack limit reached",
 "MDB_PAGE_FULL: Internal error - page has no more space",
 "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize",
 "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed",
 "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot",
 "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid",
 "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size",
 "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly",
};
1474
1475char *
1476mdb_strerror(int err)
1477{
1478#ifdef _WIN32
1479 /** HACK: pad 4KB on stack over the buf. Return system msgs in buf.
1480 * This works as long as no function between the call to mdb_strerror
1481 * and the actual use of the message uses more than 4K of stack.
1482 */
1483#define MSGSIZE 1024
1484#define PADSIZE 4096
1485 char buf[MSGSIZE+PADSIZE], *ptr = buf;
1486#endif
1487 int i;
1488 if (!err)
1489 return ("Successful return: 0");
1490
1491 if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) {
1492 i = err - MDB_KEYEXIST;
1493 return mdb_errstr[i];
1494 }
1495
1496#ifdef _WIN32
1497 /* These are the C-runtime error codes we use. The comment indicates
1498 * their numeric value, and the Win32 error they would correspond to
1499 * if the error actually came from a Win32 API. A major mess, we should
1500 * have used LMDB-specific error codes for everything.
1501 */
1502 switch(err) {
1503 case ENOENT: /* 2, FILE_NOT_FOUND */
1504 case EIO: /* 5, ACCESS_DENIED */
1505 case ENOMEM: /* 12, INVALID_ACCESS */
1506 case EACCES: /* 13, INVALID_DATA */
1507 case EBUSY: /* 16, CURRENT_DIRECTORY */
1508 case EINVAL: /* 22, BAD_COMMAND */
1509 case ENOSPC: /* 28, OUT_OF_PAPER */
1510 return strerror(err);
1511 default:
1512 ;
1513 }
1514 buf[0] = 0;
1515 FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM |
1516 FORMAT_MESSAGE_IGNORE_INSERTS,
1517 NULL, err, 0, ptr, MSGSIZE, (va_list *)buf+MSGSIZE);
1518 return ptr;
1519#else
1520 return strerror(err);
1521#endif
1522}
1523
1524/** assert(3) variant in cursor context */
1525#define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr)
1526/** assert(3) variant in transaction context */
1527#define mdb_tassert(txn, expr) mdb_assert0((txn)->mt_env, expr, #expr)
1528/** assert(3) variant in environment context */
1529#define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr)
1530
1531#ifndef NDEBUG
1532# define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \
1533 mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__))
1534
1535static void ESECT
1536mdb_assert_fail(MDB_env *env, const char *expr_txt,
1537 const char *func, const char *file, int line)
1538{
1539 char buf[400];
1540 sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()",
1541 file, line, expr_txt, func);
1542 if (env->me_assert_func)
1543 env->me_assert_func(env, buf);
1544 fprintf(stderr, "%s\n", buf);
1545 abort();
1546}
1547#else
1548# define mdb_assert0(env, expr, expr_txt) ((void) 0)
1549#endif /* NDEBUG */
1550
1551#if MDB_DEBUG
1552/** Return the page number of \b mp which may be sub-page, for debug output */
1553static pgno_t
1554mdb_dbg_pgno(MDB_page *mp)
1555{
1556 pgno_t ret;
1557 COPY_PGNO(ret, mp->mp_pgno);
1558 return ret;
1559}
1560
1561/** Display a key in hexadecimal and return the address of the result.
1562 * @param[in] key the key to display
1563 * @param[in] buf the buffer to write into. Should always be #DKBUF.
1564 * @return The key in hexadecimal form.
1565 */
1566char *
1567mdb_dkey(MDB_val *key, char *buf)
1568{
1569 char *ptr = buf;
1570 unsigned char *c = key->mv_data;
1571 unsigned int i;
1572
1573 if (!key)
1574 return "";
1575
1576 if (key->mv_size > DKBUF_MAXKEYSIZE)
1577 return "MDB_MAXKEYSIZE";
1578 /* may want to make this a dynamic check: if the key is mostly
1579 * printable characters, print it as-is instead of converting to hex.
1580 */
1581#if 1
1582 buf[0] = '\0';
1583 for (i=0; i<key->mv_size; i++)
1584 ptr += sprintf(ptr, "%02x", *c++);
1585#else
1586 sprintf(buf, "%.*s", key->mv_size, key->mv_data);
1587#endif
1588 return buf;
1589}
1590
1591static const char *
1592mdb_leafnode_type(MDB_node *n)
1593{
1594 static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}};
1595 return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" :
1596 tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)];
1597}
1598
1599/** Display all the keys in the page. */
1600void
1601mdb_page_list(MDB_page *mp)
1602{
1603 pgno_t pgno = mdb_dbg_pgno(mp);
1604 const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : "";
1605 MDB_node *node;
1606 unsigned int i, nkeys, nsize, total = 0;
1607 MDB_val key;
1608 DKBUF;
1609
1610 switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) {
1611 case P_BRANCH: type = "Branch page"; break;
1612 case P_LEAF: type = "Leaf page"; break;
1613 case P_LEAF|P_SUBP: type = "Sub-page"; break;
1614 case P_LEAF|P_LEAF2: type = "LEAF2 page"; break;
1615 case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break;
1616 case P_OVERFLOW:
1617 fprintf(stderr, "Overflow page %"Z"u pages %u%s\n",
1618 pgno, mp->mp_pages, state);
1619 return;
1620 case P_META:
1621 fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n",
1622 pgno, ((MDB_meta *)METADATA(mp))->mm_txnid);
1623 return;
1624 default:
1625 fprintf(stderr, "Bad page %"Z"u flags 0x%X\n", pgno, mp->mp_flags);
1626 return;
1627 }
1628
1629 nkeys = NUMKEYS(mp);
1630 fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state);
1631
1632 for (i=0; i<nkeys; i++) {
1633 if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */
1634 key.mv_size = nsize = mp->mp_pad;
1635 key.mv_data = LEAF2KEY(mp, i, nsize);
1636 total += nsize;
1637 fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key));
1638 continue;
1639 }
1640 node = NODEPTR(mp, i);
1641 key.mv_size = node->mn_ksize;
1642 key.mv_data = node->mn_data;
1643 nsize = NODESIZE + key.mv_size;
1644 if (IS_BRANCH(mp)) {
1645 fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node),
1646 DKEY(&key));
1647 total += nsize;
1648 } else {
1649 if (F_ISSET(node->mn_flags, F_BIGDATA))
1650 nsize += sizeof(pgno_t);
1651 else
1652 nsize += NODEDSZ(node);
1653 total += nsize;
1654 nsize += sizeof(indx_t);
1655 fprintf(stderr, "key %d: nsize %d, %s%s\n",
1656 i, nsize, DKEY(&key), mdb_leafnode_type(node));
1657 }
1658 total = EVEN(total);
1659 }
1660 fprintf(stderr, "Total: header %d + contents %d + unused %d\n",
1661 IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp));
1662}
1663
1664void
1665mdb_cursor_chk(MDB_cursor *mc)
1666{
1667 unsigned int i;
1668 MDB_node *node;
1669 MDB_page *mp;
1670
1671 if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return;
1672 for (i=0; i<mc->mc_top; i++) {
1673 mp = mc->mc_pg[i];
1674 node = NODEPTR(mp, mc->mc_ki[i]);
1675 if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)
1676 printf("oops!\n");
1677 }
1678 if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))
1679 printf("ack!\n");
1680 if (XCURSOR_INITED(mc)) {
1681 node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
1682 if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) &&
1683 mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) {
1684 printf("blah!\n");
1685 }
1686 }
1687}
1688#endif
1689
1690#if (MDB_DEBUG) > 2
1691/** Count all the pages in each DB and in the freelist
1692 * and make sure it matches the actual number of pages
1693 * being used.
1694 * All named DBs must be open for a correct count.
1695 */
1696static void mdb_audit(MDB_txn *txn)
1697{
1698 MDB_cursor mc;
1699 MDB_val key, data;
1700 MDB_ID freecount, count;
1701 MDB_dbi i;
1702 int rc;
1703
1704 freecount = 0;
1705 mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
1706 while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
1707 freecount += *(MDB_ID *)data.mv_data;
1708 mdb_tassert(txn, rc == MDB_NOTFOUND);
1709
1710 count = 0;
1711 for (i = 0; i<txn->mt_numdbs; i++) {
1712 MDB_xcursor mx;
1713 if (!(txn->mt_dbflags[i] & DB_VALID))
1714 continue;
1715 mdb_cursor_init(&mc, txn, i, &mx);
1716 if (txn->mt_dbs[i].md_root == P_INVALID)
1717 continue;
1718 count += txn->mt_dbs[i].md_branch_pages +
1719 txn->mt_dbs[i].md_leaf_pages +
1720 txn->mt_dbs[i].md_overflow_pages;
1721 if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
1722 rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST);
1723 for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) {
1724 unsigned j;
1725 MDB_page *mp;
1726 mp = mc.mc_pg[mc.mc_top];
1727 for (j=0; j<NUMKEYS(mp); j++) {
1728 MDB_node *leaf = NODEPTR(mp, j);
1729 if (leaf->mn_flags & F_SUBDATA) {
1730 MDB_db db;
1731 memcpy(&db, NODEDATA(leaf), sizeof(db));
1732 count += db.md_branch_pages + db.md_leaf_pages +
1733 db.md_overflow_pages;
1734 }
1735 }
1736 }
1737 mdb_tassert(txn, rc == MDB_NOTFOUND);
1738 }
1739 }
1740 if (freecount + count + NUM_METAS != txn->mt_next_pgno) {
1741 fprintf(stderr, "audit: %"Z"u freecount: %"Z"u count: %"Z"u total: %"Z"u next_pgno: %"Z"u\n",
1742 txn->mt_txnid, freecount, count+NUM_METAS,
1743 freecount+count+NUM_METAS, txn->mt_next_pgno);
1744 }
1745}
1746#endif
1747
1748int
1749mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
1750{
1751 return txn->mt_dbxs[dbi].md_cmp(a, b);
1752}
1753
1754int
1755mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
1756{
1757 MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp;
1758#if UINT_MAX < SIZE_MAX
1759 if (dcmp == mdb_cmp_int && a->mv_size == sizeof(size_t))
1760 dcmp = mdb_cmp_clong;
1761#endif
1762 return dcmp(a, b);
1763}
1764
1765/** Allocate memory for a page.
1766 * Re-use old malloc'd pages first for singletons, otherwise just malloc.
1767 * Set #MDB_TXN_ERROR on failure.
1768 */
1769static MDB_page *
1770mdb_page_malloc(MDB_txn *txn, unsigned num)
1771{
1772 MDB_env *env = txn->mt_env;
1773 MDB_page *ret = env->me_dpages;
1774 size_t psize = env->me_psize, sz = psize, off;
1775 /* For ! #MDB_NOMEMINIT, psize counts how much to init.
1776 * For a single page alloc, we init everything after the page header.
1777 * For multi-page, we init the final page; if the caller needed that
1778 * many pages they will be filling in at least up to the last page.
1779 */
1780 if (num == 1) {
1781 if (ret) {
1782 VGMEMP_ALLOC(env, ret, sz);
1783 VGMEMP_DEFINED(ret, sizeof(ret->mp_next));
1784 env->me_dpages = ret->mp_next;
1785 return ret;
1786 }
1787 psize -= off = PAGEHDRSZ;
1788 } else {
1789 sz *= num;
1790 off = sz - psize;
1791 }
1792 if ((ret = malloc(sz)) != NULL) {
1793 VGMEMP_ALLOC(env, ret, sz);
1794 if (!(env->me_flags & MDB_NOMEMINIT)) {
1795 memset((char *)ret + off, 0, psize);
1796 ret->mp_pad = 0;
1797 }
1798 } else {
1799 txn->mt_flags |= MDB_TXN_ERROR;
1800 }
1801 return ret;
1802}
1803/** Free a single page.
1804 * Saves single pages to a list, for future reuse.
1805 * (This is not used for multi-page overflow pages.)
1806 */
1807static void
1808mdb_page_free(MDB_env *env, MDB_page *mp)
1809{
1810 mp->mp_next = env->me_dpages;
1811 VGMEMP_FREE(env, mp);
1812 env->me_dpages = mp;
1813}
1814
1815/** Free a dirty page */
1816static void
1817mdb_dpage_free(MDB_env *env, MDB_page *dp)
1818{
1819 if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
1820 mdb_page_free(env, dp);
1821 } else {
1822 /* large pages just get freed directly */
1823 VGMEMP_FREE(env, dp);
1824 free(dp);
1825 }
1826}
1827
1828/** Return all dirty pages to dpage list */
1829static void
1830mdb_dlist_free(MDB_txn *txn)
1831{
1832 MDB_env *env = txn->mt_env;
1833 MDB_ID2L dl = txn->mt_u.dirty_list;
1834 unsigned i, n = dl[0].mid;
1835
1836 for (i = 1; i <= n; i++) {
1837 mdb_dpage_free(env, dl[i].mptr);
1838 }
1839 dl[0].mid = 0;
1840}
1841
1842/** Loosen or free a single page.
1843 * Saves single pages to a list for future reuse
1844 * in this same txn. It has been pulled from the freeDB
1845 * and already resides on the dirty list, but has been
1846 * deleted. Use these pages first before pulling again
1847 * from the freeDB.
1848 *
1849 * If the page wasn't dirtied in this txn, just add it
1850 * to this txn's free list.
1851 */
1852static int
1853mdb_page_loose(MDB_cursor *mc, MDB_page *mp)
1854{
1855 int loose = 0;
1856 pgno_t pgno = mp->mp_pgno;
1857 MDB_txn *txn = mc->mc_txn;
1858
1859 if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) {
1860 if (txn->mt_parent) {
1861 MDB_ID2 *dl = txn->mt_u.dirty_list;
1862 /* If txn has a parent, make sure the page is in our
1863 * dirty list.
1864 */
1865 if (dl[0].mid) {
1866 unsigned x = mdb_mid2l_search(dl, pgno);
1867 if (x <= dl[0].mid && dl[x].mid == pgno) {
1868 if (mp != dl[x].mptr) { /* bad cursor? */
1869 mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
1870 txn->mt_flags |= MDB_TXN_ERROR;
1871 return MDB_CORRUPTED;
1872 }
1873 /* ok, it's ours */
1874 loose = 1;
1875 }
1876 }
1877 } else {
1878 /* no parent txn, so it's just ours */
1879 loose = 1;
1880 }
1881 }
1882 if (loose) {
1883 DPRINTF(("loosen db %d page %"Z"u", DDBI(mc),
1884 mp->mp_pgno));
1885 NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs;
1886 txn->mt_loose_pgs = mp;
1887 txn->mt_loose_count++;
1888 mp->mp_flags |= P_LOOSE;
1889 } else {
1890 int rc = mdb_midl_append(&txn->mt_free_pgs, pgno);
1891 if (rc)
1892 return rc;
1893 }
1894
1895 return MDB_SUCCESS;
1896}
1897
1898/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
1899 * @param[in] mc A cursor handle for the current operation.
1900 * @param[in] pflags Flags of the pages to update:
1901 * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
1902 * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush().
1903 * @return 0 on success, non-zero on failure.
1904 */
1905static int
1906mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
1907{
1908 enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP };
1909 MDB_txn *txn = mc->mc_txn;
1910 MDB_cursor *m3, *m0 = mc;
1911 MDB_xcursor *mx;
1912 MDB_page *dp, *mp;
1913 MDB_node *leaf;
1914 unsigned i, j;
1915 int rc = MDB_SUCCESS, level;
1916
1917 /* Mark pages seen by cursors */
1918 if (mc->mc_flags & C_UNTRACK)
1919 mc = NULL; /* will find mc in mt_cursors */
1920 for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) {
1921 for (; mc; mc=mc->mc_next) {
1922 if (!(mc->mc_flags & C_INITIALIZED))
1923 continue;
1924 for (m3 = mc;; m3 = &mx->mx_cursor) {
1925 mp = NULL;
1926 for (j=0; j<m3->mc_snum; j++) {
1927 mp = m3->mc_pg[j];
1928 if ((mp->mp_flags & Mask) == pflags)
1929 mp->mp_flags ^= P_KEEP;
1930 }
1931 mx = m3->mc_xcursor;
1932 /* Proceed to mx if it is at a sub-database */
1933 if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
1934 break;
1935 if (! (mp && (mp->mp_flags & P_LEAF)))
1936 break;
1937 leaf = NODEPTR(mp, m3->mc_ki[j-1]);
1938 if (!(leaf->mn_flags & F_SUBDATA))
1939 break;
1940 }
1941 }
1942 if (i == 0)
1943 break;
1944 }
1945
1946 if (all) {
1947 /* Mark dirty root pages */
1948 for (i=0; i<txn->mt_numdbs; i++) {
1949 if (txn->mt_dbflags[i] & DB_DIRTY) {
1950 pgno_t pgno = txn->mt_dbs[i].md_root;
1951 if (pgno == P_INVALID)
1952 continue;
1953 if ((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS)
1954 break;
1955 if ((dp->mp_flags & Mask) == pflags && level <= 1)
1956 dp->mp_flags ^= P_KEEP;
1957 }
1958 }
1959 }
1960
1961 return rc;
1962}
1963
1964static int mdb_page_flush(MDB_txn *txn, int keep);
1965
1966/** Spill pages from the dirty list back to disk.
1967 * This is intended to prevent running into #MDB_TXN_FULL situations,
1968 * but note that they may still occur in a few cases:
1969 * 1) our estimate of the txn size could be too small. Currently this
1970 * seems unlikely, except with a large number of #MDB_MULTIPLE items.
1971 * 2) child txns may run out of space if their parents dirtied a
1972 * lot of pages and never spilled them. TODO: we probably should do
1973 * a preemptive spill during #mdb_txn_begin() of a child txn, if
1974 * the parent's dirty_room is below a given threshold.
1975 *
1976 * Otherwise, if not using nested txns, it is expected that apps will
1977 * not run into #MDB_TXN_FULL any more. The pages are flushed to disk
1978 * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared.
1979 * If the txn never references them again, they can be left alone.
1980 * If the txn only reads them, they can be used without any fuss.
1981 * If the txn writes them again, they can be dirtied immediately without
1982 * going thru all of the work of #mdb_page_touch(). Such references are
1983 * handled by #mdb_page_unspill().
1984 *
1985 * Also note, we never spill DB root pages, nor pages of active cursors,
1986 * because we'll need these back again soon anyway. And in nested txns,
1987 * we can't spill a page in a child txn if it was already spilled in a
1988 * parent txn. That would alter the parent txns' data even though
1989 * the child hasn't committed yet, and we'd have no way to undo it if
1990 * the child aborted.
1991 *
1992 * @param[in] m0 cursor A cursor handle identifying the transaction and
1993 * database for which we are checking space.
1994 * @param[in] key For a put operation, the key being stored.
1995 * @param[in] data For a put operation, the data being stored.
1996 * @return 0 on success, non-zero on failure.
1997 */
1998static int
1999mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
2000{
2001 MDB_txn *txn = m0->mc_txn;
2002 MDB_page *dp;
2003 MDB_ID2L dl = txn->mt_u.dirty_list;
2004 unsigned int i, j, need;
2005 int rc;
2006
2007 if (m0->mc_flags & C_SUB)
2008 return MDB_SUCCESS;
2009
2010 /* Estimate how much space this op will take */
2011 i = m0->mc_db->md_depth;
2012 /* Named DBs also dirty the main DB */
2013 if (m0->mc_dbi >= CORE_DBS)
2014 i += txn->mt_dbs[MAIN_DBI].md_depth;
2015 /* For puts, roughly factor in the key+data size */
2016 if (key)
2017 i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize;
2018 i += i; /* double it for good measure */
2019 need = i;
2020
2021 if (txn->mt_dirty_room > i)
2022 return MDB_SUCCESS;
2023
2024 if (!txn->mt_spill_pgs) {
2025 txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX);
2026 if (!txn->mt_spill_pgs)
2027 return ENOMEM;
2028 } else {
2029 /* purge deleted slots */
2030 MDB_IDL sl = txn->mt_spill_pgs;
2031 unsigned int num = sl[0];
2032 j=0;
2033 for (i=1; i<=num; i++) {
2034 if (!(sl[i] & 1))
2035 sl[++j] = sl[i];
2036 }
2037 sl[0] = j;
2038 }
2039
2040 /* Preserve pages which may soon be dirtied again */
2041 if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS)
2042 goto done;
2043
2044 /* Less aggressive spill - we originally spilled the entire dirty list,
2045 * with a few exceptions for cursor pages and DB root pages. But this
2046 * turns out to be a lot of wasted effort because in a large txn many
2047 * of those pages will need to be used again. So now we spill only 1/8th
2048 * of the dirty pages. Testing revealed this to be a good tradeoff,
2049 * better than 1/2, 1/4, or 1/10.
2050 */
2051 if (need < MDB_IDL_UM_MAX / 8)
2052 need = MDB_IDL_UM_MAX / 8;
2053
2054 /* Save the page IDs of all the pages we're flushing */
2055 /* flush from the tail forward, this saves a lot of shifting later on. */
2056 for (i=dl[0].mid; i && need; i--) {
2057 MDB_ID pn = dl[i].mid << 1;
2058 dp = dl[i].mptr;
2059 if (dp->mp_flags & (P_LOOSE|P_KEEP))
2060 continue;
2061 /* Can't spill twice, make sure it's not already in a parent's
2062 * spill list.
2063 */
2064 if (txn->mt_parent) {
2065 MDB_txn *tx2;
2066 for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
2067 if (tx2->mt_spill_pgs) {
2068 j = mdb_midl_search(tx2->mt_spill_pgs, pn);
2069 if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) {
2070 dp->mp_flags |= P_KEEP;
2071 break;
2072 }
2073 }
2074 }
2075 if (tx2)
2076 continue;
2077 }
2078 if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn)))
2079 goto done;
2080 need--;
2081 }
2082 mdb_midl_sort(txn->mt_spill_pgs);
2083
2084 /* Flush the spilled part of dirty list */
2085 if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS)
2086 goto done;
2087
2088 /* Reset any dirty pages we kept that page_flush didn't see */
2089 rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);
2090
2091done:
2092 txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
2093 return rc;
2094}
2095
2096/** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */
2097static txnid_t
2098mdb_find_oldest(MDB_txn *txn)
2099{
2100 int i;
2101 txnid_t mr, oldest = txn->mt_txnid - 1;
2102 if (txn->mt_env->me_txns) {
2103 MDB_reader *r = txn->mt_env->me_txns->mti_readers;
2104 for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
2105 if (r[i].mr_pid) {
2106 mr = r[i].mr_txnid;
2107 if (oldest > mr)
2108 oldest = mr;
2109 }
2110 }
2111 }
2112 return oldest;
2113}
2114
2115/** Add a page to the txn's dirty list */
2116static void
2117mdb_page_dirty(MDB_txn *txn, MDB_page *mp)
2118{
2119 MDB_ID2 mid;
2120 int rc, (*insert)(MDB_ID2L, MDB_ID2 *);
2121
2122 if (txn->mt_flags & MDB_TXN_WRITEMAP) {
2123 insert = mdb_mid2l_append;
2124 } else {
2125 insert = mdb_mid2l_insert;
2126 }
2127 mid.mid = mp->mp_pgno;
2128 mid.mptr = mp;
2129 rc = insert(txn->mt_u.dirty_list, &mid);
2130 mdb_tassert(txn, rc == 0);
2131 txn->mt_dirty_room--;
2132}
2133
2134/** Allocate page numbers and memory for writing. Maintain me_pglast,
2135 * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure.
2136 *
2137 * If there are free pages available from older transactions, they
2138 * are re-used first. Otherwise allocate a new page at mt_next_pgno.
2139 * Do not modify the freedB, just merge freeDB records into me_pghead[]
2140 * and move me_pglast to say which records were consumed. Only this
2141 * function can create me_pghead and move me_pglast/mt_next_pgno.
2142 * @param[in] mc cursor A cursor handle identifying the transaction and
2143 * database for which we are allocating.
2144 * @param[in] num the number of pages to allocate.
2145 * @param[out] mp Address of the allocated page(s). Requests for multiple pages
2146 * will always be satisfied by a single contiguous chunk of memory.
2147 * @return 0 on success, non-zero on failure.
2148 */
2149static int
2150mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
2151{
2152#ifdef MDB_PARANOID /* Seems like we can ignore this now */
2153 /* Get at most <Max_retries> more freeDB records once me_pghead
2154 * has enough pages. If not enough, use new pages from the map.
2155 * If <Paranoid> and mc is updating the freeDB, only get new
2156 * records if me_pghead is empty. Then the freelist cannot play
2157 * catch-up with itself by growing while trying to save it.
2158 */
2159 enum { Paranoid = 1, Max_retries = 500 };
2160#else
2161 enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ };
2162#endif
2163 int rc, retry = num * 60;
2164 MDB_txn *txn = mc->mc_txn;
2165 MDB_env *env = txn->mt_env;
2166 pgno_t pgno, *mop = env->me_pghead;
2167 unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1;
2168 MDB_page *np;
2169 txnid_t oldest = 0, last;
2170 MDB_cursor_op op;
2171 MDB_cursor m2;
2172 int found_old = 0;
2173
2174 /* If there are any loose pages, just use them */
2175 if (num == 1 && txn->mt_loose_pgs) {
2176 np = txn->mt_loose_pgs;
2177 txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np);
2178 txn->mt_loose_count--;
2179 DPRINTF(("db %d use loose page %"Z"u", DDBI(mc),
2180 np->mp_pgno));
2181 *mp = np;
2182 return MDB_SUCCESS;
2183 }
2184
2185 *mp = NULL;
2186
2187 /* If our dirty list is already full, we can't do anything */
2188 if (txn->mt_dirty_room == 0) {
2189 rc = MDB_TXN_FULL;
2190 goto fail;
2191 }
2192
2193 for (op = MDB_FIRST;; op = MDB_NEXT) {
2194 MDB_val key, data;
2195 MDB_node *leaf;
2196 pgno_t *idl;
2197
2198 /* Seek a big enough contiguous page range. Prefer
2199 * pages at the tail, just truncating the list.
2200 */
2201 if (mop_len > n2) {
2202 i = mop_len;
2203 do {
2204 pgno = mop[i];
2205 if (mop[i-n2] == pgno+n2)
2206 goto search_done;
2207 } while (--i > n2);
2208 if (--retry < 0)
2209 break;
2210 }
2211
2212 if (op == MDB_FIRST) { /* 1st iteration */
2213 /* Prepare to fetch more and coalesce */
2214 last = env->me_pglast;
2215 oldest = env->me_pgoldest;
2216 mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
2217 if (last) {
2218 op = MDB_SET_RANGE;
2219 key.mv_data = &last; /* will look up last+1 */
2220 key.mv_size = sizeof(last);
2221 }
2222 if (Paranoid && mc->mc_dbi == FREE_DBI)
2223 retry = -1;
2224 }
2225 if (Paranoid && retry < 0 && mop_len)
2226 break;
2227
2228 last++;
2229 /* Do not fetch more if the record will be too recent */
2230 if (oldest <= last) {
2231 if (!found_old) {
2232 oldest = mdb_find_oldest(txn);
2233 env->me_pgoldest = oldest;
2234 found_old = 1;
2235 }
2236 if (oldest <= last)
2237 break;
2238 }
2239 rc = mdb_cursor_get(&m2, &key, NULL, op);
2240 if (rc) {
2241 if (rc == MDB_NOTFOUND)
2242 break;
2243 goto fail;
2244 }
2245 last = *(txnid_t*)key.mv_data;
2246 if (oldest <= last) {
2247 if (!found_old) {
2248 oldest = mdb_find_oldest(txn);
2249 env->me_pgoldest = oldest;
2250 found_old = 1;
2251 }
2252 if (oldest <= last)
2253 break;
2254 }
2255 np = m2.mc_pg[m2.mc_top];
2256 leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
2257 if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS)
2258 goto fail;
2259
2260 idl = (MDB_ID *) data.mv_data;
2261 i = idl[0];
2262 if (!mop) {
2263 if (!(env->me_pghead = mop = mdb_midl_alloc(i))) {
2264 rc = ENOMEM;
2265 goto fail;
2266 }
2267 } else {
2268 if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0)
2269 goto fail;
2270 mop = env->me_pghead;
2271 }
2272 env->me_pglast = last;
2273#if (MDB_DEBUG) > 1
2274 DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u",
2275 last, txn->mt_dbs[FREE_DBI].md_root, i));
2276 for (j = i; j; j--)
2277 DPRINTF(("IDL %"Z"u", idl[j]));
2278#endif
2279 /* Merge in descending sorted order */
2280 mdb_midl_xmerge(mop, idl);
2281 mop_len = mop[0];
2282 }
2283
2284 /* Use new pages from the map when nothing suitable in the freeDB */
2285 i = 0;
2286 pgno = txn->mt_next_pgno;
2287 if (pgno + num >= env->me_maxpg) {
2288 DPUTS("DB size maxed out");
2289 rc = MDB_MAP_FULL;
2290 goto fail;
2291 }
2292
2293search_done:
2294 if (env->me_flags & MDB_WRITEMAP) {
2295 np = (MDB_page *)(env->me_map + env->me_psize * pgno);
2296 } else {
2297 if (!(np = mdb_page_malloc(txn, num))) {
2298 rc = ENOMEM;
2299 goto fail;
2300 }
2301 }
2302 if (i) {
2303 mop[0] = mop_len -= num;
2304 /* Move any stragglers down */
2305 for (j = i-num; j < mop_len; )
2306 mop[++j] = mop[++i];
2307 } else {
2308 txn->mt_next_pgno = pgno + num;
2309 }
2310 np->mp_pgno = pgno;
2311 mdb_page_dirty(txn, np);
2312 *mp = np;
2313
2314 return MDB_SUCCESS;
2315
2316fail:
2317 txn->mt_flags |= MDB_TXN_ERROR;
2318 return rc;
2319}
2320
2321/** Copy the used portions of a non-overflow page.
2322 * @param[in] dst page to copy into
2323 * @param[in] src page to copy from
2324 * @param[in] psize size of a page
2325 */
2326static void
2327mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
2328{
2329 enum { Align = sizeof(pgno_t) };
2330 indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower;
2331
2332 /* If page isn't full, just copy the used portion. Adjust
2333 * alignment so memcpy may copy words instead of bytes.
2334 */
2335 if ((unused &= -Align) && !IS_LEAF2(src)) {
2336 upper = (upper + PAGEBASE) & -Align;
2337 memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align);
2338 memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper),
2339 psize - upper);
2340 } else {
2341 memcpy(dst, src, psize - unused);
2342 }
2343}
2344
2345/** Pull a page off the txn's spill list, if present.
2346 * If a page being referenced was spilled to disk in this txn, bring
2347 * it back and make it dirty/writable again.
2348 * @param[in] txn the transaction handle.
2349 * @param[in] mp the page being referenced. It must not be dirty.
2350 * @param[out] ret the writable page, if any. ret is unchanged if
2351 * mp wasn't spilled.
2352 */
2353static int
2354mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret)
2355{
2356 MDB_env *env = txn->mt_env;
2357 const MDB_txn *tx2;
2358 unsigned x;
2359 pgno_t pgno = mp->mp_pgno, pn = pgno << 1;
2360
2361 for (tx2 = txn; tx2; tx2=tx2->mt_parent) {
2362 if (!tx2->mt_spill_pgs)
2363 continue;
2364 x = mdb_midl_search(tx2->mt_spill_pgs, pn);
2365 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
2366 MDB_page *np;
2367 int num;
2368 if (txn->mt_dirty_room == 0)
2369 return MDB_TXN_FULL;
2370 if (IS_OVERFLOW(mp))
2371 num = mp->mp_pages;
2372 else
2373 num = 1;
2374 if (env->me_flags & MDB_WRITEMAP) {
2375 np = mp;
2376 } else {
2377 np = mdb_page_malloc(txn, num);
2378 if (!np)
2379 return ENOMEM;
2380 if (num > 1)
2381 memcpy(np, mp, num * env->me_psize);
2382 else
2383 mdb_page_copy(np, mp, env->me_psize);
2384 }
2385 if (tx2 == txn) {
2386 /* If in current txn, this page is no longer spilled.
2387 * If it happens to be the last page, truncate the spill list.
2388 * Otherwise mark it as deleted by setting the LSB.
2389 */
2390 if (x == txn->mt_spill_pgs[0])
2391 txn->mt_spill_pgs[0]--;
2392 else
2393 txn->mt_spill_pgs[x] |= 1;
2394 } /* otherwise, if belonging to a parent txn, the
2395 * page remains spilled until child commits
2396 */
2397
2398 mdb_page_dirty(txn, np);
2399 np->mp_flags |= P_DIRTY;
2400 *ret = np;
2401 break;
2402 }
2403 }
2404 return MDB_SUCCESS;
2405}
2406
2407/** Touch a page: make it dirty and re-insert into tree with updated pgno.
2408 * Set #MDB_TXN_ERROR on failure.
2409 * @param[in] mc cursor pointing to the page to be touched
2410 * @return 0 on success, non-zero on failure.
2411 */
2412static int
2413mdb_page_touch(MDB_cursor *mc)
2414{
2415 MDB_page *mp = mc->mc_pg[mc->mc_top], *np;
2416 MDB_txn *txn = mc->mc_txn;
2417 MDB_cursor *m2, *m3;
2418 pgno_t pgno;
2419 int rc;
2420
2421 if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
2422 if (txn->mt_flags & MDB_TXN_SPILLS) {
2423 np = NULL;
2424 rc = mdb_page_unspill(txn, mp, &np);
2425 if (rc)
2426 goto fail;
2427 if (np)
2428 goto done;
2429 }
2430 if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) ||
2431 (rc = mdb_page_alloc(mc, 1, &np)))
2432 goto fail;
2433 pgno = np->mp_pgno;
2434 DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc),
2435 mp->mp_pgno, pgno));
2436 mdb_cassert(mc, mp->mp_pgno != pgno);
2437 mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
2438 /* Update the parent page, if any, to point to the new page */
2439 if (mc->mc_top) {
2440 MDB_page *parent = mc->mc_pg[mc->mc_top-1];
2441 MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]);
2442 SETPGNO(node, pgno);
2443 } else {
2444 mc->mc_db->md_root = pgno;
2445 }
2446 } else if (txn->mt_parent && !IS_SUBP(mp)) {
2447 MDB_ID2 mid, *dl = txn->mt_u.dirty_list;
2448 pgno = mp->mp_pgno;
2449 /* If txn has a parent, make sure the page is in our
2450 * dirty list.
2451 */
2452 if (dl[0].mid) {
2453 unsigned x = mdb_mid2l_search(dl, pgno);
2454 if (x <= dl[0].mid && dl[x].mid == pgno) {
2455 if (mp != dl[x].mptr) { /* bad cursor? */
2456 mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
2457 txn->mt_flags |= MDB_TXN_ERROR;
2458 return MDB_CORRUPTED;
2459 }
2460 return 0;
2461 }
2462 }
2463 mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX);
2464 /* No - copy it */
2465 np = mdb_page_malloc(txn, 1);
2466 if (!np)
2467 return ENOMEM;
2468 mid.mid = pgno;
2469 mid.mptr = np;
2470 rc = mdb_mid2l_insert(dl, &mid);
2471 mdb_cassert(mc, rc == 0);
2472 } else {
2473 return 0;
2474 }
2475
2476 mdb_page_copy(np, mp, txn->mt_env->me_psize);
2477 np->mp_pgno = pgno;
2478 np->mp_flags |= P_DIRTY;
2479
2480done:
2481 /* Adjust cursors pointing to mp */
2482 mc->mc_pg[mc->mc_top] = np;
2483 m2 = txn->mt_cursors[mc->mc_dbi];
2484 if (mc->mc_flags & C_SUB) {
2485 for (; m2; m2=m2->mc_next) {
2486 m3 = &m2->mc_xcursor->mx_cursor;
2487 if (m3->mc_snum < mc->mc_snum) continue;
2488 if (m3->mc_pg[mc->mc_top] == mp)
2489 m3->mc_pg[mc->mc_top] = np;
2490 }
2491 } else {
2492 for (; m2; m2=m2->mc_next) {
2493 if (m2->mc_snum < mc->mc_snum) continue;
2494 if (m2 == mc) continue;
2495 if (m2->mc_pg[mc->mc_top] == mp) {
2496 m2->mc_pg[mc->mc_top] = np;
2497 if (IS_LEAF(np))
2498 XCURSOR_REFRESH(m2, mc->mc_top, np);
2499 }
2500 }
2501 }
2502 return 0;
2503
2504fail:
2505 txn->mt_flags |= MDB_TXN_ERROR;
2506 return rc;
2507}
2508
2509int
2510mdb_env_sync(MDB_env *env, int force)
2511{
2512 int rc = 0;
2513 if (env->me_flags & MDB_RDONLY)
2514 return EACCES;
2515 if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
2516 if (env->me_flags & MDB_WRITEMAP) {
2517 int flags = ((env->me_flags & MDB_MAPASYNC) && !force)
2518 ? MS_ASYNC : MS_SYNC;
2519 if (MDB_MSYNC(env->me_map, env->me_mapsize, flags))
2520 rc = ErrCode();
2521#ifdef _WIN32
2522 else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd))
2523 rc = ErrCode();
2524#endif
2525 } else {
2526#ifdef BROKEN_FDATASYNC
2527 if (env->me_flags & MDB_FSYNCONLY) {
2528 if (fsync(env->me_fd))
2529 rc = ErrCode();
2530 } else
2531#endif
2532 if (MDB_FDATASYNC(env->me_fd))
2533 rc = ErrCode();
2534 }
2535 }
2536 return rc;
2537}
2538
2539/** Back up parent txn's cursors, then grab the originals for tracking */
2540static int
2541mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
2542{
2543 MDB_cursor *mc, *bk;
2544 MDB_xcursor *mx;
2545 size_t size;
2546 int i;
2547
2548 for (i = src->mt_numdbs; --i >= 0; ) {
2549 if ((mc = src->mt_cursors[i]) != NULL) {
2550 size = sizeof(MDB_cursor);
2551 if (mc->mc_xcursor)
2552 size += sizeof(MDB_xcursor);
2553 for (; mc; mc = bk->mc_next) {
2554 bk = malloc(size);
2555 if (!bk)
2556 return ENOMEM;
2557 *bk = *mc;
2558 mc->mc_backup = bk;
2559 mc->mc_db = &dst->mt_dbs[i];
2560 /* Kill pointers into src to reduce abuse: The
2561 * user may not use mc until dst ends. But we need a valid
2562 * txn pointer here for cursor fixups to keep working.
2563 */
2564 mc->mc_txn = dst;
2565 mc->mc_dbflag = &dst->mt_dbflags[i];
2566 if ((mx = mc->mc_xcursor) != NULL) {
2567 *(MDB_xcursor *)(bk+1) = *mx;
2568 mx->mx_cursor.mc_txn = dst;
2569 }
2570 mc->mc_next = dst->mt_cursors[i];
2571 dst->mt_cursors[i] = mc;
2572 }
2573 }
2574 }
2575 return MDB_SUCCESS;
2576}
2577
2578/** Close this write txn's cursors, give parent txn's cursors back to parent.
2579 * @param[in] txn the transaction handle.
2580 * @param[in] merge true to keep changes to parent cursors, false to revert.
2581 * @return 0 on success, non-zero on failure.
2582 */
2583static void
2584mdb_cursors_close(MDB_txn *txn, unsigned merge)
2585{
2586 MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk;
2587 MDB_xcursor *mx;
2588 int i;
2589
2590 for (i = txn->mt_numdbs; --i >= 0; ) {
2591 for (mc = cursors[i]; mc; mc = next) {
2592 next = mc->mc_next;
2593 if ((bk = mc->mc_backup) != NULL) {
2594 if (merge) {
2595 /* Commit changes to parent txn */
2596 mc->mc_next = bk->mc_next;
2597 mc->mc_backup = bk->mc_backup;
2598 mc->mc_txn = bk->mc_txn;
2599 mc->mc_db = bk->mc_db;
2600 mc->mc_dbflag = bk->mc_dbflag;
2601 if ((mx = mc->mc_xcursor) != NULL)
2602 mx->mx_cursor.mc_txn = bk->mc_txn;
2603 } else {
2604 /* Abort nested txn */
2605 *mc = *bk;
2606 if ((mx = mc->mc_xcursor) != NULL)
2607 *mx = *(MDB_xcursor *)(bk+1);
2608 }
2609 mc = bk;
2610 }
2611 /* Only malloced cursors are permanently tracked. */
2612 free(mc);
2613 }
2614 cursors[i] = NULL;
2615 }
2616}
2617
/** Operations understood by #mdb_reader_pid().
 * With pid locks the values are the fcntl() commands themselves;
 * without them (currently the same as defined(_WIN32)) they are plain
 * enumerators.
 */
#if (MDB_PIDLOCK)
enum Pidlock_op {
	Pidset = F_SETLK,
	Pidcheck = F_GETLK
};
#else
enum Pidlock_op {
	Pidset,
	Pidcheck
};
#endif
2627
2628/** Set or check a pid lock. Set returns 0 on success.
2629 * Check returns 0 if the process is certainly dead, nonzero if it may
2630 * be alive (the lock exists or an error happened so we do not know).
2631 *
2632 * On Windows Pidset is a no-op, we merely check for the existence
2633 * of the process with the given pid. On POSIX we use a single byte
2634 * lock on the lockfile, set at an offset equal to the pid.
2635 */
2636static int
2637mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
2638{
2639#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
2640 int ret = 0;
2641 HANDLE h;
2642 if (op == Pidcheck) {
2643 h = OpenProcess(env->me_pidquery, FALSE, pid);
2644 /* No documented "no such process" code, but other program use this: */
2645 if (!h)
2646 return ErrCode() != ERROR_INVALID_PARAMETER;
2647 /* A process exists until all handles to it close. Has it exited? */
2648 ret = WaitForSingleObject(h, 0) != 0;
2649 CloseHandle(h);
2650 }
2651 return ret;
2652#else
2653 for (;;) {
2654 int rc;
2655 struct flock lock_info;
2656 memset(&lock_info, 0, sizeof(lock_info));
2657 lock_info.l_type = F_WRLCK;
2658 lock_info.l_whence = SEEK_SET;
2659 lock_info.l_start = pid;
2660 lock_info.l_len = 1;
2661 if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) {
2662 if (op == F_GETLK && lock_info.l_type != F_UNLCK)
2663 rc = -1;
2664 } else if ((rc = ErrCode()) == EINTR) {
2665 continue;
2666 }
2667 return rc;
2668 }
2669#endif
2670}
2671
/** Common code for #mdb_txn_begin() and #mdb_txn_renew().
 *
 * Read-only txns: reuse or claim a slot in the reader table (when the
 * environment has one) and record the current txnid in it.
 * Write txns: take the writer mutex (when the environment has a lock
 * table), advance the txnid and reset the txn's page-tracking state.
 * Both kinds then copy the core DB records from the selected meta page
 * and rebuild the per-DBI flags from the environment.
 *
 * @param[in] txn the transaction handle to initialize
 * @return 0 on success, non-zero on failure: #MDB_BAD_RSLOT,
 * #MDB_READERS_FULL, #MDB_PANIC, #MDB_MAP_RESIZED, or an error
 * returned by the locking / thread-specific-data calls.
 */
static int
mdb_txn_renew0(MDB_txn *txn)
{
	MDB_env *env = txn->mt_env;
	MDB_txninfo *ti = env->me_txns;
	MDB_meta *meta;
	unsigned int i, nr, flags = txn->mt_flags;
	uint16_t x;
	int rc, new_notls = 0;

	/* From here on, flags holds only the RDONLY bit of the txn flags */
	if ((flags &= MDB_TXN_RDONLY) != 0) {
		if (!ti) {
			/* No lock table: no reader slot to maintain */
			meta = mdb_env_pick_meta(env);
			txn->mt_txnid = meta->mm_txnid;
			txn->mt_u.reader = NULL;
		} else {
			/* With MDB_NOTLS the slot is kept in the txn itself,
			 * otherwise in thread-specific data.
			 */
			MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
				pthread_getspecific(env->me_txkey);
			if (r) {
				/* An existing slot must be ours and currently idle */
				if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
					return MDB_BAD_RSLOT;
			} else {
				MDB_PID_T pid = env->me_pid;
				MDB_THR_T tid = pthread_self();
				mdb_mutexref_t rmutex = env->me_rmutex;

				/* First reader in this environment handle:
				 * register our pid via mdb_reader_pid() once.
				 */
				if (!env->me_live_reader) {
					rc = mdb_reader_pid(env, Pidset, pid);
					if (rc)
						return rc;
					env->me_live_reader = 1;
				}

				if (LOCK_MUTEX(rc, env, rmutex))
					return rc;
				/* Look for a free slot (mr_pid == 0) among those in use;
				 * if none, i ends up == nr, i.e. the next unused slot.
				 */
				nr = ti->mti_numreaders;
				for (i=0; i<nr; i++)
					if (ti->mti_readers[i].mr_pid == 0)
						break;
				if (i == env->me_maxreaders) {
					UNLOCK_MUTEX(rmutex);
					return MDB_READERS_FULL;
				}
				r = &ti->mti_readers[i];
				/* Claim the reader slot, carefully since other code
				 * uses the reader table un-mutexed: First reset the
				 * slot, next publish it in mti_numreaders.  After
				 * that, it is safe for mdb_env_close() to touch it.
				 * Only once it is recorded in me_close_readers do we
				 * finally claim it by storing our pid.
				 */
				r->mr_pid = 0;
				r->mr_txnid = (txnid_t)-1;
				r->mr_tid = tid;
				if (i == nr)
					ti->mti_numreaders = ++nr;
				env->me_close_readers = nr;
				r->mr_pid = pid;
				UNLOCK_MUTEX(rmutex);

				/* Non-zero only when a slot was newly claimed under
				 * MDB_NOTLS; used below to release it again on failure.
				 */
				new_notls = (env->me_flags & MDB_NOTLS);
				if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
					/* Could not remember the slot: give it back */
					r->mr_pid = 0;
					return rc;
				}
			}
			do /* LY: Retry on a race, ITS#7970. */
				r->mr_txnid = ti->mti_txnid;
			while(r->mr_txnid != ti->mti_txnid);
			txn->mt_txnid = r->mr_txnid;
			txn->mt_u.reader = r;
			/* The low bit of the txnid selects the meta page */
			meta = env->me_metas[txn->mt_txnid & 1];
		}

	} else {
		/* Not yet touching txn == env->me_txn0, it may be active */
		if (ti) {
			if (LOCK_MUTEX(rc, env, env->me_wmutex))
				return rc;
			txn->mt_txnid = ti->mti_txnid;
			meta = env->me_metas[txn->mt_txnid & 1];
		} else {
			meta = mdb_env_pick_meta(env);
			txn->mt_txnid = meta->mm_txnid;
		}
		/* A write txn gets the next txnid after the one it starts from */
		txn->mt_txnid++;
#if MDB_DEBUG
		if (txn->mt_txnid == mdb_debug_start)
			mdb_debug = 1;
#endif
		/* Reset write-txn state; dirty list and free-page list are
		 * the environment's buffers, emptied for this txn.
		 */
		txn->mt_child = NULL;
		txn->mt_loose_pgs = NULL;
		txn->mt_loose_count = 0;
		txn->mt_dirty_room = MDB_IDL_UM_MAX;
		txn->mt_u.dirty_list = env->me_dirty_list;
		txn->mt_u.dirty_list[0].mid = 0;
		txn->mt_free_pgs = env->me_free_pgs;
		txn->mt_free_pgs[0] = 0;
		txn->mt_spill_pgs = NULL;
		env->me_txn = txn;
		memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int));
	}

	/* Copy the DB info and flags */
	memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db));

	/* Moved to here to avoid a data race in read TXNs */
	txn->mt_next_pgno = meta->mm_last_pg+1;

	txn->mt_flags = flags;

	/* Setup db info */
	txn->mt_numdbs = env->me_numdbs;
	for (i=CORE_DBS; i<txn->mt_numdbs; i++) {
		x = env->me_dbflags[i];
		txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
		/* Named DBs start out stale in every txn */
		txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_USRVALID|DB_STALE : 0;
	}
	txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID;
	txn->mt_dbflags[FREE_DBI] = DB_VALID;

	if (env->me_flags & MDB_FATAL_ERROR) {
		DPUTS("environment had fatal error, must shutdown!");
		rc = MDB_PANIC;
	} else if (env->me_maxpg < txn->mt_next_pgno) {
		/* The data extends past what this environment can address */
		rc = MDB_MAP_RESIZED;
	} else {
		return MDB_SUCCESS;
	}
	/* Failure after setup: undo it through the normal end-of-txn path */
	mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN);
	return rc;
}
2807
2808int
2809mdb_txn_renew(MDB_txn *txn)
2810{
2811 int rc;
2812
2813 if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED))
2814 return EINVAL;
2815
2816 rc = mdb_txn_renew0(txn);
2817 if (rc == MDB_SUCCESS) {
2818 DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
2819 txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
2820 (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root));
2821 }
2822 return rc;
2823}
2824
/** Begin a transaction.
 *
 * Three cases are handled:
 * - nested (child) write txn: a new #MDB_ntxn is allocated and seeded
 *   from @p parent;
 * - read-only txn: a new #MDB_txn is allocated and initialized by
 *   #mdb_txn_renew0();
 * - top-level write txn: the environment's preallocated me_txn0 is
 *   reused and initialized by #mdb_txn_renew0().
 *
 * @param[in] env the environment handle
 * @param[in] parent parent transaction for a nested txn, or NULL
 * @param[in] flags txn flags; only bits in MDB_TXN_BEGIN_FLAGS are used
 * @param[out] ret receives the new transaction handle on success
 * @return 0 on success, non-zero on failure (EACCES, EINVAL,
 * #MDB_BAD_TXN, ENOMEM, or an error from the helpers called).
 */
int
mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
{
	MDB_txn *txn;
	MDB_ntxn *ntxn;
	int rc, size, tsize;

	/* Drop unsupported caller flags, inherit WRITEMAP from the env */
	flags &= MDB_TXN_BEGIN_FLAGS;
	flags |= env->me_flags & MDB_WRITEMAP;

	if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */
		return EACCES;

	if (parent) {
		/* Nested transactions: Max 1 child, write txns only, no writemap */
		flags |= parent->mt_flags;
		if (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED)) {
			return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN;
		}
		/* Child txns save MDB_pgstate and use own copy of cursors */
		/* One allocation: MDB_ntxn, then per-DBI MDB_db records,
		 * cursor pointers and one flag byte each.
		 */
		size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1);
		size += tsize = sizeof(MDB_ntxn);
	} else if (flags & MDB_RDONLY) {
		/* One allocation: MDB_txn, then per-DBI MDB_db records
		 * and one flag byte each.
		 */
		size = env->me_maxdbs * (sizeof(MDB_db)+1);
		size += tsize = sizeof(MDB_txn);
	} else {
		/* Reuse preallocated write txn. However, do not touch it until
		 * mdb_txn_renew0() succeeds, since it currently may be active.
		 */
		txn = env->me_txn0;
		goto renew;
	}
	if ((txn = calloc(1, size)) == NULL) {
		DPRINTF(("calloc: %s", strerror(errno)));
		return ENOMEM;
	}
	/* Carve the trailing arrays out of the single allocation:
	 * mt_dbs follows the header, mt_dbflags is the last me_maxdbs bytes.
	 */
	txn->mt_dbxs = env->me_dbxs;	/* static */
	txn->mt_dbs = (MDB_db *) ((char *)txn + tsize);
	txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs;
	txn->mt_flags = flags;
	txn->mt_env = env;

	if (parent) {
		unsigned int i;
		/* Cursor pointer array sits between mt_dbs and mt_dbflags */
		txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
		txn->mt_dbiseqs = parent->mt_dbiseqs;
		txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
		if (!txn->mt_u.dirty_list ||
			!(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)))
		{
			/* free(NULL) is harmless if the first allocation failed */
			free(txn->mt_u.dirty_list);
			free(txn);
			return ENOMEM;
		}
		/* The child continues from the parent's current state */
		txn->mt_txnid = parent->mt_txnid;
		txn->mt_dirty_room = parent->mt_dirty_room;
		txn->mt_u.dirty_list[0].mid = 0;
		txn->mt_spill_pgs = NULL;
		txn->mt_next_pgno = parent->mt_next_pgno;
		parent->mt_flags |= MDB_TXN_HAS_CHILD;
		parent->mt_child = txn;
		txn->mt_parent = parent;
		txn->mt_numdbs = parent->mt_numdbs;
		memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
		/* Copy parent's mt_dbflags, but clear DB_NEW */
		for (i=0; i<txn->mt_numdbs; i++)
			txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
		rc = 0;
		ntxn = (MDB_ntxn *)txn;
		ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
		if (env->me_pghead) {
			/* Give the child its own copy of me_pghead; the saved
			 * mnt_pgstate keeps pointing at the parent's list.
			 */
			size = MDB_IDL_SIZEOF(env->me_pghead);
			env->me_pghead = mdb_midl_alloc(env->me_pghead[0]);
			if (env->me_pghead)
				memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
			else
				rc = ENOMEM;
		}
		if (!rc)
			rc = mdb_cursor_shadow(parent, txn);
		if (rc)
			/* Undoes the parent linkage and frees the child's lists;
			 * the txn struct itself is freed below.
			 */
			mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD);
	} else { /* MDB_RDONLY */
		txn->mt_dbiseqs = env->me_dbiseqs;
renew:
		/* Also the entry point for the reused write txn (me_txn0) */
		rc = mdb_txn_renew0(txn);
	}
	if (rc) {
		/* me_txn0 belongs to the environment and must not be freed */
		if (txn != env->me_txn0)
			free(txn);
	} else {
		txn->mt_flags |= flags;	/* could not change txn=me_txn0 earlier */
		*ret = txn;
		DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
			txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w',
			(void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root));
	}

	return rc;
}
2925
2926MDB_env *
2927mdb_txn_env(MDB_txn *txn)
2928{
2929 if(!txn) return NULL;
2930 return txn->mt_env;
2931}
2932
2933size_t
2934mdb_txn_id(MDB_txn *txn)
2935{
2936 if(!txn) return 0;
2937 return txn->mt_txnid;
2938}
2939
2940/** Export or close DBI handles opened in this txn. */
2941static void
2942mdb_dbis_update(MDB_txn *txn, int keep)
2943{
2944 int i;
2945 MDB_dbi n = txn->mt_numdbs;
2946 MDB_env *env = txn->mt_env;
2947 unsigned char *tdbflags = txn->mt_dbflags;
2948
2949 for (i = n; --i >= CORE_DBS;) {
2950 if (tdbflags[i] & DB_NEW) {
2951 if (keep) {
2952 env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
2953 } else {
2954 char *ptr = env->me_dbxs[i].md_name.mv_data;
2955 if (ptr) {
2956 env->me_dbxs[i].md_name.mv_data = NULL;
2957 env->me_dbxs[i].md_name.mv_size = 0;
2958 env->me_dbflags[i] = 0;
2959 env->me_dbiseqs[i]++;
2960 free(ptr);
2961 }
2962 }
2963 }
2964 }
2965 if (keep && env->me_numdbs < n)
2966 env->me_numdbs = n;
2967}
2968
/** End a transaction, except successful commit of a nested transaction.
 * May be called twice for readonly txns: First reset it, then abort.
 *
 * Read-only txns: mark the reader slot idle and, depending on
 * MDB_NOTLS and #MDB_END_SLOT, detach or release it.
 * Write txns that are not yet finished: close cursors (unless the
 * caller already did), free dirty pages (unless WRITEMAP), then either
 * hand state back to the environment and drop the writer mutex
 * (top-level txn) or restore the parent's saved page state (child txn).
 *
 * @param[in] txn the transaction handle to end
 * @param[in] mode why and how to end the transaction: an operation
 * code (mode & MDB_END_OPMASK) combined with the #MDB_END_UPDATE,
 * #MDB_END_SLOT and #MDB_END_FREE option bits.
 */
static void
mdb_txn_end(MDB_txn *txn, unsigned mode)
{
	MDB_env *env = txn->mt_env;
#if MDB_DEBUG
	static const char *const names[] = MDB_END_NAMES;
#endif

	/* Export or close DBI handles opened in this txn */
	mdb_dbis_update(txn, mode & MDB_END_UPDATE);

	DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u",
		names[mode & MDB_END_OPMASK],
		txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
		(void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root));

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		if (txn->mt_u.reader) {
			/* (txnid_t)-1 marks the slot as not reading any snapshot */
			txn->mt_u.reader->mr_txnid = (txnid_t)-1;
			if (!(env->me_flags & MDB_NOTLS)) {
				txn->mt_u.reader = NULL; /* txn does not own reader */
			} else if (mode & MDB_END_SLOT) {
				/* Release the slot for reuse by clearing its pid */
				txn->mt_u.reader->mr_pid = 0;
				txn->mt_u.reader = NULL;
			} /* else txn owns the slot until it does MDB_END_SLOT */
		}
		txn->mt_numdbs = 0;		/* prevent further DBI activity */
		txn->mt_flags |= MDB_TXN_FINISHED;

	} else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) {
		/* Remember the current list now: me_pghead is reset or
		 * replaced below, and this one is freed at the end.
		 */
		pgno_t *pghead = env->me_pghead;

		if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */
			mdb_cursors_close(txn, 0);
		if (!(env->me_flags & MDB_WRITEMAP)) {
			mdb_dlist_free(txn);
		}

		txn->mt_numdbs = 0;
		txn->mt_flags = MDB_TXN_FINISHED;

		if (!txn->mt_parent) {
			/* Top-level write txn: the free-page list is kept by
			 * the environment for reuse.
			 */
			mdb_midl_shrink(&txn->mt_free_pgs);
			env->me_free_pgs = txn->mt_free_pgs;
			/* me_pgstate: */
			env->me_pghead = NULL;
			env->me_pglast = 0;

			env->me_txn = NULL;
			mode = 0;	/* txn == env->me_txn0, do not free() it */

			/* The writer mutex was locked in mdb_txn_begin. */
			if (env->me_txns)
				UNLOCK_MUTEX(env->me_wmutex);
		} else {
			/* Child txn: unlink from the parent and restore the
			 * page state saved when the child was begun.
			 */
			txn->mt_parent->mt_child = NULL;
			txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD;
			env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate;
			mdb_midl_free(txn->mt_free_pgs);
			free(txn->mt_u.dirty_list);
		}
		mdb_midl_free(txn->mt_spill_pgs);

		mdb_midl_free(pghead);
	}

	/* mode was cleared above for the environment-owned write txn */
	if (mode & MDB_END_FREE)
		free(txn);
}
3043
3044void
3045mdb_txn_reset(MDB_txn *txn)
3046{
3047 if (txn == NULL)
3048 return;
3049
3050 /* This call is only valid for read-only txns */
3051 if (!(txn->mt_flags & MDB_TXN_RDONLY))
3052 return;
3053
3054 mdb_txn_end(txn, MDB_END_RESET);
3055}
3056
3057void
3058mdb_txn_abort(MDB_txn *txn)
3059{
3060 if (txn == NULL)
3061 return;
3062
3063 if (txn->mt_child)
3064 mdb_txn_abort(txn->mt_child);
3065
3066 mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE);
3067}
3068
3069/** Save the freelist as of this transaction to the freeDB.
3070 * This changes the freelist. Keep trying until it stabilizes.
3071 */
3072static int
3073mdb_freelist_save(MDB_txn *txn)
3074{
3075 /* env->me_pghead[] can grow and shrink during this call.
3076 * env->me_pglast and txn->mt_free_pgs[] can only grow.
3077 * Page numbers cannot disappear from txn->mt_free_pgs[].
3078 */
3079 MDB_cursor mc;
3080 MDB_env *env = txn->mt_env;
3081 int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
3082 txnid_t pglast = 0, head_id = 0;
3083 pgno_t freecnt = 0, *free_pgs, *mop;
3084 ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;
3085
3086 mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
3087
3088 if (env->me_pghead) {
3089 /* Make sure first page of freeDB is touched and on freelist */
3090 rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
3091 if (rc && rc != MDB_NOTFOUND)
3092 return rc;
3093 }
3094
3095 if (!env->me_pghead && txn->mt_loose_pgs) {
3096 /* Put loose page numbers in mt_free_pgs, since
3097 * we may be unable to return them to me_pghead.
3098 */
3099 MDB_page *mp = txn->mt_loose_pgs;
3100 MDB_ID2 *dl = txn->mt_u.dirty_list;
3101 unsigned x;
3102 if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)
3103 return rc;
3104 for (; mp; mp = NEXT_LOOSE_PAGE(mp)) {
3105 mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
3106 /* must also remove from dirty list */
3107 if (txn->mt_flags & MDB_TXN_WRITEMAP) {
3108 for (x=1; x<=dl[0].mid; x++)
3109 if (dl[x].mid == mp->mp_pgno)
3110 break;
3111 mdb_tassert(txn, x <= dl[0].mid);
3112 } else {
3113 x = mdb_mid2l_search(dl, mp->mp_pgno);
3114 mdb_tassert(txn, dl[x].mid == mp->mp_pgno);
3115 mdb_dpage_free(env, mp);
3116 }
3117 dl[x].mptr = NULL;
3118 }
3119 {
3120 /* squash freed slots out of the dirty list */
3121 unsigned y;
3122 for (y=1; dl[y].mptr && y <= dl[0].mid; y++);
3123 if (y <= dl[0].mid) {
3124 for(x=y, y++;;) {
3125 while (!dl[y].mptr && y <= dl[0].mid) y++;
3126 if (y > dl[0].mid) break;
3127 dl[x++] = dl[y++];
3128 }
3129 dl[0].mid = x-1;
3130 } else {
3131 /* all slots freed */
3132 dl[0].mid = 0;
3133 }
3134 }
3135 txn->mt_loose_pgs = NULL;
3136 txn->mt_loose_count = 0;
3137 }
3138
3139 /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
3140 clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
3141 ? SSIZE_MAX : maxfree_1pg;
3142
3143 for (;;) {
3144 /* Come back here after each Put() in case freelist changed */
3145 MDB_val key, data;
3146 pgno_t *pgs;
3147 ssize_t j;
3148
3149 /* If using records from freeDB which we have not yet
3150 * deleted, delete them and any we reserved for me_pghead.
3151 */
3152 while (pglast < env->me_pglast) {
3153 rc = mdb_cursor_first(&mc, &key, NULL);
3154 if (rc)
3155 return rc;
3156 pglast = head_id = *(txnid_t *)key.mv_data;
3157 total_room = head_room = 0;
3158 mdb_tassert(txn, pglast <= env->me_pglast);
3159 rc = mdb_cursor_del(&mc, 0);
3160 if (rc)
3161 return rc;
3162 }
3163
3164 /* Save the IDL of pages freed by this txn, to a single record */
3165 if (freecnt < txn->mt_free_pgs[0]) {
3166 if (!freecnt) {
3167 /* Make sure last page of freeDB is touched and on freelist */
3168 rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
3169 if (rc && rc != MDB_NOTFOUND)
3170 return rc;
3171 }
3172 free_pgs = txn->mt_free_pgs;
3173 /* Write to last page of freeDB */
3174 key.mv_size = sizeof(txn->mt_txnid);
3175 key.mv_data = &txn->mt_txnid;
3176 do {
3177 freecnt = free_pgs[0];
3178 data.mv_size = MDB_IDL_SIZEOF(free_pgs);
3179 rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
3180 if (rc)
3181 return rc;
3182 /* Retry if mt_free_pgs[] grew during the Put() */
3183 free_pgs = txn->mt_free_pgs;
3184 } while (freecnt < free_pgs[0]);
3185 mdb_midl_sort(free_pgs);
3186 memcpy(data.mv_data, free_pgs, data.mv_size);
3187#if (MDB_DEBUG) > 1
3188 {
3189 unsigned int i = free_pgs[0];
3190 DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u",
3191 txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i));
3192 for (; i; i--)
3193 DPRINTF(("IDL %"Z"u", free_pgs[i]));
3194 }
3195#endif
3196 continue;
3197 }
3198
3199 mop = env->me_pghead;
3200 mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count;
3201
3202 /* Reserve records for me_pghead[]. Split it if multi-page,
3203 * to avoid searching freeDB for a page range. Use keys in
3204 * range [1,me_pglast]: Smaller than txnid of oldest reader.
3205 */
3206 if (total_room >= mop_len) {
3207 if (total_room == mop_len || --more < 0)
3208 break;
3209 } else if (head_room >= maxfree_1pg && head_id > 1) {
3210 /* Keep current record (overflow page), add a new one */
3211 head_id--;
3212 head_room = 0;
3213 }
3214 /* (Re)write {key = head_id, IDL length = head_room} */
3215 total_room -= head_room;
3216 head_room = mop_len - total_room;
3217 if (head_room > maxfree_1pg && head_id > 1) {
3218 /* Overflow multi-page for part of me_pghead */
3219 head_room /= head_id; /* amortize page sizes */
3220 head_room += maxfree_1pg - head_room % (maxfree_1pg + 1);
3221 } else if (head_room < 0) {
3222 /* Rare case, not bothering to delete this record */
3223 head_room = 0;
3224 }
3225 key.mv_size = sizeof(head_id);
3226 key.mv_data = &head_id;
3227 data.mv_size = (head_room + 1) * sizeof(pgno_t);
3228 rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
3229 if (rc)
3230 return rc;
3231 /* IDL is initially empty, zero out at least the length */
3232 pgs = (pgno_t *)data.mv_data;
3233 j = head_room > clean_limit ? head_room : 0;
3234 do {
3235 pgs[j] = 0;
3236 } while (--j >= 0);
3237 total_room += head_room;
3238 }
3239
3240 /* Return loose page numbers to me_pghead, though usually none are
3241 * left at this point. The pages themselves remain in dirty_list.
3242 */
3243 if (txn->mt_loose_pgs) {
3244 MDB_page *mp = txn->mt_loose_pgs;
3245 unsigned count = txn->mt_loose_count;
3246 MDB_IDL loose;
3247 /* Room for loose pages + temp IDL with same */
3248 if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0)
3249 return rc;
3250 mop = env->me_pghead;
3251 loose = mop + MDB_IDL_ALLOCLEN(mop) - count;
3252 for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp))
3253 loose[ ++count ] = mp->mp_pgno;
3254 loose[0] = count;
3255 mdb_midl_sort(loose);
3256 mdb_midl_xmerge(mop, loose);
3257 txn->mt_loose_pgs = NULL;
3258 txn->mt_loose_count = 0;
3259 mop_len = mop[0];
3260 }
3261
3262 /* Fill in the reserved me_pghead records */
3263 rc = MDB_SUCCESS;
3264 if (mop_len) {
3265 MDB_val key, data;
3266
3267 mop += mop_len;
3268 rc = mdb_cursor_first(&mc, &key, &data);
3269 for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) {
3270 txnid_t id = *(txnid_t *)key.mv_data;
3271 ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1;
3272 MDB_ID save;
3273
3274 mdb_tassert(txn, len >= 0 && id <= env->me_pglast);
3275 key.mv_data = &id;
3276 if (len > mop_len) {
3277 len = mop_len;
3278 data.mv_size = (len + 1) * sizeof(MDB_ID);
3279 }
3280 data.mv_data = mop -= len;
3281 save = mop[0];
3282 mop[0] = len;
3283 rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT);
3284 mop[0] = save;
3285 if (rc || !(mop_len -= len))
3286 break;
3287 }
3288 }
3289 return rc;
3290}
3291
/** Flush (some) dirty pages to the map, after clearing their dirty flag.
 *
 * Pages flagged P_LOOSE or P_KEEP are not written; they stay in the
 * dirty list (P_KEEP is cleared on them). With MDB_WRITEMAP nothing is
 * written here at all, only flags are cleared. Otherwise pages are
 * written to env->me_fd: one WriteFile() per page on Windows, and in
 * batches of file-contiguous pages elsewhere.
 *
 * @param[in] txn the transaction that's being committed
 * @param[in] keep number of initial pages in dirty_list to keep dirty.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_page_flush(MDB_txn *txn, int keep)
{
	MDB_env		*env = txn->mt_env;
	MDB_ID2L	dl = txn->mt_u.dirty_list;
	unsigned	psize = env->me_psize, j;
	int			i, pagecount = dl[0].mid, rc;
	size_t		size = 0, pos = 0;
	pgno_t		pgno = 0;
	MDB_page	*dp = NULL;
#ifdef _WIN32
	OVERLAPPED	ov;
#else
	struct iovec iov[MDB_COMMIT_PAGES];
	ssize_t		wpos = 0, wsize = 0, wres;
	size_t		next_pos = 1; /* impossible pos, so pos != next_pos */
	int			n = 0;
#endif

	/* i scans the list, j is the index of the last entry retained */
	j = i = keep;

	if (env->me_flags & MDB_WRITEMAP) {
		/* Clear dirty flags */
		while (++i <= pagecount) {
			dp = dl[i].mptr;
			/* Don't flush this page yet */
			if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
				dp->mp_flags &= ~P_KEEP;
				dl[++j] = dl[i];
				continue;
			}
			dp->mp_flags &= ~P_DIRTY;
		}
		goto done;
	}

	/* Write the pages */
	for (;;) {
		if (++i <= pagecount) {
			dp = dl[i].mptr;
			/* Don't flush this page yet */
			if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
				dp->mp_flags &= ~P_KEEP;
				/* mid == 0 marks the entry as skipped for the
				 * clean-up loop after the writes.
				 */
				dl[i].mid = 0;
				continue;
			}
			pgno = dl[i].mid;
			/* clear dirty flag */
			dp->mp_flags &= ~P_DIRTY;
			pos = pgno * psize;
			size = psize;
			if (IS_OVERFLOW(dp)) size *= dp->mp_pages;
		}
#ifdef _WIN32
		else break;

		/* Windows actually supports scatter/gather I/O, but only on
		 * unbuffered file handles. Since we're relying on the OS page
		 * cache for all our data, that's self-defeating. So we just
		 * write pages one at a time. We use the ov structure to set
		 * the write offset, to at least save the overhead of a Seek
		 * system call.
		 */
		DPRINTF(("committing page %"Z"u", pgno));
		memset(&ov, 0, sizeof(ov));
		ov.Offset = pos & 0xffffffff;
		/* Two 16-bit shifts: valid even where size_t is 32 bits wide */
		ov.OffsetHigh = pos >> 16 >> 16;
		if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) {
			rc = ErrCode();
			DPRINTF(("WriteFile: %d", rc));
			return rc;
		}
#else
		/* Write up to MDB_COMMIT_PAGES dirty pages at a time. */
		/* The pending batch is written out when this page does not
		 * directly follow it in the file, when the batch is full, or
		 * when adding the page would exceed MAX_WRITE. After the last
		 * page, pos still equals the previous page's offset and thus
		 * differs from next_pos, which forces the final write.
		 */
		if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) {
			if (n) {
retry_write:
				/* Write previous page(s) */
#ifdef MDB_USE_PWRITEV
				wres = pwritev(env->me_fd, iov, n, wpos);
#else
				if (n == 1) {
					wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos);
				} else {
retry_seek:
					if (lseek(env->me_fd, wpos, SEEK_SET) == -1) {
						rc = ErrCode();
						if (rc == EINTR)
							goto retry_seek;
						DPRINTF(("lseek: %s", strerror(rc)));
						return rc;
					}
					wres = writev(env->me_fd, iov, n);
				}
#endif
				if (wres != wsize) {
					if (wres < 0) {
						rc = ErrCode();
						if (rc == EINTR)
							goto retry_write;
						DPRINTF(("Write error: %s", strerror(rc)));
					} else {
						rc = EIO; /* TODO: Use which error code? */
						DPUTS("short write, filesystem full?");
					}
					return rc;
				}
				n = 0;
			}
			if (i > pagecount)
				break;
			/* Start a new batch at this page's file offset */
			wpos = pos;
			wsize = 0;
		}
		DPRINTF(("committing page %"Z"u", pgno));
		next_pos = pos + size;
		iov[n].iov_len = size;
		iov[n].iov_base = (char *)dp;
		wsize += size;
		n++;
#endif	/* _WIN32 */
	}

	/* MIPS has cache coherency issues, this is a no-op everywhere else
	 * Note: for any size >= on-chip cache size, entire on-chip cache is
	 * flushed.
	 */
	CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE);

	/* Free the written pages; compact the skipped ones to the front
	 * of the list and restore their page numbers.
	 */
	for (i = keep; ++i <= pagecount; ) {
		dp = dl[i].mptr;
		/* This is a page we skipped above */
		if (!dl[i].mid) {
			dl[++j] = dl[i];
			dl[j].mid = dp->mp_pgno;
			continue;
		}
		mdb_dpage_free(env, dp);
	}

done:
	/* i is pagecount+1 here; after the decrement, i - j is the number
	 * of entries removed from the dirty list.
	 */
	i--;
	txn->mt_dirty_room += i - j;
	dl[0].mid = j;
	return MDB_SUCCESS;
}
3443
/** Commit a transaction.
 *
 * Any child txn is committed first. A read-only txn is simply ended.
 * A nested txn is merged into its parent (free list, DB records,
 * spill list, dirty list, loose pages) and its handle freed. A
 * top-level write txn updates the DB root records, saves the freelist,
 * flushes dirty pages, syncs and writes the meta page.
 *
 * @param[in] txn the transaction handle to commit
 * @return 0 on success, non-zero on failure. On the failure paths
 * that reach the fail label the txn is aborted via #mdb_txn_abort().
 */
int
mdb_txn_commit(MDB_txn *txn)
{
	int		rc;
	unsigned int i, end_mode;
	MDB_env	*env;

	if (txn == NULL)
		return EINVAL;

	/* mdb_txn_end() mode for a commit which writes nothing */
	end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE;

	if (txn->mt_child) {
		rc = mdb_txn_commit(txn->mt_child);
		if (rc)
			goto fail;
	}

	env = txn->mt_env;

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		goto done;
	}

	if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) {
		DPUTS("txn has failed/finished, can't commit");
		/* A failed child also makes its parent unusable */
		if (txn->mt_parent)
			txn->mt_parent->mt_flags |= MDB_TXN_ERROR;
		rc = MDB_BAD_TXN;
		goto fail;
	}

	if (txn->mt_parent) {
		MDB_txn *parent = txn->mt_parent;
		MDB_page **lp;
		MDB_ID2L dst, src;
		MDB_IDL pspill;
		unsigned x, y, len, ps_len;

		/* Append our free list to parent's */
		rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
		if (rc)
			goto fail;
		mdb_midl_free(txn->mt_free_pgs);
		/* Failures after this must either undo the changes
		 * to the parent or set MDB_TXN_ERROR in the parent.
		 */

		parent->mt_next_pgno = txn->mt_next_pgno;
		parent->mt_flags = txn->mt_flags;

		/* Merge our cursors into parent's and close them */
		mdb_cursors_close(txn, 1);

		/* Update parent's DB table. */
		memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
		parent->mt_numdbs = txn->mt_numdbs;
		parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI];
		parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI];
		for (i=CORE_DBS; i<txn->mt_numdbs; i++) {
			/* preserve parent's DB_NEW status */
			x = parent->mt_dbflags[i] & DB_NEW;
			parent->mt_dbflags[i] = txn->mt_dbflags[i] | x;
		}

		dst = parent->mt_u.dirty_list;
		src = txn->mt_u.dirty_list;
		/* Remove anything in our dirty list from parent's spill list */
		if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
			x = y = ps_len;
			/* Slot 0 temporarily holds the maximum value so the
			 * backward scan below always stops there.
			 */
			pspill[0] = (pgno_t)-1;
			/* Mark our dirty pages as deleted in parent spill list */
			for (i=0, len=src[0].mid; ++i <= len; ) {
				/* Spill-list entries are page numbers shifted left
				 * by one; the low bit marks a deleted entry.
				 */
				MDB_ID pn = src[i].mid << 1;
				while (pn > pspill[x])
					x--;
				if (pn == pspill[x]) {
					pspill[x] = 1;
					y = --x;
				}
			}
			/* Squash deleted pagenums if we deleted any */
			for (x=y; ++x <= ps_len; )
				if (!(pspill[x] & 1))
					pspill[++y] = pspill[x];
			pspill[0] = y;
		}

		/* Remove anything in our spill list from parent's dirty list */
		if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) {
			for (i=1; i<=txn->mt_spill_pgs[0]; i++) {
				MDB_ID pn = txn->mt_spill_pgs[i];
				if (pn & 1)
					continue;	/* deleted spillpg */
				pn >>= 1;
				y = mdb_mid2l_search(dst, pn);
				if (y <= dst[0].mid && dst[y].mid == pn) {
					free(dst[y].mptr);
					/* Close the gap left by the removed entry */
					while (y < dst[0].mid) {
						dst[y] = dst[y+1];
						y++;
					}
					dst[0].mid--;
				}
			}
		}

		/* Find len = length of merging our dirty list with parent's */
		x = dst[0].mid;
		dst[0].mid = 0;		/* simplify loops */
		if (parent->mt_parent) {
			/* Start from the sum and subtract one for every page
			 * number present in both lists.
			 */
			len = x + src[0].mid;
			y = mdb_mid2l_search(src, dst[x].mid + 1) - 1;
			for (i = x; y && i; y--) {
				pgno_t yp = src[y].mid;
				while (yp < dst[i].mid)
					i--;
				if (yp == dst[i].mid) {
					i--;
					len--;
				}
			}
		} else { /* Simplify the above for single-ancestor case */
			len = MDB_IDL_UM_MAX - txn->mt_dirty_room;
		}
		/* Merge our dirty list with parent's */
		/* Done in place from the high end down, so parent entries
		 * are moved before their slots are overwritten. On a
		 * duplicate page number the child's copy wins and the
		 * parent's page is freed.
		 */
		y = src[0].mid;
		for (i = len; y; dst[i--] = src[y--]) {
			pgno_t yp = src[y].mid;
			while (yp < dst[x].mid)
				dst[i--] = dst[x--];
			if (yp == dst[x].mid)
				free(dst[x--].mptr);
		}
		mdb_tassert(txn, i == x);
		dst[0].mid = len;
		free(txn->mt_u.dirty_list);
		parent->mt_dirty_room = txn->mt_dirty_room;
		if (txn->mt_spill_pgs) {
			if (parent->mt_spill_pgs) {
				/* TODO: Prevent failure here, so parent does not fail */
				rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
				if (rc)
					parent->mt_flags |= MDB_TXN_ERROR;
				mdb_midl_free(txn->mt_spill_pgs);
				mdb_midl_sort(parent->mt_spill_pgs);
			} else {
				/* Parent had none: it takes over the child's list */
				parent->mt_spill_pgs = txn->mt_spill_pgs;
			}
		}

		/* Append our loose page list to parent's */
		for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp))
			;
		*lp = txn->mt_loose_pgs;
		parent->mt_loose_count += txn->mt_loose_count;

		parent->mt_child = NULL;
		/* Drop the page list saved when the child was begun */
		mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
		free(txn);
		/* rc is 0 here unless the spill-list append above failed */
		return rc;
	}

	if (txn != env->me_txn) {
		DPUTS("attempt to commit unknown transaction");
		rc = EINVAL;
		goto fail;
	}

	mdb_cursors_close(txn, 0);

	/* Nothing dirty and no DIRTY/SPILLS flag: nothing to write */
	if (!txn->mt_u.dirty_list[0].mid &&
		!(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS)))
		goto done;

	DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u",
	    txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root));

	/* Update DB root pointers */
	if (txn->mt_numdbs > CORE_DBS) {
		MDB_cursor mc;
		MDB_dbi i;
		MDB_val data;
		data.mv_size = sizeof(MDB_db);

		/* Each dirty named DB's record is stored in the main DB
		 * under the DB's name.
		 */
		mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
		for (i = CORE_DBS; i < txn->mt_numdbs; i++) {
			if (txn->mt_dbflags[i] & DB_DIRTY) {
				if (TXN_DBI_CHANGED(txn, i)) {
					rc = MDB_BAD_DBI;
					goto fail;
				}
				data.mv_data = &txn->mt_dbs[i];
				rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data,
					F_SUBDATA);
				if (rc)
					goto fail;
			}
		}
	}

	rc = mdb_freelist_save(txn);
	if (rc)
		goto fail;

	mdb_midl_free(env->me_pghead);
	env->me_pghead = NULL;
	mdb_midl_shrink(&txn->mt_free_pgs);

#if (MDB_DEBUG) > 2
	mdb_audit(txn);
#endif

	/* Order matters: data pages, then sync, then the meta page */
	if ((rc = mdb_page_flush(txn, 0)) ||
		(rc = mdb_env_sync(env, 0)) ||
		(rc = mdb_env_write_meta(txn)))
		goto fail;
	end_mode = MDB_END_COMMITTED|MDB_END_UPDATE;

done:
	mdb_txn_end(txn, end_mode);
	return MDB_SUCCESS;

fail:
	mdb_txn_abort(txn);
	return rc;
}
3672
3673/** Read the environment parameters of a DB environment before
3674 * mapping it into memory.
3675 * @param[in] env the environment handle
3676 * @param[out] meta address of where to store the meta information
3677 * @return 0 on success, non-zero on failure.
3678 */
3679static int ESECT
3680mdb_env_read_header(MDB_env *env, MDB_meta *meta)
3681{
3682 MDB_metabuf pbuf;
3683 MDB_page *p;
3684 MDB_meta *m;
3685 int i, rc, off;
3686 enum { Size = sizeof(pbuf) };
3687
3688 /* We don't know the page size yet, so use a minimum value.
3689 * Read both meta pages so we can use the latest one.
3690 */
3691
3692 for (i=off=0; i<NUM_METAS; i++, off += meta->mm_psize) {
3693#ifdef _WIN32
3694 DWORD len;
3695 OVERLAPPED ov;
3696 memset(&ov, 0, sizeof(ov));
3697 ov.Offset = off;
3698 rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
3699 if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
3700 rc = 0;
3701#else
3702 rc = pread(env->me_fd, &pbuf, Size, off);
3703#endif
3704 if (rc != Size) {
3705 if (rc == 0 && off == 0)
3706 return ENOENT;
3707 rc = rc < 0 ? (int) ErrCode() : MDB_INVALID;
3708 DPRINTF(("read: %s", mdb_strerror(rc)));
3709 return rc;
3710 }
3711
3712 p = (MDB_page *)&pbuf;
3713
3714 if (!F_ISSET(p->mp_flags, P_META)) {
3715 DPRINTF(("page %"Z"u not a meta page", p->mp_pgno));
3716 return MDB_INVALID;
3717 }
3718
3719 m = METADATA(p);
3720 if (m->mm_magic != MDB_MAGIC) {
3721 DPUTS("meta has invalid magic");
3722 return MDB_INVALID;
3723 }
3724
3725 if (m->mm_version != MDB_DATA_VERSION) {
3726 DPRINTF(("database is version %u, expected version %u",
3727 m->mm_version, MDB_DATA_VERSION));
3728 return MDB_VERSION_MISMATCH;
3729 }
3730
3731 if (off == 0 || m->mm_txnid > meta->mm_txnid)
3732 *meta = *m;
3733 }
3734 return 0;
3735}
3736
3737/** Fill in most of the zeroed #MDB_meta for an empty database environment */
3738static void ESECT
3739mdb_env_init_meta0(MDB_env *env, MDB_meta *meta)
3740{
3741 meta->mm_magic = MDB_MAGIC;
3742 meta->mm_version = MDB_DATA_VERSION;
3743 meta->mm_mapsize = env->me_mapsize;
3744 meta->mm_psize = env->me_psize;
3745 meta->mm_last_pg = NUM_METAS-1;
3746 meta->mm_flags = env->me_flags & 0xffff;
3747 meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */
3748 meta->mm_dbs[FREE_DBI].md_root = P_INVALID;
3749 meta->mm_dbs[MAIN_DBI].md_root = P_INVALID;
3750}
3751
/** Write the environment parameters of a freshly created DB environment.
 *
 * Builds NUM_METAS zero-filled pages in a temporary buffer, marks
 * pages 0 and 1 as meta pages holding a copy of @p meta, and writes
 * the buffer at offset 0 of env->me_fd.
 *
 * @param[in] env the environment handle
 * @param[in] meta the #MDB_meta to write
 * @return 0 on success, non-zero on failure: ENOMEM, the system error
 * of a failed write, or ENOSPC when fewer bytes than requested were
 * written.
 */
static int ESECT
mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
{
	MDB_page *p, *q;
	int rc;
	unsigned int	 psize;
	/* DO_PWRITE(rc, fd, ptr, size, len, pos): positional write that
	 * leaves the byte count in len and a non-zero rc on success, zero
	 * on failure. The POSIX variant retries while interrupted (EINTR).
	 */
#ifdef _WIN32
	DWORD len;
	OVERLAPPED ov;
	memset(&ov, 0, sizeof(ov));
#define DO_PWRITE(rc, fd, ptr, size, len, pos)	do { \
	ov.Offset = pos;	\
	rc = WriteFile(fd, ptr, size, &len, &ov);	} while(0)
#else
	int len;
#define DO_PWRITE(rc, fd, ptr, size, len, pos)	do { \
	len = pwrite(fd, ptr, size, pos);	\
	if (len == -1 && ErrCode() == EINTR) continue; \
	rc = (len >= 0); break; } while(1)
#endif

	DPUTS("writing new meta page");

	psize = env->me_psize;

	/* calloc: everything outside the headers and meta copies is zero */
	p = calloc(NUM_METAS, psize);
	if (!p)
		return ENOMEM;

	p->mp_pgno = 0;
	p->mp_flags = P_META;
	*(MDB_meta *)METADATA(p) = *meta;

	/* Second meta page starts one page size into the buffer */
	q = (MDB_page *)((char *)p + psize);
	q->mp_pgno = 1;
	q->mp_flags = P_META;
	*(MDB_meta *)METADATA(q) = *meta;

	DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0);
	if (!rc)
		rc = ErrCode();
	else if ((unsigned) len == psize * NUM_METAS)
		rc = MDB_SUCCESS;
	else
		/* The write succeeded but was short */
		rc = ENOSPC;
	free(p);
	return rc;
}
3805
/** Update the environment info to commit a transaction.
 *
 * The meta page selected by the low bit of the transaction ID is
 * overwritten with the transaction's database records, last page number
 * and ID. With #MDB_WRITEMAP the update goes through the memory map
 * (followed by an msync unless syncing is disabled); otherwise the
 * trailing part of an #MDB_meta, starting at mm_mapsize, is written to
 * the file. On failure #MDB_FATAL_ERROR is set in the environment flags.
 * @param[in] txn the transaction that's being committed
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_env_write_meta(MDB_txn *txn)
{
	MDB_env *env;
	MDB_meta	meta, metab, *mp;
	unsigned flags;
	size_t mapsize;
	off_t off;
	int rc, len, toggle;
	char *ptr;
	HANDLE mfd;
#ifdef _WIN32
	OVERLAPPED ov;
#else
	int r2;
#endif

	/* The parity of the txnid picks which of the two meta pages to use */
	toggle = txn->mt_txnid & 1;
	DPRINTF(("writing meta page %d for root page %"Z"u",
		toggle, txn->mt_dbs[MAIN_DBI].md_root));

	env = txn->mt_env;
	flags = env->me_flags;
	mp = env->me_metas[toggle];
	/* Start from the map size recorded in the other meta page */
	mapsize = env->me_metas[toggle ^ 1]->mm_mapsize;
	/* Persist any increases of mapsize config */
	if (mapsize < env->me_mapsize)
		mapsize = env->me_mapsize;

	if (flags & MDB_WRITEMAP) {
		/* Update the meta page in place through the map. mm_txnid is
		 * stored last, after the other fields.
		 */
		mp->mm_mapsize = mapsize;
		mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
		mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
		mp->mm_last_pg = txn->mt_next_pgno - 1;
#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */	\
	!(defined(__i386__) || defined(__x86_64__))
		/* LY: issue a memory barrier, if not x86. ITS#7969 */
		__sync_synchronize();
#endif
		mp->mm_txnid = txn->mt_txnid;
		if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
			unsigned meta_size = env->me_psize;
			/* rc temporarily holds the msync mode */
			rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
			/* Back up from the meta data to the start of its page */
			ptr = (char *)mp - PAGEHDRSZ;
#ifndef _WIN32	/* POSIX msync() requires ptr = start of OS page */
			r2 = (ptr - env->me_map) & (env->me_os_psize - 1);
			ptr -= r2;
			meta_size += r2;
#endif
			if (MDB_MSYNC(ptr, meta_size, rc)) {
				rc = ErrCode();
				goto fail;
			}
		}
		goto done;
	}
	/* Remember the values currently in the target meta page, so they
	 * can be written back if the new write fails.
	 */
	metab.mm_txnid = mp->mm_txnid;
	metab.mm_last_pg = mp->mm_last_pg;

	meta.mm_mapsize = mapsize;
	meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
	meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
	meta.mm_last_pg = txn->mt_next_pgno - 1;
	meta.mm_txnid = txn->mt_txnid;

	/* Only the part of MDB_meta from mm_mapsize to the end is written:
	 * ptr/len describe that span in the local copy, off becomes its
	 * offset in the data file.
	 */
	off = offsetof(MDB_meta, mm_mapsize);
	ptr = (char *)&meta + off;
	len = sizeof(MDB_meta) - off;
	off += (char *)mp - env->me_map;

	/* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC.
	 * (me_mfd goes to the same file as me_fd, but writing to it
	 * also syncs to disk.  Avoids a separate fdatasync() call.)
	 */
	mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd;
#ifdef _WIN32
	{
		memset(&ov, 0, sizeof(ov));
		ov.Offset = off;
		if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov))
			rc = -1;
	}
#else
retry_write:
	rc = pwrite(mfd, ptr, len, off);
#endif
	if (rc != len) {
		/* A negative result is a system error, a short write becomes EIO */
		rc = rc < 0 ? ErrCode() : EIO;
#ifndef _WIN32
		if (rc == EINTR)
			goto retry_write;
#endif
		DPUTS("write failed, disk error?");
		/* On a failure, the pagecache still contains the new data.
		 * Write some old data back, to prevent it from being used.
		 * Use the non-SYNC fd; we know it will fail anyway.
		 */
		meta.mm_last_pg = metab.mm_last_pg;
		meta.mm_txnid = metab.mm_txnid;
#ifdef _WIN32
		memset(&ov, 0, sizeof(ov));
		ov.Offset = off;
		WriteFile(env->me_fd, ptr, len, NULL, &ov);
#else
		r2 = pwrite(env->me_fd, ptr, len, off);
		(void)r2;	/* Silence warnings. We don't care about pwrite's return value */
#endif
fail:
		/* Also reached from the MDB_WRITEMAP path when msync fails */
		env->me_flags |= MDB_FATAL_ERROR;
		return rc;
	}
	/* MIPS has cache coherency issues, this is a no-op everywhere else */
	CACHEFLUSH(env->me_map + off, len, DCACHE);
done:
	/* Memory ordering issues are irrelevant; since the entire writer
	 * is wrapped by wmutex, all of these changes will become visible
	 * after the wmutex is unlocked. Since the DB is multi-version,
	 * readers will get consistent data regardless of how fresh or
	 * how stale their view of these values is.
	 */
	if (env->me_txns)
		env->me_txns->mti_txnid = txn->mt_txnid;

	return MDB_SUCCESS;
}
3935
3936/** Check both meta pages to see which one is newer.
3937 * @param[in] env the environment handle
3938 * @return newest #MDB_meta.
3939 */
3940static MDB_meta *
3941mdb_env_pick_meta(const MDB_env *env)
3942{
3943 MDB_meta *const *metas = env->me_metas;
3944 return metas[ metas[0]->mm_txnid < metas[1]->mm_txnid ];
3945}
3946
3947int ESECT
3948mdb_env_create(MDB_env **env)
3949{
3950 MDB_env *e;
3951
3952 e = calloc(1, sizeof(MDB_env));
3953 if (!e)
3954 return ENOMEM;
3955
3956 e->me_maxreaders = DEFAULT_READERS;
3957 e->me_maxdbs = e->me_numdbs = CORE_DBS;
3958 e->me_fd = INVALID_HANDLE_VALUE;
3959 e->me_lfd = INVALID_HANDLE_VALUE;
3960 e->me_mfd = INVALID_HANDLE_VALUE;
3961#ifdef MDB_USE_POSIX_SEM
3962 e->me_rmutex = SEM_FAILED;
3963 e->me_wmutex = SEM_FAILED;
3964#endif
3965 e->me_pid = getpid();
3966 GET_PAGESIZE(e->me_os_psize);
3967 VGMEMP_CREATE(e,0,0);
3968 *env = e;
3969 return MDB_SUCCESS;
3970}
3971
/** Map the data file into memory and locate the two meta pages in the map.
 *
 * The map covers me_mapsize bytes (on Windows, a read-only environment
 * maps whatever size the file already has). It is writable only with
 * #MDB_WRITEMAP.
 * @param[in] env the environment handle
 * @param[in] addr requested address for the map, or NULL for no preference
 * @return 0 on success, EBUSY if \b addr was given but the map was placed
 * elsewhere, otherwise a system error code.
 */
static int ESECT
mdb_env_map(MDB_env *env, void *addr)
{
	MDB_page *p;
	unsigned int flags = env->me_flags;
#ifdef _WIN32
	int rc;
	HANDLE mh;
	LONG sizelo, sizehi;
	size_t msize;

	if (flags & MDB_RDONLY) {
		/* Don't set explicit map size, use whatever exists */
		msize = 0;
		sizelo = 0;
		sizehi = 0;
	} else {
		/* Split the map size into the low/high 32-bit halves the API takes */
		msize = env->me_mapsize;
		sizelo = msize & 0xffffffff;
		sizehi = msize >> 16 >> 16; /* only needed on Win64 */

		/* Windows won't create mappings for zero length files.
		 * and won't map more than the file size.
		 * Just set the maxsize right now.
		 */
		if (!(flags & MDB_WRITEMAP) && (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo
			|| !SetEndOfFile(env->me_fd)
			|| SetFilePointer(env->me_fd, 0, NULL, 0) != 0))
			return ErrCode();
	}

	mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ?
		PAGE_READWRITE : PAGE_READONLY,
		sizehi, sizelo, NULL);
	if (!mh)
		return ErrCode();
	env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ?
		FILE_MAP_WRITE : FILE_MAP_READ,
		0, 0, msize, addr);
	/* Fetch the error code before CloseHandle() is called */
	rc = env->me_map ? 0 : ErrCode();
	CloseHandle(mh);
	if (rc)
		return rc;
#else
	int mmap_flags = MAP_SHARED;
	int prot = PROT_READ;
#ifdef MAP_NOSYNC	/* Used on FreeBSD */
	if (flags & MDB_NOSYNC)
		mmap_flags |= MAP_NOSYNC;
#endif
	if (flags & MDB_WRITEMAP) {
		/* A writable map needs the file to span the whole map */
		prot |= PROT_WRITE;
		if (ftruncate(env->me_fd, env->me_mapsize) < 0)
			return ErrCode();
	}
	env->me_map = mmap(addr, env->me_mapsize, prot, mmap_flags,
		env->me_fd, 0);
	if (env->me_map == MAP_FAILED) {
		/* Leave me_map NULL rather than MAP_FAILED on error */
		env->me_map = NULL;
		return ErrCode();
	}

	if (flags & MDB_NORDAHEAD) {
		/* Turn off readahead. It's harmful when the DB is larger than RAM. */
#ifdef MADV_RANDOM
		madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
#else
#ifdef POSIX_MADV_RANDOM
		posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
#endif /* POSIX_MADV_RANDOM */
#endif /* MADV_RANDOM */
	}
#endif /* _WIN32 */

	/* Can happen because the address argument to mmap() is just a
	 * hint.  mmap() can pick another, e.g. if the range is in use.
	 * The MAP_FIXED flag would prevent that, but then mmap could
	 * instead unmap existing pages to make room for the new map.
	 */
	if (addr && env->me_map != addr)
		return EBUSY;	/* TODO: Make a new MDB_* error code? */

	/* The meta pages are the first two pages of the map, me_psize apart */
	p = (MDB_page *)env->me_map;
	env->me_metas[0] = METADATA(p);
	env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize);

	return MDB_SUCCESS;
}
4060
4061int ESECT
4062mdb_env_set_mapsize(MDB_env *env, size_t size)
4063{
4064 /* If env is already open, caller is responsible for making
4065 * sure there are no active txns.
4066 */
4067 if (env->me_map) {
4068 int rc;
4069 MDB_meta *meta;
4070 void *old;
4071 if (env->me_txn)
4072 return EINVAL;
4073 meta = mdb_env_pick_meta(env);
4074 if (!size)
4075 size = meta->mm_mapsize;
4076 {
4077 /* Silently round up to minimum if the size is too small */
4078 size_t minsize = (meta->mm_last_pg + 1) * env->me_psize;
4079 if (size < minsize)
4080 size = minsize;
4081 }
4082 munmap(env->me_map, env->me_mapsize);
4083 env->me_mapsize = size;
4084 old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL;
4085 rc = mdb_env_map(env, old);
4086 if (rc)
4087 return rc;
4088 }
4089 env->me_mapsize = size;
4090 if (env->me_psize)
4091 env->me_maxpg = env->me_mapsize / env->me_psize;
4092 return MDB_SUCCESS;
4093}
4094
4095int ESECT
4096mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
4097{
4098 if (env->me_map)
4099 return EINVAL;
4100 env->me_maxdbs = dbs + CORE_DBS;
4101 return MDB_SUCCESS;
4102}
4103
4104int ESECT
4105mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
4106{
4107 if (env->me_map || readers < 1)
4108 return EINVAL;
4109 env->me_maxreaders = readers;
4110 return MDB_SUCCESS;
4111}
4112
4113int ESECT
4114mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
4115{
4116 if (!env || !readers)
4117 return EINVAL;
4118 *readers = env->me_maxreaders;
4119 return MDB_SUCCESS;
4120}
4121
4122static int ESECT
4123mdb_fsize(HANDLE fd, size_t *size)
4124{
4125#ifdef _WIN32
4126 LARGE_INTEGER fsize;
4127
4128 if (!GetFileSizeEx(fd, &fsize))
4129 return ErrCode();
4130
4131 *size = fsize.QuadPart;
4132#else
4133 struct stat st;
4134
4135 if (fstat(fd, &st))
4136 return ErrCode();
4137
4138 *size = st.st_size;
4139#endif
4140 return MDB_SUCCESS;
4141}
4142
4143
#ifdef _WIN32
/** Character type for file names: char on Unix, wchar_t on Windows */
typedef wchar_t	mdb_nchar_t;
# define MDB_NAME(str)	L##str		/**< #mdb_nchar_t[] string literal (wide) */
# define mdb_name_cpy	wcscpy		/**< Copy name (#mdb_nchar_t string) */
#else
/** Character type for file names: char on Unix, wchar_t on Windows */
typedef char	mdb_nchar_t;
# define MDB_NAME(str)	str		/**< #mdb_nchar_t[] string literal */
# define mdb_name_cpy	strcpy	/**< Copy name (#mdb_nchar_t string) */
#endif
4154
/** Filename - string of #mdb_nchar_t[].
 * Filled in by #mdb_fname_init() and released with #mdb_fname_destroy().
 */
typedef struct MDB_name {
	int mn_len;					/**< Length of the path; not changed when a suffix is appended */
	int mn_alloced;				/**< True if #mn_val was malloced */
	mdb_nchar_t	*mn_val;		/**< Contents */
} MDB_name;
4161
/** Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR].
 * The first index is 1 for the lockfile, the second is 1 when
 * #MDB_NOSUBDIR is set (the data file then needs no suffix at all).
 */
static const mdb_nchar_t *const mdb_suffixes[2][2] = {
	{ MDB_NAME("/data.mdb"), MDB_NAME("")      },
	{ MDB_NAME("/lock.mdb"), MDB_NAME("-lock") }
};

#define MDB_SUFFLEN 9	/**< Max string length in #mdb_suffixes[] */
4169
4170/** Set up filename + scratch area for filename suffix, for opening files.
4171 * It should be freed with #mdb_fname_destroy().
4172 * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16.
4173 *
4174 * @param[in] path Pathname for #mdb_env_open().
4175 * @param[in] envflags Whether a subdir and/or lockfile will be used.
4176 * @param[out] fname Resulting filename, with room for a suffix if necessary.
4177 */
4178static int ESECT
4179mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname)
4180{
4181 int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK);
4182 fname->mn_alloced = 0;
4183#ifdef _WIN32
4184 return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN);
4185#else
4186 fname->mn_len = strlen(path);
4187 if (no_suffix)
4188 fname->mn_val = (char *) path;
4189 else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) {
4190 fname->mn_alloced = 1;
4191 strcpy(fname->mn_val, path);
4192 }
4193 else
4194 return ENOMEM;
4195 return MDB_SUCCESS;
4196#endif
4197}
4198
/** Destroy \b fname from #mdb_fname_init().
 * The contents are freed only when mn_alloced says they were malloced;
 * \b fname is an #MDB_name object, not a pointer to one.
 */
#define mdb_fname_destroy(fname) \
	do { if ((fname).mn_alloced) free((fname).mn_val); } while (0)
4202
/* MDB_CLOEXEC is O_CLOEXEC where the platform defines it, and 0 (no
 * extra open() flag) where it does not.
 */
#ifdef O_CLOEXEC	/* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */
# define MDB_CLOEXEC		O_CLOEXEC
#else
# define MDB_CLOEXEC		0
#endif
4208
/** File type, access mode etc. for #mdb_fopen().
 * On Windows these are plain enumerators which #mdb_fopen() translates
 * itself; elsewhere each value carries the open() flags to use.
 */
enum mdb_fopen_type {
#ifdef _WIN32
	MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS
#else
	/* A comment in mdb_fopen() explains some O_* flag choices. */
	MDB_O_RDONLY= O_RDONLY,                            /**< for RDONLY me_fd */
	MDB_O_RDWR  = O_RDWR  |O_CREAT,                    /**< for me_fd */
	MDB_O_META  = O_WRONLY|MDB_DSYNC     |MDB_CLOEXEC, /**< for me_mfd */
	MDB_O_COPY  = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */
	/** Bitmask for open() flags in enum #mdb_fopen_type.  The other bits
	 * distinguish otherwise-equal MDB_O_* constants from each other.
	 */
	MDB_O_MASK  = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY,
	/* The last term adds one bit outside MDB_O_MASK, which
	 * mdb_fopen() masks off again before calling open().
	 */
	MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */
#endif
};
4226
/** Open an LMDB file.
 * @param[in] env	The LMDB environment.
 * @param[in,out] fname	Path from #mdb_fname_init().  A suffix is
 * appended if necessary to create the filename, without changing mn_len.
 * @param[in] which	Determines file type, access mode, etc.
 * @param[in] mode	The Unix permissions for the file, if we create it.
 * @param[out] res	Resulting file handle; set to INVALID_HANDLE_VALUE
 * when the open fails.
 * @return 0 on success, non-zero on failure.
 */
static int ESECT
mdb_fopen(const MDB_env *env, MDB_name *fname,
	enum mdb_fopen_type which, mdb_mode_t mode,
	HANDLE *res)
{
	int rc = MDB_SUCCESS;
	HANDLE fd;
#ifdef _WIN32
	DWORD acc, share, disp, attrs;
#else
	int flags;
#endif

	/* Pick the lockfile or datafile suffix, in its MDB_NOSUBDIR or
	 * subdirectory form, and write it right after the path.
	 */
	if (fname->mn_alloced)		/* modifiable copy */
		mdb_name_cpy(fname->mn_val + fname->mn_len,
			mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]);

	/* The directory must already exist.  Usually the file need not.
	 * MDB_O_META requires the file because we already created it using
	 * MDB_O_RDWR.  MDB_O_COPY must not overwrite an existing file.
	 *
	 * With MDB_O_COPY we do not want the OS to cache the writes, since
	 * the source data is already in the OS cache.
	 *
	 * The lockfile needs FD_CLOEXEC (close file descriptor on exec*())
	 * to avoid the flock() issues noted under Caveats in lmdb.h.
	 * Also set it for other filehandles which the user cannot get at
	 * and close himself, which he may need after fork().  I.e. all but
	 * me_fd, which programs do use via mdb_env_get_fd().
	 */

#ifdef _WIN32
	/* Defaults (used as-is for MDB_O_RDWR and MDB_O_LOCKS): read/write
	 * access, shared, created if missing.
	 */
	acc = GENERIC_READ|GENERIC_WRITE;
	share = FILE_SHARE_READ|FILE_SHARE_WRITE;
	disp = OPEN_ALWAYS;
	attrs = FILE_ATTRIBUTE_NORMAL;
	switch (which) {
	case MDB_O_RDONLY:			/* read-only datafile */
		acc = GENERIC_READ;
		disp = OPEN_EXISTING;
		break;
	case MDB_O_META:			/* for writing metapages */
		acc = GENERIC_WRITE;
		disp = OPEN_EXISTING;
		attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH;
		break;
	case MDB_O_COPY:			/* mdb_env_copy() & co */
		acc = GENERIC_WRITE;
		share = 0;
		disp = CREATE_NEW;
		attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH;
		break;
	default: break;	/* silence gcc -Wswitch (not all enum values handled) */
	}
	fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL);
#else
	/* Only the bits in MDB_O_MASK are real open() flags */
	fd = open(fname->mn_val, which & MDB_O_MASK, mode);
#endif

	if (fd == INVALID_HANDLE_VALUE)
		rc = ErrCode();
#ifndef _WIN32
	else {
		if (which != MDB_O_RDONLY && which != MDB_O_RDWR) {
			/* Set CLOEXEC if we could not pass it to open() */
			if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1)
				(void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
		}
		if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) {
			/* This may require buffer alignment.  There is no portable
			 * way to ask how much, so we require OS pagesize alignment.
			 */
# ifdef F_NOCACHE	/* __APPLE__ */
			(void) fcntl(fd, F_NOCACHE, 1);
# elif defined O_DIRECT
			/* open(...O_DIRECT...) would break on filesystems without
			 * O_DIRECT support (ITS#7682). Try to set it here instead.
			 */
			if ((flags = fcntl(fd, F_GETFL)) != -1)
				(void) fcntl(fd, F_SETFL, flags | O_DIRECT);
# endif
		}
	}
#endif	/* !_WIN32 */

	/* The handle is stored even when the open failed */
	*res = fd;
	return rc;
}
4324
4325
4326#ifdef BROKEN_FDATASYNC
4327#include <sys/utsname.h>
4328#include <sys/vfs.h>
4329#endif
4330
4331/** Further setup required for opening an LMDB environment
4332 */
4333static int ESECT
4334mdb_env_open2(MDB_env *env)
4335{
4336 unsigned int flags = env->me_flags;
4337 int i, newenv = 0, rc;
4338 MDB_meta meta;
4339
4340#ifdef _WIN32
4341 /* See if we should use QueryLimited */
4342 rc = GetVersion();
4343 if ((rc & 0xff) > 5)
4344 env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION;
4345 else
4346 env->me_pidquery = PROCESS_QUERY_INFORMATION;
4347#endif /* _WIN32 */
4348
4349#ifdef BROKEN_FDATASYNC
4350 /* ext3/ext4 fdatasync is broken on some older Linux kernels.
4351 * https://lkml.org/lkml/2012/9/3/83
4352 * Kernels after 3.6-rc6 are known good.
4353 * https://lkml.org/lkml/2012/9/10/556
4354 * See if the DB is on ext3/ext4, then check for new enough kernel
4355 * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known
4356 * to be patched.
4357 */
4358 {
4359 struct statfs st;
4360 fstatfs(env->me_fd, &st);
4361 while (st.f_type == 0xEF53) {
4362 struct utsname uts;
4363 int i;
4364 uname(&uts);
4365 if (uts.release[0] < '3') {
4366 if (!strncmp(uts.release, "2.6.32.", 7)) {
4367 i = atoi(uts.release+7);
4368 if (i >= 60)
4369 break; /* 2.6.32.60 and newer is OK */
4370 } else if (!strncmp(uts.release, "2.6.34.", 7)) {
4371 i = atoi(uts.release+7);
4372 if (i >= 15)
4373 break; /* 2.6.34.15 and newer is OK */
4374 }
4375 } else if (uts.release[0] == '3') {
4376 i = atoi(uts.release+2);
4377 if (i > 5)
4378 break; /* 3.6 and newer is OK */
4379 if (i == 5) {
4380 i = atoi(uts.release+4);
4381 if (i >= 4)
4382 break; /* 3.5.4 and newer is OK */
4383 } else if (i == 2) {
4384 i = atoi(uts.release+4);
4385 if (i >= 30)
4386 break; /* 3.2.30 and newer is OK */
4387 }
4388 } else { /* 4.x and newer is OK */
4389 break;
4390 }
4391 env->me_flags |= MDB_FSYNCONLY;
4392 break;
4393 }
4394 }
4395#endif
4396
4397 if ((i = mdb_env_read_header(env, &meta)) != 0) {
4398 if (i != ENOENT)
4399 return i;
4400 DPUTS("new mdbenv");
4401 newenv = 1;
4402 env->me_psize = env->me_os_psize;
4403 if (env->me_psize > MAX_PAGESIZE)
4404 env->me_psize = MAX_PAGESIZE;
4405 memset(&meta, 0, sizeof(meta));
4406 mdb_env_init_meta0(env, &meta);
4407 meta.mm_mapsize = DEFAULT_MAPSIZE;
4408 } else {
4409 env->me_psize = meta.mm_psize;
4410 }
4411
4412 /* Was a mapsize configured? */
4413 if (!env->me_mapsize) {
4414 env->me_mapsize = meta.mm_mapsize;
4415 }
4416 {
4417 /* Make sure mapsize >= committed data size. Even when using
4418 * mm_mapsize, which could be broken in old files (ITS#7789).
4419 */
4420 size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize;
4421 if (env->me_mapsize < minsize)
4422 env->me_mapsize = minsize;
4423 }
4424 meta.mm_mapsize = env->me_mapsize;
4425
4426 if (newenv && !(flags & MDB_FIXEDMAP)) {
4427 /* mdb_env_map() may grow the datafile. Write the metapages
4428 * first, so the file will be valid if initialization fails.
4429 * Except with FIXEDMAP, since we do not yet know mm_address.
4430 * We could fill in mm_address later, but then a different
4431 * program might end up doing that - one with a memory layout
4432 * and map address which does not suit the main program.
4433 */
4434 rc = mdb_env_init_meta(env, &meta);
4435 if (rc)
4436 return rc;
4437 newenv = 0;
4438 }
4439
4440 rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL);
4441 if (rc)
4442 return rc;
4443
4444 if (newenv) {
4445 if (flags & MDB_FIXEDMAP)
4446 meta.mm_address = env->me_map;
4447 i = mdb_env_init_meta(env, &meta);
4448 if (i != MDB_SUCCESS) {
4449 return i;
4450 }
4451 }
4452
4453 env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
4454 env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2)
4455 - sizeof(indx_t);
4456#if !(MDB_MAXKEYSIZE)
4457 env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db));
4458#endif
4459 env->me_maxpg = env->me_mapsize / env->me_psize;
4460
4461#if MDB_DEBUG
4462 {
4463 MDB_meta *meta = mdb_env_pick_meta(env);
4464 MDB_db *db = &meta->mm_dbs[MAIN_DBI];
4465
4466 DPRINTF(("opened database version %u, pagesize %u",
4467 meta->mm_version, env->me_psize));
4468 DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1)));
4469 DPRINTF(("depth: %u", db->md_depth));
4470 DPRINTF(("entries: %"Z"u", db->md_entries));
4471 DPRINTF(("branch pages: %"Z"u", db->md_branch_pages));
4472 DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages));
4473 DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages));
4474 DPRINTF(("root: %"Z"u", db->md_root));
4475 }
4476#endif
4477
4478 return MDB_SUCCESS;
4479}
4480
4481
4482/** Release a reader thread's slot in the reader lock table.
4483 * This function is called automatically when a thread exits.
4484 * @param[in] ptr This points to the slot in the reader lock table.
4485 */
4486static void
4487mdb_env_reader_dest(void *ptr)
4488{
4489 MDB_reader *reader = ptr;
4490
4491#ifndef _WIN32
4492 if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */
4493#endif
4494 /* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */
4495 reader->mr_pid = 0;
4496}
4497
4498#ifdef _WIN32
/** Junk for arranging thread-specific callbacks on Windows. This is
 * necessarily platform and compiler-specific. Windows supports up
 * to 1088 keys. Let's assume nobody opens more than 64 environments
 * in a single process, for now. They can override this if needed
 * by defining MAX_TLS_KEYS before this point.
 */
#ifndef MAX_TLS_KEYS
#define MAX_TLS_KEYS	64
#endif
/* Thread-specific keys visited by the TLS callback on thread detach */
static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS];
/* Number of entries of mdb_tls_keys[] in use */
static int mdb_tls_nkeys;
4509
4510static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
4511{
4512 int i;
4513 switch(reason) {
4514 case DLL_PROCESS_ATTACH: break;
4515 case DLL_THREAD_ATTACH: break;
4516 case DLL_THREAD_DETACH:
4517 for (i=0; i<mdb_tls_nkeys; i++) {
4518 MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]);
4519 if (r) {
4520 mdb_env_reader_dest(r);
4521 }
4522 }
4523 break;
4524 case DLL_PROCESS_DETACH: break;
4525 }
4526}
/* Define mdb_tls_cbp, a pointer to mdb_tls_callback placed in the
 * ".CRT$XLB" section. GCC uses a section attribute; other compilers use
 * segment pragmas plus linker directives that keep the symbols.
 * NOTE(review): presumably this section is how the Windows loader finds
 * TLS callbacks - confirm against the PE/COFF TLS documentation.
 */
#ifdef __GNUC__
#ifdef _WIN64
const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
#else
PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
#endif
#else
#ifdef _WIN64
/* Force some symbol references.
 *	_tls_used forces the linker to create the TLS directory if not already done
 *	mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol.
 */
#pragma comment(linker, "/INCLUDE:_tls_used")
#pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
#pragma const_seg(".CRT$XLB")
extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma const_seg()
#else	/* _WIN32 */
/* Same as above; the symbol names here carry an extra leading underscore */
#pragma comment(linker, "/INCLUDE:__tls_used")
#pragma comment(linker, "/INCLUDE:_mdb_tls_cbp")
#pragma data_seg(".CRT$XLB")
PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma data_seg()
#endif	/* WIN 32/64 */
#endif	/* !__GNUC__ */
4553#endif
4554
4555/** Downgrade the exclusive lock on the region back to shared */
4556static int ESECT
4557mdb_env_share_locks(MDB_env *env, int *excl)
4558{
4559 int rc = 0;
4560 MDB_meta *meta = mdb_env_pick_meta(env);
4561
4562 env->me_txns->mti_txnid = meta->mm_txnid;
4563
4564#ifdef _WIN32
4565 {
4566 OVERLAPPED ov;
4567 /* First acquire a shared lock. The Unlock will
4568 * then release the existing exclusive lock.
4569 */
4570 memset(&ov, 0, sizeof(ov));
4571 if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
4572 rc = ErrCode();
4573 } else {
4574 UnlockFile(env->me_lfd, 0, 0, 1, 0);
4575 *excl = 0;
4576 }
4577 }
4578#else
4579 {
4580 struct flock lock_info;
4581 /* The shared lock replaces the existing lock */
4582 memset((void *)&lock_info, 0, sizeof(lock_info));
4583 lock_info.l_type = F_RDLCK;
4584 lock_info.l_whence = SEEK_SET;
4585 lock_info.l_start = 0;
4586 lock_info.l_len = 1;
4587 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
4588 (rc = ErrCode()) == EINTR) ;
4589 *excl = rc ? -1 : 0; /* error may mean we lost the lock */
4590 }
4591#endif
4592
4593 return rc;
4594}
4595
4596/** Try to get exclusive lock, otherwise shared.
4597 * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive.
4598 */
4599static int ESECT
4600mdb_env_excl_lock(MDB_env *env, int *excl)
4601{
4602 int rc = 0;
4603#ifdef _WIN32
4604 if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
4605 *excl = 1;
4606 } else {
4607 OVERLAPPED ov;
4608 memset(&ov, 0, sizeof(ov));
4609 if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
4610 *excl = 0;
4611 } else {
4612 rc = ErrCode();
4613 }
4614 }
4615#else
4616 struct flock lock_info;
4617 memset((void *)&lock_info, 0, sizeof(lock_info));
4618 lock_info.l_type = F_WRLCK;
4619 lock_info.l_whence = SEEK_SET;
4620 lock_info.l_start = 0;
4621 lock_info.l_len = 1;
4622 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
4623 (rc = ErrCode()) == EINTR) ;
4624 if (!rc) {
4625 *excl = 1;
4626 } else
4627# ifndef MDB_USE_POSIX_MUTEX
4628 if (*excl < 0) /* always true when MDB_USE_POSIX_MUTEX */
4629# endif
4630 {
4631 lock_info.l_type = F_RDLCK;
4632 while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) &&
4633 (rc = ErrCode()) == EINTR) ;
4634 if (rc == 0)
4635 *excl = 0;
4636 }
4637#endif
4638 return rc;
4639}
4640
4641#ifdef MDB_USE_HASH
4642/*
4643 * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
4644 *
4645 * @(#) $Revision: 5.1 $
4646 * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $
4647 * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $
4648 *
4649 * http://www.isthe.com/chongo/tech/comp/fnv/index.html
4650 *
4651 ***
4652 *
4653 * Please do not copyright this code. This code is in the public domain.
4654 *
4655 * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
4656 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
4657 * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
4658 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
4659 * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
4660 * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
4661 * PERFORMANCE OF THIS SOFTWARE.
4662 *
4663 * By:
4664 * chongo <Landon Curt Noll> /\oo/\
4665 * http://www.isthe.com/chongo/
4666 *
4667 * Share and Enjoy! :-)
4668 */
4669
4670typedef unsigned long long mdb_hash_t;
4671#define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL)
4672
4673/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
4674 * @param[in] val value to hash
4675 * @param[in] hval initial value for hash
4676 * @return 64 bit hash
4677 *
4678 * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the
4679 * hval arg on the first call.
4680 */
4681static mdb_hash_t
4682mdb_hash_val(MDB_val *val, mdb_hash_t hval)
4683{
4684 unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */
4685 unsigned char *end = s + val->mv_size;
4686 /*
4687 * FNV-1a hash each octet of the string
4688 */
4689 while (s < end) {
4690 /* xor the bottom with the current octet */
4691 hval ^= (mdb_hash_t)*s++;
4692
4693 /* multiply by the 64 bit FNV magic prime mod 2^64 */
4694 hval += (hval << 1) + (hval << 4) + (hval << 5) +
4695 (hval << 7) + (hval << 8) + (hval << 40);
4696 }
4697 /* return our new hash value */
4698 return hval;
4699}
4700
4701/** Hash the string and output the encoded hash.
4702 * This uses modified RFC1924 Ascii85 encoding to accommodate systems with
4703 * very short name limits. We don't care about the encoding being reversible,
4704 * we just want to preserve as many bits of the input as possible in a
4705 * small printable string.
4706 * @param[in] str string to hash
4707 * @param[out] encbuf an array of 11 chars to hold the hash
4708 */
4709static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~";
4710
4711static void ESECT
4712mdb_pack85(unsigned long l, char *out)
4713{
4714 int i;
4715
4716 for (i=0; i<5; i++) {
4717 *out++ = mdb_a85[l % 85];
4718 l /= 85;
4719 }
4720}
4721
4722static void ESECT
4723mdb_hash_enc(MDB_val *val, char *encbuf)
4724{
4725 mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT);
4726
4727 mdb_pack85(h, encbuf);
4728 mdb_pack85(h>>32, encbuf+5);
4729 encbuf[10] = '\0';
4730}
4731#endif
4732
4733/** Open and/or initialize the lock region for the environment.
4734 * @param[in] env The LMDB environment.
4735 * @param[in] fname Filename + scratch area, from #mdb_fname_init().
4736 * @param[in] mode The Unix permissions for the file, if we create it.
4737 * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive
4738 * @return 0 on success, non-zero on failure.
4739 */
4740static int ESECT
4741mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl)
4742{
4743#ifdef _WIN32
4744# define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT
4745#else
4746# define MDB_ERRCODE_ROFS EROFS
4747#endif
4748 int rc;
4749 off_t size, rsize;
4750
4751 rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd);
4752 if (rc) {
4753 /* Omit lockfile if read-only env on read-only filesystem */
4754 if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) {
4755 return MDB_SUCCESS;
4756 }
4757 goto fail;
4758 }
4759
4760 if (!(env->me_flags & MDB_NOTLS)) {
4761 rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
4762 if (rc)
4763 goto fail;
4764 env->me_flags |= MDB_ENV_TXKEY;
4765#ifdef _WIN32
4766 /* Windows TLS callbacks need help finding their TLS info. */
4767 if (mdb_tls_nkeys >= MAX_TLS_KEYS) {
4768 rc = MDB_TLS_FULL;
4769 goto fail;
4770 }
4771 mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey;
4772#endif
4773 }
4774
4775 /* Try to get exclusive lock. If we succeed, then
4776 * nobody is using the lock region and we should initialize it.
4777 */
4778 if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
4779
4780#ifdef _WIN32
4781 size = GetFileSize(env->me_lfd, NULL);
4782#else
4783 size = lseek(env->me_lfd, 0, SEEK_END);
4784 if (size == -1) goto fail_errno;
4785#endif
4786 rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
4787 if (size < rsize && *excl > 0) {
4788#ifdef _WIN32
4789 if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
4790 || !SetEndOfFile(env->me_lfd))
4791 goto fail_errno;
4792#else
4793 if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno;
4794#endif
4795 } else {
4796 rsize = size;
4797 size = rsize - sizeof(MDB_txninfo);
4798 env->me_maxreaders = size/sizeof(MDB_reader) + 1;
4799 }
4800 {
4801#ifdef _WIN32
4802 HANDLE mh;
4803 mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
4804 0, 0, NULL);
4805 if (!mh) goto fail_errno;
4806 env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
4807 CloseHandle(mh);
4808 if (!env->me_txns) goto fail_errno;
4809#else
4810 void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
4811 env->me_lfd, 0);
4812 if (m == MAP_FAILED) goto fail_errno;
4813 env->me_txns = m;
4814#endif
4815 }
4816 if (*excl > 0) {
4817#ifdef _WIN32
4818 BY_HANDLE_FILE_INFORMATION stbuf;
4819 struct {
4820 DWORD volume;
4821 DWORD nhigh;
4822 DWORD nlow;
4823 } idbuf;
4824 MDB_val val;
4825 char encbuf[11];
4826
4827 if (!mdb_sec_inited) {
4828 InitializeSecurityDescriptor(&mdb_null_sd,
4829 SECURITY_DESCRIPTOR_REVISION);
4830 SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
4831 mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
4832 mdb_all_sa.bInheritHandle = FALSE;
4833 mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
4834 mdb_sec_inited = 1;
4835 }
4836 if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno;
4837 idbuf.volume = stbuf.dwVolumeSerialNumber;
4838 idbuf.nhigh = stbuf.nFileIndexHigh;
4839 idbuf.nlow = stbuf.nFileIndexLow;
4840 val.mv_data = &idbuf;
4841 val.mv_size = sizeof(idbuf);
4842 mdb_hash_enc(&val, encbuf);
4843 sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", encbuf);
4844 sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", encbuf);
4845 env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
4846 if (!env->me_rmutex) goto fail_errno;
4847 env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
4848 if (!env->me_wmutex) goto fail_errno;
4849#elif defined(MDB_USE_POSIX_SEM)
4850 struct stat stbuf;
4851 struct {
4852 dev_t dev;
4853 ino_t ino;
4854 } idbuf;
4855 MDB_val val;
4856 char encbuf[11];
4857
4858#if defined(__NetBSD__)
4859#define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */
4860#endif
4861 if (fstat(env->me_lfd, &stbuf)) goto fail_errno;
4862 idbuf.dev = stbuf.st_dev;
4863 idbuf.ino = stbuf.st_ino;
4864 val.mv_data = &idbuf;
4865 val.mv_size = sizeof(idbuf);
4866 mdb_hash_enc(&val, encbuf);
4867#ifdef MDB_SHORT_SEMNAMES
4868 encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */
4869#endif
4870 sprintf(env->me_txns->mti_rmname, "/MDBr%s", encbuf);
4871 sprintf(env->me_txns->mti_wmname, "/MDBw%s", encbuf);
4872 /* Clean up after a previous run, if needed: Try to
4873 * remove both semaphores before doing anything else.
4874 */
4875 sem_unlink(env->me_txns->mti_rmname);
4876 sem_unlink(env->me_txns->mti_wmname);
4877 env->me_rmutex = sem_open(env->me_txns->mti_rmname,
4878 O_CREAT|O_EXCL, mode, 1);
4879 if (env->me_rmutex == SEM_FAILED) goto fail_errno;
4880 env->me_wmutex = sem_open(env->me_txns->mti_wmname,
4881 O_CREAT|O_EXCL, mode, 1);
4882 if (env->me_wmutex == SEM_FAILED) goto fail_errno;
4883#else /* MDB_USE_POSIX_MUTEX: */
4884 pthread_mutexattr_t mattr;
4885
4886 /* Solaris needs this before initing a robust mutex. Otherwise
4887 * it may skip the init and return EBUSY "seems someone already
4888 * inited" or EINVAL "it was inited differently".
4889 */
4890 memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex));
4891 memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex));
4892
4893 if ((rc = pthread_mutexattr_init(&mattr)))
4894 goto fail;
4895
4896 rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
4897#ifdef MDB_ROBUST_SUPPORTED
4898 if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST);
4899#endif
4900 if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr);
4901 if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr);
4902 pthread_mutexattr_destroy(&mattr);
4903 if (rc)
4904 goto fail;
4905#endif /* _WIN32 || MDB_USE_POSIX_SEM */
4906
4907 env->me_txns->mti_magic = MDB_MAGIC;
4908 env->me_txns->mti_format = MDB_LOCK_FORMAT;
4909 env->me_txns->mti_txnid = 0;
4910 env->me_txns->mti_numreaders = 0;
4911
4912 } else {
4913 if (env->me_txns->mti_magic != MDB_MAGIC) {
4914 DPUTS("lock region has invalid magic");
4915 rc = MDB_INVALID;
4916 goto fail;
4917 }
4918 if (env->me_txns->mti_format != MDB_LOCK_FORMAT) {
4919 DPRINTF(("lock region has format+version 0x%x, expected 0x%x",
4920 env->me_txns->mti_format, MDB_LOCK_FORMAT));
4921 rc = MDB_VERSION_MISMATCH;
4922 goto fail;
4923 }
4924 rc = ErrCode();
4925 if (rc && rc != EACCES && rc != EAGAIN) {
4926 goto fail;
4927 }
4928#ifdef _WIN32
4929 env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
4930 if (!env->me_rmutex) goto fail_errno;
4931 env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
4932 if (!env->me_wmutex) goto fail_errno;
4933#elif defined(MDB_USE_POSIX_SEM)
4934 env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0);
4935 if (env->me_rmutex == SEM_FAILED) goto fail_errno;
4936 env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0);
4937 if (env->me_wmutex == SEM_FAILED) goto fail_errno;
4938#endif
4939 }
4940 return MDB_SUCCESS;
4941
4942fail_errno:
4943 rc = ErrCode();
4944fail:
4945 return rc;
4946}
4947
4948 /** Only a subset of the @ref mdb_env flags can be changed
4949 * at runtime. Changing other flags requires closing the
4950 * environment and re-opening it with the new flags.
4951 */
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
/* Environment flags that mdb_env_open() accepts in addition to CHANGEABLE;
 * any flag bit outside CHANGEABLE|CHANGELESS makes mdb_env_open() return
 * EINVAL.
 */
#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \
	MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)

/* Compile-time guard: fail the build if a persistent DB flag shares a bit
 * with one of the environment flags above.
 */
#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS)
# error "Persistent DB flags & env flags overlap, but both go in mm_flags"
#endif
4959
/* Open the environment located at path.
 *
 * Allocates the per-environment tables, sets up the lock region, opens
 * the data file, finishes opening with mdb_env_open2() and, for
 * environments not opened MDB_RDONLY, preallocates the write transaction
 * object (me_txn0) and a page-sized scratch buffer (me_pbuf).
 *
 * Returns EINVAL if the environment's data file handle is already valid
 * or if flags has bits outside CHANGEABLE|CHANGELESS, ENOMEM if an
 * allocation fails, otherwise the code of the first helper that failed,
 * or 0 on success.  Every failure after the flags have been stored goes
 * through "leave", which releases everything via mdb_env_close0().
 */
int ESECT
mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
{
	/* excl is handed to mdb_env_setup_locks() and later to
	 * mdb_env_share_locks()/mdb_env_close0(); it starts at -1.
	 */
	int rc, excl = -1;
	MDB_name fname;

	if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS)))
		return EINVAL;

	/* Flags already stored in the environment stay in effect. */
	flags |= env->me_flags;

	rc = mdb_fname_init(path, flags, &fname);
	if (rc)
		return rc;

	if (flags & MDB_RDONLY) {
		/* silently ignore WRITEMAP when we're only getting read access */
		flags &= ~MDB_WRITEMAP;
	} else {
		/* Writers need the free-page list and the dirty-page list. */
		if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) &&
			  (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2)))))
			rc = ENOMEM;
	}
	/* Store the flags with MDB_ENV_ACTIVE before looking at rc:
	 * mdb_env_close0() does nothing unless MDB_ENV_ACTIVE is set, so the
	 * flag must be in place for the cleanup at "leave" to run.
	 */
	env->me_flags = flags |= MDB_ENV_ACTIVE;
	if (rc)
		goto leave;

	/* Allocate all per-DBI tables first, then check them together. */
	env->me_path = strdup(path);
	env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
	env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
	env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int));
	if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) {
		rc = ENOMEM;
		goto leave;
	}
	env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_long; /* aligned MDB_INTEGERKEY */

	/* For RDONLY, get lockfile after we know datafile exists */
	if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) {
		rc = mdb_env_setup_locks(env, &fname, mode, &excl);
		if (rc)
			goto leave;
	}

	rc = mdb_fopen(env, &fname,
		(flags & MDB_RDONLY) ? MDB_O_RDONLY : MDB_O_RDWR,
		mode, &env->me_fd);
	if (rc)
		goto leave;

	/* Read-only with locking enabled: set up locks now that the data
	 * file has been opened successfully.
	 */
	if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) {
		rc = mdb_env_setup_locks(env, &fname, mode, &excl);
		if (rc)
			goto leave;
	}

	if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) {
		if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) {
			/* Synchronous fd for meta writes. Needed even with
			 * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset.
			 */
			rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd);
			if (rc)
				goto leave;
		}
		DPRINTF(("opened dbenv %p", (void *) env));
		/* excl > 0: hand the lock state to mdb_env_share_locks(). */
		if (excl > 0) {
			rc = mdb_env_share_locks(env, &excl);
			if (rc)
				goto leave;
		}
		if (!(flags & MDB_RDONLY)) {
			MDB_txn *txn;
			/* One zeroed allocation holds the MDB_txn followed by four
			 * arrays of me_maxdbs entries each: MDB_db, MDB_cursor *,
			 * unsigned int, and one flag byte.
			 */
			int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs *
				(sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1);
			if ((env->me_pbuf = calloc(1, env->me_psize)) &&
				(txn = calloc(1, size)))
			{
				/* Carve the trailing arrays out of the allocation. */
				txn->mt_dbs = (MDB_db *)((char *)txn + tsize);
				txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
				txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs);
				txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
				txn->mt_env = env;
				txn->mt_dbxs = env->me_dbxs;
				txn->mt_flags = MDB_TXN_FINISHED;
				env->me_txn0 = txn;
			} else {
				rc = ENOMEM;
			}
		}
	}

leave:
	/* On failure release everything acquired so far. */
	if (rc) {
		mdb_env_close0(env, excl);
	}
	mdb_fname_destroy(fname);
	return rc;
}
5059
/** Destroy resources from mdb_env_open(), clear our readers & DBIs.
 *
 * Does nothing unless #MDB_ENV_ACTIVE is set in the environment flags.
 * On return both #MDB_ENV_ACTIVE and #MDB_ENV_TXKEY have been cleared.
 * @param[in] env the environment to tear down.
 * @param[in] excl lock state from the caller: when it is 0 the semaphore
 * and robust-mutex paths try mdb_env_excl_lock() themselves; when it ends
 * up > 0 they remove the semaphores / destroy the mutexes.
 */
static void ESECT
mdb_env_close0(MDB_env *env, int excl)
{
	int i;

	if (!(env->me_flags & MDB_ENV_ACTIVE))
		return;

	/* Doing this here since me_dbxs may not exist during mdb_env_close */
	if (env->me_dbxs) {
		/* Free the names of every DBI slot from CORE_DBS upward. */
		for (i = env->me_maxdbs; --i >= CORE_DBS; )
			free(env->me_dbxs[i].md_name.mv_data);
		free(env->me_dbxs);
	}

	free(env->me_pbuf);
	free(env->me_dbiseqs);
	free(env->me_dbflags);
	free(env->me_path);
	free(env->me_dirty_list);
	free(env->me_txn0);
	mdb_midl_free(env->me_free_pgs);

	if (env->me_flags & MDB_ENV_TXKEY) {
		pthread_key_delete(env->me_txkey);
#ifdef _WIN32
		/* Delete our key from the global list */
		for (i=0; i<mdb_tls_nkeys; i++)
			if (mdb_tls_keys[i] == env->me_txkey) {
				/* Fill the hole with the last entry and shrink. */
				mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1];
				mdb_tls_nkeys--;
				break;
			}
#endif
	}

	if (env->me_map) {
		munmap(env->me_map, env->me_mapsize);
	}
	if (env->me_mfd != INVALID_HANDLE_VALUE)
		(void) close(env->me_mfd);
	if (env->me_fd != INVALID_HANDLE_VALUE)
		(void) close(env->me_fd);
	if (env->me_txns) {
		MDB_PID_T pid = getpid();
		/* Clearing readers is done in this function because
		 * me_txkey with its destructor must be disabled first.
		 *
		 * We skip the reader mutex, so we touch only
		 * data owned by this process (me_close_readers and
		 * our readers), and clear each reader atomically.
		 */
		for (i = env->me_close_readers; --i >= 0; )
			if (env->me_txns->mti_readers[i].mr_pid == pid)
				env->me_txns->mti_readers[i].mr_pid = 0;
#ifdef _WIN32
		if (env->me_rmutex) {
			CloseHandle(env->me_rmutex);
			if (env->me_wmutex) CloseHandle(env->me_wmutex);
		}
		/* Windows automatically destroys the mutexes when
		 * the last handle closes.
		 */
#elif defined(MDB_USE_POSIX_SEM)
		if (env->me_rmutex != SEM_FAILED) {
			sem_close(env->me_rmutex);
			if (env->me_wmutex != SEM_FAILED)
				sem_close(env->me_wmutex);
			/* If we have the filelock:  If we are the
			 * only remaining user, clean up semaphores.
			 */
			if (excl == 0)
				mdb_env_excl_lock(env, &excl);
			if (excl > 0) {
				sem_unlink(env->me_txns->mti_rmname);
				sem_unlink(env->me_txns->mti_wmname);
			}
		}
#elif defined(MDB_ROBUST_SUPPORTED)
		/* If we have the filelock:  If we are the
		 * only remaining user, clean up robust
		 * mutexes.
		 */
		if (excl == 0)
			mdb_env_excl_lock(env, &excl);
		if (excl > 0) {
			pthread_mutex_destroy(env->me_txns->mti_rmutex);
			pthread_mutex_destroy(env->me_txns->mti_wmutex);
		}
#endif
		/* Unmap the lock region; the length is recomputed from
		 * me_maxreaders.
		 */
		munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
	}
	if (env->me_lfd != INVALID_HANDLE_VALUE) {
#ifdef _WIN32
		if (excl >= 0) {
			/* Unlock the lockfile.  Windows would have unlocked it
			 * after closing anyway, but not necessarily at once.
			 */
			UnlockFile(env->me_lfd, 0, 0, 1, 0);
		}
#endif
		(void) close(env->me_lfd);
	}

	/* Mark the environment inactive so a second call is a no-op. */
	env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY);
}
5167
5168void ESECT
5169mdb_env_close(MDB_env *env)
5170{
5171 MDB_page *dp;
5172
5173 if (env == NULL)
5174 return;
5175
5176 VGMEMP_DESTROY(env);
5177 while ((dp = env->me_dpages) != NULL) {
5178 VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
5179 env->me_dpages = dp->mp_next;
5180 free(dp);
5181 }
5182
5183 mdb_env_close0(env, 0);
5184 free(env);
5185}
5186
5187/** Compare two items pointing at aligned size_t's */
5188static int
5189mdb_cmp_long(const MDB_val *a, const MDB_val *b)
5190{
5191 return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 :
5192 *(size_t *)a->mv_data > *(size_t *)b->mv_data;
5193}
5194
5195/** Compare two items pointing at aligned unsigned int's.
5196 *
5197 * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp,
5198 * but #mdb_cmp_clong() is called instead if the data type is size_t.
5199 */
5200static int
5201mdb_cmp_int(const MDB_val *a, const MDB_val *b)
5202{
5203 return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
5204 *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
5205}
5206
5207/** Compare two items pointing at unsigned ints of unknown alignment.
5208 * Nodes and keys are guaranteed to be 2-byte aligned.
5209 */
5210static int
5211mdb_cmp_cint(const MDB_val *a, const MDB_val *b)
5212{
5213#if BYTE_ORDER == LITTLE_ENDIAN
5214 unsigned short *u, *c;
5215 int x;
5216
5217 u = (unsigned short *) ((char *) a->mv_data + a->mv_size);
5218 c = (unsigned short *) ((char *) b->mv_data + a->mv_size);
5219 do {
5220 x = *--u - *--c;
5221 } while(!x && u > (unsigned short *)a->mv_data);
5222 return x;
5223#else
5224 unsigned short *u, *c, *end;
5225 int x;
5226
5227 end = (unsigned short *) ((char *) a->mv_data + a->mv_size);
5228 u = (unsigned short *)a->mv_data;
5229 c = (unsigned short *)b->mv_data;
5230 do {
5231 x = *u++ - *c++;
5232 } while(!x && u < end);
5233 return x;
5234#endif
5235}
5236
5237/** Compare two items lexically */
5238static int
5239mdb_cmp_memn(const MDB_val *a, const MDB_val *b)
5240{
5241 int diff;
5242 ssize_t len_diff;
5243 unsigned int len;
5244
5245 len = a->mv_size;
5246 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
5247 if (len_diff > 0) {
5248 len = b->mv_size;
5249 len_diff = 1;
5250 }
5251
5252 diff = memcmp(a->mv_data, b->mv_data, len);
5253 return diff ? diff : len_diff<0 ? -1 : len_diff;
5254}
5255
5256/** Compare two items in reverse byte order */
5257static int
5258mdb_cmp_memnr(const MDB_val *a, const MDB_val *b)
5259{
5260 const unsigned char *p1, *p2, *p1_lim;
5261 ssize_t len_diff;
5262 int diff;
5263
5264 p1_lim = (const unsigned char *)a->mv_data;
5265 p1 = (const unsigned char *)a->mv_data + a->mv_size;
5266 p2 = (const unsigned char *)b->mv_data + b->mv_size;
5267
5268 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
5269 if (len_diff > 0) {
5270 p1_lim += len_diff;
5271 len_diff = 1;
5272 }
5273
5274 while (p1 > p1_lim) {
5275 diff = *--p1 - *--p2;
5276 if (diff)
5277 return diff;
5278 }
5279 return len_diff<0 ? -1 : len_diff;
5280}
5281
/** Search for key within a page, using binary search.
 * Returns the smallest entry larger or equal to the key.
 * If exactp is non-null, stores whether the found entry was an exact match
 * in *exactp (1 or 0).
 * Updates the cursor index with the index of the found entry.
 * If no entry larger or equal to the key is found, returns NULL.
 *
 * The page searched is the one on top of the cursor's stack. For
 * #IS_LEAF2 pages the returned pointer is only a non-NULL placeholder
 * (node 0); the real result is the index stored in the cursor.
 */
static MDB_node *
mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp)
{
	unsigned int i = 0, nkeys;
	int low, high;
	int rc = 0;
	MDB_page *mp = mc->mc_pg[mc->mc_top];
	MDB_node *node = NULL;
	MDB_val nodekey;
	MDB_cmp_func *cmp;
	DKBUF;

	nkeys = NUMKEYS(mp);

	DPRINTF(("searching %u keys in %s %spage %"Z"u",
	    nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
	    mdb_dbg_pgno(mp)));

	/* Non-leaf pages start the search at index 1, so index 0 is never
	 * compared - NOTE(review): presumably because the first branch key
	 * is implicit; confirm against the node layout.
	 */
	low = IS_LEAF(mp) ? 0 : 1;
	high = nkeys - 1;
	cmp = mc->mc_dbx->md_cmp;

	/* Branch pages have no data, so if using integer keys,
	 * alignment is guaranteed. Use faster mdb_cmp_int.
	 */
	if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) {
		if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t))
			cmp = mdb_cmp_long;
		else
			cmp = mdb_cmp_int;
	}

	if (IS_LEAF2(mp)) {
		/* LEAF2 pages: every key has the same size (md_pad) and is
		 * located with LEAF2KEY() rather than through a node.
		 */
		nodekey.mv_size = mc->mc_db->md_pad;
		node = NODEPTR(mp, 0);	/* fake */
		while (low <= high) {
			i = (low + high) >> 1;
			nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size);
			rc = cmp(key, &nodekey);
			DPRINTF(("found leaf index %u [%s], rc = %i",
			    i, DKEY(&nodekey), rc));
			if (rc == 0)
				break;
			if (rc > 0)
				low = i + 1;
			else
				high = i - 1;
		}
	} else {
		while (low <= high) {
			i = (low + high) >> 1;

			node = NODEPTR(mp, i);
			nodekey.mv_size = NODEKSZ(node);
			nodekey.mv_data = NODEKEY(node);

			rc = cmp(key, &nodekey);
#if MDB_DEBUG
			if (IS_LEAF(mp))
				DPRINTF(("found leaf index %u [%s], rc = %i",
				    i, DKEY(&nodekey), rc));
			else
				DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i",
				    i, DKEY(&nodekey), NODEPGNO(node), rc));
#endif
			if (rc == 0)
				break;
			if (rc > 0)
				low = i + 1;
			else
				high = i - 1;
		}
	}

	if (rc > 0) {	/* Found entry is less than the key. */
		i++;	/* Skip to get the smallest entry larger than key. */
		if (!IS_LEAF2(mp))
			node = NODEPTR(mp, i);
	}
	/* rc is still 0 if the loop never ran, hence the nkeys check. */
	if (exactp)
		*exactp = (rc == 0 && nkeys > 0);
	/* store the key index */
	mc->mc_ki[mc->mc_top] = i;
	if (i >= nkeys)
		/* There is no entry larger or equal to the key. */
		return NULL;

	/* nodeptr is fake for LEAF2 */
	return node;
}
5379
/* Compiled out by the "#if 0" below; kept as a sketch only (the untyped
 * "func" parameter is not valid C as written).  It walks the transaction's
 * cursor list for mc's DBI and calls func(mc, m2) for every cursor whose
 * top-of-stack page is the same as mc's.
 */
#if 0
static void
mdb_cursor_adjust(MDB_cursor *mc, func)
{
	MDB_cursor *m2;

	for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
		if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) {
			func(mc, m2);
		}
	}
}
#endif
5393
5394/** Pop a page off the top of the cursor's stack. */
5395static void
5396mdb_cursor_pop(MDB_cursor *mc)
5397{
5398 if (mc->mc_snum) {
5399 DPRINTF(("popping page %"Z"u off db %d cursor %p",
5400 mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc));
5401
5402 mc->mc_snum--;
5403 if (mc->mc_snum) {
5404 mc->mc_top--;
5405 } else {
5406 mc->mc_flags &= ~C_INITIALIZED;
5407 }
5408 }
5409}
5410
5411/** Push a page onto the top of the cursor's stack.
5412 * Set #MDB_TXN_ERROR on failure.
5413 */
5414static int
5415mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
5416{
5417 DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno,
5418 DDBI(mc), (void *) mc));
5419
5420 if (mc->mc_snum >= CURSOR_STACK) {
5421 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
5422 return MDB_CURSOR_FULL;
5423 }
5424
5425 mc->mc_top = mc->mc_snum++;
5426 mc->mc_pg[mc->mc_top] = mp;
5427 mc->mc_ki[mc->mc_top] = 0;
5428
5429 return MDB_SUCCESS;
5430}
5431
5432/** Find the address of the page corresponding to a given page number.
5433 * Set #MDB_TXN_ERROR on failure.
5434 * @param[in] mc the cursor accessing the page.
5435 * @param[in] pgno the page number for the page to retrieve.
5436 * @param[out] ret address of a pointer where the page's address will be stored.
5437 * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page.
5438 * @return 0 on success, non-zero on failure.
5439 */
5440static int
5441mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl)
5442{
5443 MDB_txn *txn = mc->mc_txn;
5444 MDB_env *env = txn->mt_env;
5445 MDB_page *p = NULL;
5446 int level;
5447
5448 if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) {
5449 MDB_txn *tx2 = txn;
5450 level = 1;
5451 do {
5452 MDB_ID2L dl = tx2->mt_u.dirty_list;
5453 unsigned x;
5454 /* Spilled pages were dirtied in this txn and flushed
5455 * because the dirty list got full. Bring this page
5456 * back in from the map (but don't unspill it here,
5457 * leave that unless page_touch happens again).
5458 */
5459 if (tx2->mt_spill_pgs) {
5460 MDB_ID pn = pgno << 1;
5461 x = mdb_midl_search(tx2->mt_spill_pgs, pn);
5462 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
5463 p = (MDB_page *)(env->me_map + env->me_psize * pgno);
5464 goto done;
5465 }
5466 }
5467 if (dl[0].mid) {
5468 unsigned x = mdb_mid2l_search(dl, pgno);
5469 if (x <= dl[0].mid && dl[x].mid == pgno) {
5470 p = dl[x].mptr;
5471 goto done;
5472 }
5473 }
5474 level++;
5475 } while ((tx2 = tx2->mt_parent) != NULL);
5476 }
5477
5478 if (pgno < txn->mt_next_pgno) {
5479 level = 0;
5480 p = (MDB_page *)(env->me_map + env->me_psize * pgno);
5481 } else {
5482 DPRINTF(("page %"Z"u not found", pgno));
5483 txn->mt_flags |= MDB_TXN_ERROR;
5484 return MDB_PAGE_NOTFOUND;
5485 }
5486
5487done:
5488 *ret = p;
5489 if (lvl)
5490 *lvl = level;
5491 return MDB_SUCCESS;
5492}
5493
/** Finish #mdb_page_search() / #mdb_page_search_lowest().
 * The cursor is at the root page, set up the rest of it.
 *
 * Descends from the page on top of the cursor stack while that page is a
 * branch page, pushing each child onto the stack.
 * @param[in,out] mc the cursor to position.
 * @param[in] key the key to follow; not consulted when #MDB_PS_FIRST or
 * #MDB_PS_LAST is set.
 * @param[in] flags #MDB_PS_FIRST / #MDB_PS_LAST select the first / last
 * child at every level; #MDB_PS_MODIFY touches every page pushed.
 * @return #MDB_SUCCESS with #C_INITIALIZED set and #C_EOF cleared,
 * #MDB_CORRUPTED (and #MDB_TXN_ERROR set) if the descent ends on a
 * non-leaf page, or the error from a helper.
 */
static int
mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags)
{
	MDB_page *mp = mc->mc_pg[mc->mc_top];
	int rc;
	DKBUF;

	while (IS_BRANCH(mp)) {
		MDB_node *node;
		indx_t i;

		DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp)));
		/* Don't assert on branch pages in the FreeDB. We can get here
		 * while in the process of rebalancing a FreeDB branch page; we must
		 * let that proceed. ITS#8336
		 */
		mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1);
		DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0))));

		if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) {
			i = 0;
			if (flags & MDB_PS_LAST) {
				i = NUMKEYS(mp) - 1;
				/* if already init'd, see if we're already in right place */
				if (mc->mc_flags & C_INITIALIZED) {
					if (mc->mc_ki[mc->mc_top] == i) {
						/* Reuse the page pointer already stored in the
						 * next stack slot instead of fetching it again.
						 */
						mc->mc_top = mc->mc_snum++;
						mp = mc->mc_pg[mc->mc_top];
						goto ready;
					}
				}
			}
		} else {
			int exact;
			node = mdb_node_search(mc, key, &exact);
			if (node == NULL)
				/* No entry >= key: follow the last child. */
				i = NUMKEYS(mp) - 1;
			else {
				i = mc->mc_ki[mc->mc_top];
				if (!exact) {
					/* The search returned the first entry greater
					 * than key; follow the entry before it.
					 */
					mdb_cassert(mc, i > 0);
					i--;
				}
			}
			DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
		}

		mdb_cassert(mc, i < NUMKEYS(mp));
		node = NODEPTR(mp, i);

		if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)
			return rc;

		/* Record which child was taken, then push the child. */
		mc->mc_ki[mc->mc_top] = i;
		if ((rc = mdb_cursor_push(mc, mp)))
			return rc;

ready:
		if (flags & MDB_PS_MODIFY) {
			if ((rc = mdb_page_touch(mc)) != 0)
				return rc;
			/* Re-read the top page after mdb_page_touch(). */
			mp = mc->mc_pg[mc->mc_top];
		}
	}

	if (!IS_LEAF(mp)) {
		DPRINTF(("internal error, index points to a %02X page!?",
		    mp->mp_flags));
		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
		return MDB_CORRUPTED;
	}

	DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno,
	    key ? DKEY(key) : "null"));
	mc->mc_flags |= C_INITIALIZED;
	mc->mc_flags &= ~C_EOF;

	return MDB_SUCCESS;
}
5576
5577/** Search for the lowest key under the current branch page.
5578 * This just bypasses a NUMKEYS check in the current page
5579 * before calling mdb_page_search_root(), because the callers
5580 * are all in situations where the current page is known to
5581 * be underfilled.
5582 */
5583static int
5584mdb_page_search_lowest(MDB_cursor *mc)
5585{
5586 MDB_page *mp = mc->mc_pg[mc->mc_top];
5587 MDB_node *node = NODEPTR(mp, 0);
5588 int rc;
5589
5590 if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)
5591 return rc;
5592
5593 mc->mc_ki[mc->mc_top] = 0;
5594 if ((rc = mdb_cursor_push(mc, mp)))
5595 return rc;
5596 return mdb_page_search_root(mc, NULL, MDB_PS_FIRST);
5597}
5598
/** Search for the page a given key should be in.
 * Push it and its parent pages on the cursor stack.
 * @param[in,out] mc the cursor for this operation.
 * @param[in] key the key to search for, or NULL for first/last page.
 * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB
 *   are touched (updated with new page numbers).
 *   If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf.
 *   This is used by #mdb_cursor_first() and #mdb_cursor_last().
 *   If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
 * @return 0 on success, non-zero on failure. Failures produced directly
 * here are #MDB_BAD_TXN (txn blocked), #MDB_BAD_DBI, #MDB_NOTFOUND (empty
 * tree, or the named DB record is missing) and #MDB_INCOMPATIBLE.
 */
static int
mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
{
	int rc;
	pgno_t root;

	/* Make sure the txn is still viable, then find the root from
	 * the txn's db table and set it as the root of the cursor's stack.
	 */
	if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) {
		DPUTS("transaction may not be used now");
		return MDB_BAD_TXN;
	} else {
		/* Make sure we're using an up-to-date root */
		if (*mc->mc_dbflag & DB_STALE) {
			/* Look the DB's record up by name in MAIN_DBI using a
			 * temporary cursor, and refresh mc->mc_db from it.
			 */
			MDB_cursor mc2;
			if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
				return MDB_BAD_DBI;
			mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
			rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
			if (rc)
				return rc;
			{
				MDB_val data;
				int exact = 0;
				/* NOTE: shadows the function's flags parameter
				 * inside this block only.
				 */
				uint16_t flags;
				MDB_node *leaf = mdb_node_search(&mc2,
					&mc->mc_dbx->md_name, &exact);
				if (!exact)
					return MDB_NOTFOUND;
				if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
					return MDB_INCOMPATIBLE; /* not a named DB */
				rc = mdb_node_read(&mc2, leaf, &data);
				if (rc)
					return rc;
				/* Copy md_flags out with memcpy rather than through
				 * an MDB_db pointer into the record.
				 */
				memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)),
					sizeof(uint16_t));
				/* The txn may not know this DBI, or another process may
				 * have dropped and recreated the DB with other flags.
				 */
				if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)
					return MDB_INCOMPATIBLE;
				memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
			}
			*mc->mc_dbflag &= ~DB_STALE;
		}
		root = mc->mc_db->md_root;

		if (root == P_INVALID) {		/* Tree is empty. */
			DPUTS("tree is empty");
			return MDB_NOTFOUND;
		}
	}

	mdb_cassert(mc, root > 1);
	/* Fetch the root page only if slot 0 does not already hold it. */
	if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root)
		if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)
			return rc;

	/* Reset the stack so it holds just the root. */
	mc->mc_snum = 1;
	mc->mc_top = 0;

	DPRINTF(("db %d root page %"Z"u has flags 0x%X",
		DDBI(mc), root, mc->mc_pg[0]->mp_flags));

	if (flags & MDB_PS_MODIFY) {
		if ((rc = mdb_page_touch(mc)))
			return rc;
	}

	if (flags & MDB_PS_ROOTONLY)
		return MDB_SUCCESS;

	return mdb_page_search_root(mc, key, flags);
}
5685
/* Release the run of overflow pages that starts at mp.
 *
 * The run covers mp->mp_pages pages beginning at mp->mp_pgno.  Depending
 * on the page's state it is either merged back into the environment's
 * me_pghead list or appended to the transaction's mt_free_pgs list; in
 * both cases the DB's md_overflow_pages counter is reduced by the run
 * length.
 * Returns 0 on success, the error from mdb_midl_need() or
 * mdb_midl_append_range(), or MDB_CORRUPTED (with MDB_TXN_ERROR set) if
 * a dirty page cannot be found in the dirty list.
 */
static int
mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp)
{
	MDB_txn *txn = mc->mc_txn;
	pgno_t pg = mp->mp_pgno;
	unsigned x = 0, ovpages = mp->mp_pages;
	MDB_env *env = txn->mt_env;
	MDB_IDL sl = txn->mt_spill_pgs;
	/* The spill list is searched with the page number shifted left by one. */
	MDB_ID pn = pg << 1;
	int rc;

	DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages));
	/* If the page is dirty or on the spill list we just acquired it,
	 * so we should give it back to our current free list, if any.
	 * Otherwise put it onto the list of pages we freed in this txn.
	 *
	 * Won't create me_pghead: me_pglast must be inited along with it.
	 * Unsupported in nested txns: They would need to hide the page
	 * range in ancestor txns' dirty and spilled lists.
	 */
	if (env->me_pghead &&
		!txn->mt_parent &&
		((mp->mp_flags & P_DIRTY) ||
		 (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn)))
	{
		unsigned i, j;
		pgno_t *mop;
		MDB_ID2 *dl, ix, iy;
		/* Make room in me_pghead before modifying any list. */
		rc = mdb_midl_need(&env->me_pghead, ovpages);
		if (rc)
			return rc;
		if (!(mp->mp_flags & P_DIRTY)) {
			/* This page is no longer spilled */
			if (x == sl[0])
				sl[0]--;
			else
				/* NOTE(review): setting the low bit presumably marks
				 * the entry as deleted - confirm against the spill
				 * list's users.
				 */
				sl[x] |= 1;
			goto release;
		}
		/* Remove from dirty list */
		dl = txn->mt_u.dirty_list;
		/* Walk down from the last entry, shifting each entry one slot
		 * lower, until the entry for mp is the one being carried.
		 */
		x = dl[0].mid--;
		for (ix = dl[x]; ix.mptr != mp; ix = iy) {
			if (x > 1) {
				x--;
				iy = dl[x];
				dl[x] = ix;
			} else {
				/* Reached the bottom without finding mp: put the
				 * carried entry back and fail the transaction.
				 */
				mdb_cassert(mc, x > 1);
				j = ++(dl[0].mid);
				dl[j] = ix;		/* Unsorted. OK when MDB_TXN_ERROR. */
				txn->mt_flags |= MDB_TXN_ERROR;
				return MDB_CORRUPTED;
			}
		}
		txn->mt_dirty_room++;
		if (!(env->me_flags & MDB_WRITEMAP))
			mdb_dpage_free(env, mp);
release:
		/* Insert in me_pghead */
		mop = env->me_pghead;
		/* Shift entries smaller than pg up by ovpages slots, working
		 * from the end, then fill the gap with pg, pg+1, ...
		 */
		j = mop[0] + ovpages;
		for (i = mop[0]; i && mop[i] < pg; i--)
			mop[j--] = mop[i];
		while (j>i)
			mop[j--] = pg++;
		mop[0] += ovpages;
	} else {
		rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages);
		if (rc)
			return rc;
	}
	mc->mc_db->md_overflow_pages -= ovpages;
	return 0;
}
5761
5762/** Return the data associated with a given node.
5763 * @param[in] mc The cursor for this operation.
5764 * @param[in] leaf The node being read.
5765 * @param[out] data Updated to point to the node's data.
5766 * @return 0 on success, non-zero on failure.
5767 */
5768static int
5769mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data)
5770{
5771 MDB_page *omp; /* overflow page */
5772 pgno_t pgno;
5773 int rc;
5774
5775 if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
5776 data->mv_size = NODEDSZ(leaf);
5777 data->mv_data = NODEDATA(leaf);
5778 return MDB_SUCCESS;
5779 }
5780
5781 /* Read overflow data.
5782 */
5783 data->mv_size = NODEDSZ(leaf);
5784 memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
5785 if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) {
5786 DPRINTF(("read overflow page %"Z"u failed", pgno));
5787 return rc;
5788 }
5789 data->mv_data = METADATA(omp);
5790
5791 return MDB_SUCCESS;
5792}
5793
5794int
5795mdb_get(MDB_txn *txn, MDB_dbi dbi,
5796 MDB_val *key, MDB_val *data)
5797{
5798 MDB_cursor mc;
5799 MDB_xcursor mx;
5800 int exact = 0;
5801 DKBUF;
5802
5803 DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key)));
5804
5805 if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
5806 return EINVAL;
5807
5808 if (txn->mt_flags & MDB_TXN_BLOCKED)
5809 return MDB_BAD_TXN;
5810
5811 mdb_cursor_init(&mc, txn, dbi, &mx);
5812 return mdb_cursor_set(&mc, key, data, MDB_SET, &exact);
5813}
5814
/** Find a sibling for a page.
 * Replaces the page at the top of the cursor's stack with the
 * specified sibling, if one exists.
 *
 * Works by popping to the parent, moving the parent's index one step in
 * the requested direction (recursing upward when the parent itself has
 * no further entry on that side), and pushing the child found there.
 * @param[in] mc The cursor for this operation.
 * @param[in] move_right Non-zero if the right sibling is requested,
 * otherwise the left sibling.
 * @return 0 on success, non-zero on failure. #MDB_NOTFOUND is returned
 * when the stack holds fewer than two pages.
 */
static int
mdb_cursor_sibling(MDB_cursor *mc, int move_right)
{
	int rc;
	MDB_node *indx;
	MDB_page *mp;

	if (mc->mc_snum < 2) {
		return MDB_NOTFOUND;		/* root has no siblings */
	}

	mdb_cursor_pop(mc);
	DPRINTF(("parent page is page %"Z"u, index %u",
		mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]));

	/* Is the parent already at its last (or first) entry? */
	if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]))
		       : (mc->mc_ki[mc->mc_top] == 0)) {
		DPRINTF(("no more keys left, moving to %s sibling",
		    move_right ? "right" : "left"));
		if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) {
			/* undo cursor_pop before returning */
			mc->mc_top++;
			mc->mc_snum++;
			return rc;
		}
	} else {
		if (move_right)
			mc->mc_ki[mc->mc_top]++;
		else
			mc->mc_ki[mc->mc_top]--;
		DPRINTF(("just moving to %s index key %u",
		    move_right ? "right" : "left", mc->mc_ki[mc->mc_top]));
	}
	mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top]));

	indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
	if ((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) {
		/* mc will be inconsistent if caller does mc_snum++ as above */
		mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
		return rc;
	}

	/* The return value is not checked - NOTE(review): presumably safe
	 * because the pop above released a stack slot; confirm.
	 */
	mdb_cursor_push(mc, mp);
	/* Pushing leaves the index at 0; a left move wants the last entry. */
	if (!move_right)
		mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1;

	return MDB_SUCCESS;
}
5871
/** Move the cursor to the next data item.
 *
 * An uninitialized cursor is positioned with mdb_cursor_first().  For
 * #MDB_DUPSORT databases the nested cursor is advanced first when the
 * current node has #F_DUPDATA and op is #MDB_NEXT or #MDB_NEXT_DUP; only
 * when op is #MDB_NEXT and the duplicates are exhausted does the main
 * cursor move on.
 * @param[in,out] mc the cursor to advance.
 * @param[out] key set to the key at the new position.
 * @param[out] data set to the data at the new position; may be NULL for
 * nodes without #F_DUPDATA.
 * @param[in] op the kind of step requested; #MDB_NEXT and #MDB_NEXT_DUP
 * are the values tested explicitly here.
 * @return #MDB_SUCCESS, #MDB_NOTFOUND when there is nothing further, or
 * the error from a helper.
 */
static int
mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
{
	MDB_page *mp;
	MDB_node *leaf;
	int rc;

	/* With C_DEL set, a next-duplicate request is answered NOTFOUND. */
	if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP))
		return MDB_NOTFOUND;

	if (!(mc->mc_flags & C_INITIALIZED))
		return mdb_cursor_first(mc, key, data);

	mp = mc->mc_pg[mc->mc_top];

	if (mc->mc_flags & C_EOF) {
		/* Still on or past the page's last key: nothing to move to.
		 * Otherwise drop the EOF mark and continue.
		 */
		if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1)
			return MDB_NOTFOUND;
		mc->mc_flags ^= C_EOF;
	}

	if (mc->mc_db->md_flags & MDB_DUPSORT) {
		leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
			if (op == MDB_NEXT || op == MDB_NEXT_DUP) {
				/* Step the nested cursor through the duplicates;
				 * its "key" output is our data.
				 */
				rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT);
				if (op != MDB_NEXT || rc != MDB_NOTFOUND) {
					if (rc == MDB_SUCCESS)
						MDB_GET_KEY(leaf, key);
					return rc;
				}
			}
		} else {
			/* No duplicates here: reset the nested cursor. */
			mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
			if (op == MDB_NEXT_DUP)
				return MDB_NOTFOUND;
		}
	}

	DPRINTF(("cursor_next: top page is %"Z"u in cursor %p",
		mdb_dbg_pgno(mp), (void *) mc));
	/* With C_DEL set the index is not advanced: clear the flag and
	 * return the item at the current index.
	 */
	if (mc->mc_flags & C_DEL) {
		mc->mc_flags ^= C_DEL;
		goto skip;
	}

	if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
		DPUTS("=====> move to next sibling page");
		if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) {
			mc->mc_flags |= C_EOF;
			return rc;
		}
		mp = mc->mc_pg[mc->mc_top];
		DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
	} else
		mc->mc_ki[mc->mc_top]++;

skip:
	DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u",
	    mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));

	if (IS_LEAF2(mp)) {
		/* LEAF2 pages hold fixed-size keys only; data is not set. */
		key->mv_size = mc->mc_db->md_pad;
		key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
		return MDB_SUCCESS;
	}

	mdb_cassert(mc, IS_LEAF(mp));
	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);

	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
		/* Set up the nested cursor and start at its first item. */
		mdb_xcursor_init1(mc, leaf);
		rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
		if (rc != MDB_SUCCESS)
			return rc;
	} else if (data) {
		if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)
			return rc;
	}

	MDB_GET_KEY(leaf, key);
	return MDB_SUCCESS;
}
5956
5957/** Move the cursor to the previous data item. */
5958static int
5959mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
5960{
5961 MDB_page *mp;
5962 MDB_node *leaf;
5963 int rc;
5964
5965 if (!(mc->mc_flags & C_INITIALIZED)) {
5966 rc = mdb_cursor_last(mc, key, data);
5967 if (rc)
5968 return rc;
5969 mc->mc_ki[mc->mc_top]++;
5970 }
5971
5972 mp = mc->mc_pg[mc->mc_top];
5973
5974 if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
5975 mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) {
5976 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
5977 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
5978 if (op == MDB_PREV || op == MDB_PREV_DUP) {
5979 rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV);
5980 if (op != MDB_PREV || rc != MDB_NOTFOUND) {
5981 if (rc == MDB_SUCCESS) {
5982 MDB_GET_KEY(leaf, key);
5983 mc->mc_flags &= ~C_EOF;
5984 }
5985 return rc;
5986 }
5987 }
5988 } else {
5989 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
5990 if (op == MDB_PREV_DUP)
5991 return MDB_NOTFOUND;
5992 }
5993 }
5994
5995 DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p",
5996 mdb_dbg_pgno(mp), (void *) mc));
5997
5998 mc->mc_flags &= ~(C_EOF|C_DEL);
5999
6000 if (mc->mc_ki[mc->mc_top] == 0) {
6001 DPUTS("=====> move to prev sibling page");
6002 if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) {
6003 return rc;
6004 }
6005 mp = mc->mc_pg[mc->mc_top];
6006 mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1;
6007 DPRINTF(("prev page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
6008 } else
6009 mc->mc_ki[mc->mc_top]--;
6010
6011 DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u",
6012 mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));
6013
6014 if (!IS_LEAF(mp))
6015 return MDB_CORRUPTED;
6016
6017 if (IS_LEAF2(mp)) {
6018 key->mv_size = mc->mc_db->md_pad;
6019 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
6020 return MDB_SUCCESS;
6021 }
6022
6023 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6024
6025 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6026 mdb_xcursor_init1(mc, leaf);
6027 rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
6028 if (rc != MDB_SUCCESS)
6029 return rc;
6030 } else if (data) {
6031 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)
6032 return rc;
6033 }
6034
6035 MDB_GET_KEY(leaf, key);
6036 return MDB_SUCCESS;
6037}
6038
/** Set the cursor on a specific data item.
 *
 * If the cursor is already initialized, the current leaf page is probed
 * first (first key, last key, current key) so that a full tree descent
 * via mdb_page_search() can be skipped when the key belongs on this page.
 *
 * @param[in] mc The cursor to position.
 * @param[in,out] key The key to look for. A zero-length key is rejected.
 * For #MDB_SET_RANGE and #MDB_SET_KEY it is overwritten with the key
 * actually found.
 * @param[in,out] data If non-NULL, receives the data of the item found.
 * For #MDB_GET_BOTH and #MDB_GET_BOTH_RANGE it is also the data value to
 * match against.
 * @param[in] op The cursor operation being performed.
 * @param[out] exactp If non-NULL, set to 1 when one of the quick probes
 * matches exactly, and otherwise filled in by mdb_node_search(); a
 * non-exact result is then reported as #MDB_NOTFOUND. If NULL, an inexact
 * position is accepted.
 * @return #MDB_SUCCESS, #MDB_BAD_VALSIZE for an empty key, #MDB_NOTFOUND,
 * or the error returned by a helper.
 */
static int
mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
    MDB_cursor_op op, int *exactp)
{
	int		 rc;
	MDB_page	*mp;
	MDB_node	*leaf = NULL;
	DKBUF;

	if (key->mv_size == 0)
		return MDB_BAD_VALSIZE;

	/* Any nested cursor state is stale once the main cursor moves. */
	if (mc->mc_xcursor)
		mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);

	/* See if we're already on the right page */
	if (mc->mc_flags & C_INITIALIZED) {
		MDB_val nodekey;

		mp = mc->mc_pg[mc->mc_top];
		if (!NUMKEYS(mp)) {
			mc->mc_ki[mc->mc_top] = 0;
			return MDB_NOTFOUND;
		}
		/* Probe 1: compare against the first key on the page. */
		if (mp->mp_flags & P_LEAF2) {
			nodekey.mv_size = mc->mc_db->md_pad;
			nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size);
		} else {
			leaf = NODEPTR(mp, 0);
			MDB_GET_KEY2(leaf, nodekey);
		}
		rc = mc->mc_dbx->md_cmp(key, &nodekey);
		if (rc == 0) {
			/* Probably happens rarely, but first node on the page
			 * was the one we wanted.
			 */
			mc->mc_ki[mc->mc_top] = 0;
			if (exactp)
				*exactp = 1;
			goto set1;
		}
		if (rc > 0) {
			/* key sorts after the first key on this page */
			unsigned int i;
			unsigned int nkeys = NUMKEYS(mp);
			if (nkeys > 1) {
				/* Probe 2: compare against the last key on the page. */
				if (mp->mp_flags & P_LEAF2) {
					nodekey.mv_data = LEAF2KEY(mp,
						 nkeys-1, nodekey.mv_size);
				} else {
					leaf = NODEPTR(mp, nkeys-1);
					MDB_GET_KEY2(leaf, nodekey);
				}
				rc = mc->mc_dbx->md_cmp(key, &nodekey);
				if (rc == 0) {
					/* last node was the one we wanted */
					mc->mc_ki[mc->mc_top] = nkeys-1;
					if (exactp)
						*exactp = 1;
					goto set1;
				}
				if (rc < 0) {
					/* first < key < last: the key belongs on this page */
					if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) {
						/* This is definitely the right page, skip search_page */
						/* Probe 3: compare against the current slot. */
						if (mp->mp_flags & P_LEAF2) {
							nodekey.mv_data = LEAF2KEY(mp,
								 mc->mc_ki[mc->mc_top], nodekey.mv_size);
						} else {
							leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
							MDB_GET_KEY2(leaf, nodekey);
						}
						rc = mc->mc_dbx->md_cmp(key, &nodekey);
						if (rc == 0) {
							/* current node was the one we wanted */
							if (exactp)
								*exactp = 1;
							goto set1;
						}
					}
					/* Search within this page only (set2 skips
					 * mdb_page_search); rc is reset because it is
					 * returned at the end of the function.
					 */
					rc = 0;
					mc->mc_flags &= ~C_EOF;
					goto set2;
				}
			}
			/* If any parents have right-sibs, search.
			 * Otherwise, there's nothing further.
			 */
			for (i=0; i<mc->mc_top; i++)
				if (mc->mc_ki[i] <
					NUMKEYS(mc->mc_pg[i])-1)
					break;
			if (i == mc->mc_top) {
				/* There are no other pages */
				mc->mc_ki[mc->mc_top] = nkeys;
				return MDB_NOTFOUND;
			}
		}
		if (!mc->mc_top) {
			/* There are no other pages */
			mc->mc_ki[mc->mc_top] = 0;
			/* Reached with key < first key on a single-page tree:
			 * a range lookup (no exactp) settles on slot 0.
			 */
			if (op == MDB_SET_RANGE && !exactp) {
				rc = 0;
				goto set1;
			} else
				return MDB_NOTFOUND;
		}
	} else {
		/* NOTE(review): presumably tells mdb_page_search() there is no
		 * usable cached root page - confirm against its implementation.
		 */
		mc->mc_pg[0] = 0;
	}

	/* Full descent from the root to the leaf page for this key. */
	rc = mdb_page_search(mc, key, 0);
	if (rc != MDB_SUCCESS)
		return rc;

	mp = mc->mc_pg[mc->mc_top];
	mdb_cassert(mc, IS_LEAF(mp));

set2:
	/* Locate the key within the leaf page. */
	leaf = mdb_node_search(mc, key, exactp);
	if (exactp != NULL && !*exactp) {
		/* MDB_SET specified and not an exact match. */
		return MDB_NOTFOUND;
	}

	if (leaf == NULL) {
		/* No node on this page; take the first node of the right sibling. */
		DPUTS("===> inexact leaf not found, goto sibling");
		if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) {
			mc->mc_flags |= C_EOF;
			return rc;		/* no entries matched */
		}
		mp = mc->mc_pg[mc->mc_top];
		mdb_cassert(mc, IS_LEAF(mp));
		leaf = NODEPTR(mp, 0);
	}

set1:
	/* The cursor is now positioned; mp and mc_ki[mc_top] identify the item. */
	mc->mc_flags |= C_INITIALIZED;
	mc->mc_flags &= ~C_EOF;

	if (IS_LEAF2(mp)) {
		/* LEAF2 pages hold fixed-size keys only; there is no data to return. */
		if (op == MDB_SET_RANGE || op == MDB_SET_KEY) {
			key->mv_size = mc->mc_db->md_pad;
			key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
		}
		return MDB_SUCCESS;
	}

	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
		/* Item has duplicates: position the nested cursor as well. */
		mdb_xcursor_init1(mc, leaf);
		if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) {
			rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
		} else {
			/* Remaining ops look up data among the duplicates;
			 * MDB_GET_BOTH demands an exact match (non-NULL exactp).
			 */
			int ex2, *ex2p;
			if (op == MDB_GET_BOTH) {
				ex2p = &ex2;
				ex2 = 0;
			} else {
				ex2p = NULL;
			}
			rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p);
			if (rc != MDB_SUCCESS)
				return rc;
		}
	} else if (data) {
		if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) {
			/* Single data item: compare it directly with the requested data. */
			MDB_val olddata;
			MDB_cmp_func *dcmp;
			if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS)
				return rc;
			dcmp = mc->mc_dbx->md_dcmp;
#if UINT_MAX < SIZE_MAX
			/* size_t-sized integer data needs the long comparator */
			if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t))
				dcmp = mdb_cmp_clong;
#endif
			rc = dcmp(data, &olddata);
			if (rc) {
				/* Mismatch: fatal for MDB_GET_BOTH, and for the range
				 * form when the requested data sorts after the stored one.
				 */
				if (op == MDB_GET_BOTH || rc > 0)
					return MDB_NOTFOUND;
				rc = 0;
			}
			*data = olddata;

		} else {
			if (mc->mc_xcursor)
				mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
			if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)
				return rc;
		}
	}

	/* The key already matches in all other cases */
	if (op == MDB_SET_RANGE || op == MDB_SET_KEY)
		MDB_GET_KEY(leaf, key);
	DPRINTF(("==> cursor placed on key [%s]", DKEY(key)));

	return rc;
}
6236
6237/** Move the cursor to the first item in the database. */
6238static int
6239mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
6240{
6241 int rc;
6242 MDB_node *leaf;
6243
6244 if (mc->mc_xcursor)
6245 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
6246
6247 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
6248 rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
6249 if (rc != MDB_SUCCESS)
6250 return rc;
6251 }
6252 mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
6253
6254 leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0);
6255 mc->mc_flags |= C_INITIALIZED;
6256 mc->mc_flags &= ~C_EOF;
6257
6258 mc->mc_ki[mc->mc_top] = 0;
6259
6260 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
6261 if ( key ) {
6262 key->mv_size = mc->mc_db->md_pad;
6263 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size);
6264 }
6265 return MDB_SUCCESS;
6266 }
6267
6268 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6269 mdb_xcursor_init1(mc, leaf);
6270 rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
6271 if (rc)
6272 return rc;
6273 } else if (data) {
6274 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)
6275 return rc;
6276 }
6277
6278 MDB_GET_KEY(leaf, key);
6279 return MDB_SUCCESS;
6280}
6281
6282/** Move the cursor to the last item in the database. */
6283static int
6284mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
6285{
6286 int rc;
6287 MDB_node *leaf;
6288
6289 if (mc->mc_xcursor)
6290 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
6291
6292 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
6293 rc = mdb_page_search(mc, NULL, MDB_PS_LAST);
6294 if (rc != MDB_SUCCESS)
6295 return rc;
6296 }
6297 mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
6298
6299 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
6300 mc->mc_flags |= C_INITIALIZED|C_EOF;
6301 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6302
6303 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
6304 if (key) {
6305 key->mv_size = mc->mc_db->md_pad;
6306 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size);
6307 }
6308 return MDB_SUCCESS;
6309 }
6310
6311 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6312 mdb_xcursor_init1(mc, leaf);
6313 rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
6314 if (rc)
6315 return rc;
6316 } else if (data) {
6317 if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)
6318 return rc;
6319 }
6320
6321 MDB_GET_KEY(leaf, key);
6322 return MDB_SUCCESS;
6323}
6324
/** Retrieve by cursor: dispatch a cursor operation to its handler.
 *
 * @param[in] mc The cursor to operate on; NULL yields EINVAL.
 * @param[in,out] key Key argument; required (non-NULL) for the
 * #MDB_SET family and #MDB_GET_BOTH / #MDB_GET_BOTH_RANGE.
 * @param[in,out] data Data argument; required (non-NULL) for
 * #MDB_GET_BOTH, #MDB_GET_BOTH_RANGE, the *_MULTIPLE operations and
 * #MDB_FIRST_DUP / #MDB_LAST_DUP.
 * @param[in] op The cursor operation to perform.
 * @return The handler's result; EINVAL for a missing argument, an
 * uninitialized cursor where one is required, or an unknown \b op;
 * #MDB_BAD_TXN when the transaction has #MDB_TXN_BLOCKED set;
 * #MDB_INCOMPATIBLE when \b op needs duplicate support the database
 * or cursor does not have.
 *
 * #C_DEL is cleared from the cursor flags before returning from the
 * dispatch switch, whatever the outcome.
 */
int
mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
    MDB_cursor_op op)
{
	int		 rc;
	int		 exact = 0;
	/* handler selected by MDB_FIRST_DUP / MDB_LAST_DUP for the shared mmove code */
	int		 (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data);

	if (mc == NULL)
		return EINVAL;

	if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)
		return MDB_BAD_TXN;

	switch (op) {
	case MDB_GET_CURRENT:
		if (!(mc->mc_flags & C_INITIALIZED)) {
			rc = EINVAL;
		} else {
			MDB_page *mp = mc->mc_pg[mc->mc_top];
			int nkeys = NUMKEYS(mp);
			/* Empty page or index past the end: clamp and report not found. */
			if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
				mc->mc_ki[mc->mc_top] = nkeys;
				rc = MDB_NOTFOUND;
				break;
			}
			rc = MDB_SUCCESS;
			if (IS_LEAF2(mp)) {
				key->mv_size = mc->mc_db->md_pad;
				key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
			} else {
				MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
				MDB_GET_KEY(leaf, key);
				if (data) {
					if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
						/* current duplicate comes from the nested cursor */
						rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT);
					} else {
						rc = mdb_node_read(mc, leaf, data);
					}
				}
			}
		}
		break;
	case MDB_GET_BOTH:
	case MDB_GET_BOTH_RANGE:
		if (data == NULL) {
			rc = EINVAL;
			break;
		}
		/* these two need a nested cursor for duplicates */
		if (mc->mc_xcursor == NULL) {
			rc = MDB_INCOMPATIBLE;
			break;
		}
		/* FALLTHRU */
	case MDB_SET:
	case MDB_SET_KEY:
	case MDB_SET_RANGE:
		if (key == NULL) {
			rc = EINVAL;
		} else {
			/* Only MDB_SET_RANGE accepts an inexact key match. */
			rc = mdb_cursor_set(mc, key, data, op,
				op == MDB_SET_RANGE ? NULL : &exact);
		}
		break;
	case MDB_GET_MULTIPLE:
		if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
			rc = EINVAL;
			break;
		}
		if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
			rc = MDB_INCOMPATIBLE;
			break;
		}
		rc = MDB_SUCCESS;
		/* Nested cursor unset or at EOF: succeed without touching data. */
		if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) ||
			(mc->mc_xcursor->mx_cursor.mc_flags & C_EOF))
			break;
		goto fetchm;
	case MDB_NEXT_MULTIPLE:
		if (data == NULL) {
			rc = EINVAL;
			break;
		}
		if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
			rc = MDB_INCOMPATIBLE;
			break;
		}
		rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP);
		if (rc == MDB_SUCCESS) {
			if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
				MDB_cursor *mx;
fetchm:
				/* Shared by the *_MULTIPLE ops: return every item on
				 * the nested cursor's current page as one span, and
				 * leave that cursor on the page's last item.
				 */
				mx = &mc->mc_xcursor->mx_cursor;
				data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) *
					mx->mc_db->md_pad;
				data->mv_data = METADATA(mx->mc_pg[mx->mc_top]);
				mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1;
			} else {
				rc = MDB_NOTFOUND;
			}
		}
		break;
	case MDB_PREV_MULTIPLE:
		if (data == NULL) {
			rc = EINVAL;
			break;
		}
		if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
			rc = MDB_INCOMPATIBLE;
			break;
		}
		/* An uninitialized cursor starts from the last item. */
		if (!(mc->mc_flags & C_INITIALIZED))
			rc = mdb_cursor_last(mc, key, data);
		else
			rc = MDB_SUCCESS;
		if (rc == MDB_SUCCESS) {
			MDB_cursor *mx = &mc->mc_xcursor->mx_cursor;
			if (mx->mc_flags & C_INITIALIZED) {
				/* step the nested cursor to its left sibling page */
				rc = mdb_cursor_sibling(mx, 0);
				if (rc == MDB_SUCCESS)
					goto fetchm;
			} else {
				rc = MDB_NOTFOUND;
			}
		}
		break;
	case MDB_NEXT:
	case MDB_NEXT_DUP:
	case MDB_NEXT_NODUP:
		rc = mdb_cursor_next(mc, key, data, op);
		break;
	case MDB_PREV:
	case MDB_PREV_DUP:
	case MDB_PREV_NODUP:
		rc = mdb_cursor_prev(mc, key, data, op);
		break;
	case MDB_FIRST:
		rc = mdb_cursor_first(mc, key, data);
		break;
	case MDB_FIRST_DUP:
		mfunc = mdb_cursor_first;
	mmove:
		/* Shared by MDB_FIRST_DUP and MDB_LAST_DUP; mfunc picks the direction. */
		if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
			rc = EINVAL;
			break;
		}
		if (mc->mc_xcursor == NULL) {
			rc = MDB_INCOMPATIBLE;
			break;
		}
		/* Index past the end of the page: clamp and report not found. */
		if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) {
			mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
			rc = MDB_NOTFOUND;
			break;
		}
		{
			MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
			/* No duplicates: the single item is both first and last. */
			if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
				MDB_GET_KEY(leaf, key);
				rc = mdb_node_read(mc, leaf, data);
				break;
			}
		}
		if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
			rc = EINVAL;
			break;
		}
		rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL);
		break;
	case MDB_LAST:
		rc = mdb_cursor_last(mc, key, data);
		break;
	case MDB_LAST_DUP:
		mfunc = mdb_cursor_last;
		goto mmove;
	default:
		DPRINTF(("unhandled/unimplemented cursor operation %u", op));
		rc = EINVAL;
		break;
	}

	/* Any completed get clears a pending C_DEL mark. */
	if (mc->mc_flags & C_DEL)
		mc->mc_flags ^= C_DEL;

	return rc;
}
6511
6512/** Touch all the pages in the cursor stack. Set mc_top.
6513 * Makes sure all the pages are writable, before attempting a write operation.
6514 * @param[in] mc The cursor to operate on.
6515 */
6516static int
6517mdb_cursor_touch(MDB_cursor *mc)
6518{
6519 int rc = MDB_SUCCESS;
6520
6521 if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) {
6522 /* Touch DB record of named DB */
6523 MDB_cursor mc2;
6524 MDB_xcursor mcx;
6525 if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
6526 return MDB_BAD_DBI;
6527 mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx);
6528 rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY);
6529 if (rc)
6530 return rc;
6531 *mc->mc_dbflag |= DB_DIRTY;
6532 }
6533 mc->mc_top = 0;
6534 if (mc->mc_snum) {
6535 do {
6536 rc = mdb_page_touch(mc);
6537 } while (!rc && ++(mc->mc_top) < mc->mc_snum);
6538 mc->mc_top = mc->mc_snum-1;
6539 }
6540 return rc;
6541}
6542
/** Do not spill pages to disk if txn is getting full, may fail instead.
 * Internal flag bit for mdb_cursor_put(): it is masked out of the
 * caller's flags on entry, and when set the mdb_page_spill() call is
 * skipped. The nested puts into a duplicate sub-cursor pass it.
 */
#define MDB_NOSPILL	0x8000
6545
6546int
6547mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
6548 unsigned int flags)
6549{
6550 MDB_env *env;
6551 MDB_node *leaf = NULL;
6552 MDB_page *fp, *mp, *sub_root = NULL;
6553 uint16_t fp_flags;
6554 MDB_val xdata, *rdata, dkey, olddata;
6555 MDB_db dummy;
6556 int do_sub = 0, insert_key, insert_data;
6557 unsigned int mcount = 0, dcount = 0, nospill;
6558 size_t nsize;
6559 int rc, rc2;
6560 unsigned int nflags;
6561 DKBUF;
6562
6563 if (mc == NULL || key == NULL)
6564 return EINVAL;
6565
6566 env = mc->mc_txn->mt_env;
6567
6568 /* Check this first so counter will always be zero on any
6569 * early failures.
6570 */
6571 if (flags & MDB_MULTIPLE) {
6572 dcount = data[1].mv_size;
6573 data[1].mv_size = 0;
6574 if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED))
6575 return MDB_INCOMPATIBLE;
6576 }
6577
6578 nospill = flags & MDB_NOSPILL;
6579 flags &= ~MDB_NOSPILL;
6580
6581 if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))
6582 return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
6583
6584 if (key->mv_size-1 >= ENV_MAXKEY(env))
6585 return MDB_BAD_VALSIZE;
6586
6587#if SIZE_MAX > MAXDATASIZE
6588 if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE))
6589 return MDB_BAD_VALSIZE;
6590#else
6591 if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env))
6592 return MDB_BAD_VALSIZE;
6593#endif
6594
6595 DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u",
6596 DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size));
6597
6598 dkey.mv_size = 0;
6599
6600 if (flags & MDB_CURRENT) {
6601 if (!(mc->mc_flags & C_INITIALIZED))
6602 return EINVAL;
6603 rc = MDB_SUCCESS;
6604 } else if (mc->mc_db->md_root == P_INVALID) {
6605 /* new database, cursor has nothing to point to */
6606 mc->mc_snum = 0;
6607 mc->mc_top = 0;
6608 mc->mc_flags &= ~C_INITIALIZED;
6609 rc = MDB_NO_ROOT;
6610 } else {
6611 int exact = 0;
6612 MDB_val d2;
6613 if (flags & MDB_APPEND) {
6614 MDB_val k2;
6615 rc = mdb_cursor_last(mc, &k2, &d2);
6616 if (rc == 0) {
6617 rc = mc->mc_dbx->md_cmp(key, &k2);
6618 if (rc > 0) {
6619 rc = MDB_NOTFOUND;
6620 mc->mc_ki[mc->mc_top]++;
6621 } else {
6622 /* new key is <= last key */
6623 rc = MDB_KEYEXIST;
6624 }
6625 }
6626 } else {
6627 rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact);
6628 }
6629 if ((flags & MDB_NOOVERWRITE) && rc == 0) {
6630 DPRINTF(("duplicate key [%s]", DKEY(key)));
6631 *data = d2;
6632 return MDB_KEYEXIST;
6633 }
6634 if (rc && rc != MDB_NOTFOUND)
6635 return rc;
6636 }
6637
6638 if (mc->mc_flags & C_DEL)
6639 mc->mc_flags ^= C_DEL;
6640
6641 /* Cursor is positioned, check for room in the dirty list */
6642 if (!nospill) {
6643 if (flags & MDB_MULTIPLE) {
6644 rdata = &xdata;
6645 xdata.mv_size = data->mv_size * dcount;
6646 } else {
6647 rdata = data;
6648 }
6649 if ((rc2 = mdb_page_spill(mc, key, rdata)))
6650 return rc2;
6651 }
6652
6653 if (rc == MDB_NO_ROOT) {
6654 MDB_page *np;
6655 /* new database, write a root leaf page */
6656 DPUTS("allocating new root leaf page");
6657 if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) {
6658 return rc2;
6659 }
6660 mdb_cursor_push(mc, np);
6661 mc->mc_db->md_root = np->mp_pgno;
6662 mc->mc_db->md_depth++;
6663 *mc->mc_dbflag |= DB_DIRTY;
6664 if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED))
6665 == MDB_DUPFIXED)
6666 np->mp_flags |= P_LEAF2;
6667 mc->mc_flags |= C_INITIALIZED;
6668 } else {
6669 /* make sure all cursor pages are writable */
6670 rc2 = mdb_cursor_touch(mc);
6671 if (rc2)
6672 return rc2;
6673 }
6674
6675 insert_key = insert_data = rc;
6676 if (insert_key) {
6677 /* The key does not exist */
6678 DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top]));
6679 if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
6680 LEAFSIZE(key, data) > env->me_nodemax)
6681 {
6682 /* Too big for a node, insert in sub-DB. Set up an empty
6683 * "old sub-page" for prep_subDB to expand to a full page.
6684 */
6685 fp_flags = P_LEAF|P_DIRTY;
6686 fp = env->me_pbuf;
6687 fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */
6688 fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE);
6689 olddata.mv_size = PAGEHDRSZ;
6690 goto prep_subDB;
6691 }
6692 } else {
6693 /* there's only a key anyway, so this is a no-op */
6694 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
6695 char *ptr;
6696 unsigned int ksize = mc->mc_db->md_pad;
6697 if (key->mv_size != ksize)
6698 return MDB_BAD_VALSIZE;
6699 ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
6700 memcpy(ptr, key->mv_data, ksize);
6701fix_parent:
6702 /* if overwriting slot 0 of leaf, need to
6703 * update branch key if there is a parent page
6704 */
6705 if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
6706 unsigned short dtop = 1;
6707 mc->mc_top--;
6708 /* slot 0 is always an empty key, find real slot */
6709 while (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
6710 mc->mc_top--;
6711 dtop++;
6712 }
6713 if (mc->mc_ki[mc->mc_top])
6714 rc2 = mdb_update_key(mc, key);
6715 else
6716 rc2 = MDB_SUCCESS;
6717 mc->mc_top += dtop;
6718 if (rc2)
6719 return rc2;
6720 }
6721 return MDB_SUCCESS;
6722 }
6723
6724more:
6725 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6726 olddata.mv_size = NODEDSZ(leaf);
6727 olddata.mv_data = NODEDATA(leaf);
6728
6729 /* DB has dups? */
6730 if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
6731 /* Prepare (sub-)page/sub-DB to accept the new item,
6732 * if needed. fp: old sub-page or a header faking
6733 * it. mp: new (sub-)page. offset: growth in page
6734 * size. xdata: node data with new page or DB.
6735 */
6736 unsigned i, offset = 0;
6737 mp = fp = xdata.mv_data = env->me_pbuf;
6738 mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
6739
6740 /* Was a single item before, must convert now */
6741 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6742 MDB_cmp_func *dcmp;
6743 /* Just overwrite the current item */
6744 if (flags == MDB_CURRENT)
6745 goto current;
6746 dcmp = mc->mc_dbx->md_dcmp;
6747#if UINT_MAX < SIZE_MAX
6748 if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t))
6749 dcmp = mdb_cmp_clong;
6750#endif
6751 /* does data match? */
6752 if (!dcmp(data, &olddata)) {
6753 if (flags & (MDB_NODUPDATA|MDB_APPENDDUP))
6754 return MDB_KEYEXIST;
6755 /* overwrite it */
6756 goto current;
6757 }
6758
6759 /* Back up original data item */
6760 dkey.mv_size = olddata.mv_size;
6761 dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size);
6762
6763 /* Make sub-page header for the dup items, with dummy body */
6764 fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
6765 fp->mp_lower = (PAGEHDRSZ-PAGEBASE);
6766 xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
6767 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
6768 fp->mp_flags |= P_LEAF2;
6769 fp->mp_pad = data->mv_size;
6770 xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
6771 } else {
6772 xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
6773 (dkey.mv_size & 1) + (data->mv_size & 1);
6774 }
6775 fp->mp_upper = xdata.mv_size - PAGEBASE;
6776 olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */
6777 } else if (leaf->mn_flags & F_SUBDATA) {
6778 /* Data is on sub-DB, just store it */
6779 flags |= F_DUPDATA|F_SUBDATA;
6780 goto put_sub;
6781 } else {
6782 /* Data is on sub-page */
6783 fp = olddata.mv_data;
6784 switch (flags) {
6785 default:
6786 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
6787 offset = EVEN(NODESIZE + sizeof(indx_t) +
6788 data->mv_size);
6789 break;
6790 }
6791 offset = fp->mp_pad;
6792 if (SIZELEFT(fp) < offset) {
6793 offset *= 4; /* space for 4 more */
6794 break;
6795 }
6796 /* FALLTHRU */ /* Big enough MDB_DUPFIXED sub-page */
6797 case MDB_CURRENT:
6798 fp->mp_flags |= P_DIRTY;
6799 COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
6800 mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
6801 flags |= F_DUPDATA;
6802 goto put_sub;
6803 }
6804 xdata.mv_size = olddata.mv_size + offset;
6805 }
6806
6807 fp_flags = fp->mp_flags;
6808 if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) {
6809 /* Too big for a sub-page, convert to sub-DB */
6810 fp_flags &= ~P_SUBP;
6811prep_subDB:
6812 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
6813 fp_flags |= P_LEAF2;
6814 dummy.md_pad = fp->mp_pad;
6815 dummy.md_flags = MDB_DUPFIXED;
6816 if (mc->mc_db->md_flags & MDB_INTEGERDUP)
6817 dummy.md_flags |= MDB_INTEGERKEY;
6818 } else {
6819 dummy.md_pad = 0;
6820 dummy.md_flags = 0;
6821 }
6822 dummy.md_depth = 1;
6823 dummy.md_branch_pages = 0;
6824 dummy.md_leaf_pages = 1;
6825 dummy.md_overflow_pages = 0;
6826 dummy.md_entries = NUMKEYS(fp);
6827 xdata.mv_size = sizeof(MDB_db);
6828 xdata.mv_data = &dummy;
6829 if ((rc = mdb_page_alloc(mc, 1, &mp)))
6830 return rc;
6831 offset = env->me_psize - olddata.mv_size;
6832 flags |= F_DUPDATA|F_SUBDATA;
6833 dummy.md_root = mp->mp_pgno;
6834 sub_root = mp;
6835 }
6836 if (mp != fp) {
6837 mp->mp_flags = fp_flags | P_DIRTY;
6838 mp->mp_pad = fp->mp_pad;
6839 mp->mp_lower = fp->mp_lower;
6840 mp->mp_upper = fp->mp_upper + offset;
6841 if (fp_flags & P_LEAF2) {
6842 memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
6843 } else {
6844 memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
6845 olddata.mv_size - fp->mp_upper - PAGEBASE);
6846 memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0]));
6847 for (i=0; i<NUMKEYS(fp); i++)
6848 mp->mp_ptrs[i] += offset;
6849 }
6850 }
6851
6852 rdata = &xdata;
6853 flags |= F_DUPDATA;
6854 do_sub = 1;
6855 if (!insert_key)
6856 mdb_node_del(mc, 0);
6857 goto new_sub;
6858 }
6859current:
6860 /* LMDB passes F_SUBDATA in 'flags' to write a DB record */
6861 if ((leaf->mn_flags ^ flags) & F_SUBDATA)
6862 return MDB_INCOMPATIBLE;
6863 /* overflow page overwrites need special handling */
6864 if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
6865 MDB_page *omp;
6866 pgno_t pg;
6867 int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
6868
6869 memcpy(&pg, olddata.mv_data, sizeof(pg));
6870 if ((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0)
6871 return rc2;
6872 ovpages = omp->mp_pages;
6873
6874 /* Is the ov page large enough? */
6875 if (ovpages >= dpages) {
6876 if (!(omp->mp_flags & P_DIRTY) &&
6877 (level || (env->me_flags & MDB_WRITEMAP)))
6878 {
6879 rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
6880 if (rc)
6881 return rc;
6882 level = 0; /* dirty in this txn or clean */
6883 }
6884 /* Is it dirty? */
6885 if (omp->mp_flags & P_DIRTY) {
6886 /* yes, overwrite it. Note in this case we don't
6887 * bother to try shrinking the page if the new data
6888 * is smaller than the overflow threshold.
6889 */
6890 if (level > 1) {
6891 /* It is writable only in a parent txn */
6892 size_t sz = (size_t) env->me_psize * ovpages, off;
6893 MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
6894 MDB_ID2 id2;
6895 if (!np)
6896 return ENOMEM;
6897 id2.mid = pg;
6898 id2.mptr = np;
6899 /* Note - this page is already counted in parent's dirty_room */
6900 rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
6901 mdb_cassert(mc, rc2 == 0);
6902 /* Currently we make the page look as with put() in the
6903 * parent txn, in case the user peeks at MDB_RESERVEd
6904 * or unused parts. Some users treat ovpages specially.
6905 */
6906 if (!(flags & MDB_RESERVE)) {
6907 /* Skip the part where LMDB will put *data.
6908 * Copy end of page, adjusting alignment so
6909 * compiler may copy words instead of bytes.
6910 */
6911 off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t);
6912 memcpy((size_t *)((char *)np + off),
6913 (size_t *)((char *)omp + off), sz - off);
6914 sz = PAGEHDRSZ;
6915 }
6916 memcpy(np, omp, sz); /* Copy beginning of page */
6917 omp = np;
6918 }
6919 SETDSZ(leaf, data->mv_size);
6920 if (F_ISSET(flags, MDB_RESERVE))
6921 data->mv_data = METADATA(omp);
6922 else
6923 memcpy(METADATA(omp), data->mv_data, data->mv_size);
6924 return MDB_SUCCESS;
6925 }
6926 }
6927 if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
6928 return rc2;
6929 } else if (data->mv_size == olddata.mv_size) {
6930 /* same size, just replace it. Note that we could
6931 * also reuse this node if the new data is smaller,
6932 * but instead we opt to shrink the node in that case.
6933 */
6934 if (F_ISSET(flags, MDB_RESERVE))
6935 data->mv_data = olddata.mv_data;
6936 else if (!(mc->mc_flags & C_SUB))
6937 memcpy(olddata.mv_data, data->mv_data, data->mv_size);
6938 else {
6939 memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
6940 goto fix_parent;
6941 }
6942 return MDB_SUCCESS;
6943 }
6944 mdb_node_del(mc, 0);
6945 }
6946
6947 rdata = data;
6948
6949new_sub:
6950 nflags = flags & NODE_ADD_FLAGS;
6951 nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
6952 if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
6953 if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
6954 nflags &= ~MDB_APPEND; /* sub-page may need room to grow */
6955 if (!insert_key)
6956 nflags |= MDB_SPLIT_REPLACE;
6957 rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags);
6958 } else {
6959 /* There is room already in this leaf page. */
6960 rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
6961 if (rc == 0) {
6962 /* Adjust other cursors pointing to mp */
6963 MDB_cursor *m2, *m3;
6964 MDB_dbi dbi = mc->mc_dbi;
6965 unsigned i = mc->mc_top;
6966 MDB_page *mp = mc->mc_pg[i];
6967
6968 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
6969 if (mc->mc_flags & C_SUB)
6970 m3 = &m2->mc_xcursor->mx_cursor;
6971 else
6972 m3 = m2;
6973 if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue;
6974 if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) {
6975 m3->mc_ki[i]++;
6976 }
6977 XCURSOR_REFRESH(m3, i, mp);
6978 }
6979 }
6980 }
6981
6982 if (rc == MDB_SUCCESS) {
6983 /* Now store the actual data in the child DB. Note that we're
6984 * storing the user data in the keys field, so there are strict
6985 * size limits on dupdata. The actual data fields of the child
6986 * DB are all zero size.
6987 */
6988 if (do_sub) {
6989 int xflags, new_dupdata;
6990 size_t ecount;
6991put_sub:
6992 xdata.mv_size = 0;
6993 xdata.mv_data = "";
6994 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6995 if ((flags & (MDB_CURRENT|MDB_APPENDDUP)) == MDB_CURRENT) {
6996 xflags = MDB_CURRENT|MDB_NOSPILL;
6997 } else {
6998 mdb_xcursor_init1(mc, leaf);
6999 xflags = (flags & MDB_NODUPDATA) ?
7000 MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL;
7001 }
7002 if (sub_root)
7003 mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root;
7004 new_dupdata = (int)dkey.mv_size;
7005 /* converted, write the original data first */
7006 if (dkey.mv_size) {
7007 rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
7008 if (rc)
7009 goto bad_sub;
7010 /* we've done our job */
7011 dkey.mv_size = 0;
7012 }
7013 if (!(leaf->mn_flags & F_SUBDATA) || sub_root) {
7014 /* Adjust other cursors pointing to mp */
7015 MDB_cursor *m2;
7016 MDB_xcursor *mx = mc->mc_xcursor;
7017 unsigned i = mc->mc_top;
7018 MDB_page *mp = mc->mc_pg[i];
7019
7020 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
7021 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
7022 if (!(m2->mc_flags & C_INITIALIZED)) continue;
7023 if (m2->mc_pg[i] == mp) {
7024 if (m2->mc_ki[i] == mc->mc_ki[i]) {
7025 mdb_xcursor_init2(m2, mx, new_dupdata);
7026 } else if (!insert_key) {
7027 XCURSOR_REFRESH(m2, i, mp);
7028 }
7029 }
7030 }
7031 }
7032 ecount = mc->mc_xcursor->mx_db.md_entries;
7033 if (flags & MDB_APPENDDUP)
7034 xflags |= MDB_APPEND;
7035 rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
7036 if (flags & F_SUBDATA) {
7037 void *db = NODEDATA(leaf);
7038 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
7039 }
7040 insert_data = mc->mc_xcursor->mx_db.md_entries - ecount;
7041 }
7042 /* Increment count unless we just replaced an existing item. */
7043 if (insert_data)
7044 mc->mc_db->md_entries++;
7045 if (insert_key) {
7046 /* Invalidate txn if we created an empty sub-DB */
7047 if (rc)
7048 goto bad_sub;
7049 /* If we succeeded and the key didn't exist before,
7050 * make sure the cursor is marked valid.
7051 */
7052 mc->mc_flags |= C_INITIALIZED;
7053 }
7054 if (flags & MDB_MULTIPLE) {
7055 if (!rc) {
7056 mcount++;
7057 /* let caller know how many succeeded, if any */
7058 data[1].mv_size = mcount;
7059 if (mcount < dcount) {
7060 data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
7061 insert_key = insert_data = 0;
7062 goto more;
7063 }
7064 }
7065 }
7066 return rc;
7067bad_sub:
7068 if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */
7069 rc = MDB_CORRUPTED;
7070 }
7071 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
7072 return rc;
7073}
7074
/** Delete the item the cursor is currently positioned on.
 * If the current node holds sorted duplicates (#F_DUPDATA) only the
 * current duplicate is removed, unless #MDB_NODUPDATA is set in \b flags,
 * in which case the whole node (key plus all of its duplicates) is
 * handed to mdb_cursor_del0().
 * @param[in] mc The cursor; it must be #C_INITIALIZED and belong to a
 *	write transaction.
 * @param[in] flags Any of #MDB_NODUPDATA and #MDB_NOSPILL. #F_SUBDATA is
 *	also honoured here: it must match the node's own #F_SUBDATA bit
 *	when the node is not #F_DUPDATA.
 * @return 0 on success, non-zero on failure. Errors produced directly
 *	by this function are:
 * <ul>
 *	<li>EACCES - the transaction is read-only.
 *	<li>MDB_BAD_TXN - the transaction is blocked.
 *	<li>EINVAL - the cursor is not initialized.
 *	<li>MDB_NOTFOUND - the cursor index is past the last key of its page.
 *	<li>MDB_CORRUPTED - the page at the top of the cursor stack is not a leaf.
 *	<li>MDB_INCOMPATIBLE - #F_SUBDATA in \b flags disagrees with the node.
 * </ul>
 *	Errors from the helpers it calls are passed through unchanged.
 *	#MDB_TXN_ERROR is set by this function only on the paths that
 *	jump to \b fail.
 */
int
mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
{
	MDB_node	*leaf;
	MDB_page	*mp;
	int rc;

	/* Read-only takes precedence over blocked when both are set */
	if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))
		return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;

	if (!(mc->mc_flags & C_INITIALIZED))
		return EINVAL;

	if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
		return MDB_NOTFOUND;

	/* The recursive call on the sub-cursor below passes MDB_NOSPILL,
	 * so spilling is only attempted once, by the outermost call.
	 */
	if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
		return rc;

	rc = mdb_cursor_touch(mc);
	if (rc)
		return rc;

	/* Fetch the page only now, after mdb_cursor_touch() has run */
	mp = mc->mc_pg[mc->mc_top];
	if (!IS_LEAF(mp))
		return MDB_CORRUPTED;
	/* LEAF2 pages are not accessed through NODEPTR(); skip node handling */
	if (IS_LEAF2(mp))
		goto del_key;
	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);

	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
		if (flags & MDB_NODUPDATA) {
			/* mdb_cursor_del0() will subtract the final entry */
			mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
			mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
		} else {
			/* For an inline sub-page, point the sub-cursor at the
			 * node's data before deleting through it.
			 */
			if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
				mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
			}
			rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL);
			if (rc)
				return rc;
			/* If sub-DB still has entries, we're done */
			if (mc->mc_xcursor->mx_db.md_entries) {
				if (leaf->mn_flags & F_SUBDATA) {
					/* update subDB info */
					void *db = NODEDATA(leaf);
					memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
				} else {
					MDB_cursor *m2;
					/* shrink fake page */
					mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
					/* re-fetch the node and its data after the shrink */
					leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
					mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
					/* fix other sub-DB cursors pointed at fake pages on this page */
					for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
						if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
						if (!(m2->mc_flags & C_INITIALIZED)) continue;
						if (m2->mc_pg[mc->mc_top] == mp) {
							XCURSOR_REFRESH(m2, mc->mc_top, mp);
						}
					}
				}
				/* one duplicate removed; the key itself remains */
				mc->mc_db->md_entries--;
				return rc;
			} else {
				mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
			}
			/* otherwise fall thru and delete the sub-DB */
		}

		if (leaf->mn_flags & F_SUBDATA) {
			/* add all the child DB's pages to the free list */
			rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
			if (rc)
				goto fail;
		}
	}
	/* LMDB passes F_SUBDATA in 'flags' to delete a DB record */
	else if ((leaf->mn_flags ^ flags) & F_SUBDATA) {
		rc = MDB_INCOMPATIBLE;
		goto fail;
	}

	/* add overflow pages to free list */
	if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
		MDB_page *omp;
		pgno_t pg;

		/* the node data of a F_BIGDATA node is the overflow page number */
		memcpy(&pg, NODEDATA(leaf), sizeof(pg));
		if ((rc = mdb_page_get(mc, pg, &omp, NULL)) ||
			(rc = mdb_ovpage_free(mc, omp)))
			goto fail;
	}

del_key:
	return mdb_cursor_del0(mc);

fail:
	mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}
7177
7178/** Allocate and initialize new pages for a database.
7179 * Set #MDB_TXN_ERROR on failure.
7180 * @param[in] mc a cursor on the database being added to.
7181 * @param[in] flags flags defining what type of page is being allocated.
7182 * @param[in] num the number of pages to allocate. This is usually 1,
7183 * unless allocating overflow pages for a large record.
7184 * @param[out] mp Address of a page, or NULL on failure.
7185 * @return 0 on success, non-zero on failure.
7186 */
7187static int
7188mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp)
7189{
7190 MDB_page *np;
7191 int rc;
7192
7193 if ((rc = mdb_page_alloc(mc, num, &np)))
7194 return rc;
7195 DPRINTF(("allocated new mpage %"Z"u, page size %u",
7196 np->mp_pgno, mc->mc_txn->mt_env->me_psize));
7197 np->mp_flags = flags | P_DIRTY;
7198 np->mp_lower = (PAGEHDRSZ-PAGEBASE);
7199 np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE;
7200
7201 if (IS_BRANCH(np))
7202 mc->mc_db->md_branch_pages++;
7203 else if (IS_LEAF(np))
7204 mc->mc_db->md_leaf_pages++;
7205 else if (IS_OVERFLOW(np)) {
7206 mc->mc_db->md_overflow_pages += num;
7207 np->mp_pages = num;
7208 }
7209 *mp = np;
7210
7211 return 0;
7212}
7213
7214/** Calculate the size of a leaf node.
7215 * The size depends on the environment's page size; if a data item
7216 * is too large it will be put onto an overflow page and the node
7217 * size will only include the key and not the data. Sizes are always
7218 * rounded up to an even number of bytes, to guarantee 2-byte alignment
7219 * of the #MDB_node headers.
7220 * @param[in] env The environment handle.
7221 * @param[in] key The key for the node.
7222 * @param[in] data The data for the node.
7223 * @return The number of bytes needed to store the node.
7224 */
7225static size_t
7226mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data)
7227{
7228 size_t sz;
7229
7230 sz = LEAFSIZE(key, data);
7231 if (sz > env->me_nodemax) {
7232 /* put on overflow page */
7233 sz -= data->mv_size - sizeof(pgno_t);
7234 }
7235
7236 return EVEN(sz + sizeof(indx_t));
7237}
7238
7239/** Calculate the size of a branch node.
7240 * The size should depend on the environment's page size but since
7241 * we currently don't support spilling large keys onto overflow
7242 * pages, it's simply the size of the #MDB_node header plus the
7243 * size of the key. Sizes are always rounded up to an even number
7244 * of bytes, to guarantee 2-byte alignment of the #MDB_node headers.
7245 * @param[in] env The environment handle.
7246 * @param[in] key The key for the node.
7247 * @return The number of bytes needed to store the node.
7248 */
7249static size_t
7250mdb_branch_size(MDB_env *env, MDB_val *key)
7251{
7252 size_t sz;
7253
7254 sz = INDXSIZE(key);
7255 if (sz > env->me_nodemax) {
7256 /* put on overflow page */
7257 /* not implemented */
7258 /* sz -= key->size - sizeof(pgno_t); */
7259 }
7260
7261 return sz + sizeof(indx_t);
7262}
7263
/** Add a node to the page pointed to by the cursor.
 * Set #MDB_TXN_ERROR on failure.
 * (In this function the flag is set on the MDB_PAGE_FULL path; a
 * failure of mdb_page_new() is returned as-is.)
 * @param[in] mc The cursor for this operation.
 * @param[in] indx The index on the page where the new node should be added.
 * @param[in] key The key for the new node.
 * @param[in] data The data for the new node, if any.
 * @param[in] pgno The page number, if adding a branch node.
 * @param[in] flags Flags for the node.
 * @return 0 on success, non-zero on failure. Possible errors are:
 * <ul>
 *	<li>ENOMEM - failed to allocate overflow pages for the node.
 *	<li>MDB_PAGE_FULL - there is insufficient room in the page. This error
 *	should never happen since all callers already calculate the
 *	page's free space before calling this function.
 * </ul>
 */
static int
mdb_node_add(MDB_cursor *mc, indx_t indx,
    MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags)
{
	unsigned int	 i;
	size_t		 node_size = NODESIZE;
	ssize_t		 room;
	indx_t		 ofs;
	MDB_node	*node;
	MDB_page	*mp = mc->mc_pg[mc->mc_top];
	MDB_page	*ofp = NULL;		/* overflow page */
	void		*ndata;
	DKBUF;

	mdb_cassert(mc, mp->mp_upper >= mp->mp_lower);

	DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]",
	    IS_LEAF(mp) ? "leaf" : "branch",
		IS_SUBP(mp) ? "sub-" : "",
		mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0,
		key ? key->mv_size : 0, key ? DKEY(key) : "null"));

	/* LEAF2 pages hold fixed-size keys addressed via LEAF2KEY(), with
	 * no node headers; note that no free-space check is made here.
	 */
	if (IS_LEAF2(mp)) {
		/* Move higher keys up one slot. */
		int ksize = mc->mc_db->md_pad, dif;
		char *ptr = LEAF2KEY(mp, indx, ksize);
		dif = NUMKEYS(mp) - indx;
		if (dif > 0)
			memmove(ptr+ksize, ptr, dif*ksize);
		/* insert new key */
		memcpy(ptr, key->mv_data, ksize);

		/* Just using these for counting */
		/* net effect: (upper - lower) shrinks by exactly ksize */
		mp->mp_lower += sizeof(indx_t);
		mp->mp_upper -= ksize - sizeof(indx_t);
		return MDB_SUCCESS;
	}

	/* Space available for the node itself, after reserving its index slot */
	room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
	if (key != NULL)
		node_size += key->mv_size;
	if (IS_LEAF(mp)) {
		mdb_cassert(mc, key && data);
		if (F_ISSET(flags, F_BIGDATA)) {
			/* Data already on overflow page. */
			node_size += sizeof(pgno_t);
		} else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) {
			int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
			int rc;
			/* Put data on overflow page. */
			DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
			    data->mv_size, node_size+data->mv_size));
			node_size = EVEN(node_size + sizeof(pgno_t));
			/* check the fit before allocating any overflow pages */
			if ((ssize_t)node_size > room)
				goto full;
			if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
				return rc;
			DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno));
			flags |= F_BIGDATA;
			/* size is already rounded and checked above */
			goto update;
		} else {
			node_size += data->mv_size;
		}
	}
	node_size = EVEN(node_size);
	if ((ssize_t)node_size > room)
		goto full;

update:
	/* Move higher pointers up one slot. */
	for (i = NUMKEYS(mp); i > indx; i--)
		mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];

	/* Adjust free space offsets. */
	/* the new node is placed directly below the current mp_upper */
	ofs = mp->mp_upper - node_size;
	mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t));
	mp->mp_ptrs[indx] = ofs;
	mp->mp_upper = ofs;
	mp->mp_lower += sizeof(indx_t);

	/* Write the node data. */
	node = NODEPTR(mp, indx);
	node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
	node->mn_flags = flags;
	/* leaf nodes record the data size, branch nodes the child page number */
	if (IS_LEAF(mp))
		SETDSZ(node,data->mv_size);
	else
		SETPGNO(node,pgno);

	if (key)
		memcpy(NODEKEY(node), key->mv_data, key->mv_size);

	if (IS_LEAF(mp)) {
		ndata = NODEDATA(node);
		if (ofp == NULL) {
			/* No overflow page was allocated by this call */
			if (F_ISSET(flags, F_BIGDATA))
				/* caller's data already is an overflow page number */
				memcpy(ndata, data->mv_data, sizeof(pgno_t));
			else if (F_ISSET(flags, MDB_RESERVE))
				/* hand back the in-page location instead of copying */
				data->mv_data = ndata;
			else
				memcpy(ndata, data->mv_data, data->mv_size);
		} else {
			/* Store the new overflow page number in the node, then
			 * put (or reserve) the data on the overflow page itself.
			 */
			memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t));
			ndata = METADATA(ofp);
			if (F_ISSET(flags, MDB_RESERVE))
				data->mv_data = ndata;
			else
				memcpy(ndata, data->mv_data, data->mv_size);
		}
	}

	return MDB_SUCCESS;

full:
	DPRINTF(("not enough room in page %"Z"u, got %u ptrs",
		mdb_dbg_pgno(mp), NUMKEYS(mp)));
	DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room));
	DPRINTF(("node size = %"Z"u", node_size));
	mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	return MDB_PAGE_FULL;
}
7401
/** Delete the specified node from a page.
 * The node at the cursor's current index on its top page is removed
 * and the page is compacted in place: the index array is closed up
 * and the node storage below the deleted node is shifted upward.
 * Only the page is modified; the cursor's own fields are not updated.
 * @param[in] mc Cursor pointing to the node to delete.
 * @param[in] ksize The size of a node. Only used if the page is
 * part of a #MDB_DUPFIXED database.
 */
static void
mdb_node_del(MDB_cursor *mc, int ksize)
{
	MDB_page *mp = mc->mc_pg[mc->mc_top];
	indx_t	indx = mc->mc_ki[mc->mc_top];
	unsigned int	 sz;
	indx_t		 i, j, numkeys, ptr;
	MDB_node	*node;
	char		*base;

	DPRINTF(("delete node %u on %s page %"Z"u", indx,
	    IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp)));
	numkeys = NUMKEYS(mp);
	mdb_cassert(mc, indx < numkeys);

	if (IS_LEAF2(mp)) {
		/* x = number of fixed-size keys that follow the deleted one */
		int x = numkeys - 1 - indx;
		base = LEAF2KEY(mp, indx, ksize);
		if (x)
			memmove(base, base + ksize, x * ksize);
		/* counting only: (upper - lower) grows by exactly ksize */
		mp->mp_lower -= sizeof(indx_t);
		mp->mp_upper += ksize - sizeof(indx_t);
		return;
	}

	/* Compute the on-page size of the node being removed, using the
	 * same rules mdb_node_add() uses when it stores a node.
	 */
	node = NODEPTR(mp, indx);
	sz = NODESIZE + node->mn_ksize;
	if (IS_LEAF(mp)) {
		if (F_ISSET(node->mn_flags, F_BIGDATA))
			sz += sizeof(pgno_t);
		else
			sz += NODEDSZ(node);
	}
	sz = EVEN(sz);

	/* Drop slot indx from the index array; every node stored at a
	 * lower offset than the deleted one is about to move up by sz,
	 * so its offset is adjusted here.
	 */
	ptr = mp->mp_ptrs[indx];
	for (i = j = 0; i < numkeys; i++) {
		if (i != indx) {
			mp->mp_ptrs[j] = mp->mp_ptrs[i];
			if (mp->mp_ptrs[i] < ptr)
				mp->mp_ptrs[j] += sz;
			j++;
		}
	}

	/* Slide the bytes between mp_upper and the deleted node upward
	 * by sz, overwriting the deleted node.
	 */
	base = (char *)mp + mp->mp_upper + PAGEBASE;
	memmove(base + sz, base, ptr - mp->mp_upper);

	mp->mp_lower -= sizeof(indx_t);
	mp->mp_upper += sz;
}
7458
/** Compact the main page after deleting a node on a subpage.
 * The free space inside the subpage stored in node \b indx is given
 * back to the main page: the node's data size is reduced by that
 * amount and the node storage located at or below the node is shifted
 * upward accordingly.
 * @param[in] mp The main page to operate on.
 * @param[in] indx The index of the subpage on the main page.
 */
static void
mdb_node_shrink(MDB_page *mp, indx_t indx)
{
	MDB_node *node;
	MDB_page *sp, *xp;
	char *base;
	indx_t delta, nsize, len, ptr;
	int i;

	node = NODEPTR(mp, indx);
	sp = (MDB_page *)NODEDATA(node);
	delta = SIZELEFT(sp);		/* unused bytes inside the subpage */
	nsize = NODEDSZ(node) - delta;	/* node data size after shrinking */

	/* Prepare to shift upward, set len = length(subpage part to shift) */
	if (IS_LEAF2(sp)) {
		len = nsize;
		if (nsize & 1)
			return;		/* do not make the node uneven-sized */
	} else {
		xp = (MDB_page *)((char *)sp + delta); /* destination subpage */
		/* Write the slot array where it will be read after the shift,
		 * with every offset reduced by delta. Source and destination
		 * may overlap; presumably that is why the loop runs from the
		 * highest slot downward - confirm.
		 */
		for (i = NUMKEYS(sp); --i >= 0; )
			xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
		/* only the header still needs to be moved by the memmove below */
		len = PAGEHDRSZ;
	}
	/* the shrunken subpage has no free space left */
	sp->mp_upper = sp->mp_lower;
	COPY_PGNO(sp->mp_pgno, mp->mp_pgno);
	SETDSZ(node, nsize);

	/* Shift <lower nodes...initial part of subpage> upward */
	base = (char *)mp + mp->mp_upper + PAGEBASE;
	memmove(base + delta, base, (char *)sp + len - base);

	/* Every node at or below this node's offset moved up by delta */
	ptr = mp->mp_ptrs[indx];
	for (i = NUMKEYS(mp); --i >= 0; ) {
		if (mp->mp_ptrs[i] <= ptr)
			mp->mp_ptrs[i] += delta;
	}
	mp->mp_upper += delta;
}
7503
7504/** Initial setup of a sorted-dups cursor.
7505 * Sorted duplicates are implemented as a sub-database for the given key.
7506 * The duplicate data items are actually keys of the sub-database.
7507 * Operations on the duplicate data items are performed using a sub-cursor
7508 * initialized when the sub-database is first accessed. This function does
7509 * the preliminary setup of the sub-cursor, filling in the fields that
7510 * depend only on the parent DB.
7511 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
7512 */
7513static void
7514mdb_xcursor_init0(MDB_cursor *mc)
7515{
7516 MDB_xcursor *mx = mc->mc_xcursor;
7517
7518 mx->mx_cursor.mc_xcursor = NULL;
7519 mx->mx_cursor.mc_txn = mc->mc_txn;
7520 mx->mx_cursor.mc_db = &mx->mx_db;
7521 mx->mx_cursor.mc_dbx = &mx->mx_dbx;
7522 mx->mx_cursor.mc_dbi = mc->mc_dbi;
7523 mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
7524 mx->mx_cursor.mc_snum = 0;
7525 mx->mx_cursor.mc_top = 0;
7526 mx->mx_cursor.mc_flags = C_SUB;
7527 mx->mx_dbx.md_name.mv_size = 0;
7528 mx->mx_dbx.md_name.mv_data = NULL;
7529 mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
7530 mx->mx_dbx.md_dcmp = NULL;
7531 mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
7532}
7533
7534/** Final setup of a sorted-dups cursor.
7535 * Sets up the fields that depend on the data from the main cursor.
7536 * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
7537 * @param[in] node The data containing the #MDB_db record for the
7538 * sorted-dup database.
7539 */
7540static void
7541mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
7542{
7543 MDB_xcursor *mx = mc->mc_xcursor;
7544
7545 if (node->mn_flags & F_SUBDATA) {
7546 memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
7547 mx->mx_cursor.mc_pg[0] = 0;
7548 mx->mx_cursor.mc_snum = 0;
7549 mx->mx_cursor.mc_top = 0;
7550 mx->mx_cursor.mc_flags = C_SUB;
7551 } else {
7552 MDB_page *fp = NODEDATA(node);
7553 mx->mx_db.md_pad = 0;
7554 mx->mx_db.md_flags = 0;
7555 mx->mx_db.md_depth = 1;
7556 mx->mx_db.md_branch_pages = 0;
7557 mx->mx_db.md_leaf_pages = 1;
7558 mx->mx_db.md_overflow_pages = 0;
7559 mx->mx_db.md_entries = NUMKEYS(fp);
7560 COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
7561 mx->mx_cursor.mc_snum = 1;
7562 mx->mx_cursor.mc_top = 0;
7563 mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
7564 mx->mx_cursor.mc_pg[0] = fp;
7565 mx->mx_cursor.mc_ki[0] = 0;
7566 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
7567 mx->mx_db.md_flags = MDB_DUPFIXED;
7568 mx->mx_db.md_pad = fp->mp_pad;
7569 if (mc->mc_db->md_flags & MDB_INTEGERDUP)
7570 mx->mx_db.md_flags |= MDB_INTEGERKEY;
7571 }
7572 }
7573 DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi,
7574 mx->mx_db.md_root));
7575 mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA;
7576#if UINT_MAX < SIZE_MAX
7577 if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
7578 mx->mx_dbx.md_cmp = mdb_cmp_clong;
7579#endif
7580}
7581
7582
7583/** Fixup a sorted-dups cursor due to underlying update.
7584 * Sets up some fields that depend on the data from the main cursor.
7585 * Almost the same as init1, but skips initialization steps if the
7586 * xcursor had already been used.
7587 * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up.
7588 * @param[in] src_mx The xcursor of an up-to-date cursor.
7589 * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item.
7590 */
7591static void
7592mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata)
7593{
7594 MDB_xcursor *mx = mc->mc_xcursor;
7595
7596 if (new_dupdata) {
7597 mx->mx_cursor.mc_snum = 1;
7598 mx->mx_cursor.mc_top = 0;
7599 mx->mx_cursor.mc_flags |= C_INITIALIZED;
7600 mx->mx_cursor.mc_ki[0] = 0;
7601 mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA;
7602#if UINT_MAX < SIZE_MAX
7603 mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp;
7604#endif
7605 } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) {
7606 return;
7607 }
7608 mx->mx_db = src_mx->mx_db;
7609 mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0];
7610 DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi,
7611 mx->mx_db.md_root));
7612}
7613
7614/** Initialize a cursor for a given transaction and database. */
7615static void
7616mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx)
7617{
7618 mc->mc_next = NULL;
7619 mc->mc_backup = NULL;
7620 mc->mc_dbi = dbi;
7621 mc->mc_txn = txn;
7622 mc->mc_db = &txn->mt_dbs[dbi];
7623 mc->mc_dbx = &txn->mt_dbxs[dbi];
7624 mc->mc_dbflag = &txn->mt_dbflags[dbi];
7625 mc->mc_snum = 0;
7626 mc->mc_top = 0;
7627 mc->mc_pg[0] = 0;
7628 mc->mc_ki[0] = 0;
7629 mc->mc_flags = 0;
7630 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) {
7631 mdb_tassert(txn, mx != NULL);
7632 mc->mc_xcursor = mx;
7633 mdb_xcursor_init0(mc);
7634 } else {
7635 mc->mc_xcursor = NULL;
7636 }
7637 if (*mc->mc_dbflag & DB_STALE) {
7638 mdb_page_search(mc, NULL, MDB_PS_ROOTONLY);
7639 }
7640}
7641
7642int
7643mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret)
7644{
7645 MDB_cursor *mc;
7646 size_t size = sizeof(MDB_cursor);
7647
7648 if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID))
7649 return EINVAL;
7650
7651 if (txn->mt_flags & MDB_TXN_BLOCKED)
7652 return MDB_BAD_TXN;
7653
7654 if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
7655 return EINVAL;
7656
7657 if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)
7658 size += sizeof(MDB_xcursor);
7659
7660 if ((mc = malloc(size)) != NULL) {
7661 mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1));
7662 if (txn->mt_cursors) {
7663 mc->mc_next = txn->mt_cursors[dbi];
7664 txn->mt_cursors[dbi] = mc;
7665 mc->mc_flags |= C_UNTRACK;
7666 }
7667 } else {
7668 return ENOMEM;
7669 }
7670
7671 *ret = mc;
7672
7673 return MDB_SUCCESS;
7674}
7675
7676int
7677mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc)
7678{
7679 if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID))
7680 return EINVAL;
7681
7682 if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)
7683 return EINVAL;
7684
7685 if (txn->mt_flags & MDB_TXN_BLOCKED)
7686 return MDB_BAD_TXN;
7687
7688 mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor);
7689 return MDB_SUCCESS;
7690}
7691
7692/* Return the count of duplicate data items for the current key */
7693int
7694mdb_cursor_count(MDB_cursor *mc, size_t *countp)
7695{
7696 MDB_node *leaf;
7697
7698 if (mc == NULL || countp == NULL)
7699 return EINVAL;
7700
7701 if (mc->mc_xcursor == NULL)
7702 return MDB_INCOMPATIBLE;
7703
7704 if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)
7705 return MDB_BAD_TXN;
7706
7707 if (!(mc->mc_flags & C_INITIALIZED))
7708 return EINVAL;
7709
7710 if (!mc->mc_snum)
7711 return MDB_NOTFOUND;
7712
7713 if (mc->mc_flags & C_EOF) {
7714 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
7715 return MDB_NOTFOUND;
7716 mc->mc_flags ^= C_EOF;
7717 }
7718
7719 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
7720 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
7721 *countp = 1;
7722 } else {
7723 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
7724 return EINVAL;
7725
7726 *countp = mc->mc_xcursor->mx_db.md_entries;
7727 }
7728 return MDB_SUCCESS;
7729}
7730
7731void
7732mdb_cursor_close(MDB_cursor *mc)
7733{
7734 if (mc && !mc->mc_backup) {
7735 /* remove from txn, if tracked */
7736 if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) {
7737 MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi];
7738 while (*prev && *prev != mc) prev = &(*prev)->mc_next;
7739 if (*prev == mc)
7740 *prev = mc->mc_next;
7741 }
7742 free(mc);
7743 }
7744}
7745
7746MDB_txn *
7747mdb_cursor_txn(MDB_cursor *mc)
7748{
7749 if (!mc) return NULL;
7750 return mc->mc_txn;
7751}
7752
7753MDB_dbi
7754mdb_cursor_dbi(MDB_cursor *mc)
7755{
7756 return mc->mc_dbi;
7757}
7758
/** Replace the key for a branch node with a new key.
 * Set #MDB_TXN_ERROR on failure.
 * (This function does not set the flag itself; its only failure path
 * is the value returned by mdb_page_split().)
 * If the even-rounded key length changes, the node storage at and
 * below the node is shifted in place to make or reclaim room. If the
 * page cannot absorb a longer key, the node is deleted and re-added
 * through mdb_page_split() with #MDB_SPLIT_REPLACE.
 * @param[in] mc Cursor pointing to the node to operate on.
 * @param[in] key The new key to use.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_update_key(MDB_cursor *mc, MDB_val *key)
{
	MDB_page		*mp;
	MDB_node		*node;
	char			*base;
	size_t			 len;
	int				 delta, ksize, oksize;
	indx_t			 ptr, i, numkeys, indx;
	DKBUF;

	indx = mc->mc_ki[mc->mc_top];
	mp = mc->mc_pg[mc->mc_top];
	node = NODEPTR(mp, indx);
	ptr = mp->mp_ptrs[indx];
#if MDB_DEBUG
	{
		MDB_val	k2;
		char kbuf2[DKBUF_MAXKEYSIZE*2+1];
		k2.mv_data = NODEKEY(node);
		k2.mv_size = node->mn_ksize;
		DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u",
			indx, ptr,
			mdb_dkey(&k2, kbuf2),
			DKEY(key),
			mp->mp_pgno));
	}
#endif

	/* Sizes must be 2-byte aligned. */
	ksize = EVEN(key->mv_size);
	oksize = EVEN(node->mn_ksize);
	delta = ksize - oksize;	/* > 0: node grows, < 0: node shrinks */

	/* Shift node contents if EVEN(key length) changed. */
	if (delta) {
		if (delta > 0 && SIZELEFT(mp) < delta) {
			pgno_t pgno;
			/* not enough space left, do a delete and split */
			DPRINTF(("Not enough room, delta = %d, splitting...", delta));
			/* save the child page number before the node is deleted */
			pgno = NODEPGNO(node);
			mdb_node_del(mc, 0);
			return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE);
		}

		/* Every node at or below this node's offset moves by delta */
		numkeys = NUMKEYS(mp);
		for (i = 0; i < numkeys; i++) {
			if (mp->mp_ptrs[i] <= ptr)
				mp->mp_ptrs[i] -= delta;
		}

		/* Move everything from mp_upper through this node's header
		 * (NODESIZE bytes); the key bytes are rewritten below.
		 */
		base = (char *)mp + mp->mp_upper + PAGEBASE;
		len = ptr - mp->mp_upper + NODESIZE;
		memmove(base - delta, base, len);
		mp->mp_upper -= delta;

		/* the node has moved, look it up again */
		node = NODEPTR(mp, indx);
	}

	/* But even if no shift was needed, update ksize */
	if (node->mn_ksize != key->mv_size)
		node->mn_ksize = key->mv_size;

	if (key->mv_size)
		memcpy(NODEKEY(node), key->mv_data, key->mv_size);

	return MDB_SUCCESS;
}
7833
/* Forward declaration: mdb_node_move() below uses this to copy a
 * cursor into a stack temporary. The definition is not in this part
 * of the file.
 */
static void
mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst);
7836
/** Perform \b act while tracking temporary cursor \b mn.
 * The cursor is pushed onto the head of its transaction's cursor list
 * for its dbi, \b act is executed, and the list head is then restored.
 * When \b mn is a sub-cursor (#C_SUB), a stack dummy whose mc_xcursor
 * points at \b mn is linked in its place, so that code walking the
 * list and following mc_xcursor reaches \b mn.
 * @param mn A cursor object (not a pointer). It is evaluated more than
 *	once, so it must have no side effects; it is parenthesized at
 *	every use, so an expression such as <tt>*ptr</tt> is accepted.
 * @param act The statement(s) to run while \b mn is tracked. It must
 *	not leave the macro early (return, goto, or a break/continue
 *	aimed at an enclosing loop), or the list head is left pointing
 *	at the temporary.
 */
#define WITH_CURSOR_TRACKING(mn, act) do { \
	MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[(mn).mc_dbi]; \
	if ((mn).mc_flags & C_SUB) { \
		dummy.mc_flags =  C_INITIALIZED; \
		dummy.mc_xcursor = (MDB_xcursor *)&(mn);	\
		tracked = &dummy; \
	} else { \
		tracked = &(mn); \
	} \
	tracked->mc_next = *tp; \
	*tp = tracked; \
	{ act; } \
	*tp = tracked->mc_next; \
} while (0)
7852
/** Move a node from csrc to cdst.
 * The node at \b csrc's current index is added to \b cdst's page at
 * \b cdst's current index and then deleted from \b csrc's page. Other
 * cursors tracked by the transaction for this dbi are adjusted, and
 * the separator keys in the parent page(s) are updated when the first
 * node of either page changed.
 * @param[in] csrc Cursor pointing to the node to move.
 * @param[in] cdst Cursor pointing to the slot that receives the node.
 * @param[in] fromleft Selects which fixup is applied to other cursors:
 *	non-zero runs the "adding on the left, bump others up" path,
 *	zero the "adding on the right, bump others down" path.
 * @return 0 on success, non-zero on failure (the error of the failing
 *	helper is returned unchanged).
 */
static int
mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft)
{
	MDB_node		*srcnode;
	MDB_val		 key, data;
	pgno_t	srcpg;
	MDB_cursor mn;
	int			 rc;
	unsigned short flags;

	DKBUF;

	/* Mark src and dst as dirty. */
	if ((rc = mdb_page_touch(csrc)) ||
	    (rc = mdb_page_touch(cdst)))
		return rc;

	/* Collect key, data, child page number and flags of the node to move */
	if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
		/* fixed-size key only: no data, no page number, no flags */
		key.mv_size = csrc->mc_db->md_pad;
		key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
		data.mv_size = 0;
		data.mv_data = NULL;
		srcpg = 0;
		flags = 0;
	} else {
		srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
		mdb_cassert(csrc, !((size_t)srcnode & 1));
		srcpg = NODEPGNO(srcnode);
		flags = srcnode->mn_flags;
		if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
			unsigned int snum = csrc->mc_snum;
			MDB_node *s2;
			/* must find the lowest key below src */
			rc = mdb_page_search_lowest(csrc);
			if (rc)
				return rc;
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_size = csrc->mc_db->md_pad;
				key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
			} else {
				s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
				key.mv_size = NODEKSZ(s2);
				key.mv_data = NODEKEY(s2);
			}
			/* restore the stack depth saved before the search:
			 * mc_snum = snum, mc_top = snum - 1
			 */
			csrc->mc_snum = snum--;
			csrc->mc_top = snum;
		} else {
			key.mv_size = NODEKSZ(srcnode);
			key.mv_data = NODEKEY(srcnode);
		}
		data.mv_size = NODEDSZ(srcnode);
		data.mv_data = NODEDATA(srcnode);
	}
	mn.mc_xcursor = NULL;
	if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
		unsigned int snum = cdst->mc_snum;
		MDB_node *s2;
		MDB_val bkey;
		/* must find the lowest key below dst */
		mdb_cursor_copy(cdst, &mn);
		rc = mdb_page_search_lowest(&mn);
		if (rc)
			return rc;
		if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
			bkey.mv_size = mn.mc_db->md_pad;
			bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size);
		} else {
			s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
			bkey.mv_size = NODEKSZ(s2);
			bkey.mv_data = NODEKEY(s2);
		}
		/* back at dst's level, on slot 0: store the key found above
		 * in dst's current first node
		 */
		mn.mc_snum = snum--;
		mn.mc_top = snum;
		mn.mc_ki[snum] = 0;
		rc = mdb_update_key(&mn, &bkey);
		if (rc)
			return rc;
	}

	DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u",
	    IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
	    csrc->mc_ki[csrc->mc_top],
		DKEY(&key),
	    csrc->mc_pg[csrc->mc_top]->mp_pgno,
	    cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno));

	/* Add the node to the destination page.
	 */
	rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
	if (rc != MDB_SUCCESS)
		return rc;

	/* Delete the node from the source page.
	 */
	mdb_node_del(csrc, key.mv_size);

	{
		/* Adjust other cursors pointing to mp */
		MDB_cursor *m2, *m3;
		MDB_dbi dbi = csrc->mc_dbi;
		MDB_page *mpd, *mps;

		mps = csrc->mc_pg[csrc->mc_top];
		/* If we're adding on the left, bump others up */
		if (fromleft) {
			mpd = cdst->mc_pg[csrc->mc_top];
			for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
				/* when working on a sub-DB, fix up the tracked sub-cursors */
				if (csrc->mc_flags & C_SUB)
					m3 = &m2->mc_xcursor->mx_cursor;
				else
					m3 = m2;
				if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
					continue;
				/* on the destination page at or after the insert slot */
				if (m3 != cdst &&
					m3->mc_pg[csrc->mc_top] == mpd &&
					m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) {
					m3->mc_ki[csrc->mc_top]++;
				}
				/* was on the moved node: follow it to the destination */
				if (m3 !=csrc &&
					m3->mc_pg[csrc->mc_top] == mps &&
					m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) {
					m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
					m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
					m3->mc_ki[csrc->mc_top-1]++;
				}
				if (IS_LEAF(mps))
					XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
			}
		} else
		/* Adding on the right, bump others down */
		{
			for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
				if (csrc->mc_flags & C_SUB)
					m3 = &m2->mc_xcursor->mx_cursor;
				else
					m3 = m2;
				if (m3 == csrc) continue;
				if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
					continue;
				if (m3->mc_pg[csrc->mc_top] == mps) {
					if (!m3->mc_ki[csrc->mc_top]) {
						/* was on the moved node (slot 0): follow it */
						m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
						m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
						m3->mc_ki[csrc->mc_top-1]--;
					} else {
						/* slot 0 is gone from the source page */
						m3->mc_ki[csrc->mc_top]--;
					}
					if (IS_LEAF(mps))
						XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
				}
			}
		}
	}

	/* Update the parent separators.
	 */
	if (csrc->mc_ki[csrc->mc_top] == 0) {
		/* slot 0 of the parent is left alone here */
		if (csrc->mc_ki[csrc->mc_top-1] != 0) {
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
			} else {
				srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
				key.mv_size = NODEKSZ(srcnode);
				key.mv_data = NODEKEY(srcnode);
			}
			DPRINTF(("update separator for source page %"Z"u to [%s]",
				csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)));
			/* work on a copy of the cursor, moved up to the parent level */
			mdb_cursor_copy(csrc, &mn);
			mn.mc_snum--;
			mn.mc_top--;
			/* We want mdb_rebalance to find mn when doing fixups */
			WITH_CURSOR_TRACKING(mn,
				rc = mdb_update_key(&mn, &key));
			if (rc)
				return rc;
		}
		if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
			/* give the new first node of a branch page an empty key;
			 * the cursor index is set to 0 for the call and restored
			 */
			MDB_val	 nullkey;
			indx_t	ix = csrc->mc_ki[csrc->mc_top];
			nullkey.mv_size = 0;
			csrc->mc_ki[csrc->mc_top] = 0;
			rc = mdb_update_key(csrc, &nullkey);
			csrc->mc_ki[csrc->mc_top] = ix;
			mdb_cassert(csrc, rc == MDB_SUCCESS);
		}
	}

	if (cdst->mc_ki[cdst->mc_top] == 0) {
		if (cdst->mc_ki[cdst->mc_top-1] != 0) {
			/* note: the LEAF2 test is made on the source page while
			 * the key is read from the destination page
			 */
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
			} else {
				srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
				key.mv_size = NODEKSZ(srcnode);
				key.mv_data = NODEKEY(srcnode);
			}
			DPRINTF(("update separator for destination page %"Z"u to [%s]",
				cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)));
			mdb_cursor_copy(cdst, &mn);
			mn.mc_snum--;
			mn.mc_top--;
			/* We want mdb_rebalance to find mn when doing fixups */
			WITH_CURSOR_TRACKING(mn,
				rc = mdb_update_key(&mn, &key));
			if (rc)
				return rc;
		}
		if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
			/* same empty-key treatment for the destination's first node */
			MDB_val	 nullkey;
			indx_t	ix = cdst->mc_ki[cdst->mc_top];
			nullkey.mv_size = 0;
			cdst->mc_ki[cdst->mc_top] = 0;
			rc = mdb_update_key(cdst, &nullkey);
			cdst->mc_ki[cdst->mc_top] = ix;
			mdb_cassert(cdst, rc == MDB_SUCCESS);
		}
	}

	return MDB_SUCCESS;
}
8075
8076/** Merge one page into another.
8077 * The nodes from the page pointed to by \b csrc will
8078 * be copied to the page pointed to by \b cdst and then
8079 * the \b csrc page will be freed.
8080 * @param[in] csrc Cursor pointing to the source page.
8081 * @param[in] cdst Cursor pointing to the destination page.
8082 * @return 0 on success, non-zero on failure.
8083 */
8084static int
8085mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
8086{
8087 MDB_page *psrc, *pdst;
8088 MDB_node *srcnode;
8089 MDB_val key, data;
8090 unsigned nkeys;
8091 int rc;
8092 indx_t i, j;
8093
8094 psrc = csrc->mc_pg[csrc->mc_top];
8095 pdst = cdst->mc_pg[cdst->mc_top];
8096
8097 DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno));
8098
8099 mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */
8100 mdb_cassert(csrc, cdst->mc_snum > 1);
8101
8102 /* Mark dst as dirty. */
8103 if ((rc = mdb_page_touch(cdst)))
8104 return rc;
8105
8106 /* get dst page again now that we've touched it. */
8107 pdst = cdst->mc_pg[cdst->mc_top];
8108
8109 /* Move all nodes from src to dst.
8110 */
8111 j = nkeys = NUMKEYS(pdst);
8112 if (IS_LEAF2(psrc)) {
8113 key.mv_size = csrc->mc_db->md_pad;
8114 key.mv_data = METADATA(psrc);
8115 for (i = 0; i < NUMKEYS(psrc); i++, j++) {
8116 rc = mdb_node_add(cdst, j, &key, NULL, 0, 0);
8117 if (rc != MDB_SUCCESS)
8118 return rc;
8119 key.mv_data = (char *)key.mv_data + key.mv_size;
8120 }
8121 } else {
8122 for (i = 0; i < NUMKEYS(psrc); i++, j++) {
8123 srcnode = NODEPTR(psrc, i);
8124 if (i == 0 && IS_BRANCH(psrc)) {
8125 MDB_cursor mn;
8126 MDB_node *s2;
8127 mdb_cursor_copy(csrc, &mn);
8128 mn.mc_xcursor = NULL;
8129 /* must find the lowest key below src */
8130 rc = mdb_page_search_lowest(&mn);
8131 if (rc)
8132 return rc;
8133 if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
8134 key.mv_size = mn.mc_db->md_pad;
8135 key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size);
8136 } else {
8137 s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
8138 key.mv_size = NODEKSZ(s2);
8139 key.mv_data = NODEKEY(s2);
8140 }
8141 } else {
8142 key.mv_size = srcnode->mn_ksize;
8143 key.mv_data = NODEKEY(srcnode);
8144 }
8145
8146 data.mv_size = NODEDSZ(srcnode);
8147 data.mv_data = NODEDATA(srcnode);
8148 rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
8149 if (rc != MDB_SUCCESS)
8150 return rc;
8151 }
8152 }
8153
8154 DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)",
8155 pdst->mp_pgno, NUMKEYS(pdst),
8156 (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10));
8157
8158 /* Unlink the src page from parent and add to free list.
8159 */
8160 csrc->mc_top--;
8161 mdb_node_del(csrc, 0);
8162 if (csrc->mc_ki[csrc->mc_top] == 0) {
8163 key.mv_size = 0;
8164 rc = mdb_update_key(csrc, &key);
8165 if (rc) {
8166 csrc->mc_top++;
8167 return rc;
8168 }
8169 }
8170 csrc->mc_top++;
8171
8172 psrc = csrc->mc_pg[csrc->mc_top];
8173 /* If not operating on FreeDB, allow this page to be reused
8174 * in this txn. Otherwise just add to free list.
8175 */
8176 rc = mdb_page_loose(csrc, psrc);
8177 if (rc)
8178 return rc;
8179 if (IS_LEAF(psrc))
8180 csrc->mc_db->md_leaf_pages--;
8181 else
8182 csrc->mc_db->md_branch_pages--;
8183 {
8184 /* Adjust other cursors pointing to mp */
8185 MDB_cursor *m2, *m3;
8186 MDB_dbi dbi = csrc->mc_dbi;
8187 unsigned int top = csrc->mc_top;
8188
8189 for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
8190 if (csrc->mc_flags & C_SUB)
8191 m3 = &m2->mc_xcursor->mx_cursor;
8192 else
8193 m3 = m2;
8194 if (m3 == csrc) continue;
8195 if (m3->mc_snum < csrc->mc_snum) continue;
8196 if (m3->mc_pg[top] == psrc) {
8197 m3->mc_pg[top] = pdst;
8198 m3->mc_ki[top] += nkeys;
8199 m3->mc_ki[top-1] = cdst->mc_ki[top-1];
8200 } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] &&
8201 m3->mc_ki[top-1] > csrc->mc_ki[top-1]) {
8202 m3->mc_ki[top-1]--;
8203 }
8204 if (IS_LEAF(psrc))
8205 XCURSOR_REFRESH(m3, top, m3->mc_pg[top]);
8206 }
8207 }
8208 {
8209 unsigned int snum = cdst->mc_snum;
8210 uint16_t depth = cdst->mc_db->md_depth;
8211 mdb_cursor_pop(cdst);
8212 rc = mdb_rebalance(cdst);
8213 /* Did the tree height change? */
8214 if (depth != cdst->mc_db->md_depth)
8215 snum += cdst->mc_db->md_depth - depth;
8216 cdst->mc_snum = snum;
8217 cdst->mc_top = snum-1;
8218 }
8219 return rc;
8220}
8221
8222/** Copy the contents of a cursor.
8223 * @param[in] csrc The cursor to copy from.
8224 * @param[out] cdst The cursor to copy to.
8225 */
8226static void
8227mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst)
8228{
8229 unsigned int i;
8230
8231 cdst->mc_txn = csrc->mc_txn;
8232 cdst->mc_dbi = csrc->mc_dbi;
8233 cdst->mc_db = csrc->mc_db;
8234 cdst->mc_dbx = csrc->mc_dbx;
8235 cdst->mc_snum = csrc->mc_snum;
8236 cdst->mc_top = csrc->mc_top;
8237 cdst->mc_flags = csrc->mc_flags;
8238
8239 for (i=0; i<csrc->mc_snum; i++) {
8240 cdst->mc_pg[i] = csrc->mc_pg[i];
8241 cdst->mc_ki[i] = csrc->mc_ki[i];
8242 }
8243}
8244
8245/** Rebalance the tree after a delete operation.
8246 * @param[in] mc Cursor pointing to the page where rebalancing
8247 * should begin.
8248 * @return 0 on success, non-zero on failure.
8249 */
8250static int
8251mdb_rebalance(MDB_cursor *mc)
8252{
8253 MDB_node *node;
8254 int rc, fromleft;
8255 unsigned int ptop, minkeys, thresh;
8256 MDB_cursor mn;
8257 indx_t oldki;
8258
8259 if (IS_BRANCH(mc->mc_pg[mc->mc_top])) {
8260 minkeys = 2;
8261 thresh = 1;
8262 } else {
8263 minkeys = 1;
8264 thresh = FILL_THRESHOLD;
8265 }
8266 DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)",
8267 IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch",
8268 mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]),
8269 (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10));
8270
8271 if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh &&
8272 NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) {
8273 DPRINTF(("no need to rebalance page %"Z"u, above fill threshold",
8274 mdb_dbg_pgno(mc->mc_pg[mc->mc_top])));
8275 return MDB_SUCCESS;
8276 }
8277
8278 if (mc->mc_snum < 2) {
8279 MDB_page *mp = mc->mc_pg[0];
8280 if (IS_SUBP(mp)) {
8281 DPUTS("Can't rebalance a subpage, ignoring");
8282 return MDB_SUCCESS;
8283 }
8284 if (NUMKEYS(mp) == 0) {
8285 DPUTS("tree is completely empty");
8286 mc->mc_db->md_root = P_INVALID;
8287 mc->mc_db->md_depth = 0;
8288 mc->mc_db->md_leaf_pages = 0;
8289 rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
8290 if (rc)
8291 return rc;
8292 /* Adjust cursors pointing to mp */
8293 mc->mc_snum = 0;
8294 mc->mc_top = 0;
8295 mc->mc_flags &= ~C_INITIALIZED;
8296 {
8297 MDB_cursor *m2, *m3;
8298 MDB_dbi dbi = mc->mc_dbi;
8299
8300 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
8301 if (mc->mc_flags & C_SUB)
8302 m3 = &m2->mc_xcursor->mx_cursor;
8303 else
8304 m3 = m2;
8305 if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum))
8306 continue;
8307 if (m3->mc_pg[0] == mp) {
8308 m3->mc_snum = 0;
8309 m3->mc_top = 0;
8310 m3->mc_flags &= ~C_INITIALIZED;
8311 }
8312 }
8313 }
8314 } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
8315 int i;
8316 DPUTS("collapsing root page!");
8317 rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
8318 if (rc)
8319 return rc;
8320 mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0));
8321 rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL);
8322 if (rc)
8323 return rc;
8324 mc->mc_db->md_depth--;
8325 mc->mc_db->md_branch_pages--;
8326 mc->mc_ki[0] = mc->mc_ki[1];
8327 for (i = 1; i<mc->mc_db->md_depth; i++) {
8328 mc->mc_pg[i] = mc->mc_pg[i+1];
8329 mc->mc_ki[i] = mc->mc_ki[i+1];
8330 }
8331 {
8332 /* Adjust other cursors pointing to mp */
8333 MDB_cursor *m2, *m3;
8334 MDB_dbi dbi = mc->mc_dbi;
8335
8336 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
8337 if (mc->mc_flags & C_SUB)
8338 m3 = &m2->mc_xcursor->mx_cursor;
8339 else
8340 m3 = m2;
8341 if (m3 == mc) continue;
8342 if (!(m3->mc_flags & C_INITIALIZED))
8343 continue;
8344 if (m3->mc_pg[0] == mp) {
8345 for (i=0; i<mc->mc_db->md_depth; i++) {
8346 m3->mc_pg[i] = m3->mc_pg[i+1];
8347 m3->mc_ki[i] = m3->mc_ki[i+1];
8348 }
8349 m3->mc_snum--;
8350 m3->mc_top--;
8351 }
8352 }
8353 }
8354 } else
8355 DPUTS("root page doesn't need rebalancing");
8356 return MDB_SUCCESS;
8357 }
8358
8359 /* The parent (branch page) must have at least 2 pointers,
8360 * otherwise the tree is invalid.
8361 */
8362 ptop = mc->mc_top-1;
8363 mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1);
8364
8365 /* Leaf page fill factor is below the threshold.
8366 * Try to move keys from left or right neighbor, or
8367 * merge with a neighbor page.
8368 */
8369
8370 /* Find neighbors.
8371 */
8372 mdb_cursor_copy(mc, &mn);
8373 mn.mc_xcursor = NULL;
8374
8375 oldki = mc->mc_ki[mc->mc_top];
8376 if (mc->mc_ki[ptop] == 0) {
8377 /* We're the leftmost leaf in our parent.
8378 */
8379 DPUTS("reading right neighbor");
8380 mn.mc_ki[ptop]++;
8381 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
8382 rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL);
8383 if (rc)
8384 return rc;
8385 mn.mc_ki[mn.mc_top] = 0;
8386 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
8387 fromleft = 0;
8388 } else {
8389 /* There is at least one neighbor to the left.
8390 */
8391 DPUTS("reading left neighbor");
8392 mn.mc_ki[ptop]--;
8393 node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
8394 rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL);
8395 if (rc)
8396 return rc;
8397 mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
8398 mc->mc_ki[mc->mc_top] = 0;
8399 fromleft = 1;
8400 }
8401
8402 DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)",
8403 mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]),
8404 (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10));
8405
8406 /* If the neighbor page is above threshold and has enough keys,
8407 * move one key from it. Otherwise we should try to merge them.
8408 * (A branch page must never have less than 2 keys.)
8409 */
8410 if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) {
8411 rc = mdb_node_move(&mn, mc, fromleft);
8412 if (fromleft) {
8413 /* if we inserted on left, bump position up */
8414 oldki++;
8415 }
8416 } else {
8417 if (!fromleft) {
8418 rc = mdb_page_merge(&mn, mc);
8419 } else {
8420 oldki += NUMKEYS(mn.mc_pg[mn.mc_top]);
8421 mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
8422 /* We want mdb_rebalance to find mn when doing fixups */
8423 WITH_CURSOR_TRACKING(mn,
8424 rc = mdb_page_merge(mc, &mn));
8425 mdb_cursor_copy(&mn, mc);
8426 }
8427 mc->mc_flags &= ~C_EOF;
8428 }
8429 mc->mc_ki[mc->mc_top] = oldki;
8430 return rc;
8431}
8432
8433/** Complete a delete operation started by #mdb_cursor_del(). */
8434static int
8435mdb_cursor_del0(MDB_cursor *mc)
8436{
8437 int rc;
8438 MDB_page *mp;
8439 indx_t ki;
8440 unsigned int nkeys;
8441 MDB_cursor *m2, *m3;
8442 MDB_dbi dbi = mc->mc_dbi;
8443
8444 ki = mc->mc_ki[mc->mc_top];
8445 mp = mc->mc_pg[mc->mc_top];
8446 mdb_node_del(mc, mc->mc_db->md_pad);
8447 mc->mc_db->md_entries--;
8448 {
8449 /* Adjust other cursors pointing to mp */
8450 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
8451 m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
8452 if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED))
8453 continue;
8454 if (m3 == mc || m3->mc_snum < mc->mc_snum)
8455 continue;
8456 if (m3->mc_pg[mc->mc_top] == mp) {
8457 if (m3->mc_ki[mc->mc_top] == ki) {
8458 m3->mc_flags |= C_DEL;
8459 if (mc->mc_db->md_flags & MDB_DUPSORT) {
8460 /* Sub-cursor referred into dataset which is gone */
8461 m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
8462 }
8463 continue;
8464 } else if (m3->mc_ki[mc->mc_top] > ki) {
8465 m3->mc_ki[mc->mc_top]--;
8466 }
8467 XCURSOR_REFRESH(m3, mc->mc_top, mp);
8468 }
8469 }
8470 }
8471 rc = mdb_rebalance(mc);
8472 if (rc)
8473 goto fail;
8474
8475 /* DB is totally empty now, just bail out.
8476 * Other cursors adjustments were already done
8477 * by mdb_rebalance and aren't needed here.
8478 */
8479 if (!mc->mc_snum) {
8480 mc->mc_flags |= C_EOF;
8481 return rc;
8482 }
8483
8484 mp = mc->mc_pg[mc->mc_top];
8485 nkeys = NUMKEYS(mp);
8486
8487 /* Adjust other cursors pointing to mp */
8488 for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) {
8489 m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
8490 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
8491 continue;
8492 if (m3->mc_snum < mc->mc_snum)
8493 continue;
8494 if (m3->mc_pg[mc->mc_top] == mp) {
8495 if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) {
8496 /* if m3 points past last node in page, find next sibling */
8497 if (m3->mc_ki[mc->mc_top] >= nkeys) {
8498 rc = mdb_cursor_sibling(m3, 1);
8499 if (rc == MDB_NOTFOUND) {
8500 m3->mc_flags |= C_EOF;
8501 rc = MDB_SUCCESS;
8502 continue;
8503 }
8504 if (rc)
8505 goto fail;
8506 }
8507 if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) {
8508 MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]);
8509 /* If this node has dupdata, it may need to be reinited
8510 * because its data has moved.
8511 * If the xcursor was not initd it must be reinited.
8512 * Else if node points to a subDB, nothing is needed.
8513 * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset.
8514 */
8515 if (node->mn_flags & F_DUPDATA) {
8516 if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
8517 if (!(node->mn_flags & F_SUBDATA))
8518 m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
8519 } else {
8520 mdb_xcursor_init1(m3, node);
8521 rc = mdb_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL);
8522 if (rc)
8523 goto fail;
8524 }
8525 }
8526 m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL;
8527 }
8528 }
8529 }
8530 }
8531 mc->mc_flags |= C_DEL;
8532
8533fail:
8534 if (rc)
8535 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
8536 return rc;
8537}
8538
8539int
8540mdb_del(MDB_txn *txn, MDB_dbi dbi,
8541 MDB_val *key, MDB_val *data)
8542{
8543 if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
8544 return EINVAL;
8545
8546 if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))
8547 return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
8548
8549 if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) {
8550 /* must ignore any data */
8551 data = NULL;
8552 }
8553
8554 return mdb_del0(txn, dbi, key, data, 0);
8555}
8556
8557static int
8558mdb_del0(MDB_txn *txn, MDB_dbi dbi,
8559 MDB_val *key, MDB_val *data, unsigned flags)
8560{
8561 MDB_cursor mc;
8562 MDB_xcursor mx;
8563 MDB_cursor_op op;
8564 MDB_val rdata, *xdata;
8565 int rc, exact = 0;
8566 DKBUF;
8567
8568 DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key)));
8569
8570 mdb_cursor_init(&mc, txn, dbi, &mx);
8571
8572 if (data) {
8573 op = MDB_GET_BOTH;
8574 rdata = *data;
8575 xdata = &rdata;
8576 } else {
8577 op = MDB_SET;
8578 xdata = NULL;
8579 flags |= MDB_NODUPDATA;
8580 }
8581 rc = mdb_cursor_set(&mc, key, xdata, op, &exact);
8582 if (rc == 0) {
8583 /* let mdb_page_split know about this cursor if needed:
8584 * delete will trigger a rebalance; if it needs to move
8585 * a node from one page to another, it will have to
8586 * update the parent's separator key(s). If the new sepkey
8587 * is larger than the current one, the parent page may
8588 * run out of space, triggering a split. We need this
8589 * cursor to be consistent until the end of the rebalance.
8590 */
8591 mc.mc_flags |= C_UNTRACK;
8592 mc.mc_next = txn->mt_cursors[dbi];
8593 txn->mt_cursors[dbi] = &mc;
8594 rc = mdb_cursor_del(&mc, flags);
8595 txn->mt_cursors[dbi] = mc.mc_next;
8596 }
8597 return rc;
8598}
8599
8600/** Split a page and insert a new node.
8601 * Set #MDB_TXN_ERROR on failure.
8602 * @param[in,out] mc Cursor pointing to the page and desired insertion index.
8603 * The cursor will be updated to point to the actual page and index where
8604 * the node got inserted after the split.
8605 * @param[in] newkey The key for the newly inserted node.
8606 * @param[in] newdata The data for the newly inserted node.
8607 * @param[in] newpgno The page number, if the new node is a branch node.
8608 * @param[in] nflags The #NODE_ADD_FLAGS for the new node.
8609 * @return 0 on success, non-zero on failure.
8610 */
8611static int
8612mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno,
8613 unsigned int nflags)
8614{
8615 unsigned int flags;
8616 int rc = MDB_SUCCESS, new_root = 0, did_split = 0;
8617 indx_t newindx;
8618 pgno_t pgno = 0;
8619 int i, j, split_indx, nkeys, pmax;
8620 MDB_env *env = mc->mc_txn->mt_env;
8621 MDB_node *node;
8622 MDB_val sepkey, rkey, xdata, *rdata = &xdata;
8623 MDB_page *copy = NULL;
8624 MDB_page *mp, *rp, *pp;
8625 int ptop;
8626 MDB_cursor mn;
8627 DKBUF;
8628
8629 mp = mc->mc_pg[mc->mc_top];
8630 newindx = mc->mc_ki[mc->mc_top];
8631 nkeys = NUMKEYS(mp);
8632
8633 DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i",
8634 IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
8635 DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys));
8636
8637 /* Create a right sibling. */
8638 if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
8639 return rc;
8640 rp->mp_pad = mp->mp_pad;
8641 DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno));
8642
8643 /* Usually when splitting the root page, the cursor
8644 * height is 1. But when called from mdb_update_key,
8645 * the cursor height may be greater because it walks
8646 * up the stack while finding the branch slot to update.
8647 */
8648 if (mc->mc_top < 1) {
8649 if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp)))
8650 goto done;
8651 /* shift current top to make room for new parent */
8652 for (i=mc->mc_snum; i>0; i--) {
8653 mc->mc_pg[i] = mc->mc_pg[i-1];
8654 mc->mc_ki[i] = mc->mc_ki[i-1];
8655 }
8656 mc->mc_pg[0] = pp;
8657 mc->mc_ki[0] = 0;
8658 mc->mc_db->md_root = pp->mp_pgno;
8659 DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno));
8660 new_root = mc->mc_db->md_depth++;
8661
8662 /* Add left (implicit) pointer. */
8663 if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) {
8664 /* undo the pre-push */
8665 mc->mc_pg[0] = mc->mc_pg[1];
8666 mc->mc_ki[0] = mc->mc_ki[1];
8667 mc->mc_db->md_root = mp->mp_pgno;
8668 mc->mc_db->md_depth--;
8669 goto done;
8670 }
8671 mc->mc_snum++;
8672 mc->mc_top++;
8673 ptop = 0;
8674 } else {
8675 ptop = mc->mc_top-1;
8676 DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno));
8677 }
8678
8679 mdb_cursor_copy(mc, &mn);
8680 mn.mc_xcursor = NULL;
8681 mn.mc_pg[mn.mc_top] = rp;
8682 mn.mc_ki[ptop] = mc->mc_ki[ptop]+1;
8683
8684 if (nflags & MDB_APPEND) {
8685 mn.mc_ki[mn.mc_top] = 0;
8686 sepkey = *newkey;
8687 split_indx = newindx;
8688 nkeys = 0;
8689 } else {
8690
8691 split_indx = (nkeys+1) / 2;
8692
8693 if (IS_LEAF2(rp)) {
8694 char *split, *ins;
8695 int x;
8696 unsigned int lsize, rsize, ksize;
8697 /* Move half of the keys to the right sibling */
8698 x = mc->mc_ki[mc->mc_top] - split_indx;
8699 ksize = mc->mc_db->md_pad;
8700 split = LEAF2KEY(mp, split_indx, ksize);
8701 rsize = (nkeys - split_indx) * ksize;
8702 lsize = (nkeys - split_indx) * sizeof(indx_t);
8703 mp->mp_lower -= lsize;
8704 rp->mp_lower += lsize;
8705 mp->mp_upper += rsize - lsize;
8706 rp->mp_upper -= rsize - lsize;
8707 sepkey.mv_size = ksize;
8708 if (newindx == split_indx) {
8709 sepkey.mv_data = newkey->mv_data;
8710 } else {
8711 sepkey.mv_data = split;
8712 }
8713 if (x<0) {
8714 ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
8715 memcpy(rp->mp_ptrs, split, rsize);
8716 sepkey.mv_data = rp->mp_ptrs;
8717 memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
8718 memcpy(ins, newkey->mv_data, ksize);
8719 mp->mp_lower += sizeof(indx_t);
8720 mp->mp_upper -= ksize - sizeof(indx_t);
8721 } else {
8722 if (x)
8723 memcpy(rp->mp_ptrs, split, x * ksize);
8724 ins = LEAF2KEY(rp, x, ksize);
8725 memcpy(ins, newkey->mv_data, ksize);
8726 memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
8727 rp->mp_lower += sizeof(indx_t);
8728 rp->mp_upper -= ksize - sizeof(indx_t);
8729 mc->mc_ki[mc->mc_top] = x;
8730 }
8731 } else {
8732 int psize, nsize, k;
8733 /* Maximum free space in an empty page */
8734 pmax = env->me_psize - PAGEHDRSZ;
8735 if (IS_LEAF(mp))
8736 nsize = mdb_leaf_size(env, newkey, newdata);
8737 else
8738 nsize = mdb_branch_size(env, newkey);
8739 nsize = EVEN(nsize);
8740
8741 /* grab a page to hold a temporary copy */
8742 copy = mdb_page_malloc(mc->mc_txn, 1);
8743 if (copy == NULL) {
8744 rc = ENOMEM;
8745 goto done;
8746 }
8747 copy->mp_pgno = mp->mp_pgno;
8748 copy->mp_flags = mp->mp_flags;
8749 copy->mp_lower = (PAGEHDRSZ-PAGEBASE);
8750 copy->mp_upper = env->me_psize - PAGEBASE;
8751
8752 /* prepare to insert */
8753 for (i=0, j=0; i<nkeys; i++) {
8754 if (i == newindx) {
8755 copy->mp_ptrs[j++] = 0;
8756 }
8757 copy->mp_ptrs[j++] = mp->mp_ptrs[i];
8758 }
8759
8760 /* When items are relatively large the split point needs
8761 * to be checked, because being off-by-one will make the
8762 * difference between success or failure in mdb_node_add.
8763 *
8764 * It's also relevant if a page happens to be laid out
8765 * such that one half of its nodes are all "small" and
8766 * the other half of its nodes are "large." If the new
8767 * item is also "large" and falls on the half with
8768 * "large" nodes, it also may not fit.
8769 *
8770 * As a final tweak, if the new item goes on the last
8771 * spot on the page (and thus, onto the new page), bias
8772 * the split so the new page is emptier than the old page.
8773 * This yields better packing during sequential inserts.
8774 */
8775 if (nkeys < 32 || nsize > pmax/16 || newindx >= nkeys) {
8776 /* Find split point */
8777 psize = 0;
8778 if (newindx <= split_indx || newindx >= nkeys) {
8779 i = 0; j = 1;
8780 k = newindx >= nkeys ? nkeys : split_indx+1+IS_LEAF(mp);
8781 } else {
8782 i = nkeys; j = -1;
8783 k = split_indx-1;
8784 }
8785 for (; i!=k; i+=j) {
8786 if (i == newindx) {
8787 psize += nsize;
8788 node = NULL;
8789 } else {
8790 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
8791 psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
8792 if (IS_LEAF(mp)) {
8793 if (F_ISSET(node->mn_flags, F_BIGDATA))
8794 psize += sizeof(pgno_t);
8795 else
8796 psize += NODEDSZ(node);
8797 }
8798 psize = EVEN(psize);
8799 }
8800 if (psize > pmax || i == k-j) {
8801 split_indx = i + (j<0);
8802 break;
8803 }
8804 }
8805 }
8806 if (split_indx == newindx) {
8807 sepkey.mv_size = newkey->mv_size;
8808 sepkey.mv_data = newkey->mv_data;
8809 } else {
8810 node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE);
8811 sepkey.mv_size = node->mn_ksize;
8812 sepkey.mv_data = NODEKEY(node);
8813 }
8814 }
8815 }
8816
8817 DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
8818
8819 /* Copy separator key to the parent.
8820 */
8821 if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) {
8822 int snum = mc->mc_snum;
8823 mn.mc_snum--;
8824 mn.mc_top--;
8825 did_split = 1;
8826 /* We want other splits to find mn when doing fixups */
8827 WITH_CURSOR_TRACKING(mn,
8828 rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0));
8829 if (rc)
8830 goto done;
8831
8832 /* root split? */
8833 if (mc->mc_snum > snum) {
8834 ptop++;
8835 }
8836 /* Right page might now have changed parent.
8837 * Check if left page also changed parent.
8838 */
8839 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
8840 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
8841 for (i=0; i<ptop; i++) {
8842 mc->mc_pg[i] = mn.mc_pg[i];
8843 mc->mc_ki[i] = mn.mc_ki[i];
8844 }
8845 mc->mc_pg[ptop] = mn.mc_pg[ptop];
8846 if (mn.mc_ki[ptop]) {
8847 mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
8848 } else {
8849 /* find right page's left sibling */
8850 mc->mc_ki[ptop] = mn.mc_ki[ptop];
8851 mdb_cursor_sibling(mc, 0);
8852 }
8853 }
8854 } else {
8855 mn.mc_top--;
8856 rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0);
8857 mn.mc_top++;
8858 }
8859 if (rc != MDB_SUCCESS) {
8860 goto done;
8861 }
8862 if (nflags & MDB_APPEND) {
8863 mc->mc_pg[mc->mc_top] = rp;
8864 mc->mc_ki[mc->mc_top] = 0;
8865 rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags);
8866 if (rc)
8867 goto done;
8868 for (i=0; i<mc->mc_top; i++)
8869 mc->mc_ki[i] = mn.mc_ki[i];
8870 } else if (!IS_LEAF2(mp)) {
8871 /* Move nodes */
8872 mc->mc_pg[mc->mc_top] = rp;
8873 i = split_indx;
8874 j = 0;
8875 do {
8876 if (i == newindx) {
8877 rkey.mv_data = newkey->mv_data;
8878 rkey.mv_size = newkey->mv_size;
8879 if (IS_LEAF(mp)) {
8880 rdata = newdata;
8881 } else
8882 pgno = newpgno;
8883 flags = nflags;
8884 /* Update index for the new key. */
8885 mc->mc_ki[mc->mc_top] = j;
8886 } else {
8887 node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
8888 rkey.mv_data = NODEKEY(node);
8889 rkey.mv_size = node->mn_ksize;
8890 if (IS_LEAF(mp)) {
8891 xdata.mv_data = NODEDATA(node);
8892 xdata.mv_size = NODEDSZ(node);
8893 rdata = &xdata;
8894 } else
8895 pgno = NODEPGNO(node);
8896 flags = node->mn_flags;
8897 }
8898
8899 if (!IS_LEAF(mp) && j == 0) {
8900 /* First branch index doesn't need key data. */
8901 rkey.mv_size = 0;
8902 }
8903
8904 rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
8905 if (rc)
8906 goto done;
8907 if (i == nkeys) {
8908 i = 0;
8909 j = 0;
8910 mc->mc_pg[mc->mc_top] = copy;
8911 } else {
8912 i++;
8913 j++;
8914 }
8915 } while (i != split_indx);
8916
8917 nkeys = NUMKEYS(copy);
8918 for (i=0; i<nkeys; i++)
8919 mp->mp_ptrs[i] = copy->mp_ptrs[i];
8920 mp->mp_lower = copy->mp_lower;
8921 mp->mp_upper = copy->mp_upper;
8922 memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
8923 env->me_psize - copy->mp_upper - PAGEBASE);
8924
8925 /* reset back to original page */
8926 if (newindx < split_indx) {
8927 mc->mc_pg[mc->mc_top] = mp;
8928 } else {
8929 mc->mc_pg[mc->mc_top] = rp;
8930 mc->mc_ki[ptop]++;
8931 /* Make sure mc_ki is still valid.
8932 */
8933 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
8934 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
8935 for (i=0; i<=ptop; i++) {
8936 mc->mc_pg[i] = mn.mc_pg[i];
8937 mc->mc_ki[i] = mn.mc_ki[i];
8938 }
8939 }
8940 }
8941 if (nflags & MDB_RESERVE) {
8942 node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
8943 if (!(node->mn_flags & F_BIGDATA))
8944 newdata->mv_data = NODEDATA(node);
8945 }
8946 } else {
8947 if (newindx >= split_indx) {
8948 mc->mc_pg[mc->mc_top] = rp;
8949 mc->mc_ki[ptop]++;
8950 /* Make sure mc_ki is still valid.
8951 */
8952 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
8953 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
8954 for (i=0; i<=ptop; i++) {
8955 mc->mc_pg[i] = mn.mc_pg[i];
8956 mc->mc_ki[i] = mn.mc_ki[i];
8957 }
8958 }
8959 }
8960 }
8961
8962 {
8963 /* Adjust other cursors pointing to mp */
8964 MDB_cursor *m2, *m3;
8965 MDB_dbi dbi = mc->mc_dbi;
8966 nkeys = NUMKEYS(mp);
8967
8968 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
8969 if (mc->mc_flags & C_SUB)
8970 m3 = &m2->mc_xcursor->mx_cursor;
8971 else
8972 m3 = m2;
8973 if (m3 == mc)
8974 continue;
8975 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
8976 continue;
8977 if (new_root) {
8978 int k;
8979 /* sub cursors may be on different DB */
8980 if (m3->mc_pg[0] != mp)
8981 continue;
8982 /* root split */
8983 for (k=new_root; k>=0; k--) {
8984 m3->mc_ki[k+1] = m3->mc_ki[k];
8985 m3->mc_pg[k+1] = m3->mc_pg[k];
8986 }
8987 if (m3->mc_ki[0] >= nkeys) {
8988 m3->mc_ki[0] = 1;
8989 } else {
8990 m3->mc_ki[0] = 0;
8991 }
8992 m3->mc_pg[0] = mc->mc_pg[0];
8993 m3->mc_snum++;
8994 m3->mc_top++;
8995 }
8996 if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) {
8997 if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE))
8998 m3->mc_ki[mc->mc_top]++;
8999 if (m3->mc_ki[mc->mc_top] >= nkeys) {
9000 m3->mc_pg[mc->mc_top] = rp;
9001 m3->mc_ki[mc->mc_top] -= nkeys;
9002 for (i=0; i<mc->mc_top; i++) {
9003 m3->mc_ki[i] = mn.mc_ki[i];
9004 m3->mc_pg[i] = mn.mc_pg[i];
9005 }
9006 }
9007 } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
9008 m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
9009 m3->mc_ki[ptop]++;
9010 }
9011 if (IS_LEAF(mp))
9012 XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]);
9013 }
9014 }
9015 DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
9016
9017done:
9018 if (copy) /* tmp page */
9019 mdb_page_free(env, copy);
9020 if (rc)
9021 mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
9022 return rc;
9023}
9024
9025int
9026mdb_put(MDB_txn *txn, MDB_dbi dbi,
9027 MDB_val *key, MDB_val *data, unsigned int flags)
9028{
9029 MDB_cursor mc;
9030 MDB_xcursor mx;
9031 int rc;
9032
9033 if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
9034 return EINVAL;
9035
9036 if (flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP))
9037 return EINVAL;
9038
9039 if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))
9040 return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
9041
9042 mdb_cursor_init(&mc, txn, dbi, &mx);
9043 mc.mc_next = txn->mt_cursors[dbi];
9044 txn->mt_cursors[dbi] = &mc;
9045 rc = mdb_cursor_put(&mc, key, data, flags);
9046 txn->mt_cursors[dbi] = mc.mc_next;
9047 return rc;
9048}
9049
9050#ifndef MDB_WBUF
9051#define MDB_WBUF (1024*1024)
9052#endif
9053#define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */
9054
9055 /** State needed for a double-buffering compacting copy. */
9056typedef struct mdb_copy {
9057 MDB_env *mc_env;
9058 MDB_txn *mc_txn;
9059 pthread_mutex_t mc_mutex;
9060 pthread_cond_t mc_cond; /**< Condition variable for #mc_new */
9061 char *mc_wbuf[2];
9062 char *mc_over[2];
9063 int mc_wlen[2];
9064 int mc_olen[2];
9065 pgno_t mc_next_pgno;
9066 HANDLE mc_fd;
9067 int mc_toggle; /**< Buffer number in provider */
9068 int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */
9069 /** Error code. Never cleared if set. Both threads can set nonzero
9070 * to fail the copy. Not mutex-protected, LMDB expects atomic int.
9071 */
9072 volatile int mc_error;
9073} mdb_copy;
9074
	/** Dedicated writer thread for compacting copy.
	 *
	 * Waits for the provider to hand over a filled buffer through
	 * mdb_copy.mc_new, writes that buffer (and its overflow tail, if one
	 * was attached) to mdb_copy.mc_fd, then returns the buffer by
	 * decrementing mc_new. The loop ends when mc_new is exactly #MDB_EOF.
	 * Any write failure is stored in mdb_copy.mc_error; once that is set
	 * no further data is written but buffers keep being consumed.
	 *
	 * @param[in] arg the #mdb_copy control structure.
	 * @return always 0, errors are reported through mc_error.
	 */
static THREAD_RET ESECT CALL_CONV
mdb_env_copythr(void *arg)
{
	mdb_copy *my = arg;
	char *ptr;
	int toggle = 0, wsize, rc;
#ifdef _WIN32
	DWORD len;
#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
#else
	int len;
#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
#ifdef SIGPIPE
	/* Block SIGPIPE in this thread so that a broken pipe shows up as
	 * an EPIPE error from write() and can be collected with sigwait().
	 */
	sigset_t set;
	sigemptyset(&set);
	sigaddset(&set, SIGPIPE);
	if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0)
		my->mc_error = rc;
#endif
#endif

	pthread_mutex_lock(&my->mc_mutex);
	for(;;) {
		/* Sleep until there is a buffer to write or EOF is flagged */
		while (!my->mc_new)
			pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
		if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */
			break;
		wsize = my->mc_wlen[toggle];
		ptr = my->mc_wbuf[toggle];
again:
		rc = MDB_SUCCESS;
		/* Write until the span is done; skipped entirely after an error */
		while (wsize > 0 && !my->mc_error) {
			DO_WRITE(rc, my->mc_fd, ptr, wsize, len);
			if (!rc) {
				rc = ErrCode();
#if defined(SIGPIPE) && !defined(_WIN32)
				if (rc == EPIPE) {
					/* Collect the pending SIGPIPE, otherwise at least OS X
					 * gives it to the process on thread-exit (ITS#8504).
					 */
					int tmp;
					sigwait(&set, &tmp);
				}
#endif
				break;
			} else if (len > 0) {
				/* Partial or full write: advance and retry the rest */
				rc = MDB_SUCCESS;
				ptr += len;
				wsize -= len;
				continue;
			} else {
				/* The write succeeded but made no progress */
				rc = EIO;
				break;
			}
		}
		if (rc) {
			my->mc_error = rc;
		}
		/* If there's an overflow page tail, write it too */
		if (my->mc_olen[toggle]) {
			wsize = my->mc_olen[toggle];
			ptr = my->mc_over[toggle];
			my->mc_olen[toggle] = 0;
			goto again;
		}
		my->mc_wlen[toggle] = 0;
		toggle ^= 1;
		/* Return the empty buffer to provider */
		my->mc_new--;
		pthread_cond_signal(&my->mc_cond);
	}
	pthread_mutex_unlock(&my->mc_mutex);
	return (THREAD_RET)0;
#undef DO_WRITE
}
9151
9152 /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer.
9153 *
9154 * @param[in] my control structure.
9155 * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending).
9156 */
9157static int ESECT
9158mdb_env_cthr_toggle(mdb_copy *my, int adjust)
9159{
9160 pthread_mutex_lock(&my->mc_mutex);
9161 my->mc_new += adjust;
9162 pthread_cond_signal(&my->mc_cond);
9163 while (my->mc_new & 2) /* both buffers in use */
9164 pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
9165 pthread_mutex_unlock(&my->mc_mutex);
9166
9167 my->mc_toggle ^= (adjust & 1);
9168 /* Both threads reset mc_wlen, to be safe from threading errors */
9169 my->mc_wlen[my->mc_toggle] = 0;
9170 return my->mc_error;
9171}
9172
	/** Depth-first tree traversal for compacting copy.
	 *
	 * Pages are copied into the current write buffer in the order the
	 * traversal finishes them and are numbered sequentially from
	 * mdb_copy.mc_next_pgno. References to a copied page (the parent's
	 * child pointer, a big-data node's overflow page number, a sub-DB
	 * record's root) are rewritten to the new page number in private
	 * copies of the referring pages.
	 *
	 * @param[in] my control structure.
	 * @param[in,out] pg database root. Updated to the root's new page number.
	 * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB.
	 * @return 0 on success, non-zero on failure.
	 */
static int ESECT
mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
{
	MDB_cursor mc = {0};
	MDB_node *ni;
	MDB_page *mo, *mp, *leaf;
	char *buf, *ptr;
	int rc, toggle;
	unsigned int i;

	/* Empty DB, nothing to do */
	if (*pg == P_INVALID)
		return MDB_SUCCESS;

	mc.mc_snum = 1;
	mc.mc_txn = my->mc_txn;

	/* Position the cursor stack on the first leaf under the root */
	rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL);
	if (rc)
		return rc;
	rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST);
	if (rc)
		return rc;

	/* Make cursor pages writable */
	/* One page of scratch space per level of the cursor stack */
	buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum);
	if (buf == NULL)
		return ENOMEM;

	/* Copy every page above the leaf level into the scratch space */
	for (i=0; i<mc.mc_top; i++) {
		mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
		mc.mc_pg[i] = (MDB_page *)ptr;
		ptr += my->mc_env->me_psize;
	}

	/* This is writable space for a leaf page. Usually not needed. */
	leaf = (MDB_page *)ptr;

	toggle = my->mc_toggle;
	while (mc.mc_snum > 0) {
		unsigned n;
		mp = mc.mc_pg[mc.mc_top];
		n = NUMKEYS(mp);

		if (IS_LEAF(mp)) {
			/* Only plain leaves of a non-dup DB are scanned for
			 * overflow pages and sub-DB records.
			 */
			if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
				for (i=0; i<n; i++) {
					ni = NODEPTR(mp, i);
					if (ni->mn_flags & F_BIGDATA) {
						MDB_page *omp;
						pgno_t pg;

						/* Need writable leaf */
						if (mp != leaf) {
							mc.mc_pg[mc.mc_top] = leaf;
							mdb_page_copy(leaf, mp, my->mc_env->me_psize);
							mp = leaf;
							ni = NODEPTR(mp, i);
						}

						/* Read the old overflow pgno, store the new one */
						memcpy(&pg, NODEDATA(ni), sizeof(pg));
						memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t));
						rc = mdb_page_get(&mc, pg, &omp, NULL);
						if (rc)
							goto done;
						/* Current buffer is full: hand it to the writer */
						if (my->mc_wlen[toggle] >= MDB_WBUF) {
							rc = mdb_env_cthr_toggle(my, 1);
							if (rc)
								goto done;
							toggle = my->mc_toggle;
						}
						/* First overflow page goes through the buffer */
						mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
						memcpy(mo, omp, my->mc_env->me_psize);
						mo->mp_pgno = my->mc_next_pgno;
						my->mc_next_pgno += omp->mp_pages;
						my->mc_wlen[toggle] += my->mc_env->me_psize;
						if (omp->mp_pages > 1) {
							/* The remaining pages are written straight from
							 * the source page, so hand the buffer over now.
							 */
							my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
							my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
							rc = mdb_env_cthr_toggle(my, 1);
							if (rc)
								goto done;
							toggle = my->mc_toggle;
						}
					} else if (ni->mn_flags & F_SUBDATA) {
						MDB_db db;

						/* Need writable leaf */
						if (mp != leaf) {
							mc.mc_pg[mc.mc_top] = leaf;
							mdb_page_copy(leaf, mp, my->mc_env->me_psize);
							mp = leaf;
							ni = NODEPTR(mp, i);
						}

						/* Walk the sub-DB recursively, then store its
						 * record back with the updated root.
						 */
						memcpy(&db, NODEDATA(ni), sizeof(db));
						my->mc_toggle = toggle;
						rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA);
						if (rc)
							goto done;
						toggle = my->mc_toggle;
						memcpy(NODEDATA(ni), &db, sizeof(db));
					}
				}
			}
		} else {
			/* Branch page: advance to its next child, if any */
			mc.mc_ki[mc.mc_top]++;
			if (mc.mc_ki[mc.mc_top] < n) {
				pgno_t pg;
again:
				ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
				pg = NODEPGNO(ni);
				rc = mdb_page_get(&mc, pg, &mp, NULL);
				if (rc)
					goto done;
				mc.mc_top++;
				mc.mc_snum++;
				mc.mc_ki[mc.mc_top] = 0;
				if (IS_BRANCH(mp)) {
					/* Whenever we advance to a sibling branch page,
					 * we must proceed all the way down to its first leaf.
					 */
					mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
					goto again;
				} else
					mc.mc_pg[mc.mc_top] = mp;
				continue;
			}
		}
		/* Reached for a leaf, or a branch with no children left:
		 * emit the page into the write buffer.
		 */
		if (my->mc_wlen[toggle] >= MDB_WBUF) {
			rc = mdb_env_cthr_toggle(my, 1);
			if (rc)
				goto done;
			toggle = my->mc_toggle;
		}
		mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
		mdb_page_copy(mo, mp, my->mc_env->me_psize);
		mo->mp_pgno = my->mc_next_pgno++;
		my->mc_wlen[toggle] += my->mc_env->me_psize;
		if (mc.mc_top) {
			/* Update parent if there is one */
			ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]);
			SETPGNO(ni, mo->mp_pgno);
			mdb_cursor_pop(&mc);
		} else {
			/* Otherwise we're done */
			*pg = mo->mp_pgno;
			break;
		}
	}
done:
	free(buf);
	return rc;
}
9332
	/** Copy environment with compaction.
	 *
	 * Starts the writer thread #mdb_env_copythr(), builds two fresh meta
	 * pages in the first write buffer, then walks the main DB with
	 * #mdb_env_cwalk() so that only pages reachable from it are written,
	 * renumbered sequentially.
	 *
	 * @param[in] env the environment to copy.
	 * @param[in] fd destination file handle.
	 * @return 0 on success, non-zero on failure.
	 */
static int ESECT
mdb_env_copyfd1(MDB_env *env, HANDLE fd)
{
	MDB_meta *mm;
	MDB_page *mp;
	mdb_copy my = {0};
	MDB_txn *txn = NULL;
	pthread_t thr;
	pgno_t root, new_root;
	int rc = MDB_SUCCESS;

#ifdef _WIN32
	if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) ||
		!(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) {
		rc = ErrCode();
		goto done;
	}
	my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize);
	if (my.mc_wbuf[0] == NULL) {
		/* _aligned_malloc() sets errno, but we use Windows error codes */
		rc = ERROR_NOT_ENOUGH_MEMORY;
		goto done;
	}
#else
	if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0)
		return rc;
	/* From here on the mutex exists, so failures must go through done2 */
	if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0)
		goto done2;
#ifdef HAVE_MEMALIGN
	my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2);
	if (my.mc_wbuf[0] == NULL) {
		rc = errno;
		goto done;
	}
#else
	{
		void *p;
		if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0)
			goto done;
		my.mc_wbuf[0] = p;
	}
#endif
#endif
	/* One allocation holds both buffers back to back */
	memset(my.mc_wbuf[0], 0, MDB_WBUF*2);
	my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF;
	/* Data pages are numbered after the meta pages */
	my.mc_next_pgno = NUM_METAS;
	my.mc_env = env;
	my.mc_fd = fd;
	rc = THREAD_CREATE(thr, mdb_env_copythr, &my);
	if (rc)
		goto done;

	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
	if (rc)
		goto finish;

	/* Build meta page 0 at the start of the first buffer */
	mp = (MDB_page *)my.mc_wbuf[0];
	memset(mp, 0, NUM_METAS * env->me_psize);
	mp->mp_pgno = 0;
	mp->mp_flags = P_META;
	mm = (MDB_meta *)METADATA(mp);
	mdb_env_init_meta0(env, mm);
	mm->mm_address = env->me_metas[0]->mm_address;

	/* Meta page 1 starts as a copy of meta page 0; mm now points at it */
	mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize);
	mp->mp_pgno = 1;
	mp->mp_flags = P_META;
	*(MDB_meta *)METADATA(mp) = *mm;
	mm = (MDB_meta *)METADATA(mp);

	/* Set metapage 1 with current main DB */
	root = new_root = txn->mt_dbs[MAIN_DBI].md_root;
	if (root != P_INVALID) {
		/* Count free pages + freeDB pages.  Subtract from last_pg
		 * to find the new last_pg, which also becomes the new root.
		 */
		MDB_ID freecount = 0;
		MDB_cursor mc;
		MDB_val key, data;
		mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
		/* The first MDB_ID of each freeDB record is added to the count */
		while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
			freecount += *(MDB_ID *)data.mv_data;
		if (rc != MDB_NOTFOUND)
			goto finish;
		freecount += txn->mt_dbs[FREE_DBI].md_branch_pages +
			txn->mt_dbs[FREE_DBI].md_leaf_pages +
			txn->mt_dbs[FREE_DBI].md_overflow_pages;

		new_root = txn->mt_next_pgno - 1 - freecount;
		mm->mm_last_pg = new_root;
		mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
		mm->mm_dbs[MAIN_DBI].md_root = new_root;
	} else {
		/* When the DB is empty, handle it specially to
		 * fix any breakage like page leaks from ITS#8174.
		 */
		mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags;
	}
	if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) {
		mm->mm_txnid = 1;		/* use metapage 1 */
	}

	/* The meta pages already occupy the start of buffer 0 */
	my.mc_wlen[0] = env->me_psize * NUM_METAS;
	my.mc_txn = txn;
	rc = mdb_env_cwalk(&my, &root, 0);
	/* The walk must end exactly on the root page number predicted above */
	if (rc == MDB_SUCCESS && root != new_root) {
		rc = MDB_INCOMPATIBLE;	/* page leak or corrupt DB */
	}

finish:
	if (rc)
		my.mc_error = rc;
	/* Hand over the last buffer together with EOF, then join the writer */
	mdb_env_cthr_toggle(&my, 1 | MDB_EOF);
	rc = THREAD_FINISH(thr);
	mdb_txn_abort(txn);

done:
#ifdef _WIN32
	if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]);
	if (my.mc_cond)  CloseHandle(my.mc_cond);
	if (my.mc_mutex) CloseHandle(my.mc_mutex);
#else
	free(my.mc_wbuf[0]);
	pthread_cond_destroy(&my.mc_cond);
done2:
	pthread_mutex_destroy(&my.mc_mutex);
#endif
	/* A thread-join error takes precedence over a copy error */
	return rc ? rc : my.mc_error;
}
9463
	/** Copy environment as-is.
	 *
	 * Writes the meta pages from the memory map while the writer mutex is
	 * held (when the environment has a lock table), then writes the rest
	 * of the map up to the read transaction's next page number, limited
	 * to the current size of the data file.
	 *
	 * @param[in] env the environment to copy.
	 * @param[in] fd destination file handle.
	 * @return 0 on success, non-zero on failure.
	 */
static int ESECT
mdb_env_copyfd0(MDB_env *env, HANDLE fd)
{
	MDB_txn *txn = NULL;
	mdb_mutexref_t wmutex = NULL;
	int rc;
	size_t wsize, w3;
	char *ptr;
#ifdef _WIN32
	DWORD len, w2;
#define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
#else
	ssize_t len;
	size_t w2;
#define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
#endif

	/* Do the lock/unlock of the reader mutex before starting the
	 * write txn.  Otherwise other read txns could block writers.
	 */
	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
	if (rc)
		return rc;

	if (env->me_txns) {
		/* We must start the actual read txn after blocking writers */
		mdb_txn_end(txn, MDB_END_RESET_TMP);

		/* Temporarily block writers until we snapshot the meta pages */
		wmutex = env->me_wmutex;
		if (LOCK_MUTEX(rc, env, wmutex))
			goto leave;

		rc = mdb_txn_renew0(txn);
		if (rc) {
			UNLOCK_MUTEX(wmutex);
			goto leave;
		}
	}

	/* Phase 1: the meta pages, written while writers are blocked */
	wsize = env->me_psize * NUM_METAS;
	ptr = env->me_map;
	w2 = wsize;
	while (w2 > 0) {
		DO_WRITE(rc, fd, ptr, w2, len);
		if (!rc) {
			rc = ErrCode();
			break;
		} else if (len > 0) {
			/* Partial or full write: advance and retry the rest */
			rc = MDB_SUCCESS;
			ptr += len;
			w2 -= len;
			continue;
		} else {
			/* Non-blocking or async handles are not supported */
			rc = EIO;
			break;
		}
	}
	if (wmutex)
		UNLOCK_MUTEX(wmutex);

	if (rc)
		goto leave;

	/* Phase 2: everything after the meta pages. w3 is the end offset
	 * of the used pages, clamped to the size of the data file.
	 */
	w3 = txn->mt_next_pgno * env->me_psize;
	{
		size_t fsize = 0;
		if ((rc = mdb_fsize(env->me_fd, &fsize)))
			goto leave;
		if (w3 > fsize)
			w3 = fsize;
	}
	wsize = w3 - wsize;
	while (wsize > 0) {
		/* Write at most MAX_WRITE bytes per call */
		if (wsize > MAX_WRITE)
			w2 = MAX_WRITE;
		else
			w2 = wsize;
		DO_WRITE(rc, fd, ptr, w2, len);
		if (!rc) {
			rc = ErrCode();
			break;
		} else if (len > 0) {
			rc = MDB_SUCCESS;
			ptr += len;
			wsize -= len;
			continue;
		} else {
			/* The write succeeded but made no progress */
			rc = EIO;
			break;
		}
	}

leave:
	mdb_txn_abort(txn);
	return rc;
}
9563
9564int ESECT
9565mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags)
9566{
9567 if (flags & MDB_CP_COMPACT)
9568 return mdb_env_copyfd1(env, fd);
9569 else
9570 return mdb_env_copyfd0(env, fd);
9571}
9572
/* Copy the environment to an open file handle with no special options.
 * Equivalent to #mdb_env_copyfd2() with flags 0.
 */
int ESECT
mdb_env_copyfd(MDB_env *env, HANDLE fd)
{
	return mdb_env_copyfd2(env, fd, 0);
}
9578
9579int ESECT
9580mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags)
9581{
9582 int rc;
9583 MDB_name fname;
9584 HANDLE newfd = INVALID_HANDLE_VALUE;
9585
9586 rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname);
9587 if (rc == MDB_SUCCESS) {
9588 rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd);
9589 mdb_fname_destroy(fname);
9590 }
9591 if (rc == MDB_SUCCESS) {
9592 rc = mdb_env_copyfd2(env, newfd, flags);
9593 if (close(newfd) < 0 && rc == MDB_SUCCESS)
9594 rc = ErrCode();
9595 }
9596 return rc;
9597}
9598
/* Copy the environment to the file named by path with no special options.
 * Equivalent to #mdb_env_copy2() with flags 0.
 */
int ESECT
mdb_env_copy(MDB_env *env, const char *path)
{
	return mdb_env_copy2(env, path, 0);
}
9604
9605int ESECT
9606mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
9607{
9608 if (flag & ~CHANGEABLE)
9609 return EINVAL;
9610 if (onoff)
9611 env->me_flags |= flag;
9612 else
9613 env->me_flags &= ~flag;
9614 return MDB_SUCCESS;
9615}
9616
9617int ESECT
9618mdb_env_get_flags(MDB_env *env, unsigned int *arg)
9619{
9620 if (!env || !arg)
9621 return EINVAL;
9622
9623 *arg = env->me_flags & (CHANGEABLE|CHANGELESS);
9624 return MDB_SUCCESS;
9625}
9626
9627int ESECT
9628mdb_env_set_userctx(MDB_env *env, void *ctx)
9629{
9630 if (!env)
9631 return EINVAL;
9632 env->me_userctx = ctx;
9633 return MDB_SUCCESS;
9634}
9635
9636void * ESECT
9637mdb_env_get_userctx(MDB_env *env)
9638{
9639 return env ? env->me_userctx : NULL;
9640}
9641
9642int ESECT
9643mdb_env_set_assert(MDB_env *env, MDB_assert_func *func)
9644{
9645 if (!env)
9646 return EINVAL;
9647#ifndef NDEBUG
9648 env->me_assert_func = func;
9649#endif
9650 return MDB_SUCCESS;
9651}
9652
9653int ESECT
9654mdb_env_get_path(MDB_env *env, const char **arg)
9655{
9656 if (!env || !arg)
9657 return EINVAL;
9658
9659 *arg = env->me_path;
9660 return MDB_SUCCESS;
9661}
9662
9663int ESECT
9664mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg)
9665{
9666 if (!env || !arg)
9667 return EINVAL;
9668
9669 *arg = env->me_fd;
9670 return MDB_SUCCESS;
9671}
9672
9673/** Common code for #mdb_stat() and #mdb_env_stat().
9674 * @param[in] env the environment to operate in.
9675 * @param[in] db the #MDB_db record containing the stats to return.
9676 * @param[out] arg the address of an #MDB_stat structure to receive the stats.
9677 * @return 0, this function always succeeds.
9678 */
9679static int ESECT
9680mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
9681{
9682 arg->ms_psize = env->me_psize;
9683 arg->ms_depth = db->md_depth;
9684 arg->ms_branch_pages = db->md_branch_pages;
9685 arg->ms_leaf_pages = db->md_leaf_pages;
9686 arg->ms_overflow_pages = db->md_overflow_pages;
9687 arg->ms_entries = db->md_entries;
9688
9689 return MDB_SUCCESS;
9690}
9691
9692int ESECT
9693mdb_env_stat(MDB_env *env, MDB_stat *arg)
9694{
9695 MDB_meta *meta;
9696
9697 if (env == NULL || arg == NULL)
9698 return EINVAL;
9699
9700 meta = mdb_env_pick_meta(env);
9701
9702 return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg);
9703}
9704
9705int ESECT
9706mdb_env_info(MDB_env *env, MDB_envinfo *arg)
9707{
9708 MDB_meta *meta;
9709
9710 if (env == NULL || arg == NULL)
9711 return EINVAL;
9712
9713 meta = mdb_env_pick_meta(env);
9714 arg->me_mapaddr = meta->mm_address;
9715 arg->me_last_pgno = meta->mm_last_pg;
9716 arg->me_last_txnid = meta->mm_txnid;
9717
9718 arg->me_mapsize = env->me_mapsize;
9719 arg->me_maxreaders = env->me_maxreaders;
9720 arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0;
9721 return MDB_SUCCESS;
9722}
9723
9724/** Set the default comparison functions for a database.
9725 * Called immediately after a database is opened to set the defaults.
9726 * The user can then override them with #mdb_set_compare() or
9727 * #mdb_set_dupsort().
9728 * @param[in] txn A transaction handle returned by #mdb_txn_begin()
9729 * @param[in] dbi A database handle returned by #mdb_dbi_open()
9730 */
9731static void
9732mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi)
9733{
9734 uint16_t f = txn->mt_dbs[dbi].md_flags;
9735
9736 txn->mt_dbxs[dbi].md_cmp =
9737 (f & MDB_REVERSEKEY) ? mdb_cmp_memnr :
9738 (f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn;
9739
9740 txn->mt_dbxs[dbi].md_dcmp =
9741 !(f & MDB_DUPSORT) ? 0 :
9742 ((f & MDB_INTEGERDUP)
9743 ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint)
9744 : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn));
9745}
9746
/* Open a database in the environment and return its handle in *dbi.
 *
 * A NULL name selects the main DB. Otherwise the name is first looked up
 * among the handles already registered in this txn, then in the main DB;
 * with MDB_CREATE a missing named DB is created.
 *
 * Returns MDB_SUCCESS, or: EINVAL for flags outside VALID_FLAGS,
 * MDB_BAD_TXN for a blocked txn, MDB_DBS_FULL when no handle slot is left,
 * MDB_INCOMPATIBLE / MDB_NOTFOUND when the main DB's flags rule out named
 * DBs or the record found is not a DB, EACCES when creation is requested
 * in a read-only txn, ENOMEM, or an error from the lookup or insert.
 */
int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
{
	MDB_val key, data;
	MDB_dbi i;
	MDB_cursor mc;
	MDB_db dummy;
	int rc, dbflag, exact;
	unsigned int unused = 0, seq;
	char *namedup;
	size_t len;

	if (flags & ~VALID_FLAGS)
		return EINVAL;
	if (txn->mt_flags & MDB_TXN_BLOCKED)
		return MDB_BAD_TXN;

	/* main DB? */
	if (!name) {
		*dbi = MAIN_DBI;
		if (flags & PERSISTENT_FLAGS) {
			uint16_t f2 = flags & PERSISTENT_FLAGS;
			/* make sure flag changes get committed */
			if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) {
				txn->mt_dbs[MAIN_DBI].md_flags |= f2;
				txn->mt_flags |= MDB_TXN_DIRTY;
			}
		}
		mdb_default_cmp(txn, MAIN_DBI);
		return MDB_SUCCESS;
	}

	/* The lookup in the main DB below needs a comparison function */
	if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
		mdb_default_cmp(txn, MAIN_DBI);
	}

	/* Is the DB already open? */
	len = strlen(name);
	for (i=CORE_DBS; i<txn->mt_numdbs; i++) {
		if (!txn->mt_dbxs[i].md_name.mv_size) {
			/* Remember this free slot */
			if (!unused) unused = i;
			continue;
		}
		if (len == txn->mt_dbxs[i].md_name.mv_size &&
			!strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
			*dbi = i;
			return MDB_SUCCESS;
		}
	}

	/* If no free slot and max hit, fail */
	if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs)
		return MDB_DBS_FULL;

	/* Cannot mix named databases with some mainDB flags */
	if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY))
		return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND;

	/* Find the DB info */
	dbflag = DB_NEW|DB_VALID|DB_USRVALID;
	exact = 0;
	key.mv_size = len;
	key.mv_data = (void *)name;
	mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
	rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact);
	if (rc == MDB_SUCCESS) {
		/* make sure this is actually a DB */
		MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
		if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
			return MDB_INCOMPATIBLE;
	} else {
		/* Only "not found" combined with MDB_CREATE may continue */
		if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE))
			return rc;
		if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
			return EACCES;
	}

	/* Done here so we cannot fail after creating a new DB */
	if ((namedup = strdup(name)) == NULL)
		return ENOMEM;

	if (rc) {
		/* MDB_NOTFOUND and MDB_CREATE: Create new DB */
		data.mv_size = sizeof(MDB_db);
		data.mv_data = &dummy;
		memset(&dummy, 0, sizeof(dummy));
		dummy.md_root = P_INVALID;
		dummy.md_flags = flags & PERSISTENT_FLAGS;
		WITH_CURSOR_TRACKING(mc,
			rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA));
		dbflag |= DB_DIRTY;
	}

	if (rc) {
		free(namedup);
	} else {
		/* Got info, register DBI in this txn */
		unsigned int slot = unused ? unused : txn->mt_numdbs;
		txn->mt_dbxs[slot].md_name.mv_data = namedup;
		txn->mt_dbxs[slot].md_name.mv_size = len;
		txn->mt_dbxs[slot].md_rel = NULL;
		txn->mt_dbflags[slot] = dbflag;
		/* txn-> and env-> are the same in read txns, use
		 * tmp variable to avoid undefined assignment
		 */
		seq = ++txn->mt_env->me_dbiseqs[slot];
		txn->mt_dbiseqs[slot] = seq;

		memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
		*dbi = slot;
		mdb_default_cmp(txn, slot);
		/* A reused slot does not grow the handle table */
		if (!unused) {
			txn->mt_numdbs++;
		}
	}

	return rc;
}
9865
9866int ESECT
9867mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg)
9868{
9869 if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID))
9870 return EINVAL;
9871
9872 if (txn->mt_flags & MDB_TXN_BLOCKED)
9873 return MDB_BAD_TXN;
9874
9875 if (txn->mt_dbflags[dbi] & DB_STALE) {
9876 MDB_cursor mc;
9877 MDB_xcursor mx;
9878 /* Stale, must read the DB's root. cursor_init does it for us. */
9879 mdb_cursor_init(&mc, txn, dbi, &mx);
9880 }
9881 return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg);
9882}
9883
9884void mdb_dbi_close(MDB_env *env, MDB_dbi dbi)
9885{
9886 char *ptr;
9887 if (dbi < CORE_DBS || dbi >= env->me_maxdbs)
9888 return;
9889 ptr = env->me_dbxs[dbi].md_name.mv_data;
9890 /* If there was no name, this was already closed */
9891 if (ptr) {
9892 env->me_dbxs[dbi].md_name.mv_data = NULL;
9893 env->me_dbxs[dbi].md_name.mv_size = 0;
9894 env->me_dbflags[dbi] = 0;
9895 env->me_dbiseqs[dbi]++;
9896 free(ptr);
9897 }
9898}
9899
9900int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags)
9901{
9902 /* We could return the flags for the FREE_DBI too but what's the point? */
9903 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
9904 return EINVAL;
9905 *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS;
9906 return MDB_SUCCESS;
9907}
9908
/** Add all the DB's pages to the free list.
 *
 * The tree is visited one level at a time starting from the lowest level
 * that needs scanning. Branch pages contribute their child page numbers;
 * leaf pages are scanned only for overflow pages and, when requested,
 * sub-DBs. The root page is appended last. On failure the txn is flagged
 * with MDB_TXN_ERROR. The cursor is left uninitialized.
 *
 * @param[in] mc Cursor on the DB to free.
 * @param[in] subs non-Zero to check for sub-DBs in this DB.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_drop0(MDB_cursor *mc, int subs)
{
	int rc;

	rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
	if (rc == MDB_SUCCESS) {
		MDB_txn *txn = mc->mc_txn;
		MDB_node *ni;
		MDB_cursor mx;
		unsigned int i;

		/* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves.
		 * This also avoids any P_LEAF2 pages, which have no nodes.
		 * Also if the DB doesn't have sub-DBs and has no overflow
		 * pages, omit scanning leaves.
		 */
		if ((mc->mc_flags & C_SUB) ||
			(!subs && !mc->mc_db->md_overflow_pages))
			mdb_cursor_pop(mc);

		/* Keep the leftmost path so each level can be restarted from it */
		mdb_cursor_copy(mc, &mx);
		while (mc->mc_snum > 0) {
			MDB_page *mp = mc->mc_pg[mc->mc_top];
			unsigned n = NUMKEYS(mp);
			if (IS_LEAF(mp)) {
				for (i=0; i<n; i++) {
					ni = NODEPTR(mp, i);
					if (ni->mn_flags & F_BIGDATA) {
						MDB_page *omp;
						pgno_t pg;
						memcpy(&pg, NODEDATA(ni), sizeof(pg));
						rc = mdb_page_get(mc, pg, &omp, NULL);
						if (rc != 0)
							goto done;
						mdb_cassert(mc, IS_OVERFLOW(omp));
						/* Free the whole run of overflow pages at once */
						rc = mdb_midl_append_range(&txn->mt_free_pgs,
							pg, omp->mp_pages);
						if (rc)
							goto done;
						mc->mc_db->md_overflow_pages -= omp->mp_pages;
						/* Nothing left to find in the leaves: stop early */
						if (!mc->mc_db->md_overflow_pages && !subs)
							break;
					} else if (subs && (ni->mn_flags & F_SUBDATA)) {
						mdb_xcursor_init1(mc, ni);
						rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
						if (rc)
							goto done;
					}
				}
				if (!subs && !mc->mc_db->md_overflow_pages)
					goto pop;
			} else {
				/* Reserve room first so the appends below cannot fail */
				if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0)
					goto done;
				for (i=0; i<n; i++) {
					pgno_t pg;
					ni = NODEPTR(mp, i);
					pg = NODEPGNO(ni);
					/* free it */
					mdb_midl_xappend(txn->mt_free_pgs, pg);
				}
			}
			if (!mc->mc_top)
				break;
			/* Record where the scan of this page stopped, then move
			 * to the next page on the same level.
			 */
			mc->mc_ki[mc->mc_top] = i;
			rc = mdb_cursor_sibling(mc, 1);
			if (rc) {
				if (rc != MDB_NOTFOUND)
					goto done;
				/* no more siblings, go back to beginning
				 * of previous level.
				 */
pop:
				mdb_cursor_pop(mc);
				mc->mc_ki[0] = 0;
				for (i=1; i<mc->mc_snum; i++) {
					mc->mc_ki[i] = 0;
					mc->mc_pg[i] = mx.mc_pg[i];
				}
			}
		}
		/* free it */
		rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root);
done:
		if (rc)
			txn->mt_flags |= MDB_TXN_ERROR;
	} else if (rc == MDB_NOTFOUND) {
		/* An empty DB has no pages to free */
		rc = MDB_SUCCESS;
	}
	mc->mc_flags &= ~C_INITIALIZED;
	return rc;
}
10007
/* Empty a database, and optionally delete it and close its handle.
 *
 * del must be 0 (empty the DB) or 1 (also delete it from the environment).
 * Deletion applies only to named DBs; for handles below CORE_DBS the DB
 * is only emptied.
 *
 * Returns 0 on success, EINVAL for a bad del value or invalid handle,
 * EACCES in a read-only txn, MDB_BAD_DBI if the handle changed, or an
 * error from the underlying operations.
 */
int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del)
{
	MDB_cursor *mc, *m2;
	int rc;

	if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
		return EINVAL;

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
		return EACCES;

	if (TXN_DBI_CHANGED(txn, dbi))
		return MDB_BAD_DBI;

	rc = mdb_cursor_open(txn, dbi, &mc);
	if (rc)
		return rc;

	/* Sub-DBs need to be scanned only in a DUPSORT database */
	rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT);
	/* Invalidate the dropped DB's cursors */
	for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next)
		m2->mc_flags &= ~(C_INITIALIZED|C_EOF);
	if (rc)
		goto leave;

	/* Can't delete the main DB */
	if (del && dbi >= CORE_DBS) {
		/* Remove the DB's record from the main DB */
		rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA);
		if (!rc) {
			txn->mt_dbflags[dbi] = DB_STALE;
			mdb_dbi_close(txn->mt_env, dbi);
		} else {
			txn->mt_flags |= MDB_TXN_ERROR;
		}
	} else {
		/* reset the DB record, mark it dirty */
		txn->mt_dbflags[dbi] |= DB_DIRTY;
		txn->mt_dbs[dbi].md_depth = 0;
		txn->mt_dbs[dbi].md_branch_pages = 0;
		txn->mt_dbs[dbi].md_leaf_pages = 0;
		txn->mt_dbs[dbi].md_overflow_pages = 0;
		txn->mt_dbs[dbi].md_entries = 0;
		txn->mt_dbs[dbi].md_root = P_INVALID;

		txn->mt_flags |= MDB_TXN_DIRTY;
	}
leave:
	mdb_cursor_close(mc);
	return rc;
}
10058
10059int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
10060{
10061 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
10062 return EINVAL;
10063
10064 txn->mt_dbxs[dbi].md_cmp = cmp;
10065 return MDB_SUCCESS;
10066}
10067
10068int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
10069{
10070 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
10071 return EINVAL;
10072
10073 txn->mt_dbxs[dbi].md_dcmp = cmp;
10074 return MDB_SUCCESS;
10075}
10076
10077int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel)
10078{
10079 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
10080 return EINVAL;
10081
10082 txn->mt_dbxs[dbi].md_rel = rel;
10083 return MDB_SUCCESS;
10084}
10085
10086int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx)
10087{
10088 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
10089 return EINVAL;
10090
10091 txn->mt_dbxs[dbi].md_relctx = ctx;
10092 return MDB_SUCCESS;
10093}
10094
/* Return the maximum key size, as given by the ENV_MAXKEY() macro.
 * NOTE(review): env is not checked for NULL here; whether ENV_MAXKEY()
 * reads it depends on the macro's definition, which is outside this section.
 */
int ESECT
mdb_env_get_maxkeysize(MDB_env *env)
{
	return ENV_MAXKEY(env);
}
10100
10101int ESECT
10102mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
10103{
10104 unsigned int i, rdrs;
10105 MDB_reader *mr;
10106 char buf[64];
10107 int rc = 0, first = 1;
10108
10109 if (!env || !func)
10110 return -1;
10111 if (!env->me_txns) {
10112 return func("(no reader locks)\n", ctx);
10113 }
10114 rdrs = env->me_txns->mti_numreaders;
10115 mr = env->me_txns->mti_readers;
10116 for (i=0; i<rdrs; i++) {
10117 if (mr[i].mr_pid) {
10118 txnid_t txnid = mr[i].mr_txnid;
10119 sprintf(buf, txnid == (txnid_t)-1 ?
10120 "%10d %"Z"x -\n" : "%10d %"Z"x %"Z"u\n",
10121 (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid);
10122 if (first) {
10123 first = 0;
10124 rc = func(" pid thread txnid\n", ctx);
10125 if (rc < 0)
10126 break;
10127 }
10128 rc = func(buf, ctx);
10129 if (rc < 0)
10130 break;
10131 }
10132 }
10133 if (first) {
10134 rc = func("(no active readers)\n", ctx);
10135 }
10136 return rc;
10137}
10138
/** Insert pid into list if not already present.
 *
 * The list keeps its element count in ids[0] and the elements, sorted in
 * ascending order, in ids[1..ids[0]]. The caller must provide room for
 * one more element.
 *
 * @param[in,out] ids the sorted pid list.
 * @param[in] pid the process ID to insert.
 * @return 0 if pid was inserted, -1 if already present.
 */
static int ESECT
mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
{
	/* binary search of pid in list */
	unsigned base = 0;
	unsigned cursor = 1;
	int val = 0;
	unsigned n = ids[0];

	while( 0 < n ) {
		unsigned pivot = n >> 1;
		cursor = base + pivot + 1;
		/* Three-way comparison. Subtracting the two values instead
		 * could overflow for operands far apart and yield the wrong
		 * sign, which would break the ordering of the list.
		 */
		val = (pid > ids[cursor]) - (pid < ids[cursor]);

		if( val < 0 ) {
			n = pivot;

		} else if ( val > 0 ) {
			base = cursor;
			n -= pivot + 1;

		} else {
			/* found, so it's a duplicate */
			return -1;
		}
	}

	/* cursor is the last slot compared; insert after it if pid is larger */
	if( val > 0 ) {
		++cursor;
	}
	ids[0]++;
	/* Shift the tail up by one to open the insertion slot */
	for (n = ids[0]; n > cursor; n--)
		ids[n] = ids[n-1];
	ids[n] = pid;
	return 0;
}
10178
10179int ESECT
10180mdb_reader_check(MDB_env *env, int *dead)
10181{
10182 if (!env)
10183 return EINVAL;
10184 if (dead)
10185 *dead = 0;
10186 return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS;
10187}
10188
/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex.
 *
 * Each distinct foreign pid in the reader table is probed once with
 * mdb_reader_pid(); if the probe reports the process gone, every slot
 * owned by that pid is cleared.
 *
 * @param[in] env the environment handle; its lock table is dereferenced
 * without a NULL check.
 * @param[in] rlocked non-zero if the caller already holds the reader mutex.
 * @param[out] dead if non-NULL, receives the number of slots cleared.
 * @return 0 on success, non-zero on failure.
 */
static int ESECT
mdb_reader_check0(MDB_env *env, int rlocked, int *dead)
{
	/* NULL means "no locking needed here", the caller holds the mutex */
	mdb_mutexref_t rmutex = rlocked ? NULL : env->me_rmutex;
	unsigned int i, j, rdrs;
	MDB_reader *mr;
	MDB_PID_T *pids, pid;
	int rc = MDB_SUCCESS, count = 0;

	rdrs = env->me_txns->mti_numreaders;
	/* Sorted list of pids already seen; slot 0 holds the count */
	pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
	if (!pids)
		return ENOMEM;
	pids[0] = 0;
	mr = env->me_txns->mti_readers;
	for (i=0; i<rdrs; i++) {
		pid = mr[i].mr_pid;
		if (pid && pid != env->me_pid) {
			/* Probe each pid only the first time it is seen */
			if (mdb_pid_insert(pids, pid) == 0) {
				if (!mdb_reader_pid(env, Pidcheck, pid)) {
					/* Stale reader found */
					j = i;
					if (rmutex) {
						if ((rc = LOCK_MUTEX0(rmutex)) != 0) {
							if ((rc = mdb_mutex_failed(env, rmutex, rc)))
								break;
							rdrs = 0; /* the above checked all readers */
						} else {
							/* Recheck, a new process may have reused pid */
							if (mdb_reader_pid(env, Pidcheck, pid))
								j = rdrs;
						}
					}
					/* Clear this slot and all later slots of the same pid */
					for (; j<rdrs; j++)
							if (mr[j].mr_pid == pid) {
								DPRINTF(("clear stale reader pid %u txn %"Z"d",
									(unsigned) pid, mr[j].mr_txnid));
								mr[j].mr_pid = 0;
								count++;
							}
					if (rmutex)
						UNLOCK_MUTEX(rmutex);
				}
			}
		}
	}
	free(pids);
	if (dead)
		*dead = count;
	return rc;
}
10241
#ifdef MDB_ROBUST_SUPPORTED
/** Handle #LOCK_MUTEX0() failure.
 * Try to repair the lock file if the mutex owner died.
 * @param[in] env	the environment handle
 * @param[in] mutex	LOCK_MUTEX0() mutex
 * @param[in] rc	LOCK_MUTEX0() error (nonzero)
 * @return 0 on success with the mutex locked, or an error code on failure.
 */
static int ESECT
mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc)
{
	int rlocked, rc2;
	MDB_meta *meta;

	if (rc == MDB_OWNERDEAD) {
		/* We own the mutex. Clean up after dead previous owner. */
		rc = MDB_SUCCESS;
		/* Tells the reader check below whether we hold the reader mutex */
		rlocked = (mutex == env->me_rmutex);
		if (!rlocked) {
			/* Keep mti_txnid updated, otherwise next writer can
			 * overwrite data which latest meta page refers to.
			 */
			meta = mdb_env_pick_meta(env);
			env->me_txns->mti_txnid = meta->mm_txnid;
			/* env is hosed if the dead thread was ours */
			if (env->me_txn) {
				env->me_flags |= MDB_FATAL_ERROR;
				env->me_txn = NULL;
				rc = MDB_PANIC;
			}
		}
		DPRINTF(("%cmutex owner died, %s", (rlocked ? 'r' : 'w'),
			(rc ? "this process' env is hosed" : "recovering")));
		/* Clear stale readers, then mark the mutex consistent again */
		rc2 = mdb_reader_check0(env, rlocked, NULL);
		if (rc2 == 0)
			rc2 = mdb_mutex_consistent(mutex);
		/* An earlier MDB_PANIC takes precedence over rc2 */
		if (rc || (rc = rc2)) {
			DPRINTF(("LOCK_MUTEX recovery failed, %s", mdb_strerror(rc)));
			UNLOCK_MUTEX(mutex);
		}
	} else {
#ifdef _WIN32
		rc = ErrCode();
#endif
		DPRINTF(("LOCK_MUTEX failed, %s", mdb_strerror(rc)));
	}

	return rc;
}
#endif	/* MDB_ROBUST_SUPPORTED */
10292
#if defined(_WIN32)
/** Convert \b src to new wchar_t[] string with room for \b xtra extra chars.
 * The first conversion call only measures the string, the second one fills
 * the freshly allocated buffer. On success \b dst takes over the buffer.
 */
static int ESECT
utf8_to_utf16(const char *src, MDB_name *dst, int xtra)
{
	int rc, need;
	wchar_t *result;

	/* Pass 1: ask for the required length, terminator included */
	need = MultiByteToWideChar(CP_UTF8, 0, src, -1, NULL, 0);
	if (!need)
		return ErrCode();

	result = malloc(sizeof(wchar_t) * (need + xtra));
	if (!result)
		return ENOMEM;

	/* Pass 2: convert into the new buffer */
	need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need);
	if (!need) {
		/* Fetch the error code before free() gets a chance to disturb it */
		rc = ErrCode();
		free(result);
		return rc;
	}

	dst->mn_alloced = 1;
	dst->mn_len = need - 1;
	dst->mn_val = result;
	return MDB_SUCCESS;
}
#endif /* defined(_WIN32) */
10320/** @} */
10321