1 | /** @file mdb.c |
2 | * @brief Lightning memory-mapped database library |
3 | * |
4 | * A Btree-based database management library modeled loosely on the |
5 | * BerkeleyDB API, but much simplified. |
6 | */ |
7 | /* |
8 | * Copyright 2011-2021 Howard Chu, Symas Corp. |
9 | * All rights reserved. |
10 | * |
11 | * Redistribution and use in source and binary forms, with or without |
12 | * modification, are permitted only as authorized by the OpenLDAP |
13 | * Public License. |
14 | * |
15 | * A copy of this license is available in the file LICENSE in the |
16 | * top-level directory of the distribution or, alternatively, at |
17 | * <http://www.OpenLDAP.org/license.html>. |
18 | * |
19 | * This code is derived from btree.c written by Martin Hedenfalk. |
20 | * |
21 | * Copyright (c) 2009, 2010 Martin Hedenfalk <[email protected]> |
22 | * |
23 | * Permission to use, copy, modify, and distribute this software for any |
24 | * purpose with or without fee is hereby granted, provided that the above |
25 | * copyright notice and this permission notice appear in all copies. |
26 | * |
27 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
28 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
29 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
30 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
31 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
32 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
33 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
34 | */ |
35 | #ifndef _GNU_SOURCE |
36 | #define _GNU_SOURCE 1 |
37 | #endif |
38 | #if defined(__WIN64__) |
39 | #define _FILE_OFFSET_BITS 64 |
40 | #endif |
41 | #ifdef _WIN32 |
42 | #include <malloc.h> |
43 | #include <windows.h> |
44 | #include <wchar.h> /* get wcscpy() */ |
45 | |
46 | /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it |
47 | * as int64 which is wrong. MSVC doesn't define it at all, so just |
48 | * don't use it. |
49 | */ |
50 | #define MDB_PID_T int |
51 | #define MDB_THR_T DWORD |
52 | #include <sys/types.h> |
53 | #include <sys/stat.h> |
54 | #ifdef __GNUC__ |
55 | # include <sys/param.h> |
56 | #else |
57 | # define LITTLE_ENDIAN 1234 |
58 | # define BIG_ENDIAN 4321 |
59 | # define BYTE_ORDER LITTLE_ENDIAN |
60 | # ifndef SSIZE_MAX |
61 | # define SSIZE_MAX INT_MAX |
62 | # endif |
63 | #endif |
64 | #else |
65 | #include <sys/types.h> |
66 | #include <sys/stat.h> |
67 | #define MDB_PID_T pid_t |
68 | #define MDB_THR_T pthread_t |
69 | #include <sys/param.h> |
70 | #include <sys/uio.h> |
71 | #include <sys/mman.h> |
72 | #ifdef HAVE_SYS_FILE_H |
73 | #include <sys/file.h> |
74 | #endif |
75 | #include <fcntl.h> |
76 | #endif |
77 | |
78 | #if defined(__mips) && defined(__linux) |
79 | /* MIPS has cache coherency issues, requires explicit cache control */ |
80 | #include <asm/cachectl.h> |
81 | extern int cacheflush(char *addr, int nbytes, int cache); |
82 | #define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache) |
83 | #else |
84 | #define CACHEFLUSH(addr, bytes, cache) |
85 | #endif |
86 | |
87 | #if defined(__linux) && !defined(MDB_FDATASYNC_WORKS) |
88 | /** fdatasync is broken on ext3/ext4fs on older kernels, see |
89 | * description in #mdb_env_open2 comments. You can safely |
90 | * define MDB_FDATASYNC_WORKS if this code will only be run |
91 | * on kernels 3.6 and newer. |
92 | */ |
93 | #define BROKEN_FDATASYNC |
94 | #endif |
95 | |
96 | #include <errno.h> |
97 | #include <limits.h> |
98 | #include <stddef.h> |
99 | #include <inttypes.h> |
100 | #include <stdio.h> |
101 | #include <stdlib.h> |
102 | #include <string.h> |
103 | #include <time.h> |
104 | |
105 | #ifdef _MSC_VER |
106 | #include <io.h> |
107 | typedef SSIZE_T ssize_t; |
108 | #else |
109 | #include <unistd.h> |
110 | #endif |
111 | |
112 | #if defined(__sun) || defined(ANDROID) |
113 | /* Most platforms have posix_memalign, older may only have memalign */ |
114 | #define HAVE_MEMALIGN 1 |
115 | #include <malloc.h> |
116 | /* On Solaris, we need the POSIX sigwait function */ |
117 | #if defined (__sun) |
118 | # define _POSIX_PTHREAD_SEMANTICS 1 |
119 | #endif |
120 | #endif |
121 | |
122 | #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) |
123 | #include <netinet/in.h> |
124 | #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */ |
125 | #endif |
126 | |
127 | #if defined(__FreeBSD__) && defined(__FreeBSD_version) && __FreeBSD_version >= 1100110 |
128 | # define MDB_USE_POSIX_MUTEX 1 |
129 | # define MDB_USE_ROBUST 1 |
130 | #elif defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) |
131 | # define MDB_USE_POSIX_SEM 1 |
132 | # define MDB_FDATASYNC fsync |
133 | #elif defined(ANDROID) |
134 | # define MDB_FDATASYNC fsync |
135 | #endif |
136 | |
137 | #ifndef _WIN32 |
138 | #include <pthread.h> |
139 | #include <signal.h> |
140 | #ifdef MDB_USE_POSIX_SEM |
141 | # define MDB_USE_HASH 1 |
142 | #include <semaphore.h> |
143 | #else |
144 | #define MDB_USE_POSIX_MUTEX 1 |
145 | #endif |
146 | #endif |
147 | |
148 | #if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) \ |
149 | + defined(MDB_USE_POSIX_MUTEX) != 1 |
150 | # error "Ambiguous shared-lock implementation" |
151 | #endif |
152 | |
153 | #ifdef USE_VALGRIND |
154 | #include <valgrind/memcheck.h> |
155 | #define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z) |
156 | #define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s) |
157 | #define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a) |
158 | #define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h) |
159 | #define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s) |
160 | #else |
161 | #define VGMEMP_CREATE(h,r,z) |
162 | #define VGMEMP_ALLOC(h,a,s) |
163 | #define VGMEMP_FREE(h,a) |
164 | #define VGMEMP_DESTROY(h) |
165 | #define VGMEMP_DEFINED(a,s) |
166 | #endif |
167 | |
168 | #ifndef BYTE_ORDER |
169 | # if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) |
170 | /* Solaris just defines one or the other */ |
171 | # define LITTLE_ENDIAN 1234 |
172 | # define BIG_ENDIAN 4321 |
173 | # ifdef _LITTLE_ENDIAN |
174 | # define BYTE_ORDER LITTLE_ENDIAN |
175 | # else |
176 | # define BYTE_ORDER BIG_ENDIAN |
177 | # endif |
178 | # else |
179 | # define BYTE_ORDER __BYTE_ORDER |
180 | # endif |
181 | #endif |
182 | |
183 | #ifndef LITTLE_ENDIAN |
184 | #define LITTLE_ENDIAN __LITTLE_ENDIAN |
185 | #endif |
186 | #ifndef BIG_ENDIAN |
187 | #define BIG_ENDIAN __BIG_ENDIAN |
188 | #endif |
189 | |
190 | #if defined(__i386) || defined(__x86_64) || defined(_M_IX86) |
191 | #define MISALIGNED_OK 1 |
192 | #endif |
193 | |
194 | #include "lmdb.h" |
195 | #include "midl.h" |
196 | |
197 | #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) |
198 | # error "Unknown or unsupported endianness (BYTE_ORDER)" |
199 | #elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF |
200 | # error "Two's complement, reasonably sized integer types, please" |
201 | #endif |
202 | |
203 | #ifdef __GNUC__ |
204 | /** Put infrequently used env functions in separate section */ |
205 | # ifdef __APPLE__ |
206 | # define ESECT __attribute__ ((section("__TEXT,text_env"))) |
207 | # else |
208 | # define ESECT __attribute__ ((section("text_env"))) |
209 | # endif |
210 | #else |
211 | #define ESECT |
212 | #endif |
213 | |
214 | #ifdef _WIN32 |
215 | #define CALL_CONV WINAPI |
216 | #else |
217 | #define CALL_CONV |
218 | #endif |
219 | |
220 | /** @defgroup internal LMDB Internals |
221 | * @{ |
222 | */ |
223 | /** @defgroup compat Compatibility Macros |
224 | * A bunch of macros to minimize the amount of platform-specific ifdefs |
225 | * needed throughout the rest of the code. When the features this library |
226 | * needs are similar enough to POSIX to be hidden in a one-or-two line |
227 | * replacement, this macro approach is used. |
228 | * @{ |
229 | */ |
230 | |
231 | /** Features under development */ |
232 | #ifndef MDB_DEVEL |
233 | #define MDB_DEVEL 0 |
234 | #endif |
235 | |
236 | /** Wrapper around __func__, which is a C99 feature */ |
237 | #if __STDC_VERSION__ >= 199901L |
238 | # define mdb_func_ __func__ |
239 | #elif __GNUC__ >= 2 || _MSC_VER >= 1300 |
240 | # define mdb_func_ __FUNCTION__ |
241 | #else |
242 | /* If a debug message says <mdb_unknown>(), update the #if statements above */ |
243 | # define mdb_func_ "<mdb_unknown>" |
244 | #endif |
245 | |
246 | /* Internal error codes, not exposed outside liblmdb */ |
247 | #define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) |
248 | #ifdef _WIN32 |
249 | #define MDB_OWNERDEAD ((int) WAIT_ABANDONED) |
250 | #elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD) |
251 | #define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */ |
252 | #endif |
253 | |
254 | #ifdef __GLIBC__ |
255 | #define GLIBC_VER ((__GLIBC__ << 16 )| __GLIBC_MINOR__) |
256 | #endif |
257 | /** Some platforms define the EOWNERDEAD error code |
258 | * even though they don't support Robust Mutexes. |
259 | * Compile with -DMDB_USE_ROBUST=0, or use some other |
260 | * mechanism like -DMDB_USE_POSIX_SEM instead of |
261 | * -DMDB_USE_POSIX_MUTEX. |
262 | * (Posix semaphores are not robust.) |
263 | */ |
264 | #ifndef MDB_USE_ROBUST |
265 | /* Android currently lacks Robust Mutex support. So does glibc < 2.4. */ |
266 | # if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \ |
267 | (defined(__GLIBC__) && GLIBC_VER < 0x020004)) |
268 | # define MDB_USE_ROBUST 0 |
269 | # else |
270 | # define MDB_USE_ROBUST 1 |
271 | # endif |
272 | #endif /* !MDB_USE_ROBUST */ |
273 | |
274 | #if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST) |
275 | /* glibc < 2.12 only provided _np API */ |
276 | # if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \ |
277 | (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST)) |
278 | # define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP |
279 | # define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) |
280 | # define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) |
281 | # endif |
282 | #endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */ |
283 | |
284 | #if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST) |
285 | #define MDB_ROBUST_SUPPORTED 1 |
286 | #endif |
287 | |
288 | #ifdef _WIN32 |
289 | #define MDB_USE_HASH 1 |
290 | #define MDB_PIDLOCK 0 |
291 | #define THREAD_RET DWORD |
292 | #define pthread_t HANDLE |
293 | #define pthread_mutex_t HANDLE |
294 | #define pthread_cond_t HANDLE |
295 | typedef HANDLE mdb_mutex_t, mdb_mutexref_t; |
296 | #define pthread_key_t DWORD |
297 | #define pthread_self() GetCurrentThreadId() |
298 | #define pthread_key_create(x,y) \ |
299 | ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0) |
300 | #define pthread_key_delete(x) TlsFree(x) |
301 | #define pthread_getspecific(x) TlsGetValue(x) |
302 | #define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode()) |
/* POSIX-style wrappers over Win32 synchronization objects.
 * Every argument is a pointer to a HANDLE. It is parenthesized before
 * being dereferenced so that expression arguments (for example `p + 1`
 * or `c ? a : b`) expand as intended.
 */
#define pthread_mutex_unlock(x)	ReleaseMutex(*(x))
#define pthread_mutex_lock(x)	WaitForSingleObject(*(x), INFINITE)
#define pthread_cond_signal(x)	SetEvent(*(x))
/* Signal the mutex and wait on the event in one call, then wait for the mutex again. */
#define pthread_cond_wait(cond,mutex)	do{SignalObjectAndWait(*(mutex), *(cond), INFINITE, FALSE); WaitForSingleObject(*(mutex), INFINITE);}while(0)
307 | #define THREAD_CREATE(thr,start,arg) \ |
308 | (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode()) |
309 | #define THREAD_FINISH(thr) \ |
310 | (WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0) |
311 | #define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE) |
312 | #define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex) |
313 | #define mdb_mutex_consistent(mutex) 0 |
314 | #define getpid() GetCurrentProcessId() |
315 | #define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) |
316 | #define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) |
317 | #define ErrCode() GetLastError() |
/** Store the system's memory page size in \b x.
 *	Expands to a single do/while(0) statement, so it can be used as the
 *	body of an unbraced if/else. The scratch variable carries a prefixed
 *	name so it is unlikely to capture an identifier used in \b x.
 */
#define GET_PAGESIZE(x)	do { SYSTEM_INFO mdb_si_; GetSystemInfo(&mdb_si_); (x) = mdb_si_.dwPageSize; } while (0)
319 | #define close(fd) (CloseHandle(fd) ? 0 : -1) |
320 | #define munmap(ptr,len) UnmapViewOfFile(ptr) |
321 | #ifdef PROCESS_QUERY_LIMITED_INFORMATION |
322 | #define MDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION |
323 | #else |
324 | #define MDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 |
325 | #endif |
326 | #define Z "I" |
327 | #else |
328 | #define THREAD_RET void * |
329 | #define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg) |
330 | #define THREAD_FINISH(thr) pthread_join(thr,NULL) |
331 | #define Z "z" /**< printf format modifier for size_t */ |
332 | |
333 | /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ |
334 | #define MDB_PIDLOCK 1 |
335 | |
336 | #ifdef MDB_USE_POSIX_SEM |
337 | |
338 | typedef sem_t *mdb_mutex_t, *mdb_mutexref_t; |
339 | #define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex) |
340 | #define UNLOCK_MUTEX(mutex) sem_post(mutex) |
341 | |
/** Wait on a POSIX semaphore, retrying whenever the wait is interrupted.
 * @param[in] sem the semaphore to wait on.
 * @return 0 once sem_wait() succeeds, otherwise the errno value left by
 * the failing sem_wait() call (EINTR is retried, never returned).
 */
static int
mdb_sem_wait(sem_t *sem)
{
	for (;;) {
		int err;

		if (sem_wait(sem) == 0)
			return 0;
		/* Capture errno right away, before anything can overwrite it. */
		err = errno;
		if (err != EINTR)
			return err;
	}
}
349 | |
350 | #else /* MDB_USE_POSIX_MUTEX: */ |
/** The shared mutex/semaphore, in the form in which the original is stored.
 *
 *	It must not be copied; instead, refer to it through an #mdb_mutexref_t.
 *	When mdb_mutexref_t is a pointer and mdb_mutex_t is not, mdb_mutex_t
 *	is an array[size 1] so that it can be assigned to the pointer.
 */
357 | typedef pthread_mutex_t mdb_mutex_t[1]; |
358 | /** Reference to an #mdb_mutex_t */ |
359 | typedef pthread_mutex_t *mdb_mutexref_t; |
360 | /** Lock the reader or writer mutex. |
361 | * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX(). |
362 | */ |
363 | #define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex) |
364 | /** Unlock the reader or writer mutex. |
365 | */ |
366 | #define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex) |
367 | /** Mark mutex-protected data as repaired, after death of previous owner. |
368 | */ |
369 | #define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex) |
370 | #endif /* MDB_USE_POSIX_SEM */ |
371 | |
372 | /** Get the error code for the last failed system function. |
373 | */ |
374 | #define ErrCode() errno |
375 | |
376 | /** An abstraction for a file handle. |
377 | * On POSIX systems file handles are small integers. On Windows |
378 | * they're opaque pointers. |
379 | */ |
380 | #define HANDLE int |
381 | |
382 | /** A value for an invalid file handle. |
383 | * Mainly used to initialize file variables and signify that they are |
384 | * unused. |
385 | */ |
386 | #define INVALID_HANDLE_VALUE (-1) |
387 | |
388 | /** Get the size of a memory page for the system. |
389 | * This is the basic size that the platform's memory manager uses, and is |
390 | * fundamental to the use of memory-mapped files. |
391 | */ |
392 | #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) |
393 | #endif |
394 | |
395 | #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) |
396 | #define MNAME_LEN 32 |
397 | #else |
398 | #define MNAME_LEN (sizeof(pthread_mutex_t)) |
399 | #endif |
400 | |
401 | /** @} */ |
402 | |
403 | #ifdef MDB_ROBUST_SUPPORTED |
404 | /** Lock mutex, handle any error, set rc = result. |
405 | * Return 0 on success, nonzero (not rc) on error. |
406 | */ |
407 | #define LOCK_MUTEX(rc, env, mutex) \ |
408 | (((rc) = LOCK_MUTEX0(mutex)) && \ |
409 | ((rc) = mdb_mutex_failed(env, mutex, rc))) |
410 | static int mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc); |
411 | #else |
412 | #define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex)) |
413 | #define mdb_mutex_failed(env, mutex, rc) (rc) |
414 | #endif |
415 | |
416 | #ifndef _WIN32 |
417 | /** A flag for opening a file and requesting synchronous data writes. |
418 | * This is only used when writing a meta page. It's not strictly needed; |
419 | * we could just do a normal write and then immediately perform a flush. |
420 | * But if this flag is available it saves us an extra system call. |
421 | * |
422 | * @note If O_DSYNC is undefined but exists in /usr/include, |
423 | * preferably set some compiler flag to get the definition. |
424 | */ |
425 | #ifndef MDB_DSYNC |
426 | # ifdef O_DSYNC |
427 | # define MDB_DSYNC O_DSYNC |
428 | # else |
429 | # define MDB_DSYNC O_SYNC |
430 | # endif |
431 | #endif |
432 | #endif |
433 | |
434 | /** Function for flushing the data of a file. Define this to fsync |
435 | * if fdatasync() is not supported. |
436 | */ |
437 | #ifndef MDB_FDATASYNC |
438 | # define MDB_FDATASYNC fdatasync |
439 | #endif |
440 | |
441 | #ifndef MDB_MSYNC |
442 | # define MDB_MSYNC(addr,len,flags) msync(addr,len,flags) |
443 | #endif |
444 | |
445 | #ifndef MS_SYNC |
446 | #define MS_SYNC 1 |
447 | #endif |
448 | |
449 | #ifndef MS_ASYNC |
450 | #define MS_ASYNC 0 |
451 | #endif |
452 | |
453 | /** A page number in the database. |
454 | * Note that 64 bit page numbers are overkill, since pages themselves |
455 | * already represent 12-13 bits of addressable memory, and the OS will |
456 | * always limit applications to a maximum of 63 bits of address space. |
457 | * |
458 | * @note In the #MDB_node structure, we only store 48 bits of this value, |
459 | * which thus limits us to only 60 bits of addressable data. |
460 | */ |
461 | typedef MDB_ID pgno_t; |
462 | |
463 | /** A transaction ID. |
464 | * See struct MDB_txn.mt_txnid for details. |
465 | */ |
466 | typedef MDB_ID txnid_t; |
467 | |
468 | /** @defgroup debug Debug Macros |
469 | * @{ |
470 | */ |
471 | #ifndef MDB_DEBUG |
472 | /** Enable debug output. Needs variable argument macros (a C99 feature). |
473 | * Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs |
474 | * read from and written to the database (used for free space management). |
475 | */ |
476 | #define MDB_DEBUG 0 |
477 | #endif |
478 | |
479 | #if MDB_DEBUG |
480 | static int mdb_debug; |
481 | static txnid_t mdb_debug_start; |
482 | |
483 | /** Print a debug message with printf formatting. |
484 | * Requires double parenthesis around 2 or more args. |
485 | */ |
486 | # define DPRINTF(args) ((void) ((mdb_debug) && DPRINTF0 args)) |
487 | # define DPRINTF0(fmt, ...) \ |
488 | fprintf(stderr, "%s:%d " fmt "\n", mdb_func_, __LINE__, __VA_ARGS__) |
489 | #else |
490 | # define DPRINTF(args) ((void) 0) |
491 | #endif |
492 | /** Print a debug string. |
493 | * The string is printed literally, with no format processing. |
494 | */ |
495 | #define DPUTS(arg) DPRINTF(("%s", arg)) |
496 | /** Debugging output value of a cursor DBI: Negative in a sub-cursor. */ |
497 | #define DDBI(mc) \ |
498 | (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) |
499 | /** @} */ |
500 | |
501 | /** @brief The maximum size of a database page. |
502 | * |
503 | * It is 32k or 64k, since value-PAGEBASE must fit in |
504 | * #MDB_page.%mp_upper. |
505 | * |
506 | * LMDB will use database pages < OS pages if needed. |
507 | * That causes more I/O in write transactions: The OS must |
508 | * know (read) the whole page before writing a partial page. |
509 | * |
510 | * Note that we don't currently support Huge pages. On Linux, |
511 | * regular data files cannot use Huge pages, and in general |
512 | * Huge pages aren't actually pageable. We rely on the OS |
513 | * demand-pager to read our data and page it out when memory |
514 | * pressure from other processes is high. So until OSs have |
515 | * actual paging support for Huge pages, they're not viable. |
516 | */ |
517 | #define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) |
518 | |
519 | /** The minimum number of keys required in a database page. |
520 | * Setting this to a larger value will place a smaller bound on the |
521 | * maximum size of a data item. Data items larger than this size will |
522 | * be pushed into overflow pages instead of being stored directly in |
523 | * the B-tree node. This value used to default to 4. With a page size |
524 | * of 4096 bytes that meant that any item larger than 1024 bytes would |
525 | * go into an overflow page. That also meant that on average 2-3KB of |
526 | * each overflow page was wasted space. The value cannot be lower than |
527 | * 2 because then there would no longer be a tree structure. With this |
528 | * value, items larger than 2KB will go into overflow pages, and on |
529 | * average only 1KB will be wasted. |
530 | */ |
531 | #define MDB_MINKEYS 2 |
532 | |
533 | /** A stamp that identifies a file as an LMDB file. |
534 | * There's nothing special about this value other than that it is easily |
535 | * recognizable, and it will reflect any byte order mismatches. |
536 | */ |
537 | #define MDB_MAGIC 0xBEEFC0DE |
538 | |
539 | /** The version number for a database's datafile format. */ |
540 | #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) |
541 | /** The version number for a database's lockfile format. */ |
542 | #define MDB_LOCK_VERSION 1 |
543 | |
544 | /** @brief The max size of a key we can write, or 0 for computed max. |
545 | * |
546 | * This macro should normally be left alone or set to 0. |
547 | * Note that a database with big keys or dupsort data cannot be |
548 | * reliably modified by a liblmdb which uses a smaller max. |
549 | * The default is 511 for backwards compat, or 0 when #MDB_DEVEL. |
550 | * |
551 | * Other values are allowed, for backwards compat. However: |
552 | * A value bigger than the computed max can break if you do not |
553 | * know what you are doing, and liblmdb <= 0.9.10 can break when |
554 | * modifying a DB with keys/dupsort data bigger than its max. |
555 | * |
556 | * Data items in an #MDB_DUPSORT database are also limited to |
557 | * this size, since they're actually keys of a sub-DB. Keys and |
558 | * #MDB_DUPSORT data items must fit on a node in a regular page. |
559 | */ |
560 | #ifndef MDB_MAXKEYSIZE |
561 | #define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) |
562 | #endif |
563 | |
564 | /** The maximum size of a key we can write to the environment. */ |
565 | #if MDB_MAXKEYSIZE |
566 | #define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) |
567 | #else |
568 | #define ENV_MAXKEY(env) ((env)->me_maxkey) |
569 | #endif |
570 | |
571 | /** @brief The maximum size of a data item. |
572 | * |
573 | * We only store a 32 bit value for node sizes. |
574 | */ |
575 | #define MAXDATASIZE 0xffffffffUL |
576 | |
577 | #if MDB_DEBUG |
578 | /** Key size which fits in a #DKBUF. |
579 | * @ingroup debug |
580 | */ |
581 | #define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) |
582 | /** A key buffer. |
583 | * @ingroup debug |
584 | * This is used for printing a hex dump of a key's contents. |
585 | */ |
586 | #define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1] |
587 | /** Display a key in hex. |
588 | * @ingroup debug |
589 | * Invoke a function to display a key in hex. |
590 | */ |
591 | #define DKEY(x) mdb_dkey(x, kbuf) |
592 | #else |
593 | #define DKBUF |
594 | #define DKEY(x) 0 |
595 | #endif |
596 | |
597 | /** An invalid page number. |
598 | * Mainly used to denote an empty tree. |
599 | */ |
600 | #define P_INVALID (~(pgno_t)0) |
601 | |
/** True when every bit of the mask \b f is also set in the flag word \b w. */
#define F_ISSET(w, f)	 ((f) == ((w) & (f)))
604 | |
/** Round \b n up to the nearest even number. */
#define EVEN(n)		(-2 & ((n) + 1U))	/* -2 is sign-extended to the width of n+1U */
607 | |
608 | /** Used for offsets within a single page. |
609 | * Since memory pages are typically 4 or 8KB in size, 12-13 bits, |
610 | * this is plenty. |
611 | */ |
612 | typedef uint16_t indx_t; |
613 | |
614 | /** Default size of memory map. |
615 | * This is certainly too small for any actual applications. Apps should always set |
616 | * the size explicitly using #mdb_env_set_mapsize(). |
617 | */ |
618 | #define DEFAULT_MAPSIZE 1048576 |
619 | |
620 | /** @defgroup readers Reader Lock Table |
621 | * Readers don't acquire any locks for their data access. Instead, they |
622 | * simply record their transaction ID in the reader table. The reader |
623 | * mutex is needed just to find an empty slot in the reader table. The |
624 | * slot's address is saved in thread-specific data so that subsequent read |
625 | * transactions started by the same thread need no further locking to proceed. |
626 | * |
627 | * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. |
628 | * |
629 | * No reader table is used if the database is on a read-only filesystem, or |
630 | * if #MDB_NOLOCK is set. |
631 | * |
632 | * Since the database uses multi-version concurrency control, readers don't |
633 | * actually need any locking. This table is used to keep track of which |
634 | * readers are using data from which old transactions, so that we'll know |
635 | * when a particular old transaction is no longer in use. Old transactions |
636 | * that have discarded any data pages can then have those pages reclaimed |
637 | * for use by a later write transaction. |
638 | * |
639 | * The lock table is constructed such that reader slots are aligned with the |
640 | * processor's cache line size. Any slot is only ever used by one thread. |
641 | * This alignment guarantees that there will be no contention or cache |
642 | * thrashing as threads update their own slot info, and also eliminates |
643 | * any need for locking when accessing a slot. |
644 | * |
645 | * A writer thread will scan every slot in the table to determine the oldest |
646 | * outstanding reader transaction. Any freed pages older than this will be |
647 | * reclaimed by the writer. The writer doesn't use any locks when scanning |
648 | * this table. This means that there's no guarantee that the writer will |
649 | * see the most up-to-date reader info, but that's not required for correct |
650 | * operation - all we need is to know the upper bound on the oldest reader, |
651 | * we don't care at all about the newest reader. So the only consequence of |
652 | * reading stale information here is that old pages might hang around a |
653 | * while longer before being reclaimed. That's actually good anyway, because |
654 | * the longer we delay reclaiming old pages, the more likely it is that a |
655 | * string of contiguous pages can be found after coalescing old pages from |
656 | * many old transactions together. |
657 | * @{ |
658 | */ |
/** Number of slots in the reader table.
 *	This value was chosen somewhat arbitrarily. 126 readers plus a
 *	couple of mutexes fit exactly into 8KB on my development machine.
 *	Applications should set the table size using #mdb_env_set_maxreaders().
 */
664 | #define DEFAULT_READERS 126 |
665 | |
666 | /** The size of a CPU cache line in bytes. We want our lock structures |
667 | * aligned to this size to avoid false cache line sharing in the |
668 | * lock table. |
669 | * This value works for most CPUs. For Itanium this should be 128. |
670 | */ |
671 | #ifndef CACHELINE |
672 | #define CACHELINE 64 |
673 | #endif |
674 | |
675 | /** The information we store in a single slot of the reader table. |
676 | * In addition to a transaction ID, we also record the process and |
677 | * thread ID that owns a slot, so that we can detect stale information, |
678 | * e.g. threads or processes that went away without cleaning up. |
679 | * @note We currently don't check for stale records. We simply re-init |
680 | * the table when we know that we're the only process opening the |
681 | * lock file. |
682 | */ |
683 | typedef struct MDB_rxbody { |
684 | /** Current Transaction ID when this transaction began, or (txnid_t)-1. |
685 | * Multiple readers that start at the same time will probably have the |
686 | * same ID here. Again, it's not important to exclude them from |
687 | * anything; all we need to know is which version of the DB they |
688 | * started from so we can avoid overwriting any data used in that |
689 | * particular version. |
690 | */ |
691 | volatile txnid_t mrb_txnid; |
692 | /** The process ID of the process owning this reader txn. */ |
693 | volatile MDB_PID_T mrb_pid; |
694 | /** The thread ID of the thread owning this txn. */ |
695 | volatile MDB_THR_T mrb_tid; |
696 | } MDB_rxbody; |
697 | |
698 | /** The actual reader record, with cacheline padding. */ |
699 | typedef struct MDB_reader { |
700 | union { |
701 | MDB_rxbody mrx; |
702 | /** shorthand for mrb_txnid */ |
703 | #define mr_txnid mru.mrx.mrb_txnid |
704 | #define mr_pid mru.mrx.mrb_pid |
705 | #define mr_tid mru.mrx.mrb_tid |
706 | /** cache line alignment */ |
707 | char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)]; |
708 | } mru; |
709 | } MDB_reader; |
710 | |
711 | /** The header for the reader table. |
712 | * The table resides in a memory-mapped file. (This is a different file |
713 | * than is used for the main database.) |
714 | * |
715 | * For POSIX the actual mutexes reside in the shared memory of this |
716 | * mapped file. On Windows, mutexes are named objects allocated by the |
717 | * kernel; we store the mutex names in this mapped file so that other |
718 | * processes can grab them. This same approach is also used on |
719 | * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support |
720 | * process-shared POSIX mutexes. For these cases where a named object |
721 | * is used, the object name is derived from a 64 bit FNV hash of the |
722 | * environment pathname. As such, naming collisions are extremely |
723 | * unlikely. If a collision occurs, the results are unpredictable. |
724 | */ |
725 | typedef struct MDB_txbody { |
726 | /** Stamp identifying this as an LMDB file. It must be set |
727 | * to #MDB_MAGIC. */ |
728 | uint32_t mtb_magic; |
729 | /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ |
730 | uint32_t mtb_format; |
731 | #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) |
732 | char mtb_rmname[MNAME_LEN]; |
733 | #else |
734 | /** Mutex protecting access to this table. |
735 | * This is the reader table lock used with LOCK_MUTEX(). |
736 | */ |
737 | mdb_mutex_t mtb_rmutex; |
738 | #endif |
739 | /** The ID of the last transaction committed to the database. |
740 | * This is recorded here only for convenience; the value can always |
741 | * be determined by reading the main database meta pages. |
742 | */ |
743 | volatile txnid_t mtb_txnid; |
744 | /** The number of slots that have been used in the reader table. |
745 | * This always records the maximum count, it is not decremented |
746 | * when readers release their slots. |
747 | */ |
748 | volatile unsigned mtb_numreaders; |
749 | } MDB_txbody; |
750 | |
751 | /** The actual reader table definition. */ |
752 | typedef struct MDB_txninfo { |
753 | union { |
754 | MDB_txbody mtb; |
755 | #define mti_magic mt1.mtb.mtb_magic |
756 | #define mti_format mt1.mtb.mtb_format |
757 | #define mti_rmutex mt1.mtb.mtb_rmutex |
758 | #define mti_rmname mt1.mtb.mtb_rmname |
759 | #define mti_txnid mt1.mtb.mtb_txnid |
760 | #define mti_numreaders mt1.mtb.mtb_numreaders |
761 | char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; |
762 | } mt1; |
763 | union { |
764 | #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) |
765 | char mt2_wmname[MNAME_LEN]; |
766 | #define mti_wmname mt2.mt2_wmname |
767 | #else |
768 | mdb_mutex_t mt2_wmutex; |
769 | #define mti_wmutex mt2.mt2_wmutex |
770 | #endif |
771 | char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; |
772 | } mt2; |
773 | MDB_reader mti_readers[1]; |
774 | } MDB_txninfo; |
775 | |
/** Lockfile format signature: #MDB_LOCK_VERSION plus a feature flag at
 *	bit 16 that is set whenever #MDB_PIDLOCK is nonzero.
 */
#define MDB_LOCK_FORMAT \
	((uint32_t) \
	 (/* Flags which describe functionality */ \
	  (!!(MDB_PIDLOCK) << 16) \
	  + (MDB_LOCK_VERSION)))
782 | /** @} */ |
783 | |
/** Common header for all page types. The page type depends on #mp_flags.
 *
 * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with
 * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages
 * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header.
 *
 * #P_OVERFLOW records occupy one or more contiguous pages where only the
 * first has a page header. They hold the real data of #F_BIGDATA nodes.
 *
 * #P_SUBP sub-pages are small leaf "pages" with duplicate data.
 * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page.
 * (Duplicate data can also go in sub-databases, which use normal pages.)
 *
 * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot.
 *
 * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once
 * in the snapshot: Either used by a database or listed in a freeDB record.
 *
 * The \b mp_* macros defined inside the struct body are shorthands for
 * the members of the two nested unions. Members of one union share
 * storage, so only one of them is meaningful at a time.
 */
typedef struct MDB_page {
#define mp_pgno mp_p.p_pgno
#define mp_next mp_p.p_next
	/** Page number, or list link; the two share the same storage */
	union {
		pgno_t p_pgno; /**< page number */
		struct MDB_page *p_next; /**< for in-memory list of freed pages */
	} mp_p;
	uint16_t mp_pad; /**< key size if this is a LEAF2 page */
/** @defgroup mdb_page Page Flags
 * @ingroup internal
 * Flags for the page headers.
 * @{
 */
#define P_BRANCH 0x01 /**< branch page */
#define P_LEAF 0x02 /**< leaf page */
#define P_OVERFLOW 0x04 /**< overflow page */
#define P_META 0x08 /**< meta page */
#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */
#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */
#define P_KEEP 0x8000 /**< leave this page alone during spill */
/** @} */
	uint16_t mp_flags; /**< @ref mdb_page */
#define mp_lower mp_pb.pb.pb_lower
#define mp_upper mp_pb.pb.pb_upper
#define mp_pages mp_pb.pb_pages
	/** Free-space bounds, or overflow page count; the two views share
	 * the same 32 bits of storage.
	 */
	union {
		struct {
			indx_t pb_lower; /**< lower bound of free space */
			indx_t pb_upper; /**< upper bound of free space */
		} pb;
		uint32_t pb_pages; /**< number of overflow pages */
	} mp_pb;
	/** Declared with one element; its offset defines #PAGEHDRSZ */
	indx_t mp_ptrs[1]; /**< dynamic size */
} MDB_page;
838 | |
/** Size of the page header, excluding dynamic data at the end.
 * This is the offset of #MDB_page.%mp_ptrs within the struct.
 */
#define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs))

/** Address of first usable data byte in a page, after the header */
#define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ))

/** ITS#7713, change PAGEBASE to handle 65536 byte pages.
 * Evaluates to #PAGEHDRSZ when #MDB_DEVEL is nonzero and to 0 otherwise.
 * #NODEPTR adds it to the offsets stored in mp_ptrs, and #NUMKEYS
 * subtracts it from the header size.
 */
#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0)

/** Number of nodes on a page.
 * Takes the distance from (#PAGEHDRSZ - #PAGEBASE) to mp_lower and halves
 * it with the shift. NOTE(review): the halving presumes 2-byte mp_ptrs
 * entries; indx_t is defined outside this section - confirm.
 */
#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1)

/** The amount of space remaining in the page.
 * The difference mp_upper - mp_lower, converted to indx_t.
 * Evaluates \b p twice.
 */
#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)

/** The percentage of space used in the page, in tenths of a percent.
 * Used space is the page size minus the header and minus #SIZELEFT,
 * relative to the page size minus the header. The 1000L factor makes
 * the arithmetic at least long. Evaluates \b env and \b p more than once.
 */
#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
				((env)->me_psize - PAGEHDRSZ))
/** The minimum page fill factor, in tenths of a percent.
 * Pages emptier than this are candidates for merging.
 */
#define FILL_THRESHOLD 250

/** Test if a page is a leaf page */
#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF)
/** Test if a page is a LEAF2 page */
#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2)
/** Test if a page is a branch page */
#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH)
/** Test if a page is an overflow page */
#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW)
/** Test if a page is a sub page */
#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP)

/** The number of overflow pages needed to store the given size.
 * Equivalent to (#PAGEHDRSZ + size) divided by psize, rounded up: the
 * page header is counted together with the data.
 */
#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1)

/** Link in #MDB_txn.%mt_loose_pgs list.
 * Kept outside the page header, which is needed when reusing the page.
 * The link is read at (p) + 2, so with \b p an #MDB_page pointer it
 * lies two sizeof(#MDB_page) units past the start of the page.
 */
#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2))
880 | |
/** Header for a single key/data pair within a page.
 * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2.
 * We guarantee 2-byte alignment for 'MDB_node's.
 *
 * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child
 * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used
 * for pgno. (Branch nodes have no flags). Lo and hi are in host byte
 * order in case some accesses can be optimized to 32-bit word access.
 *
 * Leaf node flags describe node contents. #F_BIGDATA says the node's
 * data part is the page number of an overflow page with actual data.
 * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in
 * a sub-page/sub-database, and named databases (just #F_SUBDATA).
 *
 * All four header fields are unsigned short. The declaration order of
 * #mn_lo and #mn_hi is swapped depending on BYTE_ORDER.
 */
typedef struct MDB_node {
	/** part of data size or pgno
	 *	@{ */
#if BYTE_ORDER == LITTLE_ENDIAN
	unsigned short mn_lo, mn_hi;
#else
	unsigned short mn_hi, mn_lo;
#endif
	/** @} */
/** @defgroup mdb_node Node Flags
 * @ingroup internal
 * Flags for node headers.
 * @{
 */
#define F_BIGDATA 0x01 /**< data put on overflow page */
#define F_SUBDATA 0x02 /**< data is a sub-database */
#define F_DUPDATA 0x04 /**< data has duplicates */

/** valid flags for #mdb_node_add() */
#define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND)

/** @} */
	unsigned short mn_flags; /**< @ref mdb_node */
	unsigned short mn_ksize; /**< key size */
	/** Declared with one element; its offset defines #NODESIZE */
	char mn_data[1]; /**< key and data are appended here */
} MDB_node;
921 | |
/** Size of the node header, excluding dynamic data at the end.
 * This is the offset of #MDB_node.%mn_data within the struct.
 */
#define NODESIZE offsetof(MDB_node, mn_data)

/** Bit position of top word in page number, for shifting mn_flags.
 * 32 when pgno_t can hold values above 0xffffffff, otherwise 0.
 */
#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)

/** Size of a node in a branch page with a given key.
 * This is just the node header plus the key, there is no data.
 * A NULL \b k counts as a key of size 0.
 */
#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))

/** Size of a node in a leaf page with a given key and data.
 * This is node header plus key plus data size.
 * Unlike #INDXSIZE, neither argument may be NULL.
 */
#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)

/** Address of node \b i in page \b p.
 * The offset stored in mp_ptrs[i] is taken relative to the start of
 * the page, plus #PAGEBASE. Evaluates \b p twice.
 */
#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE))

/** Address of the key for the node: the start of mn_data */
#define NODEKEY(node) (void *)((node)->mn_data)

/** Address of the data for a node: mn_ksize bytes past the start of
 * mn_data, i.e. directly after the key.
 */
#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize)

/** Get the page number pointed to by a branch node.
 * Combines mn_lo (bits 0-15), mn_hi (bits 16-31) and, when
 * #PGNO_TOPWORD is nonzero, mn_flags shifted up by #PGNO_TOPWORD.
 */
#define NODEPGNO(node) \
	((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
	 (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
/** Set the page number in a branch node.
 * The inverse of #NODEPGNO. Each field is an unsigned short, so each
 * assignment keeps only the low 16 bits of the value stored.
 * Evaluates \b node and \b pgno more than once.
 */
#define SETPGNO(node,pgno) do { \
	(node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
	if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)

/** Get the size of the data in a leaf node.
 * Combines mn_lo (bits 0-15) and mn_hi (bits 16-31).
 */
#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
/** Set the size of the data for a leaf node.
 * The inverse of #NODEDSZ. Evaluates \b node and \b size twice.
 */
#define SETDSZ(node,size) do { \
	(node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
/** The size of a key in a node */
#define NODEKSZ(node) ((node)->mn_ksize)
963 | |
/** Copy a page number from src to dst.
 * With #MISALIGNED_OK this is a plain assignment. Otherwise the value
 * is copied as a sequence of unsigned shorts: four of them when
 * SIZE_MAX exceeds 4294967295UL, else two.
 *
 * In the unsigned-short variants both arguments must be lvalues, since
 * their addresses are taken. Those variants declare locals named \b s
 * and \b d, so arguments that mention such names would be captured.
 */
#ifdef MISALIGNED_OK
#define COPY_PGNO(dst,src) dst = src
#else
#if SIZE_MAX > 4294967295UL
#define COPY_PGNO(dst,src) do { \
	unsigned short *s, *d; \
	s = (unsigned short *)&(src); \
	d = (unsigned short *)&(dst); \
	*d++ = *s++; \
	*d++ = *s++; \
	*d++ = *s++; \
	*d = *s; \
} while (0)
#else
#define COPY_PGNO(dst,src) do { \
	unsigned short *s, *d; \
	s = (unsigned short *)&(src); \
	d = (unsigned short *)&(dst); \
	*d++ = *s++; \
	*d = *s; \
} while (0)
#endif
#endif
/** The address of a key in a LEAF2 page.
 * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs.
 * There are no node headers, keys are stored contiguously.
 * Key \b i of size \b ks starts (i * ks) bytes after the page header.
 * The result has type char *.
 */
#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
993 | |
/** Set the \b node's key into \b keyptr, if requested.
 * Does nothing when \b keyptr is NULL. Otherwise stores the key size
 * (#NODEKSZ) and key address (#NODEKEY) of \b node through \b keyptr.
 * No bytes are copied; mv_data points into the node.
 *
 * Expands to a brace-enclosed block rather than do/while(0), so do not
 * use it as the unbraced body of an \c if that has an \c else.
 * Evaluates both arguments more than once.
 */
#define MDB_GET_KEY(node, keyptr)	{ if ((keyptr) != NULL) { \
	(keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } }

/** Set the \b node's key into \b key.
 * Like #MDB_GET_KEY, but \b key is the value itself (an lvalue), not a
 * pointer to it, and it is always written. The argument is
 * parenthesized so that expressions such as \c *ptr expand correctly.
 *
 * Expands to a brace-enclosed block, with the same caveat as
 * #MDB_GET_KEY. Evaluates both arguments twice.
 */
#define MDB_GET_KEY2(node, key)	{ (key).mv_size = NODEKSZ(node); (key).mv_data = NODEKEY(node); }
1000 | |
/** Information about a single database in the environment.
 *
 * #MDB_meta stores an array of these records (mm_dbs). In the
 * #FREE_DBI entry of that array, #md_pad and #md_flags are reused
 * through the \b mm_psize and \b mm_flags macros defined in #MDB_meta.
 */
typedef struct MDB_db {
	uint32_t md_pad; /**< also ksize for LEAF2 pages */
	uint16_t md_flags; /**< @ref mdb_dbi_open */
	uint16_t md_depth; /**< depth of this tree */
	pgno_t md_branch_pages; /**< number of internal pages */
	pgno_t md_leaf_pages; /**< number of leaf pages */
	pgno_t md_overflow_pages; /**< number of overflow pages */
	size_t md_entries; /**< number of data items */
	pgno_t md_root; /**< the root page of this tree */
} MDB_db;
1012 | |
#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */
/** Every bit of a 16-bit flag word except #MDB_VALID (i.e. 0x7fff) */
#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID))
/** #mdb_dbi_open() flags */
#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
	MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)

/** Handle for the DB used to track free pages. */
#define FREE_DBI 0
/** Handle for the default DB. */
#define MAIN_DBI 1
/** Number of DBs in metapage (free and main) - also hardcoded elsewhere.
 * Sizes the #MDB_meta.%mm_dbs array.
 */
#define CORE_DBS 2

/** Number of meta pages - also hardcoded elsewhere.
 * Sizes the #MDB_env.%me_metas array.
 */
#define NUM_METAS 2
1028 | |
/** Meta page content.
 * A meta page is the start point for accessing a database snapshot.
 * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
 *
 * The \b mm_psize and \b mm_flags macros defined inside the struct body
 * do not name members of their own: they alias the md_pad and md_flags
 * fields of the #FREE_DBI record in #mm_dbs.
 */
typedef struct MDB_meta {
		/** Stamp identifying this as an LMDB file. It must be set
		 *	to #MDB_MAGIC. */
	uint32_t mm_magic;
		/** Version number of this file. Must be set to #MDB_DATA_VERSION. */
	uint32_t mm_version;
	void *mm_address; /**< address for fixed mapping */
	size_t mm_mapsize; /**< size of mmap region */
	MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */
	/** The size of pages used in this DB */
#define mm_psize mm_dbs[FREE_DBI].md_pad
	/** Any persistent environment flags. @ref mdb_env */
#define mm_flags mm_dbs[FREE_DBI].md_flags
	/** Last used page in the datafile.
	 *	Actually the file may be shorter if the freeDB lists the final pages.
	 */
	pgno_t mm_last_pg;
	volatile txnid_t mm_txnid; /**< txnid that committed this page */
} MDB_meta;
1052 | |
/** Buffer for a stack-allocated meta page.
 * The members define size and alignment, and silence type
 * aliasing warnings. They are not used directly; that could
 * mean incorrectly using several union members in parallel.
 *
 * The second member lays out #PAGEHDRSZ bytes of padding followed by
 * an #MDB_meta, mirroring a page header followed by the meta content.
 */
typedef union MDB_metabuf {
	MDB_page mb_page;
	struct {
		char mm_pad[PAGEHDRSZ];
		MDB_meta mm_meta;
	} mb_metabuf;
} MDB_metabuf;
1065 | |
/** Auxiliary DB info.
 * The information here is mostly static/read-only. There is
 * only a single copy of this record in the environment.
 * #MDB_env.%me_dbxs holds the array of these records, and
 * transactions and cursors refer to them by pointer
 * (#MDB_txn.%mt_dbxs, #MDB_cursor.%mc_dbx).
 */
typedef struct MDB_dbx {
	MDB_val md_name; /**< name of the database */
	MDB_cmp_func *md_cmp; /**< function for comparing keys */
	MDB_cmp_func *md_dcmp; /**< function for comparing data items */
	MDB_rel_func *md_rel; /**< user relocate function */
	void *md_relctx; /**< user-provided context for md_rel */
} MDB_dbx;
1077 | |
/** A database transaction.
 * Every operation requires a transaction handle.
 *
 * The per-DB members (#mt_dbxs, #mt_dbs, #mt_dbiseqs, #mt_cursors,
 * #mt_dbflags) are arrays; #mt_numdbs is the number of DB records
 * in use.
 */
struct MDB_txn {
	MDB_txn *mt_parent; /**< parent of a nested txn */
	/** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */
	MDB_txn *mt_child;
	pgno_t mt_next_pgno; /**< next unallocated page */
	/** The ID of this transaction. IDs are integers incrementing from 1.
	 *	Only committed write transactions increment the ID. If a transaction
	 *	aborts, the ID may be re-used by the next writer.
	 */
	txnid_t mt_txnid;
	MDB_env *mt_env; /**< the DB environment */
	/** The list of pages that became unused during this transaction.
	 */
	MDB_IDL mt_free_pgs;
	/** The list of loose pages that became unused and may be reused
	 *	in this transaction, linked through #NEXT_LOOSE_PAGE(page).
	 */
	MDB_page *mt_loose_pgs;
	/** Number of loose pages (#mt_loose_pgs) */
	int mt_loose_count;
	/** The sorted list of dirty pages we temporarily wrote to disk
	 *	because the dirty list was full. page numbers in here are
	 *	shifted left by 1, deleted slots have the LSB set.
	 */
	MDB_IDL mt_spill_pgs;
	/** The two members share storage: which one is meaningful depends
	 *	on whether this is a write txn or a read txn.
	 */
	union {
		/** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */
		MDB_ID2L dirty_list;
		/** For read txns: This thread/txn's reader table slot, or NULL. */
		MDB_reader *reader;
	} mt_u;
	/** Array of records for each DB known in the environment. */
	MDB_dbx *mt_dbxs;
	/** Array of MDB_db records for each known DB */
	MDB_db *mt_dbs;
	/** Array of sequence numbers for each DB handle */
	unsigned int *mt_dbiseqs;
/** @defgroup mt_dbflag	Transaction DB Flags
 *	@ingroup internal
 * @{
 */
#define DB_DIRTY 0x01 /**< DB was written in this txn */
#define DB_STALE 0x02 /**< Named-DB record is older than txnID */
#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */
#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */
#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */
#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */
/** @} */
	/** In write txns, array of cursors for each DB */
	MDB_cursor **mt_cursors;
	/** Array of flags for each DB */
	unsigned char *mt_dbflags;
	/** Number of DB records in use, or 0 when the txn is finished.
	 *	This number only ever increments until the txn finishes; we
	 *	don't decrement it when individual DB handles are closed.
	 */
	MDB_dbi mt_numdbs;

/** @defgroup mdb_txn	Transaction Flags
 *	@ingroup internal
 *	@{
 */
	/** #mdb_txn_begin() flags */
#define MDB_TXN_BEGIN_FLAGS MDB_RDONLY
#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */
	/* internal txn flags.
	 * MDB_TXN_RDONLY above and MDB_TXN_WRITEMAP below reuse the values
	 * of MDB_RDONLY and MDB_WRITEMAP; the remaining flags use the low
	 * bits 0x01-0x10.
	 */
#define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */
#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */
#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */
#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */
#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */
#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */
	/** most operations on the txn are currently illegal */
#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD)
/** @} */
	unsigned int mt_flags; /**< @ref mdb_txn */
	/** #dirty_list room: Array size - \#dirty pages visible to this txn.
	 *	Includes ancestor txns' dirty pages not hidden by other txns'
	 *	dirty/spilled pages. Thus commit(nested txn) has room to merge
	 *	dirty_list into mt_parent after freeing hidden mt_parent pages.
	 */
	unsigned int mt_dirty_room;
};
1164 | |
/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
 * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
 * raise this on a 64 bit machine.
 * This is the length of the #MDB_cursor.%mc_pg and #MDB_cursor.%mc_ki
 * arrays.
 */
#define CURSOR_STACK 32

/* Forward declaration: MDB_cursor holds a pointer to this type, which
 * is only defined after MDB_cursor because it embeds an MDB_cursor.
 */
struct MDB_xcursor;
1172 | |
/** Cursors are used for all DB operations.
 * A cursor holds a path of (page pointer, key index) from the DB
 * root to a position in the DB, plus other state. #MDB_DUPSORT
 * cursors include an xcursor to the current data item. Write txns
 * track their cursors and keep them up to date when data moves.
 * Exception: An xcursor's pointer to a #P_SUBP page can be stale.
 * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
 *
 * The path is kept in two parallel arrays of #CURSOR_STACK entries,
 * #mc_pg and #mc_ki; #mc_snum is the number of entries pushed.
 */
struct MDB_cursor {
	/** Next cursor on this DB in this txn */
	MDB_cursor *mc_next;
	/** Backup of the original cursor if this cursor is a shadow */
	MDB_cursor *mc_backup;
	/** Context used for databases with #MDB_DUPSORT, otherwise NULL */
	struct MDB_xcursor *mc_xcursor;
	/** The transaction that owns this cursor */
	MDB_txn *mc_txn;
	/** The database handle this cursor operates on */
	MDB_dbi mc_dbi;
	/** The database record for this cursor */
	MDB_db *mc_db;
	/** The database auxiliary record for this cursor */
	MDB_dbx *mc_dbx;
	/** The @ref mt_dbflag for this database */
	unsigned char *mc_dbflag;
	unsigned short mc_snum; /**< number of pushed pages */
	unsigned short mc_top; /**< index of top page, normally mc_snum-1 */
/** @defgroup mdb_cursor	Cursor Flags
 *	@ingroup internal
 *	Cursor state flags.
 *	@{
 */
#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */
#define C_EOF 0x02 /**< No more data */
#define C_SUB 0x04 /**< Cursor is a sub-cursor */
#define C_DEL 0x08 /**< last op was a cursor_del */
#define C_UNTRACK 0x40 /**< Un-track cursor when closing */
/** @} */
	unsigned int mc_flags; /**< @ref mdb_cursor */
	MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */
	indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */
};
1215 | |
/** Context for sorted-dup records.
 * We could have gone to a fully recursive design, with arbitrarily
 * deep nesting of sub-databases. But for now we only handle these
 * levels - main DB, optional sub-DB, sorted-duplicate DB.
 *
 * The struct embeds a complete #MDB_cursor together with its own
 * #MDB_db, #MDB_dbx and flag byte, by value rather than by pointer.
 */
typedef struct MDB_xcursor {
	/** A sub-cursor for traversing the Dup DB */
	MDB_cursor mx_cursor;
	/** The database record for this Dup DB */
	MDB_db mx_db;
	/** The auxiliary DB record for this Dup DB */
	MDB_dbx mx_dbx;
	/** The @ref mt_dbflag for this Dup DB */
	unsigned char mx_dbflag;
} MDB_xcursor;
1231 | |
/** Check if there is an inited xcursor.
 * True when \b mc has a non-NULL mc_xcursor whose embedded cursor has
 * #C_INITIALIZED set. Evaluates \b mc twice.
 */
#define XCURSOR_INITED(mc) \
	((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))

/** Update the xcursor's sub-page pointer, if any, in \b mc. Needed
 * when the node which contains the sub-page may have moved. Called
 * with leaf page \b mp = mc->mc_pg[\b top].
 *
 * Does nothing unless the xcursor is inited and mc_ki[top] is a valid
 * index into \b mp. If the node at that index has #F_DUPDATA set and
 * #F_SUBDATA clear, the xcursor's mc_pg[0] is pointed at the node's
 * data. The early exit uses \c break inside the macro's own
 * do/while(0). \b mp is evaluated once; \b mc and \b top several times.
 */
#define XCURSOR_REFRESH(mc, top, mp) do { \
	MDB_page *xr_pg = (mp); \
	MDB_node *xr_node; \
	if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \
	xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \
	if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \
		(mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \
} while (0)
1248 | |
/** State of FreeDB old pages, stored in the MDB_env.
 * #MDB_env embeds one as me_pgstate, and #MDB_ntxn keeps a second
 * copy (mnt_pgstate) holding the parent transaction's saved state.
 */
typedef struct MDB_pgstate {
	pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */
	txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */
} MDB_pgstate;
1254 | |
/** The database environment.
 *
 * Several names that read like members are macros: \b me_pglast and
 * \b me_pghead alias fields of #me_pgstate, and with
 * #MDB_USE_POSIX_MUTEX \b me_rmutex and \b me_wmutex alias the locks
 * stored in the mapped #MDB_txninfo instead of being members here.
 */
struct MDB_env {
	HANDLE me_fd; /**< The main data file */
	HANDLE me_lfd; /**< The lock file */
	HANDLE me_mfd; /**< For writing and syncing the meta pages */
	/* The four flag values below occupy high bits.
	 * NOTE(review): presumably they live in me_flags next to the
	 * public environment flags - confirm against the users.
	 */
	/** Failed to update the meta page. Probably an I/O error. */
#define MDB_FATAL_ERROR 0x80000000U
	/** Some fields are initialized. */
#define MDB_ENV_ACTIVE 0x20000000U
	/** me_txkey is set */
#define MDB_ENV_TXKEY 0x10000000U
	/** fdatasync is unreliable */
#define MDB_FSYNCONLY 0x08000000U
	uint32_t me_flags; /**< @ref mdb_env */
	unsigned int me_psize; /**< DB page size, inited from me_os_psize */
	unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
	unsigned int me_maxreaders; /**< size of the reader table */
	/** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */
	volatile int me_close_readers;
	MDB_dbi me_numdbs; /**< number of DBs opened */
	MDB_dbi me_maxdbs; /**< size of the DB table */
	MDB_PID_T me_pid; /**< process ID of this env */
	char *me_path; /**< path to the DB files */
	char *me_map; /**< the memory map of the data file */
	MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
	MDB_meta *me_metas[NUM_METAS]; /**< pointers to the two meta pages */
	void *me_pbuf; /**< scratch area for DUPSORT put() */
	MDB_txn *me_txn; /**< current write transaction */
	MDB_txn *me_txn0; /**< prealloc'd write transaction */
	size_t me_mapsize; /**< size of the data memory map */
	off_t me_size; /**< current file size */
	pgno_t me_maxpg; /**< me_mapsize / me_psize */
	MDB_dbx *me_dbxs; /**< array of static DB info */
	uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
	unsigned int *me_dbiseqs; /**< array of dbi sequence numbers */
	pthread_key_t me_txkey; /**< thread-key for readers */
	txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */
	MDB_pgstate me_pgstate; /**< state of old pages from freeDB */
# define me_pglast me_pgstate.mf_pglast
# define me_pghead me_pgstate.mf_pghead
	MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */
	/** IDL of pages that became unused in a write txn */
	MDB_IDL me_free_pgs;
	/** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
	MDB_ID2L me_dirty_list;
	/** Max number of freelist items that can fit in a single overflow page */
	int me_maxfree_1pg;
	/** Max size of a node on a page */
	unsigned int me_nodemax;
	/* Only present when MDB_MAXKEYSIZE evaluates to 0 */
#if !(MDB_MAXKEYSIZE)
	unsigned int me_maxkey; /**< max size of a key */
#endif
	int me_live_reader; /**< have liveness lock in reader table */
#ifdef _WIN32
	int me_pidquery; /**< Used in OpenProcess */
#endif
#ifdef MDB_USE_POSIX_MUTEX /* Posix mutexes reside in shared mem */
# define me_rmutex me_txns->mti_rmutex /**< Shared reader lock */
# define me_wmutex me_txns->mti_wmutex /**< Shared writer lock */
#else
	/* Without MDB_USE_POSIX_MUTEX the two locks are members of the env */
	mdb_mutex_t me_rmutex;
	mdb_mutex_t me_wmutex;
#endif
	void *me_userctx; /**< User-settable context */
	MDB_assert_func *me_assert_func; /**< Callback for assertion failures */
};
1321 | |
/** Nested transaction.
 * The #MDB_txn is the first member, so a pointer to an MDB_ntxn and a
 * pointer to its mnt_txn refer to the same address.
 */
typedef struct MDB_ntxn {
	MDB_txn mnt_txn; /**< the transaction */
	MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */
} MDB_ntxn;
1327 | |
/** max number of pages to commit in one writev() call.
 * Defaults to 64, and is lowered to IOV_MAX when IOV_MAX is defined
 * and smaller than that.
 */
#define MDB_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES
#undef MDB_COMMIT_PAGES
#define MDB_COMMIT_PAGES IOV_MAX
#endif

/** max bytes to write in one call.
 * 0x40000000, halved to 0x20000000 when ssize_t is 4 bytes wide: the
 * comparison yields 1 in that case and is used as the shift count.
 */
#define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4))

/** Check \b txn and \b dbi arguments to a function.
 * True when \b txn is non-NULL, \b dbi is below mt_numdbs, and the
 * DB's mt_dbflags entry has at least one of the \b validity bits set.
 * Evaluates \b txn and \b dbi more than once.
 */
#define TXN_DBI_EXIST(txn, dbi, validity) \
	((txn) && (dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity)))

/** Check for misused \b dbi handles.
 * True when the sequence number recorded in the txn for \b dbi differs
 * from the environment's. Performs no NULL or range checking itself.
 */
#define TXN_DBI_CHANGED(txn, dbi) \
	((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
1345 | |
/* Forward declarations of file-local page helpers; being static, their
 * definitions are elsewhere in this file. Each returns an int.
 * NOTE(review): presumably 0 on success or an error code - confirm at
 * the definitions.
 */
static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp);
static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp);
static int mdb_page_touch(MDB_cursor *mc);
1349 | |
/** Initializer list with one string per operation number in the enum
 * below, in the same order (7 names for 7 enumerators). The enum
 * comment says the operation number is used for logging.
 */
#define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \
	"reset-tmp", "fail-begin", "fail-beginchild"}
enum {
	/* mdb_txn_end operation number, for logging */
	MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET,
	MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD
};
/* The operation number fits in the low 4 bits (MDB_END_OPMASK); the
 * macros after it are flag bits above that mask.
 * NOTE(review): presumably they are OR-ed with the operation number in
 * the mode argument of mdb_txn_end() - confirm at the definition.
 */
#define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */
#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */
#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */
#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */
static void mdb_txn_end(MDB_txn *txn, unsigned mode);
1362 | |
/* Forward declarations of file-local page lookup, merge and split
 * helpers, defined elsewhere in this file.
 */
static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl);
static int mdb_page_search_root(MDB_cursor *mc,
			    MDB_val *key, int modify);
/* Distinct single-bit values.
 * NOTE(review): presumably combined in the flags argument of
 * mdb_page_search() below - confirm at the definition.
 */
#define MDB_PS_MODIFY 1
#define MDB_PS_ROOTONLY 2
#define MDB_PS_FIRST 4
#define MDB_PS_LAST 8
static int mdb_page_search(MDB_cursor *mc,
			    MDB_val *key, int flags);
static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);

/* Reuses the value of MDB_APPENDDUP under an internal name */
#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */
static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
				pgno_t newpgno, unsigned int nflags);
1377 | |
/* Forward declarations of file-local environment helpers, defined
 * elsewhere in this file.
 */
static int mdb_env_read_header(MDB_env *env, MDB_meta *meta);
static MDB_meta *mdb_env_pick_meta(const MDB_env *env);
static int mdb_env_write_meta(MDB_txn *txn);
/* When this macro is defined it also rewrites the prototype that
 * follows: the declaration then expands to one for mdb_env_close1()
 * taking only the env argument.
 */
#if defined(MDB_USE_POSIX_MUTEX) && !defined(MDB_ROBUST_SUPPORTED) /* Drop unused excl arg */
# define mdb_env_close0(env, excl) mdb_env_close1(env)
#endif
static void mdb_env_close0(MDB_env *env, int excl);
1385 | |
/* Forward declarations of file-local helpers, defined elsewhere in
 * this file. Grouped below as: node operations, tree maintenance,
 * cursor stack, cursor movement/deletion, cursor setup, miscellaneous.
 */
static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp);
static int mdb_node_add(MDB_cursor *mc, indx_t indx,
			    MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags);
static void mdb_node_del(MDB_cursor *mc, int ksize);
static void mdb_node_shrink(MDB_page *mp, indx_t indx);
static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft);
static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data);
static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data);
static size_t mdb_branch_size(MDB_env *env, MDB_val *key);

static int mdb_rebalance(MDB_cursor *mc);
static int mdb_update_key(MDB_cursor *mc, MDB_val *key);

static void mdb_cursor_pop(MDB_cursor *mc);
static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp);

static int mdb_cursor_del0(MDB_cursor *mc);
static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags);
static int mdb_cursor_sibling(MDB_cursor *mc, int move_right);
static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op,
				int *exactp);
static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data);
static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data);

static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
static void mdb_xcursor_init0(MDB_cursor *mc);
static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node);
static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force);

static int mdb_drop0(MDB_cursor *mc, int subs);
static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi);
static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead);
1420 | |
/** @cond */
/* Declares five file-local comparison functions in one statement,
 * using the MDB_cmp_func function type.
 */
static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long;
/** @endcond */

/** Compare two items pointing at size_t's of unknown alignment.
 * Not a function of its own: an alias for #mdb_cmp_long when
 * #MISALIGNED_OK is defined, and for #mdb_cmp_cint otherwise.
 */
#ifdef MISALIGNED_OK
# define mdb_cmp_clong mdb_cmp_long
#else
# define mdb_cmp_clong mdb_cmp_cint
#endif
1431 | |
#ifdef _WIN32
/* Windows-only file-scope state, zero-initialized as static storage.
 * NOTE(review): presumably mdb_sec_inited records whether the two
 * security objects have been set up - confirm against the code that
 * uses them.
 */
static SECURITY_DESCRIPTOR mdb_null_sd;
static SECURITY_ATTRIBUTES mdb_all_sa;
static int mdb_sec_inited;

/* struct MDB_name is only forward-declared here, which is sufficient
 * for the pointer parameter in the prototype below.
 */
struct MDB_name;
static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra);
#endif
1440 | |
1441 | /** Return the library version info. */ |
1442 | char * ESECT |
1443 | mdb_version(int *major, int *minor, int *patch) |
1444 | { |
1445 | if (major) *major = MDB_VERSION_MAJOR; |
1446 | if (minor) *minor = MDB_VERSION_MINOR; |
1447 | if (patch) *patch = MDB_VERSION_PATCH; |
1448 | return MDB_VERSION_STRING; |
1449 | } |
1450 | |
/** Table of descriptions for LMDB @ref errors.
 * mdb_strerror() indexes this table with (err - MDB_KEYEXIST) for
 * codes from MDB_KEYEXIST through MDB_LAST_ERRCODE, so the entries
 * must stay in error-code order. The table has 20 entries, so
 * MDB_LAST_ERRCODE - MDB_KEYEXIST must not exceed 19.
 */
static char *const mdb_errstr[] = {
	"MDB_KEYEXIST: Key/data pair already exists" ,
	"MDB_NOTFOUND: No matching key/data pair found" ,
	"MDB_PAGE_NOTFOUND: Requested page not found" ,
	"MDB_CORRUPTED: Located page was wrong type" ,
	"MDB_PANIC: Update of meta page failed or environment had fatal error" ,
	"MDB_VERSION_MISMATCH: Database environment version mismatch" ,
	"MDB_INVALID: File is not an LMDB file" ,
	"MDB_MAP_FULL: Environment mapsize limit reached" ,
	"MDB_DBS_FULL: Environment maxdbs limit reached" ,
	"MDB_READERS_FULL: Environment maxreaders limit reached" ,
	"MDB_TLS_FULL: Thread-local storage keys full - too many environments open" ,
	"MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big" ,
	"MDB_CURSOR_FULL: Internal error - cursor stack limit reached" ,
	"MDB_PAGE_FULL: Internal error - page has no more space" ,
	"MDB_MAP_RESIZED: Database contents grew beyond environment mapsize" ,
	"MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed" ,
	"MDB_BAD_RSLOT: Invalid reuse of reader locktable slot" ,
	"MDB_BAD_TXN: Transaction must abort, has a child, or is invalid" ,
	"MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size" ,
	"MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly" ,
};
1474 | |
1475 | char * |
1476 | mdb_strerror(int err) |
1477 | { |
1478 | #ifdef _WIN32 |
1479 | /** HACK: pad 4KB on stack over the buf. Return system msgs in buf. |
1480 | * This works as long as no function between the call to mdb_strerror |
1481 | * and the actual use of the message uses more than 4K of stack. |
1482 | */ |
1483 | #define MSGSIZE 1024 |
1484 | #define PADSIZE 4096 |
1485 | char buf[MSGSIZE+PADSIZE], *ptr = buf; |
1486 | #endif |
1487 | int i; |
1488 | if (!err) |
1489 | return ("Successful return: 0" ); |
1490 | |
1491 | if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { |
1492 | i = err - MDB_KEYEXIST; |
1493 | return mdb_errstr[i]; |
1494 | } |
1495 | |
1496 | #ifdef _WIN32 |
1497 | /* These are the C-runtime error codes we use. The comment indicates |
1498 | * their numeric value, and the Win32 error they would correspond to |
1499 | * if the error actually came from a Win32 API. A major mess, we should |
1500 | * have used LMDB-specific error codes for everything. |
1501 | */ |
1502 | switch(err) { |
1503 | case ENOENT: /* 2, FILE_NOT_FOUND */ |
1504 | case EIO: /* 5, ACCESS_DENIED */ |
1505 | case ENOMEM: /* 12, INVALID_ACCESS */ |
1506 | case EACCES: /* 13, INVALID_DATA */ |
1507 | case EBUSY: /* 16, CURRENT_DIRECTORY */ |
1508 | case EINVAL: /* 22, BAD_COMMAND */ |
1509 | case ENOSPC: /* 28, OUT_OF_PAPER */ |
1510 | return strerror(err); |
1511 | default: |
1512 | ; |
1513 | } |
1514 | buf[0] = 0; |
1515 | FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | |
1516 | FORMAT_MESSAGE_IGNORE_INSERTS, |
1517 | NULL, err, 0, ptr, MSGSIZE, (va_list *)buf+MSGSIZE); |
1518 | return ptr; |
1519 | #else |
1520 | return strerror(err); |
1521 | #endif |
1522 | } |
1523 | |
1524 | /** assert(3) variant in cursor context */ |
1525 | #define mdb_cassert(mc, expr) mdb_assert0((mc)->mc_txn->mt_env, expr, #expr) |
1526 | /** assert(3) variant in transaction context */ |
1527 | #define mdb_tassert(txn, expr) mdb_assert0((txn)->mt_env, expr, #expr) |
1528 | /** assert(3) variant in environment context */ |
1529 | #define mdb_eassert(env, expr) mdb_assert0(env, expr, #expr) |
1530 | |
1531 | #ifndef NDEBUG |
1532 | # define mdb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \ |
1533 | mdb_assert_fail(env, expr_txt, mdb_func_, __FILE__, __LINE__)) |
1534 | |
1535 | static void ESECT |
1536 | mdb_assert_fail(MDB_env *env, const char *expr_txt, |
1537 | const char *func, const char *file, int line) |
1538 | { |
1539 | char buf[400]; |
1540 | sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()" , |
1541 | file, line, expr_txt, func); |
1542 | if (env->me_assert_func) |
1543 | env->me_assert_func(env, buf); |
1544 | fprintf(stderr, "%s\n" , buf); |
1545 | abort(); |
1546 | } |
1547 | #else |
1548 | # define mdb_assert0(env, expr, expr_txt) ((void) 0) |
1549 | #endif /* NDEBUG */ |
1550 | |
1551 | #if MDB_DEBUG |
1552 | /** Return the page number of \b mp which may be sub-page, for debug output */ |
1553 | static pgno_t |
1554 | mdb_dbg_pgno(MDB_page *mp) |
1555 | { |
1556 | pgno_t ret; |
1557 | COPY_PGNO(ret, mp->mp_pgno); |
1558 | return ret; |
1559 | } |
1560 | |
1561 | /** Display a key in hexadecimal and return the address of the result. |
1562 | * @param[in] key the key to display |
1563 | * @param[in] buf the buffer to write into. Should always be #DKBUF. |
1564 | * @return The key in hexadecimal form. |
1565 | */ |
1566 | char * |
1567 | mdb_dkey(MDB_val *key, char *buf) |
1568 | { |
1569 | char *ptr = buf; |
1570 | unsigned char *c = key->mv_data; |
1571 | unsigned int i; |
1572 | |
1573 | if (!key) |
1574 | return "" ; |
1575 | |
1576 | if (key->mv_size > DKBUF_MAXKEYSIZE) |
1577 | return "MDB_MAXKEYSIZE" ; |
1578 | /* may want to make this a dynamic check: if the key is mostly |
1579 | * printable characters, print it as-is instead of converting to hex. |
1580 | */ |
1581 | #if 1 |
1582 | buf[0] = '\0'; |
1583 | for (i=0; i<key->mv_size; i++) |
1584 | ptr += sprintf(ptr, "%02x" , *c++); |
1585 | #else |
1586 | sprintf(buf, "%.*s" , key->mv_size, key->mv_data); |
1587 | #endif |
1588 | return buf; |
1589 | } |
1590 | |
1591 | static const char * |
1592 | mdb_leafnode_type(MDB_node *n) |
1593 | { |
1594 | static char *const tp[2][2] = {{"" , ": DB" }, {": sub-page" , ": sub-DB" }}; |
1595 | return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : |
1596 | tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; |
1597 | } |
1598 | |
1599 | /** Display all the keys in the page. */ |
1600 | void |
1601 | mdb_page_list(MDB_page *mp) |
1602 | { |
1603 | pgno_t pgno = mdb_dbg_pgno(mp); |
1604 | const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : "" ; |
1605 | MDB_node *node; |
1606 | unsigned int i, nkeys, nsize, total = 0; |
1607 | MDB_val key; |
1608 | DKBUF; |
1609 | |
1610 | switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { |
1611 | case P_BRANCH: type = "Branch page" ; break; |
1612 | case P_LEAF: type = "Leaf page" ; break; |
1613 | case P_LEAF|P_SUBP: type = "Sub-page" ; break; |
1614 | case P_LEAF|P_LEAF2: type = "LEAF2 page" ; break; |
1615 | case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page" ; break; |
1616 | case P_OVERFLOW: |
1617 | fprintf(stderr, "Overflow page %" Z"u pages %u%s\n" , |
1618 | pgno, mp->mp_pages, state); |
1619 | return; |
1620 | case P_META: |
1621 | fprintf(stderr, "Meta-page %" Z"u txnid %" Z"u\n" , |
1622 | pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); |
1623 | return; |
1624 | default: |
1625 | fprintf(stderr, "Bad page %" Z"u flags 0x%X\n" , pgno, mp->mp_flags); |
1626 | return; |
1627 | } |
1628 | |
1629 | nkeys = NUMKEYS(mp); |
1630 | fprintf(stderr, "%s %" Z"u numkeys %d%s\n" , type, pgno, nkeys, state); |
1631 | |
1632 | for (i=0; i<nkeys; i++) { |
1633 | if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ |
1634 | key.mv_size = nsize = mp->mp_pad; |
1635 | key.mv_data = LEAF2KEY(mp, i, nsize); |
1636 | total += nsize; |
1637 | fprintf(stderr, "key %d: nsize %d, %s\n" , i, nsize, DKEY(&key)); |
1638 | continue; |
1639 | } |
1640 | node = NODEPTR(mp, i); |
1641 | key.mv_size = node->mn_ksize; |
1642 | key.mv_data = node->mn_data; |
1643 | nsize = NODESIZE + key.mv_size; |
1644 | if (IS_BRANCH(mp)) { |
1645 | fprintf(stderr, "key %d: page %" Z"u, %s\n" , i, NODEPGNO(node), |
1646 | DKEY(&key)); |
1647 | total += nsize; |
1648 | } else { |
1649 | if (F_ISSET(node->mn_flags, F_BIGDATA)) |
1650 | nsize += sizeof(pgno_t); |
1651 | else |
1652 | nsize += NODEDSZ(node); |
1653 | total += nsize; |
1654 | nsize += sizeof(indx_t); |
1655 | fprintf(stderr, "key %d: nsize %d, %s%s\n" , |
1656 | i, nsize, DKEY(&key), mdb_leafnode_type(node)); |
1657 | } |
1658 | total = EVEN(total); |
1659 | } |
1660 | fprintf(stderr, "Total: header %d + contents %d + unused %d\n" , |
1661 | IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp)); |
1662 | } |
1663 | |
1664 | void |
1665 | mdb_cursor_chk(MDB_cursor *mc) |
1666 | { |
1667 | unsigned int i; |
1668 | MDB_node *node; |
1669 | MDB_page *mp; |
1670 | |
1671 | if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; |
1672 | for (i=0; i<mc->mc_top; i++) { |
1673 | mp = mc->mc_pg[i]; |
1674 | node = NODEPTR(mp, mc->mc_ki[i]); |
1675 | if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno) |
1676 | printf("oops!\n" ); |
1677 | } |
1678 | if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) |
1679 | printf("ack!\n" ); |
1680 | if (XCURSOR_INITED(mc)) { |
1681 | node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
1682 | if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && |
1683 | mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { |
1684 | printf("blah!\n" ); |
1685 | } |
1686 | } |
1687 | } |
1688 | #endif |
1689 | |
#if (MDB_DEBUG) > 2
/** Count all the pages in each DB and in the freelist
 * and make sure it matches the actual number of pages
 * being used.
 * All named DBs must be open for a correct count.
 *
 * A mismatch is reported on stderr; nothing is returned.
 */
static void mdb_audit(MDB_txn *txn)
{
	MDB_cursor mc;
	MDB_val key, data;
	MDB_ID nfree = 0, nused = 0;
	MDB_dbi dbi;
	int rc;

	/* Sum the first ID of every freeDB record */
	mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
	for (;;) {
		rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT);
		if (rc != 0)
			break;
		nfree += *(MDB_ID *)data.mv_data;
	}
	mdb_tassert(txn, rc == MDB_NOTFOUND);

	/* Add up the page counters of every valid DB handle */
	for (dbi = 0; dbi < txn->mt_numdbs; dbi++) {
		MDB_xcursor mx;
		MDB_db *info;

		if (!(txn->mt_dbflags[dbi] & DB_VALID))
			continue;
		mdb_cursor_init(&mc, txn, dbi, &mx);
		info = &txn->mt_dbs[dbi];
		if (info->md_root == P_INVALID)
			continue;
		nused += info->md_branch_pages + info->md_leaf_pages +
			info->md_overflow_pages;
		if (!(info->md_flags & MDB_DUPSORT))
			continue;

		/* Walk every leaf page and add the counters of each F_SUBDATA node */
		rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST);
		while (rc == MDB_SUCCESS) {
			MDB_page *mp = mc.mc_pg[mc.mc_top];
			unsigned k;

			for (k = 0; k < NUMKEYS(mp); k++) {
				MDB_node *leaf = NODEPTR(mp, k);
				if (leaf->mn_flags & F_SUBDATA) {
					MDB_db sub;
					memcpy(&sub, NODEDATA(leaf), sizeof(sub));
					nused += sub.md_branch_pages + sub.md_leaf_pages +
						sub.md_overflow_pages;
				}
			}
			rc = mdb_cursor_sibling(&mc, 1);
		}
		mdb_tassert(txn, rc == MDB_NOTFOUND);
	}

	if (nfree + nused + NUM_METAS != txn->mt_next_pgno) {
		fprintf(stderr, "audit: %" Z "u freecount: %" Z "u count: %" Z "u total: %" Z "u next_pgno: %" Z "u\n",
			txn->mt_txnid, nfree, nused+NUM_METAS,
			nfree+nused+NUM_METAS, txn->mt_next_pgno);
	}
}
#endif
1747 | |
1748 | int |
1749 | mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) |
1750 | { |
1751 | return txn->mt_dbxs[dbi].md_cmp(a, b); |
1752 | } |
1753 | |
1754 | int |
1755 | mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) |
1756 | { |
1757 | MDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp; |
1758 | #if UINT_MAX < SIZE_MAX |
1759 | if (dcmp == mdb_cmp_int && a->mv_size == sizeof(size_t)) |
1760 | dcmp = mdb_cmp_clong; |
1761 | #endif |
1762 | return dcmp(a, b); |
1763 | } |
1764 | |
1765 | /** Allocate memory for a page. |
1766 | * Re-use old malloc'd pages first for singletons, otherwise just malloc. |
1767 | * Set #MDB_TXN_ERROR on failure. |
1768 | */ |
1769 | static MDB_page * |
1770 | mdb_page_malloc(MDB_txn *txn, unsigned num) |
1771 | { |
1772 | MDB_env *env = txn->mt_env; |
1773 | MDB_page *ret = env->me_dpages; |
1774 | size_t psize = env->me_psize, sz = psize, off; |
1775 | /* For ! #MDB_NOMEMINIT, psize counts how much to init. |
1776 | * For a single page alloc, we init everything after the page header. |
1777 | * For multi-page, we init the final page; if the caller needed that |
1778 | * many pages they will be filling in at least up to the last page. |
1779 | */ |
1780 | if (num == 1) { |
1781 | if (ret) { |
1782 | VGMEMP_ALLOC(env, ret, sz); |
1783 | VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); |
1784 | env->me_dpages = ret->mp_next; |
1785 | return ret; |
1786 | } |
1787 | psize -= off = PAGEHDRSZ; |
1788 | } else { |
1789 | sz *= num; |
1790 | off = sz - psize; |
1791 | } |
1792 | if ((ret = malloc(sz)) != NULL) { |
1793 | VGMEMP_ALLOC(env, ret, sz); |
1794 | if (!(env->me_flags & MDB_NOMEMINIT)) { |
1795 | memset((char *)ret + off, 0, psize); |
1796 | ret->mp_pad = 0; |
1797 | } |
1798 | } else { |
1799 | txn->mt_flags |= MDB_TXN_ERROR; |
1800 | } |
1801 | return ret; |
1802 | } |
1803 | /** Free a single page. |
1804 | * Saves single pages to a list, for future reuse. |
1805 | * (This is not used for multi-page overflow pages.) |
1806 | */ |
1807 | static void |
1808 | mdb_page_free(MDB_env *env, MDB_page *mp) |
1809 | { |
1810 | mp->mp_next = env->me_dpages; |
1811 | VGMEMP_FREE(env, mp); |
1812 | env->me_dpages = mp; |
1813 | } |
1814 | |
1815 | /** Free a dirty page */ |
1816 | static void |
1817 | mdb_dpage_free(MDB_env *env, MDB_page *dp) |
1818 | { |
1819 | if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { |
1820 | mdb_page_free(env, dp); |
1821 | } else { |
1822 | /* large pages just get freed directly */ |
1823 | VGMEMP_FREE(env, dp); |
1824 | free(dp); |
1825 | } |
1826 | } |
1827 | |
1828 | /** Return all dirty pages to dpage list */ |
1829 | static void |
1830 | mdb_dlist_free(MDB_txn *txn) |
1831 | { |
1832 | MDB_env *env = txn->mt_env; |
1833 | MDB_ID2L dl = txn->mt_u.dirty_list; |
1834 | unsigned i, n = dl[0].mid; |
1835 | |
1836 | for (i = 1; i <= n; i++) { |
1837 | mdb_dpage_free(env, dl[i].mptr); |
1838 | } |
1839 | dl[0].mid = 0; |
1840 | } |
1841 | |
1842 | /** Loosen or free a single page. |
1843 | * Saves single pages to a list for future reuse |
1844 | * in this same txn. It has been pulled from the freeDB |
1845 | * and already resides on the dirty list, but has been |
1846 | * deleted. Use these pages first before pulling again |
1847 | * from the freeDB. |
1848 | * |
1849 | * If the page wasn't dirtied in this txn, just add it |
1850 | * to this txn's free list. |
1851 | */ |
1852 | static int |
1853 | mdb_page_loose(MDB_cursor *mc, MDB_page *mp) |
1854 | { |
1855 | int loose = 0; |
1856 | pgno_t pgno = mp->mp_pgno; |
1857 | MDB_txn *txn = mc->mc_txn; |
1858 | |
1859 | if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { |
1860 | if (txn->mt_parent) { |
1861 | MDB_ID2 *dl = txn->mt_u.dirty_list; |
1862 | /* If txn has a parent, make sure the page is in our |
1863 | * dirty list. |
1864 | */ |
1865 | if (dl[0].mid) { |
1866 | unsigned x = mdb_mid2l_search(dl, pgno); |
1867 | if (x <= dl[0].mid && dl[x].mid == pgno) { |
1868 | if (mp != dl[x].mptr) { /* bad cursor? */ |
1869 | mc->mc_flags &= ~(C_INITIALIZED|C_EOF); |
1870 | txn->mt_flags |= MDB_TXN_ERROR; |
1871 | return MDB_CORRUPTED; |
1872 | } |
1873 | /* ok, it's ours */ |
1874 | loose = 1; |
1875 | } |
1876 | } |
1877 | } else { |
1878 | /* no parent txn, so it's just ours */ |
1879 | loose = 1; |
1880 | } |
1881 | } |
1882 | if (loose) { |
1883 | DPRINTF(("loosen db %d page %" Z"u" , DDBI(mc), |
1884 | mp->mp_pgno)); |
1885 | NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; |
1886 | txn->mt_loose_pgs = mp; |
1887 | txn->mt_loose_count++; |
1888 | mp->mp_flags |= P_LOOSE; |
1889 | } else { |
1890 | int rc = mdb_midl_append(&txn->mt_free_pgs, pgno); |
1891 | if (rc) |
1892 | return rc; |
1893 | } |
1894 | |
1895 | return MDB_SUCCESS; |
1896 | } |
1897 | |
/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
 *
 * A page is toggled only when its flags, restricted to
 * P_SUBP|P_DIRTY|P_LOOSE|P_KEEP, equal \b pflags exactly. The pages
 * visited are those on the stacks of the given cursor and of every
 * cursor in txn->mt_cursors[] (following initialized sub-cursors), plus,
 * when \b all is set, the root page of every DB flagged DB_DIRTY.
 *
 * @param[in] mc A cursor handle for the current operation.
 * @param[in] pflags Flags of the pages to update:
 * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
 * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush().
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all)
{
	/* Only these flag bits take part in the comparison with pflags */
	enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP };
	MDB_txn *txn = mc->mc_txn;
	MDB_cursor *m3, *m0 = mc;	/* m0 keeps the caller's cursor for mdb_page_get() */
	MDB_xcursor *mx;
	MDB_page *dp, *mp;
	MDB_node *leaf;
	unsigned i, j;
	int rc = MDB_SUCCESS, level;

	/* Mark pages seen by cursors */
	if (mc->mc_flags & C_UNTRACK)
		mc = NULL; /* will find mc in mt_cursors */
	/* First pass handles the chain starting at mc (if any); each later
	 * pass handles txn->mt_cursors[i] for i = mt_numdbs-1 down to 0.
	 */
	for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) {
		for (; mc; mc=mc->mc_next) {
			if (!(mc->mc_flags & C_INITIALIZED))
				continue;
			/* Visit the cursor itself, then descend into its sub-cursor(s) */
			for (m3 = mc;; m3 = &mx->mx_cursor) {
				mp = NULL;
				for (j=0; j<m3->mc_snum; j++) {
					mp = m3->mc_pg[j];
					if ((mp->mp_flags & Mask) == pflags)
						mp->mp_flags ^= P_KEEP;
				}
				/* Here mp is the last page on the stack (or NULL if the
				 * stack is empty) and j-1 is its stack index.
				 */
				mx = m3->mc_xcursor;
				/* Proceed to mx if it is at a sub-database */
				if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
					break;
				if (! (mp && (mp->mp_flags & P_LEAF)))
					break;
				leaf = NODEPTR(mp, m3->mc_ki[j-1]);
				if (!(leaf->mn_flags & F_SUBDATA))
					break;
			}
		}
		if (i == 0)
			break;
	}

	if (all) {
		/* Mark dirty root pages */
		for (i=0; i<txn->mt_numdbs; i++) {
			if (txn->mt_dbflags[i] & DB_DIRTY) {
				pgno_t pgno = txn->mt_dbs[i].md_root;
				if (pgno == P_INVALID)
					continue;
				if ((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS)
					break;
				/* NOTE(review): level is reported by mdb_page_get();
				 * presumably level <= 1 restricts this to pages found in
				 * this txn rather than an ancestor - confirm.
				 */
				if ((dp->mp_flags & Mask) == pflags && level <= 1)
					dp->mp_flags ^= P_KEEP;
			}
		}
	}

	return rc;
}
1963 | |
1964 | static int mdb_page_flush(MDB_txn *txn, int keep); |
1965 | |
/** Spill pages from the dirty list back to disk.
 * This is intended to prevent running into #MDB_TXN_FULL situations,
 * but note that they may still occur in a few cases:
 * 1) our estimate of the txn size could be too small. Currently this
 * seems unlikely, except with a large number of #MDB_MULTIPLE items.
 * 2) child txns may run out of space if their parents dirtied a
 * lot of pages and never spilled them. TODO: we probably should do
 * a preemptive spill during #mdb_txn_begin() of a child txn, if
 * the parent's dirty_room is below a given threshold.
 *
 * Otherwise, if not using nested txns, it is expected that apps will
 * not run into #MDB_TXN_FULL any more. The pages are flushed to disk
 * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared.
 * If the txn never references them again, they can be left alone.
 * If the txn only reads them, they can be used without any fuss.
 * If the txn writes them again, they can be dirtied immediately without
 * going thru all of the work of #mdb_page_touch(). Such references are
 * handled by #mdb_page_unspill().
 *
 * Also note, we never spill DB root pages, nor pages of active cursors,
 * because we'll need these back again soon anyway. And in nested txns,
 * we can't spill a page in a child txn if it was already spilled in a
 * parent txn. That would alter the parent txns' data even though
 * the child hasn't committed yet, and we'd have no way to undo it if
 * the child aborted.
 *
 * Spill list entries are stored as (pgno << 1); an entry with its low
 * bit set is a deleted slot and is purged at the start of the next spill.
 *
 * @param[in] m0 cursor A cursor handle identifying the transaction and
 * database for which we are checking space.
 * @param[in] key For a put operation, the key being stored.
 * @param[in] data For a put operation, the data being stored.
 * @return 0 on success, non-zero on failure. Except for the early
 * returns, the txn is flagged #MDB_TXN_ERROR on failure and
 * #MDB_TXN_SPILLS on success.
 */
static int
mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data)
{
	MDB_txn *txn = m0->mc_txn;
	MDB_page *dp;
	MDB_ID2L dl = txn->mt_u.dirty_list;
	unsigned int i, j, need;
	int rc;

	/* Nothing is spilled on behalf of a sub-cursor */
	if (m0->mc_flags & C_SUB)
		return MDB_SUCCESS;

	/* Estimate how much space this op will take */
	i = m0->mc_db->md_depth;
	/* Named DBs also dirty the main DB */
	if (m0->mc_dbi >= CORE_DBS)
		i += txn->mt_dbs[MAIN_DBI].md_depth;
	/* For puts, roughly factor in the key+data size */
	if (key)
		i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize;
	i += i; /* double it for good measure */
	need = i;

	/* Enough room left in the dirty list: no spill needed */
	if (txn->mt_dirty_room > i)
		return MDB_SUCCESS;

	if (!txn->mt_spill_pgs) {
		txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX);
		if (!txn->mt_spill_pgs)
			return ENOMEM;
	} else {
		/* purge deleted slots */
		MDB_IDL sl = txn->mt_spill_pgs;
		unsigned int num = sl[0];
		j=0;
		/* Compact in place, keeping only entries whose low bit is clear */
		for (i=1; i<=num; i++) {
			if (!(sl[i] & 1))
				sl[++j] = sl[i];
		}
		sl[0] = j;
	}

	/* Preserve pages which may soon be dirtied again */
	if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS)
		goto done;

	/* Less aggressive spill - we originally spilled the entire dirty list,
	 * with a few exceptions for cursor pages and DB root pages. But this
	 * turns out to be a lot of wasted effort because in a large txn many
	 * of those pages will need to be used again. So now we spill only 1/8th
	 * of the dirty pages. Testing revealed this to be a good tradeoff,
	 * better than 1/2, 1/4, or 1/10.
	 */
	if (need < MDB_IDL_UM_MAX / 8)
		need = MDB_IDL_UM_MAX / 8;

	/* Save the page IDs of all the pages we're flushing */
	/* flush from the tail forward, this saves a lot of shifting later on. */
	for (i=dl[0].mid; i && need; i--) {
		MDB_ID pn = dl[i].mid << 1;	/* spill-list encoding of this pgno */
		dp = dl[i].mptr;
		if (dp->mp_flags & (P_LOOSE|P_KEEP))
			continue;
		/* Can't spill twice, make sure it's not already in a parent's
		 * spill list.
		 */
		if (txn->mt_parent) {
			MDB_txn *tx2;
			for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
				if (tx2->mt_spill_pgs) {
					j = mdb_midl_search(tx2->mt_spill_pgs, pn);
					if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) {
						dp->mp_flags |= P_KEEP;
						break;
					}
				}
			}
			/* tx2 is non-NULL only when an ancestor already spilled it */
			if (tx2)
				continue;
		}
		if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn)))
			goto done;
		need--;
	}
	mdb_midl_sort(txn->mt_spill_pgs);

	/* Flush the spilled part of dirty list */
	/* i is the dirty-list index at which the scan above stopped; it is
	 * passed as mdb_page_flush()'s keep argument.
	 */
	if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS)
		goto done;

	/* Reset any dirty pages we kept that page_flush didn't see */
	rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);

done:
	txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS;
	return rc;
}
2095 | |
2096 | /** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */ |
2097 | static txnid_t |
2098 | mdb_find_oldest(MDB_txn *txn) |
2099 | { |
2100 | int i; |
2101 | txnid_t mr, oldest = txn->mt_txnid - 1; |
2102 | if (txn->mt_env->me_txns) { |
2103 | MDB_reader *r = txn->mt_env->me_txns->mti_readers; |
2104 | for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { |
2105 | if (r[i].mr_pid) { |
2106 | mr = r[i].mr_txnid; |
2107 | if (oldest > mr) |
2108 | oldest = mr; |
2109 | } |
2110 | } |
2111 | } |
2112 | return oldest; |
2113 | } |
2114 | |
2115 | /** Add a page to the txn's dirty list */ |
2116 | static void |
2117 | mdb_page_dirty(MDB_txn *txn, MDB_page *mp) |
2118 | { |
2119 | MDB_ID2 mid; |
2120 | int rc, (*insert)(MDB_ID2L, MDB_ID2 *); |
2121 | |
2122 | if (txn->mt_flags & MDB_TXN_WRITEMAP) { |
2123 | insert = mdb_mid2l_append; |
2124 | } else { |
2125 | insert = mdb_mid2l_insert; |
2126 | } |
2127 | mid.mid = mp->mp_pgno; |
2128 | mid.mptr = mp; |
2129 | rc = insert(txn->mt_u.dirty_list, &mid); |
2130 | mdb_tassert(txn, rc == 0); |
2131 | txn->mt_dirty_room--; |
2132 | } |
2133 | |
/** Allocate page numbers and memory for writing. Maintain me_pglast,
 * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure.
 *
 * If there are free pages available from older transactions, they
 * are re-used first. Otherwise allocate a new page at mt_next_pgno.
 * Do not modify the freeDB, just merge freeDB records into me_pghead[]
 * and move me_pglast to say which records were consumed. Only this
 * function can create me_pghead and move me_pglast/mt_next_pgno.
 *
 * A single-page request is served from the txn's loose-page list first,
 * when that list is non-empty.
 *
 * @param[in] mc cursor A cursor handle identifying the transaction and
 * database for which we are allocating.
 * @param[in] num the number of pages to allocate.
 * @param[out] mp Address of the allocated page(s). Requests for multiple pages
 * will always be satisfied by a single contiguous chunk of memory.
 * Set to NULL on failure.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
{
#ifdef MDB_PARANOID /* Seems like we can ignore this now */
	/* Get at most <Max_retries> more freeDB records once me_pghead
	 * has enough pages. If not enough, use new pages from the map.
	 * If <Paranoid> and mc is updating the freeDB, only get new
	 * records if me_pghead is empty. Then the freelist cannot play
	 * catch-up with itself by growing while trying to save it.
	 */
	enum { Paranoid = 1, Max_retries = 500 };
#else
	enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ };
#endif
	int rc, retry = num * 60;
	MDB_txn *txn = mc->mc_txn;
	MDB_env *env = txn->mt_env;
	pgno_t pgno, *mop = env->me_pghead;
	/* mop[0] holds the list length; n2 is the offset of the far end of a run */
	unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1;
	MDB_page *np;
	txnid_t oldest = 0, last;
	MDB_cursor_op op;
	MDB_cursor m2;
	int found_old = 0;	/* set once mdb_find_oldest() has been consulted */

	/* If there are any loose pages, just use them */
	if (num == 1 && txn->mt_loose_pgs) {
		np = txn->mt_loose_pgs;
		txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np);
		txn->mt_loose_count--;
		DPRINTF(("db %d use loose page %" Z"u" , DDBI(mc),
				np->mp_pgno));
		*mp = np;
		return MDB_SUCCESS;
	}

	*mp = NULL;

	/* If our dirty list is already full, we can't do anything */
	if (txn->mt_dirty_room == 0) {
		rc = MDB_TXN_FULL;
		goto fail;
	}

	/* Each pass first searches me_pghead for a usable run, then reads
	 * one more freeDB record and merges it in. Leaving the loop with
	 * break falls through to allocating fresh pages from the map.
	 */
	for (op = MDB_FIRST;; op = MDB_NEXT) {
		MDB_val key, data;
		MDB_node *leaf;
		pgno_t *idl;

		/* Seek a big enough contiguous page range. Prefer
		 * pages at the tail, just truncating the list.
		 */
		if (mop_len > n2) {
			i = mop_len;
			do {
				pgno = mop[i];
				/* The list is in descending order, so entries
				 * i-n2 .. i form a run of num consecutive page
				 * numbers exactly when this holds.
				 */
				if (mop[i-n2] == pgno+n2)
					goto search_done;
			} while (--i > n2);
			/* No run found; stop fetching once retries are used up */
			if (--retry < 0)
				break;
		}

		if (op == MDB_FIRST) {	/* 1st iteration */
			/* Prepare to fetch more and coalesce */
			last = env->me_pglast;
			oldest = env->me_pgoldest;
			mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
			if (last) {
				op = MDB_SET_RANGE;
				key.mv_data = &last; /* will look up last+1 */
				key.mv_size = sizeof(last);
			}
			if (Paranoid && mc->mc_dbi == FREE_DBI)
				retry = -1;
		}
		if (Paranoid && retry < 0 && mop_len)
			break;

		last++;
		/* Do not fetch more if the record will be too recent */
		if (oldest <= last) {
			/* Refresh the cached oldest txnid at most once per call */
			if (!found_old) {
				oldest = mdb_find_oldest(txn);
				env->me_pgoldest = oldest;
				found_old = 1;
			}
			if (oldest <= last)
				break;
		}
		rc = mdb_cursor_get(&m2, &key, NULL, op);
		if (rc) {
			if (rc == MDB_NOTFOUND)
				break;
			goto fail;
		}
		/* The record's key is the txnid it belongs to */
		last = *(txnid_t*)key.mv_data;
		if (oldest <= last) {
			if (!found_old) {
				oldest = mdb_find_oldest(txn);
				env->me_pgoldest = oldest;
				found_old = 1;
			}
			if (oldest <= last)
				break;
		}
		np = m2.mc_pg[m2.mc_top];
		leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
		if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS)
			goto fail;

		/* The record's data is an ID list; idl[0] is its length */
		idl = (MDB_ID *) data.mv_data;
		i = idl[0];
		if (!mop) {
			if (!(env->me_pghead = mop = mdb_midl_alloc(i))) {
				rc = ENOMEM;
				goto fail;
			}
		} else {
			if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0)
				goto fail;
			/* Re-read the pointer after mdb_midl_need() */
			mop = env->me_pghead;
		}
		env->me_pglast = last;
#if (MDB_DEBUG) > 1
		DPRINTF(("IDL read txn %" Z"u root %" Z"u num %u" ,
			last, txn->mt_dbs[FREE_DBI].md_root, i));
		for (j = i; j; j--)
			DPRINTF(("IDL %" Z"u" , idl[j]));
#endif
		/* Merge in descending sorted order */
		mdb_midl_xmerge(mop, idl);
		mop_len = mop[0];
	}

	/* Use new pages from the map when nothing suitable in the freeDB */
	i = 0;	/* tells search_done that the pages did not come from mop */
	pgno = txn->mt_next_pgno;
	if (pgno + num >= env->me_maxpg) {
			DPUTS("DB size maxed out" );
			rc = MDB_MAP_FULL;
			goto fail;
	}

search_done:
	/* Here pgno is the first page number of the range. i is the index in
	 * mop of the run's last entry, or 0 when the range is new map space.
	 */
	if (env->me_flags & MDB_WRITEMAP) {
		np = (MDB_page *)(env->me_map + env->me_psize * pgno);
	} else {
		if (!(np = mdb_page_malloc(txn, num))) {
			rc = ENOMEM;
			goto fail;
		}
	}
	if (i) {
		/* Remove the num entries just taken from the list */
		mop[0] = mop_len -= num;
		/* Move any stragglers down */
		for (j = i-num; j < mop_len; )
			mop[++j] = mop[++i];
	} else {
		txn->mt_next_pgno = pgno + num;
	}
	np->mp_pgno = pgno;
	mdb_page_dirty(txn, np);
	*mp = np;

	return MDB_SUCCESS;

fail:
	txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}
2320 | |
2321 | /** Copy the used portions of a non-overflow page. |
2322 | * @param[in] dst page to copy into |
2323 | * @param[in] src page to copy from |
2324 | * @param[in] psize size of a page |
2325 | */ |
2326 | static void |
2327 | mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) |
2328 | { |
2329 | enum { Align = sizeof(pgno_t) }; |
2330 | indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; |
2331 | |
2332 | /* If page isn't full, just copy the used portion. Adjust |
2333 | * alignment so memcpy may copy words instead of bytes. |
2334 | */ |
2335 | if ((unused &= -Align) && !IS_LEAF2(src)) { |
2336 | upper = (upper + PAGEBASE) & -Align; |
2337 | memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align); |
2338 | memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), |
2339 | psize - upper); |
2340 | } else { |
2341 | memcpy(dst, src, psize - unused); |
2342 | } |
2343 | } |
2344 | |
2345 | /** Pull a page off the txn's spill list, if present. |
2346 | * If a page being referenced was spilled to disk in this txn, bring |
2347 | * it back and make it dirty/writable again. |
2348 | * @param[in] txn the transaction handle. |
2349 | * @param[in] mp the page being referenced. It must not be dirty. |
2350 | * @param[out] ret the writable page, if any. ret is unchanged if |
2351 | * mp wasn't spilled. |
2352 | */ |
2353 | static int |
2354 | mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) |
2355 | { |
2356 | MDB_env *env = txn->mt_env; |
2357 | const MDB_txn *tx2; |
2358 | unsigned x; |
2359 | pgno_t pgno = mp->mp_pgno, pn = pgno << 1; |
2360 | |
2361 | for (tx2 = txn; tx2; tx2=tx2->mt_parent) { |
2362 | if (!tx2->mt_spill_pgs) |
2363 | continue; |
2364 | x = mdb_midl_search(tx2->mt_spill_pgs, pn); |
2365 | if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { |
2366 | MDB_page *np; |
2367 | int num; |
2368 | if (txn->mt_dirty_room == 0) |
2369 | return MDB_TXN_FULL; |
2370 | if (IS_OVERFLOW(mp)) |
2371 | num = mp->mp_pages; |
2372 | else |
2373 | num = 1; |
2374 | if (env->me_flags & MDB_WRITEMAP) { |
2375 | np = mp; |
2376 | } else { |
2377 | np = mdb_page_malloc(txn, num); |
2378 | if (!np) |
2379 | return ENOMEM; |
2380 | if (num > 1) |
2381 | memcpy(np, mp, num * env->me_psize); |
2382 | else |
2383 | mdb_page_copy(np, mp, env->me_psize); |
2384 | } |
2385 | if (tx2 == txn) { |
2386 | /* If in current txn, this page is no longer spilled. |
2387 | * If it happens to be the last page, truncate the spill list. |
2388 | * Otherwise mark it as deleted by setting the LSB. |
2389 | */ |
2390 | if (x == txn->mt_spill_pgs[0]) |
2391 | txn->mt_spill_pgs[0]--; |
2392 | else |
2393 | txn->mt_spill_pgs[x] |= 1; |
2394 | } /* otherwise, if belonging to a parent txn, the |
2395 | * page remains spilled until child commits |
2396 | */ |
2397 | |
2398 | mdb_page_dirty(txn, np); |
2399 | np->mp_flags |= P_DIRTY; |
2400 | *ret = np; |
2401 | break; |
2402 | } |
2403 | } |
2404 | return MDB_SUCCESS; |
2405 | } |
2406 | |
2407 | /** Touch a page: make it dirty and re-insert into tree with updated pgno. |
2408 | * Set #MDB_TXN_ERROR on failure. |
2409 | * @param[in] mc cursor pointing to the page to be touched |
2410 | * @return 0 on success, non-zero on failure. |
2411 | */ |
2412 | static int |
2413 | mdb_page_touch(MDB_cursor *mc) |
2414 | { |
2415 | MDB_page *mp = mc->mc_pg[mc->mc_top], *np; |
2416 | MDB_txn *txn = mc->mc_txn; |
2417 | MDB_cursor *m2, *m3; |
2418 | pgno_t pgno; |
2419 | int rc; |
2420 | |
2421 | if (!F_ISSET(mp->mp_flags, P_DIRTY)) { |
2422 | if (txn->mt_flags & MDB_TXN_SPILLS) { |
2423 | np = NULL; |
2424 | rc = mdb_page_unspill(txn, mp, &np); |
2425 | if (rc) |
2426 | goto fail; |
2427 | if (np) |
2428 | goto done; |
2429 | } |
2430 | if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || |
2431 | (rc = mdb_page_alloc(mc, 1, &np))) |
2432 | goto fail; |
2433 | pgno = np->mp_pgno; |
2434 | DPRINTF(("touched db %d page %" Z"u -> %" Z"u" , DDBI(mc), |
2435 | mp->mp_pgno, pgno)); |
2436 | mdb_cassert(mc, mp->mp_pgno != pgno); |
2437 | mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); |
2438 | /* Update the parent page, if any, to point to the new page */ |
2439 | if (mc->mc_top) { |
2440 | MDB_page *parent = mc->mc_pg[mc->mc_top-1]; |
2441 | MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); |
2442 | SETPGNO(node, pgno); |
2443 | } else { |
2444 | mc->mc_db->md_root = pgno; |
2445 | } |
2446 | } else if (txn->mt_parent && !IS_SUBP(mp)) { |
2447 | MDB_ID2 mid, *dl = txn->mt_u.dirty_list; |
2448 | pgno = mp->mp_pgno; |
2449 | /* If txn has a parent, make sure the page is in our |
2450 | * dirty list. |
2451 | */ |
2452 | if (dl[0].mid) { |
2453 | unsigned x = mdb_mid2l_search(dl, pgno); |
2454 | if (x <= dl[0].mid && dl[x].mid == pgno) { |
2455 | if (mp != dl[x].mptr) { /* bad cursor? */ |
2456 | mc->mc_flags &= ~(C_INITIALIZED|C_EOF); |
2457 | txn->mt_flags |= MDB_TXN_ERROR; |
2458 | return MDB_CORRUPTED; |
2459 | } |
2460 | return 0; |
2461 | } |
2462 | } |
2463 | mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); |
2464 | /* No - copy it */ |
2465 | np = mdb_page_malloc(txn, 1); |
2466 | if (!np) |
2467 | return ENOMEM; |
2468 | mid.mid = pgno; |
2469 | mid.mptr = np; |
2470 | rc = mdb_mid2l_insert(dl, &mid); |
2471 | mdb_cassert(mc, rc == 0); |
2472 | } else { |
2473 | return 0; |
2474 | } |
2475 | |
2476 | mdb_page_copy(np, mp, txn->mt_env->me_psize); |
2477 | np->mp_pgno = pgno; |
2478 | np->mp_flags |= P_DIRTY; |
2479 | |
2480 | done: |
2481 | /* Adjust cursors pointing to mp */ |
2482 | mc->mc_pg[mc->mc_top] = np; |
2483 | m2 = txn->mt_cursors[mc->mc_dbi]; |
2484 | if (mc->mc_flags & C_SUB) { |
2485 | for (; m2; m2=m2->mc_next) { |
2486 | m3 = &m2->mc_xcursor->mx_cursor; |
2487 | if (m3->mc_snum < mc->mc_snum) continue; |
2488 | if (m3->mc_pg[mc->mc_top] == mp) |
2489 | m3->mc_pg[mc->mc_top] = np; |
2490 | } |
2491 | } else { |
2492 | for (; m2; m2=m2->mc_next) { |
2493 | if (m2->mc_snum < mc->mc_snum) continue; |
2494 | if (m2 == mc) continue; |
2495 | if (m2->mc_pg[mc->mc_top] == mp) { |
2496 | m2->mc_pg[mc->mc_top] = np; |
2497 | if (IS_LEAF(np)) |
2498 | XCURSOR_REFRESH(m2, mc->mc_top, np); |
2499 | } |
2500 | } |
2501 | } |
2502 | return 0; |
2503 | |
2504 | fail: |
2505 | txn->mt_flags |= MDB_TXN_ERROR; |
2506 | return rc; |
2507 | } |
2508 | |
2509 | int |
2510 | mdb_env_sync(MDB_env *env, int force) |
2511 | { |
2512 | int rc = 0; |
2513 | if (env->me_flags & MDB_RDONLY) |
2514 | return EACCES; |
2515 | if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { |
2516 | if (env->me_flags & MDB_WRITEMAP) { |
2517 | int flags = ((env->me_flags & MDB_MAPASYNC) && !force) |
2518 | ? MS_ASYNC : MS_SYNC; |
2519 | if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) |
2520 | rc = ErrCode(); |
2521 | #ifdef _WIN32 |
2522 | else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) |
2523 | rc = ErrCode(); |
2524 | #endif |
2525 | } else { |
2526 | #ifdef BROKEN_FDATASYNC |
2527 | if (env->me_flags & MDB_FSYNCONLY) { |
2528 | if (fsync(env->me_fd)) |
2529 | rc = ErrCode(); |
2530 | } else |
2531 | #endif |
2532 | if (MDB_FDATASYNC(env->me_fd)) |
2533 | rc = ErrCode(); |
2534 | } |
2535 | } |
2536 | return rc; |
2537 | } |
2538 | |
2539 | /** Back up parent txn's cursors, then grab the originals for tracking */ |
2540 | static int |
2541 | mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) |
2542 | { |
2543 | MDB_cursor *mc, *bk; |
2544 | MDB_xcursor *mx; |
2545 | size_t size; |
2546 | int i; |
2547 | |
2548 | for (i = src->mt_numdbs; --i >= 0; ) { |
2549 | if ((mc = src->mt_cursors[i]) != NULL) { |
2550 | size = sizeof(MDB_cursor); |
2551 | if (mc->mc_xcursor) |
2552 | size += sizeof(MDB_xcursor); |
2553 | for (; mc; mc = bk->mc_next) { |
2554 | bk = malloc(size); |
2555 | if (!bk) |
2556 | return ENOMEM; |
2557 | *bk = *mc; |
2558 | mc->mc_backup = bk; |
2559 | mc->mc_db = &dst->mt_dbs[i]; |
2560 | /* Kill pointers into src to reduce abuse: The |
2561 | * user may not use mc until dst ends. But we need a valid |
2562 | * txn pointer here for cursor fixups to keep working. |
2563 | */ |
2564 | mc->mc_txn = dst; |
2565 | mc->mc_dbflag = &dst->mt_dbflags[i]; |
2566 | if ((mx = mc->mc_xcursor) != NULL) { |
2567 | *(MDB_xcursor *)(bk+1) = *mx; |
2568 | mx->mx_cursor.mc_txn = dst; |
2569 | } |
2570 | mc->mc_next = dst->mt_cursors[i]; |
2571 | dst->mt_cursors[i] = mc; |
2572 | } |
2573 | } |
2574 | } |
2575 | return MDB_SUCCESS; |
2576 | } |
2577 | |
2578 | /** Close this write txn's cursors, give parent txn's cursors back to parent. |
2579 | * @param[in] txn the transaction handle. |
2580 | * @param[in] merge true to keep changes to parent cursors, false to revert. |
2581 | * @return 0 on success, non-zero on failure. |
2582 | */ |
2583 | static void |
2584 | mdb_cursors_close(MDB_txn *txn, unsigned merge) |
2585 | { |
2586 | MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; |
2587 | MDB_xcursor *mx; |
2588 | int i; |
2589 | |
2590 | for (i = txn->mt_numdbs; --i >= 0; ) { |
2591 | for (mc = cursors[i]; mc; mc = next) { |
2592 | next = mc->mc_next; |
2593 | if ((bk = mc->mc_backup) != NULL) { |
2594 | if (merge) { |
2595 | /* Commit changes to parent txn */ |
2596 | mc->mc_next = bk->mc_next; |
2597 | mc->mc_backup = bk->mc_backup; |
2598 | mc->mc_txn = bk->mc_txn; |
2599 | mc->mc_db = bk->mc_db; |
2600 | mc->mc_dbflag = bk->mc_dbflag; |
2601 | if ((mx = mc->mc_xcursor) != NULL) |
2602 | mx->mx_cursor.mc_txn = bk->mc_txn; |
2603 | } else { |
2604 | /* Abort nested txn */ |
2605 | *mc = *bk; |
2606 | if ((mx = mc->mc_xcursor) != NULL) |
2607 | *mx = *(MDB_xcursor *)(bk+1); |
2608 | } |
2609 | mc = bk; |
2610 | } |
2611 | /* Only malloced cursors are permanently tracked. */ |
2612 | free(mc); |
2613 | } |
2614 | cursors[i] = NULL; |
2615 | } |
2616 | } |
2617 | |
/* Operations accepted by mdb_reader_pid(). With MDB_PIDLOCK they map
 * directly onto the fcntl() lock commands.
 */
#if MDB_PIDLOCK
enum Pidlock_op {
	Pidset = F_SETLK, Pidcheck = F_GETLK
};
#else	/* Currently the same as defined(_WIN32) */
enum Pidlock_op {
	Pidset, Pidcheck
};
#endif
2627 | |
2628 | /** Set or check a pid lock. Set returns 0 on success. |
2629 | * Check returns 0 if the process is certainly dead, nonzero if it may |
2630 | * be alive (the lock exists or an error happened so we do not know). |
2631 | * |
2632 | * On Windows Pidset is a no-op, we merely check for the existence |
2633 | * of the process with the given pid. On POSIX we use a single byte |
2634 | * lock on the lockfile, set at an offset equal to the pid. |
2635 | */ |
2636 | static int |
2637 | mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid) |
2638 | { |
2639 | #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */ |
2640 | int ret = 0; |
2641 | HANDLE h; |
2642 | if (op == Pidcheck) { |
2643 | h = OpenProcess(env->me_pidquery, FALSE, pid); |
2644 | /* No documented "no such process" code, but other program use this: */ |
2645 | if (!h) |
2646 | return ErrCode() != ERROR_INVALID_PARAMETER; |
2647 | /* A process exists until all handles to it close. Has it exited? */ |
2648 | ret = WaitForSingleObject(h, 0) != 0; |
2649 | CloseHandle(h); |
2650 | } |
2651 | return ret; |
2652 | #else |
2653 | for (;;) { |
2654 | int rc; |
2655 | struct flock lock_info; |
2656 | memset(&lock_info, 0, sizeof(lock_info)); |
2657 | lock_info.l_type = F_WRLCK; |
2658 | lock_info.l_whence = SEEK_SET; |
2659 | lock_info.l_start = pid; |
2660 | lock_info.l_len = 1; |
2661 | if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { |
2662 | if (op == F_GETLK && lock_info.l_type != F_UNLCK) |
2663 | rc = -1; |
2664 | } else if ((rc = ErrCode()) == EINTR) { |
2665 | continue; |
2666 | } |
2667 | return rc; |
2668 | } |
2669 | #endif |
2670 | } |
2671 | |
/** Common code for #mdb_txn_begin() and #mdb_txn_renew().
 * For a read-only txn this finds or claims a reader slot (unless the
 * environment has no lock table) and records the current txnid in it.
 * For a write txn this takes the writer mutex (when a lock table
 * exists), advances the txnid and resets the txn's write state.
 * In both cases the core DB records and per-DBI flags are then loaded.
 * @param[in] txn the transaction handle to initialize
 * @return 0 on success, non-zero on failure: #MDB_BAD_RSLOT,
 * #MDB_READERS_FULL, #MDB_PANIC, #MDB_MAP_RESIZED, or an error from
 * the pid lock, mutex or thread-specific-data calls.
 */
static int
mdb_txn_renew0(MDB_txn *txn)
{
	MDB_env *env = txn->mt_env;
	MDB_txninfo *ti = env->me_txns;
	MDB_meta *meta;
	unsigned int i, nr, flags = txn->mt_flags;
	uint16_t x;
	int rc, new_notls = 0;

	/* Only the RDONLY bit of the incoming flags is kept */
	if ((flags &= MDB_TXN_RDONLY) != 0) {
		if (!ti) {
			/* No lock table: no reader slot is used */
			meta = mdb_env_pick_meta(env);
			txn->mt_txnid = meta->mm_txnid;
			txn->mt_u.reader = NULL;
		} else {
			/* With MDB_NOTLS the slot is kept in the txn,
			 * otherwise in thread-specific data.
			 */
			MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader :
				pthread_getspecific(env->me_txkey);
			if (r) {
				/* An existing slot must be ours and currently idle */
				if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
					return MDB_BAD_RSLOT;
			} else {
				MDB_PID_T pid = env->me_pid;
				MDB_THR_T tid = pthread_self();
				mdb_mutexref_t rmutex = env->me_rmutex;

				/* Set the pid lock once per environment */
				if (!env->me_live_reader) {
					rc = mdb_reader_pid(env, Pidset, pid);
					if (rc)
						return rc;
					env->me_live_reader = 1;
				}

				if (LOCK_MUTEX(rc, env, rmutex))
					return rc;
				/* Find the first unused slot, or the end of the used range */
				nr = ti->mti_numreaders;
				for (i=0; i<nr; i++)
					if (ti->mti_readers[i].mr_pid == 0)
						break;
				if (i == env->me_maxreaders) {
					UNLOCK_MUTEX(rmutex);
					return MDB_READERS_FULL;
				}
				r = &ti->mti_readers[i];
				/* Claim the reader slot, carefully since other code
				 * uses the reader table un-mutexed: First reset the
				 * slot, next publish it in mti_numreaders. After
				 * that, it is safe for mdb_env_close() to touch it.
				 * When it will be closed, we can finally claim it.
				 */
				r->mr_pid = 0;
				r->mr_txnid = (txnid_t)-1;
				r->mr_tid = tid;
				if (i == nr)
					ti->mti_numreaders = ++nr;
				env->me_close_readers = nr;
				r->mr_pid = pid;
				UNLOCK_MUTEX(rmutex);

				/* Non-zero only when a new slot was claimed under MDB_NOTLS;
				 * it is passed to mdb_txn_end() below as the slot-release bit.
				 */
				new_notls = (env->me_flags & MDB_NOTLS);
				if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
					/* Could not remember the slot: give it back */
					r->mr_pid = 0;
					return rc;
				}
			}
			do /* LY: Retry on a race, ITS#7970. */
				r->mr_txnid = ti->mti_txnid;
			while(r->mr_txnid != ti->mti_txnid);
			txn->mt_txnid = r->mr_txnid;
			txn->mt_u.reader = r;
			/* The meta page is selected by the low bit of the txnid */
			meta = env->me_metas[txn->mt_txnid & 1];
		}

	} else {
		/* Not yet touching txn == env->me_txn0, it may be active */
		if (ti) {
			if (LOCK_MUTEX(rc, env, env->me_wmutex))
				return rc;
			txn->mt_txnid = ti->mti_txnid;
			meta = env->me_metas[txn->mt_txnid & 1];
		} else {
			meta = mdb_env_pick_meta(env);
			txn->mt_txnid = meta->mm_txnid;
		}
		/* A write txn gets the next txnid */
		txn->mt_txnid++;
#if MDB_DEBUG
		if (txn->mt_txnid == mdb_debug_start)
			mdb_debug = 1;
#endif
		/* Reset the write state, reusing the environment's buffers */
		txn->mt_child = NULL;
		txn->mt_loose_pgs = NULL;
		txn->mt_loose_count = 0;
		txn->mt_dirty_room = MDB_IDL_UM_MAX;
		txn->mt_u.dirty_list = env->me_dirty_list;
		txn->mt_u.dirty_list[0].mid = 0;
		txn->mt_free_pgs = env->me_free_pgs;
		txn->mt_free_pgs[0] = 0;
		txn->mt_spill_pgs = NULL;
		env->me_txn = txn;
		memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int));
	}

	/* Copy the DB info and flags */
	memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db));

	/* Moved to here to avoid a data race in read TXNs */
	txn->mt_next_pgno = meta->mm_last_pg+1;

	txn->mt_flags = flags;

	/* Setup db info */
	txn->mt_numdbs = env->me_numdbs;
	for (i=CORE_DBS; i<txn->mt_numdbs; i++) {
		x = env->me_dbflags[i];
		txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
		txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_USRVALID|DB_STALE : 0;
	}
	txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID;
	txn->mt_dbflags[FREE_DBI] = DB_VALID;

	if (env->me_flags & MDB_FATAL_ERROR) {
		DPUTS("environment had fatal error, must shutdown!");
		rc = MDB_PANIC;
	} else if (env->me_maxpg < txn->mt_next_pgno) {
		/* The meta page refers to pages beyond the env's page limit */
		rc = MDB_MAP_RESIZED;
	} else {
		return MDB_SUCCESS;
	}
	/* Failure after the txn was set up: end it again */
	mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN);
	return rc;
}
2807 | |
2808 | int |
2809 | mdb_txn_renew(MDB_txn *txn) |
2810 | { |
2811 | int rc; |
2812 | |
2813 | if (!txn || !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED)) |
2814 | return EINVAL; |
2815 | |
2816 | rc = mdb_txn_renew0(txn); |
2817 | if (rc == MDB_SUCCESS) { |
2818 | DPRINTF(("renew txn %" Z"u%c %p on mdbenv %p, root page %" Z"u" , |
2819 | txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', |
2820 | (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); |
2821 | } |
2822 | return rc; |
2823 | } |
2824 | |
/* Begin a transaction.
 * Three kinds of handle are produced:
 *  - a nested (child) write txn when parent is given: freshly allocated
 *    as an MDB_ntxn, with its own dirty list, free list and cursor array;
 *  - a read-only txn when MDB_RDONLY is set: freshly allocated;
 *  - a top-level write txn otherwise: the environment's preallocated
 *    me_txn0 is reused.
 * Only the bits in MDB_TXN_BEGIN_FLAGS are taken from flags; the
 * environment's MDB_WRITEMAP bit is added.
 * On success the handle is stored in *ret and 0 is returned; *ret is
 * not written on failure.
 * Returns EACCES for a write txn in a read-only environment, EINVAL or
 * MDB_BAD_TXN when the parent cannot have a child, ENOMEM on allocation
 * failure, or the error from mdb_txn_renew0() / mdb_cursor_shadow().
 */
int
mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
{
	MDB_txn *txn;
	MDB_ntxn *ntxn;
	int rc, size, tsize;

	flags &= MDB_TXN_BEGIN_FLAGS;
	flags |= env->me_flags & MDB_WRITEMAP;

	if (env->me_flags & MDB_RDONLY & ~flags) /* write txn in RDONLY env */
		return EACCES;

	if (parent) {
		/* Nested transactions: Max 1 child, write txns only, no writemap */
		flags |= parent->mt_flags;
		if (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED)) {
			return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN;
		}
		/* Child txns save MDB_pgstate and use own copy of cursors */
		size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1);
		size += tsize = sizeof(MDB_ntxn);
	} else if (flags & MDB_RDONLY) {
		size = env->me_maxdbs * (sizeof(MDB_db)+1);
		size += tsize = sizeof(MDB_txn);
	} else {
		/* Reuse preallocated write txn. However, do not touch it until
		 * mdb_txn_renew0() succeeds, since it currently may be active.
		 */
		txn = env->me_txn0;
		goto renew;
	}
	if ((txn = calloc(1, size)) == NULL) {
		DPRINTF(("calloc: %s", strerror(errno)));
		return ENOMEM;
	}
	/* Carve the trailing arrays out of the single allocation:
	 * mt_dbs directly follows the txn struct, mt_dbflags occupies
	 * the last me_maxdbs bytes.
	 */
	txn->mt_dbxs = env->me_dbxs;	/* static */
	txn->mt_dbs = (MDB_db *) ((char *)txn + tsize);
	txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs;
	txn->mt_flags = flags;
	txn->mt_env = env;

	if (parent) {
		unsigned int i;
		/* The cursor array sits between mt_dbs and mt_dbflags */
		txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
		txn->mt_dbiseqs = parent->mt_dbiseqs;
		txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
		if (!txn->mt_u.dirty_list ||
			!(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)))
		{
			free(txn->mt_u.dirty_list);
			free(txn);
			return ENOMEM;
		}
		/* The child starts from the parent's current state */
		txn->mt_txnid = parent->mt_txnid;
		txn->mt_dirty_room = parent->mt_dirty_room;
		txn->mt_u.dirty_list[0].mid = 0;
		txn->mt_spill_pgs = NULL;
		txn->mt_next_pgno = parent->mt_next_pgno;
		parent->mt_flags |= MDB_TXN_HAS_CHILD;
		parent->mt_child = txn;
		txn->mt_parent = parent;
		txn->mt_numdbs = parent->mt_numdbs;
		memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
		/* Copy parent's mt_dbflags, but clear DB_NEW */
		for (i=0; i<txn->mt_numdbs; i++)
			txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
		rc = 0;
		ntxn = (MDB_ntxn *)txn;
		ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
		if (env->me_pghead) {
			/* Give the child its own copy of the page list; the
			 * parent's list stays referenced from mnt_pgstate.
			 */
			size = MDB_IDL_SIZEOF(env->me_pghead);
			env->me_pghead = mdb_midl_alloc(env->me_pghead[0]);
			if (env->me_pghead)
				memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
			else
				rc = ENOMEM;
		}
		if (!rc)
			rc = mdb_cursor_shadow(parent, txn);
		if (rc)
			mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD);
	} else { /* MDB_RDONLY */
		txn->mt_dbiseqs = env->me_dbiseqs;
renew:
		/* Also reached by goto for the preallocated write txn */
		rc = mdb_txn_renew0(txn);
	}
	if (rc) {
		/* The preallocated write txn belongs to the env: never free it */
		if (txn != env->me_txn0)
			free(txn);
	} else {
		txn->mt_flags |= flags;	/* could not change txn=me_txn0 earlier */
		*ret = txn;
		DPRINTF(("begin txn %" Z"u%c %p on mdbenv %p, root page %" Z"u",
			txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w',
			(void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root));
	}

	return rc;
}
2925 | |
2926 | MDB_env * |
2927 | mdb_txn_env(MDB_txn *txn) |
2928 | { |
2929 | if(!txn) return NULL; |
2930 | return txn->mt_env; |
2931 | } |
2932 | |
2933 | size_t |
2934 | mdb_txn_id(MDB_txn *txn) |
2935 | { |
2936 | if(!txn) return 0; |
2937 | return txn->mt_txnid; |
2938 | } |
2939 | |
2940 | /** Export or close DBI handles opened in this txn. */ |
2941 | static void |
2942 | mdb_dbis_update(MDB_txn *txn, int keep) |
2943 | { |
2944 | int i; |
2945 | MDB_dbi n = txn->mt_numdbs; |
2946 | MDB_env *env = txn->mt_env; |
2947 | unsigned char *tdbflags = txn->mt_dbflags; |
2948 | |
2949 | for (i = n; --i >= CORE_DBS;) { |
2950 | if (tdbflags[i] & DB_NEW) { |
2951 | if (keep) { |
2952 | env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; |
2953 | } else { |
2954 | char *ptr = env->me_dbxs[i].md_name.mv_data; |
2955 | if (ptr) { |
2956 | env->me_dbxs[i].md_name.mv_data = NULL; |
2957 | env->me_dbxs[i].md_name.mv_size = 0; |
2958 | env->me_dbflags[i] = 0; |
2959 | env->me_dbiseqs[i]++; |
2960 | free(ptr); |
2961 | } |
2962 | } |
2963 | } |
2964 | } |
2965 | if (keep && env->me_numdbs < n) |
2966 | env->me_numdbs = n; |
2967 | } |
2968 | |
/** End a transaction, except successful commit of a nested transaction.
 * May be called twice for readonly txns: First reset it, then abort.
 * For a read-only txn the reader slot is marked idle and, depending on
 * MDB_NOTLS and #MDB_END_SLOT, released. For a write txn that is not
 * yet finished, cursors and dirty pages are released and the
 * environment's (or the parent's) state is restored.
 * @param[in] txn the transaction handle to end
 * @param[in] mode why and how to end the transaction: an operation
 * code, optionally combined with #MDB_END_UPDATE, #MDB_END_SLOT and
 * #MDB_END_FREE.
 */
static void
mdb_txn_end(MDB_txn *txn, unsigned mode)
{
	MDB_env *env = txn->mt_env;
#if MDB_DEBUG
	static const char *const names[] = MDB_END_NAMES;
#endif

	/* Export or close DBI handles opened in this txn */
	mdb_dbis_update(txn, mode & MDB_END_UPDATE);

	DPRINTF(("%s txn %" Z"u%c %p on mdbenv %p, root page %" Z"u",
		names[mode & MDB_END_OPMASK],
		txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
		(void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root));

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		if (txn->mt_u.reader) {
			/* Mark the slot as not being in a transaction */
			txn->mt_u.reader->mr_txnid = (txnid_t)-1;
			if (!(env->me_flags & MDB_NOTLS)) {
				txn->mt_u.reader = NULL; /* txn does not own reader */
			} else if (mode & MDB_END_SLOT) {
				/* Release the slot itself */
				txn->mt_u.reader->mr_pid = 0;
				txn->mt_u.reader = NULL;
			} /* else txn owns the slot until it does MDB_END_SLOT */
		}
		txn->mt_numdbs = 0;		/* prevent further DBI activity */
		txn->mt_flags |= MDB_TXN_FINISHED;

	} else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) {
		/* Captured now: me_pghead is reset or restored below,
		 * and this txn's list is freed at the end.
		 */
		pgno_t *pghead = env->me_pghead;

		if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */
			mdb_cursors_close(txn, 0);
		if (!(env->me_flags & MDB_WRITEMAP)) {
			mdb_dlist_free(txn);
		}

		txn->mt_numdbs = 0;
		txn->mt_flags = MDB_TXN_FINISHED;

		if (!txn->mt_parent) {
			/* Top-level write txn: hand the free-page list buffer
			 * back to the environment for reuse.
			 */
			mdb_midl_shrink(&txn->mt_free_pgs);
			env->me_free_pgs = txn->mt_free_pgs;
			/* me_pgstate: */
			env->me_pghead = NULL;
			env->me_pglast = 0;

			env->me_txn = NULL;
			mode = 0;	/* txn == env->me_txn0, do not free() it */

			/* The writer mutex was locked in mdb_txn_begin. */
			if (env->me_txns)
				UNLOCK_MUTEX(env->me_wmutex);
		} else {
			/* Nested txn: detach from the parent and restore the
			 * page state saved when the child was begun.
			 */
			txn->mt_parent->mt_child = NULL;
			txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD;
			env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate;
			mdb_midl_free(txn->mt_free_pgs);
			free(txn->mt_u.dirty_list);
		}
		mdb_midl_free(txn->mt_spill_pgs);

		mdb_midl_free(pghead);
	}

	if (mode & MDB_END_FREE)
		free(txn);
}
3043 | |
3044 | void |
3045 | mdb_txn_reset(MDB_txn *txn) |
3046 | { |
3047 | if (txn == NULL) |
3048 | return; |
3049 | |
3050 | /* This call is only valid for read-only txns */ |
3051 | if (!(txn->mt_flags & MDB_TXN_RDONLY)) |
3052 | return; |
3053 | |
3054 | mdb_txn_end(txn, MDB_END_RESET); |
3055 | } |
3056 | |
3057 | void |
3058 | mdb_txn_abort(MDB_txn *txn) |
3059 | { |
3060 | if (txn == NULL) |
3061 | return; |
3062 | |
3063 | if (txn->mt_child) |
3064 | mdb_txn_abort(txn->mt_child); |
3065 | |
3066 | mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); |
3067 | } |
3068 | |
/** Save the freelist as of this transaction to the freeDB.
 * This changes the freelist. Keep trying until it stabilizes.
 *
 * The work is done in three stages:
 *  -# If there is no me_pghead, loose pages are moved to mt_free_pgs
 *     and removed from the dirty list.
 *  -# A loop deletes the freeDB records already consumed, writes this
 *     txn's freed pages as one record, and reserves records big enough
 *     to hold me_pghead. Each step can change the lists, so the loop
 *     restarts until the reserved room matches.
 *  -# Remaining loose pages are merged into me_pghead, and me_pghead
 *     is written into the reserved records.
 *
 * @param[in] txn the write transaction being committed.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_freelist_save(MDB_txn *txn)
{
	/* env->me_pghead[] can grow and shrink during this call.
	 * env->me_pglast and txn->mt_free_pgs[] can only grow.
	 * Page numbers cannot disappear from txn->mt_free_pgs[].
	 */
	MDB_cursor mc;
	MDB_env *env = txn->mt_env;
	int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
	txnid_t pglast = 0, head_id = 0;
	pgno_t freecnt = 0, *free_pgs, *mop;
	ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;

	mdb_cursor_init(&mc, txn, FREE_DBI, NULL);

	if (env->me_pghead) {
		/* Make sure first page of freeDB is touched and on freelist */
		rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY);
		if (rc && rc != MDB_NOTFOUND)
			return rc;
	}

	if (!env->me_pghead && txn->mt_loose_pgs) {
		/* Put loose page numbers in mt_free_pgs, since
		 * we may be unable to return them to me_pghead.
		 */
		MDB_page *mp = txn->mt_loose_pgs;
		MDB_ID2 *dl = txn->mt_u.dirty_list;
		unsigned x;
		if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)
			return rc;
		for (; mp; mp = NEXT_LOOSE_PAGE(mp)) {
			mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno);
			/* must also remove from dirty list */
			if (txn->mt_flags & MDB_TXN_WRITEMAP) {
				/* Linear scan for the page number in this mode */
				for (x=1; x<=dl[0].mid; x++)
					if (dl[x].mid == mp->mp_pgno)
						break;
				mdb_tassert(txn, x <= dl[0].mid);
			} else {
				x = mdb_mid2l_search(dl, mp->mp_pgno);
				mdb_tassert(txn, dl[x].mid == mp->mp_pgno);
				mdb_dpage_free(env, mp);
			}
			/* A NULL mptr marks the slot for removal below */
			dl[x].mptr = NULL;
		}
		{
			/* squash freed slots out of the dirty list */
			unsigned y;
			/* y: first slot with a NULL mptr */
			for (y=1; dl[y].mptr && y <= dl[0].mid; y++);
			if (y <= dl[0].mid) {
				/* x: next slot to fill, y: next slot to examine */
				for(x=y, y++;;) {
					while (!dl[y].mptr && y <= dl[0].mid) y++;
					if (y > dl[0].mid) break;
					dl[x++] = dl[y++];
				}
				dl[0].mid = x-1;
			} else {
				/* all slots freed */
				dl[0].mid = 0;
			}
		}
		txn->mt_loose_pgs = NULL;
		txn->mt_loose_count = 0;
	}

	/* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
	clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
		? SSIZE_MAX : maxfree_1pg;

	for (;;) {
		/* Come back here after each Put() in case freelist changed */
		MDB_val key, data;
		pgno_t *pgs;
		ssize_t j;

		/* If using records from freeDB which we have not yet
		 * deleted, delete them and any we reserved for me_pghead.
		 */
		while (pglast < env->me_pglast) {
			rc = mdb_cursor_first(&mc, &key, NULL);
			if (rc)
				return rc;
			pglast = head_id = *(txnid_t *)key.mv_data;
			/* Earlier reservations are gone with the deleted records */
			total_room = head_room = 0;
			mdb_tassert(txn, pglast <= env->me_pglast);
			rc = mdb_cursor_del(&mc, 0);
			if (rc)
				return rc;
		}

		/* Save the IDL of pages freed by this txn, to a single record */
		if (freecnt < txn->mt_free_pgs[0]) {
			if (!freecnt) {
				/* Make sure last page of freeDB is touched and on freelist */
				rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY);
				if (rc && rc != MDB_NOTFOUND)
					return rc;
			}
			free_pgs = txn->mt_free_pgs;
			/* Write to last page of freeDB */
			key.mv_size = sizeof(txn->mt_txnid);
			key.mv_data = &txn->mt_txnid;
			do {
				freecnt = free_pgs[0];
				data.mv_size = MDB_IDL_SIZEOF(free_pgs);
				rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
				if (rc)
					return rc;
				/* Retry if mt_free_pgs[] grew during the Put() */
				free_pgs = txn->mt_free_pgs;
			} while (freecnt < free_pgs[0]);
			mdb_midl_sort(free_pgs);
			memcpy(data.mv_data, free_pgs, data.mv_size);
#if (MDB_DEBUG) > 1
			{
				unsigned int i = free_pgs[0];
				DPRINTF(("IDL write txn %" Z"u root %" Z"u num %u",
					txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i));
				for (; i; i--)
					DPRINTF(("IDL %" Z"u", free_pgs[i]));
			}
#endif
			continue;
		}

		mop = env->me_pghead;
		/* Loose pages are counted in, they are merged into me_pghead later */
		mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count;

		/* Reserve records for me_pghead[]. Split it if multi-page,
		 * to avoid searching freeDB for a page range. Use keys in
		 * range [1,me_pglast]: Smaller than txnid of oldest reader.
		 */
		if (total_room >= mop_len) {
			/* Enough room: stop on an exact fit, or once the single
			 * extra pass allowed by 'more' has been used up.
			 */
			if (total_room == mop_len || --more < 0)
				break;
		} else if (head_room >= maxfree_1pg && head_id > 1) {
			/* Keep current record (overflow page), add a new one */
			head_id--;
			head_room = 0;
		}
		/* (Re)write {key = head_id, IDL length = head_room} */
		total_room -= head_room;
		head_room = mop_len - total_room;
		if (head_room > maxfree_1pg && head_id > 1) {
			/* Overflow multi-page for part of me_pghead */
			head_room /= head_id; /* amortize page sizes */
			head_room += maxfree_1pg - head_room % (maxfree_1pg + 1);
		} else if (head_room < 0) {
			/* Rare case, not bothering to delete this record */
			head_room = 0;
		}
		key.mv_size = sizeof(head_id);
		key.mv_data = &head_id;
		data.mv_size = (head_room + 1) * sizeof(pgno_t);
		rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
		if (rc)
			return rc;
		/* IDL is initially empty, zero out at least the length */
		pgs = (pgno_t *)data.mv_data;
		/* Above clean_limit the whole record is zeroed, else only pgs[0] */
		j = head_room > clean_limit ? head_room : 0;
		do {
			pgs[j] = 0;
		} while (--j >= 0);
		total_room += head_room;
	}

	/* Return loose page numbers to me_pghead, though usually none are
	 * left at this point. The pages themselves remain in dirty_list.
	 */
	if (txn->mt_loose_pgs) {
		MDB_page *mp = txn->mt_loose_pgs;
		unsigned count = txn->mt_loose_count;
		MDB_IDL loose;
		/* Room for loose pages + temp IDL with same */
		if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0)
			return rc;
		mop = env->me_pghead;
		/* The temp IDL is built at the tail of me_pghead's allocation */
		loose = mop + MDB_IDL_ALLOCLEN(mop) - count;
		for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp))
			loose[ ++count ] = mp->mp_pgno;
		loose[0] = count;
		mdb_midl_sort(loose);
		mdb_midl_xmerge(mop, loose);
		txn->mt_loose_pgs = NULL;
		txn->mt_loose_count = 0;
		mop_len = mop[0];
	}

	/* Fill in the reserved me_pghead records */
	rc = MDB_SUCCESS;
	if (mop_len) {
		MDB_val key, data;

		/* Records are filled from the end of me_pghead backwards */
		mop += mop_len;
		rc = mdb_cursor_first(&mc, &key, &data);
		for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) {
			txnid_t id = *(txnid_t *)key.mv_data;
			ssize_t	len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1;
			MDB_ID save;

			mdb_tassert(txn, len >= 0 && id <= env->me_pglast);
			key.mv_data = &id;
			if (len > mop_len) {
				/* The record is larger than what is left: shrink it */
				len = mop_len;
				data.mv_size = (len + 1) * sizeof(MDB_ID);
			}
			data.mv_data = mop -= len;
			/* Temporarily overwrite the element before this slice with
			 * the slice length, so the slice forms an IDL in place.
			 */
			save = mop[0];
			mop[0] = len;
			rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT);
			mop[0] = save;
			if (rc || !(mop_len -= len))
				break;
		}
	}
	return rc;
}
3291 | |
/** Flush (some) dirty pages to the map, after clearing their dirty flag.
 *
 * Entries keep+1 .. dl[0].mid of the transaction's dirty_list are examined.
 * Pages flagged #P_LOOSE or #P_KEEP are not flushed; they stay in the list
 * with #P_KEEP cleared. Under #MDB_WRITEMAP nothing is written here: the
 * remaining pages only have #P_DIRTY cleared. Otherwise the remaining pages
 * are written to env->me_fd and released with #mdb_dpage_free().
 * On success the list is compacted down to the retained entries and
 * mt_dirty_room grows by the number of entries removed.
 *
 * @param[in] txn the transaction that's being committed
 * @param[in] keep number of initial pages in dirty_list to keep dirty.
 * @return 0 on success, non-zero on failure. A failed seek or write returns
 * immediately with the OS error code (EIO for a short write), before the
 * compaction pass has run.
 */
static int
mdb_page_flush(MDB_txn *txn, int keep)
{
	MDB_env *env = txn->mt_env;
	MDB_ID2L dl = txn->mt_u.dirty_list;
	unsigned psize = env->me_psize, j;
	int i, pagecount = dl[0].mid, rc;
	size_t size = 0, pos = 0;
	pgno_t pgno = 0;
	MDB_page *dp = NULL;
#ifdef _WIN32
	OVERLAPPED ov;
#else
	struct iovec iov[MDB_COMMIT_PAGES];	/* batch of pages queued for one write */
	ssize_t wpos = 0, wsize = 0, wres;	/* file offset / byte count of the batch, write result */
	size_t next_pos = 1; /* impossible pos, so pos != next_pos */
	int n = 0;	/* number of iov[] slots in use */
#endif

	/* i scans the list, j is the index of the last entry kept so far */
	j = i = keep;

	if (env->me_flags & MDB_WRITEMAP) {
		/* Clear dirty flags */
		while (++i <= pagecount) {
			dp = dl[i].mptr;
			/* Don't flush this page yet */
			if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
				dp->mp_flags &= ~P_KEEP;
				/* Compact in place: retained entries slide down to slot j */
				dl[++j] = dl[i];
				continue;
			}
			dp->mp_flags &= ~P_DIRTY;
		}
		goto done;
	}

	/* Write the pages */
	for (;;) {
		if (++i <= pagecount) {
			dp = dl[i].mptr;
			/* Don't flush this page yet */
			if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
				dp->mp_flags &= ~P_KEEP;
				/* mid == 0 marks the entry as skipped for the pass after the writes */
				dl[i].mid = 0;
				continue;
			}
			pgno = dl[i].mid;
			/* clear dirty flag */
			dp->mp_flags &= ~P_DIRTY;
			pos = pgno * psize;
			size = psize;
			/* An overflow page spans mp_pages consecutive pages */
			if (IS_OVERFLOW(dp)) size *= dp->mp_pages;
		}
#ifdef _WIN32
		else break;

		/* Windows actually supports scatter/gather I/O, but only on
		 * unbuffered file handles. Since we're relying on the OS page
		 * cache for all our data, that's self-defeating. So we just
		 * write pages one at a time. We use the ov structure to set
		 * the write offset, to at least save the overhead of a Seek
		 * system call.
		 */
		DPRINTF(("committing page %" Z"u" , pgno));
		memset(&ov, 0, sizeof(ov));
		ov.Offset = pos & 0xffffffff;
		ov.OffsetHigh = pos >> 16 >> 16;
		if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) {
			rc = ErrCode();
			DPRINTF(("WriteFile: %d" , rc));
			return rc;
		}
#else
		/* Write up to MDB_COMMIT_PAGES dirty pages at a time. */
		/* The pending batch is written out when the current page does not
		 * directly follow it in the file, when iov[] is full, or when adding
		 * the page would exceed MAX_WRITE. Once i has passed pagecount, pos
		 * and size still hold the last page's values, so pos != next_pos
		 * and the final batch is written before the loop exits.
		 */
		if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) {
			if (n) {
retry_write:
				/* Write previous page(s) */
#ifdef MDB_USE_PWRITEV
				wres = pwritev(env->me_fd, iov, n, wpos);
#else
				if (n == 1) {
					wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos);
				} else {
retry_seek:
					if (lseek(env->me_fd, wpos, SEEK_SET) == -1) {
						rc = ErrCode();
						if (rc == EINTR)
							goto retry_seek;
						DPRINTF(("lseek: %s" , strerror(rc)));
						return rc;
					}
					wres = writev(env->me_fd, iov, n);
				}
#endif
				if (wres != wsize) {
					if (wres < 0) {
						rc = ErrCode();
						if (rc == EINTR)
							goto retry_write;
						DPRINTF(("Write error: %s" , strerror(rc)));
					} else {
						rc = EIO; /* TODO: Use which error code? */
						DPUTS("short write, filesystem full?" );
					}
					return rc;
				}
				n = 0;
			}
			if (i > pagecount)
				break;
			/* Start a new batch at the current page */
			wpos = pos;
			wsize = 0;
		}
		DPRINTF(("committing page %" Z"u" , pgno));
		/* Queue the current page and note where a contiguous successor would start */
		next_pos = pos + size;
		iov[n].iov_len = size;
		iov[n].iov_base = (char *)dp;
		wsize += size;
		n++;
#endif /* _WIN32 */
	}

	/* MIPS has cache coherency issues, this is a no-op everywhere else
	 * Note: for any size >= on-chip cache size, entire on-chip cache is
	 * flushed.
	 */
	CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE);

	/* Second pass: keep the skipped entries, free the pages that were written */
	for (i = keep; ++i <= pagecount; ) {
		dp = dl[i].mptr;
		/* This is a page we skipped above */
		if (!dl[i].mid) {
			dl[++j] = dl[i];
			/* Restore the page number that was zeroed as the skip marker */
			dl[j].mid = dp->mp_pgno;
			continue;
		}
		mdb_dpage_free(env, dp);
	}

done:
	/* The scanning loops leave i one past the last index examined */
	i--;
	/* i - j entries were dropped from the list, so that much room was regained */
	txn->mt_dirty_room += i - j;
	dl[0].mid = j;
	return MDB_SUCCESS;
}
3443 | |
/** Commit a transaction.
 *
 * Any child transaction is committed first. A read-only transaction is
 * simply ended. A nested write transaction is merged into its parent
 * (free list, DB table, dirty list, spill list, loose pages) and then freed;
 * no flush or meta write is done for it here. A top-level write transaction
 * stores the records of dirty named DBs in the main DB, saves the freelist,
 * then calls #mdb_page_flush(), mdb_env_sync() and #mdb_env_write_meta().
 *
 * @param[in] txn the transaction to commit
 * @return 0 on success, non-zero on failure: EINVAL for a NULL txn or for a
 * top-level txn that is not the environment's current write txn,
 * #MDB_BAD_TXN if the txn is already finished or in error, #MDB_BAD_DBI if
 * a dirty DBI changed, or the error code of a failing step. Failures that
 * reach the fail label abort the txn with mdb_txn_abort().
 */
int
mdb_txn_commit(MDB_txn *txn)
{
	int rc;
	unsigned int i, end_mode;
	MDB_env *env;

	if (txn == NULL)
		return EINVAL;

	/* mdb_txn_end() mode for a commit which writes nothing */
	end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE;

	/* Commit the child first, which merges it into this txn */
	if (txn->mt_child) {
		rc = mdb_txn_commit(txn->mt_child);
		if (rc)
			goto fail;
	}

	env = txn->mt_env;

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
		goto done;
	}

	if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) {
		DPUTS("txn has failed/finished, can't commit" );
		if (txn->mt_parent)
			txn->mt_parent->mt_flags |= MDB_TXN_ERROR;
		rc = MDB_BAD_TXN;
		goto fail;
	}

	if (txn->mt_parent) {
		MDB_txn *parent = txn->mt_parent;
		MDB_page **lp;
		MDB_ID2L dst, src;
		MDB_IDL pspill;
		unsigned x, y, len, ps_len;

		/* Append our free list to parent's */
		rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
		if (rc)
			goto fail;
		mdb_midl_free(txn->mt_free_pgs);
		/* Failures after this must either undo the changes
		 * to the parent or set MDB_TXN_ERROR in the parent.
		 */

		parent->mt_next_pgno = txn->mt_next_pgno;
		parent->mt_flags = txn->mt_flags;

		/* Merge our cursors into parent's and close them */
		mdb_cursors_close(txn, 1);

		/* Update parent's DB table. */
		memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
		parent->mt_numdbs = txn->mt_numdbs;
		parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI];
		parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI];
		for (i=CORE_DBS; i<txn->mt_numdbs; i++) {
			/* preserve parent's DB_NEW status */
			x = parent->mt_dbflags[i] & DB_NEW;
			parent->mt_dbflags[i] = txn->mt_dbflags[i] | x;
		}

		dst = parent->mt_u.dirty_list;
		src = txn->mt_u.dirty_list;
		/* Remove anything in our dirty list from parent's spill list */
		if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
			x = y = ps_len;
			/* Maximum value in slot 0 acts as a sentinel: the downward
			 * scan "while (pn > pspill[x])" always stops there.
			 */
			pspill[0] = (pgno_t)-1;
			/* Mark our dirty pages as deleted in parent spill list */
			for (i=0, len=src[0].mid; ++i <= len; ) {
				/* Spill list entries are compared as pgno << 1; an entry
				 * with the low bit set is treated as deleted.
				 */
				MDB_ID pn = src[i].mid << 1;
				while (pn > pspill[x])
					x--;
				if (pn == pspill[x]) {
					pspill[x] = 1;
					/* y tracks the slot just below the lowest deleted entry */
					y = --x;
				}
			}
			/* Squash deleted pagenums if we deleted any */
			for (x=y; ++x <= ps_len; )
				if (!(pspill[x] & 1))
					pspill[++y] = pspill[x];
			pspill[0] = y;
		}

		/* Remove anything in our spill list from parent's dirty list */
		if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) {
			for (i=1; i<=txn->mt_spill_pgs[0]; i++) {
				MDB_ID pn = txn->mt_spill_pgs[i];
				if (pn & 1)
					continue; /* deleted spillpg */
				pn >>= 1;
				y = mdb_mid2l_search(dst, pn);
				if (y <= dst[0].mid && dst[y].mid == pn) {
					free(dst[y].mptr);
					/* Close the gap by shifting the following entries down */
					while (y < dst[0].mid) {
						dst[y] = dst[y+1];
						y++;
					}
					dst[0].mid--;
				}
			}
		}

		/* Find len = length of merging our dirty list with parent's */
		x = dst[0].mid;
		/* Slot 0 now holds 0, so the "yp < dst[...].mid" scans below stop at index 0 */
		dst[0].mid = 0; /* simplify loops */
		if (parent->mt_parent) {
			/* Start from the sum of both lengths and subtract one for
			 * every page number present in both lists.
			 */
			len = x + src[0].mid;
			y = mdb_mid2l_search(src, dst[x].mid + 1) - 1;
			for (i = x; y && i; y--) {
				pgno_t yp = src[y].mid;
				while (yp < dst[i].mid)
					i--;
				if (yp == dst[i].mid) {
					i--;
					len--;
				}
			}
		} else { /* Simplify the above for single-ancestor case */
			len = MDB_IDL_UM_MAX - txn->mt_dirty_room;
		}
		/* Merge our dirty list with parent's */
		/* Filled from the top (index len) downwards so dst can be merged
		 * in place; where both lists hold the same page number, the
		 * parent's copy is freed and ours is kept.
		 */
		y = src[0].mid;
		for (i = len; y; dst[i--] = src[y--]) {
			pgno_t yp = src[y].mid;
			while (yp < dst[x].mid)
				dst[i--] = dst[x--];
			if (yp == dst[x].mid)
				free(dst[x--].mptr);
		}
		mdb_tassert(txn, i == x);
		dst[0].mid = len;
		free(txn->mt_u.dirty_list);
		parent->mt_dirty_room = txn->mt_dirty_room;
		if (txn->mt_spill_pgs) {
			if (parent->mt_spill_pgs) {
				/* TODO: Prevent failure here, so parent does not fail */
				rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
				if (rc)
					parent->mt_flags |= MDB_TXN_ERROR;
				mdb_midl_free(txn->mt_spill_pgs);
				mdb_midl_sort(parent->mt_spill_pgs);
			} else {
				/* Parent had no spill list: hand ours over instead of copying */
				parent->mt_spill_pgs = txn->mt_spill_pgs;
			}
		}

		/* Append our loose page list to parent's */
		for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp))
			;
		*lp = txn->mt_loose_pgs;
		parent->mt_loose_count += txn->mt_loose_count;

		parent->mt_child = NULL;
		mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
		free(txn);
		/* rc is 0 here unless appending to the parent's spill list failed */
		return rc;
	}

	if (txn != env->me_txn) {
		DPUTS("attempt to commit unknown transaction" );
		rc = EINVAL;
		goto fail;
	}

	mdb_cursors_close(txn, 0);

	/* No dirty pages and neither DIRTY nor SPILLS flag: end with the empty-commit mode */
	if (!txn->mt_u.dirty_list[0].mid &&
		!(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS)))
		goto done;

	DPRINTF(("committing txn %" Z"u %p on mdbenv %p, root page %" Z"u" ,
		txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root));

	/* Update DB root pointers */
	if (txn->mt_numdbs > CORE_DBS) {
		MDB_cursor mc;
		MDB_dbi i;
		MDB_val data;
		data.mv_size = sizeof(MDB_db);

		mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
		for (i = CORE_DBS; i < txn->mt_numdbs; i++) {
			if (txn->mt_dbflags[i] & DB_DIRTY) {
				if (TXN_DBI_CHANGED(txn, i)) {
					rc = MDB_BAD_DBI;
					goto fail;
				}
				/* Store this DB's MDB_db record in the main DB under its name */
				data.mv_data = &txn->mt_dbs[i];
				rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data,
					F_SUBDATA);
				if (rc)
					goto fail;
			}
		}
	}

	rc = mdb_freelist_save(txn);
	if (rc)
		goto fail;

	mdb_midl_free(env->me_pghead);
	env->me_pghead = NULL;
	mdb_midl_shrink(&txn->mt_free_pgs);

#if (MDB_DEBUG) > 2
	mdb_audit(txn);
#endif

	/* Order matters: pages are flushed and synced before the meta page is written */
	if ((rc = mdb_page_flush(txn, 0)) ||
		(rc = mdb_env_sync(env, 0)) ||
		(rc = mdb_env_write_meta(txn)))
		goto fail;
	end_mode = MDB_END_COMMITTED|MDB_END_UPDATE;

done:
	mdb_txn_end(txn, end_mode);
	return MDB_SUCCESS;

fail:
	mdb_txn_abort(txn);
	return rc;
}
3672 | |
3673 | /** Read the environment parameters of a DB environment before |
3674 | * mapping it into memory. |
3675 | * @param[in] env the environment handle |
3676 | * @param[out] meta address of where to store the meta information |
3677 | * @return 0 on success, non-zero on failure. |
3678 | */ |
3679 | static int ESECT |
3680 | (MDB_env *env, MDB_meta *meta) |
3681 | { |
3682 | MDB_metabuf pbuf; |
3683 | MDB_page *p; |
3684 | MDB_meta *m; |
3685 | int i, rc, off; |
3686 | enum { Size = sizeof(pbuf) }; |
3687 | |
3688 | /* We don't know the page size yet, so use a minimum value. |
3689 | * Read both meta pages so we can use the latest one. |
3690 | */ |
3691 | |
3692 | for (i=off=0; i<NUM_METAS; i++, off += meta->mm_psize) { |
3693 | #ifdef _WIN32 |
3694 | DWORD len; |
3695 | OVERLAPPED ov; |
3696 | memset(&ov, 0, sizeof(ov)); |
3697 | ov.Offset = off; |
3698 | rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1; |
3699 | if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) |
3700 | rc = 0; |
3701 | #else |
3702 | rc = pread(env->me_fd, &pbuf, Size, off); |
3703 | #endif |
3704 | if (rc != Size) { |
3705 | if (rc == 0 && off == 0) |
3706 | return ENOENT; |
3707 | rc = rc < 0 ? (int) ErrCode() : MDB_INVALID; |
3708 | DPRINTF(("read: %s" , mdb_strerror(rc))); |
3709 | return rc; |
3710 | } |
3711 | |
3712 | p = (MDB_page *)&pbuf; |
3713 | |
3714 | if (!F_ISSET(p->mp_flags, P_META)) { |
3715 | DPRINTF(("page %" Z"u not a meta page" , p->mp_pgno)); |
3716 | return MDB_INVALID; |
3717 | } |
3718 | |
3719 | m = METADATA(p); |
3720 | if (m->mm_magic != MDB_MAGIC) { |
3721 | DPUTS("meta has invalid magic" ); |
3722 | return MDB_INVALID; |
3723 | } |
3724 | |
3725 | if (m->mm_version != MDB_DATA_VERSION) { |
3726 | DPRINTF(("database is version %u, expected version %u" , |
3727 | m->mm_version, MDB_DATA_VERSION)); |
3728 | return MDB_VERSION_MISMATCH; |
3729 | } |
3730 | |
3731 | if (off == 0 || m->mm_txnid > meta->mm_txnid) |
3732 | *meta = *m; |
3733 | } |
3734 | return 0; |
3735 | } |
3736 | |
3737 | /** Fill in most of the zeroed #MDB_meta for an empty database environment */ |
3738 | static void ESECT |
3739 | mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) |
3740 | { |
3741 | meta->mm_magic = MDB_MAGIC; |
3742 | meta->mm_version = MDB_DATA_VERSION; |
3743 | meta->mm_mapsize = env->me_mapsize; |
3744 | meta->mm_psize = env->me_psize; |
3745 | meta->mm_last_pg = NUM_METAS-1; |
3746 | meta->mm_flags = env->me_flags & 0xffff; |
3747 | meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ |
3748 | meta->mm_dbs[FREE_DBI].md_root = P_INVALID; |
3749 | meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; |
3750 | } |
3751 | |
3752 | /** Write the environment parameters of a freshly created DB environment. |
3753 | * @param[in] env the environment handle |
3754 | * @param[in] meta the #MDB_meta to write |
3755 | * @return 0 on success, non-zero on failure. |
3756 | */ |
3757 | static int ESECT |
3758 | mdb_env_init_meta(MDB_env *env, MDB_meta *meta) |
3759 | { |
3760 | MDB_page *p, *q; |
3761 | int rc; |
3762 | unsigned int psize; |
3763 | #ifdef _WIN32 |
3764 | DWORD len; |
3765 | OVERLAPPED ov; |
3766 | memset(&ov, 0, sizeof(ov)); |
3767 | #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ |
3768 | ov.Offset = pos; \ |
3769 | rc = WriteFile(fd, ptr, size, &len, &ov); } while(0) |
3770 | #else |
3771 | int len; |
3772 | #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \ |
3773 | len = pwrite(fd, ptr, size, pos); \ |
3774 | if (len == -1 && ErrCode() == EINTR) continue; \ |
3775 | rc = (len >= 0); break; } while(1) |
3776 | #endif |
3777 | |
3778 | DPUTS("writing new meta page" ); |
3779 | |
3780 | psize = env->me_psize; |
3781 | |
3782 | p = calloc(NUM_METAS, psize); |
3783 | if (!p) |
3784 | return ENOMEM; |
3785 | |
3786 | p->mp_pgno = 0; |
3787 | p->mp_flags = P_META; |
3788 | *(MDB_meta *)METADATA(p) = *meta; |
3789 | |
3790 | q = (MDB_page *)((char *)p + psize); |
3791 | q->mp_pgno = 1; |
3792 | q->mp_flags = P_META; |
3793 | *(MDB_meta *)METADATA(q) = *meta; |
3794 | |
3795 | DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0); |
3796 | if (!rc) |
3797 | rc = ErrCode(); |
3798 | else if ((unsigned) len == psize * NUM_METAS) |
3799 | rc = MDB_SUCCESS; |
3800 | else |
3801 | rc = ENOSPC; |
3802 | free(p); |
3803 | return rc; |
3804 | } |
3805 | |
/** Update the environment info to commit a transaction.
 *
 * The meta page is selected by the low bit of the txn ID. Under
 * #MDB_WRITEMAP the fields are stored directly through the map, with
 * mm_txnid written last, followed by an msync unless
 * #MDB_NOMETASYNC or #MDB_NOSYNC is set. Otherwise the fields from
 * mm_mapsize to the end of #MDB_meta are written to the file with a
 * single write call. On success me_txns->mti_txnid is updated when
 * me_txns is set.
 *
 * @param[in] txn the transaction that's being committed
 * @return 0 on success, non-zero on failure. Any failure also sets
 * #MDB_FATAL_ERROR in the environment flags.
 */
static int
mdb_env_write_meta(MDB_txn *txn)
{
	MDB_env *env;
	MDB_meta meta, metab, *mp;
	unsigned flags;
	size_t mapsize;
	off_t off;
	int rc, len, toggle;
	char *ptr;
	HANDLE mfd;
#ifdef _WIN32
	OVERLAPPED ov;
#else
	int r2;
#endif

	/* Low bit of the txn ID selects which of the two meta pages to overwrite */
	toggle = txn->mt_txnid & 1;
	DPRINTF(("writing meta page %d for root page %" Z"u" ,
		toggle, txn->mt_dbs[MAIN_DBI].md_root));

	env = txn->mt_env;
	flags = env->me_flags;
	mp = env->me_metas[toggle];
	/* Start from the mapsize recorded in the other meta page */
	mapsize = env->me_metas[toggle ^ 1]->mm_mapsize;
	/* Persist any increases of mapsize config */
	if (mapsize < env->me_mapsize)
		mapsize = env->me_mapsize;

	if (flags & MDB_WRITEMAP) {
		mp->mm_mapsize = mapsize;
		mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
		mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
		mp->mm_last_pg = txn->mt_next_pgno - 1;
#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */ \
	!(defined(__i386__) || defined(__x86_64__))
		/* LY: issue a memory barrier, if not x86. ITS#7969 */
		__sync_synchronize();
#endif
		/* The txn ID is stored last, after all other fields of the page */
		mp->mm_txnid = txn->mt_txnid;
		if (!(flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
			unsigned meta_size = env->me_psize;
			/* rc temporarily carries the msync mode flag */
			rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
			/* Back up from the MDB_meta to the start of its page */
			ptr = (char *)mp - PAGEHDRSZ;
#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */
			r2 = (ptr - env->me_map) & (env->me_os_psize - 1);
			ptr -= r2;
			meta_size += r2;
#endif
			if (MDB_MSYNC(ptr, meta_size, rc)) {
				rc = ErrCode();
				goto fail;
			}
		}
		goto done;
	}
	/* Remember the old values so they can be written back if the write fails */
	metab.mm_txnid = mp->mm_txnid;
	metab.mm_last_pg = mp->mm_last_pg;

	meta.mm_mapsize = mapsize;
	meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
	meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
	meta.mm_last_pg = txn->mt_next_pgno - 1;
	meta.mm_txnid = txn->mt_txnid;

	/* Only the part of MDB_meta from mm_mapsize onward is written.
	 * off first addresses that part within the local copy, then is turned
	 * into the file offset of the same field in the chosen meta page.
	 */
	off = offsetof(MDB_meta, mm_mapsize);
	ptr = (char *)&meta + off;
	len = sizeof(MDB_meta) - off;
	off += (char *)mp - env->me_map;

	/* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC.
	 * (me_mfd goes to the same file as me_fd, but writing to it
	 * also syncs to disk. Avoids a separate fdatasync() call.)
	 */
	mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? env->me_fd : env->me_mfd;
#ifdef _WIN32
	{
		memset(&ov, 0, sizeof(ov));
		ov.Offset = off;
		/* rc receives the number of bytes written, or -1 on failure */
		if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov))
			rc = -1;
	}
#else
retry_write:
	rc = pwrite(mfd, ptr, len, off);
#endif
	if (rc != len) {
		/* A short write is reported as EIO */
		rc = rc < 0 ? ErrCode() : EIO;
#ifndef _WIN32
		if (rc == EINTR)
			goto retry_write;
#endif
		DPUTS("write failed, disk error?" );
		/* On a failure, the pagecache still contains the new data.
		 * Write some old data back, to prevent it from being used.
		 * Use the non-SYNC fd; we know it will fail anyway.
		 */
		meta.mm_last_pg = metab.mm_last_pg;
		meta.mm_txnid = metab.mm_txnid;
#ifdef _WIN32
		memset(&ov, 0, sizeof(ov));
		ov.Offset = off;
		WriteFile(env->me_fd, ptr, len, NULL, &ov);
#else
		r2 = pwrite(env->me_fd, ptr, len, off);
		(void)r2; /* Silence warnings. We don't care about pwrite's return value */
#endif
		/* Also the target of the msync failure in the MDB_WRITEMAP path above */
fail:
		env->me_flags |= MDB_FATAL_ERROR;
		return rc;
	}
	/* MIPS has cache coherency issues, this is a no-op everywhere else */
	CACHEFLUSH(env->me_map + off, len, DCACHE);
done:
	/* Memory ordering issues are irrelevant; since the entire writer
	 * is wrapped by wmutex, all of these changes will become visible
	 * after the wmutex is unlocked. Since the DB is multi-version,
	 * readers will get consistent data regardless of how fresh or
	 * how stale their view of these values is.
	 */
	if (env->me_txns)
		env->me_txns->mti_txnid = txn->mt_txnid;

	return MDB_SUCCESS;
}
3935 | |
3936 | /** Check both meta pages to see which one is newer. |
3937 | * @param[in] env the environment handle |
3938 | * @return newest #MDB_meta. |
3939 | */ |
3940 | static MDB_meta * |
3941 | mdb_env_pick_meta(const MDB_env *env) |
3942 | { |
3943 | MDB_meta *const *metas = env->me_metas; |
3944 | return metas[ metas[0]->mm_txnid < metas[1]->mm_txnid ]; |
3945 | } |
3946 | |
3947 | int ESECT |
3948 | mdb_env_create(MDB_env **env) |
3949 | { |
3950 | MDB_env *e; |
3951 | |
3952 | e = calloc(1, sizeof(MDB_env)); |
3953 | if (!e) |
3954 | return ENOMEM; |
3955 | |
3956 | e->me_maxreaders = DEFAULT_READERS; |
3957 | e->me_maxdbs = e->me_numdbs = CORE_DBS; |
3958 | e->me_fd = INVALID_HANDLE_VALUE; |
3959 | e->me_lfd = INVALID_HANDLE_VALUE; |
3960 | e->me_mfd = INVALID_HANDLE_VALUE; |
3961 | #ifdef MDB_USE_POSIX_SEM |
3962 | e->me_rmutex = SEM_FAILED; |
3963 | e->me_wmutex = SEM_FAILED; |
3964 | #endif |
3965 | e->me_pid = getpid(); |
3966 | GET_PAGESIZE(e->me_os_psize); |
3967 | VGMEMP_CREATE(e,0,0); |
3968 | *env = e; |
3969 | return MDB_SUCCESS; |
3970 | } |
3971 | |
/** Map the data file of an environment into memory.
 *
 * On success env->me_map holds the mapping and env->me_metas[0..1] point
 * at the #MDB_meta of the first two pages inside it. The mapping is
 * writable only under #MDB_WRITEMAP.
 *
 * @param[in] env the environment handle
 * @param[in] addr requested address for the map, or NULL for no preference
 * @return 0 on success, non-zero on failure: the OS error code of the
 * failing call, or EBUSY if \b addr was given but the map was placed
 * elsewhere (env->me_map is then still set to the address obtained).
 */
static int ESECT
mdb_env_map(MDB_env *env, void *addr)
{
	MDB_page *p;
	unsigned int flags = env->me_flags;
#ifdef _WIN32
	int rc;
	HANDLE mh;
	LONG sizelo, sizehi;
	size_t msize;

	if (flags & MDB_RDONLY) {
		/* Don't set explicit map size, use whatever exists */
		msize = 0;
		sizelo = 0;
		sizehi = 0;
	} else {
		msize = env->me_mapsize;
		/* Split the size into the low/high 32-bit halves the Win32 calls take */
		sizelo = msize & 0xffffffff;
		sizehi = msize >> 16 >> 16; /* only needed on Win64 */

		/* Windows won't create mappings for zero length files,
		 * and won't map more than the file size.
		 * Just set the maxsize right now.
		 */
		if (!(flags & MDB_WRITEMAP) && (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo
			|| !SetEndOfFile(env->me_fd)
			|| SetFilePointer(env->me_fd, 0, NULL, 0) != 0))
			return ErrCode();
	}

	mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ?
		PAGE_READWRITE : PAGE_READONLY,
		sizehi, sizelo, NULL);
	if (!mh)
		return ErrCode();
	env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ?
		FILE_MAP_WRITE : FILE_MAP_READ,
		0, 0, msize, addr);
	/* Capture the error before CloseHandle() runs; mh is closed on both outcomes */
	rc = env->me_map ? 0 : ErrCode();
	CloseHandle(mh);
	if (rc)
		return rc;
#else
	int mmap_flags = MAP_SHARED;
	int prot = PROT_READ;
#ifdef MAP_NOSYNC /* Used on FreeBSD */
	if (flags & MDB_NOSYNC)
		mmap_flags |= MAP_NOSYNC;
#endif
	if (flags & MDB_WRITEMAP) {
		prot |= PROT_WRITE;
		/* With a writable map the file is first sized to the full map size */
		if (ftruncate(env->me_fd, env->me_mapsize) < 0)
			return ErrCode();
	}
	env->me_map = mmap(addr, env->me_mapsize, prot, mmap_flags,
		env->me_fd, 0);
	if (env->me_map == MAP_FAILED) {
		/* Store NULL rather than MAP_FAILED so me_map reads as "no map" */
		env->me_map = NULL;
		return ErrCode();
	}

	if (flags & MDB_NORDAHEAD) {
		/* Turn off readahead. It's harmful when the DB is larger than RAM. */
		/* The return value of the advice call is ignored */
#ifdef MADV_RANDOM
		madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
#else
#ifdef POSIX_MADV_RANDOM
		posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
#endif /* POSIX_MADV_RANDOM */
#endif /* MADV_RANDOM */
	}
#endif /* _WIN32 */

	/* Can happen because the address argument to mmap() is just a
	 * hint. mmap() can pick another, e.g. if the range is in use.
	 * The MAP_FIXED flag would prevent that, but then mmap could
	 * instead unmap existing pages to make room for the new map.
	 */
	if (addr && env->me_map != addr)
		return EBUSY; /* TODO: Make a new MDB_* error code? */

	/* Meta page 0 is at the start of the map, meta page 1 one DB page later */
	p = (MDB_page *)env->me_map;
	env->me_metas[0] = METADATA(p);
	env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + env->me_psize);

	return MDB_SUCCESS;
}
4060 | |
4061 | int ESECT |
4062 | mdb_env_set_mapsize(MDB_env *env, size_t size) |
4063 | { |
4064 | /* If env is already open, caller is responsible for making |
4065 | * sure there are no active txns. |
4066 | */ |
4067 | if (env->me_map) { |
4068 | int rc; |
4069 | MDB_meta *meta; |
4070 | void *old; |
4071 | if (env->me_txn) |
4072 | return EINVAL; |
4073 | meta = mdb_env_pick_meta(env); |
4074 | if (!size) |
4075 | size = meta->mm_mapsize; |
4076 | { |
4077 | /* Silently round up to minimum if the size is too small */ |
4078 | size_t minsize = (meta->mm_last_pg + 1) * env->me_psize; |
4079 | if (size < minsize) |
4080 | size = minsize; |
4081 | } |
4082 | munmap(env->me_map, env->me_mapsize); |
4083 | env->me_mapsize = size; |
4084 | old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; |
4085 | rc = mdb_env_map(env, old); |
4086 | if (rc) |
4087 | return rc; |
4088 | } |
4089 | env->me_mapsize = size; |
4090 | if (env->me_psize) |
4091 | env->me_maxpg = env->me_mapsize / env->me_psize; |
4092 | return MDB_SUCCESS; |
4093 | } |
4094 | |
4095 | int ESECT |
4096 | mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) |
4097 | { |
4098 | if (env->me_map) |
4099 | return EINVAL; |
4100 | env->me_maxdbs = dbs + CORE_DBS; |
4101 | return MDB_SUCCESS; |
4102 | } |
4103 | |
4104 | int ESECT |
4105 | mdb_env_set_maxreaders(MDB_env *env, unsigned int readers) |
4106 | { |
4107 | if (env->me_map || readers < 1) |
4108 | return EINVAL; |
4109 | env->me_maxreaders = readers; |
4110 | return MDB_SUCCESS; |
4111 | } |
4112 | |
4113 | int ESECT |
4114 | mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) |
4115 | { |
4116 | if (!env || !readers) |
4117 | return EINVAL; |
4118 | *readers = env->me_maxreaders; |
4119 | return MDB_SUCCESS; |
4120 | } |
4121 | |
4122 | static int ESECT |
4123 | mdb_fsize(HANDLE fd, size_t *size) |
4124 | { |
4125 | #ifdef _WIN32 |
4126 | LARGE_INTEGER fsize; |
4127 | |
4128 | if (!GetFileSizeEx(fd, &fsize)) |
4129 | return ErrCode(); |
4130 | |
4131 | *size = fsize.QuadPart; |
4132 | #else |
4133 | struct stat st; |
4134 | |
4135 | if (fstat(fd, &st)) |
4136 | return ErrCode(); |
4137 | |
4138 | *size = st.st_size; |
4139 | #endif |
4140 | return MDB_SUCCESS; |
4141 | } |
4142 | |
4143 | |
#ifdef _WIN32
/** Character type for file names: wchar_t on Windows, char on Unix */
typedef wchar_t mdb_nchar_t;
# define MDB_NAME(str) L##str /**< #mdb_nchar_t[] string literal (wide) */
# define mdb_name_cpy wcscpy /**< Copy name (#mdb_nchar_t string) */
#else
/** Character type for file names: char on Unix, wchar_t on Windows */
typedef char mdb_nchar_t;
# define MDB_NAME(str) str /**< #mdb_nchar_t[] string literal */
# define mdb_name_cpy strcpy /**< Copy name (#mdb_nchar_t string) */
#endif
4154 | |
/** Filename - string of #mdb_nchar_t[] */
typedef struct MDB_name {
	/** Length of the path without any suffix. On non-Windows builds
	 * #mdb_fname_init() sets it to strlen(path); a suffix, if any, is
	 * copied to mn_val + mn_len.
	 */
	int mn_len; /**< Length */
	int mn_alloced; /**< True if #mn_val was malloced */
	mdb_nchar_t *mn_val; /**< Contents */
} MDB_name;
4161 | |
/** Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR].
 * The first index is 1 for the lockfile and 0 for the datafile; the second
 * is 1 when #MDB_NOSUBDIR is set. With #MDB_NOSUBDIR the datafile
 * needs no suffix at all.
 */
static const mdb_nchar_t *const mdb_suffixes[2][2] = {
	{ MDB_NAME("/data.mdb" ), MDB_NAME("" ) },
	{ MDB_NAME("/lock.mdb" ), MDB_NAME("-lock" ) }
};

/* 9 == number of characters in "/data.mdb" and "/lock.mdb", terminator excluded */
#define MDB_SUFFLEN 9 /**< Max string length in #mdb_suffixes[] */
4169 | |
4170 | /** Set up filename + scratch area for filename suffix, for opening files. |
4171 | * It should be freed with #mdb_fname_destroy(). |
4172 | * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16. |
4173 | * |
4174 | * @param[in] path Pathname for #mdb_env_open(). |
4175 | * @param[in] envflags Whether a subdir and/or lockfile will be used. |
4176 | * @param[out] fname Resulting filename, with room for a suffix if necessary. |
4177 | */ |
4178 | static int ESECT |
4179 | mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) |
4180 | { |
4181 | int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK); |
4182 | fname->mn_alloced = 0; |
4183 | #ifdef _WIN32 |
4184 | return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN); |
4185 | #else |
4186 | fname->mn_len = strlen(path); |
4187 | if (no_suffix) |
4188 | fname->mn_val = (char *) path; |
4189 | else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) { |
4190 | fname->mn_alloced = 1; |
4191 | strcpy(fname->mn_val, path); |
4192 | } |
4193 | else |
4194 | return ENOMEM; |
4195 | return MDB_SUCCESS; |
4196 | #endif |
4197 | } |
4198 | |
/** Destroy \b fname from #mdb_fname_init().
 * Frees mn_val only when mn_alloced is set, i.e. when it is not a
 * borrowed pointer. \b fname is an #MDB_name object, not a pointer.
 */
#define mdb_fname_destroy(fname) \
	do { if ((fname).mn_alloced) free((fname).mn_val); } while (0)
4202 | |
/* open() flag for close-on-exec. Defined as 0 where O_CLOEXEC is
 * unavailable; mdb_fopen() then sets FD_CLOEXEC with fcntl() instead.
 */
#ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */
# define MDB_CLOEXEC O_CLOEXEC
#else
# define MDB_CLOEXEC 0
#endif
4208 | |
/** File type, access mode etc. for #mdb_fopen().
 * On Windows these are plain enumerators that mdb_fopen() switches on.
 * Elsewhere each value carries the open() flags for that kind of file.
 */
enum mdb_fopen_type {
#ifdef _WIN32
	MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS
#else
	/* A comment in mdb_fopen() explains some O_* flag choices. */
	MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */
	MDB_O_RDWR = O_RDWR |O_CREAT, /**< for me_fd */
	MDB_O_META = O_WRONLY|MDB_DSYNC |MDB_CLOEXEC, /**< for me_mfd */
	MDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */
	/** Bitmask for open() flags in enum #mdb_fopen_type. The other bits
	 * distinguish otherwise-equal MDB_O_* constants from each other.
	 */
	MDB_O_MASK = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY,
	/* The last term adds a bit outside MDB_O_MASK, so that MDB_O_LOCKS
	 * differs from MDB_O_RDWR even when MDB_CLOEXEC is 0. mdb_fopen()
	 * strips it again with "which & MDB_O_MASK" before calling open().
	 */
	MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */
#endif
};
4226 | |
4227 | /** Open an LMDB file. |
4228 | * @param[in] env The LMDB environment. |
 * @param[in,out] fname Path from #mdb_fname_init(). A suffix is
4230 | * appended if necessary to create the filename, without changing mn_len. |
4231 | * @param[in] which Determines file type, access mode, etc. |
4232 | * @param[in] mode The Unix permissions for the file, if we create it. |
4233 | * @param[out] res Resulting file handle. |
4234 | * @return 0 on success, non-zero on failure. |
4235 | */ |
4236 | static int ESECT |
4237 | mdb_fopen(const MDB_env *env, MDB_name *fname, |
4238 | enum mdb_fopen_type which, mdb_mode_t mode, |
4239 | HANDLE *res) |
4240 | { |
4241 | int rc = MDB_SUCCESS; |
4242 | HANDLE fd; |
4243 | #ifdef _WIN32 |
4244 | DWORD acc, share, disp, attrs; |
4245 | #else |
4246 | int flags; |
4247 | #endif |
4248 | |
4249 | if (fname->mn_alloced) /* modifiable copy */ |
4250 | mdb_name_cpy(fname->mn_val + fname->mn_len, |
4251 | mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]); |
4252 | |
4253 | /* The directory must already exist. Usually the file need not. |
4254 | * MDB_O_META requires the file because we already created it using |
4255 | * MDB_O_RDWR. MDB_O_COPY must not overwrite an existing file. |
4256 | * |
4257 | * With MDB_O_COPY we do not want the OS to cache the writes, since |
4258 | * the source data is already in the OS cache. |
4259 | * |
4260 | * The lockfile needs FD_CLOEXEC (close file descriptor on exec*()) |
4261 | * to avoid the flock() issues noted under Caveats in lmdb.h. |
4262 | * Also set it for other filehandles which the user cannot get at |
4263 | * and close himself, which he may need after fork(). I.e. all but |
4264 | * me_fd, which programs do use via mdb_env_get_fd(). |
4265 | */ |
4266 | |
4267 | #ifdef _WIN32 |
4268 | acc = GENERIC_READ|GENERIC_WRITE; |
4269 | share = FILE_SHARE_READ|FILE_SHARE_WRITE; |
4270 | disp = OPEN_ALWAYS; |
4271 | attrs = FILE_ATTRIBUTE_NORMAL; |
4272 | switch (which) { |
4273 | case MDB_O_RDONLY: /* read-only datafile */ |
4274 | acc = GENERIC_READ; |
4275 | disp = OPEN_EXISTING; |
4276 | break; |
4277 | case MDB_O_META: /* for writing metapages */ |
4278 | acc = GENERIC_WRITE; |
4279 | disp = OPEN_EXISTING; |
4280 | attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH; |
4281 | break; |
4282 | case MDB_O_COPY: /* mdb_env_copy() & co */ |
4283 | acc = GENERIC_WRITE; |
4284 | share = 0; |
4285 | disp = CREATE_NEW; |
4286 | attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH; |
4287 | break; |
4288 | default: break; /* silence gcc -Wswitch (not all enum values handled) */ |
4289 | } |
4290 | fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL); |
4291 | #else |
4292 | fd = open(fname->mn_val, which & MDB_O_MASK, mode); |
4293 | #endif |
4294 | |
4295 | if (fd == INVALID_HANDLE_VALUE) |
4296 | rc = ErrCode(); |
4297 | #ifndef _WIN32 |
4298 | else { |
4299 | if (which != MDB_O_RDONLY && which != MDB_O_RDWR) { |
4300 | /* Set CLOEXEC if we could not pass it to open() */ |
4301 | if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1) |
4302 | (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); |
4303 | } |
4304 | if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) { |
4305 | /* This may require buffer alignment. There is no portable |
4306 | * way to ask how much, so we require OS pagesize alignment. |
4307 | */ |
4308 | # ifdef F_NOCACHE /* __APPLE__ */ |
4309 | (void) fcntl(fd, F_NOCACHE, 1); |
4310 | # elif defined O_DIRECT |
4311 | /* open(...O_DIRECT...) would break on filesystems without |
4312 | * O_DIRECT support (ITS#7682). Try to set it here instead. |
4313 | */ |
4314 | if ((flags = fcntl(fd, F_GETFL)) != -1) |
4315 | (void) fcntl(fd, F_SETFL, flags | O_DIRECT); |
4316 | # endif |
4317 | } |
4318 | } |
4319 | #endif /* !_WIN32 */ |
4320 | |
4321 | *res = fd; |
4322 | return rc; |
4323 | } |
4324 | |
4325 | |
4326 | #ifdef BROKEN_FDATASYNC |
4327 | #include <sys/utsname.h> |
4328 | #include <sys/vfs.h> |
4329 | #endif |
4330 | |
4331 | /** Further setup required for opening an LMDB environment |
4332 | */ |
4333 | static int ESECT |
4334 | mdb_env_open2(MDB_env *env) |
4335 | { |
4336 | unsigned int flags = env->me_flags; |
4337 | int i, newenv = 0, rc; |
4338 | MDB_meta meta; |
4339 | |
4340 | #ifdef _WIN32 |
4341 | /* See if we should use QueryLimited */ |
4342 | rc = GetVersion(); |
4343 | if ((rc & 0xff) > 5) |
4344 | env->me_pidquery = MDB_PROCESS_QUERY_LIMITED_INFORMATION; |
4345 | else |
4346 | env->me_pidquery = PROCESS_QUERY_INFORMATION; |
4347 | #endif /* _WIN32 */ |
4348 | |
4349 | #ifdef BROKEN_FDATASYNC |
4350 | /* ext3/ext4 fdatasync is broken on some older Linux kernels. |
4351 | * https://lkml.org/lkml/2012/9/3/83 |
4352 | * Kernels after 3.6-rc6 are known good. |
4353 | * https://lkml.org/lkml/2012/9/10/556 |
4354 | * See if the DB is on ext3/ext4, then check for new enough kernel |
4355 | * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known |
4356 | * to be patched. |
4357 | */ |
4358 | { |
4359 | struct statfs st; |
4360 | fstatfs(env->me_fd, &st); |
4361 | while (st.f_type == 0xEF53) { |
4362 | struct utsname uts; |
4363 | int i; |
4364 | uname(&uts); |
4365 | if (uts.release[0] < '3') { |
4366 | if (!strncmp(uts.release, "2.6.32." , 7)) { |
4367 | i = atoi(uts.release+7); |
4368 | if (i >= 60) |
4369 | break; /* 2.6.32.60 and newer is OK */ |
4370 | } else if (!strncmp(uts.release, "2.6.34." , 7)) { |
4371 | i = atoi(uts.release+7); |
4372 | if (i >= 15) |
4373 | break; /* 2.6.34.15 and newer is OK */ |
4374 | } |
4375 | } else if (uts.release[0] == '3') { |
4376 | i = atoi(uts.release+2); |
4377 | if (i > 5) |
4378 | break; /* 3.6 and newer is OK */ |
4379 | if (i == 5) { |
4380 | i = atoi(uts.release+4); |
4381 | if (i >= 4) |
4382 | break; /* 3.5.4 and newer is OK */ |
4383 | } else if (i == 2) { |
4384 | i = atoi(uts.release+4); |
4385 | if (i >= 30) |
4386 | break; /* 3.2.30 and newer is OK */ |
4387 | } |
4388 | } else { /* 4.x and newer is OK */ |
4389 | break; |
4390 | } |
4391 | env->me_flags |= MDB_FSYNCONLY; |
4392 | break; |
4393 | } |
4394 | } |
4395 | #endif |
4396 | |
4397 | if ((i = mdb_env_read_header(env, &meta)) != 0) { |
4398 | if (i != ENOENT) |
4399 | return i; |
4400 | DPUTS("new mdbenv" ); |
4401 | newenv = 1; |
4402 | env->me_psize = env->me_os_psize; |
4403 | if (env->me_psize > MAX_PAGESIZE) |
4404 | env->me_psize = MAX_PAGESIZE; |
4405 | memset(&meta, 0, sizeof(meta)); |
4406 | mdb_env_init_meta0(env, &meta); |
4407 | meta.mm_mapsize = DEFAULT_MAPSIZE; |
4408 | } else { |
4409 | env->me_psize = meta.mm_psize; |
4410 | } |
4411 | |
4412 | /* Was a mapsize configured? */ |
4413 | if (!env->me_mapsize) { |
4414 | env->me_mapsize = meta.mm_mapsize; |
4415 | } |
4416 | { |
4417 | /* Make sure mapsize >= committed data size. Even when using |
4418 | * mm_mapsize, which could be broken in old files (ITS#7789). |
4419 | */ |
4420 | size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; |
4421 | if (env->me_mapsize < minsize) |
4422 | env->me_mapsize = minsize; |
4423 | } |
4424 | meta.mm_mapsize = env->me_mapsize; |
4425 | |
4426 | if (newenv && !(flags & MDB_FIXEDMAP)) { |
4427 | /* mdb_env_map() may grow the datafile. Write the metapages |
4428 | * first, so the file will be valid if initialization fails. |
4429 | * Except with FIXEDMAP, since we do not yet know mm_address. |
4430 | * We could fill in mm_address later, but then a different |
4431 | * program might end up doing that - one with a memory layout |
4432 | * and map address which does not suit the main program. |
4433 | */ |
4434 | rc = mdb_env_init_meta(env, &meta); |
4435 | if (rc) |
4436 | return rc; |
4437 | newenv = 0; |
4438 | } |
4439 | |
4440 | rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL); |
4441 | if (rc) |
4442 | return rc; |
4443 | |
4444 | if (newenv) { |
4445 | if (flags & MDB_FIXEDMAP) |
4446 | meta.mm_address = env->me_map; |
4447 | i = mdb_env_init_meta(env, &meta); |
4448 | if (i != MDB_SUCCESS) { |
4449 | return i; |
4450 | } |
4451 | } |
4452 | |
4453 | env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; |
4454 | env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) |
4455 | - sizeof(indx_t); |
4456 | #if !(MDB_MAXKEYSIZE) |
4457 | env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); |
4458 | #endif |
4459 | env->me_maxpg = env->me_mapsize / env->me_psize; |
4460 | |
4461 | #if MDB_DEBUG |
4462 | { |
4463 | MDB_meta *meta = mdb_env_pick_meta(env); |
4464 | MDB_db *db = &meta->mm_dbs[MAIN_DBI]; |
4465 | |
4466 | DPRINTF(("opened database version %u, pagesize %u" , |
4467 | meta->mm_version, env->me_psize)); |
4468 | DPRINTF(("using meta page %d" , (int) (meta->mm_txnid & 1))); |
4469 | DPRINTF(("depth: %u" , db->md_depth)); |
4470 | DPRINTF(("entries: %" Z"u" , db->md_entries)); |
4471 | DPRINTF(("branch pages: %" Z"u" , db->md_branch_pages)); |
4472 | DPRINTF(("leaf pages: %" Z"u" , db->md_leaf_pages)); |
4473 | DPRINTF(("overflow pages: %" Z"u" , db->md_overflow_pages)); |
4474 | DPRINTF(("root: %" Z"u" , db->md_root)); |
4475 | } |
4476 | #endif |
4477 | |
4478 | return MDB_SUCCESS; |
4479 | } |
4480 | |
4481 | |
4482 | /** Release a reader thread's slot in the reader lock table. |
4483 | * This function is called automatically when a thread exits. |
4484 | * @param[in] ptr This points to the slot in the reader lock table. |
4485 | */ |
4486 | static void |
4487 | mdb_env_reader_dest(void *ptr) |
4488 | { |
4489 | MDB_reader *reader = ptr; |
4490 | |
4491 | #ifndef _WIN32 |
4492 | if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */ |
4493 | #endif |
4494 | /* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */ |
4495 | reader->mr_pid = 0; |
4496 | } |
4497 | |
4498 | #ifdef _WIN32 |
/** Junk for arranging thread-specific callbacks on Windows. This is
 * necessarily platform and compiler-specific. Windows supports up
 * to 1088 keys. Let's assume nobody opens more than 64 environments
 * in a single process, for now. They can override this if needed.
 */
#ifndef MAX_TLS_KEYS
#define MAX_TLS_KEYS 64
#endif
/** Thread keys (me_txkey) of the environments that created one.
 * mdb_env_setup_locks() appends to it, mdb_env_close0() removes from it,
 * and mdb_tls_callback() walks it when a thread detaches.
 */
static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS];
/** Number of entries of #mdb_tls_keys currently in use */
static int mdb_tls_nkeys;
4509 | |
4510 | static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr) |
4511 | { |
4512 | int i; |
4513 | switch(reason) { |
4514 | case DLL_PROCESS_ATTACH: break; |
4515 | case DLL_THREAD_ATTACH: break; |
4516 | case DLL_THREAD_DETACH: |
4517 | for (i=0; i<mdb_tls_nkeys; i++) { |
4518 | MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]); |
4519 | if (r) { |
4520 | mdb_env_reader_dest(r); |
4521 | } |
4522 | } |
4523 | break; |
4524 | case DLL_PROCESS_DETACH: break; |
4525 | } |
4526 | } |
/* Place a pointer to mdb_tls_callback() in the ".CRT$XLB" section.
 * NOTE(review): presumably this is how the callback gets registered with
 * the loader's TLS callback list - confirm against the toolchain docs.
 * The variants below differ only in compiler syntax and, for 64-bit
 * builds, in making the pointer const.
 */
#ifdef __GNUC__
#ifdef _WIN64
const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB" ))) = mdb_tls_callback;
#else
PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB" ))) = mdb_tls_callback;
#endif
#else
#ifdef _WIN64
/* Force some symbol references.
 * _tls_used forces the linker to create the TLS directory if not already done
 * mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol.
 */
#pragma comment(linker, "/INCLUDE:_tls_used")
#pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
#pragma const_seg(".CRT$XLB")
extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma const_seg()
#else /* _WIN32 */
/* Same forced references as above; the symbol names carry an extra
 * leading underscore in this configuration.
 */
#pragma comment(linker, "/INCLUDE:__tls_used")
#pragma comment(linker, "/INCLUDE:_mdb_tls_cbp")
#pragma data_seg(".CRT$XLB")
PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma data_seg()
#endif /* WIN 32/64 */
#endif /* !__GNUC__ */
4553 | #endif |
4554 | |
4555 | /** Downgrade the exclusive lock on the region back to shared */ |
4556 | static int ESECT |
4557 | mdb_env_share_locks(MDB_env *env, int *excl) |
4558 | { |
4559 | int rc = 0; |
4560 | MDB_meta *meta = mdb_env_pick_meta(env); |
4561 | |
4562 | env->me_txns->mti_txnid = meta->mm_txnid; |
4563 | |
4564 | #ifdef _WIN32 |
4565 | { |
4566 | OVERLAPPED ov; |
4567 | /* First acquire a shared lock. The Unlock will |
4568 | * then release the existing exclusive lock. |
4569 | */ |
4570 | memset(&ov, 0, sizeof(ov)); |
4571 | if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { |
4572 | rc = ErrCode(); |
4573 | } else { |
4574 | UnlockFile(env->me_lfd, 0, 0, 1, 0); |
4575 | *excl = 0; |
4576 | } |
4577 | } |
4578 | #else |
4579 | { |
4580 | struct flock lock_info; |
4581 | /* The shared lock replaces the existing lock */ |
4582 | memset((void *)&lock_info, 0, sizeof(lock_info)); |
4583 | lock_info.l_type = F_RDLCK; |
4584 | lock_info.l_whence = SEEK_SET; |
4585 | lock_info.l_start = 0; |
4586 | lock_info.l_len = 1; |
4587 | while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && |
4588 | (rc = ErrCode()) == EINTR) ; |
4589 | *excl = rc ? -1 : 0; /* error may mean we lost the lock */ |
4590 | } |
4591 | #endif |
4592 | |
4593 | return rc; |
4594 | } |
4595 | |
4596 | /** Try to get exclusive lock, otherwise shared. |
4597 | * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. |
4598 | */ |
4599 | static int ESECT |
4600 | mdb_env_excl_lock(MDB_env *env, int *excl) |
4601 | { |
4602 | int rc = 0; |
4603 | #ifdef _WIN32 |
4604 | if (LockFile(env->me_lfd, 0, 0, 1, 0)) { |
4605 | *excl = 1; |
4606 | } else { |
4607 | OVERLAPPED ov; |
4608 | memset(&ov, 0, sizeof(ov)); |
4609 | if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) { |
4610 | *excl = 0; |
4611 | } else { |
4612 | rc = ErrCode(); |
4613 | } |
4614 | } |
4615 | #else |
4616 | struct flock lock_info; |
4617 | memset((void *)&lock_info, 0, sizeof(lock_info)); |
4618 | lock_info.l_type = F_WRLCK; |
4619 | lock_info.l_whence = SEEK_SET; |
4620 | lock_info.l_start = 0; |
4621 | lock_info.l_len = 1; |
4622 | while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && |
4623 | (rc = ErrCode()) == EINTR) ; |
4624 | if (!rc) { |
4625 | *excl = 1; |
4626 | } else |
4627 | # ifndef MDB_USE_POSIX_MUTEX |
4628 | if (*excl < 0) /* always true when MDB_USE_POSIX_MUTEX */ |
4629 | # endif |
4630 | { |
4631 | lock_info.l_type = F_RDLCK; |
4632 | while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && |
4633 | (rc = ErrCode()) == EINTR) ; |
4634 | if (rc == 0) |
4635 | *excl = 0; |
4636 | } |
4637 | #endif |
4638 | return rc; |
4639 | } |
4640 | |
4641 | #ifdef MDB_USE_HASH |
4642 | /* |
4643 | * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code |
4644 | * |
4645 | * @(#) $Revision: 5.1 $ |
4646 | * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ |
4647 | * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ |
4648 | * |
4649 | * http://www.isthe.com/chongo/tech/comp/fnv/index.html |
4650 | * |
4651 | *** |
4652 | * |
4653 | * Please do not copyright this code. This code is in the public domain. |
4654 | * |
4655 | * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, |
4656 | * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO |
4657 | * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR |
4658 | * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF |
4659 | * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR |
4660 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR |
4661 | * PERFORMANCE OF THIS SOFTWARE. |
4662 | * |
4663 | * By: |
4664 | * chongo <Landon Curt Noll> /\oo/\ |
4665 | * http://www.isthe.com/chongo/ |
4666 | * |
4667 | * Share and Enjoy! :-) |
4668 | */ |
4669 | |
4670 | typedef unsigned long long mdb_hash_t; |
4671 | #define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) |
4672 | |
4673 | /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer |
4674 | * @param[in] val value to hash |
4675 | * @param[in] hval initial value for hash |
4676 | * @return 64 bit hash |
4677 | * |
4678 | * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the |
4679 | * hval arg on the first call. |
4680 | */ |
4681 | static mdb_hash_t |
4682 | mdb_hash_val(MDB_val *val, mdb_hash_t hval) |
4683 | { |
4684 | unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ |
4685 | unsigned char *end = s + val->mv_size; |
4686 | /* |
4687 | * FNV-1a hash each octet of the string |
4688 | */ |
4689 | while (s < end) { |
4690 | /* xor the bottom with the current octet */ |
4691 | hval ^= (mdb_hash_t)*s++; |
4692 | |
4693 | /* multiply by the 64 bit FNV magic prime mod 2^64 */ |
4694 | hval += (hval << 1) + (hval << 4) + (hval << 5) + |
4695 | (hval << 7) + (hval << 8) + (hval << 40); |
4696 | } |
4697 | /* return our new hash value */ |
4698 | return hval; |
4699 | } |
4700 | |
4701 | /** Hash the string and output the encoded hash. |
4702 | * This uses modified RFC1924 Ascii85 encoding to accommodate systems with |
4703 | * very short name limits. We don't care about the encoding being reversible, |
4704 | * we just want to preserve as many bits of the input as possible in a |
4705 | * small printable string. |
4706 | * @param[in] str string to hash |
4707 | * @param[out] encbuf an array of 11 chars to hold the hash |
4708 | */ |
4709 | static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~" ; |
4710 | |
4711 | static void ESECT |
4712 | mdb_pack85(unsigned long l, char *out) |
4713 | { |
4714 | int i; |
4715 | |
4716 | for (i=0; i<5; i++) { |
4717 | *out++ = mdb_a85[l % 85]; |
4718 | l /= 85; |
4719 | } |
4720 | } |
4721 | |
4722 | static void ESECT |
4723 | mdb_hash_enc(MDB_val *val, char *encbuf) |
4724 | { |
4725 | mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); |
4726 | |
4727 | mdb_pack85(h, encbuf); |
4728 | mdb_pack85(h>>32, encbuf+5); |
4729 | encbuf[10] = '\0'; |
4730 | } |
4731 | #endif |
4732 | |
/** Open and/or initialize the lock region for the environment.
 *
 * Opens the lockfile, optionally creates the per-environment thread key,
 * takes the file lock, sizes and maps the file, and then either
 * initializes the lock region (exclusive lock obtained) or validates the
 * existing one (shared lock).
 *
 * Nothing acquired here is released on failure; mdb_env_open() calls
 * mdb_env_close0() when this function fails.
 * @param[in] env The LMDB environment.
 * @param[in] fname Filename + scratch area, from #mdb_fname_init().
 * @param[in] mode The Unix permissions for the file, if we create it.
 * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive
 * @return 0 on success, non-zero on failure.
 */
static int ESECT
mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl)
{
	/* Error code reported when the lockfile sits on a read-only filesystem */
#ifdef _WIN32
# define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT
#else
# define MDB_ERRCODE_ROFS EROFS
#endif
	int rc;
	off_t size, rsize;

	rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd);
	if (rc) {
		/* Omit lockfile if read-only env on read-only filesystem */
		if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) {
			return MDB_SUCCESS;
		}
		goto fail;
	}

	if (!(env->me_flags & MDB_NOTLS)) {
		/* The key's destructor releases a thread's reader slot on exit */
		rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
		if (rc)
			goto fail;
		/* Tells mdb_env_close0() that the key has to be deleted */
		env->me_flags |= MDB_ENV_TXKEY;
#ifdef _WIN32
		/* Windows TLS callbacks need help finding their TLS info. */
		if (mdb_tls_nkeys >= MAX_TLS_KEYS) {
			rc = MDB_TLS_FULL;
			goto fail;
		}
		mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey;
#endif
	}

	/* Try to get exclusive lock. If we succeed, then
	 * nobody is using the lock region and we should initialize it.
	 */
	if ((rc = mdb_env_excl_lock(env, excl))) goto fail;

	/* Current size of the lockfile */
#ifdef _WIN32
	size = GetFileSize(env->me_lfd, NULL);
#else
	size = lseek(env->me_lfd, 0, SEEK_END);
	if (size == -1) goto fail_errno;
#endif
	/* Size wanted for me_maxreaders reader slots.
	 * NOTE(review): the "-1" suggests MDB_txninfo already contains one
	 * MDB_reader slot - confirm against the struct definition.
	 */
	rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
	if (size < rsize && *excl > 0) {
		/* We own the file exclusively and it is too small: grow it */
#ifdef _WIN32
		if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
			|| !SetEndOfFile(env->me_lfd))
			goto fail_errno;
#else
		if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno;
#endif
	} else {
		/* Keep the file as it is and derive the number of reader
		 * slots from its size instead.
		 */
		rsize = size;
		size = rsize - sizeof(MDB_txninfo);
		env->me_maxreaders = size/sizeof(MDB_reader) + 1;
	}
	{
		/* Map rsize bytes of the lockfile as the shared lock region */
#ifdef _WIN32
		HANDLE mh;
		mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
			0, 0, NULL);
		if (!mh) goto fail_errno;
		env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
		CloseHandle(mh);
		if (!env->me_txns) goto fail_errno;
#else
		void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
			env->me_lfd, 0);
		if (m == MAP_FAILED) goto fail_errno;
		env->me_txns = m;
#endif
	}
	if (*excl > 0) {
		/* Exclusive lock: create the mutexes and initialize the
		 * lock region header.
		 */
#ifdef _WIN32
		BY_HANDLE_FILE_INFORMATION stbuf;
		struct {
			DWORD volume;
			DWORD nhigh;
			DWORD nlow;
		} idbuf;
		MDB_val val;
		char encbuf[11];

		if (!mdb_sec_inited) {
			/* One-time setup of the security attributes passed to
			 * CreateMutexA() below.
			 */
			InitializeSecurityDescriptor(&mdb_null_sd,
				SECURITY_DESCRIPTOR_REVISION);
			SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
			mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
			mdb_all_sa.bInheritHandle = FALSE;
			mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
			mdb_sec_inited = 1;
		}
		/* Mutex names are derived from a hash of the lockfile's
		 * volume serial number and file index, and stored in the
		 * lock region for later openers.
		 */
		if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno;
		idbuf.volume = stbuf.dwVolumeSerialNumber;
		idbuf.nhigh = stbuf.nFileIndexHigh;
		idbuf.nlow = stbuf.nFileIndexLow;
		val.mv_data = &idbuf;
		val.mv_size = sizeof(idbuf);
		mdb_hash_enc(&val, encbuf);
		sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s" , encbuf);
		sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s" , encbuf);
		env->me_rmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
		if (!env->me_rmutex) goto fail_errno;
		env->me_wmutex = CreateMutexA(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
		if (!env->me_wmutex) goto fail_errno;
#elif defined(MDB_USE_POSIX_SEM)
		struct stat stbuf;
		struct {
			dev_t dev;
			ino_t ino;
		} idbuf;
		MDB_val val;
		char encbuf[11];

#if defined(__NetBSD__)
#define MDB_SHORT_SEMNAMES 1 /* limited to 14 chars */
#endif
		/* Semaphore names are derived from a hash of the lockfile's
		 * device and inode numbers, and stored in the lock region
		 * for later openers.
		 */
		if (fstat(env->me_lfd, &stbuf)) goto fail_errno;
		idbuf.dev = stbuf.st_dev;
		idbuf.ino = stbuf.st_ino;
		val.mv_data = &idbuf;
		val.mv_size = sizeof(idbuf);
		mdb_hash_enc(&val, encbuf);
#ifdef MDB_SHORT_SEMNAMES
		encbuf[9] = '\0'; /* drop name from 15 chars to 14 chars */
#endif
		sprintf(env->me_txns->mti_rmname, "/MDBr%s" , encbuf);
		sprintf(env->me_txns->mti_wmname, "/MDBw%s" , encbuf);
		/* Clean up after a previous run, if needed: Try to
		 * remove both semaphores before doing anything else.
		 */
		sem_unlink(env->me_txns->mti_rmname);
		sem_unlink(env->me_txns->mti_wmname);
		env->me_rmutex = sem_open(env->me_txns->mti_rmname,
			O_CREAT|O_EXCL, mode, 1);
		if (env->me_rmutex == SEM_FAILED) goto fail_errno;
		env->me_wmutex = sem_open(env->me_txns->mti_wmname,
			O_CREAT|O_EXCL, mode, 1);
		if (env->me_wmutex == SEM_FAILED) goto fail_errno;
#else /* MDB_USE_POSIX_MUTEX: */
		pthread_mutexattr_t mattr;

		/* Solaris needs this before initing a robust mutex. Otherwise
		 * it may skip the init and return EBUSY "seems someone already
		 * inited" or EINVAL "it was inited differently".
		 */
		memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex));
		memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex));

		if ((rc = pthread_mutexattr_init(&mattr)))
			goto fail;

		/* The first failing step stops the chain; mattr is destroyed
		 * either way before the result is checked.
		 */
		rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
#ifdef MDB_ROBUST_SUPPORTED
		if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST);
#endif
		if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr);
		if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr);
		pthread_mutexattr_destroy(&mattr);
		if (rc)
			goto fail;
#endif /* _WIN32 || MDB_USE_POSIX_SEM */

		env->me_txns->mti_magic = MDB_MAGIC;
		env->me_txns->mti_format = MDB_LOCK_FORMAT;
		env->me_txns->mti_txnid = 0;
		env->me_txns->mti_numreaders = 0;

	} else {
		/* Shared lock: somebody else initialized the lock region,
		 * so verify it and attach to the existing mutexes.
		 */
		if (env->me_txns->mti_magic != MDB_MAGIC) {
			DPUTS("lock region has invalid magic" );
			rc = MDB_INVALID;
			goto fail;
		}
		if (env->me_txns->mti_format != MDB_LOCK_FORMAT) {
			DPRINTF(("lock region has format+version 0x%x, expected 0x%x" ,
				env->me_txns->mti_format, MDB_LOCK_FORMAT));
			rc = MDB_VERSION_MISMATCH;
			goto fail;
		}
		/* NOTE(review): no call in this branch has just failed, so
		 * this looks at an error code left behind by an earlier call,
		 * possibly the refused exclusive-lock attempt in
		 * mdb_env_excl_lock() - confirm the intent.
		 */
		rc = ErrCode();
		if (rc && rc != EACCES && rc != EAGAIN) {
			goto fail;
		}
#ifdef _WIN32
		env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
		if (!env->me_rmutex) goto fail_errno;
		env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
		if (!env->me_wmutex) goto fail_errno;
#elif defined(MDB_USE_POSIX_SEM)
		env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0);
		if (env->me_rmutex == SEM_FAILED) goto fail_errno;
		env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0);
		if (env->me_wmutex == SEM_FAILED) goto fail_errno;
#endif
	}
	return MDB_SUCCESS;

	/* fail_errno: the failing call left its reason in the system error
	 * code; fail: rc already holds the reason.
	 */
fail_errno:
	rc = ErrCode();
fail:
	return rc;
}
4947 | |
/** Only a subset of the @ref mdb_env flags can be changed
 * at runtime. Changing other flags requires closing the
 * environment and re-opening it with the new flags.
 */
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
/** The remaining environment flags. Together with #CHANGEABLE this is
 * the set of flag bits mdb_env_open() accepts; any other bit makes it
 * return EINVAL.
 */
#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \
	MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)

/* Compile-time check that no valid persistent DB flag shares a bit with
 * an environment flag.
 */
#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS)
# error "Persistent DB flags & env flags overlap, but both go in mm_flags"
#endif
4959 | |
4960 | int ESECT |
4961 | mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) |
4962 | { |
4963 | int rc, excl = -1; |
4964 | MDB_name fname; |
4965 | |
4966 | if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) |
4967 | return EINVAL; |
4968 | |
4969 | flags |= env->me_flags; |
4970 | |
4971 | rc = mdb_fname_init(path, flags, &fname); |
4972 | if (rc) |
4973 | return rc; |
4974 | |
4975 | if (flags & MDB_RDONLY) { |
4976 | /* silently ignore WRITEMAP when we're only getting read access */ |
4977 | flags &= ~MDB_WRITEMAP; |
4978 | } else { |
4979 | if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) && |
4980 | (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) |
4981 | rc = ENOMEM; |
4982 | } |
4983 | env->me_flags = flags |= MDB_ENV_ACTIVE; |
4984 | if (rc) |
4985 | goto leave; |
4986 | |
4987 | env->me_path = strdup(path); |
4988 | env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); |
4989 | env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); |
4990 | env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int)); |
4991 | if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { |
4992 | rc = ENOMEM; |
4993 | goto leave; |
4994 | } |
4995 | env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_long; /* aligned MDB_INTEGERKEY */ |
4996 | |
4997 | /* For RDONLY, get lockfile after we know datafile exists */ |
4998 | if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { |
4999 | rc = mdb_env_setup_locks(env, &fname, mode, &excl); |
5000 | if (rc) |
5001 | goto leave; |
5002 | } |
5003 | |
5004 | rc = mdb_fopen(env, &fname, |
5005 | (flags & MDB_RDONLY) ? MDB_O_RDONLY : MDB_O_RDWR, |
5006 | mode, &env->me_fd); |
5007 | if (rc) |
5008 | goto leave; |
5009 | |
5010 | if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { |
5011 | rc = mdb_env_setup_locks(env, &fname, mode, &excl); |
5012 | if (rc) |
5013 | goto leave; |
5014 | } |
5015 | |
5016 | if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { |
5017 | if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { |
5018 | /* Synchronous fd for meta writes. Needed even with |
5019 | * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. |
5020 | */ |
5021 | rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); |
5022 | if (rc) |
5023 | goto leave; |
5024 | } |
5025 | DPRINTF(("opened dbenv %p" , (void *) env)); |
5026 | if (excl > 0) { |
5027 | rc = mdb_env_share_locks(env, &excl); |
5028 | if (rc) |
5029 | goto leave; |
5030 | } |
5031 | if (!(flags & MDB_RDONLY)) { |
5032 | MDB_txn *txn; |
5033 | int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * |
5034 | (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned int)+1); |
5035 | if ((env->me_pbuf = calloc(1, env->me_psize)) && |
5036 | (txn = calloc(1, size))) |
5037 | { |
5038 | txn->mt_dbs = (MDB_db *)((char *)txn + tsize); |
5039 | txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); |
5040 | txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); |
5041 | txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); |
5042 | txn->mt_env = env; |
5043 | txn->mt_dbxs = env->me_dbxs; |
5044 | txn->mt_flags = MDB_TXN_FINISHED; |
5045 | env->me_txn0 = txn; |
5046 | } else { |
5047 | rc = ENOMEM; |
5048 | } |
5049 | } |
5050 | } |
5051 | |
5052 | leave: |
5053 | if (rc) { |
5054 | mdb_env_close0(env, excl); |
5055 | } |
5056 | mdb_fname_destroy(fname); |
5057 | return rc; |
5058 | } |
5059 | |
/** Destroy resources from mdb_env_open(), clear our readers & DBIs.
 *
 * Does nothing unless #MDB_ENV_ACTIVE is set in me_flags; clears that
 * flag and #MDB_ENV_TXKEY when done. The freed pointers and closed
 * handles in env are not reset here.
 * @param[in] env	The environment to tear down.
 * @param[in] excl	State of the lockfile lock, as maintained by
 *	mdb_env_excl_lock(): -1 no/unknown lock, 0 shared, 1 exclusive.
 */
static void ESECT
mdb_env_close0(MDB_env *env, int excl)
{
	int i;

	if (!(env->me_flags & MDB_ENV_ACTIVE))
		return;

	/* Doing this here since me_dbxs may not exist during mdb_env_close */
	if (env->me_dbxs) {
		/* Names of the DBIs above the core ones are freed as well */
		for (i = env->me_maxdbs; --i >= CORE_DBS; )
			free(env->me_dbxs[i].md_name.mv_data);
		free(env->me_dbxs);
	}

	free(env->me_pbuf);
	free(env->me_dbiseqs);
	free(env->me_dbflags);
	free(env->me_path);
	free(env->me_dirty_list);
	free(env->me_txn0);
	mdb_midl_free(env->me_free_pgs);

	if (env->me_flags & MDB_ENV_TXKEY) {
		pthread_key_delete(env->me_txkey);
#ifdef _WIN32
		/* Delete our key from the global list */
		for (i=0; i<mdb_tls_nkeys; i++)
			if (mdb_tls_keys[i] == env->me_txkey) {
				/* Fill the hole with the last entry */
				mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1];
				mdb_tls_nkeys--;
				break;
			}
#endif
	}

	/* NOTE(review): munmap() and close() are also used on the Windows
	 * path below; presumably they are mapped to the Win32 equivalents
	 * elsewhere in this file - confirm.
	 */
	if (env->me_map) {
		munmap(env->me_map, env->me_mapsize);
	}
	if (env->me_mfd != INVALID_HANDLE_VALUE)
		(void) close(env->me_mfd);
	if (env->me_fd != INVALID_HANDLE_VALUE)
		(void) close(env->me_fd);
	if (env->me_txns) {
		MDB_PID_T pid = getpid();
		/* Clearing readers is done in this function because
		 * me_txkey with its destructor must be disabled first.
		 *
		 * We skip the reader mutex, so we touch only
		 * data owned by this process (me_close_readers and
		 * our readers), and clear each reader atomically.
		 */
		for (i = env->me_close_readers; --i >= 0; )
			if (env->me_txns->mti_readers[i].mr_pid == pid)
				env->me_txns->mti_readers[i].mr_pid = 0;
#ifdef _WIN32
		if (env->me_rmutex) {
			CloseHandle(env->me_rmutex);
			if (env->me_wmutex) CloseHandle(env->me_wmutex);
		}
		/* Windows automatically destroys the mutexes when
		 * the last handle closes.
		 */
#elif defined(MDB_USE_POSIX_SEM)
		if (env->me_rmutex != SEM_FAILED) {
			sem_close(env->me_rmutex);
			if (env->me_wmutex != SEM_FAILED)
				sem_close(env->me_wmutex);
			/* If we have the filelock: If we are the
			 * only remaining user, clean up semaphores.
			 */
			if (excl == 0)
				mdb_env_excl_lock(env, &excl);
			if (excl > 0) {
				sem_unlink(env->me_txns->mti_rmname);
				sem_unlink(env->me_txns->mti_wmname);
			}
		}
#elif defined(MDB_ROBUST_SUPPORTED)
		/* If we have the filelock: If we are the
		 * only remaining user, clean up robust
		 * mutexes.
		 */
		if (excl == 0)
			mdb_env_excl_lock(env, &excl);
		if (excl > 0) {
			pthread_mutex_destroy(env->me_txns->mti_rmutex);
			pthread_mutex_destroy(env->me_txns->mti_wmutex);
		}
#endif
		/* Same size formula as the one mdb_env_setup_locks() uses
		 * for the lock region.
		 */
		munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
	}
	if (env->me_lfd != INVALID_HANDLE_VALUE) {
#ifdef _WIN32
		if (excl >= 0) {
			/* Unlock the lockfile. Windows would have unlocked it
			 * after closing anyway, but not necessarily at once.
			 */
			UnlockFile(env->me_lfd, 0, 0, 1, 0);
		}
#endif
		(void) close(env->me_lfd);
	}

	env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY);
}
5167 | |
5168 | void ESECT |
5169 | mdb_env_close(MDB_env *env) |
5170 | { |
5171 | MDB_page *dp; |
5172 | |
5173 | if (env == NULL) |
5174 | return; |
5175 | |
5176 | VGMEMP_DESTROY(env); |
5177 | while ((dp = env->me_dpages) != NULL) { |
5178 | VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); |
5179 | env->me_dpages = dp->mp_next; |
5180 | free(dp); |
5181 | } |
5182 | |
5183 | mdb_env_close0(env, 0); |
5184 | free(env); |
5185 | } |
5186 | |
5187 | /** Compare two items pointing at aligned size_t's */ |
5188 | static int |
5189 | mdb_cmp_long(const MDB_val *a, const MDB_val *b) |
5190 | { |
5191 | return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 : |
5192 | *(size_t *)a->mv_data > *(size_t *)b->mv_data; |
5193 | } |
5194 | |
5195 | /** Compare two items pointing at aligned unsigned int's. |
5196 | * |
5197 | * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp, |
5198 | * but #mdb_cmp_clong() is called instead if the data type is size_t. |
5199 | */ |
5200 | static int |
5201 | mdb_cmp_int(const MDB_val *a, const MDB_val *b) |
5202 | { |
5203 | return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 : |
5204 | *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data; |
5205 | } |
5206 | |
5207 | /** Compare two items pointing at unsigned ints of unknown alignment. |
5208 | * Nodes and keys are guaranteed to be 2-byte aligned. |
5209 | */ |
5210 | static int |
5211 | mdb_cmp_cint(const MDB_val *a, const MDB_val *b) |
5212 | { |
5213 | #if BYTE_ORDER == LITTLE_ENDIAN |
5214 | unsigned short *u, *c; |
5215 | int x; |
5216 | |
5217 | u = (unsigned short *) ((char *) a->mv_data + a->mv_size); |
5218 | c = (unsigned short *) ((char *) b->mv_data + a->mv_size); |
5219 | do { |
5220 | x = *--u - *--c; |
5221 | } while(!x && u > (unsigned short *)a->mv_data); |
5222 | return x; |
5223 | #else |
5224 | unsigned short *u, *c, *end; |
5225 | int x; |
5226 | |
5227 | end = (unsigned short *) ((char *) a->mv_data + a->mv_size); |
5228 | u = (unsigned short *)a->mv_data; |
5229 | c = (unsigned short *)b->mv_data; |
5230 | do { |
5231 | x = *u++ - *c++; |
5232 | } while(!x && u < end); |
5233 | return x; |
5234 | #endif |
5235 | } |
5236 | |
5237 | /** Compare two items lexically */ |
5238 | static int |
5239 | mdb_cmp_memn(const MDB_val *a, const MDB_val *b) |
5240 | { |
5241 | int diff; |
5242 | ssize_t len_diff; |
5243 | unsigned int len; |
5244 | |
5245 | len = a->mv_size; |
5246 | len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; |
5247 | if (len_diff > 0) { |
5248 | len = b->mv_size; |
5249 | len_diff = 1; |
5250 | } |
5251 | |
5252 | diff = memcmp(a->mv_data, b->mv_data, len); |
5253 | return diff ? diff : len_diff<0 ? -1 : len_diff; |
5254 | } |
5255 | |
5256 | /** Compare two items in reverse byte order */ |
5257 | static int |
5258 | mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) |
5259 | { |
5260 | const unsigned char *p1, *p2, *p1_lim; |
5261 | ssize_t len_diff; |
5262 | int diff; |
5263 | |
5264 | p1_lim = (const unsigned char *)a->mv_data; |
5265 | p1 = (const unsigned char *)a->mv_data + a->mv_size; |
5266 | p2 = (const unsigned char *)b->mv_data + b->mv_size; |
5267 | |
5268 | len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size; |
5269 | if (len_diff > 0) { |
5270 | p1_lim += len_diff; |
5271 | len_diff = 1; |
5272 | } |
5273 | |
5274 | while (p1 > p1_lim) { |
5275 | diff = *--p1 - *--p2; |
5276 | if (diff) |
5277 | return diff; |
5278 | } |
5279 | return len_diff<0 ? -1 : len_diff; |
5280 | } |
5281 | |
5282 | /** Search for key within a page, using binary search. |
5283 | * Returns the smallest entry larger or equal to the key. |
5284 | * If exactp is non-null, stores whether the found entry was an exact match |
5285 | * in *exactp (1 or 0). |
5286 | * Updates the cursor index with the index of the found entry. |
5287 | * If no entry larger or equal to the key is found, returns NULL. |
5288 | */ |
5289 | static MDB_node * |
5290 | mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) |
5291 | { |
5292 | unsigned int i = 0, nkeys; |
5293 | int low, high; |
5294 | int rc = 0; |
5295 | MDB_page *mp = mc->mc_pg[mc->mc_top]; |
5296 | MDB_node *node = NULL; |
5297 | MDB_val nodekey; |
5298 | MDB_cmp_func *cmp; |
5299 | DKBUF; |
5300 | |
5301 | nkeys = NUMKEYS(mp); |
5302 | |
5303 | DPRINTF(("searching %u keys in %s %spage %" Z"u" , |
5304 | nkeys, IS_LEAF(mp) ? "leaf" : "branch" , IS_SUBP(mp) ? "sub-" : "" , |
5305 | mdb_dbg_pgno(mp))); |
5306 | |
5307 | low = IS_LEAF(mp) ? 0 : 1; |
5308 | high = nkeys - 1; |
5309 | cmp = mc->mc_dbx->md_cmp; |
5310 | |
5311 | /* Branch pages have no data, so if using integer keys, |
5312 | * alignment is guaranteed. Use faster mdb_cmp_int. |
5313 | */ |
5314 | if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { |
5315 | if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) |
5316 | cmp = mdb_cmp_long; |
5317 | else |
5318 | cmp = mdb_cmp_int; |
5319 | } |
5320 | |
5321 | if (IS_LEAF2(mp)) { |
5322 | nodekey.mv_size = mc->mc_db->md_pad; |
5323 | node = NODEPTR(mp, 0); /* fake */ |
5324 | while (low <= high) { |
5325 | i = (low + high) >> 1; |
5326 | nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); |
5327 | rc = cmp(key, &nodekey); |
5328 | DPRINTF(("found leaf index %u [%s], rc = %i" , |
5329 | i, DKEY(&nodekey), rc)); |
5330 | if (rc == 0) |
5331 | break; |
5332 | if (rc > 0) |
5333 | low = i + 1; |
5334 | else |
5335 | high = i - 1; |
5336 | } |
5337 | } else { |
5338 | while (low <= high) { |
5339 | i = (low + high) >> 1; |
5340 | |
5341 | node = NODEPTR(mp, i); |
5342 | nodekey.mv_size = NODEKSZ(node); |
5343 | nodekey.mv_data = NODEKEY(node); |
5344 | |
5345 | rc = cmp(key, &nodekey); |
5346 | #if MDB_DEBUG |
5347 | if (IS_LEAF(mp)) |
5348 | DPRINTF(("found leaf index %u [%s], rc = %i" , |
5349 | i, DKEY(&nodekey), rc)); |
5350 | else |
5351 | DPRINTF(("found branch index %u [%s -> %" Z"u], rc = %i" , |
5352 | i, DKEY(&nodekey), NODEPGNO(node), rc)); |
5353 | #endif |
5354 | if (rc == 0) |
5355 | break; |
5356 | if (rc > 0) |
5357 | low = i + 1; |
5358 | else |
5359 | high = i - 1; |
5360 | } |
5361 | } |
5362 | |
5363 | if (rc > 0) { /* Found entry is less than the key. */ |
5364 | i++; /* Skip to get the smallest entry larger than key. */ |
5365 | if (!IS_LEAF2(mp)) |
5366 | node = NODEPTR(mp, i); |
5367 | } |
5368 | if (exactp) |
5369 | *exactp = (rc == 0 && nkeys > 0); |
5370 | /* store the key index */ |
5371 | mc->mc_ki[mc->mc_top] = i; |
5372 | if (i >= nkeys) |
5373 | /* There is no entry larger or equal to the key. */ |
5374 | return NULL; |
5375 | |
5376 | /* nodeptr is fake for LEAF2 */ |
5377 | return node; |
5378 | } |
5379 | |
#if 0
/* Disabled: this block is compiled out by the surrounding "#if 0".
 * As written it would walk the txn's cursor list for mc's DBI and call
 * func(mc, m2) for every cursor m2 whose top page is the same page as
 * mc's top page.
 */
static void
mdb_cursor_adjust(MDB_cursor *mc, func)
{
MDB_cursor *m2;

for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) {
func(mc, m2);
}
}
}
#endif
5393 | |
5394 | /** Pop a page off the top of the cursor's stack. */ |
5395 | static void |
5396 | mdb_cursor_pop(MDB_cursor *mc) |
5397 | { |
5398 | if (mc->mc_snum) { |
5399 | DPRINTF(("popping page %" Z"u off db %d cursor %p" , |
5400 | mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc)); |
5401 | |
5402 | mc->mc_snum--; |
5403 | if (mc->mc_snum) { |
5404 | mc->mc_top--; |
5405 | } else { |
5406 | mc->mc_flags &= ~C_INITIALIZED; |
5407 | } |
5408 | } |
5409 | } |
5410 | |
5411 | /** Push a page onto the top of the cursor's stack. |
5412 | * Set #MDB_TXN_ERROR on failure. |
5413 | */ |
5414 | static int |
5415 | mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) |
5416 | { |
5417 | DPRINTF(("pushing page %" Z"u on db %d cursor %p" , mp->mp_pgno, |
5418 | DDBI(mc), (void *) mc)); |
5419 | |
5420 | if (mc->mc_snum >= CURSOR_STACK) { |
5421 | mc->mc_txn->mt_flags |= MDB_TXN_ERROR; |
5422 | return MDB_CURSOR_FULL; |
5423 | } |
5424 | |
5425 | mc->mc_top = mc->mc_snum++; |
5426 | mc->mc_pg[mc->mc_top] = mp; |
5427 | mc->mc_ki[mc->mc_top] = 0; |
5428 | |
5429 | return MDB_SUCCESS; |
5430 | } |
5431 | |
5432 | /** Find the address of the page corresponding to a given page number. |
5433 | * Set #MDB_TXN_ERROR on failure. |
5434 | * @param[in] mc the cursor accessing the page. |
5435 | * @param[in] pgno the page number for the page to retrieve. |
5436 | * @param[out] ret address of a pointer where the page's address will be stored. |
5437 | * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. |
5438 | * @return 0 on success, non-zero on failure. |
5439 | */ |
5440 | static int |
5441 | mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) |
5442 | { |
5443 | MDB_txn *txn = mc->mc_txn; |
5444 | MDB_env *env = txn->mt_env; |
5445 | MDB_page *p = NULL; |
5446 | int level; |
5447 | |
5448 | if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) { |
5449 | MDB_txn *tx2 = txn; |
5450 | level = 1; |
5451 | do { |
5452 | MDB_ID2L dl = tx2->mt_u.dirty_list; |
5453 | unsigned x; |
5454 | /* Spilled pages were dirtied in this txn and flushed |
5455 | * because the dirty list got full. Bring this page |
5456 | * back in from the map (but don't unspill it here, |
5457 | * leave that unless page_touch happens again). |
5458 | */ |
5459 | if (tx2->mt_spill_pgs) { |
5460 | MDB_ID pn = pgno << 1; |
5461 | x = mdb_midl_search(tx2->mt_spill_pgs, pn); |
5462 | if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { |
5463 | p = (MDB_page *)(env->me_map + env->me_psize * pgno); |
5464 | goto done; |
5465 | } |
5466 | } |
5467 | if (dl[0].mid) { |
5468 | unsigned x = mdb_mid2l_search(dl, pgno); |
5469 | if (x <= dl[0].mid && dl[x].mid == pgno) { |
5470 | p = dl[x].mptr; |
5471 | goto done; |
5472 | } |
5473 | } |
5474 | level++; |
5475 | } while ((tx2 = tx2->mt_parent) != NULL); |
5476 | } |
5477 | |
5478 | if (pgno < txn->mt_next_pgno) { |
5479 | level = 0; |
5480 | p = (MDB_page *)(env->me_map + env->me_psize * pgno); |
5481 | } else { |
5482 | DPRINTF(("page %" Z"u not found" , pgno)); |
5483 | txn->mt_flags |= MDB_TXN_ERROR; |
5484 | return MDB_PAGE_NOTFOUND; |
5485 | } |
5486 | |
5487 | done: |
5488 | *ret = p; |
5489 | if (lvl) |
5490 | *lvl = level; |
5491 | return MDB_SUCCESS; |
5492 | } |
5493 | |
5494 | /** Finish #mdb_page_search() / #mdb_page_search_lowest(). |
5495 | * The cursor is at the root page, set up the rest of it. |
5496 | */ |
5497 | static int |
5498 | mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) |
5499 | { |
5500 | MDB_page *mp = mc->mc_pg[mc->mc_top]; |
5501 | int rc; |
5502 | DKBUF; |
5503 | |
5504 | while (IS_BRANCH(mp)) { |
5505 | MDB_node *node; |
5506 | indx_t i; |
5507 | |
5508 | DPRINTF(("branch page %" Z"u has %u keys" , mp->mp_pgno, NUMKEYS(mp))); |
5509 | /* Don't assert on branch pages in the FreeDB. We can get here |
5510 | * while in the process of rebalancing a FreeDB branch page; we must |
5511 | * let that proceed. ITS#8336 |
5512 | */ |
5513 | mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); |
5514 | DPRINTF(("found index 0 to page %" Z"u" , NODEPGNO(NODEPTR(mp, 0)))); |
5515 | |
5516 | if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { |
5517 | i = 0; |
5518 | if (flags & MDB_PS_LAST) { |
5519 | i = NUMKEYS(mp) - 1; |
5520 | /* if already init'd, see if we're already in right place */ |
5521 | if (mc->mc_flags & C_INITIALIZED) { |
5522 | if (mc->mc_ki[mc->mc_top] == i) { |
5523 | mc->mc_top = mc->mc_snum++; |
5524 | mp = mc->mc_pg[mc->mc_top]; |
5525 | goto ready; |
5526 | } |
5527 | } |
5528 | } |
5529 | } else { |
5530 | int exact; |
5531 | node = mdb_node_search(mc, key, &exact); |
5532 | if (node == NULL) |
5533 | i = NUMKEYS(mp) - 1; |
5534 | else { |
5535 | i = mc->mc_ki[mc->mc_top]; |
5536 | if (!exact) { |
5537 | mdb_cassert(mc, i > 0); |
5538 | i--; |
5539 | } |
5540 | } |
5541 | DPRINTF(("following index %u for key [%s]" , i, DKEY(key))); |
5542 | } |
5543 | |
5544 | mdb_cassert(mc, i < NUMKEYS(mp)); |
5545 | node = NODEPTR(mp, i); |
5546 | |
5547 | if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) |
5548 | return rc; |
5549 | |
5550 | mc->mc_ki[mc->mc_top] = i; |
5551 | if ((rc = mdb_cursor_push(mc, mp))) |
5552 | return rc; |
5553 | |
5554 | ready: |
5555 | if (flags & MDB_PS_MODIFY) { |
5556 | if ((rc = mdb_page_touch(mc)) != 0) |
5557 | return rc; |
5558 | mp = mc->mc_pg[mc->mc_top]; |
5559 | } |
5560 | } |
5561 | |
5562 | if (!IS_LEAF(mp)) { |
5563 | DPRINTF(("internal error, index points to a %02X page!?" , |
5564 | mp->mp_flags)); |
5565 | mc->mc_txn->mt_flags |= MDB_TXN_ERROR; |
5566 | return MDB_CORRUPTED; |
5567 | } |
5568 | |
5569 | DPRINTF(("found leaf page %" Z"u for key [%s]" , mp->mp_pgno, |
5570 | key ? DKEY(key) : "null" )); |
5571 | mc->mc_flags |= C_INITIALIZED; |
5572 | mc->mc_flags &= ~C_EOF; |
5573 | |
5574 | return MDB_SUCCESS; |
5575 | } |
5576 | |
5577 | /** Search for the lowest key under the current branch page. |
5578 | * This just bypasses a NUMKEYS check in the current page |
5579 | * before calling mdb_page_search_root(), because the callers |
5580 | * are all in situations where the current page is known to |
5581 | * be underfilled. |
5582 | */ |
5583 | static int |
5584 | mdb_page_search_lowest(MDB_cursor *mc) |
5585 | { |
5586 | MDB_page *mp = mc->mc_pg[mc->mc_top]; |
5587 | MDB_node *node = NODEPTR(mp, 0); |
5588 | int rc; |
5589 | |
5590 | if ((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0) |
5591 | return rc; |
5592 | |
5593 | mc->mc_ki[mc->mc_top] = 0; |
5594 | if ((rc = mdb_cursor_push(mc, mp))) |
5595 | return rc; |
5596 | return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); |
5597 | } |
5598 | |
5599 | /** Search for the page a given key should be in. |
5600 | * Push it and its parent pages on the cursor stack. |
5601 | * @param[in,out] mc the cursor for this operation. |
5602 | * @param[in] key the key to search for, or NULL for first/last page. |
5603 | * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB |
5604 | * are touched (updated with new page numbers). |
5605 | * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. |
5606 | * This is used by #mdb_cursor_first() and #mdb_cursor_last(). |
5607 | * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. |
5608 | * @return 0 on success, non-zero on failure. |
5609 | */ |
5610 | static int |
5611 | mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) |
5612 | { |
5613 | int rc; |
5614 | pgno_t root; |
5615 | |
5616 | /* Make sure the txn is still viable, then find the root from |
5617 | * the txn's db table and set it as the root of the cursor's stack. |
5618 | */ |
5619 | if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) { |
5620 | DPUTS("transaction may not be used now" ); |
5621 | return MDB_BAD_TXN; |
5622 | } else { |
5623 | /* Make sure we're using an up-to-date root */ |
5624 | if (*mc->mc_dbflag & DB_STALE) { |
5625 | MDB_cursor mc2; |
5626 | if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) |
5627 | return MDB_BAD_DBI; |
5628 | mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); |
5629 | rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); |
5630 | if (rc) |
5631 | return rc; |
5632 | { |
5633 | MDB_val data; |
5634 | int exact = 0; |
5635 | uint16_t flags; |
5636 | MDB_node *leaf = mdb_node_search(&mc2, |
5637 | &mc->mc_dbx->md_name, &exact); |
5638 | if (!exact) |
5639 | return MDB_NOTFOUND; |
5640 | if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) |
5641 | return MDB_INCOMPATIBLE; /* not a named DB */ |
5642 | rc = mdb_node_read(&mc2, leaf, &data); |
5643 | if (rc) |
5644 | return rc; |
5645 | memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), |
5646 | sizeof(uint16_t)); |
5647 | /* The txn may not know this DBI, or another process may |
5648 | * have dropped and recreated the DB with other flags. |
5649 | */ |
5650 | if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags) |
5651 | return MDB_INCOMPATIBLE; |
5652 | memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); |
5653 | } |
5654 | *mc->mc_dbflag &= ~DB_STALE; |
5655 | } |
5656 | root = mc->mc_db->md_root; |
5657 | |
5658 | if (root == P_INVALID) { /* Tree is empty. */ |
5659 | DPUTS("tree is empty" ); |
5660 | return MDB_NOTFOUND; |
5661 | } |
5662 | } |
5663 | |
5664 | mdb_cassert(mc, root > 1); |
5665 | if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) |
5666 | if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0) |
5667 | return rc; |
5668 | |
5669 | mc->mc_snum = 1; |
5670 | mc->mc_top = 0; |
5671 | |
5672 | DPRINTF(("db %d root page %" Z"u has flags 0x%X" , |
5673 | DDBI(mc), root, mc->mc_pg[0]->mp_flags)); |
5674 | |
5675 | if (flags & MDB_PS_MODIFY) { |
5676 | if ((rc = mdb_page_touch(mc))) |
5677 | return rc; |
5678 | } |
5679 | |
5680 | if (flags & MDB_PS_ROOTONLY) |
5681 | return MDB_SUCCESS; |
5682 | |
5683 | return mdb_page_search_root(mc, key, flags); |
5684 | } |
5685 | |
5686 | static int |
5687 | mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) |
5688 | { |
5689 | MDB_txn *txn = mc->mc_txn; |
5690 | pgno_t pg = mp->mp_pgno; |
5691 | unsigned x = 0, ovpages = mp->mp_pages; |
5692 | MDB_env *env = txn->mt_env; |
5693 | MDB_IDL sl = txn->mt_spill_pgs; |
5694 | MDB_ID pn = pg << 1; |
5695 | int rc; |
5696 | |
5697 | DPRINTF(("free ov page %" Z"u (%d)" , pg, ovpages)); |
5698 | /* If the page is dirty or on the spill list we just acquired it, |
5699 | * so we should give it back to our current free list, if any. |
5700 | * Otherwise put it onto the list of pages we freed in this txn. |
5701 | * |
5702 | * Won't create me_pghead: me_pglast must be inited along with it. |
5703 | * Unsupported in nested txns: They would need to hide the page |
5704 | * range in ancestor txns' dirty and spilled lists. |
5705 | */ |
5706 | if (env->me_pghead && |
5707 | !txn->mt_parent && |
5708 | ((mp->mp_flags & P_DIRTY) || |
5709 | (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) |
5710 | { |
5711 | unsigned i, j; |
5712 | pgno_t *mop; |
5713 | MDB_ID2 *dl, ix, iy; |
5714 | rc = mdb_midl_need(&env->me_pghead, ovpages); |
5715 | if (rc) |
5716 | return rc; |
5717 | if (!(mp->mp_flags & P_DIRTY)) { |
5718 | /* This page is no longer spilled */ |
5719 | if (x == sl[0]) |
5720 | sl[0]--; |
5721 | else |
5722 | sl[x] |= 1; |
5723 | goto release; |
5724 | } |
5725 | /* Remove from dirty list */ |
5726 | dl = txn->mt_u.dirty_list; |
5727 | x = dl[0].mid--; |
5728 | for (ix = dl[x]; ix.mptr != mp; ix = iy) { |
5729 | if (x > 1) { |
5730 | x--; |
5731 | iy = dl[x]; |
5732 | dl[x] = ix; |
5733 | } else { |
5734 | mdb_cassert(mc, x > 1); |
5735 | j = ++(dl[0].mid); |
5736 | dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ |
5737 | txn->mt_flags |= MDB_TXN_ERROR; |
5738 | return MDB_CORRUPTED; |
5739 | } |
5740 | } |
5741 | txn->mt_dirty_room++; |
5742 | if (!(env->me_flags & MDB_WRITEMAP)) |
5743 | mdb_dpage_free(env, mp); |
5744 | release: |
5745 | /* Insert in me_pghead */ |
5746 | mop = env->me_pghead; |
5747 | j = mop[0] + ovpages; |
5748 | for (i = mop[0]; i && mop[i] < pg; i--) |
5749 | mop[j--] = mop[i]; |
5750 | while (j>i) |
5751 | mop[j--] = pg++; |
5752 | mop[0] += ovpages; |
5753 | } else { |
5754 | rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); |
5755 | if (rc) |
5756 | return rc; |
5757 | } |
5758 | mc->mc_db->md_overflow_pages -= ovpages; |
5759 | return 0; |
5760 | } |
5761 | |
5762 | /** Return the data associated with a given node. |
5763 | * @param[in] mc The cursor for this operation. |
5764 | * @param[in] leaf The node being read. |
5765 | * @param[out] data Updated to point to the node's data. |
5766 | * @return 0 on success, non-zero on failure. |
5767 | */ |
5768 | static int |
5769 | mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) |
5770 | { |
5771 | MDB_page *omp; /* overflow page */ |
5772 | pgno_t pgno; |
5773 | int rc; |
5774 | |
5775 | if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { |
5776 | data->mv_size = NODEDSZ(leaf); |
5777 | data->mv_data = NODEDATA(leaf); |
5778 | return MDB_SUCCESS; |
5779 | } |
5780 | |
5781 | /* Read overflow data. |
5782 | */ |
5783 | data->mv_size = NODEDSZ(leaf); |
5784 | memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); |
5785 | if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) { |
5786 | DPRINTF(("read overflow page %" Z"u failed" , pgno)); |
5787 | return rc; |
5788 | } |
5789 | data->mv_data = METADATA(omp); |
5790 | |
5791 | return MDB_SUCCESS; |
5792 | } |
5793 | |
5794 | int |
5795 | mdb_get(MDB_txn *txn, MDB_dbi dbi, |
5796 | MDB_val *key, MDB_val *data) |
5797 | { |
5798 | MDB_cursor mc; |
5799 | MDB_xcursor mx; |
5800 | int exact = 0; |
5801 | DKBUF; |
5802 | |
5803 | DPRINTF(("===> get db %u key [%s]" , dbi, DKEY(key))); |
5804 | |
5805 | if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
5806 | return EINVAL; |
5807 | |
5808 | if (txn->mt_flags & MDB_TXN_BLOCKED) |
5809 | return MDB_BAD_TXN; |
5810 | |
5811 | mdb_cursor_init(&mc, txn, dbi, &mx); |
5812 | return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); |
5813 | } |
5814 | |
5815 | /** Find a sibling for a page. |
5816 | * Replaces the page at the top of the cursor's stack with the |
5817 | * specified sibling, if one exists. |
5818 | * @param[in] mc The cursor for this operation. |
5819 | * @param[in] move_right Non-zero if the right sibling is requested, |
5820 | * otherwise the left sibling. |
5821 | * @return 0 on success, non-zero on failure. |
5822 | */ |
5823 | static int |
5824 | mdb_cursor_sibling(MDB_cursor *mc, int move_right) |
5825 | { |
5826 | int rc; |
5827 | MDB_node *indx; |
5828 | MDB_page *mp; |
5829 | |
5830 | if (mc->mc_snum < 2) { |
5831 | return MDB_NOTFOUND; /* root has no siblings */ |
5832 | } |
5833 | |
5834 | mdb_cursor_pop(mc); |
5835 | DPRINTF(("parent page is page %" Z"u, index %u" , |
5836 | mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); |
5837 | |
5838 | if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) |
5839 | : (mc->mc_ki[mc->mc_top] == 0)) { |
5840 | DPRINTF(("no more keys left, moving to %s sibling" , |
5841 | move_right ? "right" : "left" )); |
5842 | if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) { |
5843 | /* undo cursor_pop before returning */ |
5844 | mc->mc_top++; |
5845 | mc->mc_snum++; |
5846 | return rc; |
5847 | } |
5848 | } else { |
5849 | if (move_right) |
5850 | mc->mc_ki[mc->mc_top]++; |
5851 | else |
5852 | mc->mc_ki[mc->mc_top]--; |
5853 | DPRINTF(("just moving to %s index key %u" , |
5854 | move_right ? "right" : "left" , mc->mc_ki[mc->mc_top])); |
5855 | } |
5856 | mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); |
5857 | |
5858 | indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
5859 | if ((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) { |
5860 | /* mc will be inconsistent if caller does mc_snum++ as above */ |
5861 | mc->mc_flags &= ~(C_INITIALIZED|C_EOF); |
5862 | return rc; |
5863 | } |
5864 | |
5865 | mdb_cursor_push(mc, mp); |
5866 | if (!move_right) |
5867 | mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; |
5868 | |
5869 | return MDB_SUCCESS; |
5870 | } |
5871 | |
5872 | /** Move the cursor to the next data item. */ |
5873 | static int |
5874 | mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) |
5875 | { |
5876 | MDB_page *mp; |
5877 | MDB_node *leaf; |
5878 | int rc; |
5879 | |
5880 | if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP)) |
5881 | return MDB_NOTFOUND; |
5882 | |
5883 | if (!(mc->mc_flags & C_INITIALIZED)) |
5884 | return mdb_cursor_first(mc, key, data); |
5885 | |
5886 | mp = mc->mc_pg[mc->mc_top]; |
5887 | |
5888 | if (mc->mc_flags & C_EOF) { |
5889 | if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) |
5890 | return MDB_NOTFOUND; |
5891 | mc->mc_flags ^= C_EOF; |
5892 | } |
5893 | |
5894 | if (mc->mc_db->md_flags & MDB_DUPSORT) { |
5895 | leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
5896 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
5897 | if (op == MDB_NEXT || op == MDB_NEXT_DUP) { |
5898 | rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); |
5899 | if (op != MDB_NEXT || rc != MDB_NOTFOUND) { |
5900 | if (rc == MDB_SUCCESS) |
5901 | MDB_GET_KEY(leaf, key); |
5902 | return rc; |
5903 | } |
5904 | } |
5905 | } else { |
5906 | mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
5907 | if (op == MDB_NEXT_DUP) |
5908 | return MDB_NOTFOUND; |
5909 | } |
5910 | } |
5911 | |
5912 | DPRINTF(("cursor_next: top page is %" Z"u in cursor %p" , |
5913 | mdb_dbg_pgno(mp), (void *) mc)); |
5914 | if (mc->mc_flags & C_DEL) { |
5915 | mc->mc_flags ^= C_DEL; |
5916 | goto skip; |
5917 | } |
5918 | |
5919 | if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { |
5920 | DPUTS("=====> move to next sibling page" ); |
5921 | if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { |
5922 | mc->mc_flags |= C_EOF; |
5923 | return rc; |
5924 | } |
5925 | mp = mc->mc_pg[mc->mc_top]; |
5926 | DPRINTF(("next page is %" Z"u, key index %u" , mp->mp_pgno, mc->mc_ki[mc->mc_top])); |
5927 | } else |
5928 | mc->mc_ki[mc->mc_top]++; |
5929 | |
5930 | skip: |
5931 | DPRINTF(("==> cursor points to page %" Z"u with %u keys, key index %u" , |
5932 | mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); |
5933 | |
5934 | if (IS_LEAF2(mp)) { |
5935 | key->mv_size = mc->mc_db->md_pad; |
5936 | key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); |
5937 | return MDB_SUCCESS; |
5938 | } |
5939 | |
5940 | mdb_cassert(mc, IS_LEAF(mp)); |
5941 | leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
5942 | |
5943 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
5944 | mdb_xcursor_init1(mc, leaf); |
5945 | rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); |
5946 | if (rc != MDB_SUCCESS) |
5947 | return rc; |
5948 | } else if (data) { |
5949 | if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) |
5950 | return rc; |
5951 | } |
5952 | |
5953 | MDB_GET_KEY(leaf, key); |
5954 | return MDB_SUCCESS; |
5955 | } |
5956 | |
5957 | /** Move the cursor to the previous data item. */ |
5958 | static int |
5959 | mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) |
5960 | { |
5961 | MDB_page *mp; |
5962 | MDB_node *leaf; |
5963 | int rc; |
5964 | |
5965 | if (!(mc->mc_flags & C_INITIALIZED)) { |
5966 | rc = mdb_cursor_last(mc, key, data); |
5967 | if (rc) |
5968 | return rc; |
5969 | mc->mc_ki[mc->mc_top]++; |
5970 | } |
5971 | |
5972 | mp = mc->mc_pg[mc->mc_top]; |
5973 | |
5974 | if ((mc->mc_db->md_flags & MDB_DUPSORT) && |
5975 | mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { |
5976 | leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
5977 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
5978 | if (op == MDB_PREV || op == MDB_PREV_DUP) { |
5979 | rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); |
5980 | if (op != MDB_PREV || rc != MDB_NOTFOUND) { |
5981 | if (rc == MDB_SUCCESS) { |
5982 | MDB_GET_KEY(leaf, key); |
5983 | mc->mc_flags &= ~C_EOF; |
5984 | } |
5985 | return rc; |
5986 | } |
5987 | } |
5988 | } else { |
5989 | mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
5990 | if (op == MDB_PREV_DUP) |
5991 | return MDB_NOTFOUND; |
5992 | } |
5993 | } |
5994 | |
5995 | DPRINTF(("cursor_prev: top page is %" Z"u in cursor %p" , |
5996 | mdb_dbg_pgno(mp), (void *) mc)); |
5997 | |
5998 | mc->mc_flags &= ~(C_EOF|C_DEL); |
5999 | |
6000 | if (mc->mc_ki[mc->mc_top] == 0) { |
6001 | DPUTS("=====> move to prev sibling page" ); |
6002 | if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { |
6003 | return rc; |
6004 | } |
6005 | mp = mc->mc_pg[mc->mc_top]; |
6006 | mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; |
6007 | DPRINTF(("prev page is %" Z"u, key index %u" , mp->mp_pgno, mc->mc_ki[mc->mc_top])); |
6008 | } else |
6009 | mc->mc_ki[mc->mc_top]--; |
6010 | |
6011 | DPRINTF(("==> cursor points to page %" Z"u with %u keys, key index %u" , |
6012 | mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); |
6013 | |
6014 | if (!IS_LEAF(mp)) |
6015 | return MDB_CORRUPTED; |
6016 | |
6017 | if (IS_LEAF2(mp)) { |
6018 | key->mv_size = mc->mc_db->md_pad; |
6019 | key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); |
6020 | return MDB_SUCCESS; |
6021 | } |
6022 | |
6023 | leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
6024 | |
6025 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
6026 | mdb_xcursor_init1(mc, leaf); |
6027 | rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); |
6028 | if (rc != MDB_SUCCESS) |
6029 | return rc; |
6030 | } else if (data) { |
6031 | if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) |
6032 | return rc; |
6033 | } |
6034 | |
6035 | MDB_GET_KEY(leaf, key); |
6036 | return MDB_SUCCESS; |
6037 | } |
6038 | |
6039 | /** Set the cursor on a specific data item. */ |
6040 | static int |
6041 | mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, |
6042 | MDB_cursor_op op, int *exactp) |
6043 | { |
6044 | int rc; |
6045 | MDB_page *mp; |
6046 | MDB_node *leaf = NULL; |
6047 | DKBUF; |
6048 | |
6049 | if (key->mv_size == 0) |
6050 | return MDB_BAD_VALSIZE; |
6051 | |
6052 | if (mc->mc_xcursor) |
6053 | mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
6054 | |
6055 | /* See if we're already on the right page */ |
6056 | if (mc->mc_flags & C_INITIALIZED) { |
6057 | MDB_val nodekey; |
6058 | |
6059 | mp = mc->mc_pg[mc->mc_top]; |
6060 | if (!NUMKEYS(mp)) { |
6061 | mc->mc_ki[mc->mc_top] = 0; |
6062 | return MDB_NOTFOUND; |
6063 | } |
6064 | if (mp->mp_flags & P_LEAF2) { |
6065 | nodekey.mv_size = mc->mc_db->md_pad; |
6066 | nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); |
6067 | } else { |
6068 | leaf = NODEPTR(mp, 0); |
6069 | MDB_GET_KEY2(leaf, nodekey); |
6070 | } |
6071 | rc = mc->mc_dbx->md_cmp(key, &nodekey); |
6072 | if (rc == 0) { |
6073 | /* Probably happens rarely, but first node on the page |
6074 | * was the one we wanted. |
6075 | */ |
6076 | mc->mc_ki[mc->mc_top] = 0; |
6077 | if (exactp) |
6078 | *exactp = 1; |
6079 | goto set1; |
6080 | } |
6081 | if (rc > 0) { |
6082 | unsigned int i; |
6083 | unsigned int nkeys = NUMKEYS(mp); |
6084 | if (nkeys > 1) { |
6085 | if (mp->mp_flags & P_LEAF2) { |
6086 | nodekey.mv_data = LEAF2KEY(mp, |
6087 | nkeys-1, nodekey.mv_size); |
6088 | } else { |
6089 | leaf = NODEPTR(mp, nkeys-1); |
6090 | MDB_GET_KEY2(leaf, nodekey); |
6091 | } |
6092 | rc = mc->mc_dbx->md_cmp(key, &nodekey); |
6093 | if (rc == 0) { |
6094 | /* last node was the one we wanted */ |
6095 | mc->mc_ki[mc->mc_top] = nkeys-1; |
6096 | if (exactp) |
6097 | *exactp = 1; |
6098 | goto set1; |
6099 | } |
6100 | if (rc < 0) { |
6101 | if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { |
6102 | /* This is definitely the right page, skip search_page */ |
6103 | if (mp->mp_flags & P_LEAF2) { |
6104 | nodekey.mv_data = LEAF2KEY(mp, |
6105 | mc->mc_ki[mc->mc_top], nodekey.mv_size); |
6106 | } else { |
6107 | leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
6108 | MDB_GET_KEY2(leaf, nodekey); |
6109 | } |
6110 | rc = mc->mc_dbx->md_cmp(key, &nodekey); |
6111 | if (rc == 0) { |
6112 | /* current node was the one we wanted */ |
6113 | if (exactp) |
6114 | *exactp = 1; |
6115 | goto set1; |
6116 | } |
6117 | } |
6118 | rc = 0; |
6119 | mc->mc_flags &= ~C_EOF; |
6120 | goto set2; |
6121 | } |
6122 | } |
6123 | /* If any parents have right-sibs, search. |
6124 | * Otherwise, there's nothing further. |
6125 | */ |
6126 | for (i=0; i<mc->mc_top; i++) |
6127 | if (mc->mc_ki[i] < |
6128 | NUMKEYS(mc->mc_pg[i])-1) |
6129 | break; |
6130 | if (i == mc->mc_top) { |
6131 | /* There are no other pages */ |
6132 | mc->mc_ki[mc->mc_top] = nkeys; |
6133 | return MDB_NOTFOUND; |
6134 | } |
6135 | } |
6136 | if (!mc->mc_top) { |
6137 | /* There are no other pages */ |
6138 | mc->mc_ki[mc->mc_top] = 0; |
6139 | if (op == MDB_SET_RANGE && !exactp) { |
6140 | rc = 0; |
6141 | goto set1; |
6142 | } else |
6143 | return MDB_NOTFOUND; |
6144 | } |
6145 | } else { |
6146 | mc->mc_pg[0] = 0; |
6147 | } |
6148 | |
6149 | rc = mdb_page_search(mc, key, 0); |
6150 | if (rc != MDB_SUCCESS) |
6151 | return rc; |
6152 | |
6153 | mp = mc->mc_pg[mc->mc_top]; |
6154 | mdb_cassert(mc, IS_LEAF(mp)); |
6155 | |
6156 | set2: |
6157 | leaf = mdb_node_search(mc, key, exactp); |
6158 | if (exactp != NULL && !*exactp) { |
6159 | /* MDB_SET specified and not an exact match. */ |
6160 | return MDB_NOTFOUND; |
6161 | } |
6162 | |
6163 | if (leaf == NULL) { |
6164 | DPUTS("===> inexact leaf not found, goto sibling" ); |
6165 | if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { |
6166 | mc->mc_flags |= C_EOF; |
6167 | return rc; /* no entries matched */ |
6168 | } |
6169 | mp = mc->mc_pg[mc->mc_top]; |
6170 | mdb_cassert(mc, IS_LEAF(mp)); |
6171 | leaf = NODEPTR(mp, 0); |
6172 | } |
6173 | |
6174 | set1: |
6175 | mc->mc_flags |= C_INITIALIZED; |
6176 | mc->mc_flags &= ~C_EOF; |
6177 | |
6178 | if (IS_LEAF2(mp)) { |
6179 | if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { |
6180 | key->mv_size = mc->mc_db->md_pad; |
6181 | key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); |
6182 | } |
6183 | return MDB_SUCCESS; |
6184 | } |
6185 | |
6186 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
6187 | mdb_xcursor_init1(mc, leaf); |
6188 | if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { |
6189 | rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); |
6190 | } else { |
6191 | int ex2, *ex2p; |
6192 | if (op == MDB_GET_BOTH) { |
6193 | ex2p = &ex2; |
6194 | ex2 = 0; |
6195 | } else { |
6196 | ex2p = NULL; |
6197 | } |
6198 | rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); |
6199 | if (rc != MDB_SUCCESS) |
6200 | return rc; |
6201 | } |
6202 | } else if (data) { |
6203 | if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { |
6204 | MDB_val olddata; |
6205 | MDB_cmp_func *dcmp; |
6206 | if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) |
6207 | return rc; |
6208 | dcmp = mc->mc_dbx->md_dcmp; |
6209 | #if UINT_MAX < SIZE_MAX |
6210 | if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) |
6211 | dcmp = mdb_cmp_clong; |
6212 | #endif |
6213 | rc = dcmp(data, &olddata); |
6214 | if (rc) { |
6215 | if (op == MDB_GET_BOTH || rc > 0) |
6216 | return MDB_NOTFOUND; |
6217 | rc = 0; |
6218 | } |
6219 | *data = olddata; |
6220 | |
6221 | } else { |
6222 | if (mc->mc_xcursor) |
6223 | mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
6224 | if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) |
6225 | return rc; |
6226 | } |
6227 | } |
6228 | |
6229 | /* The key already matches in all other cases */ |
6230 | if (op == MDB_SET_RANGE || op == MDB_SET_KEY) |
6231 | MDB_GET_KEY(leaf, key); |
6232 | DPRINTF(("==> cursor placed on key [%s]" , DKEY(key))); |
6233 | |
6234 | return rc; |
6235 | } |
6236 | |
6237 | /** Move the cursor to the first item in the database. */ |
6238 | static int |
6239 | mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) |
6240 | { |
6241 | int rc; |
6242 | MDB_node *leaf; |
6243 | |
6244 | if (mc->mc_xcursor) |
6245 | mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
6246 | |
6247 | if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { |
6248 | rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); |
6249 | if (rc != MDB_SUCCESS) |
6250 | return rc; |
6251 | } |
6252 | mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); |
6253 | |
6254 | leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); |
6255 | mc->mc_flags |= C_INITIALIZED; |
6256 | mc->mc_flags &= ~C_EOF; |
6257 | |
6258 | mc->mc_ki[mc->mc_top] = 0; |
6259 | |
6260 | if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { |
6261 | if ( key ) { |
6262 | key->mv_size = mc->mc_db->md_pad; |
6263 | key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); |
6264 | } |
6265 | return MDB_SUCCESS; |
6266 | } |
6267 | |
6268 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
6269 | mdb_xcursor_init1(mc, leaf); |
6270 | rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); |
6271 | if (rc) |
6272 | return rc; |
6273 | } else if (data) { |
6274 | if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) |
6275 | return rc; |
6276 | } |
6277 | |
6278 | MDB_GET_KEY(leaf, key); |
6279 | return MDB_SUCCESS; |
6280 | } |
6281 | |
6282 | /** Move the cursor to the last item in the database. */ |
6283 | static int |
6284 | mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) |
6285 | { |
6286 | int rc; |
6287 | MDB_node *leaf; |
6288 | |
6289 | if (mc->mc_xcursor) |
6290 | mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); |
6291 | |
6292 | if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { |
6293 | rc = mdb_page_search(mc, NULL, MDB_PS_LAST); |
6294 | if (rc != MDB_SUCCESS) |
6295 | return rc; |
6296 | } |
6297 | mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); |
6298 | |
6299 | mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; |
6300 | mc->mc_flags |= C_INITIALIZED|C_EOF; |
6301 | leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
6302 | |
6303 | if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { |
6304 | if (key) { |
6305 | key->mv_size = mc->mc_db->md_pad; |
6306 | key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); |
6307 | } |
6308 | return MDB_SUCCESS; |
6309 | } |
6310 | |
6311 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
6312 | mdb_xcursor_init1(mc, leaf); |
6313 | rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); |
6314 | if (rc) |
6315 | return rc; |
6316 | } else if (data) { |
6317 | if ((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS) |
6318 | return rc; |
6319 | } |
6320 | |
6321 | MDB_GET_KEY(leaf, key); |
6322 | return MDB_SUCCESS; |
6323 | } |
6324 | |
6325 | int |
6326 | mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, |
6327 | MDB_cursor_op op) |
6328 | { |
6329 | int rc; |
6330 | int exact = 0; |
6331 | int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); |
6332 | |
6333 | if (mc == NULL) |
6334 | return EINVAL; |
6335 | |
6336 | if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) |
6337 | return MDB_BAD_TXN; |
6338 | |
6339 | switch (op) { |
6340 | case MDB_GET_CURRENT: |
6341 | if (!(mc->mc_flags & C_INITIALIZED)) { |
6342 | rc = EINVAL; |
6343 | } else { |
6344 | MDB_page *mp = mc->mc_pg[mc->mc_top]; |
6345 | int nkeys = NUMKEYS(mp); |
6346 | if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { |
6347 | mc->mc_ki[mc->mc_top] = nkeys; |
6348 | rc = MDB_NOTFOUND; |
6349 | break; |
6350 | } |
6351 | rc = MDB_SUCCESS; |
6352 | if (IS_LEAF2(mp)) { |
6353 | key->mv_size = mc->mc_db->md_pad; |
6354 | key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); |
6355 | } else { |
6356 | MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); |
6357 | MDB_GET_KEY(leaf, key); |
6358 | if (data) { |
6359 | if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
6360 | rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); |
6361 | } else { |
6362 | rc = mdb_node_read(mc, leaf, data); |
6363 | } |
6364 | } |
6365 | } |
6366 | } |
6367 | break; |
6368 | case MDB_GET_BOTH: |
6369 | case MDB_GET_BOTH_RANGE: |
6370 | if (data == NULL) { |
6371 | rc = EINVAL; |
6372 | break; |
6373 | } |
6374 | if (mc->mc_xcursor == NULL) { |
6375 | rc = MDB_INCOMPATIBLE; |
6376 | break; |
6377 | } |
6378 | /* FALLTHRU */ |
6379 | case MDB_SET: |
6380 | case MDB_SET_KEY: |
6381 | case MDB_SET_RANGE: |
6382 | if (key == NULL) { |
6383 | rc = EINVAL; |
6384 | } else { |
6385 | rc = mdb_cursor_set(mc, key, data, op, |
6386 | op == MDB_SET_RANGE ? NULL : &exact); |
6387 | } |
6388 | break; |
6389 | case MDB_GET_MULTIPLE: |
6390 | if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { |
6391 | rc = EINVAL; |
6392 | break; |
6393 | } |
6394 | if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { |
6395 | rc = MDB_INCOMPATIBLE; |
6396 | break; |
6397 | } |
6398 | rc = MDB_SUCCESS; |
6399 | if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || |
6400 | (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) |
6401 | break; |
6402 | goto fetchm; |
6403 | case MDB_NEXT_MULTIPLE: |
6404 | if (data == NULL) { |
6405 | rc = EINVAL; |
6406 | break; |
6407 | } |
6408 | if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { |
6409 | rc = MDB_INCOMPATIBLE; |
6410 | break; |
6411 | } |
6412 | rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); |
6413 | if (rc == MDB_SUCCESS) { |
6414 | if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { |
6415 | MDB_cursor *mx; |
6416 | fetchm: |
6417 | mx = &mc->mc_xcursor->mx_cursor; |
6418 | data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * |
6419 | mx->mc_db->md_pad; |
6420 | data->mv_data = METADATA(mx->mc_pg[mx->mc_top]); |
6421 | mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; |
6422 | } else { |
6423 | rc = MDB_NOTFOUND; |
6424 | } |
6425 | } |
6426 | break; |
6427 | case MDB_PREV_MULTIPLE: |
6428 | if (data == NULL) { |
6429 | rc = EINVAL; |
6430 | break; |
6431 | } |
6432 | if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { |
6433 | rc = MDB_INCOMPATIBLE; |
6434 | break; |
6435 | } |
6436 | if (!(mc->mc_flags & C_INITIALIZED)) |
6437 | rc = mdb_cursor_last(mc, key, data); |
6438 | else |
6439 | rc = MDB_SUCCESS; |
6440 | if (rc == MDB_SUCCESS) { |
6441 | MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; |
6442 | if (mx->mc_flags & C_INITIALIZED) { |
6443 | rc = mdb_cursor_sibling(mx, 0); |
6444 | if (rc == MDB_SUCCESS) |
6445 | goto fetchm; |
6446 | } else { |
6447 | rc = MDB_NOTFOUND; |
6448 | } |
6449 | } |
6450 | break; |
6451 | case MDB_NEXT: |
6452 | case MDB_NEXT_DUP: |
6453 | case MDB_NEXT_NODUP: |
6454 | rc = mdb_cursor_next(mc, key, data, op); |
6455 | break; |
6456 | case MDB_PREV: |
6457 | case MDB_PREV_DUP: |
6458 | case MDB_PREV_NODUP: |
6459 | rc = mdb_cursor_prev(mc, key, data, op); |
6460 | break; |
6461 | case MDB_FIRST: |
6462 | rc = mdb_cursor_first(mc, key, data); |
6463 | break; |
6464 | case MDB_FIRST_DUP: |
6465 | mfunc = mdb_cursor_first; |
6466 | mmove: |
6467 | if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) { |
6468 | rc = EINVAL; |
6469 | break; |
6470 | } |
6471 | if (mc->mc_xcursor == NULL) { |
6472 | rc = MDB_INCOMPATIBLE; |
6473 | break; |
6474 | } |
6475 | if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) { |
6476 | mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); |
6477 | rc = MDB_NOTFOUND; |
6478 | break; |
6479 | } |
6480 | { |
6481 | MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
6482 | if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
6483 | MDB_GET_KEY(leaf, key); |
6484 | rc = mdb_node_read(mc, leaf, data); |
6485 | break; |
6486 | } |
6487 | } |
6488 | if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { |
6489 | rc = EINVAL; |
6490 | break; |
6491 | } |
6492 | rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); |
6493 | break; |
6494 | case MDB_LAST: |
6495 | rc = mdb_cursor_last(mc, key, data); |
6496 | break; |
6497 | case MDB_LAST_DUP: |
6498 | mfunc = mdb_cursor_last; |
6499 | goto mmove; |
6500 | default: |
6501 | DPRINTF(("unhandled/unimplemented cursor operation %u" , op)); |
6502 | rc = EINVAL; |
6503 | break; |
6504 | } |
6505 | |
6506 | if (mc->mc_flags & C_DEL) |
6507 | mc->mc_flags ^= C_DEL; |
6508 | |
6509 | return rc; |
6510 | } |
6511 | |
6512 | /** Touch all the pages in the cursor stack. Set mc_top. |
6513 | * Makes sure all the pages are writable, before attempting a write operation. |
6514 | * @param[in] mc The cursor to operate on. |
6515 | */ |
6516 | static int |
6517 | mdb_cursor_touch(MDB_cursor *mc) |
6518 | { |
6519 | int rc = MDB_SUCCESS; |
6520 | |
6521 | if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { |
6522 | /* Touch DB record of named DB */ |
6523 | MDB_cursor mc2; |
6524 | MDB_xcursor mcx; |
6525 | if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) |
6526 | return MDB_BAD_DBI; |
6527 | mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); |
6528 | rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); |
6529 | if (rc) |
6530 | return rc; |
6531 | *mc->mc_dbflag |= DB_DIRTY; |
6532 | } |
6533 | mc->mc_top = 0; |
6534 | if (mc->mc_snum) { |
6535 | do { |
6536 | rc = mdb_page_touch(mc); |
6537 | } while (!rc && ++(mc->mc_top) < mc->mc_snum); |
6538 | mc->mc_top = mc->mc_snum-1; |
6539 | } |
6540 | return rc; |
6541 | } |
6542 | |
/** Do not spill pages to disk if txn is getting full, may fail instead.
 * Internal flag understood by mdb_cursor_put() and mdb_cursor_del():
 * when set, they skip their call to mdb_page_spill(). Both pass it on
 * their nested calls for the dup sub-cursor. mdb_cursor_put() strips it
 * from its flags before interpreting them.
 */
#define MDB_NOSPILL 0x8000
6545 | |
/** Store an item through a cursor.
 *
 * Locates (or, with #MDB_CURRENT, reuses) the cursor position for @b key,
 * makes the cursor's pages writable, and then either overwrites the
 * existing node in place, converts a single item into a sub-page or
 * sub-DB of duplicates, or inserts a new node (splitting the page when
 * it lacks room). For duplicate storage the actual data item is written
 * by a nested call on the sub-cursor, where it is stored as a key.
 *
 * @param[in] mc The cursor; NULL yields EINVAL.
 * @param[in] key The key; NULL yields EINVAL.
 * @param[in,out] data The data. With #MDB_MULTIPLE this is a two-element
 *	array: data[0] describes one element and points at the first one,
 *	data[1].mv_size is the element count on entry and is overwritten
 *	with the number of elements actually stored. With #MDB_NOOVERWRITE
 *	and an existing key, it is set to the existing data. With
 *	#MDB_RESERVE, the overwrite paths in this function set
 *	data->mv_data to the storage instead of copying into it.
 * @param[in] flags Put flags; the internal #MDB_NOSPILL bit is removed
 *	before the others are interpreted.
 * @return #MDB_SUCCESS on success. Errors detected here include EINVAL
 *	(NULL argument, or #MDB_CURRENT on an uninitialized cursor), EACCES
 *	(read-only txn), #MDB_BAD_TXN (blocked txn), #MDB_BAD_VALSIZE,
 *	#MDB_INCOMPATIBLE, #MDB_KEYEXIST, ENOMEM and #MDB_CORRUPTED; errors
 *	from the helpers are passed through. When the insert itself or a
 *	sub-cursor write fails, the txn is flagged #MDB_TXN_ERROR.
 */
int
mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
    unsigned int flags)
{
	MDB_env *env;
	MDB_node *leaf = NULL;
	MDB_page *fp, *mp, *sub_root = NULL;
	uint16_t fp_flags;
	MDB_val xdata, *rdata, dkey, olddata;
	MDB_db dummy;
	int do_sub = 0, insert_key, insert_data;
	unsigned int mcount = 0, dcount = 0, nospill;
	size_t nsize;
	int rc, rc2;
	unsigned int nflags;
	DKBUF;

	if (mc == NULL || key == NULL)
		return EINVAL;

	env = mc->mc_txn->mt_env;

	/* Check this first so counter will always be zero on any
	 * early failures.
	 */
	if (flags & MDB_MULTIPLE) {
		dcount = data[1].mv_size;
		data[1].mv_size = 0;
		if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED))
			return MDB_INCOMPATIBLE;
	}

	/* Remember the internal no-spill request, then hide it from the
	 * flag tests below.
	 */
	nospill = flags & MDB_NOSPILL;
	flags &= ~MDB_NOSPILL;

	if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))
		return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;

	/* NOTE(review): mv_size-1 presumably wraps around for an empty key
	 * (if mv_size is unsigned), so zero-length keys are rejected here
	 * as well as oversized ones - confirm.
	 */
	if (key->mv_size-1 >= ENV_MAXKEY(env))
		return MDB_BAD_VALSIZE;

	/* In a DUPSORT DB the data is limited to ENV_MAXKEY as well */
#if SIZE_MAX > MAXDATASIZE
	if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE))
		return MDB_BAD_VALSIZE;
#else
	if ((mc->mc_db->md_flags & MDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env))
		return MDB_BAD_VALSIZE;
#endif

	DPRINTF(("==> put db %d key [%s], size %" Z"u, data size %" Z"u" ,
		DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size));

	/* dkey is non-empty only while a converted single item still has
	 * to be written into the new sub-page/sub-DB.
	 */
	dkey.mv_size = 0;

	if (flags & MDB_CURRENT) {
		if (!(mc->mc_flags & C_INITIALIZED))
			return EINVAL;
		rc = MDB_SUCCESS;
	} else if (mc->mc_db->md_root == P_INVALID) {
		/* new database, cursor has nothing to point to */
		mc->mc_snum = 0;
		mc->mc_top = 0;
		mc->mc_flags &= ~C_INITIALIZED;
		rc = MDB_NO_ROOT;
	} else {
		int exact = 0;
		MDB_val d2;
		if (flags & MDB_APPEND) {
			MDB_val k2;
			rc = mdb_cursor_last(mc, &k2, &d2);
			if (rc == 0) {
				rc = mc->mc_dbx->md_cmp(key, &k2);
				if (rc > 0) {
					/* new key sorts last: insert after the last slot */
					rc = MDB_NOTFOUND;
					mc->mc_ki[mc->mc_top]++;
				} else {
					/* new key is <= last key */
					rc = MDB_KEYEXIST;
				}
			}
		} else {
			rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact);
		}
		if ((flags & MDB_NOOVERWRITE) && rc == 0) {
			DPRINTF(("duplicate key [%s]" , DKEY(key)));
			*data = d2;
			return MDB_KEYEXIST;
		}
		if (rc && rc != MDB_NOTFOUND)
			return rc;
	}

	if (mc->mc_flags & C_DEL)
		mc->mc_flags ^= C_DEL;

	/* Cursor is positioned, check for room in the dirty list */
	if (!nospill) {
		if (flags & MDB_MULTIPLE) {
			/* size the request for all dcount elements at once */
			rdata = &xdata;
			xdata.mv_size = data->mv_size * dcount;
		} else {
			rdata = data;
		}
		if ((rc2 = mdb_page_spill(mc, key, rdata)))
			return rc2;
	}

	if (rc == MDB_NO_ROOT) {
		MDB_page *np;
		/* new database, write a root leaf page */
		DPUTS("allocating new root leaf page" );
		if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) {
			return rc2;
		}
		mdb_cursor_push(mc, np);
		mc->mc_db->md_root = np->mp_pgno;
		mc->mc_db->md_depth++;
		*mc->mc_dbflag |= DB_DIRTY;
		/* DUPFIXED without DUPSORT: the root leaf becomes a LEAF2 page */
		if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED))
			== MDB_DUPFIXED)
			np->mp_flags |= P_LEAF2;
		mc->mc_flags |= C_INITIALIZED;
	} else {
		/* make sure all cursor pages are writable */
		rc2 = mdb_cursor_touch(mc);
		if (rc2)
			return rc2;
	}

	/* At this point rc is 0 when the key already exists (or MDB_CURRENT
	 * was given) and MDB_NOTFOUND/MDB_NO_ROOT when it must be inserted.
	 */
	insert_key = insert_data = rc;
	if (insert_key) {
		/* The key does not exist */
		DPRINTF(("inserting key at index %i" , mc->mc_ki[mc->mc_top]));
		if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
			LEAFSIZE(key, data) > env->me_nodemax)
		{
			/* Too big for a node, insert in sub-DB. Set up an empty
			 * "old sub-page" for prep_subDB to expand to a full page.
			 */
			fp_flags = P_LEAF|P_DIRTY;
			fp = env->me_pbuf;
			fp->mp_pad = data->mv_size; /* used if MDB_DUPFIXED */
			fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE);
			olddata.mv_size = PAGEHDRSZ;
			goto prep_subDB;
		}
	} else {
		/* there's only a key anyway, so this is a no-op */
		if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
			char *ptr;
			unsigned int ksize = mc->mc_db->md_pad;
			if (key->mv_size != ksize)
				return MDB_BAD_VALSIZE;
			ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
			memcpy(ptr, key->mv_data, ksize);
fix_parent:
			/* if overwriting slot 0 of leaf, need to
			 * update branch key if there is a parent page
			 */
			if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
				unsigned short dtop = 1;
				mc->mc_top--;
				/* slot 0 is always an empty key, find real slot */
				while (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
					mc->mc_top--;
					dtop++;
				}
				if (mc->mc_ki[mc->mc_top])
					rc2 = mdb_update_key(mc, key);
				else
					rc2 = MDB_SUCCESS;
				/* restore the cursor to the leaf level */
				mc->mc_top += dtop;
				if (rc2)
					return rc2;
			}
			return MDB_SUCCESS;
		}

		/* Also re-entered for each further element of an MDB_MULTIPLE put */
more:
		leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
		olddata.mv_size = NODEDSZ(leaf);
		olddata.mv_data = NODEDATA(leaf);

		/* DB has dups? */
		if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
			/* Prepare (sub-)page/sub-DB to accept the new item,
			 * if needed. fp: old sub-page or a header faking
			 * it. mp: new (sub-)page. offset: growth in page
			 * size. xdata: node data with new page or DB.
			 */
			unsigned i, offset = 0;
			mp = fp = xdata.mv_data = env->me_pbuf;
			mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;

			/* Was a single item before, must convert now */
			if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
				MDB_cmp_func *dcmp;
				/* Just overwrite the current item */
				if (flags == MDB_CURRENT)
					goto current;
				dcmp = mc->mc_dbx->md_dcmp;
#if UINT_MAX < SIZE_MAX
				if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t))
					dcmp = mdb_cmp_clong;
#endif
				/* does data match? */
				if (!dcmp(data, &olddata)) {
					if (flags & (MDB_NODUPDATA|MDB_APPENDDUP))
						return MDB_KEYEXIST;
					/* overwrite it */
					goto current;
				}

				/* Back up original data item */
				dkey.mv_size = olddata.mv_size;
				dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size);

				/* Make sub-page header for the dup items, with dummy body */
				fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
				fp->mp_lower = (PAGEHDRSZ-PAGEBASE);
				xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
				if (mc->mc_db->md_flags & MDB_DUPFIXED) {
					fp->mp_flags |= P_LEAF2;
					fp->mp_pad = data->mv_size;
					xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
				} else {
					xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
						(dkey.mv_size & 1) + (data->mv_size & 1);
				}
				fp->mp_upper = xdata.mv_size - PAGEBASE;
				olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */
			} else if (leaf->mn_flags & F_SUBDATA) {
				/* Data is on sub-DB, just store it */
				flags |= F_DUPDATA|F_SUBDATA;
				goto put_sub;
			} else {
				/* Data is on sub-page */
				fp = olddata.mv_data;
				switch (flags) {
				default:
					if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
						offset = EVEN(NODESIZE + sizeof(indx_t) +
							data->mv_size);
						break;
					}
					offset = fp->mp_pad;
					if (SIZELEFT(fp) < offset) {
						offset *= 4; /* space for 4 more */
						break;
					}
					/* FALLTHRU */ /* Big enough MDB_DUPFIXED sub-page */
				case MDB_CURRENT:
					fp->mp_flags |= P_DIRTY;
					COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
					mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
					flags |= F_DUPDATA;
					goto put_sub;
				}
				xdata.mv_size = olddata.mv_size + offset;
			}

			fp_flags = fp->mp_flags;
			if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) {
					/* Too big for a sub-page, convert to sub-DB */
					fp_flags &= ~P_SUBP;
prep_subDB:
					/* Build the MDB_db record describing the new sub-DB;
					 * it becomes the node's data (xdata).
					 */
					if (mc->mc_db->md_flags & MDB_DUPFIXED) {
						fp_flags |= P_LEAF2;
						dummy.md_pad = fp->mp_pad;
						dummy.md_flags = MDB_DUPFIXED;
						if (mc->mc_db->md_flags & MDB_INTEGERDUP)
							dummy.md_flags |= MDB_INTEGERKEY;
					} else {
						dummy.md_pad = 0;
						dummy.md_flags = 0;
					}
					dummy.md_depth = 1;
					dummy.md_branch_pages = 0;
					dummy.md_leaf_pages = 1;
					dummy.md_overflow_pages = 0;
					dummy.md_entries = NUMKEYS(fp);
					xdata.mv_size = sizeof(MDB_db);
					xdata.mv_data = &dummy;
					if ((rc = mdb_page_alloc(mc, 1, &mp)))
						return rc;
					offset = env->me_psize - olddata.mv_size;
					flags |= F_DUPDATA|F_SUBDATA;
					dummy.md_root = mp->mp_pgno;
					sub_root = mp;
			}
			if (mp != fp) {
				/* Copy the old sub-page contents into the new (sub-)page,
				 * shifting node offsets by the growth in size.
				 */
				mp->mp_flags = fp_flags | P_DIRTY;
				mp->mp_pad = fp->mp_pad;
				mp->mp_lower = fp->mp_lower;
				mp->mp_upper = fp->mp_upper + offset;
				if (fp_flags & P_LEAF2) {
					memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
				} else {
					memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
						olddata.mv_size - fp->mp_upper - PAGEBASE);
					memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0]));
					for (i=0; i<NUMKEYS(fp); i++)
						mp->mp_ptrs[i] += offset;
				}
			}

			rdata = &xdata;
			flags |= F_DUPDATA;
			do_sub = 1;
			/* An existing node is removed and re-added with the new data */
			if (!insert_key)
				mdb_node_del(mc, 0);
			goto new_sub;
		}
current:
		/* LMDB passes F_SUBDATA in 'flags' to write a DB record */
		if ((leaf->mn_flags ^ flags) & F_SUBDATA)
			return MDB_INCOMPATIBLE;
		/* overflow page overwrites need special handling */
		if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
			MDB_page *omp;
			pgno_t pg;
			int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);

			/* the node's data holds the page number of the overflow page */
			memcpy(&pg, olddata.mv_data, sizeof(pg));
			if ((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0)
				return rc2;
			ovpages = omp->mp_pages;

			/* Is the ov page large enough? */
			if (ovpages >= dpages) {
			  if (!(omp->mp_flags & P_DIRTY) &&
				  (level || (env->me_flags & MDB_WRITEMAP)))
			  {
				rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
				if (rc)
					return rc;
				level = 0; /* dirty in this txn or clean */
			  }
			  /* Is it dirty? */
			  if (omp->mp_flags & P_DIRTY) {
				/* yes, overwrite it. Note in this case we don't
				 * bother to try shrinking the page if the new data
				 * is smaller than the overflow threshold.
				 */
				if (level > 1) {
					/* It is writable only in a parent txn */
					size_t sz = (size_t) env->me_psize * ovpages, off;
					MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
					MDB_ID2 id2;
					if (!np)
						return ENOMEM;
					id2.mid = pg;
					id2.mptr = np;
					/* Note - this page is already counted in parent's dirty_room */
					rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
					mdb_cassert(mc, rc2 == 0);
					/* Currently we make the page look as with put() in the
					 * parent txn, in case the user peeks at MDB_RESERVEd
					 * or unused parts. Some users treat ovpages specially.
					 */
					if (!(flags & MDB_RESERVE)) {
						/* Skip the part where LMDB will put *data.
						 * Copy end of page, adjusting alignment so
						 * compiler may copy words instead of bytes.
						 */
						off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t);
						memcpy((size_t *)((char *)np + off),
							(size_t *)((char *)omp + off), sz - off);
						sz = PAGEHDRSZ;
					}
					memcpy(np, omp, sz); /* Copy beginning of page */
					omp = np;
				}
				SETDSZ(leaf, data->mv_size);
				if (F_ISSET(flags, MDB_RESERVE))
					data->mv_data = METADATA(omp);
				else
					memcpy(METADATA(omp), data->mv_data, data->mv_size);
				return MDB_SUCCESS;
			  }
			}
			/* Could not reuse the overflow page(s): free them; the node
			 * itself is deleted and re-added below.
			 */
			if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
				return rc2;
		} else if (data->mv_size == olddata.mv_size) {
			/* same size, just replace it. Note that we could
			 * also reuse this node if the new data is smaller,
			 * but instead we opt to shrink the node in that case.
			 */
			if (F_ISSET(flags, MDB_RESERVE))
				data->mv_data = olddata.mv_data;
			else if (!(mc->mc_flags & C_SUB))
				memcpy(olddata.mv_data, data->mv_data, data->mv_size);
			else {
				/* sub-cursor: the item lives in the node's key */
				memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
				goto fix_parent;
			}
			return MDB_SUCCESS;
		}
		mdb_node_del(mc, 0);
	}

	rdata = data;

	/* Add the node for key with rdata: either the caller's data, or the
	 * sub-page / sub-DB record (xdata) prepared above.
	 */
new_sub:
	nflags = flags & NODE_ADD_FLAGS;
	nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
	if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
		if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
			nflags &= ~MDB_APPEND; /* sub-page may need room to grow */
		if (!insert_key)
			nflags |= MDB_SPLIT_REPLACE;
		rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags);
	} else {
		/* There is room already in this leaf page. */
		rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
		if (rc == 0) {
			/* Adjust other cursors pointing to mp */
			MDB_cursor *m2, *m3;
			MDB_dbi dbi = mc->mc_dbi;
			unsigned i = mc->mc_top;
			MDB_page *mp = mc->mc_pg[i];

			for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
				if (mc->mc_flags & C_SUB)
					m3 = &m2->mc_xcursor->mx_cursor;
				else
					m3 = m2;
				if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue;
				/* a newly inserted key shifts later slots up by one */
				if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) {
					m3->mc_ki[i]++;
				}
				XCURSOR_REFRESH(m3, i, mp);
			}
		}
	}

	if (rc == MDB_SUCCESS) {
		/* Now store the actual data in the child DB. Note that we're
		 * storing the user data in the keys field, so there are strict
		 * size limits on dupdata. The actual data fields of the child
		 * DB are all zero size.
		 */
		if (do_sub) {
			int xflags, new_dupdata;
			size_t ecount;
			/* Also reached directly (by goto) when the node already
			 * holds a sub-DB or a sub-page that needs no resizing.
			 */
put_sub:
			xdata.mv_size = 0;
			xdata.mv_data = "" ;
			leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
			if ((flags & (MDB_CURRENT|MDB_APPENDDUP)) == MDB_CURRENT) {
				xflags = MDB_CURRENT|MDB_NOSPILL;
			} else {
				mdb_xcursor_init1(mc, leaf);
				xflags = (flags & MDB_NODUPDATA) ?
					MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL;
			}
			if (sub_root)
				mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root;
			new_dupdata = (int)dkey.mv_size;
			/* converted, write the original data first */
			if (dkey.mv_size) {
				rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
				if (rc)
					goto bad_sub;
				/* we've done our job */
				dkey.mv_size = 0;
			}
			if (!(leaf->mn_flags & F_SUBDATA) || sub_root) {
				/* Adjust other cursors pointing to mp */
				MDB_cursor *m2;
				MDB_xcursor *mx = mc->mc_xcursor;
				unsigned i = mc->mc_top;
				MDB_page *mp = mc->mc_pg[i];

				for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
					if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
					if (!(m2->mc_flags & C_INITIALIZED)) continue;
					if (m2->mc_pg[i] == mp) {
						if (m2->mc_ki[i] == mc->mc_ki[i]) {
							mdb_xcursor_init2(m2, mx, new_dupdata);
						} else if (!insert_key) {
							XCURSOR_REFRESH(m2, i, mp);
						}
					}
				}
			}
			/* md_entries before/after tells whether an item was added */
			ecount = mc->mc_xcursor->mx_db.md_entries;
			if (flags & MDB_APPENDDUP)
				xflags |= MDB_APPEND;
			rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
			if (flags & F_SUBDATA) {
				/* write the updated sub-DB record back into the node */
				void *db = NODEDATA(leaf);
				memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
			}
			insert_data = mc->mc_xcursor->mx_db.md_entries - ecount;
		}
		/* Increment count unless we just replaced an existing item. */
		if (insert_data)
			mc->mc_db->md_entries++;
		if (insert_key) {
			/* Invalidate txn if we created an empty sub-DB */
			if (rc)
				goto bad_sub;
			/* If we succeeded and the key didn't exist before,
			 * make sure the cursor is marked valid.
			 */
			mc->mc_flags |= C_INITIALIZED;
		}
		if (flags & MDB_MULTIPLE) {
			if (!rc) {
				mcount++;
				/* let caller know how many succeeded, if any */
				data[1].mv_size = mcount;
				if (mcount < dcount) {
					/* advance to the next element and store it
					 * under the same, now existing, key
					 */
					data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
					insert_key = insert_data = 0;
					goto more;
				}
			}
		}
		return rc;
bad_sub:
		if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */
			rc = MDB_CORRUPTED;
	}
	/* Failure after the tree may have been modified: poison the txn */
	mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}
7074 | |
/** Delete the item the cursor currently points to.
 * Refuses to run on a read-only transaction (EACCES) or a blocked one
 * (#MDB_BAD_TXN), on an uninitialized cursor (EINVAL), and when the
 * cursor index is at or past the number of keys on its page
 * (#MDB_NOTFOUND).
 *
 * For a node carrying #F_DUPDATA:
 * - with #MDB_NODUPDATA in \b flags, the entry count is reduced for all
 *   duplicates but one and the whole node is then removed through
 *   mdb_cursor_del0();
 * - otherwise only the duplicate under the sub-cursor is deleted (by a
 *   recursive call); the node itself is removed only when the sub-DB
 *   ends up with no entries.
 *
 * #MDB_TXN_ERROR is set on the paths that leave through the \b fail label.
 * @param[in] mc The cursor to operate on.
 * @param[in] flags #MDB_NODUPDATA, #MDB_NOSPILL and #F_SUBDATA are examined.
 * @return 0 on success, non-zero on failure.
 */
int
mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
{
	MDB_node *leaf;
	MDB_page *mp;
	int rc;

	/* Deleting requires a writable transaction that is not blocked. */
	if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))
		return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;

	if (!(mc->mc_flags & C_INITIALIZED))
		return EINVAL;

	/* Cursor index is beyond the last key on the page: nothing to delete. */
	if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
		return MDB_NOTFOUND;

	/* Callers that pass MDB_NOSPILL skip the spill step (see the
	 * recursive call below).
	 */
	if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
		return rc;

	/* NOTE(review): presumably makes the pages on the cursor stack
	 * writable before they are modified - confirm in mdb_cursor_touch().
	 */
	rc = mdb_cursor_touch(mc);
	if (rc)
		return rc;

	/* Re-read the page pointer only after the touch call. */
	mp = mc->mc_pg[mc->mc_top];
	if (!IS_LEAF(mp))
		return MDB_CORRUPTED;
	/* LEAF2 pages skip all node inspection and go straight to key removal. */
	if (IS_LEAF2(mp))
		goto del_key;
	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);

	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
		if (flags & MDB_NODUPDATA) {
			/* mdb_cursor_del0() will subtract the final entry */
			mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
			mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
		} else {
			/* Without F_SUBDATA the duplicates live inside this node's
			 * data; point the sub-cursor's page at it before recursing.
			 */
			if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
				mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
			}
			/* Delete a single duplicate through the sub-cursor. The spill
			 * step was already handled (or suppressed) by this call.
			 */
			rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL);
			if (rc)
				return rc;
			/* If sub-DB still has entries, we're done */
			if (mc->mc_xcursor->mx_db.md_entries) {
				if (leaf->mn_flags & F_SUBDATA) {
					/* update subDB info */
					void *db = NODEDATA(leaf);
					memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
				} else {
					MDB_cursor *m2;
					/* shrink fake page */
					mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
					/* The shrink moved node data on mp; re-fetch the
					 * node and the sub-cursor's page pointer.
					 */
					leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
					mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
					/* fix other sub-DB cursors pointed at fake pages on this page */
					for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
						if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
						if (!(m2->mc_flags & C_INITIALIZED)) continue;
						if (m2->mc_pg[mc->mc_top] == mp) {
							XCURSOR_REFRESH(m2, mc->mc_top, mp);
						}
					}
				}
				/* One duplicate gone, key still present. */
				mc->mc_db->md_entries--;
				return rc;
			} else {
				mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
			}
			/* otherwise fall thru and delete the sub-DB */
		}

		if (leaf->mn_flags & F_SUBDATA) {
			/* add all the child DB's pages to the free list */
			rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
			if (rc)
				goto fail;
		}
	}
	/* LMDB passes F_SUBDATA in 'flags' to delete a DB record */
	/* The XOR is non-zero when exactly one of the node's flags and the
	 * caller's flags has F_SUBDATA set.
	 */
	else if ((leaf->mn_flags ^ flags) & F_SUBDATA) {
		rc = MDB_INCOMPATIBLE;
		goto fail;
	}

	/* add overflow pages to free list */
	if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
		MDB_page *omp;
		pgno_t pg;

		/* The node data of a F_BIGDATA node holds the overflow page number. */
		memcpy(&pg, NODEDATA(leaf), sizeof(pg));
		if ((rc = mdb_page_get(mc, pg, &omp, NULL)) ||
			(rc = mdb_ovpage_free(mc, omp)))
			goto fail;
	}

del_key:
	return mdb_cursor_del0(mc);

fail:
	mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}
7177 | |
7178 | /** Allocate and initialize new pages for a database. |
7179 | * Set #MDB_TXN_ERROR on failure. |
7180 | * @param[in] mc a cursor on the database being added to. |
7181 | * @param[in] flags flags defining what type of page is being allocated. |
7182 | * @param[in] num the number of pages to allocate. This is usually 1, |
7183 | * unless allocating overflow pages for a large record. |
7184 | * @param[out] mp Address of a page, or NULL on failure. |
7185 | * @return 0 on success, non-zero on failure. |
7186 | */ |
7187 | static int |
7188 | mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) |
7189 | { |
7190 | MDB_page *np; |
7191 | int rc; |
7192 | |
7193 | if ((rc = mdb_page_alloc(mc, num, &np))) |
7194 | return rc; |
7195 | DPRINTF(("allocated new mpage %" Z"u, page size %u" , |
7196 | np->mp_pgno, mc->mc_txn->mt_env->me_psize)); |
7197 | np->mp_flags = flags | P_DIRTY; |
7198 | np->mp_lower = (PAGEHDRSZ-PAGEBASE); |
7199 | np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; |
7200 | |
7201 | if (IS_BRANCH(np)) |
7202 | mc->mc_db->md_branch_pages++; |
7203 | else if (IS_LEAF(np)) |
7204 | mc->mc_db->md_leaf_pages++; |
7205 | else if (IS_OVERFLOW(np)) { |
7206 | mc->mc_db->md_overflow_pages += num; |
7207 | np->mp_pages = num; |
7208 | } |
7209 | *mp = np; |
7210 | |
7211 | return 0; |
7212 | } |
7213 | |
7214 | /** Calculate the size of a leaf node. |
7215 | * The size depends on the environment's page size; if a data item |
7216 | * is too large it will be put onto an overflow page and the node |
7217 | * size will only include the key and not the data. Sizes are always |
7218 | * rounded up to an even number of bytes, to guarantee 2-byte alignment |
7219 | * of the #MDB_node headers. |
7220 | * @param[in] env The environment handle. |
7221 | * @param[in] key The key for the node. |
7222 | * @param[in] data The data for the node. |
7223 | * @return The number of bytes needed to store the node. |
7224 | */ |
7225 | static size_t |
7226 | mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) |
7227 | { |
7228 | size_t sz; |
7229 | |
7230 | sz = LEAFSIZE(key, data); |
7231 | if (sz > env->me_nodemax) { |
7232 | /* put on overflow page */ |
7233 | sz -= data->mv_size - sizeof(pgno_t); |
7234 | } |
7235 | |
7236 | return EVEN(sz + sizeof(indx_t)); |
7237 | } |
7238 | |
7239 | /** Calculate the size of a branch node. |
7240 | * The size should depend on the environment's page size but since |
7241 | * we currently don't support spilling large keys onto overflow |
7242 | * pages, it's simply the size of the #MDB_node header plus the |
7243 | * size of the key. Sizes are always rounded up to an even number |
7244 | * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. |
7245 | * @param[in] env The environment handle. |
7246 | * @param[in] key The key for the node. |
7247 | * @return The number of bytes needed to store the node. |
7248 | */ |
7249 | static size_t |
7250 | mdb_branch_size(MDB_env *env, MDB_val *key) |
7251 | { |
7252 | size_t sz; |
7253 | |
7254 | sz = INDXSIZE(key); |
7255 | if (sz > env->me_nodemax) { |
7256 | /* put on overflow page */ |
7257 | /* not implemented */ |
7258 | /* sz -= key->size - sizeof(pgno_t); */ |
7259 | } |
7260 | |
7261 | return sz + sizeof(indx_t); |
7262 | } |
7263 | |
/** Add a node to the page pointed to by the cursor.
 * Set #MDB_TXN_ERROR on failure.
 *
 * On an IS_LEAF2() page the key is simply copied into its fixed-size
 * slot (no node header, no free-space check in this function). On other
 * pages a node header plus key (and, on leaf pages, data or an overflow
 * page number) is written at the top of the free space and a new slot is
 * opened in mp_ptrs[].
 * @param[in] mc The cursor for this operation.
 * @param[in] indx The index on the page where the new node should be added.
 * @param[in] key The key for the new node.
 * @param[in,out] data The data for the new node, if any. With
 *	#MDB_RESERVE in \b flags nothing is copied; instead data->mv_data is
 *	set to the space reserved for the data.
 * @param[in] pgno The page number, if adding a branch node.
 * @param[in] flags Flags for the node.
 * @return 0 on success, non-zero on failure. Possible errors are:
 * <ul>
 *	<li>ENOMEM - failed to allocate overflow pages for the node
 *	(whatever mdb_page_new() returns is passed through).
 *	<li>MDB_PAGE_FULL - there is insufficient room in the page. This error
 *	should never happen since all callers already calculate the
 *	page's free space before calling this function.
 * </ul>
 */
static int
mdb_node_add(MDB_cursor *mc, indx_t indx,
	MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags)
{
	unsigned int i;
	size_t node_size = NODESIZE;
	ssize_t room;
	indx_t ofs;
	MDB_node *node;
	MDB_page *mp = mc->mc_pg[mc->mc_top];
	MDB_page *ofp = NULL; /* overflow page */
	void *ndata;
	DKBUF;

	mdb_cassert(mc, mp->mp_upper >= mp->mp_lower);

	DPRINTF(("add to %s %spage %" Z"u index %i, data size %" Z"u key size %" Z"u [%s]" ,
		IS_LEAF(mp) ? "leaf" : "branch" ,
		IS_SUBP(mp) ? "sub-" : "" ,
		mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0,
		key ? key->mv_size : 0, key ? DKEY(key) : "null" ));

	if (IS_LEAF2(mp)) {
		/* Move higher keys up one slot. */
		int ksize = mc->mc_db->md_pad, dif;
		char *ptr = LEAF2KEY(mp, indx, ksize);
		dif = NUMKEYS(mp) - indx;
		if (dif > 0)
			memmove(ptr+ksize, ptr, dif*ksize);
		/* insert new key */
		memcpy(ptr, key->mv_data, ksize);

		/* Just using these for counting */
		mp->mp_lower += sizeof(indx_t);
		mp->mp_upper -= ksize - sizeof(indx_t);
		return MDB_SUCCESS;
	}

	/* Free space left once the new mp_ptrs[] slot is accounted for. */
	room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
	if (key != NULL)
		node_size += key->mv_size;
	if (IS_LEAF(mp)) {
		mdb_cassert(mc, key && data);
		if (F_ISSET(flags, F_BIGDATA)) {
			/* Data already on overflow page. */
			node_size += sizeof(pgno_t);
		} else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) {
			int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
			int rc;
			/* Put data on overflow page. */
			DPRINTF(("data size is %" Z"u, node would be %" Z"u, put data on overflow page" ,
				data->mv_size, node_size+data->mv_size));
			node_size = EVEN(node_size + sizeof(pgno_t));
			/* Check the room before allocating, so no overflow page is
			 * allocated for a node that cannot be placed.
			 */
			if ((ssize_t)node_size > room)
				goto full;
			if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
				return rc;
			DPRINTF(("allocated overflow page %" Z"u" , ofp->mp_pgno));
			flags |= F_BIGDATA;
			/* Size is already rounded and checked; skip the generic check. */
			goto update;
		} else {
			node_size += data->mv_size;
		}
	}
	node_size = EVEN(node_size);
	if ((ssize_t)node_size > room)
		goto full;

update:
	/* Move higher pointers up one slot. */
	for (i = NUMKEYS(mp); i > indx; i--)
		mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];

	/* Adjust free space offsets. */
	ofs = mp->mp_upper - node_size;
	mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t));
	mp->mp_ptrs[indx] = ofs;
	mp->mp_upper = ofs;
	mp->mp_lower += sizeof(indx_t);

	/* Write the node data. */
	node = NODEPTR(mp, indx);
	node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
	node->mn_flags = flags;
	/* Leaf nodes record the data size, branch nodes the child page number. */
	if (IS_LEAF(mp))
		SETDSZ(node,data->mv_size);
	else
		SETPGNO(node,pgno);

	if (key)
		memcpy(NODEKEY(node), key->mv_data, key->mv_size);

	if (IS_LEAF(mp)) {
		ndata = NODEDATA(node);
		if (ofp == NULL) {
			/* No overflow page was allocated by this call. */
			if (F_ISSET(flags, F_BIGDATA))
				/* Caller's data is an existing overflow page number. */
				memcpy(ndata, data->mv_data, sizeof(pgno_t));
			else if (F_ISSET(flags, MDB_RESERVE))
				/* Hand the in-page space back instead of copying. */
				data->mv_data = ndata;
			else
				memcpy(ndata, data->mv_data, data->mv_size);
		} else {
			/* Node stores the new overflow page number; the payload
			 * goes into the overflow page itself.
			 */
			memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t));
			ndata = METADATA(ofp);
			if (F_ISSET(flags, MDB_RESERVE))
				data->mv_data = ndata;
			else
				memcpy(ndata, data->mv_data, data->mv_size);
		}
	}

	return MDB_SUCCESS;

full:
	DPRINTF(("not enough room in page %" Z"u, got %u ptrs" ,
		mdb_dbg_pgno(mp), NUMKEYS(mp)));
	DPRINTF(("upper-lower = %u - %u = %" Z"d" , mp->mp_upper,mp->mp_lower,room));
	DPRINTF(("node size = %" Z"u" , node_size));
	mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	return MDB_PAGE_FULL;
}
7401 | |
/** Delete the specified node from a page.
 * On an IS_LEAF2() page the fixed-size keys that follow the deleted one
 * are slid down by one slot. On any other page the node's entry is
 * removed from mp_ptrs[], the node bytes stored at lower offsets than
 * the deleted node are moved up by the deleted node's size, and the
 * mp_ptrs[] entries of the moved nodes are adjusted to match.
 * @param[in] mc Cursor pointing to the node to delete.
 * @param[in] ksize The size of one key slot. Only used when the page is
 * an IS_LEAF2() page (part of a #MDB_DUPFIXED database).
 */
static void
mdb_node_del(MDB_cursor *mc, int ksize)
{
	MDB_page *mp = mc->mc_pg[mc->mc_top];
	indx_t indx = mc->mc_ki[mc->mc_top];
	unsigned int sz;
	indx_t i, j, numkeys, ptr;
	MDB_node *node;
	char *base;

	DPRINTF(("delete node %u on %s page %" Z"u" , indx,
		IS_LEAF(mp) ? "leaf" : "branch" , mdb_dbg_pgno(mp)));
	numkeys = NUMKEYS(mp);
	mdb_cassert(mc, indx < numkeys);

	if (IS_LEAF2(mp)) {
		/* x = number of keys stored after the one being deleted */
		int x = numkeys - 1 - indx;
		base = LEAF2KEY(mp, indx, ksize);
		if (x)
			memmove(base, base + ksize, x * ksize);
		/* Mirror of the bookkeeping done by mdb_node_add() for LEAF2 pages. */
		mp->mp_lower -= sizeof(indx_t);
		mp->mp_upper += ksize - sizeof(indx_t);
		return;
	}

	/* Recompute the on-page size of the node: header + key + (data or
	 * overflow page number), rounded with EVEN() as in mdb_node_add().
	 */
	node = NODEPTR(mp, indx);
	sz = NODESIZE + node->mn_ksize;
	if (IS_LEAF(mp)) {
		if (F_ISSET(node->mn_flags, F_BIGDATA))
			sz += sizeof(pgno_t);
		else
			sz += NODEDSZ(node);
	}
	sz = EVEN(sz);

	/* Compact mp_ptrs[], dropping slot indx. Nodes stored at a lower
	 * offset than the deleted node will be shifted up by sz below, so
	 * their offsets are bumped here.
	 */
	ptr = mp->mp_ptrs[indx];
	for (i = j = 0; i < numkeys; i++) {
		if (i != indx) {
			mp->mp_ptrs[j] = mp->mp_ptrs[i];
			if (mp->mp_ptrs[i] < ptr)
				mp->mp_ptrs[j] += sz;
			j++;
		}
	}

	/* Close the gap: move everything between mp_upper and the deleted
	 * node up by sz bytes (regions overlap, hence memmove).
	 */
	base = (char *)mp + mp->mp_upper + PAGEBASE;
	memmove(base + sz, base, ptr - mp->mp_upper);

	mp->mp_lower -= sizeof(indx_t);
	mp->mp_upper += sz;
}
7458 | |
/** Compact the main page after deleting a node on a subpage.
 * The free space inside the subpage (as reported by SIZELEFT()) is
 * removed from the node: the node's data size is reduced by that amount
 * and the bytes in front of it on the main page are shifted upward to
 * reclaim the space. Does nothing for an IS_LEAF2() subpage whose
 * shrunken size would be odd.
 * @param[in] mp The main page to operate on.
 * @param[in] indx The index of the subpage on the main page.
 */
static void
mdb_node_shrink(MDB_page *mp, indx_t indx)
{
	MDB_node *node;
	MDB_page *sp, *xp;
	char *base;
	indx_t delta, nsize, len, ptr;
	int i;

	node = NODEPTR(mp, indx);
	sp = (MDB_page *)NODEDATA(node);
	/* delta = unused bytes in the subpage, nsize = node data size without them */
	delta = SIZELEFT(sp);
	nsize = NODEDSZ(node) - delta;

	/* Prepare to shift upward, set len = length(subpage part to shift) */
	if (IS_LEAF2(sp)) {
		len = nsize;
		if (nsize & 1)
			return; /* do not make the node uneven-sized */
	} else {
		xp = (MDB_page *)((char *)sp + delta); /* destination subpage */
		/* Write the pointer array at its final location now, each offset
		 * reduced by delta. The destination lies delta bytes above the
		 * source, so walking from the highest index down never
		 * overwrites a source entry before it has been read.
		 */
		for (i = NUMKEYS(sp); --i >= 0; )
			xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
		/* Only the subpage header still needs to be moved. */
		len = PAGEHDRSZ;
	}
	/* The subpage now has no free space left. */
	sp->mp_upper = sp->mp_lower;
	COPY_PGNO(sp->mp_pgno, mp->mp_pgno);
	SETDSZ(node, nsize);

	/* Shift <lower nodes...initial part of subpage> upward */
	base = (char *)mp + mp->mp_upper + PAGEBASE;
	memmove(base + delta, base, (char *)sp + len - base);

	/* This node and every node stored at a lower offset moved up by delta. */
	ptr = mp->mp_ptrs[indx];
	for (i = NUMKEYS(mp); --i >= 0; ) {
		if (mp->mp_ptrs[i] <= ptr)
			mp->mp_ptrs[i] += delta;
	}
	mp->mp_upper += delta;
}
7503 | |
7504 | /** Initial setup of a sorted-dups cursor. |
7505 | * Sorted duplicates are implemented as a sub-database for the given key. |
7506 | * The duplicate data items are actually keys of the sub-database. |
7507 | * Operations on the duplicate data items are performed using a sub-cursor |
7508 | * initialized when the sub-database is first accessed. This function does |
7509 | * the preliminary setup of the sub-cursor, filling in the fields that |
7510 | * depend only on the parent DB. |
7511 | * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. |
7512 | */ |
7513 | static void |
7514 | mdb_xcursor_init0(MDB_cursor *mc) |
7515 | { |
7516 | MDB_xcursor *mx = mc->mc_xcursor; |
7517 | |
7518 | mx->mx_cursor.mc_xcursor = NULL; |
7519 | mx->mx_cursor.mc_txn = mc->mc_txn; |
7520 | mx->mx_cursor.mc_db = &mx->mx_db; |
7521 | mx->mx_cursor.mc_dbx = &mx->mx_dbx; |
7522 | mx->mx_cursor.mc_dbi = mc->mc_dbi; |
7523 | mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; |
7524 | mx->mx_cursor.mc_snum = 0; |
7525 | mx->mx_cursor.mc_top = 0; |
7526 | mx->mx_cursor.mc_flags = C_SUB; |
7527 | mx->mx_dbx.md_name.mv_size = 0; |
7528 | mx->mx_dbx.md_name.mv_data = NULL; |
7529 | mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; |
7530 | mx->mx_dbx.md_dcmp = NULL; |
7531 | mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; |
7532 | } |
7533 | |
7534 | /** Final setup of a sorted-dups cursor. |
7535 | * Sets up the fields that depend on the data from the main cursor. |
7536 | * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. |
7537 | * @param[in] node The data containing the #MDB_db record for the |
7538 | * sorted-dup database. |
7539 | */ |
7540 | static void |
7541 | mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) |
7542 | { |
7543 | MDB_xcursor *mx = mc->mc_xcursor; |
7544 | |
7545 | if (node->mn_flags & F_SUBDATA) { |
7546 | memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); |
7547 | mx->mx_cursor.mc_pg[0] = 0; |
7548 | mx->mx_cursor.mc_snum = 0; |
7549 | mx->mx_cursor.mc_top = 0; |
7550 | mx->mx_cursor.mc_flags = C_SUB; |
7551 | } else { |
7552 | MDB_page *fp = NODEDATA(node); |
7553 | mx->mx_db.md_pad = 0; |
7554 | mx->mx_db.md_flags = 0; |
7555 | mx->mx_db.md_depth = 1; |
7556 | mx->mx_db.md_branch_pages = 0; |
7557 | mx->mx_db.md_leaf_pages = 1; |
7558 | mx->mx_db.md_overflow_pages = 0; |
7559 | mx->mx_db.md_entries = NUMKEYS(fp); |
7560 | COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); |
7561 | mx->mx_cursor.mc_snum = 1; |
7562 | mx->mx_cursor.mc_top = 0; |
7563 | mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; |
7564 | mx->mx_cursor.mc_pg[0] = fp; |
7565 | mx->mx_cursor.mc_ki[0] = 0; |
7566 | if (mc->mc_db->md_flags & MDB_DUPFIXED) { |
7567 | mx->mx_db.md_flags = MDB_DUPFIXED; |
7568 | mx->mx_db.md_pad = fp->mp_pad; |
7569 | if (mc->mc_db->md_flags & MDB_INTEGERDUP) |
7570 | mx->mx_db.md_flags |= MDB_INTEGERKEY; |
7571 | } |
7572 | } |
7573 | DPRINTF(("Sub-db -%u root page %" Z"u" , mx->mx_cursor.mc_dbi, |
7574 | mx->mx_db.md_root)); |
7575 | mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; |
7576 | #if UINT_MAX < SIZE_MAX |
7577 | if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) |
7578 | mx->mx_dbx.md_cmp = mdb_cmp_clong; |
7579 | #endif |
7580 | } |
7581 | |
7582 | |
7583 | /** Fixup a sorted-dups cursor due to underlying update. |
7584 | * Sets up some fields that depend on the data from the main cursor. |
7585 | * Almost the same as init1, but skips initialization steps if the |
7586 | * xcursor had already been used. |
7587 | * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. |
7588 | * @param[in] src_mx The xcursor of an up-to-date cursor. |
7589 | * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. |
7590 | */ |
7591 | static void |
7592 | mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) |
7593 | { |
7594 | MDB_xcursor *mx = mc->mc_xcursor; |
7595 | |
7596 | if (new_dupdata) { |
7597 | mx->mx_cursor.mc_snum = 1; |
7598 | mx->mx_cursor.mc_top = 0; |
7599 | mx->mx_cursor.mc_flags |= C_INITIALIZED; |
7600 | mx->mx_cursor.mc_ki[0] = 0; |
7601 | mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; |
7602 | #if UINT_MAX < SIZE_MAX |
7603 | mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; |
7604 | #endif |
7605 | } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { |
7606 | return; |
7607 | } |
7608 | mx->mx_db = src_mx->mx_db; |
7609 | mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; |
7610 | DPRINTF(("Sub-db -%u root page %" Z"u" , mx->mx_cursor.mc_dbi, |
7611 | mx->mx_db.md_root)); |
7612 | } |
7613 | |
7614 | /** Initialize a cursor for a given transaction and database. */ |
7615 | static void |
7616 | mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) |
7617 | { |
7618 | mc->mc_next = NULL; |
7619 | mc->mc_backup = NULL; |
7620 | mc->mc_dbi = dbi; |
7621 | mc->mc_txn = txn; |
7622 | mc->mc_db = &txn->mt_dbs[dbi]; |
7623 | mc->mc_dbx = &txn->mt_dbxs[dbi]; |
7624 | mc->mc_dbflag = &txn->mt_dbflags[dbi]; |
7625 | mc->mc_snum = 0; |
7626 | mc->mc_top = 0; |
7627 | mc->mc_pg[0] = 0; |
7628 | mc->mc_ki[0] = 0; |
7629 | mc->mc_flags = 0; |
7630 | if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { |
7631 | mdb_tassert(txn, mx != NULL); |
7632 | mc->mc_xcursor = mx; |
7633 | mdb_xcursor_init0(mc); |
7634 | } else { |
7635 | mc->mc_xcursor = NULL; |
7636 | } |
7637 | if (*mc->mc_dbflag & DB_STALE) { |
7638 | mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); |
7639 | } |
7640 | } |
7641 | |
7642 | int |
7643 | mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) |
7644 | { |
7645 | MDB_cursor *mc; |
7646 | size_t size = sizeof(MDB_cursor); |
7647 | |
7648 | if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) |
7649 | return EINVAL; |
7650 | |
7651 | if (txn->mt_flags & MDB_TXN_BLOCKED) |
7652 | return MDB_BAD_TXN; |
7653 | |
7654 | if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) |
7655 | return EINVAL; |
7656 | |
7657 | if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) |
7658 | size += sizeof(MDB_xcursor); |
7659 | |
7660 | if ((mc = malloc(size)) != NULL) { |
7661 | mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); |
7662 | if (txn->mt_cursors) { |
7663 | mc->mc_next = txn->mt_cursors[dbi]; |
7664 | txn->mt_cursors[dbi] = mc; |
7665 | mc->mc_flags |= C_UNTRACK; |
7666 | } |
7667 | } else { |
7668 | return ENOMEM; |
7669 | } |
7670 | |
7671 | *ret = mc; |
7672 | |
7673 | return MDB_SUCCESS; |
7674 | } |
7675 | |
7676 | int |
7677 | mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) |
7678 | { |
7679 | if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID)) |
7680 | return EINVAL; |
7681 | |
7682 | if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) |
7683 | return EINVAL; |
7684 | |
7685 | if (txn->mt_flags & MDB_TXN_BLOCKED) |
7686 | return MDB_BAD_TXN; |
7687 | |
7688 | mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); |
7689 | return MDB_SUCCESS; |
7690 | } |
7691 | |
7692 | /* Return the count of duplicate data items for the current key */ |
7693 | int |
7694 | mdb_cursor_count(MDB_cursor *mc, size_t *countp) |
7695 | { |
7696 | MDB_node *leaf; |
7697 | |
7698 | if (mc == NULL || countp == NULL) |
7699 | return EINVAL; |
7700 | |
7701 | if (mc->mc_xcursor == NULL) |
7702 | return MDB_INCOMPATIBLE; |
7703 | |
7704 | if (mc->mc_txn->mt_flags & MDB_TXN_BLOCKED) |
7705 | return MDB_BAD_TXN; |
7706 | |
7707 | if (!(mc->mc_flags & C_INITIALIZED)) |
7708 | return EINVAL; |
7709 | |
7710 | if (!mc->mc_snum) |
7711 | return MDB_NOTFOUND; |
7712 | |
7713 | if (mc->mc_flags & C_EOF) { |
7714 | if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) |
7715 | return MDB_NOTFOUND; |
7716 | mc->mc_flags ^= C_EOF; |
7717 | } |
7718 | |
7719 | leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
7720 | if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { |
7721 | *countp = 1; |
7722 | } else { |
7723 | if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) |
7724 | return EINVAL; |
7725 | |
7726 | *countp = mc->mc_xcursor->mx_db.md_entries; |
7727 | } |
7728 | return MDB_SUCCESS; |
7729 | } |
7730 | |
7731 | void |
7732 | mdb_cursor_close(MDB_cursor *mc) |
7733 | { |
7734 | if (mc && !mc->mc_backup) { |
7735 | /* remove from txn, if tracked */ |
7736 | if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { |
7737 | MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; |
7738 | while (*prev && *prev != mc) prev = &(*prev)->mc_next; |
7739 | if (*prev == mc) |
7740 | *prev = mc->mc_next; |
7741 | } |
7742 | free(mc); |
7743 | } |
7744 | } |
7745 | |
7746 | MDB_txn * |
7747 | mdb_cursor_txn(MDB_cursor *mc) |
7748 | { |
7749 | if (!mc) return NULL; |
7750 | return mc->mc_txn; |
7751 | } |
7752 | |
/** Return the database handle stored in a cursor.
 * Unlike mdb_cursor_txn(), \b mc is dereferenced without a NULL check.
 * @param[in] mc The cursor; must not be NULL.
 * @return The cursor's \b mc_dbi.
 */
MDB_dbi
mdb_cursor_dbi(MDB_cursor *mc)
{
	return mc->mc_dbi;
}
7758 | |
/** Replace the key for a branch node with a new key.
 * Set #MDB_TXN_ERROR on failure.
 *
 * If the EVEN()-rounded key length changes, the node data stored at and
 * below this node's offset is shifted to grow or shrink the key area
 * and the affected mp_ptrs[] entries are adjusted. If the key grows and
 * the page lacks the extra room, the node is deleted and re-inserted via
 * mdb_page_split() with #MDB_SPLIT_REPLACE.
 * @param[in] mc Cursor pointing to the node to operate on.
 * @param[in] key The new key to use.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_update_key(MDB_cursor *mc, MDB_val *key)
{
	MDB_page *mp;
	MDB_node *node;
	char *base;
	size_t len;
	int delta, ksize, oksize;
	indx_t ptr, i, numkeys, indx;
	DKBUF;

	indx = mc->mc_ki[mc->mc_top];
	mp = mc->mc_pg[mc->mc_top];
	node = NODEPTR(mp, indx);
	ptr = mp->mp_ptrs[indx];
#if MDB_DEBUG
	{
		MDB_val k2;
		char kbuf2[DKBUF_MAXKEYSIZE*2+1];
		k2.mv_data = NODEKEY(node);
		k2.mv_size = node->mn_ksize;
		DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %" Z"u" ,
			indx, ptr,
			mdb_dkey(&k2, kbuf2),
			DKEY(key),
			mp->mp_pgno));
	}
#endif

	/* Sizes must be 2-byte aligned. */
	ksize = EVEN(key->mv_size);
	oksize = EVEN(node->mn_ksize);
	delta = ksize - oksize;

	/* Shift node contents if EVEN(key length) changed. */
	if (delta) {
		if (delta > 0 && SIZELEFT(mp) < delta) {
			pgno_t pgno;
			/* not enough space left, do a delete and split */
			DPRINTF(("Not enough room, delta = %d, splitting..." , delta));
			/* Save the child page number before the node disappears. */
			pgno = NODEPGNO(node);
			/* ksize argument is only read for LEAF2 pages, so 0 is fine here. */
			mdb_node_del(mc, 0);
			return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE);
		}

		/* This node and every node stored at a lower offset move by
		 * -delta (down when the key grows, up when it shrinks).
		 */
		numkeys = NUMKEYS(mp);
		for (i = 0; i < numkeys; i++) {
			if (mp->mp_ptrs[i] <= ptr)
				mp->mp_ptrs[i] -= delta;
		}

		/* Move everything from the start of the used area through this
		 * node's header; the key bytes themselves are rewritten below.
		 */
		base = (char *)mp + mp->mp_upper + PAGEBASE;
		len = ptr - mp->mp_upper + NODESIZE;
		memmove(base - delta, base, len);
		mp->mp_upper -= delta;

		/* The node moved; fetch its new address. */
		node = NODEPTR(mp, indx);
	}

	/* But even if no shift was needed, update ksize */
	if (node->mn_ksize != key->mv_size)
		node->mn_ksize = key->mv_size;

	if (key->mv_size)
		memcpy(NODEKEY(node), key->mv_data, key->mv_size);

	return MDB_SUCCESS;
}
7833 | |
/* Forward declaration: mdb_node_move() below calls this on a local
 * temporary cursor before the definition has been seen.
 * NOTE(review): presumably copies the state of csrc into cdst - confirm
 * against the definition.
 */
static void
mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst);
7836 | |
/** Perform \b act while tracking temporary cursor \b mn.
 * For the duration of \b act a cursor is linked at the head of the
 * transaction's cursor list for mn's DBI, and unlinked again afterwards.
 * If \b mn is a sub-cursor (#C_SUB), a stack-allocated dummy cursor is
 * linked instead, with its \b mc_xcursor pointing at \b mn.
 * NOTE(review): that cast assumes \b mn is the leading member of an
 * #MDB_xcursor - confirm against the struct definition.
 *
 * \b mn must be an lvalue of type #MDB_cursor; it is evaluated several
 * times. \b act must not leave the block with return/goto/break, since
 * the unlink step runs only after \b act completes, and it must not
 * rely on outer variables named \b dummy, \b tracked or \b tp, which
 * are declared by the expansion.
 */
#define WITH_CURSOR_TRACKING(mn, act) do { \
	MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[(mn).mc_dbi]; \
	if ((mn).mc_flags & C_SUB) { \
		dummy.mc_flags = C_INITIALIZED; \
		dummy.mc_xcursor = (MDB_xcursor *)&(mn); \
		tracked = &dummy; \
	} else { \
		tracked = &(mn); \
	} \
	tracked->mc_next = *tp; \
	*tp = tracked; \
	{ act; } \
	*tp = tracked->mc_next; \
} while (0)
7852 | |
7853 | /** Move a node from csrc to cdst. |
7854 | */ |
7855 | static int |
7856 | mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) |
7857 | { |
7858 | MDB_node *srcnode; |
7859 | MDB_val key, data; |
7860 | pgno_t srcpg; |
7861 | MDB_cursor mn; |
7862 | int rc; |
7863 | unsigned short flags; |
7864 | |
7865 | DKBUF; |
7866 | |
7867 | /* Mark src and dst as dirty. */ |
7868 | if ((rc = mdb_page_touch(csrc)) || |
7869 | (rc = mdb_page_touch(cdst))) |
7870 | return rc; |
7871 | |
7872 | if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { |
7873 | key.mv_size = csrc->mc_db->md_pad; |
7874 | key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); |
7875 | data.mv_size = 0; |
7876 | data.mv_data = NULL; |
7877 | srcpg = 0; |
7878 | flags = 0; |
7879 | } else { |
7880 | srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); |
7881 | mdb_cassert(csrc, !((size_t)srcnode & 1)); |
7882 | srcpg = NODEPGNO(srcnode); |
7883 | flags = srcnode->mn_flags; |
7884 | if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { |
7885 | unsigned int snum = csrc->mc_snum; |
7886 | MDB_node *s2; |
7887 | /* must find the lowest key below src */ |
7888 | rc = mdb_page_search_lowest(csrc); |
7889 | if (rc) |
7890 | return rc; |
7891 | if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { |
7892 | key.mv_size = csrc->mc_db->md_pad; |
7893 | key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); |
7894 | } else { |
7895 | s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); |
7896 | key.mv_size = NODEKSZ(s2); |
7897 | key.mv_data = NODEKEY(s2); |
7898 | } |
7899 | csrc->mc_snum = snum--; |
7900 | csrc->mc_top = snum; |
7901 | } else { |
7902 | key.mv_size = NODEKSZ(srcnode); |
7903 | key.mv_data = NODEKEY(srcnode); |
7904 | } |
7905 | data.mv_size = NODEDSZ(srcnode); |
7906 | data.mv_data = NODEDATA(srcnode); |
7907 | } |
7908 | mn.mc_xcursor = NULL; |
7909 | if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { |
7910 | unsigned int snum = cdst->mc_snum; |
7911 | MDB_node *s2; |
7912 | MDB_val bkey; |
7913 | /* must find the lowest key below dst */ |
7914 | mdb_cursor_copy(cdst, &mn); |
7915 | rc = mdb_page_search_lowest(&mn); |
7916 | if (rc) |
7917 | return rc; |
7918 | if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { |
7919 | bkey.mv_size = mn.mc_db->md_pad; |
7920 | bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); |
7921 | } else { |
7922 | s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); |
7923 | bkey.mv_size = NODEKSZ(s2); |
7924 | bkey.mv_data = NODEKEY(s2); |
7925 | } |
7926 | mn.mc_snum = snum--; |
7927 | mn.mc_top = snum; |
7928 | mn.mc_ki[snum] = 0; |
7929 | rc = mdb_update_key(&mn, &bkey); |
7930 | if (rc) |
7931 | return rc; |
7932 | } |
7933 | |
7934 | DPRINTF(("moving %s node %u [%s] on page %" Z"u to node %u on page %" Z"u" , |
7935 | IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch" , |
7936 | csrc->mc_ki[csrc->mc_top], |
7937 | DKEY(&key), |
7938 | csrc->mc_pg[csrc->mc_top]->mp_pgno, |
7939 | cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno)); |
7940 | |
7941 | /* Add the node to the destination page. |
7942 | */ |
7943 | rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); |
7944 | if (rc != MDB_SUCCESS) |
7945 | return rc; |
7946 | |
7947 | /* Delete the node from the source page. |
7948 | */ |
7949 | mdb_node_del(csrc, key.mv_size); |
7950 | |
7951 | { |
7952 | /* Adjust other cursors pointing to mp */ |
7953 | MDB_cursor *m2, *m3; |
7954 | MDB_dbi dbi = csrc->mc_dbi; |
7955 | MDB_page *mpd, *mps; |
7956 | |
7957 | mps = csrc->mc_pg[csrc->mc_top]; |
7958 | /* If we're adding on the left, bump others up */ |
7959 | if (fromleft) { |
7960 | mpd = cdst->mc_pg[csrc->mc_top]; |
7961 | for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { |
7962 | if (csrc->mc_flags & C_SUB) |
7963 | m3 = &m2->mc_xcursor->mx_cursor; |
7964 | else |
7965 | m3 = m2; |
7966 | if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) |
7967 | continue; |
7968 | if (m3 != cdst && |
7969 | m3->mc_pg[csrc->mc_top] == mpd && |
7970 | m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { |
7971 | m3->mc_ki[csrc->mc_top]++; |
7972 | } |
7973 | if (m3 !=csrc && |
7974 | m3->mc_pg[csrc->mc_top] == mps && |
7975 | m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { |
7976 | m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; |
7977 | m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; |
7978 | m3->mc_ki[csrc->mc_top-1]++; |
7979 | } |
7980 | if (IS_LEAF(mps)) |
7981 | XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); |
7982 | } |
7983 | } else |
7984 | /* Adding on the right, bump others down */ |
7985 | { |
7986 | for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { |
7987 | if (csrc->mc_flags & C_SUB) |
7988 | m3 = &m2->mc_xcursor->mx_cursor; |
7989 | else |
7990 | m3 = m2; |
7991 | if (m3 == csrc) continue; |
7992 | if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) |
7993 | continue; |
7994 | if (m3->mc_pg[csrc->mc_top] == mps) { |
7995 | if (!m3->mc_ki[csrc->mc_top]) { |
7996 | m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; |
7997 | m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; |
7998 | m3->mc_ki[csrc->mc_top-1]--; |
7999 | } else { |
8000 | m3->mc_ki[csrc->mc_top]--; |
8001 | } |
8002 | if (IS_LEAF(mps)) |
8003 | XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); |
8004 | } |
8005 | } |
8006 | } |
8007 | } |
8008 | |
8009 | /* Update the parent separators. |
8010 | */ |
8011 | if (csrc->mc_ki[csrc->mc_top] == 0) { |
8012 | if (csrc->mc_ki[csrc->mc_top-1] != 0) { |
8013 | if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { |
8014 | key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); |
8015 | } else { |
8016 | srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); |
8017 | key.mv_size = NODEKSZ(srcnode); |
8018 | key.mv_data = NODEKEY(srcnode); |
8019 | } |
8020 | DPRINTF(("update separator for source page %" Z"u to [%s]" , |
8021 | csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); |
8022 | mdb_cursor_copy(csrc, &mn); |
8023 | mn.mc_snum--; |
8024 | mn.mc_top--; |
8025 | /* We want mdb_rebalance to find mn when doing fixups */ |
8026 | WITH_CURSOR_TRACKING(mn, |
8027 | rc = mdb_update_key(&mn, &key)); |
8028 | if (rc) |
8029 | return rc; |
8030 | } |
8031 | if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { |
8032 | MDB_val nullkey; |
8033 | indx_t ix = csrc->mc_ki[csrc->mc_top]; |
8034 | nullkey.mv_size = 0; |
8035 | csrc->mc_ki[csrc->mc_top] = 0; |
8036 | rc = mdb_update_key(csrc, &nullkey); |
8037 | csrc->mc_ki[csrc->mc_top] = ix; |
8038 | mdb_cassert(csrc, rc == MDB_SUCCESS); |
8039 | } |
8040 | } |
8041 | |
8042 | if (cdst->mc_ki[cdst->mc_top] == 0) { |
8043 | if (cdst->mc_ki[cdst->mc_top-1] != 0) { |
8044 | if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { |
8045 | key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); |
8046 | } else { |
8047 | srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); |
8048 | key.mv_size = NODEKSZ(srcnode); |
8049 | key.mv_data = NODEKEY(srcnode); |
8050 | } |
8051 | DPRINTF(("update separator for destination page %" Z"u to [%s]" , |
8052 | cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); |
8053 | mdb_cursor_copy(cdst, &mn); |
8054 | mn.mc_snum--; |
8055 | mn.mc_top--; |
8056 | /* We want mdb_rebalance to find mn when doing fixups */ |
8057 | WITH_CURSOR_TRACKING(mn, |
8058 | rc = mdb_update_key(&mn, &key)); |
8059 | if (rc) |
8060 | return rc; |
8061 | } |
8062 | if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { |
8063 | MDB_val nullkey; |
8064 | indx_t ix = cdst->mc_ki[cdst->mc_top]; |
8065 | nullkey.mv_size = 0; |
8066 | cdst->mc_ki[cdst->mc_top] = 0; |
8067 | rc = mdb_update_key(cdst, &nullkey); |
8068 | cdst->mc_ki[cdst->mc_top] = ix; |
8069 | mdb_cassert(cdst, rc == MDB_SUCCESS); |
8070 | } |
8071 | } |
8072 | |
8073 | return MDB_SUCCESS; |
8074 | } |
8075 | |
/** Merge one page into another.
 * Every node on the page at the top of \b csrc is appended to the page
 * at the top of \b cdst. The parent's pointer to the source page is then
 * removed and the source page is handed to mdb_page_loose().
 * Cursors tracked in the transaction for the same DBI are adjusted so
 * that positions on the source page now refer to the destination page.
 * Finally \b cdst is popped one level and mdb_rebalance() is run on it.
 *
 * Both cursors must have a stack height greater than 1 (asserted),
 * because a root page cannot be merged.
 * @param[in] csrc Cursor pointing to the source page.
 * @param[in] cdst Cursor pointing to the destination page.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
{
	MDB_page *psrc, *pdst;
	MDB_node *srcnode;
	MDB_val key, data;
	unsigned nkeys;
	int rc;
	indx_t i, j;

	psrc = csrc->mc_pg[csrc->mc_top];
	pdst = cdst->mc_pg[cdst->mc_top];

	DPRINTF(("merging page %" Z"u into %" Z"u" , psrc->mp_pgno, pdst->mp_pgno));

	mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */
	mdb_cassert(csrc, cdst->mc_snum > 1);

	/* Mark dst as dirty. */
	if ((rc = mdb_page_touch(cdst)))
		return rc;

	/* get dst page again now that we've touched it. */
	pdst = cdst->mc_pg[cdst->mc_top];

	/* Move all nodes from src to dst.
	 * j is the insertion index in dst and starts just past dst's last
	 * node; nkeys remembers dst's original node count for the cursor
	 * fixups further down.
	 */
	j = nkeys = NUMKEYS(pdst);
	if (IS_LEAF2(psrc)) {
		/* LEAF2 page: every key is md_pad bytes long. Start at
		 * METADATA(psrc) and step forward one key per iteration.
		 */
		key.mv_size = csrc->mc_db->md_pad;
		key.mv_data = METADATA(psrc);
		for (i = 0; i < NUMKEYS(psrc); i++, j++) {
			rc = mdb_node_add(cdst, j, &key, NULL, 0, 0);
			if (rc != MDB_SUCCESS)
				return rc;
			key.mv_data = (char *)key.mv_data + key.mv_size;
		}
	} else {
		for (i = 0; i < NUMKEYS(psrc); i++, j++) {
			srcnode = NODEPTR(psrc, i);
			if (i == 0 && IS_BRANCH(psrc)) {
				/* The first node of a branch page does not use its own
				 * key here. Instead a scratch cursor descends from src
				 * and the first key of the page it lands on is used.
				 */
				MDB_cursor mn;
				MDB_node *s2;
				mdb_cursor_copy(csrc, &mn);
				mn.mc_xcursor = NULL;
				/* must find the lowest key below src */
				rc = mdb_page_search_lowest(&mn);
				if (rc)
					return rc;
				if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
					key.mv_size = mn.mc_db->md_pad;
					key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size);
				} else {
					s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
					key.mv_size = NODEKSZ(s2);
					key.mv_data = NODEKEY(s2);
				}
			} else {
				key.mv_size = srcnode->mn_ksize;
				key.mv_data = NODEKEY(srcnode);
			}

			/* Data, page number and flags are always taken from the
			 * source node itself, even when the key was looked up above.
			 */
			data.mv_size = NODEDSZ(srcnode);
			data.mv_data = NODEDATA(srcnode);
			rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
			if (rc != MDB_SUCCESS)
				return rc;
		}
	}

	DPRINTF(("dst page %" Z"u now has %u keys (%.1f%% filled)" ,
		pdst->mp_pgno, NUMKEYS(pdst),
		(float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10));

	/* Unlink the src page from parent and add to free list.
	 * mc_top is lowered temporarily so that mdb_node_del() and
	 * mdb_update_key() act on the parent level; it is restored on
	 * both the error and the success path.
	 */
	csrc->mc_top--;
	mdb_node_del(csrc, 0);
	if (csrc->mc_ki[csrc->mc_top] == 0) {
		/* The removed pointer was slot 0 of the parent: give the node
		 * that is now in slot 0 a zero-length key.
		 */
		key.mv_size = 0;
		rc = mdb_update_key(csrc, &key);
		if (rc) {
			csrc->mc_top++;
			return rc;
		}
	}
	csrc->mc_top++;

	psrc = csrc->mc_pg[csrc->mc_top];
	/* If not operating on FreeDB, allow this page to be reused
	 * in this txn. Otherwise just add to free list.
	 */
	rc = mdb_page_loose(csrc, psrc);
	if (rc)
		return rc;
	/* Keep the DB's page counters in step with the freed page. */
	if (IS_LEAF(psrc))
		csrc->mc_db->md_leaf_pages--;
	else
		csrc->mc_db->md_branch_pages--;
	{
		/* Adjust other cursors pointing to mp */
		MDB_cursor *m2, *m3;
		MDB_dbi dbi = csrc->mc_dbi;
		unsigned int top = csrc->mc_top;

		for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
			/* When csrc is a sub-cursor (C_SUB), the cursor to fix up
			 * is each tracked cursor's embedded xcursor.
			 */
			if (csrc->mc_flags & C_SUB)
				m3 = &m2->mc_xcursor->mx_cursor;
			else
				m3 = m2;
			if (m3 == csrc) continue;
			if (m3->mc_snum < csrc->mc_snum) continue;
			if (m3->mc_pg[top] == psrc) {
				/* Was on the source page: move it to dst, shift its
				 * index past dst's original nodes, and take dst's
				 * parent index.
				 */
				m3->mc_pg[top] = pdst;
				m3->mc_ki[top] += nkeys;
				m3->mc_ki[top-1] = cdst->mc_ki[top-1];
			} else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] &&
				m3->mc_ki[top-1] > csrc->mc_ki[top-1]) {
				/* Same parent, later slot: one pointer was removed
				 * ahead of it, so its parent index drops by one.
				 */
				m3->mc_ki[top-1]--;
			}
			if (IS_LEAF(psrc))
				XCURSOR_REFRESH(m3, top, m3->mc_pg[top]);
		}
	}
	{
		/* Rebalance one level up from dst. The cursor's stack height is
		 * saved first and restored afterwards, corrected by any change
		 * in md_depth made during the rebalance.
		 */
		unsigned int snum = cdst->mc_snum;
		uint16_t depth = cdst->mc_db->md_depth;
		mdb_cursor_pop(cdst);
		rc = mdb_rebalance(cdst);
		/* Did the tree height change? */
		if (depth != cdst->mc_db->md_depth)
			snum += cdst->mc_db->md_depth - depth;
		cdst->mc_snum = snum;
		cdst->mc_top = snum-1;
	}
	return rc;
}
8221 | |
8222 | /** Copy the contents of a cursor. |
8223 | * @param[in] csrc The cursor to copy from. |
8224 | * @param[out] cdst The cursor to copy to. |
8225 | */ |
8226 | static void |
8227 | mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) |
8228 | { |
8229 | unsigned int i; |
8230 | |
8231 | cdst->mc_txn = csrc->mc_txn; |
8232 | cdst->mc_dbi = csrc->mc_dbi; |
8233 | cdst->mc_db = csrc->mc_db; |
8234 | cdst->mc_dbx = csrc->mc_dbx; |
8235 | cdst->mc_snum = csrc->mc_snum; |
8236 | cdst->mc_top = csrc->mc_top; |
8237 | cdst->mc_flags = csrc->mc_flags; |
8238 | |
8239 | for (i=0; i<csrc->mc_snum; i++) { |
8240 | cdst->mc_pg[i] = csrc->mc_pg[i]; |
8241 | cdst->mc_ki[i] = csrc->mc_ki[i]; |
8242 | } |
8243 | } |
8244 | |
/** Rebalance the tree after a delete operation.
 * Nothing is done if the page at the top of \b mc meets its fill
 * threshold and has at least its minimum number of keys.
 *
 * A root page (cursor stack height below 2) is handled on its own:
 * a sub-page is ignored, an empty root empties the whole tree, and a
 * branch root holding a single pointer is replaced by its only child.
 *
 * For any other page a neighbor under the same parent is chosen: the
 * right one if the page is the parent's first child, otherwise the left
 * one. Either one node is moved over from the neighbor with
 * mdb_node_move(), or the two pages are merged with mdb_page_merge().
 * @param[in] mc Cursor pointing to the page where rebalancing
 * should begin.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_rebalance(MDB_cursor *mc)
{
	MDB_node *node;
	int rc, fromleft;
	unsigned int ptop, minkeys, thresh;
	MDB_cursor mn;
	indx_t oldki;

	/* Branch pages need at least 2 keys and use a fill threshold of 1;
	 * all other pages need 1 key and use FILL_THRESHOLD.
	 */
	if (IS_BRANCH(mc->mc_pg[mc->mc_top])) {
		minkeys = 2;
		thresh = 1;
	} else {
		minkeys = 1;
		thresh = FILL_THRESHOLD;
	}
	DPRINTF(("rebalancing %s page %" Z"u (has %u keys, %.1f%% full)" ,
		IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch" ,
		mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]),
		(float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10));

	if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh &&
		NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) {
		DPRINTF(("no need to rebalance page %" Z"u, above fill threshold" ,
			mdb_dbg_pgno(mc->mc_pg[mc->mc_top])));
		return MDB_SUCCESS;
	}

	/* Only one page on the cursor stack: it is handled as the root. */
	if (mc->mc_snum < 2) {
		MDB_page *mp = mc->mc_pg[0];
		if (IS_SUBP(mp)) {
			DPUTS("Can't rebalance a subpage, ignoring" );
			return MDB_SUCCESS;
		}
		if (NUMKEYS(mp) == 0) {
			/* Last node is gone: reset the DB record to "no tree" and
			 * put the old root page on the txn's free-page list.
			 */
			DPUTS("tree is completely empty" );
			mc->mc_db->md_root = P_INVALID;
			mc->mc_db->md_depth = 0;
			mc->mc_db->md_leaf_pages = 0;
			rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
			if (rc)
				return rc;
			/* Adjust cursors pointing to mp */
			mc->mc_snum = 0;
			mc->mc_top = 0;
			mc->mc_flags &= ~C_INITIALIZED;
			{
				MDB_cursor *m2, *m3;
				MDB_dbi dbi = mc->mc_dbi;

				/* Every other initialized cursor rooted at the freed
				 * page is reset to the same uninitialized state.
				 */
				for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
					if (mc->mc_flags & C_SUB)
						m3 = &m2->mc_xcursor->mx_cursor;
					else
						m3 = m2;
					if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum))
						continue;
					if (m3->mc_pg[0] == mp) {
						m3->mc_snum = 0;
						m3->mc_top = 0;
						m3->mc_flags &= ~C_INITIALIZED;
					}
				}
			}
		} else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
			/* A branch root with a single child: free the root and
			 * make the child the new root, shrinking the tree by one
			 * level.
			 */
			int i;
			DPUTS("collapsing root page!" );
			rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
			if (rc)
				return rc;
			mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0));
			rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL);
			if (rc)
				return rc;
			mc->mc_db->md_depth--;
			mc->mc_db->md_branch_pages--;
			/* Shift this cursor's stack down by one level. mc_pg[0] was
			 * already filled by mdb_page_get() above, so only its index
			 * and the deeper levels are copied.
			 */
			mc->mc_ki[0] = mc->mc_ki[1];
			for (i = 1; i<mc->mc_db->md_depth; i++) {
				mc->mc_pg[i] = mc->mc_pg[i+1];
				mc->mc_ki[i] = mc->mc_ki[i+1];
			}
			{
				/* Adjust other cursors pointing to mp */
				MDB_cursor *m2, *m3;
				MDB_dbi dbi = mc->mc_dbi;

				for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
					if (mc->mc_flags & C_SUB)
						m3 = &m2->mc_xcursor->mx_cursor;
					else
						m3 = m2;
					if (m3 == mc) continue;
					if (!(m3->mc_flags & C_INITIALIZED))
						continue;
					if (m3->mc_pg[0] == mp) {
						/* Drop the old root from this cursor's stack */
						for (i=0; i<mc->mc_db->md_depth; i++) {
							m3->mc_pg[i] = m3->mc_pg[i+1];
							m3->mc_ki[i] = m3->mc_ki[i+1];
						}
						m3->mc_snum--;
						m3->mc_top--;
					}
				}
			}
		} else
			DPUTS("root page doesn't need rebalancing" );
		return MDB_SUCCESS;
	}

	/* The parent (branch page) must have at least 2 pointers,
	 * otherwise the tree is invalid.
	 */
	ptop = mc->mc_top-1;
	mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1);

	/* Leaf page fill factor is below the threshold.
	 * Try to move keys from left or right neighbor, or
	 * merge with a neighbor page.
	 */

	/* Find neighbors.
	 * mn starts as a copy of mc and is then pointed at the neighbor.
	 */
	mdb_cursor_copy(mc, &mn);
	mn.mc_xcursor = NULL;

	/* Remember the caller's index on this page. mc_ki[mc_top] is
	 * overwritten below and restored (adjusted) before returning.
	 */
	oldki = mc->mc_ki[mc->mc_top];
	if (mc->mc_ki[ptop] == 0) {
		/* We're the leftmost leaf in our parent.
		 */
		DPUTS("reading right neighbor" );
		mn.mc_ki[ptop]++;
		node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
		rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL);
		if (rc)
			return rc;
		/* Source is the neighbor's first node; destination is the slot
		 * just past our last node.
		 */
		mn.mc_ki[mn.mc_top] = 0;
		mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
		fromleft = 0;
	} else {
		/* There is at least one neighbor to the left.
		 */
		DPUTS("reading left neighbor" );
		mn.mc_ki[ptop]--;
		node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
		rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL);
		if (rc)
			return rc;
		/* Source is the neighbor's last node; destination is our
		 * first slot.
		 */
		mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
		mc->mc_ki[mc->mc_top] = 0;
		fromleft = 1;
	}

	DPRINTF(("found neighbor page %" Z"u (%u keys, %.1f%% full)" ,
		mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]),
		(float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10));

	/* If the neighbor page is above threshold and has enough keys,
	 * move one key from it. Otherwise we should try to merge them.
	 * (A branch page must never have less than 2 keys.)
	 */
	if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) {
		rc = mdb_node_move(&mn, mc, fromleft);
		if (fromleft) {
			/* if we inserted on left, bump position up */
			oldki++;
		}
	} else {
		if (!fromleft) {
			/* Right neighbor is merged into our page */
			rc = mdb_page_merge(&mn, mc);
		} else {
			/* Our page is merged into the left neighbor, so the saved
			 * index moves past all of the neighbor's nodes.
			 */
			oldki += NUMKEYS(mn.mc_pg[mn.mc_top]);
			/* mc_ki[top] was set to 0 above, so this adds 1 and leaves
			 * mn one past the neighbor's last node.
			 */
			mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
			/* We want mdb_rebalance to find mn when doing fixups */
			WITH_CURSOR_TRACKING(mn,
				rc = mdb_page_merge(mc, &mn));
			/* Continue with the merged (left) page as our position */
			mdb_cursor_copy(&mn, mc);
		}
		mc->mc_flags &= ~C_EOF;
	}
	mc->mc_ki[mc->mc_top] = oldki;
	return rc;
}
8432 | |
/** Complete a delete operation started by #mdb_cursor_del().
 * The node at the cursor position is removed and the DB's entry count
 * is decremented. Other cursors tracked for the same DBI that sit on
 * the same page are adjusted, and the tree is rebalanced with
 * mdb_rebalance(). A second pass then fixes up cursors that are on the
 * cursor's page after the rebalance.
 *
 * On any failure #MDB_TXN_ERROR is set in the transaction flags.
 * @param[in] mc Cursor positioned on the node to delete.
 * @return 0 on success, non-zero on failure.
 */
static int
mdb_cursor_del0(MDB_cursor *mc)
{
	int rc;
	MDB_page *mp;
	indx_t ki;
	unsigned int nkeys;
	MDB_cursor *m2, *m3;
	MDB_dbi dbi = mc->mc_dbi;

	/* Remember where the deleted node was, for the fixups below */
	ki = mc->mc_ki[mc->mc_top];
	mp = mc->mc_pg[mc->mc_top];
	mdb_node_del(mc, mc->mc_db->md_pad);
	mc->mc_db->md_entries--;
	{
		/* Adjust other cursors pointing to mp */
		for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
			/* If mc is a sub-cursor, compare against the tracked
			 * cursor's xcursor rather than the cursor itself.
			 */
			m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
			/* Both the tracked cursor and m3 must be initialized */
			if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED))
				continue;
			if (m3 == mc || m3->mc_snum < mc->mc_snum)
				continue;
			if (m3->mc_pg[mc->mc_top] == mp) {
				if (m3->mc_ki[mc->mc_top] == ki) {
					/* m3 was on the node that was just deleted */
					m3->mc_flags |= C_DEL;
					if (mc->mc_db->md_flags & MDB_DUPSORT) {
						/* Sub-cursor referred into dataset which is gone */
						m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
					}
					continue;
				} else if (m3->mc_ki[mc->mc_top] > ki) {
					/* Nodes after the deleted one moved down a slot */
					m3->mc_ki[mc->mc_top]--;
				}
				XCURSOR_REFRESH(m3, mc->mc_top, mp);
			}
		}
	}
	rc = mdb_rebalance(mc);
	if (rc)
		goto fail;

	/* DB is totally empty now, just bail out.
	 * Other cursors adjustments were already done
	 * by mdb_rebalance and aren't needed here.
	 */
	if (!mc->mc_snum) {
		mc->mc_flags |= C_EOF;
		return rc;
	}

	/* Re-read the page: the rebalance may have left mc on another one */
	mp = mc->mc_pg[mc->mc_top];
	nkeys = NUMKEYS(mp);

	/* Adjust other cursors pointing to mp */
	for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) {
		m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
		if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
			continue;
		if (m3->mc_snum < mc->mc_snum)
			continue;
		/* Unlike the first pass, mc itself is not skipped here */
		if (m3->mc_pg[mc->mc_top] == mp) {
			if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) {
				/* if m3 points past last node in page, find next sibling */
				if (m3->mc_ki[mc->mc_top] >= nkeys) {
					rc = mdb_cursor_sibling(m3, 1);
					if (rc == MDB_NOTFOUND) {
						/* No sibling found: mark EOF, not an error */
						m3->mc_flags |= C_EOF;
						rc = MDB_SUCCESS;
						continue;
					}
					if (rc)
						goto fail;
				}
				if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) {
					MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]);
					/* If this node has dupdata, it may need to be reinited
					 * because its data has moved.
					 * If the xcursor was not initd it must be reinited.
					 * Else if node points to a subDB, nothing is needed.
					 * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset.
					 */
					if (node->mn_flags & F_DUPDATA) {
						if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
							if (!(node->mn_flags & F_SUBDATA))
								m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
						} else {
							mdb_xcursor_init1(m3, node);
							rc = mdb_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL);
							if (rc)
								goto fail;
						}
					}
					m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL;
				}
			}
		}
	}
	mc->mc_flags |= C_DEL;

fail:
	/* Reached on success too; the txn is only flagged when rc is set */
	if (rc)
		mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
	return rc;
}
8538 | |
8539 | int |
8540 | mdb_del(MDB_txn *txn, MDB_dbi dbi, |
8541 | MDB_val *key, MDB_val *data) |
8542 | { |
8543 | if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
8544 | return EINVAL; |
8545 | |
8546 | if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) |
8547 | return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; |
8548 | |
8549 | if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { |
8550 | /* must ignore any data */ |
8551 | data = NULL; |
8552 | } |
8553 | |
8554 | return mdb_del0(txn, dbi, key, data, 0); |
8555 | } |
8556 | |
8557 | static int |
8558 | mdb_del0(MDB_txn *txn, MDB_dbi dbi, |
8559 | MDB_val *key, MDB_val *data, unsigned flags) |
8560 | { |
8561 | MDB_cursor mc; |
8562 | MDB_xcursor mx; |
8563 | MDB_cursor_op op; |
8564 | MDB_val rdata, *xdata; |
8565 | int rc, exact = 0; |
8566 | DKBUF; |
8567 | |
8568 | DPRINTF(("====> delete db %u key [%s]" , dbi, DKEY(key))); |
8569 | |
8570 | mdb_cursor_init(&mc, txn, dbi, &mx); |
8571 | |
8572 | if (data) { |
8573 | op = MDB_GET_BOTH; |
8574 | rdata = *data; |
8575 | xdata = &rdata; |
8576 | } else { |
8577 | op = MDB_SET; |
8578 | xdata = NULL; |
8579 | flags |= MDB_NODUPDATA; |
8580 | } |
8581 | rc = mdb_cursor_set(&mc, key, xdata, op, &exact); |
8582 | if (rc == 0) { |
8583 | /* let mdb_page_split know about this cursor if needed: |
8584 | * delete will trigger a rebalance; if it needs to move |
8585 | * a node from one page to another, it will have to |
8586 | * update the parent's separator key(s). If the new sepkey |
8587 | * is larger than the current one, the parent page may |
8588 | * run out of space, triggering a split. We need this |
8589 | * cursor to be consistent until the end of the rebalance. |
8590 | */ |
8591 | mc.mc_flags |= C_UNTRACK; |
8592 | mc.mc_next = txn->mt_cursors[dbi]; |
8593 | txn->mt_cursors[dbi] = &mc; |
8594 | rc = mdb_cursor_del(&mc, flags); |
8595 | txn->mt_cursors[dbi] = mc.mc_next; |
8596 | } |
8597 | return rc; |
8598 | } |
8599 | |
8600 | /** Split a page and insert a new node. |
8601 | * Set #MDB_TXN_ERROR on failure. |
8602 | * @param[in,out] mc Cursor pointing to the page and desired insertion index. |
8603 | * The cursor will be updated to point to the actual page and index where |
8604 | * the node got inserted after the split. |
8605 | * @param[in] newkey The key for the newly inserted node. |
8606 | * @param[in] newdata The data for the newly inserted node. |
8607 | * @param[in] newpgno The page number, if the new node is a branch node. |
8608 | * @param[in] nflags The #NODE_ADD_FLAGS for the new node. |
8609 | * @return 0 on success, non-zero on failure. |
8610 | */ |
8611 | static int |
8612 | mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, |
8613 | unsigned int nflags) |
8614 | { |
8615 | unsigned int flags; |
8616 | int rc = MDB_SUCCESS, new_root = 0, did_split = 0; |
8617 | indx_t newindx; |
8618 | pgno_t pgno = 0; |
8619 | int i, j, split_indx, nkeys, pmax; |
8620 | MDB_env *env = mc->mc_txn->mt_env; |
8621 | MDB_node *node; |
8622 | MDB_val sepkey, rkey, xdata, *rdata = &xdata; |
8623 | MDB_page *copy = NULL; |
8624 | MDB_page *mp, *rp, *pp; |
8625 | int ptop; |
8626 | MDB_cursor mn; |
8627 | DKBUF; |
8628 | |
8629 | mp = mc->mc_pg[mc->mc_top]; |
8630 | newindx = mc->mc_ki[mc->mc_top]; |
8631 | nkeys = NUMKEYS(mp); |
8632 | |
8633 | DPRINTF(("-----> splitting %s page %" Z"u and adding [%s] at index %i/%i" , |
8634 | IS_LEAF(mp) ? "leaf" : "branch" , mp->mp_pgno, |
8635 | DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); |
8636 | |
8637 | /* Create a right sibling. */ |
8638 | if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) |
8639 | return rc; |
8640 | rp->mp_pad = mp->mp_pad; |
8641 | DPRINTF(("new right sibling: page %" Z"u" , rp->mp_pgno)); |
8642 | |
8643 | /* Usually when splitting the root page, the cursor |
8644 | * height is 1. But when called from mdb_update_key, |
8645 | * the cursor height may be greater because it walks |
8646 | * up the stack while finding the branch slot to update. |
8647 | */ |
8648 | if (mc->mc_top < 1) { |
8649 | if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) |
8650 | goto done; |
8651 | /* shift current top to make room for new parent */ |
8652 | for (i=mc->mc_snum; i>0; i--) { |
8653 | mc->mc_pg[i] = mc->mc_pg[i-1]; |
8654 | mc->mc_ki[i] = mc->mc_ki[i-1]; |
8655 | } |
8656 | mc->mc_pg[0] = pp; |
8657 | mc->mc_ki[0] = 0; |
8658 | mc->mc_db->md_root = pp->mp_pgno; |
8659 | DPRINTF(("root split! new root = %" Z"u" , pp->mp_pgno)); |
8660 | new_root = mc->mc_db->md_depth++; |
8661 | |
8662 | /* Add left (implicit) pointer. */ |
8663 | if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) { |
8664 | /* undo the pre-push */ |
8665 | mc->mc_pg[0] = mc->mc_pg[1]; |
8666 | mc->mc_ki[0] = mc->mc_ki[1]; |
8667 | mc->mc_db->md_root = mp->mp_pgno; |
8668 | mc->mc_db->md_depth--; |
8669 | goto done; |
8670 | } |
8671 | mc->mc_snum++; |
8672 | mc->mc_top++; |
8673 | ptop = 0; |
8674 | } else { |
8675 | ptop = mc->mc_top-1; |
8676 | DPRINTF(("parent branch page is %" Z"u" , mc->mc_pg[ptop]->mp_pgno)); |
8677 | } |
8678 | |
8679 | mdb_cursor_copy(mc, &mn); |
8680 | mn.mc_xcursor = NULL; |
8681 | mn.mc_pg[mn.mc_top] = rp; |
8682 | mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; |
8683 | |
8684 | if (nflags & MDB_APPEND) { |
8685 | mn.mc_ki[mn.mc_top] = 0; |
8686 | sepkey = *newkey; |
8687 | split_indx = newindx; |
8688 | nkeys = 0; |
8689 | } else { |
8690 | |
8691 | split_indx = (nkeys+1) / 2; |
8692 | |
8693 | if (IS_LEAF2(rp)) { |
8694 | char *split, *ins; |
8695 | int x; |
8696 | unsigned int lsize, rsize, ksize; |
8697 | /* Move half of the keys to the right sibling */ |
8698 | x = mc->mc_ki[mc->mc_top] - split_indx; |
8699 | ksize = mc->mc_db->md_pad; |
8700 | split = LEAF2KEY(mp, split_indx, ksize); |
8701 | rsize = (nkeys - split_indx) * ksize; |
8702 | lsize = (nkeys - split_indx) * sizeof(indx_t); |
8703 | mp->mp_lower -= lsize; |
8704 | rp->mp_lower += lsize; |
8705 | mp->mp_upper += rsize - lsize; |
8706 | rp->mp_upper -= rsize - lsize; |
8707 | sepkey.mv_size = ksize; |
8708 | if (newindx == split_indx) { |
8709 | sepkey.mv_data = newkey->mv_data; |
8710 | } else { |
8711 | sepkey.mv_data = split; |
8712 | } |
8713 | if (x<0) { |
8714 | ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); |
8715 | memcpy(rp->mp_ptrs, split, rsize); |
8716 | sepkey.mv_data = rp->mp_ptrs; |
8717 | memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); |
8718 | memcpy(ins, newkey->mv_data, ksize); |
8719 | mp->mp_lower += sizeof(indx_t); |
8720 | mp->mp_upper -= ksize - sizeof(indx_t); |
8721 | } else { |
8722 | if (x) |
8723 | memcpy(rp->mp_ptrs, split, x * ksize); |
8724 | ins = LEAF2KEY(rp, x, ksize); |
8725 | memcpy(ins, newkey->mv_data, ksize); |
8726 | memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); |
8727 | rp->mp_lower += sizeof(indx_t); |
8728 | rp->mp_upper -= ksize - sizeof(indx_t); |
8729 | mc->mc_ki[mc->mc_top] = x; |
8730 | } |
8731 | } else { |
8732 | int psize, nsize, k; |
8733 | /* Maximum free space in an empty page */ |
8734 | pmax = env->me_psize - PAGEHDRSZ; |
8735 | if (IS_LEAF(mp)) |
8736 | nsize = mdb_leaf_size(env, newkey, newdata); |
8737 | else |
8738 | nsize = mdb_branch_size(env, newkey); |
8739 | nsize = EVEN(nsize); |
8740 | |
8741 | /* grab a page to hold a temporary copy */ |
8742 | copy = mdb_page_malloc(mc->mc_txn, 1); |
8743 | if (copy == NULL) { |
8744 | rc = ENOMEM; |
8745 | goto done; |
8746 | } |
8747 | copy->mp_pgno = mp->mp_pgno; |
8748 | copy->mp_flags = mp->mp_flags; |
8749 | copy->mp_lower = (PAGEHDRSZ-PAGEBASE); |
8750 | copy->mp_upper = env->me_psize - PAGEBASE; |
8751 | |
8752 | /* prepare to insert */ |
8753 | for (i=0, j=0; i<nkeys; i++) { |
8754 | if (i == newindx) { |
8755 | copy->mp_ptrs[j++] = 0; |
8756 | } |
8757 | copy->mp_ptrs[j++] = mp->mp_ptrs[i]; |
8758 | } |
8759 | |
8760 | /* When items are relatively large the split point needs |
8761 | * to be checked, because being off-by-one will make the |
8762 | * difference between success or failure in mdb_node_add. |
8763 | * |
8764 | * It's also relevant if a page happens to be laid out |
8765 | * such that one half of its nodes are all "small" and |
8766 | * the other half of its nodes are "large." If the new |
8767 | * item is also "large" and falls on the half with |
8768 | * "large" nodes, it also may not fit. |
8769 | * |
8770 | * As a final tweak, if the new item goes on the last |
8771 | * spot on the page (and thus, onto the new page), bias |
8772 | * the split so the new page is emptier than the old page. |
8773 | * This yields better packing during sequential inserts. |
8774 | */ |
8775 | if (nkeys < 32 || nsize > pmax/16 || newindx >= nkeys) { |
8776 | /* Find split point */ |
8777 | psize = 0; |
8778 | if (newindx <= split_indx || newindx >= nkeys) { |
8779 | i = 0; j = 1; |
8780 | k = newindx >= nkeys ? nkeys : split_indx+1+IS_LEAF(mp); |
8781 | } else { |
8782 | i = nkeys; j = -1; |
8783 | k = split_indx-1; |
8784 | } |
8785 | for (; i!=k; i+=j) { |
8786 | if (i == newindx) { |
8787 | psize += nsize; |
8788 | node = NULL; |
8789 | } else { |
8790 | node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); |
8791 | psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); |
8792 | if (IS_LEAF(mp)) { |
8793 | if (F_ISSET(node->mn_flags, F_BIGDATA)) |
8794 | psize += sizeof(pgno_t); |
8795 | else |
8796 | psize += NODEDSZ(node); |
8797 | } |
8798 | psize = EVEN(psize); |
8799 | } |
8800 | if (psize > pmax || i == k-j) { |
8801 | split_indx = i + (j<0); |
8802 | break; |
8803 | } |
8804 | } |
8805 | } |
8806 | if (split_indx == newindx) { |
8807 | sepkey.mv_size = newkey->mv_size; |
8808 | sepkey.mv_data = newkey->mv_data; |
8809 | } else { |
8810 | node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); |
8811 | sepkey.mv_size = node->mn_ksize; |
8812 | sepkey.mv_data = NODEKEY(node); |
8813 | } |
8814 | } |
8815 | } |
8816 | |
8817 | DPRINTF(("separator is %d [%s]" , split_indx, DKEY(&sepkey))); |
8818 | |
8819 | /* Copy separator key to the parent. |
8820 | */ |
8821 | if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { |
8822 | int snum = mc->mc_snum; |
8823 | mn.mc_snum--; |
8824 | mn.mc_top--; |
8825 | did_split = 1; |
8826 | /* We want other splits to find mn when doing fixups */ |
8827 | WITH_CURSOR_TRACKING(mn, |
8828 | rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); |
8829 | if (rc) |
8830 | goto done; |
8831 | |
8832 | /* root split? */ |
8833 | if (mc->mc_snum > snum) { |
8834 | ptop++; |
8835 | } |
8836 | /* Right page might now have changed parent. |
8837 | * Check if left page also changed parent. |
8838 | */ |
8839 | if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && |
8840 | mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { |
8841 | for (i=0; i<ptop; i++) { |
8842 | mc->mc_pg[i] = mn.mc_pg[i]; |
8843 | mc->mc_ki[i] = mn.mc_ki[i]; |
8844 | } |
8845 | mc->mc_pg[ptop] = mn.mc_pg[ptop]; |
8846 | if (mn.mc_ki[ptop]) { |
8847 | mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; |
8848 | } else { |
8849 | /* find right page's left sibling */ |
8850 | mc->mc_ki[ptop] = mn.mc_ki[ptop]; |
8851 | mdb_cursor_sibling(mc, 0); |
8852 | } |
8853 | } |
8854 | } else { |
8855 | mn.mc_top--; |
8856 | rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); |
8857 | mn.mc_top++; |
8858 | } |
8859 | if (rc != MDB_SUCCESS) { |
8860 | goto done; |
8861 | } |
8862 | if (nflags & MDB_APPEND) { |
8863 | mc->mc_pg[mc->mc_top] = rp; |
8864 | mc->mc_ki[mc->mc_top] = 0; |
8865 | rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); |
8866 | if (rc) |
8867 | goto done; |
8868 | for (i=0; i<mc->mc_top; i++) |
8869 | mc->mc_ki[i] = mn.mc_ki[i]; |
8870 | } else if (!IS_LEAF2(mp)) { |
8871 | /* Move nodes */ |
8872 | mc->mc_pg[mc->mc_top] = rp; |
8873 | i = split_indx; |
8874 | j = 0; |
8875 | do { |
8876 | if (i == newindx) { |
8877 | rkey.mv_data = newkey->mv_data; |
8878 | rkey.mv_size = newkey->mv_size; |
8879 | if (IS_LEAF(mp)) { |
8880 | rdata = newdata; |
8881 | } else |
8882 | pgno = newpgno; |
8883 | flags = nflags; |
8884 | /* Update index for the new key. */ |
8885 | mc->mc_ki[mc->mc_top] = j; |
8886 | } else { |
8887 | node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); |
8888 | rkey.mv_data = NODEKEY(node); |
8889 | rkey.mv_size = node->mn_ksize; |
8890 | if (IS_LEAF(mp)) { |
8891 | xdata.mv_data = NODEDATA(node); |
8892 | xdata.mv_size = NODEDSZ(node); |
8893 | rdata = &xdata; |
8894 | } else |
8895 | pgno = NODEPGNO(node); |
8896 | flags = node->mn_flags; |
8897 | } |
8898 | |
8899 | if (!IS_LEAF(mp) && j == 0) { |
8900 | /* First branch index doesn't need key data. */ |
8901 | rkey.mv_size = 0; |
8902 | } |
8903 | |
8904 | rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); |
8905 | if (rc) |
8906 | goto done; |
8907 | if (i == nkeys) { |
8908 | i = 0; |
8909 | j = 0; |
8910 | mc->mc_pg[mc->mc_top] = copy; |
8911 | } else { |
8912 | i++; |
8913 | j++; |
8914 | } |
8915 | } while (i != split_indx); |
8916 | |
8917 | nkeys = NUMKEYS(copy); |
8918 | for (i=0; i<nkeys; i++) |
8919 | mp->mp_ptrs[i] = copy->mp_ptrs[i]; |
8920 | mp->mp_lower = copy->mp_lower; |
8921 | mp->mp_upper = copy->mp_upper; |
8922 | memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), |
8923 | env->me_psize - copy->mp_upper - PAGEBASE); |
8924 | |
8925 | /* reset back to original page */ |
8926 | if (newindx < split_indx) { |
8927 | mc->mc_pg[mc->mc_top] = mp; |
8928 | } else { |
8929 | mc->mc_pg[mc->mc_top] = rp; |
8930 | mc->mc_ki[ptop]++; |
8931 | /* Make sure mc_ki is still valid. |
8932 | */ |
8933 | if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && |
8934 | mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { |
8935 | for (i=0; i<=ptop; i++) { |
8936 | mc->mc_pg[i] = mn.mc_pg[i]; |
8937 | mc->mc_ki[i] = mn.mc_ki[i]; |
8938 | } |
8939 | } |
8940 | } |
8941 | if (nflags & MDB_RESERVE) { |
8942 | node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); |
8943 | if (!(node->mn_flags & F_BIGDATA)) |
8944 | newdata->mv_data = NODEDATA(node); |
8945 | } |
8946 | } else { |
8947 | if (newindx >= split_indx) { |
8948 | mc->mc_pg[mc->mc_top] = rp; |
8949 | mc->mc_ki[ptop]++; |
8950 | /* Make sure mc_ki is still valid. |
8951 | */ |
8952 | if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && |
8953 | mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { |
8954 | for (i=0; i<=ptop; i++) { |
8955 | mc->mc_pg[i] = mn.mc_pg[i]; |
8956 | mc->mc_ki[i] = mn.mc_ki[i]; |
8957 | } |
8958 | } |
8959 | } |
8960 | } |
8961 | |
8962 | { |
8963 | /* Adjust other cursors pointing to mp */ |
8964 | MDB_cursor *m2, *m3; |
8965 | MDB_dbi dbi = mc->mc_dbi; |
8966 | nkeys = NUMKEYS(mp); |
8967 | |
8968 | for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { |
8969 | if (mc->mc_flags & C_SUB) |
8970 | m3 = &m2->mc_xcursor->mx_cursor; |
8971 | else |
8972 | m3 = m2; |
8973 | if (m3 == mc) |
8974 | continue; |
8975 | if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) |
8976 | continue; |
8977 | if (new_root) { |
8978 | int k; |
8979 | /* sub cursors may be on different DB */ |
8980 | if (m3->mc_pg[0] != mp) |
8981 | continue; |
8982 | /* root split */ |
8983 | for (k=new_root; k>=0; k--) { |
8984 | m3->mc_ki[k+1] = m3->mc_ki[k]; |
8985 | m3->mc_pg[k+1] = m3->mc_pg[k]; |
8986 | } |
8987 | if (m3->mc_ki[0] >= nkeys) { |
8988 | m3->mc_ki[0] = 1; |
8989 | } else { |
8990 | m3->mc_ki[0] = 0; |
8991 | } |
8992 | m3->mc_pg[0] = mc->mc_pg[0]; |
8993 | m3->mc_snum++; |
8994 | m3->mc_top++; |
8995 | } |
8996 | if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { |
8997 | if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) |
8998 | m3->mc_ki[mc->mc_top]++; |
8999 | if (m3->mc_ki[mc->mc_top] >= nkeys) { |
9000 | m3->mc_pg[mc->mc_top] = rp; |
9001 | m3->mc_ki[mc->mc_top] -= nkeys; |
9002 | for (i=0; i<mc->mc_top; i++) { |
9003 | m3->mc_ki[i] = mn.mc_ki[i]; |
9004 | m3->mc_pg[i] = mn.mc_pg[i]; |
9005 | } |
9006 | } |
9007 | } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && |
9008 | m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { |
9009 | m3->mc_ki[ptop]++; |
9010 | } |
9011 | if (IS_LEAF(mp)) |
9012 | XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]); |
9013 | } |
9014 | } |
9015 | DPRINTF(("mp left: %d, rp left: %d" , SIZELEFT(mp), SIZELEFT(rp))); |
9016 | |
9017 | done: |
9018 | if (copy) /* tmp page */ |
9019 | mdb_page_free(env, copy); |
9020 | if (rc) |
9021 | mc->mc_txn->mt_flags |= MDB_TXN_ERROR; |
9022 | return rc; |
9023 | } |
9024 | |
9025 | int |
9026 | mdb_put(MDB_txn *txn, MDB_dbi dbi, |
9027 | MDB_val *key, MDB_val *data, unsigned int flags) |
9028 | { |
9029 | MDB_cursor mc; |
9030 | MDB_xcursor mx; |
9031 | int rc; |
9032 | |
9033 | if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
9034 | return EINVAL; |
9035 | |
9036 | if (flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) |
9037 | return EINVAL; |
9038 | |
9039 | if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED)) |
9040 | return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; |
9041 | |
9042 | mdb_cursor_init(&mc, txn, dbi, &mx); |
9043 | mc.mc_next = txn->mt_cursors[dbi]; |
9044 | txn->mt_cursors[dbi] = &mc; |
9045 | rc = mdb_cursor_put(&mc, key, data, flags); |
9046 | txn->mt_cursors[dbi] = mc.mc_next; |
9047 | return rc; |
9048 | } |
9049 | |
/* Size in bytes of each of the two write buffers used by the compacting
 * copy. May be overridden by defining MDB_WBUF before this point.
 */
#ifndef MDB_WBUF
#define MDB_WBUF	(1024*1024)
#endif
#define MDB_EOF		0x10	/**< #mdb_env_copyfd1() is done reading */
9054 | |
/** State needed for a double-buffering compacting copy.
 *
 * Shared between the provider (#mdb_env_copyfd1() / #mdb_env_cwalk())
 * and the writer thread (#mdb_env_copythr()).
 */
typedef struct mdb_copy {
	MDB_env *mc_env;	/**< Environment being copied */
	MDB_txn *mc_txn;	/**< Read txn used to fetch pages while walking */
	pthread_mutex_t mc_mutex;	/**< Held while changing or waiting on #mc_new */
	pthread_cond_t mc_cond;	/**< Condition variable for #mc_new */
	char *mc_wbuf[2];	/**< The two write buffers, #MDB_WBUF bytes each */
	char *mc_over[2];	/**< Overflow-page tail to write after each buffer */
	int mc_wlen[2];		/**< Bytes currently filled in each write buffer */
	int mc_olen[2];		/**< Length of each overflow tail, 0 if none */
	pgno_t mc_next_pgno;	/**< Page number to give the next copied page */
	HANDLE mc_fd;		/**< Destination file handle */
	int mc_toggle;			/**< Buffer number in provider */
	int mc_new;				/**< (0-2 buffers to write) | (#MDB_EOF at end) */
	/** Error code.  Never cleared if set.  Both threads can set nonzero
	 *	to fail the copy.  Not mutex-protected, LMDB expects atomic int.
	 */
	volatile int mc_error;
} mdb_copy;
9074 | |
/** Dedicated writer thread for compacting copy.
 *
 * Sleeps on #mdb_copy.mc_cond until the provider has handed over at
 * least one buffer (mc_new nonzero), writes that buffer and any
 * overflow-page tail to mc_fd, then returns the buffer by decrementing
 * mc_new and signalling the condition. The loop ends when mc_new holds
 * #MDB_EOF and no buffers.
 *
 * A failed write is stored in mc_error. Once mc_error is nonzero no
 * further data is written, but buffers are still handed back so the
 * provider does not block.
 *
 * @param[in] arg the #mdb_copy control structure.
 * @return always 0; failures are reported through mc_error.
 */
static THREAD_RET ESECT CALL_CONV
mdb_env_copythr(void *arg)
{
	mdb_copy *my = arg;
	char *ptr;
	int toggle = 0, wsize, rc;
#ifdef _WIN32
	DWORD len;
#define DO_WRITE(rc, fd, ptr, w2, len)	rc = WriteFile(fd, ptr, w2, &len, NULL)
#else
	int len;
	/* rc is set to nonzero on success so both variants can be tested alike */
#define DO_WRITE(rc, fd, ptr, w2, len)	len = write(fd, ptr, w2); rc = (len >= 0)
#ifdef SIGPIPE
	/* Block SIGPIPE in this thread; an EPIPE from write() is handled below */
	sigset_t set;
	sigemptyset(&set);
	sigaddset(&set, SIGPIPE);
	if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0)
		my->mc_error = rc;
#endif
#endif

	pthread_mutex_lock(&my->mc_mutex);
	for(;;) {
		/* Wait until the provider hands over a buffer or EOF */
		while (!my->mc_new)
			pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
		if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */
			break;
		wsize = my->mc_wlen[toggle];
		ptr = my->mc_wbuf[toggle];
again:
		rc = MDB_SUCCESS;
		/* Write until the span is done; skipped entirely after an error */
		while (wsize > 0 && !my->mc_error) {
			DO_WRITE(rc, my->mc_fd, ptr, wsize, len);
			if (!rc) {
				rc = ErrCode();
#if defined(SIGPIPE) && !defined(_WIN32)
				if (rc == EPIPE) {
					/* Collect the pending SIGPIPE, otherwise at least OS X
					 * gives it to the process on thread-exit (ITS#8504).
					 */
					int tmp;
					sigwait(&set, &tmp);
				}
#endif
				break;
			} else if (len > 0) {
				/* Partial or full write: advance and keep going */
				rc = MDB_SUCCESS;
				ptr += len;
				wsize -= len;
				continue;
			} else {
				/* The write reported success but transferred nothing */
				rc = EIO;
				break;
			}
		}
		if (rc) {
			my->mc_error = rc;
		}
		/* If there's an overflow page tail, write it too */
		if (my->mc_olen[toggle]) {
			wsize = my->mc_olen[toggle];
			ptr = my->mc_over[toggle];
			/* Clear the length first so the tail is written only once */
			my->mc_olen[toggle] = 0;
			goto again;
		}
		my->mc_wlen[toggle] = 0;
		toggle ^= 1;
		/* Return the empty buffer to provider */
		my->mc_new--;
		pthread_cond_signal(&my->mc_cond);
	}
	pthread_mutex_unlock(&my->mc_mutex);
	return (THREAD_RET)0;
#undef DO_WRITE
}
9151 | |
/** Give buffer and/or #MDB_EOF to writer thread, await unused buffer.
 *
 * Adds \b adjust to mc_new under the mutex, wakes the writer, and then
 * blocks while bit 1 of mc_new is set (both buffers pending). On return
 * the provider's current buffer (mc_toggle) has its length reset to 0.
 *
 * @param[in] my control structure.
 * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending).
 * @return the current value of mc_error (0 if no error was recorded).
 */
static int ESECT
mdb_env_cthr_toggle(mdb_copy *my, int adjust)
{
	pthread_mutex_lock(&my->mc_mutex);
	my->mc_new += adjust;
	pthread_cond_signal(&my->mc_cond);
	while (my->mc_new & 2)		/* both buffers in use */
		pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
	pthread_mutex_unlock(&my->mc_mutex);

	/* Switch buffers only when a buffer was actually handed off (low bit) */
	my->mc_toggle ^= (adjust & 1);
	/* Both threads reset mc_wlen, to be safe from threading errors */
	my->mc_wlen[my->mc_toggle] = 0;
	return my->mc_error;
}
9172 | |
/** Depth-first tree traversal for compacting copy.
 *
 * Walks the tree rooted at \b *pg, appending each page to the current
 * write buffer under a freshly assigned page number (mc_next_pgno) and
 * patching the parent's reference to the new number. Overflow pages and
 * sub-databases referenced from leaf nodes are copied before the leaf
 * itself, so the leaf is written with the updated references.
 * Recurses for #F_SUBDATA nodes.
 *
 * @param[in] my control structure.
 * @param[in,out] pg database root. On success it is replaced by the
 *	root's page number in the copy. Left untouched for an empty DB.
 * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB.
 * @return 0 on success, non-zero on failure.
 */
static int ESECT
mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
{
	MDB_cursor mc = {0};
	MDB_node *ni;
	MDB_page *mo, *mp, *leaf;
	char *buf, *ptr;
	int rc, toggle;
	unsigned int i;

	/* Empty DB, nothing to do */
	if (*pg == P_INVALID)
		return MDB_SUCCESS;

	mc.mc_snum = 1;
	mc.mc_txn = my->mc_txn;

	/* Load the root and descend to the first leaf */
	rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL);
	if (rc)
		return rc;
	rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST);
	if (rc)
		return rc;

	/* Make cursor pages writable: one page of scratch space per stack level */
	buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum);
	if (buf == NULL)
		return ENOMEM;

	/* Copy every level above the top of the stack into the scratch space */
	for (i=0; i<mc.mc_top; i++) {
		mdb_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
		mc.mc_pg[i] = (MDB_page *)ptr;
		ptr += my->mc_env->me_psize;
	}

	/* This is writable space for a leaf page. Usually not needed. */
	leaf = (MDB_page *)ptr;

	toggle = my->mc_toggle;
	while (mc.mc_snum > 0) {
		unsigned n;
		mp = mc.mc_pg[mc.mc_top];
		n = NUMKEYS(mp);

		if (IS_LEAF(mp)) {
			/* LEAF2 pages and DUPDATA sub-DB leaves are not scanned for references */
			if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
				for (i=0; i<n; i++) {
					ni = NODEPTR(mp, i);
					if (ni->mn_flags & F_BIGDATA) {
						MDB_page *omp;
						pgno_t pg;

						/* Need writable leaf */
						if (mp != leaf) {
							mc.mc_pg[mc.mc_top] = leaf;
							mdb_page_copy(leaf, mp, my->mc_env->me_psize);
							mp = leaf;
							ni = NODEPTR(mp, i);
						}

						/* Read the old overflow pgno, store the new one in the node */
						memcpy(&pg, NODEDATA(ni), sizeof(pg));
						memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t));
						rc = mdb_page_get(&mc, pg, &omp, NULL);
						if (rc)
							goto done;
						/* Current buffer full: hand it to the writer, take the other */
						if (my->mc_wlen[toggle] >= MDB_WBUF) {
							rc = mdb_env_cthr_toggle(my, 1);
							if (rc)
								goto done;
							toggle = my->mc_toggle;
						}
						/* First overflow page goes into the buffer, renumbered */
						mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
						memcpy(mo, omp, my->mc_env->me_psize);
						mo->mp_pgno = my->mc_next_pgno;
						my->mc_next_pgno += omp->mp_pages;
						my->mc_wlen[toggle] += my->mc_env->me_psize;
						if (omp->mp_pages > 1) {
							/* Remaining pages are written straight from the
							 * source page as an overflow tail, so the buffer
							 * must be handed off right away.
							 */
							my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
							my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
							rc = mdb_env_cthr_toggle(my, 1);
							if (rc)
								goto done;
							toggle = my->mc_toggle;
						}
					} else if (ni->mn_flags & F_SUBDATA) {
						MDB_db db;

						/* Need writable leaf */
						if (mp != leaf) {
							mc.mc_pg[mc.mc_top] = leaf;
							mdb_page_copy(leaf, mp, my->mc_env->me_psize);
							mp = leaf;
							ni = NODEPTR(mp, i);
						}

						/* Copy the sub-DB first; the recursion updates db.md_root,
						 * which is then stored back into the node.
						 */
						memcpy(&db, NODEDATA(ni), sizeof(db));
						my->mc_toggle = toggle;
						rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA);
						if (rc)
							goto done;
						toggle = my->mc_toggle;
						memcpy(NODEDATA(ni), &db, sizeof(db));
					}
				}
			}
		} else {
			/* Branch page: move to its next child, if any */
			mc.mc_ki[mc.mc_top]++;
			if (mc.mc_ki[mc.mc_top] < n) {
				pgno_t pg;
again:
				ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
				pg = NODEPGNO(ni);
				rc = mdb_page_get(&mc, pg, &mp, NULL);
				if (rc)
					goto done;
				mc.mc_top++;
				mc.mc_snum++;
				mc.mc_ki[mc.mc_top] = 0;
				if (IS_BRANCH(mp)) {
					/* Whenever we advance to a sibling branch page,
					 * we must proceed all the way down to its first leaf.
					 */
					mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
					goto again;
				} else
					mc.mc_pg[mc.mc_top] = mp;
				continue;
			}
		}
		/* Reached for a leaf, or for a branch whose children are all done:
		 * emit the page itself.
		 */
		if (my->mc_wlen[toggle] >= MDB_WBUF) {
			rc = mdb_env_cthr_toggle(my, 1);
			if (rc)
				goto done;
			toggle = my->mc_toggle;
		}
		mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
		mdb_page_copy(mo, mp, my->mc_env->me_psize);
		mo->mp_pgno = my->mc_next_pgno++;
		my->mc_wlen[toggle] += my->mc_env->me_psize;
		if (mc.mc_top) {
			/* Update parent if there is one */
			ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]);
			SETPGNO(ni, mo->mp_pgno);
			mdb_cursor_pop(&mc);
		} else {
			/* Otherwise we're done */
			*pg = mo->mp_pgno;
			break;
		}
	}
done:
	free(buf);
	return rc;
}
9332 | |
/** Copy environment with compaction.
 *
 * Allocates two #MDB_WBUF-sized write buffers, starts the writer thread
 * (#mdb_env_copythr()), builds both meta pages in the first buffer, and
 * then walks the main DB with #mdb_env_cwalk() inside a read-only txn.
 *
 * @param[in] env the environment to copy.
 * @param[in] fd  handle the copy is written to.
 * @return 0 on success, non-zero on failure. If this thread saw no
 *	error, the writer thread's mc_error is returned.
 */
static int ESECT
mdb_env_copyfd1(MDB_env *env, HANDLE fd)
{
	MDB_meta *mm;
	MDB_page *mp;
	mdb_copy my = {0};
	MDB_txn *txn = NULL;
	pthread_t thr;
	pgno_t root, new_root;
	int rc = MDB_SUCCESS;

#ifdef _WIN32
	if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) ||
		!(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) {
		rc = ErrCode();
		goto done;
	}
	my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize);
	if (my.mc_wbuf[0] == NULL) {
		/* _aligned_malloc() sets errno, but we use Windows error codes */
		rc = ERROR_NOT_ENOUGH_MEMORY;
		goto done;
	}
#else
	/* Nothing to clean up yet if the mutex cannot be created */
	if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0)
		return rc;
	/* Only the mutex exists here, so skip the cond/buffer cleanup */
	if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0)
		goto done2;
#ifdef HAVE_MEMALIGN
	my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2);
	if (my.mc_wbuf[0] == NULL) {
		rc = errno;
		goto done;
	}
#else
	{
		void *p;
		if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0)
			goto done;
		my.mc_wbuf[0] = p;
	}
#endif
#endif
	/* One allocation holds both buffers back to back */
	memset(my.mc_wbuf[0], 0, MDB_WBUF*2);
	my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF;
	/* Data pages in the copy are numbered starting after the meta pages */
	my.mc_next_pgno = NUM_METAS;
	my.mc_env = env;
	my.mc_fd = fd;
	rc = THREAD_CREATE(thr, mdb_env_copythr, &my);
	if (rc)
		goto done;

	/* From here on the writer thread exists and must be shut down via finish */
	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
	if (rc)
		goto finish;

	/* Build meta page 0 at the start of the first buffer */
	mp = (MDB_page *)my.mc_wbuf[0];
	memset(mp, 0, NUM_METAS * env->me_psize);
	mp->mp_pgno = 0;
	mp->mp_flags = P_META;
	mm = (MDB_meta *)METADATA(mp);
	mdb_env_init_meta0(env, mm);
	mm->mm_address = env->me_metas[0]->mm_address;

	/* Meta page 1 starts as a copy of meta page 0; mm now refers to page 1 */
	mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize);
	mp->mp_pgno = 1;
	mp->mp_flags = P_META;
	*(MDB_meta *)METADATA(mp) = *mm;
	mm = (MDB_meta *)METADATA(mp);

	/* Set metapage 1 with current main DB */
	root = new_root = txn->mt_dbs[MAIN_DBI].md_root;
	if (root != P_INVALID) {
		/* Count free pages + freeDB pages.  Subtract from last_pg
		 * to find the new last_pg, which also becomes the new root.
		 */
		MDB_ID freecount = 0;
		MDB_cursor mc;
		MDB_val key, data;
		mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
		/* The first MDB_ID of each freeDB record is added to the total */
		while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
			freecount += *(MDB_ID *)data.mv_data;
		if (rc != MDB_NOTFOUND)
			goto finish;
		freecount += txn->mt_dbs[FREE_DBI].md_branch_pages +
			txn->mt_dbs[FREE_DBI].md_leaf_pages +
			txn->mt_dbs[FREE_DBI].md_overflow_pages;

		new_root = txn->mt_next_pgno - 1 - freecount;
		mm->mm_last_pg = new_root;
		mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
		mm->mm_dbs[MAIN_DBI].md_root = new_root;
	} else {
		/* When the DB is empty, handle it specially to
		 * fix any breakage like page leaks from ITS#8174.
		 */
		mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags;
	}
	if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) {
		mm->mm_txnid = 1;		/* use metapage 1 */
	}

	/* The meta pages already occupy the start of buffer 0 */
	my.mc_wlen[0] = env->me_psize * NUM_METAS;
	my.mc_txn = txn;
	rc = mdb_env_cwalk(&my, &root, 0);
	/* The walk must end exactly on the root page number predicted above */
	if (rc == MDB_SUCCESS && root != new_root) {
		rc = MDB_INCOMPATIBLE;	/* page leak or corrupt DB */
	}

finish:
	/* Record any error so the writer skips remaining data, then hand off
	 * the last buffer together with EOF and wait for the thread to end.
	 */
	if (rc)
		my.mc_error = rc;
	mdb_env_cthr_toggle(&my, 1 | MDB_EOF);
	rc = THREAD_FINISH(thr);
	mdb_txn_abort(txn);

done:
#ifdef _WIN32
	if (my.mc_wbuf[0])  _aligned_free(my.mc_wbuf[0]);
	if (my.mc_cond)  CloseHandle(my.mc_cond);
	if (my.mc_mutex) CloseHandle(my.mc_mutex);
#else
	free(my.mc_wbuf[0]);
	pthread_cond_destroy(&my.mc_cond);
done2:
	pthread_mutex_destroy(&my.mc_mutex);
#endif
	return rc ? rc : my.mc_error;
}
9463 | |
/** Copy environment as-is.
 *
 * Writes the meta pages from the memory map to \b fd while the writer
 * mutex is held (when a lock table exists), then writes the rest of the
 * map up to the txn's next page number, capped at the data file's size.
 *
 * @param[in] env the environment to copy.
 * @param[in] fd  handle the copy is written to.
 * @return 0 on success, non-zero on failure.
 */
static int ESECT
mdb_env_copyfd0(MDB_env *env, HANDLE fd)
{
	MDB_txn *txn = NULL;
	mdb_mutexref_t wmutex = NULL;
	int rc;
	size_t wsize, w3;
	char *ptr;
#ifdef _WIN32
	DWORD len, w2;
#define DO_WRITE(rc, fd, ptr, w2, len)	rc = WriteFile(fd, ptr, w2, &len, NULL)
#else
	ssize_t len;
	size_t w2;
	/* rc is set to nonzero on success so both variants can be tested alike */
#define DO_WRITE(rc, fd, ptr, w2, len)	len = write(fd, ptr, w2); rc = (len >= 0)
#endif

	/* Do the lock/unlock of the reader mutex before starting the
	 * write txn.  Otherwise other read txns could block writers.
	 */
	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
	if (rc)
		return rc;

	if (env->me_txns) {
		/* We must start the actual read txn after blocking writers */
		mdb_txn_end(txn, MDB_END_RESET_TMP);

		/* Temporarily block writers until we snapshot the meta pages */
		wmutex = env->me_wmutex;
		if (LOCK_MUTEX(rc, env, wmutex))
			goto leave;

		rc = mdb_txn_renew0(txn);
		if (rc) {
			UNLOCK_MUTEX(wmutex);
			goto leave;
		}
	}

	/* First pass: the meta pages at the start of the map */
	wsize = env->me_psize * NUM_METAS;
	ptr = env->me_map;
	w2 = wsize;
	while (w2 > 0) {
		DO_WRITE(rc, fd, ptr, w2, len);
		if (!rc) {
			rc = ErrCode();
			break;
		} else if (len > 0) {
			/* Partial or full write: advance and keep going */
			rc = MDB_SUCCESS;
			ptr += len;
			w2 -= len;
			continue;
		} else {
			/* Non-blocking or async handles are not supported */
			rc = EIO;
			break;
		}
	}
	/* Meta pages are written (or the write failed): let writers proceed */
	if (wmutex)
		UNLOCK_MUTEX(wmutex);

	if (rc)
		goto leave;

	/* Second pass: everything after the meta pages, limited to what the
	 * data file actually holds.
	 */
	w3 = txn->mt_next_pgno * env->me_psize;
	{
		size_t fsize = 0;
		if ((rc = mdb_fsize(env->me_fd, &fsize)))
			goto leave;
		if (w3 > fsize)
			w3 = fsize;
	}
	wsize = w3 - wsize;
	while (wsize > 0) {
		/* Write in chunks of at most MAX_WRITE bytes */
		if (wsize > MAX_WRITE)
			w2 = MAX_WRITE;
		else
			w2 = wsize;
		DO_WRITE(rc, fd, ptr, w2, len);
		if (!rc) {
			rc = ErrCode();
			break;
		} else if (len > 0) {
			rc = MDB_SUCCESS;
			ptr += len;
			wsize -= len;
			continue;
		} else {
			/* The write reported success but transferred nothing */
			rc = EIO;
			break;
		}
	}

leave:
	mdb_txn_abort(txn);
	return rc;
}
9563 | |
9564 | int ESECT |
9565 | mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned int flags) |
9566 | { |
9567 | if (flags & MDB_CP_COMPACT) |
9568 | return mdb_env_copyfd1(env, fd); |
9569 | else |
9570 | return mdb_env_copyfd0(env, fd); |
9571 | } |
9572 | |
/* Copy the environment to an open handle without compaction:
 * equivalent to mdb_env_copyfd2() with flags 0.
 */
int ESECT
mdb_env_copyfd(MDB_env *env, HANDLE fd)
{
	return mdb_env_copyfd2(env, fd, 0);
}
9578 | |
9579 | int ESECT |
9580 | mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) |
9581 | { |
9582 | int rc; |
9583 | MDB_name fname; |
9584 | HANDLE newfd = INVALID_HANDLE_VALUE; |
9585 | |
9586 | rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname); |
9587 | if (rc == MDB_SUCCESS) { |
9588 | rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd); |
9589 | mdb_fname_destroy(fname); |
9590 | } |
9591 | if (rc == MDB_SUCCESS) { |
9592 | rc = mdb_env_copyfd2(env, newfd, flags); |
9593 | if (close(newfd) < 0 && rc == MDB_SUCCESS) |
9594 | rc = ErrCode(); |
9595 | } |
9596 | return rc; |
9597 | } |
9598 | |
/* Copy the environment to the file named by path without compaction:
 * equivalent to mdb_env_copy2() with flags 0.
 */
int ESECT
mdb_env_copy(MDB_env *env, const char *path)
{
	return mdb_env_copy2(env, path, 0);
}
9604 | |
9605 | int ESECT |
9606 | mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) |
9607 | { |
9608 | if (flag & ~CHANGEABLE) |
9609 | return EINVAL; |
9610 | if (onoff) |
9611 | env->me_flags |= flag; |
9612 | else |
9613 | env->me_flags &= ~flag; |
9614 | return MDB_SUCCESS; |
9615 | } |
9616 | |
9617 | int ESECT |
9618 | mdb_env_get_flags(MDB_env *env, unsigned int *arg) |
9619 | { |
9620 | if (!env || !arg) |
9621 | return EINVAL; |
9622 | |
9623 | *arg = env->me_flags & (CHANGEABLE|CHANGELESS); |
9624 | return MDB_SUCCESS; |
9625 | } |
9626 | |
9627 | int ESECT |
9628 | mdb_env_set_userctx(MDB_env *env, void *ctx) |
9629 | { |
9630 | if (!env) |
9631 | return EINVAL; |
9632 | env->me_userctx = ctx; |
9633 | return MDB_SUCCESS; |
9634 | } |
9635 | |
9636 | void * ESECT |
9637 | mdb_env_get_userctx(MDB_env *env) |
9638 | { |
9639 | return env ? env->me_userctx : NULL; |
9640 | } |
9641 | |
9642 | int ESECT |
9643 | mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) |
9644 | { |
9645 | if (!env) |
9646 | return EINVAL; |
9647 | #ifndef NDEBUG |
9648 | env->me_assert_func = func; |
9649 | #endif |
9650 | return MDB_SUCCESS; |
9651 | } |
9652 | |
9653 | int ESECT |
9654 | mdb_env_get_path(MDB_env *env, const char **arg) |
9655 | { |
9656 | if (!env || !arg) |
9657 | return EINVAL; |
9658 | |
9659 | *arg = env->me_path; |
9660 | return MDB_SUCCESS; |
9661 | } |
9662 | |
9663 | int ESECT |
9664 | mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) |
9665 | { |
9666 | if (!env || !arg) |
9667 | return EINVAL; |
9668 | |
9669 | *arg = env->me_fd; |
9670 | return MDB_SUCCESS; |
9671 | } |
9672 | |
9673 | /** Common code for #mdb_stat() and #mdb_env_stat(). |
9674 | * @param[in] env the environment to operate in. |
9675 | * @param[in] db the #MDB_db record containing the stats to return. |
9676 | * @param[out] arg the address of an #MDB_stat structure to receive the stats. |
9677 | * @return 0, this function always succeeds. |
9678 | */ |
9679 | static int ESECT |
9680 | mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) |
9681 | { |
9682 | arg->ms_psize = env->me_psize; |
9683 | arg->ms_depth = db->md_depth; |
9684 | arg->ms_branch_pages = db->md_branch_pages; |
9685 | arg->ms_leaf_pages = db->md_leaf_pages; |
9686 | arg->ms_overflow_pages = db->md_overflow_pages; |
9687 | arg->ms_entries = db->md_entries; |
9688 | |
9689 | return MDB_SUCCESS; |
9690 | } |
9691 | |
9692 | int ESECT |
9693 | mdb_env_stat(MDB_env *env, MDB_stat *arg) |
9694 | { |
9695 | MDB_meta *meta; |
9696 | |
9697 | if (env == NULL || arg == NULL) |
9698 | return EINVAL; |
9699 | |
9700 | meta = mdb_env_pick_meta(env); |
9701 | |
9702 | return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); |
9703 | } |
9704 | |
9705 | int ESECT |
9706 | mdb_env_info(MDB_env *env, MDB_envinfo *arg) |
9707 | { |
9708 | MDB_meta *meta; |
9709 | |
9710 | if (env == NULL || arg == NULL) |
9711 | return EINVAL; |
9712 | |
9713 | meta = mdb_env_pick_meta(env); |
9714 | arg->me_mapaddr = meta->mm_address; |
9715 | arg->me_last_pgno = meta->mm_last_pg; |
9716 | arg->me_last_txnid = meta->mm_txnid; |
9717 | |
9718 | arg->me_mapsize = env->me_mapsize; |
9719 | arg->me_maxreaders = env->me_maxreaders; |
9720 | arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0; |
9721 | return MDB_SUCCESS; |
9722 | } |
9723 | |
9724 | /** Set the default comparison functions for a database. |
9725 | * Called immediately after a database is opened to set the defaults. |
9726 | * The user can then override them with #mdb_set_compare() or |
9727 | * #mdb_set_dupsort(). |
9728 | * @param[in] txn A transaction handle returned by #mdb_txn_begin() |
9729 | * @param[in] dbi A database handle returned by #mdb_dbi_open() |
9730 | */ |
9731 | static void |
9732 | mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) |
9733 | { |
9734 | uint16_t f = txn->mt_dbs[dbi].md_flags; |
9735 | |
9736 | txn->mt_dbxs[dbi].md_cmp = |
9737 | (f & MDB_REVERSEKEY) ? mdb_cmp_memnr : |
9738 | (f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn; |
9739 | |
9740 | txn->mt_dbxs[dbi].md_dcmp = |
9741 | !(f & MDB_DUPSORT) ? 0 : |
9742 | ((f & MDB_INTEGERDUP) |
9743 | ? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint) |
9744 | : ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn)); |
9745 | } |
9746 | |
/* Open a database handle within a transaction.
 *
 * txn:   transaction to open the database in.
 * name:  database name, or NULL for the main DB.
 * flags: must be within VALID_FLAGS; MDB_CREATE allows creating a
 *        missing named database.
 * dbi:   receives the handle on success.
 *
 * Returns MDB_SUCCESS, or: EINVAL for bad flags, MDB_BAD_TXN for a
 * blocked txn, MDB_DBS_FULL when no slot is available,
 * MDB_INCOMPATIBLE / MDB_NOTFOUND when named DBs cannot be used or the
 * record is not a sub-database, EACCES when creation is requested in a
 * read-only txn, ENOMEM, or an error from the cursor operations.
 */
int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
{
	MDB_val key, data;
	MDB_dbi i;
	MDB_cursor mc;
	MDB_db dummy;
	int rc, dbflag, exact;
	unsigned int unused = 0, seq;
	char *namedup;
	size_t len;

	if (flags & ~VALID_FLAGS)
		return EINVAL;
	if (txn->mt_flags & MDB_TXN_BLOCKED)
		return MDB_BAD_TXN;

	/* main DB? */
	if (!name) {
		*dbi = MAIN_DBI;
		if (flags & PERSISTENT_FLAGS) {
			uint16_t f2 = flags & PERSISTENT_FLAGS;
			/* make sure flag changes get committed */
			if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) {
				txn->mt_dbs[MAIN_DBI].md_flags |= f2;
				txn->mt_flags |= MDB_TXN_DIRTY;
			}
		}
		mdb_default_cmp(txn, MAIN_DBI);
		return MDB_SUCCESS;
	}

	/* The main DB is searched below, so it needs a comparison function */
	if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
		mdb_default_cmp(txn, MAIN_DBI);
	}

	/* Is the DB already open? */
	len = strlen(name);
	for (i=CORE_DBS; i<txn->mt_numdbs; i++) {
		if (!txn->mt_dbxs[i].md_name.mv_size) {
			/* Remember this free slot */
			if (!unused) unused = i;
			continue;
		}
		if (len == txn->mt_dbxs[i].md_name.mv_size &&
			!strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
			*dbi = i;
			return MDB_SUCCESS;
		}
	}

	/* If no free slot and max hit, fail */
	if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs)
		return MDB_DBS_FULL;

	/* Cannot mix named databases with some mainDB flags */
	if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY))
		return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND;

	/* Find the DB info */
	dbflag = DB_NEW|DB_VALID|DB_USRVALID;
	exact = 0;
	key.mv_size = len;
	key.mv_data = (void *)name;
	mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
	rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact);
	if (rc == MDB_SUCCESS) {
		/* make sure this is actually a DB */
		MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
		if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
			return MDB_INCOMPATIBLE;
	} else {
		/* Only "not found, and creation requested" continues from here */
		if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE))
			return rc;
		if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
			return EACCES;
	}

	/* Done here so we cannot fail after creating a new DB */
	if ((namedup = strdup(name)) == NULL)
		return ENOMEM;

	if (rc) {
		/* MDB_NOTFOUND and MDB_CREATE: Create new DB */
		data.mv_size = sizeof(MDB_db);
		data.mv_data = &dummy;
		memset(&dummy, 0, sizeof(dummy));
		dummy.md_root = P_INVALID;
		dummy.md_flags = flags & PERSISTENT_FLAGS;
		WITH_CURSOR_TRACKING(mc,
			rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA));
		dbflag |= DB_DIRTY;
	}

	if (rc) {
		free(namedup);
	} else {
		/* Got info, register DBI in this txn */
		/* Reuse a free slot if one was seen, otherwise append a new one */
		unsigned int slot = unused ? unused : txn->mt_numdbs;
		txn->mt_dbxs[slot].md_name.mv_data = namedup;
		txn->mt_dbxs[slot].md_name.mv_size = len;
		txn->mt_dbxs[slot].md_rel = NULL;
		txn->mt_dbflags[slot] = dbflag;
		/* txn-> and env-> are the same in read txns, use
		 * tmp variable to avoid undefined assignment
		 */
		seq = ++txn->mt_env->me_dbiseqs[slot];
		txn->mt_dbiseqs[slot] = seq;

		/* data points at the stored record, or at dummy for a new DB */
		memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
		*dbi = slot;
		mdb_default_cmp(txn, slot);
		if (!unused) {
			txn->mt_numdbs++;
		}
	}

	return rc;
}
9865 | |
9866 | int ESECT |
9867 | mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) |
9868 | { |
9869 | if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID)) |
9870 | return EINVAL; |
9871 | |
9872 | if (txn->mt_flags & MDB_TXN_BLOCKED) |
9873 | return MDB_BAD_TXN; |
9874 | |
9875 | if (txn->mt_dbflags[dbi] & DB_STALE) { |
9876 | MDB_cursor mc; |
9877 | MDB_xcursor mx; |
9878 | /* Stale, must read the DB's root. cursor_init does it for us. */ |
9879 | mdb_cursor_init(&mc, txn, dbi, &mx); |
9880 | } |
9881 | return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); |
9882 | } |
9883 | |
9884 | void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) |
9885 | { |
9886 | char *ptr; |
9887 | if (dbi < CORE_DBS || dbi >= env->me_maxdbs) |
9888 | return; |
9889 | ptr = env->me_dbxs[dbi].md_name.mv_data; |
9890 | /* If there was no name, this was already closed */ |
9891 | if (ptr) { |
9892 | env->me_dbxs[dbi].md_name.mv_data = NULL; |
9893 | env->me_dbxs[dbi].md_name.mv_size = 0; |
9894 | env->me_dbflags[dbi] = 0; |
9895 | env->me_dbiseqs[dbi]++; |
9896 | free(ptr); |
9897 | } |
9898 | } |
9899 | |
9900 | int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) |
9901 | { |
9902 | /* We could return the flags for the FREE_DBI too but what's the point? */ |
9903 | if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
9904 | return EINVAL; |
9905 | *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; |
9906 | return MDB_SUCCESS; |
9907 | } |
9908 | |
/** Add all the DB's pages to the free list.
 *
 * Walks the tree level by level and appends page numbers to the
 * transaction's mt_free_pgs list: overflow page ranges referenced by
 * F_BIGDATA leaf nodes, the children referenced by branch nodes, and
 * finally the root page itself. When \b subs is set, each F_SUBDATA leaf
 * node is dropped recursively through the cursor's xcursor.
 *
 * If an error occurs after the initial search succeeded, MDB_TXN_ERROR is
 * set in the transaction flags. C_INITIALIZED is always cleared on the
 * cursor before returning.
 *
 * @param[in] mc Cursor on the DB to free.
 * @param[in] subs non-Zero to check for sub-DBs in this DB.
 * @return 0 on success, non-zero on failure. An MDB_NOTFOUND result from
 * the initial search is reported as success.
 */
static int
mdb_drop0(MDB_cursor *mc, int subs)
{
	int rc;

	/* Build the cursor's page stack; the comment below implies the top
	 * of the stack is a leaf page once this succeeds.
	 */
	rc = mdb_page_search(mc, NULL, MDB_PS_FIRST);
	if (rc == MDB_SUCCESS) {
		MDB_txn *txn = mc->mc_txn;
		MDB_node *ni;
		MDB_cursor mx;
		unsigned int i;

		/* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves.
		 * This also avoids any P_LEAF2 pages, which have no nodes.
		 * Also if the DB doesn't have sub-DBs and has no overflow
		 * pages, omit scanning leaves.
		 */
		if ((mc->mc_flags & C_SUB) ||
			(!subs && !mc->mc_db->md_overflow_pages))
			mdb_cursor_pop(mc);

		/* Keep a copy of the cursor: its mc_pg[] entries are used to
		 * restore the page stack each time we move up a level (see pop:).
		 */
		mdb_cursor_copy(mc, &mx);
		while (mc->mc_snum > 0) {
			MDB_page *mp = mc->mc_pg[mc->mc_top];
			unsigned n = NUMKEYS(mp);
			if (IS_LEAF(mp)) {
				for (i=0; i<n; i++) {
					ni = NODEPTR(mp, i);
					if (ni->mn_flags & F_BIGDATA) {
						MDB_page *omp;
						pgno_t pg;
						/* The node's data holds the page number of
						 * its overflow page; fetch that page to
						 * learn how many pages the range spans.
						 */
						memcpy(&pg, NODEDATA(ni), sizeof(pg));
						rc = mdb_page_get(mc, pg, &omp, NULL);
						if (rc != 0)
							goto done;
						mdb_cassert(mc, IS_OVERFLOW(omp));
						rc = mdb_midl_append_range(&txn->mt_free_pgs,
							pg, omp->mp_pages);
						if (rc)
							goto done;
						mc->mc_db->md_overflow_pages -= omp->mp_pages;
						/* Nothing left to find in the leaves once
						 * every overflow page is accounted for and
						 * there are no sub-DBs to visit.
						 */
						if (!mc->mc_db->md_overflow_pages && !subs)
							break;
					} else if (subs && (ni->mn_flags & F_SUBDATA)) {
						/* Drop the nested DB; subs=0 for the recursion */
						mdb_xcursor_init1(mc, ni);
						rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
						if (rc)
							goto done;
					}
				}
				/* Leaf scanning is no longer needed: skip the remaining
				 * leaf siblings and move up a level (jumps into the
				 * sibling-exhausted branch below).
				 */
				if (!subs && !mc->mc_db->md_overflow_pages)
					goto pop;
			} else {
				/* Branch page: reserve room for n entries up front so
				 * the xappend calls below need no further checks.
				 */
				if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0)
					goto done;
				for (i=0; i<n; i++) {
					pgno_t pg;
					ni = NODEPTR(mp, i);
					pg = NODEPGNO(ni);
					/* free it */
					mdb_midl_xappend(txn->mt_free_pgs, pg);
				}
			}
			/* Stack is down to its single bottom entry: traversal done */
			if (!mc->mc_top)
				break;
			/* Record how far this page was processed before asking for
			 * the next sibling at this level.
			 */
			mc->mc_ki[mc->mc_top] = i;
			rc = mdb_cursor_sibling(mc, 1);
			if (rc) {
				if (rc != MDB_NOTFOUND)
					goto done;
				/* no more siblings, go back to beginning
				 * of previous level.
				 */
pop:
				mdb_cursor_pop(mc);
				/* Reset every remaining index to 0 and restore the page
				 * pointers above level 0 from the saved cursor copy.
				 */
				mc->mc_ki[0] = 0;
				for (i=1; i<mc->mc_snum; i++) {
					mc->mc_ki[i] = 0;
					mc->mc_pg[i] = mx.mc_pg[i];
				}
			}
		}
		/* free it */
		rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root);
done:
		/* Any failure past the initial search marks the txn as failed */
		if (rc)
			txn->mt_flags |= MDB_TXN_ERROR;
	} else if (rc == MDB_NOTFOUND) {
		/* Search found nothing, so there is nothing to free */
		rc = MDB_SUCCESS;
	}
	mc->mc_flags &= ~C_INITIALIZED;
	return rc;
}
10007 | |
10008 | int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) |
10009 | { |
10010 | MDB_cursor *mc, *m2; |
10011 | int rc; |
10012 | |
10013 | if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
10014 | return EINVAL; |
10015 | |
10016 | if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) |
10017 | return EACCES; |
10018 | |
10019 | if (TXN_DBI_CHANGED(txn, dbi)) |
10020 | return MDB_BAD_DBI; |
10021 | |
10022 | rc = mdb_cursor_open(txn, dbi, &mc); |
10023 | if (rc) |
10024 | return rc; |
10025 | |
10026 | rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); |
10027 | /* Invalidate the dropped DB's cursors */ |
10028 | for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) |
10029 | m2->mc_flags &= ~(C_INITIALIZED|C_EOF); |
10030 | if (rc) |
10031 | goto leave; |
10032 | |
10033 | /* Can't delete the main DB */ |
10034 | if (del && dbi >= CORE_DBS) { |
10035 | rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); |
10036 | if (!rc) { |
10037 | txn->mt_dbflags[dbi] = DB_STALE; |
10038 | mdb_dbi_close(txn->mt_env, dbi); |
10039 | } else { |
10040 | txn->mt_flags |= MDB_TXN_ERROR; |
10041 | } |
10042 | } else { |
10043 | /* reset the DB record, mark it dirty */ |
10044 | txn->mt_dbflags[dbi] |= DB_DIRTY; |
10045 | txn->mt_dbs[dbi].md_depth = 0; |
10046 | txn->mt_dbs[dbi].md_branch_pages = 0; |
10047 | txn->mt_dbs[dbi].md_leaf_pages = 0; |
10048 | txn->mt_dbs[dbi].md_overflow_pages = 0; |
10049 | txn->mt_dbs[dbi].md_entries = 0; |
10050 | txn->mt_dbs[dbi].md_root = P_INVALID; |
10051 | |
10052 | txn->mt_flags |= MDB_TXN_DIRTY; |
10053 | } |
10054 | leave: |
10055 | mdb_cursor_close(mc); |
10056 | return rc; |
10057 | } |
10058 | |
10059 | int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) |
10060 | { |
10061 | if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
10062 | return EINVAL; |
10063 | |
10064 | txn->mt_dbxs[dbi].md_cmp = cmp; |
10065 | return MDB_SUCCESS; |
10066 | } |
10067 | |
10068 | int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) |
10069 | { |
10070 | if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
10071 | return EINVAL; |
10072 | |
10073 | txn->mt_dbxs[dbi].md_dcmp = cmp; |
10074 | return MDB_SUCCESS; |
10075 | } |
10076 | |
10077 | int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) |
10078 | { |
10079 | if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
10080 | return EINVAL; |
10081 | |
10082 | txn->mt_dbxs[dbi].md_rel = rel; |
10083 | return MDB_SUCCESS; |
10084 | } |
10085 | |
10086 | int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) |
10087 | { |
10088 | if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)) |
10089 | return EINVAL; |
10090 | |
10091 | txn->mt_dbxs[dbi].md_relctx = ctx; |
10092 | return MDB_SUCCESS; |
10093 | } |
10094 | |
10095 | int ESECT |
10096 | mdb_env_get_maxkeysize(MDB_env *env) |
10097 | { |
10098 | return ENV_MAXKEY(env); |
10099 | } |
10100 | |
10101 | int ESECT |
10102 | mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) |
10103 | { |
10104 | unsigned int i, rdrs; |
10105 | MDB_reader *mr; |
10106 | char buf[64]; |
10107 | int rc = 0, first = 1; |
10108 | |
10109 | if (!env || !func) |
10110 | return -1; |
10111 | if (!env->me_txns) { |
10112 | return func("(no reader locks)\n" , ctx); |
10113 | } |
10114 | rdrs = env->me_txns->mti_numreaders; |
10115 | mr = env->me_txns->mti_readers; |
10116 | for (i=0; i<rdrs; i++) { |
10117 | if (mr[i].mr_pid) { |
10118 | txnid_t txnid = mr[i].mr_txnid; |
10119 | sprintf(buf, txnid == (txnid_t)-1 ? |
10120 | "%10d %" Z"x -\n" : "%10d %" Z"x %" Z"u\n" , |
10121 | (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); |
10122 | if (first) { |
10123 | first = 0; |
10124 | rc = func(" pid thread txnid\n" , ctx); |
10125 | if (rc < 0) |
10126 | break; |
10127 | } |
10128 | rc = func(buf, ctx); |
10129 | if (rc < 0) |
10130 | break; |
10131 | } |
10132 | } |
10133 | if (first) { |
10134 | rc = func("(no active readers)\n" , ctx); |
10135 | } |
10136 | return rc; |
10137 | } |
10138 | |
/** Insert pid into list if not already present.
 *
 * \b ids is a sorted list stored in place: ids[0] holds the number of
 * entries and ids[1..ids[0]] hold the pids in ascending order. The caller
 * must provide room for one more entry.
 *
 * @param[in,out] ids the sorted pid list
 * @param[in] pid the pid to add
 * @return 0 if \b pid was inserted, -1 if already present.
 */
static int ESECT
mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
{
	/* binary search of pid in list */
	unsigned base = 0;
	unsigned cursor = 1;
	int val = 0;
	unsigned n = ids[0];

	while( 0 < n ) {
		unsigned pivot = n >> 1;
		MDB_PID_T probe;

		cursor = base + pivot + 1;
		probe = ids[cursor];
		/* Three-way compare without subtracting: a difference of two
		 * pids can overflow, or be truncated when stored in an int,
		 * which would yield the wrong sign and break the ordering.
		 */
		val = (pid > probe) - (pid < probe);

		if( val < 0 ) {
			n = pivot;

		} else if ( val > 0 ) {
			base = cursor;
			n -= pivot + 1;

		} else {
			/* found, so it's a duplicate */
			return -1;
		}
	}

	/* cursor is the last slot probed; insert after it if pid is larger */
	if( val > 0 ) {
		++cursor;
	}
	ids[0]++;
	/* Shift the tail up by one to open the slot at cursor */
	for (n = ids[0]; n > cursor; n--)
		ids[n] = ids[n-1];
	ids[n] = pid;
	return 0;
}
10178 | |
10179 | int ESECT |
10180 | mdb_reader_check(MDB_env *env, int *dead) |
10181 | { |
10182 | if (!env) |
10183 | return EINVAL; |
10184 | if (dead) |
10185 | *dead = 0; |
10186 | return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; |
10187 | } |
10188 | |
/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex.
 *
 * Scans the reader table for slots owned by other processes, checks each
 * distinct pid once, and zeroes the mr_pid of every slot whose owner is
 * found to be stale.
 *
 * @param[in] env the environment handle; env->me_txns must be non-NULL
 * @param[in] rlocked non-zero if the caller already holds #me_rmutex, in
 * which case no locking is done here
 * @param[out] dead if non-NULL, receives the number of slots cleared by
 * this call
 * @return MDB_SUCCESS, ENOMEM if the scratch pid list cannot be
 * allocated, or the error from mdb_mutex_failed() if taking the reader
 * mutex failed and could not be recovered.
 */
static int ESECT
mdb_reader_check0(MDB_env *env, int rlocked, int *dead)
{
	/* NULL means "do not lock": the caller already holds the mutex */
	mdb_mutexref_t rmutex = rlocked ? NULL : env->me_rmutex;
	unsigned int i, j, rdrs;
	MDB_reader *mr;
	MDB_PID_T *pids, pid;
	int rc = MDB_SUCCESS, count = 0;

	rdrs = env->me_txns->mti_numreaders;
	/* One extra element: pids[0] holds the list length */
	pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
	if (!pids)
		return ENOMEM;
	pids[0] = 0;
	mr = env->me_txns->mti_readers;
	for (i=0; i<rdrs; i++) {
		pid = mr[i].mr_pid;
		/* Skip unused slots and slots owned by this process */
		if (pid && pid != env->me_pid) {
			/* 0 means the pid was not yet in the list, so each
			 * distinct pid is examined only once.
			 */
			if (mdb_pid_insert(pids, pid) == 0) {
				if (!mdb_reader_pid(env, Pidcheck, pid)) {
					/* Stale reader found */
					j = i;
					if (rmutex) {
						if ((rc = LOCK_MUTEX0(rmutex)) != 0) {
							/* Lock failed: try recovery. If that fails
							 * too, give up; the mutex is not held here
							 * so no unlock is needed.
							 */
							if ((rc = mdb_mutex_failed(env, rmutex, rc)))
								break;
							rdrs = 0; /* the above checked all readers */
						} else {
							/* Recheck, a new process may have reused pid */
							if (mdb_reader_pid(env, Pidcheck, pid))
								j = rdrs;
						}
					}
					/* Clear this and every later slot held by the pid */
					for (; j<rdrs; j++)
						if (mr[j].mr_pid == pid) {
							DPRINTF(("clear stale reader pid %u txn %" Z"d" ,
								(unsigned) pid, mr[j].mr_txnid));
							mr[j].mr_pid = 0;
							count++;
						}
					if (rmutex)
						UNLOCK_MUTEX(rmutex);
				}
			}
		}
	}
	free(pids);
	if (dead)
		*dead = count;
	return rc;
}
10241 | |
10242 | #ifdef MDB_ROBUST_SUPPORTED |
/** Handle #LOCK_MUTEX0() failure.
 * Try to repair the lock file if the mutex owner died.
 *
 * For #MDB_OWNERDEAD the caller now owns the mutex: the reader table is
 * checked for stale entries and the mutex is marked consistent. If the
 * mutex is not the reader mutex, mti_txnid is first refreshed from the
 * current meta page, and an active env->me_txn makes the environment
 * fatally broken (MDB_PANIC). Any other error is only logged (on Windows
 * it is replaced by ErrCode()).
 *
 * @param[in] env the environment handle
 * @param[in] mutex LOCK_MUTEX0() mutex
 * @param[in] rc LOCK_MUTEX0() error (nonzero)
 * @return 0 on success with the mutex locked, or an error code on failure.
 * When recovery from a dead owner fails, the mutex is unlocked again
 * before returning.
 */
static int ESECT
mdb_mutex_failed(MDB_env *env, mdb_mutexref_t mutex, int rc)
{
	int rlocked, rc2;
	MDB_meta *meta;

	if (rc == MDB_OWNERDEAD) {
		/* We own the mutex. Clean up after dead previous owner. */
		rc = MDB_SUCCESS;
		/* Tells mdb_reader_check0() below whether the reader mutex is
		 * the one already held, so it does not try to lock it again.
		 */
		rlocked = (mutex == env->me_rmutex);
		if (!rlocked) {
			/* Keep mti_txnid updated, otherwise next writer can
			 * overwrite data which latest meta page refers to.
			 */
			meta = mdb_env_pick_meta(env);
			env->me_txns->mti_txnid = meta->mm_txnid;
			/* env is hosed if the dead thread was ours */
			if (env->me_txn) {
				env->me_flags |= MDB_FATAL_ERROR;
				env->me_txn = NULL;
				rc = MDB_PANIC;
			}
		}
		DPRINTF(("%cmutex owner died, %s" , (rlocked ? 'r' : 'w'),
			(rc ? "this process' env is hosed" : "recovering" )));
		/* Clean the reader table even when rc is already MDB_PANIC */
		rc2 = mdb_reader_check0(env, rlocked, NULL);
		if (rc2 == 0)
			rc2 = mdb_mutex_consistent(mutex);
		/* An earlier error in rc takes precedence; otherwise adopt the
		 * result of the cleanup steps. Either way, a failure releases
		 * the mutex.
		 */
		if (rc || (rc = rc2)) {
			DPRINTF(("LOCK_MUTEX recovery failed, %s" , mdb_strerror(rc)));
			UNLOCK_MUTEX(mutex);
		}
	} else {
#ifdef _WIN32
		rc = ErrCode();
#endif
		DPRINTF(("LOCK_MUTEX failed, %s" , mdb_strerror(rc)));
	}

	return rc;
}
10291 | #endif /* MDB_ROBUST_SUPPORTED */ |
10292 | |
10293 | #if defined(_WIN32) |
10294 | /** Convert \b src to new wchar_t[] string with room for \b xtra extra chars */ |
10295 | static int ESECT |
10296 | utf8_to_utf16(const char *src, MDB_name *dst, int xtra) |
10297 | { |
10298 | int rc, need = 0; |
10299 | wchar_t *result = NULL; |
10300 | for (;;) { /* malloc result, then fill it in */ |
10301 | need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need); |
10302 | if (!need) { |
10303 | rc = ErrCode(); |
10304 | free(result); |
10305 | return rc; |
10306 | } |
10307 | if (!result) { |
10308 | result = malloc(sizeof(wchar_t) * (need + xtra)); |
10309 | if (!result) |
10310 | return ENOMEM; |
10311 | continue; |
10312 | } |
10313 | dst->mn_alloced = 1; |
10314 | dst->mn_len = need - 1; |
10315 | dst->mn_val = result; |
10316 | return MDB_SUCCESS; |
10317 | } |
10318 | } |
10319 | #endif /* defined(_WIN32) */ |
10320 | /** @} */ |
10321 | |