1 | /* Maxmemory directive handling (LRU eviction and other policies). |
2 | * |
3 | * ---------------------------------------------------------------------------- |
4 | * |
5 | * Copyright (c) 2009-2016, Salvatore Sanfilippo <antirez at gmail dot com> |
6 | * All rights reserved. |
7 | * |
8 | * Redistribution and use in source and binary forms, with or without |
9 | * modification, are permitted provided that the following conditions are met: |
10 | * |
11 | * * Redistributions of source code must retain the above copyright notice, |
12 | * this list of conditions and the following disclaimer. |
13 | * * Redistributions in binary form must reproduce the above copyright |
14 | * notice, this list of conditions and the following disclaimer in the |
15 | * documentation and/or other materials provided with the distribution. |
16 | * * Neither the name of Redis nor the names of its contributors may be used |
17 | * to endorse or promote products derived from this software without |
18 | * specific prior written permission. |
19 | * |
20 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
21 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
22 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
23 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
24 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
25 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
26 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
27 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
28 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
29 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
30 | * POSSIBILITY OF SUCH DAMAGE. |
31 | */ |
32 | |
33 | #include "server.h" |
34 | #include "bio.h" |
35 | #include "atomicvar.h" |
36 | #include "script.h" |
37 | #include <math.h> |
38 | |
39 | /* ---------------------------------------------------------------------------- |
40 | * Data structures |
41 | * --------------------------------------------------------------------------*/ |
42 | |
/* To improve the quality of the LRU approximation we keep a pool of keys
 * that are good candidates for eviction across performEvictions() calls.
 *
 * Entries inside the eviction pool are ordered by idle time, with greater
 * idle times to the right (ascending order).
 *
 * When an LFU policy is used instead, an inverse frequency indication is
 * used in place of the idle time, so that we still evict the entry with the
 * largest value (a larger inverse frequency means the key is accessed less
 * frequently).
 *
 * Empty entries have the key pointer set to NULL. */
54 | #define EVPOOL_SIZE 16 |
55 | #define EVPOOL_CACHED_SDS_SIZE 255 |
56 | struct evictionPoolEntry { |
57 | unsigned long long idle; /* Object idle time (inverse frequency for LFU) */ |
58 | sds key; /* Key name. */ |
59 | sds cached; /* Cached SDS object for key name. */ |
60 | int dbid; /* Key DB number. */ |
61 | }; |
62 | |
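/* The eviction pool is a single global array of EVPOOL_SIZE entries shared
 * across all DBs (hence the dbid field in each entry). It is allocated by
 * evictionPoolAlloc(), normally once at server startup. */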
63 | static struct evictionPoolEntry *EvictionPoolLRU; |
64 | |
65 | /* ---------------------------------------------------------------------------- |
66 | * Implementation of eviction, aging and LRU |
67 | * --------------------------------------------------------------------------*/ |
68 | |
69 | /* Return the LRU clock, based on the clock resolution. This is a time |
70 | * in a reduced-bits format that can be used to set and check the |
71 | * object->lru field of redisObject structures. */ |
72 | unsigned int getLRUClock(void) { |
73 | return (mstime()/LRU_CLOCK_RESOLUTION) & LRU_CLOCK_MAX; |
74 | } |
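
/* For example, with LRU_CLOCK_RESOLUTION at 1000 ms the clock advances by
 * one unit per second, and since object->lru only stores LRU_BITS (24) bits
 * it wraps around roughly every 194 days; the wrap-around is handled by
 * estimateObjectIdleTime() below. */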
75 | |
/* This function is used to obtain the current LRU clock.
 * If the cached LRU clock is refreshed at least once per clock resolution
 * period (as it should be in production servers), we return the precomputed
 * value; otherwise we need to resort to a system call. */
80 | unsigned int LRU_CLOCK(void) { |
81 | unsigned int lruclock; |
82 | if (1000/server.hz <= LRU_CLOCK_RESOLUTION) { |
83 | atomicGet(server.lruclock,lruclock); |
84 | } else { |
85 | lruclock = getLRUClock(); |
86 | } |
87 | return lruclock; |
88 | } |
89 | |
/* Given an object, return the minimum number of milliseconds for which the
 * object was not requested, according to the approximated LRU algorithm. */
92 | unsigned long long estimateObjectIdleTime(robj *o) { |
93 | unsigned long long lruclock = LRU_CLOCK(); |
94 | if (lruclock >= o->lru) { |
95 | return (lruclock - o->lru) * LRU_CLOCK_RESOLUTION; |
96 | } else { |
97 | return (lruclock + (LRU_CLOCK_MAX - o->lru)) * |
98 | LRU_CLOCK_RESOLUTION; |
99 | } |
100 | } |
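
/* For example, if the LRU clock has wrapped around so that lruclock == 5
 * while o->lru == LRU_CLOCK_MAX - 2, the second branch above yields
 * (5 + 2) * LRU_CLOCK_RESOLUTION, i.e. an idle time of 7 clock ticks. */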
101 | |
/* LRU approximation algorithm
 *
 * Redis uses an approximation of the LRU algorithm that runs in constant
 * memory. Every time there is a key to evict, we sample N keys (with
 * N very small, usually around 5) to populate a pool of M best eviction
 * candidates (the pool size M is defined by EVPOOL_SIZE).
 *
 * The N sampled keys are added to the pool of good eviction candidates
 * (the ones with an old access time) if they are better than one of the
 * current keys in the pool.
 *
 * After the pool is populated, the best key we have in the pool is evicted.
 * However note that we don't remove keys from the pool when they are
 * deleted, so the pool may contain keys that no longer exist.
 *
 * When we try to evict a key and all the entries in the pool no longer
 * exist, we populate it again. This time we'll be sure that the pool has
 * at least one key that can be evicted, if there is at least one key that
 * can be evicted in the whole database. */
121 | |
122 | /* Create a new eviction pool. */ |
123 | void evictionPoolAlloc(void) { |
124 | struct evictionPoolEntry *ep; |
125 | int j; |
126 | |
127 | ep = zmalloc(sizeof(*ep)*EVPOOL_SIZE); |
128 | for (j = 0; j < EVPOOL_SIZE; j++) { |
129 | ep[j].idle = 0; |
130 | ep[j].key = NULL; |
131 | ep[j].cached = sdsnewlen(NULL,EVPOOL_CACHED_SDS_SIZE); |
132 | ep[j].dbid = 0; |
133 | } |
134 | EvictionPoolLRU = ep; |
135 | } |
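
/* Note: each pool entry pre-allocates an EVPOOL_CACHED_SDS_SIZE byte SDS
 * buffer so that, in the common case of short key names, inserting a key
 * into the pool does not require a new allocation (see the reuse logic in
 * evictionPoolPopulate() below). */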
136 | |
/* This is a helper function for performEvictions(), it is used in order
 * to populate the evictionPool with a few entries every time we want to
 * evict a key. Keys with an idle time greater than one of the current
 * keys are added. Keys are always added if there are free entries.
 *
 * We insert keys in place in ascending order, so keys with a smaller
 * idle time are on the left, and keys with a greater idle time are on
 * the right. */
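
/* For example (a sketch, not a real pool dump): with populated idle times
 * [3, 10, 25] and free slots to the right, inserting a key with idle == 12
 * shifts 25 one position to the right and stores the new key at index 2,
 * giving [3, 10, 12, 25]. If instead the pool were completely full and the
 * new idle time were smaller than pool[0].idle, the key would simply be
 * skipped. */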
145 | |
146 | void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evictionPoolEntry *pool) { |
147 | int j, k, count; |
148 | dictEntry *samples[server.maxmemory_samples]; |
149 | |
150 | count = dictGetSomeKeys(sampledict,samples,server.maxmemory_samples); |
151 | for (j = 0; j < count; j++) { |
152 | unsigned long long idle; |
153 | sds key; |
154 | robj *o; |
155 | dictEntry *de; |
156 | |
157 | de = samples[j]; |
158 | key = dictGetKey(de); |
159 | |
160 | /* If the dictionary we are sampling from is not the main |
161 | * dictionary (but the expires one) we need to lookup the key |
162 | * again in the key dictionary to obtain the value object. */ |
163 | if (server.maxmemory_policy != MAXMEMORY_VOLATILE_TTL) { |
164 | if (sampledict != keydict) de = dictFind(keydict, key); |
165 | o = dictGetVal(de); |
166 | } |
167 | |
        /* Calculate the idle time according to the policy. This is called
         * idle just because the code initially handled LRU, but it is in
         * fact just a score where a higher score means a better eviction
         * candidate. */
171 | if (server.maxmemory_policy & MAXMEMORY_FLAG_LRU) { |
172 | idle = estimateObjectIdleTime(o); |
173 | } else if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) { |
            /* When we use an LRU policy, we sort the keys by idle time
             * so that we evict keys starting from the greatest idle time.
             * However when the policy is an LFU one, we have a frequency
             * estimation and we want to evict keys with a lower frequency
             * first. So inside the pool we store the inverted frequency,
             * obtained by subtracting the actual frequency from the maximum
             * frequency of 255. */
181 | idle = 255-LFUDecrAndReturn(o); |
182 | } else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) { |
            /* In this case the sooner the expire time, the better the
             * candidate. */
184 | idle = ULLONG_MAX - (long)dictGetVal(de); |
185 | } else { |
            serverPanic("Unknown eviction policy in evictionPoolPopulate()");
187 | } |
188 | |
189 | /* Insert the element inside the pool. |
190 | * First, find the first empty bucket or the first populated |
191 | * bucket that has an idle time smaller than our idle time. */ |
192 | k = 0; |
193 | while (k < EVPOOL_SIZE && |
194 | pool[k].key && |
195 | pool[k].idle < idle) k++; |
196 | if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) { |
197 | /* Can't insert if the element is < the worst element we have |
198 | * and there are no empty buckets. */ |
199 | continue; |
200 | } else if (k < EVPOOL_SIZE && pool[k].key == NULL) { |
201 | /* Inserting into empty position. No setup needed before insert. */ |
202 | } else { |
203 | /* Inserting in the middle. Now k points to the first element |
204 | * greater than the element to insert. */ |
205 | if (pool[EVPOOL_SIZE-1].key == NULL) { |
206 | /* Free space on the right? Insert at k shifting |
207 | * all the elements from k to end to the right. */ |
208 | |
209 | /* Save SDS before overwriting. */ |
210 | sds cached = pool[EVPOOL_SIZE-1].cached; |
211 | memmove(pool+k+1,pool+k, |
212 | sizeof(pool[0])*(EVPOOL_SIZE-k-1)); |
213 | pool[k].cached = cached; |
214 | } else { |
215 | /* No free space on right? Insert at k-1 */ |
216 | k--; |
217 | /* Shift all elements on the left of k (included) to the |
218 | * left, so we discard the element with smaller idle time. */ |
219 | sds cached = pool[0].cached; /* Save SDS before overwriting. */ |
220 | if (pool[0].key != pool[0].cached) sdsfree(pool[0].key); |
221 | memmove(pool,pool+1,sizeof(pool[0])*k); |
222 | pool[k].cached = cached; |
223 | } |
224 | } |
225 | |
        /* Try to reuse the cached SDS string allocated in the pool entry,
         * because allocating and deallocating this object is costly
         * (according to the profiler, not my fantasy. Remember:
         * premature optimization bla bla bla). */
230 | int klen = sdslen(key); |
231 | if (klen > EVPOOL_CACHED_SDS_SIZE) { |
232 | pool[k].key = sdsdup(key); |
233 | } else { |
234 | memcpy(pool[k].cached,key,klen+1); |
235 | sdssetlen(pool[k].cached,klen); |
236 | pool[k].key = pool[k].cached; |
237 | } |
238 | pool[k].idle = idle; |
239 | pool[k].dbid = dbid; |
240 | } |
241 | } |
242 | |
243 | /* ---------------------------------------------------------------------------- |
 * LFU (Least Frequently Used) implementation.
 *
246 | * We have 24 total bits of space in each object in order to implement |
247 | * an LFU (Least Frequently Used) eviction policy, since we re-use the |
248 | * LRU field for this purpose. |
249 | * |
250 | * We split the 24 bits into two fields: |
251 | * |
252 | * 16 bits 8 bits |
253 | * +----------------+--------+ |
254 | * + Last decr time | LOG_C | |
255 | * +----------------+--------+ |
256 | * |
257 | * LOG_C is a logarithmic counter that provides an indication of the access |
258 | * frequency. However this field must also be decremented otherwise what used |
259 | * to be a frequently accessed key in the past, will remain ranked like that |
260 | * forever, while we want the algorithm to adapt to access pattern changes. |
261 | * |
 * So the remaining 16 bits are used in order to store the "decrement time",
 * a reduced-precision Unix time (we take 16 bits of the time converted
 * to minutes, since we don't care about wrapping around) relative to which
 * the LOG_C counter is periodically decayed.
267 | * |
268 | * New keys don't start at zero, in order to have the ability to collect |
269 | * some accesses before being trashed away, so they start at LFU_INIT_VAL. |
270 | * The logarithmic increment performed on LOG_C takes care of LFU_INIT_VAL |
271 | * when incrementing the key, so that keys starting at LFU_INIT_VAL |
272 | * (or having a smaller value) have a very high chance of being incremented |
273 | * on access. |
274 | * |
 * During decay, the logarithmic counter is decremented by one for each
 * server.lfu_decay_time period elapsed since the stored decrement time
 * (see LFUDecrAndReturn()), saturating at zero.
278 | * --------------------------------------------------------------------------*/ |
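
/* A minimal sketch of how the 24 bits are unpacked and repacked (the
 * unpacking matches LFUDecrAndReturn() below; the repacking is similar to
 * what updateLFU() in db.c does when a key is accessed):
 *
 *   unsigned long ldt = o->lru >> 8;       (16-bit last decrement time)
 *   unsigned long counter = o->lru & 255;  (8-bit logarithmic counter)
 *   o->lru = (LFUGetTimeInMinutes() << 8) | LFULogIncr(counter);
 */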
279 | |
280 | /* Return the current time in minutes, just taking the least significant |
281 | * 16 bits. The returned time is suitable to be stored as LDT (last decrement |
282 | * time) for the LFU implementation. */ |
283 | unsigned long LFUGetTimeInMinutes(void) { |
284 | return (server.unixtime/60) & 65535; |
285 | } |
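
/* Since only 16 bits of minutes are kept, this clock wraps around roughly
 * every 45.5 days (65536 minutes); LFUTimeElapsed() below accounts for a
 * single wrap. */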
286 | |
/* Given an object's last decrement time, compute the minimum number of
 * minutes elapsed since that time. Handle overflow (ldt greater than the
 * current 16-bit minutes time) by considering the time as having wrapped
 * around exactly once. */
291 | unsigned long LFUTimeElapsed(unsigned long ldt) { |
292 | unsigned long now = LFUGetTimeInMinutes(); |
293 | if (now >= ldt) return now-ldt; |
294 | return 65535-ldt+now; |
295 | } |
296 | |
/* Logarithmically increment a counter. The greater the current counter
 * value, the less likely it is that the counter really gets incremented.
 * The counter saturates at 255. */
299 | uint8_t LFULogIncr(uint8_t counter) { |
300 | if (counter == 255) return 255; |
301 | double r = (double)rand()/RAND_MAX; |
302 | double baseval = counter - LFU_INIT_VAL; |
303 | if (baseval < 0) baseval = 0; |
304 | double p = 1.0/(baseval*server.lfu_log_factor+1); |
305 | if (r < p) counter++; |
306 | return counter; |
307 | } |
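
/* For example, with the default lfu_log_factor of 10: a counter at
 * LFU_INIT_VAL has baseval == 0, so p == 1.0 and every hit counts, while a
 * counter of 100 has baseval == 95, so p == 1/951 and on the order of a
 * thousand further hits are needed to move it by one. This is what lets an
 * 8-bit counter represent access counts on the order of a million. */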
308 | |
/* If the object's decrement time is reached, decrement the LFU counter but
 * do not update the LFU fields of the object: we update the access time
 * and counter explicitly when the object is really accessed.
 * The counter is decremented by one for every server.lfu_decay_time minutes
 * elapsed since the last decrement time (never going below zero).
 * Return the object frequency counter.
 *
 * This function is used in order to scan the dataset looking for the best
 * object to evict: as we check the candidates, we incrementally decrement
 * the counter of the scanned objects if needed. */
319 | unsigned long LFUDecrAndReturn(robj *o) { |
320 | unsigned long ldt = o->lru >> 8; |
321 | unsigned long counter = o->lru & 255; |
322 | unsigned long num_periods = server.lfu_decay_time ? LFUTimeElapsed(ldt) / server.lfu_decay_time : 0; |
323 | if (num_periods) |
324 | counter = (num_periods > counter) ? 0 : counter - num_periods; |
325 | return counter; |
326 | } |
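
/* For example, with lfu_decay_time set to 1 and three minutes elapsed since
 * the last decrement, the counter loses three points (saturating at zero),
 * while an lfu_decay_time of 0 disables decay entirely (num_periods stays
 * at 0). */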
327 | |
/* We don't want to count AOF buffers and slaves output buffers as
 * used memory: eviction should be driven mostly by the data size, because
 * counting those buffers can cause a feedback loop. Pushing DELs into them
 * makes them bigger; if we count them, we need to evict more keys, which
 * generates even more DELs, possibly leading to a massive eviction loop
 * until every key is evicted.
 *
 * This function returns the sum of the AOF and replication buffer sizes. */
336 | size_t freeMemoryGetNotCountedMemory(void) { |
337 | size_t overhead = 0; |
338 | |
    /* Since all replicas and the replication backlog share the global
     * replication buffer, we consider only the part exceeding the backlog
     * size as the extra, separate consumption of the replicas.
     *
     * Note that although the backlog is also initially grown incrementally
     * (pushing DELs consumes memory), it eventually stops growing and
     * remains constant in size, so even if its creation causes some
     * eviction, it's capped, and also here to stay (no resonance effect).
     *
     * Note that, because we trim the backlog incrementally in the background,
     * the backlog size may exceed our setting if slow replicas that reference
     * vast replication buffer blocks disconnect. To avoid a massive eviction
     * loop, we don't count the lazily freed replication backlog into used
     * memory even if there are no replicas, i.e. we still regard this memory
     * as belonging to the replicas. */
354 | if ((long long)server.repl_buffer_mem > server.repl_backlog_size) { |
        /* We use a list structure to manage replication buffer blocks, so the
         * backlog also occupies some extra memory. We can't know the exact
         * number of blocks, so we only estimate the size from the per-block
         * size. */
        size_t extra_approx_size =
            (server.repl_backlog_size/PROTO_REPLY_CHUNK_BYTES + 1) *
            (sizeof(replBufBlock)+sizeof(listNode));
361 | size_t counted_mem = server.repl_backlog_size + extra_approx_size; |
362 | if (server.repl_buffer_mem > counted_mem) { |
363 | overhead += (server.repl_buffer_mem - counted_mem); |
364 | } |
365 | } |
366 | |
367 | if (server.aof_state != AOF_OFF) { |
368 | overhead += sdsAllocSize(server.aof_buf); |
369 | } |
370 | return overhead; |
371 | } |
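
/* For example (made-up numbers): with a 1 MB repl_backlog_size and 5 MB of
 * global replication buffer memory, roughly 4 MB (minus the small per-block
 * list overhead estimated above) is treated as replica output buffers and
 * excluded from the memory accounting that drives eviction, together with
 * the AOF buffer when AOF is enabled. */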
372 | |
373 | /* Get the memory status from the point of view of the maxmemory directive: |
374 | * if the memory used is under the maxmemory setting then C_OK is returned. |
375 | * Otherwise, if we are over the memory limit, the function returns |
376 | * C_ERR. |
377 | * |
 * The function may return additional info via reference, only if the
 * pointers to the respective arguments are not NULL. Certain fields are
 * populated only when C_ERR is returned:
381 | * |
382 | * 'total' total amount of bytes used. |
383 | * (Populated both for C_ERR and C_OK) |
384 | * |
385 | * 'logical' the amount of memory used minus the slaves/AOF buffers. |
386 | * (Populated when C_ERR is returned) |
387 | * |
388 | * 'tofree' the amount of memory that should be released |
389 | * in order to return back into the memory limits. |
390 | * (Populated when C_ERR is returned) |
391 | * |
 *  'level'    this usually ranges from 0 to 1, and reports the fraction of
 *             maxmemory currently in use. May be > 1 if we are over the
 *             memory limit.
 *             (Populated both for C_ERR and C_OK)
396 | */ |
397 | int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level) { |
398 | size_t mem_reported, mem_used, mem_tofree; |
399 | |
400 | /* Check if we are over the memory usage limit. If we are not, no need |
401 | * to subtract the slaves output buffers. We can just return ASAP. */ |
402 | mem_reported = zmalloc_used_memory(); |
403 | if (total) *total = mem_reported; |
404 | |
405 | /* We may return ASAP if there is no need to compute the level. */ |
406 | if (!server.maxmemory) { |
407 | if (level) *level = 0; |
408 | return C_OK; |
409 | } |
410 | if (mem_reported <= server.maxmemory && !level) return C_OK; |
411 | |
412 | /* Remove the size of slaves output buffers and AOF buffer from the |
413 | * count of used memory. */ |
414 | mem_used = mem_reported; |
415 | size_t overhead = freeMemoryGetNotCountedMemory(); |
416 | mem_used = (mem_used > overhead) ? mem_used-overhead : 0; |
417 | |
418 | /* Compute the ratio of memory usage. */ |
419 | if (level) *level = (float)mem_used / (float)server.maxmemory; |
420 | |
421 | if (mem_reported <= server.maxmemory) return C_OK; |
422 | |
423 | /* Check if we are still over the memory limit. */ |
424 | if (mem_used <= server.maxmemory) return C_OK; |
425 | |
426 | /* Compute how much memory we need to free. */ |
427 | mem_tofree = mem_used - server.maxmemory; |
428 | |
429 | if (logical) *logical = mem_used; |
430 | if (tofree) *tofree = mem_tofree; |
431 | |
432 | return C_ERR; |
433 | } |
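
/* For example (made-up numbers): with maxmemory set to 100 MB, 120 MB
 * reported by the allocator and 5 MB of non-counted buffers, this function
 * returns C_ERR with *total == 120 MB, *logical == 115 MB, *tofree == 15 MB
 * and *level == 1.15. */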
434 | |
/* Return 1 if used memory would be more than maxmemory after allocating
 * 'moremem' additional bytes, and 0 if not. Redis may reject user requests
 * or evict some keys if used memory exceeds maxmemory, especially when we
 * allocate a huge amount of memory at once. */
438 | int overMaxmemoryAfterAlloc(size_t moremem) { |
439 | if (!server.maxmemory) return 0; /* No limit. */ |
440 | |
441 | /* Check quickly. */ |
442 | size_t mem_used = zmalloc_used_memory(); |
443 | if (mem_used + moremem <= server.maxmemory) return 0; |
444 | |
445 | size_t overhead = freeMemoryGetNotCountedMemory(); |
446 | mem_used = (mem_used > overhead) ? mem_used - overhead : 0; |
447 | return mem_used + moremem > server.maxmemory; |
448 | } |
449 | |
450 | /* The evictionTimeProc is started when "maxmemory" has been breached and |
451 | * could not immediately be resolved. This will spin the event loop with short |
452 | * eviction cycles until the "maxmemory" condition has resolved or there are no |
453 | * more evictable items. */ |
454 | static int isEvictionProcRunning = 0; |
455 | static int evictionTimeProc( |
456 | struct aeEventLoop *eventLoop, long long id, void *clientData) { |
457 | UNUSED(eventLoop); |
458 | UNUSED(id); |
459 | UNUSED(clientData); |
460 | |
461 | if (performEvictions() == EVICT_RUNNING) return 0; /* keep evicting */ |
462 | |
463 | /* For EVICT_OK - things are good, no need to keep evicting. |
464 | * For EVICT_FAIL - there is nothing left to evict. */ |
465 | isEvictionProcRunning = 0; |
466 | return AE_NOMORE; |
467 | } |
468 | |
469 | void startEvictionTimeProc(void) { |
470 | if (!isEvictionProcRunning) { |
471 | isEvictionProcRunning = 1; |
472 | aeCreateTimeEvent(server.el, 0, |
473 | evictionTimeProc, NULL, NULL); |
474 | } |
475 | } |
476 | |
477 | /* Check if it's safe to perform evictions. |
478 | * Returns 1 if evictions can be performed |
479 | * Returns 0 if eviction processing should be skipped |
480 | */ |
481 | static int isSafeToPerformEvictions(void) { |
    /* - There must be no script in a timeout condition.
     * - We must not be loading data right now. */
484 | if (isInsideYieldingLongCommand() || server.loading) return 0; |
485 | |
    /* By default replicas should ignore maxmemory
     * and just be exact copies of their master. */
488 | if (server.masterhost && server.repl_slave_ignore_maxmemory) return 0; |
489 | |
490 | /* When clients are paused the dataset should be static not just from the |
491 | * POV of clients not being able to write, but also from the POV of |
492 | * expires and evictions of keys not being performed. */ |
493 | if (checkClientPauseTimeoutAndReturnIfPaused()) return 0; |
494 | |
495 | return 1; |
496 | } |
497 | |
498 | /* Algorithm for converting tenacity (0-100) to a time limit. */ |
499 | static unsigned long evictionTimeLimitUs() { |
500 | serverAssert(server.maxmemory_eviction_tenacity >= 0); |
501 | serverAssert(server.maxmemory_eviction_tenacity <= 100); |
502 | |
503 | if (server.maxmemory_eviction_tenacity <= 10) { |
504 | /* A linear progression from 0..500us */ |
505 | return 50uL * server.maxmemory_eviction_tenacity; |
506 | } |
507 | |
508 | if (server.maxmemory_eviction_tenacity < 100) { |
509 | /* A 15% geometric progression, resulting in a limit of ~2 min at tenacity==99 */ |
510 | return (unsigned long)(500.0 * pow(1.15, server.maxmemory_eviction_tenacity - 10.0)); |
511 | } |
512 | |
513 | return ULONG_MAX; /* No limit to eviction time */ |
514 | } |
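
/* For example: tenacity 10 allows roughly 500us of eviction work per call,
 * tenacity 50 roughly 134ms (500us * 1.15^40), tenacity 99 roughly two
 * minutes, and tenacity 100 removes the time limit entirely. */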
515 | |
516 | /* Check that memory usage is within the current "maxmemory" limit. If over |
517 | * "maxmemory", attempt to free memory by evicting data (if it's safe to do so). |
518 | * |
519 | * It's possible for Redis to suddenly be significantly over the "maxmemory" |
520 | * setting. This can happen if there is a large allocation (like a hash table |
521 | * resize) or even if the "maxmemory" setting is manually adjusted. Because of |
522 | * this, it's important to evict for a managed period of time - otherwise Redis |
523 | * would become unresponsive while evicting. |
524 | * |
525 | * The goal of this function is to improve the memory situation - not to |
526 | * immediately resolve it. In the case that some items have been evicted but |
527 | * the "maxmemory" limit has not been achieved, an aeTimeProc will be started |
528 | * which will continue to evict items until memory limits are achieved or |
529 | * nothing more is evictable. |
530 | * |
531 | * This should be called before execution of commands. If EVICT_FAIL is |
532 | * returned, commands which will result in increased memory usage should be |
533 | * rejected. |
534 | * |
535 | * Returns: |
536 | * EVICT_OK - memory is OK or it's not possible to perform evictions now |
537 | * EVICT_RUNNING - memory is over the limit, but eviction is still processing |
538 | * EVICT_FAIL - memory is over the limit, and there's nothing to evict |
539 | * */ |
540 | int performEvictions(void) { |
    /* Note, we don't goto update_metrics here because this check skips
     * eviction as if it wasn't triggered. It's a fake EVICT_OK. */
543 | if (!isSafeToPerformEvictions()) return EVICT_OK; |
544 | |
545 | int keys_freed = 0; |
546 | size_t mem_reported, mem_tofree; |
547 | long long mem_freed; /* May be negative */ |
548 | mstime_t latency, eviction_latency; |
549 | long long delta; |
550 | int slaves = listLength(server.slaves); |
551 | int result = EVICT_FAIL; |
552 | |
553 | if (getMaxmemoryState(&mem_reported,NULL,&mem_tofree,NULL) == C_OK) { |
554 | result = EVICT_OK; |
555 | goto update_metrics; |
556 | } |
557 | |
558 | if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION) { |
559 | result = EVICT_FAIL; /* We need to free memory, but policy forbids. */ |
560 | goto update_metrics; |
561 | } |
562 | |
563 | unsigned long eviction_time_limit_us = evictionTimeLimitUs(); |
564 | |
565 | mem_freed = 0; |
566 | |
567 | latencyStartMonitor(latency); |
568 | |
569 | monotime evictionTimer; |
570 | elapsedStart(&evictionTimer); |
571 | |
    /* Unlike active expiration and blocked clients, we can reach here from
     * 'CONFIG SET maxmemory', so we have to back up and restore
     * server.core_propagates. */
574 | int prev_core_propagates = server.core_propagates; |
575 | serverAssert(server.also_propagate.numops == 0); |
576 | server.core_propagates = 1; |
577 | server.propagate_no_multi = 1; |
578 | |
579 | while (mem_freed < (long long)mem_tofree) { |
580 | int j, k, i; |
581 | static unsigned int next_db = 0; |
582 | sds bestkey = NULL; |
583 | int bestdbid; |
584 | redisDb *db; |
585 | dict *dict; |
586 | dictEntry *de; |
587 | |
588 | if (server.maxmemory_policy & (MAXMEMORY_FLAG_LRU|MAXMEMORY_FLAG_LFU) || |
589 | server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) |
590 | { |
591 | struct evictionPoolEntry *pool = EvictionPoolLRU; |
592 | |
593 | while(bestkey == NULL) { |
594 | unsigned long total_keys = 0, keys; |
595 | |
                /* We don't want to make local-db choices when expiring keys,
                 * so we start by populating the eviction pool with keys
                 * sampled from every DB. */
599 | for (i = 0; i < server.dbnum; i++) { |
600 | db = server.db+i; |
601 | dict = (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) ? |
602 | db->dict : db->expires; |
603 | if ((keys = dictSize(dict)) != 0) { |
604 | evictionPoolPopulate(i, dict, db->dict, pool); |
605 | total_keys += keys; |
606 | } |
607 | } |
608 | if (!total_keys) break; /* No keys to evict. */ |
609 | |
610 | /* Go backward from best to worst element to evict. */ |
611 | for (k = EVPOOL_SIZE-1; k >= 0; k--) { |
612 | if (pool[k].key == NULL) continue; |
613 | bestdbid = pool[k].dbid; |
614 | |
615 | if (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) { |
616 | de = dictFind(server.db[bestdbid].dict, |
617 | pool[k].key); |
618 | } else { |
619 | de = dictFind(server.db[bestdbid].expires, |
620 | pool[k].key); |
621 | } |
622 | |
623 | /* Remove the entry from the pool. */ |
624 | if (pool[k].key != pool[k].cached) |
625 | sdsfree(pool[k].key); |
626 | pool[k].key = NULL; |
627 | pool[k].idle = 0; |
628 | |
                    /* If the key exists, it is our pick. Otherwise it is
                     * a ghost and we need to try the next element. */
631 | if (de) { |
632 | bestkey = dictGetKey(de); |
633 | break; |
634 | } else { |
635 | /* Ghost... Iterate again. */ |
636 | } |
637 | } |
638 | } |
639 | } |
640 | |
641 | /* volatile-random and allkeys-random policy */ |
642 | else if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM || |
643 | server.maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM) |
644 | { |
645 | /* When evicting a random key, we try to evict a key for |
646 | * each DB, so we use the static 'next_db' variable to |
647 | * incrementally visit all DBs. */ |
648 | for (i = 0; i < server.dbnum; i++) { |
649 | j = (++next_db) % server.dbnum; |
650 | db = server.db+j; |
651 | dict = (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) ? |
652 | db->dict : db->expires; |
653 | if (dictSize(dict) != 0) { |
654 | de = dictGetRandomKey(dict); |
655 | bestkey = dictGetKey(de); |
656 | bestdbid = j; |
657 | break; |
658 | } |
659 | } |
660 | } |
661 | |
662 | /* Finally remove the selected key. */ |
663 | if (bestkey) { |
664 | db = server.db+bestdbid; |
665 | robj *keyobj = createStringObject(bestkey,sdslen(bestkey)); |
            /* We compute the amount of memory freed by db*Delete() alone.
             * It is possible that actually the memory needed to propagate
             * the DEL in AOF and replication link is greater than the one
             * we are freeing by removing the key, but we can't account for
             * that, otherwise we would never exit the loop.
             *
             * Same for CSC invalidation messages generated by signalModifiedKey.
             *
             * AOF and output buffer memory will be freed eventually so
             * we only care about memory used by the key space. */
676 | delta = (long long) zmalloc_used_memory(); |
677 | latencyStartMonitor(eviction_latency); |
678 | if (server.lazyfree_lazy_eviction) |
679 | dbAsyncDelete(db,keyobj); |
680 | else |
681 | dbSyncDelete(db,keyobj); |
682 | latencyEndMonitor(eviction_latency); |
            latencyAddSampleIfNeeded("eviction-del",eviction_latency);
684 | delta -= (long long) zmalloc_used_memory(); |
685 | mem_freed += delta; |
686 | server.stat_evictedkeys++; |
687 | signalModifiedKey(NULL,db,keyobj); |
            notifyKeyspaceEvent(NOTIFY_EVICTED, "evicted",
                keyobj, db->id);
690 | propagateDeletion(db,keyobj,server.lazyfree_lazy_eviction); |
691 | decrRefCount(keyobj); |
692 | keys_freed++; |
693 | |
694 | if (keys_freed % 16 == 0) { |
                /* When the memory to free starts to be big enough, we may
                 * start spending so much time here that it is impossible to
                 * deliver data to the replicas fast enough, so we force the
                 * transmission here inside the loop. */
699 | if (slaves) flushSlavesOutputBuffers(); |
700 | |
701 | /* Normally our stop condition is the ability to release |
702 | * a fixed, pre-computed amount of memory. However when we |
703 | * are deleting objects in another thread, it's better to |
704 | * check, from time to time, if we already reached our target |
705 | * memory, since the "mem_freed" amount is computed only |
706 | * across the dbAsyncDelete() call, while the thread can |
707 | * release the memory all the time. */ |
708 | if (server.lazyfree_lazy_eviction) { |
709 | if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) { |
710 | break; |
711 | } |
712 | } |
713 | |
                /* After some time, exit the loop early - even if the memory
                 * limit hasn't been reached. If we suddenly need to free a lot
                 * of memory, we don't want to spend too much time here. */
717 | if (elapsedUs(evictionTimer) > eviction_time_limit_us) { |
718 | // We still need to free memory - start eviction timer proc |
719 | startEvictionTimeProc(); |
720 | break; |
721 | } |
722 | } |
723 | } else { |
724 | goto cant_free; /* nothing to free... */ |
725 | } |
726 | } |
727 | /* at this point, the memory is OK, or we have reached the time limit */ |
728 | result = (isEvictionProcRunning) ? EVICT_RUNNING : EVICT_OK; |
729 | |
730 | cant_free: |
731 | if (result == EVICT_FAIL) { |
732 | /* At this point, we have run out of evictable items. It's possible |
733 | * that some items are being freed in the lazyfree thread. Perform a |
734 | * short wait here if such jobs exist, but don't wait long. */ |
735 | if (bioPendingJobsOfType(BIO_LAZY_FREE)) { |
736 | usleep(eviction_time_limit_us); |
737 | if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) { |
738 | result = EVICT_OK; |
739 | } |
740 | } |
741 | } |
742 | |
743 | serverAssert(server.core_propagates); /* This function should not be re-entrant */ |
744 | |
745 | /* Propagate all DELs */ |
746 | propagatePendingCommands(); |
747 | |
748 | server.core_propagates = prev_core_propagates; |
749 | server.propagate_no_multi = 0; |
750 | |
751 | latencyEndMonitor(latency); |
    latencyAddSampleIfNeeded("eviction-cycle",latency);
753 | |
754 | update_metrics: |
755 | if (result == EVICT_RUNNING || result == EVICT_FAIL) { |
756 | if (server.stat_last_eviction_exceeded_time == 0) |
757 | elapsedStart(&server.stat_last_eviction_exceeded_time); |
758 | } else if (result == EVICT_OK) { |
759 | if (server.stat_last_eviction_exceeded_time != 0) { |
760 | server.stat_total_eviction_exceeded_time += elapsedUs(server.stat_last_eviction_exceeded_time); |
761 | server.stat_last_eviction_exceeded_time = 0; |
762 | } |
763 | } |
764 | return result; |
765 | } |
766 | |