/* Maxmemory directive handling (LRU eviction and other policies).
 *
 * ----------------------------------------------------------------------------
 *
 * Copyright (c) 2009-2016, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "server.h"
#include "bio.h"
#include "atomicvar.h"
#include "script.h"
#include <math.h>

/* ----------------------------------------------------------------------------
 * Data structures
 * --------------------------------------------------------------------------*/

/* To improve the quality of the LRU approximation we take a set of keys
 * that are good candidates for eviction across performEvictions() calls.
 *
 * Entries inside the eviction pool are kept ordered by idle time, putting
 * greater idle times to the right (ascending order).
 *
 * When an LFU policy is used, a reverse frequency indication replaces the
 * idle time, so that we still evict by larger value (a larger inverse
 * frequency means the key is accessed less frequently).
 *
 * Empty entries have the key pointer set to NULL. */
#define EVPOOL_SIZE 16
#define EVPOOL_CACHED_SDS_SIZE 255
struct evictionPoolEntry {
    unsigned long long idle;    /* Object idle time (inverse frequency for LFU) */
    sds key;                    /* Key name. */
    sds cached;                 /* Cached SDS object for key name. */
    int dbid;                   /* Key DB number. */
};

static struct evictionPoolEntry *EvictionPoolLRU;

/* ----------------------------------------------------------------------------
 * Implementation of eviction, aging and LRU
 * --------------------------------------------------------------------------*/

/* Return the LRU clock, based on the clock resolution. This is a time
 * in a reduced-bits format that can be used to set and check the
 * object->lru field of redisObject structures. */
unsigned int getLRUClock(void) {
    return (mstime()/LRU_CLOCK_RESOLUTION) & LRU_CLOCK_MAX;
}

/* This function is used to obtain the current LRU clock.
 * If the server frequency refreshes the cached LRU clock at least as often
 * as the clock resolution (as it should be in production servers) we return
 * the precomputed value, otherwise we need to resort to a system call. */
unsigned int LRU_CLOCK(void) {
    unsigned int lruclock;
    if (1000/server.hz <= LRU_CLOCK_RESOLUTION) {
        atomicGet(server.lruclock,lruclock);
    } else {
        lruclock = getLRUClock();
    }
    return lruclock;
}

/* Given an object, return the minimum number of milliseconds for which the
 * object was not requested, using an approximated LRU algorithm. */
unsigned long long estimateObjectIdleTime(robj *o) {
    unsigned long long lruclock = LRU_CLOCK();
    if (lruclock >= o->lru) {
        return (lruclock - o->lru) * LRU_CLOCK_RESOLUTION;
    } else {
        return (lruclock + (LRU_CLOCK_MAX - o->lru)) *
                    LRU_CLOCK_RESOLUTION;
    }
}
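
/* Illustrative example of the wrap-around branch above, assuming the usual
 * LRU_CLOCK_RESOLUTION of 1000 ms: with a current clock of 10 and an object
 * whose o->lru is LRU_CLOCK_MAX-2 (the clock wrapped once since the last
 * access), the estimated idle time is
 * (10 + (LRU_CLOCK_MAX - (LRU_CLOCK_MAX-2))) * 1000 = 12000 ms. */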

/* LRU approximation algorithm
 *
 * Redis uses an approximation of the LRU algorithm that runs in constant
 * memory. Every time there is a key to evict, we sample N keys (with
 * N very small, usually around 5) to populate a pool of M best eviction
 * candidates (the pool size is defined by EVPOOL_SIZE).
 *
 * The N sampled keys are added to the pool of good eviction candidates
 * (the ones with an old access time) if they are better than one of the
 * current keys in the pool.
 *
 * After the pool is populated, the best key we have in the pool is evicted.
 * However note that we don't remove keys from the pool when they are deleted
 * so the pool may contain keys that no longer exist.
 *
 * When we try to evict a key, and all the entries in the pool don't exist
 * we populate it again. This time we'll be sure that the pool has at least
 * one key that can be evicted, if there is at least one key that can be
 * evicted in the whole database. */
/* Create a new eviction pool. */
void evictionPoolAlloc(void) {
    struct evictionPoolEntry *ep;
    int j;

    ep = zmalloc(sizeof(*ep)*EVPOOL_SIZE);
    for (j = 0; j < EVPOOL_SIZE; j++) {
        ep[j].idle = 0;
        ep[j].key = NULL;
        ep[j].cached = sdsnewlen(NULL,EVPOOL_CACHED_SDS_SIZE);
        ep[j].dbid = 0;
    }
    EvictionPoolLRU = ep;
}

/* This is a helper function for performEvictions(), it is used in order
 * to populate the evictionPool with a few entries every time we want to
 * evict a key. Keys with an idle time greater than one of the current
 * keys are added. Keys are always added if there are free entries.
 *
 * We insert keys in place in ascending order, so keys with the smaller
 * idle time are on the left, and keys with the greater idle time on the
 * right. */

void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evictionPoolEntry *pool) {
    int j, k, count;
    dictEntry *samples[server.maxmemory_samples];

    count = dictGetSomeKeys(sampledict,samples,server.maxmemory_samples);
    for (j = 0; j < count; j++) {
        unsigned long long idle;
        sds key;
        robj *o;
        dictEntry *de;

        de = samples[j];
        key = dictGetKey(de);

        /* If the dictionary we are sampling from is not the main
         * dictionary (but the expires one) we need to lookup the key
         * again in the key dictionary to obtain the value object. */
        if (server.maxmemory_policy != MAXMEMORY_VOLATILE_TTL) {
            if (sampledict != keydict) de = dictFind(keydict, key);
            o = dictGetVal(de);
        }

        /* Calculate the idle time according to the policy. This is called
         * idle just because the code initially handled LRU, but is in fact
         * just a score where a higher score means a better candidate. */
        if (server.maxmemory_policy & MAXMEMORY_FLAG_LRU) {
            idle = estimateObjectIdleTime(o);
        } else if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
            /* When we use an LRU policy, we sort the keys by idle time
             * so that we evict keys starting from the greatest idle time.
             * However when the policy is an LFU one, we have a frequency
             * estimation, and we want to evict keys with lower frequency
             * first. So inside the pool we put objects using the inverted
             * frequency, subtracting the actual frequency from the maximum
             * frequency of 255. */
            idle = 255-LFUDecrAndReturn(o);
        } else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) {
            /* In this case the sooner the expire the better. */
            idle = ULLONG_MAX - (long)dictGetVal(de);
        } else {
            serverPanic("Unknown eviction policy in evictionPoolPopulate()");
        }

        /* Insert the element inside the pool.
         * First, find the first empty bucket or the first populated
         * bucket that has an idle time smaller than our idle time. */
        k = 0;
        while (k < EVPOOL_SIZE &&
               pool[k].key &&
               pool[k].idle < idle) k++;
        if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) {
            /* Can't insert if the element is < the worst element we have
             * and there are no empty buckets. */
            continue;
        } else if (k < EVPOOL_SIZE && pool[k].key == NULL) {
            /* Inserting into empty position. No setup needed before insert. */
        } else {
            /* Inserting in the middle. Now k points to the first element
             * greater than the element to insert. */
            if (pool[EVPOOL_SIZE-1].key == NULL) {
                /* Free space on the right? Insert at k shifting
                 * all the elements from k to the end to the right. */

                /* Save SDS before overwriting. */
                sds cached = pool[EVPOOL_SIZE-1].cached;
                memmove(pool+k+1,pool+k,
                    sizeof(pool[0])*(EVPOOL_SIZE-k-1));
                pool[k].cached = cached;
            } else {
                /* No free space on the right? Insert at k-1 */
                k--;
                /* Shift all elements on the left of k (included) to the
                 * left, so we discard the element with smaller idle time. */
                sds cached = pool[0].cached; /* Save SDS before overwriting. */
                if (pool[0].key != pool[0].cached) sdsfree(pool[0].key);
                memmove(pool,pool+1,sizeof(pool[0])*k);
                pool[k].cached = cached;
            }
        }

        /* Try to reuse the cached SDS string allocated in the pool entry,
         * because allocating and deallocating this object is costly
         * (according to the profiler, not my fantasy. Remember:
         * premature optimization bla bla bla). */
        int klen = sdslen(key);
        if (klen > EVPOOL_CACHED_SDS_SIZE) {
            pool[k].key = sdsdup(key);
        } else {
            memcpy(pool[k].cached,key,klen+1);
            sdssetlen(pool[k].cached,klen);
            pool[k].key = pool[k].cached;
        }
        pool[k].idle = idle;
        pool[k].dbid = dbid;
    }
}
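
/* Worked example of the insertion logic above: if the populated entries of
 * the pool have idle times {1, 5, 9} with free buckets on the right, a
 * sampled key with idle time 7 stops the scan at k == 2 (the slot holding
 * 9), the entries from k onward are shifted right, and the pool becomes
 * {1, 5, 7, 9}. If instead the pool were full, the element at index 0 (the
 * smallest idle time, i.e. the worst eviction candidate) would be discarded
 * and the new key inserted at k-1. */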

/* ----------------------------------------------------------------------------
 * LFU (Least Frequently Used) implementation.
 *
 * We have 24 total bits of space in each object in order to implement
 * an LFU (Least Frequently Used) eviction policy, since we re-use the
 * LRU field for this purpose.
 *
 * We split the 24 bits into two fields:
 *
 *           16 bits      8 bits
 *      +----------------+--------+
 *      | Last decr time | LOG_C  |
 *      +----------------+--------+
 *
 * LOG_C is a logarithmic counter that provides an indication of the access
 * frequency. However this field must also be decremented otherwise what used
 * to be a frequently accessed key in the past, will remain ranked like that
 * forever, while we want the algorithm to adapt to access pattern changes.
 *
 * So the remaining 16 bits are used in order to store the "decrement time",
 * a reduced-precision Unix time (we take 16 bits of the time converted
 * in minutes since we don't care about wrapping around) recording when the
 * LOG_C counter was last decayed.
 *
 * New keys don't start at zero, in order to have the ability to collect
 * some accesses before being trashed away, so they start at LFU_INIT_VAL.
 * The logarithmic increment performed on LOG_C takes care of LFU_INIT_VAL
 * when incrementing the key, so that keys starting at LFU_INIT_VAL
 * (or having a smaller value) have a very high chance of being incremented
 * on access.
 *
 * During decrement, the value of the logarithmic counter is lowered by one
 * for every server.lfu_decay_time minutes elapsed since the last decrement,
 * saturating at zero (see LFUDecrAndReturn()).
 * --------------------------------------------------------------------------*/
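
/* For reference, the two fields are decoded as done in LFUDecrAndReturn()
 * below:
 *
 *     unsigned long ldt = o->lru >> 8;      (last decrement time, minutes)
 *     unsigned long counter = o->lru & 255; (logarithmic access counter)
 */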

/* Return the current time in minutes, just taking the least significant
 * 16 bits. The returned time is suitable to be stored as LDT (last decrement
 * time) for the LFU implementation. */
unsigned long LFUGetTimeInMinutes(void) {
    return (server.unixtime/60) & 65535;
}

/* Given an object last access time, compute the minimum number of minutes
 * that elapsed since the last access. Handle overflow (ldt greater than
 * the current 16 bits minutes time) considering the time as wrapping
 * exactly once. */
unsigned long LFUTimeElapsed(unsigned long ldt) {
    unsigned long now = LFUGetTimeInMinutes();
    if (now >= ldt) return now-ldt;
    return 65535-ldt+now;
}
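
/* Wrap-around example: if now == 10 and ldt == 65530, the 16 bit clock
 * wrapped once and the elapsed time is 65535-65530+10 = 15 minutes. */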

/* Logarithmically increment a counter. The greater the current counter
 * value, the less likely it is to be really incremented. The counter
 * saturates at 255. */
uint8_t LFULogIncr(uint8_t counter) {
    if (counter == 255) return 255;
    double r = (double)rand()/RAND_MAX;
    double baseval = counter - LFU_INIT_VAL;
    if (baseval < 0) baseval = 0;
    double p = 1.0/(baseval*server.lfu_log_factor+1);
    if (r < p) counter++;
    return counter;
}
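
/* Illustrative numbers derived from the formula above, assuming the default
 * lfu_log_factor of 10: a counter at LFU_INIT_VAL (or below) is incremented
 * with probability 1, since baseval is clamped to 0 and p == 1.0, while a
 * counter at LFU_INIT_VAL+100 is incremented with probability 1/(100*10+1),
 * i.e. roughly one access in a thousand moves it one step further. */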

/* If the object's decrement time is reached, decrement the LFU counter but
 * do not update the LFU fields of the object: we update the access time
 * and counter in an explicit way when the object is really accessed.
 * The counter is decremented by one for every server.lfu_decay_time
 * minutes elapsed since the last decrement.
 * Return the object frequency counter.
 *
 * This function is used in order to scan the dataset for the best object
 * to evict: as we check for the candidate, we incrementally decrement the
 * counter of the scanned objects if needed. */
unsigned long LFUDecrAndReturn(robj *o) {
    unsigned long ldt = o->lru >> 8;
    unsigned long counter = o->lru & 255;
    unsigned long num_periods = server.lfu_decay_time ? LFUTimeElapsed(ldt) / server.lfu_decay_time : 0;
    if (num_periods)
        counter = (num_periods > counter) ? 0 : counter - num_periods;
    return counter;
}
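
/* Decay example: with lfu_decay_time == 1 and 10 minutes elapsed since the
 * last decrement, num_periods is 10, so a counter of 25 decays to 15, while
 * a counter of 7 saturates at 0. */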

/* We don't want to count AOF buffers and slaves output buffers as
 * used memory: the eviction should be driven mostly by the data size,
 * because counting those buffers can cause a feedback loop: pushing DELs
 * into them makes them bigger, which makes us evict more keys, which
 * generates more DELs, possibly leading to a massive eviction loop even
 * after all keys are evicted.
 *
 * This function returns the sum of the AOF and replication buffer sizes. */
size_t freeMemoryGetNotCountedMemory(void) {
    size_t overhead = 0;

    /* Since all replicas and the replication backlog share the global
     * replication buffer, we consider only the part exceeding the backlog
     * size as the extra separate consumption of replicas.
     *
     * Note that although the backlog is also initially incrementally grown
     * (pushing DELs consumes memory), it'll eventually stop growing and
     * remain constant in size, so even if its creation will cause some
     * eviction, it's capped, and also here to stay (no resonance effect).
     *
     * Note that, because we trim the backlog incrementally in the
     * background, the backlog size may exceed our setting if slow replicas
     * that reference vast replication buffer blocks disconnect. To avoid a
     * massive eviction loop, we don't count the lazily-freed replication
     * backlog into used memory even if there are no replicas, i.e. we still
     * regard this memory as the replicas'. */
    if ((long long)server.repl_buffer_mem > server.repl_backlog_size) {
        /* We use a list structure to manage the replication buffer blocks,
         * so the backlog also occupies some extra memory. We can't know the
         * exact number of blocks, so we only compute an approximate size
         * based on the per-block size. */
        size_t extra_approx_size =
            (server.repl_backlog_size/PROTO_REPLY_CHUNK_BYTES + 1) *
            (sizeof(replBufBlock)+sizeof(listNode));
        size_t counted_mem = server.repl_backlog_size + extra_approx_size;
        if (server.repl_buffer_mem > counted_mem) {
            overhead += (server.repl_buffer_mem - counted_mem);
        }
    }

    if (server.aof_state != AOF_OFF) {
        overhead += sdsAllocSize(server.aof_buf);
    }
    return overhead;
}

/* Get the memory status from the point of view of the maxmemory directive:
 * if the memory used is under the maxmemory setting then C_OK is returned.
 * Otherwise, if we are over the memory limit, the function returns
 * C_ERR.
 *
 * The function may return additional info via reference, only if the
 * pointers to the respective arguments are not NULL. Certain fields are
 * populated only when C_ERR is returned:
 *
 *  'total'     total amount of bytes used.
 *              (Populated both for C_ERR and C_OK)
 *
 *  'logical'   the amount of memory used minus the slaves/AOF buffers.
 *              (Populated when C_ERR is returned)
 *
 *  'tofree'    the amount of memory that should be released
 *              in order to return back into the memory limits.
 *              (Populated when C_ERR is returned)
 *
 *  'level'     this usually ranges from 0 to 1, and reports the amount of
 *              memory currently used. May be > 1 if we are over the memory
 *              limit.
 *              (Populated both for C_ERR and C_OK)
 */
int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level) {
    size_t mem_reported, mem_used, mem_tofree;

    /* Check if we are over the memory usage limit. If we are not, no need
     * to subtract the slaves output buffers. We can just return ASAP. */
    mem_reported = zmalloc_used_memory();
    if (total) *total = mem_reported;

    /* We may return ASAP if there is no need to compute the level. */
    if (!server.maxmemory) {
        if (level) *level = 0;
        return C_OK;
    }
    if (mem_reported <= server.maxmemory && !level) return C_OK;

    /* Remove the size of slaves output buffers and AOF buffer from the
     * count of used memory. */
    mem_used = mem_reported;
    size_t overhead = freeMemoryGetNotCountedMemory();
    mem_used = (mem_used > overhead) ? mem_used-overhead : 0;

    /* Compute the ratio of memory usage. */
    if (level) *level = (float)mem_used / (float)server.maxmemory;

    if (mem_reported <= server.maxmemory) return C_OK;

    /* Check if we are still over the memory limit. */
    if (mem_used <= server.maxmemory) return C_OK;

    /* Compute how much memory we need to free. */
    mem_tofree = mem_used - server.maxmemory;

    if (logical) *logical = mem_used;
    if (tofree) *tofree = mem_tofree;

    return C_ERR;
}

/* Return 1 if used memory would be more than maxmemory after allocating
 * 'moremem' additional bytes, 0 if not. Redis may reject user requests or
 * evict some keys if used memory exceeds maxmemory, especially when we
 * allocate a huge amount of memory at once. */
int overMaxmemoryAfterAlloc(size_t moremem) {
    if (!server.maxmemory) return 0; /* No limit. */

    /* Check quickly. */
    size_t mem_used = zmalloc_used_memory();
    if (mem_used + moremem <= server.maxmemory) return 0;

    size_t overhead = freeMemoryGetNotCountedMemory();
    mem_used = (mem_used > overhead) ? mem_used - overhead : 0;
    return mem_used + moremem > server.maxmemory;
}

/* The evictionTimeProc is started when "maxmemory" has been breached and
 * could not immediately be resolved. This will spin the event loop with short
 * eviction cycles until the "maxmemory" condition has resolved or there are no
 * more evictable items. */
static int isEvictionProcRunning = 0;
static int evictionTimeProc(
        struct aeEventLoop *eventLoop, long long id, void *clientData) {
    UNUSED(eventLoop);
    UNUSED(id);
    UNUSED(clientData);

    if (performEvictions() == EVICT_RUNNING) return 0; /* keep evicting */

    /* For EVICT_OK - things are good, no need to keep evicting.
     * For EVICT_FAIL - there is nothing left to evict. */
    isEvictionProcRunning = 0;
    return AE_NOMORE;
}

void startEvictionTimeProc(void) {
    if (!isEvictionProcRunning) {
        isEvictionProcRunning = 1;
        aeCreateTimeEvent(server.el, 0,
                evictionTimeProc, NULL, NULL);
    }
}

/* Check if it's safe to perform evictions.
 *   Returns 1 if evictions can be performed
 *   Returns 0 if eviction processing should be skipped
 */
static int isSafeToPerformEvictions(void) {
    /* - There must be no script in a timeout condition.
     * - We must not be loading data right now. */
    if (isInsideYieldingLongCommand() || server.loading) return 0;

    /* By default replicas should ignore maxmemory
     * and just be exact copies of their masters. */
    if (server.masterhost && server.repl_slave_ignore_maxmemory) return 0;

    /* When clients are paused the dataset should be static not just from the
     * POV of clients not being able to write, but also from the POV of
     * expires and evictions of keys not being performed. */
    if (checkClientPauseTimeoutAndReturnIfPaused()) return 0;

    return 1;
}

/* Algorithm for converting tenacity (0-100) to a time limit. */
static unsigned long evictionTimeLimitUs(void) {
    serverAssert(server.maxmemory_eviction_tenacity >= 0);
    serverAssert(server.maxmemory_eviction_tenacity <= 100);

    if (server.maxmemory_eviction_tenacity <= 10) {
        /* A linear progression from 0..500us */
        return 50uL * server.maxmemory_eviction_tenacity;
    }

    if (server.maxmemory_eviction_tenacity < 100) {
        /* A 15% geometric progression, resulting in a limit of ~2 min at tenacity==99 */
        return (unsigned long)(500.0 * pow(1.15, server.maxmemory_eviction_tenacity - 10.0));
    }

    return ULONG_MAX; /* No limit to eviction time */
}
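
/* For reference, a few values computed from the formulas above: tenacity 10
 * maps to 500us, tenacity 50 to about 500*1.15^40 ~= 134ms, and tenacity 99
 * to about 500*1.15^89 ~= 126s (the ~2 minutes mentioned above). */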

/* Check that memory usage is within the current "maxmemory" limit. If over
 * "maxmemory", attempt to free memory by evicting data (if it's safe to do so).
 *
 * It's possible for Redis to suddenly be significantly over the "maxmemory"
 * setting. This can happen if there is a large allocation (like a hash table
 * resize) or even if the "maxmemory" setting is manually adjusted. Because of
 * this, it's important to evict for a managed period of time - otherwise Redis
 * would become unresponsive while evicting.
 *
 * The goal of this function is to improve the memory situation - not to
 * immediately resolve it. In the case that some items have been evicted but
 * the "maxmemory" limit has not been achieved, an aeTimeProc will be started
 * which will continue to evict items until memory limits are achieved or
 * nothing more is evictable.
 *
 * This should be called before execution of commands. If EVICT_FAIL is
 * returned, commands which will result in increased memory usage should be
 * rejected.
 *
 * Returns:
 *   EVICT_OK       - memory is OK or it's not possible to perform evictions now
 *   EVICT_RUNNING  - memory is over the limit, but eviction is still processing
 *   EVICT_FAIL     - memory is over the limit, and there's nothing to evict
 * */
int performEvictions(void) {
    /* Note, we don't goto update_metrics here because this check skips eviction
     * as if it wasn't triggered. It's a fake EVICT_OK. */
    if (!isSafeToPerformEvictions()) return EVICT_OK;

    int keys_freed = 0;
    size_t mem_reported, mem_tofree;
    long long mem_freed; /* May be negative */
    mstime_t latency, eviction_latency;
    long long delta;
    int slaves = listLength(server.slaves);
    int result = EVICT_FAIL;

    if (getMaxmemoryState(&mem_reported,NULL,&mem_tofree,NULL) == C_OK) {
        result = EVICT_OK;
        goto update_metrics;
    }

    if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION) {
        result = EVICT_FAIL; /* We need to free memory, but policy forbids. */
        goto update_metrics;
    }

    unsigned long eviction_time_limit_us = evictionTimeLimitUs();

    mem_freed = 0;

    latencyStartMonitor(latency);

    monotime evictionTimer;
    elapsedStart(&evictionTimer);

    /* Unlike active expiration and blocked clients, we can reach here from
     * 'CONFIG SET maxmemory', so we have to back up and restore
     * server.core_propagates. */
    int prev_core_propagates = server.core_propagates;
    serverAssert(server.also_propagate.numops == 0);
    server.core_propagates = 1;
    server.propagate_no_multi = 1;

    while (mem_freed < (long long)mem_tofree) {
        int j, k, i;
        static unsigned int next_db = 0;
        sds bestkey = NULL;
        int bestdbid;
        redisDb *db;
        dict *dict;
        dictEntry *de;

        if (server.maxmemory_policy & (MAXMEMORY_FLAG_LRU|MAXMEMORY_FLAG_LFU) ||
            server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL)
        {
            struct evictionPoolEntry *pool = EvictionPoolLRU;

            while(bestkey == NULL) {
                unsigned long total_keys = 0, keys;

                /* We don't want to make local-db choices when expiring keys,
                 * so we start populating the eviction pool sampling keys
                 * from every DB. */
                for (i = 0; i < server.dbnum; i++) {
                    db = server.db+i;
                    dict = (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) ?
                            db->dict : db->expires;
                    if ((keys = dictSize(dict)) != 0) {
                        evictionPoolPopulate(i, dict, db->dict, pool);
                        total_keys += keys;
                    }
                }
                if (!total_keys) break; /* No keys to evict. */

                /* Go backward from best to worst element to evict. */
                for (k = EVPOOL_SIZE-1; k >= 0; k--) {
                    if (pool[k].key == NULL) continue;
                    bestdbid = pool[k].dbid;

                    if (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) {
                        de = dictFind(server.db[bestdbid].dict,
                            pool[k].key);
                    } else {
                        de = dictFind(server.db[bestdbid].expires,
                            pool[k].key);
                    }

                    /* Remove the entry from the pool. */
                    if (pool[k].key != pool[k].cached)
                        sdsfree(pool[k].key);
                    pool[k].key = NULL;
                    pool[k].idle = 0;

                    /* If the key exists, it is our pick. Otherwise it is
                     * a ghost and we need to try the next element. */
                    if (de) {
                        bestkey = dictGetKey(de);
                        break;
                    } else {
                        /* Ghost... Iterate again. */
                    }
                }
            }
        }

        /* volatile-random and allkeys-random policy */
        else if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM ||
                 server.maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM)
        {
            /* When evicting a random key, we try to evict a key for
             * each DB, so we use the static 'next_db' variable to
             * incrementally visit all DBs. */
            for (i = 0; i < server.dbnum; i++) {
                j = (++next_db) % server.dbnum;
                db = server.db+j;
                dict = (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) ?
                        db->dict : db->expires;
                if (dictSize(dict) != 0) {
                    de = dictGetRandomKey(dict);
                    bestkey = dictGetKey(de);
                    bestdbid = j;
                    break;
                }
            }
        }

        /* Finally remove the selected key. */
        if (bestkey) {
            db = server.db+bestdbid;
            robj *keyobj = createStringObject(bestkey,sdslen(bestkey));
            /* We compute the amount of memory freed by db*Delete() alone.
             * It is possible that actually the memory needed to propagate
             * the DEL in AOF and replication link is greater than the one
             * we are freeing by removing the key, but we can't account for
             * that otherwise we would never exit the loop.
             *
             * Same for CSC invalidation messages generated by signalModifiedKey.
             *
             * AOF and Output buffer memory will be freed eventually so
             * we only care about memory used by the key space. */
            delta = (long long) zmalloc_used_memory();
            latencyStartMonitor(eviction_latency);
            if (server.lazyfree_lazy_eviction)
                dbAsyncDelete(db,keyobj);
            else
                dbSyncDelete(db,keyobj);
            latencyEndMonitor(eviction_latency);
            latencyAddSampleIfNeeded("eviction-del",eviction_latency);
            delta -= (long long) zmalloc_used_memory();
            mem_freed += delta;
            server.stat_evictedkeys++;
            signalModifiedKey(NULL,db,keyobj);
            notifyKeyspaceEvent(NOTIFY_EVICTED, "evicted",
                keyobj, db->id);
            propagateDeletion(db,keyobj,server.lazyfree_lazy_eviction);
            decrRefCount(keyobj);
            keys_freed++;

            if (keys_freed % 16 == 0) {
                /* When the memory to free starts to be big enough, we may
                 * start spending so much time here that it is impossible to
                 * deliver data to the replicas fast enough, so we force the
                 * transmission here inside the loop. */
                if (slaves) flushSlavesOutputBuffers();

                /* Normally our stop condition is the ability to release
                 * a fixed, pre-computed amount of memory. However when we
                 * are deleting objects in another thread, it's better to
                 * check, from time to time, if we already reached our target
                 * memory, since the "mem_freed" amount is computed only
                 * across the dbAsyncDelete() call, while the thread can
                 * release the memory all the time. */
                if (server.lazyfree_lazy_eviction) {
                    if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) {
                        break;
                    }
                }

                /* After some time, exit the loop early - even if the memory
                 * limit hasn't been reached. If we suddenly need to free a
                 * lot of memory, we don't want to spend too much time here. */
                if (elapsedUs(evictionTimer) > eviction_time_limit_us) {
                    /* We still need to free memory - start eviction timer proc. */
                    startEvictionTimeProc();
                    break;
                }
            }
        } else {
            goto cant_free; /* nothing to free... */
        }
    }
    /* At this point, the memory is OK, or we have reached the time limit. */
    result = (isEvictionProcRunning) ? EVICT_RUNNING : EVICT_OK;

cant_free:
    if (result == EVICT_FAIL) {
        /* At this point, we have run out of evictable items. It's possible
         * that some items are being freed in the lazyfree thread. Perform a
         * short wait here if such jobs exist, but don't wait long. */
        if (bioPendingJobsOfType(BIO_LAZY_FREE)) {
            usleep(eviction_time_limit_us);
            if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) {
                result = EVICT_OK;
            }
        }
    }

    serverAssert(server.core_propagates); /* This function should not be re-entrant */

    /* Propagate all DELs */
    propagatePendingCommands();

    server.core_propagates = prev_core_propagates;
    server.propagate_no_multi = 0;

    latencyEndMonitor(latency);
    latencyAddSampleIfNeeded("eviction-cycle",latency);

update_metrics:
    if (result == EVICT_RUNNING || result == EVICT_FAIL) {
        if (server.stat_last_eviction_exceeded_time == 0)
            elapsedStart(&server.stat_last_eviction_exceeded_time);
    } else if (result == EVICT_OK) {
        if (server.stat_last_eviction_exceeded_time != 0) {
            server.stat_total_eviction_exceeded_time += elapsedUs(server.stat_last_eviction_exceeded_time);
            server.stat_last_eviction_exceeded_time = 0;
        }
    }
    return result;
}