/* Maxmemory directive handling (LRU eviction and other policies).
 *
 * ----------------------------------------------------------------------------
 *
 * Copyright (c) 2009-2016, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "server.h"
#include "bio.h"
#include "atomicvar.h"
#include "script.h"
#include <math.h>

/* ----------------------------------------------------------------------------
 * Data structures
 * --------------------------------------------------------------------------*/

/* To improve the quality of the LRU approximation we take a set of keys
 * that are good candidates for eviction across performEvictions() calls.
 *
 * Entries inside the eviction pool are kept ordered by idle time, putting
 * greater idle times to the right (ascending order).
 *
 * When an LFU policy is used, a reverse frequency indication replaces the
 * idle time, so that we still evict by larger value (a larger inverse
 * frequency means the key is accessed less frequently).
 *
 * Empty entries have the key pointer set to NULL. */
#define EVPOOL_SIZE 16
#define EVPOOL_CACHED_SDS_SIZE 255
struct evictionPoolEntry {
    unsigned long long idle;    /* Object idle time (inverse frequency for LFU) */
    sds key;                    /* Key name. */
    sds cached;                 /* Cached SDS object for key name. */
    int dbid;                   /* Key DB number. */
};

static struct evictionPoolEntry *EvictionPoolLRU;

/* ----------------------------------------------------------------------------
 * Implementation of eviction, aging and LRU
 * --------------------------------------------------------------------------*/

/* Return the LRU clock, based on the clock resolution. This is a time
 * in a reduced-bits format that can be used to set and check the
 * object->lru field of redisObject structures. */
unsigned int getLRUClock(void) {
    return (mstime()/LRU_CLOCK_RESOLUTION) & LRU_CLOCK_MAX;
}

/* This function is used to obtain the current LRU clock.
 * If the server frequency refreshes the cached LRU clock at least as often
 * as the clock resolution (as it should be in production servers) we return
 * the precomputed value, otherwise we need to resort to a system call. */
unsigned int LRU_CLOCK(void) {
    unsigned int lruclock;
    if (1000/server.hz <= LRU_CLOCK_RESOLUTION) {
        atomicGet(server.lruclock,lruclock);
    } else {
        lruclock = getLRUClock();
    }
    return lruclock;
}

/* Given an object, return the minimum number of milliseconds for which the
 * object was not requested, using an approximated LRU algorithm. */
unsigned long long estimateObjectIdleTime(robj *o) {
    unsigned long long lruclock = LRU_CLOCK();
    if (lruclock >= o->lru) {
        return (lruclock - o->lru) * LRU_CLOCK_RESOLUTION;
    } else {
        return (lruclock + (LRU_CLOCK_MAX - o->lru)) *
                    LRU_CLOCK_RESOLUTION;
    }
}
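
/* Illustrative example of the wrap-around branch above, assuming the usual
 * LRU_CLOCK_RESOLUTION of 1000 ms: with a current clock of 10 and an object
 * whose o->lru is LRU_CLOCK_MAX-2 (the clock wrapped once since the last
 * access), the estimated idle time is
 * (10 + (LRU_CLOCK_MAX - (LRU_CLOCK_MAX-2))) * 1000 = 12000 ms. */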

/* LRU approximation algorithm
 *
 * Redis uses an approximation of the LRU algorithm that runs in constant
 * memory. Every time there is a key to evict, we sample N keys (with
 * N very small, usually around 5) to populate a pool of M best eviction
 * candidates (the pool size is defined by EVPOOL_SIZE).
 *
 * The N sampled keys are added to the pool of good eviction candidates
 * (the ones with an old access time) if they are better than one of the
 * current keys in the pool.
 *
 * After the pool is populated, the best key we have in the pool is evicted.
 * However note that we don't remove keys from the pool when they are deleted
 * so the pool may contain keys that no longer exist.
 *
 * When we try to evict a key, and all the entries in the pool don't exist
 * we populate it again. This time we'll be sure that the pool has at least
 * one key that can be evicted, if there is at least one key that can be
 * evicted in the whole database. */
/* Create a new eviction pool. */
void evictionPoolAlloc(void) {
    struct evictionPoolEntry *ep;
    int j;

    ep = zmalloc(sizeof(*ep)*EVPOOL_SIZE);
    for (j = 0; j < EVPOOL_SIZE; j++) {
        ep[j].idle = 0;
        ep[j].key = NULL;
        ep[j].cached = sdsnewlen(NULL,EVPOOL_CACHED_SDS_SIZE);
        ep[j].dbid = 0;
    }
    EvictionPoolLRU = ep;
}

/* This is a helper function for performEvictions(), it is used in order
 * to populate the evictionPool with a few entries every time we want to
 * evict a key. Keys with an idle time greater than one of the current
 * keys are added. Keys are always added if there are free entries.
 *
 * We insert keys in place in ascending order, so keys with the smaller
 * idle time are on the left, and keys with the greater idle time on the
 * right. */

void evictionPoolPopulate(int dbid, dict *sampledict, dict *keydict, struct evictionPoolEntry *pool) {
    int j, k, count;
    dictEntry *samples[server.maxmemory_samples];

    count = dictGetSomeKeys(sampledict,samples,server.maxmemory_samples);
    for (j = 0; j < count; j++) {
        unsigned long long idle;
        sds key;
        robj *o;
        dictEntry *de;

        de = samples[j];
        key = dictGetKey(de);

        /* If the dictionary we are sampling from is not the main
         * dictionary (but the expires one) we need to lookup the key
         * again in the key dictionary to obtain the value object. */
        if (server.maxmemory_policy != MAXMEMORY_VOLATILE_TTL) {
            if (sampledict != keydict) de = dictFind(keydict, key);
            o = dictGetVal(de);
        }

        /* Calculate the idle time according to the policy. This is called
         * idle just because the code initially handled LRU, but is in fact
         * just a score where a higher score means a better candidate. */
        if (server.maxmemory_policy & MAXMEMORY_FLAG_LRU) {
            idle = estimateObjectIdleTime(o);
        } else if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
            /* When we use an LRU policy, we sort the keys by idle time
             * so that we evict keys starting from the greatest idle time.
             * However when the policy is an LFU one, we have a frequency
             * estimation, and we want to evict keys with lower frequency
             * first. So inside the pool we put objects using the inverted
             * frequency, subtracting the actual frequency from the maximum
             * frequency of 255. */
            idle = 255-LFUDecrAndReturn(o);
        } else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) {
            /* In this case the sooner the expire the better. */
            idle = ULLONG_MAX - (long)dictGetVal(de);
        } else {
            serverPanic("Unknown eviction policy in evictionPoolPopulate()");
        }

        /* Insert the element inside the pool.
         * First, find the first empty bucket or the first populated
         * bucket that has an idle time smaller than our idle time. */
        k = 0;
        while (k < EVPOOL_SIZE &&
               pool[k].key &&
               pool[k].idle < idle) k++;
        if (k == 0 && pool[EVPOOL_SIZE-1].key != NULL) {
            /* Can't insert if the element is < the worst element we have
             * and there are no empty buckets. */
            continue;
        } else if (k < EVPOOL_SIZE && pool[k].key == NULL) {
            /* Inserting into empty position. No setup needed before insert. */
        } else {
            /* Inserting in the middle. Now k points to the first element
             * greater than the element to insert. */
            if (pool[EVPOOL_SIZE-1].key == NULL) {
                /* Free space on the right? Insert at k shifting
                 * all the elements from k to the end to the right. */

                /* Save SDS before overwriting. */
                sds cached = pool[EVPOOL_SIZE-1].cached;
                memmove(pool+k+1,pool+k,
                    sizeof(pool[0])*(EVPOOL_SIZE-k-1));
                pool[k].cached = cached;
            } else {
                /* No free space on the right? Insert at k-1 */
                k--;
                /* Shift all elements on the left of k (included) to the
                 * left, so we discard the element with smaller idle time. */
                sds cached = pool[0].cached; /* Save SDS before overwriting. */
                if (pool[0].key != pool[0].cached) sdsfree(pool[0].key);
                memmove(pool,pool+1,sizeof(pool[0])*k);
                pool[k].cached = cached;
            }
        }

        /* Try to reuse the cached SDS string allocated in the pool entry,
         * because allocating and deallocating this object is costly
         * (according to the profiler, not my fantasy. Remember:
         * premature optimization bla bla bla). */
        int klen = sdslen(key);
        if (klen > EVPOOL_CACHED_SDS_SIZE) {
            pool[k].key = sdsdup(key);
        } else {
            memcpy(pool[k].cached,key,klen+1);
            sdssetlen(pool[k].cached,klen);
            pool[k].key = pool[k].cached;
        }
        pool[k].idle = idle;
        pool[k].dbid = dbid;
    }
}
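
/* Worked example of the insertion logic above: if the populated entries of
 * the pool have idle times {1, 5, 9} with free buckets on the right, a
 * sampled key with idle time 7 stops the scan at k == 2 (the slot holding
 * 9), the entries from k onward are shifted right, and the pool becomes
 * {1, 5, 7, 9}. If instead the pool were full, the element at index 0 (the
 * smallest idle time, i.e. the worst eviction candidate) would be discarded
 * and the new key inserted at k-1. */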

/* ----------------------------------------------------------------------------
 * LFU (Least Frequently Used) implementation.
 *
 * We have 24 total bits of space in each object in order to implement
 * an LFU (Least Frequently Used) eviction policy, since we re-use the
 * LRU field for this purpose.
 *
 * We split the 24 bits into two fields:
 *
 *           16 bits      8 bits
 *      +----------------+--------+
 *      | Last decr time | LOG_C  |
 *      +----------------+--------+
 *
 * LOG_C is a logarithmic counter that provides an indication of the access
 * frequency. However this field must also be decremented otherwise what used
 * to be a frequently accessed key in the past, will remain ranked like that
 * forever, while we want the algorithm to adapt to access pattern changes.
 *
 * So the remaining 16 bits are used in order to store the "decrement time",
 * a reduced-precision Unix time (we take 16 bits of the time converted
 * in minutes since we don't care about wrapping around) recording when the
 * LOG_C counter was last decayed.
 *
 * New keys don't start at zero, in order to have the ability to collect
 * some accesses before being trashed away, so they start at LFU_INIT_VAL.
 * The logarithmic increment performed on LOG_C takes care of LFU_INIT_VAL
 * when incrementing the key, so that keys starting at LFU_INIT_VAL
 * (or having a smaller value) have a very high chance of being incremented
 * on access.
 *
 * During decrement, the value of the logarithmic counter is lowered by one
 * for every server.lfu_decay_time minutes elapsed since the last decrement,
 * saturating at zero (see LFUDecrAndReturn()).
 * --------------------------------------------------------------------------*/
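
/* For reference, the two fields are decoded as done in LFUDecrAndReturn()
 * below:
 *
 *     unsigned long ldt = o->lru >> 8;      (last decrement time, minutes)
 *     unsigned long counter = o->lru & 255; (logarithmic access counter)
 */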

/* Return the current time in minutes, just taking the least significant
 * 16 bits. The returned time is suitable to be stored as LDT (last decrement
 * time) for the LFU implementation. */
unsigned long LFUGetTimeInMinutes(void) {
    return (server.unixtime/60) & 65535;
}

/* Given an object last access time, compute the minimum number of minutes
 * that elapsed since the last access. Handle overflow (ldt greater than
 * the current 16 bits minutes time) considering the time as wrapping
 * exactly once. */
unsigned long LFUTimeElapsed(unsigned long ldt) {
    unsigned long now = LFUGetTimeInMinutes();
    if (now >= ldt) return now-ldt;
    return 65535-ldt+now;
}
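
/* Wrap-around example: if now == 10 and ldt == 65530, the 16 bit clock
 * wrapped once and the elapsed time is 65535-65530+10 = 15 minutes. */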

/* Logarithmically increment a counter. The greater the current counter
 * value, the less likely it is to be really incremented. The counter
 * saturates at 255. */
uint8_t LFULogIncr(uint8_t counter) {
    if (counter == 255) return 255;
    double r = (double)rand()/RAND_MAX;
    double baseval = counter - LFU_INIT_VAL;
    if (baseval < 0) baseval = 0;
    double p = 1.0/(baseval*server.lfu_log_factor+1);
    if (r < p) counter++;
    return counter;
}
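
/* Illustrative numbers derived from the formula above, assuming the default
 * lfu_log_factor of 10: a counter at LFU_INIT_VAL (or below) is incremented
 * with probability 1, since baseval is clamped to 0 and p == 1.0, while a
 * counter at LFU_INIT_VAL+100 is incremented with probability 1/(100*10+1),
 * i.e. roughly one access in a thousand moves it one step further. */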

/* If the object's decrement time is reached, decrement the LFU counter but
 * do not update the LFU fields of the object: we update the access time
 * and counter in an explicit way when the object is really accessed.
 * The counter is decremented by one for every server.lfu_decay_time
 * minutes elapsed since the last decrement.
 * Return the object frequency counter.
 *
 * This function is used in order to scan the dataset for the best object
 * to evict: as we check for the candidate, we incrementally decrement the
 * counter of the scanned objects if needed. */
unsigned long LFUDecrAndReturn(robj *o) {
    unsigned long ldt = o->lru >> 8;
    unsigned long counter = o->lru & 255;
    unsigned long num_periods = server.lfu_decay_time ? LFUTimeElapsed(ldt) / server.lfu_decay_time : 0;
    if (num_periods)
        counter = (num_periods > counter) ? 0 : counter - num_periods;
    return counter;
}
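
/* Decay example: with lfu_decay_time == 1 and 10 minutes elapsed since the
 * last decrement, num_periods is 10, so a counter of 25 decays to 15, while
 * a counter of 7 saturates at 0. */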

/* We don't want to count AOF buffers and slaves output buffers as
 * used memory: the eviction should be driven mostly by the data size,
 * because counting those buffers can cause a feedback loop: pushing DELs
 * into them makes them bigger, which makes us evict more keys, which
 * generates more DELs, possibly leading to a massive eviction loop even
 * after all keys are evicted.
 *
 * This function returns the sum of the AOF and replication buffer sizes. */
size_t freeMemoryGetNotCountedMemory(void) {
    size_t overhead = 0;

    /* Since all replicas and the replication backlog share the global
     * replication buffer, we consider only the part exceeding the backlog
     * size as the extra separate consumption of replicas.
     *
     * Note that although the backlog is also initially incrementally grown
     * (pushing DELs consumes memory), it'll eventually stop growing and
     * remain constant in size, so even if its creation will cause some
     * eviction, it's capped, and also here to stay (no resonance effect).
     *
     * Note that, because we trim the backlog incrementally in the
     * background, the backlog size may exceed our setting if slow replicas
     * that reference vast replication buffer blocks disconnect. To avoid a
     * massive eviction loop, we don't count the lazily-freed replication
     * backlog into used memory even if there are no replicas, i.e. we still
     * regard this memory as the replicas'. */
    if ((long long)server.repl_buffer_mem > server.repl_backlog_size) {
        /* We use a list structure to manage the replication buffer blocks,
         * so the backlog also occupies some extra memory. We can't know the
         * exact number of blocks, so we only compute an approximate size
         * based on the per-block size. */
        size_t extra_approx_size =
            (server.repl_backlog_size/PROTO_REPLY_CHUNK_BYTES + 1) *
            (sizeof(replBufBlock)+sizeof(listNode));
        size_t counted_mem = server.repl_backlog_size + extra_approx_size;
        if (server.repl_buffer_mem > counted_mem) {
            overhead += (server.repl_buffer_mem - counted_mem);
        }
    }

    if (server.aof_state != AOF_OFF) {
        overhead += sdsAllocSize(server.aof_buf);
    }
    return overhead;
}

/* Get the memory status from the point of view of the maxmemory directive:
 * if the memory used is under the maxmemory setting then C_OK is returned.
 * Otherwise, if we are over the memory limit, the function returns
 * C_ERR.
 *
 * The function may return additional info via reference, only if the
 * pointers to the respective arguments are not NULL. Certain fields are
 * populated only when C_ERR is returned:
 *
 *  'total'     total amount of bytes used.
 *              (Populated both for C_ERR and C_OK)
 *
 *  'logical'   the amount of memory used minus the slaves/AOF buffers.
 *              (Populated when C_ERR is returned)
 *
 *  'tofree'    the amount of memory that should be released
 *              in order to return back into the memory limits.
 *              (Populated when C_ERR is returned)
 *
 *  'level'     this usually ranges from 0 to 1, and reports the amount of
 *              memory currently used. May be > 1 if we are over the memory
 *              limit.
 *              (Populated both for C_ERR and C_OK)
 */
int getMaxmemoryState(size_t *total, size_t *logical, size_t *tofree, float *level) {
    size_t mem_reported, mem_used, mem_tofree;

    /* Check if we are over the memory usage limit. If we are not, no need
     * to subtract the slaves output buffers. We can just return ASAP. */
    mem_reported = zmalloc_used_memory();
    if (total) *total = mem_reported;

    /* We may return ASAP if there is no need to compute the level. */
    if (!server.maxmemory) {
        if (level) *level = 0;
        return C_OK;
    }
    if (mem_reported <= server.maxmemory && !level) return C_OK;

    /* Remove the size of slaves output buffers and AOF buffer from the
     * count of used memory. */
    mem_used = mem_reported;
    size_t overhead = freeMemoryGetNotCountedMemory();
    mem_used = (mem_used > overhead) ? mem_used-overhead : 0;

    /* Compute the ratio of memory usage. */
    if (level) *level = (float)mem_used / (float)server.maxmemory;

    if (mem_reported <= server.maxmemory) return C_OK;

    /* Check if we are still over the memory limit. */
    if (mem_used <= server.maxmemory) return C_OK;

    /* Compute how much memory we need to free. */
    mem_tofree = mem_used - server.maxmemory;

    if (logical) *logical = mem_used;
    if (tofree) *tofree = mem_tofree;

    return C_ERR;
}

/* Return 1 if used memory would be more than maxmemory after allocating
 * 'moremem' additional bytes, 0 if not. Redis may reject user requests or
 * evict some keys if used memory exceeds maxmemory, especially when we
 * allocate a huge amount of memory at once. */
int overMaxmemoryAfterAlloc(size_t moremem) {
    if (!server.maxmemory) return 0; /* No limit. */

    /* Check quickly. */
    size_t mem_used = zmalloc_used_memory();
    if (mem_used + moremem <= server.maxmemory) return 0;

    size_t overhead = freeMemoryGetNotCountedMemory();
    mem_used = (mem_used > overhead) ? mem_used - overhead : 0;
    return mem_used + moremem > server.maxmemory;
}

/* The evictionTimeProc is started when "maxmemory" has been breached and
 * could not immediately be resolved. This will spin the event loop with short
 * eviction cycles until the "maxmemory" condition has resolved or there are no
 * more evictable items. */
static int isEvictionProcRunning = 0;
static int evictionTimeProc(
        struct aeEventLoop *eventLoop, long long id, void *clientData) {
    UNUSED(eventLoop);
    UNUSED(id);
    UNUSED(clientData);

    if (performEvictions() == EVICT_RUNNING) return 0; /* keep evicting */

    /* For EVICT_OK - things are good, no need to keep evicting.
     * For EVICT_FAIL - there is nothing left to evict. */
    isEvictionProcRunning = 0;
    return AE_NOMORE;
}

void startEvictionTimeProc(void) {
    if (!isEvictionProcRunning) {
        isEvictionProcRunning = 1;
        aeCreateTimeEvent(server.el, 0,
                evictionTimeProc, NULL, NULL);
    }
}

/* Check if it's safe to perform evictions.
 *   Returns 1 if evictions can be performed
 *   Returns 0 if eviction processing should be skipped
 */
static int isSafeToPerformEvictions(void) {
    /* - There must be no script in a timeout condition.
     * - We must not be loading data right now. */
    if (isInsideYieldingLongCommand() || server.loading) return 0;

    /* By default replicas should ignore maxmemory
     * and just be exact copies of their masters. */
    if (server.masterhost && server.repl_slave_ignore_maxmemory) return 0;

    /* When clients are paused the dataset should be static not just from the
     * POV of clients not being able to write, but also from the POV of
     * expires and evictions of keys not being performed. */
    if (checkClientPauseTimeoutAndReturnIfPaused()) return 0;

    return 1;
}

/* Algorithm for converting tenacity (0-100) to a time limit. */
static unsigned long evictionTimeLimitUs(void) {
    serverAssert(server.maxmemory_eviction_tenacity >= 0);
    serverAssert(server.maxmemory_eviction_tenacity <= 100);

    if (server.maxmemory_eviction_tenacity <= 10) {
        /* A linear progression from 0..500us */
        return 50uL * server.maxmemory_eviction_tenacity;
    }

    if (server.maxmemory_eviction_tenacity < 100) {
        /* A 15% geometric progression, resulting in a limit of ~2 min at tenacity==99 */
        return (unsigned long)(500.0 * pow(1.15, server.maxmemory_eviction_tenacity - 10.0));
    }

    return ULONG_MAX; /* No limit to eviction time */
}
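
/* For reference, a few values computed from the formulas above: tenacity 10
 * maps to 500us, tenacity 50 to about 500*1.15^40 ~= 134ms, and tenacity 99
 * to about 500*1.15^89 ~= 126s (the ~2 minutes mentioned above). */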

/* Check that memory usage is within the current "maxmemory" limit. If over
 * "maxmemory", attempt to free memory by evicting data (if it's safe to do so).
 *
 * It's possible for Redis to suddenly be significantly over the "maxmemory"
 * setting. This can happen if there is a large allocation (like a hash table
 * resize) or even if the "maxmemory" setting is manually adjusted. Because of
 * this, it's important to evict for a managed period of time - otherwise Redis
 * would become unresponsive while evicting.
 *
 * The goal of this function is to improve the memory situation - not to
 * immediately resolve it. In the case that some items have been evicted but
 * the "maxmemory" limit has not been achieved, an aeTimeProc will be started
 * which will continue to evict items until memory limits are achieved or
 * nothing more is evictable.
 *
 * This should be called before execution of commands. If EVICT_FAIL is
 * returned, commands which will result in increased memory usage should be
 * rejected.
 *
 * Returns:
 *   EVICT_OK       - memory is OK or it's not possible to perform evictions now
 *   EVICT_RUNNING  - memory is over the limit, but eviction is still processing
 *   EVICT_FAIL     - memory is over the limit, and there's nothing to evict
 * */
int performEvictions(void) {
    /* Note, we don't goto update_metrics here because this check skips eviction
     * as if it wasn't triggered. It's a fake EVICT_OK. */
    if (!isSafeToPerformEvictions()) return EVICT_OK;

    int keys_freed = 0;
    size_t mem_reported, mem_tofree;
    long long mem_freed; /* May be negative */
    mstime_t latency, eviction_latency;
    long long delta;
    int slaves = listLength(server.slaves);
    int result = EVICT_FAIL;

    if (getMaxmemoryState(&mem_reported,NULL,&mem_tofree,NULL) == C_OK) {
        result = EVICT_OK;
        goto update_metrics;
    }

    if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION) {
        result = EVICT_FAIL; /* We need to free memory, but policy forbids. */
        goto update_metrics;
    }

    unsigned long eviction_time_limit_us = evictionTimeLimitUs();

    mem_freed = 0;

    latencyStartMonitor(latency);

    monotime evictionTimer;
    elapsedStart(&evictionTimer);

    /* Unlike active expiration and blocked clients, we can reach here from
     * 'CONFIG SET maxmemory', so we have to back up and restore
     * server.core_propagates. */
    int prev_core_propagates = server.core_propagates;
    serverAssert(server.also_propagate.numops == 0);
    server.core_propagates = 1;
    server.propagate_no_multi = 1;

    while (mem_freed < (long long)mem_tofree) {
        int j, k, i;
        static unsigned int next_db = 0;
        sds bestkey = NULL;
        int bestdbid;
        redisDb *db;
        dict *dict;
        dictEntry *de;

        if (server.maxmemory_policy & (MAXMEMORY_FLAG_LRU|MAXMEMORY_FLAG_LFU) ||
            server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL)
        {
            struct evictionPoolEntry *pool = EvictionPoolLRU;

            while(bestkey == NULL) {
                unsigned long total_keys = 0, keys;

                /* We don't want to make local-db choices when expiring keys,
                 * so we start populating the eviction pool sampling keys
                 * from every DB. */
                for (i = 0; i < server.dbnum; i++) {
                    db = server.db+i;
                    dict = (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) ?
                            db->dict : db->expires;
                    if ((keys = dictSize(dict)) != 0) {
                        evictionPoolPopulate(i, dict, db->dict, pool);
                        total_keys += keys;
                    }
                }
                if (!total_keys) break; /* No keys to evict. */

                /* Go backward from best to worst element to evict. */
                for (k = EVPOOL_SIZE-1; k >= 0; k--) {
                    if (pool[k].key == NULL) continue;
                    bestdbid = pool[k].dbid;

                    if (server.maxmemory_policy & MAXMEMORY_FLAG_ALLKEYS) {
                        de = dictFind(server.db[bestdbid].dict,
                            pool[k].key);
                    } else {
                        de = dictFind(server.db[bestdbid].expires,
                            pool[k].key);
                    }

                    /* Remove the entry from the pool. */
                    if (pool[k].key != pool[k].cached)
                        sdsfree(pool[k].key);
                    pool[k].key = NULL;
                    pool[k].idle = 0;

                    /* If the key exists, it is our pick. Otherwise it is
                     * a ghost and we need to try the next element. */
                    if (de) {
                        bestkey = dictGetKey(de);
                        break;
                    } else {
                        /* Ghost... Iterate again. */
                    }
                }
            }
        }

        /* volatile-random and allkeys-random policy */
        else if (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM ||
                 server.maxmemory_policy == MAXMEMORY_VOLATILE_RANDOM)
        {
            /* When evicting a random key, we try to evict a key for
             * each DB, so we use the static 'next_db' variable to
             * incrementally visit all DBs. */
            for (i = 0; i < server.dbnum; i++) {
                j = (++next_db) % server.dbnum;
                db = server.db+j;
                dict = (server.maxmemory_policy == MAXMEMORY_ALLKEYS_RANDOM) ?
                        db->dict : db->expires;
                if (dictSize(dict) != 0) {
                    de = dictGetRandomKey(dict);
                    bestkey = dictGetKey(de);
                    bestdbid = j;
                    break;
                }
            }
        }

        /* Finally remove the selected key. */
        if (bestkey) {
            db = server.db+bestdbid;
            robj *keyobj = createStringObject(bestkey,sdslen(bestkey));
            /* We compute the amount of memory freed by db*Delete() alone.
             * It is possible that actually the memory needed to propagate
             * the DEL in AOF and replication link is greater than the one
             * we are freeing by removing the key, but we can't account for
             * that otherwise we would never exit the loop.
             *
             * Same for CSC invalidation messages generated by signalModifiedKey.
             *
             * AOF and Output buffer memory will be freed eventually so
             * we only care about memory used by the key space. */
            delta = (long long) zmalloc_used_memory();
            latencyStartMonitor(eviction_latency);
            if (server.lazyfree_lazy_eviction)
                dbAsyncDelete(db,keyobj);
            else
                dbSyncDelete(db,keyobj);
            latencyEndMonitor(eviction_latency);
            latencyAddSampleIfNeeded("eviction-del",eviction_latency);
            delta -= (long long) zmalloc_used_memory();
            mem_freed += delta;
            server.stat_evictedkeys++;
            signalModifiedKey(NULL,db,keyobj);
            notifyKeyspaceEvent(NOTIFY_EVICTED, "evicted",
                keyobj, db->id);
            propagateDeletion(db,keyobj,server.lazyfree_lazy_eviction);
            decrRefCount(keyobj);
            keys_freed++;

            if (keys_freed % 16 == 0) {
                /* When the memory to free starts to be big enough, we may
                 * start spending so much time here that it is impossible to
                 * deliver data to the replicas fast enough, so we force the
                 * transmission here inside the loop. */
                if (slaves) flushSlavesOutputBuffers();

                /* Normally our stop condition is the ability to release
                 * a fixed, pre-computed amount of memory. However when we
                 * are deleting objects in another thread, it's better to
                 * check, from time to time, if we already reached our target
                 * memory, since the "mem_freed" amount is computed only
                 * across the dbAsyncDelete() call, while the thread can
                 * release the memory all the time. */
                if (server.lazyfree_lazy_eviction) {
                    if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) {
                        break;
                    }
                }

                /* After some time, exit the loop early - even if the memory
                 * limit hasn't been reached. If we suddenly need to free a
                 * lot of memory, we don't want to spend too much time here. */
                if (elapsedUs(evictionTimer) > eviction_time_limit_us) {
                    /* We still need to free memory - start eviction timer proc. */
                    startEvictionTimeProc();
                    break;
                }
            }
        } else {
            goto cant_free; /* nothing to free... */
        }
    }
    /* At this point, the memory is OK, or we have reached the time limit. */
    result = (isEvictionProcRunning) ? EVICT_RUNNING : EVICT_OK;

cant_free:
    if (result == EVICT_FAIL) {
        /* At this point, we have run out of evictable items. It's possible
         * that some items are being freed in the lazyfree thread. Perform a
         * short wait here if such jobs exist, but don't wait long. */
        if (bioPendingJobsOfType(BIO_LAZY_FREE)) {
            usleep(eviction_time_limit_us);
            if (getMaxmemoryState(NULL,NULL,NULL,NULL) == C_OK) {
                result = EVICT_OK;
            }
        }
    }

    serverAssert(server.core_propagates); /* This function should not be re-entrant */

    /* Propagate all DELs */
    propagatePendingCommands();

    server.core_propagates = prev_core_propagates;
    server.propagate_no_multi = 0;

    latencyEndMonitor(latency);
    latencyAddSampleIfNeeded("eviction-cycle",latency);

update_metrics:
    if (result == EVICT_RUNNING || result == EVICT_FAIL) {
        if (server.stat_last_eviction_exceeded_time == 0)
            elapsedStart(&server.stat_last_eviction_exceeded_time);
    } else if (result == EVICT_OK) {
        if (server.stat_last_eviction_exceeded_time != 0) {
            server.stat_total_eviction_exceeded_time += elapsedUs(server.stat_last_eviction_exceeded_time);
            server.stat_last_eviction_exceeded_time = 0;
        }
    }
    return result;
}