1/* Redis Object implementation.
2 *
3 * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * * Redistributions of source code must retain the above copyright notice,
10 * this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * * Neither the name of Redis nor the names of its contributors may be used
15 * to endorse or promote products derived from this software without
16 * specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "server.h"
32#include "functions.h"
33#include <math.h>
34#include <ctype.h>
35
36#ifdef __CYGWIN__
37#define strtold(a,b) ((long double)strtod((a),(b)))
38#endif
39
40/* ===================== Creation and parsing of objects ==================== */
41
42robj *createObject(int type, void *ptr) {
43 robj *o = zmalloc(sizeof(*o));
44 o->type = type;
45 o->encoding = OBJ_ENCODING_RAW;
46 o->ptr = ptr;
47 o->refcount = 1;
48
49 /* Set the LRU to the current lruclock (minutes resolution), or
50 * alternatively the LFU counter. */
51 if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
52 o->lru = (LFUGetTimeInMinutes()<<8) | LFU_INIT_VAL;
53 } else {
54 o->lru = LRU_CLOCK();
55 }
56 return o;
57}
58
59/* Set a special refcount in the object to make it "shared":
60 * incrRefCount and decrRefCount() will test for this special refcount
61 * and will not touch the object. This way it is free to access shared
62 * objects such as small integers from different threads without any
63 * mutex.
64 *
65 * A common patter to create shared objects:
66 *
67 * robj *myobject = makeObjectShared(createObject(...));
68 *
69 */
70robj *makeObjectShared(robj *o) {
71 serverAssert(o->refcount == 1);
72 o->refcount = OBJ_SHARED_REFCOUNT;
73 return o;
74}
75
76/* Create a string object with encoding OBJ_ENCODING_RAW, that is a plain
77 * string object where o->ptr points to a proper sds string. */
78robj *createRawStringObject(const char *ptr, size_t len) {
79 return createObject(OBJ_STRING, sdsnewlen(ptr,len));
80}
81
82/* Create a string object with encoding OBJ_ENCODING_EMBSTR, that is
83 * an object where the sds string is actually an unmodifiable string
84 * allocated in the same chunk as the object itself. */
85robj *createEmbeddedStringObject(const char *ptr, size_t len) {
86 robj *o = zmalloc(sizeof(robj)+sizeof(struct sdshdr8)+len+1);
87 struct sdshdr8 *sh = (void*)(o+1);
88
89 o->type = OBJ_STRING;
90 o->encoding = OBJ_ENCODING_EMBSTR;
91 o->ptr = sh+1;
92 o->refcount = 1;
93 if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
94 o->lru = (LFUGetTimeInMinutes()<<8) | LFU_INIT_VAL;
95 } else {
96 o->lru = LRU_CLOCK();
97 }
98
99 sh->len = len;
100 sh->alloc = len;
101 sh->flags = SDS_TYPE_8;
102 if (ptr == SDS_NOINIT)
103 sh->buf[len] = '\0';
104 else if (ptr) {
105 memcpy(sh->buf,ptr,len);
106 sh->buf[len] = '\0';
107 } else {
108 memset(sh->buf,0,len+1);
109 }
110 return o;
111}
112
113/* Create a string object with EMBSTR encoding if it is smaller than
114 * OBJ_ENCODING_EMBSTR_SIZE_LIMIT, otherwise the RAW encoding is
115 * used.
116 *
117 * The current limit of 44 is chosen so that the biggest string object
118 * we allocate as EMBSTR will still fit into the 64 byte arena of jemalloc. */
119#define OBJ_ENCODING_EMBSTR_SIZE_LIMIT 44
120robj *createStringObject(const char *ptr, size_t len) {
121 if (len <= OBJ_ENCODING_EMBSTR_SIZE_LIMIT)
122 return createEmbeddedStringObject(ptr,len);
123 else
124 return createRawStringObject(ptr,len);
125}
126
127/* Same as CreateRawStringObject, can return NULL if allocation fails */
128robj *tryCreateRawStringObject(const char *ptr, size_t len) {
129 sds str = sdstrynewlen(ptr,len);
130 if (!str) return NULL;
131 return createObject(OBJ_STRING, str);
132}
133
134/* Same as createStringObject, can return NULL if allocation fails */
135robj *tryCreateStringObject(const char *ptr, size_t len) {
136 if (len <= OBJ_ENCODING_EMBSTR_SIZE_LIMIT)
137 return createEmbeddedStringObject(ptr,len);
138 else
139 return tryCreateRawStringObject(ptr,len);
140}
141
142/* Create a string object from a long long value. When possible returns a
143 * shared integer object, or at least an integer encoded one.
144 *
145 * If valueobj is non zero, the function avoids returning a shared
146 * integer, because the object is going to be used as value in the Redis key
147 * space (for instance when the INCR command is used), so we want LFU/LRU
148 * values specific for each key. */
149robj *createStringObjectFromLongLongWithOptions(long long value, int valueobj) {
150 robj *o;
151
152 if (server.maxmemory == 0 ||
153 !(server.maxmemory_policy & MAXMEMORY_FLAG_NO_SHARED_INTEGERS))
154 {
155 /* If the maxmemory policy permits, we can still return shared integers
156 * even if valueobj is true. */
157 valueobj = 0;
158 }
159
160 if (value >= 0 && value < OBJ_SHARED_INTEGERS && valueobj == 0) {
161 incrRefCount(shared.integers[value]);
162 o = shared.integers[value];
163 } else {
164 if (value >= LONG_MIN && value <= LONG_MAX) {
165 o = createObject(OBJ_STRING, NULL);
166 o->encoding = OBJ_ENCODING_INT;
167 o->ptr = (void*)((long)value);
168 } else {
169 o = createObject(OBJ_STRING,sdsfromlonglong(value));
170 }
171 }
172 return o;
173}
174
175/* Wrapper for createStringObjectFromLongLongWithOptions() always demanding
176 * to create a shared object if possible. */
177robj *createStringObjectFromLongLong(long long value) {
178 return createStringObjectFromLongLongWithOptions(value,0);
179}
180
181/* Wrapper for createStringObjectFromLongLongWithOptions() avoiding a shared
182 * object when LFU/LRU info are needed, that is, when the object is used
183 * as a value in the key space, and Redis is configured to evict based on
184 * LFU/LRU. */
185robj *createStringObjectFromLongLongForValue(long long value) {
186 return createStringObjectFromLongLongWithOptions(value,1);
187}
188
189/* Create a string object from a long double. If humanfriendly is non-zero
190 * it does not use exponential format and trims trailing zeroes at the end,
191 * however this results in loss of precision. Otherwise exp format is used
192 * and the output of snprintf() is not modified.
193 *
194 * The 'humanfriendly' option is used for INCRBYFLOAT and HINCRBYFLOAT. */
195robj *createStringObjectFromLongDouble(long double value, int humanfriendly) {
196 char buf[MAX_LONG_DOUBLE_CHARS];
197 int len = ld2string(buf,sizeof(buf),value,humanfriendly? LD_STR_HUMAN: LD_STR_AUTO);
198 return createStringObject(buf,len);
199}
200
201/* Duplicate a string object, with the guarantee that the returned object
202 * has the same encoding as the original one.
203 *
204 * This function also guarantees that duplicating a small integer object
205 * (or a string object that contains a representation of a small integer)
206 * will always result in a fresh object that is unshared (refcount == 1).
207 *
208 * The resulting object always has refcount set to 1. */
209robj *dupStringObject(const robj *o) {
210 robj *d;
211
212 serverAssert(o->type == OBJ_STRING);
213
214 switch(o->encoding) {
215 case OBJ_ENCODING_RAW:
216 return createRawStringObject(o->ptr,sdslen(o->ptr));
217 case OBJ_ENCODING_EMBSTR:
218 return createEmbeddedStringObject(o->ptr,sdslen(o->ptr));
219 case OBJ_ENCODING_INT:
220 d = createObject(OBJ_STRING, NULL);
221 d->encoding = OBJ_ENCODING_INT;
222 d->ptr = o->ptr;
223 return d;
224 default:
225 serverPanic("Wrong encoding.");
226 break;
227 }
228}
229
230robj *createQuicklistObject(void) {
231 quicklist *l = quicklistCreate();
232 robj *o = createObject(OBJ_LIST,l);
233 o->encoding = OBJ_ENCODING_QUICKLIST;
234 return o;
235}
236
237robj *createSetObject(void) {
238 dict *d = dictCreate(&setDictType);
239 robj *o = createObject(OBJ_SET,d);
240 o->encoding = OBJ_ENCODING_HT;
241 return o;
242}
243
244robj *createIntsetObject(void) {
245 intset *is = intsetNew();
246 robj *o = createObject(OBJ_SET,is);
247 o->encoding = OBJ_ENCODING_INTSET;
248 return o;
249}
250
251robj *createHashObject(void) {
252 unsigned char *zl = lpNew(0);
253 robj *o = createObject(OBJ_HASH, zl);
254 o->encoding = OBJ_ENCODING_LISTPACK;
255 return o;
256}
257
258robj *createZsetObject(void) {
259 zset *zs = zmalloc(sizeof(*zs));
260 robj *o;
261
262 zs->dict = dictCreate(&zsetDictType);
263 zs->zsl = zslCreate();
264 o = createObject(OBJ_ZSET,zs);
265 o->encoding = OBJ_ENCODING_SKIPLIST;
266 return o;
267}
268
269robj *createZsetListpackObject(void) {
270 unsigned char *lp = lpNew(0);
271 robj *o = createObject(OBJ_ZSET,lp);
272 o->encoding = OBJ_ENCODING_LISTPACK;
273 return o;
274}
275
276robj *createStreamObject(void) {
277 stream *s = streamNew();
278 robj *o = createObject(OBJ_STREAM,s);
279 o->encoding = OBJ_ENCODING_STREAM;
280 return o;
281}
282
283robj *createModuleObject(moduleType *mt, void *value) {
284 moduleValue *mv = zmalloc(sizeof(*mv));
285 mv->type = mt;
286 mv->value = value;
287 return createObject(OBJ_MODULE,mv);
288}
289
290void freeStringObject(robj *o) {
291 if (o->encoding == OBJ_ENCODING_RAW) {
292 sdsfree(o->ptr);
293 }
294}
295
296void freeListObject(robj *o) {
297 if (o->encoding == OBJ_ENCODING_QUICKLIST) {
298 quicklistRelease(o->ptr);
299 } else {
300 serverPanic("Unknown list encoding type");
301 }
302}
303
304void freeSetObject(robj *o) {
305 switch (o->encoding) {
306 case OBJ_ENCODING_HT:
307 dictRelease((dict*) o->ptr);
308 break;
309 case OBJ_ENCODING_INTSET:
310 zfree(o->ptr);
311 break;
312 default:
313 serverPanic("Unknown set encoding type");
314 }
315}
316
317void freeZsetObject(robj *o) {
318 zset *zs;
319 switch (o->encoding) {
320 case OBJ_ENCODING_SKIPLIST:
321 zs = o->ptr;
322 dictRelease(zs->dict);
323 zslFree(zs->zsl);
324 zfree(zs);
325 break;
326 case OBJ_ENCODING_LISTPACK:
327 zfree(o->ptr);
328 break;
329 default:
330 serverPanic("Unknown sorted set encoding");
331 }
332}
333
334void freeHashObject(robj *o) {
335 switch (o->encoding) {
336 case OBJ_ENCODING_HT:
337 dictRelease((dict*) o->ptr);
338 break;
339 case OBJ_ENCODING_LISTPACK:
340 lpFree(o->ptr);
341 break;
342 default:
343 serverPanic("Unknown hash encoding type");
344 break;
345 }
346}
347
348void freeModuleObject(robj *o) {
349 moduleValue *mv = o->ptr;
350 mv->type->free(mv->value);
351 zfree(mv);
352}
353
354void freeStreamObject(robj *o) {
355 freeStream(o->ptr);
356}
357
358void incrRefCount(robj *o) {
359 if (o->refcount < OBJ_FIRST_SPECIAL_REFCOUNT) {
360 o->refcount++;
361 } else {
362 if (o->refcount == OBJ_SHARED_REFCOUNT) {
363 /* Nothing to do: this refcount is immutable. */
364 } else if (o->refcount == OBJ_STATIC_REFCOUNT) {
365 serverPanic("You tried to retain an object allocated in the stack");
366 }
367 }
368}
369
370void decrRefCount(robj *o) {
371 if (o->refcount == 1) {
372 switch(o->type) {
373 case OBJ_STRING: freeStringObject(o); break;
374 case OBJ_LIST: freeListObject(o); break;
375 case OBJ_SET: freeSetObject(o); break;
376 case OBJ_ZSET: freeZsetObject(o); break;
377 case OBJ_HASH: freeHashObject(o); break;
378 case OBJ_MODULE: freeModuleObject(o); break;
379 case OBJ_STREAM: freeStreamObject(o); break;
380 default: serverPanic("Unknown object type"); break;
381 }
382 zfree(o);
383 } else {
384 if (o->refcount <= 0) serverPanic("decrRefCount against refcount <= 0");
385 if (o->refcount != OBJ_SHARED_REFCOUNT) o->refcount--;
386 }
387}
388
389/* See dismissObject() */
390void dismissSds(sds s) {
391 dismissMemory(sdsAllocPtr(s), sdsAllocSize(s));
392}
393
394/* See dismissObject() */
395void dismissStringObject(robj *o) {
396 if (o->encoding == OBJ_ENCODING_RAW) {
397 dismissSds(o->ptr);
398 }
399}
400
401/* See dismissObject() */
402void dismissListObject(robj *o, size_t size_hint) {
403 if (o->encoding == OBJ_ENCODING_QUICKLIST) {
404 quicklist *ql = o->ptr;
405 serverAssert(ql->len != 0);
406 /* We iterate all nodes only when average node size is bigger than a
407 * page size, and there's a high chance we'll actually dismiss something. */
408 if (size_hint / ql->len >= server.page_size) {
409 quicklistNode *node = ql->head;
410 while (node) {
411 if (quicklistNodeIsCompressed(node)) {
412 dismissMemory(node->entry, ((quicklistLZF*)node->entry)->sz);
413 } else {
414 dismissMemory(node->entry, node->sz);
415 }
416 node = node->next;
417 }
418 }
419 } else {
420 serverPanic("Unknown list encoding type");
421 }
422}
423
424/* See dismissObject() */
425void dismissSetObject(robj *o, size_t size_hint) {
426 if (o->encoding == OBJ_ENCODING_HT) {
427 dict *set = o->ptr;
428 serverAssert(dictSize(set) != 0);
429 /* We iterate all nodes only when average member size is bigger than a
430 * page size, and there's a high chance we'll actually dismiss something. */
431 if (size_hint / dictSize(set) >= server.page_size) {
432 dictEntry *de;
433 dictIterator *di = dictGetIterator(set);
434 while ((de = dictNext(di)) != NULL) {
435 dismissSds(dictGetKey(de));
436 }
437 dictReleaseIterator(di);
438 }
439
440 /* Dismiss hash table memory. */
441 dismissMemory(set->ht_table[0], DICTHT_SIZE(set->ht_size_exp[0])*sizeof(dictEntry*));
442 dismissMemory(set->ht_table[1], DICTHT_SIZE(set->ht_size_exp[1])*sizeof(dictEntry*));
443 } else if (o->encoding == OBJ_ENCODING_INTSET) {
444 dismissMemory(o->ptr, intsetBlobLen((intset*)o->ptr));
445 } else {
446 serverPanic("Unknown set encoding type");
447 }
448}
449
450/* See dismissObject() */
451void dismissZsetObject(robj *o, size_t size_hint) {
452 if (o->encoding == OBJ_ENCODING_SKIPLIST) {
453 zset *zs = o->ptr;
454 zskiplist *zsl = zs->zsl;
455 serverAssert(zsl->length != 0);
456 /* We iterate all nodes only when average member size is bigger than a
457 * page size, and there's a high chance we'll actually dismiss something. */
458 if (size_hint / zsl->length >= server.page_size) {
459 zskiplistNode *zn = zsl->tail;
460 while (zn != NULL) {
461 dismissSds(zn->ele);
462 zn = zn->backward;
463 }
464 }
465
466 /* Dismiss hash table memory. */
467 dict *d = zs->dict;
468 dismissMemory(d->ht_table[0], DICTHT_SIZE(d->ht_size_exp[0])*sizeof(dictEntry*));
469 dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1])*sizeof(dictEntry*));
470 } else if (o->encoding == OBJ_ENCODING_LISTPACK) {
471 dismissMemory(o->ptr, lpBytes((unsigned char*)o->ptr));
472 } else {
473 serverPanic("Unknown zset encoding type");
474 }
475}
476
477/* See dismissObject() */
478void dismissHashObject(robj *o, size_t size_hint) {
479 if (o->encoding == OBJ_ENCODING_HT) {
480 dict *d = o->ptr;
481 serverAssert(dictSize(d) != 0);
482 /* We iterate all fields only when average field/value size is bigger than
483 * a page size, and there's a high chance we'll actually dismiss something. */
484 if (size_hint / dictSize(d) >= server.page_size) {
485 dictEntry *de;
486 dictIterator *di = dictGetIterator(d);
487 while ((de = dictNext(di)) != NULL) {
488 /* Only dismiss values memory since the field size
489 * usually is small. */
490 dismissSds(dictGetVal(de));
491 }
492 dictReleaseIterator(di);
493 }
494
495 /* Dismiss hash table memory. */
496 dismissMemory(d->ht_table[0], DICTHT_SIZE(d->ht_size_exp[0])*sizeof(dictEntry*));
497 dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1])*sizeof(dictEntry*));
498 } else if (o->encoding == OBJ_ENCODING_LISTPACK) {
499 dismissMemory(o->ptr, lpBytes((unsigned char*)o->ptr));
500 } else {
501 serverPanic("Unknown hash encoding type");
502 }
503}
504
505/* See dismissObject() */
506void dismissStreamObject(robj *o, size_t size_hint) {
507 stream *s = o->ptr;
508 rax *rax = s->rax;
509 if (raxSize(rax) == 0) return;
510
511 /* Iterate only on stream entries, although size_hint may include serialized
512 * consumer groups info, but usually, stream entries take up most of
513 * the space. */
514 if (size_hint / raxSize(rax) >= server.page_size) {
515 raxIterator ri;
516 raxStart(&ri,rax);
517 raxSeek(&ri,"^",NULL,0);
518 while (raxNext(&ri)) {
519 dismissMemory(ri.data, lpBytes(ri.data));
520 }
521 raxStop(&ri);
522 }
523}
524
525/* When creating a snapshot in a fork child process, the main process and child
526 * process share the same physical memory pages, and if / when the parent
527 * modifies any keys due to write traffic, it'll cause CoW which consume
528 * physical memory. In the child process, after serializing the key and value,
529 * the data is definitely not accessed again, so to avoid unnecessary CoW, we
530 * try to release their memory back to OS. see dismissMemory().
531 *
532 * Because of the cost of iterating all node/field/member/entry of complex data
533 * types, we iterate and dismiss them only when approximate average we estimate
534 * the size of an individual allocation is more than a page size of OS.
535 * 'size_hint' is the size of serialized value. This method is not accurate, but
536 * it can reduce unnecessary iteration for complex data types that are probably
537 * not going to release any memory. */
538void dismissObject(robj *o, size_t size_hint) {
539 /* madvise(MADV_DONTNEED) may not work if Transparent Huge Pages is enabled. */
540 if (server.thp_enabled) return;
541
542 /* Currently we use zmadvise_dontneed only when we use jemalloc with Linux.
543 * so we avoid these pointless loops when they're not going to do anything. */
544#if defined(USE_JEMALLOC) && defined(__linux__)
545 if (o->refcount != 1) return;
546 switch(o->type) {
547 case OBJ_STRING: dismissStringObject(o); break;
548 case OBJ_LIST: dismissListObject(o, size_hint); break;
549 case OBJ_SET: dismissSetObject(o, size_hint); break;
550 case OBJ_ZSET: dismissZsetObject(o, size_hint); break;
551 case OBJ_HASH: dismissHashObject(o, size_hint); break;
552 case OBJ_STREAM: dismissStreamObject(o, size_hint); break;
553 default: break;
554 }
555#else
556 UNUSED(o); UNUSED(size_hint);
557#endif
558}
559
560/* This variant of decrRefCount() gets its argument as void, and is useful
561 * as free method in data structures that expect a 'void free_object(void*)'
562 * prototype for the free method. */
563void decrRefCountVoid(void *o) {
564 decrRefCount(o);
565}
566
567int checkType(client *c, robj *o, int type) {
568 /* A NULL is considered an empty key */
569 if (o && o->type != type) {
570 addReplyErrorObject(c,shared.wrongtypeerr);
571 return 1;
572 }
573 return 0;
574}
575
576int isSdsRepresentableAsLongLong(sds s, long long *llval) {
577 return string2ll(s,sdslen(s),llval) ? C_OK : C_ERR;
578}
579
580int isObjectRepresentableAsLongLong(robj *o, long long *llval) {
581 serverAssertWithInfo(NULL,o,o->type == OBJ_STRING);
582 if (o->encoding == OBJ_ENCODING_INT) {
583 if (llval) *llval = (long) o->ptr;
584 return C_OK;
585 } else {
586 return isSdsRepresentableAsLongLong(o->ptr,llval);
587 }
588}
589
590/* Optimize the SDS string inside the string object to require little space,
591 * in case there is more than 10% of free space at the end of the SDS
592 * string. This happens because SDS strings tend to overallocate to avoid
593 * wasting too much time in allocations when appending to the string. */
594void trimStringObjectIfNeeded(robj *o) {
595 if (o->encoding == OBJ_ENCODING_RAW &&
596 sdsavail(o->ptr) > sdslen(o->ptr)/10)
597 {
598 o->ptr = sdsRemoveFreeSpace(o->ptr);
599 }
600}
601
602/* Try to encode a string object in order to save space */
603robj *tryObjectEncoding(robj *o) {
604 long value;
605 sds s = o->ptr;
606 size_t len;
607
608 /* Make sure this is a string object, the only type we encode
609 * in this function. Other types use encoded memory efficient
610 * representations but are handled by the commands implementing
611 * the type. */
612 serverAssertWithInfo(NULL,o,o->type == OBJ_STRING);
613
614 /* We try some specialized encoding only for objects that are
615 * RAW or EMBSTR encoded, in other words objects that are still
616 * in represented by an actually array of chars. */
617 if (!sdsEncodedObject(o)) return o;
618
619 /* It's not safe to encode shared objects: shared objects can be shared
620 * everywhere in the "object space" of Redis and may end in places where
621 * they are not handled. We handle them only as values in the keyspace. */
622 if (o->refcount > 1) return o;
623
624 /* Check if we can represent this string as a long integer.
625 * Note that we are sure that a string larger than 20 chars is not
626 * representable as a 32 nor 64 bit integer. */
627 len = sdslen(s);
628 if (len <= 20 && string2l(s,len,&value)) {
629 /* This object is encodable as a long. Try to use a shared object.
630 * Note that we avoid using shared integers when maxmemory is used
631 * because every object needs to have a private LRU field for the LRU
632 * algorithm to work well. */
633 if ((server.maxmemory == 0 ||
634 !(server.maxmemory_policy & MAXMEMORY_FLAG_NO_SHARED_INTEGERS)) &&
635 value >= 0 &&
636 value < OBJ_SHARED_INTEGERS)
637 {
638 decrRefCount(o);
639 incrRefCount(shared.integers[value]);
640 return shared.integers[value];
641 } else {
642 if (o->encoding == OBJ_ENCODING_RAW) {
643 sdsfree(o->ptr);
644 o->encoding = OBJ_ENCODING_INT;
645 o->ptr = (void*) value;
646 return o;
647 } else if (o->encoding == OBJ_ENCODING_EMBSTR) {
648 decrRefCount(o);
649 return createStringObjectFromLongLongForValue(value);
650 }
651 }
652 }
653
654 /* If the string is small and is still RAW encoded,
655 * try the EMBSTR encoding which is more efficient.
656 * In this representation the object and the SDS string are allocated
657 * in the same chunk of memory to save space and cache misses. */
658 if (len <= OBJ_ENCODING_EMBSTR_SIZE_LIMIT) {
659 robj *emb;
660
661 if (o->encoding == OBJ_ENCODING_EMBSTR) return o;
662 emb = createEmbeddedStringObject(s,sdslen(s));
663 decrRefCount(o);
664 return emb;
665 }
666
667 /* We can't encode the object...
668 *
669 * Do the last try, and at least optimize the SDS string inside
670 * the string object to require little space, in case there
671 * is more than 10% of free space at the end of the SDS string.
672 *
673 * We do that only for relatively large strings as this branch
674 * is only entered if the length of the string is greater than
675 * OBJ_ENCODING_EMBSTR_SIZE_LIMIT. */
676 trimStringObjectIfNeeded(o);
677
678 /* Return the original object. */
679 return o;
680}
681
682/* Get a decoded version of an encoded object (returned as a new object).
683 * If the object is already raw-encoded just increment the ref count. */
684robj *getDecodedObject(robj *o) {
685 robj *dec;
686
687 if (sdsEncodedObject(o)) {
688 incrRefCount(o);
689 return o;
690 }
691 if (o->type == OBJ_STRING && o->encoding == OBJ_ENCODING_INT) {
692 char buf[32];
693
694 ll2string(buf,32,(long)o->ptr);
695 dec = createStringObject(buf,strlen(buf));
696 return dec;
697 } else {
698 serverPanic("Unknown encoding type");
699 }
700}
701
702/* Compare two string objects via strcmp() or strcoll() depending on flags.
703 * Note that the objects may be integer-encoded. In such a case we
704 * use ll2string() to get a string representation of the numbers on the stack
705 * and compare the strings, it's much faster than calling getDecodedObject().
706 *
707 * Important note: when REDIS_COMPARE_BINARY is used a binary-safe comparison
708 * is used. */
709
710#define REDIS_COMPARE_BINARY (1<<0)
711#define REDIS_COMPARE_COLL (1<<1)
712
713int compareStringObjectsWithFlags(robj *a, robj *b, int flags) {
714 serverAssertWithInfo(NULL,a,a->type == OBJ_STRING && b->type == OBJ_STRING);
715 char bufa[128], bufb[128], *astr, *bstr;
716 size_t alen, blen, minlen;
717
718 if (a == b) return 0;
719 if (sdsEncodedObject(a)) {
720 astr = a->ptr;
721 alen = sdslen(astr);
722 } else {
723 alen = ll2string(bufa,sizeof(bufa),(long) a->ptr);
724 astr = bufa;
725 }
726 if (sdsEncodedObject(b)) {
727 bstr = b->ptr;
728 blen = sdslen(bstr);
729 } else {
730 blen = ll2string(bufb,sizeof(bufb),(long) b->ptr);
731 bstr = bufb;
732 }
733 if (flags & REDIS_COMPARE_COLL) {
734 return strcoll(astr,bstr);
735 } else {
736 int cmp;
737
738 minlen = (alen < blen) ? alen : blen;
739 cmp = memcmp(astr,bstr,minlen);
740 if (cmp == 0) return alen-blen;
741 return cmp;
742 }
743}
744
745/* Wrapper for compareStringObjectsWithFlags() using binary comparison. */
746int compareStringObjects(robj *a, robj *b) {
747 return compareStringObjectsWithFlags(a,b,REDIS_COMPARE_BINARY);
748}
749
750/* Wrapper for compareStringObjectsWithFlags() using collation. */
751int collateStringObjects(robj *a, robj *b) {
752 return compareStringObjectsWithFlags(a,b,REDIS_COMPARE_COLL);
753}
754
755/* Equal string objects return 1 if the two objects are the same from the
756 * point of view of a string comparison, otherwise 0 is returned. Note that
757 * this function is faster then checking for (compareStringObject(a,b) == 0)
758 * because it can perform some more optimization. */
759int equalStringObjects(robj *a, robj *b) {
760 if (a->encoding == OBJ_ENCODING_INT &&
761 b->encoding == OBJ_ENCODING_INT){
762 /* If both strings are integer encoded just check if the stored
763 * long is the same. */
764 return a->ptr == b->ptr;
765 } else {
766 return compareStringObjects(a,b) == 0;
767 }
768}
769
770size_t stringObjectLen(robj *o) {
771 serverAssertWithInfo(NULL,o,o->type == OBJ_STRING);
772 if (sdsEncodedObject(o)) {
773 return sdslen(o->ptr);
774 } else {
775 return sdigits10((long)o->ptr);
776 }
777}
778
779int getDoubleFromObject(const robj *o, double *target) {
780 double value;
781
782 if (o == NULL) {
783 value = 0;
784 } else {
785 serverAssertWithInfo(NULL,o,o->type == OBJ_STRING);
786 if (sdsEncodedObject(o)) {
787 if (!string2d(o->ptr, sdslen(o->ptr), &value))
788 return C_ERR;
789 } else if (o->encoding == OBJ_ENCODING_INT) {
790 value = (long)o->ptr;
791 } else {
792 serverPanic("Unknown string encoding");
793 }
794 }
795 *target = value;
796 return C_OK;
797}
798
799int getDoubleFromObjectOrReply(client *c, robj *o, double *target, const char *msg) {
800 double value;
801 if (getDoubleFromObject(o, &value) != C_OK) {
802 if (msg != NULL) {
803 addReplyError(c,(char*)msg);
804 } else {
805 addReplyError(c,"value is not a valid float");
806 }
807 return C_ERR;
808 }
809 *target = value;
810 return C_OK;
811}
812
813int getLongDoubleFromObject(robj *o, long double *target) {
814 long double value;
815
816 if (o == NULL) {
817 value = 0;
818 } else {
819 serverAssertWithInfo(NULL,o,o->type == OBJ_STRING);
820 if (sdsEncodedObject(o)) {
821 if (!string2ld(o->ptr, sdslen(o->ptr), &value))
822 return C_ERR;
823 } else if (o->encoding == OBJ_ENCODING_INT) {
824 value = (long)o->ptr;
825 } else {
826 serverPanic("Unknown string encoding");
827 }
828 }
829 *target = value;
830 return C_OK;
831}
832
833int getLongDoubleFromObjectOrReply(client *c, robj *o, long double *target, const char *msg) {
834 long double value;
835 if (getLongDoubleFromObject(o, &value) != C_OK) {
836 if (msg != NULL) {
837 addReplyError(c,(char*)msg);
838 } else {
839 addReplyError(c,"value is not a valid float");
840 }
841 return C_ERR;
842 }
843 *target = value;
844 return C_OK;
845}
846
847int getLongLongFromObject(robj *o, long long *target) {
848 long long value;
849
850 if (o == NULL) {
851 value = 0;
852 } else {
853 serverAssertWithInfo(NULL,o,o->type == OBJ_STRING);
854 if (sdsEncodedObject(o)) {
855 if (string2ll(o->ptr,sdslen(o->ptr),&value) == 0) return C_ERR;
856 } else if (o->encoding == OBJ_ENCODING_INT) {
857 value = (long)o->ptr;
858 } else {
859 serverPanic("Unknown string encoding");
860 }
861 }
862 if (target) *target = value;
863 return C_OK;
864}
865
866int getLongLongFromObjectOrReply(client *c, robj *o, long long *target, const char *msg) {
867 long long value;
868 if (getLongLongFromObject(o, &value) != C_OK) {
869 if (msg != NULL) {
870 addReplyError(c,(char*)msg);
871 } else {
872 addReplyError(c,"value is not an integer or out of range");
873 }
874 return C_ERR;
875 }
876 *target = value;
877 return C_OK;
878}
879
880int getLongFromObjectOrReply(client *c, robj *o, long *target, const char *msg) {
881 long long value;
882
883 if (getLongLongFromObjectOrReply(c, o, &value, msg) != C_OK) return C_ERR;
884 if (value < LONG_MIN || value > LONG_MAX) {
885 if (msg != NULL) {
886 addReplyError(c,(char*)msg);
887 } else {
888 addReplyError(c,"value is out of range");
889 }
890 return C_ERR;
891 }
892 *target = value;
893 return C_OK;
894}
895
896int getRangeLongFromObjectOrReply(client *c, robj *o, long min, long max, long *target, const char *msg) {
897 if (getLongFromObjectOrReply(c, o, target, msg) != C_OK) return C_ERR;
898 if (*target < min || *target > max) {
899 if (msg != NULL) {
900 addReplyError(c,(char*)msg);
901 } else {
902 addReplyErrorFormat(c,"value is out of range, value must between %ld and %ld", min, max);
903 }
904 return C_ERR;
905 }
906 return C_OK;
907}
908
909int getPositiveLongFromObjectOrReply(client *c, robj *o, long *target, const char *msg) {
910 if (msg) {
911 return getRangeLongFromObjectOrReply(c, o, 0, LONG_MAX, target, msg);
912 } else {
913 return getRangeLongFromObjectOrReply(c, o, 0, LONG_MAX, target, "value is out of range, must be positive");
914 }
915}
916
917int getIntFromObjectOrReply(client *c, robj *o, int *target, const char *msg) {
918 long value;
919
920 if (getRangeLongFromObjectOrReply(c, o, INT_MIN, INT_MAX, &value, msg) != C_OK)
921 return C_ERR;
922
923 *target = value;
924 return C_OK;
925}
926
927char *strEncoding(int encoding) {
928 switch(encoding) {
929 case OBJ_ENCODING_RAW: return "raw";
930 case OBJ_ENCODING_INT: return "int";
931 case OBJ_ENCODING_HT: return "hashtable";
932 case OBJ_ENCODING_QUICKLIST: return "quicklist";
933 case OBJ_ENCODING_LISTPACK: return "listpack";
934 case OBJ_ENCODING_INTSET: return "intset";
935 case OBJ_ENCODING_SKIPLIST: return "skiplist";
936 case OBJ_ENCODING_EMBSTR: return "embstr";
937 case OBJ_ENCODING_STREAM: return "stream";
938 default: return "unknown";
939 }
940}
941
942/* =========================== Memory introspection ========================= */
943
944
945/* This is a helper function with the goal of estimating the memory
946 * size of a radix tree that is used to store Stream IDs.
947 *
948 * Note: to guess the size of the radix tree is not trivial, so we
949 * approximate it considering 16 bytes of data overhead for each
950 * key (the ID), and then adding the number of bare nodes, plus some
951 * overhead due by the data and child pointers. This secret recipe
952 * was obtained by checking the average radix tree created by real
953 * workloads, and then adjusting the constants to get numbers that
954 * more or less match the real memory usage.
955 *
956 * Actually the number of nodes and keys may be different depending
957 * on the insertion speed and thus the ability of the radix tree
958 * to compress prefixes. */
959size_t streamRadixTreeMemoryUsage(rax *rax) {
960 size_t size = sizeof(*rax);
961 size = rax->numele * sizeof(streamID);
962 size += rax->numnodes * sizeof(raxNode);
963 /* Add a fixed overhead due to the aux data pointer, children, ... */
964 size += rax->numnodes * sizeof(long)*30;
965 return size;
966}
967
968/* Returns the size in bytes consumed by the key's value in RAM.
969 * Note that the returned value is just an approximation, especially in the
970 * case of aggregated data types where only "sample_size" elements
971 * are checked and averaged to estimate the total size. */
972#define OBJ_COMPUTE_SIZE_DEF_SAMPLES 5 /* Default sample size. */
973size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) {
974 sds ele, ele2;
975 dict *d;
976 dictIterator *di;
977 struct dictEntry *de;
978 size_t asize = 0, elesize = 0, samples = 0;
979
980 if (o->type == OBJ_STRING) {
981 if(o->encoding == OBJ_ENCODING_INT) {
982 asize = sizeof(*o);
983 } else if(o->encoding == OBJ_ENCODING_RAW) {
984 asize = sdsZmallocSize(o->ptr)+sizeof(*o);
985 } else if(o->encoding == OBJ_ENCODING_EMBSTR) {
986 asize = zmalloc_size((void *)o);
987 } else {
988 serverPanic("Unknown string encoding");
989 }
990 } else if (o->type == OBJ_LIST) {
991 if (o->encoding == OBJ_ENCODING_QUICKLIST) {
992 quicklist *ql = o->ptr;
993 quicklistNode *node = ql->head;
994 asize = sizeof(*o)+sizeof(quicklist);
995 do {
996 elesize += sizeof(quicklistNode)+zmalloc_size(node->entry);
997 samples++;
998 } while ((node = node->next) && samples < sample_size);
999 asize += (double)elesize/samples*ql->len;
1000 } else {
1001 serverPanic("Unknown list encoding");
1002 }
1003 } else if (o->type == OBJ_SET) {
1004 if (o->encoding == OBJ_ENCODING_HT) {
1005 d = o->ptr;
1006 di = dictGetIterator(d);
1007 asize = sizeof(*o)+sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
1008 while((de = dictNext(di)) != NULL && samples < sample_size) {
1009 ele = dictGetKey(de);
1010 elesize += sizeof(struct dictEntry) + sdsZmallocSize(ele);
1011 samples++;
1012 }
1013 dictReleaseIterator(di);
1014 if (samples) asize += (double)elesize/samples*dictSize(d);
1015 } else if (o->encoding == OBJ_ENCODING_INTSET) {
1016 asize = sizeof(*o)+zmalloc_size(o->ptr);
1017 } else {
1018 serverPanic("Unknown set encoding");
1019 }
1020 } else if (o->type == OBJ_ZSET) {
1021 if (o->encoding == OBJ_ENCODING_LISTPACK) {
1022 asize = sizeof(*o)+zmalloc_size(o->ptr);
1023 } else if (o->encoding == OBJ_ENCODING_SKIPLIST) {
1024 d = ((zset*)o->ptr)->dict;
1025 zskiplist *zsl = ((zset*)o->ptr)->zsl;
1026 zskiplistNode *znode = zsl->header->level[0].forward;
1027 asize = sizeof(*o)+sizeof(zset)+sizeof(zskiplist)+sizeof(dict)+
1028 (sizeof(struct dictEntry*)*dictSlots(d))+
1029 zmalloc_size(zsl->header);
1030 while(znode != NULL && samples < sample_size) {
1031 elesize += sdsZmallocSize(znode->ele);
1032 elesize += sizeof(struct dictEntry)+zmalloc_size(znode);
1033 samples++;
1034 znode = znode->level[0].forward;
1035 }
1036 if (samples) asize += (double)elesize/samples*dictSize(d);
1037 } else {
1038 serverPanic("Unknown sorted set encoding");
1039 }
1040 } else if (o->type == OBJ_HASH) {
1041 if (o->encoding == OBJ_ENCODING_LISTPACK) {
1042 asize = sizeof(*o)+zmalloc_size(o->ptr);
1043 } else if (o->encoding == OBJ_ENCODING_HT) {
1044 d = o->ptr;
1045 di = dictGetIterator(d);
1046 asize = sizeof(*o)+sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
1047 while((de = dictNext(di)) != NULL && samples < sample_size) {
1048 ele = dictGetKey(de);
1049 ele2 = dictGetVal(de);
1050 elesize += sdsZmallocSize(ele) + sdsZmallocSize(ele2);
1051 elesize += sizeof(struct dictEntry);
1052 samples++;
1053 }
1054 dictReleaseIterator(di);
1055 if (samples) asize += (double)elesize/samples*dictSize(d);
1056 } else {
1057 serverPanic("Unknown hash encoding");
1058 }
1059 } else if (o->type == OBJ_STREAM) {
1060 stream *s = o->ptr;
1061 asize = sizeof(*o)+sizeof(*s);
1062 asize += streamRadixTreeMemoryUsage(s->rax);
1063
1064 /* Now we have to add the listpacks. The last listpack is often non
1065 * complete, so we estimate the size of the first N listpacks, and
1066 * use the average to compute the size of the first N-1 listpacks, and
1067 * finally add the real size of the last node. */
1068 raxIterator ri;
1069 raxStart(&ri,s->rax);
1070 raxSeek(&ri,"^",NULL,0);
1071 size_t lpsize = 0, samples = 0;
1072 while(samples < sample_size && raxNext(&ri)) {
1073 unsigned char *lp = ri.data;
1074 lpsize += lpBytes(lp);
1075 samples++;
1076 }
1077 if (s->rax->numele <= samples) {
1078 asize += lpsize;
1079 } else {
1080 if (samples) lpsize /= samples; /* Compute the average. */
1081 asize += lpsize * (s->rax->numele-1);
1082 /* No need to check if seek succeeded, we enter this branch only
1083 * if there are a few elements in the radix tree. */
1084 raxSeek(&ri,"$",NULL,0);
1085 raxNext(&ri);
1086 asize += lpBytes(ri.data);
1087 }
1088 raxStop(&ri);
1089
1090 /* Consumer groups also have a non trivial memory overhead if there
1091 * are many consumers and many groups, let's count at least the
1092 * overhead of the pending entries in the groups and consumers
1093 * PELs. */
1094 if (s->cgroups) {
1095 raxStart(&ri,s->cgroups);
1096 raxSeek(&ri,"^",NULL,0);
1097 while(raxNext(&ri)) {
1098 streamCG *cg = ri.data;
1099 asize += sizeof(*cg);
1100 asize += streamRadixTreeMemoryUsage(cg->pel);
1101 asize += sizeof(streamNACK)*raxSize(cg->pel);
1102
1103 /* For each consumer we also need to add the basic data
1104 * structures and the PEL memory usage. */
1105 raxIterator cri;
1106 raxStart(&cri,cg->consumers);
1107 raxSeek(&cri,"^",NULL,0);
1108 while(raxNext(&cri)) {
1109 streamConsumer *consumer = cri.data;
1110 asize += sizeof(*consumer);
1111 asize += sdslen(consumer->name);
1112 asize += streamRadixTreeMemoryUsage(consumer->pel);
1113 /* Don't count NACKs again, they are shared with the
1114 * consumer group PEL. */
1115 }
1116 raxStop(&cri);
1117 }
1118 raxStop(&ri);
1119 }
1120 } else if (o->type == OBJ_MODULE) {
1121 asize = moduleGetMemUsage(key, o, sample_size, dbid);
1122 } else {
1123 serverPanic("Unknown object type");
1124 }
1125 return asize;
1126}
1127
1128/* Release data obtained with getMemoryOverheadData(). */
1129void freeMemoryOverheadData(struct redisMemOverhead *mh) {
1130 zfree(mh->db);
1131 zfree(mh);
1132}
1133
1134/* Return a struct redisMemOverhead filled with memory overhead
1135 * information used for the MEMORY OVERHEAD and INFO command. The returned
1136 * structure pointer should be freed calling freeMemoryOverheadData(). */
1137struct redisMemOverhead *getMemoryOverheadData(void) {
1138 int j;
1139 size_t mem_total = 0;
1140 size_t mem = 0;
1141 size_t zmalloc_used = zmalloc_used_memory();
1142 struct redisMemOverhead *mh = zcalloc(sizeof(*mh));
1143
1144 mh->total_allocated = zmalloc_used;
1145 mh->startup_allocated = server.initial_memory_usage;
1146 mh->peak_allocated = server.stat_peak_memory;
1147 mh->total_frag =
1148 (float)server.cron_malloc_stats.process_rss / server.cron_malloc_stats.zmalloc_used;
1149 mh->total_frag_bytes =
1150 server.cron_malloc_stats.process_rss - server.cron_malloc_stats.zmalloc_used;
1151 mh->allocator_frag =
1152 (float)server.cron_malloc_stats.allocator_active / server.cron_malloc_stats.allocator_allocated;
1153 mh->allocator_frag_bytes =
1154 server.cron_malloc_stats.allocator_active - server.cron_malloc_stats.allocator_allocated;
1155 mh->allocator_rss =
1156 (float)server.cron_malloc_stats.allocator_resident / server.cron_malloc_stats.allocator_active;
1157 mh->allocator_rss_bytes =
1158 server.cron_malloc_stats.allocator_resident - server.cron_malloc_stats.allocator_active;
1159 mh->rss_extra =
1160 (float)server.cron_malloc_stats.process_rss / server.cron_malloc_stats.allocator_resident;
1161 mh->rss_extra_bytes =
1162 server.cron_malloc_stats.process_rss - server.cron_malloc_stats.allocator_resident;
1163
1164 mem_total += server.initial_memory_usage;
1165
1166 /* Replication backlog and replicas share one global replication buffer,
1167 * only if replication buffer memory is more than the repl backlog setting,
1168 * we consider the excess as replicas' memory. Otherwise, replication buffer
1169 * memory is the consumption of repl backlog. */
1170 if (listLength(server.slaves) &&
1171 (long long)server.repl_buffer_mem > server.repl_backlog_size)
1172 {
1173 mh->clients_slaves = server.repl_buffer_mem - server.repl_backlog_size;
1174 mh->repl_backlog = server.repl_backlog_size;
1175 } else {
1176 mh->clients_slaves = 0;
1177 mh->repl_backlog = server.repl_buffer_mem;
1178 }
1179 if (server.repl_backlog) {
1180 /* The approximate memory of rax tree for indexed blocks. */
1181 mh->repl_backlog +=
1182 server.repl_backlog->blocks_index->numnodes * sizeof(raxNode) +
1183 raxSize(server.repl_backlog->blocks_index) * sizeof(void*);
1184 }
1185 mem_total += mh->repl_backlog;
1186 mem_total += mh->clients_slaves;
1187
1188 /* Computing the memory used by the clients would be O(N) if done
1189 * here online. We use our values computed incrementally by
1190 * updateClientMemUsage(). */
1191 mh->clients_normal = server.stat_clients_type_memory[CLIENT_TYPE_MASTER]+
1192 server.stat_clients_type_memory[CLIENT_TYPE_PUBSUB]+
1193 server.stat_clients_type_memory[CLIENT_TYPE_NORMAL];
1194 mem_total += mh->clients_normal;
1195
1196 mh->cluster_links = server.stat_cluster_links_memory;
1197 mem_total += mh->cluster_links;
1198
1199 mem = 0;
1200 if (server.aof_state != AOF_OFF) {
1201 mem += sdsZmallocSize(server.aof_buf);
1202 }
1203 mh->aof_buffer = mem;
1204 mem_total+=mem;
1205
1206 mem = evalScriptsMemory();
1207 mh->lua_caches = mem;
1208 mem_total+=mem;
1209 mh->functions_caches = functionsMemoryOverhead();
1210 mem_total+=mh->functions_caches;
1211
1212 for (j = 0; j < server.dbnum; j++) {
1213 redisDb *db = server.db+j;
1214 long long keyscount = dictSize(db->dict);
1215 if (keyscount==0) continue;
1216
1217 mh->total_keys += keyscount;
1218 mh->db = zrealloc(mh->db,sizeof(mh->db[0])*(mh->num_dbs+1));
1219 mh->db[mh->num_dbs].dbid = j;
1220
1221 mem = dictSize(db->dict) * sizeof(dictEntry) +
1222 dictSlots(db->dict) * sizeof(dictEntry*) +
1223 dictSize(db->dict) * sizeof(robj);
1224 mh->db[mh->num_dbs].overhead_ht_main = mem;
1225 mem_total+=mem;
1226
1227 mem = dictSize(db->expires) * sizeof(dictEntry) +
1228 dictSlots(db->expires) * sizeof(dictEntry*);
1229 mh->db[mh->num_dbs].overhead_ht_expires = mem;
1230 mem_total+=mem;
1231
1232 /* Account for the slot to keys map in cluster mode */
1233 mem = dictSize(db->dict) * dictMetadataSize(db->dict);
1234 mh->db[mh->num_dbs].overhead_ht_slot_to_keys = mem;
1235 mem_total+=mem;
1236
1237 mh->num_dbs++;
1238 }
1239
1240 mh->overhead_total = mem_total;
1241 mh->dataset = zmalloc_used - mem_total;
1242 mh->peak_perc = (float)zmalloc_used*100/mh->peak_allocated;
1243
1244 /* Metrics computed after subtracting the startup memory from
1245 * the total memory. */
1246 size_t net_usage = 1;
1247 if (zmalloc_used > mh->startup_allocated)
1248 net_usage = zmalloc_used - mh->startup_allocated;
1249 mh->dataset_perc = (float)mh->dataset*100/net_usage;
1250 mh->bytes_per_key = mh->total_keys ? (net_usage / mh->total_keys) : 0;
1251
1252 return mh;
1253}
1254
1255/* Helper for "MEMORY allocator-stats", used as a callback for the jemalloc
1256 * stats output. */
1257void inputCatSds(void *result, const char *str) {
1258 /* result is actually a (sds *), so re-cast it here */
1259 sds *info = (sds *)result;
1260 *info = sdscat(*info, str);
1261}
1262
1263/* This implements MEMORY DOCTOR. An human readable analysis of the Redis
1264 * memory condition. */
1265sds getMemoryDoctorReport(void) {
1266 int empty = 0; /* Instance is empty or almost empty. */
1267 int big_peak = 0; /* Memory peak is much larger than used mem. */
1268 int high_frag = 0; /* High fragmentation. */
1269 int high_alloc_frag = 0;/* High allocator fragmentation. */
1270 int high_proc_rss = 0; /* High process rss overhead. */
1271 int high_alloc_rss = 0; /* High rss overhead. */
1272 int big_slave_buf = 0; /* Slave buffers are too big. */
1273 int big_client_buf = 0; /* Client buffers are too big. */
1274 int many_scripts = 0; /* Script cache has too many scripts. */
1275 int num_reports = 0;
1276 struct redisMemOverhead *mh = getMemoryOverheadData();
1277
1278 if (mh->total_allocated < (1024*1024*5)) {
1279 empty = 1;
1280 num_reports++;
1281 } else {
1282 /* Peak is > 150% of current used memory? */
1283 if (((float)mh->peak_allocated / mh->total_allocated) > 1.5) {
1284 big_peak = 1;
1285 num_reports++;
1286 }
1287
1288 /* Fragmentation is higher than 1.4 and 10MB ?*/
1289 if (mh->total_frag > 1.4 && mh->total_frag_bytes > 10<<20) {
1290 high_frag = 1;
1291 num_reports++;
1292 }
1293
1294 /* External fragmentation is higher than 1.1 and 10MB? */
1295 if (mh->allocator_frag > 1.1 && mh->allocator_frag_bytes > 10<<20) {
1296 high_alloc_frag = 1;
1297 num_reports++;
1298 }
1299
1300 /* Allocator rss is higher than 1.1 and 10MB ? */
1301 if (mh->allocator_rss > 1.1 && mh->allocator_rss_bytes > 10<<20) {
1302 high_alloc_rss = 1;
1303 num_reports++;
1304 }
1305
1306 /* Non-Allocator rss is higher than 1.1 and 10MB ? */
1307 if (mh->rss_extra > 1.1 && mh->rss_extra_bytes > 10<<20) {
1308 high_proc_rss = 1;
1309 num_reports++;
1310 }
1311
1312 /* Clients using more than 200k each average? */
1313 long numslaves = listLength(server.slaves);
1314 long numclients = listLength(server.clients)-numslaves;
1315 if (mh->clients_normal / numclients > (1024*200)) {
1316 big_client_buf = 1;
1317 num_reports++;
1318 }
1319
1320 /* Slaves using more than 10 MB each? */
1321 if (numslaves > 0 && mh->clients_slaves > (1024*1024*10)) {
1322 big_slave_buf = 1;
1323 num_reports++;
1324 }
1325
1326 /* Too many scripts are cached? */
1327 if (dictSize(evalScriptsDict()) > 1000) {
1328 many_scripts = 1;
1329 num_reports++;
1330 }
1331 }
1332
1333 sds s;
1334 if (num_reports == 0) {
1335 s = sdsnew(
1336 "Hi Sam, I can't find any memory issue in your instance. "
1337 "I can only account for what occurs on this base.\n");
1338 } else if (empty == 1) {
1339 s = sdsnew(
1340 "Hi Sam, this instance is empty or is using very little memory, "
1341 "my issues detector can't be used in these conditions. "
1342 "Please, leave for your mission on Earth and fill it with some data. "
1343 "The new Sam and I will be back to our programming as soon as I "
1344 "finished rebooting.\n");
1345 } else {
1346 s = sdsnew("Sam, I detected a few issues in this Redis instance memory implants:\n\n");
1347 if (big_peak) {
1348 s = sdscat(s," * Peak memory: In the past this instance used more than 150% the memory that is currently using. The allocator is normally not able to release memory after a peak, so you can expect to see a big fragmentation ratio, however this is actually harmless and is only due to the memory peak, and if the Redis instance Resident Set Size (RSS) is currently bigger than expected, the memory will be used as soon as you fill the Redis instance with more data. If the memory peak was only occasional and you want to try to reclaim memory, please try the MEMORY PURGE command, otherwise the only other option is to shutdown and restart the instance.\n\n");
1349 }
1350 if (high_frag) {
1351 s = sdscatprintf(s," * High total RSS: This instance has a memory fragmentation and RSS overhead greater than 1.4 (this means that the Resident Set Size of the Redis process is much larger than the sum of the logical allocations Redis performed). This problem is usually due either to a large peak memory (check if there is a peak memory entry above in the report) or may result from a workload that causes the allocator to fragment memory a lot. If the problem is a large peak memory, then there is no issue. Otherwise, make sure you are using the Jemalloc allocator and not the default libc malloc. Note: The currently used allocator is \"%s\".\n\n", ZMALLOC_LIB);
1352 }
1353 if (high_alloc_frag) {
1354 s = sdscatprintf(s," * High allocator fragmentation: This instance has an allocator external fragmentation greater than 1.1. This problem is usually due either to a large peak memory (check if there is a peak memory entry above in the report) or may result from a workload that causes the allocator to fragment memory a lot. You can try enabling 'activedefrag' config option.\n\n");
1355 }
1356 if (high_alloc_rss) {
1357 s = sdscatprintf(s," * High allocator RSS overhead: This instance has an RSS memory overhead is greater than 1.1 (this means that the Resident Set Size of the allocator is much larger than the sum what the allocator actually holds). This problem is usually due to a large peak memory (check if there is a peak memory entry above in the report), you can try the MEMORY PURGE command to reclaim it.\n\n");
1358 }
1359 if (high_proc_rss) {
1360 s = sdscatprintf(s," * High process RSS overhead: This instance has non-allocator RSS memory overhead is greater than 1.1 (this means that the Resident Set Size of the Redis process is much larger than the RSS the allocator holds). This problem may be due to Lua scripts or Modules.\n\n");
1361 }
1362 if (big_slave_buf) {
1363 s = sdscat(s," * Big replica buffers: The replica output buffers in this instance are greater than 10MB for each replica (on average). This likely means that there is some replica instance that is struggling receiving data, either because it is too slow or because of networking issues. As a result, data piles on the master output buffers. Please try to identify what replica is not receiving data correctly and why. You can use the INFO output in order to check the replicas delays and the CLIENT LIST command to check the output buffers of each replica.\n\n");
1364 }
1365 if (big_client_buf) {
1366 s = sdscat(s," * Big client buffers: The clients output buffers in this instance are greater than 200K per client (on average). This may result from different causes, like Pub/Sub clients subscribed to channels bot not receiving data fast enough, so that data piles on the Redis instance output buffer, or clients sending commands with large replies or very large sequences of commands in the same pipeline. Please use the CLIENT LIST command in order to investigate the issue if it causes problems in your instance, or to understand better why certain clients are using a big amount of memory.\n\n");
1367 }
1368 if (many_scripts) {
1369 s = sdscat(s," * Many scripts: There seem to be many cached scripts in this instance (more than 1000). This may be because scripts are generated and `EVAL`ed, instead of being parameterized (with KEYS and ARGV), `SCRIPT LOAD`ed and `EVALSHA`ed. Unless `SCRIPT FLUSH` is called periodically, the scripts' caches may end up consuming most of your memory.\n\n");
1370 }
1371 s = sdscat(s,"I'm here to keep you safe, Sam. I want to help you.\n");
1372 }
1373 freeMemoryOverheadData(mh);
1374 return s;
1375}
1376
1377/* Set the object LRU/LFU depending on server.maxmemory_policy.
1378 * The lfu_freq arg is only relevant if policy is MAXMEMORY_FLAG_LFU.
1379 * The lru_idle and lru_clock args are only relevant if policy
1380 * is MAXMEMORY_FLAG_LRU.
1381 * Either or both of them may be <0, in that case, nothing is set. */
1382int objectSetLRUOrLFU(robj *val, long long lfu_freq, long long lru_idle,
1383 long long lru_clock, int lru_multiplier) {
1384 if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
1385 if (lfu_freq >= 0) {
1386 serverAssert(lfu_freq <= 255);
1387 val->lru = (LFUGetTimeInMinutes()<<8) | lfu_freq;
1388 return 1;
1389 }
1390 } else if (lru_idle >= 0) {
1391 /* Provided LRU idle time is in seconds. Scale
1392 * according to the LRU clock resolution this Redis
1393 * instance was compiled with (normally 1000 ms, so the
1394 * below statement will expand to lru_idle*1000/1000. */
1395 lru_idle = lru_idle*lru_multiplier/LRU_CLOCK_RESOLUTION;
1396 long lru_abs = lru_clock - lru_idle; /* Absolute access time. */
1397 /* If the LRU field underflows (since lru_clock is a wrapping clock),
1398 * we need to make it positive again. This be handled by the unwrapping
1399 * code in estimateObjectIdleTime. I.e. imagine a day when lru_clock
1400 * wrap arounds (happens once in some 6 months), and becomes a low
1401 * value, like 10, an lru_idle of 1000 should be near LRU_CLOCK_MAX. */
1402 if (lru_abs < 0)
1403 lru_abs += LRU_CLOCK_MAX;
1404 val->lru = lru_abs;
1405 return 1;
1406 }
1407 return 0;
1408}
1409
1410/* ======================= The OBJECT and MEMORY commands =================== */
1411
1412/* This is a helper function for the OBJECT command. We need to lookup keys
1413 * without any modification of LRU or other parameters. */
1414robj *objectCommandLookup(client *c, robj *key) {
1415 return lookupKeyReadWithFlags(c->db,key,LOOKUP_NOTOUCH|LOOKUP_NONOTIFY);
1416}
1417
1418robj *objectCommandLookupOrReply(client *c, robj *key, robj *reply) {
1419 robj *o = objectCommandLookup(c,key);
1420 if (!o) addReplyOrErrorObject(c, reply);
1421 return o;
1422}
1423
1424/* Object command allows to inspect the internals of a Redis Object.
1425 * Usage: OBJECT <refcount|encoding|idletime|freq> <key> */
1426void objectCommand(client *c) {
1427 robj *o;
1428
1429 if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) {
1430 const char *help[] = {
1431"ENCODING <key>",
1432" Return the kind of internal representation used in order to store the value",
1433" associated with a <key>.",
1434"FREQ <key>",
1435" Return the access frequency index of the <key>. The returned integer is",
1436" proportional to the logarithm of the recent access frequency of the key.",
1437"IDLETIME <key>",
1438" Return the idle time of the <key>, that is the approximated number of",
1439" seconds elapsed since the last access to the key.",
1440"REFCOUNT <key>",
1441" Return the number of references of the value associated with the specified",
1442" <key>.",
1443NULL
1444 };
1445 addReplyHelp(c, help);
1446 } else if (!strcasecmp(c->argv[1]->ptr,"refcount") && c->argc == 3) {
1447 if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.null[c->resp]))
1448 == NULL) return;
1449 addReplyLongLong(c,o->refcount);
1450 } else if (!strcasecmp(c->argv[1]->ptr,"encoding") && c->argc == 3) {
1451 if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.null[c->resp]))
1452 == NULL) return;
1453 addReplyBulkCString(c,strEncoding(o->encoding));
1454 } else if (!strcasecmp(c->argv[1]->ptr,"idletime") && c->argc == 3) {
1455 if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.null[c->resp]))
1456 == NULL) return;
1457 if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) {
1458 addReplyError(c,"An LFU maxmemory policy is selected, idle time not tracked. Please note that when switching between policies at runtime LRU and LFU data will take some time to adjust.");
1459 return;
1460 }
1461 addReplyLongLong(c,estimateObjectIdleTime(o)/1000);
1462 } else if (!strcasecmp(c->argv[1]->ptr,"freq") && c->argc == 3) {
1463 if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.null[c->resp]))
1464 == NULL) return;
1465 if (!(server.maxmemory_policy & MAXMEMORY_FLAG_LFU)) {
1466 addReplyError(c,"An LFU maxmemory policy is not selected, access frequency not tracked. Please note that when switching between policies at runtime LRU and LFU data will take some time to adjust.");
1467 return;
1468 }
1469 /* LFUDecrAndReturn should be called
1470 * in case of the key has not been accessed for a long time,
1471 * because we update the access time only
1472 * when the key is read or overwritten. */
1473 addReplyLongLong(c,LFUDecrAndReturn(o));
1474 } else {
1475 addReplySubcommandSyntaxError(c);
1476 }
1477}
1478
1479/* The memory command will eventually be a complete interface for the
1480 * memory introspection capabilities of Redis.
1481 *
1482 * Usage: MEMORY usage <key> */
1483void memoryCommand(client *c) {
1484 if (!strcasecmp(c->argv[1]->ptr,"help") && c->argc == 2) {
1485 const char *help[] = {
1486"DOCTOR",
1487" Return memory problems reports.",
1488"MALLOC-STATS",
1489" Return internal statistics report from the memory allocator.",
1490"PURGE",
1491" Attempt to purge dirty pages for reclamation by the allocator.",
1492"STATS",
1493" Return information about the memory usage of the server.",
1494"USAGE <key> [SAMPLES <count>]",
1495" Return memory in bytes used by <key> and its value. Nested values are",
1496" sampled up to <count> times (default: 5, 0 means sample all).",
1497NULL
1498 };
1499 addReplyHelp(c, help);
1500 } else if (!strcasecmp(c->argv[1]->ptr,"usage") && c->argc >= 3) {
1501 dictEntry *de;
1502 long long samples = OBJ_COMPUTE_SIZE_DEF_SAMPLES;
1503 for (int j = 3; j < c->argc; j++) {
1504 if (!strcasecmp(c->argv[j]->ptr,"samples") &&
1505 j+1 < c->argc)
1506 {
1507 if (getLongLongFromObjectOrReply(c,c->argv[j+1],&samples,NULL)
1508 == C_ERR) return;
1509 if (samples < 0) {
1510 addReplyErrorObject(c,shared.syntaxerr);
1511 return;
1512 }
1513 if (samples == 0) samples = LLONG_MAX;
1514 j++; /* skip option argument. */
1515 } else {
1516 addReplyErrorObject(c,shared.syntaxerr);
1517 return;
1518 }
1519 }
1520 if ((de = dictFind(c->db->dict,c->argv[2]->ptr)) == NULL) {
1521 addReplyNull(c);
1522 return;
1523 }
1524 size_t usage = objectComputeSize(c->argv[2],dictGetVal(de),samples,c->db->id);
1525 usage += sdsZmallocSize(dictGetKey(de));
1526 usage += sizeof(dictEntry);
1527 usage += dictMetadataSize(c->db->dict);
1528 addReplyLongLong(c,usage);
1529 } else if (!strcasecmp(c->argv[1]->ptr,"stats") && c->argc == 2) {
1530 struct redisMemOverhead *mh = getMemoryOverheadData();
1531
1532 addReplyMapLen(c,27+mh->num_dbs);
1533
1534 addReplyBulkCString(c,"peak.allocated");
1535 addReplyLongLong(c,mh->peak_allocated);
1536
1537 addReplyBulkCString(c,"total.allocated");
1538 addReplyLongLong(c,mh->total_allocated);
1539
1540 addReplyBulkCString(c,"startup.allocated");
1541 addReplyLongLong(c,mh->startup_allocated);
1542
1543 addReplyBulkCString(c,"replication.backlog");
1544 addReplyLongLong(c,mh->repl_backlog);
1545
1546 addReplyBulkCString(c,"clients.slaves");
1547 addReplyLongLong(c,mh->clients_slaves);
1548
1549 addReplyBulkCString(c,"clients.normal");
1550 addReplyLongLong(c,mh->clients_normal);
1551
1552 addReplyBulkCString(c,"cluster.links");
1553 addReplyLongLong(c,mh->cluster_links);
1554
1555 addReplyBulkCString(c,"aof.buffer");
1556 addReplyLongLong(c,mh->aof_buffer);
1557
1558 addReplyBulkCString(c,"lua.caches");
1559 addReplyLongLong(c,mh->lua_caches);
1560
1561 addReplyBulkCString(c,"functions.caches");
1562 addReplyLongLong(c,mh->functions_caches);
1563
1564 for (size_t j = 0; j < mh->num_dbs; j++) {
1565 char dbname[32];
1566 snprintf(dbname,sizeof(dbname),"db.%zd",mh->db[j].dbid);
1567 addReplyBulkCString(c,dbname);
1568 addReplyMapLen(c,3);
1569
1570 addReplyBulkCString(c,"overhead.hashtable.main");
1571 addReplyLongLong(c,mh->db[j].overhead_ht_main);
1572
1573 addReplyBulkCString(c,"overhead.hashtable.expires");
1574 addReplyLongLong(c,mh->db[j].overhead_ht_expires);
1575
1576 addReplyBulkCString(c,"overhead.hashtable.slot-to-keys");
1577 addReplyLongLong(c,mh->db[j].overhead_ht_slot_to_keys);
1578 }
1579
1580
1581 addReplyBulkCString(c,"overhead.total");
1582 addReplyLongLong(c,mh->overhead_total);
1583
1584 addReplyBulkCString(c,"keys.count");
1585 addReplyLongLong(c,mh->total_keys);
1586
1587 addReplyBulkCString(c,"keys.bytes-per-key");
1588 addReplyLongLong(c,mh->bytes_per_key);
1589
1590 addReplyBulkCString(c,"dataset.bytes");
1591 addReplyLongLong(c,mh->dataset);
1592
1593 addReplyBulkCString(c,"dataset.percentage");
1594 addReplyDouble(c,mh->dataset_perc);
1595
1596 addReplyBulkCString(c,"peak.percentage");
1597 addReplyDouble(c,mh->peak_perc);
1598
1599 addReplyBulkCString(c,"allocator.allocated");
1600 addReplyLongLong(c,server.cron_malloc_stats.allocator_allocated);
1601
1602 addReplyBulkCString(c,"allocator.active");
1603 addReplyLongLong(c,server.cron_malloc_stats.allocator_active);
1604
1605 addReplyBulkCString(c,"allocator.resident");
1606 addReplyLongLong(c,server.cron_malloc_stats.allocator_resident);
1607
1608 addReplyBulkCString(c,"allocator-fragmentation.ratio");
1609 addReplyDouble(c,mh->allocator_frag);
1610
1611 addReplyBulkCString(c,"allocator-fragmentation.bytes");
1612 addReplyLongLong(c,mh->allocator_frag_bytes);
1613
1614 addReplyBulkCString(c,"allocator-rss.ratio");
1615 addReplyDouble(c,mh->allocator_rss);
1616
1617 addReplyBulkCString(c,"allocator-rss.bytes");
1618 addReplyLongLong(c,mh->allocator_rss_bytes);
1619
1620 addReplyBulkCString(c,"rss-overhead.ratio");
1621 addReplyDouble(c,mh->rss_extra);
1622
1623 addReplyBulkCString(c,"rss-overhead.bytes");
1624 addReplyLongLong(c,mh->rss_extra_bytes);
1625
1626 addReplyBulkCString(c,"fragmentation"); /* this is the total RSS overhead, including fragmentation */
1627 addReplyDouble(c,mh->total_frag); /* it is kept here for backwards compatibility */
1628
1629 addReplyBulkCString(c,"fragmentation.bytes");
1630 addReplyLongLong(c,mh->total_frag_bytes);
1631
1632 freeMemoryOverheadData(mh);
1633 } else if (!strcasecmp(c->argv[1]->ptr,"malloc-stats") && c->argc == 2) {
1634#if defined(USE_JEMALLOC)
1635 sds info = sdsempty();
1636 je_malloc_stats_print(inputCatSds, &info, NULL);
1637 addReplyVerbatim(c,info,sdslen(info),"txt");
1638 sdsfree(info);
1639#else
1640 addReplyBulkCString(c,"Stats not supported for the current allocator");
1641#endif
1642 } else if (!strcasecmp(c->argv[1]->ptr,"doctor") && c->argc == 2) {
1643 sds report = getMemoryDoctorReport();
1644 addReplyVerbatim(c,report,sdslen(report),"txt");
1645 sdsfree(report);
1646 } else if (!strcasecmp(c->argv[1]->ptr,"purge") && c->argc == 2) {
1647 if (jemalloc_purge() == 0)
1648 addReply(c, shared.ok);
1649 else
1650 addReplyError(c, "Error purging dirty pages");
1651 } else {
1652 addReplySubcommandSyntaxError(c);
1653 }
1654}
1655