1/* The latency monitor allows to easily observe the sources of latency
2 * in a Redis instance using the LATENCY command. Different latency
3 * sources are monitored, like disk I/O, execution of commands, fork
4 * system call, and so forth.
5 *
6 * ----------------------------------------------------------------------------
7 *
8 * Copyright (c) 2014, Salvatore Sanfilippo <antirez at gmail dot com>
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions are met:
13 *
14 * * Redistributions of source code must retain the above copyright notice,
15 * this list of conditions and the following disclaimer.
16 * * Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * * Neither the name of Redis nor the names of its contributors may be used
20 * to endorse or promote products derived from this software without
21 * specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
27 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
34 */
35
36#include "server.h"
37
38/* Dictionary type for latency events. */
39int dictStringKeyCompare(dict *d, const void *key1, const void *key2) {
40 UNUSED(d);
41 return strcmp(key1,key2) == 0;
42}
43
/* Hash callback for the latency events dictionary: hashes the entire
 * NUL-terminated event name string. */
uint64_t dictStringHash(const void *key) {
    const char *s = key;
    return dictGenHashFunction(s, strlen(s));
}
47
48void dictVanillaFree(dict *d, void *val);
49
/* Dictionary type mapping event names (heap-allocated C strings) to
 * 'struct latencyTimeSeries' values. Keys and values are both released
 * through dictVanillaFree() when entries are deleted, so the dictionary
 * owns both the event name copy and the time series. */
dictType latencyTimeSeriesDictType = {
    dictStringHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictStringKeyCompare,       /* key compare */
    dictVanillaFree,            /* key destructor */
    dictVanillaFree,            /* val destructor */
    NULL                        /* allow to expand */
};
59
60/* ------------------------- Utility functions ------------------------------ */
61
62/* Report the amount of AnonHugePages in smap, in bytes. If the return
63 * value of the function is non-zero, the process is being targeted by
64 * THP support, and is likely to have memory usage / latency issues. */
65int THPGetAnonHugePagesSize(void) {
66 return zmalloc_get_smap_bytes_by_field("AnonHugePages:",-1);
67}
68
69/* ---------------------------- Latency API --------------------------------- */
70
71/* Latency monitor initialization. We just need to create the dictionary
72 * of time series, each time series is created on demand in order to avoid
73 * having a fixed list to maintain. */
74void latencyMonitorInit(void) {
75 server.latency_events = dictCreate(&latencyTimeSeriesDictType);
76}
77
78/* Add the specified sample to the specified time series "event".
79 * This function is usually called via latencyAddSampleIfNeeded(), that
80 * is a macro that only adds the sample if the latency is higher than
81 * server.latency_monitor_threshold. */
82void latencyAddSample(const char *event, mstime_t latency) {
83 struct latencyTimeSeries *ts = dictFetchValue(server.latency_events,event);
84 time_t now = time(NULL);
85 int prev;
86
87 /* Create the time series if it does not exist. */
88 if (ts == NULL) {
89 ts = zmalloc(sizeof(*ts));
90 ts->idx = 0;
91 ts->max = 0;
92 memset(ts->samples,0,sizeof(ts->samples));
93 dictAdd(server.latency_events,zstrdup(event),ts);
94 }
95
96 if (latency > ts->max) ts->max = latency;
97
98 /* If the previous sample is in the same second, we update our old sample
99 * if this latency is > of the old one, or just return. */
100 prev = (ts->idx + LATENCY_TS_LEN - 1) % LATENCY_TS_LEN;
101 if (ts->samples[prev].time == now) {
102 if (latency > ts->samples[prev].latency)
103 ts->samples[prev].latency = latency;
104 return;
105 }
106
107 ts->samples[ts->idx].time = now;
108 ts->samples[ts->idx].latency = latency;
109
110 ts->idx++;
111 if (ts->idx == LATENCY_TS_LEN) ts->idx = 0;
112}
113
114/* Reset data for the specified event, or all the events data if 'event' is
115 * NULL.
116 *
117 * Note: this is O(N) even when event_to_reset is not NULL because makes
118 * the code simpler and we have a small fixed max number of events. */
119int latencyResetEvent(char *event_to_reset) {
120 dictIterator *di;
121 dictEntry *de;
122 int resets = 0;
123
124 di = dictGetSafeIterator(server.latency_events);
125 while((de = dictNext(di)) != NULL) {
126 char *event = dictGetKey(de);
127
128 if (event_to_reset == NULL || strcasecmp(event,event_to_reset) == 0) {
129 dictDelete(server.latency_events, event);
130 resets++;
131 }
132 }
133 dictReleaseIterator(di);
134 return resets;
135}
136
137/* ------------------------ Latency reporting (doctor) ---------------------- */
138
139/* Analyze the samples available for a given event and return a structure
140 * populate with different metrics, average, MAD, min, max, and so forth.
141 * Check latency.h definition of struct latencyStats for more info.
142 * If the specified event has no elements the structure is populate with
143 * zero values. */
144void analyzeLatencyForEvent(char *event, struct latencyStats *ls) {
145 struct latencyTimeSeries *ts = dictFetchValue(server.latency_events,event);
146 int j;
147 uint64_t sum;
148
149 ls->all_time_high = ts ? ts->max : 0;
150 ls->avg = 0;
151 ls->min = 0;
152 ls->max = 0;
153 ls->mad = 0;
154 ls->samples = 0;
155 ls->period = 0;
156 if (!ts) return;
157
158 /* First pass, populate everything but the MAD. */
159 sum = 0;
160 for (j = 0; j < LATENCY_TS_LEN; j++) {
161 if (ts->samples[j].time == 0) continue;
162 ls->samples++;
163 if (ls->samples == 1) {
164 ls->min = ls->max = ts->samples[j].latency;
165 } else {
166 if (ls->min > ts->samples[j].latency)
167 ls->min = ts->samples[j].latency;
168 if (ls->max < ts->samples[j].latency)
169 ls->max = ts->samples[j].latency;
170 }
171 sum += ts->samples[j].latency;
172
173 /* Track the oldest event time in ls->period. */
174 if (ls->period == 0 || ts->samples[j].time < ls->period)
175 ls->period = ts->samples[j].time;
176 }
177
178 /* So far avg is actually the sum of the latencies, and period is
179 * the oldest event time. We need to make the first an average and
180 * the second a range of seconds. */
181 if (ls->samples) {
182 ls->avg = sum / ls->samples;
183 ls->period = time(NULL) - ls->period;
184 if (ls->period == 0) ls->period = 1;
185 }
186
187 /* Second pass, compute MAD. */
188 sum = 0;
189 for (j = 0; j < LATENCY_TS_LEN; j++) {
190 int64_t delta;
191
192 if (ts->samples[j].time == 0) continue;
193 delta = (int64_t)ls->avg - ts->samples[j].latency;
194 if (delta < 0) delta = -delta;
195 sum += delta;
196 }
197 if (ls->samples) ls->mad = sum / ls->samples;
198}
199
/* Create a human readable report of latency events for this Redis instance.
 * The returned sds string is owned by the caller, who must sdsfree() it.
 *
 * The function walks every registered latency event, prints its stats,
 * and accumulates boolean "advise_*" flags; a final section turns each
 * set flag into a human readable suggestion. 'advices' counts how many
 * flags were raised so we can detect the "events but no advice" case. */
sds createLatencyReport(void) {
    sds report = sdsempty();
    int advise_better_vm = 0;       /* Better virtual machines. */
    int advise_slowlog_enabled = 0; /* Enable slowlog. */
    int advise_slowlog_tuning = 0;  /* Reconfigure slowlog. */
    int advise_slowlog_inspect = 0; /* Check your slowlog. */
    int advise_disk_contention = 0; /* Try to lower disk contention. */
    int advise_scheduler = 0;       /* Intrinsic latency. */
    int advise_data_writeback = 0;  /* data=writeback. */
    int advise_no_appendfsync = 0;  /* don't fsync during rewrites. */
    int advise_local_disk = 0;      /* Avoid remote disks. */
    int advise_ssd = 0;             /* Use an SSD drive. */
    int advise_write_load_info = 0; /* Print info about AOF and write load. */
    int advise_hz = 0;              /* Use higher HZ. */
    int advise_large_objects = 0;   /* Deletion of large objects. */
    int advise_mass_eviction = 0;   /* Avoid mass eviction of keys. */
    int advise_relax_fsync_policy = 0; /* appendfsync always is slow. */
    int advise_disable_thp = 0;     /* AnonHugePages detected. */
    int advices = 0;

    /* Return ASAP if the latency engine is disabled and it looks like it
     * was never enabled so far. */
    if (dictSize(server.latency_events) == 0 &&
        server.latency_monitor_threshold == 0)
    {
        report = sdscat(report,"I'm sorry, Dave, I can't do that. Latency monitoring is disabled in this Redis instance. You may use \"CONFIG SET latency-monitor-threshold <milliseconds>.\" in order to enable it. If we weren't in a deep space mission I'd suggest to take a look at https://redis.io/topics/latency-monitor.\n");
        return report;
    }

    /* Show all the events stats and add for each event some event-related
     * comment depending on the values. */
    dictIterator *di;
    dictEntry *de;
    int eventnum = 0;

    di = dictGetSafeIterator(server.latency_events);
    while((de = dictNext(di)) != NULL) {
        char *event = dictGetKey(de);
        struct latencyTimeSeries *ts = dictGetVal(de);
        struct latencyStats ls;

        if (ts == NULL) continue;
        eventnum++;
        /* Print the report preamble only once, before the first event. */
        if (eventnum == 1) {
            report = sdscat(report,"Dave, I have observed latency spikes in this Redis instance. You don't mind talking about it, do you Dave?\n\n");
        }
        analyzeLatencyForEvent(event,&ls);

        report = sdscatprintf(report,
            "%d. %s: %d latency spikes (average %lums, mean deviation %lums, period %.2f sec). Worst all time event %lums.",
            eventnum, event,
            ls.samples,
            (unsigned long) ls.avg,
            (unsigned long) ls.mad,
            (double) ls.period/ls.samples,
            (unsigned long) ts->max);

        /* Fork: classify fork throughput (GB/sec) and suggest a better VM
         * when it is below 25 GB/sec. */
        if (!strcasecmp(event,"fork")) {
            char *fork_quality;
            if (server.stat_fork_rate < 10) {
                fork_quality = "terrible";
                advise_better_vm = 1;
                advices++;
            } else if (server.stat_fork_rate < 25) {
                fork_quality = "poor";
                advise_better_vm = 1;
                advices++;
            } else if (server.stat_fork_rate < 100) {
                fork_quality = "good";
            } else {
                fork_quality = "excellent";
            }
            report = sdscatprintf(report,
                " Fork rate is %.2f GB/sec (%s).", server.stat_fork_rate,
                fork_quality);
        }

        /* Potentially commands: suggest enabling or tuning the slowlog so
         * the slow commands can actually be identified. */
        if (!strcasecmp(event,"command")) {
            if (server.slowlog_log_slower_than < 0) {
                advise_slowlog_enabled = 1;
                advices++;
            } else if (server.slowlog_log_slower_than/1000 >
                       server.latency_monitor_threshold)
            {
                advise_slowlog_tuning = 1;
                advices++;
            }
            advise_slowlog_inspect = 1;
            advise_large_objects = 1;
            advices += 2;
        }

        /* fast-command: even O(1)/O(log N) commands are slow, so the
         * problem is likely scheduling / intrinsic latency. */
        if (!strcasecmp(event,"fast-command")) {
            advise_scheduler = 1;
            advices++;
        }

        /* AOF and I/O. */
        if (!strcasecmp(event,"aof-write-pending-fsync")) {
            advise_local_disk = 1;
            advise_disk_contention = 1;
            advise_ssd = 1;
            advise_data_writeback = 1;
            advices += 4;
        }

        if (!strcasecmp(event,"aof-write-active-child")) {
            advise_no_appendfsync = 1;
            advise_data_writeback = 1;
            advise_ssd = 1;
            advices += 3;
        }

        if (!strcasecmp(event,"aof-write-alone")) {
            advise_local_disk = 1;
            advise_data_writeback = 1;
            advise_ssd = 1;
            advices += 3;
        }

        if (!strcasecmp(event,"aof-fsync-always")) {
            advise_relax_fsync_policy = 1;
            advices++;
        }

        if (!strcasecmp(event,"aof-fstat") ||
            !strcasecmp(event,"rdb-unlink-temp-file")) {
            advise_disk_contention = 1;
            advise_local_disk = 1;
            advices += 2;
        }

        if (!strcasecmp(event,"aof-rewrite-diff-write") ||
            !strcasecmp(event,"aof-rename")) {
            advise_write_load_info = 1;
            advise_data_writeback = 1;
            advise_ssd = 1;
            advise_local_disk = 1;
            advices += 4;
        }

        /* Expire cycle. */
        if (!strcasecmp(event,"expire-cycle")) {
            advise_hz = 1;
            advise_large_objects = 1;
            advices += 2;
        }

        /* Eviction cycle. */
        if (!strcasecmp(event,"eviction-del")) {
            advise_large_objects = 1;
            advices++;
        }

        if (!strcasecmp(event,"eviction-cycle")) {
            advise_mass_eviction = 1;
            advices++;
        }

        report = sdscatlen(report,"\n",1);
    }
    dictReleaseIterator(di);

    /* Add non event based advices. */
    if (THPGetAnonHugePagesSize() > 0) {
        advise_disable_thp = 1;
        advices++;
    }

    if (eventnum == 0 && advices == 0) {
        report = sdscat(report,"Dave, no latency spike was observed during the lifetime of this Redis instance, not in the slightest bit. I honestly think you ought to sit down calmly, take a stress pill, and think things over.\n");
    } else if (eventnum > 0 && advices == 0) {
        report = sdscat(report,"\nWhile there are latency events logged, I'm not able to suggest any easy fix. Please use the Redis community to get some help, providing this report in your help request.\n");
    } else {
        /* Add all the suggestions accumulated so far. */

        /* Better VM. */
        report = sdscat(report,"\nI have a few advices for you:\n\n");
        if (advise_better_vm) {
            report = sdscat(report,"- If you are using a virtual machine, consider upgrading it with a faster one using a hypervisior that provides less latency during fork() calls. Xen is known to have poor fork() performance. Even in the context of the same VM provider, certain kinds of instances can execute fork faster than others.\n");
        }

        /* Slow log. */
        if (advise_slowlog_enabled) {
            report = sdscatprintf(report,"- There are latency issues with potentially slow commands you are using. Try to enable the Slow Log Redis feature using the command 'CONFIG SET slowlog-log-slower-than %llu'. If the Slow log is disabled Redis is not able to log slow commands execution for you.\n", (unsigned long long)server.latency_monitor_threshold*1000);
        }

        if (advise_slowlog_tuning) {
            report = sdscatprintf(report,"- Your current Slow Log configuration only logs events that are slower than your configured latency monitor threshold. Please use 'CONFIG SET slowlog-log-slower-than %llu'.\n", (unsigned long long)server.latency_monitor_threshold*1000);
        }

        if (advise_slowlog_inspect) {
            report = sdscat(report,"- Check your Slow Log to understand what are the commands you are running which are too slow to execute. Please check https://redis.io/commands/slowlog for more information.\n");
        }

        /* Intrinsic latency. */
        if (advise_scheduler) {
            report = sdscat(report,"- The system is slow to execute Redis code paths not containing system calls. This usually means the system does not provide Redis CPU time to run for long periods. You should try to:\n"
                                   "  1) Lower the system load.\n"
                                   "  2) Use a computer / VM just for Redis if you are running other software in the same system.\n"
                                   "  3) Check if you have a \"noisy neighbour\" problem.\n"
                                   "  4) Check with 'redis-cli --intrinsic-latency 100' what is the intrinsic latency in your system.\n"
                                   "  5) Check if the problem is allocator-related by recompiling Redis with MALLOC=libc, if you are using Jemalloc. However this may create fragmentation problems.\n");
        }

        /* AOF / Disk latency. */
        if (advise_local_disk) {
            report = sdscat(report,"- It is strongly advised to use local disks for persistence, especially if you are using AOF. Remote disks provided by platform-as-a-service providers are known to be slow.\n");
        }

        if (advise_ssd) {
            report = sdscat(report,"- SSD disks are able to reduce fsync latency, and total time needed for snapshotting and AOF log rewriting (resulting in smaller memory usage). With extremely high write load SSD disks can be a good option. However Redis should perform reasonably with high load using normal disks. Use this advice as a last resort.\n");
        }

        if (advise_data_writeback) {
            report = sdscat(report,"- Mounting ext3/4 filesystems with data=writeback can provide a performance boost compared to data=ordered, however this mode of operation provides less guarantees, and sometimes it can happen that after a hard crash the AOF file will have a half-written command at the end and will require to be repaired before Redis restarts.\n");
        }

        if (advise_disk_contention) {
            report = sdscat(report,"- Try to lower the disk contention. This is often caused by other disk intensive processes running in the same computer (including other Redis instances).\n");
        }

        if (advise_no_appendfsync) {
            report = sdscat(report,"- Assuming from the point of view of data safety this is viable in your environment, you could try to enable the 'no-appendfsync-on-rewrite' option, so that fsync will not be performed while there is a child rewriting the AOF file or producing an RDB file (the moment where there is high disk contention).\n");
        }

        if (advise_relax_fsync_policy && server.aof_fsync == AOF_FSYNC_ALWAYS) {
            report = sdscat(report,"- Your fsync policy is set to 'always'. It is very hard to get good performances with such a setup, if possible try to relax the fsync policy to 'onesec'.\n");
        }

        if (advise_write_load_info) {
            report = sdscat(report,"- Latency during the AOF atomic rename operation or when the final difference is flushed to the AOF file at the end of the rewrite, sometimes is caused by very high write load, causing the AOF buffer to get very large. If possible try to send less commands to accomplish the same work, or use Lua scripts to group multiple operations into a single EVALSHA call.\n");
        }

        if (advise_hz && server.hz < 100) {
            report = sdscat(report,"- In order to make the Redis keys expiring process more incremental, try to set the 'hz' configuration parameter to 100 using 'CONFIG SET hz 100'.\n");
        }

        if (advise_large_objects) {
            report = sdscat(report,"- Deleting, expiring or evicting (because of maxmemory policy) large objects is a blocking operation. If you have very large objects that are often deleted, expired, or evicted, try to fragment those objects into multiple smaller objects.\n");
        }

        if (advise_mass_eviction) {
            report = sdscat(report,"- Sudden changes to the 'maxmemory' setting via 'CONFIG SET', or allocation of large objects via sets or sorted sets intersections, STORE option of SORT, Redis Cluster large keys migrations (RESTORE command), may create sudden memory pressure forcing the server to block trying to evict keys. \n");
        }

        if (advise_disable_thp) {
            report = sdscat(report,"- I detected a non zero amount of anonymous huge pages used by your process. This creates very serious latency events in different conditions, especially when Redis is persisting on disk. To disable THP support use the command 'echo never > /sys/kernel/mm/transparent_hugepage/enabled', make sure to also add it into /etc/rc.local so that the command will be executed again after a reboot. Note that even if you have already disabled THP, you still need to restart the Redis process to get rid of the huge pages already created.\n");
        }
    }

    return report;
}
457
458/* ---------------------- Latency command implementation -------------------- */
459
460/* latencyCommand() helper to produce a map of time buckets,
461 * each representing a latency range,
462 * between 1 nanosecond and roughly 1 second.
463 * Each bucket covers twice the previous bucket's range.
464 * Empty buckets are not printed.
465 * Everything above 1 sec is considered +Inf.
466 * At max there will be log2(1000000000)=30 buckets */
467void fillCommandCDF(client *c, struct hdr_histogram* histogram) {
468 addReplyMapLen(c,2);
469 addReplyBulkCString(c,"calls");
470 addReplyLongLong(c,(long long) histogram->total_count);
471 addReplyBulkCString(c,"histogram_usec");
472 void *replylen = addReplyDeferredLen(c);
473 int samples = 0;
474 struct hdr_iter iter;
475 hdr_iter_log_init(&iter,histogram,1024,2);
476 int64_t previous_count = 0;
477 while (hdr_iter_next(&iter)) {
478 const int64_t micros = iter.highest_equivalent_value / 1000;
479 const int64_t cumulative_count = iter.cumulative_count;
480 if(cumulative_count > previous_count){
481 addReplyLongLong(c,(long long) micros);
482 addReplyLongLong(c,(long long) cumulative_count);
483 samples++;
484 }
485 previous_count = cumulative_count;
486 }
487 setDeferredMapLen(c,replylen,samples);
488}
489
490/* latencyCommand() helper to produce for all commands,
491 * a per command cumulative distribution of latencies. */
492void latencyAllCommandsFillCDF(client *c, dict *commands, int *command_with_data) {
493 dictIterator *di = dictGetSafeIterator(commands);
494 dictEntry *de;
495 struct redisCommand *cmd;
496
497 while((de = dictNext(di)) != NULL) {
498 cmd = (struct redisCommand *) dictGetVal(de);
499 if (cmd->latency_histogram) {
500 addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname));
501 fillCommandCDF(c, cmd->latency_histogram);
502 (*command_with_data)++;
503 }
504
505 if (cmd->subcommands) {
506 latencyAllCommandsFillCDF(c, cmd->subcommands_dict, command_with_data);
507 }
508 }
509 dictReleaseIterator(di);
510}
511
512/* latencyCommand() helper to produce for a specific command set,
513 * a per command cumulative distribution of latencies. */
514void latencySpecificCommandsFillCDF(client *c) {
515 void *replylen = addReplyDeferredLen(c);
516 int command_with_data = 0;
517 for (int j = 2; j < c->argc; j++){
518 struct redisCommand *cmd = lookupCommandBySds(c->argv[j]->ptr);
519 /* If the command does not exist we skip the reply */
520 if (cmd == NULL) {
521 continue;
522 }
523
524 if (cmd->latency_histogram) {
525 addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname));
526 fillCommandCDF(c, cmd->latency_histogram);
527 command_with_data++;
528 }
529
530 if (cmd->subcommands_dict) {
531 dictEntry *de;
532 dictIterator *di = dictGetSafeIterator(cmd->subcommands_dict);
533
534 while ((de = dictNext(di)) != NULL) {
535 struct redisCommand *sub = dictGetVal(de);
536 if (sub->latency_histogram) {
537 addReplyBulkCBuffer(c, sub->fullname, sdslen(sub->fullname));
538 fillCommandCDF(c, sub->latency_histogram);
539 command_with_data++;
540 }
541 }
542 dictReleaseIterator(di);
543 }
544 }
545 setDeferredMapLen(c,replylen,command_with_data);
546}
547
548/* latencyCommand() helper to produce a time-delay reply for all the samples
549 * in memory for the specified time series. */
550void latencyCommandReplyWithSamples(client *c, struct latencyTimeSeries *ts) {
551 void *replylen = addReplyDeferredLen(c);
552 int samples = 0, j;
553
554 for (j = 0; j < LATENCY_TS_LEN; j++) {
555 int i = (ts->idx + j) % LATENCY_TS_LEN;
556
557 if (ts->samples[i].time == 0) continue;
558 addReplyArrayLen(c,2);
559 addReplyLongLong(c,ts->samples[i].time);
560 addReplyLongLong(c,ts->samples[i].latency);
561 samples++;
562 }
563 setDeferredArrayLen(c,replylen,samples);
564}
565
566/* latencyCommand() helper to produce the reply for the LATEST subcommand,
567 * listing the last latency sample for every event type registered so far. */
568void latencyCommandReplyWithLatestEvents(client *c) {
569 dictIterator *di;
570 dictEntry *de;
571
572 addReplyArrayLen(c,dictSize(server.latency_events));
573 di = dictGetIterator(server.latency_events);
574 while((de = dictNext(di)) != NULL) {
575 char *event = dictGetKey(de);
576 struct latencyTimeSeries *ts = dictGetVal(de);
577 int last = (ts->idx + LATENCY_TS_LEN - 1) % LATENCY_TS_LEN;
578
579 addReplyArrayLen(c,4);
580 addReplyBulkCString(c,event);
581 addReplyLongLong(c,ts->samples[last].time);
582 addReplyLongLong(c,ts->samples[last].latency);
583 addReplyLongLong(c,ts->max);
584 }
585 dictReleaseIterator(di);
586}
587
588#define LATENCY_GRAPH_COLS 80
589sds latencyCommandGenSparkeline(char *event, struct latencyTimeSeries *ts) {
590 int j;
591 struct sequence *seq = createSparklineSequence();
592 sds graph = sdsempty();
593 uint32_t min = 0, max = 0;
594
595 for (j = 0; j < LATENCY_TS_LEN; j++) {
596 int i = (ts->idx + j) % LATENCY_TS_LEN;
597 int elapsed;
598 char buf[64];
599
600 if (ts->samples[i].time == 0) continue;
601 /* Update min and max. */
602 if (seq->length == 0) {
603 min = max = ts->samples[i].latency;
604 } else {
605 if (ts->samples[i].latency > max) max = ts->samples[i].latency;
606 if (ts->samples[i].latency < min) min = ts->samples[i].latency;
607 }
608 /* Use as label the number of seconds / minutes / hours / days
609 * ago the event happened. */
610 elapsed = time(NULL) - ts->samples[i].time;
611 if (elapsed < 60)
612 snprintf(buf,sizeof(buf),"%ds",elapsed);
613 else if (elapsed < 3600)
614 snprintf(buf,sizeof(buf),"%dm",elapsed/60);
615 else if (elapsed < 3600*24)
616 snprintf(buf,sizeof(buf),"%dh",elapsed/3600);
617 else
618 snprintf(buf,sizeof(buf),"%dd",elapsed/(3600*24));
619 sparklineSequenceAddSample(seq,ts->samples[i].latency,buf);
620 }
621
622 graph = sdscatprintf(graph,
623 "%s - high %lu ms, low %lu ms (all time high %lu ms)\n", event,
624 (unsigned long) max, (unsigned long) min, (unsigned long) ts->max);
625 for (j = 0; j < LATENCY_GRAPH_COLS; j++)
626 graph = sdscatlen(graph,"-",1);
627 graph = sdscatlen(graph,"\n",1);
628 graph = sparklineRender(graph,seq,LATENCY_GRAPH_COLS,4,SPARKLINE_FILL);
629 freeSparklineSequence(seq);
630 return graph;
631}
632
/* LATENCY command implementations.
 *
 * LATENCY HISTORY: return time-latency samples for the specified event.
 * LATENCY LATEST: return the latest latency for all the events classes.
 * LATENCY DOCTOR: returns a human readable analysis of instance latency.
 * LATENCY GRAPH: provide an ASCII graph of the latency of the specified event.
 * LATENCY RESET: reset data of a specified event or all the data if no event provided.
 * LATENCY HISTOGRAM: return a cumulative distribution of latencies in the format of an histogram for the specified command names.
 */
void latencyCommand(client *c) {
    struct latencyTimeSeries *ts;

    if (!strcasecmp(c->argv[1]->ptr,"history") && c->argc == 3) {
        /* LATENCY HISTORY <event> */
        ts = dictFetchValue(server.latency_events,c->argv[2]->ptr);
        /* Unknown events reply with an empty array rather than an error. */
        if (ts == NULL) {
            addReplyArrayLen(c,0);
        } else {
            latencyCommandReplyWithSamples(c,ts);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"graph") && c->argc == 3) {
        /* LATENCY GRAPH <event> */
        sds graph;
        dictEntry *de;
        char *event;

        de = dictFind(server.latency_events,c->argv[2]->ptr);
        if (de == NULL) goto nodataerr;
        ts = dictGetVal(de);
        /* Use the dictionary-owned key as the graph title so its lifetime
         * outlives this call. */
        event = dictGetKey(de);

        graph = latencyCommandGenSparkeline(event,ts);
        addReplyVerbatim(c,graph,sdslen(graph),"txt");
        sdsfree(graph);
    } else if (!strcasecmp(c->argv[1]->ptr,"latest") && c->argc == 2) {
        /* LATENCY LATEST */
        latencyCommandReplyWithLatestEvents(c);
    } else if (!strcasecmp(c->argv[1]->ptr,"doctor") && c->argc == 2) {
        /* LATENCY DOCTOR */
        sds report = createLatencyReport();

        addReplyVerbatim(c,report,sdslen(report),"txt");
        sdsfree(report);
    } else if (!strcasecmp(c->argv[1]->ptr,"reset") && c->argc >= 2) {
        /* LATENCY RESET [<event> ...]: with no events resets everything,
         * otherwise resets each named event, replying with the total
         * number of time series actually deleted. */
        if (c->argc == 2) {
            addReplyLongLong(c,latencyResetEvent(NULL));
        } else {
            int j, resets = 0;

            for (j = 2; j < c->argc; j++)
                resets += latencyResetEvent(c->argv[j]->ptr);
            addReplyLongLong(c,resets);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"histogram") && c->argc >= 2) {
        /* LATENCY HISTOGRAM [<command> ...]: without arguments replies
         * with the histogram of every command that has one. */
        if (c->argc == 2) {
            int command_with_data = 0;
            void *replylen = addReplyDeferredLen(c);
            latencyAllCommandsFillCDF(c, server.commands, &command_with_data);
            setDeferredMapLen(c, replylen, command_with_data);
        } else {
            latencySpecificCommandsFillCDF(c);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"help") && c->argc == 2) {
        const char *help[] = {
"DOCTOR",
"    Return a human readable latency analysis report.",
"GRAPH <event>",
"    Return an ASCII latency graph for the <event> class.",
"HISTORY <event>",
"    Return time-latency samples for the <event> class.",
"LATEST",
"    Return the latest latency samples for all events.",
"RESET [<event> ...]",
"    Reset latency data of one or more <event> classes.",
"    (default: reset all data for all event classes)",
"HISTOGRAM [COMMAND ...]",
"    Return a cumulative distribution of latencies in the format of a histogram for the specified command names.",
"    If no commands are specified then all histograms are replied.",
NULL
        };
        addReplyHelp(c, help);
    } else {
        addReplySubcommandSyntaxError(c);
    }
    return;

nodataerr:
    /* Common error when the user asks for an event we have no latency
     * information about. */
    addReplyErrorFormat(c,
        "No samples available for event '%s'", (char*) c->argv[2]->ptr);
}
727
728