1/* The latency monitor allows to easily observe the sources of latency
2 * in a Redis instance using the LATENCY command. Different latency
3 * sources are monitored, like disk I/O, execution of commands, fork
4 * system call, and so forth.
5 *
6 * ----------------------------------------------------------------------------
7 *
8 * Copyright (c) 2014, Salvatore Sanfilippo <antirez at gmail dot com>
9 * All rights reserved.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions are met:
13 *
14 * * Redistributions of source code must retain the above copyright notice,
15 * this list of conditions and the following disclaimer.
16 * * Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * * Neither the name of Redis nor the names of its contributors may be used
20 * to endorse or promote products derived from this software without
21 * specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
27 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
28 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
29 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
30 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
34 */
35
36#include "server.h"
37
38/* Dictionary type for latency events. */
39int dictStringKeyCompare(dict *d, const void *key1, const void *key2) {
40 UNUSED(d);
41 return strcmp(key1,key2) == 0;
42}
43
/* Hash callback for the latency events dictionary: hashes the entire
 * NUL-terminated event name string. */
uint64_t dictStringHash(const void *key) {
    const char *s = key;
    return dictGenHashFunction(s, strlen(s));
}
47
48void dictVanillaFree(dict *d, void *val);
49
/* Dictionary type mapping event names (heap-allocated C strings) to
 * 'struct latencyTimeSeries' values. Keys and values are both released
 * through dictVanillaFree() when entries are deleted, so the dictionary
 * owns both the event name copy and the time series. */
dictType latencyTimeSeriesDictType = {
    dictStringHash,             /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictStringKeyCompare,       /* key compare */
    dictVanillaFree,            /* key destructor */
    dictVanillaFree,            /* val destructor */
    NULL                        /* allow to expand */
};
59
60/* ------------------------- Utility functions ------------------------------ */
61
62/* Report the amount of AnonHugePages in smap, in bytes. If the return
63 * value of the function is non-zero, the process is being targeted by
64 * THP support, and is likely to have memory usage / latency issues. */
65int THPGetAnonHugePagesSize(void) {
66 return zmalloc_get_smap_bytes_by_field("AnonHugePages:",-1);
67}
68
69/* ---------------------------- Latency API --------------------------------- */
70
71/* Latency monitor initialization. We just need to create the dictionary
72 * of time series, each time series is created on demand in order to avoid
73 * having a fixed list to maintain. */
74void latencyMonitorInit(void) {
75 server.latency_events = dictCreate(&latencyTimeSeriesDictType);
76}
77
78/* Add the specified sample to the specified time series "event".
79 * This function is usually called via latencyAddSampleIfNeeded(), that
80 * is a macro that only adds the sample if the latency is higher than
81 * server.latency_monitor_threshold. */
82void latencyAddSample(const char *event, mstime_t latency) {
83 struct latencyTimeSeries *ts = dictFetchValue(server.latency_events,event);
84 time_t now = time(NULL);
85 int prev;
86
87 /* Create the time series if it does not exist. */
88 if (ts == NULL) {
89 ts = zmalloc(sizeof(*ts));
90 ts->idx = 0;
91 ts->max = 0;
92 memset(ts->samples,0,sizeof(ts->samples));
93 dictAdd(server.latency_events,zstrdup(event),ts);
94 }
95
96 if (latency > ts->max) ts->max = latency;
97
98 /* If the previous sample is in the same second, we update our old sample
99 * if this latency is > of the old one, or just return. */
100 prev = (ts->idx + LATENCY_TS_LEN - 1) % LATENCY_TS_LEN;
101 if (ts->samples[prev].time == now) {
102 if (latency > ts->samples[prev].latency)
103 ts->samples[prev].latency = latency;
104 return;
105 }
106
107 ts->samples[ts->idx].time = now;
108 ts->samples[ts->idx].latency = latency;
109
110 ts->idx++;
111 if (ts->idx == LATENCY_TS_LEN) ts->idx = 0;
112}
113
114/* Reset data for the specified event, or all the events data if 'event' is
115 * NULL.
116 *
117 * Note: this is O(N) even when event_to_reset is not NULL because makes
118 * the code simpler and we have a small fixed max number of events. */
119int latencyResetEvent(char *event_to_reset) {
120 dictIterator *di;
121 dictEntry *de;
122 int resets = 0;
123
124 di = dictGetSafeIterator(server.latency_events);
125 while((de = dictNext(di)) != NULL) {
126 char *event = dictGetKey(de);
127
128 if (event_to_reset == NULL || strcasecmp(event,event_to_reset) == 0) {
129 dictDelete(server.latency_events, event);
130 resets++;
131 }
132 }
133 dictReleaseIterator(di);
134 return resets;
135}
136
137/* ------------------------ Latency reporting (doctor) ---------------------- */
138
139/* Analyze the samples available for a given event and return a structure
140 * populate with different metrics, average, MAD, min, max, and so forth.
141 * Check latency.h definition of struct latencyStats for more info.
142 * If the specified event has no elements the structure is populate with
143 * zero values. */
144void analyzeLatencyForEvent(char *event, struct latencyStats *ls) {
145 struct latencyTimeSeries *ts = dictFetchValue(server.latency_events,event);
146 int j;
147 uint64_t sum;
148
149 ls->all_time_high = ts ? ts->max : 0;
150 ls->avg = 0;
151 ls->min = 0;
152 ls->max = 0;
153 ls->mad = 0;
154 ls->samples = 0;
155 ls->period = 0;
156 if (!ts) return;
157
158 /* First pass, populate everything but the MAD. */
159 sum = 0;
160 for (j = 0; j < LATENCY_TS_LEN; j++) {
161 if (ts->samples[j].time == 0) continue;
162 ls->samples++;
163 if (ls->samples == 1) {
164 ls->min = ls->max = ts->samples[j].latency;
165 } else {
166 if (ls->min > ts->samples[j].latency)
167 ls->min = ts->samples[j].latency;
168 if (ls->max < ts->samples[j].latency)
169 ls->max = ts->samples[j].latency;
170 }
171 sum += ts->samples[j].latency;
172
173 /* Track the oldest event time in ls->period. */
174 if (ls->period == 0 || ts->samples[j].time < ls->period)
175 ls->period = ts->samples[j].time;
176 }
177
178 /* So far avg is actually the sum of the latencies, and period is
179 * the oldest event time. We need to make the first an average and
180 * the second a range of seconds. */
181 if (ls->samples) {
182 ls->avg = sum / ls->samples;
183 ls->period = time(NULL) - ls->period;
184 if (ls->period == 0) ls->period = 1;
185 }
186
187 /* Second pass, compute MAD. */
188 sum = 0;
189 for (j = 0; j < LATENCY_TS_LEN; j++) {
190 int64_t delta;
191
192 if (ts->samples[j].time == 0) continue;
193 delta = (int64_t)ls->avg - ts->samples[j].latency;
194 if (delta < 0) delta = -delta;
195 sum += delta;
196 }
197 if (ls->samples) ls->mad = sum / ls->samples;
198}
199
/* Create a human readable report of latency events for this Redis instance.
 * The returned sds string is owned by the caller, who must sdsfree() it.
 *
 * The function walks every registered latency event, prints its stats,
 * and accumulates boolean "advise_*" flags; a final section turns each
 * set flag into a human readable suggestion. 'advices' counts how many
 * flags were raised so we can detect the "events but no advice" case. */
sds createLatencyReport(void) {
    sds report = sdsempty();
    int advise_better_vm = 0;       /* Better virtual machines. */
    int advise_slowlog_enabled = 0; /* Enable slowlog. */
    int advise_slowlog_tuning = 0;  /* Reconfigure slowlog. */
    int advise_slowlog_inspect = 0; /* Check your slowlog. */
    int advise_disk_contention = 0; /* Try to lower disk contention. */
    int advise_scheduler = 0;       /* Intrinsic latency. */
    int advise_data_writeback = 0;  /* data=writeback. */
    int advise_no_appendfsync = 0;  /* don't fsync during rewrites. */
    int advise_local_disk = 0;      /* Avoid remote disks. */
    int advise_ssd = 0;             /* Use an SSD drive. */
    int advise_write_load_info = 0; /* Print info about AOF and write load. */
    int advise_hz = 0;              /* Use higher HZ. */
    int advise_large_objects = 0;   /* Deletion of large objects. */
    int advise_mass_eviction = 0;   /* Avoid mass eviction of keys. */
    int advise_relax_fsync_policy = 0; /* appendfsync always is slow. */
    int advise_disable_thp = 0;     /* AnonHugePages detected. */
    int advices = 0;

    /* Return ASAP if the latency engine is disabled and it looks like it
     * was never enabled so far. */
    if (dictSize(server.latency_events) == 0 &&
        server.latency_monitor_threshold == 0)
    {
        report = sdscat(report,"I'm sorry, Dave, I can't do that. Latency monitoring is disabled in this Redis instance. You may use \"CONFIG SET latency-monitor-threshold <milliseconds>.\" in order to enable it. If we weren't in a deep space mission I'd suggest to take a look at https://redis.io/topics/latency-monitor.\n");
        return report;
    }

    /* Show all the events stats and add for each event some event-related
     * comment depending on the values. */
    dictIterator *di;
    dictEntry *de;
    int eventnum = 0;

    di = dictGetSafeIterator(server.latency_events);
    while((de = dictNext(di)) != NULL) {
        char *event = dictGetKey(de);
        struct latencyTimeSeries *ts = dictGetVal(de);
        struct latencyStats ls;

        if (ts == NULL) continue;
        eventnum++;
        /* Print the report preamble only once, before the first event. */
        if (eventnum == 1) {
            report = sdscat(report,"Dave, I have observed latency spikes in this Redis instance. You don't mind talking about it, do you Dave?\n\n");
        }
        analyzeLatencyForEvent(event,&ls);

        report = sdscatprintf(report,
            "%d. %s: %d latency spikes (average %lums, mean deviation %lums, period %.2f sec). Worst all time event %lums.",
            eventnum, event,
            ls.samples,
            (unsigned long) ls.avg,
            (unsigned long) ls.mad,
            (double) ls.period/ls.samples,
            (unsigned long) ts->max);

        /* Fork: classify fork throughput (GB/sec) and suggest a better VM
         * when it is below 25 GB/sec. */
        if (!strcasecmp(event,"fork")) {
            char *fork_quality;
            if (server.stat_fork_rate < 10) {
                fork_quality = "terrible";
                advise_better_vm = 1;
                advices++;
            } else if (server.stat_fork_rate < 25) {
                fork_quality = "poor";
                advise_better_vm = 1;
                advices++;
            } else if (server.stat_fork_rate < 100) {
                fork_quality = "good";
            } else {
                fork_quality = "excellent";
            }
            report = sdscatprintf(report,
                " Fork rate is %.2f GB/sec (%s).", server.stat_fork_rate,
                fork_quality);
        }

        /* Potentially commands: suggest enabling or tuning the slowlog so
         * the slow commands can actually be identified. */
        if (!strcasecmp(event,"command")) {
            if (server.slowlog_log_slower_than < 0) {
                advise_slowlog_enabled = 1;
                advices++;
            } else if (server.slowlog_log_slower_than/1000 >
                       server.latency_monitor_threshold)
            {
                advise_slowlog_tuning = 1;
                advices++;
            }
            advise_slowlog_inspect = 1;
            advise_large_objects = 1;
            advices += 2;
        }

        /* fast-command: even O(1)/O(log N) commands are slow, so the
         * problem is likely scheduling / intrinsic latency. */
        if (!strcasecmp(event,"fast-command")) {
            advise_scheduler = 1;
            advices++;
        }

        /* AOF and I/O. */
        if (!strcasecmp(event,"aof-write-pending-fsync")) {
            advise_local_disk = 1;
            advise_disk_contention = 1;
            advise_ssd = 1;
            advise_data_writeback = 1;
            advices += 4;
        }

        if (!strcasecmp(event,"aof-write-active-child")) {
            advise_no_appendfsync = 1;
            advise_data_writeback = 1;
            advise_ssd = 1;
            advices += 3;
        }

        if (!strcasecmp(event,"aof-write-alone")) {
            advise_local_disk = 1;
            advise_data_writeback = 1;
            advise_ssd = 1;
            advices += 3;
        }

        if (!strcasecmp(event,"aof-fsync-always")) {
            advise_relax_fsync_policy = 1;
            advices++;
        }

        if (!strcasecmp(event,"aof-fstat") ||
            !strcasecmp(event,"rdb-unlink-temp-file")) {
            advise_disk_contention = 1;
            advise_local_disk = 1;
            advices += 2;
        }

        if (!strcasecmp(event,"aof-rewrite-diff-write") ||
            !strcasecmp(event,"aof-rename")) {
            advise_write_load_info = 1;
            advise_data_writeback = 1;
            advise_ssd = 1;
            advise_local_disk = 1;
            advices += 4;
        }

        /* Expire cycle. */
        if (!strcasecmp(event,"expire-cycle")) {
            advise_hz = 1;
            advise_large_objects = 1;
            advices += 2;
        }

        /* Eviction cycle. */
        if (!strcasecmp(event,"eviction-del")) {
            advise_large_objects = 1;
            advices++;
        }

        if (!strcasecmp(event,"eviction-cycle")) {
            advise_mass_eviction = 1;
            advices++;
        }

        report = sdscatlen(report,"\n",1);
    }
    dictReleaseIterator(di);

    /* Add non event based advices. */
    if (THPGetAnonHugePagesSize() > 0) {
        advise_disable_thp = 1;
        advices++;
    }

    if (eventnum == 0 && advices == 0) {
        report = sdscat(report,"Dave, no latency spike was observed during the lifetime of this Redis instance, not in the slightest bit. I honestly think you ought to sit down calmly, take a stress pill, and think things over.\n");
    } else if (eventnum > 0 && advices == 0) {
        report = sdscat(report,"\nWhile there are latency events logged, I'm not able to suggest any easy fix. Please use the Redis community to get some help, providing this report in your help request.\n");
    } else {
        /* Add all the suggestions accumulated so far. */

        /* Better VM. */
        report = sdscat(report,"\nI have a few advices for you:\n\n");
        if (advise_better_vm) {
            report = sdscat(report,"- If you are using a virtual machine, consider upgrading it with a faster one using a hypervisior that provides less latency during fork() calls. Xen is known to have poor fork() performance. Even in the context of the same VM provider, certain kinds of instances can execute fork faster than others.\n");
        }

        /* Slow log. */
        if (advise_slowlog_enabled) {
            report = sdscatprintf(report,"- There are latency issues with potentially slow commands you are using. Try to enable the Slow Log Redis feature using the command 'CONFIG SET slowlog-log-slower-than %llu'. If the Slow log is disabled Redis is not able to log slow commands execution for you.\n", (unsigned long long)server.latency_monitor_threshold*1000);
        }

        if (advise_slowlog_tuning) {
            report = sdscatprintf(report,"- Your current Slow Log configuration only logs events that are slower than your configured latency monitor threshold. Please use 'CONFIG SET slowlog-log-slower-than %llu'.\n", (unsigned long long)server.latency_monitor_threshold*1000);
        }

        if (advise_slowlog_inspect) {
            report = sdscat(report,"- Check your Slow Log to understand what are the commands you are running which are too slow to execute. Please check https://redis.io/commands/slowlog for more information.\n");
        }

        /* Intrinsic latency. */
        if (advise_scheduler) {
            report = sdscat(report,"- The system is slow to execute Redis code paths not containing system calls. This usually means the system does not provide Redis CPU time to run for long periods. You should try to:\n"
                                   "  1) Lower the system load.\n"
                                   "  2) Use a computer / VM just for Redis if you are running other software in the same system.\n"
                                   "  3) Check if you have a \"noisy neighbour\" problem.\n"
                                   "  4) Check with 'redis-cli --intrinsic-latency 100' what is the intrinsic latency in your system.\n"
                                   "  5) Check if the problem is allocator-related by recompiling Redis with MALLOC=libc, if you are using Jemalloc. However this may create fragmentation problems.\n");
        }

        /* AOF / Disk latency. */
        if (advise_local_disk) {
            report = sdscat(report,"- It is strongly advised to use local disks for persistence, especially if you are using AOF. Remote disks provided by platform-as-a-service providers are known to be slow.\n");
        }

        if (advise_ssd) {
            report = sdscat(report,"- SSD disks are able to reduce fsync latency, and total time needed for snapshotting and AOF log rewriting (resulting in smaller memory usage). With extremely high write load SSD disks can be a good option. However Redis should perform reasonably with high load using normal disks. Use this advice as a last resort.\n");
        }

        if (advise_data_writeback) {
            report = sdscat(report,"- Mounting ext3/4 filesystems with data=writeback can provide a performance boost compared to data=ordered, however this mode of operation provides less guarantees, and sometimes it can happen that after a hard crash the AOF file will have a half-written command at the end and will require to be repaired before Redis restarts.\n");
        }

        if (advise_disk_contention) {
            report = sdscat(report,"- Try to lower the disk contention. This is often caused by other disk intensive processes running in the same computer (including other Redis instances).\n");
        }

        if (advise_no_appendfsync) {
            report = sdscat(report,"- Assuming from the point of view of data safety this is viable in your environment, you could try to enable the 'no-appendfsync-on-rewrite' option, so that fsync will not be performed while there is a child rewriting the AOF file or producing an RDB file (the moment where there is high disk contention).\n");
        }

        if (advise_relax_fsync_policy && server.aof_fsync == AOF_FSYNC_ALWAYS) {
            report = sdscat(report,"- Your fsync policy is set to 'always'. It is very hard to get good performances with such a setup, if possible try to relax the fsync policy to 'onesec'.\n");
        }

        if (advise_write_load_info) {
            report = sdscat(report,"- Latency during the AOF atomic rename operation or when the final difference is flushed to the AOF file at the end of the rewrite, sometimes is caused by very high write load, causing the AOF buffer to get very large. If possible try to send less commands to accomplish the same work, or use Lua scripts to group multiple operations into a single EVALSHA call.\n");
        }

        if (advise_hz && server.hz < 100) {
            report = sdscat(report,"- In order to make the Redis keys expiring process more incremental, try to set the 'hz' configuration parameter to 100 using 'CONFIG SET hz 100'.\n");
        }

        if (advise_large_objects) {
            report = sdscat(report,"- Deleting, expiring or evicting (because of maxmemory policy) large objects is a blocking operation. If you have very large objects that are often deleted, expired, or evicted, try to fragment those objects into multiple smaller objects.\n");
        }

        if (advise_mass_eviction) {
            report = sdscat(report,"- Sudden changes to the 'maxmemory' setting via 'CONFIG SET', or allocation of large objects via sets or sorted sets intersections, STORE option of SORT, Redis Cluster large keys migrations (RESTORE command), may create sudden memory pressure forcing the server to block trying to evict keys. \n");
        }

        if (advise_disable_thp) {
            report = sdscat(report,"- I detected a non zero amount of anonymous huge pages used by your process. This creates very serious latency events in different conditions, especially when Redis is persisting on disk. To disable THP support use the command 'echo never > /sys/kernel/mm/transparent_hugepage/enabled', make sure to also add it into /etc/rc.local so that the command will be executed again after a reboot. Note that even if you have already disabled THP, you still need to restart the Redis process to get rid of the huge pages already created.\n");
        }
    }

    return report;
}
457
458/* ---------------------- Latency command implementation -------------------- */
459
460/* latencyCommand() helper to produce a map of time buckets,
461 * each representing a latency range,
462 * between 1 nanosecond and roughly 1 second.
463 * Each bucket covers twice the previous bucket's range.
464 * Empty buckets are not printed.
465 * Everything above 1 sec is considered +Inf.
466 * At max there will be log2(1000000000)=30 buckets */
467void fillCommandCDF(client *c, struct hdr_histogram* histogram) {
468 addReplyMapLen(c,2);
469 addReplyBulkCString(c,"calls");
470 addReplyLongLong(c,(long long) histogram->total_count);
471 addReplyBulkCString(c,"histogram_usec");
472 void *replylen = addReplyDeferredLen(c);
473 int samples = 0;
474 struct hdr_iter iter;
475 hdr_iter_log_init(&iter,histogram,1024,2);
476 int64_t previous_count = 0;
477 while (hdr_iter_next(&iter)) {
478 const int64_t micros = iter.highest_equivalent_value / 1000;
479 const int64_t cumulative_count = iter.cumulative_count;
480 if(cumulative_count > previous_count){
481 addReplyLongLong(c,(long long) micros);
482 addReplyLongLong(c,(long long) cumulative_count);
483 samples++;
484 }
485 previous_count = cumulative_count;
486 }
487 setDeferredMapLen(c,replylen,samples);
488}
489
490/* latencyCommand() helper to produce for all commands,
491 * a per command cumulative distribution of latencies. */
492void latencyAllCommandsFillCDF(client *c, dict *commands, int *command_with_data) {
493 dictIterator *di = dictGetSafeIterator(commands);
494 dictEntry *de;
495 struct redisCommand *cmd;
496
497 while((de = dictNext(di)) != NULL) {
498 cmd = (struct redisCommand *) dictGetVal(de);
499 if (cmd->latency_histogram) {
500 addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname));
501 fillCommandCDF(c, cmd->latency_histogram);
502 (*command_with_data)++;
503 }
504
505 if (cmd->subcommands) {
506 latencyAllCommandsFillCDF(c, cmd->subcommands_dict, command_with_data);
507 }
508 }
509 dictReleaseIterator(di);
510}
511
512/* latencyCommand() helper to produce for a specific command set,
513 * a per command cumulative distribution of latencies. */
514void latencySpecificCommandsFillCDF(client *c) {
515 void *replylen = addReplyDeferredLen(c);
516 int command_with_data = 0;
517 for (int j = 2; j < c->argc; j++){
518 struct redisCommand *cmd = lookupCommandBySds(c->argv[j]->ptr);
519 /* If the command does not exist we skip the reply */
520 if (cmd == NULL) {
521 continue;
522 }
523
524 if (cmd->latency_histogram) {
525 addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname));
526 fillCommandCDF(c, cmd->latency_histogram);
527 command_with_data++;
528 }
529
530 if (cmd->subcommands_dict) {
531 dictEntry *de;
532 dictIterator *di = dictGetSafeIterator(cmd->subcommands_dict);
533
534 while ((de = dictNext(di)) != NULL) {
535 struct redisCommand *sub = dictGetVal(de);
536 if (sub->latency_histogram) {
537 addReplyBulkCBuffer(c, sub->fullname, sdslen(sub->fullname));
538 fillCommandCDF(c, sub->latency_histogram);
539 command_with_data++;
540 }
541 }
542 dictReleaseIterator(di);
543 }
544 }
545 setDeferredMapLen(c,replylen,command_with_data);
546}
547
548/* latencyCommand() helper to produce a time-delay reply for all the samples
549 * in memory for the specified time series. */
550void latencyCommandReplyWithSamples(client *c, struct latencyTimeSeries *ts) {
551 void *replylen = addReplyDeferredLen(c);
552 int samples = 0, j;
553
554 for (j = 0; j < LATENCY_TS_LEN; j++) {
555 int i = (ts->idx + j) % LATENCY_TS_LEN;
556
557 if (ts->samples[i].time == 0) continue;
558 addReplyArrayLen(c,2);
559 addReplyLongLong(c,ts->samples[i].time);
560 addReplyLongLong(c,ts->samples[i].latency);
561 samples++;
562 }
563 setDeferredArrayLen(c,replylen,samples);
564}
565
566/* latencyCommand() helper to produce the reply for the LATEST subcommand,
567 * listing the last latency sample for every event type registered so far. */
568void latencyCommandReplyWithLatestEvents(client *c) {
569 dictIterator *di;
570 dictEntry *de;
571
572 addReplyArrayLen(c,dictSize(server.latency_events));
573 di = dictGetIterator(server.latency_events);
574 while((de = dictNext(di)) != NULL) {
575 char *event = dictGetKey(de);
576 struct latencyTimeSeries *ts = dictGetVal(de);
577 int last = (ts->idx + LATENCY_TS_LEN - 1) % LATENCY_TS_LEN;
578
579 addReplyArrayLen(c,4);
580 addReplyBulkCString(c,event);
581 addReplyLongLong(c,ts->samples[last].time);
582 addReplyLongLong(c,ts->samples[last].latency);
583 addReplyLongLong(c,ts->max);
584 }
585 dictReleaseIterator(di);
586}
587
588#define LATENCY_GRAPH_COLS 80
589sds latencyCommandGenSparkeline(char *event, struct latencyTimeSeries *ts) {
590 int j;
591 struct sequence *seq = createSparklineSequence();
592 sds graph = sdsempty();
593 uint32_t min = 0, max = 0;
594
595 for (j = 0; j < LATENCY_TS_LEN; j++) {
596 int i = (ts->idx + j) % LATENCY_TS_LEN;
597 int elapsed;
598 char buf[64];
599
600 if (ts->samples[i].time == 0) continue;
601 /* Update min and max. */
602 if (seq->length == 0) {
603 min = max = ts->samples[i].latency;
604 } else {
605 if (ts->samples[i].latency > max) max = ts->samples[i].latency;
606 if (ts->samples[i].latency < min) min = ts->samples[i].latency;
607 }
608 /* Use as label the number of seconds / minutes / hours / days
609 * ago the event happened. */
610 elapsed = time(NULL) - ts->samples[i].time;
611 if (elapsed < 60)
612 snprintf(buf,sizeof(buf),"%ds",elapsed);
613 else if (elapsed < 3600)
614 snprintf(buf,sizeof(buf),"%dm",elapsed/60);
615 else if (elapsed < 3600*24)
616 snprintf(buf,sizeof(buf),"%dh",elapsed/3600);
617 else
618 snprintf(buf,sizeof(buf),"%dd",elapsed/(3600*24));
619 sparklineSequenceAddSample(seq,ts->samples[i].latency,buf);
620 }
621
622 graph = sdscatprintf(graph,
623 "%s - high %lu ms, low %lu ms (all time high %lu ms)\n", event,
624 (unsigned long) max, (unsigned long) min, (unsigned long) ts->max);
625 for (j = 0; j < LATENCY_GRAPH_COLS; j++)
626 graph = sdscatlen(graph,"-",1);
627 graph = sdscatlen(graph,"\n",1);
628 graph = sparklineRender(graph,seq,LATENCY_GRAPH_COLS,4,SPARKLINE_FILL);
629 freeSparklineSequence(seq);
630 return graph;
631}
632
/* LATENCY command implementations.
 *
 * LATENCY HISTORY: return time-latency samples for the specified event.
 * LATENCY LATEST: return the latest latency for all the events classes.
 * LATENCY DOCTOR: returns a human readable analysis of instance latency.
 * LATENCY GRAPH: provide an ASCII graph of the latency of the specified event.
 * LATENCY RESET: reset data of a specified event or all the data if no event provided.
 * LATENCY HISTOGRAM: return a cumulative distribution of latencies in the format of an histogram for the specified command names.
 */
void latencyCommand(client *c) {
    struct latencyTimeSeries *ts;

    if (!strcasecmp(c->argv[1]->ptr,"history") && c->argc == 3) {
        /* LATENCY HISTORY <event> */
        ts = dictFetchValue(server.latency_events,c->argv[2]->ptr);
        /* Unknown events reply with an empty array rather than an error. */
        if (ts == NULL) {
            addReplyArrayLen(c,0);
        } else {
            latencyCommandReplyWithSamples(c,ts);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"graph") && c->argc == 3) {
        /* LATENCY GRAPH <event> */
        sds graph;
        dictEntry *de;
        char *event;

        de = dictFind(server.latency_events,c->argv[2]->ptr);
        if (de == NULL) goto nodataerr;
        ts = dictGetVal(de);
        /* Use the dictionary-owned key as the graph title so its lifetime
         * outlives this call. */
        event = dictGetKey(de);

        graph = latencyCommandGenSparkeline(event,ts);
        addReplyVerbatim(c,graph,sdslen(graph),"txt");
        sdsfree(graph);
    } else if (!strcasecmp(c->argv[1]->ptr,"latest") && c->argc == 2) {
        /* LATENCY LATEST */
        latencyCommandReplyWithLatestEvents(c);
    } else if (!strcasecmp(c->argv[1]->ptr,"doctor") && c->argc == 2) {
        /* LATENCY DOCTOR */
        sds report = createLatencyReport();

        addReplyVerbatim(c,report,sdslen(report),"txt");
        sdsfree(report);
    } else if (!strcasecmp(c->argv[1]->ptr,"reset") && c->argc >= 2) {
        /* LATENCY RESET [<event> ...]: with no events resets everything,
         * otherwise resets each named event, replying with the total
         * number of time series actually deleted. */
        if (c->argc == 2) {
            addReplyLongLong(c,latencyResetEvent(NULL));
        } else {
            int j, resets = 0;

            for (j = 2; j < c->argc; j++)
                resets += latencyResetEvent(c->argv[j]->ptr);
            addReplyLongLong(c,resets);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"histogram") && c->argc >= 2) {
        /* LATENCY HISTOGRAM [<command> ...]: without arguments replies
         * with the histogram of every command that has one. */
        if (c->argc == 2) {
            int command_with_data = 0;
            void *replylen = addReplyDeferredLen(c);
            latencyAllCommandsFillCDF(c, server.commands, &command_with_data);
            setDeferredMapLen(c, replylen, command_with_data);
        } else {
            latencySpecificCommandsFillCDF(c);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"help") && c->argc == 2) {
        const char *help[] = {
"DOCTOR",
"    Return a human readable latency analysis report.",
"GRAPH <event>",
"    Return an ASCII latency graph for the <event> class.",
"HISTORY <event>",
"    Return time-latency samples for the <event> class.",
"LATEST",
"    Return the latest latency samples for all events.",
"RESET [<event> ...]",
"    Reset latency data of one or more <event> classes.",
"    (default: reset all data for all event classes)",
"HISTOGRAM [COMMAND ...]",
"    Return a cumulative distribution of latencies in the format of a histogram for the specified command names.",
"    If no commands are specified then all histograms are replied.",
NULL
        };
        addReplyHelp(c, help);
    } else {
        addReplySubcommandSyntaxError(c);
    }
    return;

nodataerr:
    /* Common error when the user asks for an event we have no latency
     * information about. */
    addReplyErrorFormat(c,
        "No samples available for event '%s'", (char*) c->argv[2]->ptr);
}
727
728