/* Redis Cluster implementation.
 *
 * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "server.h"
#include "cluster.h"
#include "endianconv.h"

#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <math.h>

/* A global reference to myself is handy to make code more clear.
 * Myself always points to server.cluster->myself, that is, the clusterNode
 * that represents this node. */
clusterNode *myself = NULL;

clusterNode *createClusterNode(char *nodename, int flags);
void clusterAddNode(clusterNode *node);
void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
void clusterReadHandler(connection *conn);
void clusterSendPing(clusterLink *link, int type);
void clusterSendFail(char *nodename);
void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request);
void clusterUpdateState(void);
int clusterNodeGetSlotBit(clusterNode *n, int slot);
sds clusterGenNodesDescription(int filter, int use_pport);
list *clusterGetNodesServingMySlots(clusterNode *node);
int clusterNodeAddSlave(clusterNode *master, clusterNode *slave);
int clusterAddSlot(clusterNode *n, int slot);
int clusterDelSlot(int slot);
int clusterDelNodeSlots(clusterNode *node);
int clusterNodeSetSlotBit(clusterNode *n, int slot);
void clusterSetMaster(clusterNode *n);
void clusterHandleSlaveFailover(void);
void clusterHandleSlaveMigration(int max_slaves);
int bitmapTestBit(unsigned char *bitmap, int pos);
void clusterDoBeforeSleep(int flags);
void clusterSendUpdate(clusterLink *link, clusterNode *node);
void resetManualFailover(void);
void clusterCloseAllSlots(void);
void clusterSetNodeAsMaster(clusterNode *n);
void clusterDelNode(clusterNode *delnode);
sds representClusterNodeFlags(sds ci, uint16_t flags);
sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count);
void clusterFreeNodesSlotsInfo(clusterNode *n);
uint64_t clusterGetMaxEpoch(void);
int clusterBumpConfigEpochWithoutConsensus(void);
void moduleCallClusterReceivers(const char *sender_id, uint64_t module_id, uint8_t type, const unsigned char *payload, uint32_t len);
const char *clusterGetMessageTypeString(int type);
void removeChannelsInSlot(unsigned int slot);
unsigned int countKeysInSlot(unsigned int hashslot);
unsigned int countChannelsInSlot(unsigned int hashslot);
unsigned int delKeysInSlot(unsigned int hashslot);

/* Links to the next and previous entries for keys in the same slot are stored
 * in the dict entry metadata. See Slot to Key API below. */
#define dictEntryNextInSlot(de) \
    (((clusterDictEntryMetadata *)dictMetadata(de))->next)
#define dictEntryPrevInSlot(de) \
    (((clusterDictEntryMetadata *)dictMetadata(de))->prev)

#define RCVBUF_INIT_LEN 1024
#define RCVBUF_MAX_PREALLOC (1<<20) /* 1MB */

/* Cluster nodes hash table, mapping node names (IDs) to clusterNode
 * structures. */
dictType clusterNodesDictType = {
    dictSdsHash,                /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictSdsKeyCompare,          /* key compare */
    dictSdsDestructor,          /* key destructor */
    NULL,                       /* val destructor */
    NULL                        /* allow to expand */
};

/* Cluster re-addition blacklist. This maps node IDs to the time
 * we can re-add this node. The goal is to avoid re-adding a removed
 * node for some time. */
dictType clusterNodesBlackListDictType = {
    dictSdsCaseHash,            /* hash function */
    NULL,                       /* key dup */
    NULL,                       /* val dup */
    dictSdsKeyCaseCompare,      /* key compare */
    dictSdsDestructor,          /* key destructor */
    NULL,                       /* val destructor */
    NULL                        /* allow to expand */
};

/* -----------------------------------------------------------------------------
 * Initialization
 * -------------------------------------------------------------------------- */

/* Load the cluster config from 'filename'.
 *
 * If the file does not exist or is zero-length (this may happen because
 * when we lock the nodes.conf file, we create a zero-length one for the
 * sake of locking if it does not already exist), C_ERR is returned.
 * If the configuration was loaded from the file, C_OK is returned. */
int clusterLoadConfig(char *filename) {
    FILE *fp = fopen(filename,"r");
    struct stat sb;
    char *line;
    int maxline, j;

    if (fp == NULL) {
        if (errno == ENOENT) {
            return C_ERR;
        } else {
            serverLog(LL_WARNING,
                "Loading the cluster node config from %s: %s",
                filename, strerror(errno));
            exit(1);
        }
    }

    if (redis_fstat(fileno(fp),&sb) == -1) {
        serverLog(LL_WARNING,
            "Unable to obtain the cluster node config file stat %s: %s",
            filename, strerror(errno));
        exit(1);
    }
    /* Check if the file is zero-length: if so return C_ERR to signal
     * we have to write the config. */
    if (sb.st_size == 0) {
        fclose(fp);
        return C_ERR;
    }

    /* Parse the file. Note that single lines of the cluster config file can
     * be really long as they include all the hash slots of the node.
     * This means in the worst possible case, half of the Redis slots will be
     * present in a single line, possibly in importing or migrating state,
     * together with the node ID of the sender/receiver.
     *
     * To simplify we allocate 1024+CLUSTER_SLOTS*128 bytes per line. */
    maxline = 1024+CLUSTER_SLOTS*128;
    line = zmalloc(maxline);
    while(fgets(line,maxline,fp) != NULL) {
        int argc;
        sds *argv;
        clusterNode *n, *master;
        char *p, *s;

        /* Skip blank lines, they can be created either by users manually
         * editing nodes.conf or by the config writing process if stopped
         * before the truncate() call. */
        if (line[0] == '\n' || line[0] == '\0') continue;

        /* Split the line into arguments for processing. */
        argv = sdssplitargs(line,&argc);
        if (argv == NULL) goto fmterr;

        /* Handle the special "vars" line. Don't assume it is the last
         * line even if it actually is when generated by Redis. */
        if (strcasecmp(argv[0],"vars") == 0) {
            if (!(argc % 2)) goto fmterr;
            for (j = 1; j < argc; j += 2) {
                if (strcasecmp(argv[j],"currentEpoch") == 0) {
                    server.cluster->currentEpoch =
                        strtoull(argv[j+1],NULL,10);
                } else if (strcasecmp(argv[j],"lastVoteEpoch") == 0) {
                    server.cluster->lastVoteEpoch =
                        strtoull(argv[j+1],NULL,10);
                } else {
                    serverLog(LL_WARNING,
                        "Skipping unknown cluster config variable '%s'",
                        argv[j]);
                }
            }
            sdsfreesplitres(argv,argc);
            continue;
        }

        /* Regular config lines have at least eight fields */
        if (argc < 8) {
            sdsfreesplitres(argv,argc);
            goto fmterr;
        }

        /* Create this node if it does not exist */
        if (verifyClusterNodeId(argv[0], sdslen(argv[0])) == C_ERR) {
            sdsfreesplitres(argv, argc);
            goto fmterr;
        }
        n = clusterLookupNode(argv[0], sdslen(argv[0]));
        if (!n) {
            n = createClusterNode(argv[0],0);
            clusterAddNode(n);
        }
        /* Format for the node address information:
         * ip:port[@cport][,hostname] */

        /* Hostname is an optional argument that defines the endpoint
         * that can be reported to clients instead of IP. */
        char *hostname = strchr(argv[1], ',');
        if (hostname) {
            *hostname = '\0';
            hostname++;
            n->hostname = sdscpy(n->hostname, hostname);
        } else if (sdslen(n->hostname) != 0) {
            sdsclear(n->hostname);
        }

        /* Address and port */
        if ((p = strrchr(argv[1],':')) == NULL) {
            sdsfreesplitres(argv,argc);
            goto fmterr;
        }
        *p = '\0';
        memcpy(n->ip,argv[1],strlen(argv[1])+1);
        char *port = p+1;
        char *busp = strchr(port,'@');
        if (busp) {
            *busp = '\0';
            busp++;
        }
        n->port = atoi(port);
        /* In older versions of nodes.conf the "@busport" part is missing.
         * In this case we set it to the default offset of 10000 from the
         * base port. */
        n->cport = busp ? atoi(busp) : n->port + CLUSTER_PORT_INCR;

        /* The plaintext port for clients in a TLS cluster (n->pport) is not
         * stored in nodes.conf. It is received later over the bus protocol. */

        /* Parse flags */
        p = s = argv[2];
        while(p) {
            p = strchr(s,',');
            if (p) *p = '\0';
            if (!strcasecmp(s,"myself")) {
                serverAssert(server.cluster->myself == NULL);
                myself = server.cluster->myself = n;
                n->flags |= CLUSTER_NODE_MYSELF;
            } else if (!strcasecmp(s,"master")) {
                n->flags |= CLUSTER_NODE_MASTER;
            } else if (!strcasecmp(s,"slave")) {
                n->flags |= CLUSTER_NODE_SLAVE;
            } else if (!strcasecmp(s,"fail?")) {
                n->flags |= CLUSTER_NODE_PFAIL;
            } else if (!strcasecmp(s,"fail")) {
                n->flags |= CLUSTER_NODE_FAIL;
                n->fail_time = mstime();
            } else if (!strcasecmp(s,"handshake")) {
                n->flags |= CLUSTER_NODE_HANDSHAKE;
            } else if (!strcasecmp(s,"noaddr")) {
                n->flags |= CLUSTER_NODE_NOADDR;
            } else if (!strcasecmp(s,"nofailover")) {
                n->flags |= CLUSTER_NODE_NOFAILOVER;
            } else if (!strcasecmp(s,"noflags")) {
                /* nothing to do */
            } else {
                serverPanic("Unknown flag in redis cluster config file");
            }
            if (p) s = p+1;
        }

        /* Get master if any. Set the master and populate master's
         * slave list. */
        if (argv[3][0] != '-') {
            if (verifyClusterNodeId(argv[3], sdslen(argv[3])) == C_ERR) {
                sdsfreesplitres(argv, argc);
                goto fmterr;
            }
            master = clusterLookupNode(argv[3], sdslen(argv[3]));
            if (!master) {
                master = createClusterNode(argv[3],0);
                clusterAddNode(master);
            }
            n->slaveof = master;
            clusterNodeAddSlave(master,n);
        }

        /* Set ping sent / pong received timestamps */
        if (atoi(argv[4])) n->ping_sent = mstime();
        if (atoi(argv[5])) n->pong_received = mstime();

        /* Set configEpoch for this node.
         * If the node is a replica, set its config epoch to 0.
         * If it's a primary, load the config epoch from the configuration file. */
        n->configEpoch = (nodeIsSlave(n) && n->slaveof) ? 0 : strtoull(argv[6],NULL,10);

        /* Populate hash slots served by this instance. */
        for (j = 8; j < argc; j++) {
            int start, stop;

            if (argv[j][0] == '[') {
                /* Here we handle migrating / importing slots */
                int slot;
                char direction;
                clusterNode *cn;

                p = strchr(argv[j],'-');
                serverAssert(p != NULL);
                *p = '\0';
                direction = p[1]; /* Either '>' or '<' */
                slot = atoi(argv[j]+1);
                if (slot < 0 || slot >= CLUSTER_SLOTS) {
                    sdsfreesplitres(argv,argc);
                    goto fmterr;
                }
                p += 3;

                /* Check for the closing ']' before computing the node ID
                 * length: subtracting from a NULL pointer is undefined. */
                char *pr = strchr(p, ']');
                if (pr == NULL) {
                    sdsfreesplitres(argv, argc);
                    goto fmterr;
                }
                size_t node_len = pr - p;
                if (verifyClusterNodeId(p, node_len) == C_ERR) {
                    sdsfreesplitres(argv, argc);
                    goto fmterr;
                }
                cn = clusterLookupNode(p, CLUSTER_NAMELEN);
                if (!cn) {
                    cn = createClusterNode(p,0);
                    clusterAddNode(cn);
                }
                if (direction == '>') {
                    server.cluster->migrating_slots_to[slot] = cn;
                } else {
                    server.cluster->importing_slots_from[slot] = cn;
                }
                continue;
            } else if ((p = strchr(argv[j],'-')) != NULL) {
                *p = '\0';
                start = atoi(argv[j]);
                stop = atoi(p+1);
            } else {
                start = stop = atoi(argv[j]);
            }
            if (start < 0 || start >= CLUSTER_SLOTS ||
                stop < 0 || stop >= CLUSTER_SLOTS)
            {
                sdsfreesplitres(argv,argc);
                goto fmterr;
            }
            while(start <= stop) clusterAddSlot(n, start++);
        }

        sdsfreesplitres(argv,argc);
    }
    /* Config sanity check */
    if (server.cluster->myself == NULL) goto fmterr;

    zfree(line);
    fclose(fp);

    serverLog(LL_NOTICE,"Node configuration loaded, I'm %.40s", myself->name);

    /* Something that should never happen: currentEpoch smaller than
     * the max epoch found in the nodes configuration. However we handle this
     * as some form of protection against manual editing of critical files. */
    if (clusterGetMaxEpoch() > server.cluster->currentEpoch) {
        server.cluster->currentEpoch = clusterGetMaxEpoch();
    }
    return C_OK;

fmterr:
    serverLog(LL_WARNING,
        "Unrecoverable error: corrupted cluster config file.");
    zfree(line);
    if (fp) fclose(fp);
    exit(1);
}
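
/* For reference, a sketch of the on-disk format consumed by
 * clusterLoadConfig() above. The layout is the same as CLUSTER NODES
 * output; the IDs and address below are made up for illustration:
 *
 *   <id> <ip:port@cport[,hostname]> <flags> <master> <ping-sent>
 *   <pong-recv> <config-epoch> <link-state> <slot> [<slot> ...]
 *
 *   07c37dfeb235213a872192d90877d0cd55635b91 127.0.0.1:30004@40004 slave
 *       e7d1eecce10fd6bb5eb35b9f99a514335d9ba9ca 0 1426238317239 4 connected
 *
 * (a single line in the real file). Master lines end with the served slots,
 * either single ("42") or as a range ("0-5460"), plus open slots in the
 * "[slot->-<nodeid>]" (migrating) or "[slot-<-<nodeid>]" (importing) form
 * handled by the '[' branch of the slot parser above. */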

/* Cluster node configuration is exactly the same as CLUSTER NODES output.
 *
 * This function writes the node config and returns 0; on error -1
 * is returned.
 *
 * Note: we need to write the file in an atomic way from the point of view
 * of the POSIX filesystem semantics, so that if the server is stopped
 * or crashes during the write, we'll end with either the old file or the
 * new one. Since we have the full payload to write available we can use
 * a single write to write the whole file. If the pre-existing file was
 * bigger we pad our payload with newlines that are anyway ignored and truncate
 * the file afterward. */
int clusterSaveConfig(int do_fsync) {
    sds ci;
    size_t content_size;
    struct stat sb;
    int fd;

    server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG;

    /* Get the nodes description and concatenate our "vars" directive to
     * save currentEpoch and lastVoteEpoch. */
    ci = clusterGenNodesDescription(CLUSTER_NODE_HANDSHAKE, 0);
    ci = sdscatprintf(ci,"vars currentEpoch %llu lastVoteEpoch %llu\n",
        (unsigned long long) server.cluster->currentEpoch,
        (unsigned long long) server.cluster->lastVoteEpoch);
    content_size = sdslen(ci);

    if ((fd = open(server.cluster_configfile,O_WRONLY|O_CREAT,0644))
        == -1) goto err;

    if (redis_fstat(fd,&sb) == -1) goto err;

    /* Pad the new payload if the existing file length is greater. */
    if (sb.st_size > (off_t)content_size) {
        ci = sdsgrowzero(ci,sb.st_size);
        memset(ci+content_size,'\n',sb.st_size-content_size);
    }

    if (write(fd,ci,sdslen(ci)) != (ssize_t)sdslen(ci)) goto err;
    if (do_fsync) {
        server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG;
        if (fsync(fd) == -1) goto err;
    }

    /* Truncate the file if needed to remove the final \n padding that
     * is just garbage. */
    if (content_size != sdslen(ci) && ftruncate(fd,content_size) == -1) {
        /* ftruncate() failing is not a critical error. */
    }
    close(fd);
    sdsfree(ci);
    return 0;

err:
    if (fd != -1) close(fd);
    sdsfree(ci);
    return -1;
}
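
/* The function above relies on a pad-then-truncate idiom: overwrite the file
 * in place with a single write(), padding with '\n' when the old content was
 * longer, then ftruncate() down to the real payload size. Below is the same
 * idiom in isolation, as a minimal sketch: overwriteInPlace() is a
 * hypothetical helper, not part of Redis. */
#if 0 /* Illustrative sketch, not compiled. */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

static int overwriteInPlace(const char *path, const char *payload, size_t len) {
    struct stat sb;
    int fd = open(path, O_WRONLY|O_CREAT, 0644);
    if (fd == -1) return -1;
    if (fstat(fd, &sb) == -1) goto err;

    /* If the old file is longer, extend the payload with '\n' padding so
     * that one write() covers all of the previous content. */
    size_t writelen = len;
    if ((size_t)sb.st_size > len) writelen = (size_t)sb.st_size;
    char *buf = malloc(writelen);
    if (buf == NULL) goto err;
    memcpy(buf, payload, len);
    memset(buf+len, '\n', writelen-len);
    ssize_t written = write(fd, buf, writelen);
    free(buf);
    if (written != (ssize_t)writelen) goto err;
    if (fsync(fd) == -1) goto err;

    /* Drop the padding; a failure here only leaves harmless blank lines. */
    if (writelen != len) ftruncate(fd, len);
    close(fd);
    return 0;

err:
    close(fd);
    return -1;
}
#endif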

void clusterSaveConfigOrDie(int do_fsync) {
    if (clusterSaveConfig(do_fsync) == -1) {
        serverLog(LL_WARNING,"Fatal: can't update cluster config file.");
        exit(1);
    }
}

/* Lock the cluster config using flock(), and retain the file descriptor used to
 * acquire the lock so that the file will be locked as long as the process is up.
 *
 * This works because we always update nodes.conf with a new version
 * in-place, reopening the file, and writing to it in place (later adjusting
 * the length with ftruncate()).
 *
 * On success C_OK is returned, otherwise an error is logged and
 * the function returns C_ERR to signal a lock was not acquired. */
int clusterLockConfig(char *filename) {
/* flock() does not exist on Solaris, and a fcntl-based solution won't help,
 * as we constantly re-open that file, which will release _all_ locks anyway. */
#if !defined(__sun)
    /* To lock it, we need to open the file in a way it is created if
     * it does not exist, otherwise there is a race condition with other
     * processes. */
    int fd = open(filename,O_WRONLY|O_CREAT|O_CLOEXEC,0644);
    if (fd == -1) {
        serverLog(LL_WARNING,
            "Can't open %s in order to acquire a lock: %s",
            filename, strerror(errno));
        return C_ERR;
    }

    if (flock(fd,LOCK_EX|LOCK_NB) == -1) {
        if (errno == EWOULDBLOCK) {
            serverLog(LL_WARNING,
                "Sorry, the cluster configuration file %s is already used "
                "by a different Redis Cluster node. Please make sure that "
                "different nodes use different cluster configuration "
                "files.", filename);
        } else {
            serverLog(LL_WARNING,
                "Impossible to lock %s: %s", filename, strerror(errno));
        }
        close(fd);
        return C_ERR;
    }
    /* Lock acquired: leak the 'fd' by not closing it until shutdown time, so that
     * we'll retain the lock to the file as long as the process exists.
     *
     * After fork, the child process will get the fd opened by the parent process,
     * so we need to save 'fd' to 'cluster_config_file_lock_fd' so that redisFork()
     * can close it in the child process.
     * If it is not closed, when the main process is killed -9, but the child process
     * (redis-aof-rewrite) is still alive, the fd (lock) will still be held by the
     * child process, and the main process will fail to get the lock, meaning it
     * will fail to start. */
    server.cluster_config_file_lock_fd = fd;
#else
    UNUSED(filename);
#endif /* __sun */

    return C_OK;
}
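
/* A minimal sketch of the flock() pattern used above: the lock is tied to
 * the open file descriptor, which is exactly why clusterLockConfig() keeps
 * the fd open for the whole process lifetime. acquireFileLock() is a
 * hypothetical name used only for this example. */
#if 0 /* Illustrative sketch, not compiled. */
#include <fcntl.h>
#include <sys/file.h>
#include <unistd.h>

static int acquireFileLock(const char *path) {
    int fd = open(path, O_WRONLY|O_CREAT|O_CLOEXEC, 0644);
    if (fd == -1) return -1;
    /* LOCK_NB: fail immediately (EWOULDBLOCK) if someone holds the lock. */
    if (flock(fd, LOCK_EX|LOCK_NB) == -1) {
        close(fd);
        return -1;
    }
    return fd; /* Keep the fd open: closing it releases the lock. */
}
#endif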

/* Derives our ports to be announced in the cluster bus. */
void deriveAnnouncedPorts(int *announced_port, int *announced_pport,
                          int *announced_cport) {
    int port = server.tls_cluster ? server.tls_port : server.port;
    /* Default announced ports. */
    *announced_port = port;
    *announced_pport = server.tls_cluster ? server.port : 0;
    *announced_cport = server.cluster_port ? server.cluster_port : port + CLUSTER_PORT_INCR;

    /* Config overriding announced ports. */
    if (server.tls_cluster && server.cluster_announce_tls_port) {
        *announced_port = server.cluster_announce_tls_port;
        *announced_pport = server.cluster_announce_port;
    } else if (server.cluster_announce_port) {
        *announced_port = server.cluster_announce_port;
    }
    if (server.cluster_announce_bus_port) {
        *announced_cport = server.cluster_announce_bus_port;
    }
}

/* Some flags (currently just the NOFAILOVER flag) may need to be updated
 * in the "myself" node based on the current configuration of the node,
 * that may change at runtime via CONFIG SET. This function changes the
 * set of flags in myself->flags accordingly. */
void clusterUpdateMyselfFlags(void) {
    if (!myself) return;
    int oldflags = myself->flags;
    int nofailover = server.cluster_slave_no_failover ?
                     CLUSTER_NODE_NOFAILOVER : 0;
    myself->flags &= ~CLUSTER_NODE_NOFAILOVER;
    myself->flags |= nofailover;
    if (myself->flags != oldflags) {
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_UPDATE_STATE);
    }
}


/* We want to keep myself->ip in sync with the cluster-announce-ip option.
 * The option can be set at runtime via CONFIG SET. */
void clusterUpdateMyselfIp(void) {
    if (!myself) return;
    static char *prev_ip = NULL;
    char *curr_ip = server.cluster_announce_ip;
    int changed = 0;

    if (prev_ip == NULL && curr_ip != NULL) changed = 1;
    else if (prev_ip != NULL && curr_ip == NULL) changed = 1;
    else if (prev_ip && curr_ip && strcmp(prev_ip,curr_ip)) changed = 1;

    if (changed) {
        if (prev_ip) zfree(prev_ip);
        prev_ip = curr_ip;

        if (curr_ip) {
            /* We always keep a private copy of the announced IP, by
             * duplicating the string: this way later we can check if
             * the address really changed. */
            prev_ip = zstrdup(prev_ip);
            strncpy(myself->ip,server.cluster_announce_ip,NET_IP_STR_LEN-1);
            myself->ip[NET_IP_STR_LEN-1] = '\0';
        } else {
            myself->ip[0] = '\0'; /* Force autodetection. */
        }
    }
}

/* Update the hostname for the specified node with the provided C string. */
static void updateAnnouncedHostname(clusterNode *node, char *new) {
    /* Previous and new hostname are the same, no need to update. */
    if (new && !strcmp(new, node->hostname)) {
        return;
    }

    if (new) {
        node->hostname = sdscpy(node->hostname, new);
    } else if (sdslen(node->hostname) != 0) {
        sdsclear(node->hostname);
    }
}

/* Update my hostname based on server configuration values */
void clusterUpdateMyselfHostname(void) {
    if (!myself) return;
    updateAnnouncedHostname(myself, server.cluster_announce_hostname);
}

void clusterInit(void) {
    int saveconf = 0;

    server.cluster = zmalloc(sizeof(clusterState));
    server.cluster->myself = NULL;
    server.cluster->currentEpoch = 0;
    server.cluster->state = CLUSTER_FAIL;
    server.cluster->size = 1;
    server.cluster->todo_before_sleep = 0;
    server.cluster->nodes = dictCreate(&clusterNodesDictType);
    server.cluster->nodes_black_list =
        dictCreate(&clusterNodesBlackListDictType);
    server.cluster->failover_auth_time = 0;
    server.cluster->failover_auth_count = 0;
    server.cluster->failover_auth_rank = 0;
    server.cluster->failover_auth_epoch = 0;
    server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
    server.cluster->lastVoteEpoch = 0;

    /* Initialize stats */
    for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) {
        server.cluster->stats_bus_messages_sent[i] = 0;
        server.cluster->stats_bus_messages_received[i] = 0;
    }
    server.cluster->stats_pfail_nodes = 0;
    server.cluster->stat_cluster_links_buffer_limit_exceeded = 0;

    memset(server.cluster->slots,0, sizeof(server.cluster->slots));
    clusterCloseAllSlots();

    /* Lock the cluster config file to make sure every node uses
     * its own nodes.conf. */
    server.cluster_config_file_lock_fd = -1;
    if (clusterLockConfig(server.cluster_configfile) == C_ERR)
        exit(1);

    /* Load or create a new nodes configuration. */
    if (clusterLoadConfig(server.cluster_configfile) == C_ERR) {
        /* No configuration found. We will just use the random name provided
         * by the createClusterNode() function. */
        myself = server.cluster->myself =
            createClusterNode(NULL,CLUSTER_NODE_MYSELF|CLUSTER_NODE_MASTER);
        serverLog(LL_NOTICE,"No cluster configuration found, I'm %.40s",
            myself->name);
        clusterAddNode(myself);
        saveconf = 1;
    }
    if (saveconf) clusterSaveConfigOrDie(1);

    /* We need a listening TCP port for our cluster messaging needs. */
    server.cfd.count = 0;

    /* Port sanity check II
     * The other handshake port check is triggered too late to stop
     * us from trying to use a too-high cluster port number. */
    int port = server.tls_cluster ? server.tls_port : server.port;
    if (!server.cluster_port && port > (65535-CLUSTER_PORT_INCR)) {
        serverLog(LL_WARNING, "Redis port number too high. "
                   "Cluster communication port is 10,000 port "
                   "numbers higher than your Redis port. "
                   "Your Redis port number must be 55535 or less.");
        exit(1);
    }
    if (!server.bindaddr_count) {
        serverLog(LL_WARNING, "No bind address is configured, but it is required for the Cluster bus.");
        exit(1);
    }
    int cport = server.cluster_port ? server.cluster_port : port + CLUSTER_PORT_INCR;
    if (listenToPort(cport, &server.cfd) == C_ERR) {
        /* Note: the following log text is matched by the test suite. */
        serverLog(LL_WARNING, "Failed listening on port %u (cluster), aborting.", cport);
        exit(1);
    }

    if (createSocketAcceptHandler(&server.cfd, clusterAcceptHandler) != C_OK) {
        serverPanic("Unrecoverable error creating Redis Cluster socket accept handler.");
    }

    /* Initialize data for the Slot to key API. */
    slotToKeyInit(server.db);

    /* The slots -> channels map is a radix tree. Initialize it here. */
    server.cluster->slots_to_channels = raxNew();

    /* Set myself->port/cport/pport to my listening ports, we'll just need to
     * discover the IP address via MEET messages. */
    deriveAnnouncedPorts(&myself->port, &myself->pport, &myself->cport);

    server.cluster->mf_end = 0;
    server.cluster->mf_slave = NULL;
    resetManualFailover();
    clusterUpdateMyselfFlags();
    clusterUpdateMyselfIp();
    clusterUpdateMyselfHostname();
}

/* Reset a node performing a soft or hard reset:
 *
 * 1) All other nodes are forgotten.
 * 2) All the assigned / open slots are released.
 * 3) If the node is a slave, it turns into a master.
 * 4) Only for hard reset: a new Node ID is generated.
 * 5) Only for hard reset: currentEpoch and configEpoch are set to 0.
 * 6) The new configuration is saved and the cluster state updated.
 * 7) If the node was a slave, the whole data set is flushed away. */
void clusterReset(int hard) {
    dictIterator *di;
    dictEntry *de;
    int j;

    /* Turn into master. */
    if (nodeIsSlave(myself)) {
        clusterSetNodeAsMaster(myself);
        replicationUnsetMaster();
        emptyData(-1,EMPTYDB_NO_FLAGS,NULL);
    }

    /* Close slots, reset manual failover state. */
    clusterCloseAllSlots();
    resetManualFailover();

    /* Unassign all the slots. */
    for (j = 0; j < CLUSTER_SLOTS; j++) clusterDelSlot(j);

    /* Forget all the nodes, but myself. */
    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);

        if (node == myself) continue;
        clusterDelNode(node);
    }
    dictReleaseIterator(di);

    /* Hard reset only: set epochs to 0, change node ID. */
    if (hard) {
        sds oldname;

        server.cluster->currentEpoch = 0;
        server.cluster->lastVoteEpoch = 0;
        myself->configEpoch = 0;
        serverLog(LL_WARNING, "configEpoch set to 0 via CLUSTER RESET HARD");

        /* To change the Node ID we need to remove the old name from the
         * nodes table, change the ID, and re-add back with new name. */
        oldname = sdsnewlen(myself->name, CLUSTER_NAMELEN);
        dictDelete(server.cluster->nodes,oldname);
        sdsfree(oldname);
        getRandomHexChars(myself->name, CLUSTER_NAMELEN);
        clusterAddNode(myself);
        serverLog(LL_NOTICE,"Node hard reset, now I'm %.40s", myself->name);
    }

    /* Make sure to persist the new config and update the state. */
    clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                         CLUSTER_TODO_UPDATE_STATE|
                         CLUSTER_TODO_FSYNC_CONFIG);
}

/* -----------------------------------------------------------------------------
 * CLUSTER communication link
 * -------------------------------------------------------------------------- */

clusterLink *createClusterLink(clusterNode *node) {
    clusterLink *link = zmalloc(sizeof(*link));
    link->ctime = mstime();
    link->sndbuf = sdsempty();
    link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN);
    link->rcvbuf_len = 0;
    link->conn = NULL;
    link->node = node;
    /* The related node can only be known at link creation time if this is an outbound link. */
    link->inbound = (node == NULL);
    if (!link->inbound) {
        node->link = link;
    }
    return link;
}

/* Free a cluster link. This does not free the associated node, of course.
 * This function will just make sure that the original node associated
 * with this link will have the 'link' field set to NULL. */
void freeClusterLink(clusterLink *link) {
    if (link->conn) {
        connClose(link->conn);
        link->conn = NULL;
    }
    sdsfree(link->sndbuf);
    zfree(link->rcvbuf);
    if (link->node) {
        if (link->node->link == link) {
            serverAssert(!link->inbound);
            link->node->link = NULL;
        } else if (link->node->inbound_link == link) {
            serverAssert(link->inbound);
            link->node->inbound_link = NULL;
        }
    }
    zfree(link);
}

void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) {
    serverAssert(!link->node);
    serverAssert(link->inbound);
    if (node->inbound_link) {
        /* A peer may disconnect and then reconnect with us, and it's not guaranteed that
         * we would always process the disconnection of the existing inbound link before
         * accepting a new inbound link. Therefore, it's possible to have more than
         * one inbound link from the same node at the same time. */
        serverLog(LL_DEBUG, "Replacing inbound link fd %d from node %.40s with fd %d",
                node->inbound_link->conn->fd, node->name, link->conn->fd);
    }
    node->inbound_link = link;
    link->node = node;
}

static void clusterConnAcceptHandler(connection *conn) {
    clusterLink *link;

    if (connGetState(conn) != CONN_STATE_CONNECTED) {
        serverLog(LL_VERBOSE,
            "Error accepting cluster node connection: %s", connGetLastError(conn));
        connClose(conn);
        return;
    }

    /* Create a link object we use to handle the connection.
     * It gets passed to the readable handler when data is available.
     * Initially the link->node pointer is set to NULL as we don't know
     * which node it is, but the right node is referenced once we know
     * the node identity. */
    link = createClusterLink(NULL);
    link->conn = conn;
    connSetPrivateData(conn, link);

    /* Register read handler */
    connSetReadHandler(conn, clusterReadHandler);
}

#define MAX_CLUSTER_ACCEPTS_PER_CALL 1000
void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
    int cport, cfd;
    int max = MAX_CLUSTER_ACCEPTS_PER_CALL;
    char cip[NET_IP_STR_LEN];
    UNUSED(el);
    UNUSED(mask);
    UNUSED(privdata);

    /* If the server is starting up, don't accept cluster connections:
     * UPDATE messages may interact with the database content. */
    if (server.masterhost == NULL && server.loading) return;

    while(max--) {
        cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport);
        if (cfd == ANET_ERR) {
            if (errno != EWOULDBLOCK)
                serverLog(LL_VERBOSE,
                    "Error accepting cluster node: %s", server.neterr);
            return;
        }

        connection *conn = server.tls_cluster ?
            connCreateAcceptedTLS(cfd, TLS_CLIENT_AUTH_YES) : connCreateAcceptedSocket(cfd);

        /* Make sure connection is not in an error state */
        if (connGetState(conn) != CONN_STATE_ACCEPTING) {
            serverLog(LL_VERBOSE,
                "Error creating an accepting connection for cluster node: %s",
                connGetLastError(conn));
            connClose(conn);
            return;
        }
        connEnableTcpNoDelay(conn);
        connKeepAlive(conn,server.cluster_node_timeout * 2);

        /* Use non-blocking I/O for cluster messages. */
        serverLog(LL_VERBOSE,"Accepting cluster node connection from %s:%d", cip, cport);

        /* Accept the connection now. connAccept() may call our handler directly
         * or schedule it for later depending on connection implementation.
         */
        if (connAccept(conn, clusterConnAcceptHandler) == C_ERR) {
            if (connGetState(conn) == CONN_STATE_ERROR)
                serverLog(LL_VERBOSE,
                    "Error accepting cluster node connection: %s",
                    connGetLastError(conn));
            connClose(conn);
            return;
        }
    }
}

/* Return the approximate number of sockets we are using for the
 * cluster bus connections. */
unsigned long getClusterConnectionsCount(void) {
    /* We decrement the number of nodes by one, since there is the
     * "myself" node too in the list. Each node uses two file descriptors,
     * one incoming and one outgoing, thus the multiplication by 2. */
    return server.cluster_enabled ?
           ((dictSize(server.cluster->nodes)-1)*2) : 0;
}
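
/* For example, from the point of view of any node of a 6-node cluster the
 * count above is (6-1)*2 = 10 bus sockets: one inbound and one outbound
 * link for each of the other five nodes. */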

/* -----------------------------------------------------------------------------
 * Key space handling
 * -------------------------------------------------------------------------- */

/* We have 16384 hash slots. The hash slot of a given key is obtained
 * as the least significant 14 bits of the crc16 of the key.
 *
 * However if the key contains the {...} pattern, only the part between
 * { and } is hashed. This may be useful in the future to force certain
 * keys to be in the same node (assuming no resharding is in progress). */
unsigned int keyHashSlot(char *key, int keylen) {
    int s, e; /* start-end indexes of { and } */

    for (s = 0; s < keylen; s++)
        if (key[s] == '{') break;

    /* No '{' ? Hash the whole key. This is the base case. */
    if (s == keylen) return crc16(key,keylen) & 0x3FFF;

    /* '{' found? Check if we have the corresponding '}'. */
    for (e = s+1; e < keylen; e++)
        if (key[e] == '}') break;

    /* No '}' or nothing between {} ? Hash the whole key. */
    if (e == keylen || e == s+1) return crc16(key,keylen) & 0x3FFF;

    /* If we are here there is both a { and a } on its right. Hash
     * what is in the middle between { and }. */
    return crc16(key+s+1,e-s-1) & 0x3FFF;
}
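
/* A quick illustration of the hash tag rules implemented above. The slot
 * equalities follow directly from the function; the keys are arbitrary
 * examples and exampleHashTags() is just a hypothetical demo. */
#if 0 /* Illustrative sketch, not compiled. */
void exampleHashTags(void) {
    /* No braces: the whole key is hashed. */
    unsigned int s1 = keyHashSlot("user1000", 8);
    /* Only "user1000" between { and } is hashed, so these all land in the
     * same slot, which is what makes multi-key operations possible. */
    unsigned int s2 = keyHashSlot("{user1000}.following", 20);
    unsigned int s3 = keyHashSlot("{user1000}.followers", 20);
    serverAssert(s1 == s2 && s2 == s3);
    /* Empty braces: nothing between { and }, so the whole key is hashed. */
    unsigned int s4 = keyHashSlot("foo{}bar", 8);
    (void)s4;
}
#endif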

/* -----------------------------------------------------------------------------
 * CLUSTER node API
 * -------------------------------------------------------------------------- */

/* Create a new cluster node, with the specified flags.
 * If "nodename" is NULL this is considered a first handshake and a random
 * node name is assigned to this node (it will be fixed later when we'll
 * receive the first pong).
 *
 * The node is created and returned to the user, but it is not automatically
 * added to the nodes hash table. */
clusterNode *createClusterNode(char *nodename, int flags) {
    clusterNode *node = zmalloc(sizeof(*node));

    if (nodename)
        memcpy(node->name, nodename, CLUSTER_NAMELEN);
    else
        getRandomHexChars(node->name, CLUSTER_NAMELEN);
    node->ctime = mstime();
    node->configEpoch = 0;
    node->flags = flags;
    memset(node->slots,0,sizeof(node->slots));
    node->slot_info_pairs = NULL;
    node->slot_info_pairs_count = 0;
    node->numslots = 0;
    node->numslaves = 0;
    node->slaves = NULL;
    node->slaveof = NULL;
    node->last_in_ping_gossip = 0;
    node->ping_sent = node->pong_received = 0;
    node->data_received = 0;
    node->fail_time = 0;
    node->link = NULL;
    node->inbound_link = NULL;
    memset(node->ip,0,sizeof(node->ip));
    node->hostname = sdsempty();
    node->port = 0;
    node->cport = 0;
    node->pport = 0;
    node->fail_reports = listCreate();
    node->voted_time = 0;
    node->orphaned_time = 0;
    node->repl_offset_time = 0;
    node->repl_offset = 0;
    listSetFreeMethod(node->fail_reports,zfree);
    return node;
}

/* This function is called every time we get a failure report from a node.
 * The side effect is to populate the fail_reports list (or to update
 * the timestamp of an existing report).
 *
 * 'failing' is the node that is in failure state according to the
 * 'sender' node.
 *
 * The function returns 0 if it just updates a timestamp of an existing
 * failure report from the same sender. 1 is returned if a new failure
 * report is created. */
int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) {
    list *l = failing->fail_reports;
    listNode *ln;
    listIter li;
    clusterNodeFailReport *fr;

    /* If a failure report from the same sender already exists, just update
     * the timestamp. */
    listRewind(l,&li);
    while ((ln = listNext(&li)) != NULL) {
        fr = ln->value;
        if (fr->node == sender) {
            fr->time = mstime();
            return 0;
        }
    }

    /* Otherwise create a new report. */
    fr = zmalloc(sizeof(*fr));
    fr->node = sender;
    fr->time = mstime();
    listAddNodeTail(l,fr);
    return 1;
}

/* Remove failure reports that are too old, where too old means reasonably
 * older than the global node timeout. Note that anyway for a node to be
 * flagged as FAIL we need to have a local PFAIL state that is at least
 * older than the global node timeout, so we don't just trust the number
 * of failure reports from other nodes. */
void clusterNodeCleanupFailureReports(clusterNode *node) {
    list *l = node->fail_reports;
    listNode *ln;
    listIter li;
    clusterNodeFailReport *fr;
    mstime_t maxtime = server.cluster_node_timeout *
                       CLUSTER_FAIL_REPORT_VALIDITY_MULT;
    mstime_t now = mstime();

    listRewind(l,&li);
    while ((ln = listNext(&li)) != NULL) {
        fr = ln->value;
        if (now - fr->time > maxtime) listDelNode(l,ln);
    }
}

/* Remove the failure report for 'node' if it was previously considered
 * failing by 'sender'. This function is called when a node informs us via
 * gossip that a node is OK from its point of view (no FAIL or PFAIL flags).
 *
 * Note that this function is called relatively often as it gets called even
 * when there are no nodes failing, and is O(N), however when the cluster is
 * fine the failure reports list is empty so the function runs in constant
 * time.
 *
 * The function returns 1 if the failure report was found and removed.
 * Otherwise 0 is returned. */
int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) {
    list *l = node->fail_reports;
    listNode *ln;
    listIter li;
    clusterNodeFailReport *fr;

    /* Search for a failure report from this sender. */
    listRewind(l,&li);
    while ((ln = listNext(&li)) != NULL) {
        fr = ln->value;
        if (fr->node == sender) break;
    }
    if (!ln) return 0; /* No failure report from this sender. */

    /* Remove the failure report. */
    listDelNode(l,ln);
    clusterNodeCleanupFailureReports(node);
    return 1;
}

/* Return the number of external nodes that believe 'node' is failing,
 * not including this node, that may have a PFAIL or FAIL state for this
 * node as well. */
int clusterNodeFailureReportsCount(clusterNode *node) {
    clusterNodeCleanupFailureReports(node);
    return listLength(node->fail_reports);
}

int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) {
    int j;

    for (j = 0; j < master->numslaves; j++) {
        if (master->slaves[j] == slave) {
            if ((j+1) < master->numslaves) {
                int remaining_slaves = (master->numslaves - j) - 1;
                memmove(master->slaves+j,master->slaves+(j+1),
                        (sizeof(*master->slaves) * remaining_slaves));
            }
            master->numslaves--;
            if (master->numslaves == 0)
                master->flags &= ~CLUSTER_NODE_MIGRATE_TO;
            return C_OK;
        }
    }
    return C_ERR;
}
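
/* The removal above is the usual "shift the tail left" idiom for dense
 * arrays: removing index j from an array 'arr' of 'count' elements is
 *
 *     memmove(arr+j, arr+j+1, sizeof(*arr) * (count - j - 1));
 *     count--;
 *
 * with the memmove skipped entirely when j is the last index (no tail to
 * shift), which is what the (j+1) < master->numslaves guard does above. */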

int clusterNodeAddSlave(clusterNode *master, clusterNode *slave) {
    int j;

    /* If it's already a slave, don't add it again. */
    for (j = 0; j < master->numslaves; j++)
        if (master->slaves[j] == slave) return C_ERR;
    master->slaves = zrealloc(master->slaves,
        sizeof(clusterNode*)*(master->numslaves+1));
    master->slaves[master->numslaves] = slave;
    master->numslaves++;
    master->flags |= CLUSTER_NODE_MIGRATE_TO;
    return C_OK;
}

int clusterCountNonFailingSlaves(clusterNode *n) {
    int j, okslaves = 0;

    for (j = 0; j < n->numslaves; j++)
        if (!nodeFailed(n->slaves[j])) okslaves++;
    return okslaves;
}

/* Low level cleanup of the node structure. Only called by clusterDelNode(). */
void freeClusterNode(clusterNode *n) {
    sds nodename;
    int j;

    /* If the node has associated slaves, we have to set
     * all the slaves->slaveof fields to NULL (unknown). */
    for (j = 0; j < n->numslaves; j++)
        n->slaves[j]->slaveof = NULL;

    /* Remove this node from the list of slaves of its master. */
    if (nodeIsSlave(n) && n->slaveof) clusterNodeRemoveSlave(n->slaveof,n);

    /* Unlink from the set of nodes. */
    nodename = sdsnewlen(n->name, CLUSTER_NAMELEN);
    serverAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK);
    sdsfree(nodename);
    sdsfree(n->hostname);

    /* Release links and associated data structures. */
    if (n->link) freeClusterLink(n->link);
    if (n->inbound_link) freeClusterLink(n->inbound_link);
    listRelease(n->fail_reports);
    zfree(n->slaves);
    zfree(n);
}

/* Add a node to the nodes hash table */
void clusterAddNode(clusterNode *node) {
    int retval;

    retval = dictAdd(server.cluster->nodes,
            sdsnewlen(node->name,CLUSTER_NAMELEN), node);
    serverAssert(retval == DICT_OK);
}

/* Remove a node from the cluster. The function performs the high level
 * cleanup, calling freeClusterNode() for the low level cleanup.
 * Here we do the following:
 *
 * 1) Mark all the slots handled by it as unassigned.
 * 2) Remove all the failure reports sent by this node and referenced by
 *    other nodes.
 * 3) Free the node with freeClusterNode() that will in turn remove it
 *    from the hash table and from the list of slaves of its master, if
 *    it is a slave node.
 */
void clusterDelNode(clusterNode *delnode) {
    int j;
    dictIterator *di;
    dictEntry *de;

    /* 1) Mark slots as unassigned. */
    for (j = 0; j < CLUSTER_SLOTS; j++) {
        if (server.cluster->importing_slots_from[j] == delnode)
            server.cluster->importing_slots_from[j] = NULL;
        if (server.cluster->migrating_slots_to[j] == delnode)
            server.cluster->migrating_slots_to[j] = NULL;
        if (server.cluster->slots[j] == delnode)
            clusterDelSlot(j);
    }

    /* 2) Remove failure reports. */
    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);

        if (node == delnode) continue;
        clusterNodeDelFailureReport(node,delnode);
    }
    dictReleaseIterator(di);

    /* 3) Free the node, unlinking it from the cluster. */
    freeClusterNode(delnode);
}

/* Cluster node sanity check. Returns C_OK if the node id
 * is valid and C_ERR otherwise. */
int verifyClusterNodeId(const char *name, int length) {
    if (length != CLUSTER_NAMELEN) return C_ERR;
    for (int i = 0; i < length; i++) {
        if (name[i] >= 'a' && name[i] <= 'z') continue;
        if (name[i] >= '0' && name[i] <= '9') continue;
        return C_ERR;
    }
    return C_OK;
}
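
/* Usage sketch: node IDs are exactly CLUSTER_NAMELEN (40) characters long
 * and, being generated with getRandomHexChars(), are lowercase hex, though
 * the check above accepts any character in [0-9a-z]. For example:
 *
 *     verifyClusterNodeId("07c37dfeb235213a872192d90877d0cd55635b91", 40)
 *
 * returns C_OK, while the same string uppercased, or any string whose
 * length is not 40, returns C_ERR. */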

/* Node lookup by name */
clusterNode *clusterLookupNode(const char *name, int length) {
    if (verifyClusterNodeId(name, length) != C_OK) return NULL;
    sds s = sdsnewlen(name, length);
    dictEntry *de = dictFind(server.cluster->nodes, s);
    sdsfree(s);
    if (de == NULL) return NULL;
    return dictGetVal(de);
}

/* Get all the nodes serving the same slots as the given node. */
list *clusterGetNodesServingMySlots(clusterNode *node) {
    list *nodes_for_slot = listCreate();
    clusterNode *my_primary = nodeIsMaster(node) ? node : node->slaveof;

    /* This function is only valid for fully connected nodes, so
     * they should have a known primary. */
    serverAssert(my_primary);
    listAddNodeTail(nodes_for_slot, my_primary);
    for (int i=0; i < my_primary->numslaves; i++) {
        listAddNodeTail(nodes_for_slot, my_primary->slaves[i]);
    }
    return nodes_for_slot;
}

/* This is only used after the handshake. When we connect a given IP/PORT
 * as a result of CLUSTER MEET we don't have the node name yet, so we
 * pick a random one, and will fix it when we receive the PONG request using
 * this function. */
void clusterRenameNode(clusterNode *node, char *newname) {
    int retval;
    sds s = sdsnewlen(node->name, CLUSTER_NAMELEN);

    serverLog(LL_DEBUG,"Renaming node %.40s into %.40s",
        node->name, newname);
    retval = dictDelete(server.cluster->nodes, s);
    sdsfree(s);
    serverAssert(retval == DICT_OK);
    memcpy(node->name, newname, CLUSTER_NAMELEN);
    clusterAddNode(node);
}

/* -----------------------------------------------------------------------------
 * CLUSTER config epoch handling
 * -------------------------------------------------------------------------- */

/* Return the greatest configEpoch found in the cluster, or the current
 * epoch if greater than any node configEpoch. */
uint64_t clusterGetMaxEpoch(void) {
    uint64_t max = 0;
    dictIterator *di;
    dictEntry *de;

    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);
        if (node->configEpoch > max) max = node->configEpoch;
    }
    dictReleaseIterator(di);
    if (max < server.cluster->currentEpoch) max = server.cluster->currentEpoch;
    return max;
}

/* If this node epoch is zero or is not already the greatest across the
 * cluster (from the POV of the local configuration), this function will:
 *
 * 1) Generate a new config epoch, incrementing the current epoch.
 * 2) Assign the new epoch to this node, WITHOUT any consensus.
 * 3) Persist the configuration on disk before sending packets with the
 *    new configuration.
 *
 * If the new config epoch is generated and assigned, C_OK is returned,
 * otherwise C_ERR is returned (since the node has already the greatest
 * configuration around) and no operation is performed.
 *
 * Important note: this function violates the principle that config epochs
 * should be generated with consensus and should be unique across the cluster.
 * However Redis Cluster uses these auto-generated new config epochs in two
 * cases:
 *
 * 1) When slots are closed after importing. Otherwise resharding would be
 *    too expensive.
 * 2) When CLUSTER FAILOVER is called with options that force a slave to
 *    failover its master even if there is no master majority able to
 *    create a new configuration epoch.
 *
 * Redis Cluster will not explode using this function, even in the case of
 * a collision between this node and another node, generating the same
 * configuration epoch unilaterally, because the config epoch conflict
 * resolution algorithm will eventually move colliding nodes to different
 * config epochs. However using this function may violate the "last failover
 * wins" rule, so should only be used with care. */
int clusterBumpConfigEpochWithoutConsensus(void) {
    uint64_t maxEpoch = clusterGetMaxEpoch();

    if (myself->configEpoch == 0 ||
        myself->configEpoch != maxEpoch)
    {
        server.cluster->currentEpoch++;
        myself->configEpoch = server.cluster->currentEpoch;
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_FSYNC_CONFIG);
        serverLog(LL_WARNING,
            "New configEpoch set to %llu",
            (unsigned long long) myself->configEpoch);
        return C_OK;
    } else {
        return C_ERR;
    }
}

/* This function is called when this node is a master, and we receive from
 * another master a configuration epoch that is equal to our configuration
 * epoch.
 *
 * BACKGROUND
 *
 * It is not possible that different slaves get the same config
 * epoch during a failover election, because the slaves need to get voted
 * by a majority. However when we perform a manual resharding of the cluster
 * the node will assign a configuration epoch to itself without asking
 * for agreement. Usually resharding happens when the cluster is working well
 * and is supervised by the sysadmin, however it is possible for a failover
 * to happen exactly while the node we are resharding a slot to assigns itself
 * a new configuration epoch, but before it is able to propagate it.
 *
 * So technically it is possible in this condition that two nodes end with
 * the same configuration epoch.
 *
 * Another possibility is that there are bugs in the implementation causing
 * this to happen.
 *
 * Moreover when a new cluster is created, all the nodes start with the same
 * configEpoch. This collision resolution code allows nodes to automatically
 * end with a different configEpoch at startup.
 *
 * In all the cases, we want a mechanism that resolves this issue automatically
 * as a safeguard. The same configuration epoch for masters serving different
 * set of slots is not harmful, but it is if the nodes end serving the same
 * slots for some reason (manual errors or software bugs) without a proper
 * failover procedure.
 *
 * In general we want a system that eventually always ends with different
 * masters having different configuration epochs whatever happened, since
 * nothing is worse than a split-brain condition in a distributed system.
 *
 * BEHAVIOR
 *
 * When this function gets called, what happens is that if this node
 * has the lexicographically smaller Node ID compared to the other node
 * with the conflicting epoch (the 'sender' node), it will assign itself
 * the greatest configuration epoch currently detected among nodes plus 1.
 *
 * This means that even if there are multiple nodes colliding, the node
 * with the greatest Node ID never moves forward, so eventually all the nodes
 * end with a different configuration epoch.
 */
void clusterHandleConfigEpochCollision(clusterNode *sender) {
    /* Prerequisites: nodes have the same configEpoch and are both masters. */
    if (sender->configEpoch != myself->configEpoch ||
        !nodeIsMaster(sender) || !nodeIsMaster(myself)) return;
    /* Don't act if the colliding node has a smaller Node ID. */
    if (memcmp(sender->name,myself->name,CLUSTER_NAMELEN) <= 0) return;
    /* Get the next ID available at the best of this node knowledge. */
    server.cluster->currentEpoch++;
    myself->configEpoch = server.cluster->currentEpoch;
    clusterSaveConfigOrDie(1);
    serverLog(LL_VERBOSE,
        "WARNING: configEpoch collision with node %.40s."
        " configEpoch set to %llu",
        sender->name,
        (unsigned long long) myself->configEpoch);
}
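
/* Worked example of the rule above: masters A and B both claim configEpoch
 * 7, and A's Node ID is lexicographically smaller than B's. When A receives
 * a packet from B, the memcmp() check passes (B's name is greater), so A
 * bumps currentEpoch and takes configEpoch 8, while B stays at 7. With more
 * than two colliding masters the same rule applies pairwise until every
 * master holds a distinct epoch. */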

/* -----------------------------------------------------------------------------
 * CLUSTER nodes blacklist
 *
 * The nodes blacklist is just a way to ensure that a given node with a given
 * Node ID is not re-added before some time elapsed (this time is specified
 * in seconds in CLUSTER_BLACKLIST_TTL).
 *
 * This is useful when we want to remove a node from the cluster completely:
 * when CLUSTER FORGET is called, it also puts the node into the blacklist so
 * that even if we receive gossip messages from other nodes that still remember
 * about the node we want to remove, we don't re-add it before some time.
 *
 * Currently the CLUSTER_BLACKLIST_TTL is set to 1 minute, this means
 * that redis-cli has 60 seconds to send CLUSTER FORGET messages to nodes
 * in the cluster without dealing with the problem of other nodes re-adding
 * back the node to nodes we already sent the FORGET command to.
 *
 * The data structure used is a hash table with an sds string representing
 * the node ID as key, and the time when it is ok to re-add the node as
 * value.
 * -------------------------------------------------------------------------- */

#define CLUSTER_BLACKLIST_TTL 60 /* 1 minute. */


/* Before the addNode() or Exists() operations we always remove expired
 * entries from the black list. This is an O(N) operation but it is not a
 * problem since add / exists operations are called very infrequently and
 * the hash table is supposed to contain very few elements at most.
 * However without the cleanup, during long uptime and with some automated
 * node add/removal procedures, entries could accumulate. */
void clusterBlacklistCleanup(void) {
    dictIterator *di;
    dictEntry *de;

    di = dictGetSafeIterator(server.cluster->nodes_black_list);
    while((de = dictNext(di)) != NULL) {
        int64_t expire = dictGetUnsignedIntegerVal(de);

        if (expire < server.unixtime)
            dictDelete(server.cluster->nodes_black_list,dictGetKey(de));
    }
    dictReleaseIterator(di);
}

/* Cleanup the blacklist and add a new node ID to the black list. */
void clusterBlacklistAddNode(clusterNode *node) {
    dictEntry *de;
    sds id = sdsnewlen(node->name,CLUSTER_NAMELEN);

    clusterBlacklistCleanup();
    if (dictAdd(server.cluster->nodes_black_list,id,NULL) == DICT_OK) {
        /* If the key was added, duplicate the sds string representation of
         * the key for the next lookup. We'll free it at the end. */
        id = sdsdup(id);
    }
    de = dictFind(server.cluster->nodes_black_list,id);
    dictSetUnsignedIntegerVal(de,time(NULL)+CLUSTER_BLACKLIST_TTL);
    sdsfree(id);
}

/* Return non-zero if the specified node ID exists in the blacklist.
 * You don't need to pass an sds string here, any pointer to 40 bytes
 * will work. */
int clusterBlacklistExists(char *nodeid) {
    sds id = sdsnewlen(nodeid,CLUSTER_NAMELEN);
    int retval;

    clusterBlacklistCleanup();
    retval = dictFind(server.cluster->nodes_black_list,id) != NULL;
    sdsfree(id);
    return retval;
}
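
/* How the calls above fit together, sketched from the CLUSTER FORGET flow
 * (simplified from clusterCommand()): */
#if 0 /* Illustrative sketch, not compiled. */
void forgetNodeExample(clusterNode *n) {
    clusterBlacklistAddNode(n); /* No re-adds for CLUSTER_BLACKLIST_TTL secs. */
    clusterDelNode(n);          /* Drop it from our view of the cluster. */
    /* Later, while processing gossip sections, a node ID that is still
     * blacklisted is ignored instead of being re-added:
     *
     *     if (clusterBlacklistExists(node_id)) return;
     */
}
#endif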

/* -----------------------------------------------------------------------------
 * CLUSTER messages exchange - PING/PONG and gossip
 * -------------------------------------------------------------------------- */

/* This function checks if a given node should be marked as FAIL.
 * It happens if the following conditions are met:
 *
 * 1) We received enough failure reports from other master nodes via gossip.
 *    Enough means that the majority of the masters signaled the node is
 *    down recently.
 * 2) We believe this node is in PFAIL state.
 *
 * If a failure is detected we also inform the whole cluster about this
 * event, trying to force every other node to set the FAIL flag for the node.
 *
 * Note that the form of agreement used here is weak, as we collect the
 * majority of masters state over some window of time, and even if we force
 * agreement by propagating the FAIL message, because of partitions we may
 * not reach every node. However:
 *
 * 1) Either we reach the majority and eventually the FAIL state will
 *    propagate to the whole cluster.
 * 2) Or there is no majority, so no slave promotion will be authorized and
 *    the FAIL flag will be cleared after some time.
 */
void markNodeAsFailingIfNeeded(clusterNode *node) {
    int failures;
    int needed_quorum = (server.cluster->size / 2) + 1;

    if (!nodeTimedOut(node)) return; /* We can reach it. */
    if (nodeFailed(node)) return; /* Already FAILing. */

    failures = clusterNodeFailureReportsCount(node);
    /* Also count myself as a voter if I'm a master. */
    if (nodeIsMaster(myself)) failures++;
    if (failures < needed_quorum) return; /* No weak agreement from masters. */

    serverLog(LL_NOTICE,
        "Marking node %.40s as failing (quorum reached).", node->name);

    /* Mark the node as failing. */
    node->flags &= ~CLUSTER_NODE_PFAIL;
    node->flags |= CLUSTER_NODE_FAIL;
    node->fail_time = mstime();

    /* Broadcast the failing node name to everybody, forcing all the other
     * reachable nodes to flag the node as FAIL.
     * We do that even if this node is a replica and not a master: anyway
     * the failing state is triggered by collecting failure reports from
     * masters, so here the replica is only helping to propagate this
     * status. */
    clusterSendFail(node->name);
    clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
}
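
/* Worked example (illustrative): in a cluster with server.cluster->size == 5
 * masters, the needed quorum is 5/2 + 1 == 3. If this node is a master it
 * contributes its own PFAIL view, so two failure reports collected from
 * other masters via gossip are enough:
 *
 *     failures = clusterNodeFailureReportsCount(node);   -- e.g. 2
 *     if (nodeIsMaster(myself)) failures++;              -- now 3
 *
 * 3 >= 3, so the node is flagged FAIL and the event is broadcast. */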

/* This function is called only if a node is marked as FAIL, but we are able
 * to reach it again. It checks whether the conditions to undo the FAIL
 * state are met. */
void clearNodeFailureIfNeeded(clusterNode *node) {
    mstime_t now = mstime();

    serverAssert(nodeFailed(node));

    /* For slaves we always clear the FAIL flag if we can contact the
     * node again. */
    if (nodeIsSlave(node) || node->numslots == 0) {
        serverLog(LL_NOTICE,
            "Clear FAIL state for node %.40s: %s is reachable again.",
                node->name,
                nodeIsSlave(node) ? "replica" : "master without slots");
        node->flags &= ~CLUSTER_NODE_FAIL;
        clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
    }

    /* If it is a master and...
     * 1) The FAIL state is old enough.
     * 2) It is still serving slots from our point of view (it was not
     *    failed over).
     * Then apparently nobody is going to fix these slots, so we clear the
     * FAIL flag. */
    if (nodeIsMaster(node) && node->numslots > 0 &&
        (now - node->fail_time) >
        (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT))
    {
        serverLog(LL_NOTICE,
            "Clear FAIL state for node %.40s: is reachable again and nobody is serving its slots after some time.",
                node->name);
        node->flags &= ~CLUSTER_NODE_FAIL;
        clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
    }
}

/* Return true if we already have a node in HANDSHAKE state matching the
 * specified IP address and port number. This function is used in order to
 * avoid adding a new handshake node for the same address multiple times. */
int clusterHandshakeInProgress(char *ip, int port, int cport) {
    dictIterator *di;
    dictEntry *de;

    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);

        if (!nodeInHandshake(node)) continue;
        if (!strcasecmp(node->ip,ip) &&
            node->port == port &&
            node->cport == cport) break;
    }
    dictReleaseIterator(di);
    return de != NULL;
}

/* Start a handshake with the specified address if there is not one
 * already in progress. Returns non-zero if the handshake was actually
 * started. On error zero is returned and errno is set to one of the
 * following values:
 *
 * EAGAIN - There is already a handshake in progress for this address.
 * EINVAL - IP or port are not valid. */
int clusterStartHandshake(char *ip, int port, int cport) {
    clusterNode *n;
    char norm_ip[NET_IP_STR_LEN];
    struct sockaddr_storage sa;

    /* IP sanity check */
    if (inet_pton(AF_INET,ip,
            &(((struct sockaddr_in *)&sa)->sin_addr)))
    {
        sa.ss_family = AF_INET;
    } else if (inet_pton(AF_INET6,ip,
            &(((struct sockaddr_in6 *)&sa)->sin6_addr)))
    {
        sa.ss_family = AF_INET6;
    } else {
        errno = EINVAL;
        return 0;
    }

    /* Port sanity check */
    if (port <= 0 || port > 65535 || cport <= 0 || cport > 65535) {
        errno = EINVAL;
        return 0;
    }

    /* Set norm_ip as the normalized string representation of the node
     * IP address. */
    memset(norm_ip,0,NET_IP_STR_LEN);
    if (sa.ss_family == AF_INET)
        inet_ntop(AF_INET,
            (void*)&(((struct sockaddr_in *)&sa)->sin_addr),
            norm_ip,NET_IP_STR_LEN);
    else
        inet_ntop(AF_INET6,
            (void*)&(((struct sockaddr_in6 *)&sa)->sin6_addr),
            norm_ip,NET_IP_STR_LEN);

    if (clusterHandshakeInProgress(norm_ip,port,cport)) {
        errno = EAGAIN;
        return 0;
    }

    /* Add the node with a random address (NULL as first argument to
     * createClusterNode()). Everything will be fixed during the
     * handshake. */
    n = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_MEET);
    memcpy(n->ip,norm_ip,sizeof(n->ip));
    n->port = port;
    n->cport = cport;
    clusterAddNode(n);
    return 1;
}
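
/* Usage sketch (illustrative): this is the entry point used by the CLUSTER
 * MEET command. A caller validates the user-supplied address and then does
 * something like:
 *
 *     if (clusterStartHandshake(ip, port, cport) == 0 && errno == EINVAL) {
 *         ... reply with an "Invalid node address" error ...
 *     }
 *
 * Note that EAGAIN (handshake already in progress) is deliberately not an
 * error for the caller: the pending handshake will complete on its own. */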

/* Process the gossip section of PING or PONG packets.
 * Note that this function assumes that the packet is already sanity-checked
 * by the caller: not the content of the gossip section, but its length. */
void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
    uint16_t count = ntohs(hdr->count);
    clusterMsgDataGossip *g = (clusterMsgDataGossip*) hdr->data.ping.gossip;
    clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN);

    while(count--) {
        uint16_t flags = ntohs(g->flags);
        clusterNode *node;
        sds ci;

        if (server.verbosity == LL_DEBUG) {
            ci = representClusterNodeFlags(sdsempty(), flags);
            serverLog(LL_DEBUG,"GOSSIP %.40s %s:%d@%d %s",
                g->nodename,
                g->ip,
                ntohs(g->port),
                ntohs(g->cport),
                ci);
            sdsfree(ci);
        }

        /* Update our state according to the gossip sections. */
        node = clusterLookupNode(g->nodename, CLUSTER_NAMELEN);
        if (node) {
            /* We already know this node.
               Handle failure reports only when the sender is a master. */
            if (sender && nodeIsMaster(sender) && node != myself) {
                if (flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) {
                    if (clusterNodeAddFailureReport(node,sender)) {
                        serverLog(LL_VERBOSE,
                            "Node %.40s reported node %.40s as not reachable.",
                            sender->name, node->name);
                    }
                    markNodeAsFailingIfNeeded(node);
                } else {
                    if (clusterNodeDelFailureReport(node,sender)) {
                        serverLog(LL_VERBOSE,
                            "Node %.40s reported node %.40s is back online.",
                            sender->name, node->name);
                    }
                }
            }

            /* If from our POV the node is up (no failure flags are set),
             * we have no pending ping for the node, and we have no failure
             * reports for it, update the last pong time with the one we
             * see from the other nodes. */
            if (!(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) &&
                node->ping_sent == 0 &&
                clusterNodeFailureReportsCount(node) == 0)
            {
                mstime_t pongtime = ntohl(g->pong_received);
                pongtime *= 1000; /* Convert back to milliseconds. */

                /* Replace the pong time with the received one only if
                 * it's greater than our view but is not in the future
                 * (with 500 milliseconds tolerance) from the POV of our
                 * clock. */
                if (pongtime <= (server.mstime+500) &&
                    pongtime > node->pong_received)
                {
                    node->pong_received = pongtime;
                }
            }

            /* If we already know this node, but it is not reachable, and
             * we see a different address in the gossip section of a node
             * that can talk with this other node, update the address and
             * disconnect the old link if any, so that we'll attempt to
             * connect with the new address. */
            if (node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL) &&
                !(flags & CLUSTER_NODE_NOADDR) &&
                !(flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) &&
                (strcasecmp(node->ip,g->ip) ||
                 node->port != ntohs(g->port) ||
                 node->cport != ntohs(g->cport)))
            {
                if (node->link) freeClusterLink(node->link);
                memcpy(node->ip,g->ip,NET_IP_STR_LEN);
                node->port = ntohs(g->port);
                node->pport = ntohs(g->pport);
                node->cport = ntohs(g->cport);
                node->flags &= ~CLUSTER_NODE_NOADDR;
            }
        } else {
            /* If it's not in NOADDR state and we don't have it, we
             * add it to our trusted dict with the exact node ID and flags.
             * Note that we cannot simply start a handshake against
             * this IP/port pair, since the address may already have been
             * reused by a different node.
             *
             * Note that we require that the sender of this gossip message
             * is a well known node in our cluster, otherwise we risk
             * joining another cluster. */
            if (sender &&
                !(flags & CLUSTER_NODE_NOADDR) &&
                !clusterBlacklistExists(g->nodename))
            {
                clusterNode *node;
                node = createClusterNode(g->nodename, flags);
                memcpy(node->ip,g->ip,NET_IP_STR_LEN);
                node->port = ntohs(g->port);
                node->pport = ntohs(g->pport);
                node->cport = ntohs(g->cport);
                clusterAddNode(node);
            }
        }

        /* Next node */
        g++;
    }
}

/* IP -> string conversion. 'buf' is supposed to be at least NET_IP_STR_LEN
 * (46) bytes. If 'announced_ip' length is non-zero, it is used instead of
 * extracting the IP from the socket peer address. */
void nodeIp2String(char *buf, clusterLink *link, char *announced_ip) {
    if (announced_ip[0] != '\0') {
        memcpy(buf,announced_ip,NET_IP_STR_LEN);
        buf[NET_IP_STR_LEN-1] = '\0'; /* We are not sure the input is sane. */
    } else {
        connPeerToString(link->conn, buf, NET_IP_STR_LEN, NULL);
    }
}

/* Update the node address to the IP address that can be extracted
 * from link->fd, or if hdr->myip is non empty, to the address the node
 * is announcing to us. The port is taken from the packet header as well.
 *
 * If the address or port changed, disconnect the node link so that we'll
 * connect again to the new address.
 *
 * If the ip/port pair is already correct, no operation is performed at
 * all.
 *
 * The function returns 0 if the node address is still the same,
 * otherwise 1 is returned. */
int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link,
                              clusterMsg *hdr)
{
    char ip[NET_IP_STR_LEN] = {0};
    int port = ntohs(hdr->port);
    int pport = ntohs(hdr->pport);
    int cport = ntohs(hdr->cport);

    /* We don't proceed if the link is the same as the sender link, as this
     * function is designed to see if the node link is consistent with the
     * symmetric link that is used to receive PINGs from the node.
     *
     * As a side effect this function never frees the passed 'link', so
     * it is safe to call during packet processing. */
    if (link == node->link) return 0;

    nodeIp2String(ip,link,hdr->myip);
    if (node->port == port && node->cport == cport && node->pport == pport &&
        strcmp(ip,node->ip) == 0) return 0;

    /* IP / port is different, update it. */
    memcpy(node->ip,ip,sizeof(ip));
    node->port = port;
    node->pport = pport;
    node->cport = cport;
    if (node->link) freeClusterLink(node->link);
    node->flags &= ~CLUSTER_NODE_NOADDR;
    serverLog(LL_WARNING,"Address updated for node %.40s, now %s:%d",
        node->name, node->ip, node->port);

    /* Check if this is our master and we have to change the
     * replication target as well. */
    if (nodeIsSlave(myself) && myself->slaveof == node)
        replicationSetMaster(node->ip, node->port);
    return 1;
}

/* Reconfigure the specified node 'n' as a master. This function is called
 * when we learn that a node we believed to be a slave is now acting as a
 * master, in order to update its state. */
void clusterSetNodeAsMaster(clusterNode *n) {
    if (nodeIsMaster(n)) return;

    if (n->slaveof) {
        clusterNodeRemoveSlave(n->slaveof,n);
        if (n != myself) n->flags |= CLUSTER_NODE_MIGRATE_TO;
    }
    n->flags &= ~CLUSTER_NODE_SLAVE;
    n->flags |= CLUSTER_NODE_MASTER;
    n->slaveof = NULL;

    /* Update config and state. */
    clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                         CLUSTER_TODO_UPDATE_STATE);
}

/* This function is called when we receive a master configuration via a
 * PING, PONG or UPDATE packet. What we receive is a node, a configEpoch of
 * the node, and the set of slots claimed under this configEpoch.
 *
 * What we do is to rebind the slots with a newer configuration compared to
 * our local configuration, and if needed, we turn ourselves into a replica
 * of the node (see the function comments for more info).
 *
 * The 'sender' is the node for which we received a configuration update.
 * Sometimes it is not the actual "sender" of the information, like in the
 * case we receive the info via an UPDATE packet. */
void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoch, unsigned char *slots) {
    int j;
    clusterNode *curmaster = NULL, *newmaster = NULL;
    /* The dirty slots list is a list of slots for which we lost ownership
     * while still having keys inside. This usually happens after a failover
     * or after a manual cluster reconfiguration operated by the admin.
     *
     * If the update message is not able to demote a master to slave (in this
     * case we'll resync with the master updating the whole key space), we
     * need to delete all the keys in the slots we lost ownership of. */
    uint16_t dirty_slots[CLUSTER_SLOTS];
    int dirty_slots_count = 0;

    /* We should detect if sender is the new master of our shard.
     * We will know it if all our slots were migrated to sender, and sender
     * has no slots except ours. */
    int sender_slots = 0;
    int migrated_our_slots = 0;

    /* Here we set curmaster to this node or the node this node
     * replicates to if it's a slave. In the for loop we are
     * interested in checking if slots are taken away from curmaster. */
    curmaster = nodeIsMaster(myself) ? myself : myself->slaveof;

    if (sender == myself) {
        serverLog(LL_WARNING,"Discarding UPDATE message about myself.");
        return;
    }

    for (j = 0; j < CLUSTER_SLOTS; j++) {
        if (bitmapTestBit(slots,j)) {
            sender_slots++;

            /* The slot is already bound to the sender of this message. */
            if (server.cluster->slots[j] == sender) continue;

            /* The slot is in importing state, it should be modified only
             * manually via redis-cli (example: a resharding is in progress
             * and the migrating side slot was already closed and is
             * advertising a new config. We still want the slot to be closed
             * manually). */
            if (server.cluster->importing_slots_from[j]) continue;

            /* We rebind the slot to the new node claiming it if:
             * 1) The slot was unassigned or the new node claims it with a
             *    greater configEpoch.
             * 2) We are not currently importing the slot. */
            if (server.cluster->slots[j] == NULL ||
                server.cluster->slots[j]->configEpoch < senderConfigEpoch)
            {
                /* Was this slot mine, and does it still contain keys? Mark
                 * it as a dirty slot. */
                if (server.cluster->slots[j] == myself &&
                    countKeysInSlot(j) &&
                    sender != myself)
                {
                    dirty_slots[dirty_slots_count] = j;
                    dirty_slots_count++;
                }

                if (server.cluster->slots[j] == curmaster) {
                    newmaster = sender;
                    migrated_our_slots++;
                }
                clusterDelSlot(j);
                clusterAddSlot(sender,j);
                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                                     CLUSTER_TODO_UPDATE_STATE|
                                     CLUSTER_TODO_FSYNC_CONFIG);
            }
        }
    }

    /* After updating the slots configuration, don't do any actual change
     * in the state of the server if a module disabled Redis Cluster
     * keys redirections. */
    if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
        return;

    /* If at least one slot was reassigned from a node to another node
     * with a greater configEpoch, it is possible that:
     * 1) We are a master left without slots. This means that we were
     *    failed over and we should turn into a replica of the new
     *    master.
     * 2) We are a slave and our master is left without slots. We need
     *    to replicate to the new slots owner. */
    if (newmaster && curmaster->numslots == 0 &&
            (server.cluster_allow_replica_migration ||
             sender_slots == migrated_our_slots)) {
        serverLog(LL_WARNING,
            "Configuration change detected. Reconfiguring myself "
            "as a replica of %.40s", sender->name);
        clusterSetMaster(sender);
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_UPDATE_STATE|
                             CLUSTER_TODO_FSYNC_CONFIG);
    } else if (myself->slaveof && myself->slaveof->slaveof) {
        /* Safeguard against sub-replicas. A replica's master can turn itself
         * into a replica if its last slot is removed. If no other node takes
         * over the slot, there is nothing else to trigger replica migration. */
        serverLog(LL_WARNING,
            "I'm a sub-replica! Reconfiguring myself as a replica of grandmaster %.40s",
            myself->slaveof->slaveof->name);
        clusterSetMaster(myself->slaveof->slaveof);
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_UPDATE_STATE|
                             CLUSTER_TODO_FSYNC_CONFIG);
    } else if (dirty_slots_count) {
        /* If we are here, we received an update message which removed
         * ownership for certain slots we still have keys in, but we are
         * still serving some other slots, so this master node was not
         * demoted to a slave.
         *
         * In order to maintain a consistent state between keys and slots
         * we need to remove all the keys from the slots we lost. */
        for (j = 0; j < dirty_slots_count; j++)
            delKeysInSlot(dirty_slots[j]);
    }
}
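
/* Worked example (illustrative): suppose we are a master serving only slot
 * 100 and still hold keys in it. A PONG arrives from node B claiming slot
 * 100 with a greater configEpoch. Since slot 100 was our last slot,
 * curmaster->numslots drops to 0, newmaster == B, and we reconfigure
 * ourselves as a replica of B (the full resync then cleans the key space).
 * If we instead still served other slots, slot 100 would end up in
 * dirty_slots and delKeysInSlot(100) would drop its keys, keeping keys and
 * slots consistent. */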

/* Cluster ping extensions.
 *
 * The ping/pong/meet messages support arbitrary extensions to add additional
 * metadata to the messages that are sent between the various nodes in the
 * cluster. The extensions take the form:
 * [ Header: type + total length (8 bytes) ]
 * [ Extension payload (arbitrary length, but must be 8 byte padded) ]
 */
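
/* Worked example (illustrative; assumes the 8 byte header layout described
 * above): a hostname extension carrying the 10 byte string "my-redis-1" is
 * laid out as
 *
 *     [ type = CLUSTERMSG_EXT_TYPE_HOSTNAME ]  2 bytes (network order)
 *     [ unused padding                      ]  2 bytes
 *     [ length = 8 + 16 = 24                ]  4 bytes (network order)
 *     [ "my-redis-1" + NUL + 5 padding bytes] 16 bytes
 *
 * since EIGHT_BYTE_ALIGN(10 + 1) == 16: the string plus its terminator is
 * rounded up to the next multiple of 8, so the following extension header
 * stays 8 byte aligned. */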

/* Returns the length of a given extension */
static uint32_t getPingExtLength(clusterMsgPingExt *ext) {
    return ntohl(ext->length);
}

/* Returns the initial position of ping extensions. May return an invalid
 * address if there are no ping extensions. */
static clusterMsgPingExt *getInitialPingExt(clusterMsg *hdr, uint16_t count) {
    clusterMsgPingExt *initial = (clusterMsgPingExt*) &(hdr->data.ping.gossip[count]);
    return initial;
}

/* Given a current ping extension, returns the start of the next extension.
 * May return an invalid address if there are no further ping extensions. */
static clusterMsgPingExt *getNextPingExt(clusterMsgPingExt *ext) {
    clusterMsgPingExt *next = (clusterMsgPingExt *) (((char *) ext) + getPingExtLength(ext));
    return next;
}

/* Returns the exact size needed to store the hostname. The returned value
 * will be 8 byte padded. */
int getHostnamePingExtSize() {
    /* If hostname is not set, we don't send this extension */
    if (sdslen(myself->hostname) == 0) return 0;

    int totlen = sizeof(clusterMsgPingExt) + EIGHT_BYTE_ALIGN(sdslen(myself->hostname) + 1);
    return totlen;
}

/* Write the hostname ping extension at the start of the cursor. This function
 * will update the cursor to point to the end of the written extension and
 * will return the amount of bytes written. */
int writeHostnamePingExt(clusterMsgPingExt **cursor) {
    /* If hostname is not set, we don't send this extension */
    if (sdslen(myself->hostname) == 0) return 0;

    /* Add the hostname information at the extension cursor */
    clusterMsgPingExtHostname *ext = &(*cursor)->ext[0].hostname;
    memcpy(ext->hostname, myself->hostname, sdslen(myself->hostname));
    uint32_t extension_size = getHostnamePingExtSize();

    /* Fill the extension header and move the write cursor. */
    (*cursor)->type = htons(CLUSTERMSG_EXT_TYPE_HOSTNAME);
    (*cursor)->length = htonl(extension_size);
    /* Make sure the string is NULL terminated by adding 1 */
    *cursor = (clusterMsgPingExt *) (ext->hostname + EIGHT_BYTE_ALIGN(sdslen(myself->hostname) + 1));
    return extension_size;
}
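
/* Worked example (illustrative): with myself->hostname == "redis-1"
 * (7 bytes), getHostnamePingExtSize() returns
 *
 *     sizeof(clusterMsgPingExt) + EIGHT_BYTE_ALIGN(7 + 1) == 8 + 8 == 16
 *
 * and writeHostnamePingExt() advances the cursor by exactly those 16 bytes,
 * so the next extension (if any) starts on an 8 byte boundary as required
 * by the validation loop in clusterProcessPacket(). */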

/* We previously validated the extensions, so this function just needs to
 * process them. */
void clusterProcessPingExtensions(clusterMsg *hdr, clusterLink *link) {
    clusterNode *sender = link->node ? link->node : clusterLookupNode(hdr->sender, CLUSTER_NAMELEN);
    char *ext_hostname = NULL;
    uint16_t extensions = ntohs(hdr->extensions);
    /* Loop through all the extensions and process them */
    clusterMsgPingExt *ext = getInitialPingExt(hdr, ntohs(hdr->count));
    while (extensions--) {
        uint16_t type = ntohs(ext->type);
        if (type == CLUSTERMSG_EXT_TYPE_HOSTNAME) {
            clusterMsgPingExtHostname *hostname_ext = (clusterMsgPingExtHostname *) &(ext->ext[0].hostname);
            ext_hostname = hostname_ext->hostname;
        } else {
            /* Unknown type, we will ignore it but log what happened. */
            serverLog(LL_WARNING, "Received unknown extension type %d", type);
        }

        /* We know this will be valid since we validated it ahead of time */
        ext = getNextPingExt(ext);
    }
    /* If the node did not send us a hostname extension, assume
     * it doesn't have an announced hostname. Otherwise, we'll
     * set it now. */
    updateAnnouncedHostname(sender, ext_hostname);
}

static clusterNode *getNodeFromLinkAndMsg(clusterLink *link, clusterMsg *hdr) {
    clusterNode *sender;
    if (link->node && !nodeInHandshake(link->node)) {
        /* If the link has an associated node, use that so that we don't have
         * to look it up every time. The exception is when the node is still
         * in handshake: it still has a random name, so it is not truly
         * "known" yet. */
        sender = link->node;
    } else {
        /* Otherwise, fetch the sender based on the message */
        sender = clusterLookupNode(hdr->sender, CLUSTER_NAMELEN);
        /* We know the sender node but haven't associated it with the link.
         * This must be an inbound link, because only for inbound links we
         * didn't know which node to associate with them when they were
         * created. */
        if (sender && !link->node) {
            setClusterNodeToInboundClusterLink(sender, link);
        }
    }
    return sender;
}

/* When this function is called, there is a packet to process starting
 * at link->rcvbuf. Releasing the buffer is up to the caller, so this
 * function should just handle the higher level stuff of processing the
 * packet, modifying the cluster state if needed.
 *
 * The function returns 1 if the link is still valid after the packet
 * was processed, otherwise 0 if the link was freed since the packet
 * processing led to some inconsistency error (for instance a PONG
 * received from the wrong sender ID). */
int clusterProcessPacket(clusterLink *link) {
    clusterMsg *hdr = (clusterMsg*) link->rcvbuf;
    uint32_t totlen = ntohl(hdr->totlen);
    uint16_t type = ntohs(hdr->type);
    mstime_t now = mstime();

    if (type < CLUSTERMSG_TYPE_COUNT)
        server.cluster->stats_bus_messages_received[type]++;
    serverLog(LL_DEBUG,"--- Processing packet of type %s, %lu bytes",
        clusterGetMessageTypeString(type), (unsigned long) totlen);

    /* Perform sanity checks */
    if (totlen < 16) return 1; /* At least signature, version, totlen, count. */
    if (totlen > link->rcvbuf_len) return 1;

    if (ntohs(hdr->ver) != CLUSTER_PROTO_VER) {
        /* Can't handle messages of different versions. */
        return 1;
    }

    if (type == server.cluster_drop_packet_filter) {
        serverLog(LL_WARNING, "Dropping packet that matches debug drop filter");
        return 1;
    }

    uint16_t flags = ntohs(hdr->flags);
    uint16_t extensions = ntohs(hdr->extensions);
    uint64_t senderCurrentEpoch = 0, senderConfigEpoch = 0;
    uint32_t explen; /* expected length of this packet */
    clusterNode *sender;

    if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG ||
        type == CLUSTERMSG_TYPE_MEET)
    {
        uint16_t count = ntohs(hdr->count);

        explen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
        explen += (sizeof(clusterMsgDataGossip)*count);

        /* If there is extension data, which doesn't have a fixed length,
         * loop through the extensions and validate their lengths now. */
        if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) {
            clusterMsgPingExt *ext = getInitialPingExt(hdr, count);
            while (extensions--) {
                uint16_t extlen = getPingExtLength(ext);
                if (extlen % 8 != 0) {
                    serverLog(LL_WARNING, "Received a %s packet without proper padding (%d bytes)",
                        clusterGetMessageTypeString(type), (int) extlen);
                    return 1;
                }
                if ((totlen - explen) < extlen) {
                    serverLog(LL_WARNING, "Received invalid %s packet with extension data that exceeds "
                        "total packet length (%lld)", clusterGetMessageTypeString(type),
                        (unsigned long long) totlen);
                    return 1;
                }
                explen += extlen;
                ext = getNextPingExt(ext);
            }
        }
    } else if (type == CLUSTERMSG_TYPE_FAIL) {
        explen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
        explen += sizeof(clusterMsgDataFail);
    } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) {
        explen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
        explen += sizeof(clusterMsgDataPublish) -
                8 +
                ntohl(hdr->data.publish.msg.channel_len) +
                ntohl(hdr->data.publish.msg.message_len);
    } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST ||
               type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK ||
               type == CLUSTERMSG_TYPE_MFSTART)
    {
        explen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
    } else if (type == CLUSTERMSG_TYPE_UPDATE) {
        explen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
        explen += sizeof(clusterMsgDataUpdate);
    } else if (type == CLUSTERMSG_TYPE_MODULE) {
        explen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
        explen += sizeof(clusterMsgModule) -
                3 + ntohl(hdr->data.module.msg.len);
    } else {
        /* We don't know this type of packet, so we assume it's well formed. */
        explen = totlen;
    }

    if (totlen != explen) {
        serverLog(LL_WARNING, "Received invalid %s packet of length %lld but expected length %lld",
            clusterGetMessageTypeString(type), (unsigned long long) totlen, (unsigned long long) explen);
        return 1;
    }
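
    /* Worked example (illustrative): for a FAIL packet the expected length
     * is the fixed header without the message union, plus the FAIL payload:
     *
     *     explen = sizeof(clusterMsg) - sizeof(union clusterMsgData)
     *            + sizeof(clusterMsgDataFail);
     *
     * so any FAIL packet whose totlen differs is dropped above. Variable
     * length types (PING/PONG/MEET gossip and extensions, PUBLISH payloads,
     * MODULE payloads) instead add their advertised sizes to explen before
     * the same comparison. */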

    sender = getNodeFromLinkAndMsg(link, hdr);

    /* Update the last time we saw any data from this node. We
     * use this in order to avoid detecting a timeout from a node that
     * is just sending a lot of data in the cluster bus, for instance
     * because of Pub/Sub. */
    if (sender) sender->data_received = now;

    if (sender && !nodeInHandshake(sender)) {
        /* Update our currentEpoch if we see a newer epoch in the cluster. */
        senderCurrentEpoch = ntohu64(hdr->currentEpoch);
        senderConfigEpoch = ntohu64(hdr->configEpoch);
        if (senderCurrentEpoch > server.cluster->currentEpoch)
            server.cluster->currentEpoch = senderCurrentEpoch;
        /* Update the sender configEpoch if it is publishing a newer one. */
        if (senderConfigEpoch > sender->configEpoch) {
            sender->configEpoch = senderConfigEpoch;
            clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                                 CLUSTER_TODO_FSYNC_CONFIG);
        }
        /* Update the replication offset info for this node. */
        sender->repl_offset = ntohu64(hdr->offset);
        sender->repl_offset_time = now;
        /* If we are a slave performing a manual failover and our master
         * sent its offset while already paused, populate the MF state. */
        if (server.cluster->mf_end &&
            nodeIsSlave(myself) &&
            myself->slaveof == sender &&
            hdr->mflags[0] & CLUSTERMSG_FLAG0_PAUSED &&
            server.cluster->mf_master_offset == -1)
        {
            server.cluster->mf_master_offset = sender->repl_offset;
            clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER);
            serverLog(LL_WARNING,
                "Received replication offset for paused "
                "master manual failover: %lld",
                server.cluster->mf_master_offset);
        }
    }

    /* Initial processing of PING and MEET requests replying with a PONG. */
    if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_MEET) {
        /* We use incoming MEET messages in order to set the address
         * for 'myself', since only other cluster nodes will send us
         * MEET messages on handshakes, when the cluster joins, or
         * later if we changed address, and those nodes will use our
         * official address to connect to us. So obtaining this address
         * from the socket is a simple way to discover / update our own
         * address in the cluster without it being hardcoded in the config.
         *
         * However if we don't have an address at all, we update the address
         * even with a normal PING packet. If it's wrong it will be fixed
         * by MEET later. */
        if ((type == CLUSTERMSG_TYPE_MEET || myself->ip[0] == '\0') &&
            server.cluster_announce_ip == NULL)
        {
            char ip[NET_IP_STR_LEN];

            if (connSockName(link->conn,ip,sizeof(ip),NULL) != -1 &&
                strcmp(ip,myself->ip))
            {
                memcpy(myself->ip,ip,NET_IP_STR_LEN);
                serverLog(LL_WARNING,"IP address for this node updated to %s",
                    myself->ip);
                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
            }
        }

        /* Add this node if it is new for us and the msg type is MEET.
         * In this stage we don't try to add the node with the right
         * flags, slaveof pointer, and so forth, as these details will be
         * resolved when we'll receive PONGs from the node. */
        if (!sender && type == CLUSTERMSG_TYPE_MEET) {
            clusterNode *node;

            node = createClusterNode(NULL,CLUSTER_NODE_HANDSHAKE);
            nodeIp2String(node->ip,link,hdr->myip);
            node->port = ntohs(hdr->port);
            node->pport = ntohs(hdr->pport);
            node->cport = ntohs(hdr->cport);
            clusterAddNode(node);
            clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
        }

        /* If this is a MEET packet from an unknown node, we still process
         * the gossip section here since we have to trust the sender because
         * of the message type. */
        if (!sender && type == CLUSTERMSG_TYPE_MEET)
            clusterProcessGossipSection(hdr,link);

        /* Anyway reply with a PONG */
        clusterSendPing(link,CLUSTERMSG_TYPE_PONG);
    }

    /* PING, PONG, MEET: process config information. */
    if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG ||
        type == CLUSTERMSG_TYPE_MEET)
    {
        serverLog(LL_DEBUG,"%s packet received: %.40s",
            clusterGetMessageTypeString(type),
            link->node ? link->node->name : "NULL");
        if (!link->inbound) {
            if (nodeInHandshake(link->node)) {
                /* If we already have this node, try to change the
                 * IP/port of the node with the new one. */
                if (sender) {
                    serverLog(LL_VERBOSE,
                        "Handshake: we already know node %.40s, "
                        "updating the address if needed.", sender->name);
                    if (nodeUpdateAddressIfNeeded(sender,link,hdr))
                    {
                        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                                             CLUSTER_TODO_UPDATE_STATE);
                    }
                    /* Free this node as we already have it. This will
                     * cause the link to be freed as well. */
                    clusterDelNode(link->node);
                    return 0;
                }

                /* First thing to do is replacing the random name with the
                 * right node name if this was a handshake stage. */
                clusterRenameNode(link->node, hdr->sender);
                serverLog(LL_DEBUG,"Handshake with node %.40s completed.",
                    link->node->name);
                link->node->flags &= ~CLUSTER_NODE_HANDSHAKE;
                link->node->flags |= flags&(CLUSTER_NODE_MASTER|CLUSTER_NODE_SLAVE);
                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
            } else if (memcmp(link->node->name,hdr->sender,
                        CLUSTER_NAMELEN) != 0)
            {
                /* If the reply has a non matching node ID we
                 * disconnect this node and set it as not having an associated
                 * address. */
                serverLog(LL_DEBUG,"PONG contains mismatching sender ID. About node %.40s added %d ms ago, having flags %d",
                    link->node->name,
                    (int)(now-(link->node->ctime)),
                    link->node->flags);
                link->node->flags |= CLUSTER_NODE_NOADDR;
                link->node->ip[0] = '\0';
                link->node->port = 0;
                link->node->pport = 0;
                link->node->cport = 0;
                freeClusterLink(link);
                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
                return 0;
            }
        }

        /* Copy the CLUSTER_NODE_NOFAILOVER flag from what the sender
         * announced. This is a dynamic flag that we receive from the
         * sender, and the latest status must be trusted. We need it to
         * be propagated because the slave ranking, used to understand the
         * delay of each slave in the voting process, needs to know which
         * instances are really competing. */
        if (sender) {
            int nofailover = flags & CLUSTER_NODE_NOFAILOVER;
            sender->flags &= ~CLUSTER_NODE_NOFAILOVER;
            sender->flags |= nofailover;
        }

        /* Update the node address if it changed. */
        if (sender && type == CLUSTERMSG_TYPE_PING &&
            !nodeInHandshake(sender) &&
            nodeUpdateAddressIfNeeded(sender,link,hdr))
        {
            clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                                 CLUSTER_TODO_UPDATE_STATE);
        }

        /* Update our info about the node */
        if (!link->inbound && type == CLUSTERMSG_TYPE_PONG) {
            link->node->pong_received = now;
            link->node->ping_sent = 0;

            /* The PFAIL condition can be reversed without external
             * help if it is momentary (that is, if it does not
             * turn into a FAIL state).
             *
             * The FAIL condition is also reversible under specific
             * conditions detected by clearNodeFailureIfNeeded(). */
            if (nodeTimedOut(link->node)) {
                link->node->flags &= ~CLUSTER_NODE_PFAIL;
                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                                     CLUSTER_TODO_UPDATE_STATE);
            } else if (nodeFailed(link->node)) {
                clearNodeFailureIfNeeded(link->node);
            }
        }

        /* Check for role switch: slave -> master or master -> slave. */
        if (sender) {
            if (!memcmp(hdr->slaveof,CLUSTER_NODE_NULL_NAME,
                sizeof(hdr->slaveof)))
            {
                /* Node is a master. */
                clusterSetNodeAsMaster(sender);
            } else {
                /* Node is a slave. */
                clusterNode *master = clusterLookupNode(hdr->slaveof, CLUSTER_NAMELEN);

                if (nodeIsMaster(sender)) {
                    /* Master turned into a slave! Reconfigure the node. */
                    clusterDelNodeSlots(sender);
                    sender->flags &= ~(CLUSTER_NODE_MASTER|
                                       CLUSTER_NODE_MIGRATE_TO);
                    sender->flags |= CLUSTER_NODE_SLAVE;

                    /* Update config and state. */
                    clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                                         CLUSTER_TODO_UPDATE_STATE);
                }

                /* Master node changed for this slave? */
                if (master && sender->slaveof != master) {
                    if (sender->slaveof)
                        clusterNodeRemoveSlave(sender->slaveof,sender);
                    clusterNodeAddSlave(master,sender);
                    sender->slaveof = master;

                    /* Update config. */
                    clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG);
                }
            }
        }

        /* Update our info about served slots.
         *
         * Note: this MUST happen after we update the master/slave state
         * so that CLUSTER_NODE_MASTER flag will be set. */

        /* Many checks are only needed if the set of served slots this
         * instance claims is different compared to the set of slots we have
         * for it. Check this ASAP to avoid other computationally expensive
         * checks later. */
        clusterNode *sender_master = NULL; /* Sender or its master if slave. */
        int dirty_slots = 0; /* Sender claimed slots don't match my view? */

        if (sender) {
            sender_master = nodeIsMaster(sender) ? sender : sender->slaveof;
            if (sender_master) {
                dirty_slots = memcmp(sender_master->slots,
                        hdr->myslots,sizeof(hdr->myslots)) != 0;
            }
        }

        /* 1) If the sender of the message is a master, and we detected that
         *    the set of slots it claims changed, scan the slots to see if we
         *    need to update our configuration. */
        if (sender && nodeIsMaster(sender) && dirty_slots)
            clusterUpdateSlotsConfigWith(sender,senderConfigEpoch,hdr->myslots);

        /* 2) We also check for the reverse condition, that is, the sender
         *    claims to serve slots we know are served by a master with a
         *    greater configEpoch. If this happens we inform the sender.
         *
         * This is useful because sometimes after a partition heals, a
         * reappearing master may be the last one to claim a given set of
         * hash slots, but with a configuration that other instances know to
         * be deprecated. Example:
         *
         * A and B are master and slave for slots 1,2,3.
         * A is partitioned away, B gets promoted.
         * B is partitioned away, and A returns available.
         *
         * Usually B would PING A publishing its set of served slots and its
         * configEpoch, but because of the partition B can't inform A of the
         * new configuration, so other nodes that have an updated table must
         * do it. In this way A will stop acting as a master (or can try to
         * failover if the conditions to win the election are met). */
        if (sender && dirty_slots) {
            int j;

            for (j = 0; j < CLUSTER_SLOTS; j++) {
                if (bitmapTestBit(hdr->myslots,j)) {
                    if (server.cluster->slots[j] == sender ||
                        server.cluster->slots[j] == NULL) continue;
                    if (server.cluster->slots[j]->configEpoch >
                        senderConfigEpoch)
                    {
                        serverLog(LL_VERBOSE,
                            "Node %.40s has old slots configuration, sending "
                            "an UPDATE message about %.40s",
                                sender->name, server.cluster->slots[j]->name);
                        clusterSendUpdate(sender->link,
                            server.cluster->slots[j]);

                        /* TODO: instead of exiting the loop send every other
                         * UPDATE packet for other nodes that are the new
                         * owner of sender's slots. */
                        break;
                    }
                }
            }
        }

        /* If our config epoch collides with the sender's try to fix
         * the problem. */
        if (sender &&
            nodeIsMaster(myself) && nodeIsMaster(sender) &&
            senderConfigEpoch == myself->configEpoch)
        {
            clusterHandleConfigEpochCollision(sender);
        }

        /* Get info from the gossip section */
        if (sender) {
            clusterProcessGossipSection(hdr,link);
            clusterProcessPingExtensions(hdr,link);
        }
    } else if (type == CLUSTERMSG_TYPE_FAIL) {
        clusterNode *failing;

        if (sender) {
            failing = clusterLookupNode(hdr->data.fail.about.nodename, CLUSTER_NAMELEN);
            if (failing &&
                !(failing->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_MYSELF)))
            {
                serverLog(LL_NOTICE,
                    "FAIL message received from %.40s about %.40s",
                    hdr->sender, hdr->data.fail.about.nodename);
                failing->flags |= CLUSTER_NODE_FAIL;
                failing->fail_time = now;
                failing->flags &= ~CLUSTER_NODE_PFAIL;
                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                                     CLUSTER_TODO_UPDATE_STATE);
            }
        } else {
            serverLog(LL_NOTICE,
                "Ignoring FAIL message from unknown node %.40s about %.40s",
                hdr->sender, hdr->data.fail.about.nodename);
        }
    } else if (type == CLUSTERMSG_TYPE_PUBLISH || type == CLUSTERMSG_TYPE_PUBLISHSHARD) {
        if (!sender) return 1; /* We don't know that node. */

        robj *channel, *message;
        uint32_t channel_len, message_len;

        /* Don't bother creating useless objects if there are no
         * Pub/Sub subscribers. */
        if ((type == CLUSTERMSG_TYPE_PUBLISH
            && serverPubsubSubscriptionCount() > 0)
        || (type == CLUSTERMSG_TYPE_PUBLISHSHARD
            && serverPubsubShardSubscriptionCount() > 0))
        {
            channel_len = ntohl(hdr->data.publish.msg.channel_len);
            message_len = ntohl(hdr->data.publish.msg.message_len);
            channel = createStringObject(
                        (char*)hdr->data.publish.msg.bulk_data,channel_len);
            message = createStringObject(
                        (char*)hdr->data.publish.msg.bulk_data+channel_len,
                        message_len);
            pubsubPublishMessage(channel, message, type == CLUSTERMSG_TYPE_PUBLISHSHARD);
            decrRefCount(channel);
            decrRefCount(message);
        }
    } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) {
        if (!sender) return 1; /* We don't know that node. */
        clusterSendFailoverAuthIfNeeded(sender,hdr);
    } else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) {
        if (!sender) return 1; /* We don't know that node. */
        /* We consider this vote only if the sender is a master serving
         * a non zero number of slots, and its currentEpoch is greater or
         * equal to the epoch where this node started the election. */
        if (nodeIsMaster(sender) && sender->numslots > 0 &&
            senderCurrentEpoch >= server.cluster->failover_auth_epoch)
        {
            server.cluster->failover_auth_count++;
            /* Maybe we reached a quorum here, set a flag to make sure
             * we check ASAP. */
            clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
        }
    } else if (type == CLUSTERMSG_TYPE_MFSTART) {
        /* This message is acceptable only if I'm a master and the sender
         * is one of my slaves. */
        if (!sender || sender->slaveof != myself) return 1;
        /* Manual failover requested from slaves. Initialize the state
         * accordingly. */
        resetManualFailover();
        server.cluster->mf_end = now + CLUSTER_MF_TIMEOUT;
        server.cluster->mf_slave = sender;
        pauseClients(PAUSE_DURING_FAILOVER,
                     now + (CLUSTER_MF_TIMEOUT * CLUSTER_MF_PAUSE_MULT),
                     CLIENT_PAUSE_WRITE);
        serverLog(LL_WARNING,"Manual failover requested by replica %.40s.",
            sender->name);
        /* We need to send a ping message to the replica, as it would carry
         * `server.cluster->mf_master_offset`, which means the master paused
         * clients at that offset, so that the replica knows it is safe to
         * set its `server.cluster->mf_can_start` to 1 and complete the
         * failover as quickly as possible. */
        clusterSendPing(link, CLUSTERMSG_TYPE_PING);
    } else if (type == CLUSTERMSG_TYPE_UPDATE) {
        clusterNode *n; /* The node the update is about. */
        uint64_t reportedConfigEpoch =
                    ntohu64(hdr->data.update.nodecfg.configEpoch);

        if (!sender) return 1; /* We don't know the sender. */
        n = clusterLookupNode(hdr->data.update.nodecfg.nodename, CLUSTER_NAMELEN);
        if (!n) return 1; /* We don't know the reported node. */
        if (n->configEpoch >= reportedConfigEpoch) return 1; /* Nothing new. */

        /* If in our current config the node is a slave, set it as a master. */
        if (nodeIsSlave(n)) clusterSetNodeAsMaster(n);

        /* Update the node's configEpoch. */
        n->configEpoch = reportedConfigEpoch;
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                             CLUSTER_TODO_FSYNC_CONFIG);

        /* Check the bitmap of served slots and update our
         * config accordingly. */
        clusterUpdateSlotsConfigWith(n,reportedConfigEpoch,
            hdr->data.update.nodecfg.slots);
    } else if (type == CLUSTERMSG_TYPE_MODULE) {
        if (!sender) return 1; /* Protect the module from unknown nodes. */
        /* We need to route this message back to the right module subscribed
         * for the right message type. */
        uint64_t module_id = hdr->data.module.msg.module_id; /* Endian-safe ID */
        uint32_t len = ntohl(hdr->data.module.msg.len);
        uint8_t type = hdr->data.module.msg.type;
        unsigned char *payload = hdr->data.module.msg.bulk_data;
        moduleCallClusterReceivers(sender->name,module_id,type,payload,len);
    } else {
        serverLog(LL_WARNING,"Received unknown packet type: %d", type);
    }
    return 1;
}

/* This function is called when we detect that the link with this node is
   lost. We set the node as no longer connected. The Cluster Cron will
   detect the missing connection and will try to get it connected again.

   Instead if the node is a temporary node used to accept a query, we
   completely free the node on error. */
void handleLinkIOError(clusterLink *link) {
    freeClusterLink(link);
}

/* Send data. This is handled using a trivial send buffer that gets
 * consumed by write(). We don't try to optimize this for speed too much
 * as this is a very low traffic channel. */
void clusterWriteHandler(connection *conn) {
    clusterLink *link = connGetPrivateData(conn);
    ssize_t nwritten;

    nwritten = connWrite(conn, link->sndbuf, sdslen(link->sndbuf));
    if (nwritten <= 0) {
        serverLog(LL_DEBUG,"I/O error writing to node link: %s",
            (nwritten == -1) ? connGetLastError(conn) : "short write");
        handleLinkIOError(link);
        return;
    }
    sdsrange(link->sndbuf,nwritten,-1);
    if (sdslen(link->sndbuf) == 0)
        connSetWriteHandler(link->conn, NULL);
}

/* A connect handler that gets called when a connection to another node
 * gets established. */
void clusterLinkConnectHandler(connection *conn) {
    clusterLink *link = connGetPrivateData(conn);
    clusterNode *node = link->node;

    /* Check if connection succeeded */
    if (connGetState(conn) != CONN_STATE_CONNECTED) {
        serverLog(LL_VERBOSE, "Connection with Node %.40s at %s:%d failed: %s",
                node->name, node->ip, node->cport,
                connGetLastError(conn));
        freeClusterLink(link);
        return;
    }

    /* Register a read handler from now on */
    connSetReadHandler(conn, clusterReadHandler);

    /* Queue a PING in the new connection ASAP: this is crucial
     * to avoid false positives in failure detection.
     *
     * If the node is flagged as MEET, we send a MEET message instead
     * of a PING one, to force the receiver to add us to its node
     * table. */
    mstime_t old_ping_sent = node->ping_sent;
    clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ?
            CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING);
    if (old_ping_sent) {
        /* If there was an active ping before the link was
         * disconnected, we want to restore the ping time, which would
         * otherwise be replaced by the clusterSendPing() call. */
        node->ping_sent = old_ping_sent;
    }
    /* We can clear the flag after the first packet is sent.
     * If we never receive a PONG, we'll never send new packets
     * to this node. Instead, after the PONG is received and we
     * are no longer in meet/handshake status, we want to send
     * normal PING packets. */
    node->flags &= ~CLUSTER_NODE_MEET;

    serverLog(LL_DEBUG,"Connecting with Node %.40s at %s:%d",
            node->name, node->ip, node->cport);
}

/* Read data. Try to read the first field of the header first to check the
 * full length of the packet. When a whole packet is in memory this function
 * will call the function that processes the packet, and so forth. */
void clusterReadHandler(connection *conn) {
    clusterMsg buf[1];
    ssize_t nread;
    clusterMsg *hdr;
    clusterLink *link = connGetPrivateData(conn);
    unsigned int readlen, rcvbuflen;

    while(1) { /* Read as long as there is data to read. */
        rcvbuflen = link->rcvbuf_len;
        if (rcvbuflen < 8) {
            /* First, obtain the first 8 bytes to get the full message
             * length. */
            readlen = 8 - rcvbuflen;
        } else {
            /* Finally read the full message. */
            hdr = (clusterMsg*) link->rcvbuf;
            if (rcvbuflen == 8) {
                /* Perform some sanity check on the message signature
                 * and length. */
                if (memcmp(hdr->sig,"RCmb",4) != 0 ||
                    ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN)
                {
                    serverLog(LL_WARNING,
                        "Bad message length or signature received "
                        "from Cluster bus.");
                    handleLinkIOError(link);
                    return;
                }
            }
            readlen = ntohl(hdr->totlen) - rcvbuflen;
            if (readlen > sizeof(buf)) readlen = sizeof(buf);
        }

        nread = connRead(conn,buf,readlen);
        if (nread == -1 && (connGetState(conn) == CONN_STATE_CONNECTED)) return; /* No more data ready. */

        if (nread <= 0) {
            /* I/O error... */
            serverLog(LL_DEBUG,"I/O error reading from node link: %s",
                (nread == 0) ? "connection closed" : connGetLastError(conn));
            handleLinkIOError(link);
            return;
        } else {
            /* Read data and recast the pointer to the new buffer. */
            size_t unused = link->rcvbuf_alloc - link->rcvbuf_len;
            if ((size_t)nread > unused) {
                size_t required = link->rcvbuf_len + nread;
                /* If less than 1mb, grow to twice the required size; if
                 * larger, grow by 1mb. */
                link->rcvbuf_alloc = required < RCVBUF_MAX_PREALLOC ? required * 2 : required + RCVBUF_MAX_PREALLOC;
                link->rcvbuf = zrealloc(link->rcvbuf, link->rcvbuf_alloc);
            }
            memcpy(link->rcvbuf + link->rcvbuf_len, buf, nread);
            link->rcvbuf_len += nread;
            hdr = (clusterMsg*) link->rcvbuf;
            rcvbuflen += nread;
        }

        /* Total length obtained? Process this packet. */
        if (rcvbuflen >= 8 && rcvbuflen == ntohl(hdr->totlen)) {
            if (clusterProcessPacket(link)) {
                if (link->rcvbuf_alloc > RCVBUF_INIT_LEN) {
                    zfree(link->rcvbuf);
                    link->rcvbuf = zmalloc(link->rcvbuf_alloc = RCVBUF_INIT_LEN);
                }
                link->rcvbuf_len = 0;
            } else {
                return; /* Link no longer valid. */
            }
        }
    }
}
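
/* Worked example (illustrative) of the receive buffer growth policy above,
 * assuming RCVBUF_MAX_PREALLOC is 1mb: if 600kb are already buffered and a
 * read needs 700kb total, the buffer grows to 700kb * 2 == 1400kb (doubling
 * while under the threshold); if 2mb were needed instead, it would grow to
 * 2mb + 1mb == 3mb (linear steps once past the threshold). After a packet
 * is processed, any buffer larger than RCVBUF_INIT_LEN is shrunk back, so
 * an occasional huge packet does not pin memory forever. */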

/* Put stuff into the send buffer.
 *
 * It is guaranteed that this function will never have as a side effect
 * the link to be invalidated, so it is safe to call this function
 * from event handlers that will do stuff with the same link later. */
void clusterSendMessage(clusterLink *link, unsigned char *msg, size_t msglen) {
    if (sdslen(link->sndbuf) == 0 && msglen != 0)
        connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1);

    link->sndbuf = sdscatlen(link->sndbuf, msg, msglen);

    /* Populate sent messages stats. */
    clusterMsg *hdr = (clusterMsg*) msg;
    uint16_t type = ntohs(hdr->type);
    if (type < CLUSTERMSG_TYPE_COUNT)
        server.cluster->stats_bus_messages_sent[type]++;
}

/* Send a message to all the nodes that are part of the cluster having
 * a connected link.
 *
 * It is guaranteed that this function will never have as a side effect
 * some node->link to be invalidated, so it is safe to call this function
 * from event handlers that will do stuff with node links later. */
void clusterBroadcastMessage(void *buf, size_t len) {
    dictIterator *di;
    dictEntry *de;

    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);

        if (!node->link) continue;
        if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE))
            continue;
        clusterSendMessage(node->link,buf,len);
    }
    dictReleaseIterator(di);
}

/* Build the message header. hdr must point to a buffer at least
 * sizeof(clusterMsg) in bytes. */
void clusterBuildMessageHdr(clusterMsg *hdr, int type) {
    int totlen = 0;
    uint64_t offset;
    clusterNode *master;

    /* If this node is a master, we send its slots bitmap and configEpoch.
     * If this node is a slave we send the master's information instead (the
     * node is flagged as slave so the receiver knows that it is NOT really
     * in charge of these slots). */
    master = (nodeIsSlave(myself) && myself->slaveof) ?
              myself->slaveof : myself;

    memset(hdr,0,sizeof(*hdr));
    hdr->ver = htons(CLUSTER_PROTO_VER);
    hdr->sig[0] = 'R';
    hdr->sig[1] = 'C';
    hdr->sig[2] = 'm';
    hdr->sig[3] = 'b';
    hdr->type = htons(type);
    memcpy(hdr->sender,myself->name,CLUSTER_NAMELEN);

    /* If cluster-announce-ip option is enabled, force the receivers of our
     * packets to use the specified address for this node. Otherwise if the
     * first byte is zero, they'll do auto discovery. */
    memset(hdr->myip,0,NET_IP_STR_LEN);
    if (server.cluster_announce_ip) {
        strncpy(hdr->myip,server.cluster_announce_ip,NET_IP_STR_LEN-1);
        hdr->myip[NET_IP_STR_LEN-1] = '\0';
    }

    /* Handle cluster-announce-[tls-|bus-]port. */
    int announced_port, announced_pport, announced_cport;
    deriveAnnouncedPorts(&announced_port, &announced_pport, &announced_cport);

    memcpy(hdr->myslots,master->slots,sizeof(hdr->myslots));
    memset(hdr->slaveof,0,CLUSTER_NAMELEN);
    if (myself->slaveof != NULL)
        memcpy(hdr->slaveof,myself->slaveof->name, CLUSTER_NAMELEN);
    hdr->port = htons(announced_port);
    hdr->pport = htons(announced_pport);
    hdr->cport = htons(announced_cport);
    hdr->flags = htons(myself->flags);
    hdr->state = server.cluster->state;

    /* Set the currentEpoch and configEpochs. */
    hdr->currentEpoch = htonu64(server.cluster->currentEpoch);
    hdr->configEpoch = htonu64(master->configEpoch);

    /* Set the replication offset. */
    if (nodeIsSlave(myself))
        offset = replicationGetSlaveOffset();
    else
        offset = server.master_repl_offset;
    hdr->offset = htonu64(offset);

    /* Set the message flags. */
    if (nodeIsMaster(myself) && server.cluster->mf_end)
        hdr->mflags[0] |= CLUSTERMSG_FLAG0_PAUSED;

    /* Compute the message length for certain messages. For other messages
     * this is up to the caller. */
    if (type == CLUSTERMSG_TYPE_FAIL) {
        totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
        totlen += sizeof(clusterMsgDataFail);
    } else if (type == CLUSTERMSG_TYPE_UPDATE) {
        totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
        totlen += sizeof(clusterMsgDataUpdate);
    }
    hdr->totlen = htonl(totlen);
    /* For PING, PONG, MEET and other variable length messages fixing the
     * totlen field is up to the caller. */
}
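
/* Usage sketch (illustrative, modeled on the fixed-size message types): a
 * sender of a FAIL message can rely on clusterBuildMessageHdr() having set
 * totlen, so it only needs to fill in the payload and broadcast:
 *
 *     clusterMsg buf[1];
 *     clusterMsg *hdr = (clusterMsg*) buf;
 *
 *     clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAIL);
 *     memcpy(hdr->data.fail.about.nodename,nodename,CLUSTER_NAMELEN);
 *     clusterBroadcastMessage(buf,ntohl(hdr->totlen));
 *
 * Variable length senders such as clusterSendPing() below must instead fix
 * up hdr->totlen themselves once the final payload size is known. */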

/* Set the i-th entry of the gossip section in the message pointed by 'hdr'
 * to the info of the specified node 'n'. */
void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) {
    clusterMsgDataGossip *gossip;
    gossip = &(hdr->data.ping.gossip[i]);
    memcpy(gossip->nodename,n->name,CLUSTER_NAMELEN);
    gossip->ping_sent = htonl(n->ping_sent/1000);
    gossip->pong_received = htonl(n->pong_received/1000);
    memcpy(gossip->ip,n->ip,sizeof(n->ip));
    gossip->port = htons(n->port);
    gossip->cport = htons(n->cport);
    gossip->flags = htons(n->flags);
    gossip->pport = htons(n->pport);
    gossip->notused1 = 0;
}
2872
2873/* Send a PING or PONG packet to the specified node, making sure to add enough
2874 * gossip information. */
2875void clusterSendPing(clusterLink *link, int type) {
2876 static unsigned long long cluster_pings_sent = 0;
2877 cluster_pings_sent++;
2878 unsigned char *buf;
2879 clusterMsg *hdr;
2880 int gossipcount = 0; /* Number of gossip sections added so far. */
2881 int wanted; /* Number of gossip sections we want to append if possible. */
2882 int estlen; /* Upper bound on estimated packet length */
2883 /* freshnodes is the max number of nodes we can hope to append at all:
2884 * nodes available minus two (ourself and the node we are sending the
2885 * message to). However practically there may be less valid nodes since
2886 * nodes in handshake state, disconnected, are not considered. */
2887 int freshnodes = dictSize(server.cluster->nodes)-2;
2888
2889 /* How many gossip sections we want to add? 1/10 of the number of nodes
2890 * and anyway at least 3. Why 1/10?
2891 *
2892 * If we have N masters, with N/10 entries, and we consider that in
2893 * node_timeout we exchange with each other node at least 4 packets
2894 * (we ping in the worst case in node_timeout/2 time, and we also
2895 * receive two pings from the host), we have a total of 8 packets
2896 * in the node_timeout*2 failure reports validity time. So we have
2897 * that, for a single PFAIL node, we can expect to receive the following
2898 * number of failure reports (in the specified window of time):
2899 *
2900 * PROB * GOSSIP_ENTRIES_PER_PACKET * TOTAL_PACKETS:
2901 *
2902 * PROB = probability of being featured in a single gossip entry,
2903 * which is 1 / NUM_OF_NODES.
2904 * ENTRIES = 10.
2905 * TOTAL_PACKETS = 2 * 4 * NUM_OF_MASTERS.
2906 *
2907 * If we assume we have just masters (so num of nodes and num of masters
2908 * is the same), with 1/10 we always get over the majority, and specifically
2909 * 80% of the number of nodes, to account for many masters failing at the
2910 * same time.
2911 *
2912 * Since we have non-voting slaves that lower the probability of an entry
2913 * to feature our node, we set the number of entries per packet as
2914 * 10% of the total nodes we have. */
2915 wanted = floor(dictSize(server.cluster->nodes)/10);
2916 if (wanted < 3) wanted = 3;
2917 if (wanted > freshnodes) wanted = freshnodes;
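    /* For example (hypothetical sizes): with 60 known nodes wanted is
     * floor(60/10) = 6 gossip entries per packet; a 12 node cluster gives
     * floor(12/10) = 1, clamped up to the minimum of 3; and with 4 nodes
     * wanted = 3 is clamped down to freshnodes = 2. */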
2918
    /* Include all the nodes in PFAIL state, so that failure reports
     * propagate faster, letting nodes go from PFAIL to FAIL state sooner. */
2921 int pfail_wanted = server.cluster->stats_pfail_nodes;
2922
2923 /* Compute the maximum estlen to allocate our buffer. We'll fix the estlen
2924 * later according to the number of gossip sections we really were able
2925 * to put inside the packet. */
2926 estlen = sizeof(clusterMsg) - sizeof(union clusterMsgData);
2927 estlen += (sizeof(clusterMsgDataGossip)*(wanted + pfail_wanted));
2928 estlen += getHostnamePingExtSize();
2929
    /* Note: clusterBuildMessageHdr() expects the buffer to always be at
     * least sizeof(clusterMsg) bytes. */
2932 if (estlen < (int)sizeof(clusterMsg)) estlen = sizeof(clusterMsg);
2933 buf = zcalloc(estlen);
2934 hdr = (clusterMsg*) buf;
2935
2936 /* Populate the header. */
2937 if (!link->inbound && type == CLUSTERMSG_TYPE_PING)
2938 link->node->ping_sent = mstime();
2939 clusterBuildMessageHdr(hdr,type);
2940
2941 /* Populate the gossip fields */
2942 int maxiterations = wanted*3;
2943 while(freshnodes > 0 && gossipcount < wanted && maxiterations--) {
2944 dictEntry *de = dictGetRandomKey(server.cluster->nodes);
2945 clusterNode *this = dictGetVal(de);
2946
2947 /* Don't include this node: the whole packet header is about us
2948 * already, so we just gossip about other nodes. */
2949 if (this == myself) continue;
2950
2951 /* PFAIL nodes will be added later. */
2952 if (this->flags & CLUSTER_NODE_PFAIL) continue;
2953
        /* In the gossip section don't include:
         * 1) Nodes in HANDSHAKE state.
         * 2) Nodes with the NOADDR flag set.
         * 3) Disconnected nodes if they don't have configured slots.
         */
2959 if (this->flags & (CLUSTER_NODE_HANDSHAKE|CLUSTER_NODE_NOADDR) ||
2960 (this->link == NULL && this->numslots == 0))
2961 {
2962 freshnodes--; /* Technically not correct, but saves CPU. */
2963 continue;
2964 }
2965
2966 /* Do not add a node we already have. */
2967 if (this->last_in_ping_gossip == cluster_pings_sent) continue;
2968
2969 /* Add it */
2970 clusterSetGossipEntry(hdr,gossipcount,this);
2971 this->last_in_ping_gossip = cluster_pings_sent;
2972 freshnodes--;
2973 gossipcount++;
2974 }
2975
2976 /* If there are PFAIL nodes, add them at the end. */
2977 if (pfail_wanted) {
2978 dictIterator *di;
2979 dictEntry *de;
2980
2981 di = dictGetSafeIterator(server.cluster->nodes);
2982 while((de = dictNext(di)) != NULL && pfail_wanted > 0) {
2983 clusterNode *node = dictGetVal(de);
2984 if (node->flags & CLUSTER_NODE_HANDSHAKE) continue;
2985 if (node->flags & CLUSTER_NODE_NOADDR) continue;
2986 if (!(node->flags & CLUSTER_NODE_PFAIL)) continue;
2987 clusterSetGossipEntry(hdr,gossipcount,node);
2988 gossipcount++;
            /* We count down the entries we allocated room for, since the
             * PFAIL stats may not match perfectly with the current number
             * of PFAIL nodes. */
2992 pfail_wanted--;
2993 }
2994 dictReleaseIterator(di);
2995 }
2996
2998 int totlen = 0;
2999 int extensions = 0;
3000 /* Set the initial extension position */
3001 clusterMsgPingExt *cursor = getInitialPingExt(hdr, gossipcount);
3002 /* Add in the extensions */
3003 if (sdslen(myself->hostname) != 0) {
3004 hdr->mflags[0] |= CLUSTERMSG_FLAG0_EXT_DATA;
3005 totlen += writeHostnamePingExt(&cursor);
3006 extensions++;
3007 }
3008
3009 /* Compute the actual total length and send! */
3010 totlen += sizeof(clusterMsg)-sizeof(union clusterMsgData);
3011 totlen += (sizeof(clusterMsgDataGossip)*gossipcount);
3012 hdr->count = htons(gossipcount);
3013 hdr->extensions = htons(extensions);
3014 hdr->totlen = htonl(totlen);
3015 clusterSendMessage(link,buf,totlen);
3016 zfree(buf);
3017}
3018
3019/* Send a PONG packet to every connected node that's not in handshake state
3020 * and for which we have a valid link.
3021 *
3022 * In Redis Cluster pongs are not used just for failure detection, but also
3023 * to carry important configuration information. So broadcasting a pong is
3024 * useful when something changes in the configuration and we want to make
3025 * the cluster aware ASAP (for instance after a slave promotion).
3026 *
3027 * The 'target' argument specifies the receiving instances using the
3028 * defines below:
3029 *
3030 * CLUSTER_BROADCAST_ALL -> All known instances.
3031 * CLUSTER_BROADCAST_LOCAL_SLAVES -> All slaves in my master-slaves ring.
3032 */
3033#define CLUSTER_BROADCAST_ALL 0
3034#define CLUSTER_BROADCAST_LOCAL_SLAVES 1
3035void clusterBroadcastPong(int target) {
3036 dictIterator *di;
3037 dictEntry *de;
3038
3039 di = dictGetSafeIterator(server.cluster->nodes);
3040 while((de = dictNext(di)) != NULL) {
3041 clusterNode *node = dictGetVal(de);
3042
3043 if (!node->link) continue;
3044 if (node == myself || nodeInHandshake(node)) continue;
3045 if (target == CLUSTER_BROADCAST_LOCAL_SLAVES) {
3046 int local_slave =
3047 nodeIsSlave(node) && node->slaveof &&
3048 (node->slaveof == myself || node->slaveof == myself->slaveof);
3049 if (!local_slave) continue;
3050 }
3051 clusterSendPing(node->link,CLUSTERMSG_TYPE_PONG);
3052 }
3053 dictReleaseIterator(di);
3054}
3055
3056/* Send a PUBLISH message.
3057 *
 * If link is NULL, then the message is broadcast to the whole cluster.
 *
 * Sanitizer suppression: In clusterMsgDataPublish, sizeof(bulk_data) is 8.
 * As the whole struct is used as a buffer, when more than 8 bytes are copied
 * into 'bulk_data', the sanitizer generates an out-of-bounds error which is
 * a false positive in this context. */
3064REDIS_NO_SANITIZE("bounds")
3065void clusterSendPublish(clusterLink *link, robj *channel, robj *message, uint16_t type) {
3066 unsigned char *payload;
3067 clusterMsg buf[1];
3068 clusterMsg *hdr = (clusterMsg*) buf;
3069 uint32_t totlen;
3070 uint32_t channel_len, message_len;
3071
3072 channel = getDecodedObject(channel);
3073 message = getDecodedObject(message);
3074 channel_len = sdslen(channel->ptr);
3075 message_len = sdslen(message->ptr);
3076
3077 clusterBuildMessageHdr(hdr,type);
3078 totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
3079 totlen += sizeof(clusterMsgDataPublish) - 8 + channel_len + message_len;
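    /* For example (hypothetical payload): for channel "ch" and message
     * "hi", channel_len and message_len are both 2, so the 8 byte
     * 'bulk_data' placeholder is replaced by 4 bytes of actual payload. */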
3080
3081 hdr->data.publish.msg.channel_len = htonl(channel_len);
3082 hdr->data.publish.msg.message_len = htonl(message_len);
3083 hdr->totlen = htonl(totlen);
3084
3085 /* Try to use the local buffer if possible */
3086 if (totlen < sizeof(buf)) {
3087 payload = (unsigned char*)buf;
3088 } else {
3089 payload = zmalloc(totlen);
3090 memcpy(payload,hdr,sizeof(*hdr));
3091 hdr = (clusterMsg*) payload;
3092 }
3093 memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,sdslen(channel->ptr));
3094 memcpy(hdr->data.publish.msg.bulk_data+sdslen(channel->ptr),
3095 message->ptr,sdslen(message->ptr));
3096
3097 if (link)
3098 clusterSendMessage(link,payload,totlen);
3099 else
3100 clusterBroadcastMessage(payload,totlen);
3101
3102 decrRefCount(channel);
3103 decrRefCount(message);
3104 if (payload != (unsigned char*)buf) zfree(payload);
3105}
3106
3107/* Send a FAIL message to all the nodes we are able to contact.
3108 * The FAIL message is sent when we detect that a node is failing
3109 * (CLUSTER_NODE_PFAIL) and we also receive a gossip confirmation of this:
3110 * we switch the node state to CLUSTER_NODE_FAIL and ask all the other
3111 * nodes to do the same ASAP. */
3112void clusterSendFail(char *nodename) {
3113 clusterMsg buf[1];
3114 clusterMsg *hdr = (clusterMsg*) buf;
3115
3116 clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAIL);
3117 memcpy(hdr->data.fail.about.nodename,nodename,CLUSTER_NAMELEN);
3118 clusterBroadcastMessage(buf,ntohl(hdr->totlen));
3119}
3120
3121/* Send an UPDATE message to the specified link carrying the specified 'node'
3122 * slots configuration. The node name, slots bitmap, and configEpoch info
3123 * are included. */
3124void clusterSendUpdate(clusterLink *link, clusterNode *node) {
3125 clusterMsg buf[1];
3126 clusterMsg *hdr = (clusterMsg*) buf;
3127
3128 if (link == NULL) return;
3129 clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_UPDATE);
3130 memcpy(hdr->data.update.nodecfg.nodename,node->name,CLUSTER_NAMELEN);
3131 hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch);
3132 memcpy(hdr->data.update.nodecfg.slots,node->slots,sizeof(node->slots));
3133 clusterSendMessage(link,(unsigned char*)buf,ntohl(hdr->totlen));
3134}
3135
3136/* Send a MODULE message.
3137 *
 * If link is NULL, then the message is broadcast to the whole cluster. */
3139void clusterSendModule(clusterLink *link, uint64_t module_id, uint8_t type,
3140 const char *payload, uint32_t len) {
3141 unsigned char *heapbuf;
3142 clusterMsg buf[1];
3143 clusterMsg *hdr = (clusterMsg*) buf;
3144 uint32_t totlen;
3145
3146 clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_MODULE);
3147 totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
3148 totlen += sizeof(clusterMsgModule) - 3 + len;
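    /* Note: analogously to the PUBLISH message above, the 3 subtracted here
     * is assumed to be sizeof(bulk_data) in clusterMsgModule, the
     * placeholder array that the actual module payload overwrites. */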
3149
3150 hdr->data.module.msg.module_id = module_id; /* Already endian adjusted. */
3151 hdr->data.module.msg.type = type;
3152 hdr->data.module.msg.len = htonl(len);
3153 hdr->totlen = htonl(totlen);
3154
3155 /* Try to use the local buffer if possible */
3156 if (totlen < sizeof(buf)) {
3157 heapbuf = (unsigned char*)buf;
3158 } else {
3159 heapbuf = zmalloc(totlen);
3160 memcpy(heapbuf,hdr,sizeof(*hdr));
3161 hdr = (clusterMsg*) heapbuf;
3162 }
3163 memcpy(hdr->data.module.msg.bulk_data,payload,len);
3164
3165 if (link)
3166 clusterSendMessage(link,heapbuf,totlen);
3167 else
3168 clusterBroadcastMessage(heapbuf,totlen);
3169
3170 if (heapbuf != (unsigned char*)buf) zfree(heapbuf);
3171}
3172
/* This function gets a cluster node ID string as target, the same way node
 * addresses are represented on the modules side, resolves the node, and sends
 * the message. If the target is NULL the message is broadcast.
3176 *
3177 * The function returns C_OK if the target is valid, otherwise C_ERR is
3178 * returned. */
3179int clusterSendModuleMessageToTarget(const char *target, uint64_t module_id, uint8_t type, const char *payload, uint32_t len) {
3180 clusterNode *node = NULL;
3181
3182 if (target != NULL) {
3183 node = clusterLookupNode(target, strlen(target));
3184 if (node == NULL || node->link == NULL) return C_ERR;
3185 }
3186
3187 clusterSendModule(target ? node->link : NULL,
3188 module_id, type, payload, len);
3189 return C_OK;
3190}
3191
3192/* -----------------------------------------------------------------------------
3193 * CLUSTER Pub/Sub support
3194 *
 * If `sharded` is 0:
 * For now we do very little, just propagating [S]PUBLISH messages across the
 * whole cluster. In the future we'll try to get smarter and avoid propagating
 * those messages to hosts without receivers for a given channel.
 * Otherwise:
 * Publish this message across the shard (primary/replicas serving the slot).
3201 * -------------------------------------------------------------------------- */
3202void clusterPropagatePublish(robj *channel, robj *message, int sharded) {
3203 if (!sharded) {
3204 clusterSendPublish(NULL, channel, message, CLUSTERMSG_TYPE_PUBLISH);
3205 return;
3206 }
3207
3208 list *nodes_for_slot = clusterGetNodesServingMySlots(server.cluster->myself);
3209 if (listLength(nodes_for_slot) != 0) {
3210 listIter li;
3211 listNode *ln;
3212 listRewind(nodes_for_slot, &li);
3213 while((ln = listNext(&li))) {
3214 clusterNode *node = listNodeValue(ln);
3215 if (node != myself) {
3216 clusterSendPublish(node->link, channel, message, CLUSTERMSG_TYPE_PUBLISHSHARD);
3217 }
3218 }
3219 }
3220 listRelease(nodes_for_slot);
3221}
3222
3223/* -----------------------------------------------------------------------------
3224 * SLAVE node specific functions
3225 * -------------------------------------------------------------------------- */
3226
/* This function sends a FAILOVER_AUTH_REQUEST message to every node in order
 * to see if there is quorum for this slave instance to failover its failing
 * master.
3230 *
3231 * Note that we send the failover request to everybody, master and slave nodes,
3232 * but only the masters are supposed to reply to our query. */
3233void clusterRequestFailoverAuth(void) {
3234 clusterMsg buf[1];
3235 clusterMsg *hdr = (clusterMsg*) buf;
3236 uint32_t totlen;
3237
3238 clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST);
    /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit
     * in the header to communicate to the nodes receiving the message that
     * they should authorize the failover even if the master is working. */
3242 if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK;
3243 totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
3244 hdr->totlen = htonl(totlen);
3245 clusterBroadcastMessage(buf,totlen);
3246}
3247
3248/* Send a FAILOVER_AUTH_ACK message to the specified node. */
3249void clusterSendFailoverAuth(clusterNode *node) {
3250 clusterMsg buf[1];
3251 clusterMsg *hdr = (clusterMsg*) buf;
3252 uint32_t totlen;
3253
3254 if (!node->link) return;
3255 clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK);
3256 totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
3257 hdr->totlen = htonl(totlen);
3258 clusterSendMessage(node->link,(unsigned char*)buf,totlen);
3259}
3260
3261/* Send a MFSTART message to the specified node. */
3262void clusterSendMFStart(clusterNode *node) {
3263 clusterMsg buf[1];
3264 clusterMsg *hdr = (clusterMsg*) buf;
3265 uint32_t totlen;
3266
3267 if (!node->link) return;
3268 clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_MFSTART);
3269 totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
3270 hdr->totlen = htonl(totlen);
3271 clusterSendMessage(node->link,(unsigned char*)buf,totlen);
3272}
3273
/* Vote for the node asking for our vote, if the conditions are met. */
3275void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
3276 clusterNode *master = node->slaveof;
3277 uint64_t requestCurrentEpoch = ntohu64(request->currentEpoch);
3278 uint64_t requestConfigEpoch = ntohu64(request->configEpoch);
3279 unsigned char *claimed_slots = request->myslots;
3280 int force_ack = request->mflags[0] & CLUSTERMSG_FLAG0_FORCEACK;
3281 int j;
3282
    /* If we are not a master serving at least 1 slot, we don't have the
     * right to vote, as the cluster size in Redis Cluster is the number
     * of masters serving at least one slot, and the quorum is half the
     * cluster size plus one. */
3287 if (nodeIsSlave(myself) || myself->numslots == 0) return;
3288
3289 /* Request epoch must be >= our currentEpoch.
3290 * Note that it is impossible for it to actually be greater since
3291 * our currentEpoch was updated as a side effect of receiving this
3292 * request, if the request epoch was greater. */
3293 if (requestCurrentEpoch < server.cluster->currentEpoch) {
3294 serverLog(LL_WARNING,
3295 "Failover auth denied to %.40s: reqEpoch (%llu) < curEpoch(%llu)",
3296 node->name,
3297 (unsigned long long) requestCurrentEpoch,
3298 (unsigned long long) server.cluster->currentEpoch);
3299 return;
3300 }
3301
    /* Have we already voted for this epoch? Return ASAP. */
3303 if (server.cluster->lastVoteEpoch == server.cluster->currentEpoch) {
3304 serverLog(LL_WARNING,
3305 "Failover auth denied to %.40s: already voted for epoch %llu",
3306 node->name,
3307 (unsigned long long) server.cluster->currentEpoch);
3308 return;
3309 }
3310
    /* The node must be a slave and its master must be down.
     * The master can be in a non-failing state if the request is flagged
     * with CLUSTERMSG_FLAG0_FORCEACK (manual failover). */
3314 if (nodeIsMaster(node) || master == NULL ||
3315 (!nodeFailed(master) && !force_ack))
3316 {
3317 if (nodeIsMaster(node)) {
3318 serverLog(LL_WARNING,
3319 "Failover auth denied to %.40s: it is a master node",
3320 node->name);
3321 } else if (master == NULL) {
3322 serverLog(LL_WARNING,
3323 "Failover auth denied to %.40s: I don't know its master",
3324 node->name);
3325 } else if (!nodeFailed(master)) {
3326 serverLog(LL_WARNING,
3327 "Failover auth denied to %.40s: its master is up",
3328 node->name);
3329 }
3330 return;
3331 }
3332
    /* We must not have voted for a slave of this master within two times
     * the node timeout. This is not strictly needed for correctness of the
     * algorithm, but makes the base case more linear. */
3336 if (mstime() - node->slaveof->voted_time < server.cluster_node_timeout * 2)
3337 {
3338 serverLog(LL_WARNING,
3339 "Failover auth denied to %.40s: "
3340 "can't vote about this master before %lld milliseconds",
3341 node->name,
3342 (long long) ((server.cluster_node_timeout*2)-
3343 (mstime() - node->slaveof->voted_time)));
3344 return;
3345 }
3346
3347 /* The slave requesting the vote must have a configEpoch for the claimed
3348 * slots that is >= the one of the masters currently serving the same
3349 * slots in the current configuration. */
3350 for (j = 0; j < CLUSTER_SLOTS; j++) {
3351 if (bitmapTestBit(claimed_slots, j) == 0) continue;
3352 if (server.cluster->slots[j] == NULL ||
3353 server.cluster->slots[j]->configEpoch <= requestConfigEpoch)
3354 {
3355 continue;
3356 }
        /* If we reached this point we found a slot that in our current
         * configuration is served by a master with a greater configEpoch
         * than the one claimed by the slave requesting our vote. Refuse
         * to vote for this slave. */
3360 serverLog(LL_WARNING,
3361 "Failover auth denied to %.40s: "
3362 "slot %d epoch (%llu) > reqEpoch (%llu)",
3363 node->name, j,
3364 (unsigned long long) server.cluster->slots[j]->configEpoch,
3365 (unsigned long long) requestConfigEpoch);
3366 return;
3367 }
3368
3369 /* We can vote for this slave. */
3370 server.cluster->lastVoteEpoch = server.cluster->currentEpoch;
3371 node->slaveof->voted_time = mstime();
3372 clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_FSYNC_CONFIG);
3373 clusterSendFailoverAuth(node);
3374 serverLog(LL_WARNING, "Failover auth granted to %.40s for epoch %llu",
3375 node->name, (unsigned long long) server.cluster->currentEpoch);
3376}
3377
3378/* This function returns the "rank" of this instance, a slave, in the context
3379 * of its master-slaves ring. The rank of the slave is given by the number of
3380 * other slaves for the same master that have a better replication offset
 * compared to the local one (better means greater, so they claim more data).
 *
 * A slave with rank 0 is the one with the greatest (most up to date)
 * replication offset, and so forth. Note that because of how the rank is
 * computed, multiple slaves may have the same rank, in case they have the
 * same offset.
3386 *
3387 * The slave rank is used to add a delay to start an election in order to
3388 * get voted and replace a failing master. Slaves with better replication
3389 * offsets are more likely to win. */
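/* For example (hypothetical offsets): if three slaves of the same master
 * have replication offsets 2000, 1500 and 1500, the first has rank 0 and
 * the other two both have rank 1, so the latter would start their elections
 * with roughly one extra second of delay. */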
3390int clusterGetSlaveRank(void) {
3391 long long myoffset;
3392 int j, rank = 0;
3393 clusterNode *master;
3394
3395 serverAssert(nodeIsSlave(myself));
3396 master = myself->slaveof;
3397 if (master == NULL) return 0; /* Never called by slaves without master. */
3398
3399 myoffset = replicationGetSlaveOffset();
3400 for (j = 0; j < master->numslaves; j++)
3401 if (master->slaves[j] != myself &&
3402 !nodeCantFailover(master->slaves[j]) &&
3403 master->slaves[j]->repl_offset > myoffset) rank++;
3404 return rank;
3405}
3406
/* This function is called by clusterHandleSlaveFailover() in order to
 * let the slave log why it is not able to failover. Sometimes the
 * conditions are not met, but since the failover function is called
 * again and again, we can't log the same things continuously.
3411 *
3412 * This function works by logging only if a given set of conditions are
3413 * true:
3414 *
 * 1) The reason for which the failover can't be initiated changed.
 *    The reasons also include a NONE reason, which the state is reset
 *    to when the slave finds that its master is fine (no FAIL flag).
3418 * 2) Also, the log is emitted again if the master is still down and
3419 * the reason for not failing over is still the same, but more than
3420 * CLUSTER_CANT_FAILOVER_RELOG_PERIOD seconds elapsed.
3421 * 3) Finally, the function only logs if the slave is down for more than
3422 * five seconds + NODE_TIMEOUT. This way nothing is logged when a
3423 * failover starts in a reasonable time.
3424 *
3425 * The function is called with the reason why the slave can't failover
3426 * which is one of the integer macros CLUSTER_CANT_FAILOVER_*.
3427 *
3428 * The function is guaranteed to be called only if 'myself' is a slave. */
3429void clusterLogCantFailover(int reason) {
3430 char *msg;
3431 static time_t lastlog_time = 0;
3432 mstime_t nolog_fail_time = server.cluster_node_timeout + 5000;
3433
3434 /* Don't log if we have the same reason for some time. */
3435 if (reason == server.cluster->cant_failover_reason &&
3436 time(NULL)-lastlog_time < CLUSTER_CANT_FAILOVER_RELOG_PERIOD)
3437 return;
3438
3439 server.cluster->cant_failover_reason = reason;
3440
    /* We also don't emit any log if the master failed only a short time
     * ago: the goal of this function is to log slaves in a stalled
     * condition for a long time. */
3444 if (myself->slaveof &&
3445 nodeFailed(myself->slaveof) &&
3446 (mstime() - myself->slaveof->fail_time) < nolog_fail_time) return;
3447
3448 switch(reason) {
3449 case CLUSTER_CANT_FAILOVER_DATA_AGE:
3450 msg = "Disconnected from master for longer than allowed. "
3451 "Please check the 'cluster-replica-validity-factor' configuration "
3452 "option.";
3453 break;
3454 case CLUSTER_CANT_FAILOVER_WAITING_DELAY:
3455 msg = "Waiting the delay before I can start a new failover.";
3456 break;
3457 case CLUSTER_CANT_FAILOVER_EXPIRED:
3458 msg = "Failover attempt expired.";
3459 break;
3460 case CLUSTER_CANT_FAILOVER_WAITING_VOTES:
3461 msg = "Waiting for votes, but majority still not reached.";
3462 break;
3463 default:
3464 msg = "Unknown reason code.";
3465 break;
3466 }
3467 lastlog_time = time(NULL);
3468 serverLog(LL_WARNING,"Currently unable to failover: %s", msg);
3469}
3470
3471/* This function implements the final part of automatic and manual failovers,
3472 * where the slave grabs its master's hash slots, and propagates the new
3473 * configuration.
3474 *
3475 * Note that it's up to the caller to be sure that the node got a new
3476 * configuration epoch already. */
3477void clusterFailoverReplaceYourMaster(void) {
3478 int j;
3479 clusterNode *oldmaster = myself->slaveof;
3480
3481 if (nodeIsMaster(myself) || oldmaster == NULL) return;
3482
3483 /* 1) Turn this node into a master. */
3484 clusterSetNodeAsMaster(myself);
3485 replicationUnsetMaster();
3486
3487 /* 2) Claim all the slots assigned to our master. */
3488 for (j = 0; j < CLUSTER_SLOTS; j++) {
3489 if (clusterNodeGetSlotBit(oldmaster,j)) {
3490 clusterDelSlot(j);
3491 clusterAddSlot(myself,j);
3492 }
3493 }
3494
3495 /* 3) Update state and save config. */
3496 clusterUpdateState();
3497 clusterSaveConfigOrDie(1);
3498
3499 /* 4) Pong all the other nodes so that they can update the state
3500 * accordingly and detect that we switched to master role. */
3501 clusterBroadcastPong(CLUSTER_BROADCAST_ALL);
3502
3503 /* 5) If there was a manual failover in progress, clear the state. */
3504 resetManualFailover();
3505}
3506
3507/* This function is called if we are a slave node and our master serving
3508 * a non-zero amount of hash slots is in FAIL state.
3509 *
 * The goal of this function is:
 * 1) To check if we are able to perform a failover (is our data fresh
 *    enough?).
 * 2) To try to get elected by the masters.
 * 3) To perform the failover, informing all the other nodes.
3514 */
3515void clusterHandleSlaveFailover(void) {
3516 mstime_t data_age;
3517 mstime_t auth_age = mstime() - server.cluster->failover_auth_time;
3518 int needed_quorum = (server.cluster->size / 2) + 1;
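    /* For example, with server.cluster->size == 5 (five masters serving
     * slots), needed_quorum is 5/2+1 = 3 votes. */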
3519 int manual_failover = server.cluster->mf_end != 0 &&
3520 server.cluster->mf_can_start;
3521 mstime_t auth_timeout, auth_retry_time;
3522
3523 server.cluster->todo_before_sleep &= ~CLUSTER_TODO_HANDLE_FAILOVER;
3524
3525 /* Compute the failover timeout (the max time we have to send votes
3526 * and wait for replies), and the failover retry time (the time to wait
3527 * before trying to get voted again).
3528 *
3529 * Timeout is MAX(NODE_TIMEOUT*2,2000) milliseconds.
3530 * Retry is two times the Timeout.
3531 */
3532 auth_timeout = server.cluster_node_timeout*2;
3533 if (auth_timeout < 2000) auth_timeout = 2000;
3534 auth_retry_time = auth_timeout*2;
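    /* For example (hypothetical config): with cluster-node-timeout set to
     * 15000 ms, auth_timeout is 30000 ms and auth_retry_time 60000 ms,
     * while a tiny 500 ms node timeout is clamped to the 2000 ms floor
     * (retry time 4000 ms). */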
3535
    /* Preconditions to run the function, that must be met both in case
     * of an automatic and a manual failover:
     * 1) We are a slave.
     * 2) Our master is flagged as FAIL, or this is a manual failover.
     * 3) The no-failover configuration is not set, or this is a manual
     *    failover (the option only disables automatic failovers).
     * 4) Our master is serving slots. */
3543 if (nodeIsMaster(myself) ||
3544 myself->slaveof == NULL ||
3545 (!nodeFailed(myself->slaveof) && !manual_failover) ||
3546 (server.cluster_slave_no_failover && !manual_failover) ||
3547 myself->slaveof->numslots == 0)
3548 {
3549 /* There are no reasons to failover, so we set the reason why we
3550 * are returning without failing over to NONE. */
3551 server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
3552 return;
3553 }
3554
3555 /* Set data_age to the number of milliseconds we are disconnected from
3556 * the master. */
3557 if (server.repl_state == REPL_STATE_CONNECTED) {
3558 data_age = (mstime_t)(server.unixtime - server.master->lastinteraction)
3559 * 1000;
3560 } else {
3561 data_age = (mstime_t)(server.unixtime - server.repl_down_since) * 1000;
3562 }
3563
    /* Remove the node timeout from the data age, since it is expected that
     * we are disconnected from our master at least for the time it took to
     * flag it as FAIL: that's the baseline. */
3567 if (data_age > server.cluster_node_timeout)
3568 data_age -= server.cluster_node_timeout;
3569
3570 /* Check if our data is recent enough according to the slave validity
3571 * factor configured by the user.
3572 *
3573 * Check bypassed for manual failovers. */
3574 if (server.cluster_slave_validity_factor &&
3575 data_age >
3576 (((mstime_t)server.repl_ping_slave_period * 1000) +
3577 (server.cluster_node_timeout * server.cluster_slave_validity_factor)))
3578 {
3579 if (!manual_failover) {
3580 clusterLogCantFailover(CLUSTER_CANT_FAILOVER_DATA_AGE);
3581 return;
3582 }
3583 }
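    /* As an illustration, with hypothetical settings of
     * repl-ping-replica-period = 10 seconds, cluster-node-timeout = 15000 ms
     * and a validity factor of 10, an automatic failover is refused once
     * data_age exceeds 10*1000 + 15000*10 = 160000 ms. */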
3584
    /* If the previous failover attempt timed out and the retry time has
     * elapsed, we can set up a new one. */
3587 if (auth_age > auth_retry_time) {
3588 server.cluster->failover_auth_time = mstime() +
3589 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
3590 random() % 500; /* Random delay between 0 and 500 milliseconds. */
3591 server.cluster->failover_auth_count = 0;
3592 server.cluster->failover_auth_sent = 0;
3593 server.cluster->failover_auth_rank = clusterGetSlaveRank();
        /* We add another delay that is proportional to the slave rank.
         * Specifically 1 second * rank. This way slaves with a probably
         * less updated replication offset are penalized. */
3597 server.cluster->failover_auth_time +=
3598 server.cluster->failover_auth_rank * 1000;
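        /* For example (hypothetical): a slave with rank 2 waits the fixed
         * 500 ms, plus a random 0-500 ms, plus 2000 ms of rank delay:
         * between 2500 and 3000 ms in total. */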
3599 /* However if this is a manual failover, no delay is needed. */
3600 if (server.cluster->mf_end) {
3601 server.cluster->failover_auth_time = mstime();
3602 server.cluster->failover_auth_rank = 0;
3603 clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
3604 }
3605 serverLog(LL_WARNING,
3606 "Start of election delayed for %lld milliseconds "
3607 "(rank #%d, offset %lld).",
3608 server.cluster->failover_auth_time - mstime(),
3609 server.cluster->failover_auth_rank,
3610 replicationGetSlaveOffset());
        /* Now that we have a scheduled election, broadcast our offset
         * to all the other slaves so that they'll update their offsets
         * if our offset is better. */
3614 clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_SLAVES);
3615 return;
3616 }
3617
3618 /* It is possible that we received more updated offsets from other
3619 * slaves for the same master since we computed our election delay.
3620 * Update the delay if our rank changed.
3621 *
3622 * Not performed if this is a manual failover. */
3623 if (server.cluster->failover_auth_sent == 0 &&
3624 server.cluster->mf_end == 0)
3625 {
3626 int newrank = clusterGetSlaveRank();
3627 if (newrank > server.cluster->failover_auth_rank) {
3628 long long added_delay =
3629 (newrank - server.cluster->failover_auth_rank) * 1000;
3630 server.cluster->failover_auth_time += added_delay;
3631 server.cluster->failover_auth_rank = newrank;
3632 serverLog(LL_WARNING,
3633 "Replica rank updated to #%d, added %lld milliseconds of delay.",
3634 newrank, added_delay);
3635 }
3636 }
3637
    /* Return ASAP if we still can't start the election. */
3639 if (mstime() < server.cluster->failover_auth_time) {
3640 clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_DELAY);
3641 return;
3642 }
3643
3644 /* Return ASAP if the election is too old to be valid. */
3645 if (auth_age > auth_timeout) {
3646 clusterLogCantFailover(CLUSTER_CANT_FAILOVER_EXPIRED);
3647 return;
3648 }
3649
3650 /* Ask for votes if needed. */
3651 if (server.cluster->failover_auth_sent == 0) {
3652 server.cluster->currentEpoch++;
3653 server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
3654 serverLog(LL_WARNING,"Starting a failover election for epoch %llu.",
3655 (unsigned long long) server.cluster->currentEpoch);
3656 clusterRequestFailoverAuth();
3657 server.cluster->failover_auth_sent = 1;
3658 clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
3659 CLUSTER_TODO_UPDATE_STATE|
3660 CLUSTER_TODO_FSYNC_CONFIG);
3661 return; /* Wait for replies. */
3662 }
3663
3664 /* Check if we reached the quorum. */
3665 if (server.cluster->failover_auth_count >= needed_quorum) {
3666 /* We have the quorum, we can finally failover the master. */
3667
3668 serverLog(LL_WARNING,
3669 "Failover election won: I'm the new master.");
3670
3671 /* Update my configEpoch to the epoch of the election. */
3672 if (myself->configEpoch < server.cluster->failover_auth_epoch) {
3673 myself->configEpoch = server.cluster->failover_auth_epoch;
3674 serverLog(LL_WARNING,
3675 "configEpoch set to %llu after successful failover",
3676 (unsigned long long) myself->configEpoch);
3677 }
3678
3679 /* Take responsibility for the cluster slots. */
3680 clusterFailoverReplaceYourMaster();
3681 } else {
3682 clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_VOTES);
3683 }
3684}
3685
3686/* -----------------------------------------------------------------------------
3687 * CLUSTER slave migration
3688 *
 * Slave migration is the process that allows a slave of a master that is
 * already covered by at least one other slave, to "migrate" to a master
 * that is orphaned, that is, left with no working slaves.
3692 * ------------------------------------------------------------------------- */
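
/* For example: if master A has two working slaves while master B, which
 * used to have slaves (MIGRATE_TO flag), is left with none, the slave of A
 * with the smallest node ID will eventually re-configure itself as a slave
 * of B, provided that doing so still leaves A with more than
 * 'cluster-migration-barrier' working slaves. */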
3693
/* This function is responsible for deciding if this replica should be migrated
3695 * to a different (orphaned) master. It is called by the clusterCron() function
3696 * only if:
3697 *
3698 * 1) We are a slave node.
3699 * 2) It was detected that there is at least one orphaned master in
3700 * the cluster.
3701 * 3) We are a slave of one of the masters with the greatest number of
3702 * slaves.
3703 *
 * These checks are performed by the caller, since it needs to iterate over
 * the nodes anyway; this way we only spend time inside
 * clusterHandleSlaveMigration() when definitely needed.
3707 *
3708 * The function is called with a pre-computed max_slaves, that is the max
3709 * number of working (not in FAIL state) slaves for a single master.
3710 *
3711 * Additional conditions for migration are examined inside the function.
3712 */
3713void clusterHandleSlaveMigration(int max_slaves) {
3714 int j, okslaves = 0;
3715 clusterNode *mymaster = myself->slaveof, *target = NULL, *candidate = NULL;
3716 dictIterator *di;
3717 dictEntry *de;
3718
3719 /* Step 1: Don't migrate if the cluster state is not ok. */
3720 if (server.cluster->state != CLUSTER_OK) return;
3721
3722 /* Step 2: Don't migrate if my master will not be left with at least
3723 * 'migration-barrier' slaves after my migration. */
3724 if (mymaster == NULL) return;
3725 for (j = 0; j < mymaster->numslaves; j++)
3726 if (!nodeFailed(mymaster->slaves[j]) &&
3727 !nodeTimedOut(mymaster->slaves[j])) okslaves++;
3728 if (okslaves <= server.cluster_migration_barrier) return;
3729
    /* Step 3: Identify a candidate for migration, and check if, among the
     * slaves of the masters with the greatest number of ok slaves, I'm the
     * one with the smallest node ID (the "candidate slave").
3733 *
3734 * Note: this means that eventually a replica migration will occur
3735 * since slaves that are reachable again always have their FAIL flag
3736 * cleared, so eventually there must be a candidate.
3737 * There is a possible race condition causing multiple
3738 * slaves to migrate at the same time, but this is unlikely to
3739 * happen and relatively harmless when it does. */
3740 candidate = myself;
3741 di = dictGetSafeIterator(server.cluster->nodes);
3742 while((de = dictNext(di)) != NULL) {
3743 clusterNode *node = dictGetVal(de);
3744 int okslaves = 0, is_orphaned = 1;
3745
        /* We want to migrate only if this master is working, orphaned, and
         * used to have slaves, or if it failed over a master that had slaves
         * (MIGRATE_TO flag). This way we only migrate to instances that were
         * supposed to have replicas. */
3750 if (nodeIsSlave(node) || nodeFailed(node)) is_orphaned = 0;
3751 if (!(node->flags & CLUSTER_NODE_MIGRATE_TO)) is_orphaned = 0;
3752
3753 /* Check number of working slaves. */
3754 if (nodeIsMaster(node)) okslaves = clusterCountNonFailingSlaves(node);
3755 if (okslaves > 0) is_orphaned = 0;
3756
3757 if (is_orphaned) {
3758 if (!target && node->numslots > 0) target = node;
3759
3760 /* Track the starting time of the orphaned condition for this
3761 * master. */
3762 if (!node->orphaned_time) node->orphaned_time = mstime();
3763 } else {
3764 node->orphaned_time = 0;
3765 }
3766
3767 /* Check if I'm the slave candidate for the migration: attached
3768 * to a master with the maximum number of slaves and with the smallest
3769 * node ID. */
3770 if (okslaves == max_slaves) {
3771 for (j = 0; j < node->numslaves; j++) {
3772 if (memcmp(node->slaves[j]->name,
3773 candidate->name,
3774 CLUSTER_NAMELEN) < 0)
3775 {
3776 candidate = node->slaves[j];
3777 }
3778 }
3779 }
3780 }
3781 dictReleaseIterator(di);
3782
3783 /* Step 4: perform the migration if there is a target, and if I'm the
3784 * candidate, but only if the master is continuously orphaned for a
3785 * couple of seconds, so that during failovers, we give some time to
3786 * the natural slaves of this instance to advertise their switch from
3787 * the old master to the new one. */
3788 if (target && candidate == myself &&
3789 (mstime()-target->orphaned_time) > CLUSTER_SLAVE_MIGRATION_DELAY &&
3790 !(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER))
3791 {
3792 serverLog(LL_WARNING,"Migrating to orphaned master %.40s",
3793 target->name);
3794 clusterSetMaster(target);
3795 }
3796}
3797
3798/* -----------------------------------------------------------------------------
3799 * CLUSTER manual failover
3800 *
 * These are the important steps performed by slaves during a manual failover:
 * 1) The user sends the CLUSTER FAILOVER command to a slave. The failover
 *    state is initialized setting mf_end to the millisecond unix time at
 *    which we'll abort the attempt.
 * 2) The slave sends a MFSTART message to the master requesting to pause
 *    clients for two times the manual failover timeout CLUSTER_MF_TIMEOUT.
 *    When the master is paused for manual failover, it also starts to flag
 *    packets with CLUSTERMSG_FLAG0_PAUSED.
 * 3) The slave waits for the master to send its replication offset flagged
 *    as PAUSED.
 * 4) If the slave receives the offset from the master, and its own offset
 *    matches, mf_can_start is set to 1, and clusterHandleSlaveFailover()
 *    will perform the failover as usual, with the difference that the vote
 *    request will be modified to force masters to vote for a slave that has
 *    a working master.
3815 *
 * From the point of view of the master things are simpler: when an
 * MFSTART packet is received, the master sets mf_end as well and stores
 * the sender in mf_slave. During the time limit for the manual failover
3819 * the master will just send PINGs more often to this slave, flagged with
3820 * the PAUSED flag, so that the slave will set mf_master_offset when receiving
3821 * a packet from the master with this flag set.
3822 *
3823 * The goal of the manual failover is to perform a fast failover without
3824 * data loss due to the asynchronous master-slave replication.
3825 * -------------------------------------------------------------------------- */
3826
3827/* Reset the manual failover state. This works for both masters and slaves
3828 * as all the state about manual failover is cleared.
3829 *
 * The function can be used both to initialize the manual failover state at
 * startup and to abort a manual failover in progress. */
3832void resetManualFailover(void) {
3833 if (server.cluster->mf_slave) {
        /* We were a master with a manual failover in progress from one of
         * our slaves, so we paused clients. Regardless of the outcome we
         * unpause now to allow traffic again. */
3836 unpauseClients(PAUSE_DURING_FAILOVER);
3837 }
3838 server.cluster->mf_end = 0; /* No manual failover in progress. */
3839 server.cluster->mf_can_start = 0;
3840 server.cluster->mf_slave = NULL;
3841 server.cluster->mf_master_offset = -1;
3842}
3843
3844/* If a manual failover timed out, abort it. */
3845void manualFailoverCheckTimeout(void) {
3846 if (server.cluster->mf_end && server.cluster->mf_end < mstime()) {
3847 serverLog(LL_WARNING,"Manual failover timed out.");
3848 resetManualFailover();
3849 }
3850}
3851
/* This function is called from the cluster cron function in order to go
 * forward with the manual failover state machine. */
3854void clusterHandleManualFailover(void) {
3855 /* Return ASAP if no manual failover is in progress. */
3856 if (server.cluster->mf_end == 0) return;
3857
3858 /* If mf_can_start is non-zero, the failover was already triggered so the
3859 * next steps are performed by clusterHandleSlaveFailover(). */
3860 if (server.cluster->mf_can_start) return;
3861
3862 if (server.cluster->mf_master_offset == -1) return; /* Wait for offset... */
3863
3864 if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) {
3865 /* Our replication offset matches the master replication offset
3866 * announced after clients were paused. We can start the failover. */
3867 server.cluster->mf_can_start = 1;
3868 serverLog(LL_WARNING,
3869 "All master replication stream processed, "
3870 "manual failover can start.");
3871 clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
3872 return;
3873 }
3874 clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER);
3875}
3876
3877/* -----------------------------------------------------------------------------
3878 * CLUSTER cron job
3879 * -------------------------------------------------------------------------- */
3880
/* Check if the node is disconnected and re-establish the connection.
 * Also update a few stats while we are here, that can be used to make
 * better decisions in other parts of the code. */
3884static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_timeout, mstime_t now) {
3885 /* Not interested in reconnecting the link with myself or nodes
3886 * for which we have no address. */
3887 if (node->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR)) return 1;
3888
3889 if (node->flags & CLUSTER_NODE_PFAIL)
3890 server.cluster->stats_pfail_nodes++;
3891
3892 /* A Node in HANDSHAKE state has a limited lifespan equal to the
3893 * configured node timeout. */
3894 if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) {
3895 clusterDelNode(node);
3896 return 1;
3897 }
3898
3899 if (node->link == NULL) {
3900 clusterLink *link = createClusterLink(node);
3901 link->conn = server.tls_cluster ? connCreateTLS() : connCreateSocket();
3902 connSetPrivateData(link->conn, link);
3903 if (connConnect(link->conn, node->ip, node->cport, server.bind_source_addr,
3904 clusterLinkConnectHandler) == -1) {
3905 /* We got a synchronous error from connect before
3906 * clusterSendPing() had a chance to be called.
3907 * If node->ping_sent is zero, failure detection can't work,
3908 * so we claim we actually sent a ping now (that will
3909 * be really sent as soon as the link is obtained). */
3910 if (node->ping_sent == 0) node->ping_sent = mstime();
3911 serverLog(LL_DEBUG, "Unable to connect to "
3912 "Cluster Node [%s]:%d -> %s", node->ip,
3913 node->cport, server.neterr);
3914
3915 freeClusterLink(link);
3916 return 0;
3917 }
3918 }
3919 return 0;
3920}
3921
3922static void resizeClusterLinkBuffer(clusterLink *link) {
3923 /* If unused space is a lot bigger than the used portion of the buffer then free up unused space.
3924 * We use a factor of 4 because of the greediness of sdsMakeRoomFor (used by sdscatlen). */
3925 if (link != NULL && sdsavail(link->sndbuf) / 4 > sdslen(link->sndbuf)) {
3926 link->sndbuf = sdsRemoveFreeSpace(link->sndbuf);
3927 }
3928}
3929
3930/* Resize the send buffer of a node if it is wasting
3931 * enough space. */
3932static void clusterNodeCronResizeBuffers(clusterNode *node) {
3933 resizeClusterLinkBuffer(node->link);
3934 resizeClusterLinkBuffer(node->inbound_link);
3935}
3936
3937static void freeClusterLinkOnBufferLimitReached(clusterLink *link) {
3938 if (link == NULL || server.cluster_link_sendbuf_limit_bytes == 0) {
3939 return;
3940 }
3941 unsigned long long mem_link = sdsalloc(link->sndbuf);
3942 if (mem_link > server.cluster_link_sendbuf_limit_bytes) {
3943 serverLog(LL_WARNING, "Freeing cluster link(%s node %.40s, used memory: %llu) due to "
3944 "exceeding send buffer memory limit.", link->inbound ? "from" : "to",
3945 link->node ? link->node->name : "", mem_link);
3946 freeClusterLink(link);
3947 server.cluster->stat_cluster_links_buffer_limit_exceeded++;
3948 }
3949}
3950
3951/* Free outbound link to a node if its send buffer size exceeded limit. */
3952static void clusterNodeCronFreeLinkOnBufferLimitReached(clusterNode *node) {
3953 freeClusterLinkOnBufferLimitReached(node->link);
3954 freeClusterLinkOnBufferLimitReached(node->inbound_link);
3955}
3956
3957static size_t getClusterLinkMemUsage(clusterLink *link) {
3958 if (link != NULL) {
3959 return sizeof(clusterLink) + sdsalloc(link->sndbuf) + link->rcvbuf_alloc;
3960 } else {
3961 return 0;
3962 }
3963}
3964
3965/* Update memory usage statistics of all current cluster links */
3966static void clusterNodeCronUpdateClusterLinksMemUsage(clusterNode *node) {
3967 server.stat_cluster_links_memory += getClusterLinkMemUsage(node->link);
3968 server.stat_cluster_links_memory += getClusterLinkMemUsage(node->inbound_link);
3969}
3970
3971/* This is executed 10 times every second */
3972void clusterCron(void) {
3973 dictIterator *di;
3974 dictEntry *de;
3975 int update_state = 0;
3976 int orphaned_masters; /* How many masters there are without ok slaves. */
3977 int max_slaves; /* Max number of ok slaves for a single master. */
3978 int this_slaves; /* Number of ok slaves for our master (if we are slave). */
3979 mstime_t min_pong = 0, now = mstime();
3980 clusterNode *min_pong_node = NULL;
3981 static unsigned long long iteration = 0;
3982 mstime_t handshake_timeout;
3983
3984 iteration++; /* Number of times this function was called so far. */
3985
3986 clusterUpdateMyselfHostname();
3987
    /* The handshake timeout is the time after which a handshake node that was
     * not turned into a normal node is removed from the nodes table. Usually
     * it is just the NODE_TIMEOUT value, but when NODE_TIMEOUT is too small
     * we use the value of 1 second. */
3992 handshake_timeout = server.cluster_node_timeout;
3993 if (handshake_timeout < 1000) handshake_timeout = 1000;
3994
3995 /* Clear so clusterNodeCronHandleReconnect can count the number of nodes in PFAIL. */
3996 server.cluster->stats_pfail_nodes = 0;
3997 /* Clear so clusterNodeCronUpdateClusterLinksMemUsage can count the current memory usage of all cluster links. */
3998 server.stat_cluster_links_memory = 0;
3999 /* Run through some of the operations we want to do on each cluster node. */
4000 di = dictGetSafeIterator(server.cluster->nodes);
4001 while((de = dictNext(di)) != NULL) {
4002 clusterNode *node = dictGetVal(de);
4003 /* The sequence goes:
4004 * 1. We try to shrink link buffers if possible.
4005 * 2. We free the links whose buffers are still oversized after possible shrinking.
4006 * 3. We update the latest memory usage of cluster links.
4007 * 4. We immediately attempt reconnecting after freeing links.
4008 */
4009 clusterNodeCronResizeBuffers(node);
4010 clusterNodeCronFreeLinkOnBufferLimitReached(node);
4011 clusterNodeCronUpdateClusterLinksMemUsage(node);
        /* The protocol is that the function(s) below return non-zero if the
         * node must not be processed further (for instance because it was
         * deleted), in which case we continue with the next node. */
4015 if(clusterNodeCronHandleReconnect(node, handshake_timeout, now)) continue;
4016 }
4017 dictReleaseIterator(di);
4018
    /* Ping some random node once every 10 iterations, so that we usually ping
     * one random node every second. */
4021 if (!(iteration % 10)) {
4022 int j;
4023
4024 /* Check a few random nodes and ping the one with the oldest
4025 * pong_received time. */
4026 for (j = 0; j < 5; j++) {
4027 de = dictGetRandomKey(server.cluster->nodes);
4028 clusterNode *this = dictGetVal(de);
4029
4030 /* Don't ping nodes disconnected or with a ping currently active. */
4031 if (this->link == NULL || this->ping_sent != 0) continue;
4032 if (this->flags & (CLUSTER_NODE_MYSELF|CLUSTER_NODE_HANDSHAKE))
4033 continue;
4034 if (min_pong_node == NULL || min_pong > this->pong_received) {
4035 min_pong_node = this;
4036 min_pong = this->pong_received;
4037 }
4038 }
4039 if (min_pong_node) {
4040 serverLog(LL_DEBUG,"Pinging node %.40s", min_pong_node->name);
4041 clusterSendPing(min_pong_node->link, CLUSTERMSG_TYPE_PING);
4042 }
4043 }
4044
    /* Iterate nodes to check if we need to flag something as failing.
     * This loop is also responsible for:
     * 1) Checking if there are orphaned masters (masters without non failing
     *    slaves).
     * 2) Counting the max number of non failing slaves for a single master.
     * 3) Counting the number of slaves for our master, if we are a slave. */
4051 orphaned_masters = 0;
4052 max_slaves = 0;
4053 this_slaves = 0;
4054 di = dictGetSafeIterator(server.cluster->nodes);
4055 while((de = dictNext(di)) != NULL) {
4056 clusterNode *node = dictGetVal(de);
4057 now = mstime(); /* Use an updated time at every iteration. */
4058
4059 if (node->flags &
4060 (CLUSTER_NODE_MYSELF|CLUSTER_NODE_NOADDR|CLUSTER_NODE_HANDSHAKE))
4061 continue;
4062
4063 /* Orphaned master check, useful only if the current instance
4064 * is a slave that may migrate to another master. */
4065 if (nodeIsSlave(myself) && nodeIsMaster(node) && !nodeFailed(node)) {
4066 int okslaves = clusterCountNonFailingSlaves(node);
4067
            /* A master is orphaned if it is serving a non-zero number of
             * slots, has no working slaves, but used to have at least one
             * slave, or failed over a master that used to have slaves. */
4071 if (okslaves == 0 && node->numslots > 0 &&
4072 node->flags & CLUSTER_NODE_MIGRATE_TO)
4073 {
4074 orphaned_masters++;
4075 }
4076 if (okslaves > max_slaves) max_slaves = okslaves;
4077 if (myself->slaveof == node)
4078 this_slaves = okslaves;
4079 }
4080
4081 /* If we are not receiving any data for more than half the cluster
4082 * timeout, reconnect the link: maybe there is a connection
4083 * issue even if the node is alive. */
4084 mstime_t ping_delay = now - node->ping_sent;
4085 mstime_t data_delay = now - node->data_received;
4086 if (node->link && /* is connected */
4087 now - node->link->ctime >
4088 server.cluster_node_timeout && /* was not already reconnected */
4089 node->ping_sent && /* we already sent a ping */
4090 /* and we are waiting for the pong more than timeout/2 */
4091 ping_delay > server.cluster_node_timeout/2 &&
4092 /* and in such interval we are not seeing any traffic at all. */
4093 data_delay > server.cluster_node_timeout/2)
4094 {
4095 /* Disconnect the link, it will be reconnected automatically. */
4096 freeClusterLink(node->link);
4097 }
4098
        /* If we have currently no active ping in this instance, and the
         * received PONG is older than half the cluster timeout, send
         * a new ping now, to ensure all the nodes are pinged without
         * too big a delay. */
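        /* For instance (hypothetical): with cluster-node-timeout = 15000 ms
         * this guarantees every node is pinged at least once every 7500 ms,
         * on top of the random pings above. */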
4103 if (node->link &&
4104 node->ping_sent == 0 &&
4105 (now - node->pong_received) > server.cluster_node_timeout/2)
4106 {
4107 clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
4108 continue;
4109 }
4110
4111 /* If we are a master and one of the slaves requested a manual
4112 * failover, ping it continuously. */
4113 if (server.cluster->mf_end &&
4114 nodeIsMaster(myself) &&
4115 server.cluster->mf_slave == node &&
4116 node->link)
4117 {
4118 clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
4119 continue;
4120 }
4121
4122 /* Check only if we have an active ping for this instance. */
4123 if (node->ping_sent == 0) continue;
4124
        /* Check if this node looks unreachable.
         * Note that if we already received the PONG, then node->ping_sent
         * is zero, so we can't reach this code at all, and we don't risk
         * checking for a PONG delay if we didn't send the PING.
4129 *
4130 * We also consider every incoming data as proof of liveness, since
4131 * our cluster bus link is also used for data: under heavy data
4132 * load pong delays are possible. */
4133 mstime_t node_delay = (ping_delay < data_delay) ? ping_delay :
4134 data_delay;
4135
4136 if (node_delay > server.cluster_node_timeout) {
4137 /* Timeout reached. Set the node as possibly failing if it is
4138 * not already in this state. */
4139 if (!(node->flags & (CLUSTER_NODE_PFAIL|CLUSTER_NODE_FAIL))) {
4140 serverLog(LL_DEBUG,"*** NODE %.40s possibly failing",
4141 node->name);
4142 node->flags |= CLUSTER_NODE_PFAIL;
4143 update_state = 1;
4144 }
4145 }
4146 }
4147 dictReleaseIterator(di);
4148
4149 /* If we are a slave node but the replication is still turned off,
4150 * enable it if we know the address of our master and it appears to
4151 * be up. */
4152 if (nodeIsSlave(myself) &&
4153 server.masterhost == NULL &&
4154 myself->slaveof &&
4155 nodeHasAddr(myself->slaveof))
4156 {
4157 replicationSetMaster(myself->slaveof->ip, myself->slaveof->port);
4158 }
4159
4160 /* Abort a manual failover if the timeout is reached. */
4161 manualFailoverCheckTimeout();
4162
4163 if (nodeIsSlave(myself)) {
4164 clusterHandleManualFailover();
4165 if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER))
4166 clusterHandleSlaveFailover();
        /* If there are orphaned masters, and we are a slave of one of the
         * masters with the max number of non-failing slaves, consider
         * migrating to the orphaned masters. Note that it does not make
         * sense to try a migration if there is no master with at least
         * *two* working slaves. */
4172 if (orphaned_masters && max_slaves >= 2 && this_slaves == max_slaves &&
4173 server.cluster_allow_replica_migration)
4174 clusterHandleSlaveMigration(max_slaves);
4175 }
4176
4177 if (update_state || server.cluster->state == CLUSTER_FAIL)
4178 clusterUpdateState();
4179}
4180
4181/* This function is called before the event handler returns to sleep for
4182 * events. It is useful to perform operations that must be done ASAP in
4183 * reaction to events fired but that are not safe to perform inside event
 * handlers, or to perform potentially expensive tasks that we need to do
4185 * a single time before replying to clients. */
4186void clusterBeforeSleep(void) {
4187 int flags = server.cluster->todo_before_sleep;
4188
4189 /* Reset our flags (not strictly needed since every single function
4190 * called for flags set should be able to clear its flag). */
4191 server.cluster->todo_before_sleep = 0;
4192
4193 if (flags & CLUSTER_TODO_HANDLE_MANUALFAILOVER) {
        /* Handle the manual failover as soon as possible, so that we don't
         * pay up to a 100 ms delay as we would if it were handled only in
         * clusterCron(). */
4196 if(nodeIsSlave(myself)) {
4197 clusterHandleManualFailover();
4198 if (!(server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_FAILOVER))
4199 clusterHandleSlaveFailover();
4200 }
4201 } else if (flags & CLUSTER_TODO_HANDLE_FAILOVER) {
        /* Handle failover: this is needed when it is likely that there is
         * already the quorum from masters, in order to react fast. */
4204 clusterHandleSlaveFailover();
4205 }
4206
4207 /* Update the cluster state. */
4208 if (flags & CLUSTER_TODO_UPDATE_STATE)
4209 clusterUpdateState();
4210
4211 /* Save the config, possibly using fsync. */
4212 if (flags & CLUSTER_TODO_SAVE_CONFIG) {
4213 int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG;
4214 clusterSaveConfigOrDie(fsync);
4215 }
4216}
4217
4218void clusterDoBeforeSleep(int flags) {
4219 server.cluster->todo_before_sleep |= flags;
4220}
4221
4222/* -----------------------------------------------------------------------------
4223 * Slots management
4224 * -------------------------------------------------------------------------- */
4225
4226/* Test bit 'pos' in a generic bitmap. Return 1 if the bit is set,
4227 * otherwise 0. */
4228int bitmapTestBit(unsigned char *bitmap, int pos) {
4229 off_t byte = pos/8;
4230 int bit = pos&7;
4231 return (bitmap[byte] & (1<<bit)) != 0;
4232}
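
/* For instance, slot 100 lives in byte 100/8 = 12 at bit 100&7 = 4, so
 * bitmapTestBit(bitmap,100) checks bit 4 of bitmap[12]. */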
4233
4234/* Set the bit at position 'pos' in a bitmap. */
4235void bitmapSetBit(unsigned char *bitmap, int pos) {
4236 off_t byte = pos/8;
4237 int bit = pos&7;
4238 bitmap[byte] |= 1<<bit;
4239}
4240
4241/* Clear the bit at position 'pos' in a bitmap. */
4242void bitmapClearBit(unsigned char *bitmap, int pos) {
4243 off_t byte = pos/8;
4244 int bit = pos&7;
4245 bitmap[byte] &= ~(1<<bit);
4246}
4247
4248/* Return non-zero if there is at least one master with slaves in the cluster.
 * Otherwise zero is returned. Used by clusterNodeSetSlotBit() to set the
 * MIGRATE_TO flag when a master gets its first slot. */
4251int clusterMastersHaveSlaves(void) {
4252 dictIterator *di = dictGetSafeIterator(server.cluster->nodes);
4253 dictEntry *de;
4254 int slaves = 0;
4255 while((de = dictNext(di)) != NULL) {
4256 clusterNode *node = dictGetVal(de);
4257
4258 if (nodeIsSlave(node)) continue;
4259 slaves += node->numslaves;
4260 }
4261 dictReleaseIterator(di);
4262 return slaves != 0;
4263}
4264
4265/* Set the slot bit and return the old value. */
4266int clusterNodeSetSlotBit(clusterNode *n, int slot) {
4267 int old = bitmapTestBit(n->slots,slot);
4268 bitmapSetBit(n->slots,slot);
4269 if (!old) {
4270 n->numslots++;
4271 /* When a master gets its first slot, even if it has no slaves,
4272 * it gets flagged with MIGRATE_TO, that is, the master is a valid
4273 * target for replicas migration, if and only if at least one of
4274 * the other masters has slaves right now.
4275 *
         * Normally masters are valid targets of replica migration if:
         * 1. They used to have slaves (but no longer have them).
         * 2. They failed over a master that used to have slaves.
         *
         * However new masters with slots assigned are considered valid
         * migration targets if the rest of the cluster is not entirely
         * slave-less.
4282 *
4283 * See https://github.com/redis/redis/issues/3043 for more info. */
4284 if (n->numslots == 1 && clusterMastersHaveSlaves())
4285 n->flags |= CLUSTER_NODE_MIGRATE_TO;
4286 }
4287 return old;
4288}
4289
4290/* Clear the slot bit and return the old value. */
4291int clusterNodeClearSlotBit(clusterNode *n, int slot) {
4292 int old = bitmapTestBit(n->slots,slot);
4293 bitmapClearBit(n->slots,slot);
4294 if (old) n->numslots--;
4295 return old;
4296}
4297
4298/* Return the slot bit from the cluster node structure. */
4299int clusterNodeGetSlotBit(clusterNode *n, int slot) {
4300 return bitmapTestBit(n->slots,slot);
4301}
4302
4303/* Add the specified slot to the list of slots that node 'n' will
4304 * serve. Return C_OK if the operation ended with success.
4305 * If the slot is already assigned to another instance this is considered
4306 * an error and C_ERR is returned. */
4307int clusterAddSlot(clusterNode *n, int slot) {
4308 if (server.cluster->slots[slot]) return C_ERR;
4309 clusterNodeSetSlotBit(n,slot);
4310 server.cluster->slots[slot] = n;
4311 return C_OK;
4312}
4313
4314/* Delete the specified slot marking it as unassigned.
4315 * Returns C_OK if the slot was assigned, otherwise if the slot was
4316 * already unassigned C_ERR is returned. */
4317int clusterDelSlot(int slot) {
4318 clusterNode *n = server.cluster->slots[slot];
4319
4320 if (!n) return C_ERR;
4321
4322 /* Cleanup the channels in master/replica as part of slot deletion. */
4323 list *nodes_for_slot = clusterGetNodesServingMySlots(n);
4324 listNode *ln = listSearchKey(nodes_for_slot, myself);
4325 if (ln != NULL) {
4326 removeChannelsInSlot(slot);
4327 }
4328 listRelease(nodes_for_slot);
4329 serverAssert(clusterNodeClearSlotBit(n,slot) == 1);
4330 server.cluster->slots[slot] = NULL;
4331 return C_OK;
4332}
4333
4334/* Delete all the slots associated with the specified node.
4335 * The number of deleted slots is returned. */
4336int clusterDelNodeSlots(clusterNode *node) {
4337 int deleted = 0, j;
4338
4339 for (j = 0; j < CLUSTER_SLOTS; j++) {
4340 if (clusterNodeGetSlotBit(node,j)) {
4341 clusterDelSlot(j);
4342 deleted++;
4343 }
4344 }
4345 return deleted;
4346}
4347
4348/* Clear the migrating / importing state for all the slots.
 * This is useful at initialization and when turning a master into a slave. */
4350void clusterCloseAllSlots(void) {
4351 memset(server.cluster->migrating_slots_to,0,
4352 sizeof(server.cluster->migrating_slots_to));
4353 memset(server.cluster->importing_slots_from,0,
4354 sizeof(server.cluster->importing_slots_from));
4355}
4356
4357/* -----------------------------------------------------------------------------
4358 * Cluster state evaluation function
4359 * -------------------------------------------------------------------------- */
4360
4361/* The following are defines that are only used in the evaluation function
 * and are based on heuristics. The main point about the rejoin and
 * writable delays is that they should be a few orders of magnitude larger
4364 * than the network latency. */
4365#define CLUSTER_MAX_REJOIN_DELAY 5000
4366#define CLUSTER_MIN_REJOIN_DELAY 500
4367#define CLUSTER_WRITABLE_DELAY 2000
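/* As a worked example: with the default cluster-node-timeout of 15000
 * milliseconds, the rejoin delay computed below in clusterUpdateState()
 * starts at 15000 and is then clamped into the [500,5000] range defined
 * above, so a previously partitioned master waits 5000 milliseconds before
 * switching back to the OK state. */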
4368
4369void clusterUpdateState(void) {
4370 int j, new_state;
4371 int reachable_masters = 0;
4372 static mstime_t among_minority_time;
4373 static mstime_t first_call_time = 0;
4374
4375 server.cluster->todo_before_sleep &= ~CLUSTER_TODO_UPDATE_STATE;
4376
4377 /* If this is a master node, wait some time before turning the state
4378 * into OK, since it is not a good idea to rejoin the cluster as a writable
4379 * master, after a reboot, without giving the cluster a chance to
4380 * reconfigure this node. Note that the delay is calculated starting from
4381 * the first call to this function and not since the server start, in order
4382 * to not count the DB loading time. */
4383 if (first_call_time == 0) first_call_time = mstime();
4384 if (nodeIsMaster(myself) &&
4385 server.cluster->state == CLUSTER_FAIL &&
4386 mstime() - first_call_time < CLUSTER_WRITABLE_DELAY) return;
4387
4388 /* Start assuming the state is OK. We'll turn it into FAIL if there
4389 * are the right conditions. */
4390 new_state = CLUSTER_OK;
4391
4392 /* Check if all the slots are covered. */
4393 if (server.cluster_require_full_coverage) {
4394 for (j = 0; j < CLUSTER_SLOTS; j++) {
4395 if (server.cluster->slots[j] == NULL ||
4396 server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL))
4397 {
4398 new_state = CLUSTER_FAIL;
4399 break;
4400 }
4401 }
4402 }
4403
4404 /* Compute the cluster size, that is the number of master nodes
4405 * serving at least a single slot.
4406 *
4407 * At the same time count the number of reachable masters having
4408 * at least one slot. */
4409 {
4410 dictIterator *di;
4411 dictEntry *de;
4412
4413 server.cluster->size = 0;
4414 di = dictGetSafeIterator(server.cluster->nodes);
4415 while((de = dictNext(di)) != NULL) {
4416 clusterNode *node = dictGetVal(de);
4417
4418 if (nodeIsMaster(node) && node->numslots) {
4419 server.cluster->size++;
4420 if ((node->flags & (CLUSTER_NODE_FAIL|CLUSTER_NODE_PFAIL)) == 0)
4421 reachable_masters++;
4422 }
4423 }
4424 dictReleaseIterator(di);
4425 }
4426
4427 /* If we are in a minority partition, change the cluster state
4428 * to FAIL. */
4429 {
4430 int needed_quorum = (server.cluster->size / 2) + 1;
4431
4432 if (reachable_masters < needed_quorum) {
4433 new_state = CLUSTER_FAIL;
4434 among_minority_time = mstime();
4435 }
4436 }
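    /* For example, with a cluster size of 5 the needed quorum is
     * (5/2)+1 == 3, so a node able to reach only two non-failing masters
     * considers itself on the minority side of the partition. */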
4437
    /* Apply and log a state change. */
4439 if (new_state != server.cluster->state) {
4440 mstime_t rejoin_delay = server.cluster_node_timeout;
4441
4442 /* If the instance is a master and was partitioned away with the
4443 * minority, don't let it accept queries for some time after the
4444 * partition heals, to make sure there is enough time to receive
4445 * a configuration update. */
4446 if (rejoin_delay > CLUSTER_MAX_REJOIN_DELAY)
4447 rejoin_delay = CLUSTER_MAX_REJOIN_DELAY;
4448 if (rejoin_delay < CLUSTER_MIN_REJOIN_DELAY)
4449 rejoin_delay = CLUSTER_MIN_REJOIN_DELAY;
4450
4451 if (new_state == CLUSTER_OK &&
4452 nodeIsMaster(myself) &&
4453 mstime() - among_minority_time < rejoin_delay)
4454 {
4455 return;
4456 }
4457
4458 /* Change the state and log the event. */
4459 serverLog(LL_WARNING,"Cluster state changed: %s",
4460 new_state == CLUSTER_OK ? "ok" : "fail");
4461 server.cluster->state = new_state;
4462 }
4463}
4464
4465/* This function is called after the node startup in order to verify that data
4466 * loaded from disk is in agreement with the cluster configuration:
4467 *
4468 * 1) If we find keys about hash slots we have no responsibility for, the
4469 * following happens:
4470 * A) If no other node is in charge according to the current cluster
4471 * configuration, we add these slots to our node.
 * B) If according to our config other nodes are already in charge of
 * these slots, we set the slots in IMPORTING state from our point of
 * view in order to justify why we have those slots, and to make
 * redis-cli aware of the issue, so that it can try to fix it.
 * 2) If we find data in a DB different from DB0 we return C_ERR to
4477 * signal the caller it should quit the server with an error message
4478 * or take other actions.
4479 *
 * The function returns C_OK even if it had to correct the errors
 * described in "1". However, if data is found in a DB different from
 * DB0, C_ERR is returned.
4483 *
4484 * The function also uses the logging facility in order to warn the user
4485 * about desynchronizations between the data we have in memory and the
4486 * cluster configuration. */
4487int verifyClusterConfigWithData(void) {
4488 int j;
4489 int update_config = 0;
4490
4491 /* Return ASAP if a module disabled cluster redirections. In that case
4492 * every master can store keys about every possible hash slot. */
4493 if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
4494 return C_OK;
4495
4496 /* If this node is a slave, don't perform the check at all as we
4497 * completely depend on the replication stream. */
4498 if (nodeIsSlave(myself)) return C_OK;
4499
4500 /* Make sure we only have keys in DB0. */
4501 for (j = 1; j < server.dbnum; j++) {
4502 if (dictSize(server.db[j].dict)) return C_ERR;
4503 }
4504
    /* Check that all the slots we see populated in memory have a
     * corresponding entry in the cluster table. Otherwise fix the table. */
4507 for (j = 0; j < CLUSTER_SLOTS; j++) {
4508 if (!countKeysInSlot(j)) continue; /* No keys in this slot. */
        /* Check if we are assigned to this slot or if we are importing it.
         * In both cases just move on to the next slot, as this
         * configuration makes sense. */
4512 if (server.cluster->slots[j] == myself ||
4513 server.cluster->importing_slots_from[j] != NULL) continue;
4514
        /* If we reach this point, the data and the cluster configuration
         * don't agree: slot 'j' is populated even though we are neither
         * importing it nor assigned to it. Fix this condition. */
4518
4519 update_config++;
4520 /* Case A: slot is unassigned. Take responsibility for it. */
4521 if (server.cluster->slots[j] == NULL) {
4522 serverLog(LL_WARNING, "I have keys for unassigned slot %d. "
4523 "Taking responsibility for it.",j);
4524 clusterAddSlot(myself,j);
4525 } else {
4526 serverLog(LL_WARNING, "I have keys for slot %d, but the slot is "
4527 "assigned to another node. "
4528 "Setting it to importing state.",j);
4529 server.cluster->importing_slots_from[j] = server.cluster->slots[j];
4530 }
4531 }
4532 if (update_config) clusterSaveConfigOrDie(1);
4533 return C_OK;
4534}
4535
4536/* -----------------------------------------------------------------------------
4537 * SLAVE nodes handling
4538 * -------------------------------------------------------------------------- */
4539
4540/* Set the specified node 'n' as master for this node.
4541 * If this node is currently a master, it is turned into a slave. */
4542void clusterSetMaster(clusterNode *n) {
4543 serverAssert(n != myself);
4544 serverAssert(myself->numslots == 0);
4545
4546 if (nodeIsMaster(myself)) {
4547 myself->flags &= ~(CLUSTER_NODE_MASTER|CLUSTER_NODE_MIGRATE_TO);
4548 myself->flags |= CLUSTER_NODE_SLAVE;
4549 clusterCloseAllSlots();
4550 } else {
4551 if (myself->slaveof)
4552 clusterNodeRemoveSlave(myself->slaveof,myself);
4553 }
4554 myself->slaveof = n;
4555 clusterNodeAddSlave(n,myself);
4556 replicationSetMaster(n->ip, n->port);
4557 resetManualFailover();
4558}
4559
4560/* -----------------------------------------------------------------------------
4561 * Nodes to string representation functions.
4562 * -------------------------------------------------------------------------- */
4563
4564struct redisNodeFlags {
4565 uint16_t flag;
4566 char *name;
4567};
4568
4569static struct redisNodeFlags redisNodeFlagsTable[] = {
4570 {CLUSTER_NODE_MYSELF, "myself,"},
4571 {CLUSTER_NODE_MASTER, "master,"},
4572 {CLUSTER_NODE_SLAVE, "slave,"},
4573 {CLUSTER_NODE_PFAIL, "fail?,"},
4574 {CLUSTER_NODE_FAIL, "fail,"},
4575 {CLUSTER_NODE_HANDSHAKE, "handshake,"},
4576 {CLUSTER_NODE_NOADDR, "noaddr,"},
4577 {CLUSTER_NODE_NOFAILOVER, "nofailover,"}
4578};
4579
4580/* Concatenate the comma separated list of node flags to the given SDS
4581 * string 'ci'. */
4582sds representClusterNodeFlags(sds ci, uint16_t flags) {
4583 size_t orig_len = sdslen(ci);
4584 int i, size = sizeof(redisNodeFlagsTable)/sizeof(struct redisNodeFlags);
4585 for (i = 0; i < size; i++) {
4586 struct redisNodeFlags *nodeflag = redisNodeFlagsTable + i;
4587 if (flags & nodeflag->flag) ci = sdscat(ci, nodeflag->name);
4588 }
4589 /* If no flag was added, add the "noflags" special flag. */
4590 if (sdslen(ci) == orig_len) ci = sdscat(ci,"noflags,");
4591 sdsIncrLen(ci,-1); /* Remove trailing comma. */
4592 return ci;
4593}
4594
4595/* Concatenate the slot ownership information to the given SDS string 'ci'.
 * If the slot ownership is in a contiguous block, it's represented as a
 * start-end pair, otherwise each slot is added separately. */
4598sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count) {
4599 for (int i = 0; i< slot_info_pairs_count; i+=2) {
4600 unsigned long start = slot_info_pairs[i];
4601 unsigned long end = slot_info_pairs[i+1];
4602 if (start == end) {
4603 ci = sdscatfmt(ci, " %i", start);
4604 } else {
4605 ci = sdscatfmt(ci, " %i-%i", start, end);
4606 }
4607 }
4608 return ci;
4609}
4610
4611/* Generate a csv-alike representation of the specified cluster node.
4612 * See clusterGenNodesDescription() top comment for more information.
4613 *
4614 * The function returns the string representation as an SDS string. */
4615sds clusterGenNodeDescription(clusterNode *node, int use_pport) {
4616 int j, start;
4617 sds ci;
4618 int port = use_pport && node->pport ? node->pport : node->port;
4619
4620 /* Node coordinates */
4621 ci = sdscatlen(sdsempty(),node->name,CLUSTER_NAMELEN);
4622 if (sdslen(node->hostname) != 0) {
4623 ci = sdscatfmt(ci," %s:%i@%i,%s ",
4624 node->ip,
4625 port,
4626 node->cport,
4627 node->hostname);
4628 } else {
4629 ci = sdscatfmt(ci," %s:%i@%i ",
4630 node->ip,
4631 port,
4632 node->cport);
4633 }
4634
4635 /* Flags */
4636 ci = representClusterNodeFlags(ci, node->flags);
4637
4638 /* Slave of... or just "-" */
4639 ci = sdscatlen(ci," ",1);
4640 if (node->slaveof)
4641 ci = sdscatlen(ci,node->slaveof->name,CLUSTER_NAMELEN);
4642 else
4643 ci = sdscatlen(ci,"-",1);
4644
4645 unsigned long long nodeEpoch = node->configEpoch;
4646 if (nodeIsSlave(node) && node->slaveof) {
4647 nodeEpoch = node->slaveof->configEpoch;
4648 }
4649 /* Latency from the POV of this node, config epoch, link status */
4650 ci = sdscatfmt(ci," %I %I %U %s",
4651 (long long) node->ping_sent,
4652 (long long) node->pong_received,
4653 nodeEpoch,
4654 (node->link || node->flags & CLUSTER_NODE_MYSELF) ?
4655 "connected" : "disconnected");
4656
    /* Slots served by this instance. If the slots info was already
     * computed, append it directly; otherwise generate it from the slots
     * bitmap, but only if the node serves at least one slot. */
4659 if (node->slot_info_pairs) {
4660 ci = representSlotInfo(ci, node->slot_info_pairs, node->slot_info_pairs_count);
4661 } else if (node->numslots > 0) {
4662 start = -1;
4663 for (j = 0; j < CLUSTER_SLOTS; j++) {
4664 int bit;
4665
4666 if ((bit = clusterNodeGetSlotBit(node,j)) != 0) {
4667 if (start == -1) start = j;
4668 }
4669 if (start != -1 && (!bit || j == CLUSTER_SLOTS-1)) {
4670 if (bit && j == CLUSTER_SLOTS-1) j++;
4671
4672 if (start == j-1) {
4673 ci = sdscatfmt(ci," %i",start);
4674 } else {
4675 ci = sdscatfmt(ci," %i-%i",start,j-1);
4676 }
4677 start = -1;
4678 }
4679 }
4680 }
4681
    /* For the MYSELF node only, we also dump info about slots that we are
     * migrating to other instances or importing from other instances. */
4685 if (node->flags & CLUSTER_NODE_MYSELF) {
4686 for (j = 0; j < CLUSTER_SLOTS; j++) {
4687 if (server.cluster->migrating_slots_to[j]) {
4688 ci = sdscatprintf(ci," [%d->-%.40s]",j,
4689 server.cluster->migrating_slots_to[j]->name);
4690 } else if (server.cluster->importing_slots_from[j]) {
4691 ci = sdscatprintf(ci," [%d-<-%.40s]",j,
4692 server.cluster->importing_slots_from[j]->name);
4693 }
4694 }
4695 }
4696 return ci;
4697}
4698
4699/* Generate the slot topology for all nodes and store the string representation
4700 * in the slots_info struct on the node. This is used to improve the efficiency
 * of clusterGenNodesDescription() because it avoids looping over the slot
 * space once per node to generate the slot info of each node. */
4703void clusterGenNodesSlotsInfo(int filter) {
4704 clusterNode *n = NULL;
4705 int start = -1;
4706
4707 for (int i = 0; i <= CLUSTER_SLOTS; i++) {
4708 /* Find start node and slot id. */
4709 if (n == NULL) {
4710 if (i == CLUSTER_SLOTS) break;
4711 n = server.cluster->slots[i];
4712 start = i;
4713 continue;
4714 }
4715
        /* Generate the slots info when we encounter a node different from
         * the owner of the range start, or when we reach the end of the
         * slot space. */
4718 if (i == CLUSTER_SLOTS || n != server.cluster->slots[i]) {
4719 if (!(n->flags & filter)) {
4720 if (!n->slot_info_pairs) {
4721 n->slot_info_pairs = zmalloc(2 * n->numslots * sizeof(uint16_t));
4722 }
4723 serverAssert((n->slot_info_pairs_count + 1) < (2 * n->numslots));
4724 n->slot_info_pairs[n->slot_info_pairs_count++] = start;
4725 n->slot_info_pairs[n->slot_info_pairs_count++] = i-1;
4726 }
4727 if (i == CLUSTER_SLOTS) break;
4728 n = server.cluster->slots[i];
4729 start = i;
4730 }
4731 }
4732}
4733
4734void clusterFreeNodesSlotsInfo(clusterNode *n) {
4735 zfree(n->slot_info_pairs);
4736 n->slot_info_pairs = NULL;
4737 n->slot_info_pairs_count = 0;
4738}
4739
4740/* Generate a csv-alike representation of the nodes we are aware of,
4741 * including the "myself" node, and return an SDS string containing the
4742 * representation (it is up to the caller to free it).
4743 *
4744 * All the nodes matching at least one of the node flags specified in
4745 * "filter" are excluded from the output, so using zero as a filter will
4746 * include all the known nodes in the representation, including nodes in
4747 * the HANDSHAKE state.
4748 *
4749 * Setting use_pport to 1 in a TLS cluster makes the result contain the
 * plaintext client port rather than the TLS client port of each node.
4751 *
 * The representation obtained using this function is used for the output
 * of the CLUSTER NODES command, and as the format of the cluster
 * configuration file (nodes.conf) for a given node. */
4755sds clusterGenNodesDescription(int filter, int use_pport) {
4756 sds ci = sdsempty(), ni;
4757 dictIterator *di;
4758 dictEntry *de;
4759
4760 /* Generate all nodes slots info firstly. */
4761 clusterGenNodesSlotsInfo(filter);
4762
4763 di = dictGetSafeIterator(server.cluster->nodes);
4764 while((de = dictNext(di)) != NULL) {
4765 clusterNode *node = dictGetVal(de);
4766
4767 if (node->flags & filter) continue;
4768 ni = clusterGenNodeDescription(node, use_pport);
4769 ci = sdscatsds(ci,ni);
4770 sdsfree(ni);
4771 ci = sdscatlen(ci,"\n",1);
4772
4773 /* Release slots info. */
4774 clusterFreeNodesSlotsInfo(node);
4775 }
4776 dictReleaseIterator(di);
4777 return ci;
4778}
4779
4780/* Add to the output buffer of the given client the description of the given cluster link.
4781 * The description is a map with each entry being an attribute of the link. */
4782void addReplyClusterLinkDescription(client *c, clusterLink *link) {
4783 addReplyMapLen(c, 6);
4784
4785 addReplyBulkCString(c, "direction");
4786 addReplyBulkCString(c, link->inbound ? "from" : "to");
4787
4788 /* addReplyClusterLinkDescription is only called for links that have been
4789 * associated with nodes. The association is always bi-directional, so
4790 * in addReplyClusterLinkDescription, link->node should never be NULL. */
4791 serverAssert(link->node);
4792 sds node_name = sdsnewlen(link->node->name, CLUSTER_NAMELEN);
4793 addReplyBulkCString(c, "node");
4794 addReplyBulkCString(c, node_name);
4795 sdsfree(node_name);
4796
4797 addReplyBulkCString(c, "create-time");
4798 addReplyLongLong(c, link->ctime);
4799
4800 char events[3], *p;
4801 p = events;
4802 if (link->conn) {
4803 if (connHasReadHandler(link->conn)) *p++ = 'r';
4804 if (connHasWriteHandler(link->conn)) *p++ = 'w';
4805 }
4806 *p = '\0';
4807 addReplyBulkCString(c, "events");
4808 addReplyBulkCString(c, events);
4809
4810 addReplyBulkCString(c, "send-buffer-allocated");
4811 addReplyLongLong(c, sdsalloc(link->sndbuf));
4812
4813 addReplyBulkCString(c, "send-buffer-used");
4814 addReplyLongLong(c, sdslen(link->sndbuf));
4815}
4816
4817/* Add to the output buffer of the given client an array of cluster link descriptions,
 * with each array entry being a description of a single current cluster link. */
4819void addReplyClusterLinksDescription(client *c) {
4820 dictIterator *di;
4821 dictEntry *de;
4822 void *arraylen_ptr = NULL;
4823 int num_links = 0;
4824
4825 arraylen_ptr = addReplyDeferredLen(c);
4826
4827 di = dictGetSafeIterator(server.cluster->nodes);
4828 while((de = dictNext(di)) != NULL) {
4829 clusterNode *node = dictGetVal(de);
4830 if (node->link) {
4831 num_links++;
4832 addReplyClusterLinkDescription(c, node->link);
4833 }
4834 if (node->inbound_link) {
4835 num_links++;
4836 addReplyClusterLinkDescription(c, node->inbound_link);
4837 }
4838 }
4839 dictReleaseIterator(di);
4840
4841 setDeferredArrayLen(c, arraylen_ptr, num_links);
4842}
4843
4844/* -----------------------------------------------------------------------------
4845 * CLUSTER command
4846 * -------------------------------------------------------------------------- */
4847
4848const char *getPreferredEndpoint(clusterNode *n) {
4849 switch(server.cluster_preferred_endpoint_type) {
4850 case CLUSTER_ENDPOINT_TYPE_IP: return n->ip;
4851 case CLUSTER_ENDPOINT_TYPE_HOSTNAME: return (sdslen(n->hostname) != 0) ? n->hostname : "?";
4852 case CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT: return "";
4853 }
4854 return "unknown";
4855}
4856
4857const char *clusterGetMessageTypeString(int type) {
4858 switch(type) {
4859 case CLUSTERMSG_TYPE_PING: return "ping";
4860 case CLUSTERMSG_TYPE_PONG: return "pong";
4861 case CLUSTERMSG_TYPE_MEET: return "meet";
4862 case CLUSTERMSG_TYPE_FAIL: return "fail";
4863 case CLUSTERMSG_TYPE_PUBLISH: return "publish";
4864 case CLUSTERMSG_TYPE_PUBLISHSHARD: return "publishshard";
4865 case CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST: return "auth-req";
4866 case CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK: return "auth-ack";
4867 case CLUSTERMSG_TYPE_UPDATE: return "update";
4868 case CLUSTERMSG_TYPE_MFSTART: return "mfstart";
4869 case CLUSTERMSG_TYPE_MODULE: return "module";
4870 }
4871 return "unknown";
4872}
4873
4874int getSlotOrReply(client *c, robj *o) {
4875 long long slot;
4876
4877 if (getLongLongFromObject(o,&slot) != C_OK ||
4878 slot < 0 || slot >= CLUSTER_SLOTS)
4879 {
4880 addReplyError(c,"Invalid or out of range slot");
4881 return -1;
4882 }
4883 return (int) slot;
4884}
4885
/* Return 1 if the replica node is fully available and should be listed in
 * the CLUSTER SLOTS response; return 0 for nodes that have not finished
 * their initial sync, are in a failed state, or are otherwise considered
 * not available to serve read commands. */
4891static int isReplicaAvailable(clusterNode *node) {
4892 if (nodeFailed(node)) {
4893 return 0;
4894 }
4895 long long repl_offset = node->repl_offset;
4896 if (node->flags & CLUSTER_NODE_MYSELF) {
4897 /* Nodes do not update their own information
4898 * in the cluster node list. */
4899 repl_offset = replicationGetSlaveOffset();
4900 }
4901 return (repl_offset != 0);
4902}
4903
4904int checkSlotAssignmentsOrReply(client *c, unsigned char *slots, int del, int start_slot, int end_slot) {
4905 int slot;
4906 for (slot = start_slot; slot <= end_slot; slot++) {
4907 if (del && server.cluster->slots[slot] == NULL) {
4908 addReplyErrorFormat(c,"Slot %d is already unassigned", slot);
4909 return C_ERR;
4910 } else if (!del && server.cluster->slots[slot]) {
4911 addReplyErrorFormat(c,"Slot %d is already busy", slot);
4912 return C_ERR;
4913 }
4914 if (slots[slot]++ == 1) {
            addReplyErrorFormat(c,"Slot %d specified multiple times",slot);
4916 return C_ERR;
4917 }
4918 }
4919 return C_OK;
4920}
4921
4922void clusterUpdateSlots(client *c, unsigned char *slots, int del) {
4923 int j;
4924 for (j = 0; j < CLUSTER_SLOTS; j++) {
4925 if (slots[j]) {
4926 int retval;
4927
4928 /* If this slot was set as importing we can clear this
4929 * state as now we are the real owner of the slot. */
4930 if (server.cluster->importing_slots_from[j])
4931 server.cluster->importing_slots_from[j] = NULL;
4932
4933 retval = del ? clusterDelSlot(j) :
4934 clusterAddSlot(myself,j);
4935 serverAssertWithInfo(c,NULL,retval == C_OK);
4936 }
4937 }
4938}
4939
4940void addNodeToNodeReply(client *c, clusterNode *node) {
4941 addReplyArrayLen(c, 4);
4942 if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_IP) {
4943 addReplyBulkCString(c, node->ip);
4944 } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_HOSTNAME) {
4945 addReplyBulkCString(c, sdslen(node->hostname) != 0 ? node->hostname : "?");
4946 } else if (server.cluster_preferred_endpoint_type == CLUSTER_ENDPOINT_TYPE_UNKNOWN_ENDPOINT) {
4947 addReplyNull(c);
4948 } else {
4949 serverPanic("Unrecognized preferred endpoint type");
4950 }
4951
4952 /* Report non-TLS ports to non-TLS client in TLS cluster if available. */
4953 int use_pport = (server.tls_cluster &&
4954 c->conn && connGetType(c->conn) != CONN_TYPE_TLS);
4955 addReplyLongLong(c, use_pport && node->pport ? node->pport : node->port);
4956 addReplyBulkCBuffer(c, node->name, CLUSTER_NAMELEN);
4957
    /* Add the additional endpoint information: this is all the known
     * networking information that is not the preferred endpoint. */
4960 void *deflen = addReplyDeferredLen(c);
4961 int length = 0;
4962 if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_IP) {
4963 addReplyBulkCString(c, "ip");
4964 addReplyBulkCString(c, node->ip);
4965 length++;
4966 }
4967 if (server.cluster_preferred_endpoint_type != CLUSTER_ENDPOINT_TYPE_HOSTNAME
4968 && sdslen(node->hostname) != 0)
4969 {
4970 addReplyBulkCString(c, "hostname");
4971 addReplyBulkCString(c, node->hostname);
4972 length++;
4973 }
4974 setDeferredMapLen(c, deflen, length);
4975}
4976
4977void addNodeReplyForClusterSlot(client *c, clusterNode *node, int start_slot, int end_slot) {
4978 int i, nested_elements = 3; /* slots (2) + master addr (1) */
4979 void *nested_replylen = addReplyDeferredLen(c);
4980 addReplyLongLong(c, start_slot);
4981 addReplyLongLong(c, end_slot);
4982 addNodeToNodeReply(c, node);
4983
4984 /* Remaining nodes in reply are replicas for slot range */
4985 for (i = 0; i < node->numslaves; i++) {
4986 /* This loop is copy/pasted from clusterGenNodeDescription()
4987 * with modifications for per-slot node aggregation. */
4988 if (!isReplicaAvailable(node->slaves[i])) continue;
4989 addNodeToNodeReply(c, node->slaves[i]);
4990 nested_elements++;
4991 }
4992 setDeferredArrayLen(c, nested_replylen, nested_elements);
4993}
4994
4995/* Add detailed information of a node to the output buffer of the given client. */
4996void addNodeDetailsToShardReply(client *c, clusterNode *node) {
4997 int reply_count = 0;
4998 void *node_replylen = addReplyDeferredLen(c);
4999 addReplyBulkCString(c, "id");
5000 addReplyBulkCBuffer(c, node->name, CLUSTER_NAMELEN);
5001 reply_count++;
5002
    /* We use server.tls_cluster as a proxy for whether or not the
     * remote port is the TLS port. */
5005 int plaintext_port = server.tls_cluster ? node->pport : node->port;
5006 int tls_port = server.tls_cluster ? node->port : 0;
5007 if (plaintext_port) {
5008 addReplyBulkCString(c, "port");
5009 addReplyLongLong(c, plaintext_port);
5010 reply_count++;
5011 }
5012
5013 if (tls_port) {
5014 addReplyBulkCString(c, "tls-port");
5015 addReplyLongLong(c, tls_port);
5016 reply_count++;
5017 }
5018
5019 addReplyBulkCString(c, "ip");
5020 addReplyBulkCString(c, node->ip);
5021 reply_count++;
5022
5023 addReplyBulkCString(c, "endpoint");
5024 addReplyBulkCString(c, getPreferredEndpoint(node));
5025 reply_count++;
5026
    /* node->hostname is an sds string that is always non-NULL, so check
     * its length to know whether a hostname was actually set. */
    if (sdslen(node->hostname) != 0) {
5028 addReplyBulkCString(c, "hostname");
5029 addReplyBulkCString(c, node->hostname);
5030 reply_count++;
5031 }
5032
5033 long long node_offset;
5034 if (node->flags & CLUSTER_NODE_MYSELF) {
5035 node_offset = nodeIsSlave(node) ? replicationGetSlaveOffset() : server.master_repl_offset;
5036 } else {
5037 node_offset = node->repl_offset;
5038 }
5039
5040 addReplyBulkCString(c, "role");
5041 addReplyBulkCString(c, nodeIsSlave(node) ? "replica" : "master");
5042 reply_count++;
5043
5044 addReplyBulkCString(c, "replication-offset");
5045 addReplyLongLong(c, node_offset);
5046 reply_count++;
5047
5048 addReplyBulkCString(c, "health");
5049 const char *health_msg = NULL;
5050 if (nodeFailed(node)) {
5051 health_msg = "fail";
5052 } else if (nodeIsSlave(node) && node_offset == 0) {
5053 health_msg = "loading";
5054 } else {
5055 health_msg = "online";
5056 }
5057 addReplyBulkCString(c, health_msg);
5058 reply_count++;
5059
5060 setDeferredMapLen(c, node_replylen, reply_count);
5061}
5062
5063/* Add the shard reply of a single shard based off the given primary node. */
5064void addShardReplyForClusterShards(client *c, clusterNode *node, uint16_t *slot_info_pairs, int slot_pairs_count) {
5065 addReplyMapLen(c, 2);
5066 addReplyBulkCString(c, "slots");
5067 if (slot_info_pairs) {
5068 serverAssert((slot_pairs_count % 2) == 0);
5069 addReplyArrayLen(c, slot_pairs_count);
5070 for (int i = 0; i < slot_pairs_count; i++)
5071 addReplyLongLong(c, (unsigned long)slot_info_pairs[i]);
5072 } else {
5073 /* If no slot info pair is provided, the node owns no slots */
5074 addReplyArrayLen(c, 0);
5075 }
5076
5077 addReplyBulkCString(c, "nodes");
5078 list *nodes_for_slot = clusterGetNodesServingMySlots(node);
5079 /* At least the provided node should be serving its slots */
5080 serverAssert(nodes_for_slot);
5081 addReplyArrayLen(c, listLength(nodes_for_slot));
5082 if (listLength(nodes_for_slot) != 0) {
5083 listIter li;
5084 listNode *ln;
5085 listRewind(nodes_for_slot, &li);
5086 while ((ln = listNext(&li))) {
5087 clusterNode *node = listNodeValue(ln);
5088 addNodeDetailsToShardReply(c, node);
5089 }
5090 listRelease(nodes_for_slot);
5091 }
5092}
5093
/* Add to the output buffer of the given client an array of the slot (start,
 * end) pairs owned by the shard, together with the primary and the set of
 * replicas, along with information about each node. */
5097void clusterReplyShards(client *c) {
5098 void *shard_replylen = addReplyDeferredLen(c);
5099 int shard_count = 0;
5100 /* This call will add slot_info_pairs to all nodes */
5101 clusterGenNodesSlotsInfo(0);
5102 dictIterator *di = dictGetSafeIterator(server.cluster->nodes);
5103 dictEntry *de;
    /* Iterate over all the known nodes in the cluster and, for each primary
     * node, generate its entry of the CLUSTER SHARDS response. If the
     * primary node doesn't own any slot, its entry contains the node
     * related information and an empty slots array. */
5108 while((de = dictNext(di)) != NULL) {
5109 clusterNode *n = dictGetVal(de);
5110 if (!nodeIsMaster(n)) {
            /* You can force a replica to own slots, even though it'll get
             * reverted, so we free the slot info pairs here just in case. */
5113 clusterFreeNodesSlotsInfo(n);
5114 continue;
5115 }
5116 shard_count++;
        /* n->slot_info_pairs is set to NULL when the node owns no slots. */
5118 addShardReplyForClusterShards(c, n, n->slot_info_pairs, n->slot_info_pairs_count);
5119 clusterFreeNodesSlotsInfo(n);
5120 }
5121 dictReleaseIterator(di);
5122 setDeferredArrayLen(c, shard_replylen, shard_count);
5123}
5124
5125void clusterReplyMultiBulkSlots(client * c) {
5126 /* Format: 1) 1) start slot
5127 * 2) end slot
5128 * 3) 1) master IP
5129 * 2) master port
5130 * 3) node ID
5131 * 4) 1) replica IP
5132 * 2) replica port
5133 * 3) node ID
5134 * ... continued until done
5135 */
5136 clusterNode *n = NULL;
5137 int num_masters = 0, start = -1;
5138 void *slot_replylen = addReplyDeferredLen(c);
5139
5140 for (int i = 0; i <= CLUSTER_SLOTS; i++) {
5141 /* Find start node and slot id. */
5142 if (n == NULL) {
5143 if (i == CLUSTER_SLOTS) break;
5144 n = server.cluster->slots[i];
5145 start = i;
5146 continue;
5147 }
5148
        /* Add the cluster slots info when we encounter a node different
         * from the owner of the range start, or when we reach the end of
         * the slot space. */
5151 if (i == CLUSTER_SLOTS || n != server.cluster->slots[i]) {
5152 addNodeReplyForClusterSlot(c, n, start, i-1);
5153 num_masters++;
5154 if (i == CLUSTER_SLOTS) break;
5155 n = server.cluster->slots[i];
5156 start = i;
5157 }
5158 }
5159 setDeferredArrayLen(c, slot_replylen, num_masters);
5160}
5161
5162void clusterCommand(client *c) {
5163 if (server.cluster_enabled == 0) {
5164 addReplyError(c,"This instance has cluster support disabled");
5165 return;
5166 }
5167
5168 if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"help")) {
5169 const char *help[] = {
5170"ADDSLOTS <slot> [<slot> ...]",
5171" Assign slots to current node.",
5172"ADDSLOTSRANGE <start slot> <end slot> [<start slot> <end slot> ...]",
5173" Assign slots which are between <start-slot> and <end-slot> to current node.",
5174"BUMPEPOCH",
5175" Advance the cluster config epoch.",
5176"COUNT-FAILURE-REPORTS <node-id>",
5177" Return number of failure reports for <node-id>.",
5178"COUNTKEYSINSLOT <slot>",
5179" Return the number of keys in <slot>.",
5180"DELSLOTS <slot> [<slot> ...]",
5181" Delete slots information from current node.",
5182"DELSLOTSRANGE <start slot> <end slot> [<start slot> <end slot> ...]",
5183" Delete slots information which are between <start-slot> and <end-slot> from current node.",
5184"FAILOVER [FORCE|TAKEOVER]",
5185" Promote current replica node to being a master.",
5186"FORGET <node-id>",
5187" Remove a node from the cluster.",
5188"GETKEYSINSLOT <slot> <count>",
5189" Return key names stored by current node in a slot.",
5190"FLUSHSLOTS",
"    Delete the current node's own slots information.",
5192"INFO",
5193" Return information about the cluster.",
5194"KEYSLOT <key>",
5195" Return the hash slot for <key>.",
5196"MEET <ip> <port> [<bus-port>]",
5197" Connect nodes into a working cluster.",
5198"MYID",
5199" Return the node id.",
5200"NODES",
5201" Return cluster configuration seen by node. Output format:",
5202" <id> <ip:port> <flags> <master> <pings> <pongs> <epoch> <link> <slot> ...",
5203"REPLICATE <node-id>",
5204" Configure current node as replica to <node-id>.",
5205"RESET [HARD|SOFT]",
5206" Reset current node (default: soft).",
5207"SET-CONFIG-EPOCH <epoch>",
5208" Set config epoch of current node.",
5209"SETSLOT <slot> (IMPORTING <node-id>|MIGRATING <node-id>|STABLE|NODE <node-id>)",
5210" Set slot state.",
5211"REPLICAS <node-id>",
5212" Return <node-id> replicas.",
5213"SAVECONFIG",
5214" Force saving cluster configuration on disk.",
5215"SLOTS",
5216" Return information about slots range mappings. Each range is made of:",
5217" start, end, master and replicas IP addresses, ports and ids",
5218"SHARDS",
5219" Return information about slot range mappings and the nodes associated with them.",
5220"LINKS",
5221" Return information about all network links between this node and its peers.",
5222" Output format is an array where each array element is a map containing attributes of a link",
5223NULL
5224 };
5225 addReplyHelp(c, help);
5226 } else if (!strcasecmp(c->argv[1]->ptr,"meet") && (c->argc == 4 || c->argc == 5)) {
5227 /* CLUSTER MEET <ip> <port> [cport] */
5228 long long port, cport;
5229
5230 if (getLongLongFromObject(c->argv[3], &port) != C_OK) {
5231 addReplyErrorFormat(c,"Invalid TCP base port specified: %s",
5232 (char*)c->argv[3]->ptr);
5233 return;
5234 }
5235
5236 if (c->argc == 5) {
5237 if (getLongLongFromObject(c->argv[4], &cport) != C_OK) {
5238 addReplyErrorFormat(c,"Invalid TCP bus port specified: %s",
5239 (char*)c->argv[4]->ptr);
5240 return;
5241 }
5242 } else {
5243 cport = port + CLUSTER_PORT_INCR;
5244 }
5245
5246 if (clusterStartHandshake(c->argv[2]->ptr,port,cport) == 0 &&
5247 errno == EINVAL)
5248 {
5249 addReplyErrorFormat(c,"Invalid node address specified: %s:%s",
5250 (char*)c->argv[2]->ptr, (char*)c->argv[3]->ptr);
5251 } else {
5252 addReply(c,shared.ok);
5253 }
5254 } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) {
5255 /* CLUSTER NODES */
        /* Report plaintext ports only if the cluster is TLS but the
         * client is known to be non-TLS. */
5258 int use_pport = (server.tls_cluster &&
5259 c->conn && connGetType(c->conn) != CONN_TYPE_TLS);
5260 sds nodes = clusterGenNodesDescription(0, use_pport);
5261 addReplyVerbatim(c,nodes,sdslen(nodes),"txt");
5262 sdsfree(nodes);
5263 } else if (!strcasecmp(c->argv[1]->ptr,"myid") && c->argc == 2) {
5264 /* CLUSTER MYID */
5265 addReplyBulkCBuffer(c,myself->name, CLUSTER_NAMELEN);
5266 } else if (!strcasecmp(c->argv[1]->ptr,"slots") && c->argc == 2) {
5267 /* CLUSTER SLOTS */
5268 clusterReplyMultiBulkSlots(c);
5269 } else if (!strcasecmp(c->argv[1]->ptr,"shards") && c->argc == 2) {
5270 /* CLUSTER SHARDS */
5271 clusterReplyShards(c);
5272 } else if (!strcasecmp(c->argv[1]->ptr,"flushslots") && c->argc == 2) {
5273 /* CLUSTER FLUSHSLOTS */
5274 if (dictSize(server.db[0].dict) != 0) {
5275 addReplyError(c,"DB must be empty to perform CLUSTER FLUSHSLOTS.");
5276 return;
5277 }
5278 clusterDelNodeSlots(myself);
5279 clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
5280 addReply(c,shared.ok);
5281 } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") ||
5282 !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3)
5283 {
5284 /* CLUSTER ADDSLOTS <slot> [slot] ... */
5285 /* CLUSTER DELSLOTS <slot> [slot] ... */
5286 int j, slot;
5287 unsigned char *slots = zmalloc(CLUSTER_SLOTS);
5288 int del = !strcasecmp(c->argv[1]->ptr,"delslots");
5289
5290 memset(slots,0,CLUSTER_SLOTS);
        /* Check that all the arguments are parseable. */
5292 for (j = 2; j < c->argc; j++) {
5293 if ((slot = getSlotOrReply(c,c->argv[j])) == C_ERR) {
5294 zfree(slots);
5295 return;
5296 }
5297 }
5298 /* Check that the slots are not already busy. */
5299 for (j = 2; j < c->argc; j++) {
5300 slot = getSlotOrReply(c,c->argv[j]);
5301 if (checkSlotAssignmentsOrReply(c, slots, del, slot, slot) == C_ERR) {
5302 zfree(slots);
5303 return;
5304 }
5305 }
5306 clusterUpdateSlots(c, slots, del);
5307 zfree(slots);
5308 clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
5309 addReply(c,shared.ok);
5310 } else if ((!strcasecmp(c->argv[1]->ptr,"addslotsrange") ||
5311 !strcasecmp(c->argv[1]->ptr,"delslotsrange")) && c->argc >= 4) {
5312 if (c->argc % 2 == 1) {
5313 addReplyErrorArity(c);
5314 return;
5315 }
5316 /* CLUSTER ADDSLOTSRANGE <start slot> <end slot> [<start slot> <end slot> ...] */
5317 /* CLUSTER DELSLOTSRANGE <start slot> <end slot> [<start slot> <end slot> ...] */
5318 int j, startslot, endslot;
5319 unsigned char *slots = zmalloc(CLUSTER_SLOTS);
5320 int del = !strcasecmp(c->argv[1]->ptr,"delslotsrange");
5321
5322 memset(slots,0,CLUSTER_SLOTS);
5323 /* Check that all the arguments are parseable and that all the
5324 * slots are not already busy. */
5325 for (j = 2; j < c->argc; j += 2) {
5326 if ((startslot = getSlotOrReply(c,c->argv[j])) == C_ERR) {
5327 zfree(slots);
5328 return;
5329 }
5330 if ((endslot = getSlotOrReply(c,c->argv[j+1])) == C_ERR) {
5331 zfree(slots);
5332 return;
5333 }
5334 if (startslot > endslot) {
5335 addReplyErrorFormat(c,"start slot number %d is greater than end slot number %d", startslot, endslot);
5336 zfree(slots);
5337 return;
5338 }
5339
5340 if (checkSlotAssignmentsOrReply(c, slots, del, startslot, endslot) == C_ERR) {
5341 zfree(slots);
5342 return;
5343 }
5344 }
5345 clusterUpdateSlots(c, slots, del);
5346 zfree(slots);
5347 clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
5348 addReply(c,shared.ok);
5349 } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) {
5350 /* SETSLOT 10 MIGRATING <node ID> */
5351 /* SETSLOT 10 IMPORTING <node ID> */
5352 /* SETSLOT 10 STABLE */
5353 /* SETSLOT 10 NODE <node ID> */
5354 int slot;
5355 clusterNode *n;
5356
5357 if (nodeIsSlave(myself)) {
5358 addReplyError(c,"Please use SETSLOT only with masters.");
5359 return;
5360 }
5361
5362 if ((slot = getSlotOrReply(c,c->argv[2])) == -1) return;
5363
5364 if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) {
5365 if (server.cluster->slots[slot] != myself) {
5366 addReplyErrorFormat(c,"I'm not the owner of hash slot %u",slot);
5367 return;
5368 }
5369 n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr));
5370 if (n == NULL) {
5371 addReplyErrorFormat(c,"I don't know about node %s",
5372 (char*)c->argv[4]->ptr);
5373 return;
5374 }
5375 if (nodeIsSlave(n)) {
5376 addReplyError(c,"Target node is not a master");
5377 return;
5378 }
5379 server.cluster->migrating_slots_to[slot] = n;
5380 } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) {
5381 if (server.cluster->slots[slot] == myself) {
5382 addReplyErrorFormat(c,
5383 "I'm already the owner of hash slot %u",slot);
5384 return;
5385 }
5386 n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr));
5387 if (n == NULL) {
5388 addReplyErrorFormat(c,"I don't know about node %s",
5389 (char*)c->argv[4]->ptr);
5390 return;
5391 }
5392 if (nodeIsSlave(n)) {
5393 addReplyError(c,"Target node is not a master");
5394 return;
5395 }
5396 server.cluster->importing_slots_from[slot] = n;
5397 } else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) {
5398 /* CLUSTER SETSLOT <SLOT> STABLE */
5399 server.cluster->importing_slots_from[slot] = NULL;
5400 server.cluster->migrating_slots_to[slot] = NULL;
5401 } else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) {
5402 /* CLUSTER SETSLOT <SLOT> NODE <NODE ID> */
5403 n = clusterLookupNode(c->argv[4]->ptr, sdslen(c->argv[4]->ptr));
5404 if (!n) {
5405 addReplyErrorFormat(c,"Unknown node %s",
5406 (char*)c->argv[4]->ptr);
5407 return;
5408 }
5409 if (nodeIsSlave(n)) {
5410 addReplyError(c,"Target node is not a master");
5411 return;
5412 }
            /* If this hash slot was served by 'myself' before the switch,
             * make sure there are no longer local keys for this hash
             * slot. */
5415 if (server.cluster->slots[slot] == myself && n != myself) {
5416 if (countKeysInSlot(slot) != 0) {
5417 addReplyErrorFormat(c,
5418 "Can't assign hashslot %d to a different node "
5419 "while I still hold keys for this hash slot.", slot);
5420 return;
5421 }
5422 }
            /* If this slot is in migrating status but we have no keys for
             * it, assigning the slot to another node will clear the
             * migrating status. */
5426 if (countKeysInSlot(slot) == 0 &&
5427 server.cluster->migrating_slots_to[slot])
5428 server.cluster->migrating_slots_to[slot] = NULL;
5429
5430 int slot_was_mine = server.cluster->slots[slot] == myself;
5431 clusterDelSlot(slot);
5432 clusterAddSlot(n,slot);
5433
5434 /* If we are a master left without slots, we should turn into a
5435 * replica of the new master. */
5436 if (slot_was_mine &&
5437 n != myself &&
5438 myself->numslots == 0 &&
5439 server.cluster_allow_replica_migration)
5440 {
5441 serverLog(LL_WARNING,
5442 "Configuration change detected. Reconfiguring myself "
5443 "as a replica of %.40s", n->name);
5444 clusterSetMaster(n);
5445 clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG |
5446 CLUSTER_TODO_UPDATE_STATE |
5447 CLUSTER_TODO_FSYNC_CONFIG);
5448 }
5449
5450 /* If this node was importing this slot, assigning the slot to
5451 * itself also clears the importing status. */
5452 if (n == myself &&
5453 server.cluster->importing_slots_from[slot])
5454 {
                /* This slot was manually migrated: set this node's
                 * configEpoch to a new epoch so that the new version can be
                 * propagated by the cluster.
5458 *
5459 * Note that if this ever results in a collision with another
5460 * node getting the same configEpoch, for example because a
5461 * failover happens at the same time we close the slot, the
5462 * configEpoch collision resolution will fix it assigning
5463 * a different epoch to each node. */
5464 if (clusterBumpConfigEpochWithoutConsensus() == C_OK) {
5465 serverLog(LL_WARNING,
5466 "configEpoch updated after importing slot %d", slot);
5467 }
5468 server.cluster->importing_slots_from[slot] = NULL;
5469 /* After importing this slot, let the other nodes know as
5470 * soon as possible. */
5471 clusterBroadcastPong(CLUSTER_BROADCAST_ALL);
5472 }
5473 } else {
5474 addReplyError(c,
5475 "Invalid CLUSTER SETSLOT action or number of arguments. Try CLUSTER HELP");
5476 return;
5477 }
5478 clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE);
5479 addReply(c,shared.ok);
5480 } else if (!strcasecmp(c->argv[1]->ptr,"bumpepoch") && c->argc == 2) {
5481 /* CLUSTER BUMPEPOCH */
5482 int retval = clusterBumpConfigEpochWithoutConsensus();
5483 sds reply = sdscatprintf(sdsempty(),"+%s %llu\r\n",
5484 (retval == C_OK) ? "BUMPED" : "STILL",
5485 (unsigned long long) myself->configEpoch);
5486 addReplySds(c,reply);
5487 } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) {
5488 /* CLUSTER INFO */
5489 char *statestr[] = {"ok","fail"};
5490 int slots_assigned = 0, slots_ok = 0, slots_pfail = 0, slots_fail = 0;
5491 uint64_t myepoch;
5492 int j;
5493
5494 for (j = 0; j < CLUSTER_SLOTS; j++) {
5495 clusterNode *n = server.cluster->slots[j];
5496
5497 if (n == NULL) continue;
5498 slots_assigned++;
5499 if (nodeFailed(n)) {
5500 slots_fail++;
5501 } else if (nodeTimedOut(n)) {
5502 slots_pfail++;
5503 } else {
5504 slots_ok++;
5505 }
5506 }
5507
5508 myepoch = (nodeIsSlave(myself) && myself->slaveof) ?
5509 myself->slaveof->configEpoch : myself->configEpoch;
5510
5511 sds info = sdscatprintf(sdsempty(),
5512 "cluster_state:%s\r\n"
5513 "cluster_slots_assigned:%d\r\n"
5514 "cluster_slots_ok:%d\r\n"
5515 "cluster_slots_pfail:%d\r\n"
5516 "cluster_slots_fail:%d\r\n"
5517 "cluster_known_nodes:%lu\r\n"
5518 "cluster_size:%d\r\n"
5519 "cluster_current_epoch:%llu\r\n"
5520 "cluster_my_epoch:%llu\r\n"
5521 , statestr[server.cluster->state],
5522 slots_assigned,
5523 slots_ok,
5524 slots_pfail,
5525 slots_fail,
5526 dictSize(server.cluster->nodes),
5527 server.cluster->size,
5528 (unsigned long long) server.cluster->currentEpoch,
5529 (unsigned long long) myepoch
5530 );
5531
5532 /* Show stats about messages sent and received. */
5533 long long tot_msg_sent = 0;
5534 long long tot_msg_received = 0;
5535
5536 for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) {
5537 if (server.cluster->stats_bus_messages_sent[i] == 0) continue;
5538 tot_msg_sent += server.cluster->stats_bus_messages_sent[i];
5539 info = sdscatprintf(info,
5540 "cluster_stats_messages_%s_sent:%lld\r\n",
5541 clusterGetMessageTypeString(i),
5542 server.cluster->stats_bus_messages_sent[i]);
5543 }
5544 info = sdscatprintf(info,
5545 "cluster_stats_messages_sent:%lld\r\n", tot_msg_sent);
5546
5547 for (int i = 0; i < CLUSTERMSG_TYPE_COUNT; i++) {
5548 if (server.cluster->stats_bus_messages_received[i] == 0) continue;
5549 tot_msg_received += server.cluster->stats_bus_messages_received[i];
5550 info = sdscatprintf(info,
5551 "cluster_stats_messages_%s_received:%lld\r\n",
5552 clusterGetMessageTypeString(i),
5553 server.cluster->stats_bus_messages_received[i]);
5554 }
5555 info = sdscatprintf(info,
5556 "cluster_stats_messages_received:%lld\r\n", tot_msg_received);
5557
5558 info = sdscatprintf(info,
5559 "total_cluster_links_buffer_limit_exceeded:%llu\r\n",
5560 server.cluster->stat_cluster_links_buffer_limit_exceeded);
5561
5562 /* Produce the reply protocol. */
5563 addReplyVerbatim(c,info,sdslen(info),"txt");
5564 sdsfree(info);
5565 } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) {
5566 int retval = clusterSaveConfig(1);
5567
5568 if (retval == 0)
5569 addReply(c,shared.ok);
5570 else
5571 addReplyErrorFormat(c,"error saving the cluster node config: %s",
5572 strerror(errno));
5573 } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) {
5574 /* CLUSTER KEYSLOT <key> */
5575 sds key = c->argv[2]->ptr;
5576
5577 addReplyLongLong(c,keyHashSlot(key,sdslen(key)));
5578 } else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) {
5579 /* CLUSTER COUNTKEYSINSLOT <slot> */
5580 long long slot;
5581
5582 if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK)
5583 return;
5584 if (slot < 0 || slot >= CLUSTER_SLOTS) {
5585 addReplyError(c,"Invalid slot");
5586 return;
5587 }
5588 addReplyLongLong(c,countKeysInSlot(slot));
5589 } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) {
5590 /* CLUSTER GETKEYSINSLOT <slot> <count> */
5591 long long maxkeys, slot;
5592
5593 if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != C_OK)
5594 return;
5595 if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL)
5596 != C_OK)
5597 return;
5598 if (slot < 0 || slot >= CLUSTER_SLOTS || maxkeys < 0) {
5599 addReplyError(c,"Invalid slot or number of keys");
5600 return;
5601 }
5602
5603 unsigned int keys_in_slot = countKeysInSlot(slot);
5604 unsigned int numkeys = maxkeys > keys_in_slot ? keys_in_slot : maxkeys;
5605 addReplyArrayLen(c,numkeys);
5606 dictEntry *de = (*server.db->slots_to_keys).by_slot[slot].head;
5607 for (unsigned int j = 0; j < numkeys; j++) {
5608 serverAssert(de != NULL);
5609 sds sdskey = dictGetKey(de);
5610 addReplyBulkCBuffer(c, sdskey, sdslen(sdskey));
5611 de = dictEntryNextInSlot(de);
5612 }
5613 } else if (!strcasecmp(c->argv[1]->ptr,"forget") && c->argc == 3) {
5614 /* CLUSTER FORGET <NODE ID> */
5615 clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
5616 if (!n) {
5617 addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
5618 return;
5619 } else if (n == myself) {
5620 addReplyError(c,"I tried hard but I can't forget myself...");
5621 return;
5622 } else if (nodeIsSlave(myself) && myself->slaveof == n) {
5623 addReplyError(c,"Can't forget my master!");
5624 return;
5625 }
5626 clusterBlacklistAddNode(n);
5627 clusterDelNode(n);
5628 clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|
5629 CLUSTER_TODO_SAVE_CONFIG);
5630 addReply(c,shared.ok);
5631 } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) {
5632 /* CLUSTER REPLICATE <NODE ID> */
5633 /* Lookup the specified node in our table. */
5634 clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
5635 if (!n) {
5636 addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
5637 return;
5638 }
5639
5640 /* I can't replicate myself. */
5641 if (n == myself) {
5642 addReplyError(c,"Can't replicate myself");
5643 return;
5644 }
5645
5646 /* Can't replicate a slave. */
5647 if (nodeIsSlave(n)) {
5648 addReplyError(c,"I can only replicate a master, not a replica.");
5649 return;
5650 }
5651
5652 /* If the instance is currently a master, it should have no assigned
5653 * slots nor keys to accept to replicate some other node.
5654 * Slaves can switch to another master without issues. */
5655 if (nodeIsMaster(myself) &&
5656 (myself->numslots != 0 || dictSize(server.db[0].dict) != 0)) {
5657 addReplyError(c,
5658 "To set a master the node must be empty and "
5659 "without assigned slots.");
5660 return;
5661 }
5662
5663 /* Set the master. */
5664 clusterSetMaster(n);
5665 clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
5666 addReply(c,shared.ok);
5667 } else if ((!strcasecmp(c->argv[1]->ptr,"slaves") ||
5668 !strcasecmp(c->argv[1]->ptr,"replicas")) && c->argc == 3) {
5669 /* CLUSTER SLAVES <NODE ID> */
5670 clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
5671 int j;
5672
5673 /* Lookup the specified node in our table. */
5674 if (!n) {
5675 addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
5676 return;
5677 }
5678
5679 if (nodeIsSlave(n)) {
5680 addReplyError(c,"The specified node is not a master");
5681 return;
5682 }
5683
5684 /* Use plaintext port if cluster is TLS but client is non-TLS. */
5685 int use_pport = (server.tls_cluster &&
5686 c->conn && connGetType(c->conn) != CONN_TYPE_TLS);
5687 addReplyArrayLen(c,n->numslaves);
5688 for (j = 0; j < n->numslaves; j++) {
5689 sds ni = clusterGenNodeDescription(n->slaves[j], use_pport);
5690 addReplyBulkCString(c,ni);
5691 sdsfree(ni);
5692 }
5693 } else if (!strcasecmp(c->argv[1]->ptr,"count-failure-reports") &&
5694 c->argc == 3)
5695 {
5696 /* CLUSTER COUNT-FAILURE-REPORTS <NODE ID> */
5697 clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
5698
5699 if (!n) {
5700 addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
5701 return;
5702 } else {
5703 addReplyLongLong(c,clusterNodeFailureReportsCount(n));
5704 }
5705 } else if (!strcasecmp(c->argv[1]->ptr,"failover") &&
5706 (c->argc == 2 || c->argc == 3))
5707 {
5708 /* CLUSTER FAILOVER [FORCE|TAKEOVER] */
5709 int force = 0, takeover = 0;
5710
5711 if (c->argc == 3) {
5712 if (!strcasecmp(c->argv[2]->ptr,"force")) {
5713 force = 1;
5714 } else if (!strcasecmp(c->argv[2]->ptr,"takeover")) {
5715 takeover = 1;
5716 force = 1; /* Takeover also implies force. */
5717 } else {
5718 addReplyErrorObject(c,shared.syntaxerr);
5719 return;
5720 }
5721 }
5722
5723 /* Check preconditions. */
5724 if (nodeIsMaster(myself)) {
5725 addReplyError(c,"You should send CLUSTER FAILOVER to a replica");
5726 return;
5727 } else if (myself->slaveof == NULL) {
5728 addReplyError(c,"I'm a replica but my master is unknown to me");
5729 return;
5730 } else if (!force &&
5731 (nodeFailed(myself->slaveof) ||
5732 myself->slaveof->link == NULL))
5733 {
5734 addReplyError(c,"Master is down or failed, "
5735 "please use CLUSTER FAILOVER FORCE");
5736 return;
5737 }
5738 resetManualFailover();
5739 server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT;
5740
5741 if (takeover) {
5742 /* A takeover does not perform any initial check. It just
5743 * generates a new configuration epoch for this node without
         * consensus, claims the master's slots, and broadcasts the new
5745 * configuration. */
5746 serverLog(LL_WARNING,"Taking over the master (user request).");
5747 clusterBumpConfigEpochWithoutConsensus();
5748 clusterFailoverReplaceYourMaster();
5749 } else if (force) {
5750 /* If this is a forced failover, we don't need to talk with our
5751 * master to agree about the offset. We just failover taking over
5752 * it without coordination. */
5753 serverLog(LL_WARNING,"Forced failover user request accepted.");
5754 server.cluster->mf_can_start = 1;
5755 } else {
5756 serverLog(LL_WARNING,"Manual failover user request accepted.");
5757 clusterSendMFStart(myself->slaveof);
5758 }
5759 addReply(c,shared.ok);
5760 } else if (!strcasecmp(c->argv[1]->ptr,"set-config-epoch") && c->argc == 3)
5761 {
5762 /* CLUSTER SET-CONFIG-EPOCH <epoch>
5763 *
5764 * The user is allowed to set the config epoch only when a node is
5765 * totally fresh: no config epoch, no other known node, and so forth.
         * This happens at cluster creation time to start with a cluster
         * where every node has a different config epoch, without relying
         * on the conflict resolution system, which is too slow when a big
         * cluster is created. */
5769 long long epoch;
5770
5771 if (getLongLongFromObjectOrReply(c,c->argv[2],&epoch,NULL) != C_OK)
5772 return;
5773
5774 if (epoch < 0) {
5775 addReplyErrorFormat(c,"Invalid config epoch specified: %lld",epoch);
5776 } else if (dictSize(server.cluster->nodes) > 1) {
5777 addReplyError(c,"The user can assign a config epoch only when the "
5778 "node does not know any other node.");
5779 } else if (myself->configEpoch != 0) {
5780 addReplyError(c,"Node config epoch is already non-zero");
5781 } else {
5782 myself->configEpoch = epoch;
5783 serverLog(LL_WARNING,
5784 "configEpoch set to %llu via CLUSTER SET-CONFIG-EPOCH",
5785 (unsigned long long) myself->configEpoch);
5786
5787 if (server.cluster->currentEpoch < (uint64_t)epoch)
5788 server.cluster->currentEpoch = epoch;
5789 /* No need to fsync the config here since in the unlucky event
5790 * of a failure to persist the config, the conflict resolution code
5791 * will assign a unique config to this node. */
5792 clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|
5793 CLUSTER_TODO_SAVE_CONFIG);
5794 addReply(c,shared.ok);
5795 }
5796 } else if (!strcasecmp(c->argv[1]->ptr,"reset") &&
5797 (c->argc == 2 || c->argc == 3))
5798 {
5799 /* CLUSTER RESET [SOFT|HARD] */
5800 int hard = 0;
5801
5802 /* Parse soft/hard argument. Default is soft. */
5803 if (c->argc == 3) {
5804 if (!strcasecmp(c->argv[2]->ptr,"hard")) {
5805 hard = 1;
5806 } else if (!strcasecmp(c->argv[2]->ptr,"soft")) {
5807 hard = 0;
5808 } else {
5809 addReplyErrorObject(c,shared.syntaxerr);
5810 return;
5811 }
5812 }
5813
        /* Slaves can be reset while containing data, but master nodes
         * must be empty. */
5816 if (nodeIsMaster(myself) && dictSize(c->db->dict) != 0) {
5817 addReplyError(c,"CLUSTER RESET can't be called with "
5818 "master nodes containing keys");
5819 return;
5820 }
5821 clusterReset(hard);
5822 addReply(c,shared.ok);
5823 } else if (!strcasecmp(c->argv[1]->ptr,"links") && c->argc == 2) {
5824 /* CLUSTER LINKS */
5825 addReplyClusterLinksDescription(c);
5826 } else {
5827 addReplySubcommandSyntaxError(c);
5828 return;
5829 }
5830}
5831
5832void removeChannelsInSlot(unsigned int slot) {
5833 unsigned int channelcount = countChannelsInSlot(slot);
5834 if (channelcount == 0) return;
5835
5836 /* Retrieve all the channels for the slot. */
5837 robj **channels = zmalloc(sizeof(robj*)*channelcount);
5838 raxIterator iter;
5839 int j = 0;
5840 unsigned char indexed[2];
5841
5842 indexed[0] = (slot >> 8) & 0xff;
5843 indexed[1] = slot & 0xff;
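    /* For example, slot 12539 == 0x30FB is encoded as the prefix bytes
     * {0x30, 0xFB}, so the seek below positions the iterator at the first
     * channel stored under that slot. */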
5844 raxStart(&iter,server.cluster->slots_to_channels);
5845 raxSeek(&iter,">=",indexed,2);
5846 while(raxNext(&iter)) {
5847 if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break;
5848 channels[j++] = createStringObject((char*)iter.key + 2, iter.key_len - 2);
5849 }
5850 raxStop(&iter);
5851
5852 pubsubUnsubscribeShardChannels(channels, channelcount);
5853 zfree(channels);
5854}
5855
5856/* -----------------------------------------------------------------------------
5857 * DUMP, RESTORE and MIGRATE commands
5858 * -------------------------------------------------------------------------- */
5859
/* Generate a DUMP-format representation of the object 'o', adding it to the
 * io stream pointed to by 'payload'. This function can't fail. */
5862void createDumpPayload(rio *payload, robj *o, robj *key, int dbid) {
5863 unsigned char buf[2];
5864 uint64_t crc;
5865
    /* Serialize the object in an RDB-like format. It consists of an object
     * type byte followed by the serialized object. This is understood by
     * RESTORE. */
5868 rioInitWithBuffer(payload,sdsempty());
5869 serverAssert(rdbSaveObjectType(payload,o));
5870 serverAssert(rdbSaveObject(payload,o,key,dbid));
5871
5872 /* Write the footer; this is how it looks:
5873 * ----------------+---------------------+---------------+
5874 * ... RDB payload | 2 bytes RDB version | 8 bytes CRC64 |
5875 * ----------------+---------------------+---------------+
5876 * RDB version and CRC are both in little endian.
5877 */
5878
5879 /* RDB version */
5880 buf[0] = RDB_VERSION & 0xff;
5881 buf[1] = (RDB_VERSION >> 8) & 0xff;
5882 payload->io.buffer.ptr = sdscatlen(payload->io.buffer.ptr,buf,2);
5883
5884 /* CRC64 */
5885 crc = crc64(0,(unsigned char*)payload->io.buffer.ptr,
5886 sdslen(payload->io.buffer.ptr));
5887 memrev64ifbe(&crc);
5888 payload->io.buffer.ptr = sdscatlen(payload->io.buffer.ptr,&crc,8);
5889}
5890
5891/* Verify that the RDB version of the dump payload matches the one of this Redis
5892 * instance and that the checksum is ok.
5893 * If the DUMP payload looks valid C_OK is returned, otherwise C_ERR
5894 * is returned. If rdbver_ptr is not NULL, it is populated with the value read
5895 * from the input buffer. */
5896int verifyDumpPayload(unsigned char *p, size_t len, uint16_t *rdbver_ptr) {
5897 unsigned char *footer;
5898 uint16_t rdbver;
5899 uint64_t crc;
5900
5901 /* At least 2 bytes of RDB version and 8 of CRC64 should be present. */
5902 if (len < 10) return C_ERR;
5903 footer = p+(len-10);
5904
5905 /* Set and verify RDB version. */
5906 rdbver = (footer[1] << 8) | footer[0];
5907 if (rdbver_ptr) {
5908 *rdbver_ptr = rdbver;
5909 }
5910 if (rdbver > RDB_VERSION) return C_ERR;
5911
5912 if (server.skip_checksum_validation)
5913 return C_OK;
5914
5915 /* Verify CRC64 */
5916 crc = crc64(0,p,len-8);
5917 memrev64ifbe(&crc);
5918 return (memcmp(&crc,footer+2,8) == 0) ? C_OK : C_ERR;
5919}
5920
5921/* DUMP keyname
5922 * DUMP is actually not used by Redis Cluster but it is the obvious
5923 * complement of RESTORE and can be useful for different applications. */
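/* An example round trip from redis-cli (a sketch: the serialized bytes shown
 * are illustrative, since the exact payload depends on the object encoding
 * and on the RDB version of this build):
 *
 *   127.0.0.1:6379> SET mykey "hello"
 *   OK
 *   127.0.0.1:6379> DUMP mykey
 *   "\x00\x05hello\n\x00..."
 *   127.0.0.1:6379> RESTORE mykey2 0 "\x00\x05hello\n\x00..."
 *   OK
 */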
5924void dumpCommand(client *c) {
5925 robj *o;
5926 rio payload;
5927
5928 /* Check if the key is here. */
5929 if ((o = lookupKeyRead(c->db,c->argv[1])) == NULL) {
5930 addReplyNull(c);
5931 return;
5932 }
5933
5934 /* Create the DUMP encoded representation. */
5935 createDumpPayload(&payload,o,c->argv[1],c->db->id);
5936
5937 /* Transfer to the client */
5938 addReplyBulkSds(c,payload.io.buffer.ptr);
5939 return;
5940}
5941
5942/* RESTORE key ttl serialized-value [REPLACE] [ABSTTL] [IDLETIME seconds] [FREQ frequency] */
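/* A sketch of a typical invocation, reusing the illustrative payload from the
 * DUMP example above; REPLACE overwrites any existing key, and IDLETIME primes
 * the LRU idle time (in seconds) of the restored value:
 *
 *   RESTORE mykey 0 "\x00\x05hello\n\x00..." REPLACE IDLETIME 100
 */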
5943void restoreCommand(client *c) {
5944 long long ttl, lfu_freq = -1, lru_idle = -1, lru_clock = -1;
5945 rio payload;
5946 int j, type, replace = 0, absttl = 0;
5947 robj *obj;
5948
5949 /* Parse additional options */
5950 for (j = 4; j < c->argc; j++) {
5951 int additional = c->argc-j-1;
5952 if (!strcasecmp(c->argv[j]->ptr,"replace")) {
5953 replace = 1;
5954 } else if (!strcasecmp(c->argv[j]->ptr,"absttl")) {
5955 absttl = 1;
5956 } else if (!strcasecmp(c->argv[j]->ptr,"idletime") && additional >= 1 &&
5957 lfu_freq == -1)
5958 {
5959 if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lru_idle,NULL)
5960 != C_OK) return;
5961 if (lru_idle < 0) {
5962 addReplyError(c,"Invalid IDLETIME value, must be >= 0");
5963 return;
5964 }
5965 lru_clock = LRU_CLOCK();
5966 j++; /* Consume additional arg. */
5967 } else if (!strcasecmp(c->argv[j]->ptr,"freq") && additional >= 1 &&
5968 lru_idle == -1)
5969 {
5970 if (getLongLongFromObjectOrReply(c,c->argv[j+1],&lfu_freq,NULL)
5971 != C_OK) return;
5972 if (lfu_freq < 0 || lfu_freq > 255) {
5973 addReplyError(c,"Invalid FREQ value, must be >= 0 and <= 255");
5974 return;
5975 }
5976 j++; /* Consume additional arg. */
5977 } else {
5978 addReplyErrorObject(c,shared.syntaxerr);
5979 return;
5980 }
5981 }
5982
5983 /* Make sure this key does not already exist here... */
5984 robj *key = c->argv[1];
5985 if (!replace && lookupKeyWrite(c->db,key) != NULL) {
5986 addReplyErrorObject(c,shared.busykeyerr);
5987 return;
5988 }
5989
5990 /* Check if the TTL value makes sense */
5991 if (getLongLongFromObjectOrReply(c,c->argv[2],&ttl,NULL) != C_OK) {
5992 return;
5993 } else if (ttl < 0) {
5994 addReplyError(c,"Invalid TTL value, must be >= 0");
5995 return;
5996 }
5997
5998 /* Verify RDB version and data checksum. */
5999 if (verifyDumpPayload(c->argv[3]->ptr,sdslen(c->argv[3]->ptr),NULL) == C_ERR)
6000 {
6001 addReplyError(c,"DUMP payload version or checksum is wrong");
6002 return;
6003 }
6004
6005 rioInitWithBuffer(&payload,c->argv[3]->ptr);
6006 if (((type = rdbLoadObjectType(&payload)) == -1) ||
6007 ((obj = rdbLoadObject(type,&payload,key->ptr,c->db->id,NULL)) == NULL))
6008 {
6009 addReplyError(c,"Bad data format");
6010 return;
6011 }
6012
6013 /* Remove the old key if needed. */
6014 int deleted = 0;
6015 if (replace)
6016 deleted = dbDelete(c->db,key);
6017
6018 if (ttl && !absttl) ttl+=mstime();
6019 if (ttl && checkAlreadyExpired(ttl)) {
6020 if (deleted) {
6021 rewriteClientCommandVector(c,2,shared.del,key);
6022 signalModifiedKey(c,c->db,key);
6023 notifyKeyspaceEvent(NOTIFY_GENERIC,"del",key,c->db->id);
6024 server.dirty++;
6025 }
6026 decrRefCount(obj);
6027 addReply(c, shared.ok);
6028 return;
6029 }
6030
6031 /* Create the key and set the TTL if any */
6032 dbAdd(c->db,key,obj);
6033 if (ttl) {
6034 setExpire(c,c->db,key,ttl);
6035 if (!absttl) {
6036 /* Propagate TTL as absolute timestamp */
6037 robj *ttl_obj = createStringObjectFromLongLong(ttl);
6038 rewriteClientCommandArgument(c,2,ttl_obj);
6039 decrRefCount(ttl_obj);
6040 rewriteClientCommandArgument(c,c->argc,shared.absttl);
6041 }
6042 }
6043 objectSetLRUOrLFU(obj,lfu_freq,lru_idle,lru_clock,1000);
6044 signalModifiedKey(c,c->db,key);
6045 notifyKeyspaceEvent(NOTIFY_GENERIC,"restore",key,c->db->id);
6046 addReply(c,shared.ok);
6047 server.dirty++;
6048}
6049
6050/* MIGRATE socket cache implementation.
6051 *
6052 * We keep a map between host:port pairs and the TCP sockets that we used
6053 * to connect to those instances recently.
6054 * These sockets are closed when the max number we cache is reached, and also
6055 * in serverCron() when they have been around for more than a few seconds. */
6056#define MIGRATE_SOCKET_CACHE_ITEMS 64 /* max num of items in the cache. */
6057#define MIGRATE_SOCKET_CACHE_TTL 10 /* close cached sockets after 10 sec. */
6058
6059typedef struct migrateCachedSocket {
6060 connection *conn; /* Connection with the target instance. */
6061 long last_dbid; /* Last selected DB; -1 forces a SELECT next time. */
6062 time_t last_use_time; /* Used to expire cached sockets in serverCron(). */
6063} migrateCachedSocket;
6064
6065/* Return a migrateCachedSocket containing a TCP socket connected with the
6066 * target instance, possibly returning a cached one.
6067 *
6068 * This function is responsible for sending errors to the client if a
6069 * connection can't be established. In this case NULL is returned.
6070 * Otherwise on success the cached socket is returned, and the caller
6071 * should not attempt to free it after usage.
6072 *
6073 * If the caller detects an error while using the socket, migrateCloseSocket()
6074 * should be called so that the connection will be created from scratch
6075 * the next time. */
6076migrateCachedSocket* migrateGetSocket(client *c, robj *host, robj *port, long timeout) {
6077 connection *conn;
6078 sds name = sdsempty();
6079 migrateCachedSocket *cs;
6080
6081 /* Check if we have an already cached socket for this ip:port pair. */
6082 name = sdscatlen(name,host->ptr,sdslen(host->ptr));
6083 name = sdscatlen(name,":",1);
6084 name = sdscatlen(name,port->ptr,sdslen(port->ptr));
6085 cs = dictFetchValue(server.migrate_cached_sockets,name);
6086 if (cs) {
6087 sdsfree(name);
6088 cs->last_use_time = server.unixtime;
6089 return cs;
6090 }
6091
6092 /* No cached socket, create one. */
6093 if (dictSize(server.migrate_cached_sockets) == MIGRATE_SOCKET_CACHE_ITEMS) {
6094 /* Too many items, drop one at random. */
6095 dictEntry *de = dictGetRandomKey(server.migrate_cached_sockets);
6096 cs = dictGetVal(de);
6097 connClose(cs->conn);
6098 zfree(cs);
6099 dictDelete(server.migrate_cached_sockets,dictGetKey(de));
6100 }
6101
6102 /* Create the socket */
6103 conn = server.tls_cluster ? connCreateTLS() : connCreateSocket();
6104 if (connBlockingConnect(conn, host->ptr, atoi(port->ptr), timeout)
6105 != C_OK) {
6106 addReplyError(c,"-IOERR error or timeout connecting to the target instance");
6107 connClose(conn);
6108 sdsfree(name);
6109 return NULL;
6110 }
6111 connEnableTcpNoDelay(conn);
6112
6113 /* Add to the cache and return it to the caller. */
6114 cs = zmalloc(sizeof(*cs));
6115 cs->conn = conn;
6116
6117 cs->last_dbid = -1;
6118 cs->last_use_time = server.unixtime;
6119 dictAdd(server.migrate_cached_sockets,name,cs);
6120 return cs;
6121}
6122
6123/* Free a migrate cached connection. */
6124void migrateCloseSocket(robj *host, robj *port) {
6125 sds name = sdsempty();
6126 migrateCachedSocket *cs;
6127
6128 name = sdscatlen(name,host->ptr,sdslen(host->ptr));
6129 name = sdscatlen(name,":",1);
6130 name = sdscatlen(name,port->ptr,sdslen(port->ptr));
6131 cs = dictFetchValue(server.migrate_cached_sockets,name);
6132 if (!cs) {
6133 sdsfree(name);
6134 return;
6135 }
6136
6137 connClose(cs->conn);
6138 zfree(cs);
6139 dictDelete(server.migrate_cached_sockets,name);
6140 sdsfree(name);
6141}
6142
6143void migrateCloseTimedoutSockets(void) {
6144 dictIterator *di = dictGetSafeIterator(server.migrate_cached_sockets);
6145 dictEntry *de;
6146
6147 while((de = dictNext(di)) != NULL) {
6148 migrateCachedSocket *cs = dictGetVal(de);
6149
6150 if ((server.unixtime - cs->last_use_time) > MIGRATE_SOCKET_CACHE_TTL) {
6151 connClose(cs->conn);
6152 zfree(cs);
6153 dictDelete(server.migrate_cached_sockets,dictGetKey(de));
6154 }
6155 }
6156 dictReleaseIterator(di);
6157}
6158
6159/* MIGRATE host port key dbid timeout [COPY | REPLACE | AUTH password |
6160 * AUTH2 username password]
6161 *
6162 * Or, in the multiple keys form:
6163 *
6164 * MIGRATE host port "" dbid timeout [COPY | REPLACE | AUTH password |
6165 * AUTH2 username password] KEYS key1 key2 ... keyN */
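/* For example (host, port and key names are purely illustrative):
 *
 *   MIGRATE 192.168.1.34 6379 mykey 0 5000
 *   MIGRATE 192.168.1.34 6379 "" 0 5000 REPLACE KEYS key1 key2 key3
 *
 * The second form transfers key1, key2 and key3 over a single connection,
 * replacing keys that already exist on the target instance. */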
6166void migrateCommand(client *c) {
6167 migrateCachedSocket *cs;
6168 int copy = 0, replace = 0, j;
6169 char *username = NULL;
6170 char *password = NULL;
6171 long timeout;
6172 long dbid;
6173 robj **ov = NULL; /* Objects to migrate. */
6174 robj **kv = NULL; /* Key names. */
6175 robj **newargv = NULL; /* Used to rewrite the command as DEL ... keys ... */
6176 rio cmd, payload;
6177 int may_retry = 1;
6178 int write_error = 0;
6179 int argv_rewritten = 0;
6180
6181 /* To support the KEYS option we need the following additional state. */
6182 int first_key = 3; /* Argument index of the first key. */
6183 int num_keys = 1; /* By default only migrate the 'key' argument. */
6184
6185 /* Parse additional options */
6186 for (j = 6; j < c->argc; j++) {
6187 int moreargs = (c->argc-1) - j;
6188 if (!strcasecmp(c->argv[j]->ptr,"copy")) {
6189 copy = 1;
6190 } else if (!strcasecmp(c->argv[j]->ptr,"replace")) {
6191 replace = 1;
6192 } else if (!strcasecmp(c->argv[j]->ptr,"auth")) {
6193 if (!moreargs) {
6194 addReplyErrorObject(c,shared.syntaxerr);
6195 return;
6196 }
6197 j++;
6198 password = c->argv[j]->ptr;
6199 redactClientCommandArgument(c,j);
6200 } else if (!strcasecmp(c->argv[j]->ptr,"auth2")) {
6201 if (moreargs < 2) {
6202 addReplyErrorObject(c,shared.syntaxerr);
6203 return;
6204 }
6205 username = c->argv[++j]->ptr;
6206 redactClientCommandArgument(c,j);
6207 password = c->argv[++j]->ptr;
6208 redactClientCommandArgument(c,j);
6209 } else if (!strcasecmp(c->argv[j]->ptr,"keys")) {
6210 if (sdslen(c->argv[3]->ptr) != 0) {
6211 addReplyError(c,
6212 "When using MIGRATE KEYS option, the key argument"
6213 " must be set to the empty string");
6214 return;
6215 }
6216 first_key = j+1;
6217 num_keys = c->argc - j - 1;
6218 break; /* All the remaining args are keys. */
6219 } else {
6220 addReplyErrorObject(c,shared.syntaxerr);
6221 return;
6222 }
6223 }
6224
6225 /* Sanity check */
6226 if (getLongFromObjectOrReply(c,c->argv[5],&timeout,NULL) != C_OK ||
6227 getLongFromObjectOrReply(c,c->argv[4],&dbid,NULL) != C_OK)
6228 {
6229 return;
6230 }
6231 if (timeout <= 0) timeout = 1000;
6232
6233 /* Check if the keys are here. If at least one key is to be migrated, do it;
6234 * otherwise, if all the keys are missing, reply with "NOKEY" to signal
6235 * the caller there was nothing to migrate. We don't return an error in
6236 * this case, since often this is due to a normal condition like the key
6237 * expiring in the meantime. */
6238 ov = zrealloc(ov,sizeof(robj*)*num_keys);
6239 kv = zrealloc(kv,sizeof(robj*)*num_keys);
6240 int oi = 0;
6241
6242 for (j = 0; j < num_keys; j++) {
6243 if ((ov[oi] = lookupKeyRead(c->db,c->argv[first_key+j])) != NULL) {
6244 kv[oi] = c->argv[first_key+j];
6245 oi++;
6246 }
6247 }
6248 num_keys = oi;
6249 if (num_keys == 0) {
6250 zfree(ov); zfree(kv);
6251 addReplySds(c,sdsnew("+NOKEY\r\n"));
6252 return;
6253 }
6254
6255try_again:
6256 write_error = 0;
6257
6258 /* Connect */
6259 cs = migrateGetSocket(c,c->argv[1],c->argv[2],timeout);
6260 if (cs == NULL) {
6261 zfree(ov); zfree(kv);
6262 return; /* error sent to the client by migrateGetSocket() */
6263 }
6264
6265 rioInitWithBuffer(&cmd,sdsempty());
6266
6267 /* Authentication */
6268 if (password) {
6269 int arity = username ? 3 : 2;
6270 serverAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',arity));
6271 serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"AUTH",4));
6272 if (username) {
6273 serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,username,
6274 sdslen(username)));
6275 }
6276 serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,password,
6277 sdslen(password)));
6278 }
6279
6280 /* Send the SELECT command if the current DB is not already selected. */
6281 int select = cs->last_dbid != dbid; /* Should we emit SELECT? */
6282 if (select) {
6283 serverAssertWithInfo(c,NULL,rioWriteBulkCount(&cmd,'*',2));
6284 serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"SELECT",6));
6285 serverAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,dbid));
6286 }
6287
6288 int non_expired = 0; /* Number of keys that we'll find non expired.
6289 Note that serializing large keys may take some time,
6290 so certain keys that were found non expired by the
6291 lookupKey() function may have expired by then. */
6292
6293 /* Create RESTORE payload and generate the protocol to call the command. */
6294 for (j = 0; j < num_keys; j++) {
6295 long long ttl = 0;
6296 long long expireat = getExpire(c->db,kv[j]);
6297
6298 if (expireat != -1) {
6299 ttl = expireat-mstime();
6300 if (ttl < 0) {
6301 continue;
6302 }
6303 if (ttl < 1) ttl = 1;
6304 }
6305
6306 /* Relocate valid (non expired) keys and values into successive array
6307 * positions to remove the holes created by keys that were present
6308 * at the first lookup but turned out to be expired by the second lookup. */
6309 ov[non_expired] = ov[j];
6310 kv[non_expired++] = kv[j];
6311
6312 serverAssertWithInfo(c,NULL,
6313 rioWriteBulkCount(&cmd,'*',replace ? 5 : 4));
6314
6315 if (server.cluster_enabled)
6316 serverAssertWithInfo(c,NULL,
6317 rioWriteBulkString(&cmd,"RESTORE-ASKING",14));
6318 else
6319 serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"RESTORE",7));
6320 serverAssertWithInfo(c,NULL,sdsEncodedObject(kv[j]));
6321 serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,kv[j]->ptr,
6322 sdslen(kv[j]->ptr)));
6323 serverAssertWithInfo(c,NULL,rioWriteBulkLongLong(&cmd,ttl));
6324
6325 /* Emit the payload argument, that is the serialized object using
6326 * the DUMP format. */
6327 createDumpPayload(&payload,ov[j],kv[j],dbid);
6328 serverAssertWithInfo(c,NULL,
6329 rioWriteBulkString(&cmd,payload.io.buffer.ptr,
6330 sdslen(payload.io.buffer.ptr)));
6331 sdsfree(payload.io.buffer.ptr);
6332
6333 /* Add the REPLACE option to the RESTORE command if it was specified
6334 * as a MIGRATE option. */
6335 if (replace)
6336 serverAssertWithInfo(c,NULL,rioWriteBulkString(&cmd,"REPLACE",7));
6337 }
6338
6339 /* Fix the actual number of keys we are migrating. */
6340 num_keys = non_expired;
6341
6342 /* Transfer the query to the other node in 64K chunks. */
6343 errno = 0;
6344 {
6345 sds buf = cmd.io.buffer.ptr;
6346 size_t pos = 0, towrite;
6347 int nwritten = 0;
6348
6349 while ((towrite = sdslen(buf)-pos) > 0) {
6350 towrite = (towrite > (64*1024) ? (64*1024) : towrite);
6351 nwritten = connSyncWrite(cs->conn,buf+pos,towrite,timeout);
6352 if (nwritten != (signed)towrite) {
6353 write_error = 1;
6354 goto socket_err;
6355 }
6356 pos += nwritten;
6357 }
6358 }
6359
6360 char buf0[1024]; /* Auth reply. */
6361 char buf1[1024]; /* Select reply. */
6362 char buf2[1024]; /* Restore reply. */
6363
6364 /* Read the AUTH reply if needed. */
6365 if (password && connSyncReadLine(cs->conn, buf0, sizeof(buf0), timeout) <= 0)
6366 goto socket_err;
6367
6368 /* Read the SELECT reply if needed. */
6369 if (select && connSyncReadLine(cs->conn, buf1, sizeof(buf1), timeout) <= 0)
6370 goto socket_err;
6371
6372 /* Read the RESTORE replies. */
6373 int error_from_target = 0;
6374 int socket_error = 0;
6375 int del_idx = 1; /* Index of the key argument for the replicated DEL op. */
6376
6377 /* Allocate the new argument vector that will replace the current command,
6378 * to propagate the MIGRATE as a DEL command (if no COPY option was given).
6379 * We allocate num_keys+1 because the additional argument is for the "DEL"
6380 * command name itself. */
6381 if (!copy) newargv = zmalloc(sizeof(robj*)*(num_keys+1));
6382
6383 for (j = 0; j < num_keys; j++) {
6384 if (connSyncReadLine(cs->conn, buf2, sizeof(buf2), timeout) <= 0) {
6385 socket_error = 1;
6386 break;
6387 }
6388 if ((password && buf0[0] == '-') ||
6389 (select && buf1[0] == '-') ||
6390 buf2[0] == '-')
6391 {
6392 /* On error assume that last_dbid is no longer valid. */
6393 if (!error_from_target) {
6394 cs->last_dbid = -1;
6395 char *errbuf;
6396 if (password && buf0[0] == '-') errbuf = buf0;
6397 else if (select && buf1[0] == '-') errbuf = buf1;
6398 else errbuf = buf2;
6399
6400 error_from_target = 1;
6401 addReplyErrorFormat(c,"Target instance replied with error: %s",
6402 errbuf+1);
6403 }
6404 } else {
6405 if (!copy) {
6406 /* No COPY option: remove the local key, signal the change. */
6407 dbDelete(c->db,kv[j]);
6408 signalModifiedKey(c,c->db,kv[j]);
6409 notifyKeyspaceEvent(NOTIFY_GENERIC,"del",kv[j],c->db->id);
6410 server.dirty++;
6411
6412 /* Populate the argument vector to replace the old one. */
6413 newargv[del_idx++] = kv[j];
6414 incrRefCount(kv[j]);
6415 }
6416 }
6417 }
6418
6419 /* On socket error, if we want to retry, do it now before rewriting the
6420 * command vector. We only retry if we are sure nothing was processed
6421 * and we failed to read the first reply (j == 0 test). */
6422 if (!error_from_target && socket_error && j == 0 && may_retry &&
6423 errno != ETIMEDOUT)
6424 {
6425 goto socket_err; /* A retry is guaranteed because of tested conditions.*/
6426 }
6427
6428 /* On socket errors, close the migration socket now, while we still have
6429 * the original host/port in the ARGV. Later the original command may be
6430 * rewritten to DEL and it will be too late. */
6431 if (socket_error) migrateCloseSocket(c->argv[1],c->argv[2]);
6432
6433 if (!copy) {
6434 /* Translate MIGRATE as DEL for replication/AOF. Note that we do
6435 * this only for the keys for which we received an acknowledgement
6436 * from the receiving Redis server, by using the del_idx index. */
6437 if (del_idx > 1) {
6438 newargv[0] = createStringObject("DEL",3);
6439 /* Note that the following call takes ownership of newargv. */
6440 replaceClientCommandVector(c,del_idx,newargv);
6441 argv_rewritten = 1;
6442 } else {
6443 /* No key transfer acknowledged, no need to rewrite as DEL. */
6444 zfree(newargv);
6445 }
6446 newargv = NULL; /* Make it safe to call zfree() on it in the future. */
6447 }
6448
6449 /* If we are here and a socket error happened, we don't want to retry.
6450 * Just signal the problem to the client, but only do it if we did not
6451 * already queue a different error reported by the destination server. */
6452 if (!error_from_target && socket_error) {
6453 may_retry = 0;
6454 goto socket_err;
6455 }
6456
6457 if (!error_from_target) {
6458 /* Success! Update the last_dbid in migrateCachedSocket, so that we can
6459 * avoid SELECT the next time if the target DB is the same. Reply +OK.
6460 *
6461 * Note: If we reached this point, even if socket_error is true
6462 * the SELECT command still succeeded (otherwise the code jumps to
6463 * the socket_err label). */
6464 cs->last_dbid = dbid;
6465 addReply(c,shared.ok);
6466 } else {
6467 /* On error we already replied in the for loop above, and we set
6468 * last_dbid to -1 to force a SELECT the next time. */
6469 }
6470
6471 sdsfree(cmd.io.buffer.ptr);
6472 zfree(ov); zfree(kv); zfree(newargv);
6473 return;
6474
6475 /* On socket errors we close the cached socket and try again.
6476 * It is very common for the cached socket to get closed; if simply
6477 * reopening it works, there is no reason to report the error to the caller. */
6478socket_err:
6479 /* Cleanup we want to perform in both the retry and no retry case.
6480 * Note: Closing the migrate socket will also force SELECT next time. */
6481 sdsfree(cmd.io.buffer.ptr);
6482
6483 /* If the command was rewritten as DEL and there was a socket error,
6484 * we already closed the socket earlier. While migrateCloseSocket()
6485 * is idempotent, the host/port arguments are now gone, so don't do it
6486 * again. */
6487 if (!argv_rewritten) migrateCloseSocket(c->argv[1],c->argv[2]);
6488 zfree(newargv);
6489 newargv = NULL; /* This will get reallocated on retry. */
6490
6491 /* Retry only if it's not a timeout and we never attempted a retry
6492 * (or the code jumping here did not set may_retry to zero). */
6493 if (errno != ETIMEDOUT && may_retry) {
6494 may_retry = 0;
6495 goto try_again;
6496 }
6497
6498 /* Cleanup we want to do if no retry is attempted. */
6499 zfree(ov); zfree(kv);
6500 addReplyErrorSds(c, sdscatprintf(sdsempty(),
6501 "-IOERR error or timeout %s to target instance",
6502 write_error ? "writing" : "reading"));
6503 return;
6504}
6505
6506/* -----------------------------------------------------------------------------
6507 * Cluster functions related to serving / redirecting clients
6508 * -------------------------------------------------------------------------- */
6509
6510/* The ASKING command is required after a -ASK redirection.
6511 * The client should issue ASKING before actually sending the command to
6512 * the target instance. See the Redis Cluster specification for more
6513 * information. */
6514void askingCommand(client *c) {
6515 if (server.cluster_enabled == 0) {
6516 addReplyError(c,"This instance has cluster support disabled");
6517 return;
6518 }
6519 c->flags |= CLIENT_ASKING;
6520 addReply(c,shared.ok);
6521}
6522
6523/* The READONLY command is used by clients to enter the read-only mode.
6524 * In this mode slaves will not redirect clients as long as clients use
6525 * read-only commands to access keys that are served by the slave's master. */
6526void readonlyCommand(client *c) {
6527 if (server.cluster_enabled == 0) {
6528 addReplyError(c,"This instance has cluster support disabled");
6529 return;
6530 }
6531 c->flags |= CLIENT_READONLY;
6532 addReply(c,shared.ok);
6533}
6534
6535/* The READWRITE command just clears the READONLY command state. */
6536void readwriteCommand(client *c) {
6537 if (server.cluster_enabled == 0) {
6538 addReplyError(c,"This instance has cluster support disabled");
6539 return;
6540 }
6541 c->flags &= ~CLIENT_READONLY;
6542 addReply(c,shared.ok);
6543}
6544
6545/* Return the pointer to the cluster node that is able to serve the command.
6546 * For the function to succeed the command should only target either:
6547 *
6548 * 1) A single key (even multiple times like RPOPLPUSH mylist mylist).
6549 * 2) Multiple keys in the same hash slot, while the slot is stable (no
6550 * resharding in progress).
6551 *
6552 * On success the function returns the node that is able to serve the request.
6553 * If the node is not 'myself' a redirection must be performed. The kind of
6554 * redirection is specified by setting the integer passed by reference
6555 * 'error_code', which will be set to CLUSTER_REDIR_ASK or
6556 * CLUSTER_REDIR_MOVED.
6557 *
6558 * When the node is 'myself' 'error_code' is set to CLUSTER_REDIR_NONE.
6559 *
6560 * If the command fails NULL is returned, and the reason of the failure is
6561 * provided via 'error_code', which will be set to:
6562 *
6563 * CLUSTER_REDIR_CROSS_SLOT if the request contains multiple keys that
6564 * don't belong to the same hash slot.
6565 *
6566 * CLUSTER_REDIR_UNSTABLE if the request contains multiple keys
6567 * belonging to the same slot, but the slot is not stable (in migration or
6568 * importing state, likely because a resharding is in progress).
6569 *
6570 * CLUSTER_REDIR_DOWN_UNBOUND if the request addresses a slot which is
6571 * not bound to any node. In this case the cluster global state should be
6572 * already "down" but it is fragile to rely on the update of the global state,
6573 * so we also handle it here.
6574 *
6575 * CLUSTER_REDIR_DOWN_STATE and CLUSTER_REDIR_DOWN_RO_STATE if the cluster is
6576 * down but the user attempts to execute a command that addresses one or more keys. */
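/* For example MGET foo bar fails with CLUSTER_REDIR_CROSS_SLOT, since "foo"
 * and "bar" hash to different slots, while MGET {user}foo {user}bar can be
 * served by a single node, because the hash tag forces both keys into the
 * slot of "user" (key names here are just illustrative). */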
6577clusterNode *getNodeByQuery(client *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) {
6578 clusterNode *n = NULL;
6579 robj *firstkey = NULL;
6580 int multiple_keys = 0;
6581 multiState *ms, _ms;
6582 multiCmd mc;
6583 int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0,
6584 existing_keys = 0;
6585
6586 /* Allow any key to be set if a module disabled cluster redirections: this is useful when writing a module that implements a completely different distributed system. */
6587 if (server.cluster_module_flags & CLUSTER_MODULE_FLAG_NO_REDIRECTION)
6588 return myself;
6589
6590 /* Set error code optimistically for the base case. */
6591 if (error_code) *error_code = CLUSTER_REDIR_NONE;
6592
6597 /* We handle all the cases as if they were EXEC commands, so we have
6598 * a common code path for everything */
6599 if (cmd->proc == execCommand) {
6600 /* If CLIENT_MULTI flag is not set EXEC is just going to return an
6601 * error. */
6602 if (!(c->flags & CLIENT_MULTI)) return myself;
6603 ms = &c->mstate;
6604 } else {
6605 /* Create a fake Multi State structure if the client is not in
6606 * MULTI/EXEC state: this way we have a single codepath below to
6607 * handle both cases. */
6608 ms = &_ms;
6609 _ms.commands = &mc;
6610 _ms.count = 1;
6611 mc.argv = argv;
6612 mc.argc = argc;
6613 mc.cmd = cmd;
6614 }
6615
6616 int is_pubsubshard = cmd->proc == ssubscribeCommand ||
6617 cmd->proc == sunsubscribeCommand ||
6618 cmd->proc == spublishCommand;
6619
6620 /* Check that all the keys are in the same hash slot, and obtain this
6621 * slot and the node associated. */
6622 for (i = 0; i < ms->count; i++) {
6623 struct redisCommand *mcmd;
6624 robj **margv;
6625 int margc, numkeys, j;
6626 keyReference *keyindex;
6627
6628 mcmd = ms->commands[i].cmd;
6629 margc = ms->commands[i].argc;
6630 margv = ms->commands[i].argv;
6631
6632 getKeysResult result = GETKEYS_RESULT_INIT;
6633 numkeys = getKeysFromCommand(mcmd,margv,margc,&result);
6634 keyindex = result.keys;
6635
6636 for (j = 0; j < numkeys; j++) {
6637 robj *thiskey = margv[keyindex[j].pos];
6638 int thisslot = keyHashSlot((char*)thiskey->ptr,
6639 sdslen(thiskey->ptr));
6640
6641 if (firstkey == NULL) {
6642 /* This is the first key we see. Check its slot and the
6643 * node serving it. */
6644 firstkey = thiskey;
6645 slot = thisslot;
6646 n = server.cluster->slots[slot];
6647
6648 /* Error: If a slot is not served, we are in "cluster down"
6649 * state. However the state is yet to be updated, so this was
6650 * not trapped earlier in processCommand(). Report the same
6651 * error to the client. */
6652 if (n == NULL) {
6653 getKeysFreeResult(&result);
6654 if (error_code)
6655 *error_code = CLUSTER_REDIR_DOWN_UNBOUND;
6656 return NULL;
6657 }
6658
6659 /* If we are migrating or importing this slot, we need to check
6660 * if we have all the keys in the request (the only way we
6661 * can safely serve the request, otherwise we return a TRYAGAIN
6662 * error). To do so we set the importing/migrating state and
6663 * increment a counter for every missing key. */
6664 if (n == myself &&
6665 server.cluster->migrating_slots_to[slot] != NULL)
6666 {
6667 migrating_slot = 1;
6668 } else if (server.cluster->importing_slots_from[slot] != NULL) {
6669 importing_slot = 1;
6670 }
6671 } else {
6672 /* If it is not the first key/channel, make sure it is exactly
6673 * the same key/channel as the first we saw. */
6674 if (!equalStringObjects(firstkey,thiskey)) {
6675 if (slot != thisslot) {
6676 /* Error: multiple keys from different slots. */
6677 getKeysFreeResult(&result);
6678 if (error_code)
6679 *error_code = CLUSTER_REDIR_CROSS_SLOT;
6680 return NULL;
6681 } else {
6682 /* Flag this request as one with multiple different
6683 * keys/channels. */
6684 multiple_keys = 1;
6685 }
6686 }
6687 }
6688
6689 /* Migrating / Importing slot? Count keys we don't have.
6690 * If it is a pubsubshard command, there is no need to check
6691 * whether the channel is present in the node during the
6692 * slot migration: the channel will be served from the source
6693 * node until the migration completes with CLUSTER SETSLOT <slot>
6694 * NODE <node-id>. */
6695 int flags = LOOKUP_NOTOUCH | LOOKUP_NOSTATS | LOOKUP_NONOTIFY;
6696 if ((migrating_slot || importing_slot) && !is_pubsubshard)
6697 {
6698 if (lookupKeyReadWithFlags(&server.db[0], thiskey, flags) == NULL) missing_keys++;
6699 else existing_keys++;
6700 }
6701 }
6702 getKeysFreeResult(&result);
6703 }
6704
6705 /* No keys at all in the command? Then we can serve the request
6706 * without redirections or errors, in all cases. */
6707 if (n == NULL) return myself;
6708
6709 /* Cluster is globally down but we got keys? We only serve the request
6710 * if it is a read command and allow_reads_when_down is enabled. */
6711 if (server.cluster->state != CLUSTER_OK) {
6712 if (is_pubsubshard) {
6713 if (!server.cluster_allow_pubsubshard_when_down) {
6714 if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE;
6715 return NULL;
6716 }
6717 } else if (!server.cluster_allow_reads_when_down) {
6718 /* The cluster is configured to block commands when the
6719 * cluster is down. */
6720 if (error_code) *error_code = CLUSTER_REDIR_DOWN_STATE;
6721 return NULL;
6722 } else if (cmd->flags & CMD_WRITE) {
6723 /* The cluster is configured to allow read only commands, but this is a write: reject it. */
6724 if (error_code) *error_code = CLUSTER_REDIR_DOWN_RO_STATE;
6725 return NULL;
6726 } else {
6727 /* Fall through and allow the command to be executed:
6728 * this happens when server.cluster_allow_reads_when_down is
6729 * true and the command is not a write command. */
6730 }
6731 }
6732
6733 /* Return the hashslot by reference. */
6734 if (hashslot) *hashslot = slot;
6735
6736 /* MIGRATE always works in the context of the local node if the slot
6737 * is open (migrating or importing state). We need to be able to freely
6738 * move keys among instances in this case. */
6739 if ((migrating_slot || importing_slot) && cmd->proc == migrateCommand)
6740 return myself;
6741
6742 /* If we don't have all the keys and we are migrating the slot, send
6743 * an ASK redirection or TRYAGAIN. */
6744 if (migrating_slot && missing_keys) {
6745 /* If we have keys but we don't have all keys, we return TRYAGAIN */
6746 if (existing_keys) {
6747 if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE;
6748 return NULL;
6749 } else {
6750 if (error_code) *error_code = CLUSTER_REDIR_ASK;
6751 return server.cluster->migrating_slots_to[slot];
6752 }
6753 }
6754
6755 /* If we are receiving the slot, and the client correctly flagged the
6756 * request as "ASKING", we can serve the request. However if the request
6757 * involves multiple keys and we don't have them all, the only option is
6758 * to send a TRYAGAIN error. */
6759 if (importing_slot &&
6760 (c->flags & CLIENT_ASKING || cmd->flags & CMD_ASKING))
6761 {
6762 if (multiple_keys && missing_keys) {
6763 if (error_code) *error_code = CLUSTER_REDIR_UNSTABLE;
6764 return NULL;
6765 } else {
6766 return myself;
6767 }
6768 }
6769
6770 /* Handle the read-only client case reading from a slave: if this
6771 * node is a slave and the request is about a hash slot our master
6772 * is serving, we can reply without redirection. */
6773 int is_write_command = (c->cmd->flags & CMD_WRITE) ||
6774 (c->cmd->proc == execCommand && (c->mstate.cmd_flags & CMD_WRITE));
6775 if (((c->flags & CLIENT_READONLY) || is_pubsubshard) &&
6776 !is_write_command &&
6777 nodeIsSlave(myself) &&
6778 myself->slaveof == n)
6779 {
6780 return myself;
6781 }
6782
6783 /* Base case: just return the right node. However if this node is not
6784 * myself, set error_code to MOVED since we need to issue a redirection. */
6785 if (n != myself && error_code) *error_code = CLUSTER_REDIR_MOVED;
6786 return n;
6787}
6788
6789/* Send the client the right redirection code, according to error_code
6790 * that should be set to one of CLUSTER_REDIR_* macros.
6791 *
6792 * If CLUSTER_REDIR_ASK or CLUSTER_REDIR_MOVED error codes
6793 * are used, then the node 'n' should not be NULL, but should be the
6794 * node we want to mention in the redirection. Moreover hashslot should
6795 * be set to the hash slot that caused the redirection. */
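/* As an example, a client addressing a key served by another node may receive
 * a reply like the following (slot and address are illustrative):
 *
 *   -MOVED 3999 127.0.0.1:6381
 *
 * while during a resharding of slot 3999 it may instead receive an ASK
 * redirection, after which it should send ASKING followed by the original
 * command to the indicated node:
 *
 *   -ASK 3999 127.0.0.1:6381
 */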
6796void clusterRedirectClient(client *c, clusterNode *n, int hashslot, int error_code) {
6797 if (error_code == CLUSTER_REDIR_CROSS_SLOT) {
6798 addReplyError(c,"-CROSSSLOT Keys in request don't hash to the same slot");
6799 } else if (error_code == CLUSTER_REDIR_UNSTABLE) {
6800 /* The request spans multiple keys in the same slot,
6801 * but the slot is not "stable" currently as there is
6802 * a migration or import in progress. */
6803 addReplyError(c,"-TRYAGAIN Multiple keys request during rehashing of slot");
6804 } else if (error_code == CLUSTER_REDIR_DOWN_STATE) {
6805 addReplyError(c,"-CLUSTERDOWN The cluster is down");
6806 } else if (error_code == CLUSTER_REDIR_DOWN_RO_STATE) {
6807 addReplyError(c,"-CLUSTERDOWN The cluster is down and only accepts read commands");
6808 } else if (error_code == CLUSTER_REDIR_DOWN_UNBOUND) {
6809 addReplyError(c,"-CLUSTERDOWN Hash slot not served");
6810 } else if (error_code == CLUSTER_REDIR_MOVED ||
6811 error_code == CLUSTER_REDIR_ASK)
6812 {
6813 /* Redirect to IP:port. Include plaintext port if cluster is TLS but
6814 * client is non-TLS. */
6815 int use_pport = (server.tls_cluster &&
6816 c->conn && connGetType(c->conn) != CONN_TYPE_TLS);
6817 int port = use_pport && n->pport ? n->pport : n->port;
6818 addReplyErrorSds(c,sdscatprintf(sdsempty(),
6819 "-%s %d %s:%d",
6820 (error_code == CLUSTER_REDIR_ASK) ? "ASK" : "MOVED",
6821 hashslot, getPreferredEndpoint(n), port));
6822 } else {
6823 serverPanic("getNodeByQuery() unknown error.");
6824 }
6825}
6826
6827/* This function is called by the function processing clients incrementally
6828 * to detect timeouts, in order to handle the following case:
6829 *
6830 * 1) A client blocks with BLPOP or similar blocking operation.
6831 * 2) The master migrates the hash slot elsewhere or turns into a slave.
6832 * 3) The client may remain blocked forever (or up to the max timeout time)
6833 * waiting for a key change that will never happen.
6834 *
6835 * If the client is found to be blocked on a hash slot this node no
6836 * longer handles, the client is sent a redirection error, and the function
6837 * returns 1. Otherwise 0 is returned and no operation is performed. */
6838int clusterRedirectBlockedClientIfNeeded(client *c) {
6839 if (c->flags & CLIENT_BLOCKED &&
6840 (c->btype == BLOCKED_LIST ||
6841 c->btype == BLOCKED_ZSET ||
6842 c->btype == BLOCKED_STREAM ||
6843 c->btype == BLOCKED_MODULE))
6844 {
6845 dictEntry *de;
6846 dictIterator *di;
6847
6848 /* If the cluster is down, unblock the client with the right error.
6849 * If the cluster is configured to allow reads on cluster down, we
6850 * still want to emit this error since a write will be required
6851 * to unblock them, and that write may never arrive. */
6852 if (server.cluster->state == CLUSTER_FAIL) {
6853 clusterRedirectClient(c,NULL,0,CLUSTER_REDIR_DOWN_STATE);
6854 return 1;
6855 }
6856
6857 /* If the client is blocked on a module, but not on a specific key,
6858 * don't unblock it (except for the CLUSTER_FAIL case above). */
6859 if (c->btype == BLOCKED_MODULE && !moduleClientIsBlockedOnKeys(c))
6860 return 0;
6861
6862 /* All keys must belong to the same slot, so check first key only. */
6863 di = dictGetIterator(c->bpop.keys);
6864 if ((de = dictNext(di)) != NULL) {
6865 robj *key = dictGetKey(de);
6866 int slot = keyHashSlot((char*)key->ptr, sdslen(key->ptr));
6867 clusterNode *node = server.cluster->slots[slot];
6868
6869 /* If the client is read-only and attempting to access a key that
6870 * this replica can serve, allow it. */
6871 if ((c->flags & CLIENT_READONLY) &&
6872 !(c->lastcmd->flags & CMD_WRITE) &&
6873 nodeIsSlave(myself) && myself->slaveof == node)
6874 {
6875 node = myself;
6876 }
6877
6878 /* We send an error and unblock the client if:
6879 * 1) The slot is unassigned, emitting a cluster down error.
6880 * 2) The slot is not handled by this node, nor being imported. */
6881 if (node != myself &&
6882 server.cluster->importing_slots_from[slot] == NULL)
6883 {
6884 if (node == NULL) {
6885 clusterRedirectClient(c,NULL,0,
6886 CLUSTER_REDIR_DOWN_UNBOUND);
6887 } else {
6888 clusterRedirectClient(c,node,slot,
6889 CLUSTER_REDIR_MOVED);
6890 }
6891 dictReleaseIterator(di);
6892 return 1;
6893 }
6894 }
6895 dictReleaseIterator(di);
6896 }
6897 return 0;
6898}
6899
6900 /* Slot to Key API. This is used by Redis Cluster in order to quickly
6901 * obtain a key that belongs to a specified hash slot. This is useful
6902 * while rehashing the cluster and in other conditions when we need to
6903 * understand if we have keys for a given hash slot. */
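/* The mapping is kept as one doubly linked list per slot, threaded directly
 * through the dict entries of the main dictionary, e.g. (a sketch):
 *
 *   by_slot[slot].head -> entryA <-> entryB <-> entryC -> NULL
 *
 * so adding or removing a key is O(1), and listing the keys of a slot does
 * not require scanning the whole keyspace. */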
6904
6905void slotToKeyAddEntry(dictEntry *entry, redisDb *db) {
6906 sds key = entry->key;
6907 unsigned int hashslot = keyHashSlot(key, sdslen(key));
6908 slotToKeys *slot_to_keys = &(*db->slots_to_keys).by_slot[hashslot];
6909 slot_to_keys->count++;
6910
6911 /* Insert entry before the first element in the list. */
6912 dictEntry *first = slot_to_keys->head;
6913 dictEntryNextInSlot(entry) = first;
6914 if (first != NULL) {
6915 serverAssert(dictEntryPrevInSlot(first) == NULL);
6916 dictEntryPrevInSlot(first) = entry;
6917 }
6918 serverAssert(dictEntryPrevInSlot(entry) == NULL);
6919 slot_to_keys->head = entry;
6920}
6921
6922void slotToKeyDelEntry(dictEntry *entry, redisDb *db) {
6923 sds key = entry->key;
6924 unsigned int hashslot = keyHashSlot(key, sdslen(key));
6925 slotToKeys *slot_to_keys = &(*db->slots_to_keys).by_slot[hashslot];
6926 slot_to_keys->count--;
6927
6928 /* Connect previous and next entries to each other. */
6929 dictEntry *next = dictEntryNextInSlot(entry);
6930 dictEntry *prev = dictEntryPrevInSlot(entry);
6931 if (next != NULL) {
6932 dictEntryPrevInSlot(next) = prev;
6933 }
6934 if (prev != NULL) {
6935 dictEntryNextInSlot(prev) = next;
6936 } else {
6937 /* The removed entry was the first in the list. */
6938 serverAssert(slot_to_keys->head == entry);
6939 slot_to_keys->head = next;
6940 }
6941}
6942
6943/* Updates neighbour entries when an entry has been replaced (e.g. reallocated
6944 * during active defrag). */
6945void slotToKeyReplaceEntry(dictEntry *entry, redisDb *db) {
6946 dictEntry *next = dictEntryNextInSlot(entry);
6947 dictEntry *prev = dictEntryPrevInSlot(entry);
6948 if (next != NULL) {
6949 dictEntryPrevInSlot(next) = entry;
6950 }
6951 if (prev != NULL) {
6952 dictEntryNextInSlot(prev) = entry;
6953 } else {
6954 /* The replaced entry was the first in the list. */
6955 sds key = entry->key;
6956 unsigned int hashslot = keyHashSlot(key, sdslen(key));
6957 slotToKeys *slot_to_keys = &(*db->slots_to_keys).by_slot[hashslot];
6958 slot_to_keys->head = entry;
6959 }
6960}
6961
6962/* Initialize slots-keys map of given db. */
6963void slotToKeyInit(redisDb *db) {
6964 db->slots_to_keys = zcalloc(sizeof(clusterSlotToKeyMapping));
6965}
6966
6967/* Empty slots-keys map of given db. */
6968void slotToKeyFlush(redisDb *db) {
6969 memset(db->slots_to_keys, 0,
6970 sizeof(clusterSlotToKeyMapping));
6971}
6972
6973/* Free slots-keys map of given db. */
6974void slotToKeyDestroy(redisDb *db) {
6975 zfree(db->slots_to_keys);
6976 db->slots_to_keys = NULL;
6977}
6978
6979/* Remove all the keys in the specified hash slot.
6980 * The number of removed items is returned. */
6981unsigned int delKeysInSlot(unsigned int hashslot) {
6982 unsigned int j = 0;
6983 dictEntry *de = (*server.db->slots_to_keys).by_slot[hashslot].head;
6984 while (de != NULL) {
6985 sds sdskey = dictGetKey(de);
6986 de = dictEntryNextInSlot(de);
6987 robj *key = createStringObject(sdskey, sdslen(sdskey));
6988 dbDelete(&server.db[0], key);
6989 decrRefCount(key);
6990 j++;
6991 }
6992 return j;
6993}
6994
6995unsigned int countKeysInSlot(unsigned int hashslot) {
6996 return (*server.db->slots_to_keys).by_slot[hashslot].count;
6997}
6998
6999/* -----------------------------------------------------------------------------
7000 * Operation(s) on channel rax tree.
7001 * -------------------------------------------------------------------------- */
7002
7003void slotToChannelUpdate(sds channel, int add) {
7004 size_t keylen = sdslen(channel);
7005 unsigned int hashslot = keyHashSlot(channel,keylen);
7006 unsigned char buf[64];
7007 unsigned char *indexed = buf;
7008
7009 if (keylen+2 > 64) indexed = zmalloc(keylen+2);
7010 indexed[0] = (hashslot >> 8) & 0xff;
7011 indexed[1] = hashslot & 0xff;
7012 memcpy(indexed+2,channel,keylen);
7013 if (add) {
7014 raxInsert(server.cluster->slots_to_channels,indexed,keylen+2,NULL,NULL);
7015 } else {
7016 raxRemove(server.cluster->slots_to_channels,indexed,keylen+2,NULL);
7017 }
7018 if (indexed != buf) zfree(indexed);
7019}
7020
7021void slotToChannelAdd(sds channel) {
7022 slotToChannelUpdate(channel,1);
7023}
7024
7025void slotToChannelDel(sds channel) {
7026 slotToChannelUpdate(channel,0);
7027}
7028
7029/* Get the count of the channels for a given slot. */
7030unsigned int countChannelsInSlot(unsigned int hashslot) {
7031 raxIterator iter;
7032 int j = 0;
7033 unsigned char indexed[2];
7034
7035 indexed[0] = (hashslot >> 8) & 0xff;
7036 indexed[1] = hashslot & 0xff;
7037 raxStart(&iter,server.cluster->slots_to_channels);
7038 raxSeek(&iter,">=",indexed,2);
7039 while(raxNext(&iter)) {
7040 if (iter.key[0] != indexed[0] || iter.key[1] != indexed[1]) break;
7041 j++;
7042 }
7043 raxStop(&iter);
7044 return j;
7045}
7046