1/*
2 * Copyright (c) 2022, Redis Ltd.
3 * Copyright (c) 2016, Salvatore Sanfilippo <antirez at gmail dot com>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * * Redistributions of source code must retain the above copyright notice,
10 * this list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * * Neither the name of Redis nor the names of its contributors may be used
15 * to endorse or promote products derived from this software without
16 * specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30#include "fmacros.h"
31#include "config.h"
32#include "syscheck.h"
33#include "sds.h"
34#include "anet.h"
35
36#include <time.h>
37#include <sys/resource.h>
38#include <unistd.h>
39#include <stdio.h>
40#include <stdlib.h>
41#include <string.h>
42#include <errno.h>
43#include <sys/wait.h>
44
45#ifdef __linux__
46#include <sys/mman.h>
47#endif
48
49
50#ifdef __linux__
51static sds read_sysfs_line(char *path) {
52 char buf[256];
53 FILE *f = fopen(path, "r");
54 if (!f) return NULL;
55 if (!fgets(buf, sizeof(buf), f)) {
56 fclose(f);
57 return NULL;
58 }
59 fclose(f);
60 sds res = sdsnew(buf);
61 res = sdstrim(res, " \n");
62 return res;
63}
64
65/* Verify our clokcsource implementation doesn't go through a system call (uses vdso).
66 * Going through a system call to check the time degrades Redis performance. */
67static int checkClocksource(sds *error_msg) {
68 unsigned long test_time_us, system_hz;
69 struct timespec ts;
70 unsigned long long start_us;
71 struct rusage ru_start, ru_end;
72
73 system_hz = sysconf(_SC_CLK_TCK);
74
75 if (getrusage(RUSAGE_SELF, &ru_start) != 0)
76 return 0;
77 if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) {
78 return 0;
79 }
80 start_us = (ts.tv_sec * 1000000 + ts.tv_nsec / 1000);
81
82 /* clock_gettime() busy loop of 5 times system tick (for a system_hz of 100 this is 50ms)
83 * Using system_hz is required to ensure accurate measurements from getrusage().
84 * If our clocksource is configured correctly (vdso) this will result in no system calls.
85 * If our clocksource is inefficient it'll waste most of the busy loop in the kernel. */
86 test_time_us = 5 * 1000000 / system_hz;
87 while (1) {
88 unsigned long long d;
89 if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
90 return 0;
91 d = (ts.tv_sec * 1000000 + ts.tv_nsec / 1000) - start_us;
92 if (d >= test_time_us) break;
93 }
94 if (getrusage(RUSAGE_SELF, &ru_end) != 0)
95 return 0;
96
97 long long stime_us = (ru_end.ru_stime.tv_sec * 1000000 + ru_end.ru_stime.tv_usec) - (ru_start.ru_stime.tv_sec * 1000000 + ru_start.ru_stime.tv_usec);
98 long long utime_us = (ru_end.ru_utime.tv_sec * 1000000 + ru_end.ru_utime.tv_usec) - (ru_start.ru_utime.tv_sec * 1000000 + ru_start.ru_utime.tv_usec);
99
100 /* If more than 10% of the process time was in system calls we probably have an inefficient clocksource, print a warning */
101 if (stime_us * 10 > stime_us + utime_us) {
102 sds avail = read_sysfs_line("/sys/devices/system/clocksource/clocksource0/available_clocksource");
103 sds curr = read_sysfs_line("/sys/devices/system/clocksource/clocksource0/current_clocksource");
104 *error_msg = sdscatprintf(sdsempty(),
105 "Slow system clocksource detected. This can result in degraded performance. "
106 "Consider changing the system's clocksource. "
107 "Current clocksource: %s. Available clocksources: %s. "
108 "For example: run the command 'echo tsc > /sys/devices/system/clocksource/clocksource0/current_clocksource' as root. "
109 "To permanently change the system's clocksource you'll need to set the 'clocksource=' kernel command line parameter.",
110 curr ? curr : "", avail ? avail : "");
111 sdsfree(avail);
112 sdsfree(curr);
113 return -1;
114 } else {
115 return 1;
116 }
117}
118
119/* Verify we're not using the `xen` clocksource. The xen hypervisor's default clocksource is slow and affects
120 * Redis's performance. This has been measured on ec2 xen based instances. ec2 recommends using the non-default
121 * tsc clock source for these instances. */
122int checkXenClocksource(sds *error_msg) {
123 sds curr = read_sysfs_line("/sys/devices/system/clocksource/clocksource0/current_clocksource");
124 int res = 1;
125 if (curr == NULL) {
126 res = 0;
127 } else if (strcmp(curr, "xen") == 0) {
128 *error_msg = sdsnew(
129 "Your system is configured to use the 'xen' clocksource which might lead to degraded performance. "
130 "Check the result of the [slow-clocksource] system check: run 'redis-server --check-system' to check if "
131 "the system's clocksource isn't degrading performance.");
132 res = -1;
133 }
134 sdsfree(curr);
135 return res;
136}
137
138/* Verify overcommit is enabled.
139 * When overcommit memory is disabled Linux will kill the forked child of a background save
140 * if we don't have enough free memory to satisfy double the current memory usage even though
141 * the forked child uses copy-on-write to reduce its actual memory usage. */
142int checkOvercommit(sds *error_msg) {
143 FILE *fp = fopen("/proc/sys/vm/overcommit_memory","r");
144 char buf[64];
145
146 if (!fp) return 0;
147 if (fgets(buf,64,fp) == NULL) {
148 fclose(fp);
149 return 0;
150 }
151 fclose(fp);
152
153 if (strtol(buf, NULL, 10) == 0) {
154 *error_msg = sdsnew(
155 "overcommit_memory is set to 0! Background save may fail under low memory condition. "
156 "To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the "
157 "command 'sysctl vm.overcommit_memory=1' for this to take effect.");
158 return -1;
159 } else {
160 return 1;
161 }
162}
163
164/* Make sure transparent huge pages aren't always enabled. When they are this can cause copy-on-write logic
165 * to consume much more memory and reduce performance during forks. */
166int checkTHPEnabled(sds *error_msg) {
167 char buf[1024];
168
169 FILE *fp = fopen("/sys/kernel/mm/transparent_hugepage/enabled","r");
170 if (!fp) return 0;
171 if (fgets(buf,sizeof(buf),fp) == NULL) {
172 fclose(fp);
173 return 0;
174 }
175 fclose(fp);
176
177 if (strstr(buf,"[always]") != NULL) {
178 *error_msg = sdsnew(
179 "You have Transparent Huge Pages (THP) support enabled in your kernel. "
180 "This will create latency and memory usage issues with Redis. "
181 "To fix this issue run the command 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled' as root, "
182 "and add it to your /etc/rc.local in order to retain the setting after a reboot. "
183 "Redis must be restarted after THP is disabled (set to 'madvise' or 'never').");
184 return -1;
185 } else {
186 return 1;
187 }
188}
189
190#ifdef __arm64__
/* Get size in kilobytes of the Shared_Dirty pages of the calling process for the
 * memory map corresponding to the provided address, or -1 on error.
 *
 * -1 is returned when /proc/self/smaps can't be opened, when no mapping
 * containing `addr` is found, or when the Shared_Dirty value can't be parsed. */
static int smapsGetSharedDirty(unsigned long addr) {
    static const char field[] = "Shared_Dirty:";
    int in_mapping = 0, val = -1;
    int at_line_start = 1;
    unsigned long from, to;
    char buf[256];
    FILE *f;

    f = fopen("/proc/self/smaps", "r");
    if (!f) return -1;

    while (fgets(buf, sizeof(buf), f)) {
        size_t len = strlen(buf);
        int is_line_start = at_line_start;

        /* A chunk that doesn't end with a newline means the line was longer
         * than the buffer; what follows is a continuation and must not be
         * parsed as if it were a fresh line. */
        at_line_start = (len > 0 && buf[len-1] == '\n');
        if (!is_line_start) continue;

        /* Mapping header lines start with "<from>-<to>" in hex. */
        if (sscanf(buf, "%lx-%lx", &from, &to) == 2) {
            in_mapping = from <= addr && addr < to;
            continue;
        }

        /* strncmp() stops at the terminating NUL, so short lines never cause
         * stale bytes of the buffer to take part in the comparison. */
        if (in_mapping && strncmp(buf, field, sizeof(field)-1) == 0) {
            /* If parsing fails, we report -1 */
            if (sscanf(buf, "%*s %d", &val) != 1) val = -1;
            break;
        }
    }

    fclose(f);
    return val;
}
220
221/* Older arm64 Linux kernels have a bug that could lead to data corruption
222 * during background save in certain scenarios. This function checks if the
223 * kernel is affected.
224 * The bug was fixed in commit ff1712f953e27f0b0718762ec17d0adb15c9fd0b
225 * titled: "arm64: pgtable: Ensure dirty bit is preserved across pte_wrprotect()"
226 */
227int checkLinuxMadvFreeForkBug(sds *error_msg) {
228 int ret, pipefd[2] = { -1, -1 };
229 pid_t pid;
230 char *p = NULL, *q;
231 int res = 1;
232 long page_size = sysconf(_SC_PAGESIZE);
233 long map_size = 3 * page_size;
234
235 /* Create a memory map that's in our full control (not one used by the allocator). */
236 p = mmap(NULL, map_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
237 if (p == MAP_FAILED) {
238 return 0;
239 }
240
241 q = p + page_size;
242
243 /* Split the memory map in 3 pages by setting their protection as RO|RW|RO to prevent
244 * Linux from merging this memory map with adjacent VMAs. */
245 ret = mprotect(q, page_size, PROT_READ | PROT_WRITE);
246 if (ret < 0) {
247 res = 0;
248 goto exit;
249 }
250
251 /* Write to the page once to make it resident */
252 *(volatile char*)q = 0;
253
254 /* Tell the kernel that this page is free to be reclaimed. */
255#ifndef MADV_FREE
256#define MADV_FREE 8
257#endif
258 ret = madvise(q, page_size, MADV_FREE);
259 if (ret < 0) {
260 /* MADV_FREE is not available on older kernels that are presumably
261 * not affected. */
262 if (errno == EINVAL) goto exit;
263
264 res = 0;
265 goto exit;
266 }
267
268 /* Write to the page after being marked for freeing, this is supposed to take
269 * ownership of that page again. */
270 *(volatile char*)q = 0;
271
272 /* Create a pipe for the child to return the info to the parent. */
273 ret = anetPipe(pipefd, 0, 0);
274 if (ret < 0) {
275 res = 0;
276 goto exit;
277 }
278
279 /* Fork the process. */
280 pid = fork();
281 if (pid < 0) {
282 res = 0;
283 goto exit;
284 } else if (!pid) {
285 /* Child: check if the page is marked as dirty, page_size in kb.
286 * A value of 0 means the kernel is affected by the bug. */
287 ret = smapsGetSharedDirty((unsigned long) q);
288 if (!ret)
289 res = -1;
290 else if (ret == -1) /* Failed to read */
291 res = 0;
292
293 ret = write(pipefd[1], &res, sizeof(res)); /* Assume success, ignore return value*/
294 exit(0);
295 } else {
296 /* Read the result from the child. */
297 ret = read(pipefd[0], &res, sizeof(res));
298 if (ret < 0) {
299 res = 0;
300 }
301
302 /* Reap the child pid. */
303 waitpid(pid, NULL, 0);
304 }
305
306exit:
307 /* Cleanup */
308 if (pipefd[0] != -1) close(pipefd[0]);
309 if (pipefd[1] != -1) close(pipefd[1]);
310 if (p != NULL) munmap(p, map_size);
311
312 if (res == -1)
313 *error_msg = sdsnew(
314 "Your kernel has a bug that could lead to data corruption during background save. "
315 "Please upgrade to the latest stable kernel.");
316
317 return res;
318}
319#endif /* __arm64__ */
320#endif /* __linux__ */
321
/*
 * Standard system check interface:
 * Each check has a name `name` and a function pointer `check_fn`.
 * `check_fn` should return:
 *   -1 in case the check fails.
 *   1 in case the check passes.
 *   0 in case the check could not be completed (usually because of some unexpected failed system call).
 * When (and only when) the check fails and -1 is returned, an error description is placed in a new sds pointed to by
 * the single `sds*` argument to `check_fn`. This message should be freed by the caller via `sdsfree()`.
 */
typedef struct {
    /* Label of the check; syscheck() prints it as "[name]...". */
    const char *name;
    /* The check itself, following the return value convention described above. */
    int (*check_fn)(sds*);
} check;
336
/* Table of the system checks run by syscheck(), in execution order. Which
 * entries exist depends on the platform the file is compiled for. The table
 * is terminated by an entry whose `check_fn` is NULL, which is where
 * syscheck() stops iterating. */
check checks[] = {
#ifdef __linux__
    {.name = "slow-clocksource", .check_fn = checkClocksource},
    {.name = "xen-clocksource", .check_fn = checkXenClocksource},
    {.name = "overcommit", .check_fn = checkOvercommit},
    {.name = "THP", .check_fn = checkTHPEnabled},
#ifdef __arm64__
    {.name = "madvise-free-fork-bug", .check_fn = checkLinuxMadvFreeForkBug},
#endif
#endif
    {.name = NULL, .check_fn = NULL}
};
349
350/* Performs various system checks, returns 0 if any check fails, 1 otherwise. */
351int syscheck(void) {
352 check *cur_check = checks;
353 int ret = 1;
354 sds err_msg = NULL;
355 while (cur_check->check_fn) {
356 int res = cur_check->check_fn(&err_msg);
357 printf("[%s]...", cur_check->name);
358 if (res == 0) {
359 printf("skipped\n");
360 } else if (res == 1) {
361 printf("OK\n");
362 } else {
363 printf("WARNING:\n");
364 printf("%s\n", err_msg);
365 sdsfree(err_msg);
366 ret = 0;
367 }
368 cur_check++;
369 }
370
371 return ret;
372}
373