1 | /* |
2 | * Copyright (c) 2022, Redis Ltd. |
3 | * Copyright (c) 2016, Salvatore Sanfilippo <antirez at gmail dot com> |
4 | * All rights reserved. |
5 | * |
6 | * Redistribution and use in source and binary forms, with or without |
7 | * modification, are permitted provided that the following conditions are met: |
8 | * |
9 | * * Redistributions of source code must retain the above copyright notice, |
10 | * this list of conditions and the following disclaimer. |
11 | * * Redistributions in binary form must reproduce the above copyright |
12 | * notice, this list of conditions and the following disclaimer in the |
13 | * documentation and/or other materials provided with the distribution. |
14 | * * Neither the name of Redis nor the names of its contributors may be used |
15 | * to endorse or promote products derived from this software without |
16 | * specific prior written permission. |
17 | * |
18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
19 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
20 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
21 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
22 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
23 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
24 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
25 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
26 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
27 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
28 | * POSSIBILITY OF SUCH DAMAGE. |
29 | */ |
30 | #include "fmacros.h" |
31 | #include "config.h" |
32 | #include "syscheck.h" |
33 | #include "sds.h" |
34 | #include "anet.h" |
35 | |
36 | #include <time.h> |
37 | #include <sys/resource.h> |
38 | #include <unistd.h> |
39 | #include <stdio.h> |
40 | #include <stdlib.h> |
41 | #include <string.h> |
42 | #include <errno.h> |
43 | #include <sys/wait.h> |
44 | |
45 | #ifdef __linux__ |
46 | #include <sys/mman.h> |
47 | #endif |
48 | |
49 | |
50 | #ifdef __linux__ |
51 | static sds read_sysfs_line(char *path) { |
52 | char buf[256]; |
53 | FILE *f = fopen(path, "r" ); |
54 | if (!f) return NULL; |
55 | if (!fgets(buf, sizeof(buf), f)) { |
56 | fclose(f); |
57 | return NULL; |
58 | } |
59 | fclose(f); |
60 | sds res = sdsnew(buf); |
61 | res = sdstrim(res, " \n" ); |
62 | return res; |
63 | } |
64 | |
65 | /* Verify our clokcsource implementation doesn't go through a system call (uses vdso). |
66 | * Going through a system call to check the time degrades Redis performance. */ |
67 | static int checkClocksource(sds *error_msg) { |
68 | unsigned long test_time_us, system_hz; |
69 | struct timespec ts; |
70 | unsigned long long start_us; |
71 | struct rusage ru_start, ru_end; |
72 | |
73 | system_hz = sysconf(_SC_CLK_TCK); |
74 | |
75 | if (getrusage(RUSAGE_SELF, &ru_start) != 0) |
76 | return 0; |
77 | if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) { |
78 | return 0; |
79 | } |
80 | start_us = (ts.tv_sec * 1000000 + ts.tv_nsec / 1000); |
81 | |
82 | /* clock_gettime() busy loop of 5 times system tick (for a system_hz of 100 this is 50ms) |
83 | * Using system_hz is required to ensure accurate measurements from getrusage(). |
84 | * If our clocksource is configured correctly (vdso) this will result in no system calls. |
85 | * If our clocksource is inefficient it'll waste most of the busy loop in the kernel. */ |
86 | test_time_us = 5 * 1000000 / system_hz; |
87 | while (1) { |
88 | unsigned long long d; |
89 | if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) |
90 | return 0; |
91 | d = (ts.tv_sec * 1000000 + ts.tv_nsec / 1000) - start_us; |
92 | if (d >= test_time_us) break; |
93 | } |
94 | if (getrusage(RUSAGE_SELF, &ru_end) != 0) |
95 | return 0; |
96 | |
97 | long long stime_us = (ru_end.ru_stime.tv_sec * 1000000 + ru_end.ru_stime.tv_usec) - (ru_start.ru_stime.tv_sec * 1000000 + ru_start.ru_stime.tv_usec); |
98 | long long utime_us = (ru_end.ru_utime.tv_sec * 1000000 + ru_end.ru_utime.tv_usec) - (ru_start.ru_utime.tv_sec * 1000000 + ru_start.ru_utime.tv_usec); |
99 | |
100 | /* If more than 10% of the process time was in system calls we probably have an inefficient clocksource, print a warning */ |
101 | if (stime_us * 10 > stime_us + utime_us) { |
102 | sds avail = read_sysfs_line("/sys/devices/system/clocksource/clocksource0/available_clocksource" ); |
103 | sds curr = read_sysfs_line("/sys/devices/system/clocksource/clocksource0/current_clocksource" ); |
104 | *error_msg = sdscatprintf(sdsempty(), |
105 | "Slow system clocksource detected. This can result in degraded performance. " |
106 | "Consider changing the system's clocksource. " |
107 | "Current clocksource: %s. Available clocksources: %s. " |
108 | "For example: run the command 'echo tsc > /sys/devices/system/clocksource/clocksource0/current_clocksource' as root. " |
109 | "To permanently change the system's clocksource you'll need to set the 'clocksource=' kernel command line parameter." , |
110 | curr ? curr : "" , avail ? avail : "" ); |
111 | sdsfree(avail); |
112 | sdsfree(curr); |
113 | return -1; |
114 | } else { |
115 | return 1; |
116 | } |
117 | } |
118 | |
119 | /* Verify we're not using the `xen` clocksource. The xen hypervisor's default clocksource is slow and affects |
120 | * Redis's performance. This has been measured on ec2 xen based instances. ec2 recommends using the non-default |
121 | * tsc clock source for these instances. */ |
122 | int checkXenClocksource(sds *error_msg) { |
123 | sds curr = read_sysfs_line("/sys/devices/system/clocksource/clocksource0/current_clocksource" ); |
124 | int res = 1; |
125 | if (curr == NULL) { |
126 | res = 0; |
127 | } else if (strcmp(curr, "xen" ) == 0) { |
128 | *error_msg = sdsnew( |
129 | "Your system is configured to use the 'xen' clocksource which might lead to degraded performance. " |
130 | "Check the result of the [slow-clocksource] system check: run 'redis-server --check-system' to check if " |
131 | "the system's clocksource isn't degrading performance." ); |
132 | res = -1; |
133 | } |
134 | sdsfree(curr); |
135 | return res; |
136 | } |
137 | |
138 | /* Verify overcommit is enabled. |
139 | * When overcommit memory is disabled Linux will kill the forked child of a background save |
140 | * if we don't have enough free memory to satisfy double the current memory usage even though |
141 | * the forked child uses copy-on-write to reduce its actual memory usage. */ |
142 | int checkOvercommit(sds *error_msg) { |
143 | FILE *fp = fopen("/proc/sys/vm/overcommit_memory" ,"r" ); |
144 | char buf[64]; |
145 | |
146 | if (!fp) return 0; |
147 | if (fgets(buf,64,fp) == NULL) { |
148 | fclose(fp); |
149 | return 0; |
150 | } |
151 | fclose(fp); |
152 | |
153 | if (strtol(buf, NULL, 10) == 0) { |
154 | *error_msg = sdsnew( |
155 | "overcommit_memory is set to 0! Background save may fail under low memory condition. " |
156 | "To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the " |
157 | "command 'sysctl vm.overcommit_memory=1' for this to take effect." ); |
158 | return -1; |
159 | } else { |
160 | return 1; |
161 | } |
162 | } |
163 | |
164 | /* Make sure transparent huge pages aren't always enabled. When they are this can cause copy-on-write logic |
165 | * to consume much more memory and reduce performance during forks. */ |
166 | int checkTHPEnabled(sds *error_msg) { |
167 | char buf[1024]; |
168 | |
169 | FILE *fp = fopen("/sys/kernel/mm/transparent_hugepage/enabled" ,"r" ); |
170 | if (!fp) return 0; |
171 | if (fgets(buf,sizeof(buf),fp) == NULL) { |
172 | fclose(fp); |
173 | return 0; |
174 | } |
175 | fclose(fp); |
176 | |
177 | if (strstr(buf,"[always]" ) != NULL) { |
178 | *error_msg = sdsnew( |
179 | "You have Transparent Huge Pages (THP) support enabled in your kernel. " |
180 | "This will create latency and memory usage issues with Redis. " |
181 | "To fix this issue run the command 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled' as root, " |
182 | "and add it to your /etc/rc.local in order to retain the setting after a reboot. " |
183 | "Redis must be restarted after THP is disabled (set to 'madvise' or 'never')." ); |
184 | return -1; |
185 | } else { |
186 | return 1; |
187 | } |
188 | } |
189 | |
190 | #ifdef __arm64__ |
/* Get size in kilobytes of the Shared_Dirty pages of the calling process for the
 * memory map corresponding to the provided address, or -1 on error.
 *
 * /proc/self/smaps is scanned line by line: a line that parses as "<hex>-<hex>"
 * updates whether `addr` lies in [from, to), and the first "Shared_Dirty:" line
 * seen while inside that range provides the value. -1 is also returned when no
 * such line is found or the file can't be opened. */
static int smapsGetSharedDirty(unsigned long addr) {
    static const char field[] = "Shared_Dirty:";
    char line[64];
    unsigned long lo, hi;
    int inside = 0;
    int kb = -1;
    FILE *fp = fopen("/proc/self/smaps", "r");

    if (fp == NULL) return -1;

    while (fgets(line, sizeof(line), fp) != NULL) {
        /* Lines that parse as an address range update the "inside" state. */
        if (sscanf(line, "%lx-%lx", &lo, &hi) == 2)
            inside = (lo <= addr && addr < hi);

        if (!inside) continue;

        if (strncmp(line, field, sizeof(field) - 1) == 0) {
            /* If parsing fails, kb stays at -1. */
            sscanf(line, "%*s %d", &kb);
            break;
        }
    }

    fclose(fp);
    return kb;
}
220 | |
221 | /* Older arm64 Linux kernels have a bug that could lead to data corruption |
222 | * during background save in certain scenarios. This function checks if the |
223 | * kernel is affected. |
224 | * The bug was fixed in commit ff1712f953e27f0b0718762ec17d0adb15c9fd0b |
225 | * titled: "arm64: pgtable: Ensure dirty bit is preserved across pte_wrprotect()" |
226 | */ |
227 | int checkLinuxMadvFreeForkBug(sds *error_msg) { |
228 | int ret, pipefd[2] = { -1, -1 }; |
229 | pid_t pid; |
230 | char *p = NULL, *q; |
231 | int res = 1; |
232 | long page_size = sysconf(_SC_PAGESIZE); |
233 | long map_size = 3 * page_size; |
234 | |
235 | /* Create a memory map that's in our full control (not one used by the allocator). */ |
236 | p = mmap(NULL, map_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); |
237 | if (p == MAP_FAILED) { |
238 | return 0; |
239 | } |
240 | |
241 | q = p + page_size; |
242 | |
243 | /* Split the memory map in 3 pages by setting their protection as RO|RW|RO to prevent |
244 | * Linux from merging this memory map with adjacent VMAs. */ |
245 | ret = mprotect(q, page_size, PROT_READ | PROT_WRITE); |
246 | if (ret < 0) { |
247 | res = 0; |
248 | goto exit; |
249 | } |
250 | |
251 | /* Write to the page once to make it resident */ |
252 | *(volatile char*)q = 0; |
253 | |
254 | /* Tell the kernel that this page is free to be reclaimed. */ |
255 | #ifndef MADV_FREE |
256 | #define MADV_FREE 8 |
257 | #endif |
258 | ret = madvise(q, page_size, MADV_FREE); |
259 | if (ret < 0) { |
260 | /* MADV_FREE is not available on older kernels that are presumably |
261 | * not affected. */ |
262 | if (errno == EINVAL) goto exit; |
263 | |
264 | res = 0; |
265 | goto exit; |
266 | } |
267 | |
268 | /* Write to the page after being marked for freeing, this is supposed to take |
269 | * ownership of that page again. */ |
270 | *(volatile char*)q = 0; |
271 | |
272 | /* Create a pipe for the child to return the info to the parent. */ |
273 | ret = anetPipe(pipefd, 0, 0); |
274 | if (ret < 0) { |
275 | res = 0; |
276 | goto exit; |
277 | } |
278 | |
279 | /* Fork the process. */ |
280 | pid = fork(); |
281 | if (pid < 0) { |
282 | res = 0; |
283 | goto exit; |
284 | } else if (!pid) { |
285 | /* Child: check if the page is marked as dirty, page_size in kb. |
286 | * A value of 0 means the kernel is affected by the bug. */ |
287 | ret = smapsGetSharedDirty((unsigned long) q); |
288 | if (!ret) |
289 | res = -1; |
290 | else if (ret == -1) /* Failed to read */ |
291 | res = 0; |
292 | |
293 | ret = write(pipefd[1], &res, sizeof(res)); /* Assume success, ignore return value*/ |
294 | exit(0); |
295 | } else { |
296 | /* Read the result from the child. */ |
297 | ret = read(pipefd[0], &res, sizeof(res)); |
298 | if (ret < 0) { |
299 | res = 0; |
300 | } |
301 | |
302 | /* Reap the child pid. */ |
303 | waitpid(pid, NULL, 0); |
304 | } |
305 | |
306 | exit: |
307 | /* Cleanup */ |
308 | if (pipefd[0] != -1) close(pipefd[0]); |
309 | if (pipefd[1] != -1) close(pipefd[1]); |
310 | if (p != NULL) munmap(p, map_size); |
311 | |
312 | if (res == -1) |
313 | *error_msg = sdsnew( |
314 | "Your kernel has a bug that could lead to data corruption during background save. " |
315 | "Please upgrade to the latest stable kernel." ); |
316 | |
317 | return res; |
318 | } |
319 | #endif /* __arm64__ */ |
320 | #endif /* __linux__ */ |
321 | |
/*
 * Standard system check interface:
 * Each check has a name `name` and a function pointer `check_fn`.
 * `check_fn` should return:
 *   -1 in case the check fails.
 *   1 in case the check passes.
 *   0 in case the check could not be completed (usually because of some unexpected failed system call).
 * When (and only when) the check fails and -1 is returned, an error description is placed in a new sds
 * string stored through the single `sds*` argument to `check_fn`. This message should be freed by the
 * caller via `sdsfree()`.
 */
typedef struct {
    const char *name;       /* Label printed for this check, e.g. "[name]...". */
    int (*check_fn)(sds*);  /* The check itself; see the return value contract above. */
} check;
336 | |
/* Table of the system checks executed by syscheck(), in order. The list is
 * terminated by an entry whose `check_fn` is NULL. All current checks are
 * compiled in only under __linux__, and the madvise/fork check additionally
 * requires __arm64__; on other platforms only the terminator remains. */
check checks[] = {
#ifdef __linux__
    {.name = "slow-clocksource" , .check_fn = checkClocksource},
    {.name = "xen-clocksource" , .check_fn = checkXenClocksource},
    {.name = "overcommit" , .check_fn = checkOvercommit},
    {.name = "THP" , .check_fn = checkTHPEnabled},
#ifdef __arm64__
    {.name = "madvise-free-fork-bug" , .check_fn = checkLinuxMadvFreeForkBug},
#endif
#endif
    /* Sentinel: marks the end of the table. */
    {.name = NULL, .check_fn = NULL}
};
349 | |
350 | /* Performs various system checks, returns 0 if any check fails, 1 otherwise. */ |
351 | int syscheck(void) { |
352 | check *cur_check = checks; |
353 | int ret = 1; |
354 | sds err_msg = NULL; |
355 | while (cur_check->check_fn) { |
356 | int res = cur_check->check_fn(&err_msg); |
357 | printf("[%s]..." , cur_check->name); |
358 | if (res == 0) { |
359 | printf("skipped\n" ); |
360 | } else if (res == 1) { |
361 | printf("OK\n" ); |
362 | } else { |
363 | printf("WARNING:\n" ); |
364 | printf("%s\n" , err_msg); |
365 | sdsfree(err_msg); |
366 | ret = 0; |
367 | } |
368 | cur_check++; |
369 | } |
370 | |
371 | return ret; |
372 | } |
373 | |