1/*******************************************************************************
2 * Copyright 2021 Intel Corporation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
16
17#if defined(DNNL_ENABLE_STACK_CHECKER)
18
19#ifndef __linux__
20#error "Stack checker is supported only on Linux"
21#endif
22
23#ifndef DNNL_ENABLE_CONCURRENT_EXEC
24#error "Stack checker requires using concurrent scratchpad"
25#endif
26
27#ifndef COMMON_STACK_CHECKER_HPP
28#define COMMON_STACK_CHECKER_HPP
29
30#include <cassert>
31#include <tuple>
32#include <type_traits>
33
34#include <pthread.h>
35#include <unistd.h>
36#include <sys/mman.h>
37
38#include "common/cpp_compat.hpp"
39#include "common/utils.hpp"
40
41namespace dnnl {
42namespace impl {
43namespace stack_checker {
44
45/* Stack checker
46 *
47 * The purpose of the stack checker is to get information about stack
48 * consumption per call stack.
49 *
50 * Motivation for introducing such a capability was excessive stack consumption
51 * for `dnnl_primitive_create`, `dnnl_primitive_execute` and GEMM APIs that
52 * resulted in a crash on the customer side.
53 *
54 * The stack checker is represented as `stack_checker_t` class. The class
55 * provides an interface called `check(...)` that is used to get the information
56 * about stack consumption.
57 * The stack checker has a capability to issue an error when the obtained
58 * stack consumption exceeds a specified limit.
59 *
60 * The stack checker can be configured with the following environment variables:
61 * - DNNL_SC_STACK_SIZE: specifies the size of the stack in bytes for the thread
62 * that runs a function that needs to be checked.
63 * The default is 8388608 bytes (8 MiB).
64 *
65 * - DNNL_SC_SOFT_STACK_LIMIT: specifies a soft limit in memory pages. When
66 * stack consumption exceeds the limit the stack checker prints an error
67 * message that contains the obtained stack consumption. The default is 5
68 * pages (20480 bytes).
69 *
70 * - DNNL_SC_HARD_STACK_LIMIT: specifies a hard limit in memory pages. When
71 * the limit is exceeded the SIGSEGV signal is raised. This can be used for
72 * debug purposes. For example, it can be used to get a place within the call
 * stack where the limit is exceeded. By default, the limit is equal to
 * `stack size` / `page size`, i.e. all memory is available.
76 *
77 * - DNNL_SC_TRACE: enables tracing. If the soft limit is exceeded and the
78 * tracing is enabled the stack checker prints an error message. The tracing
79 * is enabled by default.
80 *
81 * The `stack_checker_t` class has one constructor that takes an `std::string`
82 * which is printed out as part of the error message when soft limit is
83 * exceeded. This can be useful to give a context about the function that is
84 * being checked.
85 *
86 * Implementation details
87 *
88 * The stack checker uses pthread API to create a new thread with
89 * an application-managed stack. The application-managed stack is a memory
90 * buffer allocated by an application and designated as a stack via
91 * a certain pthread API. Since the stack checker has control over the
92 * memory buffer it can populate it with a particular pattern. Once
93 * the thread completed execution of the function being checked it can check
94 * how much memory was actually used for the stack by checking the pattern.
95 *
96 * The stack checker is disabled in the default build configuration. It can
97 * be enabled via CMake option `DNNL_ENABLE_STACK_CHECKER=ON` at the build time.
98 *
99 * Usage example
100 *
101 * ```cpp
102 * #include "common/stack_checker.hpp"
103 *
104 * void bar() {
105 * volatile char arr[1024] = {};
106 * }
107 *
108 * int foo(int *a, int &b, int c) {
109 * bar();
110 * return 0;
111 * }
112 *
113 * int main() {
114 * int x = 5;
115 * stack_checker::stack_checker_t sc("main");
116 * return sc.check(foo, &x, std::ref(x), x);
117 * }
118 * ```
119 * If the soft limit is 3 pages then the output of this code will be the
120 * following:
121 * === Stack checker: ERROR: 'main' consumed 14824 bytes of stack while the limit is 12288 bytes. ===
122 *
123 * Limitations:
124 * - There is only Linux support
125 * - The functions being checked should be non-member functions
126 * - Works only with the concurrent scratchpad because the global scratchpad is
127 * global per thread (thread local).
128 */
129
130template <typename F, typename... Targs>
131struct thread_args_t {
132 thread_args_t() = delete;
133 thread_args_t(const F &func, const Targs &... func_args)
134 : func(func)
135 , func_args(std::forward<Targs>(func_args)...)
136 , func_retval {} {}
137 const F &func;
138 std::tuple<Targs...> func_args;
139 typename cpp_compat::invoke_result<F *, Targs...>::type func_retval;
140};
141
// Returns the number of elements of the tuple type `T`. `T` may also be a
// reference to a tuple; the reference is stripped before the size is taken.
template <typename T>
constexpr size_t get_number_args() {
    using tuple_t = typename std::remove_reference<T>::type;
    return std::tuple_size<tuple_t>::value;
}
146
147// The executor_t is a helper class that is used to prepare arguments for
148// the function and call it.
149template <size_t i>
150struct executor_t {
151 template <typename T, typename... Targs>
152 static void execute(T &thread_args, Targs &... unpacked_func_args) {
153 const auto &func_args = thread_args.func_args;
154 constexpr size_t idx = get_number_args<decltype(func_args)>() - i;
155 executor_t<i - 1>::execute(thread_args,
156 std::forward<Targs>(unpacked_func_args)...,
157 std::get<idx>(func_args));
158 }
159};
160
161template <>
162struct executor_t<0> {
163 template <typename T, typename... Targs>
164 static void execute(T &thread_args, Targs &... unpacked_func_args) {
165 thread_args.func_retval
166 = thread_args.func(std::forward<Targs>(unpacked_func_args)...);
167 }
168};
169
170struct stack_checker_t {
171 stack_checker_t(const std::string &context) : context_(context) {}
172
173 template <typename F, typename... Targs>
174 typename cpp_compat::invoke_result<F *, Targs...>::type check(
175 const F &func, const Targs &... func_args) {
176
177 auto thread_args = utils::make_unique<thread_args_t<F, const Targs...>>(
178 func, std::forward<const Targs>(func_args)...);
179
180 int8_t *stack_buffer;
181 int res = posix_memalign(
182 (void **)&stack_buffer, get_page_size(), get_stack_size());
183 assert(res == 0);
184
185 std::memset(stack_buffer, pattern_, sizeof(int8_t) * get_stack_size());
186
187 // Stack grows downwards.
188 int8_t *stack_start = stack_buffer + get_stack_size();
189 int8_t *stack_end
190 = stack_start - get_page_size() * get_hard_stack_limit();
191 size_t protected_region
192 = get_stack_size() - get_page_size() * get_hard_stack_limit();
193
194 res = mprotect(
195 stack_end - protected_region, protected_region, PROT_NONE);
196 assert(res == 0);
197
198 pthread_t thread;
199 pthread_attr_t attr;
200 res = pthread_attr_init(&attr);
201 assert(res == 0);
202
203 res = pthread_attr_setstack(&attr, stack_buffer, get_stack_size());
204 assert(res == 0);
205
206 res = pthread_attr_setguardsize(&attr, 0);
207 assert(res == 0);
208
209 res = pthread_create(
210 &thread, &attr, worker<F, Targs...>, (void *)thread_args.get());
211 assert(res == 0);
212
213 void *stack_consumption_ptr = nullptr;
214 res = pthread_join(thread, &stack_consumption_ptr);
215 assert(res == 0);
216
217 auto stack_consumption
218 = reinterpret_cast<size_t>(stack_consumption_ptr);
219
220 if (is_trace_enabled()) {
221 size_t soft_stack_limit_in_bytes
222 = get_soft_stack_limit() * get_page_size();
223 if (stack_consumption > soft_stack_limit_in_bytes) {
224 printf("=== Stack checker: ERROR: '%s' consumed %lu bytes of "
225 "stack while the limit is %lu bytes. ===\n",
226 context_.c_str(), stack_consumption,
227 soft_stack_limit_in_bytes);
228 fflush(stdout);
229 }
230 }
231
232 res = pthread_attr_destroy(&attr);
233 assert(res == 0);
234 MAYBE_UNUSED(res);
235 // POSIX Thread standard: 2.9.8 Use of Application-Managed Thread Stacks
236 // The application grants to the implementation permanent ownership of
237 // and control over the application-managed stack when the attributes
238 // object in which the stack or stackaddr attribute has been set is used
239 // free(stack_buffer);
240
241 return thread_args->func_retval;
242 }
243
244private:
245 std::string context_;
246 static constexpr int8_t pattern_ = INT8_MAX;
247
248 // The worker function is a wrapper for the function being checked.
249 // The worker starts when a new thread is created.
250 template <typename F, typename... Types>
251 static void *worker(void *args) {
252 auto &thread_args
253 = *reinterpret_cast<thread_args_t<F, Types...> *>(args);
254 constexpr size_t n_args
255 = get_number_args<decltype(thread_args.func_args)>();
256 executor_t<n_args>::execute(thread_args);
257
258 pthread_attr_t attr;
259 int res = pthread_getattr_np(pthread_self(), &attr);
260 assert(res == 0);
261
262 void *stack_base;
263 size_t stack_size;
264 res = pthread_attr_getstack(&attr, &stack_base, &stack_size);
265 assert(res == 0);
266 MAYBE_UNUSED(res);
267
268 size_t stack_consumption = 0;
269 size_t start_unprotected_buffer
270 = get_stack_size() - get_page_size() * get_hard_stack_limit();
271 for (size_t i = start_unprotected_buffer; i < stack_size; i++) {
272 if (((const int8_t *)stack_base)[i] != pattern_) {
273 stack_consumption = stack_size - i;
274 break;
275 }
276 }
277 // OS can reserve a space of size up to 4096 (page size) in the
278 // beginning of stack buffer. We shouldn't take the reserved space into
279 // account when calculating stack consumption.
280 if (stack_consumption >= get_page_size())
281 stack_consumption -= get_page_size();
282 return reinterpret_cast<void *>(stack_consumption);
283 }
284
285 static size_t get_stack_size() {
286 static const size_t stack_size
287 = getenv_int_user("SC_STACK_SIZE", 1024 * 1024 * 8);
288 if (stack_size % get_page_size() != 0) {
289 printf("Stack checker: DNNL_SC_STACK_SIZE is expected to be "
290 "multiple of page size, which is %lu\n",
291 get_page_size());
292 fflush(stdout);
293 std::terminate();
294 }
295 return stack_size;
296 }
297
298 static size_t get_hard_stack_limit() {
299 static const size_t hard_stack_limit = getenv_int_user(
300 "SC_HARD_STACK_LIMIT", get_stack_size() / get_page_size());
301 return hard_stack_limit;
302 }
303
304 static size_t get_soft_stack_limit() {
305 // Set up the default limit of 5 pages (20480 bytes).
306 static const size_t soft_stack_limit
307 = getenv_int_user("SC_SOFT_STACK_LIMIT", 5);
308 return soft_stack_limit;
309 }
310
311 static bool is_trace_enabled() {
312 static const bool is_trace_enabled = getenv_int_user("SC_TRACE", 1);
313 return is_trace_enabled;
314 }
315
316 static size_t get_page_size() {
317 static const size_t page_size = ::getpagesize();
318 return page_size;
319 }
320};
321
322} // namespace stack_checker
323} // namespace impl
324} // namespace dnnl
325
326#endif
327#endif
328