1 | /******************************************************************************* |
2 | * Copyright 2021 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #if defined(DNNL_ENABLE_STACK_CHECKER) |
18 | |
19 | #ifndef __linux__ |
20 | #error "Stack checker is supported only on Linux" |
21 | #endif |
22 | |
23 | #ifndef DNNL_ENABLE_CONCURRENT_EXEC |
24 | #error "Stack checker requires using concurrent scratchpad" |
25 | #endif |
26 | |
27 | #ifndef COMMON_STACK_CHECKER_HPP |
28 | #define COMMON_STACK_CHECKER_HPP |
29 | |
30 | #include <cassert> |
31 | #include <tuple> |
32 | #include <type_traits> |
33 | |
34 | #include <pthread.h> |
35 | #include <unistd.h> |
36 | #include <sys/mman.h> |
37 | |
38 | #include "common/cpp_compat.hpp" |
39 | #include "common/utils.hpp" |
40 | |
41 | namespace dnnl { |
42 | namespace impl { |
43 | namespace stack_checker { |
44 | |
45 | /* Stack checker |
46 | * |
47 | * The purpose of the stack checker is to get information about stack |
48 | * consumption per call stack. |
49 | * |
50 | * Motivation for introducing such a capability was excessive stack consumption |
51 | * for `dnnl_primitive_create`, `dnnl_primitive_execute` and GEMM APIs that |
52 | * resulted in a crash on the customer side. |
53 | * |
54 | * The stack checker is represented as `stack_checker_t` class. The class |
55 | * provides an interface called `check(...)` that is used to get the information |
56 | * about stack consumption. |
57 | * The stack checker has a capability to issue an error when the obtained |
58 | * stack consumption exceeds a specified limit. |
59 | * |
60 | * The stack checker can be configured with the following environment variables: |
61 | * - DNNL_SC_STACK_SIZE: specifies the size of the stack in bytes for the thread |
62 | * that runs a function that needs to be checked. |
63 | * The default is 8388608 bytes (8 MiB). |
64 | * |
65 | * - DNNL_SC_SOFT_STACK_LIMIT: specifies a soft limit in memory pages. When |
66 | * stack consumption exceeds the limit the stack checker prints an error |
67 | * message that contains the obtained stack consumption. The default is 5 |
68 | * pages (20480 bytes). |
69 | * |
70 | * - DNNL_SC_HARD_STACK_LIMIT: specifies a hard limit in memory pages. When |
71 | * the limit is exceeded the SIGSEGV signal is raised. This can be used for |
72 | * debug purposes. For example, it can be used to get a place within the call |
 *   stack where the limit is exceeded. By default, the limit is equal to
 *   `stack size` / `page size`, i.e. the whole stack is available.
76 | * |
77 | * - DNNL_SC_TRACE: enables tracing. If the soft limit is exceeded and the |
78 | * tracing is enabled the stack checker prints an error message. The tracing |
79 | * is enabled by default. |
80 | * |
81 | * The `stack_checker_t` class has one constructor that takes an `std::string` |
82 | * which is printed out as part of the error message when soft limit is |
83 | * exceeded. This can be useful to give a context about the function that is |
84 | * being checked. |
85 | * |
86 | * Implementation details |
87 | * |
88 | * The stack checker uses pthread API to create a new thread with |
89 | * an application-managed stack. The application-managed stack is a memory |
90 | * buffer allocated by an application and designated as a stack via |
91 | * a certain pthread API. Since the stack checker has control over the |
92 | * memory buffer it can populate it with a particular pattern. Once |
93 | * the thread completed execution of the function being checked it can check |
94 | * how much memory was actually used for the stack by checking the pattern. |
95 | * |
96 | * The stack checker is disabled in the default build configuration. It can |
97 | * be enabled via CMake option `DNNL_ENABLE_STACK_CHECKER=ON` at the build time. |
98 | * |
99 | * Usage example |
100 | * |
101 | * ```cpp |
102 | * #include "common/stack_checker.hpp" |
103 | * |
104 | * void bar() { |
105 | * volatile char arr[1024] = {}; |
106 | * } |
107 | * |
108 | * int foo(int *a, int &b, int c) { |
109 | * bar(); |
110 | * return 0; |
111 | * } |
112 | * |
113 | * int main() { |
114 | * int x = 5; |
115 | * stack_checker::stack_checker_t sc("main"); |
116 | * return sc.check(foo, &x, std::ref(x), x); |
117 | * } |
118 | * ``` |
119 | * If the soft limit is 3 pages then the output of this code will be the |
120 | * following: |
121 | * === Stack checker: ERROR: 'main' consumed 14824 bytes of stack while the limit is 12288 bytes. === |
122 | * |
123 | * Limitations: |
124 | * - There is only Linux support |
125 | * - The functions being checked should be non-member functions |
 * - Works only with the concurrent scratchpad because the global scratchpad is
 *   thread-local (one instance per thread).
128 | */ |
129 | |
130 | template <typename F, typename... Targs> |
131 | struct thread_args_t { |
132 | thread_args_t() = delete; |
133 | thread_args_t(const F &func, const Targs &... func_args) |
134 | : func(func) |
135 | , func_args(std::forward<Targs>(func_args)...) |
136 | , func_retval {} {} |
137 | const F &func; |
138 | std::tuple<Targs...> func_args; |
139 | typename cpp_compat::invoke_result<F *, Targs...>::type func_retval; |
140 | }; |
141 | |
// Returns the number of elements of the tuple type `T`. A reference qualifier
// on `T` is stripped first, so the `decltype` of a tuple lvalue can be passed
// as is.
template <typename T>
constexpr size_t get_number_args() {
    using tuple_type = typename std::remove_reference<T>::type;
    return std::tuple_size<tuple_type>::value;
}
146 | |
147 | // The executor_t is a helper class that is used to prepare arguments for |
148 | // the function and call it. |
149 | template <size_t i> |
150 | struct executor_t { |
151 | template <typename T, typename... Targs> |
152 | static void execute(T &thread_args, Targs &... unpacked_func_args) { |
153 | const auto &func_args = thread_args.func_args; |
154 | constexpr size_t idx = get_number_args<decltype(func_args)>() - i; |
155 | executor_t<i - 1>::execute(thread_args, |
156 | std::forward<Targs>(unpacked_func_args)..., |
157 | std::get<idx>(func_args)); |
158 | } |
159 | }; |
160 | |
161 | template <> |
162 | struct executor_t<0> { |
163 | template <typename T, typename... Targs> |
164 | static void execute(T &thread_args, Targs &... unpacked_func_args) { |
165 | thread_args.func_retval |
166 | = thread_args.func(std::forward<Targs>(unpacked_func_args)...); |
167 | } |
168 | }; |
169 | |
170 | struct stack_checker_t { |
171 | stack_checker_t(const std::string &context) : context_(context) {} |
172 | |
173 | template <typename F, typename... Targs> |
174 | typename cpp_compat::invoke_result<F *, Targs...>::type check( |
175 | const F &func, const Targs &... func_args) { |
176 | |
177 | auto thread_args = utils::make_unique<thread_args_t<F, const Targs...>>( |
178 | func, std::forward<const Targs>(func_args)...); |
179 | |
180 | int8_t *stack_buffer; |
181 | int res = posix_memalign( |
182 | (void **)&stack_buffer, get_page_size(), get_stack_size()); |
183 | assert(res == 0); |
184 | |
185 | std::memset(stack_buffer, pattern_, sizeof(int8_t) * get_stack_size()); |
186 | |
187 | // Stack grows downwards. |
188 | int8_t *stack_start = stack_buffer + get_stack_size(); |
189 | int8_t *stack_end |
190 | = stack_start - get_page_size() * get_hard_stack_limit(); |
191 | size_t protected_region |
192 | = get_stack_size() - get_page_size() * get_hard_stack_limit(); |
193 | |
194 | res = mprotect( |
195 | stack_end - protected_region, protected_region, PROT_NONE); |
196 | assert(res == 0); |
197 | |
198 | pthread_t thread; |
199 | pthread_attr_t attr; |
200 | res = pthread_attr_init(&attr); |
201 | assert(res == 0); |
202 | |
203 | res = pthread_attr_setstack(&attr, stack_buffer, get_stack_size()); |
204 | assert(res == 0); |
205 | |
206 | res = pthread_attr_setguardsize(&attr, 0); |
207 | assert(res == 0); |
208 | |
209 | res = pthread_create( |
210 | &thread, &attr, worker<F, Targs...>, (void *)thread_args.get()); |
211 | assert(res == 0); |
212 | |
213 | void *stack_consumption_ptr = nullptr; |
214 | res = pthread_join(thread, &stack_consumption_ptr); |
215 | assert(res == 0); |
216 | |
217 | auto stack_consumption |
218 | = reinterpret_cast<size_t>(stack_consumption_ptr); |
219 | |
220 | if (is_trace_enabled()) { |
221 | size_t soft_stack_limit_in_bytes |
222 | = get_soft_stack_limit() * get_page_size(); |
223 | if (stack_consumption > soft_stack_limit_in_bytes) { |
224 | printf("=== Stack checker: ERROR: '%s' consumed %lu bytes of " |
225 | "stack while the limit is %lu bytes. ===\n" , |
226 | context_.c_str(), stack_consumption, |
227 | soft_stack_limit_in_bytes); |
228 | fflush(stdout); |
229 | } |
230 | } |
231 | |
232 | res = pthread_attr_destroy(&attr); |
233 | assert(res == 0); |
234 | MAYBE_UNUSED(res); |
235 | // POSIX Thread standard: 2.9.8 Use of Application-Managed Thread Stacks |
236 | // The application grants to the implementation permanent ownership of |
237 | // and control over the application-managed stack when the attributes |
238 | // object in which the stack or stackaddr attribute has been set is used |
239 | // free(stack_buffer); |
240 | |
241 | return thread_args->func_retval; |
242 | } |
243 | |
244 | private: |
245 | std::string context_; |
246 | static constexpr int8_t pattern_ = INT8_MAX; |
247 | |
248 | // The worker function is a wrapper for the function being checked. |
249 | // The worker starts when a new thread is created. |
250 | template <typename F, typename... Types> |
251 | static void *worker(void *args) { |
252 | auto &thread_args |
253 | = *reinterpret_cast<thread_args_t<F, Types...> *>(args); |
254 | constexpr size_t n_args |
255 | = get_number_args<decltype(thread_args.func_args)>(); |
256 | executor_t<n_args>::execute(thread_args); |
257 | |
258 | pthread_attr_t attr; |
259 | int res = pthread_getattr_np(pthread_self(), &attr); |
260 | assert(res == 0); |
261 | |
262 | void *stack_base; |
263 | size_t stack_size; |
264 | res = pthread_attr_getstack(&attr, &stack_base, &stack_size); |
265 | assert(res == 0); |
266 | MAYBE_UNUSED(res); |
267 | |
268 | size_t stack_consumption = 0; |
269 | size_t start_unprotected_buffer |
270 | = get_stack_size() - get_page_size() * get_hard_stack_limit(); |
271 | for (size_t i = start_unprotected_buffer; i < stack_size; i++) { |
272 | if (((const int8_t *)stack_base)[i] != pattern_) { |
273 | stack_consumption = stack_size - i; |
274 | break; |
275 | } |
276 | } |
277 | // OS can reserve a space of size up to 4096 (page size) in the |
278 | // beginning of stack buffer. We shouldn't take the reserved space into |
279 | // account when calculating stack consumption. |
280 | if (stack_consumption >= get_page_size()) |
281 | stack_consumption -= get_page_size(); |
282 | return reinterpret_cast<void *>(stack_consumption); |
283 | } |
284 | |
285 | static size_t get_stack_size() { |
286 | static const size_t stack_size |
287 | = getenv_int_user("SC_STACK_SIZE" , 1024 * 1024 * 8); |
288 | if (stack_size % get_page_size() != 0) { |
289 | printf("Stack checker: DNNL_SC_STACK_SIZE is expected to be " |
290 | "multiple of page size, which is %lu\n" , |
291 | get_page_size()); |
292 | fflush(stdout); |
293 | std::terminate(); |
294 | } |
295 | return stack_size; |
296 | } |
297 | |
298 | static size_t get_hard_stack_limit() { |
299 | static const size_t hard_stack_limit = getenv_int_user( |
300 | "SC_HARD_STACK_LIMIT" , get_stack_size() / get_page_size()); |
301 | return hard_stack_limit; |
302 | } |
303 | |
304 | static size_t get_soft_stack_limit() { |
305 | // Set up the default limit of 5 pages (20480 bytes). |
306 | static const size_t soft_stack_limit |
307 | = getenv_int_user("SC_SOFT_STACK_LIMIT" , 5); |
308 | return soft_stack_limit; |
309 | } |
310 | |
311 | static bool is_trace_enabled() { |
312 | static const bool is_trace_enabled = getenv_int_user("SC_TRACE" , 1); |
313 | return is_trace_enabled; |
314 | } |
315 | |
316 | static size_t get_page_size() { |
317 | static const size_t page_size = ::getpagesize(); |
318 | return page_size; |
319 | } |
320 | }; |
321 | |
322 | } // namespace stack_checker |
323 | } // namespace impl |
324 | } // namespace dnnl |
325 | |
326 | #endif |
327 | #endif |
328 | |