// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#ifndef __MACH__
#define _POSIX_C_SOURCE 199309L
#endif

#include <assert.h>
#include <inttypes.h>  // For PRIu32.
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>  // For snprintf.
#include <stdlib.h>
#include <string.h>  // For memcpy, memset, strlen.

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/cache.h>
#include <xnnpack/common.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/memory-planner.h>
#include <xnnpack/node-type.h>
#include <xnnpack/operator.h>
#include <xnnpack/params.h>
#include <xnnpack/subgraph.h>

#if defined(__EMSCRIPTEN__)
#include <emscripten/emscripten.h>
#elif XNN_PLATFORM_WINDOWS
#include <windows.h>
#else
#include <errno.h>
#include <time.h>
#endif

#ifndef XNN_ENABLE_JIT
#error "XNN_ENABLE_JIT is not defined"
#endif

enum xnn_status xnn_create_workspace(xnn_workspace_t* workspace_out)
{
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create workspace: XNNPACK is not initialized");
    return xnn_status_uninitialized;
  }

  struct xnn_workspace* workspace = xnn_allocate_zero_memory(sizeof(struct xnn_workspace));
  if (workspace == NULL) {
    xnn_log_error("failed to allocate %zu bytes for workspace descriptor", sizeof(struct xnn_workspace));
    return xnn_status_out_of_memory;
  }
  workspace->ref_count = 1;
  *workspace_out = workspace;
  return xnn_status_success;
}

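// Retains a reference to a workspace. Workspaces are reference-counted: the creator holds the initial
// reference, each runtime sharing the workspace holds one more, and the underlying buffer is freed
// when the last reference is dropped via xnn_release_workspace.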
static inline void xnn_retain_workspace(xnn_workspace_t workspace)
{
  workspace->ref_count++;
}

enum xnn_status xnn_release_workspace(xnn_workspace_t workspace)
{
  assert(workspace->ref_count != 0);
  if (--workspace->ref_count == 0) {
    xnn_release_simd_memory(workspace->data);
    xnn_release_memory(workspace);
  }
  return xnn_status_success;
}

enum xnn_status xnn_create_weights_cache_with_size(size_t size, xnn_weights_cache_t* weights_cache_out)
{
  struct xnn_weights_cache* weights_cache = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create weights cache: XNNPACK is not initialized");
    goto error;
  }

  weights_cache = xnn_allocate_zero_memory(sizeof(struct xnn_weights_cache));
  if (weights_cache == NULL) {
    xnn_log_error("failed to allocate %zu bytes for weights cache descriptor", sizeof(struct xnn_weights_cache));
    status = xnn_status_out_of_memory;
    goto error;
  }

  status = xnn_init_weights_cache_with_size(weights_cache, size);
  if (status != xnn_status_success) {
    goto error;
  }
  *weights_cache_out = weights_cache;
  return xnn_status_success;

error:
  xnn_release_weights_cache(weights_cache);
  return status;
}

enum xnn_status xnn_create_weights_cache(xnn_weights_cache_t* weights_cache_out)
{
  return xnn_create_weights_cache_with_size(XNN_DEFAULT_WEIGHTS_BUFFER_SIZE, weights_cache_out);
}

enum xnn_status xnn_delete_weights_cache(xnn_weights_cache_t weights_cache)
{
  enum xnn_status status = xnn_release_weights_cache(weights_cache);
  if (status != xnn_status_success) {
    return status;
  }
  xnn_release_memory(weights_cache);
  return xnn_status_success;
}

enum xnn_status xnn_create_runtime(
  xnn_subgraph_t subgraph,
  xnn_runtime_t* runtime_out)
{
  return xnn_create_runtime_v2(subgraph, NULL /* threadpool */, 0 /* flags */, runtime_out);
}
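
// A minimal usage sketch of the runtime API (illustrative only, not part of the library; error
// handling elided). It assumes XNNPACK was initialized with xnn_initialize and `subgraph` was already
// populated via xnn_define_* calls, with external values `input_id`/`output_id` and suitably sized
// buffers `input_data`/`output_data`:
//
//   xnn_runtime_t runtime = NULL;
//   xnn_create_runtime(subgraph, &runtime);
//   const struct xnn_external_value io[] = {
//     {.id = input_id, .data = input_data},
//     {.id = output_id, .data = output_data},
//   };
//   xnn_setup_runtime(runtime, 2, io);
//   xnn_invoke_runtime(runtime);
//   xnn_delete_runtime(runtime);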

enum xnn_status xnn_create_runtime_v2(
  xnn_subgraph_t subgraph,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out)
{
  return xnn_create_runtime_v3(subgraph, /* weights_cache */ NULL, threadpool, flags, runtime_out);
}

enum xnn_status xnn_create_runtime_v3(
  xnn_subgraph_t subgraph,
  xnn_weights_cache_t weights_cache,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out)
{
  xnn_workspace_t workspace;
  enum xnn_status status = xnn_create_workspace(&workspace);
  if (status != xnn_status_success) {
    return status;
  }
  status = xnn_create_runtime_v4(subgraph, weights_cache, workspace, threadpool, flags, runtime_out);
  // Release the local workspace reference regardless of whether runtime creation succeeded:
  // on success, the runtime holds its own reference to the workspace.
  xnn_release_workspace(workspace);
  return status;
}

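// Sizes (or grows) the shared workspace buffer and assigns pointers for this runtime's workspace and
// persistent blobs. If growing the buffer moved it, the blob pointers of every other runtime sharing
// the workspace are rebased by the same delta.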
static enum xnn_status initialize_workspace_blobs(
  xnn_subgraph_t subgraph,
  xnn_runtime_t runtime,
  struct xnn_value_allocation_tracker* mem_alloc_tracker)
{
  assert(runtime->workspace != NULL);
  const size_t persistent_size = runtime->workspace->persistent_size;
  size_t mem_arena_size = mem_alloc_tracker->mem_arena_size + persistent_size;
  if (mem_arena_size == 0) {
    return xnn_status_success;
  }
  // Sparse microkernels can read up to 2 * XNN_EXTRA_BYTES beyond array bounds.
  mem_arena_size += 2 * XNN_EXTRA_BYTES;

  // Records how far the workspace data pointer moves when a larger workspace is allocated.
  ptrdiff_t workspace_data_delta = 0;
  // Allocate a larger workspace if the existing one is too small.
  if (runtime->workspace->size < mem_arena_size) {
    void* old_workspace_data = runtime->workspace->data;
    if (runtime->workspace->size != 0) {
      // Free the workspace's current data. Free first, then allocate, to keep peak memory usage low.
      xnn_release_simd_memory(runtime->workspace->data);
    }
    void* new_workspace_data = xnn_allocate_simd_memory(mem_arena_size);
    if (new_workspace_data == NULL) {
      xnn_log_error("failed to allocate %zu bytes for runtime workspace", mem_arena_size);
      return xnn_status_out_of_memory;
    }
    runtime->workspace->data = new_workspace_data;
    runtime->workspace->size = mem_arena_size;
    xnn_log_debug("created workspace of size %zu", mem_arena_size);
    // Keep track of how far the workspace data moved.
    if (old_workspace_data != NULL) {
      workspace_data_delta = (uintptr_t) new_workspace_data - (uintptr_t) old_workspace_data;
    }
  }

  assert(runtime->workspace->size >= mem_arena_size);

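  // Workspace layout: [persistent tensors | temporary tensors planned by the tracker | slack bytes].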
  // Initialize current runtime's blob pointers.
  size_t persistent_offset = 0;
  for (size_t i = 0; i < subgraph->num_values; i++) {
    const struct xnn_value* value = &subgraph->values[i];
    struct xnn_blob* blob = &runtime->blobs[i];
    if (value->datatype != xnn_datatype_invalid && value->type == xnn_value_type_dense_tensor) {
      if (blob->allocation_type == xnn_allocation_type_workspace) {
        // Value is purely internal to the runtime, allocate it in the workspace.
        blob->data = (void*) ((uintptr_t) runtime->workspace->data + persistent_size + mem_alloc_tracker->usage[i].alloc_offset);
      } else if (blob->allocation_type == xnn_allocation_type_persistent) {
        blob->data = (void*) ((uintptr_t) runtime->workspace->data + persistent_offset);
        persistent_offset += round_up_po2(blob->size, XNN_EXTRA_BYTES);
      }
    }
  }
  assert(persistent_offset == persistent_size);

  // Adjust the blob pointers of all runtimes that share this workspace.
  if (workspace_data_delta != 0) {
    for (struct xnn_runtime* rt = runtime->workspace->first_user; rt != NULL; rt = rt->next_workspace_user) {
      // The current runtime already has the correct offset.
      if (rt == runtime) {
        continue;
      }
      for (size_t i = 0; i < rt->num_blobs; i++) {
        struct xnn_blob* blob = &rt->blobs[i];
        if (blob->allocation_type == xnn_allocation_type_workspace ||
            blob->allocation_type == xnn_allocation_type_persistent) {
          assert(blob->data != NULL);
          blob->data = (void*) ((uintptr_t) blob->data + workspace_data_delta);
        }
      }
    }
  }

  return xnn_status_success;
}

// An input's memory can be reused for an operator's output only if:
// - the input is not external (external inputs cannot be overwritten),
// - the input is not static (static inputs cannot be overwritten),
// - neither input nor output is persistent (persistent tensors have their own space allocated at the
//   front of the workspace),
// - the input has at most one consumer (with more than one consumer we cannot track all the consumers
//   to update first_consumer, so we bail out).
static bool input_memory_can_be_reused(const struct xnn_value* input, const struct xnn_value* output)
{
  return !xnn_value_is_external(input) && !xnn_value_is_static(input) && !xnn_value_is_persistent(input)
    && !xnn_value_is_persistent(output) && input->num_consumers <= 1;
}

// An in-place operation reuses the input tensor's memory for its output. Examples are element-wise
// unary operations like activation functions. Normally, an output tensor is allocated its own space;
// for an in-place operation, we want the output tensor to share the input tensor's memory instead.
// We do this by calling xnn_mark_tensor_as_reuse, which:
// - sets the tensor_size of the output tensor's usage record to 0,
// - marks this usage record as reusing another tensor's memory,
// - records the id of the reused tensor, whose alloc_offset will later be propagated to the output
//   tensor.
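// For example, in a chain like convolution -> sigmoid, the sigmoid node's output tensor can alias the
// convolution output's memory, so the planner allocates space for only one of the two tensors.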
static void optimize_tensor_allocation_for_in_place_operations(
  struct xnn_value_allocation_tracker* tracker,
  xnn_subgraph_t subgraph)
{
  xnn_subgraph_analyze_consumers_and_producers(subgraph);
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];
    switch (node->type) {
      case xnn_node_type_abs:
      case xnn_node_type_bankers_rounding:
      case xnn_node_type_ceiling:
      case xnn_node_type_clamp:
      case xnn_node_type_copy:
      case xnn_node_type_elu:
      case xnn_node_type_floor:
      case xnn_node_type_hardswish:
      case xnn_node_type_leaky_relu:
      case xnn_node_type_negate:
      case xnn_node_type_prelu:
      case xnn_node_type_sigmoid:
      case xnn_node_type_softmax:
      case xnn_node_type_square:
      case xnn_node_type_square_root:
      case xnn_node_type_static_reshape:
        // Valid operation types that can be optimized.
        break;
      default:
        continue;
    }
    struct xnn_value* output = &subgraph->values[node->outputs[0]];
    const uint32_t input_id = node->inputs[0];
    const struct xnn_value* input = &subgraph->values[input_id];
    if (!input_memory_can_be_reused(input, output)) {
      // TODO(zhin): consider aliasing input to output rather than output to input.
      continue;
    }
    if (output->num_consumers == 1) {
      uint32_t reuse_id = input_id;
      // If the tensor we are reusing is itself reused, find the "root tensor" to be reused.
      while (tracker->usage[reuse_id].reuse_value_id != XNN_INVALID_VALUE_ID) {
        reuse_id = tracker->usage[reuse_id].reuse_value_id;
      }
      // We only support the case where the output has a single consumer, because we cannot easily find
      // all consumer nodes without traversing the entire graph. Relaxing this will require tracking
      // output->last_consumer in the future.
      assert(tracker->usage[reuse_id].last_node < output->first_consumer);
      xnn_log_debug("reusing tensor id #%" PRIu32 " memory for tensor id #%" PRIu32 " Node #%" PRIu32 " %s",
                    reuse_id, output->id, node->id, xnn_node_type_to_string(node->type));
      xnn_mark_tensor_as_reuse(tracker, output->id, reuse_id, output->first_consumer);
    }
  }
}

enum xnn_status xnn_create_runtime_v4(
  xnn_subgraph_t subgraph,
  xnn_weights_cache_t weights_cache,
  xnn_workspace_t workspace,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out)
{
  struct xnn_runtime* runtime = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create runtime: XNNPACK is not initialized");
    goto error;
  }

  if (workspace == NULL) {
    xnn_log_error("failed to create runtime: workspace is NULL");
    status = xnn_status_invalid_parameter;
    goto error;
  }

  const uint32_t optimization_flags = XNN_FLAG_SPARSE_INFERENCE | XNN_FLAG_HINT_FP16_INFERENCE |
      XNN_FLAG_FORCE_FP16_INFERENCE | XNN_FLAG_NO_OPERATOR_FUSION;
  status = xnn_subgraph_optimize(subgraph, flags & optimization_flags);
  if (status != xnn_status_success) {
    xnn_log_error("failed to optimize subgraph");
    goto error;
  }

  status = xnn_status_out_of_memory;

  runtime = xnn_allocate_zero_memory(sizeof(struct xnn_runtime));
  if (runtime == NULL) {
    xnn_log_error("failed to allocate %zu bytes for runtime descriptor", sizeof(struct xnn_runtime));
    goto error;
  }

  runtime->opdata = xnn_allocate_zero_memory(sizeof(struct xnn_operator_data) * subgraph->num_nodes);
  if (runtime->opdata == NULL) {
    xnn_log_error("failed to allocate %zu bytes for opdata descriptors",
                  sizeof(struct xnn_operator_data) * (size_t) subgraph->num_nodes);
    goto error;
  }
  runtime->num_ops = subgraph->num_nodes;

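  // With XNN_FLAG_YIELD_WORKERS, tag the last valid (non-fused) node so that threadpool worker
  // threads are yielded once the final operator in the graph has executed.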
  if (flags & XNN_FLAG_YIELD_WORKERS) {
    struct xnn_node* last_valid_node = NULL;
    for (size_t i = 0; i < subgraph->num_nodes; i++) {
      struct xnn_node* node = subgraph->nodes + i;
      if (node->type != xnn_node_type_invalid) {
        last_valid_node = node;
      }
    }
    if (last_valid_node != NULL) {
      last_valid_node->flags |= XNN_FLAG_YIELD_WORKERS;
    }
  }

  struct xnn_code_cache* code_cache = NULL;
#if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
  code_cache = &runtime->code_cache;
  status = xnn_init_code_cache(code_cache);
  if (status != xnn_status_success) {
    goto error;
  }
#endif
  const struct xnn_caches caches = {
    .code_cache = code_cache,
    .weights_cache = weights_cache,
  };

  struct xnn_value* values = subgraph->values;
  for (size_t i = 0; i < subgraph->num_nodes; i++) {
    const struct xnn_node* node = subgraph->nodes + i;

    // Ignore fused nodes.
    if (node->type != xnn_node_type_invalid) {
      assert(node->create != NULL);
      status = node->create(node, values, subgraph->num_values, runtime->opdata + i, &caches);
      if (status != xnn_status_success) {
        goto error;
      }
      runtime->opdata[i].setup = node->setup;
    }
  }

#if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
  xnn_finalize_code_memory(&code_cache->cache.code);
#endif

  runtime->blobs = xnn_allocate_zero_memory(sizeof(struct xnn_blob) * subgraph->num_values);
  if (runtime->blobs == NULL) {
    xnn_log_error("failed to allocate %zu bytes for blob descriptors",
                  sizeof(struct xnn_blob) * (size_t) subgraph->num_values);
    goto error;
  }
  runtime->num_blobs = subgraph->num_values;

  struct xnn_value_allocation_tracker mem_alloc_tracker;
  xnn_init_value_allocation_tracker(&mem_alloc_tracker, subgraph);

  size_t persistent_size = 0;
  for (uint32_t i = 0; i < subgraph->num_values; i++) {
    struct xnn_value* value = &subgraph->values[i];
    struct xnn_blob* blob = &runtime->blobs[i];
    if (value->datatype != xnn_datatype_invalid && value->type == xnn_value_type_dense_tensor) {
      blob->size = xnn_tensor_get_size(subgraph, i);
      blob->data = (void*) (uintptr_t) value->data;
      if (blob->data == NULL) {
        if (xnn_value_is_external(value)) {
          // Value is non-static and external to the runtime: must be specified via a call to xnn_setup_runtime.
          blob->allocation_type = xnn_allocation_type_external;
        } else if (xnn_value_is_persistent(value)) {
          // Persistent values are allocated at the front of the workspace without overlaps.
          blob->allocation_type = xnn_allocation_type_persistent;
          persistent_size += round_up_po2(blob->size, XNN_EXTRA_BYTES);
        } else {
          // Value is purely internal to the runtime, and must be allocated in its workspace.
          xnn_add_value_allocation_tracker(&mem_alloc_tracker, i, round_up_po2(blob->size, XNN_EXTRA_BYTES));
          blob->allocation_type = xnn_allocation_type_workspace;
        }
      } else {
        blob->allocation_type = xnn_allocation_type_static;
      }
    }
  }
  optimize_tensor_allocation_for_in_place_operations(&mem_alloc_tracker, subgraph);
  xnn_plan_value_allocation_tracker(&mem_alloc_tracker);

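  // Register this runtime as a user of the (possibly shared) workspace by pushing it onto the
  // workspace's singly-linked list of users; xnn_delete_runtime unlinks it again.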
  xnn_retain_workspace(workspace);
  runtime->workspace = workspace;
  runtime->next_workspace_user = runtime->workspace->first_user;
  runtime->workspace->first_user = runtime;
  runtime->workspace->persistent_size = persistent_size;

  status = initialize_workspace_blobs(subgraph, runtime, &mem_alloc_tracker);
  if (status != xnn_status_success) {
    xnn_release_value_allocation_tracker(&mem_alloc_tracker);
    goto error;
  }

  if (flags & XNN_FLAG_BASIC_PROFILING) {
    runtime->profiling = true;
  }

  xnn_release_value_allocation_tracker(&mem_alloc_tracker);

  runtime->threadpool = threadpool;

  *runtime_out = runtime;
  return xnn_status_success;

error:
  xnn_delete_runtime(runtime);
  return status;
}

enum xnn_status xnn_setup_runtime(
  xnn_runtime_t runtime,
  size_t num_external_values,
  const struct xnn_external_value* external_values)
{
  // Validate all inputs without changing the internal state, so that the runtime stays in a
  // consistent state if validation fails midway.
  for (size_t i = 0; i < num_external_values; i++) {
    const struct xnn_external_value* external_value = &external_values[i];
    const uint32_t value_id = external_value->id;
    if (value_id >= runtime->num_blobs) {
      xnn_log_error("failed to setup runtime: out-of-bounds ID %" PRIu32 " in external value #%zu",
                    value_id, i);
      return xnn_status_invalid_parameter;
    }

    const struct xnn_blob* blob = &runtime->blobs[value_id];
    if (blob->allocation_type != xnn_allocation_type_external) {
      xnn_log_error("failed to setup runtime: Value %" PRIu32 " is not external", value_id);
      return xnn_status_invalid_parameter;
    }
  }

  // Apply runtime state changes.
  for (size_t i = 0; i < num_external_values; i++) {
    const struct xnn_external_value* external_value = &external_values[i];
    const uint32_t value_id = external_value->id;
    struct xnn_blob* blob = &runtime->blobs[value_id];
    blob->data = external_value->data;
  }

  for (size_t i = 0; i < runtime->num_ops; i++) {
    const struct xnn_operator_data* opdata = &runtime->opdata[i];
    if (opdata->operator_objects[0] == NULL) {
      // Operator was removed during optimization.
      continue;
    }

    // Ensure that the weights cache is finalized.
    struct xnn_weights_cache* weights_cache = opdata->operator_objects[0]->weights_cache;
    if (weights_cache != NULL && !xnn_weights_cache_is_finalized(weights_cache)) {
      xnn_log_error("weights cache needs to be finalized before setup/infer");
      return xnn_status_invalid_state;
    }

    assert(opdata->setup != NULL);
    const enum xnn_status status = opdata->setup(opdata, runtime->blobs, runtime->num_blobs, runtime->threadpool);
    if (status != xnn_status_success) {
      xnn_log_error("failed to setup runtime: error in operator #%zu", i);
      return status;
    }
  }

  return xnn_status_success;
}

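// Reads a timestamp from the best monotonic clock available on the platform.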
static xnn_timestamp xnn_read_timer() {
  xnn_timestamp timestamp;
#ifdef __MACH__
  timestamp = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
  if (timestamp == 0) {
    xnn_log_warning("clock_gettime failed: error code %d", errno);
  }
#elif __EMSCRIPTEN__
  timestamp = emscripten_get_now();
#elif XNN_PLATFORM_WINDOWS
  BOOL res = QueryPerformanceCounter(&timestamp);
  if (!res) {
    xnn_log_error("QueryPerformanceCounter failed: error code %u", GetLastError());
    memset(&timestamp, 0, sizeof(timestamp));
  }
#else
  int res = clock_gettime(CLOCK_MONOTONIC, &timestamp);
  if (res != 0) {
    xnn_log_error("clock_gettime failed: error code %d", errno);
    memset(&timestamp, 0, sizeof(timestamp));
  }
#endif
  return timestamp;
}

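// Returns the elapsed time between two timestamps, in microseconds.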
static inline uint64_t xnn_get_elapsed_time(const xnn_timestamp* start, const xnn_timestamp* end) {
#ifdef __MACH__
  const uint64_t kMicrosInNanos = 1000;
  return (*end - *start) / kMicrosInNanos;
#elif __EMSCRIPTEN__
  const double kMillisInMicros = 1.0e3;
  return (uint64_t) ((*end - *start) * kMillisInMicros);
#elif XNN_PLATFORM_WINDOWS
  const uint64_t kMicrosInSec = 1000 * 1000;
  LARGE_INTEGER frequency;
  BOOL res = QueryPerformanceFrequency(&frequency);
  if (!res) {
    xnn_log_error("QueryPerformanceFrequency failed: error code %u", GetLastError());
    return 0;
  }
  return ((end->QuadPart - start->QuadPart) * kMicrosInSec) / frequency.QuadPart;
#else
  const uint64_t kNanosInMicro = UINT64_C(1000);
  const uint64_t kNanosInSec = UINT64_C(1000000000);
  const uint64_t secs = (end->tv_sec - start->tv_sec) * kNanosInSec;
  const uint64_t ns_secs = (end->tv_nsec - start->tv_nsec);
  return (secs + ns_secs) / kNanosInMicro;
#endif
}

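// Queries profiling data recorded during xnn_invoke_runtime. Callers typically invoke this twice per
// parameter: once with a zero-sized buffer to learn the required size (the call returns
// xnn_status_out_of_memory and sets *param_value_size_ret), then again with a large-enough buffer.
// An illustrative sketch (not part of the library), assuming a runtime created with
// XNN_FLAG_BASIC_PROFILING that has already been invoked:
//
//   size_t size = 0;
//   xnn_get_runtime_profiling_info(runtime, xnn_profile_info_operator_timing, 0, NULL, &size);
//   uint64_t* timings = malloc(size);
//   xnn_get_runtime_profiling_info(runtime, xnn_profile_info_operator_timing, size, timings, &size);
//   free(timings);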
enum xnn_status xnn_get_runtime_profiling_info(xnn_runtime_t runtime,
                                               enum xnn_profile_info param_name,
                                               size_t param_value_size,
                                               void* param_value,
                                               size_t* param_value_size_ret)
{
  if (!runtime->profiling) {
    return xnn_status_invalid_state;
  }
  enum xnn_status status = xnn_status_success;
  size_t required_size = 0;
  const struct xnn_operator_data* opdata = runtime->opdata;
  switch (param_name) {
    case xnn_profile_info_num_operators:
      required_size = sizeof(size_t);
      if (param_value_size < required_size) {
        *param_value_size_ret = required_size;
        status = xnn_status_out_of_memory;
      } else {
        size_t num_valid_ops = 0;
        for (size_t i = 0; i < runtime->num_ops; ++i) {
          if (opdata[i].operator_objects[0] != NULL) {
            num_valid_ops += 1;
          }
        }
        memcpy(param_value, &num_valid_ops, required_size);
      }
      break;
    case xnn_profile_info_operator_name:
      for (size_t i = 0; i < runtime->num_ops; ++i) {
        if (opdata[i].operator_objects[0] != NULL) {
          const char* op_name = xnn_operator_type_to_string(opdata[i].operator_objects[0]->type);
          size_t op_name_len = strlen(op_name) + 1;
          if (opdata[i].operator_objects[0]->ukernel.type != xnn_microkernel_type_default) {
            op_name_len += strlen(xnn_microkernel_type_to_string(opdata[i].operator_objects[0]->ukernel.type)) + 1;
          }
          required_size += op_name_len;
        }
      }
      if (param_value_size < required_size) {
        *param_value_size_ret = required_size;
        status = xnn_status_out_of_memory;
      } else {
        char* name_out = (char*) param_value;
        for (size_t i = 0; i < runtime->num_ops; ++i) {
          if (opdata[i].operator_objects[0] != NULL) {
            const char* op_name = xnn_operator_type_to_string(opdata[i].operator_objects[0]->type);
            size_t op_name_len = strlen(op_name) + 1;
            if (opdata[i].operator_objects[0]->ukernel.type != xnn_microkernel_type_default) {
              const char* ukernel_type = xnn_microkernel_type_to_string(opdata[i].operator_objects[0]->ukernel.type);
              op_name_len += strlen(ukernel_type) + 1;
              snprintf(name_out, op_name_len, "%s %s", op_name, ukernel_type);
            } else {
              snprintf(name_out, op_name_len, "%s", op_name);
            }
            name_out += op_name_len;
          }
        }
      }
      break;
    case xnn_profile_info_operator_timing:
    {
      size_t num_valid_ops = 0;
      for (size_t i = 0; i < runtime->num_ops; ++i) {
        if (opdata[i].operator_objects[0] != NULL) {
          num_valid_ops += 1;
        }
      }
      required_size = num_valid_ops * sizeof(uint64_t);
      if (param_value_size < required_size) {
        *param_value_size_ret = required_size;
        status = xnn_status_out_of_memory;
      } else {
        xnn_timestamp previous_ts = runtime->start_ts;
        uint64_t* data = (uint64_t*) param_value;
        for (size_t i = 0; i < runtime->num_ops; ++i) {
          if (opdata[i].operator_objects[0] != NULL) {
            uint64_t op_time = 0;
            for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
              if (opdata[i].operator_objects[j] != NULL) {
                op_time += xnn_get_elapsed_time(&previous_ts, &opdata[i].end_ts[j]);
                previous_ts = opdata[i].end_ts[j];
              }
            }
            *data++ = op_time;
          }
        }
      }
      break;
    }
    default:
      status = xnn_status_invalid_parameter;
  }
  return status;
}

enum xnn_status xnn_invoke_runtime(
  xnn_runtime_t runtime)
{
  if (runtime->profiling) {
    runtime->start_ts = xnn_read_timer();
  }
  for (size_t i = 0; i < runtime->num_ops; i++) {
    for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
      if (runtime->opdata[i].operator_objects[j] == NULL) {
        // Operator was removed after fusion.
        continue;
      }

      const enum xnn_status status = xnn_run_operator_with_index(runtime->opdata[i].operator_objects[j], i, j, runtime->threadpool);
      if (status != xnn_status_success) {
        return status;
      }
      if (runtime->profiling) {
        runtime->opdata[i].end_ts[j] = xnn_read_timer();
      }
    }
  }
  return xnn_status_success;
}

enum xnn_status xnn_delete_runtime(
  xnn_runtime_t runtime)
{
  if (runtime != NULL) {
    if (runtime->opdata != NULL) {
      for (size_t i = 0; i < runtime->num_ops; i++) {
        for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
          xnn_delete_operator(runtime->opdata[i].operator_objects[j]);
        }
      }
      xnn_release_memory(runtime->opdata);

      xnn_release_memory(runtime->blobs);
      if (runtime->workspace != NULL) {
        // Remove this runtime from the list of users of the workspace.
        assert(runtime->workspace->first_user != NULL);
        if (runtime->workspace->first_user == runtime) {
          runtime->workspace->first_user = runtime->next_workspace_user;
        } else {
          xnn_runtime_t prev = runtime->workspace->first_user;
          xnn_runtime_t curr = prev->next_workspace_user;
          while (curr != runtime) {
            prev = curr;
            curr = curr->next_workspace_user;
          }
          assert(curr == runtime);
          prev->next_workspace_user = curr->next_workspace_user;
        }
        xnn_release_workspace(runtime->workspace);
      }
    }
#if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
    xnn_release_code_cache(&runtime->code_cache);
#endif
    xnn_release_memory(runtime);
  }
  return xnn_status_success;
}