// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#ifndef __MACH__
#define _POSIX_C_SOURCE 199309L
#endif

#include <assert.h>
#include <inttypes.h>  // For PRIu32.
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>  // For snprintf.
#include <stdlib.h>
#include <string.h>  // For memcpy, memset, and strlen.

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/cache.h>
#include <xnnpack/common.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/memory-planner.h>
#include <xnnpack/node-type.h>
#include <xnnpack/operator.h>
#include <xnnpack/params.h>
#include <xnnpack/subgraph.h>

#if defined(__EMSCRIPTEN__)
#include <emscripten/emscripten.h>
#elif XNN_PLATFORM_WINDOWS
#include <windows.h>
#else
#include <errno.h>
#include <time.h>
#endif

#ifndef XNN_ENABLE_JIT
#error "XNN_ENABLE_JIT is not defined"
#endif

enum xnn_status xnn_create_workspace(xnn_workspace_t* workspace_out)
{
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create workspace: XNNPACK is not initialized");
    return xnn_status_uninitialized;
  }

  struct xnn_workspace* workspace = xnn_allocate_zero_memory(sizeof(struct xnn_workspace));
  if (workspace == NULL) {
    xnn_log_error("failed to allocate %zu bytes for workspace descriptor", sizeof(struct xnn_workspace));
    return xnn_status_out_of_memory;
  }
  workspace->ref_count = 1;
  *workspace_out = workspace;
  return xnn_status_success;
}
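
// Example (illustrative sketch, not part of this translation unit): a typical
// caller creates one workspace, shares it across several runtimes via
// xnn_create_runtime_v4, and then drops its own reference. The runtimes keep
// the workspace alive through ref_count, so releasing here is safe:
//
//   xnn_workspace_t workspace = NULL;
//   enum xnn_status status = xnn_create_workspace(&workspace);
//   // ... create runtimes with xnn_create_runtime_v4(..., workspace, ...) ...
//   xnn_release_workspace(workspace);  // Runtimes still hold references.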

static inline void xnn_retain_workspace(xnn_workspace_t workspace)
{
  workspace->ref_count++;
}

enum xnn_status xnn_release_workspace(xnn_workspace_t workspace)
{
  assert(workspace->ref_count != 0);
  if (--workspace->ref_count == 0) {
    xnn_release_simd_memory(workspace->data);
    xnn_release_memory(workspace);
  }
  return xnn_status_success;
}

enum xnn_status xnn_create_weights_cache_with_size(size_t size, xnn_weights_cache_t* weights_cache_out)
{
  struct xnn_weights_cache* weights_cache = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create weights cache: XNNPACK is not initialized");
    goto error;
  }

  weights_cache = xnn_allocate_zero_memory(sizeof(struct xnn_weights_cache));
  if (weights_cache == NULL) {
    xnn_log_error("failed to allocate %zu bytes for weights cache descriptor", sizeof(struct xnn_weights_cache));
    status = xnn_status_out_of_memory;
    goto error;
  }

  status = xnn_init_weights_cache_with_size(weights_cache, size);
  if (status != xnn_status_success) {
    goto error;
  }
  *weights_cache_out = weights_cache;
  return xnn_status_success;

error:
  xnn_release_weights_cache(weights_cache);
  return status;
}

enum xnn_status xnn_create_weights_cache(xnn_weights_cache_t* weights_cache_out)
{
  return xnn_create_weights_cache_with_size(XNN_DEFAULT_WEIGHTS_BUFFER_SIZE, weights_cache_out);
}

enum xnn_status xnn_delete_weights_cache(xnn_weights_cache_t weights_cache)
{
  enum xnn_status status = xnn_release_weights_cache(weights_cache);
  if (status != xnn_status_success) {
    return status;
  }
  xnn_release_memory(weights_cache);
  return xnn_status_success;
}
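
// Example (illustrative sketch): the intended weights-cache lifecycle. The
// cache must be finalized between runtime creation and the first setup/invoke;
// xnn_setup_runtime below rejects unfinalized caches. The finalization call is
// assumed to be the public xnn_finalize_weights_cache from xnnpack.h.
//
//   xnn_weights_cache_t weights_cache = NULL;
//   xnn_create_weights_cache(&weights_cache);
//   // ... create all runtimes that share the cache ...
//   xnn_finalize_weights_cache(weights_cache, xnn_weights_cache_finalization_kind_hard);
//   // ... xnn_setup_runtime / xnn_invoke_runtime ...
//   xnn_delete_weights_cache(weights_cache);  // After all runtimes are deleted.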

enum xnn_status xnn_create_runtime(
  xnn_subgraph_t subgraph,
  xnn_runtime_t* runtime_out)
{
  return xnn_create_runtime_v2(subgraph, NULL /* threadpool */, 0 /* flags */, runtime_out);
}

enum xnn_status xnn_create_runtime_v2(
  xnn_subgraph_t subgraph,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out)
{
  return xnn_create_runtime_v3(subgraph, /* weights_cache */ NULL, threadpool, flags, runtime_out);
}

enum xnn_status xnn_create_runtime_v3(
  xnn_subgraph_t subgraph,
  xnn_weights_cache_t weights_cache,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out)
{
  xnn_workspace_t workspace;
  enum xnn_status status = xnn_create_workspace(&workspace);
  if (status != xnn_status_success) {
    return status;
  }
  status = xnn_create_runtime_v4(subgraph, weights_cache, workspace, threadpool, flags, runtime_out);
  // Release the workspace regardless of whether runtime creation succeeded: on success the runtime
  // holds its own reference, and on failure the workspace must not leak.
  xnn_release_workspace(workspace);
  return status;
}

static enum xnn_status initialize_workspace_blobs(
  xnn_subgraph_t subgraph,
  xnn_runtime_t runtime,
  struct xnn_value_allocation_tracker* mem_alloc_tracker)
{
  assert(runtime->workspace != NULL);

  size_t mem_arena_size = mem_alloc_tracker->mem_arena_size;
  if (mem_arena_size == 0) {
    return xnn_status_success;
  }
  // Sparse microkernels can read up to 2 * XNN_EXTRA_BYTES beyond array bounds.
  mem_arena_size += 2 * XNN_EXTRA_BYTES;

  // Records how far the workspace data moves if a larger workspace is allocated.
  ptrdiff_t workspace_data_delta = 0;
  // Allocate a larger workspace if the current one is too small.
  if (runtime->workspace->size < mem_arena_size) {
    void* old_workspace_data = runtime->workspace->data;
    if (runtime->workspace->size != 0) {
      // Free the workspace's current data. Free first, then allocate, to keep peak memory usage low.
      xnn_release_simd_memory(runtime->workspace->data);
    }
    void* new_workspace_data = xnn_allocate_simd_memory(mem_arena_size);
    if (new_workspace_data == NULL) {
      xnn_log_error("failed to allocate %zu bytes for runtime workspace", mem_arena_size);
      return xnn_status_out_of_memory;
    }
    runtime->workspace->data = new_workspace_data;
    runtime->workspace->size = mem_arena_size;
    xnn_log_debug("created workspace of size %zu", mem_arena_size);
    // Keep track of how far the workspace data moved.
    if (old_workspace_data != NULL) {
      workspace_data_delta = (uintptr_t) new_workspace_data - (uintptr_t) old_workspace_data;
    }
  }

  assert(runtime->workspace->size >= mem_arena_size);

  // Initialize the current runtime's blob pointers.
  for (size_t i = 0; i < subgraph->num_values; i++) {
    const struct xnn_value* value = &subgraph->values[i];
    struct xnn_blob* blob = &runtime->blobs[i];
    if (value->datatype != xnn_datatype_invalid && value->type == xnn_value_type_dense_tensor) {
      if (blob->allocation_type == xnn_allocation_type_workspace) {
        // Value is purely internal to the runtime; allocate it in the workspace.
        blob->data = (void*) ((uintptr_t) runtime->workspace->data + mem_alloc_tracker->usage[i].alloc_offset);
      }
    }
  }

  // Adjust the blob pointers of all other runtimes that share this workspace.
  if (workspace_data_delta != 0) {
    for (struct xnn_runtime* rt = runtime->workspace->first_user; rt != NULL; rt = rt->next_workspace_user) {
      // The current runtime already has the correct pointers.
      if (rt == runtime) {
        continue;
      }
      for (size_t i = 0; i < rt->num_blobs; i++) {
        struct xnn_blob* blob = &rt->blobs[i];
        if (blob->allocation_type == xnn_allocation_type_workspace) {
          assert(blob->data != NULL);
          blob->data = (void*) ((uintptr_t) blob->data + workspace_data_delta);
        }
      }
    }
  }

  return xnn_status_success;
}
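
// Worked example (hypothetical numbers): suppose two runtimes share a
// workspace whose data lived at address 0x1000, and this runtime needs a
// larger arena, so the data is reallocated to 0x5000. Then
// workspace_data_delta = 0x5000 - 0x1000 = 0x4000, and a sibling runtime's
// blob that pointed at 0x1040 (offset 0x40 into the old arena) is rebased to
// 0x1040 + 0x4000 = 0x5040, the same offset 0x40 into the new arena.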

// An in-place operation reuses the input tensor's memory for its output. Examples are element-wise unary operations
// like activation functions. Usually, an output tensor is allocated its own space. For an in-place operation, we want
// the output tensor to share the input tensor's memory. We do this by calling xnn_mark_tensor_as_reuse, which:
// - sets the tensor_size of the output tensor's usage record to 0,
// - marks this usage record as reusing another tensor's memory,
// - remembers the id of the reused tensor, whose alloc_offset will later be set onto the output tensor.
static void optimize_tensor_allocation_for_in_place_operations(
  struct xnn_value_allocation_tracker* tracker,
  xnn_subgraph_t subgraph)
{
  xnn_subgraph_analyze_consumers_and_producers(subgraph);
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];
    switch (node->type) {
      case xnn_node_type_abs:
      case xnn_node_type_bankers_rounding:
      case xnn_node_type_ceiling:
      case xnn_node_type_clamp:
      case xnn_node_type_elu:
      case xnn_node_type_floor:
      case xnn_node_type_hardswish:
      case xnn_node_type_leaky_relu:
      case xnn_node_type_negate:
      case xnn_node_type_prelu:
      case xnn_node_type_sigmoid:
      case xnn_node_type_softmax:
      case xnn_node_type_square:
      case xnn_node_type_square_root:
      case xnn_node_type_static_reshape:
        // Valid operation types that can be optimized.
        break;
      default:
        continue;
    }
    struct xnn_value* output = &subgraph->values[node->outputs[0]];
    const uint32_t input_id = node->inputs[0];
    const struct xnn_value* input = &subgraph->values[input_id];
    if (xnn_value_is_external_input(input) || input->num_consumers > 1) {
      // External inputs cannot be overwritten, and an input with multiple consumers may still be
      // read after this node runs; skip this node rather than abandoning the whole pass.
      continue;
    }
    if (output->num_consumers == 1) {
      uint32_t reuse_id = input_id;
      // If the tensor we are reusing is itself reused, find the "root tensor" to be reused.
      while (tracker->usage[reuse_id].reuse_value_id != XNN_INVALID_VALUE_ID) {
        reuse_id = tracker->usage[reuse_id].reuse_value_id;
      }
      // We only support outputs with a single consumer because we cannot easily find all consumer nodes
      // without traversing the entire graph. Supporting more would require tracking output->last_consumer.
      assert(tracker->usage[reuse_id].last_node < output->first_consumer);
      xnn_log_debug("reusing tensor id #%" PRIu32 " memory for tensor id #%" PRIu32 " Node #%" PRIu32 " %s",
                    reuse_id, output->id, node->id, xnn_node_type_to_string(node->type));
      xnn_mark_tensor_as_reuse(tracker, output->id, reuse_id, output->first_consumer);
    }
  }
}
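
// Example (hypothetical graph): consider a chain a -> [abs] -> b -> [ceiling] -> c
// where a, b, and c are internal values with one consumer each. Visiting the abs
// node marks b as reusing a. Visiting the ceiling node then finds that b's usage
// record already records a as its reuse_value_id, so the while loop above resolves
// the chain to its root, and c is marked as reusing a as well; only the root
// tensor a keeps a non-zero tensor_size and receives a workspace allocation.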

enum xnn_status xnn_create_runtime_v4(
  xnn_subgraph_t subgraph,
  xnn_weights_cache_t weights_cache,
  xnn_workspace_t workspace,
  pthreadpool_t threadpool,
  uint32_t flags,
  xnn_runtime_t* runtime_out)
{
  struct xnn_runtime* runtime = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create runtime: XNNPACK is not initialized");
    goto error;
  }

  if (workspace == NULL) {
    xnn_log_error("failed to create runtime: workspace is NULL");
    status = xnn_status_invalid_parameter;
    goto error;
  }

  const uint32_t optimization_flags = XNN_FLAG_SPARSE_INFERENCE | XNN_FLAG_HINT_FP16_INFERENCE |
    XNN_FLAG_FORCE_FP16_INFERENCE | XNN_FLAG_NO_OPERATOR_FUSION;
  status = xnn_subgraph_optimize(subgraph, flags & optimization_flags);
  if (status != xnn_status_success) {
    xnn_log_error("failed to optimize subgraph");
    goto error;
  }

  status = xnn_status_out_of_memory;

  runtime = xnn_allocate_zero_memory(sizeof(struct xnn_runtime));
  if (runtime == NULL) {
    xnn_log_error("failed to allocate %zu bytes for runtime descriptor", sizeof(struct xnn_runtime));
    goto error;
  }

  runtime->opdata = xnn_allocate_zero_memory(sizeof(struct xnn_operator_data) * subgraph->num_nodes);
  if (runtime->opdata == NULL) {
    xnn_log_error("failed to allocate %zu bytes for opdata descriptors",
                  sizeof(struct xnn_operator_data) * (size_t) subgraph->num_nodes);
    goto error;
  }
  runtime->num_ops = subgraph->num_nodes;

  if (flags & XNN_FLAG_YIELD_WORKERS) {
    struct xnn_node* last_valid_node = NULL;
    for (size_t i = 0; i < subgraph->num_nodes; i++) {
      struct xnn_node* node = subgraph->nodes + i;
      if (node->type != xnn_node_type_invalid) {
        last_valid_node = node;
      }
    }
    if (last_valid_node != NULL) {
      last_valid_node->flags |= XNN_FLAG_YIELD_WORKERS;
    }
  }

  struct xnn_code_cache* code_cache = NULL;
#if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
  code_cache = &runtime->code_cache;
  status = xnn_init_code_cache(code_cache);
  if (status != xnn_status_success) {
    goto error;
  }
#endif
  const struct xnn_caches caches = {
    .code_cache = code_cache,
    .weights_cache = weights_cache,
  };

  struct xnn_value* values = subgraph->values;
  for (size_t i = 0; i < subgraph->num_nodes; i++) {
    const struct xnn_node* node = subgraph->nodes + i;

    // Ignore fused nodes.
    if (node->type != xnn_node_type_invalid) {
      assert(node->create != NULL);
      status = node->create(node, values, subgraph->num_values, runtime->opdata + i, &caches);
      if (status != xnn_status_success) {
        goto error;
      }
      runtime->opdata[i].setup = node->setup;
    }
  }

#if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
  xnn_finalize_code_memory(&code_cache->cache.code);
#endif

  runtime->blobs = xnn_allocate_zero_memory(sizeof(struct xnn_blob) * subgraph->num_values);
  if (runtime->blobs == NULL) {
    xnn_log_error("failed to allocate %zu bytes for blob descriptors",
                  sizeof(struct xnn_blob) * (size_t) subgraph->num_values);
    goto error;
  }
  runtime->num_blobs = subgraph->num_values;

  struct xnn_value_allocation_tracker mem_alloc_tracker;
  xnn_init_value_allocation_tracker(&mem_alloc_tracker, subgraph);

  for (uint32_t i = 0; i < subgraph->num_values; i++) {
    struct xnn_value* value = &subgraph->values[i];
    struct xnn_blob* blob = &runtime->blobs[i];
    if (value->datatype != xnn_datatype_invalid && value->type == xnn_value_type_dense_tensor) {
      blob->size = xnn_tensor_get_size(subgraph, i);
      blob->data = (void*) (uintptr_t) value->data;
      if (blob->data == NULL) {
        if (xnn_value_is_external(value)) {
          // Value is non-static and external to the runtime: it must be specified via a call to xnn_setup_runtime.
          blob->allocation_type = xnn_allocation_type_external;
        } else {
          // Value is purely internal to the runtime, and must be allocated in its workspace.
          xnn_add_value_allocation_tracker(&mem_alloc_tracker, i, round_up_po2(blob->size, XNN_EXTRA_BYTES));
          blob->allocation_type = xnn_allocation_type_workspace;
        }
      } else {
        blob->allocation_type = xnn_allocation_type_static;
      }
    }
  }
  optimize_tensor_allocation_for_in_place_operations(&mem_alloc_tracker, subgraph);
  xnn_plan_value_allocation_tracker(&mem_alloc_tracker);

  xnn_retain_workspace(workspace);
  runtime->workspace = workspace;
  runtime->next_workspace_user = runtime->workspace->first_user;
  runtime->workspace->first_user = runtime;

  status = initialize_workspace_blobs(subgraph, runtime, &mem_alloc_tracker);
  if (status != xnn_status_success) {
    xnn_release_value_allocation_tracker(&mem_alloc_tracker);
    goto error;
  }

  if (flags & XNN_FLAG_BASIC_PROFILING) {
    runtime->profiling = true;
  }

  xnn_release_value_allocation_tracker(&mem_alloc_tracker);

  runtime->threadpool = threadpool;

  *runtime_out = runtime;
  return xnn_status_success;

error:
  xnn_delete_runtime(runtime);
  return status;
}
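
// Example (illustrative sketch): xnn_create_runtime_v4 is what lets several
// runtimes share one internal-tensor arena. subgraph_a, subgraph_b, and
// threadpool are hypothetical caller-owned objects.
//
//   xnn_workspace_t workspace = NULL;
//   xnn_create_workspace(&workspace);
//   xnn_runtime_t runtime_a = NULL;
//   xnn_runtime_t runtime_b = NULL;
//   xnn_create_runtime_v4(subgraph_a, NULL, workspace, threadpool, 0, &runtime_a);
//   xnn_create_runtime_v4(subgraph_b, NULL, workspace, threadpool, 0, &runtime_b);
//   xnn_release_workspace(workspace);  // Both runtimes keep it alive.
//
// The workspace grows to the larger of the two arenas, and blob pointers in
// the sibling runtime are rebased by initialize_workspace_blobs above.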

enum xnn_status xnn_setup_runtime(
  xnn_runtime_t runtime,
  size_t num_external_values,
  const struct xnn_external_value* external_values)
{
  // Validate inputs without changing internal state.
  // This ensures that the runtime stays in a consistent state if validation fails midway.
  for (size_t i = 0; i < num_external_values; i++) {
    const struct xnn_external_value* external_value = &external_values[i];
    const uint32_t value_id = external_value->id;
    if (value_id >= runtime->num_blobs) {
      xnn_log_error("failed to setup runtime: out-of-bounds ID %" PRIu32 " in external value #%zu",
                    value_id, i);
      return xnn_status_invalid_parameter;
    }

    const struct xnn_blob* blob = &runtime->blobs[value_id];
    if (blob->allocation_type != xnn_allocation_type_external) {
      xnn_log_error("failed to setup runtime: Value %" PRIu32 " is not external", value_id);
      return xnn_status_invalid_parameter;
    }
  }

  // Apply runtime state changes.
  for (size_t i = 0; i < num_external_values; i++) {
    const struct xnn_external_value* external_value = &external_values[i];
    const uint32_t value_id = external_value->id;
    struct xnn_blob* blob = &runtime->blobs[value_id];
    blob->data = external_value->data;
  }

  for (size_t i = 0; i < runtime->num_ops; i++) {
    const struct xnn_operator_data* opdata = &runtime->opdata[i];
    if (opdata->operator_objects[0] == NULL) {
      // Operator was removed during optimization.
      continue;
    }

    // Ensure that the weights cache is finalized.
    struct xnn_weights_cache* weights_cache = opdata->operator_objects[0]->weights_cache;
    if (weights_cache != NULL && !xnn_weights_cache_is_finalized(weights_cache)) {
      xnn_log_error("weights cache must be finalized before setup/inference");
      return xnn_status_invalid_state;
    }

    assert(opdata->setup != NULL);
    const enum xnn_status status = opdata->setup(opdata, runtime->blobs, runtime->num_blobs, runtime->threadpool);
    if (status != xnn_status_success) {
      xnn_log_error("failed to setup runtime: error in operator #%zu", i);
      return status;
    }
  }

  return xnn_status_success;
}
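
// Example (illustrative sketch): binding external tensors before inference.
// input_id and output_id are hypothetical value IDs recorded when the subgraph
// was built; input_data and output_data are caller-owned buffers.
//
//   const struct xnn_external_value externals[] = {
//     {.id = input_id, .data = input_data},
//     {.id = output_id, .data = output_data},
//   };
//   enum xnn_status status = xnn_setup_runtime(runtime, 2, externals);
//   if (status == xnn_status_success) {
//     status = xnn_invoke_runtime(runtime);
//   }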

static xnn_timestamp xnn_read_timer() {
  xnn_timestamp timestamp;
#ifdef __MACH__
  timestamp = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
  if (timestamp == 0) {
    xnn_log_warning("clock_gettime failed: error code %d", errno);
  }
#elif __EMSCRIPTEN__
  timestamp = emscripten_get_now();
#elif XNN_PLATFORM_WINDOWS
  BOOL res = QueryPerformanceCounter(&timestamp);
  if (!res) {
    xnn_log_error("QueryPerformanceCounter failed: error code %u", GetLastError());
    memset(&timestamp, 0, sizeof(timestamp));
  }
#else
  int res = clock_gettime(CLOCK_MONOTONIC, &timestamp);
  if (res != 0) {
    xnn_log_error("clock_gettime failed: error code %d", errno);
    memset(&timestamp, 0, sizeof(timestamp));
  }
#endif
  return timestamp;
}

static inline uint64_t xnn_get_elapsed_time(const xnn_timestamp* start, const xnn_timestamp* end) {
#ifdef __MACH__
  // Timestamps are in nanoseconds; convert to microseconds.
  const uint64_t kNanosInMicro = 1000;
  return (*end - *start) / kNanosInMicro;
#elif __EMSCRIPTEN__
  // Timestamps are in milliseconds; convert to microseconds.
  const double kMicrosInMilli = 1.0e3;
  return (uint64_t) ((*end - *start) * kMicrosInMilli);
#elif XNN_PLATFORM_WINDOWS
  const uint64_t kMicrosInSec = 1000 * 1000;
  LARGE_INTEGER frequency;
  BOOL res = QueryPerformanceFrequency(&frequency);
  if (!res) {
    xnn_log_error("QueryPerformanceFrequency failed: error code %u", GetLastError());
    return 0;
  }
  return ((end->QuadPart - start->QuadPart) * kMicrosInSec) / frequency.QuadPart;
#else
  const uint64_t kNanosInMicro = UINT64_C(1000);
  const uint64_t kNanosInSec = UINT64_C(1000000000);
  const uint64_t secs = (end->tv_sec - start->tv_sec) * kNanosInSec;
  const uint64_t ns_secs = (end->tv_nsec - start->tv_nsec);
  return (secs + ns_secs) / kNanosInMicro;
#endif
}
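
// Worked example (hypothetical values) for the POSIX branch above: with
// start = {.tv_sec = 5, .tv_nsec = 900000000} and
// end   = {.tv_sec = 6, .tv_nsec = 100000000},
// secs = 1 * 1000000000, and the nanosecond difference is -800000000, which
// wraps modulo 2^64 when converted to uint64_t; the sum is still the correct
// 200000000 ns, and dividing by 1000 yields 200000 us of elapsed time.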

enum xnn_status xnn_get_runtime_profiling_info(xnn_runtime_t runtime,
                                               enum xnn_profile_info param_name,
                                               size_t param_value_size,
                                               void* param_value,
                                               size_t* param_value_size_ret)
{
  if (!runtime->profiling) {
    return xnn_status_invalid_state;
  }
  enum xnn_status status = xnn_status_success;
  size_t required_size = 0;
  const struct xnn_operator_data* opdata = runtime->opdata;
  switch (param_name) {
    case xnn_profile_info_num_operators:
      required_size = sizeof(size_t);
      if (param_value_size < required_size) {
        *param_value_size_ret = required_size;
        status = xnn_status_out_of_memory;
      } else {
        size_t num_valid_ops = 0;
        for (size_t i = 0; i < runtime->num_ops; ++i) {
          if (opdata[i].operator_objects[0] != NULL) {
            num_valid_ops += 1;
          }
        }
        memcpy(param_value, &num_valid_ops, required_size);
      }
      break;
    case xnn_profile_info_operator_name:
      for (size_t i = 0; i < runtime->num_ops; ++i) {
        if (opdata[i].operator_objects[0] != NULL) {
          const char* op_name = xnn_operator_type_to_string(opdata[i].operator_objects[0]->type);
          size_t op_name_len = strlen(op_name) + 1;
          if (opdata[i].operator_objects[0]->ukernel.type != xnn_ukernel_type_default) {
            op_name_len += strlen(xnn_ukernel_type_to_string(opdata[i].operator_objects[0]->ukernel.type)) + 1;
          }
          required_size += op_name_len;
        }
      }
      if (param_value_size < required_size) {
        *param_value_size_ret = required_size;
        status = xnn_status_out_of_memory;
      } else {
        char* name_out = (char*) param_value;
        for (size_t i = 0; i < runtime->num_ops; ++i) {
          if (opdata[i].operator_objects[0] != NULL) {
            const char* op_name = xnn_operator_type_to_string(opdata[i].operator_objects[0]->type);
            size_t op_name_len = strlen(op_name) + 1;
            if (opdata[i].operator_objects[0]->ukernel.type != xnn_ukernel_type_default) {
              const char* ukernel_type = xnn_ukernel_type_to_string(opdata[i].operator_objects[0]->ukernel.type);
              op_name_len += strlen(ukernel_type) + 1;
              snprintf(name_out, op_name_len, "%s %s", op_name, ukernel_type);
            } else {
              snprintf(name_out, op_name_len, "%s", op_name);
            }
            name_out += op_name_len;
          }
        }
      }
      break;
    case xnn_profile_info_operator_timing:
    {
      size_t num_valid_ops = 0;
      for (size_t i = 0; i < runtime->num_ops; ++i) {
        if (opdata[i].operator_objects[0] != NULL) {
          num_valid_ops += 1;
        }
      }
      required_size = num_valid_ops * sizeof(uint64_t);
      if (param_value_size < required_size) {
        *param_value_size_ret = required_size;
        status = xnn_status_out_of_memory;
      } else {
        xnn_timestamp previous_ts = runtime->start_ts;
        uint64_t* data = (uint64_t*) param_value;
        for (size_t i = 0; i < runtime->num_ops; ++i) {
          if (opdata[i].operator_objects[0] != NULL) {
            uint64_t op_time = 0;
            for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
              if (opdata[i].operator_objects[j] != NULL) {
                op_time += xnn_get_elapsed_time(&previous_ts, &opdata[i].end_ts[j]);
                previous_ts = opdata[i].end_ts[j];
              }
            }
            *data++ = op_time;
          }
        }
      }
      break;
    }
    default:
      status = xnn_status_invalid_parameter;
  }
  return status;
}
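
// Example (illustrative sketch): the usual two-call pattern for this query API.
// The first call undersizes the buffer on purpose, reads the required size
// back, and the second call fetches the data. Assumes the runtime was created
// with XNN_FLAG_BASIC_PROFILING and has been invoked at least once.
//
//   size_t required = 0;
//   xnn_get_runtime_profiling_info(runtime, xnn_profile_info_operator_timing,
//                                  0, NULL, &required);  // Returns out_of_memory.
//   uint64_t* timings_us = malloc(required);
//   xnn_get_runtime_profiling_info(runtime, xnn_profile_info_operator_timing,
//                                  required, timings_us, &required);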

enum xnn_status xnn_invoke_runtime(
  xnn_runtime_t runtime)
{
  if (runtime->profiling) {
    runtime->start_ts = xnn_read_timer();
  }
  for (size_t i = 0; i < runtime->num_ops; i++) {
    for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
      if (runtime->opdata[i].operator_objects[j] == NULL) {
        // Operator was removed after fusion.
        continue;
      }

      const enum xnn_status status = xnn_run_operator(runtime->opdata[i].operator_objects[j], runtime->threadpool);
      if (status != xnn_status_success) {
        return status;
      }
      if (runtime->profiling) {
        runtime->opdata[i].end_ts[j] = xnn_read_timer();
      }
    }
  }
  return xnn_status_success;
}

enum xnn_status xnn_delete_runtime(
  xnn_runtime_t runtime)
{
  if (runtime != NULL) {
    if (runtime->opdata != NULL) {
      for (size_t i = 0; i < runtime->num_ops; i++) {
        for (size_t j = 0; j < XNN_MAX_OPERATOR_OBJECTS; j++) {
          xnn_delete_operator(runtime->opdata[i].operator_objects[j]);
        }
      }
      xnn_release_memory(runtime->opdata);

      xnn_release_memory(runtime->blobs);
      if (runtime->workspace != NULL) {
        // Remove this runtime from the workspace's list of users.
        assert(runtime->workspace->first_user != NULL);
        if (runtime->workspace->first_user == runtime) {
          runtime->workspace->first_user = runtime->next_workspace_user;
        } else {
          xnn_runtime_t prev = runtime->workspace->first_user;
          xnn_runtime_t curr = prev->next_workspace_user;
          while (curr != runtime) {
            prev = curr;
            curr = curr->next_workspace_user;
          }
          assert(curr == runtime);
          prev->next_workspace_user = curr->next_workspace_user;
        }
        xnn_release_workspace(runtime->workspace);
      }
    }
#if XNN_PLATFORM_JIT && XNN_ENABLE_JIT
    xnn_release_code_cache(&runtime->code_cache);
#endif
    xnn_release_memory(runtime);
  }
  return xnn_status_success;
}