1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /// @example cross_engine_reorder.c |
18 | /// @copybrief cross_engine_reorder_c |
19 | |
20 | /// @page cross_engine_reorder_c Reorder between CPU and GPU engines |
21 | /// This C API example demonstrates programming flow when reordering memory |
22 | /// between CPU and GPU engines. |
23 | /// |
24 | /// @include cross_engine_reorder.c |
25 | |
26 | #include <stdio.h> |
27 | #include <stdlib.h> |
28 | |
29 | #include "oneapi/dnnl/dnnl.h" |
30 | |
31 | #include "example_utils.h" |
32 | |
33 | size_t product(int n_dims, const dnnl_dim_t dims[]) { |
34 | size_t n_elems = 1; |
35 | for (int d = 0; d < n_dims; ++d) { |
36 | n_elems *= (size_t)dims[d]; |
37 | } |
38 | return n_elems; |
39 | } |
40 | |
41 | void fill(dnnl_memory_t mem, int n_dims, const dnnl_dim_t dims[]) { |
42 | const size_t n_elems = product(n_dims, dims); |
43 | float *array = (float *)malloc(n_elems * sizeof(float)); |
44 | if (!array) COMPLAIN_EXAMPLE_ERROR_AND_EXIT("%s" , "malloc returned NULL" ); |
45 | |
46 | for (size_t e = 0; e < n_elems; ++e) { |
47 | array[e] = e % 7 ? 1.0f : -1.0f; |
48 | } |
49 | |
50 | write_to_dnnl_memory(array, mem); |
51 | free(array); |
52 | } |
53 | |
54 | int find_negative(dnnl_memory_t mem, int n_dims, const dnnl_dim_t dims[]) { |
55 | const size_t n_elems = product(n_dims, dims); |
56 | float *array = (float *)malloc(n_elems * sizeof(float)); |
57 | if (!array) COMPLAIN_EXAMPLE_ERROR_AND_EXIT("%s" , "malloc returned NULL" ); |
58 | read_from_dnnl_memory(array, mem); |
59 | |
60 | int negs = 0; |
61 | for (size_t e = 0; e < n_elems; ++e) { |
62 | negs += array[e] < 0.0f; |
63 | } |
64 | |
65 | free(array); |
66 | return negs; |
67 | } |
68 | |
/// Runs the example end to end: fills a memory object created on the CPU
/// engine, reorders it into a memory object created on the GPU engine,
/// applies ReLU in place on the GPU, reorders the result back into the CPU
/// memory object, and verifies that no negative values remain.
///
/// Each oneDNN call that creates or executes an object is wrapped in CHECK,
/// and failed data checks go through COMPLAIN_EXAMPLE_ERROR_AND_EXIT. Both
/// macros are defined outside this file (NOTE(review): presumably in
/// example_utils.h and presumably terminating the process -- confirm).
void cross_engine_reorder() {
    /* One engine per device kind, both requested with index 0.
     * validate_engine_kind is a helper defined outside this file
     * (NOTE(review): presumably verifies the kind is available -- confirm). */
    dnnl_engine_t engine_cpu, engine_gpu;
    CHECK(dnnl_engine_create(&engine_cpu, validate_engine_kind(dnnl_cpu), 0));
    CHECK(dnnl_engine_create(&engine_gpu, validate_engine_kind(dnnl_gpu), 0));

    /* 4-D tensor dimensions shared by the CPU and GPU memory objects. */
    const dnnl_dims_t tz = {2, 16, 1, 1};

    /* The two descriptors are built from identical arguments (same dims,
     * f32 data type, nchw format tag); only the engine of the memory objects
     * created from them differs. */
    dnnl_memory_desc_t m_cpu_md, m_gpu_md;
    CHECK(dnnl_memory_desc_create_with_tag(
            &m_cpu_md, 4, tz, dnnl_f32, dnnl_nchw));
    CHECK(dnnl_memory_desc_create_with_tag(
            &m_gpu_md, 4, tz, dnnl_f32, dnnl_nchw));

    /* DNNL_MEMORY_ALLOCATE: no user buffer is supplied for either object. */
    dnnl_memory_t m_cpu, m_gpu;
    CHECK(dnnl_memory_create(
            &m_cpu, m_cpu_md, engine_cpu, DNNL_MEMORY_ALLOCATE));
    CHECK(dnnl_memory_create(
            &m_gpu, m_gpu_md, engine_gpu, DNNL_MEMORY_ALLOCATE));

    /* Sanity check of the input: fill() writes -1.0f at every index that is
     * a multiple of 7, so at least one negative value must be present. */
    fill(m_cpu, 4, tz);
    if (find_negative(m_cpu, 4, tz) == 0)
        COMPLAIN_EXAMPLE_ERROR_AND_EXIT(
                "%s" , "incorrect data fill, no negative values found" );

    /* reorder cpu -> gpu */
    dnnl_primitive_desc_t r1_pd;
    CHECK(dnnl_reorder_primitive_desc_create(
            &r1_pd, m_cpu_md, engine_cpu, m_gpu_md, engine_gpu, NULL));
    dnnl_primitive_t r1;
    CHECK(dnnl_primitive_create(&r1, r1_pd));

    /* relu gpu */
    /* Source and destination descriptors are both m_gpu_md; the two 0.0f
     * arguments are the eltwise alpha and beta parameters
     * (NOTE(review): per the oneDNN C API signature -- confirm). */
    dnnl_primitive_desc_t relu_pd;
    CHECK(dnnl_eltwise_forward_primitive_desc_create(&relu_pd, engine_gpu,
            dnnl_forward, dnnl_eltwise_relu, m_gpu_md, m_gpu_md, 0.0f, 0.0f,
            NULL));

    dnnl_primitive_t relu;
    CHECK(dnnl_primitive_create(&relu, relu_pd));

    /* reorder gpu -> cpu */
    dnnl_primitive_desc_t r2_pd;
    CHECK(dnnl_reorder_primitive_desc_create(
            &r2_pd, m_gpu_md, engine_gpu, m_cpu_md, engine_cpu, NULL));
    dnnl_primitive_t r2;
    CHECK(dnnl_primitive_create(&r2, r2_pd));

    /* A single stream, created on the GPU engine, is used to execute all
     * three primitives, including both cross-engine reorders. */
    dnnl_stream_t stream_gpu;
    CHECK(dnnl_stream_create(
            &stream_gpu, engine_gpu, dnnl_stream_default_flags));

    /* Step 1: copy the filled CPU data into the GPU memory object. */
    dnnl_exec_arg_t r1_args[] = {{DNNL_ARG_FROM, m_cpu}, {DNNL_ARG_TO, m_gpu}};
    CHECK(dnnl_primitive_execute(r1, stream_gpu, 2, r1_args));

    /* Step 2: ReLU with m_gpu as both source and destination (in place). */
    dnnl_exec_arg_t relu_args[]
            = {{DNNL_ARG_SRC, m_gpu}, {DNNL_ARG_DST, m_gpu}};
    CHECK(dnnl_primitive_execute(relu, stream_gpu, 2, relu_args));

    /* Step 3: copy the ReLU result back, overwriting the CPU memory object. */
    dnnl_exec_arg_t r2_args[] = {{DNNL_ARG_FROM, m_gpu}, {DNNL_ARG_TO, m_cpu}};
    CHECK(dnnl_primitive_execute(r2, stream_gpu, 2, r2_args));

    /* Wait on the stream before m_cpu is read back on the host below. */
    CHECK(dnnl_stream_wait(stream_gpu));

    /* Output check: after ReLU no element of m_cpu may be negative. */
    if (find_negative(m_cpu, 4, tz) != 0)
        COMPLAIN_EXAMPLE_ERROR_AND_EXIT(
                "%s" , "found negative values after ReLU applied" );

    /* clean up */
    /* The statuses returned by the destroy calls are not checked. */
    dnnl_primitive_desc_destroy(relu_pd);
    dnnl_primitive_desc_destroy(r1_pd);
    dnnl_primitive_desc_destroy(r2_pd);

    dnnl_primitive_destroy(relu);
    dnnl_primitive_destroy(r1);
    dnnl_primitive_destroy(r2);
    dnnl_memory_destroy(m_cpu);
    dnnl_memory_destroy(m_gpu);
    dnnl_memory_desc_destroy(m_cpu_md);
    dnnl_memory_desc_destroy(m_gpu_md);

    dnnl_stream_destroy(stream_gpu);

    /* Engines are released last, after every object created from them. */
    dnnl_engine_destroy(engine_cpu);
    dnnl_engine_destroy(engine_gpu);
}
154 | |
/// Program entry point: runs the cross-engine reorder example and, once it
/// returns, prints a success message to standard output.
///
/// @returns Always 0 when this point is reached.
int main() {
    cross_engine_reorder();

    /* Fixed message with no format specifiers, so a plain string write
     * produces the same output. */
    fputs("Example passed on CPU/GPU.\n", stdout);
    return 0;
}
160 | |