1 | /******************************************************************************* |
2 | * Copyright 2020-2021 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #include "gpu/compute/compute_stream.hpp" |
18 | #include "gpu/compute/compute_engine.hpp" |
19 | #include "gpu/gpu_primitive.hpp" |
20 | |
21 | namespace dnnl { |
22 | namespace impl { |
23 | namespace gpu { |
24 | namespace compute { |
25 | status_t compute_stream_t::zero_pad( |
26 | const memory_t *memory, const exec_ctx_t &ctx) { |
27 | memory_desc_wrapper mdw(memory->md()); |
28 | |
29 | if (mdw.format_kind() != format_kind::blocked) return status::unimplemented; |
30 | |
31 | if (mdw.nelems(false) == mdw.nelems(true)) return status::success; |
32 | |
33 | if (!has_zero_pad_primitive()) return stream_t::zero_pad(memory, ctx); |
34 | |
35 | // Kernel only compiled to support data types of length 1, 2, or 4 currently |
36 | if (!utils::one_of(mdw.data_type_size(), 1u, 2u, 4u)) |
37 | return status::unimplemented; |
38 | |
39 | const blocking_desc_t blocking_desc = mdw.blocking_desc(); |
40 | |
41 | const int max_step_nelems = ZERO_PAD_MAX_STEP_SIZE; |
42 | size_t step_nelems = 1; |
43 | for (int i = 0; i < blocking_desc.inner_nblks; i++) { |
44 | step_nelems *= blocking_desc.inner_blks[i]; |
45 | } |
46 | |
47 | assert(step_nelems <= max_step_nelems); |
48 | if (step_nelems > max_step_nelems) return stream_t::zero_pad(memory, ctx); |
49 | |
50 | engine_t *engine = this->engine(); |
51 | |
52 | primitive_t *zero_pad_primitive; |
53 | const resource_mapper_t *mapper; |
54 | CHECK(utils::downcast<compute_engine_t *>(engine)->get_zero_pad_primitive( |
55 | zero_pad_primitive, mapper)); |
56 | |
57 | exec_args_t zero_pad_args; |
58 | memory_arg_t arg = {const_cast<memory_t *>(memory), true}; |
59 | zero_pad_args[DNNL_ARG_SRC] = arg; |
60 | exec_ctx_t zero_pad_ctx(this, std::move(zero_pad_args)); |
61 | zero_pad_ctx.set_resource_mapper(mapper); |
62 | |
63 | // Verbose is implemented separately here since fake primitive descriptor |
64 | // contains only primitive_kind in internal op_desc, but no md. Such design |
65 | // was chosen to avoid re-creation of zeropad primitive in case it lives as |
66 | // a regular one in cache and may be evicted from there. It means that md |
67 | // is available only with incoming memory at execution point here, that's |
68 | // why separate logic is written apart from a common place. |
69 | // XXX: re-consider, once zeropad appears in other places in the library. |
70 | if (get_verbose()) { |
71 | this->wait(); |
72 | double start_ms = get_msec(); |
73 | CHECK(zero_pad_primitive->execute(zero_pad_ctx)); |
74 | status_t status = this->wait(); |
75 | double duration_ms = get_msec() - start_ms; |
76 | std::string stamp; |
77 | if (get_verbose_timestamp()) stamp = "," + std::to_string(start_ms); |
78 | std::string md_fmt_str = md2fmt_str(memory->md()); |
79 | std::string md_dim_str = md2dim_str(memory->md()); |
80 | |
81 | printf("onednn_verbose%s,exec,%s,%s,undef,%s,,,%s,%g\n" , stamp.c_str(), |
82 | "gpu,zero_pad" , zero_pad_primitive->pd()->name(), |
83 | md_fmt_str.c_str(), md_dim_str.c_str(), duration_ms); |
84 | |
85 | return status; |
86 | } else { |
87 | return zero_pad_primitive->execute(zero_pad_ctx); |
88 | } |
89 | }; |
90 | } // namespace compute |
91 | } // namespace gpu |
92 | } // namespace impl |
93 | } // namespace dnnl |
94 | |