1/*******************************************************************************
2 * Copyright 2020-2021 Intel Corporation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *******************************************************************************/
16
17#include "gpu/compute/compute_stream.hpp"
18#include "gpu/compute/compute_engine.hpp"
19#include "gpu/gpu_primitive.hpp"
20
21namespace dnnl {
22namespace impl {
23namespace gpu {
24namespace compute {
25status_t compute_stream_t::zero_pad(
26 const memory_t *memory, const exec_ctx_t &ctx) {
27 memory_desc_wrapper mdw(memory->md());
28
29 if (mdw.format_kind() != format_kind::blocked) return status::unimplemented;
30
31 if (mdw.nelems(false) == mdw.nelems(true)) return status::success;
32
33 if (!has_zero_pad_primitive()) return stream_t::zero_pad(memory, ctx);
34
35 // Kernel only compiled to support data types of length 1, 2, or 4 currently
36 if (!utils::one_of(mdw.data_type_size(), 1u, 2u, 4u))
37 return status::unimplemented;
38
39 const blocking_desc_t blocking_desc = mdw.blocking_desc();
40
41 const int max_step_nelems = ZERO_PAD_MAX_STEP_SIZE;
42 size_t step_nelems = 1;
43 for (int i = 0; i < blocking_desc.inner_nblks; i++) {
44 step_nelems *= blocking_desc.inner_blks[i];
45 }
46
47 assert(step_nelems <= max_step_nelems);
48 if (step_nelems > max_step_nelems) return stream_t::zero_pad(memory, ctx);
49
50 engine_t *engine = this->engine();
51
52 primitive_t *zero_pad_primitive;
53 const resource_mapper_t *mapper;
54 CHECK(utils::downcast<compute_engine_t *>(engine)->get_zero_pad_primitive(
55 zero_pad_primitive, mapper));
56
57 exec_args_t zero_pad_args;
58 memory_arg_t arg = {const_cast<memory_t *>(memory), true};
59 zero_pad_args[DNNL_ARG_SRC] = arg;
60 exec_ctx_t zero_pad_ctx(this, std::move(zero_pad_args));
61 zero_pad_ctx.set_resource_mapper(mapper);
62
63 // Verbose is implemented separately here since fake primitive descriptor
64 // contains only primitive_kind in internal op_desc, but no md. Such design
65 // was chosen to avoid re-creation of zeropad primitive in case it lives as
66 // a regular one in cache and may be evicted from there. It means that md
67 // is available only with incoming memory at execution point here, that's
68 // why separate logic is written apart from a common place.
69 // XXX: re-consider, once zeropad appears in other places in the library.
70 if (get_verbose()) {
71 this->wait();
72 double start_ms = get_msec();
73 CHECK(zero_pad_primitive->execute(zero_pad_ctx));
74 status_t status = this->wait();
75 double duration_ms = get_msec() - start_ms;
76 std::string stamp;
77 if (get_verbose_timestamp()) stamp = "," + std::to_string(start_ms);
78 std::string md_fmt_str = md2fmt_str(memory->md());
79 std::string md_dim_str = md2dim_str(memory->md());
80
81 printf("onednn_verbose%s,exec,%s,%s,undef,%s,,,%s,%g\n", stamp.c_str(),
82 "gpu,zero_pad", zero_pad_primitive->pd()->name(),
83 md_fmt_str.c_str(), md_dim_str.c_str(), duration_ms);
84
85 return status;
86 } else {
87 return zero_pad_primitive->execute(zero_pad_ctx);
88 }
89};
90} // namespace compute
91} // namespace gpu
92} // namespace impl
93} // namespace dnnl
94