/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
#define TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include <tuple>
#include <unordered_map>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/gpu_utils.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

// Get the Dnn workspace limit from the environment variable, which is in MB.
// Return the workspace memory limit in bytes. If no value is set, return the
// default value.
int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
                           int64_t default_value_in_bytes);

// Get the Dnn workspace limit from TF_CUDNN_WORKSPACE_LIMIT_IN_MB, or the
// default if the variable is not set.
int64 GetDnnWorkspaceLimitOrDefault();
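
// Illustrative call (a sketch; the default value shown here is only an
// example, the actual default used by the conv kernels lives in the .cc file
// and may differ):
//
//   int64_t workspace_bytes =
//       GetDnnWorkspaceLimit("TF_CUDNN_WORKSPACE_LIMIT_IN_MB",
//                            /*default_value_in_bytes=*/1LL << 32);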

// A class that provides a scratch-space allocator for StreamExecutor cuDNN
// callbacks. TensorFlow is responsible for releasing the temporary buffers
// after the kernel finishes.
class DnnScratchAllocator : public se::ScratchAllocator {
 public:
  virtual ~DnnScratchAllocator() {}
  DnnScratchAllocator(int64_t memory_limit, OpKernelContext* context)
      : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
  int64 GetMemoryLimitInBytes() override { return memory_limit_; }
  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
      int64_t byte_size) override {
    Tensor temporary_memory;
    if (byte_size < 0) {
      return se::port::Status{se::port::error::INVALID_ARGUMENT,
                              "Requested negative byte size!"};
    }
    if (byte_size > memory_limit_) {
      return se::port::Status{se::port::error::UNAVAILABLE,
                              absl::StrCat("Requested memory size (", byte_size,
                                           ") exceeds the max memory limit (",
                                           memory_limit_, ").")};
    }
    AllocationAttributes allocation_attr;
    allocation_attr.retry_on_failure = false;
    Status allocation_status(context_->allocate_temp(
        DT_UINT8, TensorShape({byte_size}), &temporary_memory,
        AllocatorAttributes(), allocation_attr));
    if (!allocation_status.ok()) {
      return se::port::Status{
          se::port::error::UNAVAILABLE,
          absl::StrCat("Failed to allocate the requested memory size (",
                       byte_size, ").")};
    }
    // Hold a reference to the allocated tensors until the allocator is
    // destroyed.
    allocated_tensors_.push_back(temporary_memory);
    total_byte_size_ += byte_size;
    return se::port::StatusOr<se::DeviceMemory<uint8>>(
        AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                       temporary_memory.flat<uint8>().size()));
  }
  int64 TotalByteSize() { return total_byte_size_; }

 private:
  int64 memory_limit_;
  int64 total_byte_size_;
  OpKernelContext* context_;
  std::vector<Tensor> allocated_tensors_;
};
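
// Illustrative use (a sketch, not a call site from this header; `context` is
// the OpKernelContext of the launching kernel):
//
//   DnnScratchAllocator scratch_allocator(GetDnnWorkspaceLimitOrDefault(),
//                                         context);
//   auto scratch_or = scratch_allocator.AllocateBytes(/*byte_size=*/1 << 20);
//   if (scratch_or.ok()) {
//     se::DeviceMemory<uint8> scratch = scratch_or.value();
//     // ... hand `scratch` to a StreamExecutor DNN call ...
//   }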

typedef Eigen::GpuDevice GPUDevice;

// Select an algorithm for the given convolution, either by running actual
// autotuning with a cache, or by falling back to a default if
// 'cudnn_use_autotune' is false.
template <typename T>
StatusOr<AutotuneEntry<se::dnn::FusedConvOp>> AutotuneFusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
        autotune_map,
    const ConvParameters& params, OpKernelContext* ctx,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& bias_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::ActivationMode activation_mode, double conv_input_scale,
    double side_input_scale, double leakyrelu_alpha,
    se::DeviceMemory<T> input_ptr, se::DeviceMemory<T> filter_ptr,
    se::DeviceMemory<T> output_ptr, se::DeviceMemory<T> bias_ptr,
    se::DeviceMemory<T> side_input_ptr, int64_t scratch_size);

template <typename T>
StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<T> input_ptr, const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<T> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc, se::DeviceMemory<T> output_ptr,
    int64_t scratch_size_limit);
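
// A sketch of how a convolution kernel might consult the autotune cache
// (placeholder names such as `autotune_map`, `params` and the descriptor and
// pointer variables are illustrative, not declared in this header):
//
//   TF_ASSIGN_OR_RETURN(
//       auto autotune_entry,
//       AutotuneUnfusedConv(cudnn_use_autotune, autotune_map, params, ctx,
//                           se::dnn::ConvolutionKind::FORWARD, input_desc,
//                           input_ptr, filter_desc, filter_ptr, conv_desc,
//                           output_desc, output_ptr, scratch_size_limit));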

// Returns a pointer to the 'primary' OpRunner and its allocated scratch
// memory, if the scratch can be allocated; otherwise a pointer to the
// 'no_scratch_fallback' runner and a null 'DeviceMemoryBase'.
template <typename Sig>
StatusOr<std::tuple<const se::dnn::OpRunner<Sig>*, se::DeviceMemoryBase>>
AllocateScratchOrFallback(se::ScratchAllocator* scratch_allocator,
                          const se::dnn::OpRunner<Sig>* primary,
                          const se::dnn::OpRunner<Sig>* no_scratch_fallback) {
  const se::dnn::OpRunner<Sig>* selected_runner = primary;

  auto workspace_size = selected_runner->GetWorkspaceSize();

  se::DeviceMemoryBase scratch_memory;
  if (workspace_size > 0) {
    auto scratch_or = scratch_allocator->AllocateBytes(workspace_size);
    if (scratch_or.ok()) {
      scratch_memory = scratch_or.value();
    } else if ((selected_runner = no_scratch_fallback)) {
      if (selected_runner->GetWorkspaceSize() > 0) {
        return errors::Internal(
            "No-scratch fallback runner requires nonzero scratch space");
      }
    } else {
      return errors::Unknown(
          "CUDNN failed to allocate the scratch space for the runner or to "
          "find a working no-scratch runner.");
    }
  }

  return std::make_tuple(selected_runner, scratch_memory);
}
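
// This helper is shared by the fused and unfused paths; e.g. a fused-conv
// kernel would instantiate it roughly as below (a sketch; `scratch_allocator`,
// `primary` and `fallback` are placeholders):
//
//   TF_ASSIGN_OR_RETURN(
//       auto runner_and_scratch,
//       AllocateScratchOrFallback<se::dnn::FusedConvOp::Signature>(
//           &scratch_allocator, primary, fallback));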

template <typename T>
Status LaunchAutotunedConv(const AutotuneEntry<se::dnn::ConvOp>& autotune_entry,
                           DnnScratchAllocator* scratch_allocator,
                           se::dnn::ConvolutionKind kind, se::Stream* stream,
                           const se::dnn::BatchDescriptor& input_desc,
                           se::DeviceMemory<T> in_ptr,
                           const se::dnn::FilterDescriptor& filter_desc,
                           se::DeviceMemory<T> filter_ptr,
                           const se::dnn::ConvolutionDescriptor& conv_desc,
                           const se::dnn::BatchDescriptor& output_desc,
                           se::DeviceMemory<T> out_ptr) {
  if (!autotune_entry.is_algorithm_config()) {
    const auto& runners = autotune_entry.GetOpRunners();
    se::dnn::DataType element_type = se::dnn::ToDataType<T>::value;
    se::dnn::ConvOp::Config config{kind, element_type, element_type,
                                   input_desc, filter_desc, output_desc,
                                   conv_desc};
    TF_ASSIGN_OR_RETURN(auto* primary,
                        runners.primary->GetOrCreateRunner(config, stream));

    const se::dnn::ConvRunner* no_scratch_fallback = nullptr;
    if (runners.no_scratch_fallback) {
      TF_ASSIGN_OR_RETURN(
          no_scratch_fallback,
          runners.no_scratch_fallback->GetOrCreateRunner(config, stream));
    }

    TF_ASSIGN_OR_RETURN(auto runner_and_scratch,
                        AllocateScratchOrFallback<se::dnn::ConvOp::Signature>(
                            scratch_allocator, primary, no_scratch_fallback));
    auto& runner = *std::get<const se::dnn::ConvRunner*>(runner_and_scratch);
    return runner(stream, nullptr,
                  std::get<se::DeviceMemoryBase>(runner_and_scratch), in_ptr,
                  filter_ptr, out_ptr);
  } else {
    return stream->ConvolveWithAlgorithm(
        kind, input_desc, in_ptr, filter_desc, filter_ptr, output_desc, out_ptr,
        conv_desc, scratch_allocator, autotune_entry.GetAlgorithmConfig(),
        nullptr);
  }
}
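
// Continuing the sketch above: once an AutotuneEntry has been obtained from
// AutotuneUnfusedConv, a kernel dispatches it roughly like this (again with
// placeholder names):
//
//   DnnScratchAllocator scratch_allocator(scratch_size_limit, ctx);
//   TF_RETURN_IF_ERROR(LaunchAutotunedConv(
//       autotune_entry, &scratch_allocator, se::dnn::ConvolutionKind::FORWARD,
//       stream, input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
//       output_desc, output_ptr));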

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_