/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
#define TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include <tuple>
#include <unordered_map>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/gpu_utils.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

// Gets the DNN workspace limit from the environment variable, which is
// specified in MB. Returns the workspace memory limit in bytes. If no value
// is set, returns the default value.
int64_t GetDnnWorkspaceLimit(const string& envvar_in_mb,
                             int64_t default_value_in_bytes);

// Gets the DNN workspace limit from TF_CUDNN_WORKSPACE_LIMIT_IN_MB, or the
// default value if the environment variable is not set.
int64_t GetDnnWorkspaceLimitOrDefault();
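
// The definition lives in the corresponding .cc file; as a rough sketch (not
// the authoritative implementation), the lookup can be thought of as parsing
// the variable as an MB count and scaling it to bytes:
//
//   int64_t GetDnnWorkspaceLimit(const string& envvar_in_mb,
//                                int64_t default_value_in_bytes) {
//     const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str());
//     if (workspace_limit_in_mb_str != nullptr) {
//       int64_t scratch_limit_in_mb = -1;
//       if (strings::safe_strto64(workspace_limit_in_mb_str,
//                                 &scratch_limit_in_mb)) {
//         return scratch_limit_in_mb * (1 << 20);  // MB -> bytes.
//       }
//     }
//     return default_value_in_bytes;
//   }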

// A class that provides a scratch-space allocator for StreamExecutor cuDNN
// callbacks. TensorFlow is responsible for releasing the temporary buffers
// after the kernel finishes.
class DnnScratchAllocator : public se::ScratchAllocator {
 public:
  virtual ~DnnScratchAllocator() {}
  DnnScratchAllocator(int64_t memory_limit, OpKernelContext* context)
      : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
  int64 GetMemoryLimitInBytes() override { return memory_limit_; }
  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
      int64_t byte_size) override {
    Tensor temporary_memory;
    if (byte_size < 0) {
      return se::port::Status{se::port::error::INVALID_ARGUMENT,
                              "Requested negative byte size!"};
    }
    if (byte_size > memory_limit_) {
      return se::port::Status{
          se::port::error::UNAVAILABLE,
          absl::StrCat("Requested memory size (", byte_size,
                       ") exceeds the max memory limit (", memory_limit_,
                       ").")};
    }
    AllocationAttributes allocation_attr;
    allocation_attr.retry_on_failure = false;
    Status allocation_status(context_->allocate_temp(
        DT_UINT8, TensorShape({byte_size}), &temporary_memory,
        AllocatorAttributes(), allocation_attr));
    if (!allocation_status.ok()) {
      return se::port::Status{
          se::port::error::UNAVAILABLE,
          absl::StrCat("Failed to allocate the requested memory size (",
                       byte_size, ").")};
    }
    // Hold references to the allocated tensors until the allocator is
    // destroyed.
    allocated_tensors_.push_back(temporary_memory);
    total_byte_size_ += byte_size;
    return se::port::StatusOr<se::DeviceMemory<uint8>>(
        AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                       temporary_memory.flat<uint8>().size()));
  }
  int64 TotalByteSize() { return total_byte_size_; }

 private:
  int64 memory_limit_;
  int64 total_byte_size_;
  OpKernelContext* context_;
  std::vector<Tensor> allocated_tensors_;
};
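
// Example usage (a sketch; assumes an OpKernelContext* ctx inside a GPU
// kernel's Compute() and a StreamExecutor convolution call that accepts an
// se::ScratchAllocator*):
//
//   DnnScratchAllocator scratch_allocator(GetDnnWorkspaceLimitOrDefault(),
//                                         ctx);
//   // Pass &scratch_allocator to the cuDNN call; any temporary buffers it
//   // hands out stay alive until scratch_allocator is destroyed at the end
//   // of Compute().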

typedef Eigen::GpuDevice GPUDevice;

// Select an algorithm for the given convolution, either by running actual
// autotuning with a cache, or by falling back to a default if
// 'cudnn_use_autotune' is false.
template <typename T>
StatusOr<AutotuneEntry<se::dnn::FusedConvOp>> AutotuneFusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
        autotune_map,
    const ConvParameters& params, OpKernelContext* ctx,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& bias_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::ActivationMode activation_mode, double conv_input_scale,
    double side_input_scale, double leakyrelu_alpha,
    se::DeviceMemory<T> input_ptr, se::DeviceMemory<T> filter_ptr,
    se::DeviceMemory<T> output_ptr, se::DeviceMemory<T> bias_ptr,
    se::DeviceMemory<T> side_input_ptr, int64_t scratch_size);

template <typename T>
StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<T> input_ptr, const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<T> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc, se::DeviceMemory<T> output_ptr,
    int64_t scratch_size_limit);
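
// Example flow for an unfused forward convolution (a sketch; descriptor and
// ConvParameters construction are omitted, and 'stream' is assumed to come
// from the kernel's device context). LaunchAutotunedConv is defined below:
//
//   TF_ASSIGN_OR_RETURN(
//       auto autotune_entry,
//       AutotuneUnfusedConv(cudnn_use_autotune, autotune_map, conv_parameters,
//                           ctx, se::dnn::ConvolutionKind::FORWARD, input_desc,
//                           input_ptr, filter_desc, filter_ptr, conv_desc,
//                           output_desc, output_ptr, scratch_size_limit));
//   DnnScratchAllocator scratch_allocator(scratch_size_limit, ctx);
//   TF_RETURN_IF_ERROR(LaunchAutotunedConv(
//       autotune_entry, &scratch_allocator, se::dnn::ConvolutionKind::FORWARD,
//       stream, input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
//       output_desc, output_ptr));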

// Returns a pointer to 'primary' and its allocated scratch memory if the
// scratch allocation succeeds; otherwise returns 'no_scratch_fallback' and a
// null 'DeviceMemoryBase'.
template <typename Sig>
StatusOr<std::tuple<const se::dnn::OpRunner<Sig>*, se::DeviceMemoryBase>>
AllocateScratchOrFallback(se::ScratchAllocator* scratch_allocator,
                          const se::dnn::OpRunner<Sig>* primary,
                          const se::dnn::OpRunner<Sig>* no_scratch_fallback) {
  const se::dnn::OpRunner<Sig>* selected_runner = primary;

  auto workspace_size = selected_runner->GetWorkspaceSize();

  se::DeviceMemoryBase scratch_memory;
  if (workspace_size > 0) {
    auto scratch_or = scratch_allocator->AllocateBytes(workspace_size);
    if (scratch_or.ok()) {
      scratch_memory = scratch_or.value();
    } else if ((selected_runner = no_scratch_fallback)) {
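      // Note the assignment inside the else-if condition above: when scratch
      // allocation fails and a fallback runner was provided, we switch to it
      // and then verify that it truly needs no scratch space.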
      if (selected_runner->GetWorkspaceSize() > 0) {
        return errors::Internal(
            "No-scratch fallback runner requires nonzero scratch space");
      }
    } else {
      return errors::Unknown(
          "CUDNN failed to allocate the scratch space for the runner or to "
          "find a working no-scratch runner.");
    }
  }

  return std::make_tuple(selected_runner, scratch_memory);
}

template <typename T>
Status LaunchAutotunedConv(const AutotuneEntry<se::dnn::ConvOp>& autotune_entry,
                           DnnScratchAllocator* scratch_allocator,
                           se::dnn::ConvolutionKind kind, se::Stream* stream,
                           const se::dnn::BatchDescriptor& input_desc,
                           se::DeviceMemory<T> in_ptr,
                           const se::dnn::FilterDescriptor& filter_desc,
                           se::DeviceMemory<T> filter_ptr,
                           const se::dnn::ConvolutionDescriptor& conv_desc,
                           const se::dnn::BatchDescriptor& output_desc,
                           se::DeviceMemory<T> out_ptr) {
  if (!autotune_entry.is_algorithm_config()) {
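    // Newer execution path: the autotune entry carries pre-selected OpRunners
    // (cuDNN frontend API); build or fetch the cached runner for this config
    // and invoke it directly.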
    const auto& runners = autotune_entry.GetOpRunners();
    se::dnn::DataType element_type = se::dnn::ToDataType<T>::value;
    se::dnn::ConvOp::Config config{kind,       element_type, element_type,
                                   input_desc, filter_desc,  output_desc,
                                   conv_desc};
    TF_ASSIGN_OR_RETURN(auto* primary,
                        runners.primary->GetOrCreateRunner(config, stream));

    const se::dnn::ConvRunner* no_scratch_fallback = nullptr;
    if (runners.no_scratch_fallback) {
      TF_ASSIGN_OR_RETURN(
          no_scratch_fallback,
          runners.no_scratch_fallback->GetOrCreateRunner(config, stream));
    }

    TF_ASSIGN_OR_RETURN(auto runner_and_scratch,
                        AllocateScratchOrFallback<se::dnn::ConvOp::Signature>(
                            scratch_allocator, primary, no_scratch_fallback));
    auto& runner = *std::get<const se::dnn::ConvRunner*>(runner_and_scratch);
    return runner(stream, nullptr,
                  std::get<se::DeviceMemoryBase>(runner_and_scratch), in_ptr,
                  filter_ptr, out_ptr);
  } else {
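    // Legacy execution path: the autotune entry only holds an
    // AlgorithmConfig, so defer to the stream's ConvolveWithAlgorithm.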
    return stream->ConvolveWithAlgorithm(
        kind, input_desc, in_ptr, filter_desc, filter_ptr, output_desc, out_ptr,
        conv_desc, scratch_allocator, autotune_entry.GetAlgorithmConfig(),
        nullptr);
  }
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_