/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
------------------------------------------------------------------------------*/

#include "tensorflow/core/kernels/conv_ops_gpu.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include "tensorflow/core/profiler/lib/scoped_annotation.h"
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/core/util/proto/proto_utils.h"
#include "tensorflow/core/util/use_cudnn.h"

#if GOOGLE_CUDA
#include "tensorflow/compiler/xla/stream_executor/gpu/gpu_asm_opts.h"
#include "tensorflow/compiler/xla/stream_executor/gpu/redzone_allocator.h"
#include "tensorflow/compiler/xla/stream_executor/tf_allocator_adapter.h"
#include "tensorflow/core/kernels/autotune_conv_impl.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

// Finds the best algorithm for the given fused-convolution parameters by
// running the candidate cuDNN algorithms on the stream and measuring their
// execution times.
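//
// Results are cached in `autotune_map`: on a hit the stored entry is returned
// without re-profiling; on a miss every candidate runner is timed and the
// winner is inserted into the map before being returned.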
template <typename T>
StatusOr<AutotuneEntry<se::dnn::FusedConvOp>> AutotuneFusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
        autotune_map,
    const ConvParameters& params, OpKernelContext* ctx,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& bias_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::ActivationMode activation_mode, double conv_scale,
    double side_input_scale, double leakyrelu_alpha,
    se::DeviceMemory<T> input_ptr, se::DeviceMemory<T> filter_ptr,
    se::DeviceMemory<T> output_ptr, se::DeviceMemory<T> bias_ptr,
    se::DeviceMemory<T> side_input_ptr, int64_t scratch_size_limit) {
#if GOOGLE_CUDA
  AutotuneEntry<se::dnn::FusedConvOp> autotune_entry;
  auto* stream = ctx->op_device_context()->stream();

  if (!autotune_map->Find(params, &autotune_entry)) {
    profiler::ScopedAnnotation trace("cudnn_autotuning");

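    // Wrap the output buffer with red zones (best effort) so that
    // out-of-bounds writes by a candidate algorithm can be detected while the
    // candidates are profiled.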
    se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
                                                stream);
    se::RedzoneAllocator rz_allocator(stream, &tf_allocator_adapter,
                                      se::GpuAsmOpts());
    se::DeviceMemory<T> output_ptr_rz(
        WrapRedzoneBestEffort(&rz_allocator, output_ptr));

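    // Collect the fused-convolution runners suggested by cuDNN heuristics;
    // fallback engines are requested further below only if none of these
    // work.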
    std::vector<std::unique_ptr<const se::dnn::FusedConvRunner>> runners;
    auto element_type = se::dnn::ToDataType<T>::value;
    TF_RETURN_IF_ERROR(stream->parent()->GetFusedConvolveRunners(
        CudnnUseFrontend(), se::dnn::ConvolutionKind::FORWARD, element_type,
        element_type, element_type, conv_scale, side_input_scale,
        leakyrelu_alpha, stream, input_desc, filter_desc, bias_desc,
        output_desc, conv_desc, /*use_fallback=*/false, activation_mode,
        &runners));

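    // Runs one candidate: allocates its workspace and executes the fused
    // convolution, recording the timing in `profile_result`.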
    auto launch_func =
        [&](se::ScratchAllocator* allocator_used,
            const std::unique_ptr<const se::dnn::FusedConvRunner>& runner,
            se::dnn::ProfileResult* profile_result) -> Status {
      TF_ASSIGN_OR_RETURN(auto scratch, allocator_used->AllocateBytes(
                                            runner->GetWorkspaceSize()));
      return (*runner)(stream, profile_result, scratch, input_ptr, filter_ptr,
                       side_input_ptr, bias_ptr, output_ptr_rz);
    };

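    // Profile each candidate runner and collect the measurements as
    // AutotuneResult protos.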
    TF_ASSIGN_OR_RETURN(auto results,
                        internal::AutotuneConvImpl(
                            ctx, runners, cudnn_use_autotune, launch_func,
                            scratch_size_limit, rz_allocator));
    // Only log on an AutotuneConv cache miss.
    LogFusedConvForwardAutotuneResults(
        se::dnn::ToDataType<T>::value, input_ptr, filter_ptr, output_ptr,
        bias_ptr, side_input_ptr, input_desc, filter_desc, output_desc,
        conv_desc, conv_scale, side_input_scale, activation_mode,
        stream->parent(), results);

    // Two-level autotuning: the cuDNN frontend supports two engine lists,
    // heuristics and fallback. Heuristics engines are normally faster, so to
    // reduce autotuning time the fallback engines are evaluated only when
    // none of the heuristics engines work.
    bool found_working_engine = false;
    for (auto& result : results) {
      if (!result.has_failure()) {
        found_working_engine = true;
        break;
      }
    }

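    // With the legacy cuDNN API there is no separate fallback list, so the
    // best result is picked directly; with the cuDNN frontend, fall back only
    // when every heuristics engine failed.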
    if (!CudnnUseFrontend() || found_working_engine) {
      TF_ASSIGN_OR_RETURN(autotune_entry,
                          BestCudnnConvAlgorithm<se::dnn::FusedConvOp>(
                              results, std::move(runners)));
    } else {
      LOG(WARNING)
          << "None of the algorithms provided by cuDNN frontend heuristics "
             "worked; trying fallback algorithms. Conv: "
          << params.ToString();
      std::vector<std::unique_ptr<const se::dnn::FusedConvRunner>>
          fallback_runners;
      TF_RETURN_IF_ERROR(stream->parent()->GetFusedConvolveRunners(
          CudnnUseFrontend(), se::dnn::ConvolutionKind::FORWARD, element_type,
          element_type, element_type, conv_scale, side_input_scale,
          leakyrelu_alpha, stream, input_desc, filter_desc, bias_desc,
          output_desc, conv_desc, /*use_fallback=*/true, activation_mode,
          &fallback_runners));

      TF_ASSIGN_OR_RETURN(auto fallback_results,
                          internal::AutotuneConvImpl(
                              ctx, fallback_runners, cudnn_use_autotune,
                              launch_func, scratch_size_limit, rz_allocator));

      LogFusedConvForwardAutotuneResults(
          se::dnn::ToDataType<T>::value, input_ptr, filter_ptr, output_ptr,
          bias_ptr, side_input_ptr, input_desc, filter_desc, output_desc,
          conv_desc, conv_scale, side_input_scale, activation_mode,
          stream->parent(), fallback_results);

      TF_ASSIGN_OR_RETURN(autotune_entry,
                          BestCudnnConvAlgorithm<se::dnn::FusedConvOp>(
                              fallback_results, std::move(fallback_runners)));
    }

    autotune_map->Insert(params, autotune_entry);
  }
  return autotune_entry;
#else
  return errors::Unimplemented(
      "Fused conv not implemented on non-CUDA platforms.");
#endif
}

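// Explicit template instantiations for the element types supported by the
// fused-convolution autotuner (double, float, and Eigen::half).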
template StatusOr<AutotuneEntry<se::dnn::FusedConvOp>>
AutotuneFusedConv<double>(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
        autotune_map,
    const ConvParameters& params, OpKernelContext* ctx,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& bias_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::ActivationMode activation_mode, double conv_scale,
    double side_input_scale, double leakyrelu_alpha,
    se::DeviceMemory<double> input_ptr, se::DeviceMemory<double> filter_ptr,
    se::DeviceMemory<double> output_ptr, se::DeviceMemory<double> bias_ptr,
    se::DeviceMemory<double> side_input_ptr, int64_t scratch_size_limit);

template StatusOr<AutotuneEntry<se::dnn::FusedConvOp>> AutotuneFusedConv<float>(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
        autotune_map,
    const ConvParameters& params, OpKernelContext* ctx,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& bias_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::ActivationMode activation_mode, double conv_scale,
    double side_input_scale, double leakyrelu_alpha,
    se::DeviceMemory<float> input_ptr, se::DeviceMemory<float> filter_ptr,
    se::DeviceMemory<float> output_ptr, se::DeviceMemory<float> bias_ptr,
    se::DeviceMemory<float> side_input_ptr, int64_t scratch_size_limit);

template StatusOr<AutotuneEntry<se::dnn::FusedConvOp>>
AutotuneFusedConv<Eigen::half>(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
        autotune_map,
    const ConvParameters& params, OpKernelContext* ctx,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& bias_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::ActivationMode activation_mode, double conv_scale,
    double side_input_scale, double leakyrelu_alpha,
    se::DeviceMemory<Eigen::half> input_ptr,
    se::DeviceMemory<Eigen::half> filter_ptr,
    se::DeviceMemory<Eigen::half> output_ptr,
    se::DeviceMemory<Eigen::half> bias_ptr,
    se::DeviceMemory<Eigen::half> side_input_ptr, int64_t scratch_size_limit);

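// Finds the best algorithm for an unfused convolution of the given kind
// (forward, backward-data, or backward-filter), caching the result in
// `autotune_map` in the same way as AutotuneFusedConv above.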
template <typename T>
StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<T> input_ptr, const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<T> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc, se::DeviceMemory<T> output_ptr,
    int64_t scratch_size_limit) {
  AutotuneEntry<se::dnn::ConvOp> autotune_entry;

  auto* stream = ctx->op_device_context()->stream();

  if (!autotune_map->Find(conv_parameters, &autotune_entry)) {
    profiler::ScopedAnnotation annotation("cudnn_autotuning");

#if GOOGLE_CUDA
    se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
                                                stream);
    se::RedzoneAllocator rz_allocator(stream, &tf_allocator_adapter,
                                      se::GpuAsmOpts());

    // TODO(awpr): second-guess whether it's okay that this profiles
    // convolutions on uninitialized memory.
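    // Only the buffer that this convolution kind writes to needs red-zone
    // protection: the output for forward convolutions, the input gradient for
    // BACKWARD_DATA, and the filter gradient for BACKWARD_FILTER.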
    switch (kind) {
      case se::dnn::ConvolutionKind::FORWARD:
      case se::dnn::ConvolutionKind::FORWARD_BIAS_ACTIVATION:
        output_ptr = se::DeviceMemory<T>(
            WrapRedzoneBestEffort(&rz_allocator, output_ptr));
        break;
      case se::dnn::ConvolutionKind::BACKWARD_DATA:
        input_ptr = se::DeviceMemory<T>(
            WrapRedzoneBestEffort(&rz_allocator, input_ptr));
        break;
      case se::dnn::ConvolutionKind::BACKWARD_FILTER:
        filter_ptr = se::DeviceMemory<T>(
            WrapRedzoneBestEffort(&rz_allocator, filter_ptr));
        break;
      default:
        return errors::InvalidArgument(
            absl::StrFormat("Unknown ConvolutionKind %d", kind));
    }

    const auto element_type = se::dnn::ToDataType<T>::value;
    std::vector<std::unique_ptr<const se::dnn::ConvRunner>> runners;
    TF_RETURN_IF_ERROR(stream->parent()->GetConvolveRunners(
        CudnnUseFrontend(), kind, element_type, element_type, stream,
        input_desc, input_ptr, filter_desc, filter_ptr, output_desc, output_ptr,
        conv_desc, /*use_fallback=*/false, &rz_allocator, &runners));
    auto launch_func =
        [&](se::ScratchAllocator* allocator_used,
            const std::unique_ptr<const se::dnn::ConvRunner>& runner,
            se::dnn::ProfileResult* profile_result) -> Status {
      TF_ASSIGN_OR_RETURN(auto scratch, allocator_used->AllocateBytes(
                                            runner->GetWorkspaceSize()));
      return (*runner)(stream, profile_result, scratch, input_ptr, filter_ptr,
                       output_ptr);
    };
    TF_ASSIGN_OR_RETURN(auto results,
                        internal::AutotuneConvImpl(
                            ctx, runners, cudnn_use_autotune, launch_func,
                            scratch_size_limit, rz_allocator));

    LogConvAutotuneResults(kind, se::dnn::ToDataType<T>::value, input_ptr,
                           filter_ptr, output_ptr, input_desc, filter_desc,
                           output_desc, conv_desc, stream->parent(), results);

    // Two-level autotuning: the cuDNN frontend supports two engine lists,
    // heuristics and fallback. Heuristics engines are normally faster, so to
    // reduce autotuning time the fallback engines are evaluated only when
    // none of the heuristics engines work.
    bool found_working_engine = false;
    for (auto& result : results) {
      if (!result.has_failure()) {
        found_working_engine = true;
        break;
      }
    }

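    // With the legacy cuDNN API there is no separate fallback list, so the
    // best result is picked directly; with the cuDNN frontend, fall back only
    // when every heuristics engine failed.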
    if (!CudnnUseFrontend() || found_working_engine) {
      TF_ASSIGN_OR_RETURN(
          autotune_entry,
          BestCudnnConvAlgorithm<se::dnn::ConvOp>(results, std::move(runners)));
    } else {
      LOG(WARNING)
          << "None of the algorithms provided by cuDNN frontend heuristics "
             "worked; trying fallback algorithms. Conv: "
          << conv_parameters.ToString();
      std::vector<std::unique_ptr<const se::dnn::ConvRunner>> fallback_runners;
      TF_RETURN_IF_ERROR(stream->parent()->GetConvolveRunners(
          CudnnUseFrontend(), kind, element_type, element_type, stream,
          input_desc, input_ptr, filter_desc, filter_ptr, output_desc,
          output_ptr, conv_desc, /*use_fallback=*/true, &rz_allocator,
          &fallback_runners));

      TF_ASSIGN_OR_RETURN(auto fallback_results,
                          internal::AutotuneConvImpl(
                              ctx, fallback_runners, cudnn_use_autotune,
                              launch_func, scratch_size_limit, rz_allocator));

      LogConvAutotuneResults(kind, se::dnn::ToDataType<T>::value, input_ptr,
                             filter_ptr, output_ptr, input_desc, filter_desc,
                             output_desc, conv_desc, stream->parent(),
                             fallback_results);

      TF_ASSIGN_OR_RETURN(autotune_entry,
                          BestCudnnConvAlgorithm<se::dnn::ConvOp>(
                              fallback_results, std::move(fallback_runners)));
    }

#elif TENSORFLOW_USE_ROCM
    DnnScratchAllocator scratch_allocator(scratch_size_limit, ctx);

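    // On ROCm, query MIOpen for the candidate convolution algorithms; they
    // are profiled below.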
    std::vector<se::dnn::ProfileResult> algorithms;
    if (!stream->parent()->GetMIOpenConvolveAlgorithms(
            kind, se::dnn::ToDataType<T>::value, stream, input_desc, input_ptr,
            filter_desc, filter_ptr, output_desc, output_ptr, conv_desc,
            &scratch_allocator, &algorithms)) {
      return errors::Unknown(
          "Failed to get convolution algorithm. This is probably "
          "because MIOpen failed to initialize, so try looking to "
          "see if a warning log message was printed above.");
    }

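    // A single returned algorithm already carries its measured runtime and
    // scratch size, so it is recorded directly; otherwise each candidate is
    // re-run via ConvolveWithAlgorithm to measure it.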
    std::vector<tensorflow::AutotuneResult> results;
    if (algorithms.size() == 1) {
      auto profile_result = algorithms[0];
      results.emplace_back();
      auto& result = results.back();
      *result.mutable_algorithm() = profile_result.algorithm().ToProto();

      result.set_scratch_bytes(profile_result.scratch_size());
      *result.mutable_run_time() = proto_utils::ToDurationProto(
          absl::Milliseconds(profile_result.elapsed_time_in_ms()));
    } else {
      for (auto miopen_algorithm : algorithms) {
        auto profile_algorithm = miopen_algorithm.algorithm();
        se::dnn::ProfileResult profile_result;
        auto miopen_launch_status = stream->ConvolveWithAlgorithm(
            kind, input_desc, input_ptr, filter_desc, filter_ptr, output_desc,
            output_ptr, conv_desc, &scratch_allocator,
            se::dnn::AlgorithmConfig(profile_algorithm,
                                     miopen_algorithm.scratch_size()),
            &profile_result);
        if (miopen_launch_status.ok() && profile_result.is_valid()) {
          results.emplace_back();
          auto& result = results.back();
          *result.mutable_algorithm() = profile_algorithm.ToProto();

          result.set_scratch_bytes(scratch_allocator.TotalByteSize());
          *result.mutable_run_time() = proto_utils::ToDurationProto(
              absl::Milliseconds(profile_result.elapsed_time_in_ms()));
        }
      }
    }
    LogConvAutotuneResults(kind, se::dnn::ToDataType<T>::value, input_ptr,
                           filter_ptr, output_ptr, input_desc, filter_desc,
                           output_desc, conv_desc, stream->parent(), results);

    TF_ASSIGN_OR_RETURN(auto algo_desc, BestCudnnConvAlgorithm(results));
    autotune_entry = AutotuneEntry<se::dnn::ConvOp>(algo_desc);
#endif

    autotune_map->Insert(conv_parameters, autotune_entry);
  }

  return autotune_entry;
}

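// Explicit template instantiations for the element types supported by the
// unfused-convolution autotuner (double, float, and Eigen::half).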
template StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv<double>(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<double> input_ptr,
    const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<double> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc,
    se::DeviceMemory<double> output_ptr, int64_t scratch_size_limit);

template StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv<float>(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<float> input_ptr,
    const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<float> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc,
    se::DeviceMemory<float> output_ptr, int64_t scratch_size_limit);

template StatusOr<AutotuneEntry<se::dnn::ConvOp>>
AutotuneUnfusedConv<Eigen::half>(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<Eigen::half> input_ptr,
    const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<Eigen::half> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc,
    se::DeviceMemory<Eigen::half> output_ptr, int64_t scratch_size_limit);

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM