/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
------------------------------------------------------------------------------*/

#include "tensorflow/core/kernels/conv_ops_gpu.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include "tensorflow/core/profiler/lib/scoped_annotation.h"
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/core/util/proto/proto_utils.h"
#include "tensorflow/core/util/use_cudnn.h"

#if GOOGLE_CUDA
#include "tensorflow/compiler/xla/stream_executor/gpu/gpu_asm_opts.h"
#include "tensorflow/compiler/xla/stream_executor/gpu/redzone_allocator.h"
#include "tensorflow/compiler/xla/stream_executor/tf_allocator_adapter.h"
#include "tensorflow/core/kernels/autotune_conv_impl.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

// Finds the best fused-convolution algorithm for the given parameters by
// running all candidate algorithms on the stream and measuring their
// execution time. The winning entry is cached in `autotune_map`.
template <typename T>
StatusOr<AutotuneEntry<se::dnn::FusedConvOp>> AutotuneFusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
        autotune_map,
    const ConvParameters& params, OpKernelContext* ctx,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& bias_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::ActivationMode activation_mode, double conv_scale,
    double side_input_scale, double leakyrelu_alpha,
    se::DeviceMemory<T> input_ptr, se::DeviceMemory<T> filter_ptr,
    se::DeviceMemory<T> output_ptr, se::DeviceMemory<T> bias_ptr,
    se::DeviceMemory<T> side_input_ptr, int64_t scratch_size_limit) {
#if GOOGLE_CUDA
  AutotuneEntry<se::dnn::FusedConvOp> autotune_entry;
  auto* stream = ctx->op_device_context()->stream();

  if (!autotune_map->Find(params, &autotune_entry)) {
    profiler::ScopedAnnotation trace("cudnn_autotuning");

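    // Wrap the output buffer in redzones (best effort) so the autotuner can
    // detect candidate algorithms that write out of bounds.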
    se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
                                                stream);
    se::RedzoneAllocator rz_allocator(stream, &tf_allocator_adapter,
                                      se::GpuAsmOpts());
    se::DeviceMemory<T> output_ptr_rz(
        WrapRedzoneBestEffort(&rz_allocator, output_ptr));

    std::vector<std::unique_ptr<const se::dnn::FusedConvRunner>> runners;
    auto element_type = se::dnn::ToDataType<T>::value;
    TF_RETURN_IF_ERROR(stream->parent()->GetFusedConvolveRunners(
        CudnnUseFrontend(), se::dnn::ConvolutionKind::FORWARD, element_type,
        element_type, element_type, conv_scale, side_input_scale,
        leakyrelu_alpha, stream, input_desc, filter_desc, bias_desc,
        output_desc, conv_desc, /*use_fallback=*/false, activation_mode,
        &runners));

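    // `launch_func` allocates the scratch workspace a runner needs and then
    // executes it once; the autotuner invokes it per candidate to collect
    // timings.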
    auto launch_func =
        [&](se::ScratchAllocator* allocator_used,
            const std::unique_ptr<const se::dnn::FusedConvRunner>& runner,
            se::dnn::ProfileResult* profile_result) -> Status {
      TF_ASSIGN_OR_RETURN(auto scratch, allocator_used->AllocateBytes(
                                            runner->GetWorkspaceSize()));
      return (*runner)(stream, profile_result, scratch, input_ptr, filter_ptr,
                       side_input_ptr, bias_ptr, output_ptr_rz);
    };

    TF_ASSIGN_OR_RETURN(auto results,
                        internal::AutotuneConvImpl(
                            ctx, runners, cudnn_use_autotune, launch_func,
                            scratch_size_limit, rz_allocator));
    // Only log on an AutotuneConv cache miss.
    LogFusedConvForwardAutotuneResults(
        se::dnn::ToDataType<T>::value, input_ptr, filter_ptr, output_ptr,
        bias_ptr, side_input_ptr, input_desc, filter_desc, output_desc,
        conv_desc, conv_scale, side_input_scale, activation_mode,
        stream->parent(), results);

    // Two-level autotuning: the cuDNN frontend supports two engine lists,
    // heuristics and fallback. Heuristics engines are normally faster, so to
    // reduce autotuning time we evaluate the fallback engines only when none
    // of the heuristics engines work.
    bool found_working_engine = false;
    for (auto& result : results) {
      if (!result.has_failure()) {
        found_working_engine = true;
        break;
      }
    }

    if (!CudnnUseFrontend() || found_working_engine) {
      TF_ASSIGN_OR_RETURN(autotune_entry,
                          BestCudnnConvAlgorithm<se::dnn::FusedConvOp>(
                              results, std::move(runners)));
    } else {
      LOG(WARNING)
          << "None of the algorithms provided by cuDNN frontend heuristics "
             "worked; trying fallback algorithms. Conv: "
          << params.ToString();
      std::vector<std::unique_ptr<const se::dnn::FusedConvRunner>>
          fallback_runners;
      TF_RETURN_IF_ERROR(stream->parent()->GetFusedConvolveRunners(
          CudnnUseFrontend(), se::dnn::ConvolutionKind::FORWARD, element_type,
          element_type, element_type, conv_scale, side_input_scale,
          leakyrelu_alpha, stream, input_desc, filter_desc, bias_desc,
          output_desc, conv_desc, /*use_fallback=*/true, activation_mode,
          &fallback_runners));

      TF_ASSIGN_OR_RETURN(auto fallback_results,
                          internal::AutotuneConvImpl(
                              ctx, fallback_runners, cudnn_use_autotune,
                              launch_func, scratch_size_limit, rz_allocator));

      LogFusedConvForwardAutotuneResults(
          se::dnn::ToDataType<T>::value, input_ptr, filter_ptr, output_ptr,
          bias_ptr, side_input_ptr, input_desc, filter_desc, output_desc,
          conv_desc, conv_scale, side_input_scale, activation_mode,
          stream->parent(), fallback_results);

      TF_ASSIGN_OR_RETURN(autotune_entry,
                          BestCudnnConvAlgorithm<se::dnn::FusedConvOp>(
                              fallback_results, std::move(fallback_runners)));
    }

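    // Cache the winning configuration so subsequent convolutions with
    // identical ConvParameters skip autotuning entirely.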
    autotune_map->Insert(params, autotune_entry);
  }
  return autotune_entry;
#else
  return errors::Unimplemented(
      "Fused conv not implemented on non-CUDA platforms.");
#endif
}

template StatusOr<AutotuneEntry<se::dnn::FusedConvOp>>
AutotuneFusedConv<double>(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
        autotune_map,
    const ConvParameters& params, OpKernelContext* ctx,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& bias_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::ActivationMode activation_mode, double conv_scale,
    double side_input_scale, double leakyrelu_alpha,
    se::DeviceMemory<double> input_ptr, se::DeviceMemory<double> filter_ptr,
    se::DeviceMemory<double> output_ptr, se::DeviceMemory<double> bias_ptr,
    se::DeviceMemory<double> side_input_ptr, int64_t scratch_size_limit);

template StatusOr<AutotuneEntry<se::dnn::FusedConvOp>> AutotuneFusedConv<float>(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
        autotune_map,
    const ConvParameters& params, OpKernelContext* ctx,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& bias_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::ActivationMode activation_mode, double conv_scale,
    double side_input_scale, double leakyrelu_alpha,
    se::DeviceMemory<float> input_ptr, se::DeviceMemory<float> filter_ptr,
    se::DeviceMemory<float> output_ptr, se::DeviceMemory<float> bias_ptr,
    se::DeviceMemory<float> side_input_ptr, int64_t scratch_size_limit);

template StatusOr<AutotuneEntry<se::dnn::FusedConvOp>>
AutotuneFusedConv<Eigen::half>(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
        autotune_map,
    const ConvParameters& params, OpKernelContext* ctx,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& bias_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::ActivationMode activation_mode, double conv_scale,
    double side_input_scale, double leakyrelu_alpha,
    se::DeviceMemory<Eigen::half> input_ptr,
    se::DeviceMemory<Eigen::half> filter_ptr,
    se::DeviceMemory<Eigen::half> output_ptr,
    se::DeviceMemory<Eigen::half> bias_ptr,
    se::DeviceMemory<Eigen::half> side_input_ptr, int64_t scratch_size_limit);

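// Finds the best algorithm for the given unfused convolution, analogously to
// AutotuneFusedConv above. On CUDA the candidates come from cuDNN; on ROCm
// they come from MIOpen.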
template <typename T>
StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<T> input_ptr, const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<T> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc, se::DeviceMemory<T> output_ptr,
    int64_t scratch_size_limit) {
  AutotuneEntry<se::dnn::ConvOp> autotune_entry;

  auto* stream = ctx->op_device_context()->stream();

  if (!autotune_map->Find(conv_parameters, &autotune_entry)) {
    profiler::ScopedAnnotation annotation("cudnn_autotuning");

#if GOOGLE_CUDA
    se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
                                                stream);
    se::RedzoneAllocator rz_allocator(stream, &tf_allocator_adapter,
                                      se::GpuAsmOpts());

    // TODO(awpr): second-guess whether it's okay that this profiles
    // convolutions on uninitialized memory.
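
    // Wrap only the buffer the convolution writes in redzones: FORWARD
    // (including FORWARD_BIAS_ACTIVATION) writes the output, BACKWARD_DATA
    // writes the input gradient, and BACKWARD_FILTER writes the filter
    // gradient.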
    switch (kind) {
      case se::dnn::ConvolutionKind::FORWARD:
      case se::dnn::ConvolutionKind::FORWARD_BIAS_ACTIVATION:
        output_ptr = se::DeviceMemory<T>(
            WrapRedzoneBestEffort(&rz_allocator, output_ptr));
        break;
      case se::dnn::ConvolutionKind::BACKWARD_DATA:
        input_ptr = se::DeviceMemory<T>(
            WrapRedzoneBestEffort(&rz_allocator, input_ptr));
        break;
      case se::dnn::ConvolutionKind::BACKWARD_FILTER:
        filter_ptr = se::DeviceMemory<T>(
            WrapRedzoneBestEffort(&rz_allocator, filter_ptr));
        break;
      default:
        return errors::InvalidArgument(
            absl::StrFormat("Unknown ConvolutionKind %d", kind));
    }

    const auto element_type = se::dnn::ToDataType<T>::value;
    std::vector<std::unique_ptr<const se::dnn::ConvRunner>> runners;
    TF_RETURN_IF_ERROR(stream->parent()->GetConvolveRunners(
        CudnnUseFrontend(), kind, element_type, element_type, stream,
        input_desc, input_ptr, filter_desc, filter_ptr, output_desc, output_ptr,
        conv_desc, /*use_fallback=*/false, &rz_allocator, &runners));
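    // As in the fused case, `launch_func` allocates the workspace and runs one
    // candidate so the autotuner can time it.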
    auto launch_func =
        [&](se::ScratchAllocator* allocator_used,
            const std::unique_ptr<const se::dnn::ConvRunner>& runner,
            se::dnn::ProfileResult* profile_result) -> Status {
      TF_ASSIGN_OR_RETURN(auto scratch, allocator_used->AllocateBytes(
                                            runner->GetWorkspaceSize()));
      return (*runner)(stream, profile_result, scratch, input_ptr, filter_ptr,
                       output_ptr);
    };
    TF_ASSIGN_OR_RETURN(auto results,
                        internal::AutotuneConvImpl(
                            ctx, runners, cudnn_use_autotune, launch_func,
                            scratch_size_limit, rz_allocator));

    LogConvAutotuneResults(kind, se::dnn::ToDataType<T>::value, input_ptr,
                           filter_ptr, output_ptr, input_desc, filter_desc,
                           output_desc, conv_desc, stream->parent(), results);

    // Two-level autotuning: the cuDNN frontend supports two engine lists,
    // heuristics and fallback. Heuristics engines are normally faster, so to
    // reduce autotuning time we evaluate the fallback engines only when none
    // of the heuristics engines work.
    bool found_working_engine = false;
    for (auto& result : results) {
      if (!result.has_failure()) {
        found_working_engine = true;
        break;
      }
    }

    if (!CudnnUseFrontend() || found_working_engine) {
      TF_ASSIGN_OR_RETURN(
          autotune_entry,
          BestCudnnConvAlgorithm<se::dnn::ConvOp>(results, std::move(runners)));
    } else {
      LOG(WARNING)
          << "None of the algorithms provided by cuDNN frontend heuristics "
             "worked; trying fallback algorithms. Conv: "
          << conv_parameters.ToString();
      std::vector<std::unique_ptr<const se::dnn::ConvRunner>> fallback_runners;
      TF_RETURN_IF_ERROR(stream->parent()->GetConvolveRunners(
          CudnnUseFrontend(), kind, element_type, element_type, stream,
          input_desc, input_ptr, filter_desc, filter_ptr, output_desc,
          output_ptr, conv_desc, /*use_fallback=*/true, &rz_allocator,
          &fallback_runners));

      TF_ASSIGN_OR_RETURN(auto fallback_results,
                          internal::AutotuneConvImpl(
                              ctx, fallback_runners, cudnn_use_autotune,
                              launch_func, scratch_size_limit, rz_allocator));

      LogConvAutotuneResults(kind, se::dnn::ToDataType<T>::value, input_ptr,
                             filter_ptr, output_ptr, input_desc, filter_desc,
                             output_desc, conv_desc, stream->parent(),
                             fallback_results);

      TF_ASSIGN_OR_RETURN(autotune_entry,
                          BestCudnnConvAlgorithm<se::dnn::ConvOp>(
                              fallback_results, std::move(fallback_runners)));
    }

#elif TENSORFLOW_USE_ROCM
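    // ROCm path: query MIOpen for its list of candidate algorithms for this
    // convolution, then profile them below.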
    DnnScratchAllocator scratch_allocator(scratch_size_limit, ctx);

    std::vector<se::dnn::ProfileResult> algorithms;
    if (!stream->parent()->GetMIOpenConvolveAlgorithms(
            kind, se::dnn::ToDataType<T>::value, stream, input_desc, input_ptr,
            filter_desc, filter_ptr, output_desc, output_ptr, conv_desc,
            &scratch_allocator, &algorithms)) {
      return errors::Unknown(
          "Failed to get convolution algorithm. This is probably "
          "because MIOpen failed to initialize, so try looking to "
          "see if a warning log message was printed above.");
    }

    std::vector<tensorflow::AutotuneResult> results;
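    // If MIOpen returned a single algorithm, record its result directly (the
    // returned ProfileResult already carries a measured time); otherwise
    // re-run and time each candidate.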
    if (algorithms.size() == 1) {
      auto profile_result = algorithms[0];
      results.emplace_back();
      auto& result = results.back();
      *result.mutable_algorithm() = profile_result.algorithm().ToProto();

      result.set_scratch_bytes(profile_result.scratch_size());
      *result.mutable_run_time() = proto_utils::ToDurationProto(
          absl::Milliseconds(profile_result.elapsed_time_in_ms()));
    } else {
      for (const auto& miopen_algorithm : algorithms) {
        auto profile_algorithm = miopen_algorithm.algorithm();
        se::dnn::ProfileResult profile_result;
        auto miopen_launch_status = stream->ConvolveWithAlgorithm(
            kind, input_desc, input_ptr, filter_desc, filter_ptr, output_desc,
            output_ptr, conv_desc, &scratch_allocator,
            se::dnn::AlgorithmConfig(profile_algorithm,
                                     miopen_algorithm.scratch_size()),
            &profile_result);
        if (miopen_launch_status.ok() && profile_result.is_valid()) {
          results.emplace_back();
          auto& result = results.back();
          *result.mutable_algorithm() = profile_algorithm.ToProto();

          result.set_scratch_bytes(scratch_allocator.TotalByteSize());
          *result.mutable_run_time() = proto_utils::ToDurationProto(
              absl::Milliseconds(profile_result.elapsed_time_in_ms()));
        }
      }
    }
    LogConvAutotuneResults(kind, se::dnn::ToDataType<T>::value, input_ptr,
                           filter_ptr, output_ptr, input_desc, filter_desc,
                           output_desc, conv_desc, stream->parent(), results);

    TF_ASSIGN_OR_RETURN(auto algo_desc, BestCudnnConvAlgorithm(results));
    autotune_entry = AutotuneEntry<se::dnn::ConvOp>(algo_desc);
#endif

    autotune_map->Insert(conv_parameters, autotune_entry);
  }

  return autotune_entry;
}

template StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv<double>(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<double> input_ptr,
    const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<double> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc,
    se::DeviceMemory<double> output_ptr, int64_t scratch_size_limit);

template StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv<float>(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<float> input_ptr,
    const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<float> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc,
    se::DeviceMemory<float> output_ptr, int64_t scratch_size_limit);

template StatusOr<AutotuneEntry<se::dnn::ConvOp>>
AutotuneUnfusedConv<Eigen::half>(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<Eigen::half> input_ptr,
    const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<Eigen::half> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc,
    se::DeviceMemory<Eigen::half> output_ptr, int64_t scratch_size_limit);

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM