/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <type_traits>

// This include can't be in the conv_ops_fused_impl.h headers. See b/62899350.
#if GOOGLE_CUDA
#include "tensorflow/core/protobuf/autotuning.pb.h"
#endif  // GOOGLE_CUDA
#include "tensorflow/core/kernels/autotune_conv_impl.h"
#include "tensorflow/core/kernels/conv_ops_fused_impl.h"
#include "tensorflow/core/kernels/cwise_ops.h"
#include "tensorflow/core/util/activation_mode.h"

namespace tensorflow {

// If we're using the alternative GEMM-based implementation of Conv2D on the
// CPU, don't register this EigenTensor-based version.
#if !defined(USE_GEMM_FOR_CONV)
TF_CALL_int8(REGISTER_FUSED_CPU_CONV2D);
TF_CALL_qint8(REGISTER_FUSED_CPU_CONV2D);
#endif  // !USE_GEMM_FOR_CONV

#if GOOGLE_CUDA

namespace functor {
DECLARE_FUNCTOR_GPU_SPEC(int32);
}  // namespace functor

TF_CALL_int8(REGISTER_FUSED_GPU_CONV2D);
TF_CALL_qint8(REGISTER_FUSED_GPU_CONV2D);

#endif  // GOOGLE_CUDA

template <typename T>
struct LaunchFusedConv2DOpCpuInt8Helper {
  using BiasType = float;
  using ScaleType = float;
  using ComputeT = float;  // convert inputs to fp32 for tensor contraction
  using TempT = float;     // temporary accumulator type for tensor contraction

  void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                  const Tensor& conv_input, const Tensor& filter,
                  const FusedComputationType fusion,
                  const FusedComputationArgs& fusion_args,
                  const Conv2DParameters& params,
                  const Conv2DDimensions& dimensions, Tensor* output) {
    OP_REQUIRES(ctx, dimensions.in_depth == filter.dim_size(2),
                errors::Unimplemented("Fused conv implementation does not "
                                      "support grouped convolutions for now."));
    OP_REQUIRES(
        ctx, params.data_format == FORMAT_NHWC,
        errors::Unimplemented(
            "Fused conv implementation for int8/qint8 on CPU only supports "
            "NHWC tensor format for now."));
    OP_REQUIRES(ctx,
                DataTypeToEnum<T>::value == DT_INT8 ||
                    DataTypeToEnum<T>::value == DT_QINT8,
                errors::Unimplemented("Specialized fused conv implemented for "
                                      "only int8 and qint8 on CPU."));
    OP_REQUIRES(
        ctx, dimensions.dilation_rows == 1 && dimensions.dilation_cols == 1,
        errors::Unimplemented(
            "Fused conv implementation for int8/qint8 on CPU only supports "
            "dilation of 1 for rows and cols."));
    OP_REQUIRES(
        ctx,
        fusion == FusedComputationType::kBiasAdd ||
            fusion == FusedComputationType::kBiasAddWithRelu,
        errors::Unimplemented(
            "Fused conv implementation for int8/qint8 on CPU only supports "
            "BiasAdd + None or BiasAdd + Relu."));

    constexpr int kBias = 2;
    constexpr int kSideInput = 3;
    constexpr int kConvInputScale = 4;
    constexpr int kSideInputScale = 5;

    const Tensor& bias = ctx->input(kBias);
    const Tensor& side_input = ctx->input(kSideInput);
    const Tensor& conv_input_scale = ctx->input(kConvInputScale);
    const Tensor& side_input_scale_param = ctx->input(kSideInputScale);

    Eigen::PaddingType padding = BrainPadding2EigenPadding(params.padding);
    int32_t row_stride = dimensions.stride_rows;
    int32_t col_stride = dimensions.stride_cols;

    // The output tensor has type T (int8/qint8), but the tensor contraction
    // can only be evaluated with 32-bit (fp32) accumulation, so we accumulate
    // into a temporary fp32 tensor first.
    Tensor temp_output(DataTypeToEnum<TempT>::value, output->shape());

    const int32_t row_dilation = dimensions.dilation_rows;
    const int32_t col_dilation = dimensions.dilation_cols;

    auto& device = ctx->eigen_device<CPUDevice>();

    // CPU convolution works with input in NHWC and filter in HWIO data formats.
    // NOTE: This code is mostly shared with 'Conv2D' and 'FusedConv2D'.

    const ScaleType side_input_scale =
        side_input_scale_param.scalar<ScaleType>()();
    BiasActivationOutputKernel output_kernel(
        conv_input_scale, side_input, side_input_scale, bias, fusion, output);

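    // Three execution paths below: a 1x1 filter with unit strides reduces to a
    // single matrix multiplication over the flattened batch/spatial
    // dimensions; a filter that matches the input height/width with VALID
    // padding and no dilation collapses to a single matrix multiplication with
    // one row per batch element; everything else goes through
    // Eigen::SpatialConvolution. All three paths apply the same
    // BiasActivationOutputKernel to scale, add the side input and bias, and
    // quantize the fp32 accumulator back to T.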
    if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 &&
        col_stride == 1) {
      int conv_width =  // Width for the convolution step.
          output->dim_size(0) * output->dim_size(1) * output->dim_size(2);

      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);

      auto out = temp_output.shaped<TempT, 2>({conv_width, filter.dim_size(3)});
      auto in0 = conv_input.shaped<T, 2>({conv_width, filter.dim_size(2)});
      auto in1 = filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)});

      out.device(device) = in0.template cast<ComputeT>().contract(
          in1.template cast<ComputeT>(), dim_pair, output_kernel);
    } else if (filter.dim_size(0) == conv_input.dim_size(1) &&
               filter.dim_size(1) == conv_input.dim_size(2) &&
               row_dilation == 1 && col_dilation == 1 &&
               padding == Eigen::PaddingType::PADDING_VALID) {
      // If the input data and filter have the same height/width, reduce the
      // 2D convolution to matrix multiplication.
      const auto k =  // Length of reduction dimension.
          filter.dim_size(0) * filter.dim_size(1) * filter.dim_size(2);

      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);

      auto out = temp_output.shaped<TempT, 2>(
          {conv_input.dim_size(0), filter.dim_size(3)});
      auto in0 = conv_input.shaped<T, 2>({conv_input.dim_size(0), k});
      auto in1 = filter.shaped<T, 2>({k, filter.dim_size(3)});

      out.device(device) = in0.template cast<ComputeT>().contract(
          in1.template cast<ComputeT>(), dim_pair, output_kernel);
    } else {
      auto out = temp_output.tensor<TempT, 4>();
      auto in0 = conv_input.tensor<T, 4>();
      auto in1 = filter.tensor<T, 4>();

      // Need to swap row/col when calling Eigen.
      out.device(device) = Eigen::SpatialConvolution(
          in0.template cast<ComputeT>(), in1.template cast<ComputeT>(),
          col_stride, row_stride, padding, col_dilation, row_dilation,
          output_kernel);
    }
  }

 private:
  // Contraction output mapper for the temporary fp32 accumulator tensor.
  using ContractionOutputMapper =
      Eigen::internal::blas_data_mapper<TempT, Eigen::Index, Eigen::ColMajor>;

  // This output kernel computes an expression corresponding to the cuDNN
  // implementation of the INT8 cudnnConvolutionBiasActivationForward:
  // https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#scaling-parameters__fig-conv-bias-activation-forward
  struct BiasActivationOutputKernel {
    explicit BiasActivationOutputKernel(const Tensor& conv_input_scale,
                                        const Tensor& side_input,
                                        ScaleType side_input_scale,
                                        const Tensor& bias,
                                        const FusedComputationType fusion,
                                        Tensor* output)
        : fusion(fusion),
          conv_input_scale_data(conv_input_scale.flat<ScaleType>().data()),
          bias_data(bias.flat<BiasType>().data()),
          side_input_data(side_input.flat<T>().data()),
          side_input_scale(side_input_scale),
          output_data(const_cast<T*>(output->flat<T>().data())),
          conv_input_scale_tensor_size(conv_input_scale.NumElements()) {}

    EIGEN_ALWAYS_INLINE void operator()(
        const ContractionOutputMapper& conv_output_mapper,
        const Eigen::TensorContractionParams& params, Eigen::Index i,
        Eigen::Index j, Eigen::Index num_rows, Eigen::Index num_cols) const {
      DCHECK(params.swapped_arguments);

      const auto stride = conv_output_mapper.stride();

      const BiasType* bias_base = bias_data + i;
      const ScaleType* conv_input_scale_base = conv_input_scale_data;
      if (conv_input_scale_tensor_size > 1) {
        conv_input_scale_base += i;
      }

      const T* side_input_base;
      if (side_input_data == nullptr) {
        // side_input_data can be null when the tf::Tensor for the side input
        // is empty.
        side_input_base = nullptr;
      } else {
        side_input_base = side_input_data + i + j * stride;
      }
      T* output_base = output_data + i + j * stride;

      for (int col = 0; col < num_cols; ++col) {
        // A column of the temporary accumulator tensor produced by the
        // int8 x int8 contraction (accumulated in fp32). We scale it, add the
        // side input and bias, and quantize it before writing to the final
        // output tensor.
        typename TTypes<TempT>::UnalignedTensor conv_output(
            &conv_output_mapper(0, col), num_rows);

        // The corresponding column of the quantized output tensor.
        typename TTypes<T>::UnalignedTensor output(output_base + col * stride,
                                                   num_rows);

        const BiasType* bias_ptr = bias_base;

        static_assert(
            std::is_same<TempT, ScaleType>::value,
            "Temporary contraction result type must match the scale type.");

        // cuDNN 8 introduced many new kernels for sm75+ GPUs. These kernels
        // use different numerics than those in cuDNN 7 and earlier.
        //
        // In cuDNN 7 and earlier:
        //
        //   conv_output = Fma(conv_output, conv_input_scale, bias)
        //   conv_output = Fma(conv_output, side_input_scale, side_input)
        //
        // In cuDNN 8:
        //
        //   conv_output = conv_output * conv_input_scale
        //   conv_output = Fma(conv_output, side_input_scale, side_input)
        //   conv_output = conv_output + bias
        //
        // One caveat is that the numerics of
        // cudnnConvolutionBiasActivationForward depend not only on the cuDNN
        // version but also on the GPU's compute capability, which is not
        // visible to the CPU implementation of FusedConv2dBiasActivationOp. So
        // we expect this implementation to be bit-exact with cuDNN 7 and
        // earlier on sm70 and older GPUs, and with cuDNN 8+ on sm75 and newer
        // GPUs, but not with cuDNN 8+ on sm70 and older GPUs.
        //
        // NOTE(ezhulenev): We do not use packet FMA for this loop, because it
        // seems to produce slightly different results, and we are targeting
        // close equality with the NVIDIA implementation.
        typename TTypes<BiasType>::UnalignedConstTensor bias_slice(bias_ptr,
                                                                   num_rows);

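        // Steps (1)-(3) below follow the cuDNN 8 ordering described above:
        // scale the accumulator first, then fuse in the scaled side input,
        // then add the bias.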
        // (1) Scale.
        if (conv_input_scale_tensor_size > 1) {
          typename TTypes<ScaleType>::UnalignedConstTensor
              conv_input_scale_slice(conv_input_scale_base, num_rows);
          conv_output = conv_output * conv_input_scale_slice;
        } else {
          conv_output = conv_output * (*conv_input_scale_base);
        }

        // (2) Side input.
        if (side_input_scale != 0.0f) {
          const T* side_input_ptr = side_input_base + col * stride;
          TempT* conv_output_ptr = conv_output.data();
          for (int idx = 0; idx < num_rows; ++idx) {
            conv_output_ptr[idx] = std::fmaf(
                side_input_ptr[idx], side_input_scale, conv_output_ptr[idx]);
          }
        }

        // (3) Bias.
        conv_output += bias_slice;

        // Round to nearest-even, clip to the int8 range [-128, 127], and apply
        // the activation function (a Relu fusion simply raises the lower
        // clipping bound to 0).
        static constexpr ScaleType kMaxRange = static_cast<ScaleType>(127.f);
        static constexpr ScaleType kMinRange = static_cast<ScaleType>(-128.f);

        ScaleType lower_bound =
            (fusion == FusedComputationType::kBiasAdd ? kMinRange : 0);
        output = conv_output
                     .unaryExpr(
                         Eigen::internal::scalar_round_half_to_even_op<float>())
                     .clip(lower_bound, kMaxRange)
                     .cast<T>();
      }
    }

   private:
    const FusedComputationType fusion;
    const ScaleType* conv_input_scale_data;
    const BiasType* bias_data;
    const T* side_input_data;
    ScaleType side_input_scale;
    T* output_data;
    const int conv_input_scale_tensor_size;
  };
};

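// Route both int8 and qint8 CPU fused convolutions through the shared helper
// above.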
template <>
struct LaunchFusedConv2DOp<CPUDevice, int8>
    : LaunchFusedConv2DOpCpuInt8Helper<int8> {};

template <>
struct LaunchFusedConv2DOp<CPUDevice, qint8>
    : LaunchFusedConv2DOpCpuInt8Helper<qint8> {};

#if GOOGLE_CUDA

template <typename T>
struct LaunchFusedConv2DOpGpuInt8Helper {
  void operator()(
      OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
      const Tensor& input_param, const Tensor& filter_param,
      FusedComputationType fusion, const FusedComputationArgs& fusion_args,
      const Conv2DParameters& params, const Conv2DDimensions& dimensions,
      Tensor* output_param) {
    OP_REQUIRES(ctx, dimensions.in_depth == filter_param.dim_size(1),
                errors::Unimplemented("Fused conv implementation does not "
                                      "support grouped convolutions for now."));
    OP_REQUIRES(ctx, params.data_format == TensorFormat::FORMAT_NCHW_VECT_C,
                errors::Unimplemented(
                    "Fused convolution for int8 is only supported on GPU "
                    "for NCHW_VECT_C format"));
    OP_REQUIRES(ctx,
                DataTypeToEnum<T>::value == DT_INT8 ||
                    DataTypeToEnum<T>::value == DT_QINT8,
                errors::Unimplemented("Specialized fused conv implemented for "
                                      "only int8 and qint8 on GPU."));
    OP_REQUIRES(
        ctx, dimensions.dilation_rows == 1 && dimensions.dilation_cols == 1,
        errors::Unimplemented(
            "Fused conv implementation for int8/qint8 on GPU only supports "
            "dilation of 1 for rows and cols."));
    OP_REQUIRES(
        ctx,
        fusion == FusedComputationType::kBiasAdd ||
            fusion == FusedComputationType::kBiasAddWithRelu,
        errors::Unimplemented(
            "Fused conv implementation for int8/qint8 on GPU only supports "
            "BiasAdd + None or BiasAdd + Relu."));

    constexpr int kBias = 2;
    constexpr int kSideInput = 3;
    constexpr int kConvInputScale = 4;
    constexpr int kSideInputScale = 5;

    const Tensor& bias = ctx->input(kBias);
    const Tensor& side_input_param = ctx->input(kSideInput);
    const Tensor& conv_input_scale_param = ctx->input(kConvInputScale);
    const Tensor& side_input_scale_param = ctx->input(kSideInputScale);

    // Assuming int8 <--> NCHW_VECT_C, OIHW_VECT_I (int8x4) here.
    constexpr TensorFormat data_format = TensorFormat::FORMAT_NCHW_VECT_C;
    constexpr FilterTensorFormat filter_format =
        FilterTensorFormat::FORMAT_OIHW_VECT_I;
    const Padding padding = params.padding;

    int32_t row_stride = dimensions.stride_rows;
    int32_t col_stride = dimensions.stride_cols;

    auto* stream = ctx->op_device_context()->stream();
    OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
    OP_REQUIRES(ctx, stream->GetCudaComputeCapability().IsAtLeast(6, 1),
                errors::Unimplemented(
                    "Fused convolution for int8 is only supported on GPUs with "
                    "compute capability 6.1 or later."));

    se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
                                                stream);
    se::RedzoneAllocator rz_allocator(stream, &tf_allocator_adapter,
                                      se::GpuAsmOpts());

    const int batch_size = GetTensorDim(input_param, data_format, 'N');
    int conv_input_rows = GetTensorDim(input_param, data_format, 'H');
    int conv_input_cols = GetTensorDim(input_param, data_format, 'W');
    const int conv_input_depth =
        GetTensorDim(input_param, data_format, 'C') * 4;
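    // In NCHW_VECT_C the 'C' dimension counts int8x4 groups, so the logical
    // channel count is 4x the stored 'C' dimension (hence the "* 4" above).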

    const int output_rows = GetTensorDim(*output_param, data_format, 'H');
    const int output_cols = GetTensorDim(*output_param, data_format, 'W');
    const int output_depth = GetFilterDim(filter_param, filter_format, 'O');
    const int filter_rows = GetFilterDim(filter_param, filter_format, 'H');
    const int filter_cols = GetFilterDim(filter_param, filter_format, 'W');
    int padding_rows = 0;
    int padding_cols = 0;
    const Tensor* conv_input = &input_param;

    Tensor maybe_padded_conv_input;
    if (padding == Padding::SAME) {
      // Adjusts padding so cudnn supports it. Sets `adjusted_padding` to be
      // the adjusted padding, and `extra_padding_before` and
      // `extra_padding_after` to be the extra padding that FusedConv needs to
      // apply before calling cudnn.
      auto AdjustPaddingForCudnn =
          [](int padding, int filter_size, int* adjusted_padding,
             int* extra_padding_before, int* extra_padding_after) {
#if CUDNN_VERSION < 7000
            if (filter_size >= 6) {
              // TODO(b/70795525): Remove after NVIDIA fixes this bug with int8
              // fused convolution. We don't know whether cuDNN 7 still has the
              // bug, so this workaround is only enabled for cuDNN 6 and older.
              *adjusted_padding = 0;
              *extra_padding_before = padding / 2;
              *extra_padding_after = padding - *extra_padding_before;
              return;
            }
#endif
            *adjusted_padding = padding / 2 * 2;
            *extra_padding_before = 0;
            *extra_padding_after = padding % 2;
          };

      // Total padding on rows and cols is
      //   Pr = (R' - 1) * S + Kr - R
      //   Pc = (C' - 1) * S + Kc - C
      // where (R', C') are output dimensions, (R, C) are input dimensions, S
      // is stride, and (Kr, Kc) are filter dimensions. We pad Pr/2 on the top
      // and Pr - Pr/2 on the bottom, and Pc/2 on the left and Pc - Pc/2 on the
      // right. When Pr or Pc is odd, this means we pad more on the bottom and
      // right than on the top and left.
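      // Worked example: with input rows R = 5, stride S = 1 and filter rows
      // Kr = 2, SAME padding gives R' = 5 and Pr = (5 - 1) * 1 + 2 - 5 = 1.
      // AdjustPaddingForCudnn then passes adjusted_padding = 0 to cudnn and
      // this op explicitly pads one extra row at the bottom instead.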
      padding_rows = std::max<int>(
          0, (output_rows - 1) * row_stride + filter_rows - conv_input_rows);
      padding_cols = std::max<int>(
          0, (output_cols - 1) * col_stride + filter_cols - conv_input_cols);
      int extra_top_padding = 0;
      int extra_bottom_padding = 0;
      int extra_left_padding = 0;
      int extra_right_padding = 0;
      AdjustPaddingForCudnn(padding_rows, filter_rows, &padding_rows,
                            &extra_top_padding, &extra_bottom_padding);
      AdjustPaddingForCudnn(padding_cols, filter_cols, &padding_cols,
                            &extra_left_padding, &extra_right_padding);
      if (extra_top_padding != 0 || extra_bottom_padding != 0 ||
          extra_left_padding != 0 || extra_right_padding != 0) {
        const int new_conv_input_rows =
            conv_input_rows + extra_top_padding + extra_bottom_padding;
        const int new_conv_input_cols =
            conv_input_cols + extra_left_padding + extra_right_padding;

        using VectT = int32;
        auto pad_data_format = FORMAT_NCHW;

        OP_REQUIRES_OK(
            ctx,
            ctx->allocate_temp(
                DataTypeToEnum<T>::value,
                ShapeFromFormat(data_format, batch_size, new_conv_input_rows,
                                new_conv_input_cols, conv_input_depth),
                &maybe_padded_conv_input));

        auto conv_input_eigen_tensor =
            To32Bit(input_param.reinterpret_last_dimension<VectT, 4>());
        auto padded_conv_input_eigen_tensor = To32Bit(
            maybe_padded_conv_input.reinterpret_last_dimension<VectT, 4>());

        functor::PadInput<GPUDevice, VectT, int, 4>()(
            ctx->eigen_device<GPUDevice>(), conv_input_eigen_tensor,
            {{extra_top_padding, extra_left_padding}},
            {{extra_bottom_padding, extra_right_padding}},
            padded_conv_input_eigen_tensor, pad_data_format, T{});

        conv_input = &maybe_padded_conv_input;
        conv_input_rows = new_conv_input_rows;
        conv_input_cols = new_conv_input_cols;
      }
    }

    constexpr auto data_layout = se::dnn::DataLayout::kBatchDepthYX4;
    constexpr auto filter_layout = se::dnn::FilterLayout::kOutputInputYX4;

    se::dnn::BatchDescriptor conv_input_desc;
    conv_input_desc.set_count(batch_size)
        .set_feature_map_count(conv_input_depth)
        .set_height(conv_input_rows)
        .set_width(conv_input_cols)
        .set_layout(data_layout);
    se::dnn::FilterDescriptor filter_desc;
    filter_desc.set_input_filter_height(filter_rows)
        .set_input_filter_width(filter_cols)
        .set_input_feature_map_count(conv_input_depth)
        .set_output_feature_map_count(output_depth)
        .set_layout(filter_layout);
    se::dnn::BatchDescriptor side_input_desc;
    side_input_desc.set_count(batch_size)
        .set_height(output_rows)
        .set_width(output_cols)
        .set_feature_map_count(output_depth)
        .set_layout(data_layout);
    se::dnn::BatchDescriptor bias_desc;
    bias_desc.set_count(1)
        .set_height(1)
        .set_width(1)
        .set_feature_map_count(output_depth)
        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
    se::dnn::BatchDescriptor output_desc;
    output_desc.set_count(batch_size)
        .set_height(output_rows)
        .set_width(output_cols)
        .set_feature_map_count(output_depth)
        .set_layout(data_layout);
    se::dnn::ConvolutionDescriptor conv_desc;
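    // When padding is SAME, AdjustPaddingForCudnn always produces an even
    // adjusted padding; otherwise padding_rows and padding_cols stay zero, so
    // the symmetric split below is safe.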
    CHECK_EQ(0, padding_rows % 2);  // Crash OK
    CHECK_EQ(0, padding_cols % 2);  // Crash OK
    conv_desc.set_vertical_filter_stride(row_stride)
        .set_horizontal_filter_stride(col_stride)
        .set_zero_padding_height(padding_rows / 2)
        .set_zero_padding_width(padding_cols / 2);

    auto conv_input_ptr = AsDeviceMemory(
        reinterpret_cast<const int8*>(conv_input->template flat<T>().data()),
        conv_input->template flat<T>().size());
    auto filter_ptr = AsDeviceMemory(
        reinterpret_cast<const int8*>(filter_param.template flat<T>().data()),
        filter_param.template flat<T>().size());
    auto side_input_ptr =
        AsDeviceMemory(reinterpret_cast<const int8*>(
                           side_input_param.template flat<T>().data()),
                       side_input_param.template flat<T>().size());
    auto output_ptr = AsDeviceMemory(
        reinterpret_cast<const int8*>(output_param->template flat<T>().data()),
        output_param->template flat<T>().size());
    using BiasType = float;
    auto bias_ptr = AsDeviceMemory(bias.template flat<BiasType>().data(),
                                   bias.template flat<BiasType>().size());

    static int64_t ConvolveScratchSize = GetDnnWorkspaceLimit(
        // The default value is in bytes despite the name of the environment
        // variable.
        "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
    );

    se::dnn::ActivationMode dnn_activation_mode;
    switch (fusion) {
      case FusedComputationType::kBiasAdd:
        dnn_activation_mode = se::dnn::ActivationMode::kNone;
        break;
      case FusedComputationType::kBiasAddWithRelu:
        dnn_activation_mode = se::dnn::ActivationMode::kRelu;
        break;
      default:
        LOG(FATAL) << "Unsupported activation type " << (int)fusion;  // Crash OK
    }

    const float conv_scale = conv_input_scale_param.scalar<float>()();
    const float side_input_scale = side_input_scale_param.scalar<float>()();

    constexpr double leakyrelu_alpha = 0;  // This op doesn't support leaky relu
    int device_id = stream->parent()->device_ordinal();
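    // These parameters form the cache key for the autotune lookup below, so
    // convolutions autotune separately unless all of them match.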
    ConvParameters fused_conv_parameters = {
        batch_size,
        conv_input_depth,
        {{conv_input_rows, conv_input_cols}},
        data_format,
        output_depth,
        {{filter_rows, filter_cols}},
        // TODO(yangzihao): Add support for arbitrary dilations for fused conv.
        {{1, 1}},  // dilation_rows, dilation_cols
        {{row_stride, col_stride}},
        {{padding_rows, padding_cols}},
        conv_input->dtype(),
        device_id,
        /*group_count=*/1,  // This op doesn't support grouped convolutions.
        ConvParameters::FusionInfo{conv_scale, side_input_scale,
                                   leakyrelu_alpha, dnn_activation_mode,
                                   /*is_contrib=*/false},
    };

    constexpr auto type = se::dnn::ToDataType<int8>::value;
    constexpr auto bias_type = se::dnn::ToDataType<BiasType>::value;

    const bool use_cudnn_frontend = CudnnUseFrontend();
    AutotuneEntry<se::dnn::FusedConvOp> autotune_entry;
    if (!FusedConvAutotuneMap::GetInstance()->Find(fused_conv_parameters,
                                                   &autotune_entry)) {
      VLOG(2) << "Autotuning fused convolution (use_frontend="
              << use_cudnn_frontend
              << "): " << fused_conv_parameters.ToString();
      profiler::ScopedAnnotation trace("cudnn_autotuning");

      std::vector<std::unique_ptr<const se::dnn::FusedConvRunner>> runners;
      TF_CHECK_OK(stream->parent()->GetFusedConvolveRunners(
          use_cudnn_frontend, se::dnn::ConvolutionKind::FORWARD, type,
          bias_type, type, conv_scale, side_input_scale,
          /*leakyrelu_alpha=*/0.0, stream, conv_input_desc, filter_desc,
          bias_desc, output_desc, conv_desc,
          /*use_fallback=*/false, dnn_activation_mode, &runners));

      auto launch_func =
          [&](se::ScratchAllocator* allocator_used,
              const std::unique_ptr<const se::dnn::FusedConvRunner>& runner,
              se::dnn::ProfileResult* profile_result) -> Status {
        TF_ASSIGN_OR_RETURN(auto scratch, allocator_used->AllocateBytes(
                                              runner->GetWorkspaceSize()));
        return (*runner)(stream, profile_result, scratch, conv_input_ptr,
                         filter_ptr, side_input_ptr, bias_ptr, output_ptr);
      };

      auto results_or = internal::AutotuneConvImpl(
          ctx, runners, cudnn_use_autotune, launch_func, ConvolveScratchSize,
          rz_allocator);
      OP_REQUIRES_OK(ctx, results_or.status());
      auto results = std::move(results_or).value();

      LogFusedConvForwardAutotuneResults(
          type, conv_input_ptr, filter_ptr, output_ptr, bias_ptr,
          side_input_ptr, conv_input_desc, filter_desc, output_desc, conv_desc,
          conv_scale, side_input_scale, dnn_activation_mode, stream->parent(),
          results);

      // Two-level autotuning: the cuDNN frontend supports two engine lists,
      // heuristics and fallback. Heuristic engines are normally faster, so to
      // reduce autotuning time we evaluate the fallback engines only when none
      // of the heuristic engines works.
      bool found_working_engine = false;
      for (auto& result : results) {
        if (!result.has_failure()) {
          found_working_engine = true;
          break;
        }
      }

      if (!CudnnUseFrontend() || found_working_engine) {
        auto runners_or = BestCudnnConvAlgorithm<se::dnn::FusedConvOp>(
            results, std::move(runners));
        OP_REQUIRES_OK(ctx, runners_or.status());
        autotune_entry = {std::move(runners_or).value()};
      } else {
        std::vector<std::unique_ptr<const se::dnn::FusedConvRunner>>
            fallback_runners;
        TF_CHECK_OK(stream->parent()->GetFusedConvolveRunners(
            use_cudnn_frontend, se::dnn::ConvolutionKind::FORWARD, type,
            bias_type, type, conv_scale, side_input_scale, leakyrelu_alpha,
            stream, conv_input_desc, filter_desc, bias_desc, output_desc,
            conv_desc,
            /*use_fallback=*/true, dnn_activation_mode, &fallback_runners));

        auto fallback_results_or = internal::AutotuneConvImpl(
            ctx, fallback_runners, cudnn_use_autotune, launch_func,
            ConvolveScratchSize, rz_allocator);
        OP_REQUIRES_OK(ctx, fallback_results_or.status());
        auto fallback_results = std::move(fallback_results_or).value();

        LogFusedConvForwardAutotuneResults(
            type, conv_input_ptr, filter_ptr, output_ptr, bias_ptr,
            side_input_ptr, conv_input_desc, filter_desc, output_desc,
            conv_desc, conv_scale, side_input_scale, dnn_activation_mode,
            stream->parent(), fallback_results);

        auto fallback_runners_or = BestCudnnConvAlgorithm<se::dnn::FusedConvOp>(
            fallback_results, std::move(fallback_runners));
        OP_REQUIRES_OK(ctx, fallback_runners_or.status());
        autotune_entry = {std::move(fallback_runners_or).value()};
      }

      FusedConvAutotuneMap::GetInstance()->Insert(fused_conv_parameters,
                                                  autotune_entry);
    }

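    // Execute with either the autotuned runner-based path (cuDNN frontend) or
    // the legacy AlgorithmConfig path, depending on what autotuning produced.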
    DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
    Status cudnn_launch_status;
    if (!autotune_entry.is_algorithm_config()) {
      auto& runners = autotune_entry.GetOpRunners();
      typename se::dnn::FusedConvOp::Config config{
          se::dnn::ConvolutionKind::FORWARD,
          type,
          bias_type,
          type,
          conv_scale,
          side_input_scale,
          leakyrelu_alpha,
          conv_input_desc,
          filter_desc,
          bias_desc,
          output_desc,
          conv_desc,
          dnn_activation_mode};

      auto primary_or = runners.primary->GetOrCreateRunner(config, stream);
      OP_REQUIRES_OK(ctx, primary_or.status());
      auto primary = primary_or.value();

      const se::dnn::FusedConvRunner* no_scratch_fallback = nullptr;
      if (runners.no_scratch_fallback) {
        auto no_scratch_fallback_or =
            runners.no_scratch_fallback->GetOrCreateRunner(config, stream);
        OP_REQUIRES_OK(ctx, no_scratch_fallback_or.status());
        no_scratch_fallback = no_scratch_fallback_or.value();
      }

      auto runner_and_scratch_or =
          AllocateScratchOrFallback<se::dnn::FusedConvOp::Signature>(
              &scratch_allocator, primary, no_scratch_fallback);
      OP_REQUIRES_OK(ctx, runner_and_scratch_or.status());
      auto runner_and_scratch = std::move(runner_and_scratch_or).value();
      auto& runner =
          *std::get<const se::dnn::FusedConvRunner*>(runner_and_scratch);
      cudnn_launch_status = runner(
          stream, /*output_profile_result=*/nullptr,
          std::get<se::DeviceMemoryBase>(runner_and_scratch), conv_input_ptr,
          filter_ptr, side_input_ptr, bias_ptr, output_ptr);
    } else {
      cudnn_launch_status = stream->FusedConvolveWithAlgorithm(
          conv_input_desc, conv_input_ptr, conv_scale, filter_desc, filter_ptr,
          conv_desc, side_input_ptr, side_input_scale, bias_desc, bias_ptr,
          dnn_activation_mode, output_desc, &output_ptr, &scratch_allocator,
          autotune_entry.GetAlgorithmConfig(),
          /*output_profile_result=*/nullptr);
    }

    if (!cudnn_launch_status.ok()) {
      ctx->SetStatus(cudnn_launch_status);
    }
  }
};

template <>
struct LaunchFusedConv2DOp<GPUDevice, int8>
    : LaunchFusedConv2DOpGpuInt8Helper<int8> {};

template <>
struct LaunchFusedConv2DOp<GPUDevice, qint8>
    : LaunchFusedConv2DOpGpuInt8Helper<qint8> {};

#endif  // GOOGLE_CUDA

}  // namespace tensorflow
