1 | /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include <type_traits> |
17 | |
18 | // This include can't be in the conv_ops_fused_impl.h headers. See b/62899350. |
19 | #if GOOGLE_CUDA |
20 | #include "tensorflow/core/protobuf/autotuning.pb.h" |
21 | #endif // GOOGLE_CUDA |
22 | #include "tensorflow/core/kernels/autotune_conv_impl.h" |
23 | #include "tensorflow/core/kernels/conv_ops_fused_impl.h" |
24 | #include "tensorflow/core/kernels/cwise_ops.h" |
25 | #include "tensorflow/core/util/activation_mode.h" |
26 | |
27 | namespace tensorflow { |
28 | |
29 | // If we're using the alternative GEMM-based implementation of Conv2D for the |
30 | // CPU implementation, don't register this EigenTensor-based version. |
31 | #if !defined(USE_GEMM_FOR_CONV) |
32 | TF_CALL_int8(REGISTER_FUSED_CPU_CONV2D); |
33 | TF_CALL_qint8(REGISTER_FUSED_CPU_CONV2D); |
34 | #endif // !USE_GEMM_FOR_CONV |
35 | |
36 | #if GOOGLE_CUDA |
37 | |
38 | namespace functor { |
39 | DECLARE_FUNCTOR_GPU_SPEC(int32); |
40 | } // namespace functor |
41 | |
42 | TF_CALL_int8(REGISTER_FUSED_GPU_CONV2D); |
43 | TF_CALL_qint8(REGISTER_FUSED_GPU_CONV2D); |
44 | |
45 | #endif // GOOGLE_CUDA |
46 | |
47 | template <typename T> |
48 | struct LaunchFusedConv2DOpCpuInt8Helper { |
49 | using BiasType = float; |
50 | using ScaleType = float; |
51 | using ComputeT = float; // convert inputs to fp32 for tensor contraction |
52 | using TempT = float; // temporary accumulator type for tensor contraction |
53 | |
54 | void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, |
55 | const Tensor& conv_input, const Tensor& filter, |
56 | const FusedComputationType fusion, |
57 | const FusedComputationArgs& fusion_args, |
58 | const Conv2DParameters& params, |
59 | const Conv2DDimensions& dimensions, Tensor* output) { |
60 | OP_REQUIRES(ctx, dimensions.in_depth == filter.dim_size(2), |
61 | errors::Unimplemented("Fused conv implementation does not " |
62 | "support grouped convolutions for now." )); |
63 | OP_REQUIRES( |
64 | ctx, params.data_format == FORMAT_NHWC, |
65 | errors::Unimplemented( |
66 | "Fused conv implementation for int8/qint8 on CPU only supports " |
67 | "NHWC tensor format for now." )); |
68 | OP_REQUIRES(ctx, |
69 | DataTypeToEnum<T>::value == DT_INT8 || |
70 | DataTypeToEnum<T>::value == DT_QINT8, |
71 | errors::Unimplemented("Specialized fused conv implemented for " |
72 | "only int8 and qint8 on CPU." )); |
73 | OP_REQUIRES( |
74 | ctx, dimensions.dilation_rows == 1 && dimensions.dilation_cols == 1, |
75 | errors::Unimplemented( |
76 | "Fused conv implementation for int8/qint8 on CPU only supports " |
77 | "dilation of 1 for rows and cols." )); |
78 | OP_REQUIRES( |
79 | ctx, |
80 | fusion == FusedComputationType::kBiasAdd || |
81 | fusion == FusedComputationType::kBiasAddWithRelu, |
82 | errors::Unimplemented( |
83 | "Fused conv implementation for int8/qint8 on CPU only supports " |
84 | "BiasAdd + None or BiasAdd + Relu." )); |
85 | |
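// Beyond the convolution input and filter (op inputs 0 and 1, passed in as
// 'conv_input' and 'filter'), the fused op carries the bias, the optional
// side input, and the two quantization scale scalars at the fixed input
// positions below.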
86 | constexpr int kBias = 2; |
87 | constexpr int kSideInput = 3; |
88 | constexpr int kConvInputScale = 4; |
89 | constexpr int kSideInputScale = 5; |
90 | |
91 | const Tensor& bias = ctx->input(kBias); |
92 | const Tensor& side_input = ctx->input(kSideInput); |
93 | const Tensor& conv_input_scale = ctx->input(kConvInputScale); |
94 | const Tensor& side_input_scale_param = ctx->input(kSideInputScale); |
95 | |
96 | Eigen::PaddingType padding = BrainPadding2EigenPadding(params.padding); |
97 | int32_t row_stride = dimensions.stride_rows; |
98 | int32_t col_stride = dimensions.stride_cols; |
99 | |
// The output tensor has type T (qint8/int8), but we can only evaluate the
// tensor contraction using fp32 accumulation.
102 | Tensor temp_output(DataTypeToEnum<TempT>::value, output->shape()); |
103 | |
104 | const int32_t row_dilation = dimensions.dilation_rows; |
105 | const int32_t col_dilation = dimensions.dilation_cols; |
106 | |
107 | auto& device = ctx->eigen_device<CPUDevice>(); |
108 | |
109 | // CPU convolution works with input in NHWC and filter in HWIO data formats. |
110 | // NOTE: This code is mostly shared with 'Conv2D' and 'FusedConv2D'. |
111 | |
112 | const ScaleType side_input_scale = |
113 | side_input_scale_param.scalar<ScaleType>()(); |
114 | BiasActivationOutputKernel output_kernel( |
115 | conv_input_scale, side_input, side_input_scale, bias, fusion, output); |
116 | |
117 | if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 && |
118 | col_stride == 1) { |
119 | int conv_width = // Width for the convolution step. |
120 | output->dim_size(0) * output->dim_size(1) * output->dim_size(2); |
121 | |
122 | Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair; |
123 | dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0); |
124 | |
125 | auto out = temp_output.shaped<TempT, 2>({conv_width, filter.dim_size(3)}); |
126 | auto in0 = conv_input.shaped<T, 2>({conv_width, filter.dim_size(2)}); |
127 | auto in1 = filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}); |
128 | |
129 | out.device(device) = in0.template cast<ComputeT>().contract( |
130 | in1.template cast<ComputeT>(), dim_pair, output_kernel); |
131 | } else if (filter.dim_size(0) == conv_input.dim_size(1) && |
132 | filter.dim_size(1) == conv_input.dim_size(2) && |
133 | row_dilation == 1 && col_dilation == 1 && |
134 | padding == Eigen::PaddingType::PADDING_VALID) { |
135 | // If the input data and filter have the same height/width, |
136 | // reduce the 2D convolution to matrix multiplication. |
137 | const auto k = // Length of reduction dimension. |
138 | filter.dim_size(0) * filter.dim_size(1) * filter.dim_size(2); |
139 | |
140 | Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair; |
141 | dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0); |
142 | |
143 | auto out = temp_output.shaped<TempT, 2>( |
144 | {conv_input.dim_size(0), filter.dim_size(3)}); |
145 | auto in0 = conv_input.shaped<T, 2>({conv_input.dim_size(0), k}); |
146 | auto in1 = filter.shaped<T, 2>({k, filter.dim_size(3)}); |
147 | |
148 | out.device(device) = in0.template cast<ComputeT>().contract( |
149 | in1.template cast<ComputeT>(), dim_pair, output_kernel); |
150 | } else { |
151 | auto out = temp_output.tensor<TempT, 4>(); |
152 | auto in0 = conv_input.tensor<T, 4>(); |
153 | auto in1 = filter.tensor<T, 4>(); |
154 | |
155 | // Need to swap row/col when calling Eigen. |
156 | out.device(device) = Eigen::SpatialConvolution( |
157 | in0.template cast<ComputeT>(), in1.template cast<ComputeT>(), |
158 | col_stride, row_stride, padding, col_dilation, row_dilation, |
159 | output_kernel); |
160 | } |
161 | } |
162 | |
163 | private: |
// Contraction output mapper for the temporary fp32 accumulator tensor.
165 | using ContractionOutputMapper = |
166 | Eigen::internal::blas_data_mapper<TempT, Eigen::Index, Eigen::ColMajor>; |
167 | |
// This output kernel computes an expression corresponding to the cuDNN
// implementation of the INT8 cudnnConvolutionBiasActivationForward:
170 | // https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#scaling-parameters__fig-conv-bias-activation-forward |
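//
// Per output element, the loop in operator() below reduces to:
//
//   acc = conv_input_scale * conv_output          // (1) scale
//   acc = fma(side_input, side_input_scale, acc)  // (2) side input
//   acc = acc + bias                              // (3) bias
//   out = cast<T>(clip(round_half_to_even(acc), lower_bound, 127))
//
// where lower_bound is -128 for BiasAdd and 0 for BiasAdd + Relu.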
171 | struct BiasActivationOutputKernel { |
172 | explicit BiasActivationOutputKernel(const Tensor& conv_input_scale, |
173 | const Tensor& side_input, |
174 | ScaleType side_input_scale, |
175 | const Tensor& bias, |
176 | const FusedComputationType fusion, |
177 | Tensor* output) |
178 | : fusion(fusion), |
179 | conv_input_scale_data(conv_input_scale.flat<ScaleType>().data()), |
180 | bias_data(bias.flat<BiasType>().data()), |
181 | side_input_data(side_input.flat<T>().data()), |
182 | side_input_scale(side_input_scale), |
183 | output_data(const_cast<T*>(output->flat<T>().data())), |
184 | conv_input_scale_tensor_size(conv_input_scale.NumElements()) {} |
185 | |
186 | EIGEN_ALWAYS_INLINE void operator()( |
187 | const ContractionOutputMapper& conv_output_mapper, |
188 | const Eigen::TensorContractionParams& params, Eigen::Index i, |
189 | Eigen::Index j, Eigen::Index num_rows, Eigen::Index num_cols) const { |
190 | DCHECK(params.swapped_arguments); |
191 | |
192 | const auto stride = conv_output_mapper.stride(); |
193 | |
194 | const BiasType* bias_base = bias_data + i; |
195 | const ScaleType* conv_input_scale_base = conv_input_scale_data; |
196 | if (conv_input_scale_tensor_size > 1) { |
197 | conv_input_scale_base += i; |
198 | } |
199 | |
200 | const T* side_input_base; |
201 | if (side_input_data == nullptr) { |
202 | // side_input_data can be null when the tf::Tensor for the side input is |
203 | // empty. |
204 | side_input_base = nullptr; |
205 | } else { |
206 | side_input_base = side_input_data + i + j * stride; |
207 | } |
208 | T* output_base = output_data + i + j * stride; |
209 | |
210 | for (int col = 0; col < num_cols; ++col) { |
// A column of the output tensor after the int8 x int8 -> fp32 contraction.
// This is a temporary tensor that we will scale, add the bias and
// side_input to, and quantize before writing to the final output tensor.
214 | typename TTypes<TempT>::UnalignedTensor conv_output( |
215 | &conv_output_mapper(0, col), num_rows); |
216 | |
// The corresponding column of the quantized output tensor.
218 | typename TTypes<T>::UnalignedTensor output(output_base + col * stride, |
219 | num_rows); |
220 | |
221 | const BiasType* bias_ptr = bias_base; |
222 | |
static_assert(
std::is_same<TempT, ScaleType>::value,
"Temporary contraction result type must match the scale type.");
226 | |
227 | // CuDNN 8 introduced many new kernels for sm75+ GPUs. These kernels use |
228 | // different numerics than those in CuDNN 7-. |
229 | // |
230 | // In cudnn 7-: |
231 | // |
232 | // conv_output = Fma(conv_output, conv_input_scale, bias) |
233 | // conv_output = Fma(conv_output, side_input_scale, side_input), |
234 | // |
235 | // In cudnn 8: |
236 | // |
237 | // conv_output = conv_output * conv_input_scale |
238 | // conv_output = Fma(conv_output, side_input_scale, side_input) |
239 | // conv_output = conv_output + bias |
240 | // |
// One caveat is that the numerics of
// cudnnConvolutionBiasActivationForward depend not only on the cudnn
// version but also on the GPU's compute capability, which is not
// visible to the CPU implementation of FusedConv2dBiasActivationOp. So
// we expect this implementation to be bit-exact for cudnn7-/sm70- and
// cudnn8+/sm75+ but not for cudnn8+/sm70-.
247 | // |
// NOTE(ezhulenev): We do not use packet FMA for this loop because it
// seems to produce slightly different results, and we are targeting
// close equality with the NVIDIA implementation.
251 | typename TTypes<BiasType>::UnalignedConstTensor bias_slice(bias_ptr, |
252 | num_rows); |
253 | |
254 | // (1) Scale. |
255 | if (conv_input_scale_tensor_size > 1) { |
256 | typename TTypes<ScaleType>::UnalignedConstTensor |
257 | conv_input_scale_slice(conv_input_scale_base, num_rows); |
258 | conv_output = conv_output * conv_input_scale_slice; |
259 | } else { |
260 | conv_output = conv_output * (*conv_input_scale_base); |
261 | } |
262 | |
263 | // (2) Side input. |
264 | if (side_input_scale != 0.0f) { |
265 | const T* side_input_ptr = side_input_base + col * stride; |
266 | TempT* conv_output_ptr = conv_output.data(); |
267 | for (int idx = 0; idx < num_rows; ++idx) { |
268 | conv_output_ptr[idx] = std::fmaf( |
269 | side_input_ptr[idx], side_input_scale, conv_output_ptr[idx]); |
270 | } |
271 | } |
272 | |
273 | // (3) Bias. |
274 | conv_output += bias_slice; |
275 | |
// Round (half to even), clip, and apply the activation function.
277 | static constexpr ScaleType kMaxRange = static_cast<ScaleType>(127.f); |
278 | static constexpr ScaleType kMinRange = static_cast<ScaleType>(-128.f); |
279 | |
280 | ScaleType lower_bound = |
281 | (fusion == FusedComputationType::kBiasAdd ? kMinRange : 0); |
282 | output = conv_output |
283 | .unaryExpr( |
284 | Eigen::internal::scalar_round_half_to_even_op<float>()) |
285 | .clip(lower_bound, kMaxRange) |
286 | .cast<T>(); |
287 | } |
288 | } |
289 | |
290 | private: |
291 | const FusedComputationType fusion; |
292 | const ScaleType* conv_input_scale_data; |
293 | const BiasType* bias_data; |
294 | const T* side_input_data; |
295 | ScaleType side_input_scale; |
296 | T* output_data; |
297 | const int conv_input_scale_tensor_size; |
298 | }; |
299 | }; |
300 | |
301 | template <> |
302 | struct LaunchFusedConv2DOp<CPUDevice, int8> |
: LaunchFusedConv2DOpCpuInt8Helper<int8> {};
305 | |
306 | template <> |
307 | struct LaunchFusedConv2DOp<CPUDevice, qint8> |
308 | : LaunchFusedConv2DOpCpuInt8Helper<qint8> {}; |
309 | |
310 | #if GOOGLE_CUDA |
311 | |
312 | template <typename T> |
313 | struct LaunchFusedConv2DOpGpuInt8Helper { |
314 | void operator()( |
315 | OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, |
316 | const Tensor& input_param, const Tensor& filter_param, |
317 | FusedComputationType fusion, const FusedComputationArgs& fusion_args, |
318 | const Conv2DParameters& params, const Conv2DDimensions& dimensions, |
319 | Tensor* output_param) { |
320 | OP_REQUIRES(ctx, dimensions.in_depth == filter_param.dim_size(1), |
321 | errors::Unimplemented("Fused conv implementation does not " |
322 | "support grouped convolutions for now." )); |
323 | OP_REQUIRES(ctx, params.data_format == TensorFormat::FORMAT_NCHW_VECT_C, |
324 | errors::Unimplemented( |
325 | "Fused convolution for int8 is only supported on GPU " |
326 | "for NCHW_VECT_C format" )); |
327 | OP_REQUIRES(ctx, |
328 | DataTypeToEnum<T>::value == DT_INT8 || |
329 | DataTypeToEnum<T>::value == DT_QINT8, |
330 | errors::Unimplemented("Specialized fused conv implemented for " |
331 | "only int8 and qint8 on GPU." )); |
332 | OP_REQUIRES( |
333 | ctx, dimensions.dilation_rows == 1 && dimensions.dilation_cols == 1, |
334 | errors::Unimplemented( |
335 | "Fused conv implementation for int8/qint8 on GPU only supports " |
336 | "dilation of 1 for rows and cols." )); |
337 | OP_REQUIRES( |
338 | ctx, |
339 | fusion == FusedComputationType::kBiasAdd || |
340 | fusion == FusedComputationType::kBiasAddWithRelu, |
341 | errors::Unimplemented( |
342 | "Fused conv implementation for int8/qint8 on GPU only supports " |
343 | "BiasAdd + None or BiasAdd + Relu." )); |
344 | |
345 | constexpr int kBias = 2; |
346 | constexpr int kSideInput = 3; |
347 | constexpr int kConvInputScale = 4; |
348 | constexpr int kSideInputScale = 5; |
349 | |
350 | const Tensor& bias = ctx->input(kBias); |
351 | const Tensor& side_input_param = ctx->input(kSideInput); |
352 | const Tensor& conv_input_scale_param = ctx->input(kConvInputScale); |
353 | const Tensor& side_input_scale_param = ctx->input(kSideInputScale); |
354 | |
355 | // Assuming int8 <--> NCHW_VECT_C, OIHW_VECT_I (int8x4) here. |
356 | constexpr TensorFormat data_format = TensorFormat::FORMAT_NCHW_VECT_C; |
357 | constexpr FilterTensorFormat filter_format = |
358 | FilterTensorFormat::FORMAT_OIHW_VECT_I; |
359 | const Padding padding = params.padding; |
360 | |
361 | int32_t row_stride = dimensions.stride_rows; |
362 | int32_t col_stride = dimensions.stride_cols; |
363 | |
364 | auto* stream = ctx->op_device_context()->stream(); |
OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
366 | OP_REQUIRES(ctx, stream->GetCudaComputeCapability().IsAtLeast(6, 1), |
367 | errors::Unimplemented( |
368 | "Fused convolution for int8 is only supported on GPUs with " |
369 | "compute capability 6.1 or later." )); |
370 | |
371 | se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}), |
372 | stream); |
373 | se::RedzoneAllocator rz_allocator(stream, &tf_allocator_adapter, |
374 | se::GpuAsmOpts()); |
375 | |
376 | const int batch_size = GetTensorDim(input_param, data_format, 'N'); |
377 | int conv_input_rows = GetTensorDim(input_param, data_format, 'H'); |
378 | int conv_input_cols = GetTensorDim(input_param, data_format, 'W'); |
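// In NCHW_VECT_C the 'C' dimension counts int8x4 vectors, so multiply by 4 to
// recover the actual channel depth.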
379 | const int conv_input_depth = |
380 | GetTensorDim(input_param, data_format, 'C') * 4; |
381 | |
382 | const int output_rows = GetTensorDim(*output_param, data_format, 'H'); |
383 | const int output_cols = GetTensorDim(*output_param, data_format, 'W'); |
384 | const int output_depth = GetFilterDim(filter_param, filter_format, 'O'); |
385 | const int filter_rows = GetFilterDim(filter_param, filter_format, 'H'); |
386 | const int filter_cols = GetFilterDim(filter_param, filter_format, 'W'); |
387 | int padding_rows = 0; |
388 | int padding_cols = 0; |
389 | const Tensor* conv_input = &input_param; |
390 | |
391 | Tensor maybe_padded_conv_input; |
392 | if (padding == Padding::SAME) { |
393 | // Adjusts padding so cudnn supports it. Sets `adjusted_padding` to be the |
394 | // adjusted padding, and `extra_padding_before` and `extra_padding_after` |
395 | // to be the extra padding that FusedConv needs to apply before calling |
396 | // cudnn. |
397 | auto AdjustPaddingForCudnn = |
398 | [](int padding, int filter_size, int* adjusted_padding, |
399 | int* extra_padding_before, int* extra_padding_after) { |
400 | #if CUDNN_VERSION < 7000 |
401 | if (filter_size >= 6) { |
// TODO(b/70795525): Remove after NVIDIA fixes this bug with int8
// fused convolution. It is unclear whether cuDNN7 still has the bug,
// so enable this workaround only for cuDNN6 or older.
405 | *adjusted_padding = 0; |
406 | *extra_padding_before = padding / 2; |
407 | *extra_padding_after = padding - *extra_padding_before; |
408 | return; |
409 | } |
410 | #endif |
411 | *adjusted_padding = padding / 2 * 2; |
412 | *extra_padding_before = 0; |
413 | *extra_padding_after = padding % 2; |
414 | }; |
415 | |
416 | // Total padding on rows and cols is |
417 | // Pr = (R' - 1) * S + Kr - R |
418 | // Pc = (C' - 1) * S + Kc - C |
419 | // where (R', C') are output dimensions, (R, C) are input dimensions, S |
420 | // is stride, (Kr, Kc) are filter dimensions. |
// We pad Pr/2 on the top and Pr - Pr/2 on the bottom, Pc/2 on the left
// and Pc - Pc/2 on the right. When Pr or Pc is odd, this means
// we pad more on the bottom and right than on the top and left.
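// For example, with R = 5, Kr = 3, and S = 2 the SAME-padded output has
// R' = ceil(R / S) = 3 rows, so Pr = (3 - 1) * 2 + 3 - 5 = 2, i.e. one padded
// row on the top and one on the bottom.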
424 | padding_rows = std::max<int>( |
425 | 0, (output_rows - 1) * row_stride + filter_rows - conv_input_rows); |
426 | padding_cols = std::max<int>( |
427 | 0, (output_cols - 1) * col_stride + filter_cols - conv_input_cols); |
428 | int extra_top_padding = 0; |
429 | int extra_bottom_padding = 0; |
430 | int extra_left_padding = 0; |
431 | int extra_right_padding = 0; |
432 | AdjustPaddingForCudnn(padding_rows, filter_rows, &padding_rows, |
433 | &extra_top_padding, &extra_bottom_padding); |
434 | AdjustPaddingForCudnn(padding_cols, filter_cols, &padding_cols, |
435 | &extra_left_padding, &extra_right_padding); |
436 | if (extra_top_padding != 0 || extra_bottom_padding != 0 || |
437 | extra_left_padding != 0 || extra_right_padding != 0) { |
438 | const int new_conv_input_rows = |
439 | conv_input_rows + extra_top_padding + extra_bottom_padding; |
440 | const int new_conv_input_cols = |
441 | conv_input_cols + extra_left_padding + extra_right_padding; |
442 | |
443 | using VectT = int32; |
444 | auto pad_data_format = FORMAT_NCHW; |
445 | |
446 | OP_REQUIRES_OK( |
447 | ctx, |
448 | ctx->allocate_temp( |
449 | DataTypeToEnum<T>::value, |
450 | ShapeFromFormat(data_format, batch_size, new_conv_input_rows, |
451 | new_conv_input_cols, conv_input_depth), |
452 | &maybe_padded_conv_input)); |
453 | |
454 | auto conv_input_eigen_tensor = |
455 | To32Bit(input_param.reinterpret_last_dimension<VectT, 4>()); |
456 | auto padded_conv_input_eigen_tensor = To32Bit( |
457 | maybe_padded_conv_input.reinterpret_last_dimension<VectT, 4>()); |
458 | |
459 | functor::PadInput<GPUDevice, VectT, int, 4>()( |
460 | ctx->eigen_device<GPUDevice>(), conv_input_eigen_tensor, |
461 | {{extra_top_padding, extra_left_padding}}, |
462 | {{extra_bottom_padding, extra_right_padding}}, |
463 | padded_conv_input_eigen_tensor, pad_data_format, T{}); |
464 | |
465 | conv_input = &maybe_padded_conv_input; |
466 | conv_input_rows = new_conv_input_rows; |
467 | conv_input_cols = new_conv_input_cols; |
468 | } |
469 | } |
470 | |
471 | constexpr auto data_layout = se::dnn::DataLayout::kBatchDepthYX4; |
472 | constexpr auto filter_layout = se::dnn::FilterLayout::kOutputInputYX4; |
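// kBatchDepthYX4 and kOutputInputYX4 are the StreamExecutor counterparts of
// the NCHW_VECT_C and OIHW_VECT_I (int8x4) layouts assumed above.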
473 | |
474 | se::dnn::BatchDescriptor conv_input_desc; |
475 | conv_input_desc.set_count(batch_size) |
476 | .set_feature_map_count(conv_input_depth) |
477 | .set_height(conv_input_rows) |
478 | .set_width(conv_input_cols) |
479 | .set_layout(data_layout); |
480 | se::dnn::FilterDescriptor filter_desc; |
481 | filter_desc.set_input_filter_height(filter_rows) |
482 | .set_input_filter_width(filter_cols) |
483 | .set_input_feature_map_count(conv_input_depth) |
484 | .set_output_feature_map_count(output_depth) |
485 | .set_layout(filter_layout); |
486 | se::dnn::BatchDescriptor side_input_desc; |
487 | side_input_desc.set_count(batch_size) |
488 | .set_height(output_rows) |
489 | .set_width(output_cols) |
490 | .set_feature_map_count(output_depth) |
491 | .set_layout(data_layout); |
492 | se::dnn::BatchDescriptor bias_desc; |
493 | bias_desc.set_count(1) |
494 | .set_height(1) |
495 | .set_width(1) |
496 | .set_feature_map_count(output_depth) |
497 | .set_layout(se::dnn::DataLayout::kBatchDepthYX); |
498 | se::dnn::BatchDescriptor output_desc; |
499 | output_desc.set_count(batch_size) |
500 | .set_height(output_rows) |
501 | .set_width(output_cols) |
502 | .set_feature_map_count(output_depth) |
503 | .set_layout(data_layout); |
504 | se::dnn::ConvolutionDescriptor conv_desc; |
505 | CHECK_EQ(0, padding_rows % 2); // Crash OK |
506 | CHECK_EQ(0, padding_cols % 2); // Crash OK |
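// For SAME padding, AdjustPaddingForCudnn above rounds the total padding down
// to an even value (any remainder is applied as explicit padding); for VALID
// it stays zero, so these checks should always hold.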
507 | conv_desc.set_vertical_filter_stride(row_stride) |
508 | .set_horizontal_filter_stride(col_stride) |
509 | .set_zero_padding_height(padding_rows / 2) |
510 | .set_zero_padding_width(padding_cols / 2); |
511 | |
512 | auto conv_input_ptr = AsDeviceMemory( |
513 | reinterpret_cast<const int8*>(conv_input->template flat<T>().data()), |
514 | conv_input->template flat<T>().size()); |
515 | auto filter_ptr = AsDeviceMemory( |
516 | reinterpret_cast<const int8*>(filter_param.template flat<T>().data()), |
517 | filter_param.template flat<T>().size()); |
518 | auto side_input_ptr = |
519 | AsDeviceMemory(reinterpret_cast<const int8*>( |
520 | side_input_param.template flat<T>().data()), |
521 | side_input_param.template flat<T>().size()); |
522 | auto output_ptr = AsDeviceMemory( |
523 | reinterpret_cast<const int8*>(output_param->template flat<T>().data()), |
524 | output_param->template flat<T>().size()); |
525 | using BiasType = float; |
526 | auto bias_ptr = AsDeviceMemory(bias.template flat<BiasType>().data(), |
527 | bias.template flat<BiasType>().size()); |
528 | |
529 | static int64_t ConvolveScratchSize = GetDnnWorkspaceLimit( |
// The default value is in bytes despite the name of the environment variable.
"TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB
532 | ); |
533 | |
534 | se::dnn::ActivationMode dnn_activation_mode; |
535 | switch (fusion) { |
536 | case FusedComputationType::kBiasAdd: |
537 | dnn_activation_mode = se::dnn::ActivationMode::kNone; |
538 | break; |
539 | case FusedComputationType::kBiasAddWithRelu: |
540 | dnn_activation_mode = se::dnn::ActivationMode::kRelu; |
541 | break; |
542 | default: |
LOG(FATAL) << "Unsupported activation type " << static_cast<int>(fusion); // Crash OK
544 | } |
545 | |
546 | const float conv_scale = conv_input_scale_param.scalar<float>()(); |
547 | const float side_input_scale = side_input_scale_param.scalar<float>()(); |
548 | |
549 | constexpr double leakyrelu_alpha = 0; // This op doesn't support leaky relu |
550 | int device_id = stream->parent()->device_ordinal(); |
551 | ConvParameters fused_conv_parameters = { |
552 | batch_size, |
553 | conv_input_depth, |
554 | {{conv_input_rows, conv_input_cols}}, |
555 | data_format, |
556 | output_depth, |
557 | {{filter_rows, filter_cols}}, |
558 | // TODO(yangzihao): Add support for arbitrary dilations for fused conv. |
559 | {{1, 1}}, // dilation_rows, dilation_cols |
560 | {{row_stride, col_stride}}, |
561 | {{padding_rows, padding_cols}}, |
562 | conv_input->dtype(), |
563 | device_id, |
564 | /*group_count=*/1, // This op doesn't support grouped convolutions. |
565 | ConvParameters::FusionInfo{conv_scale, side_input_scale, leakyrelu_alpha, |
566 | dnn_activation_mode, |
567 | /*is_contrib=*/false}, |
568 | }; |
569 | |
570 | constexpr auto type = se::dnn::ToDataType<int8>::value; |
571 | constexpr auto bias_type = se::dnn::ToDataType<BiasType>::value; |
572 | |
573 | const bool use_cudnn_frontend = CudnnUseFrontend(); |
574 | AutotuneEntry<se::dnn::FusedConvOp> autotune_entry; |
575 | if (!FusedConvAutotuneMap::GetInstance()->Find(fused_conv_parameters, |
576 | &autotune_entry)) { |
577 | VLOG(2) << "Autotuning fused convolution (use_frontend=" |
578 | << use_cudnn_frontend << "): " << fused_conv_parameters.ToString(); |
profiler::ScopedAnnotation trace("cudnn_autotuning");
580 | |
581 | std::vector<std::unique_ptr<const se::dnn::FusedConvRunner>> runners; |
582 | TF_CHECK_OK(stream->parent()->GetFusedConvolveRunners( |
583 | use_cudnn_frontend, se::dnn::ConvolutionKind::FORWARD, type, bias_type, |
584 | type, conv_scale, side_input_scale, /*leakyrelu_alpha=*/0.0, stream, |
585 | conv_input_desc, filter_desc, bias_desc, output_desc, conv_desc, |
586 | /*use_fallback=*/false, dnn_activation_mode, &runners)); |
587 | |
588 | auto launch_func = |
589 | [&](se::ScratchAllocator* allocator_used, |
590 | const std::unique_ptr<const se::dnn::FusedConvRunner>& runner, |
591 | se::dnn::ProfileResult* profile_result) -> Status { |
592 | TF_ASSIGN_OR_RETURN(auto scratch, allocator_used->AllocateBytes( |
593 | runner->GetWorkspaceSize())); |
594 | return (*runner)(stream, profile_result, scratch, conv_input_ptr, |
595 | filter_ptr, side_input_ptr, bias_ptr, output_ptr); |
596 | }; |
597 | |
598 | auto results_or = internal::AutotuneConvImpl( |
599 | ctx, runners, cudnn_use_autotune, launch_func, ConvolveScratchSize, |
600 | rz_allocator); |
601 | OP_REQUIRES_OK(ctx, results_or.status()); |
602 | auto results = std::move(results_or).value(); |
603 | |
604 | LogFusedConvForwardAutotuneResults( |
605 | type, conv_input_ptr, filter_ptr, output_ptr, bias_ptr, side_input_ptr, |
606 | conv_input_desc, filter_desc, output_desc, conv_desc, conv_scale, |
607 | side_input_scale, dnn_activation_mode, stream->parent(), results); |
608 | |
609 | // Two-level autotuning: Cudnn frontend supports two engine lists: |
610 | // heuristics and fallback. Heuristics engines are normally faster. |
611 | // To reduce autotuning time, we evaluate the fallback engines only when |
612 | // none of the heuristics engines work. |
613 | bool found_working_engine = false; |
614 | for (auto& result : results) { |
615 | if (!result.has_failure()) { |
616 | found_working_engine = true; |
617 | break; |
618 | } |
619 | } |
620 | |
621 | if (!CudnnUseFrontend() || found_working_engine) { |
622 | auto runners_or = BestCudnnConvAlgorithm<se::dnn::FusedConvOp>( |
623 | results, std::move(runners)); |
624 | OP_REQUIRES_OK(ctx, runners_or.status()); |
625 | autotune_entry = {std::move(runners_or).value()}; |
626 | } else { |
627 | std::vector<std::unique_ptr<const se::dnn::FusedConvRunner>> |
628 | fallback_runners; |
629 | TF_CHECK_OK(stream->parent()->GetFusedConvolveRunners( |
630 | use_cudnn_frontend, se::dnn::ConvolutionKind::FORWARD, type, |
631 | bias_type, type, conv_scale, side_input_scale, |
632 | leakyrelu_alpha, stream, conv_input_desc, filter_desc, |
633 | bias_desc, output_desc, conv_desc, |
634 | /*use_fallback=*/true, dnn_activation_mode, &fallback_runners)); |
635 | |
636 | auto fallback_results_or = internal::AutotuneConvImpl( |
637 | ctx, fallback_runners, cudnn_use_autotune, launch_func, |
638 | ConvolveScratchSize, rz_allocator); |
639 | OP_REQUIRES_OK(ctx, fallback_results_or.status()); |
640 | auto fallback_results = std::move(fallback_results_or).value(); |
641 | |
642 | LogFusedConvForwardAutotuneResults( |
643 | type, conv_input_ptr, filter_ptr, output_ptr, bias_ptr, |
644 | side_input_ptr, conv_input_desc, filter_desc, output_desc, conv_desc, |
645 | conv_scale, side_input_scale, dnn_activation_mode, stream->parent(), |
646 | fallback_results); |
647 | |
648 | auto fallback_runners_or = BestCudnnConvAlgorithm<se::dnn::FusedConvOp>( |
649 | fallback_results, std::move(fallback_runners)); |
650 | OP_REQUIRES_OK(ctx, fallback_runners_or.status()); |
651 | autotune_entry = {std::move(fallback_runners_or).value()}; |
652 | } |
653 | |
654 | FusedConvAutotuneMap::GetInstance()->Insert(fused_conv_parameters, |
655 | autotune_entry); |
656 | } |
657 | |
658 | DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx); |
659 | Status cudnn_launch_status; |
660 | if (!autotune_entry.is_algorithm_config()) { |
661 | auto& runners = autotune_entry.GetOpRunners(); |
662 | typename se::dnn::FusedConvOp::Config config{ |
663 | se::dnn::ConvolutionKind::FORWARD, |
664 | type, |
665 | bias_type, |
666 | type, |
667 | conv_scale, |
668 | side_input_scale, |
669 | leakyrelu_alpha, |
670 | conv_input_desc, |
671 | filter_desc, |
672 | bias_desc, |
673 | output_desc, |
674 | conv_desc, |
675 | dnn_activation_mode}; |
676 | |
677 | auto primary_or = runners.primary->GetOrCreateRunner(config, stream); |
678 | OP_REQUIRES_OK(ctx, primary_or.status()); |
679 | auto primary = primary_or.value(); |
680 | |
681 | const se::dnn::FusedConvRunner* no_scratch_fallback = nullptr; |
682 | if (runners.no_scratch_fallback) { |
683 | auto no_scratch_fallback_or = |
684 | runners.no_scratch_fallback->GetOrCreateRunner(config, stream); |
685 | OP_REQUIRES_OK(ctx, no_scratch_fallback_or.status()); |
686 | no_scratch_fallback = no_scratch_fallback_or.value(); |
687 | } |
688 | |
689 | auto runner_and_scratch_or = |
690 | AllocateScratchOrFallback<se::dnn::FusedConvOp::Signature>( |
691 | &scratch_allocator, primary, no_scratch_fallback); |
692 | OP_REQUIRES_OK(ctx, runner_and_scratch_or.status()); |
693 | auto runner_and_scratch = std::move(runner_and_scratch_or).value(); |
694 | auto& runner = |
695 | *std::get<const se::dnn::FusedConvRunner*>(runner_and_scratch); |
696 | cudnn_launch_status = runner( |
697 | stream, /*output_profile_result=*/nullptr, |
698 | std::get<se::DeviceMemoryBase>(runner_and_scratch), conv_input_ptr, |
699 | filter_ptr, side_input_ptr, bias_ptr, output_ptr); |
700 | } else { |
701 | cudnn_launch_status = stream->FusedConvolveWithAlgorithm( |
702 | conv_input_desc, conv_input_ptr, conv_scale, filter_desc, filter_ptr, |
703 | conv_desc, side_input_ptr, side_input_scale, bias_desc, bias_ptr, |
704 | dnn_activation_mode, output_desc, &output_ptr, &scratch_allocator, |
705 | autotune_entry.GetAlgorithmConfig(), |
706 | /*output_profile_result=*/nullptr); |
707 | } |
708 | |
709 | if (!cudnn_launch_status.ok()) { |
710 | ctx->SetStatus(cudnn_launch_status); |
711 | } |
712 | } |
713 | }; |
714 | |
715 | template <> |
716 | struct LaunchFusedConv2DOp<GPUDevice, int8> |
717 | : LaunchFusedConv2DOpGpuInt8Helper<int8> {}; |
718 | |
719 | template <> |
720 | struct LaunchFusedConv2DOp<GPUDevice, qint8> |
721 | : LaunchFusedConv2DOpGpuInt8Helper<qint8> {}; |
722 | |
723 | #endif // GOOGLE_CUDA |
724 | |
725 | } // namespace tensorflow |
726 | |