1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | // See docs in ../ops/nn_ops.cc. |
17 | |
18 | #define USE_EIGEN_TENSOR |
19 | #define EIGEN_USE_THREADS |
20 | |
21 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
22 | #define EIGEN_USE_GPU |
23 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
24 | |
25 | #include "tensorflow/core/kernels/conv_ops.h" |
26 | |
27 | #include <string.h> |
28 | |
29 | #include <atomic> |
30 | #include <map> |
31 | #include <utility> |
32 | #include <vector> |
33 | |
34 | #include "absl/synchronization/blocking_counter.h" |
35 | #include "tensorflow/core/framework/allocator.h" |
36 | #include "tensorflow/core/framework/bounds_check.h" |
37 | #include "tensorflow/core/framework/kernel_shape_util.h" |
38 | #include "tensorflow/core/framework/numeric_op.h" |
39 | #include "tensorflow/core/framework/op_kernel.h" |
40 | #include "tensorflow/core/framework/register_types.h" |
41 | #include "tensorflow/core/framework/tensor.h" |
42 | #include "tensorflow/core/framework/tensor_shape.h" |
43 | #include "tensorflow/core/framework/tensor_slice.h" |
44 | #include "tensorflow/core/framework/types.h" |
45 | #include "tensorflow/core/kernels/conv_2d.h" |
46 | #include "tensorflow/core/kernels/deep_conv2d.h" |
47 | #include "tensorflow/core/kernels/fill_functor.h" |
48 | #include "tensorflow/core/kernels/ops_util.h" |
49 | #include "tensorflow/core/lib/core/errors.h" |
50 | #include "tensorflow/core/lib/gtl/array_slice.h" |
51 | #include "tensorflow/core/lib/strings/numbers.h" |
52 | #include "tensorflow/core/lib/strings/str_util.h" |
53 | #include "tensorflow/core/platform/logging.h" |
54 | #include "tensorflow/core/platform/macros.h" |
55 | #include "tensorflow/core/profiler/lib/scoped_annotation.h" |
56 | #include "tensorflow/core/util/padding.h" |
57 | #include "tensorflow/core/util/tensor_format.h" |
58 | #include "tensorflow/core/util/use_cudnn.h" |
59 | |
60 | #ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS |
61 | #include "tensorflow/core/kernels/xsmm_conv2d.h" |
62 | #endif |
63 | |
64 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
65 | #include "tensorflow/core/kernels/conv_ops_gpu.h" |
66 | #include "tensorflow/core/platform/stream_executor.h" |
67 | #include "tensorflow/core/protobuf/autotuning.pb.h" |
68 | #include "tensorflow/core/util/autotune_maps/conv_autotune_maps.h" |
69 | #include "tensorflow/core/util/autotune_maps/conv_parameters.h" |
70 | #include "tensorflow/core/util/proto/proto_utils.h" |
71 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
72 | #if GOOGLE_CUDA |
73 | #include "tensorflow/compiler/xla/stream_executor/gpu/gpu_asm_opts.h" |
74 | #include "tensorflow/compiler/xla/stream_executor/gpu/redzone_allocator.h" |
75 | #include "tensorflow/compiler/xla/stream_executor/tf_allocator_adapter.h" |
76 | #endif // GOOGLE_CUDA |
77 | |
78 | namespace tensorflow { |
79 | |
80 | typedef Eigen::ThreadPoolDevice CPUDevice; |
81 | typedef Eigen::GpuDevice GPUDevice; |
82 | |
83 | namespace { |
84 | template <typename Device, typename T> |
85 | struct LaunchGeneric { |
86 | void operator()(OpKernelContext* ctx, const Tensor& input, |
87 | const Tensor& filter, int row_stride, int col_stride, |
88 | int row_dilation, int col_dilation, const Padding& padding, |
89 | const std::vector<int64_t>& explicit_paddings, Tensor* output, |
90 | TensorFormat data_format) { |
91 | CHECK(data_format == FORMAT_NHWC) << "Generic conv implementation only " |
92 | "supports NHWC tensor format for now." ; |
93 | if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 && |
94 | col_stride == 1 && (padding == SAME || padding == VALID)) { |
95 | // For 1x1 kernel, the 2D convolution is reduced to matrix |
96 | // multiplication. |
97 | // |
98 | // TODO(vrv): We should be able to call SpatialConvolution |
99 | // and it will produce the same result, but doing so |
100 | // led to NaNs during training. Using matmul instead for now. |
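// Concretely, for an NHWC input of shape [N, H, W, C] and a 1x1 filter of
// shape [1, 1, C, K], the output [N, H, W, K] is a single matmul of
// [N*H*W, C] by [C, K]; conv_width below is N*H*W taken from the output
// shape.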
101 | int conv_width = 1; // Width for the convolution step. |
102 | for (int i = 0; i < 3; ++i) { |
103 | conv_width *= output->dim_size(i); |
104 | } |
105 | |
106 | Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair; |
107 | dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0); |
108 | functor::MatMulConvFunctor<Device, T>()( |
109 | ctx->eigen_device<Device>(), |
110 | output->shaped<T, 2>({conv_width, filter.dim_size(3)}), |
111 | input.shaped<T, 2>({conv_width, filter.dim_size(2)}), |
112 | filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}), |
113 | dim_pair); |
114 | } else if (filter.dim_size(0) == input.dim_size(1) && |
115 | filter.dim_size(1) == input.dim_size(2) && row_dilation == 1 && |
116 | col_dilation == 1 && padding == VALID) { |
117 | // If the input data and filter have the same height/width, |
118 | // the 2D convolution is reduced to matrix multiplication. |
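// Concretely, with VALID padding and a filter spanning the full HxW extent
// of the input, each batch element yields exactly one output pixel, so the
// computation is a matmul of [N, H*W*C] by [H*W*C, K] (k below is H*W*C).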
119 | const int k = // Length of reduction dimension. |
120 | filter.dim_size(0) * filter.dim_size(1) * filter.dim_size(2); |
121 | |
122 | Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair; |
123 | dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0); |
124 | functor::MatMulConvFunctor<Device, T>()( |
125 | ctx->eigen_device<Device>(), |
126 | output->shaped<T, 2>({input.dim_size(0), filter.dim_size(3)}), |
127 | input.shaped<T, 2>({input.dim_size(0), k}), |
128 | filter.shaped<T, 2>({k, filter.dim_size(3)}), dim_pair); |
129 | } else { |
130 | if (padding == EXPLICIT) { |
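// With NHWC data, explicit_paddings holds {before, after} pairs per
// dimension, so indices 2..5 are the top/bottom/left/right spatial paddings.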
131 | functor::SpatialConvolution<Device, T>()( |
132 | ctx->eigen_device<Device>(), output->tensor<T, 4>(), |
133 | input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride, col_stride, |
134 | row_dilation, col_dilation, static_cast<int>(explicit_paddings[2]), |
135 | static_cast<int>(explicit_paddings[3]), |
136 | static_cast<int>(explicit_paddings[4]), |
137 | static_cast<int>(explicit_paddings[5])); |
138 | } else { |
139 | functor::SpatialConvolution<Device, T>()( |
140 | ctx->eigen_device<Device>(), output->tensor<T, 4>(), |
141 | input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride, col_stride, |
142 | row_dilation, col_dilation, BrainPadding2EigenPadding(padding)); |
143 | } |
144 | } |
145 | } |
146 | }; |
147 | |
148 | // Compute grouped 2D convolutions on CPU. Unlike the grouped convolution |
149 | // implementation in cuDNN, this is far from optimal and needs more work |
150 | // to deliver competitive performance. Currently it exists to close the feature |
151 | // parity gap between convolution operations on different devices. |
152 | template <typename T> |
153 | struct LaunchGrouped { |
154 | void operator()(OpKernelContext* ctx, const Tensor& input, |
155 | const Tensor& filter, int row_stride, int col_stride, |
156 | int row_dilation, int col_dilation, const Padding& padding, |
157 | const std::vector<int64_t>& explicit_paddings, Tensor* output, |
158 | TensorFormat data_format) { |
159 | DCHECK(data_format == FORMAT_NHWC) |
160 | << "Grouped conv implementation only " |
161 | "supports NHWC tensor format for now." ; |
162 | |
163 | const int64_t in_depth = input.dim_size(3); |
164 | const int64_t patch_depth = filter.dim_size(2); |
165 | const int64_t num_groups = in_depth / patch_depth; |
166 | |
167 | // Shuffle input/filter tensors to have group as a leading dimension. |
168 | std::array<int64_t, 5> shuffle({3, 0, 1, 2, 4}); |
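// For example, an NHWC input [N, H, W, C] is first viewed as
// [N, H, W, G, C/G] (pre_shuffle below) and then permuted to
// [G, N, H, W, C/G] (post_shuffle), so each group can be chipped off the
// leading dimension and convolved independently.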
169 | |
170 | // Compute pre-shuffle dimensions. |
171 | auto pre_shuffle = [&](const Tensor& tensor) -> std::array<int64, 5> { |
172 | return {tensor.dim_size(0), tensor.dim_size(1), tensor.dim_size(2), |
173 | num_groups, tensor.dim_size(3) / num_groups}; |
174 | }; |
175 | |
176 | // Compute post-shuffle dimensions. |
177 | auto post_shuffle = [&](const Tensor& tensor) -> std::array<int64, 5> { |
178 | return {num_groups, tensor.dim_size(0), tensor.dim_size(1), |
179 | tensor.dim_size(2), tensor.dim_size(3) / num_groups}; |
180 | }; |
181 | |
182 | auto& device = ctx->eigen_device<CPUDevice>(); |
183 | |
184 | absl::BlockingCounter shuffles_completed(2); |
185 | auto on_shuffled = [&]() { shuffles_completed.DecrementCount(); }; |
186 | |
187 | // Shuffle input into temporary tensor. |
188 | Tensor input_shuffled; |
189 | OP_REQUIRES_OK( |
190 | ctx, ctx->allocate_temp(input.dtype(), TensorShape(post_shuffle(input)), |
191 | &input_shuffled)); |
192 | input_shuffled.tensor<T, 5>().device(device, on_shuffled) = |
193 | input.shaped<T, 5>(pre_shuffle(input)).shuffle(shuffle); |
194 | |
195 | // Shuffle filter into temporary tensor. |
196 | Tensor filter_shuffled; |
197 | OP_REQUIRES_OK(ctx, ctx->allocate_temp(filter.dtype(), |
198 | TensorShape(post_shuffle(filter)), |
199 | &filter_shuffled)); |
200 | filter_shuffled.tensor<T, 5>().device(device, on_shuffled) = |
201 | filter.shaped<T, 5>(pre_shuffle(filter)).shuffle(shuffle); |
202 | |
203 | // Wait for the completion of input/filter shuffles. |
204 | shuffles_completed.Wait(); |
205 | |
206 | // Write group convolution results into temporary output tensor. |
207 | Tensor output_shuffled; |
208 | OP_REQUIRES_OK(ctx, ctx->allocate_temp(output->dtype(), |
209 | TensorShape(post_shuffle(*output)), |
210 | &output_shuffled)); |
211 | |
212 | for (int64_t i = 0; i < num_groups; ++i) { |
213 | // TODO(ezhulenev): Run this loop using `parallelFor` (regular parallelFor |
214 | // will lead to deadlock, SpatialConvolution has to use async Eigen |
215 | // assignment). This requires small changes to Eigen to support async |
216 | // execution for the tensor chipping operation. |
217 | |
218 | // TODO(ezhulenev): Grouped convolution should also support 1x1 filter |
219 | // optimization. |
220 | |
221 | auto input_slice = input_shuffled.tensor<T, 5>().template chip<0>(i); |
222 | auto filter_slice = filter_shuffled.tensor<T, 5>().template chip<0>(i); |
223 | auto output_slice = output_shuffled.tensor<T, 5>().template chip<0>(i); |
224 | |
225 | if (padding == EXPLICIT) { |
226 | functor::SpatialConvolution<CPUDevice, T>()( |
227 | ctx->eigen_device<CPUDevice>(), output_slice, input_slice, |
228 | filter_slice, row_stride, col_stride, row_dilation, col_dilation, |
229 | static_cast<int>(explicit_paddings[2]), |
230 | static_cast<int>(explicit_paddings[3]), |
231 | static_cast<int>(explicit_paddings[4]), |
232 | static_cast<int>(explicit_paddings[5])); |
233 | } else { |
234 | functor::SpatialConvolution<CPUDevice, T>()( |
235 | ctx->eigen_device<CPUDevice>(), output_slice, input_slice, |
236 | filter_slice, row_stride, col_stride, row_dilation, col_dilation, |
237 | BrainPadding2EigenPadding(padding)); |
238 | } |
239 | } |
240 | |
241 | // Shuffle temporary output back into pre-shuffled shape. |
242 | std::array<int64_t, 5> rev_shuffle({1, 2, 3, 0, 4}); |
243 | output->shaped<T, 5>(pre_shuffle(*output)).device(device) = |
244 | output_shuffled.tensor<T, 5>().shuffle(rev_shuffle); |
245 | } |
246 | }; |
247 | |
248 | } // namespace |
249 | |
250 | template <typename T> |
251 | struct LaunchConv2DOp<CPUDevice, T> { |
252 | void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, |
253 | const Tensor& input, const Tensor& filter, int row_dilation, |
254 | int col_dilation, int row_stride, int col_stride, |
255 | const Padding& padding, |
256 | const std::vector<int64_t>& explicit_paddings, Tensor* output, |
257 | TensorFormat data_format) { |
258 | if (data_format != FORMAT_NHWC) { |
259 | ctx->SetStatus(errors::Unimplemented( |
260 | "The Conv2D op currently only supports the NHWC tensor format on the " |
261 | "CPU. The op was given the format: " , |
262 | ToString(data_format))); |
263 | return; |
264 | } |
265 | |
266 | for (int64_t explicit_padding : explicit_paddings) { |
267 | if (!FastBoundsCheck(explicit_padding, std::numeric_limits<int>::max())) { |
268 | ctx->SetStatus(errors::InvalidArgument("filter too large")); |
269 | return; |
270 | } |
271 | } |
272 | |
273 | const int64_t in_depth = input.dim_size(3); |
274 | const int64_t out_depth = output->dim_size(3); |
275 | const int64_t patch_depth = filter.dim_size(2); |
276 | |
277 | if (patch_depth <= 0) { |
278 | ctx->SetStatus(errors::InvalidArgument( |
279 | "filter depth must be stricly positive, got " , patch_depth)); |
280 | return; |
281 | } |
282 | if (in_depth % patch_depth != 0) { |
283 | ctx->SetStatus(errors::InvalidArgument( |
284 | "input depth must be evenly divisible by filter depth: " , in_depth, |
285 | " vs " , patch_depth)); |
286 | return; |
287 | } |
288 | if (filter.NumElements() <= 0) { |
289 | ctx->SetStatus( |
290 | errors::InvalidArgument("filter must not have zero elements " |
291 | "(i.e. all dimensions must be non-zero)" )); |
292 | return; |
293 | } |
294 | |
295 | const int64_t num_groups = in_depth / patch_depth; |
296 | if (num_groups <= 0) { |
297 | ctx->SetStatus(errors::InvalidArgument( |
298 | "number of groups must be stricly positive, got " , num_groups)); |
299 | return; |
300 | } |
301 | if (out_depth % num_groups != 0 || out_depth < num_groups) { |
302 | ctx->SetStatus(errors::InvalidArgument( |
303 | "output depth must be evenly divisible by number of groups: " , |
304 | out_depth, " vs " , num_groups)); |
305 | return; |
306 | } |
307 | |
308 | if (in_depth != patch_depth) { |
309 | LaunchGrouped<T>()(ctx, input, filter, row_stride, col_stride, |
310 | row_dilation, col_dilation, padding, explicit_paddings, |
311 | output, data_format); |
312 | } else { |
313 | LaunchGeneric<CPUDevice, T>()(ctx, input, filter, row_stride, col_stride, |
314 | row_dilation, col_dilation, padding, |
315 | explicit_paddings, output, data_format); |
316 | } |
317 | } |
318 | }; |
319 | |
320 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
321 | template <> |
322 | struct LaunchConv2DOp<GPUDevice, int32> { |
323 | void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, |
324 | const Tensor& input, const Tensor& filter, int row_dilation, |
325 | int col_dilation, int row_stride, int col_stride, |
326 | const Padding& padding, |
327 | const std::vector<int64_t>& explicit_paddings, Tensor* output, |
328 | TensorFormat data_format) { |
329 | if (data_format != FORMAT_NHWC) { |
330 | ctx->SetStatus( |
331 | errors::Unimplemented("The Conv2D op currently only supports the " |
332 | "NHWC tensor format for integer types. " |
333 | "The op was given the format: " , |
334 | ToString(data_format))); |
335 | return; |
336 | } |
337 | const int64_t in_depth = GetTensorDim(input, data_format, 'C'); |
338 | OP_REQUIRES(ctx, in_depth == filter.dim_size(2), |
339 | errors::Unimplemented( |
340 | "The Conv2D op currently does not support grouped " |
341 | "convolutions for integer types. A grouped convolution was " |
342 | "attempted to be run because the input depth of " , |
343 | in_depth, " does not match the filter input depth of " , |
344 | filter.dim_size(2))); |
345 | OP_REQUIRES( |
346 | ctx, filter.NumElements() > 0, |
347 | errors::InvalidArgument("filter must not have zero elements " |
348 | "(i.e. all dimensions must be non-zero)" )); |
349 | |
350 | for (int64_t explicit_padding : explicit_paddings) { |
351 | if (!FastBoundsCheck(explicit_padding, std::numeric_limits<int>::max())) { |
352 | ctx->SetStatus(errors::InvalidArgument("filter too large")); |
353 | return; |
354 | } |
355 | } |
356 | LaunchGeneric<GPUDevice, int32>()( |
357 | ctx, input, filter, row_stride, col_stride, row_dilation, col_dilation, |
358 | padding, explicit_paddings, output, data_format); |
359 | } |
360 | }; |
361 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
362 | |
363 | template <typename Device, typename T> |
364 | class LaunchDeepConvOp { |
365 | public: |
366 | static bool Run(OpKernelContext* ctx, const Tensor& input, |
367 | const Tensor& filter, int batch, int input_rows, |
368 | int input_cols, int in_depth, int filter_rows, |
369 | int filter_cols, int pad_rows, int pad_cols, int out_rows, |
370 | int /*out_cols*/, int /*out_depth*/, int /*dilation_rows*/, |
371 | int /*dilation_cols*/, int /*stride_rows*/, |
372 | int /*stride_cols*/, Tensor* /*output*/, |
373 | TensorFormat /*data_format*/) { |
374 | return false; |
375 | } |
376 | }; |
377 | |
378 | // Conditionally launches DeepConv operation based on convolution parameters. |
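// Note: DeepConv2D (see deep_conv2d.h) is based on spatial filter transforms
// (e.g. Winograd) that are typically only profitable for small filters with
// unit strides; CanUseDeepConv2D below encodes those conditions.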
379 | template <> |
380 | class LaunchDeepConvOp<CPUDevice, float> { |
381 | public: |
382 | static bool Run(OpKernelContext* ctx, const Tensor& input, |
383 | const Tensor& filter, int batch, int input_rows, |
384 | int input_cols, int in_depth, int filter_rows, |
385 | int filter_cols, int pad_rows, int pad_cols, int out_rows, |
386 | int out_cols, int out_depth, int dilation_rows, |
387 | int dilation_cols, int stride_rows, int stride_cols, |
388 | Tensor* output, TensorFormat data_format) { |
389 | if (data_format != FORMAT_NHWC || dilation_rows != 1 || |
390 | dilation_cols != 1 || |
391 | !CanUseDeepConv2D(stride_rows, stride_cols, filter_rows, filter_cols, |
392 | in_depth, out_depth, out_rows, out_cols)) { |
393 | return false; |
394 | } |
395 | |
396 | Conv2DArgs args; |
397 | args.batch = batch; |
398 | args.in_rows = input_rows; |
399 | args.in_cols = input_cols; |
400 | args.in_depth = in_depth; |
401 | args.filter_rows = filter_rows; |
402 | args.filter_cols = filter_cols; |
403 | args.pad_rows = pad_rows; |
404 | args.pad_cols = pad_cols; |
405 | args.out_rows = out_rows; |
406 | args.out_cols = out_cols; |
407 | args.out_depth = out_depth; |
408 | |
409 | auto input_ptr = input.template flat<float>().data(); |
410 | auto filter_ptr = filter.template flat<float>().data(); |
411 | auto output_ptr = output->template flat<float>().data(); |
412 | |
413 | functor::DeepConv2D<CPUDevice, float>()(ctx, args, input_ptr, filter_ptr, |
414 | output_ptr); |
415 | return true; |
416 | } |
417 | }; |
418 | |
419 | #ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS |
420 | template <typename Device, typename T> |
421 | class LaunchXsmmConvOp { |
422 | public: |
423 | static bool Run(OpKernelContext* ctx, const Tensor& input, |
424 | const Tensor& filter, int batch, int input_rows, |
425 | int input_cols, int in_depth, int filter_rows, |
426 | int filter_cols, int pad_rows, int pad_cols, int out_rows, |
427 | int out_cols, int out_depth, int stride_rows, int stride_cols, |
428 | int dilation_rows, int dilation_cols, Tensor* output, |
429 | TensorFormat data_format) { |
430 | return false; |
431 | } |
432 | }; |
433 | |
434 | template <> |
435 | class LaunchXsmmConvOp<CPUDevice, float> { |
436 | public: |
437 | static bool Run(OpKernelContext* ctx, const Tensor& input, |
438 | const Tensor& filter, int batch, int input_rows, |
439 | int input_cols, int in_depth, int filter_rows, |
440 | int filter_cols, int pad_rows, int pad_cols, int out_rows, |
441 | int out_cols, int out_depth, int dilation_rows, |
442 | int dilation_cols, int stride_rows, int stride_cols, |
443 | Tensor* output, TensorFormat data_format) { |
444 | auto num_threads = |
445 | ctx->device()->tensorflow_cpu_worker_threads()->num_threads; |
446 | // See libxsmm_dnn.h for this struct definition. |
447 | libxsmm_dnn_conv_desc desc; |
448 | desc.N = batch; |
449 | desc.C = in_depth; |
450 | desc.H = input_rows; |
451 | desc.W = input_cols; |
452 | desc.K = out_depth; |
453 | desc.R = filter_rows; |
454 | desc.S = filter_cols; |
455 | desc.u = stride_rows; |
456 | desc.v = stride_cols; |
457 | desc.pad_h = pad_rows; |
458 | desc.pad_w = pad_cols; |
459 | desc.pad_h_in = 0; |
460 | desc.pad_w_in = 0; |
461 | desc.pad_h_out = 0; |
462 | desc.pad_w_out = 0; |
463 | desc.threads = num_threads; |
464 | desc.algo = LIBXSMM_DNN_CONV_ALGO_DIRECT; |
465 | desc.buffer_format = LIBXSMM_DNN_TENSOR_FORMAT_NHWC; |
466 | desc.filter_format = LIBXSMM_DNN_TENSOR_FORMAT_LIBXSMM; |
467 | desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE; |
468 | desc.options = LIBXSMM_DNN_CONV_OPTION_OVERWRITE; |
469 | desc.datatype_out = LIBXSMM_DNN_DATATYPE_F32; |
470 | desc.datatype_in = LIBXSMM_DNN_DATATYPE_F32; |
471 | if (dilation_rows != 1 || dilation_cols != 1 || |
472 | !CanUseXsmmConv2D(desc, data_format)) { |
473 | return false; |
474 | } |
475 | |
476 | auto input_ptr = input.template flat<float>().data(); |
477 | auto filter_ptr = filter.template flat<float>().data(); |
478 | auto output_ptr = output->template flat<float>().data(); |
479 | |
480 | bool success = functor::XsmmFwdConv2D<CPUDevice, float>()( |
481 | ctx, desc, input_ptr, filter_ptr, output_ptr); |
482 | return success; |
483 | } |
484 | }; |
485 | #endif |
486 | |
487 | #define TF_REQUIRES(EXP, STATUS) \ |
488 | do { \ |
489 | if (!TF_PREDICT_TRUE(EXP)) return (STATUS); \ |
490 | } while (false) |
491 | |
492 | Status InitConv2DParameters(const OpKernelConstruction* context, |
493 | Conv2DParameters* params) { |
494 | TF_RETURN_IF_ERROR(context->GetAttr("dilations" , ¶ms->dilations)); |
495 | TF_RETURN_IF_ERROR(context->GetAttr("strides" , ¶ms->strides)); |
496 | TF_RETURN_IF_ERROR(context->GetAttr("padding" , ¶ms->padding)); |
497 | if (context->HasAttr("explicit_paddings" )) { |
498 | TF_RETURN_IF_ERROR( |
499 | context->GetAttr("explicit_paddings" , ¶ms->explicit_paddings)); |
500 | } |
501 | string data_format_string; |
502 | TF_RETURN_IF_ERROR(context->GetAttr("data_format" , &data_format_string)); |
503 | TF_REQUIRES(FormatFromString(data_format_string, ¶ms->data_format), |
504 | errors::InvalidArgument("Invalid data format" )); |
505 | |
506 | const auto& strides = params->strides; |
507 | const auto& dilations = params->dilations; |
508 | const auto& data_format = params->data_format; |
509 | |
510 | TF_REQUIRES(dilations.size() == 4, |
511 | errors::InvalidArgument("Sliding window dilations field must " |
512 | "specify 4 dimensions" )); |
513 | TF_REQUIRES(strides.size() == 4, |
514 | errors::InvalidArgument("Sliding window strides field must " |
515 | "specify 4 dimensions" )); |
516 | const int64_t stride_n = GetTensorDim(strides, data_format, 'N'); |
517 | const int64_t stride_c = GetTensorDim(strides, data_format, 'C'); |
518 | const int64_t stride_h = GetTensorDim(strides, data_format, 'H'); |
519 | const int64_t stride_w = GetTensorDim(strides, data_format, 'W'); |
520 | TF_REQUIRES( |
521 | stride_n == 1 && stride_c == 1, |
522 | errors::Unimplemented("Current implementation does not yet support " |
523 | "strides in the batch and depth dimensions." )); |
524 | TF_REQUIRES(stride_h > 0 && stride_w > 0, |
525 | errors::InvalidArgument( |
526 | "Row and column strides should be larger than 0." )); |
527 | |
528 | const int64_t dilation_n = GetTensorDim(dilations, data_format, 'N'); |
529 | const int64_t dilation_c = GetTensorDim(dilations, data_format, 'C'); |
530 | const int64_t dilation_h = GetTensorDim(dilations, data_format, 'H'); |
531 | const int64_t dilation_w = GetTensorDim(dilations, data_format, 'W'); |
532 | TF_REQUIRES( |
533 | dilation_n == 1 && dilation_c == 1, |
534 | errors::Unimplemented("Current implementation does not yet support " |
535 | "dilations in the batch and depth dimensions." )); |
536 | TF_REQUIRES( |
537 | dilation_h > 0 && dilation_w > 0, |
538 | errors::InvalidArgument("Dilated rates should be larger than 0." )); |
539 | |
540 | int num_dims = data_format == TensorFormat::FORMAT_NCHW_VECT_C ? 5 : 4; |
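// NCHW_VECT_C shapes carry an extra innermost vector dimension (channels
// packed in small groups, e.g. int8x4), hence the 5-D padding check.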
541 | TF_RETURN_IF_ERROR(CheckValidPadding( |
542 | params->padding, params->explicit_paddings, num_dims, data_format)); |
543 | |
544 | return OkStatus(); |
545 | } |
546 | |
547 | Status ComputeConv2DDimension(const Conv2DParameters& params, |
548 | const Tensor& input, const Tensor& filter, |
549 | Conv2DDimensions* dimensions) { |
550 | int required_dims = |
551 | params.data_format == TensorFormat::FORMAT_NCHW_VECT_C ? 5 : 4; |
552 | // Check that 2D convolution input and filter have exactly required_dims. |
553 | TF_REQUIRES( |
554 | input.dims() == required_dims, |
555 | errors::InvalidArgument("convolution input must be " , required_dims, |
556 | "-dimensional: " , input.shape().DebugString())); |
557 | TF_REQUIRES( |
558 | filter.dims() == required_dims, |
559 | errors::InvalidArgument("convolution filter must be " , required_dims, |
560 | "-dimensional: " , filter.shape().DebugString())); |
561 | for (int i = 0; i < required_dims - 1; i++) { |
562 | TF_REQUIRES( |
563 | FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()), |
564 | errors::InvalidArgument("filter too large" )); |
565 | } |
566 | |
567 | FilterTensorFormat filter_format = |
568 | params.data_format == TensorFormat::FORMAT_NCHW_VECT_C |
569 | ? FilterTensorFormat::FORMAT_OIHW_VECT_I |
570 | : FilterTensorFormat::FORMAT_HWIO; |
571 | |
572 | // The last dimension for input is in_depth. Check that it is the same as the |
573 | // filter's in_depth or it is evenly divisible by filter's in_depth. |
574 | const int64_t in_depth_raw = GetTensorDim(input, params.data_format, 'C'); |
575 | const int64_t patch_depth_raw = GetFilterDim(filter, filter_format, 'I'); |
576 | TF_REQUIRES(FastBoundsCheck(in_depth_raw, std::numeric_limits<int>::max()), |
577 | errors::InvalidArgument("Input depth too large" )); |
578 | TF_REQUIRES(FastBoundsCheck(patch_depth_raw, std::numeric_limits<int>::max()), |
579 | errors::InvalidArgument("Patch depth too large" )); |
580 | const int in_depth = static_cast<int>(in_depth_raw); |
581 | const int patch_depth = static_cast<int>(patch_depth_raw); |
582 | TF_REQUIRES(patch_depth > 0, |
583 | errors::InvalidArgument( |
584 | "filter depth must be stricly positive, got " , patch_depth)); |
585 | TF_REQUIRES(in_depth % patch_depth == 0, |
586 | errors::InvalidArgument( |
587 | "input depth must be evenly divisible by filter depth: " , |
588 | in_depth, " vs " , patch_depth)); |
589 | |
590 | // The last dimension for filter is out_depth. |
591 | const int out_depth = |
592 | static_cast<int>(GetFilterDim(filter, filter_format, 'O')); |
593 | |
594 | // The second dimension for input is rows/height. |
595 | // The first dimension for filter is rows/height. |
596 | const int64_t input_rows_raw = GetTensorDim(input, params.data_format, 'H'); |
597 | TF_REQUIRES(FastBoundsCheck(input_rows_raw, std::numeric_limits<int>::max()), |
598 | errors::InvalidArgument("Input rows too large" )); |
599 | const int input_rows = static_cast<int>(input_rows_raw); |
600 | const int filter_rows = |
601 | static_cast<int>(GetFilterDim(filter, filter_format, 'H')); |
602 | |
603 | // The third dimension for input is columns/width. |
604 | // The second dimension for filter is columns/width. |
605 | const int64_t input_cols_raw = GetTensorDim(input, params.data_format, 'W'); |
606 | TF_REQUIRES(FastBoundsCheck(input_cols_raw, std::numeric_limits<int>::max()), |
607 | errors::InvalidArgument("Input cols too large" )); |
608 | const int input_cols = static_cast<int>(input_cols_raw); |
609 | const int filter_cols = |
610 | static_cast<int>(GetFilterDim(filter, filter_format, 'W')); |
611 | |
612 | // The first dimension for input is batch. |
613 | const int64_t batch_raw = GetTensorDim(input, params.data_format, 'N'); |
614 | TF_REQUIRES(FastBoundsCheck(batch_raw, std::numeric_limits<int>::max()), |
615 | errors::InvalidArgument("batch is too large" )); |
616 | const int batch = static_cast<int>(batch_raw); |
617 | |
618 | // Take the stride and dilation from the second and third dimensions only (we |
619 | // do not support striding or dilation on the batch or depth dimension). |
620 | const int stride_rows = GetTensorDim(params.strides, params.data_format, 'H'); |
621 | const int stride_cols = GetTensorDim(params.strides, params.data_format, 'W'); |
622 | const int dilation_rows = |
623 | GetTensorDim(params.dilations, params.data_format, 'H'); |
624 | const int dilation_cols = |
625 | GetTensorDim(params.dilations, params.data_format, 'W'); |
626 | |
627 | int64_t pad_rows_before, pad_rows_after, pad_cols_before, pad_cols_after; |
628 | if (params.padding == Padding::EXPLICIT) { |
629 | GetExplicitPaddingForDim(params.explicit_paddings, params.data_format, 'H', |
630 | &pad_rows_before, &pad_rows_after); |
631 | GetExplicitPaddingForDim(params.explicit_paddings, params.data_format, 'W', |
632 | &pad_cols_before, &pad_cols_after); |
633 | } |
634 | |
635 | // Compute windowed output sizes for rows and columns. |
636 | int64_t out_rows = 0, out_cols = 0; |
637 | TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2( |
638 | input_rows, filter_rows, dilation_rows, stride_rows, params.padding, |
639 | &out_rows, &pad_rows_before, &pad_rows_after)); |
640 | TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2( |
641 | input_cols, filter_cols, dilation_cols, stride_cols, params.padding, |
642 | &out_cols, &pad_cols_before, &pad_cols_after)); |
643 | |
644 | dimensions->batch = batch; |
645 | dimensions->input_rows = input_rows; |
646 | dimensions->input_cols = input_cols; |
647 | dimensions->in_depth = in_depth; |
648 | dimensions->filter_rows = filter_rows; |
649 | dimensions->filter_cols = filter_cols; |
650 | dimensions->patch_depth = patch_depth; |
651 | dimensions->out_depth = out_depth; |
652 | dimensions->stride_rows = stride_rows; |
653 | dimensions->stride_cols = stride_cols; |
654 | dimensions->dilation_rows = dilation_rows; |
655 | dimensions->dilation_cols = dilation_cols; |
656 | dimensions->out_rows = out_rows; |
657 | dimensions->out_cols = out_cols; |
658 | dimensions->pad_rows_before = pad_rows_before; |
659 | dimensions->pad_rows_after = pad_rows_after; |
660 | dimensions->pad_cols_before = pad_cols_before; |
661 | dimensions->pad_cols_after = pad_cols_after; |
662 | |
663 | return OkStatus(); |
664 | } |
665 | |
666 | #undef TF_REQUIRES |
667 | |
668 | template <typename Device, typename T> |
669 | class Conv2DOp : public BinaryOp<T> { |
670 | public: |
671 | explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) { |
672 | OP_REQUIRES_OK(context, InitConv2DParameters(context, &params_)); |
673 | |
674 | OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu" , &use_cudnn_)); |
675 | cudnn_use_autotune_ = CudnnUseAutotune(); |
676 | } |
677 | |
678 | void Compute(OpKernelContext* context) override { |
679 | // Input tensor is of the following dimensions: |
680 | // [ batch, in_rows, in_cols, in_depth ] |
681 | const Tensor& input = context->input(0); |
682 | |
683 | // Input filter is of the following dimensions: |
684 | // [ filter_rows, filter_cols, in_depth, out_depth] |
685 | const Tensor& filter = context->input(1); |
686 | |
687 | Conv2DDimensions dimensions; |
688 | OP_REQUIRES_OK(context, |
689 | ComputeConv2DDimension(params_, input, filter, &dimensions)); |
690 | |
691 | TensorShape out_shape = ShapeFromFormat( |
692 | params_.data_format, dimensions.batch, dimensions.out_rows, |
693 | dimensions.out_cols, dimensions.out_depth); |
694 | |
695 | // Output tensor is of the following dimensions: |
696 | // [ in_batch, out_rows, out_cols, out_depth ] |
697 | Tensor* output = nullptr; |
698 | OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); |
699 | |
700 | VLOG(2) << "Conv2D: in_depth = " << dimensions.in_depth |
701 | << ", patch_depth = " << dimensions.patch_depth |
702 | << ", input_cols = " << dimensions.input_cols |
703 | << ", filter_cols = " << dimensions.filter_cols |
704 | << ", input_rows = " << dimensions.input_rows |
705 | << ", filter_rows = " << dimensions.filter_rows |
706 | << ", stride_rows = " << dimensions.stride_rows |
707 | << ", stride_cols = " << dimensions.stride_cols |
708 | << ", dilation_rows = " << dimensions.dilation_rows |
709 | << ", dilation_cols = " << dimensions.dilation_cols |
710 | << ", out_depth = " << dimensions.out_depth; |
711 | |
712 | // If there is nothing to compute, return. |
713 | if (out_shape.num_elements() == 0) { |
714 | return; |
715 | } |
716 | |
717 | // If the input is empty, any output values can only come from the padding. |
718 | if (input.NumElements() == 0) { |
719 | // Zero-out output and return. |
720 | functor::SetZeroFunctor<Device, T>()(context->eigen_device<Device>(), |
721 | output->template flat<T>()); |
722 | |
723 | return; |
724 | } |
725 | |
726 | #ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS |
727 | if (params_.padding != EXPLICIT && |
728 | LaunchXsmmConvOp<Device, T>::Run( |
729 | context, input, filter, dimensions.batch, dimensions.input_rows, |
730 | dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows, |
731 | dimensions.filter_cols, dimensions.pad_rows_before, |
732 | dimensions.pad_cols_before, dimensions.out_rows, |
733 | dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows, |
734 | dimensions.dilation_cols, dimensions.stride_rows, |
735 | dimensions.stride_cols, output, params_.data_format)) { |
736 | return; |
737 | } |
738 | #endif |
739 | |
740 | if (params_.padding != EXPLICIT && |
741 | LaunchDeepConvOp<Device, T>::Run( |
742 | context, input, filter, dimensions.batch, dimensions.input_rows, |
743 | dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows, |
744 | dimensions.filter_cols, dimensions.pad_rows_before, |
745 | dimensions.pad_cols_before, dimensions.out_rows, |
746 | dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows, |
747 | dimensions.dilation_cols, dimensions.stride_rows, |
748 | dimensions.stride_cols, output, params_.data_format)) { |
749 | return; |
750 | } |
751 | |
752 | launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter, |
753 | dimensions.dilation_rows, dimensions.dilation_cols, |
754 | dimensions.stride_rows, dimensions.stride_cols, params_.padding, |
755 | params_.explicit_paddings, output, params_.data_format); |
756 | } |
757 | |
758 | private: |
759 | Conv2DParameters params_; |
760 | bool use_cudnn_; |
761 | bool cudnn_use_autotune_; |
762 | |
763 | LaunchConv2DOp<Device, T> launcher_; |
764 | |
765 | TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp); |
766 | }; |
767 | |
768 | #define REGISTER_CPU(T) \ |
769 | REGISTER_KERNEL_BUILDER( \ |
770 | Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ |
771 | Conv2DOp<CPUDevice, T>); |
772 | |
773 | // If we're using the alternative GEMM-based implementation of Conv2D for the |
774 | // CPU, don't register this EigenTensor-based version. |
775 | #if !defined(USE_GEMM_FOR_CONV) |
776 | TF_CALL_bfloat16(REGISTER_CPU); |
777 | TF_CALL_half(REGISTER_CPU); |
778 | TF_CALL_float(REGISTER_CPU); |
779 | TF_CALL_double(REGISTER_CPU); |
780 | TF_CALL_int32(REGISTER_CPU); |
781 | #endif // USE_GEMM_FOR_CONV |
782 | |
783 | // To be used inside depthwise_conv_op.cc. |
784 | template struct LaunchConv2DOp<CPUDevice, Eigen::bfloat16>; |
785 | template struct LaunchConv2DOp<CPUDevice, Eigen::half>; |
786 | template struct LaunchConv2DOp<CPUDevice, float>; |
787 | template struct LaunchConv2DOp<CPUDevice, double>; |
788 | |
789 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
790 | |
791 | int64_t GetDnnWorkspaceLimit(const string& envvar_in_mb, |
792 | int64_t default_value_in_bytes) { |
793 | const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str()); |
794 | if (workspace_limit_in_mb_str != nullptr && |
795 | strcmp(workspace_limit_in_mb_str, "" ) != 0) { |
796 | int64_t scratch_limit_in_mb = -1; |
797 | if (strings::safe_strto64(workspace_limit_in_mb_str, |
798 | &scratch_limit_in_mb)) { |
799 | return scratch_limit_in_mb * (1 << 20); |
800 | } else { |
801 | LOG(WARNING) << "Invalid value for env-var " << envvar_in_mb << ": " |
802 | << workspace_limit_in_mb_str; |
803 | } |
804 | } |
805 | return default_value_in_bytes; |
806 | } |
807 | |
808 | int64_t GetDnnWorkspaceLimitOrDefault() { |
809 | return GetDnnWorkspaceLimit("TF_CUDNN_WORKSPACE_LIMIT_IN_MB" , |
810 | 1LL << 33); // 8GB by default |
811 | } |
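// For example, exporting TF_CUDNN_WORKSPACE_LIMIT_IN_MB=1024 caps the cuDNN
// scratch allocation used below at 1 GiB; an unset or unparsable value falls
// back to the 8GB default above.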
812 | |
813 | template <typename T> |
814 | void LaunchConv2DOp<GPUDevice, T>::operator()( |
815 | OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune, |
816 | const Tensor& input_param, const Tensor& filter, int row_dilation, |
817 | int col_dilation, int row_stride, int col_stride, const Padding& padding, |
818 | const std::vector<int64_t>& explicit_paddings, Tensor* output, |
819 | TensorFormat data_format) { |
820 | using se::dnn::AlgorithmConfig; |
821 | using se::dnn::AlgorithmDesc; |
822 | using se::dnn::ProfileResult; |
823 | auto* stream = ctx->op_device_context()->stream(); |
824 | OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available." )); |
825 | |
826 | if (!use_cudnn) { |
827 | ctx->SetStatus( |
828 | errors::Unimplemented("Conv2D for GPU is not currently supported " |
829 | "without cudnn" )); |
830 | return; |
831 | } |
832 | |
833 | Tensor input = input_param; |
834 | const int64_t in_batch = GetTensorDim(input, data_format, 'N'); |
835 | int64_t in_rows = GetTensorDim(input, data_format, 'H'); |
836 | int64_t in_cols = GetTensorDim(input, data_format, 'W'); |
837 | const int64_t in_depths = GetTensorDim(input, data_format, 'C'); |
838 | const int64_t patch_rows = filter.dim_size(0); |
839 | const int64_t patch_cols = filter.dim_size(1); |
840 | const int64_t patch_depths = filter.dim_size(2); |
841 | |
842 | OP_REQUIRES( |
843 | ctx, filter.NumElements() > 0, |
844 | errors::InvalidArgument("filter must not have zero elements " |
845 | "(i.e. all dimensions must be non-zero)" )); |
846 | |
847 | // If the filter in-depth (patch_depths) is 1 and smaller than the input |
848 | // depth, it's a depthwise convolution. More generally, if the filter in-depth |
849 | // divides but is smaller than the input depth, it is a grouped convolution. |
850 | bool is_grouped_convolution = patch_depths != in_depths; |
851 | if (patch_rows == 1 && patch_cols == 1 && !is_grouped_convolution && |
852 | row_dilation == 1 && col_dilation == 1 && row_stride == 1 && |
853 | col_stride == 1 && data_format == FORMAT_NHWC && |
854 | (padding == VALID || padding == SAME)) { |
855 | // 1x1 filter, so call cublas directly. |
856 | const uint64 m = in_batch * in_rows * in_cols; |
857 | const uint64 k = patch_depths; |
858 | const uint64 n = filter.dim_size(3); |
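// The GEMM below computes output[m, n] = input[m, k] * filter[k, n] with
// m = batch*rows*cols, k = in_depth and n = out_depth; the operands and the
// m/n extents are passed swapped so the row-major tensors map onto the
// column-major convention of the underlying BLAS call.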
859 | |
860 | auto a_ptr = AsDeviceMemory(input.template flat<T>().data(), |
861 | input.template flat<T>().size()); |
862 | auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(), |
863 | filter.template flat<T>().size()); |
864 | auto c_ptr = AsDeviceMemory(output->template flat<T>().data(), |
865 | output->template flat<T>().size()); |
866 | |
867 | auto no_transpose = se::blas::Transpose::kNoTranspose; |
868 | OP_REQUIRES_OK( |
869 | ctx, stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, b_ptr, n, |
870 | a_ptr, k, &c_ptr, n, |
871 | se::blas::kDefaultComputePrecision)); |
872 | return; |
873 | } else if (patch_rows == in_rows && patch_cols == in_cols && |
874 | !is_grouped_convolution && row_dilation == 1 && |
875 | col_dilation == 1 && padding == VALID && |
876 | data_format == FORMAT_NHWC) { |
877 | // The input data and filter have the same height/width, so call cublas |
878 | // directly. |
879 | const uint64 m = in_batch; |
880 | const uint64 k = patch_rows * patch_cols * patch_depths; |
881 | const uint64 n = filter.dim_size(3); |
882 | |
883 | auto a_ptr = AsDeviceMemory(input.template flat<T>().data(), |
884 | input.template flat<T>().size()); |
885 | auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(), |
886 | filter.template flat<T>().size()); |
887 | auto c_ptr = AsDeviceMemory(output->template flat<T>().data(), |
888 | output->template flat<T>().size()); |
889 | |
890 | auto no_transpose = se::blas::Transpose::kNoTranspose; |
891 | OP_REQUIRES_OK( |
892 | ctx, stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, b_ptr, n, |
893 | a_ptr, k, &c_ptr, n, |
894 | se::blas::kDefaultComputePrecision)); |
895 | return; |
896 | } |
897 | |
898 | #if GOOGLE_CUDA |
899 | // Tensor Core (NVIDIA Volta+ GPUs) supports efficient convolution with fp16 |
900 | // in NHWC data layout. In all other configurations it's more efficient to |
901 | // run computation in NCHW data format. |
902 | const bool compute_in_nhwc = DataTypeToEnum<T>::value == DT_HALF && |
903 | stream->GetCudaComputeCapability().IsAtLeast( |
904 | se::CudaComputeCapability::VOLTA); |
905 | #else |
906 | // The fast NHWC implementation is a CUDA-only feature. |
907 | const bool compute_in_nhwc = false; |
908 | #endif |
909 | |
910 | // We only do one directional conversion: NHWC->NCHW. We never convert in the |
911 | // other direction. Grappler layout optimizer selects preferred layout and |
912 | // adds necessary annotations to the graph. |
913 | // TODO(ezhulenev): Convert in other direction for fp16? |
914 | const TensorFormat compute_data_format = |
915 | (compute_in_nhwc && data_format == FORMAT_NHWC) ? FORMAT_NHWC |
916 | : FORMAT_NCHW; |
917 | |
918 | VLOG(3) << "Compute Conv2D with cuDNN:" |
919 | << " data_format=" << ToString(data_format) |
920 | << " compute_data_format=" << ToString(compute_data_format); |
921 | |
922 | const int64_t out_batch = GetTensorDim(*output, data_format, 'N'); |
923 | const int64_t out_rows = GetTensorDim(*output, data_format, 'H'); |
924 | const int64_t out_cols = GetTensorDim(*output, data_format, 'W'); |
925 | const int64_t out_depths = GetTensorDim(*output, data_format, 'C'); |
926 | int64_t padding_top = -1, padding_bottom = -1; |
927 | int64_t padding_left = -1, padding_right = -1; |
928 | if (padding == EXPLICIT) { |
929 | GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top, |
930 | &padding_bottom); |
931 | GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left, |
932 | &padding_right); |
933 | } |
934 | int64_t out_rows_check, out_cols_check; |
935 | Status status = GetWindowedOutputSizeVerboseV2( |
936 | in_rows, patch_rows, row_dilation, row_stride, padding, &out_rows_check, |
937 | &padding_top, &padding_bottom); |
938 | // The status is guaranteed to be OK because we checked the output and padding |
939 | // was valid earlier. |
940 | TF_CHECK_OK(status); |
941 | DCHECK_EQ(out_rows, out_rows_check); |
942 | status = GetWindowedOutputSizeVerboseV2(in_cols, patch_cols, col_dilation, |
943 | col_stride, padding, &out_cols_check, |
944 | &padding_left, &padding_right); |
945 | TF_CHECK_OK(status); |
946 | DCHECK_EQ(out_cols, out_cols_check); |
947 | |
948 | const int64_t common_padding_rows = std::min(padding_top, padding_bottom); |
949 | const int64_t common_padding_cols = std::min(padding_left, padding_right); |
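// cuDNN is given the symmetric part of the padding (the min of each pair);
// any asymmetric remainder is materialized by explicitly padding the input
// below, e.g. (top=1, bottom=3) becomes common padding 1 plus 2 extra rows
// in the padded input tensor.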
950 | if (padding_top != padding_bottom || padding_left != padding_right) { |
951 | // cuDNN only supports padding the same amount on the left and right sides, |
952 | // and on the top and bottom sides. So we manually create a new padded |
953 | // input tensor such that we can pass it to cuDNN. |
954 | VLOG(4) << "Pad input tensor:" |
955 | << " padding_top=" << padding_top |
956 | << " padding_bottom=" << padding_bottom |
957 | << " padding_left=" << padding_left |
958 | << " padding_right=" << padding_right; |
959 | |
960 | // TODO(reedwm): In some cases, we can avoid an allocation even if the two |
961 | // padding sides are different. For example, if the input is 2x2, the filter |
962 | // is 1x1, the stride is 2, and the padding is (1, 0, 1, 0), the result is |
963 | // equivalent to as if the padding is (1, 1, 1, 1). Changing the padding in |
964 | // such a way would allow us to avoid the allocation. |
965 | Tensor transformed_input; |
966 | const int64_t padding_rows_diff = std::abs(padding_bottom - padding_top); |
967 | const int64_t padding_cols_diff = std::abs(padding_right - padding_left); |
968 | const int64_t new_in_rows = in_rows + padding_rows_diff; |
969 | const int64_t new_in_cols = in_cols + padding_cols_diff; |
970 | OP_REQUIRES_OK(ctx, ctx->allocate_temp( |
971 | DataTypeToEnum<T>::value, |
972 | ShapeFromFormat(data_format, in_batch, new_in_rows, |
973 | new_in_cols, in_depths), |
974 | &transformed_input)); |
975 | |
976 | const int64_t input_pad_top = padding_top - common_padding_rows; |
977 | const int64_t input_pad_bottom = padding_bottom - common_padding_rows; |
978 | const int64_t input_pad_left = padding_left - common_padding_cols; |
979 | const int64_t input_pad_right = padding_right - common_padding_cols; |
980 | bool in_bounds = |
981 | FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) && |
982 | FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) && |
983 | FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) && |
984 | FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max()); |
985 | if (!in_bounds) { |
986 | ctx->SetStatus(errors::InvalidArgument("Padding is too large." )); |
987 | return; |
988 | } |
989 | functor::PadInput<GPUDevice, T, int, 4>()( |
990 | ctx->eigen_device<GPUDevice>(), |
991 | To32Bit(static_cast<const Tensor&>(input).tensor<T, 4>()), |
992 | {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}}, |
993 | {{static_cast<int>(input_pad_bottom), |
994 | static_cast<int>(input_pad_right)}}, |
995 | To32Bit(transformed_input.tensor<T, 4>()), data_format, T{}); |
996 | |
997 | input = transformed_input; |
998 | in_rows = new_in_rows; |
999 | in_cols = new_in_cols; |
1000 | } |
1001 | |
1002 | if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { |
1003 | VLOG(4) << "Convert the input tensor from NHWC to NCHW." ; |
1004 | |
1005 | TensorShape nchw_shape = |
1006 | ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths); |
1007 | if (in_depths > 1) { |
1008 | Tensor transformed_input; |
1009 | OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, |
1010 | nchw_shape, &transformed_input)); |
1011 | functor::NHWCToNCHW<GPUDevice, T, 4>()( |
1012 | ctx->eigen_device<GPUDevice>(), |
1013 | const_cast<const Tensor&>(input).tensor<T, 4>(), |
1014 | transformed_input.tensor<T, 4>()); |
1015 | input = transformed_input; |
1016 | } else { |
1017 | // If depth <= 1, then just reshape. |
1018 | CHECK(input.CopyFrom(input, nchw_shape)); |
1019 | } |
1020 | } else { |
1021 | CHECK(data_format == compute_data_format) // Crash OK |
1022 | << "Illegal data and compute format pair:" |
1023 | << " data_format=" << ToString(data_format) |
1024 | << " compute_data_format=" << ToString(compute_data_format); |
1025 | } |
1026 | |
1027 | CHECK(common_padding_rows >= 0 && common_padding_cols >= 0) // Crash OK |
1028 | << "Negative row or col paddings: (" << common_padding_rows << ", " |
1029 | << common_padding_cols << ")" ; |
1030 | |
1031 | constexpr auto kComputeInNHWC = |
1032 | std::make_tuple(se::dnn::DataLayout::kBatchYXDepth, |
1033 | se::dnn::FilterLayout::kOutputYXInput); |
1034 | constexpr auto kComputeInNCHW = |
1035 | std::make_tuple(se::dnn::DataLayout::kBatchDepthYX, |
1036 | se::dnn::FilterLayout::kOutputInputYX); |
1037 | |
1038 | se::dnn::DataLayout compute_data_layout; |
1039 | se::dnn::FilterLayout filter_layout; |
1040 | |
1041 | std::tie(compute_data_layout, filter_layout) = |
1042 | compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW; |
1043 | |
1044 | se::dnn::BatchDescriptor input_desc; |
1045 | input_desc.set_count(in_batch) |
1046 | .set_feature_map_count(in_depths) |
1047 | .set_height(in_rows) |
1048 | .set_width(in_cols) |
1049 | .set_layout(compute_data_layout); |
1050 | se::dnn::BatchDescriptor output_desc; |
1051 | output_desc.set_count(out_batch) |
1052 | .set_height(out_rows) |
1053 | .set_width(out_cols) |
1054 | .set_feature_map_count(out_depths) |
1055 | .set_layout(compute_data_layout); |
1056 | se::dnn::FilterDescriptor filter_desc; |
1057 | filter_desc.set_input_filter_height(patch_rows) |
1058 | .set_input_filter_width(patch_cols) |
1059 | .set_input_feature_map_count(patch_depths) |
1060 | .set_output_feature_map_count(filter.dim_size(3)) |
1061 | .set_layout(filter_layout); |
1062 | se::dnn::ConvolutionDescriptor conv_desc; |
1063 | conv_desc.set_vertical_dilation_rate(row_dilation) |
1064 | .set_horizontal_dilation_rate(col_dilation) |
1065 | .set_vertical_filter_stride(row_stride) |
1066 | .set_horizontal_filter_stride(col_stride) |
1067 | .set_zero_padding_height(common_padding_rows) |
1068 | .set_zero_padding_width(common_padding_cols) |
1069 | .set_group_count(in_depths / patch_depths); |
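// A group_count > 1 lets cuDNN run a grouped convolution natively, unlike
// the CPU path above, which emulates groups with per-group launches.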
1070 | |
1071 | Tensor transformed_filter; |
1072 | |
1073 | const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status { |
1074 | VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO) |
1075 | << " to " << ToString(dst_format); |
1076 | |
1077 | TensorShape dst_shape = |
1078 | dst_format == FORMAT_OIHW |
1079 | ? TensorShape({filter.dim_size(3), filter.dim_size(2), |
1080 | filter.dim_size(0), filter.dim_size(1)}) |
1081 | : TensorShape({filter.dim_size(3), filter.dim_size(0), |
1082 | filter.dim_size(1), filter.dim_size(2)}); |
1083 | |
1084 | TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, dst_shape, |
1085 | &transformed_filter)); |
1086 | functor::TransformFilter<GPUDevice, T, int, 4>()( |
1087 | ctx->eigen_device<GPUDevice>(), dst_format, |
1088 | To32Bit(filter.tensor<T, 4>()), |
1089 | To32Bit(transformed_filter.tensor<T, 4>())); |
1090 | |
1091 | return OkStatus(); |
1092 | }; |
1093 | |
1094 | if (compute_data_format == FORMAT_NCHW) { |
1095 | OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OIHW)); |
1096 | } else if (compute_data_format == FORMAT_NHWC) { |
1097 | OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OHWI)); |
1098 | } else { |
1099 | ctx->SetStatus(errors::InvalidArgument("Invalid compute data format: " , |
1100 | ToString(compute_data_format))); |
1101 | return; |
1102 | } |
1103 | |
1104 | Tensor transformed_output; |
1105 | if (data_format != compute_data_format) { |
1106 | VLOG(4) << "Allocate temporary memory for output in compute data format" ; |
1107 | OP_REQUIRES_OK( |
1108 | ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, |
1109 | ShapeFromFormat(compute_data_format, out_batch, |
1110 | out_rows, out_cols, out_depths), |
1111 | &transformed_output)); |
1112 | } else { |
1113 | transformed_output = *output; |
1114 | } |
1115 | |
1116 | auto input_ptr = AsDeviceMemory(input.template flat<T>().data(), |
1117 | input.template flat<T>().size()); |
1118 | auto filter_ptr = |
1119 | AsDeviceMemory(transformed_filter.template flat<T>().data(), |
1120 | transformed_filter.template flat<T>().size()); |
1121 | auto output_ptr = |
1122 | AsDeviceMemory(transformed_output.template flat<T>().data(), |
1123 | transformed_output.template flat<T>().size()); |
1124 | |
1125 | static int64_t ConvolveScratchSize = GetDnnWorkspaceLimitOrDefault(); |
1126 | |
1127 | int device_id = stream->parent()->device_ordinal(); |
1128 | DataType dtype = input.dtype(); |
1129 | ConvParameters conv_parameters = {in_batch, // batch |
1130 | in_depths, // in_depths |
1131 | {{in_rows, // in_rows |
1132 | in_cols}}, // in_cols |
1133 | compute_data_format, // compute_data_format |
1134 | out_depths, // out_depths |
1135 | {{patch_rows, // filter_rows |
1136 | patch_cols, // filter_cols |
1137 | patch_depths}}, // filter_depths |
1138 | {{row_dilation, // dilation_rows |
1139 | col_dilation}}, // dilation_cols |
1140 | {{row_stride, // stride_rows |
1141 | col_stride}}, // stride_cols |
1142 | {{common_padding_rows, // padding_rows |
1143 | common_padding_cols}}, // padding_cols |
1144 | dtype, // tensor datatype |
1145 | device_id, // device_id |
1146 | conv_desc.group_count()}; |
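// These parameters key the autotune cache: AutotuneUnfusedConv below looks up
// (or, with autotuning enabled, profiles and caches) the best algorithm, which
// is then launched with scratch memory bounded by ConvolveScratchSize.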
1147 | |
1148 | auto entry_or = AutotuneUnfusedConv( |
1149 | cudnn_use_autotune, ConvAutotuneMap::GetInstance(), conv_parameters, ctx, |
1150 | se::dnn::ConvolutionKind::FORWARD, input_desc, input_ptr, filter_desc, |
1151 | filter_ptr, conv_desc, output_desc, output_ptr, ConvolveScratchSize); |
1152 | OP_REQUIRES_OK(ctx, entry_or.status()); |
1153 | auto autotune_entry = std::move(entry_or).value(); |
1154 | |
1155 | DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx); |
1156 | Status cudnn_launch_status = LaunchAutotunedConv( |
1157 | autotune_entry, &scratch_allocator, se::dnn::ConvolutionKind::FORWARD, |
1158 | stream, input_desc, input_ptr, filter_desc, filter_ptr, conv_desc, |
1159 | output_desc, output_ptr); |
1160 | if (!cudnn_launch_status.ok()) { |
1161 | ctx->SetStatus(cudnn_launch_status); |
1162 | return; |
1163 | } |
1164 | |
1165 | if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) { |
1166 | VLOG(4) << "Convert the output tensor back from NCHW to NHWC." ; |
1167 | functor::NCHWToNHWC<GPUDevice, T, 4>()( |
1168 | ctx->eigen_device<GPUDevice>(), |
1169 | const_cast<const Tensor&>(transformed_output).tensor<T, 4>(), |
1170 | output->tensor<T, 4>()); |
1171 | } |
1172 | } |
1173 | |
1174 | // Forward declarations of the functor specializations for GPU. |
1175 | namespace functor { |
1176 | #define DECLARE_GPU_SPEC(T) \ |
1177 | template <> \ |
1178 | void SpatialConvolution<GPUDevice, T>::operator()( \ |
1179 | const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \ |
1180 | typename TTypes<T, 4>::ConstTensor input, \ |
1181 | typename TTypes<T, 4>::ConstTensor filter, int row_stride, \ |
1182 | int col_stride, int row_dilation, int col_dilation, \ |
1183 | const Eigen::PaddingType& padding, \ |
1184 | const Eigen::NoOpOutputKernel& output_kernel); \ |
1185 | template <> \ |
1186 | void SpatialConvolution<GPUDevice, T>::operator()( \ |
1187 | const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \ |
1188 | typename TTypes<T, 4>::ConstTensor input, \ |
1189 | typename TTypes<T, 4>::ConstTensor filter, int row_stride, \ |
1190 | int col_stride, int row_dilation, int col_dilation, int padding_top, \ |
1191 | int padding_bottom, int padding_left, int padding_right, \ |
1192 | const Eigen::NoOpOutputKernel& output_kernel); \ |
1193 | extern template struct SpatialConvolution<GPUDevice, T>; \ |
1194 | template <> \ |
1195 | void MatMulConvFunctor<GPUDevice, T>::operator()( \ |
1196 | const GPUDevice& d, typename TTypes<T, 2>::Tensor out, \ |
1197 | typename TTypes<T, 2>::ConstTensor in0, \ |
1198 | typename TTypes<T, 2>::ConstTensor in1, \ |
1199 | const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair, \ |
1200 | const Eigen::NoOpOutputKernel& output_kernel); \ |
1201 | extern template struct MatMulConvFunctor<GPUDevice, T>; \ |
1202 | template <> \ |
1203 | void TransformFilter<GPUDevice, T, int, 4>::operator()( \ |
1204 | const GPUDevice& d, FilterTensorFormat dst_filter_format, \ |
1205 | typename TTypes<T, 4, int>::ConstTensor in, \ |
1206 | typename TTypes<T, 4, int>::Tensor out); \ |
1207 | extern template struct TransformFilter<GPUDevice, T, int, 4>; \ |
1208 | template <> \ |
1209 | void PadInput<GPUDevice, T, int, 4>::operator()( \ |
1210 | const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in, \ |
1211 | const std::array<int, 2>& padding_left, \ |
1212 | const std::array<int, 2>& padding_right, \ |
1213 | typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format, \ |
1214 | const T& padding_value); \ |
1215 | extern template struct PadInput<GPUDevice, T, int, 4> |
1216 | |
1217 | DECLARE_GPU_SPEC(float); |
1218 | DECLARE_GPU_SPEC(Eigen::half); |
1219 | DECLARE_GPU_SPEC(double); |
1220 | DECLARE_GPU_SPEC(int32); |
1221 | #undef DECLARE_GPU_SPEC |
1222 | |
1223 | } // namespace functor |
1224 | |
1225 | // Registration of the GPU implementations. |
1226 | REGISTER_KERNEL_BUILDER( |
1227 | Name("Conv2D" ).Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T" ), |
1228 | Conv2DOp<GPUDevice, Eigen::half>); |
1229 | REGISTER_KERNEL_BUILDER( |
1230 | Name("Conv2D" ).Device(DEVICE_GPU).TypeConstraint<float>("T" ), |
1231 | Conv2DOp<GPUDevice, float>); |
1232 | REGISTER_KERNEL_BUILDER( |
1233 | Name("Conv2D" ).Device(DEVICE_GPU).TypeConstraint<double>("T" ), |
1234 | Conv2DOp<GPUDevice, double>); |
1235 | REGISTER_KERNEL_BUILDER( |
1236 | Name("Conv2D" ).Device(DEVICE_GPU).TypeConstraint<int32>("T" ), |
1237 | Conv2DOp<GPUDevice, int32>); |
1238 | |
1239 | // To be used inside depthwise_conv_op.cc. |
1240 | template struct LaunchConv2DOp<GPUDevice, float>; |
1241 | template struct LaunchConv2DOp<GPUDevice, Eigen::half>; |
1242 | template struct LaunchConv2DOp<GPUDevice, double>; |
1243 | |
1244 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
1245 | |
1246 | } // namespace tensorflow |
1247 | |