/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
#define TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include <tuple>
#include <unordered_map>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/gpu_utils.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

// Gets the DNN workspace limit from the environment variable, which is
// specified in MB. Returns the workspace memory limit in bytes. If no value
// is set, returns the default value.
int64_t GetDnnWorkspaceLimit(const string& envvar_in_mb,
                             int64_t default_value_in_bytes);

// Gets the DNN workspace limit from TF_CUDNN_WORKSPACE_LIMIT_IN_MB, or the
// default value if the environment variable is not set.
int64_t GetDnnWorkspaceLimitOrDefault();
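
// The definition lives in the corresponding .cc file; as a rough sketch (not
// the authoritative implementation), the lookup can be thought of as parsing
// the variable as an MB count and scaling it to bytes:
//
//   int64_t GetDnnWorkspaceLimit(const string& envvar_in_mb,
//                                int64_t default_value_in_bytes) {
//     const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str());
//     if (workspace_limit_in_mb_str != nullptr) {
//       int64_t scratch_limit_in_mb = -1;
//       if (strings::safe_strto64(workspace_limit_in_mb_str,
//                                 &scratch_limit_in_mb)) {
//         return scratch_limit_in_mb * (1 << 20);  // MB -> bytes.
//       }
//     }
//     return default_value_in_bytes;
//   }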

// A class that provides a scratch-space allocator for StreamExecutor cuDNN
// callbacks. TensorFlow is responsible for releasing the temporary buffers
// after the kernel finishes.
class DnnScratchAllocator : public se::ScratchAllocator {
 public:
  virtual ~DnnScratchAllocator() {}
  DnnScratchAllocator(int64_t memory_limit, OpKernelContext* context)
      : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
  int64 GetMemoryLimitInBytes() override { return memory_limit_; }
  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
      int64_t byte_size) override {
    Tensor temporary_memory;
    if (byte_size < 0) {
      return se::port::Status{se::port::error::INVALID_ARGUMENT,
                              "Requested negative byte size!"};
    }
    if (byte_size > memory_limit_) {
      return se::port::Status{
          se::port::error::UNAVAILABLE,
          absl::StrCat("Requested memory size (", byte_size,
                       ") exceeds the max memory limit (", memory_limit_,
                       ").")};
    }
    AllocationAttributes allocation_attr;
    allocation_attr.retry_on_failure = false;
    Status allocation_status(context_->allocate_temp(
        DT_UINT8, TensorShape({byte_size}), &temporary_memory,
        AllocatorAttributes(), allocation_attr));
    if (!allocation_status.ok()) {
      return se::port::Status{
          se::port::error::UNAVAILABLE,
          absl::StrCat("Failed to allocate the requested memory size (",
                       byte_size, ").")};
    }
    // Hold references to the allocated tensors until the allocator is
    // destroyed.
    allocated_tensors_.push_back(temporary_memory);
    total_byte_size_ += byte_size;
    return se::port::StatusOr<se::DeviceMemory<uint8>>(
        AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                       temporary_memory.flat<uint8>().size()));
  }
  int64 TotalByteSize() { return total_byte_size_; }

 private:
  int64 memory_limit_;
  int64 total_byte_size_;
  OpKernelContext* context_;
  std::vector<Tensor> allocated_tensors_;
};
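
// Example usage (a sketch; assumes an OpKernelContext* ctx inside a GPU
// kernel's Compute() and a StreamExecutor convolution call that accepts an
// se::ScratchAllocator*):
//
//   DnnScratchAllocator scratch_allocator(GetDnnWorkspaceLimitOrDefault(),
//                                         ctx);
//   // Pass &scratch_allocator to the cuDNN call; any temporary buffers it
//   // hands out stay alive until scratch_allocator is destroyed at the end
//   // of Compute().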

typedef Eigen::GpuDevice GPUDevice;

// Select an algorithm for the given convolution, either by running actual
// autotuning with a cache, or by falling back to a default if
// 'cudnn_use_autotune' is false.
template <typename T>
StatusOr<AutotuneEntry<se::dnn::FusedConvOp>> AutotuneFusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
        autotune_map,
    const ConvParameters& params, OpKernelContext* ctx,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& bias_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::ActivationMode activation_mode, double conv_input_scale,
    double side_input_scale, double leakyrelu_alpha,
    se::DeviceMemory<T> input_ptr, se::DeviceMemory<T> filter_ptr,
    se::DeviceMemory<T> output_ptr, se::DeviceMemory<T> bias_ptr,
    se::DeviceMemory<T> side_input_ptr, int64_t scratch_size);

template <typename T>
StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<T> input_ptr, const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<T> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc, se::DeviceMemory<T> output_ptr,
    int64_t scratch_size_limit);
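
// Example flow for an unfused forward convolution (a sketch; descriptor and
// ConvParameters construction are omitted, and 'stream' is assumed to come
// from the kernel's device context). LaunchAutotunedConv is defined below:
//
//   TF_ASSIGN_OR_RETURN(
//       auto autotune_entry,
//       AutotuneUnfusedConv(cudnn_use_autotune, autotune_map, conv_parameters,
//                           ctx, se::dnn::ConvolutionKind::FORWARD, input_desc,
//                           input_ptr, filter_desc, filter_ptr, conv_desc,
//                           output_desc, output_ptr, scratch_size_limit));
//   DnnScratchAllocator scratch_allocator(scratch_size_limit, ctx);
//   TF_RETURN_IF_ERROR(LaunchAutotunedConv(
//       autotune_entry, &scratch_allocator, se::dnn::ConvolutionKind::FORWARD,
//       stream, input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
//       output_desc, output_ptr));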

// Returns a pointer to 'primary' and its allocated scratch memory if the
// scratch allocation succeeds; otherwise returns 'no_scratch_fallback' and a
// null 'DeviceMemoryBase'.
template <typename Sig>
StatusOr<std::tuple<const se::dnn::OpRunner<Sig>*, se::DeviceMemoryBase>>
AllocateScratchOrFallback(se::ScratchAllocator* scratch_allocator,
                          const se::dnn::OpRunner<Sig>* primary,
                          const se::dnn::OpRunner<Sig>* no_scratch_fallback) {
  const se::dnn::OpRunner<Sig>* selected_runner = primary;

  auto workspace_size = selected_runner->GetWorkspaceSize();

  se::DeviceMemoryBase scratch_memory;
  if (workspace_size > 0) {
    auto scratch_or = scratch_allocator->AllocateBytes(workspace_size);
    if (scratch_or.ok()) {
      scratch_memory = scratch_or.value();
    } else if ((selected_runner = no_scratch_fallback)) {
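      // Note the assignment inside the else-if condition above: when scratch
      // allocation fails and a fallback runner was provided, we switch to it
      // and then verify that it truly needs no scratch space.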
      if (selected_runner->GetWorkspaceSize() > 0) {
        return errors::Internal(
            "No-scratch fallback runner requires nonzero scratch space");
      }
    } else {
      return errors::Unknown(
          "CUDNN failed to allocate the scratch space for the runner or to "
          "find a working no-scratch runner.");
    }
  }

  return std::make_tuple(selected_runner, scratch_memory);
}

template <typename T>
Status LaunchAutotunedConv(const AutotuneEntry<se::dnn::ConvOp>& autotune_entry,
                           DnnScratchAllocator* scratch_allocator,
                           se::dnn::ConvolutionKind kind, se::Stream* stream,
                           const se::dnn::BatchDescriptor& input_desc,
                           se::DeviceMemory<T> in_ptr,
                           const se::dnn::FilterDescriptor& filter_desc,
                           se::DeviceMemory<T> filter_ptr,
                           const se::dnn::ConvolutionDescriptor& conv_desc,
                           const se::dnn::BatchDescriptor& output_desc,
                           se::DeviceMemory<T> out_ptr) {
  if (!autotune_entry.is_algorithm_config()) {
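    // Newer execution path: the autotune entry carries pre-selected OpRunners
    // (cuDNN frontend API); build or fetch the cached runner for this config
    // and invoke it directly.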
    const auto& runners = autotune_entry.GetOpRunners();
    se::dnn::DataType element_type = se::dnn::ToDataType<T>::value;
    se::dnn::ConvOp::Config config{kind,       element_type, element_type,
                                   input_desc, filter_desc,  output_desc,
                                   conv_desc};
    TF_ASSIGN_OR_RETURN(auto* primary,
                        runners.primary->GetOrCreateRunner(config, stream));

    const se::dnn::ConvRunner* no_scratch_fallback = nullptr;
    if (runners.no_scratch_fallback) {
      TF_ASSIGN_OR_RETURN(
          no_scratch_fallback,
          runners.no_scratch_fallback->GetOrCreateRunner(config, stream));
    }

    TF_ASSIGN_OR_RETURN(auto runner_and_scratch,
                        AllocateScratchOrFallback<se::dnn::ConvOp::Signature>(
                            scratch_allocator, primary, no_scratch_fallback));
    auto& runner = *std::get<const se::dnn::ConvRunner*>(runner_and_scratch);
    return runner(stream, nullptr,
                  std::get<se::DeviceMemoryBase>(runner_and_scratch), in_ptr,
                  filter_ptr, out_ptr);
  } else {
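    // Legacy execution path: the autotune entry only holds an
    // AlgorithmConfig, so defer to the stream's ConvolveWithAlgorithm.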
    return stream->ConvolveWithAlgorithm(
        kind, input_desc, in_ptr, filter_desc, filter_ptr, output_desc, out_ptr,
        conv_desc, scratch_allocator, autotune_entry.GetAlgorithmConfig(),
        nullptr);
  }
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_