#include <torch/csrc/lazy/ts_backend/ts_eager_fallback.h>

#include <ATen/FunctionalTensorWrapper.h>
#include <ATen/Functions.h>
#include <ATen/core/boxing/KernelFunction.h>
#include <ATen/native/CPUFallback.h>
#include <torch/csrc/lazy/backend/backend_interface.h>
#include <torch/csrc/lazy/core/config.h>
#include <torch/csrc/lazy/core/metrics.h>
#include <torch/csrc/lazy/core/tensor.h>
#include <torch/library.h>
#include <sstream>
#include <unordered_map>

namespace torch {
namespace lazy {
namespace {

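// Copies a list of tensors to the given eager device type: for CPU we can use
// the fused at::_to_cpu() op, while for any other device we fall back to a
// per-tensor Tensor::to() copy.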
std::vector<at::Tensor> _to_eager(
    at::TensorList tensors,
    c10::DeviceType device_type) {
  switch (device_type) {
    case at::kCPU: {
      return at::_to_cpu(tensors);
    }
    default: {
      std::vector<at::Tensor> eager_tensors;
      for (const auto& t : tensors) {
        c10::TensorOptions options = t.options().device(device_type);
        at::Tensor eager_tensor = t.to(
            options,
            /*non_blocking*/ false,
            /*copy*/ false);
        eager_tensors.push_back(eager_tensor);
      }
      return eager_tensors;
    }
  }
}

// convenience helpers for converting tensors to the target eager device

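// Purely illustrative example: given inputs {t0, <undefined>, t1}, these
// helpers return {eager(t0), <undefined>, eager(t1)}, preserving the position
// of any undefined entries rather than handing them to _to_eager().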
std::vector<at::Tensor> to_eager(
    const at::TensorList& tensors,
    c10::DeviceType device_type) {
  // We can't just call _to_eager() on the entire list of Tensors because it
  // will break on undefined tensors. Separate out undefined tensors first.
  std::vector<at::Tensor> eager_tensors(tensors.size());
  std::vector<at::Tensor> valid_tensors;
  std::vector<bool> to_translate(tensors.size());
  for (size_t i = 0; i < tensors.size(); ++i) {
    const at::Tensor& tensor = tensors[i];
    // Explicitly handling undefined tensors here instead of letting `_to_eager`
    // handle it. Otherwise, we'd need to require all backends with their own
    // implementation of _to_eager to properly handle undefined tensors.
    if (tensor.defined()) {
      to_translate[i] = true;
      valid_tensors.push_back(tensor);
    } else {
      eager_tensors[i] = tensor;
    }
  }
  auto eager_valid_tensors = _to_eager(valid_tensors, device_type);
  for (size_t i = 0, defined_pos = 0; i < tensors.size(); ++i) {
    if (to_translate[i]) {
      eager_tensors[i] = std::move(eager_valid_tensors[defined_pos++]);
    }
  }
  return eager_tensors;
}

std::vector<c10::optional<at::Tensor>> to_eager(
    const std::vector<c10::optional<at::Tensor>>& tensors,
    c10::DeviceType device_type) {
  // We can't just call _to_eager() on the entire list of Tensors because it
  // will break on undefined tensors. Separate out undefined tensors first.
  std::vector<c10::optional<at::Tensor>> eager_tensors(tensors.size());
  std::vector<at::Tensor> valid_tensors;
  std::vector<bool> to_translate(tensors.size());
  for (size_t i = 0; i < tensors.size(); ++i) {
    const c10::optional<at::Tensor>& tensor = tensors[i];
    // Explicitly handling undefined tensors here instead of letting `_to_eager`
    // handle it. Otherwise, we'd need to require all backends with their own
    // implementation of _to_eager to properly handle undefined tensors.
    if (tensor.has_value() && tensor->defined()) {
      to_translate[i] = true;
      valid_tensors.push_back(*tensor);
    } else {
      eager_tensors[i] = tensor;
    }
  }
  auto eager_valid_tensors = _to_eager(valid_tensors, device_type);
  for (size_t i = 0, defined_pos = 0; i < tensors.size(); ++i) {
    if (to_translate[i]) {
      eager_tensors[i] = std::move(eager_valid_tensors[defined_pos++]);
    }
  }
  return eager_tensors;
}

c10::DispatchKey dispatch_key(c10::DeviceType device_type) {
  switch (device_type) {
    case at::kCPU: {
      return c10::DispatchKey::CPU;
    }
    case at::kCUDA: {
      return c10::DispatchKey::CUDA;
    }
    default: {
      AT_ERROR("Unsupported device type: ", device_type);
    }
  }
}

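// Picks the device that outputs should be moved back to, based on the tensor
// inputs we gathered: the first plain Tensor argument wins; otherwise we take
// the first element found in any (optional) TensorList argument. E.g. for
// `cat(Tensor[] tensors, int dim=0)` there is no plain Tensor argument, so the
// device comes from tensors[0].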
c10::optional<c10::Device> compute_target_device(
    std::vector<at::Tensor>& t_args,
    std::vector<c10::List<at::Tensor>> tlist_args,
    std::vector<c10::List<c10::optional<at::Tensor>>> opt_tlist_args) {
  // Decide what device to move the output tensor(s) to.
  // The current convention is that we use the first tensor arg to pick the
  // device. Barring that, we take the first tensor from a TensorList arg.
  if (!t_args.empty()) {
    return t_args[0].device();
  } else {
    // We need to loop through all of the (potentially multiple) TensorList
    // arguments, in case e.g. the first one is empty but the second is not.
    for (auto& tens_list : tlist_args) {
      for (const auto i : c10::irange(tens_list.size())) {
        return tens_list.get(i).device();
      }
    }
    for (auto& tens_list : opt_tlist_args) {
      for (const auto i : c10::irange(tens_list.size())) {
        if (tens_list.get(i).has_value()) {
          return tens_list.get(i)->device();
        }
      }
    }
  }
  return c10::nullopt;
}

} // namespace

static std::unordered_map<std::string, ::torch::lazy::Counter*>
    _eager_fallback_counters;

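// Returns true if `op` should unconditionally take the eager fallback path.
// A single op can be forced onto this path via getLTCForceFallback() (declared
// in torch/csrc/lazy/core/config.h), which is typically fed by the
// LTC_FORCE_FALLBACK environment variable holding a qualified op name such as
// "aten::nonzero".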
bool force_eager_fallback(c10::Symbol op) {
  auto force_str = getLTCForceFallback();
  if (!force_str.empty()) {
    static auto force_sym = c10::Symbol::fromQualString(std::string(force_str));
    if (op == force_sym) {
      return true;
    }
  }
  if (op == at::aten::nonzero) {
    // When symbolic shape mode is not enabled, the nonzero shape function
    // returns an incorrect result.
    return !symbolicShapeEnabled();
  }

  return false;
}

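// Boxed fallback kernel registered for the Lazy dispatch key (see
// register_ts_ltc_eager_fallback below): it bumps a per-op fallback counter
// and then re-runs the op eagerly on the backend's fallback device.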
void ltc_eager_fallback(
    const c10::OperatorHandle& op,
    torch::jit::Stack* stack) {
  // TODO(whc) this FN_TRACK thing hasn't been used so far in LTC iirc, but we
  // could land/re-enable it:
  //   LTC_FN_TRACK(3);
  const auto name = c10::toString(op.operator_name());

  // Manually applying the TORCH_LAZY_COUNTER macro.
  // We need to do it ourselves and explicitly keep a mapping of counters
  // because this boxed fallback kernel is used by multiple operators,
  // and the macro stamps out a static Counter object with a fixed name
  // at the code location where it is called.
  if (_eager_fallback_counters.find(name) == _eager_fallback_counters.end()) {
    _eager_fallback_counters[name] = new ::torch::lazy::Counter(name);
  }
  _eager_fallback_counters[name]->AddValue(1);

  auto& args = op.schema().arguments();
  auto arguments = torch::jit::last(stack, args.size());

  // Log each tensor argument.
  for (const auto& ivalue : arguments) {
    if (ivalue.isTensor()) {
      VLOG(3) << ivalue.toTensor().toString();
    }
  }

  // Call the actual boxed eager fallback.
  ts_eager_fallback(
      op, stack, torch::lazy::getBackend()->EagerFallbackDeviceType());
}

void register_ts_ltc_eager_fallback() {
  static auto m = MAKE_TORCH_LIBRARY_IMPL(_, Lazy);
  // Most backends use TORCH_LIBRARY_* macros which perform their dispatcher
  // registrations at static library init time, but the lazy TorchScript
  // backend does not, since it is built into the main torch lib but not always
  // used. In particular, if another external backend wants to register itself
  // to the same key (Lazy), the TorchScript backend must not be initialized.
  m.fallback(torch::CppFunction::makeFromBoxedFunction<&ltc_eager_fallback>());
}

void ts_eager_fallback(
    const c10::OperatorHandle& op,
    torch::jit::Stack* stack,
    c10::DeviceType device_type) {
  auto& schema_args = op.schema().arguments();
  const auto num_arguments = schema_args.size();
  auto arguments = torch::jit::last(stack, num_arguments);
  const auto arguments_begin = stack->size() - num_arguments;

  std::vector<at::Tensor> tensor_args;
  std::vector<int> tensor_args_indices;

  std::vector<c10::List<at::Tensor>> tensorlist_args;
  std::vector<c10::List<c10::optional<at::Tensor>>> opt_tensorlist_args;

  // Step 1: Convert all non-eager tensor inputs into eager tensors and put them
  // on the stack at the correct indices.
  for (int64_t idx = 0; idx < arguments.size(); ++idx) {
    const auto& ivalue = arguments[idx];
    if (ivalue.isTensor()) {
      tensor_args.push_back(ivalue.toTensor());
      tensor_args_indices.push_back(idx);
    } else if (ivalue.isTensorList()) {
      // Note: we copy each TensorList argument to eager individually out of
      // convenience, but XLA would benefit from materializing all tensor and
      // TensorList args onto the CPU at the same time. We can improve this if
      // we need better perf for XLA's CPU fallbacks.
      auto eager_ivalue = c10::IValue(c10::List<at::Tensor>(
          to_eager(ivalue.toTensorVector(), device_type)));
      (*stack)[arguments_begin + idx] = std::move(eager_ivalue);
      tensorlist_args.push_back(ivalue.toTensorList());
    } else if (ivalue.isOptionalTensorList()) {
      auto eager_ivalue = c10::IValue(c10::List<c10::optional<at::Tensor>>(
          to_eager(ivalue.toOptionalTensorVector(), device_type)));
      (*stack)[arguments_begin + idx] = std::move(eager_ivalue);
      opt_tensorlist_args.push_back(ivalue.toOptionalTensorList());
    }
  }
  // XLA requires all of the tensor arguments to be gathered up and converted to
  // CPU together.
  auto eager_tensors = to_eager(tensor_args, device_type);

  for (auto i = 0; i < tensor_args_indices.size(); ++i) {
    auto idx = tensor_args_indices[i];
    (*stack)[arguments_begin + idx] = c10::IValue(eager_tensors[i]);
  }

  // Step 2: Call the underlying eager implementation of the operator
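  // E.g. with device_type == at::kCPU this redispatches through
  // c10::DispatchKey::CPU, hitting the regular eager CPU kernel for this op.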
  op.redispatchBoxed(c10::DispatchKeySet(dispatch_key(device_type)), stack);

  // Step 3: We need to take special care to handle mutable aliases properly:
  // If any input tensors are mutable aliases, we need to directly copy the
  // updated data on the eager tensors back to the original inputs.
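  // For example, for an in-place op such as (schema roughly)
  //   add_(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
  // `self` is a mutable ('a!') alias, so whatever the eager kernel wrote into
  // the temporary eager tensor must be copied back into the original lazy
  // input.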
  for (int64_t i = 0; i < tensor_args_indices.size(); ++i) {
    auto tensor_idx = tensor_args_indices[i];
    const auto alias_info = schema_args[tensor_idx].alias_info();
    if (alias_info != nullptr && alias_info->isWrite()) {
      at::_copy_from_and_resize(eager_tensors[i], tensor_args[i]);
    }
  }

  // Step 4: Convert any eager output tensors back to the original input device.
  // For mutable alias'd outputs, we also need to take special care
  // to move the ORIGINAL input tensor back onto the stack, in place of
  // the temporary eager output tensor that we created.
  //
  // Note [Eager Fallback Does Not Handle View Operators]
  // Also note that we are incapable of handling immutable aliases properly.
  // Why?
  // Schemas with immutable alias'd tensor outputs correspond to view
  // operators. For example, the `view_as` schema from native_functions.yaml:
  // `view_as(Tensor(a) self, Tensor other) -> Tensor(a)`
  // We can't handle these ops properly, because view ops are supposed to return
  // a NEW tensor that shares the SAME storage as the original tensor.
  // However, the new tensor that we created cannot share the same storage,
  // since it lives on the eager CPU / CUDA device and the original tensor lives
  // on a different device. Because of that, we raise an error if someone
  // attempts to call the eager fallback on a view operator (the generic ATen
  // CPU fallback only warns in this case, to maintain BC for XLA view ops that
  // fall back to CPU).
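  // To illustrate the two cases handled below (schemas abbreviated):
  //   mul_(Tensor(a!) self, Tensor other) -> Tensor(a!)
  //     case (1): the return aliases `self`, so the ORIGINAL lazy input goes
  //     back onto the stack in place of the eager result.
  //   add(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
  //     case (2): a fresh eager result that simply gets copied back to the
  //     target device.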
  const auto& schema_returns = op.schema().returns();
  const auto& num_returns = schema_returns.size();
  auto returns = torch::jit::last(stack, num_returns);
  const auto returns_begin = stack->size() - num_returns;

  for (int64_t idx = 0; idx < returns.size(); ++idx) {
    if (returns[idx].isTensor()) {
      const auto& return_tens = returns[idx].toTensor();
      if (return_tens.defined()) {
        const auto alias_info = schema_returns[idx].alias_info();
        if (alias_info != nullptr && alias_info->isWrite()) {
          // Case (1): mutable alias case. Move the input ivalue directly onto
          // the stack in place of the existing eager output tensor.
          bool found_alias = false;
          // We could store some extra metadata on the function schema to avoid
          // the loop here if we need to improve perf.
          for (int64_t i = 0; i < tensor_args_indices.size(); ++i) {
            auto input_tensor_idx = tensor_args_indices[i];
            const auto& input_tensor = eager_tensors[i];
            const auto input_alias_info =
                schema_args[input_tensor_idx].alias_info();
            if (input_tensor.defined() && input_alias_info != nullptr &&
                *alias_info == *input_alias_info) {
              // We've found the original input tensor that aliases with the
              // current output. Wrap it in an IValue and put it directly on the
              // stack.
              (*stack)[returns_begin + idx] = c10::IValue(tensor_args[i]);
              found_alias = true;
              break;
            }
          }
          TORCH_CHECK(
              found_alias,
              "The operator ",
              op.schema().operator_name(),
              " appears to have invalid alias information. ",
              "Found a return tensor argument with a mismatched "
              "mutable alias: ",
              schema_returns[idx]);
        } else {
          c10::optional<c10::Device> tgt_device = compute_target_device(
              tensor_args, tensorlist_args, opt_tensorlist_args);
          if (alias_info != nullptr && !alias_info->isWrite()) {
            // immutable alias (view) case: error out here, since we'd be
            // copying rather than creating a view.
            // If this operator is needed, the backend should provide a kernel
            // for it.
            // See Note [Eager Fallback Does Not Handle View Operators]
            std::stringstream dev_str;
            if (tgt_device) {
              dev_str << *tgt_device;
            } else {
              dev_str << "<none>";
            }
            // We should never hit this for a view op,
            // because LazyTensor should provide a lowering for the
            // corresponding view_copy operator. The functionalization pass will
            // take care of calling the view_copy operator instead of the view.
            TORCH_CHECK(
                false,
                "The operator ",
                op.schema().operator_name(),
                " appears to be a view operator, ",
                "but it has no implementation for the backend \"",
                dev_str.str(),
                "\". View operators don't support ",
                "falling back to eager execution, since the tensor's "
                "storage cannot be shared across devices.");
          }
          // Case (2): copy case. Copy the eager output tensor to the original
          // device.

          // We technically might not have a target device, e.g. if you call
          // torch.cat() with an empty list. In that case, we shouldn't have any
          // tensors to schlep across devices anyway.
          if (tgt_device) {
            (*stack)[returns_begin + idx] =
                c10::IValue(returns[idx].toTensor().to(*tgt_device));
          }
        }
      }
    }
  }
}

} // namespace lazy
} // namespace torch