/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/single_threaded_executor.h"

#include <map>
#include <memory>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "tensorflow/core/common_runtime/entry.h"
#include "tensorflow/core/common_runtime/executor.h"
#include "tensorflow/core/common_runtime/executor_factory.h"
#include "tensorflow/core/common_runtime/renamed_device.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/macros.h"

namespace tensorflow {

Status ValidateOpIsSafeForSyncExecution(
    const Node& n, bool allow_control_flow_sync_execution) {
  for (DataType dt : n.output_types()) {
    if (IsRefType(dt)) {
      return errors::Unimplemented(
          "Single-threaded executor does not support reference-typed "
          "edges. But saw type ",
          DataTypeString(dt), " in outputs of node ", n.name());
    }
  }
  // Executing Switch nodes requires propagating deadness which is
  // not currently supported in the SingleThreadedExecutor.
  if (n.IsSwitch()) {
    return errors::FailedPrecondition(
        "Single-threaded executor does not support switch op, but saw node ",
        n.name(),
        ". Perhaps your graph contains old-style control flow primitives? "
        "Try using tf.compat.v1.enable_control_flow_v2().");
  }
  if (n.IsControlFlow() && !allow_control_flow_sync_execution) {
    return errors::FailedPrecondition(
        "Single-threaded executor does not support low level control flow, "
        "but saw control flow node ",
        n.name(),
        ". Perhaps your graph contains old-style control flow primitives? "
        "Try using tf.compat.v1.enable_control_flow_v2().");
  }
  return OkStatus();
}

namespace {

typedef gtl::InlinedVector<TensorValue, 4> TensorValueVec;
typedef gtl::InlinedVector<AllocatorAttributes, 4> AllocatorAttributeVec;

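// Name under which this executor is registered with the ExecutorFactory (see
// SingleThreadedExecutorRegistrar below). Run() also passes this name to each
// OpKernelContext via `params.executor_type`.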
static const string& kSingleThreadedExecutor =
    *new string("SINGLE_THREADED_EXECUTOR");

class SingleThreadedExecutorImpl : public Executor {
 public:
  explicit SingleThreadedExecutorImpl(const LocalExecutorParams& params)
      : params_(params) {}

  ~SingleThreadedExecutorImpl() override {
    for (const KernelState& kernel_state : kernels_) {
      params_.delete_kernel(kernel_state.kernel);
    }
    for (const ConstTensorKernelState& kernel_state : const_tensor_kernels_) {
      params_.delete_kernel(kernel_state.kernel);
    }
  }

  Status Initialize(const Graph& graph) {
    // Topologically sort `graph` to get a sequence of OpKernels.
    std::vector<Node*> ordered_nodes;
    ordered_nodes.reserve(graph.num_nodes());
    GetReversePostOrder(graph, &ordered_nodes);
    int ordered_nodes_size = ordered_nodes.size();
    if (ordered_nodes_size != graph.num_nodes()) {
      return errors::InvalidArgument("Graph had ", graph.num_nodes(),
                                     " nodes, but reverse post-order had ",
                                     ordered_nodes.size());
    }

    // We reserve space for two fewer nodes because we do not need to create
    // kernels for the _SOURCE and _SINK nodes.
    kernels_.reserve(ordered_nodes.size() - 2);
    std::vector<Node*> nodes_with_kernels;
    std::vector<Node*> nodes_with_const_tensor_kernels;
    nodes_with_kernels.reserve(ordered_nodes.size() - 2);

    std::map<size_t, Node*> arg_index_to_node_map;
    absl::flat_hash_map<Node*, size_t> node_to_index_map;

    // Create the kernel and input-related structures for each node in `graph`.
    for (Node* n : ordered_nodes) {
      if (n->IsSource() || n->IsSink()) {
        continue;
      }
      TF_RETURN_IF_ERROR(ValidateOpIsSafeForSyncExecution(
          *n, params_.allow_control_flow_sync_execution));
      if (n->IsArg()) {
        int32_t arg_index;
        TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &arg_index));
        if (arg_index < 0) {
          return errors::InvalidArgument("Invalid argument index ", arg_index,
                                         " in node ", n->name());
        }
        arg_index_to_node_map[arg_index] = n;
        // We do not create a kernel for Arg nodes, and instead inline the
        // argument handling directly in the executor code.
        continue;
      }

      OpKernel* kernel;
      TF_RETURN_IF_ERROR(params_.create_kernel(n->properties(), &kernel));

      const Tensor* const_tensor;
      if (n->num_outputs() == 1 && (const_tensor = kernel->const_tensor())) {
        // Nodes that produce a single constant tensor are handled specially:
        // we evaluate the tensor once, and propagate it to its consumers as
        // a `const Tensor*`, to avoid refcount manipulation.
        const size_t kernel_index = const_tensor_kernels_.size();
        const_tensor_kernels_.push_back({});
        nodes_with_const_tensor_kernels.push_back(n);
        ConstTensorKernelState& kernel_state =
            const_tensor_kernels_[kernel_index];
        kernel_state.kernel = kernel;
        kernel_state.const_tensor = *const_tensor;
      } else {
        const size_t kernel_index = kernels_.size();
        kernels_.push_back({});
        nodes_with_kernels.push_back(n);
        KernelState& kernel_state = kernels_[kernel_index];
        kernel_state.kernel = kernel;
        kernel_state.num_inputs = n->num_inputs();
        kernel_state.num_outputs = n->num_outputs();
        node_to_index_map[n] = kernel_index;
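        // Inputs for all kernels are stored contiguously in a single flat
        // vector (see the comment at the top of `Run()`), so each kernel's
        // slice of inputs starts immediately after the previous kernel's
        // slice.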
        if (kernel_index == 0) {
          kernel_state.input_start_index = 0;
        } else {
          const KernelState& previous_kernel_state =
              kernels_[kernel_index - 1];
          kernel_state.input_start_index =
              previous_kernel_state.input_start_index +
              previous_kernel_state.num_inputs;
        }
      }
    }

    // Build the mapping from each Arg node output to the input slot for the
    // corresponding destination node.
    if (!arg_index_to_node_map.empty()) {
      const size_t num_args = arg_index_to_node_map.rbegin()->first + 1;
      arg_output_locations_.resize(num_args);
      for (const auto& arg_index_node_pair : arg_index_to_node_map) {
        const size_t arg_index = arg_index_node_pair.first;
        const Node* arg_node = arg_index_node_pair.second;
        arg_output_locations_[arg_index].reserve(arg_node->out_edges().size());
        for (const Edge* e : arg_node->out_edges()) {
          if (e->src_output() == Graph::kControlSlot) {
            continue;
          } else if (e->src_output() != 0) {
            return errors::Internal("Invalid output index ", e->src_output(),
                                    " from argument node ", arg_index);
          }
          arg_output_locations_[arg_index].push_back(
              kernels_[node_to_index_map[e->dst()]].input_start_index +
              e->dst_input());
        }
      }
    }

    // Build the mapping from each const tensor kernel to the input slot for
    // the corresponding destination node.
    for (size_t i = 0; i < const_tensor_kernels_.size(); ++i) {
      Node* n = nodes_with_const_tensor_kernels[i];
      ConstTensorKernelState& kernel_state = const_tensor_kernels_[i];
      for (const Edge* e : n->out_edges()) {
        if (e->src_output() == Graph::kControlSlot) {
          continue;
        } else if (e->src_output() != 0) {
          return errors::Internal("Invalid output index ", e->src_output(),
                                  " from node ", n->DebugString());
        }
        kernel_state.output_locations.push_back(
            kernels_[node_to_index_map[e->dst()]].input_start_index +
            e->dst_input());
      }

      bool on_host =
          kernel_state.kernel->output_memory_types()[0] == HOST_MEMORY;
      kernel_state.output_alloc_attr.set_on_host(on_host);
    }

    // Build the mapping from each node output to the input slot for the
    // corresponding destination node.
    for (size_t i = 0; i < kernels_.size(); ++i) {
      Node* n = nodes_with_kernels[i];
      KernelState& kernel_state = kernels_[i];
      kernel_state.output_locations.resize(kernel_state.num_outputs);
      for (const Edge* e : n->out_edges()) {
        if (!e->IsControlEdge()) {
          kernel_state.output_locations[e->src_output()].push_back(
              kernels_[node_to_index_map[e->dst()]].input_start_index +
              e->dst_input());
        }
      }

      // Compute allocator attributes for each node output, and corresponding
      // node input.
      kernel_state.output_alloc_attrs.resize(kernel_state.num_outputs);
      AllocatorAttributes* attrs = kernel_state.output_alloc_attrs.data();

      OpKernel* op_kernel = kernel_state.kernel;
      for (int out = 0; out < n->num_outputs(); out++) {
        DCHECK_LT(out, op_kernel->output_memory_types().size());
        bool on_host = op_kernel->output_memory_types()[out] == HOST_MEMORY;
        if (on_host) {
          AllocatorAttributes h;
          h.set_on_host(on_host);
          attrs[out].Merge(h);
        }
      }
    }

    if (!kernels_.empty()) {
      const KernelState& last_kernel_state = kernels_.back();
      total_num_inputs_ =
          last_kernel_state.input_start_index + last_kernel_state.num_inputs;
      input_alloc_attrs_.resize(total_num_inputs_);
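      // Mirror each output's allocator attributes into the slots of the
      // inputs that it feeds, so that `Run()` can look up per-input
      // attributes directly by flat index.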
      for (size_t i = 0; i < kernels_.size(); ++i) {
        for (size_t j = 0; j < kernels_[i].output_locations.size(); ++j) {
          for (size_t output_location : kernels_[i].output_locations[j]) {
            input_alloc_attrs_[output_location] =
                kernels_[i].output_alloc_attrs[j];
          }
        }
      }
    } else {
      total_num_inputs_ = 0;
    }
    return OkStatus();
  }

  Status Run(const Args& args) override {
    // The inputs to each kernel are stored contiguously in `inputs`.
    //
    // We use `kernels_[i].input_start_index` and `kernels_[i].num_inputs` to
    // determine the range of elements in this vector that correspond to
    // the inputs of `kernels_[i]`.
    //
    // This vector has the following layout:
    //
    // * Kernel 0, input 0.
    // * Kernel 0, input 1.
    // * ...
    // * Kernel 0, input `kernels_[0].num_inputs - 1`.
    // * Kernel 1, input 0.
    // * ...
    // * Kernel 1, input `kernels_[1].num_inputs - 1`.
    // * ...
    // * Kernel `kernels_.size() - 1`, input 0.
    // * ...
    // * Kernel `kernels_.size() - 1`, input `kernels_.back().num_inputs - 1`.
    //
    // Note that kernels with zero inputs do not correspond to any elements in
    // this vector.
    //
    // We use `ManualConstructor<Tensor>` to avoid the overhead of
    // default-constructing an invalid `Tensor` for each slot at the beginning
    // of execution:
    // * Elements are initialized when the outputs of a kernel execution are
    //   propagated to the inputs of kernels that depend on them.
    // * The elements corresponding to the inputs for kernel `i` are destroyed
    //   after kernel `i` executes.
    // * In an error case (see below), we use the connectivity information in
    //   `KernelState::output_locations` to determine which locations have
    //   been initialized, and manually destroy them.
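    //
    // For example, if `kernels_[0]` has 2 inputs and `kernels_[1]` has 3,
    // then `inputs[0..1]` feed kernel 0 and `inputs[2..4]` feed kernel 1
    // (i.e. `kernels_[1].input_start_index == 2`).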
    std::vector<Entry> inputs(total_num_inputs_);

    // TODO(mrry): Can we avoid copying into these vectors? Consider modifying
    // OpKernelContext to take the TensorValueVec as a pointer into `inputs`.
    TensorValueVec node_inputs;
    AllocatorAttributeVec input_alloc_attrs;

    // Override intra op thread pool if requested.
    Device* device = params_.device;
    std::unique_ptr<Device> user_device;
    if (args.user_intra_op_threadpool != nullptr) {
      user_device = RenamedDevice::NewRenamedDevice(
          device->name(), device, /*owns_underlying=*/false,
          /*isolate_session_state=*/false, args.user_intra_op_threadpool);
      device = user_device.get();
    }

    // Prepare the parameters that will be the same for all kernels.
    OpKernelContext::Params params;
    params.step_id = args.step_id;
    params.device = device;
    params.log_memory = false;  // TODO(mrry): Too severe?
    params.rendezvous = args.rendezvous;
    params.session_state = args.session_state;
    params.session_metadata = params_.session_metadata;
    params.tensor_store = args.tensor_store;
    params.cancellation_manager = args.cancellation_manager;
    params.call_frame = args.call_frame;
    params.function_library = params_.function_library;
    params.resource_manager = device->resource_manager();
    params.step_container = args.step_container;
    params.collective_executor = args.collective_executor;
    params.stack_trace = args.stack_trace;
    params.slice_reader_cache = nullptr;  // TODO(mrry): Too severe?

    Args::Runner runner_copy = args.runner;
    params.runner = &runner_copy;
    params.run_all_kernels_inline = args.run_all_kernels_inline;
    params.stats_collector = args.stats_collector;
    params.executor_type = &kSingleThreadedExecutor;

    // NOTE(mrry): We are assuming that the graph is loopless and condless.
    params.frame_iter = FrameAndIter(0, 0);
    params.is_input_dead = false;

    device->TryGetDeviceContext(&params.op_device_context).IgnoreError();
    auto context_cleanup = gtl::MakeCleanup([&params] {
      if (params.op_device_context != nullptr) {
        params.op_device_context->Unref();
      }
    });

    // TODO(mrry): Consider implementing forwarding.
    params.forward_from_array = nullptr;

    const size_t received_args =
        args.call_frame ? args.call_frame->num_args() : 0;
    if (TF_PREDICT_FALSE(arg_output_locations_.size() > received_args)) {
      return errors::InvalidArgument("Expected ", arg_output_locations_.size(),
                                     " arguments, but only received ",
                                     received_args, ".");
    }

    // ArgOp is a relatively expensive OpKernel due to the Tensor
    // allocations that it performs. Therefore we specialize its implementation
    // and forward arguments directly to the inputs of kernels that consume
    // them.
    for (size_t i = 0; i < arg_output_locations_.size(); ++i) {
      const size_t num_destinations = arg_output_locations_[i].size();
      if (num_destinations > 0) {
        if (args.call_frame->CanConsumeArg(i)) {
          // The first destination input can consume the argument.
          Entry& first_input = inputs[arg_output_locations_[i][0]];
          first_input.state = Entry::State::HAS_VALUE;
          first_input.val.Init();
          args.call_frame->ConsumeArg(i, first_input.val.get());
          // All subsequent destination inputs get a shallow copy of the first
          // destination input.
          //
          // NOTE: If we had metadata about which kernels might attempt to
          // forward their input, we could arrange the kernel order so that
          // one of those kernels was executed last.
          for (size_t j = 1; j < num_destinations; ++j) {
            Entry& input = inputs[arg_output_locations_[i][j]];
            input.state = Entry::State::HAS_VALUE;
            input.val.Init(*first_input.val);
          }
        } else {
          const Tensor* arg;
          TF_RETURN_IF_ERROR(args.call_frame->GetArg(i, &arg));
          for (size_t j = 0; j < num_destinations; ++j) {
            Entry& input = inputs[arg_output_locations_[i][j]];
            // NOTE: We must make at least one shallow copy of the argument
            // tensor that remains live until all consuming kernels have
            // executed, to keep the reference count > 1, and inhibit buffer
            // forwarding. For simplicity, we shallow copy into the input
            // entry for each consuming kernel.
            input.state = Entry::State::HAS_VALUE;
            input.val.Init(*arg);
          }
        }
      }
    }

    // Kernels that return a constant value (e.g. ConstOp) are relatively
    // expensive due to the Tensor allocations that they perform. Therefore we
    // specialize their implementation and forward their constant value
    // directly to the inputs of kernels that consume them.
    for (const ConstTensorKernelState& kernel_state : const_tensor_kernels_) {
      for (size_t i = 0; i < kernel_state.output_locations.size(); ++i) {
        Entry& input = inputs[kernel_state.output_locations[i]];
        input.state = Entry::State::HAS_CONST_TENSOR;
        input.const_tensor = &kernel_state.const_tensor;
      }
    }

    // Execute the kernels one-at-a-time in topological order.
    for (size_t i = 0; i < kernels_.size(); ++i) {
      const KernelState& kernel_state = kernels_[i];

      // Prepare the per-kernel parameters.
      const size_t input_start_index = kernel_state.input_start_index;
      const size_t num_inputs = kernel_state.num_inputs;
      const size_t num_outputs = kernel_state.num_outputs;

      node_inputs.clear();
      node_inputs.resize(num_inputs);
      input_alloc_attrs.clear();
      input_alloc_attrs.resize(num_inputs);
      for (size_t j = 0; j < num_inputs; ++j) {
        Entry& input = inputs[input_start_index + j];
        switch (input.state) {
          case Entry::State::HAS_CONST_TENSOR:
            // NOTE(mrry): This `const_cast` is necessary because
            // `TensorValue` stores a non-const `Tensor*`, and relies on the
            // `OpKernelContext` accessors making dynamic checks that prevent
            // using an immutable tensor as a mutable tensor.
            node_inputs[j].tensor = const_cast<Tensor*>(input.const_tensor);
            break;
          case Entry::State::HAS_VALUE:
            node_inputs[j].tensor = input.val.get();
            break;
          default:
            DCHECK(false) << "Input did not have a valid value.";
        }
        input_alloc_attrs[j] = input_alloc_attrs_[input_start_index + j];
      }
      params.inputs = node_inputs;
      params.input_alloc_attrs = input_alloc_attrs;
      params.op_kernel = kernel_state.kernel;
      params.output_attr_array = kernel_state.output_alloc_attrs.data();
      OpKernelContext ctx(&params, num_outputs);

      // Actually execute the kernel.
      device->Compute(kernel_state.kernel, &ctx);
      TF_RETURN_IF_ERROR(ctx.status());

      // Free the inputs to the current kernel.
      for (size_t j = 0; j < num_inputs; ++j) {
        inputs[input_start_index + j].ClearVal();
      }

      // Forward the outputs of the kernel to the inputs of subsequent kernels.
      for (size_t j = 0; j < num_outputs; ++j) {
        TensorValue val = ctx.release_output(j);
        const size_t num_destinations =
            kernel_state.output_locations[j].size();
        if (num_destinations > 0) {
          // TODO(mrry): Consider flattening the `output_locations` vector
          // to improve the cache-friendliness of this loop.
          for (size_t k = 0; k < num_destinations - 1; ++k) {
            // TODO(mrry): Validate that the types match the expected values or
            // ensure that the necessary validation has already happened.
            Entry& input = inputs[kernel_state.output_locations[j][k]];
            input.state = Entry::State::HAS_VALUE;
            if (val.tensor != nullptr) {
              input.val.Init(*val.tensor);
            } else {
              input.val.Init(Tensor(kernel_state.kernel->output_type(j)));
            }
          }
          // Move `val` to the last consumer to avoid the cost of copying it.
          Entry& input =
              inputs[kernel_state.output_locations[j][num_destinations - 1]];
          input.state = Entry::State::HAS_VALUE;
          if (val.tensor != nullptr) {
            input.val.Init(std::move(*val.tensor));
          } else {
            input.val.Init(Tensor(kernel_state.kernel->output_type(j)));
          }
        }
        delete val.tensor;
      }
    }
    return OkStatus();
  }

  // When asynchronous execution is requested, run the whole (single-threaded)
  // execution on `args.runner` rather than in the calling thread: callers may
  // expect to perform expensive work in the calling thread concurrently, even
  // when the execution itself is single-threaded.
  //
  // This also avoids stack-overflow issues with functional control flow.
  void RunAsync(const Args& args, DoneCallback done) override {
    args.runner([this, args, done]() { done(Run(args)); });
  }

 private:
  const LocalExecutorParams params_;

  // All following members are read-only after Initialize().

  // The sum of the number of inputs for each node in the graph. This
  // determines the length of the flat `inputs` vector. See the comment at the
  // beginning of `Run()` for details.
  size_t total_num_inputs_;

  // Represents cached graph structure state for each kernel.
  struct KernelState {
    // The kernel object. Not owned.
    //
    // This pointer is managed by `params_.create_kernel()` and
    // `params_.delete_kernel()`.
    OpKernel* kernel;

    // These fields determine the range of elements in `inputs` that
    // corresponds to the inputs of `kernel`.
    size_t input_start_index;
    size_t num_inputs;

    size_t num_outputs;

    // For the `j`th output of `kernel`, `output_locations[j]` contains the
    // locations in the flat `inputs` vector to which that output must be
    // copied. See comment at the beginning of `Run()` for details.
    std::vector<std::vector<size_t>>
        output_locations;  // Length = `num_outputs`.

    // Memory space information for each output of `kernel`.
    std::vector<AllocatorAttributes>
        output_alloc_attrs;  // Length = `num_outputs`.
  };
  std::vector<KernelState> kernels_;

  // For the `i`th argument, `arg_output_locations_[i]` contains the locations
  // in the flat `inputs` vector to which that argument must be copied.
  std::vector<std::vector<size_t>>
      arg_output_locations_;  // Length = `num_args`.

  // Represents cached graph structure state for each kernel that produces
  // a single constant-valued tensor.
  struct ConstTensorKernelState {
    // The kernel object. Not owned.
    //
    // This pointer is managed by `params_.create_kernel()` and
    // `params_.delete_kernel()`.
    OpKernel* kernel;

    // The cached value of `kernel->const_tensor()`.
    //
    // NOTE: We keep a `Tensor` rather than a `const Tensor*` here in order to
    // keep the reference count on the underlying buffer above 1. Otherwise, a
    // kernel could interpret the input as a forwardable tensor, and mutate the
    // underlying constant tensor.
    Tensor const_tensor;

    // For the single output of `kernel`, `output_locations` contains the
    // locations in the flat `inputs` vector to which that output must be
    // copied. See comment at the beginning of `Run()` for details.
    std::vector<size_t> output_locations;  // Length = number of destinations.

    // Memory space information for the single output of `kernel`.
    AllocatorAttributes output_alloc_attr;
  };
  std::vector<ConstTensorKernelState> const_tensor_kernels_;

  // Memory space information for each input. This information is stored in
  // the same order as the flat `inputs` vector. See the comment at the
  // beginning of `Run()` for details.
  std::vector<AllocatorAttributes>
      input_alloc_attrs_;  // Length = `total_num_inputs_`.
};

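// Registers the single-threaded executor with the ExecutorFactory under the
// name "SINGLE_THREADED_EXECUTOR", so that it can be selected by executor
// type.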
class SingleThreadedExecutorRegistrar {
 public:
  SingleThreadedExecutorRegistrar() {
    ExecutorFactory::Register(kSingleThreadedExecutor, new Factory());
  }

 private:
  class Factory : public ExecutorFactory {
    Status NewExecutor(const LocalExecutorParams& params, const Graph& graph,
                       std::unique_ptr<Executor>* out_executor) override {
      Executor* ret;
      TF_RETURN_IF_ERROR(NewSingleThreadedExecutor(params, graph, &ret));
      out_executor->reset(ret);
      return OkStatus();
    }
  };
};
static SingleThreadedExecutorRegistrar registrar;

}  // namespace


Status NewSingleThreadedExecutor(const LocalExecutorParams& params,
                                 const Graph& graph, Executor** executor) {
  auto impl = std::make_unique<SingleThreadedExecutorImpl>(params);
  TF_RETURN_IF_ERROR(impl->Initialize(graph));
  *executor = impl.release();
  return OkStatus();
}
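
// Example usage (a minimal sketch, not part of the executor itself; `params`,
// `graph`, and `args` stand for a LocalExecutorParams, a Graph, and an
// Executor::Args that the caller has already populated):
//
//   Executor* executor_ptr = nullptr;
//   TF_RETURN_IF_ERROR(
//       NewSingleThreadedExecutor(params, graph, &executor_ptr));
//   std::unique_ptr<Executor> executor(executor_ptr);
//   TF_RETURN_IF_ERROR(executor->Run(args));
//
// Alternatively, the executor can be selected by name
// ("SINGLE_THREADED_EXECUTOR") through the ExecutorFactory registration above.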

}  // namespace tensorflow