// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/params.h>
#include <xnnpack/subgraph.h>


#ifndef XNN_ENABLE_SPARSE
  #error "XNN_ENABLE_SPARSE not defined"
#endif

enum xnn_status xnn_create_subgraph(
    uint32_t external_value_ids,
    uint32_t flags,
    xnn_subgraph_t* subgraph_out)
{
  struct xnn_subgraph* subgraph = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
    xnn_log_error("failed to create subgraph: XNNPACK is not initialized");
    goto error;
  }

  status = xnn_status_out_of_memory;

  subgraph = xnn_allocate_zero_memory(sizeof(struct xnn_subgraph));
  if (subgraph == NULL) {
    xnn_log_error("failed to allocate %zu bytes for subgraph descriptor", sizeof(struct xnn_subgraph));
    goto error;
  }

  subgraph->external_value_ids = external_value_ids;

  subgraph->values = xnn_allocate_zero_memory((size_t) external_value_ids * sizeof(struct xnn_value));
  if (subgraph->values == NULL) {
    xnn_log_error("failed to allocate %zu bytes for subgraph values",
      (size_t) external_value_ids * sizeof(struct xnn_value));
    goto error;
  }
  for (size_t i = 0; i < external_value_ids; i++) {
    subgraph->values[i].id = i;
  }
  subgraph->num_values = external_value_ids;
  subgraph->num_reserved_values = external_value_ids;

  *subgraph_out = subgraph;
  return xnn_status_success;

error:
  xnn_delete_subgraph(subgraph);
  return status;
}
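
// Illustrative usage sketch (not part of this file's implementation): a client
// creates a subgraph after initializing XNNPACK and releases it when done.
// Error handling is abbreviated.
//
//   xnn_initialize(NULL /* use default allocator */);
//   xnn_subgraph_t subgraph = NULL;
//   if (xnn_create_subgraph(/*external_value_ids=*/2, /*flags=*/0, &subgraph) != xnn_status_success) {
//     // handle failure
//   }
//   // ... define Values and Nodes, create a runtime ...
//   xnn_delete_subgraph(subgraph);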

struct xnn_value* xnn_subgraph_new_internal_value(xnn_subgraph_t subgraph)
{
  struct xnn_value* values = subgraph->values;
  const size_t size = subgraph->num_values;
  const size_t capacity = subgraph->num_reserved_values;
  if (capacity < size + 1) {
    const size_t new_capacity = max(min(capacity * 2, capacity + 512), capacity + 64);
    assert(new_capacity >= size + 1);
    values = xnn_reallocate_memory(values, new_capacity * sizeof(struct xnn_value));
    if (values == NULL) {
      xnn_log_error("failed to allocate %zu bytes for subgraph values",
        new_capacity * sizeof(struct xnn_value));
      return NULL;
    }

    memset(values + size, 0, (new_capacity - size) * sizeof(struct xnn_value));
    subgraph->num_reserved_values = new_capacity;
    subgraph->values = values;
  }
  subgraph->num_values = size + 1;
  struct xnn_value* new_value = values + size;
  new_value->id = size;
  return new_value;
}
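
// The growth policy above doubles the capacity of small arrays while capping
// the increment at 512 elements and guaranteeing a minimum increment of 64.
// For example, starting from an empty subgraph the reserved-value capacity
// grows as 0 -> 64 -> 128 -> 256 -> 512 -> 1024 -> 1536 -> 2048 -> ...
// (the same policy is used for the Node array below).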

void xnn_node_clear(struct xnn_node* node) {
  assert(node != NULL);
  memset(node, 0, sizeof(struct xnn_node));
}

void xnn_value_clear(struct xnn_value* value) {
  assert(value != NULL);
  memset(value, 0, sizeof(struct xnn_value));
}

void xnn_value_copy(
  struct xnn_value* dst_value,
  const struct xnn_value* src_value)
{
  // Note: Value ID stays unchanged

  dst_value->type = src_value->type;
  dst_value->datatype = src_value->datatype;
  dst_value->quantization = src_value->quantization;
  dst_value->shape = src_value->shape;
  dst_value->flags = src_value->flags;
  dst_value->data = src_value->data;
  dst_value->producer = src_value->producer;
  dst_value->first_consumer = src_value->first_consumer;
}

struct xnn_node* xnn_subgraph_new_node(xnn_subgraph_t subgraph)
{
  struct xnn_node* nodes = subgraph->nodes;
  const size_t size = subgraph->num_nodes;
  const size_t capacity = subgraph->num_reserved_nodes;

  if (capacity < size + 1) {
    const size_t new_capacity = max(min(capacity * 2, capacity + 512), capacity + 64);
    assert(new_capacity >= size + 1);
    nodes = xnn_reallocate_memory(nodes, new_capacity * sizeof(struct xnn_node));
    if (nodes == NULL) {
      xnn_log_error("failed to allocate %zu bytes for subgraph nodes",
        new_capacity * sizeof(struct xnn_node));
      return NULL;
    }

    memset(nodes + size, 0, (new_capacity - size) * sizeof(struct xnn_node));
    subgraph->num_reserved_nodes = new_capacity;
    subgraph->nodes = nodes;
  }
  subgraph->num_nodes = size + 1;
  struct xnn_node* new_node = nodes + size;
  new_node->id = size;
  return new_node;
}

void xnn_subgraph_add_nodes(xnn_subgraph_t subgraph, size_t num_nodes)
{
  struct xnn_node* nodes = subgraph->nodes;
  const size_t size = subgraph->num_nodes;
  const size_t capacity = subgraph->num_reserved_nodes;

  if (capacity < size + num_nodes) {
    const size_t new_capacity = max(min(capacity * 2, capacity + 512), capacity + max(num_nodes, 64));
    assert(new_capacity >= size + num_nodes);
    nodes = xnn_reallocate_memory(nodes, new_capacity * sizeof(struct xnn_node));
    if (nodes == NULL) {
      xnn_log_error("failed to allocate %zu bytes for subgraph nodes",
        new_capacity * sizeof(struct xnn_node));
      return;
    }

    memset(nodes + size, 0, (new_capacity - size) * sizeof(struct xnn_node));
    subgraph->num_reserved_nodes = new_capacity;
    subgraph->nodes = nodes;
  }
  subgraph->num_nodes = size + num_nodes;
  struct xnn_node* new_nodes = nodes + size;
  for (size_t i = 0; i < num_nodes; i++) {
    new_nodes[i].id = size + i;
  }
}

void xnn_subgraph_analyze_consumers_and_producers(xnn_subgraph_t subgraph)
{
  // Initialize producer/consumer fields to safe defaults.
  for (uint32_t i = 0; i < subgraph->num_values; i++) {
    struct xnn_value* value = &subgraph->values[i];
    value->producer = XNN_INVALID_NODE_ID;
    value->first_consumer = XNN_INVALID_NODE_ID;
    value->num_consumers = 0;
  }

  // Analyze Nodes' inputs and outputs and update Values' producer/consumer fields.
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];

    for (uint32_t i = 0; i < node->num_inputs; i++) {
      const uint32_t input_id = node->inputs[i];
      assert(input_id < subgraph->num_values);

      if (subgraph->values[input_id].num_consumers++ == 0) {
        assert(subgraph->values[input_id].first_consumer == XNN_INVALID_NODE_ID);
        subgraph->values[input_id].first_consumer = n;
      }
    }

    for (uint32_t o = 0; o < node->num_outputs; o++) {
      const uint32_t output_id = node->outputs[o];
      assert(output_id < subgraph->num_values);

      // Persistent values can be produced by multiple nodes, e.g. copy nodes writing to the same persistent value.
      assert(xnn_value_is_persistent(&subgraph->values[output_id]) ||
             subgraph->values[output_id].producer == XNN_INVALID_NODE_ID);
      subgraph->values[output_id].producer = n;
    }
  }

  // Count an extra consumer for Values which are external outputs, so that
  // they are never treated as unconsumed.
  for (uint32_t i = 0; i < subgraph->num_values; i++) {
    struct xnn_value* value = &subgraph->values[i];
    if (xnn_value_is_external_output(value)) {
      value->num_consumers += 1;
    }
  }
}
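
// For example (illustrative): in a two-node chain
//   Value #0 -> Node #0 -> Value #1 -> Node #1 -> Value #2
// this analysis sets Value #1's producer to Node #0 and its first_consumer to
// Node #1 (num_consumers == 1), while the external output Value #2 ends up
// with producer Node #1 and num_consumers == 1 purely from the extra count
// added above.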

#define XNN_LAYOUT_FLAG_COMPATIBLE_NCHW 1
#define XNN_LAYOUT_FLAG_COMPATIBLE_NHWC2NCHW 2
#define XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC 4
#define XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER 8
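
// These flags form a bitmask and may be combined: e.g. Global Average Pooling
// 2D below returns XNN_LAYOUT_FLAG_COMPATIBLE_NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC,
// meaning it can either operate inside an NCHW cluster or convert the
// cluster's result back to NHWC.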

uint32_t xnn_check_nchw_compatibility(xnn_subgraph_t subgraph, struct xnn_node* node) {
  if (node->compute_type != xnn_compute_type_fp32) {
    if (node->type != xnn_node_type_invalid) {
      xnn_log_info(
        "Node %s compute type %d is incompatible with sparse inference",
        xnn_node_type_to_string(node->type), node->compute_type);
    }
    return 0;
  }

  switch (node->type) {
    case xnn_node_type_convolution_2d:
      // Supported cases:
      // - 1x1 convolution (no stride, no dilation, no padding, no groups)
      // - 3x3 stride-2 convolution (no dilation, padding 1 on each side, no groups, 3 input channels)
      if (node->params.convolution_2d.groups != 1) {
        xnn_log_info("Node %s groups (%" PRIu32 ") "
                     "is incompatible with sparse inference",
                     xnn_node_type_to_string(node->type),
                     node->params.convolution_2d.groups);
        return 0;
      }
      if ((node->params.convolution_2d.dilation_height | node->params.convolution_2d.dilation_width) != 1) {
        xnn_log_info("Node %s dilation (height=%" PRIu32 ", width=%" PRIu32 ") "
                     "is incompatible with sparse inference",
                     xnn_node_type_to_string(node->type),
                     node->params.convolution_2d.dilation_height,
                     node->params.convolution_2d.dilation_width);
        return 0;
      }
      if ((node->params.convolution_2d.kernel_height | node->params.convolution_2d.kernel_width) == 1) {
        if ((node->params.convolution_2d.input_padding_top | node->params.convolution_2d.input_padding_right |
             node->params.convolution_2d.input_padding_bottom | node->params.convolution_2d.input_padding_left) != 0) {
          xnn_log_info("Node %s (1x1 kernel) padding (top=%" PRIu32 ", right=%" PRIu32 ", bottom=%" PRIu32 ", left=%" PRIu32 ") "
                       "is incompatible with sparse inference",
                       xnn_node_type_to_string(node->type),
                       node->params.convolution_2d.input_padding_top,
                       node->params.convolution_2d.input_padding_right,
                       node->params.convolution_2d.input_padding_bottom,
                       node->params.convolution_2d.input_padding_left);
          return 0;
        }
        if ((node->params.convolution_2d.subsampling_height | node->params.convolution_2d.subsampling_width) != 1) {
          xnn_log_info("Node %s (1x1 kernel) subsampling (height=%" PRIu32 ", width=%" PRIu32 ") "
                       "is incompatible with sparse inference",
                       xnn_node_type_to_string(node->type),
                       node->params.convolution_2d.subsampling_height,
                       node->params.convolution_2d.subsampling_width);
          return 0;
        }
        return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW;
      } else if (node->params.convolution_2d.kernel_height == 3 && node->params.convolution_2d.kernel_width == 3) {
        if (node->params.convolution_2d.input_padding_top != 1 || node->params.convolution_2d.input_padding_right != 1 ||
            node->params.convolution_2d.input_padding_bottom != 1 || node->params.convolution_2d.input_padding_left != 1) {
          xnn_log_info("Node %s (3x3 kernel) padding (top=%" PRIu32 ", right=%" PRIu32 ", bottom=%" PRIu32 ", left=%" PRIu32 ") "
                       "is incompatible with sparse inference",
                       xnn_node_type_to_string(node->type),
                       node->params.convolution_2d.input_padding_top,
                       node->params.convolution_2d.input_padding_right,
                       node->params.convolution_2d.input_padding_bottom,
                       node->params.convolution_2d.input_padding_left);
          return 0;
        }
        if ((node->params.convolution_2d.subsampling_height | node->params.convolution_2d.subsampling_width) != 2) {
          xnn_log_info("Node %s (3x3 kernel) subsampling (height=%" PRIu32 ", width=%" PRIu32 ") "
                       "is incompatible with sparse inference",
                       xnn_node_type_to_string(node->type),
                       node->params.convolution_2d.subsampling_height,
                       node->params.convolution_2d.subsampling_width);
          return 0;
        }
        if (node->params.convolution_2d.group_input_channels != 3) {
          xnn_log_info("Node %s (3x3 kernel) input channels (%zu) "
                       "is incompatible with sparse inference",
                       xnn_node_type_to_string(node->type),
                       node->params.convolution_2d.group_input_channels);
          return 0;
        }
        return XNN_LAYOUT_FLAG_COMPATIBLE_NHWC2NCHW;
      }
      return 0;
    case xnn_node_type_depthwise_convolution_2d:
      // Supported cases:
      // - 3x3 stride-1 convolution (no dilation, padding 1 on each side)
      // - 3x3 stride-2 convolution (no dilation, padding 1 on each side)
      // - 5x5 stride-1 convolution (no dilation, padding 2 on each side)
      // - 5x5 stride-2 convolution (no dilation, padding 2 on each side)
      if ((node->params.depthwise_convolution_2d.dilation_height | node->params.depthwise_convolution_2d.dilation_width) != 1) {
        xnn_log_info("Node %s dilation (height=%" PRIu32 ", width=%" PRIu32 ") "
                     "is incompatible with sparse inference",
                     xnn_node_type_to_string(node->type),
                     node->params.depthwise_convolution_2d.dilation_height,
                     node->params.depthwise_convolution_2d.dilation_width);
        return 0;
      }
      if (node->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
        xnn_log_info("Node %s flags (%" PRIu32 ") has padding incompatible with sparse inference",
                     xnn_node_type_to_string(node->type),
                     node->flags);
        return 0;
      }
      if (node->params.depthwise_convolution_2d.depth_multiplier != 1) {
        xnn_log_info("Node %s depth_multiplier (%" PRIu32 ") is incompatible with sparse inference",
                     xnn_node_type_to_string(node->type),
                     node->params.depthwise_convolution_2d.depth_multiplier);
        return 0;
      }
      if (node->params.depthwise_convolution_2d.subsampling_height != node->params.depthwise_convolution_2d.subsampling_width) {
        xnn_log_info("Node %s subsampling (height=%" PRIu32 ", width=%" PRIu32 ") "
                     "is incompatible with sparse inference",
                     xnn_node_type_to_string(node->type),
                     node->params.depthwise_convolution_2d.subsampling_height,
                     node->params.depthwise_convolution_2d.subsampling_width);
        return 0;
      }
      switch (node->params.depthwise_convolution_2d.subsampling_height) {
        case 1:
        case 2:
          break;
        default:
          xnn_log_info("Node %s subsampling_height (%" PRIu32 ") "
                       "is incompatible with sparse inference",
                       xnn_node_type_to_string(node->type),
                       node->params.depthwise_convolution_2d.subsampling_height);
          return 0;
      }
      if (node->params.depthwise_convolution_2d.kernel_height != node->params.depthwise_convolution_2d.kernel_width) {
        xnn_log_info("Node %s kernel (height=%" PRIu32 ", width=%" PRIu32 ") "
                     "is incompatible with sparse inference",
                     xnn_node_type_to_string(node->type),
                     node->params.depthwise_convolution_2d.kernel_height,
                     node->params.depthwise_convolution_2d.kernel_width);
        return 0;
      }
      switch (node->params.depthwise_convolution_2d.kernel_height) {
        case 3:
          if (node->params.depthwise_convolution_2d.input_padding_top == 1 &&
              node->params.depthwise_convolution_2d.input_padding_right == 1 &&
              node->params.depthwise_convolution_2d.input_padding_bottom == 1 &&
              node->params.depthwise_convolution_2d.input_padding_left == 1) {
            return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW;
          } else {
            xnn_log_info("Node %s (3x3 kernel) padding "
                         "(top=%" PRIu32 ", right=%" PRIu32 ", bottom=%" PRIu32 ", left=%" PRIu32 ") "
                         "is incompatible with sparse inference",
                         xnn_node_type_to_string(node->type),
                         node->params.depthwise_convolution_2d.input_padding_top,
                         node->params.depthwise_convolution_2d.input_padding_right,
                         node->params.depthwise_convolution_2d.input_padding_bottom,
                         node->params.depthwise_convolution_2d.input_padding_left);
            return 0;
          }
        case 5:
          if (node->params.depthwise_convolution_2d.input_padding_top == 2 &&
              node->params.depthwise_convolution_2d.input_padding_right == 2 &&
              node->params.depthwise_convolution_2d.input_padding_bottom == 2 &&
              node->params.depthwise_convolution_2d.input_padding_left == 2) {
            return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW;
          } else {
            xnn_log_info("Node %s (5x5 kernel) padding "
                         "(top=%" PRIu32 ", right=%" PRIu32 ", bottom=%" PRIu32 ", left=%" PRIu32 ") "
                         "is incompatible with sparse inference",
                         xnn_node_type_to_string(node->type),
                         node->params.depthwise_convolution_2d.input_padding_top,
                         node->params.depthwise_convolution_2d.input_padding_right,
                         node->params.depthwise_convolution_2d.input_padding_bottom,
                         node->params.depthwise_convolution_2d.input_padding_left);
            return 0;
          }
        default:
          return 0;
      }
    case xnn_node_type_depth_to_space:
      return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC;
    case xnn_node_type_global_average_pooling_2d:
      return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC;
    case xnn_node_type_add2:
    case xnn_node_type_multiply2:
      assert(node->num_inputs == 2);
      assert(node->num_outputs == 1);
      if (subgraph->values[node->inputs[0]].shape.num_dims != 4 ||
          subgraph->values[node->inputs[1]].shape.num_dims != 4)
      {
        xnn_log_info("Node %s inputs shape is incompatible with sparse inference",
                     xnn_node_type_to_string(node->type));
        return 0;
      }

      if (subgraph->values[node->inputs[0]].data != NULL) {
        // Check that the first input is representable as either a scalar, or a vector
        size_t num_nonunit_dims = 0;
        for (uint32_t i = 0; i < subgraph->values[node->inputs[0]].shape.num_dims; i++) {
          if (subgraph->values[node->inputs[0]].shape.dim[i] != 1) {
            num_nonunit_dims += 1;
          }
        }
        if (num_nonunit_dims > 1) {
          return 0;
        }
      }

      if (subgraph->values[node->inputs[1]].data != NULL) {
        // Check that the second input is representable as either a scalar, or a vector
        size_t num_nonunit_dims = 0;
        for (uint32_t i = 0; i < subgraph->values[node->inputs[1]].shape.num_dims; i++) {
          if (subgraph->values[node->inputs[1]].shape.dim[i] != 1) {
            num_nonunit_dims += 1;
          }
        }
        if (num_nonunit_dims > 1) {
          return 0;
        }
      }

      return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW;
    case xnn_node_type_static_resize_bilinear_2d:
      if (subgraph->values[node->inputs[0]].shape.dim[1] > 1 &&
          subgraph->values[node->inputs[0]].shape.dim[2] > 1) {
        return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW;
      } else {
        xnn_log_info("Node %s inputs shape is incompatible with sparse inference",
                     xnn_node_type_to_string(node->type));
        return 0;
      }
    case xnn_node_type_abs:
    case xnn_node_type_bankers_rounding:
    case xnn_node_type_ceiling:
    case xnn_node_type_clamp:
    case xnn_node_type_elu:
    case xnn_node_type_floor:
    case xnn_node_type_hardswish:
    case xnn_node_type_leaky_relu:
    case xnn_node_type_negate:
    case xnn_node_type_sigmoid:
    case xnn_node_type_square:
      assert(node->num_inputs == 1);
      assert(node->num_outputs == 1);
      if (subgraph->values[node->inputs[0]].shape.num_dims == 4) {
        return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW;
      } else {
        xnn_log_info("Node %s inputs shape is incompatible with sparse inference",
                     xnn_node_type_to_string(node->type));
        return 0;
      }
    default:
      return 0;
  }
}

void xnn_subgraph_rewrite_for_nchw(xnn_subgraph_t subgraph)
{
  // Convert parts of the subgraph to NCHW for sparse inference:
  // Step 1: detect NCHW-compatible Nodes
  // Step 2: detect NCHW-compatible clusters (run connected components graph algorithm)
  // Step 3: check that all NCHW-compatible Values are consumed only by NCHW-compatible Nodes
  // Step 4: switch Values' layout to NCHW
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];
    node->layout_flags = xnn_check_nchw_compatibility(subgraph, node);
    xnn_log_debug("Node #%" PRIu32 ": %s (NCHW: %s, NHWC->NCHW: %s, NCHW->NHWC: %s)",
                  n, xnn_node_type_to_string(node->type),
                  node->layout_flags & XNN_LAYOUT_FLAG_COMPATIBLE_NCHW ? "yes" : "no",
                  node->layout_flags & XNN_LAYOUT_FLAG_COMPATIBLE_NHWC2NCHW ? "yes" : "no",
                  node->layout_flags & XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC ? "yes" : "no");
  }

  // Run the Shiloach-Vishkin connected-components algorithm: find all
  // XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC Nodes and propagate them as cluster
  // leaders to their producer Nodes.
  bool update = false;
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];
    node->cluster_leader = n;
    if (node->layout_flags & XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC) {
      for (uint32_t i = 0; i < node->num_inputs; i++) {
        const struct xnn_value* value = &subgraph->values[node->inputs[i]];
        if (value->data != NULL) {
          // Static data, skip this input value. Compatibility of this static input with NCHW layout was validated
          // during the initial NCHW compatibility check for the Node.
          continue;
        }
        if (xnn_value_is_external(value)) {
          // External value, invalid cluster
          node->layout_flags |= XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER;
          continue;
        }
        const uint32_t producer_id = value->producer;
        assert(producer_id != XNN_INVALID_NODE_ID);
        assert(producer_id < n);
        struct xnn_node* producer_node = &subgraph->nodes[producer_id];
        if ((producer_node->layout_flags & (XNN_LAYOUT_FLAG_COMPATIBLE_NHWC2NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW)) != 0 &&
            (producer_node->layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) == 0)
        {
          producer_node->layout_flags &= ~XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC;
          if (producer_node->cluster_leader != node->cluster_leader) {
            producer_node->cluster_leader = node->cluster_leader = math_max_u32(producer_node->cluster_leader, node->cluster_leader);
            update = true;
          }
        } else {
          node->layout_flags |= XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER;
        }
      }
    }
  }
  // If no Node joined a cluster in the pass above, there are no
  // NCHW-compatible clusters to rewrite, so bail out early.
  if (!update) {
    return;
  }
  // Propagate cluster leaders through the graph until a fixed point is
  // reached, i.e. no Node's cluster leader changes in a full pass.
  while (update) {
    update = false;
    for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
      struct xnn_node* node = &subgraph->nodes[n];
      if (node->layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) {
        continue;
      }

      if ((node->layout_flags & (XNN_LAYOUT_FLAG_COMPATIBLE_NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC)) == 0) {
        continue;
      }

      for (uint32_t i = 0; i < node->num_inputs; i++) {
        const struct xnn_value* value = &subgraph->values[node->inputs[i]];
        if (value->data != NULL) {
          // Static data, skip this input value. Compatibility of this static input with NCHW layout was validated
          // during the initial NCHW compatibility check for the Node.
          continue;
        }
        if (xnn_value_is_external(value)) {
          // External value, invalid cluster
          node->layout_flags |= XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER;
          continue;
        }
        const uint32_t producer_id = value->producer;
        assert(producer_id != XNN_INVALID_NODE_ID);
        assert(producer_id < n);
        struct xnn_node* producer_node = &subgraph->nodes[producer_id];
        if ((producer_node->layout_flags & (XNN_LAYOUT_FLAG_COMPATIBLE_NHWC2NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW)) != 0 &&
            (producer_node->layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) == 0)
        {
          producer_node->layout_flags &= ~XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC;
          if (producer_node->cluster_leader != node->cluster_leader) {
            producer_node->cluster_leader = node->cluster_leader = math_max_u32(producer_node->cluster_leader, node->cluster_leader);
            update = true;
          }
        } else {
          node->layout_flags |= XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER;
        }
      }
    }
  }
  // Propagate XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER flags up to the cluster leaders.
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];
    subgraph->nodes[node->cluster_leader].layout_flags |= node->layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER;
  }
  // Check that all Values consumed by NCHW-compatible clusters don't have NCHW-incompatible consumers.
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];
    if ((subgraph->nodes[node->cluster_leader].layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) != 0) {
      continue;
    }

    if ((node->layout_flags & (XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW)) == 0) {
      continue;
    }

    for (uint32_t i = 0; i < node->num_inputs; i++) {
      struct xnn_value* value = &subgraph->values[node->inputs[i]];
      if (value->data != NULL) {
        // Static data, skip this input value because it doesn't have a producer Node.
        continue;
      }
      assert(!xnn_value_is_external(value));
      value->num_nchw_compatible_consumers += 1;
    }
  }
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];
    if ((subgraph->nodes[node->cluster_leader].layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) != 0) {
      continue;
    }

    if ((node->layout_flags & (XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW)) == 0) {
      continue;
    }

    for (uint32_t i = 0; i < node->num_inputs; i++) {
      const struct xnn_value* value = &subgraph->values[node->inputs[i]];
      if (value->data != NULL) {
        // Static data, skip this input value because it doesn't have a producer Node.
        continue;
      }
      assert(!xnn_value_is_external(value));
      assert(value->num_nchw_compatible_consumers > 0);
      if (value->num_nchw_compatible_consumers != value->num_consumers) {
        subgraph->nodes[node->cluster_leader].layout_flags |= XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER;
      }
    }
  }
  // Evaluate whether it is profitable to run the model as sparse:
  // - Compute the number of parameters and zeroes in 1x1 Convolution weights
  // - Disable sparse rewriting for clusters without 1x1 Convolutions (num_params == 0)
  //   or with fewer than two thirds of the 1x1 Convolution filter weights being zero
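  // For example (illustrative arithmetic): a cluster whose 1x1 Convolutions
  // hold 400 weights with 300 zeroes (75% sparsity) passes the check below,
  // since 300 * 3 = 900 > 400 * 2 = 800; at exactly 2/3 sparsity
  // (e.g. 300 zeroes out of 450 weights) the rewrite is disabled.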
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];
    if ((subgraph->nodes[node->cluster_leader].layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) != 0) {
      continue;
    }

    if (node->type == xnn_node_type_convolution_2d &&
        max(node->params.convolution_2d.kernel_height, node->params.convolution_2d.kernel_width) == 1)
    {
      assert(node->num_inputs >= 2);

      const struct xnn_value* filter = &subgraph->values[node->inputs[1]];
      assert(filter->data != NULL);
      assert(filter->shape.num_dims == 4);

      const size_t num_params = filter->shape.dim[0] * filter->shape.dim[3];
      subgraph->nodes[node->cluster_leader].num_params += num_params;

      const float* data = (const float*) filter->data;
      size_t num_zeroes = 0;
      for (size_t i = 0; i < num_params; i++) {
        num_zeroes += (size_t) (data[i] == 0.0f);
      }
      xnn_log_debug("1x1 Convolution 2D Node #%" PRIu32 ": %zu / %zu sparsity", n, num_zeroes, num_params);
      subgraph->nodes[node->cluster_leader].num_zeroes += num_zeroes;
    }
  }
  bool use_nchw_layout = false;
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];
    if ((subgraph->nodes[node->cluster_leader].layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) != 0) {
      continue;
    }

    if ((node->layout_flags & (XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW)) == 0) {
      continue;
    }

    if (subgraph->nodes[node->cluster_leader].num_zeroes * 3 <= subgraph->nodes[node->cluster_leader].num_params * 2) {
      xnn_log_info("Node #%" PRIu32 ": sparse inference disabled: 1x1 Convolutions contain %zu / %zu zero weights",
                   n, subgraph->nodes[node->cluster_leader].num_zeroes, subgraph->nodes[node->cluster_leader].num_params);
      continue;
    }

    for (uint32_t i = 0; i < node->num_inputs; i++) {
      struct xnn_value* value = &subgraph->values[node->inputs[i]];
      if (value->data != NULL) {
        // Static data, skip this input value because it doesn't have a producer Node.
        continue;
      }
      assert(!xnn_value_is_external(value));
      assert(value->num_nchw_compatible_consumers > 0);
      assert(value->num_nchw_compatible_consumers == value->num_consumers);
      if (value->layout != xnn_layout_type_nchw) {
        value->layout = xnn_layout_type_nchw;
        xnn_log_info("set Value #%" PRIu32 " layout to NCHW", node->inputs[i]);
        use_nchw_layout = true;
      }
    }
  }
  if (use_nchw_layout) {
    xnn_log_info("XNNPACK has switched to sparse inference mode!");
  }
}
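
// For example (illustrative): a typical rewritable cluster looks like
//   3x3 stride-2 Convolution (NHWC2NCHW) -> 1x1 Convolutions and Depthwise
//   Convolutions (NCHW) -> Global Average Pooling 2D (NCHW2NHWC)
// so only the cluster's entry and exit Nodes touch NHWC data, and every
// internal Value is switched to NCHW layout.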

bool xnn_subgraph_rewrite_for_fp16(xnn_subgraph_t subgraph)
{
  xnn_log_info("Analyzing subgraph for FP16 compatibility");

  // Convert tensors and operators in the subgraph to FP16:
  // 1. Check that all operators in the subgraph are supported in FP16.
  // 2. Indicate values that must be converted to FP16.
  // 3. Replace FP32 Values with FP16 Values as Nodes' inputs/outputs.
  // 4. Insert FP32->FP16 Convert Nodes for external FP32 inputs and FP16->FP32 Convert Nodes for external outputs.

  // Check that all operators in the subgraph are supported in FP16; bail out on any unsupported one.
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];
    if (node->type == xnn_node_type_invalid) {
      // Node was fused away, skip.
      continue;
    }

    if (node->compute_type != xnn_compute_type_fp32) {
      xnn_log_warning("FP16 rewrite aborted: node #%" PRIu32 " (%s) is not FP32", n, xnn_node_type_to_string(node->type));
      return false;
    }
    for (uint32_t i = 0; i < node->num_inputs; i++) {
      if (subgraph->values[node->inputs[i]].layout == xnn_layout_type_nchw) {
        xnn_log_warning(
          "FP16 rewrite aborted: input #%" PRIu32 " (Value #%" PRIu32 ") of node #%" PRIu32 " (%s) has NCHW layout",
          i, node->inputs[i], n, xnn_node_type_to_string(node->type));
        return false;
      }
    }
    for (uint32_t o = 0; o < node->num_outputs; o++) {
      if (subgraph->values[node->outputs[o]].layout == xnn_layout_type_nchw) {
        xnn_log_warning(
          "FP16 rewrite aborted: output #%" PRIu32 " (Value #%" PRIu32 ") of node #%" PRIu32 " (%s) has NCHW layout",
          o, node->outputs[o], n, xnn_node_type_to_string(node->type));
        return false;
      }
    }
    switch (node->type) {
      case xnn_node_type_abs:
      case xnn_node_type_add2:
      case xnn_node_type_divide:
      case xnn_node_type_maximum2:
      case xnn_node_type_minimum2:
      case xnn_node_type_multiply2:
      case xnn_node_type_concatenate2:
      case xnn_node_type_concatenate3:
      case xnn_node_type_concatenate4:
      case xnn_node_type_squared_difference:
      case xnn_node_type_subtract:
        for (uint32_t i = 0; i < node->num_inputs; i++) {
          if (subgraph->values[node->inputs[i]].data != NULL) {
            xnn_log_warning("FP16 rewrite aborted: node #%" PRIu32 " (%s) has static input %" PRIu32,
                            n, xnn_node_type_to_string(node->type), i);
            return false;
          }
        }
        break;
      case xnn_node_type_average_pooling_2d:
      case xnn_node_type_bankers_rounding:
      case xnn_node_type_ceiling:
      case xnn_node_type_clamp:
      case xnn_node_type_convolution_2d:
      case xnn_node_type_deconvolution_2d:
      case xnn_node_type_depthwise_convolution_2d:
      case xnn_node_type_depth_to_space:
      case xnn_node_type_elu:
      case xnn_node_type_even_split2:
      case xnn_node_type_even_split3:
      case xnn_node_type_even_split4:
      case xnn_node_type_floor:
      case xnn_node_type_fully_connected:
      case xnn_node_type_global_average_pooling_2d:
      case xnn_node_type_hardswish:
      case xnn_node_type_leaky_relu:
      case xnn_node_type_max_pooling_2d:
      case xnn_node_type_negate:
      case xnn_node_type_prelu:
      case xnn_node_type_sigmoid:
      case xnn_node_type_softmax:
      case xnn_node_type_static_constant_pad:
      case xnn_node_type_static_reshape:
      case xnn_node_type_static_resize_bilinear_2d:
      case xnn_node_type_static_transpose:
      case xnn_node_type_square:
      case xnn_node_type_square_root:
        break;
      default:
        xnn_log_warning("FP16 rewrite aborted: node #%" PRIu32 " (%s) is not supported for FP16 inference",
                        n, xnn_node_type_to_string(node->type));
        return false;
    }
  }

  // Annotate Values to be converted to FP16 as FP16-compatible.
  // Note that static weights in [Depthwise] Convolution, Fully Connected, and PReLU Nodes remain FP32;
  // they will be converted to FP16 during weight repacking when the operator is created.
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];
    switch (node->type) {
      case xnn_node_type_convolution_2d:
      case xnn_node_type_deconvolution_2d:
      case xnn_node_type_depthwise_convolution_2d:
      case xnn_node_type_fully_connected:
      case xnn_node_type_prelu:
        subgraph->values[node->inputs[0]].fp16_compatible = true;
        subgraph->values[node->outputs[0]].fp16_compatible = true;
        break;
      default:
        for (uint32_t i = 0; i < node->num_inputs; i++) {
          subgraph->values[node->inputs[i]].fp16_compatible = true;
        }
        for (uint32_t o = 0; o < node->num_outputs; o++) {
          subgraph->values[node->outputs[o]].fp16_compatible = true;
        }
        break;
    }
  }

  // Replace FP32 Values in Nodes' inputs/outputs with FP16 Values.
  // FP32 Values that are not external inputs or outputs are converted to FP16 in-place;
  // for external inputs and outputs we create same-shaped FP16 Values and use those instead.
  const uint32_t num_original_values = subgraph->num_values;
  xnn_subgraph_analyze_consumers_and_producers(subgraph);
  for (uint32_t n = 0; n < num_original_values; n++) {
    struct xnn_value* value = &subgraph->values[n];
    value->fp16_id = XNN_INVALID_VALUE_ID;
    value->fp32_id = XNN_INVALID_VALUE_ID;
    if (value->fp16_compatible) {
      assert(value->data == NULL);
      assert(value->datatype == xnn_datatype_fp32);
      if (xnn_value_is_external(value)) {
        struct xnn_value* fp16_value = xnn_subgraph_new_internal_value(subgraph);

        // Recompute the value pointer due to potential reallocation in xnn_subgraph_new_internal_value
        value = &subgraph->values[n];
        xnn_value_copy(fp16_value, value);
        fp16_value->datatype = xnn_datatype_fp16;

        fp16_value->producer = value->producer;
        fp16_value->num_consumers = value->num_consumers;
        fp16_value->first_consumer = value->first_consumer;
        value->producer = XNN_INVALID_NODE_ID;
        value->num_consumers = 0;
        value->first_consumer = XNN_INVALID_NODE_ID;

        // Clear external input/output flags
        fp16_value->flags = 0;
        xnn_log_debug("FP16 rewrite: created FP16 tensor #%" PRIu32 " for FP32 tensor #%" PRIu32, fp16_value->id, n);

        value->fp16_id = fp16_value->id;
        fp16_value->fp32_id = n;
      } else {
        xnn_log_debug("FP16 rewrite: converted FP32 tensor #%" PRIu32 " to FP16", n);
        value->datatype = xnn_datatype_fp16;
      }
    }
  }
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    struct xnn_node* node = &subgraph->nodes[n];
    if (node->type == xnn_node_type_invalid) {
      // Node was fused away, skip.
      continue;
    }

    assert(node->compute_type == xnn_compute_type_fp32);
    node->compute_type = xnn_compute_type_fp16;
    if (node->type == xnn_node_type_static_constant_pad) {
      node->params.static_pad.padding_value =
        fp16_ieee_from_fp32_value(uint32_as_float(node->params.static_pad.padding_value));
    }
    for (uint32_t i = 0; i < node->num_inputs; i++) {
      const uint32_t fp16_id = subgraph->values[node->inputs[i]].fp16_id;
      if (fp16_id != XNN_INVALID_VALUE_ID) {
        assert(subgraph->values[fp16_id].fp32_id == node->inputs[i]);
        node->inputs[i] = fp16_id;
      }
    }
    for (uint32_t o = 0; o < node->num_outputs; o++) {
      const uint32_t fp16_id = subgraph->values[node->outputs[o]].fp16_id;
      if (fp16_id != XNN_INVALID_VALUE_ID) {
        assert(subgraph->values[fp16_id].fp32_id == node->outputs[o]);
        node->outputs[o] = fp16_id;
      }
    }
  }

  // Count the number of external inputs and outputs which require Convert Nodes.
  uint32_t num_external_inputs = 0;
  uint32_t num_external_outputs = 0;
  for (uint32_t n = 0; n < subgraph->num_nodes; n++) {
    const struct xnn_node* node = &subgraph->nodes[n];
    for (uint32_t i = 0; i < node->num_inputs; i++) {
      const struct xnn_value* value = &subgraph->values[node->inputs[i]];
      if (value->fp32_id != XNN_INVALID_VALUE_ID && value->first_consumer == n) {
        assert(value->data == NULL);
        assert(value->datatype == xnn_datatype_fp16);
        assert(subgraph->values[value->fp32_id].datatype == xnn_datatype_fp32);
        // This value isn't necessarily an external input: it could also be an external output of the current
        // subgraph (due to partitioning) that is simultaneously consumed by the current Node.
        if (xnn_value_is_external_input(&subgraph->values[value->fp32_id])) {
          num_external_inputs += 1;
        }
      }
    }
    for (uint32_t o = 0; o < node->num_outputs; o++) {
      const struct xnn_value* value = &subgraph->values[node->outputs[o]];
      if (value->fp32_id != XNN_INVALID_VALUE_ID) {
        assert(value->datatype == xnn_datatype_fp16);
        assert(subgraph->values[value->fp32_id].datatype == xnn_datatype_fp32);
        assert(xnn_value_is_external_output(&subgraph->values[value->fp32_id]));
        num_external_outputs += 1;
      }
    }
  }
  xnn_log_debug("Discovered %" PRIu32 " external inputs and %" PRIu32 " external outputs",
                num_external_inputs, num_external_outputs);

  const uint32_t num_original_nodes = subgraph->num_nodes;
  xnn_subgraph_add_nodes(subgraph, num_external_inputs + num_external_outputs);
  struct xnn_node* output_node = subgraph->nodes + subgraph->num_nodes - 1;
  for (uint32_t n = num_original_nodes; n != 0; n--) {
    const struct xnn_node* node = &subgraph->nodes[n - 1];
    // Insert Convert Nodes for outputs
    for (uint32_t o = 0; o < node->num_outputs; o++) {
      const struct xnn_value* value = &subgraph->values[node->outputs[o]];
      if (value->fp32_id != XNN_INVALID_VALUE_ID) {
        xnn_log_debug("Inserted FP16->FP32 Convert Node from tensor #%" PRIu32 " to tensor #%" PRIu32,
                      value->id, value->fp32_id);
        const uint32_t output_node_id = output_node->id;
        assert(output_node >= subgraph->nodes);
        xnn_node_clear(output_node);
        output_node->id = output_node_id;
        xnn_init_convert_node(output_node, xnn_compute_type_fp16_to_fp32, value->id, value->fp32_id, 0 /* flags */);
        output_node -= 1;
      }
    }
    // Move the Node to its new location
    if (output_node != node) {
      const uint32_t output_node_id = output_node->id;
      assert(output_node >= subgraph->nodes);
      memcpy(output_node, node, sizeof(struct xnn_node));
      output_node->id = output_node_id;
      output_node -= 1;
    }
    // Insert Convert Nodes for inputs
    for (uint32_t i = 0; i < node->num_inputs; i++) {
      const struct xnn_value* value = &subgraph->values[node->inputs[i]];
      if (value->fp32_id != XNN_INVALID_VALUE_ID && value->first_consumer == n - 1) {
        // Only insert a Convert Node if the Value actually is an external input. The Value could instead be an
        // external output; in that case a Convert Node was already inserted in the output loop above.
        if (xnn_value_is_external_input(&subgraph->values[value->fp32_id])) {
          xnn_log_debug("Inserted FP32->FP16 Convert Node from tensor #%" PRIu32 " to tensor #%" PRIu32,
                        value->fp32_id, value->id);
          const uint32_t output_node_id = output_node->id;
          assert(output_node >= subgraph->nodes);
          xnn_node_clear(output_node);
          output_node->id = output_node_id;
          xnn_init_convert_node(output_node, xnn_compute_type_fp32_to_fp16, value->fp32_id, value->id, 0 /* flags */);
          output_node -= 1;
        }
      }
    }
  }

  return true;
}
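
// For example (illustrative): after this rewrite a one-node subgraph
//   external FP32 input #0 -> Convolution -> external FP32 output #1
// becomes
//   #0 -> [Convert FP32->FP16] -> internal FP16 #2 -> Convolution (FP16)
//      -> internal FP16 #3 -> [Convert FP16->FP32] -> #1
// so the external interface stays FP32 while the compute runs in FP16.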

enum xnn_status xnn_subgraph_fusion(
    xnn_subgraph_t subgraph)
{
  // Fuse Nodes where possible
  for (uint32_t i = 0; i < subgraph->num_values; i++) {
    struct xnn_value* value = &subgraph->values[i];
    if (value->num_consumers == 1) {
      const uint32_t producer_id = value->producer;
      if (producer_id == XNN_INVALID_NODE_ID) {
        continue;
      }
      assert(producer_id < subgraph->num_nodes);

      const uint32_t consumer_id = value->first_consumer;
      if (consumer_id == XNN_INVALID_NODE_ID) {
        continue;
      }
      assert(consumer_id < subgraph->num_nodes);

      struct xnn_node* producer = &subgraph->nodes[producer_id];
      assert(producer->type != xnn_node_type_invalid);
      struct xnn_node* consumer = &subgraph->nodes[consumer_id];
      assert(consumer->type != xnn_node_type_invalid);

      // Try to fuse Clamp Node upstream into producer Node
      if (consumer->type == xnn_node_type_clamp) {
        switch (producer->type) {
          case xnn_node_type_add2:
          case xnn_node_type_average_pooling_2d:
          case xnn_node_type_clamp:
          case xnn_node_type_convolution_2d:
          case xnn_node_type_divide:
          case xnn_node_type_deconvolution_2d:
          case xnn_node_type_depthwise_convolution_2d:
          case xnn_node_type_fully_connected:
          case xnn_node_type_multiply2:
          case xnn_node_type_max_pooling_2d:
          case xnn_node_type_subtract:
          {
            xnn_log_info("fuse Clamp Node #%" PRIu32 " into upstream Node #%" PRIu32, consumer_id, producer_id);
            assert(producer->num_outputs == 1);
            assert(consumer->num_inputs == 1);
            assert(consumer->num_outputs == 1);

            const uint32_t fused_output_id = consumer->outputs[0];
            assert(fused_output_id < subgraph->num_values);
            subgraph->values[fused_output_id].producer = producer_id;
            producer->outputs[0] = fused_output_id;

            producer->activation.output_min =
              math_max_f32(producer->activation.output_min, consumer->activation.output_min);
            producer->activation.output_max =
              math_min_f32(producer->activation.output_max, consumer->activation.output_max);

            xnn_node_clear(consumer);
            xnn_value_clear(value);
            break;
          }
          default:
            break;
        }
      }
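
      // For example (illustrative): a Convolution with activation range
      // [0.0f, +inf) followed by a Clamp to [-1.0f, 6.0f] fuses, via the
      // min/max merge above, into a single Convolution with activation range
      // [0.0f, 6.0f], and the intermediate Value is cleared.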
      // Try to fuse Constant Pad Node downstream into [Depthwise] Convolution 2D Node
      if (producer->type == xnn_node_type_static_constant_pad) {
        assert(producer->num_inputs == 1);
        assert(producer->num_outputs == 1);
        const bool is_spatial_2d_padding = value->shape.num_dims == 4 &&
          (producer->params.static_pad.pre_paddings[0] | producer->params.static_pad.post_paddings[0] |
           producer->params.static_pad.pre_paddings[3] | producer->params.static_pad.post_paddings[3]) == 0;
        const enum xnn_datatype padding_datatype = subgraph->values[producer->outputs[0]].datatype;
        const uint32_t padding_value = producer->params.static_pad.padding_value;
        const bool is_zero_padding =
          (padding_datatype == xnn_datatype_fp32 && padding_value == 0) ||
          ((padding_datatype == xnn_datatype_qint8 || padding_datatype == xnn_datatype_quint8) &&
           padding_value == (uint32_t) (uint8_t) subgraph->values[producer->outputs[0]].quantization.zero_point);
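        // For example (illustrative): a Constant Pad over an NHWC tensor with
        // pre/post paddings of 1 on the H and W dimensions only, and a 0.0f
        // padding value, qualifies as zero-valued spatial 2D padding and is
        // folded into the consumer's input_padding fields below.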
        switch (consumer->type) {
          case xnn_node_type_convolution_2d:
            if (is_spatial_2d_padding && is_zero_padding && !(consumer->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING)) {
              xnn_log_info("fuse Constant Pad Node #%" PRIu32 " into Convolution 2D Node #%" PRIu32,
                           producer_id, consumer_id);
              assert(consumer->num_inputs >= 1);
              assert(consumer->inputs[0] == producer->outputs[0]);

              consumer->params.convolution_2d.input_padding_top += producer->params.static_pad.pre_paddings[1];
              consumer->params.convolution_2d.input_padding_right += producer->params.static_pad.post_paddings[2];
              consumer->params.convolution_2d.input_padding_bottom += producer->params.static_pad.post_paddings[1];
              consumer->params.convolution_2d.input_padding_left += producer->params.static_pad.pre_paddings[2];

              consumer->inputs[0] = producer->inputs[0];

              const uint32_t fused_input_id = producer->inputs[0];
              assert(fused_input_id < subgraph->num_values);
              if (subgraph->values[fused_input_id].first_consumer == producer_id) {
                subgraph->values[fused_input_id].first_consumer = consumer_id;
              }

              xnn_node_clear(producer);
              xnn_value_clear(value);
            }
            break;
          case xnn_node_type_depthwise_convolution_2d:
            if (is_spatial_2d_padding && is_zero_padding && !(consumer->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING)) {
              xnn_log_info("fuse Constant Pad Node #%" PRIu32 " into Depthwise Convolution 2D Node #%" PRIu32,
                           producer_id, consumer_id);
              assert(consumer->num_inputs >= 1);
              assert(consumer->inputs[0] == producer->outputs[0]);

              consumer->params.depthwise_convolution_2d.input_padding_top +=
                producer->params.static_pad.pre_paddings[1];
              consumer->params.depthwise_convolution_2d.input_padding_right +=
                producer->params.static_pad.post_paddings[2];
              consumer->params.depthwise_convolution_2d.input_padding_bottom +=
                producer->params.static_pad.post_paddings[1];
              consumer->params.depthwise_convolution_2d.input_padding_left +=
                producer->params.static_pad.pre_paddings[2];

              consumer->inputs[0] = producer->inputs[0];

              const uint32_t fused_input_id = producer->inputs[0];
              assert(fused_input_id < subgraph->num_values);
              if (subgraph->values[fused_input_id].first_consumer == producer_id) {
                subgraph->values[fused_input_id].first_consumer = consumer_id;
              }

              xnn_node_clear(producer);
              xnn_value_clear(value);
            }
            break;
          default:
            break;
        }
      }
    }
  }

  return xnn_status_success;
}

enum xnn_status xnn_subgraph_optimize(
    xnn_subgraph_t subgraph,
    uint32_t flags)
{
  xnn_subgraph_analyze_consumers_and_producers(subgraph);

  // Remove unreferenced values.
  for (uint32_t i = 0; i < subgraph->num_values; i++) {
    struct xnn_value* value = &subgraph->values[i];
    if (value->type == xnn_value_type_invalid) {
      continue;
    }

    if (!xnn_value_is_external_input(value) && value->num_consumers == 0 && !xnn_value_is_persistent(value)) {
      xnn_value_clear(value);
    }
  }

  if (!(flags & XNN_FLAG_NO_OPERATOR_FUSION)) {
    xnn_subgraph_fusion(subgraph);
  }

#if XNN_ENABLE_SPARSE
  if ((flags & XNN_FLAG_HINT_SPARSE_INFERENCE) && (xnn_params.init_flags & XNN_INIT_FLAG_CHW_OPT)) {
    xnn_subgraph_rewrite_for_nchw(subgraph);
  }
#endif

  if ((flags & XNN_FLAG_FORCE_FP16_INFERENCE) && !(xnn_params.init_flags & XNN_INIT_FLAG_F16)) {
    xnn_log_error("failed to force FP16 inference: hardware supports neither native nor emulated FP16 operators");
    return xnn_status_unsupported_hardware;
  }
#ifndef XNN_NO_F16_OPERATORS
  const bool try_native_fp16 =
    (flags & XNN_FLAG_HINT_FP16_INFERENCE) && (xnn_params.init_flags & XNN_INIT_FLAG_F16_NATIVE);
  const bool force_fp16 = (flags & XNN_FLAG_FORCE_FP16_INFERENCE);
  if (try_native_fp16 || force_fp16) {
    const bool fp16_rewrite_succeeded = xnn_subgraph_rewrite_for_fp16(subgraph);
    if (force_fp16 && !fp16_rewrite_succeeded) {
      xnn_log_error("failed to force FP16 inference: subgraph is incompatible with FP16 operators");
      return xnn_status_unsupported_parameter;
    }
  }
#endif  // XNN_NO_F16_OPERATORS

  return xnn_status_success;
}
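
// Illustrative usage sketch (assumes the internal declarations from
// xnnpack/subgraph.h): clients normally reach this entry point through
// runtime creation, but it can be exercised directly, e.g.:
//
//   // Fuse operators and, if hinted and supported, rewrite for FP16:
//   enum xnn_status status =
//     xnn_subgraph_optimize(subgraph, XNN_FLAG_HINT_FP16_INFERENCE);
//   // Or request sparse (NCHW) rewriting instead:
//   status = xnn_subgraph_optimize(subgraph, XNN_FLAG_HINT_SPARSE_INFERENCE);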

enum xnn_status xnn_delete_subgraph(
  xnn_subgraph_t subgraph)
{
  if (subgraph != NULL) {
    memset(subgraph->nodes, 0, sizeof(struct xnn_node) * subgraph->num_nodes);
    xnn_release_memory(subgraph->nodes);

    memset(subgraph->values, 0, sizeof(struct xnn_value) * subgraph->num_values);
    xnn_release_memory(subgraph->values);

    memset(subgraph, 0, sizeof(struct xnn_subgraph));
    xnn_release_memory(subgraph);
  }
  return xnn_status_success;
}