1 | // Copyright 2020 Google LLC |
2 | // |
3 | // This source code is licensed under the BSD-style license found in the |
4 | // LICENSE file in the root directory of this source tree. |
5 | |
#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
10 | |
11 | #include <fp16.h> |
12 | |
13 | #include <xnnpack.h> |
14 | #include <xnnpack/allocator.h> |
15 | #include <xnnpack/log.h> |
16 | #include <xnnpack/math.h> |
17 | #include <xnnpack/node-type.h> |
18 | #include <xnnpack/params.h> |
19 | #include <xnnpack/subgraph.h> |
20 | |
21 | |
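// XNN_ENABLE_SPARSE must always be defined (as 0 or 1) by the build system: the sparse
// rewrite in xnn_subgraph_optimize below is guarded by #if XNN_ENABLE_SPARSE, which would
// silently evaluate to 0 if the macro were left undefined.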
22 | #ifndef XNN_ENABLE_SPARSE |
23 | #error "XNN_ENABLE_SPARSE not defined" |
24 | #endif |
25 | |
26 | enum xnn_status xnn_create_subgraph( |
27 | uint32_t external_value_ids, |
28 | uint32_t flags, |
29 | xnn_subgraph_t* subgraph_out) |
30 | { |
31 | struct xnn_subgraph* subgraph = NULL; |
32 | enum xnn_status status = xnn_status_uninitialized; |
33 | |
34 | if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { |
35 | xnn_log_error("failed to create subgraph: XNNPACK is not initialized" ); |
36 | goto error; |
37 | } |
38 | |
39 | status = xnn_status_out_of_memory; |
40 | |
41 | subgraph = xnn_allocate_zero_memory(sizeof(struct xnn_subgraph)); |
42 | if (subgraph == NULL) { |
43 | xnn_log_error("failed to allocate %zu bytes for subgraph descriptor" , sizeof(struct xnn_subgraph)); |
44 | goto error; |
45 | } |
46 | |
47 | subgraph->external_value_ids = external_value_ids; |
48 | |
49 | subgraph->values = xnn_allocate_zero_memory(external_value_ids * sizeof(struct xnn_value)); |
50 | if (subgraph->values == NULL) { |
51 | xnn_log_error("failed to allocate %zu bytes for subgraph values" , |
52 | (size_t) external_value_ids * sizeof(struct xnn_value)); |
53 | goto error; |
54 | } |
55 | for (size_t i = 0; i < external_value_ids; i++) { |
56 | subgraph->values[i].id = i; |
57 | } |
58 | subgraph->num_values = external_value_ids; |
59 | subgraph->num_reserved_values = external_value_ids; |
60 | |
61 | *subgraph_out = subgraph; |
62 | return xnn_status_success; |
63 | |
64 | error: |
65 | xnn_delete_subgraph(subgraph); |
66 | return status; |
67 | } |
68 | |
69 | |
70 | struct xnn_value* xnn_subgraph_new_internal_value(xnn_subgraph_t subgraph) |
71 | { |
72 | struct xnn_value* values = subgraph->values; |
73 | const size_t size = subgraph->num_values; |
74 | const size_t capacity = subgraph->num_reserved_values; |
75 | if (capacity < size + 1) { |
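    // Growth policy: double the capacity, but grow by at least 64 and at most 512 Values at a time.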
76 | const size_t new_capacity = max(min(capacity * 2, capacity + 512), capacity + 64); |
77 | assert(new_capacity >= size + 1); |
78 | values = xnn_reallocate_memory(values, new_capacity * sizeof(struct xnn_value)); |
79 | if (values == NULL) { |
80 | xnn_log_error("failed to allocate %zu bytes for subgraph values" , |
81 | capacity * sizeof(struct xnn_value)); |
82 | return values; |
83 | } |
84 | |
85 | memset(values + size, 0, (new_capacity - size) * sizeof(struct xnn_value)); |
86 | subgraph->num_reserved_values = new_capacity; |
87 | subgraph->values = values; |
88 | } |
89 | subgraph->num_values = size + 1; |
90 | struct xnn_value* new_value = values + size; |
91 | new_value->id = size; |
92 | return new_value; |
93 | } |
94 | |
95 | void xnn_node_clear(struct xnn_node* node) { |
96 | assert(node != NULL); |
97 | memset(node, 0, sizeof(struct xnn_node)); |
98 | } |
99 | |
100 | void xnn_value_clear(struct xnn_value* value) { |
101 | assert(value != NULL); |
102 | memset(value, 0, sizeof(struct xnn_value)); |
103 | } |
104 | |
105 | void xnn_value_copy( |
106 | struct xnn_value* dst_value, |
107 | const struct xnn_value* src_value) |
108 | { |
109 | // Note: Value ID stays unchanged |
110 | |
111 | dst_value->type = src_value->type; |
112 | dst_value->datatype = src_value->datatype; |
113 | dst_value->quantization = src_value->quantization; |
114 | dst_value->shape = src_value->shape; |
115 | dst_value->flags = src_value->flags; |
116 | dst_value->data = src_value->data; |
117 | dst_value->producer = src_value->producer; |
118 | dst_value->first_consumer = src_value->first_consumer; |
119 | } |
120 | |
121 | struct xnn_node* xnn_subgraph_new_node(xnn_subgraph_t subgraph) |
122 | { |
123 | struct xnn_node* nodes = subgraph->nodes; |
124 | const size_t size = subgraph->num_nodes; |
125 | const size_t capacity = subgraph->num_reserved_nodes; |
126 | |
127 | if (capacity < size + 1) { |
128 | const size_t new_capacity = max(min(capacity * 2, capacity + 512), capacity + 64); |
129 | assert(new_capacity >= size + 1); |
130 | nodes = xnn_reallocate_memory(nodes, new_capacity * sizeof(struct xnn_node)); |
131 | if (nodes == NULL) { |
132 | xnn_log_error("failed to allocate %zu bytes for subgraph nodes" , |
133 | capacity * sizeof(struct xnn_node)); |
134 | return nodes; |
135 | } |
136 | |
137 | memset(nodes + size, 0, (new_capacity - size) * sizeof(struct xnn_node)); |
138 | subgraph->num_reserved_nodes = new_capacity; |
139 | subgraph->nodes = nodes; |
140 | } |
141 | subgraph->num_nodes = size + 1; |
142 | struct xnn_node* new_node = nodes + size; |
143 | new_node->id = size; |
144 | return new_node; |
145 | } |
146 | |
147 | void xnn_subgraph_add_nodes(xnn_subgraph_t subgraph, size_t num_nodes) |
148 | { |
149 | struct xnn_node* nodes = subgraph->nodes; |
150 | const size_t size = subgraph->num_nodes; |
151 | const size_t capacity = subgraph->num_reserved_nodes; |
152 | |
153 | if (capacity < size + num_nodes) { |
154 | const size_t new_capacity = max(min(capacity * 2, capacity + 512), capacity + max(num_nodes, 64)); |
155 | assert(new_capacity >= size + num_nodes); |
156 | nodes = xnn_reallocate_memory(nodes, new_capacity * sizeof(struct xnn_node)); |
157 | if (nodes == NULL) { |
158 | xnn_log_error("failed to allocate %zu bytes for subgraph nodes" , |
159 | capacity * sizeof(struct xnn_node)); |
160 | return; |
161 | } |
162 | |
163 | memset(nodes + size, 0, (new_capacity - size) * sizeof(struct xnn_node)); |
164 | subgraph->num_reserved_nodes = new_capacity; |
165 | subgraph->nodes = nodes; |
166 | } |
167 | subgraph->num_nodes = size + num_nodes; |
168 | struct xnn_node* new_nodes = nodes + size; |
169 | for (size_t i = 0; i < num_nodes; i++) { |
170 | new_nodes[i].id = size + i; |
171 | } |
172 | } |
173 | |
174 | void xnn_subgraph_analyze_consumers_and_producers(xnn_subgraph_t subgraph) |
175 | { |
176 | // Initialize producer/consumer fields to safe defaults. |
177 | for (uint32_t i = 0; i < subgraph->num_values; i++) { |
178 | struct xnn_value* value = &subgraph->values[i]; |
179 | value->producer = XNN_INVALID_NODE_ID; |
180 | value->first_consumer = XNN_INVALID_NODE_ID; |
181 | value->num_consumers = 0; |
182 | } |
183 | |
  // Analyze Nodes' inputs and outputs and update Values' producer/consumer fields.
185 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
186 | struct xnn_node* node = &subgraph->nodes[n]; |
187 | |
188 | for (uint32_t i = 0; i < node->num_inputs; i++) { |
189 | const uint32_t input_id = node->inputs[i]; |
190 | assert(input_id < subgraph->num_values); |
191 | |
192 | if (subgraph->values[input_id].num_consumers++ == 0) { |
193 | assert(subgraph->values[input_id].first_consumer == XNN_INVALID_NODE_ID); |
194 | subgraph->values[input_id].first_consumer = n; |
195 | } |
196 | } |
197 | |
198 | for (uint32_t o = 0; o < node->num_outputs; o++) { |
199 | const uint32_t output_id = node->outputs[o]; |
200 | assert(output_id < subgraph->num_values); |
201 | |
202 | // Persistent values can be produced by multiple nodes, e.g. copy nodes writing to the same persistent value. |
203 | assert(xnn_value_is_persistent(&subgraph->values[output_id]) || |
204 | subgraph->values[output_id].producer == XNN_INVALID_NODE_ID); |
205 | subgraph->values[output_id].producer = n; |
206 | } |
207 | } |
208 | |
  // Count an extra consumer for Values which are external outputs.
211 | for (uint32_t i = 0; i < subgraph->num_values; i++) { |
212 | struct xnn_value* value = &subgraph->values[i]; |
213 | if (xnn_value_is_external_output(value)) { |
214 | value->num_consumers += 1; |
215 | } |
216 | } |
217 | } |
218 | |
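// Per-Node layout flags used by the NCHW rewrite below:
// - COMPATIBLE_NCHW: the Node can be computed directly in NCHW layout.
// - COMPATIBLE_NHWC2NCHW: the Node can consume NHWC inputs and produce NCHW outputs (a potential cluster entry).
// - COMPATIBLE_NCHW2NHWC: the Node can consume NCHW inputs and produce NHWC outputs (a potential cluster exit).
// - INCOMPATIBLE_CLUSTER: the Node belongs to a cluster that cannot be rewritten to NCHW.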
219 | #define XNN_LAYOUT_FLAG_COMPATIBLE_NCHW 1 |
220 | #define XNN_LAYOUT_FLAG_COMPATIBLE_NHWC2NCHW 2 |
221 | #define XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC 4 |
222 | #define XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER 8 |
223 | |
224 | uint32_t xnn_check_nchw_compatibility(xnn_subgraph_t subgraph, struct xnn_node* node) { |
225 | if (node->compute_type != xnn_compute_type_fp16 && node->compute_type != xnn_compute_type_fp32) { |
226 | if (node->type != xnn_node_type_invalid) { |
227 | xnn_log_info( |
228 | "Node %s compute type %d is incompatible with sparse inference" , |
229 | xnn_node_type_to_string(node->type), node->compute_type); |
230 | } |
231 | return 0; |
232 | } |
233 | |
234 | switch (node->type) { |
235 | case xnn_node_type_convolution_2d: |
236 | // Supported cases: |
237 | // - 1x1 convolution (no stride, no dilation, no padding, no groups) |
238 | // - 3x3 stride-2 convolution (no dilation, padding 1 on each side, no groups, 3 input channels) |
239 | if (node->params.convolution_2d.groups != 1) { |
240 | xnn_log_info("Node %s groups (%" PRIu32 ") " |
241 | "is incompatible with sparse inference" , |
242 | xnn_node_type_to_string(node->type), |
243 | node->params.convolution_2d.groups); |
244 | return 0; |
245 | } |
246 | if ((node->params.convolution_2d.dilation_height | node->params.convolution_2d.dilation_width) != 1) { |
247 | xnn_log_info("Node %s dilation (height=%" PRIu32 ", width=%" PRIu32 ") " |
248 | "is incompatible with sparse inference" , |
249 | xnn_node_type_to_string(node->type), |
250 | node->params.convolution_2d.dilation_height, |
251 | node->params.convolution_2d.dilation_width); |
252 | return 0; |
253 | } |
254 | if ((node->params.convolution_2d.kernel_height | node->params.convolution_2d.kernel_width) == 1) { |
255 | if ((node->params.convolution_2d.input_padding_top | node->params.convolution_2d.input_padding_right | |
256 | node->params.convolution_2d.input_padding_bottom | node->params.convolution_2d.input_padding_left) != 0) { |
257 | xnn_log_info("Node %s (1x1 kernel) padding (top=%" PRIu32 ", right=%" PRIu32", bottom=%" PRIu32 ", left=%" PRIu32") " |
258 | "is incompatible with sparse inference" , |
259 | xnn_node_type_to_string(node->type), |
260 | node->params.convolution_2d.input_padding_top, |
261 | node->params.convolution_2d.input_padding_right, |
262 | node->params.convolution_2d.input_padding_bottom, |
263 | node->params.convolution_2d.input_padding_left); |
264 | return 0; |
265 | } |
266 | if ((node->params.convolution_2d.subsampling_height | node->params.convolution_2d.subsampling_width) != 1) { |
267 | xnn_log_info("Node %s (1x1 kernel) subsampling (height=%" PRIu32 ", width=%" PRIu32 ") " |
268 | "is incompatible with sparse inference" , |
269 | xnn_node_type_to_string(node->type), |
270 | node->params.convolution_2d.subsampling_height, |
271 | node->params.convolution_2d.subsampling_width); |
272 | return 0; |
273 | } |
274 | return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW; |
275 | } else if (node->params.convolution_2d.kernel_height == 3 && node->params.convolution_2d.kernel_width == 3) { |
276 | if (node->params.convolution_2d.input_padding_top != 1 || node->params.convolution_2d.input_padding_right != 1 || |
277 | node->params.convolution_2d.input_padding_bottom != 1 || node->params.convolution_2d.input_padding_left != 1) { |
278 | xnn_log_info("Node %s (3x3 kernel) padding (top=%" PRIu32 ", right=%" PRIu32 ", bottom=%" PRIu32 ", left=%" PRIu32 ") " |
279 | "is incompatible with sparse inference" , |
280 | xnn_node_type_to_string(node->type), |
281 | node->params.convolution_2d.input_padding_top, |
282 | node->params.convolution_2d.input_padding_right, |
283 | node->params.convolution_2d.input_padding_bottom, |
284 | node->params.convolution_2d.input_padding_left); |
285 | return 0; |
286 | } |
287 | if ((node->params.convolution_2d.subsampling_height | node->params.convolution_2d.subsampling_width) != 2) { |
288 | xnn_log_info("Node %s (3x3 kernel) subsampling (height=%" PRIu32 ", width=%" PRIu32 ") " |
289 | "is incompatible with sparse inference" , |
290 | xnn_node_type_to_string(node->type), |
291 | node->params.convolution_2d.subsampling_height, |
292 | node->params.convolution_2d.subsampling_width); |
293 | return 0; |
294 | } |
295 | if (node->params.convolution_2d.group_input_channels != 3) { |
296 | xnn_log_info("Node %s (3x3 kernel) input channels (%zu) " |
297 | "is incompatible with sparse inference" , |
298 | xnn_node_type_to_string(node->type), |
299 | node->params.convolution_2d.group_input_channels); |
300 | return 0; |
301 | } |
302 | return XNN_LAYOUT_FLAG_COMPATIBLE_NHWC2NCHW; |
303 | } |
304 | return 0; |
305 | case xnn_node_type_depthwise_convolution_2d: |
306 | // Supported cases: |
307 | // - 3x3 stride-1 convolution (no dilation, padding 1 on each side) |
308 | // - 3x3 stride-2 convolution (no dilation, padding 1 on each side) |
309 | // - 5x5 stride-1 convolution (no dilation, padding 2 on each side) |
310 | // - 5x5 stride-2 convolution (no dilation, padding 2 on each side) |
311 | if ((node->params.depthwise_convolution_2d.dilation_height | node->params.depthwise_convolution_2d.dilation_width) != 1) { |
312 | xnn_log_info("Node %s dilation (height=%" PRIu32 ", width=%" PRIu32 ") " |
313 | "is incompatible with sparse inference" , |
314 | xnn_node_type_to_string(node->type), |
315 | node->params.convolution_2d.dilation_height, |
316 | node->params.convolution_2d.dilation_width); |
317 | return 0; |
318 | } |
319 | if (node->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) { |
320 | xnn_log_info("Node %s flags (%" PRIu32 ") has padding incompatible with sparse inference" , |
321 | xnn_node_type_to_string(node->type), |
322 | node->flags); |
323 | return 0; |
324 | } |
325 | if (node->params.depthwise_convolution_2d.depth_multiplier != 1) { |
326 | xnn_log_info("Node %s depth_multiplier (%" PRIu32 ") is incompatible with sparse inference" , |
327 | xnn_node_type_to_string(node->type), |
328 | node->params.depthwise_convolution_2d.depth_multiplier); |
329 | return 0; |
330 | } |
331 | if (node->params.depthwise_convolution_2d.subsampling_height != node->params.depthwise_convolution_2d.subsampling_width) { |
332 | xnn_log_info("Node %s subsampling (height=%" PRIu32 ", width=%" PRIu32 ") " |
333 | "is incompatible with sparse inference" , |
334 | xnn_node_type_to_string(node->type), |
335 | node->params.depthwise_convolution_2d.subsampling_height, |
336 | node->params.depthwise_convolution_2d.subsampling_width); |
337 | return 0; |
338 | } |
339 | switch (node->params.depthwise_convolution_2d.subsampling_height) { |
340 | case 1: |
341 | case 2: |
342 | break; |
343 | default: |
344 | xnn_log_info("Node %s subsampling_height (%" PRIu32 ") " |
345 | "is incompatible with sparse inference" , |
346 | xnn_node_type_to_string(node->type), |
347 | node->params.depthwise_convolution_2d.subsampling_height); |
348 | return 0; |
349 | } |
350 | if (node->params.depthwise_convolution_2d.kernel_height != node->params.depthwise_convolution_2d.kernel_width) { |
351 | xnn_log_info("Node %s kernel (height=%" PRIu32 ", width=%" PRIu32 ") " |
352 | "is incompatible with sparse inference" , |
353 | xnn_node_type_to_string(node->type), |
354 | node->params.depthwise_convolution_2d.kernel_height, |
355 | node->params.depthwise_convolution_2d.kernel_width); |
356 | return 0; |
357 | } |
358 | switch (node->params.depthwise_convolution_2d.kernel_height) { |
359 | case 3: |
360 | if (node->params.depthwise_convolution_2d.input_padding_top == 1 && |
361 | node->params.depthwise_convolution_2d.input_padding_right == 1 && |
362 | node->params.depthwise_convolution_2d.input_padding_bottom == 1 && |
363 | node->params.depthwise_convolution_2d.input_padding_left == 1) { |
364 | return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW; |
365 | } else { |
366 | xnn_log_info("Node %s (3x3 kernel) padding " |
367 | "(top=%" PRIu32 ", right=%" PRIu32 ", bottom=%" PRIu32 ", left=%" PRIu32 ") " |
368 | "is incompatible with sparse inference" , |
369 | xnn_node_type_to_string(node->type), |
370 | node->params.depthwise_convolution_2d.input_padding_top, |
371 | node->params.depthwise_convolution_2d.input_padding_right, |
372 | node->params.depthwise_convolution_2d.input_padding_bottom, |
373 | node->params.depthwise_convolution_2d.input_padding_left); |
374 | return 0; |
375 | } |
376 | case 5: |
377 | if (node->params.depthwise_convolution_2d.input_padding_top == 2 && |
378 | node->params.depthwise_convolution_2d.input_padding_right == 2 && |
379 | node->params.depthwise_convolution_2d.input_padding_bottom == 2 && |
380 | node->params.depthwise_convolution_2d.input_padding_left == 2) { |
381 | return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW; |
382 | } else { |
383 | xnn_log_info("Node %s (5x5 kernel) padding " |
384 | "(top=%" PRIu32 ", right=%" PRIu32 ", bottom=%" PRIu32 ", left=%" PRIu32 ") " |
385 | "is incompatible with sparse inference" , |
386 | xnn_node_type_to_string(node->type), |
387 | node->params.depthwise_convolution_2d.input_padding_top, |
388 | node->params.depthwise_convolution_2d.input_padding_right, |
389 | node->params.depthwise_convolution_2d.input_padding_bottom, |
390 | node->params.depthwise_convolution_2d.input_padding_left); |
391 | return 0; |
392 | } |
393 | default: |
394 | return 0; |
395 | } |
396 | case xnn_node_type_depth_to_space: |
397 | return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC; |
398 | case xnn_node_type_global_average_pooling_2d: |
399 | return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC; |
400 | case xnn_node_type_add2: |
401 | case xnn_node_type_multiply2: |
402 | assert(node->num_inputs == 2); |
403 | assert(node->num_outputs == 1); |
404 | if (subgraph->values[node->inputs[0]].shape.num_dims != 4 || |
405 | subgraph->values[node->inputs[1]].shape.num_dims != 4) |
406 | { |
407 | xnn_log_info("Node %s inputs shape is incompatible with sparse inference" , |
408 | xnn_node_type_to_string(node->type)); |
409 | return 0; |
410 | } |
411 | |
412 | if (subgraph->values[node->inputs[0]].data != NULL) { |
413 | // Check that the first input is representable as either a scalar, or a vector |
414 | size_t num_nonunit_dims = 0; |
415 | for (uint32_t i = 0; i < subgraph->values[node->inputs[0]].shape.num_dims; i++) { |
416 | if (subgraph->values[node->inputs[0]].shape.dim[i] != 1) { |
417 | num_nonunit_dims += 1; |
418 | } |
419 | } |
420 | if (num_nonunit_dims > 1) { |
421 | return 0; |
422 | } |
423 | } |
424 | |
425 | if (subgraph->values[node->inputs[1]].data != NULL) { |
426 | // Check that the second input is representable as either a scalar, or a vector |
427 | size_t num_nonunit_dims = 0; |
        for (uint32_t i = 0; i < subgraph->values[node->inputs[1]].shape.num_dims; i++) {
          if (subgraph->values[node->inputs[1]].shape.dim[i] != 1) {
430 | num_nonunit_dims += 1; |
431 | } |
432 | } |
433 | if (num_nonunit_dims > 1) { |
434 | return 0; |
435 | } |
436 | } |
437 | |
438 | return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW; |
439 | case xnn_node_type_static_resize_bilinear_2d: |
440 | if (subgraph->values[node->inputs[0]].shape.dim[1] > 1 && |
441 | subgraph->values[node->inputs[0]].shape.dim[2] > 1) { |
442 | return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW; |
443 | } else { |
444 | xnn_log_info("Node %s inputs shape is incompatible with sparse inference" , |
445 | xnn_node_type_to_string(node->type)); |
446 | return 0; |
447 | } |
448 | case xnn_node_type_abs: |
449 | case xnn_node_type_bankers_rounding: |
450 | case xnn_node_type_ceiling: |
451 | case xnn_node_type_clamp: |
452 | case xnn_node_type_elu: |
453 | case xnn_node_type_floor: |
454 | case xnn_node_type_hardswish: |
455 | case xnn_node_type_leaky_relu: |
456 | case xnn_node_type_negate: |
457 | case xnn_node_type_sigmoid: |
458 | case xnn_node_type_square: |
459 | assert(node->num_inputs == 1); |
460 | assert(node->num_outputs == 1); |
461 | if (subgraph->values[node->inputs[0]].shape.num_dims == 4) { |
462 | return XNN_LAYOUT_FLAG_COMPATIBLE_NCHW; |
463 | } else { |
464 | xnn_log_info("Node %s inputs shape is incompatible with sparse inference" , |
465 | xnn_node_type_to_string(node->type)); |
466 | return 0; |
467 | } |
468 | default: |
      return 0;
470 | } |
471 | } |
472 | |
473 | void xnn_subgraph_rewrite_for_nchw(xnn_subgraph_t subgraph) |
474 | { |
475 | // Convert parts of the subgraph to NCHW for sparse inference |
476 | // Step 1: detect NCHW-compatible Nodes |
477 | // Step 2: detect NCHW-compatible clusters (run connected components graph algorithm) |
478 | // Step 3: check that all NCHW-compatible Values are consumed only by NCHW-compatible Nodes |
479 | // Step 4: switch Values' layout to NCHW |
480 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
481 | struct xnn_node* node = &subgraph->nodes[n]; |
482 | node->layout_flags = xnn_check_nchw_compatibility(subgraph, node); |
483 | xnn_log_debug("Node #%" PRIu32 ": %s (NCHW: %s, NHWC->NCHW: %s, NCHW->NHWC: %s)" , |
484 | n, xnn_node_type_to_string(node->type), |
485 | node->layout_flags & XNN_LAYOUT_FLAG_COMPATIBLE_NCHW ? "yes" : "no" , |
486 | node->layout_flags & XNN_LAYOUT_FLAG_COMPATIBLE_NHWC2NCHW ? "yes" : "no" , |
487 | node->layout_flags & XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC ? "yes" : "no" ); |
488 | } |
489 | |
  // Run the Shiloach-Vishkin connected components algorithm: find all
  // XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC Nodes and propagate their cluster
  // leadership to all of their producer Nodes.
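  // E.g. in a chain  Conv 1x1 (NCHW-compatible) ---> Global Average Pooling (NCHW2NHWC-compatible),
  // visiting the pooling Node merges it and its Conv producer under a common cluster leader.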
493 | bool update = false; |
494 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
495 | struct xnn_node* node = &subgraph->nodes[n]; |
496 | node->cluster_leader = n; |
497 | if (node->layout_flags & XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC) { |
498 | for (uint32_t i = 0; i < node->num_inputs; i++) { |
499 | const struct xnn_value* value = &subgraph->values[node->inputs[i]]; |
500 | if (value->data != NULL) { |
501 | // Static data, skip this input value. Compatibility of this static input with NCHW layout was validated |
502 | // during the initial NCHW compatibility check for the Node. |
503 | continue; |
504 | } |
505 | if (xnn_value_is_external(value)) { |
506 | // External value, invalid cluster |
507 | node->layout_flags |= XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER; |
508 | continue; |
509 | } |
510 | const uint32_t producer_id = value->producer; |
511 | assert(producer_id != XNN_INVALID_NODE_ID); |
512 | assert(producer_id < n); |
513 | struct xnn_node* producer_node = &subgraph->nodes[producer_id]; |
514 | if ((producer_node->layout_flags & (XNN_LAYOUT_FLAG_COMPATIBLE_NHWC2NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW)) != 0 && |
515 | (producer_node->layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) == 0) |
516 | { |
517 | producer_node->layout_flags &= ~XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC; |
518 | if (producer_node->cluster_leader != node->cluster_leader) { |
519 | producer_node->cluster_leader = node->cluster_leader = math_max_u32(producer_node->cluster_leader, node->cluster_leader); |
520 | update = true; |
521 | } |
522 | } else { |
523 | node->layout_flags |= XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER; |
524 | } |
525 | } |
526 | } |
527 | } |
  // If no NCHW2NHWC-compatible Nodes were found, no clusters can form, so the
  // graph rewrite cannot happen and we can return early.
530 | if (!update) { |
531 | return; |
532 | } |
  // Propagate the cluster leaders through the graph until no Node's cluster
  // assignment changes, i.e. until a fixed point is reached.
535 | while (update) { |
536 | update = false; |
537 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
538 | struct xnn_node* node = &subgraph->nodes[n]; |
539 | if (node->layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) { |
540 | continue; |
541 | } |
542 | |
543 | if ((node->layout_flags & (XNN_LAYOUT_FLAG_COMPATIBLE_NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC)) == 0) { |
544 | continue; |
545 | } |
546 | |
547 | for (uint32_t i = 0; i < node->num_inputs; i++) { |
548 | const struct xnn_value* value = &subgraph->values[node->inputs[i]]; |
549 | if (value->data != NULL) { |
550 | // Static data, skip this input value. Compatibility of this static input with NCHW layout was validated |
551 | // during the initial NCHW compatibility check for the Node. |
552 | continue; |
553 | } |
554 | if (xnn_value_is_external(value)) { |
555 | // External value, invalid cluster |
556 | node->layout_flags |= XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER; |
557 | continue; |
558 | } |
559 | const uint32_t producer_id = value->producer; |
560 | assert(producer_id != XNN_INVALID_NODE_ID); |
561 | assert(producer_id < n); |
562 | struct xnn_node* producer_node = &subgraph->nodes[producer_id]; |
563 | if ((producer_node->layout_flags & (XNN_LAYOUT_FLAG_COMPATIBLE_NHWC2NCHW | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW)) != 0 && |
564 | (producer_node->layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) == 0) |
565 | { |
566 | producer_node->layout_flags &= ~XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC; |
567 | if (producer_node->cluster_leader != node->cluster_leader) { |
568 | producer_node->cluster_leader = node->cluster_leader = math_max_u32(producer_node->cluster_leader, node->cluster_leader); |
569 | update = true; |
570 | } |
571 | } else { |
572 | node->layout_flags |= XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER; |
573 | } |
574 | } |
575 | } |
576 | } |
577 | // Propagate XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER flags up to the cluster leaders |
578 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
579 | struct xnn_node* node = &subgraph->nodes[n]; |
580 | subgraph->nodes[node->cluster_leader].layout_flags |= node->layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER; |
581 | } |
  // Check that Values consumed by an NCHW-compatible cluster have no NCHW-incompatible consumers.
583 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
584 | struct xnn_node* node = &subgraph->nodes[n]; |
585 | if ((subgraph->nodes[node->cluster_leader].layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) != 0) { |
586 | continue; |
587 | } |
588 | |
589 | if ((node->layout_flags & (XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW)) == 0) { |
590 | continue; |
591 | } |
592 | |
593 | for (uint32_t i = 0; i < node->num_inputs; i++) { |
594 | struct xnn_value* value = &subgraph->values[node->inputs[i]]; |
595 | if (value->data != NULL) { |
596 | // Static data, skip this input value because it doesn't have a producer Node. |
597 | continue; |
598 | } |
599 | assert(!xnn_value_is_external(value)); |
600 | value->num_nchw_compatible_consumers += 1; |
601 | } |
602 | } |
603 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
604 | struct xnn_node* node = &subgraph->nodes[n]; |
605 | if ((subgraph->nodes[node->cluster_leader].layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) != 0) { |
606 | continue; |
607 | } |
608 | |
609 | if ((node->layout_flags & (XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW)) == 0) { |
610 | continue; |
611 | } |
612 | |
613 | for (uint32_t i = 0; i < node->num_inputs; i++) { |
614 | const struct xnn_value* value = &subgraph->values[node->inputs[i]]; |
615 | if (value->data != NULL) { |
616 | // Static data, skip this input value because it doesn't have a producer Node. |
617 | continue; |
618 | } |
619 | assert(!xnn_value_is_external(value)); |
620 | assert(value->num_nchw_compatible_consumers > 0); |
621 | if (value->num_nchw_compatible_consumers != value->num_consumers) { |
622 | subgraph->nodes[node->cluster_leader].layout_flags |= XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER; |
623 | } |
624 | } |
625 | } |
  // Evaluate whether it is profitable to run the model as sparse:
  // - Compute the number of parameters and zeroes in 1x1 Convolution weights.
  // - Disable the sparse rewrite for clusters without 1x1 Convolutions (num_params == 0)
  //   or with fewer than two thirds of the 1x1 Convolution filter weights being zero.
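  // Equivalently, a cluster stays sparse only if num_zeroes / num_params > 2/3, which the
  // final loop below tests as num_zeroes * 3 > num_params * 2.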
630 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
631 | struct xnn_node* node = &subgraph->nodes[n]; |
632 | if ((subgraph->nodes[node->cluster_leader].layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) != 0) { |
633 | continue; |
634 | } |
635 | |
636 | if (node->type == xnn_node_type_convolution_2d && |
637 | max(node->params.convolution_2d.kernel_height, node->params.convolution_2d.kernel_width) == 1) |
638 | { |
639 | assert(node->num_inputs >= 2); |
640 | |
641 | const struct xnn_value* filter = &subgraph->values[node->inputs[1]]; |
642 | assert(filter->data != NULL); |
643 | assert(filter->shape.num_dims == 4); |
644 | |
645 | const size_t num_params = filter->shape.dim[0] * filter->shape.dim[3]; |
646 | subgraph->nodes[node->cluster_leader].num_params += num_params; |
647 | |
648 | const float* data = (const float*) filter->data; |
649 | size_t num_zeroes = 0; |
650 | for (size_t i = 0; i < num_params; i++) { |
651 | num_zeroes += (size_t) (data[i] == 0.0f); |
652 | } |
653 | xnn_log_debug("1x1 Convolution 2D Node #%" PRIu32 ": %zu / %zu sparsity" , n, num_zeroes, num_params); |
654 | subgraph->nodes[node->cluster_leader].num_zeroes += num_zeroes; |
655 | } |
656 | } |
657 | bool use_nchw_layout = false; |
658 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
659 | struct xnn_node* node = &subgraph->nodes[n]; |
660 | if ((subgraph->nodes[node->cluster_leader].layout_flags & XNN_LAYOUT_FLAG_INCOMPATIBLE_CLUSTER) != 0) { |
661 | continue; |
662 | } |
663 | |
664 | if ((node->layout_flags & (XNN_LAYOUT_FLAG_COMPATIBLE_NCHW2NHWC | XNN_LAYOUT_FLAG_COMPATIBLE_NCHW)) == 0) { |
665 | continue; |
666 | } |
667 | |
668 | if (subgraph->nodes[node->cluster_leader].num_zeroes * 3 <= subgraph->nodes[node->cluster_leader].num_params * 2) { |
669 | xnn_log_info("Node #%" PRIu32 ": sparse inference disabled: 1x1 Convolutions contain %zu / %zu zero weights" , |
670 | n, subgraph->nodes[node->cluster_leader].num_zeroes, subgraph->nodes[node->cluster_leader].num_params); |
671 | continue; |
672 | } |
673 | |
674 | for (uint32_t i = 0; i < node->num_inputs; i++) { |
675 | struct xnn_value* value = &subgraph->values[node->inputs[i]]; |
676 | if (value->data != NULL) { |
677 | // Static data, skip this input value because it doesn't have a producer Node. |
678 | continue; |
679 | } |
680 | assert(!xnn_value_is_external(value)); |
681 | assert(value->num_nchw_compatible_consumers > 0); |
682 | assert(value->num_nchw_compatible_consumers == value->num_consumers); |
683 | if (value->layout != xnn_layout_type_nchw) { |
684 | value->layout = xnn_layout_type_nchw; |
685 | xnn_log_info("set Value #%" PRIu32" layout to NCHW" , node->inputs[i]); |
686 | use_nchw_layout = true; |
687 | } |
688 | } |
689 | } |
690 | if (use_nchw_layout) { |
691 | xnn_log_info("XNNPACK has switched to sparse inference mode!" ); |
692 | } |
693 | } |
694 | |
695 | bool xnn_subgraph_rewrite_for_fp16(xnn_subgraph_t subgraph) |
696 | { |
697 | xnn_log_info("Analyzing subgraph for FP16 compatibility" ); |
698 | |
  // Convert tensors and operators in the subgraph to FP16:
  // 1. Check that all operators in the subgraph are supported in FP16.
  // 2. Mark Values that must be converted to FP16.
702 | // 3. Replace FP32 Values with FP16 Values as Nodes' inputs/outputs. |
703 | // 4. Insert FP32->FP16 Convert Nodes for external FP32 inputs and FP16->FP32 Convert Nodes for external outputs. |
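  // E.g. for an external FP32 input X consumed by an FP16-converted Node:
  //   X (fp32, external) --[Convert fp32->fp16]--> X' (fp16, internal) --> Node
  // and, symmetrically, an internal FP16 output is converted back to the external FP32 output.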
704 | |
705 | // Check that all operators in the subgraph are supported in FP16, bail out on any unsupported one. |
706 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
707 | struct xnn_node* node = &subgraph->nodes[n]; |
708 | if (node->type == xnn_node_type_invalid) { |
709 | // Node was fused away, skip. |
710 | continue; |
711 | } |
712 | |
713 | if (node->compute_type != xnn_compute_type_fp32) { |
714 | xnn_log_warning("FP16 rewrite aborted: node #%" PRIu32 " (%s) is not FP32" , n, xnn_node_type_to_string(node->type)); |
715 | return false; |
716 | } |
717 | switch (node->type) { |
718 | case xnn_node_type_abs: |
719 | case xnn_node_type_add2: |
720 | case xnn_node_type_divide: |
721 | case xnn_node_type_maximum2: |
722 | case xnn_node_type_minimum2: |
723 | case xnn_node_type_multiply2: |
724 | case xnn_node_type_concatenate2: |
725 | case xnn_node_type_concatenate3: |
726 | case xnn_node_type_concatenate4: |
727 | case xnn_node_type_squared_difference: |
728 | case xnn_node_type_subtract: |
729 | for (uint32_t i = 0; i < node->num_inputs; i++) { |
730 | if (subgraph->values[node->inputs[i]].data != NULL) { |
731 | xnn_log_warning("FP16 rewrite aborted: node #%" PRIu32 " (%s) has static input %" PRIu32, |
732 | n, xnn_node_type_to_string(node->type), i); |
733 | return false; |
734 | } |
735 | } |
736 | break; |
737 | case xnn_node_type_average_pooling_2d: |
738 | case xnn_node_type_bankers_rounding: |
739 | case xnn_node_type_ceiling: |
740 | case xnn_node_type_clamp: |
741 | case xnn_node_type_copy: |
742 | case xnn_node_type_convolution_2d: |
743 | case xnn_node_type_deconvolution_2d: |
744 | case xnn_node_type_depthwise_convolution_2d: |
745 | case xnn_node_type_depth_to_space: |
746 | case xnn_node_type_elu: |
747 | case xnn_node_type_even_split2: |
748 | case xnn_node_type_even_split3: |
749 | case xnn_node_type_even_split4: |
750 | case xnn_node_type_floor: |
751 | case xnn_node_type_fully_connected: |
752 | case xnn_node_type_global_average_pooling_2d: |
753 | case xnn_node_type_hardswish: |
754 | case xnn_node_type_leaky_relu: |
755 | case xnn_node_type_max_pooling_2d: |
756 | case xnn_node_type_negate: |
757 | case xnn_node_type_prelu: |
758 | case xnn_node_type_sigmoid: |
759 | case xnn_node_type_softmax: |
760 | case xnn_node_type_static_constant_pad: |
761 | case xnn_node_type_static_reshape: |
762 | case xnn_node_type_static_resize_bilinear_2d: |
763 | case xnn_node_type_static_transpose: |
764 | case xnn_node_type_square: |
765 | case xnn_node_type_square_root: |
766 | break; |
767 | default: |
768 | xnn_log_warning("FP16 rewrite aborted: node #%" PRIu32 " (%s) is not supported for FP16 inference" , |
769 | n, xnn_node_type_to_string(node->type)); |
770 | return false; |
771 | } |
772 | } |
773 | |
774 | // Annotate Values to be converted to FP16 as FP16-compatible. |
775 | // Note that static weights in [Depthwise] Convolution, Fully Connected, and PReLU Nodes remain FP32, |
776 | // they will be converted to FP16 during weight repacking when the operator is created. |
777 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
778 | struct xnn_node* node = &subgraph->nodes[n]; |
779 | switch (node->type) { |
780 | case xnn_node_type_convolution_2d: |
781 | case xnn_node_type_deconvolution_2d: |
782 | case xnn_node_type_depthwise_convolution_2d: |
783 | case xnn_node_type_fully_connected: |
784 | case xnn_node_type_prelu: |
785 | subgraph->values[node->inputs[0]].fp16_compatible = true; |
786 | subgraph->values[node->outputs[0]].fp16_compatible = true; |
787 | break; |
788 | default: |
789 | for (uint32_t i = 0; i < node->num_inputs; i++) { |
790 | subgraph->values[node->inputs[i]].fp16_compatible = true; |
791 | } |
792 | for (uint32_t o = 0; o < node->num_outputs; o++) { |
793 | subgraph->values[node->outputs[o]].fp16_compatible = true; |
794 | } |
795 | break; |
796 | } |
797 | } |
798 | |
799 | // Replace FP32 Values in Nodes' inputs/outputs with FP16 Values. |
800 | // FP32 Values that are not external inputs or outputs are converted to FP16 in-place, |
801 | // for external inputs and outputs we create same-shaped FP16 Values and use those instead. |
802 | const uint32_t num_original_values = subgraph->num_values; |
803 | xnn_subgraph_analyze_consumers_and_producers(subgraph); |
804 | for (uint32_t n = 0; n < num_original_values; n++) { |
805 | struct xnn_value* value = &subgraph->values[n]; |
806 | value->fp16_id = XNN_INVALID_VALUE_ID; |
807 | value->fp32_id = XNN_INVALID_VALUE_ID; |
808 | if (value->fp16_compatible) { |
809 | assert(value->data == NULL); |
810 | assert(value->datatype == xnn_datatype_fp32); |
811 | if (xnn_value_is_external(value)) { |
812 | struct xnn_value* fp16_value = xnn_subgraph_new_internal_value(subgraph); |
813 | |
814 | // Recompute value due to potential reallocation in xnn_subgraph_new_internal_value |
815 | value = &subgraph->values[n]; |
816 | xnn_value_copy(fp16_value, value); |
817 | fp16_value->datatype = xnn_datatype_fp16; |
818 | |
819 | fp16_value->producer = value->producer; |
820 | fp16_value->num_consumers = value->num_consumers; |
821 | fp16_value->first_consumer = value->first_consumer; |
822 | value->producer = XNN_INVALID_NODE_ID; |
823 | value->num_consumers = 0; |
824 | value->first_consumer = XNN_INVALID_NODE_ID; |
825 | |
826 | // Clear external input/output flags |
827 | fp16_value->flags = 0; |
828 | xnn_log_debug("FP16 rewrite: created FP16 tensor #%" PRIu32 " for FP32 tensor #%" PRIu32, fp16_value->id, n); |
829 | |
830 | value->fp16_id = fp16_value->id; |
831 | fp16_value->fp32_id = n; |
832 | } else { |
833 | xnn_log_debug("FP16 rewrite: converted FP32 tensor #%" PRIu32 " to FP16" , n); |
834 | value->datatype = xnn_datatype_fp16; |
835 | } |
836 | } |
837 | } |
838 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
839 | struct xnn_node* node = &subgraph->nodes[n]; |
840 | if (node->type == xnn_node_type_invalid) { |
841 | // Node was fused away, skip. |
842 | continue; |
843 | } |
844 | |
845 | assert(node->compute_type == xnn_compute_type_fp32); |
846 | node->compute_type = xnn_compute_type_fp16; |
847 | if (node->type == xnn_node_type_static_constant_pad) { |
848 | node->params.static_pad.padding_value = |
849 | fp16_ieee_from_fp32_value(uint32_as_float(node->params.static_pad.padding_value)); |
850 | } |
851 | for (uint32_t i = 0; i < node->num_inputs; i++) { |
852 | const uint32_t fp16_id = subgraph->values[node->inputs[i]].fp16_id; |
853 | if (fp16_id != XNN_INVALID_VALUE_ID) { |
854 | assert(subgraph->values[fp16_id].fp32_id == node->inputs[i]); |
855 | node->inputs[i] = fp16_id; |
856 | } |
857 | } |
858 | for (uint32_t o = 0; o < node->num_outputs; o++) { |
859 | const uint32_t fp16_id = subgraph->values[node->outputs[o]].fp16_id; |
860 | if (fp16_id != XNN_INVALID_VALUE_ID) { |
861 | assert(subgraph->values[fp16_id].fp32_id == node->outputs[o]); |
862 | node->outputs[o] = fp16_id; |
863 | } |
864 | } |
865 | } |
866 | |
867 | // Count the number of external inputs and outputs which require Convert nodes |
868 | uint32_t num_external_inputs = 0; |
869 | uint32_t num_external_outputs = 0; |
870 | for (uint32_t n = 0; n < subgraph->num_nodes; n++) { |
871 | const struct xnn_node* node = &subgraph->nodes[n]; |
872 | for (uint32_t i = 0; i < node->num_inputs; i++) { |
873 | const struct xnn_value* value = &subgraph->values[node->inputs[i]]; |
874 | if (value->fp32_id != XNN_INVALID_VALUE_ID && value->first_consumer == n) { |
875 | assert(value->data == NULL); |
876 | assert(value->datatype == xnn_datatype_fp16); |
877 | assert(subgraph->values[value->fp32_id].datatype == xnn_datatype_fp32); |
        // This value isn't necessarily an external input: due to partitioning it could be an external
        // output of the current subgraph that is simultaneously consumed by the current node.
880 | if (xnn_value_is_external_input(&subgraph->values[value->fp32_id])) { |
881 | num_external_inputs += 1; |
882 | } |
883 | } |
884 | } |
885 | for (uint32_t o = 0; o < node->num_outputs; o++) { |
886 | const struct xnn_value* value = &subgraph->values[node->outputs[o]]; |
887 | if (value->fp32_id != XNN_INVALID_VALUE_ID) { |
888 | assert(value->datatype == xnn_datatype_fp16); |
889 | assert(subgraph->values[value->fp32_id].datatype == xnn_datatype_fp32); |
890 | assert(xnn_value_is_external_output(&subgraph->values[value->fp32_id])); |
891 | num_external_outputs += 1; |
892 | } |
893 | } |
894 | } |
895 | xnn_log_debug("Discovered %" PRIu32" external inputs and %" PRIu32" external outputs" , |
896 | num_external_inputs, num_external_outputs); |
897 | |
898 | const uint32_t num_original_nodes = subgraph->num_nodes; |
899 | xnn_subgraph_add_nodes(subgraph, num_external_inputs + num_external_outputs); |
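  // Rewrite the Node array in place, walking backwards from the end of the enlarged array:
  // each original Node is moved to its final position and Convert Nodes are written into the
  // gaps, so no extra allocation is needed.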
900 | struct xnn_node* output_node = subgraph->nodes + subgraph->num_nodes - 1; |
901 | for (uint32_t n = num_original_nodes; n != 0; n--) { |
902 | const struct xnn_node* node = &subgraph->nodes[n - 1]; |
903 | // Insert Convert nodes for outputs |
904 | for (uint32_t o = 0; o < node->num_outputs; o++) { |
905 | const struct xnn_value* value = &subgraph->values[node->outputs[o]]; |
906 | if (value->fp32_id != XNN_INVALID_VALUE_ID) { |
907 | xnn_log_debug("Inserted FP16->FP32 Convert Node from tensor #%" PRIu32" to tensor #%" PRIu32, |
908 | value->id, value->fp32_id); |
909 | const uint32_t output_node_id = output_node->id; |
910 | assert(output_node >= subgraph->nodes); |
911 | xnn_node_clear(output_node); |
912 | output_node->id = output_node_id; |
913 | xnn_init_convert_node(output_node, xnn_compute_type_fp16_to_fp32, value->id, value->fp32_id, 0 /* flags */); |
914 | output_node -= 1; |
915 | } |
916 | } |
917 | // Move the Node to the new location |
918 | if (output_node != node) { |
919 | const uint32_t output_node_id = output_node->id; |
920 | assert(output_node >= subgraph->nodes); |
921 | memcpy(output_node, node, sizeof(struct xnn_node)); |
922 | output_node->id = output_node_id; |
923 | output_node -= 1; |
924 | } |
925 | // Insert Convert nodes for inputs |
926 | for (uint32_t i = 0; i < node->num_inputs; i++) { |
927 | const struct xnn_value* value = &subgraph->values[node->inputs[i]]; |
928 | if (value->fp32_id != XNN_INVALID_VALUE_ID && value->first_consumer == n - 1) { |
        // Only insert a Convert node if the value actually is an external input. The value could instead
        // be an external output; in that case we already inserted a Convert node in the output loop above.
931 | if (xnn_value_is_external_input(&subgraph->values[value->fp32_id])) { |
932 | xnn_log_debug("Inserted FP32->FP16 Convert Node from tensor #%" PRIu32" to tensor #%" PRIu32, |
933 | value->fp32_id, value->id); |
934 | const uint32_t output_node_id = output_node->id; |
935 | assert(output_node >= subgraph->nodes); |
936 | xnn_node_clear(output_node); |
937 | output_node->id = output_node_id; |
938 | xnn_init_convert_node(output_node, xnn_compute_type_fp32_to_fp16, value->fp32_id, value->id, 0 /* flags */); |
939 | output_node -= 1; |
940 | } |
941 | } |
942 | } |
943 | } |
944 | |
945 | return true; |
946 | } |
947 | |
948 | static void xnn_node_replace_output(struct xnn_node* node, uint32_t old_output_id, uint32_t new_output_id) |
949 | { |
950 | for (size_t i = 0; i < node->num_outputs; i++) { |
951 | if (node->outputs[i] == old_output_id) { |
952 | node->outputs[i] = new_output_id; |
953 | } |
954 | } |
955 | } |
956 | |
957 | enum xnn_status xnn_subgraph_fusion( |
958 | xnn_subgraph_t subgraph) |
959 | { |
960 | // Fuse Nodes where possible |
961 | for (uint32_t i = 0; i < subgraph->num_values; i++) { |
962 | struct xnn_value* value = &subgraph->values[i]; |
963 | if (value->num_consumers == 1) { |
964 | const uint32_t producer_id = value->producer; |
965 | if (producer_id == XNN_INVALID_NODE_ID) { |
966 | continue; |
967 | } |
968 | assert(producer_id < subgraph->num_nodes); |
969 | |
970 | const uint32_t consumer_id = value->first_consumer; |
971 | if (consumer_id == XNN_INVALID_NODE_ID) { |
972 | continue; |
973 | } |
974 | assert(consumer_id < subgraph->num_nodes); |
975 | |
976 | struct xnn_node* producer = &subgraph->nodes[producer_id]; |
977 | assert(producer->type != xnn_node_type_invalid); |
978 | struct xnn_node* consumer = &subgraph->nodes[consumer_id]; |
979 | assert(consumer->type != xnn_node_type_invalid); |
980 | |
981 | // Try to fuse Clamp Node upstream into producer Node |
982 | if (consumer->type == xnn_node_type_clamp) { |
983 | switch (producer->type) { |
984 | case xnn_node_type_add2: |
985 | case xnn_node_type_average_pooling_2d: |
986 | case xnn_node_type_clamp: |
987 | case xnn_node_type_convolution_2d: |
988 | case xnn_node_type_divide: |
989 | case xnn_node_type_deconvolution_2d: |
990 | case xnn_node_type_depthwise_convolution_2d: |
991 | case xnn_node_type_fully_connected: |
992 | case xnn_node_type_multiply2: |
993 | case xnn_node_type_max_pooling_2d: |
994 | case xnn_node_type_subtract: |
995 | xnn_log_info("fuse Clamp Node #%" PRIu32" into upstream Node #%" PRIu32, consumer_id, producer_id); |
996 | assert(producer->num_outputs == 1); |
997 | assert(consumer->num_inputs == 1); |
998 | assert(consumer->num_outputs == 1); |
999 | |
1000 | const uint32_t fused_output_id = consumer->outputs[0]; |
1001 | assert(fused_output_id < subgraph->num_values); |
1002 | subgraph->values[fused_output_id].producer = producer_id; |
1003 | producer->outputs[0] = fused_output_id; |
1004 | |
1005 | producer->activation.output_min = |
1006 | math_max_f32(producer->activation.output_min, consumer->activation.output_min); |
1007 | producer->activation.output_max = |
1008 | math_min_f32(producer->activation.output_max, consumer->activation.output_max); |
1009 | |
1010 | xnn_node_clear(consumer); |
1011 | xnn_value_clear(value); |
1012 | break; |
1013 | default: |
1014 | break; |
1015 | } |
1016 | } |
1017 | // Try to fuse Constant Pad node downstream into [Depthwise] Convolution 2D Node |
1018 | if (producer->type == xnn_node_type_static_constant_pad) { |
1019 | assert(producer->num_inputs == 1); |
1020 | assert(producer->num_outputs == 1); |
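      // Paddings of a 4D value are indexed as [batch, height, width, channels]: the fusion requires
      // zero padding on batch (index 0) and channels (index 3), while height (1) and width (2)
      // paddings are folded into the consumer convolution's input padding below.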
1021 | const bool is_spatial_2d_padding = value->shape.num_dims == 4 && |
1022 | (producer->params.static_pad.pre_paddings[0] | producer->params.static_pad.post_paddings[0] | |
1023 | producer->params.static_pad.pre_paddings[3] | producer->params.static_pad.post_paddings[3]) == 0; |
1024 | const enum xnn_datatype padding_datatype = subgraph->values[producer->outputs[0]].datatype; |
1025 | const uint32_t padding_value = producer->params.static_pad.padding_value; |
1026 | const bool is_zero_padding = |
1027 | (padding_datatype == xnn_datatype_fp32 && padding_value == 0) || |
1028 | ((padding_datatype == xnn_datatype_qint8 || padding_datatype == xnn_datatype_quint8) && |
1029 | padding_value == (uint32_t) (uint8_t) subgraph->values[producer->outputs[0]].quantization.zero_point); |
1030 | switch (consumer->type) { |
1031 | case xnn_node_type_convolution_2d: |
1032 | if (is_spatial_2d_padding && is_zero_padding && !(consumer->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING)) { |
1033 | xnn_log_info("fuse Constant Pad Node #%" PRIu32" into Convolution 2D Node #%" PRIu32, |
1034 | consumer_id, producer_id); |
1035 | assert(consumer->num_inputs >= 1); |
1036 | assert(consumer->inputs[0] == producer->outputs[0]); |
1037 | |
1038 | consumer->params.convolution_2d.input_padding_top += producer->params.static_pad.pre_paddings[1]; |
1039 | consumer->params.convolution_2d.input_padding_right += producer->params.static_pad.post_paddings[2]; |
1040 | consumer->params.convolution_2d.input_padding_bottom += producer->params.static_pad.post_paddings[1]; |
1041 | consumer->params.convolution_2d.input_padding_left += producer->params.static_pad.pre_paddings[2]; |
1042 | |
1043 | consumer->inputs[0] = producer->inputs[0]; |
1044 | |
1045 | const uint32_t fused_input_id = producer->inputs[0]; |
1046 | assert(fused_input_id < subgraph->num_values); |
1047 | if (subgraph->values[fused_input_id].first_consumer == producer_id) { |
1048 | subgraph->values[fused_input_id].first_consumer = consumer_id; |
1049 | } |
1050 | |
1051 | xnn_node_clear(producer); |
1052 | xnn_value_clear(value); |
1053 | } |
1054 | break; |
1055 | case xnn_node_type_depthwise_convolution_2d: |
1056 | if (is_spatial_2d_padding && is_zero_padding && !(consumer->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING)) { |
1057 | xnn_log_info("fuse Constant Pad Node #%" PRIu32" into Depthwise Convolution 2D Node #%" PRIu32, |
1058 | consumer_id, producer_id); |
1059 | assert(consumer->num_inputs >= 1); |
1060 | assert(consumer->inputs[0] == producer->outputs[0]); |
1061 | |
1062 | consumer->params.depthwise_convolution_2d.input_padding_top += |
1063 | producer->params.static_pad.pre_paddings[1]; |
1064 | consumer->params.depthwise_convolution_2d.input_padding_right += |
1065 | producer->params.static_pad.post_paddings[2]; |
1066 | consumer->params.depthwise_convolution_2d.input_padding_bottom += |
1067 | producer->params.static_pad.post_paddings[1]; |
1068 | consumer->params.depthwise_convolution_2d.input_padding_left += |
1069 | producer->params.static_pad.pre_paddings[2]; |
1070 | |
1071 | consumer->inputs[0] = producer->inputs[0]; |
1072 | |
1073 | const uint32_t fused_input_id = producer->inputs[0]; |
1074 | assert(fused_input_id < subgraph->num_values); |
1075 | if (subgraph->values[fused_input_id].first_consumer == producer_id) { |
1076 | subgraph->values[fused_input_id].first_consumer = consumer_id; |
1077 | } |
1078 | |
1079 | xnn_node_clear(producer); |
1080 | xnn_value_clear(value); |
1081 | } |
1082 | break; |
1083 | default: |
1084 | break; |
1085 | } |
1086 | } |
1087 | |
1088 | // Try to fuse copy upstream. Copy can be fused upstream as long as this value is internal. |
1089 | // E.g. ---> (N1) --- value ---> (Copy) ---> v1 |
    // If value is persistent or external, fusing the Copy upstream into N1 would skip the write to
    // value: N1 would write to v1 instead, which is wrong.
1092 | if (consumer->type == xnn_node_type_copy && xnn_value_is_valid(value) && xnn_value_is_internal(value)) { |
1093 | xnn_log_info( |
1094 | "value %d fuse Copy Node #%" PRIu32 " into upstream %s Node #%" PRIu32, value->id, consumer->id, |
1095 | xnn_node_type_to_string(producer->type), producer->id); |
1096 | assert(consumer->num_inputs == 1); |
1097 | assert(consumer->num_outputs == 1); |
1098 | const uint32_t fused_output_id = consumer->outputs[0]; |
1099 | assert(fused_output_id < subgraph->num_values); |
1100 | subgraph->values[fused_output_id].producer = producer_id; |
1101 | xnn_node_replace_output(producer, value->id, fused_output_id); |
1102 | xnn_node_clear(consumer); |
1103 | xnn_value_clear(value); |
1104 | } |
1105 | |
1106 | // Try to fuse copy downstream. |
1107 | // E.g. --- v1 ---> (copy) --- value ---> (n2) |
1108 | // If value is external or persistent, we cannot simply remove the copy, since we need to write to value. |
1109 | if (producer->type == xnn_node_type_copy && xnn_value_is_valid(value) && xnn_value_is_internal(value)) { |
        // We need to check that value is valid here because it could have been cleared by a previous
        // optimization; this can happen if we have a chain of Copy Nodes, e.g.:
1112 | // ---v1--> (Copy1) ---v2--> (Copy2) ---v3--> (Copy3) ---v4--> |
1113 | // v2 could have been cleared when we fused Copy2 upstream into Copy1, so v2 isn't valid anymore, but since v2's |
1114 | // producer is also a Copy, we will incorrectly try to fuse Copy1 downstream into Copy2 (again). |
1115 | xnn_log_info( |
1116 | "value %d fuse Copy Node #%" PRIu32 " into downstream %s Node #%" PRIu32, value->id, producer->id, |
1117 | xnn_node_type_to_string(consumer->type), consumer->id); |
1118 | assert(producer->num_outputs == 1); |
1119 | assert(producer->num_inputs == 1); |
1120 | const uint32_t copy_input_id = producer->inputs[0]; |
1121 | const uint32_t copy_output_id = producer->outputs[0]; |
1122 | bool found_consumer_input = false; |
1123 | for (size_t i = 0; i < consumer->num_inputs; i++) { |
1124 | if (consumer->inputs[i] == copy_output_id) { |
            consumer->inputs[i] = copy_input_id;
1126 | found_consumer_input = true; |
            // TODO(b/254734644): a consumer can only consume this value once (we checked above that the
            // value has exactly 1 consumer), so we can break here: no other consumer input has the same id.
1129 | break; |
1130 | } |
1131 | } |
1132 | (void) found_consumer_input; // Silence unused variable warning in non-debug. |
1133 | assert(found_consumer_input); |
1134 | |
1135 | if (subgraph->values[copy_input_id].first_consumer == producer_id) { |
1136 | subgraph->values[copy_input_id].first_consumer = consumer_id; |
1137 | } |
1138 | xnn_node_clear(producer); |
1139 | xnn_value_clear(value); |
1140 | } |
1141 | } |
1142 | } |
1143 | |
1144 | return xnn_status_success; |
1145 | } |
1146 | |
1147 | enum xnn_status xnn_subgraph_optimize( |
1148 | xnn_subgraph_t subgraph, |
1149 | uint32_t flags) |
1150 | { |
1151 | xnn_subgraph_analyze_consumers_and_producers(subgraph); |
1152 | |
1153 | // Remove unreferenced values. |
1154 | for (uint32_t i = 0; i < subgraph->num_values; i++) { |
1155 | struct xnn_value* value = &subgraph->values[i]; |
1156 | if (value->type == xnn_value_type_invalid) { |
1157 | continue; |
1158 | } |
1159 | |
1160 | if (!xnn_value_is_external_input(value) && value->num_consumers == 0 && !xnn_value_is_persistent(value)) { |
1161 | xnn_value_clear(value); |
1162 | } |
1163 | } |
1164 | |
1165 | if (!(flags & XNN_FLAG_NO_OPERATOR_FUSION)) { |
1166 | xnn_subgraph_fusion(subgraph); |
1167 | } |
1168 | |
1169 | if ((flags & XNN_FLAG_FORCE_FP16_INFERENCE) && !(xnn_params.init_flags & XNN_INIT_FLAG_F16)) { |
1170 | xnn_log_error("failed to force FP16 inference: hardware supports neither native nor emulated FP16 operators" ); |
1171 | return xnn_status_unsupported_hardware; |
1172 | } |
1173 | #ifndef XNN_NO_F16_OPERATORS |
1174 | const bool try_native_fp16 = |
1175 | (flags & XNN_FLAG_HINT_FP16_INFERENCE) && (xnn_params.init_flags & XNN_INIT_FLAG_F16_NATIVE); |
1176 | const bool force_fp16 = (flags & XNN_FLAG_FORCE_FP16_INFERENCE); |
1177 | if (try_native_fp16 || force_fp16) { |
1178 | const bool fp16_rewrite_succeeded = xnn_subgraph_rewrite_for_fp16(subgraph); |
1179 | if (force_fp16 && !fp16_rewrite_succeeded) { |
1180 | xnn_log_error("failed to force FP16 inference: subgraph is incompatible with FP16 operators" ); |
1181 | return xnn_status_unsupported_parameter; |
1182 | } |
1183 | } |
1184 | #endif // XNN_NO_F16_OPERATORS |
1185 | |
1186 | #if XNN_ENABLE_SPARSE |
1187 | if ((flags & XNN_FLAG_HINT_SPARSE_INFERENCE) && (xnn_params.init_flags & XNN_INIT_FLAG_CHW_OPT)) { |
1188 | xnn_subgraph_rewrite_for_nchw(subgraph); |
1189 | } |
1190 | #endif |
1191 | |
1192 | return xnn_status_success; |
1193 | } |
1194 | |
1195 | enum xnn_status xnn_delete_subgraph( |
1196 | xnn_subgraph_t subgraph) |
1197 | { |
1198 | if (subgraph != NULL) { |
1199 | if (subgraph->nodes != NULL) { |
1200 | memset(subgraph->nodes, 0, sizeof(struct xnn_node) * subgraph->num_nodes); |
1201 | xnn_release_memory(subgraph->nodes); |
1202 | } |
1203 | |
1204 | if (subgraph->values != NULL) { |
1205 | memset(subgraph->values, 0, sizeof(struct xnn_value) * subgraph->num_values); |
1206 | xnn_release_memory(subgraph->values); |
1207 | } |
1208 | |
1209 | memset(subgraph, 0, sizeof(struct xnn_subgraph)); |
1210 | xnn_release_memory(subgraph); |
1211 | } |
1212 | return xnn_status_success; |
1213 | } |
1214 | |