output.h source code [tensorflow/external/gemmlowp/internal/output.h]

1	// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	// output.h: processing the 32-bit accumulators output by the unpack
16	// stage, obtaining the final result matrix entries and storing them into
17	// the destination matrix.
18
19	#ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
20	#define GEMMLOWP_INTERNAL_OUTPUT_H_
21
22	#include <cmath>
23	#include <tuple>
24	#include <type_traits>
25	#include <typeinfo>
26
27	#include "../fixedpoint/fixedpoint.h"
28	#include "../public/output_stages.h"
29	#include "simd_wrappers.h"
30
31	namespace gemmlowp {
32
// Primary template for evaluating a single output stage on a raw register
// buffer. It carries no implementation: every supported
// (OutputStage, InputBufferType) pair is expected to be handled by a partial
// specialization. Instantiating this primary template with any buffer type
// other than void trips the static_assert below.
template <typename OutputStage, typename InputBufferType>
struct OutputStageEvalBufferImpl {
  // The condition depends on a template parameter, so it is only checked when
  // this primary template is actually instantiated.
  static_assert(
      std::is_same<InputBufferType, void>::value,
      "Unimplemented: missing implementation of this output pipeline stage "
      "for this data type. This would happen if some architecture-specific "
      "SIMD back-end (output_$arch.h) were incomplete.");
};
42
43	template <typename OutputStage, typename InputType>
44	struct OutputStageEvalImpl {
45	static constexpr int kRows = InputType::kRows;
46	static constexpr int kCols = InputType::kCols;
47	using InputBufferType = typename InputType::BufferType;
48	using BufferEvalImplType =
49	OutputStageEvalBufferImpl<OutputStage, InputBufferType>;
50	using OutputBufferType = typename BufferEvalImplType::OutputType;
51	using OutputScalarType = typename OutputBufferType::ScalarType;
52	using OutputType = RegisterBlock<OutputScalarType, kRows, kCols>;
53
54	OutputStageEvalImpl(const OutputStage& s) : buffer_eval_impl(s) {}
55
56	OutputType Eval(InputType input, int, int) const {
57	OutputType output;
58	output.buf = buffer_eval_impl.Eval(input.buf);
59	return output;
60	}
61
62	const BufferEvalImplType buffer_eval_impl;
63	};
64
65	template <int Size>
66	struct OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ToUint8Scale,
67	RegisterBuffer<std::int32_t, Size>> {
68	using InputType = RegisterBuffer<std::int32_t, Size>;
69	using OutputType = RegisterBuffer<std::int32_t, Size>;
70
71	typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage;
72
73	OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {}
74
75	OutputType Eval(InputType input) const {
76	const int result_shift = output_stage.result_shift;
77	const std::int32_t result_mult_int = output_stage.result_mult_int;
78	using RegisterType = typename InputType::RegisterType;
79	const RegisterType result_offset =
80	Dup<RegisterType>(output_stage.result_offset);
81	OutputType output;
82	for (int i = `0`; i < InputType::kRegisterCount; i++) {
83	output.reg[i] = RoundingDivideByPOT(
84	Mul(Add(input.reg[i], result_offset), result_mult_int), result_shift);
85	}
86	return output;
87	}
88
89	const OutputStage& output_stage;
90	};
91
92	template <int Rows, int Cols, VectorShape Shape>
93	struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>,
94	RegisterBlock<std::int32_t, Rows, Cols>> {
95	typedef RegisterBlock<std::int32_t, Rows, Cols> InputType;
96	typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType;
97	typedef OutputStageQuantizeDownInt32ToUint8ScalePC<Shape> OutputStage;
98
99	OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}
100
101	OutputType Eval(InputType input, int row, int col) const {
102	OutputType output;
103	const int result_shift = output_stage.result_shift;
104	const int pos = Shape == VectorShape::Col ? row : col;
105	const auto result_mult_int =
106	LoadForBroadcasting<InputType>(output_stage.result_mult_int, pos);
107	const auto result_offset =
108	LoadForBroadcasting<InputType>(output_stage.result_offset, pos);
109	const auto dividend = BroadcastMul<InputType>(
110	BroadcastAdd<InputType>(input, result_offset), result_mult_int);
111	for (int i = `0`; i < InputType::kRegisterCount; i++) {
112	output.buf.reg[i] =
113	RoundingDivideByPOT(dividend.buf.reg[i], result_shift);
114	}
115	return output;
116	}
117
118	const OutputStage& output_stage;
119	};
120
121	template <int Size>
122	struct OutputStageEvalBufferImpl<
123	OutputStageQuantizeDownInt32ByFixedPoint,
124	RegisterBuffer<std::int32_t, Size>> {
125	typedef RegisterBuffer<std::int32_t, Size> InputType;
126	typedef RegisterBuffer<std::int32_t, Size> OutputType;
127
128	typedef OutputStageQuantizeDownInt32ByFixedPoint OutputStage;
129
130	OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {}
131
132	OutputType Eval(InputType input) const {
133	OutputType output;
134	using RegisterType = typename InputType::RegisterType;
135	const RegisterType result_offset_after_shift =
136	Dup<RegisterType>(output_stage.result_offset_after_shift);
137	for (int i = `0`; i < InputType::kRegisterCount; i++) {
138	const RegisterType mulhigh_val = SaturatingRoundingDoublingHighMul(
139	input.reg[i], output_stage.result_fixedpoint_multiplier);
140	output.reg[i] =
141	Add(RoundingDivideByPOT(mulhigh_val, output_stage.result_shift),
142	result_offset_after_shift);
143	}
144	return output;
145	}
146
147	const OutputStage& output_stage;
148	};
149
150	template <int Size>
151	struct OutputStageEvalBufferImpl<OutputStageScaleInt32ByFixedPointAndExponent,
152	RegisterBuffer<std::int32_t, Size>> {
153	typedef RegisterBuffer<std::int32_t, Size> InputType;
154	typedef RegisterBuffer<std::int32_t, Size> OutputType;
155
156	typedef OutputStageScaleInt32ByFixedPointAndExponent OutputStage;
157
158	OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {
159	left_shift = std::max(`0`, output_stage.result_exponent);
160	right_shift = std::max(`0`, -output_stage.result_exponent);
161	}
162
163	OutputType Eval(InputType input) const {
164	OutputType output;
165	using RegisterType = typename InputType::RegisterType;
166	const RegisterType result_offset_after_shift =
167	Dup<RegisterType>(output_stage.result_offset_after_shift);
168	for (int i = `0`; i < InputType::kRegisterCount; i++) {
169	const RegisterType mulhigh_val = SaturatingRoundingDoublingHighMul(
170	ShiftLeft(input.reg[i], left_shift),
171	output_stage.result_fixedpoint_multiplier);
172	output.reg[i] = Add(RoundingDivideByPOT(mulhigh_val, right_shift),
173	result_offset_after_shift);
174	}
175	return output;
176	}
177
178	const OutputStage& output_stage;
179	int left_shift;
180	int right_shift;
181	};
182
183	template <int Rows, int Cols, VectorShape Shape>
184	struct OutputStageEvalImpl<
185	OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>,
186	RegisterBlock<std::int32_t, Rows, Cols>> {
187	typedef RegisterBlock<std::int32_t, Rows, Cols> InputType;
188	typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType;
189
190	typedef OutputStageScaleInt32ByFixedPointAndExponentPC<Shape> OutputStage;
191
192	OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}
193
194	OutputType Eval(InputType input, int row, int col) const {
195	OutputType output;
196	const int pos = Shape == VectorShape::Row ? col : row;
197	using RegisterType = typename InputType::RegisterType;
198	const RegisterType result_offset_after_shift =
199	Dup<RegisterType>(output_stage.result_offset_after_shift);
200	auto left_shift =
201	LoadForBroadcasting<InputType>(output_stage.result_exponent, pos);
202	auto right_shift =
203	LoadForBroadcasting<InputType>(output_stage.result_exponent, pos);
204	const auto result_fixedpoint_multiplier = LoadForBroadcasting<InputType>(
205	output_stage.result_fixedpoint_multiplier, pos);
206	for (int i = `0`; i < decltype(left_shift)::kRegisterCount; i++) {
207	left_shift.buf.reg[i] = Max(left_shift.buf.reg[i], `0`);
208	right_shift.buf.reg[i] = Max(-right_shift.buf.reg[i], `0`);
209	}
210	const auto mulhigh_val = BroadcastSaturatingRoundingDoublingHighMul(
211	BroadcastShiftLeft(input, left_shift), result_fixedpoint_multiplier);
212	const auto rdpot_val =
213	BroadcastRoundingDivideByPOT(mulhigh_val, right_shift);
214	for (int i = `0`; i < InputType::kRegisterCount; i++) {
215	output.buf.reg[i] = Add(rdpot_val.buf.reg[i], result_offset_after_shift);
216	}
217	return output;
218	}
219
220	const OutputStage& output_stage;
221	};
222
223	// Implementation of OutputStageSaturatingCastToUint8 for scalar data.
224	template <int Size>
225	struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8,
226	RegisterBuffer<std::int32_t, Size>> {
227	typedef RegisterBuffer<std::int32_t, Size> InputType;
228	typedef RegisterBuffer<std::uint8_t, Size> OutputType;
229	static_assert(InputType::kRegisterLanes == `1`,
230	"This path is only for scalar values");
231
232	typedef OutputStageSaturatingCastToUint8 OutputStage;
233
234	OutputStageEvalBufferImpl(const OutputStage&) {}
235
236	OutputType Eval(InputType input) const {
237	OutputType output;
238	for (int i = `0`; i < InputType::kRegisterCount; i++) {
239	std::int32_t data = input.reg[i];
240	output.reg[i] = data > `255` ? `255` : data < `0` ? `0` : data;
241	}
242	return output;
243	}
244	};
245
246	// Implementation of OutputStageSaturatingCastToInt8 for scalar data.
247	template <int Size>
248	struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt8,
249	RegisterBuffer<std::int32_t, Size>> {
250	typedef RegisterBuffer<std::int32_t, Size> InputType;
251	typedef RegisterBuffer<std::int8_t, Size> OutputType;
252	static_assert(InputType::kRegisterLanes == `1`,
253	"This path is only for scalar values");
254
255	typedef OutputStageSaturatingCastToInt8 OutputStage;
256
257	OutputStageEvalBufferImpl(const OutputStage&) {}
258
259	OutputType Eval(InputType input) const {
260	OutputType output;
261	for (int i = `0`; i < InputType::kRegisterCount; i++) {
262	std::int32_t data = input.reg[i];
263	output.reg[i] = data > `127` ? `127` : data < -`128` ? -`128` : data;
264	}
265	return output;
266	}
267	};
268
269	// Implementation of OutputStageSaturatingCastToInt16 for scalar data.
270	template <int Size>
271	struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
272	RegisterBuffer<std::int32_t, Size>> {
273	typedef RegisterBuffer<std::int32_t, Size> InputType;
274	typedef RegisterBuffer<std::int16_t, Size> OutputType;
275	static_assert(InputType::kRegisterLanes == `1`,
276	"This path is only for scalar values");
277
278	typedef OutputStageSaturatingCastToInt16 OutputStage;
279
280	OutputStageEvalBufferImpl(const OutputStage&) {}
281
282	OutputType Eval(InputType input) const {
283	OutputType output;
284	for (int i = `0`; i < InputType::kRegisterCount; i++) {
285	std::int32_t data = input.reg[i];
286	output.reg[i] = data > `32767` ? `32767` : data < -`32768` ? -`32768` : data;
287	}
288	return output;
289	}
290	};
291
292	// Implementation of OutputStageTruncatingCastToUint8 for scalar data
293	template <int Size>
294	struct OutputStageEvalBufferImpl<OutputStageTruncatingCastToUint8,
295	RegisterBuffer<std::int32_t, Size>> {
296	typedef RegisterBuffer<std::int32_t, Size> InputType;
297	typedef RegisterBuffer<std::uint8_t, Size> OutputType;
298	static_assert(InputType::kRegisterLanes == `1`,
299	"This path is only for scalar values");
300
301	typedef OutputStageTruncatingCastToUint8 OutputStage;
302
303	OutputStageEvalBufferImpl(const OutputStage&) {}
304
305	OutputType Eval(InputType input) const {
306	OutputType output;
307	for (int i = `0`; i < InputType::kRegisterCount; i++) {
308	output.reg[i] = input.reg[i];
309	}
310	return output;
311	}
312	};
313
314	template <int Rows, int Cols, typename VectorType>
315	struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
316	RegisterBlock<std::int32_t, Rows, Cols>> {
317	typedef RegisterBlock<std::int32_t, Rows, Cols> InputType;
318	typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType;
319	typedef OutputStageBiasAddition<VectorType> OutputStage;
320
321	OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}
322
323	OutputType Eval(InputType input, int row, int col) const {
324	const int pos = VectorType::kShape == VectorShape::Row ? col : row;
325	return BroadcastAdd<InputType>(
326	input, LoadForBroadcasting<InputType>(output_stage.bias_vector, pos));
327	}
328
329	const OutputStage& output_stage;
330	};
331
332	template <int Size>
333	struct OutputStageEvalBufferImpl<OutputStageClamp,
334	RegisterBuffer<std::int32_t, Size>> {
335	typedef RegisterBuffer<std::int32_t, Size> InputType;
336	typedef RegisterBuffer<std::int32_t, Size> OutputType;
337
338	typedef OutputStageClamp OutputStage;
339
340	OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {}
341
342	OutputType Eval(InputType input) const {
343	using RegisterType = typename InputType::RegisterType;
344	const RegisterType min = Dup<RegisterType>(output_stage.min);
345	const RegisterType max = Dup<RegisterType>(output_stage.max);
346	OutputType output;
347	for (int i = `0`; i < InputType::kRegisterCount; i++) {
348	output.reg[i] = Min(Max(input.reg[i], min), max);
349	}
350	return output;
351	}
352
353	const OutputStage& output_stage;
354	};
355
356	template <int Size>
357	struct OutputStageEvalBufferImpl<OutputStageTanh,
358	RegisterBuffer<std::int32_t, Size>> {
359	typedef RegisterBuffer<std::int32_t, Size> InputType;
360	typedef RegisterBuffer<std::int32_t, Size> OutputType;
361	using RegisterType = typename InputType::RegisterType;
362	typedef RegisterType DataType;
363	typedef OutputStageTanh OutputStage;
364
365	OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {
366	const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
367	const std::int32_t real_amplitude_as_int32 =
368	output_stage.real_amplitude_as_int32;
369
370	input_cutoff_min = real_zero_as_int32 - `8` * real_amplitude_as_int32;
371	input_cutoff_max = real_zero_as_int32 + `8` * real_amplitude_as_int32;
372	output_min = real_zero_as_int32 - real_amplitude_as_int32;
373	output_max = real_zero_as_int32 + real_amplitude_as_int32;
374
375	double inverse_amplitude_normalized_double = `1.0` / real_amplitude_as_int32;
376	inverse_amplitude_neg_exponent = `0`;
377	while (inverse_amplitude_normalized_double < `0.5`) {
378	inverse_amplitude_normalized_double *= `2`;
379	inverse_amplitude_neg_exponent++;
380	}
381	inverse_amplitude_normalized = FixedPoint<DataType, `0`>::FromDouble(
382	inverse_amplitude_normalized_double);
383
384	double amplitude_normalized_double = real_amplitude_as_int32;
385	amplitude_exponent = `0`;
386	while (amplitude_normalized_double >= `1.0`) {
387	amplitude_normalized_double *= `0.5`;
388	amplitude_exponent++;
389	}
390	amplitude_normalized =
391	FixedPoint<DataType, `0`>::FromDouble(amplitude_normalized_double);
392	}
393
394	OutputType Eval(InputType input) const {
395	const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
396
397	typedef FixedPoint<DataType, `3`> F3;
398	typedef FixedPoint<DataType, `0`> F0;
399
400	OutputType output;
401
402	for (int i = `0`; i < OutputType::kRegisterCount; i++) {
403	// fixed-point affine transformation
404	DataType input_centered =
405	Sub(input.reg[i], Dup<DataType>(real_zero_as_int32));
406	F3 fixedpoint_input =
407	F3::FromRaw(input_centered) * inverse_amplitude_normalized;
408	// left shift
409	fixedpoint_input.raw() = ShiftLeft(fixedpoint_input.raw(),
410	`28` - inverse_amplitude_neg_exponent);
411	// fixed-point tanh and multiplication
412	F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized;
413	// right shift
414	DataType int32_output =
415	Add(Dup<DataType>(real_zero_as_int32),
416	ShiftRight(fixedpoint_output.raw(), `31` - amplitude_exponent));
417
418	DataType mask_if_below_cutoff_min =
419	MaskIfLessThanOrEqual(input.reg[i], Dup<DataType>(input_cutoff_min));
420	DataType mask_if_above_cutoff_max = MaskIfGreaterThanOrEqual(
421	input.reg[i], Dup<DataType>(input_cutoff_max));
422
423	output.reg[i] = SelectUsingMask(
424	mask_if_below_cutoff_min, Dup<DataType>(output_min),
425	SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max),
426	int32_output));
427	}
428	return output;
429	}
430
431	const OutputStage& output_stage;
432	std::int32_t input_cutoff_min, input_cutoff_max;
433	std::int32_t output_min, output_max;
434	FixedPoint<DataType, `0`> inverse_amplitude_normalized;
435	int inverse_amplitude_neg_exponent;
436	FixedPoint<DataType, `0`> amplitude_normalized;
437	int amplitude_exponent;
438	};
439
440	// OutputPipelineOutputType is a helper to determine the output data type of a
441	// pipeline, for a
442	// given input data type. It is a recursive template; see the explanation on
443	// OutputPipelineEvalImpl below.
444	template <typename OutputPipelineType, int FirstStage, typename InputType,
445	bool StopRecursion =
446	FirstStage == std::tuple_size<OutputPipelineType>::value>
447	struct OutputPipelineOutputType {
448	typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
449	FirstStageType;
450	typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
451	FirstStageOutputType;
452	typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + `1`,
453	FirstStageOutputType>::Type Type;
454	};
455
456	template <typename OutputPipelineType, int FirstStage, typename InputType>
457	struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType,
458	true> {
459	typedef InputType Type;
460	};
461
462	// OutputPipelineEvalImpl is a helper to implement the evaluation of
463	// the whole pipeline. It is a recursive template to implement compile-time
464	// unrolling of the loop over all pipeline stages. The 'FirstStage' parameter
465	// is how we implement recursion: each specialization implements only
466	// evaluation starting at 'FirstStage'. The StopRecursion parameter is just a
467	// helper to implement the termination of the recursion as a partial
468	// specialization below.
469	template <typename OutputPipelineType, int FirstStage, typename InputType,
470	bool StopRecursion =
471	FirstStage == std::tuple_size<OutputPipelineType>::value>
472	struct OutputPipelineEvalImpl {
473	typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
474	FirstStageType;
475	typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
476	FirstStageOutputType;
477	typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage,
478	InputType>::Type OutputType;
479
480	OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline)
481	: head_impl(std::get<FirstStage>(output_pipeline)),
482	tail_impl(output_pipeline) {}
483
484	OutputType Eval(InputType input, int row, int col) const {
485	// Evaluate the first stage.
486	FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col);
487	// Recurse into the remaining stages.
488	return tail_impl.Eval(first_stage_output, row, col);
489	}
490
491	const OutputStageEvalImpl<FirstStageType, InputType> head_impl;
492	const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + `1`,
493	FirstStageOutputType>
494	tail_impl;
495	};
496
497	// Specialization on 'StopRecursion' for terminating the recursion.
498	template <typename OutputPipelineType, int FirstStage, typename InputType>
499	struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> {
500	OutputPipelineEvalImpl(const OutputPipelineType&) {}
501
502	InputType Eval(InputType input, int, int) const {
503	// Terminating the recursion.
504	return input;
505	}
506	};
507
// Primary template for storing a final register block into a destination.
// It carries no implementation; supported block types are handled by partial
// specializations. Instantiating it with any RegisterBlockType other than
// void trips the static_assert.
template <typename RegisterBlockType, typename DstType>
struct StoreFinalOutputImpl {
  static_assert(std::is_same<RegisterBlockType, void>::value,
                "This generic impl should never be hit");
};
513
514	template <typename ScalarType, int Rows, int Cols, typename DstType>
515	struct StoreFinalOutputImpl<RegisterBlock<ScalarType, Rows, Cols>, DstType> {
516	using RegisterBlockType = RegisterBlock<ScalarType, Rows, Cols>;
517	static void Run(const RegisterBlockType& src, DstType* dst, int row,
518	int col) {
519	for (int r = `0`; r < Rows; r++) {
520	for (int c = `0`; c < Cols; c++) {
521	dst->data(row + r, col + c) = src.buf.reg[r + c Rows];
522	}
523	}
524	}
525	};
526
527	// StoreFinalOutput takes the final value at the end of the output pipeline and
528	// stores it into the destination matrix. It can be specialized for different
529	// data types; the generic implementation here is typically used only for plain
530	// old scalar (not SIMD) types.
531	template <typename RegisterBlockType, typename DstType>
532	void StoreFinalOutput(RegisterBlockType src, DstType* dst, int row, int col) {
533	StoreFinalOutputImpl<RegisterBlockType, DstType>::Run(src, dst, row, col);
534	}
535
536	template <typename OutputPipelineType, typename InputType>
537	struct OutputPipelineExecutor {
538	OutputPipelineExecutor(const OutputPipelineType& output_pipeline)
539	: output_pipeline_eval_impl_(output_pipeline) {}
540
541	// Execute is the entry point into the output pipeline evaluation
542	// code. It should be the only thing that unpack code calls. It takes the
543	// result
544	// of the unpack stage and stores it into the destination matrix.
545	template <typename DstType>
546	void Execute(InputType input, DstType* dst, int src_global_row,
547	int src_global_col, int dst_row, int dst_col) const {
548	// Statically assert that the output pipeline matches the given destination
549	// matrix's scalar type.
550	typedef typename OutputPipelineOutputType<
551	OutputPipelineType, `0`, InputType>::Type::BufferType::ScalarType
552
553	ScalarOutputType;
554	typedef typename DstType::Scalar ScalarDstType;
555	static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value,
556	"mismatched destination scalar type and output pipeline");
557
558	// Evaluate the output pipeline.
559	auto output =
560	output_pipeline_eval_impl_.Eval(input, src_global_row, src_global_col);
561	// Store the result into the destination matrix.
562	StoreFinalOutput(output, dst, dst_row, dst_col);
563	}
564
565	const OutputPipelineEvalImpl<OutputPipelineType, `0`, InputType>
566	output_pipeline_eval_impl_;
567	};
568
569	} // namespace gemmlowp
570
571	#ifdef GEMMLOWP_NEON
572	#include "output_neon.h"
573	#elif defined(GEMMLOWP_SSE4)
574	#include "output_sse.h"
575	#elif defined(GEMMLOWP_MSA)
576	#include "output_msa.h"
577	#endif
578
579	#endif // GEMMLOWP_INTERNAL_OUTPUT_H_
580

Browse the source code of tensorflow/external/gemmlowp/internal/output.h