common.h source code [tensorflow/external/gemmlowp/internal/common.h]

1	// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	// common.h: contains stuff that's used throughout gemmlowp
16	// and should always be available.
17
18	#ifndef GEMMLOWP_INTERNAL_COMMON_H_
19	#define GEMMLOWP_INTERNAL_COMMON_H_
20
21	#include "../internal/platform.h"
22	#include "../profiling/pthread_everywhere.h"
23
24	#include <algorithm>
25	#include <cassert>
26	#include <cmath>
27	#include <cstdlib>
28
29	#include "../internal/detect_platform.h"
30	#include "../profiling/instrumentation.h"
31
32	namespace gemmlowp {
33
// Assumed cache line size, in bytes. Used to tune alignment and prefetching.
// Ideally this would be queried at runtime; however, 64-byte cache lines are
// the vast majority, and on a device where this guess is wrong it should be
// off by no more than a factor of two, which is acceptable.
const int kDefaultCacheLineSize = 64;
40
// Default L1 and L2 data cache sizes, in bytes.
// The L1 cache size is assumed to be per core.
// The L2 cache size is assumed to be shared among all cores. What
// we call 'L2' here is effectively the top-level cache.
//
// On x86, we should ideally query these values at runtime. On ARM, the
// instruction to query them is privileged and Android kernels do not expose
// it to userspace. Fortunately, the majority of ARM devices have roughly
// comparable values:
//   Nexus 5:     L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The generic ARM/Android values below are equal to or somewhat lower than
// that, and were found to perform well on both the Nexus 5 and Android One.
// Of course, these values are in principle too low for typical x86 CPUs,
// where we should set the L2 value to at least
// (L3 cache size / number of cores).
//
// Exactly one branch of the chain below is selected by the platform macros,
// so both constants are always defined exactly once.
#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
// iPhone/iPad
const int kDefaultL1CacheSize = 48 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// Other ARM or ARM-like hardware (Android implies ARM-like) so here it's OK
// to tune for ARM, although on x86 Atom we might be able to query
// cache sizes at runtime, which would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android. Therefore, likely desktop-class x86 hardware.
// Thus we assume larger cache sizes, though we really should query
// them at runtime.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same as x86-64 but less bullish.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_MIPS)
// MIPS and not Android. TODO: MIPS and Android?
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 1024 * 1024;
#else
// Less common hardware. Maybe some unusual or older or embedded thing.
// Assume smaller caches, but don't depart too far from what we do
// on ARM/Android to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif
89
// The proportion of the L2 cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we typically want to use most of the L2 cache for storing a large
// RHS block.
#if defined(GEMMLOWP_X86)
// For IA, use the entire L2 cache for the RHS matrix. The LHS matrix is not
// blocked for L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#else
const float kDefaultL2RhsFactor = 0.75f;
#endif
101
// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be efficiently loaded into registers, so that packing code can
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values but must ensure that their own optimized packing paths
// are consistent with this value.
#ifdef GEMMLOWP_AVX2
const int kRegisterSize = 32;
#else
const int kRegisterSize = 16;
#endif
116
// Issues a prefetch hint to the CPU for the cache line that contains ptr.
// The function has no return value and does not dereference ptr itself.
inline void Prefetch(const void* ptr) {
#if defined(GEMMLOWP_ARM_64) && defined(GEMMLOWP_ALLOW_INLINE_ASM)
  // AArch64 has very detailed prefetch instructions that compilers do not
  // know how to map __builtin_prefetch onto, which leaves the builtin a
  // no-op on this architecture, so the instruction is written out by hand.
  // "pldl1keep" is usually what we want here, meaning:
  // "prefetch for load, into L1 cache, using each value multiple times".
  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
#elif defined(__GNUC__)
  // Clang and GCC both define __GNUC__ and provide __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  // No prefetch mechanism is used on this toolchain; the argument is
  // deliberately left unused.
  static_cast<void>(ptr);
#endif
}
133
// Rounds the runtime value i down to a multiple of the compile-time Modulus
// by subtracting the remainder of i modulo Modulus from i.
// Note that Modulus is unsigned, so the remainder is computed after the usual
// arithmetic conversions between Integer and unsigned.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  const auto remainder = i % Modulus;
  return i - remainder;
}
140
141	// Returns the runtime argument rounded up to the nearest multiple of
142	// the fixed Modulus.
143	template <unsigned Modulus, typename Integer>
144	Integer RoundUp(Integer i) {
145	return RoundDown<Modulus>(i + Modulus - `1`);
146	}
147
// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
// Computed as (a + b - 1) / b using integer division, so the intermediate
// a + b - 1 must be representable in the arithmetic type used, and b must be
// nonzero.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  return (a + b - 1) / b;
}
153
// Returns the argument rounded up to the nearest power of two. A value that
// is already a power of two is returned unchanged.
//
// Works by copying the highest set bit of (n - 1) into every lower bit
// position and then adding one. The copy step is repeated with doubling
// shift distances until the whole width of Integer has been covered, so
// 64-bit integer types are handled correctly as well as 32-bit and narrower
// ones (a fixed sequence of shifts ending at 16 only covers 32 bits).
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  // Width of Integer in bits; assumes 8-bit bytes.
  const unsigned kBitWidth = static_cast<unsigned>(8 * sizeof(Integer));
  Integer i = n - 1;
  for (unsigned shift = 1; shift < kBitWidth; shift *= 2) {
    i |= i >> shift;
  }
  return i + 1;
}
165
// Compile-time predicate: IsPowerOfTwo<N>::value is true exactly when N is a
// positive power of two (1, 2, 4, 8, ...).
//
// N & (N - 1) clears the lowest set bit of N, so it is zero only when N has
// at most one bit set. The explicit N > 0 check rejects zero, which would
// otherwise pass that test, and also keeps N - 1 from being evaluated for
// non-positive N.
template <int N>
struct IsPowerOfTwo {
  static constexpr bool value = N > 0 && !(N & (N - 1));
};
170
// Forwards the region of `size` elements of type T starting at `ptr` to the
// GEMMLOWP_MARK_MEMORY_AS_INITIALIZED macro, passing the start address as a
// void* and the length in bytes (size * sizeof(T)). When that macro is not
// defined, this function does nothing.
template <typename T>
void MarkMemoryAsInitialized(T* ptr, int size) {
#if defined(GEMMLOWP_MARK_MEMORY_AS_INITIALIZED)
  GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast<void*>(ptr),
                                      size * sizeof(T));
#else
  // Nothing to report to; both arguments are deliberately unused.
  static_cast<void>(ptr);
  static_cast<void>(size);
#endif
}
181
182	} // namespace gemmlowp
183
184	#endif // GEMMLOWP_INTERNAL_COMMON_H_
185

Browse the source code of tensorflow/external/gemmlowp/internal/common.h