1// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// common.h: contains stuff that's used throughout gemmlowp
16// and should always be available.
17
18#ifndef GEMMLOWP_INTERNAL_COMMON_H_
19#define GEMMLOWP_INTERNAL_COMMON_H_
20
21#include "../internal/platform.h"
22#include "../profiling/pthread_everywhere.h"
23
24#include <algorithm>
25#include <cassert>
26#include <cmath>
27#include <cstdlib>
28
29#include "../internal/detect_platform.h"
30#include "../profiling/instrumentation.h"
31
32namespace gemmlowp {
33
// Assumed size of one cache line, used to tune alignment and prefetching.
// Querying the real value at runtime would be ideal, but 64-byte cache
// lines are by far the most common, and on a device where this guess is
// wrong it should be off by no more than a factor of 2, which is
// acceptable.
constexpr int kDefaultCacheLineSize = 64;
40
// Default L1 and L2 data cache sizes, selected per platform at compile time.
//
// Conventions:
//   * The L1 size is per core.
//   * The L2 size is assumed to be shared among all cores; what is called
//     'L2' here is effectively the top-level cache.
//
// Ideally x86 would query these at runtime. On ARM the instruction that
// reports them is privileged and Android kernels do not expose it to
// userspace. Fortunately most ARM devices have roughly comparable values:
//   Nexus 5:     L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The ARM/Android defaults below are equal to or somewhat lower than those
// and were found to perform well on both devices. They are in principle too
// low for typical x86 CPUs, where the L2 value should be at least
// (L3 cache size / number of cores).
#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
// ARM on Apple platforms: iPhone/iPad.
constexpr int kDefaultL1CacheSize = 48 * 1024;
constexpr int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// Any other ARM or ARM-like hardware. Android is taken to imply ARM-like,
// so tuning for ARM is fine here, although on x86 Atom it might be possible
// (and better) to query the cache sizes at runtime.
constexpr int kDefaultL1CacheSize = 16 * 1024;
constexpr int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 outside of Android, hence most likely desktop-class hardware.
// Larger caches are assumed, though they really should be queried at
// runtime.
constexpr int kDefaultL1CacheSize = 32 * 1024;
constexpr int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 outside of Android. Same idea as x86-64, but less bullish.
constexpr int kDefaultL1CacheSize = 32 * 1024;
constexpr int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_MIPS)
// MIPS outside of Android. TODO: MIPS and Android?
constexpr int kDefaultL1CacheSize = 32 * 1024;
constexpr int kDefaultL2CacheSize = 1024 * 1024;
#else
// Less common hardware: possibly something unusual, older or embedded.
// Assume smaller caches, but stay close to the ARM/Android values to avoid
// accidentally exposing unexpected behavior.
constexpr int kDefaultL1CacheSize = 16 * 1024;
constexpr int kDefaultL2CacheSize = 256 * 1024;
#endif
89
// Fraction of the L2 cache that is budgeted for holding RHS blocks. The
// value should lie between 0 and 1 and is typically close to 1, because
// most of the L2 cache is meant to hold one large RHS block.
#if defined(GEMMLOWP_X86)
// On IA the whole L2 cache goes to the RHS matrix; the LHS matrix is not
// blocked for the L2 cache.
constexpr float kDefaultL2RhsFactor = 1.00f;
#else
constexpr float kDefaultL2RhsFactor = 0.75f;
#endif
101
// Size, in bytes, of one SIMD register. It determines the dimensions of
// PackingRegisterBlock so that such blocks can be loaded into registers
// efficiently and the packing code can stay within registers as much as
// possible.
//
// The non-SIMD generic fallback only uses this as an array size, so any
// value works there. Platforms may pick different values, but each must
// keep its own optimized packing paths consistent with the value it picks.
#if defined(GEMMLOWP_AVX2)
constexpr int kRegisterSize = 32;
#else
constexpr int kRegisterSize = 16;
#endif
116
// Asks the CPU to prefetch the cache line that contains ptr. This is only a
// hint: ptr is not dereferenced by this function, and on toolchains with no
// known prefetch mechanism the call does nothing.
inline void Prefetch(const void* ptr) {
#if defined(GEMMLOWP_ARM_64) && defined(GEMMLOWP_ALLOW_INLINE_ASM)
  // AArch64 offers very fine-grained prefetch instructions. Compilers cannot
  // know which of them __builtin_prefetch should map to, so they map it to
  // nothing, which leaves __builtin_prefetch a no-op on this architecture.
  // "pldl1keep" is usually what we want here; it means "prefetch for load,
  // into L1 cache, using each value multiple times".
  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
#elif defined(__GNUC__)
  // Both Clang and GCC define __GNUC__ and provide __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  // No prefetch mechanism available; just silence the unused-parameter
  // warning.
  (void)ptr;
#endif
}
133
// Rounds the runtime value i down to the nearest multiple of the
// compile-time constant Modulus, by subtracting the remainder i % Modulus.
//
// Modulus is unsigned, so when Integer is a signed type the remainder is
// computed after the usual arithmetic conversions between Integer and
// unsigned. NOTE(review): callers presumably only pass non-negative i;
// confirm before relying on this for negative values.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  const auto remainder = i % Modulus;
  return i - remainder;
}
140
141// Returns the runtime argument rounded up to the nearest multiple of
142// the fixed Modulus.
143template <unsigned Modulus, typename Integer>
144Integer RoundUp(Integer i) {
145 return RoundDown<Modulus>(i + Modulus - 1);
146}
147
// Returns a / b rounded up ('ceil') to the nearest integer, computed as
// (a + b - 1) / b using integer division.
//
// NOTE(review): this formula presumably assumes a >= 0 and b > 0, and that
// a + b - 1 does not overflow; confirm against callers.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  // Adding b - 1 first pushes any non-zero remainder into the next quotient.
  const auto biased_numerator = a + b - 1;
  return biased_numerator / b;
}
153
// Returns the smallest power of two that is greater than or equal to n.
//
// Works by copying the highest set bit of (n - 1) into every lower bit
// position and then adding one. The shift distance doubles until it covers
// the full width of Integer, so 64-bit types are handled correctly; a fixed
// sequence of shifts ending at 16 would only cover 32 bits.
//
// A value of n that already is a power of two is returned unchanged.
// n == 0 yields 0. For unsigned Integer, an n larger than the largest
// representable power of two also yields 0, because the final increment
// wraps around.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  Integer i = n - 1;
  // Shift distances 1, 2, 4, ... up to half the bit width of Integer.
  for (unsigned shift = 1; shift < 8 * sizeof(Integer); shift *= 2) {
    i |= i >> shift;
  }
  return i + 1;
}
165
// Compile-time predicate: value is true exactly when N is a positive power
// of two (1, 2, 4, 8, ...).
//
// The bit test N & (N - 1) clears the lowest set bit of N, so it is zero
// only when N has at most one bit set. The N > 0 guard excludes 0, which
// would otherwise pass that test, and short-circuits before N - 1 is
// evaluated for non-positive N.
template <int N>
struct IsPowerOfTwo {
  static constexpr bool value = (N > 0) && !(N & (N - 1));
};
170
// Reports the size elements of type T starting at ptr to the
// GEMMLOWP_MARK_MEMORY_AS_INITIALIZED hook, when the build defines one.
// The hook receives ptr as a void* together with the length in bytes
// (size * sizeof(T)). Without the hook this function does nothing.
//
// NOTE(review): the hook macro is defined outside this file; presumably it
// wraps a memory-checker annotation. Confirm at its definition.
template <typename T>
void MarkMemoryAsInitialized(T* ptr, int size) {
#ifndef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED
  // No hook configured: only silence unused-parameter warnings.
  (void)ptr;
  (void)size;
#else
  GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast<void*>(ptr),
                                      size * sizeof(T));
#endif
}
181
182} // namespace gemmlowp
183
184#endif // GEMMLOWP_INTERNAL_COMMON_H_
185