// Copyright 2008 Google Inc. All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Internals shared between the Snappy implementation and its unittest.

#ifndef THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
#define THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_

#include "snappy-stubs-internal.h"

#if SNAPPY_HAVE_SSSE3
// Please do not replace with <x86intrin.h> or with headers that assume more
// advanced SSE versions without checking with all the OWNERS.
#include <emmintrin.h>
#include <tmmintrin.h>
#endif

#if SNAPPY_HAVE_NEON
#include <arm_neon.h>
#endif

#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
#else
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
#endif

namespace snappy {
namespace internal {

#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
#if SNAPPY_HAVE_SSSE3
using V128 = __m128i;
#elif SNAPPY_HAVE_NEON
using V128 = uint8x16_t;
#endif

// Load 128 bits of integer data. `src` must be 16-byte aligned.
inline V128 V128_Load(const V128* src);

// Load 128 bits of integer data. `src` does not need to be aligned.
inline V128 V128_LoadU(const V128* src);

// Store 128 bits of integer data. `dst` does not need to be aligned.
inline void V128_StoreU(V128* dst, V128 val);

// Shuffle packed 8-bit integers using a shuffle mask.
// Each packed integer in the shuffle mask must be in [0,16).
inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);

// Constructs V128 with 16 chars |c|.
inline V128 V128_DupChar(char c);

#if SNAPPY_HAVE_SSSE3
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }

inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }

inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
  return _mm_shuffle_epi8(input, shuffle_mask);
}

inline V128 V128_DupChar(char c) { return _mm_set1_epi8(c); }

#elif SNAPPY_HAVE_NEON
inline V128 V128_Load(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}

inline V128 V128_LoadU(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}

inline void V128_StoreU(V128* dst, V128 val) {
  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
}

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
  assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
  return vqtbl1q_u8(input, shuffle_mask);
}

inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
#endif
#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
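// Illustrative usage of the byte-shuffle wrappers above (a sketch only, not
// part of the library; `src`, `dst` and the reversal mask are hypothetical):
//
//   alignas(16) const uint8_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
//                                             7, 6, 5, 4, 3, 2, 1, 0};
//   V128 mask = V128_Load(reinterpret_cast<const V128*>(kReverse));
//   V128 bytes = V128_LoadU(reinterpret_cast<const V128*>(src));
//   // Output byte i becomes input byte mask[i], so this reverses the block.
//   V128_StoreU(reinterpret_cast<V128*>(dst), V128_Shuffle(bytes, mask));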

// Working memory performs a single allocation to hold all scratch space
// required for compression.
class WorkingMemory {
 public:
  explicit WorkingMemory(size_t input_size);
  ~WorkingMemory();

  // Allocates and clears a hash table using memory in "*this",
  // stores the number of buckets in "*table_size" and returns a pointer to
  // the base of the hash table.
  uint16_t* GetHashTable(size_t fragment_size, int* table_size) const;
  char* GetScratchInput() const { return input_; }
  char* GetScratchOutput() const { return output_; }

 private:
  char* mem_;        // the allocated memory, never nullptr
  size_t size_;      // the size of the allocated memory, never 0
  uint16_t* table_;  // the pointer to the hashtable
  char* input_;      // the pointer to the input scratch buffer
  char* output_;     // the pointer to the output scratch buffer

  // No copying
  WorkingMemory(const WorkingMemory&);
  void operator=(const WorkingMemory&);
};
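// Illustrative use (a sketch, not library code; `input_size` and
// `fragment_size` are hypothetical caller-side values):
//
//   WorkingMemory wmem(input_size);
//   int table_size;
//   uint16_t* table = wmem.GetHashTable(fragment_size, &table_size);
//   // `table` now points at `table_size` zeroed buckets inside wmem's single
//   // allocation; GetScratchInput()/GetScratchOutput() return the scratch
//   // buffers carved from that same allocation.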

// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input_length <= kBlockSize"
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input_length)" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
char* CompressFragment(const char* input,
                       size_t input_length,
                       char* op,
                       uint16_t* table,
                       const int table_size);
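// Illustrative call (a sketch, not library code; `fragment`, `fragment_size`,
// `compressed`, and the `table`/`table_size` obtained from a WorkingMemory as
// shown above are hypothetical caller-side names):
//
//   char* end = CompressFragment(fragment, fragment_size, compressed,
//                                table, table_size);
//   size_t compressed_size = end - compressed;  // bytes written to `compressed`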

// Find the largest n such that
//
//   s1[0,n-1] == s2[0,n-1]
//   and n <= (s2_limit - s2).
//
// Return make_pair(n, n < 8).
// Does not read *s2_limit or beyond.
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
//
// In addition, populate *data with the next 5 bytes following the end of the
// match. This is only done if 8 bytes are available (s2_limit - s2 >= 8). The
// point is that on some architectures this can be done faster in this routine
// than by a subsequent load from s2 + n.
//
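// For example (an illustrative case): with s1 = "abcdWXYZ", s2 = "abcdefgh"
// and s2_limit = s2 + 8, the first four bytes match and the fifth differs,
// so the function returns std::make_pair(4, true) (since 4 < 8).
//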
// Separate implementation for 64-bit, little-endian cpus.
#if !SNAPPY_IS_BIG_ENDIAN && \
    (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
     defined(ARCH_ARM))
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  assert(s2_limit >= s2);
  size_t matched = 0;

  // This block isn't necessary for correctness; we could just start looping
  // immediately. As an optimization though, it is useful. It creates some
  // fairly common code paths that determine, without extra effort, whether
  // the match length is less than 8. In short, we are hoping to avoid a
  // conditional branch, and perhaps get better code layout from the C++
  // compiler.
  if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
      // This code is critical for performance. The reason is that it
      // determines how much to advance `ip` (s2). This obviously depends on
      // both the loads from the `candidate` (s1) and from `ip`. Furthermore,
      // the next `candidate` depends on the advanced `ip` calculated here
      // through a load, hash and new candidate hash lookup (a lot of cycles).
      // This makes s1 (i.e. `candidate`) the variable that limits throughput.
      // This is the reason we go through hoops to have this function update
      // `data` for the next iteration. The straightforward code would compute
      // *data as
      //
      //   *data = UNALIGNED_LOAD64(s2 + matched_bytes)  (latency of 5 cycles),
      //
      // and use it as input for the hash table lookup that finds the next
      // candidate. However this forces the load onto the data dependency
      // chain of s1, because matched_bytes directly depends on s1. But
      // matched_bytes is 0..7, so we can also calculate *data by
      //
      //   *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8),
      //                      matched_bytes);
      //
      // The loads no longer depend on s1 and are thus off the bottleneck.
      // The straightforward implementation on x86_64 would be to use
      //
      //   shrd rax, rdx, cl  (cl being matched_bytes * 8)
      //
      // Unfortunately shrd with a variable shift has a 4 cycle latency, so
      // this only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle
      // variable shift instruction but can only shift 64 bits. If we focus on
      // just obtaining the least significant 4 bytes, we can obtain them with
      //
      //   *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2),
      //                           UNALIGNED_LOAD64(s2 + 4) >>
      //                               ((matched_bytes & 3) * 8));
      //
      // Written like above this is not a big win: the conditional move would
      // be a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle).
      // However, matched_bytes < 4 is equal to
      // static_cast<uint32_t>(xorval) != 0. Written that way, the conditional
      // move (2 cycles) can execute in parallel with FindLSBSetNonZero64
      // (tzcnt), which takes 3 cycles.
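      //
      // Worked example (illustrative): if s1 and s2 agree in their first
      // three bytes and differ in the fourth, the lowest set bit of xorval
      // lies in bits 24..31, so shift is in [24, 31] and
      // matched_bytes = shift >> 3 = 3. The low 32 bits of xorval are
      // nonzero, so a2 keeps the load from s2, and
      // *data = a2 >> (shift & 24) = a2 >> 24 leaves the bytes starting at
      // the mismatch (s2 + 3) in the low bytes of *data.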
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
#ifndef __x86_64__
      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#else
      // Ideally this would just be
      //
      //   a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
      //
      // However clang correctly infers that the above statement participates
      // in a critical data dependency chain and thus, unfortunately, refuses
      // to use a conditional move (it's tuned to cut data dependencies). In
      // this case there is a longer parallel chain anyway AND this will be
      // fairly unpredictable.
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval));
#endif
      *data = a2 >> (shift & (3 * 8));
      return std::pair<size_t, bool>(matched_bytes, true);
    } else {
      matched = 8;
      s2 += 8;
    }
  }

  // Find out how long the match is. We loop over the data 64 bits at a
  // time until we find a 64-bit block that doesn't match; then we find
  // the first non-matching bit and use that to calculate the total
  // length of the match.
  while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1 + matched);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (a1 == a2) {
      s2 += 8;
      matched += 8;
    } else {
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
#ifndef __x86_64__
      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#else
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval));
#endif
      *data = a2 >> (shift & (3 * 8));
      matched += matched_bytes;
      assert(matched >= 8);
      return std::pair<size_t, bool>(matched, false);
    }
  }
  while (SNAPPY_PREDICT_TRUE(s2 < s2_limit)) {
    if (s1[matched] == *s2) {
      ++s2;
      ++matched;
    } else {
      if (s2 <= s2_limit - 8) {
        *data = UNALIGNED_LOAD64(s2);
      }
      return std::pair<size_t, bool>(matched, matched < 8);
    }
  }
  return std::pair<size_t, bool>(matched, matched < 8);
}
#else
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  // Implementation based on the x86-64 version, above.
  assert(s2_limit >= s2);
  int matched = 0;

  while (s2 <= s2_limit - 4 &&
         UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
    s2 += 4;
    matched += 4;
  }
  if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 4) {
    uint32_t x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
    int matching_bits = Bits::FindLSBSetNonZero(x);
    matched += matching_bits >> 3;
    s2 += matching_bits >> 3;
  } else {
    while ((s2 < s2_limit) && (s1[matched] == *s2)) {
      ++s2;
      ++matched;
    }
  }
  if (s2 <= s2_limit - 8) *data = LittleEndian::Load64(s2);
  return std::pair<size_t, bool>(matched, matched < 8);
}
#endif

// Lookup tables for decompression code. Give --snappy_dump_decompression_table
// to the unit test to recompute char_table.

enum {
  LITERAL = 0,
  COPY_1_BYTE_OFFSET = 1,  // 3 bit length + 3 bits of offset in opcode
  COPY_2_BYTE_OFFSET = 2,
  COPY_4_BYTE_OFFSET = 3
};
static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual offset.

// Data stored per entry in lookup table:
//      Range   Bits-used       Description
//      ------------------------------------
//      1..64   0..7            Literal/copy length encoded in opcode byte
//      0..7    8..10           Copy offset encoded in opcode byte / 256
//      0..4    11..13          Extra bytes after opcode
//
// We use eight bits for the length even though 7 would have sufficed,
// for efficiency reasons:
//      (1) Extracting a byte is faster than a bit-field
//      (2) It properly aligns copy offset so we do not need a <<8
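//
// For example (decoding one entry): char_table[1] == 0x0804 describes opcode
// byte 0x01, a COPY_1_BYTE_OFFSET tag: the low 8 bits (0x04) give a copy
// length of 4, bits 8..10 (0) give the upper bits of the offset, and bits
// 11..13 (1) say that one extra offset byte follows the opcode.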
static constexpr uint16_t char_table[256] = {
    // clang-format off
    0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
    0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
    0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
    0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
    0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
    0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
    0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
    0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
    0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
    0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
    0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
    0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
    0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
    0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
    0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
    0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
    0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
    0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
    0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
    0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
    0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
    0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
    0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
    0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
    0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
    0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
    0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
    0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
    0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
    0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
    0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
    0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040,
    // clang-format on
};

}  // end namespace internal
}  // end namespace snappy

#endif  // THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_