// Copyright 2008 Google Inc. All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Internals shared between the Snappy implementation and its unittest.

#ifndef THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_
#define THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_

#include "snappy-stubs-internal.h"

#if SNAPPY_HAVE_SSSE3
// Please do not replace with <x86intrin.h> or with headers that assume more
// advanced SSE versions without checking with all the OWNERS.
#include <emmintrin.h>
#include <tmmintrin.h>
#endif

#if SNAPPY_HAVE_NEON
#include <arm_neon.h>
#endif

#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
#else
#define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
#endif

namespace snappy {
namespace internal {

#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
#if SNAPPY_HAVE_SSSE3
using V128 = __m128i;
#elif SNAPPY_HAVE_NEON
using V128 = uint8x16_t;
#endif

// Load 128 bits of integer data. `src` must be 16-byte aligned.
inline V128 V128_Load(const V128* src);

// Load 128 bits of integer data. `src` does not need to be aligned.
inline V128 V128_LoadU(const V128* src);

// Store 128 bits of integer data. `dst` does not need to be aligned.
inline void V128_StoreU(V128* dst, V128 val);

// Shuffle packed 8-bit integers using a shuffle mask.
// Each packed integer in the shuffle mask must be in [0,16).
inline V128 V128_Shuffle(V128 input, V128 shuffle_mask);

// Constructs V128 with 16 chars |c|.
inline V128 V128_DupChar(char c);

#if SNAPPY_HAVE_SSSE3
inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }

inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }

inline void V128_StoreU(V128* dst, V128 val) { _mm_storeu_si128(dst, val); }

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
  return _mm_shuffle_epi8(input, shuffle_mask);
}

inline V128 V128_DupChar(char c) { return _mm_set1_epi8(c); }

#elif SNAPPY_HAVE_NEON
inline V128 V128_Load(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}

inline V128 V128_LoadU(const V128* src) {
  return vld1q_u8(reinterpret_cast<const uint8_t*>(src));
}

inline void V128_StoreU(V128* dst, V128 val) {
  vst1q_u8(reinterpret_cast<uint8_t*>(dst), val);
}

inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
  assert(vminvq_u8(shuffle_mask) >= 0 && vmaxvq_u8(shuffle_mask) <= 15);
  return vqtbl1q_u8(input, shuffle_mask);
}

inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
#endif
#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
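// Illustrative usage of the byte-shuffle wrappers above (a sketch only, not
// part of the library; `src`, `dst` and the reversal mask are hypothetical):
//
//   alignas(16) const uint8_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
//                                             7, 6, 5, 4, 3, 2, 1, 0};
//   V128 mask = V128_Load(reinterpret_cast<const V128*>(kReverse));
//   V128 bytes = V128_LoadU(reinterpret_cast<const V128*>(src));
//   // Output byte i becomes input byte mask[i], so this reverses the block.
//   V128_StoreU(reinterpret_cast<V128*>(dst), V128_Shuffle(bytes, mask));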

// Working memory performs a single allocation to hold all scratch space
// required for compression.
class WorkingMemory {
 public:
  explicit WorkingMemory(size_t input_size);
  ~WorkingMemory();

  // Allocates and clears a hash table using memory in "*this",
  // stores the number of buckets in "*table_size" and returns a pointer to
  // the base of the hash table.
  uint16_t* GetHashTable(size_t fragment_size, int* table_size) const;
  char* GetScratchInput() const { return input_; }
  char* GetScratchOutput() const { return output_; }

 private:
  char* mem_;        // the allocated memory, never nullptr
  size_t size_;      // the size of the allocated memory, never 0
  uint16_t* table_;  // the pointer to the hashtable
  char* input_;      // the pointer to the input scratch buffer
  char* output_;     // the pointer to the output scratch buffer

  // No copying
  WorkingMemory(const WorkingMemory&);
  void operator=(const WorkingMemory&);
};
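// Illustrative use (a sketch, not library code; `input_size` and
// `fragment_size` are hypothetical caller-side values):
//
//   WorkingMemory wmem(input_size);
//   int table_size;
//   uint16_t* table = wmem.GetHashTable(fragment_size, &table_size);
//   // `table` now points at `table_size` zeroed buckets inside wmem's single
//   // allocation; GetScratchInput()/GetScratchOutput() return the scratch
//   // buffers carved from that same allocation.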

// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input_length <= kBlockSize"
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input_length)" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
char* CompressFragment(const char* input,
                       size_t input_length,
                       char* op,
                       uint16_t* table,
                       const int table_size);
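// Illustrative call (a sketch, not library code; `fragment`, `fragment_size`,
// `compressed`, and the `table`/`table_size` obtained from a WorkingMemory as
// shown above are hypothetical caller-side names):
//
//   char* end = CompressFragment(fragment, fragment_size, compressed,
//                                table, table_size);
//   size_t compressed_size = end - compressed;  // bytes written to `compressed`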

// Find the largest n such that
//
//   s1[0,n-1] == s2[0,n-1]
//   and n <= (s2_limit - s2).
//
// Return make_pair(n, n < 8).
// Does not read *s2_limit or beyond.
// Does not read *(s1 + (s2_limit - s2)) or beyond.
// Requires that s2_limit >= s2.
//
// In addition, populate *data with the next 5 bytes following the end of the
// match. This is only done if 8 bytes are available (s2_limit - s2 >= 8). The
// point is that on some architectures this can be done faster in this routine
// than by a subsequent load from s2 + n.
//
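// For example (an illustrative case): with s1 = "abcdWXYZ", s2 = "abcdefgh"
// and s2_limit = s2 + 8, the first four bytes match and the fifth differs,
// so the function returns std::make_pair(4, true) (since 4 < 8).
//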
// Separate implementation for 64-bit, little-endian cpus.
#if !SNAPPY_IS_BIG_ENDIAN && \
    (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
     defined(ARCH_ARM))
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  assert(s2_limit >= s2);
  size_t matched = 0;

  // This block isn't necessary for correctness; we could just start looping
  // immediately. As an optimization though, it is useful. It creates some
  // fairly common code paths that determine, without extra effort, whether
  // the match length is less than 8. In short, we are hoping to avoid a
  // conditional branch, and perhaps get better code layout from the C++
  // compiler.
  if (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (SNAPPY_PREDICT_TRUE(a1 != a2)) {
      // This code is critical for performance. The reason is that it
      // determines how much to advance `ip` (s2). This obviously depends on
      // both the loads from the `candidate` (s1) and from `ip`. Furthermore,
      // the next `candidate` depends on the advanced `ip` calculated here
      // through a load, hash and new candidate hash lookup (a lot of cycles).
      // This makes s1 (i.e. `candidate`) the variable that limits throughput.
      // This is the reason we go through hoops to have this function update
      // `data` for the next iteration. The straightforward code would compute
      // *data as
      //
      //   *data = UNALIGNED_LOAD64(s2 + matched_bytes)  (latency of 5 cycles),
      //
      // and use it as input for the hash table lookup that finds the next
      // candidate. However this forces the load onto the data dependency
      // chain of s1, because matched_bytes directly depends on s1. But
      // matched_bytes is 0..7, so we can also calculate *data by
      //
      //   *data = AlignRight(UNALIGNED_LOAD64(s2), UNALIGNED_LOAD64(s2 + 8),
      //                      matched_bytes);
      //
      // The loads no longer depend on s1 and are thus off the bottleneck.
      // The straightforward implementation on x86_64 would be to use
      //
      //   shrd rax, rdx, cl  (cl being matched_bytes * 8)
      //
      // Unfortunately shrd with a variable shift has a 4 cycle latency, so
      // this only wins 1 cycle. The BMI2 shrx instruction is a 1 cycle
      // variable shift instruction but can only shift 64 bits. If we focus on
      // just obtaining the least significant 4 bytes, we can obtain them with
      //
      //   *data = ConditionalMove(matched_bytes < 4, UNALIGNED_LOAD64(s2),
      //                           UNALIGNED_LOAD64(s2 + 4) >>
      //                               ((matched_bytes & 3) * 8));
      //
      // Written like above this is not a big win: the conditional move would
      // be a cmp followed by a cmov (2 cycles) followed by a shift (1 cycle).
      // However, matched_bytes < 4 is equal to
      // static_cast<uint32_t>(xorval) != 0. Written that way, the conditional
      // move (2 cycles) can execute in parallel with FindLSBSetNonZero64
      // (tzcnt), which takes 3 cycles.
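      //
      // Worked example (illustrative): if s1 and s2 agree in their first
      // three bytes and differ in the fourth, the lowest set bit of xorval
      // lies in bits 24..31, so shift is in [24, 31] and
      // matched_bytes = shift >> 3 = 3. The low 32 bits of xorval are
      // nonzero, so a2 keeps the load from s2, and
      // *data = a2 >> (shift & 24) = a2 >> 24 leaves the bytes starting at
      // the mismatch (s2 + 3) in the low bytes of *data.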
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
#ifndef __x86_64__
      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#else
      // Ideally this would just be
      //
      //   a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
      //
      // However clang correctly infers that the above statement participates
      // in a critical data dependency chain and thus, unfortunately, refuses
      // to use a conditional move (it's tuned to cut data dependencies). In
      // this case there is a longer parallel chain anyway AND this will be
      // fairly unpredictable.
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval));
#endif
      *data = a2 >> (shift & (3 * 8));
      return std::pair<size_t, bool>(matched_bytes, true);
    } else {
      matched = 8;
      s2 += 8;
    }
  }

  // Find out how long the match is. We loop over the data 64 bits at a
  // time until we find a 64-bit block that doesn't match; then we find
  // the first non-matching bit and use that to calculate the total
  // length of the match.
  while (SNAPPY_PREDICT_TRUE(s2 <= s2_limit - 16)) {
    uint64_t a1 = UNALIGNED_LOAD64(s1 + matched);
    uint64_t a2 = UNALIGNED_LOAD64(s2);
    if (a1 == a2) {
      s2 += 8;
      matched += 8;
    } else {
      uint64_t xorval = a1 ^ a2;
      int shift = Bits::FindLSBSetNonZero64(xorval);
      size_t matched_bytes = shift >> 3;
      uint64_t a3 = UNALIGNED_LOAD64(s2 + 4);
#ifndef __x86_64__
      a2 = static_cast<uint32_t>(xorval) == 0 ? a3 : a2;
#else
      asm("testl %k2, %k2\n\t"
          "cmovzq %1, %0\n\t"
          : "+r"(a2)
          : "r"(a3), "r"(xorval));
#endif
      *data = a2 >> (shift & (3 * 8));
      matched += matched_bytes;
      assert(matched >= 8);
      return std::pair<size_t, bool>(matched, false);
    }
  }
  while (SNAPPY_PREDICT_TRUE(s2 < s2_limit)) {
    if (s1[matched] == *s2) {
      ++s2;
      ++matched;
    } else {
      if (s2 <= s2_limit - 8) {
        *data = UNALIGNED_LOAD64(s2);
      }
      return std::pair<size_t, bool>(matched, matched < 8);
    }
  }
  return std::pair<size_t, bool>(matched, matched < 8);
}
#else
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                      const char* s2,
                                                      const char* s2_limit,
                                                      uint64_t* data) {
  // Implementation based on the x86-64 version, above.
  assert(s2_limit >= s2);
  int matched = 0;

  while (s2 <= s2_limit - 4 &&
         UNALIGNED_LOAD32(s2) == UNALIGNED_LOAD32(s1 + matched)) {
    s2 += 4;
    matched += 4;
  }
  if (LittleEndian::IsLittleEndian() && s2 <= s2_limit - 4) {
    uint32_t x = UNALIGNED_LOAD32(s2) ^ UNALIGNED_LOAD32(s1 + matched);
    int matching_bits = Bits::FindLSBSetNonZero(x);
    matched += matching_bits >> 3;
    s2 += matching_bits >> 3;
  } else {
    while ((s2 < s2_limit) && (s1[matched] == *s2)) {
      ++s2;
      ++matched;
    }
  }
  if (s2 <= s2_limit - 8) *data = LittleEndian::Load64(s2);
  return std::pair<size_t, bool>(matched, matched < 8);
}
#endif

// Lookup tables for decompression code. Give --snappy_dump_decompression_table
// to the unit test to recompute char_table.

enum {
  LITERAL = 0,
  COPY_1_BYTE_OFFSET = 1,  // 3 bit length + 3 bits of offset in opcode
  COPY_2_BYTE_OFFSET = 2,
  COPY_4_BYTE_OFFSET = 3
};
static const int kMaximumTagLength = 5;  // COPY_4_BYTE_OFFSET plus the actual offset.

// Data stored per entry in lookup table:
//      Range   Bits-used       Description
//      ------------------------------------
//      1..64   0..7            Literal/copy length encoded in opcode byte
//      0..7    8..10           Copy offset encoded in opcode byte / 256
//      0..4    11..13          Extra bytes after opcode
//
// We use eight bits for the length even though 7 would have sufficed,
// for efficiency reasons:
//      (1) Extracting a byte is faster than a bit-field
//      (2) It properly aligns copy offset so we do not need a <<8
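//
// For example (decoding one entry): char_table[1] == 0x0804 describes opcode
// byte 0x01, a COPY_1_BYTE_OFFSET tag: the low 8 bits (0x04) give a copy
// length of 4, bits 8..10 (0) give the upper bits of the offset, and bits
// 11..13 (1) say that one extra offset byte follows the opcode.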
static constexpr uint16_t char_table[256] = {
    // clang-format off
    0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
    0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
    0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
    0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
    0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
    0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
    0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
    0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
    0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
    0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
    0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
    0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
    0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
    0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
    0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
    0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
    0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
    0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
    0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
    0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
    0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
    0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
    0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
    0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
    0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
    0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
    0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
    0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
    0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
    0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
    0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
    0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040,
    // clang-format on
};

}  // end namespace internal
}  // end namespace snappy

#endif  // THIRD_PARTY_SNAPPY_SNAPPY_INTERNAL_H_