1 | // Copyright 2020 Google Inc. All Rights Reserved. |
2 | // |
3 | // Redistribution and use in source and binary forms, with or without |
4 | // modification, are permitted provided that the following conditions are |
5 | // met: |
6 | // |
7 | // * Redistributions of source code must retain the above copyright |
8 | // notice, this list of conditions and the following disclaimer. |
9 | // * Redistributions in binary form must reproduce the above |
10 | // copyright notice, this list of conditions and the following disclaimer |
11 | // in the documentation and/or other materials provided with the |
12 | // distribution. |
13 | // * Neither the name of Google Inc. nor the names of its |
14 | // contributors may be used to endorse or promote products derived from |
15 | // this software without specific prior written permission. |
16 | // |
17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
18 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
19 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
20 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
21 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
22 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
23 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
24 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
25 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
26 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
27 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
28 | |
29 | #include <algorithm> |
30 | #include <cmath> |
31 | #include <cstdint> |
32 | #include <cstdlib> |
33 | #include <random> |
34 | #include <string> |
35 | #include <utility> |
36 | #include <vector> |
37 | |
38 | #include "snappy-test.h" |
39 | |
40 | #include "snappy-internal.h" |
41 | #include "snappy-sinksource.h" |
42 | #include "snappy.h" |
43 | #include "snappy_test_data.h" |
44 | |
45 | SNAPPY_FLAG(int32_t, start_len, -1, |
46 | "Starting prefix size for testing (-1: just full file contents)" ); |
47 | SNAPPY_FLAG(int32_t, end_len, -1, |
48 | "Starting prefix size for testing (-1: just full file contents)" ); |
49 | SNAPPY_FLAG(int32_t, bytes, 10485760, |
50 | "How many bytes to compress/uncompress per file for timing" ); |
51 | |
52 | SNAPPY_FLAG(bool, zlib, true, |
53 | "Run zlib compression (http://www.zlib.net)" ); |
54 | SNAPPY_FLAG(bool, lzo, true, |
55 | "Run LZO compression (http://www.oberhumer.com/opensource/lzo/)" ); |
56 | SNAPPY_FLAG(bool, lz4, true, |
57 | "Run LZ4 compression (https://github.com/lz4/lz4)" ); |
58 | SNAPPY_FLAG(bool, snappy, true, "Run snappy compression" ); |
59 | |
60 | SNAPPY_FLAG(bool, write_compressed, false, |
61 | "Write compressed versions of each file to <file>.comp" ); |
62 | SNAPPY_FLAG(bool, write_uncompressed, false, |
63 | "Write uncompressed versions of each file to <file>.uncomp" ); |
64 | |
65 | namespace snappy { |
66 | |
67 | namespace { |
68 | |
69 | #if HAVE_FUNC_MMAP && HAVE_FUNC_SYSCONF |
70 | |
71 | // To test against code that reads beyond its input, this class copies a |
72 | // string to a newly allocated group of pages, the last of which |
73 | // is made unreadable via mprotect. Note that we need to allocate the |
74 | // memory with mmap(), as POSIX allows mprotect() only on memory allocated |
75 | // with mmap(), and some malloc/posix_memalign implementations expect to |
76 | // be able to read previously allocated memory while doing heap allocations. |
77 | class DataEndingAtUnreadablePage { |
78 | public: |
79 | explicit DataEndingAtUnreadablePage(const std::string& s) { |
80 | const size_t page_size = sysconf(_SC_PAGESIZE); |
81 | const size_t size = s.size(); |
82 | // Round up space for string to a multiple of page_size. |
83 | size_t space_for_string = (size + page_size - 1) & ~(page_size - 1); |
84 | alloc_size_ = space_for_string + page_size; |
85 | mem_ = mmap(NULL, alloc_size_, |
86 | PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); |
87 | CHECK_NE(MAP_FAILED, mem_); |
88 | protected_page_ = reinterpret_cast<char*>(mem_) + space_for_string; |
89 | char* dst = protected_page_ - size; |
90 | std::memcpy(dst, s.data(), size); |
91 | data_ = dst; |
92 | size_ = size; |
93 | // Make guard page unreadable. |
94 | CHECK_EQ(0, mprotect(protected_page_, page_size, PROT_NONE)); |
95 | } |
96 | |
97 | ~DataEndingAtUnreadablePage() { |
98 | const size_t page_size = sysconf(_SC_PAGESIZE); |
99 | // Undo the mprotect. |
100 | CHECK_EQ(0, mprotect(protected_page_, page_size, PROT_READ|PROT_WRITE)); |
101 | CHECK_EQ(0, munmap(mem_, alloc_size_)); |
102 | } |
103 | |
104 | const char* data() const { return data_; } |
105 | size_t size() const { return size_; } |
106 | |
107 | private: |
108 | size_t alloc_size_; |
109 | void* mem_; |
110 | char* protected_page_; |
111 | const char* data_; |
112 | size_t size_; |
113 | }; |
114 | |
115 | #else // HAVE_FUNC_MMAP && HAVE_FUNC_SYSCONF |
116 | |
117 | // Fallback for systems without mmap. |
118 | using DataEndingAtUnreadablePage = std::string; |
119 | |
120 | #endif |
121 | |
// Compressors this tool can benchmark. The order here must stay in sync with
// the `names` table below, which is indexed by CompressorType.
enum CompressorType { ZLIB, LZO, LZ4, SNAPPY };

// Human-readable compressor names, indexed by CompressorType.
const char* names[] = {"ZLIB", "LZO", "LZ4", "SNAPPY"};
125 | |
// Returns a worst-case output buffer size sufficient to compress `input_size`
// bytes with compressor `comp`.
//
// NOTE(review): if `comp` names a library that was not compiled in, this
// LOG(FATAL)s rather than returning — callers that want a graceful skip
// (e.g. Measure()) rely on the trial Compress() call being reached; verify
// no caller hits this before that check.
size_t MinimumRequiredOutputSpace(size_t input_size, CompressorType comp) {
  switch (comp) {
#ifdef ZLIB_VERSION
    case ZLIB:
      return ZLib::MinCompressbufSize(input_size);
#endif // ZLIB_VERSION

#ifdef LZO_VERSION
    case LZO:
      // Worst-case expansion bound for LZO1X.
      return input_size + input_size/64 + 16 + 3;
#endif // LZO_VERSION

#ifdef LZ4_VERSION_NUMBER
    case LZ4:
      return LZ4_compressBound(input_size);
#endif // LZ4_VERSION_NUMBER

    case SNAPPY:
      return snappy::MaxCompressedLength(input_size);

    default:
      LOG(FATAL) << "Unknown compression type number " << comp;
      return 0;
  }
}
151 | |
152 | // Returns true if we successfully compressed, false otherwise. |
153 | // |
154 | // If compressed_is_preallocated is set, do not resize the compressed buffer. |
155 | // This is typically what you want for a benchmark, in order to not spend |
156 | // time in the memory allocator. If you do set this flag, however, |
157 | // "compressed" must be preinitialized to at least MinCompressbufSize(comp) |
158 | // number of bytes, and may contain junk bytes at the end after return. |
159 | bool Compress(const char* input, size_t input_size, CompressorType comp, |
160 | std::string* compressed, bool compressed_is_preallocated) { |
161 | if (!compressed_is_preallocated) { |
162 | compressed->resize(MinimumRequiredOutputSpace(input_size, comp)); |
163 | } |
164 | |
165 | switch (comp) { |
166 | #ifdef ZLIB_VERSION |
167 | case ZLIB: { |
168 | ZLib zlib; |
169 | uLongf destlen = compressed->size(); |
170 | int ret = zlib.Compress( |
171 | reinterpret_cast<Bytef*>(string_as_array(compressed)), |
172 | &destlen, |
173 | reinterpret_cast<const Bytef*>(input), |
174 | input_size); |
175 | CHECK_EQ(Z_OK, ret); |
176 | if (!compressed_is_preallocated) { |
177 | compressed->resize(destlen); |
178 | } |
179 | return true; |
180 | } |
181 | #endif // ZLIB_VERSION |
182 | |
183 | #ifdef LZO_VERSION |
184 | case LZO: { |
185 | unsigned char* mem = new unsigned char[LZO1X_1_15_MEM_COMPRESS]; |
186 | lzo_uint destlen; |
187 | int ret = lzo1x_1_15_compress( |
188 | reinterpret_cast<const uint8_t*>(input), |
189 | input_size, |
190 | reinterpret_cast<uint8_t*>(string_as_array(compressed)), |
191 | &destlen, |
192 | mem); |
193 | CHECK_EQ(LZO_E_OK, ret); |
194 | delete[] mem; |
195 | if (!compressed_is_preallocated) { |
196 | compressed->resize(destlen); |
197 | } |
198 | break; |
199 | } |
200 | #endif // LZO_VERSION |
201 | |
202 | #ifdef LZ4_VERSION_NUMBER |
203 | case LZ4: { |
204 | int destlen = compressed->size(); |
205 | destlen = LZ4_compress_default(input, string_as_array(compressed), |
206 | input_size, destlen); |
207 | CHECK_NE(destlen, 0); |
208 | if (!compressed_is_preallocated) { |
209 | compressed->resize(destlen); |
210 | } |
211 | break; |
212 | } |
213 | #endif // LZ4_VERSION_NUMBER |
214 | |
215 | case SNAPPY: { |
216 | size_t destlen; |
217 | snappy::RawCompress(input, input_size, |
218 | string_as_array(compressed), |
219 | &destlen); |
220 | CHECK_LE(destlen, snappy::MaxCompressedLength(input_size)); |
221 | if (!compressed_is_preallocated) { |
222 | compressed->resize(destlen); |
223 | } |
224 | break; |
225 | } |
226 | |
227 | default: { |
228 | return false; // the asked-for library wasn't compiled in |
229 | } |
230 | } |
231 | return true; |
232 | } |
233 | |
234 | bool Uncompress(const std::string& compressed, CompressorType comp, int size, |
235 | std::string* output) { |
236 | // TODO: Switch to [[maybe_unused]] when we can assume C++17. |
237 | (void)size; |
238 | switch (comp) { |
239 | #ifdef ZLIB_VERSION |
240 | case ZLIB: { |
241 | output->resize(size); |
242 | ZLib zlib; |
243 | uLongf destlen = output->size(); |
244 | int ret = zlib.Uncompress( |
245 | reinterpret_cast<Bytef*>(string_as_array(output)), |
246 | &destlen, |
247 | reinterpret_cast<const Bytef*>(compressed.data()), |
248 | compressed.size()); |
249 | CHECK_EQ(Z_OK, ret); |
250 | CHECK_EQ(static_cast<uLongf>(size), destlen); |
251 | break; |
252 | } |
253 | #endif // ZLIB_VERSION |
254 | |
255 | #ifdef LZO_VERSION |
256 | case LZO: { |
257 | output->resize(size); |
258 | lzo_uint destlen; |
259 | int ret = lzo1x_decompress( |
260 | reinterpret_cast<const uint8_t*>(compressed.data()), |
261 | compressed.size(), |
262 | reinterpret_cast<uint8_t*>(string_as_array(output)), |
263 | &destlen, |
264 | NULL); |
265 | CHECK_EQ(LZO_E_OK, ret); |
266 | CHECK_EQ(static_cast<lzo_uint>(size), destlen); |
267 | break; |
268 | } |
269 | #endif // LZO_VERSION |
270 | |
271 | #ifdef LZ4_VERSION_NUMBER |
272 | case LZ4: { |
273 | output->resize(size); |
274 | int destlen = output->size(); |
275 | destlen = LZ4_decompress_safe(compressed.data(), string_as_array(output), |
276 | compressed.size(), destlen); |
277 | CHECK_NE(destlen, 0); |
278 | CHECK_EQ(size, destlen); |
279 | break; |
280 | } |
281 | #endif // LZ4_VERSION_NUMBER |
282 | case SNAPPY: { |
283 | snappy::RawUncompress(compressed.data(), compressed.size(), |
284 | string_as_array(output)); |
285 | break; |
286 | } |
287 | |
288 | default: { |
289 | return false; // the asked-for library wasn't compiled in |
290 | } |
291 | } |
292 | return true; |
293 | } |
294 | |
// Benchmarks compressor `comp` over data[0, length), chopped into
// block_size-byte blocks. Each timed run compresses/uncompresses every block
// `repeats` times; the median of kRuns runs is reported, along with the
// compression ratio, via std::printf. Silently skips (with a WARNING) if the
// requested library wasn't compiled in.
void Measure(const char* data, size_t length, CompressorType comp, int repeats,
             int block_size) {
  // Run tests a few time and pick median running times
  static const int kRuns = 5;
  double ctime[kRuns];  // per-run compression wall time
  double utime[kRuns];  // per-run uncompression wall time
  int compressed_size = 0;

  {
    // Chop the input into blocks
    int num_blocks = (length + block_size - 1) / block_size;
    std::vector<const char*> input(num_blocks);
    std::vector<size_t> input_length(num_blocks);
    std::vector<std::string> compressed(num_blocks);
    std::vector<std::string> output(num_blocks);
    for (int b = 0; b < num_blocks; ++b) {
      int input_start = b * block_size;
      // The final block may be shorter than block_size.
      int input_limit = std::min<int>((b+1)*block_size, length);
      input[b] = data+input_start;
      input_length[b] = input_limit-input_start;
    }

    // Pre-grow the output buffers so we don't measure string append time.
    for (std::string& compressed_block : compressed) {
      compressed_block.resize(MinimumRequiredOutputSpace(block_size, comp));
    }

    // First, try one trial compression to make sure the code is compiled in
    if (!Compress(input[0], input_length[0], comp, &compressed[0], true)) {
      LOG(WARNING) << "Skipping " << names[comp] << ": "
                   << "library not compiled in";
      return;
    }

    for (int run = 0; run < kRuns; ++run) {
      CycleTimer ctimer, utimer;

      // Pre-grow the output buffers so we don't measure string append time.
      // (Redone every run because the final resizing pass below shrinks them.)
      for (std::string& compressed_block : compressed) {
        compressed_block.resize(MinimumRequiredOutputSpace(block_size, comp));
      }

      // Timed compression pass: preallocated buffers, no resizing.
      ctimer.Start();
      for (int b = 0; b < num_blocks; ++b) {
        for (int i = 0; i < repeats; ++i)
          Compress(input[b], input_length[b], comp, &compressed[b], true);
      }
      ctimer.Stop();

      // Compress once more, with resizing, so we don't leave junk
      // at the end that will confuse the decompressor.
      for (int b = 0; b < num_blocks; ++b) {
        Compress(input[b], input_length[b], comp, &compressed[b], false);
      }

      // Pre-size the uncompression destinations outside the timed region.
      for (int b = 0; b < num_blocks; ++b) {
        output[b].resize(input_length[b]);
      }

      // Timed uncompression pass.
      utimer.Start();
      for (int i = 0; i < repeats; ++i) {
        for (int b = 0; b < num_blocks; ++b)
          Uncompress(compressed[b], comp, input_length[b], &output[b]);
      }
      utimer.Stop();

      ctime[run] = ctimer.Get();
      utime[run] = utimer.Get();
    }

    // Total compressed size across blocks (from the final resized pass),
    // used for the ratio printed below.
    compressed_size = 0;
    for (const std::string& compressed_item : compressed) {
      compressed_size += compressed_item.size();
    }
  }

  // Report the median of the kRuns timings.
  std::sort(ctime, ctime + kRuns);
  std::sort(utime, utime + kRuns);
  const int med = kRuns/2;

  // Throughput in MB/s; each of the `repeats` passes processed `length` bytes.
  float comp_rate = (length / ctime[med]) * repeats / 1048576.0;
  float uncomp_rate = (length / utime[med]) * repeats / 1048576.0;
  std::string x = names[comp];
  x += ":";
  std::string urate = (uncomp_rate >= 0) ? StrFormat("%.1f", uncomp_rate)
                                         : std::string("?");
  std::printf("%-7s [b %dM] bytes %6d -> %6d %4.1f%% "
              "comp %5.1f MB/s uncomp %5s MB/s\n",
              x.c_str(),
              block_size/(1<<20),
              static_cast<int>(length), static_cast<uint32_t>(compressed_size),
              (compressed_size * 100.0) / std::max<int>(1, length),
              comp_rate,
              urate.c_str());
}
390 | |
391 | void CompressFile(const char* fname) { |
392 | std::string fullinput; |
393 | CHECK_OK(file::GetContents(fname, &fullinput, file::Defaults())); |
394 | |
395 | std::string compressed; |
396 | Compress(fullinput.data(), fullinput.size(), SNAPPY, &compressed, false); |
397 | |
398 | CHECK_OK(file::SetContents(std::string(fname).append(".comp" ), compressed, |
399 | file::Defaults())); |
400 | } |
401 | |
402 | void UncompressFile(const char* fname) { |
403 | std::string fullinput; |
404 | CHECK_OK(file::GetContents(fname, &fullinput, file::Defaults())); |
405 | |
406 | size_t uncompLength; |
407 | CHECK(snappy::GetUncompressedLength(fullinput.data(), fullinput.size(), |
408 | &uncompLength)); |
409 | |
410 | std::string uncompressed; |
411 | uncompressed.resize(uncompLength); |
412 | CHECK(snappy::Uncompress(fullinput.data(), fullinput.size(), &uncompressed)); |
413 | |
414 | CHECK_OK(file::SetContents(std::string(fname).append(".uncomp" ), uncompressed, |
415 | file::Defaults())); |
416 | } |
417 | |
418 | void MeasureFile(const char* fname) { |
419 | std::string fullinput; |
420 | CHECK_OK(file::GetContents(fname, &fullinput, file::Defaults())); |
421 | std::printf("%-40s :\n" , fname); |
422 | |
423 | int start_len = (snappy::GetFlag(FLAGS_start_len) < 0) |
424 | ? fullinput.size() |
425 | : snappy::GetFlag(FLAGS_start_len); |
426 | int end_len = fullinput.size(); |
427 | if (snappy::GetFlag(FLAGS_end_len) >= 0) { |
428 | end_len = std::min<int>(fullinput.size(), snappy::GetFlag(FLAGS_end_len)); |
429 | } |
430 | for (int len = start_len; len <= end_len; ++len) { |
431 | const char* const input = fullinput.data(); |
432 | int repeats = (snappy::GetFlag(FLAGS_bytes) + len) / (len + 1); |
433 | if (snappy::GetFlag(FLAGS_zlib)) |
434 | Measure(input, len, ZLIB, repeats, 1024 << 10); |
435 | if (snappy::GetFlag(FLAGS_lzo)) |
436 | Measure(input, len, LZO, repeats, 1024 << 10); |
437 | if (snappy::GetFlag(FLAGS_lz4)) |
438 | Measure(input, len, LZ4, repeats, 1024 << 10); |
439 | if (snappy::GetFlag(FLAGS_snappy)) |
440 | Measure(input, len, SNAPPY, repeats, 4096 << 10); |
441 | |
442 | // For block-size based measurements |
443 | if (0 && snappy::GetFlag(FLAGS_snappy)) { |
444 | Measure(input, len, SNAPPY, repeats, 8<<10); |
445 | Measure(input, len, SNAPPY, repeats, 16<<10); |
446 | Measure(input, len, SNAPPY, repeats, 32<<10); |
447 | Measure(input, len, SNAPPY, repeats, 64<<10); |
448 | Measure(input, len, SNAPPY, repeats, 256<<10); |
449 | Measure(input, len, SNAPPY, repeats, 1024<<10); |
450 | } |
451 | } |
452 | } |
453 | |
454 | } // namespace |
455 | |
456 | } // namespace snappy |
457 | |
458 | int main(int argc, char** argv) { |
459 | InitGoogle(argv[0], &argc, &argv, true); |
460 | |
461 | for (int arg = 1; arg < argc; ++arg) { |
462 | if (snappy::GetFlag(FLAGS_write_compressed)) { |
463 | snappy::CompressFile(argv[arg]); |
464 | } else if (snappy::GetFlag(FLAGS_write_uncompressed)) { |
465 | snappy::UncompressFile(argv[arg]); |
466 | } else { |
467 | snappy::MeasureFile(argv[arg]); |
468 | } |
469 | } |
470 | return 0; |
471 | } |
472 | |