1/*
2 * Copyright (c) Facebook, Inc. and its affiliates.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#pragma once
18
19#include <cstdint>
20#include <limits>
21
22#include <glog/logging.h>
23
24#include <folly/Portability.h>
25#include <folly/Range.h>
26#include <folly/detail/GroupVarintDetail.h>
27#include <folly/lang/Bits.h>
28#include <folly/portability/Builtins.h>
29
30#if !defined(__GNUC__) && !defined(_MSC_VER)
31#error GroupVarint.h requires GCC or MSVC
32#endif
33
34#if FOLLY_X64 || defined(__i386__) || FOLLY_PPC64 || FOLLY_AARCH64
35#define FOLLY_HAVE_GROUP_VARINT 1
36#else
37#define FOLLY_HAVE_GROUP_VARINT 0
38#endif
39
40#if FOLLY_HAVE_GROUP_VARINT
41
42#if FOLLY_SSE >= 3
43#include <nmmintrin.h>
44namespace folly {
45namespace detail {
46extern const std::array<std::array<std::uint32_t, 4>, 256> groupVarintSSEMasks;
47} // namespace detail
48} // namespace folly
49#endif
50
51namespace folly {
52namespace detail {
53extern const std::array<std::uint8_t, 256> groupVarintLengths;
54} // namespace detail
55} // namespace folly
56
57namespace folly {
58
59template <typename T>
60class GroupVarint;
61
62/**
63 * GroupVarint encoding for 32-bit values.
64 *
65 * Encodes 4 32-bit integers at once, each using 1-4 bytes depending on size.
66 * There is one byte of overhead. (The first byte contains the lengths of
67 * the four integers encoded as two bits each; 00=1 byte .. 11=4 bytes)
68 *
69 * This implementation assumes little-endian and does unaligned 32-bit
70 * accesses, so it's basically not portable outside of the x86[_64] world.
71 */
72template <>
73class GroupVarint<uint32_t> : public detail::GroupVarintBase<uint32_t> {
74 public:
75 /**
76 * Return the number of bytes used to encode these four values.
77 */
78 static size_t size(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
79 return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d);
80 }
81
82 /**
83 * Return the number of bytes used to encode four uint32_t values stored
84 * at consecutive positions in an array.
85 */
86 static size_t size(const uint32_t* p) {
87 return size(p[0], p[1], p[2], p[3]);
88 }
89
90 /**
91 * Return the number of bytes used to encode count (<= 4) values.
92 * If you clip a buffer after these many bytes, you can still decode
93 * the first "count" values correctly (if the remaining size() -
94 * partialSize() bytes are filled with garbage).
95 */
96 static size_t partialSize(const type* p, size_t count) {
97 DCHECK_LE(count, kGroupSize);
98 size_t s = kHeaderSize + count;
99 for (; count; --count, ++p) {
100 s += key(*p);
101 }
102 return s;
103 }
104
105 /**
106 * Return the number of values from *p that are valid from an encoded
107 * buffer of size bytes.
108 */
109 static size_t partialCount(const char* p, size_t size) {
110 uint8_t v = uint8_t(*p);
111 size_t s = kHeaderSize;
112 s += 1 + b0key(v);
113 if (s > size) {
114 return 0;
115 }
116 s += 1 + b1key(v);
117 if (s > size) {
118 return 1;
119 }
120 s += 1 + b2key(v);
121 if (s > size) {
122 return 2;
123 }
124 s += 1 + b3key(v);
125 if (s > size) {
126 return 3;
127 }
128 return 4;
129 }
130
131 /**
132 * Given a pointer to the beginning of an GroupVarint32-encoded block,
133 * return the number of bytes used by the encoding.
134 */
135 static size_t encodedSize(const char* p) {
136 return kHeaderSize + kGroupSize + b0key(uint8_t(*p)) + b1key(uint8_t(*p)) +
137 b2key(uint8_t(*p)) + b3key(uint8_t(*p));
138 }
139
140 /**
141 * Encode four uint32_t values into the buffer pointed-to by p, and return
142 * the next position in the buffer (that is, one character past the last
143 * encoded byte). p needs to have at least size()+4 bytes available.
144 */
145 static char* encode(char* p, uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
146 uint8_t b0key = key(a);
147 uint8_t b1key = key(b);
148 uint8_t b2key = key(c);
149 uint8_t b3key = key(d);
150 *p++ = (b3key << 6) | (b2key << 4) | (b1key << 2) | b0key;
151 storeUnaligned(p, a);
152 p += b0key + 1;
153 storeUnaligned(p, b);
154 p += b1key + 1;
155 storeUnaligned(p, c);
156 p += b2key + 1;
157 storeUnaligned(p, d);
158 p += b3key + 1;
159 return p;
160 }
161
162 /**
163 * Encode four uint32_t values from the array pointed-to by src into the
164 * buffer pointed-to by p, similar to encode(p,a,b,c,d) above.
165 */
166 static char* encode(char* p, const uint32_t* src) {
167 return encode(p, src[0], src[1], src[2], src[3]);
168 }
169
170 /**
171 * Decode four uint32_t values from a buffer, and return the next position
172 * in the buffer (that is, one character past the last encoded byte).
173 * The buffer needs to have at least 3 extra bytes available (they
174 * may be read but ignored).
175 */
176 static const char* decode_simple(
177 const char* p,
178 uint32_t* a,
179 uint32_t* b,
180 uint32_t* c,
181 uint32_t* d) {
182 size_t k = loadUnaligned<uint8_t>(p);
183 const char* end = p + detail::groupVarintLengths[k];
184 ++p;
185 size_t k0 = b0key(k);
186 *a = loadUnaligned<uint32_t>(p) & kMask[k0];
187 p += k0 + 1;
188 size_t k1 = b1key(k);
189 *b = loadUnaligned<uint32_t>(p) & kMask[k1];
190 p += k1 + 1;
191 size_t k2 = b2key(k);
192 *c = loadUnaligned<uint32_t>(p) & kMask[k2];
193 p += k2 + 1;
194 size_t k3 = b3key(k);
195 *d = loadUnaligned<uint32_t>(p) & kMask[k3];
196 // p += k3+1;
197 return end;
198 }
199
200 /**
201 * Decode four uint32_t values from a buffer and store them in the array
202 * pointed-to by dest, similar to decode(p,a,b,c,d) above.
203 */
204 static const char* decode_simple(const char* p, uint32_t* dest) {
205 return decode_simple(p, dest, dest + 1, dest + 2, dest + 3);
206 }
207
208#if FOLLY_SSE >= 3
209 /**
210 * Just like the non-SSSE3 decode below, but with the additional constraint
211 * that we must be able to read at least 17 bytes from the input pointer, p.
212 */
213 static const char* decode(const char* p, uint32_t* dest) {
214 uint8_t key = uint8_t(p[0]);
215 __m128i val = _mm_loadu_si128((const __m128i*)(p + 1));
216 __m128i mask =
217 _mm_load_si128((const __m128i*)detail::groupVarintSSEMasks[key].data());
218 __m128i r = _mm_shuffle_epi8(val, mask);
219 _mm_storeu_si128((__m128i*)dest, r);
220 return p + detail::groupVarintLengths[key];
221 }
222
223 /**
224 * Just like decode_simple, but with the additional constraint that
225 * we must be able to read at least 17 bytes from the input pointer, p.
226 */
227 static const char*
228 decode(const char* p, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) {
229 uint8_t key = uint8_t(p[0]);
230 __m128i val = _mm_loadu_si128((const __m128i*)(p + 1));
231 __m128i mask =
232 _mm_load_si128((const __m128i*)detail::groupVarintSSEMasks[key].data());
233 __m128i r = _mm_shuffle_epi8(val, mask);
234
235 // Extracting 32 bits at a time out of an XMM register is a SSE4 feature
236#if FOLLY_SSE >= 4
237 *a = uint32_t(_mm_extract_epi32(r, 0));
238 *b = uint32_t(_mm_extract_epi32(r, 1));
239 *c = uint32_t(_mm_extract_epi32(r, 2));
240 *d = uint32_t(_mm_extract_epi32(r, 3));
241#else /* !__SSE4__ */
242 *a = _mm_extract_epi16(r, 0) + (_mm_extract_epi16(r, 1) << 16);
243 *b = _mm_extract_epi16(r, 2) + (_mm_extract_epi16(r, 3) << 16);
244 *c = _mm_extract_epi16(r, 4) + (_mm_extract_epi16(r, 5) << 16);
245 *d = _mm_extract_epi16(r, 6) + (_mm_extract_epi16(r, 7) << 16);
246#endif /* __SSE4__ */
247
248 return p + detail::groupVarintLengths[key];
249 }
250
251#else /* !__SSSE3__ */
252 static const char*
253 decode(const char* p, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) {
254 return decode_simple(p, a, b, c, d);
255 }
256
257 static const char* decode(const char* p, uint32_t* dest) {
258 return decode_simple(p, dest);
259 }
260#endif /* __SSSE3__ */
261
262 private:
263 static uint8_t key(uint32_t x) {
264 // __builtin_clz is undefined for the x==0 case
265 return uint8_t(3 - (__builtin_clz(x | 1) / 8));
266 }
267 static size_t b0key(size_t x) {
268 return x & 3;
269 }
270 static size_t b1key(size_t x) {
271 return (x >> 2) & 3;
272 }
273 static size_t b2key(size_t x) {
274 return (x >> 4) & 3;
275 }
276 static size_t b3key(size_t x) {
277 return (x >> 6) & 3;
278 }
279
280 static const uint32_t kMask[];
281};
282
283/**
284 * GroupVarint encoding for 64-bit values.
285 *
286 * Encodes 5 64-bit integers at once, each using 1-8 bytes depending on size.
287 * There are two bytes of overhead. (The first two bytes contain the lengths
288 * of the five integers encoded as three bits each; 000=1 byte .. 111 = 8 bytes)
289 *
290 * This implementation assumes little-endian and does unaligned 64-bit
291 * accesses, so it's basically not portable outside of the x86[_64] world.
292 */
293template <>
294class GroupVarint<uint64_t> : public detail::GroupVarintBase<uint64_t> {
295 public:
296 /**
297 * Return the number of bytes used to encode these five values.
298 */
299 static size_t
300 size(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) {
301 return kHeaderSize + kGroupSize + key(a) + key(b) + key(c) + key(d) +
302 key(e);
303 }
304
305 /**
306 * Return the number of bytes used to encode five uint64_t values stored
307 * at consecutive positions in an array.
308 */
309 static size_t size(const uint64_t* p) {
310 return size(p[0], p[1], p[2], p[3], p[4]);
311 }
312
313 /**
314 * Return the number of bytes used to encode count (<= 4) values.
315 * If you clip a buffer after these many bytes, you can still decode
316 * the first "count" values correctly (if the remaining size() -
317 * partialSize() bytes are filled with garbage).
318 */
319 static size_t partialSize(const type* p, size_t count) {
320 DCHECK_LE(count, kGroupSize);
321 size_t s = kHeaderSize + count;
322 for (; count; --count, ++p) {
323 s += key(*p);
324 }
325 return s;
326 }
327
328 /**
329 * Return the number of values from *p that are valid from an encoded
330 * buffer of size bytes.
331 */
332 static size_t partialCount(const char* p, size_t size) {
333 uint16_t v = loadUnaligned<uint16_t>(p);
334 size_t s = kHeaderSize;
335 s += 1 + b0key(v);
336 if (s > size) {
337 return 0;
338 }
339 s += 1 + b1key(v);
340 if (s > size) {
341 return 1;
342 }
343 s += 1 + b2key(v);
344 if (s > size) {
345 return 2;
346 }
347 s += 1 + b3key(v);
348 if (s > size) {
349 return 3;
350 }
351 s += 1 + b4key(v);
352 if (s > size) {
353 return 4;
354 }
355 return 5;
356 }
357
358 /**
359 * Given a pointer to the beginning of an GroupVarint64-encoded block,
360 * return the number of bytes used by the encoding.
361 */
362 static size_t encodedSize(const char* p) {
363 uint16_t n = loadUnaligned<uint16_t>(p);
364 return kHeaderSize + kGroupSize + b0key(n) + b1key(n) + b2key(n) +
365 b3key(n) + b4key(n);
366 }
367
368 /**
369 * Encode five uint64_t values into the buffer pointed-to by p, and return
370 * the next position in the buffer (that is, one character past the last
371 * encoded byte). p needs to have at least size()+8 bytes available.
372 */
373 static char*
374 encode(char* p, uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e) {
375 uint16_t b0key = key(a);
376 uint16_t b1key = key(b);
377 uint16_t b2key = key(c);
378 uint16_t b3key = key(d);
379 uint16_t b4key = key(e);
380 storeUnaligned<uint16_t>(
381 p,
382 uint16_t(
383 (b4key << 12) | (b3key << 9) | (b2key << 6) | (b1key << 3) |
384 b0key));
385 p += 2;
386 storeUnaligned(p, a);
387 p += b0key + 1;
388 storeUnaligned(p, b);
389 p += b1key + 1;
390 storeUnaligned(p, c);
391 p += b2key + 1;
392 storeUnaligned(p, d);
393 p += b3key + 1;
394 storeUnaligned(p, e);
395 p += b4key + 1;
396 return p;
397 }
398
399 /**
400 * Encode five uint64_t values from the array pointed-to by src into the
401 * buffer pointed-to by p, similar to encode(p,a,b,c,d,e) above.
402 */
403 static char* encode(char* p, const uint64_t* src) {
404 return encode(p, src[0], src[1], src[2], src[3], src[4]);
405 }
406
407 /**
408 * Decode five uint64_t values from a buffer, and return the next position
409 * in the buffer (that is, one character past the last encoded byte).
410 * The buffer needs to have at least 7 bytes available (they may be read
411 * but ignored).
412 */
413 static const char* decode(
414 const char* p,
415 uint64_t* a,
416 uint64_t* b,
417 uint64_t* c,
418 uint64_t* d,
419 uint64_t* e) {
420 uint16_t k = loadUnaligned<uint16_t>(p);
421 p += 2;
422 uint8_t k0 = b0key(k);
423 *a = loadUnaligned<uint64_t>(p) & kMask[k0];
424 p += k0 + 1;
425 uint8_t k1 = b1key(k);
426 *b = loadUnaligned<uint64_t>(p) & kMask[k1];
427 p += k1 + 1;
428 uint8_t k2 = b2key(k);
429 *c = loadUnaligned<uint64_t>(p) & kMask[k2];
430 p += k2 + 1;
431 uint8_t k3 = b3key(k);
432 *d = loadUnaligned<uint64_t>(p) & kMask[k3];
433 p += k3 + 1;
434 uint8_t k4 = b4key(k);
435 *e = loadUnaligned<uint64_t>(p) & kMask[k4];
436 p += k4 + 1;
437 return p;
438 }
439
440 /**
441 * Decode five uint64_t values from a buffer and store them in the array
442 * pointed-to by dest, similar to decode(p,a,b,c,d,e) above.
443 */
444 static const char* decode(const char* p, uint64_t* dest) {
445 return decode(p, dest, dest + 1, dest + 2, dest + 3, dest + 4);
446 }
447
448 private:
449 enum { kHeaderBytes = 2 };
450
451 static uint8_t key(uint64_t x) {
452 // __builtin_clzll is undefined for the x==0 case
453 return uint8_t(7 - (__builtin_clzll(x | 1) / 8));
454 }
455
456 static uint8_t b0key(uint16_t x) {
457 return x & 7u;
458 }
459 static uint8_t b1key(uint16_t x) {
460 return (x >> 3) & 7u;
461 }
462 static uint8_t b2key(uint16_t x) {
463 return (x >> 6) & 7u;
464 }
465 static uint8_t b3key(uint16_t x) {
466 return (x >> 9) & 7u;
467 }
468 static uint8_t b4key(uint16_t x) {
469 return (x >> 12) & 7u;
470 }
471
472 static const uint64_t kMask[];
473};
474
475typedef GroupVarint<uint32_t> GroupVarint32;
476typedef GroupVarint<uint64_t> GroupVarint64;
477
478/**
479 * Simplify use of GroupVarint* for the case where data is available one
480 * entry at a time (instead of one group at a time). Handles buffering
481 * and an incomplete last chunk.
482 *
483 * Output is a function object that accepts character ranges:
484 * out(StringPiece) appends the given character range to the output.
485 */
486template <class T, class Output>
487class GroupVarintEncoder {
488 public:
489 typedef GroupVarint<T> Base;
490 typedef T type;
491
492 explicit GroupVarintEncoder(Output out) : out_(out), count_(0) {}
493
494 ~GroupVarintEncoder() {
495 finish();
496 }
497
498 /**
499 * Add a value to the encoder.
500 */
501 void add(type val) {
502 buf_[count_++] = val;
503 if (count_ == Base::kGroupSize) {
504 char* p = Base::encode(tmp_, buf_);
505 out_(StringPiece(tmp_, p));
506 count_ = 0;
507 }
508 }
509
510 /**
511 * Finish encoding, flushing any buffered values if necessary.
512 * After finish(), the encoder is immediately ready to encode more data
513 * to the same output.
514 */
515 void finish() {
516 if (count_) {
517 // This is not strictly necessary, but it makes testing easy;
518 // uninitialized bytes are guaranteed to be recorded as taking one byte
519 // (not more).
520 for (size_t i = count_; i < Base::kGroupSize; i++) {
521 buf_[i] = 0;
522 }
523 Base::encode(tmp_, buf_);
524 out_(StringPiece(tmp_, Base::partialSize(buf_, count_)));
525 count_ = 0;
526 }
527 }
528
529 /**
530 * Return the appender that was used.
531 */
532 Output& output() {
533 return out_;
534 }
535 const Output& output() const {
536 return out_;
537 }
538
539 /**
540 * Reset the encoder, disregarding any state (except what was already
541 * flushed to the output, of course).
542 */
543 void clear() {
544 count_ = 0;
545 }
546
547 private:
548 Output out_;
549 char tmp_[Base::kMaxSize];
550 type buf_[Base::kGroupSize];
551 size_t count_;
552};
553
554/**
555 * Simplify use of GroupVarint* for the case where the last group in the
556 * input may be incomplete (but the exact size of the input is known).
557 * Allows for extracting values one at a time.
558 */
559template <typename T>
560class GroupVarintDecoder {
561 public:
562 typedef GroupVarint<T> Base;
563 typedef T type;
564
565 GroupVarintDecoder() = default;
566
567 explicit GroupVarintDecoder(StringPiece data, size_t maxCount = (size_t)-1)
568 : rrest_(data.end()),
569 p_(data.data()),
570 end_(data.end()),
571 limit_(end_),
572 pos_(0),
573 count_(0),
574 remaining_(maxCount) {}
575
576 void reset(StringPiece data, size_t maxCount = (size_t)-1) {
577 rrest_ = data.end();
578 p_ = data.data();
579 end_ = data.end();
580 limit_ = end_;
581 pos_ = 0;
582 count_ = 0;
583 remaining_ = maxCount;
584 }
585
586 /**
587 * Read and return the next value.
588 */
589 bool next(type* val) {
590 if (pos_ == count_) {
591 // refill
592 size_t rem = size_t(end_ - p_);
593 if (rem == 0 || remaining_ == 0) {
594 return false;
595 }
596 // next() attempts to read one full group at a time, and so we must have
597 // at least enough bytes readable after its end to handle the case if the
598 // last group is full.
599 //
600 // The best way to ensure this is to ensure that data has at least
601 // Base::kMaxSize - 1 bytes readable *after* the end, otherwise we'll copy
602 // into a temporary buffer.
603 if (limit_ - p_ < Base::kMaxSize) {
604 memcpy(tmp_, p_, rem);
605 p_ = tmp_;
606 end_ = p_ + rem;
607 limit_ = tmp_ + sizeof(tmp_);
608 }
609 pos_ = 0;
610 const char* n = Base::decode(p_, buf_);
611 if (n <= end_) {
612 // Full group could be decoded
613 if (remaining_ >= Base::kGroupSize) {
614 remaining_ -= Base::kGroupSize;
615 count_ = Base::kGroupSize;
616 p_ = n;
617 } else {
618 count_ = remaining_;
619 remaining_ = 0;
620 p_ += Base::partialSize(buf_, count_);
621 }
622 } else {
623 // Can't decode a full group
624 count_ = Base::partialCount(p_, size_t(end_ - p_));
625 if (remaining_ >= count_) {
626 remaining_ -= count_;
627 p_ = end_;
628 } else {
629 count_ = remaining_;
630 remaining_ = 0;
631 p_ += Base::partialSize(buf_, count_);
632 }
633 if (count_ == 0) {
634 return false;
635 }
636 }
637 }
638 *val = buf_[pos_++];
639 return true;
640 }
641
642 StringPiece rest() const {
643 // This is only valid after next() returned false
644 CHECK(pos_ == count_ && (p_ == end_ || remaining_ == 0));
645 // p_ may point to the internal buffer (tmp_), but we want
646 // to return subpiece of the original data
647 size_t size = size_t(end_ - p_);
648 return StringPiece(rrest_ - size, rrest_);
649 }
650
651 private:
652 const char* rrest_;
653 const char* p_;
654 const char* end_;
655 const char* limit_;
656 char tmp_[2 * Base::kMaxSize];
657 type buf_[Base::kGroupSize];
658 size_t pos_;
659 size_t count_;
660 size_t remaining_;
661};
662
663typedef GroupVarintDecoder<uint32_t> GroupVarint32Decoder;
664typedef GroupVarintDecoder<uint64_t> GroupVarint64Decoder;
665
666} // namespace folly
667
668#endif // FOLLY_HAVE_GROUP_VARINT
669