1 | /* |
2 | * Copyright (c) Facebook, Inc. and its affiliates. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #include <folly/json.h> |
18 | |
19 | #include <algorithm> |
20 | #include <functional> |
21 | #include <iterator> |
22 | #include <type_traits> |
23 | |
24 | #include <boost/algorithm/string.hpp> |
25 | #include <glog/logging.h> |
26 | |
27 | #include <folly/Conv.h> |
28 | #include <folly/Portability.h> |
29 | #include <folly/Range.h> |
30 | #include <folly/String.h> |
31 | #include <folly/Unicode.h> |
32 | #include <folly/lang/Bits.h> |
33 | #include <folly/portability/Constexpr.h> |
34 | |
35 | namespace folly { |
36 | |
37 | ////////////////////////////////////////////////////////////////////// |
38 | |
39 | namespace json { |
40 | |
41 | namespace { |
42 | |
43 | parse_error make_parse_error( |
44 | unsigned int line, |
45 | std::string const& context, |
46 | std::string const& expected) { |
47 | return parse_error(to<std::string>( |
48 | "json parse error on line " , |
49 | line, |
50 | !context.empty() ? to<std::string>(" near `" , context, '\'') : "" , |
51 | ": " , |
52 | expected)); |
53 | } |
54 | |
55 | struct Printer { |
56 | explicit Printer( |
57 | std::string& out, |
58 | unsigned* indentLevel, |
59 | serialization_opts const* opts) |
60 | : out_(out), indentLevel_(indentLevel), opts_(*opts) {} |
61 | |
62 | void operator()(dynamic const& v) const { |
63 | switch (v.type()) { |
64 | case dynamic::DOUBLE: |
65 | if (!opts_.allow_nan_inf && |
66 | (std::isnan(v.asDouble()) || std::isinf(v.asDouble()))) { |
67 | throw json::parse_error( |
68 | "folly::toJson: JSON object value was a " |
69 | "NaN or INF" ); |
70 | } |
71 | toAppend( |
72 | v.asDouble(), &out_, opts_.double_mode, opts_.double_num_digits); |
73 | break; |
74 | case dynamic::INT64: { |
75 | auto intval = v.asInt(); |
76 | if (opts_.javascript_safe) { |
77 | // Use folly::to to check that this integer can be represented |
78 | // as a double without loss of precision. |
79 | intval = int64_t(to<double>(intval)); |
80 | } |
81 | toAppend(intval, &out_); |
82 | break; |
83 | } |
84 | case dynamic::BOOL: |
85 | out_ += v.asBool() ? "true" : "false" ; |
86 | break; |
87 | case dynamic::NULLT: |
88 | out_ += "null" ; |
89 | break; |
90 | case dynamic::STRING: |
91 | escapeString(v.asString(), out_, opts_); |
92 | break; |
93 | case dynamic::OBJECT: |
94 | printObject(v); |
95 | break; |
96 | case dynamic::ARRAY: |
97 | printArray(v); |
98 | break; |
99 | default: |
100 | CHECK(0) << "Bad type " << v.type(); |
101 | } |
102 | } |
103 | |
104 | private: |
105 | void printKV(const std::pair<const dynamic, dynamic>& p) const { |
106 | if (!opts_.allow_non_string_keys && !p.first.isString()) { |
107 | throw json::parse_error( |
108 | "folly::toJson: JSON object key was not a " |
109 | "string" ); |
110 | } |
111 | (*this)(p.first); |
112 | mapColon(); |
113 | (*this)(p.second); |
114 | } |
115 | |
116 | template <typename Iterator> |
117 | void printKVPairs(Iterator begin, Iterator end) const { |
118 | printKV(*begin); |
119 | for (++begin; begin != end; ++begin) { |
120 | out_ += ','; |
121 | newline(); |
122 | printKV(*begin); |
123 | } |
124 | } |
125 | |
126 | void printObject(dynamic const& o) const { |
127 | if (o.empty()) { |
128 | out_ += "{}" ; |
129 | return; |
130 | } |
131 | |
132 | out_ += '{'; |
133 | indent(); |
134 | newline(); |
135 | if (opts_.sort_keys || opts_.sort_keys_by) { |
136 | using ref = std::reference_wrapper<decltype(o.items())::value_type const>; |
137 | auto sort_keys_by = [&](auto begin, auto end, const auto& comp) { |
138 | std::sort(begin, end, [&](ref a, ref b) { |
139 | // Only compare keys. No ordering among identical keys. |
140 | return comp(a.get().first, b.get().first); |
141 | }); |
142 | }; |
143 | std::vector<ref> refs(o.items().begin(), o.items().end()); |
144 | if (opts_.sort_keys_by) { |
145 | sort_keys_by(refs.begin(), refs.end(), opts_.sort_keys_by); |
146 | } else { |
147 | sort_keys_by(refs.begin(), refs.end(), std::less<>()); |
148 | } |
149 | printKVPairs(refs.cbegin(), refs.cend()); |
150 | } else { |
151 | printKVPairs(o.items().begin(), o.items().end()); |
152 | } |
153 | outdent(); |
154 | newline(); |
155 | out_ += '}'; |
156 | } |
157 | |
158 | void printArray(dynamic const& a) const { |
159 | if (a.empty()) { |
160 | out_ += "[]" ; |
161 | return; |
162 | } |
163 | |
164 | out_ += '['; |
165 | indent(); |
166 | newline(); |
167 | (*this)(a[0]); |
168 | for (auto& val : range(std::next(a.begin()), a.end())) { |
169 | out_ += ','; |
170 | newline(); |
171 | (*this)(val); |
172 | } |
173 | outdent(); |
174 | newline(); |
175 | out_ += ']'; |
176 | } |
177 | |
178 | private: |
179 | void outdent() const { |
180 | if (indentLevel_) { |
181 | --*indentLevel_; |
182 | } |
183 | } |
184 | |
185 | void indent() const { |
186 | if (indentLevel_) { |
187 | ++*indentLevel_; |
188 | } |
189 | } |
190 | |
191 | void newline() const { |
192 | if (indentLevel_) { |
193 | out_ += to<std::string>('\n', std::string(*indentLevel_ * 2, ' ')); |
194 | } |
195 | } |
196 | |
197 | void mapColon() const { |
198 | out_ += indentLevel_ ? ": " : ":" ; |
199 | } |
200 | |
201 | private: |
202 | std::string& out_; |
203 | unsigned* const indentLevel_; |
204 | serialization_opts const& opts_; |
205 | }; |
206 | |
207 | ////////////////////////////////////////////////////////////////////// |
208 | |
209 | // Wraps our input buffer with some helper functions. |
210 | struct Input { |
211 | explicit Input(StringPiece range, json::serialization_opts const* opts) |
212 | : range_(range), opts_(*opts), lineNum_(0) { |
213 | storeCurrent(); |
214 | } |
215 | |
216 | Input(Input const&) = delete; |
217 | Input& operator=(Input const&) = delete; |
218 | |
219 | char const* begin() const { |
220 | return range_.begin(); |
221 | } |
222 | |
223 | unsigned getLineNum() const { |
224 | return lineNum_; |
225 | } |
226 | |
227 | // Parse ahead for as long as the supplied predicate is satisfied, |
228 | // returning a range of what was skipped. |
229 | template <class Predicate> |
230 | StringPiece skipWhile(const Predicate& p) { |
231 | std::size_t skipped = 0; |
232 | for (; skipped < range_.size(); ++skipped) { |
233 | if (!p(range_[skipped])) { |
234 | break; |
235 | } |
236 | if (range_[skipped] == '\n') { |
237 | ++lineNum_; |
238 | } |
239 | } |
240 | auto ret = range_.subpiece(0, skipped); |
241 | range_.advance(skipped); |
242 | storeCurrent(); |
243 | return ret; |
244 | } |
245 | |
246 | StringPiece skipDigits() { |
247 | return skipWhile([](char c) { return c >= '0' && c <= '9'; }); |
248 | } |
249 | |
250 | StringPiece skipMinusAndDigits() { |
251 | bool firstChar = true; |
252 | return skipWhile([&firstChar](char c) { |
253 | bool result = (c >= '0' && c <= '9') || (firstChar && c == '-'); |
254 | firstChar = false; |
255 | return result; |
256 | }); |
257 | } |
258 | |
259 | void skipWhitespace() { |
260 | unsigned index = 0; |
261 | while (true) { |
262 | while (index < range_.size() && range_[index] == ' ') { |
263 | index++; |
264 | } |
265 | if (index < range_.size()) { |
266 | if (range_[index] == '\n') { |
267 | index++; |
268 | ++lineNum_; |
269 | continue; |
270 | } |
271 | if (range_[index] == '\t' || range_[index] == '\r') { |
272 | index++; |
273 | continue; |
274 | } |
275 | } |
276 | break; |
277 | } |
278 | range_.advance(index); |
279 | storeCurrent(); |
280 | } |
281 | |
282 | void expect(char c) { |
283 | if (**this != c) { |
284 | throw json::make_parse_error( |
285 | lineNum_, context(), to<std::string>("expected '" , c, '\'')); |
286 | } |
287 | ++*this; |
288 | } |
289 | |
290 | std::size_t size() const { |
291 | return range_.size(); |
292 | } |
293 | |
294 | int operator*() const { |
295 | return current_; |
296 | } |
297 | |
298 | void operator++() { |
299 | range_.pop_front(); |
300 | storeCurrent(); |
301 | } |
302 | |
303 | template <class T> |
304 | T () { |
305 | try { |
306 | return to<T>(&range_); |
307 | } catch (std::exception const& e) { |
308 | error(e.what()); |
309 | } |
310 | } |
311 | |
312 | bool consume(StringPiece str) { |
313 | if (boost::starts_with(range_, str)) { |
314 | range_.advance(str.size()); |
315 | storeCurrent(); |
316 | return true; |
317 | } |
318 | return false; |
319 | } |
320 | |
321 | std::string context() const { |
322 | return range_.subpiece(0, 16 /* arbitrary */).toString(); |
323 | } |
324 | |
325 | dynamic error(char const* what) const { |
326 | throw json::make_parse_error(lineNum_, context(), what); |
327 | } |
328 | |
329 | json::serialization_opts const& getOpts() { |
330 | return opts_; |
331 | } |
332 | |
333 | void incrementRecursionLevel() { |
334 | if (currentRecursionLevel_ > opts_.recursion_limit) { |
335 | error("recursion limit exceeded" ); |
336 | } |
337 | currentRecursionLevel_++; |
338 | } |
339 | |
340 | void decrementRecursionLevel() { |
341 | currentRecursionLevel_--; |
342 | } |
343 | |
344 | private: |
345 | void storeCurrent() { |
346 | current_ = range_.empty() ? EOF : range_.front(); |
347 | } |
348 | |
349 | private: |
350 | StringPiece range_; |
351 | json::serialization_opts const& opts_; |
352 | unsigned lineNum_; |
353 | int current_; |
354 | unsigned int currentRecursionLevel_{0}; |
355 | }; |
356 | |
357 | class RecursionGuard { |
358 | public: |
359 | explicit RecursionGuard(Input& in) : in_(in) { |
360 | in_.incrementRecursionLevel(); |
361 | } |
362 | |
363 | ~RecursionGuard() { |
364 | in_.decrementRecursionLevel(); |
365 | } |
366 | |
367 | private: |
368 | Input& in_; |
369 | }; |
370 | |
371 | dynamic parseValue(Input& in, json::metadata_map* map); |
372 | std::string parseString(Input& in); |
373 | dynamic parseNumber(Input& in); |
374 | |
375 | template <class K> |
376 | void parseObjectKeyValue( |
377 | Input& in, |
378 | dynamic& ret, |
379 | K&& key, |
380 | json::metadata_map* map) { |
381 | auto keyLineNumber = in.getLineNum(); |
382 | in.skipWhitespace(); |
383 | in.expect(':'); |
384 | in.skipWhitespace(); |
385 | K tmp; |
386 | if (map) { |
387 | tmp = K(key); |
388 | } |
389 | auto valueLineNumber = in.getLineNum(); |
390 | ret.insert(std::forward<K>(key), parseValue(in, map)); |
391 | if (map) { |
392 | auto val = ret.get_ptr(tmp); |
393 | // We just inserted it, so it should be there! |
394 | DCHECK(val != nullptr); |
395 | map->emplace( |
396 | val, json::parse_metadata{{{keyLineNumber}}, {{valueLineNumber}}}); |
397 | } |
398 | } |
399 | |
400 | dynamic parseObject(Input& in, json::metadata_map* map) { |
401 | DCHECK_EQ(*in, '{'); |
402 | ++in; |
403 | |
404 | dynamic ret = dynamic::object; |
405 | |
406 | in.skipWhitespace(); |
407 | if (*in == '}') { |
408 | ++in; |
409 | return ret; |
410 | } |
411 | |
412 | for (;;) { |
413 | if (in.getOpts().allow_trailing_comma && *in == '}') { |
414 | break; |
415 | } |
416 | if (*in == '\"') { // string |
417 | auto key = parseString(in); |
418 | parseObjectKeyValue(in, ret, std::move(key), map); |
419 | } else if (!in.getOpts().allow_non_string_keys) { |
420 | in.error("expected string for object key name" ); |
421 | } else { |
422 | auto key = parseValue(in, map); |
423 | parseObjectKeyValue(in, ret, std::move(key), map); |
424 | } |
425 | |
426 | in.skipWhitespace(); |
427 | if (*in != ',') { |
428 | break; |
429 | } |
430 | ++in; |
431 | in.skipWhitespace(); |
432 | } |
433 | in.expect('}'); |
434 | |
435 | return ret; |
436 | } |
437 | |
438 | dynamic parseArray(Input& in, json::metadata_map* map) { |
439 | DCHECK_EQ(*in, '['); |
440 | ++in; |
441 | |
442 | dynamic ret = dynamic::array; |
443 | |
444 | in.skipWhitespace(); |
445 | if (*in == ']') { |
446 | ++in; |
447 | return ret; |
448 | } |
449 | |
450 | std::vector<uint32_t> lineNumbers; |
451 | for (;;) { |
452 | if (in.getOpts().allow_trailing_comma && *in == ']') { |
453 | break; |
454 | } |
455 | ret.push_back(parseValue(in, map)); |
456 | if (map) { |
457 | lineNumbers.push_back(in.getLineNum()); |
458 | } |
459 | in.skipWhitespace(); |
460 | if (*in != ',') { |
461 | break; |
462 | } |
463 | ++in; |
464 | in.skipWhitespace(); |
465 | } |
466 | if (map) { |
467 | for (size_t i = 0; i < ret.size(); i++) { |
468 | map->emplace(&ret[i], json::parse_metadata{{{0}}, {{lineNumbers[i]}}}); |
469 | } |
470 | } |
471 | in.expect(']'); |
472 | |
473 | return ret; |
474 | } |
475 | |
476 | dynamic parseNumber(Input& in) { |
477 | bool const negative = (*in == '-'); |
478 | if (negative && in.consume("-Infinity" )) { |
479 | if (in.getOpts().parse_numbers_as_strings) { |
480 | return "-Infinity" ; |
481 | } else { |
482 | return -std::numeric_limits<double>::infinity(); |
483 | } |
484 | } |
485 | |
486 | auto integral = in.skipMinusAndDigits(); |
487 | if (negative && integral.size() < 2) { |
488 | in.error("expected digits after `-'" ); |
489 | } |
490 | |
491 | auto const wasE = *in == 'e' || *in == 'E'; |
492 | |
493 | constexpr const char* maxInt = "9223372036854775807" ; |
494 | constexpr const char* minInt = "-9223372036854775808" ; |
495 | constexpr auto maxIntLen = constexpr_strlen(maxInt); |
496 | constexpr auto minIntLen = constexpr_strlen(minInt); |
497 | |
498 | if (*in != '.' && !wasE && in.getOpts().parse_numbers_as_strings) { |
499 | return integral; |
500 | } |
501 | |
502 | if (*in != '.' && !wasE) { |
503 | if (LIKELY(!in.getOpts().double_fallback || integral.size() < maxIntLen) || |
504 | (!negative && integral.size() == maxIntLen && integral <= maxInt) || |
505 | (negative && integral.size() == minIntLen && integral <= minInt)) { |
506 | auto val = to<int64_t>(integral); |
507 | in.skipWhitespace(); |
508 | return val; |
509 | } else { |
510 | auto val = to<double>(integral); |
511 | in.skipWhitespace(); |
512 | return val; |
513 | } |
514 | } |
515 | |
516 | auto end = !wasE ? (++in, in.skipDigits().end()) : in.begin(); |
517 | if (*in == 'e' || *in == 'E') { |
518 | ++in; |
519 | if (*in == '+' || *in == '-') { |
520 | ++in; |
521 | } |
522 | auto expPart = in.skipDigits(); |
523 | end = expPart.end(); |
524 | } |
525 | auto fullNum = range(integral.begin(), end); |
526 | if (in.getOpts().parse_numbers_as_strings) { |
527 | return fullNum; |
528 | } |
529 | auto val = to<double>(fullNum); |
530 | return val; |
531 | } |
532 | |
533 | std::string decodeUnicodeEscape(Input& in) { |
534 | auto hexVal = [&](int c) -> uint16_t { |
535 | // clang-format off |
536 | return uint16_t( |
537 | c >= '0' && c <= '9' ? c - '0' : |
538 | c >= 'a' && c <= 'f' ? c - 'a' + 10 : |
539 | c >= 'A' && c <= 'F' ? c - 'A' + 10 : |
540 | (in.error("invalid hex digit" ), 0)); |
541 | // clang-format on |
542 | }; |
543 | |
544 | auto readHex = [&]() -> uint16_t { |
545 | if (in.size() < 4) { |
546 | in.error("expected 4 hex digits" ); |
547 | } |
548 | |
549 | auto ret = uint16_t(hexVal(*in) * 4096); |
550 | ++in; |
551 | ret += hexVal(*in) * 256; |
552 | ++in; |
553 | ret += hexVal(*in) * 16; |
554 | ++in; |
555 | ret += hexVal(*in); |
556 | ++in; |
557 | return ret; |
558 | }; |
559 | |
560 | /* |
561 | * If the value encoded is in the surrogate pair range, we need to |
562 | * make sure there is another escape that we can use also. |
563 | */ |
564 | uint32_t codePoint = readHex(); |
565 | if (codePoint >= 0xd800 && codePoint <= 0xdbff) { |
566 | if (!in.consume("\\u" )) { |
567 | in.error( |
568 | "expected another unicode escape for second half of " |
569 | "surrogate pair" ); |
570 | } |
571 | uint16_t second = readHex(); |
572 | if (second >= 0xdc00 && second <= 0xdfff) { |
573 | codePoint = 0x10000 + ((codePoint & 0x3ff) << 10) + (second & 0x3ff); |
574 | } else { |
575 | in.error("second character in surrogate pair is invalid" ); |
576 | } |
577 | } else if (codePoint >= 0xdc00 && codePoint <= 0xdfff) { |
578 | in.error("invalid unicode code point (in range [0xdc00,0xdfff])" ); |
579 | } |
580 | |
581 | return codePointToUtf8(codePoint); |
582 | } |
583 | |
584 | std::string parseString(Input& in) { |
585 | DCHECK_EQ(*in, '\"'); |
586 | ++in; |
587 | |
588 | std::string ret; |
589 | for (;;) { |
590 | auto range = in.skipWhile([](char c) { return c != '\"' && c != '\\'; }); |
591 | ret.append(range.begin(), range.end()); |
592 | |
593 | if (*in == '\"') { |
594 | ++in; |
595 | break; |
596 | } |
597 | if (*in == '\\') { |
598 | ++in; |
599 | switch (*in) { |
600 | // clang-format off |
601 | case '\"': ret.push_back('\"'); ++in; break; |
602 | case '\\': ret.push_back('\\'); ++in; break; |
603 | case '/': ret.push_back('/'); ++in; break; |
604 | case 'b': ret.push_back('\b'); ++in; break; |
605 | case 'f': ret.push_back('\f'); ++in; break; |
606 | case 'n': ret.push_back('\n'); ++in; break; |
607 | case 'r': ret.push_back('\r'); ++in; break; |
608 | case 't': ret.push_back('\t'); ++in; break; |
609 | case 'u': ++in; ret += decodeUnicodeEscape(in); break; |
610 | // clang-format on |
611 | default: |
612 | in.error( |
613 | to<std::string>("unknown escape " , *in, " in string" ).c_str()); |
614 | } |
615 | continue; |
616 | } |
617 | if (*in == EOF) { |
618 | in.error("unterminated string" ); |
619 | } |
620 | if (!*in) { |
621 | /* |
622 | * Apparently we're actually supposed to ban all control |
623 | * characters from strings. This seems unnecessarily |
624 | * restrictive, so we're only banning zero bytes. (Since the |
625 | * string is presumed to be UTF-8 encoded it's fine to just |
626 | * check this way.) |
627 | */ |
628 | in.error("null byte in string" ); |
629 | } |
630 | |
631 | ret.push_back(char(*in)); |
632 | ++in; |
633 | } |
634 | |
635 | return ret; |
636 | } |
637 | |
638 | dynamic parseValue(Input& in, json::metadata_map* map) { |
639 | RecursionGuard guard(in); |
640 | |
641 | in.skipWhitespace(); |
642 | // clang-format off |
643 | return |
644 | *in == '[' ? parseArray(in, map) : |
645 | *in == '{' ? parseObject(in, map) : |
646 | *in == '\"' ? parseString(in) : |
647 | (*in == '-' || (*in >= '0' && *in <= '9')) ? parseNumber(in) : |
648 | in.consume("true" ) ? true : |
649 | in.consume("false" ) ? false : |
650 | in.consume("null" ) ? nullptr : |
651 | in.consume("Infinity" ) ? |
652 | (in.getOpts().parse_numbers_as_strings ? (dynamic)"Infinity" : |
653 | (dynamic)std::numeric_limits<double>::infinity()) : |
654 | in.consume("NaN" ) ? |
655 | (in.getOpts().parse_numbers_as_strings ? (dynamic)"NaN" : |
656 | (dynamic)std::numeric_limits<double>::quiet_NaN()) : |
657 | in.error("expected json value" ); |
658 | // clang-format on |
659 | } |
660 | |
661 | } // namespace |
662 | |
663 | ////////////////////////////////////////////////////////////////////// |
664 | |
665 | std::array<uint64_t, 2> (StringPiece chars) { |
666 | std::array<uint64_t, 2> escapes{{0, 0}}; |
667 | for (auto b : ByteRange(chars)) { |
668 | if (b >= 0x20 && b < 0x80) { |
669 | escapes[b / 64] |= uint64_t(1) << (b % 64); |
670 | } |
671 | } |
672 | return escapes; |
673 | } |
674 | |
675 | std::string serialize(dynamic const& dyn, serialization_opts const& opts) { |
676 | std::string ret; |
677 | unsigned indentLevel = 0; |
678 | Printer p(ret, opts.pretty_formatting ? &indentLevel : nullptr, &opts); |
679 | p(dyn); |
680 | return ret; |
681 | } |
682 | |
683 | // Fast path to determine the longest prefix that can be left |
684 | // unescaped in a string of sizeof(T) bytes packed in an integer of |
685 | // type T. |
686 | template <bool EnableExtraAsciiEscapes, class T> |
687 | size_t firstEscapableInWord(T s, const serialization_opts& opts) { |
688 | static_assert(std::is_unsigned<T>::value, "Unsigned integer required" ); |
689 | static constexpr T kOnes = ~T() / 255; // 0x...0101 |
690 | static constexpr T kMsbs = kOnes * 0x80; // 0x...8080 |
691 | |
692 | // Sets the MSB of bytes < b. Precondition: b < 128. |
693 | auto isLess = [](T w, uint8_t b) { |
694 | // A byte is < b iff subtracting b underflows, so we check that |
695 | // the MSB wasn't set before and it's set after the subtraction. |
696 | return (w - kOnes * b) & ~w & kMsbs; |
697 | }; |
698 | |
699 | auto isChar = [&](uint8_t c) { |
700 | // A byte is == c iff it is 0 if xored with c. |
701 | return isLess(s ^ (kOnes * c), 1); |
702 | }; |
703 | |
704 | // The following masks have the MSB set for each byte of the word |
705 | // that satisfies the corresponding condition. |
706 | auto isHigh = s & kMsbs; // >= 128 |
707 | auto isLow = isLess(s, 0x20); // <= 0x1f |
708 | auto needsEscape = isHigh | isLow | isChar('\\') | isChar('"'); |
709 | |
710 | if /* constexpr */ (EnableExtraAsciiEscapes) { |
711 | // Deal with optional bitmap for unicode escapes. Escapes can optionally be |
712 | // set for ascii characters 32 - 127, so the inner loop may run up to 96 |
713 | // times. However, for the case where 0 or a handful of bits are set, |
714 | // looping will be minimal through use of findFirstSet. |
715 | for (size_t i = 0; i < opts.extra_ascii_to_escape_bitmap.size(); ++i) { |
716 | const auto offset = i * 64; |
717 | // Clear first 32 characters if this is the first index, since those are |
718 | // always escaped. |
719 | auto bitmap = opts.extra_ascii_to_escape_bitmap[i] & |
720 | (i == 0 ? uint64_t(-1) << 32 : ~0UL); |
721 | while (bitmap) { |
722 | auto bit = folly::findFirstSet(bitmap); |
723 | needsEscape |= isChar(static_cast<uint8_t>(offset + bit - 1)); |
724 | bitmap &= bitmap - 1; |
725 | } |
726 | } |
727 | } |
728 | |
729 | if (!needsEscape) { |
730 | return sizeof(T); |
731 | } |
732 | |
733 | if (folly::kIsLittleEndian) { |
734 | return folly::findFirstSet(needsEscape) / 8 - 1; |
735 | } else { |
736 | return sizeof(T) - folly::findLastSet(needsEscape) / 8; |
737 | } |
738 | } |
739 | |
740 | // Escape a string so that it is legal to print it in JSON text. |
741 | template <bool EnableExtraAsciiEscapes> |
742 | void escapeStringImpl( |
743 | StringPiece input, |
744 | std::string& out, |
745 | const serialization_opts& opts) { |
746 | auto hexDigit = [](uint8_t c) -> char { |
747 | return c < 10 ? c + '0' : c - 10 + 'a'; |
748 | }; |
749 | |
750 | out.push_back('\"'); |
751 | |
752 | auto* p = reinterpret_cast<const unsigned char*>(input.begin()); |
753 | auto* q = reinterpret_cast<const unsigned char*>(input.begin()); |
754 | auto* e = reinterpret_cast<const unsigned char*>(input.end()); |
755 | |
756 | while (p < e) { |
757 | // Find the longest prefix that does not need escaping, and copy |
758 | // it literally into the output string. |
759 | auto firstEsc = p; |
760 | while (firstEsc < e) { |
761 | auto avail = e - firstEsc; |
762 | uint64_t word = 0; |
763 | if (avail >= 8) { |
764 | word = folly::loadUnaligned<uint64_t>(firstEsc); |
765 | } else { |
766 | word = folly::partialLoadUnaligned<uint64_t>(firstEsc, avail); |
767 | } |
768 | auto prefix = firstEscapableInWord<EnableExtraAsciiEscapes>(word, opts); |
769 | DCHECK_LE(prefix, avail); |
770 | firstEsc += prefix; |
771 | if (prefix < 8) { |
772 | break; |
773 | } |
774 | } |
775 | if (firstEsc > p) { |
776 | out.append(reinterpret_cast<const char*>(p), firstEsc - p); |
777 | p = firstEsc; |
778 | // We can't be in the middle of a multibyte sequence, so we can reset q. |
779 | q = p; |
780 | if (p == e) { |
781 | break; |
782 | } |
783 | } |
784 | |
785 | // Handle the next byte that may need escaping. |
786 | |
787 | // Since non-ascii encoding inherently does utf8 validation |
788 | // we explicitly validate utf8 only if non-ascii encoding is disabled. |
789 | if ((opts.validate_utf8 || opts.skip_invalid_utf8) && |
790 | !opts.encode_non_ascii) { |
791 | // To achieve better spatial and temporal coherence |
792 | // we do utf8 validation progressively along with the |
793 | // string-escaping instead of two separate passes. |
794 | |
795 | // As the encoding progresses, q will stay at or ahead of p. |
796 | CHECK_GE(q, p); |
797 | |
798 | // As p catches up with q, move q forward. |
799 | if (q == p) { |
800 | // calling utf8_decode has the side effect of |
801 | // checking that utf8 encodings are valid |
802 | char32_t v = utf8ToCodePoint(q, e, opts.skip_invalid_utf8); |
803 | if (opts.skip_invalid_utf8 && v == U'\ufffd') { |
804 | out.append(reinterpret_cast<const char*>(u8"\ufffd" )); |
805 | p = q; |
806 | continue; |
807 | } |
808 | } |
809 | } |
810 | |
811 | auto encodeUnicode = opts.encode_non_ascii && (*p & 0x80); |
812 | if /* constexpr */ (EnableExtraAsciiEscapes) { |
813 | encodeUnicode = encodeUnicode || |
814 | (*p >= 0x20 && *p < 0x80 && |
815 | (opts.extra_ascii_to_escape_bitmap[*p / 64] & |
816 | (uint64_t(1) << (*p % 64)))); |
817 | } |
818 | |
819 | if (encodeUnicode) { |
820 | // note that this if condition captures utf8 chars |
821 | // with value > 127, so size > 1 byte (or they are whitelisted for |
822 | // Unicode encoding). |
823 | // NOTE: char32_t / char16_t are both unsigned. |
824 | char32_t cp = utf8ToCodePoint(p, e, opts.skip_invalid_utf8); |
825 | auto writeHex = [&](char16_t v) { |
826 | char buf[] = "\\u\0\0\0\0" ; |
827 | buf[2] = hexDigit((v >> 12) & 0x0f); |
828 | buf[3] = hexDigit((v >> 8) & 0x0f); |
829 | buf[4] = hexDigit((v >> 4) & 0x0f); |
830 | buf[5] = hexDigit(v & 0x0f); |
831 | out.append(buf, 6); |
832 | }; |
833 | // From the ECMA-404 The JSON Data Interchange Syntax 2nd Edition Dec 2017 |
834 | if (cp < 0x10000u) { |
835 | // If the code point is in the Basic Multilingual Plane (U+0000 through |
836 | // U+FFFF), then it may be represented as a six-character sequence: |
837 | // a reverse solidus, followed by the lowercase letter u, followed by |
838 | // four hexadecimal digits that encode the code point. |
839 | writeHex(static_cast<char16_t>(cp)); |
840 | } else { |
841 | // To escape a code point that is not in the Basic Multilingual Plane, |
842 | // the character may be represented as a twelve-character sequence, |
843 | // encoding the UTF-16 surrogate pair corresponding to the code point. |
844 | writeHex(static_cast<char16_t>( |
845 | 0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu))); |
846 | writeHex(static_cast<char16_t>(0xdc00u + ((cp - 0x10000u) & 0x3ffu))); |
847 | } |
848 | } else if (*p == '\\' || *p == '\"') { |
849 | char buf[] = "\\\0" ; |
850 | buf[1] = char(*p++); |
851 | out.append(buf, 2); |
852 | } else if (*p <= 0x1f) { |
853 | switch (*p) { |
854 | // clang-format off |
855 | case '\b': out.append("\\b" ); p++; break; |
856 | case '\f': out.append("\\f" ); p++; break; |
857 | case '\n': out.append("\\n" ); p++; break; |
858 | case '\r': out.append("\\r" ); p++; break; |
859 | case '\t': out.append("\\t" ); p++; break; |
860 | // clang-format on |
861 | default: |
862 | // Note that this if condition captures non readable chars |
863 | // with value < 32, so size = 1 byte (e.g control chars). |
864 | char buf[] = "\\u00\0\0" ; |
865 | buf[4] = hexDigit(uint8_t((*p & 0xf0) >> 4)); |
866 | buf[5] = hexDigit(uint8_t(*p & 0xf)); |
867 | out.append(buf, 6); |
868 | p++; |
869 | } |
870 | } else { |
871 | out.push_back(char(*p++)); |
872 | } |
873 | } |
874 | |
875 | out.push_back('\"'); |
876 | } |
877 | |
878 | void escapeString( |
879 | StringPiece input, |
880 | std::string& out, |
881 | const serialization_opts& opts) { |
882 | if (FOLLY_UNLIKELY( |
883 | opts.extra_ascii_to_escape_bitmap[0] || |
884 | opts.extra_ascii_to_escape_bitmap[1])) { |
885 | escapeStringImpl<true>(input, out, opts); |
886 | } else { |
887 | escapeStringImpl<false>(input, out, opts); |
888 | } |
889 | } |
890 | |
891 | std::string (StringPiece jsonC) { |
892 | std::string result; |
893 | enum class State { |
894 | None, |
895 | InString, |
896 | , |
897 | |
898 | } state = State::None; |
899 | |
900 | for (size_t i = 0; i < jsonC.size(); ++i) { |
901 | auto s = jsonC.subpiece(i); |
902 | switch (state) { |
903 | case State::None: |
904 | if (s.startsWith("/*" )) { |
905 | state = State::InlineComment; |
906 | ++i; |
907 | continue; |
908 | } else if (s.startsWith("//" )) { |
909 | state = State::LineComment; |
910 | ++i; |
911 | continue; |
912 | } else if (s[0] == '\"') { |
913 | state = State::InString; |
914 | } |
915 | result.push_back(s[0]); |
916 | break; |
917 | case State::InString: |
918 | if (s[0] == '\\') { |
919 | if (UNLIKELY(s.size() == 1)) { |
920 | throw std::logic_error("Invalid JSONC: string is not terminated" ); |
921 | } |
922 | result.push_back(s[0]); |
923 | result.push_back(s[1]); |
924 | ++i; |
925 | continue; |
926 | } else if (s[0] == '\"') { |
927 | state = State::None; |
928 | } |
929 | result.push_back(s[0]); |
930 | break; |
931 | case State::InlineComment: |
932 | if (s.startsWith("*/" )) { |
933 | state = State::None; |
934 | ++i; |
935 | } |
936 | break; |
937 | case State::LineComment: |
938 | if (s[0] == '\n') { |
939 | // skip the line break. It doesn't matter. |
940 | state = State::None; |
941 | } |
942 | break; |
943 | default: |
944 | throw std::logic_error("Unknown comment state" ); |
945 | } |
946 | } |
947 | return result; |
948 | } |
949 | |
950 | } // namespace json |
951 | |
952 | ////////////////////////////////////////////////////////////////////// |
953 | |
954 | dynamic parseJsonWithMetadata(StringPiece range, json::metadata_map* map) { |
955 | return parseJsonWithMetadata(range, json::serialization_opts(), map); |
956 | } |
957 | |
958 | dynamic parseJsonWithMetadata( |
959 | StringPiece range, |
960 | json::serialization_opts const& opts, |
961 | json::metadata_map* map) { |
962 | json::Input in(range, &opts); |
963 | |
964 | uint32_t n = in.getLineNum(); |
965 | auto ret = parseValue(in, map); |
966 | if (map) { |
967 | map->emplace(&ret, json::parse_metadata{{{0}}, {{n}}}); |
968 | } |
969 | |
970 | in.skipWhitespace(); |
971 | if (in.size() && *in != '\0') { |
972 | in.error("parsing didn't consume all input" ); |
973 | } |
974 | return ret; |
975 | } |
976 | |
977 | dynamic parseJson(StringPiece range) { |
978 | return parseJson(range, json::serialization_opts()); |
979 | } |
980 | |
981 | dynamic parseJson(StringPiece range, json::serialization_opts const& opts) { |
982 | json::Input in(range, &opts); |
983 | |
984 | auto ret = parseValue(in, nullptr); |
985 | in.skipWhitespace(); |
986 | if (in.size() && *in != '\0') { |
987 | in.error("parsing didn't consume all input" ); |
988 | } |
989 | return ret; |
990 | } |
991 | |
992 | std::string toJson(dynamic const& dyn) { |
993 | return json::serialize(dyn, json::serialization_opts()); |
994 | } |
995 | |
996 | std::string toPrettyJson(dynamic const& dyn) { |
997 | json::serialization_opts opts; |
998 | opts.pretty_formatting = true; |
999 | opts.sort_keys = true; |
1000 | return json::serialize(dyn, opts); |
1001 | } |
1002 | |
1003 | ////////////////////////////////////////////////////////////////////// |
1004 | // dynamic::print_as_pseudo_json() is implemented here for header |
1005 | // ordering reasons (most of the dynamic implementation is in |
1006 | // dynamic-inl.h, which we don't want to include json.h). |
1007 | |
1008 | void dynamic::print_as_pseudo_json(std::ostream& out) const { |
1009 | json::serialization_opts opts; |
1010 | opts.allow_non_string_keys = true; |
1011 | opts.allow_nan_inf = true; |
1012 | out << json::serialize(*this, opts); |
1013 | } |
1014 | |
1015 | void PrintTo(const dynamic& dyn, std::ostream* os) { |
1016 | json::serialization_opts opts; |
1017 | opts.allow_nan_inf = true; |
1018 | opts.allow_non_string_keys = true; |
1019 | opts.pretty_formatting = true; |
1020 | opts.sort_keys = true; |
1021 | *os << json::serialize(dyn, opts); |
1022 | } |
1023 | |
1024 | ////////////////////////////////////////////////////////////////////// |
1025 | |
1026 | } // namespace folly |
1027 | |