Skip to content

Commit 430d0a9

Browse files
committed
perf: 解耦unicode_escape
1 parent b402505 commit 430d0a9

File tree

1 file changed

+107
-90
lines changed

1 file changed

+107
-90
lines changed

include/parser/parser.hpp

+107-90
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ class parser
5757
bool skip_string_literal_with_accel();
5858
bool skip_whitespace() noexcept;
5959
bool skip_digit();
60+
bool skip_unicode_escape(uint16_t& pair_high);
6061

6162
private:
6263
parsing_iter_t _cur;
@@ -420,113 +421,34 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
420421
}
421422
switch (*_cur) {
422423
case '"':
423-
result.push_back('"');
424+
result.emplace_back('"');
424425
break;
425426
case '\\':
426-
result.push_back('\\');
427+
result.emplace_back('\\');
427428
break;
428429
case '/':
429-
result.push_back('/');
430+
result.emplace_back('/');
430431
break;
431432
case 'b':
432-
result.push_back('\b');
433+
result.emplace_back('\b');
433434
break;
434435
case 'f':
435-
result.push_back('\f');
436+
result.emplace_back('\f');
436437
break;
437438
case 'n':
438-
result.push_back('\n');
439+
result.emplace_back('\n');
439440
break;
440441
case 'r':
441-
result.push_back('\r');
442+
result.emplace_back('\r');
442443
break;
443444
case 't':
444-
result.push_back('\t');
445+
result.emplace_back('\t');
445446
break;
446-
case 'u': {
447-
uint16_t cp = 0;
448-
for (int i = 0; i < 4; i++) {
449-
++_cur;
450-
if (_cur == _end) {
451-
return std::nullopt;
452-
}
453-
if (!std::isxdigit(static_cast<unsigned char>(*_cur))) {
454-
return std::nullopt;
455-
}
456-
cp <<= 4;
457-
if ('0' <= *_cur && *_cur <= '9') {
458-
cp |= *_cur - '0';
459-
}
460-
else if ('a' <= *_cur && *_cur <= 'f') {
461-
cp |= *_cur - 'a' + 10;
462-
}
463-
else if ('A' <= *_cur && *_cur <= 'F') {
464-
cp |= *_cur - 'A' + 10;
465-
}
466-
else {
467-
return std::nullopt;
468-
}
469-
}
470-
uint32_t ext_cp = cp;
471-
uint16_t hi_cp = 0, lo_cp = 0;
472-
if (0xD800 <= cp && cp <= 0xDBFF) {
473-
if (pair_high) {
474-
return std::nullopt;
475-
} else {
476-
pair_high = cp;
477-
break;
478-
}
479-
} else if (0xDC00 <= cp && cp <= 0xDFFF) {
480-
if (!pair_high) {
481-
return std::nullopt;
482-
} else {
483-
ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000;
484-
hi_cp = pair_high;
485-
lo_cp = cp;
486-
pair_high = 0;
487-
}
488-
}
489-
if constexpr (std::is_same_v<typename string_t::value_type, char>) {
490-
// utf8
491-
if (ext_cp <= 0x7F) {
492-
result.push_back(static_cast<char>(ext_cp));
493-
}
494-
else if (ext_cp <= 0x7FF) {
495-
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00011111) | 0b11000000u));
496-
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
497-
}
498-
else if (ext_cp <= 0xFFFF) {
499-
result.push_back(
500-
static_cast<char>(((ext_cp >> 12) & 0b00001111) | 0b11100000u));
501-
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
502-
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
503-
} else {
504-
result.push_back(
505-
static_cast<char>(((ext_cp >> 18) & 0b00000111) | 0b11110000u));
506-
result.push_back(static_cast<char>(((ext_cp >> 12) & 0b00111111) | 0b10000000u));
507-
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
508-
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
509-
}
510-
}
511-
else if constexpr (std::is_same_v<typename string_t::value_type, wchar_t>) {
512-
if constexpr (sizeof (wchar_t) == 4) {
513-
result.push_back(static_cast<wchar_t>(ext_cp));
514-
} else if constexpr (sizeof (wchar_t) == 2) {
515-
if (ext_cp <= 0xFFFF) {
516-
result.push_back(static_cast<wchar_t>(ext_cp));
517-
} else {
518-
result.push_back(static_cast<wchar_t>(hi_cp));
519-
result.push_back(static_cast<wchar_t>(lo_cp));
520-
}
521-
} else {
522-
static_assert(!sizeof(typename string_t::value_type), "Unsupported wchar");
523-
}
524-
}
525-
else {
526-
static_assert(!sizeof(typename string_t::value_type), "Unsupported type");
447+
case 'u':
448+
if (!skip_unicode_escape(pair_high)) {
449+
return std::nullopt;
527450
}
528451
break;
529-
}
530452
default:
531453
// Illegal backslash escape
532454
return std::nullopt;
@@ -552,6 +474,101 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
552474
return std::nullopt;
553475
}
554476

477+
// Decodes one JSON `\uXXXX` escape and appends the decoded character to `result`.
//
// The only call site visible here is the `case 'u':` branch of the escape switch,
// so `_cur` is on the 'u' on entry. The function advances `_cur` four times; on
// success `_cur` is left on the last hex digit.
//
// pair_high: in/out surrogate state. Non-zero means a high surrogate
//            (0xD800-0xDBFF) from an earlier escape is waiting for its low half.
//            Set when a high surrogate is read, reset to 0 once the pair is complete.
//
// Returns false when the input ends before four digits were read, when a digit is
// not hexadecimal, when a high surrogate arrives while one is already pending, or
// when a low surrogate arrives with none pending. Returns true otherwise.
//
// NOTE(review): `result` is neither a parameter nor a local of this function. The
// removed inline code used it inside the string-parsing function. Unless `result`
// is a member declared outside this view, this definition will not compile -
// confirm, or pass the output string in explicitly.
// NOTE(review): std::basic_string provides push_back but no emplace_back; confirm
// that string_t really offers emplace_back.
template <typename string_t, typename parsing_t, typename accel_traits>
478+
inline bool parser<string_t, parsing_t, accel_traits>::skip_unicode_escape(uint16_t& pair_high)
479+
{
480+
uint16_t cp = 0;
481+
// Accumulate exactly four hex digits into cp, most significant nibble first.
for (int i = 0; i < 4; ++i) {
482+
if (++_cur == _end) {
483+
return false;
484+
}
485+
486+
// The cast to unsigned char keeps std::isxdigit defined for negative char values.
if (!std::isxdigit(static_cast<unsigned char>(*_cur))) {
487+
return false;
488+
}
489+
490+
cp <<= 4;
491+
492+
if ('0' <= *_cur && *_cur <= '9') {
493+
cp |= *_cur - '0';
494+
}
495+
else if ('a' <= *_cur && *_cur <= 'f') {
496+
cp |= *_cur - 'a' + 10;
497+
}
498+
else if ('A' <= *_cur && *_cur <= 'F') {
499+
cp |= *_cur - 'A' + 10;
500+
}
501+
else {
502+
return false;
503+
}
504+
}
505+
506+
// ext_cp is the full code point to emit; hi_cp/lo_cp keep the raw surrogate units
// for the 2-byte wchar_t branch below.
uint32_t ext_cp = cp;
507+
uint16_t hi_cp = 0, lo_cp = 0;
508+
509+
// High surrogate: remember it and emit nothing yet. Two high surrogates in a row
// are rejected.
if (0xD800 <= cp && cp <= 0xDBFF) {
510+
if (pair_high) {
511+
return false;
512+
}
513+
pair_high = cp;
514+
return true;
515+
}
516+
517+
// Low surrogate: only valid right after a stored high surrogate. Combine both
// halves into one code point at or above 0x10000 and clear the pending state.
if (0xDC00 <= cp && cp <= 0xDFFF) {
518+
if (!pair_high) {
519+
return false;
520+
}
521+
ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000;
522+
hi_cp = pair_high;
523+
lo_cp = cp;
524+
pair_high = 0;
525+
}
526+
527+
// NOTE(review): a non-surrogate escape that arrives while pair_high is still set
// is emitted here and pair_high stays pending; verify the caller rejects an
// unpaired high surrogate.
if constexpr (std::is_same_v<typename string_t::value_type, char>) {
528+
// utf8
// Emit 1 to 4 bytes depending on the magnitude of ext_cp.
529+
if (ext_cp <= 0x7F) {
530+
result.emplace_back(static_cast<char>(ext_cp));
531+
}
532+
else if (ext_cp <= 0x7FF) {
533+
result.emplace_back(static_cast<char>(((ext_cp >> 6) & 0b00011111) | 0b11000000u));
534+
result.emplace_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
535+
}
536+
else if (ext_cp <= 0xFFFF) {
537+
result.emplace_back(static_cast<char>(((ext_cp >> 12) & 0b00001111) | 0b11100000u));
538+
result.emplace_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
539+
result.emplace_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
540+
}
541+
else {
542+
result.emplace_back(static_cast<char>(((ext_cp >> 18) & 0b00000111) | 0b11110000u));
543+
result.emplace_back(static_cast<char>(((ext_cp >> 12) & 0b00111111) | 0b10000000u));
544+
result.emplace_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
545+
result.emplace_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
546+
}
547+
}
548+
else if constexpr (std::is_same_v<typename string_t::value_type, wchar_t>) {
549+
// A 4-byte wchar_t stores the whole code point in one unit.
if constexpr (sizeof(wchar_t) == 4) {
550+
result.emplace_back(static_cast<wchar_t>(ext_cp));
551+
}
552+
// A 2-byte wchar_t stores values up to 0xFFFF directly and anything larger as
// the original surrogate pair.
else if constexpr (sizeof(wchar_t) == 2) {
553+
if (ext_cp <= 0xFFFF) {
554+
result.emplace_back(static_cast<wchar_t>(ext_cp));
555+
}
556+
else {
557+
result.emplace_back(static_cast<wchar_t>(hi_cp));
558+
result.emplace_back(static_cast<wchar_t>(lo_cp));
559+
}
560+
}
561+
else {
562+
// The condition depends on a template parameter, so the assertion only fires
// when this branch is instantiated.
static_assert(!sizeof(typename string_t::value_type), "Unsupported wchar");
563+
}
564+
}
565+
else {
566+
// Any value_type other than char or wchar_t is rejected at compile time.
static_assert(!sizeof(typename string_t::value_type), "Unsupported type");
567+
}
568+
569+
return true;
570+
}
571+
555572
template <typename string_t, typename parsing_t, typename accel_traits>
556573
inline bool parser<string_t, parsing_t, accel_traits>::skip_string_literal_with_accel()
557574
{

0 commit comments

Comments
 (0)