Skip to content

Commit 676cb37

Browse files
committed
perf: 解耦unicode_escape
1 parent b402505 commit 676cb37

File tree

1 file changed

+101
-82
lines changed

1 file changed

+101
-82
lines changed

include/parser/parser.hpp

+101-82
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ class parser
5757
bool skip_string_literal_with_accel();
5858
bool skip_whitespace() noexcept;
5959
bool skip_digit();
60+
bool skip_unicode_escape(uint16_t& pair_high, string_t& result);
6061

6162
private:
6263
parsing_iter_t _cur;
@@ -443,90 +444,11 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
443444
case 't':
444445
result.push_back('\t');
445446
break;
446-
case 'u': {
447-
uint16_t cp = 0;
448-
for (int i = 0; i < 4; i++) {
449-
++_cur;
450-
if (_cur == _end) {
451-
return std::nullopt;
452-
}
453-
if (!std::isxdigit(static_cast<unsigned char>(*_cur))) {
454-
return std::nullopt;
455-
}
456-
cp <<= 4;
457-
if ('0' <= *_cur && *_cur <= '9') {
458-
cp |= *_cur - '0';
459-
}
460-
else if ('a' <= *_cur && *_cur <= 'f') {
461-
cp |= *_cur - 'a' + 10;
462-
}
463-
else if ('A' <= *_cur && *_cur <= 'F') {
464-
cp |= *_cur - 'A' + 10;
465-
}
466-
else {
467-
return std::nullopt;
468-
}
469-
}
470-
uint32_t ext_cp = cp;
471-
uint16_t hi_cp = 0, lo_cp = 0;
472-
if (0xD800 <= cp && cp <= 0xDBFF) {
473-
if (pair_high) {
474-
return std::nullopt;
475-
} else {
476-
pair_high = cp;
477-
break;
478-
}
479-
} else if (0xDC00 <= cp && cp <= 0xDFFF) {
480-
if (!pair_high) {
481-
return std::nullopt;
482-
} else {
483-
ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000;
484-
hi_cp = pair_high;
485-
lo_cp = cp;
486-
pair_high = 0;
487-
}
488-
}
489-
if constexpr (std::is_same_v<typename string_t::value_type, char>) {
490-
// utf8
491-
if (ext_cp <= 0x7F) {
492-
result.push_back(static_cast<char>(ext_cp));
493-
}
494-
else if (ext_cp <= 0x7FF) {
495-
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00011111) | 0b11000000u));
496-
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
497-
}
498-
else if (ext_cp <= 0xFFFF) {
499-
result.push_back(
500-
static_cast<char>(((ext_cp >> 12) & 0b00001111) | 0b11100000u));
501-
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
502-
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
503-
} else {
504-
result.push_back(
505-
static_cast<char>(((ext_cp >> 18) & 0b00000111) | 0b11110000u));
506-
result.push_back(static_cast<char>(((ext_cp >> 12) & 0b00111111) | 0b10000000u));
507-
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
508-
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
509-
}
510-
}
511-
else if constexpr (std::is_same_v<typename string_t::value_type, wchar_t>) {
512-
if constexpr (sizeof (wchar_t) == 4) {
513-
result.push_back(static_cast<wchar_t>(ext_cp));
514-
} else if constexpr (sizeof (wchar_t) == 2) {
515-
if (ext_cp <= 0xFFFF) {
516-
result.push_back(static_cast<wchar_t>(ext_cp));
517-
} else {
518-
result.push_back(static_cast<wchar_t>(hi_cp));
519-
result.push_back(static_cast<wchar_t>(lo_cp));
520-
}
521-
} else {
522-
static_assert(!sizeof(typename string_t::value_type), "Unsupported wchar");
523-
}
524-
}
525-
else {
526-
static_assert(!sizeof(typename string_t::value_type), "Unsupported type");
447+
case 'u':
448+
if (!skip_unicode_escape(pair_high, result)) {
449+
return std::nullopt;
527450
}
528451
break;
529-
}
530452
default:
531453
// Illegal backslash escape
532454
return std::nullopt;
@@ -552,6 +474,103 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
552474
return std::nullopt;
553475
}
554476

477+
template <typename string_t, typename parsing_t, typename accel_traits>
478+
inline bool parser<string_t, parsing_t, accel_traits>::skip_unicode_escape(
479+
uint16_t& pair_high,
480+
string_t& result)
481+
{
482+
uint16_t cp = 0;
483+
for (int i = 0; i < 4; ++i) {
484+
if (++_cur == _end) {
485+
return false;
486+
}
487+
488+
if (!std::isxdigit(static_cast<unsigned char>(*_cur))) {
489+
return false;
490+
}
491+
492+
cp <<= 4;
493+
494+
if ('0' <= *_cur && *_cur <= '9') {
495+
cp |= *_cur - '0';
496+
}
497+
else if ('a' <= *_cur && *_cur <= 'f') {
498+
cp |= *_cur - 'a' + 10;
499+
}
500+
else if ('A' <= *_cur && *_cur <= 'F') {
501+
cp |= *_cur - 'A' + 10;
502+
}
503+
else {
504+
return false;
505+
}
506+
}
507+
508+
uint32_t ext_cp = cp;
509+
uint16_t hi_cp = 0, lo_cp = 0;
510+
511+
if (0xD800 <= cp && cp <= 0xDBFF) {
512+
if (pair_high) {
513+
return false;
514+
}
515+
pair_high = cp;
516+
return true;
517+
}
518+
519+
if (0xDC00 <= cp && cp <= 0xDFFF) {
520+
if (!pair_high) {
521+
return false;
522+
}
523+
ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000;
524+
hi_cp = pair_high;
525+
lo_cp = cp;
526+
pair_high = 0;
527+
}
528+
529+
if constexpr (std::is_same_v<typename string_t::value_type, char>) {
530+
// utf8
531+
if (ext_cp <= 0x7F) {
532+
result.push_back(static_cast<char>(ext_cp));
533+
}
534+
else if (ext_cp <= 0x7FF) {
535+
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00011111) | 0b11000000u));
536+
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
537+
}
538+
else if (ext_cp <= 0xFFFF) {
539+
result.push_back(static_cast<char>(((ext_cp >> 12) & 0b00001111) | 0b11100000u));
540+
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
541+
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
542+
}
543+
else {
544+
result.push_back(static_cast<char>(((ext_cp >> 18) & 0b00000111) | 0b11110000u));
545+
result.push_back(static_cast<char>(((ext_cp >> 12) & 0b00111111) | 0b10000000u));
546+
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
547+
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
548+
}
549+
}
550+
else if constexpr (std::is_same_v<typename string_t::value_type, wchar_t>) {
551+
if constexpr (sizeof(wchar_t) == 4) {
552+
result.push_back(static_cast<wchar_t>(ext_cp));
553+
}
554+
else if constexpr (sizeof(wchar_t) == 2) {
555+
if (ext_cp <= 0xFFFF) {
556+
result.push_back(static_cast<wchar_t>(ext_cp));
557+
}
558+
else {
559+
result.push_back(static_cast<wchar_t>(hi_cp));
560+
result.push_back(static_cast<wchar_t>(lo_cp));
561+
}
562+
}
563+
else {
564+
static_assert(!sizeof(typename string_t::value_type), "Unsupported wchar");
565+
}
566+
}
567+
else {
568+
static_assert(!sizeof(typename string_t::value_type), "Unsupported type");
569+
}
570+
571+
return true;
572+
}
573+
555574
template <typename string_t, typename parsing_t, typename accel_traits>
556575
inline bool parser<string_t, parsing_t, accel_traits>::skip_string_literal_with_accel()
557576
{

0 commit comments

Comments
 (0)