Skip to content

Commit b402505

Browse files
authored
feat: unicode pair (#64)
1 parent 34b7dd2 commit b402505

File tree

2 files changed

+60
-13
lines changed

2 files changed

+60
-13
lines changed

include/parser/parser.hpp

+56 -10
Original file line number | Diff line number | Diff line change
@@ -397,6 +397,7 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
397397

398398
string_t result;
399399
auto no_escape_beg = _cur;
400+
uint16_t pair_high = 0;
400401

401402
while (_cur != _end) {
402403
if constexpr (sizeof(*_cur) == 1 && accel_traits::available) {
@@ -414,6 +415,9 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
414415
if (_cur == _end) {
415416
return std::nullopt;
416417
}
418+
if (pair_high && *_cur != 'u') {
419+
return std::nullopt;
420+
}
417421
switch (*_cur) {
418422
case '"':
419423
result.push_back('"');
@@ -463,24 +467,60 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
463467
return std::nullopt;
464468
}
465469
}
470+
uint32_t ext_cp = cp;
471+
uint16_t hi_cp = 0, lo_cp = 0;
472+
if (0xD800 <= cp && cp <= 0xDBFF) {
473+
if (pair_high) {
474+
return std::nullopt;
475+
} else {
476+
pair_high = cp;
477+
break;
478+
}
479+
} else if (0xDC00 <= cp && cp <= 0xDFFF) {
480+
if (!pair_high) {
481+
return std::nullopt;
482+
} else {
483+
ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000;
484+
hi_cp = pair_high;
485+
lo_cp = cp;
486+
pair_high = 0;
487+
}
488+
}
466489
if constexpr (std::is_same_v<typename string_t::value_type, char>) {
467490
// utf8
468-
if (cp <= 0x7F) {
469-
result.push_back(static_cast<char>(cp));
491+
if (ext_cp <= 0x7F) {
492+
result.push_back(static_cast<char>(ext_cp));
470493
}
471-
else if (cp <= 0x7FF) {
472-
result.push_back(static_cast<char>(((cp >> 6) & 0b00011111) | 0b11000000u));
473-
result.push_back(static_cast<char>((cp & 0b00111111) | 0b10000000u));
494+
else if (ext_cp <= 0x7FF) {
495+
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00011111) | 0b11000000u));
496+
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
474497
}
475-
else {
498+
else if (ext_cp <= 0xFFFF) {
499+
result.push_back(
500+
static_cast<char>(((ext_cp >> 12) & 0b00001111) | 0b11100000u));
501+
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
502+
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
503+
} else {
476504
result.push_back(
477-
static_cast<char>(((cp >> 12) & 0b00001111) | 0b11100000u));
478-
result.push_back(static_cast<char>(((cp >> 6) & 0b00111111) | 0b10000000u));
479-
result.push_back(static_cast<char>((cp & 0b00111111) | 0b10000000u));
505+
static_cast<char>(((ext_cp >> 18) & 0b00000111) | 0b11110000u));
506+
result.push_back(static_cast<char>(((ext_cp >> 12) & 0b00111111) | 0b10000000u));
507+
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
508+
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
480509
}
481510
}
482511
else if constexpr (std::is_same_v<typename string_t::value_type, wchar_t>) {
483-
result.push_back(cp);
512+
if constexpr (sizeof (wchar_t) == 4) {
513+
result.push_back(static_cast<wchar_t>(ext_cp));
514+
} else if constexpr (sizeof (wchar_t) == 2) {
515+
if (ext_cp <= 0xFFFF) {
516+
result.push_back(static_cast<wchar_t>(ext_cp));
517+
} else {
518+
result.push_back(static_cast<wchar_t>(hi_cp));
519+
result.push_back(static_cast<wchar_t>(lo_cp));
520+
}
521+
} else {
522+
static_assert(!sizeof(typename string_t::value_type), "Unsupported wchar");
523+
}
484524
}
485525
else {
486526
static_assert(!sizeof(typename string_t::value_type), "Unsupported type");
@@ -495,10 +535,16 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
495535
break;
496536
}
497537
case '"': {
538+
if (pair_high) {
539+
return std::nullopt;
540+
}
498541
result += string_t(no_escape_beg, _cur++);
499542
return result;
500543
}
501544
default:
545+
if (pair_high) {
546+
return std::nullopt;
547+
}
502548
++_cur;
503549
break;
504550
}

test/unicode_test.cpp

+4 -3
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,8 @@
55

66
bool unicode_test()
77
{
8-
std::string test = R"({ "test": "abc\u9a8fdef" })";
8+
std::string test = R"({ "test": "abc\u9a8fdef\ud83d\udca9ghi" })";
9+
std::string target = "abc\u9a8fdef\U0001f4a9ghi";
910
auto obj = json::parse(test);
1011
if (!obj.has_value()) {
1112
std::cout << "parse failed" << std::endl;
@@ -16,10 +17,10 @@ bool unicode_test()
1617
<< static_cast<unsigned>(static_cast<unsigned char>(ch)) << ' ';
1718
}
1819
std::cout << std::endl;
19-
for (auto ch : "abc\u9a8fdef") {
20+
for (auto ch : target) {
2021
std::cout << std::hex << std::setw(2)
2122
<< static_cast<unsigned>(static_cast<unsigned char>(ch)) << ' ';
2223
}
2324
std::cout << std::endl;
24-
return obj.value().at("test").as_string() == "abc\u9a8fdef";
25+
return obj.value().at("test").as_string() == target;
2526
}

0 commit comments

Comments
 (0)