@@ -397,6 +397,7 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
397
397
398
398
string_t result;
399
399
auto no_escape_beg = _cur;
400
+ uint16_t pair_high = 0 ;
400
401
401
402
while (_cur != _end) {
402
403
if constexpr (sizeof (*_cur) == 1 && accel_traits::available) {
@@ -414,6 +415,9 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
414
415
if (_cur == _end) {
415
416
return std::nullopt;
416
417
}
418
+ if (pair_high && *_cur != ' u' ) {
419
+ return std::nullopt;
420
+ }
417
421
switch (*_cur) {
418
422
case ' "' :
419
423
result.push_back (' "' );
@@ -463,24 +467,60 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
463
467
return std::nullopt;
464
468
}
465
469
}
470
+ uint32_t ext_cp = cp;
471
+ uint16_t hi_cp = 0 , lo_cp = 0 ;
472
+ if (0xD800 <= cp && cp <= 0xDBFF ) {
473
+ if (pair_high) {
474
+ return std::nullopt;
475
+ } else {
476
+ pair_high = cp;
477
+ break ;
478
+ }
479
+ } else if (0xDC00 <= cp && cp <= 0xDFFF ) {
480
+ if (!pair_high) {
481
+ return std::nullopt;
482
+ } else {
483
+ ext_cp = (((pair_high - 0xD800 ) << 10 ) | (cp - 0xDC00 )) + 0x10000 ;
484
+ hi_cp = pair_high;
485
+ lo_cp = cp;
486
+ pair_high = 0 ;
487
+ }
488
+ }
466
489
if constexpr (std::is_same_v<typename string_t ::value_type, char >) {
467
490
// utf8
468
- if (cp <= 0x7F ) {
469
- result.push_back (static_cast <char >(cp ));
491
+ if (ext_cp <= 0x7F ) {
492
+ result.push_back (static_cast <char >(ext_cp ));
470
493
}
471
- else if (cp <= 0x7FF ) {
472
- result.push_back (static_cast <char >(((cp >> 6 ) & 0b00011111 ) | 0b11000000u ));
473
- result.push_back (static_cast <char >((cp & 0b00111111 ) | 0b10000000u ));
494
+ else if (ext_cp <= 0x7FF ) {
495
+ result.push_back (static_cast <char >(((ext_cp >> 6 ) & 0b00011111 ) | 0b11000000u ));
496
+ result.push_back (static_cast <char >((ext_cp & 0b00111111 ) | 0b10000000u ));
474
497
}
475
- else {
498
+ else if (ext_cp <= 0xFFFF ) {
499
+ result.push_back (
500
+ static_cast <char >(((ext_cp >> 12 ) & 0b00001111 ) | 0b11100000u ));
501
+ result.push_back (static_cast <char >(((ext_cp >> 6 ) & 0b00111111 ) | 0b10000000u ));
502
+ result.push_back (static_cast <char >((ext_cp & 0b00111111 ) | 0b10000000u ));
503
+ } else {
476
504
result.push_back (
477
- static_cast <char >(((cp >> 12 ) & 0b00001111 ) | 0b11100000u ));
478
- result.push_back (static_cast <char >(((cp >> 6 ) & 0b00111111 ) | 0b10000000u ));
479
- result.push_back (static_cast <char >((cp & 0b00111111 ) | 0b10000000u ));
505
+ static_cast <char >(((ext_cp >> 18 ) & 0b00000111 ) | 0b11110000u ));
506
+ result.push_back (static_cast <char >(((ext_cp >> 12 ) & 0b00111111 ) | 0b10000000u ));
507
+ result.push_back (static_cast <char >(((ext_cp >> 6 ) & 0b00111111 ) | 0b10000000u ));
508
+ result.push_back (static_cast <char >((ext_cp & 0b00111111 ) | 0b10000000u ));
480
509
}
481
510
}
482
511
else if constexpr (std::is_same_v<typename string_t ::value_type, wchar_t >) {
483
- result.push_back (cp);
512
+ if constexpr (sizeof (wchar_t ) == 4 ) {
513
+ result.push_back (static_cast <wchar_t >(ext_cp));
514
+ } else if constexpr (sizeof (wchar_t ) == 2 ) {
515
+ if (ext_cp <= 0xFFFF ) {
516
+ result.push_back (static_cast <wchar_t >(ext_cp));
517
+ } else {
518
+ result.push_back (static_cast <wchar_t >(hi_cp));
519
+ result.push_back (static_cast <wchar_t >(lo_cp));
520
+ }
521
+ } else {
522
+ static_assert (!sizeof (typename string_t ::value_type), " Unsupported wchar" );
523
+ }
484
524
}
485
525
else {
486
526
static_assert (!sizeof (typename string_t ::value_type), " Unsupported type" );
@@ -495,10 +535,16 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
495
535
break ;
496
536
}
497
537
case ' "' : {
538
+ if (pair_high) {
539
+ return std::nullopt;
540
+ }
498
541
result += string_t (no_escape_beg, _cur++);
499
542
return result;
500
543
}
501
544
default :
545
+ if (pair_high) {
546
+ return std::nullopt;
547
+ }
502
548
++_cur;
503
549
break ;
504
550
}
0 commit comments