@@ -57,6 +57,7 @@ class parser
57
57
bool skip_string_literal_with_accel ();
58
58
bool skip_whitespace () noexcept ;
59
59
bool skip_digit ();
60
+ bool skip_unicode_escape (uint16_t & pair_high, string_t & result);
60
61
61
62
private:
62
63
parsing_iter_t _cur;
@@ -443,90 +444,11 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
443
444
case ' t' :
444
445
result.push_back (' \t ' );
445
446
break ;
446
- case ' u' : {
447
- uint16_t cp = 0 ;
448
- for (int i = 0 ; i < 4 ; i++) {
449
- ++_cur;
450
- if (_cur == _end) {
451
- return std::nullopt;
452
- }
453
- if (!std::isxdigit (static_cast <unsigned char >(*_cur))) {
454
- return std::nullopt;
455
- }
456
- cp <<= 4 ;
457
- if (' 0' <= *_cur && *_cur <= ' 9' ) {
458
- cp |= *_cur - ' 0' ;
459
- }
460
- else if (' a' <= *_cur && *_cur <= ' f' ) {
461
- cp |= *_cur - ' a' + 10 ;
462
- }
463
- else if (' A' <= *_cur && *_cur <= ' F' ) {
464
- cp |= *_cur - ' A' + 10 ;
465
- }
466
- else {
467
- return std::nullopt;
468
- }
469
- }
470
- uint32_t ext_cp = cp;
471
- uint16_t hi_cp = 0 , lo_cp = 0 ;
472
- if (0xD800 <= cp && cp <= 0xDBFF ) {
473
- if (pair_high) {
474
- return std::nullopt;
475
- } else {
476
- pair_high = cp;
477
- break ;
478
- }
479
- } else if (0xDC00 <= cp && cp <= 0xDFFF ) {
480
- if (!pair_high) {
481
- return std::nullopt;
482
- } else {
483
- ext_cp = (((pair_high - 0xD800 ) << 10 ) | (cp - 0xDC00 )) + 0x10000 ;
484
- hi_cp = pair_high;
485
- lo_cp = cp;
486
- pair_high = 0 ;
487
- }
488
- }
489
- if constexpr (std::is_same_v<typename string_t ::value_type, char >) {
490
- // utf8
491
- if (ext_cp <= 0x7F ) {
492
- result.push_back (static_cast <char >(ext_cp));
493
- }
494
- else if (ext_cp <= 0x7FF ) {
495
- result.push_back (static_cast <char >(((ext_cp >> 6 ) & 0b00011111 ) | 0b11000000u ));
496
- result.push_back (static_cast <char >((ext_cp & 0b00111111 ) | 0b10000000u ));
497
- }
498
- else if (ext_cp <= 0xFFFF ) {
499
- result.push_back (
500
- static_cast <char >(((ext_cp >> 12 ) & 0b00001111 ) | 0b11100000u ));
501
- result.push_back (static_cast <char >(((ext_cp >> 6 ) & 0b00111111 ) | 0b10000000u ));
502
- result.push_back (static_cast <char >((ext_cp & 0b00111111 ) | 0b10000000u ));
503
- } else {
504
- result.push_back (
505
- static_cast <char >(((ext_cp >> 18 ) & 0b00000111 ) | 0b11110000u ));
506
- result.push_back (static_cast <char >(((ext_cp >> 12 ) & 0b00111111 ) | 0b10000000u ));
507
- result.push_back (static_cast <char >(((ext_cp >> 6 ) & 0b00111111 ) | 0b10000000u ));
508
- result.push_back (static_cast <char >((ext_cp & 0b00111111 ) | 0b10000000u ));
509
- }
510
- }
511
- else if constexpr (std::is_same_v<typename string_t ::value_type, wchar_t >) {
512
- if constexpr (sizeof (wchar_t ) == 4 ) {
513
- result.push_back (static_cast <wchar_t >(ext_cp));
514
- } else if constexpr (sizeof (wchar_t ) == 2 ) {
515
- if (ext_cp <= 0xFFFF ) {
516
- result.push_back (static_cast <wchar_t >(ext_cp));
517
- } else {
518
- result.push_back (static_cast <wchar_t >(hi_cp));
519
- result.push_back (static_cast <wchar_t >(lo_cp));
520
- }
521
- } else {
522
- static_assert (!sizeof (typename string_t ::value_type), " Unsupported wchar" );
523
- }
524
- }
525
- else {
526
- static_assert (!sizeof (typename string_t ::value_type), " Unsupported type" );
447
+ case ' u' :
448
+ if (!skip_unicode_escape (pair_high, result)) {
449
+ return std::nullopt;
527
450
}
528
451
break ;
529
- }
530
452
default :
531
453
// Illegal backslash escape
532
454
return std::nullopt;
@@ -552,6 +474,103 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
552
474
return std::nullopt;
553
475
}
554
476
477
+ template <typename string_t , typename parsing_t , typename accel_traits>
478
+ inline bool parser<string_t , parsing_t , accel_traits>::skip_unicode_escape(
479
+ uint16_t & pair_high,
480
+ string_t & result)
481
+ {
482
+ uint16_t cp = 0 ;
483
+ for (int i = 0 ; i < 4 ; ++i) {
484
+ if (++_cur == _end) {
485
+ return false ;
486
+ }
487
+
488
+ if (!std::isxdigit (static_cast <unsigned char >(*_cur))) {
489
+ return false ;
490
+ }
491
+
492
+ cp <<= 4 ;
493
+
494
+ if (' 0' <= *_cur && *_cur <= ' 9' ) {
495
+ cp |= *_cur - ' 0' ;
496
+ }
497
+ else if (' a' <= *_cur && *_cur <= ' f' ) {
498
+ cp |= *_cur - ' a' + 10 ;
499
+ }
500
+ else if (' A' <= *_cur && *_cur <= ' F' ) {
501
+ cp |= *_cur - ' A' + 10 ;
502
+ }
503
+ else {
504
+ return false ;
505
+ }
506
+ }
507
+
508
+ uint32_t ext_cp = cp;
509
+ uint16_t hi_cp = 0 , lo_cp = 0 ;
510
+
511
+ if (0xD800 <= cp && cp <= 0xDBFF ) {
512
+ if (pair_high) {
513
+ return false ;
514
+ }
515
+ pair_high = cp;
516
+ return true ;
517
+ }
518
+
519
+ if (0xDC00 <= cp && cp <= 0xDFFF ) {
520
+ if (!pair_high) {
521
+ return false ;
522
+ }
523
+ ext_cp = (((pair_high - 0xD800 ) << 10 ) | (cp - 0xDC00 )) + 0x10000 ;
524
+ hi_cp = pair_high;
525
+ lo_cp = cp;
526
+ pair_high = 0 ;
527
+ }
528
+
529
+ if constexpr (std::is_same_v<typename string_t ::value_type, char >) {
530
+ // utf8
531
+ if (ext_cp <= 0x7F ) {
532
+ result.push_back (static_cast <char >(ext_cp));
533
+ }
534
+ else if (ext_cp <= 0x7FF ) {
535
+ result.push_back (static_cast <char >(((ext_cp >> 6 ) & 0b00011111 ) | 0b11000000u ));
536
+ result.push_back (static_cast <char >((ext_cp & 0b00111111 ) | 0b10000000u ));
537
+ }
538
+ else if (ext_cp <= 0xFFFF ) {
539
+ result.push_back (static_cast <char >(((ext_cp >> 12 ) & 0b00001111 ) | 0b11100000u ));
540
+ result.push_back (static_cast <char >(((ext_cp >> 6 ) & 0b00111111 ) | 0b10000000u ));
541
+ result.push_back (static_cast <char >((ext_cp & 0b00111111 ) | 0b10000000u ));
542
+ }
543
+ else {
544
+ result.push_back (static_cast <char >(((ext_cp >> 18 ) & 0b00000111 ) | 0b11110000u ));
545
+ result.push_back (static_cast <char >(((ext_cp >> 12 ) & 0b00111111 ) | 0b10000000u ));
546
+ result.push_back (static_cast <char >(((ext_cp >> 6 ) & 0b00111111 ) | 0b10000000u ));
547
+ result.push_back (static_cast <char >((ext_cp & 0b00111111 ) | 0b10000000u ));
548
+ }
549
+ }
550
+ else if constexpr (std::is_same_v<typename string_t ::value_type, wchar_t >) {
551
+ if constexpr (sizeof (wchar_t ) == 4 ) {
552
+ result.push_back (static_cast <wchar_t >(ext_cp));
553
+ }
554
+ else if constexpr (sizeof (wchar_t ) == 2 ) {
555
+ if (ext_cp <= 0xFFFF ) {
556
+ result.push_back (static_cast <wchar_t >(ext_cp));
557
+ }
558
+ else {
559
+ result.push_back (static_cast <wchar_t >(hi_cp));
560
+ result.push_back (static_cast <wchar_t >(lo_cp));
561
+ }
562
+ }
563
+ else {
564
+ static_assert (!sizeof (typename string_t ::value_type), " Unsupported wchar" );
565
+ }
566
+ }
567
+ else {
568
+ static_assert (!sizeof (typename string_t ::value_type), " Unsupported type" );
569
+ }
570
+
571
+ return true ;
572
+ }
573
+
555
574
template <typename string_t , typename parsing_t , typename accel_traits>
556
575
inline bool parser<string_t , parsing_t , accel_traits>::skip_string_literal_with_accel()
557
576
{
0 commit comments