@@ -5375,6 +5375,11 @@ S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_ta
5375
5375
|| isLB_Space(prev) \
5376
5376
|| isLB_ZWSpace(prev)))
5377
5377
5378
/* Convenience wrappers around backup_one_LB_().  Both forward 'begin', 'cur'
 * and 'utf8' unchanged and differ only in the final flag argument:
 *
 *   backup_one_LB                  passes false
 *   backup_one_LB_but_over_CM_ZWJ  passes true (judging by the name, this asks
 *                                  the helper to keep backing up past CM and
 *                                  ZWJ characters -- confirm against the
 *                                  helper's definition)
 *
 * The '(' must directly follow the macro name; whitespace there would turn
 * each of these into an object-like macro whose replacement text merely
 * begins with a parenthesized list. */
#define backup_one_LB(begin, cur, utf8)                                     \
    backup_one_LB_(begin, cur, utf8, false)
#define backup_one_LB_but_over_CM_ZWJ(begin, cur, utf8)                     \
    backup_one_LB_(begin, cur, utf8, true)
5382
+
5378
5383
STATIC bool
5379
5384
S_isLB (pTHX_ LB_enum before ,
5380
5385
LB_enum after ,
@@ -5491,11 +5496,8 @@ S_isLB(pTHX_ LB_enum before,
5491
5496
5492
5497
/* We don't know how to treat the CM except by looking at the first
5493
5498
* non-CM character preceding it. ZWJ is treated as CM */
5494
- do {
5495
- prev = backup_one_LB (strbeg , & temp_pos , utf8_target );
5496
- }
5497
- while (isLB_Combining_Mark (prev ) || isLB_ZWJ (prev ));
5498
-
5499
+ prev = backup_one_LB_but_over_CM_ZWJ (strbeg , & temp_pos ,
5500
+ utf8_target );
5499
5501
/* Here, 'prev' is that first earlier non-CM character. If the CM
5500
5502
* attaches to it, then it inherits the behavior of 'prev'. If it
5501
5503
* doesn't attach, it is to be treated as an AL */
@@ -5630,41 +5632,55 @@ S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_ta
5630
5632
}
5631
5633
5632
5634
STATIC LB_enum
5633
- S_backup_one_LB (pTHX_ const U8 * const strbeg , U8 * * curpos , const bool utf8_target )
5635
+ S_backup_one_LB_ (pTHX_ const U8 * const strbeg ,
5636
+ U8 * * curpos ,
5637
+ const bool utf8_target ,
5638
+ bool skip_CM_ZWJ )
5634
5639
{
5635
- LB_enum lb ;
5640
+ PERL_ARGS_ASSERT_BACKUP_ONE_LB_ ;
5636
5641
5637
- PERL_ARGS_ASSERT_BACKUP_ONE_LB ;
5642
+ LB_enum isLB_scratch ; /* Used by generated isLB_foo() macros */
5638
5643
5639
5644
if (* curpos < strbeg ) {
5640
5645
return LB_EDGE ;
5641
5646
}
5642
5647
5648
+ LB_enum lb ;
5649
+
5643
5650
if (utf8_target ) {
5644
5651
U8 * prev_char_pos = reghopmaybe3 (* curpos , -1 , strbeg );
5645
- U8 * prev_prev_char_pos ;
5646
-
5647
5652
if (! prev_char_pos ) {
5648
5653
return LB_EDGE ;
5649
5654
}
5650
5655
5651
- if ((prev_prev_char_pos = reghopmaybe3 ((U8 * ) prev_char_pos , -1 , strbeg ))) {
5652
- lb = getLB_VAL_UTF8 (prev_prev_char_pos , prev_char_pos );
5653
- * curpos = prev_char_pos ;
5654
- prev_char_pos = prev_prev_char_pos ;
5655
- }
5656
- else {
5657
- * curpos = (U8 * ) strbeg ;
5658
- return LB_EDGE ;
5659
- }
5656
+ /* Back up one. Keep going if result is CM or ZWJ and caller wants
5657
+ * those skipped. curpos is always just to the right of the character
5658
+ * whose value we are getting */
5659
+ do {
5660
+ U8 * prev_prev_char_pos ;
5661
+ if ((prev_prev_char_pos = reghopmaybe3 ((U8 * ) prev_char_pos ,
5662
+ -1 ,
5663
+ strbeg )))
5664
+ {
5665
+ lb = getLB_VAL_UTF8 (prev_prev_char_pos , prev_char_pos );
5666
+ * curpos = prev_char_pos ;
5667
+ prev_char_pos = prev_prev_char_pos ;
5668
+ }
5669
+ else {
5670
+ * curpos = (U8 * ) strbeg ;
5671
+ return LB_EDGE ;
5672
+ }
5673
+ } while (skip_CM_ZWJ && (isLB_CM (lb ) || isLB_ZWJ (lb )));
5660
5674
}
5661
5675
else {
5662
- if (* curpos - 2 < strbeg ) {
5663
- * curpos = (U8 * ) strbeg ;
5664
- return LB_EDGE ;
5665
- }
5666
- (* curpos )-- ;
5667
- lb = getLB_VAL_CP (* (* curpos - 1 ));
5676
+ do {
5677
+ if (* curpos - 2 < strbeg ) {
5678
+ * curpos = (U8 * ) strbeg ;
5679
+ return LB_EDGE ;
5680
+ }
5681
+ (* curpos )-- ;
5682
+ lb = getLB_VAL_CP (* (* curpos - 1 ));
5683
+ } while (skip_CM_ZWJ && (isLB_CM (lb ) || isLB_ZWJ (lb )));
5668
5684
}
5669
5685
5670
5686
return lb ;
0 commit comments