Skip to content

Commit f5541f6

Browse files
committed
regexec.c: Change static function API
Sometimes this functionality is needed to also skip over certain intervening classes of characters while backing up in the parse string. This commit creates two macros to call the modified underlying function with a boolean flag. The names of the macros make it easy to know what's happening.
1 parent 8ebc220 commit f5541f6

File tree

4 files changed

+47
-30
lines changed

4 files changed

+47
-30
lines changed

embed.fnc

+3-2
Original file line numberDiff line numberDiff line change
@@ -5651,9 +5651,10 @@ ERS |WB_enum|advance_one_WB_|NN U8 **curpos \
56515651
ERS |GCB_enum|backup_one_GCB|NN const U8 * const strbeg \
56525652
|NN U8 **curpos \
56535653
|const bool utf8_target
5654-
ERS |LB_enum|backup_one_LB |NN const U8 * const strbeg \
5654+
ERS |LB_enum|backup_one_LB_ |NN const U8 * const strbeg \
56555655
|NN U8 **curpos \
5656-
|const bool utf8_target
5656+
|const bool utf8_target \
5657+
|bool skip_CM_ZWJ
56575658
ERS |SB_enum|backup_one_SB |NN const U8 * const strbeg \
56585659
|NN U8 **curpos \
56595660
|const bool utf8_target

embed.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -2014,7 +2014,7 @@
20142014
# define advance_one_SB(a,b,c) S_advance_one_SB(aTHX_ a,b,c)
20152015
# define advance_one_WB_(a,b,c,d) S_advance_one_WB_(aTHX_ a,b,c,d)
20162016
# define backup_one_GCB(a,b,c) S_backup_one_GCB(aTHX_ a,b,c)
2017-
# define backup_one_LB(a,b,c) S_backup_one_LB(aTHX_ a,b,c)
2017+
# define backup_one_LB_(a,b,c,d) S_backup_one_LB_(aTHX_ a,b,c,d)
20182018
# define backup_one_SB(a,b,c) S_backup_one_SB(aTHX_ a,b,c)
20192019
# define backup_one_WB_but_over_Extend_FO(a,b,c,d) S_backup_one_WB_but_over_Extend_FO(aTHX_ a,b,c,d)
20202020
# define capture_clear(a,b,c,d) S_capture_clear(aTHX_ a,b,c,d comma_aDEPTH)

proto.h

+2-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

regexec.c

+41-25
Original file line numberDiff line numberDiff line change
@@ -5375,6 +5375,11 @@ S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_ta
53755375
|| isLB_Space(prev) \
53765376
|| isLB_ZWSpace(prev)))
53775377

5378+
#define backup_one_LB(begin, cur, utf8) \
5379+
backup_one_LB_(begin, cur, utf8, false)
5380+
#define backup_one_LB_but_over_CM_ZWJ(begin, cur, utf8) \
5381+
backup_one_LB_(begin, cur, utf8, true)
5382+
53785383
STATIC bool
53795384
S_isLB(pTHX_ LB_enum before,
53805385
LB_enum after,
@@ -5491,11 +5496,8 @@ S_isLB(pTHX_ LB_enum before,
54915496

54925497
/* We don't know how to treat the CM except by looking at the first
54935498
* non-CM character preceding it. ZWJ is treated as CM */
5494-
do {
5495-
prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5496-
}
5497-
while (isLB_Combining_Mark(prev) || isLB_ZWJ(prev));
5498-
5499+
prev = backup_one_LB_but_over_CM_ZWJ(strbeg, &temp_pos,
5500+
utf8_target);
54995501
/* Here, 'prev' is that first earlier non-CM character. If the CM
55005502
* attaches to it, then it inherits the behavior of 'prev'. If it
55015503
* doesn't attach, it is to be treated as an AL */
@@ -5630,41 +5632,55 @@ S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_ta
56305632
}
56315633

56325634
STATIC LB_enum
5633-
S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5635+
S_backup_one_LB_(pTHX_ const U8 * const strbeg,
5636+
U8 ** curpos,
5637+
const bool utf8_target,
5638+
bool skip_CM_ZWJ)
56345639
{
5635-
LB_enum lb;
5640+
PERL_ARGS_ASSERT_BACKUP_ONE_LB_;
56365641

5637-
PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5642+
LB_enum isLB_scratch; /* Used by generated isLB_foo() macros */
56385643

56395644
if (*curpos < strbeg) {
56405645
return LB_EDGE;
56415646
}
56425647

5648+
LB_enum lb;
5649+
56435650
if (utf8_target) {
56445651
U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5645-
U8 * prev_prev_char_pos;
5646-
56475652
if (! prev_char_pos) {
56485653
return LB_EDGE;
56495654
}
56505655

5651-
if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5652-
lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5653-
*curpos = prev_char_pos;
5654-
prev_char_pos = prev_prev_char_pos;
5655-
}
5656-
else {
5657-
*curpos = (U8 *) strbeg;
5658-
return LB_EDGE;
5659-
}
5656+
/* Back up one. Keep going if result is CM or ZWJ and caller wants
5657+
* those skipped. curpos is always just to the right of the character
5658+
* whose value we are getting */
5659+
do {
5660+
U8 * prev_prev_char_pos;
5661+
if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5662+
-1,
5663+
strbeg)))
5664+
{
5665+
lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5666+
*curpos = prev_char_pos;
5667+
prev_char_pos = prev_prev_char_pos;
5668+
}
5669+
else {
5670+
*curpos = (U8 *) strbeg;
5671+
return LB_EDGE;
5672+
}
5673+
} while (skip_CM_ZWJ && (isLB_CM(lb) || isLB_ZWJ(lb)));
56605674
}
56615675
else {
5662-
if (*curpos - 2 < strbeg) {
5663-
*curpos = (U8 *) strbeg;
5664-
return LB_EDGE;
5665-
}
5666-
(*curpos)--;
5667-
lb = getLB_VAL_CP(*(*curpos - 1));
5676+
do {
5677+
if (*curpos - 2 < strbeg) {
5678+
*curpos = (U8 *) strbeg;
5679+
return LB_EDGE;
5680+
}
5681+
(*curpos)--;
5682+
lb = getLB_VAL_CP(*(*curpos - 1));
5683+
} while (skip_CM_ZWJ && (isLB_CM(lb) || isLB_ZWJ(lb)));
56685684
}
56695685

56705686
return lb;

0 commit comments

Comments
 (0)