Skip to content

Commit b308817

Browse files
authored
hir: lift alternations' common suffixes too
This should probably produce better regexes internally; additionally, I know it will produce Hirs that are more amenable to being walked recursively (trading away performance for content-intelligence) to produce human-readable regex syntax. (My own use case involves automating the operation of a ghastly pre-existing machine that takes PCREs.)
1 parent 1a069b9 commit b308817

File tree

1 file changed

+61
-2
lines changed

1 file changed

+61
-2
lines changed

regex-syntax/src/hir/mod.rs

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3047,7 +3047,7 @@ fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
30473047
.count();
30483048
prefix = &prefix[..common_len];
30493049
if prefix.is_empty() {
3050-
return Err(hirs);
3050+
return lift_common_suffix(hirs).map(Hir::concat);
30513051
}
30523052
}
30533053
let len = prefix.len();
@@ -3068,10 +3068,69 @@ fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
30683068
}
30693069
}
30703070
let mut concat = prefix_concat;
3071-
concat.push(Hir::alternation(suffix_alts));
3071+
match lift_common_suffix(suffix_alts) {
3072+
Ok(suffix_concat) => {
3073+
concat.extend(suffix_concat);
3074+
}
3075+
Err(suffix_alts) => {
3076+
concat.push(Hir::alternation(suffix_alts));
3077+
}
3078+
}
30723079
Ok(Hir::concat(concat))
30733080
}
30743081

3082+
/// Attempts to factor a shared trailing run of sub-expressions out of the
/// branches of an alternation.
///
/// `hirs` is the list of alternation branches. If there are at least two
/// branches, every branch is a `HirKind::Concat`, and all branches end with
/// the same one or more sub-expressions, this returns `Ok` with a list of
/// concatenation elements: first an alternation of what is left of each
/// branch once the shared tail is removed, then the shared tail itself.
/// The `Ok` value is a list of elements rather than a finished `Hir`; the
/// caller is responsible for wrapping it (e.g. with `Hir::concat`).
///
/// In every other case the branches are handed back untouched as
/// `Err(hirs)` so the caller can fall back to building a plain alternation.
#[allow(clippy::inline_always)]
3083+
#[inline(always)] // prevents blowing the stack
3084+
fn lift_common_suffix(hirs: Vec<Hir>) -> Result<Vec<Hir>, Vec<Hir>> {
3085+
// With zero or one branch there is nothing to factor.
if hirs.len() <= 1 {
3086+
return Err(hirs);
3087+
}
3088+
// Seed the candidate suffix with the whole last branch. The `unwrap`
// cannot fail because the length check above guarantees a last element.
let mut suffix = match hirs.last().unwrap().kind() {
3089+
HirKind::Concat(ref xs) => &**xs,
3090+
_ => return Err(hirs),
3091+
};
3092+
if suffix.is_empty() {
3093+
return Err(hirs);
3094+
}
3095+
// Walk the remaining branches (last-to-first, skipping the seed) and
// shrink the candidate to the tail it shares with each of them.
for h in hirs.iter().rev().skip(1) {
3096+
let concat = match h.kind() {
3097+
HirKind::Concat(ref xs) => xs,
3098+
_ => return Err(hirs),
3099+
};
3100+
// Count how many trailing elements match, comparing from the end.
let common_len = suffix
3101+
.iter()
3102+
.rev()
3103+
.zip(concat.iter().rev())
3104+
.take_while(|(x, y)| x == y)
3105+
.count();
3106+
// Keep only the last `common_len` elements of the candidate.
suffix = &suffix[suffix.len()-common_len..];
3107+
// No shared tail left: give up and return the branches unchanged.
if suffix.is_empty() {
3108+
return Err(hirs);
3109+
}
3110+
}
3111+
// Every early exit above fires on an empty candidate, so `len` is
// non-zero here; the assertion documents that invariant.
let len = suffix.len();
3112+
assert_ne!(0, len);
3113+
let mut suffix_concat = vec![];
3114+
let mut prefix_alts = vec![];
3115+
// Consume the branches, splitting each into its own leading part and the
// shared tail of length `len`.
for h in hirs {
3116+
let mut concat = match h.into_kind() {
3117+
HirKind::Concat(xs) => xs,
3118+
// We required all sub-expressions to be
3119+
// concats above, so we're only here if we
3120+
// have a concat.
3121+
_ => unreachable!(),
3122+
};
3123+
// `split_off` leaves the leading part in `concat` and returns the tail.
let suffix = concat.split_off(concat.len()-len);
3124+
prefix_alts.push(Hir::concat(concat));
3125+
// All tails compared equal above, so only the first one is kept. Since
// `len` is non-zero, `suffix_concat` is empty only on the first pass.
if suffix_concat.is_empty() {
3126+
suffix_concat = suffix;
3127+
}
3128+
}
3129+
// Result layout: [alternation of leading parts, shared tail...].
let mut concat = suffix_concat;
3130+
concat.insert(0, Hir::alternation(prefix_alts));
3131+
Ok(concat)
3132+
}
3133+
30753134
#[cfg(test)]
30763135
mod tests {
30773136
use super::*;

0 commit comments

Comments
 (0)