Skip to content

Commit 22163b2

Browse files
fix: improved handling of start states
1 parent b54cd29 commit 22163b2

File tree

2 files changed

+120
-38
lines changed

2 files changed

+120
-38
lines changed

compiler/src/nfa/builder.rs

Lines changed: 81 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -19,13 +19,13 @@ impl NFAGraph {
1919
let re = PikeVM::new(pattern).map_err(|e| NFAError::RegexCompilation(e.to_string()))?;
2020
let thompson_nfa = re.get_nfa();
2121

22-
let state_len = thompson_nfa.states().len() - 2;
22+
let state_len = thompson_nfa.states().len() - thompson_nfa.start_anchored().as_usize();
2323

2424
let mut graph = Self::default();
2525
graph.regex = pattern.to_string();
2626
graph.initialize_nodes(state_len)?;
2727
graph.process_all_states(&thompson_nfa)?;
28-
graph.set_start_states(&thompson_nfa);
28+
graph.start_states.insert(0);
2929
graph.remove_epsilon_transitions()?;
3030

3131
graph.verify()?;
@@ -50,39 +50,66 @@ impl NFAGraph {
5050
/// Processes all states from the Thompson NFA
5151
fn process_all_states(&mut self, nfa: &NFA) -> NFAResult<()> {
5252
for state_idx in 0..self.nodes.len() {
53-
let state_id =
54-
StateID::new(state_idx + 2).map_err(|e| NFAError::InvalidStateId(e.to_string()))?;
53+
let state_id = StateID::new(state_idx + nfa.start_anchored().as_usize())
54+
.map_err(|e| NFAError::InvalidStateId(e.to_string()))?;
5555

5656
match nfa.state(state_id) {
5757
State::Match { .. } => {
5858
self.accept_states.insert(state_idx);
5959
}
6060
State::ByteRange { trans } => {
61-
self.add_byte_range_transition(state_idx, trans)?;
61+
self.add_byte_range_transition(
62+
nfa.start_anchored().as_usize(),
63+
state_idx,
64+
trans,
65+
)?;
6266
}
6367
State::Sparse(sparse) => {
64-
self.add_sparse_transitions(state_idx, &sparse.transitions)?;
68+
self.add_sparse_transitions(
69+
nfa.start_anchored().as_usize(),
70+
state_idx,
71+
&sparse.transitions,
72+
)?;
6573
}
6674
State::Dense(dense) => {
67-
self.add_dense_transitions(state_idx, &dense.transitions)?;
75+
self.add_dense_transitions(
76+
nfa.start_anchored().as_usize(),
77+
state_idx,
78+
&dense.transitions,
79+
)?;
6880
}
6981
State::Union { alternates } => {
70-
self.add_union_transitions(state_idx, alternates)?;
82+
self.add_union_transitions(
83+
nfa.start_anchored().as_usize(),
84+
state_idx,
85+
alternates,
86+
)?;
7187
}
7288
State::BinaryUnion { alt1, alt2 } => {
73-
self.add_binary_union_transitions(state_idx, alt1, alt2)?;
89+
self.add_binary_union_transitions(
90+
nfa.start_anchored().as_usize(),
91+
state_idx,
92+
alt1,
93+
alt2,
94+
)?;
7495
}
7596
State::Capture {
7697
next,
7798
group_index,
7899
slot,
79100
..
80101
} => {
81-
self.add_capture_transition(state_idx, next, group_index, slot)?;
102+
self.add_capture_transition(
103+
nfa.start_anchored().as_usize(),
104+
state_idx,
105+
next,
106+
group_index,
107+
slot,
108+
)?;
82109
self.num_capture_groups = self.num_capture_groups.max(group_index.as_usize());
83110
}
84111
State::Look { next, .. } => {
85-
self.add_look_transition(state_idx, next)?;
112+
self.add_look_transition(nfa.start_anchored().as_usize(), state_idx, next)?;
86113
}
87114
State::Fail => {} // No transitions needed
88115
}
@@ -91,100 +118,122 @@ impl NFAGraph {
91118
}
92119

93120
/// Adds a byte range transition to the graph
94-
fn add_byte_range_transition(&mut self, state_id: usize, trans: &Transition) -> NFAResult<()> {
121+
fn add_byte_range_transition(
122+
&mut self,
123+
anchored_state_id: usize,
124+
state_id: usize,
125+
trans: &Transition,
126+
) -> NFAResult<()> {
95127
for byte in trans.start..=trans.end {
96128
self.nodes[state_id]
97129
.byte_transitions
98130
.entry(byte)
99131
.or_insert_with(BTreeSet::new)
100-
.insert(trans.next.as_usize() - 2);
132+
.insert(trans.next.as_usize() - anchored_state_id);
101133
}
102134
Ok(())
103135
}
104136

105137
/// Adds transitions from a sparse transition set
106138
fn add_sparse_transitions(
107139
&mut self,
140+
anchored_state_id: usize,
108141
state_id: usize,
109142
transitions: &[Transition],
110143
) -> NFAResult<()> {
111144
for trans in transitions {
112-
self.add_byte_range_transition(state_id, trans)?;
145+
self.add_byte_range_transition(anchored_state_id, state_id, trans)?;
113146
}
114147
Ok(())
115148
}
116149

117150
/// Adds transitions from a dense transition table
118-
fn add_dense_transitions(&mut self, state_id: usize, transitions: &[StateID]) -> NFAResult<()> {
151+
fn add_dense_transitions(
152+
&mut self,
153+
anchored_state_id: usize,
154+
state_id: usize,
155+
transitions: &[StateID],
156+
) -> NFAResult<()> {
119157
for (byte, &next) in transitions.iter().enumerate() {
120158
if next != StateID::ZERO {
121159
self.nodes[state_id]
122160
.byte_transitions
123161
.entry(byte as u8)
124162
.or_insert_with(BTreeSet::new)
125-
.insert(next.as_usize() - 2);
163+
.insert(next.as_usize() - anchored_state_id);
126164
}
127165
}
128166
Ok(())
129167
}
130168

131169
/// Adds epsilon transitions for a union state
132-
fn add_union_transitions(&mut self, state_id: usize, alternates: &[StateID]) -> NFAResult<()> {
133-
self.nodes[state_id]
134-
.epsilon_transitions
135-
.extend(alternates.iter().map(|id| id.as_usize() - 2));
170+
fn add_union_transitions(
171+
&mut self,
172+
anchored_state_id: usize,
173+
state_id: usize,
174+
alternates: &[StateID],
175+
) -> NFAResult<()> {
176+
self.nodes[state_id].epsilon_transitions.extend(
177+
alternates
178+
.iter()
179+
.map(|id| id.as_usize() - anchored_state_id),
180+
);
136181
Ok(())
137182
}
138183

139184
/// Adds epsilon transitions for a binary union state
140185
fn add_binary_union_transitions(
141186
&mut self,
187+
anchored_state_id: usize,
142188
state_id: usize,
143189
alt1: &StateID,
144190
alt2: &StateID,
145191
) -> NFAResult<()> {
146192
let node = &mut self.nodes[state_id];
147-
node.epsilon_transitions.insert(alt1.as_usize() - 2);
148-
node.epsilon_transitions.insert(alt2.as_usize() - 2);
193+
node.epsilon_transitions
194+
.insert(alt1.as_usize() - anchored_state_id);
195+
node.epsilon_transitions
196+
.insert(alt2.as_usize() - anchored_state_id);
149197
Ok(())
150198
}
151199

152200
/// Adds an epsilon transition with capture group information
153201
fn add_capture_transition(
154202
&mut self,
203+
anchored_state_id: usize,
155204
state_id: usize,
156205
next: &StateID,
157206
group_index: &SmallIndex,
158207
slot: &SmallIndex,
159208
) -> NFAResult<()> {
160209
let node = &mut self.nodes[state_id];
161-
node.epsilon_transitions.insert(next.as_usize() - 2);
210+
node.epsilon_transitions
211+
.insert(next.as_usize() - anchored_state_id);
162212

163213
let group_idx = group_index.as_usize();
164214
if group_idx > 0 {
165215
let is_start = slot.as_usize() % 2 == 0;
166216
node.capture_groups
167-
.entry(next.as_usize() - 2)
217+
.entry(next.as_usize() - anchored_state_id)
168218
.or_insert_with(BTreeSet::new)
169219
.insert((group_idx, is_start));
170220
}
171221
Ok(())
172222
}
173223

174224
/// Adds an epsilon transition for a look-around state
175-
fn add_look_transition(&mut self, state_id: usize, next: &StateID) -> NFAResult<()> {
225+
fn add_look_transition(
226+
&mut self,
227+
anchored_state_id: usize,
228+
state_id: usize,
229+
next: &StateID,
230+
) -> NFAResult<()> {
176231
self.nodes[state_id]
177232
.epsilon_transitions
178-
.insert(next.as_usize() - 2);
233+
.insert(next.as_usize() - anchored_state_id);
179234
Ok(())
180235
}
181236

182-
/// Sets the start states for the NFA
183-
fn set_start_states(&mut self, nfa: &NFA) {
184-
self.start_states
185-
.insert(nfa.start_anchored().as_usize() - 2);
186-
}
187-
188237
pub fn pretty_print(&self) {
189238
println!("\n=== NFA Graph ===");
190239
println!("Regex: {}", self.regex);

compiler/src/nfa/epsilon.rs

Lines changed: 39 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -98,15 +98,48 @@ impl NFAGraph {
9898
}
9999
}
100100

101-
// Handle start states - only make byte transition states reachable via epsilon into start states
102-
for &start in &self.start_states {
103-
new_start_states.insert(start);
101+
// Handle start states
102+
// Preserve original start states to iterate over them
103+
let original_start_states_snapshot: BTreeSet<usize> =
104+
self.start_states.iter().copied().collect();
105+
new_start_states.clear();
106+
107+
for &orig_start in &original_start_states_snapshot {
108+
new_start_states.insert(orig_start); // The original start state is always kept
109+
110+
// Check if the closure of this original start state contains any START captures.
111+
// If so, we don't want to create alternative start points from within this closure,
112+
// as that might allow bypassing these essential start captures.
113+
let mut has_start_captures_in_orig_closure = false;
114+
if let Some(orig_closure) = closures.get(orig_start) {
115+
for &(_, (_group_id, is_start_event)) in &orig_closure.captures {
116+
if is_start_event {
117+
has_start_captures_in_orig_closure = true;
118+
break;
119+
}
120+
}
121+
}
104122

105-
for &r_state in &closures[start].states {
106-
if has_byte_transitions[r_state] {
107-
new_start_states.insert(r_state);
123+
if !has_start_captures_in_orig_closure {
124+
// If no start captures in orig_start's closure, it's safe to add
125+
// other states from its closure that have byte transitions as new start states.
126+
if let Some(orig_closure) = closures.get(orig_start) {
127+
for &r_state in &orig_closure.states {
128+
if r_state == orig_start {
129+
continue;
130+
}
131+
// Check if r_state (a state reachable via epsilon from orig_start)
132+
// itself is the source of a byte transition.
133+
// The has_byte_transitions vec was populated based on nodes[r_state].byte_transitions
134+
if r_state < has_byte_transitions.len() && has_byte_transitions[r_state] {
135+
new_start_states.insert(r_state);
136+
}
137+
}
108138
}
109139
}
140+
// If has_start_captures_in_orig_closure is true, we *only* keep orig_start.
141+
// This forces paths through orig_start, ensuring its transitions (which will
142+
// have correctly accumulated these start captures) are used.
110143
}
111144

112145
// Apply changes

0 commit comments

Comments
 (0)