diff --git a/Package.swift b/Package.swift index b9b9d6d71..1f5e10f0a 100644 --- a/Package.swift +++ b/Package.swift @@ -59,9 +59,9 @@ let package = Package( name: "VariadicsGenerator", targets: ["VariadicsGenerator"]), // Disable to work around rdar://126877024 -// .executable( -// name: "RegexBenchmark", -// targets: ["RegexBenchmark"]) + .executable( + name: "RegexBenchmark", + targets: ["RegexBenchmark"]) ], dependencies: [ .package(url: "https://github.com/apple/swift-argument-parser", from: "1.0.0"), @@ -143,17 +143,17 @@ let package = Package( "_StringProcessing" ], swiftSettings: [availabilityDefinition]), -// .executableTarget( -// name: "RegexBenchmark", -// dependencies: [ -// .product(name: "ArgumentParser", package: "swift-argument-parser"), -// "_RegexParser", -// "_StringProcessing", -// "RegexBuilder" -// ], -// swiftSettings: [ -// .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]), -// ]), + .executableTarget( + name: "RegexBenchmark", + dependencies: [ + .product(name: "ArgumentParser", package: "swift-argument-parser"), + "_RegexParser", + "_StringProcessing", + "RegexBuilder" + ], + swiftSettings: [ + .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]), + ]), // MARK: Exercises .target( diff --git a/Sources/RegexBenchmark/Benchmark.swift b/Sources/RegexBenchmark/Benchmark.swift index 3a967c022..bcf8fa42a 100644 --- a/Sources/RegexBenchmark/Benchmark.swift +++ b/Sources/RegexBenchmark/Benchmark.swift @@ -153,6 +153,8 @@ struct CrossBenchmark { /// Whether to also run scalar-semantic mode var alsoRunScalarSemantic: Bool = true + var alsoRunSimpleWordBoundaries: Bool = false + func register(_ runner: inout BenchmarkRunner) { if isWhole { runner.registerCrossBenchmark( @@ -160,14 +162,16 @@ struct CrossBenchmark { input: input, pattern: regex, .whole, - alsoRunScalarSemantic: alsoRunScalarSemantic) + alsoRunScalarSemantic: alsoRunScalarSemantic, + alsoRunSimpleWordBoundaries: alsoRunSimpleWordBoundaries) } else { runner.registerCrossBenchmark( nameBase: baseName, input: input, pattern: regex, .allMatches, - alsoRunScalarSemantic: alsoRunScalarSemantic) + alsoRunScalarSemantic: alsoRunScalarSemantic, + alsoRunSimpleWordBoundaries: alsoRunSimpleWordBoundaries) if includeFirst || runner.includeFirstOverride { runner.registerCrossBenchmark( @@ -175,7 +179,8 @@ struct CrossBenchmark { input: input, pattern: regex, .first, - alsoRunScalarSemantic: alsoRunScalarSemantic) + alsoRunScalarSemantic: alsoRunScalarSemantic, + alsoRunSimpleWordBoundaries: alsoRunSimpleWordBoundaries) } } } diff --git a/Sources/RegexBenchmark/BenchmarkRegistration.swift b/Sources/RegexBenchmark/BenchmarkRegistration.swift index a3abef8e4..e12502e99 100644 --- a/Sources/RegexBenchmark/BenchmarkRegistration.swift +++ b/Sources/RegexBenchmark/BenchmarkRegistration.swift @@ -18,6 +18,8 @@ extension BenchmarkRunner { self.addDiceNotation() self.addErrorMessages() self.addIpAddress() + + self.addURLWithWordBoundaries() // -- end of registrations -- } } diff --git a/Sources/RegexBenchmark/BenchmarkRunner.swift b/Sources/RegexBenchmark/BenchmarkRunner.swift index b067b9679..6abee43aa 100644 --- a/Sources/RegexBenchmark/BenchmarkRunner.swift +++ b/Sources/RegexBenchmark/BenchmarkRunner.swift @@ -33,7 +33,8 @@ struct BenchmarkRunner { input: String, pattern: String, _ type: Benchmark.MatchType, - alsoRunScalarSemantic: Bool = true + alsoRunScalarSemantic: Bool = true, + alsoRunSimpleWordBoundaries: Bool ) { let swiftRegex = try! Regex(pattern) let nsRegex: NSRegularExpression @@ -58,6 +59,16 @@ struct BenchmarkRunner { type: .init(type), target: input)) + if alsoRunSimpleWordBoundaries { + register( + Benchmark( + name: nameBase + nameSuffix + "_SimpleWordBoundaries", + regex: swiftRegex.wordBoundaryKind(.simple), + pattern: pattern, + type: type, + target: input)) + } + if alsoRunScalarSemantic { register( Benchmark( diff --git a/Sources/RegexBenchmark/CLI.swift b/Sources/RegexBenchmark/CLI.swift index 77ebff47b..67dc4f8e2 100644 --- a/Sources/RegexBenchmark/CLI.swift +++ b/Sources/RegexBenchmark/CLI.swift @@ -37,7 +37,10 @@ struct Runner: ParsableCommand { @Flag(help: "Exclude running NSRegex benchmarks") var excludeNs = false - + + @Flag(help: "Rather than specify specific-benchmarks as patterns, use exact names") + var exactName = false + @Flag(help: """ Enable tracing of the engine (warning: lots of output). Prints out processor state each cycle @@ -73,7 +76,11 @@ swift build -c release -Xswiftc -DPROCESSOR_MEASUREMENTS_ENABLED if !self.specificBenchmarks.isEmpty { runner.suite = runner.suite.filter { b in specificBenchmarks.contains { pattern in - try! Regex(pattern).firstMatch(in: b.name) != nil + if exactName { + return pattern == b.name + } + + return try! Regex(pattern).firstMatch(in: b.name) != nil } } } diff --git a/Sources/RegexBenchmark/Inputs/URL.swift b/Sources/RegexBenchmark/Inputs/URL.swift new file mode 100644 index 000000000..b1b03f53d --- /dev/null +++ b/Sources/RegexBenchmark/Inputs/URL.swift @@ -0,0 +1,22 @@ +extension Inputs { + static let url: String = { + let element = """ + Item 1 | Item 2® •Item 3 Item4 + + + \t\t\t + + Check it out here: http://www.test.com/this-is-a-fake-url-that-should-be-replaced?a=1 + Check it out here: https://www.test.com/this-is-a-fake-url-that-should-be-replaced?a=1 + This is not a web link ftp://user@host:domain.com/path + This is a link without a scheme www.apple.com/mac + + This is some good text and should not be removed. + Thanks. + 😀🩷🤵🏿 + """ + let multiplier = 30 + return Array(repeating: element, count: multiplier).joined() + }() + +} diff --git a/Sources/RegexBenchmark/Suite/URLRegex.swift b/Sources/RegexBenchmark/Suite/URLRegex.swift new file mode 100644 index 000000000..e5f00f4e7 --- /dev/null +++ b/Sources/RegexBenchmark/Suite/URLRegex.swift @@ -0,0 +1,14 @@ +import _StringProcessing + +extension BenchmarkRunner { + mutating func addURLWithWordBoundaries() { + let urlRegex = #"https?://([-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6})\b[-a-zA-Z0-9()@:%_+.~#?&=]*"# + let url = CrossBenchmark( + baseName: "URLWithWordBoundaries", + regex: urlRegex, + input: Inputs.url, + alsoRunSimpleWordBoundaries: true + ) + url.register(&self) + } +} diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift index a82fb875c..9c750c979 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift @@ -99,10 +99,10 @@ struct RegexRangesSequence { regex: Regex ) { self.base = .init( + program: regex.regex.program.loweredProgram, input: input, subjectBounds: subjectBounds, - searchBounds: searchBounds, - regex: regex) + searchBounds: searchBounds) } } diff --git a/Sources/_StringProcessing/Algorithms/Matching/Matches.swift b/Sources/_StringProcessing/Algorithms/Matching/Matches.swift index 3c435be97..9b111d15f 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/Matches.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/Matches.swift @@ -12,85 +12,7 @@ // MARK: Regex algorithms @available(SwiftStdlib 5.7, *) -struct RegexMatchesSequence { - let input: String - let subjectBounds: Range - let searchBounds: Range - let regex: Regex - - init( - input: String, - subjectBounds: Range, - searchBounds: Range, - regex: Regex - ) { - self.input = input - self.subjectBounds = subjectBounds - self.searchBounds = searchBounds - self.regex = regex - } -} - -@available(SwiftStdlib 5.7, *) -extension RegexMatchesSequence: Sequence { - /// Returns the index to start searching for the next match after `match`. - fileprivate func searchIndex(after match: Regex.Match) -> String.Index? { - if !match.range.isEmpty { - return match.range.upperBound - } - - // If the last match was an empty match, advance by one position and - // run again, unless at the end of `input`. - guard match.range.lowerBound < subjectBounds.upperBound else { - return nil - } - - switch regex.initialOptions.semanticLevel { - case .graphemeCluster: - return input.index(after: match.range.upperBound) - case .unicodeScalar: - return input.unicodeScalars.index(after: match.range.upperBound) - } - } - - struct Iterator: IteratorProtocol { - let base: RegexMatchesSequence - - // Because `RegexMatchesCollection` eagerly computes the first match for - // its `startIndex`, the iterator can use that match for its initial - // iteration. For subsequent calls to `next()`, this value is `false`, and - // `nextStart` is used to search for the next match. - var initialIteration = true - - // Set to nil when iteration is finished (because some regex can empty-match - // at the end of the subject). - var currentPosition: String.Index? - - init(_ matches: RegexMatchesSequence) { - self.base = matches - self.currentPosition = base.subjectBounds.lowerBound - } - - mutating func next() -> Regex.Match? { - // `currentPosition` is `nil` when iteration has completed - guard let position = currentPosition, position <= base.searchBounds.upperBound else { - return nil - } - - // Otherwise, find the next match (if any) and compute `nextStart` - let match = try? base.regex._firstMatch( - base.input, - subjectBounds: base.subjectBounds, - searchBounds: position.. Iterator { - Iterator(self) - } -} +typealias RegexMatchesSequence = Executor.Matches extension BidirectionalCollection where SubSequence == Substring { @available(SwiftStdlib 5.7, *) @@ -99,10 +21,10 @@ extension BidirectionalCollection where SubSequence == Substring { of regex: R ) -> RegexMatchesSequence { RegexMatchesSequence( + program: regex.regex.program.loweredProgram, input: self[...].base, subjectBounds: startIndex...Match> for SE-0346 @@ -116,6 +38,7 @@ extension BidirectionalCollection where SubSequence == Substring { // FIXME: Array init calls count, which double-executes the regex :-( // FIXME: just return some Collection.Match> var result = Array.Match>() + for match in _matches(of: r) { result.append(match) } diff --git a/Sources/_StringProcessing/CMakeLists.txt b/Sources/_StringProcessing/CMakeLists.txt index 3b00470ff..0ebe2aed6 100644 --- a/Sources/_StringProcessing/CMakeLists.txt +++ b/Sources/_StringProcessing/CMakeLists.txt @@ -18,8 +18,6 @@ add_library(_StringProcessing Algorithms/Searchers/CollectionSearcher.swift Algorithms/Searchers/ZSearcher.swift Engine/Backtracking.swift - Engine/Consume.swift - Engine/Engine.swift Engine/InstPayload.swift Engine/Instruction.swift Engine/MEBuilder.swift diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 530788266..33cffaf20 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -89,7 +89,7 @@ func _compileRegex( _ regex: String, _ syntax: SyntaxOptions = .traditional, _ semanticLevel: RegexSemanticLevel? = nil -) throws -> Executor { +) throws -> MEProgram { let ast = try parse(regex, syntax) let dsl: DSLTree @@ -104,7 +104,7 @@ func _compileRegex( dsl = ast.dslTree } let program = try Compiler(tree: dsl).emit() - return Executor(program: program) + return program } @_spi(RegexBenchmark) diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift deleted file mode 100644 index 6af973919..000000000 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ /dev/null @@ -1,58 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -var checkComments = true - -extension Engine { - func makeProcessor( - input: String, bounds: Range, matchMode: MatchMode - ) -> Processor { - Processor( - program: program, - input: input, - subjectBounds: bounds, - searchBounds: bounds, - matchMode: matchMode, - isTracingEnabled: enableTracing, - shouldMeasureMetrics: enableMetrics) - } - - func makeFirstMatchProcessor( - input: String, - subjectBounds: Range, - searchBounds: Range - ) -> Processor { - Processor( - program: program, - input: input, - subjectBounds: subjectBounds, - searchBounds: searchBounds, - matchMode: .partialFromFront, - isTracingEnabled: enableTracing, - shouldMeasureMetrics: enableMetrics) - } -} - -extension Processor { - // TODO: Should we throw here? - mutating func consume() -> Input.Index? { - while true { - switch self.state { - case .accept: - return self.currentPosition - case .fail: - return nil - case .inProgress: self.cycle() - } - } - } -} - diff --git a/Sources/_StringProcessing/Engine/Engine.swift b/Sources/_StringProcessing/Engine/Engine.swift deleted file mode 100644 index a5cb11bd6..000000000 --- a/Sources/_StringProcessing/Engine/Engine.swift +++ /dev/null @@ -1,37 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -// Currently, engine binds the type and consume binds an instance. -// But, we can play around with this. -struct Engine { - - let program: MEProgram - - // TODO: Pre-allocated register banks - - var instructions: InstructionList { program.instructions } - - var enableTracing: Bool { program.enableTracing } - var enableMetrics: Bool { program.enableMetrics } - - init(_ program: MEProgram) { - self.program = program - } -} - -struct AsyncEngine { /* ... */ } - -extension Engine: CustomStringConvertible { - var description: String { - // TODO: better description - return program.description - } -} diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index d6b2cfe0c..520f6cf66 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -18,7 +18,7 @@ enum MatchMode { /// A concrete CU. Somehow will run the concrete logic and /// feed stuff back to generic code -struct Controller { +struct Controller: Equatable { var pc: InstructionAddress mutating func step() { @@ -48,6 +48,16 @@ struct Processor { /// `input.startIndex.. + let matchMode: MatchMode + let instructions: InstructionList + + // MARK: Update-only state + + var wordIndexCache: Set? = nil + var wordIndexMaxIndex: String.Index? = nil + + // MARK: Resettable state + /// The bounds within the subject for an individual search. /// /// `searchBounds` is equal to `subjectBounds` in some cases, but can be a @@ -57,12 +67,7 @@ struct Processor { /// Anchors like `^` and `.startOfSubject` use `subjectBounds` instead of /// `searchBounds`. The "start of matching" anchor `\G` uses `searchBounds` /// as its starting point. - let searchBounds: Range - - let matchMode: MatchMode - let instructions: InstructionList - - // MARK: Resettable state + var searchBounds: Range /// The current search position while processing. /// @@ -80,9 +85,6 @@ struct Processor { var storedCaptures: Array<_StoredCapture> - var wordIndexCache: Set? = nil - var wordIndexMaxIndex: String.Index? = nil - var state: State = .inProgress var failureReason: Error? = nil @@ -103,9 +105,7 @@ extension Processor { input: Input, subjectBounds: Range, searchBounds: Range, - matchMode: MatchMode, - isTracingEnabled: Bool, - shouldMeasureMetrics: Bool + matchMode: MatchMode ) { self.controller = Controller(pc: 0) self.instructions = program.instructions @@ -115,8 +115,8 @@ extension Processor { self.matchMode = matchMode self.metrics = ProcessorMetrics( - isTracingEnabled: isTracingEnabled, - shouldMeasureMetrics: shouldMeasureMetrics) + isTracingEnabled: program.enableTracing, + shouldMeasureMetrics: program.enableMetrics) self.currentPosition = searchBounds.lowerBound @@ -128,15 +128,23 @@ extension Processor { _checkInvariants() } - mutating func reset(currentPosition: Position) { + mutating func reset( + currentPosition: Position, + searchBounds: Range + ) { self.currentPosition = currentPosition + self.searchBounds = searchBounds self.controller = Controller(pc: 0) self.registers.reset(sentinel: searchBounds.upperBound) - self.savePoints.removeAll(keepingCapacity: true) - self.callStack.removeAll(keepingCapacity: true) + if !self.savePoints.isEmpty { + self.savePoints.removeAll(keepingCapacity: true) + } + if !self.callStack.isEmpty { + self.callStack.removeAll(keepingCapacity: true) + } for idx in storedCaptures.indices { storedCaptures[idx] = .init() @@ -149,6 +157,22 @@ extension Processor { _checkInvariants() } + // Check that resettable state has been reset. Note that `reset()` + // takes a new current position and search bounds. + func isReset() -> Bool { + _checkInvariants() + guard self.controller == Controller(pc: 0), + self.savePoints.isEmpty, + self.callStack.isEmpty, + self.storedCaptures.allSatisfy({ $0.range == nil }), + self.state == .inProgress, + self.failureReason == nil + else { + return false + } + return true + } + func _checkInvariants() { assert(searchBounds.lowerBound >= subjectBounds.lowerBound) assert(searchBounds.upperBound <= subjectBounds.upperBound) diff --git a/Sources/_StringProcessing/Engine/Registers.swift b/Sources/_StringProcessing/Engine/Registers.swift index 7c0d8e2a7..43fb0b8d7 100644 --- a/Sources/_StringProcessing/Engine/Registers.swift +++ b/Sources/_StringProcessing/Engine/Registers.swift @@ -41,6 +41,8 @@ extension Processor { // MARK: writeable, resettable + var isDirty = false + // currently, useful for range-based quantification var ints: [Int] @@ -58,17 +60,22 @@ extension Processor.Registers { } subscript(_ i: IntRegister) -> Int { get { ints[i.rawValue] } - set { ints[i.rawValue] = newValue } + set { + isDirty = true + ints[i.rawValue] = newValue + } } subscript(_ i: ValueRegister) -> Any { get { values[i.rawValue] } set { + isDirty = true values[i.rawValue] = newValue } } subscript(_ i: PositionRegister) -> Input.Index { get { positions[i.rawValue] } set { + isDirty = true positions[i.rawValue] = newValue } } @@ -128,6 +135,9 @@ extension Processor.Registers { } mutating func reset(sentinel: Input.Index) { + guard isDirty else { + return + } self.ints._setAll(to: 0) self.values._setAll(to: SentinelValue()) self.positions._setAll(to: Processor.Registers.sentinelIndex) diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 5cf702514..6befcdbc8 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -11,93 +11,220 @@ internal import _RegexParser -struct Executor { - // TODO: consider let, for now lets us toggle tracing - var engine: Engine +/// `Executor` encapsulates the execution of the regex engine post-compilation. +/// It doesn't know anything about the `Regex` type or how to compile a regex. +@available(SwiftStdlib 5.7, *) +enum Executor { + static func prefixMatch( + _ program: MEProgram, + _ input: String, + subjectBounds: Range, + searchBounds: Range + ) throws -> Regex.Match? { + try Executor._run( + program, + input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + mode: .partialFromFront) + } - init(program: MEProgram) { - self.engine = Engine(program) + static func wholeMatch( + _ program: MEProgram, + _ input: String, + subjectBounds: Range, + searchBounds: Range + ) throws -> Regex.Match? { + try Executor._run( + program, + input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + mode: .wholeString) } - @available(SwiftStdlib 5.7, *) - func firstMatch( + static func firstMatch( + _ program: MEProgram, _ input: String, subjectBounds: Range, - searchBounds: Range, - graphemeSemantic: Bool + searchBounds: Range ) throws -> Regex.Match? { - var cpu = engine.makeFirstMatchProcessor( + var cpu = Processor( + program: program, input: input, subjectBounds: subjectBounds, - searchBounds: searchBounds) -#if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } -#endif - var low = searchBounds.lowerBound - let high = searchBounds.upperBound + searchBounds: searchBounds, + matchMode: .partialFromFront) + return try Executor._firstMatch( + program, + using: &cpu) + } + + static func _firstMatch( + _ program: MEProgram, + using cpu: inout Processor + ) throws -> Regex.Match? { + let isGraphemeSemantic = program.initialOptions.semanticLevel == .graphemeCluster + + var low = cpu.searchBounds.lowerBound + let high = cpu.searchBounds.upperBound while true { - if let m: Regex.Match = try _match( - input, from: low, using: &cpu - ) { + if let m = try Executor._run(program, &cpu) { return m } - if low >= high { return nil } - if graphemeSemantic { - low = input.index( - low, offsetBy: 1, limitedBy: searchBounds.upperBound) ?? searchBounds.upperBound + // Fast-path for start-anchored regex + if program.canOnlyMatchAtStart { + return nil + } + if low == high { return nil } + if isGraphemeSemantic { + cpu.input.formIndex(after: &low) } else { - input.unicodeScalars.formIndex(after: &low) + cpu.input.unicodeScalars.formIndex(after: &low) + } + guard low <= high else { + return nil } - cpu.reset(currentPosition: low) + cpu.reset(currentPosition: low, searchBounds: cpu.searchBounds) + } + } +} + +@available(SwiftStdlib 5.7, *) +extension Executor { + struct Matches: Sequence { + var program: MEProgram + var input: String + var subjectBounds: Range + var searchBounds: Range + + struct Iterator: IteratorProtocol { + var program: MEProgram + var processor: Processor + var finished = false + } + + func makeIterator() -> Iterator { + Iterator( + program: program, + processor: Processor( + program: program, + input: input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + matchMode: .partialFromFront)) + } + } +} + +@available(SwiftStdlib 5.7, *) +extension Executor.Matches.Iterator { + func nextSearchIndex( + after range: Range + ) -> String.Index? { + if !range.isEmpty { + return range.upperBound + } + + // If the last match was an empty match, advance by one position and + // run again, unless at the end of `input`. + guard range.lowerBound < processor.subjectBounds.upperBound else { + return nil + } + + switch program.initialOptions.semanticLevel { + case .graphemeCluster: + return processor.input.index(after: range.upperBound) + case .unicodeScalar: + return processor.input.unicodeScalars.index(after: range.upperBound) + } + } + + mutating func next() -> Regex.Match? { + if finished { + return nil + } + guard let match = try? Executor._firstMatch( + program, using: &processor + ) else { + return nil + } + + // If there's more input to process, advance our position + // and search bounds. Otherwise, set to fail fast. + if let currentPosition = nextSearchIndex(after: match.range) { + processor.reset( + currentPosition: currentPosition, + searchBounds: currentPosition..( +@available(SwiftStdlib 5.7, *) +extension Executor { + static func _run( + _ program: MEProgram, _ input: String, - in subjectBounds: Range, - _ mode: MatchMode + subjectBounds: Range, + searchBounds: Range, + mode: MatchMode ) throws -> Regex.Match? { - var cpu = engine.makeProcessor( - input: input, bounds: subjectBounds, matchMode: mode) -#if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } -#endif - return try _match(input, from: subjectBounds.lowerBound, using: &cpu) + var cpu = Processor( + program: program, + input: input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + matchMode: mode) + return try _run(program, &cpu) } - @available(SwiftStdlib 5.7, *) - func _match( - _ input: String, - from currentPosition: String.Index, - using cpu: inout Processor + static func _run( + _ program: MEProgram, + _ cpu: inout Processor ) throws -> Regex.Match? { - // FIXME: currentPosition is already encapsulated in cpu, don't pass in - // FIXME: cpu.consume() should return the matched range, not the upper bound - guard let endIdx = cpu.consume() else { - if let e = cpu.failureReason { - throw e - } + + let startPosition = cpu.currentPosition + guard let endIdx = try cpu.run() else { return nil } - let capList = MECaptureList( values: cpu.storedCaptures, - referencedCaptureOffsets: engine.program.referencedCaptureOffsets) + referencedCaptureOffsets: program.referencedCaptureOffsets) - let range = currentPosition.., - _ mode: MatchMode - ) throws -> Regex.Match? { - try match(input, in: subjectBounds, mode) +extension Processor { + fileprivate mutating func run() throws -> Input.Index? { +#if PROCESSOR_MEASUREMENTS_ENABLED + defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } +#endif + if self.state == .fail { + if let e = failureReason { + throw e + } + return nil + } + assert(isReset()) + while true { + switch self.state { + case .accept: + return self.currentPosition + case .fail: + if let e = failureReason { + throw e + } + return nil + case .inProgress: self.cycle() + } + } } } diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index 0b0b2e797..b4cecead2 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -109,7 +109,12 @@ extension Regex { /// - Returns: The match, if this regex matches the entirety of `string`; /// otherwise, `nil`. public func wholeMatch(in string: String) throws -> Regex.Match? { - try _match(string, in: string.startIndex.. Regex.Match? { - try _match(string, in: string.startIndex.. Regex.Match? { - try _firstMatch(string, in: string.startIndex.. Regex.Match? { - try _match(string.base, in: string.startIndex.. Regex.Match? { - try _match(string.base, in: string.startIndex.. Regex.Match? { - try _firstMatch(string.base, in: string.startIndex.., - mode: MatchMode = .wholeString - ) throws -> Regex.Match? { - let executor = Executor(program: regex.program.loweredProgram) - return try executor.match(input, in: subjectBounds, mode) - } - - func _firstMatch( - _ input: String, - in subjectBounds: Range - ) throws -> Regex.Match? { - try regex.program.loweredProgram.canOnlyMatchAtStart - ? _match(input, in: subjectBounds, mode: .partialFromFront) - : _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds) - } - - func _firstMatch( - _ input: String, - subjectBounds: Range, - searchBounds: Range - ) throws -> Regex.Match? { - let executor = Executor(program: regex.program.loweredProgram) - let graphemeSemantic = regex.initialOptions.semanticLevel == .graphemeCluster - return try executor.firstMatch( - input, - subjectBounds: subjectBounds, - searchBounds: searchBounds, - graphemeSemantic: graphemeSemantic) + let bounds = string.startIndex.. Executor { - let tree = ast.dslTree - let prog = try! Compiler(tree: tree).emit() - let executor = Executor(program: prog) - return executor +func compile(_ ast: AST) -> MEProgram { + try! Compiler(tree: ast.dslTree).emit() } func captureTest( @@ -184,8 +181,11 @@ func captureTest( for (input, output) in tests { let inputRange = input.startIndex...wholeMatch( + compile(ast), + input, + subjectBounds: inputRange, + searchBounds: inputRange ) else { XCTFail("No match", file: file, line: line) return diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index d0500847b..05212388d 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -154,7 +154,7 @@ extension RegexTests { ) throws { assert(!equivs.isEmpty) let progs = try equivs.map { - try _compileRegex($0).engine.program + try _compileRegex($0) } let ref = progs.first! for (prog, equiv) in zip(progs, equivs).dropFirst() { @@ -325,7 +325,7 @@ extension RegexTests { do { let prog = try _compileRegex(regex, syntax, semanticLevel) var found: Set = [] - for inst in prog.engine.instructions { + for inst in prog.instructions { let decoded = DecodedInstr.decode(inst) found.insert(decoded)