From 05c73acf46ab5c0b193061d73f44eeb549cc2424 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 17 Jan 2025 11:57:14 -0700 Subject: [PATCH 1/5] Prototype of experimental Regex -> BNF conversion --- Package.swift | 7 + Sources/Regex2BNF/Regex2BNF.swift | 80 +++ Sources/RegexBenchmark/BenchmarkChart.swift | 152 +++--- Sources/RegexBenchmark/BenchmarkResults.swift | 20 +- Sources/_RegexParser/Regex/BNF/BNF.swift | 131 +++++ .../_RegexParser/Regex/BNF/BNFConvert.swift | 503 ++++++++++++++++++ .../Regex/Printing/PrintAsBNF.swift | 32 ++ 7 files changed, 839 insertions(+), 86 deletions(-) create mode 100644 Sources/Regex2BNF/Regex2BNF.swift create mode 100644 Sources/_RegexParser/Regex/BNF/BNF.swift create mode 100644 Sources/_RegexParser/Regex/BNF/BNFConvert.swift create mode 100644 Sources/_RegexParser/Regex/Printing/PrintAsBNF.swift diff --git a/Package.swift b/Package.swift index e9198908d..fb532bad4 100644 --- a/Package.swift +++ b/Package.swift @@ -135,6 +135,13 @@ let package = Package( "_RegexParser", "_StringProcessing" ]), + .executableTarget( + name: "Regex2BNF", + dependencies: [ + .product(name: "ArgumentParser", package: "swift-argument-parser"), + "_RegexParser" + ], + swiftSettings: [availabilityDefinition]), .executableTarget( name: "RegexTester", dependencies: [ diff --git a/Sources/Regex2BNF/Regex2BNF.swift b/Sources/Regex2BNF/Regex2BNF.swift new file mode 100644 index 000000000..5cf97bd5d --- /dev/null +++ b/Sources/Regex2BNF/Regex2BNF.swift @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import ArgumentParser +import _RegexParser + +@main +@available(SwiftStdlib 5.8, *) +struct Regex2BNF: ParsableCommand { + @Argument(help: "The regex pattern to convert to BNF.") + var pattern: String + + @Flag( + name: [.customShort("e"), .customLong("examples")], + help: "Run several examples") + var runExamples = false + + func convert(_ pattern: String) throws { + print("\n=== /\(pattern)/ ===\n") + let ast = try _RegexParser.parse(pattern, .init()) + print(ast) + print() + print(try _printAsBNF(inputRegex: pattern)) + } + + mutating func run() throws { + if runExamples { + // TODO: Turn into test cases + print("[Examples") + + print("Single-scalar character literals:") + try convert("a") + try convert("Z") + try convert("あ") + try convert("日") + try convert("\u{301}") + + + print("Multi-scalar character literals") + try convert("🧟‍♀️") + try convert("e\u{301}") + + print("Simple alternations") + try convert("a|b") + try convert("a|b|c|d") + try convert("a|🧟‍♀️\u{301}日|z") + + print("Simple quantifications") + try convert("a*") + try convert("a+") + try convert("a?") + try convert("a{2,10}") + try convert("a{,10}") + try convert("a{2,}") + + print("Grouping") + try convert("a(b|c)d") + try convert("a(bcd|def(g|h)+)z") + + print("Dot") +// try convert(".*") +// try convert("(a|b)*.{3}(a|b)") + + + print("[Done]") + } + try convert(pattern) + + + + } +} diff --git a/Sources/RegexBenchmark/BenchmarkChart.swift b/Sources/RegexBenchmark/BenchmarkChart.swift index 862565c6e..7fd04ab94 100644 --- a/Sources/RegexBenchmark/BenchmarkChart.swift +++ b/Sources/RegexBenchmark/BenchmarkChart.swift @@ -1,80 +1,80 @@ -//===----------------------------------------------------------------------===// +////===----------------------------------------------------------------------===// +//// +//// This source file is part of the Swift.org open source project +//// +//// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +//// Licensed under Apache License v2.0 with Runtime Library Exception +//// +//// See https://swift.org/LICENSE.txt for license information +//// +////===----------------------------------------------------------------------===// // -// This source file is part of the Swift.org open source project +//#if os(macOS) && canImport(Charts) // -// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception +//import Charts +//import SwiftUI // -// See https://swift.org/LICENSE.txt for license information +//struct BenchmarkChart: View { +// var comparisons: [BenchmarkResult.Comparison] // -//===----------------------------------------------------------------------===// - -#if os(macOS) && canImport(Charts) - -import Charts -import SwiftUI - -struct BenchmarkChart: View { - var comparisons: [BenchmarkResult.Comparison] - - // Sort by normalized difference - var sortedComparisons: [BenchmarkResult.Comparison] { - comparisons.sorted { a, b in - a.normalizedDiff < b.normalizedDiff - } - } - var body: some View { - VStack(alignment: .leading) { - Chart { - ForEach(sortedComparisons) { comparison in - // Normalized runtime - BarMark( - x: .value("Name", comparison.name), - y: .value("Normalized runtime", comparison.normalizedDiff)) - .foregroundStyle(LinearGradient( - colors: [.accentColor, comparison.diff?.seconds ?? 0 <= 0 ? .green : .yellow], - startPoint: .bottom, - endPoint: .top)) - } - // Baseline - RuleMark(y: .value("Time", 1.0)) - .foregroundStyle(.red) - .lineStyle(.init(lineWidth: 1, dash: [2])) - .annotation(position: .top, alignment: .leading) { - Text("Baseline").foregroundStyle(.red) - } - - } - .frame(idealWidth: 800, idealHeight: 800) - .chartYScale(domain: 0...2.0) - .chartYAxis { - AxisMarks(values: .stride(by: 0.1)) - } - .chartXAxis { - AxisMarks { value in - AxisGridLine() - AxisTick() - AxisValueLabel(value.as(String.self)!, orientation: .vertical) - } - } - } - } -} - -struct BenchmarkResultApp: App { - static var comparisons: [BenchmarkResult.Comparison]? - - var body: some Scene { - WindowGroup { - if let comparisons = Self.comparisons { - ScrollView { - BenchmarkChart(comparisons: comparisons) - } - } else { - Text("No data") - } - } - } -} - -#endif +// // Sort by normalized difference +// var sortedComparisons: [BenchmarkResult.Comparison] { +// comparisons.sorted { a, b in +// a.normalizedDiff < b.normalizedDiff +// } +// } +// var body: some View { +// VStack(alignment: .leading) { +// Chart { +// ForEach(sortedComparisons) { comparison in +// // Normalized runtime +// BarMark( +// x: .value("Name", comparison.name), +// y: .value("Normalized runtime", comparison.normalizedDiff)) +// .foregroundStyle(LinearGradient( +// colors: [.accentColor, comparison.diff?.seconds ?? 0 <= 0 ? .green : .yellow], +// startPoint: .bottom, +// endPoint: .top)) +// } +// // Baseline +// RuleMark(y: .value("Time", 1.0)) +// .foregroundStyle(.red) +// .lineStyle(.init(lineWidth: 1, dash: [2])) +// .annotation(position: .top, alignment: .leading) { +// Text("Baseline").foregroundStyle(.red) +// } +// +// } +// .frame(idealWidth: 800, idealHeight: 800) +// .chartYScale(domain: 0...2.0) +// .chartYAxis { +// AxisMarks(values: .stride(by: 0.1)) +// } +// .chartXAxis { +// AxisMarks { value in +// AxisGridLine() +// AxisTick() +// AxisValueLabel(value.as(String.self)!, orientation: .vertical) +// } +// } +// } +// } +//} +// +//struct BenchmarkResultApp: App { +// static var comparisons: [BenchmarkResult.Comparison]? +// +// var body: some Scene { +// WindowGroup { +// if let comparisons = Self.comparisons { +// ScrollView { +// BenchmarkChart(comparisons: comparisons) +// } +// } else { +// Text("No data") +// } +// } +// } +//} +// +//#endif diff --git a/Sources/RegexBenchmark/BenchmarkResults.swift b/Sources/RegexBenchmark/BenchmarkResults.swift index da66183fd..14edef5b9 100644 --- a/Sources/RegexBenchmark/BenchmarkResults.swift +++ b/Sources/RegexBenchmark/BenchmarkResults.swift @@ -114,16 +114,16 @@ extension BenchmarkRunner { print(item) } - #if os(macOS) && canImport(Charts) - if showChart { - print(""" - === Comparison chart ================================================================= - Press Control-C to close... - """) - BenchmarkResultApp.comparisons = comparisons - BenchmarkResultApp.main() - } - #endif +// #if os(macOS) && canImport(Charts) +// if showChart { +// print(""" +// === Comparison chart ================================================================= +// Press Control-C to close... +// """) +// BenchmarkResultApp.comparisons = comparisons +// BenchmarkResultApp.main() +// } +// #endif } func saveComparisons( diff --git a/Sources/_RegexParser/Regex/BNF/BNF.swift b/Sources/_RegexParser/Regex/BNF/BNF.swift new file mode 100644 index 000000000..121a67aef --- /dev/null +++ b/Sources/_RegexParser/Regex/BNF/BNF.swift @@ -0,0 +1,131 @@ +struct BNF { + var root: Rule + var rules: [Rule] + + func render() -> String { + var str = ""// root.render() + "\n" + if rules.isEmpty { + return str + } + return str + + rules.lazy.map { + $0.render() + }.joined(separator: "\n") + + "\n" + } +} + +struct Rule { + // The left-hand side + var symbol: NonTerminalSymbol + + var expression: Expression + + var predicates: [CharacterPredicate] = [] + + func render() -> String { + "\(symbol.render()) ::= \(expression.render())" + } +} + +struct CharacterPredicate { + // TODO: convention c or trivial? + let impl: (Unicode.Scalar) -> Bool +} + +struct NonTerminalSymbol: Hashable { + var name: String + + func render() -> String { + name + } +} + +struct Expression { + var choices: [Choice] + + func render() -> String { + "\(choices.map({ $0.render() }).joined(separator: " | "))" + } +} + +// Was Choice +struct Choice { + var sequence: [Symbol] + + init(_ symbols: Array) { + self.sequence = symbols + } + init(_ symbols: Symbol...) { + self.init(symbols) + } + + func render() -> String { + "\(sequence.map({ $0.render() }).joined(separator: " "))" + } +} + +enum Symbol { + case terminal(TerminalSymbol) + case terminalSequence([TerminalSymbol]) + case nonTerminal(NonTerminalSymbol) + + func render() -> String { + switch self { + case .terminal(let t): + return t.render() + case .terminalSequence(let s): + return "\(s.map({ $0.render() }).joined(separator: " "))" + case .nonTerminal(let n): + return n.render() + } + } +} + +enum CharacterSet {} + +enum TerminalSymbol { + case character(Unicode.Scalar) + case characterSet(CharacterSet) + case utf8CodeUnit(UInt8) + + case characterPredicate(CharacterPredicate) + + func render() -> String { + switch self { + case .character(let c): + return "\"\(c)\"" + case .characterSet(let _): + fatalError() + case .utf8CodeUnit(let u): + return "\"\(u)\"" + case .characterPredicate(_): + fatalError() + } + } +} + +extension Expression { + init(_ choices: [Choice]) { + self.init(choices: choices) + } + init(_ choices: Choice...) { + self.init(choices) + } +} + +extension Choice { + init(_ elements: [NonTerminalSymbol]) { + self.init(elements.map { .nonTerminal($0) }) + } + init(_ elements: NonTerminalSymbol...) { + self.init(elements) + } +} + +/* + + +node -> choice + + */ diff --git a/Sources/_RegexParser/Regex/BNF/BNFConvert.swift b/Sources/_RegexParser/Regex/BNF/BNFConvert.swift new file mode 100644 index 000000000..a41a71f6a --- /dev/null +++ b/Sources/_RegexParser/Regex/BNF/BNFConvert.swift @@ -0,0 +1,503 @@ +// +// BNFConvert.swift +// swift-experimental-string-processing +// +// Created by Michael Ilseman on 1/18/25. +// + +struct SymbolGenerator { + var prefix = "" + + var counters = [String: Int]() + + mutating func genSuffix(for s: String) -> String { + guard let c = counters[s] else { + counters[s] = 0 + return "" + } + defer { counters[s] = c + 1 } + return "_\(c)" + } + + mutating func genSym(_ name: String) -> NonTerminalSymbol { + let suffix = genSuffix(for: name) + return NonTerminalSymbol(name: prefix + name + suffix) + } +} + + +struct BNFConvert { + var symbols = SymbolGenerator() + var productions = [NonTerminalSymbol: [Choice]]() + var root: NonTerminalSymbol? = nil +} + +extension BNFConvert { + @discardableResult + mutating func createProduction( + _ sym: NonTerminalSymbol, + _ choices: [Choice] + ) -> NonTerminalSymbol { + guard !productions.keys.contains(sym) else { + fatalError("Internal invariant violated: non-unique symbols") + } + productions[sym] = choices + return sym + } + + mutating func createProduction( + _ name: String, + _ choices: [Choice] + ) -> NonTerminalSymbol { + let sym = symbols.genSym(name) + return createProduction(sym, choices) + } + mutating func createProduction( + _ name: String, + _ elements: [Symbol] + ) -> NonTerminalSymbol { + createProduction(name, [Choice(elements)]) + } +} + +extension BNFConvert { + mutating func convertAlternation( + _ alt: AST.Alternation + ) -> [Choice] { + fatalError() + } + + // Convert a Regex AST node to a concatnative component + // + // Alternations always produce a new rule + mutating func convert( + _ node: AST.Node + ) throws -> [Symbol] { + switch node { + /// ... | ... | ... + case .alternation(let a): + let choices = try a.children.map { + Choice(try convert($0)) + } + let altSym = createProduction("ALT", choices) + return [.nonTerminal(altSym)] + + /// ... ... + case .concatenation(let c): + return try c.children.flatMap { node in + try convert(node) + } + + /// (...) + case .group(let g): + // A group is where an alternation could be nested + + switch g.kind.value { + // BNF as no captures, so these function as syntactic groups + case .capture, .namedCapture(_), .balancedCapture(_), .nonCapture: + return try convert(g.child) + + case .nonCaptureReset: + fatalError() + + case .atomicNonCapturing: + fatalError() + + case .lookahead: + fatalError() + case .negativeLookahead: + fatalError() + case .nonAtomicLookahead: + fatalError() + case .lookbehind: + fatalError() + case .negativeLookbehind: + fatalError() + case .nonAtomicLookbehind: + fatalError() + + case .scriptRun: + fatalError() + case .atomicScriptRun: + fatalError() + + case .changeMatchingOptions(_): + fatalError() + } + + fatalError() + + /// (?(cond) true-branch | false-branch) + case .conditional(_): fatalError() + + case .quantification(let q): + let quantChild = createProduction("QUANT_CHILD", try convert(q.child)) + return createQuantify(quantChild, q.kind.value, q.amount.value) + + /// \Q...\E + case .quote(_): fatalError() + + /// Comments, non-semantic whitespace, etc + case .trivia(_): fatalError() + + /// Intepolation `<{...}>`, currently reserved for future use. + case .interpolation(_): fatalError() + + case .atom(let atom): + switch atom.kind { + case .char(let c): + + let s: Symbol + if c.unicodeScalars.count == 1 { + s = .terminal(.character(c.unicodeScalars.first!)) + } else { + s = .terminalSequence(c.unicodeScalars.map { + .character($0) + }) + } + + return [s] + + case .scalar(_): fatalError() + case .scalarSequence(_): fatalError() + case .keyboardControl(_): fatalError() + case .keyboardMeta(_): fatalError() + case .keyboardMetaControl(_): fatalError() + + case .property, .escaped, .dot, .caretAnchor, .dollarAnchor, + .backreference, .subpattern, .namedCharacter, .callout, + .backtrackingDirective, .changeMatchingOptions, .invalid: + fatalError() + } + + case .customCharacterClass(let ccc): + if ccc.start.value == .inverted { + fatalError("TODO: inverted character classes") + } + if ccc.members.count > 1 { + fatalError("TODO: inverted character classes") + } + if ccc.members.isEmpty { + fatalError("TODO") + } + + fatalError() + + + + case .absentFunction(_): fatalError() + + case .empty(_): fatalError() + } + } +} + +extension BNFConvert { + + mutating func createQuantify( + _ child: NonTerminalSymbol, + _ kind: AST.Quantification.Kind, + _ amount: AST.Quantification.Amount + ) -> [Symbol] { + switch kind { + case .possessive: fatalError("TODO: possessive quantification") + case .reluctant: + fatalError("NOTE: reluctanct is ignored") + case .eager: + break + } + + let emptyChoice = Choice(Symbol.terminalSequence([])) + switch amount { + case .zeroOrMore: + // QUANT ::= QUANT_CHILD QUANT | + let name = symbols.genSym("QUANT_*") + let choices = [ + Choice(child, name), + emptyChoice, + ] + createProduction(name, choices) + return [.nonTerminal(name)] + + case .oneOrMore: + // QUANT ::= QUANT_CHILD QUANT | QUANT_CHILD + let name = symbols.genSym("QUANT_+") + let choices = [ + Choice(child, name), + Choice(child), + ] + createProduction(name, choices) + return [.nonTerminal(name)] + + + case .zeroOrOne: + // QUANT ::= QUANT_CHILD | + let name = symbols.genSym("QUANT_+") + let choices = [ + Choice(child), + emptyChoice + ] + createProduction(name, choices) + return [.nonTerminal(name)] + + case .exactly(let n): + // QUANT_CHILD^n + guard let n = n.value else { + fatalError("Invalid AST") + } + + return Array(repeating: .nonTerminal(child), count: n) + + case .nOrMore(let n): + // QUANT_CHILD^n QUANT_CHILD* + var res = createQuantify(child, kind, .exactly(n)) + res.append(contentsOf: createQuantify(child, kind, .zeroOrMore)) + + return res + + case .upToN(let n): + // QUANT ::= | QUANT_CHILD | ... | QUANT_CHILD^n + let name = symbols.genSym("QUANT_UPTO_N") + var choices = [ emptyChoice ] + + guard let n = n.value else { + fatalError("Invalid AST") + } + + for i in 1...n { + choices.append(Choice(createQuantify( + child, kind, .exactly(.init(i, at: .fake))))) + } + // TODO: Do we want to emit differently for eager/reluctant? + // TODO: Do we want to canonicalize if the BNF truly doesn't have + // order? + choices.reverse() + + createProduction(name, choices) + return [.nonTerminal(name)] + + case .range(let min, let max): + // QUANT ::= QUANT_CHILD^min QUANT_UPTO_(max-min) + guard let min = min.value, let max = max.value else { + fatalError("Invalid AST") + } + + var res = createQuantify(child, kind, .exactly(.init(min, at: .fake))) + let upto = createQuantify(child, kind, .upToN(.init(max-min, at: .fake))) + res.append(contentsOf: upto) + + return res + } + } +} + + + +extension BNFConvert { +/* + var symbols = SymbolGenerator() + var productions = [NonTerminalSymbol: [Choice]]() + var root: NonTerminalSymbol? = nil + + */ + + func reversePostOrder( + _ f: (NonTerminalSymbol, [Choice]) -> T + ) -> [T] { + guard let rootSymbol = root else { + fatalError("No root symbol defined") + } + + var visited: Set = [] + var result = [T]() + func visit( + _ sym: NonTerminalSymbol + ) { + if visited.contains(sym) { return } + visited.insert(sym) + + guard let choices = productions[sym] else { + fatalError("Internal invariant violated: undefined nonterminal") + } + + for choice in choices { + for symbol in choice.sequence { + if case .nonTerminal(let sym) = symbol { + visit(sym) + } + } + } + + result.append(f(sym, choices)) + } + + visit(rootSymbol) + result.reverse() + return result + } + + func createBNF() -> BNF { + guard let rootSymbol = root else { + fatalError("No root symbol defined") + } + + let rules = reversePostOrder { sym, choices in + Rule(symbol: sym, expression: Expression(choices)) + } + + return BNF( + root: rules.first!, + rules: rules) + } +} + +//extension BNFConvert { +// +// // var productions = [NonTerminalSymbol: Rule] +// +// // TODO: dictionary for the rules +// +// mutating func processNode( +// _ node: AST.Node +// ) -> Rule { +// return makeRule("ROOT", for: node).0 +// } +// +// // NOTE: alternation produces expression, but it seems like everything +// // else could produce a Choice or lower +// mutating func mapNode( +// _ node: AST.Node +// ) -> Expression { +// switch node { +// /// ... | ... | ... +// case .alternation(let a): +// return Expression(convertAlternation(a)) +// +// /// ... ... +// case .concatenation(let c): +// +// let childrenChoices: [Symbol] = c.children.flatMap { node in +// // TODO: look at just mapping to a choice +// let expr = mapNode(node) +// guard expr.choices.count == 1 else { +// // TODO: Figure this out +// fatalError("Concat's children had direct alternations") +// } +// return expr.choices.first!.sequence +// } +// +// return Expression(Choice(childrenChoices)) +// +// /// (...) +// case .group(_): fatalError() +// +// /// (?(cond) true-branch | false-branch) +// case .conditional(_): fatalError() +// +// case .quantification(let q): +// // Make a rule for the child +// let (_, childName) = addRule("QUANT_CHILD", for: q.child) +// +// switch q.kind.value { +// case .possessive: fatalError("TODO: possessive quantification") +// case .reluctant: +// fatalError("NOTE: reluctanct is ignored") +// case .eager: +// break +// } +// +// switch q.amount.value { +// case .zeroOrMore: +// // QUANT ::= | QUANT_CHILD QUANT +// +// let name = symbols.genSym("QUANT_*") +// let expr = Expression( +// Choice(.terminalSequence([])), +// Choice(childName, name)) +// rules.append(Rule( +// symbol: name, +// expression: expr)) +// +// return Expression(Choice(name)) +// +// +// +// case .oneOrMore: +// // QUANT ::= QUANT_CHILD | QUANT_CHILD QUANT +// +// let name = symbols.genSym("QUANT_*") +// let expr = Expression( +// Choice(childName), +// Choice(childName, name)) +// rules.append(Rule( +// symbol: name, +// expression: expr)) +// +// return Expression(Choice(name)) +// +// case .zeroOrOne: +// // QUANT ::= QUANT_CHILD | +// fatalError() +// +// case .exactly(let n): +// // QUANT ::= QUANT_CHILD^n +// fatalError() +// case .nOrMore(let n): +// fatalError() +// case .upToN(let n): +// fatalError() +// case .range(let min, let max): +// fatalError() +// } +// +// // let sym = .nonTerminal(name) +// +// +// +// +// fatalError() +// +// /// \Q...\E +// case .quote(_): fatalError() +// +// /// Comments, non-semantic whitespace, etc +// case .trivia(_): fatalError() +// +// /// Intepolation `<{...}>`, currently reserved for future use. +// case .interpolation(_): fatalError() +// +// case .atom(let atom): +// switch atom.kind { +// case .char(let c): +// +// let s: Symbol +// if c.unicodeScalars.count == 1 { +// s = .terminal(.character(c.unicodeScalars.first!)) +// } else { +// s = .terminalSequence(c.unicodeScalars.map { +// .character($0) +// }) +// } +// +// return Expression(choices: [Choice(s)]) +// +// case .scalar(_): fatalError() +// case .scalarSequence(_): fatalError() +// case .keyboardControl(_): fatalError() +// case .keyboardMeta(_): fatalError() +// case .keyboardMetaControl(_): fatalError() +// +// case .property, .escaped, .dot, .caretAnchor, .dollarAnchor, +// .backreference, .subpattern, .namedCharacter, .callout, +// .backtrackingDirective, .changeMatchingOptions, .invalid: +// fatalError() +// } +// +// case .customCharacterClass(_): fatalError() +// +// case .absentFunction(_): fatalError() +// +// case .empty(_): fatalError() +// } +// } +//} diff --git a/Sources/_RegexParser/Regex/Printing/PrintAsBNF.swift b/Sources/_RegexParser/Regex/Printing/PrintAsBNF.swift new file mode 100644 index 000000000..3b1f718f8 --- /dev/null +++ b/Sources/_RegexParser/Regex/Printing/PrintAsBNF.swift @@ -0,0 +1,32 @@ +// +// PrintAsBNF.swift +// swift-experimental-string-processing +// +// Created by Michael Ilseman on 1/9/25. +// + +// Move to CL tool, but keep render here +// TODO: some kinda API/SPI +public func _printAsBNF(inputRegex: String) throws -> String { + + // TODO: Should we pass in our language subset constraints here to + // error at parse time rather than render time? + let ast = try _RegexParser.parse(inputRegex, .init()) + + return try ast.renderAsBNF() +} + +// Regex AST -> BNF +func convert(_ ast: AST) throws -> BNF { + var converter = BNFConvert() + let rhs = try converter.convert(ast.root) + converter.root = converter.createProduction("ROOT", rhs) + return converter.createBNF() +} + +extension AST { + public func renderAsBNF() throws -> String { + let bnf = try convert(self) + return bnf.render() + } +} From 3b283cb7697e93a02d94b5a07cff36397339a14f Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 23 Jan 2025 11:07:47 -0700 Subject: [PATCH 2/5] wip: small cleanup --- Sources/_RegexParser/Regex/BNF/BNF.swift | 1 - .../_RegexParser/Regex/BNF/BNFConvert.swift | 34 ++++++++----------- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/Sources/_RegexParser/Regex/BNF/BNF.swift b/Sources/_RegexParser/Regex/BNF/BNF.swift index 121a67aef..c9d1e9e99 100644 --- a/Sources/_RegexParser/Regex/BNF/BNF.swift +++ b/Sources/_RegexParser/Regex/BNF/BNF.swift @@ -49,7 +49,6 @@ struct Expression { } } -// Was Choice struct Choice { var sequence: [Symbol] diff --git a/Sources/_RegexParser/Regex/BNF/BNFConvert.swift b/Sources/_RegexParser/Regex/BNF/BNFConvert.swift index a41a71f6a..53a7f1768 100644 --- a/Sources/_RegexParser/Regex/BNF/BNFConvert.swift +++ b/Sources/_RegexParser/Regex/BNF/BNFConvert.swift @@ -5,6 +5,11 @@ // Created by Michael Ilseman on 1/18/25. // +/// Create a unique non-terminal symbol +/// +/// NOTE: Currently, this is unique per input regex, but we should extend any +/// API or SPI to either be able to re-use a generator or pass in a unique seed +/// (such as a regex-counter). struct SymbolGenerator { var prefix = "" @@ -33,6 +38,7 @@ struct BNFConvert { } extension BNFConvert { + /// Create a new BNF rule for `sym` and add it to our productions. @discardableResult mutating func createProduction( _ sym: NonTerminalSymbol, @@ -45,6 +51,7 @@ extension BNFConvert { return sym } + /// Create a new symbol for `name` and BNF rule and add it to our productions. mutating func createProduction( _ name: String, _ choices: [Choice] @@ -61,15 +68,9 @@ extension BNFConvert { } extension BNFConvert { - mutating func convertAlternation( - _ alt: AST.Alternation - ) -> [Choice] { - fatalError() - } - - // Convert a Regex AST node to a concatnative component - // - // Alternations always produce a new rule + /// Convert a Regex AST node to a concatnative component + /// + /// Alternations always produce a new rule, as do some quantifications mutating func convert( _ node: AST.Node ) throws -> [Symbol] { @@ -93,7 +94,7 @@ extension BNFConvert { // A group is where an alternation could be nested switch g.kind.value { - // BNF as no captures, so these function as syntactic groups + // BNF has no captures, so these are just syntactic groups case .capture, .namedCapture(_), .balancedCapture(_), .nonCapture: return try convert(g.child) @@ -125,8 +126,6 @@ extension BNFConvert { fatalError() } - fatalError() - /// (?(cond) true-branch | false-branch) case .conditional(_): fatalError() @@ -207,6 +206,7 @@ extension BNFConvert { break } + // TODO: Not sure what the canonical empty choice is (i.e. ACCEPT). let emptyChoice = Choice(Symbol.terminalSequence([])) switch amount { case .zeroOrMore: @@ -294,13 +294,7 @@ extension BNFConvert { extension BNFConvert { -/* - var symbols = SymbolGenerator() - var productions = [NonTerminalSymbol: [Choice]]() - var root: NonTerminalSymbol? = nil - - */ - + /// Apply `f` (accumulating results) to our rules in reverse-post-order. func reversePostOrder( _ f: (NonTerminalSymbol, [Choice]) -> T ) -> [T] { @@ -321,7 +315,7 @@ extension BNFConvert { } for choice in choices { - for symbol in choice.sequence { + for symbol in choice.sequence.lazy.reversed() { if case .nonTerminal(let sym) = symbol { visit(sym) } From 938bdd8ff6b8ebbabba54d95dd69251abf5fcb7f Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 23 Jan 2025 11:37:17 -0700 Subject: [PATCH 3/5] Print empty sequences as empty strings --- Sources/_RegexParser/Regex/BNF/BNF.swift | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Sources/_RegexParser/Regex/BNF/BNF.swift b/Sources/_RegexParser/Regex/BNF/BNF.swift index c9d1e9e99..1d6f29775 100644 --- a/Sources/_RegexParser/Regex/BNF/BNF.swift +++ b/Sources/_RegexParser/Regex/BNF/BNF.swift @@ -74,6 +74,9 @@ enum Symbol { case .terminal(let t): return t.render() case .terminalSequence(let s): + guard !s.isEmpty else { + return "\"\"" + } return "\(s.map({ $0.render() }).joined(separator: " "))" case .nonTerminal(let n): return n.render() From 07a2213d37753f57745854f58030cd7630a16248 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 24 Jan 2025 12:12:35 -0700 Subject: [PATCH 4/5] wip: quick and dirty optimizer --- Sources/Regex2BNF/Regex2BNF.swift | 21 ++- .../_RegexParser/Regex/BNF/BNFConvert.swift | 130 ++++++++++++++++++ .../Regex/Printing/PrintAsBNF.swift | 1 + Tests/RegexTests/BNFTests.swift | 0 4 files changed, 140 insertions(+), 12 deletions(-) create mode 100644 Tests/RegexTests/BNFTests.swift diff --git a/Sources/Regex2BNF/Regex2BNF.swift b/Sources/Regex2BNF/Regex2BNF.swift index 5cf97bd5d..2f723ec62 100644 --- a/Sources/Regex2BNF/Regex2BNF.swift +++ b/Sources/Regex2BNF/Regex2BNF.swift @@ -24,19 +24,16 @@ struct Regex2BNF: ParsableCommand { var runExamples = false func convert(_ pattern: String) throws { - print("\n=== /\(pattern)/ ===\n") - let ast = try _RegexParser.parse(pattern, .init()) - print(ast) - print() + print("/\(pattern)/\n") print(try _printAsBNF(inputRegex: pattern)) } mutating func run() throws { if runExamples { // TODO: Turn into test cases - print("[Examples") +// print("[Examples") - print("Single-scalar character literals:") +// print("Single-scalar character literals:") try convert("a") try convert("Z") try convert("あ") @@ -44,16 +41,16 @@ struct Regex2BNF: ParsableCommand { try convert("\u{301}") - print("Multi-scalar character literals") +// print("Multi-scalar character literals") try convert("🧟‍♀️") try convert("e\u{301}") - print("Simple alternations") +// print("Simple alternations") try convert("a|b") try convert("a|b|c|d") try convert("a|🧟‍♀️\u{301}日|z") - print("Simple quantifications") +// print("Simple quantifications") try convert("a*") try convert("a+") try convert("a?") @@ -61,16 +58,16 @@ struct Regex2BNF: ParsableCommand { try convert("a{,10}") try convert("a{2,}") - print("Grouping") +// print("Grouping") try convert("a(b|c)d") try convert("a(bcd|def(g|h)+)z") - print("Dot") +// print("Dot") // try convert(".*") // try convert("(a|b)*.{3}(a|b)") - print("[Done]") +// print("[Done]") } try convert(pattern) diff --git a/Sources/_RegexParser/Regex/BNF/BNFConvert.swift b/Sources/_RegexParser/Regex/BNF/BNFConvert.swift index 53a7f1768..b32c431bf 100644 --- a/Sources/_RegexParser/Regex/BNF/BNFConvert.swift +++ b/Sources/_RegexParser/Regex/BNF/BNFConvert.swift @@ -291,6 +291,136 @@ extension BNFConvert { } } +extension BNFConvert { + // TODO: I just want a use-def chain + func calculateUseGraph() -> [NonTerminalSymbol: [NonTerminalSymbol]] { + fatalError() + } + + + + /// Optimize the BNF + mutating func optimize() { + // Iterate until we reach a fixed point + var changed = true + while changed { + changed = false + + // + // Value propagation: propagate small single-choice single-symbol + // productions + // + // A ::= B C D E + // B ::= "b" + // C ::= C2 + // C2 ::= "c" + // D ::= "d" "d" "d" + // E ::= "e" "e" "e" "e" ... + // + // --> + // + // A ::= "b" "c" "d" "d" "d" E + // E ::= "e" "e" "e" "e" ... + // + + // Build up a list of single-choice single-symbol productions + // for upwards propagation + let terminalSequenceThreshold = 3 + var singles = [NonTerminalSymbol: Symbol]() + for (key, val) in productions { + if val.count == 1 { + let valChoice = val.first! + if valChoice.sequence.count == 1 { + let valSym = valChoice.sequence.first! + if case .terminalSequence(let array) = valSym { + if array.count > terminalSequenceThreshold { + continue + } + } + singles[key] = valSym + } + } + } + + for (key, val) in productions { + var valCopy = val + var valCopyDidChange = false + + for choiceIdx in val.indices { + + let choice = val[choiceIdx] + var choiceCopy = choice + var choiceCopyDidChange = false + + for idx in choice.sequence.indices { + if case .nonTerminal(let nt) = choice.sequence[idx] { + if let sym = singles[nt] { + choiceCopy.sequence[idx] = sym + choiceCopyDidChange = true + } + } + } + + if choiceCopyDidChange { + valCopy[choiceIdx] = choiceCopy + valCopyDidChange = true + } + } + + if valCopyDidChange { + productions[key] = valCopy + changed = true + } + } + + // TODO: I think the below is unnecessary, since that would have + // upwards propagated for everyone except root. +// +// // Check for a simple layer of redirection: +// // +// // A ::= B +// // B ::= ... +// // +// // --> +// // +// // A ::= ... +// for (key, val) in productions { +// if val.count == 1 { +// let valChoice = val.first! +// if valChoice.sequence.count == 1 { +// let valSym = valChoice.sequence.first! +// if case .nonTerminal(let rhs) = valSym { +// guard let rhsProd = productions[rhs] else { +// fatalError("Invariant violated: Unknown production") +// } +// productions[key] = rhsProd +// changed = true +// } +// } +// } +// } + + // Check ROOT, since it has no uses it couldn't upward propagate + // a single non-terminal child + guard let rootSymbol = root else { + fatalError("Invariant violated: no root set") + } + guard let val = productions[rootSymbol] else { + // TODO: or is this an empty grammar? + // TODO: test empty regex + fatalError("Invariant violated: root has no production") + } + + // TODO: This isn't a win when RHS already has uses + if val.count == 1 { + if case .nonTerminal(let rhs) = val.first!.sequence.first! { + productions[rootSymbol] = productions[rhs] + changed = true + } + } + } + } +} extension BNFConvert { diff --git a/Sources/_RegexParser/Regex/Printing/PrintAsBNF.swift b/Sources/_RegexParser/Regex/Printing/PrintAsBNF.swift index 3b1f718f8..b1b4781b7 100644 --- a/Sources/_RegexParser/Regex/Printing/PrintAsBNF.swift +++ b/Sources/_RegexParser/Regex/Printing/PrintAsBNF.swift @@ -21,6 +21,7 @@ func convert(_ ast: AST) throws -> BNF { var converter = BNFConvert() let rhs = try converter.convert(ast.root) converter.root = converter.createProduction("ROOT", rhs) + converter.optimize() return converter.createBNF() } diff --git a/Tests/RegexTests/BNFTests.swift b/Tests/RegexTests/BNFTests.swift new file mode 100644 index 000000000..e69de29bb From 6ba5cd6760a126fc15929bcea8768ebfee50b5ca Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 6 Mar 2025 18:05:46 -0700 Subject: [PATCH 5/5] Support character classes --- Sources/Regex2BNF/Regex2BNF.swift | 16 ++++- Sources/_RegexParser/Regex/BNF/BNF.swift | 58 ++++++++++++++++--- .../_RegexParser/Regex/BNF/BNFConvert.swift | 54 +++++++++++++++-- 3 files changed, 113 insertions(+), 15 deletions(-) diff --git a/Sources/Regex2BNF/Regex2BNF.swift b/Sources/Regex2BNF/Regex2BNF.swift index 2f723ec62..b24b8b29f 100644 --- a/Sources/Regex2BNF/Regex2BNF.swift +++ b/Sources/Regex2BNF/Regex2BNF.swift @@ -60,14 +60,26 @@ struct Regex2BNF: ParsableCommand { // print("Grouping") try convert("a(b|c)d") + try convert("a(?:b|c)d") try convert("a(bcd|def(g|h)+)z") // print("Dot") -// try convert(".*") -// try convert("(a|b)*.{3}(a|b)") + try convert(".*") + try convert("(a|b)*.{3}(a|b)") + +// print("Bultin character classes") + try convert(#"\(\d{3}\)\d{3}-\d{4}"#) + try convert(#"\s+keyword\s+"#) // print("[Done]") + + // Look at optimizer output, the quant child is very long + try convert("a(123456789)+b") + + try convert("Hi the time right now is (AM|PM)") + + try convert("a(b|c)*d{2,4}e?") } try convert(pattern) diff --git a/Sources/_RegexParser/Regex/BNF/BNF.swift b/Sources/_RegexParser/Regex/BNF/BNF.swift index 1d6f29775..0fb6a5b9a 100644 --- a/Sources/_RegexParser/Regex/BNF/BNF.swift +++ b/Sources/_RegexParser/Regex/BNF/BNF.swift @@ -1,4 +1,11 @@ -struct BNF { +protocol BNFNode: CustomStringConvertible { + func render() -> String +} +extension BNFNode { + var description: String { render() } +} + +struct BNF: BNFNode { var root: Rule var rules: [Rule] @@ -15,7 +22,7 @@ struct BNF { } } -struct Rule { +struct Rule: BNFNode { // The left-hand side var symbol: NonTerminalSymbol @@ -33,15 +40,15 @@ struct CharacterPredicate { let impl: (Unicode.Scalar) -> Bool } -struct NonTerminalSymbol: Hashable { +struct NonTerminalSymbol: Hashable, BNFNode { var name: String func render() -> String { - name + "<\(name)>" } } -struct Expression { +struct Expression: BNFNode { var choices: [Choice] func render() -> String { @@ -49,7 +56,7 @@ struct Expression { } } -struct Choice { +struct Choice: BNFNode { var sequence: [Symbol] init(_ symbols: Array) { @@ -64,29 +71,64 @@ struct Choice { } } -enum Symbol { +enum Symbol: BNFNode { case terminal(TerminalSymbol) case terminalSequence([TerminalSymbol]) case nonTerminal(NonTerminalSymbol) + case builtin(Builtin) func render() -> String { switch self { case .terminal(let t): return t.render() + case .terminalSequence(let s): guard !s.isEmpty else { return "\"\"" } return "\(s.map({ $0.render() }).joined(separator: " "))" + case .nonTerminal(let n): return n.render() + + case .builtin(let b): + return b.render() + } + } +} + +enum Builtin: BNFNode { + case any // NOTE: we map dot to this, not sure if we want non-newline dots + case whitespace + case notWhitespace + case decimalDigit + case notDecimalDigit + case wordCharacter + case notWordCharacter + + func render() -> String { + switch self { + case .any: + return "" + case .whitespace: + return "" + case .notWhitespace: + fatalError() + case .decimalDigit: + return "" + case .notDecimalDigit: + fatalError() + case .wordCharacter: + return "" + case .notWordCharacter: + fatalError() } } } enum CharacterSet {} -enum TerminalSymbol { +enum TerminalSymbol: BNFNode { case character(Unicode.Scalar) case characterSet(CharacterSet) case utf8CodeUnit(UInt8) diff --git a/Sources/_RegexParser/Regex/BNF/BNFConvert.swift b/Sources/_RegexParser/Regex/BNF/BNFConvert.swift index b32c431bf..57c7e52ef 100644 --- a/Sources/_RegexParser/Regex/BNF/BNFConvert.swift +++ b/Sources/_RegexParser/Regex/BNF/BNFConvert.swift @@ -157,13 +157,20 @@ extension BNFConvert { return [s] + case .dot: + return [.builtin(.any)] + + case .escaped(let b): + let builtin = try mapEscapedBuiltin(b) + return [.builtin(builtin)] + case .scalar(_): fatalError() case .scalarSequence(_): fatalError() case .keyboardControl(_): fatalError() case .keyboardMeta(_): fatalError() case .keyboardMetaControl(_): fatalError() - case .property, .escaped, .dot, .caretAnchor, .dollarAnchor, + case .property, .escaped, .caretAnchor, .dollarAnchor, .backreference, .subpattern, .namedCharacter, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: fatalError() @@ -192,6 +199,40 @@ extension BNFConvert { } extension BNFConvert { + func mapEscapedBuiltin(_ b: AST.Atom.EscapedBuiltin) throws -> Builtin { + switch b { + + // Scalar escapes + case .alarm, .escape, .formfeed, .newline, .carriageReturn, .tab, .backspace: + fatalError() + + // Built-in character classes + case .whitespace: return .whitespace + case .notWhitespace: return .notWhitespace + case .decimalDigit: return .decimalDigit + case .notDecimalDigit: return .notDecimalDigit + case .wordCharacter: return .wordCharacter + case .notWordCharacter: return .notWordCharacter + + // Other character classes + case .horizontalWhitespace, .notHorizontalWhitespace, .notNewline, .newlineSequence, .verticalTab, .notVerticalTab: + fatalError() + + + // Assertions + case .wordBoundary, .notWordBoundary: + fatalError() + + // Anchors + case .startOfSubject, .endOfSubjectBeforeNewline, .endOfSubject, .firstMatchingPositionInSubject: + fatalError() + + // Other + case .singleDataUnit, .graphemeCluster, .resetStartOfMatch, .trueAnychar, .textSegment, .notTextSegment: + fatalError() + + } + } mutating func createQuantify( _ child: NonTerminalSymbol, @@ -232,7 +273,7 @@ extension BNFConvert { case .zeroOrOne: // QUANT ::= QUANT_CHILD | - let name = symbols.genSym("QUANT_+") + let name = symbols.genSym("QUANT_?") let choices = [ Choice(child), emptyChoice @@ -413,9 +454,12 @@ extension BNFConvert { // TODO: This isn't a win when RHS already has uses if val.count == 1 { - if case .nonTerminal(let rhs) = val.first!.sequence.first! { - productions[rootSymbol] = productions[rhs] - changed = true + let seq = val.first!.sequence + if seq.count == 1 { + if case .nonTerminal(let rhs) = seq.first! { + productions[rootSymbol] = productions[rhs] + changed = true + } } } }