//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

import ArgumentParser
import _RegexParser

/// Command-line tool that parses a regex pattern and prints it as BNF.
///
/// Either a pattern, `--examples`, or both must be supplied. When both are
/// given the examples are printed first, followed by the pattern.
@main
@available(SwiftStdlib 5.8, *)
struct Regex2BNF: ParsableCommand {
  // Optional so that `--examples` can be used without a dummy pattern;
  // `validate()` rejects an invocation that supplies neither.
  @Argument(help: "The regex pattern to convert to BNF.")
  var pattern: String?

  @Flag(
    name: [.customShort("e"), .customLong("examples")],
    help: "Run several examples")
  var runExamples = false

  /// Patterns printed by `--examples`, in output order.
  // TODO: Turn into test cases
  static let examples: [String] = [
    // Single-scalar character literals
    "a",
    "Z",
    "あ",
    "日",
    "\u{301}",

    // Multi-scalar character literals
    "🧟‍♀️",
    "e\u{301}",

    // Simple alternations
    "a|b",
    "a|b|c|d",
    "a|🧟‍♀️\u{301}日|z",

    // Simple quantifications
    "a*",
    "a+",
    "a?",
    "a{2,10}",
    "a{,10}",
    "a{2,}",

    // Grouping
    "a(b|c)d",
    "a(?:b|c)d",
    "a(bcd|def(g|h)+)z",

    // Dot
    ".*",
    "(a|b)*.{3}(a|b)",

    // Builtin character classes
    #"\(\d{3}\)\d{3}-\d{4}"#,
    #"\s+keyword\s+"#,

    // Look at optimizer output, the quant child is very long
    "a(123456789)+b",

    "Hi the time right now is (AM|PM)",

    "a(b|c)*d{2,4}e?",
  ]

  /// Rejects an invocation that would do nothing.
  func validate() throws {
    guard pattern != nil || runExamples else {
      throw ValidationError(
        "Provide a regex pattern to convert, or pass --examples.")
    }
  }

  /// Prints `pattern` between slashes, a blank line, then its BNF rendering.
  ///
  /// - Throws: Whatever `_printAsBNF(inputRegex:)` throws for this pattern.
  func convert(_ pattern: String) throws {
    print("/\(pattern)/\n")
    print(try _printAsBNF(inputRegex: pattern))
  }

  mutating func run() throws {
    if runExamples {
      for example in Self.examples {
        try convert(example)
      }
    }
    if let pattern = pattern {
      try convert(pattern)
    }
  }
}
and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception +//import Charts +//import SwiftUI // -// See https://swift.org/LICENSE.txt for license information +//struct BenchmarkChart: View { +// var comparisons: [BenchmarkResult.Comparison] // -//===----------------------------------------------------------------------===// - -#if os(macOS) && canImport(Charts) - -import Charts -import SwiftUI - -struct BenchmarkChart: View { - var comparisons: [BenchmarkResult.Comparison] - - // Sort by normalized difference - var sortedComparisons: [BenchmarkResult.Comparison] { - comparisons.sorted { a, b in - a.normalizedDiff < b.normalizedDiff - } - } - var body: some View { - VStack(alignment: .leading) { - Chart { - ForEach(sortedComparisons) { comparison in - // Normalized runtime - BarMark( - x: .value("Name", comparison.name), - y: .value("Normalized runtime", comparison.normalizedDiff)) - .foregroundStyle(LinearGradient( - colors: [.accentColor, comparison.diff?.seconds ?? 0 <= 0 ? .green : .yellow], - startPoint: .bottom, - endPoint: .top)) - } - // Baseline - RuleMark(y: .value("Time", 1.0)) - .foregroundStyle(.red) - .lineStyle(.init(lineWidth: 1, dash: [2])) - .annotation(position: .top, alignment: .leading) { - Text("Baseline").foregroundStyle(.red) - } - - } - .frame(idealWidth: 800, idealHeight: 800) - .chartYScale(domain: 0...2.0) - .chartYAxis { - AxisMarks(values: .stride(by: 0.1)) - } - .chartXAxis { - AxisMarks { value in - AxisGridLine() - AxisTick() - AxisValueLabel(value.as(String.self)!, orientation: .vertical) - } - } - } - } -} - -struct BenchmarkResultApp: App { - static var comparisons: [BenchmarkResult.Comparison]? 
- - var body: some Scene { - WindowGroup { - if let comparisons = Self.comparisons { - ScrollView { - BenchmarkChart(comparisons: comparisons) - } - } else { - Text("No data") - } - } - } -} - -#endif +// // Sort by normalized difference +// var sortedComparisons: [BenchmarkResult.Comparison] { +// comparisons.sorted { a, b in +// a.normalizedDiff < b.normalizedDiff +// } +// } +// var body: some View { +// VStack(alignment: .leading) { +// Chart { +// ForEach(sortedComparisons) { comparison in +// // Normalized runtime +// BarMark( +// x: .value("Name", comparison.name), +// y: .value("Normalized runtime", comparison.normalizedDiff)) +// .foregroundStyle(LinearGradient( +// colors: [.accentColor, comparison.diff?.seconds ?? 0 <= 0 ? .green : .yellow], +// startPoint: .bottom, +// endPoint: .top)) +// } +// // Baseline +// RuleMark(y: .value("Time", 1.0)) +// .foregroundStyle(.red) +// .lineStyle(.init(lineWidth: 1, dash: [2])) +// .annotation(position: .top, alignment: .leading) { +// Text("Baseline").foregroundStyle(.red) +// } +// +// } +// .frame(idealWidth: 800, idealHeight: 800) +// .chartYScale(domain: 0...2.0) +// .chartYAxis { +// AxisMarks(values: .stride(by: 0.1)) +// } +// .chartXAxis { +// AxisMarks { value in +// AxisGridLine() +// AxisTick() +// AxisValueLabel(value.as(String.self)!, orientation: .vertical) +// } +// } +// } +// } +//} +// +//struct BenchmarkResultApp: App { +// static var comparisons: [BenchmarkResult.Comparison]? 
+// +// var body: some Scene { +// WindowGroup { +// if let comparisons = Self.comparisons { +// ScrollView { +// BenchmarkChart(comparisons: comparisons) +// } +// } else { +// Text("No data") +// } +// } +// } +//} +// +//#endif diff --git a/Sources/RegexBenchmark/BenchmarkResults.swift b/Sources/RegexBenchmark/BenchmarkResults.swift index da66183fd..14edef5b9 100644 --- a/Sources/RegexBenchmark/BenchmarkResults.swift +++ b/Sources/RegexBenchmark/BenchmarkResults.swift @@ -114,16 +114,16 @@ extension BenchmarkRunner { print(item) } - #if os(macOS) && canImport(Charts) - if showChart { - print(""" - === Comparison chart ================================================================= - Press Control-C to close... - """) - BenchmarkResultApp.comparisons = comparisons - BenchmarkResultApp.main() - } - #endif +// #if os(macOS) && canImport(Charts) +// if showChart { +// print(""" +// === Comparison chart ================================================================= +// Press Control-C to close... 
// MARK: - Sources/_RegexParser/Regex/BNF/BNF.swift

/// A piece of a BNF grammar that can render itself as text.
protocol BNFNode: CustomStringConvertible {
  func render() -> String
}
extension BNFNode {
  var description: String { render() }
}

/// A whole grammar: an ordered list of rules plus a designated root rule.
struct BNF: BNFNode {
  var root: Rule
  var rules: [Rule]

  /// Renders one rule per line, each followed by a newline.
  ///
  /// `root` is not rendered on its own; only the entries of `rules` are
  /// emitted. An empty rule list renders as the empty string.
  func render() -> String {
    guard !rules.isEmpty else { return "" }
    return rules.map { $0.render() }.joined(separator: "\n") + "\n"
  }
}

/// A single production, rendered as `<symbol> ::= expression`.
struct Rule: BNFNode {
  // The left-hand side
  var symbol: NonTerminalSymbol

  // The right-hand side
  var expression: Expression

  // Not used by `render()`.
  var predicates: [CharacterPredicate] = []

  func render() -> String {
    "\(symbol.render()) ::= \(expression.render())"
  }
}

struct CharacterPredicate {
  // TODO: convention c or trivial?
  let impl: (Unicode.Scalar) -> Bool
}

/// A named non-terminal, rendered in angle brackets.
struct NonTerminalSymbol: Hashable, BNFNode {
  var name: String

  func render() -> String {
    "<\(name)>"
  }
}

/// The right-hand side of a rule: alternatives joined by ` | `.
struct Expression: BNFNode {
  var choices: [Choice]

  func render() -> String {
    choices.map { $0.render() }.joined(separator: " | ")
  }
}

/// One alternative of an expression: symbols joined by a single space.
struct Choice: BNFNode {
  var sequence: [Symbol]

  init(_ symbols: [Symbol]) {
    self.sequence = symbols
  }
  init(_ symbols: Symbol...) {
    self.init(symbols)
  }

  func render() -> String {
    sequence.map { $0.render() }.joined(separator: " ")
  }
}

enum Symbol: BNFNode {
  case terminal(TerminalSymbol)
  case terminalSequence([TerminalSymbol])
  case nonTerminal(NonTerminalSymbol)
  case builtin(Builtin)

  func render() -> String {
    switch self {
    case .terminal(let t):
      return t.render()

    case .terminalSequence(let s):
      // An empty sequence is rendered as an explicit empty string literal.
      guard !s.isEmpty else {
        return "\"\""
      }
      return s.map { $0.render() }.joined(separator: " ")

    case .nonTerminal(let n):
      return n.render()

    case .builtin(let b):
      return b.render()
    }
  }
}

enum Builtin: BNFNode {
  case any // NOTE: we map dot to this, not sure if we want non-newline dots
  case whitespace
  case notWhitespace
  case decimalDigit
  case notDecimalDigit
  case wordCharacter
  case notWordCharacter

  /// Renders the builtin as a named placeholder.
  ///
  /// Every supported builtin needs a distinct, non-empty spelling; an empty
  /// string would make it disappear from the rendered rule.
  func render() -> String {
    switch self {
    case .any:
      return "<ANY>"
    case .whitespace:
      return "<WHITESPACE>"
    case .notWhitespace:
      fatalError("TODO: render negated whitespace")
    case .decimalDigit:
      return "<DECIMAL_DIGIT>"
    case .notDecimalDigit:
      fatalError("TODO: render negated decimal digit")
    case .wordCharacter:
      return "<WORD_CHARACTER>"
    case .notWordCharacter:
      fatalError("TODO: render negated word character")
    }
  }
}

// Placeholder: no cases yet, so `.characterSet` cannot be constructed.
enum CharacterSet {}

enum TerminalSymbol: BNFNode {
  case character(Unicode.Scalar)
  case characterSet(CharacterSet)
  case utf8CodeUnit(UInt8)

  case characterPredicate(CharacterPredicate)

  func render() -> String {
    switch self {
    case .character(let c):
      return "\"\(c)\""
    case .characterSet:
      fatalError("TODO: render character sets")
    case .utf8CodeUnit(let u):
      return "\"\(u)\""
    case .characterPredicate:
      fatalError("TODO: render character predicates")
    }
  }
}

extension Expression {
  init(_ choices: [Choice]) {
    self.init(choices: choices)
  }
  init(_ choices: Choice...) {
    self.init(choices)
  }
}

extension Choice {
  init(_ elements: [NonTerminalSymbol]) {
    self.init(elements.map { .nonTerminal($0) })
  }
  init(_ elements: NonTerminalSymbol...) {
    self.init(elements)
  }
}

/*


node -> choice

 */

// MARK: - Sources/_RegexParser/Regex/BNF/BNFConvert.swift

//
//  BNFConvert.swift
//  swift-experimental-string-processing
//
//  Created by Michael Ilseman on 1/18/25.
//

/// Create a unique non-terminal symbol
///
/// NOTE: Currently, this is unique per input regex, but we should extend any
/// API or SPI to either be able to re-use a generator or pass in a unique seed
/// (such as a regex-counter).
struct SymbolGenerator {
  var prefix = ""

  // Per base name: the number to use for the next numbered suffix.
  var counters = [String: Int]()

  /// Returns `""` the first time `s` is seen, then `_0`, `_1`, ...
  mutating func genSuffix(for s: String) -> String {
    guard let c = counters[s] else {
      counters[s] = 0
      return ""
    }
    counters[s] = c + 1
    return "_\(c)"
  }

  /// Returns a fresh symbol named `prefix + name + suffix`.
  mutating func genSym(_ name: String) -> NonTerminalSymbol {
    let suffix = genSuffix(for: name)
    return NonTerminalSymbol(name: prefix + name + suffix)
  }
}


/// Accumulates the productions created while converting one regex.
struct BNFConvert {
  var symbols = SymbolGenerator()
  var productions = [NonTerminalSymbol: [Choice]]()
  var root: NonTerminalSymbol? = nil
}

extension BNFConvert {
  /// Create a new BNF rule for `sym` and add it to our productions.
  ///
  /// Traps if `sym` already has a production.
  @discardableResult
  mutating func createProduction(
    _ sym: NonTerminalSymbol,
    _ choices: [Choice]
  ) -> NonTerminalSymbol {
    guard productions[sym] == nil else {
      fatalError("Internal invariant violated: non-unique symbols")
    }
    productions[sym] = choices
    return sym
  }

  /// Create a new symbol for `name` and BNF rule and add it to our productions.
  mutating func createProduction(
    _ name: String,
    _ choices: [Choice]
  ) -> NonTerminalSymbol {
    let sym = symbols.genSym(name)
    return createProduction(sym, choices)
  }

  /// Create a new symbol for `name` whose rule is the single choice
  /// `elements`, and add it to our productions.
  mutating func createProduction(
    _ name: String,
    _ elements: [Symbol]
  ) -> NonTerminalSymbol {
    createProduction(name, [Choice(elements)])
  }
}
extension BNFConvert {
  /// Convert a Regex AST node to a concatenative component
  ///
  /// Alternations always produce a new rule, as do some quantifications.
  ///
  /// Unsupported constructs currently trap with a `TODO` message rather
  /// than throwing.
  ///
  /// - Returns: The symbols that stand for `node` inside the enclosing
  ///   choice.
  mutating func convert(
    _ node: AST.Node
  ) throws -> [Symbol] {
    switch node {
    /// ... | ... | ...
    case .alternation(let a):
      let choices = try a.children.map {
        Choice(try convert($0))
      }
      let altSym = createProduction("ALT", choices)
      return [.nonTerminal(altSym)]

    /// ... ...
    case .concatenation(let c):
      return try c.children.flatMap { node in
        try convert(node)
      }

    /// (...)
    case .group(let g):
      // A group is where an alternation could be nested

      switch g.kind.value {
      // BNF has no captures, so these are just syntactic groups
      case .capture, .namedCapture(_), .balancedCapture(_), .nonCapture:
        return try convert(g.child)

      case .nonCaptureReset:
        fatalError("TODO: branch-reset groups")

      case .atomicNonCapturing:
        fatalError("TODO: atomic groups")

      case .lookahead, .negativeLookahead, .nonAtomicLookahead:
        fatalError("TODO: lookahead groups")

      case .lookbehind, .negativeLookbehind, .nonAtomicLookbehind:
        fatalError("TODO: lookbehind groups")

      case .scriptRun, .atomicScriptRun:
        fatalError("TODO: script-run groups")

      case .changeMatchingOptions(_):
        fatalError("TODO: matching-option groups")
      }

    /// (?(cond) true-branch | false-branch)
    case .conditional(_):
      fatalError("TODO: conditionals")

    case .quantification(let q):
      // The quantified operand always gets its own rule so that the
      // quantifier productions can refer to it by name.
      let quantChild = createProduction("QUANT_CHILD", try convert(q.child))
      return createQuantify(quantChild, q.kind.value, q.amount.value)

    /// \Q...\E
    case .quote(_):
      fatalError("TODO: quotes")

    /// Comments, non-semantic whitespace, etc
    case .trivia(_):
      fatalError("TODO: trivia")

    /// Interpolation `<{...}>`, currently reserved for future use.
    case .interpolation(_):
      fatalError("TODO: interpolation")

    case .atom(let atom):
      switch atom.kind {
      case .char(let c):
        // A Character with several scalars becomes a terminal sequence.
        let scalars = c.unicodeScalars
        if scalars.count == 1, let only = scalars.first {
          return [.terminal(.character(only))]
        }
        return [.terminalSequence(scalars.map { .character($0) })]

      case .dot:
        return [.builtin(.any)]

      case .escaped(let b):
        let builtin = try mapEscapedBuiltin(b)
        return [.builtin(builtin)]

      case .scalar(_), .scalarSequence(_):
        fatalError("TODO: scalar escapes")

      case .keyboardControl(_), .keyboardMeta(_), .keyboardMetaControl(_):
        fatalError("TODO: keyboard escapes")

      // `.escaped` is handled above and must not be repeated here.
      case .property, .caretAnchor, .dollarAnchor,
          .backreference, .subpattern, .namedCharacter, .callout,
          .backtrackingDirective, .changeMatchingOptions, .invalid:
        fatalError("TODO: unsupported atom")
      }

    case .customCharacterClass(let ccc):
      // No custom character class is supported yet; the checks only
      // select the most specific diagnostic.
      if ccc.start.value == .inverted {
        fatalError("TODO: inverted character classes")
      }
      if ccc.members.isEmpty {
        fatalError("TODO: empty character classes")
      }
      if ccc.members.count > 1 {
        fatalError("TODO: multi-member character classes")
      }
      fatalError("TODO: single-member character classes")

    case .absentFunction(_):
      fatalError("TODO: absent functions")

    case .empty(_):
      fatalError("TODO: empty nodes")
    }
  }
}
extension BNFConvert {
  /// Maps an escaped builtin such as `\d` onto the BNF `Builtin` it
  /// denotes. Escapes without a `Builtin` counterpart currently trap.
  func mapEscapedBuiltin(_ b: AST.Atom.EscapedBuiltin) throws -> Builtin {
    switch b {

    // Scalar escapes
    case .alarm, .escape, .formfeed, .newline, .carriageReturn, .tab, .backspace:
      fatalError("TODO: scalar escapes")

    // Built-in character classes
    case .whitespace:       return .whitespace
    case .notWhitespace:    return .notWhitespace
    case .decimalDigit:     return .decimalDigit
    case .notDecimalDigit:  return .notDecimalDigit
    case .wordCharacter:    return .wordCharacter
    case .notWordCharacter: return .notWordCharacter

    // Other character classes
    case .horizontalWhitespace, .notHorizontalWhitespace, .notNewline, .newlineSequence, .verticalTab, .notVerticalTab:
      fatalError("TODO: other character classes")

    // Assertions
    case .wordBoundary, .notWordBoundary:
      fatalError("TODO: assertions")

    // Anchors
    case .startOfSubject, .endOfSubjectBeforeNewline, .endOfSubject, .firstMatchingPositionInSubject:
      fatalError("TODO: anchors")

    // Other
    case .singleDataUnit, .graphemeCluster, .resetStartOfMatch, .trueAnychar, .textSegment, .notTextSegment:
      fatalError("TODO: other escapes")

    }
  }

  /// Produces the symbols that stand for `child` quantified by `amount`,
  /// adding any helper productions that are needed.
  ///
  /// Only eager quantification is supported; the other kinds trap.
  ///
  /// - Parameters:
  ///   - child: The rule for the quantified operand.
  ///   - kind: Eager, reluctant or possessive.
  ///   - amount: How many repetitions are allowed.
  /// - Returns: Symbols to splice into the enclosing choice.
  mutating func createQuantify(
    _ child: NonTerminalSymbol,
    _ kind: AST.Quantification.Kind,
    _ amount: AST.Quantification.Amount
  ) -> [Symbol] {
    switch kind {
    case .possessive: fatalError("TODO: possessive quantification")
    case .reluctant:
      fatalError("NOTE: reluctanct is ignored")
    case .eager:
      break
    }

    // TODO: Not sure what the canonical empty choice is (i.e. ACCEPT).
    let emptyChoice = Choice(Symbol.terminalSequence([]))

    // `count` consecutive references to `child` (QUANT_CHILD^count).
    func repeated(_ count: Int) -> [Symbol] {
      guard count >= 0 else {
        fatalError("Invalid AST")
      }
      return Array(repeating: .nonTerminal(child), count: count)
    }

    switch amount {
    case .zeroOrMore:
      // QUANT ::= QUANT_CHILD QUANT | <empty>
      let name = symbols.genSym("QUANT_*")
      createProduction(name, [Choice(child, name), emptyChoice])
      return [.nonTerminal(name)]

    case .oneOrMore:
      // QUANT ::= QUANT_CHILD QUANT | QUANT_CHILD
      let name = symbols.genSym("QUANT_+")
      createProduction(name, [Choice(child, name), Choice(child)])
      return [.nonTerminal(name)]

    case .zeroOrOne:
      // QUANT ::= QUANT_CHILD | <empty>
      let name = symbols.genSym("QUANT_?")
      createProduction(name, [Choice(child), emptyChoice])
      return [.nonTerminal(name)]

    case .exactly(let n):
      // QUANT_CHILD^n
      guard let n = n.value else {
        fatalError("Invalid AST")
      }
      return repeated(n)

    case .nOrMore(let n):
      // QUANT_CHILD^n QUANT_CHILD*
      guard let n = n.value else {
        fatalError("Invalid AST")
      }
      return repeated(n) + createQuantify(child, kind, .zeroOrMore)

    case .upToN(let n):
      // QUANT ::= QUANT_CHILD^n | ... | QUANT_CHILD | <empty>
      guard let n = n.value, n >= 0 else {
        fatalError("Invalid AST")
      }
      let name = symbols.genSym("QUANT_UPTO_N")

      // Longest alternative first, the empty one last. Counting down with
      // `stride` yields nothing for n == 0, where `1...n` would trap.
      // TODO: Do we want to emit differently for eager/reluctant?
      // TODO: Do we want to canonicalize if the BNF truly doesn't have
      //       order?
      var choices = stride(from: n, to: 0, by: -1).map {
        Choice(repeated($0))
      }
      choices.append(emptyChoice)

      createProduction(name, choices)
      return [.nonTerminal(name)]

    case .range(let min, let max):
      // QUANT ::= QUANT_CHILD^min QUANT_UPTO_(max-min)
      guard let min = min.value, let max = max.value, min <= max else {
        fatalError("Invalid AST")
      }

      var res = repeated(min)
      // `{m,m}` is just an exact count; no optional tail is needed.
      if max > min {
        let upto = createQuantify(
          child, kind, .upToN(.init(max - min, at: .fake)))
        res.append(contentsOf: upto)
      }
      return res
    }
  }
}
extension BNFConvert {
  // TODO: I just want a use-def chain
  func calculateUseGraph() -> [NonTerminalSymbol: [NonTerminalSymbol]] {
    fatalError("TODO: calculateUseGraph is not implemented")
  }

  /// Optimize the BNF
  ///
  /// Repeats two rewrites until neither changes anything:
  ///
  /// 1. Value propagation of small single-choice single-symbol
  ///    productions into their uses:
  ///
  ///        A ::= B C D E
  ///        B ::= "b"
  ///        C ::= C2
  ///        C2 ::= "c"
  ///        D ::= "d" "d" "d"
  ///        E ::= "e" "e" "e" "e" ...
  ///
  ///        -->
  ///
  ///        A ::= "b" "c" "d" "d" "d" E
  ///        E ::= "e" "e" "e" "e" ...
  ///
  /// 2. Root collapse: ROOT has no uses, so a lone non-terminal on its
  ///    right-hand side cannot propagate upwards; ROOT takes over that
  ///    production's choices instead.
  ///
  /// Productions that become unused stay in `productions`.
  mutating func optimize() {
    // Terminal sequences longer than this are not inlined into their uses.
    let terminalSequenceThreshold = 3

    guard let rootSymbol = root else {
      fatalError("Invariant violated: no root set")
    }

    // Iterate until we reach a fixed point
    var changed = true
    while changed {
      changed = false

      // Build up a list of single-choice single-symbol productions
      // for upwards propagation
      var singles = [NonTerminalSymbol: Symbol]()
      for (sym, choices) in productions {
        guard choices.count == 1,
              let onlyChoice = choices.first,
              onlyChoice.sequence.count == 1,
              let onlySymbol = onlyChoice.sequence.first
        else { continue }

        if case .terminalSequence(let array) = onlySymbol,
           array.count > terminalSequenceThreshold {
          continue
        }
        singles[sym] = onlySymbol
      }

      // Substitute every use of a single. `productions` is iterated as a
      // snapshot, so writing back inside the loop is safe.
      for (sym, choices) in productions {
        var didChange = false
        let rewritten = choices.map { choice -> Choice in
          var copy = choice
          for idx in copy.sequence.indices {
            if case .nonTerminal(let nt) = copy.sequence[idx],
               let replacement = singles[nt] {
              copy.sequence[idx] = replacement
              didChange = true
            }
          }
          return copy
        }
        if didChange {
          productions[sym] = rewritten
          changed = true
        }
      }

      // TODO: A general "A ::= B" redirection pass is probably
      // unnecessary, since that would have upwards propagated for
      // everyone except root.

      // Check ROOT, since it has no uses it couldn't upward propagate
      // a single non-terminal child
      guard let rootChoices = productions[rootSymbol] else {
        // TODO: or is this an empty grammar?
        // TODO: test empty regex
        fatalError("Invariant violated: root has no production")
      }

      // TODO: This isn't a win when RHS already has uses
      if rootChoices.count == 1,
         let seq = rootChoices.first?.sequence,
         seq.count == 1,
         case .nonTerminal(let rhs)? = seq.first {
        // Assigning a missing production would silently delete ROOT.
        guard let rhsChoices = productions[rhs] else {
          fatalError("Invariant violated: Unknown production")
        }
        productions[rootSymbol] = rhsChoices
        changed = true
      }
    }
  }
}


extension BNFConvert {
  /// Apply `f` (accumulating results) to our rules in reverse-post-order.
  ///
  /// Only rules reachable from `root` are visited, each exactly once, so
  /// the root's result comes first.
  func reversePostOrder<T>(
    _ f: (NonTerminalSymbol, [Choice]) -> T
  ) -> [T] {
    guard let rootSymbol = root else {
      fatalError("No root symbol defined")
    }

    var visited: Set<NonTerminalSymbol> = []
    var result = [T]()
    func visit(
      _ sym: NonTerminalSymbol
    ) {
      // `insert` reports whether `sym` was new.
      guard visited.insert(sym).inserted else { return }

      guard let choices = productions[sym] else {
        fatalError("Internal invariant violated: undefined nonterminal")
      }

      for choice in choices {
        // Children are visited right-to-left so that the final reversal
        // lists them left-to-right.
        for symbol in choice.sequence.reversed() {
          if case .nonTerminal(let child) = symbol {
            visit(child)
          }
        }
      }

      result.append(f(sym, choices))
    }

    visit(rootSymbol)
    result.reverse()
    return result
  }

  /// Packages the rules reachable from `root` as a `BNF`, root rule first.
  func createBNF() -> BNF {
    // `reversePostOrder` traps if no root is set.
    let rules = reversePostOrder { sym, choices in
      Rule(symbol: sym, expression: Expression(choices))
    }

    guard let rootRule = rules.first else {
      fatalError("Internal invariant violated: no rule for root")
    }

    return BNF(
      root: rootRule,
      rules: rules)
  }
}
+// case .concatenation(let c): +// +// let childrenChoices: [Symbol] = c.children.flatMap { node in +// // TODO: look at just mapping to a choice +// let expr = mapNode(node) +// guard expr.choices.count == 1 else { +// // TODO: Figure this out +// fatalError("Concat's children had direct alternations") +// } +// return expr.choices.first!.sequence +// } +// +// return Expression(Choice(childrenChoices)) +// +// /// (...) +// case .group(_): fatalError() +// +// /// (?(cond) true-branch | false-branch) +// case .conditional(_): fatalError() +// +// case .quantification(let q): +// // Make a rule for the child +// let (_, childName) = addRule("QUANT_CHILD", for: q.child) +// +// switch q.kind.value { +// case .possessive: fatalError("TODO: possessive quantification") +// case .reluctant: +// fatalError("NOTE: reluctanct is ignored") +// case .eager: +// break +// } +// +// switch q.amount.value { +// case .zeroOrMore: +// // QUANT ::= | QUANT_CHILD QUANT +// +// let name = symbols.genSym("QUANT_*") +// let expr = Expression( +// Choice(.terminalSequence([])), +// Choice(childName, name)) +// rules.append(Rule( +// symbol: name, +// expression: expr)) +// +// return Expression(Choice(name)) +// +// +// +// case .oneOrMore: +// // QUANT ::= QUANT_CHILD | QUANT_CHILD QUANT +// +// let name = symbols.genSym("QUANT_*") +// let expr = Expression( +// Choice(childName), +// Choice(childName, name)) +// rules.append(Rule( +// symbol: name, +// expression: expr)) +// +// return Expression(Choice(name)) +// +// case .zeroOrOne: +// // QUANT ::= QUANT_CHILD | +// fatalError() +// +// case .exactly(let n): +// // QUANT ::= QUANT_CHILD^n +// fatalError() +// case .nOrMore(let n): +// fatalError() +// case .upToN(let n): +// fatalError() +// case .range(let min, let max): +// fatalError() +// } +// +// // let sym = .nonTerminal(name) +// +// +// +// +// fatalError() +// +// /// \Q...\E +// case .quote(_): fatalError() +// +// /// Comments, non-semantic whitespace, etc +// case 
//
//  PrintAsBNF.swift
//  swift-experimental-string-processing
//
//  Created by Michael Ilseman on 1/9/25.
//

// Move to CL tool, but keep render here
// TODO: some kinda API/SPI

/// Parses `inputRegex` and returns its rendering as BNF text.
///
/// - Parameter inputRegex: The regex source to parse.
/// - Throws: Any error thrown while parsing or rendering.
public func _printAsBNF(inputRegex: String) throws -> String {
  // TODO: Should we pass in our language subset constraints here to
  // error at parse time rather than render time?
  let parsed = try _RegexParser.parse(inputRegex, .init())
  return try parsed.renderAsBNF()
}

// Regex AST -> BNF

/// Converts `ast` into a BNF grammar whose start rule is named from `ROOT`.
func convert(_ ast: AST) throws -> BNF {
  var converter = BNFConvert()

  // The whole regex becomes the single choice of the root production.
  let rootSymbols = try converter.convert(ast.root)
  let rootSymbol = converter.createProduction("ROOT", rootSymbols)
  converter.root = rootSymbol

  converter.optimize()
  return converter.createBNF()
}

extension AST {
  /// Renders this AST as BNF text.
  public func renderAsBNF() throws -> String {
    try convert(self).render()
  }
}