diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d18d50aa0..2e3e1ca31 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -62,7 +62,10 @@ fileprivate extension Compiler.ByteCodeGen { try emitCharacter(c) case let .scalar(s): - try emitScalar(s) + // A scalar always matches the same as a single scalar character. This + // means it must match a whole grapheme in grapheme semantic mode, but + // can match a single scalar in scalar semantic mode. + try emitCharacter(Character(s)) case let .assertion(kind): try emitAssertion(kind.ast) @@ -244,8 +247,12 @@ fileprivate extension Compiler.ByteCodeGen { } } } - - mutating func emitScalar(_ s: UnicodeScalar) throws { + + /// Emit a consume of a single scalar value. This must only be used in scalar + /// semantic mode. + mutating func emitConsumeScalar(_ s: UnicodeScalar) throws { + assert(options.semanticLevel == .unicodeScalar, "Wrong semantic level") + // TODO: Native instruction buildMatchScalar(s) if options.isCaseInsensitive { // TODO: e.g. buildCaseInsensitiveMatchScalar(s) @@ -263,7 +270,7 @@ fileprivate extension Compiler.ByteCodeGen { // Unicode scalar matches the specific scalars that comprise a character if options.semanticLevel == .unicodeScalar { for scalar in c.unicodeScalars { - try emitScalar(scalar) + try emitConsumeScalar(scalar) } return } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index dbb324b67..44c725470 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -53,6 +53,45 @@ extension DSLTree._AST.Atom { } } +extension Character { + func generateConsumer( + _ opts: MatchingOptions + ) throws -> MEProgram.ConsumeFunction? 
{ + let isCaseInsensitive = opts.isCaseInsensitive + switch opts.semanticLevel { + case .graphemeCluster: + return { input, bounds in + let low = bounds.lowerBound + if isCaseInsensitive && isCased { + return input[low].lowercased() == lowercased() + ? input.index(after: low) + : nil + } else { + return input[low] == self + ? input.index(after: low) + : nil + } + } + case .unicodeScalar: + // TODO: This should only be reachable from character class emission, can + // we guarantee that? Otherwise we'd want a different matching behavior. + let consumers = unicodeScalars.map { s in consumeScalar { + isCaseInsensitive + ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping + : $0 == s + }} + return { input, bounds in + for fn in consumers { + if let idx = fn(input, bounds) { + return idx + } + } + return nil + } + } + } +} + extension DSLTree.Atom { var singleScalarASCIIValue: UInt8? { switch self { @@ -72,44 +111,15 @@ extension DSLTree.Atom { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { - let isCaseInsensitive = opts.isCaseInsensitive - switch self { case let .char(c): - if opts.semanticLevel == .graphemeCluster { - return { input, bounds in - let low = bounds.lowerBound - if isCaseInsensitive && c.isCased { - return input[low].lowercased() == c.lowercased() - ? input.index(after: low) - : nil - } else { - return input[low] == c - ? input.index(after: low) - : nil - } - } - } else { - let consumers = c.unicodeScalars.map { s in consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - }} - return { input, bounds in - for fn in consumers { - if let idx = fn(input, bounds) { - return idx - } - } - return nil - } - } + return try c.generateConsumer(opts) + case let .scalar(s): - return consumeScalar { - isCaseInsensitive - ? 
$0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - } + // A scalar always matches the same as a single scalar character. This + // means it must match a whole grapheme in grapheme semantic mode, but + // can match a single scalar in scalar semantic mode. + return try Character(s).generateConsumer(opts) case .any: // FIXME: Should this be a total ordering? @@ -211,16 +221,20 @@ extension AST.Atom { var singleScalar: UnicodeScalar? { switch kind { case .scalar(let s): return s.value + case .escaped(let e): + guard let s = e.scalarValue else { return nil } + return s default: return nil } } var singleScalarASCIIValue: UInt8? { + if let s = singleScalar, s.isASCII { + return UInt8(ascii: s) + } switch kind { case let .char(c) where c != "\r\n": return c.asciiValue - case let .scalar(s) where s.value.isASCII: - return UInt8(ascii: s.value) default: return nil } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 4237eda33..119a5d14f 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -315,8 +315,7 @@ extension PrettyPrinter { return } - var charMembers = "" - + var charMembers = StringLiteralBuilder() // This iterates through all of the character class members collecting all // of the members who can be stuffed into a singular '.anyOf(...)' vs. 
@@ -340,14 +339,10 @@ extension PrettyPrinter { switch a { case let .char(c): charMembers.append(c) - - if c == "\\" { - charMembers.append(c) - } - return false case let .scalar(s): - charMembers += "\\u{\(String(s.value, radix: 16, uppercase: true))}" + charMembers.append( + unescaped: "\\u{\(String(s.value, radix: 16, uppercase: true))}") return false case .unconverted(_): return true @@ -356,7 +351,7 @@ extension PrettyPrinter { } case let .quotedLiteral(s): - charMembers += s + charMembers.append(s) return false case .trivia(_): @@ -370,7 +365,7 @@ extension PrettyPrinter { // Also in the same vein, if we have a few atom members but no // nonAtomMembers, then we can emit a single .anyOf(...) for them. if !charMembers.isEmpty, nonCharMembers.isEmpty { - let anyOf = ".anyOf(\(charMembers._quoted))" + let anyOf = ".anyOf(\(charMembers))" indent() @@ -393,7 +388,7 @@ extension PrettyPrinter { printer.indent() if !charMembers.isEmpty { - printer.output(".anyOf(\(charMembers._quoted))") + printer.output(".anyOf(\(charMembers))") if nonCharMembers.count > 0 { printer.output(",") @@ -617,10 +612,39 @@ extension PrettyPrinter { } extension String { - // TODO: Escaping? + fileprivate var _escaped: String { + _replacing(#"\"#, with: #"\\"#)._replacing(#"""#, with: #"\""#) + } + fileprivate var _quoted: String { - "\"\(self._replacing(#"\"#, with: #"\\"#)._replacing(#"""#, with: #"\""#))\"" + _escaped._bareQuoted + } + + fileprivate var _bareQuoted: String { + #""\#(self)""# + } +} + +/// A helper for building string literals, which handles escaping the contents +/// appended. 
+fileprivate struct StringLiteralBuilder { + private var contents = "" + + var result: String { contents._bareQuoted } + var isEmpty: Bool { contents.isEmpty } + + mutating func append(_ str: String) { + contents += str._escaped + } + mutating func append(_ c: Character) { + contents += String(c)._escaped } + mutating func append(unescaped str: String) { + contents += str + } +} +extension StringLiteralBuilder: CustomStringConvertible { + var description: String { result } } extension AST.Atom.AssertionKind { @@ -1107,8 +1131,8 @@ extension DSLTree.Atom { case let .scalar(s): let hex = String(s.value, radix: 16, uppercase: true) - return ("\\u{\(hex)}"._quoted, false) - + return ("\\u{\(hex)}"._bareQuoted, false) + case let .unconverted(a): if a.ast.isUnprintableAtom { return ("#/\(a.ast._regexBase)/#", false) @@ -1149,7 +1173,7 @@ extension DSLTree.Atom { case let .scalar(s): let hex = String(s.value, radix: 16, uppercase: true) - return "\\u{\(hex)}"._quoted + return "\\u{\(hex)}"._bareQuoted case let .unconverted(a): return a.ast._regexBase diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 320d10897..8e58280c0 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -216,7 +216,7 @@ extension AST.Atom { switch self.kind { case let .char(c): return .char(c) - case let .scalar(s): return .char(Character(s.value)) + case let .scalar(s): return .scalar(s.value) case .any: return .any case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index b67c6c242..00d7e273f 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -1120,6 +1120,45 @@ class RegexDSLTests: XCTestCase { } } + 
func testScalarMatching() throws { + // RegexBuilder provides a RegexComponent conformance for UnicodeScalar. In + // grapheme cluster mode, it should only match entire graphemes. It may + // match a single scalar of a grapheme cluster in scalar semantic mode. + XCTAssertNotNil("a".firstMatch(of: "a" as UnicodeScalar)) + XCTAssertNil("a\u{301}".firstMatch(of: "a" as UnicodeScalar)) + XCTAssertNotNil("a\u{301}".firstMatch( + of: ("a" as UnicodeScalar).regex.matchingSemantics(.unicodeScalar))) + + let r1 = Regex { + "a" as UnicodeScalar + } + XCTAssertNil(try r1.firstMatch(in: "a\u{301}")) + XCTAssertNotNil( + try r1.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") + ) + + let r2 = Regex { + CharacterClass.anyOf(["a" as UnicodeScalar, "👍"]) + } + XCTAssertNil(try r2.firstMatch(in: "a\u{301}")) + XCTAssertNotNil( + try r2.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") + ) + + let r3 = Regex { + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👧" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👦" as UnicodeScalar + } + XCTAssertNil(try r3.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) + } + struct SemanticVersion: Equatable { var major: Int var minor: Int diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 97ba3e333..6822330f3 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -117,4 +117,34 @@ extension RenderDSLTests { } """#) } + + func testScalar() throws { + try testConversion(#"\u{B4}"#, #""" + Regex { + "\u{B4}" + } + """#) + try testConversion(#"\u{301}"#, #""" + Regex { + "\u{301}" + } + """#) + try testConversion(#"[\u{301}]"#, #""" + Regex { + One(.anyOf("\u{301}")) 
+ } + """#) + try testConversion(#"[abc\u{301}]"#, #""" + Regex { + One(.anyOf("abc\u{301}")) + } + """#) + + // TODO: We ought to try and preserve the scalar syntax here. + try testConversion(#"a\u{301}"#, #""" + Regex { + "á" + } + """#) + } }