Skip to content

Commit 107bde2

Browse files
committed
Remove character class subtraction
Fixes #7
1 parent 867139c commit 107bde2

File tree

4 files changed

+97
-152
lines changed

4 files changed

+97
-152
lines changed

src/Peachpie.Library.RegularExpressions/RegexCharClass.cs

+28-44
Original file line number | Diff line number | Diff line change
@@ -403,7 +403,6 @@ these intervals. It follows from the nature of the L on each interval.
403403
private readonly StringBuilder _categories;
404404
private bool _canonical;
405405
private bool _negate;
406-
private RegexCharClass _subtractor;
407406

408407
#if DEBUG
409408
static RegexCharClass()
@@ -433,20 +432,19 @@ public RegexCharClass()
433432
_categories = new StringBuilder();
434433
}
435434

436-
private RegexCharClass(bool negate, List<SingleRange> ranges, StringBuilder categories, RegexCharClass subtraction)
435+
private RegexCharClass(bool negate, List<SingleRange> ranges, StringBuilder categories)
437436
{
438437
_rangelist = ranges;
439438
_categories = categories;
440439
_canonical = true;
441440
_negate = negate;
442-
_subtractor = subtraction;
443441
}
444442

445443
public bool CanMerge
446444
{
447445
get
448446
{
449-
return !_negate && _subtractor == null;
447+
return !_negate;
450448
}
451449
}
452450

@@ -507,12 +505,6 @@ private void AddSet(string set)
507505
}
508506
}
509507

510-
public void AddSubtraction(RegexCharClass sub)
511-
{
512-
Debug.Assert(_subtractor == null, "Can't add two subtractions to a char class. ");
513-
_subtractor = sub;
514-
}
515-
516508
/// <summary>
517509
/// Adds a single range of characters to the class.
518510
/// </summary>
@@ -715,20 +707,23 @@ public static char SingletonChar(string set)
715707

716708
public static bool IsMergeable(string charClass)
717709
{
718-
return (!IsNegated(charClass) && !IsSubtraction(charClass));
710+
Debug.Assert(!IsSubtraction(charClass));
711+
return !IsNegated(charClass);
719712
}
720713

721714
public static bool IsEmpty(string charClass)
722715
{
723-
return (charClass[CATEGORYLENGTH] == 0 && charClass[FLAGS] == 0 && charClass[SETLENGTH] == 0 && !IsSubtraction(charClass));
716+
Debug.Assert(!IsSubtraction(charClass));
717+
return (charClass[CATEGORYLENGTH] == 0 && charClass[FLAGS] == 0 && charClass[SETLENGTH] == 0);
724718
}
725719

726720
/// <summary>
727721
/// <c>true</c> if the set contains a single character only
728722
/// </summary>
729723
public static bool IsSingleton(string set)
730724
{
731-
if (set[FLAGS] == 0 && set[CATEGORYLENGTH] == 0 && set[SETLENGTH] == 2 && !IsSubtraction(set) &&
725+
Debug.Assert(!IsSubtraction(set));
726+
if (set[FLAGS] == 0 && set[CATEGORYLENGTH] == 0 && set[SETLENGTH] == 2 &&
732727
(set[SETSTART] == LastChar || set[SETSTART] + 1 == set[SETSTART + 1]))
733728
return true;
734729
else
@@ -737,13 +732,17 @@ public static bool IsSingleton(string set)
737732

738733
public static bool IsSingletonInverse(string set)
739734
{
740-
if (set[FLAGS] == 1 && set[CATEGORYLENGTH] == 0 && set[SETLENGTH] == 2 && !IsSubtraction(set) &&
735+
Debug.Assert(!IsSubtraction(set));
736+
if (set[FLAGS] == 1 && set[CATEGORYLENGTH] == 0 && set[SETLENGTH] == 2 &&
741737
(set[SETSTART] == LastChar || set[SETSTART] + 1 == set[SETSTART + 1]))
742738
return true;
743739
else
744740
return false;
745741
}
746742

743+
/// <remarks>
744+
/// Character class subtraction is not supported; this method is used only in assertions, as a regression check.
745+
/// </remarks>
747746
private static bool IsSubtraction(string charClass)
748747
{
749748
return (charClass.Length > SETSTART + charClass[SETLENGTH] + charClass[CATEGORYLENGTH]);
@@ -775,42 +774,32 @@ public static bool IsWordChar(char ch)
775774

776775
public static bool CharInClass(char ch, string set)
777776
{
778-
return CharInClassRecursive(ch, set, 0);
779-
}
780-
781-
private static bool CharInClassRecursive(char ch, string set, int start)
782-
{
783-
int mySetLength = set[start + SETLENGTH];
784-
int myCategoryLength = set[start + CATEGORYLENGTH];
785-
int myEndPosition = start + SETSTART + mySetLength + myCategoryLength;
786-
787-
bool subtracted = false;
777+
int mySetLength = set[SETLENGTH];
778+
int myCategoryLength = set[CATEGORYLENGTH];
779+
int myEndPosition = SETSTART + mySetLength + myCategoryLength;
788780

789-
if (set.Length > myEndPosition)
790-
{
791-
subtracted = CharInClassRecursive(ch, set, myEndPosition);
792-
}
781+
Debug.Assert(set.Length == myEndPosition);
793782

794-
bool b = CharInClassInternal(ch, set, start, mySetLength, myCategoryLength);
783+
bool b = CharInClassInternal(ch, set, mySetLength, myCategoryLength);
795784

796785
// Note: character class subtraction is no longer supported (see the assertion above), so the
797786
// negation flag simply inverts the membership result of this single char class.
798-
if (set[start + FLAGS] == 1)
787+
if (set[FLAGS] == 1)
799788
b = !b;
800789

801-
return b && !subtracted;
790+
return b;
802791
}
803792

804793
/// <summary>
805794
/// Determines a character's membership in a character class (via the
806795
/// string representation of the class).
807796
/// </summary>
808-
private static bool CharInClassInternal(char ch, string set, int start, int mySetLength, int myCategoryLength)
797+
private static bool CharInClassInternal(char ch, string set, int mySetLength, int myCategoryLength)
809798
{
810799
int min;
811800
int max;
812801
int mid;
813-
min = start + SETSTART;
802+
min = SETSTART;
814803
max = min + mySetLength;
815804

816805
while (min != max)
@@ -829,22 +818,22 @@ private static bool CharInClassInternal(char ch, string set, int start, int mySe
829818
// SETSTART is odd, we can simplify it out of the equation. But if it changes we need to
830819
// reverse this check.
831820
Debug.Assert((SETSTART & 0x1) == 1, "If SETSTART is not odd, the calculation below this will be reversed");
832-
if ((min & 0x1) == (start & 0x1))
821+
if ((min & 0x1) == 0) // Note: originally ((min & 0x1) == (start & 0x1)), but start is always 0
833822
return true;
834823
else
835824
{
836825
if (myCategoryLength == 0)
837826
return false;
838827

839-
return CharInCategory(ch, set, start, mySetLength, myCategoryLength);
828+
return CharInCategory(ch, set, mySetLength, myCategoryLength);
840829
}
841830
}
842831

843-
private static bool CharInCategory(char ch, string set, int start, int mySetLength, int myCategoryLength)
832+
private static bool CharInCategory(char ch, string set, int mySetLength, int myCategoryLength)
844833
{
845834
UnicodeCategory chcategory = CharUnicodeInfo.GetUnicodeCategory(ch);
846835

847-
int i = start + SETSTART + mySetLength;
836+
int i = SETSTART + mySetLength;
848837
int end = i + myCategoryLength;
849838
while (i < end)
850839
{
@@ -993,11 +982,9 @@ private static RegexCharClass ParseRecursive(string charClass, int start)
993982
ranges.Add(new SingleRange(first, last));
994983
}
995984

996-
RegexCharClass sub = null;
997-
if (charClass.Length > myEndPosition)
998-
sub = ParseRecursive(charClass, myEndPosition);
985+
Debug.Assert(charClass.Length == myEndPosition);
999986

1000-
return new RegexCharClass(charClass[start + FLAGS] == 1, ranges, new StringBuilder(charClass.Substring(end, myCategoryLength)), sub);
987+
return new RegexCharClass(charClass[start + FLAGS] == 1, ranges, new StringBuilder(charClass.Substring(end, myCategoryLength)));
1001988
}
1002989

1003990
/// <summary>
@@ -1046,9 +1033,6 @@ public string ToStringClass()
10461033

10471034
vsb.Append(_categories.ToString());
10481035

1049-
if (_subtractor != null)
1050-
vsb.Append(_subtractor.ToStringClass());
1051-
10521036
return vsb.ToString();
10531037
}
10541038

src/Peachpie.Library.RegularExpressions/RegexParser.cs

+4-53
Original file line number | Diff line number | Diff line change
@@ -925,41 +925,10 @@ private RegexCharClass ScanCharClass(bool caseInsensitive, bool scanOnly)
925925
inRange = false;
926926
if (!scanOnly)
927927
{
928-
bool processed = false;
929-
if (ch == '[' && !fTranslatedChar && !firstChar)
930-
{
931-
// We thought we were in a range, but we're actually starting a subtraction.
932-
// In that case, we'll add chPrev to our char class, skip the opening [, and
933-
// scan the new character class recursively. If the subtraction class is
934-
// invalid, we assume that it was not intended, as PCRE has no subtractions.
935-
936-
// Back up the current text position
937-
int subStartPos = Textpos();
938-
939-
// Scan the supposed subtraction
940-
var subtracted = ScanCharClass(caseInsensitive, scanOnly);
941-
942-
if (CharsRight() > 0 && RightChar() != ']')
943-
{
944-
// If the subtraction is invalid, rollback to the previous text position
945-
// and treat '[' as any other character
946-
Textto(subStartPos);
947-
}
948-
else
949-
{
950-
cc.AddChar(chPrev);
951-
cc.AddSubtraction(subtracted);
952-
processed = true;
953-
}
954-
}
955-
956-
if (!processed)
957-
{
958-
// a regular range, like a-z
959-
if (chPrev > ch)
960-
throw MakeException(SR.ReversedCharRange);
961-
cc.AddRange(chPrev, ch);
962-
}
928+
// a regular range, like a-z
929+
if (chPrev > ch)
930+
throw MakeException(SR.ReversedCharRange);
931+
cc.AddRange(chPrev, ch);
963932
}
964933
}
965934
else if (CharsRight() >= 2 && RightChar() == '-' && RightChar(1) != ']')
@@ -969,24 +938,6 @@ private RegexCharClass ScanCharClass(bool caseInsensitive, bool scanOnly)
969938
inRange = true;
970939
MoveRight();
971940
}
972-
else if (CharsRight() >= 1 && ch == '-' && !fTranslatedChar && RightChar() == '[' && !firstChar)
973-
{
974-
// we aren't in a range, and now there is a subtraction. Usually this happens
975-
// only when a subtraction follows a range, like [a-z-[b]]
976-
if (!scanOnly)
977-
{
978-
MoveRight(1);
979-
cc.AddSubtraction(ScanCharClass(caseInsensitive, scanOnly));
980-
981-
if (CharsRight() > 0 && RightChar() != ']')
982-
throw MakeException(SR.SubtractionMustBeLast);
983-
}
984-
else
985-
{
986-
MoveRight(1);
987-
ScanCharClass(caseInsensitive, scanOnly);
988-
}
989-
}
990941
else
991942
{
992943
if (!scanOnly)

tests/Peachpie.Library.RegularExpressions.Tests/PcreTests.cs

+10
Original file line number | Diff line number | Diff line change
@@ -498,5 +498,15 @@ public void TestSeemingCharacterClassSubtraction()
498498
Assert.False(match(pattern, @"\").Success);
499499
Assert.False(match(pattern, "\"").Success);
500500
}
501+
502+
[Fact]
503+
public void TestNotSupportedCharacterClassSubtraction()
504+
{
505+
Assert.False(match(@"/[a-z-[aeiuo]]/", "b").Success);
506+
Assert.False(match(@"/[a-z#-[aeiuo]]/", "b").Success);
507+
Assert.True(match(@"/[a-z-[aeiuo]]/", "-]").Success);
508+
Assert.True(match(@"/[a-z#-[aeiuo]]/", "-]").Success);
509+
Assert.True(match(@"/[a-z#-[aeiuo]]/", "#]").Success);
510+
}
501511
}
502512
}

0 commit comments

Comments
 (0)