From 307589e13a493d886a6a9ab7b7f6873de906e10e Mon Sep 17 00:00:00 2001 From: kokke Date: Mon, 11 Dec 2017 21:35:55 +0100 Subject: [PATCH 01/32] Update re.c --- re.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/re.c b/re.c index 386f0bd..3c8d750 100644 --- a/re.c +++ b/re.c @@ -31,7 +31,6 @@ #include "re.h" #include <stdio.h> -#include <stdlib.h> /* Definitions: */ @@ -76,26 +75,28 @@ int re_match(const char* pattern, const char* text) int re_matchp(re_t pattern, const char* text) { - int idx = -1; - - if (pattern[0].type == BEGIN) - { - return ((matchpattern(&pattern[1], text)) ? 0 : -1); - } - else + if (pattern != 0) { - do + if (pattern[0].type == BEGIN) + { + return ((matchpattern(&pattern[1], text)) ? 0 : -1); + } + else { - idx += 1; - if (matchpattern(pattern, text)) + int idx = -1; + + do { - return idx; + idx += 1; + if (matchpattern(pattern, text)) + { + return idx; + } } + while (*text++ != '\0'); } - while (*text++ != '\0'); - - return -1; } + return -1; } re_t re_compile(const char* pattern) @@ -184,15 +185,16 @@ re_t re_compile(const char* pattern) && (pattern[i] != '\0')) /* Missing ] */ { if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) { - fputs("exceeded internal buffer!\n", stderr); - exit(-1); + //fputs("exceeded internal buffer!\n", stderr); + return 0; } ccl_buf[ccl_bufidx++] = pattern[i]; } - if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) { + if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) + { /* Catches cases such as [00000000000000000000000000000000000000][ */ - fputs("exceeded internal buffer!\n", stderr); - exit(-1); + //fputs("exceeded internal buffer!\n", stderr); + return 0; } /* Null-terminate string end */ ccl_buf[ccl_bufidx++] = 0; From a3b5c6ac9f9e41b286893d7be921324d00aca951 Mon Sep 17 00:00:00 2001 From: kokke Date: Mon, 11 Dec 2017 23:23:22 +0100 Subject: [PATCH 02/32] Update test1.c --- tests/test1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test1.c b/tests/test1.c index 534d279..a69e379 100644 --- 
a/tests/test1.c +++ b/tests/test1.c @@ -54,6 +54,7 @@ char* test_vector[][3] = { OK, "b[k-z]*", "ab" }, { NOK, "[0-9]", " - " }, { OK, "[^0-9]", " - " }, + { OK, "0|", "0|" }, { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !" }, { OK, "[Hh]ello [Ww]orld\\s*[!]?", "hello world !" }, { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello World !" }, From 5e73adc6e39d1d2560bdd7bc926d263e37ca7fa2 Mon Sep 17 00:00:00 2001 From: kokke Date: Mon, 11 Dec 2017 23:23:40 +0100 Subject: [PATCH 03/32] Update re.c --- re.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/re.c b/re.c index 3c8d750..7a78b40 100644 --- a/re.c +++ b/re.c @@ -38,7 +38,7 @@ #define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer in. */ -enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, BRANCH }; +enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ }; typedef struct regex_t { @@ -125,7 +125,7 @@ re_t re_compile(const char* pattern) case '*': { re_compiled[j].type = STAR; } break; case '+': { re_compiled[j].type = PLUS; } break; case '?': { re_compiled[j].type = QUESTIONMARK; } break; - case '|': { re_compiled[j].type = BRANCH; } break; +/* case '|': { re_compiled[j].type = BRANCH; } break; <-- not working properly */ /* Escaped character-classes (\s \w ...): */ case '\\': @@ -430,10 +430,12 @@ static int matchpattern(regex_t* pattern, const char* text) { return (text[0] == '\0'); } +/* Branching is not working properly else if (pattern[1].type == BRANCH) { return (matchpattern(pattern, text) || matchpattern(&pattern[2], text)); } +*/ } while ((text[0] != '\0') && matchone(*pattern++, *text++)); From 1f6af9355765da54e1c00f9a62a9f2305145fe5e Mon Sep 17 00:00:00 2001 From: kokke Date: Mon, 11 Dec 2017 23:26:53 +0100 Subject: [PATCH 04/32] Update README.md --- 
README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 444ea11..51bb25b 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ The main design goal of this library is to be small, correct, self contained and > gcc -Os -c re.c > size re.o text data bss dec hex filename - 2341 0 544 2885 b45 re.o + 2026 136 544 2706 a92 re.o ``` For 8-bit AVR using AVR-GCC 4.8.1 it's around 2kb code and less RAM : @@ -38,7 +38,7 @@ The main design goal of this library is to be small, correct, self contained and > avr-gcc -Os -c re.c > size re.o text data bss dec hex filename - 2132 0 130 2262 8d6 re.o + 2062 0 130 2192 890 re.o ``` ### API From 350763de23b1e42d33d3b3c07e2163e1b72ffe6e Mon Sep 17 00:00:00 2001 From: kokke Date: Fri, 23 Mar 2018 12:08:34 +0100 Subject: [PATCH 05/32] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 51bb25b..de85e96 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,7 @@ if (match_idx != -1) For more usage examples I encourage you to look at the code in the `tests`-folder. ### TODO +- Fix non-greedy quantifiers, [see #12](https://github.com/kokke/tiny-regex-c/issues/12) - Fix the implementation of inverted character classes. - Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`. - Add `example.c` that demonstrates usage. 
From 911f8dc4ee063c96de8af55b5fa5f76faf8f6edf Mon Sep 17 00:00:00 2001 From: kokke Date: Fri, 23 Mar 2018 12:45:20 +0100 Subject: [PATCH 06/32] Update test1.c --- tests/test1.c | 109 ++++++++++++++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 48 deletions(-) diff --git a/tests/test1.c b/tests/test1.c index a69e379..50d8c70 100644 --- a/tests/test1.c +++ b/tests/test1.c @@ -13,53 +13,67 @@ char* test_vector[][3] = { - { OK, "\\d", "5" }, - { OK, "\\w+", "hej" }, - { OK, "\\s", "\t \n" }, - { NOK, "\\S", "\t \n" }, - { OK, "[\\s]", "\t \n" }, - { NOK, "[\\S]", "\t \n" }, - { NOK, "\\D", "5" }, - { NOK, "\\W+", "hej" }, - { OK, "[0-9]+", "12345" }, - { OK, "\\D", "hej" }, - { NOK, "\\d", "hej" }, - { OK, "[^\\w]", "\\" }, - { OK, "[\\W]", "\\" }, - { NOK, "[\\w]", "\\" }, - { OK, "[^\\d]", "d" }, - { NOK, "[\\d]", "d" }, - { NOK, "[^\\D]", "d" }, - { OK, "[\\D]", "d" }, - { OK, "^.*\\\\.*$", "c:\\Tools" }, - { OK, "^[\\+-]*[\\d]+$", "+27" }, - { OK, "[abc]", "1c2" }, - { NOK, "[abc]", "1C2" }, - { OK, "[1-5]+", "0123456789" }, - { OK, "[.2]", "1C2" }, - { OK, "a*$", "Xaa" }, - { OK, "a*$", "Xaa" }, - { OK, "[a-h]+", "abcdefghxxx" }, - { NOK, "[a-h]+", "ABCDEFGH" }, - { OK, "[A-H]+", "ABCDEFGH" }, - { NOK, "[A-H]+", "abcdefgh" }, - { OK, "[^\\s]+", "abc def" }, - { OK, "[^fc]+", "abc def" }, - { OK, "[^d\\sf]+", "abc def" }, - { OK, "\n", "abc\ndef" }, - { OK, "b.\\s*\n", "aa\r\nbb\r\ncc\r\n\r\n" }, - { OK, ".*c", "abcabc" }, - { OK, ".+c", "abcabc" }, - { OK, "[b-z].*", "ab" }, - { OK, "b[k-z]*", "ab" }, - { NOK, "[0-9]", " - " }, - { OK, "[^0-9]", " - " }, - { OK, "0|", "0|" }, - { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !" }, - { OK, "[Hh]ello [Ww]orld\\s*[!]?", "hello world !" }, - { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello World !" }, - { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world! " }, - { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !" 
}, + { OK, "\\d", "5" }, + { OK, "\\w+", "hej" }, + { OK, "\\s", "\t \n" }, + { NOK, "\\S", "\t \n" }, + { OK, "[\\s]", "\t \n" }, + { NOK, "[\\S]", "\t \n" }, + { NOK, "\\D", "5" }, + { NOK, "\\W+", "hej" }, + { OK, "[0-9]+", "12345" }, + { OK, "\\D", "hej" }, + { NOK, "\\d", "hej" }, + { OK, "[^\\w]", "\\" }, + { OK, "[\\W]", "\\" }, + { NOK, "[\\w]", "\\" }, + { OK, "[^\\d]", "d" }, + { NOK, "[\\d]", "d" }, + { NOK, "[^\\D]", "d" }, + { OK, "[\\D]", "d" }, + { OK, "^.*\\\\.*$", "c:\\Tools" }, + { OK, "^[\\+-]*[\\d]+$", "+27" }, + { OK, "[abc]", "1c2" }, + { NOK, "[abc]", "1C2" }, + { OK, "[1-5]+", "0123456789" }, + { OK, "[.2]", "1C2" }, + { OK, "a*$", "Xaa" }, + { OK, "a*$", "Xaa" }, + { OK, "[a-h]+", "abcdefghxxx" }, + { NOK, "[a-h]+", "ABCDEFGH" }, + { OK, "[A-H]+", "ABCDEFGH" }, + { NOK, "[A-H]+", "abcdefgh" }, + { OK, "[^\\s]+", "abc def" }, + { OK, "[^fc]+", "abc def" }, + { OK, "[^d\\sf]+", "abc def" }, + { OK, "\n", "abc\ndef" }, + { OK, "b.\\s*\n", "aa\r\nbb\r\ncc\r\n\r\n" }, + { OK, ".*c", "abcabc" }, + { OK, ".+c", "abcabc" }, + { OK, "[b-z].*", "ab" }, + { OK, "b[k-z]*", "ab" }, + { NOK, "[0-9]", " - " }, + { OK, "[^0-9]", " - " }, + { OK, "0|", "0|" }, + { NOK, "\\d\\d:\\d\\d:\\d\\d", "0s:00:00" }, + { NOK, "\\d\\d:\\d\\d:\\d\\d", "000:00" }, + { NOK, "\\d\\d:\\d\\d:\\d\\d", "00:0000" }, + { NOK, "\\d\\d:\\d\\d:\\d\\d", "100:0:00" }, + { NOK, "\\d\\d:\\d\\d:\\d\\d", "00:100:00" }, + { NOK, "\\d\\d:\\d\\d:\\d\\d", "0:00:100" }, + { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "0:0:0" }, + { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "0:00:0" }, + { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "0:0:00" }, + { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "00:0:0" }, + { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "00:00:0" }, + { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "00:0:00" }, + { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "0:00:00" }, + { OK, "\\d\\d?:\\d\\d?:\\d\\d?", "00:00:00" }, + { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !" }, + { OK, "[Hh]ello [Ww]orld\\s*[!]?", "hello world !" 
}, + { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello World !" }, + { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world! " }, + { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !" }, { OK, "[Hh]ello [Ww]orld\\s*[!]?", "hello World !" }, /* { OK, "[^\\w][^-1-4]", ")T" }, @@ -121,4 +135,3 @@ int main() return 0; } - From d087143b7f3ec48514db88092ba1149723d7afa7 Mon Sep 17 00:00:00 2001 From: kokke Date: Fri, 23 Mar 2018 12:47:27 +0100 Subject: [PATCH 07/32] Update re.c --- re.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/re.c b/re.c index 7a78b40..83fd68f 100644 --- a/re.c +++ b/re.c @@ -375,6 +375,16 @@ static int matchplus(regex_t p, regex_t* pattern, const char* text) return 0; } +static int matchquestion(regex_t p, regex_t* pattern, const char* text) +{ + if ((text[0] != '\0') && matchone(p, *text++)) + { + matchpattern(pattern, text); + } + return 1; +} + + #if 0 @@ -383,7 +393,7 @@ static int matchpattern(regex_t* pattern, const char* text) { if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK)) { - return 1; + return matchquestion(pattern[1], &pattern[2], text); } else if (pattern[1].type == STAR) { @@ -416,7 +426,7 @@ static int matchpattern(regex_t* pattern, const char* text) { if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK)) { - return 1; + return matchquestion(pattern[1], &pattern[2], text); } else if (pattern[1].type == STAR) { From 309a1f36abd4b5d57d7a20723d72da10ef87fe00 Mon Sep 17 00:00:00 2001 From: kokke Date: Fri, 23 Mar 2018 12:52:34 +0100 Subject: [PATCH 08/32] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index de85e96..51bb25b 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,6 @@ if (match_idx != -1) For more usage examples I encourage you to look at the code in the `tests`-folder. 
### TODO -- Fix non-greedy quantifiers, [see #12](https://github.com/kokke/tiny-regex-c/issues/12) - Fix the implementation of inverted character classes. - Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`. - Add `example.c` that demonstrates usage. From ff7f6e111c6c6324cc8389c0a225808225808c61 Mon Sep 17 00:00:00 2001 From: kokke Date: Fri, 23 Mar 2018 13:06:49 +0100 Subject: [PATCH 09/32] Update Makefile --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 5cfd437..01beaaf 100644 --- a/Makefile +++ b/Makefile @@ -29,12 +29,12 @@ test: all @python ./scripts/regex_test.py \\d+\\w?\\D\\d $(NRAND_TESTS) @python ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) @python ./scripts/regex_test.py \\w*\\d?\\w\\? $(NRAND_TESTS) - @#python ./scripts/regex_test.py [^\\d]+\\\\?\\s $(NRAND_TESTS) + @python ./scripts/regex_test.py [^\\d]+\\\\?\\s $(NRAND_TESTS) @#python ./scripts/regex_test.py [^\\w][^-1-4] $(NRAND_TESTS) @#python ./scripts/regex_test.py [^\\w] $(NRAND_TESTS) @#python ./scripts/regex_test.py [^1-4] $(NRAND_TESTS) @#python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) - @#python ./scripts/regex_test.py [^\\d]+\\s?[\\w]* $(NRAND_TESTS) + @python ./scripts/regex_test.py [^\\d]+\\s?[\\w]* $(NRAND_TESTS) @python ./scripts/regex_test.py a+b*[ac]*.+.*.[\\.]. $(NRAND_TESTS) @python ./scripts/regex_test.py a?b[ac*]*.?[\\]+[?]? 
$(NRAND_TESTS) @#python ./scripts/regex_test.py [1-5-]+[-1-2]-[-] $(NRAND_TESTS) From e01ec35899803fbeab89af1fb7f72cdda15f3e26 Mon Sep 17 00:00:00 2001 From: kokke Date: Fri, 23 Mar 2018 14:02:23 +0100 Subject: [PATCH 10/32] Update README.md --- README.md | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 51bb25b..2a0cd78 100644 --- a/README.md +++ b/README.md @@ -30,17 +30,27 @@ The main design goal of this library is to be small, correct, self contained and > gcc -Os -c re.c > size re.o text data bss dec hex filename - 2026 136 544 2706 a92 re.o + 2319 0 544 2863 b2f re.o + ``` + For ARM/Thumb using GCC 4.8.1 it's around 1.5kb code and less RAM : + ``` + > arm-none-eabi-gcc -Os -mthumb -c re.c + > size re.o + text data bss dec hex filename + 1418 0 280 1698 6a2 re.o + ``` For 8-bit AVR using AVR-GCC 4.8.1 it's around 2kb code and less RAM : ``` > avr-gcc -Os -c re.c > size re.o text data bss dec hex filename - 2062 0 130 2192 890 re.o + 2128 0 130 2258 8d2 re.o ``` + + ### API This is the public / exported API: ```C From b38f74bbeb972765a3b6b3f87339934d42a66f44 Mon Sep 17 00:00:00 2001 From: kokke Date: Fri, 23 Mar 2018 15:31:11 +0100 Subject: [PATCH 11/32] Create test_rand_neg.c --- tests/test_rand_neg.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 tests/test_rand_neg.c diff --git a/tests/test_rand_neg.c b/tests/test_rand_neg.c new file mode 100644 index 0000000..192ce5c --- /dev/null +++ b/tests/test_rand_neg.c @@ -0,0 +1,29 @@ +/* + Negative version of test_rand.c -- returns true if no match + + This program tries to match a given regular expression with text given as input to stdin. + If the text is NOT a match for the pattern, the program returns 0. + If the text does match the pattern, the program returns -2. + + This program is used in random testing to test a lot of random text and regex together. 
+ See ./scripts/regex_test_neg.py and the Makefile for this project for the gritty details. +*/ + +#include <stdio.h> +#include "re.h" + + +int main(int argc, char** argv) +{ + if (argc == 3) + { + int m = re_match(argv[1], argv[2]); + if (m == -1) + return 0; + } + else + { + printf("\nUsage: %s \n", argv[0]); + } + return -2; +} From 4017b4d5544e6f0a783246e503b509b164793ae0 Mon Sep 17 00:00:00 2001 From: kokke Date: Fri, 23 Mar 2018 15:31:40 +0100 Subject: [PATCH 12/32] Create regex_test_neg.py --- scripts/regex_test_neg.py | 82 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 scripts/regex_test_neg.py diff --git a/scripts/regex_test_neg.py b/scripts/regex_test_neg.py new file mode 100644 index 0000000..c3daad6 --- /dev/null +++ b/scripts/regex_test_neg.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python + +""" + This program generates random text that does NOT match a given regex-pattern. + The pattern is given via sys.argv and the generated text is passed to + the binary 'tests/test_rand_neg' to check that the C implementation also + rejects the generated text for that regex-pattern. + The exit-code of the testing program is used to determine test success. 
+ + This script is called by the Makefile when doing 'make test' +""" + + +import re +import sys +import string +import random +from subprocess import call + + +prog = "./tests/test_rand_neg" + +if len(sys.argv) < 2: + print("") + print("usage: %s pattern [nrepeat]" % sys.argv[0]) + print(" where [nrepeat] is optional") + print("") + sys.exit(-1) + +own_prog = sys.argv[0] +pattern = sys.argv[1] +if len(sys.argv) > 2: + ntests = int(sys.argv[2]) +else: + ntests = 10 +nfails = 0 +repeats = ntests + + +try: + repeats = int(sys.argv[2]) +except: + pass + +sys.stdout.write("%-35s" % (" pattern '%s': " % pattern)) + + + + +def gen_no_match(pattern, minlen=1, maxlen=50, maxattempts=500): + nattempts = 0 + while True: + nattempts += 1 + ret = "".join([random.choice(string.printable) for i in range(random.Random().randint(minlen, maxlen))]) + if re.findall(pattern, ret) == []: + return ret + if nattempts >= maxattempts: + raise Exception("Could not generate string that did not match the regex pattern '%s' after %d attempts" % (pattern, nattempts)) + + + +while repeats >= 0: + try: + repeats -= 1 + example = gen_no_match(pattern) + #print("%s %s %s" % (prog, pattern, example)) + ret = call([prog, "\"%s\"" % pattern, "\"%s\"" % example]) + if ret != 0: + escaped = repr(example) # escapes special chars for better printing + print(" FAIL : matches %s unexpectedly [%s]." 
% (escaped, ", ".join([("0x%02x" % ord(e)) for e in example]) )) + nfails += 1 + + except: + #import traceback + #print("EXCEPTION!") + #raw_input(traceback.format_exc()) + ntests -= 1 + repeats += 1 + #nfails += 1 + +sys.stdout.write("%4d/%d tests succeeded \n" % (ntests - nfails, ntests)) +#print("") From f7c86d3df9fe50add706771b92724fddb8175134 Mon Sep 17 00:00:00 2001 From: kokke Date: Fri, 23 Mar 2018 15:32:37 +0100 Subject: [PATCH 13/32] Update Makefile --- Makefile | 96 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 65 insertions(+), 31 deletions(-) diff --git a/Makefile b/Makefile index 01beaaf..942991f 100644 --- a/Makefile +++ b/Makefile @@ -6,12 +6,13 @@ NRAND_TESTS := 1000 # Flags to pass to compiler -CFLAGS := -Os -Wall -Wextra -std=c99 -I. +CFLAGS := -O3 -Wall -Wextra -std=c99 -I. all: @$(CC) $(CFLAGS) re.c tests/test1.c -o tests/test1 @$(CC) $(CFLAGS) re.c tests/test2.c -o tests/test2 @$(CC) $(CFLAGS) re.c tests/test_rand.c -o tests/test_rand + @$(CC) $(CFLAGS) re.c tests/test_rand_neg.c -o tests/test_rand_neg clean: @rm -f tests/test1 tests/test2 tests/test_rand @@ -26,38 +27,71 @@ test: all @./tests/test1 @echo Testing patterns against $(NRAND_TESTS) random strings matching the Python implementation and comparing: @echo - @python ./scripts/regex_test.py \\d+\\w?\\D\\d $(NRAND_TESTS) - @python ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) - @python ./scripts/regex_test.py \\w*\\d?\\w\\? $(NRAND_TESTS) - @python ./scripts/regex_test.py [^\\d]+\\\\?\\s $(NRAND_TESTS) - @#python ./scripts/regex_test.py [^\\w][^-1-4] $(NRAND_TESTS) - @#python ./scripts/regex_test.py [^\\w] $(NRAND_TESTS) - @#python ./scripts/regex_test.py [^1-4] $(NRAND_TESTS) - @#python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) - @python ./scripts/regex_test.py [^\\d]+\\s?[\\w]* $(NRAND_TESTS) - @python ./scripts/regex_test.py a+b*[ac]*.+.*.[\\.]. $(NRAND_TESTS) - @python ./scripts/regex_test.py a?b[ac*]*.?[\\]+[?]? 
$(NRAND_TESTS) - @#python ./scripts/regex_test.py [1-5-]+[-1-2]-[-] $(NRAND_TESTS) - @python ./scripts/regex_test.py [-1-3]-[-]+ $(NRAND_TESTS) - @python ./scripts/regex_test.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) - @python ./scripts/regex_test.py [-1-2]* $(NRAND_TESTS) - @python ./scripts/regex_test.py \\s?[a-fKL098]+-? $(NRAND_TESTS) - @python ./scripts/regex_test.py [\\-]* $(NRAND_TESTS) - @python ./scripts/regex_test.py [\\\\]+ $(NRAND_TESTS) - @python ./scripts/regex_test.py [0-9a-fA-F]+ $(NRAND_TESTS) - @python ./scripts/regex_test.py [1379][2468][abcdef] $(NRAND_TESTS) - @python ./scripts/regex_test.py [012345-9]?[0123-789] $(NRAND_TESTS) - @python ./scripts/regex_test.py [012345-9] $(NRAND_TESTS) - @python ./scripts/regex_test.py [0-56789] $(NRAND_TESTS) - @python ./scripts/regex_test.py [abc-zABC-Z] $(NRAND_TESTS) - @python ./scripts/regex_test.py [a\d]?1234 $(NRAND_TESTS) - @python ./scripts/regex_test.py .*123faerdig $(NRAND_TESTS) - @python ./scripts/regex_test.py .?\\w+jsj$ $(NRAND_TESTS) - @python ./scripts/regex_test.py [?to][+to][?ta][*ta] $(NRAND_TESTS) - @#python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) + @python ./scripts/regex_test.py \\d+\\w?\\D\\d $(NRAND_TESTS) + @python ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) + @python ./scripts/regex_test.py \\w*\\d?\\w\\? $(NRAND_TESTS) + @python ./scripts/regex_test.py [^\\d]+\\\\?\\s $(NRAND_TESTS) + @python ./scripts/regex_test.py [^\\w][^-1-4] $(NRAND_TESTS) + @python ./scripts/regex_test.py [^\\w] $(NRAND_TESTS) + @python ./scripts/regex_test.py [^1-4] $(NRAND_TESTS) + @python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) + @python ./scripts/regex_test.py [^\\d]+\\s?[\\w]* $(NRAND_TESTS) + @python ./scripts/regex_test.py a+b*[ac]*.+.*.[\\.]. $(NRAND_TESTS) + @python ./scripts/regex_test.py a?b[ac*]*.?[\\]+[?]? 
$(NRAND_TESTS) + @#python ./scripts/regex_test.py [1-5-]+[-1-2]-[-] $(NRAND_TESTS) + @python ./scripts/regex_test.py [-1-3]-[-]+ $(NRAND_TESTS) + @python ./scripts/regex_test.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) + @python ./scripts/regex_test.py [-1-2]* $(NRAND_TESTS) + @python ./scripts/regex_test.py \\s?[a-fKL098]+-? $(NRAND_TESTS) + @python ./scripts/regex_test.py [\\-]* $(NRAND_TESTS) + @python ./scripts/regex_test.py [\\\\]+ $(NRAND_TESTS) + @python ./scripts/regex_test.py [0-9a-fA-F]+ $(NRAND_TESTS) + @python ./scripts/regex_test.py [1379][2468][abcdef] $(NRAND_TESTS) + @python ./scripts/regex_test.py [012345-9]?[0123-789] $(NRAND_TESTS) + @python ./scripts/regex_test.py [012345-9] $(NRAND_TESTS) + @python ./scripts/regex_test.py [0-56789] $(NRAND_TESTS) + @python ./scripts/regex_test.py [abc-zABC-Z] $(NRAND_TESTS) + @python ./scripts/regex_test.py [a\d]?1234 $(NRAND_TESTS) + @python ./scripts/regex_test.py .*123faerdig $(NRAND_TESTS) + @python ./scripts/regex_test.py .?\\w+jsj$ $(NRAND_TESTS) + @python ./scripts/regex_test.py [?to][+to][?ta][*ta] $(NRAND_TESTS) + @python ./scripts/regex_test.py \\d+ $(NRAND_TESTS) + @python ./scripts/regex_test.py [a-z]+ $(NRAND_TESTS) + @python ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) + @python ./scripts/regex_test.py \\w $(NRAND_TESTS) + @python ./scripts/regex_test.py \\d $(NRAND_TESTS) + @python ./scripts/regex_test.py [\\d] $(NRAND_TESTS) + @python ./scripts/regex_test.py [^\\d] $(NRAND_TESTS) + @#python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) + @echo + @echo + @echo + @echo Testing rejection of patterns against $(NRAND_TESTS) random strings also rejected by the Python implementation: + @echo + @python ./scripts/regex_test_neg.py \\d+ $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py [a-z]+ $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py ^\\w $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py ^\\d $(NRAND_TESTS) + 
@python ./scripts/regex_test_neg.py [\\d] $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py ^[^\\d] $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py [^\\w]+ $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py ^[\\w]+ $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py ^[^0-9] $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py [a-z].[A-Z] $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py [-1-3]-[-]+ $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py [-0-9]+ $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py [\\-]+ $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py [\\\\]+ $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py [0-9a-fA-F]+ $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py [1379][2468][abcdef] $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py [012345-9] $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py [0-56789] $(NRAND_TESTS) + @python ./scripts/regex_test_neg.py .*123faerdig $(NRAND_TESTS) @echo @echo @./tests/test2 @echo @echo - + From 2dfb4639687bb1c72770129262a8f0038b424e07 Mon Sep 17 00:00:00 2001 From: kokke Date: Fri, 23 Mar 2018 15:39:17 +0100 Subject: [PATCH 14/32] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2a0cd78..ae02db2 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,7 @@ For more usage examples I encourage you to look at the code in the `tests`-folde - Fix implementation of branches (`|`), and see if that can lead us closer to groups as well, e.g. `(a|b)+`. - Add `example.c` that demonstrates usage. - Add `tests/test_perf.c` for performance and time measurements. -- Testing: add matching on purely random data, comparing with Python's `re`. Currently only matching known positives - need to verify rejection as well. +- Testing: Improve pattern rejection testing. 
### FAQ - *Q: What differentiates this library from other C regex implementations?* From ef4bbf819fd50540cc79b02ba21549276200c6cc Mon Sep 17 00:00:00 2001 From: roflcopter4 Date: Mon, 16 Apr 2018 14:26:28 -0600 Subject: [PATCH 15/32] Check for correct python2 binry in Makefile --- Makefile | 119 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 64 insertions(+), 55 deletions(-) diff --git a/Makefile b/Makefile index 942991f..ab51f33 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,14 @@ CC := gcc # Number of random text expressions to generate, for random testing NRAND_TESTS := 1000 +PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \ + echo 'python'; \ + elif command -v python2 >/dev/null 2>&1; then \ + echo 'python2'; \ + else \ + echo 'Error: no compatible python version found.' >&2; \ + exit 1; \ + fi # Flags to pass to compiler CFLAGS := -O3 -Wall -Wextra -std=c99 -I. @@ -22,73 +30,74 @@ clean: test: all + @$(test $(PYTHON)) @echo @echo Testing hand-picked regex\'s: @./tests/test1 @echo Testing patterns against $(NRAND_TESTS) random strings matching the Python implementation and comparing: @echo - @python ./scripts/regex_test.py \\d+\\w?\\D\\d $(NRAND_TESTS) - @python ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) - @python ./scripts/regex_test.py \\w*\\d?\\w\\? $(NRAND_TESTS) - @python ./scripts/regex_test.py [^\\d]+\\\\?\\s $(NRAND_TESTS) - @python ./scripts/regex_test.py [^\\w][^-1-4] $(NRAND_TESTS) - @python ./scripts/regex_test.py [^\\w] $(NRAND_TESTS) - @python ./scripts/regex_test.py [^1-4] $(NRAND_TESTS) - @python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) - @python ./scripts/regex_test.py [^\\d]+\\s?[\\w]* $(NRAND_TESTS) - @python ./scripts/regex_test.py a+b*[ac]*.+.*.[\\.]. $(NRAND_TESTS) - @python ./scripts/regex_test.py a?b[ac*]*.?[\\]+[?]? 
$(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py \\d+\\w?\\D\\d $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py \\w*\\d?\\w\\? $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [^\\d]+\\\\?\\s $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [^\\w][^-1-4] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [^\\w] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [^1-4] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [^\\d]+\\s?[\\w]* $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py a+b*[ac]*.+.*.[\\.]. $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py a?b[ac*]*.?[\\]+[?]? $(NRAND_TESTS) @#python ./scripts/regex_test.py [1-5-]+[-1-2]-[-] $(NRAND_TESTS) - @python ./scripts/regex_test.py [-1-3]-[-]+ $(NRAND_TESTS) - @python ./scripts/regex_test.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) - @python ./scripts/regex_test.py [-1-2]* $(NRAND_TESTS) - @python ./scripts/regex_test.py \\s?[a-fKL098]+-? 
$(NRAND_TESTS) - @python ./scripts/regex_test.py [\\-]* $(NRAND_TESTS) - @python ./scripts/regex_test.py [\\\\]+ $(NRAND_TESTS) - @python ./scripts/regex_test.py [0-9a-fA-F]+ $(NRAND_TESTS) - @python ./scripts/regex_test.py [1379][2468][abcdef] $(NRAND_TESTS) - @python ./scripts/regex_test.py [012345-9]?[0123-789] $(NRAND_TESTS) - @python ./scripts/regex_test.py [012345-9] $(NRAND_TESTS) - @python ./scripts/regex_test.py [0-56789] $(NRAND_TESTS) - @python ./scripts/regex_test.py [abc-zABC-Z] $(NRAND_TESTS) - @python ./scripts/regex_test.py [a\d]?1234 $(NRAND_TESTS) - @python ./scripts/regex_test.py .*123faerdig $(NRAND_TESTS) - @python ./scripts/regex_test.py .?\\w+jsj$ $(NRAND_TESTS) - @python ./scripts/regex_test.py [?to][+to][?ta][*ta] $(NRAND_TESTS) - @python ./scripts/regex_test.py \\d+ $(NRAND_TESTS) - @python ./scripts/regex_test.py [a-z]+ $(NRAND_TESTS) - @python ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) - @python ./scripts/regex_test.py \\w $(NRAND_TESTS) - @python ./scripts/regex_test.py \\d $(NRAND_TESTS) - @python ./scripts/regex_test.py [\\d] $(NRAND_TESTS) - @python ./scripts/regex_test.py [^\\d] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [-1-3]-[-]+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [-1-2]* $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py \\s?[a-fKL098]+-? 
$(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [\\-]* $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [\\\\]+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [0-9a-fA-F]+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [1379][2468][abcdef] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [012345-9]?[0123-789] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [012345-9] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [0-56789] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [abc-zABC-Z] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [a\d]?1234 $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py .*123faerdig $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py .?\\w+jsj$ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [?to][+to][?ta][*ta] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py \\d+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [a-z]+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py \\w $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py \\d $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [\\d] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test.py [^\\d] $(NRAND_TESTS) @#python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) @echo @echo @echo @echo Testing rejection of patterns against $(NRAND_TESTS) random strings also rejected by the Python implementation: @echo - @python ./scripts/regex_test_neg.py \\d+ $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [a-z]+ $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py ^\\w $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py ^\\d $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [\\d] $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py ^[^\\d] $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [^\\w]+ $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py ^[\\w]+ $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py ^[^0-9] 
$(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [a-z].[A-Z] $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [-1-3]-[-]+ $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [-0-9]+ $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [\\-]+ $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [\\\\]+ $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [0-9a-fA-F]+ $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [1379][2468][abcdef] $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [012345-9] $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py [0-56789] $(NRAND_TESTS) - @python ./scripts/regex_test_neg.py .*123faerdig $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py \\d+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py [a-z]+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py ^\\w $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py ^\\d $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py [\\d] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py ^[^\\d] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py [^\\w]+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py ^[\\w]+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py ^[^0-9] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py [a-z].[A-Z] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py [-1-3]-[-]+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py [-0-9]+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py [\\-]+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py [\\\\]+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py [0-9a-fA-F]+ $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py [1379][2468][abcdef] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py [012345-9] $(NRAND_TESTS) + $(PYTHON) 
./scripts/regex_test_neg.py [0-56789] $(NRAND_TESTS) + $(PYTHON) ./scripts/regex_test_neg.py .*123faerdig $(NRAND_TESTS) @echo @echo @./tests/test2 From 99e57a3e82078857f607a3d2cf7af412f7159bfc Mon Sep 17 00:00:00 2001 From: roflcopter4 Date: Mon, 16 Apr 2018 14:42:13 -0600 Subject: [PATCH 16/32] Add back '@' signs I accidentally removed --- Makefile | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index ab51f33..fb3e0fe 100644 --- a/Makefile +++ b/Makefile @@ -4,14 +4,15 @@ CC := gcc # Number of random text expressions to generate, for random testing NRAND_TESTS := 1000 -PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \ - echo 'python'; \ - elif command -v python2 >/dev/null 2>&1; then \ - echo 'python2'; \ - else \ - echo 'Error: no compatible python version found.' >&2; \ - exit 1; \ - fi +PYTHON := python +# PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \ +# echo 'python'; \ +# elif command -v python2 >/dev/null 2>&1; then \ +# echo 'python2'; \ +# else \ +# echo 'Error: no compatible python version found.' >&2; \ +# exit 1; \ +# fi # Flags to pass to compiler CFLAGS := -O3 -Wall -Wextra -std=c99 -I. From 96aa59985928f2c3198d0a8fcdc54beb4122c17f Mon Sep 17 00:00:00 2001 From: roflcopter4 Date: Mon, 16 Apr 2018 14:45:34 -0600 Subject: [PATCH 17/32] Fix dumb typos --- Makefile | 127 +++++++++++++++++++++++++++---------------------------- 1 file changed, 63 insertions(+), 64 deletions(-) diff --git a/Makefile b/Makefile index fb3e0fe..deb9fea 100644 --- a/Makefile +++ b/Makefile @@ -4,15 +4,14 @@ CC := gcc # Number of random text expressions to generate, for random testing NRAND_TESTS := 1000 -PYTHON := python -# PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \ -# echo 'python'; \ -# elif command -v python2 >/dev/null 2>&1; then \ -# echo 'python2'; \ -# else \ -# echo 'Error: no compatible python version found.' 
>&2; \ -# exit 1; \ -# fi +PYTHON != if (python --version 2>&1 | grep -q 'Python 2\..*'); then \ + echo 'python'; \ + elif command -v python2 >/dev/null 2>&1; then \ + echo 'python2'; \ + else \ + echo 'Error: no compatible python version found.' >&2; \ + exit 1; \ + fi # Flags to pass to compiler CFLAGS := -O3 -Wall -Wextra -std=c99 -I. @@ -37,68 +36,68 @@ test: all @./tests/test1 @echo Testing patterns against $(NRAND_TESTS) random strings matching the Python implementation and comparing: @echo - $(PYTHON) ./scripts/regex_test.py \\d+\\w?\\D\\d $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py \\w*\\d?\\w\\? $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [^\\d]+\\\\?\\s $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [^\\w][^-1-4] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [^\\w] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [^1-4] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [^\\d]+\\s?[\\w]* $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py a+b*[ac]*.+.*.[\\.]. $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py a?b[ac*]*.?[\\]+[?]? $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py \\d+\\w?\\D\\d $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py \\w*\\d?\\w\\? $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [^\\d]+\\\\?\\s $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [^\\w][^-1-4] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [^\\w] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [^1-4] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [^\\d]+\\s?[\\w]* $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py a+b*[ac]*.+.*.[\\.]. $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py a?b[ac*]*.?[\\]+[?]? 
$(NRAND_TESTS) @#python ./scripts/regex_test.py [1-5-]+[-1-2]-[-] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [-1-3]-[-]+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [-1-2]* $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py \\s?[a-fKL098]+-? $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [\\-]* $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [\\\\]+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [0-9a-fA-F]+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [1379][2468][abcdef] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [012345-9]?[0123-789] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [012345-9] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [0-56789] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [abc-zABC-Z] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [a\d]?1234 $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py .*123faerdig $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py .?\\w+jsj$ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [?to][+to][?ta][*ta] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py \\d+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [a-z]+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py \\w $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py \\d $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [\\d] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test.py [^\\d] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [-1-3]-[-]+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [-1-2]* $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py \\s?[a-fKL098]+-? 
$(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [\\-]* $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [\\\\]+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [0-9a-fA-F]+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [1379][2468][abcdef] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [012345-9]?[0123-789] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [012345-9] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [0-56789] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [abc-zABC-Z] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [a\d]?1234 $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py .*123faerdig $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py .?\\w+jsj$ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [?to][+to][?ta][*ta] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py \\d+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [a-z]+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py \\w $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py \\d $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [\\d] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test.py [^\\d] $(NRAND_TESTS) @#python ./scripts/regex_test.py [^-1-4] $(NRAND_TESTS) @echo @echo @echo @echo Testing rejection of patterns against $(NRAND_TESTS) random strings also rejected by the Python implementation: @echo - $(PYTHON) ./scripts/regex_test_neg.py \\d+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [a-z]+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py ^\\w $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py ^\\d $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [\\d] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py ^[^\\d] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [^\\w]+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py ^[\\w]+ $(NRAND_TESTS) - $(PYTHON) 
./scripts/regex_test_neg.py ^[^0-9] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [a-z].[A-Z] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [-1-3]-[-]+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [-0-9]+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [\\-]+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [\\\\]+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [0-9a-fA-F]+ $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [1379][2468][abcdef] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [012345-9] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py [0-56789] $(NRAND_TESTS) - $(PYTHON) ./scripts/regex_test_neg.py .*123faerdig $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py \\d+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py [a-z]+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py \\s+[a-zA-Z0-9?]* $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py ^\\w $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py ^\\d $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py [\\d] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py ^[^\\d] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py [^\\w]+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py ^[\\w]+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py ^[^0-9] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py [a-z].[A-Z] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py [-1-3]-[-]+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py [1-5]+[-1-2]-[\\-] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py [-0-9]+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py [\\-]+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py [\\\\]+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py [0-9a-fA-F]+ $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py [1379][2468][abcdef] $(NRAND_TESTS) + @$(PYTHON) 
./scripts/regex_test_neg.py [012345-9] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py [0-56789] $(NRAND_TESTS) + @$(PYTHON) ./scripts/regex_test_neg.py .*123faerdig $(NRAND_TESTS) @echo @echo @./tests/test2 From 72e0e56fda0a4069107bc9b1b0f28a1b73575c34 Mon Sep 17 00:00:00 2001 From: TermoSINteZ Date: Tue, 15 May 2018 00:10:55 +0300 Subject: [PATCH 18/32] Fix pattern ".?" issues --- re.c | 20 ++++++++++++-------- tests/test1.c | 2 ++ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/re.c b/re.c index 83fd68f..458fedd 100644 --- a/re.c +++ b/re.c @@ -88,8 +88,12 @@ int re_matchp(re_t pattern, const char* text) do { idx += 1; + if (matchpattern(pattern, text)) { + if (text[0] == '\0') + return -1; + return idx; } } @@ -377,15 +381,19 @@ static int matchplus(regex_t p, regex_t* pattern, const char* text) static int matchquestion(regex_t p, regex_t* pattern, const char* text) { - if ((text[0] != '\0') && matchone(p, *text++)) + if ((text[0] != '\0') && matchone(p, text[0])) { - matchpattern(pattern, text); + int match = 0; + match = matchpattern(pattern, &text[0]); + if (!match) { + return matchpattern(pattern, &text[1]); + } + return match; } return 1; } - #if 0 /* Recursive matching */ @@ -426,7 +434,7 @@ static int matchpattern(regex_t* pattern, const char* text) { if ((pattern[0].type == UNUSED) || (pattern[1].type == QUESTIONMARK)) { - return matchquestion(pattern[1], &pattern[2], text); + return matchquestion(pattern[0], &pattern[2], text); } else if (pattern[1].type == STAR) { @@ -453,7 +461,3 @@ static int matchpattern(regex_t* pattern, const char* text) } #endif - - - - diff --git a/tests/test1.c b/tests/test1.c index 50d8c70..067dc75 100644 --- a/tests/test1.c +++ b/tests/test1.c @@ -83,6 +83,8 @@ char* test_vector[][3] = { OK, "[^\\w][^-1-4]", " x" }, { OK, "[^\\w][^-1-4]", "$b" }, */ + { OK, ".?bar", "real_bar" }, + { NOK, ".?bar", "real_foo" }, }; From d54114d5f8dfc83e9ad95d2d9d29bb45f7afdcd6 Mon Sep 17 00:00:00 2001 From: TermoSINteZ 
Date: Tue, 15 May 2018 10:41:17 +0300 Subject: [PATCH 19/32] Remove tabs --- re.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/re.c b/re.c index 458fedd..38fd05c 100644 --- a/re.c +++ b/re.c @@ -88,12 +88,12 @@ int re_matchp(re_t pattern, const char* text) do { idx += 1; - + if (matchpattern(pattern, text)) { - if (text[0] == '\0') - return -1; - + if (text[0] == '\0') + return -1; + return idx; } } @@ -388,7 +388,7 @@ static int matchquestion(regex_t p, regex_t* pattern, const char* text) if (!match) { return matchpattern(pattern, &text[1]); } - return match; + return match; } return 1; } From bfa621eb02678c04278b2c5a0c2767fd85f92e53 Mon Sep 17 00:00:00 2001 From: monolifed <6624464+monolifed@users.noreply.github.com> Date: Thu, 31 May 2018 02:06:49 +0300 Subject: [PATCH 20/32] Update re.c hopefully fixes #12 --- re.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/re.c b/re.c index 38fd05c..8aebbd5 100644 --- a/re.c +++ b/re.c @@ -381,6 +381,10 @@ static int matchplus(regex_t p, regex_t* pattern, const char* text) static int matchquestion(regex_t p, regex_t* pattern, const char* text) { + if ((text[0] == '\0') && p.type != UNUSED) + { + return matchpattern(pattern, &text[0]); + } if ((text[0] != '\0') && matchone(p, text[0])) { int match = 0; From f2674ed39a9b8c1d81f6c9d07e1989c51a9cfb85 Mon Sep 17 00:00:00 2001 From: kokke Date: Wed, 6 Jun 2018 18:15:48 +0200 Subject: [PATCH 21/32] Update test1.c --- tests/test1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test1.c b/tests/test1.c index 067dc75..667e0f9 100644 --- a/tests/test1.c +++ b/tests/test1.c @@ -75,6 +75,7 @@ char* test_vector[][3] = { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world! " }, { OK, "[Hh]ello [Ww]orld\\s*[!]?", "Hello world !" }, { OK, "[Hh]ello [Ww]orld\\s*[!]?", "hello World !" 
}, + { NOK, "\\d\\d?:\\d\\d?:\\d\\d?", "a:0" }, /* Failing test case reported in https://github.com/kokke/tiny-regex-c/issues/12 */ /* { OK, "[^\\w][^-1-4]", ")T" }, { OK, "[^\\w][^-1-4]", ")^" }, From 5abffeb4409faae735bbddfbfb1e6248500ba3bd Mon Sep 17 00:00:00 2001 From: kokke Date: Mon, 22 Oct 2018 15:41:33 +0200 Subject: [PATCH 22/32] Update re.c fixing typo, noticed by @tobermory -> https://github.com/kokke/tiny-regex-c/issues/19 --- re.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/re.c b/re.c index 8aebbd5..83d53b1 100644 --- a/re.c +++ b/re.c @@ -285,7 +285,7 @@ static int matchrange(char c, const char* str) } static int ismetachar(char c) { - return ((c == 's') || (c == 'S') == (c == 'w') || (c == 'W') || (c == 'd') || (c == 'D')); + return ((c == 's') || (c == 'S') || (c == 'w') || (c == 'W') || (c == 'd') || (c == 'D')); } static int matchmetachar(char c, const char* str) From 5f2af04f3f89b04e81af6dccb6b53132759b8cdc Mon Sep 17 00:00:00 2001 From: kokke Date: Mon, 22 Oct 2018 16:04:02 +0200 Subject: [PATCH 23/32] Update test1.c Adding failing test-case for question-mark '?', brought to my attention by @tobermory in https://github.com/kokke/tiny-regex-c/issues/20 --- tests/test1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test1.c b/tests/test1.c index 667e0f9..ab54cc2 100644 --- a/tests/test1.c +++ b/tests/test1.c @@ -86,6 +86,7 @@ char* test_vector[][3] = */ { OK, ".?bar", "real_bar" }, { NOK, ".?bar", "real_foo" }, + { NOK, "X?Y", "Z" }, }; From 62f6d1491bcd9f1e935c2476bd0dfaf416a6556d Mon Sep 17 00:00:00 2001 From: kokke Date: Tue, 23 Oct 2018 11:26:47 +0200 Subject: [PATCH 24/32] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ae02db2..e641f3c 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ The main design goal of this library is to be small, correct, self contained and ``` > gcc -Os -c re.c > size re.o - text data bss dec hex 
filename + text data bss dec hex filename 2319 0 544 2863 b2f re.o ``` @@ -45,8 +45,8 @@ The main design goal of this library is to be small, correct, self contained and ``` > avr-gcc -Os -c re.c > size re.o - text data bss dec hex filename - 2128 0 130 2258 8d2 re.o + text data bss dec hex filename + 2128 0 130 2258 8d2 re.o ``` From 16763e1067bccb511846cfbc53d6dd5daeeb1746 Mon Sep 17 00:00:00 2001 From: kokke Date: Tue, 23 Oct 2018 11:27:23 +0200 Subject: [PATCH 25/32] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e641f3c..ab89576 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ The main design goal of this library is to be small, correct, self contained and ``` > gcc -Os -c re.c > size re.o - text data bss dec hex filename + text data bss dec hex filename 2319 0 544 2863 b2f re.o ``` From 3103102c687e6f0de6602fc57e78d72f98a47c01 Mon Sep 17 00:00:00 2001 From: monolifed <6624464+monolifed@users.noreply.github.com> Date: Thu, 25 Oct 2018 19:19:24 +0300 Subject: [PATCH 26/32] Update re.c --- re.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/re.c b/re.c index 83d53b1..53b5a6c 100644 --- a/re.c +++ b/re.c @@ -188,7 +188,17 @@ re_t re_compile(const char* pattern) while ( (pattern[++i] != ']') && (pattern[i] != '\0')) /* Missing ] */ { - if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) { + if (pattern[i] == '\\') + { + if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1) + { + //fputs("exceeded internal buffer!\n", stderr); + return 0; + } + ccl_buf[ccl_bufidx++] = pattern[i++]; + } + else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) + { //fputs("exceeded internal buffer!\n", stderr); return 0; } @@ -381,20 +391,13 @@ static int matchplus(regex_t p, regex_t* pattern, const char* text) static int matchquestion(regex_t p, regex_t* pattern, const char* text) { - if ((text[0] == '\0') && p.type != UNUSED) - { - return matchpattern(pattern, &text[0]); - } - if 
((text[0] != '\0') && matchone(p, text[0])) - { - int match = 0; - match = matchpattern(pattern, &text[0]); - if (!match) { - return matchpattern(pattern, &text[1]); - } - return match; - } - return 1; + if (p.type == UNUSED) + return 1; + if (matchpattern(pattern, text)) + return 1; + if (*text && matchone(p, *text++)) + return matchpattern(pattern, text); + return 0; } From f05c037d1dfc24612e8190ceaabbdc9bfcd20552 Mon Sep 17 00:00:00 2001 From: James Maher Date: Thu, 6 Dec 2018 20:48:41 +0000 Subject: [PATCH 27/32] Storing multiple compiled patterns at once is now possible. --- re.c | 37 ++++++++++++++++--------------------- re.h | 19 +++++++++++++++---- tests/test1.c | 7 +++++-- 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/re.c b/re.c index 53b5a6c..efa9a07 100644 --- a/re.c +++ b/re.c @@ -34,22 +34,10 @@ /* Definitions: */ -#define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */ #define MAX_CHAR_CLASS_LEN 40 /* Max length of character-class buffer in. */ -enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH */ }; - -typedef struct regex_t -{ - unsigned char type; /* CHAR, STAR, etc. 
*/ - union - { - unsigned char ch; /* the character itself */ - unsigned char* ccl; /* OR a pointer to characters in class */ - }; -} regex_t; - +enum { UNUSED, DOT, BEGIN, END, QUESTIONMARK, STAR, PLUS, CHAR, CHAR_CLASS, INV_CHAR_CLASS, DIGIT, NOT_DIGIT, ALPHA, NOT_ALPHA, WHITESPACE, NOT_WHITESPACE, /* BRANCH, */ FAIL}; /* Private function declarations: */ @@ -70,12 +58,17 @@ static int ismetachar(char c); /* Public functions: */ int re_match(const char* pattern, const char* text) { - return re_matchp(re_compile(pattern), text); + re_t regex; + + if (re_compile(regex, pattern) > -1) + return re_matchp(regex, text); + else + return -1; } int re_matchp(re_t pattern, const char* text) { - if (pattern != 0) + if (pattern[0].type != FAIL) { if (pattern[0].type == BEGIN) { @@ -103,12 +96,11 @@ int re_matchp(re_t pattern, const char* text) return -1; } -re_t re_compile(const char* pattern) +int re_compile(re_t re_compiled, const char* pattern) { /* The sizes of the two static arrays below substantiates the static RAM usage of this module. MAX_REGEXP_OBJECTS is the max number of symbols in the expression. MAX_CHAR_CLASS_LEN determines the size of buffer for chars in all char-classes in the expression. 
*/ - static regex_t re_compiled[MAX_REGEXP_OBJECTS]; static unsigned char ccl_buf[MAX_CHAR_CLASS_LEN]; int ccl_bufidx = 1; @@ -193,14 +185,16 @@ re_t re_compile(const char* pattern) if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1) { //fputs("exceeded internal buffer!\n", stderr); - return 0; + re_compiled[0].type = FAIL; + return -1; } ccl_buf[ccl_bufidx++] = pattern[i++]; } else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) { //fputs("exceeded internal buffer!\n", stderr); - return 0; + re_compiled[0].type = FAIL; + return -1; } ccl_buf[ccl_bufidx++] = pattern[i]; } @@ -208,7 +202,8 @@ re_t re_compile(const char* pattern) { /* Catches cases such as [00000000000000000000000000000000000000][ */ //fputs("exceeded internal buffer!\n", stderr); - return 0; + re_compiled[0].type = FAIL; + return -1; } /* Null-terminate string end */ ccl_buf[ccl_bufidx++] = 0; @@ -228,7 +223,7 @@ re_t re_compile(const char* pattern) /* 'UNUSED' is a sentinel used to indicate end-of-pattern */ re_compiled[j].type = UNUSED; - return (re_t) re_compiled; + return j; } void re_print(regex_t* pattern) diff --git a/re.h b/re.h index fd36412..d518a33 100644 --- a/re.h +++ b/re.h @@ -31,14 +31,25 @@ extern "C"{ #endif +#define MAX_REGEXP_OBJECTS 30 /* Max number of regex symbols in expression. */ +typedef struct regex_t +{ + unsigned char type; /* CHAR, STAR, etc. */ + union + { + unsigned char ch; /* the character itself */ + unsigned char* ccl; /* OR a pointer to characters in class */ + }; +} regex_t; -/* Typedef'd pointer to get abstract datatype. */ -typedef struct regex_t* re_t; +/* Typedef'd array to get abstract datatype. */ +typedef regex_t re_t[MAX_REGEXP_OBJECTS]; -/* Compile regex string pattern to a regex_t-array. */ -re_t re_compile(const char* pattern); +/* Compile regex string pattern to a regex_t-array and copy to re_compiled. + * Return the number of regex objects copied or -1 upon error. 
*/ +int re_compile(re_t re_compiled, const char* pattern); /* Find matches of the compiled pattern inside text. */ diff --git a/tests/test1.c b/tests/test1.c index ab54cc2..fb2197b 100644 --- a/tests/test1.c +++ b/tests/test1.c @@ -100,6 +100,7 @@ int main() size_t ntests = sizeof(test_vector) / sizeof(*test_vector); size_t nfailed = 0; size_t i; + re_t regex; for (i = 0; i < ntests; ++i) { @@ -114,7 +115,8 @@ int main() if (m != (-1)) { printf("\n"); - re_print(re_compile(pattern)); + re_compile(regex, pattern); + re_print(regex); fprintf(stderr, "[%lu/%lu]: pattern '%s' matched '%s' unexpectedly. \n", (i+1), ntests, pattern, text); nfailed += 1; } @@ -124,7 +126,8 @@ int main() if (m == (-1)) { printf("\n"); - re_print(re_compile(pattern)); + re_compile(regex, pattern); + re_print(regex); fprintf(stderr, "[%lu/%lu]: pattern '%s' didn't match '%s' as expected. \n", (i+1), ntests, pattern, text); nfailed += 1; } From 738160fc5d91eae482f2d77766b583979b7d30f7 Mon Sep 17 00:00:00 2001 From: James Maher Date: Fri, 7 Dec 2018 15:02:21 +0000 Subject: [PATCH 28/32] Remove unnecessary test in re_match. --- re.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/re.c b/re.c index efa9a07..8d4f93a 100644 --- a/re.c +++ b/re.c @@ -60,10 +60,8 @@ int re_match(const char* pattern, const char* text) { re_t regex; - if (re_compile(regex, pattern) > -1) - return re_matchp(regex, text); - else - return -1; + re_compile(regex, pattern); + return re_matchp(regex, text); } int re_matchp(re_t pattern, const char* text) From b3601695ed6620e7c6f76753f3b6d55f6d09962a Mon Sep 17 00:00:00 2001 From: James Maher Date: Fri, 7 Dec 2018 15:10:11 +0000 Subject: [PATCH 29/32] Group together fail branches in re_compile. 
--- re.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/re.c b/re.c index 8d4f93a..3e1bb4a 100644 --- a/re.c +++ b/re.c @@ -183,16 +183,14 @@ int re_compile(re_t re_compiled, const char* pattern) if (ccl_bufidx >= MAX_CHAR_CLASS_LEN - 1) { //fputs("exceeded internal buffer!\n", stderr); - re_compiled[0].type = FAIL; - return -1; + goto fail; } ccl_buf[ccl_bufidx++] = pattern[i++]; } else if (ccl_bufidx >= MAX_CHAR_CLASS_LEN) { //fputs("exceeded internal buffer!\n", stderr); - re_compiled[0].type = FAIL; - return -1; + goto fail; } ccl_buf[ccl_bufidx++] = pattern[i]; } @@ -200,8 +198,7 @@ int re_compile(re_t re_compiled, const char* pattern) { /* Catches cases such as [00000000000000000000000000000000000000][ */ //fputs("exceeded internal buffer!\n", stderr); - re_compiled[0].type = FAIL; - return -1; + goto fail; } /* Null-terminate string end */ ccl_buf[ccl_bufidx++] = 0; @@ -222,6 +219,10 @@ int re_compile(re_t re_compiled, const char* pattern) re_compiled[j].type = UNUSED; return j; + +fail: + re_compiled[0].type = FAIL; + return -1; } void re_print(regex_t* pattern) From 9d7e190e5df13ddcd03fa47ace0daf16e6ca358d Mon Sep 17 00:00:00 2001 From: James Maher Date: Fri, 7 Dec 2018 15:12:05 +0000 Subject: [PATCH 30/32] Add comment describing FAIL sentinel value. --- re.c | 1 + 1 file changed, 1 insertion(+) diff --git a/re.c b/re.c index 3e1bb4a..f97dcfc 100644 --- a/re.c +++ b/re.c @@ -66,6 +66,7 @@ int re_match(const char* pattern, const char* text) int re_matchp(re_t pattern, const char* text) { + /* FAIL is a sentinel value indicating compilation of the pattern failed. */ if (pattern[0].type != FAIL) { if (pattern[0].type == BEGIN) From 3abaa004aae7f04a65834089ba177e48a69c18fc Mon Sep 17 00:00:00 2001 From: James Maher Date: Fri, 7 Dec 2018 15:42:03 +0000 Subject: [PATCH 31/32] Remove BRANCH and add FAIL to re_print char types. 
--- re.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/re.c b/re.c index f97dcfc..3d1974d 100644 --- a/re.c +++ b/re.c @@ -228,7 +228,7 @@ int re_compile(re_t re_compiled, const char* pattern) void re_print(regex_t* pattern) { - const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", "BRANCH" }; + const char* types[] = { "UNUSED", "DOT", "BEGIN", "END", "QUESTIONMARK", "STAR", "PLUS", "CHAR", "CHAR_CLASS", "INV_CHAR_CLASS", "DIGIT", "NOT_DIGIT", "ALPHA", "NOT_ALPHA", "WHITESPACE", "NOT_WHITESPACE", /* "BRANCH", */ "FAIL"}; int i; for (i = 0; i < MAX_REGEXP_OBJECTS; ++i) From cdc3cce812bb639cc0367e1f9ca869a6af2db0d6 Mon Sep 17 00:00:00 2001 From: James Maher Date: Fri, 7 Dec 2018 16:34:28 +0000 Subject: [PATCH 32/32] Update comment describing re_compile function. --- re.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/re.h b/re.h index d518a33..f3dc929 100644 --- a/re.h +++ b/re.h @@ -48,7 +48,7 @@ typedef regex_t re_t[MAX_REGEXP_OBJECTS]; /* Compile regex string pattern to a regex_t-array and copy to re_compiled. - * Return the number of regex objects copied or -1 upon error. */ + * Return the number of regex objects copied upon success or -1 upon error. */ int re_compile(re_t re_compiled, const char* pattern);