Skip to content

Commit 7550e76

Browse files
committed
Add RegEx support using RE2
Introduces 5 new built-in methods to the stdlib: - `regexFullMatch(pattern, str)` -- Full match regex - `regexPartialMatch(pattern, str)` -- Partial match regex - `regexQuoteMeta(str)` -- Escape regex metacharacters - `regexReplace(str, pattern, to)` -- Replace single occurrence using regex - `regexGlobalReplace(str, pattern, to)` -- Replace globally using regex Since both `regexFullMatch` and `regexPartialMatch` can perform captures, these functions return a "match" object upon match or `null` otherwise. For example: ``` $ ./jsonnet -e 'std.regexFullMatch("h(?P<mid>.*)o", "hello")' { "captures": [ "ell" ], "namedCaptures": { "mid": "ell" }, "string": "hello" } ``` Introduces a dependency on RE2 2019-06-01. Builds tested using make, CMake and Bazel on Ubuntu 18.04.
1 parent 0134fd6 commit 7550e76

10 files changed

+288
-11
lines changed

CMakeLists.txt

+44-2
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,50 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${GLOBAL_OUTPUT_PATH})
2929
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${GLOBAL_OUTPUT_PATH})
3030
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${GLOBAL_OUTPUT_PATH})
3131

32+
# Include external RE2 project. This runs a CMake sub-script
33+
# (RE2CMakeLists.txt.in) that downloads the RE2 source. It's then built as part
34+
# of the jsonnet project. The conventional way of handling CMake dependencies is
35+
# to use a find_package script, which finds and installs the library from
36+
# known locations on the local machine. Downloading the library ourselves
37+
# allows us to pin to a specific version and makes things easier for users
38+
# who don't have package managers.
39+
40+
# Generate and download RE2 project.
41+
set(RE2_DIR ${GLOBAL_OUTPUT_PATH}/re2-download)
42+
configure_file(RE2CMakeLists.txt.in ${RE2_DIR}/CMakeLists.txt)
43+
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
44+
RESULT_VARIABLE result
45+
WORKING_DIRECTORY ${RE2_DIR}
46+
)
47+
if(result)
48+
message(FATAL_ERROR "RE2 download failed: ${result}")
49+
endif()
50+
51+
# Build the generated download project. RE2CMakeLists.txt.in leaves its
# configure/build/install/test commands empty, so this step is what actually
# fetches the RE2 sources; RE2 itself is compiled via add_subdirectory below.
52+
execute_process(COMMAND ${CMAKE_COMMAND} --build .
53+
RESULT_VARIABLE result
54+
WORKING_DIRECTORY ${RE2_DIR})
55+
if(result)
56+
message(FATAL_ERROR "Build step for re2 failed: ${result}")
57+
endif()
58+
59+
# Add RE2 directly to our build. This defines
60+
# the re2 target.
61+
add_subdirectory(${GLOBAL_OUTPUT_PATH}/re2-src
62+
${GLOBAL_OUTPUT_PATH}/re2-build)
63+
64+
# Include RE2 headers.
# NOTE(review): the sources include "re2/re2.h"; RE2 appears to keep that header
# at <source root>/re2/re2.h, so the include root may need to be
# ${RE2_SOURCE_DIR} rather than ${RE2_SOURCE_DIR}/include -- confirm.
65+
include_directories("${RE2_SOURCE_DIR}/include")
66+
67+
# Allow linking into a shared library.
68+
set_property(TARGET re2 PROPERTY POSITION_INDEPENDENT_CODE ON)
69+
70+
# RE2 requires pthreads
# NOTE(review): ${UNIX} is expanded at configure time; if it is unset the
# expression becomes $<:-pthread> -- confirm this is accepted on non-UNIX
# platforms.
71+
set_property(TARGET re2 PROPERTY INTERFACE_COMPILE_OPTIONS $<${UNIX}:-pthread>)
72+
set_property(TARGET re2 PROPERTY INTERFACE_LINK_LIBRARIES $<${UNIX}:-pthread>)
73+
3274
# Include external googletest project. This runs a CMake sub-script
33-
# (CMakeLists.txt.in) that downloads googletest source. It's then built as part
75+
# (GoogleTestCMakeLists.txt.in) that downloads googletest source. It's then built as part
3476
# of the jsonnet project. The conventional way of handling CMake dependencies is
3577
# to use a find_package script, which finds and installs the library from
3678
# known locations on the local machine. Downloading the library ourselves
@@ -41,7 +83,7 @@ if (BUILD_TESTS AND NOT USE_SYSTEM_GTEST)
4183

4284
# Generate and download googletest project.
4385
set(GOOGLETEST_DIR ${GLOBAL_OUTPUT_PATH}/googletest-download)
44-
configure_file(CMakeLists.txt.in ${GOOGLETEST_DIR}/CMakeLists.txt)
86+
configure_file(GoogleTestCMakeLists.txt.in ${GOOGLETEST_DIR}/CMakeLists.txt)
4587
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
4688
RESULT_VARIABLE result
4789
WORKING_DIRECTORY ${GOOGLETEST_DIR}
File renamed without changes.

Makefile

+3-3
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ CFLAGS ?= -g $(OPT) -Wall -Wextra -pedantic -std=c99 -fPIC -Iinclude
3434
MAKEDEPENDFLAGS ?= -Iinclude -Ithird_party/md5 -Ithird_party/json
3535
EMCXXFLAGS = $(CXXFLAGS) -g0 -Os --memory-init-file 0 -s DISABLE_EXCEPTION_CATCHING=0 -s OUTLINING_LIMIT=10000 -s RESERVED_FUNCTION_POINTERS=20 -s ASSERTIONS=1 -s ALLOW_MEMORY_GROWTH=1
3636
EMCFLAGS = $(CFLAGS) --memory-init-file 0 -s DISABLE_EXCEPTION_CATCHING=0 -s ASSERTIONS=1 -s ALLOW_MEMORY_GROWTH=1
37-
LDFLAGS ?=
37+
LDFLAGS ?= -lre2
3838

3939
SHARED_LDFLAGS ?= -shared
4040

@@ -121,11 +121,11 @@ core/desugarer.cpp: core/std.jsonnet.h
121121

122122
# Commandline executable.
123123
jsonnet: cmd/jsonnet.cpp cmd/utils.cpp $(LIB_OBJ)
124-
$(CXX) $(CXXFLAGS) $(LDFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@
124+
$(CXX) $(CXXFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ $(LDFLAGS)
125125

126126
# Commandline executable (reformatter).
127127
jsonnetfmt: cmd/jsonnetfmt.cpp cmd/utils.cpp $(LIB_OBJ)
128-
$(CXX) $(CXXFLAGS) $(LDFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@
128+
$(CXX) $(CXXFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ $(LDFLAGS)
129129

130130
# C binding.
131131
libjsonnet.so: $(LIB_OBJ)

RE2CMakeLists.txt.in

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# CMake script run at generation time. This must be separate from the main
2+
# CMakeLists.txt file to allow downloading the RE2 sources at generation
3+
# time.
4+
cmake_minimum_required(VERSION 2.8.2)
5+
6+
project(re2-download NONE)
7+
8+
include(ExternalProject)
# Fetch RE2 at the pinned tag into re2-src. The configure/build/install/test
# commands are deliberately empty: this project only downloads the sources.
9+
ExternalProject_Add(re2
10+
GIT_REPOSITORY https://github.com/google/re2.git
11+
GIT_TAG 2019-06-01
12+
SOURCE_DIR "${GLOBAL_OUTPUT_PATH}/re2-src"
13+
BINARY_DIR "${GLOBAL_OUTPUT_PATH}/re2-build"
14+
CONFIGURE_COMMAND ""
15+
BUILD_COMMAND ""
16+
INSTALL_COMMAND ""
17+
TEST_COMMAND ""
18+
)

WORKSPACE

+9-1
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,19 @@ git_repository(
1212
git_repository(
1313
name = "com_google_googletest",
1414
remote = "https://github.com/google/googletest.git",
15-
# If updating googletest version, also update CMakeLists.txt.in.
15+
# If updating googletest version, also update GoogleTestCMakeLists.txt.in.
1616
commit = "2fe3bd994b3189899d93f1d5a881e725e046fdc2", # release: release-1.8.1
1717
shallow_since = "1535728917 -0400",
1818
)
1919

20+
# RE2 regular-expression library, pinned to a fixed commit.
git_repository(
21+
name = "com_googlesource_code_re2",
22+
remote = "https://github.com/google/re2.git",
23+
# If updating RE2 version, also update RE2CMakeLists.txt.in.
24+
commit = "0c95bcce2f1f0f071a786ca2c42384b211b8caba", # release: 2019-06-01
25+
shallow_since = "1558525654 +0000",
26+
)
27+
2028
load("//tools/build_defs:python_repo.bzl", "python_interpreter")
2129

2230
python_interpreter(name = "default_python")

core/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ cc_library(
3636
"//stdlib:std",
3737
"//third_party/json",
3838
"//third_party/md5:libmd5",
39+
"@com_googlesource_code_re2//:re2",
3940
],
4041
)
4142

core/CMakeLists.txt

+4-4
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ set(LIBJSONNET_SOURCE
2929
vm.cpp)
3030

3131
add_library(libjsonnet SHARED ${LIBJSONNET_HEADERS} ${LIBJSONNET_SOURCE})
32-
add_dependencies(libjsonnet md5 stdlib)
33-
target_link_libraries(libjsonnet md5)
32+
add_dependencies(libjsonnet md5 re2 stdlib)
33+
target_link_libraries(libjsonnet md5 re2)
3434

3535
# CMake prepends CMAKE_SHARED_LIBRARY_PREFIX to shared libraries, so without
3636
# this step the output would be |liblibjsonnet|.
@@ -45,8 +45,8 @@ install(TARGETS libjsonnet
4545

4646
# Static library for jsonnet command-line tool.
4747
add_library(libjsonnet_static STATIC ${LIBJSONNET_SOURCE})
48-
add_dependencies(libjsonnet_static md5 stdlib)
49-
target_link_libraries(libjsonnet_static md5)
48+
add_dependencies(libjsonnet_static md5 re2 stdlib)
49+
target_link_libraries(libjsonnet_static md5 re2)
5050
set_target_properties(libjsonnet_static PROPERTIES OUTPUT_NAME jsonnet)
5151
install(TARGETS libjsonnet_static DESTINATION "${CMAKE_INSTALL_LIBDIR}")
5252

core/desugarer.cpp

+6-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ struct BuiltinDecl {
3434
std::vector<UString> params;
3535
};
3636

37-
static unsigned long max_builtin = 37;
37+
static unsigned long max_builtin = 42;
3838
BuiltinDecl jsonnet_builtin_decl(unsigned long builtin)
3939
{
4040
switch (builtin) {
@@ -76,6 +76,11 @@ BuiltinDecl jsonnet_builtin_decl(unsigned long builtin)
7676
case 35: return {U"parseJson", {U"str"}};
7777
case 36: return {U"encodeUTF8", {U"str"}};
7878
case 37: return {U"decodeUTF8", {U"arr"}};
79+
case 38: return {U"regexFullMatch", {U"pattern", U"str"}};
80+
case 39: return {U"regexPartialMatch", {U"pattern", U"str"}};
81+
case 40: return {U"regexQuoteMeta", {U"str"}};
82+
case 41: return {U"regexReplace", {U"str", U"pattern", U"to"}};
83+
case 42: return {U"regexGlobalReplace", {U"str", U"pattern", U"to"}};
7984
default:
8085
std::cerr << "INTERNAL ERROR: Unrecognized builtin function: " << builtin << std::endl;
8186
std::abort();

core/vm.cpp

+133
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ limitations under the License.
2626
#include "json.hpp"
2727
#include "md5.h"
2828
#include "parser.h"
29+
#include "re2/re2.h"
2930
#include "state.h"
3031
#include "static_analysis.h"
3132
#include "string_utils.h"
@@ -35,6 +36,10 @@ using json = nlohmann::json;
3536

3637
namespace {
3738

39+
static const Fodder EF; // Empty fodder.
40+
41+
static const LocationRange E; // Empty.
42+
3843
/** Turn a path e.g. "/a/b/c" into a dir, e.g. "/a/b/". If there is no path returns "".
3944
*/
4045
std::string dir_name(const std::string &path)
@@ -881,6 +886,11 @@ class Interpreter {
881886
builtins["parseJson"] = &Interpreter::builtinParseJson;
882887
builtins["encodeUTF8"] = &Interpreter::builtinEncodeUTF8;
883888
builtins["decodeUTF8"] = &Interpreter::builtinDecodeUTF8;
889+
builtins["regexFullMatch"] = &Interpreter::builtinRegexFullMatch;
890+
builtins["regexPartialMatch"] = &Interpreter::builtinRegexPartialMatch;
891+
builtins["regexQuoteMeta"] = &Interpreter::builtinRegexQuoteMeta;
892+
builtins["regexReplace"] = &Interpreter::builtinRegexReplace;
893+
builtins["regexGlobalReplace"] = &Interpreter::builtinRegexGlobalReplace;
884894
}
885895

886896
/** Clean up the heap, stack, stash, and builtin function ASTs. */
@@ -1373,6 +1383,129 @@ class Interpreter {
13731383
return decodeUTF8();
13741384
}
13751385

1386+
/** Shared implementation behind builtinRegexFullMatch and builtinRegexPartialMatch.
 *
 * Compiles pattern with RE2 and matches it against string, using
 * RE2::FullMatchN when full is true and RE2::PartialMatchN otherwise.  Both
 * arguments are UTF-8 encoded std::strings (the callers pass encode_utf8 output).
 *
 * On a match, scratch is set to an object with three visible fields:
 *   "string"        - the input string,
 *   "captures"      - an array with one string per capturing group,
 *   "namedCaptures" - an object mapping each named group to its captured string.
 * Without a match, scratch is set to null.
 *
 * Throws the error built by makeError (at stack.top().location) if the pattern
 * does not compile.  Always returns nullptr; the result is delivered via scratch.
 */
const AST *regexMatch(const std::string &pattern, const std::string &string, bool full)
1387+
{
1388+
// Compile failures are detected through re.ok() just below.
RE2 re(pattern, RE2::CannedOptions::Quiet);
1389+
if(!re.ok()) {
1390+
std::stringstream ss;
1391+
ss << "Invalid regex '" << re.pattern() << "': " << re.error();
1392+
throw makeError(stack.top().location, ss.str());
1393+
}
1394+
1395+
int num_groups = re.NumberOfCapturingGroups();
1396+
1397+
// One output string per capturing group.  The N-ary match functions take an
// array of RE2::Arg pointers, so each string is wrapped in an Arg (rargv) and
// the Arg addresses are collected in rargs.
std::vector<std::string> rcaptures(num_groups);
1398+
std::vector<RE2::Arg> rargv(num_groups);
1399+
std::vector<const RE2::Arg*> rargs(num_groups);
1400+
for(int i=0; i<num_groups; ++i) {
1401+
rargs[i] = &rargv[i];
1402+
rargv[i] = &rcaptures[i];
1403+
}
1404+
1405+
if(full ? RE2::FullMatchN(string, re, rargs.data(), num_groups)
1406+
: RE2::PartialMatchN(string, re, rargs.data(), num_groups)) {
1407+
std::map<const Identifier *, HeapSimpleObject::Field> fields;
1408+
1409+
// "string": the matched input, converted back with decode_utf8.
const Identifier *fid = alloc->makeIdentifier(U"string");
1410+
fields[fid].hide = ObjectField::VISIBLE;
1411+
fields[fid].body = alloc->make<LiteralString>(E, EF, decode_utf8(string), LiteralString::DOUBLE, "", "");
1412+
1413+
// "captures": positional captures in group order.
// NOTE(review): presumably a group that did not participate in the match is
// reported as an empty string here -- confirm against RE2 behaviour.
fid = alloc->makeIdentifier(U"captures");
1414+
fields[fid].hide = ObjectField::VISIBLE;
1415+
std::vector<Array::Element> captures;
1416+
for(int i=0; i<num_groups; ++i) {
1417+
captures.push_back(Array::Element(
1418+
alloc->make<LiteralString>(E, EF, decode_utf8(rcaptures[i]), LiteralString::DOUBLE, "", ""),
1419+
EF));
1420+
}
1421+
fields[fid].body = alloc->make<Array>(E, EF, captures, false, EF);
1422+
1423+
// "namedCaptures": group name -> captured text.
fid = alloc->makeIdentifier(U"namedCaptures");
1424+
fields[fid].hide = ObjectField::VISIBLE;
1425+
DesugaredObject::Fields named_captures;
1426+
const std::map<std::string, int> &named_groups = re.NamedCapturingGroups();
1427+
for(auto it=named_groups.cbegin(); it!=named_groups.cend(); ++it) {
1428+
// The group index is shifted by one to address the 0-based rcaptures vector.
named_captures.push_back(DesugaredObject::Field(
1429+
ObjectField::VISIBLE,
1430+
alloc->make<LiteralString>(E, EF, decode_utf8(it->first), LiteralString::DOUBLE, "", ""),
1431+
alloc->make<LiteralString>(E, EF, decode_utf8(rcaptures[it->second-1]), LiteralString::DOUBLE, "", "")));
1432+
}
1433+
fields[fid].body = alloc->make<DesugaredObject>(E, ASTs{}, named_captures);
1434+
1435+
scratch = makeObject<HeapSimpleObject>(BindingFrame{}, fields, ASTs{});
1436+
} else {
1437+
// No match: the builtin evaluates to null.
scratch = makeNull();
1438+
}
1439+
return nullptr;
1440+
}
1441+
1442+
/** Builtin "regexFullMatch": args[0] is the pattern, args[1] the string.
 *
 * Validates that both arguments are strings, UTF-8 encodes them and delegates
 * to regexMatch with full = true.  The result is whatever regexMatch leaves
 * in scratch.
 */
const AST *builtinRegexFullMatch(const LocationRange &loc, const std::vector<Value> &args)
1443+
{
1444+
validateBuiltinArgs(loc, "regexFullMatch", args, {Value::STRING, Value::STRING});
1445+
1446+
std::string pattern = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
1447+
std::string string = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
1448+
1449+
return regexMatch(pattern, string, true);
1450+
}
1451+
1452+
/** Builtin "regexPartialMatch": args[0] is the pattern, args[1] the string.
 *
 * Validates that both arguments are strings, UTF-8 encodes them and delegates
 * to regexMatch with full = false.  The result is whatever regexMatch leaves
 * in scratch.
 */
const AST *builtinRegexPartialMatch(const LocationRange &loc, const std::vector<Value> &args)
1453+
{
1454+
validateBuiltinArgs(loc, "regexPartialMatch", args, {Value::STRING, Value::STRING});
1455+
1456+
std::string pattern = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
1457+
std::string string = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
1458+
1459+
return regexMatch(pattern, string, false);
1460+
}
1461+
1462+
/** Builtin "regexQuoteMeta": args[0] is the string to escape.
 *
 * Validates that the argument is a string, passes its UTF-8 encoding through
 * RE2::QuoteMeta and stores the decoded result in scratch.  Always returns
 * nullptr.
 */
const AST *builtinRegexQuoteMeta(const LocationRange &loc, const std::vector<Value> &args)
1463+
{
1464+
validateBuiltinArgs(loc, "regexQuoteMeta", args, {Value::STRING});
1465+
scratch = makeString(decode_utf8(RE2::QuoteMeta(encode_utf8(static_cast<HeapString *>(args[0].v.h)->value))));
1466+
return nullptr;
1467+
}
1468+
1469+
/** Builtin "regexReplace": args are (str, pattern, to).
 *
 * Validates that all three arguments are strings, compiles the pattern with
 * RE2 and applies RE2::Replace to a UTF-8 copy of str.  The resulting string is
 * decoded and stored in scratch.
 *
 * Throws the error built by makeError (at stack.top().location) if the pattern
 * does not compile.  Always returns nullptr.
 */
const AST *builtinRegexReplace(const LocationRange &loc, const std::vector<Value> &args)
1470+
{
1471+
validateBuiltinArgs(loc, "regexReplace", args, {Value::STRING, Value::STRING, Value::STRING});
1472+
1473+
std::string string = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
1474+
std::string pattern = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
1475+
std::string replace = encode_utf8(static_cast<HeapString *>(args[2].v.h)->value);
1476+
1477+
RE2 re(pattern, RE2::CannedOptions::Quiet);
1478+
if(!re.ok()) {
1479+
std::stringstream ss;
1480+
ss << "Invalid regex '" << re.pattern() << "': " << re.error();
1481+
throw makeError(stack.top().location, ss.str());
1482+
}
1483+
1484+
// Edits `string` in place; the return value of Replace is not inspected.
// NOTE(review): `replace` is not validated (e.g. with RE2::CheckRewriteString),
// so a bad backreference presumably leaves `string` unchanged without an
// error -- confirm whether that is intended.
RE2::Replace(&string, re, replace);
1485+
scratch = makeString(decode_utf8(string));
1486+
return nullptr;
1487+
}
1488+
1489+
/** Builtin "regexGlobalReplace": args are (str, pattern, to).
 *
 * Validates that all three arguments are strings, compiles the pattern with
 * RE2 and applies RE2::GlobalReplace to a UTF-8 copy of str.  The resulting
 * string is decoded and stored in scratch.
 *
 * Throws the error built by makeError (at stack.top().location) if the pattern
 * does not compile.  Always returns nullptr.
 */
const AST *builtinRegexGlobalReplace(const LocationRange &loc, const std::vector<Value> &args)
1490+
{
1491+
validateBuiltinArgs(loc, "regexGlobalReplace", args, {Value::STRING, Value::STRING, Value::STRING});
1492+
1493+
std::string string = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
1494+
std::string pattern = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
1495+
std::string replace = encode_utf8(static_cast<HeapString *>(args[2].v.h)->value);
1496+
1497+
RE2 re(pattern, RE2::CannedOptions::Quiet);
1498+
if(!re.ok()) {
1499+
std::stringstream ss;
1500+
ss << "Invalid regex '" << re.pattern() << "': " << re.error();
1501+
throw makeError(stack.top().location, ss.str());
1502+
}
1503+
1504+
// Edits `string` in place; the return value of GlobalReplace is not inspected.
// NOTE(review): `replace` is not validated (e.g. with RE2::CheckRewriteString),
// so a bad backreference presumably leaves `string` unchanged without an
// error -- confirm whether that is intended.
RE2::GlobalReplace(&string, re, replace);
1505+
scratch = makeString(decode_utf8(string));
1506+
return nullptr;
1507+
}
1508+
13761509
const AST *builtinTrace(const LocationRange &loc, const std::vector<Value> &args)
13771510
{
13781511
if(args[0].t != Value::STRING) {

0 commit comments

Comments
 (0)