Skip to content

Commit 7b1574e

Browse files
authored
Add patterns and grok command (#813)
Signed-off-by: Joshua Li <[email protected]>
1 parent 5aef2e6 commit 7b1574e

File tree

88 files changed

+436574
-299
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

88 files changed

+436574
-299
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,4 @@ gen
4242
.DS_Store
4343

4444
/artifacts/
45+
/.pid.lock

NOTICE

+3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ Foundation (http://www.apache.org/).
1111
This product includes software developed by
1212
Joda.org (http://www.joda.org/).
1313

14+
This product includes software developed by
15+
Kraken (https://github.com/thekrakken/java-grok).
16+
1417
This project is based on the Apache 2.0-licensed elasticsearch-sql project (https://github.com/NLPchina/elasticsearch-sql):
1518

1619
Copyright 2014 omershelef

common/build.gradle

+4
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ dependencies {
3535
api "org.antlr:antlr4-runtime:4.7.1"
3636
api group: 'com.google.guava', name: 'guava', version: '31.0.1-jre'
3737
api group: 'org.apache.logging.log4j', name: 'log4j-core', version:'2.17.1'
38+
api group: 'org.apache.commons', name: 'commons-lang3', version: '3.10'
3839

3940
testImplementation group: 'junit', name: 'junit', version: '4.13.2'
41+
testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.9.1'
42+
testImplementation group: 'com.google.guava', name: 'guava', version: '31.0.1-jre'
43+
testImplementation group: 'org.hamcrest', name: 'hamcrest-library', version: '2.1'
4044
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.common.grok;
7+
8+
import java.time.Instant;
9+
import java.time.LocalDate;
10+
import java.time.LocalDateTime;
11+
import java.time.OffsetDateTime;
12+
import java.time.ZoneId;
13+
import java.time.ZoneOffset;
14+
import java.time.ZonedDateTime;
15+
import java.time.format.DateTimeFormatter;
16+
import java.time.temporal.TemporalAccessor;
17+
import java.util.AbstractMap;
18+
import java.util.Arrays;
19+
import java.util.Collection;
20+
import java.util.List;
21+
import java.util.Map;
22+
import java.util.function.Function;
23+
import java.util.regex.Pattern;
24+
import java.util.stream.Collectors;
25+
26+
/**
 * Converts the raw string captured by a grok group into a typed value.
 *
 * <p>A group name may carry a type and an optional extra argument, separated by {@code ':'} or
 * {@code ';'}: {@code name:type} or {@code name:type:argument}. The name is split into at most
 * three parts, so the argument itself may contain delimiters (for example a date format such as
 * {@code HH:mm:ss}).
 */
public class Converter {

  /**
   * Supported target types. Each constant is looked up by its lower-cased name or by one of its
   * aliases.
   */
  public enum Type {
    BYTE(Byte::valueOf),
    BOOLEAN(Boolean::valueOf),
    SHORT(Short::valueOf),
    INT(Integer::valueOf, "integer"),
    LONG(Long::valueOf),
    FLOAT(Float::valueOf),
    DOUBLE(Double::valueOf),
    DATETIME(new DateConverter(), "date"),
    STRING(v -> v, "text");

    public final IConverter<? extends Object> converter;
    public final List<String> aliases;

    Type(IConverter<? extends Object> converter, String... aliases) {
      this.converter = converter;
      this.aliases = Arrays.asList(aliases);
    }
  }

  private static final Pattern SPLITTER = Pattern.compile("[:;]");

  // Keys are lower-cased with Locale.ROOT so the table does not depend on the JVM default locale
  // (a Turkish default locale would otherwise turn "INT" into "ınt").
  private static final Map<String, Type> TYPES =
      Arrays.stream(Type.values())
          .collect(Collectors.toMap(t -> t.name().toLowerCase(java.util.Locale.ROOT), t -> t));

  private static final Map<String, Type> TYPE_ALIASES =
      Arrays.stream(Type.values())
          .flatMap(type -> type.aliases.stream()
              .map(alias -> new AbstractMap.SimpleEntry<>(alias, type)))
          .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

  /**
   * Resolves a type name or alias, ignoring case.
   *
   * @param key type name or alias
   * @return the matching type
   * @throws IllegalArgumentException if no type has that name or alias
   */
  private static Type getType(String key) {
    // Must use the same locale as the TYPES keys, otherwise lookups can miss.
    key = key.toLowerCase(java.util.Locale.ROOT);
    Type type = TYPES.getOrDefault(key, TYPE_ALIASES.get(key));
    if (type == null) {
      throw new IllegalArgumentException("Invalid data type :" + key);
    }
    return type;
  }

  /**
   * Builds the converter for every group name that carries a type.
   *
   * <p>Group names without a delimiter are skipped. A group name that occurs more than once
   * yields a single entry instead of failing with a duplicate-key error.
   *
   * @param groupNames group names, optionally in {@code name:type[:argument]} form
   * @param params extra arguments handed to {@link IConverter#newConverter} when a group name has
   *     a third part
   * @return converters keyed by the full, unsplit group name
   * @throws IllegalArgumentException if a type is unknown or a converter rejects its parameters
   */
  public static Map<String, IConverter<? extends Object>>
      getConverters(Collection<String> groupNames, Object... params) {
    return groupNames.stream()
        .filter(Converter::containsDelimiter)
        .collect(Collectors.toMap(
            Function.identity(),
            key -> buildConverter(key, params),
            // Identical group names always produce equivalent converters; keep the first.
            (first, second) -> first));
  }

  /**
   * Creates the converter for a single {@code name:type[:argument]} group name.
   */
  private static IConverter<? extends Object> buildConverter(String key, Object... params) {
    String[] parts = splitGrokPattern(key);
    IConverter<? extends Object> converter = getType(parts[1]).converter;
    if (parts.length == 3) {
      converter = converter.newConverter(parts[2], params);
    }
    return converter;
  }

  /**
   * Maps every typed group to its declared type.
   *
   * <p>Group names without a delimiter are skipped. If the same group is declared more than
   * once, the first declaration wins instead of failing with a duplicate-key error.
   *
   * @param groupNames group names, optionally in {@code name:type[:argument]} form
   * @return types keyed by the bare group name (the part before the first delimiter)
   * @throws IllegalArgumentException if a type is unknown
   */
  public static Map<String, Type> getGroupTypes(Collection<String> groupNames) {
    return groupNames.stream()
        .filter(Converter::containsDelimiter)
        .map(Converter::splitGrokPattern)
        .collect(Collectors.toMap(
            parts -> parts[0],
            parts -> getType(parts[1]),
            (first, second) -> first));
  }

  /**
   * Returns the part of a group name that precedes the first delimiter.
   *
   * @param key group name, optionally in {@code name:type[:argument]} form
   * @return the bare group name
   */
  public static String extractKey(String key) {
    return splitGrokPattern(key)[0];
  }

  private static boolean containsDelimiter(String string) {
    return string.indexOf(':') >= 0 || string.indexOf(';') >= 0;
  }

  private static String[] splitGrokPattern(String string) {
    // Limit 3 keeps any further delimiters inside the third part (e.g. a date format).
    return SPLITTER.split(string, 3);
  }

  /**
   * Converts a captured string into a value of type {@code T}.
   */
  interface IConverter<T> {

    T convert(String value);

    /**
     * Returns a converter configured with the given argument. The default implementation
     * ignores the arguments and returns this converter.
     */
    default IConverter<T> newConverter(String param, Object... params) {
      return this;
    }
  }


  /**
   * Parses date/time text into an {@link Instant}. Text without zone information is interpreted
   * in the converter's time zone.
   */
  static class DateConverter implements IConverter<Instant> {

    private final DateTimeFormatter formatter;
    private final ZoneId timeZone;

    /**
     * Creates a converter that parses ISO date-times and falls back to UTC.
     */
    public DateConverter() {
      this.formatter = DateTimeFormatter.ISO_DATE_TIME;
      this.timeZone = ZoneOffset.UTC;
    }

    private DateConverter(DateTimeFormatter formatter, ZoneId timeZone) {
      this.formatter = formatter;
      this.timeZone = timeZone;
    }

    @Override
    public Instant convert(String value) {
      // parseBest tries the queries in order and returns the first type that can be built.
      TemporalAccessor dt = formatter
          .parseBest(value.trim(), ZonedDateTime::from, LocalDateTime::from, OffsetDateTime::from,
              Instant::from,
              LocalDate::from);
      if (dt instanceof ZonedDateTime) {
        return ((ZonedDateTime) dt).toInstant();
      } else if (dt instanceof LocalDateTime) {
        return ((LocalDateTime) dt).atZone(timeZone).toInstant();
      } else if (dt instanceof OffsetDateTime) {
        return ((OffsetDateTime) dt).atZoneSameInstant(timeZone).toInstant();
      } else if (dt instanceof Instant) {
        return ((Instant) dt);
      } else if (dt instanceof LocalDate) {
        return ((LocalDate) dt).atStartOfDay(timeZone).toInstant();
      } else {
        return null;
      }
    }

    /**
     * Creates a converter for a custom date format.
     *
     * @param param pattern accepted by {@link DateTimeFormatter#ofPattern(String)}
     * @param params exactly one {@link ZoneId}, used for text without zone information
     * @throws IllegalArgumentException if {@code params} is not a single {@link ZoneId}
     */
    @Override
    public DateConverter newConverter(String param, Object... params) {
      if (!(params.length == 1 && params[0] instanceof ZoneId)) {
        throw new IllegalArgumentException("Invalid parameters");
      }
      return new DateConverter(DateTimeFormatter.ofPattern(param), (ZoneId) params[0]);
    }
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package org.opensearch.sql.common.grok;
7+
8+
import java.io.Serializable;
9+
import java.time.ZoneId;
10+
import java.util.ArrayList;
11+
import java.util.List;
12+
import java.util.Map;
13+
import java.util.Set;
14+
import java.util.regex.Matcher;
15+
import java.util.regex.Pattern;
16+
import org.opensearch.sql.common.grok.Converter.IConverter;
17+
18+
/**
19+
* {@code Grok} parse arbitrary text and structure it.
20+
* <br>
21+
* {@code Grok} is simple API that allows you to easily parse logs
22+
* and other files (single line). With {@code Grok},
23+
* you can turn unstructured log and event data into structured data.
24+
*
25+
* @since 0.0.1
26+
*/
27+
public class Grok implements Serializable {
28+
/**
29+
* Named regex of the originalGrokPattern.
30+
*/
31+
private final String namedRegex;
32+
/**
33+
* Map of the named regex of the originalGrokPattern
34+
* with id = namedregexid and value = namedregex.
35+
*/
36+
private final Map<String, String> namedRegexCollection;
37+
/**
38+
* Original {@code Grok} pattern (expl: %{IP}).
39+
*/
40+
private final String originalGrokPattern;
41+
/**
42+
* Pattern of the namedRegex.
43+
*/
44+
private final Pattern compiledNamedRegex;
45+
46+
/**
47+
* {@code Grok} patterns definition.
48+
*/
49+
private final Map<String, String> grokPatternDefinition;
50+
51+
public final Set<String> namedGroups;
52+
53+
public final Map<String, Converter.Type> groupTypes;
54+
55+
public final Map<String, IConverter<? extends Object>> converters;
56+
57+
/**
58+
* only use in grok discovery.
59+
*/
60+
private String savedPattern = "";
61+
62+
/**
63+
* Grok.
64+
*/
65+
public Grok(String pattern,
66+
String namedRegex,
67+
Map<String, String> namedRegexCollection,
68+
Map<String, String> patternDefinitions,
69+
ZoneId defaultTimeZone) {
70+
this.originalGrokPattern = pattern;
71+
this.namedRegex = namedRegex;
72+
this.compiledNamedRegex = Pattern.compile(namedRegex);
73+
this.namedRegexCollection = namedRegexCollection;
74+
this.namedGroups = GrokUtils.getNameGroups(namedRegex);
75+
this.groupTypes = Converter.getGroupTypes(namedRegexCollection.values());
76+
this.converters = Converter.getConverters(namedRegexCollection.values(), defaultTimeZone);
77+
this.grokPatternDefinition = patternDefinitions;
78+
}
79+
80+
public String getSaved_pattern() {
81+
return savedPattern;
82+
}
83+
84+
public void setSaved_pattern(String savedpattern) {
85+
this.savedPattern = savedpattern;
86+
}
87+
88+
/**
89+
* Get the current map of {@code Grok} pattern.
90+
*
91+
* @return Patterns (name, regular expression)
92+
*/
93+
public Map<String, String> getPatterns() {
94+
return grokPatternDefinition;
95+
}
96+
97+
/**
98+
* Get the named regex from the {@code Grok} pattern. <br>
99+
*
100+
* @return named regex
101+
*/
102+
public String getNamedRegex() {
103+
return namedRegex;
104+
}
105+
106+
/**
107+
* Original grok pattern used to compile to the named regex.
108+
*
109+
* @return String Original Grok pattern
110+
*/
111+
public String getOriginalGrokPattern() {
112+
return originalGrokPattern;
113+
}
114+
115+
/**
116+
* Get the named regex from the given id.
117+
*
118+
* @param id : named regex id
119+
* @return String of the named regex
120+
*/
121+
public String getNamedRegexCollectionById(String id) {
122+
return namedRegexCollection.get(id);
123+
}
124+
125+
/**
126+
* Get the full collection of the named regex.
127+
*
128+
* @return named RegexCollection
129+
*/
130+
public Map<String, String> getNamedRegexCollection() {
131+
return namedRegexCollection;
132+
}
133+
134+
/**
135+
* Match the given <tt>log</tt> with the named regex.
136+
* And return the json representation of the matched element
137+
*
138+
* @param log : log to match
139+
* @return map containing matches
140+
*/
141+
public Map<String, Object> capture(String log) {
142+
Match match = match(log);
143+
return match.capture();
144+
}
145+
146+
/**
147+
* Match the given list of <tt>log</tt> with the named regex
148+
* and return the list of json representation of the matched elements.
149+
*
150+
* @param logs : list of log
151+
* @return list of maps containing matches
152+
*/
153+
public ArrayList<Map<String, Object>> capture(List<String> logs) {
154+
final ArrayList<Map<String, Object>> matched = new ArrayList<>();
155+
for (String log : logs) {
156+
matched.add(capture(log));
157+
}
158+
return matched;
159+
}
160+
161+
/**
162+
* Match the given <tt>text</tt> with the named regex
163+
* {@code Grok} will extract data from the string and get an extence of {@link Match}.
164+
*
165+
* @param text : Single line of log
166+
* @return Grok Match
167+
*/
168+
public Match match(CharSequence text) {
169+
if (compiledNamedRegex == null || text == null) {
170+
return Match.EMPTY;
171+
}
172+
173+
Matcher matcher = compiledNamedRegex.matcher(text);
174+
if (matcher.find()) {
175+
return new Match(
176+
text, this, matcher, matcher.start(0), matcher.end(0)
177+
);
178+
}
179+
180+
return Match.EMPTY;
181+
}
182+
}

0 commit comments

Comments
 (0)