From c8dbb1392db1012a933dce5a14ba93c41b7a8fab Mon Sep 17 00:00:00 2001 From: swar8080 Date: Thu, 15 Feb 2024 18:18:42 -0500 Subject: [PATCH 1/5] Normalizes SQL IN(?, ?, ...) statements to "in(?)" to reduce cardinality of span metrics using db.statement as an attribute --- .../src/main/jflex/SqlSanitizer.jflex | 8 +++++++- .../semconv/db/SqlStatementSanitizerTest.java | 12 +++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex b/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex index 71ba5d0c18ec..51c36c839946 100644 --- a/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex +++ b/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex @@ -5,6 +5,8 @@ package io.opentelemetry.instrumentation.api.incubator.semconv.db; +import java.util.regex.Pattern; + %% %final @@ -52,6 +54,9 @@ WHITESPACE = [ \t\r\n]+ // max length of the sanitized statement - SQLs longer than this will be trimmed static final int LIMIT = 32 * 1024; + private static final Pattern IN_STATEMENT_PATTERN = Pattern.compile("\\sin\\s*\\((?:\\s*\\?\\s*,?\\s*)+\\)", Pattern.CASE_INSENSITIVE); + private static final String IN_STATEMENT_NORMALIZED = " in(?)"; + private final StringBuilder builder = new StringBuilder(); private void appendCurrentFragment() { @@ -278,7 +283,8 @@ WHITESPACE = [ \t\r\n]+ builder.delete(LIMIT, builder.length()); } String fullStatement = builder.toString(); - return operation.getResult(fullStatement); + String normalizedStatement = IN_STATEMENT_PATTERN.matcher(fullStatement).replaceAll(IN_STATEMENT_NORMALIZED); + return operation.getResult(normalizedStatement); } %} diff --git a/instrumentation-api-incubator/src/test/java/io/opentelemetry/instrumentation/api/incubator/semconv/db/SqlStatementSanitizerTest.java b/instrumentation-api-incubator/src/test/java/io/opentelemetry/instrumentation/api/incubator/semconv/db/SqlStatementSanitizerTest.java index 737c09f17ce6..060a782d53d9 100644 --- a/instrumentation-api-incubator/src/test/java/io/opentelemetry/instrumentation/api/incubator/semconv/db/SqlStatementSanitizerTest.java +++ b/instrumentation-api-incubator/src/test/java/io/opentelemetry/instrumentation/api/incubator/semconv/db/SqlStatementSanitizerTest.java @@ -271,7 +271,11 @@ public Stream provideArguments(ExtensionContext context) th Arguments.of("select col from table1 as t1, table2 as t2", expect("SELECT", null)), Arguments.of( "select col from table where col in (1, 2, 3)", - expect("select col from table where col in (?, ?, ?)", "SELECT", "table")), + expect("select col from table where col in(?)", "SELECT", "table")), + Arguments.of( + "select 'a' IN(x, 'b') from table where col in(1) and z IN( '3', '4' )", + expect( + "select ? IN(x, ?) from table where col in(?) and z in(?)", "SELECT", "table")), Arguments.of("select col from table order by col, col2", expect("SELECT", "table")), Arguments.of("select ąś∂ń© from źćļńĶ order by col, col2", expect("SELECT", "źćļńĶ")), Arguments.of("select 12345678", expect("select ?", "SELECT", null)), @@ -298,6 +302,9 @@ public Stream provideArguments(ExtensionContext context) th "delete from `my table` where something something", expect("DELETE", "my table")), Arguments.of( "delete from \"my table\" where something something", expect("DELETE", "my table")), + Arguments.of( + "delete from foo where x IN(1, 2, 3)", + expect("delete from foo where x in(?)", "DELETE", "foo")), Arguments.of("delete from 12345678", expect("delete from ?", "DELETE", null)), Arguments.of("delete (((", expect("delete (((", "DELETE", null)), @@ -307,6 +314,9 @@ public Stream provideArguments(ExtensionContext context) th Arguments.of( "update `my table` set answer=42", expect("update `my table` set answer=?", "UPDATE", "my table")), + Arguments.of( + "update `my table` set answer=42 where x IN('a', 'b')", + expect("update `my table` set answer=? where x in(?)", "UPDATE", "my table")), Arguments.of( "update \"my table\" set answer=42", expect("update \"my table\" set answer=?", "UPDATE", "my table")), From 34e7a79c999093a70487954e42a49df462b1b279 Mon Sep 17 00:00:00 2001 From: swar8080 Date: Thu, 22 Feb 2024 21:07:49 -0500 Subject: [PATCH 2/5] Switch to a linear-complexity regular expression that doesn't cause a stack overflow --- .../src/main/jflex/SqlSanitizer.jflex | 2 +- .../semconv/db/SqlStatementSanitizerTest.java | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex b/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex index 51c36c839946..1787e6ceab70 100644 --- a/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex +++ b/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex @@ -54,7 +54,7 @@ WHITESPACE = [ \t\r\n]+ // max length of the sanitized statement - SQLs longer than this will be trimmed static final int LIMIT = 32 * 1024; - private static final Pattern IN_STATEMENT_PATTERN = Pattern.compile("\\sin\\s*\\((?:\\s*\\?\\s*,?\\s*)+\\)", Pattern.CASE_INSENSITIVE); + private static final Pattern IN_STATEMENT_PATTERN = Pattern.compile("\\sin\\s*\\(\\s*\\?[\\s?,]*?\\)", Pattern.CASE_INSENSITIVE); private static final String IN_STATEMENT_NORMALIZED = " in(?)"; private final StringBuilder builder = new StringBuilder(); diff --git a/instrumentation-api-incubator/src/test/java/io/opentelemetry/instrumentation/api/incubator/semconv/db/SqlStatementSanitizerTest.java b/instrumentation-api-incubator/src/test/java/io/opentelemetry/instrumentation/api/incubator/semconv/db/SqlStatementSanitizerTest.java index 060a782d53d9..bc189c67a419 100644 --- a/instrumentation-api-incubator/src/test/java/io/opentelemetry/instrumentation/api/incubator/semconv/db/SqlStatementSanitizerTest.java +++ b/instrumentation-api-incubator/src/test/java/io/opentelemetry/instrumentation/api/incubator/semconv/db/SqlStatementSanitizerTest.java @@ -113,6 +113,19 @@ void randomBytesDontCauseExceptionsOrTimeouts() { } } + @Test + public void longInStatementDoesntCauseStackOverflow() { + StringBuilder s = new StringBuilder("select col from table where col in ("); + for (int i = 0; i < 10000; i++) { + s.append("?,"); + } + s.append("?)"); + + String sanitized = SqlStatementSanitizer.create(true).sanitize(s.toString()).getFullStatement(); + + assertThat(sanitized).isEqualTo("select col from table where col in(?)"); + } + static class SqlArgs implements ArgumentsProvider { @Override From b5c215d32dcfa22b73898d6383814663bd8a4803 Mon Sep 17 00:00:00 2001 From: swar8080 Date: Fri, 23 Feb 2024 19:03:21 -0500 Subject: [PATCH 3/5] Add some comments --- .../src/main/jflex/SqlSanitizer.jflex | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex b/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex index 1787e6ceab70..1304a1758977 100644 --- a/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex +++ b/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex @@ -54,6 +54,7 @@ WHITESPACE = [ \t\r\n]+ // max length of the sanitized statement - SQLs longer than this will be trimmed static final int LIMIT = 32 * 1024; + // Match on "IN(?, ?, ...)". This can also match invalid sql syntax like IN(?,,), which is a tradeoff to avoid stack overflows private static final Pattern IN_STATEMENT_PATTERN = Pattern.compile("\\sin\\s*\\(\\s*\\?[\\s?,]*?\\)", Pattern.CASE_INSENSITIVE); private static final String IN_STATEMENT_NORMALIZED = " in(?)"; @@ -283,7 +284,10 @@ WHITESPACE = [ \t\r\n]+ builder.delete(LIMIT, builder.length()); } String fullStatement = builder.toString(); + + // Normalize all 'in (?, ?, ...)' statements to in (?) to reduce cardinality String normalizedStatement = IN_STATEMENT_PATTERN.matcher(fullStatement).replaceAll(IN_STATEMENT_NORMALIZED); + return operation.getResult(normalizedStatement); } From 0c7fd1089dc9e17b4a4c01ca8e2e0ed981e8b00f Mon Sep 17 00:00:00 2001 From: swar8080 Date: Mon, 26 Feb 2024 08:17:36 -0500 Subject: [PATCH 4/5] switch to a more accurate regular expression that avoids stack overflow --- .../src/main/jflex/SqlSanitizer.jflex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex b/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex index 1304a1758977..ba2242c676a5 100644 --- a/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex +++ b/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex @@ -54,8 +54,8 @@ WHITESPACE = [ \t\r\n]+ // max length of the sanitized statement - SQLs longer than this will be trimmed static final int LIMIT = 32 * 1024; - // Match on "IN(?, ?, ...)". This can also match invalid sql syntax like IN(?,,), which is a tradeoff to avoid stack overflows - private static final Pattern IN_STATEMENT_PATTERN = Pattern.compile("\\sin\\s*\\(\\s*\\?[\\s?,]*?\\)", Pattern.CASE_INSENSITIVE); + // Match on "IN(?, ?, ...)" + private static final Pattern IN_STATEMENT_PATTERN = Pattern.compile("(\\sin\\s*)\\(\\s*\\?\\s*(,\\s*\\?\\s*)*+\\)", Pattern.CASE_INSENSITIVE); private static final String IN_STATEMENT_NORMALIZED = " in(?)"; private final StringBuilder builder = new StringBuilder(); From c52df11a73012b6f3bda7000d8124164ff830540 Mon Sep 17 00:00:00 2001 From: swar8080 Date: Mon, 26 Feb 2024 16:51:33 -0500 Subject: [PATCH 5/5] preserve capitalization and white space when normalizing IN statement --- .../src/main/jflex/SqlSanitizer.jflex | 6 +++--- .../semconv/db/SqlStatementSanitizerTest.java | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex b/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex index ba2242c676a5..b7d2f1ead661 100644 --- a/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex +++ b/instrumentation-api-incubator/src/main/jflex/SqlSanitizer.jflex @@ -54,9 +54,9 @@ WHITESPACE = [ \t\r\n]+ // max length of the sanitized statement - SQLs longer than this will be trimmed static final int LIMIT = 32 * 1024; - // Match on "IN(?, ?, ...)" - private static final Pattern IN_STATEMENT_PATTERN = Pattern.compile("(\\sin\\s*)\\(\\s*\\?\\s*(,\\s*\\?\\s*)*+\\)", Pattern.CASE_INSENSITIVE); - private static final String IN_STATEMENT_NORMALIZED = " in(?)"; + // Match on strings like "IN(?, ?, ...)" + private static final Pattern IN_STATEMENT_PATTERN = Pattern.compile("(\\sIN\\s*)\\(\\s*\\?\\s*(?:,\\s*\\?\\s*)*+\\)", Pattern.CASE_INSENSITIVE); + private static final String IN_STATEMENT_NORMALIZED = "$1(?)"; private final StringBuilder builder = new StringBuilder(); diff --git a/instrumentation-api-incubator/src/test/java/io/opentelemetry/instrumentation/api/incubator/semconv/db/SqlStatementSanitizerTest.java b/instrumentation-api-incubator/src/test/java/io/opentelemetry/instrumentation/api/incubator/semconv/db/SqlStatementSanitizerTest.java index bc189c67a419..6702ecfa8ba1 100644 --- a/instrumentation-api-incubator/src/test/java/io/opentelemetry/instrumentation/api/incubator/semconv/db/SqlStatementSanitizerTest.java +++ b/instrumentation-api-incubator/src/test/java/io/opentelemetry/instrumentation/api/incubator/semconv/db/SqlStatementSanitizerTest.java @@ -123,7 +123,7 @@ public void longInStatementDoesntCauseStackOverflow() { String sanitized = SqlStatementSanitizer.create(true).sanitize(s.toString()).getFullStatement(); - assertThat(sanitized).isEqualTo("select col from table where col in(?)"); + assertThat(sanitized).isEqualTo("select col from table where col in (?)"); } static class SqlArgs implements ArgumentsProvider { @@ -284,11 +284,11 @@ public Stream provideArguments(ExtensionContext context) th Arguments.of("select col from table1 as t1, table2 as t2", expect("SELECT", null)), Arguments.of( "select col from table where col in (1, 2, 3)", - expect("select col from table where col in(?)", "SELECT", "table")), + expect("select col from table where col in (?)", "SELECT", "table")), Arguments.of( - "select 'a' IN(x, 'b') from table where col in(1) and z IN( '3', '4' )", + "select 'a' IN(x, 'b') from table where col in (1) and z IN( '3', '4' )", expect( - "select ? IN(x, ?) from table where col in(?) and z in(?)", "SELECT", "table")), + "select ? IN(x, ?) from table where col in (?) and z IN(?)", "SELECT", "table")), Arguments.of("select col from table order by col, col2", expect("SELECT", "table")), Arguments.of("select ąś∂ń© from źćļńĶ order by col, col2", expect("SELECT", "źćļńĶ")), Arguments.of("select 12345678", expect("select ?", "SELECT", null)), @@ -316,8 +316,8 @@ public Stream provideArguments(ExtensionContext context) th Arguments.of( "delete from \"my table\" where something something", expect("DELETE", "my table")), Arguments.of( - "delete from foo where x IN(1, 2, 3)", - expect("delete from foo where x in(?)", "DELETE", "foo")), + "delete from foo where x IN (1,2,3)", + expect("delete from foo where x IN (?)", "DELETE", "foo")), Arguments.of("delete from 12345678", expect("delete from ?", "DELETE", null)), Arguments.of("delete (((", expect("delete (((", "DELETE", null)), @@ -328,8 +328,11 @@ public Stream provideArguments(ExtensionContext context) th "update `my table` set answer=42", expect("update `my table` set answer=?", "UPDATE", "my table")), Arguments.of( - "update `my table` set answer=42 where x IN('a', 'b')", - expect("update `my table` set answer=? where x in(?)", "UPDATE", "my table")), + "update `my table` set answer=42 where x IN('a', 'b') AND y In ('a', 'b')", + expect( + "update `my table` set answer=? where x IN(?) AND y In (?)", + "UPDATE", + "my table")), Arguments.of( "update \"my table\" set answer=42", expect("update \"my table\" set answer=?", "UPDATE", "my table")),