From 4a0d2c516bbe477bd5fdd1c75e5d918970cb9d5c Mon Sep 17 00:00:00 2001 From: Kevin Wilfong Date: Wed, 13 Nov 2024 20:52:57 -0800 Subject: [PATCH] Match Presto's behavior for invalid UTF-8 in url_encode (#11518) Summary: Presto Java converts the URL to a Java String before encoding it in url_encode. Java replaces bytes in an invalid UTF-8 character with 0xEF 0xBF 0xBD. Velox encodes invalid UTF-8 characters as is, which leads to differences in results from Java and C++. This diff adds a check when encoding URLs for invalid UTF-8 characters and does the same replacement as Java. Differential Revision: D65856104 --- velox/functions/prestosql/URLFunctions.h | 40 +++++++++++++++---- .../prestosql/tests/URLFunctionsTest.cpp | 6 +++ 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h index 8dc64704d1d7..1edff6f30a14 100644 --- a/velox/functions/prestosql/URLFunctions.h +++ b/velox/functions/prestosql/URLFunctions.h @@ -16,15 +16,15 @@ #pragma once #include -#include -#include #include "velox/functions/Macros.h" -#include "velox/functions/lib/string/StringImpl.h" +#include "velox/functions/lib/Utf8Utils.h" #include "velox/functions/prestosql/URIParser.h" namespace facebook::velox::functions { namespace detail { +constexpr std::string_view kEncodedReplacementCharacter{"%EF%BF%BD"}; + FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) { const auto& sub = match[idx]; return StringView(sub.first, sub.length()); @@ -49,27 +49,51 @@ FOLLY_ALWAYS_INLINE void charEscape(unsigned char c, char* output) { /// * All other characters are converted to UTF-8 and the bytes are encoded /// as the string ``%XX`` where ``XX`` is the uppercase hexadecimal /// value of the UTF-8 byte. +/// * If the character is invalid UTF-8 all bytes of the character are +/// converted to %EF%BF%BD. template FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) { auto inputSize = input.size(); - output.reserve(inputSize * 3); + // In the worst case every byte is an invalid UTF-8 character. + output.reserve(inputSize * kEncodedReplacementCharacter.size()); auto inputBuffer = input.data(); auto outputBuffer = output.data(); + size_t inputIndex = 0; size_t outIndex = 0; - for (auto i = 0; i < inputSize; ++i) { - unsigned char p = inputBuffer[i]; + while (inputIndex < inputSize) { + unsigned char p = inputBuffer[inputIndex]; if ((p >= 'a' && p <= 'z') || (p >= 'A' && p <= 'Z') || (p >= '0' && p <= '9') || p == '-' || p == '_' || p == '.' || p == '*') { outputBuffer[outIndex++] = p; + inputIndex++; } else if (p == ' ') { outputBuffer[outIndex++] = '+'; + inputIndex++; } else { - charEscape(p, outputBuffer + outIndex); - outIndex += 3; + const auto charLength = + tryGetCharLength(inputBuffer + inputIndex, inputSize - inputIndex); + if (charLength > 0) { + for (int i = 0; i < charLength; ++i) { + charEscape(inputBuffer[inputIndex + i], outputBuffer + outIndex); + outIndex += 3; + } + + inputIndex += charLength; + } else { + // We write a single replacement character regardless of the number of + // bytes in the invalid character. + std::memcpy( + outputBuffer + outIndex, + kEncodedReplacementCharacter.data(), + kEncodedReplacementCharacter.size()); + outIndex += kEncodedReplacementCharacter.size(); + + inputIndex += -charLength; + } } } output.resize(outIndex); diff --git a/velox/functions/prestosql/tests/URLFunctionsTest.cpp b/velox/functions/prestosql/tests/URLFunctionsTest.cpp index 0ab32784e33e..3f116d9632e0 100644 --- a/velox/functions/prestosql/tests/URLFunctionsTest.cpp +++ b/velox/functions/prestosql/tests/URLFunctionsTest.cpp @@ -496,6 +496,12 @@ TEST_F(URLFunctionsTest, urlEncode) { urlEncode("http://\u30c6\u30b9\u30c8")); EXPECT_EQ("%7E%40%3A.-*_%2B+%E2%98%83", urlEncode("~@:.-*_+ \u2603")); EXPECT_EQ("test", urlEncode("test")); + // Test a single byte invalid UTF-8 character. + EXPECT_EQ("te%EF%BF%BDst", urlEncode("te\x88st")); + // Test a multi-byte invalid UTF-8 character. (If the first byte is between + // 0xe0 and 0xef, it should be a 3 byte character, but we only have 2 bytes + // here.) + EXPECT_EQ("te%EF%BF%BDst", urlEncode("te\xe0\xb8st")); } TEST_F(URLFunctionsTest, urlDecode) {