Skip to content

Commit

Permalink
Match Presto's behavior for invalid UTF-8 in url_encode (facebookincu…
Browse files Browse the repository at this point in the history
…bator#11518)

Summary:

Presto Java converts the URL to a Java String before encoding it in url_encode. Java replaces bytes
in an invalid UTF-8 character with 0xEF 0xBF 0xBD.

Velox encodes invalid UTF-8 characters as is, which leads to differences in results from Java and 
C++.

This diff adds a check when encoding URLs for invalid UTF-8 characters and does the same
replacement as Java.

Differential Revision: D65856104
  • Loading branch information
Kevin Wilfong authored and facebook-github-bot committed Nov 14, 2024
1 parent 9efa6ff commit 4a0d2c5
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 8 deletions.
40 changes: 32 additions & 8 deletions velox/functions/prestosql/URLFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@
#pragma once

#include <boost/regex.hpp>
#include <cctype>
#include <optional>
#include "velox/functions/Macros.h"
#include "velox/functions/lib/string/StringImpl.h"
#include "velox/functions/lib/Utf8Utils.h"
#include "velox/functions/prestosql/URIParser.h"

namespace facebook::velox::functions {

namespace detail {
constexpr std::string_view kEncodedReplacementCharacter{"%EF%BF%BD"};

FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
const auto& sub = match[idx];
return StringView(sub.first, sub.length());
Expand All @@ -49,27 +49,51 @@ FOLLY_ALWAYS_INLINE void charEscape(unsigned char c, char* output) {
/// * All other characters are converted to UTF-8 and the bytes are encoded
/// as the string ``%XX`` where ``XX`` is the uppercase hexadecimal
/// value of the UTF-8 byte.
/// * If the character is invalid UTF-8 all bytes of the character are
/// converted to %EF%BF%BD.
template <typename TOutString, typename TInString>
FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) {
auto inputSize = input.size();
output.reserve(inputSize * 3);
// In the worst case every byte is an invalid UTF-8 character.
output.reserve(inputSize * kEncodedReplacementCharacter.size());

auto inputBuffer = input.data();
auto outputBuffer = output.data();

size_t inputIndex = 0;
size_t outIndex = 0;
for (auto i = 0; i < inputSize; ++i) {
unsigned char p = inputBuffer[i];
while (inputIndex < inputSize) {
unsigned char p = inputBuffer[inputIndex];

if ((p >= 'a' && p <= 'z') || (p >= 'A' && p <= 'Z') ||
(p >= '0' && p <= '9') || p == '-' || p == '_' || p == '.' ||
p == '*') {
outputBuffer[outIndex++] = p;
inputIndex++;
} else if (p == ' ') {
outputBuffer[outIndex++] = '+';
inputIndex++;
} else {
charEscape(p, outputBuffer + outIndex);
outIndex += 3;
const auto charLength =
tryGetCharLength(inputBuffer + inputIndex, inputSize - inputIndex);
if (charLength > 0) {
for (int i = 0; i < charLength; ++i) {
charEscape(inputBuffer[inputIndex + i], outputBuffer + outIndex);
outIndex += 3;
}

inputIndex += charLength;
} else {
// We write a single replacement character regardless of the number of
// bytes in the invalid character.
std::memcpy(
outputBuffer + outIndex,
kEncodedReplacementCharacter.data(),
kEncodedReplacementCharacter.size());
outIndex += kEncodedReplacementCharacter.size();

inputIndex += -charLength;
}
}
}
output.resize(outIndex);
Expand Down
6 changes: 6 additions & 0 deletions velox/functions/prestosql/tests/URLFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,12 @@ TEST_F(URLFunctionsTest, urlEncode) {
urlEncode("http://\u30c6\u30b9\u30c8"));
EXPECT_EQ("%7E%40%3A.-*_%2B+%E2%98%83", urlEncode("~@:.-*_+ \u2603"));
EXPECT_EQ("test", urlEncode("test"));
// Test a single byte invalid UTF-8 character.
EXPECT_EQ("te%EF%BF%BDst", urlEncode("te\x88st"));
// Test a multi-byte invalid UTF-8 character. (If the first byte is between
// 0xe0 and 0xef, it should be a 3 byte character, but we only have 2 bytes
// here.)
EXPECT_EQ("te%EF%BF%BDst", urlEncode("te\xe0\xb8st"));
}

TEST_F(URLFunctionsTest, urlDecode) {
Expand Down

0 comments on commit 4a0d2c5

Please sign in to comment.