From 4a0d2c516bbe477bd5fdd1c75e5d918970cb9d5c Mon Sep 17 00:00:00 2001
From: Kevin Wilfong <kevinwilfong@meta.com>
Date: Wed, 13 Nov 2024 20:52:57 -0800
Subject: [PATCH] Match Presto's behavior for invalid UTF-8 in url_encode
 (#11518)

Summary:

Presto Java converts the URL to a Java String before encoding it in url_encode. Java replaces bytes
in an invalid UTF-8 character with 0xEF 0xBF 0xBD.

Velox encodes invalid UTF-8 characters as is, which leads to differences in results from Java and
C++.

This diff adds a check when encoding URLs for invalid UTF-8 characters and does the same
replacement as Java.

Differential Revision: D65856104
---
 velox/functions/prestosql/URLFunctions.h      | 40 +++++++++++++++----
 .../prestosql/tests/URLFunctionsTest.cpp      |  6 +++
 2 files changed, 38 insertions(+), 8 deletions(-)
diff --git a/velox/functions/prestosql/URLFunctions.h b/velox/functions/prestosql/URLFunctions.h
index 8dc64704d1d7..1edff6f30a14 100644
--- a/velox/functions/prestosql/URLFunctions.h
+++ b/velox/functions/prestosql/URLFunctions.h
@@ -16,15 +16,15 @@
 #pragma once
 
 #include <boost/regex.hpp>
-#include <cctype>
-#include <optional>
 #include "velox/functions/Macros.h"
-#include "velox/functions/lib/string/StringImpl.h"
+#include "velox/functions/lib/Utf8Utils.h"
 #include "velox/functions/prestosql/URIParser.h"
 
 namespace facebook::velox::functions {
 
 namespace detail {
+constexpr std::string_view kEncodedReplacementCharacter{"%EF%BF%BD"};
+
 FOLLY_ALWAYS_INLINE StringView submatch(const boost::cmatch& match, int idx) {
   const auto& sub = match[idx];
   return StringView(sub.first, sub.length());
@@ -49,27 +49,51 @@ FOLLY_ALWAYS_INLINE void charEscape(unsigned char c, char* output) {
 ///  * All other characters are converted to UTF-8 and the bytes are encoded
 ///    as the string ``%XX`` where ``XX`` is the uppercase hexadecimal
 ///    value of the UTF-8 byte.
+///  * If the character is invalid UTF-8 all bytes of the character are
+///    converted to %EF%BF%BD.
 template <typename TOutString, typename TInString>
 FOLLY_ALWAYS_INLINE void urlEscape(TOutString& output, const TInString& input) {
   auto inputSize = input.size();
-  output.reserve(inputSize * 3);
+  // In the worst case every byte is an invalid UTF-8 character.
+  output.reserve(inputSize * kEncodedReplacementCharacter.size());
 
   auto inputBuffer = input.data();
   auto outputBuffer = output.data();
 
+  size_t inputIndex = 0;
   size_t outIndex = 0;
-  for (auto i = 0; i < inputSize; ++i) {
-    unsigned char p = inputBuffer[i];
+  while (inputIndex < inputSize) {
+    unsigned char p = inputBuffer[inputIndex];
 
     if ((p >= 'a' && p <= 'z') || (p >= 'A' && p <= 'Z') ||
         (p >= '0' && p <= '9') || p == '-' || p == '_' || p == '.' ||
         p == '*') {
       outputBuffer[outIndex++] = p;
+      inputIndex++;
     } else if (p == ' ') {
       outputBuffer[outIndex++] = '+';
+      inputIndex++;
     } else {
-      charEscape(p, outputBuffer + outIndex);
-      outIndex += 3;
+      const auto charLength =
+          tryGetCharLength(inputBuffer + inputIndex, inputSize - inputIndex);
+      if (charLength > 0) {
+        for (int i = 0; i < charLength; ++i) {
+          charEscape(inputBuffer[inputIndex + i], outputBuffer + outIndex);
+          outIndex += 3;
+        }
+
+        inputIndex += charLength;
+      } else {
+        // We write a single replacement character regardless of the number of
+        // bytes in the invalid character.
+        std::memcpy(
+            outputBuffer + outIndex,
+            kEncodedReplacementCharacter.data(),
+            kEncodedReplacementCharacter.size());
+        outIndex += kEncodedReplacementCharacter.size();
+
+        inputIndex += -charLength;
+      }
     }
   }
   output.resize(outIndex);
diff --git a/velox/functions/prestosql/tests/URLFunctionsTest.cpp b/velox/functions/prestosql/tests/URLFunctionsTest.cpp
index 0ab32784e33e..3f116d9632e0 100644
--- a/velox/functions/prestosql/tests/URLFunctionsTest.cpp
+++ b/velox/functions/prestosql/tests/URLFunctionsTest.cpp
@@ -496,6 +496,12 @@ TEST_F(URLFunctionsTest, urlEncode) {
       urlEncode("http://\u30c6\u30b9\u30c8"));
   EXPECT_EQ("%7E%40%3A.-*_%2B+%E2%98%83", urlEncode("~@:.-*_+ \u2603"));
   EXPECT_EQ("test", urlEncode("test"));
+  // Test a single byte invalid UTF-8 character.
+  EXPECT_EQ("te%EF%BF%BDst", urlEncode("te\x88st"));
+  // Test a multi-byte invalid UTF-8 character. (If the first byte is between
+  // 0xe0 and 0xef, it should be a 3 byte character, but we only have 2 bytes
+  // here.)
+  EXPECT_EQ("te%EF%BF%BDst", urlEncode("te\xe0\xb8st"));
 }
 
 TEST_F(URLFunctionsTest, urlDecode) {