Skip to content

Commit 92b7747

Browse files
fix decode_utf8 for codepoints >= U+010000
Fixes #1181. Add unit test cases to cover UTF-8 decode/encode.
1 parent 4487bfb commit 92b7747

File tree

5 files changed

+128
-2
lines changed

5 files changed

+128
-2
lines changed

CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,6 @@ if (BUILD_TESTS)
133133
# target runs tests without building them.
134134
add_custom_target(run_tests COMMAND ${CMAKE_CTEST_COMMAND}
135135
DEPENDS libjsonnet_test libjsonnet_test_file libjsonnet_test_snippet
136-
jsonnet parser_test lexer_test libjsonnet++_test libjsonnet_test_locale
136+
jsonnet unicode_test parser_test lexer_test libjsonnet++_test libjsonnet_test_locale
137137
)
138138
endif()

core/BUILD

+9
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,12 @@ cc_test(
6969
"@com_google_googletest//:gtest_main",
7070
],
7171
)
72+
73+
# Bazel test target for the UTF-8 encode/decode unit tests in unicode_test.cpp.
cc_test(
74+
name = "unicode_test",
75+
srcs = ["unicode_test.cpp"],
76+
deps = [
77+
":libjsonnet",
78+
"@com_google_googletest//:gtest_main",
79+
],
80+
)

core/CMakeLists.txt

+3
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ function(add_test_executable test_name)
8787
endfunction()
8888

8989
if (BUILD_TESTS)
90+
add_test_executable(unicode_test)
91+
add_test(unicode_test ${GLOBAL_OUTPUT_PATH}/unicode_test)
92+
9093
add_test_executable(lexer_test)
9194
add_test(lexer_test ${GLOBAL_OUTPUT_PATH}/lexer_test)
9295

core/unicode.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ static inline char32_t decode_utf8(const std::string &str, size_t &i)
114114
if ((c3 & 0xC0) != 0x80) {
115115
return JSONNET_CODEPOINT_ERROR;
116116
}
117-
return ((c0 & 0x7) << 24ul) | ((c1 & 0x3F) << 12ul) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
117+
return ((c0 & 0x7) << 18ul) | ((c1 & 0x3F) << 12ul) | ((c2 & 0x3F) << 6) | (c3 & 0x3F);
118118
} else {
119119
return JSONNET_CODEPOINT_ERROR;
120120
}

core/unicode_test.cpp

+114
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
/*
2+
Copyright 2025 Google Inc. All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
#include <array>
18+
#include <string>
19+
#include <sstream>
20+
#include <iostream>
21+
#include "unicode.h"
22+
#include "gtest/gtest.h"
23+
24+
namespace jsonnet::internal {
25+
namespace {
26+
27+
void testEncodeDecode(char32_t codepoint, const std::string &expect_utf8) {
28+
std::string buffer;
29+
size_t len = encode_utf8(codepoint, buffer);
30+
EXPECT_EQ(len, expect_utf8.size());
31+
EXPECT_EQ(buffer, expect_utf8);
32+
33+
size_t at = 0;
34+
char32_t decoded = decode_utf8(expect_utf8, at);
35+
EXPECT_EQ(decoded, codepoint);
36+
EXPECT_EQ(at, expect_utf8.size() - 1);
37+
}
38+
39+
// Spot-checks encode/decode for sample code points in each UTF-8 length class
// (1, 2, 3 and 4 bytes), including the first and last value of every class.
TEST(Unicode, TestUTF8)
{
    struct Sample {
        char32_t codepoint;
        std::string utf8;
    };

    const Sample samples[] = {
        // One byte: ASCII encodes as itself. NUL needs an explicit length,
        // otherwise the literal would be taken as an empty C string.
        {0x00, std::string("\x00", 1)},
        {0x41, "A"},
        {0x7f, "\x7f"},

        // Two bytes.
        {0x80, "\xc2\x80"},
        {0x100, "\xc4\x80"},
        {0x7ff, "\xdf\xbf"},

        // Three bytes.
        {0x800, "\xe0\xa0\x80"},
        {0x1482, "\xe1\x92\x82"},
        {0xffff, "\xef\xbf\xbf"},

        // Four bytes.
        {0x010000, "\xf0\x90\x80\x80"},
        {0x01f600, "\xf0\x9f\x98\x80"},  // U+1F600 "Grinning Face"
        {0x0f057e, "\xf3\xb0\x95\xbe"},  // U+F057E Private use area character
        {0x10ffff, "\xf4\x8f\xbf\xbf"},
    };

    for (const Sample &sample : samples) {
        testEncodeDecode(sample.codepoint, sample.utf8);
    }
}
59+
60+
// Feeds malformed byte sequences to decode_utf8 and expects each one to be
// reported as JSONNET_CODEPOINT_ERROR. The order of the list matters only for
// the case number printed when an expectation fails.
TEST(Unicode, TestUTF8RejectBad)
{
    const std::array bad_inputs{
        // A continuation byte with no lead byte in front of it.
        "\x80",
        "\xa0",
        "\xbf",
        // A lead byte with nothing after it.
        "\xc0",  // 2-byte lead
        "\xe0",  // 3-byte lead
        "\xf0",  // 4-byte lead
        // A byte that is not a valid lead byte at all.
        "\xf8\x83\x83\x83",
        // Sequences cut short before their last continuation byte.
        "\xe0\x80",      // 3-byte lead, 1 of 2 continuation bytes
        "\xf0\x80",      // 4-byte lead, 1 of 3 continuation bytes
        "\xf0\x80\x80",  // 4-byte lead, 2 of 3 continuation bytes
        // A non-continuation byte (0xcf) where a continuation byte belongs.
        "\xc0\xcf",          // 2-byte lead, bad 1st tail byte
        "\xe0\xcf",          // 3-byte lead, bad 1st tail byte
        "\xf0\xcf",          // 4-byte lead, bad 1st tail byte
        "\xe0\xcf\x80",      // 3-byte lead, bad 1st tail byte
        "\xf0\xcf\x80",      // 4-byte lead, bad 1st tail byte
        "\xe0\x80\xcf",      // 3-byte lead, bad 2nd tail byte
        "\xf0\x80\xcf",      // 4-byte lead, bad 2nd tail byte
        "\xf0\x80\x80\xcf",  // 4-byte lead, bad 3rd tail byte
        "\xf0\x80\xcf\x80",  // 4-byte lead, bad 2nd tail byte
        "\xf0\xcf\x80\x80",  // 4-byte lead, bad 1st tail byte
    };

    size_t case_index = 0;
    for (const char *raw : bad_inputs) {
        size_t pos = 0;
        const char32_t decoded = decode_utf8(raw, pos);

        EXPECT_EQ(decoded, JSONNET_CODEPOINT_ERROR)
            << "expect decode to reject. case " << case_index << std::endl;
        ++case_index;
    }
}
92+
93+
// Encodes every code point below JSONNET_CODEPOINT_MAX as UTF-8 and verifies
// that decoding the bytes gives back the same value and consumes the whole
// sequence.
TEST(Unicode, TestUTF8RoundTripExhaustive)
{
    // decode_utf8 signals failure by returning this value; hold it as char32_t
    // so every comparison below is between operands of the same type.
    const char32_t error_marker = JSONNET_CODEPOINT_ERROR;

    std::string buffer;  // Reused across iterations; clear() keeps the capacity.

    // The counter is a char32_t (not int) so it matches the type taken by
    // encode_utf8 and returned by decode_utf8, avoiding signed/unsigned mixes.
    for (char32_t x = 0; x < JSONNET_CODEPOINT_MAX; ++x) {
        if (x == error_marker) {
            // A successful decode of this code point is indistinguishable
            // from a reported failure, so it cannot be checked here.
            continue;
        }
        buffer.clear();
        encode_utf8(x, buffer);

        size_t at = 0;
        const char32_t y = decode_utf8(buffer, at);

        // Plain integer copies for the failure messages: streaming a char32_t
        // with operator<< is a deleted overload from C++20 onwards.
        const unsigned long expected_value = x;
        const unsigned long decoded_value = y;

        EXPECT_NE(y, error_marker)
            << "UTF-8 roundtrip failed for codepoint " << expected_value << " decode rejects";
        EXPECT_EQ(x, y)
            << "UTF-8 roundtrip failed for codepoint " << expected_value << " converts to "
            << decoded_value;
        // decode_utf8 is expected to leave the index on the last byte it consumed.
        EXPECT_EQ(at, buffer.size() - 1)
            << "UTF-8 roundtrip failed for codepoint " << expected_value
            << " decodes incorrect length";
    }
}
112+
113+
} // namespace
114+
} // namespace jsonnet::internal

0 commit comments

Comments
 (0)