Skip to content

Commit ec62ce5

Browse files
committed
Change type "char"'s I/O format for non-ASCII characters.
Previously, a byte with the high bit set was just transmitted as-is by charin() and charout(). This is problematic if the database encoding is multibyte, because the result of charout() won't be validly encoded, which breaks various stuff that expects all text strings to be validly encoded. We've previously decided to enforce encoding validity rather than try to individually harden each place that might have a problem with such strings, so it's time to do something about "char".

To fix, represent high-bit-set characters as \ooo (backslash and three octal digits), following the ancient "escape" format for bytea. charin() will continue to accept the old way as well, though that is only reachable in single-byte encodings.

Add some test cases just so there is coverage for this code. We'll otherwise leave this question undocumented as it was before, because we don't really want to encourage end-user use of "char".

For the moment, back-patch into v15 so that this change appears in 15beta3. If there's not great pushback we should consider absorbing this change into the older branches.

Discussion: https://postgr.es/m/[email protected]
1 parent 1349d27 commit ec62ce5

File tree

6 files changed

+263
-28
lines changed

6 files changed

+263
-28
lines changed

Diff for: doc/src/sgml/datatype.sgml

+6-4
Original file line number | Diff line number | Diff line change
@@ -1338,17 +1338,19 @@ SELECT b, char_length(b) FROM test2;
13381338
<para>
13391339
There are two other fixed-length character types in
13401340
<productname>PostgreSQL</productname>, shown in <xref
1341-
linkend="datatype-character-special-table"/>. The <type>name</type>
1342-
type exists <emphasis>only</emphasis> for the storage of identifiers
1343-
in the internal system catalogs and is not intended for use by the general user. Its
1341+
linkend="datatype-character-special-table"/>.
1342+
These are not intended for general-purpose use, only for use
1343+
in the internal system catalogs.
1344+
The <type>name</type> type is used to store identifiers. Its
13441345
length is currently defined as 64 bytes (63 usable characters plus
13451346
terminator) but should be referenced using the constant
13461347
<symbol>NAMEDATALEN</symbol> in <literal>C</literal> source code.
13471348
The length is set at compile time (and
13481349
is therefore adjustable for special uses); the default maximum
13491350
length might change in a future release. The type <type>"char"</type>
13501351
(note the quotes) is different from <type>char(1)</type> in that it
1351-
only uses one byte of storage. It is internally used in the system
1352+
only uses one byte of storage, and therefore can store only a single
1353+
ASCII character. It is used in the system
13521354
catalogs as a simplistic enumeration type.
13531355
</para>
13541356

Diff for: src/backend/utils/adt/char.c

+56-16
Original file line number | Diff line number | Diff line change
@@ -20,38 +20,65 @@
2020
#include "libpq/pqformat.h"
2121
#include "utils/builtins.h"
2222

23+
#define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
24+
#define TOOCTAL(c) ((c) + '0')
25+
#define FROMOCTAL(c) ((unsigned char) (c) - '0')
26+
27+
2328
/*****************************************************************************
2429
* USER I/O ROUTINES *
2530
*****************************************************************************/
2631

2732
/*
2833
* charin - converts "x" to 'x'
2934
*
30-
* Note that an empty input string will implicitly be converted to \0.
35+
* This accepts the formats charout produces. If we have multibyte input
36+
* that is not in the form '\ooo', then we take its first byte as the value
37+
* and silently discard the rest; this is a backwards-compatibility provision.
3138
*/
3239
Datum
3340
charin(PG_FUNCTION_ARGS)
3441
{
3542
char *ch = PG_GETARG_CSTRING(0);
3643

44+
if (strlen(ch) == 4 && ch[0] == '\\' &&
45+
ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3]))
46+
PG_RETURN_CHAR((FROMOCTAL(ch[1]) << 6) +
47+
(FROMOCTAL(ch[2]) << 3) +
48+
FROMOCTAL(ch[3]));
49+
/* This will do the right thing for a zero-length input string */
3750
PG_RETURN_CHAR(ch[0]);
3851
}
3952

4053
/*
4154
* charout - converts 'x' to "x"
4255
*
43-
* Note that if the char value is \0, the resulting string will appear
44-
* to be empty (null-terminated after zero characters). So this is the
45-
* inverse of the charin() function for such data.
56+
* The possible output formats are:
57+
* 1. 0x00 is represented as an empty string.
58+
* 2. 0x01..0x7F are represented as a single ASCII byte.
59+
* 3. 0x80..0xFF are represented as \ooo (backslash and 3 octal digits).
60+
* Case 3 is meant to match the traditional "escape" format of bytea.
4661
*/
4762
Datum
4863
charout(PG_FUNCTION_ARGS)
4964
{
5065
char ch = PG_GETARG_CHAR(0);
51-
char *result = (char *) palloc(2);
66+
char *result = (char *) palloc(5);
5267

53-
result[0] = ch;
54-
result[1] = '\0';
68+
if (IS_HIGHBIT_SET(ch))
69+
{
70+
result[0] = '\\';
71+
result[1] = TOOCTAL(((unsigned char) ch) >> 6);
72+
result[2] = TOOCTAL((((unsigned char) ch) >> 3) & 07);
73+
result[3] = TOOCTAL(((unsigned char) ch) & 07);
74+
result[4] = '\0';
75+
}
76+
else
77+
{
78+
/* This produces acceptable results for 0x00 as well */
79+
result[0] = ch;
80+
result[1] = '\0';
81+
}
5582
PG_RETURN_CSTRING(result);
5683
}
5784

@@ -176,15 +203,20 @@ Datum
176203
text_char(PG_FUNCTION_ARGS)
177204
{
178205
text *arg1 = PG_GETARG_TEXT_PP(0);
206+
char *ch = VARDATA_ANY(arg1);
179207
char result;
180208

181209
/*
182-
* An empty input string is converted to \0 (for consistency with charin).
183-
* If the input is longer than one character, the excess data is silently
184-
* discarded.
210+
* Conversion rules are the same as in charin(), but here we need to
211+
* handle the empty-string case honestly.
185212
*/
186-
if (VARSIZE_ANY_EXHDR(arg1) > 0)
187-
result = *(VARDATA_ANY(arg1));
213+
if (VARSIZE_ANY_EXHDR(arg1) == 4 && ch[0] == '\\' &&
214+
ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3]))
215+
result = (FROMOCTAL(ch[1]) << 6) +
216+
(FROMOCTAL(ch[2]) << 3) +
217+
FROMOCTAL(ch[3]);
218+
else if (VARSIZE_ANY_EXHDR(arg1) > 0)
219+
result = ch[0];
188220
else
189221
result = '\0';
190222

@@ -195,13 +227,21 @@ Datum
195227
char_text(PG_FUNCTION_ARGS)
196228
{
197229
char arg1 = PG_GETARG_CHAR(0);
198-
text *result = palloc(VARHDRSZ + 1);
230+
text *result = palloc(VARHDRSZ + 4);
199231

200232
/*
201-
* Convert \0 to an empty string, for consistency with charout (and
202-
* because the text stuff doesn't like embedded nulls all that well).
233+
* Conversion rules are the same as in charout(), but here we need to be
234+
* honest about converting 0x00 to an empty string.
203235
*/
204-
if (arg1 != '\0')
236+
if (IS_HIGHBIT_SET(arg1))
237+
{
238+
SET_VARSIZE(result, VARHDRSZ + 4);
239+
(VARDATA(result))[0] = '\\';
240+
(VARDATA(result))[1] = TOOCTAL(((unsigned char) arg1) >> 6);
241+
(VARDATA(result))[2] = TOOCTAL((((unsigned char) arg1) >> 3) & 07);
242+
(VARDATA(result))[3] = TOOCTAL(((unsigned char) arg1) & 07);
243+
}
244+
else if (arg1 != '\0')
205245
{
206246
SET_VARSIZE(result, VARHDRSZ + 1);
207247
*(VARDATA(result)) = arg1;

Diff for: src/test/regress/expected/char.out

+61-2
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
--
22
-- CHAR
33
--
4-
-- fixed-length by value
5-
-- internally passed by value if <= 4 bytes in storage
4+
-- Per SQL standard, CHAR means character(1), that is a varlena type
5+
-- with a constraint restricting it to one character (not byte)
66
SELECT char 'c' = char 'c' AS true;
77
true
88
------
@@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
119119
abcd
120120
(4 rows)
121121

122+
--
123+
-- Also test "char", which is an ad-hoc one-byte type. It can only
124+
-- really store ASCII characters, but we allow high-bit-set characters
125+
-- to be accessed via bytea-like escapes.
126+
--
127+
SELECT 'a'::"char";
128+
char
129+
------
130+
a
131+
(1 row)
132+
133+
SELECT '\101'::"char";
134+
char
135+
------
136+
A
137+
(1 row)
138+
139+
SELECT '\377'::"char";
140+
char
141+
------
142+
\377
143+
(1 row)
144+
145+
SELECT 'a'::"char"::text;
146+
text
147+
------
148+
a
149+
(1 row)
150+
151+
SELECT '\377'::"char"::text;
152+
text
153+
------
154+
\377
155+
(1 row)
156+
157+
SELECT '\000'::"char"::text;
158+
text
159+
------
160+
161+
(1 row)
162+
163+
SELECT 'a'::text::"char";
164+
char
165+
------
166+
a
167+
(1 row)
168+
169+
SELECT '\377'::text::"char";
170+
char
171+
------
172+
\377
173+
(1 row)
174+
175+
SELECT ''::text::"char";
176+
char
177+
------
178+
179+
(1 row)
180+

Diff for: src/test/regress/expected/char_1.out

+61-2
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
--
22
-- CHAR
33
--
4-
-- fixed-length by value
5-
-- internally passed by value if <= 4 bytes in storage
4+
-- Per SQL standard, CHAR means character(1), that is a varlena type
5+
-- with a constraint restricting it to one character (not byte)
66
SELECT char 'c' = char 'c' AS true;
77
true
88
------
@@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
119119
abcd
120120
(4 rows)
121121

122+
--
123+
-- Also test "char", which is an ad-hoc one-byte type. It can only
124+
-- really store ASCII characters, but we allow high-bit-set characters
125+
-- to be accessed via bytea-like escapes.
126+
--
127+
SELECT 'a'::"char";
128+
char
129+
------
130+
a
131+
(1 row)
132+
133+
SELECT '\101'::"char";
134+
char
135+
------
136+
A
137+
(1 row)
138+
139+
SELECT '\377'::"char";
140+
char
141+
------
142+
\377
143+
(1 row)
144+
145+
SELECT 'a'::"char"::text;
146+
text
147+
------
148+
a
149+
(1 row)
150+
151+
SELECT '\377'::"char"::text;
152+
text
153+
------
154+
\377
155+
(1 row)
156+
157+
SELECT '\000'::"char"::text;
158+
text
159+
------
160+
161+
(1 row)
162+
163+
SELECT 'a'::text::"char";
164+
char
165+
------
166+
a
167+
(1 row)
168+
169+
SELECT '\377'::text::"char";
170+
char
171+
------
172+
\377
173+
(1 row)
174+
175+
SELECT ''::text::"char";
176+
char
177+
------
178+
179+
(1 row)
180+

Diff for: src/test/regress/expected/char_2.out

+61-2
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
--
22
-- CHAR
33
--
4-
-- fixed-length by value
5-
-- internally passed by value if <= 4 bytes in storage
4+
-- Per SQL standard, CHAR means character(1), that is a varlena type
5+
-- with a constraint restricting it to one character (not byte)
66
SELECT char 'c' = char 'c' AS true;
77
true
88
------
@@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL;
119119
abcd
120120
(4 rows)
121121

122+
--
123+
-- Also test "char", which is an ad-hoc one-byte type. It can only
124+
-- really store ASCII characters, but we allow high-bit-set characters
125+
-- to be accessed via bytea-like escapes.
126+
--
127+
SELECT 'a'::"char";
128+
char
129+
------
130+
a
131+
(1 row)
132+
133+
SELECT '\101'::"char";
134+
char
135+
------
136+
A
137+
(1 row)
138+
139+
SELECT '\377'::"char";
140+
char
141+
------
142+
\377
143+
(1 row)
144+
145+
SELECT 'a'::"char"::text;
146+
text
147+
------
148+
a
149+
(1 row)
150+
151+
SELECT '\377'::"char"::text;
152+
text
153+
------
154+
\377
155+
(1 row)
156+
157+
SELECT '\000'::"char"::text;
158+
text
159+
------
160+
161+
(1 row)
162+
163+
SELECT 'a'::text::"char";
164+
char
165+
------
166+
a
167+
(1 row)
168+
169+
SELECT '\377'::text::"char";
170+
char
171+
------
172+
\377
173+
(1 row)
174+
175+
SELECT ''::text::"char";
176+
char
177+
------
178+
179+
(1 row)
180+

0 commit comments

Comments
 (0)