-
Notifications
You must be signed in to change notification settings - Fork 7.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimization for htmlspecialchars function #18126
base: master
Are you sure you want to change the base?
Changes from 2 commits
75bc970
c6cb392
2a46d43
914fa23
63a9484
79d09c9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -74,6 +74,12 @@ | |||||||||||||
#define sjis_lead(c) ((c) != 0x80 && (c) != 0xA0 && (c) < 0xFD) | ||||||||||||||
#define sjis_trail(c) ((c) >= 0x40 && (c) != 0x7F && (c) < 0xFD) | ||||||||||||||
|
||||||||||||||
/* Lookup table for php_htmlspecialchars.
 *
 * Indexed by byte value. entity[c] points at the replacement text for byte c,
 * or is NULL when the byte is copied to the output unchanged; entity_len[c]
 * is the number of bytes to copy from entity[c] (terminating NUL excluded).
 * Filled in by init_htmlspecialchars_lut(). */
typedef struct {
	/* Only ever points at string literals, hence const. */
	const char *entity[256];
	/* `ushort` is a non-standard typedef; spell the type out portably. */
	unsigned short entity_len[256];
} htmlspecialchars_lut;
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Move this closer to the declaration of the function. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 80e7a41 |
||||||||||||||
|
||||||||||||||
/* {{{ get_default_charset */ | ||||||||||||||
static char *get_default_charset(void) { | ||||||||||||||
if (PG(internal_encoding) && PG(internal_encoding)[0]) { | ||||||||||||||
|
@@ -752,6 +758,60 @@ static zend_result resolve_named_entity_html(const char *start, size_t length, c | |||||||||||||
} | ||||||||||||||
/* }}} */ | ||||||||||||||
|
||||||||||||||
/* {{{ is_codepoint_allowed */ | ||||||||||||||
static inline zend_bool is_codepoint_allowed( | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 80e7a41 |
||||||||||||||
unsigned int cp, /* The codepoint to check */ | ||||||||||||||
enum entity_charset charset, /* Current charset */ | ||||||||||||||
int doctype, /* The doctype flags (ENT_HTML401, ENT_HTML5, etc.) */ | ||||||||||||||
const enc_to_uni* to_uni_table /* Mapping table if needed */ | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 80e7a41 |
||||||||||||||
) { | ||||||||||||||
// If charset is Unicode-compatible, the code point is used as-is | ||||||||||||||
if (CHARSET_UNICODE_COMPAT(charset)) { | ||||||||||||||
return unicode_cp_is_allowed(cp, doctype); | ||||||||||||||
} | ||||||||||||||
// If we have a mapping table (i.e., a non-UTF charset) | ||||||||||||||
if (to_uni_table) { | ||||||||||||||
map_to_unicode(cp, to_uni_table, &cp); | ||||||||||||||
return unicode_cp_is_allowed(cp, doctype); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
if (cp <= 0x7D) { | ||||||||||||||
return unicode_cp_is_allowed(cp, doctype); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
return 1; | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 80e7a41 |
||||||||||||||
} | ||||||||||||||
/* }}} */ | ||||||||||||||
|
||||||||||||||
/* {{{ init_htmlspecialchars_lut */ | ||||||||||||||
static void init_htmlspecialchars_lut(htmlspecialchars_lut* lut, const int flags, const int doctype) { | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 80e7a41 |
||||||||||||||
memset(lut, 0, sizeof(*lut)); | ||||||||||||||
|
||||||||||||||
lut->entity['&'] = "&amp;"; | ||||||||||||||
lut->entity['>'] = "&gt;"; | ||||||||||||||
lut->entity['<'] = "&lt;"; | ||||||||||||||
lut->entity_len['&'] = 5; | ||||||||||||||
lut->entity_len['>'] = 4; | ||||||||||||||
lut->entity_len['<'] = 4; | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of using magic numbers, the compiler will be smart enough to precompute this.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 80e7a41 |
||||||||||||||
|
||||||||||||||
if (flags & ENT_QUOTES & ENT_HTML_QUOTE_DOUBLE) { | ||||||||||||||
lut->entity['"'] = "&quot;"; | ||||||||||||||
lut->entity_len['"'] = 6; | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ditto There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 80e7a41 |
||||||||||||||
} | ||||||||||||||
|
||||||||||||||
if (flags & ENT_QUOTES & ENT_HTML_QUOTE_SINGLE) { | ||||||||||||||
char* apos = "&#039;"; | ||||||||||||||
if (doctype != ENT_HTML401) { | ||||||||||||||
if (doctype & (ENT_XML1 | ENT_XHTML | ENT_HTML5)) { | ||||||||||||||
apos = "&apos;"; | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
lut->entity['\''] = apos; | ||||||||||||||
lut->entity_len['\''] = 6; | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ditto There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 80e7a41 |
||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
/* }}} */ | ||||||||||||||
|
||||||||||||||
static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charset charset, unsigned code) { | ||||||||||||||
/* code is not necessarily a unicode code point */ | ||||||||||||||
switch (charset) { | ||||||||||||||
|
@@ -1304,6 +1364,179 @@ PHPAPI zend_string *php_escape_html_entities_ex(const unsigned char *old, size_t | |||||||||||||
} | ||||||||||||||
/* }}} */ | ||||||||||||||
|
||||||||||||||
/* {{{ php_htmlspecialchars_ex */ | ||||||||||||||
PHPAPI zend_string* php_htmlspecialchars_ex( | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this a PHPAPI? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the review! I’ll go through all the comments and make the necessary changes. I don’t have much experience contributing to public PHP projects yet, so I might not be fully aware of all the conventions and best practices. I was working off the existing code and reused some parts as-is. I plan to refactor php_html_entities later. The main goal here was to switch from a hashtable to a LUT for special character replacement, and to restructure the logic accordingly. |
||||||||||||||
const zend_string* input, const int flags, | ||||||||||||||
const char* hint_charset, const bool double_encode, | ||||||||||||||
const bool quiet | ||||||||||||||
) { | ||||||||||||||
const entity_ht* inv_map = NULL; | ||||||||||||||
htmlspecialchars_lut lut; | ||||||||||||||
const int doctype = flags & ENT_HTML_DOC_TYPE_MASK; | ||||||||||||||
|
||||||||||||||
const size_t initial_size = (ZSTR_LEN(input) < 64) | ||||||||||||||
? 256 | ||||||||||||||
: zend_safe_addmult(ZSTR_LEN(input), 2, 0, "htmlspecialchars"); | ||||||||||||||
zend_string* output = zend_string_alloc(initial_size, 0); | ||||||||||||||
|
||||||||||||||
size_t free_space = initial_size; | ||||||||||||||
char* output_ptr = ZSTR_VAL(output); | ||||||||||||||
const char* input_ptr = ZSTR_VAL(input); | ||||||||||||||
const char* input_end = input_ptr + input->len; | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 80e7a41 |
||||||||||||||
|
||||||||||||||
const enum entity_charset charset = determine_charset(hint_charset, quiet); | ||||||||||||||
const enc_to_uni* to_uni_table = NULL; | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 80e7a41 |
||||||||||||||
if (!CHARSET_UNICODE_COMPAT(charset)) { | ||||||||||||||
to_uni_table = enc_to_uni_index[charset]; | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
/* Replacement for invalid characters and byte sequences */ | ||||||||||||||
const unsigned char* replacement = NULL; | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in 80e7a41 |
||||||||||||||
size_t replacement_len = 0; | ||||||||||||||
if (flags & (ENT_HTML_SUBSTITUTE_ERRORS | ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS)) { | ||||||||||||||
if (charset == cs_utf_8) { | ||||||||||||||
replacement = (const unsigned char*)"\xEF\xBF\xBD"; | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why the cast? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Idk Fixed in 80e7a41 |
||||||||||||||
replacement_len = sizeof("\xEF\xBF\xBD") - 1; | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, we need to account for the fact that we’re replacing a single byte. |
||||||||||||||
} else { | ||||||||||||||
replacement = (const unsigned char*)"&#xFFFD;"; | ||||||||||||||
replacement_len = sizeof("&#xFFFD;") - 1; | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ditto prior remarks |
||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
init_htmlspecialchars_lut(&lut, flags, doctype); | ||||||||||||||
|
||||||||||||||
if (!double_encode) { | ||||||||||||||
inv_map = unescape_inverse_map(1, flags); | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
while (input_ptr < input_end) { | ||||||||||||||
const unsigned char c = *input_ptr; | ||||||||||||||
/* ASCII chars */ | ||||||||||||||
if (c < 0x80) { | ||||||||||||||
/* Handle HTML entities */ | ||||||||||||||
if (c == '&' && !double_encode) { | ||||||||||||||
const char* semicolon = memchr(input_ptr, ';', MIN(LONGEST_ENTITY_LENGTH + 1, input_end - input_ptr)); | ||||||||||||||
if (semicolon) { | ||||||||||||||
const size_t candidate_len = semicolon - (const char*)input_ptr + 1; | ||||||||||||||
unsigned dummy1, dummy2; | ||||||||||||||
|
||||||||||||||
/* Named entity */ | ||||||||||||||
if (resolve_named_entity_html((const char*)input_ptr + 1, candidate_len - 2, inv_map, &dummy1, | ||||||||||||||
&dummy2) == SUCCESS) { | ||||||||||||||
memcpy(output_ptr, input_ptr, candidate_len); | ||||||||||||||
output_ptr += candidate_len; | ||||||||||||||
input_ptr += candidate_len; | ||||||||||||||
free_space -= candidate_len; | ||||||||||||||
goto ensure_memory; | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
/* Numeric entity */ | ||||||||||||||
if (input_ptr[1] == '#') { | ||||||||||||||
unsigned code_point; | ||||||||||||||
char* start = (char*)input_ptr + 2; | ||||||||||||||
const int valid = process_numeric_entity((const char**)&start, &code_point); | ||||||||||||||
if (valid == SUCCESS && start == semicolon) { | ||||||||||||||
if (!(flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) || | ||||||||||||||
numeric_entity_is_allowed(code_point, doctype)) { | ||||||||||||||
memcpy(output_ptr, input_ptr, candidate_len); | ||||||||||||||
output_ptr += candidate_len; | ||||||||||||||
input_ptr += candidate_len; | ||||||||||||||
free_space -= candidate_len; | ||||||||||||||
goto ensure_memory; | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
/* Invalid entity */ | ||||||||||||||
memcpy(output_ptr, "&amp;", 5); | ||||||||||||||
output_ptr += 5; | ||||||||||||||
free_space -= 5; | ||||||||||||||
input_ptr++; | ||||||||||||||
goto ensure_memory; | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
/* Check disallowed chars */ | ||||||||||||||
if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) { | ||||||||||||||
if (!is_codepoint_allowed(c, charset, doctype, NULL)) { | ||||||||||||||
memcpy(output_ptr, replacement, replacement_len); | ||||||||||||||
output_ptr += replacement_len; | ||||||||||||||
free_space -= replacement_len; | ||||||||||||||
input_ptr++; | ||||||||||||||
goto ensure_memory; | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
/* Use lookup table for fast replace */ | ||||||||||||||
if (lut.entity[c]) { | ||||||||||||||
const size_t entity_len = lut.entity_len[c]; | ||||||||||||||
memcpy(output_ptr, lut.entity[c], entity_len); | ||||||||||||||
output_ptr += entity_len; | ||||||||||||||
free_space -= entity_len; | ||||||||||||||
} else { | ||||||||||||||
*output_ptr++ = c; | ||||||||||||||
free_space--; | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
input_ptr++; | ||||||||||||||
} else { | ||||||||||||||
/* Multibyte chars */ | ||||||||||||||
zend_result status; | ||||||||||||||
const size_t original_pos = (const char*)input_ptr - ZSTR_VAL(input); | ||||||||||||||
size_t cursor = original_pos; | ||||||||||||||
const unsigned int this_char = get_next_char(charset, (unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), | ||||||||||||||
&cursor, &status); | ||||||||||||||
const size_t processed_len = cursor - original_pos; | ||||||||||||||
|
||||||||||||||
if (status == FAILURE) { | ||||||||||||||
if (flags & ENT_HTML_IGNORE_ERRORS) { | ||||||||||||||
input_ptr += processed_len; | ||||||||||||||
continue; | ||||||||||||||
} | ||||||||||||||
if (flags & ENT_HTML_SUBSTITUTE_ERRORS) { | ||||||||||||||
memcpy(output_ptr, replacement, replacement_len); | ||||||||||||||
output_ptr += replacement_len; | ||||||||||||||
free_space -= replacement_len; | ||||||||||||||
input_ptr += processed_len; | ||||||||||||||
} else { | ||||||||||||||
zend_string_release(output); | ||||||||||||||
return ZSTR_EMPTY_ALLOC(); | ||||||||||||||
} | ||||||||||||||
} else { | ||||||||||||||
/* Check disallowed chars */ | ||||||||||||||
const unsigned char* sequence = (unsigned char*)input_ptr; | ||||||||||||||
size_t sequence_len = processed_len; | ||||||||||||||
|
||||||||||||||
if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) { | ||||||||||||||
if (!is_codepoint_allowed(this_char, charset, doctype, to_uni_table)) { | ||||||||||||||
sequence = replacement; | ||||||||||||||
sequence_len = replacement_len; | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
memcpy(output_ptr, sequence, sequence_len); | ||||||||||||||
output_ptr += sequence_len; | ||||||||||||||
free_space -= sequence_len; | ||||||||||||||
input_ptr += processed_len; | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
ensure_memory: | ||||||||||||||
if (free_space < 128) { | ||||||||||||||
const size_t used = ZSTR_LEN(output) - free_space; | ||||||||||||||
const size_t new_size = used + 1024; | ||||||||||||||
output = zend_string_realloc(output, new_size, 0); | ||||||||||||||
output_ptr = ZSTR_VAL(output) + used; | ||||||||||||||
free_space = new_size - used; | ||||||||||||||
} | ||||||||||||||
} | ||||||||||||||
|
||||||||||||||
*output_ptr = '\0'; | ||||||||||||||
ZSTR_LEN(output) = (output_ptr - ZSTR_VAL(output)); | ||||||||||||||
return output; | ||||||||||||||
} | ||||||||||||||
/* }}} */ | ||||||||||||||
|
||||||||||||||
/* {{{ php_html_entities */ | ||||||||||||||
static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all) | ||||||||||||||
{ | ||||||||||||||
|
@@ -1327,10 +1560,33 @@ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all) | |||||||||||||
} | ||||||||||||||
/* }}} */ | ||||||||||||||
|
||||||||||||||
/* {{{ php_html_entities */ | ||||||||||||||
static void php_htmlspecialchars(INTERNAL_FUNCTION_PARAMETERS) | ||||||||||||||
{ | ||||||||||||||
zend_string *str, *hint_charset = NULL; | ||||||||||||||
zend_long flags = ENT_QUOTES|ENT_SUBSTITUTE; | ||||||||||||||
zend_string *replaced; | ||||||||||||||
bool double_encode = 1; | ||||||||||||||
|
||||||||||||||
ZEND_PARSE_PARAMETERS_START(1, 4) | ||||||||||||||
Z_PARAM_STR(str) | ||||||||||||||
Z_PARAM_OPTIONAL | ||||||||||||||
Z_PARAM_LONG(flags) | ||||||||||||||
Z_PARAM_STR_OR_NULL(hint_charset) | ||||||||||||||
Z_PARAM_BOOL(double_encode); | ||||||||||||||
ZEND_PARSE_PARAMETERS_END(); | ||||||||||||||
|
||||||||||||||
replaced = php_htmlspecialchars_ex( | ||||||||||||||
str, (int) flags, | ||||||||||||||
hint_charset ? ZSTR_VAL(hint_charset) : NULL, double_encode, /* quiet */ 0); | ||||||||||||||
RETVAL_STR(replaced); | ||||||||||||||
} | ||||||||||||||
/* }}} */ | ||||||||||||||
|
||||||||||||||
/* {{{ Convert special characters to HTML entities */ | ||||||||||||||
PHP_FUNCTION(htmlspecialchars) | ||||||||||||||
{ | ||||||||||||||
php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); | ||||||||||||||
php_htmlspecialchars(INTERNAL_FUNCTION_PARAM_PASSTHRU); | ||||||||||||||
} | ||||||||||||||
/* }}} */ | ||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are you using a pass through function when there is only ever one use case now? Especially as you are adding a C function call overhead, which is weird for an optimization PR. You should also "inline" the other usage of the php_html_entities function. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Merged in 80e7a41 |
||||||||||||||
|
||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,10 +2,10 @@ | |
Bug #60965: Buffer overflow on htmlspecialchars/entities with $double=false | ||
--FILE-- | ||
<?php | ||
echo htmlspecialchars('"""""""""""""""""""""""""""""""""""""""""""""', | ||
echo htmlspecialchars('"""""""""""""""""""""""""""""""""""""""""""""�', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added support for the LONGEST_ENTITY_LENGTH constant to define the maximum length of an entity. While it originally applies to named entities, I think it also makes sense to use it to limit the length of numeric entities. There’s no strict limit on numeric entity length in the HTML or XML specs, but in practice the longest valid one is `&#x10FFFF;`. Any numeric entities longer than that are effectively invalid and won’t be processed by browsers anyway. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not keep this and just add an extra case? That's my main question. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I see what you meant. I’ve reverted to the previous value and added an extra check. Now the original value in the test is no longer processed, and the numeric value of the entity is not computed. 914fa23 |
||
ENT_QUOTES, 'UTF-8', false), "\n"; | ||
echo "Done.\n"; | ||
?> | ||
--EXPECT-- | ||
""""""""""""""""""""""""""""""""""""""""""""" | ||
"""""""""""""""""""""""""""""""""""""""""""""&#x123456789123456789123456789; | ||
Done. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ushort
is not standard.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed in 80e7a41