From 75bc9702e4d531c82e0a82ea07be59825b49b744 Mon Sep 17 00:00:00 2001 From: Artem Ukrainskiy Date: Fri, 21 Mar 2025 16:17:21 +0300 Subject: [PATCH 1/5] Optimization for htmlspecialchars function. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A dedicated php_htmlspecialchars function instead of the “universal” php_escape_html_entities_ex. We work with ASCII-compatible encodings, we can employ byte-by-byte scanning and a lookup table to identify special characters. For c < 0x80, the lookup table is used; for potentially multi-byte characters, we continue to rely on get_next_char. This approach provides a noticeable performance improvement for ASCII strings and some improvement for multi-byte strings due to more optimized logic. --- ext/standard/html.c | 258 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 257 insertions(+), 1 deletion(-) diff --git a/ext/standard/html.c b/ext/standard/html.c index 0c6231d590d88..333636abfe48a 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -74,6 +74,12 @@ #define sjis_lead(c) ((c) != 0x80 && (c) != 0xA0 && (c) < 0xFD) #define sjis_trail(c) ((c) >= 0x40 && (c) != 0x7F && (c) < 0xFD) +/* Lookup table for php_htmlspecialchars */ +typedef struct { + char* entity[256]; + ushort entity_len[256]; +} htmlspecialchars_lut; + /* {{{ get_default_charset */ static char *get_default_charset(void) { if (PG(internal_encoding) && PG(internal_encoding)[0]) { @@ -752,6 +758,60 @@ static zend_result resolve_named_entity_html(const char *start, size_t length, c } /* }}} */ +/* {{{ is_codepoint_allowed */ +static inline zend_bool is_codepoint_allowed( + unsigned int cp, /* The codepoint to check */ + enum entity_charset charset, /* Current charset */ + int doctype, /* The doctype flags (ENT_HTML401, ENT_HTML5, etc.) 
*/ + const enc_to_uni* to_uni_table /* Mapping table if needed */ + ) { + // If charset is Unicode-compatible, the code point is used as-is + if (CHARSET_UNICODE_COMPAT(charset)) { + return unicode_cp_is_allowed(cp, doctype); + } + // If we have a mapping table (i.e., a non-UTF charset) + if (to_uni_table) { + map_to_unicode(cp, to_uni_table, &cp); + return unicode_cp_is_allowed(cp, doctype); + } + + if (cp <= 0x7D) { + return unicode_cp_is_allowed(cp, doctype); + } + + return 1; +} +/* }}} */ + +/* {{{ init_htmlspecialchars_lut */ +static void init_htmlspecialchars_lut(htmlspecialchars_lut* lut, const int flags, const int doctype) { + memset(lut, 0, sizeof(*lut)); + + lut->entity['&'] = "&"; + lut->entity['>'] = ">"; + lut->entity['<'] = "<"; + lut->entity_len['&'] = 5; + lut->entity_len['>'] = 4; + lut->entity_len['<'] = 4; + + if (flags & ENT_QUOTES & ENT_HTML_QUOTE_DOUBLE) { + lut->entity['"'] = """; + lut->entity_len['"'] = 6; + } + + if (flags & ENT_QUOTES & ENT_HTML_QUOTE_SINGLE) { + char* apos = "'"; + if (doctype != ENT_HTML401) { + if (doctype & (ENT_XML1 | ENT_XHTML | ENT_HTML5)) { + apos = "'"; + } + } + lut->entity['\''] = apos; + lut->entity_len['\''] = 6; + } +} +/* }}} */ + static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charset charset, unsigned code) { /* code is not necessarily a unicode code point */ switch (charset) { @@ -1304,6 +1364,179 @@ PHPAPI zend_string *php_escape_html_entities_ex(const unsigned char *old, size_t } /* }}} */ +/* {{{ php_htmlspecialchars */ +PHPAPI zend_string* php_htmlspecialchars_ex( + const zend_string* input, const int flags, + const char* hint_charset, const bool double_encode, + const bool quiet + ) { + const entity_ht* inv_map = NULL; + htmlspecialchars_lut lut; + const int doctype = flags & ENT_HTML_DOC_TYPE_MASK; + + const size_t initial_size = (ZSTR_LEN(input) < 64) + ? 
256 + : zend_safe_addmult(ZSTR_LEN(input), 2, 0, "htmlspecialchars"); + zend_string* output = zend_string_alloc(initial_size, 0); + + size_t free_space = initial_size; + char* output_ptr = ZSTR_VAL(output); + const char* input_ptr = ZSTR_VAL(input); + const char* input_end = input_ptr + input->len; + + const enum entity_charset charset = determine_charset(hint_charset, quiet); + const enc_to_uni* to_uni_table = NULL; + if (!CHARSET_UNICODE_COMPAT(charset)) { + to_uni_table = enc_to_uni_index[charset]; + } + + /* Replacement for invalid characters and byte sequences */ + const unsigned char* replacement = NULL; + size_t replacement_len = 0; + if (flags & (ENT_HTML_SUBSTITUTE_ERRORS | ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS)) { + if (charset == cs_utf_8) { + replacement = (const unsigned char*)"\xEF\xBF\xBD"; + replacement_len = sizeof("\xEF\xBF\xBD") - 1; + } else { + replacement = (const unsigned char*)"�"; + replacement_len = sizeof("�") - 1; + } + } + + init_htmlspecialchars_lut(&lut, flags, doctype); + + if (!double_encode) { + inv_map = unescape_inverse_map(1, flags); + } + + while (input_ptr < input_end) { + const unsigned char c = *input_ptr; + /* ASCII chars */ + if (c < 0x80) { + /* Handle HTML entities */ + if (c == '&' && !double_encode) { + const char* semicolon = memchr(input_ptr, ';', MIN(LONGEST_ENTITY_LENGTH + 1, input_end - input_ptr)); + if (semicolon) { + const size_t candidate_len = semicolon - (const char*)input_ptr + 1; + unsigned dummy1, dummy2; + + /* Named entity */ + if (resolve_named_entity_html((const char*)input_ptr + 1, candidate_len - 2, inv_map, &dummy1, + &dummy2) == SUCCESS) { + memcpy(output_ptr, input_ptr, candidate_len); + output_ptr += candidate_len; + input_ptr += candidate_len; + free_space -= candidate_len; + goto ensure_memory; + } + + /* Numeric entity */ + if (input_ptr[1] == '#') { + unsigned code_point; + char* start = (char*)input_ptr + 2; + const int valid = process_numeric_entity((const char**)&start, &code_point); + if 
(valid == SUCCESS && start == semicolon) { + if (!(flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) || + numeric_entity_is_allowed(code_point, doctype)) { + memcpy(output_ptr, input_ptr, candidate_len); + output_ptr += candidate_len; + input_ptr += candidate_len; + free_space -= candidate_len; + goto ensure_memory; + } + } + } + } + + /* Invalid entity */ + memcpy(output_ptr, "&", 5); + output_ptr += 5; + free_space -= 5; + input_ptr++; + goto ensure_memory; + } + + /* Check disallowed chars */ + if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) { + if (!is_codepoint_allowed(c, charset, doctype, NULL)) { + memcpy(output_ptr, replacement, replacement_len); + output_ptr += replacement_len; + free_space -= replacement_len; + input_ptr++; + goto ensure_memory; + } + } + + /* Use lookup table for fast replace */ + if (lut.entity[c]) { + const size_t entity_len = lut.entity_len[c]; + memcpy(output_ptr, lut.entity[c], entity_len); + output_ptr += entity_len; + free_space -= entity_len; + } else { + *output_ptr++ = c; + free_space--; + } + + input_ptr++; + } else { + /* Multibyte chars */ + zend_result status; + const size_t original_pos = (const char*)input_ptr - ZSTR_VAL(input); + size_t cursor = original_pos; + const unsigned int this_char = get_next_char(charset, (unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), + &cursor, &status); + const size_t processed_len = cursor - original_pos; + + if (status == FAILURE) { + if (flags & ENT_HTML_IGNORE_ERRORS) { + input_ptr += processed_len; + continue; + } + if (flags & ENT_HTML_SUBSTITUTE_ERRORS) { + memcpy(output_ptr, replacement, replacement_len); + output_ptr += replacement_len; + free_space -= replacement_len; + input_ptr += processed_len; + } else { + zend_string_release(output); + return ZSTR_EMPTY_ALLOC(); + } + } else { + /* Check disallowed chars */ + const unsigned char* sequence = (unsigned char*)input_ptr; + size_t sequence_len = processed_len; + + if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) { + if 
(!is_codepoint_allowed(this_char, charset, doctype, to_uni_table)) { + sequence = replacement; + sequence_len = replacement_len; + } + } + + memcpy(output_ptr, sequence, sequence_len); + output_ptr += sequence_len; + free_space -= sequence_len; + input_ptr += processed_len; + } + } + + ensure_memory: + if (free_space < 128) { + const size_t used = ZSTR_LEN(output) - free_space; + const size_t new_size = used + 1024; + output = zend_string_realloc(output, new_size, 0); + output_ptr = ZSTR_VAL(output) + used; + free_space = new_size - used; + } + } + + *output_ptr = '\0'; + ZSTR_LEN(output) = (output_ptr - ZSTR_VAL(output)); + return output; +} +/* }}} */ + /* {{{ php_html_entities */ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all) { @@ -1327,10 +1560,33 @@ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all) } /* }}} */ +/* {{{ php_html_entities */ +static void php_htmlspecialchars(INTERNAL_FUNCTION_PARAMETERS) +{ + zend_string *str, *hint_charset = NULL; + zend_long flags = ENT_QUOTES|ENT_SUBSTITUTE; + zend_string *replaced; + bool double_encode = 1; + + ZEND_PARSE_PARAMETERS_START(1, 4) + Z_PARAM_STR(str) + Z_PARAM_OPTIONAL + Z_PARAM_LONG(flags) + Z_PARAM_STR_OR_NULL(hint_charset) + Z_PARAM_BOOL(double_encode); + ZEND_PARSE_PARAMETERS_END(); + + replaced = php_htmlspecialchars_ex( + str, (int) flags, + hint_charset ? ZSTR_VAL(hint_charset) : NULL, double_encode, /* quiet */ 0); + RETVAL_STR(replaced); +} +/* }}} */ + /* {{{ Convert special characters to HTML entities */ PHP_FUNCTION(htmlspecialchars) { - php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0); + php_htmlspecialchars(INTERNAL_FUNCTION_PARAM_PASSTHRU); } /* }}} */ From c6cb392f1a8e85d5a83ca3595f450d7c78a83f86 Mon Sep 17 00:00:00 2001 From: Artem Ukrainskiy Date: Fri, 21 Mar 2025 16:24:11 +0300 Subject: [PATCH 2/5] Resizing a test entity. The new htmlspecialchars function respects the maximum entity size, defined as LONGEST_ENTITY_LENGTH. 
There is no strict limit on the length of a numeric entity in the HTML and XML specifications, but in practice the maximum possible is &#x10FFFF;, which takes up 10 characters. Any numeric entities larger than this size are effectively invalid and will not be processed by browsers. --- ext/standard/tests/strings/bug60965.phpt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/standard/tests/strings/bug60965.phpt b/ext/standard/tests/strings/bug60965.phpt index b370d225692f2..59dd140d9869f 100644 --- a/ext/standard/tests/strings/bug60965.phpt +++ b/ext/standard/tests/strings/bug60965.phpt @@ -2,10 +2,10 @@ Bug #60965: Buffer overflow on htmlspecialchars/entities with $double=false --FILE-- --EXPECT-- -""""""""""""""""""""""""""""""""""""""""""""" +"""""""""""""""""""""""""""""""""""""""""""""&#x123456789123456789123456789; Done. From 914fa23242a35fc0d667140585a0f11c23d8a0e1 Mon Sep 17 00:00:00 2001 From: Artem Ukrainskiy Date: Tue, 1 Apr 2025 17:54:50 +0300 Subject: [PATCH 3/5] This test was updated to reflect the new logic for handling HTML entities. The $double_encode = false flag only applies to valid entities with a maximum length of 10 characters. The largest valid numeric entity is &#x10FFFF;. An entity like &#xFFFFFF; will be parsed and its numeric value computed, but it won't be escaped, since it exceeds the valid Unicode range. An entity like &#xFFFFFFF; (11 characters) will not be processed at all. 
--- ext/standard/tests/strings/bug60965.phpt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ext/standard/tests/strings/bug60965.phpt b/ext/standard/tests/strings/bug60965.phpt index 59dd140d9869f..c8ed63addbe62 100644 --- a/ext/standard/tests/strings/bug60965.phpt +++ b/ext/standard/tests/strings/bug60965.phpt @@ -2,10 +2,12 @@ Bug #60965: Buffer overflow on htmlspecialchars/entities with $double=false --FILE-- --EXPECT-- -"""""""""""""""""""""""""""""""""""""""""""""&#x123456789123456789123456789; +"""""""""""""""""""""""""""""""""""""""""""""&#x000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000005; +&#xFFFFFF; Done. From 63a9484519275296a5e05bc153ad497db1911dde Mon Sep 17 00:00:00 2001 From: Artem Ukrainskiy Date: Tue, 1 Apr 2025 17:59:06 +0300 Subject: [PATCH 4/5] validate_utf8_char A separate optimized validate_utf8_char function is used for validating multi-byte UTF-8 characters. The optimization comes from using more straightforward conditional logic and bitwise operations for faster execution. 
--- ext/standard/html.c | 146 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 137 insertions(+), 9 deletions(-) diff --git a/ext/standard/html.c b/ext/standard/html.c index 333636abfe48a..4ec2636a2ecb3 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -812,6 +812,126 @@ static void init_htmlspecialchars_lut(htmlspecialchars_lut* lut, const int flags } /* }}} */ +static unsigned int validate_utf8_char( + const unsigned char *str, + const size_t str_len, + size_t* cursor, + zend_result* status +) { + const size_t pos = *cursor; + *status = SUCCESS; + const size_t tail_len = str_len - pos; + + /* Check if at least 1 byte is available */ + if (tail_len < 1) { + MB_FAILURE(pos, 1); + } + + const unsigned char c = str[pos]; + + /* ASCII (single byte) */ + if (c < 0x80) { + *cursor = pos + 1; + return c; + } + + /* Leading byte < 0xC2 => invalid multibyte start */ + if (c < 0xC2) { + MB_FAILURE(pos, 1); + } + + /* 2-byte sequence (0xC2..0xDF) */ + if (c < 0xE0) { + /* Need 2 bytes total */ + if (tail_len < 2) { + MB_FAILURE(pos, 1); + } + const unsigned char b2 = str[pos + 1]; + + /* Check continuation byte 10xxxxxx */ + if ((b2 & 0xC0) != 0x80) { + MB_FAILURE(pos, ((b2 < 0x80) || (b2 >= 0xC2 && b2 <= 0xF4)) ? 
1 : 2); + } + + /* Combine bits into code point and check range >= 0x80 */ + const unsigned int cp = ((c & 0x1F) << 6) | (b2 & 0x3F); + if (cp < 0x80) { + MB_FAILURE(pos, 2); + } + + *cursor = pos + 2; + return cp; + } + + /* 3-byte sequence (0xE0..0xEF) */ + if (c < 0xF0) { + /* Need 3 bytes total and valid continuation bytes */ + if (tail_len < 3 || + ((str[pos + 1] & 0xC0) != 0x80) || + ((str[pos + 2] & 0xC0) != 0x80)) { + if (tail_len < 2 || + ((str[pos + 1] < 0x80) || (str[pos + 1] >= 0xC2 && str[pos + 1] <= 0xF4))) { + MB_FAILURE(pos, 1); + } else if (tail_len < 3 || + ((str[pos + 2] < 0x80) || (str[pos + 2] >= 0xC2 && str[pos + 2] <= 0xF4))) { + MB_FAILURE(pos, 2); + } else { + MB_FAILURE(pos, 3); + } + } + + /* Combine bits and check for >= 0x800 and not in surrogate area */ + const unsigned int cp = ((c & 0x0F) << 12) + | ((str[pos + 1] & 0x3F) << 6) + | (str[pos + 2] & 0x3F); + + if (cp < 0x800 || (cp >= 0xD800 && cp <= 0xDFFF)) { + MB_FAILURE(pos, 3); + } + + *cursor = pos + 3; + return cp; + } + + /* 4-byte sequence (0xF0..0xF4) */ + if (c < 0xF5) { + /* Need 4 bytes total and valid continuation bytes */ + if (tail_len < 4 || + ((str[pos + 1] & 0xC0) != 0x80) || + ((str[pos + 2] & 0xC0) != 0x80) || + ((str[pos + 3] & 0xC0) != 0x80)) { + if (tail_len < 2 || + ((str[pos + 1] < 0x80) || (str[pos + 1] >= 0xC2 && str[pos + 1] <= 0xF4))) { + MB_FAILURE(pos, 1); + } else if (tail_len < 3 || + ((str[pos + 2] < 0x80) || (str[pos + 2] >= 0xC2 && str[pos + 2] <= 0xF4))) { + MB_FAILURE(pos, 2); + } else if (tail_len < 4 || + ((str[pos + 3] < 0x80) || (str[pos + 3] >= 0xC2 && str[pos + 3] <= 0xF4))) { + MB_FAILURE(pos, 3); + } else { + MB_FAILURE(pos, 4); + } + } + + /* Combine bits and check range 0x10000..0x10FFFF */ + const unsigned int cp = ((c & 0x07) << 18) + | ((str[pos + 1] & 0x3F) << 12) + | ((str[pos + 2] & 0x3F) << 6) + | (str[pos + 3] & 0x3F); + + if (cp < 0x10000 || cp > 0x10FFFF) { + MB_FAILURE(pos, 4); + } + + *cursor = pos + 4; + return cp; + } + + 
/* Leading byte >= 0xF5 is invalid */ + MB_FAILURE(pos, 1); +} + static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charset charset, unsigned code) { /* code is not necessarily a unicode code point */ switch (charset) { @@ -1478,15 +1598,23 @@ PHPAPI zend_string* php_htmlspecialchars_ex( free_space--; } - input_ptr++; - } else { - /* Multibyte chars */ - zend_result status; - const size_t original_pos = (const char*)input_ptr - ZSTR_VAL(input); - size_t cursor = original_pos; - const unsigned int this_char = get_next_char(charset, (unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), - &cursor, &status); - const size_t processed_len = cursor - original_pos; + input_ptr++; + } else { + /* Multibyte chars */ + zend_result status; + const size_t original_pos = (const char*)input_ptr - ZSTR_VAL(input); + size_t cursor = original_pos; + + unsigned int this_char = 0; + if (charset == cs_utf_8) { + this_char = validate_utf8_char((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), + &cursor, &status); + } else { + this_char = get_next_char(charset, (unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input), + &cursor, &status); + } + + const size_t processed_len = cursor - original_pos; if (status == FAILURE) { if (flags & ENT_HTML_IGNORE_ERRORS) { From 79d09c9a8d7ed51dcedbd74e2ce6545500eaafc3 Mon Sep 17 00:00:00 2001 From: Artem Ukrainskiy Date: Tue, 1 Apr 2025 17:59:44 +0300 Subject: [PATCH 5/5] estyle, pr fixes, small refactoring factoring --- ext/standard/html.c | 373 ++++++++++++++++++++++---------------------- ext/standard/html.h | 1 + 2 files changed, 186 insertions(+), 188 deletions(-) diff --git a/ext/standard/html.c b/ext/standard/html.c index 4ec2636a2ecb3..bb41814c8c6cf 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -74,12 +74,6 @@ #define sjis_lead(c) ((c) != 0x80 && (c) != 0xA0 && (c) < 0xFD) #define sjis_trail(c) ((c) >= 0x40 && (c) != 0x7F && (c) < 0xFD) -/* Lookup table for php_htmlspecialchars */ -typedef struct { - char* 
entity[256]; - ushort entity_len[256]; -} htmlspecialchars_lut; - /* {{{ get_default_charset */ static char *get_default_charset(void) { if (PG(internal_encoding) && PG(internal_encoding)[0]) { @@ -759,11 +753,11 @@ static zend_result resolve_named_entity_html(const char *start, size_t length, c /* }}} */ /* {{{ is_codepoint_allowed */ -static inline zend_bool is_codepoint_allowed( - unsigned int cp, /* The codepoint to check */ +static inline bool is_codepoint_allowed( + unsigned int cp, /* The codepoint to check */ enum entity_charset charset, /* Current charset */ - int doctype, /* The doctype flags (ENT_HTML401, ENT_HTML5, etc.) */ - const enc_to_uni* to_uni_table /* Mapping table if needed */ + int doctype, /* The doctype flags (ENT_HTML401, ENT_HTML5, etc.) */ + const enc_to_uni *to_uni_table /* Mapping table if needed */ ) { // If charset is Unicode-compatible, the code point is used as-is if (CHARSET_UNICODE_COMPAT(charset)) { @@ -779,35 +773,41 @@ static inline zend_bool is_codepoint_allowed( return unicode_cp_is_allowed(cp, doctype); } - return 1; + return true; } /* }}} */ +/* Lookup table for php_htmlspecialchars */ +typedef struct { + char *entity[256]; + uint8_t entity_len[256]; +} htmlspecialchars_lut; + /* {{{ init_htmlspecialchars_lut */ -static void init_htmlspecialchars_lut(htmlspecialchars_lut* lut, const int flags, const int doctype) { +static void init_htmlspecialchars_lut(htmlspecialchars_lut *lut, const int flags, const int doctype) { memset(lut, 0, sizeof(*lut)); lut->entity['&'] = "&"; lut->entity['>'] = ">"; lut->entity['<'] = "<"; - lut->entity_len['&'] = 5; - lut->entity_len['>'] = 4; - lut->entity_len['<'] = 4; + lut->entity_len['&'] = strlen(lut->entity['&']); + lut->entity_len['>'] = strlen(lut->entity['>']); + lut->entity_len['<'] = strlen(lut->entity['<']); if (flags & ENT_QUOTES & ENT_HTML_QUOTE_DOUBLE) { lut->entity['"'] = """; - lut->entity_len['"'] = 6; + lut->entity_len['"'] = strlen(lut->entity['"']); } if (flags & ENT_QUOTES 
& ENT_HTML_QUOTE_SINGLE) { - char* apos = "'"; + char *apos = "'"; if (doctype != ENT_HTML401) { if (doctype & (ENT_XML1 | ENT_XHTML | ENT_HTML5)) { apos = "'"; } } lut->entity['\''] = apos; - lut->entity_len['\''] = 6; + lut->entity_len['\''] = strlen(apos); } } /* }}} */ @@ -1485,118 +1485,118 @@ PHPAPI zend_string *php_escape_html_entities_ex(const unsigned char *old, size_t /* }}} */ /* {{{ php_htmlspecialchars */ -PHPAPI zend_string* php_htmlspecialchars_ex( - const zend_string* input, const int flags, - const char* hint_charset, const bool double_encode, - const bool quiet - ) { - const entity_ht* inv_map = NULL; - htmlspecialchars_lut lut; - const int doctype = flags & ENT_HTML_DOC_TYPE_MASK; - - const size_t initial_size = (ZSTR_LEN(input) < 64) - ? 256 - : zend_safe_addmult(ZSTR_LEN(input), 2, 0, "htmlspecialchars"); - zend_string* output = zend_string_alloc(initial_size, 0); - - size_t free_space = initial_size; - char* output_ptr = ZSTR_VAL(output); - const char* input_ptr = ZSTR_VAL(input); - const char* input_end = input_ptr + input->len; - - const enum entity_charset charset = determine_charset(hint_charset, quiet); - const enc_to_uni* to_uni_table = NULL; - if (!CHARSET_UNICODE_COMPAT(charset)) { - to_uni_table = enc_to_uni_index[charset]; - } - - /* Replacement for invalid characters and byte sequences */ - const unsigned char* replacement = NULL; - size_t replacement_len = 0; - if (flags & (ENT_HTML_SUBSTITUTE_ERRORS | ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS)) { - if (charset == cs_utf_8) { - replacement = (const unsigned char*)"\xEF\xBF\xBD"; - replacement_len = sizeof("\xEF\xBF\xBD") - 1; - } else { - replacement = (const unsigned char*)"�"; - replacement_len = sizeof("�") - 1; - } - } - - init_htmlspecialchars_lut(&lut, flags, doctype); - - if (!double_encode) { - inv_map = unescape_inverse_map(1, flags); - } - - while (input_ptr < input_end) { - const unsigned char c = *input_ptr; - /* ASCII chars */ - if (c < 0x80) { - /* Handle HTML entities */ 
- if (c == '&' && !double_encode) { - const char* semicolon = memchr(input_ptr, ';', MIN(LONGEST_ENTITY_LENGTH + 1, input_end - input_ptr)); - if (semicolon) { - const size_t candidate_len = semicolon - (const char*)input_ptr + 1; - unsigned dummy1, dummy2; - - /* Named entity */ - if (resolve_named_entity_html((const char*)input_ptr + 1, candidate_len - 2, inv_map, &dummy1, - &dummy2) == SUCCESS) { - memcpy(output_ptr, input_ptr, candidate_len); - output_ptr += candidate_len; - input_ptr += candidate_len; - free_space -= candidate_len; - goto ensure_memory; - } - - /* Numeric entity */ - if (input_ptr[1] == '#') { - unsigned code_point; - char* start = (char*)input_ptr + 2; - const int valid = process_numeric_entity((const char**)&start, &code_point); - if (valid == SUCCESS && start == semicolon) { - if (!(flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) || - numeric_entity_is_allowed(code_point, doctype)) { - memcpy(output_ptr, input_ptr, candidate_len); - output_ptr += candidate_len; - input_ptr += candidate_len; - free_space -= candidate_len; - goto ensure_memory; - } - } - } - } - - /* Invalid entity */ - memcpy(output_ptr, "&", 5); - output_ptr += 5; - free_space -= 5; - input_ptr++; - goto ensure_memory; - } - - /* Check disallowed chars */ - if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) { - if (!is_codepoint_allowed(c, charset, doctype, NULL)) { - memcpy(output_ptr, replacement, replacement_len); - output_ptr += replacement_len; - free_space -= replacement_len; - input_ptr++; - goto ensure_memory; - } - } - - /* Use lookup table for fast replace */ - if (lut.entity[c]) { - const size_t entity_len = lut.entity_len[c]; - memcpy(output_ptr, lut.entity[c], entity_len); - output_ptr += entity_len; - free_space -= entity_len; - } else { - *output_ptr++ = c; - free_space--; - } +PHPAPI zend_string *php_htmlspecialchars_ex(const zend_string *input, const int flags, const char *hint_charset, const bool double_encode) { + const entity_ht *inv_map = NULL; + 
htmlspecialchars_lut lut; + const int doctype = flags & ENT_HTML_DOC_TYPE_MASK; + + const size_t initial_size = (ZSTR_LEN(input) < 64) + ? 256 + : zend_safe_addmult(ZSTR_LEN(input), 2, 0, "htmlspecialchars"); + zend_string *output = zend_string_alloc(initial_size, 0); + + size_t free_space = initial_size; + char *output_ptr = ZSTR_VAL(output); + const char *input_ptr = ZSTR_VAL(input); + const char *input_end = input_ptr + ZSTR_LEN(input); + + const enum entity_charset charset = determine_charset(hint_charset, false); + const enc_to_uni *to_uni_table = NULL; + if (!CHARSET_UNICODE_COMPAT(charset)) { + to_uni_table = enc_to_uni_index[charset]; + } + + /* Replacement for invalid characters and byte sequences */ + const char *replacement = NULL; + size_t replacement_len = 0; + if (flags & (ENT_HTML_SUBSTITUTE_ERRORS | ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS)) { + if (charset == cs_utf_8) { + replacement = "\xEF\xBF\xBD"; + replacement_len = sizeof("\xEF\xBF\xBD") - 1; + } else { + replacement = "�"; + replacement_len = sizeof("�") - 1; + } + } + const char *amp = "&"; + const size_t amp_len = strlen(amp); + const size_t max_numeric_entity_len = 10; + + init_htmlspecialchars_lut(&lut, flags, doctype); + + if (!double_encode) { + inv_map = unescape_inverse_map(1, flags); + } + + const bool singlebyte_charset = CHARSET_SINGLE_BYTE(charset); + + while (input_ptr < input_end) { + const unsigned char c = *input_ptr; + /* ASCII chars */ + if (c < 0x80 || singlebyte_charset) { + /* Handle HTML entities */ + if (c == '&' && !double_encode) { + const char *semicolon = memchr(input_ptr, ';', MIN(LONGEST_ENTITY_LENGTH + 1, input_end - input_ptr)); + if (semicolon) { + const size_t candidate_len = semicolon - input_ptr + 1; + unsigned dummy1, dummy2; + /* Numeric entity */ + if (input_ptr[1] == '#' && candidate_len <= max_numeric_entity_len) { + unsigned code_point; + char *start = (char*)input_ptr + 2; + const int valid = process_numeric_entity((const char**)&start, &code_point); + 
if (valid == SUCCESS && start == semicolon) { + if (!(flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) || + numeric_entity_is_allowed(code_point, doctype)) { + memcpy(output_ptr, input_ptr, candidate_len); + output_ptr += candidate_len; + input_ptr += candidate_len; + free_space -= candidate_len; + goto ensure_memory; + } + } + } + + /* Named entity */ + if (resolve_named_entity_html((const char*)input_ptr + 1, candidate_len - 2, inv_map, &dummy1, + &dummy2) == SUCCESS) { + memcpy(output_ptr, input_ptr, candidate_len); + output_ptr += candidate_len; + input_ptr += candidate_len; + free_space -= candidate_len; + goto ensure_memory; + } + } + + /* Invalid entity */ + memcpy(output_ptr, amp, amp_len); + output_ptr += amp_len; + free_space -= amp_len; + input_ptr++; + goto ensure_memory; + } + + /* Check disallowed chars */ + if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) { + if (!is_codepoint_allowed(c, charset, doctype, NULL)) { + memcpy(output_ptr, replacement, replacement_len); + output_ptr += replacement_len; + free_space -= replacement_len; + input_ptr++; + goto ensure_memory; + } + } + + /* Use lookup table for fast replace */ + if (lut.entity[c]) { + const size_t entity_len = lut.entity_len[c]; + memcpy(output_ptr, lut.entity[c], entity_len); + output_ptr += entity_len; + free_space -= entity_len; + } else { + *output_ptr++ = c; + free_space--; + } input_ptr++; } else { @@ -1616,52 +1616,52 @@ PHPAPI zend_string* php_htmlspecialchars_ex( const size_t processed_len = cursor - original_pos; - if (status == FAILURE) { - if (flags & ENT_HTML_IGNORE_ERRORS) { - input_ptr += processed_len; - continue; - } - if (flags & ENT_HTML_SUBSTITUTE_ERRORS) { - memcpy(output_ptr, replacement, replacement_len); - output_ptr += replacement_len; - free_space -= replacement_len; - input_ptr += processed_len; - } else { - zend_string_release(output); - return ZSTR_EMPTY_ALLOC(); - } - } else { - /* Check disallowed chars */ - const unsigned char* sequence = (unsigned 
char*)input_ptr; - size_t sequence_len = processed_len; - - if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) { - if (!is_codepoint_allowed(this_char, charset, doctype, to_uni_table)) { - sequence = replacement; - sequence_len = replacement_len; - } - } - - memcpy(output_ptr, sequence, sequence_len); - output_ptr += sequence_len; - free_space -= sequence_len; - input_ptr += processed_len; - } - } - - ensure_memory: - if (free_space < 128) { - const size_t used = ZSTR_LEN(output) - free_space; - const size_t new_size = used + 1024; - output = zend_string_realloc(output, new_size, 0); - output_ptr = ZSTR_VAL(output) + used; - free_space = new_size - used; - } - } - - *output_ptr = '\0'; - ZSTR_LEN(output) = (output_ptr - ZSTR_VAL(output)); - return output; + if (status == FAILURE) { + if (flags & ENT_HTML_IGNORE_ERRORS) { + input_ptr += processed_len; + continue; + } + if (flags & ENT_HTML_SUBSTITUTE_ERRORS) { + memcpy(output_ptr, replacement, replacement_len); + output_ptr += replacement_len; + free_space -= replacement_len; + input_ptr += processed_len; + } else { + zend_string_release(output); + return ZSTR_EMPTY_ALLOC(); + } + } else { + /* Check disallowed chars */ + const char *sequence = input_ptr; + size_t sequence_len = processed_len; + + if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) { + if (!is_codepoint_allowed(this_char, charset, doctype, to_uni_table)) { + sequence = replacement; + sequence_len = replacement_len; + } + } + + memcpy(output_ptr, sequence, sequence_len); + output_ptr += sequence_len; + free_space -= sequence_len; + input_ptr += processed_len; + } + } + + ensure_memory: + if (free_space < 128) { + const size_t used = ZSTR_LEN(output) - free_space; + const size_t new_size = used + 1024; + output = zend_string_realloc(output, new_size, 0); + output_ptr = ZSTR_VAL(output) + used; + free_space = new_size - used; + } + } + + *output_ptr = '\0'; + ZSTR_LEN(output) = (output_ptr - ZSTR_VAL(output)); + return output; } /* }}} */ @@ -1688,8 
+1688,8 @@ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all) } /* }}} */ -/* {{{ php_html_entities */ -static void php_htmlspecialchars(INTERNAL_FUNCTION_PARAMETERS) +/* {{{ Convert special characters to HTML entities */ +PHP_FUNCTION(htmlspecialchars) { zend_string *str, *hint_charset = NULL; zend_long flags = ENT_QUOTES|ENT_SUBSTITUTE; @@ -1704,20 +1704,17 @@ static void php_htmlspecialchars(INTERNAL_FUNCTION_PARAMETERS) Z_PARAM_BOOL(double_encode); ZEND_PARSE_PARAMETERS_END(); - replaced = php_htmlspecialchars_ex( - str, (int) flags, - hint_charset ? ZSTR_VAL(hint_charset) : NULL, double_encode, /* quiet */ 0); + if (ZSTR_LEN(str) == 0) { + replaced = zend_string_copy(str); + } else { + replaced = php_htmlspecialchars_ex( + str, (int)flags, + hint_charset ? ZSTR_VAL(hint_charset) : NULL, double_encode); + } RETVAL_STR(replaced); } /* }}} */ -/* {{{ Convert special characters to HTML entities */ -PHP_FUNCTION(htmlspecialchars) -{ - php_htmlspecialchars(INTERNAL_FUNCTION_PARAM_PASSTHRU); -} -/* }}} */ - /* {{{ Convert special HTML entities back to characters */ PHP_FUNCTION(htmlspecialchars_decode) { diff --git a/ext/standard/html.h b/ext/standard/html.h index 40c595ba5d89c..f71428d8561cd 100644 --- a/ext/standard/html.h +++ b/ext/standard/html.h @@ -48,5 +48,6 @@ PHPAPI zend_string *php_escape_html_entities(const unsigned char *old, size_t ol PHPAPI zend_string *php_escape_html_entities_ex(const unsigned char *old, size_t oldlen, int all, int flags, const char *hint_charset, bool double_encode, bool quiet); PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int flags, const char *hint_charset); PHPAPI unsigned int php_next_utf8_char(const unsigned char *str, size_t str_len, size_t *cursor, zend_result *status); +PHPAPI zend_string *php_htmlspecialchars_ex(const zend_string *input, int flags, const char *hint_charset, bool double_encode); #endif /* HTML_H */