diff --git a/ext/standard/html.c b/ext/standard/html.c
index 0c6231d590d88..bb41814c8c6cf 100644
--- a/ext/standard/html.c
+++ b/ext/standard/html.c
@@ -752,6 +752,201 @@ static zend_result resolve_named_entity_html(const char *start, size_t length, c
 }
 /* }}} */
 
+/* {{{ is_codepoint_allowed
+ * Returns whether `cp` may appear literally in a document of type `doctype`.
+ * Unicode-compatible charsets are checked directly. Otherwise the value is
+ * first mapped through `to_uni_table` when one is supplied; without a table
+ * only values <= 0x7D are checked and everything above is reported as allowed. */
+static inline bool is_codepoint_allowed(
+	unsigned int cp, /* The codepoint to check */
+	enum entity_charset charset, /* Current charset */
+	int doctype, /* The doctype flags (ENT_HTML401, ENT_HTML5, etc.) */
+	const enc_to_uni *to_uni_table /* Mapping table if needed */
+	) {
+	/* If charset is Unicode-compatible, the code point is used as-is */
+	if (CHARSET_UNICODE_COMPAT(charset)) {
+		return unicode_cp_is_allowed(cp, doctype);
+	}
+	/* If we have a mapping table (i.e., a non-UTF charset) */
+	if (to_uni_table) {
+		map_to_unicode(cp, to_uni_table, &cp);
+		return unicode_cp_is_allowed(cp, doctype);
+	}
+
+	if (cp <= 0x7D) {
+		return unicode_cp_is_allowed(cp, doctype);
+	}
+
+	return true;
+}
+/* }}} */
+
+/* Lookup table for php_htmlspecialchars_ex: for every byte value, the entity
+ * that replaces it (NULL when the byte has no replacement) and its length. */
+typedef struct {
+	const char *entity[256];
+	uint8_t entity_len[256];
+} htmlspecialchars_lut;
+
+/* {{{ init_htmlspecialchars_lut
+ * Fills `lut` with the replacements for &, > and <, plus the double and/or
+ * single quote when the matching ENT_HTML_QUOTE_* bit is set in `flags`.
+ * The single quote becomes &apos; for a non-ENT_HTML401 `doctype` that has an
+ * ENT_XML1, ENT_XHTML or ENT_HTML5 bit set, and &#039; otherwise. */
+static void init_htmlspecialchars_lut(htmlspecialchars_lut *lut, const int flags, const int doctype) {
+	memset(lut, 0, sizeof(*lut));
+
+	lut->entity['&'] = "&amp;";
+	lut->entity['>'] = "&gt;";
+	lut->entity['<'] = "&lt;";
+	lut->entity_len['&'] = strlen(lut->entity['&']);
+	lut->entity_len['>'] = strlen(lut->entity['>']);
+	lut->entity_len['<'] = strlen(lut->entity['<']);
+
+	if (flags & ENT_QUOTES & ENT_HTML_QUOTE_DOUBLE) {
+		lut->entity['"'] = "&quot;";
+		lut->entity_len['"'] = strlen(lut->entity['"']);
+	}
+
+	if (flags & ENT_QUOTES & ENT_HTML_QUOTE_SINGLE) {
+		const char *apos = "&#039;";
+		if (doctype != ENT_HTML401 && (doctype & (ENT_XML1 | ENT_XHTML | ENT_HTML5))) {
+			apos = "&apos;";
+		}
+		lut->entity['\''] = apos;
+		lut->entity_len['\''] = strlen(apos);
+	}
+}
+/* }}} */
+
+/* {{{ validate_utf8_char
+ * Decodes the UTF-8 sequence that starts at str[*cursor]. On success it
+ * returns the code point, leaves *status at SUCCESS and moves *cursor past
+ * the sequence. Truncated, malformed, overlong, surrogate and out-of-range
+ * sequences are reported through MB_FAILURE(pos, advance).
+ * NOTE(review): MB_FAILURE is defined elsewhere in this file; this function
+ * relies on it returning from the function - confirm. */
+static unsigned int validate_utf8_char(
+	const unsigned char *str,
+	const size_t str_len,
+	size_t *cursor,
+	zend_result *status
+) {
+	const size_t pos = *cursor;
+	*status = SUCCESS;
+	const size_t tail_len = str_len - pos;
+
+	/* Check if at least 1 byte is available */
+	if (tail_len < 1) {
+		MB_FAILURE(pos, 1);
+	}
+
+	const unsigned char c = str[pos];
+
+	/* ASCII (single byte) */
+	if (c < 0x80) {
+		*cursor = pos + 1;
+		return c;
+	}
+
+	/* Leading byte < 0xC2 => invalid multibyte start */
+	if (c < 0xC2) {
+		MB_FAILURE(pos, 1);
+	}
+
+	/* 2-byte sequence (0xC2..0xDF) */
+	if (c < 0xE0) {
+		/* Need 2 bytes total */
+		if (tail_len < 2) {
+			MB_FAILURE(pos, 1);
+		}
+		const unsigned char b2 = str[pos + 1];
+
+		/* Check continuation byte 10xxxxxx */
+		if ((b2 & 0xC0) != 0x80) {
+			MB_FAILURE(pos, ((b2 < 0x80) || (b2 >= 0xC2 && b2 <= 0xF4)) ? 1 : 2);
+		}
+
+		/* Combine bits into code point and check range >= 0x80 */
+		const unsigned int cp = ((c & 0x1F) << 6) | (b2 & 0x3F);
+		if (cp < 0x80) {
+			MB_FAILURE(pos, 2);
+		}
+
+		*cursor = pos + 2;
+		return cp;
+	}
+
+	/* 3-byte sequence (0xE0..0xEF) */
+	if (c < 0xF0) {
+		/* Need 3 bytes total and valid continuation bytes */
+		if (tail_len < 3 ||
+			((str[pos + 1] & 0xC0) != 0x80) ||
+			((str[pos + 2] & 0xC0) != 0x80)) {
+			if (tail_len < 2 ||
+				((str[pos + 1] < 0x80) || (str[pos + 1] >= 0xC2 && str[pos + 1] <= 0xF4))) {
+				MB_FAILURE(pos, 1);
+			} else if (tail_len < 3 ||
+				((str[pos + 2] < 0x80) || (str[pos + 2] >= 0xC2 && str[pos + 2] <= 0xF4))) {
+				MB_FAILURE(pos, 2);
+			} else {
+				MB_FAILURE(pos, 3);
+			}
+		}
+
+		/* Combine bits and check for >= 0x800 and not in surrogate area */
+		const unsigned int cp = ((c & 0x0F) << 12)
+			| ((str[pos + 1] & 0x3F) << 6)
+			| (str[pos + 2] & 0x3F);
+
+		if (cp < 0x800 || (cp >= 0xD800 && cp <= 0xDFFF)) {
+			MB_FAILURE(pos, 3);
+		}
+
+		*cursor = pos + 3;
+		return cp;
+	}
+
+	/* 4-byte sequence (0xF0..0xF4) */
+	if (c < 0xF5) {
+		/* Need 4 bytes total and valid continuation bytes */
+		if (tail_len < 4 ||
+			((str[pos + 1] & 0xC0) != 0x80) ||
+			((str[pos + 2] & 0xC0) != 0x80) ||
+			((str[pos + 3] & 0xC0) != 0x80)) {
+			if (tail_len < 2 ||
+				((str[pos + 1] < 0x80) || (str[pos + 1] >= 0xC2 && str[pos + 1] <= 0xF4))) {
+				MB_FAILURE(pos, 1);
+			} else if (tail_len < 3 ||
+				((str[pos + 2] < 0x80) || (str[pos + 2] >= 0xC2 && str[pos + 2] <= 0xF4))) {
+				MB_FAILURE(pos, 2);
+			} else if (tail_len < 4 ||
+				((str[pos + 3] < 0x80) || (str[pos + 3] >= 0xC2 && str[pos + 3] <= 0xF4))) {
+				MB_FAILURE(pos, 3);
+			} else {
+				MB_FAILURE(pos, 4);
+			}
+		}
+
+		/* Combine bits and check range 0x10000..0x10FFFF */
+		const unsigned int cp = ((c & 0x07) << 18)
+			| ((str[pos + 1] & 0x3F) << 12)
+			| ((str[pos + 2] & 0x3F) << 6)
+			| (str[pos + 3] & 0x3F);
+
+		if (cp < 0x10000 || cp > 0x10FFFF) {
+			MB_FAILURE(pos, 4);
+		}
+
+		*cursor = pos + 4;
+		return cp;
+	}
+
+	/* Leading byte >= 0xF5 is invalid */
+	MB_FAILURE(pos, 1);
+}
+/* }}} */
+
 static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charset charset, unsigned code) {
 	/* code is not necessarily a unicode code point */
 	switch (charset) {
@@ -1304,6 +1499,203 @@ PHPAPI zend_string *php_escape_html_entities_ex(const unsigned char *old, size_t
 }
 /* }}} */
 
+/* {{{ php_htmlspecialchars_ex
+ * Returns `input` with &, < and > (and the quotes selected by the
+ * ENT_HTML_QUOTE_* bits in `flags`) replaced by entities. When `double_encode`
+ * is false, references that parse as valid numeric or named entities are
+ * copied through unchanged and any other & becomes &amp;. Invalid multibyte
+ * sequences are skipped with ENT_HTML_IGNORE_ERRORS, replaced with
+ * ENT_HTML_SUBSTITUTE_ERRORS, and otherwise yield ZSTR_EMPTY_ALLOC(). */
+PHPAPI zend_string *php_htmlspecialchars_ex(const zend_string *input, const int flags, const char *hint_charset, const bool double_encode) {
+	const entity_ht *inv_map = NULL;
+	htmlspecialchars_lut lut;
+	const int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
+
+	const size_t initial_size = (ZSTR_LEN(input) < 64)
+		? 256
+		: zend_safe_addmult(ZSTR_LEN(input), 2, 0, "htmlspecialchars");
+	zend_string *output = zend_string_alloc(initial_size, 0);
+
+	size_t free_space = initial_size;
+	char *output_ptr = ZSTR_VAL(output);
+	const char *input_ptr = ZSTR_VAL(input);
+	const char *input_end = input_ptr + ZSTR_LEN(input);
+
+	const enum entity_charset charset = determine_charset(hint_charset, false);
+	const enc_to_uni *to_uni_table = NULL;
+	if (!CHARSET_UNICODE_COMPAT(charset)) {
+		to_uni_table = enc_to_uni_index[charset];
+	}
+
+	/* Replacement for invalid characters and byte sequences */
+	const char *replacement = NULL;
+	size_t replacement_len = 0;
+	if (flags & (ENT_HTML_SUBSTITUTE_ERRORS | ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS)) {
+		if (charset == cs_utf_8) {
+			replacement = "\xEF\xBF\xBD";
+			replacement_len = sizeof("\xEF\xBF\xBD") - 1;
+		} else {
+			replacement = "&#xFFFD;";
+			replacement_len = sizeof("&#xFFFD;") - 1;
+		}
+	}
+	const char *amp = "&amp;";
+	const size_t amp_len = strlen(amp);
+	/* Presumably sized for the longest canonical forms, "&#x10FFFF;" and "&#1114111;" */
+	const size_t max_numeric_entity_len = 10;
+
+	init_htmlspecialchars_lut(&lut, flags, doctype);
+
+	if (!double_encode) {
+		inv_map = unescape_inverse_map(1, flags);
+	}
+
+	const bool singlebyte_charset = CHARSET_SINGLE_BYTE(charset);
+
+	while (input_ptr < input_end) {
+		const unsigned char c = *input_ptr;
+		/* ASCII chars */
+		if (c < 0x80 || singlebyte_charset) {
+			/* Handle HTML entities */
+			if (c == '&' && !double_encode) {
+				const char *semicolon = memchr(input_ptr, ';', MIN(LONGEST_ENTITY_LENGTH + 1, input_end - input_ptr));
+				if (semicolon) {
+					const size_t candidate_len = semicolon - input_ptr + 1;
+					unsigned dummy1, dummy2;
+					/* Numeric entity */
+					if (input_ptr[1] == '#' && candidate_len <= max_numeric_entity_len) {
+						unsigned code_point;
+						char *start = (char*)input_ptr + 2;
+						const int valid = process_numeric_entity((const char**)&start, &code_point);
+						if (valid == SUCCESS && start == semicolon) {
+							if (!(flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) ||
+								numeric_entity_is_allowed(code_point, doctype)) {
+								memcpy(output_ptr, input_ptr, candidate_len);
+								output_ptr += candidate_len;
+								input_ptr += candidate_len;
+								free_space -= candidate_len;
+								goto ensure_memory;
+							}
+						}
+					}
+
+					/* Named entity. "&apos;" is accepted explicitly for ENT_XHTML, the way
+					 * php_escape_html_entities_ex does. NOTE(review): presumably the
+					 * inverse map used for XHTML does not list it - confirm. */
+					const char *name = input_ptr + 1;
+					const size_t name_len = candidate_len - 2;
+					if (resolve_named_entity_html(name, name_len, inv_map, &dummy1, &dummy2) == SUCCESS ||
+						(doctype == ENT_XHTML && name_len == 4 && memcmp(name, "apos", 4) == 0)) {
+						memcpy(output_ptr, input_ptr, candidate_len);
+						output_ptr += candidate_len;
+						input_ptr += candidate_len;
+						free_space -= candidate_len;
+						goto ensure_memory;
+					}
+				}
+
+				/* Invalid entity */
+				memcpy(output_ptr, amp, amp_len);
+				output_ptr += amp_len;
+				free_space -= amp_len;
+				input_ptr++;
+				goto ensure_memory;
+			}
+
+			/* Check disallowed chars */
+			if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) {
+				/* Pass the mapping table so bytes of non-Unicode single-byte charsets
+				 * are checked as the code points they map to */
+				if (!is_codepoint_allowed(c, charset, doctype, to_uni_table)) {
+					memcpy(output_ptr, replacement, replacement_len);
+					output_ptr += replacement_len;
+					free_space -= replacement_len;
+					input_ptr++;
+					goto ensure_memory;
+				}
+			}
+
+			/* Use lookup table for fast replace */
+			if (lut.entity[c]) {
+				const size_t entity_len = lut.entity_len[c];
+				memcpy(output_ptr, lut.entity[c], entity_len);
+				output_ptr += entity_len;
+				free_space -= entity_len;
+			} else {
+				*output_ptr++ = c;
+				free_space--;
+			}
+
+			input_ptr++;
+		} else {
+			/* Multibyte chars */
+			zend_result status;
+			const size_t original_pos = (const char*)input_ptr - ZSTR_VAL(input);
+			size_t cursor = original_pos;
+
+			unsigned int this_char = 0;
+			if (charset == cs_utf_8) {
+				this_char = validate_utf8_char((unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input),
+					&cursor, &status);
+			} else {
+				this_char = get_next_char(charset, (unsigned char*)ZSTR_VAL(input), ZSTR_LEN(input),
+					&cursor, &status);
+			}
+
+			const size_t processed_len = cursor - original_pos;
+
+			if (status == FAILURE) {
+				if (flags & ENT_HTML_IGNORE_ERRORS) {
+					input_ptr += processed_len;
+					continue;
+				}
+				if (flags & ENT_HTML_SUBSTITUTE_ERRORS) {
+					memcpy(output_ptr, replacement, replacement_len);
+					output_ptr += replacement_len;
+					free_space -= replacement_len;
+					input_ptr += processed_len;
+				} else {
+					zend_string_release(output);
+					return ZSTR_EMPTY_ALLOC();
+				}
+			} else {
+				/* Check disallowed chars */
+				const char *sequence = input_ptr;
+				size_t sequence_len = processed_len;
+
+				if (flags & ENT_HTML_SUBSTITUTE_DISALLOWED_CHARS) {
+					if (!is_codepoint_allowed(this_char, charset, doctype, to_uni_table)) {
+						sequence = replacement;
+						sequence_len = replacement_len;
+					}
+				}
+
+				memcpy(output_ptr, sequence, sequence_len);
+				output_ptr += sequence_len;
+				free_space -= sequence_len;
+				input_ptr += processed_len;
+			}
+		}
+
+	ensure_memory:
+		/* Every iteration writes a short run, so topping the buffer up whenever
+		 * fewer than 128 bytes remain keeps the unchecked memcpy calls above in
+		 * bounds. NOTE(review): assumes LONGEST_ENTITY_LENGTH + 1 < 128 - confirm. */
+		if (free_space < 128) {
+			const size_t used = ZSTR_LEN(output) - free_space;
+			const size_t new_size = used + 1024;
+			output = zend_string_realloc(output, new_size, 0);
+			output_ptr = ZSTR_VAL(output) + used;
+			free_space = new_size - used;
+		}
+	}
+
+	*output_ptr = '\0';
+	ZSTR_LEN(output) = (output_ptr - ZSTR_VAL(output));
+	return output;
+}
+/* }}} */
+
 /* {{{ php_html_entities */
 static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
 {
@@ -1330,7 +1722,27 @@ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
 /* {{{ Convert special characters to HTML entities */
 PHP_FUNCTION(htmlspecialchars)
 {
-	php_html_entities(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
+	zend_string *str, *hint_charset = NULL;
+	zend_long flags = ENT_QUOTES|ENT_SUBSTITUTE;
+	zend_string *replaced;
+	bool double_encode = 1;
+
+	ZEND_PARSE_PARAMETERS_START(1, 4)
+		Z_PARAM_STR(str)
+		Z_PARAM_OPTIONAL
+		Z_PARAM_LONG(flags)
+		Z_PARAM_STR_OR_NULL(hint_charset)
+		Z_PARAM_BOOL(double_encode);
+	ZEND_PARSE_PARAMETERS_END();
+
+	if (ZSTR_LEN(str) == 0) {
+		replaced = zend_string_copy(str);
+	} else {
+		replaced = php_htmlspecialchars_ex(
+			str, (int)flags,
+			hint_charset ? ZSTR_VAL(hint_charset) : NULL, double_encode);
+	}
+	RETVAL_STR(replaced);
 }
 /* }}} */
 
diff --git a/ext/standard/html.h b/ext/standard/html.h
index 40c595ba5d89c..f71428d8561cd 100644
--- a/ext/standard/html.h
+++ b/ext/standard/html.h
@@ -48,5 +48,6 @@ PHPAPI zend_string *php_escape_html_entities(const unsigned char *old, size_t ol
 PHPAPI zend_string *php_escape_html_entities_ex(const unsigned char *old, size_t oldlen, int all, int flags, const char *hint_charset, bool double_encode, bool quiet);
 PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int flags, const char *hint_charset);
 PHPAPI unsigned int php_next_utf8_char(const unsigned char *str, size_t str_len, size_t *cursor, zend_result *status);
+PHPAPI zend_string *php_htmlspecialchars_ex(const zend_string *input, int flags, const char *hint_charset, bool double_encode);
 
 #endif /* HTML_H */
NOTE(review): the PHP source lines of the hunk below were lost in extraction and are reconstructed from the hunk counts and the expected output (the added echo line in particular is inferred) - regenerate this hunk against the real test file before applying.
diff --git a/ext/standard/tests/strings/bug60965.phpt b/ext/standard/tests/strings/bug60965.phpt
index b370d225692f2..c8ed63addbe62 100644
--- a/ext/standard/tests/strings/bug60965.phpt
+++ b/ext/standard/tests/strings/bug60965.phpt
@@ -4,8 +4,10 @@ Bug #60965: Buffer overflow on htmlspecialchars/entities with $double=false
 <?php
 echo htmlspecialchars('"""""""""""""""""""""""""""""""""""""""""""""&#x000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000005;',
 ENT_QUOTES, 'UTF-8', false), "\n";
+echo htmlspecialchars('&#xFFFFFF;', ENT_QUOTES, 'UTF-8', false), "\n";
 echo "Done.\n";
 ?>
 --EXPECT--
-&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&#x000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000005;
+&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&quot;&amp;#x000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000005;
+&amp;#xFFFFFF;
 Done.