-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathContentTypeExtractor.php
112 lines (96 loc) · 5.05 KB
/
ContentTypeExtractor.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
<?php
declare(strict_types=1);
namespace Fossar\GuzzleTranscoder;
class ContentTypeExtractor {
    /**
     * Regex pattern for HTML 4 meta tag – e.g. <meta http-equiv="content-type" content="text/html; charset=ISO-8859-1">.
     */
    private const PATTERN_HTML4 = '/<meta[^>]+http-equiv\s*=\s*(?P<quote>["\']?)content-type\g{quote}[^>]*?>/i';

    /**
     * Regex pattern splitting a tag matched by PATTERN_HTML4 around the value of its `content` attribute.
     *
     * The `s` modifier makes `.` match newlines as well, so the `before` and `after` groups
     * always cover the whole tag, even when the tag is spread over several lines.
     */
    private const PATTERN_HTML4_CONTENT = '/(?P<before>.*)content\s*=\s*(?P<quote>["\'])(?P<content>.*?)\g{quote}(?P<after>.*)/is';

    /**
     * Regex pattern for HTML 5 meta tag – e.g. <meta charset=iso-8859-1>.
     */
    private const PATTERN_HTML5 = '/(?P<before><meta[^>]+?)charset\s*=\s*(?:(?P<quote>["\'])(?P<charset1>[^"\' ]+?)\g{quote}|(?P<charset2>[^"\'=<>`\s]+))(?P<after>[^>]*?>)/iJ';

    /**
     * Regex pattern for XML declaration – e.g. <?xml version="1.0" encoding="ISO-8859-1"?>.
     *
     * White space around the equals sign is accepted and kept inside the `before` group,
     * so the rewritten declaration preserves the original spacing.
     */
    private const PATTERN_XML = '#(?P<before><\?xml[^>]+?encoding\s*=\s*)(?P<quote>["\'])(?P<charset>[^"\']+?)\g{quote}(?P<after>[^>]*?>)#i';

    /**
     * Parses the 'content-type' header into its MIME type and parameters.
     *
     * The header is looked up with Utils::getByCaseInsensitiveKey(). When it is given
     * as a list of values, only the first one is used.
     *
     * In the returned parameters, 'charset' is set to $targetEncoding
     * (through Utils::setByCaseInsensitiveKey()); the value of 'charset'
     * found in the header is returned separately.
     *
     * If the header is missing (or is an empty list), null is returned.
     *
     * @param array<string, list<string>|string> $headers
     * @param string $targetEncoding value to store in the 'charset' parameter of the returned parameters
     *
     * @return ?array{string, ?string, array<string, ?string>} A triplet of MIME type, optional value of charset parameter and parameters from the header
     */
    public static function getContentTypeFromHeader(array $headers, string $targetEncoding): ?array {
        $contentType = Utils::getByCaseInsensitiveKey($headers, 'content-type');
        if ($contentType === null) {
            return null;
        }

        if (\is_array($contentType)) {
            if ($contentType === []) {
                // No value at all is the same as no header.
                return null;
            }

            // Multiple Content-Type headers are not permitted, as the header does not accept a comma-separated list:
            // https://www.rfc-editor.org/rfc/rfc2616#section-4.2
            // https://www.rfc-editor.org/rfc/rfc2616#section-14.17
            // We are attempting to handle it gracefully by dropping all but the first instance.
            $contentType = reset($contentType);
        }

        // content := "Content-Type" ":" type "/" subtype *(";" parameter)
        // see https://tools.ietf.org/html/rfc2045#section-5.1
        // The appended ';' guarantees explode() always yields two items.
        [$type, $params] = explode(';', $contentType . ';', 2);
        // Optional white space is allowed before the ';' delimiter, do not leak it into the MIME type.
        $type = trim($type);

        $parsed = Utils::splitHttpHeaderWords($params);
        if (\count($parsed) > 0) {
            // Only the first element of the parsed header is considered.
            $parsed = reset($parsed);
        }

        $encoding = Utils::getByCaseInsensitiveKey($parsed, 'charset');
        $newParsed = Utils::setByCaseInsensitiveKey($parsed, 'charset', $targetEncoding);

        return [$type, $encoding, $newParsed];
    }

    /**
     * Obtains the encoding declared in a text of HTML document.
     *
     * A `<meta http-equiv="content-type" …>` tag takes precedence; `<meta charset=…>` is
     * only inspected when no such tag is present. Only the first matching tag is handled.
     *
     * The returned replacements map the original tag to a copy of it declaring $targetEncoding.
     *
     * @param string $content text of the HTML document
     * @param string $targetEncoding encoding to declare in the replacement tag
     *
     * @return array{?string, array<string, string>} A pair of declared encoding (null when none was found) and replacements for the content
     */
    public static function getContentTypeFromHtml(string $content, string $targetEncoding): array {
        $bodyDeclaredEncoding = null;
        $replacements = [];

        // find http-equiv
        if (preg_match(self::PATTERN_HTML4, $content, $match)) {
            if (preg_match(self::PATTERN_HTML4_CONTENT, $match[0], $innerMatch)) {
                $parsed = Utils::splitHttpHeaderWords($innerMatch['content']);
                if (\count($parsed) > 0) {
                    $parsed = reset($parsed);
                }

                $bodyDeclaredEncoding = Utils::getByCaseInsensitiveKey($parsed, 'charset');
                $newParsed = Utils::setByCaseInsensitiveKey($parsed, 'charset', $targetEncoding);
                $newContent = Utils::joinHttpHeaderWords($newParsed);

                // Rebuild the tag around the new attribute value, reusing the original quote character.
                $quote = $innerMatch['quote'];
                $replacements[$match[0]] = "{$innerMatch['before']}content={$quote}{$newContent}{$quote}{$innerMatch['after']}";
            }
        } elseif (preg_match(self::PATTERN_HTML5, $content, $match)) {
            // Only one of the alternatives (quoted/unquoted) matched, the other group is an empty string.
            $bodyDeclaredEncoding = $match['charset1'] . $match['charset2'];

            // The quote group is empty for an unquoted value, which keeps the replacement unquoted too.
            $quote = $match['quote'];
            $replacements[$match[0]] = "{$match['before']}charset={$quote}{$targetEncoding}{$quote}{$match['after']}";
        }

        return [$bodyDeclaredEncoding, $replacements];
    }

    /**
     * Obtains the encoding declared in a text of XML document.
     *
     * The returned replacements map the original XML declaration to a copy of it declaring $targetEncoding.
     *
     * @param string $content text of the XML document
     * @param string $targetEncoding encoding to declare in the replacement declaration
     *
     * @return array{?string, array<string, string>} A pair of declared encoding (null when none was found) and replacements for the content
     */
    public static function getContentTypeFromXml(string $content, string $targetEncoding): array {
        $bodyDeclaredEncoding = null;
        $replacements = [];

        if (preg_match(self::PATTERN_XML, $content, $match)) {
            $bodyDeclaredEncoding = $match['charset'];

            // The before group already ends with the equals sign (and any white space around it).
            $quote = $match['quote'];
            $replacements[$match[0]] = "{$match['before']}{$quote}{$targetEncoding}{$quote}{$match['after']}";
        }

        return [$bodyDeclaredEncoding, $replacements];
    }
}