forked from simplehtmldom/simplehtmldom
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathHtmlWeb.php
180 lines (152 loc) · 4.67 KB
/
HtmlWeb.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
<?php
/**
* Website: http://sourceforge.net/projects/simplehtmldom/
* Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
*
* Licensed under The MIT License
* See the LICENSE file in the project root for more information.
*
* Authors:
* S.C. Chen
* John Schlick
* Rus Carroll
* logmanoriginal
*
* Contributors:
* Yousuke Kumakura
* Vadim Voituk
* Antcs
* Igor (Dicr) Tarasov
*
* Version $Rev$
*/
namespace simplehtmldom;
use function curl_close;
use function curl_exec;
use function curl_getinfo;
use function curl_init;
use function curl_setopt;
use function error_log;
use function explode;
use function extension_loaded;
use function file_get_contents;
use function filter_var;
use function ini_get;
use function parse_url;
use function preg_match;
use function stream_context_create;
use function strlen;
use function strtolower;
use const CURLINFO_RESPONSE_CODE;
use const CURLOPT_BUFFERSIZE;
use const CURLOPT_FOLLOWLOCATION;
use const CURLOPT_HTTPHEADER;
use const CURLOPT_RETURNTRANSFER;
use const CURLOPT_URL;
use const FILTER_VALIDATE_URL;
use const PHP_URL_SCHEME;
require_once __DIR__ . '/HtmlDocument.php';
/**
* Class HtmlWeb
*/
class HtmlWeb
{
    /**
     * Fetches a web page over HTTP(S) and parses it into a DOM.
     *
     * The cURL extension is used when it is loaded; otherwise the request
     * falls back to file_get_contents(), which needs allow_url_fopen=On.
     *
     * @param string $url Absolute URL of the page to load
     * @return HtmlDocument|null Returns the DOM for a webpage
     * Returns null if the provided URL fails FILTER_VALIDATE_URL
     * Returns null if the provided URL does not specify the HTTP or HTTPS protocol
     * Returns null if the cURL extension is not loaded and allow_url_fopen=Off
     * Returns null if the request fails, the final status is not 200 or the
     * body is larger than MAX_FILE_SIZE
     * @noinspection PhpMethodMayBeStaticInspection
     */
    public function load($url)
    {
        if (! filter_var($url, FILTER_VALIDATE_URL)) {
            return null;
        }

        $scheme = parse_url($url, PHP_URL_SCHEME);
        if (! $scheme) {
            return null;
        }

        // Only plain web protocols are supported (no ftp://, file://, ...)
        $scheme = strtolower($scheme);
        if ($scheme !== 'http' && $scheme !== 'https') {
            return null;
        }

        if (extension_loaded('curl')) {
            return self::load_curl($url);
        }

        if (ini_get('allow_url_fopen')) {
            return self::load_fopen($url);
        }

        /** @noinspection ForgottenDebugOutputInspection */
        error_log(__FUNCTION__ . ' requires either the cURL extension or allow_url_fopen=On in php.ini');

        return null;
    }

    /**
     * cURL implementation of load
     *
     * Follows redirects and only accepts a final response code of 200.
     * The cURL handle is released on every path, including failures.
     *
     * @param string $url Already validated HTTP or HTTPS URL
     * @return HtmlDocument|null Null if the handle cannot be created, the
     * transfer fails, the response code is not 200 or the body is larger
     * than MAX_FILE_SIZE
     * @noinspection PhpComposerExtensionStubsInspection
     */
    private static function load_curl($url)
    {
        $ch = curl_init();

        if ($ch === false) {
            return null;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

        // There is no guarantee this request will be fulfilled
        // -- https://www.php.net/manual/en/function.curl-setopt.php
        curl_setopt($ch, CURLOPT_BUFFERSIZE, MAX_FILE_SIZE);

        // These headers are only preferences, the server may ignore them
        $header = [
            'Accept: text/html', // Prefer HTML format
            'Accept-Charset: utf-8', // Prefer UTF-8 encoding
        ];
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);

        $doc = curl_exec($ch);
        $status = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);

        // Read everything needed from the handle first, then always release
        // it so that failed requests do not leak the handle
        curl_close($ch);

        // curl_exec returns false when the transfer itself failed
        if ($doc === false || $status !== 200) {
            return null;
        }

        if (strlen($doc) > MAX_FILE_SIZE) {
            return null;
        }

        return new HtmlDocument($doc);
    }

    /**
     * fopen implementation of load
     *
     * Reads at most MAX_FILE_SIZE + 1 bytes so that an oversized body can be
     * detected without downloading all of it.
     *
     * @param string $url Already validated HTTP or HTTPS URL
     * @return HtmlDocument|null Null if the request fails, the final status
     * code is not 200 or the body is larger than MAX_FILE_SIZE
     */
    private static function load_fopen($url)
    {
        // These headers are only preferences, the server may ignore them
        $context = stream_context_create(['http' => [
            'header' => [
                'Accept: text/html', // Prefer HTML format
                'Accept-Charset: utf-8', // Prefer UTF-8 encoding
            ],
            'ignore_errors' => true // Always fetch content
        ]]);

        $doc = file_get_contents($url, false, $context, 0, MAX_FILE_SIZE + 1);

        // false means the request could not be completed at all
        if ($doc === false) {
            return null;
        }

        // $http_response_header is populated by the HTTP stream wrapper in
        // the scope that called file_get_contents
        if (isset($http_response_header)) {
            $code = null;

            foreach ($http_response_header as $rh) {
                // https://stackoverflow.com/a/1442526
                $parts = explode(' ', $rh, 3);

                // Status lines look like "HTTP/1.1 200 OK"; skip anything
                // without a second token instead of reading a missing index
                if (isset($parts[1]) && preg_match('/^HTTP\/\d\.\d/', $parts[0])) {
                    $code = $parts[1];
                }
            } // Last code is final status

            if ($code !== '200') {
                return null;
            }
        }

        if (strlen($doc) > MAX_FILE_SIZE) {
            return null;
        }

        return new HtmlDocument($doc);
    }
}