Skip to content

Commit 0460a3f

Browse files
committed
DomQuery: uses PHP 8.4 HTML DOM
1 parent 32b489e commit 0460a3f

File tree

3 files changed

+174
-29
lines changed

3 files changed

+174
-29
lines changed

src/Framework/DomQuery.php

Lines changed: 47 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -9,11 +9,12 @@
99

1010
namespace Tester;
1111

12-
use const PREG_SET_ORDER;
12+
use Dom;
13+
use const PHP_VERSION_ID, PREG_SET_ORDER;
1314

1415

1516
/**
16-
* DomQuery simplifies querying (X)HTML documents.
17+
* Simplifies querying and traversing HTML documents using CSS selectors.
1718
*/
1819
class DomQuery extends \SimpleXMLElement
1920
{
@@ -22,26 +23,35 @@ class DomQuery extends \SimpleXMLElement
2223
*/
2324
public static function fromHtml(string $html): self
2425
{
25-
if (!str_contains($html, '<')) {
26-
$html = '<body>' . $html;
27-
}
26+
$old = libxml_use_internal_errors(true);
27+
libxml_clear_errors();
2828

29-
$html = @mb_convert_encoding($html, 'HTML', 'UTF-8'); // @ - deprecated
29+
if (PHP_VERSION_ID < 80400) {
30+
if (!str_contains($html, '<')) {
31+
$html = '<body>' . $html;
32+
}
3033

31-
// parse these elements as void
32-
$html = preg_replace('#<(keygen|source|track|wbr)(?=\s|>)((?:"[^"]*"|\'[^\']*\'|[^"\'>])*+)(?<!/)>#', '<$1$2 />', $html);
34+
$html = @mb_convert_encoding($html, 'HTML', 'UTF-8'); // @ - deprecated
3335

34-
// fix parsing of </ inside scripts
35-
$html = preg_replace_callback(
36-
'#(<script(?=\s|>)(?:"[^"]*"|\'[^\']*\'|[^"\'>])*+>)(.*?)(</script>)#s',
37-
fn(array $m): string => $m[1] . str_replace('</', '<\/', $m[2]) . $m[3],
38-
$html,
39-
);
36+
// parse these elements as void
37+
$html = preg_replace('#<(keygen|source|track|wbr)(?=\s|>)((?:"[^"]*"|\'[^\']*\'|[^"\'>])*+)(?<!/)>#', '<$1$2 />', $html);
38+
39+
// fix parsing of </ inside scripts
40+
$html = preg_replace_callback(
41+
'#(<script(?=\s|>)(?:"[^"]*"|\'[^\']*\'|[^"\'>])*+>)(.*?)(</script>)#s',
42+
fn(array $m): string => $m[1] . str_replace('</', '<\/', $m[2]) . $m[3],
43+
$html,
44+
);
45+
46+
$dom = new \DOMDocument;
47+
$dom->loadHTML($html);
48+
} else {
49+
if (!preg_match('~<!DOCTYPE~i', $html)) {
50+
$html = '<!DOCTYPE html>' . $html;
51+
}
52+
$dom = Dom\HTMLDocument::createFromString($html, Dom\HTML_NO_DEFAULT_NS, 'UTF-8');
53+
}
4054

41-
$dom = new \DOMDocument;
42-
$old = libxml_use_internal_errors(true);
43-
libxml_clear_errors();
44-
$dom->loadHTML($html);
4555
$errors = libxml_get_errors();
4656
libxml_use_internal_errors($old);
4757

@@ -65,32 +75,43 @@ public static function fromXml(string $xml): self
6575

6676

6777
/**
68-
* Finds descendants of current element that match the given CSS selector.
78+
* Returns array of elements matching CSS selector.
6979
* @return DomQuery[]
7080
*/
7181
public function find(string $selector): array
7282
{
73-
return str_starts_with($selector, ':scope')
74-
? $this->xpath('self::' . self::css2xpath(substr($selector, 6)))
75-
: $this->xpath('descendant::' . self::css2xpath($selector));
83+
if (PHP_VERSION_ID < 80400) {
84+
return str_starts_with($selector, ':scope')
85+
? $this->xpath('self::' . self::css2xpath(substr($selector, 6)))
86+
: $this->xpath('descendant::' . self::css2xpath($selector));
87+
}
88+
89+
return array_map(
90+
fn($el) => simplexml_import_dom($el, self::class),
91+
iterator_to_array(Dom\import_simplexml($this)->querySelectorAll($selector)),
92+
);
7693
}
7794

7895

7996
/**
80-
* Checks if any descendant of current element matches the given selector.
97+
* Checks if any descendant matches CSS selector.
8198
*/
8299
public function has(string $selector): bool
83100
{
84-
return (bool) $this->find($selector);
101+
return PHP_VERSION_ID < 80400
102+
? (bool) $this->find($selector)
103+
: (bool) Dom\import_simplexml($this)->querySelector($selector);
85104
}
86105

87106

88107
/**
89-
* Determines if the current element matches the specified CSS selector.
108+
* Checks if element matches CSS selector.
90109
*/
91110
public function matches(string $selector): bool
92111
{
93-
return (bool) $this->xpath('self::' . self::css2xpath($selector));
112+
return PHP_VERSION_ID < 80400
113+
? (bool) $this->xpath('self::' . self::css2xpath($selector))
114+
: Dom\import_simplexml($this)->matches($selector);
94115
}
95116

96117

Lines changed: 122 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,122 @@
1+
<?php
2+
3+
/**
4+
* @phpVersion 8.4
5+
*/
6+
7+
declare(strict_types=1);
8+
9+
use Tester\Assert;
10+
use Tester\DomQuery;
11+
12+
require __DIR__ . '/../bootstrap.php';
13+
14+
15+
test('fromHtml() creates DomQuery from HTML string', function () {
16+
$dom = DomQuery::fromHtml('<div class="test"><p>Hello</p></div>');
17+
Assert::type(DomQuery::class, $dom);
18+
Assert::true($dom->has('div'));
19+
});
20+
21+
test('fromHtml() handles HTML without root element', function () {
22+
$dom = DomQuery::fromHtml('Hello world');
23+
Assert::type(DomQuery::class, $dom);
24+
Assert::contains('Hello world', (string) $dom->find('body')[0]);
25+
});
26+
27+
test('fromHtml() handles void elements correctly', function () {
28+
$dom = DomQuery::fromHtml('<div><source src="test.mp3"><wbr>test</div>');
29+
Assert::true($dom->has('source'));
30+
Assert::true($dom->has('wbr'));
31+
});
32+
33+
test('fromHtml() handles script tags with </ inside', function () {
34+
$dom = DomQuery::fromHtml('<script>if (a</b) { alert("test"); }</script>');
35+
Assert::true($dom->has('script'));
36+
});
37+
38+
test('find() returns matching elements', function () {
39+
$dom = DomQuery::fromHtml('
40+
<div class="container">
41+
<p class="first">First paragraph</p>
42+
<p class="second">Second paragraph</p>
43+
<span>Test span</span>
44+
</div>
45+
');
46+
47+
$paragraphs = $dom->find('p');
48+
Assert::count(2, $paragraphs);
49+
Assert::contains('First paragraph', (string) $paragraphs[0]);
50+
51+
$spans = $dom->find('span');
52+
Assert::count(1, $spans);
53+
Assert::contains('Test span', (string) $spans[0]);
54+
});
55+
56+
test('find() supports complex CSS selectors', function () {
57+
$dom = DomQuery::fromHtml('
58+
<div class="container">
59+
<p class="first">First</p>
60+
<div class="wrapper">
61+
<p class="second">Second</p>
62+
<p class="third">Third</p>
63+
</div>
64+
</div>
65+
');
66+
67+
$results = $dom->find('div.wrapper p');
68+
Assert::count(2, $results);
69+
Assert::contains('Second', (string) $results[0]);
70+
71+
$results = $dom->find('p.first + div');
72+
Assert::count(1, $results);
73+
Assert::true($results[0]->has('p.second'));
74+
});
75+
76+
test('has() checks for existence of elements', function () {
77+
$dom = DomQuery::fromHtml('
78+
<div class="test">
79+
<span class="inner">Test</span>
80+
</div>
81+
');
82+
83+
Assert::true($dom->has('span.inner'));
84+
Assert::true($dom->has('div.test'));
85+
Assert::false($dom->has('p'));
86+
Assert::false($dom->has('.nonexistent'));
87+
});
88+
89+
test('matches() checks if element matches selector', function () {
90+
$dom = DomQuery::fromHtml('<div class="test"><p class="para">Test</p></div>');
91+
$para = $dom->find('p')[0];
92+
93+
Assert::true($para->matches('p'));
94+
Assert::true($para->matches('.para'));
95+
Assert::true($para->matches('p.para'));
96+
Assert::false($para->matches('div'));
97+
Assert::false($para->matches('.test'));
98+
});
99+
100+
test('find() returns empty array for no matches', function () {
101+
$dom = DomQuery::fromHtml('<div></div>');
102+
Assert::same([], $dom->find('nonexistent'));
103+
});
104+
105+
test('handles malformed HTML gracefully', function () {
106+
Assert::error(function () use (&$dom) {
107+
$dom = DomQuery::fromHtml('<div><p>Unclosed paragraph<span>Test</div>');
108+
}, E_USER_WARNING, 'Tester\DomQuery::fromHtml: tree error unexpected-element-in-open-elements-stack%a%');
109+
Assert::true($dom->has('div'));
110+
Assert::true($dom->has('p'));
111+
Assert::true($dom->has('span'));
112+
});
113+
114+
test('handles HTML entities in attributes', function () {
115+
$dom = DomQuery::fromHtml('<div data-test="&quot;quoted&quot;">Test</div>');
116+
Assert::true($dom->find('div')[0]->matches('[data-test="\\"quoted\\""]'));
117+
});
118+
119+
test('handles UTF-8', function () {
120+
$q = DomQuery::fromHtml('<p>žluťoučký</p>');
121+
Assert::same('žluťoučký', (string) $q->find('p')[0]);
122+
});

tests/Framework/DomQuery.fromXml.phpt

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -31,9 +31,11 @@ Assert::count(2, $results);
3131
Assert::type(DomQuery::class, $results[0]);
3232
Assert::type(DomQuery::class, $results[1]);
3333

34-
// children
35-
$results = $dom->find(':scope > item');
36-
Assert::count(2, $results);
34+
if (PHP_VERSION_ID < 80400) { // TODO: not yet supported by Lexbor
35+
// children
36+
$results = $dom->find(':scope > item');
37+
Assert::count(2, $results);
38+
}
3739

3840
// has
3941
Assert::true($dom->has('#test1'));

0 commit comments

Comments
 (0)