Skip to content

Commit d293c22

Browse files
committed
Use WP_XML_Reader for EPubs, support simple DOCTYPE declarations in XML
1 parent e01dec8 commit d293c22

File tree

8 files changed

+178
-36
lines changed

8 files changed

+178
-36
lines changed

packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,36 +28,45 @@ class WP_HTML_To_Blocks implements WP_Block_Markup_Converter {
2828

2929
private $state = self::STATE_READY;
3030
private $block_stack = array();
31-
private $html;
31+
private $markup_processor;
3232
private $ignore_text = false;
3333
private $in_ephemeral_paragraph = false;
3434
private $block_markup = '';
3535
private $metadata = array();
36+
private $last_error = null;
3637

37-
public function __construct( $html ) {
38-
$this->html = new \WP_HTML_Processor( $html );
38+
public function __construct( $markup_processor ) {
39+
$this->markup_processor = $markup_processor;
3940
}
4041

4142
public function convert() {
4243
if ( self::STATE_READY !== $this->state ) {
4344
return false;
4445
}
4546

46-
while ( $this->html->next_token() ) {
47-
switch ( $this->html->get_token_type() ) {
47+
while ( $this->markup_processor->next_token() ) {
48+
var_dump( $this->markup_processor->get_token_type() );
49+
switch ( $this->markup_processor->get_token_type() ) {
4850
case '#text':
4951
if ( $this->ignore_text ) {
5052
break;
5153
}
52-
$this->append_rich_text( htmlspecialchars( $this->html->get_modifiable_text() ) );
54+
$this->append_rich_text( htmlspecialchars( $this->markup_processor->get_modifiable_text() ) );
5355
break;
5456
case '#tag':
5557
$this->handle_tag();
5658
break;
5759
}
5860
}
61+
var_dump( $this->markup_processor->get_last_error() );
62+
63+
if ( $this->markup_processor->get_last_error() ) {
64+
$this->last_error = $this->markup_processor->get_last_error();
65+
return false;
66+
}
5967

6068
$this->close_ephemeral_paragraph();
69+
6170
return true;
6271
}
6372

@@ -77,8 +86,8 @@ public function get_block_markup() {
7786
}
7887

7988
private function handle_tag() {
80-
$html = $this->html;
81-
$tag = $html->get_tag();
89+
$html = $this->markup_processor;
90+
$tag = strtoupper( $html->get_tag() );
8291
$tag_lowercase = strtolower( $tag );
8392

8493
$is_tag_opener = ! $html->is_tag_closer();
@@ -304,7 +313,7 @@ private function should_preserve_tag_in_rich_text( $tag ) {
304313
}
305314

306315
private function is_at_inline_code_element() {
307-
$breadcrumbs = $this->html->get_breadcrumbs();
316+
$breadcrumbs = $this->markup_processor->get_breadcrumbs();
308317
foreach ( $breadcrumbs as $tag ) {
309318
switch ( $tag ) {
310319
case 'A':
@@ -392,4 +401,8 @@ private function close_ephemeral_paragraph() {
392401
$this->in_ephemeral_paragraph = false;
393402
}
394403
}
404+
405+
public function get_last_error() {
406+
return $this->last_error;
407+
}
395408
}

packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,17 @@ class WP_EPub_Entity_Reader extends WP_Entity_Reader {
2626
protected $current_post_id;
2727
protected $remaining_html_files;
2828
protected $current_html_reader;
29-
29+
protected $last_error;
3030
public function __construct( WP_Zip_Filesystem $zip, $first_post_id = 1 ) {
3131
$this->zip = $zip;
3232
$this->current_post_id = $first_post_id;
3333
}
3434

3535
public function next_entity() {
36-
// If we're finished, we're finished.
36+
if ( $this->last_error ) {
37+
return false;
38+
}
39+
3740
if ( $this->finished ) {
3841
return false;
3942
}
@@ -92,16 +95,14 @@ public function next_entity() {
9295

9396
$html_file = array_shift( $this->remaining_html_files );
9497
$html = $this->zip->read_file( $html_file );
95-
/**
96-
* @TODO: Don't just assume that WP_HTML_Entity_Reader can
97-
* handle an XHTML file. We might run into XML-specific
98-
* subtleties that will derail the process.
99-
* Let's consider using WP_XML_Processor instead.
100-
*/
101-
$this->current_html_reader = new \WP_HTML_Entity_Reader(
102-
$html,
98+
$this->current_html_reader = new WP_HTML_Entity_Reader(
99+
WP_XML_Processor::create_from_string( $html ),
103100
$this->current_post_id
104101
);
102+
if ( $this->current_html_reader->get_last_error() ) {
103+
$this->last_error = $this->current_html_reader->get_last_error();
104+
return false;
105+
}
105106
++$this->current_post_id;
106107
}
107108

@@ -117,6 +118,6 @@ public function is_finished(): bool {
117118
}
118119

119120
public function get_last_error(): ?string {
120-
return null;
121+
return $this->last_error;
121122
}
122123
}

packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,21 @@
11
<?php
22

3-
use WordPress\Data_Liberation\Block_Markup\WP_HTML_To_Blocks;
4-
53
/**
64
* Converts a single HTML file into a stream of WordPress entities.
75
*
86
* @TODO: Support post meta.
97
*/
108
class WP_HTML_Entity_Reader extends WP_Entity_Reader {
119

12-
protected $html;
10+
protected $html_processor;
1311
protected $entities;
1412
protected $finished = false;
1513
protected $post_id;
14+
protected $last_error;
1615

17-
public function __construct( $html, $post_id ) {
18-
$this->html = $html;
19-
$this->post_id = $post_id;
16+
public function __construct( $html_processor, $post_id ) {
17+
$this->html_processor = $html_processor;
18+
$this->post_id = $post_id;
2019
}
2120

2221
public function next_entity() {
@@ -36,8 +35,9 @@ public function next_entity() {
3635
}
3736

3837
// We did not read any entities yet. Let's convert the HTML document into entities.
39-
$converter = new WP_HTML_To_Blocks( $this->html );
38+
$converter = new WP_HTML_To_Blocks( $this->html_processor );
4039
if ( false === $converter->convert() ) {
40+
$this->last_error = $converter->get_last_error();
4141
return false;
4242
}
4343

@@ -90,6 +90,6 @@ public function is_finished(): bool {
9090
}
9191

9292
public function get_last_error(): ?string {
93-
return null;
93+
return $this->last_error;
9494
}
9595
}

packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php

Lines changed: 88 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,7 +1558,6 @@ private function parse_next_tag() {
15581558
* See https://www.w3.org/TR/xml11/#sec-cdata-sect
15591559
*/
15601560
if (
1561-
! $this->is_closing_tag &&
15621561
$doc_length > $this->token_starts_at + 8 &&
15631562
'[' === $xml[ $this->token_starts_at + 2 ] &&
15641563
'C' === $xml[ $this->token_starts_at + 3 ] &&
@@ -1583,6 +1582,59 @@ private function parse_next_tag() {
15831582
return true;
15841583
}
15851584

1585+
/*
1586+
* Identify DOCTYPE nodes.
1587+
*
1588+
* See https://www.w3.org/TR/xml11/#sec-prolog-dtd
1589+
*/
1590+
if (
1591+
$doc_length > $this->token_starts_at + 8 &&
1592+
'D' === $xml[ $at + 2 ] &&
1593+
'O' === $xml[ $at + 3 ] &&
1594+
'C' === $xml[ $at + 4 ] &&
1595+
'T' === $xml[ $at + 5 ] &&
1596+
'Y' === $xml[ $at + 6 ] &&
1597+
'P' === $xml[ $at + 7 ] &&
1598+
'E' === $xml[ $at + 8 ]
1599+
) {
1600+
$at += 9;
1601+
// Skip whitespace.
1602+
$at += strspn( $this->xml, " \t\f\r\n", $at );
1603+
1604+
if ( $doc_length <= $at ) {
1605+
$this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' );
1606+
1607+
return false;
1608+
}
1609+
1610+
// @TODO: Expose the DOCTYPE name value instead of silently skipping over it.
1611+
$at += $this->parse_name( $at );
1612+
1613+
// Skip whitespace.
1614+
$at += strspn( $this->xml, " \t\f\r\n", $at );
1615+
1616+
if ( $doc_length <= $at ) {
1617+
$this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' );
1618+
return false;
1619+
}
1620+
1621+
if ( $this->xml[ $at ] !== '>' ) {
1622+
$this->last_error = self::ERROR_SYNTAX;
1623+
_doing_it_wrong(
1624+
__METHOD__,
1625+
__( 'Unsupported DOCTYPE syntax. Only a simple <!DOCTYPE name> is supported.' ),
1626+
'WP_VERSION'
1627+
);
1628+
return false;
1629+
}
1630+
1631+
$closer_at = $at;
1632+
$this->parser_state = self::STATE_DOCTYPE_NODE;
1633+
$this->token_length = $closer_at + 1 - $this->token_starts_at;
1634+
$this->bytes_already_parsed = $closer_at + 1;
1635+
return true;
1636+
}
1637+
15861638
/*
15871639
* Anything else here is either unsupported at this point or invalid
15881640
* syntax. See the class-level @TODO annotations for more information.
@@ -1592,6 +1644,7 @@ private function parse_next_tag() {
15921644
return false;
15931645
}
15941646

1647+
15951648
/*
15961649
* An `<?xml` token at the beginning of the document marks a start of an
15971650
* xml declaration.
@@ -2471,6 +2524,22 @@ public function get_tag() {
24712524
return null;
24722525
}
24732526

2527+
/**
2528+
* Indicates if the currently matched tag is expected to be closed.
2529+
* Returns true for tag openers (<div>) and false for empty elements (<img />) and tag closers (</div>).
2530+
*
2531+
* This method exists to provide a consistent interface with WP_HTML_Processor.
2532+
*
2533+
* @return bool Whether the tag is expected to be closed.
2534+
*/
2535+
public function expects_closer() {
2536+
if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
2537+
return false;
2538+
}
2539+
2540+
return ! $this->is_empty_element() && ! $this->is_closing_tag;
2541+
}
2542+
24742543
/**
24752544
* Indicates if the currently matched tag is an empty element tag.
24762545
*
@@ -2604,6 +2673,9 @@ public function get_token_name() {
26042673
case self::STATE_CDATA_NODE:
26052674
return '#cdata-section';
26062675

2676+
case self::STATE_DOCTYPE_NODE:
2677+
return '#doctype';
2678+
26072679
case self::STATE_XML_DECLARATION:
26082680
return '#xml-declaration';
26092681

@@ -3030,10 +3102,11 @@ private function step_in_prolog( $node_to_process = self::PROCESS_NEXT_NODE ) {
30303102
$this->last_error = self::ERROR_SYNTAX;
30313103
_doing_it_wrong( __METHOD__, 'Unexpected token type in prolog stage.', 'WP_VERSION' );
30323104
}
3033-
30343105
return $this->step();
3035-
case '#xml-declaration':
3106+
// @TODO: Fail if there's more than one <!DOCTYPE> or if <!DOCTYPE> was found before the XML declaration token.
3107+
case '#doctype':
30363108
case '#comment':
3109+
case '#xml-declaration':
30373110
case '#processing-instructions':
30383111
return true;
30393112
case '#tag':
@@ -3393,6 +3466,18 @@ private function mark_incomplete_input(
33933466
*/
33943467
const STATE_CDATA_NODE = 'STATE_CDATA_NODE';
33953468

3469+
/**
3470+
* Parser DOCTYPE Node State.
3471+
*
3472+
* Indicates that the parser has found a DOCTYPE declaration. Only the simple
3473+
* `<!DOCTYPE name>` form is recognized, and the name is not exposed yet.
3474+
*
3475+
* @since WP_VERSION
3476+
*
3477+
* @access private
3478+
*/
3479+
const STATE_DOCTYPE_NODE = 'STATE_DOCTYPE_NODE';
3480+
33963481
/**
33973482
* Indicates that the parser has found an XML processing instruction.
33983483
*

packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ public function test_entity_reader( $reader ) {
2121
'data' => $data,
2222
];
2323
}
24+
$this->assertNull( $reader->get_last_error() );
2425
$this->assertEquals( 3, count($entities) );
2526
$this->assertEquals( 117, strlen($entities[0]['data']['content']) );
2627
$this->assertGreaterThan( 1000, strlen($entities[1]['data']['content']) );

packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ public function test_entity_reader() {
1313
<h1>It is our pleasure to announce that WordPress 6.8 was released</h1>
1414
<p>Last week, WordPress 6.8 was released.</p>
1515
HTML;
16-
$reader = new WP_HTML_Entity_Reader( $html, 1 );
16+
$reader = new WP_HTML_Entity_Reader( new WP_HTML_Processor( $html ), 1 );
1717
$entities = [];
1818
while ( $reader->next_entity() ) {
1919
$data = $reader->get_entity()->get_data();

packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ public function test_metadata_extraction() {
1616
<h1>WordPress 6.8 was released</h1>
1717
<p>Last week, WordPress 6.8 was released. This release includes a new default theme, a new block editor experience, and a new block library. It also includes a new block editor experience, and a new block library.</p>
1818
HTML;
19-
$converter = new WP_HTML_To_Blocks( $html );
19+
$converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $html ) );
2020
$converter->convert( $html );
2121
$metadata = $converter->get_all_metadata();
2222
$expected_metadata = [
@@ -35,7 +35,7 @@ public function test_metadata_extraction() {
3535
* @dataProvider provider_test_conversion
3636
*/
3737
public function test_html_to_blocks_conversion( $html, $expected ) {
38-
$converter = new WP_HTML_To_Blocks( $html );
38+
$converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $html ) );
3939
$converter->convert( $html );
4040
$blocks = $converter->get_block_markup();
4141

@@ -136,7 +136,7 @@ public function provider_test_conversion() {
136136

137137
public function test_html_to_blocks_excerpt() {
138138
$input = file_get_contents( __DIR__ . '/fixtures/html-to-blocks/excerpt.input.html' );
139-
$converter = new WP_HTML_To_Blocks( $input );
139+
$converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $input ) );
140140
$converter->convert( $input );
141141
$blocks = $converter->get_block_markup();
142142

0 commit comments

Comments
 (0)