Skip to content

Commit 2962af0

Browse files
adamziel authored and brandonpayton committed
[Data Liberation] Add HTML to Blocks converter (#2095)
Adds a basic `WP_HTML_To_Blocks` class that accepts HTML and outputs block markup.

It's a very basic converter. It only considers the markup and won't consider any visual changes introduced via CSS or JavaScript. Only a few core blocks are supported in this initial PR. The API can easily support more HTML elements and blocks.

To preserve visual fidelity between the original HTML page and the produced block markup, we'll need an annotated HTML input produced by the [Try WordPress](https://github.com/WordPress/try-wordpress/) browser extension. It would contain each element's colors, sizes, etc. We cannot possibly get all of that from just analyzing the HTML on the server without building a full-blown, browser-like HTML renderer in PHP, and I know I'm not building one.

A part of #1894

## Example

```php
$html = <<<HTML
<meta name="post_title" content="My first post">
<p>Hello <b>world</b>!</p>
HTML;

$converter = new WP_HTML_To_Blocks( $html );
$converter->convert();

var_dump( $converter->get_all_metadata() );
/*
 * array( 'post_title' => array( 'My first post' ) )
 */

var_dump( $converter->get_block_markup() );
/*
 * <!-- wp:paragraph -->
 * <p>Hello <b>world</b>!</p>
 * <!-- /wp:paragraph -->
 */
```

## Caveats

I had to patch WP_HTML_Processor to stop bailing out on `<meta>` tags referencing the document charset. Ideally we'd patch WordPress core to stop bailing out when the charset is UTF-8.

## Testing instructions

This PR mostly adds new code. Just confirm the unit tests pass in CI.

cc @brandonpayton @zaerl @sirreal @dmsnell @ellatrix
1 parent 4a9ad81 commit 2962af0

16 files changed

+4143
-11
lines changed

.eslintignore

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ __pycache__
77
packages/playground/wordpress-builds/src/wordpress
88
packages/playground/wordpress-builds/public
99
packages/playground/sync/src/test/wp-*
10+
packages/playground/data-liberation/tests/fixtures
1011
packages/php-wasm/node/src/test/__test*
1112
*.timestamp-1678999213403.mjs
1213
.local

.prettierignore

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
/packages/playground/wordpress-builds/build/build-assets
99
/packages/playground/wordpress-builds/src/wordpress
1010
/packages/playground/wordpress-builds/public/
11+
/packages/playground/data-liberation/tests/fixtures
1112
/packages/php-wasm/node/src/test/__test*
1213
__pycache__
1314
*.timestamp-1678999213403.mjs

packages/playground/data-liberation-markdown/src/WP_Markdown_To_Blocks.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ public function get_all_metadata() {
5252
return $this->frontmatter;
5353
}
5454

55-
public function get_meta_value( $key ) {
55+
public function get_first_meta_value( $key ) {
5656
if ( ! array_key_exists( $key, $this->frontmatter ) ) {
5757
return null;
5858
}

packages/playground/data-liberation/bootstrap.php

+4
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,13 @@
4848
require_once __DIR__ . '/src/block-markup/WP_Block_Markup_Processor.php';
4949
require_once __DIR__ . '/src/block-markup/WP_Block_Markup_Url_Processor.php';
5050
require_once __DIR__ . '/src/block-markup/WP_URL_In_Text_Processor.php';
51+
require_once __DIR__ . '/src/block-markup/WP_HTML_To_Blocks.php';
5152
require_once __DIR__ . '/src/block-markup/WP_URL.php';
5253

5354
require_once __DIR__ . '/src/xml-api/WP_XML_Decoder.php';
5455
require_once __DIR__ . '/src/xml-api/WP_XML_Processor.php';
5556
require_once __DIR__ . '/src/wxr/WP_WXR_Reader.php';
57+
require_once __DIR__ . '/src/import/WP_Import_Utils.php';
5658
require_once __DIR__ . '/src/import/WP_Block_Object.php';
5759
require_once __DIR__ . '/src/import/WP_Entity_Importer.php';
5860
require_once __DIR__ . '/src/import/WP_File_Visitor.php';
@@ -64,6 +66,8 @@
6466
require_once __DIR__ . '/src/import/WP_Stream_Importer.php';
6567
require_once __DIR__ . '/src/import/WP_Entity_Iterator_Chain.php';
6668
require_once __DIR__ . '/src/import/WP_Retry_Frontloading_Iterator.php';
69+
require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php';
70+
require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php';
6771

6872
require_once __DIR__ . '/src/utf8_decoder.php';
6973

packages/playground/data-liberation/phpunit.xml

+2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" bootstrap="bootstrap.php" colors="true" xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/10.0/phpunit.xsd" cacheDirectory=".phpunit.cache">
33
<testsuites>
44
<testsuite name="Application Test Suite">
5+
<file>tests/WPHTMLEntityReaderTests.php</file>
6+
<file>tests/WPHTMLToBlocksTests.php</file>
57
<file>tests/WPWXRReaderTests.php</file>
68
<file>tests/WPRewriteUrlsTests.php</file>
79
<file>tests/WPURLInTextProcessorTests.php</file>
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,57 @@
11
<?php
22

3+
/**
4+
* Represents a {Data Format} -> Block Markup + Metadata converter.
5+
*
6+
* Used by the Data Liberation importers to accept data formatted as HTML, Markdown, etc.
7+
* and convert them to WordPress posts.
8+
*/
39
interface WP_Block_Markup_Converter {
10+
/**
11+
* Converts the input document specified in the constructor to block markup.
12+
*
13+
* @return bool Whether the conversion was successful.
14+
*/
415
public function convert();
16+
17+
/**
18+
* Gets the block markup generated by the convert() method.
19+
*
20+
* @return string The block markup.
21+
*/
522
public function get_block_markup();
23+
24+
/**
25+
* Gets all the metadata sourced from the input document by the convert() method.
26+
* The data format is:
27+
*
28+
* array(
29+
* 'post_title' => array( 'The Name of the Wind' ),
30+
* 'post_author' => array( 'Patrick Rothfuss', 'Betsy Wollheim' )
31+
* )
32+
*
33+
* Note each meta key may have multiple values. The consumer of this interface
34+
* must account for this.
35+
*
36+
* @return array The metadata sourced from the input document.
37+
*/
638
public function get_all_metadata();
7-
public function get_meta_value( $key );
39+
40+
/**
41+
* Gets the first metadata value for a given key.
42+
*
43+
* Example:
44+
*
45+
* Metadata:
46+
* array(
47+
* 'post_title' => array( 'The Name of the Wind' ),
48+
* 'post_author' => array( 'Patrick Rothfuss', 'Betsy Wollheim' )
49+
* )
50+
*
51+
* get_first_meta_value( 'post_author' ) returns 'Patrick Rothfuss'.
52+
*
53+
* @param string $key The metadata key.
54+
* @return mixed The metadata value.
55+
*/
56+
public function get_first_meta_value( $key );
857
}

packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Processor.php

+34-9
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,25 @@ public function get_block_attributes() {
5858
return $this->block_attributes;
5959
}
6060

61+
/**
62+
* Overwrites all the block attributes of the currently matched block
63+
* opener.
64+
*
65+
* @param array $attributes The attributes to set.
66+
* @return bool Whether the attributes were set.
67+
*/
68+
public function set_block_attributes( $attributes ) {
69+
if ( '#block-comment' !== $this->get_token_type() ) {
70+
return false;
71+
}
72+
if ( $this->is_block_closer() ) {
73+
return false;
74+
}
75+
$this->block_attributes = $attributes;
76+
$this->block_attributes_updated = true;
77+
return true;
78+
}
79+
6180
public function is_block_closer() {
6281
return $this->block_name !== null && $this->block_closer === true;
6382
}
@@ -165,17 +184,23 @@ private function block_attribute_updates_to_modifiable_text_updates() {
165184
if ( ! $this->block_attributes_updated ) {
166185
return false;
167186
}
187+
$encoded_attributes = json_encode(
188+
$this->block_attributes_iterator
189+
? $this->block_attributes_iterator->getSubIterator( 0 )->getArrayCopy()
190+
: $this->block_attributes,
191+
JSON_HEX_TAG | // Convert < and > to \u003C and \u003E
192+
JSON_HEX_AMP // Convert & to \u0026
193+
);
194+
if ( $encoded_attributes === '[]' ) {
195+
$encoded_attributes = '';
196+
} else {
197+
$encoded_attributes .= ' ';
198+
}
168199
$this->set_modifiable_text(
169200
' ' .
170-
$this->block_name . ' ' .
171-
json_encode(
172-
$this->block_attributes_iterator
173-
? $this->block_attributes_iterator->getSubIterator( 0 )->getArrayCopy()
174-
: $this->block_attributes,
175-
JSON_HEX_TAG | // Convert < and > to \u003C and \u003E
176-
JSON_HEX_AMP // Convert & to \u0026
177-
)
178-
. ' '
201+
$this->block_name .
202+
' ' .
203+
$encoded_attributes
179204
);
180205

181206
return true;

0 commit comments

Comments
 (0)