diff --git a/.eslintignore b/.eslintignore
index b3ac4e5866..2f55011606 100644
--- a/.eslintignore
+++ b/.eslintignore
@@ -7,6 +7,7 @@ __pycache__
packages/playground/wordpress-builds/src/wordpress
packages/playground/wordpress-builds/public
packages/playground/sync/src/test/wp-*
+packages/playground/data-liberation/tests/fixtures
packages/php-wasm/node/src/test/__test*
*.timestamp-1678999213403.mjs
.local
diff --git a/.prettierignore b/.prettierignore
index 9162807152..de4d6784be 100644
--- a/.prettierignore
+++ b/.prettierignore
@@ -8,6 +8,7 @@
/packages/playground/wordpress-builds/build/build-assets
/packages/playground/wordpress-builds/src/wordpress
/packages/playground/wordpress-builds/public/
+/packages/playground/data-liberation/tests/fixtures
/packages/php-wasm/node/src/test/__test*
__pycache__
*.timestamp-1678999213403.mjs
diff --git a/packages/playground/blueprints/src/lib/steps/activate-plugin.ts b/packages/playground/blueprints/src/lib/steps/activate-plugin.ts
index 0d08958340..706b0a8211 100644
--- a/packages/playground/blueprints/src/lib/steps/activate-plugin.ts
+++ b/packages/playground/blueprints/src/lib/steps/activate-plugin.ts
@@ -1,7 +1,5 @@
-import { phpVar } from '@php-wasm/util';
import { StepHandler } from '.';
import { logger } from '@php-wasm/logger';
-
/**
* @inheritDoc activatePlugin
* @example
@@ -39,18 +37,18 @@ export const activatePlugin: StepHandler ',
- )
- );
+ $this->push_block( 'paragraph' );
+ $this->append_content( ' ' );
break;
case Inline\Newline::class:
@@ -236,6 +220,15 @@ private function convert_markdown_to_blocks() {
if ( $node->getTitle() ) {
$html->set_attribute( 'title', $node->getTitle() );
}
+
+ $children = $node->children();
+ if ( count( $children ) > 0 && $children[0] instanceof Inline\Text && $children[0]->getLiteral() ) {
+ $html->set_attribute( 'alt', $children[0]->getLiteral() );
+ // Empty the text node so it will not be rendered twice: once in as an alt="",
+ // and once as a new paragraph block.
+ $children[0]->setLiteral( '' );
+ }
+
$this->append_content( $html->get_updated_html() );
break;
@@ -257,6 +250,10 @@ private function convert_markdown_to_blocks() {
}
} else {
switch ( get_class( $node ) ) {
+ case ExtensionBlock\BlockQuote::class:
+ $this->append_content( '',
- )
+ $attrs = array(
+ 'ordered' => $node->getListData()->type === 'ordered',
);
if ( $node->getListData()->start && $node->getListData()->start !== 1 ) {
- $this->current_block->attrs['start'] = $node->getListData()->start;
+ $attrs['start'] = $node->getListData()->start;
}
+ $this->push_block(
+ 'list',
+ $attrs
+ );
+ $this->append_content( '
' );
break;
case ExtensionBlock\ListItem::class:
- $this->push_block(
- 'list-item',
- array(
- 'content' => '
' );
break;
case TableSection::class:
- $this->push_block(
- 'table-section',
- array(
- 'type' => $node->isHead() ? 'head' : 'body',
- )
- );
+ $is_head = $node->isHead();
+ array_push( $this->table_stack, $is_head ? 'head' : 'body' );
+ $this->append_content( $is_head ? '' : '' );
break;
case TableRow::class:
- $this->push_block( 'table-row' );
+ $this->append_content( '
' );
break;
case TableCell::class:
/** @var TableCell $node */
- $this->push_block( 'table-cell' );
+ $is_header = $this->current_block() && $this->current_block()->block_name === 'table' && end( $this->table_stack ) === 'head';
+ $tag = $is_header ? 'th' : 'td';
+ $this->append_content( '<' . $tag . '>' );
break;
case ExtensionBlock\BlockQuote::class:
$this->push_block( 'quote' );
+ $this->append_content( ' ' );
+ break;
+ case TableRow::class:
+ $this->append_content( '' );
+ break;
+ case TableCell::class:
+ $is_header = $this->current_block() && $this->current_block()->block_name === 'table' && end( $this->table_stack ) === 'head';
+ $tag = $is_header ? 'th' : 'td';
+ $this->append_content( '' . $tag . '>' );
break;
case Table::class:
- $table = '' );
break;
case ExtensionBlock\FencedCode::class:
case ExtensionBlock\IndentedCode::class:
- $this->push_block(
- 'code',
- array(
- 'content' => '
' );
+ $this->pop_block();
+ break;
case ExtensionBlock\ListBlock::class:
$this->append_content( '' );
$this->pop_block();
@@ -279,53 +276,25 @@ private function convert_markdown_to_blocks() {
$this->append_content( '' );
break;
case TableSection::class:
- $table_section = $this->pop_block();
- $type = $table_section->attrs['type'];
- $tag = $type === 'head' ? 'th' : 'td';
-
- $parsed_rows = array();
- foreach ( $table_section->inner_blocks as $row ) {
- $parsed_row = array();
- foreach ( $row->inner_blocks as $cell ) {
- $parsed_row[] = array(
- 'tag' => $tag,
- 'content' => $cell->attrs['content'] ?? '',
- );
- }
- $parsed_rows[] = $parsed_row;
- }
-
- $table = $this->current_block;
- if ( $type === 'head' ) {
- $table->attrs[ $type ] = $parsed_rows[0];
- } else {
- $table->attrs[ $type ] = $parsed_rows;
- }
- $table->inner_blocks = array();
+ $is_head = $node->isHead();
+ array_pop( $this->table_stack );
+ $this->append_content( $is_head ? '' : '
',
- )
+ $attrs = array(
+ 'language' => null,
);
if ( method_exists( $node, 'getInfo' ) && $node->getInfo() ) {
- $this->current_block->attrs['language'] = preg_replace( '/[ \t\r\n\f].*/', '', $node->getInfo() );
+ $attrs['language'] = preg_replace( '/[ \t\r\n\f].*/', '', $node->getInfo() );
}
+ $this->push_block( 'code', $attrs );
+ $this->append_content( '' . trim( str_replace( "\n", '
', htmlspecialchars( $node->getLiteral() ) ) ) . '
' );
break;
case ExtensionBlock\HtmlBlock::class:
- $this->push_block(
- 'html',
- array(
- 'content' => $node->getLiteral(),
- )
- );
+ $this->push_block( 'html' );
+ $this->append_content( $node->getLiteral() );
break;
case ExtensionBlock\ThematicBreak::class:
@@ -192,15 +179,12 @@ private function convert_markdown_to_blocks() {
break;
case Block\Paragraph::class:
- if ( $this->current_block->block_name === 'list-item' ) {
+ $current_block = $this->current_block();
+ if ( $current_block && $current_block->block_name === 'list-item' ) {
break;
}
- $this->push_block(
- 'paragraph',
- array(
- 'content' => '' . trim( str_replace( "\n", '
', htmlspecialchars( $node->getLiteral() ) ) ) . '';
- $table .= '
';
- $table .= '';
- foreach ( $this->current_block->attrs['head'] as $cell ) {
- $table .= ' ';
- foreach ( $this->current_block->attrs['body'] as $row ) {
- $table .= '' . $cell['content'] . ' ';
- }
- $table .= '';
- foreach ( $row as $cell ) {
- $table .= ' ';
- }
- $table .= '' . $cell['content'] . ' ';
- }
- $table .= '
Hello world!
+ * + * Becomes: + * + * + *Hello world!
+ * + * + * With the following metadata: + * + * array( + * 'post_title' => array( 'My first post' ), + * ) + */ +class WP_HTML_To_Blocks implements WP_Block_Markup_Converter { + const STATE_READY = 'STATE_READY'; + const STATE_COMPLETE = 'STATE_COMPLETE'; + + private $state = self::STATE_READY; + private $block_stack = array(); + private $markup_processor; + private $ignore_text = false; + private $in_ephemeral_paragraph = false; + private $block_markup = ''; + private $metadata = array(); + private $last_error = null; + + public function __construct( $markup_processor ) { + $this->markup_processor = $markup_processor; + } + + public function convert() { + if ( self::STATE_READY !== $this->state ) { + return false; + } + + while ( $this->markup_processor->next_token() ) { + switch ( $this->markup_processor->get_token_type() ) { + case '#text': + if ( $this->ignore_text ) { + break; + } + $this->append_rich_text( htmlspecialchars( $this->markup_processor->get_modifiable_text() ) ); + break; + case '#tag': + $this->handle_tag(); + break; + } + } + + if ( $this->markup_processor->get_last_error() ) { + $this->last_error = $this->markup_processor->get_last_error(); + return false; + } + + $this->close_ephemeral_paragraph(); + + return true; + } + + public function get_meta_value( $key ) { + if ( ! array_key_exists( $key, $this->metadata ) ) { + return null; + } + return $this->metadata[ $key ][0]; + } + + public function get_all_metadata() { + return $this->metadata; + } + + public function get_block_markup() { + return $this->block_markup; + } + + private function handle_tag() { + $html = $this->markup_processor; + $tag = strtoupper( $html->get_tag() ); + $tag_lowercase = strtolower( $tag ); + + $is_void_tag = ! $html->expects_closer() && ! $html->is_tag_closer(); + if ( $is_void_tag ) { + switch ( $tag ) { + case 'META': + $key = $html->get_attribute( 'name' ); + $value = $html->get_attribute( 'content' ); + if ( ! array_key_exists( $key, $this->metadata ) ) { + $this->metadata[ $key ] = array(); + } + $this->metadata[ $key ][] = $value; + break; + case 'IMG': + $template = new \WP_HTML_Tag_Processor( ''; + $this->in_ephemeral_paragraph = true; + } + } + + /** + * Closes the ephemeral paragraph if it is currently open. + */ + private function close_ephemeral_paragraph() { + if ( $this->in_ephemeral_paragraph ) { + $this->block_markup .= '
'; + $this->block_markup .= WP_Import_Utils::block_closer( 'paragraph' ); + $this->in_ephemeral_paragraph = false; + } + } + + public function get_last_error() { + return $this->last_error; + } +} diff --git a/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php b/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php deleted file mode 100644 index 4671eca2cb..0000000000 --- a/packages/playground/data-liberation/src/byte-readers/WP_Byte_Reader.php +++ /dev/null @@ -1,10 +0,0 @@ -file_path = $file_path; - $this->chunk_size = $chunk_size; - } - - public function tell(): int { - // Save the previous offset, not the current one. - // This way, after resuming, the next read will yield the same $output_bytes - // as we have now. - return $this->offset_in_file - $this->last_chunk_size; - } - - public function seek( $offset_in_file ): bool { - if ( ! is_int( $offset_in_file ) ) { - _doing_it_wrong( __METHOD__, 'Cannot set a file reader cursor to a non-integer offset.', '1.0.0' ); - return false; - } - if ( $this->file_pointer ) { - _doing_it_wrong( __METHOD__, 'Cannot set a file reader cursor on a file reader that is already initialized.', '1.0.0' ); - return false; - } - $this->offset_in_file = $offset_in_file; - $this->last_chunk_size = 0; - return true; - } - - public function is_finished(): bool { - return ! $this->output_bytes && $this->state === static::STATE_FINISHED; - } - - public function get_bytes(): string { - return $this->output_bytes; - } - - public function get_last_error(): ?string { - return $this->last_error; - } - - public function next_bytes(): bool { - $this->output_bytes = ''; - $this->last_chunk_size = 0; - if ( $this->last_error || $this->is_finished() ) { - return false; - } - if ( ! $this->file_pointer ) { - $this->file_pointer = fopen( $this->file_path, 'r' ); - if ( $this->offset_in_file ) { - fseek( $this->file_pointer, $this->offset_in_file ); - } - } - $bytes = fread( $this->file_pointer, $this->chunk_size ); - if ( ! $bytes && feof( $this->file_pointer ) ) { - fclose( $this->file_pointer ); - $this->state = static::STATE_FINISHED; - return false; - } - $this->last_chunk_size = strlen( $bytes ); - $this->offset_in_file += $this->last_chunk_size; - $this->output_bytes .= $bytes; - return true; - } -} diff --git a/packages/playground/data-liberation/src/byte-readers/WP_GZ_File_Reader.php b/packages/playground/data-liberation/src/byte-readers/WP_GZ_File_Reader.php deleted file mode 100644 index 1216e7f7cd..0000000000 --- a/packages/playground/data-liberation/src/byte-readers/WP_GZ_File_Reader.php +++ /dev/null @@ -1,26 +0,0 @@ -output_bytes = ''; - if ( $this->last_error || $this->is_finished() ) { - return false; - } - if ( ! $this->file_pointer ) { - $this->file_pointer = gzopen( $this->file_path, 'r' ); - if ( $this->offset_in_file ) { - gzseek( $this->file_pointer, $this->offset_in_file ); - } - } - $bytes = gzread( $this->file_pointer, $this->chunk_size ); - if ( ! $bytes && gzeof( $this->file_pointer ) ) { - gzclose( $this->file_pointer ); - $this->state->finish(); - return false; - } - $this->offset_in_file += strlen( $bytes ); - $this->output_bytes .= $bytes; - return true; - } -} diff --git a/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Ranged_Reader.php b/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Ranged_Reader.php deleted file mode 100644 index 34ac703b18..0000000000 --- a/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Ranged_Reader.php +++ /dev/null @@ -1,187 +0,0 @@ -seek(0); - * $file->request_bytes(100); - * while($file->next_chunk()) { - * var_dump($file->get_bytes()); - * } - * $file->seek(600); - * $file->request_bytes(40); - * while($file->next_chunk()) { - * var_dump($file->get_bytes()); - * } - * - * @TODO: Verify that the remote server supports range requests. - * @TODO: Support requesting multiple ranges in a single request. - * @TODO: Abort in-progress requests when seeking to a new offset. - */ -class WP_Remote_File_Ranged_Reader { - - /** - * @var WordPress\AsyncHttp\Client - */ - private $client; - private $url; - private $remote_file_length; - - private $current_request; - private $offset_in_remote_file = 0; - private $offset_in_current_chunk = 0; - private $current_chunk; - private $expected_chunk_size; - - public function __construct( $url, $options = array() ) { - $this->client = new WordPress\AsyncHttp\Client(); - $this->url = $url; - } - - public function request_bytes( $bytes ) { - if ( null === $this->remote_file_length ) { - $content_length = $this->resolve_content_length(); - if ( false === $content_length ) { - // The remote server won't tell us what the content length is - // @TODO: What should we do in this case? Content-length is critical for - // stream-decompressing remote zip files, but we may not need it - // for other use-cases. - return false; - } - $this->remote_file_length = $content_length; - } - - if ( $this->offset_in_remote_file < 0 || $this->offset_in_remote_file + $bytes > $this->remote_file_length ) { - // TODO: Think through error handling - return false; - } - - $this->seek( $this->offset_in_remote_file ); - - $this->current_request = new WordPress\AsyncHttp\Request( - $this->url, - array( - 'headers' => array( - 'Range' => 'bytes=' . $this->offset_in_remote_file . '-' . ( $this->offset_in_remote_file + $bytes - 1 ), - ), - ) - ); - $this->expected_chunk_size = $bytes; - $this->offset_in_current_chunk = 0; - if ( false === $this->client->enqueue( $this->current_request ) ) { - // TODO: Think through error handling - return false; - } - return true; - } - - public function seek( $offset ) { - $this->offset_in_remote_file = $offset; - // @TODO cancel any pending requests - $this->current_request = null; - } - - public function tell() { - return $this->offset_in_remote_file; - } - - public function resolve_content_length() { - if ( null !== $this->remote_file_length ) { - return $this->remote_file_length; - } - - $request = new WordPress\AsyncHttp\Request( - $this->url, - array( 'method' => 'HEAD' ) - ); - if ( false === $this->client->enqueue( $request ) ) { - // TODO: Think through error handling - return false; - } - while ( $this->client->await_next_event() ) { - switch ( $this->client->get_event() ) { - case WordPress\AsyncHttp\Client::EVENT_GOT_HEADERS: - $response = $request->response; - if ( false === $response ) { - return false; - } - $content_length = $response->get_header( 'Content-Length' ); - if ( false === $content_length ) { - return false; - } - return (int) $content_length; - } - } - return false; - } - - public function next_chunk() { - while ( $this->client->await_next_event() ) { - /** - * Only process events related to the most recent request. - * @TODO: Support redirects. - * @TODO: Cleanup resources for stale requests. - */ - if ( $this->current_request->id !== $this->client->get_request()->id ) { - continue; - } - - if ( $this->offset_in_current_chunk >= $this->expected_chunk_size ) { - // The remote server doesn't support range requests and sent us a chunk larger than expected. - // @TODO: Handle this case. Should we stream the entire file, or give up? - // Should we cache the download locally, or request the entire file again every - // time we need to seek()? - return false; - } - - switch ( $this->client->get_event() ) { - case WordPress\AsyncHttp\Client::EVENT_GOT_HEADERS: - $request = $this->client->get_request(); - if ( ! $request ) { - return false; - } - $response = $request->response; - if ( false === $response ) { - return false; - } - if ( - $response->status_code !== 206 || - false === $response->get_header( 'Range' ) - ) { - // The remote server doesn't support range requests - // @TODO: Handle this case. Should we stream the entire file, or give up? - // Should we cache the download locally, or request the entire file again every - // time we need to seek()? - return false; - } - break; - case WordPress\AsyncHttp\Client::EVENT_BODY_CHUNK_AVAILABLE: - $chunk = $this->client->get_response_body_chunk(); - if ( ! is_string( $chunk ) ) { - // TODO: Think through error handling - return false; - } - $this->current_chunk = $chunk; - $this->offset_in_remote_file += strlen( $chunk ); - $this->offset_in_current_chunk += strlen( $chunk ); - - return true; - case WordPress\AsyncHttp\Client::EVENT_FAILED: - // TODO: Think through error handling. Errors are expected when working with - // the network. Should we auto retry? Make it easy for the caller to retry? - // Something else? - return false; - case WordPress\AsyncHttp\Client::EVENT_FINISHED: - // TODO: Think through error handling - return false; - } - } - } - - public function get_bytes() { - return $this->current_chunk; - } -} diff --git a/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php b/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php deleted file mode 100644 index d55846f7b8..0000000000 --- a/packages/playground/data-liberation/src/byte-readers/WP_Remote_File_Reader.php +++ /dev/null @@ -1,109 +0,0 @@ -client = new WordPress\AsyncHttp\Client(); - $this->url = $url; - } - - public function tell(): int { - return $this->bytes_already_read + $this->skip_bytes; - } - - public function seek( $offset_in_file ): bool { - if ( $this->request ) { - _doing_it_wrong( __METHOD__, 'Cannot set a remote file reader cursor on a remote file reader that is already initialized.', '1.0.0' ); - return false; - } - $this->skip_bytes = $offset_in_file; - return true; - } - - public function next_bytes(): bool { - if ( null === $this->request ) { - $this->request = new WordPress\AsyncHttp\Request( - $this->url - ); - if ( false === $this->client->enqueue( $this->request ) ) { - // TODO: Think through error handling - return false; - } - } - - $this->after_chunk(); - - while ( $this->client->await_next_event() ) { - switch ( $this->client->get_event() ) { - case WordPress\AsyncHttp\Client::EVENT_BODY_CHUNK_AVAILABLE: - $chunk = $this->client->get_response_body_chunk(); - if ( ! is_string( $chunk ) ) { - // TODO: Think through error handling - return false; - } - $this->current_chunk = $chunk; - - /** - * Naive seek() implementation – redownload the file from the start - * and ignore bytes until we reach the desired offset. - * - * @TODO: Use the range requests instead when the server supports them. - */ - if ( $this->skip_bytes > 0 ) { - if ( $this->skip_bytes < strlen( $chunk ) ) { - $this->current_chunk = substr( $chunk, $this->skip_bytes ); - $this->bytes_already_read += $this->skip_bytes; - $this->skip_bytes = 0; - } else { - $this->skip_bytes -= strlen( $chunk ); - continue 2; - } - } - return true; - case WordPress\AsyncHttp\Client::EVENT_FAILED: - // TODO: Think through error handling. Errors are expected when working with - // the network. Should we auto retry? Make it easy for the caller to retry? - // Something else? - $this->last_error = $this->client->get_request()->error; - return false; - case WordPress\AsyncHttp\Client::EVENT_FINISHED: - $this->is_finished = true; - return false; - } - } - } - - private function after_chunk() { - if ( $this->current_chunk ) { - $this->bytes_already_read += strlen( $this->current_chunk ); - } - $this->current_chunk = null; - } - - public function get_last_error(): ?string { - return $this->last_error; - } - - public function get_bytes(): ?string { - return $this->current_chunk; - } - - public function is_finished(): bool { - return $this->is_finished; - } -} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Block_Markup_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_Block_Markup_Entity_Reader.php new file mode 100644 index 0000000000..7c707bd615 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_Block_Markup_Entity_Reader.php @@ -0,0 +1,99 @@ +block_markup = $block_markup; + $this->metadata = $metadata; + $this->post_id = $post_id; + } + + public function next_entity() { + if ( $this->finished ) { + return false; + } + + $this->current_entity = null; + + if ( null !== $this->enqueued_entities ) { + if ( count( $this->enqueued_entities ) === 0 ) { + $this->finished = true; + return false; + } else { + $this->current_entity = array_shift( $this->enqueued_entities ); + return true; + } + } + + $all_metadata = $this->metadata; + $post_fields = array(); + $other_metadata = array(); + foreach ( $all_metadata as $key => $values ) { + if ( in_array( $key, WP_Imported_Entity::POST_FIELDS, true ) ) { + $post_fields[ $key ] = $values[0]; + } else { + $other_metadata[ $key ] = $values[0]; + } + } + + $post_fields['post_id'] = $this->post_id; + $post_fields['post_content'] = $this->block_markup; + + // In Markdown, the frontmatter title can be a worse title candidate than + // the first H1 block. In block markup exports, it will be the opposite. + // + // @TODO: Enable the API consumer to customize the title resolution. + if ( ! isset( $post_fields['post_title'] ) ) { + $removed_title = WP_Import_Utils::remove_first_h1_block_from_block_markup( $post_fields['post_content'] ); + if ( false !== $removed_title ) { + $post_fields['post_title'] = $removed_title['h1_content']; + $post_fields['post_content'] = $removed_title['remaining_html']; + } + } + + // Yield the post entity. + $this->enqueued_entities[] = new WP_Imported_Entity( 'post', $post_fields ); + + // Yield all the metadata that don't belong to the post entity. + foreach ( $other_metadata as $key => $value ) { + $this->enqueued_entities[] = new WP_Imported_Entity( + 'post_meta', + array( + 'post_id' => $this->post_id, + 'key' => $key, + 'value' => $value, + ) + ); + } + + $this->current_entity = array_shift( $this->enqueued_entities ); + return true; + } + + public function get_entity() { + if ( $this->is_finished() ) { + return false; + } + return $this->current_entity; + } + + public function is_finished(): bool { + return $this->finished; + } + + public function get_last_error(): ?string { + return $this->last_error; + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php new file mode 100644 index 0000000000..6259e88ad8 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_Directory_Tree_Entity_Reader.php @@ -0,0 +1,362 @@ +file_visitor = new \WordPress\Filesystem\WP_Filesystem_Visitor( $filesystem, $options['root_dir'] ); + $this->filesystem = $filesystem; + $this->create_index_pages = $options['create_index_pages'] ?? false; + $this->next_post_id = $options['first_post_id']; + $this->allowed_extensions = $options['allowed_extensions']; + $this->index_file_patterns = $options['index_file_patterns']; + $this->markup_converter_factory = $options['markup_converter_factory']; + } + + public function next_entity() { + while ( true ) { + if ( null !== $this->pending_directory_index ) { + $dir = $this->file_visitor->get_event()->dir; + $depth = $this->file_visitor->get_current_depth(); + $parent_id = $this->parent_ids[ $depth - 1 ] ?? null; + + if ( null === $parent_id && $depth > 1 ) { + // There's no parent ID even though we're a few levels deep. + // This is a scenario where `next_file()` skipped a few levels + // of directories with no relevant content in them: + // + // - /docs/ + // - /foo/ + // - /bar/ + // - /baz.md + // + // In this case, we need to backtrack and create the missing + // parent pages for /bar/ and /foo/. + + // Find the topmost missing parent ID + $missing_parent_id_depth = 1; + while ( isset( $this->parent_ids[ $missing_parent_id_depth ] ) ) { + ++$missing_parent_id_depth; + } + + // Move up to the corresponding directory + $missing_parent_path = $dir; + for ( $i = $missing_parent_id_depth; $i < $depth; $i++ ) { + $missing_parent_path = dirname( $missing_parent_path ); + } + + $this->parent_ids[ $missing_parent_id_depth ] = $this->emit_post_entity( + array( + 'content' => '', + 'local_file_path' => $missing_parent_path, + 'parent_id' => $this->parent_ids[ $missing_parent_id_depth - 1 ] ?? null, + 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $missing_parent_path ) ), + ) + ); + } elseif ( false === $this->pending_directory_index ) { + // No directory index candidate – let's create a fake page + // just to have something in the page tree. + $this->parent_ids[ $depth ] = $this->emit_post_entity( + array( + 'content' => '', + 'local_file_path' => $dir, + 'parent_id' => $parent_id, + 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $dir ) ), + ) + ); + // We're no longer looking for a directory index. + $this->pending_directory_index = null; + } else { + $file_path = $this->pending_directory_index; + $this->parent_ids[ $depth ] = $this->emit_post_entity( + array( + 'content' => $this->filesystem->read_file( $file_path ), + 'local_file_path' => $file_path, + 'parent_id' => $parent_id, + 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $file_path ) ), + ) + ); + // We're no longer looking for a directory index. + $this->pending_directory_index = null; + } + return true; + } + + while ( count( $this->pending_files ) ) { + $parent_id = $this->parent_ids[ $this->file_visitor->get_current_depth() ] ?? null; + $file_path = array_shift( $this->pending_files ); + $this->emit_post_entity( + array( + 'content' => $this->filesystem->read_file( $file_path ), + 'local_file_path' => $file_path, + 'parent_id' => $parent_id, + 'title_fallback' => WP_Import_Utils::slug_to_title( basename( $file_path ) ), + ) + ); + return true; + } + + if ( false === $this->next_file() ) { + break; + } + } + $this->is_finished = true; + return false; + } + + public function get_entity(): ?\WP_Imported_Entity { + return $this->entity; + } + + protected function emit_post_entity( $options ) { + $factory = $this->markup_converter_factory; + $converter = $factory( $options['content'] ); + $converter->convert(); + $block_markup = $converter->get_block_markup(); + + $post_title = null; + if ( ! $post_title ) { + $removed_title = WP_Import_Utils::remove_first_h1_block_from_block_markup( $block_markup ); + if ( false !== $removed_title ) { + $post_title = $removed_title['h1_content']; + $block_markup = $removed_title['remaining_html']; + } + } + if ( ! $post_title ) { + // In Markdown, the frontmatter title can be a worse title candidate than + // the first H1 block. In block markup exports, it will be the opposite. + // + // @TODO: Enable the API consumer to customize the title resolution. + $post_title = $converter->get_meta_value( 'post_title' ); + } + if ( ! $post_title ) { + $post_title = $options['title_fallback']; + } + + $entity_data = array( + 'post_id' => $this->next_post_id, + 'post_type' => 'page', + 'guid' => $options['local_file_path'], + 'post_title' => $post_title, + 'post_content' => $block_markup, + 'post_excerpt' => $converter->get_meta_value( 'post_excerpt' ) ?? '', + 'post_status' => 'publish', + ); + + /** + * Technically `local_file_path` isn't a part of the WordPress post object, + * but we need it to resolve relative URLs in the imported content. + * + * This path is relative to the root directory traversed by this class. + */ + if ( ! empty( $options['local_file_path'] ) ) { + $local_file_path = $options['local_file_path']; + $root_dir = $this->file_visitor->get_root_dir(); + if ( str_starts_with( $local_file_path, $root_dir ) ) { + $local_file_path = substr( $local_file_path, strlen( $root_dir ) ); + } + $local_file_path = ltrim( $local_file_path, '/' ); + $entity_data['local_file_path'] = $local_file_path; + } + + if ( $converter->get_meta_value( 'slug' ) ) { + $slug = $converter->get_meta_value( 'slug' ); + $last_segment = substr( $slug, strrpos( $slug, '/' ) + 1 ); + $entity_data['post_name'] = $last_segment; + } + + if ( $converter->get_meta_value( 'post_order' ) ) { + $entity_data['post_order'] = $converter->get_meta_value( 'post_order' ); + } + + if ( $options['parent_id'] ) { + $entity_data['post_parent'] = $options['parent_id']; + } + + $this->entity = new \WP_Imported_Entity( 'post', $entity_data ); + ++$this->next_post_id; + ++$this->entities_read_so_far; + return $entity_data['post_id']; + } + + private function next_file() { + $this->pending_files = array(); + $this->entity = null; + while ( $this->file_visitor->next() ) { + $event = $this->file_visitor->get_event(); + + if ( $event->is_exiting() ) { + // Clean up stale IDs to save some memory when processing + // large directory trees. + unset( $this->parent_ids[ $event->dir ] ); + continue; + } + + if ( $event->is_entering() ) { + $abs_paths = array(); + foreach ( $event->files as $filename ) { + $abs_paths[] = $event->dir . '/' . $filename; + } + $this->pending_files = $this->choose_relevant_files( $abs_paths ); + if ( ! count( $this->pending_files ) ) { + // Only consider directories with relevant files in them. + // Otherwise we'll create fake pages for media directories + // and other directories that don't contain any content. + // + // One corner case is when there's a few levels of directories + // with a single relevant file at the bottom: + // + // - /docs/ + // - /foo/ + // - /bar/ + // - /baz.md + // + // In this case, `next_entity()` will backtrack at baz.md and + // create the missing parent pages. + continue; + } + $directory_index_idx = $this->choose_directory_index( $this->pending_files ); + if ( -1 === $directory_index_idx ) { + $this->pending_directory_index = false; + } else { + $this->pending_directory_index = $this->pending_files[ $directory_index_idx ]; + unset( $this->pending_files[ $directory_index_idx ] ); + } + return true; + } + + return false; + } + return false; + } + + protected function choose_directory_index( $files ) { + foreach ( $files as $idx => $file ) { + if ( $this->looks_like_directory_index( $file ) ) { + return $idx; + } + } + if ( ! $this->create_index_pages && count( $files ) > 0 ) { + return 0; + } + return -1; + } + + protected function looks_like_directory_index( $path ) { + $filename = basename( $path ); + foreach ( $this->index_file_patterns as $pattern ) { + if ( preg_match( $pattern, $filename ) ) { + return true; + } + } + return false; + } + + protected function choose_relevant_files( $paths ) { + return array_filter( $paths, array( $this, 'is_valid_file' ) ); + } + + protected function is_valid_file( $path ) { + $extension = pathinfo( $path, PATHINFO_EXTENSION ); + return in_array( $extension, $this->allowed_extensions, true ); + } + + /** + * @TODO: Either implement this method, or introduce a concept of + * reentrant and non-reentrant entity readers. + */ + public function get_reentrancy_cursor() { + return ''; + } + + public function current(): mixed { + if ( null === $this->entity && ! $this->is_finished ) { + $this->next(); + } + return $this->get_entity(); + } + + public function next(): void { + $this->next_entity(); + } + + public function key(): int { + return $this->entities_read_so_far - 1; + } + + private $is_started = false; + + public function valid(): bool { + if ( ! $this->is_started ) { + $this->next(); + $this->is_started = true; + } + return ! $this->is_finished; + } + + public function rewind(): void { + // @TODO: Either implement this method, or formalize the fact that + // entity readers are not rewindable. + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php new file mode 100644 index 0000000000..db7b8b9df3 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php @@ -0,0 +1,123 @@ +zip = $zip; + $this->current_post_id = $first_post_id; + } + + public function next_entity() { + if ( $this->last_error ) { + return false; + } + + if ( $this->finished ) { + return false; + } + + if ( null === $this->remaining_html_files ) { + $path = false; + foreach ( array( '/OEBPS', '/EPUB' ) as $path_candidate ) { + if ( $this->zip->is_dir( $path_candidate ) ) { + $path = $path_candidate; + break; + } + } + if ( false === $path ) { + _doing_it_wrong( __METHOD__, 'The EPUB file did not contain any HTML files.', '1.0.0' ); + $this->finished = true; + return false; + } + + $files = $this->zip->ls( $path ); + if ( false === $files ) { + _doing_it_wrong( __METHOD__, 'The EPUB file did not contain any HTML files.', '1.0.0' ); + $this->finished = true; + return false; + } + $this->remaining_html_files = array(); + foreach ( $files as $file ) { + if ( str_ends_with( $file, '.xhtml' ) || str_ends_with( $file, '.html' ) ) { + $this->remaining_html_files[] = $path . '/' . $file; + } + } + } + + while ( true ) { + if ( null !== $this->current_html_reader ) { + if ( + ! $this->current_html_reader->is_finished() && + $this->current_html_reader->next_entity() + ) { + return true; + } + if ( $this->current_html_reader->get_last_error() ) { + _doing_it_wrong( + __METHOD__, + 'The EPUB file did not contain any HTML files.', + '1.0.0' + ); + $this->finished = true; + return false; + } + } + + if ( count( $this->remaining_html_files ) === 0 ) { + $this->finished = true; + return false; + } + + $html_file = array_shift( $this->remaining_html_files ); + $html = $this->zip->read_file( $html_file ); + $this->current_html_reader = new WP_HTML_Entity_Reader( + WP_XML_Processor::create_from_string( $html ), + $this->current_post_id + ); + if ( $this->current_html_reader->get_last_error() ) { + $this->last_error = $this->current_html_reader->get_last_error(); + return false; + } + ++$this->current_post_id; + } + + return false; + } + + public function get_entity() { + return $this->current_html_reader->get_entity(); + } + + public function is_finished(): bool { + return $this->finished; + } + + public function get_last_error(): ?string { + return $this->last_error; + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_Entity_Reader.php new file mode 100644 index 0000000000..ba5246a9ca --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_Entity_Reader.php @@ -0,0 +1,100 @@ +ensure_current_entity(); + return $this->get_entity(); + } + + private $last_next_result = null; + public function next(): void { + // @TODO: Don't keep track of this. Just make sure the next_entity() + // call will make the is_finished() true. + $this->last_next_result = $this->next_entity(); + } + + public function key(): string { + return $this->get_reentrancy_cursor(); + } + + public function valid(): bool { + $this->ensure_current_entity(); + return false !== $this->last_next_result && ! $this->is_finished() && ! $this->get_last_error(); + } + + public function rewind(): void { + // Haven't started yet. + if ( null === $this->last_next_result ) { + return; + } + _doing_it_wrong( + __METHOD__, + 'WP_WXR_Entity_Reader does not support rewinding.', + null + ); + } + + private function ensure_current_entity() { + if ( null === $this->get_entity() && ! $this->is_finished() && ! $this->get_last_error() ) { + $this->next(); + } + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php new file mode 100644 index 0000000000..92f2521118 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_Entity_Reader.php @@ -0,0 +1,124 @@ +filesystem = $filesystem; + $this->post_type = $options['post_type'] ?? 'page'; + $this->post_tree = WP_Filesystem_To_Post_Tree::create( + $this->filesystem, + array ( + 'first_post_id' => 2, + 'filter_pattern' => '#\.(?:md|html|xhtml)$#', + 'index_file_pattern' => '#^index\.[a-z]+$#', + ) + ); + } + + public function get_last_error(): ?string { + // @TODO: Implement this. + return null; + } + + public function get_entity() { + return $this->current_entity; + } + + public function is_finished(): bool { + return $this->finished; + } + + public function next_entity(): bool { + while(true) { + while(count($this->entities) > 0) { + $this->current_entity = array_shift( $this->entities ); + return true; + } + + if( ! $this->post_tree->next_node() ) { + $this->finished = true; + return false; + } + + $source_content_converter = null; + $post_tree_node = $this->post_tree->get_current_node(); + if($post_tree_node['type'] === 'file') { + $content = $this->filesystem->read_file($post_tree_node['local_file_path']); + $extension = pathinfo($post_tree_node['local_file_path'], PATHINFO_EXTENSION); + switch($extension) { + case 'md': + $converter = new WP_Markdown_To_Blocks( $content ); + $source_content_converter = 'md'; + break; + case 'xhtml': + $converter = new WP_HTML_To_Blocks( WP_XML_Processor::create_from_string( $content ) ); + $source_content_converter = 'xhtml'; + break; + case 'html': + default: + $converter = new WP_HTML_To_Blocks( WP_HTML_Processor::create_fragment( $content ) ); + $source_content_converter = 'html'; + break; + } + + if( false === $converter->convert() ) { + throw new Exception('Failed to convert Markdown to blocks'); + } + $markup = $converter->get_block_markup(); + $metadata = $converter->get_all_metadata(); + } else { + $markup = ''; + $metadata = array(); + // @TODO: Accept an option to set what should we default to. + $source_content_converter = 'html'; + } + + $reader = new WP_Block_Markup_Entity_Reader( + $markup, + $metadata, + $post_tree_node['post_id'] + ); + while($reader->next_entity()) { + $entity = $reader->get_entity(); + $data = $entity->get_data(); + if( $entity->get_type() === 'post' ) { + $data['id'] = $post_tree_node['post_id']; + $data['guid'] = $post_tree_node['local_file_path']; + $data['post_parent'] = $post_tree_node['parent_id']; + $data['post_title'] = $data['post_title'] ?? null; + $data['post_status'] = 'publish'; + $data['post_type'] = $this->post_type; + if ( ! $data['post_title'] ) { + $data['post_title'] = WP_Import_Utils::slug_to_title( basename( $post_tree_node['local_file_path'] ) ); + } + $entity = new WP_Imported_Entity( $entity->get_type(), $data ); + } + $this->entities[] = $entity; + } + + // Also emit: + $additional_meta = array( + 'local_file_path' => $post_tree_node['local_file_path'], + 'source_type' => $post_tree_node['type'], + 'source_content_converter' => $source_content_converter, + ); + foreach($additional_meta as $key => $value) { + $this->entities[] = new WP_Imported_Entity( + 'post_meta', + array( + 'post_id' => $post_tree_node['post_id'], + 'key' => $key, + 'value' => $value, + ) + ); + } + } + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php new file mode 100644 index 0000000000..dde52c8671 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_Filesystem_To_Post_Tree.php @@ -0,0 +1,238 @@ +file_visitor = new WordPress\Filesystem\WP_Filesystem_Visitor( $filesystem ); + $this->create_index_pages = $options['create_index_pages'] ?? true; + $this->next_post_id = $options['first_post_id']; + $this->filter_pattern = $options['filter_pattern']; + $this->index_file_pattern = $options['index_file_pattern']; + } + + public function get_current_node() { + return $this->current_node; + } + + public function next_node() { + $this->current_node = null; + if ( $this->is_finished ) { + return false; + } + while ( true ) { + if ( null !== $this->pending_directory_index ) { + $dir = $this->file_visitor->get_event()->dir; + $depth = $this->file_visitor->get_current_depth(); + $parent_id = $this->parent_ids[ $depth - 1 ] ?? null; + + if ( null === $parent_id && $depth > 1 ) { + // There's no parent ID even though we're a few levels deep. + // This is a scenario where `next_file()` skipped a few levels + // of directories with no relevant content in them: + // + // - /docs/ + // - /foo/ + // - /bar/ + // - /baz.md + // + // In this case, we need to backtrack and create the missing + // parent pages for /bar/ and /foo/. + + // Find the topmost missing parent ID + $missing_parent_id_depth = 1; + while ( isset( $this->parent_ids[ $missing_parent_id_depth ] ) ) { + ++$missing_parent_id_depth; + } + + // Move up to the corresponding directory + $missing_parent_path = $dir; + for ( $i = $missing_parent_id_depth; $i < $depth; $i++ ) { + $missing_parent_path = dirname( $missing_parent_path ); + } + + $this->parent_ids[ $missing_parent_id_depth ] = $this->emit_object( + array( + 'type' => 'directory', + 'local_file_path' => $missing_parent_path, + 'parent_id' => $this->parent_ids[ $missing_parent_id_depth - 1 ] ?? null, + ) + ); + } elseif ( false === $this->pending_directory_index ) { + // No directory index candidate – let's create a fake page + // just to have something in the page tree. + $this->parent_ids[ $depth ] = $this->emit_object( + array( + 'type' => 'file_placeholder', + 'local_file_path' => $dir, + 'parent_id' => $parent_id, + ) + ); + // We're no longer looking for a directory index. + $this->pending_directory_index = null; + } else { + $file_path = $this->pending_directory_index; + $this->parent_ids[ $depth ] = $this->emit_object( + array( + 'type' => 'file', + 'local_file_path' => $file_path, + 'parent_id' => $parent_id, + ) + ); + // We're no longer looking for a directory index. + $this->pending_directory_index = null; + } + return true; + } + + while ( count( $this->pending_files ) ) { + $parent_id = $this->parent_ids[ $this->file_visitor->get_current_depth() ] ?? null; + $file_path = array_shift( $this->pending_files ); + $this->emit_object( + array( + 'type' => 'file', + 'local_file_path' => $file_path, + 'parent_id' => $parent_id, + ) + ); + return true; + } + + if ( false === $this->next_file() ) { + break; + } + } + $this->is_finished = true; + return false; + } + + protected function emit_object( $options ) { + $post_id = $this->next_post_id; + ++$this->next_post_id; + $this->current_node = array_merge( + $options, + array( + 'post_id' => $post_id, + ) + ); + ++$this->entities_read_so_far; + return $post_id; + } + + private function next_file() { + $this->pending_files = array(); + while ( $this->file_visitor->next() ) { + $event = $this->file_visitor->get_event(); + + if ( $event->is_exiting() ) { + // Clean up stale IDs to save some memory when processing + // large directory trees. + unset( $this->parent_ids[ $event->dir ] ); + continue; + } + + if ( $event->is_entering() ) { + $abs_paths = array(); + foreach ( $event->files as $filename ) { + $abs_paths[] = wp_join_paths( $event->dir, $filename ); + } + $this->pending_files = $this->choose_relevant_files( $abs_paths ); + if ( ! count( $this->pending_files ) ) { + // Only consider directories with relevant files in them. + // Otherwise we'll create fake pages for media directories + // and other directories that don't contain any content. + // + // One corner case is when there's a few levels of directories + // with a single relevant file at the bottom: + // + // - /docs/ + // - /foo/ + // - /bar/ + // - /baz.md + // + // In this case, `next_entity()` will backtrack at baz.md and + // create the missing parent pages. + continue; + } + $directory_index_idx = $this->choose_directory_index( $this->pending_files ); + if ( -1 === $directory_index_idx ) { + $this->pending_directory_index = false; + } else { + $this->pending_directory_index = $this->pending_files[ $directory_index_idx ]; + unset( $this->pending_files[ $directory_index_idx ] ); + } + return true; + } + + return false; + } + return false; + } + + protected function choose_directory_index( $files ) { + foreach ( $files as $idx => $file ) { + if ( $this->looks_like_directory_index( $file ) ) { + return $idx; + } + } + if ( ! $this->create_index_pages && count( $files ) > 0 ) { + return 0; + } + return -1; + } + + protected function looks_like_directory_index( $path ) { + return preg_match( $this->index_file_pattern, basename( $path ) ); + } + + protected function choose_relevant_files( $paths ) { + $filtered_paths = array(); + foreach ( $paths as $path ) { + if ( preg_match( $this->filter_pattern, $path ) ) { + $filtered_paths[] = $path; + } + } + return $filtered_paths; + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php new file mode 100644 index 0000000000..aef6041666 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php @@ -0,0 +1,95 @@ +html_processor = $html_processor; + $this->post_id = $post_id; + } + + public function next_entity() { + // If we're finished, we're finished. + if ( $this->finished ) { + return false; + } + + // If we've already read some entities, skip to the next one. + if ( null !== $this->entities ) { + array_shift( $this->entities ); + if ( count( $this->entities ) === 0 ) { + $this->finished = true; + return false; + } + return true; + } + + // We did not read any entities yet. Let's convert the HTML document into entities. + $converter = new WP_HTML_To_Blocks( $this->html_processor ); + if ( false === $converter->convert() ) { + $this->last_error = $converter->get_last_error(); + return false; + } + + $all_metadata = $converter->get_all_metadata(); + $post_fields = array(); + $other_metadata = array(); + foreach ( $all_metadata as $key => $values ) { + if ( in_array( $key, WP_Imported_Entity::POST_FIELDS, true ) ) { + $post_fields[ $key ] = $values[0]; + } else { + $other_metadata[ $key ] = $values[0]; + } + } + + // Yield the post entity. + $this->entities[] = new WP_Imported_Entity( + 'post', + array_merge( + $post_fields, + array( + 'post_id' => $this->post_id, + 'content' => $converter->get_block_markup(), + ) + ) + ); + + // Yield all the metadata that don't belong to the post entity. + foreach ( $other_metadata as $key => $value ) { + $this->entities[] = new WP_Imported_Entity( + 'post_meta', + array( + 'post_id' => $this->post_id, + 'key' => $key, + 'value' => $value, + ) + ); + } + return true; + } + + public function get_entity() { + if ( $this->is_finished() ) { + return false; + } + return $this->entities[0]; + } + + public function is_finished(): bool { + return $this->finished; + } + + public function get_last_error(): ?string { + return $this->last_error; + } +} diff --git a/packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_WXR_Entity_Reader.php similarity index 93% rename from packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php rename to packages/playground/data-liberation/src/entity-readers/WP_WXR_Entity_Reader.php index 25c21ff608..398983c370 100644 --- a/packages/playground/data-liberation/src/wxr/WP_WXR_Reader.php +++ b/packages/playground/data-liberation/src/entity-readers/WP_WXR_Entity_Reader.php @@ -1,6 +1,9 @@ channel > item` and comments are * stored in `rss > channel > item > `wp:comment`. @@ -33,7 +36,7 @@ * * Example: * - * $reader = WP_WXR_Reader::create_for_streaming(); + * $reader = WP_WXR_Entity_Reader::create_for_streaming(); * * // Add data as it becomes available * $reader->append_bytes( fread( $file_handle, 8192 ) ); @@ -64,24 +67,24 @@ * } * * The next_entity() -> fread -> break usage pattern may seem a bit tedious. This is expected. Even - * if the WXR parsing part of the WP_WXR_Reader offers a high-level API, working with byte streams + * if the WXR parsing part of the WP_WXR_Entity_Reader offers a high-level API, working with byte streams * requires reasoning on a much lower level. The StreamChain class shipped in this repository will * make the API consumption easier with its transformation–oriented API for chaining data processors. * - * Similarly to `WP_XML_Processor`, the `WP_WXR_Reader` enters a paused state when it doesn't + * Similarly to `WP_XML_Processor`, the `WP_WXR_Entity_Reader` enters a paused state when it doesn't * have enough XML bytes to parse the entire entity. * * ## Caveats * * ### Extensibility * - * `WP_WXR_Reader` ignores any XML elements it doesn't recognize. The WXR format is extensible + * `WP_WXR_Entity_Reader` ignores any XML elements it doesn't recognize. The WXR format is extensible * so in the future the reader may start supporting registration of custom handlers for unknown * tags in the future. * * ### Nested entities intertwined with data * - * `WP_WXR_Reader` flushes the current entity whenever another entity starts. The upside is + * `WP_WXR_Entity_Reader` flushes the current entity whenever another entity starts. The upside is * simplicity and a tiny memory footprint. The downside is that it's possible to craft a WXR * document where some information would be lost. For example: * @@ -101,7 +104,7 @@ * * ``` * - * `WP_WXR_Reader` would accumulate post data until the `wp:post_meta` tag. Then it would emit a + * `WP_WXR_Entity_Reader` would accumulate post data until the `wp:post_meta` tag. Then it would emit a * `post` entity and accumulate the meta information until the `` closer. Then it * would advance to `Last week, WordPress 6.8 was released. This release includes a new default theme, a new block editor experience, and a new block library. It also includes a new block editor experience, and a new block library.
+ + + +Feature | Status |
---|---|
Block Editor | Released |
New Theme | Released |
block patterns
addedblock patterns
addedfunction example() {
+ return "WordPress 6.8";
+}
+
+
+
+The most significant update includes improved block editing capabilities.
+ + +HTML; + $expected = [ + [ + 'type' => 'heading', + 'level' => 1, + 'content' => [ + [ + 'type' => 'text', + 'content' => 'WordPress 6.8 was released', + ], + ], + ], + [ + 'type' => 'paragraph', + 'content' => [ + [ + 'type' => 'text', + 'content' => 'Last week, WordPress 6.8 was released. This release includes a new default theme, a new block editor experience, and a new block library. It also includes a new block editor experience, and a new block library.', + ], + ], + ], + [ + 'type' => 'html_block', + 'content' => ' +Feature | Status |
---|---|
Block Editor | Released |
New Theme | Released |
Last week, WordPress 6.8 was released. This release includes a new default theme, a new block editor experience, and a new block library. It also includes a new block editor experience, and a new block library.
+ +HTML; + + $converter = new WP_Blocks_To_Markdown($blocks, $metadata); + $this->assertTrue($converter->convert()); + $markdown = $converter->get_result(); + + $expected = <<A simple paragraph
', + 'expected' => "A simple paragraph\n\n" + ], + 'A simple list' => [ + 'blocks' => << +A simple paragraph with a link
', + 'expected' => "A simple paragraph with a [link](https://wordpress.org)\n\n" + ], + 'Formatted text' => [ + 'blocks' => 'Bold and Italic
', + 'expected' => "**Bold** and *Italic*\n\n" + ], + 'A blockquote' => [ + 'blocks' => '', + 'expected' => "> A simple blockquote\n> \n" + ], + 'A table' => [ + 'blocks' => << +A simple blockquote
Header 1 | Header 2 |
---|---|
Cell 1 | Cell 2 |
Cell 3 | Cell 4 |
Here are the key insights...
+ +HTML; + + $metadata = [ + 'title' => 'Brian Chesky – Founder Mode & The Art of Hiring' + ]; + + $converter = new WP_Blocks_To_Markdown($blocks, $metadata); + $converter->convert(); + $markdown = $converter->get_result(); + + $expected = <<Last week, WordPress 6.8 was released.
+HTML; + $reader = new WP_HTML_Entity_Reader( WP_HTML_Processor::create_fragment( $html ), 1 ); + $entities = []; + while ( $reader->next_entity() ) { + $data = $reader->get_entity()->get_data(); + if(isset($data['content'])) { + $data['content'] = $this->normalize_markup( $data['content'] ); + } + $entities[] = [ + 'type' => $reader->get_entity()->get_type(), + 'data' => $data, + ]; + } + $expected_entities = [ + [ + 'type' => 'post', + 'data' => [ + 'post_title' => 'WordPress 6.8 was released', + 'post_date' => '2024-12-16', + 'post_id' => 1, + 'content' => $this->normalize_markup(<< +Last week, WordPress 6.8 was released.
+ +HTML) + ] + ], + [ + 'type' => 'post_meta', + 'data' => [ + 'post_id' => 1, + 'meta_key' => 'custom_post_meta', + 'meta_value' => 'custom_post_meta_value', + ] + ], + [ + 'type' => 'post_meta', + 'data' => [ + 'post_id' => 1, + 'meta_key' => 'color_palette', + 'meta_value' => 'use_that_pretty_one', + ] + ], + ]; + $this->assertEquals( $expected_entities, $entities ); + } + + private function normalize_markup( $markup ) { + $processor = WP_HTML_Processor::create_fragment( $markup ); + $serialized = $processor->serialize(); + return $serialized; + } + +} diff --git a/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php new file mode 100644 index 0000000000..cf07907154 --- /dev/null +++ b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php @@ -0,0 +1,170 @@ + + + + + + + +Last week, WordPress 6.8 was released. This release includes a new default theme, a new block editor experience, and a new block library. It also includes a new block editor experience, and a new block library.
+HTML; + $converter = new WP_HTML_To_Blocks( WP_HTML_Processor::create_fragment( $html ) ); + $converter->convert( $html ); + $metadata = $converter->get_all_metadata(); + $expected_metadata = [ + 'post_title' => ['WordPress 6.8 was released'], + 'post_date' => ['2024-12-16'], + 'post_modified' => ['2024-12-16'], + 'post_author' => ['1'], + 'post_author_name' => ['The WordPress Team'], + 'post_author_url' => ['https://wordpress.org'], + 'post_author_avatar' => ['https://wordpress.org/wp-content/uploads/2024/04/wordpress-logo-2024.png'], + ]; + $this->assertEquals( $expected_metadata, $metadata ); + } + + /** + * @dataProvider provider_test_conversion + */ + public function test_html_to_blocks_conversion( $html, $expected ) { + $converter = new WP_HTML_To_Blocks( WP_HTML_Processor::create_fragment( $html ) ); + $converter->convert( $html ); + $blocks = $converter->get_block_markup(); + + $this->assertEquals( $this->normalize_markup($expected), $this->normalize_markup($blocks) ); + } + + private function normalize_markup( $markup ) { + $processor = WP_HTML_Processor::create_fragment( $markup ); + $serialized = $processor->serialize(); + $serialized = trim( + str_replace( + [ + // Even more naively, remove all the newlines. + "\n" + ], + '', + $serialized + ) + ); + return $serialized; + } + + public function provider_test_conversion() { + return [ + 'A simple paragraph' => [ + 'html' => 'A simple paragraph
', + 'expected' => "A simple paragraph
" + ], + 'A simple list' => [ + 'html' => '
A simple paragraph with a link
', + 'expected' => "A simple paragraph with a link
" + ], + 'Formatted text' => [ + 'html' => 'Bold and Italic
', + 'expected' => "Bold and Italic
" + ], + 'A blockquote' => [ + 'html' => 'A simple blockquote', + 'expected' => "
A simple blockquote" + ], + 'A table' => [ + 'html' => <<
Header 1 | +Header 2 | +
---|---|
Cell 1 | +Cell 2 | +
Cell 3 | +Cell 4 | +
Footer 1 | +Footer 2 | +
Header 1 | Header 2 |
---|---|
Cell 1 | Cell 2 |
Cell 3 | Cell 4 |
Footer 1 | Footer 2 |
And some content
+ + +XML; + $converter = new WP_HTML_To_Blocks( WP_XML_Processor::create_from_string( $input ) ); + $converter->convert( $input ); + $blocks = $converter->get_block_markup(); + $expected = <<And some content
+HTML; + $this->assertEquals( + $this->normalize_markup( $expected ), + $this->normalize_markup( $blocks ) + ); + } + +} diff --git a/packages/playground/data-liberation/tests/WPMarkdownToBlocksTests.php b/packages/playground/data-liberation/tests/WPMarkdownToBlocksTests.php new file mode 100644 index 0000000000..852c4f9d2c --- /dev/null +++ b/packages/playground/data-liberation/tests/WPMarkdownToBlocksTests.php @@ -0,0 +1,149 @@ +assertTrue($converter->convert()); + $metadata = $converter->get_all_metadata(); + $expected_metadata = [ + 'post_title' => ['WordPress 6.8 was released'], + 'post_date' => ['2024-12-16'], + 'post_modified' => ['2024-12-16'], + 'post_author' => ['1'], + 'post_author_name' => ['The WordPress Team'], + 'post_author_url' => ['https://wordpress.org'], + 'post_author_avatar' => ['https://wordpress.org/wp-content/uploads/2024/04/wordpress-logo-2024.png'], + ]; + $this->assertEquals($expected_metadata, $metadata); + } + + /** + * @dataProvider provider_test_conversion + */ + public function test_markdown_to_blocks_conversion($markdown, $expected) { + $converter = new WP_Markdown_To_Blocks($markdown); + $converter->convert(); + $blocks = $converter->get_block_markup(); + + $this->assertEquals($this->normalize_markup($expected), $this->normalize_markup($blocks)); + } + + private function normalize_markup($markup) { + $processor = WP_HTML_Processor::create_fragment($markup); + $serialized = $processor->serialize(); + $serialized = trim( + str_replace( + [ + // Even more naively, remove all the newlines. + "\n" + ], + '', + $serialized + ) + ); + return $serialized; + } + + public function provider_test_conversion() { + return [ + 'A simple paragraph' => [ + 'markdown' => 'A simple paragraph', + 'expected' => "A simple paragraph
" + ], + 'A simple list' => [ + 'markdown' => "- Item 1\n- Item 2", + 'expected' => <<A simple paragraph with a link
" + ], + 'Formatted text' => [ + 'markdown' => '**Bold** and *Italic*', + 'expected' => "Bold and Italic
" + ], + 'A blockquote' => [ + 'markdown' => '> A simple blockquote', + 'expected' => "" + ], + 'A table' => [ + 'markdown' => <<A simple blockquote
Header 1 | Header 2 |
---|---|
Cell 1 | Cell 2 |
Cell 3 | Cell 4 |
This is page 1.
diff --git a/packages/playground/data-liberation/tests/fixtures/directory-tree-entity-reader/root.html b/packages/playground/data-liberation/tests/fixtures/directory-tree-entity-reader/root.html new file mode 100644 index 0000000000..5666bc9ad6 --- /dev/null +++ b/packages/playground/data-liberation/tests/fixtures/directory-tree-entity-reader/root.html @@ -0,0 +1,2 @@ +This is the root page.
diff --git a/packages/playground/data-liberation/tests/fixtures/epub-entity-reader/childrens-literature.epub b/packages/playground/data-liberation/tests/fixtures/epub-entity-reader/childrens-literature.epub new file mode 100644 index 0000000000..ba84a64399 Binary files /dev/null and b/packages/playground/data-liberation/tests/fixtures/epub-entity-reader/childrens-literature.epub differ diff --git a/packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.input.html b/packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.input.html new file mode 100644 index 0000000000..0f2d9d5443 --- /dev/null +++ b/packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.input.html @@ -0,0 +1,189 @@ +Living Standard — Last Updated 12 December 2024
+ + + + +innerText
and outerText
propertiesbody
elementarticle
elementsection
elementnav
elementaside
elementh1
, h2
, h3
, h4
, h5
, and h6
+ elementshgroup
elementheader
elementfooter
elementaddress
elementp
elementhr
elementpre
elementblockquote
elementol
elementul
elementmenu
elementli
elementdl
elementdt
elementdd
elementfigure
elementfigcaption
elementmain
elementsearch
elementdiv
elementa
elementem
elementstrong
elementsmall
elements
elementcite
elementq
elementdfn
elementabbr
elementruby
elementrt
elementrp
elementdata
elementtime
elementcode
elementvar
elementsamp
elementkbd
elementsub
and sup
elementsi
elementb
elementu
elementmark
elementbdi
elementbdo
elementspan
elementbr
elementwbr
elementa
and area
elementsa
and area
elementsalternate
"author
"bookmark
"canonical
"dns-prefetch
"expect
"external
"help
"icon
"license
"manifest
"modulepreload
"nofollow
"noopener
"noreferrer
"opener
"pingback
"preconnect
"prefetch
"preload
"privacy-policy
"search
"stylesheet
"tag
"terms-of-service
"picture
elementsource
elementimg
elementsource
,
+ img
, and link
elementsiframe
elementembed
elementobject
elementvideo
elementaudio
elementtrack
elementTrackEvent
interfacemap
elementarea
elementtable
elementcaption
elementcolgroup
elementcol
elementtbody
elementthead
elementtfoot
elementtr
elementtd
elementth
elementtd
and th
elementsform
elementlabel
elementinput
elementtype
attributetype=hidden
)type=text
) state and Search state (type=search
)type=tel
)type=url
)type=email
)type=password
)type=date
)type=month
)type=week
)type=time
)type=datetime-local
)type=number
)type=range
)type=color
)type=checkbox
)type=radio
)type=file
)type=submit
)type=image
)type=reset
)type=button
)input
element attributesmaxlength
and minlength
attributessize
attributereadonly
attributerequired
attributemultiple
attributepattern
attributemin
and max
attributesstep
attributelist
attributeplaceholder
attributeinput
element APIsbutton
elementselect
elementdatalist
elementoptgroup
elementoption
elementtextarea
elementoutput
elementprogress
elementmeter
elementfieldset
elementlegend
elementname
attributedirname
attributemaxlength
attributeminlength
attributedisabled
attributeSubmitEvent
interfaceFormDataEvent
interfacedetails
elementsummary
elementa
element to define a commandbutton
element to define a commandinput
element to define a commandoption
element to define a commandaccesskey
attribute
+ on a legend
element to define a commandaccesskey
+ attribute to define a command on other elementsdialog
elementscript
elementnoscript
elementtemplate
elementslot
elementcanvas
elementPath2D
objectsImageBitmap
rendering contextOffscreenCanvas
interfacecanvas
elementsCustomElementRegistry
interfacehidden
attributecontenteditable
content attributedesignMode
getter and setterinputmode
attributeenterkeyhint
+ attributepopover
attributeWindow
,
+ WindowProxy
, and Location
objectsWindow
objectWindowProxy
exotic objectLocation
interfaceHistory
interfaceNavigation
interfaceNavigationHistoryEntry
interfaceNavigationActivation
interfacenavigate
eventNotRestoredReasons
interfacemultipart/x-mixed-replace
+ documentsX-Frame-Options
` headerRefresh
` headerWindowOrWorkerGlobalScope
mixinbutton
elementdetails
and summary
elementsinput
element as a text entry widgetinput
element as domain-specific widgetsinput
element as a range controlinput
element as a color
+ wellinput
element as a checkbox and radio button widgetsinput
element as a file upload controlinput
element as a buttonmarquee
elementmeter
elementprogress
elementselect
elementtextarea
elementThis specification defines a big part of the web platform, in lots of detail. Its place in the + web platform specification stack relative to other specifications can be best summed up as + follows:
+ + + + + + +This section is non-normative.
+ +In short: Yes.
+ +In more length: the term "HTML5" is widely used as a buzzword to refer to modern web + technologies, many of which (though by no means all) are developed at the WHATWG. This document is + one such; others are available from the WHATWG Standards + overview.
+ + +This section is non-normative.
+ +HTML is the World Wide Web's core markup language. Originally, HTML was primarily designed as a + language for semantically describing scientific documents. Its general design, however, has + enabled it to be adapted, over the subsequent years, to describe a number of other types of + documents and even applications.
+ + +This section is non-normative.
+ +This specification is intended for authors of documents and scripts that use the features + defined in this specification, implementers of tools that operate on pages that + use the features defined in this specification, and individuals wishing to establish the + correctness of documents or implementations with respect to the requirements of this + specification.
+ +This document is probably not suited to readers who do not already have at least a passing + familiarity with web technologies, as in places it sacrifices clarity for precision, and brevity + for completeness. More approachable tutorials and authoring guides can provide a gentler + introduction to the topic.
+ +In particular, familiarity with the basics of DOM is necessary for a complete understanding of + some of the more technical parts of this specification. An understanding of Web IDL, HTTP, XML, + Unicode, character encodings, JavaScript, and CSS will also be helpful in places but is not + essential.
+ + +This section is non-normative.
+ +This specification is limited to providing a semantic-level markup language and associated + semantic-level scripting APIs for authoring accessible pages on the web ranging from static + documents to dynamic applications.
+ +The scope of this specification does not include providing mechanisms for media-specific + customization of presentation (although default rendering rules for web browsers are included at + the end of this specification, and several mechanisms for hooking into CSS are provided as part of + the language).
+ +The scope of this specification is not to describe an entire operating system. In particular, + hardware configuration software, image manipulation tools, and applications that users would be + expected to use with high-end workstations on a daily basis are out of scope. In terms of + applications, this specification is targeted specifically at applications that would be expected + to be used by users on an occasional basis, or regularly but from disparate locations, with low + CPU requirements. Examples of such applications include online purchasing systems, searching + systems, games (especially multiplayer online games), public telephone books or address books, + communications software (email clients, instant messaging clients, discussion software), document + editing software, etc.
+ + +This section is non-normative.
+ +For its first five years (1990-1995), HTML went through a number of revisions and experienced a + \ No newline at end of file diff --git a/packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.output.html b/packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.output.html new file mode 100644 index 0000000000..c96636c167 --- /dev/null +++ b/packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.output.html @@ -0,0 +1,2885 @@ + +
+Living Standard — Last Updated 12 December 2024
+ + One-Page Version html.spec.whatwg.org
Multipage Version /multipage
Version for Web Devs /dev
PDF Version /print.pdf
Translations 日本語 • 简体中文
FAQ on GitHub
Chat on Matrix
Contribute on GitHub whatwg/html repository
Commits on GitHub
Snapshot as of this commit
Twitter Updates @htmlstandard
Open Issues filed on GitHub
Open an Issue whatwg.org/newbug
Tests web-platform-tests html/
Issues for Tests ongoing work
meta
elements innerText
and outerText
properties head
element title
element base
element link
element
+ media
attribute type
attribute link
element Link
` headers link
element meta
element
+
+ style
element body
element article
element section
element nav
element aside
element h1
, h2
, h3
, h4
, h5
, and h6
elements hgroup
element header
element footer
element address
element p
element hr
element pre
element blockquote
element ol
element ul
element menu
element li
element dl
element dt
element dd
element figure
element figcaption
element main
element search
element div
element a
element em
element strong
element small
element s
element cite
element q
element dfn
element abbr
element ruby
element rt
element rp
element data
element time
element code
element var
element samp
element kbd
element sub
and sup
elements i
element b
element u
element mark
element bdi
element bdo
element span
element br
element wbr
element a
and area
elements a
and area
elements alternate
" author
" bookmark
" canonical
" dns-prefetch
" expect
" external
" help
" icon
" license
" manifest
" modulepreload
" nofollow
" noopener
" noreferrer
" opener
" pingback
" preconnect
" prefetch
" preload
" privacy-policy
" search
" stylesheet
" tag
" terms-of-service
" picture
element source
element img
element source
, img
, and link
elements
+
+ iframe
element embed
element object
element video
element audio
element track
element TrackEvent
interface map
element area
element table
element
+
+ caption
element colgroup
element col
element tbody
element thead
element tfoot
element tr
element td
element th
element td
and th
elements form
element label
element input
element
+ type
attribute
+ type=hidden
) type=text
) state and Search state ( type=search
) type=tel
) type=url
) type=email
) type=password
) type=date
) type=month
) type=week
) type=time
) type=datetime-local
) type=number
) type=range
) type=color
) type=checkbox
) type=radio
) type=file
) type=submit
) type=image
) type=reset
) type=button
) input
element attributes
+ maxlength
and minlength
attributes size
attribute readonly
attribute required
attribute multiple
attribute pattern
attribute min
and max
attributes step
attribute list
attribute placeholder
attribute input
element APIs button
element select
element datalist
element optgroup
element option
element textarea
element output
element progress
element meter
element fieldset
element legend
element name
attribute dirname
attribute maxlength
attribute minlength
attribute disabled
attribute SubmitEvent
interface FormDataEvent
interface details
element summary
element a
element to define a command button
element to define a command input
element to define a command option
element to define a command accesskey
attribute
+ on a legend
element to define a command accesskey
attribute to define a command on other elements dialog
element script
element
+
+ noscript
element template
element
+
+ slot
element canvas
element
+ Path2D
objects ImageBitmap
rendering context
+
+ OffscreenCanvas
interface
+
+ canvas
elements CustomElementRegistry
interface hidden
attribute contenteditable
content attribute designMode
getter and setter inputmode
attribute enterkeyhint
attribute popover
attribute
+
+ Window
, WindowProxy
, and Location
objects
+ Window
object
+
+ WindowProxy
exotic object
+ Location
interface
+ History
interface Navigation
interface NavigationHistoryEntry
interface NavigationActivation
interface navigate
event
+
+ NavigationCurrentEntryChangeEvent
interface PopStateEvent
interface HashChangeEvent
interface PageSwapEvent
interface PageRevealEvent
interface PageTransitionEvent
interface BeforeUnloadEvent
interface NotRestoredReasons
interface multipart/x-mixed-replace
documents X-Frame-Options
` header Refresh
` header WindowOrWorkerGlobalScope
mixin Navigator
object
+
+ MessageEvent
interface EventSource
interface Last-Event-ID
` header hr
element fieldset
and legend
elements button
element details
and summary
elements input
element as a text entry widget input
element as domain-specific widgets input
element as a range control input
element as a color
+ well input
element as a checkbox and radio button widgets input
element as a file upload control input
element as a button marquee
element meter
element progress
element select
element textarea
element This specification defines a big part of the web platform, in lots of detail. Its place in the + web platform specification stack relative to other specifications can be best summed up as + follows:
+ +This section is non-normative.
+ +In short: Yes.
+ +In more length: the term "HTML5" is widely used as a buzzword to refer to modern web + technologies, many of which (though by no means all) are developed at the WHATWG. This document is + one such; others are available from the WHATWG Standards + overview .
+ +This section is non-normative.
+ +HTML is the World Wide Web's core markup language. Originally, HTML was primarily designed as a + language for semantically describing scientific documents. Its general design, however, has + enabled it to be adapted, over the subsequent years, to describe a number of other types of + documents and even applications.
+ +This section is non-normative.
+ +This specification is intended for authors of documents and scripts that use the features + defined in this specification, implementers of tools that operate on pages that + use the features defined in this specification, and individuals wishing to establish the + correctness of documents or implementations with respect to the requirements of this + specification.
+ +This document is probably not suited to readers who do not already have at least a passing + familiarity with web technologies, as in places it sacrifices clarity for precision, and brevity + for completeness. More approachable tutorials and authoring guides can provide a gentler + introduction to the topic.
+ +In particular, familiarity with the basics of DOM is necessary for a complete understanding of + some of the more technical parts of this specification. An understanding of Web IDL, HTTP, XML, + Unicode, character encodings, JavaScript, and CSS will also be helpful in places but is not + essential.
+ +This section is non-normative.
+ +This specification is limited to providing a semantic-level markup language and associated + semantic-level scripting APIs for authoring accessible pages on the web ranging from static + documents to dynamic applications.
+ +The scope of this specification does not include providing mechanisms for media-specific + customization of presentation (although default rendering rules for web browsers are included at + the end of this specification, and several mechanisms for hooking into CSS are provided as part of + the language).
+ +The scope of this specification is not to describe an entire operating system. In particular, + hardware configuration software, image manipulation tools, and applications that users would be + expected to use with high-end workstations on a daily basis are out of scope. In terms of + applications, this specification is targeted specifically at applications that would be expected + to be used by users on an occasional basis, or regularly but from disparate locations, with low + CPU requirements. Examples of such applications include online purchasing systems, searching + systems, games (especially multiplayer online games), public telephone books or address books, + communications software (email clients, instant messaging clients, discussion software), document + editing software, etc.
+ +This section is non-normative.
+ +For its first five years (1990-1995), HTML went through a number of revisions and experienced a