From ee934c39e724bad9e32459b44c365c589faad5b0 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 6 Dec 2022 21:03:12 -0700 Subject: [PATCH 01/19] WIP: Introduce class for sourcing block attributes from HTML --- .../html/class-wp-html-attribute-sourcer.php | 390 ++++++++++++++++++ .../html/class-wp-html-naive-processor.php | 14 + .../html/class-wp-html-tag-processor.php | 2 +- .../html/wp-html-attribute-sourcer-test.php | 177 ++++++++ 4 files changed, 582 insertions(+), 1 deletion(-) create mode 100644 lib/experimental/html/class-wp-html-attribute-sourcer.php create mode 100644 lib/experimental/html/class-wp-html-naive-processor.php create mode 100644 phpunit/html/wp-html-attribute-sourcer-test.php diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php new file mode 100644 index 00000000000000..268d1be8f477cf --- /dev/null +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -0,0 +1,390 @@ + a + * figure a + * figure img + * figure video,figure img + * h1,h2,h3,h4,h5,h6 + * img + * li + * ol,ul + * p + * pre + * tbody tr + * td,th + * tfoot tr + * thead tr + * video + */ + +require_once __DIR__ . '/class-wp-html-naive-processor.php'; + +/* + * @see PHP docs for array_is_list: user-contributed polyfill + */ +if ( ! function_exists( 'array_is_list' ) ) { + function array_is_list( $array ) { + $i = 0; + + foreach ( $array as $k => $v ) { + if ( $k !== $i++ ) { + return false; + } + } + + return true; + } +} + + +class WP_HTML_Attribute_Sourcer { + /** + * Attributes definitions, typically from `block.json`. + * + * @see WP_Block_Type_Registry + * + * @var mixed|null + */ + public $attribute_definitions; + + /** + * Source HTML containing embedded attributes. 
+ * + * @var mixed|null + */ + private $html; + + public function __construct( $attribute_definitions = null, $html = null ) { + $this->attribute_definitions = $attribute_definitions; + $this->html = $html; + } + + public function source_attributes() { + $attributes = []; + $unparsed = []; + + foreach ( $this->attribute_definitions as $name => $definition ) { + $sourcer = self::parse_definition( $definition ); + switch ( $sourcer ) { + case null: + case 'not-sourced': + case 'unsupported': + $unparsed[] = $name; + continue 2; + + case 'inner-html': + $attributes[ $name ] = $this->html; + continue 2; + } + + $tags = self::select( $sourcer['selector'], $this->html ); + if ( null === $tags ) { + $unparsed[] = $name; + continue; + } + + switch ( $sourcer['type'] ) { + case 'html': + $attributes[ $name ] = self::get_inner_html( $tags ); + continue 2; + + case 'attribute': + $attributes[ $name ] = $tags->get_attribute( $sourcer['attribute'] ); + continue 2; + } + } + + return array( + 'attributes' => $attributes, + 'unparsed' => $unparsed + ); + } + + public static function select( $selector, $html ) { + $tags = new WP_HTML_Naive_Processor( $html ); + + if ( array_is_list( $selector ) ) { + while ( $tags->next_tag() ) { + foreach ( $selector as $s ) { + if ( 'element' === $s['type'] && $tags->get_tag() === strtoupper( $s['identifier'] ) ) { + return $tags; + } + + // @TODO: $tags->has_class() would be _really_ handy here. 
+ if ( 'class' === $s['type'] && preg_match( "~\b{$s['identifier']}\b~", $tags->get_attribute( 'class' ) ) ) { + return $tags; + } + + if ( 'hash' === $s['type'] && $s['identifier'] === $tags->get_attribute( 'id' ) ) { + return $tags; + } + } + } + + return null; + } + + switch ( $selector['type'] ) { + case 'element': + $tags->next_tag( [ 'tag_name' => $selector['identifier'] ] ); + return $tags; + + case 'class': + $tags->next_tag( [ 'class_name' => $selector['identifier'] ] ); + return $tags; + + case 'hash': + while ( $tags->next_tag() ) { + if ( $selector['identifier'] === $tags->get_attribute( 'id' ) ) { + return $tags; + } + } + } + + return null; + } + + public static function get_inner_html( WP_HTML_Naive_Processor $tags ) { + $tags->set_bookmark( 'start' ); + $tag_name = $tags->get_tag(); + $depth = 1; + + if ( self::is_void_element( $tag_name ) ) { + return ''; + } + + while ( $tags->next_tag( [ 'tag_closers' => 'visit' ] ) ) { + if ( $tags->get_tag() !== $tag_name ) { + continue; + } + + if ( $tags->is_tag_closer() && $depth === 1 ) { + $tags->set_bookmark( 'end' ); + break; + } + + $depth += $tags->is_tag_closer() ? 
-1 : 1; + } + + return $tags->inner_content( 'start', 'end' ); + } + + /** + * @see https://html.spec.whatwg.org/#elements-2 + */ + public static function is_void_element( $tag_name ) { + switch ( $tag_name ) { + case 'area': + case 'base': + case 'br': + case 'col': + case 'embed': + case 'hr': + case 'img': + case 'input': + case 'link': + case 'meta': + case 'source': + case 'track': + case 'wbr': + return true; + + default: + return false; + } + } + + public static function parse_definition( $definition ) { + if ( empty( $definition['source'] ) ) { + return 'not-sourced'; + } + + $source = $definition['source']; + if ( 'html' !== $source && 'attribute' !== $source ) { + return 'unsupported'; + } + + if ( 'attribute' === $source && empty( $definition['selector'] ) ) { + return null; + } + + if ( 'html' === $source && empty( $definition['selector'] ) ) { + return 'inner-html'; + } + + $selector = self::parse_selector( $definition['selector'] ); + if ( null === $selector ) { + return 'unsupported'; + } + + if ( 'html' === $source ) { + return array( 'type' => 'html', 'selector' => $selector ); + } + + $attribute = self::parse_attribute( $definition['attribute'] ); + if ( null === $attribute ) { + return null; + } + + return array( 'type' => 'attribute', 'selector' => $selector, 'attribute' => $attribute ); + } + + public static function parse_selector( $s, $at = 0 ) { + $selectors = explode( ',', $s ); + if ( count( $selectors ) > 1 ) { + $parsed = []; + + foreach ( $selectors as $selector ) { + $parsed[] = self::parse_selector( $selector, strspn( $selector, " \r\t\f\n" ) ); + } + + return $parsed; + } + + $type = 'element'; + + switch ( $s[ $at ] ) { + case '+': + // no support for adjacent sibling combinator + return null; + + case '>': + // no support for child combinator + return null; + + case '~': + // no support for general sibling combinator + return null; + + case ' ': + // no support for descendant combinator + return null; + + case '[': + // no support 
for attribute + return null; + + case ',': + // we shouldn't get here because we're exploding at the start + // of this function; this is a bug if we're here. + return null; + + case ':': + // no support for pseudo-selectors + return null; + + case '#': + $type = 'hash'; + $at++; + break; + + case '.': + $type = 'class'; + $at++; + break; + } + + // @TODO: Hashes don't have to start with `nmstart` so this might reject valid hash names. + $identifier = self::parse_css_identifier( $s, $at ); + if ( null === $identifier ) { + return null; + } + + if ( $at + strlen( $identifier ) < strlen( $s ) ) { + // no support for anything more complicated than a simple selector + return null; + } + + return array( 'type' => $type, 'identifier' => $identifier ); + } + + /** + * Parses CSS identifier; currently limited to ASCII identifiers. + * + * Example: + * ``` + * 'div' === parse_css_identifier( 'div > img' ); + * ``` + * + * Grammar: + * ``` + * ident -?{nmstart}{nmchar}* + * nmstart [_a-z]|{nonascii}|{escape} + * nmchar [_a-z0-9-]|{nonascii}|{escape} + * nonascii [\240-\377] + * escape {unicode}|\\[^\r\n\f0-9a-f] + * unicode \\{h}{1,6}(\r\n|[ \t\r\n\f])? 
+ * h [0-9a-f] + * ``` + * + * @TODO: Add support for the proper syntax + * + * @see https://www.w3.org/TR/CSS21/grammar.html + * + * @param $s + * @return false|string|null + */ + public static function parse_css_identifier( $s, $at = 0 ) { + $budget = 1000; + $started_at = $at; + + $starting_chars = strspn( $s, '_-abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at ); + if ( 0 === $starting_chars ) { + return null; + } + $at += $starting_chars; + + while ( $at < strlen( $s ) && $budget-- > 0 ) { + $chars = strspn( $s, '_-abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789', $at ); + + if ( 0 === $chars ) { + break; + } + + $at += $chars; + } + + if ( $budget < 0 ) { + return null; + } + + return substr( $s, $started_at, $at - $started_at ); + } + + public static function parse_attribute( $s ) { + $unallowed_characters_match = preg_match( + '~[' . + // Syntax-like characters. + '"\'>&bookmarks[ $start_bookmark ], $this->bookmarks[ $end_bookmark ] ) ) { + return null; + } + + $start = $this->bookmarks[ $start_bookmark ]; + $end = $this->bookmarks[ $end_bookmark ]; + + return substr( $this->get_updated_html(), $start->end + 1, $end->start - $start->end - 2 ); + } +} diff --git a/lib/experimental/html/class-wp-html-tag-processor.php b/lib/experimental/html/class-wp-html-tag-processor.php index 0dfa57f30f2aab..6f5bb1f766a6ef 100644 --- a/lib/experimental/html/class-wp-html-tag-processor.php +++ b/lib/experimental/html/class-wp-html-tag-processor.php @@ -388,7 +388,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var WP_HTML_Span[] */ - private $bookmarks = array(); + protected $bookmarks = array(); const ADD_CLASS = true; const REMOVE_CLASS = false; diff --git a/phpunit/html/wp-html-attribute-sourcer-test.php b/phpunit/html/wp-html-attribute-sourcer-test.php new file mode 100644 index 00000000000000..95cf675aa43c13 --- /dev/null +++ b/phpunit/html/wp-html-attribute-sourcer-test.php @@ -0,0 +1,177 @@ +assertSame( $expected, ( new 
WP_HTML_Attribute_Sourcer( $attributes, $html ) )->source_attributes() ); + } + + public function data_sourced_attributes() { + return array( + array( + array( 'attributes' => array( 'src' => 'image.png' ), 'unparsed' => array() ), + '
', + array( + 'src' => array( + 'type' => 'string', + 'source' => 'attribute', + 'selector' => 'img', + 'attribute' => 'src' + ), + ), + ), + + array( + array( + 'attributes' => array( 'content' => 'Just some quirky content' ), + 'unparsed' => array(), + ), + '

Just some quirky content

', + array( + 'content' => array( + 'type' => 'string', + 'source' => 'html', + 'selector' => 'p' + ) + ) + ), + + array( + array( + 'attributes' => array( 'content' => '
one item
another item
' ), + 'unparsed' => array(), + ), + '
one item
another item
', + array( + 'content' => array( + 'type' => 'string', + 'source' => 'html', + 'selector' => '.wp-block-group' + ) + ) + ), + + array( + array( + 'attributes' => array( 'content' => 'An Important Section' ), + 'unparsed' => array(), + ), + '

An Important Section

', + array( + 'content' => array( + 'type' => 'string', + 'source' => 'html', + 'selector' => 'h1,h2,h3,h4,h5,h6' + ) + ) + ), + ); + } + + /** + * @dataProvider data_parsed_block_attribute_definitions + */ + public function test_parse_definition( $expected, $input ) { + $this->assertSame( $expected, WP_HTML_Attribute_Sourcer::parse_definition( $input ) ); + } + + public function data_parsed_block_attribute_definitions() { + return array( + array( + 'not-sourced', + array( 'type' => 'string' ), + ), + array( + 'unsupported', + array( 'type' => 'string', 'source' => 'attribute', 'selector' => 'div + img', 'attribute' => 'src' ), + ), + array( + 'inner-html', + array( 'type' => 'string', 'source' => 'html' ), + ), + array( + array( 'type' => 'html', 'selector' => array( 'type' => 'element', 'identifier' => 'code' ) ), + array( 'type' => 'string', 'source' => 'html', 'selector' => 'code' ), + ), + array( + array( 'type' => 'attribute', 'selector' => array( 'type' => 'element', 'identifier' => 'img' ), 'attribute' => 'src' ), + array( 'type' => 'string', 'source' => 'attribute', 'selector' => 'img', 'attribute' => 'src' ), + ), + ); + } + + /** + * @dataProvider data_parsed_css_selectors + */ + public function test_parses_css_selector( $expected, $input ) { + $this->assertSame($expected, WP_HTML_Attribute_Sourcer::parse_selector( $input ) ); + } + + public function data_parsed_css_selectors() { + return array( + array( array( 'type' => 'element', 'identifier' => 'img' ), 'img' ), + array( array( 'type' => 'class', 'identifier' => 'block-group' ), '.block-group' ), + array( array( 'type' => 'hash', 'identifier' => 'input-form' ), '#input-form' ), + ); + } + + /** + * @dataProvider data_multi_parsed_css_selectors + */ + public function test_parses_multi_css_selectors( $expected, $input ) { + $this->assertSame( $expected, WP_HTML_Attribute_Sourcer::parse_selector( $input ) ); + } + + public function data_multi_parsed_css_selectors() { + return array( + array( + array( + 
array( 'type' => 'element', 'identifier' => 'img' ), + array( 'type' => 'class', 'identifier' => 'full-width' ), + ), + 'img, .full-width' + ), + array( + array( + array( 'type' => 'element', 'identifier' => 'h1' ), + array( 'type' => 'element', 'identifier' => 'h2' ), + array( 'type' => 'element', 'identifier' => 'h3' ), + array( 'type' => 'element', 'identifier' => 'h4' ), + array( 'type' => 'element', 'identifier' => 'h5' ), + array( 'type' => 'element', 'identifier' => 'h6' ), + ), + 'h1,h2,h3,h4,h5,h6' + ) + ); + } + + /** + * @dataProvider data_identifier_from_selector + * @return void + */ + public function test_parses_css_identifier( $expected, $input ) { + $this->assertEquals( $expected, WP_HTML_Attribute_Sourcer::parse_css_identifier( $input ) ); + } + + public function data_identifier_from_selector() { + return array( + array( 'div', 'div > img' ), + array( '-ident', '-ident.class#id' ) + ); + } +} From 5396d3be760e507f4d2cb96b22fa1b319397cc45 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 9 Dec 2022 16:40:56 -0700 Subject: [PATCH 02/19] Rearrange some functions and add support for presence of attribute in selector --- .../html/class-wp-html-attribute-sourcer.php | 223 +++++++----------- .../html/class-wp-html-naive-processor.php | 14 -- .../html/class-wp-html-processor.php | 86 +++++++ .../html/wp-html-attribute-sourcer-test.php | 26 +- 4 files changed, 194 insertions(+), 155 deletions(-) delete mode 100644 lib/experimental/html/class-wp-html-naive-processor.php create mode 100644 lib/experimental/html/class-wp-html-processor.php diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php index 268d1be8f477cf..a2d9a0b61ebe5a 100644 --- a/lib/experimental/html/class-wp-html-attribute-sourcer.php +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -33,7 +33,7 @@ * video */ -require_once __DIR__ . '/class-wp-html-naive-processor.php'; +require_once __DIR__ . 
'/class-wp-html-processor.php'; /* * @see PHP docs for array_is_list: user-contributed polyfill @@ -101,7 +101,7 @@ public function source_attributes() { switch ( $sourcer['type'] ) { case 'html': - $attributes[ $name ] = self::get_inner_html( $tags ); + $attributes[ $name ] = $tags->get_content_inside_balanced_tags(); continue 2; case 'attribute': @@ -116,98 +116,31 @@ public function source_attributes() { ); } - public static function select( $selector, $html ) { - $tags = new WP_HTML_Naive_Processor( $html ); + public static function select( $selectors, $html ) { + $tags = new WP_HTML_Processor( $html ); - if ( array_is_list( $selector ) ) { - while ( $tags->next_tag() ) { - foreach ( $selector as $s ) { - if ( 'element' === $s['type'] && $tags->get_tag() === strtoupper( $s['identifier'] ) ) { - return $tags; - } - - // @TODO: $tags->has_class() would be _really_ handy here. - if ( 'class' === $s['type'] && preg_match( "~\b{$s['identifier']}\b~", $tags->get_attribute( 'class' ) ) ) { - return $tags; - } - - if ( 'hash' === $s['type'] && $s['identifier'] === $tags->get_attribute( 'id' ) ) { - return $tags; - } + while ( $tags->next_tag() ) { + foreach ( $selectors as $s ) { + if ( ! 
empty( $s['has_attribute'] ) && null === $tags->get_attribute( $s['has_attribute'] ) ) { + continue; } - } - - return null; - } - - switch ( $selector['type'] ) { - case 'element': - $tags->next_tag( [ 'tag_name' => $selector['identifier'] ] ); - return $tags; - - case 'class': - $tags->next_tag( [ 'class_name' => $selector['identifier'] ] ); - return $tags; - case 'hash': - while ( $tags->next_tag() ) { - if ( $selector['identifier'] === $tags->get_attribute( 'id' ) ) { - return $tags; - } + if ( 'element' === $s['type'] && $tags->get_tag() === strtoupper( $s['identifier'] ) ) { + return $tags; } - } - - return null; - } - public static function get_inner_html( WP_HTML_Naive_Processor $tags ) { - $tags->set_bookmark( 'start' ); - $tag_name = $tags->get_tag(); - $depth = 1; - - if ( self::is_void_element( $tag_name ) ) { - return ''; - } - - while ( $tags->next_tag( [ 'tag_closers' => 'visit' ] ) ) { - if ( $tags->get_tag() !== $tag_name ) { - continue; - } + // @TODO: $tags->has_class() would be _really_ handy here. + if ( 'class' === $s['type'] && preg_match( "~\b{$s['identifier']}\b~", $tags->get_attribute( 'class' ) ) ) { + return $tags; + } - if ( $tags->is_tag_closer() && $depth === 1 ) { - $tags->set_bookmark( 'end' ); - break; + if ( 'hash' === $s['type'] && $s['identifier'] === $tags->get_attribute( 'id' ) ) { + return $tags; + } } - - $depth += $tags->is_tag_closer() ? 
-1 : 1; } - return $tags->inner_content( 'start', 'end' ); - } - - /** - * @see https://html.spec.whatwg.org/#elements-2 - */ - public static function is_void_element( $tag_name ) { - switch ( $tag_name ) { - case 'area': - case 'base': - case 'br': - case 'col': - case 'embed': - case 'hr': - case 'img': - case 'input': - case 'link': - case 'meta': - case 'source': - case 'track': - case 'wbr': - return true; - - default: - return false; - } + return null; } public static function parse_definition( $definition ) { @@ -246,72 +179,90 @@ public static function parse_definition( $definition ) { } public static function parse_selector( $s, $at = 0 ) { - $selectors = explode( ',', $s ); - if ( count( $selectors ) > 1 ) { - $parsed = []; - - foreach ( $selectors as $selector ) { - $parsed[] = self::parse_selector( $selector, strspn( $selector, " \r\t\f\n" ) ); - } + $budget = 1000; + $selectors = []; - return $parsed; - } + while ( $at < strlen( $s ) && $budget-- > 0 ) { + $type = 'element'; + $attribute = null; + + switch ( $s[ $at ] ) { + case '+': + // no support for adjacent sibling combinator + return null; + + case '>': + // no support for child combinator + return null; + + case '~': + // no support for general sibling combinator + return null; + + case ' ': + // no support for descendant combinator + return null; + + case '[': + /* + * Only current support is for checking of presence of attributes + * with a very-limited subset of allowable names, not whether the + * attribute conforms to a given value or is allowed in HTML. 
+ */ + $at++; + $inside_length = strspn( $s, "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-", $at ); + if ( ']' !== $s[ $at + $inside_length ] ) { + return null; + } - $type = 'element'; + $attribute = substr( $s, $at, $inside_length ); + $at += $inside_length + 1; - switch ( $s[ $at ] ) { - case '+': - // no support for adjacent sibling combinator - return null; + $selector = array_pop( $selectors ); + $selector['has_attribute'] = $attribute; + $selectors[] = $selector; - case '>': - // no support for child combinator - return null; + continue 2; - case '~': - // no support for general sibling combinator - return null; + case ',': + $at++; + $at += strspn( $s, " \t\f\r\n", $at ); + continue 2; - case ' ': - // no support for descendant combinator - return null; + case ':': + // no support for pseudo-selectors + return null; - case '[': - // no support for attribute - return null; + case '#': + $type = 'hash'; + $at++; + break; - case ',': - // we shouldn't get here because we're exploding at the start - // of this function; this is a bug if we're here. - return null; + case '.': + $type = 'class'; + $at++; + break; + } - case ':': - // no support for pseudo-selectors + // @TODO: Hashes don't have to start with `nmstart` so this might reject valid hash names. + $identifier = self::parse_css_identifier( $s, $at ); + if ( null === $identifier ) { return null; + } - case '#': - $type = 'hash'; - $at++; - break; - - case '.': - $type = 'class'; - $at++; - break; - } + $selector = array( 'type' => $type, 'identifier' => $identifier ); + if ( null !== $attribute ) { + $selector['has_attribute'] = $selector; + } - // @TODO: Hashes don't have to start with `nmstart` so this might reject valid hash names. 
- $identifier = self::parse_css_identifier( $s, $at ); - if ( null === $identifier ) { - return null; - } + $selectors[] = $selector; - if ( $at + strlen( $identifier ) < strlen( $s ) ) { - // no support for anything more complicated than a simple selector - return null; + $at += strlen( $identifier ); } - return array( 'type' => $type, 'identifier' => $identifier ); + return $at === strlen( $s ) + ? $selectors + : null; } /** diff --git a/lib/experimental/html/class-wp-html-naive-processor.php b/lib/experimental/html/class-wp-html-naive-processor.php deleted file mode 100644 index b5a3965c388176..00000000000000 --- a/lib/experimental/html/class-wp-html-naive-processor.php +++ /dev/null @@ -1,14 +0,0 @@ -bookmarks[ $start_bookmark ], $this->bookmarks[ $end_bookmark ] ) ) { - return null; - } - - $start = $this->bookmarks[ $start_bookmark ]; - $end = $this->bookmarks[ $end_bookmark ]; - - return substr( $this->get_updated_html(), $start->end + 1, $end->start - $start->end - 2 ); - } -} diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php new file mode 100644 index 00000000000000..d6e7835a285af8 --- /dev/null +++ b/lib/experimental/html/class-wp-html-processor.php @@ -0,0 +1,86 @@ +bookmarks ) ) { + $rand_id = rand( 1, PHP_INT_MAX ); + $start_name = "start_{$rand_id}"; + } + + if ( null === $end_name || array_key_exists( $end_name, $this->bookmarks ) ) { + $rand_id = rand( 1, PHP_INT_MAX ); + $end_name = "start_{$rand_id}"; + } + + $this->set_bookmark( $start_name ); + $tag_name = $this->get_tag(); + $depth = 1; + + if ( self::is_html_void_element( $tag_name ) ) { + return ''; + } + + while ( $this->next_tag( [ 'tag_closers' => 'visit' ] ) ) { + if ( $this->get_tag() !== $tag_name ) { + continue; + } + + if ( $this->is_tag_closer() && $depth === 1 ) { + $this->set_bookmark( $end_name ); + break; + } + + $depth += $this->is_tag_closer() ? 
-1 : 1; + } + + $content = $this->content_inside_bookmarks( $start_name, $end_name ); + + $this->release_bookmark( $start_name ); + $this->release_bookmark( $end_name ); + + return $content; + } + + private function content_inside_bookmarks( $start_bookmark, $end_bookmark ) { + if ( ! isset( $this->bookmarks[ $start_bookmark ], $this->bookmarks[ $end_bookmark ] ) ) { + return null; + } + + $start = $this->bookmarks[ $start_bookmark ]; + $end = $this->bookmarks[ $end_bookmark ]; + + return substr( $this->get_updated_html(), $start->end + 1, $end->start - $start->end - 2 ); + } + + /* + * HTML-related Utility Functions + */ + + /** + * @see https://html.spec.whatwg.org/#elements-2 + */ + public static function is_html_void_element( $tag_name ) { + switch ( $tag_name ) { + case 'AREA': + case 'BASE': + case 'BR': + case 'COL': + case 'EMBED': + case 'HR': + case 'IMG': + case 'INPUT': + case 'LINK': + case 'META': + case 'SOURCE': + case 'TRACK': + case 'WBR': + return true; + + default: + return false; + } + } +} diff --git a/phpunit/html/wp-html-attribute-sourcer-test.php b/phpunit/html/wp-html-attribute-sourcer-test.php index 95cf675aa43c13..496aa1833badbc 100644 --- a/phpunit/html/wp-html-attribute-sourcer-test.php +++ b/phpunit/html/wp-html-attribute-sourcer-test.php @@ -80,6 +80,22 @@ public function data_sourced_attributes() { ) ) ), + + array( + array( + 'attributes' => array( 'url' => 'poster.pdf' ), + 'unparsed' => array(), + ), + 'Some PlaceDownload the posterNo Link', + array( + 'url' => array( + 'type' => 'string', + 'source' => 'attribute', + 'attribute' => 'href', + 'selector' => 'a[download]', + ), + ), + ), ); } @@ -105,11 +121,11 @@ public function data_parsed_block_attribute_definitions() { array( 'type' => 'string', 'source' => 'html' ), ), array( - array( 'type' => 'html', 'selector' => array( 'type' => 'element', 'identifier' => 'code' ) ), + array( 'type' => 'html', 'selector' => array( array( 'type' => 'element', 'identifier' => 'code' ) ) ), 
array( 'type' => 'string', 'source' => 'html', 'selector' => 'code' ), ), array( - array( 'type' => 'attribute', 'selector' => array( 'type' => 'element', 'identifier' => 'img' ), 'attribute' => 'src' ), + array( 'type' => 'attribute', 'selector' => array( array( 'type' => 'element', 'identifier' => 'img' ) ), 'attribute' => 'src' ), array( 'type' => 'string', 'source' => 'attribute', 'selector' => 'img', 'attribute' => 'src' ), ), ); @@ -124,9 +140,9 @@ public function test_parses_css_selector( $expected, $input ) { public function data_parsed_css_selectors() { return array( - array( array( 'type' => 'element', 'identifier' => 'img' ), 'img' ), - array( array( 'type' => 'class', 'identifier' => 'block-group' ), '.block-group' ), - array( array( 'type' => 'hash', 'identifier' => 'input-form' ), '#input-form' ), + array( array( array( 'type' => 'element', 'identifier' => 'img' ) ), 'img' ), + array( array( array( 'type' => 'class', 'identifier' => 'block-group' ) ), '.block-group' ), + array( array( array( 'type' => 'hash', 'identifier' => 'input-form' ) ), '#input-form' ), ); } From ac30fa476793add5c2706b497e2e03f4fde02447 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 9 Dec 2022 17:27:51 -0700 Subject: [PATCH 03/19] Add support for child and descendant selection within the HTML processor --- .../html/class-wp-html-processor.php | 34 +++++++++++++++ .../html/class-wp-html-tag-processor.php | 4 +- phpunit/html/wp-html-processor-test.php | 41 +++++++++++++++++++ 3 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 phpunit/html/wp-html-processor-test.php diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index d6e7835a285af8..c71928ccf1cd0d 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -1,6 +1,40 @@ get_tag() ) ) { + return false; + } + + $tag_name = $this->get_tag(); + $balanced_depth = 1; + $depth = 1; + 
+ while ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) && $budget-- > 0 ) { + if ( $this->get_tag() === $tag_name && $this->is_tag_closer() && $balanced_depth === 1 ) { + return false; + } + + if ( $depth <= $max_depth ) { + $this->parse_query( $query ); + if ( $this->matches() ) { + return true; + } + } + + if ( ! self::is_html_void_element( $this->get_tag() ) ) { + $depth += $this->is_tag_closer() ? -1 : 1; + } + + if ( $this->get_tag() === $tag_name ) { + $balanced_depth += $this->is_tag_closer() ? -1 : 1; + } + } + + return false; + } + public function get_content_inside_balanced_tags() { static $start_name = null; static $end_name = null; diff --git a/lib/experimental/html/class-wp-html-tag-processor.php b/lib/experimental/html/class-wp-html-tag-processor.php index 6f5bb1f766a6ef..a0565c3d452864 100644 --- a/lib/experimental/html/class-wp-html-tag-processor.php +++ b/lib/experimental/html/class-wp-html-tag-processor.php @@ -1729,7 +1729,7 @@ public function get_updated_html() { * @type string $tag_closers "visit" or "skip": whether to stop on tag closers, e.g. . * } */ - private function parse_query( $query ) { + protected function parse_query( $query ) { if ( null !== $query && $query === $this->last_query ) { return; } @@ -1776,7 +1776,7 @@ private function parse_query( $query ) { * * @return boolean */ - private function matches() { + protected function matches() { if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) { return false; } diff --git a/phpunit/html/wp-html-processor-test.php b/phpunit/html/wp-html-processor-test.php new file mode 100644 index 00000000000000..00e8b812f62d17 --- /dev/null +++ b/phpunit/html/wp-html-processor-test.php @@ -0,0 +1,41 @@ +outside
inside
' ); + + $tags->next_tag( 'div' ); + $this->assertFalse( $tags->next_within_balanced_tags( 'img' ) ); + + $this->assertTrue( $tags->next_tag( 'div' ) ); + $this->assertTrue( $tags->next_within_balanced_tags( 'img' ) ); + } + + public function test_find_immediate_child_tag() { + $tags = new WP_HTML_Processor( '
' ); + + $tags->next_tag( 'div' ); + $this->assertFalse( $tags->next_within_balanced_tags( 'img', 1 ) ); + } + + public function test_find_child_tag() { + $tags = new WP_HTML_Processor( '
' ); + + $tags->next_tag( 'div' ); + $this->assertTrue( $tags->next_within_balanced_tags( 'img', 3 ) ); + } +} From fdbb6586d699ca8e63781a9c0059c2e5d56aa2ab Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 9 Dec 2022 17:41:40 -0700 Subject: [PATCH 04/19] Add additional test for child selection --- phpunit/html/wp-html-processor-test.php | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/phpunit/html/wp-html-processor-test.php b/phpunit/html/wp-html-processor-test.php index 00e8b812f62d17..3a0e1521937f9b 100644 --- a/phpunit/html/wp-html-processor-test.php +++ b/phpunit/html/wp-html-processor-test.php @@ -32,6 +32,13 @@ public function test_find_immediate_child_tag() { $this->assertFalse( $tags->next_within_balanced_tags( 'img', 1 ) ); } + public function test_find_immediate_child_tag2() { + $tags = new WP_HTML_Processor( '
' ); + + $tags->next_tag( 'div' ); + $this->assertTrue( $tags->next_within_balanced_tags( 'img', 1 ) ); + } + public function test_find_child_tag() { $tags = new WP_HTML_Processor( '
' ); From 97629edc28a1fcf1374d9c298729ed56919355f8 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sat, 10 Dec 2022 12:41:59 -0700 Subject: [PATCH 05/19] Add basic comment header to file --- .../html/class-wp-html-processor.php | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index c71928ccf1cd0d..d02376aa602705 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -1,5 +1,21 @@ ` but not clear to how + * handle `

` given that `` is a formatting element but `

` is + * not, that `

` itself is a special element. + */ + class WP_HTML_Processor extends WP_HTML_Tag_Processor { public function next_within_balanced_tags( $query, $max_depth = 1000 ) { $budget = 1000; From ebe3afac6974e6c3009f599d00d4154d6664fee6 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 13 Dec 2022 16:42:43 -0700 Subject: [PATCH 06/19] Add more complete CSS selector parsing, add failing test case for nested selector --- .../html/class-wp-html-attribute-sourcer.php | 185 +++++++++++------- .../html/wp-html-attribute-sourcer-test.php | 159 +++++++++++++-- 2 files changed, 262 insertions(+), 82 deletions(-) diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php index a2d9a0b61ebe5a..c2b3a182f1fee4 100644 --- a/lib/experimental/html/class-wp-html-attribute-sourcer.php +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -52,6 +52,18 @@ function array_is_list( $array ) { } } +class Selector { + public $element = null; + public $class = null; + public $hash = null; + public $has_attribute = null; + public $then = null; +} + +class ParseResult { + public $last_index; + public $selector = null; +} class WP_HTML_Attribute_Sourcer { /** @@ -121,22 +133,32 @@ public static function select( $selectors, $html ) { while ( $tags->next_tag() ) { foreach ( $selectors as $s ) { - if ( ! empty( $s['has_attribute'] ) && null === $tags->get_attribute( $s['has_attribute'] ) ) { + if ( ! empty( $s['tag_name'] ) && strtoupper( $s['tag_name'] ) !== $tags->get_tag() ) { continue; } - if ( 'element' === $s['type'] && $tags->get_tag() === strtoupper( $s['identifier'] ) ) { - return $tags; + if ( ! empty( $s['class_names'] ) ) { + $classes = $tags->get_attribute( 'class' ); + if ( null === $classes ) { + continue; + } + + foreach ( $s['class_names'] as $class_name ) { + if ( ! preg_match( "~\b{$class_name}\b~", $classes ) ) { + continue 2; + } + } } - // @TODO: $tags->has_class() would be _really_ handy here. 
- if ( 'class' === $s['type'] && preg_match( "~\b{$s['identifier']}\b~", $tags->get_attribute( 'class' ) ) ) { - return $tags; + if ( isset( $s['hash'] ) && $s['identifier'] !== $tags->get_attribute( 'id' ) ) { + continue; } - if ( 'hash' === $s['type'] && $s['identifier'] === $tags->get_attribute( 'id' ) ) { - return $tags; + if ( isset( $s['has_attribute'] ) && null === $tags->get_attribute( $s['has_attribute'] ) ) { + continue; } + + return $tags; } } @@ -161,13 +183,13 @@ public static function parse_definition( $definition ) { return 'inner-html'; } - $selector = self::parse_selector( $definition['selector'] ); - if ( null === $selector ) { + $selectors = self::parse_full_selector( $definition['selector'] ); + if ( null === $selectors ) { return 'unsupported'; } if ( 'html' === $source ) { - return array( 'type' => 'html', 'selector' => $selector ); + return array( 'type' => 'html', 'selector' => $selectors ); } $attribute = self::parse_attribute( $definition['attribute'] ); @@ -175,33 +197,94 @@ public static function parse_definition( $definition ) { return null; } - return array( 'type' => 'attribute', 'selector' => $selector, 'attribute' => $attribute ); + return array( 'type' => 'attribute', 'selector' => $selectors, 'attribute' => $attribute ); } - public static function parse_selector( $s, $at = 0 ) { - $budget = 1000; + public static function parse_full_selector( $s ) { $selectors = []; + $at = 0; - while ( $at < strlen( $s ) && $budget-- > 0 ) { - $type = 'element'; - $attribute = null; + while ( $at < strlen( $s ) ) { + $at += strspn( $s, " \f\n\r\t", $at ); + + list( $selector, $next_at ) = self::parse_selector( $s, $at ); + if ( null === $selector ) { + return null; + } + + $selectors[] = $selector; + $at = $next_at; + + if ( $at < strlen( $s ) && ',' !== $s[ $at ] ) { + return null; + } + $at++; + } + + return $selectors; + } + + public static function parse_selector( $s, $at = 0, $selector = [] ) { + $is_first = true; + + while ( $at < strlen( $s 
) && ',' !== $s[ $at ] ) { + /* + * Descendant combinators are harder to discover because we + * always have to skip whitespace, but that whitespace could + * be the combinator if we don't approach anything else first. + */ + $ws_length = strspn( $s, " \f\n\r\t", $at ); + $at += $ws_length; + + if ( !$is_first && $ws_length > 0 && 0 === strspn( $s[ $at ], '>+~' ) ) { + $at--; + $s[ $at ] = ' '; + } + $is_first = false; switch ( $s[ $at ] ) { + case '>': case '+': - // no support for adjacent sibling combinator - return null; + case '~': + case ' ': + $combinator = $s[ $at ]; + $at++; + $at += strspn( $s, " \f\n\r\t", $at ); + $inner = self::parse_selector( $s, $at ); + if ( null === $inner ) { + return null; + } + list( $inner_selector, $next_at ) = $inner; + $inner_selector['combinator'] = $combinator; + $selector['then'] = $inner_selector; + $at = $next_at; + break; - case '>': - // no support for child combinator - return null; + case '.': + $at++; + $class_name = self::parse_css_identifier( $s, $at ); + if ( null === $class_name ) { + return null; + } - case '~': - // no support for general sibling combinator - return null; + if ( ! isset( $selector['class_names'] ) ) { + $selector['class_names'] = array(); + } + $selector['class_names'][] = $class_name; + $at += strlen( $class_name ); + break; - case ' ': - // no support for descendant combinator - return null; + case '#': + $at++; + // @TODO: Hashes don't have to start with `nmstart` so this might reject valid hash names. 
+ $element_id = self::parse_css_identifier( $s, $at ); + if ( null === $element_id ) { + return null; + } + + $selector['hash'] = $element_id; + $at += strlen( $element_id ); + break; case '[': /* @@ -218,51 +301,21 @@ public static function parse_selector( $s, $at = 0 ) { $attribute = substr( $s, $at, $inside_length ); $at += $inside_length + 1; - $selector = array_pop( $selectors ); $selector['has_attribute'] = $attribute; - $selectors[] = $selector; - - continue 2; - - case ',': - $at++; - $at += strspn( $s, " \t\f\r\n", $at ); - continue 2; - - case ':': - // no support for pseudo-selectors - return null; - - case '#': - $type = 'hash'; - $at++; - break; - - case '.': - $type = 'class'; - $at++; break; - } - // @TODO: Hashes don't have to start with `nmstart` so this might reject valid hash names. - $identifier = self::parse_css_identifier( $s, $at ); - if ( null === $identifier ) { - return null; - } + default: + $tag_name = self::parse_css_identifier( $s, $at ); + if ( null === $tag_name ) { + return null; + } - $selector = array( 'type' => $type, 'identifier' => $identifier ); - if ( null !== $attribute ) { - $selector['has_attribute'] = $selector; + $selector['tag_name'] = $tag_name; + $at += strlen( $tag_name ); } - - $selectors[] = $selector; - - $at += strlen( $identifier ); } - return $at === strlen( $s ) - ? $selectors - : null; + return [ $selector, $at ]; } /** diff --git a/phpunit/html/wp-html-attribute-sourcer-test.php b/phpunit/html/wp-html-attribute-sourcer-test.php index 496aa1833badbc..ca61c94f27acd5 100644 --- a/phpunit/html/wp-html-attribute-sourcer-test.php +++ b/phpunit/html/wp-html-attribute-sourcer-test.php @@ -23,6 +23,30 @@ public function test_sources_attributes( $expected, $html, $attributes ) { public function data_sourced_attributes() { return array( + array( + array( 'attributes' => array( 'link' => 'docs.html' ), 'unparsed' => array() ), + << +

Just another section
+
+

Stuff

+ +
Still another section
+ +
Still another section
+ + +EOF, + array( + 'link' => array( + 'type' => 'string', + 'source' => 'attribute', + 'selector' => 'main section + div > a[href]', + 'attribute' => 'href', + ), + ), + ), + array( array( 'attributes' => array( 'src' => 'image.png' ), 'unparsed' => array() ), '
', @@ -113,7 +137,7 @@ public function data_parsed_block_attribute_definitions() { array( 'type' => 'string' ), ), array( - 'unsupported', + array( 'type' => 'attribute', 'selector' => array( array( 'tag_name' => 'div', 'then' => array( 'tag_name' => 'img', 'combinator' => '+' ) ) ), 'attribute' => 'src' ), array( 'type' => 'string', 'source' => 'attribute', 'selector' => 'div + img', 'attribute' => 'src' ), ), array( @@ -121,11 +145,11 @@ public function data_parsed_block_attribute_definitions() { array( 'type' => 'string', 'source' => 'html' ), ), array( - array( 'type' => 'html', 'selector' => array( array( 'type' => 'element', 'identifier' => 'code' ) ) ), + array( 'type' => 'html', 'selector' => array( array( 'tag_name' => 'code' ) ) ), array( 'type' => 'string', 'source' => 'html', 'selector' => 'code' ), ), array( - array( 'type' => 'attribute', 'selector' => array( array( 'type' => 'element', 'identifier' => 'img' ) ), 'attribute' => 'src' ), + array( 'type' => 'attribute', 'selector' => array( array( 'tag_name' => 'img' ) ), 'attribute' => 'src' ), array( 'type' => 'string', 'source' => 'attribute', 'selector' => 'img', 'attribute' => 'src' ), ), ); @@ -135,14 +159,117 @@ public function data_parsed_block_attribute_definitions() { * @dataProvider data_parsed_css_selectors */ public function test_parses_css_selector( $expected, $input ) { - $this->assertSame($expected, WP_HTML_Attribute_Sourcer::parse_selector( $input ) ); + $this->assertSame($expected, WP_HTML_Attribute_Sourcer::parse_full_selector( $input ) ); } public function data_parsed_css_selectors() { return array( - array( array( array( 'type' => 'element', 'identifier' => 'img' ) ), 'img' ), - array( array( array( 'type' => 'class', 'identifier' => 'block-group' ) ), '.block-group' ), - array( array( array( 'type' => 'hash', 'identifier' => 'input-form' ) ), '#input-form' ), + array( array( array( 'tag_name' => 'img' ) ), 'img' ), + array( array( array( 'class_names' => array( 'block-group' ) ) ), 
'.block-group' ), + array( array( array( 'hash' => 'input-form' ) ), '#input-form' ), + array( + array( + array( + 'tag_name' => 'div', + 'then' => array( + 'class_names' => array( 'important' ), + 'combinator' => '>', + ) + ) + ), + 'div > .important', + ), + array( + array( + array( + 'tag_name' => 'img', + 'then' => array( + 'tag_name' => 'p', + 'combinator' => '+', + ) + ) + ), + 'img + p', + ), + array( + array( + array( + 'tag_name' => 'img', + 'then' => array( + 'tag_name' => 'p', + 'combinator' => '~', + ) + ) + ), + 'img ~ p', + ), + array( + array( + array( + 'tag_name' => 'main', + 'then' => array( + 'tag_name' => 'section', + 'then' => array( + 'class_names' => array( 'title' ), + 'combinator' => '+', + ), + 'combinator' => '>', + ) + ), + array( 'hash' => 'title' ) + ), + 'main > section + .title, #title', + ), + array( + array( + array( + 'tag_name' => 'li', + 'then' => array( + 'tag_name' => 'em', + 'combinator' => ' ', + ) + ) + ), + 'li em', + ), + array( + array( + array( + 'tag_name' => 'main', + 'then' => array( + 'tag_name' => 'section', + 'then' => array( + 'class_names' => array( 'title' ), + 'combinator' => '+', + ), + 'combinator' => '>', + ) + ), + array( + 'hash' => 'title', + 'then' => array( + 'tag_name' => 'h2', + 'then' => array( + 'tag_name' => 'em', + 'then' => array( + 'class_names' => array( 'really' ), + 'combinator' => ' ' + ), + 'combinator' => ' ', + ), + 'combinator' => '~', + ), + ), + array( + 'class_names' => array( + 'another', + 'class', + 'combined', + ), + ), + ), + 'main > section + .title, #title ~ h2 em .really, .another.class.combined', + ) ); } @@ -150,26 +277,26 @@ public function data_parsed_css_selectors() { * @dataProvider data_multi_parsed_css_selectors */ public function test_parses_multi_css_selectors( $expected, $input ) { - $this->assertSame( $expected, WP_HTML_Attribute_Sourcer::parse_selector( $input ) ); + $this->assertSame( $expected, WP_HTML_Attribute_Sourcer::parse_full_selector( $input ) ); } 
public function data_multi_parsed_css_selectors() { return array( array( array( - array( 'type' => 'element', 'identifier' => 'img' ), - array( 'type' => 'class', 'identifier' => 'full-width' ), + array( 'tag_name' => 'img' ), + array( 'class_names' => array( 'full-width' ) ), ), 'img, .full-width' ), array( array( - array( 'type' => 'element', 'identifier' => 'h1' ), - array( 'type' => 'element', 'identifier' => 'h2' ), - array( 'type' => 'element', 'identifier' => 'h3' ), - array( 'type' => 'element', 'identifier' => 'h4' ), - array( 'type' => 'element', 'identifier' => 'h5' ), - array( 'type' => 'element', 'identifier' => 'h6' ), + array( 'tag_name' => 'h1' ), + array( 'tag_name' => 'h2' ), + array( 'tag_name' => 'h3' ), + array( 'tag_name' => 'h4' ), + array( 'tag_name' => 'h5' ), + array( 'tag_name' => 'h6' ), ), 'h1,h2,h3,h4,h5,h6' ) From 3c63e24fe82f06b2c196d9edf1f3af0c01a64c34 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 14 Dec 2022 13:02:14 -0700 Subject: [PATCH 07/19] Restore tag processor pointer when getting content --- lib/experimental/html/class-wp-html-processor.php | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index d02376aa602705..e1caf7d4e7140c 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -87,6 +87,7 @@ public function get_content_inside_balanced_tags() { } $content = $this->content_inside_bookmarks( $start_name, $end_name ); + $this->seek( $start_name ); $this->release_bookmark( $start_name ); $this->release_bookmark( $end_name ); From 3f7c419861ddbed17a9435552425fc5c13ef2966 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 14 Dec 2022 16:50:11 -0700 Subject: [PATCH 08/19] Make things worse, but lead towards making them better; GOTO --- .../html/class-wp-html-attribute-sourcer.php | 138 ++++++++++++++++-- .../html/class-wp-html-processor.php | 34 
+++-- .../html/wp-html-attribute-sourcer-test.php | 20 +-- phpunit/html/wp-html-processor-test.php | 15 +- 4 files changed, 165 insertions(+), 42 deletions(-) diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php index c2b3a182f1fee4..697228df04b34a 100644 --- a/lib/experimental/html/class-wp-html-attribute-sourcer.php +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -128,37 +128,149 @@ public function source_attributes() { ); } + public static function select_match( $tags, $s ) { + if ( ! empty( $s['tag_name'] ) && strtoupper( $s['tag_name'] ) !== $tags->get_tag() ) { + return null; + } + + if ( ! empty( $s['class_names'] ) ) { + $classes = $tags->get_attribute( 'class' ); + if ( null === $classes ) { + return null; + } + + foreach ( $s['class_names'] as $class_name ) { + if ( ! preg_match( "~\b{$class_name}\b~", $classes ) ) { + return null; + } + } + } + + if ( isset( $s['hash'] ) && $s['identifier'] !== $tags->get_attribute( 'id' ) ) { + return null; + } + + if ( isset( $s['has_attribute'] ) && null === $tags->get_attribute( $s['has_attribute'] ) ) { + return null; + } + + return $tags; + } + public static function select( $selectors, $html ) { $tags = new WP_HTML_Processor( $html ); + if ( ! $tags->next_tag() ) { + return null; + } + + $tags->set_bookmark( 'start' ); - while ( $tags->next_tag() ) { - foreach ( $selectors as $s ) { - if ( ! empty( $s['tag_name'] ) && strtoupper( $s['tag_name'] ) !== $tags->get_tag() ) { + foreach ( $selectors as $s ) { + $tags->seek( 'start' ); + $max = 100; + while ( --$max > 0 ) { + $next = $s; + + // This label is probably where some stack-level data should reside. + next: + // Find the next starting point + while ( null === self::select_match( $tags, $next ) && $tags->next_tag() ) { continue; } - if ( ! 
empty( $s['class_names'] ) ) { - $classes = $tags->get_attribute( 'class' ); - if ( null === $classes ) { + // We're out of possible starting points + if ( null === self::select_match( $tags, $next ) ) { + continue 2; + } + + // No further selectors, then bingo! + if ( ! isset( $next['then'] ) ) { + return $tags; + } + + $next = $next['then']; + + // Adjacent sibling must be the immediately-following element. + if ( '+' === $next['combinator'] ) { + var_dump( [ + 'msg' => "Processing adjacent sibling", + 'html' => $html, + 'tag' => $tags->get_tag(), + 'selector' => $next + ] ); + $state = []; + while ( $tags->next_within_balanced_tags( $state ) ) { continue; } - foreach ( $s['class_names'] as $class_name ) { - if ( ! preg_match( "~\b{$class_name}\b~", $classes ) ) { - continue 2; - } + $tags->next_tag(); + if ( null === self::select_match( $tags, $next ) ) { + continue; } + + if ( isset( $next['then'] ) ) { + goto next; + } + + // @TODO: Recurse here so we can handle more than one level. + return $tags; } - if ( isset( $s['hash'] ) && $s['identifier'] !== $tags->get_attribute( 'id' ) ) { + // Child must be one level into current tag. + if ( '>' === $next['combinator'] ) { + var_dump( [ + 'msg' => "Processing child", + 'html' => $html, + 'tag' => $tags->get_tag(), + 'selector' => $next + ] ); + $state = []; + while ( $tags->next_within_balanced_tags( $state, null, 1 ) ) { + if ( null === self::select_match( $tags, $next ) ) { + continue; + } + + if ( isset( $next['then'] ) ) { + goto next; + } + + // @TODO: Recurse here so we can handle more than one level. + return $tags; + } + continue; } - if ( isset( $s['has_attribute'] ) && null === $tags->get_attribute( $s['has_attribute'] ) ) { + // Descendant can be anywhere inside current tag. 
+ if ( ' ' === $next['combinator'] ) { + var_dump( [ + 'msg' => "Processing descendant", + 'html' => $html, + 'tag' => $tags->get_tag(), + 'selector' => $next + ] ); + $state = []; + while ( $tags->next_within_balanced_tags( $state ) ) { + if ( null === self::select_match( $tags, $next ) ) { + continue; + } + + if ( isset( $next['then'] ) ) { + goto next; + } + + // @TODO: Recurse here so we can handle more than one level. + return $tags; + } + continue; } - return $tags; + // General sibling must be anything at current level. + if ( '~' === $next['combinator'] ) { + // @TODO: Support this. + return null; + } } } diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index e1caf7d4e7140c..a4179455892e6c 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -17,22 +17,28 @@ */ class WP_HTML_Processor extends WP_HTML_Tag_Processor { - public function next_within_balanced_tags( $query, $max_depth = 1000 ) { - $budget = 1000; - if ( self::is_html_void_element( $this->get_tag() ) ) { - return false; + public function next_within_balanced_tags( &$state, $query = null, $max_depth = 1000 ) { + if ( empty( $state ) ) { + $state['budget'] = 1000; + $state['tag_name'] = $this->get_tag(); + $state['balanced_depth'] = 1; + $state['depth'] = 1; + + if ( self::is_html_void_element( $this->get_tag() ) ) { + return false; + } } - $tag_name = $this->get_tag(); - $balanced_depth = 1; - $depth = 1; - - while ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) && $budget-- > 0 ) { - if ( $this->get_tag() === $tag_name && $this->is_tag_closer() && $balanced_depth === 1 ) { + while ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) && $state['budget']-- > 0 ) { + if ( + $this->get_tag() === $state['tag_name'] && + $this->is_tag_closer() && + $state['balanced_depth'] === 1 + ) { return false; } - if ( $depth <= $max_depth ) { + if ( $state['depth'] <= 
$max_depth ) { $this->parse_query( $query ); if ( $this->matches() ) { return true; @@ -40,11 +46,11 @@ public function next_within_balanced_tags( $query, $max_depth = 1000 ) { } if ( ! self::is_html_void_element( $this->get_tag() ) ) { - $depth += $this->is_tag_closer() ? -1 : 1; + $state['depth'] += $this->is_tag_closer() ? -1 : 1; } - if ( $this->get_tag() === $tag_name ) { - $balanced_depth += $this->is_tag_closer() ? -1 : 1; + if ( $this->get_tag() === $state['tag_name'] ) { + $state['balanced_depth'] += $this->is_tag_closer() ? -1 : 1; } } diff --git a/phpunit/html/wp-html-attribute-sourcer-test.php b/phpunit/html/wp-html-attribute-sourcer-test.php index ca61c94f27acd5..a402956a09588f 100644 --- a/phpunit/html/wp-html-attribute-sourcer-test.php +++ b/phpunit/html/wp-html-attribute-sourcer-test.php @@ -26,16 +26,16 @@ public function data_sourced_attributes() { array( array( 'attributes' => array( 'link' => 'docs.html' ), 'unparsed' => array() ), << -
Just another section
-
-

Stuff

- -
Still another section
- -
Still another section
- - +
+
Just another section
+
+

Stuff

+ +
Still another section
+ +
Still another section
+ +
EOF, array( 'link' => array( diff --git a/phpunit/html/wp-html-processor-test.php b/phpunit/html/wp-html-processor-test.php index 3a0e1521937f9b..027a314cd3325e 100644 --- a/phpunit/html/wp-html-processor-test.php +++ b/phpunit/html/wp-html-processor-test.php @@ -19,30 +19,35 @@ public function test_find_descendant_tag() { $tags = new WP_HTML_Processor( '
outside
inside
' ); $tags->next_tag( 'div' ); - $this->assertFalse( $tags->next_within_balanced_tags( 'img' ) ); + $state = []; + $this->assertFalse( $tags->next_within_balanced_tags( $state, 'img' ) ); $this->assertTrue( $tags->next_tag( 'div' ) ); - $this->assertTrue( $tags->next_within_balanced_tags( 'img' ) ); + $state = []; + $this->assertTrue( $tags->next_within_balanced_tags( $state, 'img' ) ); } public function test_find_immediate_child_tag() { $tags = new WP_HTML_Processor( '
' ); $tags->next_tag( 'div' ); - $this->assertFalse( $tags->next_within_balanced_tags( 'img', 1 ) ); + $state = []; + $this->assertFalse( $tags->next_within_balanced_tags( $state, 'img', 1 ) ); } public function test_find_immediate_child_tag2() { $tags = new WP_HTML_Processor( '
' ); $tags->next_tag( 'div' ); - $this->assertTrue( $tags->next_within_balanced_tags( 'img', 1 ) ); + $state = []; + $this->assertTrue( $tags->next_within_balanced_tags( $state, 'img', 1 ) ); } public function test_find_child_tag() { $tags = new WP_HTML_Processor( '
' ); $tags->next_tag( 'div' ); - $this->assertTrue( $tags->next_within_balanced_tags( 'img', 3 ) ); + $state = []; + $this->assertTrue( $tags->next_within_balanced_tags( $state, 'img', 3 ) ); } } From 02e3e5cc2ae026d94b0f62551132c1dc21f2c1ec Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 15 Dec 2022 14:33:08 -0700 Subject: [PATCH 09/19] Scribble notes --- .../html/class-wp-html-attribute-sourcer.php | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php index 697228df04b34a..d6e701e6174a19 100644 --- a/lib/experimental/html/class-wp-html-attribute-sourcer.php +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -1,5 +1,15 @@ Date: Wed, 21 Dec 2022 16:58:07 -0700 Subject: [PATCH 10/19] Revise HTML navigation to use `balanced_next` with a depth tracker Tests are broken because the attribute sourcer needs rework given this different approach for navigating tags. 
--- .../html/class-wp-html-attribute-sourcer.php | 13 +- .../html/class-wp-html-processor.php | 152 +++++++++++++----- phpunit/html/wp-html-processor-test.php | 37 +++-- 3 files changed, 142 insertions(+), 60 deletions(-) diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php index d6e701e6174a19..22bfb94ea05c46 100644 --- a/lib/experimental/html/class-wp-html-attribute-sourcer.php +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -208,8 +208,8 @@ public static function select( $selectors, $html ) { 'tag' => $tags->get_tag(), 'selector' => $next ] ); - $state = []; - while ( $tags->next_within_balanced_tags( $state ) ) { + $state = WP_HTML_Processor::new_state(); + while ( $tags->balanced_next( $state ) ) { continue; } @@ -234,8 +234,9 @@ public static function select( $selectors, $html ) { 'tag' => $tags->get_tag(), 'selector' => $next ] ); - $state = []; - while ( $tags->next_within_balanced_tags( $state, null, 1 ) ) { + $state = WP_HTML_Processor::new_state(); + $state->match_depth = 1; + while ( $tags->balanced_next( $state ) ) { if ( null === self::select_match( $tags, $next ) ) { continue; } @@ -259,8 +260,8 @@ public static function select( $selectors, $html ) { 'tag' => $tags->get_tag(), 'selector' => $next ] ); - $state = []; - while ( $tags->next_within_balanced_tags( $state ) ) { + $state = WP_HTML_Processor::new_state(); + while ( $tags->balanced_next( $state ) ) { if ( null === self::select_match( $tags, $next ) ) { continue; } diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index a4179455892e6c..7a207f287abc9d 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -16,42 +16,115 @@ * not, that `

` itself is a special element. */ + +class WP_HTML_Processor_Scan_State { + public $budget = 1000; + public $open_tags = array(); + public $bail_depth = 0; + public $match_depth = null; + + public function relative_depth() { + return count( $this->open_tags ); + } + + public function also_scan_siblings() { + $this->bail_depth = -1; + } +} + + class WP_HTML_Processor extends WP_HTML_Tag_Processor { - public function next_within_balanced_tags( &$state, $query = null, $max_depth = 1000 ) { - if ( empty( $state ) ) { - $state['budget'] = 1000; - $state['tag_name'] = $this->get_tag(); - $state['balanced_depth'] = 1; - $state['depth'] = 1; - - if ( self::is_html_void_element( $this->get_tag() ) ) { - return false; + public static function new_state() { + return new WP_HTML_Processor_Scan_State(); + } + + public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = null ) { + while ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) && $state->budget-- > 0 ) { + $tag_name = $this->get_tag(); + $is_closer = $this->is_tag_closer(); + $is_void = self::is_html_void_element( $tag_name ); + $type = self::classify_tag_type( $is_closer, $is_void ); + + /* + * Step 1. Update the stack of open tags. + * + * If and when we add more complete HTML parsing support we will also + * need to track the stack of active formats so that we can properly + * handle missing tags and overlapping tags. + */ + + switch ( $type ) { + case 'void': + /* + * Void tags (such as ) can't have children and so we + * won't push or pop them from the stack of open tags. + * + * If and when we support self-closing foreign tags we would + * need to separately track those, but their behavior matches + * this case. The self-closing flag is ignored for HTML5 tags. + */ + break; + + case 'opener': + $state->open_tags[] = $tag_name; + break; + + case 'closer': + $last_tag = array_pop( $state->open_tags ); + + /* + * Currently we can only support fully-normative and balanced HTML5. 
+ * If we encounter anything we don't expect then we will bail. In a + * future update we may perform more careful HTML parsing and unlock + * navigating through non-normative documents. + */ + if ( $last_tag !== $tag_name ) { + return false; + } + break; } - } - while ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) && $state['budget']-- > 0 ) { - if ( - $this->get_tag() === $state['tag_name'] && - $this->is_tag_closer() && - $state['balanced_depth'] === 1 - ) { + /* + * Void elements don't enter the stack, but they do exist in the + * depth hierarchy, so we have to temporarily account for that. + * + * We could have followed the approach in the HTML5 spec by appending + * the void tag to the stack of open tags, and then remember to pop it + * when existing this function, but by tracking it like this we don't + * have to remember to do that. + */ + $depth = $type === 'void' + ? $state->relative_depth() + 1 + : $state->relative_depth(); + + /* + * Step 2. If we've reached the depth at which we want to stop searching, + * then bail at the current tag. This is mostly used to stop at the end + * of the opening tag's closing tag, but if set negative can continue + * scanning sibling elements (-1) or parents (-2) and so on. + */ + + if ( $state->bail_depth === $depth ) { return false; } - if ( $state['depth'] <= $max_depth ) { + /* + * Step 3. Determine if we have a matching tag. In addition to the query + * we pass along to the underlying tag processor we're going to allow + * specifying the relative depth for a match. For example, a CSS child + * combinator would specify that a match must have a relative depth of 1, + * indicating that it's a direct child of the surrounding element, whereas + * the descendant selector could match at any depth and so sets this to `null`. + * To prevent matching _above_ a tag we rely on the `bail_depth` to stop + * searching once we've exited the tag on which we started, or reach its parent. + */ + + if ( ! 
isset( $state->match_depth ) || $state->match_depth === $depth ) { $this->parse_query( $query ); if ( $this->matches() ) { return true; } } - - if ( ! self::is_html_void_element( $this->get_tag() ) ) { - $state['depth'] += $this->is_tag_closer() ? -1 : 1; - } - - if ( $this->get_tag() === $state['tag_name'] ) { - $state['balanced_depth'] += $this->is_tag_closer() ? -1 : 1; - } } return false; @@ -72,26 +145,13 @@ public function get_content_inside_balanced_tags() { } $this->set_bookmark( $start_name ); - $tag_name = $this->get_tag(); - $depth = 1; - if ( self::is_html_void_element( $tag_name ) ) { - return ''; - } - - while ( $this->next_tag( [ 'tag_closers' => 'visit' ] ) ) { - if ( $this->get_tag() !== $tag_name ) { - continue; - } - - if ( $this->is_tag_closer() && $depth === 1 ) { - $this->set_bookmark( $end_name ); - break; - } - - $depth += $this->is_tag_closer() ? -1 : 1; + $state = self::new_state(); + while ( $this->balanced_next( $state ) ) { + continue; } + $this->set_bookmark( $end_name ); $content = $this->content_inside_bookmarks( $start_name, $end_name ); $this->seek( $start_name ); @@ -116,6 +176,14 @@ private function content_inside_bookmarks( $start_bookmark, $end_bookmark ) { * HTML-related Utility Functions */ + public static function classify_tag_type( $is_closer, $is_void ) { + if ( $is_void ) { + return 'void'; + } + + return $is_closer ? 'closer' : 'opener'; + } + /** * @see https://html.spec.whatwg.org/#elements-2 */ diff --git a/phpunit/html/wp-html-processor-test.php b/phpunit/html/wp-html-processor-test.php index 027a314cd3325e..b14408f0ee287d 100644 --- a/phpunit/html/wp-html-processor-test.php +++ b/phpunit/html/wp-html-processor-test.php @@ -10,7 +10,7 @@ require_once __DIR__ . '/../../lib/experimental/html/class-wp-html-processor.php'; /** - * @group html + * @group html-proc * * @coversDefaultClass WP_HTML_Processor */ @@ -19,35 +19,48 @@ public function test_find_descendant_tag() { $tags = new WP_HTML_Processor( '

outside
inside
' ); $tags->next_tag( 'div' ); - $state = []; - $this->assertFalse( $tags->next_within_balanced_tags( $state, 'img' ) ); + $state = WP_HTML_Processor::new_state(); + $this->assertFalse( $tags->balanced_next( $state, 'img' ) ); $this->assertTrue( $tags->next_tag( 'div' ) ); - $state = []; - $this->assertTrue( $tags->next_within_balanced_tags( $state, 'img' ) ); + $state = WP_HTML_Processor::new_state(); + $this->assertTrue( $tags->balanced_next( $state, 'img' ) ); } public function test_find_immediate_child_tag() { $tags = new WP_HTML_Processor( '
' ); $tags->next_tag( 'div' ); - $state = []; - $this->assertFalse( $tags->next_within_balanced_tags( $state, 'img', 1 ) ); + $state = WP_HTML_Processor::new_state(); + $state->match_depth = 1; + $this->assertFalse( $tags->balanced_next( $state, 'img' ) ); } - public function test_find_immediate_child_tag2() { + public function test_find_immediate_child_of_fails_when_inside_sibling_of_current_tag() { $tags = new WP_HTML_Processor( '
' ); $tags->next_tag( 'div' ); - $state = []; - $this->assertTrue( $tags->next_within_balanced_tags( $state, 'img', 1 ) ); + $state = WP_HTML_Processor::new_state(); + $state->match_depth = 1; + $this->assertFalse( $tags->balanced_next( $state, 'img' ) ); + } + + public function test_find_immediate_child_succeeds_when_inside_sibling_of_current_tag_and_searchign_siblings() { + $tags = new WP_HTML_Processor( '
' ); + + $tags->next_tag( 'div' ); + $state = WP_HTML_Processor::new_state(); + $state->also_scan_siblings(); + $state->match_depth = 1; + $this->assertTrue( $tags->balanced_next( $state, 'img' ) ); } public function test_find_child_tag() { $tags = new WP_HTML_Processor( '
' ); $tags->next_tag( 'div' ); - $state = []; - $this->assertTrue( $tags->next_within_balanced_tags( $state, 'img', 3 ) ); + $state = WP_HTML_Processor::new_state(); + $state->match_depth = 3; + $this->assertTrue( $tags->balanced_next( $state, 'img' ) ); } } From 40502823072efced0551276afcef44495785ee25 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 21 Dec 2022 17:55:59 -0700 Subject: [PATCH 11/19] Remove bail depth since we can't track a negative-sized stack For sibling navigation we have to continue processing outside of balanced_next(). --- .../html/class-wp-html-attribute-sourcer.php | 19 ++---------- .../html/class-wp-html-processor.php | 26 ++++++++--------- phpunit/html/wp-html-processor-test.php | 29 +++++++------------ 3 files changed, 25 insertions(+), 49 deletions(-) diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php index 22bfb94ea05c46..da4fb73689fa32 100644 --- a/lib/experimental/html/class-wp-html-attribute-sourcer.php +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -62,19 +62,6 @@ function array_is_list( $array ) { } } -class Selector { - public $element = null; - public $class = null; - public $hash = null; - public $has_attribute = null; - public $then = null; -} - -class ParseResult { - public $last_index; - public $selector = null; -} - class WP_HTML_Attribute_Sourcer { /** * Attributes definitions, typically from `block.json`. 
@@ -208,7 +195,7 @@ public static function select( $selectors, $html ) { 'tag' => $tags->get_tag(), 'selector' => $next ] ); - $state = WP_HTML_Processor::new_state(); + $state = $tags->new_state(); while ( $tags->balanced_next( $state ) ) { continue; } @@ -234,7 +221,7 @@ public static function select( $selectors, $html ) { 'tag' => $tags->get_tag(), 'selector' => $next ] ); - $state = WP_HTML_Processor::new_state(); + $state = $tags->new_state(); $state->match_depth = 1; while ( $tags->balanced_next( $state ) ) { if ( null === self::select_match( $tags, $next ) ) { @@ -260,7 +247,7 @@ public static function select( $selectors, $html ) { 'tag' => $tags->get_tag(), 'selector' => $next ] ); - $state = WP_HTML_Processor::new_state(); + $state = $tags->new_state(); while ( $tags->balanced_next( $state ) ) { if ( null === self::select_match( $tags, $next ) ) { continue; diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index 7a207f287abc9d..a36d77750466aa 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -20,22 +20,24 @@ class WP_HTML_Processor_Scan_State { public $budget = 1000; public $open_tags = array(); - public $bail_depth = 0; public $match_depth = null; public function relative_depth() { return count( $this->open_tags ); } - - public function also_scan_siblings() { - $this->bail_depth = -1; - } } class WP_HTML_Processor extends WP_HTML_Tag_Processor { - public static function new_state() { - return new WP_HTML_Processor_Scan_State(); + public function new_state() { + $state = new WP_HTML_Processor_Scan_State(); + $tag_name = $this->get_tag(); + + if ( ! self::is_html_void_element( $tag_name ) && ! 
$this->is_tag_closer() ) { + $state->open_tags[] = $tag_name; + } + + return $state; } public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = null ) { @@ -98,13 +100,9 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul : $state->relative_depth(); /* - * Step 2. If we've reached the depth at which we want to stop searching, - * then bail at the current tag. This is mostly used to stop at the end - * of the opening tag's closing tag, but if set negative can continue - * scanning sibling elements (-1) or parents (-2) and so on. + * Step 2. Bail if we've reached the end of the tag in which we started. */ - - if ( $state->bail_depth === $depth ) { + if ( 0 === $depth ) { return false; } @@ -119,7 +117,7 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul * searching once we've exited the tag on which we started, or reach its parent. */ - if ( ! isset( $state->match_depth ) || $state->match_depth === $depth ) { + if ( ! isset( $state->match_depth ) || $state->match_depth + 1 === $depth ) { $this->parse_query( $query ); if ( $this->matches() ) { return true; diff --git a/phpunit/html/wp-html-processor-test.php b/phpunit/html/wp-html-processor-test.php index b14408f0ee287d..36ec241f1fd26c 100644 --- a/phpunit/html/wp-html-processor-test.php +++ b/phpunit/html/wp-html-processor-test.php @@ -10,7 +10,7 @@ require_once __DIR__ . '/../../lib/experimental/html/class-wp-html-processor.php'; /** - * @group html-proc + * @group html-processor * * @coversDefaultClass WP_HTML_Processor */ @@ -19,11 +19,11 @@ public function test_find_descendant_tag() { $tags = new WP_HTML_Processor( '
outside
inside
' ); $tags->next_tag( 'div' ); - $state = WP_HTML_Processor::new_state(); + $state = $tags->new_state(); $this->assertFalse( $tags->balanced_next( $state, 'img' ) ); $this->assertTrue( $tags->next_tag( 'div' ) ); - $state = WP_HTML_Processor::new_state(); + $state = $tags->new_state(); $this->assertTrue( $tags->balanced_next( $state, 'img' ) ); } @@ -31,35 +31,26 @@ public function test_find_immediate_child_tag() { $tags = new WP_HTML_Processor( '
' ); $tags->next_tag( 'div' ); - $state = WP_HTML_Processor::new_state(); + $state = $tags->new_state(); $state->match_depth = 1; $this->assertFalse( $tags->balanced_next( $state, 'img' ) ); } - public function test_find_immediate_child_of_fails_when_inside_sibling_of_current_tag() { - $tags = new WP_HTML_Processor( '
' ); + public function test_find_immediate_child_tag2() { + $tags = new WP_HTML_Processor( '
' ); $tags->next_tag( 'div' ); - $state = WP_HTML_Processor::new_state(); + $state = $tags->new_state(); $state->match_depth = 1; - $this->assertFalse( $tags->balanced_next( $state, 'img' ) ); - } - - public function test_find_immediate_child_succeeds_when_inside_sibling_of_current_tag_and_searchign_siblings() { - $tags = new WP_HTML_Processor( '
' ); - - $tags->next_tag( 'div' ); - $state = WP_HTML_Processor::new_state(); - $state->also_scan_siblings(); - $state->match_depth = 1; - $this->assertTrue( $tags->balanced_next( $state, 'img' ) ); + $this->assertTrue( $tags->balanced_next( $state, 'img' ), 'Did not find the wanted ' ); + $this->assertTrue( $tags->get_attribute( 'wanted' ), 'Found the wrong ' ); } public function test_find_child_tag() { $tags = new WP_HTML_Processor( '
' ); $tags->next_tag( 'div' ); - $state = WP_HTML_Processor::new_state(); + $state = $tags->new_state(); $state->match_depth = 3; $this->assertTrue( $tags->balanced_next( $state, 'img' ) ); } From c51a66e6a6e7ce718eee5708b88e9616b9d9edf0 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 21 Dec 2022 18:25:03 -0700 Subject: [PATCH 12/19] Exit the enclosing tag in balanced_next, add tests --- .../html/class-wp-html-processor.php | 3 + phpunit/html/wp-html-processor-test.php | 157 ++++++++++++++++++ 2 files changed, 160 insertions(+) diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index a36d77750466aa..135b562898da2a 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -81,6 +81,7 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul * navigating through non-normative documents. */ if ( $last_tag !== $tag_name ) { + $this->next_tag(); return false; } break; @@ -103,6 +104,7 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul * Step 2. Bail if we've reached the end of the tag in which we started. */ if ( 0 === $depth ) { + $this->next_tag(); return false; } @@ -125,6 +127,7 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul } } + $this->next_tag(); return false; } diff --git a/phpunit/html/wp-html-processor-test.php b/phpunit/html/wp-html-processor-test.php index 36ec241f1fd26c..d5b74b051042ba 100644 --- a/phpunit/html/wp-html-processor-test.php +++ b/phpunit/html/wp-html-processor-test.php @@ -54,4 +54,161 @@ public function test_find_child_tag() { $state->match_depth = 3; $this->assertTrue( $tags->balanced_next( $state, 'img' ) ); } + + public function test_flushes_up_to_close_tag_from_deep_within() { + $tags = new WP_HTML_Processor( << +
+

Cows

+
+

Cows are clever.

+

Cows eat grass.

+
+

Things cows can't do

+
    +
  • Pilot aeroplanes

  • +
  • Drive race cars

  • +
  • Captain ships

  • +
+

This concludes our discussion of cows.

+
+
+

Oxen

+
+

Oxen are strong.

+
+
+ +HTML + ); + + $tags->next_tag( 'section' ); + $state = $tags->new_state(); + + // Jump inside this tag + $tags->balanced_next( $state, 'p' ); + $this->assertTrue( $tags->get_attribute( 'start' ) ); + // Then exit the outer section we were scanning. + while ( $tags->balanced_next( $state ) ) { + continue; + } + + $this->assertEquals( 'SECTION', $tags->get_tag() ); + $tags->next_tag( 'p' ); + $this->assertTrue( $tags->get_attribute( 'wanted' ) ); + } + + public function test_can_navigate_with_unique_state_throughout_structure() { + $tags = new WP_HTML_Processor( << +
+

Cows

+
+

Cows are clever.

+

Cows eat grass.

+
+

Things cows can't do

+
    +
  • Pilot aeroplanes

  • +
  • Drive race cars

  • +
  • Captain ships

  • +
+

This concludes our discussion of cows.

+
+
+

Oxen

+
+

Oxen are strong.

+
+
+ +HTML + ); + + $tags->next_tag( 'section' ); + $state = $tags->new_state(); + + // Jump inside this tag + $tags->balanced_next( $state, 'p' ); + $this->assertTrue( $tags->get_attribute( 'start' ) ); + + // Establish a new state/frame for navigating inside the outer structure. + $tags->balanced_next( $state, 'ul' ); + $li_count = 0; + $li_state = $tags->new_state(); + while ( $tags->balanced_next( $li_state, 'li' ) ) { + $li_count++; + } + $this->assertEquals( 3, $li_count ); + + // Ensure that we ended up where we expected. + $this->assertEquals( 'P', $tags->get_tag() ); + $this->assertTrue( $tags->get_attribute( 'inner' ) ); + + // And now flush out the previous stack/frame + while ( $tags->balanced_next( $state ) ) { + continue; + } + + // Ensure that we're back where we want to be after exiting two separate frames. + $this->assertEquals( 'SECTION', $tags->get_tag() ); + $tags->next_tag( 'p' ); + $this->assertTrue( $tags->get_attribute( 'wanted' ) ); + } + + public function test_can_scan_through_tags_at_a_given_depth() { + $tags = new WP_HTML_Processor( << +
+

Cows

+
+

Cows are clever.

+

Cows eat grass.

+
+

Things cows can't do

+
    +
  • Pilot aeroplanes

  • +
  • Drive race cars

  • +
  • Captain ships

  • +
+

Things cows can do

+
    +
  • Chew cud

  • +
  • Moo

  • +
+

This concludes our discussion of cows.

+
+
+

Oxen

+
+

Oxen are strong.

+
+
+ +HTML + ); + + $tags->next_tag( 'section' ); + $state = $tags->new_state(); + $state->match_depth = 3; + + $p3_count = 0; + while ( $tags->balanced_next( $state, 'p' ) ) { + $p3_count++; + } + + // Did we only visit the tags inside section > * > * > p? + $this->assertEquals( 5, $p3_count ); + + $state = $tags->new_state(); + $state->match_depth = 2; + + $p2_count = 0; + while ( $tags->balanced_next( $state, 'p' ) ) { + $p2_count++; + } + + // Did we only visit the tags inside section > * > p? + $this->assertEquals( 1, $p2_count ); + } } From 89bb635a196dd342faf664230abae71ce9572744 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 23 Dec 2022 15:16:48 -0700 Subject: [PATCH 13/19] Getting closer --- .../html/class-wp-html-attribute-sourcer.php | 61 ++++++++++++++++++- .../html/class-wp-html-processor.php | 3 - lib/experimental/html/index.php | 2 + .../html/wp-html-attribute-sourcer-test.php | 60 +++++++++++++++++- phpunit/html/wp-html-processor-test.php | 17 +++++- phpunit/html/wp-html-tag-processor-test.php | 9 +++ 6 files changed, 143 insertions(+), 9 deletions(-) diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php index da4fb73689fa32..2373726022fdca 100644 --- a/lib/experimental/html/class-wp-html-attribute-sourcer.php +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -103,8 +103,8 @@ public function source_attributes() { } $tags = self::select( $sourcer['selector'], $this->html ); - if ( null === $tags ) { - $unparsed[] = $name; + if ( ! 
$tags ) { + $attributes[ $name ] = null; continue; } @@ -155,6 +155,63 @@ public static function select_match( $tags, $s ) { } public static function select( $selectors, $html ) { + $selector_index = 0; + + selector_choice: + $tags = new WP_HTML_Processor( $html ); + $outer_state = $tags->new_state(); + + $selector = $selectors[$selector_index]; + $next = $selectors[$selector_index]; + + loop: + while ( $tags->balanced_next( $outer_state ) ) { + if ( ! self::select_match( $tags, $selector ) ) { + continue; + } + + if ( ! isset( $next['then'] ) ) { + return $tags; + } + + $prev = $next; + $next = $next['then']; + + $inner_state = $tags->new_state(); + switch ( $next['combinator'] ) { + case '+': + $outer_state->match_depth = 1; + while ( $tags->balanced_next( $outer_state ) ) { + if ( self::select_match( $tags, $next ) ) { + return $tags; + } + } + break; + + case '>': + $inner_state->match_depth = 1; + case ' ': + while ( $tags->balanced_next( $inner_state ) ) { + if ( self::select_match( $tags, $next ) ) { + return $tags; + } + } + while ( $tags->balanced_next( $inner_state ) ) { + continue; + } + $next = $prev; + goto loop; + } + } + + if ( ++$selector_index < count( $selectors ) ) { + goto selector_choice; + } + + return false; + } + + public static function select_draft1( $selectors, $html ) { $tags = new WP_HTML_Processor( $html ); if ( ! $tags->next_tag() ) { return null; diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index 135b562898da2a..a36d77750466aa 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -81,7 +81,6 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul * navigating through non-normative documents. 
*/ if ( $last_tag !== $tag_name ) { - $this->next_tag(); return false; } break; @@ -104,7 +103,6 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul * Step 2. Bail if we've reached the end of the tag in which we started. */ if ( 0 === $depth ) { - $this->next_tag(); return false; } @@ -127,7 +125,6 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul } } - $this->next_tag(); return false; } diff --git a/lib/experimental/html/index.php b/lib/experimental/html/index.php index a31dbaf48c6b2a..299131052c32fe 100644 --- a/lib/experimental/html/index.php +++ b/lib/experimental/html/index.php @@ -10,3 +10,5 @@ require_once __DIR__ . '/class-wp-html-span.php'; require_once __DIR__ . '/class-wp-html-text-replacement.php'; require_once __DIR__ . '/class-wp-html-tag-processor.php'; +require_once __DIR__ . '/class-wp-html-processor.php'; +require_once __DIR__ . '/class-wp-html-attribute-sourcer.php'; diff --git a/phpunit/html/wp-html-attribute-sourcer-test.php b/phpunit/html/wp-html-attribute-sourcer-test.php index a402956a09588f..1af39bbacc7f13 100644 --- a/phpunit/html/wp-html-attribute-sourcer-test.php +++ b/phpunit/html/wp-html-attribute-sourcer-test.php @@ -6,7 +6,15 @@ * @subpackage HTML */ -require_once __DIR__ . '/../../lib/experimental/html/class-wp-html-attribute-sourcer.php'; +require_once __DIR__ . '/../../lib/experimental/html/index.php'; + +if ( ! function_exists( 'esc_attr' ) ) { + function esc_attr( $s ) { return htmlentities( $s, ENT_QUOTES, null, false ); } +} + +if ( ! 
class_exists( 'WP_UnitTestCase' ) ) { + class WP_UnitTestCase extends PHPUnit\Framework\TestCase {} +} /** * @group html @@ -14,6 +22,56 @@ * @coversDefaultClass WP_HTML_Attribute_Sourcer */ class WP_HTML_Attribute_Sourcer_Test extends WP_UnitTestCase { + /** + * @dataProvider data_single_combinators + */ + public function test_sources_single_combinators( $expected, $html, $attributes ) { + $this->assertSame( $expected, ( new WP_HTML_Attribute_Sourcer( $attributes, $html ) )->source_attributes() ); + } + + public function data_single_combinators() { + return array( + array( + array( 'attributes' => array( 'link' => 'docs.html' ), 'unparsed' => array() ), + '
', + array( + 'link' => array( + 'type' => 'string', + 'source' => 'attribute', + 'selector' => 'div a', + 'attribute' => 'href' + ), + ), + ), + + array( + array( 'attributes' => array( 'link' => 'docs.html' ), 'unparsed' => array() ), + '
', + array( + 'link' => array( + 'type' => 'string', + 'source' => 'attribute', + 'selector' => 'div > a', + 'attribute' => 'href' + ), + ), + ), + + array( + array( 'attributes' => array( 'link' => 'docs.html' ), 'unparsed' => array() ), + '
', + array( + 'link' => array( + 'type' => 'string', + 'source' => 'attribute', + 'selector' => 'div + a', + 'attribute' => 'href' + ), + ), + ), + ); + } + /** * @dataProvider data_sourced_attributes */ diff --git a/phpunit/html/wp-html-processor-test.php b/phpunit/html/wp-html-processor-test.php index d5b74b051042ba..ae9089d761f4f3 100644 --- a/phpunit/html/wp-html-processor-test.php +++ b/phpunit/html/wp-html-processor-test.php @@ -7,7 +7,15 @@ * @subpackage HTML */ -require_once __DIR__ . '/../../lib/experimental/html/class-wp-html-processor.php'; +require_once __DIR__ . '/../../lib/experimental/html/index.php'; + +if ( ! function_exists( 'esc_attr' ) ) { + function esc_attr( $s ) { return htmlentities( $s, ENT_QUOTES, null, false ); } +} + +if ( ! class_exists( 'WP_UnitTestCase' ) ) { + class WP_UnitTestCase extends PHPUnit\Framework\TestCase {} +} /** * @group html-processor @@ -142,7 +150,9 @@ public function test_can_navigate_with_unique_state_throughout_structure() { $this->assertEquals( 3, $li_count ); // Ensure that we ended up where we expected. - $this->assertEquals( 'P', $tags->get_tag() ); + $this->assertEquals( 'UL', $tags->get_tag() ); + $this->assertTrue( $tags->is_tag_closer() ); + $tags->next_tag(); $this->assertTrue( $tags->get_attribute( 'inner' ) ); // And now flush out the previous stack/frame @@ -151,7 +161,8 @@ public function test_can_navigate_with_unique_state_throughout_structure() { } // Ensure that we're back where we want to be after exiting two separate frames. 
- $this->assertEquals( 'SECTION', $tags->get_tag() ); + $this->assertEquals( 'P', $tags->get_tag() ); + $this->assertTrue( $tags->is_tag_closer() ); $tags->next_tag( 'p' ); $this->assertTrue( $tags->get_attribute( 'wanted' ) ); } diff --git a/phpunit/html/wp-html-tag-processor-test.php b/phpunit/html/wp-html-tag-processor-test.php index e6b3d4d8071af5..1d91d3a32f53c1 100644 --- a/phpunit/html/wp-html-tag-processor-test.php +++ b/phpunit/html/wp-html-tag-processor-test.php @@ -8,6 +8,15 @@ require_once __DIR__ . '/../../lib/experimental/html/index.php'; +if ( ! function_exists( 'esc_attr' ) ) { + function esc_attr( $s ) { return htmlentities( $s, ENT_QUOTES, null, false ); } +} + +if ( ! class_exists( 'WP_UnitTestCase' ) ) { + class WP_UnitTestCase extends PHPUnit\Framework\TestCase {} +} + + /** * @group html * From e5cfd30addcacdb6ba3f84ea4acd3cf1da653033 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 23 Dec 2022 16:48:48 -0600 Subject: [PATCH 14/19] more tests --- .../html/wp-html-attribute-sourcer-test.php | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/phpunit/html/wp-html-attribute-sourcer-test.php b/phpunit/html/wp-html-attribute-sourcer-test.php index 1af39bbacc7f13..da8012e460c404 100644 --- a/phpunit/html/wp-html-attribute-sourcer-test.php +++ b/phpunit/html/wp-html-attribute-sourcer-test.php @@ -72,6 +72,82 @@ public function data_single_combinators() { ); } + /** + * @dataProvider data_nested_combinators + */ + public function test_sources_nested_combinators( $expected, $html, $attributes ) { + $this->assertSame( $expected, ( new WP_HTML_Attribute_Sourcer( $attributes, $html ) )->source_attributes() ); + } + + public function data_nested_combinators() { + return array( + array( + array( 'attributes' => array( 'link' => 'docs.html' ), 'unparsed' => array() ), + '
', + array( + 'link' => array( + 'type' => 'string', + 'source' => 'attribute', + 'selector' => 'section div a', + 'attribute' => 'href' + ), + ), + ), + + array( + array( 'attributes' => array( 'link' => null ), 'unparsed' => array() ), + '
', + array( + 'link' => array( + 'type' => 'string', + 'source' => 'attribute', + 'selector' => 'section div a', + 'attribute' => 'href' + ), + ), + ), + + array( + array( 'attributes' => array( 'link' => 'docs.html' ), 'unparsed' => array() ), + '
', + array( + 'link' => array( + 'type' => 'string', + 'source' => 'attribute', + 'selector' => 'section > div > a', + 'attribute' => 'href' + ), + ), + ), + + array( + array( 'attributes' => array( 'link' => 'docs.html' ), 'unparsed' => array() ), + '
', + array( + 'link' => array( + 'type' => 'string', + 'source' => 'attribute', + 'selector' => 'section > div + a', + 'attribute' => 'href' + ), + ), + ), + ); + } + + /** + * @dataProvider data_skipping_non_matches + */ + public function test_sources_skipping_non_matches( $expected, $html, $attributes ) { + $this->assertSame( $expected, ( new WP_HTML_Attribute_Sourcer( $attributes, $html ) )->source_attributes() ); + } + + public function data_skipping_non_matches() { + return array( + + ); + } + /** * @dataProvider data_sourced_attributes */ From eb1367df9573b70a6ff57d3c795e204943a7bee4 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 23 Dec 2022 22:50:41 -0600 Subject: [PATCH 15/19] Closer still --- .../html/class-wp-html-attribute-sourcer.php | 40 +++++++++++++++---- .../html/class-wp-html-processor.php | 15 +++---- .../html/wp-html-attribute-sourcer-test.php | 13 ++++++ 3 files changed, 54 insertions(+), 14 deletions(-) diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php index 2373726022fdca..006d0e9781c37e 100644 --- a/lib/experimental/html/class-wp-html-attribute-sourcer.php +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -8,6 +8,10 @@ * - select_adjacent_sibling( $tags_at_start_tag, $selector_sequence ) * - select_general_sibling( $tags_at_start_tag, $selector_sequence ) * - close_n_levels( $tags_at_depth, $n_levels ) + * + * @TODO: + * - [ ] Handle multiple joined constraints for classes and attributes + * e.g. ".locale-en-US.localized[data-translation-id][data-translate]" */ /** @@ -174,31 +178,53 @@ public static function select( $selectors, $html ) { return $tags; } + inner_loop: $prev = $next; $next = $next['then']; $inner_state = $tags->new_state(); switch ( $next['combinator'] ) { + /* + * Adjacent sibling must be the immediately-following + * element which shares the same parent. 
+ */ case '+': - $outer_state->match_depth = 1; - while ( $tags->balanced_next( $outer_state ) ) { - if ( self::select_match( $tags, $next ) ) { + // Close out this tag if it needs to be. + while ( $tags->balanced_next( $inner_state ) ) { + continue; + } + + if ( $tags->balanced_next( $outer_state ) && self::select_match( $tags, $next ) ) { + if ( ! isset( $next['then'] ) ) { return $tags; } + goto inner_loop; } + + $next = $prev; break; + // Child combinator case '>': $inner_state->match_depth = 1; + // Intentional fallthrough + // Descendant combinator case ' ': + /* + * This match has to be a child of the matched tag, + * and the matched tag has to be its parent for the + * case of the child combinator. + */ while ( $tags->balanced_next( $inner_state ) ) { if ( self::select_match( $tags, $next ) ) { - return $tags; + if ( ! isset( $next['then'] ) ) { + return $tags; + } + + goto inner_loop; } } - while ( $tags->balanced_next( $inner_state ) ) { - continue; - } + $next = $prev; goto loop; } diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index a36d77750466aa..ced54246cf35e5 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -83,6 +83,14 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul if ( $last_tag !== $tag_name ) { return false; } + + /* + * Step 2. Bail if we've reached the end of the tag in which we started. + */ + if ( 0 === $state->relative_depth() ) { + return false; + } + break; } @@ -99,13 +107,6 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul ? $state->relative_depth() + 1 : $state->relative_depth(); - /* - * Step 2. Bail if we've reached the end of the tag in which we started. - */ - if ( 0 === $depth ) { - return false; - } - /* * Step 3. Determine if we have a matching tag. 
In addition to the query * we pass along to the underlying tag processor we're going to allow diff --git a/phpunit/html/wp-html-attribute-sourcer-test.php b/phpunit/html/wp-html-attribute-sourcer-test.php index da8012e460c404..db370385b9a7cc 100644 --- a/phpunit/html/wp-html-attribute-sourcer-test.php +++ b/phpunit/html/wp-html-attribute-sourcer-test.php @@ -69,6 +69,19 @@ public function data_single_combinators() { ), ), ), + + array( + array( 'attributes' => array( 'link' => null ), 'unparsed' => array() ), + '

', + array( + 'link' => array( + 'type' => 'string', + 'source' => 'attribute', + 'selector' => 'div + a', + 'attribute' => 'href' + ), + ), + ), ); } From 30172abb612d7616ad8f2b8ac534744a61c12fbc Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 23 Dec 2022 22:56:15 -0600 Subject: [PATCH 16/19] Passing tests! --- lib/experimental/html/class-wp-html-attribute-sourcer.php | 3 +-- phpunit/html/wp-html-attribute-sourcer-test.php | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php index 006d0e9781c37e..27a78aecffd77a 100644 --- a/lib/experimental/html/class-wp-html-attribute-sourcer.php +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -165,12 +165,11 @@ public static function select( $selectors, $html ) { $tags = new WP_HTML_Processor( $html ); $outer_state = $tags->new_state(); - $selector = $selectors[$selector_index]; $next = $selectors[$selector_index]; loop: while ( $tags->balanced_next( $outer_state ) ) { - if ( ! self::select_match( $tags, $selector ) ) { + if ( ! self::select_match( $tags, $next ) ) { continue; } diff --git a/phpunit/html/wp-html-attribute-sourcer-test.php b/phpunit/html/wp-html-attribute-sourcer-test.php index db370385b9a7cc..fda53531baaaaf 100644 --- a/phpunit/html/wp-html-attribute-sourcer-test.php +++ b/phpunit/html/wp-html-attribute-sourcer-test.php @@ -171,7 +171,7 @@ public function test_sources_attributes( $expected, $html, $attributes ) { public function data_sourced_attributes() { return array( array( - array( 'attributes' => array( 'link' => 'docs.html' ), 'unparsed' => array() ), + array( 'attributes' => array( 'link' => 'image' ), 'unparsed' => array() ), <<
Just another section
From f415bdba0a34c8fb5444f7ba6a093ff45fe81b5a Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 23 Dec 2022 23:01:11 -0600 Subject: [PATCH 17/19] Remove first draft select function --- .../html/class-wp-html-attribute-sourcer.php | 121 ------------------ 1 file changed, 121 deletions(-) diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php index 27a78aecffd77a..ea178cbf2eafd8 100644 --- a/lib/experimental/html/class-wp-html-attribute-sourcer.php +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -236,127 +236,6 @@ public static function select( $selectors, $html ) { return false; } - public static function select_draft1( $selectors, $html ) { - $tags = new WP_HTML_Processor( $html ); - if ( ! $tags->next_tag() ) { - return null; - } - - $tags->set_bookmark( 'start' ); - - foreach ( $selectors as $s ) { - $tags->seek( 'start' ); - $max = 100; - while ( --$max > 0 ) { - $next = $s; - - // This label is probably where some stack-level data should reside. - next: - // Find the next starting point - while ( null === self::select_match( $tags, $next ) && $tags->next_tag() ) { - continue; - } - - // We're out of possible starting points - if ( null === self::select_match( $tags, $next ) ) { - continue 2; - } - - // No further selectors, then bingo! - if ( ! isset( $next['then'] ) ) { - return $tags; - } - - $next = $next['then']; - - // Adjacent sibling must be the immediately-following element. - if ( '+' === $next['combinator'] ) { - var_dump( [ - 'msg' => "Processing adjacent sibling", - 'html' => $html, - 'tag' => $tags->get_tag(), - 'selector' => $next - ] ); - $state = $tags->new_state(); - while ( $tags->balanced_next( $state ) ) { - continue; - } - - $tags->next_tag(); - if ( null === self::select_match( $tags, $next ) ) { - continue; - } - - if ( isset( $next['then'] ) ) { - goto next; - } - - // @TODO: Recurse here so we can handle more than one level. 
- return $tags; - } - - // Child must be one level into current tag. - if ( '>' === $next['combinator'] ) { - var_dump( [ - 'msg' => "Processing child", - 'html' => $html, - 'tag' => $tags->get_tag(), - 'selector' => $next - ] ); - $state = $tags->new_state(); - $state->match_depth = 1; - while ( $tags->balanced_next( $state ) ) { - if ( null === self::select_match( $tags, $next ) ) { - continue; - } - - if ( isset( $next['then'] ) ) { - goto next; - } - - // @TODO: Recurse here so we can handle more than one level. - return $tags; - } - - continue; - } - - // Descendant can be anywhere inside current tag. - if ( ' ' === $next['combinator'] ) { - var_dump( [ - 'msg' => "Processing descendant", - 'html' => $html, - 'tag' => $tags->get_tag(), - 'selector' => $next - ] ); - $state = $tags->new_state(); - while ( $tags->balanced_next( $state ) ) { - if ( null === self::select_match( $tags, $next ) ) { - continue; - } - - if ( isset( $next['then'] ) ) { - goto next; - } - - // @TODO: Recurse here so we can handle more than one level. - return $tags; - } - - continue; - } - - // General sibling must be anything at current level. - if ( '~' === $next['combinator'] ) { - // @TODO: Support this. 
- return null; - } - } - } - - return null; - } - public static function parse_definition( $definition ) { if ( empty( $definition['source'] ) ) { return 'not-sourced'; From 6103139e2119d6995ee26bce5f4ad2047707b6ac Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 23 Dec 2022 23:52:28 -0600 Subject: [PATCH 18/19] More test cases, a couple new failures --- .../html/class-wp-html-attribute-sourcer.php | 7 ++- .../html/class-wp-html-processor.php | 5 ++ .../html/wp-html-attribute-sourcer-test.php | 50 +++++++++++++++++++ 3 files changed, 61 insertions(+), 1 deletion(-) diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php index ea178cbf2eafd8..c2be48ee429b6d 100644 --- a/lib/experimental/html/class-wp-html-attribute-sourcer.php +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -12,6 +12,7 @@ * @TODO: * - [ ] Handle multiple joined constraints for classes and attributes * e.g. ".locale-en-US.localized[data-translation-id][data-translate]" + * - [ ] Handle comma-separated selector sequences; apparently we only grab the first right now */ /** @@ -158,6 +159,10 @@ public static function select_match( $tags, $s ) { return $tags; } + /** + * @TODO: This needs to be able to continue to the next match + * Pass in $tags? Pass in a bookmark? 
+ */ public static function select( $selectors, $html ) { $selector_index = 0; @@ -165,7 +170,7 @@ public static function select( $selectors, $html ) { $tags = new WP_HTML_Processor( $html ); $outer_state = $tags->new_state(); - $next = $selectors[$selector_index]; + $next = $selectors[ $selector_index ]; loop: while ( $tags->balanced_next( $outer_state ) ) { diff --git a/lib/experimental/html/class-wp-html-processor.php b/lib/experimental/html/class-wp-html-processor.php index ced54246cf35e5..6fa236aa94a336 100644 --- a/lib/experimental/html/class-wp-html-processor.php +++ b/lib/experimental/html/class-wp-html-processor.php @@ -4,6 +4,7 @@ * @TODO: Handle self-closing foreign elements. * @TODO: Detect non-normative HTML input. * @TODO: Consider parsing non-normative HTML input, support adoption agency algorithm. + * @TODO: Figure out how multiple external states can conflict. * * If we support non-normative HTML we can probably handle significantly more * HTML without introducing unexpected results, but I'm not sure yet if we can @@ -65,6 +66,10 @@ public function balanced_next( WP_HTML_Processor_Scan_State $state, $query = nul * need to separately track those, but their behavior matches * this case. The self-closing flag is ignored for HTML5 tags. */ + if ( 0 === $state->relative_depth() ) { + return false; + } + break; case 'opener': diff --git a/phpunit/html/wp-html-attribute-sourcer-test.php b/phpunit/html/wp-html-attribute-sourcer-test.php index fda53531baaaaf..38a274008a9de3 100644 --- a/phpunit/html/wp-html-attribute-sourcer-test.php +++ b/phpunit/html/wp-html-attribute-sourcer-test.php @@ -22,6 +22,56 @@ class WP_UnitTestCase extends PHPUnit\Framework\TestCase {} * @coversDefaultClass WP_HTML_Attribute_Sourcer */ class WP_HTML_Attribute_Sourcer_Test extends WP_UnitTestCase { + /** + * @dataProvider data_ids_and_their_selectors + */ + public function test_selects_proper_html_from_selector( $wanted_ids, $selector ) { + $html = << +
+ +
+ +
+

The antics of ants with antlers

+

+ + Ants + with antlers can be funny. +

+
    +
  • Decorations
  • +
  • Cleanup crew
  • +
  • Spooky visitors
  • +
+
+ +HTML; + + list( $selectors ) = WP_HTML_Attribute_Sourcer::parse_selector( $selector ); + $this->assertIsArray( $selectors ); + + $found_ids = array(); + if ( $tags = WP_HTML_Attribute_Sourcer::select( [ $selectors ], $html ) ) { + $found_ids[] = $tags->get_attribute( 'id' ); + } + + $this->assertEqualsCanonicalizing( $wanted_ids, $found_ids ); + } + + public function data_ids_and_their_selectors() { + return array( + array( array( 'li-1' ), 'li' ), + array( array(), 'section > p img + em' ), + array( array( 'strong-ants' ), 'section > p img + strong' ), + array( array( 'funny-stuff' ), 'section > p strong + em' ), + array( array( 'page-title' ), 'section h2' ), + array( array( 'post-title' ), 'section > h2' ), + array( array( 'ant-logo' ), '[href]' ), + array( array(), '.non-existent' ), + ); + } + /** * @dataProvider data_single_combinators */ From 79c24bdcba3d031ab488ab1836e7b9926dc63f26 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 9 Jan 2023 13:03:43 -0800 Subject: [PATCH 19/19] Only traverse children of non-closing or non-void elements. --- .../html/class-wp-html-attribute-sourcer.php | 13 +++++++++--- .../html/class-wp-html-tag-processor.php | 21 +++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php index c2be48ee429b6d..c66ffbf4550c88 100644 --- a/lib/experimental/html/class-wp-html-attribute-sourcer.php +++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php @@ -193,9 +193,16 @@ public static function select( $selectors, $html ) { * element which shares the same parent. */ case '+': - // Close out this tag if it needs to be. - while ( $tags->balanced_next( $inner_state ) ) { - continue; + /* + * If we have opened a tag we need to continue scanning past all of its children. 
+ * `balanced_next()` will end up on the closing tag, so if we don't have any + * children, or no closing tag, we need to skip this because `balanced_next()` + * would end up in those cases on the sibling element. + */ + if ( ! WP_HTML_Processor::is_html_void_element( $tags->get_tag() ) ) { + while ( $tags->balanced_next( $inner_state ) ) { + continue; + } } if ( $tags->balanced_next( $outer_state ) && self::select_match( $tags, $next ) ) { diff --git a/lib/experimental/html/class-wp-html-tag-processor.php b/lib/experimental/html/class-wp-html-tag-processor.php index a0565c3d452864..72c342dbd02a73 100644 --- a/lib/experimental/html/class-wp-html-tag-processor.php +++ b/lib/experimental/html/class-wp-html-tag-processor.php @@ -20,6 +20,7 @@ * @TODO: Add slow mode to escape character entities in CSS class names? * (This requires a custom decoder since `html_entity_decode()` * doesn't handle attribute character reference decoding rules. + * @TODO: Do we make any indexing assumptions based on only scanning tag openers? $tag_name - 1 vs. ? * * @package WordPress * @subpackage HTML @@ -1439,6 +1440,26 @@ public function get_tag() { return strtoupper( $tag_name ); } + /** + * Returns a representation of the currently-open tag, for debug purposes. + * + * @since 6.3.0 + * @return string + */ + public function debug_current_token() { + if ( null === $this->tag_name_starts_at ) { + return ''; + } + + if ( $this->is_tag_closer() ) { + $tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length ); + return ""; + } + + $tag_starts_at = $this->tag_name_starts_at - 1; + return substr( $this->html, $tag_starts_at, $this->tag_ends_at - $tag_starts_at + 1 ); + } + /** * Indicates if the current tag token is a tag closer. *