diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 79d96dc8be3f1..e6ac98daae6ab 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -378,6 +378,15 @@ class WP_HTML_Tag_Processor { */ private $is_closing_tag; + /** + * Stores the position of the last-matched tag, or the start of the document if not matched yet. + * + * @var WP_HTML_Span + */ + private $last_position = null; + + private $last_token_end = 0; + /** * Lazily-built index of attributes found within an HTML tag, keyed by the attribute name. * @@ -507,6 +516,8 @@ class WP_HTML_Tag_Processor { */ public function __construct( $html ) { $this->html = $html; + + $this->last_position = new WP_HTML_Span( 0, 0 ); } /** @@ -530,6 +541,16 @@ public function next_tag( $query = null ) { $this->parse_query( $query ); $already_found = 0; + if ( null !== $this->tag_name_starts_at ) { + $rewind_amount = $this->is_closing_tag ? 2 : 1; + $before_tag = $this->tag_name_starts_at - $rewind_amount; + $end_of_tag = $this->tag_ends_at; + + $this->last_position->start = $before_tag; + $this->last_position->end = $end_of_tag; + $this->last_token_end = $this->tag_ends_at + 1; + } + do { if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { return false; @@ -1876,6 +1897,74 @@ public function is_tag_closer() { return $this->is_closing_tag; } + /** + * Returns the chunk of text from the end of the preceding tag or token to the + * start of the matched tag or token, with decoded character references. + * + * Example: + * + * $q = array( 'tag_closers' => 'visit' ); + * $processor = new WP_HTML_Tag_Processor( 'Before
Inside
After' ); + * $processor->next_tag( $q ); 'Before' === $processor->get_previous_text_chunk(); + * $processor->next_tag( $q ); 'Inside' === $processor->get_previous_text_chunk(); + * $processor->next_tag( $q ); 'After' === $processor->get_previous_text_chunk(); + * + * @since 6.4.0 + * + * @return string|null Decoded text between the previous token and the current one (or the end of the document once it has been reached), with `<`-prefixed non-tag spans removed, or NULL if not yet matched. + */ + public function get_previous_text_chunk() { + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $chunk = substr( $this->html, $this->last_position->end === 0 ? 0 : $this->last_position->end + 1 ); + $chunk = preg_replace( '/<[^a-z].*>/i', '', $chunk ); + return html_entity_decode( $chunk, ENT_HTML5 | ENT_QUOTES | ENT_SUBSTITUTE ); + } + + if ( ! $this->tag_name_starts_at ) { + return null; + } + + $chunk_start = $this->last_position->end === 0 ? 0 : $this->last_position->end + 1; + $chunk_end = $this->is_tag_closer() ? $this->tag_name_starts_at - 2 : $this->tag_name_starts_at - 1; + $chunk = substr( $this->html, $chunk_start, $chunk_end - $chunk_start ); + $chunk = preg_replace( '/<[^a-z].*>/i', '', $chunk ); + return html_entity_decode( $chunk, ENT_HTML5 | ENT_QUOTES | ENT_SUBSTITUTE ); + } + + /** + * Returns the raw HTML of the preceding tag together with the raw source between it and the + * start of the matched tag or token, without decoded character references. + * + * Example: + * + * $q = array( 'tag_closers' => 'visit' ); + * $processor = new WP_HTML_Tag_Processor( 'Before
Inside
After' ); + * $processor->next_tag( $q ); 'Before' === implode( '', $processor->get_previous_html_chunk() ); + * $processor->next_tag( $q ); '
Inside' === implode( '', $processor->get_previous_html_chunk() ); + * $processor->next_tag( $q ); '
After' === implode( '', $processor->get_previous_html_chunk() ); + * + * @since 6.4.0 + * + * @return array|null Two-element array: the preceding tag's HTML (empty before the first match) and the undecoded source following it up to the current token or document end, or NULL if not yet matched. + */ + public function get_previous_html_chunk() { + if ( $this->bytes_already_parsed >= strlen( $this->html ) ) { + $html = substr( $this->html, $this->last_position->start, $this->last_token_end - $this->last_position->start ); + $text = substr( $this->html, $this->last_token_end ); + + return array( $html, $text ); + } + + if ( ! $this->tag_name_starts_at ) { + return null; + } + + $html = substr( $this->html, $this->last_position->start, $this->last_token_end - $this->last_position->start ); + $text = substr( $this->html, $this->last_token_end, ( $this->is_tag_closer() ? $this->tag_name_starts_at - 2 : $this->tag_name_starts_at - 1 ) - $this->last_token_end ); + + return array( $html, $text ); + } + /** * Updates or creates a new attribute on the currently matched tag with the passed value. * diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-stringBuilder.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-stringBuilder.php new file mode 100644 index 0000000000000..4511e093a60ad --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-stringBuilder.php @@ -0,0 +1,114 @@ +next_tag( array( 'tag_closers' => 'visit' ) ) ) { + $extracted_text_content .= $processor->get_previous_text_chunk(); + } + $extracted_text_content .= $processor->get_previous_text_chunk(); + + $this->assertEquals( $text_content, $extracted_text_content, 'Extracted unexpected text content.' ); + } + + /** + * Data provider. + * + * @return array[]. + */ + public function data_html_and_associated_text_content() { + return array( + 'Basic text without HTML.' => array( 'This is plain text.', 'This is plain text.' ), + 'Basic text with a character reference.' => array( 'A < B', 'A < B' ), + 'Text before tag.' => array( 'Before', 'Before' ), + 'Text after tag.' => array( 'After', 'After' ), + 'Text inside tag.' 
=> array( '
Inside
', 'Inside' ), + 'Text around tag.' => array( 'In the jungle.', 'In the jungle.' ), + 'Text interrupted by many tags.' => array( 'A wild adventure awaits.', 'A wild adventure awaits.' ), + 'Text with comment inside it.' => array( 'Ignore comment.', 'Ignore comment.' ), + 'Text with empty comment inside it.' => array( 'Ignore comment.', 'Ignore comment.' ), + 'Text with invalid comment inside it.' => array( 'Ignore comment.', 'Ignore comment.' ), + 'Skipping SCRIPT content.' => array( '
This in the output.', 'This in the output.' ), + ); + } + + /** + * @ticket {TICKET_NUMBER} + * + * @dataProvider data_html_and_associated_html_content + * + * @param string $html HTML containing text that should be extracted. + * @param int $max_code_points Stop iterating after this many code points have been extracted. + * @param string $html_content Full HTML containing text of max code point length from input. + */ + public function test_extracts_html_chunks_properly( $html, $max_code_points, $html_content ) { + $processor = new WP_HTML_Tag_Processor( $html ); + + $code_points = 0; + $extracted_html_content = ''; + while ( $processor->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + $text_chunk = $processor->get_previous_text_chunk(); + $chunk_cps = mb_strlen( $text_chunk ); + list( $html, $text ) = $processor->get_previous_html_chunk(); + $extracted_html_content .= $html; + if ( 0 === $max_code_points || $code_points + $chunk_cps <= $max_code_points ) { + $extracted_html_content .= $text; + $code_points += $chunk_cps; + } else { + break; + } + } + + $text_chunk = $processor->get_previous_text_chunk(); + $chunk_cps = mb_strlen( $text_chunk ); + list( $html, $text ) = $processor->get_previous_html_chunk(); + $extracted_html_content .= $html; + if ( 0 === $max_code_points || $code_points + $chunk_cps <= $max_code_points ) { + $extracted_html_content .= $text; + } + + $this->assertEquals( $html_content, $extracted_html_content, 'Extracted unexpected HTML content.' ); + } + + /** + * Data provider. + * + * @return array[]. + */ + public function data_html_and_associated_html_content() { + return array( + 'Basic text without HTML.' => array( 'This is plain text.', 0, 'This is plain text.' ), + 'Basic text without HTML (too long).' => array( 'This is plain text.', 8, '' ), + 'Basic text with a character reference.' => array( 'A < B', 0, 'A < B' ), + 'Character reference wider than text' => array( 'A < B', 5, 'A < B' ), + 'Text before tag.' 
=> array( 'Before', 0, 'Before' ), + 'Text after tag.' => array( 'After', 0, 'After' ), + 'Text inside tag.' => array( '
Inside
', 0, '
Inside
' ), + 'Text around tag.' => array( 'In the jungle.', 0, 'In the jungle.' ), + 'Text interrupted by many tags.' => array( 'A wild adventure awaits.', 0, 'A wild adventure awaits.' ), + 'Text interrupted by many tags (long).' => array( 'A wild adventure awaits.', 16, 'A wild adventure' ), + 'Text with comment inside it.' => array( 'Ignore comment.', 0, 'Ignore comment.' ), + ); + } +}