forked from beezwax/WP-Publish-to-Apple-News
-
Notifications
You must be signed in to change notification settings - Fork 70
/
Copy pathclass-parser.php
247 lines (216 loc) · 6.58 KB
/
class-parser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
<?php
/**
* Publish to Apple News Includes: Apple_Exporter\Parser class
*
* Contains a class which is used to parse raw HTML into an Apple News format.
*
* @package Apple_News
* @subpackage Apple_Exporter
* @since 1.2.1
*/
namespace Apple_Exporter;
use DOMDocument;
use DOMNodeList;
require_once __DIR__ . '/class-html.php';
require_once __DIR__ . '/class-markdown.php';
/**
* A class that parses raw HTML into either Apple News HTML or Markdown format.
*
* @since 1.2.1
*/
class Parser {

	/**
	 * The format to use. Valid values are 'html' and 'markdown'.
	 *
	 * @access public
	 * @var string
	 */
	public string $format;

	/**
	 * Initializes the object with the format setting.
	 *
	 * Any value other than the exact string 'html' selects markdown.
	 *
	 * @param string $format The format to use. Defaults to markdown.
	 *
	 * @access public
	 */
	public function __construct( $format = 'markdown' ) {
		$this->format = ( 'html' === $format ) ? 'html' : 'markdown';
	}

	/**
	 * Transforms raw HTML into Apple News format.
	 *
	 * The HTML is cleaned first (see clean_html()) and then handed to the
	 * HTML or Markdown formatter depending on $this->format.
	 *
	 * @param string $html The raw HTML to parse.
	 *
	 * @access public
	 * @return string The filtered content in the format specified.
	 */
	public function parse( $html ): string {
		// Don't parse empty input. empty() also reports true for the string
		// '0', which is legitimate content, so that value is let through.
		if ( empty( $html ) && '0' !== $html ) {
			return '';
		}

		/**
		 * Clean up any issues prior to formatting.
		 * This needs to be done here to avoid duplicating efforts
		 * in the HTML and Markdown classes.
		 */
		$html = $this->clean_html( $html );

		// Fork for format.
		if ( 'html' === $this->format ) {
			return $this->parse_html( $html );
		}

		return $this->parse_markdown( $html );
	}

	/**
	 * A function to format the given HTML as Apple News HTML.
	 *
	 * @param string $html The raw HTML to parse.
	 *
	 * @access private
	 * @return string The content, converted to an Apple News HTML string.
	 */
	private function parse_html( string $html ): string {
		/**
		 * Allows for filtering of the formatted content before return.
		 *
		 * @since 1.2.1
		 *
		 * @param string $content The content to filter.
		 * @param string $html    The original HTML, before filtering was applied.
		 */
		return apply_filters( 'apple_news_parse_html', ( new HTML() )->format( $html ), $html );
	}

	/**
	 * A function to convert the given HTML into Apple News Markdown.
	 *
	 * @param string $html The raw HTML to parse.
	 *
	 * @access private
	 * @return string The content, converted to an Apple News Markdown string.
	 *                An empty string is returned when the parsed document has
	 *                no body element to convert.
	 */
	private function parse_markdown( string $html ): string {
		// PHP's DOMDocument doesn't like HTML5, so we must ignore errors.
		// Remember the caller's setting so it can be restored afterwards
		// instead of being forced off.
		$previous_setting = libxml_use_internal_errors( true );

		// Load the content, forcing the use of UTF-8.
		$dom = new DOMDocument();
		$dom->loadHTML( '<?xml encoding="UTF-8">' . $html );

		// Reset error state.
		libxml_clear_errors();
		libxml_use_internal_errors( $previous_setting );

		// Find the first-level nodes of the body tag. Input that produces no
		// body element has nothing to convert.
		$body = $dom->getElementsByTagName( 'body' )->item( 0 );
		if ( null === $body ) {
			return '';
		}
		$nodes = $body->childNodes;

		// Perform parsing.
		$parser  = new Markdown();
		$content = $parser->parse_nodes( $nodes );

		/**
		 * Allows for filtering of the formatted content before return.
		 *
		 * @since 1.2.1
		 *
		 * @param string      $content The content to filter.
		 * @param DOMNodeList $nodes   The list of DOMElement nodes used initially.
		 */
		return apply_filters( 'apple_news_parse_markdown', $content, $nodes );
	}

	/**
	 * Handles cleaning up any HTML issues prior to parsing that could affect
	 * both HTML and Markdown format.
	 *
	 * The steps run in a fixed order: anchors without usable content or href
	 * are dropped, root-relative hrefs are made absolute, anchors with
	 * unsupported protocols are unwrapped, and non-breaking spaces are
	 * converted. The result is trimmed.
	 *
	 * @param string $html The HTML to be cleaned.
	 *
	 * @access private
	 * @return string The clean HTML
	 */
	private function clean_html( string $html ): string {
		$html = $this->remove_empty_a_tags( $html );
		$html = $this->handle_root_relative_urls( $html );
		$html = $this->validate_protocols( $html );
		$html = $this->convert_spaces( $html );

		// Return the clean HTML.
		return trim( $html );
	}

	/**
	 * Remove empty <a> tags from the given HTML content.
	 *
	 * - An anchor whose content trims to an empty string is removed entirely.
	 * - An anchor without a non-empty quoted href is replaced by its trimmed content.
	 * - An anchor whose href is only whitespace is replaced by its content.
	 *
	 * @param string $html The HTML content to remove empty <a> tags from.
	 *
	 * @return string The modified HTML content without empty <a> tags.
	 */
	private function remove_empty_a_tags( string $html ): string {
		// Match all <a> tags via regex.
		// We can't use DOMDocument here because some tags will be removed entirely.
		// The tag name must be followed by whitespace or '>' so that other
		// elements starting with "a" (<abbr>, <aside>, ...) are not mistaken
		// for anchors.
		if ( ! preg_match_all( '/<a(?:\s[^>]*)?>(.*?)<\/a>/m', $html, $a_tags ) ) {
			return $html;
		}

		// Iterate over the matches and see what we need to do.
		foreach ( $a_tags[0] as $i => $a_tag ) {
			// If the <a> tag doesn't have content, dump it. A strict comparison
			// is used because empty() would also discard link text of "0".
			$content = trim( $a_tags[1][ $i ] );
			if ( '' === $content ) {
				$html = str_replace( $a_tag, '', $html );
				continue;
			}

			// If there isn't an href that has content, strip the anchor tag.
			// Both double- and single-quoted attribute values are accepted.
			$href_pattern = '/<a\s(?:[^>]*\s)?href=(?:"([^"]+)"|\'([^\']+)\')[^>]*>.*?<\/a>/m';
			if ( ! preg_match( $href_pattern, $a_tag, $matches ) ) {
				$html = str_replace( $a_tag, $content, $html );
				continue;
			}

			// Only one of the two capture groups is populated, depending on
			// which quote style matched.
			$href = trim( ( $matches[1] ?? '' ) . ( $matches[2] ?? '' ) );

			// If the href value trims to nil, strip the anchor tag.
			if ( '' === $href ) {
				$html = str_replace( $a_tag, $a_tags[1][ $i ], $html );
			}
		}

		return $html;
	}

	/**
	 * Handle root-relative URLs in the HTML content.
	 * Replace the root-relative URLs with the absolute
	 * URLs using the site URL.
	 *
	 * Protocol-relative URLs (starting with "//") are left untouched. Both
	 * double- and single-quoted href values are rewritten.
	 *
	 * @param string $html The HTML content to handle root-relative URLs.
	 *
	 * @return string The modified HTML content with absolute URLs for root-relative ones.
	 */
	private function handle_root_relative_urls( string $html ): string {
		$result = preg_replace_callback(
			// 1: tag up to and including 'href=', 2: quote character,
			// 3: the root-relative path (a single leading slash).
			'/(<a\s(?:[^>]*\s)?href=)(["\'])(\/(?!\/).*?)\2/m',
			// The home URL is only looked up when a match is actually found.
			static fn( array $matches ): string => $matches[1] . $matches[2] . get_home_url() . $matches[3] . $matches[2],
			$html
		);

		// preg_replace_callback() returns null on a PCRE failure; keep the input in that case.
		return $result ?? $html;
	}

	/**
	 * Ensure that the resulting URL uses a supported protocol.
	 * Leave it up to the content creator to ensure the URL is
	 * otherwise valid.
	 *
	 * Anchors whose href does not start with one of the supported prefixes
	 * are replaced by their inner content. Scheme matching is
	 * case-insensitive, and both quote styles are recognized.
	 *
	 * @param string $html The HTML content to validate.
	 *
	 * @return string The modified HTML content with validated protocols.
	 */
	private function validate_protocols( string $html ): string {
		$result = preg_replace_callback(
			// 1: double-quoted href, 2: single-quoted href, 3: anchor content.
			'/<a\s(?:[^>]*\s)?href=(?:"([^"]*)"|\'([^\']*)\')[^>]*>(.*?)<\/a>/m',
			static function ( array $matches ): string {
				// Only one of the two href groups is non-empty.
				$href = $matches[1] . $matches[2];

				if ( preg_match( '/^(https?:\/\/|mailto:|musics?:\/\/|stocks:\/\/|webcal:\/\/|#)/i', $href ) ) {
					return $matches[0]; // Return whole anchor tag if protocol is fine.
				}

				// Unsupported protocol: keep the content, drop the link.
				return $matches[3];
			},
			$html
		);

		// preg_replace_callback() returns null on a PCRE failure; keep the input in that case.
		return $result ?? $html;
	}

	/**
	 * Convert non-breaking spaces to regular spaces.
	 *
	 * Handles the named, decimal and hexadecimal entity forms
	 * (case-insensitively) as well as the literal U+00A0 character.
	 *
	 * @param string $html The HTML content to convert.
	 *
	 * @return string The modified HTML content with converted spaces.
	 */
	private function convert_spaces( string $html ): string {
		$html = str_ireplace( [ '&nbsp;', '&#160;', '&#xA0;' ], ' ', $html );

		// The literal character is replaced case-sensitively so that no
		// byte-level case folding is ever applied to a multibyte sequence.
		return str_replace( "\u{00A0}", ' ', $html );
	}
}