Skip to content

Commit

Permalink
Merge pull request #221 from j0k3r/fix/meta-refresh-url
Browse files Browse the repository at this point in the history
Handle meta refresh when attributes are reversed
  • Loading branch information
j0k3r authored Jan 23, 2020
2 parents 1630960 + 4889175 commit 6b3e53b
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
5 changes: 4 additions & 1 deletion src/Extractor/HttpClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,10 @@ private function getMetaRefreshURL($url, $html)

// <meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513">
if (!preg_match('!<meta http-equiv=["\']?refresh["\']? content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']?!i', $html, $match)) {
- return false;
+ // let's try in a reverse mode (switch content & http-equiv attributes)
+ if (!preg_match('!<meta content=["\']?[0-9];\s*url=["\']?([^"\'>]+)["\']? http-equiv=["\']?refresh["\']?!i', $html, $match)) {
+ return false;
+ }
}

$redirectUrl = str_replace('&amp;', '&', trim($match[1]));
Expand Down
5 changes: 5 additions & 0 deletions tests/Extractor/HttpClientTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,11 @@ public function dataForMetaRefresh()
'<html><meta HTTP-EQUIV="REFRESH" content="0; url=http://www.bernama.com/bernama/v6/newsindex.php?id=943513"></html>',
'http://www.bernama.com/bernama/v6/newsindex.php?id=943513',
],
+ [
+ 'https://www.google.com/url?sa=t&source=web&rct=j&url=https://databox.com/google-my-business-seo',
+ '<html><meta content="0;url=https://databox.com/google-my-business-seo" http-equiv="refresh"></html>',
+ 'https://databox.com/google-my-business-seo',
+ ],
[
'http://www.example.com/wiki/Copyright',
'<html><meta HTTP-EQUIV="REFRESH" content="0; url=/bernama/v6/newsindex.php?id=943513"></html>',
Expand Down

0 comments on commit 6b3e53b

Please sign in to comment.