Skip to content

Commit

Permalink
Merge pull request #1540 from ArthurHoaro/fix/metadata-regexes
Browse files Browse the repository at this point in the history
Improve regex to extract HTML metadata (title, description, etc.)
  • Loading branch information
ArthurHoaro authored Oct 13, 2020
2 parents 543b16b + 2cd0509 commit 458b6b9
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 2 deletions.
6 changes: 4 additions & 2 deletions application/bookmark/LinkUtils.php
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,13 @@ function html_extract_tag($tag, $html)
{
$propertiesKey = ['property', 'name', 'itemprop'];
$properties = implode('|', $propertiesKey);
// We need a OR here to accept either 'property=og:noquote' or 'property="og:unrelated og:my-tag"'
$orCondition = '["\']?(?:og:)?'. $tag .'["\']?|["\'][^\'"]*?(?:og:)?' . $tag . '[^\'"]*?[\'"]';
// Try to retrieve OpenGraph image.
$ogRegex = '#<meta[^>]+(?:'. $properties .')=["\']?(?:og:)?'. $tag .'["\'\s][^>]*content=["\']?(.*?)["\'/>]#';
$ogRegex = '#<meta[^>]+(?:'. $properties .')=(?:'. $orCondition .')[^>]*content=["\'](.*?)["\'].*?>#';
// If the attributes are not in the order property => content (e.g. Github)
// New regex to keep this readable... more or less.
$ogRegexReverse = '#<meta[^>]+content=["\']([^"\']+)[^>]+(?:'. $properties .')=["\']?(?:og)?:'. $tag .'["\'\s/>]#';
$ogRegexReverse = '#<meta[^>]+content=["\'](.*?)["\'][^>]+(?:'. $properties .')=(?:'. $orCondition .').*?>#';

if (preg_match($ogRegex, $html, $matches) > 0
|| preg_match($ogRegexReverse, $html, $matches) > 0
Expand Down
89 changes: 89 additions & 0 deletions tests/bookmark/LinkUtilsTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,78 @@ public function testHtmlExtractNonExistentCharset()
public function testHtmlExtractExistentNameTag()
{
    // Value that every fixture below carries in its content attribute and
    // that html_extract_tag('description', ...) is expected to return.
    $expected = 'Bob and Alice share cookies.';

    // Each entry is one <meta> layout that must yield the description.
    $fixtures = [
        // Simple one line
        '<html><meta>stuff2</meta><meta name="description" content="' . $expected . '"/></html>',

        // Simple OpenGraph
        '<meta property="og:description" content="' . $expected . '">',

        // Simple reversed OpenGraph
        '<meta content="' . $expected . '" property="og:description">',

        // ItemProp OpenGraph
        '<meta itemprop="og:description" content="' . $expected . '">',

        // OpenGraph without quotes
        '<meta property=og:description content="' . $expected . '">',

        // OpenGraph reversed without quotes
        '<meta content="' . $expected . '" property=og:description>',

        // OpenGraph with noise
        '<meta tag1="content1" property="og:description" tag2="content2" content="' .
            $expected . '" tag3="content3">',

        // OpenGraph reversed with noise
        '<meta tag1="content1" content="' . $expected . '" ' .
            'tag3="content3" tag2="content2" property="og:description">',

        // OpenGraph multiple properties start
        '<meta property="unrelated og:description" content="' . $expected . '">',

        // OpenGraph multiple properties end
        '<meta property="og:description unrelated" content="' . $expected . '">',

        // OpenGraph multiple properties both end
        '<meta property="og:unrelated1 og:description og:unrelated2" content="' . $expected . '">',

        // OpenGraph multiple properties both end with noise
        '<meta tag1="content1" property="og:unrelated1 og:description og:unrelated2" ' .
            'tag2="content2" content="' . $expected . '" tag3="content3">',

        // OpenGraph reversed multiple properties start
        '<meta content="' . $expected . '" property="unrelated og:description">',

        // OpenGraph reversed multiple properties end
        '<meta content="' . $expected . '" property="og:description unrelated">',

        // OpenGraph reversed multiple properties both end
        '<meta content="' . $expected . '" property="og:unrelated1 og:description og:unrelated2">',

        // OpenGraph reversed multiple properties both end with noise
        '<meta tag1="content1" content="' . $expected . '" tag2="content2" ' .
            'property="og:unrelated1 og:description og:unrelated2" tag3="content3">',

        // Suggestion from #1375
        '<meta property="og:description" name="description" content="' . $expected . '">',
    ];

    // Same order and same assertion as the original one-by-one checks.
    foreach ($fixtures as $markup) {
        $this->assertEquals($expected, html_extract_tag('description', $markup));
    }
}

/**
Expand All @@ -105,6 +175,25 @@ public function testHtmlExtractNonExistentNameTag()
{
$html = '<html><meta>stuff2</meta><meta name="image" content="img"/></html>';
$this->assertFalse(html_extract_tag('description', $html));

// Partial meta tag
$html = '<meta content="Brief description">';
$this->assertFalse(html_extract_tag('description', $html));

$html = '<meta property="og:description">';
$this->assertFalse(html_extract_tag('description', $html));

$html = '<meta tag1="content1" property="og:description">';
$this->assertFalse(html_extract_tag('description', $html));

$html = '<meta property="og:description" tag1="content1">';
$this->assertFalse(html_extract_tag('description', $html));

$html = '<meta tag1="content1" content="Brief description">';
$this->assertFalse(html_extract_tag('description', $html));

$html = '<meta content="Brief description" tag1="content1">';
$this->assertFalse(html_extract_tag('description', $html));
}

/**
Expand Down

0 comments on commit 458b6b9

Please sign in to comment.