Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added twitter card functionality #196

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
/build/
/dist/
*.egg-info
venv/

# Mac OS
*.DS_Store
Expand Down
1 change: 1 addition & 0 deletions extruct/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
from .w3cmicrodata import MicrodataExtractor
from .opengraph import OpenGraphExtractor
from .microformat import MicroformatExtractor
from .twittercard import TwitterCardExtractor
from .xmldom import XmlDomHTMLParser
10 changes: 8 additions & 2 deletions extruct/_extruct.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from extruct.jsonld import JsonLdExtractor
from extruct.rdfa import RDFaExtractor
from extruct.twittercard import TwitterCardExtractor
from extruct.w3cmicrodata import MicrodataExtractor
from extruct.opengraph import OpenGraphExtractor
from extruct.microformat import MicroformatExtractor
Expand All @@ -11,7 +12,7 @@
from extruct.utils import parse_xmldom_html

logger = logging.getLogger(__name__)
SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa', 'dublincore']
SYNTAXES = ['microdata', 'opengraph', 'json-ld', 'microformat', 'rdfa', 'dublincore', 'twittercard']


def extract(htmlstring,
Expand Down Expand Up @@ -102,6 +103,11 @@ def extract(htmlstring,
('dublincore', DublinCoreExtractor().extract_items,
tree,
))
if 'twittercard' in syntaxes:
processors.append(
('twittercard', TwitterCardExtractor().extract_items,
tree,
))
output = {}
for syntax, extract, document in processors:
try:
Expand Down Expand Up @@ -162,7 +168,7 @@ def extract(htmlstring,
logger.exception(
'Failed to uniform extracted for {}, raises {}'
.format(syntax, e)
)
)
if errors == 'strict':
raise

Expand Down
71 changes: 71 additions & 0 deletions extruct/twittercard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import re

from extruct.utils import parse_html


# Pattern applied to an element's ``prefix`` attribute by the extractor.
# NOTE(review): it has no capture groups, so ``findall`` returns whole-match
# strings rather than (prefix, uri) pairs. An alternative prefix-parsing
# pattern was left commented out next to it:
#     re.compile(r'\s*(\w+):\s*([^\s]+)')
_PREFIX_PATTERN = re.compile(
    r'^\s*(?:<!--\s*)?(?:@|\#)twittercard\s*(?:-->)?\s*$',
    flags=re.IGNORECASE,
)

# Namespace prefixes recognised in ``<meta name="prefix:...">`` tags, mapped
# to the URI reported for each prefix in the extracted item.
_TW_NAMESPACES = {
    'twitter': 'https://dev.twitter.com/cards#',
    'owl': 'http://www.w3.org/2002/07/owl#',
    'gr': 'http://purl.org/goodrelations/v1#',
    'ctag': 'http://commontag.org/ns#',
    'cc': 'http://creativecommons.org/ns#',
    'grddl': 'http://www.w3.org/2003/g/data-view#',
    'rif': 'http://www.w3.org/2007/rif#',
    'sioc': 'http://rdfs.org/sioc/ns#',
    'skos': 'http://www.w3.org/2004/02/skos/core#',
    'xml': 'http://www.w3.org/XML/1998/namespace',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
    'rev': 'http://purl.org/stuff/rev#',
    'rdfa': 'http://www.w3.org/ns/rdfa#',
    'dc': 'http://purl.org/dc/terms/',
    'foaf': 'http://xmlns.com/foaf/0.1/',
    'void': 'http://rdfs.org/ns/void#',
    'ical': 'http://www.w3.org/2002/12/cal/icaltzd#',
    'vcard': 'http://www.w3.org/2006/vcard/ns#',
    'wdrs': 'http://www.w3.org/2007/05/powder-s#',
    'og': 'http://ogp.me/ns#',
    'wdr': 'http://www.w3.org/2007/05/powder#',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'xhv': 'http://www.w3.org/1999/xhtml/vocab#',
    'xsd': 'http://www.w3.org/2001/XMLSchema#',
    'v': 'http://rdf.data-vocabulary.org/#',
    'skosxl': 'http://www.w3.org/2008/05/skos-xl#',
    'schema': 'http://schema.org/',
}

class TwitterCardExtractor(object):
    """Twitter Card extractor following the extruct API.

    Collects ``<meta name="..." content="...">`` elements that are direct
    children of a ``<head>`` and whose name starts with a known namespace
    prefix followed by ``:`` (``twitter:`` and the other prefixes listed in
    ``_TW_NAMESPACES``, plus any prefix returned by ``get_namespaces``).
    """

    def extract(self, htmlstring, base_url=None, encoding='UTF-8'):
        """Parse *htmlstring* and return the extracted items as a list.

        :param htmlstring: HTML source handed to ``parse_html``.
        :param base_url: forwarded to ``extract_items``.
        :param encoding: encoding handed to ``parse_html``.
        :returns: list of ``{'namespace': ..., 'properties': ...}`` dicts.
        """
        tree = parse_html(htmlstring, encoding=encoding)
        return list(self.extract_items(tree, base_url=base_url))

    def extract_items(self, document, base_url=None):
        """Yield one item for each ``<head>`` that has matching meta tags.

        :param document: parsed tree exposing ``xpath``.
        :param base_url: accepted for API compatibility; not used here.
        :returns: generator of dicts with a ``namespace`` mapping
            (prefix -> URI) and a ``properties`` list of
            ``(name, content)`` tuples in document order.
        """
        # TwitterCard defines a web page as a single rich object.
        for head in document.xpath('//head'):
            # Query the head being visited rather than ``document.head``, so
            # each <head> resolves its own parent and the code does not
            # depend on the document object exposing a ``head`` attribute.
            html_elems = head.xpath('parent::html')
            namespaces = (
                self.get_namespaces(html_elems[0]) if html_elems else {}
            )
            namespaces.update(self.get_namespaces(head))
            props = []
            for el in head.xpath('meta[@name and @content]'):
                prop = el.attrib['name']
                val = el.attrib['content']
                ns, sep, _ = prop.partition(':')
                if not sep:
                    # No "prefix:" part at all (e.g. name="viewport" or a
                    # bare name="v"): this is not a namespaced property.
                    continue
                if ns in _TW_NAMESPACES:
                    namespaces[ns] = _TW_NAMESPACES[ns]
                if ns in namespaces:
                    props.append((prop, val))
            if props:
                yield {'namespace': namespaces, 'properties': props}

    def get_namespaces(self, element):
        """Return the ``{prefix: uri}`` pairs found on *element*.

        Runs ``_PREFIX_PATTERN.findall`` over the element's ``prefix``
        attribute (an empty string when the attribute is absent).

        :param element: element exposing an ``attrib`` mapping.
        :returns: dict of prefix -> URI; empty when nothing matches.
        """
        matches = _PREFIX_PATTERN.findall(element.attrib.get('prefix', ''))
        # ``findall`` yields (prefix, uri) tuples only when the pattern has
        # exactly two capture groups; with no groups it yields plain strings,
        # which ``dict()`` rejects with ValueError. Keep well-formed pairs.
        return {
            match[0]: match[1]
            for match in matches
            if isinstance(match, tuple) and len(match) == 2
        }

23 changes: 23 additions & 0 deletions tests/samples/misc/twittercard_chess_test.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" lang="en-US" dir="ltr" xmlns:fb="http://ogp.me/ns/fb#"
xmlns:og="http://ogp.me/ns#" class="user-logged-out">

<head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# game: http://ogp.me/ns/game#">
<meta charset="utf-8" />
<meta name="twitter:title" content="Chess.com - Play Chess Online - Free Games" />

<meta name="twitter:card" content="summary_large_image" />

<meta name="twitter:site" content="@chesscom" />

<meta name="twitter:description"
content="Play chess online for free on Chess.com with over 50 million members from around the world. Have fun playing with friends or challenging the computer!" />

<meta name="twitter:image" content="https://www.chess.com/bundles/web/images/social/share-home.a3e2cdbb.png" />
</head>

<body>
<h1>Chess.com - Play Chess Online - Free Games</h1>
</body>

</html>
29 changes: 29 additions & 0 deletions tests/samples/misc/twittercard_chess_test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[
{
"namespace": {
"twitter": "https://dev.twitter.com/cards#"
},
"properties": [
[
"twitter:title",
"Chess.com - Play Chess Online - Free Games"
],
[
"twitter:card",
"summary_large_image"
],
[
"twitter:site",
"@chesscom"
],
[
"twitter:description",
"Play chess online for free on Chess.com with over 50 million members from around the world. Have fun playing with friends or challenging the computer!"
],
[
"twitter:image",
"https://www.chess.com/bundles/web/images/social/share-home.a3e2cdbb.png"
]
]
}
]
28 changes: 28 additions & 0 deletions tests/samples/misc/twittercard_optimizesmart_test.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<!DOCTYPE html>
<html>

<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta class="viewport" name="viewport" content="width=device-width, initial-

scale=1.0">
<meta http-equiv="X-UA-Compatible" content="IE=Edge">

<title>Open Graph Protocol for Facebook Explained with Examples - Optimize Smart</title>
<meta name="twitter:card" content="summary_large_image" />
<meta name="twitter:title" content="#Open #Graph #Protocol for #Facebook explained with examples" />
<meta name="twitter:description"
content="What is Open Graph Protocol and why you need it? Learn to implement Open Graph Protocol for Facebook on your website. Open Graph Protocol Meta Tags." />
<meta name="twitter:site" content="@optimizesmart" />
<meta name="twitter:creator" content="@optimizesmart" />
<meta name="twitter:image"
content="https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg" />
<meta name="twitter:label1" content="Written by" />
<meta name="twitter:data1" content="Himanshu" />
<meta name="twitter:label2" content="Time to read" />
<meta name="twitter:data2" content="13 minutes" />
</head>

<body></body>

</html>
49 changes: 49 additions & 0 deletions tests/samples/misc/twittercard_optimizesmart_test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
[
{
"namespace": {
"twitter": "https://dev.twitter.com/cards#"
},
"properties": [
[
"twitter:card",
"summary_large_image"
],
[
"twitter:title",
"#Open #Graph #Protocol for #Facebook explained with examples"
],
[
"twitter:description",
"What is Open Graph Protocol and why you need it? Learn to implement Open Graph Protocol for Facebook on your website. Open Graph Protocol Meta Tags."
],
[
"twitter:site",
"@optimizesmart"
],
[
"twitter:creator",
"@optimizesmart"
],
[
"twitter:image",
"https://www.optimizesmart.com/wp-content/uploads/2010/07/open-graph-protocol.jpg"
],
[
"twitter:label1",
"Written by"
],
[
"twitter:data1",
"Himanshu"
],
[
"twitter:label2",
"Time to read"
],
[
"twitter:data2",
"13 minutes"
]
]
}
]
34 changes: 34 additions & 0 deletions tests/samples/misc/twittercard_spinneyslebanon_test.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
<!doctype html>
<html lang="en">

<head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# product: http://ogp.me/ns/product#">
<meta charset="utf-8" />
<meta name="twitter:card" content="summary_large_image" />
<meta name="twitter:site" content="@spinneyslebanon" />
<meta name="twitter:creator" content="@spinneyslebanon" />
<meta name="twitter:title" content="Mevgal Bio Feta Cheese 200g | Chilled & Deli | Spinneys Lebanon" />
<meta name="twitter:url" content="https://www.spinneyslebanon.com/mevgal-bio-feta-cheese-200g.html" />
<meta name="twitter:description"
content="This bio feta cheese is smooth and creamy and will make a great addition to your salads or cheese dips." />
<meta name="twitter:image"
content="https://d145dj1pf6foch.cloudfront.net/catalog/product/cache/96439f90b2da9c0500e3c88801966a5f/4/8/489874-v001-1_1.jpg" />
<!-- twitter product cards-->
<meta name="twitter:card" content="summary_large_image" />
<meta name="twitter:domain" content="https://www.spinneyslebanon.com/" />
<meta name="twitter:site" content="@https://twitter.com/spinneyslebanon" />
<meta name="twitter:creator" content="@spinneyslebanon" />
<meta name="twitter:title" content="Mevgal Bio Feta Cheese 200g " />
<meta name="twitter:description"
content="This bio feta cheese is smooth and creamy and will make a great addition to your salads or cheese dips. " />
<meta name="twitter:image"
content="https://d145dj1pf6foch.cloudfront.net/catalog/product/cache/1dfbce20f903bb776ac5277b1934e5b0/4/8/489874-v001-1_1.jpg" />
<meta name="twitter:data1" content="LBP139999.00" />
<meta name="twitter:label1" content="PRICE" />
<meta name="twitter:data2" content="LB" />
<meta name="twitter:label2" content="LOCATION" />
<!-- twitter product cards-->
</head>

<body></body>

</html>
81 changes: 81 additions & 0 deletions tests/samples/misc/twittercard_spinneyslebanon_test.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
[
{
"namespace": {
"twitter": "https://dev.twitter.com/cards#"
},
"properties": [
[
"twitter:card",
"summary_large_image"
],
[
"twitter:site",
"@spinneyslebanon"
],
[
"twitter:creator",
"@spinneyslebanon"
],
[
"twitter:title",
"Mevgal Bio Feta Cheese 200g | Chilled & Deli | Spinneys Lebanon"
],
[
"twitter:url",
"https://www.spinneyslebanon.com/mevgal-bio-feta-cheese-200g.html"
],
[
"twitter:description",
"This bio feta cheese is smooth and creamy and will make a great addition to your salads or cheese dips."
],
[
"twitter:image",
"https://d145dj1pf6foch.cloudfront.net/catalog/product/cache/96439f90b2da9c0500e3c88801966a5f/4/8/489874-v001-1_1.jpg"
],
[
"twitter:card",
"summary_large_image"
],
[
"twitter:domain",
"https://www.spinneyslebanon.com/"
],
[
"twitter:site",
"@https://twitter.com/spinneyslebanon"
],
[
"twitter:creator",
"@spinneyslebanon"
],
[
"twitter:title",
"Mevgal Bio Feta Cheese 200g "
],
[
"twitter:description",
"This bio feta cheese is smooth and creamy and will make a great addition to your salads or cheese dips. "
],
[
"twitter:image",
"https://d145dj1pf6foch.cloudfront.net/catalog/product/cache/1dfbce20f903bb776ac5277b1934e5b0/4/8/489874-v001-1_1.jpg"
],
[
"twitter:data1",
"LBP139999.00"
],
[
"twitter:label1",
"PRICE"
],
[
"twitter:data2",
"LB"
],
[
"twitter:label2",
"LOCATION"
]
]
}
]
Loading