Skip to content

Commit

Permalink
pos: add proceedings support
Browse files Browse the repository at this point in the history
Addresses #159

Signed-off-by: Spiros Delviniotis <[email protected]>
  • Loading branch information
spirosdelviniotis committed Aug 21, 2017
1 parent 908c943 commit 8e67b40
Show file tree
Hide file tree
Showing 6 changed files with 265 additions and 57 deletions.
71 changes: 56 additions & 15 deletions hepcrawl/spiders/pos_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ def parse(self, response):
# Probably all links lead to same place, so take first
conference_paper_url = "{0}{1}".format(self.BASE_CONFERENCE_PAPER_URL, identifier)
request = Request(conference_paper_url, callback=self.scrape_conference_paper)
request.meta["url"] = response.url
request.meta["record"] = record.extract()
request.meta['url'] = response.url
request.meta['record'] = record.extract()
request.meta['identifier'] = identifier
yield request

def scrape_conference_paper(self, response):
Expand All @@ -83,24 +84,48 @@ def scrape_conference_paper(self, response):
response=response,
)

# TODO Yield request for Conference page
proceedings_identifier = response.selector.xpath("//a[contains(@href,'?confid')]/@href").extract_first()
proceedings_identifier = proceedings_identifier.split('=')[1]
pos_url = "{0}{1}".format(self.BASE_PROCEEDINGS_URL, proceedings_identifier)
# Scrape proceedings record
pos_url = self._get_proceedings_url(response)
self.log('===> scrape_conference_paper url::{pos_url}'.format(**vars()))
# yield Request(pos_url, callback=self.scrape_proceedings)
meta = {
'identifier': response.meta.get('identifier'),
}
yield Request(
pos_url,
callback=self.scrape_proceedings,
meta=meta,
)

yield self.build_conference_paper_item(response)

def scrape_proceedings(self, response):
    """Build the proceedings record from a PoS conference page.

    Args:
        response: the downloaded conference page. ``response.meta`` is read
            for the ``identifier`` key, which the requesting callback puts
            there; a missing key yields ``None``.

    Returns:
        ParsedItem: the loaded record, tagged with the ``hepcrawl`` format.

    NOTE(review): the original plan for this record asked for the title to
    be rendered as "Proceedings, <title>" and the subtitle as
    "<place>, <date>". The values are currently taken from the page as-is;
    confirm whether that templating is still wanted.
    """
    node = Selector(
        text=response.body,
        type='html',
    )
    node.remove_namespaces()
    record = HEPLoader(
        item=HEPRecord(),
        selector=node,
    )

    record.add_value('collections', ['proceeding'])
    record.add_value('title', self._get_proceedings_title(node=node))
    record.add_value('subtitle', self._get_proceedings_date_place(node=node))
    record.add_value('journal_title', 'PoS')
    record.add_value(
        'journal_volume',
        self._get_journal_volume(
            identifier=response.meta.get('identifier'),
        ),
    )

    return ParsedItem(
        record=record.load_item(),
        record_format='hepcrawl',
    )

def build_conference_paper_item(self, response):
"""Parse an PoS XML exported file into a HEP record."""
Expand Down Expand Up @@ -145,14 +170,21 @@ def build_conference_paper_item(self, response):

def _get_conference_paper_pdf_url(self, response):
    """Return the absolute URL of the contribution's PDF.

    Args:
        response: the conference paper page; its ``selector`` is queried.

    Returns:
        The href of the first anchor whose text contains ``pdf``, resolved
        against ``BASE_CONFERENCE_PAPER_URL``.
    """
    # Only the link labelled "pdf" points at the document; every other
    # anchor on the page (conference link, footer links) must be excluded.
    conference_paper_pdf_url = response.selector.xpath(
        "//a[contains(text(),'pdf')]/@href",
    ).extract_first()

    return urljoin(
        self.BASE_CONFERENCE_PAPER_URL,
        conference_paper_pdf_url,
    )

def _get_proceedings_url(self, response):
    """Return the URL of the proceedings page a contribution belongs to.

    The first anchor whose text does not contain ``pdf`` is used; the
    second ``/``-separated piece of its href is appended to
    ``BASE_PROCEEDINGS_URL``.

    Args:
        response: the conference paper page; its ``selector`` is queried.

    Returns:
        ``BASE_PROCEEDINGS_URL`` followed by the proceedings identifier.

    Raises:
        ValueError: if the page has no matching link.
        IndexError: if the href contains no ``/``.

    NOTE(review): this assumes the first non-pdf link is the conference
    link and that its href looks like ``/<id>/`` -- confirm against the
    live page layout.
    """
    internal_url = response.selector.xpath(
        "//a[not(contains(text(),'pdf'))]/@href",
    ).extract_first()
    if internal_url is None:
        # Fail with a message that names the page instead of an opaque
        # AttributeError on ``None.split``.
        raise ValueError(
            'No proceedings link found on {0}'.format(response.url)
        )

    proceedings_identifier = internal_url.split('/')[1]
    return '{0}{1}'.format(self.BASE_PROCEEDINGS_URL, proceedings_identifier)

@staticmethod
def _get_language(node):
language = node.xpath(".//metadata/pex-dc/language/text()").extract_first()
Expand Down Expand Up @@ -216,3 +248,12 @@ def _get_authors(node): # To be refactored
if auth_dict:
authors.append(auth_dict)
return authors

@staticmethod
def _get_proceedings_title(node):
    """Return the first result of the ``//h1/text()`` query on ``node``."""
    heading_texts = node.xpath('//h1/text()')
    return heading_texts.extract_first()

@staticmethod
def _get_proceedings_date_place(node):
    """Return the conference date and place as one clean string.

    Collects every text node directly under the ``conference_date`` div,
    trims each one, drops the empty ones and joins the rest with a single
    space. The page separates date and place with a line break, so joining
    the raw fragments could fuse them or leave trailing whitespace.

    Args:
        node: selector over the proceedings page.

    Returns:
        The joined text, or an empty string when the div has no text.
    """
    fragments = node.xpath("//div[@class='conference_date']/text()").extract()
    trimmed = (fragment.strip() for fragment in fragments)
    return ' '.join(fragment for fragment in trimmed if fragment)
8 changes: 8 additions & 0 deletions tests/functional/pos/fixtures/https_server/conf/proxy.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,12 @@ server {
rewrite ^.*$ /$mid.html permanent;
}
}

# Test fixture: redirect /cgi-bin/reader/conf.cgi?confid=<id> to the static
# page /<id>.html. The dot is escaped so only the literal "conf.cgi" matches.
location ~ /cgi-bin/reader/conf\.cgi {
    if ($args ~* "^confid=(.*)") {
        set $mid $1;
        # Clear the query string so it is not carried over to the new URL.
        set $args '';
        rewrite ^.*$ /$mid.html permanent;
    }
}
}
125 changes: 125 additions & 0 deletions tests/functional/pos/fixtures/https_server/records/187.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
<!DOCTYPE html>
<html>
<head>
<title>31st International Symposium on Lattice Field Theory LATTICE 2013</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<link rel="stylesheet" type="text/css" href="/css/style.css?v=3" />
</head>
<body>
<img src="/images/headInternal.gif" width="760" height="80" border="0" usemap="#headmap" alt="Main Image"/>
<map name="headmap" id="headmap">
<area shape="rect" coords="9,9,266,69" href="/" target="_top"/>
</map>
<h1>31st International Symposium on Lattice Field Theory LATTICE 2013</h1>
<script type="text/javascript" src="/js/lib.js"></script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}
});
</script>
<script type="text/javascript"
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>
<DIV class=conference_code>LATTICE 2013 - (other <a href="/cgi-bin/reader/family.cgi?code=lattice">lattice</a> conferences)</DIV>
<DIV class=conference_date>29 July – 3 August, 2013 <BR/>Mainz, Germany </DIV>
<div id="abstract">
<p>
The annual lattice symposium brings together a global community of researchers
from theoretical particle physics and beyond, who employ numerical and
computational methods to study the properties of strongly interacting physical
systems, above all Quantum Chromodynamics (QCD), the theory describing the
interactions of quarks and gluons. Topics include studies of the spectrum and
structure of hadrons, lattice studies of matter under extreme conditions,
hadronic contributions to weak decay amplitudes, as well as recent
developments in simulation algorithms and computer hardware. The 2013
conference in Mainz was attended by over 500 participants from all over the
globe, making it the biggest in this series so far.
</p>
<p>
This proceedings volume is dedicated to the memory of Nobel Laureate Kenneth
G. Wilson (June 8, 1936 - June 15, 2013).
</p>
</div>
<div id="icon"><img src="/archive/images/LATTICE 2013.png" alt="conference main image"/></div>
<div id="proceedings">
<!-- TOC -->
<table>
<tr><th>Sessions</th></tr>
<tr><td><a href="#session-1727">Preface</a></td></tr>
<tr><td><a href="#session-1858">Plenary sessions</a></td></tr>
<tr><td><a href="#session-1859">Algorithms and Machines</a></td></tr>
<tr><td><a href="#session-1860">Applications beyond QCD</a></td></tr>
<tr><td><a href="#session-1861">Physics beyond the Standard Model</a></td></tr>
<tr><td><a href="#session-1862">Chiral Symmetry</a></td></tr>
<tr><td><a href="#session-1863">Non-zero Temperature and Density</a></td></tr>
<tr><td><a href="#session-1864">Hadron Spectroscopy and Interactions</a></td></tr>
<tr><td><a href="#session-1865">Hadron Structure</a></td></tr>
<tr><td><a href="#session-1866">Standard Model Parameters and Renormalization</a></td></tr>
<tr><td><a href="#session-1867">Theoretical Developments</a></td></tr>
<tr><td><a href="#session-1868">Vacuum Structure and Confinement</a></td></tr>
<tr><td><a href="#session-1869">Weak Decays and Matrix Elements</a></td></tr>
<tr><td><a href="#session-1870">Special Session: Coding Efforts</a></td></tr>
<tr><td><a href="#session-1871">Posters</a></td></tr>
</table>
<!-- /TOC -->
<table><thead>
</thead>
<tbody>
<tr id="session-1727" class="title"><td>Preface</td></tr> <tr>
<td><span class="contrib_title">Foreword</span><br class="contrib_newline"/>
<span class="contrib_code"><a href="/187/503/" class="gray-link">PoS(LATTICE 2013)503</a></span>
<span class="contrib_file"><a class="files" href="/187/503/pdf">pdf</a> </span>
<span class="contrib_authors">H. Wittig</span>
</td>
</tr>
<tr>
<td><span class="contrib_title">Ken Wilson Obituary</span><br class="contrib_newline"/>
<span class="contrib_code"><a href="/187/504/" class="gray-link">PoS(LATTICE 2013)504</a></span>
<span class="contrib_file"><a class="files" href="/187/504/pdf">pdf</a> </span>
<span class="contrib_authors">A. Kronfeld</span>
</td>
</tr>
<tr class="title" id="session-1858">
<td>Plenary sessions</td>
</tr>
<tr>
<td><span class="contrib_title">Heavy Flavour Physics Review</span><br class="contrib_newline"/>
<span class="contrib_code"><a href="/187/001/" class="gray-link">PoS(LATTICE 2013)001</a></span>
<span class="contrib_file"><a class="files" href="/187/001/pdf">pdf</a> </span>
<span class="contrib_authors">A. El-Khadra</span>
</td>
</tr>
<tr>
<td><span class="contrib_title">New Developments for Lattice Field Theory at Non-Zero Density </span><br class="contrib_newline"/>
<span class="contrib_code"><a href="/187/002/" class="gray-link">PoS(LATTICE 2013)002</a></span>
<span class="contrib_file"><a class="files" href="/187/002/pdf">pdf</a> </span>
<span class="contrib_authors">C. Gattringer</span>
</td>
</tr>
</tbody>
</table>
</div>
<!-- footer -->
<div id="footer">
<p>Communicate with the <a href="mailto:%70%6F%73%2D%65%6F%40%70%6F%73%2E%73%69%73%73%61%2E%69%74">PoS Editorial Office</a>
| <a href="/cgi-bin/reader/info.cgi?p=cookies" title="Cookie policy">Cookie policy</a>
</p>
</div>
<!-- /footer -->
<!-- Piwik -->
<script type="text/javascript">
var _paq = _paq || [];
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="//stats.sissa.it/analytics/";
_paq.push(['setTrackerUrl', u+'piwik.php']);
_paq.push(['setSiteId', 9]);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.type='text/javascript'; g.async=true; g.defer=true; g.src=u+'piwik.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<noscript><p><img src="//stats.sissa.it/analytics/piwik.php?idsite=9" style="border:0;" alt="" /></p></noscript>
<!-- /Piwik -->
</body>
</html>
Original file line number Diff line number Diff line change
@@ -1,55 +1,65 @@
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US" xml:lang="en-US">
<!DOCTYPE html>
<html>
<head>
<title>PoS(LATTICE 2013)001</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<link rel="stylesheet" type="text/css" href="/css/style.css?v=3" />
<link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" />
<link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" />
<meta name="DC.Title" content="Heavy Flavour Physics Review"/>
<meta name="DC.Type" content="Proceeding"/>
<meta name="DC.Relation" content="31st International Symposium on Lattice Field Theory LATTICE 2013"/>
<meta name="DC.Creator" content="A. El-Khadra"/>

<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
</head>
<body>

<img src="/images/headInternal.gif" width="760" height="80" border="0" usemap="#headmap" alt="Main Image"/>
<map name="headmap" id="headmap">
<area shape="rect" coords="682,15,748,65" href="http://www.sissa.it" />
<area shape="rect" coords="9,9,266,69" href="/index.html" target="_top"/>
<area shape="rect" coords="9,9,266,69" href="/" target="_top"/>
</map>
<h1>PoS(LATTICE 2013)001</h1>
<script type="text/javascript" src="/js/lib.js"></script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}
});
</script>
<script type="text/javascript"
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>

<div id="identifier">

<div>
<em>Title</em>
<strong>Heavy Flavour Physics Review</strong>
</div>

<div>
<em>Conference</em>
<strong><a href="/cgi-bin/reader/conf.cgi?confid=187">31st International Symposium on Lattice Field Theory LATTICE 2013</a></strong>
</div>

<div>
<em>Authors</em>
<div class="identxt">
A. El-Khadra</div>
</div>


<div>
<em>Contribution</em>
<strong><a href="https://pos.sissa.it/archive/conferences/187/001/LATTICE 2013_001.pdf">pdf</a></strong>
</div>
<div id="contrib-identifier">
<p><strong>Heavy Flavour Physics Review</strong></p>
<p><em>A. El-Khadra</em></p>
<p>in <a href="/187/">31st International Symposium on Lattice Field Theory LATTICE 2013</a></p>
<p>Contribution: <a href="/187/001/pdf">pdf</a></p>




</div>
<div id="footer">
<p>
Communicate with the <a href="mailto:%70%6F%73%2D%65%6F%40%70%6F%73%2E%73%69%73%73%61%2E%69%74">PoS Editorial Office</a>
| <a href="/POScookies.html" title="Cookie policy">Cookie policy</a>
</p>
</div>

<!-- footer -->
<div id="footer">
<p>Communicate with the <a href="mailto:%70%6F%73%2D%65%6F%40%70%6F%73%2E%73%69%73%73%61%2E%69%74">PoS Editorial Office</a>
| <a href="/cgi-bin/reader/info.cgi?p=cookies" title="Cookie policy">Cookie policy</a>
</p>
</div>
<!-- /footer -->
<!-- Piwik -->
<script type="text/javascript">
var _paq = _paq || [];
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="//stats.sissa.it/analytics/";
_paq.push(['setTrackerUrl', u+'piwik.php']);
_paq.push(['setSiteId', 9]);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.type='text/javascript'; g.async=true; g.defer=true; g.src=u+'piwik.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<noscript><p><img src="//stats.sissa.it/analytics/piwik.php?idsite=9" style="border:0;" alt="" /></p></noscript>
<!-- /Piwik -->

</body>
</html>
</html>
Original file line number Diff line number Diff line change
@@ -1,5 +1,28 @@
[
{
"publication_info": [
{
"journal_volume": "LATTICE 2013",
"journal_title": "PoS"
}
],
"document_type": [
"article"
],
"titles": [
{
"source": "pos",
"title": "31st International Symposium on Lattice Field Theory LATTICE 2013"
}
],
"acquisition_source": {
"source": "pos",
"method": "hepcrawl",
"submission_number": "5652c7f6190f11e79e8000224dabeaad",
"datetime": "2017-04-03T10:26:40.365216"
}
},
{
"acquisition_source": {
"source": "pos",
"method": "hepcrawl",
Expand Down
Loading

0 comments on commit 8e67b40

Please sign in to comment.