Skip to content

Commit

Permalink
WIP for desy spider
Browse files Browse the repository at this point in the history
Signed-off-by: Spiros Delviniotis <[email protected]>
  • Loading branch information
spirosdelviniotis committed Jul 6, 2017
1 parent ad88862 commit 07cf5e6
Show file tree
Hide file tree
Showing 12 changed files with 439 additions and 8 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ env:
- SUITE=unit
- SUITE=functional_wsp
- SUITE=functional_arxiv
- SUITE=functional_desy

matrix:
fast_finish: true
Expand Down
8 changes: 8 additions & 0 deletions docker-compose.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ services:
- scrapyd
- ftp_server

functional_desy:
<<: *service_base
command: py.test -vv tests/functional/desy
links:
- scrapyd
- ftp_server

functional_arxiv:
<<: *service_base
command: py.test -vv tests/functional/arxiv
Expand Down Expand Up @@ -68,6 +75,7 @@ services:
environment:
- PUBLICHOST=localhost
volumes:
- ${PWD}/tests/functional/desy/fixtures/ftp_server/DESY:/home/ftpusers/bob/DESY
- ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP
- ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd

Expand Down
35 changes: 31 additions & 4 deletions hepcrawl/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@

import requests

from lxml import etree
from dojson.contrib.marc21.utils import create_record

from inspire_dojson.hep import hep
from inspire_schemas.utils import validate

from .crawler2hep import crawler2hep


Expand Down Expand Up @@ -50,9 +56,7 @@ def __init__(self):
def open_spider(self, spider):
    """Start the run with an empty ``results_data`` list.

    ``process_item`` appends every converted record to this list.
    """
    self.results_data = list()

def process_item(self, item, spider):
"""Convert internal format to INSPIRE data model."""
self.count += 1
def _post_enhance_item(self, item, spider):
if 'related_article_doi' in item:
item['dois'] += item.pop('related_article_doi', [])

Expand Down Expand Up @@ -115,7 +119,30 @@ def process_item(self, item, spider):
])

item = crawler2hep(dict(item))
spider.logger.debug('Validated item.')
spider.logger.debug('Validated item by Builder.')
return item

def _read_item_from_marcxml(self, item, spider):
    """Build a validated ``hep`` record from the XML carried by ``item``.

    The XML string stored under ``item['desy_item']`` is parsed with
    ``lxml``, turned into a dojson record with ``create_record`` and
    converted with ``hep.do``. The result is tagged with the spider name
    as its acquisition source and checked against the ``hep`` schema.

    Args:
        item: mapping holding the raw XML under the ``desy_item`` key.
        spider: running spider; its ``name`` and ``logger`` are used.

    Returns:
        The converted record.
    """
    # TODO (carried over from the original): change names and split.
    marcxml_tree = etree.XML(item['desy_item'])
    hep_record = hep.do(create_record(marcxml_tree))
    hep_record['acquisition_source'] = dict(
        source=spider.name,
        method='hepcrawl',
    )

    # NOTE(review): presumably ``validate`` raises on a schema mismatch --
    # confirm against inspire_schemas.
    validate(hep_record, 'hep')
    spider.logger.debug('Validated item by Dojson.')
    return hep_record

def process_item(self, item, spider):
    """Convert a crawled item to the INSPIRE data model and record it.

    Items from the ``desy`` spider go through
    ``_read_item_from_marcxml``; items from every other spider go through
    ``_post_enhance_item``. The converted item is appended to
    ``results_data`` and returned.
    """
    self.count += 1

    # TODO (carried over from the original): change logic for marcxml and
    # spider name.
    convert = (
        self._read_item_from_marcxml
        if spider.name == 'desy'
        else self._post_enhance_item
    )
    converted = convert(item, spider)

    self.results_data.append(converted)
    return converted

Expand Down
120 changes: 120 additions & 0 deletions hepcrawl/spiders/desy_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2017 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for DESY."""

from __future__ import absolute_import, division, print_function

import os
import urlparse

from scrapy import Request
from scrapy.spiders import Spider

from ..utils import (
ftp_list_files,
ftp_connection_info,
)


class DesySpider(Spider):
    """Desy spider.

    This spider reads XML files either from a local folder or from a
    folder on an FTP host, and hands the raw content of each file over
    under the ``desy_item`` key for conversion into HEP records.

    Examples:
        To run a crawl, you need to pass FTP connection information via
        ``ftp_host`` and ``ftp_netrc``::

            $ scrapy crawl desy -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc'

        To run a crawl on local folder, you need to pass the absolute
        ``package_path``::

            $ scrapy crawl desy -a 'package_path=/path/to/package_dir'
    """
    name = 'desy'
    custom_settings = {}
    start_urls = []
    itertag = 'article'

    def __init__(
        self,
        package_path=None,
        ftp_folder='DESY',
        ftp_host=None,
        ftp_netrc=None,
        *args,
        **kwargs
    ):
        """Constructor of ``Desy`` spider.

        Args:
            package_path: absolute path of a local folder; when given, its
                files are crawled and the FTP server is not contacted.
            ftp_folder: folder on the FTP server to list.
            ftp_host: FTP host, passed to ``ftp_connection_info``.
            ftp_netrc: netrc file path, passed to ``ftp_connection_info``.

        Raises:
            OSError: if the local target folder cannot be created.
        """
        super(DesySpider, self).__init__(*args, **kwargs)
        self.ftp_folder = ftp_folder
        self.ftp_host = ftp_host
        self.ftp_netrc = ftp_netrc
        self.package_path = package_path
        self.target_folder = '/tmp/DESY'

        # Create the folder unconditionally and tolerate "already there":
        # checking for existence first leaves a window in which another
        # process can create the folder and make ``makedirs`` fail.
        try:
            os.makedirs(self.target_folder)
        except OSError:
            if not os.path.isdir(self.target_folder):
                raise

    def start_requests(self):
        """Yield one request per file to crawl.

        With ``package_path`` set, every entry of that local folder is
        requested through a ``file://`` URL and parsed directly. Otherwise
        every file listed in ``ftp_folder`` on the FTP server is requested
        through an ``ftp://`` URL and handled by ``handle_package_ftp``.
        """
        if self.package_path:
            for file_name in os.listdir(self.package_path):
                file_path = os.path.join(self.package_path, file_name)
                yield Request(
                    'file://{0}'.format(file_path),
                    callback=self.parse,
                )
            return

        ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)

        # ``lst_missing_files=False`` asks for every file in the folder,
        # not only for those absent from ``target_folder``.
        remote_files_paths = ftp_list_files(
            self.ftp_folder,
            target_folder=self.target_folder,
            server=ftp_host,
            user=ftp_params['ftp_user'],
            password=ftp_params['ftp_password'],
            lst_missing_files=False,
        )

        for remote_file in remote_files_paths:
            self.log('Try to crawl file from FTP: {0}'.format(remote_file))
            remote_file = str(remote_file)

            # Give every request its own meta dict so that requests never
            # share one mutable mapping.
            request_meta = dict(ftp_params)
            request_meta['ftp_local_filename'] = os.path.join(
                self.target_folder,
                os.path.basename(remote_file),
            )

            remote_url = 'ftp://{0}/{1}'.format(ftp_host, remote_file)
            yield Request(
                str(remote_url),
                meta=request_meta,
                callback=self.handle_package_ftp,
            )

    def parse(self, response):
        """Wrap the raw body of a ``Desy`` XML file into an item.

        Returns:
            A dict holding the response body under the ``desy_item`` key.
        """
        self.log('Got record from url/path: {0}'.format(response.url))

        return {
            'desy_item': response.body,
        }

    def handle_package_ftp(self, response):
        """Request the local copy of a file fetched from the FTP server.

        The response body is used as a local file path.
        NOTE(review): presumably the project's FTP download handler stores
        the file at ``ftp_local_filename`` and returns that path as the
        body -- confirm against the handler.
        """
        self.log('Visited url %s' % response.url)
        self.log('response.body: %s' % response.body)
        filepath = response.body
        yield Request(
            'file://{0}'.format(filepath),
            meta={'package_path': filepath},
            # Scrapy falls back to ``parse`` when no callback is given;
            # spell it out so the flow is readable.
            callback=self.parse,
        )
2 changes: 1 addition & 1 deletion hepcrawl/spiders/wsp_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def start_requests(self):

new_files_paths = ftp_list_files(
self.ftp_folder,
self.target_folder,
target_folder=self.target_folder,
server=ftp_host,
user=ftp_params['ftp_user'],
password=ftp_params['ftp_password']
Expand Down
23 changes: 20 additions & 3 deletions hepcrawl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,34 @@ def ftp_connection_info(ftp_host, netrc_file, passive_mode=False):
return ftp_host, connection_params


def ftp_list_files(
    server_folder,
    server,
    user,
    password,
    target_folder=None,
    passive_mode=False,
    lst_missing_files=True,
):
    """List files from given FTP's server folder to target folder.

    Args:
        server_folder: folder on the FTP server to list.
        server: FTP host to connect to.
        user: FTP user name.
        password: FTP password.
        target_folder: local folder the listing is compared against;
            required when ``lst_missing_files`` is true.
        passive_mode: forwarded to the session factory as
            ``use_passive_mode``.
        lst_missing_files: when true, return the result of
            ``list_missing_files`` for the listed names; when false,
            return every listed name joined to ``server_folder``.

    Returns:
        A list of file paths, see ``lst_missing_files``.

    Raises:
        ValueError: if ``lst_missing_files`` is true and no
            ``target_folder`` is given.
    """
    # Fail before opening a connection: with the defaults, ``None`` would
    # otherwise be forwarded to ``list_missing_files`` as the folder.
    # NOTE(review): ``list_missing_files`` is defined elsewhere in this
    # module; presumably it needs a real path here -- confirm.
    if lst_missing_files and target_folder is None:
        raise ValueError(
            'target_folder is required when lst_missing_files is True'
        )

    session_factory = ftputil.session.session_factory(
        base_class=ftplib.FTP,
        port=21,
        use_passive_mode=passive_mode,
        encrypt_data_channel=True,
    )

    with ftputil.FTPHost(server, user, password, session_factory=session_factory) as host:
        file_names = host.listdir(os.path.join(host.curdir, '/', server_folder))
        if lst_missing_files:
            return list_missing_files(server_folder, target_folder, file_names)

        return [
            os.path.join(server_folder, file_name)
            for file_name in file_names
        ]


def local_list_files(local_folder, target_folder):
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
install_requires = [
'autosemver~=0.2',
'inspire-schemas~=41.0',
'inspire-dojson~=41.0',
'Scrapy>=1.1.0',
# TODO: unpin once they support wheel building again
'scrapyd==1.1.0',
Expand Down
58 changes: 58 additions & 0 deletions tests/functional/desy/fixtures/desy_smoke_records.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
[{
"acquisition_source": {
"source": "desy",
"method": "hepcrawl",
"submission_number": "5652c7f6190f11e79e8000224dabeaad",
"datetime": "2017-04-03T10:26:40.365216"
},
"_collections": [
"Literature"
],
"control_number": 1608652,
"public_notes": [
{
"value": "*Brief entry*"
}
],
"self": {
"$ref": "http://inspirehep.net/api/literature/1608652"
},
"number_of_pages": 6,
"titles": [
{
"source": "JACoW",
"title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n "
}
],
"urls": [
{
"description": "Fulltext",
"value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n "
}
],
"dois": [
{
"value": "10.18429/JACoW-IPAC2017-WEYB1"
}
],
"publication_info": [
{
"parent_isbn": "9783954501823"
},
{
"page_start": "2520",
"page_end": "2525",
"year": 2017
}
],
"$schema": "hep.json",
"document_type": [
"article"
],
"abstracts": [
{
"source": "Deutsches Elektronen-Synchrotron",
"value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n "
}
]
}]
3 changes: 3 additions & 0 deletions tests/functional/desy/fixtures/ftp_server/.netrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
machine ftp_server
login bob
password bob
56 changes: 56 additions & 0 deletions tests/functional/desy/fixtures/ftp_server/DESY/desy_smoke.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.loc.gov/MARC21/slim ">
<record>
<controlfield tag="001">1608652</controlfield>
<controlfield tag="005">20170705125610.0</controlfield>
<datafield tag="024" ind1="7" ind2=" ">
<subfield code="2">DOI</subfield>
<subfield code="a">10.18429/JACoW-IPAC2017-WEYB1</subfield>
</datafield>
<datafield tag="773" ind1=" " ind2=" ">
<subfield code="z">9783954501823</subfield>
</datafield>
<datafield tag="245" ind1=" " ind2=" ">
<subfield code="a">Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser
Acceleration (DLA) From the Source to Relativistic Electrons
</subfield>
<subfield code="9">JACoW</subfield>
</datafield>
<datafield tag="520" ind1=" " ind2=" ">
<subfield code="a">Dielectric laser acceleration of electrons has recently been
demonstrated with significantly higher accelerating gradients than other
structure-based linear accelerators. Towards the development of an integrated 1 MeV
electron accelerator based on dielectric laser accelerator technologies,
development in several relevant technologies is needed. In this work, recent
developments on electron sources, bunching, accelerating, focussing, deflecting and
laser coupling structures are reported. With an eye to the near future, components
required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond
electron bunches are outlined.
</subfield>
<subfield code="9">Deutsches Elektronen-Synchrotron</subfield>
</datafield>
<datafield tag="300" ind1=" " ind2=" ">
<subfield code="a">6</subfield>
</datafield>
<datafield tag="500" ind1=" " ind2=" ">
<subfield code="a">*Brief entry*</subfield>
</datafield>
<datafield tag="773" ind1=" " ind2=" ">
<subfield code="y">2017</subfield>
<subfield code="c">2520-2525</subfield>
</datafield>
<datafield tag="856" ind1="4" ind2=" ">
<subfield code="s">100176</subfield>
<subfield code="u">http://inspirehep.net/record/1608652/files/Towards a fully
integrated acc on a chip.pdf
</subfield>
<subfield code="y">Fulltext</subfield>
</datafield>
<datafield tag="909" ind1="C" ind2="O">
<subfield code="o">oai:inspirehep.net:1608652</subfield>
<subfield code="p">INSPIRE:HEP</subfield>
</datafield>
</record>
</collection>
1 change: 1 addition & 0 deletions tests/functional/desy/fixtures/ftp_server/pureftpd.passwd
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
bob:$1$3ccy4I60$nSpFtRN8U6/BgmmPaxrYR/:1000:1000::/home/ftpusers/bob/./::::::::::::
Loading

0 comments on commit 07cf5e6

Please sign in to comment.