-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Spiros Delviniotis <[email protected]>
- Loading branch information
1 parent
ad88862
commit 07cf5e6
Showing
12 changed files
with
439 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# This file is part of hepcrawl. | ||
# Copyright (C) 2017 CERN. | ||
# | ||
# hepcrawl is a free software; you can redistribute it and/or modify it | ||
# under the terms of the Revised BSD License; see LICENSE file for | ||
# more details. | ||
|
||
"""Spider for DESY.""" | ||
|
||
from __future__ import absolute_import, division, print_function | ||
|
||
import os | ||
import urlparse | ||
|
||
from scrapy import Request | ||
from scrapy.spiders import Spider | ||
|
||
from ..utils import ( | ||
ftp_list_files, | ||
ftp_connection_info, | ||
) | ||
|
||
|
||
class DesySpider(Spider):
    """Desy spider.

    This spider reads XML files, either from a remote FTP host or from a
    local folder, and emits their raw content for extraction into HEP
    records.

    Examples:
        To run a crawl, you need to pass FTP connection information via
        ``ftp_host`` and ``ftp_netrc``::

            $ scrapy crawl desy -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc'

        To run a crawl on local folder, you need to pass the absolute ``package_path``::

            $ scrapy crawl desy -a 'package_path=/path/to/package_dir'
    """
    name = 'desy'
    custom_settings = {}
    start_urls = []
    itertag = 'article'

    def __init__(
        self,
        package_path=None,
        ftp_folder='DESY',
        ftp_host=None,
        ftp_netrc=None,
        *args,
        **kwargs
    ):
        """Constructor of ``Desy`` spider.

        Args:
            package_path: absolute path of a local folder to crawl. When it
                is set, the FTP arguments are not used by ``start_requests``.
            ftp_folder: folder on the FTP server to list.
            ftp_host: FTP host to connect to.
            ftp_netrc: path of the netrc file, passed together with
                ``ftp_host`` to ``ftp_connection_info``.

        Raises:
            OSError: if the local target folder cannot be created.
        """
        super(DesySpider, self).__init__(*args, **kwargs)
        self.ftp_folder = ftp_folder
        self.ftp_host = ftp_host
        self.ftp_netrc = ftp_netrc
        self.package_path = package_path
        self.target_folder = '/tmp/DESY'
        # Create first and inspect the failure afterwards: a separate
        # "exists?" check races with other processes creating the same
        # folder, and would also silently accept a non-directory there.
        try:
            os.makedirs(self.target_folder)
        except OSError:
            if not os.path.isdir(self.target_folder):
                raise

    def start_requests(self):
        """Yield one request per file to crawl.

        With ``package_path`` set, every regular file directly inside that
        local folder is requested through a ``file://`` URL and handled by
        ``parse``. Otherwise the configured FTP folder is listed and every
        returned path is requested through an ``ftp://`` URL and handled by
        ``handle_package_ftp``.
        """
        if self.package_path:
            for file_name in os.listdir(self.package_path):
                file_path = os.path.join(self.package_path, file_name)
                # Sub-directories (and other non-regular entries) cannot be
                # fetched as a document, so do not schedule them.
                if not os.path.isfile(file_path):
                    continue
                yield Request(
                    'file://{0}'.format(file_path),
                    callback=self.parse,
                )
            return

        ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)

        # NOTE(review): the exact meaning of ``lst_missing_files=False`` is
        # defined by ``ftp_list_files`` in ``..utils`` -- presumably it
        # returns all remote files rather than only the ones missing from
        # ``target_folder``; confirm against the helper.
        remote_files_paths = ftp_list_files(
            self.ftp_folder,
            target_folder=self.target_folder,
            server=ftp_host,
            user=ftp_params['ftp_user'],
            password=ftp_params['ftp_password'],
            lst_missing_files=False,
        )

        for remote_file in remote_files_paths:
            self.log('Try to crawl file from FTP: {0}'.format(remote_file))
            remote_file = str(remote_file)
            # Build a separate meta dict for each request instead of
            # mutating the shared ``ftp_params`` on every iteration.
            request_meta = dict(
                ftp_params,
                ftp_local_filename=os.path.join(
                    self.target_folder,
                    os.path.basename(remote_file),
                ),
            )
            remote_url = 'ftp://{0}/{1}'.format(ftp_host, remote_file)
            yield Request(
                str(remote_url),
                meta=request_meta,
                callback=self.handle_package_ftp,
            )

    def parse(self, response):
        """Wrap the raw body of a fetched ``Desy`` file into an item.

        Returns:
            dict: a single key, ``desy_item``, holding ``response.body``
            unchanged; no parsing is done here.
        """
        self.log('Got record from url/path: {0}'.format(response.url))

        return {
            'desy_item': response.body,
        }

    def handle_package_ftp(self, response):
        """Request the local copy of a file fetched over FTP.

        The body of the FTP response is used as a local file path.
        NOTE(review): presumably the FTP download handler stores the file
        at ``ftp_local_filename`` and returns that path as the body --
        confirm against the project's FTP handler.
        """
        self.log('Visited url %s' % response.url)
        self.log('response.body: %s' % response.body)
        filepath = response.body
        yield Request(
            'file://{0}'.format(filepath),
            meta={'package_path': filepath},
            # Same target as the implicit default callback, spelled out so
            # the flow FTP -> local file -> ``parse`` is visible here.
            callback=self.parse,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
[{ | ||
"acquisition_source": { | ||
"source": "desy", | ||
"method": "hepcrawl", | ||
"submission_number": "5652c7f6190f11e79e8000224dabeaad", | ||
"datetime": "2017-04-03T10:26:40.365216" | ||
}, | ||
"_collections": [ | ||
"Literature" | ||
], | ||
"control_number": 1608652, | ||
"public_notes": [ | ||
{ | ||
"value": "*Brief entry*" | ||
} | ||
], | ||
"self": { | ||
"$ref": "http://inspirehep.net/api/literature/1608652" | ||
}, | ||
"number_of_pages": 6, | ||
"titles": [ | ||
{ | ||
"source": "JACoW", | ||
"title": "Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser\n Acceleration (DLA) From the Source to Relativistic Electrons\n " | ||
} | ||
], | ||
"urls": [ | ||
{ | ||
"description": "Fulltext", | ||
"value": "http://inspirehep.net/record/1608652/files/Towards a fully\n integrated acc on a chip.pdf\n " | ||
} | ||
], | ||
"dois": [ | ||
{ | ||
"value": "10.18429/JACoW-IPAC2017-WEYB1" | ||
} | ||
], | ||
"publication_info": [ | ||
{ | ||
"parent_isbn": "9783954501823" | ||
}, | ||
{ | ||
"page_start": "2520", | ||
"page_end": "2525", | ||
"year": 2017 | ||
} | ||
], | ||
"$schema": "hep.json", | ||
"document_type": [ | ||
"article" | ||
], | ||
"abstracts": [ | ||
{ | ||
"source": "Deutsches Elektronen-Synchrotron", | ||
"value": "Dielectric laser acceleration of electrons has recently been\n demonstrated with significantly higher accelerating gradients than other\n structure-based linear accelerators. Towards the development of an integrated 1 MeV\n electron accelerator based on dielectric laser accelerator technologies,\n development in several relevant technologies is needed. In this work, recent\n developments on electron sources, bunching, accelerating, focussing, deflecting and\n laser coupling structures are reported. With an eye to the near future, components\n required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond\n electron bunches are outlined.\n " | ||
} | ||
] | ||
}] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
machine ftp_server | ||
login bob | ||
password bob |
56 changes: 56 additions & 0 deletions
56
tests/functional/desy/fixtures/ftp_server/DESY/desy_smoke.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<collection xmlns="http://www.loc.gov/MARC21/slim" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://www.loc.gov/MARC21/slim "> | ||
<record> | ||
<controlfield tag="001">1608652</controlfield> | ||
<controlfield tag="005">20170705125610.0</controlfield> | ||
<datafield tag="024" ind1="7" ind2=" "> | ||
<subfield code="2">DOI</subfield> | ||
<subfield code="a">10.18429/JACoW-IPAC2017-WEYB1</subfield> | ||
</datafield> | ||
<datafield tag="773" ind1=" " ind2=" "> | ||
<subfield code="z">9783954501823</subfield> | ||
</datafield> | ||
<datafield tag="245" ind1=" " ind2=" "> | ||
<subfield code="a">Towards a Fully Integrated Accelerator on a Chip: Dielectric Laser | ||
Acceleration (DLA) From the Source to Relativistic Electrons | ||
</subfield> | ||
<subfield code="9">JACoW</subfield> | ||
</datafield> | ||
<datafield tag="520" ind1=" " ind2=" "> | ||
<subfield code="a">Dielectric laser acceleration of electrons has recently been | ||
demonstrated with significantly higher accelerating gradients than other | ||
structure-based linear accelerators. Towards the development of an integrated 1 MeV | ||
electron accelerator based on dielectric laser accelerator technologies, | ||
development in several relevant technologies is needed. In this work, recent | ||
developments on electron sources, bunching, accelerating, focussing, deflecting and | ||
laser coupling structures are reported. With an eye to the near future, components | ||
required for a 1 MeV kinetic energy tabletop accelerator producing sub-femtosecond | ||
electron bunches are outlined. | ||
</subfield> | ||
<subfield code="9">Deutsches Elektronen-Synchrotron</subfield> | ||
</datafield> | ||
<datafield tag="300" ind1=" " ind2=" "> | ||
<subfield code="a">6</subfield> | ||
</datafield> | ||
<datafield tag="500" ind1=" " ind2=" "> | ||
<subfield code="a">*Brief entry*</subfield> | ||
</datafield> | ||
<datafield tag="773" ind1=" " ind2=" "> | ||
<subfield code="y">2017</subfield> | ||
<subfield code="c">2520-2525</subfield> | ||
</datafield> | ||
<datafield tag="856" ind1="4" ind2=" "> | ||
<subfield code="s">100176</subfield> | ||
<subfield code="u">http://inspirehep.net/record/1608652/files/Towards a fully | ||
integrated acc on a chip.pdf | ||
</subfield> | ||
<subfield code="y">Fulltext</subfield> | ||
</datafield> | ||
<datafield tag="909" ind1="C" ind2="O"> | ||
<subfield code="o">oai:inspirehep.net:1608652</subfield> | ||
<subfield code="p">INSPIRE:HEP</subfield> | ||
</datafield> | ||
</record> | ||
</collection> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
bob:$1$3ccy4I60$nSpFtRN8U6/BgmmPaxrYR/:1000:1000::/home/ftpusers/bob/./:::::::::::: |
Oops, something went wrong.