-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Spiros Delviniotis <[email protected]>
- Loading branch information
1 parent
ad88862
commit 54970bf
Showing
26 changed files
with
1,267 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# This file is part of hepcrawl. | ||
# Copyright (C) 2017 CERN. | ||
# | ||
# hepcrawl is a free software; you can redistribute it and/or modify it | ||
# under the terms of the Revised BSD License; see LICENSE file for | ||
# more details. | ||
|
||
"""Spider for DESY.""" | ||
|
||
from __future__ import absolute_import, division, print_function | ||
|
||
import os | ||
|
||
from lxml import etree | ||
|
||
|
||
from scrapy import Request | ||
from scrapy.spiders import Spider | ||
|
||
from ..utils import ( | ||
ftp_list_files, | ||
ftp_connection_info, | ||
) | ||
|
||
|
||
class DesySpider(Spider):
    """Desy spider.

    This spider reads XML files, either from a given FTP host or from a
    local folder, and extracts the MARCXML records they contain.

    Examples:
        To run a crawl, you need to pass FTP connection information via
        ``ftp_host`` and ``ftp_netrc``::

            $ scrapy crawl desy -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc'

        To run a crawl on local folder, you need to pass the absolute
        ``package_path``::

            $ scrapy crawl desy -a 'package_path=/path/to/package_dir'
    """
    name = 'desy'
    custom_settings = {}
    start_urls = []

    def __init__(
        self,
        package_path=None,
        ftp_folder='DESY',
        ftp_host=None,
        ftp_netrc=None,
        *args,
        **kwargs
    ):
        """Constructor of ``Desy`` spider.

        Args:
            package_path: local folder to crawl; when set, the FTP
                settings are not used by ``start_requests``.
            ftp_folder: folder on the FTP server to list.
            ftp_host: FTP host, passed on to ``ftp_connection_info``.
            ftp_netrc: netrc file path, passed on to
                ``ftp_connection_info``.
            *args: forwarded to ``scrapy.spiders.Spider``.
            **kwargs: forwarded to ``scrapy.spiders.Spider``.

        Raises:
            OSError: if the local target folder cannot be created.
        """
        super(DesySpider, self).__init__(*args, **kwargs)
        self.ftp_folder = ftp_folder
        self.ftp_host = ftp_host
        self.ftp_netrc = ftp_netrc
        self.package_path = package_path
        self.target_folder = '/tmp/DESY'
        # Create-then-check instead of check-then-create: another process
        # may create the folder between an ``exists`` test and ``makedirs``.
        # ``exist_ok`` is avoided because the module still supports Python 2.
        try:
            os.makedirs(self.target_folder)
        except OSError:
            if not os.path.isdir(self.target_folder):
                raise

    def start_requests(self):
        """Yield one request per file to crawl.

        When ``package_path`` is set, the regular files directly inside that
        local folder are requested through ``file://`` URLs and handled by
        ``parse``. Otherwise the FTP folder is listed and every remote file
        is requested through an ``ftp://`` URL and handled by
        ``handle_package_ftp``.
        """
        if self.package_path:
            # ``file://`` URLs need an absolute path to be well formed.
            package_dir = os.path.abspath(self.package_path)

            for file_name in os.listdir(package_dir):
                file_path = os.path.join(package_dir, file_name)
                if not os.path.isfile(file_path):
                    # Sub-directories cannot be fetched as a record file.
                    continue

                yield Request(
                    'file://{0}'.format(file_path),
                    callback=self.parse,
                )
            return

        ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc)

        remote_files_paths = ftp_list_files(
            self.ftp_folder,
            target_folder=self.target_folder,
            server=ftp_host,
            user=ftp_params['ftp_user'],
            password=ftp_params['ftp_password'],
            lst_missing_files=False,
        )

        for remote_file in remote_files_paths:
            self.log('Try to crawl file from FTP: {0}'.format(remote_file))
            remote_file = str(remote_file)
            # Build a fresh meta dict per request so that requests never
            # share (and overwrite) a single ``ftp_local_filename`` entry.
            request_meta = dict(ftp_params)
            request_meta['ftp_local_filename'] = os.path.join(
                self.target_folder,
                os.path.basename(remote_file),
            )
            remote_url = 'ftp://{0}/{1}'.format(ftp_host, remote_file)
            yield Request(
                str(remote_url),
                meta=request_meta,
                callback=self.handle_package_ftp,
            )

    def parse(self, response):
        """Parse a ``Desy`` XML file into a HEP record.

        Args:
            response: response whose ``body`` is the XML document.

        Returns:
            dict: a single ``marcxml`` key holding the list of serialized
            record elements found in the document.
        """
        self.log('Got record from url/path: {0}'.format(response.url))

        return {
            'marcxml': self._get_records(response.body),
        }

    def handle_package_ftp(self, response):
        """Request the local copy of a file fetched over FTP.

        Args:
            response: FTP response; its ``body`` is used as a local file
                path (presumably the ``ftp_local_filename`` set in
                ``start_requests`` -- confirm against the FTP download
                handler in use).

        Yields:
            Request: a ``file://`` request for that path, parsed by
            ``parse``.
        """
        self.log('Visited url {}'.format(response.url))
        file_path = response.body
        # On Python 3 the body is ``bytes`` and would be formatted as
        # "b'...'" inside the URL. On Python 2 ``bytes is str``, so this
        # branch is skipped and the value is used unchanged.
        if isinstance(file_path, bytes) and not isinstance(file_path, str):
            file_path = file_path.decode('utf-8')

        yield Request(
            'file://{0}'.format(file_path),
            meta={'package_path': file_path},
            callback=self.parse,
        )

    def _get_records(self, response_body):
        """Return every ``record`` element of an XML document, serialized.

        Args:
            response_body: XML document to parse.

        Returns:
            list: one ``etree.tostring`` result per record element.
        """
        root = etree.fromstring(response_body)
        records = root.findall('.//{http://www.loc.gov/MARC21/slim}record')
        if not records:
            # Fall back to un-namespaced ``record`` elements.
            records = root.findall('.//record')

        return [etree.tostring(record) for record in records]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.