Minor renaming and docs fixing
Signed-off-by: David Caro <[email protected]>
david-caro committed Aug 9, 2017
1 parent f35c6bc commit f07202f
Showing 5 changed files with 142 additions and 61 deletions.
2 changes: 1 addition & 1 deletion docs/Makefile
@@ -24,7 +24,7 @@ PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
SPHINXAPIDOC = sphinx-apidoc -M -P -f -o $(SOURCEDIR) $(CODEDIR) -E $(CODEDIR)/spiders
SPHINXAPIDOC = sphinx-apidoc --module-first --private --force --separate --output-dir $(SOURCEDIR) $(CODEDIR) $(CODEDIR)/spiders

.PHONY: help
help:
7 changes: 7 additions & 0 deletions docs/conf.py
@@ -85,6 +85,13 @@ def _warn_node(self, msg, *args, **kwargs):
'sphinx.ext.todo',
]


autodoc_default_flags = [
'members',
'private-members',
'show-inheritance',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

77 changes: 50 additions & 27 deletions hepcrawl/crawler2hep.py
@@ -7,10 +7,18 @@
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Convert a crawler record to a valid HEP record.
"""Functions used to convert records and items from one format to another.
Currently there are only two formats for records that we consider:
* Hepcrawl format: internal format used by the spiders as middle step
before the pipeline, it's a generic wider format that should have at
least the same info as the HEP format used by Inspire.
* HEP format: Inspire compatible format, it's the fromat that you get as a
result of the crawl.
Don't forget to add pipelines to the ITEM_PIPELINES setting
See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
"""

from __future__ import absolute_import, division, print_function
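
As a rough, purely illustrative sketch of the difference between the two formats described in the module docstring above (all field names are hypothetical examples, except the ``journal_*``, ``acquisition_source`` and ``_fft`` keys that appear elsewhere in this diff)::

    # Hypothetical "hepcrawl"-format item: flat, spider-friendly keys.
    hepcrawl_item = {
        'title': 'Some paper title',
        'journal_title': 'Some Journal',
        'journal_volume': '42',
        'journal_year': 2017,
    }

    # Hypothetical "hep"-format (Inspire-compatible) record: the same
    # information grouped the way the crawl result is expected to look.
    hep_record = {
        'titles': [{'title': 'Some paper title'}],
        'publication_info': [{
            'journal_title': 'Some Journal',
            'journal_volume': '42',
            'year': 2017,
        }],
        'acquisition_source': {'source': 'desy', 'method': 'hepcrawl'},
    }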
@@ -24,11 +24,13 @@
def _get_updated_fft_fields(current_fft_fields, record_files):
"""
Params:
current_fft_fields(list(dict)): record current fft fields as generated by ``dojson``. We
expect each of then to have, at least, a key named ``path``.
record_files(list(RecordFile)): files attached to the record as populated by
``FftFilesPipeline``.
Args:
current_fft_fields(list(dict)): the record's current fft fields as
    generated by ``dojson``. We expect each of them to have, at least,
    a key named ``path``.
record_files(list(RecordFile)): files attached to the record as
populated by :class:`hepcrawl.pipelines.FftFilesPipeline`.
"""
record_files_index = {
record_file.name: record_file.path
@@ -45,8 +55,8 @@ def _get_updated_fft_fields(current_fft_fields, record_files):


def _has_publication_info(item):
"""If any publication info."""
return item.get('pubinfo_freetext') or item.get('journal_volume') or \
return item.get('pubinfo_freetext') or \
item.get('journal_volume') or \
item.get('journal_title') or \
item.get('journal_year') or \
item.get('journal_issue') or \
Expand All @@ -56,10 +66,10 @@ def _has_publication_info(item):
item.get('journal_doctype')


def _filter_fields(item, keys):
"""Filter away keys."""
def _remove_fields(item, keys):
"""Remove the given keys from the dict."""
for key in keys:
item.pop(key, None)
del(item[key])
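
A minimal usage sketch of ``_remove_fields`` as defined above; note that ``del item[key]`` raises ``KeyError`` for a missing key, whereas ``item.pop(key, None)`` silently ignores it::

    from hepcrawl.crawler2hep import _remove_fields

    item = {'journal_title': 'Some Journal', 'journal_volume': '42'}
    _remove_fields(item, ['journal_title', 'journal_volume'])
    assert item == {}

    # A key that is no longer present would raise KeyError:
    # _remove_fields(item, ['journal_title'])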


def _normalize_hepcrawl_record(item, source):
@@ -105,19 +115,21 @@ def _normalize_hepcrawl_record(item, source):
item.pop('journal_year')
)

# Remove any fields
_filter_fields(item, [
'journal_title',
'journal_volume',
'journal_year',
'journal_issue',
'journal_fpage',
'journal_lpage',
'journal_doctype',
'journal_artid',
'pubinfo_freetext',
'pubinfo_material',
])
_remove_fields(
item,
[
'journal_title',
'journal_volume',
'journal_year',
'journal_issue',
'journal_fpage',
'journal_lpage',
'journal_doctype',
'journal_artid',
'pubinfo_freetext',
'pubinfo_material',
]
)

return item

@@ -136,7 +148,12 @@ def item_to_hep(
item,
source,
):
item.record['acquisition_source'] = _generate_acquisition_source(source=source)
"""Get an output ready hep formatted record from the given
:class:`hepcrawl.utils.ParsedItem`, whatever format it's record might be.
"""
item.record['acquisition_source'] = _generate_acquisition_source(
source=source
)

if item.record_format == 'hep':
return hep_to_hep(
@@ -154,6 +171,12 @@


def hep_to_hep(hep_record, record_files):
"""This is needed to be able to patch the ``_fft`` field in the record.
As earlier in the process we don't really have all the files yet. It should
be used by any spiders that generate hep format instead of the internal
hepcrawl one (normally, marc-ingesting spiders).
"""
if record_files:
hep_record['_fft'] = _get_updated_fft_fields(
current_fft_fields=hep_record['_fft'],
52 changes: 38 additions & 14 deletions hepcrawl/spiders/desy_spider.py
@@ -7,8 +7,6 @@
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Spider for DESY."""

from __future__ import absolute_import, division, print_function

import os
@@ -30,19 +28,44 @@


class DesySpider(Spider):
"""Desy spider.
"""This spider parses files in XML MARC format (collections or single
records).
It can retrieve the files from a remote FTP or from a local directory, they
must have the extension ``.xml``.
Args:
source_folder(str): Path to the folder with the MARC files to ingest,
might be collections or single records. Will be ignored if
``ftp_host`` is passed.
ftp_folder(str): Remote folder where to look for the XML files.
ftp_host(str):
This spider connects to a given FTP hosts and downloads XML files
for extraction into HEP records.
ftp_netrc(str): Path to the ``.netrc`` file with the authentication
details for the ftp connection.
destination_folder(str): Path to put the crawl results into. Will be
created if it does not exist.
*args: will be passed to the contstructor of
:class:`scrapy.spiders.Spider`.
**kwargs: will be passed to the contstructor of
:class:`scrapy.spiders.Spider`.
Examples:
To run a crawl, you need to pass FTP connection information via
``ftp_host`` and ``ftp_netrc``, if ``ftp_folder`` is not passed, it will fallback to
``DESY``::
``ftp_host`` and ``ftp_netrc``; if ``ftp_folder`` is not passed, it
will fall back to ``DESY``::
$ scrapy crawl desy -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc'
$ scrapy crawl desy \\
-a 'ftp_host=ftp.example.com' \\
-a 'ftp_netrc=/path/to/netrc'
To run a crawl on local folder, you need to pass the absolute ``source_folder``::
To run a crawl on a local folder, you need to pass the absolute
``source_folder``::
$ scrapy crawl desy -a 'source_folder=/path/to/package_dir'
"""
@@ -67,20 +90,21 @@ def __init__(
self.source_folder = source_folder
self.destination_folder = destination_folder
self.ftp_enabled = True if self.ftp_host else False

if not os.path.exists(self.destination_folder):
os.makedirs(self.destination_folder)

@staticmethod
def _list_xml_files_paths(list_files_paths):
return [
def _filter_xml_files(list_files_paths):
return (
xml_file
for xml_file in list_files_paths
if xml_file.endswith('.xml')
]
)
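
For reference, the renamed helper now returns a generator expression instead of building a list, so the filtering is lazy; a tiny sketch with made-up file names::

    from hepcrawl.spiders.desy_spider import DesySpider

    names = ['a.xml', 'notes.txt', 'b.xml']
    assert list(DesySpider._filter_xml_files(names)) == ['a.xml', 'b.xml']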

def crawl_local_directory(self):
file_names = os.listdir(self.source_folder)
xml_file_names = self._list_xml_files_paths(file_names)
xml_file_names = self._filter_xml_files(file_names)

for file_name in xml_file_names:
file_path = os.path.join(self.source_folder, file_name)
@@ -102,7 +126,7 @@ def crawl_ftp_directory(self):
only_missing_files=False,
)

xml_remote_files_paths = self._list_xml_files_paths(remote_files_paths)
xml_remote_files_paths = self._filter_xml_files(remote_files_paths)

for remote_file in xml_remote_files_paths:
self.log('Remote: Try to crawl file from FTP: {0}'.format(remote_file))
65 changes: 46 additions & 19 deletions hepcrawl/utils.py
Expand Up @@ -70,10 +70,24 @@ def ftp_list_files(
passive_mode=False,
only_missing_files=True,
):
"""List files from given FTP's ftp_host folder to target folder.
"""
Args:
server_folder(str): remote folder to list.
ftp_host(str): name of the host. Example: 'ftp.cern.ch'
Params:
user(str): For authentication.
password(str): For authentication.
destination_folder(str): local folder to compare with.
passive_mode(bool): True if it should use firewall-friendly FTP
    passive mode.
only_missing_files(bool): If True, will only list the files that are
    not already in the ``destination_folder``.
"""
session_factory = ftputil.session.session_factory(
base_class=ftplib.FTP,
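
A hedged usage sketch of ``ftp_list_files``, using only the keyword parameters documented above; the host, credentials and folders are placeholders, and the return value is assumed to be an iterable of remote paths, as in ``DesySpider.crawl_ftp_directory``::

    from hepcrawl.utils import ftp_list_files

    remote_paths = ftp_list_files(
        server_folder='DESY',
        destination_folder='/tmp/DESY',
        ftp_host='ftp.example.com',
        user='anonymous',
        password='secret',
        passive_mode=True,          # firewall-friendly passive mode
        only_missing_files=True,    # skip files already downloaded
    )
    for remote_path in remote_paths:
        print(remote_path)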
@@ -295,8 +309,10 @@ def get_licenses(
Args:
license_url(str): Url of the license to generate.
license_text(str): Text with the description of the license (sometimes is
all we got...).
license_text(str): Text with the description of the license (sometimes
    that is all we get...).
license_material(str): Material of the license.
Returns:
@@ -353,7 +369,9 @@ class RecordFile(object):
Args:
path(str): local path to the file.
name(str): Optional, name of the file, if not passed, will use the name in the path.
name(str): Optional name of the file; if not passed, the name in the
    ``path`` will be used.
Raises:
PathDoesNotExist:
@@ -373,22 +391,31 @@ class ParsedItem(dict):
"""Each of the individual items returned by the spider to the pipeline.
Args:
record(dict): Information about the crawled record, might be in different formats.
record_format(str): Format of the above record, for example ``"hep"`` or ``"hepcrawl"``.
file_urls(list(str)): URLs to the files to be downloaded by ``FftFilesPipeline``.
ftp_params(dict): Parameter for the ``FftFilesPipeline`` to be able to connect to the
ftp server, if any.
record_files(list(RecordFile)): files attached to the record, usually populated by
``FftFilesPipeline`` from the ``file_urls`` parameter.
record(dict): Information about the crawled record, might be in
different formats.
record_format(str): Format of the above record, for example ``"hep"``
or ``"hepcrawl"``.
file_urls(list(str)): URLs to the files to be downloaded by
``FftFilesPipeline``.
ftp_params(dict): Parameters for the
    :class:`hepcrawl.pipelines.FftFilesPipeline` to be able to connect
    to the FTP server, if any.
record_files(list(RecordFile)): files attached to the record, usually
populated by :class:`hepcrawl.pipelines.FftFilesPipeline` from the
``file_urls`` parameter.
"""
def __init__(
self,
record,
record_format,
file_urls=None,
ftp_params=None,
record_files=None,
**kwargs
self,
record,
record_format,
file_urls=None,
ftp_params=None,
record_files=None,
**kwargs
):
super(ParsedItem, self).__init__(
record=record,
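
A minimal construction sketch for ``ParsedItem``, using only the arguments documented in its docstring (the record content and URL are placeholders)::

    from hepcrawl.utils import ParsedItem

    item = ParsedItem(
        record={'titles': [{'title': 'Some paper title'}]},
        record_format='hep',
        file_urls=['ftp://ftp.example.com/some_record.xml'],
    )
    # ParsedItem subclasses dict, so the constructor arguments end up
    # stored as plain keys:
    assert item['record'] == {'titles': [{'title': 'Some paper title'}]}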
