From f07202f43911d61457f09807b30d4272604c1f8d Mon Sep 17 00:00:00 2001
From: David Caro
Date: Wed, 9 Aug 2017 20:40:34 +0200
Subject: [PATCH] Minor renaming and docs fixing

Signed-off-by: David Caro
---
 docs/Makefile                   |  2 +-
 docs/conf.py                    |  7 +++
 hepcrawl/crawler2hep.py         | 77 +++++++++++++++++++++------------
 hepcrawl/spiders/desy_spider.py | 52 ++++++++++++++++------
 hepcrawl/utils.py               | 65 ++++++++++++++++++++--------
 5 files changed, 142 insertions(+), 61 deletions(-)

diff --git a/docs/Makefile b/docs/Makefile
index 28fb79f1..7d24ea24 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -24,7 +24,7 @@ PAPEROPT_letter = -D latex_paper_size=letter
 ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 # the i18n builder cannot share the environment and doctrees with the others
 I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-SPHINXAPIDOC = sphinx-apidoc -M -P -f -o $(SOURCEDIR) $(CODEDIR) -E $(CODEDIR)/spiders
+SPHINXAPIDOC = sphinx-apidoc --module-first --private --force --separate --output-dir $(SOURCEDIR) $(CODEDIR) $(CODEDIR)/spiders
 
 .PHONY: help
 help:
diff --git a/docs/conf.py b/docs/conf.py
index 7fb14e5e..2e12401b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -85,6 +85,13 @@ def _warn_node(self, msg, *args, **kwargs):
     'sphinx.ext.todo',
 ]
 
+
+autodoc_default_flags = [
+    'members',
+    'private-members',
+    'show-inheritance',
+]
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
 
diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py
index cacc5590..977b4df0 100644
--- a/hepcrawl/crawler2hep.py
+++ b/hepcrawl/crawler2hep.py
@@ -7,10 +7,18 @@
 # under the terms of the Revised BSD License; see LICENSE file for
 # more details.
 
-"""Convert a crawler record to a valid HEP record.
+"""Functions used to convert records and items from one format to another.
+
+Currently there are only two formats for records that we consider:
+
+* Hepcrawl format: the internal format used by the spiders as a middle
+  step before the pipeline; it's a generic, wider format that should have
+  at least the same info as the HEP format used by Inspire.
+
+* HEP format: Inspire-compatible format; it's the format that you get as
+  a result of the crawl.
 
-Don't forget to add pipelines to the ITEM_PIPELINES setting
-See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 """
 
 from __future__ import absolute_import, division, print_function
@@ -24,11 +32,13 @@
 
 def _get_updated_fft_fields(current_fft_fields, record_files):
     """
-    Params:
-        current_fft_fields(list(dict)): record current fft fields as generated by ``dojson``. We
-            expect each of then to have, at least, a key named ``path``.
-        record_files(list(RecordFile)): files attached to the record as populated by
-            ``FftFilesPipeline``.
+    Args:
+        current_fft_fields(list(dict)): record current fft fields as
+            generated by ``dojson``. We expect each of them to have, at
+            least, a key named ``path``.
+
+        record_files(list(RecordFile)): files attached to the record as
+            populated by :class:`hepcrawl.pipelines.FftFilesPipeline`.
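+
+    Example:
+        An illustrative sketch, assuming the fft fields and the record
+        files get matched to each other by file name (all names and paths
+        below are made up)::
+
+            current_fft_fields = [{'path': 'http://example.com/some_doc.pdf'}]
+            record_files = [RecordFile(path='/tmp/files/some_doc.pdf')]
+
+            updated_fields = _get_updated_fft_fields(
+                current_fft_fields=current_fft_fields,
+                record_files=record_files,
+            )
+            # each fft field now points to the locally downloaded copy,
+            # e.g. updated_fields[0]['path'] == '/tmp/files/some_doc.pdf'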
""" record_files_index = { record_file.name: record_file.path @@ -45,8 +55,8 @@ def _get_updated_fft_fields(current_fft_fields, record_files): def _has_publication_info(item): - """If any publication info.""" - return item.get('pubinfo_freetext') or item.get('journal_volume') or \ + return item.get('pubinfo_freetext') or \ + item.get('journal_volume') or \ item.get('journal_title') or \ item.get('journal_year') or \ item.get('journal_issue') or \ @@ -56,10 +66,10 @@ def _has_publication_info(item): item.get('journal_doctype') -def _filter_fields(item, keys): - """Filter away keys.""" +def _remove_fields(item, keys): + """Remove the given keys from the dict.""" for key in keys: - item.pop(key, None) + del(item[key]) def _normalize_hepcrawl_record(item, source): @@ -105,19 +115,21 @@ def _normalize_hepcrawl_record(item, source): item.pop('journal_year') ) - # Remove any fields - _filter_fields(item, [ - 'journal_title', - 'journal_volume', - 'journal_year', - 'journal_issue', - 'journal_fpage', - 'journal_lpage', - 'journal_doctype', - 'journal_artid', - 'pubinfo_freetext', - 'pubinfo_material', - ]) + _remove_fields( + item, + [ + 'journal_title', + 'journal_volume', + 'journal_year', + 'journal_issue', + 'journal_fpage', + 'journal_lpage', + 'journal_doctype', + 'journal_artid', + 'pubinfo_freetext', + 'pubinfo_material', + ] + ) return item @@ -136,7 +148,12 @@ def item_to_hep( item, source, ): - item.record['acquisition_source'] = _generate_acquisition_source(source=source) + """Get an output ready hep formatted record from the given + :class:`hepcrawl.utils.ParsedItem`, whatever format it's record might be. + """ + item.record['acquisition_source'] = _generate_acquisition_source( + source=source + ) if item.record_format == 'hep': return hep_to_hep( @@ -154,6 +171,12 @@ def item_to_hep( def hep_to_hep(hep_record, record_files): + """This is needed to be able to patch the ``_fft`` field in the record. + + As earlier in the process we don't really have all the files yet. It should + be used by any spiders that generate hep format instead of the internal + hepcrawl one (normally, marc-ingesting spiders). + """ if record_files: hep_record['_fft'] = _get_updated_fft_fields( current_fft_fields=hep_record['_fft'], diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py index 4f90d6e9..13fbf1f3 100644 --- a/hepcrawl/spiders/desy_spider.py +++ b/hepcrawl/spiders/desy_spider.py @@ -7,8 +7,6 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. -"""Spider for DESY.""" - from __future__ import absolute_import, division, print_function import os @@ -30,19 +28,44 @@ class DesySpider(Spider): - """Desy spider. + """This spider parses files in XML MARC format (collections or single + records). + + It can retrieve the files from a remote FTP or from a local directory, they + must have the extension ``.xml``. + + Args: + source_folder(str): Path to the folder with the MARC files to ingest, + might be collections or single records. Will be ignored if + ``ftp_host`` is passed. + + ftp_folder(str): Remote folder where to look for the XML files. + + ftp_host(str): - This spider connects to a given FTP hosts and downloads XML files - for extraction into HEP records. + ftp_netrc(str): Path to the ``.netrc`` file with the authentication + details for the ftp connection. + + destination_folder(str): Path to put the crawl results into. Will be + created if it does not exist. + + *args: will be passed to the contstructor of + :class:`scrapy.spiders.Spider`. 
     """
@@ -67,20 +90,21 @@ def __init__(
         self.source_folder = source_folder
         self.destination_folder = destination_folder
         self.ftp_enabled = True if self.ftp_host else False
+
         if not os.path.exists(self.destination_folder):
             os.makedirs(self.destination_folder)
 
     @staticmethod
-    def _list_xml_files_paths(list_files_paths):
-        return [
+    def _filter_xml_files(list_files_paths):
+        return (
             xml_file
             for xml_file in list_files_paths
             if xml_file.endswith('.xml')
-        ]
+        )
 
     def crawl_local_directory(self):
         file_names = os.listdir(self.source_folder)
-        xml_file_names = self._list_xml_files_paths(file_names)
+        xml_file_names = self._filter_xml_files(file_names)
 
         for file_name in xml_file_names:
             file_path = os.path.join(self.source_folder, file_name)
@@ -102,7 +126,7 @@ def crawl_ftp_directory(self):
             only_missing_files=False,
         )
 
-        xml_remote_files_paths = self._list_xml_files_paths(remote_files_paths)
+        xml_remote_files_paths = self._filter_xml_files(remote_files_paths)
 
         for remote_file in xml_remote_files_paths:
             self.log('Remote: Try to crawl file from FTP: {0}'.format(remote_file))
diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py
index 4d4a28db..9d66b565 100644
--- a/hepcrawl/utils.py
+++ b/hepcrawl/utils.py
@@ -70,10 +70,24 @@ def ftp_list_files(
     passive_mode=False,
     only_missing_files=True,
 ):
-    """List files from given FTP's ftp_host folder to target folder.
+    """List the files in the given FTP host folder, optionally comparing
+    them against a local destination folder.
 
-    Params:
+    Args:
+        server_folder(str): remote folder to list.
+
+        ftp_host(str): name of the host. Example: 'ftp.cern.ch'.
+
+        user(str): user name, for authentication.
+
+        password(str): password, for authentication.
+
+        destination_folder(str): local folder to compare with.
+
+        passive_mode(bool): True if it should use the firewall-friendly FTP
+            passive mode.
+
+        only_missing_files(bool): if True, will only list the files that
+            are not already in the ``destination_folder``.
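+
+    Example:
+        A minimal sketch; the host, credentials and paths are made up, and
+        every argument is passed by keyword so that the exact positional
+        order does not matter::
+
+            new_files = ftp_list_files(
+                server_folder='DESY',
+                ftp_host='ftp.example.com',
+                user='anonymous',
+                password='guest',
+                destination_folder='/tmp/desy_files',
+                only_missing_files=True,
+            )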
     """
     session_factory = ftputil.session.session_factory(
         base_class=ftplib.FTP,
@@ -295,8 +309,10 @@ def get_licenses(
 
     Args:
        license_url(str): Url of the license to generate.
-        license_text(str): Text with the description of the license (sometimes is
-            all we got...).
+
+        license_text(str): Text with the description of the license
+            (sometimes it is all we get...).
+
        license_material(str): Material of the license.
 
     Returns:
@@ -353,7 +369,9 @@ class RecordFile(object):
 
     Args:
         path(str): local path to the file.
-        name(str): Optional, name of the file, if not passed, will use the name in the path.
+
+        name(str): Optional; name of the file. If not passed, the name in
+            the ``path`` will be used.
 
     Raises:
         PathDoesNotExist:
@@ -373,22 +391,31 @@ class ParsedItem(dict):
     """Each of the individual items returned by the spider to the pipeline.
 
     Args:
-        record(dict): Information about the crawled record, might be in different formats.
-        record_format(str): Format of the above record, for example ``"hep"`` or ``"hepcrawl"``.
-        file_urls(list(str)): URLs to the files to be downloaded by ``FftFilesPipeline``.
-        ftp_params(dict): Parameter for the ``FftFilesPipeline`` to be able to connect to the
-            ftp server, if any.
-        record_files(list(RecordFile)): files attached to the record, usually populated by
-            ``FftFilesPipeline`` from the ``file_urls`` parameter.
+        record(dict): Information about the crawled record, might be in
+            different formats.
+
+        record_format(str): Format of the above record, for example
+            ``"hep"`` or ``"hepcrawl"``.
+
+        file_urls(list(str)): URLs to the files to be downloaded by
+            ``FftFilesPipeline``.
+
+        ftp_params(dict): Parameters for the
+            :class:`hepcrawl.pipelines.FftFilesPipeline` to be able to
+            connect to the FTP server, if any.
+
+        record_files(list(RecordFile)): files attached to the record,
+            usually populated by
+            :class:`hepcrawl.pipelines.FftFilesPipeline` from the
+            ``file_urls`` parameter.
     """
     def __init__(
-            self,
-            record,
-            record_format,
-            file_urls=None,
-            ftp_params=None,
-            record_files=None,
-            **kwargs
+        self,
+        record,
+        record_format,
+        file_urls=None,
+        ftp_params=None,
+        record_files=None,
+        **kwargs
     ):
         super(ParsedItem, self).__init__(
             record=record,