From f07202f43911d61457f09807b30d4272604c1f8d Mon Sep 17 00:00:00 2001
From: David Caro
Date: Wed, 9 Aug 2017 20:40:34 +0200
Subject: [PATCH] Minor renaming and docs fixing

Signed-off-by: David Caro
---
 docs/Makefile                   |  2 +-
 docs/conf.py                    |  7 +++
 hepcrawl/crawler2hep.py         | 77 +++++++++++++++++++++------------
 hepcrawl/spiders/desy_spider.py | 52 ++++++++++++++++------
 hepcrawl/utils.py               | 65 ++++++++++++++++++++--------
 5 files changed, 142 insertions(+), 61 deletions(-)

diff --git a/docs/Makefile b/docs/Makefile
index 28fb79f1..7d24ea24 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -24,7 +24,7 @@ PAPEROPT_letter = -D latex_paper_size=letter
 ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 # the i18n builder cannot share the environment and doctrees with the others
 I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-SPHINXAPIDOC = sphinx-apidoc -M -P -f -o $(SOURCEDIR) $(CODEDIR) -E $(CODEDIR)/spiders
+SPHINXAPIDOC = sphinx-apidoc --module-first --private --force --separate --output-dir $(SOURCEDIR) $(CODEDIR) $(CODEDIR)/spiders
 
 .PHONY: help
 help:
diff --git a/docs/conf.py b/docs/conf.py
index 7fb14e5e..2e12401b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -85,6 +85,13 @@ def _warn_node(self, msg, *args, **kwargs):
     'sphinx.ext.todo',
 ]
 
+
+autodoc_default_flags = [
+    'members',
+    'private-members',
+    'show-inheritance',
+]
+
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
 
diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py
index cacc5590..977b4df0 100644
--- a/hepcrawl/crawler2hep.py
+++ b/hepcrawl/crawler2hep.py
@@ -7,10 +7,18 @@
 # under the terms of the Revised BSD License; see LICENSE file for
 # more details.
 
-"""Convert a crawler record to a valid HEP record.
+"""Functions used to convert records and items from one format to another.
+
+Currently there are only two formats for records that we consider:
+
+* Hepcrawl format: the internal format used by the spiders as a middle
+  step before the pipeline; it's a generic, wider format that should have
+  at least the same info as the HEP format used by Inspire.
+
+* HEP format: Inspire-compatible format; it's the format that you get as
+  a result of the crawl.
 
-Don't forget to add pipelines to the ITEM_PIPELINES setting
-See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 """
 
 from __future__ import absolute_import, division, print_function
@@ -24,11 +32,13 @@
 
 def _get_updated_fft_fields(current_fft_fields, record_files):
     """
-    Params:
-        current_fft_fields(list(dict)): record current fft fields as generated by ``dojson``. We
-            expect each of then to have, at least, a key named ``path``.
-        record_files(list(RecordFile)): files attached to the record as populated by
-            ``FftFilesPipeline``.
+    Args:
+        current_fft_fields(list(dict)): record current fft fields as
+            generated by ``dojson``. We expect each of them to have, at
+            least, a key named ``path``.
+
+        record_files(list(RecordFile)): files attached to the record as
+            populated by :class:`hepcrawl.pipelines.FftFilesPipeline`.
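+
+    Example:
+        An illustrative sketch, assuming the fft fields and the record
+        files get matched to each other by file name (all names and paths
+        below are made up)::
+
+            current_fft_fields = [{'path': 'http://example.com/some_doc.pdf'}]
+            record_files = [RecordFile(path='/tmp/files/some_doc.pdf')]
+
+            updated_fields = _get_updated_fft_fields(
+                current_fft_fields=current_fft_fields,
+                record_files=record_files,
+            )
+            # each fft field now points to the locally downloaded copy,
+            # e.g. updated_fields[0]['path'] == '/tmp/files/some_doc.pdf'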
""" record_files_index = { record_file.name: record_file.path @@ -45,8 +55,8 @@ def _get_updated_fft_fields(current_fft_fields, record_files): def _has_publication_info(item): - """If any publication info.""" - return item.get('pubinfo_freetext') or item.get('journal_volume') or \ + return item.get('pubinfo_freetext') or \ + item.get('journal_volume') or \ item.get('journal_title') or \ item.get('journal_year') or \ item.get('journal_issue') or \ @@ -56,10 +66,10 @@ def _has_publication_info(item): item.get('journal_doctype') -def _filter_fields(item, keys): - """Filter away keys.""" +def _remove_fields(item, keys): + """Remove the given keys from the dict.""" for key in keys: - item.pop(key, None) + del(item[key]) def _normalize_hepcrawl_record(item, source): @@ -105,19 +115,21 @@ def _normalize_hepcrawl_record(item, source): item.pop('journal_year') ) - # Remove any fields - _filter_fields(item, [ - 'journal_title', - 'journal_volume', - 'journal_year', - 'journal_issue', - 'journal_fpage', - 'journal_lpage', - 'journal_doctype', - 'journal_artid', - 'pubinfo_freetext', - 'pubinfo_material', - ]) + _remove_fields( + item, + [ + 'journal_title', + 'journal_volume', + 'journal_year', + 'journal_issue', + 'journal_fpage', + 'journal_lpage', + 'journal_doctype', + 'journal_artid', + 'pubinfo_freetext', + 'pubinfo_material', + ] + ) return item @@ -136,7 +148,12 @@ def item_to_hep( item, source, ): - item.record['acquisition_source'] = _generate_acquisition_source(source=source) + """Get an output ready hep formatted record from the given + :class:`hepcrawl.utils.ParsedItem`, whatever format it's record might be. + """ + item.record['acquisition_source'] = _generate_acquisition_source( + source=source + ) if item.record_format == 'hep': return hep_to_hep( @@ -154,6 +171,12 @@ def item_to_hep( def hep_to_hep(hep_record, record_files): + """This is needed to be able to patch the ``_fft`` field in the record. + + As earlier in the process we don't really have all the files yet. It should + be used by any spiders that generate hep format instead of the internal + hepcrawl one (normally, marc-ingesting spiders). + """ if record_files: hep_record['_fft'] = _get_updated_fft_fields( current_fft_fields=hep_record['_fft'], diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py index 4f90d6e9..13fbf1f3 100644 --- a/hepcrawl/spiders/desy_spider.py +++ b/hepcrawl/spiders/desy_spider.py @@ -7,8 +7,6 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. -"""Spider for DESY.""" - from __future__ import absolute_import, division, print_function import os @@ -30,19 +28,44 @@ class DesySpider(Spider): - """Desy spider. + """This spider parses files in XML MARC format (collections or single + records). + + It can retrieve the files from a remote FTP or from a local directory, they + must have the extension ``.xml``. + + Args: + source_folder(str): Path to the folder with the MARC files to ingest, + might be collections or single records. Will be ignored if + ``ftp_host`` is passed. + + ftp_folder(str): Remote folder where to look for the XML files. + + ftp_host(str): - This spider connects to a given FTP hosts and downloads XML files - for extraction into HEP records. + ftp_netrc(str): Path to the ``.netrc`` file with the authentication + details for the ftp connection. + + destination_folder(str): Path to put the crawl results into. Will be + created if it does not exist. + + *args: will be passed to the contstructor of + :class:`scrapy.spiders.Spider`. 
     """
@@ -67,20 +90,21 @@ def __init__(
         self.source_folder = source_folder
         self.destination_folder = destination_folder
         self.ftp_enabled = True if self.ftp_host else False
+
         if not os.path.exists(self.destination_folder):
             os.makedirs(self.destination_folder)
 
     @staticmethod
-    def _list_xml_files_paths(list_files_paths):
-        return [
+    def _filter_xml_files(list_files_paths):
+        return (
             xml_file
             for xml_file in list_files_paths
             if xml_file.endswith('.xml')
-        ]
+        )
 
     def crawl_local_directory(self):
         file_names = os.listdir(self.source_folder)
-        xml_file_names = self._list_xml_files_paths(file_names)
+        xml_file_names = self._filter_xml_files(file_names)
 
         for file_name in xml_file_names:
             file_path = os.path.join(self.source_folder, file_name)
@@ -102,7 +126,7 @@ def crawl_ftp_directory(self):
             only_missing_files=False,
         )
 
-        xml_remote_files_paths = self._list_xml_files_paths(remote_files_paths)
+        xml_remote_files_paths = self._filter_xml_files(remote_files_paths)
 
         for remote_file in xml_remote_files_paths:
             self.log('Remote: Try to crawl file from FTP: {0}'.format(remote_file))
diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py
index 4d4a28db..9d66b565 100644
--- a/hepcrawl/utils.py
+++ b/hepcrawl/utils.py
@@ -70,10 +70,24 @@ def ftp_list_files(
     passive_mode=False,
     only_missing_files=True,
 ):
-    """List files from given FTP's ftp_host folder to target folder.
+    """List the files in the given FTP host folder, optionally comparing
+    them against a local destination folder.
 
-    Params:
+    Args:
+        server_folder(str): remote folder to list.
+
+        ftp_host(str): name of the host. Example: 'ftp.cern.ch'.
+
+        user(str): user name, for authentication.
+
+        password(str): password, for authentication.
+
+        destination_folder(str): local folder to compare with.
+
+        passive_mode(bool): True if it should use the firewall-friendly FTP
+            passive mode.
+
+        only_missing_files(bool): if True, will only list the files that
+            are not already in the ``destination_folder``.
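+
+    Example:
+        A minimal sketch; the host, credentials and paths are made up, and
+        every argument is passed by keyword so that the exact positional
+        order does not matter::
+
+            new_files = ftp_list_files(
+                server_folder='DESY',
+                ftp_host='ftp.example.com',
+                user='anonymous',
+                password='guest',
+                destination_folder='/tmp/desy_files',
+                only_missing_files=True,
+            )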
     """
     session_factory = ftputil.session.session_factory(
         base_class=ftplib.FTP,
@@ -295,8 +309,10 @@ def get_licenses(
 
     Args:
        license_url(str): Url of the license to generate.
-        license_text(str): Text with the description of the license (sometimes is
-            all we got...).
+
+        license_text(str): Text with the description of the license
+            (sometimes it is all we get...).
+
        license_material(str): Material of the license.
 
     Returns:
@@ -353,7 +369,9 @@ class RecordFile(object):
 
     Args:
         path(str): local path to the file.
-        name(str): Optional, name of the file, if not passed, will use the name in the path.
+
+        name(str): Optional; name of the file. If not passed, the name in
+            the ``path`` will be used.
 
     Raises:
         PathDoesNotExist:
@@ -373,22 +391,31 @@ class ParsedItem(dict):
     """Each of the individual items returned by the spider to the pipeline.
 
     Args:
-        record(dict): Information about the crawled record, might be in different formats.
-        record_format(str): Format of the above record, for example ``"hep"`` or ``"hepcrawl"``.
-        file_urls(list(str)): URLs to the files to be downloaded by ``FftFilesPipeline``.
-        ftp_params(dict): Parameter for the ``FftFilesPipeline`` to be able to connect to the
-            ftp server, if any.
-        record_files(list(RecordFile)): files attached to the record, usually populated by
-            ``FftFilesPipeline`` from the ``file_urls`` parameter.
+        record(dict): Information about the crawled record, might be in
+            different formats.
+
+        record_format(str): Format of the above record, for example
+            ``"hep"`` or ``"hepcrawl"``.
+
+        file_urls(list(str)): URLs to the files to be downloaded by
+            ``FftFilesPipeline``.
+
+        ftp_params(dict): Parameters for the
+            :class:`hepcrawl.pipelines.FftFilesPipeline` to be able to
+            connect to the FTP server, if any.
+
+        record_files(list(RecordFile)): files attached to the record,
+            usually populated by
+            :class:`hepcrawl.pipelines.FftFilesPipeline` from the
+            ``file_urls`` parameter.
     """
     def __init__(
-            self,
-            record,
-            record_format,
-            file_urls=None,
-            ftp_params=None,
-            record_files=None,
-            **kwargs
+        self,
+        record,
+        record_format,
+        file_urls=None,
+        ftp_params=None,
+        record_files=None,
+        **kwargs
     ):
         super(ParsedItem, self).__init__(
             record=record,