diff --git a/hepcrawl/spiders/alpha_spider.py b/hepcrawl/spiders/alpha_spider.py index ce334056..4282a98b 100644 --- a/hepcrawl/spiders/alpha_spider.py +++ b/hepcrawl/spiders/alpha_spider.py @@ -24,6 +24,7 @@ from ..utils import ( has_numbers, ParsedItem, + strict_kwargs, ) @@ -49,6 +50,7 @@ class AlphaSpider(StatefulSpider, CrawlSpider): domain = "http://alpha.web.cern.ch/" itertag = "//div[@class = 'node node-thesis']" + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct Alpha spider""" super(AlphaSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/aps_spider.py b/hepcrawl/spiders/aps_spider.py index 3b7f0d44..2b10f078 100644 --- a/hepcrawl/spiders/aps_spider.py +++ b/hepcrawl/spiders/aps_spider.py @@ -27,6 +27,7 @@ get_nested, build_dict, ParsedItem, + strict_kwargs, ) @@ -47,6 +48,7 @@ class APSSpider(StatefulSpider): name = 'APS' aps_base_url = "http://harvest.aps.org/v2/journals/articles" + @strict_kwargs def __init__(self, url=None, from_date=None, until_date=None, date="published", journals=None, sets=None, per_page=100, **kwargs): diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py index 1337b84f..50bd2041 100644 --- a/hepcrawl/spiders/arxiv_spider.py +++ b/hepcrawl/spiders/arxiv_spider.py @@ -24,6 +24,7 @@ get_licenses, split_fullname, ParsedItem, + strict_kwargs, ) RE_CONFERENCE = re.compile( @@ -47,15 +48,29 @@ class ArxivSpider(OAIPMHSpider): Using OAI-PMH XML files:: $ scrapy crawl arXiv \\ - -a "oai_set=physics:hep-th" -a "from_date=2017-12-13" + -a "sets=physics:hep-th" -a "from_date=2017-12-13" """ name = 'arXiv' - def __init__(self, *args, **kwargs): - kwargs.setdefault('url', 'http://export.arxiv.org/oai2') - kwargs.setdefault('format', 'arXiv') - super(ArxivSpider, self).__init__(*args, **kwargs) + @strict_kwargs + def __init__( + self, + url='http://export.arxiv.org/oai2', + format='arXiv', + sets=None, + from_date=None, + until_date=None, + **kwargs + ): + 
super(ArxivSpider, self).__init__( + url=url, + format=format, + sets=sets, + from_date=from_date, + until_date=until_date, + **kwargs + ) def parse_record(self, selector): """Parse an arXiv XML exported file into a HEP record.""" diff --git a/hepcrawl/spiders/base_spider.py b/hepcrawl/spiders/base_spider.py index 79748fde..3324242e 100644 --- a/hepcrawl/spiders/base_spider.py +++ b/hepcrawl/spiders/base_spider.py @@ -24,6 +24,7 @@ parse_domain, get_node, ParsedItem, + strict_kwargs, ) @@ -78,6 +79,7 @@ class BaseSpider(StatefulSpider, XMLFeedSpider): ("dc", "http://purl.org/dc/elements/1.1/"), ] + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct BASE spider""" super(BaseSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/brown_spider.py b/hepcrawl/spiders/brown_spider.py index fe1c340d..e7c8530e 100644 --- a/hepcrawl/spiders/brown_spider.py +++ b/hepcrawl/spiders/brown_spider.py @@ -27,6 +27,7 @@ parse_domain, get_mime_type, ParsedItem, + strict_kwargs, ) @@ -63,6 +64,7 @@ class BrownSpider(StatefulSpider, CrawlSpider): name = 'brown' start_urls = ["https://repository.library.brown.edu/api/collections/355/"] + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct Brown spider.""" super(BrownSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index 415a62f0..b96d794a 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -21,7 +21,7 @@ from scrapy.spider import XMLFeedSpider from . 
import StatefulSpider -from ..utils import ParsedItem +from ..utils import ParsedItem, strict_kwargs class CDSSpider(StatefulSpider, XMLFeedSpider): @@ -48,6 +48,7 @@ class CDSSpider(StatefulSpider, XMLFeedSpider): ('marc', 'http://www.loc.gov/MARC21/slim'), ] + @strict_kwargs def __init__(self, source_file=None, **kwargs): super(CDSSpider, self).__init__(**kwargs) self.source_file = source_file diff --git a/hepcrawl/spiders/common/oaipmh_spider.py b/hepcrawl/spiders/common/oaipmh_spider.py index fb69c741..fae38c7b 100644 --- a/hepcrawl/spiders/common/oaipmh_spider.py +++ b/hepcrawl/spiders/common/oaipmh_spider.py @@ -21,6 +21,7 @@ from scrapy.selector import Selector from .last_run_store import LastRunStoreSpider +from ...utils import strict_kwargs LOGGER = logging.getLogger(__name__) @@ -47,6 +48,7 @@ class OAIPMHSpider(LastRunStoreSpider): __metaclass__ = abc.ABCMeta name = 'OAI-PMH' + @strict_kwargs def __init__( self, url, @@ -55,9 +57,9 @@ def __init__( alias=None, from_date=None, until_date=None, - *args, **kwargs + **kwargs ): - super(OAIPMHSpider, self).__init__(*args, **kwargs) + super(OAIPMHSpider, self).__init__(**kwargs) self.url = url self.format = format if isinstance(sets, string_types): diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py index f1d40f6f..549ee6c6 100644 --- a/hepcrawl/spiders/desy_spider.py +++ b/hepcrawl/spiders/desy_spider.py @@ -23,6 +23,7 @@ ftp_list_files, ftp_connection_info, ParsedItem, + strict_kwargs, ) @@ -71,6 +72,7 @@ class DesySpider(StatefulSpider): """ name = 'desy' + @strict_kwargs def __init__( self, source_folder=None, diff --git a/hepcrawl/spiders/dnb_spider.py b/hepcrawl/spiders/dnb_spider.py index 5f243b94..436d7b22 100644 --- a/hepcrawl/spiders/dnb_spider.py +++ b/hepcrawl/spiders/dnb_spider.py @@ -22,6 +22,7 @@ parse_domain, get_node, ParsedItem, + strict_kwargs, ) @@ -64,6 +65,7 @@ class DNBSpider(StatefulSpider, XMLFeedSpider): ("slim", "http://www.loc.gov/MARC21/slim"), ] + 
@strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct DNB spider.""" super(DNBSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py index c051c8ee..48226ef6 100644 --- a/hepcrawl/spiders/edp_spider.py +++ b/hepcrawl/spiders/edp_spider.py @@ -32,6 +32,7 @@ get_node, parse_domain, ParsedItem, + strict_kwargs, ) @@ -123,6 +124,7 @@ class EDPSpider(StatefulSpider, Jats, XMLFeedSpider): 'EPJ Web of Conferences' } + @strict_kwargs def __init__(self, package_path=None, ftp_folder="incoming", ftp_netrc=None, *args, **kwargs): """Construct EDP spider. diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py index 583f996b..8755f843 100644 --- a/hepcrawl/spiders/elsevier_spider.py +++ b/hepcrawl/spiders/elsevier_spider.py @@ -33,6 +33,7 @@ range_as_string, unzip_xml_files, ParsedItem, + strict_kwargs, ) from ..dateutils import format_year @@ -138,6 +139,7 @@ class ElsevierSpider(StatefulSpider, XMLFeedSpider): ERROR_CODES = range(400, 432) + @strict_kwargs def __init__(self, atom_feed=None, zip_file=None, xml_file=None, *args, **kwargs): """Construct Elsevier spider.""" super(ElsevierSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py index 5f81f5b4..74555137 100644 --- a/hepcrawl/spiders/hindawi_spider.py +++ b/hepcrawl/spiders/hindawi_spider.py @@ -20,6 +20,7 @@ from ..utils import ( get_licenses, ParsedItem, + strict_kwargs, ) @@ -69,6 +70,7 @@ class HindawiSpider(StatefulSpider, XMLFeedSpider): ("mml", "http://www.w3.org/1998/Math/MathML"), ] + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct Hindawi spider.""" super(HindawiSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py index 2e093ab1..3e8b253b 100644 --- a/hepcrawl/spiders/infn_spider.py +++ b/hepcrawl/spiders/infn_spider.py 
@@ -25,6 +25,7 @@ from ..utils import ( get_temporary_file, ParsedItem, + strict_kwargs, ) from ..dateutils import format_date @@ -65,6 +66,7 @@ class InfnSpider(StatefulSpider, XMLFeedSpider): itertag = "//tr[@onmouseover]" today = str(datetime.date.today().year) + @strict_kwargs def __init__(self, source_file=None, year=today, *args, **kwargs): """Construct INFN spider""" super(InfnSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py index fbca3ae5..be7b349f 100644 --- a/hepcrawl/spiders/iop_spider.py +++ b/hepcrawl/spiders/iop_spider.py @@ -22,7 +22,7 @@ from ..extractors.nlm import NLM from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import ParsedItem +from ..utils import ParsedItem, strict_kwargs class IOPSpider(StatefulSpider, XMLFeedSpider, NLM): @@ -82,6 +82,7 @@ class IOPSpider(StatefulSpider, XMLFeedSpider, NLM): # FIXME: add more } + @strict_kwargs def __init__(self, zip_file=None, xml_file=None, pdf_files=None, *args, **kwargs): """Construct IOP spider.""" super(IOPSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py index 8dfd5d51..31967acc 100644 --- a/hepcrawl/spiders/magic_spider.py +++ b/hepcrawl/spiders/magic_spider.py @@ -22,6 +22,7 @@ from ..utils import ( split_fullname, ParsedItem, + strict_kwargs, ) @@ -57,6 +58,7 @@ class MagicSpider(StatefulSpider, XMLFeedSpider): ERROR_CODES = range(400, 432) + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct MAGIC spider""" super(MagicSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py index 21804873..e101b419 100644 --- a/hepcrawl/spiders/mit_spider.py +++ b/hepcrawl/spiders/mit_spider.py @@ -28,6 +28,7 @@ get_temporary_file, split_fullname, ParsedItem, + strict_kwargs, ) @@ -63,6 +64,7 @@ class MITSpider(StatefulSpider, XMLFeedSpider): itertag = 
"//ul[@class='ds-artifact-list']/li" today = str(datetime.date.today().year) + @strict_kwargs def __init__(self, source_file=None, year=today, *args, **kwargs): """Construct MIT spider""" super(MITSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py index aa54bd98..76f216a0 100644 --- a/hepcrawl/spiders/phenix_spider.py +++ b/hepcrawl/spiders/phenix_spider.py @@ -19,7 +19,7 @@ from . import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import ParsedItem +from ..utils import ParsedItem, strict_kwargs class PhenixSpider(StatefulSpider, XMLFeedSpider): @@ -50,6 +50,7 @@ class PhenixSpider(StatefulSpider, XMLFeedSpider): iterator = "html" itertag = "//table//td/ul/li" + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct PHENIX spider""" super(PhenixSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/phil_spider.py b/hepcrawl/spiders/phil_spider.py index 06f52da2..f23c7391 100644 --- a/hepcrawl/spiders/phil_spider.py +++ b/hepcrawl/spiders/phil_spider.py @@ -24,6 +24,7 @@ parse_domain, get_mime_type, ParsedItem, + strict_kwargs, ) @@ -57,6 +58,7 @@ class PhilSpider(StatefulSpider, CrawlSpider): name = 'phil' start_urls = ["http://philpapers.org/philpapers/raw/export/inspire.json"] + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct Phil spider.""" super(PhilSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index c8c67505..0e4a5700 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -26,6 +26,7 @@ get_licenses, get_first, ParsedItem, + strict_kwargs, ) @@ -76,6 +77,7 @@ class POSSpider(StatefulSpider): """ name = 'pos' + @strict_kwargs def __init__( self, source_file=None, diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py index db18eb1e..73dba273 100644 
--- a/hepcrawl/spiders/t2k_spider.py +++ b/hepcrawl/spiders/t2k_spider.py @@ -22,6 +22,7 @@ from ..utils import ( split_fullname, ParsedItem, + strict_kwargs, ) @@ -59,6 +60,7 @@ class T2kSpider(StatefulSpider, XMLFeedSpider): iterator = "html" itertag = "//table[@id='folders']//tr" + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct T2K spider""" super(T2kSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 280b6875..01e44156 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -26,6 +26,7 @@ local_list_files, unzip_xml_files, ParsedItem, + strict_kwargs, ) @@ -89,6 +90,7 @@ class WorldScientificSpider(StatefulSpider, XMLFeedSpider): 'rapid-communications' ] + @strict_kwargs def __init__( self, local_package_dir=None, diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index e504c4a6..1911ef8e 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -9,10 +9,12 @@ from __future__ import absolute_import, division, print_function +import inspect import fnmatch import os import pprint import re +from functools import wraps from operator import itemgetter from itertools import groupby from netrc import netrc @@ -382,6 +384,53 @@ def get_licenses( return license +def strict_kwargs(func): + """This decorator will disallow any keyword arguments that + do not begin with an underscore sign in the decorated method. + This is mainly to make errors while passing arguments to spiders + immediately visible. As we cannot remove kwargs from there altogether + (used by scrapyd), with this we can ensure that we are not passing unwanted + data by mistake. + + Additionaly this will add all of the 'public' not-None kwargs to an + `_init_kwargs` field in the object for easier passing and all of the + arguments (including non-overloaded ones) to `_all_kwargs`. + (To make passing them forward easier.) 
+ + Args: + func (function): a spider method + + Returns: + function: method which will disallow any keyword arguments that + do not begin with an underscore sign. + """ + argspec = inspect.getargspec(func) + defined_arguments = argspec.args[1:] + spider_fields = ['settings'] + + allowed_arguments = defined_arguments + spider_fields + + @wraps(func) + def wrapper(self, *args, **kwargs): + disallowed_kwargs = [ + key for key in kwargs + if not key.startswith('_') and key not in allowed_arguments + ] + + if disallowed_kwargs: + raise TypeError( + 'Only underscored kwargs or %s allowed in %s. ' + 'Check %s for typos.' % ( + ', '.join(spider_fields), + func, + ', '.join(disallowed_kwargs), + ) + ) + + return func(self, *args, **kwargs) + return wrapper + + class RecordFile(object): """Metadata of a file needed for a record. diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 42e6fd6f..7ed3e5fc 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -30,6 +30,7 @@ range_as_string, split_fullname, unzip_xml_files, + strict_kwargs, ) @@ -77,6 +78,18 @@ def list_for_dict(): ] +class Dummy(object): + """A sample class with @strict_kwargs constructor.""" + @strict_kwargs + def __init__(self, good1_no_default, good2=10, good3=20, good4=30, *args, **kwargs): + self.good1_no_default = good1_no_default + self.good2 = good2 + self.good3 = good3 + self.good4 = good4 + self.args = args + self.kwargs = kwargs + + def test_unzip_xml(zipfile, tmpdir): """Test unzipping of xml files using zipfile and tmpdir fixtures.""" assert len(unzip_xml_files(zipfile, six.text_type(tmpdir))) == 1 @@ -261,3 +274,26 @@ def test_get_journal_and_section_invalid(): assert journal_title == '' assert section == '' + + +def test_strict_kwargs_pass(): + """Test the `strict_kwargs` decorator allowing the kwargs.""" + dummy = Dummy( + good1_no_default=1, + good2=2, + good3=None, + _private=4, + settings={'DUMMY': True} + ) + assert callable(dummy.__init__) + assert 
dummy.good1_no_default == 1 + assert dummy.good2 == 2 + assert dummy.good3 is None + assert dummy.good4 == 30 + assert dummy.kwargs == {'_private': 4, 'settings': {'DUMMY': True}} + + +def test_strict_kwargs_fail(): + """Test the `strict_kwargs` decorator disallowing some kwargs.""" + with pytest.raises(TypeError): + Dummy(**{'good1_no_default': 1, 'good2': 2, u'bąd_þärãm': 4})