From d8edf1c654f88ac90adf22f1faec6c8922ed10c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Tue, 30 Jan 2018 10:00:00 +0100 Subject: [PATCH 1/8] utils: strictly check kwargs in spiders (#218) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/arxiv_spider.py | 18 ++++++-- hepcrawl/spiders/common/oaipmh_spider.py | 6 ++- hepcrawl/utils.py | 58 ++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 7 deletions(-) diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py index 1337b84f..28cd032d 100644 --- a/hepcrawl/spiders/arxiv_spider.py +++ b/hepcrawl/spiders/arxiv_spider.py @@ -24,6 +24,7 @@ get_licenses, split_fullname, ParsedItem, + strict_kwargs, ) RE_CONFERENCE = re.compile( @@ -47,15 +48,22 @@ class ArxivSpider(OAIPMHSpider): Using OAI-PMH XML files:: $ scrapy crawl arXiv \\ - -a "oai_set=physics:hep-th" -a "from_date=2017-12-13" + -a "sets=physics:hep-th" -a "from_date=2017-12-13" """ name = 'arXiv' - def __init__(self, *args, **kwargs): - kwargs.setdefault('url', 'http://export.arxiv.org/oai2') - kwargs.setdefault('format', 'arXiv') - super(ArxivSpider, self).__init__(*args, **kwargs) + @strict_kwargs + def __init__( + self, + url='http://export.arxiv.org/oai2', + format='arXiv', + sets=None, + from_date=None, + until_date=None, + **kwargs + ): + super(ArxivSpider, self).__init__(**self._all_kwargs) def parse_record(self, selector): """Parse an arXiv XML exported file into a HEP record.""" diff --git a/hepcrawl/spiders/common/oaipmh_spider.py b/hepcrawl/spiders/common/oaipmh_spider.py index fb69c741..38c0e7f0 100644 --- a/hepcrawl/spiders/common/oaipmh_spider.py +++ b/hepcrawl/spiders/common/oaipmh_spider.py @@ -21,6 +21,7 @@ from scrapy.selector import Selector from .last_run_store import LastRunStoreSpider +from ...utils import strict_kwargs LOGGER = logging.getLogger(__name__) @@ -47,6 +48,7 @@ class OAIPMHSpider(LastRunStoreSpider): __metaclass__ = abc.ABCMeta name = 'OAI-PMH' + @strict_kwargs def __init__( self, url, @@ -55,9 +57,9 @@ def __init__( alias=None, from_date=None, until_date=None, - *args, **kwargs + **kwargs ): - super(OAIPMHSpider, self).__init__(*args, **kwargs) + super(OAIPMHSpider, self).__init__(**self._all_kwargs) self.url = url self.format = format if isinstance(sets, string_types): diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index e504c4a6..ae6b8b35 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -9,10 +9,12 @@ from __future__ import absolute_import, division, print_function +import inspect import fnmatch import os import pprint import re +from functools import wraps from operator import itemgetter from itertools import groupby from netrc import netrc @@ -382,6 +384,62 @@ def get_licenses( return license +def strict_kwargs(func): + """This decorator will disallow any keyword arguments that + do not begin with an underscore sign in the decorated method. + This is mainly to make errors while passing arguments to spiders + immediately visible. As we cannot remove kwargs from there altogether + (used by scrapyd), with this we can ensure that we are not passing unwanted + data by mistake. + + Additionaly this will add all of the 'public' kwargs to an `_init_kwargs` + field in the object for easier passing and all of the arguments (including + non-overloaded ones) to `_all_kwargs`. (To make passing them forward + easier.) + + Args: + func (function): a spider method + + Returns: + function: method which will disallow any keyword arguments that + do not begin with an underscore sign. + """ + argspec = inspect.getargspec(func) + defined_arguments = argspec.args[1:] + spider_fields = ['settings'] + + allowed_arguments = defined_arguments + spider_fields + + if argspec.defaults: + defaults = dict( + zip(argspec.args[-len(argspec.defaults):], argspec.defaults) + ) + else: + defaults = {} + + @wraps(func) + def wrapper(self, *args, **kwargs): + disallowed_kwargs = [ + key for key in kwargs + if not key.startswith('_') and key not in allowed_arguments + ] + + if disallowed_kwargs: + raise TypeError( + 'Only underscored kwargs allowed in {}. ' + 'Check {} for typos.'.format(func, ', '.join(disallowed_kwargs)) + ) + + defaults.update(kwargs) + self._init_kwargs = { + k: v for k, v in defaults.items() + if not k.startswith('_') and k not in spider_fields + } + self._all_kwargs = defaults + return func(self, *args, **kwargs) + return wrapper + + class RecordFile(object): """Metadata of a file needed for a record. From 48e0dfb52a7d90eb3fb9586fd4e2f0197e8c7417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Tue, 30 Jan 2018 11:06:06 +0100 Subject: [PATCH 2/8] tests: add unit tests for @strict_kwargs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- tests/unit/test_utils.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 42e6fd6f..6ef944d1 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -30,6 +30,7 @@ range_as_string, split_fullname, unzip_xml_files, + strict_kwargs, ) @@ -77,6 +78,17 @@ def list_for_dict(): ] +@pytest.fixture(scope='module') +def dummy_strict_kwargs_cls(): + """A sample class with @strict_kwargs constructor.""" + class Dummy(object): + @strict_kwargs + def __init__(self, a, b=10, c=20, *args, **kwargs): + pass + + return Dummy + + def test_unzip_xml(zipfile, tmpdir): """Test unzipping of xml files using zipfile and tmpdir fixtures.""" assert len(unzip_xml_files(zipfile, six.text_type(tmpdir))) == 1 @@ -261,3 +273,19 @@ def test_get_journal_and_section_invalid(): assert journal_title == '' assert section == '' + + +def test_strict_kwargs_pass(dummy_strict_kwargs_cls): + """Test the `strict_kwargs` decorator allowing the kwargs.""" + dummy = dummy_strict_kwargs_cls(a=1, b=2, _x=4, settings={'DUMMY': True}) + assert dummy._init_kwargs == {'a': 1, 'b': 2, 'c': 20} + assert dummy._all_kwargs == { + 'a': 1, 'b': 2, 'c': 20, '_x': 4, 'settings': {'DUMMY': True} + } + + + +def test_strict_kwargs_fail(dummy_strict_kwargs_cls): + """Test the `strict_kwargs` decorator disallowing some kwargs.""" + with pytest.raises(TypeError): + dummy_strict_kwargs_cls(a=1, b=2, d=4) From 4c7861626fb4da4b77253fee5a2b581dcfc887cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Tue, 30 Jan 2018 13:25:35 +0100 Subject: [PATCH 3/8] spiders: apply strict_kwargs to all constructors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/alpha_spider.py | 2 ++ hepcrawl/spiders/aps_spider.py | 2 ++ hepcrawl/spiders/base_spider.py | 2 ++ hepcrawl/spiders/brown_spider.py | 2 ++ hepcrawl/spiders/cds_spider.py | 3 ++- hepcrawl/spiders/desy_spider.py | 2 ++ hepcrawl/spiders/dnb_spider.py | 2 ++ hepcrawl/spiders/edp_spider.py | 2 ++ hepcrawl/spiders/elsevier_spider.py | 2 ++ hepcrawl/spiders/hindawi_spider.py | 2 ++ hepcrawl/spiders/infn_spider.py | 2 ++ hepcrawl/spiders/iop_spider.py | 3 ++- hepcrawl/spiders/magic_spider.py | 2 ++ hepcrawl/spiders/mit_spider.py | 2 ++ hepcrawl/spiders/phenix_spider.py | 3 ++- hepcrawl/spiders/phil_spider.py | 2 ++ hepcrawl/spiders/pos_spider.py | 2 ++ hepcrawl/spiders/t2k_spider.py | 2 ++ hepcrawl/spiders/wsp_spider.py | 2 ++ 19 files changed, 38 insertions(+), 3 deletions(-) diff --git a/hepcrawl/spiders/alpha_spider.py b/hepcrawl/spiders/alpha_spider.py index ce334056..4282a98b 100644 --- a/hepcrawl/spiders/alpha_spider.py +++ b/hepcrawl/spiders/alpha_spider.py @@ -24,6 +24,7 @@ from ..utils import ( has_numbers, ParsedItem, + strict_kwargs, ) @@ -49,6 +50,7 @@ class AlphaSpider(StatefulSpider, CrawlSpider): domain = "http://alpha.web.cern.ch/" itertag = "//div[@class = 'node node-thesis']" + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct Alpha spider""" super(AlphaSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/aps_spider.py b/hepcrawl/spiders/aps_spider.py index 3b7f0d44..2b10f078 100644 --- a/hepcrawl/spiders/aps_spider.py +++ b/hepcrawl/spiders/aps_spider.py @@ -27,6 +27,7 @@ get_nested, build_dict, ParsedItem, + strict_kwargs, ) @@ -47,6 +48,7 @@ class APSSpider(StatefulSpider): name = 'APS' aps_base_url = "http://harvest.aps.org/v2/journals/articles" + @strict_kwargs def __init__(self, url=None, from_date=None, until_date=None, date="published", journals=None, sets=None, per_page=100, **kwargs): diff --git a/hepcrawl/spiders/base_spider.py b/hepcrawl/spiders/base_spider.py index 79748fde..3324242e 100644 --- a/hepcrawl/spiders/base_spider.py +++ b/hepcrawl/spiders/base_spider.py @@ -24,6 +24,7 @@ parse_domain, get_node, ParsedItem, + strict_kwargs, ) @@ -78,6 +79,7 @@ class BaseSpider(StatefulSpider, XMLFeedSpider): ("dc", "http://purl.org/dc/elements/1.1/"), ] + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct BASE spider""" super(BaseSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/brown_spider.py b/hepcrawl/spiders/brown_spider.py index fe1c340d..e7c8530e 100644 --- a/hepcrawl/spiders/brown_spider.py +++ b/hepcrawl/spiders/brown_spider.py @@ -27,6 +27,7 @@ parse_domain, get_mime_type, ParsedItem, + strict_kwargs, ) @@ -63,6 +64,7 @@ class BrownSpider(StatefulSpider, CrawlSpider): name = 'brown' start_urls = ["https://repository.library.brown.edu/api/collections/355/"] + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct Brown spider.""" super(BrownSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/cds_spider.py b/hepcrawl/spiders/cds_spider.py index 415a62f0..b96d794a 100644 --- a/hepcrawl/spiders/cds_spider.py +++ b/hepcrawl/spiders/cds_spider.py @@ -21,7 +21,7 @@ from scrapy.spider import XMLFeedSpider from . import StatefulSpider -from ..utils import ParsedItem +from ..utils import ParsedItem, strict_kwargs class CDSSpider(StatefulSpider, XMLFeedSpider): @@ -48,6 +48,7 @@ class CDSSpider(StatefulSpider, XMLFeedSpider): ('marc', 'http://www.loc.gov/MARC21/slim'), ] + @strict_kwargs def __init__(self, source_file=None, **kwargs): super(CDSSpider, self).__init__(**kwargs) self.source_file = source_file diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py index f1d40f6f..549ee6c6 100644 --- a/hepcrawl/spiders/desy_spider.py +++ b/hepcrawl/spiders/desy_spider.py @@ -23,6 +23,7 @@ ftp_list_files, ftp_connection_info, ParsedItem, + strict_kwargs, ) @@ -71,6 +72,7 @@ class DesySpider(StatefulSpider): """ name = 'desy' + @strict_kwargs def __init__( self, source_folder=None, diff --git a/hepcrawl/spiders/dnb_spider.py b/hepcrawl/spiders/dnb_spider.py index 5f243b94..436d7b22 100644 --- a/hepcrawl/spiders/dnb_spider.py +++ b/hepcrawl/spiders/dnb_spider.py @@ -22,6 +22,7 @@ parse_domain, get_node, ParsedItem, + strict_kwargs, ) @@ -64,6 +65,7 @@ class DNBSpider(StatefulSpider, XMLFeedSpider): ("slim", "http://www.loc.gov/MARC21/slim"), ] + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct DNB spider.""" super(DNBSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py index c051c8ee..48226ef6 100644 --- a/hepcrawl/spiders/edp_spider.py +++ b/hepcrawl/spiders/edp_spider.py @@ -32,6 +32,7 @@ get_node, parse_domain, ParsedItem, + strict_kwargs, ) @@ -123,6 +124,7 @@ class EDPSpider(StatefulSpider, Jats, XMLFeedSpider): 'EPJ Web of Conferences' } + @strict_kwargs def __init__(self, package_path=None, ftp_folder="incoming", ftp_netrc=None, *args, **kwargs): """Construct EDP spider. diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py index 583f996b..8755f843 100644 --- a/hepcrawl/spiders/elsevier_spider.py +++ b/hepcrawl/spiders/elsevier_spider.py @@ -33,6 +33,7 @@ range_as_string, unzip_xml_files, ParsedItem, + strict_kwargs, ) from ..dateutils import format_year @@ -138,6 +139,7 @@ class ElsevierSpider(StatefulSpider, XMLFeedSpider): ERROR_CODES = range(400, 432) + @strict_kwargs def __init__(self, atom_feed=None, zip_file=None, xml_file=None, *args, **kwargs): """Construct Elsevier spider.""" super(ElsevierSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py index 5f81f5b4..74555137 100644 --- a/hepcrawl/spiders/hindawi_spider.py +++ b/hepcrawl/spiders/hindawi_spider.py @@ -20,6 +20,7 @@ from ..utils import ( get_licenses, ParsedItem, + strict_kwargs, ) @@ -69,6 +70,7 @@ class HindawiSpider(StatefulSpider, XMLFeedSpider): ("mml", "http://www.w3.org/1998/Math/MathML"), ] + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct Hindawi spider.""" super(HindawiSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py index 2e093ab1..3e8b253b 100644 --- a/hepcrawl/spiders/infn_spider.py +++ b/hepcrawl/spiders/infn_spider.py @@ -25,6 +25,7 @@ from ..utils import ( get_temporary_file, ParsedItem, + strict_kwargs, ) from ..dateutils import format_date @@ -65,6 +66,7 @@ class InfnSpider(StatefulSpider, XMLFeedSpider): itertag = "//tr[@onmouseover]" today = str(datetime.date.today().year) + @strict_kwargs def __init__(self, source_file=None, year=today, *args, **kwargs): """Construct INFN spider""" super(InfnSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py index fbca3ae5..be7b349f 100644 --- a/hepcrawl/spiders/iop_spider.py +++ b/hepcrawl/spiders/iop_spider.py @@ -22,7 +22,7 @@ from ..extractors.nlm import NLM from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import ParsedItem +from ..utils import ParsedItem, strict_kwargs class IOPSpider(StatefulSpider, XMLFeedSpider, NLM): @@ -82,6 +82,7 @@ class IOPSpider(StatefulSpider, XMLFeedSpider, NLM): # FIXME: add more } + @strict_kwargs def __init__(self, zip_file=None, xml_file=None, pdf_files=None, *args, **kwargs): """Construct IOP spider.""" super(IOPSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py index 8dfd5d51..31967acc 100644 --- a/hepcrawl/spiders/magic_spider.py +++ b/hepcrawl/spiders/magic_spider.py @@ -22,6 +22,7 @@ from ..utils import ( split_fullname, ParsedItem, + strict_kwargs, ) @@ -57,6 +58,7 @@ class MagicSpider(StatefulSpider, XMLFeedSpider): ERROR_CODES = range(400, 432) + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct MAGIC spider""" super(MagicSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py index 21804873..e101b419 100644 --- a/hepcrawl/spiders/mit_spider.py +++ b/hepcrawl/spiders/mit_spider.py @@ -28,6 +28,7 @@ get_temporary_file, split_fullname, ParsedItem, + strict_kwargs, ) @@ -63,6 +64,7 @@ class MITSpider(StatefulSpider, XMLFeedSpider): itertag = "//ul[@class='ds-artifact-list']/li" today = str(datetime.date.today().year) + @strict_kwargs def __init__(self, source_file=None, year=today, *args, **kwargs): """Construct MIT spider""" super(MITSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py index aa54bd98..76f216a0 100644 --- a/hepcrawl/spiders/phenix_spider.py +++ b/hepcrawl/spiders/phenix_spider.py @@ -19,7 +19,7 @@ from . import StatefulSpider from ..items import HEPRecord from ..loaders import HEPLoader -from ..utils import ParsedItem +from ..utils import ParsedItem, strict_kwargs class PhenixSpider(StatefulSpider, XMLFeedSpider): @@ -50,6 +50,7 @@ class PhenixSpider(StatefulSpider, XMLFeedSpider): iterator = "html" itertag = "//table//td/ul/li" + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct PHENIX spider""" super(PhenixSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/phil_spider.py b/hepcrawl/spiders/phil_spider.py index 06f52da2..f23c7391 100644 --- a/hepcrawl/spiders/phil_spider.py +++ b/hepcrawl/spiders/phil_spider.py @@ -24,6 +24,7 @@ parse_domain, get_mime_type, ParsedItem, + strict_kwargs, ) @@ -57,6 +58,7 @@ class PhilSpider(StatefulSpider, CrawlSpider): name = 'phil' start_urls = ["http://philpapers.org/philpapers/raw/export/inspire.json"] + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct Phil spider.""" super(PhilSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index c8c67505..0e4a5700 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -26,6 +26,7 @@ get_licenses, get_first, ParsedItem, + strict_kwargs, ) @@ -76,6 +77,7 @@ class POSSpider(StatefulSpider): """ name = 'pos' + @strict_kwargs def __init__( self, source_file=None, diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py index db18eb1e..73dba273 100644 --- a/hepcrawl/spiders/t2k_spider.py +++ b/hepcrawl/spiders/t2k_spider.py @@ -22,6 +22,7 @@ from ..utils import ( split_fullname, ParsedItem, + strict_kwargs, ) @@ -59,6 +60,7 @@ class T2kSpider(StatefulSpider, XMLFeedSpider): iterator = "html" itertag = "//table[@id='folders']//tr" + @strict_kwargs def __init__(self, source_file=None, *args, **kwargs): """Construct T2K spider""" super(T2kSpider, self).__init__(*args, **kwargs) diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 280b6875..01e44156 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -26,6 +26,7 @@ local_list_files, unzip_xml_files, ParsedItem, + strict_kwargs, ) @@ -89,6 +90,7 @@ class WorldScientificSpider(StatefulSpider, XMLFeedSpider): 'rapid-communications' ] + @strict_kwargs def __init__( self, local_package_dir=None, From 613b3f27235b94c8ec8d458f9849468bfd832c1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Tue, 30 Jan 2018 13:51:52 +0100 Subject: [PATCH 4/8] utils: drop None-valued fields from _init_kwargs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The motivation for this change is partly to enable passing the spider parameters as kwargs to 'action functions' which can the fail if the parameters don't match the specification (i.e. the signature) of the action. To enable this method of verification we need to drop None-valued parameters. We still leave them in _all_kwargs. Signed-off-by: Szymon Łopaciuk --- hepcrawl/utils.py | 9 +++++---- tests/unit/test_utils.py | 8 ++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index ae6b8b35..e5c007ba 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -392,10 +392,10 @@ def strict_kwargs(func): (used by scrapyd), with this we can ensure that we are not passing unwanted data by mistake. - Additionaly this will add all of the 'public' kwargs to an `_init_kwargs` - field in the object for easier passing and all of the arguments (including - non-overloaded ones) to `_all_kwargs`. (To make passing them forward - easier.) + Additionaly this will add all of the 'public' not-None kwargs to an + `_init_kwargs` field in the object for easier passing and all of the + arguments (including non-overloaded ones) to `_all_kwargs`. + (To make passing them forward easier.) Args: func (function): a spider method @@ -434,6 +434,7 @@ def wrapper(self, *args, **kwargs): self._init_kwargs = { k: v for k, v in defaults.items() if not k.startswith('_') and k not in spider_fields + and v is not None } self._all_kwargs = defaults return func(self, *args, **kwargs) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 6ef944d1..1e66dad5 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -83,7 +83,7 @@ def dummy_strict_kwargs_cls(): """A sample class with @strict_kwargs constructor.""" class Dummy(object): @strict_kwargs - def __init__(self, a, b=10, c=20, *args, **kwargs): + def __init__(self, a, b=10, c=20, d=30, *args, **kwargs): pass return Dummy @@ -277,10 +277,10 @@ def test_get_journal_and_section_invalid(): def test_strict_kwargs_pass(dummy_strict_kwargs_cls): """Test the `strict_kwargs` decorator allowing the kwargs.""" - dummy = dummy_strict_kwargs_cls(a=1, b=2, _x=4, settings={'DUMMY': True}) + dummy = dummy_strict_kwargs_cls(a=1, b=2, d=None, _x=4, settings={'DUMMY': True}) assert dummy._init_kwargs == {'a': 1, 'b': 2, 'c': 20} assert dummy._all_kwargs == { - 'a': 1, 'b': 2, 'c': 20, '_x': 4, 'settings': {'DUMMY': True} + 'a': 1, 'b': 2, 'c': 20, 'd': None, '_x': 4, 'settings': {'DUMMY': True} } @@ -288,4 +288,4 @@ def test_strict_kwargs_pass(dummy_strict_kwargs_cls): def test_strict_kwargs_fail(dummy_strict_kwargs_cls): """Test the `strict_kwargs` decorator disallowing some kwargs.""" with pytest.raises(TypeError): - dummy_strict_kwargs_cls(a=1, b=2, d=4) + dummy_strict_kwargs_cls(a=1, b=2, e=4) From 46232c1ed769e72eb83a4dca7271917857cd87ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 31 Jan 2018 15:46:13 +0100 Subject: [PATCH 5/8] utils: remove field setting from strict kwargs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove _all_kwargs and _init_kwargs from @strict_kwargs. Signed-off-by: Szymon Łopaciuk --- hepcrawl/spiders/arxiv_spider.py | 9 ++++++++- hepcrawl/spiders/common/oaipmh_spider.py | 2 +- hepcrawl/utils.py | 14 -------------- tests/unit/test_utils.py | 6 +----- 4 files changed, 10 insertions(+), 21 deletions(-) diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py index 28cd032d..50bd2041 100644 --- a/hepcrawl/spiders/arxiv_spider.py +++ b/hepcrawl/spiders/arxiv_spider.py @@ -63,7 +63,14 @@ def __init__( until_date=None, **kwargs ): - super(ArxivSpider, self).__init__(**self._all_kwargs) + super(ArxivSpider, self).__init__( + url=url, + format=format, + sets=sets, + from_date=from_date, + until_date=until_date, + **kwargs + ) def parse_record(self, selector): """Parse an arXiv XML exported file into a HEP record.""" diff --git a/hepcrawl/spiders/common/oaipmh_spider.py b/hepcrawl/spiders/common/oaipmh_spider.py index 38c0e7f0..fae38c7b 100644 --- a/hepcrawl/spiders/common/oaipmh_spider.py +++ b/hepcrawl/spiders/common/oaipmh_spider.py @@ -59,7 +59,7 @@ def __init__( until_date=None, **kwargs ): - super(OAIPMHSpider, self).__init__(**self._all_kwargs) + super(OAIPMHSpider, self).__init__(**kwargs) self.url = url self.format = format if isinstance(sets, string_types): diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index e5c007ba..d70ca23c 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -410,13 +410,6 @@ def strict_kwargs(func): allowed_arguments = defined_arguments + spider_fields - if argspec.defaults: - defaults = dict( - zip(argspec.args[-len(argspec.defaults):], argspec.defaults) - ) - else: - defaults = {} - @wraps(func) def wrapper(self, *args, **kwargs): disallowed_kwargs = [ @@ -430,13 +423,6 @@ def wrapper(self, *args, **kwargs): 'Check {} for typos.'.format(func, ', '.join(disallowed_kwargs)) ) - defaults.update(kwargs) - self._init_kwargs = { - k: v for k, v in defaults.items() - if not k.startswith('_') and k not in spider_fields - and v is not None - } - self._all_kwargs = defaults return func(self, *args, **kwargs) return wrapper diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 1e66dad5..d9c3c0f5 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -278,11 +278,7 @@ def test_get_journal_and_section_invalid(): def test_strict_kwargs_pass(dummy_strict_kwargs_cls): """Test the `strict_kwargs` decorator allowing the kwargs.""" dummy = dummy_strict_kwargs_cls(a=1, b=2, d=None, _x=4, settings={'DUMMY': True}) - assert dummy._init_kwargs == {'a': 1, 'b': 2, 'c': 20} - assert dummy._all_kwargs == { - 'a': 1, 'b': 2, 'c': 20, 'd': None, '_x': 4, 'settings': {'DUMMY': True} - } - + assert callable(dummy.__init__) def test_strict_kwargs_fail(dummy_strict_kwargs_cls): From afd9fe88bb96056b47a11dc63c2127eadcb39146 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 31 Jan 2018 16:16:40 +0100 Subject: [PATCH 6/8] tests: strict kw: change fixture to just class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- tests/unit/test_utils.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index d9c3c0f5..79e28361 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -78,15 +78,11 @@ def list_for_dict(): ] -@pytest.fixture(scope='module') -def dummy_strict_kwargs_cls(): +class Dummy(object): """A sample class with @strict_kwargs constructor.""" - class Dummy(object): - @strict_kwargs - def __init__(self, a, b=10, c=20, d=30, *args, **kwargs): - pass - - return Dummy + @strict_kwargs + def __init__(self, a, b=10, c=20, d=30, *args, **kwargs): + pass def test_unzip_xml(zipfile, tmpdir): @@ -275,13 +271,13 @@ def test_get_journal_and_section_invalid(): assert section == '' -def test_strict_kwargs_pass(dummy_strict_kwargs_cls): +def test_strict_kwargs_pass(): """Test the `strict_kwargs` decorator allowing the kwargs.""" - dummy = dummy_strict_kwargs_cls(a=1, b=2, d=None, _x=4, settings={'DUMMY': True}) + dummy = Dummy(a=1, b=2, d=None, _x=4, settings={'DUMMY': True}) assert callable(dummy.__init__) -def test_strict_kwargs_fail(dummy_strict_kwargs_cls): +def test_strict_kwargs_fail(): """Test the `strict_kwargs` decorator disallowing some kwargs.""" with pytest.raises(TypeError): - dummy_strict_kwargs_cls(a=1, b=2, e=4) + Dummy(a=1, b=2, e=4) From 167e98b7d211902e778e0b316643ed21d788a5a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Wed, 31 Jan 2018 17:08:32 +0100 Subject: [PATCH 7/8] tests: strict kw: clearer vars, support unicode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/utils.py | 2 +- tests/unit/test_utils.py | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index d70ca23c..924530b5 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -420,7 +420,7 @@ def wrapper(self, *args, **kwargs): if disallowed_kwargs: raise TypeError( 'Only underscored kwargs allowed in {}. ' - 'Check {} for typos.'.format(func, ', '.join(disallowed_kwargs)) + 'Check {} for typos.'.format(repr(func), repr(disallowed_kwargs)) ) return func(self, *args, **kwargs) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 79e28361..7ed3e5fc 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -81,8 +81,13 @@ def list_for_dict(): class Dummy(object): """A sample class with @strict_kwargs constructor.""" @strict_kwargs - def __init__(self, a, b=10, c=20, d=30, *args, **kwargs): - pass + def __init__(self, good1_no_default, good2=10, good3=20, good4=30, *args, **kwargs): + self.good1_no_default = good1_no_default + self.good2 = good2 + self.good3 = good3 + self.good4 = good4 + self.args = args + self.kwargs = kwargs def test_unzip_xml(zipfile, tmpdir): @@ -273,11 +278,22 @@ def test_get_journal_and_section_invalid(): def test_strict_kwargs_pass(): """Test the `strict_kwargs` decorator allowing the kwargs.""" - dummy = Dummy(a=1, b=2, d=None, _x=4, settings={'DUMMY': True}) + dummy = Dummy( + good1_no_default=1, + good2=2, + good3=None, + _private=4, + settings={'DUMMY': True} + ) assert callable(dummy.__init__) + assert dummy.good1_no_default == 1 + assert dummy.good2 == 2 + assert dummy.good3 == None + assert dummy.good4 == 30 + assert dummy.kwargs == {'_private': 4, 'settings': {'DUMMY': True}} def test_strict_kwargs_fail(): """Test the `strict_kwargs` decorator disallowing some kwargs.""" with pytest.raises(TypeError): - Dummy(a=1, b=2, e=4) + Dummy(**{'good1_no_default': 1, 'good2': 2, u'bąd_þärãm': 4}) From dfd1f97641ed80ff1c8c2000a2097d1754b85f0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20=C5=81opaciuk?= Date: Thu, 1 Feb 2018 17:47:49 +0100 Subject: [PATCH 8/8] utils: use %-interpolation in strict kw error msg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Szymon Łopaciuk --- hepcrawl/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index 924530b5..1911ef8e 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -419,8 +419,12 @@ def wrapper(self, *args, **kwargs): if disallowed_kwargs: raise TypeError( - 'Only underscored kwargs allowed in {}. ' - 'Check {} for typos.'.format(repr(func), repr(disallowed_kwargs)) + 'Only underscored kwargs or %s allowed in %s. ' + 'Check %s for typos.' % ( + ', '.join(spider_fields), + func, + ', '.join(disallowed_kwargs), + ) ) return func(self, *args, **kwargs)