From e4b9f4301e03db7c7dc8125b19415825443f78a3 Mon Sep 17 00:00:00 2001
From: David Caro
Date: Tue, 22 Aug 2017 16:24:53 +0200
Subject: [PATCH] global: use standard ParsedItem in the spiders

Signed-off-by: David Caro
---
 hepcrawl/pipelines.py               |  6 +++---
 hepcrawl/spiders/alpha_spider.py    | 12 ++++++++++--
 hepcrawl/spiders/aps_spider.py      | 15 +++++++++++++--
 hepcrawl/spiders/arxiv_spider.py    | 21 ++++++++++++++++-----
 hepcrawl/spiders/base_spider.py     | 15 +++++++++++++--
 hepcrawl/spiders/brown_spider.py    | 14 ++++++++++++--
 hepcrawl/spiders/desy_spider.py     |  2 +-
 hepcrawl/spiders/dnb_spider.py      | 15 +++++++++++++--
 hepcrawl/spiders/edp_spider.py      | 29 ++++++++++++++++++++---------
 hepcrawl/spiders/elsevier_spider.py | 10 ++++++++--
 hepcrawl/spiders/hindawi_spider.py  | 12 ++++++++++--
 hepcrawl/spiders/infn_spider.py     | 13 ++++++++++---
 hepcrawl/spiders/iop_spider.py      |  8 +++++++-
 hepcrawl/spiders/magic_spider.py    | 12 ++++++++++--
 hepcrawl/spiders/mit_spider.py      | 13 +++++++++++--
 hepcrawl/spiders/phenix_spider.py   |  8 +++++++-
 hepcrawl/spiders/phil_spider.py     | 13 +++++++++++--
 hepcrawl/spiders/pos_spider.py      | 18 +++++++++++++++---
 hepcrawl/spiders/t2k_spider.py      | 12 ++++++++++--
 hepcrawl/spiders/wsp_spider.py      | 29 +++++++++++++++++++++--------
 hepcrawl/testlib/celery_monitor.py  | 28 +++++++++++++++++++++++-----
 hepcrawl/testlib/fixtures.py        | 16 ++++++++++++++++
 22 files changed, 260 insertions(+), 61 deletions(-)

diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py
index 9e3d84e8..e583dc2a 100644
--- a/hepcrawl/pipelines.py
+++ b/hepcrawl/pipelines.py
@@ -24,9 +24,9 @@

 from inspire_schemas.utils import validate

-from hepcrawl.tohep import item_to_hep
-from hepcrawl.settings import FILES_STORE
-from hepcrawl.utils import RecordFile
+from .tohep import item_to_hep
+from .settings import FILES_STORE
+from .utils import RecordFile


 class FftFilesPipeline(FilesPipeline):
diff --git a/hepcrawl/spiders/alpha_spider.py b/hepcrawl/spiders/alpha_spider.py
index 2ab883f3..c791546e 100644
--- a/hepcrawl/spiders/alpha_spider.py
+++ b/hepcrawl/spiders/alpha_spider.py
@@ -20,7 +20,10 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import has_numbers
+from ..utils import (
+    has_numbers,
+    ParsedItem,
+)


 class AlphaSpider(CrawlSpider):
@@ -145,4 +148,9 @@ def parse(self, response):
         record.add_value('source', 'Alpha experiment')
         record.add_value('collections', ['HEP', 'THESIS'])

-        yield record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        yield parsed_item
diff --git a/hepcrawl/spiders/aps_spider.py b/hepcrawl/spiders/aps_spider.py
index 496e2e8e..1cda5bea 100644
--- a/hepcrawl/spiders/aps_spider.py
+++ b/hepcrawl/spiders/aps_spider.py
@@ -20,7 +20,12 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import get_licenses, get_nested, build_dict
+from ..utils import (
+    get_licenses,
+    get_nested,
+    build_dict,
+    ParsedItem,
+)


 class APSSpider(Spider):
@@ -110,7 +115,13 @@ def parse(self, response):
             record.add_value('license', license)

             record.add_value('collections', ['HEP', 'Citeable', 'Published'])
-            yield record.load_item()
+
+            parsed_item = ParsedItem(
+                record=record.load_item(),
+                record_format='hepcrawl',
+            )
+
+            yield parsed_item

         # Pagination support. Will yield until no more "next" pages are found
         if 'Link' in response.headers:
diff --git a/hepcrawl/spiders/arxiv_spider.py b/hepcrawl/spiders/arxiv_spider.py
index d82c8318..59a01295 100644
--- a/hepcrawl/spiders/arxiv_spider.py
+++ b/hepcrawl/spiders/arxiv_spider.py
@@ -16,10 +16,15 @@
 from scrapy import Request, Selector
 from scrapy.spiders import XMLFeedSpider

-from ..mappings import CONFERENCE_WORDS, THESIS_WORDS
-from ..utils import coll_cleanforthe, get_licenses, split_fullname
 from ..items import HEPRecord
 from ..loaders import HEPLoader
+from ..mappings import CONFERENCE_WORDS, THESIS_WORDS
+from ..utils import (
+    coll_cleanforthe,
+    get_licenses,
+    split_fullname,
+    ParsedItem,
+)

 RE_CONFERENCE = re.compile(r'\b(%s)\b' % '|'.join(
     [re.escape(word) for word in CONFERENCE_WORDS]), re.I | re.U)
@@ -33,7 +38,9 @@ class ArxivSpider(XMLFeedSpider):
     Example:
         Using OAI-PMH XML files::

-            $ scrapy crawl arXiv -a source_file=file://`pwd`/tests/responses/arxiv/sample_arxiv_record.xml
+            $ scrapy crawl \\
+                arXiv \\
+                -a "source_file=file://$PWD/tests/responses/arxiv/sample_arxiv_record.xml"

     """

@@ -110,8 +117,12 @@ def parse_node(self, response, node):
         )
         record.add_value('license', license)

-        parsed_record = dict(record.load_item())
-        return parsed_record
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item

     def _get_authors_or_collaboration(self, node):
         """Parse authors, affiliations; extract collaboration"""
diff --git a/hepcrawl/spiders/base_spider.py b/hepcrawl/spiders/base_spider.py
index 5eb22eb7..d7c2d06d 100644
--- a/hepcrawl/spiders/base_spider.py
+++ b/hepcrawl/spiders/base_spider.py
@@ -18,7 +18,12 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import get_mime_type, parse_domain, get_node
+from ..utils import (
+    get_mime_type,
+    parse_domain,
+    get_node,
+    ParsedItem,
+)


 class BaseSpider(XMLFeedSpider):
@@ -192,7 +197,13 @@ def build_item(self, response):
         record.add_value("authors", self.get_authors(node))
         record.add_value('thesis', {'degree_type': 'PhD'})
         record.add_value('collections', ['HEP', 'THESIS'])
-        return record.load_item()
+
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item

     def scrape_for_pdf(self, response):
         """Scrape splash page for any links to PDFs.
diff --git a/hepcrawl/spiders/brown_spider.py b/hepcrawl/spiders/brown_spider.py
index 6c881252..f17dd197 100644
--- a/hepcrawl/spiders/brown_spider.py
+++ b/hepcrawl/spiders/brown_spider.py
@@ -21,7 +21,12 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import split_fullname, parse_domain, get_mime_type
+from ..utils import (
+    split_fullname,
+    parse_domain,
+    get_mime_type,
+    ParsedItem,
+)


 class BrownSpider(CrawlSpider):
@@ -219,4 +224,9 @@ def build_item(self, response):
         record.add_value('thesis', response.meta.get("thesis"))
         record.add_value('collections', ['HEP', 'THESIS'])

-        return record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py
index 4f90d6e9..69d40619 100644
--- a/hepcrawl/spiders/desy_spider.py
+++ b/hepcrawl/spiders/desy_spider.py
@@ -22,7 +22,7 @@

 from inspire_dojson.hep import hep

-from hepcrawl.utils import (
+from ..utils import (
     ftp_list_files,
     ftp_connection_info,
     ParsedItem,
diff --git a/hepcrawl/spiders/dnb_spider.py b/hepcrawl/spiders/dnb_spider.py
index 3ac8b901..f350cf8f 100644
--- a/hepcrawl/spiders/dnb_spider.py
+++ b/hepcrawl/spiders/dnb_spider.py
@@ -16,7 +16,12 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import get_mime_type, parse_domain, get_node
+from ..utils import (
+    get_mime_type,
+    parse_domain,
+    get_node,
+    ParsedItem,
+)


 class DNBSpider(XMLFeedSpider):
@@ -219,4 +224,10 @@ def build_item(self, response):
         record.add_value('thesis', {'degree_type': 'PhD'})
         record.add_value('collections', ['HEP', 'THESIS'])

-        return record.load_item()
+
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py
index beea699d..eb6f4cf3 100644
--- a/hepcrawl/spiders/edp_spider.py
+++ b/hepcrawl/spiders/edp_spider.py
@@ -30,6 +30,7 @@
     get_licenses,
     get_node,
     parse_domain,
+    ParsedItem,
 )


@@ -65,11 +66,11 @@ class EDPSpider(Jats, XMLFeedSpider):

     To run an ``EDPSpider`` using ``rich`` format::

-        $ scrapy crawl EDP -a package_path=file://`pwd`/tests/responses/edp/test_rich.tar.bz2
+        $ scrapy crawl EDP -a source_folder=file://`pwd`/tests/responses/edp/test_rich.tar.bz2

     To run an ``EDPSpider`` using ``gz`` format::

-        $ scrapy crawl EDP -a package_path=file://`pwd`/tests/responses/edp/test_gz.tar.gz
+        $ scrapy crawl EDP -a source_folder=file://`pwd`/tests/responses/edp/test_gz.tar.gz


     Todo:
@@ -144,9 +145,9 @@ def start_requests(self):
         ftp_host, ftp_params = ftp_connection_info(
             self.ftp_host, self.ftp_netrc)
         _, new_files = ftp_list_files(
-            self.ftp_folder,
-            self.target_folder,
-            server=ftp_host,
+            server_folder=self.ftp_folder,
+            destination_folder=self.target_folder,
+            ftp_host=ftp_host,
             user=ftp_params['ftp_user'],
             password=ftp_params['ftp_password']
         )
@@ -175,7 +176,7 @@ def handle_package_ftp(self, response):
         for xml_file in xml_files:
             yield Request(
                 "file://{0}".format(xml_file),
-                meta={"package_path": zip_filepath}
+                meta={"source_folder": zip_filepath}
             )

     def handle_package_file(self, response):
@@ -188,7 +189,7 @@ def handle_package_file(self, response):
         for xml_file in xml_files:
             request = Request(
                 "file://{0}".format(xml_file),
-                meta={"package_path": zip_filepath}
+                meta={"source_folder": zip_filepath}
             )
             if "xml_rich" in xml_file:
                 request.meta["rich"] = True
@@ -318,7 +319,12 @@ def build_item_rich(self, response):
         )
         record.add_value("urls", response.meta.get("urls"))

-        return record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item

     def build_item_jats(self, response):
         """Build the final HEPRecord with JATS-format XML ('jp')."""
@@ -388,7 +394,12 @@ def build_item_jats(self, response):
             references = self._get_references(node)
             record.add_value("references", references)

-        return record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item

     def _get_references(self, node):
         """Get the references."""
diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py
index c9aacc00..b91e8372 100644
--- a/hepcrawl/spiders/elsevier_spider.py
+++ b/hepcrawl/spiders/elsevier_spider.py
@@ -31,6 +31,7 @@
     has_numbers,
     range_as_string,
     unzip_xml_files,
+    ParsedItem,
 )
 from ..dateutils import format_year

@@ -180,7 +181,7 @@ def handle_package(self, response):
             xml_url = u"file://{0}".format(os.path.abspath(xml_file))
             yield Request(
                 xml_url,
-                meta={"package_path": zip_filepath,
+                meta={"source_folder": zip_filepath,
                       "xml_url": xml_url},
             )

@@ -1034,4 +1035,9 @@ def build_item(self, response):
         record.add_value('collections', self.get_collections(doctype))
         record.add_value('references', self.get_references(node))

-        return record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py
index 941a3674..cce5e8eb 100644
--- a/hepcrawl/spiders/hindawi_spider.py
+++ b/hepcrawl/spiders/hindawi_spider.py
@@ -16,7 +16,10 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import get_licenses
+from ..utils import (
+    get_licenses,
+    ParsedItem,
+)


 class HindawiSpider(XMLFeedSpider):
@@ -222,4 +225,9 @@ def parse_node(self, response, node):
         record.add_xpath('source',
                          "./datafield[@tag='260']/subfield[@code='b']/text()")

-        return record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py
index 2e970c1c..a3457a21 100644
--- a/hepcrawl/spiders/infn_spider.py
+++ b/hepcrawl/spiders/infn_spider.py
@@ -21,8 +21,10 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import get_temporary_file
-
+from ..utils import (
+    get_temporary_file,
+    ParsedItem,
+)
 from ..dateutils import format_date


@@ -240,4 +242,9 @@ def build_item(self, response):
         record.add_value('source', 'INFN')
         record.add_value('collections', ['HEP', 'THESIS'])

-        return record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py
index 0e3bae65..288bd205 100644
--- a/hepcrawl/spiders/iop_spider.py
+++ b/hepcrawl/spiders/iop_spider.py
@@ -23,6 +23,7 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
+from ..utils import ParsedItem


 class IOPSpider(XMLFeedSpider, NLM):
@@ -222,4 +223,9 @@ def parse_node(self, response, node):
             record.add_value("additional_files",
                              self.add_fft_file(pdf_file_path, file_access, file_type))

-        return record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py
index 77bf7948..03d54618 100644
--- a/hepcrawl/spiders/magic_spider.py
+++ b/hepcrawl/spiders/magic_spider.py
@@ -18,7 +18,10 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import split_fullname
+from ..utils import (
+    split_fullname,
+    ParsedItem,
+)


 class MagicSpider(XMLFeedSpider):
@@ -176,4 +179,9 @@ def build_item(self, response):
         record.add_value("additional_files", response.meta.get("files"))
         record.add_value('collections', ['HEP', 'THESIS'])

-        yield record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        yield parsed_item
diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py
index c71234f9..5387042d 100644
--- a/hepcrawl/spiders/mit_spider.py
+++ b/hepcrawl/spiders/mit_spider.py
@@ -23,7 +23,11 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import get_temporary_file, split_fullname
+from ..utils import (
+    get_temporary_file,
+    split_fullname,
+    ParsedItem,
+)


 class MITSpider(XMLFeedSpider):
@@ -223,4 +227,9 @@ def build_item(self, response):
         record.add_value('page_nr', self.get_page_nr(node))
         record.add_value('collections', ['HEP', 'THESIS'])

-        return record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py
index 7200664e..9eaa9da0 100644
--- a/hepcrawl/spiders/phenix_spider.py
+++ b/hepcrawl/spiders/phenix_spider.py
@@ -18,6 +18,7 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
+from ..utils import ParsedItem


 class PhenixSpider(XMLFeedSpider):
@@ -128,4 +129,9 @@ def parse_node(self, response, node):
         record.add_value('source', 'PHENIX')
         record.add_value('collections', ['HEP', 'THESIS'])

-        return record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/phil_spider.py b/hepcrawl/spiders/phil_spider.py
index 101b1163..d0cddaea 100644
--- a/hepcrawl/spiders/phil_spider.py
+++ b/hepcrawl/spiders/phil_spider.py
@@ -19,7 +19,11 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import parse_domain, get_mime_type
+from ..utils import (
+    parse_domain,
+    get_mime_type,
+    ParsedItem,
+)


 class PhilSpider(CrawlSpider):
@@ -160,4 +164,9 @@ def build_item(self, response):
         if not jsonrecord.get('year') == "forthcoming":
             record.add_value('journal_year', int(jsonrecord['year']))

-        return record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item
diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py
index 7d3fb87d..152d6688 100644
--- a/hepcrawl/spiders/pos_spider.py
+++ b/hepcrawl/spiders/pos_spider.py
@@ -13,13 +13,19 @@

 import re

+from urlparse import urljoin
+
 from scrapy import Request, Selector
 from scrapy.spiders import Spider
-from urlparse import urljoin
+
-from ..utils import get_licenses, get_first
 from ..dateutils import create_valid_date
 from ..items import HEPRecord
 from ..loaders import HEPLoader
+from ..utils import (
+    get_licenses,
+    get_first,
+    ParsedItem,
+)


 class POSSpider(Spider):
@@ -128,7 +134,13 @@ def build_item(self, response):
         record.add_value('extra_data', extra_data)

         record.add_value('collections', ['HEP', 'ConferencePaper'])
-        return record.load_item()
+
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item

     def _get_ext_systems_number(self, node):
         return [
diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py
index 661f0bec..86076e46 100644
--- a/hepcrawl/spiders/t2k_spider.py
+++ b/hepcrawl/spiders/t2k_spider.py
@@ -18,7 +18,10 @@

 from ..items import HEPRecord
 from ..loaders import HEPLoader
-from ..utils import split_fullname
+from ..utils import (
+    split_fullname,
+    ParsedItem,
+)


 class T2kSpider(XMLFeedSpider):
@@ -164,4 +167,9 @@ def build_item(self, response):
         record.add_value("additional_files", response.meta.get("additional_files"))
         record.add_value('collections', ['HEP', 'THESIS'])

-        yield record.load_item()
+        parsed_item = ParsedItem(
+            record=record.load_item(),
+            record_format='hepcrawl',
+        )
+
+        yield parsed_item
diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py
index 3f68131f..058e6cc0 100644
--- a/hepcrawl/spiders/wsp_spider.py
+++ b/hepcrawl/spiders/wsp_spider.py
@@ -26,6 +26,7 @@
     local_list_files,
     get_licenses,
     unzip_xml_files,
+    ParsedItem,
 )


@@ -71,7 +72,15 @@ class WorldScientificSpider(Jats, XMLFeedSpider):
         'rapid-communications'
     ]

-    def __init__(self, package_path=None, ftp_folder="WSP", ftp_host=None, ftp_netrc=None, *args, **kwargs):
+    def __init__(
+        self,
+        package_path=None,
+        ftp_folder="/WSP",
+        ftp_host=None,
+        ftp_netrc=None,
+        *args,
+        **kwargs
+    ):
         """Construct WSP spider."""
         super(WorldScientificSpider, self).__init__(*args, **kwargs)
         self.ftp_folder = ftp_folder
@@ -97,8 +106,8 @@ def start_requests(self):

             new_files_paths = ftp_list_files(
                 self.ftp_folder,
-                self.target_folder,
-                server=ftp_host,
+                destination_folder=self.target_folder,
+                ftp_host=ftp_host,
                 user=ftp_params['ftp_user'],
                 password=ftp_params['ftp_password']
             )
@@ -126,7 +135,7 @@ def handle_package_ftp(self, response):
         for xml_file in xml_files:
             yield Request(
                 "file://{0}".format(xml_file),
-                meta={"package_path": zip_filepath}
+                meta={"source_folder": zip_filepath}
             )

     def handle_package_file(self, response):
@@ -138,7 +147,7 @@ def handle_package_file(self, response):
         for xml_file in xml_files:
             yield Request(
                 "file://{0}".format(xml_file),
-                meta={"package_path": zip_filepath}
+                meta={"source_folder": zip_filepath}
             )

     def parse_node(self, response, node):
@@ -148,7 +157,7 @@ def parse_node(self, response, node):
         self.log("Got article_type {0}".format(article_type))
         if article_type is None or article_type[0] not in self.allowed_article_types:
             # Filter out non-interesting article types
-            return None
+            return

         record = HEPLoader(item=HEPRecord(), selector=node, response=response)
         if article_type in ['correction',
@@ -203,9 +212,13 @@ def parse_node(self, response, node):
         record.add_value('license', license)

         record.add_value('collections', self._get_collections(node, article_type, journal_title))

-        parsed_record = dict(record.load_item())
-        return parsed_record
+        parsed_item = ParsedItem(
+            record=dict(record.load_item()),
+            record_format='hepcrawl',
+        )
+
+        return parsed_item

     def _get_collections(self, node, article_type, current_journal_title):
         """Return this articles' collection."""
diff --git a/hepcrawl/testlib/celery_monitor.py b/hepcrawl/testlib/celery_monitor.py
index 6c720550..1347ab22 100644
--- a/hepcrawl/testlib/celery_monitor.py
+++ b/hepcrawl/testlib/celery_monitor.py
@@ -9,7 +9,12 @@

 """Celery monitor dealing with celery tasks for functional tests."""

-from __future__ import absolute_import, division, print_function, unicode_literals
+from __future__ import (
+    absolute_import,
+    division,
+    print_function,
+    unicode_literals,
+)

 from itertools import islice

@@ -19,13 +24,14 @@


 class CeleryMonitor(object):
-    def __init__(self, app, monitor_timeout=3, monitor_iter_limit=100):
+    def __init__(self, app, monitor_timeout=3, monitor_iter_limit=100, events_limit=2):
         self.results = []
         self.recv = None
         self.app = app
         self.connection = None
         self.monitor_timeout = monitor_timeout
         self.monitor_iter_limit = monitor_iter_limit
+        self.events_limit = events_limit

     def __enter__(self):
         state = self.app.events.State()
@@ -61,10 +67,16 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         self.connection.__exit__()

     def _wait_for_results(self, events_iter):
-        any(islice(
+        generator_events = islice(
             events_iter,  # iterable
             self.monitor_iter_limit  # stop
-        ))
+        )
+        counter = 0
+        for dummy in generator_events:
+            if dummy:
+                counter += 1
+            if counter == self.events_limit:
+                break

     @classmethod
     def do_crawl(
@@ -72,6 +84,7 @@ def do_crawl(
         app,
         monitor_timeout,
         monitor_iter_limit,
+        events_limit,
         crawler_instance,
         project='hepcrawl',
         spider='WSP',
@@ -80,7 +93,12 @@ def do_crawl(
     ):
         settings = settings or {}

-        with cls(app, monitor_timeout=monitor_timeout, monitor_iter_limit=monitor_iter_limit) as my_monitor:
+        with cls(
+            app,
+            monitor_timeout=monitor_timeout,
+            monitor_iter_limit=monitor_iter_limit,
+            events_limit=events_limit
+        ) as my_monitor:
             crawler_instance.schedule(
                 project=project,
                 spider=spider,
diff --git a/hepcrawl/testlib/fixtures.py b/hepcrawl/testlib/fixtures.py
index 513b0395..73f28f96 100644
--- a/hepcrawl/testlib/fixtures.py
+++ b/hepcrawl/testlib/fixtures.py
@@ -11,6 +11,7 @@

 import os
 import json
+import shutil

 from scrapy.http import Request, TextResponse
 from scrapy.selector import Selector
@@ -131,3 +132,18 @@ def expected_json_results_from_file(*path_chunks, **kwargs):
         expected_data = json.load(fd)

     return expected_data
+
+
+def clean_dir(path):
+    """
+    Delete the given directory and all of its contents.
+
+    Args:
+        path: Absolute path of the directory to be removed.
+
+    Example:
+
+        >>> clean_dir('/dir_1/dir_11/')
+
+    """
+    shutil.rmtree(path, ignore_errors=True)
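
A minimal sketch of the new contract introduced above (an illustration, not
part of the commit): every spider now yields the same two-field wrapper
instead of a bare record dict. Only the ParsedItem(record=..., record_format=...)
call and the hepcrawl.utils import path are taken from the diff; the literal
dict below is a hypothetical stand-in for the usual record.load_item() output.

    from hepcrawl.utils import ParsedItem

    # Stand-in for the HEPLoader output a spider would normally pass in.
    record = {'title': 'A sample thesis record'}

    # 'hepcrawl' tags records produced in the internal HEPRecord format,
    # so downstream consumers can branch on record_format uniformly.
    parsed_item = ParsedItem(
        record=record,
        record_format='hepcrawl',
    )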