diff --git a/.gitignore b/.gitignore index 01895086..3b606de7 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,8 @@ nosetests.xml coverage.xml twistd.pid .coverage.* +tests/unit/responses/edp/test_gz +tests/unit/responses/edp/test_rich # Translations *.mo @@ -57,6 +59,8 @@ jobs dbs items logs +.scrapy +scrapy_feed_uri # Local settings local_settings.py diff --git a/.travis.yml b/.travis.yml index 91407e6e..53f7dee5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ before_install: install: - travis_retry docker-compose -f docker-compose.deps.yml run --rm pip - - travis_retry docker-compose -f docker-compose.test.yml run --rm scrapyd_deploy + - travis_retry docker-compose -f docker-compose.test.yml run --rm scrapyd-deploy script: - travis_retry docker-compose -f docker-compose.test.yml run --rm ${SUITE} diff --git a/docker-compose.test.yml b/docker-compose.test.yml index d14d7c87..074f50ce 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -18,6 +18,7 @@ services: - APP_CRAWLER_HOST_URL=http://scrapyd:6800 - APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results - APP_FILES_STORE=/tmp/file_urls + - APP_CRAWL_ONCE_PATH=/code/.scrapy - COVERAGE_PROCESS_START=/code/.coveragerc - BASE_USER_UID=${BASE_USER_UID:-1000} - BASE_USER_GIT=${BASE_USER_GIT:-1000} @@ -65,10 +66,17 @@ services: command: bash -c "rm -f twistd.pid && exec scrapyd" links: - celery + healthcheck: + timeout: 5s + interval: 5s + retries: 5 + test: + - "CMD-SHELL" + - "curl http://localhost:6800/listprojects.json" - scrapyd_deploy: + scrapyd-deploy: <<: *service_base - command: bash -c "sleep 8 && scrapyd-deploy" # make sure that the scrapyd is up + command: bash -c "scrapyd-deploy" links: - scrapyd diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py index dab5c7e4..99551e93 100644 --- a/hepcrawl/middlewares.py +++ b/hepcrawl/middlewares.py @@ -11,10 +11,23 @@ from __future__ import absolute_import, division, print_function -class ErrorHandlingMiddleware(object): +import os +import time +import logging + +from ftplib import FTP +from six.moves.urllib.parse import urlparse + +from scrapy.exceptions import IgnoreRequest +from scrapy_crawl_once.middlewares import CrawlOnceMiddleware + +from hepcrawl.utils import ftp_connection_info + - """Log errors.""" +LOGGER = logging.getLogger(__name__) + +class ErrorHandlingMiddleware(object): @classmethod def from_crawler(cls, crawler): return cls(crawler.settings) @@ -24,13 +37,142 @@ def __init__(self, settings): def process_spider_exception(self, response, exception, spider): """Register the error in the spider and continue.""" - self.process_exception(response, exception, spider) + return self.process_exception(response, exception, spider) def process_exception(self, request, exception, spider): """Register the error in the spider and continue.""" - if 'errors' not in spider.state: - spider.state['errors'] = [] - spider.state['errors'].append({ + spider.state.setdefault('errors', []).append({ 'exception': exception, 'sender': request, }) + + +class HepcrawlCrawlOnceMiddleware(CrawlOnceMiddleware): + """ + This spider and downloader middleware allows to avoid re-crawling pages + which were already downloaded in previous crawls. + + To enable it, modify ``settings.py``:: + + SPIDER_MIDDLEWARES = { + # ... + 'scrapy_crawl_once.CrawlOnceMiddleware': 100, + # ... + } + + DOWNLOADER_MIDDLEWARES = { + # ... + 'scrapy_crawl_once.CrawlOnceMiddleware': 50, + # ... + } + + By default it does nothing. 
To avoid crawling a particular page + multiple times set ``request.meta['crawl_once'] = True``. Other + ``request.meta`` keys that modify it's behavior: + + * ``crawl_once_value`` - a value to store in DB. By default, timestamp + is stored for Http/Https requests and last-modified is stored for + FTP/File requests. + * ``crawl_once_key`` - unique file name is used. + + Settings: + + * ``CRAWL_ONCE_ENABLED``:set it to False to disable middleware. Default + is True. + * ``CRAWL_ONCE_PATH``: a path to a folder with crawled requests database. + By default ``.scrapy/crawl_once/`` path is used; this folder contains + ``.sqlite`` files with databases of seen requests. + * ``CRAWL_ONCE_DEFAULT``: default value for ``crawl_once`` meta key (False + by default). When True, all requests are handled by this middleware + unless disabled explicitly using + ``request.meta['crawl_once'] = False``. + + + For more info see: https://github.com/TeamHG-Memex/scrapy-crawl-once + """ + def process_request(self, request, spider): + if not request.meta.get('crawl_once', self.default): + if 'crawl_once' in request.meta: + LOGGER.info('Crawl-Once: skipping by explicit crawl_once meta') + else: + LOGGER.info('Crawl-Once: skipping by default crawl_once meta') + return + + request.meta['crawl_once_key'] = self._get_key(request) + request.meta['crawl_once_value'] = self._get_timestamp(request, spider) + + if not self._has_to_be_crawled(request, spider): + LOGGER.info( + 'Crawl-Once: Skipping due to `has_to_be_crawled`, %s' % request + ) + self.stats.inc_value('crawl_once/ignored') + raise IgnoreRequest() + + LOGGER.info( + 'Crawl-Once: Not skipping: %s' % request + ) + + def _has_to_be_crawled(self, request, spider): + request_db_key = self._get_key(request) + + if request_db_key not in self.db: + return True + + new_file_timestamp = self._get_timestamp(request, spider) + old_file_timestamp = self.db.get(key=request_db_key) + return new_file_timestamp > old_file_timestamp + + def _get_key(self, request): + parsed_url = urlparse(request.url) + fname = os.path.basename(parsed_url.path) + if parsed_url.scheme == 'file': + prefix = 'local' + else: + prefix = 'remote' + + return prefix + '::' + fname + + @classmethod + def _get_timestamp(cls, request, spider): + parsed_url = urlparse(request.url) + full_url = request.url + if parsed_url.scheme == 'ftp': + last_modified = cls._get_ftp_timestamp(spider, full_url) + elif parsed_url.scheme == 'file': + last_modified = cls._get_file_timestamp(full_url) + else: + last_modified = time.time() + + return last_modified + + @classmethod + def _get_ftp_timestamp(cls, spider, url): + ftp_host, params = ftp_connection_info( + spider.ftp_host, + spider.ftp_netrc, + ) + ftp = FTP( + host=ftp_host, + user=params['ftp_user'], + passwd=params['ftp_password'], + ) + return ftp.sendcmd( + 'MDTM {}'.format( + cls._get_ftp_relative_path( + url=url, + host=ftp_host + ) + ) + ) + + @staticmethod + def _get_ftp_relative_path(url, host): + return url.replace( + 'ftp://{0}/'.format(host), + '', + ) + + @staticmethod + def _get_file_timestamp(url): + file_path = url.replace('file://', '') + return os.stat(file_path).st_mtime diff --git a/hepcrawl/pipelines.py b/hepcrawl/pipelines.py index 8cd31c0e..d9949338 100644 --- a/hepcrawl/pipelines.py +++ b/hepcrawl/pipelines.py @@ -16,6 +16,8 @@ from __future__ import absolute_import, division, print_function import os +import shutil +import pprint import requests @@ -92,10 +94,16 @@ def open_spider(self, spider): def _post_enhance_item(self, item, 
spider): source = spider.name - return item_to_hep( + enhanced_record = item_to_hep( item=item, source=source, ) + spider.logger.debug( + 'Got post-enhanced hep record:\n%s' % pprint.pformat( + enhanced_record + ) + ) + return enhanced_record def process_item(self, item, spider): """Convert internal format to INSPIRE data model.""" @@ -124,7 +132,8 @@ def _prepare_payload(self, spider): ] return payload - def _cleanup(self, spider): + @staticmethod + def _cleanup(spider): """Run cleanup.""" # Cleanup errors if 'errors' in spider.state: @@ -175,6 +184,10 @@ def close_spider(self, spider): """Post results to BROKER API.""" from celery.utils.log import get_task_logger logger = get_task_logger(__name__) + + if hasattr(spider, 'tmp_dir'): + shutil.rmtree(path=spider.tmp_dir, ignore_errors=True) + if 'SCRAPY_JOB' in os.environ and self.count > 0: task_endpoint = spider.settings[ 'API_PIPELINE_TASK_ENDPOINT_MAPPING' diff --git a/hepcrawl/settings.py b/hepcrawl/settings.py index 31d608bf..216f7694 100644 --- a/hepcrawl/settings.py +++ b/hepcrawl/settings.py @@ -62,14 +62,23 @@ # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html SPIDER_MIDDLEWARES = { 'hepcrawl.middlewares.ErrorHandlingMiddleware': 543, + 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } # Enable or disable downloader middlewares # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES = { 'hepcrawl.middlewares.ErrorHandlingMiddleware': 543, + 'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100, } +CRAWL_ONCE_ENABLED = True +CRAWL_ONCE_DEFAULT = True +CRAWL_ONCE_PATH = os.environ.get( + 'APP_CRAWL_ONCE_PATH', + '/var/lib/scrapy/crawl_once/', +) + # Enable or disable extensions # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html EXTENSIONS = { diff --git a/hepcrawl/spiders/desy_spider.py b/hepcrawl/spiders/desy_spider.py index ec70ec39..efae063e 100644 --- a/hepcrawl/spiders/desy_spider.py +++ b/hepcrawl/spiders/desy_spider.py @@ -178,13 +178,18 @@ def start_requests(self): yield request @staticmethod - def _get_full_uri(current_path, base_url, schema, hostname=''): + def _get_full_uri(current_path, base_url, schema, hostname=None): + hostname = hostname or '' if os.path.isabs(current_path): full_path = current_path else: full_path = os.path.join(base_url, current_path) - return '{schema}://{hostname}{full_path}'.format(**vars()) + return '{schema}://{hostname}{full_path}'.format( + schema=schema, + hostname=hostname, + full_path=full_path, + ) def parse(self, response): """Parse a ``Desy`` XML file into a :class:`hepcrawl.utils.ParsedItem`. 
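
The ``CRAWL_ONCE_*`` settings and the ``HepcrawlCrawlOnceMiddleware`` registration added above make request deduplication opt-out rather than opt-in (``CRAWL_ONCE_DEFAULT = True``). A minimal sketch of how a single request interacts with this, using a hypothetical ``ExampleSpider`` that is not part of this change set::

    from scrapy import Request, Spider


    class ExampleSpider(Spider):
        # Illustrative only; shows the per-request crawl_once switch.
        name = 'example'

        def start_requests(self):
            # Handled by HepcrawlCrawlOnceMiddleware: the request is recorded
            # in the CRAWL_ONCE_PATH sqlite DB and skipped on later runs,
            # unless its timestamp (st_mtime for file://, MDTM for ftp://,
            # time.time() otherwise) is newer than the stored value.
            yield Request('file:///data/records.xml', callback=self.parse)

            # Explicit opt-out: this URL is fetched on every crawl.
            yield Request(
                'file:///data/always_refresh.xml',
                meta={'crawl_once': False},
                callback=self.parse,
            )

        def parse(self, response):
            self.logger.info('Visited %s', response.url)
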
@@ -208,8 +213,12 @@ def parse(self, response): url_schema = 'file' hostname = None + self.log('Getting marc xml records...') marcxml_records = self._get_marcxml_records(response.body) + self.log('Got %d marc xml records' % len(marcxml_records)) + self.log('Getting hep records...') hep_records = self._hep_records_from_marcxml(marcxml_records) + self.log('Got %d hep records' % len(hep_records)) for hep_record in hep_records: list_file_urls = [ @@ -222,12 +231,14 @@ def parse(self, response): for fft_path in hep_record['_fft'] ] + self.log('Got the following fft urls: %s' % list_file_urls) parsed_item = ParsedItem( record=hep_record, file_urls=list_file_urls, ftp_params=ftp_params, record_format='hep', ) + self.log('Got item: %s' % parsed_item) yield parsed_item diff --git a/hepcrawl/spiders/wsp_spider.py b/hepcrawl/spiders/wsp_spider.py index 058e6cc0..1868c9c5 100644 --- a/hepcrawl/spiders/wsp_spider.py +++ b/hepcrawl/spiders/wsp_spider.py @@ -13,6 +13,7 @@ import os import urlparse +import tempfile from scrapy import Request from scrapy.spiders import XMLFeedSpider @@ -42,22 +43,40 @@ class WorldScientificSpider(Jats, XMLFeedSpider): on the remote server and downloads them to a designated local folder, using ``WorldScientificSpider.start_requests()``. 2. Then the ZIP file is unpacked and it lists all the XML files found - inside, via ``WorldScientificSpider.handle_package()``. Note the callback from - ``WorldScientificSpider.start_requests()``. - 3. Finally, now each XML file is parsed via ``WorldScientificSpider.parse_node()``. + inside, via ``WorldScientificSpider.handle_package()``. Note the + callback from ``WorldScientificSpider.start_requests()``. + 3. Finally, now each XML file is parsed via + ``WorldScientificSpider.parse_node()``. + + + Args: + local_package_dir(str): path to the local directory holding the zip + files to parse and extract the records for, if set, will ignore all + the ftp options. + ftp_folder(str): remote folder in the ftp server to get the zip files + from. + ftp_host(str): host name of the ftp server to connect to. + ftp_netrc(str): path to the netrc file containing the authentication + settings for the ftp. + target_folder(str): path to the temporary local directory to download + the files to. 
Example: To run a crawl, you need to pass FTP connection information via ``ftp_host`` and ``ftp_netrc``:: - $ scrapy crawl WSP -a 'ftp_host=ftp.example.com' -a 'ftp_netrc=/path/to/netrc' + $ scrapy crawl \\ + WSP \\ + -a 'ftp_host=ftp.example.com' \\ + -a 'ftp_netrc=/path/to/netrc' """ name = 'WSP' custom_settings = {} start_urls = [] - iterator = 'iternodes' # This is actually unnecessary, since it's the default value + # This is actually unnecessary, since it's the default value + iterator = 'iternodes' itertag = 'article' allowed_article_types = [ @@ -74,10 +93,11 @@ class WorldScientificSpider(Jats, XMLFeedSpider): def __init__( self, - package_path=None, - ftp_folder="/WSP", + local_package_dir=None, + ftp_folder="WSP", ftp_host=None, ftp_netrc=None, + target_folder=None, *args, **kwargs ): @@ -86,45 +106,62 @@ def __init__( self.ftp_folder = ftp_folder self.ftp_host = ftp_host self.ftp_netrc = ftp_netrc - self.target_folder = "/tmp/WSP" - self.package_path = package_path - if not os.path.exists(self.target_folder): - os.makedirs(self.target_folder) + self.target_folder = ( + target_folder or + tempfile.mkdtemp(suffix='_extracted_zip', prefix='wsp_') + ) + self.local_package_dir = local_package_dir - def start_requests(self): - """List selected folder on remote FTP and yield new zip files.""" - if self.package_path: - new_files_paths = local_list_files( - self.package_path, - self.target_folder + def _get_local_requests(self): + new_files_paths = local_list_files( + self.local_package_dir, + self.target_folder + ) + + for file_path in new_files_paths: + yield Request( + "file://{0}".format(file_path), + callback=self.handle_package_file, ) - for file_path in new_files_paths: - yield Request("file://{0}".format(file_path), callback=self.handle_package_file) - else: - ftp_host, ftp_params = ftp_connection_info(self.ftp_host, self.ftp_netrc) - - new_files_paths = ftp_list_files( - self.ftp_folder, - destination_folder=self.target_folder, - ftp_host=ftp_host, - user=ftp_params['ftp_user'], - password=ftp_params['ftp_password'] + def _get_remote_requests(self): + ftp_host, ftp_params = ftp_connection_info( + self.ftp_host, + self.ftp_netrc, + ) + + new_files_paths = ftp_list_files( + self.ftp_folder, + destination_folder=self.target_folder, + ftp_host=ftp_host, + user=ftp_params['ftp_user'], + password=ftp_params['ftp_password'] + ) + + for remote_file in new_files_paths: + # Cast to byte-string for scrapy compatibility + remote_file = str(remote_file) + ftp_params["ftp_local_filename"] = os.path.join( + self.target_folder, + os.path.basename(remote_file) ) - for remote_file in new_files_paths: - # Cast to byte-string for scrapy compatibility - remote_file = str(remote_file) - ftp_params["ftp_local_filename"] = os.path.join( - self.target_folder, - os.path.basename(remote_file) - ) - remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) - yield Request( - str(remote_url), - meta=ftp_params, - callback=self.handle_package_ftp - ) + remote_url = "ftp://{0}/{1}".format(ftp_host, remote_file) + yield Request( + str(remote_url), + meta=ftp_params, + callback=self.handle_package_ftp + ) + + def start_requests(self): + """List selected folder on remote FTP and yield new zip files.""" + if self.local_package_dir: + requests_iter = self._get_local_requests() + else: + requests_iter = self._get_remote_requests() + + for request in requests_iter: + yield request def handle_package_ftp(self, response): """Handle a zip package and yield every XML found.""" @@ -132,6 +169,7 @@ def 
handle_package_ftp(self, response): zip_filepath = response.body zip_target_folder, dummy = os.path.splitext(zip_filepath) xml_files = unzip_xml_files(zip_filepath, zip_target_folder) + for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), @@ -142,8 +180,8 @@ def handle_package_file(self, response): """Handle a local zip package and yield every XML.""" self.log("Visited file %s" % response.url) zip_filepath = urlparse.urlsplit(response.url).path - zip_target_folder, dummy = os.path.splitext(zip_filepath) - xml_files = unzip_xml_files(zip_filepath, zip_target_folder) + xml_files = unzip_xml_files(zip_filepath, self.target_folder) + for xml_file in xml_files: yield Request( "file://{0}".format(xml_file), @@ -155,14 +193,20 @@ def parse_node(self, response, node): node.remove_namespaces() article_type = node.xpath('@article-type').extract() self.log("Got article_type {0}".format(article_type)) - if article_type is None or article_type[0] not in self.allowed_article_types: + if ( + article_type is None or + article_type[0] not in self.allowed_article_types + ): # Filter out non-interesting article types return record = HEPLoader(item=HEPRecord(), selector=node, response=response) if article_type in ['correction', 'addendum']: - record.add_xpath('related_article_doi', "//related-article[@ext-link-type='doi']/@href") + record.add_xpath( + 'related_article_doi', + "//related-article[@ext-link-type='doi']/@href", + ) record.add_value('journal_doctype', article_type) dois = node.xpath("//article-id[@pub-id-type='doi']/text()").extract() @@ -211,7 +255,10 @@ def parse_node(self, response, node): ) record.add_value('license', license) - record.add_value('collections', self._get_collections(node, article_type, journal_title)) + record.add_value( + 'collections', + self._get_collections(node, article_type, journal_title), + ) parsed_item = ParsedItem( record=dict(record.load_item()), @@ -220,10 +267,16 @@ def parse_node(self, response, node): return parsed_item - def _get_collections(self, node, article_type, current_journal_title): + @staticmethod + def _get_collections(node, article_type, current_journal_title): """Return this articles' collection.""" conference = node.xpath('.//conference').extract() - if conference or current_journal_title == "International Journal of Modern Physics: Conference Series": + if ( + conference or + current_journal_title == ( + "International Journal of Modern Physics: Conference Series" + ) + ): return ['HEP', 'ConferencePaper'] elif article_type == "review-article": return ['HEP', 'Review'] diff --git a/hepcrawl/testlib/celery_monitor.py b/hepcrawl/testlib/celery_monitor.py index 1347ab22..c8d94949 100644 --- a/hepcrawl/testlib/celery_monitor.py +++ b/hepcrawl/testlib/celery_monitor.py @@ -24,7 +24,13 @@ class CeleryMonitor(object): - def __init__(self, app, monitor_timeout=3, monitor_iter_limit=100, events_limit=2): + def __init__( + self, + app, + monitor_timeout=3, + monitor_iter_limit=100, + events_limit=2, + ): self.results = [] self.recv = None self.app = app @@ -39,7 +45,13 @@ def __enter__(self): def announce_succeeded_tasks(event): state.event(event) task = state.tasks.get(event['uuid']) - LOGGER.info('TASK SUCCEEDED: %s[%s] %s' % (task.name, task.uuid, task.info(),)) + LOGGER.info( + 'TASK SUCCEEDED: %s[%s] %s' % ( + task.name, + task.uuid, + task.info(), + ) + ) tasks = self.app.AsyncResult(task.id) for task in tasks.result: self.results.append(task) @@ -48,7 +60,9 @@ def announce_succeeded_tasks(event): def announce_failed_tasks(event): 
state.event(event) task = state.tasks.get(event['uuid']) - LOGGER.info('TASK FAILED: %s[%s] %s' % (task.name, task.uuid, task.info(),)) + LOGGER.info( + 'TASK FAILED: %s[%s] %s' % (task.name, task.uuid, task.info(),) + ) self.results.append(task.info()) self.recv.should_stop = True @@ -62,7 +76,11 @@ def announce_failed_tasks(event): return self def __exit__(self, exc_type, exc_val, exc_tb): - events_iter = self.recv.itercapture(limit=None, timeout=self.monitor_timeout, wakeup=True) + events_iter = self.recv.itercapture( + limit=None, + timeout=self.monitor_timeout, + wakeup=True, + ) self._wait_for_results(events_iter) self.connection.__exit__() @@ -84,8 +102,8 @@ def do_crawl( app, monitor_timeout, monitor_iter_limit, - events_limit, crawler_instance, + events_limit=2, project='hepcrawl', spider='WSP', settings=None, diff --git a/hepcrawl/testlib/fixtures.py b/hepcrawl/testlib/fixtures.py index 73f28f96..b78c1a31 100644 --- a/hepcrawl/testlib/fixtures.py +++ b/hepcrawl/testlib/fixtures.py @@ -15,6 +15,7 @@ from scrapy.http import Request, TextResponse from scrapy.selector import Selector +from hepcrawl.settings import CRAWL_ONCE_PATH def fake_response_from_file(file_name, test_suite='unit', url='http://www.example.com', response_type=TextResponse): @@ -134,12 +135,13 @@ def expected_json_results_from_file(*path_chunks, **kwargs): return expected_data -def clean_dir(path): +def clean_dir(path=CRAWL_ONCE_PATH): """ Deletes all contained files of given target directory path. Args: - path: Absolute path of target directory to be cleaned. + path(str): path of directory to be deleted. Default path is the produced DB per spider that + stores the requested urls. Example: diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py index 8fcbc735..88e5e8d2 100644 --- a/hepcrawl/tohep.py +++ b/hepcrawl/tohep.py @@ -25,10 +25,14 @@ import os import datetime +import logging from inspire_schemas.api import LiteratureBuilder +LOGGER = logging.getLogger(__name__) + + class UnknownItemFormat(Exception): pass @@ -195,10 +199,12 @@ def hep_to_hep(hep_record, record_files): hepcrawl one (normally, marc-ingesting spiders). 
""" if record_files: + LOGGER.debug('Updating fft fields from: %s', hep_record['_fft']) hep_record['_fft'] = _get_updated_fft_fields( current_fft_fields=hep_record['_fft'], record_files=record_files, ) + LOGGER.debug('Updated fft fields to: %s', hep_record['_fft']) return hep_record diff --git a/hepcrawl/utils.py b/hepcrawl/utils.py index 256dd508..caff462d 100644 --- a/hepcrawl/utils.py +++ b/hepcrawl/utils.py @@ -10,6 +10,7 @@ from __future__ import absolute_import, division, print_function import os +import pprint import re from operator import itemgetter from itertools import groupby @@ -467,3 +468,6 @@ def __getattr__(self, key): def __setattr__(self, key, value): self[key] = value + + def __str__(self): + return pprint.pformat(self) diff --git a/setup.py b/setup.py index 4d1518a8..16ce46f0 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ 'inspire-schemas~=46.0', 'inspire-dojson~=46.0', 'Scrapy>=1.1.0', + 'scrapy-crawl-once~=0.1,>=0.1.1', # TODO: unpin once they support wheel building again 'scrapyd==1.1.0', 'scrapyd-client>=1.0.1', diff --git a/tests/Dockerfile.hepcrawl_base b/tests/Dockerfile.hepcrawl_base index eb91b69f..8db9f43e 100644 --- a/tests/Dockerfile.hepcrawl_base +++ b/tests/Dockerfile.hepcrawl_base @@ -26,10 +26,10 @@ RUN yum install -y epel-release && \ python-virtualenv && \ yum clean all -RUN mkdir /code /hepcrawl_venv +RUN mkdir /code /hepcrawl_venv /var/lib/scrapy RUN useradd test -RUN chown -R test:test /code /hepcrawl_venv +RUN chown -R test:test /code /hepcrawl_venv /var/lib/scrapy ADD ./docker_entrypoint.sh /docker_entrypoint.sh ADD ./fix_rights /fix_rights diff --git a/tests/fix_rights b/tests/fix_rights index 98677b2c..ecf219b0 100755 Binary files a/tests/fix_rights and b/tests/fix_rights differ diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py index 0f58b17d..22025020 100644 --- a/tests/functional/arxiv/test_arxiv.py +++ b/tests/functional/arxiv/test_arxiv.py @@ -20,6 +20,7 @@ from hepcrawl.testlib.fixtures import ( get_test_suite_path, expected_json_results_from_file, + clean_dir, ) @@ -51,6 +52,8 @@ def set_up_local_environment(): } } + clean_dir() + @pytest.mark.parametrize( 'expected_results', @@ -84,3 +87,52 @@ def test_arxiv(set_up_local_environment, expected_results): expected_results = [override_generated_fields(expected) for expected in expected_results] assert gotten_results == expected_results + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'arxiv', + 'fixtures', + 'arxiv_smoke_record.json', + ), + ], + ids=[ + 'crawl_twice', + ] +) +def test_arxiv_crawl_twice(set_up_local_environment, expected_results): + crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + events_limit=1, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='arXiv', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [override_generated_fields(expected) for expected in expected_results] + + assert gotten_results == expected_results + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='arXiv', + settings={}, + 
**set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + + assert gotten_results == [] diff --git a/tests/functional/desy/fixtures/desy_local_records_expected.json b/tests/functional/desy/fixtures/desy_local_records_expected.json index 1dc784b9..dc7baf23 100644 --- a/tests/functional/desy/fixtures/desy_local_records_expected.json +++ b/tests/functional/desy/fixtures/desy_local_records_expected.json @@ -10,7 +10,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -19,7 +19,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -78,7 +78,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -87,7 +87,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. 
The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -146,7 +146,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -155,7 +155,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -214,7 +214,7 @@ "version": 1, "creation_datetime": "2017-06-27T09:43:17", "description": "00013 Decomposition of the problematic rotation curves in our sample according to the best-fit \\textsc{core}NFW models. Colors and symbols are as in Figure \\ref{fig:dc14_fits}.", "format": ".txt", - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main", "filename": "test_fft_1" }, @@ -223,7 +223,7 @@ "creation_datetime": "2017-06-27T09:43:16", "description": "00005 Comparison of the parameters of the best-fit DC14 models to the cosmological halo mass-concentration relation from \\cite{dutton14} (left) and the stellar mass-halo mass relation from \\cite{behroozi13} (right). The error bars correspond to the extremal values of the multidimensional 68\\% confidence region for each fit. The theoretical relations are shown as red lines and their 1$\\sigma$ and 2$\\sigma$ scatter are represented by the dark and light grey bands, respectively. 
The mass-concentration relation from \\cite{maccio08} and the stellar mass-halo mass relation from \\cite{behroozi13} are also shown as the black dashed lines.", "format": ".txt", - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main", "filename": "test_fft_2" } @@ -1754,7 +1754,7 @@ "format": ".pdf", "filename": "dummy", "version": 1, - "path": "/tmp/file_urls/full/c011422ef40ef111a72bd72092066dd3c1cc7a39.pdf", + "path": "/tmp/file_urls/full/0df3efe7842cf285ae0eeed845cca003dd755674.pdf", "type": "Main" }, { @@ -1763,7 +1763,7 @@ "format": ".txt", "filename": "test_fft_1", "version": 1, - "path": "/tmp/file_urls/full/796483eeaa779dfc00871228dd70dc9809ebc3c0.txt", + "path": "/tmp/file_urls/full/49e42fc70c5d7b0cd9dc7aa5defa12ded530e135.txt", "type": "Main" }, { @@ -1772,7 +1772,7 @@ "format": ".txt", "filename": "test_fft_2", "version": 1, - "path": "/tmp/file_urls/full/ff1ccb47d9a3abb75acb91279e0ec2a4b530ba3e.txt", + "path": "/tmp/file_urls/full/c1cdb1640202896b1ffc446f20d0d660977fc2db.txt", "type": "Main" } ], diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py index d7286f7e..c3e7ca4f 100644 --- a/tests/functional/desy/test_desy.py +++ b/tests/functional/desy/test_desy.py @@ -13,6 +13,7 @@ import copy import hashlib +import os from time import sleep import pytest @@ -76,6 +77,29 @@ def _generate_md5_hash(file_path): assert file_1_hash == file_2_hash +def assert_ffts_content_matches_expected(record): + for fft_field in record.get('_fft', []): + assert_fft_content_matches_expected(fft_field) + + +def assert_fft_content_matches_expected(fft_field): + expected_file_name = get_file_name_from_fft(fft_field) + assert_files_equal(expected_file_name, fft_field['path']) + + +def get_file_name_from_fft(fft_field): + file_path = get_test_suite_path( + 'desy', + 'fixtures', + 'ftp_server', + 'DESY', + 'FFT', + fft_field['filename'] + fft_field['format'], + test_suite='functional', + ) + return file_path + + def get_ftp_settings(): netrc_location = get_test_suite_path( 'desy', @@ -120,6 +144,7 @@ def cleanup(): sleep(10) yield + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) clean_dir('/tmp/file_urls') clean_dir('/tmp/DESY') @@ -180,26 +205,3 @@ def test_desy( for record in gotten_results: assert_ffts_content_matches_expected(record) - - -def assert_ffts_content_matches_expected(record): - for fft_field in record.get('_fft', []): - assert_fft_content_matches_expected(fft_field) - - -def assert_fft_content_matches_expected(fft_field): - expected_file_name = get_file_name_from_fft(fft_field) - assert_files_equal(expected_file_name, fft_field['path']) - - -def get_file_name_from_fft(fft_field): - file_path = get_test_suite_path( - 'desy', - 'fixtures', - 'ftp_server', - 'DESY', - 'FFT', - fft_field['filename'] + fft_field['format'], - test_suite='functional', - ) - return file_path diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py index 42f691c9..493837ec 100644 --- a/tests/functional/wsp/test_wsp.py +++ b/tests/functional/wsp/test_wsp.py @@ -28,13 +28,15 @@ def override_generated_fields(record): record['acquisition_source']['datetime'] = u'2017-04-03T10:26:40.365216' - record['acquisition_source']['submission_number'] = u'5652c7f6190f11e79e8000224dabeaad' + record['acquisition_source']['submission_number'] = ( + u'5652c7f6190f11e79e8000224dabeaad' + ) return record @pytest.fixture(scope="function") -def 
set_up_ftp_environment(): +def ftp_environment(): netrc_location = get_test_suite_path( 'wsp', 'fixtures', @@ -43,7 +45,8 @@ def set_up_ftp_environment(): test_suite='functional', ) - # The test must wait until the docker environment is up (takes about 10 seconds). + # The test must wait until the docker environment is up (takes about 10 + # seconds). sleep(10) yield { @@ -55,7 +58,8 @@ def set_up_ftp_environment(): } } - clean_dir(path='/tmp/WSP/') + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) @pytest.fixture(scope="function") @@ -72,7 +76,7 @@ def set_up_local_environment(): 'CRAWLER_HOST_URL': 'http://scrapyd:6800', 'CRAWLER_PROJECT': 'hepcrawl', 'CRAWLER_ARGUMENTS': { - 'package_path': package_location, + 'local_package_dir': package_location, } } @@ -80,7 +84,8 @@ def set_up_local_environment(): def remove_generated_files(package_location): - clean_dir(path='/tmp/WSP/') + clean_dir() + clean_dir(path=os.path.join(os.getcwd(), '.scrapy')) _, dirs, files = next(os.walk(package_location)) for dir_name in dirs: @@ -103,8 +108,10 @@ def remove_generated_files(package_location): 'smoke', ] ) -def test_wsp_ftp(set_up_ftp_environment, expected_results): - crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL')) +def test_wsp_ftp(ftp_environment, expected_results): + crawler = get_crawler_instance( + ftp_environment.get('CRAWLER_HOST_URL'), + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -112,18 +119,78 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results): monitor_iter_limit=100, events_limit=1, crawler_instance=crawler, - project=set_up_ftp_environment.get('CRAWLER_PROJECT'), + project=ftp_environment.get('CRAWLER_PROJECT'), spider='WSP', settings={}, - **set_up_ftp_environment.get('CRAWLER_ARGUMENTS') + **ftp_environment.get('CRAWLER_ARGUMENTS') ) - gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + gotten_results = [ + override_generated_fields(result) for result in results + ] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'wsp', + 'fixtures', + 'wsp_smoke_records.json', + ), + ], + ids=[ + 'crawl_twice', + ] +) +def test_wsp_ftp_crawl_twice(ftp_environment, expected_results): + crawler = get_crawler_instance( + ftp_environment.get('CRAWLER_HOST_URL'), + ) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + events_limit=2, + crawler_instance=crawler, + project=ftp_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **ftp_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [ + override_generated_fields(result) for result in results + ] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] + + assert gotten_results == expected_results + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + events_limit=2, + crawler_instance=crawler, + project=ftp_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **ftp_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + + assert gotten_results == [] + + @pytest.mark.parametrize( 'expected_results', [ @@ -138,7 +205,9 @@ def 
test_wsp_ftp(set_up_ftp_environment, expected_results): ] ) def test_wsp_local_package_path(set_up_local_environment, expected_results): - crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL')) + crawler = get_crawler_instance( + set_up_local_environment.get('CRAWLER_HOST_URL') + ) results = CeleryMonitor.do_crawl( app=celery_app, @@ -153,6 +222,63 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results): ) gotten_results = [override_generated_fields(result) for result in results] - expected_results = [override_generated_fields(expected) for expected in expected_results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] + + assert gotten_results == expected_results + + +@pytest.mark.parametrize( + 'expected_results', + [ + expected_json_results_from_file( + 'wsp', + 'fixtures', + 'wsp_smoke_records.json', + ), + ], + ids=[ + 'crawl_twice', + ] +) +def test_wsp_local_package_path_crawl_twice( + set_up_local_environment, + expected_results, +): + crawler = get_crawler_instance( + set_up_local_environment.get('CRAWLER_HOST_URL') + ) + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + expected_results = [ + override_generated_fields(expected) for expected in expected_results + ] assert gotten_results == expected_results + + results = CeleryMonitor.do_crawl( + app=celery_app, + monitor_timeout=5, + monitor_iter_limit=20, + crawler_instance=crawler, + project=set_up_local_environment.get('CRAWLER_PROJECT'), + spider='WSP', + settings={}, + **set_up_local_environment.get('CRAWLER_ARGUMENTS') + ) + + gotten_results = [override_generated_fields(result) for result in results] + + assert gotten_results == [] diff --git a/tests/unit/test_alpha.py b/tests/unit/test_alpha.py index 96bf9af1..d5ff6331 100644 --- a/tests/unit/test_alpha.py +++ b/tests/unit/test_alpha.py @@ -13,8 +13,10 @@ from hepcrawl.spiders import alpha_spider -from hepcrawl.testlib.fixtures import fake_response_from_file - +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture def results(): diff --git a/tests/unit/test_aps.py b/tests/unit/test_aps.py index 3bb3698c..e8e64962 100644 --- a/tests/unit/test_aps.py +++ b/tests/unit/test_aps.py @@ -12,7 +12,10 @@ import pytest from hepcrawl.spiders import aps_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture diff --git a/tests/unit/test_arxiv_all.py b/tests/unit/test_arxiv_all.py index 1f4155c9..21a5fd99 100644 --- a/tests/unit/test_arxiv_all.py +++ b/tests/unit/test_arxiv_all.py @@ -7,7 +7,12 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. 
-from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) import pytest @@ -16,7 +21,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import arxiv_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -44,10 +52,16 @@ def _get_processed_record(item, spider): ) ) + assert parsed_items pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return [_get_processed_record(parsed_item, spider) for parsed_item in parsed_items] + yield [ + _get_processed_record(parsed_item, spider) + for parsed_item in parsed_items + ] + + clean_dir() def test_page_nr(many_results): diff --git a/tests/unit/test_arxiv_single.py b/tests/unit/test_arxiv_single.py index 329a2a49..709c6d9a 100644 --- a/tests/unit/test_arxiv_single.py +++ b/tests/unit/test_arxiv_single.py @@ -17,7 +17,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import arxiv_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -44,8 +47,9 @@ def _get_processed_item(item, spider): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items] + yield [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items] + clean_dir() def test_abstracts(results): diff --git a/tests/unit/test_pipelines.py b/tests/unit/test_pipelines.py index 050df092..08b81319 100644 --- a/tests/unit/test_pipelines.py +++ b/tests/unit/test_pipelines.py @@ -21,7 +21,10 @@ from hepcrawl.spiders import arxiv_spider from hepcrawl.pipelines import InspireAPIPushPipeline -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -44,7 +47,9 @@ def json_spider_record(tmpdir): ) parsed_record = items.next() assert parsed_record - return spider, parsed_record + yield spider, parsed_record + + clean_dir() @pytest.fixture diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index bea29b34..0eccd4fc 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -19,7 +19,10 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import pos_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) @pytest.fixture @@ -55,7 +58,9 @@ def record(scrape_pos_page_body): parsed_record = pipeline.process_item(parsed_item, spider) assert parsed_record - return parsed_record + yield parsed_record + + clean_dir() def test_titles(record): diff --git a/tests/unit/test_world_scientific.py b/tests/unit/test_world_scientific.py index 291d00d0..f14144fd 100644 --- a/tests/unit/test_world_scientific.py +++ b/tests/unit/test_world_scientific.py @@ -7,7 +7,12 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. 
-from __future__ import absolute_import, division, print_function, unicode_literals +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) import pytest import os @@ -18,7 +23,16 @@ from hepcrawl.pipelines import InspireCeleryPushPipeline from hepcrawl.spiders import wsp_spider -from hepcrawl.testlib.fixtures import fake_response_from_file +from hepcrawl.testlib.fixtures import ( + fake_response_from_file, + clean_dir, +) + + +@pytest.fixture(scope='function', autouse=True) +def cleanup(): + yield + clean_dir() def create_spider(): @@ -44,7 +58,10 @@ def get_records(response_file_name): pipeline = InspireCeleryPushPipeline() pipeline.open_spider(spider) - return (pipeline.process_item(record, spider) for record in records) + return ( + pipeline.process_item(record, spider) + for record in records + ) def get_one_record(response_file_name): @@ -67,17 +84,25 @@ def override_generated_fields(record): [ get_one_record('world_scientific/sample_ws_record.xml'), ( - "CH$_{3}$NH$_{3}$PbX(X = Br, I, Cl) perovskites have recently been used as light absorbers in hybrid" - " organic-inorganic solid-state solar cells, with efficiencies above 15%. To date, it is essential to" - " add Lithium bis(Trifluoromethanesulfonyl)Imide (LiTFSI) to the hole transport materials (HTM) to get" - " a higher conductivity. However, the detrimental effect of high LiTFSI concentration on the charge transport" - ", DOS in the conduction band of the TiO$_{2}$ substrate and device stability results in an overall " - "compromise for a satisfactory device. Using a higher mobility hole conductor to avoid lithium salt " - "is an interesting alternative. Herein, we successfully made an efficient perovskite solar cell by " - "applying a hole conductor PTAA (Poly[bis(4-phenyl) (2,4,6-trimethylphenyl)-amine]) in the absence of" - " LiTFSI. Under AM 1.5 illumination of 100 mW/cm$^{2}$, an efficiency of 10.9% was achieved, which is " - "comparable to the efficiency of 12.3% with the addition of 1.3 mM LiTFSI. An unsealed device without " - "Li$^{+}$ shows interestingly a promising stability." + "CH$_{3}$NH$_{3}$PbX(X = Br, I, Cl) perovskites have " + "recently been used as light absorbers in hybrid" + " organic-inorganic solid-state solar cells, with " + "efficiencies above 15%. To date, it is essential to add " + "Lithium bis(Trifluoromethanesulfonyl)Imide (LiTFSI) to the " + "hole transport materials (HTM) to get a higher conductivity. " + "However, the detrimental effect of high LiTFSI concentration " + "on the charge transport, DOS in the conduction band of the " + "TiO$_{2}$ substrate and device stability results in an " + "overall compromise for a satisfactory device. Using a higher " + "mobility hole conductor to avoid lithium salt is an " + "interesting alternative. Herein, we successfully made an " + "efficient perovskite solar cell by applying a hole conductor " + "PTAA (Poly[bis(4-phenyl) (2,4,6-trimethylphenyl)-amine]) in " + "the absence of LiTFSI. Under AM 1.5 illumination of 100 " + "mW/cm$^{2}$, an efficiency of 10.9% was achieved, which is " + "comparable to the efficiency of 12.3% with the addition of " + "1.3 mM LiTFSI. An unsealed device without Li$^{+}$ shows " + "interestingly a promising stability." 
), ], ], @@ -98,7 +123,10 @@ def test_abstract(generated_record, expected_abstract): get_one_record('world_scientific/sample_ws_record.xml'), [{ 'source': 'WSP', - 'title': 'High-efficient Solid-state Perovskite Solar Cell Without Lithium Salt in the Hole Transport Material', + 'title': ( + 'High-efficient Solid-state Perovskite Solar Cell Without ' + 'Lithium Salt in the Hole Transport Material' + ), }], ], ], @@ -291,12 +319,18 @@ def test_publication_info(generated_record, expected_publication_info): [ get_one_record('world_scientific/sample_ws_record.xml'), { - 'authors': ["BI, DONGQIN", "BOSCHLOO, GERRIT", "HAGFELDT, ANDERS"], + 'authors': [ + "BI, DONGQIN", + "BOSCHLOO, GERRIT", + "HAGFELDT, ANDERS", + ], 'affiliation': ( - 'Department of Chemistry-Angstrom Laboratory, Uppsala University, Box 532, SE 751 20 Uppsala, Sweden' + 'Department of Chemistry-Angstrom Laboratory, Uppsala ' + 'University, Box 532, SE 751 20 Uppsala, Sweden' ), 'xref_affiliation': ( - 'Physics Department, Brookhaven National Laboratory, Upton, NY 11973, USA' + 'Physics Department, Brookhaven National Laboratory, ' + 'Upton, NY 11973, USA' ), }, ], @@ -314,11 +348,14 @@ def test_authors(generated_record, expected_authors): for index, name in enumerate(expected_authors['authors']): assert generated_record['authors'][index]['full_name'] == name assert expected_authors['affiliation'] in [ - aff['value'] for aff in generated_record['authors'][index]['affiliations'] + aff['value'] + for aff in generated_record['authors'][index]['affiliations'] ] if index == 1: assert expected_authors['xref_affiliation'] in [ - aff['value'] for aff in generated_record['authors'][index]['affiliations'] + aff['value'] + for aff + in generated_record['authors'][index]['affiliations'] ] @@ -413,7 +450,10 @@ def test_pipeline_record(generated_record): 'abstracts': [ { 'source': 'WSP', - 'value': u'Abstract L\xe9vy bla-bla bla blaaa blaa bla blaaa blaa, bla blaaa blaa. Bla blaaa blaa.', + 'value': ( + u'Abstract L\xe9vy bla-bla bla blaaa blaa bla blaaa blaa, ' + 'bla blaaa blaa. Bla blaaa blaa.' + ), }, ], 'acquisition_source': { @@ -426,7 +466,10 @@ def test_pipeline_record(generated_record): { 'affiliations': [ { - 'value': u'Department, University, City, City_code 123456, C. R. Country_2', + 'value': ( + u'Department, University, City, City_code 123456, ' + 'C. R. Country_2' + ), }, ], 'full_name': u'author_surname_2, author_name_1',
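
The unit-test updates above all converge on one pattern: after each test, ``clean_dir()`` (which now defaults to ``CRAWL_ONCE_PATH``) removes the crawl-once database so one test's requests are not treated as already crawled by the next. A minimal sketch of that pattern for a hypothetical new test module (module and test names are illustrative only)::

    import pytest

    from hepcrawl.testlib.fixtures import clean_dir


    @pytest.fixture(scope='function', autouse=True)
    def cleanup():
        # Run the test body first, then wipe the crawl-once DB it may have
        # produced, mirroring tests/unit/test_world_scientific.py above.
        yield
        clean_dir()


    def test_something():
        # Placeholder body; every test in the module gets the cleanup above.
        assert True
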