diff --git a/setup.py b/setup.py
index b19e5f14..a98aeb88 100644
--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,7 @@ install_requires = [
     'autosemver~=0.2',
     'inspire-schemas~=42.0',
+    'inspire-dojson~=41.0',
     'Scrapy>=1.1.0',
     # TODO: unpin once they support wheel building again
     'scrapyd==1.1.0',
diff --git a/tests/functional/arxiv/test_arxiv.py b/tests/functional/arxiv/test_arxiv.py
index a9677b89..0f58b17d 100644
--- a/tests/functional/arxiv/test_arxiv.py
+++ b/tests/functional/arxiv/test_arxiv.py
@@ -72,6 +72,7 @@ def test_arxiv(set_up_local_environment, expected_results):
         app=celery_app,
         monitor_timeout=5,
         monitor_iter_limit=100,
+        events_limit=1,
         crawler_instance=crawler,
         project=set_up_local_environment.get('CRAWLER_PROJECT'),
         spider='arXiv',
diff --git a/tests/functional/wsp/test_wsp.py b/tests/functional/wsp/test_wsp.py
index 70996466..a0411b8e 100644
--- a/tests/functional/wsp/test_wsp.py
+++ b/tests/functional/wsp/test_wsp.py
@@ -13,7 +13,6 @@
 import pytest
 import os
-import shutil
 
 from time import sleep
 
@@ -21,6 +20,7 @@ from hepcrawl.testlib.fixtures import (
     get_test_suite_path,
     expected_json_results_from_file,
+    clean_dir,
 )
 from hepcrawl.testlib.tasks import app as celery_app
 from hepcrawl.testlib.utils import get_crawler_instance
@@ -90,10 +90,6 @@ def remove_generated_files(package_location):
         os.unlink(os.path.join(package_location, file_name))
 
 
-def clean_dir(path='/tmp/WSP/'):
-    shutil.rmtree(path, ignore_errors=True)
-
-
 @pytest.mark.parametrize(
     'expected_results',
     [
@@ -114,6 +110,7 @@ def test_wsp_ftp(set_up_ftp_environment, expected_results):
         app=celery_app,
         monitor_timeout=5,
         monitor_iter_limit=100,
+        events_limit=1,
         crawler_instance=crawler,
         project=set_up_ftp_environment.get('CRAWLER_PROJECT'),
         spider='WSP',
@@ -147,6 +144,7 @@ def test_wsp_local_package_path(set_up_local_environment, expected_results):
         app=celery_app,
         monitor_timeout=5,
         monitor_iter_limit=100,
+        events_limit=1,
         crawler_instance=crawler,
         project=set_up_local_environment.get('CRAWLER_PROJECT'),
         spider='WSP',
diff --git a/tests/unit/test_alpha.py b/tests/unit/test_alpha.py
index eef140b1..96bf9af1 100644
--- a/tests/unit/test_alpha.py
+++ b/tests/unit/test_alpha.py
@@ -20,13 +20,15 @@ def results():
     """Return results generator from the Alpha spider."""
     spider = alpha_spider.AlphaSpider()
-    records = list(
+    parsed_items = list(
         spider.parse(
             fake_response_from_file('alpha/test_1.htm')
         )
     )
+    records = [parsed_item.record for parsed_item in parsed_items]
 
     assert records
+
     return records
diff --git a/tests/unit/test_aps.py b/tests/unit/test_aps.py
index eb53269d..3bb3698c 100644
--- a/tests/unit/test_aps.py
+++ b/tests/unit/test_aps.py
@@ -21,7 +21,7 @@ def results():
     from scrapy.http import TextResponse
     spider = aps_spider.APSSpider()
-    records = list(
+    parsed_items = list(
         spider.parse(
             fake_response_from_file(
                 'aps/aps_single_response.json',
@@ -30,6 +30,8 @@
         )
     )
 
+    records = [parsed_item.record for parsed_item in parsed_items]
+
     assert records
     return records
diff --git a/tests/unit/test_arxiv_all.py b/tests/unit/test_arxiv_all.py
index bd75e5a4..1f4155c9 100644
--- a/tests/unit/test_arxiv_all.py
+++ b/tests/unit/test_arxiv_all.py
@@ -11,7 +11,8 @@
 
 import pytest
 
-from scrapy.crawler import Crawler
+from scrapy.crawler import Crawler
+from scrapy.http import TextResponse
 
 from hepcrawl.pipelines import InspireCeleryPushPipeline
 from hepcrawl.spiders import arxiv_spider
@@ -25,36 +26,16 @@ def spider():
     return spider
 
 
-@pytest.fixture
-def one_result(spider):
-    """Return results generator from the arxiv spider. Tricky fields, one
-    record.
-    """
-    from scrapy.http import TextResponse
-
-    records = list(
-        spider.parse(
-            fake_response_from_file(
-                'arxiv/sample_arxiv_record0.xml',
-                response_type=TextResponse,
-            )
-        )
-    )
-
-    assert records
-    pipeline = InspireCeleryPushPipeline()
-    pipeline.open_spider(spider)
-    return [pipeline.process_item(record, spider) for record in records]
-
-
 @pytest.fixture
 def many_results(spider):
     """Return results generator from the arxiv spider. Tricky fields, many
     records.
     """
-    from scrapy.http import TextResponse
+    def _get_processed_record(item, spider):
+        record = pipeline.process_item(item, spider)
+        return record
 
-    records = list(
+    parsed_items = list(
         spider.parse(
             fake_response_from_file(
                 'arxiv/sample_arxiv_record.xml',
@@ -63,10 +44,10 @@
             )
         )
     )
 
-    assert records
     pipeline = InspireCeleryPushPipeline()
     pipeline.open_spider(spider)
-    return [pipeline.process_item(record, spider) for record in records]
+
+    return [_get_processed_record(parsed_item, spider) for parsed_item in parsed_items]
 
 
 def test_page_nr(many_results):
diff --git a/tests/unit/test_arxiv_single.py b/tests/unit/test_arxiv_single.py
index a6ed66d6..329a2a49 100644
--- a/tests/unit/test_arxiv_single.py
+++ b/tests/unit/test_arxiv_single.py
@@ -24,10 +24,15 @@ def results():
     """Return results generator from the arxiv spider. All fields, one record.
     """
+    def _get_processed_item(item, spider):
+        record = pipeline.process_item(item, spider)
+        validate(record, 'hep')
+        assert record
+        return record
 
     crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
     spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
-    records = list(
+    parsed_items = list(
         spider.parse(
             fake_response_from_file(
                 'arxiv/sample_arxiv_record0.xml',
@@ -36,16 +41,10 @@
             )
         )
     )
 
-    assert records
     pipeline = InspireCeleryPushPipeline()
     pipeline.open_spider(spider)
-    processed_records = []
-    for record in records:
-        processed_record = pipeline.process_item(record, spider)
-        validate(processed_record, 'hep')
-        processed_records.append(processed_record)
-    return processed_records
+    return [_get_processed_item(parsed_item, spider) for parsed_item in parsed_items]
diff --git a/tests/unit/test_base.py b/tests/unit/test_base.py
index cc6ef093..b8ec5b8a 100644
--- a/tests/unit/test_base.py
+++ b/tests/unit/test_base.py
@@ -38,9 +38,12 @@ def record():
     nodes = selector.xpath('.//%s' % spider.itertag)
     response.meta["record"] = nodes[0].extract()
     response.meta["urls"] = ["http://hdl.handle.net/1885/10005"]
-    parsed_record = spider.build_item(response)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.build_item(response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 @pytest.fixture
@@ -169,7 +172,12 @@ def splash():
             'Content-Type': 'text/html',
         },
     )
-    return spider.scrape_for_pdf(splash_response)
+
+    parsed_item = spider.scrape_for_pdf(splash_response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_splash(splash):
@@ -201,7 +209,12 @@ def parsed_node():
     response = fake_response_from_string(text=body)
     node = get_node(spider, 'OAI-PMH:record', text=body)
     response.meta["record"] = node[0].extract()
-    return spider.parse_node(response, node[0])
+
+    parsed_item = spider.parse_node(response, node[0])
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_parsed_node(parsed_node):
diff --git a/tests/unit/test_brown.py b/tests/unit/test_brown.py
index 0b42b4df..8d0f20de 100644
--- a/tests/unit/test_brown.py
+++ b/tests/unit/test_brown.py
@@ -41,10 +41,12 @@ def record():
     splash_response = fake_response_from_file('brown/test_splash.html')
     splash_response.meta["jsonrecord"] = jsonrecord
-    parsed_record = spider.scrape_splash(splash_response)
-    assert parsed_record
-    return parsed_record
+    parsed_item = spider.scrape_splash(splash_response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 @pytest.fixture
@@ -200,7 +202,11 @@ def parsed_node_no_splash():
     jsonrecord = jsonresponse["items"]["docs"][0]
     response.meta["jsonrecord"] = jsonrecord
-    return spider.parse(response).next()
+    parsed_item = spider.parse(response).next()
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_no_splash(parsed_node_no_splash):
diff --git a/tests/unit/test_dnb.py b/tests/unit/test_dnb.py
index b00aff3d..5aa05a64 100644
--- a/tests/unit/test_dnb.py
+++ b/tests/unit/test_dnb.py
@@ -72,7 +72,12 @@ def record(scrape_pos_page_body):
         body=scrape_pos_page_body,
         **{'encoding': 'utf-8'}
     )
-    return request.callback(response)
+
+    parsed_item = request.callback(response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_title(record):
@@ -241,7 +246,12 @@ def parse_without_splash():
             'Content-Type': 'application/pdf;charset=base64',
         }
     )
-    return spider.parse_node(response, nodes[0])
+
+    parsed_item = spider.parse_node(response, nodes[0])
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_parse_without_splash(parse_without_splash):
diff --git a/tests/unit/test_edp.py b/tests/unit/test_edp.py
index cc7885bd..115abda6 100644
--- a/tests/unit/test_edp.py
+++ b/tests/unit/test_edp.py
@@ -40,6 +40,7 @@ def scrape_pos_page_body():
         )
     )
 
+
 @pytest.fixture
 def targzfile():
     """Path to test tar.gz file with JATS XML file."""
@@ -50,6 +51,7 @@ def targzfile():
         'test_gz.tar.gz'
     )
 
+
 @pytest.fixture
 def package_jats(targzfile):
     """Extract tar.gz package with JATS XML file."""
@@ -75,7 +77,12 @@ def record_jats(package_jats, scrape_pos_page_body):
         body=scrape_pos_page_body,
         **{'encoding': 'utf-8'}
     )
-    return request.callback(response)
+
+    parsed_item = request.callback(response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 @pytest.fixture
@@ -107,7 +114,11 @@ def record_rich(package_rich):
     fake_resp.meta["rich"] = True
     node = get_node(spider, "//EDPSArticle", fake_resp)[0]
-    return spider.parse_node(fake_resp, node)
+    parsed_item = spider.parse_node(fake_resp, node)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_title(record_jats):
@@ -145,6 +156,7 @@ def test_abstract(record_jats):
     assert 'abstract' in record_jats
     assert record_jats['abstract'] == abstract
 
+
 def test_date_published(record_jats):
     """Test extracting date_published."""
     date_published = "2015-01-01"
@@ -179,6 +191,7 @@ def test_doi(record_jats):
     assert 'dois' in record_jats
     assert record_jats['dois'][0]['value'] == doi
 
+
 def test_publication_info(record_jats):
     """Test extracting publication info."""
     assert 'journal_title' in record_jats
@@ -206,7 +219,6 @@ def test_keywords(record_jats):
         assert keyw["value"] in keywords
 
 
-
 def test_authors(record_jats):
     """Test authors."""
     authors = ["Arasoglu, Ali", "Ozdemir, Omer Faruk"]
@@ -326,7 +338,6 @@ def test_authors_rich(record_rich):
         assert astr[index]["affiliations"][0]["value"] == affiliations[index]
 
 
-
 def test_tarfile(tarbzfile, tmpdir):
     """Test untarring a tar.bz package with a test XML file.
@@ -343,7 +354,6 @@ def test_tarfile(tarbzfile, tmpdir):
     assert "aas/xml_rich/2000/01" not in xml_files_flat[0]
 
 
-
 def test_handle_package_ftp(tarbzfile):
     """Test getting the target folder name for xml files."""
     spider = edp_spider.EDPSpider()
@@ -351,7 +361,8 @@
     request = spider.handle_package_ftp(response).next()
 
     assert isinstance(request, Request)
-    assert request.meta["package_path"] == tarbzfile
+    assert request.meta["source_folder"] == tarbzfile
+
 
 def test_no_dois_jats():
     """Test parsing when no DOI in record. JATS format."""
@@ -370,7 +381,11 @@
     """
     response = fake_response_from_string(body)
     node = get_node(spider, "//article", response)[0]
-    record = spider.parse_node(response, node)
+
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+    assert parsed_item.record
+    record = parsed_item.record
 
     assert "dois" not in record
     assert "additional_files" not in record
@@ -390,7 +405,11 @@
     response = fake_response_from_string(body)
     response.meta["rich"] = True
     node = get_node(spider, "//EDPSArticle", response)[0]
-    record = spider.parse_node(response, node)
+
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+    assert parsed_item.record
+    record = parsed_item.record
 
     assert "dois" not in record
     assert "additional_files" not in record
@@ -416,7 +435,11 @@
     """
     response = fake_response_from_string(body)
     node = get_node(spider, "//article", response)[0]
-    record = spider.parse_node(response, node)
+
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+    assert parsed_item.record
+    record = parsed_item.record
 
     assert "related_article_doi" in record
     assert record["related_article_doi"][0][
@@ -439,7 +462,11 @@
     """
     response = fake_response_from_string(body)
     node = get_node(spider, "//article", response)[0]
-    record = spider.parse_node(response, node)
+
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+    assert parsed_item.record
+    record = parsed_item.record
 
     assert 'email' in record['authors'][0]
     assert record['authors'][0]['email'] == "Fname.Sname@university.org"
@@ -472,7 +499,11 @@
     """
     response = fake_response_from_string(body)
     node = get_node(spider, "//article", response)[0]
-    record = spider.parse_node(response, node)
+
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+    assert parsed_item.record
+    record = parsed_item.record
 
     affiliation = "Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA"
     assert 'affiliations' in record['authors'][0]
@@ -481,8 +512,6 @@
     assert record['authors'][0]['email'] is None
 
 
-
-
 def test_no_valid_article():
     """Test parsing when filtering out non-interesting article types."""
     spider = edp_spider.EDPSpider()
@@ -506,7 +535,11 @@
     """
     response = fake_response_from_string(body)
     node = get_node(spider, "//article", response)[0]
-    record = spider.parse_node(response, node)
+
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+    assert parsed_item.record
+    record = parsed_item.record
 
     assert "collections" in record
     assert record["collections"] == [{'primary': 'HEP'}, {'primary': 'Review'}]
@@ -533,7 +566,12 @@
     """
     response = fake_response_from_string(body)
     node = get_node(spider, "//article", response)[0]
-    return spider.parse_node(response, node)
+
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_references(record_references_only):
diff --git a/tests/unit/test_elsevier.py b/tests/unit/test_elsevier.py
index ca023122..3d5fb3f5 100644
--- a/tests/unit/test_elsevier.py
+++ b/tests/unit/test_elsevier.py
@@ -41,9 +41,12 @@ def record():
     response.meta["xml_url"] = 'elsevier/sample_consyn_record.xml'
     tag = '//%s' % spider.itertag
     nodes = get_node(spider, tag, response)
-    parsed_record = spider.parse_node(response, nodes)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.parse_node(response, nodes)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 @pytest.fixture(scope="module")
@@ -97,7 +100,12 @@ def parsed_node():
     response.meta["xml_url"] = 'elsevier/sample_consyn_record.xml'
     parse_response = spider.parse_node(response, node)
     parse_response.status = 404
-    return spider.scrape_sciencedirect(parse_response)
+
+    parsed_item = spider.scrape_sciencedirect(parse_response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_collection(parsed_node):
@@ -164,7 +172,11 @@ def cover_display_date():
     node = get_node(spider, '/doc', text=body)
     response = fake_response_from_string(body)
-    return spider.parse_node(response, node)
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_cover_display_date(cover_display_date):
@@ -187,7 +199,11 @@ def cover_display_date_y_m():
     """
     node = get_node(spider, '/doc', text=body)
     response = fake_response_from_string(body)
-    return spider.parse_node(response, node)
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_cover_display_date_y_m(cover_display_date_y_m):
@@ -210,7 +226,11 @@ def cover_display_date_y():
     """
     node = get_node(spider, '/doc', text=body)
     response = fake_response_from_string(body)
-    return spider.parse_node(response, node)
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_cover_display_date_y(cover_display_date_y):
@@ -1579,11 +1599,11 @@ def test_handle_package(handled_package):
     for astro, nima in zip(astropart, nima):
         assert nima
         assert astro
-        assert astro.meta["package_path"] == "tests/unit/responses/elsevier/fake_astropart.zip"
+        assert astro.meta["source_folder"] == "tests/unit/responses/elsevier/fake_astropart.zip"
         url_to_match = u'file:///tmp/elsevier_fake_astropart_*/0927-6505/aip/S0927650515001656/S0927650515001656.xml'
         assert astro.meta["xml_url"] == fnmatch.filter([astro.meta["xml_url"]], url_to_match)[0]
 
-        assert nima.meta["package_path"] == "tests/unit/responses/elsevier/fake_nima.zip"
+        assert nima.meta["source_folder"] == "tests/unit/responses/elsevier/fake_nima.zip"
         url_to_match = u'file:///tmp/elsevier_fake_nima_*/0168-9002/S0168900215X00398/S0168900215015636/S0168900215015636.xml'
         assert nima.meta["xml_url"] == fnmatch.filter([nima.meta["xml_url"]], url_to_match)[0]
@@ -1644,7 +1664,12 @@ def sciencedirect():
     ])
     response.meta["info"] = {}
     response.meta["node"] = get_node(spider, '/head', text=body)
-    return spider.scrape_sciencedirect(response)
+
+    parsed_item = spider.scrape_sciencedirect(response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_sciencedirect(sciencedirect):
diff --git a/tests/unit/test_hindawi.py b/tests/unit/test_hindawi.py
index 37e5e183..3af8ba3a 100644
--- a/tests/unit/test_hindawi.py
+++ b/tests/unit/test_hindawi.py
@@ -26,9 +26,11 @@ def record():
     response = fake_response_from_file("hindawi/test_1.xml")
     nodes = get_node(spider, "//marc:record", response)
 
-    parsed_record = spider.parse_node(response, nodes[0])
-    assert parsed_record
-    return parsed_record
+    parsed_item = spider.parse_node(response, nodes[0])
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_title(record):
diff --git a/tests/unit/test_infn.py b/tests/unit/test_infn.py
index 0c60799a..c15ef727 100644
--- a/tests/unit/test_infn.py
+++ b/tests/unit/test_infn.py
@@ -28,9 +28,12 @@ def record():
     """Return scraping results from the INFN spider."""
     spider = infn_spider.InfnSpider()
     response = fake_response_from_file('infn/test_splash.html')
-    parsed_record = spider.scrape_splash(response)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.scrape_splash(response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_title(record):
@@ -121,6 +124,7 @@ def test_non_thesis():
 
     assert record is None
 
+
 def test_parse_node():
     """Test parse_node function. This should be a scrapy Request object.
     """
@@ -148,6 +152,8 @@ def test_parse_node_nolink():
     response = fake_response_from_file('infn/test_1_nolink.html')
     selector = Selector(response, type='html')
     node = selector.xpath('//%s' % spider.itertag)[0]
-    record = spider.parse_node(response, node).next()
+    parsed_item = spider.parse_node(response, node).next()
+    assert parsed_item
+    assert parsed_item.record
 
-    assert isinstance(record, hepcrawl.items.HEPRecord)
+    assert isinstance(parsed_item.record, hepcrawl.items.HEPRecord)
diff --git a/tests/unit/test_iop.py b/tests/unit/test_iop.py
index b776adfa..bb01766c 100644
--- a/tests/unit/test_iop.py
+++ b/tests/unit/test_iop.py
@@ -38,9 +38,12 @@ def record():
     response = fake_response_from_file('iop/xml/test_standard.xml')
     node = get_node(spider, "Article", response)
     spider.pdf_files = TEST_PDF_DIR
-    parsed_record = spider.parse_node(response, node)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_abstract(record):
@@ -182,10 +185,12 @@ def erratum_open_access_record():
         'iop',
         'pdf',
     )
 
-    parsed_record = spider.parse_node(response, node)
-    assert parsed_record
-    return parsed_record
+    parsed_item = spider.parse_node(response, node)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_files_erratum_open_access_record(erratum_open_access_record):
diff --git a/tests/unit/test_magic.py b/tests/unit/test_magic.py
index eeb574fe..f3c0f355 100644
--- a/tests/unit/test_magic.py
+++ b/tests/unit/test_magic.py
@@ -23,6 +23,7 @@
     get_node,
 )
 
+
 @pytest.fixture
 def record():
     """Return results from the MAGIC spider. First parse node, then scrape,
@@ -39,9 +40,11 @@ def record():
     splash_response.meta["date"] = parsed_node.meta["date"]
     splash_response.meta["urls"] = parsed_node.meta["urls"]
 
-    parsed_record = spider.scrape_for_pdf(splash_response).next()
-    assert parsed_record
-    return parsed_record
+    parsed_item = spider.scrape_for_pdf(splash_response).next()
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_abstract(record):
@@ -102,7 +105,6 @@ def test_abstract(record):
     assert record["abstract"] == abstract
 
 
-
 def test_title(record):
     """Test extracting title."""
     title = "Limits to the violation of Lorentz invariance using the emission of the CRAB pulsar at TeV energies, discovered with archival data from the MAGIC telescopes"
@@ -139,6 +141,7 @@ def test_url(record):
     assert 'urls' in record
     assert record['urls'][0]['value'] == url
 
+
 def test_pdf_link(record):
     """Test pdf link(s)"""
     files = "http://stlab.adobe.com/wiki/images/d/d3/Test.pdf"
@@ -164,7 +167,10 @@ def test_no_author_no_date_no_url():
     """
     response = fake_response_from_string(body)
     node = get_node(spider, spider.itertag, text=body)
-    record = spider.parse_node(response, node).next()
+    parsed_item = spider.parse_node(response, node).next()
+    assert parsed_item
+    assert parsed_item.record
+    record = parsed_item.record
 
     assert isinstance(record, hepcrawl.items.HEPRecord)
     assert "date" not in record
@@ -184,7 +190,10 @@ def test_no_aff():
     """
     response = fake_response_from_string(body)
-    record = spider.scrape_for_pdf(response).next()
+    parsed_item = spider.scrape_for_pdf(response).next()
+    assert parsed_item
+    assert parsed_item.record
+    record = parsed_item.record
 
     assert isinstance(record, hepcrawl.items.HEPRecord)
     assert "date" not in record
@@ -216,7 +225,10 @@ def test_no_spash_page():
     response.status = 404
     response.meta["title"] = parsed_node.meta["title"]
     response.meta["urls"] = parsed_node.meta["urls"]
-    record = spider.scrape_for_pdf(response).next()
+    parsed_item = spider.scrape_for_pdf(response).next()
+    assert parsed_item
+    assert parsed_item.record
+    record = parsed_item.record
 
     assert isinstance(record, hepcrawl.items.HEPRecord)
     assert "urls" in record
diff --git a/tests/unit/test_mit.py b/tests/unit/test_mit.py
index 0253d91f..8a185cef 100644
--- a/tests/unit/test_mit.py
+++ b/tests/unit/test_mit.py
@@ -25,9 +25,12 @@ def record():
     """Return scraping results from the MIT spider."""
     spider = mit_spider.MITSpider()
     response = fake_response_from_file('mit/test_splash.html')
-    parsed_record = spider.build_item(response)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.build_item(response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 @pytest.fixture
@@ -37,7 +40,11 @@ def parsed_node():
     response = fake_response_from_file('mit/test_list.html')
     tag = spider.itertag
     node = get_node(spider, tag, response, rtype="html")
-    return spider.parse_node(response, node).next()
+
+    parsed_item = spider.parse_node(response, node).next()
+    assert parsed_item
+
+    return parsed_item
 
 
 def test_url(parsed_node):
@@ -159,7 +166,12 @@ def supervisors():
     """
     response = fake_response_from_string(body)
-    return spider.build_item(response)
+
+    parsed_item = spider.build_item(response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_two_supervisors(supervisors):
diff --git a/tests/unit/test_phenix.py b/tests/unit/test_phenix.py
index 75384350..c272683f 100644
--- a/tests/unit/test_phenix.py
+++ b/tests/unit/test_phenix.py
@@ -29,9 +29,13 @@ def record():
     response = fake_response_from_file('phenix/test_1.html')
     selector = Selector(response, type='html')
     nodes = selector.xpath('//%s' % spider.itertag)
-    parsed_record = spider.parse_node(response, nodes[0])
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.parse_node(response, nodes[0])
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
+
 
 @pytest.fixture
 def non_thesis():
@@ -49,10 +53,12 @@ def non_thesis():
     node = get_node(spider, '//li', text=body)
     return spider.parse_node(response, node)
 
+
 def test_non_thesis(non_thesis):
     """Test MSc thesis skipping."""
     assert non_thesis is None
 
+
 def test_title(record):
     """Test extracting title."""
     title = "MEASUREMENT OF THE DOUBLE HELICITY ASYMMETRY IN INCLUSIVE $\pi^{0}$ PRODUCTION IN POLARIZED PROTON-PROTON COLLISIONS AT $\sqrt{s}$ = 510 GeV"
@@ -82,6 +88,7 @@ def test_authors(record):
             aff['value'] for aff in record['authors'][index]['affiliations']
         ]
 
+
 def test_pdf_link(record):
     """Test pdf link(s)"""
     files = "http://www.phenix.bnl.gov/phenix/WWW/talk/archive/theses/2015/Guragain_Hari-DISSERTATION.pdf"
diff --git a/tests/unit/test_phil.py b/tests/unit/test_phil.py
index e99064b2..6db536ef 100644
--- a/tests/unit/test_phil.py
+++ b/tests/unit/test_phil.py
@@ -33,9 +33,12 @@ def record():
         "http://philpapers.org/go.pl?id=BROBB&proxyId=none&u=http%3A%2F%2Fanalysis.oxfordjournals.org%2Fcontent%2F66%2F3%2F194.full.pdf%2Bhtml%3Fframe%3Dsidebar",
         "http://philpapers.org/go.pl?id=BROBB&proxyId=none&u=http%3A%2F%2Fbrogaardb.googlepages.com%2Ftensedrelationsoffprint.pdf"
     ]
-    parsed_record = spider.build_item(response)
-    assert parsed_record
-    return parsed_record
+
+    parsed_item = spider.build_item(response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 @pytest.fixture
@@ -48,7 +51,12 @@ def journal():
     response = fake_response_from_file('phil/test_journal.json')
     jsonrecord = json.loads(response.body_as_unicode())
     response.meta["jsonrecord"] = jsonrecord[0]
-    return spider.build_item(response)
+
+    parsed_item = spider.build_item(response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 @pytest.fixture
@@ -223,7 +231,11 @@ def splash():
         ]
     }
 
-    return spider.scrape_for_pdf(response)
+    parsed_item = spider.scrape_for_pdf(response)
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_scrape(splash):
diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py
index 20c872f4..bea29b34 100644
--- a/tests/unit/test_pos.py
+++ b/tests/unit/test_pos.py
@@ -51,8 +51,11 @@ def record(scrape_pos_page_body):
     assert response
     pipeline = InspireCeleryPushPipeline()
     pipeline.open_spider(spider)
-    record = request.callback(response)
-    return pipeline.process_item(record, spider)
+    parsed_item = request.callback(response)
+    parsed_record = pipeline.process_item(parsed_item, spider)
+    assert parsed_record
+
+    return parsed_record
 
 
 def test_titles(record):
diff --git a/tests/unit/test_t2k.py b/tests/unit/test_t2k.py
index 283a02e5..d9395aa2 100644
--- a/tests/unit/test_t2k.py
+++ b/tests/unit/test_t2k.py
@@ -36,9 +36,11 @@ def record():
     splash_response.meta["urls"] = parsed_node.meta["urls"]
     splash_response.meta["authors"] = parsed_node.meta["authors"]
 
-    parsed_record = spider.scrape_for_pdf(splash_response).next()
-    assert parsed_record
-    return parsed_record
+    parsed_item = spider.scrape_for_pdf(splash_response).next()
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_abstact(record):
@@ -125,9 +127,11 @@ def non_url():
     selector = Selector(response, type='html')
     nodes = selector.xpath('//%s' % spider.itertag)
 
-    parsed_record = spider.parse_node(response, nodes[0]).next()
-    assert parsed_record
-    return parsed_record
+    parsed_item = spider.parse_node(response, nodes[0]).next()
+    assert parsed_item
+    assert parsed_item.record
+
+    return parsed_item.record
 
 
 def test_non_url(non_url):
diff --git a/tests/unit/test_world_scientific.py b/tests/unit/test_world_scientific.py
index 36438ab4..291d00d0 100644
--- a/tests/unit/test_world_scientific.py
+++ b/tests/unit/test_world_scientific.py
@@ -48,8 +48,11 @@ def get_records(response_file_name):
 
 
 def get_one_record(response_file_name):
-    results = get_records(response_file_name)
-    return results.next()
+    records = get_records(response_file_name)
+    record = records.next()
+    assert record
+
+    return record
 
 
 def override_generated_fields(record):