From 8320ade16b712225bbff5147acc9c7dcdecd67bb Mon Sep 17 00:00:00 2001 From: Spiros Delviniotis Date: Tue, 22 Aug 2017 13:13:25 +0200 Subject: [PATCH] pos: fix spider Signed-off-by: Spiros Delviniotis --- hepcrawl/crawler2hep.py | 6 ++ hepcrawl/items.py | 4 ++ hepcrawl/spiders/pos_spider.py | 63 ++++++++++++------- .../pos_conference_proceedings_records.json | 10 +++ tests/functional/pos/test_pos.py | 24 ++++--- tests/unit/test_pos.py | 56 ++++++++++------- 6 files changed, 108 insertions(+), 55 deletions(-) diff --git a/hepcrawl/crawler2hep.py b/hepcrawl/crawler2hep.py index c2ba80d8..8cb2e6f3 100644 --- a/hepcrawl/crawler2hep.py +++ b/hepcrawl/crawler2hep.py @@ -339,6 +339,12 @@ def _filter_affiliation(affiliations): source=report_number.get('source') ) + for url in crawler_record.get('urls', []): + builder.add_url(url=url.get('value')) + + if crawler_record.get('_fft'): + builder.record['_fft'] = crawler_record.get('_fft') + builder.validate_record() return builder.record diff --git a/hepcrawl/items.py b/hepcrawl/items.py index dab67dda..8bdb5478 100644 --- a/hepcrawl/items.py +++ b/hepcrawl/items.py @@ -318,3 +318,7 @@ class HEPRecord(scrapy.Item): thesis_supervisor = scrapy.Field() language = scrapy.Field() + + _fft = scrapy.Field() + """Used to communicate with legacy about files (to be) attached to the + record.""" diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 85c4b2ff..54e3ba99 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -62,14 +62,13 @@ class POSSpider(Spider): To do that and because each needs the information of the previous, the spider must use the callbacks system provided by scrapy through the - :ref:`scrapy.html.response.Response` callback parameter, and chain the + :class:`scrapy.http.response.Response` callback parameter, and chain the parser functions. The deduplication of the conference proceedings papers is left for the `HepcrawlCrawlOnceMiddleware` middleware. 
Example: - :: $ scrapy crawl PoS \\ -a "source_file=file://$PWD/tests/unit/responses/pos/sample_pos_record.xml" """ @@ -94,24 +93,26 @@ def parse(self, response): self.log('Got record from: {response.url}'.format(**vars())) response.selector.remove_namespaces() - records = response.selector.xpath('.//record') - for record in records: - yield self.get_conference_paper_page_request(raw_xml=record) + record_xml_selectors = response.selector.xpath('.//record') + for record_xml_selector in record_xml_selectors: + yield self.get_conference_paper_page_request( + xml_selector=record_xml_selector, + ) - def get_conference_paper_page_request(self, raw_xml, meta=None): + def get_conference_paper_page_request(self, xml_selector, meta=None): """Gets the conference paper html page, for the pdf link for the conference paper, and later the internal conference id. """ meta = meta or {} - identifier = raw_xml.xpath( + identifier = xml_selector.xpath( './/metadata/pex-dc/identifier/text()' ).extract_first() conference_paper_url = "{0}{1}".format( self.base_conference_paper_url, identifier, ) - meta['xml_record'] = raw_xml + meta['xml_record'] = xml_selector.extract() # the meta parameter will be passed over to the callback as a property # in the response parameter @@ -137,11 +138,11 @@ def parse_conference_paper(self, response): # prepare next callback step response.meta['html_record'] = response.body - yield self.get_conference_proceendings_page_request( + yield self.get_conference_proceedings_page_request( meta=response.meta, ) - def get_conference_proceendings_page_request(self, meta): + def get_conference_proceedings_page_request(self, meta): """Gets the conference proceedings page, using the indernal conference id from the record html page retrieved before. 
""" @@ -155,9 +156,10 @@ def get_conference_proceendings_page_request(self, meta): ) page_selector = Selector( - text=meta.get('html_record'), - type='html', + text=meta.get('xml_record'), + type='xml', ) + page_selector.remove_namespaces() pos_id = page_selector.xpath( ".//metadata/pex-dc/identifier/text()" ).extract_first() @@ -220,15 +222,15 @@ def build_conference_paper_item( ).extract_first() record.add_value( 'journal_title', - self._get_journal_title(identifier=identifier), + self._get_journal_title(pos_ext_identifier=identifier), ) record.add_value( 'journal_volume', - self._get_journal_volume(identifier=identifier), + self._get_journal_volume(pos_ext_identifier=identifier), ) record.add_value( 'journal_artid', - self._get_journal_artid(identifier=identifier), + self._get_journal_artid(pos_ext_identifier=identifier), ) record.add_xpath('title', '//metadata/pex-dc/title/text()') @@ -240,8 +242,13 @@ def build_conference_paper_item( record.add_value('language', self._get_language(selector=selector)) record.add_value('authors', self._get_authors(selector=selector)) record.add_value('collections', ['conferencepaper']) - record.add_value('urls', conference_paper_pdf_url) - record.add_value('_fulltext_url', self._get_conference_paper_pdf_url()) + record.add_value('urls', [conference_paper_url]) + record.add_value( + '_fft', + self._set_fft( + path=conference_paper_pdf_url, + ), + ) parsed_item = ParsedItem( record=record.load_item(), @@ -277,7 +284,7 @@ def build_conference_proceedings_item( record.add_value('journal_title', 'PoS') record.add_value( 'journal_volume', - self._get_journal_volume(pos_id=pos_id), + self._get_journal_volume(pos_ext_identifier=pos_id), ) parsed_proceeding = ParsedItem( @@ -309,6 +316,14 @@ def _get_conference_paper_pdf_url(self, conference_paper_page_html): conference_paper_pdf_relative_url, ) + @staticmethod + def _set_fft(path): + return [ + { + 'path': path, + }, + ] + @staticmethod def _get_language(selector): language = 
selector.xpath( @@ -317,16 +332,16 @@ def _get_language(selector): return language if language != 'en' else None @staticmethod - def _get_journal_title(pos_id): - return re.split('[()]', pos_id)[0] + def _get_journal_title(pos_ext_identifier): + return re.split('[()]', pos_ext_identifier)[0] @staticmethod - def _get_journal_volume(pos_id): - return re.split('[()]', pos_id)[1] + def _get_journal_volume(pos_ext_identifier): + return re.split('[()]', pos_ext_identifier)[1] @staticmethod - def _get_journal_artid(pos_id): - return re.split('[()]', pos_id)[2] + def _get_journal_artid(pos_ext_identifier): + return re.split('[()]', pos_ext_identifier)[2] @staticmethod def _get_ext_systems_number(selector): diff --git a/tests/functional/pos/fixtures/pos_conference_proceedings_records.json b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json index d2ebb12a..317dd0c9 100644 --- a/tests/functional/pos/fixtures/pos_conference_proceedings_records.json +++ b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json @@ -41,6 +41,16 @@ "title": "Heavy Flavour Physics Review" } ], + "_fft": [ + { + "path": "https://server.local/187/001/pdf" + } + ], + "urls": [ + { + "value": "https://server.local/PoS(LATTICE%202013)001.html" + } + ], "authors": [ { "affiliations": [ diff --git a/tests/functional/pos/test_pos.py b/tests/functional/pos/test_pos.py index 66f24831..423db2b0 100644 --- a/tests/functional/pos/test_pos.py +++ b/tests/functional/pos/test_pos.py @@ -32,7 +32,13 @@ def override_generated_fields(record): @pytest.fixture(scope="function") -def set_up_environment(): +def wait_until_services_are_up(seconds=10): + # The test must wait until the docker environment is up (takes about 10 seconds). 
+ sleep(seconds) + + +@pytest.fixture(scope="function") +def configuration(): package_location = get_test_suite_path( 'pos', 'fixtures', @@ -41,9 +47,6 @@ def set_up_environment(): test_suite='functional', ) - # The test must wait until the docker environment is up (takes about 10 seconds). - sleep(10) - yield { 'CRAWLER_HOST_URL': 'http://scrapyd:6800', 'CRAWLER_PROJECT': 'hepcrawl', @@ -69,10 +72,11 @@ def set_up_environment(): ] ) def test_pos_conference_paper_record_and_proceedings_record( - set_up_environment, - expected_results, + configuration, + wait_until_services_are_up, + expected_results, ): - crawler = get_crawler_instance(set_up_environment.get('CRAWLER_HOST_URL')) + crawler = get_crawler_instance(configuration.get('CRAWLER_HOST_URL')) results = CeleryMonitor.do_crawl( app=celery_app, @@ -80,10 +84,10 @@ def test_pos_conference_paper_record_and_proceedings_record( monitor_iter_limit=100, events_limit=1, crawler_instance=crawler, - project=set_up_environment.get('CRAWLER_PROJECT'), + project=configuration.get('CRAWLER_PROJECT'), spider='pos', settings={}, - **set_up_environment.get('CRAWLER_ARGUMENTS') + **configuration.get('CRAWLER_ARGUMENTS') ) gotten_results = [override_generated_fields(result) for result in results] @@ -93,6 +97,8 @@ def test_pos_conference_paper_record_and_proceedings_record( # TODO create test that receives conference paper record AND proceedings record. +# 'Crawl-once' plug-in needed. # TODO create test that receives proceedings record ONLY. +# 'Crawl-once' plug-in needed. 
diff --git a/tests/unit/test_pos.py b/tests/unit/test_pos.py index 4de5fa8b..0b0910a3 100644 --- a/tests/unit/test_pos.py +++ b/tests/unit/test_pos.py @@ -29,7 +29,7 @@ def override_generated_fields(record): @pytest.fixture(scope='session') -def scrape_pos_page_body(): +def scrape_pos_conference_paper_page_body(): return pkg_resources.resource_string( __name__, os.path.join( @@ -41,7 +41,7 @@ def scrape_pos_page_body(): @pytest.fixture(scope='session') -def generated_record(scrape_pos_page_body): +def generated_conference_paper(scrape_pos_conference_paper_page_body): """Return results generator from the PoS spider.""" # environmental variables needed for the pipelines payload os.environ['SCRAPY_JOB'] = 'scrapy_job' @@ -51,12 +51,14 @@ def generated_record(scrape_pos_page_body): crawler = Crawler(spidercls=pos_spider.POSSpider) spider = pos_spider.POSSpider.from_crawler(crawler) request = spider.parse( - fake_response_from_file('pos/sample_pos_record.xml') + fake_response_from_file( + file_name=str('pos/sample_pos_record.xml'), + ) ).next() response = HtmlResponse( url=request.url, request=request, - body=scrape_pos_page_body, + body=scrape_pos_conference_paper_page_body, **{'encoding': 'utf-8'} ) assert response @@ -70,7 +72,7 @@ def generated_record(scrape_pos_page_body): return parsed_record -def test_titles(generated_record): +def test_titles(generated_conference_paper): """Test extracting title.""" expected_titles = [ { @@ -79,33 +81,33 @@ def test_titles(generated_record): } ] - assert 'titles' in generated_record - assert generated_record['titles'] == expected_titles + assert 'titles' in generated_conference_paper + assert generated_conference_paper['titles'] == expected_titles -def test_license(generated_record): +def test_license(generated_conference_paper): """Test extracting license information.""" expected_license = [{ 'license': 'CC-BY-NC-SA-3.0', 'url': 'https://creativecommons.org/licenses/by-nc-sa/3.0', }] - assert generated_record['license'] == 
expected_license + assert generated_conference_paper['license'] == expected_license -def test_collections(generated_record): +def test_collections(generated_conference_paper): """Test extracting collections.""" expected_document_type = ['conference paper'] - assert generated_record.get('citeable') - assert generated_record.get('document_type') == expected_document_type + assert generated_conference_paper.get('citeable') + assert generated_conference_paper.get('document_type') == expected_document_type -def test_language(generated_record): +def test_language(generated_conference_paper): """Test extracting language.""" - assert 'language' not in generated_record + assert 'language' not in generated_conference_paper -def test_publication_info(generated_record): +def test_publication_info(generated_conference_paper): """Test extracting dois.""" expected_pub_info = [{ 'artid': '001', @@ -114,13 +116,13 @@ def test_publication_info(generated_record): 'year': 2014, }] - assert 'publication_info' in generated_record + assert 'publication_info' in generated_conference_paper - pub_info = generated_record['publication_info'] + pub_info = generated_conference_paper['publication_info'] assert pub_info == expected_pub_info -def test_authors(generated_record): +def test_authors(generated_conference_paper): """Test authors.""" expected_authors = [ { @@ -133,9 +135,9 @@ def test_authors(generated_record): } ] - assert 'authors' in generated_record + assert 'authors' in generated_conference_paper - result_authors = generated_record['authors'] + result_authors = generated_conference_paper['authors'] assert len(result_authors) == len(expected_authors) @@ -144,7 +146,7 @@ def test_authors(generated_record): assert author == expected_author -def test_pipeline_record(generated_record): +def test_pipeline_conference_paper(generated_conference_paper): expected = { 'acquisition_source': { 'datetime': '2017-08-10T16:03:59.091110', @@ -198,7 +200,17 @@ def 
test_pipeline_record(generated_record): 'source': u'Sissa Medialab', 'title': u'Heavy Flavour Physics Review' } + ], + '_fft': [ + { + 'path': u'https://pos.sissa.it/archive/conferences/187/001/LATTICE 2013_001.pdf' + } + ], + 'urls': [ + { + 'value': 'https://pos.sissa.it/contribution?id=PoS%28LATTICE+2013%29001' + } ] } - assert override_generated_fields(generated_record) == expected + assert override_generated_fields(generated_conference_paper) == expected