diff --git a/.travis.yml b/.travis.yml index 7682b90c..6bc66b84 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,6 +30,7 @@ env: - SUITE=functional_arxiv - SUITE=functional_desy - SUITE=functional_cds + - SUITE=functional_pos matrix: fast_finish: true diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 65bb864b..9a1df2e0 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -33,22 +33,27 @@ services: functional_wsp: <<: *service_base command: py.test -vv tests/functional/wsp - links: - - scrapyd - - ftp_server + depends_on: + scrapyd: + condition: service_healthy + ftp_server: + condition: service_healthy functional_desy: <<: *service_base command: py.test -vv tests/functional/desy - links: - - scrapyd - - ftp_server + depends_on: + scrapyd: + condition: service_healthy + ftp_server: + condition: service_healthy functional_arxiv: <<: *service_base command: py.test -vv tests/functional/arxiv - links: - - scrapyd + depends_on: + scrapyd: + condition: service_healthy functional_cds: <<: *service_base @@ -56,6 +61,15 @@ services: links: - scrapyd + functional_pos: + <<: *service_base + command: py.test -vv tests/functional/pos + depends_on: + scrapyd: + condition: service_healthy + http-server.local: + condition: service_healthy + unit: <<: *service_base command: bash -c "py.test tests/unit -vv && make -C docs clean && make -C docs html && python setup.py sdist && ls dist/*" @@ -64,14 +78,16 @@ services: celery: <<: *service_base command: celery worker --events --app hepcrawl.testlib.tasks --loglevel=debug - links: - - rabbitmq + depends_on: + rabbitmq: + condition: service_healthy scrapyd: <<: *service_base command: bash -c "rm -f twistd.pid && exec scrapyd" - links: - - celery + depends_on: + celery: + condition: service_started healthcheck: timeout: 5s interval: 5s @@ -83,8 +99,9 @@ services: scrapyd-deploy: <<: *service_base command: bash -c "scrapyd-deploy" - links: - - scrapyd + depends_on: + scrapyd: + condition: service_healthy ftp_server: image: stilliard/pure-ftpd:hardened @@ -96,5 +113,29 @@ services: - ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP - ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd + http-server.local: + image: nginx:stable-alpine + volumes: + - ${PWD}/tests/functional/pos/fixtures/https_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf + - ${PWD}/tests/functional/pos/fixtures/https_server/conf/ssl:/etc/nginx/ssl + - ${PWD}/tests/functional/pos/fixtures/https_server/records:/etc/nginx/html/ + ports: + - 443:443 + healthcheck: + timeout: 5s + interval: 5s + retries: 5 + test: + - "CMD-SHELL" + - "curl https://localhost:443/" + rabbitmq: image: rabbitmq + healthcheck: + timeout: 5s + interval: 5s + retries: 5 + test: + - "CMD" + - "rabbitmqctl" + - "status" diff --git a/hepcrawl/items.py b/hepcrawl/items.py index dab67dda..09d0d552 100644 --- a/hepcrawl/items.py +++ b/hepcrawl/items.py @@ -44,15 +44,17 @@ class HEPRecord(scrapy.Item): file_urls = scrapy.Field() """List of files to be downloaded with FilesPipeline and added to files.""" - additional_files = scrapy.Field() + documents = scrapy.Field() """Files (fulltexts, package) belonging to this item. 
Example: :: [{ - "type": "Fulltext", # Fulltext, Supplemental, Data, Figure - "uri": "file:///path/to/file", # can also be HTTP + "fulltext": true, + "url": "file:///path/to/file", + "description": "some fancy stuff", + "key": "usually_a_file_name.pdf", }] """ diff --git a/hepcrawl/middlewares.py b/hepcrawl/middlewares.py index 99551e93..ecd518f4 100644 --- a/hepcrawl/middlewares.py +++ b/hepcrawl/middlewares.py @@ -116,10 +116,23 @@ def _has_to_be_crawled(self, request, spider): request_db_key = self._get_key(request) if request_db_key not in self.db: + LOGGER.debug( + 'Crawl-Once: key %s for request %s not found in the db, ' + 'should be crawled.' % (request_db_key, request) + ) return True new_file_timestamp = self._get_timestamp(request, spider) old_file_timestamp = self.db.get(key=request_db_key) + LOGGER.debug( + 'Crawl-Once: key %s for request %s found in the db, ' + 'considering timestamps new(%s) and old(%s).' % ( + request_db_key, + request, + new_file_timestamp, + old_file_timestamp, + ) + ) return new_file_timestamp > old_file_timestamp def _get_key(self, request): diff --git a/hepcrawl/spiders/edp_spider.py b/hepcrawl/spiders/edp_spider.py index ef075e4d..c051c8ee 100644 --- a/hepcrawl/spiders/edp_spider.py +++ b/hepcrawl/spiders/edp_spider.py @@ -312,7 +312,7 @@ def build_item_rich(self, response): # NOTE: maybe this should be removed as the 'rich' format records # are not open access. record.add_value( - "additional_files", + "documents", self._create_file( get_first(response.meta["pdf_links"]), "INSPIRE-PUBLIC", @@ -384,7 +384,7 @@ def build_item_jats(self, response): if "pdf_links" in response.meta: record.add_value( - "additional_files", + "documents", self._create_file( get_first(response.meta["pdf_links"]), "INSPIRE-PUBLIC", diff --git a/hepcrawl/spiders/elsevier_spider.py b/hepcrawl/spiders/elsevier_spider.py index 3f1fe0c6..e2d4e919 100644 --- a/hepcrawl/spiders/elsevier_spider.py +++ b/hepcrawl/spiders/elsevier_spider.py @@ -995,7 +995,10 @@ def build_item(self, response): xml_file = response.meta.get("xml_url") if xml_file: - record.add_value('additional_files', self.add_file(xml_file, "HIDDEN", "Fulltext")) + record.add_value( + 'documents', + self.add_file(xml_file, "HIDDEN", "Fulltext"), + ) sd_url = self._get_sd_url(xml_file) if requests.head(sd_url).status_code == 200: # Test if valid url record.add_value("urls", sd_url) diff --git a/hepcrawl/spiders/hindawi_spider.py b/hepcrawl/spiders/hindawi_spider.py index 999b7183..5f81f5b4 100644 --- a/hepcrawl/spiders/hindawi_spider.py +++ b/hepcrawl/spiders/hindawi_spider.py @@ -154,13 +154,13 @@ def get_journal_pages(node): else: return journal_pages, '' - def create_file(self, file_path, file_access, file_type): - """Create a structured dictionary to add to 'files' item.""" + def create_document(self, file_path): + """Create a structured dictionary to add to 'documents' item.""" file_dict = { - "access": file_access, + "hidden": True, "description": self.name.upper(), "url": file_path, - "type": file_type, + "fulltext": True, } return file_dict @@ -219,9 +219,9 @@ def parse_node(self, response, node): record.add_value('file_urls', pdf_links) if xml_links: record.add_value( - 'additional_files', + 'documents', [ - self.create_file(xml, "INSPIRE-HIDDEN", "Fulltext") + self.create_document(xml) for xml in xml_links ] ) diff --git a/hepcrawl/spiders/infn_spider.py b/hepcrawl/spiders/infn_spider.py index 23e71708..2e093ab1 100644 --- a/hepcrawl/spiders/infn_spider.py +++ b/hepcrawl/spiders/infn_spider.py @@ -232,7 +232,10 
@@ def build_item(self, response): pdf_files = response.meta.get("pdf_links") if pdf_files: - record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext")) + record.add_value( + 'documents', + self.add_file(pdf_files, "HIDDEN", "Fulltext"), + ) record.add_value('authors', response.meta.get("authors")) record.add_value('date_published', response.meta.get("date_published")) record.add_value('thesis', response.meta.get("thesis_info")) diff --git a/hepcrawl/spiders/iop_spider.py b/hepcrawl/spiders/iop_spider.py index ee778e58..fbca3ae5 100644 --- a/hepcrawl/spiders/iop_spider.py +++ b/hepcrawl/spiders/iop_spider.py @@ -152,13 +152,13 @@ def get_pdf_path(self, vol, issue, fpage): if pattern in pdf_path: return os.path.join(self.pdf_files, pdf_path) - def add_file(self, file_path, file_access, file_type): + def add_document(self, file_path, hidden, fulltext): """Create a structured dictionary and add to 'files' item.""" file_dict = { - "access": file_access, + "hidden": hidden, + "fulltext": fulltext, "description": self.name.upper(), "url": file_path, - "type": file_type, } return file_dict @@ -206,21 +206,25 @@ def parse_node(self, response, node): record.add_value('collections', self.get_collections(doctype)) xml_file_path = response.url - record.add_value("additional_files", - self.add_file(xml_file_path, "INSPIRE-HIDDEN", "Fulltext")) + record.add_value( + "documents", + self.add_document(xml_file_path, hidden=True, fulltext=True), + ) if self.pdf_files: pdf_file_path = self.get_pdf_path(volume, issue, fpage) if pdf_file_path: if doctype and "erratum" in doctype.lower(): - file_type = "Erratum" + fulltext = False else: - file_type = "Fulltext" + fulltext = True if journal_title in self.OPEN_ACCESS_JOURNALS: - file_access = "INSPIRE-PUBLIC" # FIXME: right? 
+ hidden = False else: - file_access = "INSPIRE-HIDDEN" - record.add_value("additional_files", - self.add_file(pdf_file_path, file_access, file_type)) + hidden = True + record.add_value( + "documents", + self.add_document(pdf_file_path, hidden=hidden, fulltext=fulltext), + ) parsed_item = ParsedItem( record=record.load_item(), diff --git a/hepcrawl/spiders/magic_spider.py b/hepcrawl/spiders/magic_spider.py index 41687674..8dfd5d51 100644 --- a/hepcrawl/spiders/magic_spider.py +++ b/hepcrawl/spiders/magic_spider.py @@ -177,7 +177,7 @@ def build_item(self, response): record.add_value('title', response.meta.get("title")) record.add_value('urls', response.meta.get("urls")) record.add_value("abstract", response.meta.get("abstract")) - record.add_value("additional_files", response.meta.get("files")) + record.add_value("documents", response.meta.get("files")) record.add_value('collections', ['HEP', 'THESIS']) parsed_item = ParsedItem( diff --git a/hepcrawl/spiders/mit_spider.py b/hepcrawl/spiders/mit_spider.py index 8ba61d89..21804873 100644 --- a/hepcrawl/spiders/mit_spider.py +++ b/hepcrawl/spiders/mit_spider.py @@ -207,8 +207,10 @@ def build_item(self, response): pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract() if pdf_files: - record.add_value('additional_files', self.add_file( - pdf_files, "HIDDEN", "Fulltext")) + record.add_value( + 'documents', + self.add_file(pdf_files, "HIDDEN", "Fulltext"), + ) record.add_value('authors', self.get_authors(node)) record.add_xpath('date_published', "//td[contains(text(), 'dc.date.issued')]/following-sibling::td[1]/text()") diff --git a/hepcrawl/spiders/phenix_spider.py b/hepcrawl/spiders/phenix_spider.py index a5fcd025..aa54bd98 100644 --- a/hepcrawl/spiders/phenix_spider.py +++ b/hepcrawl/spiders/phenix_spider.py @@ -121,7 +121,10 @@ def parse_node(self, response, node): return None pdf_files = node.xpath(".//a/@href").extract() - record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext")) + record.add_value( + 'documents', + self.add_file(pdf_files, "HIDDEN", "Fulltext"), + ) record.add_value('authors', self.get_authors(node)) record.add_value('date_published', year) record.add_value('thesis', {'degree_type': thesis_type}) diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index c388a487..19d4fee5 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -12,7 +12,7 @@ from __future__ import absolute_import, division, print_function import re - +import os from urlparse import urljoin from scrapy import Request, Selector @@ -28,112 +28,231 @@ ) +DEFAULT_BASE_URL = 'https://pos.sissa.it' +DEFAULT_BASE_CONFERENCE_PAPER_URL = ( + DEFAULT_BASE_URL + '/contribution?id=' +) +DEFAULT_BASE_PROCEEDINGS_URL = ( + DEFAULT_BASE_URL + '/cgi-bin/reader/conf.cgi?confid=' +) + + +class PoSExtractionException(Exception): + pass + + class POSSpider(StatefulSpider): """POS/Sissa crawler. - Extracts from metadata: - * title - * article-id - * conf-acronym - * authors - * affiliations - * publication-date - * publisher - * license - * language - * link + From PoS we create two types of records, a conference paper record, and a + conference proceedings record. + + The bulk of the records comes from oaiharvest, and this spider crawls the + files generated by it. + + For the conference paper record we also have to scrape the html page of the + record on the PoS website to get the pdf link.
(see + `DEFAULT_BASE_CONFERENCE_PAPER_URL`) + + Then, from that same page, we get the internal conference id. + + With that conference id, we then scrape the conference proceedings page, + and extract the information to create the proceedings record. (see + `DEFAULT_BASE_PROCEEDINGS_URL`) + + To do that, and because each step needs the information of the previous one, + the spider must use the callback system provided by scrapy through the + :class:`scrapy.http.response.Response` callback parameter, and chain the + parser functions. + + The deduplication of the conference proceedings papers is left to the + `HepcrawlCrawlOnceMiddleware` middleware. Example: :: - - $ scrapy crawl PoS -a source_file=file://`pwd`/tests/responses/pos/sample_pos_record.xml + $ scrapy crawl PoS \\ + -a "source_file=file://$PWD/tests/unit/responses/pos/sample_pos_record.xml" """ - name = 'PoS' - pos_base_url = "https://pos.sissa.it/contribution?id=" + name = 'pos' - def __init__(self, source_file=None, **kwargs): - """Construct POS spider.""" + def __init__( + self, + source_file=None, + base_conference_paper_url=DEFAULT_BASE_CONFERENCE_PAPER_URL, + base_proceedings_url=DEFAULT_BASE_PROCEEDINGS_URL, + **kwargs + ): super(POSSpider, self).__init__(**kwargs) self.source_file = source_file + self.base_conference_paper_url = base_conference_paper_url + self.base_proceedings_url = base_proceedings_url def start_requests(self): yield Request(self.source_file) def parse(self, response): - """Get PDF information.""" - node = response.selector - node.remove_namespaces() - for record in node.xpath('.//record'): - identifier = record.xpath('.//metadata/pex-dc/identifier/text()').extract_first() - if identifier: - # Probably all links lead to same place, so take first - pos_url = "{0}{1}".format(self.pos_base_url, identifier) - request = Request(pos_url, callback=self.scrape_pos_page) - request.meta["url"] = response.url - request.meta["record"] = record.extract() - yield request - - def scrape_pos_page(self, response): - """Parse a page for PDF link.""" - response.meta["pos_pdf_url"] = response.selector.xpath( - "//a[contains(text(),'pdf')]/@href" + self.log('Got record from: {response.url}'.format(**vars())) + + response.selector.remove_namespaces() + record_xml_selectors = response.selector.xpath('.//record') + for record_xml_selector in record_xml_selectors: + yield self.get_conference_paper_page_request( + xml_selector=record_xml_selector, + ) + + def get_conference_paper_page_request(self, xml_selector, meta=None): + """Gets the conference paper html page, to extract the pdf link and, + later, the internal conference id.
+ """ + meta = meta or {} + + identifier = xml_selector.xpath( + './/metadata/pex-dc/identifier/text()' + ).extract_first() + conference_paper_url = "{0}{1}".format( + self.base_conference_paper_url, + identifier, + ) + meta['xml_record'] = xml_selector.extract() + + # the meta parameter will be passed over to the callback as a property + # in the response parameter + return Request( + url=conference_paper_url, + callback=self.parse_conference_paper, + meta=meta + ) + + def parse_conference_paper(self, response): + self.log( + 'Parsing conference paper from: {response.url}'.format(**vars()) + ) + xml_record = response.meta.get('xml_record') + conference_paper_url = response.url + conference_paper_pdf_url = self._get_conference_paper_pdf_url( + conference_paper_page_html=response.body, + ) + + parsed_conference_paper = self.build_conference_paper_item( + xml_record=xml_record, + conference_paper_url=conference_paper_url, + conference_paper_pdf_url=conference_paper_pdf_url, + ) + yield parsed_conference_paper + + # prepare next callback step + response.meta['html_record'] = response.body + yield self.get_conference_proceedings_page_request( + meta=response.meta, + ) + + def get_conference_proceedings_page_request(self, meta): + """Gets the conference proceedings page, using the internal conference + id from the record html page retrieved before. + """ + if not meta.get('html_record'): + raise PoSExtractionException( + 'PoS conference paper page was empty, current meta:\n%s' % meta + ) + + proceedings_page_url = self._get_proceedings_page_url( + page_html=meta.get('html_record'), + ) + + page_selector = Selector( + text=meta.get('xml_record'), + type='xml', + ) + page_selector.remove_namespaces() + pos_id = page_selector.xpath( + ".//metadata/pex-dc/identifier/text()" ).extract_first() - response.meta["pos_pdf_url"] = urljoin(self.pos_base_url, response.meta["pos_pdf_url"]) - response.meta["pos_url"] = response.url - return self.build_item(response) - - def build_item(self, response): - """Parse an PoS XML exported file into a HEP record.""" - text = response.meta["record"] - node = Selector(text=text, type="xml") - node.remove_namespaces() - record = HEPLoader(item=HEPRecord(), selector=node) + meta['pos_id'] = pos_id + + return Request( + url=proceedings_page_url, + meta=meta, + callback=self.parse_conference_proceedings, + ) + + def parse_conference_proceedings(self, request): + parsed_conference_proceedings = self.build_conference_proceedings_item( + proceedings_page_html=request.body, + pos_id=request.meta['pos_id'], + ) + yield parsed_conference_proceedings + + def _get_proceedings_page_url(self, page_html): + page_selector = Selector( + text=page_html, + type="html" + ) + internal_url = page_selector.xpath( + "//a[not(contains(text(),'pdf'))]/@href", + ).extract_first() + proceedings_internal_id = internal_url.split('/')[1] + return '{0}{1}'.format( + self.base_proceedings_url, + proceedings_internal_id, + ) + + def build_conference_paper_item( + self, + xml_record, + conference_paper_url, + conference_paper_pdf_url, + ): + selector = Selector( + text=xml_record, + type="xml" + ) + selector.remove_namespaces() + record = HEPLoader( + item=HEPRecord(), + selector=selector + ) + + license_text = selector.xpath( + './/metadata/pex-dc/rights/text()' + ).extract_first() + record.add_value('license', get_licenses(license_text=license_text)) + + date, year = self._get_date(selector=selector) + record.add_value('date_published', date) + record.add_value('journal_year', year) + + identifier =
selector.xpath( + ".//metadata/pex-dc/identifier/text()" + ).extract_first() + record.add_value( + 'journal_title', + self._get_journal_title(pos_ext_identifier=identifier), + ) + record.add_value( + 'journal_volume', + self._get_journal_volume(pos_ext_identifier=identifier), + ) + record.add_value( + 'journal_artid', + self._get_journal_artid(pos_ext_identifier=identifier), + ) + record.add_xpath('title', '//metadata/pex-dc/title/text()') record.add_xpath('source', '//metadata/pex-dc/publisher/text()') - - record.add_value('external_system_numbers', self._get_ext_systems_number(node)) - - license = get_licenses( - license_text=node.xpath( - ".//metadata/pex-dc/rights/text()" - ).extract_first(), - ) - record.add_value('license', license) - - date, year = self._get_date(node) - if date: - record.add_value('date_published', date) - if year: - record.add_value('journal_year', int(year)) - - identifier = node.xpath(".//metadata/pex-dc/identifier/text()").extract_first() - record.add_value('urls', response.meta['pos_url']) - if response.meta['pos_pdf_url']: - record.add_value('additional_files', {'type': "Fulltext", "url": response.meta['pos_pdf_url']}) - if identifier: - pbn = re.split('[()]', identifier) - if len(pbn) == 3: - conf_acronym = pbn[1] - article_id = pbn[2] - record.add_value('journal_title', pbn[0]) - record.add_value('journal_volume', conf_acronym) - record.add_value('journal_artid', article_id) - else: - record.add_value('pubinfo_freetext', identifier) - - language = node.xpath(".//metadata/pex-dc/language/text()").extract_first() - if language: - record.add_value('language', language) - - authors = self._get_authors(node) - if authors: - record.add_value('authors', authors) - - extra_data = self._get_extra_data(node) - if extra_data: - record.add_value('extra_data', extra_data) - - record.add_value('collections', ['HEP', 'ConferencePaper']) + record.add_value( + 'external_system_numbers', + self._get_ext_systems_number(selector=selector), + ) + record.add_value('language', self._get_language(selector=selector)) + record.add_value('authors', self._get_authors(selector=selector)) + record.add_value('collections', ['conferencepaper']) + record.add_value('urls', [conference_paper_url]) + record.add_value( + 'documents', + self.get_documents( + path=conference_paper_pdf_url, + ), + ) parsed_item = ParsedItem( record=record.load_item(), @@ -142,50 +261,165 @@ def build_item(self, response): return parsed_item - def _get_ext_systems_number(self, node): + def build_conference_proceedings_item( + self, + proceedings_page_html, + pos_id, + ): + selector = Selector( + text=proceedings_page_html, + type='html', + ) + selector.remove_namespaces() + record = HEPLoader( + item=HEPRecord(), + selector=selector + ) + + record.add_value('collections', ['proceedings']) + record.add_value( + 'title', + self._get_proceedings_title(selector=selector), + ) + record.add_value( + 'subtitle', + self._get_proceedings_date_place(selector=selector), + ) + record.add_value('journal_title', 'PoS') + record.add_value( + 'journal_volume', + self._get_journal_volume(pos_ext_identifier=pos_id), + ) + + parsed_proceeding = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_proceeding + + def _get_conference_paper_pdf_url(self, conference_paper_page_html): + selector = Selector( + text=conference_paper_page_html, + type='html', + ) + conference_paper_pdf_relative_url = selector.xpath( + "//a[contains(text(),'pdf')]/@href", + ).extract_first() + + if not 
conference_paper_pdf_relative_url: + raise PoSExtractionException( + ( + 'Unable to get the conference paper pdf url from the html:' + '\n%s' + ) % conference_paper_page_html + ) + + return urljoin( + self.base_conference_paper_url, + conference_paper_pdf_relative_url, + ) + + def _get_proceedings_url(self, response): + internal_url = response.selector.xpath( + "//a[not(contains(text(),'pdf'))]/@href", + ).extract_first() + proceedings_identifier = internal_url.split('/')[1] + return '{0}{1}'.format( + self.base_proceedings_url, + proceedings_identifier, + ) + + @staticmethod + def get_documents(path): return [ { - 'institute': 'PoS', - 'value': node.xpath('.//metadata/pex-dc/identifier/text()').extract_first() + 'key': os.path.basename(path), + 'url': path, + 'original_url': path, + 'hidden': True, + 'fulltext': True, }, + ] + + @staticmethod + def _get_language(selector): + language = selector.xpath( + ".//metadata/pex-dc/language/text()" + ).extract_first() + return language if language != 'en' else None + + @staticmethod + def _get_journal_title(pos_ext_identifier): + return re.split('[()]', pos_ext_identifier)[0] + + @staticmethod + def _get_journal_volume(pos_ext_identifier): + return re.split('[()]', pos_ext_identifier)[1] + + @staticmethod + def _get_journal_artid(pos_ext_identifier): + return re.split('[()]', pos_ext_identifier)[2] + + @staticmethod + def _get_ext_systems_number(selector): + return [ { - 'institute': 'PoS', - 'value': node.xpath('.//identifier/text()').extract_first() + 'institute': 'pos', + 'value': selector.xpath( + './/identifier/text()' + ).extract_first() }, ] - def _get_date(self, node): - """Get article date.""" - date = '' - year = '' - full_date = node.xpath(".//metadata/pex-dc/date/text()").extract_first() + @staticmethod + def _get_date(selector): + full_date = selector.xpath( + ".//metadata/pex-dc/date/text()" + ).extract_first() date = create_valid_date(full_date) - if date: - year = date[0:4] + year = int(date[0:4]) + return date, year - def _get_authors(self, node): + @staticmethod + def _get_authors(selector): """Get article authors.""" - author_selectors = node.xpath('.//metadata/pex-dc/creator') authors = [] - for selector in author_selectors: + creators = selector.xpath('.//metadata/pex-dc/creator') + for creator in creators: auth_dict = {} - author = Selector(text=selector.extract()) - auth_dict['raw_name'] = \ - get_first(author.xpath('.//name//text()').extract(), default='') - for affiliation in author.xpath('.//affiliation//text()').extract(): + author = Selector(text=creator.extract()) + auth_dict['raw_name'] = get_first( + author.xpath('.//name//text()').extract(), + default='', + ) + for affiliation in author.xpath( + './/affiliation//text()' + ).extract(): if 'affiliations' in auth_dict: - auth_dict['affiliations'].append({'value': affiliation}) + auth_dict['affiliations'].append( + { + 'value': affiliation + } + ) else: - auth_dict['affiliations'] = [{'value': affiliation}, ] + auth_dict['affiliations'] = [ + { + 'value': affiliation + }, + ] if auth_dict: authors.append(auth_dict) return authors - def _get_extra_data(self, node): - """Get info to help selection - not for INSPIRE record""" - extra_data = {} + @staticmethod + def _get_proceedings_title(selector): + return 'Proceedings, ' + selector.xpath('//h1/text()').extract_first() - section = node.xpath(".//metadata/pex-dc/description/text()").extract_first() - extra_data['section'] = section.split(';', 1)[-1].strip() - return extra_data + @staticmethod + def 
_get_proceedings_date_place(selector): + date_place = selector.xpath( + "//div[@class='conference_date']/text()" + ).extract() + return ''.join(date_place) diff --git a/hepcrawl/spiders/t2k_spider.py b/hepcrawl/spiders/t2k_spider.py index 4dd495a9..db18eb1e 100644 --- a/hepcrawl/spiders/t2k_spider.py +++ b/hepcrawl/spiders/t2k_spider.py @@ -101,16 +101,16 @@ def get_splash_links(self, node): return out_links - def add_file(self, pdf_files, file_access, file_type): + def add_document(self, pdf_files): """Create a structured dictionary and add to ``files`` item.""" # NOTE: should this be moved to utils? file_dicts = [] for link in pdf_files: file_dict = { - "access": file_access, + "hidden": True, + "fulltext": True, "description": self.name.title(), "url": urljoin(self.domain, link), - "type": file_type, } file_dicts.append(file_dict) return file_dicts @@ -149,7 +149,7 @@ def scrape_for_pdf(self, response): "//a[@class='contenttype-file state-internal url']/@href").extract() response.meta["abstract"] = abstract - response.meta["additional_files"] = self.add_file(file_paths, "HIDDEN", "Fulltext") + response.meta["documents"] = self.add_document(file_paths) return self.build_item(response) @@ -165,7 +165,7 @@ def build_item(self, response): record.add_value('title', response.meta.get("title")) record.add_value('urls', response.meta.get("urls")) record.add_value("abstract", response.meta.get("abstract")) - record.add_value("additional_files", response.meta.get("additional_files")) + record.add_value("documents", response.meta.get("documents")) record.add_value('collections', ['HEP', 'THESIS']) parsed_item = ParsedItem( diff --git a/hepcrawl/tohep.py b/hepcrawl/tohep.py index 980a030d..a4529d7c 100644 --- a/hepcrawl/tohep.py +++ b/hepcrawl/tohep.py @@ -92,7 +92,7 @@ def _normalize_hepcrawl_record(item, source): item['titles'] = [{ 'title': item.pop('title', ''), 'subtitle': item.pop('subtitle', ''), - 'source': source, + 'source': item.pop('source', source), }] item['abstracts'] = [{ @@ -242,13 +242,14 @@ def _filter_affiliation(affiliations): for author in crawler_record.get('authors', []): builder.add_author(builder.make_author( - author['full_name'], + full_name=author['full_name'], affiliations=_filter_affiliation(author['affiliations']), )) for title in crawler_record.get('titles', []): builder.add_title( title=title.get('title'), + subtitle=title.get('subtitle'), source=title.get('source') ) @@ -384,6 +385,20 @@ def _filter_affiliation(affiliations): source=report_number.get('source') ) + for url in crawler_record.get('urls', []): + builder.add_url(url=url.get('value')) + + for document in crawler_record.get('documents', []): + builder.add_document( + description=document.get('description'), + fulltext=document.get('fulltext'), + hidden=document.get('hidden'), + key=document['key'], + material=document.get('material'), + original_url=document.get('original_url'), + url=document['url'], + ) + builder.validate_record() return builder.record diff --git a/tests/functional/desy/test_desy.py b/tests/functional/desy/test_desy.py index 3c309e1f..42a51105 100644 --- a/tests/functional/desy/test_desy.py +++ b/tests/functional/desy/test_desy.py @@ -63,38 +63,14 @@ def _override(field_key, original_dict, backup_dict, new_value): return clean_record -def assert_files_equal(file_1, file_2): - """Compares two files calculating the md5 hash.""" - def _generate_md5_hash(file_path): - hasher = hashlib.md5() - with open(str(file_path), 'rb') as fd: - buf = fd.read() - hasher.update(buf) - return 
hasher.hexdigest() - - file_1_hash = _generate_md5_hash(file_1) - file_2_hash = _generate_md5_hash(file_2) - assert file_1_hash == file_2_hash - - -def assert_ffts_content_matches_expected(record): - for fft_field in record.get('_fft', []): - assert_fft_content_matches_expected(fft_field) - - -def assert_fft_content_matches_expected(fft_field): - expected_file_name = get_file_name_from_fft(fft_field) - assert_files_equal(expected_file_name, fft_field['path']) - - -def get_file_name_from_fft(fft_field): +def get_file_name_from_documents(documents_field): file_path = get_test_suite_path( 'desy', 'fixtures', 'ftp_server', 'DESY', 'FFT', - fft_field['filename'] + fft_field['format'], + documents_field['key'], test_suite='functional', ) return file_path @@ -213,6 +189,3 @@ def test_desy( ) assert gotten_results == expected_results - - for record in gotten_results: - assert_ffts_content_matches_expected(record) diff --git a/tests/functional/pos/fixtures/https_server/conf/proxy.conf b/tests/functional/pos/fixtures/https_server/conf/proxy.conf new file mode 100644 index 00000000..1591cbcd --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/conf/proxy.conf @@ -0,0 +1,25 @@ +server { + listen 443 ssl; + server_name localhost; + + ssl on; + ssl_protocols TLSv1 TLSv1.1 TLSv1.2; + ssl_certificate ssl/cert.pem; + ssl_certificate_key ssl/cert.key; + + location ~ /contribution { + if ($args ~* "^id=(.*)") { + set $mid $1; + set $args ''; + rewrite ^.*$ /$mid.html permanent; + } + } + + location ~ /cgi-bin/reader/conf.cgi { + if ($args ~* "^confid=(.*)") { + set $mid $1; + set $args ''; + rewrite ^.*$ /$mid.html permanent; + } + } +} diff --git a/tests/functional/pos/fixtures/https_server/conf/ssl/cert.key b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.key new file mode 100755 index 00000000..19e1df68 --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQChhBiOoipMRRcc +E5waKrGB01/QtRpfIGp5KmJfnif05dR05wWojHO6EtabZ2qbXtcSuyQ0vRNpbZUU +OzcriFOMk8dujDzuKMkegsq/LE4PyN/R5JZtf34NyGG7v70K6Uq7RV4PUzk+zoum +1McMUBk1QlGP/E9RsDlSPv9XOblUpicPDuwhCwPf4zi6jporgXjDJ/iUuh+bexxv +40R7f2dCWkiHYiNiLNLTwXdYkaWBcc3HoTq9FEZZhYDhWRjX0/TuINmMr5lbUvr6 +UYRABOS4VeUyHpb/e7OH9WXQxzR76LuQFfQDSgs0GxXw1KG58aq+P0ni2E77C4Iu +odQ8iT+jAgMBAAECggEBAIqJeFrXY7p5xIGznEChgBHgUR3+SPlxH4KARVLIoHMh +s2L2SVcx6Y2f3O38/Wb5KTcKx9polz7l3Go3BHJVg3xfwT7kENsipqeB/g+OHALU +BI7PJ+wR3/hIePQGWUsDobMRo8U3WDG0DfryJS09gvG4yabb/tkNc41FNdUGUR31 +7VInQFqv2/jZ/2A3s3DZ0Cns9vJuLhmf7629k3MVCuU7Rh0rStnVCA70kjgKzOfP ++26fnfd/MmrQYbaukw04+cwcwifGkF5Jis80qTWsgdF82rkzpwJLDo0Jd2HZFuOa +AHkWK2QiMzb6PS2Uo7Zarax9E+W2TLahANXZQQ32NAkCgYEAzKw7XbEwzWG/T7yX +EgNIAN7YtcGYr9sfHlVJ8bWYK7GZBbCkKDlGU+YGRE++plh/jtXYjsIFElWtv01Y +UpqBdWf7p8mXdtVoq6YyL5WuQVMwpjKHvegTXXwAoreEXZeKr1LKC11B14h+8wsR +D5uf0GVmdw12nSrzeu3Q4oSgss8CgYEAygU++fItIYuPtZfrC8qDcyEiOLQmAHtX +eTnEHOPy8ik+bdwF5Rg0nzxLu3RZ47ykGdEOzpGRO4B9V1EevwSEzX6VO7latMUS +cLKb3Y0bXm6qQcWG+LAlvyaHfAH0oN47xfScLDiUm6BKd4Eo9kpkgaQzSgUfFZNQ +6DHiA3Emau0CgYEAyel7Y3GjMGomvrXQ3x9HkDxH0/7Z71qe92CyYvZ/2VMKH9fk +Ch5+p9P8CLYW4anapQGH80WqlSzbDCd0Y4EzB6z+UceJWd0stnFtfw4N6znze3HM +AegJ+qaTRfL/bQlL8qwc0Fs+0i9A9enL+fbQEVmHXRl2E5TEwFgOQvkOQ3cCgYAA +4bD6qkHkKZXA9x7BeGrGb9iUYsTfr6ocD1J5xczjnaZ2GEW2UDq6jyrNcJ6LzeDx +c+YapKv7lH33iZUWxFBIDUtdbVul+k4wS7c+akU6TkVT8Ca8oxgnE2X39pI4uX+N +R5n+32hWnYZ1qwygtoZlwm+u3QLbtz7dJIqV9UJzqQKBgQCL8Xo9LA0Dm7ZsdDDI +I93YsjCELvBsonymmD1MTpk7uIA+qH8LAih+Vhonc17NtpXuas8eqc8ntuNLAgON 
+Tylvk32uaRqquHWl6MT7bwaaK7pD8KuOIUJdl5SEc+DDUcB2A2XLg7Yv08Dus8A7 +6J5oH8YJ3hqmVGZzbOo75IFerg== +-----END PRIVATE KEY----- diff --git a/tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem new file mode 100755 index 00000000..1418c1bb --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem @@ -0,0 +1,19 @@ +-----BEGIN CERTIFICATE----- +MIIDATCCAemgAwIBAgIJAJRKy2TWwZqTMA0GCSqGSIb3DQEBCwUAMBcxFTATBgNV +BAMMDGh0dHBzX3NlcnZlcjAeFw0xNzA4MTQxNDQ1MTFaFw0yMDA2MDMxNDQ1MTFa +MBcxFTATBgNVBAMMDGh0dHBzX3NlcnZlcjCCASIwDQYJKoZIhvcNAQEBBQADggEP +ADCCAQoCggEBAKGEGI6iKkxFFxwTnBoqsYHTX9C1Gl8gankqYl+eJ/Tl1HTnBaiM +c7oS1ptnapte1xK7JDS9E2ltlRQ7NyuIU4yTx26MPO4oyR6Cyr8sTg/I39Hklm1/ +fg3IYbu/vQrpSrtFXg9TOT7Oi6bUxwxQGTVCUY/8T1GwOVI+/1c5uVSmJw8O7CEL +A9/jOLqOmiuBeMMn+JS6H5t7HG/jRHt/Z0JaSIdiI2Is0tPBd1iRpYFxzcehOr0U +RlmFgOFZGNfT9O4g2YyvmVtS+vpRhEAE5LhV5TIelv97s4f1ZdDHNHvou5AV9ANK +CzQbFfDUobnxqr4/SeLYTvsLgi6h1DyJP6MCAwEAAaNQME4wHQYDVR0OBBYEFAfu +RxroDak/yro7MbRfDogKVDmBMB8GA1UdIwQYMBaAFAfuRxroDak/yro7MbRfDogK +VDmBMAwGA1UdEwQFMAMBAf8wDQYJKoZIhvcNAQELBQADggEBAF5M/Gz6JDC1HoSm +6HFLBB9ul9TQQI3RhohwreCYyeZ866WrvqZfle+lxcgVburYCSyi5paFpvNK3DH2 +J0A2fDAMekZGcaJ7O5Zx0evTCwXoxDOhS+xO5IlGTXWCEKLeLkU27WJiLC9cTbFr +kfjL14IMnsioRzUz4a+aX5JllqnEccCDlHjSk1w5YvOvt6GC6Bvenouja2apPes/ +oJJpFwZVO0epqOQo1ndRGbt5NLv6YgZlvdFXWoKNKohzdfDV/RbW9BrbpyKSxFTm +usrmVcZTQpSf69zbnEVO8N3N6c1zNdETPON1ZGLW1O1MXWkQDZniH6LduXN/Oob7 +vYqvXlw= +-----END CERTIFICATE----- diff --git a/tests/functional/pos/fixtures/https_server/records/187.html b/tests/functional/pos/fixtures/https_server/records/187.html new file mode 100644 index 00000000..0d86221a --- /dev/null +++ b/tests/functional/pos/fixtures/https_server/records/187.html @@ -0,0 +1,125 @@ + + +
[Text content of the new HTML fixture pages under tests/functional/pos/fixtures/https_server/records/; the markup itself is not preserved in this extract.]

Proceedings listing page (187.html):

    The annual lattice symposium brings together a global community of researchers
    from theoretical particle physics and beyond, who employ numerical and
    computational methods to study the properties of strongly interacting physical
    systems, above all Quantum Chromodynamics (QCD), the theory describing the
    interactions of quarks and gluons. Topics include studies of the spectrum and
    structure of hadrons, lattice studies of matter under extreme conditions,
    hadronic contributions to weak decay amplitudes, as well as recent
    developments in simulation algorithms and computer hardware. The 2013
    conference in Mainz was attended by over 500 participants from all over the
    globe, making it the biggest in this series so far.

    This proceedings volume is dedicated to the memory of Nobel Laureate Kenneth
    G. Wilson (June 8, 1936 - June 15, 2013).

    Preface
        Foreword - PoS(LATTICE 2013)503 - pdf
        Ken Wilson Obituary - PoS(LATTICE 2013)504 - pdf
    Plenary sessions
        Heavy Flavour Physics Review - PoS(LATTICE 2013)001 - pdf
        New Developments for Lattice Field Theory at Non-Zero Density - PoS(LATTICE 2013)002 - pdf

Conference paper page:

    Heavy Flavour Physics Review
    A. El-Khadra
    in 31st International Symposium on Lattice Field Theory LATTICE 2013
    Contribution: pdf

Second proceedings listing page (same introductory and dedication paragraphs as above):

    Preface
        Foreword - PoS(LATTICE 2013)503 - pdf
        Ken Wilson Obituary - PoS(LATTICE 2013)504 - pdf
    Plenary sessions
        Heavy Flavour Physics Review - PoS(LATTICE 2013)001 - pdf
        Charmonium, $D_s$ and $D_s^*$ from overlap fermion on domain wall fermion configurations - PoS(LATTICE 2013)500 - pdf
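
For reference, here is a minimal standalone sketch (not part of the patch) of the ``documents`` payload that the new ``POSSpider.get_documents`` helper produces; it mirrors the helper added in this diff, and the pdf url used below is invented purely for illustration: ::

    # Sketch only: restates POSSpider.get_documents from this diff so the shape
    # of a single ``documents`` entry can be inspected in isolation.
    import os


    def get_documents(path):
        # one hidden fulltext document, keyed by the file name of the pdf link
        return [
            {
                'key': os.path.basename(path),
                'url': path,
                'original_url': path,
                'hidden': True,
                'fulltext': True,
            },
        ]


    if __name__ == '__main__':
        # hypothetical pdf url, only to show the resulting keys
        example_pdf_url = 'https://pos.sissa.it/archive/conferences/187/001/LATTICE%202013_001.pdf'
        document = get_documents(example_pdf_url)[0]
        assert document['key'] == 'LATTICE%202013_001.pdf'
        assert document['fulltext'] is True and document['hidden'] is True

An entry with this shape is what the ``hepcrawl/tohep.py`` hunk above forwards to ``builder.add_document(description=..., fulltext=..., hidden=..., key=..., material=..., original_url=..., url=...)`` when converting a crawled record.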