Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pos: update pos spider #160

Merged
merged 14 commits on
Nov 1, 2017
Merged
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ env:
- SUITE=functional_arxiv
- SUITE=functional_desy
- SUITE=functional_cds
- SUITE=functional_pos

matrix:
fast_finish: true
Expand Down
69 changes: 55 additions & 14 deletions docker-compose.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,43 @@ services:
functional_wsp:
<<: *service_base
command: py.test -vv tests/functional/wsp
links:
- scrapyd
- ftp_server
depends_on:
scrapyd:
condition: service_healthy
ftp_server:
condition: service_healthy

functional_desy:
<<: *service_base
command: py.test -vv tests/functional/desy
links:
- scrapyd
- ftp_server
depends_on:
scrapyd:
condition: service_healthy
ftp_server:
condition: service_healthy

functional_arxiv:
<<: *service_base
command: py.test -vv tests/functional/arxiv
links:
- scrapyd
depends_on:
scrapyd:
condition: service_healthy

functional_cds:
<<: *service_base
command: py.test -vv tests/functional/cds
links:
- scrapyd

functional_pos:
<<: *service_base
command: py.test -vv tests/functional/pos
depends_on:
scrapyd:
condition: service_healthy
http-server.local:
condition: service_healthy

unit:
<<: *service_base
command: bash -c "py.test tests/unit -vv && make -C docs clean && make -C docs html && python setup.py sdist && ls dist/*"
Expand All @@ -64,14 +78,16 @@ services:
celery:
<<: *service_base
command: celery worker --events --app hepcrawl.testlib.tasks --loglevel=debug
links:
- rabbitmq
depends_on:
rabbitmq:
condition: service_healthy

scrapyd:
<<: *service_base
command: bash -c "rm -f twistd.pid && exec scrapyd"
links:
- celery
depends_on:
celery:
condition: service_started
healthcheck:
timeout: 5s
interval: 5s
Expand All @@ -83,8 +99,9 @@ services:
scrapyd-deploy:
<<: *service_base
command: bash -c "scrapyd-deploy"
links:
- scrapyd
depends_on:
scrapyd:
condition: service_healthy

ftp_server:
image: stilliard/pure-ftpd:hardened
Expand All @@ -96,5 +113,29 @@ services:
- ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP
- ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd

http-server.local:
image: nginx:stable-alpine
volumes:
- ${PWD}/tests/functional/pos/fixtures/https_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf
- ${PWD}/tests/functional/pos/fixtures/https_server/conf/ssl:/etc/nginx/ssl
- ${PWD}/tests/functional/pos/fixtures/https_server/records:/etc/nginx/html/
ports:
- 443:443
healthcheck:
timeout: 5s
interval: 5s
retries: 5
test:
- "CMD-SHELL"
- "curl https://localhost:443/"

rabbitmq:
image: rabbitmq
healthcheck:
timeout: 5s
interval: 5s
retries: 5
test:
- "CMD"
- "rabbitmqctl"
- "status"
8 changes: 5 additions & 3 deletions hepcrawl/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,17 @@ class HEPRecord(scrapy.Item):
file_urls = scrapy.Field()
"""List of files to be downloaded with FilesPipeline and added to files."""

additional_files = scrapy.Field()
documents = scrapy.Field()
"""Files (fulltexts, package) belonging to this item.
Example:
::
[{
"type": "Fulltext", # Fulltext, Supplemental, Data, Figure
"uri": "file:///path/to/file", # can also be HTTP
"fulltext": true,
"url": "file:///path/to/file",
"description": "some fancy stuff",
"key": "usually_a_file_name.pdf",
}]
"""

Expand Down
13 changes: 13 additions & 0 deletions hepcrawl/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,23 @@ def _has_to_be_crawled(self, request, spider):
request_db_key = self._get_key(request)

if request_db_key not in self.db:
LOGGER.debug(
'Crawl-Once: key %s for request %s not found in the db, '
'should be crawled.' % (request_db_key, request)
)
return True

new_file_timestamp = self._get_timestamp(request, spider)
old_file_timestamp = self.db.get(key=request_db_key)
LOGGER.debug(
'Crawl-Once: key %s for request %s found in the db, '
'considering timestamps new(%s) and old(%s).' % (
request_db_key,
request,
new_file_timestamp,
old_file_timestamp,
)
)
return new_file_timestamp > old_file_timestamp

def _get_key(self, request):
Expand Down
4 changes: 2 additions & 2 deletions hepcrawl/spiders/edp_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def build_item_rich(self, response):
# NOTE: maybe this should be removed as the 'rich' format records
# are not open access.
record.add_value(
"additional_files",
"documents",
self._create_file(
get_first(response.meta["pdf_links"]),
"INSPIRE-PUBLIC",
Expand Down Expand Up @@ -384,7 +384,7 @@ def build_item_jats(self, response):

if "pdf_links" in response.meta:
record.add_value(
"additional_files",
"documents",
self._create_file(
get_first(response.meta["pdf_links"]),
"INSPIRE-PUBLIC",
Expand Down
5 changes: 4 additions & 1 deletion hepcrawl/spiders/elsevier_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,7 +995,10 @@ def build_item(self, response):

xml_file = response.meta.get("xml_url")
if xml_file:
record.add_value('additional_files', self.add_file(xml_file, "HIDDEN", "Fulltext"))
record.add_value(
'documents',
self.add_file(xml_file, "HIDDEN", "Fulltext"),
)
sd_url = self._get_sd_url(xml_file)
if requests.head(sd_url).status_code == 200: # Test if valid url
record.add_value("urls", sd_url)
Expand Down
12 changes: 6 additions & 6 deletions hepcrawl/spiders/hindawi_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,13 +154,13 @@ def get_journal_pages(node):
else:
return journal_pages, ''

def create_file(self, file_path, file_access, file_type):
"""Create a structured dictionary to add to 'files' item."""
def create_document(self, file_path):
"""Create a structured dictionary to add to 'documents' item."""
file_dict = {
"access": file_access,
"hidden": True,
"description": self.name.upper(),
"url": file_path,
"type": file_type,
"fulltext": True,
}
return file_dict

Expand Down Expand Up @@ -219,9 +219,9 @@ def parse_node(self, response, node):
record.add_value('file_urls', pdf_links)
if xml_links:
record.add_value(
'additional_files',
'documents',
[
self.create_file(xml, "INSPIRE-HIDDEN", "Fulltext")
self.create_document(xml)
for xml in xml_links
]
)
Expand Down
5 changes: 4 additions & 1 deletion hepcrawl/spiders/infn_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,10 @@ def build_item(self, response):

pdf_files = response.meta.get("pdf_links")
if pdf_files:
record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext"))
record.add_value(
'documents',
self.add_file(pdf_files, "HIDDEN", "Fulltext"),
)
record.add_value('authors', response.meta.get("authors"))
record.add_value('date_published', response.meta.get("date_published"))
record.add_value('thesis', response.meta.get("thesis_info"))
Expand Down
26 changes: 15 additions & 11 deletions hepcrawl/spiders/iop_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,13 @@ def get_pdf_path(self, vol, issue, fpage):
if pattern in pdf_path:
return os.path.join(self.pdf_files, pdf_path)

def add_file(self, file_path, file_access, file_type):
def add_document(self, file_path, hidden, fulltext):
"""Create a structured dictionary and add to 'files' item."""
file_dict = {
"access": file_access,
"hidden": hidden,
"fulltext": fulltext,
"description": self.name.upper(),
"url": file_path,
"type": file_type,
}
return file_dict

Expand Down Expand Up @@ -206,21 +206,25 @@ def parse_node(self, response, node):
record.add_value('collections', self.get_collections(doctype))

xml_file_path = response.url
record.add_value("additional_files",
self.add_file(xml_file_path, "INSPIRE-HIDDEN", "Fulltext"))
record.add_value(
"documents",
self.add_document(xml_file_path, hidden=True, fulltext=True),
)
if self.pdf_files:
pdf_file_path = self.get_pdf_path(volume, issue, fpage)
if pdf_file_path:
if doctype and "erratum" in doctype.lower():
file_type = "Erratum"
fulltext = False
else:
file_type = "Fulltext"
fulltext = True
if journal_title in self.OPEN_ACCESS_JOURNALS:
file_access = "INSPIRE-PUBLIC" # FIXME: right?
hidden = False
else:
file_access = "INSPIRE-HIDDEN"
record.add_value("additional_files",
self.add_file(pdf_file_path, file_access, file_type))
hidden = True
record.add_value(
"documents",
self.add_document(pdf_file_path, hidden=hidden, fulltext=fulltext),
)

parsed_item = ParsedItem(
record=record.load_item(),
Expand Down
2 changes: 1 addition & 1 deletion hepcrawl/spiders/magic_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def build_item(self, response):
record.add_value('title', response.meta.get("title"))
record.add_value('urls', response.meta.get("urls"))
record.add_value("abstract", response.meta.get("abstract"))
record.add_value("additional_files", response.meta.get("files"))
record.add_value("documents", response.meta.get("files"))
record.add_value('collections', ['HEP', 'THESIS'])

parsed_item = ParsedItem(
Expand Down
6 changes: 4 additions & 2 deletions hepcrawl/spiders/mit_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,10 @@ def build_item(self, response):

pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract()
if pdf_files:
record.add_value('additional_files', self.add_file(
pdf_files, "HIDDEN", "Fulltext"))
record.add_value(
'documents',
self.add_file(pdf_files, "HIDDEN", "Fulltext"),
)
record.add_value('authors', self.get_authors(node))
record.add_xpath('date_published',
"//td[contains(text(), 'dc.date.issued')]/following-sibling::td[1]/text()")
Expand Down
5 changes: 4 additions & 1 deletion hepcrawl/spiders/phenix_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,10 @@ def parse_node(self, response, node):
return None

pdf_files = node.xpath(".//a/@href").extract()
record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext"))
record.add_value(
'documents',
self.add_file(pdf_files, "HIDDEN", "Fulltext"),
)
record.add_value('authors', self.get_authors(node))
record.add_value('date_published', year)
record.add_value('thesis', {'degree_type': thesis_type})
Expand Down
Loading