Skip to content

Commit

Permalink
Merge pull request #160 from spirosdelviniotis/hepcrawl_pos_spider
Browse files Browse the repository at this point in the history
pos: update pos spider
  • Loading branch information
david-caro authored Nov 1, 2017
2 parents 746ec16 + 1dd708d commit 4f33b74
Show file tree
Hide file tree
Showing 35 changed files with 1,282 additions and 247 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ env:
- SUITE=functional_arxiv
- SUITE=functional_desy
- SUITE=functional_cds
- SUITE=functional_pos

matrix:
fast_finish: true
Expand Down
69 changes: 55 additions & 14 deletions docker-compose.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,43 @@ services:
functional_wsp:
<<: *service_base
command: py.test -vv tests/functional/wsp
links:
- scrapyd
- ftp_server
depends_on:
scrapyd:
condition: service_healthy
ftp_server:
condition: service_healthy

functional_desy:
<<: *service_base
command: py.test -vv tests/functional/desy
links:
- scrapyd
- ftp_server
depends_on:
scrapyd:
condition: service_healthy
ftp_server:
condition: service_healthy

functional_arxiv:
<<: *service_base
command: py.test -vv tests/functional/arxiv
links:
- scrapyd
depends_on:
scrapyd:
condition: service_healthy

functional_cds:
<<: *service_base
command: py.test -vv tests/functional/cds
links:
- scrapyd

functional_pos:
<<: *service_base
command: py.test -vv tests/functional/pos
depends_on:
scrapyd:
condition: service_healthy
http-server.local:
condition: service_healthy

unit:
<<: *service_base
command: bash -c "py.test tests/unit -vv && make -C docs clean && make -C docs html && python setup.py sdist && ls dist/*"
Expand All @@ -64,14 +78,16 @@ services:
celery:
<<: *service_base
command: celery worker --events --app hepcrawl.testlib.tasks --loglevel=debug
links:
- rabbitmq
depends_on:
rabbitmq:
condition: service_healthy

scrapyd:
<<: *service_base
command: bash -c "rm -f twistd.pid && exec scrapyd"
links:
- celery
depends_on:
celery:
condition: service_started
healthcheck:
timeout: 5s
interval: 5s
Expand All @@ -83,8 +99,9 @@ services:
scrapyd-deploy:
<<: *service_base
command: bash -c "scrapyd-deploy"
links:
- scrapyd
depends_on:
scrapyd:
condition: service_healthy

ftp_server:
image: stilliard/pure-ftpd:hardened
Expand All @@ -96,5 +113,29 @@ services:
- ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP
- ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd

http-server.local:
image: nginx:stable-alpine
volumes:
- ${PWD}/tests/functional/pos/fixtures/https_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf
- ${PWD}/tests/functional/pos/fixtures/https_server/conf/ssl:/etc/nginx/ssl
- ${PWD}/tests/functional/pos/fixtures/https_server/records:/etc/nginx/html/
ports:
- 443:443
healthcheck:
timeout: 5s
interval: 5s
retries: 5
test:
- "CMD-SHELL"
- "curl https://localhost:443/"

rabbitmq:
image: rabbitmq
healthcheck:
timeout: 5s
interval: 5s
retries: 5
test:
- "CMD"
- "rabbitmqctl"
- "status"
8 changes: 5 additions & 3 deletions hepcrawl/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,17 @@ class HEPRecord(scrapy.Item):
file_urls = scrapy.Field()
"""List of files to be downloaded with FilesPipeline and added to files."""

additional_files = scrapy.Field()
documents = scrapy.Field()
"""Files (fulltexts, package) belonging to this item.
Example:
::
[{
"type": "Fulltext", # Fulltext, Supplemental, Data, Figure
"uri": "file:///path/to/file", # can also be HTTP
"fulltext": true,
"url": "file:///path/to/file",
"description": "some fancy stuff",
"key": "usually_a_file_name.pdf",
}]
"""

Expand Down
13 changes: 13 additions & 0 deletions hepcrawl/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,23 @@ def _has_to_be_crawled(self, request, spider):
request_db_key = self._get_key(request)

if request_db_key not in self.db:
LOGGER.debug(
'Crawl-Once: key %s for request %s not found in the db, '
'should be crawled.' % (request_db_key, request)
)
return True

new_file_timestamp = self._get_timestamp(request, spider)
old_file_timestamp = self.db.get(key=request_db_key)
LOGGER.debug(
'Crawl-Once: key %s for request %s found in the db, '
'considering timestamps new(%s) and old(%s).' % (
request_db_key,
request,
new_file_timestamp,
old_file_timestamp,
)
)
return new_file_timestamp > old_file_timestamp

def _get_key(self, request):
Expand Down
4 changes: 2 additions & 2 deletions hepcrawl/spiders/edp_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def build_item_rich(self, response):
# NOTE: maybe this should be removed as the 'rich' format records
# are not open access.
record.add_value(
"additional_files",
"documents",
self._create_file(
get_first(response.meta["pdf_links"]),
"INSPIRE-PUBLIC",
Expand Down Expand Up @@ -384,7 +384,7 @@ def build_item_jats(self, response):

if "pdf_links" in response.meta:
record.add_value(
"additional_files",
"documents",
self._create_file(
get_first(response.meta["pdf_links"]),
"INSPIRE-PUBLIC",
Expand Down
5 changes: 4 additions & 1 deletion hepcrawl/spiders/elsevier_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,7 +995,10 @@ def build_item(self, response):

xml_file = response.meta.get("xml_url")
if xml_file:
record.add_value('additional_files', self.add_file(xml_file, "HIDDEN", "Fulltext"))
record.add_value(
'documents',
self.add_file(xml_file, "HIDDEN", "Fulltext"),
)
sd_url = self._get_sd_url(xml_file)
if requests.head(sd_url).status_code == 200: # Test if valid url
record.add_value("urls", sd_url)
Expand Down
12 changes: 6 additions & 6 deletions hepcrawl/spiders/hindawi_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,13 +154,13 @@ def get_journal_pages(node):
else:
return journal_pages, ''

def create_file(self, file_path, file_access, file_type):
"""Create a structured dictionary to add to 'files' item."""
def create_document(self, file_path):
"""Create a structured dictionary to add to 'documents' item."""
file_dict = {
"access": file_access,
"hidden": True,
"description": self.name.upper(),
"url": file_path,
"type": file_type,
"fulltext": True,
}
return file_dict

Expand Down Expand Up @@ -219,9 +219,9 @@ def parse_node(self, response, node):
record.add_value('file_urls', pdf_links)
if xml_links:
record.add_value(
'additional_files',
'documents',
[
self.create_file(xml, "INSPIRE-HIDDEN", "Fulltext")
self.create_document(xml)
for xml in xml_links
]
)
Expand Down
5 changes: 4 additions & 1 deletion hepcrawl/spiders/infn_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,10 @@ def build_item(self, response):

pdf_files = response.meta.get("pdf_links")
if pdf_files:
record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext"))
record.add_value(
'documents',
self.add_file(pdf_files, "HIDDEN", "Fulltext"),
)
record.add_value('authors', response.meta.get("authors"))
record.add_value('date_published', response.meta.get("date_published"))
record.add_value('thesis', response.meta.get("thesis_info"))
Expand Down
26 changes: 15 additions & 11 deletions hepcrawl/spiders/iop_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,13 @@ def get_pdf_path(self, vol, issue, fpage):
if pattern in pdf_path:
return os.path.join(self.pdf_files, pdf_path)

def add_file(self, file_path, file_access, file_type):
def add_document(self, file_path, hidden, fulltext):
"""Create a structured dictionary to add to the 'documents' item."""
file_dict = {
"access": file_access,
"hidden": hidden,
"fulltext": fulltext,
"description": self.name.upper(),
"url": file_path,
"type": file_type,
}
return file_dict

Expand Down Expand Up @@ -206,21 +206,25 @@ def parse_node(self, response, node):
record.add_value('collections', self.get_collections(doctype))

xml_file_path = response.url
record.add_value("additional_files",
self.add_file(xml_file_path, "INSPIRE-HIDDEN", "Fulltext"))
record.add_value(
"documents",
self.add_document(xml_file_path, hidden=True, fulltext=True),
)
if self.pdf_files:
pdf_file_path = self.get_pdf_path(volume, issue, fpage)
if pdf_file_path:
if doctype and "erratum" in doctype.lower():
file_type = "Erratum"
fulltext = False
else:
file_type = "Fulltext"
fulltext = True
if journal_title in self.OPEN_ACCESS_JOURNALS:
file_access = "INSPIRE-PUBLIC" # FIXME: right?
hidden = False
else:
file_access = "INSPIRE-HIDDEN"
record.add_value("additional_files",
self.add_file(pdf_file_path, file_access, file_type))
hidden = True
record.add_value(
"documents",
self.add_document(pdf_file_path, hidden=hidden, fulltext=fulltext),
)

parsed_item = ParsedItem(
record=record.load_item(),
Expand Down
2 changes: 1 addition & 1 deletion hepcrawl/spiders/magic_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def build_item(self, response):
record.add_value('title', response.meta.get("title"))
record.add_value('urls', response.meta.get("urls"))
record.add_value("abstract", response.meta.get("abstract"))
record.add_value("additional_files", response.meta.get("files"))
record.add_value("documents", response.meta.get("files"))
record.add_value('collections', ['HEP', 'THESIS'])

parsed_item = ParsedItem(
Expand Down
6 changes: 4 additions & 2 deletions hepcrawl/spiders/mit_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,10 @@ def build_item(self, response):

pdf_files = node.xpath(".//table[@id='file-table']//td/a/@href").extract()
if pdf_files:
record.add_value('additional_files', self.add_file(
pdf_files, "HIDDEN", "Fulltext"))
record.add_value(
'documents',
self.add_file(pdf_files, "HIDDEN", "Fulltext"),
)
record.add_value('authors', self.get_authors(node))
record.add_xpath('date_published',
"//td[contains(text(), 'dc.date.issued')]/following-sibling::td[1]/text()")
Expand Down
5 changes: 4 additions & 1 deletion hepcrawl/spiders/phenix_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,10 @@ def parse_node(self, response, node):
return None

pdf_files = node.xpath(".//a/@href").extract()
record.add_value('additional_files', self.add_file(pdf_files, "HIDDEN", "Fulltext"))
record.add_value(
'documents',
self.add_file(pdf_files, "HIDDEN", "Fulltext"),
)
record.add_value('authors', self.get_authors(node))
record.add_value('date_published', year)
record.add_value('thesis', {'degree_type': thesis_type})
Expand Down
Loading

0 comments on commit 4f33b74

Please sign in to comment.