
Commit

Merge pull request inspirehep#215 from david-caro/fetch_all_sets_together

Avoid duplicated records from cross-set fetching
david-caro authored Jan 23, 2018
2 parents 940f8d0 + cb85cc8 commit 25a441b
Showing 7 changed files with 365 additions and 275 deletions.
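
The core of the change: the OAI-PMH spider now remembers the identifier of every record it has already harvested, so a record listed under several OAI sets is parsed only once. Below is a minimal, self-contained sketch of the pattern; the Record/Header stand-ins and the fetch_records helper are hypothetical, and only the deduplication logic mirrors the actual change.

# Minimal sketch of cross-set deduplication (hypothetical stand-ins for
# sickle records; only the dedup logic mirrors the real spider).
from collections import namedtuple

Header = namedtuple('Header', ['identifier'])
Record = namedtuple('Record', ['header', 'raw'])


def fetch_records(oai_set):
    # Hypothetical helper standing in for one per-set OAI-PMH harvest;
    # both sets share 'oai:1' to simulate cross-set overlap.
    common = Record(Header('oai:1'), '<record>1</record>')
    extra = Record(Header('oai:' + oai_set), '<record>2</record>')
    return [common, extra]


class DedupHarvester(object):
    def __init__(self, sets):
        self.sets = sets
        self._crawled_records = {}  # identifier -> record, shared across sets

    def get_record_identifier(self, record):
        # The OAI header identifier is stable across sets, so it detects
        # records that appear in more than one set.
        return record.header.identifier

    def harvest(self):
        for oai_set in self.sets:
            for record in fetch_records(oai_set):
                identifier = self.get_record_identifier(record)
                if identifier in self._crawled_records:
                    continue  # already harvested from an earlier set
                self._crawled_records[identifier] = record
                yield record


print([r.header.identifier for r in DedupHarvester(['a', 'b']).harvest()])
# -> ['oai:1', 'oai:a', 'oai:b']  ('oai:1' is yielded only once)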
4 changes: 4 additions & 0 deletions .travis.yml
@@ -56,6 +56,10 @@ after_success:
   - sed -i 's@\"/code/@'"\"$(pwd)/"'@g' .coverage
   - coveralls
 
+after_failure:
+  - docker-compose -f docker-compose.test.yml logs --tail=200
+  - bash -c 'for log in logs/*/*; do echo $log; cat $log; done'
+
 notifications:
   email: false

4 changes: 4 additions & 0 deletions hepcrawl/spiders/arxiv_spider.py
@@ -122,6 +122,10 @@ def parse_record(self, selector):
 
         return parsed_item
 
+    def get_record_identifier(self, record):
+        """Extracts a unique identifier from a sickle record."""
+        return record.header.identifier
+
     def _get_authors_or_collaboration(self, node):
         """Parse authors, affiliations; extract collaboration"""
         author_selectors = node.xpath('.//authors//author')
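
For the arXiv spider the new hook simply returns the OAI header identifier, which is unique per paper and identical regardless of which set the record was listed under. A hedged usage sketch, not from the repository, reading the same field directly with sickle; the endpoint URL, metadata prefix, and set name are illustrative, and the snippet needs network access:

from sickle import Sickle

# Hedged example: arXiv's public OAI-PMH endpoint and an example set.
sickle = Sickle('http://export.arxiv.org/oai2')
records = sickle.ListRecords(metadataPrefix='arXiv', set='physics:hep-th')
record = next(iter(records))
print(record.header.identifier)  # e.g. 'oai:arXiv.org:1801.07224'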
31 changes: 29 additions & 2 deletions hepcrawl/spiders/common/oaipmh_spider.py
@@ -65,6 +65,7 @@ def __init__(
         self.sets = sets
         self.from_date = from_date
         self.until_date = until_date
+        self._crawled_records = {}
 
     def start_requests(self):
         started_at = datetime.utcnow()
@@ -116,19 +117,33 @@ def start_requests(self):
             )
         )
 
-        LOGGER.info("Harvesting completed.")
+        LOGGER.info(
+            "Harvesting completed, harvested %s records.",
+            len(self._crawled_records),
+        )
 
     @abc.abstractmethod
     def parse_record(self, record):
         """
-        This method need to be reimplemented in order to provide special
+        This method needs to be reimplemented in order to provide special
         parsing.
 
         Args:
             record (scrapy.selector.Selector): selector on the parsed record
         """
         raise NotImplementedError()
 
+    @abc.abstractmethod
+    def get_record_identifier(self, record):
+        """
+        This method needs to be reimplemented in order to extract a unique
+        identifier from the record to avoid cross-set reharvesting.
+
+        Args:
+            record (sickle.models.Record): sickle record response
+        """
+        raise NotImplementedError()
+
     def parse(self, response):
         sickle = Sickle(self.url)
         params = {
@@ -153,6 +168,18 @@ def parse(self, response):
             params,
         )
         for record in records:
+            rec_identifier = self.get_record_identifier(record)
+            if rec_identifier in self._crawled_records:
+                # avoid cross-set repeated records
+                LOGGER.info('Skipping duplicated record %s', rec_identifier)
+                continue
+
+            LOGGER.debug(
+                'Not skipping non-duplicated record %s',
+                rec_identifier,
+            )
+
+            self._crawled_records[rec_identifier] = record
             response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
             selector = Selector(response, type='xml')
             yield self.parse_record(selector)
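
Taken together, the two abstract hooks define the contract a concrete spider must satisfy. A hedged sketch of a minimal subclass; MySpider and its XPath are illustrative and not taken from the repository, only the two overridden methods correspond to the hooks above:

from hepcrawl.spiders.common.oaipmh_spider import OAIPMHSpider


class MySpider(OAIPMHSpider):
    name = 'my_oaipmh_source'  # hypothetical spider name

    def parse_record(self, selector):
        # selector is a scrapy Selector over one record's XML.
        return {'title': selector.xpath('.//title/text()').extract_first()}

    def get_record_identifier(self, record):
        # record is a sickle.models.Record; its header identifier is the
        # key stored in _crawled_records to catch cross-set duplicates.
        return record.header.identifier

Design note: _crawled_records is a dict that stores the record under its identifier even though parse only checks membership; a plain set of identifiers would suffice for the dedup check, so keeping the full record presumably leaves it available for later use.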
