Skip to content

Commit

Permalink
arxiv: avoid duplicated records (cross-set)
Browse files Browse the repository at this point in the history
Signed-off-by: David Caro <[email protected]>
  • Loading branch information
david-caro committed Jan 23, 2018
1 parent e115cbf commit 2a27e1f
Show file tree
Hide file tree
Showing 5 changed files with 357 additions and 273 deletions.
4 changes: 4 additions & 0 deletions hepcrawl/spiders/arxiv_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@ def parse_record(self, selector):

return parsed_item

def get_record_identifier(self, record):
    """Return the unique identifier carried by a sickle record.

    Args:
        record: sickle record; its ``header.identifier`` attribute is read.

    Returns:
        The value of ``record.header.identifier``.
    """
    header = record.header
    return header.identifier

def _get_authors_or_collaboration(self, node):
"""Parse authors, affiliations; extract collaboration"""
author_selectors = node.xpath('.//authors//author')
Expand Down
31 changes: 29 additions & 2 deletions hepcrawl/spiders/common/oaipmh_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def __init__(
self.sets = sets
self.from_date = from_date
self.until_date = until_date
self._crawled_records = {}

def start_requests(self):
started_at = datetime.utcnow()
Expand Down Expand Up @@ -116,19 +117,33 @@ def start_requests(self):
)
)

LOGGER.info("Harvesting completed.")
LOGGER.info(
"Harvesting completed, harvested %s records.",
len(self._crawled_records),
)

@abc.abstractmethod
def parse_record(self, record):
    """Parse a single record.

    This method needs to be reimplemented by subclasses in order to
    provide special parsing.

    Args:
        record (scrapy.selector.Selector): selector on the parsed record

    Raises:
        NotImplementedError: always, when the base implementation is called.
    """
    raise NotImplementedError

@abc.abstractmethod
def get_record_identifier(self, record):
    """Extract a unique identifier from the record.

    This method needs to be reimplemented by subclasses; the identifier
    is used to avoid cross-set reharvesting.

    Args:
        record (sickle.models.Record): sickle record response

    Raises:
        NotImplementedError: always, when the base implementation is called.
    """
    raise NotImplementedError

def parse(self, response):
sickle = Sickle(self.url)
params = {
Expand All @@ -153,6 +168,18 @@ def parse(self, response):
params,
)
for record in records:
rec_identifier = self.get_record_identifier(record)
if rec_identifier in self._crawled_records:
# avoid cross-set repeated records
LOGGER.info('Skipping duplicated record %s', rec_identifier)
continue

LOGGER.debug(
'Not skipping non-duplicated record %s',
rec_identifier,
)

self._crawled_records[rec_identifier] = record
response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
selector = Selector(response, type='xml')
yield self.parse_record(selector)
Expand Down
Loading

0 comments on commit 2a27e1f

Please sign in to comment.