Skip to content

Commit

Permalink
global: use standard ParsedItem in the spiders
Browse files (browse the repository at this point in the history)
Signed-off-by: David Caro <[email protected]>
  • Loading branch information
david-caro committed Aug 22, 2017
1 parent 01ff0d8 commit e4b9f43
Show file tree
Hide file tree
Showing 22 changed files with 260 additions and 61 deletions.
6 changes: 3 additions & 3 deletions hepcrawl/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@

from inspire_schemas.utils import validate

from hepcrawl.tohep import item_to_hep
from hepcrawl.settings import FILES_STORE
from hepcrawl.utils import RecordFile
from .tohep import item_to_hep
from .settings import FILES_STORE
from .utils import RecordFile


class FftFilesPipeline(FilesPipeline):
Expand Down
12 changes: 10 additions & 2 deletions hepcrawl/spiders/alpha_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import has_numbers
from ..utils import (
has_numbers,
ParsedItem,
)


class AlphaSpider(CrawlSpider):
Expand Down Expand Up @@ -145,4 +148,9 @@ def parse(self, response):
record.add_value('source', 'Alpha experiment')
record.add_value('collections', ['HEP', 'THESIS'])

yield record.load_item()
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

yield parsed_item
15 changes: 13 additions & 2 deletions hepcrawl/spiders/aps_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import get_licenses, get_nested, build_dict
from ..utils import (
get_licenses,
get_nested,
build_dict,
ParsedItem,
)


class APSSpider(Spider):
Expand Down Expand Up @@ -110,7 +115,13 @@ def parse(self, response):
record.add_value('license', license)

record.add_value('collections', ['HEP', 'Citeable', 'Published'])
yield record.load_item()

parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

yield parsed_item

# Pagination support. Will yield until no more "next" pages are found
if 'Link' in response.headers:
Expand Down
21 changes: 16 additions & 5 deletions hepcrawl/spiders/arxiv_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,15 @@
from scrapy import Request, Selector
from scrapy.spiders import XMLFeedSpider

from ..mappings import CONFERENCE_WORDS, THESIS_WORDS
from ..utils import coll_cleanforthe, get_licenses, split_fullname
from ..items import HEPRecord
from ..loaders import HEPLoader
from ..mappings import CONFERENCE_WORDS, THESIS_WORDS
from ..utils import (
coll_cleanforthe,
get_licenses,
split_fullname,
ParsedItem,
)

RE_CONFERENCE = re.compile(r'\b(%s)\b' % '|'.join(
[re.escape(word) for word in CONFERENCE_WORDS]), re.I | re.U)
Expand All @@ -33,7 +38,9 @@ class ArxivSpider(XMLFeedSpider):
Example:
Using OAI-PMH XML files::
$ scrapy crawl arXiv -a source_file=file://`pwd`/tests/responses/arxiv/sample_arxiv_record.xml
$ scrapy crawl \\
arXiv \\
-a "source_file=file://$PWD/tests/responses/arxiv/sample_arxiv_record.xml"
"""

Expand Down Expand Up @@ -110,8 +117,12 @@ def parse_node(self, response, node):
)
record.add_value('license', license)

parsed_record = dict(record.load_item())
return parsed_record
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

return parsed_item

def _get_authors_or_collaboration(self, node):
"""Parse authors, affiliations; extract collaboration"""
Expand Down
15 changes: 13 additions & 2 deletions hepcrawl/spiders/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import get_mime_type, parse_domain, get_node
from ..utils import (
get_mime_type,
parse_domain,
get_node,
ParsedItem,
)


class BaseSpider(XMLFeedSpider):
Expand Down Expand Up @@ -192,7 +197,13 @@ def build_item(self, response):
record.add_value("authors", self.get_authors(node))
record.add_value('thesis', {'degree_type': 'PhD'})
record.add_value('collections', ['HEP', 'THESIS'])
return record.load_item()

parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

return parsed_item

def scrape_for_pdf(self, response):
"""Scrape splash page for any links to PDFs.
Expand Down
14 changes: 12 additions & 2 deletions hepcrawl/spiders/brown_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,12 @@

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import split_fullname, parse_domain, get_mime_type
from ..utils import (
split_fullname,
parse_domain,
get_mime_type,
ParsedItem,
)


class BrownSpider(CrawlSpider):
Expand Down Expand Up @@ -219,4 +224,9 @@ def build_item(self, response):
record.add_value('thesis', response.meta.get("thesis"))
record.add_value('collections', ['HEP', 'THESIS'])

return record.load_item()
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

return parsed_item
2 changes: 1 addition & 1 deletion hepcrawl/spiders/desy_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

from inspire_dojson.hep import hep

from hepcrawl.utils import (
from ..utils import (
ftp_list_files,
ftp_connection_info,
ParsedItem,
Expand Down
15 changes: 13 additions & 2 deletions hepcrawl/spiders/dnb_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import get_mime_type, parse_domain, get_node
from ..utils import (
get_mime_type,
parse_domain,
get_node,
ParsedItem,
)


class DNBSpider(XMLFeedSpider):
Expand Down Expand Up @@ -219,4 +224,10 @@ def build_item(self, response):

record.add_value('thesis', {'degree_type': 'PhD'})
record.add_value('collections', ['HEP', 'THESIS'])
return record.load_item()

parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

return parsed_item
29 changes: 20 additions & 9 deletions hepcrawl/spiders/edp_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
get_licenses,
get_node,
parse_domain,
ParsedItem,
)


Expand Down Expand Up @@ -65,11 +66,11 @@ class EDPSpider(Jats, XMLFeedSpider):
To run an ``EDPSpider`` using ``rich`` format::
$ scrapy crawl EDP -a package_path=file://`pwd`/tests/responses/edp/test_rich.tar.bz2
$ scrapy crawl EDP -a source_folder=file://`pwd`/tests/responses/edp/test_rich.tar.bz2
To run an ``EDPSpider`` using ``gz`` format::
$ scrapy crawl EDP -a package_path=file://`pwd`/tests/responses/edp/test_gz.tar.gz
$ scrapy crawl EDP -a source_folder=file://`pwd`/tests/responses/edp/test_gz.tar.gz
Todo:
Expand Down Expand Up @@ -144,9 +145,9 @@ def start_requests(self):
ftp_host, ftp_params = ftp_connection_info(
self.ftp_host, self.ftp_netrc)
_, new_files = ftp_list_files(
self.ftp_folder,
self.target_folder,
server=ftp_host,
server_folder=self.ftp_folder,
destination_folder=self.target_folder,
ftp_host=ftp_host,
user=ftp_params['ftp_user'],
password=ftp_params['ftp_password']
)
Expand Down Expand Up @@ -175,7 +176,7 @@ def handle_package_ftp(self, response):
for xml_file in xml_files:
yield Request(
"file://{0}".format(xml_file),
meta={"package_path": zip_filepath}
meta={"source_folder": zip_filepath}
)

def handle_package_file(self, response):
Expand All @@ -188,7 +189,7 @@ def handle_package_file(self, response):
for xml_file in xml_files:
request = Request(
"file://{0}".format(xml_file),
meta={"package_path": zip_filepath}
meta={"source_folder": zip_filepath}
)
if "xml_rich" in xml_file:
request.meta["rich"] = True
Expand Down Expand Up @@ -318,7 +319,12 @@ def build_item_rich(self, response):
)
record.add_value("urls", response.meta.get("urls"))

return record.load_item()
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

return parsed_item

def build_item_jats(self, response):
"""Build the final HEPRecord with JATS-format XML ('jp')."""
Expand Down Expand Up @@ -388,7 +394,12 @@ def build_item_jats(self, response):
references = self._get_references(node)
record.add_value("references", references)

return record.load_item()
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

return parsed_item

def _get_references(self, node):
"""Get the references."""
Expand Down
10 changes: 8 additions & 2 deletions hepcrawl/spiders/elsevier_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
has_numbers,
range_as_string,
unzip_xml_files,
ParsedItem,
)

from ..dateutils import format_year
Expand Down Expand Up @@ -180,7 +181,7 @@ def handle_package(self, response):
xml_url = u"file://{0}".format(os.path.abspath(xml_file))
yield Request(
xml_url,
meta={"package_path": zip_filepath,
meta={"source_folder": zip_filepath,
"xml_url": xml_url},
)

Expand Down Expand Up @@ -1034,4 +1035,9 @@ def build_item(self, response):
record.add_value('collections', self.get_collections(doctype))
record.add_value('references', self.get_references(node))

return record.load_item()
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

return parsed_item
12 changes: 10 additions & 2 deletions hepcrawl/spiders/hindawi_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import get_licenses
from ..utils import (
get_licenses,
ParsedItem,
)


class HindawiSpider(XMLFeedSpider):
Expand Down Expand Up @@ -222,4 +225,9 @@ def parse_node(self, response, node):
record.add_xpath('source',
"./datafield[@tag='260']/subfield[@code='b']/text()")

return record.load_item()
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

return parsed_item
13 changes: 10 additions & 3 deletions hepcrawl/spiders/infn_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import get_temporary_file

from ..utils import (
get_temporary_file,
ParsedItem,
)
from ..dateutils import format_date


Expand Down Expand Up @@ -240,4 +242,9 @@ def build_item(self, response):
record.add_value('source', 'INFN')
record.add_value('collections', ['HEP', 'THESIS'])

return record.load_item()
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

return parsed_item
8 changes: 7 additions & 1 deletion hepcrawl/spiders/iop_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import ParsedItem


class IOPSpider(XMLFeedSpider, NLM):
Expand Down Expand Up @@ -222,4 +223,9 @@ def parse_node(self, response, node):
record.add_value("additional_files",
self.add_fft_file(pdf_file_path, file_access, file_type))

return record.load_item()
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

return parsed_item
12 changes: 10 additions & 2 deletions hepcrawl/spiders/magic_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@

from ..items import HEPRecord
from ..loaders import HEPLoader
from ..utils import split_fullname
from ..utils import (
split_fullname,
ParsedItem,
)


class MagicSpider(XMLFeedSpider):
Expand Down Expand Up @@ -176,4 +179,9 @@ def build_item(self, response):
record.add_value("additional_files", response.meta.get("files"))
record.add_value('collections', ['HEP', 'THESIS'])

yield record.load_item()
parsed_item = ParsedItem(
record=record.load_item(),
record_format='hepcrawl',
)

yield parsed_item
Loading

0 comments on commit e4b9f43

Please sign in to comment.