
WIP
Signed-off-by: Spiros Delviniotis <[email protected]>
spirosdelviniotis committed Aug 2, 2017
1 parent 2a4c3ed commit abcb28c
Showing 50 changed files with 918 additions and 293 deletions.
138 changes: 121 additions & 17 deletions hepcrawl/crawler2hep.py
@@ -15,42 +15,146 @@

from __future__ import absolute_import, division, print_function

+import os
+import datetime

from inspire_schemas.api import LiteratureBuilder

from hepcrawl.utils import get_file_name_from_url


-def _update_record_fft_links(record, map_fft_file_paths):
-    def _list_new_fft_links(old_fft_links, map_fft_file_paths):
-        new_fft_links = []
-        for fft_link in old_fft_links:
-            file_name = get_file_name_from_url(fft_link['path'])
-            if file_name in map_fft_file_paths:
-                new_fft_links.append(
-                    {
-                        'path': map_fft_file_paths[file_name],
-                    }
-                )
+def _update_record_fft(record, index_fft_file_paths):
+    def _update_fft_fields(fft_fields, index_fft_file_paths):
+        new_fft_fields = []
+        for fft_field in fft_fields:
+            file_name = get_file_name_from_url(fft_field['path'])
+            if file_name in index_fft_file_paths:
+                fft_field['path'] = index_fft_file_paths[file_name]
+                new_fft_fields.append(fft_field)

-        return new_fft_links
+        return new_fft_fields

-    old_fft_links = record['_fft']
-    record['_fft'] = _list_new_fft_links(old_fft_links, map_fft_file_paths)
+    record['_fft'] = _update_fft_fields(record['_fft'], index_fft_file_paths)
    return record
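
A quick illustration of the new helper's behavior (hypothetical values; this assumes get_file_name_from_url maps a URL to its trailing file name):

    record = {'_fft': [{'path': 'http://example.org/files/fulltext.pdf'}]}
    index_fft_file_paths = {'fulltext.pdf': '/data/full/fulltext.pdf'}
    record = _update_record_fft(record, index_fft_file_paths)
    # record['_fft'] == [{'path': '/data/full/fulltext.pdf'}]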


-def to_hep(item, item_format='hepcrawl', fft_file_paths=None):
+def _has_publication_info(item):
+    """If any publication info."""
+    return item.get('pubinfo_freetext') or item.get('journal_volume') or \
+        item.get('journal_title') or \
+        item.get('journal_year') or \
+        item.get('journal_issue') or \
+        item.get('journal_fpage') or \
+        item.get('journal_lpage') or \
+        item.get('journal_artid') or \
+        item.get('journal_doctype')


+def _filter_fields(item, keys):
+    """Filter away keys."""
+    for key in keys:
+        item.pop(key, None)


+def _normalize_hepcrawl_record(item, source):
+    if 'related_article_doi' in item:
+        item['dois'] += item.pop('related_article_doi', [])

+    item['titles'] = [{
+        'title': item.pop('title', ''),
+        'subtitle': item.pop('subtitle', ''),
+        'source': source,
+    }]

+    item['abstracts'] = [{
+        'value': item.pop('abstract', ''),
+        'source': source,
+    }]

+    item['imprints'] = [{
+        'date': item.pop('date_published', ''),
+    }]

+    item['copyright'] = [{
+        'holder': item.pop('copyright_holder', ''),
+        'year': item.pop('copyright_year', ''),
+        'statement': item.pop('copyright_statement', ''),
+        'material': item.pop('copyright_material', ''),
+    }]

+    if _has_publication_info(item):
+        item['publication_info'] = [{
+            'journal_title': item.pop('journal_title', ''),
+            'journal_volume': item.pop('journal_volume', ''),
+            'journal_issue': item.pop('journal_issue', ''),
+            'artid': item.pop('journal_artid', ''),
+            'page_start': item.pop('journal_fpage', ''),
+            'page_end': item.pop('journal_lpage', ''),
+            'note': item.pop('journal_doctype', ''),
+            'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
+            'pubinfo_material': item.pop('pubinfo_material', ''),
+        }]
+        if item.get('journal_year'):
+            item['publication_info'][0]['year'] = int(
+                item.pop('journal_year')
+            )

+    # Remove any fields
+    _filter_fields(item, [
+        'journal_title',
+        'journal_volume',
+        'journal_year',
+        'journal_issue',
+        'journal_fpage',
+        'journal_lpage',
+        'journal_doctype',
+        'journal_artid',
+        'pubinfo_freetext',
+        'pubinfo_material',
+    ])

+    return item


+def _generate_acquisition_source(crawler_record, source):
+    crawler_record['acquisition_source'] = {
+        'source': source,
+        'method': 'hepcrawl',
+        'datetime': datetime.datetime.now().isoformat(),
+        'submission_number': os.environ.get('SCRAPY_JOB', ''),
+    }
+    return crawler_record


+def to_hep(
+    item,
+    source,
+    item_format='hepcrawl',
+    fft_file_paths=None,
+):
+    item = _generate_acquisition_source(
+        crawler_record=item,
+        source=source,
+    )

    if item_format == 'hep':
-        return hep2hep(item, fft_file_paths)
+        return hep2hep(
+            crawler_record=item,
+            fft_file_paths=fft_file_paths,
+        )
    elif item_format == 'hepcrawl':
+        item = _normalize_hepcrawl_record(
+            item=item,
+            source=source,
+        )
        return crawler2hep(dict(item))
    else:
        raise Exception('Unknown item_format::{}'.format(item_format))


def hep2hep(crawler_record, fft_file_paths):
    if fft_file_paths:
-        crawler_record = _update_record_fft_links(crawler_record, fft_file_paths)
+        crawler_record = _update_record_fft(crawler_record, fft_file_paths)

    return crawler_record

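For orientation, a sketch of how the reworked to_hep is now called (the spider name and item are illustrative; the signature and behavior come from the diff above):

    # A 'hepcrawl'-format item is normalized and converted via crawler2hep;
    # a 'hep'-format item goes through hep2hep, which only rewrites FFT paths.
    record = to_hep(
        item=spider_item,        # hypothetical raw item yielded by a spider
        source='arxiv',          # spider name, stored in acquisition_source
        item_format='hepcrawl',
        fft_file_paths=None,     # file-name-to-local-path index, when FFTs were fetched
    )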
107 changes: 7 additions & 100 deletions hepcrawl/pipelines.py
@@ -15,7 +15,6 @@

from __future__ import absolute_import, division, print_function

-import datetime
import os

import requests
@@ -30,24 +29,6 @@
from hepcrawl.utils import get_file_name_from_url


-def has_publication_info(item):
-    """If any publication info."""
-    return item.get('pubinfo_freetext') or item.get('journal_volume') or \
-        item.get('journal_title') or \
-        item.get('journal_year') or \
-        item.get('journal_issue') or \
-        item.get('journal_fpage') or \
-        item.get('journal_lpage') or \
-        item.get('journal_artid') or \
-        item.get('journal_doctype')
-
-
-def filter_fields(item, keys):
-    """Filter away keys."""
-    for key in keys:
-        item.pop(key, None)


class FftFilesPipeline(FilesPipeline):
    """Download all the FFT files provided by record."""

@@ -57,10 +38,10 @@ def __init__(self, *args, **kwargs):
    def get_media_requests(self, item, info):
        """Download FFT files using FTP."""
        if item.get('file_urls'):
-            for fft_url in item.get('file_urls'):
+            for fft_url in item.file_urls:
                yield Request(
                    url=fft_url,
-                    meta=item['ftp_params'],
+                    meta=item.ftp_params,
                )

    def item_completed(self, results, item, info):
@@ -80,7 +61,7 @@ def _get_absolute_local_file_path(path):
                    get_file_name_from_url(result_data['url'])
                ] = _get_absolute_local_file_path(result_data['path'])

-        item['file_paths'] = map_file_names_paths
+        item.file_paths = map_file_names_paths

        return item

@@ -95,92 +76,18 @@ def open_spider(self, spider):
        self.results_data = []

    def _post_enhance_item(self, item, spider):
-        def _normalize_hepcrawl_record(item, source):
-            if 'related_article_doi' in item:
-                item['dois'] += item.pop('related_article_doi', [])
-
-            item['titles'] = [{
-                'title': item.pop('title', ''),
-                'subtitle': item.pop('subtitle', ''),
-                'source': source,
-            }]
-
-            item['abstracts'] = [{
-                'value': item.pop('abstract', ''),
-                'source': source,
-            }]
-
-            item['imprints'] = [{
-                'date': item.pop('date_published', ''),
-            }]
-
-            item['copyright'] = [{
-                'holder': item.pop('copyright_holder', ''),
-                'year': item.pop('copyright_year', ''),
-                'statement': item.pop('copyright_statement', ''),
-                'material': item.pop('copyright_material', ''),
-            }]
-
-            if has_publication_info(item):
-                item['publication_info'] = [{
-                    'journal_title': item.pop('journal_title', ''),
-                    'journal_volume': item.pop('journal_volume', ''),
-                    'journal_issue': item.pop('journal_issue', ''),
-                    'artid': item.pop('journal_artid', ''),
-                    'page_start': item.pop('journal_fpage', ''),
-                    'page_end': item.pop('journal_lpage', ''),
-                    'note': item.pop('journal_doctype', ''),
-                    'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
-                    'pubinfo_material': item.pop('pubinfo_material', ''),
-                }]
-                if item.get('journal_year'):
-                    item['publication_info'][0]['year'] = int(
-                        item.pop('journal_year')
-                    )
-
-            # Remove any fields
-            filter_fields(item, [
-                'journal_title',
-                'journal_volume',
-                'journal_year',
-                'journal_issue',
-                'journal_fpage',
-                'journal_lpage',
-                'journal_doctype',
-                'journal_artid',
-                'pubinfo_freetext',
-                'pubinfo_material',
-            ])
-
-            return item
-
-        fft_file_paths = item.get('file_paths')
-        item_format = item.get('format', 'hepcrawl')
-        item = item.get('record_item') if item.get('record_item') else item
-        item = self._generate_record_meta(item, spider)
+        fft_file_paths = item.file_paths
+        item_format = item.item_format
+        item = item.item if item.item else item
        source = spider.name

-        if item_format != 'hep':
-            item = _normalize_hepcrawl_record(
-                item=item,
-                source=source,
-            )
-
        return to_hep(
            item=item,
+            source=source,
            item_format=item_format,
            fft_file_paths=fft_file_paths,
        )
-
-    def _generate_record_meta(self, json_record, spider):
-        json_record['acquisition_source'] = {
-            'source': spider.name,
-            'method': 'hepcrawl',
-            'datetime': datetime.datetime.now().isoformat(),
-            'submission_number': os.environ.get('SCRAPY_JOB', ''),
-        }
-        return json_record

    def process_item(self, item, spider):
        """Convert internal format to INSPIRE data model."""
        self.count += 1
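Note: the pipeline code above reads item.file_urls, item.ftp_params, item.file_paths, item.item_format, and item.item from the new ParsedItem container in hepcrawl/utils.py, which this excerpt does not show. A minimal sketch consistent with those accesses could be (an assumption, not the actual implementation):

    class ParsedItem(dict):
        """Hypothetical reconstruction of the ParsedItem used above."""

        def __init__(self, item, item_format, **kwargs):
            super(ParsedItem, self).__init__(
                item=item, item_format=item_format, **kwargs
            )

        def __getattr__(self, key):
            # Dict entries double as attributes; missing ones (e.g. file_paths
            # before any download has happened) come back as None.
            return self.get(key)

        def __setattr__(self, key, value):
            # Attribute assignment (item.file_paths = ...) stores into the dict.
            self[key] = value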
12 changes: 10 additions & 2 deletions hepcrawl/spiders/alpha_spider.py
@@ -20,7 +20,10 @@

from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import has_numbers
+from hepcrawl.utils import (
+    has_numbers,
+    ParsedItem,
+)


class AlphaSpider(CrawlSpider):
@@ -145,4 +148,9 @@ def parse(self, response):
            record.add_value('source', 'Alpha experiment')
            record.add_value('collections', ['HEP', 'THESIS'])

-            yield record.load_item()
+            parsed_item = ParsedItem(
+                item=record.load_item(),
+                item_format='hepcrawl',
+            )
+
+            yield parsed_item
15 changes: 13 additions & 2 deletions hepcrawl/spiders/aps_spider.py
@@ -20,7 +20,12 @@

from ..items import HEPRecord
from ..loaders import HEPLoader
-from ..utils import get_licenses, get_nested, build_dict
+from hepcrawl.utils import (
+    get_licenses,
+    get_nested,
+    build_dict,
+    ParsedItem,
+)


class APSSpider(Spider):
@@ -110,7 +115,13 @@ def parse(self, response):
            record.add_value('license', license)

            record.add_value('collections', ['HEP', 'Citeable', 'Published'])
-            yield record.load_item()
+
+            parsed_item = ParsedItem(
+                item=record.load_item(),
+                item_format='hepcrawl',
+            )
+
+            yield parsed_item

        # Pagination support. Will yield until no more "next" pages are found
        if 'Link' in response.headers:
15 changes: 12 additions & 3 deletions hepcrawl/spiders/arxiv_spider.py
@@ -17,7 +17,12 @@
from scrapy.spiders import XMLFeedSpider

from ..mappings import CONFERENCE_WORDS, THESIS_WORDS
-from ..utils import coll_cleanforthe, get_licenses, split_fullname
+from hepcrawl.utils import (
+    coll_cleanforthe,
+    get_licenses,
+    split_fullname,
+    ParsedItem,
+)
from ..items import HEPRecord
from ..loaders import HEPLoader

@@ -110,8 +115,12 @@ def parse_node(self, response, node):
        )
        record.add_value('license', license)

-        parsed_record = dict(record.load_item())
-        return parsed_record
+        parsed_item = ParsedItem(
+            item=record.load_item(),
+            item_format='hepcrawl',
+        )
+
+        return parsed_item

    def _get_authors_or_collaboration(self, node):
        """Parse authors, affiliations; extract collaboration"""
