WIP for desy spider
Signed-off-by: Spiros Delviniotis <[email protected]>
spirosdelviniotis committed Aug 2, 2017
1 parent fa74f59 commit 840f18a
Showing 64 changed files with 2,631 additions and 250 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -28,6 +28,7 @@ env:
- SUITE=unit
- SUITE=functional_wsp
- SUITE=functional_arxiv
- SUITE=functional_desy

matrix:
fast_finish: true
11 changes: 11 additions & 0 deletions docker-compose.test.yml
@@ -17,6 +17,7 @@ services:
- APP_CELERY_RESULT_BACKEND=amqp://guest:guest@rabbitmq:5672//
- APP_CRAWLER_HOST_URL=http://scrapyd:6800
- APP_API_PIPELINE_TASK_ENDPOINT_DEFAULT=hepcrawl.testlib.tasks.submit_results
- APP_FILES_STORE=/tmp/file_urls
- COVERAGE_PROCESS_START=/code/.coveragerc
- BASE_USER_UID=${BASE_USER_UID:-1000}
- BASE_USER_GIT=${BASE_USER_GIT:-1000}
@@ -26,6 +27,7 @@ services:
- ${PWD}:/code/
- ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf
- /tmp/WSP:/tmp/WSP
- /tmp/file_urls:/tmp/file_urls

functional_wsp:
<<: *service_base
@@ -34,6 +36,13 @@ services:
- scrapyd
- ftp_server

functional_desy:
<<: *service_base
command: py.test -vv tests/functional/desy
links:
- scrapyd
- ftp_server

functional_arxiv:
<<: *service_base
command: py.test -vv tests/functional/arxiv
@@ -68,6 +77,8 @@ services:
environment:
- PUBLICHOST=localhost
volumes:
- ${PWD}/tests/functional/desy/fixtures/ftp_server/FFT:/home/ftpusers/bob/FFT
- ${PWD}/tests/functional/desy/fixtures/ftp_server/DESY:/home/ftpusers/bob/DESY
- ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP
- ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd

143 changes: 142 additions & 1 deletion hepcrawl/crawler2hep.py
@@ -15,8 +15,149 @@

from __future__ import absolute_import, division, print_function

import os
import datetime

from inspire_schemas.api import LiteratureBuilder

from hepcrawl.utils import get_file_name_from_url


def _update_record_fft(record, index_fft_file_paths):
def _update_fft_fields(fft_fields, index_fft_file_paths):
new_fft_fields = []
for fft_field in fft_fields:
file_name = get_file_name_from_url(fft_field['path'])
if file_name in index_fft_file_paths:
fft_field['path'] = index_fft_file_paths[file_name]
new_fft_fields.append(fft_field)

return new_fft_fields

record['_fft'] = _update_fft_fields(record['_fft'], index_fft_file_paths)
return record
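
For context, a rough sketch of the remapping this helper performs (hypothetical data; it assumes get_file_name_from_url returns the file name component of the URL, which is defined elsewhere in hepcrawl.utils):

# The FFT path pointing at the remote server is replaced by the path of the
# locally downloaded copy; entries with no downloaded counterpart are dropped.
record = {'_fft': [{'path': 'ftp://ftp.example.org/FFT/desy_thesis.pdf'}]}
index = {'desy_thesis.pdf': '/tmp/file_urls/full/desy_thesis.pdf'}
record = _update_record_fft(record, index)
# record['_fft'][0]['path'] is now '/tmp/file_urls/full/desy_thesis.pdf'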


def _has_publication_info(item):
"""If any publication info."""
return item.get('pubinfo_freetext') or item.get('journal_volume') or \
item.get('journal_title') or \
item.get('journal_year') or \
item.get('journal_issue') or \
item.get('journal_fpage') or \
item.get('journal_lpage') or \
item.get('journal_artid') or \
item.get('journal_doctype')


def _filter_fields(item, keys):
"""Filter away keys."""
for key in keys:
item.pop(key, None)


def _normalize_hepcrawl_record(item, source):
if 'related_article_doi' in item:
item['dois'] += item.pop('related_article_doi', [])

item['titles'] = [{
'title': item.pop('title', ''),
'subtitle': item.pop('subtitle', ''),
'source': source,
}]

item['abstracts'] = [{
'value': item.pop('abstract', ''),
'source': source,
}]

item['imprints'] = [{
'date': item.pop('date_published', ''),
}]

item['copyright'] = [{
'holder': item.pop('copyright_holder', ''),
'year': item.pop('copyright_year', ''),
'statement': item.pop('copyright_statement', ''),
'material': item.pop('copyright_material', ''),
}]

if _has_publication_info(item):
item['publication_info'] = [{
'journal_title': item.pop('journal_title', ''),
'journal_volume': item.pop('journal_volume', ''),
'journal_issue': item.pop('journal_issue', ''),
'artid': item.pop('journal_artid', ''),
'page_start': item.pop('journal_fpage', ''),
'page_end': item.pop('journal_lpage', ''),
'note': item.pop('journal_doctype', ''),
'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
'pubinfo_material': item.pop('pubinfo_material', ''),
}]
if item.get('journal_year'):
item['publication_info'][0]['year'] = int(
item.pop('journal_year')
)

# Remove any fields
_filter_fields(item, [
'journal_title',
'journal_volume',
'journal_year',
'journal_issue',
'journal_fpage',
'journal_lpage',
'journal_doctype',
'journal_artid',
'pubinfo_freetext',
'pubinfo_material',
])

return item
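
A small before/after sketch of the normalization above, with illustrative values:

item = {
    'title': 'A DESY thesis',
    'abstract': 'Short abstract.',
    'date_published': '2017-08-02',
}
item = _normalize_hepcrawl_record(item, source='desy')
# item['titles'] == [{'title': 'A DESY thesis', 'subtitle': '', 'source': 'desy'}]
# item['abstracts'] == [{'value': 'Short abstract.', 'source': 'desy'}]
# item['imprints'] == [{'date': '2017-08-02'}]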


def _generate_acquisition_source(crawler_record, source):
crawler_record['acquisition_source'] = {
'source': source,
'method': 'hepcrawl',
'datetime': datetime.datetime.now().isoformat(),
'submission_number': os.environ.get('SCRAPY_JOB', ''),
}
return crawler_record


def to_hep(
item,
source,
item_format='hepcrawl',
fft_file_paths=None,
):
item = _generate_acquisition_source(
crawler_record=item,
source=source,
)

if item_format == 'hep':
return hep2hep(
crawler_record=item,
fft_file_paths=fft_file_paths,
)
elif item_format == 'hepcrawl':
item = _normalize_hepcrawl_record(
item=item,
source=source,
)
return crawler2hep(dict(item))
else:
raise Exception('Unknown item_format::{}'.format(item_format))
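
Usage sketch with illustrative variable names: the 'hep' branch only remaps FFT paths and is presumably the one used by the new DESY spider, while existing spiders keep going through the 'hepcrawl' normalization.

# Record already in INSPIRE ('hep') shape: only the FFT paths are remapped.
hep_record = to_hep(
    item=desy_record,
    source='desy',
    item_format='hep',
    fft_file_paths={'desy_record.xml': '/tmp/file_urls/full/desy_record.xml'},
)

# Record in the flat 'hepcrawl' shape: normalized and built via crawler2hep().
inspire_record = to_hep(item=wsp_record, source='WSP', item_format='hepcrawl')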


def hep2hep(crawler_record, fft_file_paths):
if fft_file_paths:
crawler_record = _update_record_fft(crawler_record, fft_file_paths)

return crawler_record


def crawler2hep(crawler_record):

@@ -98,7 +239,7 @@ def _filter_affiliation(affiliations):
acquisition_source = crawler_record.get('acquisition_source', {})
builder.add_acquisition_source(
method=acquisition_source['method'],
date=acquisition_source['date'],
date=acquisition_source['datetime'],
source=acquisition_source['source'],
submission_number=acquisition_source['submission_number'],
)
142 changes: 62 additions & 80 deletions hepcrawl/pipelines.py
@@ -15,30 +15,55 @@

from __future__ import absolute_import, division, print_function

import datetime
import os

import requests

from .crawler2hep import crawler2hep
from scrapy import Request
from scrapy.pipelines.files import FilesPipeline

from inspire_schemas.utils import validate

def has_publication_info(item):
"""If any publication info."""
return item.get('pubinfo_freetext') or item.get('journal_volume') or \
item.get('journal_title') or \
item.get('journal_year') or \
item.get('journal_issue') or \
item.get('journal_fpage') or \
item.get('journal_lpage') or \
item.get('journal_artid') or \
item.get('journal_doctype')
from hepcrawl.crawler2hep import to_hep
from hepcrawl.settings import FILES_STORE
from hepcrawl.utils import get_file_name_from_url


def filter_fields(item, keys):
"""Filter away keys."""
for key in keys:
item.pop(key, None)
class FftFilesPipeline(FilesPipeline):
"""Download all the FFT files provided by record."""

def __init__(self, *args, **kwargs):
super(FftFilesPipeline, self).__init__(FILES_STORE)

def get_media_requests(self, item, info):
"""Download FFT files using FTP."""
if item.get('file_urls'):
for fft_url in item.file_urls:
yield Request(
url=fft_url,
meta=item.ftp_params,
)

def item_completed(self, results, item, info):
"""Create a map that connects file names with downloaded files."""
def _get_absolute_local_file_path(path):
return os.path.abspath(
os.path.join(
FILES_STORE,
path
)
)

map_file_names_paths = {}
for ok, result_data in results:
if ok:
map_file_names_paths[
get_file_name_from_url(result_data['url'])
] = _get_absolute_local_file_path(result_data['path'])

item.file_paths = map_file_names_paths

return item
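
For reference, the results handed over by Scrapy's FilesPipeline are (success, info) tuples; roughly, assuming get_file_name_from_url returns the file name part of the URL and FILES_STORE is /tmp/file_urls as in the docker-compose change above:

results = [
    (True, {'url': 'ftp://ftp.example.org/FFT/desy_record.xml',
            'path': 'full/desy_record.xml',
            'checksum': '...'}),
]
# item.file_paths becomes:
# {'desy_record.xml': '/tmp/file_urls/full/desy_record.xml'}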


class InspireAPIPushPipeline(object):
@@ -50,74 +75,31 @@ def __init__(self):
def open_spider(self, spider):
self.results_data = []

def _post_enhance_item(self, item, spider):
fft_file_paths = item.file_paths
item_format = item.item_format
item = item.item if item.item else item
source = spider.name

return to_hep(
item=item,
source=source,
item_format=item_format,
fft_file_paths=fft_file_paths,
)

def process_item(self, item, spider):
"""Convert internal format to INSPIRE data model."""
self.count += 1
if 'related_article_doi' in item:
item['dois'] += item.pop('related_article_doi', [])

source = spider.name
item['acquisition_source'] = {
'source': source,
'method': 'hepcrawl',
'date': datetime.datetime.now().isoformat(),
'submission_number': os.environ.get('SCRAPY_JOB', ''),
}

item['titles'] = [{
'title': item.pop('title', ''),
'subtitle': item.pop('subtitle', ''),
'source': source,
}]
item['abstracts'] = [{
'value': item.pop('abstract', ''),
'source': source,
}]
item['imprints'] = [{
'date': item.pop('date_published', ''),
}]
item['copyright'] = [{
'holder': item.pop('copyright_holder', ''),
'year': item.pop('copyright_year', ''),
'statement': item.pop('copyright_statement', ''),
'material': item.pop('copyright_material', ''),
}]
if not item.get('publication_info'):
if has_publication_info(item):
item['publication_info'] = [{
'journal_title': item.pop('journal_title', ''),
'journal_volume': item.pop('journal_volume', ''),
'journal_issue': item.pop('journal_issue', ''),
'artid': item.pop('journal_artid', ''),
'page_start': item.pop('journal_fpage', ''),
'page_end': item.pop('journal_lpage', ''),
'note': item.pop('journal_doctype', ''),
'pubinfo_freetext': item.pop('pubinfo_freetext', ''),
'pubinfo_material': item.pop('pubinfo_material', ''),
}]
if item.get('journal_year'):
item['publication_info'][0]['year'] = int(
item.pop('journal_year')
)

# Remove any fields
filter_fields(item, [
'journal_title',
'journal_volume',
'journal_year',
'journal_issue',
'journal_fpage',
'journal_lpage',
'journal_doctype',
'journal_artid',
'pubinfo_freetext',
'pubinfo_material',
])

item = crawler2hep(dict(item))
spider.logger.debug('Validated item.')
self.results_data.append(item)
return item
hep_item = self._post_enhance_item(item, spider)

validate(hep_item, 'hep')
spider.logger.debug('Validated item by Inspire Schemas.')

self.results_data.append(hep_item)

return hep_item
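
Validation now goes through inspire_schemas instead of the conversion code itself; a behaviour sketch (the error handling below is illustrative, not part of the pipeline):

from jsonschema import ValidationError
from inspire_schemas.utils import validate

try:
    validate(hep_item, 'hep')  # returns None when the record matches the 'hep' schema
except ValidationError:
    # an invalid record raises here and is not pushed to INSPIRE
    raise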

def _prepare_payload(self, spider):
"""Return payload for push."""
2 changes: 1 addition & 1 deletion hepcrawl/settings.py
@@ -85,7 +85,7 @@
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'scrapy.pipelines.files.FilesPipeline': 1,
'hepcrawl.pipelines.FftFilesPipeline': 1,
'hepcrawl.pipelines.InspireCeleryPushPipeline': 300,
}
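
Scrapy runs item pipelines in ascending priority order, so the FFT files are downloaded and mapped onto the item before it is converted, validated and pushed:

ITEM_PIPELINES = {
    'hepcrawl.pipelines.FftFilesPipeline': 1,              # first: fetch FFT files
    'hepcrawl.pipelines.InspireCeleryPushPipeline': 300,   # then: convert and push
}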

(Diffs for the remaining changed files were not loaded.)
