WIP_per_coding
Signed-off-by: Spiros Delviniotis <[email protected]>
spirosdelviniotis committed Aug 4, 2017
1 parent 066864a commit da9acb4
Showing 14 changed files with 287 additions and 205 deletions.
87 changes: 48 additions & 39 deletions hepcrawl/crawler2hep.py
@@ -20,22 +20,34 @@

 from inspire_schemas.api import LiteratureBuilder
 
-from hepcrawl.utils import get_file_name_from_url
-
-
-def _update_record_fft(record, index_fft_file_paths):
-    def _update_fft_fields(fft_fields, index_fft_file_paths):
-        new_fft_fields = []
-        for fft_field in fft_fields:
-            file_name = get_file_name_from_url(fft_field['path'])
-            if file_name in index_fft_file_paths:
-                fft_field['path'] = index_fft_file_paths[file_name]
-            new_fft_fields.append(fft_field)
-
-        return new_fft_fields
-
-    record['_fft'] = _update_fft_fields(record['_fft'], index_fft_file_paths)
-    return record
+def _get_updated_fft_fields(current_fft_fields, record_files):
+    """
+    Params:
+        current_fft_fields(list(dict)): record current fft fields as generated by ``dojson``.
+            We expect each of them to have, at least, a key named ``path``.
+        record_files(list(RecordFile)): files attached to the record as populated by
+            ``FftFilesPipeline``.
+    """
+    record_files_index = {
+        record_file.name: record_file.path
+        for record_file in record_files
+    }
+    new_fft_fields = []
+    import logging
+    logger = logging.getLogger(__name__)
+    logger.log(logging.INFO,
+               "-------------------- _get_updated_fft_fields -------------------")
+    logger.log(logging.INFO,
+               'current_fft_fields: {}'.format(current_fft_fields))
+    for fft_field in current_fft_fields:
+        file_name = os.path.basename(fft_field['path'])
+        if file_name in record_files_index:
+            fft_field['path'] = record_files_index[file_name]
+        new_fft_fields.append(fft_field)
 
     return new_fft_fields
 
 
 def _has_publication_info(item):
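
A minimal sketch of the remapping that ``_get_updated_fft_fields`` performs, runnable on its own and not part of the commit. ``RecordFile`` below is a hypothetical stand-in for the class imported from ``hepcrawl.utils`` (only its ``name`` and ``path`` attributes are assumed), and all paths are invented:

    import os
    from collections import namedtuple

    # Hypothetical stand-in for hepcrawl.utils.RecordFile (not shown in this diff).
    RecordFile = namedtuple('RecordFile', ['path', 'name'])

    record_files = [
        RecordFile(path='/data/full/1234.pdf', name='1234.pdf'),
    ]
    current_fft_fields = [
        {'path': 'ftp://ftp.example.org/pub/1234.pdf', 'type': 'INSPIRE-PUBLIC'},
    ]

    # Same logic as _get_updated_fft_fields: index the downloaded files by
    # name, then rewrite each fft field whose basename has a local copy.
    index = {rf.name: rf.path for rf in record_files}
    for fft_field in current_fft_fields:
        file_name = os.path.basename(fft_field['path'])
        if file_name in index:
            fft_field['path'] = index[file_name]

    print(current_fft_fields)
    # [{'path': '/data/full/1234.pdf', 'type': 'INSPIRE-PUBLIC'}]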
@@ -116,50 +128,47 @@ def _normalize_hepcrawl_record(item, source):
     return item
 
 
-def _generate_acquisition_source(crawler_record, source):
-    crawler_record['acquisition_source'] = {
+def _generate_acquisition_source(source):
+    acquisition_source = {
         'source': source,
         'method': 'hepcrawl',
         'datetime': datetime.datetime.now().isoformat(),
         'submission_number': os.environ.get('SCRAPY_JOB', ''),
     }
-    return crawler_record
+    return acquisition_source
 
 
-def to_hep(
-    item,
-    source,
-    item_format='hepcrawl',
-    fft_file_paths=None,
-):
-    item = _generate_acquisition_source(
-        crawler_record=item,
-        source=source,
-    )
+def item_to_hep(
+    item,
+    source,
+):
+    item.record['acquisition_source'] = _generate_acquisition_source(source=source)
 
-    if item_format == 'hep':
-        return hep2hep(
-            crawler_record=item,
-            fft_file_paths=fft_file_paths,
+    if item.record_format == 'hep':
+        return hep_to_hep(
+            hep_record=item.record,
+            record_files=item.record_files,
         )
-    elif item_format == 'hepcrawl':
+    elif item.record_format == 'hepcrawl':
         item = _normalize_hepcrawl_record(
             item=item,
             source=source,
         )
-        return crawler2hep(dict(item))
+        return hepcrawl_to_hep(dict(item))
     else:
-        raise Exception('Unknown item_format::{}'.format(item_format))
+        raise Exception('Unknown item_format::{}'.format(item.record_format))
 
 
-def hep2hep(crawler_record, fft_file_paths):
-    if fft_file_paths:
-        crawler_record = _update_record_fft(crawler_record, fft_file_paths)
+def hep_to_hep(hep_record, record_files):
+    hep_record['_fft'] = _get_updated_fft_fields(
+        current_fft_fields=hep_record['_fft'],
+        record_files=record_files,
+    )
 
-    return crawler_record
+    return hep_record
 
 
-def crawler2hep(crawler_record):
+def hepcrawl_to_hep(crawler_record):
 
     def _filter_affiliation(affiliations):
         return [
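A usage sketch for the renamed entry point, under stated assumptions: the spider's item class is not shown in this diff, so ``ParsedItem`` below is a namedtuple stand-in exposing only the three attributes ``item_to_hep`` reads (``record``, ``record_format``, ``record_files``); the record contents and ``source`` value are invented.

    from collections import namedtuple

    from hepcrawl.crawler2hep import item_to_hep

    # Stand-in for the spider's item; only these attributes are known from the diff.
    ParsedItem = namedtuple('ParsedItem', ['record', 'record_format', 'record_files'])

    item = ParsedItem(
        record={'_fft': [], 'titles': [{'title': 'A WIP record'}]},
        record_format='hep',
        record_files=[],
    )

    hep_record = item_to_hep(item=item, source='arxiv')
    assert hep_record['acquisition_source']['source'] == 'arxiv'
    assert hep_record['acquisition_source']['method'] == 'hepcrawl'
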
55 changes: 29 additions & 26 deletions hepcrawl/pipelines.py
@@ -24,16 +24,23 @@

 from inspire_schemas.utils import validate
 
-from hepcrawl.crawler2hep import to_hep
+from hepcrawl.crawler2hep import item_to_hep
 from hepcrawl.settings import FILES_STORE
-from hepcrawl.utils import get_file_name_from_url
+from hepcrawl.utils import RecordFile
 
 
 class FftFilesPipeline(FilesPipeline):
-    """Download all the FFT files provided by record."""
+    """Download all the FFT files provided by record.
+
+    Note:
+        This pipeline only runs if the spider returns a ``ParsedItem`` that has
+        a ``file_urls`` property.
+    """
 
-    def __init__(self, *args, **kwargs):
-        super(FftFilesPipeline, self).__init__(FILES_STORE)
+    def __init__(self, store_uri, *args, **kwargs):
+        store_uri = store_uri or FILES_STORE
+        super(FftFilesPipeline, self).__init__(*args, store_uri=store_uri, **kwargs)
 
     def get_media_requests(self, item, info):
         """Download FFT files using FTP."""
@@ -44,24 +51,25 @@ def get_media_requests(self, item, info):
             meta=item.ftp_params,
         )
 
+    def get_absolute_file_path(self, path):
+        return os.path.abspath(
+            os.path.join(
+                self.store.basedir,
+                path
+            )
+        )
+
     def item_completed(self, results, item, info):
         """Create a map that connects file names with downloaded files."""
-        def _get_absolute_local_file_path(path):
-            return os.path.abspath(
-                os.path.join(
-                    FILES_STORE,
-                    path
-                )
-            )
-
-        map_file_names_paths = {}
-        for ok, result_data in results:
-            if ok:
-                map_file_names_paths[
-                    get_file_name_from_url(result_data['url'])
-                ] = _get_absolute_local_file_path(result_data['path'])
-
-        item.file_paths = map_file_names_paths
+        record_files = [
+            RecordFile(
+                path=self.get_absolute_file_path(result_data['path']),
+                name=os.path.basename(result_data['url']),
+            )
+            for ok, result_data in results
+            if ok
+        ]
+        item.record_files = record_files
 
         return item

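The list comprehension above reads more easily with the shape of ``results`` in mind: Scrapy's ``FilesPipeline.item_completed`` receives a list of ``(success, info)`` pairs, where ``info`` is a dict carrying ``url``, ``path`` (relative to the files store), and ``checksum`` for each successful download. A sketch with invented values:

    import os

    # Invented sample of what Scrapy passes to item_completed; the second
    # element of a failed pair is an error object, dropped by the `if ok` filter.
    results = [
        (True, {'url': 'ftp://ftp.example.org/pub/1234.pdf',
                'path': 'full/abc123.pdf',
                'checksum': 'd41d8cd98f00b204e9800998ecf8427e'}),
        (False, Exception('download failed')),
    ]

    record_files = [
        (os.path.basename(info['url']), info['path'])  # (name, store-relative path)
        for ok, info in results
        if ok
    ]
    print(record_files)  # [('1234.pdf', 'full/abc123.pdf')]
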
@@ -76,16 +84,11 @@ def open_spider(self, spider):
         self.results_data = []
 
     def _post_enhance_item(self, item, spider):
-        fft_file_paths = item.file_paths
-        item_format = item.item_format
-        item = item.item if item.item else item
         source = spider.name
 
-        return to_hep(
+        return item_to_hep(
             item=item,
             source=source,
-            item_format=item_format,
-            fft_file_paths=fft_file_paths,
         )
 
     def process_item(self, item, spider):
(diff for the remaining 12 changed files is not shown)
