Commit
WIP_light_in_the_tunel
Signed-off-by: Spiros Delviniotis <[email protected]>
spirosdelviniotis committed Jul 31, 2017
1 parent 2b2a46d commit 2a4c3ed
Showing 7 changed files with 142 additions and 61 deletions.
2 changes: 1 addition & 1 deletion docker-compose.test.yml
@@ -27,7 +27,7 @@ services:
- ${PWD}:/code/
- ${PWD}/tests/functional/scrapyd_coverage_runner.conf:/etc/scrapyd/scrapyd.conf
- /tmp/WSP:/tmp/WSP
- /tmp/DESY:/tmp/DESY
- /tmp/file_urls:/tmp/file_urls

functional_wsp:
<<: *service_base
37 changes: 37 additions & 0 deletions hepcrawl/crawler2hep.py
@@ -17,6 +17,43 @@

from inspire_schemas.api import LiteratureBuilder

from hepcrawl.utils import get_file_name_from_url


def _update_record_fft_links(record, map_fft_file_paths):
def _list_new_fft_links(old_fft_links, map_fft_file_paths):
new_fft_links = []
for fft_link in old_fft_links:
file_name = get_file_name_from_url(fft_link['path'])
if file_name in map_fft_file_paths:
new_fft_links.append(
{
'path': map_fft_file_paths[file_name],
}
)

return new_fft_links

old_fft_links = record['_fft']
record['_fft'] = _list_new_fft_links(old_fft_links, map_fft_file_paths)
return record


def to_hep(item, item_format='hepcrawl', fft_file_paths=None):
if item_format == 'hep':
return hep2hep(item, fft_file_paths)
elif item_format == 'hepcrawl':
return crawler2hep(dict(item))
else:
raise Exception('Unknown item_format::{}'.format(item_format))


def hep2hep(crawler_record, fft_file_paths):
if fft_file_paths:
crawler_record = _update_record_fft_links(crawler_record, fft_file_paths)

return crawler_record


def crawler2hep(crawler_record):

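A minimal sketch of how the new to_hep() dispatch behaves, assuming get_file_name_from_url() returns the basename of a URL; the record and path map below are made up for illustration:

from hepcrawl.crawler2hep import to_hep

# Hypothetical 'hep'-format record whose FFT link still points at the source.
record = {'_fft': [{'path': 'ftp://ftp.example.org/FFT/test_fft_1.txt'}]}

# Map of file name -> locally downloaded path, as produced by the download step.
downloaded = {'test_fft_1.txt': '/tmp/file_urls/full/test_fft_1.txt'}

# 'hep' items bypass crawler2hep() and only get their '_fft' links remapped.
hep_record = to_hep(record, item_format='hep', fft_file_paths=downloaded)
assert hep_record['_fft'] == [{'path': '/tmp/file_urls/full/test_fft_1.txt'}]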
64 changes: 26 additions & 38 deletions hepcrawl/pipelines.py
@@ -25,7 +25,7 @@

from inspire_schemas.utils import validate

from .crawler2hep import crawler2hep
from hepcrawl.crawler2hep import to_hep
from hepcrawl.settings import FILES_STORE
from hepcrawl.utils import get_file_name_from_url

@@ -95,10 +95,7 @@ def open_spider(self, spider):
self.results_data = []

def _post_enhance_item(self, item, spider):
item = self._generate_record_meta(item, spider)
source = spider.name

if source != 'desy': # Should be changed to other generic flag like "hep_record"
def _normalize_hepcrawl_record(item, source):
if 'related_article_doi' in item:
item['dois'] += item.pop('related_article_doi', [])

@@ -155,10 +152,25 @@ def _post_enhance_item(self, item, spider):
'pubinfo_material',
])

item = crawler2hep(dict(item))
spider.logger.debug('Validated item by Builder.')
return item

return item
fft_file_paths = item.get('file_paths')
item_format = item.get('format', 'hepcrawl')
item = item.get('record_item') if item.get('record_item') else item
item = self._generate_record_meta(item, spider)
source = spider.name

if item_format != 'hep':
item = _normalize_hepcrawl_record(
item=item,
source=source,
)

return to_hep(
item=item,
item_format=item_format,
fft_file_paths=fft_file_paths,
)

def _generate_record_meta(self, json_record, spider):
json_record['acquisition_source'] = {
@@ -169,42 +181,18 @@
}
return json_record

def _update_record_fft_links(self, record, map_fft_file_paths):
def _list_new_fft_links(old_fft_links, map_fft_file_paths):
new_fft_links = []
for fft_link in old_fft_links:
file_name = get_file_name_from_url(fft_link['path'])
if file_name in map_fft_file_paths:
new_fft_links.append(
{
'path': map_fft_file_paths[file_name],
}
)

return new_fft_links

old_fft_links = record['_fft']

# Provides only list of FFT paths, not list of dicts as it is defined in schemas
record['_fft'] = _list_new_fft_links(old_fft_links, map_fft_file_paths)

return record

def process_item(self, item, spider):
"""Convert internal format to INSPIRE data model."""
self.count += 1
fft_file_paths = item.get('file_paths')

if item.get('hep_record'):
item = item.get('record_item')
if fft_file_paths:
item = self._update_record_fft_links(item, fft_file_paths)
hep_item = self._post_enhance_item(item, spider)

item = self._post_enhance_item(item, spider)
validate(item, 'hep')
self.results_data.append(item)
validate(hep_item, 'hep')
spider.logger.debug('Validated item by Inspire Schemas.')

return item
self.results_data.append(hep_item)

return hep_item

def _prepare_payload(self, spider):
"""Return payload for push."""
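After this refactor, _post_enhance_item() accepts two item shapes; roughly as follows (values illustrative, and 'file_paths' is presumably populated from 'file_urls' by the file-download machinery, which sits outside this diff):

# Plain 'hepcrawl' items keep the old path: normalize, then convert via to_hep().
hepcrawl_item = {'titles': [{'title': 'Some title'}], 'dois': []}

# Wrapped 'hep' items (e.g. from the DESY spider) skip normalization entirely;
# only their '_fft' links are remapped to the locally downloaded files.
hep_item = {
    'record_item': {'_fft': [{'path': 'ftp://ftp.example.org/FFT/test_fft_1.txt'}]},
    'file_paths': {'test_fft_1.txt': '/tmp/file_urls/full/test_fft_1.txt'},
    'format': 'hep',
}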
4 changes: 1 addition & 3 deletions hepcrawl/spiders/desy_spider.py
@@ -12,7 +12,6 @@
from __future__ import absolute_import, division, print_function

import os
import urlparse  # must be replaced to support Python 3

from lxml import etree
from dojson.contrib.marc21.utils import create_record
@@ -155,11 +154,10 @@ def parse(self, response):
]

yield {
'FTP': self.ftp_enabled,
'record_item': hep_record,
'file_urls': list_file_urls,
'ftp_params': ftp_params,
'hep_record': True,
'format': 'hep',
}

def handle_package_ftp(self, response):
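The dropped urlparse import had been flagged as a Python 3 blocker; should URL parsing be needed here again, a portable form would be (a sketch, not part of this commit):

try:
    import urllib.parse as urlparse  # Python 3
except ImportError:
    import urlparse  # Python 2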
16 changes: 16 additions & 0 deletions hepcrawl/testlib/fixtures.py
@@ -11,6 +11,7 @@

import os
import json
import shutil

from scrapy.http import Request, TextResponse
from scrapy.selector import Selector
@@ -131,3 +132,18 @@ def expected_json_results_from_file(*path_chunks, **kwargs):
expected_data = json.load(fd)

return expected_data


def clean_dir(path='/tmp/WSP/'):
"""
Deletes all contained files of given target directory path.
Args:
path: Absolute path of target directory to be cleaned.
Example:
>>> clean_dir('/dir_1/dir_11/')
"""
shutil.rmtree(path, ignore_errors=True)
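The helper is intended as fixture teardown; the DESY tests below call it after the yield, along these lines (a sketch; the fixture name is hypothetical):

import pytest

@pytest.fixture(scope='function')
def isolated_tmp_dirs():
    yield
    # Teardown: drop everything the crawl left behind.
    clean_dir('/tmp/file_urls')
    clean_dir('/tmp/DESY')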
74 changes: 60 additions & 14 deletions tests/functional/desy/test_desy.py
@@ -20,6 +20,7 @@
from hepcrawl.testlib.fixtures import (
get_test_suite_path,
expected_json_results_from_file,
clean_dir,
)
from hepcrawl.testlib.tasks import app as celery_app
from hepcrawl.testlib.utils import get_crawler_instance
@@ -34,16 +35,40 @@ def override_generated_fields(record):

def compare_two_files_using_md5(file_1, file_2):
"""Compares two files calculating the md5 hash."""
def _generate_md5_hash(file):
def _generate_md5_hash(file_path):
hasher = hashlib.md5()
with open(str(file), 'rb') as f:
buf = f.read()
with open(str(file_path), 'rb') as fd:
buf = fd.read()
hasher.update(buf)
return hasher.hexdigest()

return _generate_md5_hash(file_1) == _generate_md5_hash(file_2)


@pytest.fixture(scope="function")
def get_fft_1_path():
return get_test_suite_path(
'desy',
'fixtures',
'ftp_server',
'FFT',
'test_fft_1.txt',
test_suite='functional',
)


@pytest.fixture(scope="function")
def get_fft_2_path():
return get_test_suite_path(
'desy',
'fixtures',
'ftp_server',
'FFT',
'test_fft_2.txt',
test_suite='functional',
)


@pytest.fixture(scope="function")
def set_up_ftp_environment():
netrc_location = get_test_suite_path(
@@ -57,7 +82,7 @@ def set_up_ftp_environment():
# The test must wait until the docker environment is up (takes about 10 seconds).
sleep(10)

return {
yield {
'CRAWLER_HOST_URL': 'http://scrapyd:6800',
'CRAWLER_PROJECT': 'hepcrawl',
'CRAWLER_ARGUMENTS': {
@@ -66,6 +91,9 @@
}
}

clean_dir('/tmp/file_urls')
clean_dir('/tmp/DESY')


@pytest.fixture(scope="function")
def set_up_local_environment():
@@ -77,14 +105,17 @@ def set_up_local_environment():
test_suite='functional',
)

return {
yield {
'CRAWLER_HOST_URL': 'http://scrapyd:6800',
'CRAWLER_PROJECT': 'hepcrawl',
'CRAWLER_ARGUMENTS': {
'package_path': package_location,
}
}

clean_dir('/tmp/file_urls')
clean_dir('/tmp/DESY')


@pytest.mark.parametrize(
'expected_results',
@@ -99,7 +130,12 @@ def set_up_local_environment():
'smoke',
]
)
def test_desy_ftp(set_up_ftp_environment, expected_results, capsys):
def test_desy_ftp(
set_up_ftp_environment,
expected_results,
get_fft_1_path,
get_fft_2_path,
):
crawler = get_crawler_instance(set_up_ftp_environment.get('CRAWLER_HOST_URL'))

results = CeleryMonitor.do_crawl(
@@ -119,11 +155,12 @@ def test_desy_ftp(set_up_ftp_environment, expected_results, capsys):

assert sorted(gotten_results) == expected_results

# Check if downloaded files are there MD5
# for record in expected_results: # WIP
# fft_file_paths = sorted(record['_fft'])
# assert compare_two_files_using_md5(fft_file_paths[0]['path'], 'file_1 from ftp server')
# assert compare_two_files_using_md5(fft_file_paths[1]['path'], 'file_2 from ftp server')
    # Check, via their MD5 hashes, that the downloaded files are there.
for record in expected_results:
fft_file_paths = sorted(record['_fft'])

assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_1_path)
assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_2_path)


@pytest.mark.parametrize(
@@ -139,7 +176,12 @@ def test_desy_ftp(set_up_ftp_environment, expected_results, capsys):
'smoke',
]
)
def test_desy_local_package_path(set_up_local_environment, expected_results):
def test_desy_local_package_path(
set_up_local_environment,
expected_results,
get_fft_1_path,
get_fft_2_path,
):
crawler = get_crawler_instance(set_up_local_environment.get('CRAWLER_HOST_URL'))

results = CeleryMonitor.do_crawl(
@@ -159,6 +201,10 @@ def test_desy_local_package_path(set_up_local_environment, expected_results):

assert sorted(gotten_results) == expected_results

# Check if downloaded files are there MD5
# WIP
    # Check, via their MD5 hashes, that the downloaded files are there.
for record in expected_results:
fft_file_paths = sorted(record['_fft'])

assert compare_two_files_using_md5(fft_file_paths[0]['path'], get_fft_1_path)
assert compare_two_files_using_md5(fft_file_paths[1]['path'], get_fft_2_path)
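The whole-file read in compare_two_files_using_md5() above is fine for the small FFT fixtures; for larger files the same check could stream the data instead, e.g. (a sketch, not part of this commit):

import hashlib

def md5_of(file_path, chunk_size=64 * 1024):
    # Hash in chunks so large fixtures never load fully into memory.
    hasher = hashlib.md5()
    with open(file_path, 'rb') as fd:
        for chunk in iter(lambda: fd.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()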

6 changes: 1 addition & 5 deletions tests/functional/wsp/test_wsp.py
@@ -13,14 +13,14 @@

import pytest
import os
import shutil

from time import sleep

from hepcrawl.testlib.celery_monitor import CeleryMonitor
from hepcrawl.testlib.fixtures import (
get_test_suite_path,
expected_json_results_from_file,
clean_dir,
)
from hepcrawl.testlib.tasks import app as celery_app
from hepcrawl.testlib.utils import get_crawler_instance
@@ -90,10 +90,6 @@ def remove_generated_files(package_location):
os.unlink(os.path.join(package_location, file_name))


def clean_dir(path='/tmp/WSP/'):
shutil.rmtree(path, ignore_errors=True)


@pytest.mark.parametrize(
'expected_results',
[
