Skip to content

Commit

Permalink
Merge pull request #187 from david-caro/fix_desy_documents
Browse files Browse the repository at this point in the history
desy: properly use/populate documents url
  • Loading branch information
david-caro authored Oct 23, 2017
2 parents 42b85d1 + b3d9bb2 commit 903c5ff
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 22 deletions.
8 changes: 2 additions & 6 deletions hepcrawl/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@


class DocumentsPipeline(FilesPipeline):
"""Download all the documents provided by record.
"""Download all the documents the record passed to download.
Note:
Expand All @@ -55,7 +55,6 @@ def __init__(self, store_uri, *args, **kwargs):
)

def get_media_requests(self, item, info):
"""Download documents using FTP."""
if item.get('file_urls'):
logging.info(
'Got the following files to download:\n%s' % pprint.pformat(
Expand All @@ -70,10 +69,7 @@ def get_media_requests(self, item, info):

def get_absolute_file_path(self, path):
    """Return the absolute filesystem path of a file stored by the pipeline.

    Args:
        path(str): path relative to the pipeline store base directory
            (``self.store.basedir``).

    Returns:
        str: absolute, normalized path of the stored file.
    """
    # The diff residue duplicated the old and new join calls; keep the
    # single-call form joining the store base dir with the relative path.
    return os.path.abspath(
        os.path.join(self.store.basedir, path)
    )

def item_completed(self, results, item, info):
Expand Down
21 changes: 14 additions & 7 deletions hepcrawl/spiders/desy_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,14 +178,19 @@ def start_requests(self):
yield request

@staticmethod
def _has_to_be_downloaded(current_url):
    """Tell if the given document url should be fetched by the crawler.

    Only schemeless urls (that is, plain local/relative paths) are
    considered downloadable; any url that already carries a scheme
    (``http``, ``ftp``, ...) is left for later resolution.

    Args:
        current_url(str): url (or plain path) of the document.

    Returns:
        bool: ``True`` when ``current_url`` is a schemeless local path.
    """
    def _is_local_path(url):
        # urlparse yields an empty ``scheme`` for bare paths.
        parsed_url = urllib.parse.urlparse(url)
        return not parsed_url.scheme

    return _is_local_path(current_url)

@staticmethod
def _get_full_uri(current_url, base_url, schema='ftp', hostname=None):
hostname = hostname or ''

parsed_url = urllib.parse.urlparse(current_url)

if parsed_url.scheme and parsed_url.scheme not in ['ftp', 'file']:
return current_url

current_path = parsed_url.path
if os.path.isabs(current_path):
full_path = current_path
Expand Down Expand Up @@ -228,22 +233,24 @@ def parse(self, response):
self.log('Got %d hep records' % len(hep_records))

for hep_record in hep_records:
list_file_urls = [
files_to_download = [
self._get_full_uri(
current_url=document['url'],
base_url=base_url,
schema=url_schema,
hostname=hostname,
)
for document in hep_record.get('documents', [])
if self._has_to_be_downloaded(document['url'])
]

self.log(
'Got the following attached documents: %s' % list_file_urls
'Got the following attached documents to download: %s'
% files_to_download
)
parsed_item = ParsedItem(
record=hep_record,
file_urls=list_file_urls,
file_urls=files_to_download,
ftp_params=ftp_params,
record_format='hep',
)
Expand Down
17 changes: 11 additions & 6 deletions hepcrawl/tohep.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,25 @@ def _get_updated_documents(current_documents, record_files):
Args:
current_documents(list(dict)): current documents as generated
by ``dojson``. We expect each of them to have, at least, a key
named ``url``.
named ``old_url``.
record_files(list(RecordFile)): files attached to the record as
populated by :class:`hepcrawl.pipelines.DocumentsPipeline`.
"""
record_files_index = {
record_file.name: record_file.path
os.path.basename(record_file.name): record_file.path
for record_file in record_files
}
new_documents = []
for document in current_documents:
file_name = os.path.basename(document['url'])
if file_name in record_files_index:
document['url'] = record_files_index[file_name]
new_documents.append(document)
url = document.get('old_url', document.get('url', ''))
full_file_name = os.path.basename(url)
if url and full_file_name in record_files_index:
document['url'] = record_files_index[full_file_name]
elif url:
document['url'] = document['old_url']

new_documents.append(document)

return new_documents

Expand Down Expand Up @@ -200,6 +204,7 @@ def hep_to_hep(hep_record, record_files):
"""
if record_files:
LOGGER.debug('Updating documents from: %s', hep_record['documents'])
LOGGER.debug('With record_files: %s', record_files)
hep_record['documents'] = _get_updated_documents(
current_documents=hep_record['documents'],
record_files=record_files,
Expand Down
10 changes: 10 additions & 0 deletions hepcrawl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,16 @@ def __init__(self, path, name=None):

self.name = name

def __repr__(self):
    """Debug representation; delegates to :meth:`__str__`."""
    return str(self)

def __str__(self):
    """Human-readable form, e.g. ``RecordFile(path="...", name="...")``.

    Bug fix: the original passed ``(self.name, self.path)`` into a format
    string whose labels are ``path`` first and ``name`` second, so each
    value was printed under the wrong label. Keep the argument order in
    sync with the labels.
    """
    return '%s(path="%s", name="%s")' % (
        self.__class__.__name__,
        self.path,
        self.name,
    )


class ParsedItem(dict):
"""Each of the individual items returned by the spider to the pipeline.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@
"core": true,
"documents": [
{
"url": "/tmp/file_urls/full/85f78f549bd45b34999bc72353d982127043c341.pdf",
"url": "http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf",
"fulltext": true,
"key": "document"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@
"core": true,
"documents": [
{
"url": "/tmp/file_urls/full/395353fad4fc8ce1c37afe9f019e1473917747e9.pdf",
"url": "http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf",
"fulltext": true,
"key": "document"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@
<subfield code="u">DESY</subfield>
</datafield>
<datafield tag="FFT" ind1=" " ind2=" ">
<subfield code="a">FFT/desy-thesis-17-036.title.pdf</subfield>
<subfield code="a">http://bib-pubdb1.desy.de/record/389977/files/desy-thesis-17-035.title.pdf</subfield>
<subfield code="d">Fulltext</subfield>
<subfield code="t">INSPIRE-PUBLIC</subfield>
</datafield>
Expand Down

0 comments on commit 903c5ff

Please sign in to comment.