Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactored set fulltexts #62

Merged
merged 5 commits into from
Mar 18, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion infoscience_exports/exports/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class ExportForm(FormLoggingMixin, forms.ModelForm):

class Meta:
model = Export

exclude = ['user', 'formats_type']
widgets = {
'name': forms.TextInput(attrs={'placeholder': ""}),
Expand Down
58 changes: 23 additions & 35 deletions infoscience_exports/exports/marc21xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
Parse a marc-21-xml file
"""
import re
from logging import getLogger
from django.utils.translation import gettext as _
from django.conf import settings
Expand Down Expand Up @@ -66,42 +67,29 @@ def set_year(date):
return ''


# get fulltext: link to pdf or link to repository if several links
def set_fulltext(fulltexts):
    """Return the best full-text link for a record.

    Only links whose path ends in ``.pdf`` (case-insensitive) are considered.
    Params, query string and fragment are dropped first, so variants such as
    ``paper.pdf?subformat=pdfa`` count as the same file as ``paper.pdf``.

    :param fulltexts: iterable of URL strings (``None`` is treated as empty)
    :return: ``''`` if no pdf is found, the pdf URL if exactly one distinct
        pdf is found, otherwise the folder of the first pdf (in input order)
        located under ``infoscience.epfl.ch/record/<id>/files``; if no pdf
        is in such a folder, the folder of the first pdf.
    """
    # rebuild each link from scheme, netloc and path only; geturl() also copes
    # with scheme-less links instead of producing a bogus "://path" string
    file_paths = [
        urlparse(fulltext)._replace(params='', query='', fragment='').geturl()
        for fulltext in (fulltexts or [])
    ]

    # only keep pdfs and remove duplicates; input order is preserved so that
    # the result does not depend on set iteration order
    unique_pdfs = []
    for file_path in file_paths:
        is_pdf = splitext(file_path)[1].lower() == '.pdf'
        if is_pdf and file_path not in unique_pdfs:
            unique_pdfs.append(file_path)

    # return empty string if no pdf found
    if not unique_pdfs:
        return ''

    # return element if only one found
    if len(unique_pdfs) == 1:
        return unique_pdfs[0]

    # multiple pdfs found... return first folder that matches infoscience/record/xxx/files
    for dir_path in map(dirname, unique_pdfs):
        if re.search(r"infoscience\.epfl\.ch/record/\d+/files", dir_path):
            return dir_path

    # no infoscience folder found... log a warning and return first match
    logger.warning("Multiple pdfs found (%s), but none appear to be on regular infoscience path", unique_pdfs)
    return dirname(unique_pdfs[0])


def get_attributes(subfields):
Expand Down
1 change: 0 additions & 1 deletion infoscience_exports/exports/models/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,3 @@ class AdvancedOptionsSettings(BaseSettings):

class Meta:
abstract = True

56 changes: 56 additions & 0 deletions infoscience_exports/exports/test/test_fulltexts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from exports.marc21xml import set_fulltext


def test_no_fulltext():
    """An empty list of links yields an empty string."""
    result = set_fulltext([])
    assert result == ''


def test_no_pdfs():
    """A single link that is not a pdf yields an empty string."""
    ppt_link = "http://infoscience.epfl.ch/record/253539/files/Poster.ppt"
    article_link = "https://www.frontiersin.org/articles/10.3389/fnbot.2017.00057/full"

    assert '' == set_fulltext([ppt_link])
    assert '' == set_fulltext([article_link])


def test_only_one_pdf():
    """When exactly one distinct pdf is present, its URL is returned."""
    cases = [
        # one pdf
        (["http://infoscience.epfl.ch/record/253637/files/write%20nanoscale.pdf"],
         'http://infoscience.epfl.ch/record/253637/files/write%20nanoscale.pdf'),
        # one pdfa
        (["http://infoscience.epfl.ch/record/253144/files/2018_ICIT_Coulinge.pdf?subformat=pdfa"],
         "http://infoscience.epfl.ch/record/253144/files/2018_ICIT_Coulinge.pdf"),
        # one pdf & one asp
        (["https://ibeton.epfl.ch/util/script/sendArticle.asp?R=Cantone16",
          "http://infoscience.epfl.ch/record/82377/files/JEP96-3.pdf"],
         'http://infoscience.epfl.ch/record/82377/files/JEP96-3.pdf'),
        # one pdf in 2 formats
        (["http://infoscience.epfl.ch/record/253610/files/paper.pdf",
          "http://infoscience.epfl.ch/record/253610/files/paper.pdf?subformat=pdfa"],
         'http://infoscience.epfl.ch/record/253610/files/paper.pdf'),
    ]
    # checked in the same order as listed above
    for links, expected in cases:
        assert set_fulltext(links) == expected


def test_only_mutiple_pdfs():
    """With several distinct pdfs, the infoscience files folder is returned."""
    cases = [
        # two pdfs
        (["http://publications.idiap.ch/downloads/reports/1996/JEP96-3.pdf",
          "http://infoscience.epfl.ch/record/82377/files/JEP96-3.pdf"],
         'http://infoscience.epfl.ch/record/82377/files'),
        # two pdfs along with their pdfas
        (["http://infoscience.epfl.ch/record/253610/files/paper.pdf",
          "http://infoscience.epfl.ch/record/253610/files/paper.pdf?subformat=pdfa",
          "http://infoscience.epfl.ch/record/253610/files/supplemental.pdf",
          "http://infoscience.epfl.ch/record/253610/files/supplemental.pdf?subformat=pdfa"],
         'http://infoscience.epfl.ch/record/253610/files'),
        # mix
        (["https://ibeton.epfl.ch/util/script/sendArticle.asp?R=Cantone16",
          "http://infoscience.epfl.ch/record/253610/files/paper.pdf",
          "http://infoscience.epfl.ch/record/253610/files/supplemental.pdf",
          "http://infoscience.epfl.ch/record/253610/files/supplemental.pdf?subformat=pdfa"],
         'http://infoscience.epfl.ch/record/253610/files'),
    ]
    # checked in the same order as listed above
    for links, expected in cases:
        assert set_fulltext(links) == expected
19 changes: 9 additions & 10 deletions infoscience_exports/exports/test/test_selenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
from django.conf import settings
from django.urls import reverse
from django.contrib.staticfiles.testing import StaticLiveServerTestCase
from django.test import override_settings, tag, modify_settings
from django.utils.decorators import classproperty
from django.test import override_settings
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

Expand Down Expand Up @@ -53,16 +52,16 @@ def setUp(self):
"""Open a new browser for each test."""
super(SeleniumStaticLiveServerTestCase, self).setUp()

test_user = User.objects.get_or_create(username='test',
first_name='test',
last_name='test',
email='test@localhost',
is_staff=True,
is_active=True)[0]
test_user = User.objects.get_or_create(
username='test',
first_name='test',
last_name='test',
email='test@localhost',
is_staff=True,
is_active=True)[0]

# # generate a cookie place, or get the cookie setting error from Chrome
self.selenium.get('%s%s' % (self.live_server_url,
reverse('not_allowed')))
self.selenium.get('%s%s' % (self.live_server_url, reverse('not_allowed')))

# bypass external Tequila auth
self.client.force_login(test_user)
Expand Down
4 changes: 2 additions & 2 deletions infoscience_exports/exports/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

# the release comes from git and should not be modified
# => read-only
_release = '0.3.7-16-g5e0cae8'
_release = '0.3.7-21-ga9f6efa'

# you can set the next version number manually
# if you do not, the system will make sure that version > release
Expand All @@ -13,4 +13,4 @@
# the build number will generate conflicts on each PR merge
# just keep yours every time
# => read-only
_build = '5e0cae8b46c21d3448ab566f21295f59d76bbf8f'
_build = 'a9f6efa1428fe933cc1c8438b104598250ef6fc9'
1 change: 0 additions & 1 deletion infoscience_exports/exports/views.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from django.urls import reverse_lazy as django_reverse_lazy
from django.db import transaction
from django.http import HttpResponse
from django.template import loader
from django.views.generic import ListView, CreateView, DetailView, UpdateView, DeleteView
Expand Down