epfl-si · ebreton · Mar 18, 2018 · Mar 17, 2018 · Mar 17, 2018 · Mar 17, 2018
diff --git a/infoscience_exports/exports/forms.py b/infoscience_exports/exports/forms.py
@@ -8,7 +8,7 @@ class ExportForm(FormLoggingMixin, forms.ModelForm):
 
     class Meta:
         model = Export
-      
+
         exclude = ['user', 'formats_type']
         widgets = {
             'name': forms.TextInput(attrs={'placeholder': ""}),

diff --git a/infoscience_exports/exports/marc21xml.py b/infoscience_exports/exports/marc21xml.py
@@ -3,6 +3,7 @@
 """
 Parse a marc-21-xml file
 """
+import re
 from logging import getLogger
 from django.utils.translation import gettext as _
 from django.conf import settings
@@ -66,42 +67,29 @@ def set_year(date):
         return ''
 
 
-# get fulltext: link to pdf or link to repository if several links
 def set_fulltext(fulltexts):
-    if len(fulltexts) == 0:
-        return ""
-    if len(fulltexts) == 1:
-        return fulltexts[0]
-    result = ""
-    pdf_counter = 0
-    for ft in fulltexts:
-        o = urlparse(ft)
-        file_extension = splitext(o.path)[1]
-        if file_extension == "pdf":
-            result = ft
-            pdf_counter += 1
-    if pdf_counter < 2:
-        return result
-    o_first = urlparse(fulltexts[0])
-    path_first = dirname(o_first.path)
-    is_same_path = True
-    for ft in fulltexts:
-        o = urlparse(ft)
-        path = dirname(o.path)
-        if o.scheme != o_first.scheme or \
-           o.netloc != o_first.netloc or \
-           path != path_first:
-            is_same_path = False
-            break
-    result = ""
-    if is_same_path:
-        if o_first.scheme:
-            result += o_first.scheme + "://"
-        if o_first.netloc:
-            result += o_first.netloc
-        result += path_first
-        return result
-    return result
+    """Return the fulltext link: a single pdf URL, or a folder URL when several pdfs exist.
+
+    fulltexts: list of URLs; query strings and fragments are dropped, non-pdf links are skipped.
+    Returns '' when no pdf is found.
+    """
+    # keep pdfs only (any extension case); dedupe in input order, a bare set() has no stable order
+    file_paths = ['{}://{}{}'.format(*urlparse(fulltext)[:3]) for fulltext in fulltexts]
+    pdfs = [pdf for pdf in file_paths if splitext(pdf)[1].lower() == '.pdf']
+    unic_pdfs = sorted(set(pdfs), key=pdfs.index)
+
+    if not unic_pdfs:
+        return ''
+    if len(unic_pdfs) == 1:
+        return unic_pdfs[0]
+
+    # multiple pdfs found... return first folder that matches infoscience/record/xxx/files
+    for dir_path in map(dirname, unic_pdfs):
+        if re.search(r"infoscience\.epfl\.ch/record/\d+/files", dir_path):
+            return dir_path
+    # no infoscience folder found... log a warning and return the folder of the first pdf
+    logger.warning("Multiple pdfs found (%s), but none appear to be on regular infoscience path", unic_pdfs)
+    return dirname(unic_pdfs[0])
 
 
 def get_attributes(subfields):

diff --git a/infoscience_exports/exports/models/settings.py b/infoscience_exports/exports/models/settings.py
@@ -132,4 +132,3 @@ class AdvancedOptionsSettings(BaseSettings):
 
     class Meta:
         abstract = True
-
diff --git a/infoscience_exports/exports/test/test_fulltexts.py b/infoscience_exports/exports/test/test_fulltexts.py
@@ -0,0 +1,56 @@
+from exports.marc21xml import set_fulltext
+
+
+def test_no_fulltext():  # an empty list of links yields an empty string
+    assert set_fulltext([]) == ''
+
+
+def test_no_pdfs():  # links that do not end in .pdf (a .ppt file, an article page) yield ''
+    assert set_fulltext(["http://infoscience.epfl.ch/record/253539/files/Poster.ppt"]) == ''
+    assert set_fulltext(["https://www.frontiersin.org/articles/10.3389/fnbot.2017.00057/full"]) == ''
+
+
+def test_only_one_pdf():  # exactly one distinct pdf: its URL is returned, without query string
+    # one pdf: the link is returned unchanged
+    assert set_fulltext(["http://infoscience.epfl.ch/record/253637/files/write%20nanoscale.pdf"]) \
+        == 'http://infoscience.epfl.ch/record/253637/files/write%20nanoscale.pdf'
+
+    # one pdfa: the '?subformat=pdfa' query string is stripped from the result
+    assert set_fulltext(["http://infoscience.epfl.ch/record/253144/files/2018_ICIT_Coulinge.pdf?subformat=pdfa"]) \
+        == "http://infoscience.epfl.ch/record/253144/files/2018_ICIT_Coulinge.pdf"
+
+    # one pdf & one asp: the non-pdf link is ignored
+    assert set_fulltext([
+        "https://ibeton.epfl.ch/util/script/sendArticle.asp?R=Cantone16",
+        "http://infoscience.epfl.ch/record/82377/files/JEP96-3.pdf"]) \
+        == 'http://infoscience.epfl.ch/record/82377/files/JEP96-3.pdf'
+
+    # one pdf in 2 formats (pdf and pdfa): both links count as the same file
+    assert set_fulltext([
+        "http://infoscience.epfl.ch/record/253610/files/paper.pdf",
+        "http://infoscience.epfl.ch/record/253610/files/paper.pdf?subformat=pdfa"]) \
+        == 'http://infoscience.epfl.ch/record/253610/files/paper.pdf'
+
+
+def test_only_mutiple_pdfs():  # several distinct pdfs: the infoscience 'files' folder is returned
+    # two pdfs on different hosts: the folder of the infoscience one is returned
+    assert set_fulltext([
+        "http://publications.idiap.ch/downloads/reports/1996/JEP96-3.pdf",
+        "http://infoscience.epfl.ch/record/82377/files/JEP96-3.pdf"]) \
+        == 'http://infoscience.epfl.ch/record/82377/files'
+
+    # two pdfs along with their pdfas: still two distinct files, so their common folder is returned
+    assert set_fulltext([
+        "http://infoscience.epfl.ch/record/253610/files/paper.pdf",
+        "http://infoscience.epfl.ch/record/253610/files/paper.pdf?subformat=pdfa",
+        "http://infoscience.epfl.ch/record/253610/files/supplemental.pdf",
+        "http://infoscience.epfl.ch/record/253610/files/supplemental.pdf?subformat=pdfa"]) \
+        == 'http://infoscience.epfl.ch/record/253610/files'
+
+    # mix of a non-pdf link, pdfs and a pdfa: the non-pdf link is ignored, the folder is returned
+    assert set_fulltext([
+        "https://ibeton.epfl.ch/util/script/sendArticle.asp?R=Cantone16",
+        "http://infoscience.epfl.ch/record/253610/files/paper.pdf",
+        "http://infoscience.epfl.ch/record/253610/files/supplemental.pdf",
+        "http://infoscience.epfl.ch/record/253610/files/supplemental.pdf?subformat=pdfa"]) \
+        == 'http://infoscience.epfl.ch/record/253610/files'
diff --git a/infoscience_exports/exports/test/test_selenium.py b/infoscience_exports/exports/test/test_selenium.py
@@ -3,8 +3,7 @@
 from django.conf import settings
 from django.urls import reverse
 from django.contrib.staticfiles.testing import StaticLiveServerTestCase
-from django.test import override_settings, tag, modify_settings
-from django.utils.decorators import classproperty
+from django.test import override_settings
 from selenium import webdriver
 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
 
@@ -53,16 +52,16 @@ def setUp(self):
         """Open a new browser for each test."""
         super(SeleniumStaticLiveServerTestCase, self).setUp()
 
-        test_user = User.objects.get_or_create(username='test',
-                                        first_name='test',
-                                        last_name='test',
-                                        email='test@localhost',
-                                        is_staff=True,
-                                        is_active=True)[0]
+        test_user = User.objects.get_or_create(
+            username='test',
+            first_name='test',
+            last_name='test',
+            email='test@localhost',
+            is_staff=True,
+            is_active=True)[0]
 
         # # generate a cookie place, or get the cookie setting error from Chrome
-        self.selenium.get('%s%s' % (self.live_server_url,
-                                     reverse('not_allowed')))
+        self.selenium.get('%s%s' % (self.live_server_url, reverse('not_allowed')))
 
         # bypass external Tequila auth
         self.client.force_login(test_user)

diff --git a/infoscience_exports/exports/versions.py b/infoscience_exports/exports/versions.py
@@ -3,7 +3,7 @@
 
 # the release comes from git and should not be modified
 # => read-only
-_release = '0.3.7-16-g5e0cae8'
+_release = '0.3.7-21-ga9f6efa'
 
 # you can set the next version number manually
 # if you do not, the system will make sure that version > release
@@ -13,4 +13,4 @@
 # the build number will generate conflicts on each PR merge
 # just keep yours every time
 # => read-only
-_build = '5e0cae8b46c21d3448ab566f21295f59d76bbf8f'
+_build = 'a9f6efa1428fe933cc1c8438b104598250ef6fc9'
diff --git a/infoscience_exports/exports/views.py b/infoscience_exports/exports/views.py
@@ -1,5 +1,4 @@
 from django.urls import reverse_lazy as django_reverse_lazy
-from django.db import transaction
 from django.http import HttpResponse
 from django.template import loader
 from django.views.generic import ListView, CreateView, DetailView, UpdateView, DeleteView
Original file line number	Diff line number	Diff line change
Expand Up		@@ -132,4 +132,3 @@ class AdvancedOptionsSettings(BaseSettings):

		class Meta:
		abstract = True