Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Work around registry URLs with multiple slashes by outputting paths with single slashes #163

Merged
merged 1 commit into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -148,14 +148,35 @@
"filename": "src/pds2/aipgen/tests/test_utils.py",
"hashed_secret": "10a34637ad661d98ba3344717656fcc76209c2f8",
"is_verified": false,
"line_number": 48
"line_number": 49
},
{
"type": "Hex High Entropy String",
"filename": "src/pds2/aipgen/tests/test_utils.py",
"hashed_secret": "67a74306b06d0c01624fe0d0249a570f4d093747",
"is_verified": false,
"line_number": 49
"line_number": 50
},
{
"type": "Basic Auth Credentials",
"filename": "src/pds2/aipgen/tests/test_utils.py",
"hashed_secret": "25ab86bed149ca6ca9c1c0d5db7c9a91388ddeab",
"is_verified": false,
"line_number": 169
},
{
"type": "Email Address",
"filename": "src/pds2/aipgen/tests/test_utils.py",
"hashed_secret": "66ed46e8b325ac0c7982bd070c132bff14093bc3",
"is_verified": false,
"line_number": 169
},
{
"type": "Email Address",
"filename": "src/pds2/aipgen/tests/test_utils.py",
"hashed_secret": "fe5c714e9a30a923a58dac84e0af313c7fb7c553",
"is_verified": false,
"line_number": 179
}
],
"test/data/insight_documents/urn-nasa-pds-insight_documents/document_hp3rad/release_notes.txt": [
Expand Down Expand Up @@ -204,5 +225,5 @@
}
]
},
"generated_at": "2023-11-16T17:14:34Z"
"generated_at": "2024-04-19T15:04:19Z"
}
15 changes: 13 additions & 2 deletions src/pds2/aipgen/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
from .sip import writelabel as writesiplabel
from .utils import addbundlearguments
from .utils import addloggingarguments
from .utils import fixmultislashes


# Constants
Expand Down Expand Up @@ -103,6 +104,16 @@ class _File:
url: str
md5: str

@classmethod
def make(cls, url, md5):
    """Build a ``_File`` whose ``url`` has had its multi-slashes cleaned up.

    Going through this alternate constructor lets the class keep the ctor that
    ``dataclass`` generates, rather than doing weird things with ``__setattr__``
    to normalize the URL. See https://dsh.re/f9fd7b for more information.
    """
    cleaned = fixmultislashes(url)
    return cls(cleaned, md5)


def _deurnlidvid(lidvid: str) -> tuple[str, str]:
"""De-URN a LID VID.
Expand Down Expand Up @@ -179,9 +190,9 @@ def _addfiles(product: dict, bac: dict):
if _propdataurl in props: # Are there data files in the product?
urls, md5s = props[_propdataurl], props[_propdatamd5] # Get the URLs and MD5s of them
for url, md5 in zip(urls, md5s): # For each URL and matching MD5
files.add(_File(url, md5)) # Add it to the set
files.add(_File.make(url, md5)) # Add it to the set
if _proplabelurl in props: # How about the label itself?
files.add(_File(props[_proplabelurl][0], props[_proplabelmd5][0])) # Add it too
files.add(_File.make(props[_proplabelurl][0], props[_proplabelmd5][0])) # Add it too
bac[lidvid] = files # Stash for future use


Expand Down
40 changes: 40 additions & 0 deletions src/pds2/aipgen/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import zope.component # type: ignore
from pds2.aipgen.interfaces import IURLValidator
from pds2.aipgen.utils import addloggingarguments
from pds2.aipgen.utils import fixmultislashes
from pds2.aipgen.utils import getdigest
from pds2.aipgen.utils import getlogicalversionidentifier
from pds2.aipgen.utils import getmd5
Expand Down Expand Up @@ -142,6 +143,45 @@ def test_invalid_url(self):
validator.validate("?")


# https://github.com/NASA-PDS/deep-archive/issues/162
class URLCorrectingTest(unittest.TestCase):
    """Check if we can correct ``//`` in URLs as reported in issue №162."""

    def test_normalurls(self):
        """Ensure we leave "normal" URLs alone, à la Britney."""
        for url in (
            "ftp://ftp.cdrom.com/pub/idgames/doom.exe",
            "gopher://gopher.hprc.utoronto.ca/cuisine/poutine.recipe",
            "wais://cnidr.org:210/1994/directory-of-servers",
            "file:///usr/local/rootkits/3klagia.dll",
            "https://fanfiction.net/startrek/",
        ):
            # One sub-test per URL so a failure names the offending input and the
            # remaining URLs still get checked.
            with self.subTest(url=url):
                self.assertEqual(url, fixmultislashes(url))

    def test_multislashesinpaths(self):
        """Ensure we properly remove multiple slashes from paths."""
        # NOTE(review): in the captured copy of this file the userinfo/host of the
        # ``rtsp`` case had been replaced by an e-mail obfuscation placeholder, which
        # is not a parseable network location. The credentials and host below are
        # stand-ins with the same ``user:password@host:port`` shape; confirm the
        # exact original values against the repository.
        cases = (
            (
                "https://fanfiction.net/startrek//sentient//computers//index.html",
                "https://fanfiction.net/startrek/sentient/computers/index.html",
            ),
            (
                "nntp://news.fanfiction.net//alt.fanfiction.startrek//91172//",
                "nntp://news.fanfiction.net/alt.fanfiction.startrek/91172/",
            ),
            (
                "rtsp://kirkfan:tribbles@fanfiction.net:554/////streaming///Channels//101/",
                "rtsp://kirkfan:tribbles@fanfiction.net:554/streaming/Channels/101/",
            ),
        )
        for url, expected in cases:
            with self.subTest(url=url):
                self.assertEqual(expected, fixmultislashes(url))

    def test_multislasheselsewhere(self):
        """Ensure we leave multiple slashes alone in other contexts outside of the path."""
        # NOTE(review): the ``mailto`` address is a stand-in for a mailbox that was
        # lost to e-mail obfuscation in the captured copy; confirm against the
        # repository.
        for url in (
            "shttp://fanfiction.net/blog?article_id=kirk%2F%2Fspock",
            "mailto:editor@fanfiction.net?subject=Sentient%20computer%2F%2Fsentient%20planet%20stories",
            "prospero://ucla.edu:9155/index.dat#//readme",
        ):
            with self.subTest(url=url):
                self.assertEqual(url, fixmultislashes(url))


def test_suite():
"""Return the test suite, duh flake8.

Expand Down
15 changes: 15 additions & 0 deletions src/pds2/aipgen/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
import re
import sqlite3
import urllib
from urllib.parse import urlparse
from urllib.parse import urlunparse

from lxml import etree
from zope.interface import implementer
Expand Down Expand Up @@ -68,6 +70,19 @@
# ---------


def fixmultislashes(url):
    """Fix occurrences of multiple slashes in the path of the given ``url``.

    This addresses issue №162, where submission information packages would have
    double-slashes in their paths, which leads to validation errors. Note that the
    upstream problem is that the registry is loaded with examples of these bad
    paths. This is a workaround.

    Only the path component is rewritten: every run of two or more slashes becomes
    a single slash. The ``//`` after the scheme, the network location, and any
    parameters, query, or fragment are never touched.

    A ``url`` whose path has no repeated slashes is returned exactly as given. It
    is deliberately not rebuilt with ``urlunparse``, because parsing and unparsing
    can yield an equivalent but textually different URL (for example the scheme is
    lowercased and an empty trailing ``?`` is dropped).
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)
    fixed = re.sub(r'/{2,}', '/', path)
    if fixed == path:
        # Nothing to repair, so hand back the caller's string byte-for-byte.
        return url
    return urlunparse((scheme, netloc, fixed, params, query, fragment))


def createschema(con):
"""Make the database schema for handing AIPs and SIPs in the given ``con``nection."""
cursor = con.cursor()
Expand Down
Loading