Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add regexes to records #83

Merged
merged 8 commits into from
Nov 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 35 additions & 10 deletions src/curies/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,11 @@ class Record(BaseModel): # type:ignore
)
prefix_synonyms: List[str] = Field(default_factory=list)
uri_prefix_synonyms: List[str] = Field(default_factory=list)
pattern: Optional[str] = Field(
default=None,
description="The regular expression pattern for entries in this semantic space. "
"Warning: this is an experimental feature.",
)

@validator("prefix_synonyms") # type:ignore
def prefix_not_in_synonyms(cls, v: str, values: Mapping[str, Any]) -> str: # noqa:N805
Expand Down Expand Up @@ -370,6 +375,10 @@ def _get_prefix_map(records: List[Record]) -> Dict[str, str]:
return rv


def _get_pattern_map(records: List[Record]) -> Dict[str, str]:
    """Index the regular expression patterns of the records by prefix.

    :param records: The records to index
    :return: A dictionary from each record's prefix to its pattern. Records whose
        pattern is ``None`` or empty are left out.
    """
    pattern_map: Dict[str, str] = {}
    for record in records:
        if not record.pattern:
            continue
        pattern_map[record.prefix] = record.pattern
    return pattern_map


def _get_reverse_prefix_map(records: List[Record]) -> Dict[str, str]:
rv = {}
for record in records:
Expand Down Expand Up @@ -435,6 +444,10 @@ class Converter:
reverse_prefix_map: Dict[str, str]
#: A prefix trie for efficient parsing of URIs
trie: StringTrie
#: A mapping from prefix to regular expression pattern. Not necessarily complete wrt the prefix map.
#:
#: .. warning:: patterns are an experimental feature
pattern_map: Dict[str, str]

def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool = True) -> None:
"""Instantiate a converter.
Expand Down Expand Up @@ -462,6 +475,7 @@ def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool
self.synonym_to_prefix = _get_prefix_synmap(records)
self.reverse_prefix_map = _get_reverse_prefix_map(records)
self.trie = StringTrie(self.reverse_prefix_map)
self.pattern_map = _get_pattern_map(records)

@property
def bimap(self) -> Mapping[str, str]:
Expand Down Expand Up @@ -542,6 +556,9 @@ def _index(self, record: Record) -> None:
self.reverse_prefix_map[uri_prefix_synonym] = record.prefix
self.trie[uri_prefix_synonym] = record.prefix

if record.pattern and record.prefix not in self.pattern_map:
self.pattern_map[record.prefix] = record.pattern

def add_prefix(
self,
prefix: str,
Expand Down Expand Up @@ -890,7 +907,7 @@ def from_shacl(

:param graph: A RDFLib graph, a Path, a string representing a file path, or a string URL
:param format: The RDF format, if a file path is given
:param kwargs: Keyword arguments to pass to :meth:`from_prefix_map`
:param kwargs: Keyword arguments to pass to :meth:`Converter.__init__`
:return: A converter
"""
if isinstance(graph, (str, Path)):
Expand All @@ -901,16 +918,20 @@ def from_shacl(
graph = temporary_graph

query = """\
SELECT ?curie_prefix ?uri_prefix
SELECT ?curie_prefix ?uri_prefix ?pattern
WHERE {
?bnode1 sh:declare ?bnode2 .
?bnode2 sh:prefix ?curie_prefix .
?bnode2 sh:namespace ?uri_prefix .
OPTIONAL { ?bnode2 sh:pattern ?pattern . }
}
"""
results = graph.query(query)
prefix_map = {str(k): str(v) for k, v in results}
return cls.from_prefix_map(prefix_map, **kwargs)
records = [
Record(prefix=str(prefix), uri_prefix=str(uri_prefix), pattern=pattern and str(pattern))
for prefix, uri_prefix, pattern in results
]
return cls(records, **kwargs)

def get_prefixes(self) -> Set[str]:
"""Get the set of prefixes covered by this converter."""
Expand Down Expand Up @@ -1823,6 +1844,8 @@ def _record_to_dict(record: Record) -> Mapping[str, Union[str, List[str]]]:
rv["prefix_synonyms"] = sorted(record.prefix_synonyms)
if record.uri_prefix_synonyms:
rv["uri_prefix_synonyms"] = sorted(record.uri_prefix_synonyms)
if record.pattern:
rv["pattern"] = record.pattern
return rv


Expand Down Expand Up @@ -1901,16 +1924,18 @@ def write_shacl(
path = _ensure_path(path)
lines = []
for record in converter.records:
lines.append(_get_shacl_line(record.prefix, record.uri_prefix))
lines.append(_get_shacl_line(record.prefix, record.uri_prefix, pattern=record.pattern))
if include_synonyms:
for prefix_synonym in record.prefix_synonyms:
lines.append(_get_shacl_line(prefix_synonym, record.uri_prefix))
lines.append(
_get_shacl_line(prefix_synonym, record.uri_prefix, pattern=record.pattern)
)
path.write_text(text.format(entries=",\n".join(lines)))


def _get_shacl_line(prefix: str, uri_prefix: str) -> str:
def _get_shacl_line(prefix: str, uri_prefix: str, pattern: Optional[str] = None) -> str:
line = f' [ sh:prefix "{prefix}" ; sh:namespace "{uri_prefix}"^^xsd:anyURI '
# if pattern:
# pattern = pattern.replace("\\", "\\\\")
# line += f'; sh:pattern "{pattern}"'
if pattern:
pattern = pattern.replace("\\", "\\\\")
line += f'; sh:pattern "{pattern}"'
return line + " ]"
37 changes: 37 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,43 @@ def test_combine_ci(self):
converter.expand("CHEBI:138488"),
)

def test_combine_with_patterns(self):
    """Test that chaining keeps the first converter's pattern for a shared prefix."""
    first = Converter(
        [Record(prefix="a", uri_prefix="https://example.org/a/", pattern="^\\d{7}")]
    )
    second = Converter(
        [Record(prefix="a", uri_prefix="https://example.org/a/", pattern="^\\d+")]
    )
    combined = chain([first, second])
    # Both converters declare prefix "a"; the expected record carries the
    # pattern from the converter that comes first in the chain.
    expected = [Record(prefix="a", uri_prefix="https://example.org/a/", pattern="^\\d{7}")]
    self.assertEqual(expected, combined.records)

def test_combine_with_patterns_via_synonym(self):
    """Test chaining with patterns when the records overlap through a prefix synonym."""
    first = Converter(
        [Record(prefix="a", uri_prefix="https://example.org/a/", pattern="^\\d{7}")]
    )
    # The second record's synonym "a" is the primary prefix of the first record.
    overlapping = Record(
        prefix="b",
        prefix_synonyms=["a"],
        uri_prefix="https://example.org/b/",
        pattern="^\\d+",
    )
    second = Converter([overlapping])
    combined = chain([first, second])
    # The expected merged record keeps prefix "a" and its pattern, while "b" and
    # its URI prefix are retained as synonyms.
    merged = Record(
        prefix="a",
        prefix_synonyms=["b"],
        uri_prefix="https://example.org/a/",
        uri_prefix_synonyms=["https://example.org/b/"],
        pattern="^\\d{7}",
    )
    self.assertEqual([merged], combined.records)

def test_df_bulk(self):
"""Test bulk processing in pandas dataframes."""
rows = [
Expand Down
4 changes: 4 additions & 0 deletions tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@ def setUp(self) -> None:
self.uri_prefix = CHEBI_URI_PREFIX
self.prefix_synonym = "p"
self.uri_prefix_synonym = "u"
self.pattern = "^\\d{7}$"
self.converter = Converter.from_extended_prefix_map(
[
{
"prefix": self.prefix,
"prefix_synonyms": [self.prefix_synonym],
"uri_prefix": self.uri_prefix,
"uri_prefix_synonyms": [self.uri_prefix_synonym],
"pattern": self.pattern,
},
]
)
Expand All @@ -40,6 +42,7 @@ def test_write_epm(self):
curies.write_extended_prefix_map(self.converter, path)
nc = curies.load_extended_prefix_map(path)
self.assertEqual(self.converter.records, nc.records)
self.assertEqual({self.prefix: self.pattern}, nc.pattern_map)

def test_write_jsonld_with_bimap(self):
"""Test writing and reading a prefix map via JSON-LD."""
Expand Down Expand Up @@ -73,6 +76,7 @@ def test_shacl(self):
curies.write_shacl(self.converter, path)
nc = curies.load_shacl(path)
self.assertEqual(self.converter.bimap, nc.bimap)
self.assertEqual({self.prefix: self.pattern}, nc.pattern_map)

def test_shacl_with_synonyms(self):
"""Test writing SHACL with synonyms."""
Expand Down
Loading