From 5f371659b679854380ac4f1e939ce9506780f761 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 30 Oct 2023 22:06:50 +0100 Subject: [PATCH 1/7] Add patterns to Record --- src/curies/api.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/curies/api.py b/src/curies/api.py index 8257ce2b..4afda397 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -241,6 +241,10 @@ class Record(BaseModel): # type:ignore ) prefix_synonyms: List[str] = Field(default_factory=list) uri_prefix_synonyms: List[str] = Field(default_factory=list) + pattern: Optional[str] = Field( + default=None, + description="The regular expression pattern for entries in this semantic space", + ) @validator("prefix_synonyms") # type:ignore def prefix_not_in_synonyms(cls, v: str, values: Mapping[str, Any]) -> str: # noqa:N805 From 31d76f68560ce2c8726d220c920d9d6160664258 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 30 Oct 2023 22:15:23 +0100 Subject: [PATCH 2/7] Update api.py --- src/curies/api.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index 4afda397..5463f467 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -894,7 +894,7 @@ def from_shacl( :param graph: A RDFLib graph, a Path, a string representing a file path, or a string URL :param format: The RDF format, if a file path is given - :param kwargs: Keyword arguments to pass to :meth:`from_prefix_map` + :param kwargs: Keyword arguments to pass to :meth:`Converter.__init__` :return: A converter """ if isinstance(graph, (str, Path)): @@ -905,16 +905,20 @@ def from_shacl( graph = temporary_graph query = """\ - SELECT ?curie_prefix ?uri_prefix + SELECT ?curie_prefix ?uri_prefix ?pattern WHERE { ?bnode1 sh:declare ?bnode2 . - ?bnode2 sh:prefix ?curie_prefix . - ?bnode2 sh:namespace ?uri_prefix . + ?bnode2 sh:prefix ?curie_prefix ; + sh:namespace ?uri_prefix . + OPTIONAL { ?bnode2 sh:pattern ?pattern . } } """ results = graph.query(query) - prefix_map = {str(k): str(v) for k, v in results} - return cls.from_prefix_map(prefix_map, **kwargs) + records = [ + Record(prefix=str(prefix), uri_prefix=str(uri_prefix), pattern=pattern and str(pattern)) + for prefix, uri_prefix, pattern in results + ] + return cls(records, **kwargs) def get_prefixes(self) -> Set[str]: """Get the set of prefixes covered by this converter.""" @@ -1863,8 +1867,16 @@ def write_shacl(converter: Converter, path: Union[str, Path]) -> None: """ ) path = _ensure_path(path) - entries = ",\n".join( - f' [ sh:prefix "{prefix}" ; sh:namespace "{uri_prefix}" ]' - for prefix, uri_prefix in sorted(converter.bimap.items()) - ) - path.write_text(text.format(entries=entries)) + lines = [] + for record in converter.records: + beginning = f' [ sh:prefix "{record.prefix}" ; sh:namespace "{record.uri_prefix}"' + if record.pattern: + line = f'{beginning} ; sh.pattern "{_escape_regex(record.pattern)}" ]' + else: + line = f"{beginning} ]" + lines.append(line) + path.write_text(text.format(entries=",\n".join(lines))) + + +def _escape_regex(pattern: str) -> str: + return pattern.replace("\\", "\\\\") From 4a90bb9e0af64edcdae83b80f22af49038bc5a94 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 31 Oct 2023 10:14:02 +0100 Subject: [PATCH 3/7] Add handling of pattern map --- src/curies/api.py | 14 +++++++++++++- tests/test_io.py | 3 +++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/curies/api.py b/src/curies/api.py index 5463f467..5542a31e 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -374,6 +374,10 @@ def _get_prefix_map(records: List[Record]) -> Dict[str, str]: return rv +def _get_pattern_map(records: List[Record]) -> Dict[str, str]: + return {record.prefix: record.pattern for record in records if record.pattern} + + def _get_reverse_prefix_map(records: List[Record]) -> Dict[str, str]: rv = {} for record in records: @@ -439,6 +443,8 @@ class Converter: reverse_prefix_map: Dict[str, str] #: A prefix trie for efficient parsing of URIs trie: StringTrie + #: A mapping from prefix to regular expression pattern. Not necessarily complete wrt the prefix map + pattern_map: Dict[str, str] def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool = True) -> None: """Instantiate a converter. @@ -466,6 +472,7 @@ def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool self.synonym_to_prefix = _get_prefix_synmap(records) self.reverse_prefix_map = _get_reverse_prefix_map(records) self.trie = StringTrie(self.reverse_prefix_map) + self.pattern_map = _get_pattern_map(records) @property def bimap(self) -> Mapping[str, str]: @@ -533,6 +540,8 @@ def _merge(record: Record, into: Record) -> None: into.uri_prefix_synonyms.append(uri_prefix_synonym) into.uri_prefix_synonyms.sort() + # TODO merging patterns? + def _index(self, record: Record) -> None: self.prefix_map[record.prefix] = record.uri_prefix self.synonym_to_prefix[record.prefix] = record.prefix @@ -546,6 +555,9 @@ def _index(self, record: Record) -> None: self.reverse_prefix_map[uri_prefix_synonym] = record.prefix self.trie[uri_prefix_synonym] = record.prefix + if record.pattern and record.prefix not in self.pattern_map: + self.pattern_map[record.prefix] = record.pattern + def add_prefix( self, prefix: str, @@ -1871,7 +1883,7 @@ def write_shacl(converter: Converter, path: Union[str, Path]) -> None: for record in converter.records: beginning = f' [ sh:prefix "{record.prefix}" ; sh:namespace "{record.uri_prefix}"' if record.pattern: - line = f'{beginning} ; sh.pattern "{_escape_regex(record.pattern)}" ]' + line = f'{beginning} ; sh:pattern "{_escape_regex(record.pattern)}" ]' else: line = f"{beginning} ]" lines.append(line) diff --git a/tests/test_io.py b/tests/test_io.py index d8969d9c..5d6da475 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -19,6 +19,7 @@ def setUp(self) -> None: self.uri_prefix = CHEBI_URI_PREFIX self.prefix_synonym = "p" self.uri_prefix_synonym = "u" + self.pattern = "^\\d{7}$" self.converter = Converter.from_extended_prefix_map( [ { @@ -26,6 +27,7 @@ def setUp(self) -> None: "prefix_synonyms": [self.prefix_synonym], "uri_prefix": self.uri_prefix, "uri_prefix_synonyms": [self.uri_prefix_synonym], + "pattern": self.pattern, }, ] ) @@ -59,3 +61,4 @@ def test_shacl(self): curies.write_shacl(self.converter, path) nc = curies.load_shacl(path) self.assertEqual(self.converter.bimap, nc.bimap) + self.assertEqual({self.prefix: self.pattern}, nc.pattern_map) From 1a1687f3a41eedf097430f0c7b09eb149af68c0f Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 31 Oct 2023 10:17:48 +0100 Subject: [PATCH 4/7] Output pattern in EPMs --- src/curies/api.py | 2 ++ tests/test_io.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/curies/api.py b/src/curies/api.py index 5542a31e..9006d6a3 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -1843,6 +1843,8 @@ def _record_to_dict(record: Record) -> Mapping[str, Union[str, List[str]]]: rv["prefix_synonyms"] = sorted(record.prefix_synonyms) if record.uri_prefix_synonyms: rv["uri_prefix_synonyms"] = sorted(record.uri_prefix_synonyms) + if record.pattern: + rv["pattern"] = record.pattern return rv diff --git a/tests/test_io.py b/tests/test_io.py index 5d6da475..d98adb31 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -39,6 +39,7 @@ def test_write_epm(self): curies.write_extended_prefix_map(self.converter, path) nc = curies.load_extended_prefix_map(path) self.assertEqual(self.converter.records, nc.records) + self.assertEqual({self.prefix: self.pattern}, nc.pattern_map) def test_write_jsonld_with_bimap(self): """Test writing and reading a prefix map via JSON-LD.""" From dee2f511899bab6708ed0933cbdec68d657028e0 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 31 Oct 2023 15:46:30 +0100 Subject: [PATCH 5/7] Update api.py --- src/curies/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index d039ecd5..725764db 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -920,8 +920,8 @@ def from_shacl( SELECT ?curie_prefix ?uri_prefix ?pattern WHERE { ?bnode1 sh:declare ?bnode2 . - ?bnode2 sh:prefix ?curie_prefix ; - sh:namespace ?uri_prefix . + ?bnode2 sh:prefix ?curie_prefix . + ?bnode2 sh:namespace ?uri_prefix . OPTIONAL { ?bnode2 sh:pattern ?pattern . } } """ From 5358205cc9b1ec06af3827e8e7e52a07d510d6ee Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 1 Nov 2023 19:34:58 +0100 Subject: [PATCH 6/7] Add tests for chaining --- src/curies/api.py | 2 -- tests/test_api.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index 725764db..b3bf0301 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -540,8 +540,6 @@ def _merge(record: Record, into: Record) -> None: into.uri_prefix_synonyms.append(uri_prefix_synonym) into.uri_prefix_synonyms.sort() - # TODO merging patterns? - def _index(self, record: Record) -> None: self.prefix_map[record.prefix] = record.uri_prefix self.synonym_to_prefix[record.prefix] = record.prefix diff --git a/tests/test_api.py b/tests/test_api.py index 2092159e..96e23311 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -570,6 +570,43 @@ def test_combine_ci(self): converter.expand("CHEBI:138488"), ) + def test_combine_with_patterns(self): + """Test chaining with patterns.""" + c1 = Converter([Record(prefix="a", uri_prefix="https://example.org/a/", pattern="^\\d{7}")]) + c2 = Converter([Record(prefix="a", uri_prefix="https://example.org/a/", pattern="^\\d+")]) + converter = chain([c1, c2]) + self.assertEqual( + [Record(prefix="a", uri_prefix="https://example.org/a/", pattern="^\\d{7}")], + converter.records, + ) + + def test_combine_with_patterns_via_synonym(self): + """Test chaining with patterns.""" + c1 = Converter([Record(prefix="a", uri_prefix="https://example.org/a/", pattern="^\\d{7}")]) + c2 = Converter( + [ + Record( + prefix="b", + prefix_synonyms=["a"], + uri_prefix="https://example.org/b/", + pattern="^\\d+", + ) + ] + ) + converter = chain([c1, c2]) + self.assertEqual( + [ + Record( + prefix="a", + prefix_synonyms=["b"], + uri_prefix="https://example.org/a/", + uri_prefix_synonyms=["https://example.org/b/"], + pattern="^\\d{7}", + ) + ], + converter.records, + ) + def test_df_bulk(self): """Test bulk processing in pandas dataframes.""" rows = [ From 0838bee22d14b338d818f38a4d991f2f2719ad20 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 1 Nov 2023 19:42:45 +0100 Subject: [PATCH 7/7] Update api.py --- src/curies/api.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/curies/api.py b/src/curies/api.py index b3bf0301..0e41b6f4 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -243,7 +243,8 @@ class Record(BaseModel): # type:ignore uri_prefix_synonyms: List[str] = Field(default_factory=list) pattern: Optional[str] = Field( default=None, - description="The regular expression pattern for entries in this semantic space", + description="The regular expression pattern for entries in this semantic space. " + "Warning: this is an experimental feature.", ) @validator("prefix_synonyms") # type:ignore @@ -443,7 +444,9 @@ class Converter: reverse_prefix_map: Dict[str, str] #: A prefix trie for efficient parsing of URIs trie: StringTrie - #: A mapping from prefix to regular expression pattern. Not necessarily complete wrt the prefix map + #: A mapping from prefix to regular expression pattern. Not necessarily complete wrt the prefix map. + #: + #: .. warning:: patterns are an experimental feature pattern_map: Dict[str, str] def __init__(self, records: List[Record], *, delimiter: str = ":", strict: bool = True) -> None: