Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SSSOM rewire to rewire method #403

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions src/sssom/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
split_file,
validate_file,
)
from .parsers import PARSING_FUNCTIONS, parse_sssom_table
from .parsers import PARSING_FUNCTIONS, from_sssom_dataframe, parse_sssom_table
from .rdf_util import rewire_graph
from .sparql_util import EndpointConfig, query_mappings
from .util import (
Expand All @@ -58,6 +58,7 @@
remove_unmatched,
sort_df_rows_columns,
to_mapping_set_dataframe,
rewire_sssom_table,
)
from .writers import WRITER_FUNCTIONS, write_table

Expand Down Expand Up @@ -545,11 +546,26 @@ def rewire(
# noqa: DAR101
"""
msdf = parse_sssom_table(mapping_file)
g = Graph()
g.parse(input, format=input_format)
rewire_graph(g, msdf, precedence=precedence)
rdfstr = g.serialize(format=output_format)
print(rdfstr, file=output)

if input_format == "sssom-tsv" or input.endswith("sssom.tsv"):
msdf_mapping = parse_sssom_table(input)
df_rewired = rewire_sssom_table() # This is the method you need to implement

# updating the metadata of the rewired df so you can recognise it was rewired?
metadata = msdf.metadata
metadata["mapping_set_id"] = msdf["mapping_set_id"] + "rewired.sssom.tsv"

# This maybe has to be revisited as the rewiring can change the SSSOM mapping
prefix_map = msdf.prefix_map

msdf_rewired = from_sssom_dataframe(df_rewired, prefix_map=prefix_map, meta=metadata)
write_table(msdf_rewired, output)
else:
g = Graph()
g.parse(input, format=input_format)
rewire_graph(g, msdf, precedence=precedence)
outstring = g.serialize(format=output_format)
print(outstring, file=output)


@main.command()
Expand Down
43 changes: 28 additions & 15 deletions src/sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,23 @@ def clean_context(self) -> None:
"""Clean up the context."""
self.converter = curies.chain([_get_built_in_prefix_map(), self.converter])

def _standardize_curie_or_iri(self, curie_or_iri: str) -> str:
    """Return the standardized form of a CURIE or IRI.

    The value is passed to this object's converter: IRIs go through
    ``standardize_uri`` and CURIEs through ``standardize_curie``. If the
    value is neither, or the converter yields a falsy result, the input
    is returned unchanged.

    :param curie_or_iri: The CURIE or IRI string to standardize.
    :returns: The standardized string, or the original input as a fallback.
    """
    # The IRI check comes first, so a value matching both is treated as an IRI.
    if is_iri(curie_or_iri):
        standardized = self.converter.standardize_uri(curie_or_iri)
    elif is_curie(curie_or_iri):
        standardized = self.converter.standardize_curie(curie_or_iri)
    else:
        return curie_or_iri
    # A falsy converter result means "could not standardize"; keep the input.
    return standardized or curie_or_iri

def standardize(self) -> None:
    """Standardize the entity-reference columns of this MSDF in place.

    Every slot of the SSSOM schema whose range is ``EntityReference`` and
    that is present as a column of ``self.df`` has each of its values
    mapped through :meth:`_standardize_curie_or_iri`.
    """
    slots = _get_sssom_schema_object().dict["slots"]
    for column, values in slots.items():
        # Only entity-reference slots that actually occur in the data frame
        # are rewritten; everything else is left untouched.
        if values["range"] == "EntityReference" and column in self.df.columns:
            self.df[column] = self.df[column].map(self._standardize_curie_or_iri)

def merge(self, *msdfs: "MappingSetDataFrame", inplace: bool = True) -> "MappingSetDataFrame":
"""Merge two MappingSetDataframes.

Expand Down Expand Up @@ -372,6 +389,16 @@ def get_row_based_on_hierarchy(df: pd.DataFrame):
return hierarchical_df


def rewire_sssom_table(df_rewire: pd.DataFrame, df_mapping: pd.DataFrame) -> pd.DataFrame:
    """Rewire an SSSOM table using a second mapping table (not yet implemented).

    This is a placeholder. It raises instead of returning a dummy value so
    that callers fail immediately with a clear message rather than passing a
    non-data-frame object on to later processing.

    :param df_rewire: Data frame taking part in the rewiring.
        NOTE(review): presumably the table to be rewired - confirm.
    :param df_mapping: Data frame taking part in the rewiring.
        NOTE(review): presumably the mappings that drive the rewiring - confirm.
    :returns: The rewired data frame, once the logic is implemented.
    :raises NotImplementedError: Always, until the rewiring logic is written.
    """
    # TODO planned implementation:
    # 1. Standardise subject and object id columns using
    #    https://curies.readthedocs.io/en/latest/api/curies.Converter.html#curies.Converter.pd_standardize_curie
    # 2. Perform the rewiring
    # 3. Store some metadata in the "other" field?
    # 4. Return back out
    raise NotImplementedError("Rewiring of SSSOM tables is not implemented yet.")


def assign_default_confidence(
df: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
Expand Down Expand Up @@ -1116,22 +1143,8 @@ def reconcile_prefix_and_data(
converter = msdf.converter
converter = curies.remap_curie_prefixes(converter, prefix_reconciliation["prefix_synonyms"])
converter = curies.rewire(converter, prefix_reconciliation["prefix_expansion_reconciliation"])

# TODO make this standardization code directly part of msdf after
# switching to native converter
def _upgrade(curie_or_iri: str) -> str:
if not is_iri(curie_or_iri) and is_curie(curie_or_iri):
return converter.standardize_curie(curie_or_iri) or curie_or_iri
return curie_or_iri

for column, values in _get_sssom_schema_object().dict["slots"].items():
if values["range"] != "EntityReference":
continue
if column not in msdf.df.columns:
continue
msdf.df[column] = msdf.df[column].map(_upgrade)

msdf.converter = converter
msdf.standardize()
return msdf


Expand Down