Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added sparql queries and mapping files for metadata normaliser #32

Merged
merged 2 commits into from
Mar 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,8 @@ prod-dotenv-file: guard-VAULT_ADDR guard-VAULT_TOKEN vault-installed
@ vault kv get -format="json" ted-prod/airflow | jq -r ".data.data | keys[] as \$$k | \"\(\$$k)=\(.[\$$k])\"" >> .env
@ vault kv get -format="json" ted-prod/mongo-db | jq -r ".data.data | keys[] as \$$k | \"\(\$$k)=\(.[\$$k])\"" >> .env


refresh-normaliser-mapping-files:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a solution as well!
You will, however, need to take into account the location of those resources.

@ python -m ted_sws.metadata_normaliser.entrypoints.generate_mapping_resources
#clean-mongo-db:
# @ export PYTHONPATH=$(PWD) && python ./tests/clean_mongo_db.py

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import json
import logging
import pathlib

from ted_sws.adapters.sparql_triple_store import SPARQLTripleStore, TripleStoreABC
from ted_sws.metadata_normaliser.resources import MAPPING_FILES_PATH, QUERIES_PATH

logger = logging.getLogger(__name__)


def generate_mapping_files(triple_store: "TripleStoreABC" = None,
                           queries_folder_path: pathlib.Path = None,
                           output_folder_path: pathlib.Path = None):
    """
    Run every SPARQL query file (``*.rq``) found (recursively) under
    *queries_folder_path* and dump each result tree as a ``.json`` file with
    the same stem into *output_folder_path*.

    :param triple_store: triple store adapter used to execute the queries;
        defaults to a new ``SPARQLTripleStore`` created lazily at call time
    :param queries_folder_path: folder scanned recursively for ``*.rq`` files;
        defaults to ``QUERIES_PATH``
    :param output_folder_path: folder where the generated ``.json`` files are
        written; defaults to ``MAPPING_FILES_PATH``
    :return: None
    """
    # Resolve defaults at call time. Writing `triple_store=SPARQLTripleStore()`
    # as a parameter default would instantiate the adapter once at import time
    # and share that single instance across every call.
    if triple_store is None:
        triple_store = SPARQLTripleStore()
    if queries_folder_path is None:
        queries_folder_path = QUERIES_PATH
    if output_folder_path is None:
        output_folder_path = MAPPING_FILES_PATH

    # rglob already yields lazily; no need to materialize a list first.
    for query_file_path in pathlib.Path(queries_folder_path).rglob("*.rq"):
        json_file_path = pathlib.Path(output_folder_path) / (query_file_path.stem + ".json")
        json_content = triple_store.with_query_from_file(
            sparql_query_file_path=str(query_file_path)).fetch_tree()
        with open(json_file_path, 'w') as outfile:
            json.dump(json_content, outfile)

    logger.info(f"Mapping files were generated in {output_folder_path}")


if __name__ == '__main__':
    # CLI entry point: regenerate all mapping JSON files with the default
    # triple store and the packaged queries/mapping_files folders.
    generate_mapping_files()
25 changes: 25 additions & 0 deletions ted_sws/metadata_normaliser/resources/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pathlib

import json

try:
import importlib.resources as pkg_resources
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good practice

except ImportError:
# Try backported to PY<37 `importlib_resources`.
import importlib_resources as pkg_resources

import ted_sws.metadata_normaliser.resources.mapping_files


def get_mapping_file(mapping_file_name: str) -> dict:
    """
    Load a predefined index mapping, referenced by file name, from the
    packaged ``mapping_files`` resources and parse it as JSON.

    :param mapping_file_name: file name of the mapping resource
        (e.g. ``"countries.json"``)
    :return: the parsed JSON content of the mapping file
    :raises FileNotFoundError: if no such resource exists in the package
    """
    # Bug fix: `import ted_sws.metadata_normaliser.resources.mapping_files`
    # binds only the top-level name `ted_sws`, so the bare name
    # `mapping_files` used here previously raised NameError at call time.
    # Reference the fully qualified module, which the import does bind.
    with pkg_resources.path(ted_sws.metadata_normaliser.resources.mapping_files,
                            mapping_file_name) as path:
        return json.loads(path.read_bytes())


# Absolute path of this resources package directory (independent of the CWD).
RESOURCES_PATH = pathlib.Path(__file__).parent.resolve()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice setup


# Packaged folders: SPARQL query files (*.rq) and the JSON mapping files
# generated from them (see the generate_mapping_resources entrypoint).
QUERIES_PATH = RESOURCES_PATH / 'queries'
MAPPING_FILES_PATH = RESOURCES_PATH / 'mapping_files'
Empty file.
Loading