Skip to content

Commit

Permalink
Merge pull request #12 from odissei-data/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
FjodorvRijsselberg authored Nov 28, 2023
2 parents d53a6c4 + 8b99f9e commit 0a4293a
Show file tree
Hide file tree
Showing 38 changed files with 56,793 additions and 4,452 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ jobs:
steps:
- name: Checkout repo
uses: actions/checkout@v3
with:
submodules: recursive
- name: Set python version
uses: actions/setup-python@v4
with:
Expand Down
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "src/resources"]
path = src/resources
url = [email protected]:odissei-data/mappings.git
6 changes: 1 addition & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,7 @@ Returns the current version of the API
- metadata - [EASY metadata example](https://github.com/odissei-data/dataverse-mapper/blob/development/test-data/input-data/easy-test-metadata.json) - The input metadata describing a dataset in JSON.
- template - [EASY template example](https://github.com/odissei-data/dataverse-mapper/blob/development/test-data/template-data/cbs_dataverse_template.json) - A template with the value you expect to map from the input metadata.
- mapping - [EASY mapping example](https://github.com/odissei-data/dataverse-mapper/blob/development/test-data/mappings/easy-mapping.json) - A dictionary with key value pairs. The key is the _typeName_ of the field
in the template. The value is the path to the value in the input
metadata.
- has_existing_doi - e.g. _true_ - A boolean specifying if the metadata will
contain a persistent identifier mapped to the _datasetPersistentId_ field in
the template.
in the template. The value is the path to the value in the input metadata.

#### Return value

Expand Down
1,023 changes: 377 additions & 646 deletions poetry.lock

Large diffs are not rendered by default.

7 changes: 3 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
[tool.poetry]
name = "dataverse_mapper"
version = "1.1.1"
version = "1.2.0"
description = "A service that maps regular JSON data to a JSON template using the Dataverse schema."
authors = ["Fjodor van Rijsselberg"]

[tool.poetry.dependencies]
python = "^3.9.14"
uvicorn = "^0.23.0"
uvicorn = "^0.24.0"
pydantic = "^2.1.0"
fastapi = "^0.100.0"
fastapi = "^0.104.1"
pytest = "^7.1.3"
requests = "^2.28.1"
coverage = "^7.2.0"
pyDataverse = "^0.3.1"
jmespath = "^1.0.1"

[tool.poetry.dev-dependencies]
Expand Down
27 changes: 3 additions & 24 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
import json

from fastapi import FastAPI
from pyDataverse.models import Dataset
from pyDataverse.utils import read_file

from mapper import MetadataMapper
from schema.input import Input
Expand All @@ -22,23 +18,6 @@ async def info():
def map_metadata(input_data: Input):
    """Endpoint logic: map posted metadata onto a Dataverse template.

    Builds a MetadataMapper from the request payload. When the caller
    indicates the metadata carries an existing DOI, the persistent
    identifier is resolved and written into the template before mapping.

    :param input_data: request body with metadata, template and mapping.
    :return: the filled-out Dataverse template.
    """
    mapper = MetadataMapper(input_data.metadata, input_data.template,
                            input_data.mapping)
    if input_data.has_existing_doi:
        dataset_version = mapper.template["datasetVersion"]
        dataset_version["datasetPersistentId"] = \
            mapper.get_persistent_identifier()
    return mapper.map_metadata()


def validate_dataverse_json(dataverse_json):
    """ Validates if the json can be imported into dataverse.

    Serialises the mapped metadata to a JSON string and feeds it to
    pyDataverse's Dataset model, which performs the schema validation.

    :param dataverse_json: mapped metadata (dict) in Dataverse JSON layout.
    :return: result of pyDataverse's Dataset.validate_json().
    """
    ds = Dataset()
    # Serialise directly to a string instead of round-tripping through a
    # hard-coded "output.json" file in the working directory — the file
    # was never removed and concurrent requests would clobber each other.
    ds.from_json(json.dumps(dataverse_json))
    return ds.validate_json()
mapper.map_metadata()
mapper.remove_empty_fields()
return mapper.template
80 changes: 60 additions & 20 deletions src/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,40 @@ def __init__(self, metadata: list | dict | Any,
self.template = template

def map_metadata(self):
    """ Maps the source metadata to a dataverse template.
    First maps all fields in the top level of the dataverse metadata.
    After maps the metadataBlocks inside the datasetVersion field.
    :return: The mapped metadata, specifically the filled out template.
    """
    self.map_metadata_header()
    self.map_metadata_blocks()
    return self.template

def map_metadata_header(self):
    """ Maps the source metadata onto the header of the template.

    Both the top level of the template and the datasetVersion
    dictionary are processed. The metadataBlocks nested inside
    datasetVersion are deliberately left alone here; they are handled
    by the map_metadata_blocks method.
    """
    for section in (self.template, self.template["datasetVersion"]):
        self.process_dictionary(section)

def process_dictionary(self, dictionary):
    """ Maps all values in a given dict onto the source metadata.

    For every key that has an entry in the mapping, the mapped values
    are looked up and, when anything was found, the template value is
    replaced by the first hit.

    :param dictionary: part of the template to fill in (mutated in place).
    """
    # Iterate keys only (the values are not read), and reuse the
    # already-computed lookup result — the original called
    # self.map_value(key) a second time just to take element [0].
    for key in dictionary:
        if key in self.mapping:
            mapped_values = self.map_value(key)
            if mapped_values:
                dictionary[key] = mapped_values[0]

def map_metadata_blocks(self):
""" Maps the values in the metadata on to the template
The map_metadata method loops over the fields in the Dataverse JSON
Expand Down Expand Up @@ -258,23 +292,29 @@ def create_result_dict_list(list_dict: dict, template_dict):
result_dict_list.append(dict_copy)
return result_dict_list

def get_persistent_identifier(self):
    """ Returns the dataset's DOI for the datasetPersistentId field.
    TODO: Needs exception handling
    The mapping may yield several persistent identifiers, so every
    candidate is checked for being an actual DOI and, when needed,
    normalised to the 'doi:'-prefixed form.
    :raises HTTPException: 422 when no candidate is a usable DOI.
    """
    candidates = self.map_value("datasetPersistentId")
    for candidate in candidates:
        if "https://doi.org/" in candidate:
            # Drop the resolver prefix: keep everything after the
            # third slash (the bare prefix/suffix part of the DOI).
            return 'doi:' + candidate.split("/", 3)[3]
        if "doi:" in candidate:
            return candidate
    raise HTTPException(
        status_code=422,
        detail="No usable DOI in mapped persistent identifiers"
    )
def remove_empty_fields(self):
    """ Strips fields whose mapped value stayed empty from the template.

    Walks every metadata block inside datasetVersion, keeps only the
    fields with a non-empty value, and delegates compound fields to
    remove_empty_compound_field for the same clean-up one level down.
    """
    blocks = self.template['datasetVersion']['metadataBlocks']
    for block in blocks.values():
        kept_fields = []
        for field in block['fields']:
            if field.get('value') not in ('', []):
                kept_fields.append(field)
        block['fields'] = kept_fields
        for field in kept_fields:
            if field.get("typeClass") == "compound":
                remove_empty_compound_field(field)


def remove_empty_compound_field(compoundField):
    """ Strips empty sub-fields from a compound field's value list.

    Each item in the compound value is a dict of sub-field name to a
    {"value": ...} dict; sub-fields whose value is '' or [] are dropped.
    Items that end up with no sub-fields at all are removed entirely
    (the original left empty {} objects behind — the old TODO).

    :param compoundField: compound field dict, mutated in place.
    """
    if isinstance(compoundField["value"], list):
        cleaned = []
        for item in compoundField["value"]:
            # Remove key-value pairs with empty "value" keys
            kept = {
                key: value
                for key, value in item.items()
                if value.get("value") not in ('', [])
            }
            # Drop the item altogether when nothing survived, so no
            # empty objects remain in the result.
            if kept:
                cleaned.append(kept)
        compoundField["value"] = cleaned
1 change: 1 addition & 0 deletions src/resources
Submodule resources added at f56ff4
1 change: 0 additions & 1 deletion src/schema/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,4 @@ class Input(BaseModel):
metadata: list | dict | Any
template: list | dict | Any
mapping: list | dict | Any
has_existing_doi: bool = None

30,144 changes: 30,143 additions & 1 deletion src/test-data/expected-result-data/cbs-result.json

Large diffs are not rendered by default.

Loading

0 comments on commit 0a4293a

Please sign in to comment.