Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

replace all references and skip errors #70

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions ckanext/datajson/export_map/export.catalog.map.sample.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"validation_enabled": false,
"catalog_headers": {
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"conformsTo": "https://resources.data.gov/schemas/dcat-us/v1.1/schema",
"describedBy": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.json",
"@context": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.jsonld",
"@type": "dcat:Catalog"
},
"dataset_fields_map": {
Expand Down
6 changes: 3 additions & 3 deletions ckanext/datajson/export_map/export.inventory.map.sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
"validation_enabled": true,
"debug": false,
"catalog_headers": {
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"conformsTo": "https://resources.data.gov/schemas/dcat-us/v1.1/schema",
"describedBy": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.json",
"@context": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.jsonld",
"@type": "dcat:Catalog"
},
"dataset_fields_map": {
Expand Down
6 changes: 3 additions & 3 deletions ckanext/datajson/export_map/export.spatial.map.sample.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"validation_enabled": true,
"catalog_headers": {
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"conformsTo": "https://resources.data.gov/schemas/dcat-us/v1.1/schema",
"describedBy": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.json",
"@context": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.jsonld",
"@type": "dcat:Catalog"
},
"dataset_fields_map": {
Expand Down
18 changes: 16 additions & 2 deletions ckanext/datajson/harvester_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from jsonschema.validators import Draft4Validator
from jsonschema import FormatChecker
from jsonschema.exceptions import RefResolutionError

from sqlalchemy.exc import IntegrityError

Expand Down Expand Up @@ -131,7 +132,7 @@ def gather_stage(self, harvest_job):
if len(source_datasets) == 0: return []

DATAJSON_SCHEMA = {
"https://project-open-data.cio.gov/v1.1/schema": '1.1',
"https://resources.data.gov/schemas/dcat-us/v1.1/schema": '1.1',
}

# schema version is default 1.0, or a valid one (1.1, ...)
Expand Down Expand Up @@ -369,13 +370,26 @@ def _validate_dataset(self, validator_schema, schema_version, dataset):
with open(os.path.join(
os.path.dirname(__file__), file_path)) as json_file:
schema = json.load(json_file)
# fix the schema

msg = ";"
errors = Draft4Validator(schema, format_checker=FormatChecker()).iter_errors(dataset)
count = 0
for error in errors:
while True:
try:
error = next(errors)
except StopIteration:
# there are no more errors
break
except RefResolutionError:
# TODO: this error is related to https://github.com/GSA/datagov-deploy/issues/1895
# the external resources (e.g. https://resources.data.gov/schemas/dcat-us/v1.1/schema/organization.json)
# use references to nonexistent project-open-data.cio.gov links
# we can fix these resources or create new ones
continue
count += 1
msg = msg + " ### ERROR #" + str(count) + ": " + self._validate_readable_msg(error) + "; "

msg = msg.strip("; ")
if msg:
id = "Identifier: " + (dataset.get("identifier") if dataset.get("identifier") else "Unknown")
Expand Down
10 changes: 10 additions & 0 deletions ckanext/datajson/harvester_datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,16 @@ def load_remote_catalog(self, harvest_job):
catalog_values = datasets.copy()
datasets = catalog_values.pop("dataset", [])

# hotfix to deal with an unexpected domain redirection
for k, v in catalog_values.items():
catalog_values[k] = v.replace("https://project-open-data.cio.gov/v1.1/schema", "https://resources.data.gov/schemas/dcat-us/v1.1/schema")

for dataset in datasets:
if dataset.get("license") == "https://project-open-data.cio.gov/unknown-license":
dataset["license"] = "https://github.com/project-open-data/project-open-data.github.io/blob/master/unknown-license.md"
elif dataset.get("license") == "https://project-open-data.cio.gov/open-licenses":
dataset["license"] = "https://resources.data.gov/open-licenses/"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know this is meant as a temporary fix but I really want to avoid this. I'm afraid if we modify the third-party metadata like this, we find ourselves on a slippery slope. By modifying the agency's metadata, we're sort of tampering with it, diminishing its authenticity.

As open data stewards, we want to maintain the authenticity and integrity in our chain of custody of data as much as possible using reproducible and well-known methods.


return (datasets, catalog_values)

def set_dataset_info(self, pkg, dataset, dataset_defaults, schema_version):
Expand Down
4 changes: 2 additions & 2 deletions ckanext/datajson/pod_schema/federal-v1.1/catalog.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"id": "https://project-open-data.cio.gov/v1.1/schema/catalog.json#",
"id": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.json#",
"title": "Project Open Data Catalog",
"description": "Validates an entire collection of common core metadata JSON objects. Agencies produce said collections in the form of Data.json files.",
"type": "object",
Expand Down Expand Up @@ -37,7 +37,7 @@
"description": "Version of Schema",
"title": "Version of Schema",
"enum": [
"https://project-open-data.cio.gov/v1.1/schema"
"https://resources.data.gov/schemas/dcat-us/v1.1/schema"
]
},
"describedBy": {
Expand Down
8 changes: 4 additions & 4 deletions ckanext/datajson/pod_schema/federal-v1.1/dataset.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"id": "https://project-open-data.cio.gov/v1.1/schema/dataset.json#",
"id": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/dataset.json#",
"title": "Project Open Data Dataset",
"description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).",
"type": "object",
Expand Down Expand Up @@ -475,7 +475,7 @@
"definitions": {
"vcard": {
"$schema": "http://json-schema.org/draft-04/schema#",
"id": "https://project-open-data.cio.gov/v1.1/schema/vcard.json#",
"id": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/vcard.json#",
"title": "Project Open Data ContactPoint vCard",
"description": "A Dataset ContactPoint as a vCard object",
"type": "object",
Expand Down Expand Up @@ -515,7 +515,7 @@
},
"distribution": {
"$schema": "http://json-schema.org/draft-04/schema#",
"id": "https://project-open-data.cio.gov/v1.1/schema/distribution.json#",
"id": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/distribution.json#",
"title": "Project Open Data Distribution",
"description": "Validates an entire collection of common core metadata JSON objects. Agencies produce said collections in the form of Data.json files.",
"type": "object",
Expand Down Expand Up @@ -698,7 +698,7 @@
},
"organization": {
"$schema": "http://json-schema.org/draft-04/schema#",
"id": "https://project-open-data.cio.gov/v1.1/schema/organization.json#",
"id": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/organization.json#",
"title": "Project Open Data Organization",
"description": "A Dataset Publisher Organization as a foaf:Agent object",
"type": "object",
Expand Down
4 changes: 2 additions & 2 deletions ckanext/datajson/pod_schema/non-federal-v1.1/catalog.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"id": "https://project-open-data.cio.gov/v1.1/schema/catalog.json#",
"id": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.json#",
"title": "Project Open Data Catalog",
"description": "Validates an entire collection of common core metadata JSON objects. Agencies produce said collections in the form of Data.json files.",
"type": "object",
Expand Down Expand Up @@ -37,7 +37,7 @@
"description": "Version of Schema",
"title": "Version of Schema",
"enum": [
"https://project-open-data.cio.gov/v1.1/schema"
"https://resources.data.gov/schemas/dcat-us/v1.1/schema"
]
},
"describedBy": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"$schema": "http://json-schema.org/draft-04/schema#",
"id": "https://project-open-data.cio.gov/v1.1/schema/dataset-non-federal.json#",
"id": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/dataset-non-federal.json#",
"title": "Project Open Data Dataset",
"description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).",
"type": "object",
Expand Down Expand Up @@ -368,7 +368,7 @@
"definitions": {
"vcard-non-federal": {
"$schema": "http://json-schema.org/draft-04/schema#",
"id": "https://project-open-data.cio.gov/v1.1/schema/vcard-non-federal.json#",
"id": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/vcard-non-federal.json#",
"title": "Project Open Data ContactPoint vCard",
"description": "A Dataset ContactPoint as a vCard object",
"type": "object",
Expand Down Expand Up @@ -399,7 +399,7 @@
},
"distribution": {
"$schema": "http://json-schema.org/draft-04/schema#",
"id": "https://project-open-data.cio.gov/v1.1/schema/distribution.json#",
"id": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/distribution.json#",
"title": "Project Open Data Distribution",
"description": "Validates an entire collection of common core metadata JSON objects. Agencies produce said collections in the form of Data.json files.",
"type": "object",
Expand Down Expand Up @@ -538,7 +538,7 @@
},
"organization": {
"$schema": "http://json-schema.org/draft-04/schema#",
"id": "https://project-open-data.cio.gov/v1.1/schema/organization.json#",
"id": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/organization.json#",
"title": "Project Open Data Organization",
"description": "A Dataset Publisher Organization as a foaf:Agent object",
"type": "object",
Expand Down
6 changes: 3 additions & 3 deletions ckanext/datajson/tests/datajson-samples/arm.data.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"conformsTo": "https://resources.data.gov/schemas/dcat-us/v1.1/schema",
"describedBy": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.json",
"@context": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.jsonld",
"@type": "dcat:Catalog",
"dataset": [
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"@type": "dcat:Catalog",
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
"@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"describedBy": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.json",
"conformsTo": "https://resources.data.gov/schemas/dcat-us/v1.1/schema",
"@context": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.jsonld",

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't forget that the data.json files we consume from agencies are still going to have the old https://project-open-data.cio.gov/v1.1/schema identifier, so we want to make sure our test fixtures reflect that. I don't think we should be editing any of these test samples.

"dataset": [
{
"identifier": "bad-02",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"@type": "dcat:Catalog",
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
"@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"describedBy": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.json",
"conformsTo": "https://resources.data.gov/schemas/dcat-us/v1.1/schema",
"@context": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.jsonld",
"dataset": [
{
"identifier": "null-spatial",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"@type": "dcat:Catalog",
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
"@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"describedBy": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.json",
"conformsTo": "https://resources.data.gov/schemas/dcat-us/v1.1/schema",
"@context": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.jsonld",
"dataset": [
{
"identifier": "bad-01",
Expand Down
6 changes: 3 additions & 3 deletions ckanext/datajson/tests/datajson-samples/usda.gov.data.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"@type": "dcat:Catalog",
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
"@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"describedBy": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.json",
"conformsTo": "https://resources.data.gov/schemas/dcat-us/v1.1/schema",
"@context": "https://resources.data.gov/schemas/dcat-us/v1.1/schema/catalog.jsonld",
"dataset": [
{
"identifier": "USDA-DM-002",
Expand Down