Entity schema / My last hubmap PR #2967

Open
wants to merge 12 commits into base: main
1 change: 1 addition & 0 deletions CHANGELOG-schema-utils.md
@@ -0,0 +1 @@
- Scripts to pull down documents in bulk and develop JSON schemas against them.
1 change: 1 addition & 0 deletions etc/dev/schema-utils/.gitignore
@@ -0,0 +1 @@
cache
18 changes: 18 additions & 0 deletions etc/dev/schema-utils/README.md
@@ -0,0 +1,18 @@
We currently have no schema that describes the Entity documents the Portal relies on.
We have requested a schema from PSC, and they have been unable to provide one.
This directory contains scripts for pulling down documents,
generating schemas from those documents,
and validating documents against generated schemas.

If anything comes of this,
it might be incorporated into the validation hook we already have in
[`search-api`](https://github.com/hubmapconsortium/search-api/pull/564):
the idea is that a validation error wouldn't cause indexing to fail,
but it would alert us to unexpected changes in document structure.
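A minimal sketch of that non-fatal check, assuming the `jsonschema` package and a generated YAML schema on disk (the function name and logging setup here are hypothetical, not part of this PR):

```
# Hypothetical sketch: validate a document during indexing without failing the index step.
import logging

import jsonschema
import yaml

logger = logging.getLogger(__name__)


def validate_without_failing(doc, schema_path):
    """Log unexpected document structure instead of raising."""
    schema = yaml.safe_load(schema_path.read_text())
    try:
        jsonschema.validate(instance=doc, schema=schema)
    except jsonschema.ValidationError as e:
        # Indexing continues; the warning just tells us something changed.
        logger.warning('Unexpected document structure: %s', e.message)
```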

```
pip install genson # Didn't want to clutter the main requirements.txt with this.
get_entities.py # Download all entities and fill up a gitignored cache dir.
build_schemas.py # Scan entities and build schemas (which have been checked in).
validate_entities.py # Validate downloaded entities against generated schemas.
```
53 changes: 53 additions & 0 deletions etc/dev/schema-utils/build_schemas.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python3

import argparse
from pathlib import Path
import sys
import json
import re

from genson import SchemaBuilder
import yaml


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--doc_dir',
        default=Path(__file__).parent / 'cache',
        type=Path)
    parser.add_argument(
        '--schema_dir',
        default=Path(__file__).parent / 'schema/entities',
        type=Path)
    args = parser.parse_args()

    # Create the output directory (including parents) if it doesn't exist yet.
    args.schema_dir.mkdir(parents=True, exist_ok=True)

    for entity_type in ['Collection', 'Donor', 'Sample', 'Dataset']:
        builder = SchemaBuilder()
        builder.add_schema({"type": "object", "properties": {}})
        print(f'Loading {entity_type}s', end='', flush=True)
        for entity_path in args.doc_dir.glob(f'{entity_type}*.json'):
            # The genson CLI almost works for this...
            # but for Datasets and Samples it runs out of file handles.
            # Might be an easy PR to fix it upstream.
            entity = json.loads(entity_path.read_text())
            builder.add_object(entity)
            print('.', end='', flush=True)
        schema_path = args.schema_dir / f'{entity_type}.yaml'
        schema_yaml_raw = yaml.dump(builder.to_schema())
        schema_yaml_baked = re.sub(
            # If we had a field called "properties", this would break,
            # but apart from that, should be robust.
            r'^(\s*)(properties:)',
            r'\1additionalProperties: false\n\1\2',
            schema_yaml_raw,
            flags=re.MULTILINE)
        schema_path.write_text(schema_yaml_baked)
        print(f'\nBuilt {schema_path.name}')
    return 0


if __name__ == "__main__":
    sys.exit(main())  # pragma: no cover
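For illustration only (not part of the PR), here is roughly what the `SchemaBuilder` plus regex step above does on a tiny, made-up document:

```
# Illustrative only: a tiny invented document run through the same SchemaBuilder + regex step.
import re

import yaml
from genson import SchemaBuilder

builder = SchemaBuilder()
builder.add_schema({"type": "object", "properties": {}})
builder.add_object({"uuid": "abc123", "entity_type": "Donor"})  # made-up document

raw = yaml.dump(builder.to_schema())
baked = re.sub(
    r'^(\s*)(properties:)',
    r'\1additionalProperties: false\n\1\2',
    raw,
    flags=re.MULTILINE)
print(baked)
# Roughly (plus a $schema line):
#   additionalProperties: false
#   properties:
#     entity_type:
#       type: string
#     uuid:
#       type: string
#   required:
#   - entity_type
#   - uuid
#   type: object
```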
76 changes: 76 additions & 0 deletions etc/dev/schema-utils/get_entities.py
@@ -0,0 +1,76 @@
#!/usr/bin/env python3

from time import sleep
import argparse
import sys
import json
from pathlib import Path
from datetime import date

import requests


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--index_url',
        default='https://search.api.hubmapconsortium.org/v3/entities/search')
    parser.add_argument(
        '--doc_dir',
        default=Path(__file__).parent / 'cache',
        type=Path)
    parser.add_argument(
        '--start',
        default=0,
        type=int)
    parser.add_argument(
        '--size',
        default=1,  # Small so we don't choke on a few large documents.
        type=int)
    parser.add_argument(
        '--sort',
        default='created_timestamp')
    parser.add_argument(
        '--sleep',
        default=1,
        type=int)
    args = parser.parse_args()

    args.doc_dir.mkdir(exist_ok=True)

    es_from = args.start
    while True:
        print(f'from: {es_from}')
        response = requests.post(
            args.index_url,
            json={
                'from': es_from,
                'size': args.size,
                'sort': args.sort
            })
        if not response.ok:
            print(f'HTTP {response.status_code}:')
            print(response.text)
            break

        hits = response.json()['hits']['hits']
        if not hits:
            print('No more hits')
            break

        for hit in hits:
            entity_id = hit['_id']
            source = hit['_source']
            entity_type = source['entity_type']
            created_timestamp = source['created_timestamp']
            iso_date = date.fromtimestamp(created_timestamp / 1000)
            name = f'{entity_type}_{iso_date}_{entity_id}.json'
            (args.doc_dir / name).write_text(json.dumps(source, indent=2))

        es_from += args.size
        sleep(args.sleep)
    return 0


if __name__ == "__main__":
    sys.exit(main())  # pragma: no cover
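The paging loop above assumes an Elasticsearch-style response from `search-api` (`hits.hits[]` with `_id` and `_source`). As an invented example (the values are made up), one hit and the cache filename it produces would look something like this:

```
# Invented example of one hit from the search response and the filename the loop derives.
from datetime import date

hit = {
    '_id': 'abc123',
    '_source': {
        'entity_type': 'Donor',
        'created_timestamp': 1577836800000,  # milliseconds since the epoch
        # ...the rest of the entity document...
    },
}

source = hit['_source']
iso_date = date.fromtimestamp(source['created_timestamp'] / 1000)
print(f"{source['entity_type']}_{iso_date}_{hit['_id']}.json")
# -> Donor_2020-01-01_abc123.json (the date may shift by a day depending on local timezone)
```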