Entity schema / My last hubmap PR #2967

Open
wants to merge 12 commits into base: main
1 change: 1 addition & 0 deletions CHANGELOG-schema-utils.md
@@ -0,0 +1 @@
- Scripts to pull down documents in bulk and develop JSON schemas against them.
1 change: 1 addition & 0 deletions etc/dev/schema-utils/.gitignore
@@ -0,0 +1 @@
cache
18 changes: 18 additions & 0 deletions etc/dev/schema-utils/README.md
@@ -0,0 +1,18 @@
We currently have no schema that describes the Entity documents the Portal relies on.
We have requested a schema from PSC, and they have been unable to provide one.
This directory contains scripts for pulling down documents,
generating schemas from those documents,
and validating documents against generated schemas.

If anything comes of this,
it might be incorporated into the validation hook we already have in
[`search-api`](https://github.com/hubmapconsortium/search-api/pull/564):
the idea is that a validation error wouldn't cause indexing to fail,
but it would alert us to unexpected changes in document structure.
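A minimal sketch of that non-fatal check, assuming the `jsonschema` package and a generated YAML schema on disk (the function name and logging setup here are hypothetical, not part of this PR):

```
# Hypothetical sketch: validate a document during indexing without failing the index step.
import logging

import jsonschema
import yaml

logger = logging.getLogger(__name__)


def validate_without_failing(doc, schema_path):
    """Log unexpected document structure instead of raising."""
    schema = yaml.safe_load(schema_path.read_text())
    try:
        jsonschema.validate(instance=doc, schema=schema)
    except jsonschema.ValidationError as e:
        # Indexing continues; the warning just tells us something changed.
        logger.warning('Unexpected document structure: %s', e.message)
```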

```
pip install genson # Didn't want to clutter the main requirements.txt with this.
get_entities.py # Download all entities and fill up a gitignored cache dir.
build_schemas.py # Scan entities and build schemas (which have been checked in).
validate_entities.py # Validate downloaded entities against generated schemas.
```
53 changes: 53 additions & 0 deletions etc/dev/schema-utils/build_schemas.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python3

import argparse
from pathlib import Path
import sys
import json
import re

from genson import SchemaBuilder
import yaml


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--doc_dir',
        default=Path(__file__).parent / 'cache',
        type=Path)
    parser.add_argument(
        '--schema_dir',
        default=Path(__file__).parent / 'schema/entities',
        type=Path)
    args = parser.parse_args()

    # Create the output directory (including parents) if it doesn't exist yet.
    args.schema_dir.mkdir(parents=True, exist_ok=True)

    for entity_type in ['Collection', 'Donor', 'Sample', 'Dataset']:
        builder = SchemaBuilder()
        builder.add_schema({"type": "object", "properties": {}})
        print(f'Loading {entity_type}s', end='', flush=True)
        for entity_path in args.doc_dir.glob(f'{entity_type}*.json'):
            # The genson CLI almost works for this...
            # but for Datasets and Samples it runs out of file handles.
            # Might be an easy PR to fix it upstream.
            entity = json.loads(entity_path.read_text())
            builder.add_object(entity)
            print('.', end='', flush=True)
        schema_path = args.schema_dir / f'{entity_type}.yaml'
        schema_yaml_raw = yaml.dump(builder.to_schema())
        schema_yaml_baked = re.sub(
            # If we had a field called "properties", this would break,
            # but apart from that, should be robust.
            r'^(\s*)(properties:)',
            r'\1additionalProperties: false\n\1\2',
            schema_yaml_raw,
            flags=re.MULTILINE)
        schema_path.write_text(schema_yaml_baked)
        print(f'\nBuilt {schema_path.name}')
    return 0


if __name__ == "__main__":
    sys.exit(main())  # pragma: no cover
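For illustration only (not part of the PR), here is roughly what the `SchemaBuilder` plus regex step above does on a tiny, made-up document:

```
# Illustrative only: a tiny invented document run through the same SchemaBuilder + regex step.
import re

import yaml
from genson import SchemaBuilder

builder = SchemaBuilder()
builder.add_schema({"type": "object", "properties": {}})
builder.add_object({"uuid": "abc123", "entity_type": "Donor"})  # made-up document

raw = yaml.dump(builder.to_schema())
baked = re.sub(
    r'^(\s*)(properties:)',
    r'\1additionalProperties: false\n\1\2',
    raw,
    flags=re.MULTILINE)
print(baked)
# Roughly (plus a $schema line):
#   additionalProperties: false
#   properties:
#     entity_type:
#       type: string
#     uuid:
#       type: string
#   required:
#   - entity_type
#   - uuid
#   type: object
```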
76 changes: 76 additions & 0 deletions etc/dev/schema-utils/get_entities.py
@@ -0,0 +1,76 @@
#!/usr/bin/env python3

from time import sleep
import argparse
import sys
import json
from pathlib import Path
from datetime import date

import requests


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--index_url',
        default='https://search.api.hubmapconsortium.org/v3/entities/search')
    parser.add_argument(
        '--doc_dir',
        default=Path(__file__).parent / 'cache',
        type=Path)
    parser.add_argument(
        '--start',
        default=0,
        type=int)
    parser.add_argument(
        '--size',
        default=1,  # Small so we don't choke on a few large documents.
        type=int)
    parser.add_argument(
        '--sort',
        default='created_timestamp')
    parser.add_argument(
        '--sleep',
        default=1,
        type=int)
    args = parser.parse_args()

    args.doc_dir.mkdir(exist_ok=True)

    es_from = args.start
    while True:
        print(f'from: {es_from}')
        response = requests.post(
            args.index_url,
            json={
                'from': es_from,
                'size': args.size,
                'sort': args.sort
            })
        if not response.ok:
            print(f'HTTP {response.status_code}:')
            print(response.text)
            break

        hits = response.json()['hits']['hits']
        if not hits:
            print('No more hits')
            break

        for hit in hits:
            entity_id = hit['_id']
            source = hit['_source']
            entity_type = source['entity_type']
            created_timestamp = source['created_timestamp']
            iso_date = date.fromtimestamp(created_timestamp / 1000)
            name = f'{entity_type}_{iso_date}_{entity_id}.json'
            (args.doc_dir / name).write_text(json.dumps(source, indent=2))

        es_from += args.size
        sleep(args.sleep)
    return 0


if __name__ == "__main__":
    sys.exit(main())  # pragma: no cover
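The paging loop above assumes an Elasticsearch-style response from `search-api` (`hits.hits[]` with `_id` and `_source`). As an invented example (the values are made up), one hit and the cache filename it produces would look something like this:

```
# Invented example of one hit from the search response and the filename the loop derives.
from datetime import date

hit = {
    '_id': 'abc123',
    '_source': {
        'entity_type': 'Donor',
        'created_timestamp': 1577836800000,  # milliseconds since the epoch
        # ...the rest of the entity document...
    },
}

source = hit['_source']
iso_date = date.fromtimestamp(source['created_timestamp'] / 1000)
print(f"{source['entity_type']}_{iso_date}_{hit['_id']}.json")
# -> Donor_2020-01-01_abc123.json (the date may shift by a day depending on local timezone)
```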