Merge pull request #2 from osm-without-borders/test_with_pandas
check number of zones by country - POC
antoine-de authored Apr 6, 2018
2 parents 91061ed + 04c720f commit 33cfaac
Showing 9 changed files with 678 additions and 1 deletion.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
__pycache__
.pytest_cache
data_volumetric.csv
data_volumetric.json
cosmogony.geojson
18 changes: 18 additions & 0 deletions Pipfile
@@ -0,0 +1,18 @@
[[source]]

name = "pypi"
url = "https://pypi.python.org/simple"
verify_ssl = true


[packages]

matplotlib = "*"
pandas = "*"
geopandas = "*"
pytest = "*"
ijson = "*"
ipython = "*"
cffi = "*"

[dev-packages]
435 changes: 435 additions & 0 deletions Pipfile.lock


22 changes: 21 additions & 1 deletion README.md
@@ -1,11 +1,31 @@
# cosmogony-data-dashboard

To show stats about the world [Cosmogony](https://github.com/osm-without-borders/cosmogony)
The purpose of this repo is to provide tools to compute and show stats about the world [Cosmogony](https://github.com/osm-without-borders/cosmogony).

It can help to check the quality (well, mostly the quantity actually...) of OpenStreetMap boundary zones.

Contributions are very welcome in this repo. If you have new ideas for tests to add, please take a look at the [founding issue](https://github.com/osm-without-borders/cosmogony/issues/4) first ;)

:construction::warning: This is a work in progress, and deeply connected to the Cosmogony output format. Follow along in [this issue](https://github.com/osm-without-borders/cosmogony/issues/4) :warning::construction:

## Country stats and tests

### Purpose

We want to compute the number of zones of each kind for each country. Then we want to compare this output with reference values (the actual number of zones of each kind in the real world).
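
For a rough idea of what this looks like, here is a minimal sketch using the `ZonesIndex` helper added in this PR; the cosmogony file name, the Wikidata id (`Q142` is France) and the `state_district` zone type are only example values.

```python
# Minimal sketch, assuming a local cosmogony dump; file name, id and zone type are examples.
from utils import ZonesIndex, UnknownWikidataId

zones_index = ZonesIndex.init_from_cosmogony('my-cosmogony.json')

try:
    # Count every 'state_district' zone found below France (Wikidata id Q142).
    departments = list(zones_index.iter_children(
        'Q142', lambda z: z['zone_type'] == 'state_district'))
    print('found {} zones'.format(len(departments)))
except UnknownWikidataId:
    print('France is not in this cosmogony extract')
```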

### Compute and test against reference values

You will need `python3` and a few dependencies you can install with `pipenv install --three`.

To compute the number of zones of each kind (volumetric stats) and test them against the reference values, just type:

`pipenv run py.test --cosmogony my-cosmogony.json`

Detailed test results are written to `data_volumetric.json`.
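
Each record in that file should roughly look like the sample below; the field names come from the test code in this PR, while the values are purely illustrative:

```json
{
  "name": "France",
  "wikidata_id": "Q142",
  "zone_type": "state_district",
  "expected_min": 96,
  "expected_max": 102,
  "is_known_failure": "no",
  "total": 101,
  "test_status": "ok"
}
```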

You can also get a visual overview of the test results with the `index.html` file at the root of the repo (you may need to serve the directory over HTTP, e.g. with `python3 -m http.server`, since most browsers will refuse to `fetch` the local JSON file from a `file://` page).

### Reference values

For now, the reference values live in a big CSV file (`reference_stats_values.csv`).
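
Judging from the columns the tests read, a row should look roughly like the sample below (column order and values are only an illustration):

```csv
name,wikidata_id,zone_type,expected_min,expected_max,is_known_failure
France,Q142,state_district,96,102,no
```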
21 changes: 21 additions & 0 deletions conftest.py
@@ -0,0 +1,21 @@
# coding: utf-8
import pandas as pd

from utils import ZonesIndex


def pytest_addoption(parser):
    parser.addoption("--cosmogony", action="store", required=True,
                     help="a cosmogony json file")


def pytest_generate_tests(metafunc):
    # Build the zones index once, from the cosmogony file given on the command line.
    cosmogony_path = metafunc.config.getoption('cosmogony')
    zones_index = ZonesIndex.init_from_cosmogony(cosmogony_path)

    expected_values = pd.read_csv('reference_stats_values.csv')
    rows = (row for _, row in expected_values.iterrows())

    # Parametrize each test with one reference row and with the shared zones index.
    if 'line' in metafunc.fixturenames:
        metafunc.parametrize('line', rows)
    if 'zones_index' in metafunc.fixturenames:
        metafunc.parametrize('zones_index', [zones_index])
72 changes: 72 additions & 0 deletions index.html
@@ -0,0 +1,72 @@
<!DOCTYPE html>
<html>
<head>
    <title>Volumetric Data Dashboard</title>
    <meta charset='utf-8'/>
    <meta name='viewport' content='initial-scale=1,user-scalable=yes'/>
    <style>
        table,
        td,
        th {
            margin: 10px 0;
            padding: 2px 4px;
            text-align: center;
            border-collapse: collapse;
        }
        td,
        th {
            border: 1px solid black;
        }
    </style>
</head>
<body>
    <table id="volumetric-dashboard" class="sort"></table>
    <script>
        fetch(`data_volumetric.json`).then((r) => r.json()).then((data) => {
            var table = document.getElementById('volumetric-dashboard');

            var col = ['name', 'zone_type', 'result', 'status'];

            for (var i = 0; i < data.length; i++) {
                var result_text = `${ (data[i]['total'] != -1)
                    ? data[i]['total']
                    : "??"} `;
                result_text += `<br>(expected : ${data[i]['expected_min']} ~ ${data[i]['expected_max']})`;

                var status = '';
                if (data[i]['test_status'] == 'ok') {
                    // "😍" marks a test that passes although it is flagged as a known failure
                    status = (data[i]['is_known_failure'] == "yes") ? "😍" : "✅";
                }
                if (data[i]['test_status'] == 'ko') {
                    status += (data[i]['is_known_failure'] == "yes")
                        ? "📉"
                        : "❎❎";
                }
                if (data[i]['test_status'] == 'skip') {
                    status = '🤔';
                }

                var tr = table.insertRow(-1);
                tr.insertCell(-1).innerHTML = data[i]['name'];
                tr.insertCell(-1).innerHTML = data[i]['zone_type'];
                var tabCell = tr.insertCell(-1);
                tabCell.innerHTML = result_text;
                tr.insertCell(-1).innerHTML = status;
            }

            var header = table.createTHead();
            var trh = header.insertRow(0);
            for (var i = 0; i < col.length; i++) {
                var th = document.createElement("th");
                th.innerHTML = col[i];
                trh.appendChild(th);
            }
        })
    </script>
</body>
</html>
45 changes: 45 additions & 0 deletions test_volumetries.py
@@ -0,0 +1,45 @@
# coding: utf-8
import pandas as pd
import json
import csv
import pytest

from utils import UnknownWikidataId


def check_if_test_passes(expected_min, expected_max, total):
    if expected_min <= total <= expected_max:
        return "ok"
    else:
        return "ko"


class TestCosmogony:
    @classmethod
    def setup_class(cls):
        cls.results = pd.DataFrame()

    @classmethod
    def teardown_class(cls):
        # Dump every tested row (with its status) for the html dashboard.
        cls.results.to_json('data_volumetric.json', orient='records')

    def test_row(self, line, zones_index):
        try:
            matched_zones = list(zones_index.iter_children(
                line['wikidata_id'],
                lambda z: z['zone_type'] == line['zone_type']
            ))
        except UnknownWikidataId:
            total = -1
            test_status = 'skip'
        else:
            total = len(matched_zones)
            test_status = check_if_test_passes(line.expected_min, line.expected_max, total)

        line['total'] = total
        line['test_status'] = test_status
        TestCosmogony.results = TestCosmogony.results.append(line)

        if test_status == 'skip':
            pytest.skip("no data for this test")

        assert test_status == "ok", "Country {} - expected between {} and {} for {}, found {}".format(
            line['name'], line['expected_min'], line['expected_max'], line['zone_type'], total)
1 change: 1 addition & 0 deletions utils/__init__.py
@@ -0,0 +1 @@
from .index import ZonesIndex, UnknownWikidataId
60 changes: 60 additions & 0 deletions utils/index.py
@@ -0,0 +1,60 @@
from collections import defaultdict

import ijson.backends.yajl2_cffi as ijson


class UnknownWikidataId(Exception):
    pass


class ZonesIndex:
    """
    Index cosmogony zones both by internal `id` and wikidata id
    """
    @classmethod
    def init_from_cosmogony(cls, cosmogony_path):
        zones_index = cls()

        print('Reading zones...')
        with open(cosmogony_path, 'rb') as f:
            # Stream the zones to avoid loading the whole (potentially huge) json in memory.
            zones = ijson.items(f, 'zones.item')
            for z in zones:
                z.pop('geometry', None)  # geometries are not needed for counting
                zones_index.insert(z)
        print('{} zones have been read'.format(len(zones_index)))

        zones_index.build_children()
        return zones_index

    def __init__(self):
        self.id_to_zone = dict()
        self.wd_to_zone = dict()
        self.id_to_children = defaultdict(list)

    def insert(self, zone):
        self.id_to_zone[zone['id']] = zone
        wikidata_id = zone.get('wikidata')
        if wikidata_id:
            self.wd_to_zone[wikidata_id] = zone

    def build_children(self):
        for z in self.id_to_zone.values():
            parent_id = z.get('parent')
            if parent_id:
                self.id_to_children[parent_id].append(z)

    def _iter_all_children(self, zone):
        children = self.id_to_children[zone['id']]
        for c in children:
            yield c
            yield from self._iter_all_children(c)

    def iter_children(self, wikidata_id, filter_fun=lambda x: True):
        try:
            zone = self.wd_to_zone[wikidata_id]
        except KeyError as e:
            raise UnknownWikidataId(wikidata_id) from e

        return filter(filter_fun, self._iter_all_children(zone))

    def __len__(self):
        return len(self.id_to_zone)
