Skip to content

Commit

Permalink
hxlm (#11): HRecipe (HXL-proxy JSON recipes) is getting better; almos…
Browse files Browse the repository at this point in the history
…t there to make work with URL
  • Loading branch information
fititnt committed Mar 1, 2021
1 parent 554b37f commit 818620e
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 19 deletions.
17 changes: 13 additions & 4 deletions hxlm/core/model/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,16 @@ def _parse_schemas_raw_hfile(self, hfile):
hfile = HFile().load_schema_file(hfile)
self._hfiles.append(hfile)

def _parse_schemas_raw_hrecipe(self, hrecipe):
hrecipe = HRecipe().load_schema_recipe(hrecipe)
self._hrecipes.append(hrecipe)
def _parse_schemas_raw_hrecipe(self, hrecipes):
    """Load every raw recipe and keep the results in ``self._hrecipes``.

    Args:
        hrecipes (list): Raw recipe objects; each one is handed to
            ``HRecipe().load_schema()`` and the result is appended to
            ``self._hrecipes``. ``None`` or an empty list is accepted
            and adds nothing.
    """
    # ``or []`` keeps a missing/empty recipe section from raising on iteration.
    for recipe_raw in hrecipes or []:
        self._hrecipes.append(HRecipe().load_schema(recipe_raw))

def _parse_schemas_raw_htask(self, htask):
"""HTask is an draft
Expand Down Expand Up @@ -114,8 +121,10 @@ def export_schemas(self):
if len(self._hrecipes) > 0:
recipes_ = []
for recipe in self._hrecipes:
# print('recipe.get_hxlproxy_url', recipe.get_hxlproxy_url())
# print(vars(recipe))
recipes_.append(recipe.export_schema())
schemas.append({'hrecipe': hfiles_})
schemas.append({'hrecipe': recipes_})

# TODO: implement htasks

Expand Down
100 changes: 90 additions & 10 deletions hxlm/core/model/recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,115 @@
SPDX-License-Identifier: Unlicense OR 0BSD
"""

import json
import urllib

from dataclasses import dataclass


# TODO: allow the user to change the proxy URL (e.g. when using Docker or another service)
HXLPROXY_URL = "https://proxy.hxlstandard.org"


class HRecipe:
"""HRecipe holds one data-processing recipe meant for the HXL-proxy
It keeps the raw recipe object as it was loaded, can export it back and can
turn it into an HXL-proxy URL
"""

_valid_options = ['add_columns', 'aggregators', 'append_source',
'before', 'blacklist', 'date', 'date_format',
'is_regex', 'latlon', 'lower', 'map_source',
'number', 'number_format', 'original', 'pattern',
'patterns', 'purge', 'queries', 'replacement',
'reverse', 'skip_untagged', 'source_list_url',
'specs', 'upper', 'whitelist', 'whitespace']

# Both are required
input: str = None
filter: str = None
"""The filter: add_columns, append, append_external_list, cache, ..."""

add_columns: str = None
aggregators: str = None
append_source: str = None
before: str = None
# TODO: maybe propose an alias for blacklist/whitelist, see
# - https://www.adexchanger.com/data-driven-thinking
# /no-more-inflammatory-jargon-change-blacklist-to-blocklist/
# - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6148600/
# - https://insights.dice.com/2020/07/17
# /whitelist-blacklist-the-new-debate-over-security-terminology/
# - https://english.stackexchange.com/questions/51088
# /alternative-terms-to-blacklist-and-whitelist
# Maybe allowlist / blocklist ? (Emerson Rocha, 2021-03-01 02:52 UTC)
blacklist: str = None
date: str = None
date_format: str = None
is_regex: str = None
latlon: str = None
lower: str = None
map_source: str = None
number: str = None
number_format: str = None
original: str = None
pattern: str = None
patterns: str = None
purge: str = None
queries: str = None
replacement: str = None
reverse: str = None
skip_untagged: str = None
source_list_url: str = None
specs: str = None
upper: str = None
whitelist: str = None
whitespace: str = None

def __init__(self, recipe_raw=None):
"""Create an HRecipe, optionally from an already loaded raw recipe
Args:
recipe_raw (Object): Raw recipe object, stored as-is. Defaults to None
"""
self.kind: str = 'HRecipe'
self._recipe_raw = recipe_raw

def export_schema(self):
    """Return the recipe in its exportable form.

    TODO: improve this. For now the raw input object is still returned
          unchanged.
    """
    exported = self._recipe_raw
    return exported


def load_schema_recipe(self, recipe_raw):
"""load_schema_recipe load object and convert to HRecipe
# return vars(self)

def get_hxlproxy_url(self, proxy_url=None):
    """Build the HXL-proxy URL that applies this recipe to its source.

    The data source is taken from ``src`` or, failing that, from the first
    item of ``srcs`` in the raw recipe. Every key of ``filters`` that is
    listed in ``_valid_options`` is copied into the JSON spec sent as the
    ``recipe`` query parameter.

    See https://github.com/HXLStandard/hxl-proxy/wiki/JSON-processing-specs

    Args:
        proxy_url (str): Base URL of the HXL-proxy instance to use.
            Defaults to the module-level ``HXLPROXY_URL``.

    Returns:
        str: URL of the ``data.csv`` endpoint carrying the percent-encoded
        ``url`` and ``recipe`` query parameters. When the recipe has no
        source, ``url`` is left empty.
    """
    # ``import urllib`` alone does not guarantee that the ``parse``
    # submodule is loaded, so import what is needed explicitly.
    from urllib.parse import quote

    if proxy_url is None:
        proxy_url = HXLPROXY_URL

    recipe = self._recipe_raw or {}

    # Prefer the single source; fall back to the first of several sources.
    source = recipe.get('src') or ''
    if not source:
        sources = recipe.get('srcs') or []
        if sources and sources[0]:
            source = sources[0]

    # Only mapping-style filters can be looked up by option name; any other
    # shape (e.g. a list of filter steps) contributes no options for now.
    filters = recipe.get('filters')
    if not isinstance(filters, dict):
        filters = {}
    hxlspec = {
        key: filters[key] for key in self._valid_options if key in filters
    }

    # The source must be fully percent-encoded: a raw '#' or '&' inside it
    # would cut off or corrupt the 'recipe' parameter that follows.
    return '{0}/data.csv?url={1}&recipe={2}'.format(
        proxy_url.rstrip('/'),
        quote(source, safe=''),
        quote(json.dumps(hxlspec), safe=''),
    )

def load_schema(self, recipe_raw):
"""load_schema load object and convert to HRecipe
How the object is saved on disk (or received from external sources)
is out of scope of this class.
Args:
load_schema_recipe (Object): Load generic object to HRecipe
recipe_raw (Object): Load generic object to HRecipe
"""

self._recipe_raw = recipe_raw
Expand All @@ -40,9 +123,6 @@ def load_schema_recipe(self, recipe_raw):
# print(schemas)



from dataclasses import dataclass

# from typing import (
# Any
# )
Expand Down
2 changes: 2 additions & 0 deletions hxlm/core/schema/baseline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
# DESCRIPTION: while hxlm.core has these values mostly hardcoded, you can use
# this file as a reference for implementations outside Python

# TODO: allow the user to change the proxy URL (e.g. when using Docker or another service)
HXLPROXY_URL: "https://proxy.hxlstandard.org"

HTYPE_PRIMITIVES:
HTYPE_TRUE: "TRUE"
Expand Down
12 changes: 7 additions & 5 deletions hxlm/data/baseline/hmeta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@
hdatasets:
- id: place
description: HXL-CPLP-FOD_countries-territories.csv
source:
- url: https://docs.google.com/spreadsheets/d/12k4BWqq5c3mV9ihQscPIwtuDa_QRB-iFohO7dXSSptI/edit#gid=0
source: https://docs.google.com/spreadsheets/d/12k4BWqq5c3mV9ihQscPIwtuDa_QRB-iFohO7dXSSptI/edit#gid=0
tags:
- ISO 3166
- ISO 3166-2
- ISO 3166-3
- id: lang
comments: HXL-CPLP-FOD_languages
source:
sources:
- url: https://docs.google.com/spreadsheets/d/12k4BWqq5c3mV9ihQscPIwtuDa_QRB-iFohO7dXSSptI/edit#gid=0
tags:
- ISO 639-3
Expand All @@ -27,7 +26,10 @@

# Note: this is an early draft, so some extra abstractions may be done later (Emerson Rocha, 2021-03-01 03:45 UTC)
hrecipes:
- hrecipe: https://docs.google.com/spreadsheets/d/12k4BWqq5c3mV9ihQscPIwtuDa_QRB-iFohO7dXSSptI/edit#gid=0
# https://proxy.hxlstandard.org/data/edit?dest=data_edit&filter01=cut&filter-label01=with_columns&cut-include-tags01=%23vocab%2Bid%2Bv_iso6393_3letter%2C%23vocab%2Bcode%2Bv_6391%2C%23vocab%2Bname&filter02=select&filter-label02=without_rows&select-query02-01=%23vocab%2Bcode%2Bv_6391%3D&select-reverse02=on&url=https%3A%2F%2Fdocs.google.com%2Fspreadsheets%2Fd%2F12k4BWqq5c3mV9ihQscPIwtuDa_QRB-iFohO7dXSSptI%2Fedit%23gid%3D0
# https://proxy.hxlstandard.org/data.csv?dest=data_edit&filter01=cut&filter-label01=with_columns&cut-include-tags01=%23vocab%2Bid%2Bv_iso6393_3letter%2C%23vocab%2Bcode%2Bv_6391%2C%23vocab%2Bname&filter02=select&filter-label02=without_rows&select-query02-01=%23vocab%2Bcode%2Bv_6391%3D&select-reverse02=on&url=https%3A%2F%2Fdocs.google.com%2Fspreadsheets%2Fd%2F12k4BWqq5c3mV9ihQscPIwtuDa_QRB-iFohO7dXSSptI%2Fedit%23gid%3D0
- id: recipe1
src: https://docs.google.com/spreadsheets/d/12k4BWqq5c3mV9ihQscPIwtuDa_QRB-iFohO7dXSSptI/edit#gid=0
filters:
- filter: with_columns
with_columns: "#vocab+id+v_iso6393_3letter,#vocab+code+v_6391,#vocab+name"
Expand All @@ -39,7 +41,7 @@
hdatasets:
- id: place2
comments: HXL-CPLP-FOD_countries-territories2
source:
sources:
- url: https://docs.google.com/spreadsheets/d/12k4BWqq5c3mV9ihQscPIwtuDa_QRB-iFohO7dXSSptI/edit#gid=0


Expand Down

0 comments on commit 818620e

Please sign in to comment.