diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3acd9376..739dee19 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ jobs: - uses: actions/setup-python@v2 - uses: pre-commit/action@v2.0.0 with: - extra_args: --hook-stage manual + extra_args: --hook-stage manual --all-files checks: name: Check Python ${{ matrix.python-version }} on ${{ matrix.runs-on }} diff --git a/Makefile b/Makefile index b55fac39..c95ea9e0 100644 --- a/Makefile +++ b/Makefile @@ -9,8 +9,9 @@ endif OSXFLAG=$(shell uname|grep -q Darwin && echo "-undefined dynamic_lookup") CFLAGS=--std=c++17 -O3 -Wall -fPIC -Irapidjson/include -Ipybind11/include -Icpp-peglib $(PYINC) -Iinclude LDFLAGS=-pthread +PREFIX ?= /usr -.PHONY: build all clean +.PHONY: build all clean install all: demo examples @@ -29,6 +30,12 @@ correctionlib: build/python.o build/correction.o build/formula_ast.o $(CXX) $(LDFLAGS) -fPIC -shared $(OSXFLAG) $^ -o correctionlib/_core$(PYEXT) touch correctionlib/__init__.py +install: correctionlib + mkdir -p $(PREFIX)/include + install -m 644 include/correction.h $(PREFIX)/include + mkdir -p $(PREFIX)/lib + install -m 755 correctionlib/_core$(PYEXT) $(PREFIX)/lib + clean: rm -rf build rm -f demo diff --git a/include/correction.h b/include/correction.h index 836d1a72..291ff8ff 100644 --- a/include/correction.h +++ b/include/correction.h @@ -6,7 +6,15 @@ #include #include #include -#include + +namespace rapidjson { + // actual definition for class Value; + template struct UTF8; + class CrtAllocator; + template class MemoryPoolAllocator; + template class GenericValue; + typedef GenericValue, MemoryPoolAllocator> Value; +}; namespace correction { diff --git a/src/correction.cc b/src/correction.cc index ac432b6f..f1be59d1 100644 --- a/src/correction.cc +++ b/src/correction.cc @@ -1,3 +1,4 @@ +#include #include #include #include @@ -339,7 +340,7 @@ const Content& Category::child(const std::vector& values) const if ( auto pval = 
"""A custom JSON encoder for corrections.

Author: Izaak Neutelings (March 2021)
Description: Write JSON with indents more compactly by collapsing some lists and dictionaries
Instructions: Print or write JSON dictionary 'data' as
  import JSONEncoder
  JSONEncoder.write(data,"data.json",sort_keys=True,indent=2,maxlistlen=25,maxdictlen=3,breakbrackets=False)
  print(JSONEncoder.dumps(data,sort_keys=True,indent=2,maxlistlen=25,maxdictlen=3,breakbrackets=False))
Adapted from:
  https://stackoverflow.com/questions/16264515/json-dumps-custom-formatting
"""
import json
import math
from typing import Any, List, Tuple, Type

try:
    import pydantic
except ImportError:  # the formatter itself only needs the standard library
    pydantic = None  # type: ignore[assignment]

# Scalar types that may be packed several-per-line (bool is a subclass of int).
_PRIMITIVES = (int, float, str)


def write(data: Any, fname: str, **kwargs: Any) -> None:
    """Write ``data`` to the file ``fname`` as JSON formatted by JSONEncoder.

    ``kwargs`` are forwarded unchanged to :func:`dumps`.
    """
    with open(fname, "w", encoding="utf-8") as fout:
        fout.write(dumps(data, **kwargs))


def dumps(data: Any, sort_keys: bool = False, **kwargs: Any) -> str:
    """Return ``data`` as a JSON string formatted by JSONEncoder.

    Pydantic models are serialised through their own ``json()`` method with
    ``exclude_unset=True``; ``sort_keys`` is not forwarded on that path, so
    models keep their field order. Any other object goes through
    :func:`json.dumps` with ``sort_keys`` applied. Remaining ``kwargs``
    (``indent``, ``maxlistlen``, ``maxdictlen``, ``maxstrlen``,
    ``breakbrackets``, ...) end up in the JSONEncoder constructor.
    """
    if pydantic is not None and isinstance(data, pydantic.BaseModel):
        return data.json(cls=JSONEncoder, exclude_unset=True, **kwargs)
    return json.dumps(data, cls=JSONEncoder, sort_keys=sort_keys, **kwargs)


class JSONEncoder(json.JSONEncoder):
    """
    Encoder to make correctionlib JSON more compact, but still readable:
    - keep list of primitives (int, float, str) on one line,
      or split over several if the length is longer than a given maxlen
    - do not break line for short dictionary if all values are primitive
    - do not break line after bracket for first key of dictionary,
      unless itself nested in dictionary

    Extra keyword arguments (all optional):
    - maxlistlen: maximum of primitive elements per line of a list (default 25)
    - maxdictlen: maximum of primitive elements in a one-line dict (default 2)
    - maxstrlen: maximum total string length before breaking lines
      (default 2 * maxlistlen)
    - breakbrackets: always break the line after an opening bracket

    Scalars are serialised with plain ``json.dumps`` defaults.
    """

    def __init__(self, *args: Any, **kwargs: Any):
        if kwargs.get("indent", None) is None:
            kwargs["indent"] = 2
        # maximum of primitive elements per list, before breaking lines
        self.maxlistlen = kwargs.pop("maxlistlen", 25)
        # maximum of primitive elements per dict, before breaking lines
        self.maxdictlen = kwargs.pop("maxdictlen", 2)
        # maximum length of strings in short dict, before breaking lines
        self.maxstrlen = kwargs.pop("maxstrlen", 2 * self.maxlistlen)
        # break after opening bracket
        self.breakbrackets = kwargs.pop("breakbrackets", False)
        super().__init__(*args, **kwargs)
        self._indent = 0  # current indent
        self.parent = type(None)  # type of parent for recursive use

    def encode(self, obj: Any) -> str:
        """Return the compact JSON representation of ``obj``.

        The method recurses through lists, tuples and dicts; while it does,
        ``self.parent`` holds the type of the container currently being
        written so that children can decide where to break lines.
        """
        container = self.parent  # type: Type[Any]
        self.parent = type(obj)
        try:
            if isinstance(obj, (list, tuple)):
                return self._encode_sequence(obj, container)
            if isinstance(obj, dict):
                return self._encode_mapping(obj, container)
            return json.dumps(obj)  # use default formatting
        finally:
            # always restore, so a failed call cannot poison the next one
            self.parent = container

    def _encode_sequence(self, obj: Any, container: Type[Any]) -> str:
        """Format a list/tuple that sits directly inside ``container``."""
        if not all(isinstance(x, _PRIMITIVES) for x in obj):
            # list of lists, tuples, dictionaries: one element per line
            self._indent += self.indent
            try:
                pad = " " * self._indent
                nested = [pad + self.encode(item) for item in obj]
            finally:
                self._indent -= self.indent
            return "[\n" + ",\n".join(nested) + "\n" + " " * self._indent + "]"

        # list of primitives only
        strlen = sum(len(s) for s in obj if isinstance(s, str))
        indent_str = " " * (self._indent + self.indent)
        rows = []  # type: List[str]
        if strlen > self.maxstrlen and any(
            len(s) > 3 for s in obj if isinstance(s, str)
        ):
            tokens = [json.dumps(s) for s in obj]  # everything as JSON text
            if any(len(s) > self.maxstrlen / 4 for s in tokens):
                rows = tokens  # long strings: one element per line
            else:  # group strings into several lines
                line = []  # type: List[str]
                nchars = 0
                for token in tokens:
                    if len(line) == 0 or nchars + len(token) < self.maxstrlen:
                        line.append(token)
                        nchars += len(token)
                    else:  # new line
                        rows.append(", ".join(line))
                        line = [token]
                        nchars = len(token)
                if line:
                    rows.append(", ".join(line))
        elif len(obj) <= self.maxlistlen:  # write short list on one line
            return "[ " + ", ".join(json.dumps(item) for item in obj) + " ]"
        else:  # break long list into lines of (almost) equal length
            nlines = math.ceil(len(obj) / float(self.maxlistlen))
            # Round the chunk size UP: rounding down leaves
            # nlines * maxlen < len(obj) and silently drops the tail.
            maxlen = math.ceil(len(obj) / nlines)
            for start in range(0, len(obj), maxlen):
                chunk = obj[start : start + maxlen]
                rows.append(", ".join(json.dumps(item) for item in chunk))

        lines = (",\n" + indent_str).join(rows)  # lines between brackets
        closing = "\n" + " " * self._indent + "]"
        if container is dict or self.breakbrackets:
            # break first line after opening bracket
            return "[\n" + indent_str + lines + closing
        # do not break first line
        return "[" + " " * (self.indent - 1) + lines + closing

    def _encode_mapping(self, obj: Any, container: Type[Any]) -> str:
        """Format a dict that sits directly inside ``container``."""
        pairs = list(obj.items())
        if self.sort_keys:
            pairs.sort(key=lambda pair: pair[0])
        entries = [
            (self._key_text(key), value) for key, value in pairs
        ]  # type: List[Tuple[str, Any]]

        if (
            len(entries) <= self.maxdictlen
            and all(isinstance(value, _PRIMITIVES) for _, value in entries)
            and sum(
                len(key) + len(value)
                for key, value in entries
                if isinstance(value, str)
            )
            <= self.maxstrlen
        ):  # write short dict on one line
            body = ", ".join(
                json.dumps(key) + ": " + self.encode(value)
                for key, value in entries
            )
            return "{ " + body + " }"

        # break long dict into multiple lines
        rows = []  # type: List[str]
        self._indent += self.indent
        try:
            indent_str = " " * self._indent
            # the first key stays on the brace line, except at top level or
            # when the dict is itself a dict value
            first = (
                container not in (type(None), dict) and not self.breakbrackets
            )
            for key, value in entries:
                valstr = self.encode(value)
                if first and "\n" not in valstr:
                    # no break between opening brace and first key
                    lead = " " * (self.indent - 1)
                else:  # break before key
                    lead = "\n" + indent_str
                rows.append(lead + json.dumps(key) + ": " + valstr)
                first = False
        finally:
            self._indent -= self.indent
        return "{" + ",".join(rows) + "\n" + " " * self._indent + "}"

    @staticmethod
    def _key_text(key: Any) -> str:
        """Return the (unquoted) JSON object key for ``key``.

        JSON object keys must be strings, so numbers, booleans and None are
        converted to their JSON text; other types raise TypeError.
        """
        if isinstance(key, str):
            return key
        if key is None or isinstance(key, (bool, int, float)):
            return json.dumps(key)
        raise TypeError(
            "keys must be str, int, float, bool or None, not "
            + type(key).__name__
        )
very long string to test line break", + "and this is another very long string", + ], + ], + "layer3_5": [ + list(range(1, 10 + 1)), + list(range(1, 20 + 1)), + list(range(1, 24 + 1)), + list(range(1, 25 + 1)), + list(range(1, 26 + 1)), + list(range(1, 27 + 1)), + list(range(1, 30 + 1)), + list(range(1, 40 + 1)), + list(range(1, 50 + 1)), + list(range(1, 51 + 1)), + list(range(1, 52 + 1)), + ], + "layer3_6": list(range(1, 20 + 1)), + "layer3_7": list(range(1, 40 + 1)), + "layer3_8": [ + { + "key": "this is short", + "value": "very short", + }, + { + "key": "this is medium long", + "value": "verily, can you see?", + }, + { + "key": "this is one is a bit longer", + "value": "to find the edge", + }, + { + "key": "this is a very long string to test line break", + "value": "another very long string", + }, + ], + }, + } + } + + formatted = dumps( + data, + sort_keys=True, + indent=2, + maxlistlen=25, + maxdictlen=3, + breakbrackets=False, + ) + + expected = """\ +{ + "layer1": { + "layer2_1": { + "layer3_1": [ + { "x": 1, "y": 7 }, + { "x": 0, "y": 4 }, + { "x": 5, "y": 3 }, + { "x": 6, "y": 9 }, + { "key": "foo", "value": 1 }, + { "key": "foo", + "value": { + "a": 0, + "b": 1, + "c": 2, + "d": 3 + } + }, + { "a": 0, "b": 1 }, + { "a": 0, "b": 1, "c": 2 }, + { "a": 0, + "b": 1, + "c": 2, + "d": 3 + }, + { "a": { "a": 0, "b": 1 }, + "b": { "a": 0, "b": 1 } + } + ], + "layer3_2": "string", + "layer3_3": [ + { "x": 2, "y": 8, "z": 3 }, + { "x": 1, "y": 5, "z": 4 }, + { "x": 6, "y": 9, "z": 8 } + ] + }, + "layer2_2": { + "layer3_4": [ + [ "a", "b", "c" ], + [ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", + "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z" + ], + [ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", + "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "1", "2" + ], + [ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", + "r", "s", "t", "u", "v", "w", "x", "y", "z", 
"a", "b", "c", "d", "e", "f", "g", "h", + "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y" + ], + [ "this is short", "very short" ], + [ "this is medium long", "verily, can you see?" ], + [ "this one is a bit longer,", + "in order to find the edge..." + ], + [ "this", "list of", "strings", "is a bit", "longer", + "in order", "to find", "the edge", "but the", "words", + "are short" + ], + [ "this", 1, 2, "list of", 45, "also", 66, "contains", "some", + "numbers", "for the", 100, "heck of", "it", "see if", + "it splits" + ], + [ "this", + "list of strings is", + "a bit longer,", + "in order", + "to find the edge..." + ], + [ "this is a very, very long string to test line break", + "and this is another very long string" + ] + ], + "layer3_5": [ + [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], + [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 ], + [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 ], + [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 ], + [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 + ], + [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 + ], + [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 + ], + [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40 + ], + [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50 + ], + [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 + ], + [ 1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 + ] + ], + "layer3_6": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 ], + "layer3_7": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40 + ], + "layer3_8": [ + { "key": "this is short", "value": "very short" }, + { "key": "this is medium long", "value": "verily, can you see?" }, + { "key": "this is one is a bit longer", + "value": "to find the edge" + }, + { "key": "this is a very long string to test line break", + "value": "another very long string" + } + ] + } + } +}""" + assert formatted == expected, f"Found:\n {formatted}"