From 5d1d20106f0ea173d625ba1c9a1b2231536d9c76 Mon Sep 17 00:00:00 2001
From: anki-code
Date: Sun, 24 May 2020 14:30:06 +0300
Subject: [PATCH] Switched to upstream tokenize-output library

---
 README.md                                  |  55 ++---
 requirements.txt                           |   1 +
 setup.py                                   |   6 +-
 .../__init__.py => output_search.py}       |   2 +-
 .../output_search/test_dict_keys_values.py |   4 -
 xontrib/output_search/test_tokenize.py     |  50 -----
 xontrib/output_search/tokenize_output.py   | 198 ------------------
 7 files changed, 18 insertions(+), 298 deletions(-)
 rename xontrib/{output_search/__init__.py => output_search.py} (98%)
 delete mode 100644 xontrib/output_search/test_dict_keys_values.py
 delete mode 100644 xontrib/output_search/test_tokenize.py
 delete mode 100644 xontrib/output_search/tokenize_output.py

diff --git a/README.md b/README.md
index a15a59e..293201b 100644
--- a/README.md
+++ b/README.md
@@ -43,15 +43,8 @@
 $XONTRIB_OUTPUT_SEARCH_KEY='i' xontrib load output_search
 ```
 
-## Features
-#### Words tokenizing
-```shell script
-$ echo "Hello world"
-Hello world
-$ echo The second word is wo
-$ echo The second word is world
-```
-URL example:
+## Use cases
+#### Get URL from output
 ```shell script
 $ echo "Try https://github.com/xxh/xxh"
 Try https://github.com/xxh/xxh
@@ -59,7 +52,7 @@
 $ git clone xx
 $ git clone https://github.com/xxh/xxh
 ```
-#### JSON, Python dict and JavaScript object tokenizing
+#### Get key or value from JSON, Python dict and JavaScript object
 ```shell script
 $ echo '{"Try": "xontrib-output-search"}'
 {"Try": "xontrib-output-search"}
@@ -67,7 +60,7 @@
 $ echo I should try se
 $ echo I should try xontrib-output-search
 ```
-#### env tokenizing
+#### Get the path from environment
 ```shell script
 $ env | grep ^PATH=
 PATH=/one/two:/three/four
@@ -75,7 +68,7 @@
 $ ls fo
 $ ls /three/four
 ```
-#### Complex prefixes autocomplete
+#### Complete the complex prefix
 Get the URL from previous output after typing `git+`:
 ```shell script
 $ echo "Try https://github.com/xxh/xxh"
@@ -94,38 +87,18 @@
 Try https://github.com/xxh/xxh
 $ git clone git+
 $ git clone git+https://github.com/xxh/xxh
 ```
 Get the port number from previous output:
 ```shell script
 $ echo "The port number is 4242"
 The port number is 4242
 $ curl http://127.0.0.1:4
 $ curl http://127.0.0.1:4242
 ```
+#### Get arguments from command help
+```shell script
+$ lolcat -h
+...
+$ lolcat --s
+$ lolcat --seed=SEED
+```
 
 ## Development
-### Tokenizers
-Tokenizer is a functions which extract tokens from the text.
-
-| Priority | Tokenizer | Text | Tokens |
-| ---------| ---------- | ----- | ------ |
-| 1 | **dict** | `{"key": "val as str"}` | `['key', 'val as str']` |
-| 2 | **env** | `PATH=/bin:/etc` | `['PATH', '/bin:/etc', '/bin', '/etc']` |
-| 3 | **split** | `Split me \n now!` | `['Split', 'me', 'now!']` |
-| 4 | **strip** | `{Hello}` | `['Hello']` |
-You can create your tokenizer and add it to `tokenizers_all` in `tokenize_output.py`.
+xontrib-output-search uses the [tokenize-output](https://github.com/tokenizer/tokenize-output) library for tokenizing.
 
-Tokenizing is a recursive process where every tokenizer returns `final` and `new` tokens.
-The `final` tokens directly go to the result list of tokens. The `new` tokens go to all
-tokenizers again to find new tokens. As result if there is a mix of json and env data
-in the output it will be found and tokenized in appropriate way.
-
-### Test and debug
-Run tests:
-```shell script
-cd ~
-git clone https://github.com/anki-code/xontrib-output-search
-cd xontrib-output-search
-pytest
-```
-To debug the tokenizer:
-```shell script
-echo "Hello world" | python tokenize_outupt.py --pipe
-```
-Check that `output_search` loaded:
+Check that the `output_search` xontrib is loaded:
 ```shell script
 $ xontrib list output_search
 output_search installed loaded
diff --git a/requirements.txt b/requirements.txt
index ecf1685..aee8304 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+tokenize-output
 xonsh
 demjson
 rever
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 06194e2..055f498 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 
 setup(
     name='xontrib-output-search',
-    version='0.4.1',
+    version='0.5.0',
     license='BSD',
     author='anki',
     author_email='author@example.com',
@@ -17,9 +17,7 @@
     long_description=long_description,
     long_description_content_type='text/markdown',
     python_requires='>=3.6',
-    install_requires=[
-        'demjson'
-    ],
+    install_requires=['tokenize-output'],
     packages=['xontrib'],
     package_dir={'xontrib': 'xontrib'},
     package_data={'xontrib': ['output_search/*.py']},
diff --git a/xontrib/output_search/__init__.py b/xontrib/output_search.py
similarity index 98%
rename from xontrib/output_search/__init__.py
rename to xontrib/output_search.py
index 1facfa3..a56ec3f 100644
--- a/xontrib/output_search/__init__.py
+++ b/xontrib/output_search.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env xonsh
 import re
 
-from xontrib.output_search.tokenize_output import tokenize_output
+from tokenize_output.tokenize_output import tokenize_output
 
 _key = __xonsh__.env.get('XONTRIB_OUTPUT_SEARCH_KEY', 'f')
 
diff --git a/xontrib/output_search/test_dict_keys_values.py b/xontrib/output_search/test_dict_keys_values.py
deleted file mode 100644
index 253ea19..0000000
--- a/xontrib/output_search/test_dict_keys_values.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from output_search.tokenize_output import dict_keys_values
-
-def test_dict_keys_values():
-    assert dict_keys_values([{'abc':{'b':{'c':123}}, 'd':[[1,2,3], None, True, {'e':1}]},4]) == {'keys': ['abc', 'b', 'c', 'd', 'e'], 'values': [123, 1, 2, 3, True, 1, 4]}
\ No newline at end of file
diff --git a/xontrib/output_search/test_tokenize.py b/xontrib/output_search/test_tokenize.py
deleted file mode 100644
index 2450552..0000000
--- a/xontrib/output_search/test_tokenize.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from output_search.tokenize_output import tokenize_output_sorted
-
-def test_tokenize_empty():
-    assert tokenize_output_sorted('') == []
-
-def test_tokenize_one():
-    assert tokenize_output_sorted('one') == ['one']
-
-def test_tokenize_empty_prefix():
-    assert tokenize_output_sorted('one two three', substring='none') == []
-
-def test_tokenize_one_2_three_4():
-    assert tokenize_output_sorted('one 2 three 4') == ['one', 'three']
-
-def test_tokenize_repeated():
-    assert tokenize_output_sorted("""
-    +-------+-------+
-    | one | two |
-    | ----- | ----- |
-    | three | 12345 |
-    +-------+-------+
-    """) == ['12345', 'one', 'three', 'two']
-
-def test_tokenize_specials():
-    assert tokenize_output_sorted('\n\t\r one \n\t\r "two" \n\t\r three \n\t\r') == ['one', 'three', 'two']
-
-def test_tokenize_substring():
-    assert tokenize_output_sorted('one two three four five six', substring='e') == ['five', 'one', 'three']
-
-
-def test_tokenize_env():
-    assert tokenize_output_sorted('SHELL=bash\nPATH=/a/b:/c/d') == ['/a/b', '/a/b:/c/d', '/c/d', 'PATH', 'SHELL', 'bash']
-
-def test_tokenize_env_substrig():
-    assert tokenize_output_sorted('SHELL=bash\nPATH=/a/b:/c/d', substring='/c') == ['/a/b:/c/d', '/c/d']
-
-
-def test_tokenize_json():
-    assert tokenize_output_sorted('{"Hello": "hello world", "test": None}') == ['Hello', 'hello', 'hello world', 'test', 'world']
-
-def test_tokenize_json_partial():
-    assert tokenize_output_sorted('"test": "1",') == ['test']
-
-
-def test_tokenize_javascript():
-    assert tokenize_output_sorted("{Hello: 'hello world', test:null}") == ['Hello', 'hello', 'hello world', 'test', 'world']
-
-
-def test_tokenize_complex():
-    assert tokenize_output_sorted('one "two" Three=four {"qwe":"hello world"}') == ['Three', 'four', 'hello', 'one', 'qwe', 'two', 'world']
diff --git a/xontrib/output_search/tokenize_output.py b/xontrib/output_search/tokenize_output.py
deleted file mode 100644
index 5f19db1..0000000
--- a/xontrib/output_search/tokenize_output.py
+++ /dev/null
@@ -1,198 +0,0 @@
-#!/usr/bin/env python3
-
-import re
-import json
-import demjson
-import ast
-import logging
-from collections.abc import Iterable
-
-def filter_tokens(tokens, substring='', len_min=2):
-    substring_lower = substring.lower()
-    result = []
-    for t in tokens:
-        len_t = len(t)
-        if len_t <= len_min: # Skip short tokens
-            continue
-        if len(set(t)) <= 2: # Skip tokens with repeated characters ('+-+-+')
-            continue
-        if substring_lower not in t.lower(): # Skip by substring
-            continue
-        result.append(t)
-    return set(result)
-
-framed_regexp = re.compile(r'^["\'({\[,:;]*(.+?)[,})\]"\':;]*$')
-def tokenizer_strip(text, text_cmd='', substring='', current_cmd={}):
-    tokens = {'final': set(), 'new': set()}
-    g = framed_regexp.match(text)
-    if g:
-        token = g.group(1)
-        if token == text:
-            return tokens
-        else:
-            tokens = {'final': set(), 'new': set([token])}
-            return tokens
-    return tokens
-
-clean_regexp = re.compile(r'[\n\r\t]')
-def tokenizer_split(text, text_cmd='', substring='', current_cmd={}):
-    text = clean_regexp.sub(' ', text).strip()
-
-    split_combinations = [' ', '":"']
-    for sc in split_combinations:
-        tokens = text.split(sc)
-        if len(tokens) > 1:
-            break
-
-    if tokens != [text]:
-        tokens = {'final': set(), 'new': set(tokens)}
-    else:
-        tokens = {'final': set(), 'new': set()}
-    return tokens
-
-
-env_regexp = re.compile(r'^([a-zA-Z0-9_]+?)=(.*)$')
-def tokenizer_env(text, text_cmd='', substring='', current_cmd={}):
-    tokens = {'final': set(), 'new': set()}
-    if len(text) < 4:
-        return tokens
-    g = env_regexp.match(text)
-    if g:
-        var = g.group(1)
-        value = g.group(2)
-        values = value.split(':')
-        tokens = {
-            'final': set([var, value] + values),
-            'new': set([value])
-        }
-    return tokens
-
-
-def dict_keys_values(d, target='values'):
-    result = {'keys': [], 'values': []}
-    if d is None:
-        return result
-    elif type(d) is dict:
-        for k in d:
-            result['keys'] += [k]
-            val_result = dict_keys_values(d[k], 'values')
-            result['keys'] += val_result['keys']
-            result['values'] += val_result['values']
-        return result
-    elif type(d) in [list, set]:
-        for v in d:
-            val_result = dict_keys_values(v, 'values')
-            result['keys'] += val_result['keys']
-            result['values'] += val_result['values']
-        return result
-    else:
-        result[target] += [d]
-        return result
-
-def list_str(lst):
-    if isinstance(lst, Iterable):
-        return [str(l) for l in lst]
-    else:
-        return str(lst)
-
-def tokenizer_dict(text, text_cmd='', substring='', current_cmd={}):
-    tokens = {'final': set(), 'new': set()}
-    if len(text) < 6:
-        return tokens
-    if text[:1]+text[-1:] not in ['{}', '[]']:
-        return tokens
-
-    dct = None
-    try: # JSON
-        dct = json.loads(text)
-    except:
-        pass
-
-    if dct is None:
-        try: # Python dict
-            dct = ast.literal_eval(text)
-        except:
-            pass
-
-    if dct is None:
-        try: # JavaScript Object
-            dct = demjson.decode(text)
-        except:
-            pass
-
-    if dct is not None:
-        dct_tokens = dict_keys_values(dct)
-        values = list_str(dct_tokens['values'])
-        tokens = {
-            'final': set(list_str(dct_tokens['keys']) + values),
-            'new': set(values)
-        }
-        return tokens
-
-    return tokens
-
-
-tokenizers_all = {
-    'dict': tokenizer_dict,
-    'env': tokenizer_env,
-    'split': tokenizer_split,
-    'strip': tokenizer_strip
-}
-
-
-def tokenize_output(text, text_cmd='', substring='', current_cmd={}, tokenizers=['dict', 'env', 'split', 'strip'], recursion_level=1):
-    spacing = ' ' * recursion_level * 2
-    recursion_level_num = f" {recursion_level:02d}"
-    logging.debug(f"{recursion_level_num}{spacing}TEXT: {text}")
-    result_tokens = []
-    found_tokens = False
-    for tokenizer_name in tokenizers:
-        tokenizer = tokenizers_all[tokenizer_name]
-        tokens = tokenizer(text, text_cmd=text_cmd, substring=substring, current_cmd=current_cmd)
-        if len(tokens['final']) > 0 or len(tokens['new']) > 0:
-            found_tokens = True
-            tokens = {
-                'final': filter_tokens(tokens['final'], substring),
-                'new': filter_tokens(tokens['new'], substring)
-            }
-            logging.debug(f"{recursion_level_num}{spacing*2}{tokenizer_name} {tokens}")
-            result_tokens += list(tokens['final'])
-            if len(tokens['new']) > 0:
-                for token in tokens['new']:
-                    result_tokens += list(
-                        tokenize_output(token, text_cmd=text_cmd, substring=substring, current_cmd=current_cmd,
-                                        recursion_level=(recursion_level + 1), tokenizers=tokenizers))
-            break
-
-    if result_tokens == []:
-        r = set([text] if not found_tokens and substring.lower() in text.lower() else []) if text != '' else set()
-        logging.debug(f"{recursion_level_num}{spacing}RETURN {r}")
-        return r
-
-    r = set(result_tokens)
-    logging.debug(f"{recursion_level_num}{spacing}RETURN {r}")
-    return r
-
-def tokenize_output_sorted(*args, **kwargs):
-    r = list(tokenize_output(*args, **kwargs))
-    r = sorted(r)
-    return r
-
-if __name__ == '__main__':
-    import sys
-    import argparse
-    logging.getLogger().setLevel(logging.DEBUG)
-
-    argp = argparse.ArgumentParser(description="Tokenize output")
-    argp.add_argument('--pipe', '-p', action='store_true')
-    args = argp.parse_args()
-
-    if args.pipe:
-        stdin = '\n'.join(sys.stdin.readlines())
-    else:
-        print('Usage: echo "Hello world" | python tokenizer_outupt.py --pipe', file=sys.stderr)
-        print('Example: \n', file=sys.stderr)
-        stdin = '"Hello" {world}'
-
-    tokens = tokenize_output_sorted(stdin.strip())
-    print(tokens)
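After this patch the xontrib imports `tokenize_output` from the upstream package rather than from the vendored module removed above. The sketch below is a minimal, hypothetical usage example: it assumes the upstream `tokenize-output` package keeps the same `tokenize_output(text, text_cmd='', substring='', current_cmd={})` interface and behaviour as the deleted `tokenize_output.py`; the sample inputs and expected tokens are taken from the deleted tests and the README examples.

```python
# Minimal sketch, assuming the upstream tokenize-output package exposes the
# same interface as the vendored tokenize_output.py deleted in this patch.
from tokenize_output.tokenize_output import tokenize_output

# env-style output: the variable name, the full value and the PATH-like parts
# become tokens; `substring` filters the returned set.
print(tokenize_output('SHELL=bash\nPATH=/a/b:/c/d', substring='/c'))
# expected, per the deleted test_tokenize_env_substrig: {'/a/b:/c/d', '/c/d'}

# JSON/dict-style output: keys and values become tokens.
print(tokenize_output('{"Try": "xontrib-output-search"}', substring='se'))
# expected, per the README example: {'xontrib-output-search'}
```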