From 5d1d20106f0ea173d625ba1c9a1b2231536d9c76 Mon Sep 17 00:00:00 2001
From: anki-code
Date: Sun, 24 May 2020 14:30:06 +0300
Subject: [PATCH] Switched to upstream tokenize-output library

---
 README.md                                  |  55 ++---
 requirements.txt                           |   1 +
 setup.py                                   |   6 +-
 .../__init__.py => output_search.py}       |   2 +-
 .../output_search/test_dict_keys_values.py |   4 -
 xontrib/output_search/test_tokenize.py     |  50 -----
 xontrib/output_search/tokenize_output.py   | 198 ------------------
 7 files changed, 18 insertions(+), 298 deletions(-)
 rename xontrib/{output_search/__init__.py => output_search.py} (98%)
 delete mode 100644 xontrib/output_search/test_dict_keys_values.py
 delete mode 100644 xontrib/output_search/test_tokenize.py
 delete mode 100644 xontrib/output_search/tokenize_output.py

diff --git a/README.md b/README.md
index a15a59e..293201b 100644
--- a/README.md
+++ b/README.md
@@ -43,15 +43,8 @@
 $XONTRIB_OUTPUT_SEARCH_KEY='i' xontrib load output_search
 ```
 
-## Features
-#### Words tokenizing
-```shell script
-$ echo "Hello world"
-Hello world
-$ echo The second word is wo
-$ echo The second word is world
-```
-URL example:
+## Use cases
+#### Get URL from output
 ```shell script
 $ echo "Try https://github.com/xxh/xxh"
 Try https://github.com/xxh/xxh
@@ -59,7 +52,7 @@
 $ git clone xx
 $ git clone https://github.com/xxh/xxh
 ```
-#### JSON, Python dict and JavaScript object tokenizing
+#### Get key or value from JSON, Python dict and JavaScript object
 ```shell script
 $ echo '{"Try": "xontrib-output-search"}'
 {"Try": "xontrib-output-search"}
@@ -67,7 +60,7 @@
 $ echo I should try se
 $ echo I should try xontrib-output-search
 ```
-#### env tokenizing
+#### Get the path from environment
 ```shell script
 $ env | grep ^PATH=
 PATH=/one/two:/three/four
@@ -75,7 +68,7 @@
 $ ls fo
 $ ls /three/four
 ```
-#### Complex prefixes autocomplete
+#### Complete the complex prefix
 Get the URL from previous output after typing `git+`:
 ```shell script
 $ echo "Try https://github.com/xxh/xxh"
@@ -94,38 +87,18 @@
 Try https://github.com/xxh/xxh
 $ git clone git+
 $ git clone git+https://github.com/xxh/xxh
 ```
 Get the port number from previous output:
 ```shell script
 $ echo "The port number is 4242"
 The port number is 4242
 $ curl http://127.0.0.1:4
 $ curl http://127.0.0.1:4242
 ```
+#### Get arguments from command help
+```shell script
+$ lolcat -h
+...
+$ lolcat --s
+$ lolcat --seed=SEED
+```
 
 ## Development
-### Tokenizers
-Tokenizer is a functions which extract tokens from the text.
-
-| Priority | Tokenizer | Text | Tokens |
-| ---------| ---------- | ----- | ------ |
-| 1 | **dict** | `{"key": "val as str"}` | `['key', 'val as str']` |
-| 2 | **env** | `PATH=/bin:/etc` | `['PATH', '/bin:/etc', '/bin', '/etc']` |
-| 3 | **split** | `Split me \n now!` | `['Split', 'me', 'now!']` |
-| 4 | **strip** | `{Hello}` | `['Hello']` |
-You can create your tokenizer and add it to `tokenizers_all` in `tokenize_output.py`.
+xontrib-output-search uses the [tokenize-output](https://github.com/tokenizer/tokenize-output) library for tokenizing.
 
-Tokenizing is a recursive process where every tokenizer returns `final` and `new` tokens.
-The `final` tokens directly go to the result list of tokens. The `new` tokens go to all
-tokenizers again to find new tokens. As result if there is a mix of json and env data
-in the output it will be found and tokenized in appropriate way.
-
-### Test and debug
-Run tests:
-```shell script
-cd ~
-git clone https://github.com/anki-code/xontrib-output-search
-cd xontrib-output-search
-pytest
-```
-To debug the tokenizer:
-```shell script
-echo "Hello world" | python tokenize_outupt.py --pipe
-```
-Check that `output_search` loaded:
+Check that the `output_search` xontrib is loaded:
 ```shell script
 $ xontrib list output_search
 output_search installed loaded
diff --git a/requirements.txt b/requirements.txt
index ecf1685..aee8304 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+tokenize-output
 xonsh
 demjson
 rever
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 06194e2..055f498 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 
 setup(
     name='xontrib-output-search',
-    version='0.4.1',
+    version='0.5.0',
     license='BSD',
     author='anki',
     author_email='author@example.com',
@@ -17,9 +17,7 @@
     long_description=long_description,
     long_description_content_type='text/markdown',
     python_requires='>=3.6',
-    install_requires=[
-        'demjson'
-    ],
+    install_requires=['tokenize-output'],
     packages=['xontrib'],
     package_dir={'xontrib': 'xontrib'},
     package_data={'xontrib': ['output_search/*.py']},
diff --git a/xontrib/output_search/__init__.py b/xontrib/output_search.py
similarity index 98%
rename from xontrib/output_search/__init__.py
rename to xontrib/output_search.py
index 1facfa3..a56ec3f 100644
--- a/xontrib/output_search/__init__.py
+++ b/xontrib/output_search.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env xonsh
 import re
 
-from xontrib.output_search.tokenize_output import tokenize_output
+from tokenize_output.tokenize_output import tokenize_output
 
 _key = __xonsh__.env.get('XONTRIB_OUTPUT_SEARCH_KEY', 'f')
 
diff --git a/xontrib/output_search/test_dict_keys_values.py b/xontrib/output_search/test_dict_keys_values.py
deleted file mode 100644
index 253ea19..0000000
--- a/xontrib/output_search/test_dict_keys_values.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from output_search.tokenize_output import dict_keys_values
-
-def test_dict_keys_values():
-    assert dict_keys_values([{'abc':{'b':{'c':123}}, 'd':[[1,2,3], None, True, {'e':1}]},4]) == {'keys': ['abc', 'b', 'c', 'd', 'e'], 'values': [123, 1, 2, 3, True, 1, 4]}
\ No newline at end of file
diff --git a/xontrib/output_search/test_tokenize.py b/xontrib/output_search/test_tokenize.py
deleted file mode 100644
index 2450552..0000000
--- a/xontrib/output_search/test_tokenize.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from output_search.tokenize_output import tokenize_output_sorted
-
-def test_tokenize_empty():
-    assert tokenize_output_sorted('') == []
-
-def test_tokenize_one():
-    assert tokenize_output_sorted('one') == ['one']
-
-def test_tokenize_empty_prefix():
-    assert tokenize_output_sorted('one two three', substring='none') == []
-
-def test_tokenize_one_2_three_4():
-    assert tokenize_output_sorted('one 2 three 4') == ['one', 'three']
-
-def test_tokenize_repeated():
-    assert tokenize_output_sorted("""
-    +-------+-------+
-    | one | two |
-    | ----- | ----- |
-    | three | 12345 |
-    +-------+-------+
-    """) == ['12345', 'one', 'three', 'two']
-
-def test_tokenize_specials():
-    assert tokenize_output_sorted('\n\t\r one \n\t\r "two" \n\t\r three \n\t\r') == ['one', 'three', 'two']
-
-def test_tokenize_substring():
-    assert tokenize_output_sorted('one two three four five six', substring='e') == ['five', 'one', 'three']
-
-
-def test_tokenize_env():
-    assert tokenize_output_sorted('SHELL=bash\nPATH=/a/b:/c/d') == ['/a/b', '/a/b:/c/d', '/c/d', 'PATH', 'SHELL', 'bash']
-
-def test_tokenize_env_substrig():
-    assert tokenize_output_sorted('SHELL=bash\nPATH=/a/b:/c/d', substring='/c') == ['/a/b:/c/d', '/c/d']
-
-
-def test_tokenize_json():
-    assert tokenize_output_sorted('{"Hello": "hello world", "test": None}') == ['Hello', 'hello', 'hello world', 'test', 'world']
-
-def test_tokenize_json_partial():
-    assert tokenize_output_sorted('"test": "1",') == ['test']
-
-
-def test_tokenize_javascript():
-    assert tokenize_output_sorted("{Hello: 'hello world', test:null}") == ['Hello', 'hello', 'hello world', 'test', 'world']
-
-
-def test_tokenize_complex():
-    assert tokenize_output_sorted('one "two" Three=four {"qwe":"hello world"}') == ['Three', 'four', 'hello', 'one', 'qwe', 'two', 'world']
diff --git a/xontrib/output_search/tokenize_output.py b/xontrib/output_search/tokenize_output.py
deleted file mode 100644
index 5f19db1..0000000
--- a/xontrib/output_search/tokenize_output.py
+++ /dev/null
@@ -1,198 +0,0 @@
-#!/usr/bin/env python3
-
-import re
-import json
-import demjson
-import ast
-import logging
-from collections.abc import Iterable
-
-def filter_tokens(tokens, substring='', len_min=2):
-    substring_lower = substring.lower()
-    result = []
-    for t in tokens:
-        len_t = len(t)
-        if len_t <= len_min: # Skip short tokens
-            continue
-        if len(set(t)) <= 2: # Skip tokens with repeated characters ('+-+-+')
-            continue
-        if substring_lower not in t.lower(): # Skip by substring
-            continue
-        result.append(t)
-    return set(result)
-
-framed_regexp = re.compile(r'^["\'({\[,:;]*(.+?)[,})\]"\':;]*$')
-def tokenizer_strip(text, text_cmd='', substring='', current_cmd={}):
-    tokens = {'final': set(), 'new': set()}
-    g = framed_regexp.match(text)
-    if g:
-        token = g.group(1)
-        if token == text:
-            return tokens
-        else:
-            tokens = {'final': set(), 'new': set([token])}
-            return tokens
-    return tokens
-
-clean_regexp = re.compile(r'[\n\r\t]')
-def tokenizer_split(text, text_cmd='', substring='', current_cmd={}):
-    text = clean_regexp.sub(' ', text).strip()
-
-    split_combinations = [' ', '":"']
-    for sc in split_combinations:
-        tokens = text.split(sc)
-        if len(tokens) > 1:
-            break
-
-    if tokens != [text]:
-        tokens = {'final': set(), 'new': set(tokens)}
-    else:
-        tokens = {'final': set(), 'new': set()}
-    return tokens
-
-
-env_regexp = re.compile(r'^([a-zA-Z0-9_]+?)=(.*)$')
-def tokenizer_env(text, text_cmd='', substring='', current_cmd={}):
-    tokens = {'final': set(), 'new': set()}
-    if len(text) < 4:
-        return tokens
-    g = env_regexp.match(text)
-    if g:
-        var = g.group(1)
-        value = g.group(2)
-        values = value.split(':')
-        tokens = {
-            'final': set([var, value] + values),
-            'new': set([value])
-        }
-    return tokens
-
-
-def dict_keys_values(d, target='values'):
-    result = {'keys': [], 'values': []}
-    if d is None:
-        return result
-    elif type(d) is dict:
-        for k in d:
-            result['keys'] += [k]
-            val_result = dict_keys_values(d[k], 'values')
-            result['keys'] += val_result['keys']
-            result['values'] += val_result['values']
-        return result
-    elif type(d) in [list, set]:
-        for v in d:
-            val_result = dict_keys_values(v, 'values')
-            result['keys'] += val_result['keys']
-            result['values'] += val_result['values']
-        return result
-    else:
-        result[target] += [d]
-        return result
-
-def list_str(lst):
-    if isinstance(lst, Iterable):
-        return [str(l) for l in lst]
-    else:
-        return str(lst)
-
-def tokenizer_dict(text, text_cmd='', substring='', current_cmd={}):
-    tokens = {'final': set(), 'new': set()}
-    if len(text) < 6:
-        return tokens
-    if text[:1]+text[-1:] not in ['{}', '[]']:
-        return tokens
-
-    dct = None
-    try: # JSON
-        dct = json.loads(text)
-    except:
-        pass
-
-    if dct is None:
-        try: # Python dict
-            dct = ast.literal_eval(text)
-        except:
-            pass
-
-    if dct is None:
-        try: # JavaScript Object
-            dct = demjson.decode(text)
-        except:
-            pass
-
-    if dct is not None:
-        dct_tokens = dict_keys_values(dct)
-        values = list_str(dct_tokens['values'])
-        tokens = {
-            'final': set(list_str(dct_tokens['keys']) + values),
-            'new': set(values)
-        }
-        return tokens
-
-    return tokens
-
-
-tokenizers_all = {
-    'dict': tokenizer_dict,
-    'env': tokenizer_env,
-    'split': tokenizer_split,
-    'strip': tokenizer_strip
-}
-
-
-def tokenize_output(text, text_cmd='', substring='', current_cmd={}, tokenizers=['dict', 'env', 'split', 'strip'], recursion_level=1):
-    spacing = ' ' * recursion_level * 2
-    recursion_level_num = f" {recursion_level:02d}"
-    logging.debug(f"{recursion_level_num}{spacing}TEXT: {text}")
-    result_tokens = []
-    found_tokens = False
-    for tokenizer_name in tokenizers:
-        tokenizer = tokenizers_all[tokenizer_name]
-        tokens = tokenizer(text, text_cmd=text_cmd, substring=substring, current_cmd=current_cmd)
-        if len(tokens['final']) > 0 or len(tokens['new']) > 0:
-            found_tokens = True
-            tokens = {
-                'final': filter_tokens(tokens['final'], substring),
-                'new': filter_tokens(tokens['new'], substring)
-            }
-            logging.debug(f"{recursion_level_num}{spacing*2}{tokenizer_name} {tokens}")
-            result_tokens += list(tokens['final'])
-            if len(tokens['new']) > 0:
-                for token in tokens['new']:
-                    result_tokens += list(
-                        tokenize_output(token, text_cmd=text_cmd, substring=substring, current_cmd=current_cmd,
-                                        recursion_level=(recursion_level + 1), tokenizers=tokenizers))
-            break
-
-    if result_tokens == []:
-        r = set([text] if not found_tokens and substring.lower() in text.lower() else []) if text != '' else set()
-        logging.debug(f"{recursion_level_num}{spacing}RETURN {r}")
-        return r
-
-    r = set(result_tokens)
-    logging.debug(f"{recursion_level_num}{spacing}RETURN {r}")
-    return r
-
-def tokenize_output_sorted(*args, **kwargs):
-    r = list(tokenize_output(*args, **kwargs))
-    r = sorted(r)
-    return r
-
-if __name__ == '__main__':
-    import sys
-    import argparse
-    logging.getLogger().setLevel(logging.DEBUG)
-
-    argp = argparse.ArgumentParser(description="Tokenize output")
-    argp.add_argument('--pipe', '-p', action='store_true')
-    args = argp.parse_args()
-
-    if args.pipe:
-        stdin = '\n'.join(sys.stdin.readlines())
-    else:
-        print('Usage: echo "Hello world" | python tokenizer_outupt.py --pipe', file=sys.stderr)
-        print('Example: \n', file=sys.stderr)
-        stdin = '"Hello" {world}'
-
-    tokens = tokenize_output_sorted(stdin.strip())
-    print(tokens)
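After this patch the xontrib imports `tokenize_output` from the upstream package rather than from the vendored module removed above. The sketch below is a minimal, hypothetical usage example: it assumes the upstream `tokenize-output` package keeps the same `tokenize_output(text, text_cmd='', substring='', current_cmd={})` interface and behaviour as the deleted `tokenize_output.py`; the sample inputs and expected tokens are taken from the deleted tests and the README examples.

```python
# Minimal sketch, assuming the upstream tokenize-output package exposes the
# same interface as the vendored tokenize_output.py deleted in this patch.
from tokenize_output.tokenize_output import tokenize_output

# env-style output: the variable name, the full value and the PATH-like parts
# become tokens; `substring` filters the returned set.
print(tokenize_output('SHELL=bash\nPATH=/a/b:/c/d', substring='/c'))
# expected, per the deleted test_tokenize_env_substrig: {'/a/b:/c/d', '/c/d'}

# JSON/dict-style output: keys and values become tokens.
print(tokenize_output('{"Try": "xontrib-output-search"}', substring='se'))
# expected, per the README example: {'xontrib-output-search'}
```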